Reputation: 627
Here's a sample of the data I'm trying to flatten:
location = [{'id': 225,
'country': 'US',
'country_code': 'US',
'country_population': 327167434,
'province': '',
'last_updated': '2020-05-06T11:33:46.184263Z',
'coordinates': {'latitude': '37.0902', 'longitude': '-95.7129'},
'latest': {'confirmed': 1204351, 'deaths': 71064, 'recovered': 0},
'timelines': {'confirmed': {
'latest': 1204351,
'timeline': {
'2020-01-22T00:00:00Z': 1,
'2020-01-23T00:00:00Z': 1,
'2020-01-24T00:00:00Z': 2}
},
'deaths': {
'latest': 71064,
'timeline': {
'2020-01-22T00:00:00Z': 0,
'2020-01-23T00:00:00Z': 0,
'2020-01-24T00:00:00Z': 0}
}
}
}]
I'm unable to get the timeline for confirmed & deaths in a usable format. I either get a column for each date with this:
pd.json_normalize(
    data = location,
    record_path = ['timelines', 'confirmed', 'timeline'])
Or a row with date and no value for the count:
pd.json_normalize(data = location[0]['timelines']['confirmed'])
Desired output is something like one row per date, with columns for the confirmed and deaths counts.
Thanks in advance; any assistance is much appreciated.
Upvotes: 1
Views: 2581
Reputation: 627
I'm interested in wrangling/reshaping with Pandas and wanted to try a solution via that method. It didn't make sense to duplicate the latest values in every timeline date row, so I broke the data into two frames: latest and timeline. I'm still fairly new to this, so any feedback/potential improvements are appreciated.
df = pd.json_normalize(data = location, sep = '_')
# list of current-state / non-timeline columns
latest_cols = ['id', 'country', 'country_code', 'country_population', 'province', 'last_updated',
               'coordinates_latitude', 'coordinates_longitude', 'latest_confirmed', 'latest_deaths',
               'latest_recovered', 'timelines_confirmed_latest', 'timelines_deaths_latest',
               'timelines_recovered_latest']
# the posted sample has no recovered timeline, so keep only the columns that exist
latest_cols = [col for col in latest_cols if col in df.columns]
covid_latest = df[latest_cols]
# remove id & country from latest cols for inclusion w/ timeline
latest_cols = latest_cols[2:]
covid_timelines = df.drop(columns = latest_cols)
# reshape from wide to long
covid_timelines = pd.melt(
    frame = covid_timelines,
    id_vars = ['id', 'country'],
    var_name = 'status_date',
    value_name = 'case_count')
# split the timelines value and keep columns with date and type
timeline_cols = covid_timelines.status_date.str.split('_', expand=True)
timeline_cols.columns = ['timelines', 'case_type', 'timeline', 'str_date']
timeline_cols = timeline_cols.drop(columns = ['timelines', 'timeline'])
#add split cols back to covid_timelines
covid_timelines = pd.concat([covid_timelines, timeline_cols], axis=1)
# string to date and drop the dup date column
covid_timelines.status_date = pd.to_datetime(timeline_cols.str_date).dt.date
covid_timelines = covid_timelines.drop(columns = 'str_date')
#pivot death & confirmed from rows to cols
covid_timelines.pivot_table(
    index = ['id', 'country', 'status_date'],
    columns = 'case_type',
    values = 'case_count')
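For the question's single-country sample, the final pivot_table should come out roughly like the sketch below (pivot_table aggregates with mean by default, so the counts may appear as floats):
case_type                confirmed  deaths
id  country status_date
225 US      2020-01-22         1.0     0.0
            2020-01-23         1.0     0.0
            2020-01-24         2.0     0.0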
Upvotes: -1
Reputation: 28644
Taking it a step further with @Trenton Mckinney's data, we can do all the processing outside of Pandas and bring the finished product into a dataframe.
Step 1: create a function to process the data:
def extract_data(location, keyword):
    collection = []
    for ent in location:
        # get the timeline data for either 'confirmed' or 'deaths',
        # determined by the keyword
        b = ent.get('timelines').get(keyword).get('timeline')
        # switch out from a dict to a list of (key, value) pairs
        items = list(b.items())
        # for each value in the items list,
        # append the country and population
        for val in items:
            val = val + (ent.get('country'), ent.get('country_population'))
            # collect each result into a collection list
            collection.append(val)
    # create a dataframe
    cols = ['date', 'count', 'country', 'country_population']
    res = pd.DataFrame(collection, columns = cols)
    res = res.set_index(['country', 'country_population'])
    # conditionals depending on the keyword
    if keyword == "confirmed":
        res = res.add_prefix('confirmed.timeline.')
    elif keyword == "deaths":
        res = res.add_prefix('deaths.timeline.')
    return res
Step 2: apply the function per keyword ('confirmed' or 'deaths'):
confirmed = extract_data(location,'confirmed')
deaths = extract_data(location,'deaths')
Step 3: concatenate the dataframes on the columns axis:
pd.concat([confirmed,deaths],axis=1)
confirmed.timeline.date confirmed.timeline.count deaths.timeline.date deaths.timeline.count
country country_population
US 327167434 2020-01-22T00:00:00Z 1 2020-01-22T00:00:00Z 0
327167434 2020-01-23T00:00:00Z 1 2020-01-23T00:00:00Z 0
327167434 2020-01-24T00:00:00Z 2 2020-01-24T00:00:00Z 0
AF 327167435 2020-02-22T00:00:00Z 2 2020-02-22T00:00:00Z 1
327167435 2020-02-23T00:00:00Z 2 2020-02-23T00:00:00Z 1
327167435 2020-02-24T00:00:00Z 3 2020-02-24T00:00:00Z 1
AS 327167436 2020-03-22T00:00:00Z 3 2020-03-22T00:00:00Z 2
327167436 2020-03-23T00:00:00Z 3 2020-03-23T00:00:00Z 2
327167436 2020-03-24T00:00:00Z 4 2020-03-24T00:00:00Z 2
UPDATE: I love the feedback from @DanilaGanchar and decided to rewrite the code. All the computation occurs outside Pandas until the final moment, and the runtime drops as well: about 853µs.
d = []
for entry in location:
    # pull dictionary for country and population
    country_population = {'country': entry.get('country'),
                          'country_population': entry.get('country_population')}
    # pull data for the confirmed and death timelines
    confirmed_timeline = entry.get('timelines').get('confirmed').get('timeline').items()
    death_timeline = entry.get('timelines').get('deaths').get('timeline').items()
    # set keys for the deaths and confirmed dicts
    deaths = ('deaths.timeline.date', 'deaths.timeline.count')
    confirmed = ('confirmed.timeline.date', 'confirmed.timeline.count')
    # attach keys to the values in deaths and confirmed;
    # these will become the column names in the dataframe
    confirmz = (dict(zip(confirmed, conf)) for conf in confirmed_timeline)
    deathz = (dict(zip(deaths, death)) for death in death_timeline)
    # aggregate the data into one dict
    # and lump the dicts into one list
    for content, cont in zip(confirmz, deathz):
        content.update(country_population)
        content.update(cont)
        d.append(content)
pd.DataFrame(d)
confirmed.timeline.date confirmed.timeline.count country country_population deaths.timeline.date deaths.timeline.count
0 2020-01-22T00:00:00Z 1 US 327167434 2020-01-22T00:00:00Z 0
1 2020-01-23T00:00:00Z 1 US 327167434 2020-01-23T00:00:00Z 0
2 2020-01-24T00:00:00Z 2 US 327167434 2020-01-24T00:00:00Z 0
3 2020-02-22T00:00:00Z 2 AF 327167435 2020-02-22T00:00:00Z 1
4 2020-02-23T00:00:00Z 2 AF 327167435 2020-02-23T00:00:00Z 1
5 2020-02-24T00:00:00Z 3 AF 327167435 2020-02-24T00:00:00Z 1
6 2020-03-22T00:00:00Z 3 AS 327167436 2020-03-22T00:00:00Z 2
7 2020-03-23T00:00:00Z 3 AS 327167436 2020-03-23T00:00:00Z 2
8 2020-03-24T00:00:00Z 4 AS 327167436 2020-03-24T00:00:00Z 2
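If you'd rather have real datetimes than ISO strings, a minimal follow-up sketch (assuming the final frame is assigned to a variable named df, which the original snippet does not do):
df = pd.DataFrame(d)
# pd.to_datetime parses the trailing 'Z' as UTC
for col in ('confirmed.timeline.date', 'deaths.timeline.date'):
    df[col] = pd.to_datetime(df[col])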
Upvotes: 3
Reputation: 11223
I looked at the approved answer: 2 DataFrames, 2 apply(), 1 copy(), 1 drop(), 1 concat(), 1 join(). But you can do it using only json_normalize. The main problem is the timeline structure: it is a dict, not a list of dicts. So all you need is to convert timeline to:
[{
    'confirmed_dt': '2020-01-22T00:00:00Z',
    'confirmed_count': 0,
    'deaths_dt': '2020-01-22T00:00:00Z',
    'deaths_count': 0,
}, ...]
Just an example:
def format_timelines(data: list):
    for rec in data:
        new_timelines = []
        confirmed = rec['timelines']['confirmed']['timeline']
        deaths = rec['timelines']['deaths']['timeline']
        for (k, v), (k2, v2) in zip(confirmed.items(), deaths.items()):
            # just set any keys (column names)
            new_timelines.append({
                'confirmed_dt': k,
                'deaths_dt': k2,
                'confirmed_count': v,
                'deaths_count': v2,
            })
        rec['new_timelines'] = new_timelines
    return data
df = pd.json_normalize(
    format_timelines(location),
    ['new_timelines'],
    meta=['country', 'country_population']
)
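Run against the single-country sample from the question, this should give something roughly like the sketch below (record columns first, then the meta columns):
           confirmed_dt             deaths_dt  confirmed_count  deaths_count country  country_population
0  2020-01-22T00:00:00Z  2020-01-22T00:00:00Z                1             0      US           327167434
1  2020-01-23T00:00:00Z  2020-01-23T00:00:00Z                1             0      US           327167434
2  2020-01-24T00:00:00Z  2020-01-24T00:00:00Z                2             0      US           327167434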
I think it's easier. Hope this helps.
Upvotes: 1
Reputation: 62403
confirmed.timeline.date & deaths.timeline.date must be the same length.
import pandas as pd
# create a dataframe
top = pd.DataFrame(location)
# dataframe of desired columns
d = top[['country', 'country_population', 'timelines']].copy()
# transform timelines
d[['confirmed.timeline.date', 'confirmed.timeline.count']] = d.timelines.apply(lambda x: pd.Series([list(x['confirmed']['timeline'].keys()), list(x['confirmed']['timeline'].values())]))
d[['deaths.timeline.date', 'deaths.timeline.count']] = d.timelines.apply(lambda x: pd.Series([list(x['deaths']['timeline'].keys()), list(x['deaths']['timeline'].values())]))
d.drop(columns=['timelines'], inplace=True)
d
# explode the lists created in the last step and then join the data
cols = ['confirmed.timeline.date', 'confirmed.timeline.count', 'deaths.timeline.date', 'deaths.timeline.count']
d = d[['country', 'country_population']].join(pd.concat([d.explode(col)[col] for col in cols], axis=1))
print(d)
country country_population confirmed.timeline.date confirmed.timeline.count deaths.timeline.date deaths.timeline.count
0 US 327167434 2020-01-22T00:00:00Z 1 2020-01-22T00:00:00Z 0
0 US 327167434 2020-01-23T00:00:00Z 1 2020-01-23T00:00:00Z 0
0 US 327167434 2020-01-24T00:00:00Z 2 2020-01-24T00:00:00Z 0
1 AF 327167435 2020-02-22T00:00:00Z 2 2020-02-22T00:00:00Z 1
1 AF 327167435 2020-02-23T00:00:00Z 2 2020-02-23T00:00:00Z 1
1 AF 327167435 2020-02-24T00:00:00Z 3 2020-02-24T00:00:00Z 1
2 AS 327167436 2020-03-22T00:00:00Z 3 2020-03-22T00:00:00Z 2
2 AS 327167436 2020-03-23T00:00:00Z 3 2020-03-23T00:00:00Z 2
2 AS 327167436 2020-03-24T00:00:00Z 4 2020-03-24T00:00:00Z 2
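The repeated 0/1/2 index labels come from explode; if a fresh RangeIndex is preferred, an optional final touch:
d = d.reset_index(drop=True)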
The expanded three-country test data used above:
location = [{'coordinates': {'latitude': '37.0902', 'longitude': '-95.7129'},
'country': 'US',
'country_code': 'US',
'country_population': 327167434,
'id': 225,
'last_updated': '2020-05-06T11:33:46.184263Z',
'latest': {'confirmed': 1204351, 'deaths': 71064, 'recovered': 0},
'province': '',
'timelines': {'confirmed': {'latest': 1204351,
'timeline': {'2020-01-22T00:00:00Z': 1,
'2020-01-23T00:00:00Z': 1,
'2020-01-24T00:00:00Z': 2}},
'deaths': {'latest': 71064,
'timeline': {'2020-01-22T00:00:00Z': 0,
'2020-01-23T00:00:00Z': 0,
'2020-01-24T00:00:00Z': 0}}}},
{'coordinates': {'latitude': '37.0902', 'longitude': '-95.7129'},
'country': 'AF',
'country_code': 'AF',
'country_population': 327167435,
'id': 226,
'last_updated': '2020-05-06T11:33:46.184263Z',
'latest': {'confirmed': 1204351, 'deaths': 71064, 'recovered': 0},
'province': '',
'timelines': {'confirmed': {'latest': 1204351,
'timeline': {'2020-02-22T00:00:00Z': 2,
'2020-02-23T00:00:00Z': 2,
'2020-02-24T00:00:00Z': 3}},
'deaths': {'latest': 71064,
'timeline': {'2020-02-22T00:00:00Z': 1,
'2020-02-23T00:00:00Z': 1,
'2020-02-24T00:00:00Z': 1}}}},
{'coordinates': {'latitude': '37.0902', 'longitude': '-95.7129'},
'country': 'AS',
'country_code': 'AS',
'country_population': 327167436,
'id': 227,
'last_updated': '2020-05-06T11:33:46.184263Z',
'latest': {'confirmed': 1204351, 'deaths': 71064, 'recovered': 0},
'province': '',
'timelines': {'confirmed': {'latest': 1204351,
'timeline': {'2020-03-22T00:00:00Z': 3,
'2020-03-23T00:00:00Z': 3,
'2020-03-24T00:00:00Z': 4}},
'deaths': {'latest': 71064,
'timeline': {'2020-03-22T00:00:00Z': 2,
'2020-03-23T00:00:00Z': 2,
'2020-03-24T00:00:00Z': 2}}}}]
Upvotes: 1