Reputation: 424
Context
I have a JSON file as input, and I want to explode the lists and expand the dictionaries nested in the original JSON. I do this in a recursive explode/expand method until there are no more nested lists/dicts. Here's my code:
import json
import pandas as pd

def one_step_parser(df):
    # loop over all columns and explode lists
    for col, _ in df.items():
        # fill NaN values with empty dicts
        df[col].fillna({i: {} for i in df.index}, inplace=True)
        try:
            df = df.explode(col)
        except Exception:
            pass
    # loop over all columns and expand dicts
    for col, _ in df.items():
        # fill NaN values with empty dicts
        df[col] = df[col].fillna({i: {} for i in df.index})
        try:
            # this expands nested dicts in df[col]
            series = df[col].apply(pd.Series)
            # rename the new columns to "<parent>.<child>"
            names = {}
            for output_col in series.columns:
                names[output_col] = str(col + "." + output_col)
            series = series.rename(columns=names)
            # concat the expanded result to the dataframe
            df = pd.concat([df, series], axis=1)
            df = df.drop(columns=[col])
        except Exception:
            pass
    return df

def multi_step_parser(df_bf):
    # first parse
    df_af = one_step_parser(df_bf)
    # keep parsing until the number of columns stops changing
    if len(df_bf.columns) != len(df_af.columns):
        df_af = multi_step_parser(df_af)
    return df_af

def parser(json_file):
    with open(json_file, 'r') as json_f:
        my_json = json.load(json_f)
    # first normalization of the JSON into a DataFrame
    df = pd.json_normalize(my_json)
    return multi_step_parser(df)
Example of input data:
{
"agents": [
{
"core_build": "17",
"core_version": "7.1.1",
"distro": "win-x86-64",
"groups": [
{
"id": 101819,
"name": "O Laptops"
}
],
"id": 2198802,
"ip": "ip1",
"name": "x1x1x1x1",
"platform": "WINDOWS",
"plugin_feed_id": "201810182051",
"status": "on",
"uuid": "ca8b941a-80cd-4c1c-8044-760e69781eb7"
},
{
"core_build": "17",
"core_version": "7.1.1",
"distro": "win-x86-64",
"groups": [
{
"id": 101839,
"name": "windy"
},
{
"id": 102037,
"name": "W6"
},
{
"id": 102049,
"name": "MS8"
}
],
"id": 2097601,
"ip": "ip2s",
"name": "x2xx2x2x2",
"platform": "WINDOWS",
"plugin_feed_id": "201810181351",
"status": "on",
"uuid": "7e3ef1ff-4f08-445a-b500-e7ce3ca9a2f2"
},
{
"core_version": "7.1.0",
"distro": "win-x86-64",
"id": 2234103,
"ip": "x6x6x6x6x",
"last_connect": 1537384290,
"linked_on": 1537384247,
"name": "x7x7x7x",
"platform": "WINDOWS",
"status": "off",
"uuid": "0696ee38-402a-4866-b753-2816482dfce6"
}],
"pagination": {
"limit": 5000,
"offset": 0,
"sort": [
{
"name": "john",
"order": "asc"
},
{
"name": "mark",
"order": "dfg"
}
],
"total": 14416
}
}
Please note that this is just an example; my real input could be very different, so I worked on something independent of the content.
Results
pagination.limit pagination.offset pagination.total unique_index agents.core_build ... agents.linked_on pagination.sort.name pagination.sort.order agents.groups.id agents.groups.name
0 5000 0 14416 0 17 ... {} john asc 101819 O Laptops
0 5000 0 14416 0 17 ... {} mark dfg 101819 O Laptops
0 5000 0 14416 0 17 ... {} john asc 101839 windy
0 5000 0 14416 0 17 ... {} john asc 102037 W6
0 5000 0 14416 0 17 ... {} john asc 102049 MS8
0 5000 0 14416 0 17 ... {} mark dfg 101839 windy
0 5000 0 14416 0 17 ... {} mark dfg 102037 W6
0 5000 0 14416 0 17 ... {} mark dfg 102049 MS8
0 5000 0 14416 0 {} ... 1.5373... john asc {} {}
0 5000 0 14416 0 {} ... 1.5373... mark dfg {} {}
Problem
As you can see, what I did gets the job done. However, when I run it on a big JSON, execution time grows rapidly: more rows and columns are created on every iteration, and there is a concat inside the loop. What I want is to make it more efficient (especially the second loop) using apply or NumPy vectorization.
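I understand the repeated pd.concat is a big part of the cost; here is a minimal sketch of the collect-then-concat-once pattern I am trying to move toward (the helper name expand_dict_cols_once is made up, and it still relies on the slow apply(pd.Series)):

import pandas as pd

def expand_dict_cols_once(df, dict_cols):
    # expand every dict column, deferring the concat:
    # one pd.concat at the end instead of one per column
    pieces = [df.drop(columns=dict_cols)]
    for col in dict_cols:
        expanded = df[col].apply(pd.Series)  # still the slow part
        expanded.columns = [str(col) + "." + str(c) for c in expanded.columns]
        pieces.append(expanded)
    return pd.concat(pieces, axis=1)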
Attempt
I managed to bypass the for loop over the columns for the fillna call with:
df = df.apply(lambda x: x.fillna({i: {} for i in df.index}))
But I can't find a way to do the same for this line:
series = df[col].apply(pd.Series)
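The closest thing I can think of is building the frame in one call (this assumes the NaN values were already replaced with empty dicts), but I haven't managed to integrate it into the loop:

series = pd.DataFrame(df[col].tolist(), index=df.index)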
Upvotes: 0
Views: 425
Reputation: 5601
Define a function expand_dict_col to expand a column containing dicts, then use explode and expand_dict_col together to achieve the goal.
import pandas as pd

def expand_dict_col(df, target_col):
    """
    e.g. pagination ->
    {'limit': 5000,
     'offset': 0,
     'sort': [{'name': 'john', 'order': 'asc'}, {'name': 'mark', 'order': 'dfg'}],
     'total': 14416}
    becomes the columns
    - pagination.limit
    - pagination.offset
    - pagination.sort
    - pagination.total
    """
    # fill NaN values with empty dicts (the '{}' string is eval'ed into a dict)
    cond = df[target_col].isnull()
    df.loc[cond, target_col] = df.loc[cond, target_col].fillna('{}').map(eval)
    # build the expanded frame in a single vectorized call
    dfn = pd.DataFrame(df[target_col].tolist())
    # prefix the new columns with the parent column name
    dfn.columns = target_col + '.' + dfn.columns.astype(str)
    for col in dfn.columns:
        df[col] = dfn[col].values
Execute:
data  # see below
df1 = pd.json_normalize(data, record_path='agents', record_prefix='agents.', meta='pagination')
df2 = df1.explode('agents.groups')
expand_dict_col(df2, 'agents.groups')
expand_dict_col(df2, 'pagination')
df3 = df2.explode('pagination.sort')
expand_dict_col(df3, 'pagination.sort')
df4 = df3.drop(['pagination.sort', 'pagination', 'agents.groups'], axis=1)
Output of df4.iloc[0]:
0
agents.core_build 17
agents.core_version 7.1.1
agents.distro win-x86-64
agents.id 2198802
agents.ip ip1
agents.name x1x1x1x1
agents.platform WINDOWS
agents.plugin_feed_id 201810182051
agents.status on
agents.uuid ca8b941a-80cd-4c1c-8044-760e69781eb7
agents.last_connect NaN
agents.linked_on NaN
agents.groups.id 101819
agents.groups.name O Laptops
pagination.limit 5000
pagination.offset 0
pagination.total 14416
pagination.sort.name john
pagination.sort.order asc
Use a while loop to handle the whole DataFrame automatically:
df = pd.json_normalize(data, record_path='agents', record_prefix='agents.', meta='pagination')
while True:
    # detect which columns currently hold lists or dicts
    col_type = dict()
    for col in df.columns:
        cond = df[col].notnull()
        if not cond.any():
            continue  # entirely-null column: nothing to inspect
        col_type[col] = type(df.loc[cond, col].iloc[0])
    obj_col_type = pd.Series(col_type)
    list_cols = obj_col_type[obj_col_type == list].index.tolist()
    dict_cols = obj_col_type[obj_col_type == dict].index.tolist()
    # stop once no list or dict columns remain
    if len(list_cols) + len(dict_cols) == 0:
        break
    for col in list_cols:
        df = df.explode(col)
    for col in dict_cols:
        expand_dict_col(df, col)
        del df[col]
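To reuse this on arbitrary input (mirroring the question's parser(json_file) entry point), the loop can be wrapped into a function. A minimal sketch: the name parse and the isinstance-based detection are my own additions, and it assumes each column holds only lists or only dicts (plus NaNs):

import json
import pandas as pd

def parse(json_file):
    # hypothetical wrapper around the loop above; expand_dict_col must be in scope
    with open(json_file) as f:
        df = pd.json_normalize(json.load(f))
    while True:
        list_cols = [c for c in df.columns
                     if df[c].map(lambda v: isinstance(v, list)).any()]
        dict_cols = [c for c in df.columns
                     if df[c].map(lambda v: isinstance(v, dict)).any()]
        if not list_cols and not dict_cols:
            return df
        for col in list_cols:
            df = df.explode(col)
        for col in dict_cols:
            expand_dict_col(df, col)
            del df[col]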
data
data = {
"agents": [
{
"core_build": "17",
"core_version": "7.1.1",
"distro": "win-x86-64",
"groups": [
{
"id": 101819,
"name": "O Laptops"
}
],
"id": 2198802,
"ip": "ip1",
"name": "x1x1x1x1",
"platform": "WINDOWS",
"plugin_feed_id": "201810182051",
"status": "on",
"uuid": "ca8b941a-80cd-4c1c-8044-760e69781eb7"
},
{
"core_build": "17",
"core_version": "7.1.1",
"distro": "win-x86-64",
"groups": [
{
"id": 101839,
"name": "windy"
},
{
"id": 102037,
"name": "W6"
},
{
"id": 102049,
"name": "MS8"
}
],
"id": 2097601,
"ip": "ip2s",
"name": "x2xx2x2x2",
"platform": "WINDOWS",
"plugin_feed_id": "201810181351",
"status": "on",
"uuid": "7e3ef1ff-4f08-445a-b500-e7ce3ca9a2f2"
},
{
"core_version": "7.1.0",
"distro": "win-x86-64",
"id": 2234103,
"ip": "x6x6x6x6x",
"last_connect": 1537384290,
"linked_on": 1537384247,
"name": "x7x7x7x",
"platform": "WINDOWS",
"status": "off",
"uuid": "0696ee38-402a-4866-b753-2816482dfce6"
}],
"pagination": {
"limit": 5000,
"offset": 0,
"sort": [
{
"name": "john",
"order": "asc"
},
{
"name": "mark",
"order": "dfg"
}
],
"total": 14416
}
}
Upvotes: 1