mobelahcen
mobelahcen

Reputation: 424

Flatten and expand json in a faster way

Context

I have a JSON as input and I want to explode lists and expand dictionaries nested in the original JSON. I do this in a recursive explode/expand method until there are no more nested lists/dicts. Here's my code:

def one_step_parser(df):

    # loop over all cols and explode lists
    for col, _ in df.items():
        # fill nan values with empty dics
        df[col].fillna({i: {} for i in df.index}, inplace=True)
        try:
            df = df.explode(col)
        except Exception as e:
            pass

    # loop over all columns and expand dics
    for col, _ in df.items():
        # fill nan values with empty dics
        df[col] = df[col].fillna({i: {} for i in df.index})
        try:
            # this expands nested dics in df[col]
            series = df[col].apply(pd.Series)
            # rename new cols
            series_cols = list(series.columns)
            names = {}
            for output_col in series_cols:
                names[output_col] = str(col + "." + output_col)
            series = series.rename(columns=names)
            # concat the expanded result to the dataframe
            df = pd.concat([df, series], axis=1)
            df = df.drop(columns=[col])
        except Exception as e:
            pass

    return df


def multi_step_parser(df_bf):
    """Apply one_step_parser repeatedly until the columns stop changing.

    The original recursive version compared only the *number* of columns,
    so a pass that replaced a column one-for-one (e.g. expanding a
    single-key dict) terminated parsing too early; comparing the column
    labels themselves fixes that.  Iteration instead of recursion also
    avoids hitting the recursion limit on deeply nested input.
    """
    while True:
        df_af = one_step_parser(df_bf)
        # Fixed point reached: the pass produced the same columns.
        if list(df_af.columns) == list(df_bf.columns):
            return df_af
        df_bf = df_af

def parser(json_file):
    """Load *json_file* and return it fully flattened as a DataFrame."""
    # Context manager guarantees the handle is closed even if json.load
    # raises (the original leaked the open file).
    with open(json_file, 'r', encoding='utf-8') as json_f:
        my_json = json.load(json_f)
    # 1st json normalization to data frame, then flatten until stable.
    df = pd.json_normalize(my_json)
    return multi_step_parser(df)

Example of input data:

{
"agents": [
    {
        "core_build": "17",
        "core_version": "7.1.1",
        "distro": "win-x86-64",
        "groups": [
            {
                "id": 101819,
                "name": "O Laptops"
            }
        ],
        "id": 2198802,
        "ip": "ip1",
        "name": "x1x1x1x1",
        "platform": "WINDOWS",
        "plugin_feed_id": "201810182051",
        "status": "on",
        "uuid": "ca8b941a-80cd-4c1c-8044-760e69781eb7"
    },
    {
        "core_build": "17",
        "core_version": "7.1.1",
        "distro": "win-x86-64",
        "groups": [
            {
                "id": 101839,
                "name": "windy"
            },
            {
                "id": 102037,
                "name": "W6"
            },
            {
                "id": 102049,
                "name": "MS8"
            }
        ],
        "id": 2097601,
        "ip": "ip2s",
        "name": "x2xx2x2x2",
        "platform": "WINDOWS",
        "plugin_feed_id": "201810181351",
        "status": "on",
        "uuid": "7e3ef1ff-4f08-445a-b500-e7ce3ca9a2f2"
    },
    {
        "core_version": "7.1.0",
        "distro": "win-x86-64",
        "id": 2234103,
        "ip": "x6x6x6x6x",
        "last_connect": 1537384290,
        "linked_on": 1537384247,
        "name": "x7x7x7x",
        "platform": "WINDOWS",
        "status": "off",
        "uuid": "0696ee38-402a-4866-b753-2816482dfce6"
    }],
"pagination": {
    "limit": 5000,
    "offset": 0,
    "sort": [
        {
            "name": "john",
            "order": "asc"
        },
        {
            "name": "mark",
            "order": "dfg"
        }
    ],
    "total": 14416
 }
}

Please note that this is just an example, my input could be very different so I worked on something independent of the content.

Results

pagination.limit  pagination.offset  pagination.total  unique_index agents.core_build  ... agents.linked_on pagination.sort.name  pagination.sort.order agents.groups.id agents.groups.name
0       5000                 0              14416                 0            17         ...         {}             john                  asc                 101819        O Laptops        
0       5000                 0              14416                 0            17         ...         {}             mark                  dfg                 101819        O Laptops        
0       5000                 0              14416                 0            17         ...         {}             john                  asc                 101839            windy        
0       5000                 0              14416                 0            17         ...         {}             john                  asc                 102037               W6        
0       5000                 0              14416                 0            17         ...         {}             john                  asc                 102049              MS8        
0       5000                 0              14416                 0            17         ...         {}             mark                  dfg                 101839            windy        
0       5000                 0              14416                 0            17         ...         {}             mark                  dfg                 102037               W6        
0       5000                 0              14416                 0            17         ...         {}             mark                  dfg                 102049              MS8        
0       5000                 0              14416                 0            {}         ...  1.5373...             john                  asc                     {}               {}        
0       5000                 0              14416                 0            {}         ...  1.5373...             mark                  dfg                     {}               {}

Problem

As you can see, what I did gets the job done. However, when using it on a big JSON, execution time grows dramatically, because more rows and columns are created on every iteration and the concat sits inside the loop. What I want is to make it more efficient (especially the 2nd loop) using apply or numpy vectorization.

Attempt: I managed to bypass the for loop over columns for the fillna call with:

df = df.apply(lambda x: x.fillna({i: {} for i in df.index}))

But I can't find a way to do the same for this call:

series = df[col].apply(pd.Series)

Upvotes: 0

Views: 425

Answers (1)

Ferris
Ferris

Reputation: 5601

Define a function expand_dict_col to expand a column containing dicts.

Then use explode and expand_dict_col to achieve the goal.

import pandas as pd

def expand_dict_col(df, targe_col):
    """Expand the dict-valued column *targe_col* of *df* in place.

    Each key of the dicts becomes a new column named
    ``"<targe_col>.<key>"``; missing cells (NaN/None) are treated as
    empty dicts.  The source column itself is left in place — callers
    drop it afterwards.  Returns None (mutates *df*).

    NOTE: parameter name `targe_col` (sic) is kept for backward
    compatibility with existing keyword callers.

    e.g. pagination ->
        {'limit': 5000,
         'offset': 0,
         'sort': [{'name': 'john', 'order': 'asc'}, {'name': 'mark', 'order': 'dfg'}],
         'total': 14416}

    - pagination.limit
    - pagination.offset
    - pagination.sort
    - pagination.total
    """
    # Replace NaN/None cells with empty dicts directly, instead of the
    # original fillna('{}').map(eval) round-trip through eval().
    df[targe_col] = [
        {} if v is None or (isinstance(v, float) and v != v) else v
        for v in df[targe_col]
    ]

    # One row per dict; DataFrame(list-of-dicts) unions all the keys.
    dfn = pd.DataFrame(df[targe_col].tolist())
    dfn.columns = targe_col + '.' + dfn.columns.astype(str)

    # Assign by position (.values) so the new columns line up even when
    # df's index is non-unique after an explode().
    for col in dfn.columns:
        df[col] = dfn[col].values

Execute.

data # see below

# Flatten the 'agents' records; carry the 'pagination' dict along as metadata.
df1 = pd.json_normalize(data, record_path='agents',record_prefix='agents.', meta='pagination', )
# One row per group membership.
df2 = df1.explode('agents.groups')

# Expand the dict-valued columns in place (adds "<col>.<key>" columns).
expand_dict_col(df2, 'agents.groups')
expand_dict_col(df2, 'pagination')

# 'pagination.sort' came out of the pagination dict as a list: explode, then expand.
df3 = df2.explode('pagination.sort')
expand_dict_col(df3, 'pagination.sort')

# Drop the now-redundant container columns.
df4 = df3.drop(['pagination.sort', 'pagination', 'agents.groups'], axis=1)

output -> df4.iloc[0]:

                                                         0  
agents.core_build                                        17   
agents.core_version                                   7.1.1   
agents.distro                                    win-x86-64   
agents.id                                           2198802   
agents.ip                                               ip1   
agents.name                                        x1x1x1x1   
agents.platform                                     WINDOWS   
agents.plugin_feed_id                          201810182051   
agents.status                                            on   
agents.uuid            ca8b941a-80cd-4c1c-8044-760e69781eb7   
agents.last_connect                                     NaN   
agents.linked_on                                        NaN   
agents.groups.id                                     101819   
agents.groups.name                                O Laptops   
pagination.limit                                       5000   
pagination.offset                                         0   
pagination.total                                      14416   
pagination.sort.name                                   john   
pagination.sort.order                                   asc  

Use a while loop to handle the DataFrame automatically:

# Flatten the payload automatically: keep exploding list columns and
# expanding dict columns until none remain.
df = pd.json_normalize(data, record_path='agents', record_prefix='agents.', meta='pagination', )
while True:
    # Classify each column by the type of its first non-null value.
    list_cols, dict_cols = [], []
    for col in df.columns:
        non_null = df.loc[df[col].notnull(), col]
        if non_null.empty:
            # All-null column: nothing to inspect.  (The original
            # `.iloc[0]` raised IndexError on this case.)
            continue
        sample = non_null.iloc[0]
        if isinstance(sample, list):
            list_cols.append(col)
        elif isinstance(sample, dict):
            dict_cols.append(col)

    # Done once no column holds a list or a dict any more.
    if not (list_cols or dict_cols):
        break

    for col in list_cols:
        df = df.explode(col)

    for col in dict_cols:
        expand_dict_col(df, col)
        del df[col]

data

# Sample input payload: three agent records (the third has no "groups"
# key and carries extra timestamp fields) plus a "pagination" dict whose
# "sort" value is itself a list of dicts.
data = {
"agents": [
    {
        "core_build": "17",
        "core_version": "7.1.1",
        "distro": "win-x86-64",
        "groups": [
            {
                "id": 101819,
                "name": "O Laptops"
            }
        ],
        "id": 2198802,
        "ip": "ip1",
        "name": "x1x1x1x1",
        "platform": "WINDOWS",
        "plugin_feed_id": "201810182051",
        "status": "on",
        "uuid": "ca8b941a-80cd-4c1c-8044-760e69781eb7"
    },
    {
        "core_build": "17",
        "core_version": "7.1.1",
        "distro": "win-x86-64",
        "groups": [
            {
                "id": 101839,
                "name": "windy"
            },
            {
                "id": 102037,
                "name": "W6"
            },
            {
                "id": 102049,
                "name": "MS8"
            }
        ],
        "id": 2097601,
        "ip": "ip2s",
        "name": "x2xx2x2x2",
        "platform": "WINDOWS",
        "plugin_feed_id": "201810181351",
        "status": "on",
        "uuid": "7e3ef1ff-4f08-445a-b500-e7ce3ca9a2f2"
    },
    # Agent with no "groups" list and some keys the others lack.
    {
        "core_version": "7.1.0",
        "distro": "win-x86-64",
        "id": 2234103,
        "ip": "x6x6x6x6x",
        "last_connect": 1537384290,
        "linked_on": 1537384247,
        "name": "x7x7x7x",
        "platform": "WINDOWS",
        "status": "off",
        "uuid": "0696ee38-402a-4866-b753-2816482dfce6"
    }],
"pagination": {
    "limit": 5000,
    "offset": 0,
    "sort": [
        {
            "name": "john",
            "order": "asc"
        },
        {
            "name": "mark",
            "order": "dfg"
        }
    ],
    "total": 14416
 }
}

Upvotes: 1

Related Questions