daiyue
daiyue

Reputation: 7448

pandas groupby how to aggregate on a column and convert to a matrix that each cell is a dictionary

I have a df,

key    amount    code    year_month
CHQ    100       1000    201701
EFT    200       1000    201701
CHQ    300       2000    201702
CHQ    400       2000    201702
EFT    500       3000    201703
EFT    600       4000    201703

I like to groupby code and year_month, and then aggregate count on key in a dictionary, so the result looks like

                   key
code year_month
1000 201701        {'CHQ':1, 'EFT':1}
2000 201702        {'CHQ':2, 'EFT':0}
3000 201703        {'CHQ':0, 'EFT':2}

and then transform the above df into a matrix with code values as row labels, and year_month values as column labels, and also do aggregation on row 0 and column 0, that e.g. (1,1) sums count for all code and year_month, (1,2) sums the count for 201701.

     0          1                  2                      3                 4 
 0   -1         0                201701                 201702            201703
 1   0    {'CHQ':3,'EFT':3}  {'CHQ':1,'EFT':1}    {'CHQ':2,'EFT':0}  {'CHQ':0,'EFT':2}              
 2   1000 {'CHQ':1,'EFT':1}  {'CHQ':1,'EFT':1}    {'CHQ':0,'EFT':0}  {'CHQ':0,'EFT':0}                    
 3   2000 {'CHQ':2,'EFT':0}  {'CHQ':0,'EFT':0}    {'CHQ':2,'EFT':0}  {'CHQ':0,'EFT':0}
 4   3000 {'CHQ':0,'EFT':2}  {'CHQ':0,'EFT':0}    {'CHQ':0,'EFT':0}  {'CHQ':0,'EFT':2}        

Upvotes: 1

Views: 770

Answers (2)

Stefan Falk
Stefan Falk

Reputation: 25397

This would be my take if I got that right:

df_tmp = (df.groupby(['code', 'year_month'])['key']
           .agg(lambda x: (Counter(x),))
           .to_frame()
           .reset_index()
          )

# Pivot 
df_pivot = pd.pivot_table(df_tmp, index='code', columns='year_month', values='key', aggfunc=np.sum)
# Filling the gaps
df_final = df_pivot.apply(lambda x: [v[0] if v is not None else Counter(dict.fromkeys(df.key, 0)) for v in x ])

Note that this tupeling "(Counter(x),)" in the agg function is a little workaround since pandas doesn't seem to like working with objects. However, this is going to be unwrapped in the final apply call anyway.

Since you have Counter instances you can to things like summing over columns/rows and get the correct result:

df_final.sum(1)

Copy & paste example

from io import StringIO

import pandas as pd
import numpy as np

from collections import Counter

data = """
key    amount    code    year_month
CHQ    100       1000    201701
EFT    200       1000    201701
EFT    200       1000    201702
EFT    200       1000    201702
EFT    200       1000    201702
EFT    200       1000    201702
EFT    200       1000    201702
CHQ    300       2000    201702
CHQ    400       2000    201702
EFT    500       3000    201703
EFT    600       4000    201703
"""

df = pd.read_csv(StringIO(data), sep='\s+')

df_tmp = (df.groupby(['code', 'year_month'])['key']
           .agg(lambda x: (Counter(x),))
           .to_frame()
           .reset_index()
          )

# Pivot 
df_pivot = pd.pivot_table(df_tmp, index='code', columns='year_month', values='key', aggfunc=np.sum)
# Filling in the gaps 
df_final = df_pivot.apply(lambda x: [v[0] if v is not None else Counter(dict.fromkeys(df.key, 0)) for v in x ])

Upvotes: 1

piRSquared
piRSquared

Reputation: 294258

defaultdict

from collections import defaultdict

a = dict.fromkeys(df.key.unique(), 0)
b = defaultdict(lambda:defaultdict(lambda:a))

for t in df.itertuples():
    b[t.year_month][t.code][t.key] += 1
    b['All'][t.code][t.key] += 1
    b[t.year_month]['All'][t.key] += 1

idx = df.code.unique().tolist() + ['All']
idx = pd.CategoricalIndex(idx, idx, ordered=True)

col = df.year_month.unique().tolist() + ['All']
col = pd.CategoricalIndex(col, col, ordered=True)

m = pd.DataFrame(b, idx, col)
m = m.mask(m.isna(), a)

m

                    201701                201702                201703                   All
1000  {'CHQ': 9, 'EFT': 9}  {'CHQ': 9, 'EFT': 9}  {'CHQ': 9, 'EFT': 9}  {'CHQ': 9, 'EFT': 9}
2000  {'CHQ': 9, 'EFT': 9}  {'CHQ': 9, 'EFT': 9}  {'CHQ': 9, 'EFT': 9}  {'CHQ': 9, 'EFT': 9}
3000  {'CHQ': 9, 'EFT': 9}  {'CHQ': 9, 'EFT': 9}  {'CHQ': 9, 'EFT': 9}  {'CHQ': 9, 'EFT': 9}
4000  {'CHQ': 9, 'EFT': 9}  {'CHQ': 9, 'EFT': 9}  {'CHQ': 9, 'EFT': 9}  {'CHQ': 9, 'EFT': 9}
All   {'CHQ': 9, 'EFT': 9}  {'CHQ': 9, 'EFT': 9}  {'CHQ': 9, 'EFT': 9}  {'CHQ': 9, 'EFT': 9}

Sums with append and concat

d = df.groupby(['code', 'year_month', 'key']).size().unstack([1, 2], fill_value=0)
d = d.reindex(columns=pd.MultiIndex.from_product(d.columns.levels), fill_value=0)

def add_level(d, k):
    return pd.concat([d], keys=[k], axis=1)

m = d.join(d.sum(1, level=1).pipe(add_level, k='All')).pipe(
    lambda d: d.append(d.sum().rename('All'))
)

m.groupby(axis=1, level=0).apply(lambda x: pd.Series(x.xs(x.name, 1).to_dict('i')))

                    201701                201702                201703                   All
1000  {'CHQ': 1, 'EFT': 1}  {'CHQ': 0, 'EFT': 0}  {'CHQ': 0, 'EFT': 0}  {'CHQ': 1, 'EFT': 1}
2000  {'CHQ': 0, 'EFT': 0}  {'CHQ': 2, 'EFT': 0}  {'CHQ': 0, 'EFT': 0}  {'CHQ': 2, 'EFT': 0}
3000  {'CHQ': 0, 'EFT': 0}  {'CHQ': 0, 'EFT': 0}  {'CHQ': 0, 'EFT': 1}  {'CHQ': 0, 'EFT': 1}
4000  {'CHQ': 0, 'EFT': 0}  {'CHQ': 0, 'EFT': 0}  {'CHQ': 0, 'EFT': 1}  {'CHQ': 0, 'EFT': 1}
All   {'CHQ': 1, 'EFT': 1}  {'CHQ': 2, 'EFT': 0}  {'CHQ': 0, 'EFT': 2}  {'CHQ': 3, 'EFT': 3}

Alternate to_dict

pd.DataFrame({
    j: {
        i: m.loc[i, j].to_dict()
        for i in m.index
    } for j in m.columns.levels[0]
})

Old Answer

import pandas as pd
from collections import Counter

d = df.groupby(['code', 'year_month', 'key']).size().unstack(fill_value=0).assign(
    key=lambda d: d.to_dict('r')
)[['key']].reset_index().rename_axis(None, 1).assign(key=lambda d: list(map(Counter, d.key)))
d

   code  year_month                   key
0  1000      201701  {'CHQ': 1, 'EFT': 1}
1  2000      201702  {'CHQ': 2, 'EFT': 0}
2  3000      201703  {'CHQ': 0, 'EFT': 1}
3  4000      201703  {'CHQ': 0, 'EFT': 1}

Then

fv = Counter(dict.fromkeys(df.key.unique(), 0))
d2 = d.pivot_table(
    'key', 'code', 'year_month', 'sum',
    margins=True
).pipe(lambda d: d.mask(d.isna(), fv))
d2

year_month                201701                201702                201703                   All
code                                                                                              
1000        {'CHQ': 1, 'EFT': 1}  {'CHQ': 0, 'EFT': 0}  {'CHQ': 0, 'EFT': 0}  {'CHQ': 1, 'EFT': 1}
2000        {'CHQ': 0, 'EFT': 0}  {'CHQ': 2, 'EFT': 0}  {'CHQ': 0, 'EFT': 0}  {'CHQ': 2, 'EFT': 0}
3000        {'CHQ': 0, 'EFT': 0}  {'CHQ': 0, 'EFT': 0}  {'CHQ': 0, 'EFT': 1}  {'CHQ': 0, 'EFT': 1}
4000        {'CHQ': 0, 'EFT': 0}  {'CHQ': 0, 'EFT': 0}  {'CHQ': 0, 'EFT': 1}  {'CHQ': 0, 'EFT': 1}
All         {'CHQ': 1, 'EFT': 1}  {'CHQ': 2, 'EFT': 0}            {'EFT': 2}  {'CHQ': 3, 'EFT': 3}

We can hack some stuff to get things to work. I don't feel good about this but maybe it's enough to get you what you need.

Use Counter.update to alter objects inplace

d2.applymap(lambda x: x.update(fv));

d2

year_month                201701                201702                201703                   All
code                                                                                              
1000        {'CHQ': 1, 'EFT': 1}  {'CHQ': 0, 'EFT': 0}  {'CHQ': 0, 'EFT': 0}  {'CHQ': 1, 'EFT': 1}
2000        {'CHQ': 0, 'EFT': 0}  {'CHQ': 2, 'EFT': 0}  {'CHQ': 0, 'EFT': 0}  {'CHQ': 2, 'EFT': 0}
3000        {'CHQ': 0, 'EFT': 0}  {'CHQ': 0, 'EFT': 0}  {'CHQ': 0, 'EFT': 1}  {'CHQ': 0, 'EFT': 1}
4000        {'CHQ': 0, 'EFT': 0}  {'CHQ': 0, 'EFT': 0}  {'CHQ': 0, 'EFT': 1}  {'CHQ': 0, 'EFT': 1}
All         {'CHQ': 1, 'EFT': 1}  {'CHQ': 2, 'EFT': 0}  {'EFT': 2, 'CHQ': 0}  {'CHQ': 3, 'EFT': 3}

Upvotes: 1

Related Questions