Reputation: 13407
I'm creating a DataFrame the following way:
filtered_data.groupby('weekday').agg({'airing': np.sum, 'uplift': [np.sum,np.mean]})
Which creates a table of:
        airing uplift
           sum    sum      mean
weekday
1           11     20  1.818182
2           24     46  1.916667
...
What I want is to include a final row which is the total for each column.
Thanks in advance!
Upvotes: 2
Views: 2470
Reputation: 169334
For this purpose I created an aggregation tool which behaves like GROUPING SETS
in SQL. Supply columns by which to group and an aggregate function, and get back an aggregated DataFrame.
import itertools as it
import pandas as pd
from pandas.util.testing import assert_frame_equal

def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return it.chain.from_iterable(it.combinations(s, r) for r in range(len(s) + 1))

def grouper(df, grpby, aggfunc):
    ''' produces aggregate DataFrame from DataFrames for non-redundant groupings
        `workingdf` is used to avoid modifying original DataFrame
    '''
    uniqcols = set(col for col in grpby if len(df[col].unique()) == 1)
    subset = set()
    for col in uniqcols:
        for grp in powerset(grpby):
            if col in grp:
                subset.add(grp)  # add level of aggregation only when non-redundant
    if len(subset) == 0:
        for grp in powerset(grpby):
            subset.add(grp)
    workingdf = df.copy()
    for idx, i in enumerate(subset):
        if i != ():
            tmp = aggfunc(workingdf.groupby(i))
        else:
            # hack to get output to be a DataFrameGroupBy object:
            # insert dummy column on which to group by
            dummycolname = hash(tuple(workingdf.columns.tolist()))
            workingdf[dummycolname] = ''
            tmp = aggfunc(workingdf.groupby(dummycolname))
        # drop the index and add it back
        if i == ():
            tmp.reset_index(drop=True, inplace=True)
        else:
            tmp.reset_index(inplace=True)
        for j in grpby:
            if j not in tmp:  # if column is not in DataFrame add it
                tmp[j] = '(All)'
        # new list with all columns including aggregate ones; do this only once
        if idx == 0:
            finalcols = grpby[:]
            addlcols = [k for k in tmp if k not in grpby]  # aggregate columns
            finalcols.extend(addlcols)
        # reorder columns
        tmp = tmp[finalcols]
        if idx == 0:
            final = tmp; del tmp
        else:
            final = pd.concat([final, tmp]); del tmp
    del workingdf
    final.sort_values(finalcols, inplace=True)
    final.reset_index(drop=True, inplace=True)
    return final

def agg(grpbyobj):
    ''' the purpose of this function is to:
            specify aggregate operation(s) you wish to perform,
            name the resulting column(s) in the final DataFrame.
    '''
    tmp = pd.DataFrame()
    tmp['Total (n)'] = grpbyobj['Total'].sum()
    return tmp

if __name__ == '__main__':
    df = pd.DataFrame({'Area': ['a', 'a', 'b'],
                       'Year': [2014, 2014, 2014],
                       'Month': [1, 2, 3],
                       'Total': [4, 5, 6]})
    final = grouper(df, grpby=['Area', 'Year'], aggfunc=agg)
    # test against expected result
    expected = pd.DataFrame({u'Year': {0: 2014, 1: 2014, 2: 2014},
                             u'Total (n)': {0: 15, 1: 9, 2: 6},
                             u'Area': {0: u'(All)', 1: u'a', 2: u'b'}})
    expected = expected[final.columns.tolist()]
    try:
        # check_names kwarg True: compare indexes and columns
        assert_frame_equal(final, expected, check_names=True)
    except AssertionError as e:
        raise
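Applied to the question's data, a minimal sketch of how this tool might be used (assuming `filtered_data` and the `grouper` above are in scope; `agg_uplift` and `data` are hypothetical names of mine, and `weekday` is cast to string so the '(All)' marker can live in the same column):
def agg_uplift(grpbyobj):
    # name the aggregate columns explicitly, as with `agg` above
    tmp = pd.DataFrame()
    tmp['airing (sum)'] = grpbyobj['airing'].sum()
    tmp['uplift (sum)'] = grpbyobj['uplift'].sum()
    tmp['uplift (mean)'] = grpbyobj['uplift'].mean()
    return tmp

data = filtered_data.copy()
data['weekday'] = data['weekday'].astype(str)  # lets the '(All)' row sort alongside real weekdays
final = grouper(data, grpby=['weekday'], aggfunc=agg_uplift)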
Upvotes: 0
Reputation: 109546
In this case, you should probably create a Series that keeps track of your summary stats. You could then concat it with the aggregated frame for display purposes if you needed to.
summary = pd.Series([filtered_data.airing.sum(),
                     filtered_data.uplift.sum(),
                     filtered_data.uplift.mean()],
                    name='summary')
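For the concat step, a minimal sketch, assuming `agg_df` holds the grouped frame from the question (the name is mine) and that the summary values are listed in the same order as its columns:
import numpy as np
import pandas as pd

agg_df = filtered_data.groupby('weekday').agg({'airing': np.sum,
                                               'uplift': [np.sum, np.mean]})
summary = pd.Series([filtered_data.airing.sum(),
                     filtered_data.uplift.sum(),
                     filtered_data.uplift.mean()],
                    index=agg_df.columns,  # align with the aggregated columns
                    name='Total')          # becomes the label of the appended row
with_total = pd.concat([agg_df, summary.to_frame().T])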
Upvotes: 1
Reputation: 81604
You can use the .loc indexer in order to achieve that:
df.loc[len(df)] = [df[col].sum() for col in df.columns]
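For example, on the grouped frame from the question (called `agg_df` here purely for illustration), a labelled row keeps the weekday index readable; note that this sums every column, including the mean one:
agg_df.loc['Total'] = [agg_df[col].sum() for col in agg_df.columns]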
Upvotes: 1