Reputation: 87
I am trying to group my dataframe by the values in one of its columns, 'category'. However, one of the other columns, 'prob', contains a list of tuples in each row, and when I group by 'category' the 'prob' column disappears.
My current df:
category other: prob:
one val [(hi, hello), (jimbob, joe)]
one val2 [(this, not), (is, work), (now, any)]
two val2 [(bob, jones), (work, here)]
three val3 [(milk, coffee), (tea, bread)]
two val3 [(money, here), (job, money)]
Expected output:
category: other: prob:
one val, val2 [(hi, hello), (jimbob, joe), (this, not), (is, work), (now, any)]
two val2, val3 [(bob, jones), (work, here), (money, here), (job, money)]
three val3 [(milk, coffee), (tea, bread)]
What is the best way to do this? Apologies if I have mis-phrased this question; please let me know if you have any questions. Thank you!
Upvotes: 2
Views: 88
Reputation: 863166
You can aggregate the data with GroupBy.agg,
using join
for the string column and a flattening function for the lists of tuples. Three flattening solutions are shown below, plus a sum-based one;
use sum only if the data is small and performance is not important:
import functools
import operator
from itertools import chain
def f(x):
    """Flatten one group's lists of tuples into a single flat list."""
    return [pair for pairs in x for pair in pairs]


# Faster alternative:
# f = lambda x: list(chain.from_iterable(x))
# Faster alternative 2:
# f = lambda x: functools.reduce(operator.iadd, x, [])
# Slow alternative:
# f = lambda x: x.sum()

# Per category: join the 'other' strings with ', ' and flatten the 'prob' lists.
df = df.groupby('category', as_index=False).agg(
    {
        'other': ', '.join,
        'prob': f,
    }
)
print(df)
category other prob
0 one val, val2 [(hi, hello), (jimbob, joe), (this, not), (is,...
1 three val3 [(milk, coffee), (tea, bread)]
2 two val2, val3 [(bob, jones), (work, here), (money, here), (j...
Performance:
Code for testing:
import functools
import operator
import string
from itertools import chain

import numpy as np
import pandas as pd
import perfplot

np.random.seed(2019)
# NOTE(review): not referenced by any code visible in this snippet — confirm before removing.
default_value = 10
def iadd(df1):
    """Aggregate ``df1`` per category, flattening ``prob`` with ``operator.iadd``.

    Groups by the 'category' column, joins the 'other' strings with ', '
    and concatenates each group's 'prob' lists into one list.

    Args:
        df1: Frame with 'category', 'other' (strings) and 'prob' (lists) columns.

    Returns:
        The aggregated frame, with 'category' kept as a regular column.
    """
    def flatten(lists):
        # The fresh [] accumulator is what gets extended in place, so the
        # lists stored in df1 are never mutated.
        return functools.reduce(operator.iadd, lists, [])

    d = {'other': ', '.join, 'prob': flatten}
    return df1.groupby('category', as_index=False).agg(d)
def listcomp(df1):
    """Aggregate ``df1`` per category, flattening ``prob`` with a list comprehension.

    Groups by the 'category' column, joins the 'other' strings with ', '
    and concatenates each group's 'prob' lists into one list.

    Args:
        df1: Frame with 'category', 'other' (strings) and 'prob' (lists) columns.

    Returns:
        The aggregated frame, with 'category' kept as a regular column.
    """
    def flatten(lists):
        return [item for sub in lists for item in sub]

    d = {'other': ', '.join, 'prob': flatten}
    return df1.groupby('category', as_index=False).agg(d)
def from_iterable(df1):
    """Aggregate ``df1`` per category, flattening ``prob`` with ``chain.from_iterable``.

    Groups by the 'category' column, joins the 'other' strings with ', '
    and concatenates each group's 'prob' lists into one list.

    Args:
        df1: Frame with 'category', 'other' (strings) and 'prob' (lists) columns.

    Returns:
        The aggregated frame, with 'category' kept as a regular column.
    """
    def flatten(lists):
        return list(chain.from_iterable(lists))

    d = {'other': ', '.join, 'prob': flatten}
    return df1.groupby('category', as_index=False).agg(d)
def sum_series(df1):
    """Aggregate ``df1`` per category, flattening ``prob`` with ``Series.sum``.

    Groups by the 'category' column, joins the 'other' strings with ', '
    and sums (i.e. concatenates) each group's 'prob' lists.

    Args:
        df1: Frame with 'category', 'other' (strings) and 'prob' (lists) columns.

    Returns:
        The aggregated frame, with 'category' kept as a regular column.
    """
    def flatten(group):
        # Summing a Series of lists adds them with ``+``, i.e. concatenates.
        return group.sum()

    d = {'other': ', '.join, 'prob': flatten}
    return df1.groupby('category', as_index=False).agg(d)
def sum_groupby_cat(df1):
    """Aggregate ``df1`` per category using ``str.cat`` and the built-in 'sum'.

    Groups by the 'category' column, concatenates the 'other' strings with
    ``Series.str.cat(sep=', ')`` and lets the groupby 'sum' aggregation
    concatenate each group's 'prob' lists.

    Args:
        df1: Frame with 'category', 'other' (strings) and 'prob' (lists) columns.

    Returns:
        The aggregated frame, with 'category' kept as a regular column.
    """
    d = {
        'other': lambda x: x.str.cat(sep=', '),
        'prob': 'sum',
    }
    return df1.groupby('category', as_index=False).agg(d)
def sum_groupby_join(df1):
    """Aggregate ``df1`` per category using ``str.join`` and the built-in 'sum'.

    Groups by the 'category' column, joins the 'other' strings with ', '
    and lets the groupby 'sum' aggregation concatenate each group's
    'prob' lists.

    Args:
        df1: Frame with 'category', 'other' (strings) and 'prob' (lists) columns.

    Returns:
        The aggregated frame, with 'category' kept as a regular column.
    """
    d = {'other': ', '.join, 'prob': 'sum'}
    return df1.groupby('category', as_index=False).agg(d)
def make_df(n):
    """Build a random benchmark frame with one list of tuples per group.

    Draws ``n`` random rows (integer category, a letter from 'abcdef', and a
    2-tuple of ASCII letters), then collects the tuples of every
    ('category', 'other') pair into a list.

    Args:
        n: Number of random rows to draw before grouping.

    Returns:
        Frame with columns 'category', 'other' and 'prob', where each 'prob'
        cell is a list of 2-tuples.
    """
    # Floor division keeps the randint bound an integer; max() keeps the
    # range non-empty when n < 10.
    n_categories = max(n // 10, 1)
    a = np.random.randint(0, n_categories, n)
    b = np.random.choice(list('abcdef'), len(a))
    # Build the alphabet once instead of once per row.
    letters = list(string.ascii_letters)
    c = [tuple(np.random.choice(letters, 2)) for _ in a]
    df = pd.DataFrame({"category": a, "other": b, "prob": c})
    df1 = df.groupby(['category', 'other'])['prob'].apply(list).reset_index()
    return df1
# Benchmark every aggregation variant on frames produced by make_df for
# n = 10**1 .. 10**7 and plot the timings with logarithmic x and y axes.
perfplot.show(
    setup=make_df,
    kernels=[
        iadd,
        listcomp,
        from_iterable,
        sum_series,
        sum_groupby_cat,
        sum_groupby_join,
    ],
    n_range=[10 ** exponent for exponent in range(1, 8)],
    xlabel='len(df)',
    logx=True,
    logy=True,
    equality_check=False,
)
Upvotes: 4
Reputation: 88276
You could GroupBy
the category
column and aggregate with the following functions:
# Per category: concatenate the 'other' strings with ', ' and sum the 'prob' lists.
df.groupby('category', as_index=False).agg(
    {
        'other': lambda s: s.str.cat(sep=', '),
        'prob': 'sum',
    }
)
Which for the first rows gives:
category other prob
0 one val, val2 [(hi, hello), (jimbob, joe), (this, not), (is,...
1 two val2 [(bob, jones), (work, here)]
Upvotes: 2