pandas - group by: create aggregation function using multiple columns

Question

I have the following data frame:

id    my_year   my_month    waiting_time   target
001     2018       1           95            1
002     2018       1           3             3
003     2018       1           4             0
004     2018       1           40            1
005     2018       2           97            1
006     2018       2           3             3
007     2018       3           4             0
008     2018       3           40            1

I want to groupby my_year and my_month, then in each group I want to compute the my_rate based on

(# of records with waiting_time <= 90 and target = 1)/ total_records in the group

i.e. I am expecting output like:

my_year  my_month    my_rate
 2018     1          0.25
 2018     2          0.0
 2018     3          0.5

I wrote the following code to compute the desired value my_rate:

def my_rate(data):
    waiting_time_list = data['waiting_time']
    target_list = data['target']

    total = len(data)
    my_count = 0

    for i in range(len(data)):
      if total_waiting_time_list[i] <= 90 and target_list[i] == 1:
        my_count += 1

    rate = float(my_count)/float(total)
    return rate

df.groupby(['my_year','my_month']).apply(my_rate)

However, I got the following error:

KeyError 0 
KeyErrorTraceback (most recent call last)
 in ()
     17 
---> 18 df.groupby(['my_year','my_month']).apply(my_rate)

/opt/conda/envs/python2/lib/python2.7/site-packages/pandas/core/groupby.pyc in apply(self, func, *args, **kwargs)
    714         # ignore SettingWithCopy here in case the user mutates
    715         with option_context('mode.chained_assignment', None):
--> 716             return self._python_apply_general(f)
    717 
    718     def _python_apply_general(self, f):

/opt/conda/envs/python2/lib/python2.7/site-packages/pandas/core/groupby.pyc in _python_apply_general(self, f)
    718     def _python_apply_general(self, f):
    719         keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 720                                                    self.axis)
    721 
    722         return self._wrap_applied_output(

/opt/conda/envs/python2/lib/python2.7/site-packages/pandas/core/groupby.pyc in apply(self, f, data, axis)
   1727             # group might be modified
   1728             group_axes = _get_axes(group)
-> 1729             res = f(group)
   1730             if not _is_indexed_like(res, group_axes):
   1731                 mutated = True

 in conversion_rate(data)
      8       #print total_waiting_time_list[i],  target_list[i]
      9       #print i, total_waiting_time_list[i], target_list[i]
---> 10       if total_waiting_time_list[i] <= 90:# and target_list[i] == 1:
     11         convert_90_count += 1
     12         #print 'convert ', convert_90_count

/opt/conda/envs/python2/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
    599         key = com._apply_if_callable(key, self)
    600         try:
--> 601             result = self.index.get_value(self, key)
    602 
    603             if not is_scalar(result):

/opt/conda/envs/python2/lib/python2.7/site-packages/pandas/core/indexes/base.pyc in get_value(self, series, key)
   2426         try:
   2427             return self._engine.get_value(s, k,
-> 2428                                           tz=getattr(series.dtype, 'tz', None))
   2429         except KeyError as e1:
   2430             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4363)()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4046)()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5085)()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:13913)()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:13857)()

KeyError: 0

Any idea what I did wrong here? And how do I fix it? Thanks!

jezrael · Accepted Answer

I believe better is use mean of boolean mask per groups:

def my_rate(x):
    return ((x['waiting_time'] <= 90) & (x['target'] == 1)).mean()

df = df.groupby(['my_year','my_month']).apply(my_rate).reset_index(name='my_rate')
print (df)
   my_year  my_month  my_rate
0     2018         1     0.25
1     2018         2     0.00
2     2018         3     0.50

Any idea what I did wrong here?

Problem is waiting_time_list and target_list are not lists, but Series:

waiting_time_list = data['waiting_time']
target_list = data['target']

print (type(waiting_time_list))


print (type(target_list))

So if want indexing it failed, because in second group are indices 4,5, not 0,1.

if waiting_time_list[i] <= 90 and target_list[i] == 1:

For avoid it is possible convert Series to list:

waiting_time_list = data['waiting_time'].tolist()
target_list = data['target'].tolist()

pandas - group by: create aggregation function using multiple columns

Answers (1)

Related Questions