Rio

Reputation: 23

Pandas groupby apply is very slow

import pandas as pd

grouped = data_v1.sort_values(by="Strike_Price").groupby(['dateTime', 'close', 'Index', 'Expiry', 'group'])

def calc_summary(group):
    name = group.name
    # 'group' is the fifth groupby key, so its value is at index 4
    if name[4] == "above":
        call_oi = group['Call_OI'].sum()
        call_vol = group['Call_Volume'].sum()
        put_oi = group['Put_OI'].sum()
        put_vol = group['Put_Volume'].sum()
        call_oi_1 = group.head(1)['Call_OI'].sum()
        call_vol_1 = group.head(1)['Call_Volume'].sum()
        put_oi_1 = group.head(1)['Put_OI'].sum()
        put_vol_1 = group.head(1)['Put_Volume'].sum()
    else:
        call_oi = group['Call_OI'].sum()
        call_vol = group['Call_Volume'].sum()
        put_oi = group['Put_OI'].sum()
        put_vol = group['Put_Volume'].sum()
        call_oi_1 = group.tail(1)['Call_OI'].sum()
        call_vol_1 = group.tail(1)['Call_Volume'].sum()
        put_oi_1 = group.tail(1)['Put_OI'].sum()
        put_vol_1 = group.tail(1)['Put_Volume'].sum()
    summary = pd.DataFrame([{'call_oi': call_oi,
                             'call_vol': call_vol,
                             'put_oi': put_oi,
                             'put_vol': put_vol,
                             'call_oi_1': call_oi_1,
                             'call_vol_1': call_vol_1,
                             'put_oi_1': put_oi_1,
                             'put_vol_1': put_vol_1}])
    return summary

result = grouped.apply(calc_summary)

The code above takes too long to run given that the dataset is not even that big. Currently, it takes about 23 seconds on my system. I tried swifter, but that doesn't work with groupby objects. What should I do to make my code faster?

Edit: The data looks like this:

  {'dateTime': {0: Timestamp('2023-02-06 09:21:00'),
  1: Timestamp('2023-02-06 09:21:00'),
  2: Timestamp('2023-02-06 09:21:00'),
  3: Timestamp('2023-02-06 09:21:00'),
  4: Timestamp('2023-02-06 09:21:00')},
 'close': {0: 17780.55, 1: 17780.55, 2: 17780.55, 3: 17780.55, 4: 17780.55},
 'Index': {0: 'NIFTY', 1: 'NIFTY', 2: 'NIFTY', 3: 'NIFTY', 4: 'NIFTY'},
 'Expiry': {0: '16FEB2023',
  1: '23FEB2023',
  2: '9FEB2023',
  3: '16FEB2023',
  4: '23FEB2023'},
 'Expiry_order': {0: 'week_2',
  1: 'week_3',
  2: 'week_1',
  3: 'week_2',
  4: 'week_3'},
 'group': {0: 'below', 1: 'below', 2: 'below', 3: 'below', 4: 'below'},
 'Call_OI': {0: nan, 1: 60.0, 2: 4.0, 3: nan, 4: nan},
 'Put_OI': {0: 1364.0, 1: 11255.0, 2: 91059.0, 3: 343.0, 4: 153.0},
 'Call_Volume': {0: nan, 1: 3.0, 2: 2.0, 3: nan, 4: nan},
 'Put_Volume': {0: 84.0, 1: 1246.0, 2: 5197.0, 3: 24.0, 4: 1.0},
 'Strike_Price': {0: 16100.0, 1: 16100.0, 2: 16100.0, 3: 16150.0, 4: 16150.0}}

Upvotes: 0

Views: 307

Answers (1)

jqurious

Reputation: 21174

Using your sample data:

import io
import pandas as pd

csv = """
dateTime,close,Index,Expiry,Expiry_order,group,Call_OI,Put_OI,Call_Volume,Put_Volume,Strike_Price
2023-02-06 09:21:00,17780.55,NIFTY,16FEB2023,week_2,below,,1364.0,,84.0,16100.0
2023-02-06 09:21:00,17780.55,NIFTY,23FEB2023,week_3,below,60.0,11255.0,3.0,1246.0,16100.0
2023-02-06 09:21:00,17780.55,NIFTY,9FEB2023,week_1,below,4.0,91059.0,2.0,5197.0,16100.0
2023-02-06 09:21:00,17780.55,NIFTY,16FEB2023,week_2,below,,343.0,,24.0,16150.0
2023-02-06 09:21:00,17780.55,NIFTY,23FEB2023,week_3,below,,153.0,,1.0,16150.0
"""

df = pd.read_csv(io.StringIO(csv))

The output of your calc_summary function:

>>> df.sort_values(by='Strike_Price').groupby(['dateTime', 'close', 'Index', 'Expiry', 'group']).apply(calc_summary)
                                                      call_oi  call_vol   put_oi  put_vol  call_oi_1  call_vol_1  put_oi_1  put_vol_1
dateTime            close    Index Expiry    group                                                                                   
2023-02-06 09:21:00 17780.55 NIFTY 16FEB2023 below 0      0.0       0.0   1707.0    108.0        0.0         0.0     343.0       24.0
                                   23FEB2023 below 0     60.0       3.0  11408.0   1247.0        0.0         0.0     153.0        1.0
                                   9FEB2023  below 0      4.0       2.0  91059.0   5197.0        4.0         2.0   91059.0     5197.0

.agg()

You're performing an aggregation where you conditionally want the head/tail depending on the value of the group column.

You could aggregate both values instead and then do the filtering afterwards.

This allows you to use .agg() directly.

We can use the first and last aggregations in place of head(1)/tail(1), but must fillna(0) first, as first and last skip NaN values while head/tail do not.
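
A minimal sketch of that difference, using a hypothetical two-row frame:

import numpy as np
import pandas as pd

demo = pd.DataFrame({'g': ['a', 'a'], 'x': [np.nan, 1.0]})

# first() skips the NaN edge row and returns 1.0 ...
print(demo.groupby('g')['x'].first())  # a    1.0

# ... whereas head(1) keeps the NaN row as-is
print(demo.groupby('g')['x'].head(1))  # 0    NaN

Without the fillna, first/last would reach past a NaN edge row and pick up a value from the wrong strike. With that in mind, the aggregation becomes: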

summary = (
    df.fillna(0)  # needed for first/last as they ignore NaN
      .sort_values(by='Strike_Price')
      .groupby(['dateTime', 'close', 'Index', 'Expiry', 'group'])
      [['Call_OI', 'Call_Volume', 'Put_OI', 'Put_Volume']]
      .agg(['first', 'last', 'sum'])
      .reset_index()
)

Which produces a multi-indexed column structure like:

              dateTime     close  Index     Expiry  group Call_OI            Call_Volume             Put_OI                   Put_Volume                
                                                            first last   sum       first last  sum    first     last      sum      first    last     sum
0  2023-02-06 09:21:00  17780.55  NIFTY  16FEB2023  below     0.0  0.0   0.0         0.0  0.0  0.0   1364.0    343.0   1707.0       84.0    24.0   108.0
1  2023-02-06 09:21:00  17780.55  NIFTY  23FEB2023  below    60.0  0.0  60.0         3.0  0.0  3.0  11255.0    153.0  11408.0     1246.0     1.0  1247.0
2  2023-02-06 09:21:00  17780.55  NIFTY   9FEB2023  below     4.0  4.0   4.0         2.0  2.0  2.0  91059.0  91059.0  91059.0     5197.0  5197.0  5197.0

To select the last values where group != 'above' you can:

>>> below = summary.loc[summary['group'] != 'above', summary.columns.get_level_values(1) != 'first']
>>> below
              dateTime     close  Index     Expiry  group Call_OI       Call_Volume        Put_OI          Put_Volume        
                                                             last   sum        last  sum     last      sum       last     sum
0  2023-02-06 09:21:00  17780.55  NIFTY  16FEB2023  below     0.0   0.0         0.0  0.0    343.0   1707.0       24.0   108.0
1  2023-02-06 09:21:00  17780.55  NIFTY  23FEB2023  below     0.0  60.0         0.0  3.0    153.0  11408.0        1.0  1247.0
2  2023-02-06 09:21:00  17780.55  NIFTY   9FEB2023  below     4.0   4.0         2.0  2.0  91059.0  91059.0     5197.0  5197.0

To flatten the column structure to match your function's output you can:

>>> below.columns = [left.lower() + ('' if right in {'', 'sum'} else '_1') for left, right in below.columns]
>>> below
              datetime     close  index     expiry  group  call_oi_1  call_oi  call_volume_1  call_volume  put_oi_1   put_oi  put_volume_1  put_volume
0  2023-02-06 09:21:00  17780.55  NIFTY  16FEB2023  below        0.0      0.0            0.0          0.0     343.0   1707.0          24.0       108.0
1  2023-02-06 09:21:00  17780.55  NIFTY  23FEB2023  below        0.0     60.0            0.0          3.0     153.0  11408.0           1.0      1247.0
2  2023-02-06 09:21:00  17780.55  NIFTY   9FEB2023  below        4.0      4.0            2.0          2.0   91059.0  91059.0        5197.0      5197.0

There are no examples of 'above' in your data, but you could do the same for those rows using == 'above' and != 'last', then concat both sets of rows into a single dataframe, as in the sketch below.
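
A minimal sketch of that final step, reusing the summary frame from above (the flatten helper is illustrative, not part of the original answer):

def flatten(frame, drop):
    # keep only the wanted first/last sub-columns, then lowercase and suffix '_1'
    out = frame.loc[:, frame.columns.get_level_values(1) != drop].copy()
    out.columns = [left.lower() + ('' if right in {'', 'sum'} else '_1')
                   for left, right in out.columns]
    return out

below = flatten(summary[summary['group'] != 'above'], drop='first')
above = flatten(summary[summary['group'] == 'above'], drop='last')  # empty for this sample

combined = pd.concat([below, above], ignore_index=True)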

Polars


You may also wish to compare how the dataset performs with polars.

One possible approach which generates the same output:

import io
import polars as pl

df = pl.read_csv(io.StringIO(csv))

columns = ["Call_OI", "Call_Volume", "Put_OI", "Put_Volume"]

(
    df
    .sort("Strike_Price")
    .groupby(["dateTime", "close", "Index", "Expiry", "group"], maintain_order=True)
    .agg([
        pl.col(columns).sum(),
        pl.when(pl.col("group").first() == "above")
          .then(pl.col(columns).first())
          .otherwise(pl.col(columns).last())
          .suffix("_1")
    ])
    .fill_null(0)
)
shape: (3, 13)
┌─────────────────────┬──────────┬───────┬───────────┬───────┬─────────┬─────────────┬─────────┬────────────┬───────────┬───────────────┬──────────┬──────────────┐
│ dateTime            | close    | Index | Expiry    | group | Call_OI | Call_Volume | Put_OI  | Put_Volume | Call_OI_1 | Call_Volume_1 | Put_OI_1 | Put_Volume_1 │
│ ---                 | ---      | ---   | ---       | ---   | ---     | ---         | ---     | ---        | ---       | ---           | ---      | ---          │
│ str                 | f64      | str   | str       | str   | f64     | f64         | f64     | f64        | f64       | f64           | f64      | f64          │
╞═════════════════════╪══════════╪═══════╪═══════════╪═══════╪═════════╪═════════════╪═════════╪════════════╪═══════════╪═══════════════╪══════════╪══════════════╡
│ 2023-02-06 09:21:00 | 17780.55 | NIFTY | 16FEB2023 | below | 0.0     | 0.0         | 1707.0  | 108.0      | 0.0       | 0.0           | 343.0    | 24.0         │
│ 2023-02-06 09:21:00 | 17780.55 | NIFTY | 23FEB2023 | below | 60.0    | 3.0         | 11408.0 | 1247.0     | 0.0       | 0.0           | 153.0    | 1.0          │
│ 2023-02-06 09:21:00 | 17780.55 | NIFTY | 9FEB2023  | below | 4.0     | 2.0         | 91059.0 | 5197.0     | 4.0       | 2.0           | 91059.0  | 5197.0       │
└─────────────────────┴──────────┴───────┴───────────┴───────┴─────────┴─────────────┴─────────┴────────────┴───────────┴───────────────┴──────────┴──────────────┘

Upvotes: 2
