tktktk0711
tktktk0711

Reputation: 1694

Python3 to speed up the computing of dataframe

I have a dataframe (df) as following

   id     date         t_slot  dayofweek  label
   1     2021-01-01    2        0          1
   1     2021-01-02    3        1          0
   2     2021-01-01    4        6          1
   .......

The data frame is very large(6 million rows). the t_slot is from 1 to 6 value. dayofweek is from 0-6. I want to get the rate:

    - the each id's rate about the label is 1 rate when the t_slot is 1 to 4, and dayofweek is 0-4 in the past 3 months before the date in each row.
    - the each id's rate about the label is 1 rate when the t_slot is 1 to 4, and dayofweek is 0-4 in the past 3 months before the date in each row.
    - the each id's rate about the label is 1 rate when the t_slot is 5 to 6, and dayofweek is 5-6 in the past 3 months before the date in each row.
    - the each id's rate about the label is 1 rate when the t_slot is 5 to 6, and dayofweek is 5-6 in the past 3 months before the date in each row.

I have used loop to compute the rate, but it is very slow, do you have fast way to compute it. My code is copied as following:

def get_time_slot_rate(df):
    import numpy as np

    if len(df)==0:
        return np.nan, np.nan, np.nan, np.nan
    else:
        work = df.loc[df['dayofweek']<5]
        weekend = df.loc[df['dayofweek']>=5]
        if len(work)==0:
             work_14, work_56 = np.nan, np.nan
        else:
            work_14 = len(work.loc[(work['time_slot']<5)*(work['label']==1)])/len(work)
            work_56 = len(work.loc[(work['time_slot']>5)*(work['label']==1)])/len(work)
        if len(weekend)==0:
            weekend_14, weekend_56 = np.nan, np.nan
        else:
            weekend_14 = len(weekend.loc[(weekend['time_slot']<5)*(weekend['label']==1)])/len(weekend)
            weekend_56 = len(weekend.loc[(weekend['time_slot']>5)*(weekend['label']==1)])/len(weekend)
        
       
  return work_14, work_56, weekend_14, weekend_56

import datetime as d_t
lst_id = list(df['id'])
lst_date = list(df['date'])

lst_t14_work = []
lst_t56_work = []
lst_t14_weekend = []
lst_t56_weekend = []
for i in range(len(lst_id)):
    if i%100==0:
        print(i)
    d_date = lst_date[i]
    dt = d_t.datetime.strptime(d_date, '%Y-%m-%d')
    month_step = relativedelta(months=3)
    pre_date = str(dt - month_step).split(' ')[0]
    df_s = df.loc[(df['easy_id']==lst_easy[i]) 
                             & ((df['delivery_date']>=pre_date)
                                &(df['delivery_date']< d_date))].reset_index(drop=True)
    
    work_14_rate, work_56_rate, weekend_14_rate, weekend_56_rate = get_time_slot_rate(df_s)
    lst_t14_work.append(work_14_rate)
    lst_t56_work.append(work_56_rate)
    lst_t14_weekend.append(weekend_14_rate)
    lst_t56_weekend.append(weekend_56_rate)

Upvotes: 0

Views: 71

Answers (1)

Sy Ker
Sy Ker

Reputation: 2190

I could only fix your function and it's completely untested, but here we go:

  • Import only once by putting the imports at the top of your .py.
  • try/except blocks are more efficient than if/else statements.
  • True and False equals to 1 and 0 respectively in Python.
  • Don't multiply boolean selectors and use the reverse operator ~
  • Create the least amount of copies.
import numpy as np 
    
def get_time_slot_rate(df):
    
    # much faster than counting
    if df.empty:
        return np.nan, np.nan, np.nan, np.nan

    # assuming df['label'] is either 0 or 1
    df = df.loc[df['label']]

    # create boolean selectors to be inverted with '~'
    weekdays = df['dayofweek']<=5
    slot_selector = df['time_slot']<=5
 
    weekday_count = np.sum(weekdays)
    try:
        work_14 = len(df.loc[weekdays & slot_selector])/weekday_count
        work_56 = len(df.loc[weekdays & ~slot_selector])/weekday_count
    except ZeroDivisionError: 
        work_14 = work_56 = np.nan

    weekend_count = np.sum(~weekdays)
    try: 
        weekend_14 = len(df.loc[~weekdays & slot_selector])/weekend_count
        weekend_56 = len(df.loc[~weekdays & ~slot_selector])/weekend_count
    except ZeroDivisionError: 
        weekend_14 = weekend_56 = np.nan 
       
    return work_14, work_56, weekend_14, weekend_56

The rest of your script doesn't really make sense, see my comments:

for i in range(len(lst_id)):
    if i%100==0:
        print(i)
    d_date = date[i]
 
    # what is d_t ?
    dt = d_t.datetime.strptime(d_date, '%Y-%m-%d')

    month_step = relativedelta(months=3)
    pre_date = str(dt - month_step).split(' ')[0]
    df_s = df.loc[(df['easy_id']==lst_easy[i]) 
                   & (df['delivery_date']>=pre_date)
                    &(df['delivery_date']< d_date)].reset_index(drop=True)
    
    # is it df or df_s ?
    work_14_rate, work_56_rate, weekend_14_rate, weekend_56_rate = get_time_slot_rate(df)

If your date column is a datetime object than you can compare dates directly (no need for strings).

Upvotes: 1

Related Questions