dko512
dko512

Reputation: 431

Difference of datetimes in hours, excluding the weekend

I currently have a dataframe, where an uniqueID has multiple dates in another column. I want extract the hours between each date, but ignore the weekend if the next date is after the weekend. For example, if today is friday at 12 pm, and the following date is tuesday at 12 pm then the difference in hours between these two dates would be 48 hours.

Here is my dataset with the expected output:

df = pd.DataFrame({"UniqueID": ["A","A","A","B","B","B","C","C"],"Date":
["2018-12-07 10:30:00","2018-12-10 14:30:00","2018-12-11 17:30:00",
"2018-12-14 09:00:00","2018-12-18 09:00:00",
"2018-12-21 11:00:00","2019-01-01 15:00:00","2019-01-07 15:00:00"],
"ExpectedOutput": ["28.0","27.0","Nan","48.0","74.0","NaN","96.0","NaN"]})

df["Date"] = df["Date"].astype(np.datetime64)

This is what I have so far, but it includes the weekends:

df["date_diff"] = df.groupby(["UniqueID"])["Date"].apply(lambda x: x.diff() 
/ np.timedelta64(1 ,'h')).shift(-1)

Thanks!

Upvotes: 2

Views: 354

Answers (1)

jezrael
jezrael

Reputation: 863281

Idea is floor datetimes for remove times and get number of business days between start day + one day and shifted day to hours3 column by numpy.busday_count and then create hour1 and hour2 columns for start and end hours if not weekends hours. Last sum all hours columns together:

df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values(['UniqueID','Date'])

df["shifted"] = df.groupby(["UniqueID"])["Date"].shift(-1)
df["hours1"] = df["Date"].dt.floor('d') 
df["hours2"] = df["shifted"].dt.floor('d') 

mask = df['shifted'].notnull()
f = lambda x: np.busday_count(x['hours1'] + pd.Timedelta(1, unit='d'), x['hours2'])
df.loc[mask, 'hours3'] = df[mask].apply(f, axis=1) * 24

mask1 = df['hours1'].dt.dayofweek < 5
hours1 = df['hours1'] + pd.Timedelta(1, unit='d') - df['Date']
df['hours1'] = np.where(mask1, hours1, np.nan) / np.timedelta64(1 ,'h')

mask1 = df['hours2'].dt.dayofweek < 5
df['hours2'] = np.where(mask1, df['shifted']-df['hours2'], np.nan) / np.timedelta64(1 ,'h')

df['date_diff'] = df['hours1'].fillna(0) + df['hours2'] + df['hours3']

print (df)
  UniqueID                Date ExpectedOutput             shifted  hours1  \
0        A 2018-12-07 10:30:00           28.0 2018-12-10 14:30:00    13.5   
1        A 2018-12-10 14:30:00           27.0 2018-12-11 17:30:00     9.5   
2        A 2018-12-11 17:30:00            Nan                 NaT     6.5   
3        B 2018-12-14 09:00:00           48.0 2018-12-18 09:00:00    15.0   
4        B 2018-12-18 09:00:00           74.0 2018-12-21 11:00:00    15.0   
5        B 2018-12-21 11:00:00            NaN                 NaT    13.0   
6        C 2019-01-01 15:00:00           96.0 2019-01-07 15:00:00     9.0   
7        C 2019-01-07 15:00:00            NaN                 NaT     9.0   

   hours2  hours3  date_diff  
0    14.5     0.0       28.0  
1    17.5     0.0       27.0  
2     NaN     NaN        NaN  
3     9.0    24.0       48.0  
4    11.0    48.0       74.0  
5     NaN     NaN        NaN  
6    15.0    72.0       96.0  
7     NaN     NaN        NaN  

First solution was removed with 2 reasons - was not accurate and slow:

np.random.seed(2019)

dates = pd.date_range('2015-01-01','2018-01-01', freq='H')
df = pd.DataFrame({"UniqueID": np.random.choice(list('ABCDEFGHIJ'), size=100),
                   "Date": np.random.choice(dates, size=100)})
print (df)

def old(df):
    df["Date"] = pd.to_datetime(df["Date"])
    df = df.sort_values(['UniqueID','Date'])

    df["shifted"] = df.groupby(["UniqueID"])["Date"].shift(-1)

    def f(x):
        a = pd.date_range(x['Date'],  x['shifted'], freq='T')
        return ((a.dayofweek < 5).sum() / 60).round()


    mask = df['shifted'].notnull()
    df.loc[mask, 'date_diff'] = df[mask].apply(f, axis=1)  
    return df

def new(df):
    df["Date"] = pd.to_datetime(df["Date"])
    df = df.sort_values(['UniqueID','Date'])

    df["shifted"] = df.groupby(["UniqueID"])["Date"].shift(-1)
    df["hours1"] = df["Date"].dt.floor('d') 
    df["hours2"] = df["shifted"].dt.floor('d') 

    mask = df['shifted'].notnull()
    f = lambda x: np.busday_count(x['hours1'] + pd.Timedelta(1, unit='d'), x['hours2'])
    df.loc[mask, 'hours3'] = df[mask].apply(f, axis=1) * 24

    mask1 = df['hours1'].dt.dayofweek < 5
    hours1 = df['hours1'] + pd.Timedelta(1, unit='d') - df['Date']
    df['hours1'] = np.where(mask1, hours1, np.nan) / np.timedelta64(1 ,'h')

    mask1 = df['hours2'].dt.dayofweek < 5
    df['hours2'] = np.where(mask1, df['shifted'] - df['hours2'], np.nan) / np.timedelta64(1 ,'h')

    df['date_diff'] = df['hours1'].fillna(0) + df['hours2'] + df['hours3']
    return df
print (new(df))
print (old(df))

In [44]: %timeit (new(df))
22.7 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [45]: %timeit (old(df))
1.01 s ± 8.03 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Upvotes: 2

Related Questions