Difference of datetimes in hours, excluding the weekend

Question

I currently have a dataframe, where an uniqueID has multiple dates in another column. I want extract the hours between each date, but ignore the weekend if the next date is after the weekend. For example, if today is friday at 12 pm, and the following date is tuesday at 12 pm then the difference in hours between these two dates would be 48 hours.

Here is my dataset with the expected output:

df = pd.DataFrame({"UniqueID": ["A","A","A","B","B","B","C","C"],"Date":
["2018-12-07 10:30:00","2018-12-10 14:30:00","2018-12-11 17:30:00",
"2018-12-14 09:00:00","2018-12-18 09:00:00",
"2018-12-21 11:00:00","2019-01-01 15:00:00","2019-01-07 15:00:00"],
"ExpectedOutput": ["28.0","27.0","Nan","48.0","74.0","NaN","96.0","NaN"]})

df["Date"] = df["Date"].astype(np.datetime64)

This is what I have so far, but it includes the weekends:

df["date_diff"] = df.groupby(["UniqueID"])["Date"].apply(lambda x: x.diff() 
/ np.timedelta64(1 ,'h')).shift(-1)

Thanks!

jezrael · Accepted Answer

Idea is floor datetimes for remove times and get number of business days between start day + one day and shifted day to hours3 column by numpy.busday_count and then create hour1 and hour2 columns for start and end hours if not weekends hours. Last sum all hours columns together:

df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values(['UniqueID','Date'])

df["shifted"] = df.groupby(["UniqueID"])["Date"].shift(-1)
df["hours1"] = df["Date"].dt.floor('d') 
df["hours2"] = df["shifted"].dt.floor('d') 

mask = df['shifted'].notnull()
f = lambda x: np.busday_count(x['hours1'] + pd.Timedelta(1, unit='d'), x['hours2'])
df.loc[mask, 'hours3'] = df[mask].apply(f, axis=1) * 24

mask1 = df['hours1'].dt.dayofweek < 5
hours1 = df['hours1'] + pd.Timedelta(1, unit='d') - df['Date']
df['hours1'] = np.where(mask1, hours1, np.nan) / np.timedelta64(1 ,'h')

mask1 = df['hours2'].dt.dayofweek < 5
df['hours2'] = np.where(mask1, df['shifted']-df['hours2'], np.nan) / np.timedelta64(1 ,'h')

df['date_diff'] = df['hours1'].fillna(0) + df['hours2'] + df['hours3']

print (df)
  UniqueID                Date ExpectedOutput             shifted  hours1  \
0        A 2018-12-07 10:30:00           28.0 2018-12-10 14:30:00    13.5   
1        A 2018-12-10 14:30:00           27.0 2018-12-11 17:30:00     9.5   
2        A 2018-12-11 17:30:00            Nan                 NaT     6.5   
3        B 2018-12-14 09:00:00           48.0 2018-12-18 09:00:00    15.0   
4        B 2018-12-18 09:00:00           74.0 2018-12-21 11:00:00    15.0   
5        B 2018-12-21 11:00:00            NaN                 NaT    13.0   
6        C 2019-01-01 15:00:00           96.0 2019-01-07 15:00:00     9.0   
7        C 2019-01-07 15:00:00            NaN                 NaT     9.0   

   hours2  hours3  date_diff  
0    14.5     0.0       28.0  
1    17.5     0.0       27.0  
2     NaN     NaN        NaN  
3     9.0    24.0       48.0  
4    11.0    48.0       74.0  
5     NaN     NaN        NaN  
6    15.0    72.0       96.0  
7     NaN     NaN        NaN

First solution was removed with 2 reasons - was not accurate and slow:

np.random.seed(2019)

dates = pd.date_range('2015-01-01','2018-01-01', freq='H')
df = pd.DataFrame({"UniqueID": np.random.choice(list('ABCDEFGHIJ'), size=100),
                   "Date": np.random.choice(dates, size=100)})
print (df)

def old(df):
    df["Date"] = pd.to_datetime(df["Date"])
    df = df.sort_values(['UniqueID','Date'])

    df["shifted"] = df.groupby(["UniqueID"])["Date"].shift(-1)

    def f(x):
        a = pd.date_range(x['Date'],  x['shifted'], freq='T')
        return ((a.dayofweek < 5).sum() / 60).round()


    mask = df['shifted'].notnull()
    df.loc[mask, 'date_diff'] = df[mask].apply(f, axis=1)  
    return df

def new(df):
    df["Date"] = pd.to_datetime(df["Date"])
    df = df.sort_values(['UniqueID','Date'])

    df["shifted"] = df.groupby(["UniqueID"])["Date"].shift(-1)
    df["hours1"] = df["Date"].dt.floor('d') 
    df["hours2"] = df["shifted"].dt.floor('d') 

    mask = df['shifted'].notnull()
    f = lambda x: np.busday_count(x['hours1'] + pd.Timedelta(1, unit='d'), x['hours2'])
    df.loc[mask, 'hours3'] = df[mask].apply(f, axis=1) * 24

    mask1 = df['hours1'].dt.dayofweek < 5
    hours1 = df['hours1'] + pd.Timedelta(1, unit='d') - df['Date']
    df['hours1'] = np.where(mask1, hours1, np.nan) / np.timedelta64(1 ,'h')

    mask1 = df['hours2'].dt.dayofweek < 5
    df['hours2'] = np.where(mask1, df['shifted'] - df['hours2'], np.nan) / np.timedelta64(1 ,'h')

    df['date_diff'] = df['hours1'].fillna(0) + df['hours2'] + df['hours3']
    return df
print (new(df))
print (old(df))

In [44]: %timeit (new(df))
22.7 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [45]: %timeit (old(df))
1.01 s ± 8.03 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Difference of datetimes in hours, excluding the weekend

Answers (1)

Related Questions