Reputation: 3097
I have 2 dataframes, each having a column with datatime as datatype. I want to join second dataframe with first with the following conditions
find the rows of first dataframe whose datetime value between datetime value of second dataframe and 10 minutes before
if more than one such rows, then take the first one
if no such row, then fill with empty or null
One row can be joined only once.
Now I am doing in the following way. I would like to know, if there are any better ways to reduce the total running time.
from datetime import datetime
import datetime as dt
import pandas as pd
df1 = pd.DataFrame(columns = ['Enter_Time', 'Unique_Id'])
df1.loc[len(df1)] = [datetime.strptime('2018-10-01 06:29:00','%Y-%m-%d %H:%M:%S'), 'A']
df1.loc[len(df1)] = [datetime.strptime('2018-10-01 06:30:00','%Y-%m-%d %H:%M:%S'), 'B']
df1.loc[len(df1)] = [datetime.strptime('2018-10-01 06:31:00','%Y-%m-%d %H:%M:%S'), 'C']
df1.loc[len(df1)] = [datetime.strptime('2018-10-01 06:32:00','%Y-%m-%d %H:%M:%S'), 'D']
df1.loc[len(df1)] = [datetime.strptime('2018-10-01 06:33:00','%Y-%m-%d %H:%M:%S'), 'E']
df1.loc[len(df1)] = [datetime.strptime('2018-10-01 08:29:00','%Y-%m-%d %H:%M:%S'), 'F']
df1.loc[len(df1)] = [datetime.strptime('2018-10-01 08:30:00','%Y-%m-%d %H:%M:%S'), 'G']
df1.loc[len(df1)] = [datetime.strptime('2018-10-01 08:31:00','%Y-%m-%d %H:%M:%S'), 'H']
df1.loc[len(df1)] = [datetime.strptime('2018-10-01 08:32:00','%Y-%m-%d %H:%M:%S'), 'I']
df1.loc[len(df1)] = [datetime.strptime('2018-10-01 08:33:00','%Y-%m-%d %H:%M:%S'), 'j']
df2 = pd.DataFrame(columns = ['Transaction_Time', 'Amount'])
df2.loc[len(df2)] = [datetime.strptime('2018-10-01 06:40:00','%Y-%m-%d %H:%M:%S'), 10.25]
df2.loc[len(df2)] = [datetime.strptime('2018-10-01 07:40:00','%Y-%m-%d %H:%M:%S'), 3.96]
df2.loc[len(df2)] = [datetime.strptime('2018-10-01 08:31:00','%Y-%m-%d %H:%M:%S'), 9.65]
df2.loc[len(df2)] = [datetime.strptime('2018-10-01 08:32:00','%Y-%m-%d %H:%M:%S'), 2.84]
df3 = pd.DataFrame(columns = ['Transaction_Time', 'Amount', 'Enter_Time', 'Unique_Id'])
for id, row in df2.iterrows():
Transaction_Time = row['Transaction_Time']
Transaction_Time_Before = Transaction_Time - dt.timedelta(seconds = 600)
Result_Row = {
'Transaction_Time' : row['Transaction_Time'],
'Amount' : row['Amount'],
'Enter_Time' : '',
'Unique_Id' : ''
}
dfFiletered = df1[(df1["Enter_Time"] < Transaction_Time) & (df1["Enter_Time"] >= Transaction_Time_Before)].sort_values(by= ['Enter_Time'],ascending=True)
if len(dfFiletered) > 0:
firstRow = dfFiletered.iloc[0]
Result_Row['Enter_Time'] = firstRow['Enter_Time']
Result_Row['Unique_Id'] = firstRow['Unique_Id']
df1.drop(df1[df1["Unique_Id"] == firstRow['Unique_Id']].index, inplace=True)
df3.loc[len(df3)] = Result_Row
print(df3)
Upvotes: 2
Views: 60
Reputation: 27869
You can use merge_asof():
pd.merge_asof(df1,
df2,
left_on='Enter_Time',
right_on='Transaction_Time',
tolerance=pd.Timedelta('10m'),
direction='forward')
And it would produce:
# Enter_Time Unique_Id Transaction_Time Amount
#0 2018-10-01 06:29:00 A NaT NaN
#1 2018-10-01 06:30:00 B 2018-10-01 06:40:00 10.25
#2 2018-10-01 06:31:00 C 2018-10-01 06:40:00 10.25
#3 2018-10-01 06:32:00 D 2018-10-01 06:40:00 10.25
#4 2018-10-01 06:33:00 E 2018-10-01 06:40:00 10.25
#5 2018-10-01 08:29:00 F 2018-10-01 08:31:00 9.65
#6 2018-10-01 08:30:00 G 2018-10-01 08:31:00 9.65
#7 2018-10-01 08:31:00 H 2018-10-01 08:31:00 9.65
#8 2018-10-01 08:32:00 I 2018-10-01 08:32:00 2.84
#9 2018-10-01 08:33:00 j NaT NaN
And to keep only first use:
df = pd.merge_asof(df1,
df2,
left_on='Enter_Time',
right_on='Transaction_Time',
tolerance=pd.Timedelta('10m'),
direction='forward')
df.loc[df.duplicated(['Transaction_Time', 'Amount']), ['Transaction_Time', 'Amount']] = (np.nan, np.nan)
df
# Enter_Time Unique_Id Transaction_Time Amount
#0 2018-10-01 06:29:00 A NaT NaN
#1 2018-10-01 06:30:00 B 2018-10-01 06:40:00 10.25
#2 2018-10-01 06:31:00 C NaT NaN
#3 2018-10-01 06:32:00 D NaT NaN
#4 2018-10-01 06:33:00 E NaT NaN
#5 2018-10-01 08:29:00 F 2018-10-01 08:31:00 9.65
#6 2018-10-01 08:30:00 G NaT NaN
#7 2018-10-01 08:31:00 H NaT NaN
#8 2018-10-01 08:32:00 I 2018-10-01 08:32:00 2.84
#9 2018-10-01 08:33:00 j NaT NaN
EDIT
To merge df2
with df1
I guess you need to leave default direction ('backward'
):
df = pd.merge_asof(df2,
df1,
left_on='Transaction_Time',
right_on='Enter_Time',
tolerance=pd.Timedelta('10m'))
df.loc[df.duplicated(['Transaction_Time', 'Amount']), ['Transaction_Time', 'Amount']] = (np.nan, np.nan)
# Transaction_Time Amount Enter_Time Unique_Id
#0 2018-10-01 06:40:00 10.25 2018-10-01 06:33:00 E
#1 2018-10-01 07:40:00 3.96 NaT NaN
#2 2018-10-01 08:31:00 9.65 2018-10-01 08:31:00 H
#3 2018-10-01 08:32:00 2.84 2018-10-01 08:32:00 I
The transformation of duplicated doesn't affect your example, but it is there to solve issue in question.
Upvotes: 2