Reputation:
Trying to find some insides in two data frames,know that loops are not solution in pandas and using two sheets with 15k rows each. How can I improve the speed on the code that follows? Its not possible to use merge because after matching the condition row from err need to be removed in order to don't be matched again. It takes 140 min with 16k rows.
import pandas as pd
scr = pd.DataFrame({
'Product':['10101.A', '10101.A', '10101.A', '10147.A', '10147.A', '10147.A', '10147.A','10147.A'],
'Source Handling Unit':['7000000051481339', '7000000051481342', '7000000051722237','7000000051530150','7000000051530152', '7000000051530157', '7000000051546193', '7000000051761150'],
'Available Qty BUoM':[1,1,1,1,1,1,1,1],
'Confirmation Date':['10-5-2022', '10-5-2022', '9-5-2022', '6-5-2022', '6-5-2022', '6-5-2022', '6-5-2022', '11-5-2022']
})
err = pd.DataFrame({
'Posting Date':['4-5-2022','6-5-2022','11-5-2022','11-5-2022','11-5-2022','11-5-2022','11-5-2022','11-5-2022','11-5-2022','13-5-2022','15-5-2022','16-5-2022','25-5-2022'],
'Product':['10101.A', '10147.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10147.A'],
'Reason':['L400', 'CCIV', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'L400', 'L400'],
'Activity Area':['A970', 'D300', 'A990', 'A990', 'A990', 'A990', 'A990', 'A990','A990', 'A990', 'A990','A970','A970'],
'Difference Quantity':[1, 5, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, 1]
})
#Creating filter for scr
filt_scr_col = ['Product', 'Source Handling Unit', 'Available Qty BUoM', 'Confirmation Date']
#Applying filter
scr = scr[filt_scr_col]
#Creating filter for err
filt_post_col = ['Posting Date', 'Product', 'Reason', 'Activity Area', 'Difference Quantity']
#Applying filter
err = err[filt_post_col]
#Replace empty characters with underscore
scr.columns = scr.columns.str.replace(' ', '_')
err.columns = err.columns.str.replace(' ', '_')
#Creating filter to extract A450 rows from postings
filt = err['Activity_Area'] == 'A450'
#Assign A450 rows to new dataframe a450
a450 = err.loc[filt]
#.groupby 'Posting_Date', 'Product','Reason' but when I pass an argument as_index = False doent aggregate Products returns relevant object
#.query gets Difference_Quantity > 0 this evaluates and refers to exact row
a450 = (
a450.groupby(['Posting_Date', 'Product','Reason'], as_index = False, sort = False).sum()
.query('Difference_Quantity > 0')
)
#Creating filter to remove all < 0
filt = err['Difference_Quantity'] > 0
#Applying filter to dataframe
err = err.loc[filt]
#Removing column 'Activity_Area' from err don't need it and columns will match when appent with a450
err = err.drop(columns='Activity_Area')
#Concat err and a450
err = pd.concat([err, a450], ignore_index= True)
scr['Confirmation_Date'] = pd.to_datetime(scr['Confirmation_Date'], format = "%d-%m-%Y")
err['Posting_Date'] = pd.to_datetime(err['Posting_Date'], format = "%d-%m-%Y")
scr = scr.sort_values(by='Product', ascending=True)
err = err.sort_values(by='Posting_Date', ascending=True)
scr['Reason'] = None
match = []
for i, row in scr.iterrows():
for e, erow in err.iterrows():
if (row['Product'] == erow['Product']) & (erow['Posting_Date'] >= row['Confirmation_Date']) & (erow['Difference_Quantity'] - row['Available_Qty_BUoM'] >= 0):
row['Reason'] = err['Reason'][e]
err['Difference_Quantity'][e]-= row['Available_Qty_BUoM']
row_to_dict = row.to_dict()
match.append(row_to_dict)
break
report = pd.DataFrame(match)
report = report[['Product', 'Source_Handling_Unit', 'Available_Qty_BUoM', 'Confirmation_Date', 'Reason']]
#Pandas think that report['Source_Handling_Unit'] has floats and love to round them
report = report.astype({'Source_Handling_Unit': str})
report
Upvotes: 1
Views: 110
Reputation: 13242
Setup:
scr = pd.DataFrame({'id' : ['10101.A', '10101.A', '10101.A'],'date' : ['10-5-2022', '10-5-2022', '9-5-2022'], 'qty': [1, 1, 1]})
err = pd.DataFrame({'id' : ['10101.A', '10101.A', '10101.A'], 'date' : ['4-5-2022', '13-5-2022', '16-5-2022'],'qty': [1, 1, 1], 'r':['a', 'b', 'c']})
scr['date'] = pd.to_datetime(scr['date'], format = "%d-%m-%Y")
err['date'] = pd.to_datetime(err['date'], format = "%d-%m-%Y")
Doing:
df = scr.merge(err, on='id', suffixes=[None, '_er']).drop_duplicates()
report = df[df['date_er'].ge(df['date']) & df['qty_er'].sub(df['qty']).ge(0)]
print(report[['id', 'date', 'qty', 'r']])
Output:
id date qty r
1 10101.A 2022-05-10 1 b
2 10101.A 2022-05-10 1 c
7 10101.A 2022-05-09 1 b
8 10101.A 2022-05-09 1 c
Another approach would be a sql-like one, this may be more efficient on a large dataset:
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())
query = '''
select distinct scr.id, scr.date, scr.qty, r
from scr
join err
on scr.id == err.id
and err.date >= scr.date
and err.qty - scr.qty >= 0
'''
df = pysqldf(query)
df.date = pd.to_datetime(df.date)
print(df)
...
id date qty r
0 10101.A 2022-05-10 1 b
1 10101.A 2022-05-10 1 c
2 10101.A 2022-05-09 1 b
3 10101.A 2022-05-09 1 c
Upvotes: 2
Reputation: 6564
If I understood you right, the bellow code is quick without loops:
import pandas as pd
scr = pd.DataFrame({'id' : ['1a', '1a', '1a', '1b'], 'date' : ['10-5-2022', '10-5-2022', '9-5-2022', '11-5-2022'], 'qty': [1, 1, 2, 2]})
err = pd.DataFrame({'id' : ['1a', '1a', '1a', '2s'], 'date' : ['4-5-2022', '13-5-2022', '16-5-2022', '12-3-2022'], 'qty': [1, 1, 3, 5], 'r':['a', 'b', 'c', 'a']})
scr['date'] = pd.to_datetime(scr['date'], format = "%d-%m-%Y")
err['date'] = pd.to_datetime(err['date'], format = "%d-%m-%Y")
# merging the two dataframes
arr = scr.merge(err, on='id', how = 'left')
# filtering
arr = arr[(arr['date_x']<=arr['date_y']) & (arr['qty_x'] == arr['qty_y'])]
arr
Upvotes: 0