user18977534
user18977534

Reputation:

Improving iteration through data frames

I am trying to find some insights across two data frames. I know that loops are not the solution in pandas, and I am working with two sheets of 15k rows each. How can I improve the speed of the code that follows? It is not possible to use a plain merge because, after a row from err matches the condition, it needs to be removed (consumed) so it cannot be matched again. The current code takes 140 minutes with 16k rows.

import pandas as pd

# --- Sample data (stands in for the two ~15k-row sheets) ---
scr = pd.DataFrame({
    'Product':['10101.A', '10101.A', '10101.A', '10147.A', '10147.A', '10147.A', '10147.A','10147.A'],
    'Source Handling Unit':['7000000051481339', '7000000051481342', '7000000051722237','7000000051530150','7000000051530152', '7000000051530157', '7000000051546193', '7000000051761150'],
    'Available Qty BUoM':[1,1,1,1,1,1,1,1],
    'Confirmation Date':['10-5-2022', '10-5-2022', '9-5-2022', '6-5-2022', '6-5-2022', '6-5-2022', '6-5-2022', '11-5-2022']
    })

err = pd.DataFrame({
    'Posting Date':['4-5-2022','6-5-2022','11-5-2022','11-5-2022','11-5-2022','11-5-2022','11-5-2022','11-5-2022','11-5-2022','13-5-2022','15-5-2022','16-5-2022','25-5-2022'],
    'Product':['10101.A', '10147.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10147.A'],
    'Reason':['L400', 'CCIV', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'L400', 'L400'],
    'Activity Area':['A970', 'D300', 'A990', 'A990', 'A990', 'A990', 'A990', 'A990','A990', 'A990', 'A990','A970','A970'],
    'Difference Quantity':[1, 5, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, 1]
})

# Keep only the columns each frame needs, in a fixed order.
scr = scr[['Product', 'Source Handling Unit', 'Available Qty BUoM', 'Confirmation Date']]
err = err[['Posting Date', 'Product', 'Reason', 'Activity Area', 'Difference Quantity']]

# Underscored column names so itertuples() can expose them as attributes.
scr.columns = scr.columns.str.replace(' ', '_')
err.columns = err.columns.str.replace(' ', '_')

# A450 postings are aggregated per (date, product, reason); only net-positive
# totals are kept.  Summing just Difference_Quantity avoids the accidental
# string-concatenation of Activity_Area that a bare .sum() would perform.
a450 = (
    err.loc[err['Activity_Area'] == 'A450']
    .groupby(['Posting_Date', 'Product', 'Reason'], as_index=False, sort=False)
    ['Difference_Quantity'].sum()
    .query('Difference_Quantity > 0')
)

# Non-A450 postings: keep only positive quantities.  Activity_Area is no
# longer needed and would not line up with the aggregated a450 columns.
err = err.loc[err['Difference_Quantity'] > 0].drop(columns='Activity_Area')

# Recombine plain and aggregated postings.
err = pd.concat([err, a450], ignore_index=True)

scr['Confirmation_Date'] = pd.to_datetime(scr['Confirmation_Date'], format="%d-%m-%Y")
err['Posting_Date'] = pd.to_datetime(err['Posting_Date'], format="%d-%m-%Y")

scr = scr.sort_values(by='Product', ascending=True)
err = err.sort_values(by='Posting_Date', ascending=True)

# Bucket the date-sorted postings by product.  Each entry is a mutable
# [posting_date, reason, remaining_qty]: consuming a posting just decrements
# remaining_qty in the list.  This replaces the chained-assignment write-back
# into the DataFrame (`err['Difference_Quantity'][e] -= ...`, which is
# deprecated and silently broken under pandas copy-on-write) and shrinks the
# O(len(scr) * len(err)) scan to same-product candidates only.
buckets = {}
for post in err.itertuples(index=False):
    buckets.setdefault(post.Product, []).append(
        [post.Posting_Date, post.Reason, post.Difference_Quantity]
    )

# For each confirmation, consume the first posting of the same product dated
# on/after the confirmation that still has enough remaining quantity.
match = []
for row in scr.itertuples(index=False):
    for entry in buckets.get(row.Product, ()):
        if entry[0] >= row.Confirmation_Date and entry[2] >= row.Available_Qty_BUoM:
            entry[2] -= row.Available_Qty_BUoM
            match.append({**row._asdict(), 'Reason': entry[1]})
            break

report = pd.DataFrame(match)
report = report[['Product', 'Source_Handling_Unit', 'Available_Qty_BUoM', 'Confirmation_Date', 'Reason']]

# Handling units are identifiers, not numbers: force str so pandas cannot
# coerce them to floats and round them.
report = report.astype({'Source_Handling_Unit': str})
report

Upvotes: 1

Views: 110

Answers (2)

BeRT2me
BeRT2me

Reputation: 13242

Setup:

# Toy frames mirroring the question's scr (confirmations) and err (postings).
scr = pd.DataFrame(
    {
        'id': ['10101.A', '10101.A', '10101.A'],
        'date': ['10-5-2022', '10-5-2022', '9-5-2022'],
        'qty': [1, 1, 1],
    }
)
err = pd.DataFrame(
    {
        'id': ['10101.A', '10101.A', '10101.A'],
        'date': ['4-5-2022', '13-5-2022', '16-5-2022'],
        'qty': [1, 1, 1],
        'r': ['a', 'b', 'c'],
    }
)
# Parse the day-month-year strings into proper datetimes.
for _frame in (scr, err):
    _frame['date'] = pd.to_datetime(_frame['date'], format="%d-%m-%Y")

Doing:

# Pair every scr row with every err row sharing the same id (dropping
# duplicate pairs), then keep pairs whose err date is on/after the scr date
# and whose err quantity covers the scr quantity.
df = scr.merge(err, on='id', suffixes=[None, '_er']).drop_duplicates()
keep = (df['date_er'] >= df['date']) & (df['qty_er'] - df['qty'] >= 0)
report = df.loc[keep]
print(report[['id', 'date', 'qty', 'r']])

Output:

        id       date  qty  r
1  10101.A 2022-05-10    1  b
2  10101.A 2022-05-10    1  c
7  10101.A 2022-05-09    1  b
8  10101.A 2022-05-09    1  c

Another approach would be an SQL-like one; this may be more efficient on a large dataset:

# pandasql (third-party package) runs SQLite syntax directly against
# in-memory DataFrames.
from pandasql import sqldf
# Helper: evaluate a SQL string against the DataFrames visible in module
# scope (scr and err are picked up via globals()).
pysqldf = lambda q: sqldf(q, globals())

# Same join/filter as the merge approach, expressed in SQL: pair rows on id,
# require the err date to be on/after the scr date, and require err.qty to
# cover scr.qty; DISTINCT stands in for drop_duplicates().
query = '''
    select distinct scr.id, scr.date, scr.qty, r
    from scr
    join err
    on scr.id == err.id
    and err.date >= scr.date
    and err.qty - scr.qty >= 0
'''

df = pysqldf(query)
# sqldf returns dates as strings; restore the datetime dtype.
df.date = pd.to_datetime(df.date)
print(df)

...

        id       date  qty  r
0  10101.A 2022-05-10    1  b
1  10101.A 2022-05-10    1  c
2  10101.A 2022-05-09    1  b
3  10101.A 2022-05-09    1  c

Upvotes: 2

gtomer
gtomer

Reputation: 6564

If I understood you correctly, the code below is quick and avoids loops:

import pandas as pd

# Toy data: confirmations (scr) and postings with a reason code (err),
# keyed by product id.
scr = pd.DataFrame({'id' : ['1a', '1a', '1a', '1b'], 'date' : ['10-5-2022', '10-5-2022', '9-5-2022', '11-5-2022'], 'qty': [1, 1, 2, 2]})
err = pd.DataFrame({'id' : ['1a', '1a', '1a', '2s'], 'date' : ['4-5-2022', '13-5-2022', '16-5-2022', '12-3-2022'], 'qty': [1, 1, 3, 5], 'r':['a', 'b', 'c', 'a']})
# Parse day-month-year strings into datetimes.
for frame in (scr, err):
    frame['date'] = pd.to_datetime(frame['date'], format="%d-%m-%Y")
# Left-join every scr row onto the err rows with the same id...
arr = scr.merge(err, on='id', how = 'left')
# ...then keep only pairs where the posting date is on/after the
# confirmation date and the quantities match exactly.
mask = (arr['date_x'] <= arr['date_y']) & (arr['qty_x'] == arr['qty_y'])
arr = arr.loc[mask]
arr

Upvotes: 0

Related Questions