Reputation: 529
I have two excel files like below-
File1.xlsx
File2.xlsx
I want to compare each row and each column and eliminate common rows from both files (except the column heading). The output should look like below -
File1.xlsx
File2.xlsx
My code-
import pandas
from datetime import datetime
import numpy
df = pandas.read_excel('File1.xlsx')
FORMAT = ['col1','col2','col3']
df_selected = df[FORMAT]
df2 = pandas.read_excel('File2.xlsx')
FORMAT2 = ['col1','col2','col3']
df_selected2 = df2[FORMAT2]
def compare(row1,row2):
# print(row1,row2)
flag = False
i = 0
for t1, t2 in zip(row1, row2):
#if it is first value
#check dates
if(i==0):
formattedDT = datetime.strptime(t1, '%d-%b-%Y')
if(formattedDT!=t2):
flag = True
break
elif(t1!=t2):
flag = True
break
i=i+1
if(flag is False):
return 'matched'
else:
return 'mismatched'
def iterate(array1, array2):
result = array1
for index, row1 in enumerate(array1):
for row2 in array2:
if(compare(row1,row2)=='matched'):
del result[index]
break
df = pandas.DataFrame(result)
df.columns=['col1','col2','col3']
writer = pandas.ExcelWriter('output.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='welcome', index=False)
writer.save()
iterate(df_selected.values,df_selected2.values)
But I am getting this error output-
ValueError: cannot delete array elements
Please help.
Upvotes: 1
Views: 2511
Reputation: 470
You can try this solution .It should give you desired result
df1 =pd.read_excel('File1.xlsx',parse_dates=['col1'],sep='\t')
df2 = pd.read_excel('File2.xlsx',parse_dates=['col1'],sep='\t',dayfirst=True)
temp_df = pd.concat([df1,df2])
temp_df.reset_index(inplace=True,drop=True)
temp_df.drop_duplicates(subset=['col1','col2','col3'], keep=False, inplace=True)
final_df1 = pd.merge(df1, temp_df, how='inner', on=['col1','col2','col3'])
final_df2 = pd.merge(df2, temp_df, how='inner', on=['col1','col2','col3'])
final_df1.to_excel('file_1_modified.xlsx',index=False)
final_df2.to_excel('file_2_modified.xlsx',index=False)
Below are snapshot for my result.Please note I have read from csv files ..so I have used pd.read_csv() on my snapshot. But it will give same result
Upvotes: 1