Reputation:
I'm trying to compare two csv files in python and output the differences along with the headers of each column. So far, with what I'm doing, it outputs all columns instead of just the ones with differences
import csv
with open('firstfile.csv', 'r') as f1:
file1 = f1.readlines()
with open('secondfile.csv', 'r') as f2:
file2 = f2.readlines()
with open('results.csv', 'w') as outFile:
outFile.write(file1[0])
for line in file2:
if line not in file1:
outFile.write(line)
Upvotes: 0
Views: 1832
Reputation: 1
import csv
def compareList(l1,l2):
if(len(l1)==len(l2) and len(l1)==sum([1 for i,j in zip(l1,l2) if i==j])):
return "Equal"
else:
return "Non equal"
file1 = "C:/Users/Sarvesh/Downloads/a.csv"
file2 = "C:/Users/Sarvesh/Downloads/b.csv"
with open(file1, 'r') as csv1, open(file2, 'r') as csv2: # Import CSV files
import1 = csv1.readlines()
import2 = csv2.readlines()
# creating an object of csv reader
# with the delimiter as ,
csv_reader = csv.reader(import1, delimiter='|')
# list to store the names of columns
list_of_column_name1 = []
# loop to iterate through the rows of csv
for row in csv_reader:
# adding the first row
list_of_column_name1.append(row)
# breaking the loop after the
# first iteration itself
break
csv_reader = csv.reader(import2, delimiter='|')
# list to store the names of columns
list_of_column_name2 = []
# loop to iterate through the rows of csv
for row in csv_reader:
# adding the first row
list_of_column_name2.append(row)
# breaking the loop after the
# first iteration itself
break
# printing the result
print("1List of column names : ", list_of_column_name1[0])
print("2List of column names : ", list_of_column_name2[0])
print("First comparison",compareList(list_of_column_name1,list_of_column_name2))
Upvotes: 0
Reputation: 256
I think this code resolves your problem
import sys
with open('file1.csv', 'r') as f1:
file1 = f1.readlines()
with open('file2.csv', 'r') as f2:
file2 = f2.readlines()
delimiter = '\t' # Column delimiter in you file
headers_of_first_file = file1[0].strip().split(delimiter)
headers_of_second_file = file2[0].strip().split(delimiter)
# You can remove this assert if you want to work files with different columns then you have to add some more code in next blocks
different_headers = set(headers_of_first_file).symmetric_difference(headers_of_second_file)
if different_headers:
print('Files have difference in headers: ', different_headers)
sys.exit(-1)
# Build map {header: [all_values]}
first_file_map = {header: [] for header in headers_of_first_file}
for row in file1[1:]:
for index, cell in enumerate(row.strip().split(delimiter)):
first_file_map[headers_of_first_file[index]].append(cell)
# Check by built map. Dont forget that columns may change order
result = set()
for row in file2[1:]:
for index, cell in enumerate(row.strip().split(delimiter)):
if cell not in first_file_map[headers_of_second_file[index]]:
result.add(headers_of_second_file[index])
with open('results.csv', 'w') as out_file:
out_file.write('\t'.join(result))
UPD files example:
Column1 Column2 Column3 Column5 Column4
1 2 3 5 4
10 20 30 50 40
Column1 Column2 Column3 Column4 Column5
11 2 3 4 5
10 10 30 40 50
'\t' is delimiter
Upvotes: 1