user6302747
user6302747

Reputation:

Compare two csv files in python and retain headers of changes

I'm trying to compare two csv files in python and output the differences along with the headers of each column. So far, with what I'm doing, it outputs all columns instead of just the ones with differences

import csv

with open('firstfile.csv', 'r') as f1:
    file1 = f1.readlines()

with open('secondfile.csv', 'r') as f2:
    file2 = f2.readlines()

with open('results.csv', 'w') as outFile:
    outFile.write(file1[0])
    for line in file2:
        if line not in file1:
            outFile.write(line)

Upvotes: 0

Views: 1832

Answers (2)

sarvesh shukla
sarvesh shukla

Reputation: 1

import csv

def compareList(l1,l2):
   if(len(l1)==len(l2) and len(l1)==sum([1 for i,j in zip(l1,l2) if i==j])):
      return "Equal"
   else:
      return "Non equal"

file1 = "C:/Users/Sarvesh/Downloads/a.csv"
file2 = "C:/Users/Sarvesh/Downloads/b.csv"

with open(file1, 'r') as csv1, open(file2, 'r') as csv2:  # Import CSV files
    import1 = csv1.readlines()
    import2 = csv2.readlines()

    # creating an object of csv reader
    # with the delimiter as ,
    csv_reader = csv.reader(import1, delimiter='|')
    # list to store the names of columns
    list_of_column_name1 = []
    # loop to iterate through the rows of csv
    for row in csv_reader:
        # adding the first row
        list_of_column_name1.append(row)
        # breaking the loop after the
        # first iteration itself
        break

    csv_reader = csv.reader(import2, delimiter='|')
    # list to store the names of columns
    list_of_column_name2 = []
    # loop to iterate through the rows of csv
    for row in csv_reader:
        # adding the first row
        list_of_column_name2.append(row)
        # breaking the loop after the
        # first iteration itself
        break

# printing the result
print("1List of column names : ", list_of_column_name1[0])

print("2List of column names : ", list_of_column_name2[0])

print("First comparison",compareList(list_of_column_name1,list_of_column_name2))

Upvotes: 0

Ruslan  Galimov
Ruslan Galimov

Reputation: 256

I think this code resolves your problem

import sys

with open('file1.csv', 'r') as f1:
    file1 = f1.readlines()

with open('file2.csv', 'r') as f2:
    file2 = f2.readlines()

delimiter = '\t'  # Column delimiter in you file
headers_of_first_file = file1[0].strip().split(delimiter)
headers_of_second_file = file2[0].strip().split(delimiter)

# You can remove this assert if you want to work files with different columns then you have to add some more code in next blocks
different_headers = set(headers_of_first_file).symmetric_difference(headers_of_second_file)
if different_headers:
    print('Files have difference in headers: ', different_headers)
    sys.exit(-1)

# Build map {header: [all_values]}
first_file_map = {header: [] for header in headers_of_first_file}
for row in file1[1:]:
    for index, cell in enumerate(row.strip().split(delimiter)):
        first_file_map[headers_of_first_file[index]].append(cell)

# Check by built map. Dont forget that columns may change order
result = set()
for row in file2[1:]:
    for index, cell in enumerate(row.strip().split(delimiter)):
        if cell not in first_file_map[headers_of_second_file[index]]:
            result.add(headers_of_second_file[index])

with open('results.csv', 'w') as out_file:
    out_file.write('\t'.join(result))

UPD files example:

Column1 Column2 Column3 Column5 Column4
1   2   3   5   4
10  20  30  50  40

Column1 Column2 Column3 Column4 Column5
11  2   3   4   5
10  10  30  40  50

'\t' is delimiter

Upvotes: 1

Related Questions