Reputation: 89
I am trying to read two CSV files and compare the first column (ID) and see if there are duplicates. If they are duplicates I have to tell the user which ID's are duplicates and only copy the unique ID's values into the first csv.
first csv = transactions_ledgers.csv
(contains column headers) and has some values stored in it. Column[0] is ID.
second csv = transactions_q1.csv
(no column headers) the values on column[0] are ID's
So I need to read the second CSV to see if there are duplicates on the first CSV. If there are, I need to print the ids that are duplicated and only append the unique ids from csv 2 to csv 1.
Unfortunately, I can not use Pandas; it has to be using Python.
#Using set
with open('transactions_q1.csv') as check_file:
#header = next(check_file)
check_set = set([row.split(',')[0].strip().upper() for row in check_file])
with open('transactions_q2.csv', 'r') as in_file, open('unique_transaction_ledgers.csv', 'w') as out_file:
for line in in_file:
if line.split(',')[0].strip().upper() in check_set:
out_file.write(line)```
# Second Option
with open('transaction_ledger.csv','r') as file:
read = csv.reader(file)
header=next(file)
for line in read:
ids=line[0]
with open('transactions_q2.csv','r') as file:
read = csv.reader(file)
header=next(file)
for line in read:
ids2=line[0]
if ids == ids2:
print("oh no duplicates")
Here is the function I am trying to incorporate @craigb answer to my initial question.
The code that I am sharing is a function that ultimately does the following:
My current code is running but I am facing several inconveniences.
Thank you in advance for any guidance provided!
#Create the ImportFunction
#Import dependencies
def ImportFunction():
#This function is created for the user to be able to import new files into the main file and check for duplicate transactions
#Set counters, lists and sets that will be used in the code
ids= set()
total_rows = 0
duplicate_lines = []
duplicate=[]
#First step is to ask the user what file they would like to import
which_file = input("Which file would you like to import?")
#Using os.path.exists check if the 'main' file transaction_ledgeres.csv exists. if it does set varible to 'yes' if not set variable to 'no'
if os.path.exists('transaction_ledger.csv'):
main_file ='yes'
else:
main_file = 'no'
#Created a conditional statement to check if the "user choice" file existis or not
if os.path.exists(which_file):
is_file='yes'
else:
#Prompt the user the file doesnt exist in this file path
print(f"This file '{which_file}' does not exist.")
is_file='no'
#Created a conditional statement if the file does or doesn't exist to perform actions
if main_file == 'yes':
#If true then open the file and perform actions
if is_file == 'yes':
#Second step is to load the file into python.
with open(which_file,'r') as file:
user_file = file.read()
#print(user_file)
print("File Imported succesfully, checking for possible duplicate transactions...")
#Reading in the transaction_ledger.csv Since its true
with open('transaction_ledger.csv','r') as file:
trans_file = csv.reader(file)
header = next(trans_file) #This will skip the headers on CSV to be able to compare ID in column one and see if there is duplicate values
#Created loop with the help @craigb anwser on my quesiton on stackover overflow
for i in trans_file:
ids.add(i[0].strip().upper())
#NEW FILE WITH UNIQUE TRANSACTION CONTENT ONLY NO DUPLICATES with the help @craigb anwser on my quesiton on stackover overflow
with open(which_file,'r') as file, open('unique_transaction_ledgers.csv', 'w') as out_file,open('duplicated_transaction_ledgers.csv', 'w') as duplicated_file:
reader = csv.reader(file)
writer = csv.writer(out_file)
writer2 = csv.writer(duplicated_file)
for line in reader:
if line[0].strip().upper() in ids:
total_rows += 1
writer.writerow(line)
duplicate.append(line[0])
print(f"The following transactions have already been recorded: {duplicate} ")
print(f"Number of current transactions:{total_rows-len(duplicate)}")
for line in reader:
if line[0].strip().upper() not in ids:
total_rows += 1
duplicate_lines.append(line[0])
writer2.writerow(line)
print(f"The following transactions have already been recorded: {line} ")
print(f"Number of current transactions:{total_rows-len(duplicate_lines)}")# this is always zero in the terminal and it should not be
#If file doesn't exist set flag to true
else:
flag = True
#If the main file does not exist then create it and perform the actions needed
if main_file == 'no':
#Open the user file and read it to be able to append to the new empty file transaction_ledger.csv
with open(which_file,'r') as old_file:
reader_obj = csv.reader(old_file) #read the current csv file
with open('transaction_ledger.csv', 'w') as new_file:
writer_obj = csv.writer(new_file, delimiter=",",lineterminator = "\n")
header = ['ID', 'COMPANY NAME', 'DATE', 'AMOUNT', 'STATUS']
writer_obj.writerow(header)
for data in reader_obj:
total_rows += 1
#loop through the read data and write each row in transaction_Ledger.csv
writer_obj.writerow(data)
print("New file created and filled in with imported file data")
print(f"Number of current transactions:{total_rows}")
#Create boolean value for while loop name it flag
flag = True
#Created while loop
while flag:
#Ask the user what action they want to perform.
user_selection = input("What would you like to perform (import/statistics) or end the simulation?")
# Conditional - if they choose stats then call the function statistics
if user_selection.strip().lower() == 'statistics':
statistics()
#Ask user if they want to run another analysis
repeat1 = input('Would you like to run any further analyses (yes/no)?' )
if 'y' in repeat1.strip().lower():
#ask user what they want to do
next_step = input("What would you like to perform (import/statistics)?")
if next_step.lower().strip() == 'statistics':
#if statistics than run the statistics function
statistics()
#keep flag true to loop again if the user wants to
flag = True
#If import run the ImportFunction and keep flag true
elif next_step.lower().strip() == 'import':
ImportFunction()
flag = True
#If they don't want to continue then end the simulation and turn flag False
else:
print("Thanks for using this simulation")
flag = False
#Conditional - if they choose the import function then call the ImportFunction
elif user_selection.lower().strip() == 'import':
ImportFunction()
#Ask user if they want to run another analysis
repeat = input('Would you like to run any further analyses (yes/no)?' )
if 'y' in repeat.strip().lower():
flag = True
else:
print("Thanks for using this simulation")
flag = False
elif 'e' in user_selection[0].lower().strip():
print("Thanks for using this simulation")
flag = False
#If the user inputs an invalid operation then prompt them that the program doesn't perform the action and ask them to try again
else:
print("Sorry, this program can only run the following two functions: 1. Import 2. Statistics or you can end the simulation")
flag = True
'''
Upvotes: 0
Views: 567
Reputation: 1091
Thanks for writing some code. The first option works as is, although it writes IDs that are common to both files. Here's the code with that logic reversed - it only writes lines whose IDs are in transactions_q2.csv
but not in transactions_q1.csv
(just replace in
with not in
):
#Using set
with open('transactions_q1.csv') as check_file:
check_set = set([row.split(',')[0].strip().upper() for row in check_file])
with open('transactions_q2.csv', 'r') as in_file, open('unique_transaction_ledgers.csv', 'w') as out_file:
for line in in_file:
if line.split(',')[0].strip().upper() not in check_set:
out_file.write(line)
Your 2nd option using the csv
module doesn't gather up the IDs from the first file. Here are some changes that builds a set and checks it like the first option (I removed the header()
code - put that back in if there are headers; I wanted this to work the same as option #1):
# Second Option
ids = set()
with open('transactions_q1.csv','r') as file:
read = csv.reader(file)
for line in read:
ids.add(line[0].strip().upper())
with open('transactions_q2.csv','r') as file, open('unique_transaction_ledgers.csv', 'w') as out_file:
reader = csv.reader(file)
writer = csv.writer(out_file)
for line in reader:
if line[0].strip().upper() not in ids:
writer.writerow(line)
Your third option isn't complete and it's hard to understand what you are trying to do.
Upvotes: 2