Reputation: 5
I am trying to read multiple XML files and extract data from them. I want to extract two sets of data and save them into two separate CSV files.
The extractData function gives me a list of data from a single file, and in the createCSV function I keep only the data I require.
I want to save the extracted data from all the files I read into one CSV file, but currently only the data from the last file is saved.
import json
import os
import pandas as pd
import numpy as np
import bs4
import glob
import csv

def extractData(path):
    for filename in glob.glob(os.path.join(path, '*.xml')):
        genre = bs4.BeautifulSoup(open(filename, 'r', encoding="utf8"), features="lxml")
        #print(genre)
        if genre.find_all("name") == []:
            print('Not Available')
        else:
            tags = genre.find_all("name")
            genre_list = []
            for name in tags:
                genres = name.text.strip()
                genre_list.append(genres)
            #print(genre_list)
    return genre_list


def createCSV(list_genre):
    new_artist_list = []
    new_genre_list = []
    complete_list = pd.DataFrame(list_genre)
    new_col = len(complete_list)
    #print(complete_list)
    #print(new_col)
    if new_col == 2:
        #for complete_list in complete_lists:
        column_names_1 = ["Song", "Artist"]
        final_list_1 = complete_list.T
        final_list_1.columns = column_names_1
        #print(final_list_1)
        new_artist_list.append(final_list_1)
        #print(new_artist_list)
    elif new_col > 2:
        #for complete_list in complete_lists:
        column_names_2 = ["Song", "Artist", "Genre"]
        #print(complete_list)
        final_list_2 = complete_list.T
        final_list_3 = np.array(final_list_2)
        #print(final_list_3)
        song_list = final_list_3[:, 0]
        artist_list = final_list_3[:, 1]
        final_list_2['Genre'] = final_list_2[final_list_2.columns[2:]].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)
        #print(final_list_2['Genre'])
        combined = np.concatenate((song_list, artist_list))
        combined_list = np.concatenate((combined, final_list_2['Genre']))
        #print(combined_list)
        complete_list_final = pd.DataFrame(combined_list)
        #print(complete_list_final)
        complete_list_final_1 = complete_list_final.T
        complete_list_final_1.columns = column_names_2
        new_genre_list.append(complete_list_final_1)
        #print(new_genre_list)
    print(new_artist_list)
    print(new_genre_list)
    return new_artist_list, new_genre_list


def writeCSV(partial_list, complete_list):
    partial_lists = partial_list
    complete_lists = complete_list
    artist_init = []
    complete_init = []
    for i, j in zip(partial_lists, complete_lists):
        with open('data_only_artist_name.csv', 'w', encoding="utf8") as data_partial:
            artist_csv = csv.writer(data_partial)
            artist_csv.writerow(i)
            artist_init.append(artist_csv)
        with open('data_complete.csv', 'w', encoding="utf8") as data_complete:
            complete_csv = csv.writer(data_complete)
            complete_csv.writerow(j)
            complete_init.append(complete_csv)
    #print(new_genre_list)
    a = artist_init
    b = complete_init
    return a, b


path = 'XML Data'
data = extractData(path)
print(data)
partial, complete = createCSV(data)
list_partial, list_complete = writeCSV(partial, complete)
Upvotes: 0
Views: 390
Reputation: 3775
As mentioned by Serge, open your files in append mode to avoid overwriting them in the for loop:
def writeCSV(partial_list, complete_list):
    artist_init = []
    complete_init = []
    # Create empty files
    # Comment these out to keep previously written files
    open('data_only_artist_name.csv', 'w')
    open('data_complete.csv', 'w')
    for i, j in zip(partial_list, complete_list):
        with open('data_only_artist_name.csv', 'a', encoding="utf8") as data_partial:
            artist_csv = csv.writer(data_partial)
            artist_csv.writerow(i)
            artist_init.append(artist_csv)
        with open('data_complete.csv', 'a', encoding="utf8") as data_complete:
            complete_csv = csv.writer(data_complete)
            complete_csv.writerow(j)
            complete_init.append(complete_csv)
    #print(new_genre_list)
    return artist_init, complete_init
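If reopening both files on every iteration feels awkward, another way to get the same result is to open each output file once, before the loop, and keep writing rows to the same handles. A minimal sketch, assuming the same partial_list and complete_list values produced by your createCSV:

def writeCSV(partial_list, complete_list):
    # Open both files once in 'w' mode; rows written inside the loop
    # accumulate because the handles stay open for the whole loop.
    with open('data_only_artist_name.csv', 'w', encoding="utf8", newline='') as data_partial, \
         open('data_complete.csv', 'w', encoding="utf8", newline='') as data_complete:
        artist_csv = csv.writer(data_partial)
        complete_csv = csv.writer(data_complete)
        for i, j in zip(partial_list, complete_list):
            artist_csv.writerow(i)
            complete_csv.writerow(j)

Passing newline='' is what the csv module documentation recommends for files handed to csv.writer, so no extra blank lines are inserted between rows on Windows.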
Upvotes: 1
Reputation: 148890
In writeCSV you open the files in 'w' mode, so you truncate them to zero size on each pass, and at the end you only have the values from the last pass.
Use 'a' mode instead (with open(..., 'a', encoding="utf8") as ...) and the new data will be appended after the previous data.
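To see the difference, here is a minimal, self-contained sketch (demo_w.csv and demo_a.csv are made-up file names used only for illustration): the 'w' loop leaves only the last row in its file, while the 'a' loop ends up with all three rows.

import csv

rows = [["a", 1], ["b", 2], ["c", 3]]

# 'w' truncates the file on every open, so only ["c", 3] survives
for row in rows:
    with open('demo_w.csv', 'w', newline='') as f:
        csv.writer(f).writerow(row)

# 'a' appends, so all three rows end up in the file
for row in rows:
    with open('demo_a.csv', 'a', newline='') as f:
        csv.writer(f).writerow(row)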
Upvotes: 1