I want to convert text to CSV. The input file contains 10000K lines. A sample of the input file is below:
Item=a
Price=10
colour=pink

Item=b
Price=20
colour=blue Pattern=checks
My output should look like this:
Item  Price  Colour  Pattern
a     10     pink
b     20     blue    checks
My code is below:
import csv
import glob
import os

def dat_to_csv(filename, table_name):
    with open(filename, 'r', errors='ignore') as reader:
        list_of_columns = []
        table_values = []
        master_table = []
        counter = 0
        column_name1 = []
        value1 = []
        column_name2 = []
        value2 = []
        column_name3 = []
        value3 = []
        column_name4 = []
        value4 = []
        lines_after_23 = reader.readlines()[23:]
        for line in lines_after_23:
            #stripped_line = line.strip()
            if line.startswith("#"):
                continue
            if line.startswith("Associate"):
                continue
            if line == "\n":
                if (table_values):
                    master_table.append([])
                    master_table[counter] = table_values.copy()
                    counter = counter + 1
                    length = len(table_values)
                    for element in range(length):
                        table_values[element] = []
                continue
            if line == "\n":
                continue
            if line == "\n":
                master_table.append([])
                master_table[counter] = table_values.copy()
                counter = counter + 1
                length = len(table_values)
                for element in range(length):
                    table_values[element] = []
                break
            extra_stripped_line = ' '.join(line.split())
            data = extra_stripped_line.split("=", 1)
            column_name = data[0].strip()
            if '=' in data[1].strip():
                data1 = data[1].split(" ", 1)
                value = data1[0].strip()
                data2 = data1[1].split("=", 1)
                column_name1 = data2[0].strip()
                if '=' in data2[1].strip():
                    column_name2 = []
                    value2 = []
                    data3 = data2[1].split(" ", 1)
                    value1 = data3[0].strip()
                    data4 = data3[1].split("=", 1)
                    column_name2 = data4[0].strip()
                    if '=' in data4[1].strip():
                        data5 = data4[1].split(" ", 1)
                        value2 = data5[0].strip()
                        data6 = data5[1].split("=", 1)
                        column_name3 = data6[0].strip()
                        if '=' in data6[1].strip():
                            data7 = data6[1].split(" ", 1)
                            value3 = data7[0].strip()
                            data8 = data7[1].split("=", 1)
                            column_name4 = data8[0].strip()
                            if '=' in data8[1].strip():
                                data9 = data8[1].split(" ", 1)
                                value3 = data9[0].strip()
                                data10 = data9[1].split("=", 1)
                                column_name4 = data10[0].strip()
                                value4 = data10[1].strip()
                            else:
                                value4 = data8[1].strip()
                        else:
                            value3 = data6[1].strip()
                    else:
                        value2 = data4[1].strip()
                else:
                    value1 = data2[1].strip()
            else:
                value = data[1].strip()
            if column_name not in list_of_columns:
                list_of_columns.append(column_name)
                table_values.append([])
            if column_name1 is not []:
                if column_name1 not in list_of_columns:
                    list_of_columns.append(column_name1)
                    table_values.append([])
            if column_name2 is not []:
                if column_name2 not in list_of_columns:
                    list_of_columns.append(column_name2)
                    table_values.append([])
            if column_name3 is not []:
                if column_name3 not in list_of_columns:
                    list_of_columns.append(column_name3)
                    table_values.append([])
            if column_name4 is not []:
                if column_name4 not in list_of_columns:
                    list_of_columns.append(column_name4)
                    table_values.append([])
            index = list_of_columns.index(column_name)
            if column_name1 is not []:
                index1 = list_of_columns.index(column_name1)
            if column_name2 is not []:
                index2 = list_of_columns.index(column_name2)
            if column_name3 is not []:
                index3 = list_of_columns.index(column_name3)
            if column_name4 is not []:
                index4 = list_of_columns.index(column_name4)
            #table_values[index].append(value)
            table_values[index] = value
            if value1 is not []:
                table_values[index1] = value1
            if value2 is not []:
                table_values[index2] = value2
            if value3 is not []:
                table_values[index3] = value3
            if value4 is not []:
                table_values[index4] = value4
    #with open("output\\{}.csv".format(table_name), 'w', newline='') as csvfile:
    with open("yourpath\\{}.csv".format(table_name), 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        writer.writerow(list_of_columns)
        #t_table_values = zip(*table_values)
        max_elements = len(master_table)
        master_table_transp = []
        cntr = 0
        for cntr in range(max_elements):
            master_table_transp.append([])
        num_objects = len(master_table)
        for cntr_obj in range(num_objects):
            for cntr_row in range(max_elements):
                if (cntr_row < len(master_table[cntr_obj])):
                    master_table_transp[cntr_row].append(master_table[cntr_obj][cntr_row])
                else:
                    master_table_transp[cntr_row].append([])
        t_table_values = zip(*master_table_transp)
        for values in t_table_values:
            writer.writerow(values)
        if value1 is not []:
            for value1s in t_table_values:
                writer.writerow(value1s)
        if value2 is not []:
            for value2s in t_table_values:
                writer.writerow(value2s)
        if value3 is not []:
            for value3s in t_table_values:
                writer.writerow(value3s)
        if value4 is not []:
            for value4s in t_table_values:
                writer.writerow(value4s)

if __name__ == '__main__':
    path = "your path"
    for filename in glob.glob((os.path.join(path, '*.dat'))):
        name_only = os.path.basename(filename).replace(".dat", "")
        dat_to_csv(filename, name_only)
I am getting the required output, but there are a few issues:
Upvotes: 1
Views: 277
Reputation: 11178
I like to make little state machines for these kinds of problems, since as much as we'd all like to believe the sample data matches the real world, there are probably some gotchas, and you want a solution that's flexible.
For me, that flexibility means that as I'm looping over the input lines:
- a line starting with "Item" begins a new record
- until a record has begun, anything else is skipped
- a blank line closes out the current record
- any other line is key=value data that gets parsed into the current record
And as you try it out, start with smaller samples of your input and build up to the whole, big thing.
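For example, a quick way to carve off a small sample to experiment with (the file names here are just placeholders, not part of the solution) could be:

from itertools import islice

# Copy only the first 1000 lines of the big input into a small sample file,
# so the parsing logic can be exercised quickly before running it on all the lines.
with open('input.txt') as src, open('sample.txt', 'w') as dst:
    dst.writelines(islice(src, 1000))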
#!/usr/bin/env python3
import csv

field_names = {}  # use dict as ordered set to collect all field names as data is parsed
records = []
record = None

with open('input.txt') as f:
    for line in f:
        line = line.strip()

        if line.startswith('Item'):
            record = {}

        if record is None:
            continue

        if line == '':
            records.append(record)
            record = None
            continue

        # Finally, line must be data in a record, parse it
        fields = line.split(' ')
        kvps = [field.split('=', 1) for field in fields]  # 1 in split('=', 1) is for the `===FANCY==` example @MarkTolonen threw at us
        kvp_dict = dict(kvps)

        record.update(kvp_dict)
        field_names.update(kvp_dict)  # pass in keys & vals (it's simpler) even if we only need the keys

# Deal with "straggling record" (if your input ends with a line of data (and not an empty line))
if record is not None:
    records.append(record)

out_f = open('output.csv', 'w', newline='')
writer = csv.DictWriter(out_f, fieldnames=field_names)
writer.writeheader()
writer.writerows(records)
Here's my output.csv:
| Item | Price | colour | Pattern |
|------|--------|-----------|---------|
| a | 10 | pink | |
| b | 20 | blue | checks |
| aa | 10 | | |
| bb | 20 | blue | checks |
| cc | 5 | | |
| dd | 25 | blue | |
| ee | 1/2=$1 | ==FANCY== | |
Upvotes: 0
Reputation: 26915
Unless I've misunderstood something, it's as simple as this:
from pandas import DataFrame
from numpy import nan

master = [dict()]

with open('foo.txt') as foo:
    for line in foo:
        if (line := line.strip()):
            for token in line.split():
                k, v = token.split('=')
                master[-1][k] = v
        elif master[-1]:
            master.append(dict())

if not master[-1]:
    del master[-1]

if master:
    df = DataFrame(master).replace(nan, '', regex=True)
    df.to_csv('foo.csv', index=False)
Output (of the csv file):
Item,Price,colour,Pattern
a,10,pink,
b,20,blue,checks
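One assumption worth flagging: token.split('=') expects exactly one '=' per token, so a value that itself contains '=' (like the ===FANCY== case another answer tests with) would raise a ValueError. Splitting on the first '=' only keeps the same approach working; a tiny sketch of the difference:

# Hypothetical token where the value itself contains '=' characters
token = 'colour===FANCY=='

k, v = token.split('=', 1)  # split on the first '=' only
print(k, v)                 # -> colour ==FANCY==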
Upvotes: 2
Reputation: 177481
With some assumptions this works. I added some test cases. This does require all the records to fit in memory, but if you know all the possible column names in advance you can set columns accordingly and write the lines as they are generated instead of all at the end (see the sketch after the output below). Even with 10000K (10M) records, unless the records are really large, that will easily fit in a modern system's memory.
input.txt
Item=a
Price=10

Item=b
Price=20
colour=blue Pattern=checks

Item=c
Price=5

Item=d Price=25 colour=blue

Item=e colour===FANCY== Price=1/2=$1
test.py
from collections import defaultdict
import csv

columns = {}
lines = []

with open('input.txt') as fin:
    for line in fin:
        if not line.strip():  # write record on blank line
            needs_flush = False
            lines.append(columns)
            # blank all the columns to start next record.
            columns = {k: '' for k in columns}
            continue
        # assume multiple items on a line are separated by a single space
        items = line.strip().split(' ')
        # assume column name is before first = sign in each item
        for column, value in [item.split('=', 1) for item in items]:
            needs_flush = True
            columns[column] = value

# write record on EOF if hasn't been flushed
if needs_flush:
    lines.append(columns)

# dump records to CSV
with open('output.csv', 'w', newline='') as fout:
    writer = csv.DictWriter(fout, fieldnames=columns)
    writer.writeheader()
    writer.writerows(lines)
output.csv:
Item,Price,colour,Pattern
a,10,,
b,20,blue,checks
c,5,,
d,25,blue,
e,1/2=$1,==FANCY==,
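If the column names really are known in advance, a minimal sketch of the write-as-you-go variant described above might look like this (the fixed fieldnames list and the file names are assumptions, not part of the answer above):

import csv

# Assumed to be known up front; records are written as they complete instead of being buffered.
fieldnames = ['Item', 'Price', 'colour', 'Pattern']

with open('input.txt') as fin, open('output.csv', 'w', newline='') as fout:
    writer = csv.DictWriter(fout, fieldnames=fieldnames, restval='')
    writer.writeheader()
    record = {}
    for line in fin:
        if not line.strip():          # blank line ends the current record
            if record:
                writer.writerow(record)
                record = {}
            continue
        for item in line.strip().split(' '):
            column, value = item.split('=', 1)
            record[column] = value
    if record:                        # straggling record at EOF
        writer.writerow(record)

This keeps memory usage flat regardless of how many records the input contains.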
Upvotes: 1