Reputation: 11
I'm trying to gather data from a text file. When I print the outputs, they return the correct values that I am looking for, however, when I try to put these outputs into a table using xlsxwriter, the table only contains a the outputs of the final line of the txt file repeated the amount of times that there are lines in the text file. i.e. there are 5000 lines of text I need 3 pieces of info from, the .xlsx file has 5000 rows and 3 columns but all contain the information for the final line in the text file.
EC:1 > GO:N-ethylmaleimide reductase activity ; GO:0008748
EC:1 > GO:oxidoreductase activity ; GO:0016491
EC:1 > GO:reduced coenzyme F420 dehydrogenase activity ; GO:0043738
EC:1 > GO:sulfur oxygenase reductase activity ; GO:0043826
EC:1 > GO:malolactic enzyme activity ; GO:0043883
^what the txt file looks like
6.6.1.2 cobaltochelatase activity 0051116
6.6.1.2 cobaltochelatase activity 0051116
6.6.1.2 cobaltochelatase activity 0051116
6.6.1.2 cobaltochelatase activity 0051116
6.6.1.2 cobaltochelatase activity 0051116
6.6.1.2 cobaltochelatase activity 0051116
6.6.1.2 cobaltochelatase activity 0051116
6.6.1.2 cobaltochelatase activity 0051116
6.6.1.2 cobaltochelatase activity 0051116
6.6.1.2 cobaltochelatase activity 0051116
... ... ...
(how the table looks but for 5000 lines)
Any help would be appreciated, Regards
import xlsxwriter
File = 'EC_to_GO.txt'
def analysis(line, output):
with open(File) as fp:
lines = fp.readlines()
for line in lines:
output[0] = line[3:].split(' > ')[0]
output[1] = line[:-14].split(' > GO:')[-1]
output[2] = line[-8:]
return output
with open(File) as fp:
lines = fp.readlines()
for line in lines:
if 'Generated on 2018-07-04T09:08Z' in line:
a = lines.index(line)
for line in lines:
if 'GO:cobaltochelatase activity ; GO:0051116' in line:
b = lines.index(line)
req_list = lines[a:b]
rxn_end_index = []
for i in range(len(req_list)):
if '> GO:' in req_list[i]:
rxn_end_index.append(i)
inner_list = []
outer_list =[]
spare = [0] + rxn_end_index
for i in range(len(spare)-1):
inner_list = req_list[spare[i]:spare[i+1]]
outer_list.append(inner_list)
res_list=[]
for i in range(len(outer_list)):
res_list.append(analysis(outer_list[i],['NA','NA','NA']))
# Create a workbook and add a worksheet.
workbook = xlsxwriter.Workbook('EC_to_GO.xlsx')
worksheet = workbook.add_worksheet('EC_to_GO')
#res_list1 = [EC, Genome name, GO]
#for i in res_list:
#res_list1.append(i)
# Some data we want to write to the worksheet.
t = tuple(res_list)
# Start from the first cell. Rows and columns are zero indexed.
row = 0
col = 0
# Iterate over the data and write it out row by row.
for a,b,c in (t):
worksheet.write(row, col, a)
worksheet.write(row, col + 1, b)
worksheet.write(row, col + 2, c)
row += 1
workbook.close()
Upvotes: 1
Views: 66
Reputation: 1066
You are basically appending the same list to res_list
. So you have multiple copies of the same output
list.
To fix: Instead of
res_list.append(analysis(outer_list[i],['NA','NA','NA']))
#And in the previous loop
for i in range(len(spare)-1):
inner_list = req_list[spare[i]:spare[i+1]]
outer_list.append(inner_list)
change it to:
res_list.append(analysis(outer_list[i],['NA','NA','NA'])[:])
for i in range(len(spare)-1):
inner_list = req_list[spare[i]:spare[i+1]]
outer_list.append(inner_list[:])
Or
res_list.append(copy(analysis(outer_list[i],['NA','NA','NA'])))
for i in range(len(spare)-1):
inner_list = req_list[spare[i]:spare[i+1]]
outer_list.append(copy(inner_list))
The notation list[:] creates a copy of the list. Technically, you are creating a slice of the whole list.
Upvotes: 1