Reputation: 43
I have code that manipulates text files. The text files are very large, and by my calculation my current code would need 30 days to finish.
If multiprocessing is the only way, I have a server with 40 cores.
Cell_line_final2.bed:
chr1 778704 778912 MSPC_Peak_37509 8.43 cell_line GM12878 CTCF ENCSR000AKB CNhs12333 132
chr1 778704 778912 MSPC_Peak_37509 8.43 cell_line GM12878 CTCF ENCSR000AKB CNhs12331 132
chr1 778704 778912 MSPC_Peak_37509 8.43 cell_line GM12878 CTCF ENCSR000AKB CNhs12332 132
chr1 869773 870132 MSPC_Peak_37508 74.0 cell_line GM12878 CTCF ENCSR000AKB CNhs12333 132
...
...
tf_TPM2.bed:
CNhs12333 2228319 4.41 CTCF
CNhs12331 6419919 0.0 HES2
CNhs12332 6579994 0.78 ZBTB48
CNhs12333 8817465 0.0 RERE
...
...
The desired output is to add a column to "Cell_line_final2.bed" wherever the 1st and 4th columns of "tf_TPM2.bed" simultaneously match the 10th and 8th columns of "Cell_line_final2.bed".
chr1 778704 778912 MSPC_Peak_37509 8.43 cell_line GM12878 CTCF ENCSR000AKB CNhs12333 132 4.41
chr1 778704 778912 MSPC_Peak_37509 8.43 cell_line GM12878 HES2 ENCSR000AKB CNhs12331 132 0.0
chr1 778704 778912 MSPC_Peak_37509 8.43 cell_line GM12878 CTCF ENCSR000AKB CNhs12332 132 0.78
chr1 869773 870132 MSPC_Peak_37508 74.0 cell_line GM12878 RERE ENCSR000AKB CNhs12333 132 0.0
...
...
My code so far:
def read_file(file):
    with open(file) as f:
        current = []
        for line in f:  # read rest of lines
            current.append([x for x in line.split()])
    return current

inputfile = "/home/lside/Desktop/database_files/Cell_line_final2.bed"  # 2.7GB text file
outputfile = "/home/lside/Desktop/database_files/Cell_line_final3.bed"

file_in = read_file("/home/lside/Desktop/tf_TPM2.csv")  # 22.5MB text file

new_line = ""
with open(inputfile, 'r') as infile:
    with open(outputfile, 'w') as outfile:
        for line in infile:
            line = line.split("\t")
            for j in file_in:
                if j[0] == line[9] and j[3] == line[7]:
                    new_line = new_line + '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\n'.format(line[0], line[1], line[2], line[3], line[4], line[5], line[6], line[7], line[8], line[9], line[10].rstrip(), j[2])
                    continue
        outfile.write(new_line)
Upvotes: 2
Views: 112
Reputation: 3593
I agree with the comments saying that this should not take 30 days to run, so the bottleneck must be somewhere else. Probably the biggest offender is the huge string that you are building, instead of just dumping each line to the file at every iteration (^).
(^) The biggest offender is more likely to have been the continue statement in the inner loop, as that always forces the code to compare the current line with all the elements in the lookup file, rather than stopping at the first match. Replacing it with a break should be the way to go.
Here is what I would do; see how fast this performs:
def read_file(filename):
    with open(filename) as f:
        current = []
        for line in f:  # read rest of lines
            fields = line.split()
            current.append((fields[0], fields[2], fields[3]))  # you only use these three elements
    return current

inputfile = "/home/lside/Desktop/database_files/Cell_line_final2.bed"  # 2.7GB text file
outputfile = "/home/lside/Desktop/database_files/Cell_line_final3.bed"

file_in = read_file("/home/lside/Desktop/tf_TPM2.csv")  # 22.5MB text file

with open(inputfile, 'r') as infile:
    with open(outputfile, 'w') as outfile:
        for line in infile:
            fields = line.split("\t")
            for e0, e2, e3 in file_in:
                if e0 == fields[9] and e3 == fields[7]:
                    new_line = '{0}\t{1}\n'.format(line.rstrip(), e2)  # just append the column to the entire line
                    outfile.write(new_line)  # dump to file, don't linger around with an ever-growing string
                    break  # stop at the first match
If we want to go further, we can make a lookup table from file_in. The idea is that, instead of having to loop through every element extracted from file_in, we prepare a dictionary where the key is built from j[0],j[3] - which are the fields you compare - and the value is j[2]. This way, the lookup is practically instantaneous, and no loop is necessary anymore.
The modified code using this logic looks like this:
def make_lookup_table(filename):
    lookup = {}
    with open(filename) as f:
        for line in f:  # read rest of lines
            fields = line.split()
            lookup[(fields[0], fields[3])] = fields[2]  # use (e0,e3) as key, and e2 as value
    return lookup

inputfile = "/home/lside/Desktop/database_files/Cell_line_final2.bed"  # 2.7GB text file
outputfile = "/home/lside/Desktop/database_files/Cell_line_final3.bed"

lookup = make_lookup_table("/home/lside/Desktop/tf_TPM2.csv")  # 22.5MB text file

with open(inputfile, 'r') as infile:
    with open(outputfile, 'w') as outfile:
        for line in infile:
            fields = line.split("\t")
            key = (fields[9], fields[7])
            if key in lookup:  # skip lines with no match instead of raising a KeyError
                new_line = '{0}\t{1}\n'.format(line.rstrip(), lookup[key])  # just append the column to the entire line
                outfile.write(new_line)  # dump to file, don't linger around with an ever-growing string
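If unmatched lines should be kept rather than skipped, dict.get with a placeholder does the job as well. A minimal sketch of just the loop (the 'NA' marker is my assumption, not part of the original answer):

with open(inputfile, 'r') as infile:
    with open(outputfile, 'w') as outfile:
        for line in infile:
            fields = line.split("\t")
            value = lookup.get((fields[9], fields[7]), 'NA')  # 'NA' marks lines with no match
            outfile.write('{0}\t{1}\n'.format(line.rstrip(), value))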
Upvotes: 3
Reputation: 40778
Here is another example using a set for lookup:
def main():
    f = Filter(TPM_fn='tf_TPM2.bed', final_fn='Cell_line_final2.bed',
               save_fn='Cell_line_final3.bed')

class Filter:
    def __init__(self, **kwargs):
        self.args = kwargs
        self.read_TPM()
        with open(self.args['save_fn'], 'w') as outfile:
            with open(self.args['final_fn'], 'r') as infile:
                self.read_infile(infile, outfile)

    def read_infile(self, infile, outfile):
        for line in infile:
            fields = line.split()
            key = fields[9] + fields[7]
            if key in self.tpm:
                outfile.write(line)
        return

    def read_TPM(self):
        fn = self.args['TPM_fn']
        tpm = set()
        with open(fn) as f:
            for line in f:
                fields = line.split()
                if len(fields) != 4:
                    continue
                key = fields[0] + fields[3]
                tpm.add(key)
        self.tpm = tpm

main()
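Note that this version only filters: matching lines are written out unchanged, without the extra TPM column from the desired output. If that column is needed, a dict keyed the same way can carry the value. A minimal sketch of the two methods that would change (everything else in the class stays the same):

    def read_TPM(self):
        fn = self.args['TPM_fn']
        tpm = {}  # dict instead of set: key -> TPM value (fields[2])
        with open(fn) as f:
            for line in f:
                fields = line.split()
                if len(fields) != 4:
                    continue
                tpm[fields[0] + fields[3]] = fields[2]
        self.tpm = tpm

    def read_infile(self, infile, outfile):
        for line in infile:
            fields = line.split()
            key = fields[9] + fields[7]
            if key in self.tpm:
                # append the stored TPM value as a new last column
                outfile.write('{0}\t{1}\n'.format(line.rstrip(), self.tpm[key]))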
Upvotes: 0
Reputation: 2945
I'd like to suggest a rather unconventional solution that makes use of SQL. First, create two tables that will store your data and the line numbers.
import sqlite3

conn = sqlite3.connect(':memory:')  # you may consider a file if short on RAM
c = conn.cursor()
c.execute('CREATE TABLE table1 (line INT, col1, col4);')
c.execute('CREATE TABLE table2 (line INT, col8, col10);')
conn.commit()
Then, read lines from your files and write rows to the database:
with open('tf_TPM2.csv') as f:
    for index, line in enumerate(f):
        tokens = line.split()
        c.execute('INSERT INTO table1 VALUES (?, ?, ?);', (index, tokens[0], tokens[3]))
conn.commit()

with open('Cell_line_final2.bed') as f:
    for index, line in enumerate(f):
        tokens = line.split()
        c.execute('INSERT INTO table2 VALUES (?, ?, ?);', (index, tokens[7], tokens[9]))
conn.commit()
Finally, issue the query, which checks which rows have matching values, and fetch the line numbers.
query = c.execute(
    'SELECT table2.line, table1.line '
    'FROM table1, table2 '
    'WHERE table1.col1 = table2.col10 AND table1.col4 = table2.col8 '
    'ORDER BY table2.line;'
)
while True:
    result = query.fetchone()
    if result is None:
        break
    # print result to file
The result will contain the line numbers, but you can store and query other columns as well.
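For completeness, here is a minimal sketch of that extension. It assumes table1 also stores the TPM value (tokens[2]) so the join can return the value to append directly; the column name tpm is my choice, not from the original answer.

import sqlite3

conn = sqlite3.connect(':memory:')
c = conn.cursor()
# Same schema as above, but table1 also keeps the TPM value (tokens[2])
c.execute('CREATE TABLE table1 (line INT, col1, col4, tpm);')
c.execute('CREATE TABLE table2 (line INT, col8, col10);')

with open('tf_TPM2.csv') as f:
    for index, line in enumerate(f):
        tokens = line.split()
        c.execute('INSERT INTO table1 VALUES (?, ?, ?, ?);',
                  (index, tokens[0], tokens[3], tokens[2]))
with open('Cell_line_final2.bed') as f:
    for index, line in enumerate(f):
        tokens = line.split()
        c.execute('INSERT INTO table2 VALUES (?, ?, ?);',
                  (index, tokens[7], tokens[9]))
conn.commit()

# The join now yields the value to append, not just the line numbers
query = c.execute(
    'SELECT table2.line, table1.tpm '
    'FROM table1 JOIN table2 '
    'ON table1.col1 = table2.col10 AND table1.col4 = table2.col8 '
    'ORDER BY table2.line;'
)
for bed_line, tpm in query:
    print(bed_line, tpm)  # e.g. join back to the original file by line number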
Upvotes: 2