Reputation: 1151
I have a BLAST tabular output with millions of hits.Con is my sequence and P is a protein hit. I am interested in differentiating hits corresponding to the 3 cases illustrated below. They should all be printed in 3 separate new files, and contigs that are in file 1 should not be in file 2,3 and etc. How to do that?
con1 ----------------------- (Contigs with both overlapping and non overlapping hits)
p1---- p2 ------ p4---
p3-----
con2 --------------------- (only overlapping) con3 ----------------(only non overlp)
p1 ----- p1 ---- p2 -----
p2 -------
p3 -----
If I know the Protein start and end sites it becomes possible to identify overlap or non overlap; if S1 < E2 < S2 and E1 < S2 < E2 OR S2 - E1 > 0. My input file, i.e.
contig protein start end
con1 P1 481 931
con1 P2 140 602
con1 P3 232 548
con2 P4 335 406
con2 P5 642 732
con2 P6 2282 2433
con2 P7 729 812
con3 P8 17 148
con3 P9 289 45
My script (this prints me only the hits that do not overlap)
from itertools import groupby
def nonoverlapping(hits):
"""Returns a list of non-overlapping hits."""
nonover = []
overst = False
for i in range(1,len(hits)):
(p, c) = hits[i-1], hits[i]
if c[2] > p[3]:
if not overst: nonover.append(p)
nonover.append(c)
overst = True
return nonover
fh = open('file.txt')
oh = open('result.txt', 'w')
for qid, grp in groupby(fh, lambda l: l.split()[0]):
hits = []
for line in grp:
hsp = line.split()
hsp[2], hsp[3] = int(hsp[2]), int(hsp[3])
hits.append(hsp)
if len(hits) > 1:
hits.sort(key=lambda x: x[2])
for hit in nonoverlapping(hits):
oh.write('\t'.join([str(f) for f in hit])+'\n')
Upvotes: 1
Views: 2162
Reputation: 1084
I would do something like this. Define an "overlaps" function for two hits, and then test for each contig whether all, some or none overlap. Then write all the contigs to the desired file:
from itertools import groupby
def overlaps(a, b):
result = True
# Supposing a[2] is the start, a[3] the end.
# If end before start, they are not overlapping
if a[3] < b[2] or b[3] < a[2]:
result = False
return result
def test_overlapping(hits):
overlapping = 'None'
overlapping_count = 0
for i in range(len(hits)-1):
if overlaps(hits[i], hits[i+1]):
overlapping_count += 1
if overlapping_count == 0:
overlapping = 'None'
elif overlapping_count == len(hits) -1:
overlapping = 'All'
else:
overlapping = 'Some'
return overlapping
fh = open('file.txt')
file_all = open('result_all.txt', 'w')
file_some = open('result_some.txt', 'w')
file_none = open('result_none.txt', 'w')
line = fh.readline() # quit header
for qid, grp in groupby(fh, lambda l: l.split()[0]):
hits = []
for line in grp:
hsp = line.split()
hsp[2], hsp[3] = int(hsp[2]), int(hsp[3])
hits.append(hsp)
if len(hits) > 1:
hits.sort(key=lambda x: x[2])
overlapping = test_overlapping(hits)
out_file = file_none
if overlapping == 'All':
out_file = file_all
elif overlapping == 'Some':
out_file = file_some
for h in hits:
out_file.write('\t'.join([str(v) for v in h]))
out_file.write('\n')
file_all.close()
file_some.close()
file_none.close()
Upvotes: 2