Finding overlapping and non overlapping regions

Question

I have a BLAST tabular output with millions of hits.Con is my sequence and P is a protein hit. I am interested in differentiating hits corresponding to the 3 cases illustrated below. They should all be printed in 3 separate new files, and contigs that are in file 1 should not be in file 2,3 and etc. How to do that?

con1 ----------------------- (Contigs with both overlapping and non overlapping hits)
   p1---- p2 ------    p4---
               p3-----
con2 --------------------- (only overlapping)  con3 ----------------(only non overlp)
   p1 -----                                       p1 ----  p2 -----
      p2 -------
          p3 -----

If I know the Protein start and end sites it becomes possible to identify overlap or non overlap; if S1 < E2 < S2 and E1 < S2 < E2 OR S2 - E1 > 0. My input file, i.e.

contig  protein start end
con1    P1  481 931
con1    P2  140 602
con1    P3  232 548
con2    P4  335 406
con2    P5  642 732
con2    P6  2282    2433
con2    P7  729 812
con3    P8  17  148
con3    P9  289 45

My script (this prints me only the hits that do not overlap)

from itertools import groupby

def nonoverlapping(hits):
    """Returns a list of non-overlapping hits."""
    nonover = []
    overst =  False
    for i in range(1,len(hits)):
        (p, c) = hits[i-1], hits[i]
        if c[2] > p[3]:
            if not overst: nonover.append(p)
            nonover.append(c)
            overst = True  
    return nonover

fh = open('file.txt')
oh = open('result.txt', 'w')
for qid, grp in groupby(fh, lambda l: l.split()[0]):
    hits = []
    for line in grp:
        hsp = line.split()
        hsp[2], hsp[3] = int(hsp[2]), int(hsp[3])
        hits.append(hsp)
    if len(hits) > 1: 
        hits.sort(key=lambda x: x[2])
        for hit in nonoverlapping(hits):
            oh.write('	'.join([str(f) for f in hit])+'
')

cnluzon · Accepted Answer

I would do something like this. Define an "overlaps" function for two hits, and then test for each contig whether all, some or none overlap. Then write all the contigs to the desired file:

from itertools import groupby

def overlaps(a, b):
    result = True
    # Supposing a[2] is the start, a[3] the end. 
    # If end before start, they are not overlapping
    if a[3] < b[2] or b[3] < a[2]:
        result = False

    return result

def test_overlapping(hits):
    overlapping = 'None'
    overlapping_count = 0
    for i in range(len(hits)-1):
        if overlaps(hits[i], hits[i+1]):
            overlapping_count += 1


    if overlapping_count == 0:
        overlapping = 'None'
    elif overlapping_count == len(hits) -1:
        overlapping = 'All'
    else:
        overlapping = 'Some'

    return overlapping


fh = open('file.txt')

file_all = open('result_all.txt', 'w')
file_some = open('result_some.txt', 'w')
file_none = open('result_none.txt', 'w')

line = fh.readline() # quit header

for qid, grp in groupby(fh, lambda l: l.split()[0]):
    hits = []
    for line in grp:
        hsp = line.split()
        hsp[2], hsp[3] = int(hsp[2]), int(hsp[3])
        hits.append(hsp)
    if len(hits) > 1: 
        hits.sort(key=lambda x: x[2])
        overlapping = test_overlapping(hits)
        out_file = file_none
        if overlapping == 'All':
            out_file = file_all
        elif overlapping == 'Some':
            out_file = file_some

        for h in hits:
            out_file.write('	'.join([str(v) for v in h]))
            out_file.write('
')


file_all.close()
file_some.close()
file_none.close()

Finding overlapping and non overlapping regions

Answers (1)

Related Questions