Reputation: 1096
My script is working but I have not been able to print output in a table format. I have two files - I read file1 sequences and see if any of them binds with any sequence in file2.
file1.fa
>seq1
ACCGG
>seq2
AATTTC
file2.fa
>seq3
CCGGT
>seq9
GGGGGGCCC
current output:
---
file1.seq
>seq1
ACCGG
file2.seq
>seq3
CCGGT
Based on the files only seq1
binds with seq3
and the current output based on my code is correct but then I have to get the output I want in a format showing every sequence ids against each other - marked with 1 if they interact and marked with 0 if they do not.
I need help to fix the print
part to get output in a table format as below:
file2 seq3 seq9
file1
seq1 1 0
seq2 0 0
My code:
from Bio import SeqIO
from Bio.Seq import Seq
records=list(SeqIO.parse("file1.fa","fasta"))
window_size = 5
step_size = 1
target_records=list(SeqIO.parse("file2.fa","fasta"))
for i in records:
for j in range(0, len(i.seq)-window_size+1):
for k in (i.seq.reverse_complement()[j: j+5].split()):
for l in target_records:
if l.seq.find(k)!=-1:
print('---\n{}\n>{}\n{}\n{}\n>{}\n{}'.format ("file1.seq", i.id,i.seq, "file2.seq", l.id,l.seq))
Thank you
Upvotes: 0
Views: 915
Reputation: 1387
i'd like to uses pandas when i want get the table format:
for example:
from Bio import SeqIO
import pandas as pd
def same_seq(a_record, brecord):
window_size = 5
step_size = 1
for j in range(0, len(a_record.seq) - window_size + 1):
for k in (a_record.seq.reverse_complement()[j: j + 5].split()):
return brecord.seq.find(k) != -1
if __name__ == '__main__':
records = list(SeqIO.parse("file1.fa", "fasta"))
target_records = list(SeqIO.parse("file2.fa", "fasta"))
rows_list = []
for target_record in target_records:
new_row = {'name': target_record.name}
for record in records:
if same_seq(record, target_record):
new_row[record.name] = 1
else:
new_row[record.name] = 0
rows_list.append(new_row)
df = pd.DataFrame(rows_list)
df = df.set_index(["name"])
print(df)
seq1 seq2
name
seq3 1 0
seq9 0 0
if you wana change the compare of seq:
you should use seq[j: j+5].reverse_complement()
but seq.reverse_complement()[j: j+5]
I'm not sure if I understand it right
file1.fa
>seq1
ACCGG
>seq2
AATTTC
>seqtest1
NNNACCGTGCNN
file1.fa
>seq1
ACCGG
>seq2
AATTTC
>seqtest1
NNNACCGTGCNN
file2.fa
>seq3
CCGGT
>seq9
GGGGGGCCC
>seqtest2
NNCACGGTNN
from Bio import SeqIO
import pandas as pd
def same_seq_window(a_record, b_record, window_size):
for i in range(len(a_record.seq)- window_size + 1):
a_seq = a_record.seq[i: i + window_size]
if b_record.seq.find(a_seq.reverse_complement()) != -1:
return True
return False
def same_seq(a_record, b_record):
window_sizes = range(5, len(a_record.seq)+1)
for window_size in window_sizes:
if same_seq_window(a_record, b_record, window_size):
return True
return False
if __name__ == '__main__':
records = list(SeqIO.parse("file1.fa", "fasta"))
target_records = list(SeqIO.parse("file2.fa", "fasta"))
rows_list = []
for target_record in target_records:
new_row = {'name': target_record.name}
for record in records:
if same_seq(record, target_record):
new_row[record.name] = 1
else:
new_row[record.name] = 0
rows_list.append(new_row)
df = pd.DataFrame(rows_list)
df = df.set_index(["name"])
print(df)
seq1 seq2 seqtest1
name
seq3 1 0 0
seq9 0 0 0
seqtest2 0 0 1
Upvotes: 1