Reputation: 33
I have a file that looks something like this:
AAACAACAGGGTACAAAGAGTCACGCTTATCCTGTTGATACT
TCTCAATGGGCAGTACATATCATCTCTNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNAAAACGTGTGCATGAACAAAAAA
CGTAGCAGATCGTGACTGGCTATTGTATTGTGTCAATTTCGCTTCGTCAC
TAAATCAACGGACATGTGTTGC
And I need to split it into the "non-N" sequences, so two separate files like this:
AAACAACAGGGTACAAAGAGTCACGCTTATCCTGTTGATACT
TCTCAATGGGCAGTACATATCATCTCT
AAAACGTGTGCATGAACAAAAAACGTAGCAGATCGTGACTGGC
TATTGTATTGTGTCAATTTCGCTTCGTCACTAAATCAACGGACA
TGTGTTGC
What I currently have is this:
UMfile = open ("C:\Users\Manuel\Desktop\sequence.txt","r")
contignumber = 1
contigfile = open ("contig "+str(contignumber), "w")
DNA = UMfile.read()
DNAstring = str(DNA)
for s in DNAstring:
DNAstring.split("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN",1)
contigfile.write(DNAstring)
contigfile.close()
contignumber = contignumber+1
contigfile = open ("contig "+str(contignumber), "w")
The thing is that I realize there is a linebreak between the "Ns" and that is why it is not splitting my file, but the "file" I'm showing is just a part of a much much bigger one. So sometimes the "Ns" will look like this "NNNNNN\n" and sometimes like "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n", yet there is always a count of 1000 Ns between my sequences that I need to split.
So my question is: How do I tell python to split and wite into different files every 1000xNs knowing that there will be different number of Ns in each line?
Thank you all very much, I really have no informatics background and my python skills are at best basic.
Upvotes: 0
Views: 2027
Reputation: 55479
Just split your string on 'N'
and then remove all the strings that are empty, or just contain a newline. Like this:
#!/usr/bin/env python
DNAstring = '''AAACAACAGGGTACAAAGAGTCACGCTTATCCTGTTGATACT
TCTCAATGGGCAGTACATATCATCTCTNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNAAAACGTGTGCATGAACAAAAAA
CGTAGCAGATCGTGACTGGCTATTGTATTGTGTCAATTTCGCTTCGTCAC
TAAATCAACGGACATGTGTTGC'''
sequences = [u for u in DNAstring.split('N') if u and u != '\n']
for i, seq in enumerate(sequences):
print i
print seq.replace('\n', '') + '\n'
output
0
AAACAACAGGGTACAAAGAGTCACGCTTATCCTGTTGATACTTCTCAATGGGCAGTACATATCATCTCT
1
AAAACGTGTGCATGAACAAAAAACGTAGCAGATCGTGACTGGCTATTGTATTGTGTCAATTTCGCTTCGTCACTAAATCAACGGACATGTGTTGC
The code snippet above also removes newlines inside the sequences using .replace('\n', '')
.
Here are a few programs that you may find useful.
Firstly, a line buffer class. You initialise it with a file name and a line width. You can then feed it random length strings and it will automatically save them to the text file, line by line, with all lines (except possibly the last line) having the given length. You can use this class in other programs to make your output look neat.
Save this file as linebuffer.py
to somewhere in your Python path; the simplest way is to save it wherever you save your Python programs and make that the current directory when you run the programs.
linebuffer.py
#! /usr/bin/env python
''' Text output buffer
Write fixed width lines to a text file
Written by PM 2Ring 2015.03.23
'''
class LineBuffer(object):
''' Text output buffer
Write fixed width lines to file fname
'''
def __init__(self, fname, width):
self.fh = open(fname, 'wt')
self.width = width
self.buff = []
self.bufflen = 0
def write(self, data):
''' Write a string to the buffer '''
self.buff.append(data)
self.bufflen += len(data)
if self.bufflen >= self.width:
self._save()
def _save(self):
''' Write the buffer to the file '''
buff = ''.join(self.buff)
#Split buff into lines
lines = []
while len(buff) >= self.width:
lines.append(buff[:self.width])
buff = buff[self.width:]
#Add an empty line so we get a trailing newline
lines.append('')
self.fh.write('\n'.join(lines))
self.buff = [buff]
self.bufflen = len(buff)
def close(self):
''' Flush the buffer & close the file '''
if self.bufflen > 0:
self.fh.write(''.join(self.buff) + '\n')
self.fh.close()
def testLB():
alpha = 'abcdefghijklmnopqrstuvwxyz'
fname = 'linebuffer_test.txt'
lb = LineBuffer(fname, 27)
for _ in xrange(30):
lb.write(alpha)
lb.write(' bye.')
lb.close()
if __name__ == '__main__':
testLB()
Here is a program that makes random DNA sequences of the form you described in your question. It uses linebuffer.py
to handle the output. I wrote this so I could test my DNA sequence splitter properly.
Random_DNA0.py
#! /usr/bin/env python
''' Make random DNA sequences
Sequences consist of random subsequences of the letters 'ACGT'
as well as short sequences of 'N', of random length up to 200.
Exactly 1000 'N's separate sequence blocks.
All sequences may contain newlines chars
Takes approx 3 seconds per megabyte generated and saved
on a 2GHz CPU single core machine.
Written by PM 2Ring 2015.03.23
'''
import sys
import random
from linebuffer import LineBuffer
#Set seed to None to seed randomizer from system time
random.seed(37)
#Output line width
linewidth = 120
#Subsequence base length ranges
minsub, maxsub = 15, 300
#Subsequences per sequence ranges
minseq, maxseq = 5, 50
#random 'N' sequence ranges
minn, maxn = 5, 200
#Probability that a random 'N' sequence occurs after a subsequence
randn = 0.2
#Sequence separator
nsepblock = 'N' * 1000
def main():
#Get number of sequences from the command line
numsequences = int(sys.argv[1]) if len(sys.argv) > 1 else 2
outname = 'DNA_sequence.txt'
lb = LineBuffer(outname, linewidth)
for i in xrange(numsequences):
#Write the 1000*'N' separator between sequences
if i > 0:
lb.write(nsepblock)
for j in xrange(random.randint(minseq, maxseq)):
#Possibly make a short run of 'N's in the sequence
if j > 0 and random.random() < randn:
lb.write(''.join('N' * random.randint(minn, maxn)))
#Create a single subsequence
r = xrange(random.randint(minsub, maxsub))
lb.write(''.join([random.choice('ACGT') for _ in r]))
lb.close()
if __name__ == '__main__':
main()
Finally, we have a program that splits your random DNA sequences. Once again, it uses linebuffer.py
to handle the output.
DNA_Splitter0.py
#! /usr/bin/env python
''' Split DNA sequences and save to separate files
Sequences consist of random subsequences of the letters 'ACGT'
as well as short sequences of 'N', of random length up to 200.
Exactly 1000 'N's separate sequence blocks.
All sequences may contain newlines chars
Written by PM 2Ring 2015.03.23
'''
import sys
from linebuffer import LineBuffer
#Output line width
linewidth = 120
#Sequence separator
nsepblock = 'N' * 1000
def main():
iname = 'DNA_sequence.txt'
outbase = 'contig'
with open(iname, 'rt') as f:
data = f.read()
#Remove all newlines
data = data.replace('\n', '')
sequences = data.split(nsepblock)
#Save each sequence to a series of files
for i, seq in enumerate(sequences, 1):
outname = '%s%05d' % (outbase, i)
print outname
#Write sequence data, with line breaks
lb = LineBuffer(outname, linewidth)
lb.write(seq)
lb.close()
if __name__ == '__main__':
main()
Upvotes: 1
Reputation: 367
You could simply replace every N and \n with a space, and then split.
result = DNAstring.replace("\n", " ").replace("N", " ").split()
This will give you back a list of strings, and the 'ACGT' sequences will also be split with every new line.
if this is not you goal an you want to conserve the \n in the 'ACGT' and not split along it, you can do the following:
result = DNAstring.replace("N\n", " ").replace("N", " ").split()
this will only remove the \n if it is in the middle of an N sequence.
To split your string exactly after 1000 Ns:
# 1/ Get rid of line breaks in the N sequence
result = DNAstring.replace("N\n", "N")
# 2/ split every 1000 Ns
result = result.split(1000*"N")
Upvotes: 0
Reputation: 3759
assuming you can read the whole file at once
s=DNAstring.replace("\n","") # first remove the nasty linebreaks
l=[x for x in s.split("N") if x] # split and drop empty lines
for x in l: # print in chunks
while x:
print x[:10]
x=x[10:]
print # extra linebreak between chunks
Upvotes: 0