Rocket

Reputation: 553

Finding minimum value of word from a lot of files in python?

I have 1000 .txt files on which I run this code. In each file I have to find the largest value for ENSG and delete the other ENSG entries whose values are smaller than the largest one. Then I have to find the minimum value in that same text file by looking at the utr length, and save it in my output.txt file. I want the output for all 1000 .txt files in a single output.txt file. This code works more or less, but it overwrites the result and only shows the result for the last file:

import glob
f2 = glob.glob("./*.txt")
all_text=""
for fpath in f2:
    f = open(fpath,"r")
    list_lines = f.readlines()
    dic={}
    sent="ENSG"
    temp_list=[]
    for line in list_lines:
        all_text=all_text+line
        name= line.rsplit()[0].strip()
        score=line.rsplit()[1].strip()
        dic[name]=score
    for i in dic.keys():
        if sent in i:
            temp_list.append(dic[i])
            hiegh_score=max(temp_list)
    def check(index):
        reverse_text=all_text[index+1::-1]
        index2=reverse_text.find("\n")
        if sent==reverse_text[:index2+1][::-1][1:len(sent)+1]:
            return False
        else:
            return True

    list_to_min=dic.values()
    for i in temp_list:
        if i!=hiegh_score:
            index=all_text.find(str(i))
            while check(index):
                index=all_text.find(str(i),index+len(str(i)))
            all_text=all_text[0:index]+all_text[index+len(str(i)):]
            list_to_min.remove(str(i))
file2=open("my_try4.txt","w")
file2.write(all_text)
min_score= min(list_to_min)
for j in dic.keys():
    if min_score==dic[j]:
        k="min score is :"+str(min_score)+" for person "+j
        file2.write(k)
print "%6d : %s" % (len(list_lines),fpath)
file2.close()
f.close()

I have text files like this one (4.txt):

ENSBTAG00000020679  197
ENSCAFG00000009872  2585
ENSG00000018236 1935
ENSG00000018236 230
ENSG00000018236 257
ENSG00000018236 338
ENSG00000018236 922
ENSG00000018236 922
ENSRNOG00000004438  14
ENSRNOG00000004438  14

Now it should select the ENSG line with 1935 and delete all the other ENSG lines. The text file should then look like this:

ENSBTAG00000020679  197
ENSCAFG00000009872  2585
ENSG00000018236 1935
ENSRNOG00000004438  14
ENSRNOG00000004438  14

And now, by looking at this filtered text file, we find the smallest value and save it in a text file (we do this for 1000 files, and the output should go into one file).

output.txt
textfile4 14
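
To make what I want clearer, here is a rough sketch of the two steps for a single file (this is not my current code, just the behaviour I am after; it assumes the two-column, whitespace-separated layout shown above):

def process_one_file(path, prefix="ENSG"):
    # Read the two-column lines: name and value.
    with open(path) as f:
        rows = [line.split() for line in f if line.strip()]
    # Step 1: largest value among the lines whose name starts with the prefix.
    largest = max(int(value) for name, value in rows if name.startswith(prefix))
    # Keep non-prefix lines, plus only the prefix line(s) holding the largest value.
    kept = [(name, int(value)) for name, value in rows
            if not name.startswith(prefix) or int(value) == largest]
    # Step 2: the minimum value over the lines that are left.
    return min(value for name, value in kept)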

Upvotes: 0

Views: 801

Answers (1)

Francis Avila

Reputation: 31641

It was easier to rewrite this than to figure out what was wrong with your code:

import os.path
import glob
import re
import itertools
from collections import namedtuple, deque
from operator import attrgetter

R_PREFIX_VALUE = re.compile(r'^(?P<prefix>[A-Z]+)(?P<suffix>\d+)\s+(?P<value>\d+)\s*$')

getvalue  = attrgetter('value')

def interleave(seq, val):
    return itertools.chain.from_iterable(itertools.izip(seq, itertools.repeat(val)))

class Fileline(namedtuple('Fileline', 'filename prefix suffix value')):
    @classmethod
    def _fromstr(cls, s, filename=None, rematch=R_PREFIX_VALUE.match):
        m = rematch(s)
        if not m:
            raise ValueError('No valid line found in %r' % s)
        d = m.groupdict()
        d['value'] = int(d['value'])
        d['filename'] = filename
        return cls(**d)

    def _asstr(self):
        return '{}{} {}'.format(self.prefix, self.suffix, self.value)

def max_value_with_prefix(lineseq, prefix, getvalue=getvalue):
    withprefix = (line for line in lineseq if line.prefix==prefix)
    return max_value(withprefix)

def filter_lt_line(lineseq, maxline):
    for line in lineseq:
        if line.prefix != maxline.prefix or line.value >= maxline.value:
            yield line

def extreme_value(fn, lineseq, getvalue=getvalue):
    try:
        return fn((l for l in lineseq if l is not None), key=getvalue)
    except ValueError:
        return None

def max_value(lineseq):
    return extreme_value(max, lineseq)

def min_value(lineseq):
    return extreme_value(min, lineseq)

def read_lines(fn, maker=Fileline._fromstr):
    with open(fn, 'rb') as f:
        return deque(maker(l, fn) for l in f)

def write_file(fn, lineseq):
    lines = (l._asstr() for l in lineseq)
    newlines = interleave(lines, '\n')
    with open(fn, 'wb') as f:
        f.writelines(newlines)

def write_output_file(fn, lineseq):
    lines = ("{} {}".format(l.filename, l.value) for l in lineseq)
    newlines = interleave(lines, "\n")
    with open(fn, 'wb') as f:
        f.writelines(newlines)

def filter_max_returning_min(fn, prefix):
    lineseq = read_lines(fn)
    maxvalue = max_value_with_prefix(lineseq, prefix)
    filteredlineseq = deque(filter_lt_line(lineseq, maxvalue))
    write_file(fn, filteredlineseq)
    minline = min_value(filteredlineseq)
    return minline

def main(fileglob, prefix, outputfile):
    minlines = []
    for fn in glob.iglob(fileglob):
        minlines.append(filter_max_returning_min(fn, prefix))
    write_output_file(outputfile, minlines)

The entry point is main(), which takes a glob pattern (not a bare directory name), a prefix, and an output filename. For each file, filter_max_returning_min() will open and rewrite the file and return its min value. There's no need to keep a dict or list of every line of every file you visited.
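
For example, assuming the files live in a directory called txtdir (the name is just a placeholder), a one-off run looks like this:

# Usage sketch: main() expects a glob pattern, so join the directory with
# '*.txt'; os.path and main() come from the listing above.
main(os.path.join('txtdir', '*.txt'), 'ENSG', 'output.txt')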

(BTW, destructively overwriting files seems like a bad idea! Have you considered copying them elsewhere?)
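
If you want a safety net, a minimal backup step could run before the rewrite; this sketch (the backups directory name is just an example) could be called at the top of filter_max_returning_min(), before read_lines():

import os
import shutil

def backup_file(fn, backupdir='backups'):
    # Copy fn into backupdir before the destructive rewrite; 'backups' is
    # only an example directory name.
    if not os.path.isdir(backupdir):
        os.makedirs(backupdir)
    shutil.copy2(fn, os.path.join(backupdir, os.path.basename(fn)))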

When you isolate separate concerns into separate functions, it becomes very easy to recompose them for different execution behavior. For example, it's trivial to run this task on all files in parallel by adding two small functions:

def _worker(args):
    return filter_max_returning_min(*args)

def multi_main(fileglob, prefix, outputfile, processes):
    from multiprocessing import Pool
    pool = Pool(processes=processes)
    workerargs = ((fn, prefix) for fn in glob.iglob(fileglob))
    minlines = pool.imap_unordered(_worker, workerargs, processes)
    write_output_file(outputfile, minlines)

Now you can start up a configurable number of workers, each of which will work on one file, and collect their min values when they are done. If you have very large files or a great number of files and are not IO-bound, this might be faster.
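
It takes the same arguments as main() plus a worker count, for example:

# Same call shape as main(), plus the number of worker processes; 4 is an
# arbitrary example value.
multi_main(os.path.join('txtdir', '*.txt'), 'ENSG', 'output.txt', processes=4)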

Just for fun, you can also easily turn this into a CLI utility:

def _argparse():
    import argparse

    def positive_int(s):
        v = int(s)
        if v < 1:
            raise argparse.ArgumentTypeError('{!r} must be a positive integer'.format(s))
        return v

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""Filter text files and write min value.

    Performs these operations on the text files in supplied `filedir`:

    1. In each file, identify lines starting with the matching `maxprefix`
       which do *not* contain the maximum value for that prefix in that file.
    2. DESTRUCTIVELY REWRITE each file with lines found in step 1 removed!
    3. Write the minimum value (for all lines in all files) to `outputfile`.
    """)
    parser.add_argument('filedir',
        help="Directory containg the text files to process. WILL REWRITE FILES!")
    parser.add_argument('maxprefix', nargs="?", default="ENSG", 
        help="Line prefix which should have values less than max value removed in each file")
    parser.add_argument('outputfile', nargs="?", default="output.txt",
        help="File in which to write min value found. WILL REWRITE FILES!")
    parser.add_argument('-p', '--parallel', metavar="N", nargs="?", type=positive_int, const=10,
        help="Process files in parallel, with N workers. Default is to process a file at a time.")
    return parser.parse_args()

if __name__ == '__main__':
    args = _argparse()
    fileglob = os.path.join(args.filedir, '*.txt')
    prefix = args.maxprefix
    outputfile = args.outputfile
    if args.parallel:
        multi_main(fileglob, prefix, outputfile, args.parallel)
    else:
        main(fileglob, prefix, outputfile)

Now you can invoke it from the command line:

$ python ENSG.py txtdir ENSCAFG --parallel=4

Upvotes: 1
