Reputation: 23
I have a text file that looks like this
P1 : Some data
P2 : blabla
P4 : whatever
F1 : something
F2 : something else
G6 : This entry continues
G6 : down here
This is followed by an empty line and then a new record which looks the same as above (about 100k records in total). I need to get a text file in which every line contains the P2, P4 and G6 entries separated by tabs.
This is what I have so far
# Extract the P1 and G6 entries of each record in data.txt and write them,
# tab-separated, to out.txt (one record per output line).
#
# NOTE: this is still the simple version. A record is written as soon as its
# first G6 line has been read, so G6 continuation lines are not merged, and
# records that lack either P1 or G6 are skipped.
P1_ = False  # True once the current record's P1 line has been read
G6_ = False  # True once the current record's G6 line has been read

with open("data.txt", 'r') as data, open('out.txt', 'w') as output:
    output.write("P1\tG6\n")
    for line in data:
        if not line.strip():
            # A blank line separates records: forget any half-read record so
            # its fields cannot be paired with those of the next one.
            P1_ = False
            G6_ = False
            continue
        if line.startswith('P1 :'):
            # Fixed-width slice kept from the original; assumes the value
            # fits in six characters -- TODO confirm against the real data.
            P1 = line[4:10].strip()
            P1_ = True
        elif line.startswith('G6'):
            # str.lstrip('G6 :') removes a *set of characters*, so it would
            # also eat a value that starts with 'G' or '6'. Take everything
            # after the first colon instead.
            G6 = line.partition(':')[2].strip()
            G6_ = True
        else:
            continue
        if P1_ and G6_:
            output.write(P1 + "\t" + G6 + "\n")
            P1_ = False
            G6_ = False
The problem I encounter is that some records do not have all entries I need and some have the G6 spread over several lines. Any ideas on how to do this?
EDIT: After reading all of your answers I realised my question was a bit vague. I do need the records which do not have all entries.
Upvotes: 2
Views: 149
Reputation: 4396
As others have suggested, you could create a dictionary for each data chunk. When you hit blank lines write to your output file and clear the dictionary. Using a defaultdict
makes concatenating the multiple entries per key easier.
from collections import defaultdict
# Tags to extract from every record, in output-column order.
keep = ['P2', 'P4', 'G6']
# Accumulates the text collected for each kept tag of the current record.
tmp_dict = defaultdict(str)


# a function to handle formatting of output
def output_format(the_dict):
    """Return one tab-separated output line for a record.

    The columns follow the order of ``keep``. A tag that is absent from
    ``the_dict`` yields an empty column, so the number of tabs is constant.
    Trailing whitespace of each value is removed and the line ends with a
    newline.

    ``dict.get`` is used instead of indexing so that plain dicts work too
    and a ``defaultdict`` argument is not modified by the lookup.
    """
    return '\t'.join([the_dict.get(k, '').rstrip() for k in keep]) + '\n'
# Read test.txt record by record (records are separated by blank lines) and
# write one formatted line per record to output.txt.
with open('test.txt') as infile, open('output.txt', 'w') as outfile:
    record_open = False  # True while the current record has at least one line
    for line in infile:
        # if there's non-whitespace text on this line
        if line.strip():
            record_open = True
            # partition() splits on the first colon only, so values that
            # themselves contain ' : ' (or are empty) no longer raise.
            k, sep, v = line.partition(':')
            k = k.strip()
            # store the data if the key is relevant,
            # appending if the key has already been hit
            if sep and k in keep:
                tmp_dict[k] += v.strip() + ' '
        # when there's a blank line, write the data to
        # the output file and clear the temporary dict;
        # repeated blank lines do not produce empty rows
        elif record_open:
            outfile.write(output_format(tmp_dict))
            tmp_dict = defaultdict(str)
            record_open = False
    # one last time, in case the file doesn't end in a blank line
    if record_open:
        outfile.write(output_format(tmp_dict))
Not sure I fully understand the desired output. This prints the value for P2, then for P4 then for G6 all on one line with a tab between each. For example:
blabla whatever This entry continues down here
If an element is missing, the tabs will still be printed, so if P4 is missing there will be two tabs in a row.
But the output is flexible. If you wanted, say, to preserve the original format after filtering and merging G6, then you could use:
def output_format(the_dict):
    """Return the record in the original ``TAG : value`` layout.

    Every key/value pair of ``the_dict`` becomes one ``'key : value'`` line,
    and a blank line is appended to terminate the record. An empty dict
    therefore yields just ``'\\n'``.

    ``items()`` is used rather than ``iteritems()`` so the function runs on
    both Python 2 and Python 3.
    """
    lines = ['{} : {}\n'.format(k, v) for k, v in the_dict.items()]
    return ''.join(lines) + '\n'
Upvotes: 1
Reputation: 541
@blackcloud, following @Kursian's original comment, I think the dictionary approach is the cleanest. Here's some sample code based on what I think you are after:
# Write "P1<TAB>G6" for every blank-line-separated record of data.txt that
# has both a P1 entry and at least one G6 entry. Multiple G6 lines of the
# same record are joined with a single space.
myDict = {}  # fields collected for the record currently being read

with open('data.txt', 'r') as f_in, open('out.txt', 'w') as f_out:
    f_out.write("P1\tG6\n")
    for line in f_in:  # loop through each line in the input text file...
        if not line.strip():  # blank line - the "record separator"
            if 'P1' in myDict and 'G6' in myDict:  # record meets the conditions
                abstract = myDict['P1'] + '\t' + myDict['G6'] + '\n'
                f_out.write(abstract)
            myDict = {}  # reset the dictionary for the next "record"
            continue
        # Split on the first colon only, so values containing ' : ' (or empty
        # values) do not raise; lines without any colon are ignored.
        key, sep, val = line.partition(':')
        if not sep:
            continue
        key = key.strip()
        val = val.strip()
        if key == 'P1':
            myDict[key] = val
        elif key == 'G6':
            if key in myDict:
                myDict[key] += ' ' + val  # continuation: append with a space
            else:
                myDict[key] = val
    # The file may end without a trailing blank line; flush the last record
    # so it is not silently dropped.
    if 'P1' in myDict and 'G6' in myDict:
        abstract = myDict['P1'] + '\t' + myDict['G6'] + '\n'
        f_out.write(abstract)
If I understand your original challenge, you are looking for blocks of data separated by a blank line
and having at least one P1
entry and one or more G6
lines. So, extending your example, given an input text file like this:
P1 : Some data
P2 : blabla
P4 : whatever
F1 : something
F2 : something else
G6 : This entry continues
G6 : down here

P1 : Some data
P2 : blabla
P4 : whatever
G6 : This entry continues
F1 : something
F2 : something else
G6 : down here

P2 : blabla
P4 : whatever
F1 : something
F2 : something else
G6 : This entry continues
G6 : down here

P1 : Some data
P2 : blabla
P4 : whatever
F1 : something
G6 : This entry continues
F2 : something else
You'll end up with an output file that looks like this after running the above script:
P1 G6
Some data This entry continues down here
Some data This entry continues down here
Some data This entry continues
Hope this all helps and big thanks to all the contributors here!!
Upvotes: 0
Reputation: 40894
So your problem is parsing, I suppose.
I hope you can somehow logically detect when a new record starts, even if some lines are missing. I suppose that the order of the entries is fixed, so you can detect when one group is over and the next starts by the fact that e.g. P fields always go before G fields.
This code works reasonably and handles the missing fields:
# Tags in the order they are expected to appear inside one record. readGroup
# starts from a copy of this list and treats a tag that is no longer expected
# as the start of the next record.
FIELD_ORDER = ['P1', 'P2', 'P4', 'F1', 'F2', 'G6']
class RecordReader(object):
    """Read ``TAG : value`` lines from a stream, with one-record push-back.

    ``readNext`` returns ``(tag, value)`` tuples. A record handed to
    ``pushBack`` is returned by the next ``readNext`` call before anything
    else is read from the stream.
    """

    def __init__(self, input_stream):
        """Wrap *input_stream*, any object providing ``readline()``."""
        self.input_stream = input_stream
        self.prev_record = None  # record pushed back by the caller, if any

    def readNext(self):
        """Return the next ``(tag, value)`` tuple, or None at end of file.

        Blank and whitespace-only lines are skipped, so the empty separator
        lines between records do not break parsing. The line is split on the
        first colon only, so a value may itself contain colons.

        Raises:
            ValueError: if a non-blank line contains no ':' at all.
        """
        if self.prev_record is not None:
            # if we have a value pushed back, return it first
            prev_value = self.prev_record
            self.prev_record = None
            return prev_value
        # read things for real
        while True:
            line = self.input_stream.readline()
            if line == '':
                return None  # end of file
            if line.strip():
                break  # found a line with content
        tag, sep, value = line.partition(':')
        if not sep:
            raise ValueError("malformed line (no ':' separator): %r" % line)
        return tag.strip(), value.strip()

    def pushBack(self, record):
        """Store *record* so that the next ``readNext`` call returns it."""
        self.prev_record = record
def readGroup(reader, field_order=None):
    """Read one group (record) of fields from *reader* into a dict.

    A group ends when a tag appears that can no longer belong to it given
    the expected field order; that record is pushed back onto *reader* so
    the next call starts with it. 'G6' may repeat, and its values are joined
    with single spaces. Tags that are not part of the field order at all are
    ignored, so an unknown tag cannot produce an empty group in mid-file.

    Args:
        reader: object with ``readNext()`` returning ``(tag, value)`` or a
            falsy value at end of input, and ``pushBack(record)``.
        field_order: sequence of tags in their expected order. Defaults to
            the module-level ``FIELD_ORDER``.

    Returns:
        dict mapping tag to value; empty only when the input is exhausted.
    """
    order = FIELD_ORDER if field_order is None else field_order
    data = {}  # accumulate values here
    expected = list(order)  # make a copy, for we'll change the expectations
    while True:
        record = reader.readNext()
        if not record:
            return data  # end of file, we are done
        tag, content = record
        if tag not in order:
            continue  # unknown tag: skip it rather than push it back forever
        # are we still in the same group?
        if tag not in expected:
            reader.pushBack(record)  # it belongs to next group
            return data
        # cut away all the expected fields up to the field we just found
        point = expected.index(tag)
        if tag == 'G6':
            # append to existing G6, if any (no leading space on the first)
            data[tag] = data[tag] + ' ' + content if tag in data else content
        else:
            data[tag] = content
            point += 1  # stop expecting the tag we just found
        expected = expected[point:]
def read(input_stream):
    """Print the P1 and G6 values of every group read from *input_stream*.

    Groups are read with ``readGroup`` until it returns an empty dict, which
    is taken to mean that the input is exhausted. A missing field is
    replaced by a placeholder text.
    """
    reader = RecordReader(input_stream)
    while True:
        group = readGroup(reader)
        if not group:
            # an empty group is the end-of-input signal
            break
        p1 = group.get('P1', 'No P1 found')
        g6 = group.get('G6', 'No G6 found')
        # Parenthesised single-argument form works as both the Python 2
        # print statement and the Python 3 print function.
        print("%s\t%s" % (p1, g6))  # edit to taste
def main():
    """Open 'foo.bar' and print the P1/G6 summary of each record in it."""
    with open('foo.bar') as input_stream:
        read(input_stream)
Upvotes: 0
Reputation: 7886
So I added a new variable G6_Done
that keeps track of when we've started seeing G6 which prevents entering the bottom if (which writes to the file) till we're no longer seeing G6 (as then G6_Done == True
). Then, by using G6 = ""
in its definition, we can write to it continuously till we reset things.
# Write "P1<TAB>G6" for every record of data.txt, merging a G6 entry that is
# spread over several consecutive lines. G6_Done marks the moment the first
# non-G6 line after a run of G6 lines has been seen, i.e. when the G6 text
# is complete and the record can be written.
P1 = ""
G6 = ""           # G6 text accumulated for the current record
P1_ = False       # a P1 line has been seen in the current record
G6_ = False       # at least one G6 line has been seen in the current record
G6_Done = False   # the run of G6 lines has ended

with open("data.txt", 'r') as data, open('out.txt', 'w') as output:
    output.write("P1\tG6\n")
    for line in data:
        if line.startswith('P1 :'):
            # Fixed-width slice kept from the original; assumes the value
            # fits in six characters -- TODO confirm against the real data.
            P1 = line[4:10].strip()
            P1_ = True
        elif line.startswith('G6'):
            # lstrip('G6 :') strips a character set and could eat the start
            # of the value; take the text after the first colon instead.
            piece = line.partition(':')[2].strip()
            G6 = G6 + " " + piece if G6 else piece
            G6_Done = False
            G6_ = True
        elif G6_ and not G6_Done:
            G6_Done = True  # first non-G6 line after the G6 run
        else:
            if not line.strip():
                # Blank line closing a record that had no G6: drop its P1 so
                # it cannot be paired with the next record's G6.
                P1_ = False
            continue
        if G6_ and G6_Done:
            if P1_:
                output.write(P1 + "\t" + G6 + "\n")
            # Reset even when P1 was missing, so the collected G6 text does
            # not leak into the next record.
            G6 = ""
            P1_ = False
            G6_ = False
            G6_Done = False
    # The file may end directly after a G6 line; flush that last record.
    if P1_ and G6_:
        output.write(P1 + "\t" + G6 + "\n")
Upvotes: 0