Reputation: 29
I have a .txt filled with data that I want to filter (approx. 5800 lines), since some lines occur as duplicates with the only difference that the timestamp is exactly 2 hours later. Those lines that are the late version of the duplicate (so for example the first line in the attached example) should be left out. All other lines should be left in and written to a new .txt file.
1_3_IMM 2016-07-19 16:11:56 00:00:40 2 Sensor Check # should go
1_3_IMM 2016-07-19 14:12:40 00:00:33 2 Sensor Check # should stay
1_3_IMM 2016-07-19 14:11:56 00:00:40 2 Sensor Check # should stay
1_3_IMM 2016-07-19 16:12:40 00:00:33 2 Sensor Check # should go
1_4_IMM 2016-07-19 17:23:25 00:00:20 2 Sensor Check # should stay
1_4_IMM 2016-07-19 19:23:25 00:00:20 2 Sensor Check # should go
1_4_IMM 2016-07-19 19:15:24 00:02:21 2 Sensor Check # should stay
1_4_IMM 2016-07-19 19:25:13 00:02:13 2 Sensor Check # should stay
I wrote some code in Python, output is a .txt file with only 1 line of text:
deleted
I can't seem to solve this one. Can you help? See code below.
import os
def filter_file():
with open("output.txt", "w") as output:
#open the input file from a specified directory
directory = os.path.normpath("C:/Users/sande_000/Documents/Python files")
for subdir, dirs, files in os.walk(directory):
for file in files:
if file.startswith("input"):
input_file=open(os.path.join(subdir, file))
#iterate over each line of the file
for line in input_file:
machine = line[0:7] #stores machine number
date = line[8:18] #stores date stamp
time_1 = int(line[19:21]) #stores hour stamp
time_2 = int(line[22:24]) #stores minutes stamp
time_3 = int(line[25:27]) #stores second stamp
#check current line with other lines for duplicates by iterating over each line of the file
for otherline in input_file:
compare_machine = otherline[0:7]
compare_date = otherline[8:18]
compare_time_1 = int(otherline[19:21])+2
compare_time_2 = int(otherline[22:24])
compare_time_3 = int(otherline[25:27])
#check whether machine number & date/hour+2/minutes/seconds stamp are similar.
#If yes, write 'deleted' to output.txt and stop comparing lines.
#If no, continue with comparing next line.
if compare_machine == machine and compare_date == date and compare_time_1 == time_1 and compare_time_2 == time_2 and compare_time_3 == time_3:
output.write("deleted"+"\n")
break
else:
continue
#If no overlap between one line with any other line from the file, write that line to output.txt since it is no duplicate.
output.write(line)
input_file.close()
if __name__ == "__main__":
filter_file()
Upvotes: 1
Views: 222
Reputation: 9076
I believe the code below works. Note that this code won't work if there's any variation in the smallest three time components of the records (milliseconds, microseconds, nanoseconds) on account of the fact that datetime
doesn't support resolution beyond microseconds. In your example, that won't make a difference though.
import os
from datetime import datetime, timedelta
INPUT_DIR = 'C:\Temp'
OUTPUT_FILE = 'output.txt'
def parse_data(data):
for line in data.splitlines():
date_s = ' '.join(line.split()[1:3])
date = datetime.strptime(date_s, '%Y-%m-%d %H:%M:%S')
yield line, date
def filter_duplicates(data):
duplicate_offset = timedelta(hours=2)
parsed_data = list(parse_data(data))
lines, dates = zip(*parsed_data)
for line, date in parsed_data:
if (date - duplicate_offset) not in dates:
yield line
def get_input_data_from_dir(directory):
data = ''
for sub_dir, _, files in os.walk(directory):
for file in files:
if file.startswith('input'):
with open(os.path.join(sub_dir, file)) as f:
data += f.read() + '\n'
return data
if __name__ == '__main__':
data = get_input_data_from_dir(INPUT_DIR)
with open(OUTPUT_FILE, 'w') as f_out:
content = '\n'.join(filter_duplicates(data))
f_out.write(content)
Tested for input directory with structure:
me@my-computer /cygdrive/c/Temp
$ tree
.
├── input_1.txt
└── input_2.txt
input_1.txt
:
1_3_IMM 2016-07-19 16:11:56 00:00:40 2 Sensor Check
1_3_IMM 2016-07-19 14:12:40 00:00:33 2 Sensor Check
1_3_IMM 2016-07-19 14:11:56 00:00:40 2 Sensor Check
1_3_IMM 2016-07-19 16:12:40 00:00:33 2 Sensor Check
input_2.txt
:
1_4_IMM 2016-07-19 17:23:25 00:00:20 2 Sensor Check
1_4_IMM 2016-07-19 19:23:25 00:00:20 2 Sensor Check
1_4_IMM 2016-07-19 19:15:24 00:02:21 2 Sensor Check
1_4_IMM 2016-07-19 19:25:13 00:02:13 2 Sensor Check
output.txt
after execution:
1_3_IMM 2016-07-19 14:12:40 00:00:33 2 Sensor Check
1_3_IMM 2016-07-19 14:11:56 00:00:40 2 Sensor Check
1_4_IMM 2016-07-19 17:23:25 00:00:20 2 Sensor Check
1_4_IMM 2016-07-19 19:15:24 00:02:21 2 Sensor Check
1_4_IMM 2016-07-19 19:25:13 00:02:13 2 Sensor Check
Your expected output below, copied for convenience:
1_3_IMM 2016-07-19 16:11:56 00:00:40 2 Sensor Check # should go
1_3_IMM 2016-07-19 14:12:40 00:00:33 2 Sensor Check # should stay
1_3_IMM 2016-07-19 14:11:56 00:00:40 2 Sensor Check # should stay
1_3_IMM 2016-07-19 16:12:40 00:00:33 2 Sensor Check # should go
1_4_IMM 2016-07-19 17:23:25 00:00:20 2 Sensor Check # should stay
1_4_IMM 2016-07-19 19:23:25 00:00:20 2 Sensor Check # should go
1_4_IMM 2016-07-19 19:15:24 00:02:21 2 Sensor Check # should stay
1_4_IMM 2016-07-19 19:25:13 00:02:13 2 Sensor Check # should stay
Upvotes: 1
Reputation: 726
I think this shorter code should do it. Is has two consecutive loops instead of nested loops which should increase performance.
from datetime import datetime, timedelta
# os.walk etc.
for file in files:
if not file.startswith("input"):
continue
entries = set()
# build up entries
for line in input_file:
machine = line[0:7] #stores machine number
date = datetime.strptime(line[8:27], '%Y-%m-%d %H:%M:%S')
entries.add((machine, date))
#check entries
for line in input_file:
machine = line[0:7] #stores machine number
date = datetime.strptime(line[8:27], '%Y-%m-%d %H:%M:%S') - timedelta(hours=2)
if (machine, date) in entries:
output.write("deleted\n")
else:
output.write(line)
output.flush()
Upvotes: 0