Reputation: 71
I have a list of filename strings, and every name ends with a date, e.g. hello_20200825.pdf, hello_20200720.pdf, etc. How can I strip the end of each string to get the date, then check which string in the list has the most recent date and return that string? Here is my code so far:
import os
import datetime
def most_recent_file(region, wsp):
    """Print the file names in the region's PDF folder that contain ``wsp``.

    Work-in-progress draft: it gathers every regular file in
    ``PDFs/<region>`` whose name contains the ``wsp`` substring and then,
    once per gathered name, prints the whole list. Nothing is returned.
    """
    path = r'PDFs/'+region
    # Keep only regular files (not sub-directories) whose name contains wsp.
    files = [
        entry
        for entry in os.listdir(path)
        if os.path.isfile(os.path.join(path, entry)) and wsp in entry
    ]
    # The draft prints the complete list on every iteration.
    for _ in files:
        print(files)
Upvotes: 0
Views: 147
Reputation: 11
The Delorean module has a `parse` method that is good at handling arbitrary date formats, and you could also use the regex below to strip the non-numeric characters before parsing in the other solutions. If you store each filename in a tuple together with its parsed datetime, you can still use `max` and then return the filename by its tuple index.
import os
import datetime
import re
from delorean import parse
def most_recent_file(region, wsp):
    """Return the name of the most recently dated file matching ``wsp``.

    Looks at the regular files in ``PDFs/<region>`` whose name contains
    ``wsp``, parses the digits of each name as a date with Delorean's
    ``parse`` and returns the name with the latest date.

    Args:
        region: Sub-folder of ``PDFs`` to search.
        wsp: Substring a file name must contain to be considered.

    Returns:
        The file name (not the full path) with the latest date, or ``None``
        when no file matches.
    """
    path = os.path.join('PDFs', region)
    files = [
        name
        for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name)) and wsp in name
    ]

    result_list = []
    for name in files:
        # Remove everything that is not a digit. Every digit in the name is
        # kept, so this assumes the date is the only number in the file name.
        digits_only = re.sub(r"[^0-9]", "", name)
        # Parse the remaining digits; the .datetime attribute is the sort key.
        parsed = parse(digits_only).datetime
        # Pair the datetime with its file name so max() can compare by date.
        result_list.append((parsed, name))

    if not result_list:
        # max() would raise ValueError on an empty list.
        return None

    # Tuples compare on the datetime first; element 1 is the file name.
    most_recent_filename = max(result_list)[1]
    return most_recent_filename
Upvotes: 1
Reputation: 887
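If you don't want to use the datetime module, you can work directly with the formatted date strings in lists: because the dates are written as YYYYMMDD, comparing them as plain strings already gives chronological order.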
import os
import datetime
def most_recent_file(region, wsp):
    """Print and return the name of the most recently dated file.

    Considers the regular files in ``PDFs/<region>`` whose name contains
    ``wsp``. The date is the text after the last underscore, up to the first
    dot. Dates are compared as strings, which orders them chronologically
    as long as they are all written as YYYYMMDD.

    Args:
        region: Sub-folder of ``PDFs`` to search.
        wsp: Substring a file name must contain to be considered.

    Returns:
        The file name (not the full path) carrying the latest date, or
        ``None`` when no file matches.
    """
    path = os.path.join('PDFs', region)
    files = [
        name
        for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name)) and wsp in name
    ]
    # Split on the LAST underscore so prefixes such as "my_report" that
    # contain underscores themselves still yield the date part.
    dates = [name.rsplit('_', 1)[-1].split('.')[0] for name in files]

    # Pair each date with its file so the winning file name is recoverable;
    # default=None avoids a ValueError when nothing matched.
    latest = max(zip(dates, files), default=None)
    if latest is None:
        return None

    latestFile = latest[1]
    print(latestFile)
    return latestFile
Here is the same loop using the datetime module instead:
# Variant of the date-collecting loop that stores datetime objects instead
# of strings. It relies on `files` and `dates` already existing in the
# surrounding code, and on `import datetime`.
for i in files:
    # Text between the first and second underscore, up to the first dot.
    dateFromString = i.split('_')[1].split('.')[0]
    # Parse the YYYYMMDD text into a datetime object.
    date = datetime.datetime.strptime(dateFromString, '%Y%m%d')
    dates.append(date)
# Prints the latest datetime in `dates`. NOTE(review): this is the date
# itself, not the name of the file that carries it.
print(max(dates))
Upvotes: 2
Reputation: 5615
You can split the file name on `_`, take the date part, and parse it using `datetime.strptime` - then it's just simple maths.
import os
from datetime import datetime
def most_recent_file(region, wsp):
    """Return the name of the most recently dated file matching ``wsp``.

    Considers the regular files in ``PDFs/<region>`` whose name contains
    ``wsp``. The date is the text after the last underscore, up to the first
    dot, and is parsed as YYYYMMDD.

    Args:
        region: Sub-folder of ``PDFs`` to search.
        wsp: Substring a file name must contain to be considered.

    Returns:
        The file name (not the full path) with the latest date, or an empty
        string when no file matches.

    Raises:
        ValueError: If a matching file's date part is not a valid YYYYMMDD
            date.
    """
    path = os.path.join('PDFs', region)
    # Stays '' when no file matches.
    filename = ''
    latest_date = None
    for file in os.listdir(path):
        # Skip sub-directories and files that do not belong to this wsp.
        if not (os.path.isfile(os.path.join(path, file)) and wsp in file):
            continue
        # Split on the LAST underscore so prefixes containing underscores
        # still yield the date, then drop the extension.
        date_str = file.rsplit('_', 1)[-1].split('.')[0]
        file_date = datetime.strptime(date_str, '%Y%m%d')
        # Compare dates directly; on a tie the first file seen is kept.
        if latest_date is None or file_date > latest_date:
            latest_date = file_date
            filename = file
    return filename
Upvotes: 0