Reputation: 6389
I am accessing some data through an API where I need to provide the date range for my request, ex. start='20100101', end='20150415'. I thought I would speed this up by breaking up the date range into non-overlapping intervals and use multiprocessing on each interval.
My problem is that how I am breaking up the date range is not consistently giving me the expected result. Here is what I have done:
from datetime import date
begin = '20100101'
end = '20101231'
Suppose we wanted to break this up into quarters. First I change the string into dates:
def get_yyyy_mm_dd(yyyymmdd):
# given string 'yyyymmdd' return (yyyy, mm, dd)
year = yyyymmdd[0:4]
month = yyyymmdd[4:6]
day = yyyymmdd[6:]
return int(year), int(month), int(day)
y1, m1, d1 = get_yyyy_mm_dd(begin)
d1 = date(y1, m1, d1)
y2, m2, d2 = get_yyyy_mm_dd(end)
d2 = date(y2, m2, d2)
Then divide this range into sub-intervals:
def remove_tack(dates_list):
# given a list of dates in form YYYY-MM-DD return a list of strings in form 'YYYYMMDD'
tackless = []
for d in dates_list:
s = str(d)
tackless.append(s[0:4]+s[5:7]+s[8:])
return tackless
def divide_date(date1, date2, intervals):
dates = [date1]
for i in range(0, intervals):
dates.append(dates[i] + (date2 - date1)/intervals)
return remove_tack(dates)
Using begin and end from above we get:
listdates = divide_date(d1, d2, 4)
print listdates # ['20100101', '20100402', '20100702', '20101001', '20101231'] looks correct
But if instead I use the dates:
begin = '20150101'
end = '20150228'
...
listdates = divide_date(d1, d2, 4)
print listdates # ['20150101', '20150115', '20150129', '20150212', '20150226']
I am missing two days at the end of February. I don't need time or timezone for my application and I don't mind installing another library.
Upvotes: 35
Views: 40034
Reputation: 79
# create bins
bins = pd.date_range(start='2020-12-27', end='2022-11-27', periods=3)
bins
# DatetimeIndex(['2020-12-27', '2021-12-12', '2022-11-27'], dtype='datetime64[ns]', freq=None)
# cut into intervals
pd.cut(df['datetime_col'], bins=bins)
Upvotes: 7
Reputation: 121
Extending Abhijit's answer, you can specify the max amount of block for a specified frequency in minutes:
def date_range(start, end, freq, max_block):
start = datetime.strptime(start,"%Y%m%d")
end = datetime.strptime(end,"%Y%m%d")
diff = timedelta(minutes=freq)*(max_block)
chunk = int((end - start ) / timedelta(minutes=freq) // (max_block))
for i in range(chunk):
yield (start + diff * i).strftime("%Y%m%d")
yield end.strftime("%Y%m%d")
period = 365*2
endDate = datetime.today().strftime("%Y%m%d")
startDate = (datetime.today() - timedelta(days=period)).strftime("%Y%m%d")
list(date_range(startDate, endDate, 15, 5000))
Pretty useful in paginating REST api requests
Upvotes: 1
Reputation: 529
Piggybacking on @Abhijit's answer, here's a version where one of the arguments is max_capacity_days based on which interval gets calculated internally.
from datetime import datetime
from typing import Iterable
def date_groups(
start_at: datetime,
end_at: datetime,
max_capacity_days: float) -> Iterable[datetime]:
capacity = timedelta(days=max_capacity_days)
interval = int( (end_at - start_at ) / capacity) + 1
for i in range(interval):
yield (start_at + capacity * i)
yield end_at
>>> list(map(str, date_groups(datetime(2021,1,1), datetime(2021,5,1), 30)))
['2021-01-01 00:00:00', '2021-01-31 00:00:00', '2021-03-02 00:00:00', '2021-04-01 00:00:00', '2021-05-01 00:00:00', '2021-05-01 00:00:00']
>>> list(map(str, date_groups(datetime(2021,1,1), datetime(2021,5,1), 50)))
['2021-01-01 00:00:00', '2021-02-20 00:00:00', '2021-04-11 00:00:00', '2021-05-01 00:00:00']
Take an action for each date pair
>>> dg = date_groups(datetime(2021,2,1, 1,33,33), datetime(2021,5,5), 30)
>>> dates = list(dg)
>>> for start_at, end_at in zip(dates[:-1],dates[1:]):
... print(f"Delta in [{start_at}, {end_at}] = {(end_at-start_at)}")
...
Delta in [2021-02-01 01:33:33, 2021-03-03 01:33:33] = 30 days, 0:00:00
Delta in [2021-03-03 01:33:33, 2021-04-02 01:33:33] = 30 days, 0:00:00
Delta in [2021-04-02 01:33:33, 2021-05-02 01:33:33] = 30 days, 0:00:00
Delta in [2021-05-02 01:33:33, 2021-05-05 00:00:00] = 2 days, 22:26:27
Upvotes: 0
Reputation: 1
If you want to split the date rang by number of days. You can use the following snippet.
import datetime
firstDate = datetime.datetime.strptime("2019-01-01", "%Y-%m-%d")
lastDate = datetime.datetime.strptime("2019-03-30", "%Y-%m-%d")
numberOfDays = 15
startdate = firstDate
startdatelist = []
enddatelist = []
while startdate <= lastDate:
enddate = startdate + datetime.timedelta(days=numberOfDays - 1)
startdatelist.append(startdate.strftime("%Y-%m-%d 00:00:00"))
if enddate > lastDate: enddatelist.append(lastDate.strftime("%Y-%m-%d 23:59:59"))
enddatelist.append(enddate.strftime("%Y-%m-%d 23:59:59"))
startdate = enddate + datetime.timedelta(days=1)
for a, b in zip(startdatelist, enddatelist):
print(str(a) + " - " + str(b))
Upvotes: 0
Reputation: 599
I've created a function, which includes the end date in date split.
from dateutil import rrule
from dateutil.relativedelta import relativedelta
from dateutil.rrule import DAILY
def date_split(start_date, end_date, freq=DAILY, interval=1):
"""
:param start_date:
:param end_date:
:param freq: refer rrule arguments can be SECONDLY, MINUTELY, HOURLY, DAILY, WEEKLY etc
:param interval: The interval between each freq iteration.
:return: iterator object
"""
# remove microsecond from date object as minimum allowed frequency is in seconds.
start_date = start_date.replace(microsecond=0)
end_date = end_date.replace(microsecond=0)
assert end_date > start_date, "end_date should be greated than start date."
date_intervals = rrule.rrule(freq, interval=interval, dtstart=start_date, until=end_date)
for date in date_intervals:
yield date
if date != end_date:
yield end_date
Upvotes: 1
Reputation: 63737
I would actually follow a different approach and rely on timedelta and date addition to determine the non-overlapping ranges
Implementation
def date_range(start, end, intv):
from datetime import datetime
start = datetime.strptime(start,"%Y%m%d")
end = datetime.strptime(end,"%Y%m%d")
diff = (end - start ) / intv
for i in range(intv):
yield (start + diff * i).strftime("%Y%m%d")
yield end.strftime("%Y%m%d")
Execution
>>> begin = '20150101'
>>> end = '20150228'
>>> list(date_range(begin, end, 4))
['20150101', '20150115', '20150130', '20150213', '20150228']
Upvotes: 49
Reputation: 109546
Using Datetimeindex and Periods from Pandas, together with dictionary comprehension:
import pandas as pd
begin = '20100101'
end = '20101231'
start = dt.datetime.strptime(begin, '%Y%m%d')
finish = dt.datetime.strptime(end, '%Y%m%d')
dates = pd.DatetimeIndex(start=start, end=finish, freq='D').tolist()
quarters = [d.to_period('Q') for d in dates]
df = pd.DataFrame([quarters, dates], index=['Quarter', 'Date']).T
quarterly_dates = {str(q): [ts.strftime('%Y%m%d')
for ts in df[df.Quarter == q].Date.values.tolist()]
for q in quarters}
>>> quarterly_dates
{'2010Q1': ['20100101',
'20100102',
'20100103',
'20100104',
'20100105',
...
'20101227',
'20101228',
'20101229',
'20101230',
'20101231']}
>>> quarterly_dates.keys()
['2010Q1', '2010Q2', '2010Q3', '2010Q4']
Upvotes: 1
Reputation: 8164
you should change date for datetime
from datetime import date, datetime, timedelta
begin = '20150101'
end = '20150228'
def get_yyyy_mm_dd(yyyymmdd):
# given string 'yyyymmdd' return (yyyy, mm, dd)
year = yyyymmdd[0:4]
month = yyyymmdd[4:6]
day = yyyymmdd[6:]
return int(year), int(month), int(day)
y1, m1, d1 = get_yyyy_mm_dd(begin)
d1 = datetime(y1, m1, d1)
y2, m2, d2 = get_yyyy_mm_dd(end)
d2 = datetime(y2, m2, d2)
def remove_tack(dates_list):
# given a list of dates in form YYYY-MM-DD return a list of strings in form 'YYYYMMDD'
tackless = []
for d in dates_list:
s = str(d)
tackless.append(s[0:4]+s[5:7]+s[8:])
return tackless
def divide_date(date1, date2, intervals):
dates = [date1]
delta = (date2-date1).total_seconds()/4
for i in range(0, intervals):
dates.append(dates[i] + timedelta(0,delta))
return remove_tack(dates)
listdates = divide_date(d1, d2, 4)
print listdates
result:
['20150101 00:00:00', '20150115 12:00:00', '20150130 00:00:00', '20150213 12:00:00', '20150228 00:00:00']
Upvotes: 3
Reputation: 14500
Could you use the datetime.date objects instead?
If you do:
import datetime
begin = datetime.date(2001, 1, 1)
end = datetime.date(2010, 12, 31)
intervals = 4
date_list = []
delta = (end - begin)/4
for i in range(1, intervals + 1):
date_list.append((begin+i*delta).strftime('%Y%m%d'))
and date_list should have the end dates for each inteval.
Upvotes: 3