Reputation: 8247
I am using following if loop for creating half an hourly buckets for approx 1 million observations which is taking hell lot of time. Following is my if loop
def half_hourly_buckets(dataframe,time_column):
dataframe[time_column] = pd.to_datetime(dataframe[time_column],format = '%H:%M:%S').dt.time
for j in range(len(dataframe)):
x = dataframe.loc[j,time_column]
if (x >= datetime.time(0,0,1)) & (x <= datetime.time(0,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "00:00:01 - 00:30:00"
elif (x >= datetime.time(0,30,1)) & (x <= datetime.time(1,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "00:30:01 - 01:00:00"
elif (x >= datetime.time(1,0,1)) & (x <= datetime.time(1,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "01:00:01 - 01:30:00"
elif (x >= datetime.time(1,30,1)) & (x <= datetime.time(2,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "01:30:01 - 02:00:00"
elif (x >= datetime.time(2,0,1)) & (x <= datetime.time(2,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "02:00:01 - 02:30:00"
elif (x >= datetime.time(2,30,1)) & (x <= datetime.time(3,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "02:30:01 - 03:00:00"
elif (x >= datetime.time(3,0,1)) & (x <= datetime.time(3,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "03:00:01 - 03:30:00"
elif (x >= datetime.time(3,30,1)) & (x <= datetime.time(4,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "03:30:01 - 04:00:00"
elif (x >= datetime.time(4,0,1)) & (x <= datetime.time(4,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "04:00:01 - 04:30:00"
elif (x >= datetime.time(4,30,1)) & (x <= datetime.time(5,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "04:30:01 - 05:00:00"
elif (x >= datetime.time(5,0,1)) & (x <= datetime.time(5,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "05:00:01 - 05:30:00"
elif (x >= datetime.time(5,30,1)) & (x <= datetime.time(6,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "05:30:01 - 06:00:00"
elif (x >= datetime.time(6,0,1)) & (x <= datetime.time(6,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "06:00:01 - 06:30:00"
elif (x >= datetime.time(6,30,1)) & (x <= datetime.time(7,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "06:30:01 - 07:00:00"
elif (x >= datetime.time(7,0,1)) & (x <= datetime.time(7,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "07:00:01 - 07:30:00"
elif (x >= datetime.time(7,30,1)) & (x <= datetime.time(8,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "07:30:01 - 08:00:00"
elif (x >= datetime.time(8,0,1)) & (x <= datetime.time(8,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "08:00:01 - 08:30:00"
elif (x >= datetime.time(8,30,1)) & (x <= datetime.time(9,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "08:30:01 - 09:00:00"
elif (x >= datetime.time(9,0,1)) & (x <= datetime.time(9,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "09:00:01 - 09:30:00"
elif (x >= datetime.time(9,30,1)) & (x <= datetime.time(10,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "09:30:01 - 10:00:00"
elif (x >= datetime.time(10,0,1)) & (x <= datetime.time(10,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "10:00:01 - 10:30:00"
elif (x >= datetime.time(10,30,1)) & (x <= datetime.time(11,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "10:30:01 - 11:00:00"
elif (x >= datetime.time(11,0,1)) & (x <= datetime.time(11,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "11:00:01 - 11:30:00"
elif (x >= datetime.time(11,30,1)) & (x <= datetime.time(12,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "11:30:01 - 12:00:00"
elif (x >= datetime.time(12,0,1)) & (x <= datetime.time(12,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "12:00:01 - 12:30:00"
elif (x >= datetime.time(12,30,1)) & (x <= datetime.time(13,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "12:30:01 - 13:00:00"
elif (x >= datetime.time(13,0,1)) & (x <= datetime.time(13,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "13:00:01 - 13:30:00"
elif (x >= datetime.time(13,30,1)) & (x <= datetime.time(14,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "13:30:01 - 14:00:00"
elif (x >= datetime.time(14,0,1)) & (x <= datetime.time(14,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "14:00:01 - 14:30:00"
elif (x >= datetime.time(14,30,1)) & (x <= datetime.time(15,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "14:30:01 - 15:00:00"
elif (x >= datetime.time(15,0,1)) & (x <= datetime.time(15,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "15:00:01 - 15:30:00"
elif (x >= datetime.time(15,30,1)) & (x <= datetime.time(16,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "15:30:01 - 16:00:00"
elif (x >= datetime.time(16,0,1)) & (x <= datetime.time(16,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "16:00:01 - 16:30:00"
elif (x >= datetime.time(16,30,1)) & (x <= datetime.time(17,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "16:30:01 - 17:00:00"
elif (x >= datetime.time(17,0,1)) & (x <= datetime.time(17,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "17:00:01 - 17:30:00"
elif (x >= datetime.time(17,30,1)) & (x <= datetime.time(18,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "17:30:01 - 18:00:00"
elif (x >= datetime.time(18,0,1)) & (x <= datetime.time(18,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "18:00:01 - 18:30:00"
elif (x >= datetime.time(18,30,1)) & (x <= datetime.time(19,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "18:30:01 - 19:00:00"
elif (x >= datetime.time(19,0,1)) & (x <= datetime.time(19,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "19:00:01 - 19:30:00"
elif (x >= datetime.time(19,30,1)) & (x <= datetime.time(20,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "19:30:01 - 20:00:00"
elif (x >= datetime.time(20,0,1)) & (x <= datetime.time(20,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "20:00:01 - 20:30:00"
elif (x >= datetime.time(20,30,1)) & (x <= datetime.time(21,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "20:30:01 - 21:00:00"
elif (x >= datetime.time(21,0,1)) & (x <= datetime.time(21,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "21:00:01 - 21:30:00"
elif (x >= datetime.time(21,30,1)) & (x <= datetime.time(22,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "21:30:01 - 22:00:00"
elif (x >= datetime.time(22,0,1)) & (x <= datetime.time(22,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "22:00:01 - 22:30:00"
elif (x >= datetime.time(22,30,1)) & (x <= datetime.time(23,0,0)):
dataframe.loc[j,'half_hourly_bucket'] = "22:30:01 - 23:00:00"
elif (x >= datetime.time(23,0,1)) & (x <= datetime.time(23,30,0)):
dataframe.loc[j,'half_hourly_bucket'] = "23:00:01 - 23:30:00"
else:
dataframe.loc[j,'half_hourly_bucket'] = "23:30:01 - 00:00:00"
return dataframe
Is there any way to avoid this loop and increase the processing speed?
Upvotes: 3
Views: 205
Reputation: 6475
You could take a different approach and use timedelta
to define the bucket, which severely simplifies this code:
from datetime import datetime, timedelta
def ceil_dt(dt, delta):
return dt + (datetime.min - dt) % delta
def floor_dt(dt, delta):
return dt - (dt - datetime.min) % delta
now = datetime.now()
print(now)
print(
floor_dt(now, timedelta(minutes=30)), ceil_dt(now, timedelta(minutes=30))
)
https://repl.it/@ryanpcmcquen/TwinHelplessLifecycles-1
floor_dt
gives you the beginning of the bucket and ceil_dt
gives you the end of it.
Upvotes: 3
Reputation: 18625
You can do this very quickly with vectorized operations in Pandas. The only trick is converting your datetime.time
values into timestamps that Pandas can use. Here is some code that will do the whole job in about 4 seconds:
import datetime, random
import pandas as pd, numpy as np
# create random times
df = pd.DataFrame({'time': [
datetime.time(int(24 * h), int(60 * m), int(60 * s))
for h, m, s in np.random.rand(500000, 3)
]})
def half_hourly_buckets(dataframe, time_column):
# convert time values to datetimes on arbitrary date
base_date = datetime.datetime(2000, 1, 1)
dt = dataframe[time_column].apply(
lambda t: datetime.datetime.combine(base_date, t)
)
# assign corresponding bins
one_second = pd.Timedelta(seconds=1)
thirty_minutes = pd.Timedelta(minutes=30)
bin = (dt - one_second).dt.floor('30T')
dataframe['half_hourly_bucket'] = (
(bin + one_second).dt.strftime("%H:%M:%S")
+ ' - '
+ (bin + thirty_minutes).dt.strftime("%H:%M:%S")
)
half_hourly_buckets(df, 'time')
df
# time half_hourly_bucket
# 0 07:00:49 07:00:01 - 07:30:00
# 1 06:16:19 06:00:01 - 06:30:00
# 2 06:17:06 06:00:01 - 06:30:00
# 3 17:28:31 17:00:01 - 17:30:00
# ... ... ...
# 739 18:00:01 18:00:01 - 18:30:00
# ... ... ...
# 4259 00:00:00 23:30:01 - 00:00:00
# ... ... ...
# 4520 17:30:00 17:00:01 - 17:30:00
# ... ... ...
Upvotes: 1
Reputation: 18625
You can do this very quickly by creating a more efficient helper function to calculate the bins, then using the Pandas .apply
function to call it. Here's an example that runs in about 2 seconds.
Note that unlike some other answers, this moves the :00:00 and :30:00 times into the bin for the prior half hour, as shown in your code.
import datetime, random
import pandas as pd, numpy as np
# create random times
df = pd.DataFrame({'time': [
datetime.time(int(24 * h), int(60 * m), int(60 * s))
for h, m, s in np.random.rand(500000, 3)
]})
def make_bin(t):
h, m, s = t.hour, t.minute, t.second
# move the first second of each half-hour back into the prior bin
if s == 0 and m in {0, 30}:
if m == 0:
h = (h - 1) % 24
m = 59
else:
m = 29
# is this in the first half of the hour?
first_half = m < 30
# calculate bin start and end
start = datetime.time(h, 0 if first_half else 30, 1)
end = datetime.time(
h if first_half else (h + 1) % 24,
30 if first_half else 0,
0
)
return '{} - {}'.format(start, end)
def half_hourly_buckets(dataframe, time_column):
dataframe['half_hourly_bucket'] = dataframe[time_column].apply(make_bin)
half_hourly_buckets(df, 'time')
df
# time half_hourly_bucket
# 0 02:07:13 02:00:01 - 02:30:00
# 1 21:52:35 21:30:01 - 22:00:00
# 2 07:41:36 07:30:01 - 08:00:00
# 3 20:41:36 20:30:01 - 21:00:00
# ...
# 761 17:00:00 16:30:01 - 17:00:00
# 1460 17:30:00 17:00:01 - 17:30:00
# ...
# 219253 00:00:00 23:30:01 - 00:00:00
# ...
Upvotes: 0
Reputation: 18625
To give the most direct answer to your question: you can move your bin-calculation code into a helper function, then use Pandas' .apply()
method to apply it to all the elements of a series. That will run much more quickly than your loop, mainly because the incremental assignments you are currently using are very slow. There's an example of how to do this below, based on your original code.
This code processes 500,000 rows in about 5 seconds on my computer. You can speed it up further by using more efficient methods to calculate the bins; I gave an example in another answer that takes about 2 seconds to run.
df['half_hourly_bucket'] = df['time'].apply(make_bin)
where make_bin()
is defined via
def make_bin(x):
if (x >= datetime.time(0,0,1)) & (x <= datetime.time(0,30,0)):
bin = "00:00:01 - 00:30:00"
elif (x >= datetime.time(0,30,1)) & (x <= datetime.time(1,0,0)):
bin = "00:30:01 - 01:00:00"
elif (x >= datetime.time(1,0,1)) & (x <= datetime.time(1,30,0)):
bin = "01:00:01 - 01:30:00"
elif (x >= datetime.time(1,30,1)) & (x <= datetime.time(2,0,0)):
bin = "01:30:01 - 02:00:00"
elif (x >= datetime.time(2,0,1)) & (x <= datetime.time(2,30,0)):
bin = "02:00:01 - 02:30:00"
elif (x >= datetime.time(2,30,1)) & (x <= datetime.time(3,0,0)):
bin = "02:30:01 - 03:00:00"
elif (x >= datetime.time(3,0,1)) & (x <= datetime.time(3,30,0)):
bin = "03:00:01 - 03:30:00"
elif (x >= datetime.time(3,30,1)) & (x <= datetime.time(4,0,0)):
bin = "03:30:01 - 04:00:00"
elif (x >= datetime.time(4,0,1)) & (x <= datetime.time(4,30,0)):
bin = "04:00:01 - 04:30:00"
elif (x >= datetime.time(4,30,1)) & (x <= datetime.time(5,0,0)):
bin = "04:30:01 - 05:00:00"
elif (x >= datetime.time(5,0,1)) & (x <= datetime.time(5,30,0)):
bin = "05:00:01 - 05:30:00"
elif (x >= datetime.time(5,30,1)) & (x <= datetime.time(6,0,0)):
bin = "05:30:01 - 06:00:00"
elif (x >= datetime.time(6,0,1)) & (x <= datetime.time(6,30,0)):
bin = "06:00:01 - 06:30:00"
elif (x >= datetime.time(6,30,1)) & (x <= datetime.time(7,0,0)):
bin = "06:30:01 - 07:00:00"
elif (x >= datetime.time(7,0,1)) & (x <= datetime.time(7,30,0)):
bin = "07:00:01 - 07:30:00"
elif (x >= datetime.time(7,30,1)) & (x <= datetime.time(8,0,0)):
bin = "07:30:01 - 08:00:00"
elif (x >= datetime.time(8,0,1)) & (x <= datetime.time(8,30,0)):
bin = "08:00:01 - 08:30:00"
elif (x >= datetime.time(8,30,1)) & (x <= datetime.time(9,0,0)):
bin = "08:30:01 - 09:00:00"
elif (x >= datetime.time(9,0,1)) & (x <= datetime.time(9,30,0)):
bin = "09:00:01 - 09:30:00"
elif (x >= datetime.time(9,30,1)) & (x <= datetime.time(10,0,0)):
bin = "09:30:01 - 10:00:00"
elif (x >= datetime.time(10,0,1)) & (x <= datetime.time(10,30,0)):
bin = "10:00:01 - 10:30:00"
elif (x >= datetime.time(10,30,1)) & (x <= datetime.time(11,0,0)):
bin = "10:30:01 - 11:00:00"
elif (x >= datetime.time(11,0,1)) & (x <= datetime.time(11,30,0)):
bin = "11:00:01 - 11:30:00"
elif (x >= datetime.time(11,30,1)) & (x <= datetime.time(12,0,0)):
bin = "11:30:01 - 12:00:00"
elif (x >= datetime.time(12,0,1)) & (x <= datetime.time(12,30,0)):
bin = "12:00:01 - 12:30:00"
elif (x >= datetime.time(12,30,1)) & (x <= datetime.time(13,0,0)):
bin = "12:30:01 - 13:00:00"
elif (x >= datetime.time(13,0,1)) & (x <= datetime.time(13,30,0)):
bin = "13:00:01 - 13:30:00"
elif (x >= datetime.time(13,30,1)) & (x <= datetime.time(14,0,0)):
bin = "13:30:01 - 14:00:00"
elif (x >= datetime.time(14,0,1)) & (x <= datetime.time(14,30,0)):
bin = "14:00:01 - 14:30:00"
elif (x >= datetime.time(14,30,1)) & (x <= datetime.time(15,0,0)):
bin = "14:30:01 - 15:00:00"
elif (x >= datetime.time(15,0,1)) & (x <= datetime.time(15,30,0)):
bin = "15:00:01 - 15:30:00"
elif (x >= datetime.time(15,30,1)) & (x <= datetime.time(16,0,0)):
bin = "15:30:01 - 16:00:00"
elif (x >= datetime.time(16,0,1)) & (x <= datetime.time(16,30,0)):
bin = "16:00:01 - 16:30:00"
elif (x >= datetime.time(16,30,1)) & (x <= datetime.time(17,0,0)):
bin = "16:30:01 - 17:00:00"
elif (x >= datetime.time(17,0,1)) & (x <= datetime.time(17,30,0)):
bin = "17:00:01 - 17:30:00"
elif (x >= datetime.time(17,30,1)) & (x <= datetime.time(18,0,0)):
bin = "17:30:01 - 18:00:00"
elif (x >= datetime.time(18,0,1)) & (x <= datetime.time(18,30,0)):
bin = "18:00:01 - 18:30:00"
elif (x >= datetime.time(18,30,1)) & (x <= datetime.time(19,0,0)):
bin = "18:30:01 - 19:00:00"
elif (x >= datetime.time(19,0,1)) & (x <= datetime.time(19,30,0)):
bin = "19:00:01 - 19:30:00"
elif (x >= datetime.time(19,30,1)) & (x <= datetime.time(20,0,0)):
bin = "19:30:01 - 20:00:00"
elif (x >= datetime.time(20,0,1)) & (x <= datetime.time(20,30,0)):
bin = "20:00:01 - 20:30:00"
elif (x >= datetime.time(20,30,1)) & (x <= datetime.time(21,0,0)):
bin = "20:30:01 - 21:00:00"
elif (x >= datetime.time(21,0,1)) & (x <= datetime.time(21,30,0)):
bin = "21:00:01 - 21:30:00"
elif (x >= datetime.time(21,30,1)) & (x <= datetime.time(22,0,0)):
bin = "21:30:01 - 22:00:00"
elif (x >= datetime.time(22,0,1)) & (x <= datetime.time(22,30,0)):
bin = "22:00:01 - 22:30:00"
elif (x >= datetime.time(22,30,1)) & (x <= datetime.time(23,0,0)):
bin = "22:30:01 - 23:00:00"
elif (x >= datetime.time(23,0,1)) & (x <= datetime.time(23,30,0)):
bin = "23:00:01 - 23:30:00"
else:
bin = "23:30:01 - 00:00:00"
return bin
Upvotes: 0
Reputation: 104032
First, make a sorted list of all those time buckets as tuples:
>>> times=[('00:00:01', '00:30:00'), ('00:30:01', '01:00:00'), ('01:00:01', '01:30:00'), ('01:30:01', '02:00:00'), ('02:00:01', '02:30:00'), ('02:30:01', '03:00:00'), ('03:00:01', '03:30:00'), ('03:30:01', '04:00:00'), ('04:00:01', '04:30:00'), ('04:30:01', '05:00:00'), ('05:00:01', '05:30:00'), ('05:30:01', '06:00:00'), ('06:00:01', '06:30:00'), ('06:30:01', '07:00:00'), ('07:00:01', '07:30:00'), ('07:30:01', '08:00:00'), ('08:00:01', '08:30:00'), ('08:30:01', '09:00:00'), ('09:00:01', '09:30:00'), ('09:30:01', '10:00:00'), ('10:00:01', '10:30:00'), ('10:30:01', '11:00:00'), ('11:00:01', '11:30:00'), ('11:30:01', '12:00:00'), ('12:00:01', '12:30:00'), ('12:30:01', '13:00:00'), ('13:00:01', '13:30:00'), ('13:30:01', '14:00:00'), ('14:00:01', '14:30:00'), ('14:30:01', '15:00:00'), ('15:00:01', '15:30:00'), ('15:30:01', '16:00:00'), ('16:00:01', '16:30:00'), ('16:30:01', '17:00:00'), ('17:00:01', '17:30:00'), ('17:30:01', '18:00:00'), ('18:00:01', '18:30:00'), ('18:30:01', '19:00:00'), ('19:00:01', '19:30:00'), ('19:30:01', '20:00:00'), ('20:00:01', '20:30:00'), ('20:30:01', '21:00:00'), ('21:00:01', '21:30:00'), ('21:30:01', '22:00:00'), ('22:00:01', '22:30:00'), ('22:30:01', '23:00:00'), ('23:00:01', '23:30:00'), ('23:30:01', '00:00:00')]
Then an index based on the right hand value:
>>> idx=[t[1] for t in times]
Then you can use the bisect module to select the correct bucket:
>>> times[bisect.bisect_left(idx,'00:31:00')]
('00:30:01', '01:00:00')
Given that the 00:00:00
value goes in the last bucket (as you have written the example), you would test for that separately:
>>> bucket = ('23:30:01', '00:00:00') if ts=='00:00:00' else times[bisect.bisect_left(idx, ts)]
The bisect module is very fast and efficient. Perhaps several hundred times faster than other native Python methods to do similar things.
Upvotes: 2
Reputation: 38415
You can use pandas cut to bin the data into half an hour interval
dates = pd.date_range(end = '09/18/2018', periods = 10000, freq='S')
df = pd.DataFrame({'datetime': np.random.choice(dates, 10000)})
bins = pd.date_range(df.datetime.dt.floor('30T').min(), df.datetime.dt.ceil('30T').max(), freq = '30T')
pd.cut(df.datetime, bins = bins)
You get
0 (2018-09-17 22:30:00, 2018-09-17 23:00:00]
1 (2018-09-17 22:00:00, 2018-09-17 22:30:00]
2 (2018-09-17 21:00:00, 2018-09-17 21:30:00]
3 (2018-09-17 21:30:00, 2018-09-17 22:00:00]
4 (2018-09-17 22:00:00, 2018-09-17 22:30:00]
5 (2018-09-17 23:30:00, 2018-09-18]
6 (2018-09-17 22:00:00, 2018-09-17 22:30:00]
7 (2018-09-17 22:00:00, 2018-09-17 22:30:00]
The bins can be easily grouped for aggregations
dates = pd.date_range(end = '09/18/2018', periods = 10000, freq='S')
df = pd.DataFrame({'id' : np.random.randint(10, 1000, 10000),'datetime': np.random.choice(dates, 10000)})
bins = pd.date_range(df.datetime.dt.floor('30T').min(), df.datetime.dt.ceil('30T').max(), freq = '30T')
df.groupby(['id', pd.cut(df.datetime, bins = bins)]).size()
id datetime
10 (2018-09-17 21:00:00, 2018-09-17 21:30:00] 2
(2018-09-17 21:30:00, 2018-09-17 22:00:00] 1
(2018-09-17 22:00:00, 2018-09-17 22:30:00] 1
(2018-09-17 22:30:00, 2018-09-17 23:00:00] 3
(2018-09-17 23:00:00, 2018-09-17 23:30:00] 4
11 (2018-09-17 21:30:00, 2018-09-17 22:00:00] 1
(2018-09-17 22:00:00, 2018-09-17 22:30:00] 1
(2018-09-17 22:30:00, 2018-09-17 23:00:00] 1
(2018-09-17 23:30:00, 2018-09-18] 1
Upvotes: 2
Reputation: 9279
Firstly, you're doing about twice the number of comparisons you need for this approach. If you don't pass the first test, you already know that
x >= datetime.time(0,30,1))
so you don't have to test this a second time on the next elif
.
Secondly, because of the regular buckets you're using, you can work out which bucket you need by taking the number of seconds and using the integer part of the result of dividing that by thirty minutes. Assuming x
is a time object you could do something like this:
bucket_number = int((datetime.datetime.combine(datetime.date.min, x) -
datetime.datetime.combine(datetime.date.min, datetime.time(0))
).total_seconds() / (30 * 60))
bucket_start = datetime.datetime.combine(datetime.date.min, datetime.time(0)) + \
datetime.timedelta(seconds = bucket_number * 30 * 60)
bucket_end = datetime.datetime.combine(datetime.date.min, datetime.time(0)) + \
datetime.timedelta(seconds = (bucket_number + 1) * 30 * 60)
dataframe.loc[j,'half_hourly_bucket'] = "{} - {}".format(bucket_start.strftime('%H:%M:%S'),
bucket_end.strftime('%H:%M:%S'))
This will remove the need for any tests.
Note: A lot of the hard work here is because it's difficult to work with time
objects. If you could use datetime
objects instead, this would be a whole lot easier.
Upvotes: 3
Reputation: 276
You don't need cases at all.
x = dataframe.loc[j,time_column]
if x > datetime.time(23,30,0):
dataframe.loc[j,'half_hourly_bucket'] = "23:30:01 - 00:00:00"
else:
past = x.minute >= 30
min_hour = str(x.hour).zfill(2)
max_hour = str(x.hour+1 if past else x.hour).zfill(2)
min_time = ':30:01' if past else ':00:01'
max_time = ':00:00' if past else ':30:00'
dataframe.loc[j,'half_hourly_bucket'] = min_hour+min_time+' - '+max_hour+max_time
Upvotes: 2