Reputation: 8247

how to convert multiple if else loop to list comprehension in python

I am using following if loop for creating half an hourly buckets for approx 1 million observations which is taking hell lot of time. Following is my if loop

def half_hourly_buckets(dataframe,time_column):
   dataframe[time_column] = pd.to_datetime(dataframe[time_column],format = '%H:%M:%S').dt.time
   for j in range(len(dataframe)):
    x = dataframe.loc[j,time_column]
    if (x >= datetime.time(0,0,1)) & (x <= datetime.time(0,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "00:00:01 - 00:30:00"
    elif (x >= datetime.time(0,30,1)) & (x <= datetime.time(1,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "00:30:01 - 01:00:00"
    elif (x >= datetime.time(1,0,1)) & (x <= datetime.time(1,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "01:00:01 - 01:30:00"
    elif (x >= datetime.time(1,30,1)) & (x <= datetime.time(2,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "01:30:01 - 02:00:00"
    elif (x >= datetime.time(2,0,1)) & (x <= datetime.time(2,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "02:00:01 - 02:30:00"
    elif (x >= datetime.time(2,30,1)) & (x <= datetime.time(3,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "02:30:01 - 03:00:00"
    elif (x >= datetime.time(3,0,1)) & (x <= datetime.time(3,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "03:00:01 - 03:30:00"
    elif (x >= datetime.time(3,30,1)) & (x <= datetime.time(4,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "03:30:01 - 04:00:00"
    elif (x >= datetime.time(4,0,1)) & (x <= datetime.time(4,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "04:00:01 - 04:30:00"
    elif (x >= datetime.time(4,30,1)) & (x <= datetime.time(5,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "04:30:01 - 05:00:00"
    elif (x >= datetime.time(5,0,1)) & (x <= datetime.time(5,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "05:00:01 - 05:30:00"
    elif (x >= datetime.time(5,30,1)) & (x <= datetime.time(6,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "05:30:01 - 06:00:00"
    elif (x >= datetime.time(6,0,1)) & (x <= datetime.time(6,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "06:00:01 - 06:30:00"
    elif (x >= datetime.time(6,30,1)) & (x <= datetime.time(7,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "06:30:01 - 07:00:00"
    elif (x >= datetime.time(7,0,1)) & (x <= datetime.time(7,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "07:00:01 - 07:30:00"
    elif (x >= datetime.time(7,30,1)) & (x <= datetime.time(8,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "07:30:01 - 08:00:00"
    elif (x >= datetime.time(8,0,1)) & (x <= datetime.time(8,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "08:00:01 - 08:30:00"
    elif (x >= datetime.time(8,30,1)) & (x <= datetime.time(9,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "08:30:01 - 09:00:00"
    elif (x >= datetime.time(9,0,1)) & (x <= datetime.time(9,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "09:00:01 - 09:30:00"
    elif (x >= datetime.time(9,30,1)) & (x <= datetime.time(10,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "09:30:01 - 10:00:00"
    elif (x >= datetime.time(10,0,1)) & (x <= datetime.time(10,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "10:00:01 - 10:30:00"
    elif (x >= datetime.time(10,30,1)) & (x <= datetime.time(11,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "10:30:01 - 11:00:00"
    elif (x >= datetime.time(11,0,1)) & (x <= datetime.time(11,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "11:00:01 - 11:30:00"
    elif (x >= datetime.time(11,30,1)) & (x <= datetime.time(12,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "11:30:01 - 12:00:00"
    elif (x >= datetime.time(12,0,1)) & (x <= datetime.time(12,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "12:00:01 - 12:30:00"
    elif (x >= datetime.time(12,30,1)) & (x <= datetime.time(13,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "12:30:01 - 13:00:00"
    elif (x >= datetime.time(13,0,1)) & (x <= datetime.time(13,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "13:00:01 - 13:30:00"
    elif (x >= datetime.time(13,30,1)) & (x <= datetime.time(14,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "13:30:01 - 14:00:00"
    elif (x >= datetime.time(14,0,1)) & (x <= datetime.time(14,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "14:00:01 - 14:30:00"
    elif (x >= datetime.time(14,30,1)) & (x <= datetime.time(15,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "14:30:01 - 15:00:00"
    elif (x >= datetime.time(15,0,1)) & (x <= datetime.time(15,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "15:00:01 - 15:30:00"
    elif (x >= datetime.time(15,30,1)) & (x <= datetime.time(16,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "15:30:01 - 16:00:00"
    elif (x >= datetime.time(16,0,1)) & (x <= datetime.time(16,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "16:00:01 - 16:30:00"
    elif (x >= datetime.time(16,30,1)) & (x <= datetime.time(17,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "16:30:01 - 17:00:00"
    elif (x >= datetime.time(17,0,1)) & (x <= datetime.time(17,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "17:00:01 - 17:30:00"
    elif (x >= datetime.time(17,30,1)) & (x <= datetime.time(18,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "17:30:01 - 18:00:00"
    elif (x >= datetime.time(18,0,1)) & (x <= datetime.time(18,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "18:00:01 - 18:30:00"
    elif (x >= datetime.time(18,30,1)) & (x <= datetime.time(19,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "18:30:01 - 19:00:00"
    elif (x >= datetime.time(19,0,1)) & (x <= datetime.time(19,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "19:00:01 - 19:30:00"
    elif (x >= datetime.time(19,30,1)) & (x <= datetime.time(20,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "19:30:01 - 20:00:00"
    elif (x >= datetime.time(20,0,1)) & (x <= datetime.time(20,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "20:00:01 - 20:30:00"
    elif (x >= datetime.time(20,30,1)) & (x <= datetime.time(21,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "20:30:01 - 21:00:00"
    elif (x >= datetime.time(21,0,1)) & (x <= datetime.time(21,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "21:00:01 - 21:30:00"
    elif (x >= datetime.time(21,30,1)) & (x <= datetime.time(22,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "21:30:01 - 22:00:00"
    elif (x >= datetime.time(22,0,1)) & (x <= datetime.time(22,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "22:00:01 - 22:30:00"
    elif (x >= datetime.time(22,30,1)) & (x <= datetime.time(23,0,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "22:30:01 - 23:00:00"
    elif (x >= datetime.time(23,0,1)) & (x <= datetime.time(23,30,0)):
        dataframe.loc[j,'half_hourly_bucket'] = "23:00:01 - 23:30:00"
    else:
        dataframe.loc[j,'half_hourly_bucket'] = "23:30:01 - 00:00:00"
return dataframe

Is there any way to avoid this loop and increase the processing speed?

Upvotes: 3

Answers (8)

ryanpcmcquen

Reputation: 6475

You could take a different approach and use timedelta to define the bucket, which severely simplifies this code:

from datetime import datetime, timedelta


def ceil_dt(dt, delta):
    return dt + (datetime.min - dt) % delta


def floor_dt(dt, delta):
    return dt - (dt - datetime.min) % delta


now = datetime.now()
print(now)
print(
    floor_dt(now, timedelta(minutes=30)), ceil_dt(now, timedelta(minutes=30))
)

https://repl.it/@ryanpcmcquen/TwinHelplessLifecycles-1

floor_dt gives you the beginning of the bucket and ceil_dt gives you the end of it.

Upvotes: 3

Matthias Fripp

Reputation: 18625

You can do this very quickly with vectorized operations in Pandas. The only trick is converting your datetime.time values into timestamps that Pandas can use. Here is some code that will do the whole job in about 4 seconds:

import datetime, random
import pandas as pd, numpy as np

# create random times
df = pd.DataFrame({'time': [
    datetime.time(int(24 * h), int(60 * m), int(60 * s)) 
    for h, m, s in np.random.rand(500000, 3)
]})

def half_hourly_buckets(dataframe, time_column):
    # convert time values to datetimes on arbitrary date
    base_date = datetime.datetime(2000, 1, 1)
    dt = dataframe[time_column].apply(
        lambda t: datetime.datetime.combine(base_date, t)
    )
    # assign corresponding bins
    one_second = pd.Timedelta(seconds=1)
    thirty_minutes = pd.Timedelta(minutes=30)
    bin = (dt - one_second).dt.floor('30T')
    dataframe['half_hourly_bucket'] = (
        (bin + one_second).dt.strftime("%H:%M:%S") 
        + ' - '
        + (bin + thirty_minutes).dt.strftime("%H:%M:%S") 
    )

half_hourly_buckets(df, 'time')
df
#             time   half_hourly_bucket
# 0       07:00:49  07:00:01 - 07:30:00
# 1       06:16:19  06:00:01 - 06:30:00
# 2       06:17:06  06:00:01 - 06:30:00
# 3       17:28:31  17:00:01 - 17:30:00
# ...          ...                  ...
# 739     18:00:01  18:00:01 - 18:30:00
# ...          ...                  ...
# 4259    00:00:00  23:30:01 - 00:00:00
# ...          ...                  ...
# 4520    17:30:00  17:00:01 - 17:30:00
# ...          ...                  ...

Upvotes: 1

Matthias Fripp

Reputation: 18625

You can do this very quickly by creating a more efficient helper function to calculate the bins, then using the Pandas .apply function to call it. Here's an example that runs in about 2 seconds.

Note that unlike some other answers, this moves the :00:00 and :30:00 times into the bin for the prior half hour, as shown in your code.

import datetime, random
import pandas as pd, numpy as np

# create random times
df = pd.DataFrame({'time': [
    datetime.time(int(24 * h), int(60 * m), int(60 * s)) 
    for h, m, s in np.random.rand(500000, 3)
]})

def make_bin(t):
    h, m, s = t.hour, t.minute, t.second
    # move the first second of each half-hour back into the prior bin
    if s == 0 and m in {0, 30}:
        if m == 0:
            h = (h - 1) % 24
            m = 59
        else:
            m = 29
    # is this in the first half of the hour?
    first_half = m < 30
    # calculate bin start and end
    start = datetime.time(h, 0 if first_half else 30, 1)
    end = datetime.time(
        h if first_half else (h + 1) % 24,
        30 if first_half else 0,
        0
    )
    return '{} - {}'.format(start, end)

def half_hourly_buckets(dataframe, time_column):
    dataframe['half_hourly_bucket'] = dataframe[time_column].apply(make_bin)

half_hourly_buckets(df, 'time')
df
#             time   half_hourly_bucket
# 0       02:07:13  02:00:01 - 02:30:00
# 1       21:52:35  21:30:01 - 22:00:00
# 2       07:41:36  07:30:01 - 08:00:00
# 3       20:41:36  20:30:01 - 21:00:00
# ...
# 761     17:00:00  16:30:01 - 17:00:00
# 1460    17:30:00  17:00:01 - 17:30:00
# ...
# 219253  00:00:00  23:30:01 - 00:00:00
# ...

Upvotes: 0

Matthias Fripp

Reputation: 18625

To give the most direct answer to your question: you can move your bin-calculation code into a helper function, then use Pandas' .apply() method to apply it to all the elements of a series. That will run much more quickly than your loop, mainly because the incremental assignments you are currently using are very slow. There's an example of how to do this below, based on your original code.

This code processes 500,000 rows in about 5 seconds on my computer. You can speed it up further by using more efficient methods to calculate the bins; I gave an example in another answer that takes about 2 seconds to run.

df['half_hourly_bucket'] = df['time'].apply(make_bin)

where make_bin() is defined via

def make_bin(x):
    if (x >= datetime.time(0,0,1)) & (x <= datetime.time(0,30,0)):
        bin = "00:00:01 - 00:30:00"
    elif (x >= datetime.time(0,30,1)) & (x <= datetime.time(1,0,0)):
        bin = "00:30:01 - 01:00:00"
    elif (x >= datetime.time(1,0,1)) & (x <= datetime.time(1,30,0)):
        bin = "01:00:01 - 01:30:00"
    elif (x >= datetime.time(1,30,1)) & (x <= datetime.time(2,0,0)):
        bin = "01:30:01 - 02:00:00"
    elif (x >= datetime.time(2,0,1)) & (x <= datetime.time(2,30,0)):
        bin = "02:00:01 - 02:30:00"
    elif (x >= datetime.time(2,30,1)) & (x <= datetime.time(3,0,0)):
        bin = "02:30:01 - 03:00:00"
    elif (x >= datetime.time(3,0,1)) & (x <= datetime.time(3,30,0)):
        bin = "03:00:01 - 03:30:00"
    elif (x >= datetime.time(3,30,1)) & (x <= datetime.time(4,0,0)):
        bin = "03:30:01 - 04:00:00"
    elif (x >= datetime.time(4,0,1)) & (x <= datetime.time(4,30,0)):
        bin = "04:00:01 - 04:30:00"
    elif (x >= datetime.time(4,30,1)) & (x <= datetime.time(5,0,0)):
        bin = "04:30:01 - 05:00:00"
    elif (x >= datetime.time(5,0,1)) & (x <= datetime.time(5,30,0)):
        bin = "05:00:01 - 05:30:00"
    elif (x >= datetime.time(5,30,1)) & (x <= datetime.time(6,0,0)):
        bin = "05:30:01 - 06:00:00"
    elif (x >= datetime.time(6,0,1)) & (x <= datetime.time(6,30,0)):
        bin = "06:00:01 - 06:30:00"
    elif (x >= datetime.time(6,30,1)) & (x <= datetime.time(7,0,0)):
        bin = "06:30:01 - 07:00:00"
    elif (x >= datetime.time(7,0,1)) & (x <= datetime.time(7,30,0)):
        bin = "07:00:01 - 07:30:00"
    elif (x >= datetime.time(7,30,1)) & (x <= datetime.time(8,0,0)):
        bin = "07:30:01 - 08:00:00"
    elif (x >= datetime.time(8,0,1)) & (x <= datetime.time(8,30,0)):
        bin = "08:00:01 - 08:30:00"
    elif (x >= datetime.time(8,30,1)) & (x <= datetime.time(9,0,0)):
        bin = "08:30:01 - 09:00:00"
    elif (x >= datetime.time(9,0,1)) & (x <= datetime.time(9,30,0)):
        bin = "09:00:01 - 09:30:00"
    elif (x >= datetime.time(9,30,1)) & (x <= datetime.time(10,0,0)):
        bin = "09:30:01 - 10:00:00"
    elif (x >= datetime.time(10,0,1)) & (x <= datetime.time(10,30,0)):
        bin = "10:00:01 - 10:30:00"
    elif (x >= datetime.time(10,30,1)) & (x <= datetime.time(11,0,0)):
        bin = "10:30:01 - 11:00:00"
    elif (x >= datetime.time(11,0,1)) & (x <= datetime.time(11,30,0)):
        bin = "11:00:01 - 11:30:00"
    elif (x >= datetime.time(11,30,1)) & (x <= datetime.time(12,0,0)):
        bin = "11:30:01 - 12:00:00"
    elif (x >= datetime.time(12,0,1)) & (x <= datetime.time(12,30,0)):
        bin = "12:00:01 - 12:30:00"
    elif (x >= datetime.time(12,30,1)) & (x <= datetime.time(13,0,0)):
        bin = "12:30:01 - 13:00:00"
    elif (x >= datetime.time(13,0,1)) & (x <= datetime.time(13,30,0)):
        bin = "13:00:01 - 13:30:00"
    elif (x >= datetime.time(13,30,1)) & (x <= datetime.time(14,0,0)):
        bin = "13:30:01 - 14:00:00"
    elif (x >= datetime.time(14,0,1)) & (x <= datetime.time(14,30,0)):
        bin = "14:00:01 - 14:30:00"
    elif (x >= datetime.time(14,30,1)) & (x <= datetime.time(15,0,0)):
        bin = "14:30:01 - 15:00:00"
    elif (x >= datetime.time(15,0,1)) & (x <= datetime.time(15,30,0)):
        bin = "15:00:01 - 15:30:00"
    elif (x >= datetime.time(15,30,1)) & (x <= datetime.time(16,0,0)):
        bin = "15:30:01 - 16:00:00"
    elif (x >= datetime.time(16,0,1)) & (x <= datetime.time(16,30,0)):
        bin = "16:00:01 - 16:30:00"
    elif (x >= datetime.time(16,30,1)) & (x <= datetime.time(17,0,0)):
        bin = "16:30:01 - 17:00:00"
    elif (x >= datetime.time(17,0,1)) & (x <= datetime.time(17,30,0)):
        bin = "17:00:01 - 17:30:00"
    elif (x >= datetime.time(17,30,1)) & (x <= datetime.time(18,0,0)):
        bin = "17:30:01 - 18:00:00"
    elif (x >= datetime.time(18,0,1)) & (x <= datetime.time(18,30,0)):
        bin = "18:00:01 - 18:30:00"
    elif (x >= datetime.time(18,30,1)) & (x <= datetime.time(19,0,0)):
        bin = "18:30:01 - 19:00:00"
    elif (x >= datetime.time(19,0,1)) & (x <= datetime.time(19,30,0)):
        bin = "19:00:01 - 19:30:00"
    elif (x >= datetime.time(19,30,1)) & (x <= datetime.time(20,0,0)):
        bin = "19:30:01 - 20:00:00"
    elif (x >= datetime.time(20,0,1)) & (x <= datetime.time(20,30,0)):
        bin = "20:00:01 - 20:30:00"
    elif (x >= datetime.time(20,30,1)) & (x <= datetime.time(21,0,0)):
        bin = "20:30:01 - 21:00:00"
    elif (x >= datetime.time(21,0,1)) & (x <= datetime.time(21,30,0)):
        bin = "21:00:01 - 21:30:00"
    elif (x >= datetime.time(21,30,1)) & (x <= datetime.time(22,0,0)):
        bin = "21:30:01 - 22:00:00"
    elif (x >= datetime.time(22,0,1)) & (x <= datetime.time(22,30,0)):
        bin = "22:00:01 - 22:30:00"
    elif (x >= datetime.time(22,30,1)) & (x <= datetime.time(23,0,0)):
        bin = "22:30:01 - 23:00:00"
    elif (x >= datetime.time(23,0,1)) & (x <= datetime.time(23,30,0)):
        bin = "23:00:01 - 23:30:00"
    else:
        bin = "23:30:01 - 00:00:00"
    return bin

Upvotes: 0

dawg

Reputation: 104032

First, make a sorted list of all those time buckets as tuples:

>>> times=[('00:00:01', '00:30:00'), ('00:30:01', '01:00:00'), ('01:00:01', '01:30:00'), ('01:30:01', '02:00:00'), ('02:00:01', '02:30:00'), ('02:30:01', '03:00:00'), ('03:00:01', '03:30:00'), ('03:30:01', '04:00:00'), ('04:00:01', '04:30:00'), ('04:30:01', '05:00:00'), ('05:00:01', '05:30:00'), ('05:30:01', '06:00:00'), ('06:00:01', '06:30:00'), ('06:30:01', '07:00:00'), ('07:00:01', '07:30:00'), ('07:30:01', '08:00:00'), ('08:00:01', '08:30:00'), ('08:30:01', '09:00:00'), ('09:00:01', '09:30:00'), ('09:30:01', '10:00:00'), ('10:00:01', '10:30:00'), ('10:30:01', '11:00:00'), ('11:00:01', '11:30:00'), ('11:30:01', '12:00:00'), ('12:00:01', '12:30:00'), ('12:30:01', '13:00:00'), ('13:00:01', '13:30:00'), ('13:30:01', '14:00:00'), ('14:00:01', '14:30:00'), ('14:30:01', '15:00:00'), ('15:00:01', '15:30:00'), ('15:30:01', '16:00:00'), ('16:00:01', '16:30:00'), ('16:30:01', '17:00:00'), ('17:00:01', '17:30:00'), ('17:30:01', '18:00:00'), ('18:00:01', '18:30:00'), ('18:30:01', '19:00:00'), ('19:00:01', '19:30:00'), ('19:30:01', '20:00:00'), ('20:00:01', '20:30:00'), ('20:30:01', '21:00:00'), ('21:00:01', '21:30:00'), ('21:30:01', '22:00:00'), ('22:00:01', '22:30:00'), ('22:30:01', '23:00:00'), ('23:00:01', '23:30:00'), ('23:30:01', '00:00:00')]

Then an index based on the right hand value:

>>> idx=[t[1] for t in times]

Then you can use the bisect module to select the correct bucket:

>>> times[bisect.bisect_left(idx,'00:31:00')]
('00:30:01', '01:00:00')

Given that the 00:00:00 value goes in the last bucket (as you have written the example), you would test for that separately:

>>> bucket = ('23:30:01', '00:00:00') if ts=='00:00:00' else times[bisect.bisect_left(idx, ts)]

The bisect module is very fast and efficient. Perhaps several hundred times faster than other native Python methods to do similar things.

Upvotes: 2

Vaishali

Reputation: 38415

You can use pandas cut to bin the data into half an hour interval

dates = pd.date_range(end = '09/18/2018', periods = 10000, freq='S')
df = pd.DataFrame({'datetime': np.random.choice(dates, 10000)})

bins = pd.date_range(df.datetime.dt.floor('30T').min(), df.datetime.dt.ceil('30T').max(), freq = '30T')
pd.cut(df.datetime, bins = bins)

You get

0       (2018-09-17 22:30:00, 2018-09-17 23:00:00]
1       (2018-09-17 22:00:00, 2018-09-17 22:30:00]
2       (2018-09-17 21:00:00, 2018-09-17 21:30:00]
3       (2018-09-17 21:30:00, 2018-09-17 22:00:00]
4       (2018-09-17 22:00:00, 2018-09-17 22:30:00]
5                (2018-09-17 23:30:00, 2018-09-18]
6       (2018-09-17 22:00:00, 2018-09-17 22:30:00]
7       (2018-09-17 22:00:00, 2018-09-17 22:30:00]

The bins can be easily grouped for aggregations

dates = pd.date_range(end = '09/18/2018', periods = 10000, freq='S')
df = pd.DataFrame({'id' : np.random.randint(10, 1000, 10000),'datetime': np.random.choice(dates, 10000)})

bins = pd.date_range(df.datetime.dt.floor('30T').min(), df.datetime.dt.ceil('30T').max(), freq = '30T')
df.groupby(['id', pd.cut(df.datetime, bins = bins)]).size()

id   datetime                                  
10   (2018-09-17 21:00:00, 2018-09-17 21:30:00]    2
     (2018-09-17 21:30:00, 2018-09-17 22:00:00]    1
     (2018-09-17 22:00:00, 2018-09-17 22:30:00]    1
     (2018-09-17 22:30:00, 2018-09-17 23:00:00]    3
     (2018-09-17 23:00:00, 2018-09-17 23:30:00]    4
11   (2018-09-17 21:30:00, 2018-09-17 22:00:00]    1
     (2018-09-17 22:00:00, 2018-09-17 22:30:00]    1
     (2018-09-17 22:30:00, 2018-09-17 23:00:00]    1
     (2018-09-17 23:30:00, 2018-09-18]             1

Upvotes: 2

Tim

Reputation: 9279

Firstly, you're doing about twice the number of comparisons you need for this approach. If you don't pass the first test, you already know that

x >= datetime.time(0,30,1))

so you don't have to test this a second time on the next elif.

Secondly, because of the regular buckets you're using, you can work out which bucket you need by taking the number of seconds and using the integer part of the result of dividing that by thirty minutes. Assuming x is a time object you could do something like this:

bucket_number = int((datetime.datetime.combine(datetime.date.min, x) -
                     datetime.datetime.combine(datetime.date.min, datetime.time(0))
                    ).total_seconds() / (30 * 60))
bucket_start = datetime.datetime.combine(datetime.date.min, datetime.time(0)) + \
               datetime.timedelta(seconds = bucket_number * 30 * 60)
bucket_end = datetime.datetime.combine(datetime.date.min, datetime.time(0)) + \
             datetime.timedelta(seconds = (bucket_number + 1) * 30 * 60)
dataframe.loc[j,'half_hourly_bucket'] = "{} - {}".format(bucket_start.strftime('%H:%M:%S'),
                                                         bucket_end.strftime('%H:%M:%S'))

This will remove the need for any tests.

Note: A lot of the hard work here is because it's difficult to work with time objects. If you could use datetime objects instead, this would be a whole lot easier.

Upvotes: 3

Landar

Reputation: 276

You don't need cases at all.

 x = dataframe.loc[j,time_column]
 if x > datetime.time(23,30,0):
     dataframe.loc[j,'half_hourly_bucket'] = "23:30:01 - 00:00:00"
 else:
     past = x.minute >= 30
     min_hour = str(x.hour).zfill(2)
     max_hour = str(x.hour+1 if past else x.hour).zfill(2)
     min_time = ':30:01' if past else ':00:01'
     max_time = ':00:00' if past else ':30:00'
     dataframe.loc[j,'half_hourly_bucket'] = min_hour+min_time+' - '+max_hour+max_time

Upvotes: 2

how to convert multiple if else loop to list comprehension in python

Answers (8)

Related Questions