Reputation: 33
In a dataframe df
containing the columns person_id
(int), dates
(datetimes), and is_hosp
(boolean), I need to find out for each date of each person_id if is_hosp is true in the next 15 days following the date of the row.
EDIT:
Following pertinent comments on the original question, I should specify that the dates of observations for each row / person_id are not always consecutive (i.e. a person_id can have an observation on 01/01/2018 followed by the next observation on 08/01/2018).
I have written the code below but it is very slow.
To replicate the issue, here is a fake dataframe of the same format as the dataframe I am working on:
import pandas as pd
import datetime as dt
import numpy as np

# Synthetic data: 10 people, one observation per day over two years
# (2019-01-01 .. 2021-01-01 inclusive), with a random boolean
# hospitalisation flag per row.
count_unique_person = 10
day_range = pd.date_range(dt.datetime(2019, 1, 1), dt.datetime(2021, 1, 1))
day_list = list(day_range.to_pydatetime())
n_days = len(day_list)
df = pd.DataFrame({
    'person_id': [pid for pid in range(1, count_unique_person + 1)
                  for _ in range(n_days)],
    'dates': day_list * count_unique_person,
    'is_hosp': np.random.choice([True, False], size=n_days * count_unique_person),
})
Here is how I have implemented the check (column is_inh_15d
), but it takes too long to run (my original dataframe contains a million rows):
# Brute force: for every row, rescan that person's rows and test whether
# any is_hosp is True within [date, date + 15 days] (a 16-day inclusive
# window).  O(rows^2) per person — correct but slow.
flags = []
flag_index = []
for pid in np.unique(df['person_id'].values):
    person_rows = df[df['person_id'] == pid].copy()
    for row_idx, row in person_rows.iterrows():
        window_start = row['dates']
        window_end = window_start + dt.timedelta(days=15)
        in_window = person_rows[
            (person_rows['dates'] >= window_start)
            & (person_rows['dates'] <= window_end)
        ]
        flags.append(np.any(in_window['is_hosp'] == True))
        flag_index.append(row_idx)
h_15d_df = pd.DataFrame()
h_15d_df['is_inh_15d'] = flags
h_15d_df['is_inh_idx'] = flag_index
h_15d_df.set_index('is_inh_idx', inplace=True)
df = df.join(h_15d_df)
I don't see how to vectorize the logic of checking each of the next 15 days for each row to see if "is_hosp" is True.
Could someone please advise?
Upvotes: 2
Views: 370
Reputation: 474
I modified the method by sorting by person and date and then using the number of days' gap between appointments per person to generate the h_15d_df column. It's kind of hard to verify because of the randomly generated dataset, but I reduced the window from 15 days down to 1 day to check, and it seems to work.
start = time.time()
ori_func()
end = time.time()
print(end - start)
16.748218297958374
start = time.time()
new_func()
end = time.time()
print(end - start)
0.15230989456176758
It's a little bit convoluted, but I measured roughly a 100x speed improvement:
import pandas as pd
import datetime as dt
import numpy as np
import time
def create_df():
    """Build the synthetic test frame: 10 people x 732 daily rows
    (2019-01-01 .. 2021-01-01) with a random boolean is_hosp flag."""
    n_people = 10
    days = list(
        pd.date_range(dt.datetime(2019, 1, 1), dt.datetime(2021, 1, 1))
        .to_pydatetime()
    )
    frame = pd.DataFrame({
        'person_id': [p for p in range(1, n_people + 1) for _ in range(len(days))],
        'dates': days * n_people,
        'is_hosp': np.random.choice([True, False], size=len(days) * n_people),
    })
    return frame
def ori_func():
    """Original (slow) implementation: for every row, rescan that
    person's rows for any is_hosp == True within [date, date + 15 days],
    and join the per-row result back as column 'is_inh_15d'."""
    df = create_df()
    flags, flag_idx = [], []
    for pid in np.unique(df['person_id'].values):
        person_df = df[df['person_id'] == pid].copy()
        for idx, row in person_df.iterrows():
            lo = row['dates']
            hi = lo + dt.timedelta(days=15)
            window = person_df[(person_df['dates'] >= lo) & (person_df['dates'] <= hi)]
            flags.append(np.any(window['is_hosp'] == True))
            flag_idx.append(idx)
    result = pd.DataFrame()
    result['is_inh_15d'] = flags
    result['is_inh_idx'] = flag_idx
    result.set_index('is_inh_idx', inplace=True)
    return df.join(result)
def ExpandGaps(gaps, d, length):
    """Expand per-visit day gaps into a row-level boolean list.

    For each gap of x days between consecutive hospitalisation dates:
    a gap of at most d days yields x True rows; a longer gap yields
    (x - d) False rows followed by d True rows (only the last d rows
    fall within d days of the next True date).  The tail is padded
    with False up to `length`.
    """
    out = []
    for gap in gaps:
        if gap <= d:
            out.extend([True] * gap)
        else:
            out.extend([False] * (gap - d))
            out.extend([True] * d)
    # Rows after the last hospitalisation date can never see a future True.
    out.extend([False] * (length - len(out)))
    return out
def new_func():
    """Faster variant: per person, compute the day gaps between
    consecutive is_hosp == True dates, then expand those gaps into a
    row-level boolean column via ExpandGaps.
    NOTE(review): the final join relies on h_15d_df's fresh RangeIndex
    lining up with df's index, which holds only because create_df()
    already emits rows in person/date order — confirm before reusing
    on arbitrarily-indexed data."""
    df = create_df()
    is_inh = []
    is_inh_idx = []
    # Sort by ID and dates so each person's rows are consecutive and
    # chronological.
    df = df.sort_values(by=["person_id","dates"])
    for each_p in np.unique(df['person_id'].values):
        t_df = df[df['person_id'] == each_p].copy()
        # Collect the dates on which this person was hospitalised.
        if t_df.iloc[0]['is_hosp'] == False:
            true_dates = t_df['dates'][(t_df['is_hosp'] == True)]
            first_row = pd.Series([t_df['dates'][t_df.index[0]]])
            # Prepend the first observation date so the first gap is
            # anchored at the start of the person's history.
            true_dates = pd.concat([first_row,true_dates])
        else:
            true_dates = t_df['dates'][t_df['is_hosp'] == True]
        # Day gaps between consecutive hospitalisation dates.
        gaps = np.array([(t - s).days for s, t in zip(true_dates, true_dates[1:])])
        expand_gaps = ExpandGaps(gaps,15,len(t_df))
        is_inh += expand_gaps
    h_15d_df = pd.DataFrame()
    h_15d_df['is_inh_15d'] = is_inh
    # Positional join: see NOTE in the docstring about index alignment.
    df = df.join(h_15d_df)
    return(df)
# Time the two implementations back to back and print the elapsed
# seconds for each (original first, then the gap-based rewrite).
for fn in (ori_func, new_func):
    t0 = time.time()
    fn()
    print(time.time() - t0)
Upvotes: 0
Reputation: 246
.rolling()
accepts "16D"
for 16 days on time series data or datetime-like index.
(16 days means the day of a row and its next 15 days)
# Reverse-sort so that, within each person, a '16D' rolling window over
# the descending dates covers the row's own date plus the NEXT 15 days
# (pandas rolling windows look backwards; reversing makes them look
# forwards).  max() == 1 flags any True in that window.
# NOTE(review): this relies on pandas accepting a monotonically
# decreasing 'on' column for time-based rolling — confirm against the
# installed pandas version.
is_inh_15d = (
    df.sort_values('dates', ascending=False)
    .groupby('person_id')
    .rolling('16D', on='dates')
    .is_hosp.max() == 1
)
# Align the result back onto the original frame by (person_id, dates).
df2 = df.join(is_inh_15d, on=['person_id', 'dates'], rsuffix='_15d')
Upvotes: 2
Reputation: 3168
I think I found something. I modified the data generation at the beginning in order to have many more False values: sample_arr = [True, False]
is changed to sample_arr = [True] + [False] * 10
. With the original line, it is really hard to get a 15-day stretch containing no True.
The solution I found is:
# Build 15 shifted 15-day resamples: offset i shifts the bin edges by i
# days, so across i = 0..14 every observation date becomes the left edge
# of exactly one bin; concatenating gives one aggregated row per date.
to_concat = []
for i in range(15):
    to_concat.append(
        df.groupby('person_id')
        # Each '15D' bin covers a date and the days following it; the
        # offset moves the bin edges forward by i days.
        # NOTE(review): a 15-day bin spans the row's date plus the next
        # 14 days, not 15 — a plausible source of the ~2% mismatch
        # reported against the brute-force result; confirm.
        .resample('15D', offset=pd.to_timedelta(i, 'D'), on='dates')
        # max() reports whether any value in the bin is True (True == 1).
        .max()
        # The offset creates bins outside the original date range; keep
        # only dates inside [min, max] of the source frame.
        .drop(['dates', 'person_id'], axis='columns')[
            lambda x: (x.index.get_level_values('dates') >= df['dates'].min())
            & (x.index.get_level_values('dates') <= df['dates'].max())
        ]
    )
# Reset the index so person_id / dates come back as columns, matching
# the shape of the original dataframe.
df = pd.concat(to_concat).sort_index().reset_index()
I tried to compare both methods by running pd.testing.assert_frame_equal
on the two result dataframes, and I get a 2% difference in the values of the is_inh_15d
column. I don't know exactly where it comes from.
The solution seems to be around 8 times faster from a time
command on Linux:
python ./op_solution.py 6,90s user 0,79s system 113% cpu 6,797 total
python ./other_solution.py 0,86s user 0,76s system 231% cpu 0,699 total
Upvotes: 1