Reputation: 679
I am trying to do in pandas something that I can do outside pandas (code below), but it's poorly readable.
Goal: subsample rows of a list of lists (or DataFrame) at a maximum interval of 10 rows or less, depending if the values from a "state" column change. In addition, this should be done separately for column values 'a' and 'b' of a dtype
column.
Code to reproduce the intended output:
# input (list of list, but could be converted to DataFrame)
# columns: 1:index, 2:state, 3:dtype, 4:value.
x = [
[1, 0, 'b', 93.8],
[2, 0, 'b', 97.4],
[3, 0, 'b', 76.1],
[4, 0, 'b', 21.1],
[5, 0, 'b', 65.7],
[6, 0, 'b', 90.8],
[7, 0, 'b', 63.8],
[8, 0, 'b', 82.9],
[9, 0, 'b', 19.8],
[10, 0, 'b', 10.2],
[11, 0, 'b', 1.3],
[12, 1, 'b', 37.6],
[13, 0, 'b', 18.2],
[14, 0, 'b', 16.9],
[15, 0, 'b', 95.6],
[16, 1, 'b', 23.7],
[17, 0, 'b', 54.1],
[18, 0, 'b', 99.0],
[19, 0, 'b', 16.3],
[20, 0, 'a', 80.7],
[21, 0, 'a', 23.1],
[22, 0, 'a', 96.6],
[23, 0, 'a', 56.7],
[24, 0, 'a', 45.3],
[25, 1, 'a', 58.0],
[26, 0, 'a', 49.9],
[27, 0, 'a', 91.3],
[28, 0, 'b', 60.2],
[29, 0, 'b', 76.8],
[30, 0, 'b', 45.3],
[31, 0, 'b', 69.6],
[32, 0, 'b', 99.0],
[33, 0, 'b', 29.5],
[34, 0, 'b', 11.0],
[35, 0, 'b', 68.9],
[36, 0, 'b', 75.8],
[37, 1, 'b', 89.8],
[38, 0, 'b', 57.7],
[39, 1, 'b', 20.3],
[40, 0, 'b', 98.6],
[41, 0, 'b', 96.7],
[42, 0, 'b', 17.9],
[43, 1, 'b', 14.6],
[44, 0, 'b', 92.5],
[45, 0, 'b', 33.6],
[46, 1, 'b', 58.9],
[47, 1, 'b', 71.9],
[48, 0, 'b', 74.9],
[49, 0, 'b', 43.3],
[50, 1, 'b', 29.5],
[51, 0, 'b', 24.6],
[52, 0, 'b', 2.3],
[53, 0, 'b', 19.1],
[54, 0, 'b', 31.6],
[55, 0, 'b', 80.6],
[56, 0, 'b', 3.2],
[57, 0, 'b', 58.5],
[58, 1, 'b', 30.2],
[59, 1, 'b', 29.1],
[60, 0, 'b', 47.6],
[61, 0, 'b', 76.4],
[62, 0, 'b', 21.6],
[63, 0, 'b', 82.7],
[64, 0, 'b', 0.2],
[65, 0, 'b', 9.4],
[66, 0, 'b', 75.1],
[67, 0, 'b', 33.8],
[68, 0, 'b', 82.0],
[69, 0, 'b', 56.9],
[70, 0, 'b', 62.5],
[71, 0, 'b', 53.5],
[72, 0, 'b', 7.0],
[73, 0, 'a', 37.4],
[74, 0, 'a', 88.8],
[75, 0, 'a', 46.4],
[76, 0, 'a', 86.3],
[77, 0, 'a', 54.3],
[78, 0, 'b', 23.4],
[79, 0, 'b', 1.1],
[80, 0, 'b', 78.5],
[81, 0, 'b', 39.1],
[82, 1, 'b', 79.0],
[83, 0, 'b', 41.0],
[84, 0, 'b', 40.3],
[85, 0, 'a', 66.5],
[86, 0, 'a', 66.8],
[87, 0, 'a', 86.8],
[88, 1, 'b', 96.9],
[89, 0, 'b', 2.1],
[90, 0, 'b', 46.3],
[91, 0, 'b', 28.9],
[92, 0, 'b', 43.2],
[93, 0, 'b', 58.9],
[94, 0, 'b', 60.6],
[95, 0, 'b', 15.4],
[96, 0, 'b', 69.4],
[97, 1, 'b', 18.4],
[98, 0, 'b', 41.3],
[99, 0, 'b', 40.5]
]
]
Code to resample x
for state
'a' and 'b':
def resample(x, log_interval, dtype):
if not x:
return
red = []
prev_state, next_val, last_val = 0, 0, 0
for row in x:
if row[2] == dtype:
if row[0] >= next_val or row[1] != prev_state and row[0] > last_val:
red.append(row)
prev_state = row[1]
next_val = row[0] + log_interval
last_val = row[0]
return red
red_a = resample(x, 10, 'a')
red_b = resample(x, 10, 'b')
And expected outcome for red_a
and red_b
:
red_a = [
[20, 0, a, 80.7],
[25, 1, a, 58.0],
[26, 0, a, 49.9],
[73, 0, a, 37.4],
[85, 0, a, 66.5]
]
red_b = [
[1, 0, b, 93.8],
[11, 0, b, 1.3],
[12, 1, b, 37.6],
[13, 0, b, 18.2],
[16, 1, b, 23.7],
[17, 0, b, 54.1],
[28, 0, b, 60.2],
[37, 1, b, 89.8],
[38, 0, b, 57.7],
[39, 1, b, 20.3],
[40, 0, b, 98.6],
[43, 1, b, 14.6],
[44, 0, b, 92.5],
[46, 1, b, 58.9],
[48, 0, b, 74.9],
[50, 1, b, 29.5],
[51, 0, b, 24.6],
[58, 1, b, 30.2],
[60, 0, b, 47.6],
[70, 0, b, 62.5],
[80, 0, b, 78.5],
[82, 1, b, 79.0],
[83, 0, b, 41.0],
[88, 1, b, 96.9],
[89, 0, b, 2.1],
[97, 1, b, 18.4],
[98, 0, b, 41.3]
]
How can I do this in pandas?
A starting point is:
columns = ['ind', 'state', 'dtype', 'value']
df = pd.DataFrame(x, columns=columns)
But if I try a for loop it is extremely slow (eg for row in df: ...
).
Any idea how to proceed from here?
Upvotes: 2
Views: 1591
Reputation: 29635
So starting with df = pd.DataFrame(x, columns=['ind', 'state', 'dtype', 'value'])
, at first you can create two DFs (df_a
and df_b
) selecting the states such as:
df_a = df[df['dtype'] =='a'].copy()
df_b = df[df['dtype'] =='b'].copy()
Then you create a function select_row
that you will apply
to these DFs:
def select_row( row, log_interval):
# using global varaibles might be a bit dangerous but I didn't find another way
global prev_state, next_val, last_val
# Here your conditions
if (row['ind'] >= next_val) or (row['state'] != prev_state and row['ind'] > last_val):
# change the values of the global variables
prev_state = row['state']
next_val = row['ind'] + log_interval
last_val = row['ind']
return True # return True if your condition is met
else: # return False otherwise
return False
Now you can create a column in df_a
and df_b
with a Boolean value such as:
log_interval = 10
prev_state, next_val, last_val = 0, 0, 0
df_a['bool'] = df_a.apply(select_row, args = ([log_interval ]), axis = 1)
#same for df_b but don't forget to reset your global values
prev_state, next_val, last_val = 0, 0, 0
df_b['bool'] = df_b.apply(select_row, args = ([log_interval ]), axis = 1)
Finally, you can create your two output by selecting the row of df_a
(and df_b
) having True
in column 'bool'
and drop this column:
red_a = df_a[df_a['bool'] == True].drop('bool',axis=1)
red_b = df_b[df_b['bool'] == True].drop('bool',axis=1)
Upvotes: 1