Reputation: 285
say I have:
df ={'animal' : [1, 1, 1, 1, 1, 1, 1, 2, 2],
'x':[76.551, 77.529, 78.336,77, 78.02, 79.23, 77.733, 79.249, 76.077],
'y': [151.933, 152.945, 153.970, 119.369, 120.615, 118.935, 119.115, 152.004, 153.027],
'time': [0, 1, 2, 0, 3,2,5, 0, 1]}
df = pd.DataFrame(df)
# get distance travelled between points
def get_diff(df):
dx = (df['x'] - df.groupby('animal')['x'].shift(1))
dy = (df['y'] - df.groupby('animal')['y'].shift(1))
df['distance'] = (dx**2 + dy**2)**0.5
return df
# get the start/end coordinates
def get_startend(df):
for i in range(len(df)):
df.loc[df['distance'] > 5, 'start'] = 'start' #if distance >5, assign 'start'
df.loc[df['distance'].isnull(), 'start'] = 'start' #if distance = NaN, assign 'start'
cond = df['start'].shift(-1).str.contains('start').fillna(False) #for every idx before row 'start', assign 'end'
df.loc[cond, 'start'] = 'end'
df['start'].iloc[-1] = 'end' #assign 'end' for last idx
return df
df = get_diff(df)
df = get_startend(df)
after some preprocessing, I end up with:
animal x y time distance start
0 1 76.551 151.933 0 NaN start
1 1 77.529 152.945 1 1.407348 NaN
2 1 78.336 153.970 2 1.304559 end
3 1 77.000 119.369 0 34.626783 start
4 1 76.020 120.615 3 1.585218 NaN
5 1 79.230 118.935 2 3.623051 NaN
6 1 77.733 119.115 5 1.507783 end
7 2 79.249 152.004 0 NaN start
8 2 76.077 153.027 1 3.332884 end
I want to artificially recenter the start coordinates at (0,0). So if the start column has 'start', then subtract the x,y values from all the following rows until reach the next start index, then subtract the new x,y values, etc.
output should look something like:
animal x y time distance start newX newY
0 1 76.551 151.933 0 NaN start 0 0 #(76.551-76.551, 151.993-151.933)
1 1 77.529 152.945 1 1.407348 NaN 0.978 1.012 #(77.529-76.551, 152.945-151.933)
2 1 78.336 153.970 2 1.304559 end 1.785 2.012 #(78.336-76.551, 153.970-151.933)
3 1 77.000 119.369 0 34.626783 start 0 0 #(77-77, 119.369-119.369)
4 1 76.020 120.615 3 1.610253 NaN -0.98 1.246 #(76.020-77, 120.615-119.363)
5 1 79.230 118.935 2 3.623051 NaN 2.23 -0.434 #(..., ...)
6 1 77.733 119.115 5 1.507783 end 0.733 -0.254
7 2 79.249 152.004 0 NaN start 0 0 #(79.249-79.249, 152.004-152.004)
8 2 76.077 153.027 1 3.332884 end -3.172 1.023 #(76.077-79.249,153.027-152.004)
Upvotes: 1
Views: 258
Reputation: 120409
You can use diff
to compute the difference between previous rows.
df[['new_x', 'new_y']] = \
df.groupby(df['start'].notna().cumsum())[['x', 'y']].diff().fillna(0)
print(df)
# Output
animal x y time distance start new_x new_y
0 1 76.551 151.933 0 NaN start 0.000 0.000
1 1 77.529 152.945 1 1.407348 NaN 0.978 1.012
2 1 78.336 153.970 2 1.304559 NaN 0.807 1.025
3 1 77.000 119.369 0 34.626783 start 0.000 0.000
4 1 78.020 120.615 3 1.610253 NaN 1.020 1.246
5 1 79.230 118.935 2 2.070386 NaN 1.210 -1.680
6 1 77.733 119.115 5 1.507783 NaN -1.497 0.180
7 2 79.249 152.004 0 NaN start 0.000 0.000
8 2 76.077 153.027 1 3.332884 NaN -3.172 1.023
Upvotes: 1
Reputation:
You can create a boolean mask based on start
, and then use cumsum
to turn that into a perfect grouper. Group by it, and then get the first value of x
and y
for each group. Subtract x
and y
from those firsts and you have your new columns:
df[['newX', 'newY']] = df[['x', 'y']] - df.groupby(df['start'].eq('start').cumsum())[['x', 'y']].transform('first')
Output:
animal x y time distance start newX newY
0 1 76.551 151.933 0 NaN start 0.000 0.000
1 1 77.529 152.945 1 1.407348 NaN 0.978 1.012
2 1 78.336 153.970 2 1.304559 NaN 1.785 2.037
3 1 77.000 119.369 0 34.626783 start 0.000 0.000
4 1 76.020 120.615 3 1.585218 NaN -0.980 1.246
5 1 79.230 118.935 2 3.623051 NaN 2.230 -0.434
6 1 77.733 119.115 5 1.507783 NaN 0.733 -0.254
7 2 79.249 152.004 0 NaN start 0.000 0.000
8 2 76.077 153.027 1 3.332884 NaN -3.172 1.023
Upvotes: 1