Reputation: 11
I have a huge dataset in a dataframe with 1 column having location name, 1 column having a depth value and then a set of other columns with values measured at those locations for those depths. The various locations have varying depth sampling rate.
What I need to do is to resample all the locations measures to a similar sampling rate. The depth sample values themselves can be misaligned (ie 2 locations having 5m sampling rate but with samples [0,5,10,...] and [1,6,11,...] is fine). While resampling the depth samples without an equivalent in input data must be interpolated linearly from the input sample just above and below.
I have a working slow solution resampling locations 1 by 1 and I am looking for a better solution using groupby. Below is code:
import numpy as np
import pandas as pd
from pandas.testing import assert_frame_equal
resample_interval = 5
data = [
(2.17, 100, 1, "A"),
(3.04, 10, 1, "A"),
(3.91, 3, 3, "A"),
(4.78, 5, 3, "A"),
(5.65, 6, 5, "A"),
(6.52, 3, 5, "A"),
(7.39, 7, 6, "A"),
(8.26, 9, 7, "A"),
(9.13, 5, 8, "A"),
(10, 3, 1, "A"),
(10.87, 4, 3, "A"),
(11.74, 7, 4, "A"),
######################
(2.17, 100, 2, "B"),
(3.04, 10, 5, "B"),
(3.91, 3, 6, "B"),
(4.78, 5, 3, "B"),
(5.65, 6, 2, "B"),
(6.52, 3, 4, "B"),
(7.39, 7, 2, "B"),
(8.26, 9, 1, "B"),
(9.13, 5, 2, "B"),
(10, 3, 1, "B"),
(10.87, 4, 1, "B"),
(11.74, 7, 5, "B"),
######################
(100.0, 12, 1, "C"),
(100.9, 12, 9, "C"),
(101.8, 12, 6, "C"),
(102.7, 12, 6, "C"),
(103.6, 12, 7, "C"),
(104.5, 14, 7, "C"),
(105.4, 16, 7, "C"),
(106.3, 12, 8, "C"),
(107.2, 12, 8, "C"),
(108.1, 12, 8, "C"),
(109.0, 12, 3, "C"),
(109.9, 12, 3, "C"),
(110.8, 12, 4, "C"),
(111.7, 12, 1, "C"),
]
df_input = pd.DataFrame(data, columns=["Depth", "Value1", "Value2", "locations"])
df_input = df_input.set_index("Depth", drop=True).fillna("NAN VALUE")
def resample_location(
location_values: pd.DataFrame, resample_interval: int
) -> pd.DataFrame:
indx = np.arange(
location_values.index.min(), location_values.index.max(), resample_interval
)
location_values_ind = location_values.reindex(
pd.DataFrame(index=indx).index.union(location_values.index)
).interpolate(method="index", limit_area="inside")
location_values_interpolated = pd.merge(
left=pd.DataFrame(index=indx),
right=location_values_ind,
left_index=True,
right_index=True,
how="left",
)
location_values_interpolated[["locations"]] = location_values_interpolated[
["locations"]
].fillna(location_values["locations"].iloc[0])
return location_values_interpolated
def resample_locations_OLD(
locations_values: pd.DataFrame, resample_interval: int
) -> pd.DataFrame:
locations_resampled_dfs = []
for location in locations_values["locations"].unique():
one_location_values = locations_values[
locations_values["locations"] == location
].copy()
df_out = resample_location(one_location_values, resample_interval)
locations_resampled_dfs.append(df_out)
df_output_OLD = pd.concat(locations_resampled_dfs, axis=0)
df_output_OLD.index.name = "DEPTH"
df_output_OLD = df_output_OLD.sort_values(by=["locations", "DEPTH"])
return df_output_OLD
df_output_OLD = resample_locations_OLD(df_input, resample_interval)
def resample_locations_NEW(
locations_values: pd.DataFrame, resample_interval: int
) -> pd.DataFrame:
# convert depth to timedelta milliseconds to be able to use pandas resampling
locations_values["DEPTH_time"] = locations_values.index
locations_values["DEPTH_time"] = locations_values["DEPTH_time"].apply(
pd.to_timedelta, unit="ms"
)
locations_values = locations_values.set_index(["DEPTH_time"], drop=True)
locations_values_resampled = (
locations_values.groupby("locations")
.resample(f"{resample_interval}ms")
.ffill()
.interpolate("linear", limit_area="inside")
)
# Transform back into initial columns
# can be dropped instead reset_index below will re-add locations and DEPTH_time as columns
locations_values_resampled = locations_values_resampled.drop(
columns=["locations"]
).reset_index()
# DEPTH_time back to original DEPTH column
locations_values_resampled["DEPTH"] = (
locations_values_resampled["DEPTH_time"].dt.microseconds / 1000
)
# Set DEPTH as index
locations_values_resampled = locations_values_resampled.set_index(
"DEPTH", drop=True
)
# Clean output dataframe to match old function output
locations_values_resampled = locations_values_resampled[
["Value1", "Value2", "locations"]
]
locations_values_resampled = locations_values_resampled.sort_values(
by=["locations", "DEPTH"]
)
return locations_values_resampled
df_output_NEW = resample_locations_NEW(df_input.copy(), resample_interval)
For reference, output of both functions:
OLD FUNCTION OUTPUT - DESIRED OUTPUT :
Value1 Value2 locations
DEPTH
2.17 100.000000 1.000000 A
7.17 5.988506 5.747126 A
2.17 100.000000 2.000000 B
7.17 5.988506 2.505747 B
100.00 12.000000 1.000000 C
105.00 15.111111 7.000000 C
110.00 12.000000 3.111111 C
NEW FUNCTION OUTPUT :
Value1 Value2 locations
DEPTH
2.17 100 1 A
7.17 3 5 A
2.17 100 2 B
7.17 3 4 B
100.00 12 1 C
105.00 14 7 C
110.00 12 3 C
Upvotes: 0
Views: 47