Reputation: 1166
I just found out about the Polars lib and I wanted to convert some old functions to get familiar.
However, I stumbled upon an issue with my code: the "Mean_Angle" column is not calculated, so the function aborts during the group_by operation because the column is missing. I also have no idea whether the last part even works as intended.
This is the pandas code I want to convert:
def calc_mean_and_error(df: pd.DataFrame, columns=None, groupby="Magn_Pos") -> pd.DataFrame:
    """Average the selected columns per group and attach error estimates.

    Args:
        df: Input measurements. The frame is copied, never modified.
        columns: Names of the columns to keep, including the grouping
            column. Defaults to ``Left_Angle``, ``Right_Angle``,
            ``Magn_Pos`` and ``Magn_Field``. The list passed in is not
            modified.
        groupby: Column (or columns) to group by. Groups keep their order
            of first appearance (``sort=False``).

    Returns:
        One row per group holding the mean of every selected column plus,
        for each selected angle column, ``*_SDEV`` (standard deviation),
        ``*_SEM_68`` (standard error of the mean) and ``*_SEM_95`` /
        ``*_SEM_99`` (standard error scaled by a Student-t quantile).
        ``Mean_Angle`` is derived and included when both ``Left_Angle``
        and ``Right_Angle`` are selected.
    """
    data = df.copy()
    if columns is None:
        columns = ['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field']
    else:
        # Work on a private copy: appending 'Mean_Angle' below must not leak
        # into the caller's list, otherwise a second call with the same list
        # would select the column twice.
        columns = list(columns)

    if 'Left_Angle' in columns and 'Right_Angle' in columns:
        data['Mean_Angle'] = (data['Left_Angle'] + data['Right_Angle']) / 2
        if 'Mean_Angle' not in columns:
            columns.append('Mean_Angle')

    # Only the angle columns that were actually selected get error estimates;
    # asking for a column that is not in the selection would raise KeyError.
    angle_cols = [
        name for name in ('Left_Angle', 'Right_Angle', 'Mean_Angle')
        if name in columns
    ]

    grouped_df = data[columns].groupby(groupby, sort=False)
    num_points_per_group = grouped_df.size().to_numpy()
    mean_df = grouped_df.mean()

    if angle_cols:
        # standard deviation
        mean_df[[f'{name}_SDEV' for name in angle_cols]] = grouped_df[angle_cols].std()

        # standard error, 1 sigma confidence interval
        sem_df = grouped_df[angle_cols].sem()
        mean_df[[f'{name}_SEM_68' for name in angle_cols]] = sem_df

        # standard error, 2 sigma confidence interval - t distribution
        # factor according to https://en.wikipedia.org/wiki/Student%27s_t-distribution
        # NOTE(review): the group size itself is passed as the degrees of
        # freedom; n - 1 is the usual choice. Kept as-is to preserve the
        # existing results - confirm.
        t_fac_95_conf_int = stats.t.ppf(0.95, num_points_per_group)
        mean_df[[f'{name}_SEM_95' for name in angle_cols]] = sem_df.multiply(t_fac_95_conf_int, axis=0)

        # standard error, 3 sigma confidence interval - t distribution
        t_fac_99_conf_int = stats.t.ppf(0.997, num_points_per_group)
        mean_df[[f'{name}_SEM_99' for name in angle_cols]] = sem_df.multiply(t_fac_99_conf_int, axis=0)

    return mean_df.reset_index()
This is what I have so far:
# First Polars port of the pandas function, shown exactly as asked in the
# question. It fails with ColumnNotFoundError: Mean_Angle; the review notes
# below mark where the behavior differs from the pandas version.
def calc_mean_and_error(df: pl.DataFrame, columns=None, group_by="Magn_Pos") -> pl.DataFrame:
if columns is None:
columns = ['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field']
if 'Left_Angle' in columns and 'Right_Angle' in columns:
# this doesn't work?
# NOTE(review): with_columns() returns a new DataFrame and does not modify
# df in place; the result is not assigned here, so df never gains a
# "Mean_Angle" column.
df.with_columns(
pl.struct('Left_Angle', 'Right_Angle').map_elements(lambda x: (x['Left_Angle'] + x['Right_Angle']) / 2).alias("Mean_Angle")
)
# NOTE(review): this appends to the list object passed in by the caller.
columns.append('Mean_Angle')
# NOTE(review): columns now names 'Mean_Angle', which df does not contain.
grouped_df = df.select(columns).group_by(group_by)
# NOTE(review): [0] keeps only the first value of the 'count' column, whereas
# the pandas version uses the size of every group.
num_points_per_group = grouped_df.count()['count'][0]
# NOTE(review): this mean_df is overwritten by the agg() result further down.
mean_df = grouped_df.mean()
t_fac_95_conf_int = stats.t.ppf(0.95, num_points_per_group) # factor according to https://en.wikipedia.org/wiki/Student%27s_t-distribution
t_fac_99_conf_int = stats.t.ppf(0.997, num_points_per_group)
# standard deviation
# Second group_by over the same selection: mean, standard deviation and the
# three scaled standard errors are requested for every selected column.
mean_df = df.select(columns).group_by(group_by).agg(
pl.all().mean(),
pl.all().std().name.suffix('_SDEV'),
pl.all().std().map_elements(lambda x: x / np.sqrt(num_points_per_group)).name.suffix('_SEM_68'), # standard error
pl.all().std().map_elements(lambda x: x*t_fac_95_conf_int / np.sqrt(num_points_per_group)).name.suffix('_SEM_95'),
pl.all().std().map_elements(lambda x: x*t_fac_99_conf_int / np.sqrt(num_points_per_group)).name.suffix('_SEM_99'),
)
return mean_df
Example:
import polars as pl
from scipy import stats
# Tab-separated sample measurements: a header row followed by three
# repetitions (0-2) at each of five Magn_Pos values (20, 10, 5, 2.5, 0).
# The text is encoded to bytes before being handed to pl.read_csv.
data_raw = """Time\tRepetition\tLeft_Angle\tRight_Angle\tMagn_Pos\tMagn_Field
0.0\t0\t111.62539060014953\t111.65929559305457\t20.0\t0.05012
289.75\t1\t113.43406129503042\t113.29101205027376\t20.0\t0.05012
343.420999999973\t2\t113.21669960326668\t113.30918399000467\t20.0\t0.05012
397.68700000003446\t0\t114.50650196149256\t114.78488582815113\t10.0\t0.1317
456.10900000005495\t1\t114.7078936381882\t114.70239460290726\t10.0\t0.1317
507.8279999999795\t2\t115.71894177915732\t115.70104461571628\t10.0\t0.1317
565.3429999999935\t0\t121.71521327349599\t121.55379420624988\t5.0\t0.2276
612.045999999973\t1\t122.53171995914443\t122.4555143281342\t5.0\t0.2276
668.3120000000345\t2\t121.65748098845367\t121.60313424823333\t5.0\t0.2276
714.484000000055\t0\t130.88884567117995\t130.82365731381574\t2.5\t0.3011
774.9679999999935\t1\t132.72366563179372\t132.59019277520363\t2.5\t0.3011
817.765000000014\t2\t133.5549497954158\t133.4637401535662\t2.5\t0.3011
891.7029999999795\t0\t139.9155468732065\t139.78384156146674\t0.0\t0.3907
940.655999999959\t1\t143.34707217674438\t143.2278696177915\t0.0\t0.3907
984.125\t2\t144.30042471080577\t144.16800277145435\t0.0\t0.3907""".encode("utf8")
# Parse the in-memory bytes as a tab-separated table.
df = pl.read_csv(data_raw, separator='\t')
# Aggregate per group (default group_by="Magn_Pos"); df is rebound to the result.
df = calc_mean_and_error(df, columns=['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field'])
print(df)
Error:
# ColumnNotFoundError: Mean_Angle
I'm not really sure about the last part though! I am not entirely familiar with the syntax of the expressions. And I am not sure how to prevent calling group_by twice. Can someone lead me in the right direction? Thanks!
Upvotes: 0
Views: 597
Reputation: 21229
The `Mean_Angle` not found error occurs because `.with_columns()` returns a new DataFrame - it does not modify the existing one in place.
You need to assign the result.
df = df.with_columns(...)
After fixing that, you'll want to replace every `.map_elements()` usage with native Polars expressions.
# Plain expression arithmetic on the two columns replaces the per-row
# map_elements lambda; the result is assigned back to df because
# with_columns returns a new frame.
df = df.with_columns(
((pl.col("Left_Angle") + pl.col("Right_Angle")) / 2).alias("Mean_Angle")
)
This produces the same output as your pandas function.
def calc_mean_and_error(df: pl.DataFrame, columns=None, group_by="Magn_Pos") -> pl.DataFrame:
    """Average the angle columns per group and attach error estimates.

    Args:
        df: Input measurements. Must contain ``Left_Angle``, ``Right_Angle``,
            ``Magn_Field`` and the ``group_by`` column.
        columns: Column names selected by the caller. Only consulted to
            decide whether ``Mean_Angle`` is derived (both ``Left_Angle``
            and ``Right_Angle`` must be listed). The list is not modified.
        group_by: Column to group by.

    Returns:
        One row per group, in order of first appearance, with the mean of
        each angle column and of ``Magn_Field`` plus, per angle column,
        ``*_SDEV``, ``*_SEM_68``, ``*_SEM_95`` and ``*_SEM_99``.
    """
    if columns is None:
        columns = ["Left_Angle", "Right_Angle", "Magn_Pos", "Magn_Field"]

    aggs = ["Left", "Right"]
    if "Left_Angle" in columns and "Right_Angle" in columns:
        df = df.with_columns(Mean_Angle=(pl.col("Left_Angle") + pl.col("Right_Angle")) / 2)
        # `columns` is deliberately left untouched: nothing below reads it,
        # and appending would mutate the list object owned by the caller.
        aggs.append("Mean")
    aggs = [f"{agg}_Angle" for agg in aggs]

    mean_df = (
        df
        # Attach each group's row count to every row as column "len".
        .with_columns(pl.len().over(group_by))
        .with_columns(
            # map_batches hands the whole "len" column to stats.t.ppf at once.
            t_fac_95_conf_int=pl.col("len").map_batches(lambda col: stats.t.ppf(0.95, col)),
            t_fac_99_conf_int=pl.col("len").map_batches(lambda col: stats.t.ppf(0.997, col)),
            sqrt=pl.col("len").sqrt(),
        )
        # maintain_order=True keeps groups in order of first appearance,
        # matching pandas' groupby(..., sort=False).
        .group_by(group_by, maintain_order=True)
        .agg(
            pl.col(aggs + ["Magn_Field"]).mean(),
            pl.col(aggs).std().name.suffix("_SDEV"),
            # The helper columns are constant within a group, so the division
            # yields identical values per row; .first() collapses them.
            (pl.col(aggs).std() / pl.col("sqrt")).first().name.suffix("_SEM_68"),
            (pl.col(aggs).std() * pl.col("t_fac_95_conf_int") / pl.col("sqrt")).first().name.suffix("_SEM_95"),
            (pl.col(aggs).std() * pl.col("t_fac_99_conf_int") / pl.col("sqrt")).first().name.suffix("_SEM_99"),
        )
    )
    return mean_df
- `pl.len().over()` is used instead of the first group_by, to attach each group's size to its rows.
- `.map_batches()` is used to give the whole column to `stats.t.ppf()` at once.
- I've used `aggs` here along with your initial `columns` list, but that could be cleaned up so that a single list is used instead.
Upvotes: 1