Raphael
Raphael

Reputation: 1166

How to convert multi column expressions from Pandas to Polars

I just found out about the Polars lib and I wanted to convert some old functions to get familiar.

However, I stumbled upon an issue with my code. The "Mean_Angle" column is not calculated, and I have no idea if the last part even works as intended; the code aborts during the group_by operation because the column is missing.

This is the pandas code I want to convert:

def calc_mean_and_error(df: pd.DataFrame, columns=None, groupby="Magn_Pos") -> pd.DataFrame:
    """Group *df* and compute per-group means plus spread/error estimates.

    For each group of ``groupby`` this computes the mean of every selected
    column, the standard deviation, the 1-sigma standard error, and
    t-distribution-scaled standard errors at the 95% and 99.7% levels.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; never modified.
    columns : list[str] | None
        Columns to aggregate.  Defaults to the four angle/magnet columns.
        The caller's list is copied, never mutated.
    groupby : str
        Column to group on; restored as a regular column via ``reset_index``.

    Returns
    -------
    pd.DataFrame
        One row per group, in first-appearance order (``sort=False``).
    """
    data = df.copy()
    if columns is None:
        columns = ['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field']
    else:
        # BUG FIX: the original appended 'Mean_Angle' to the caller's list,
        # mutating the argument as a side effect; work on a copy instead.
        columns = list(columns)

    if 'Left_Angle' in columns and 'Right_Angle' in columns:
        data['Mean_Angle'] = (data['Left_Angle'] + data['Right_Angle']) / 2
        columns.append('Mean_Angle')
    grouped_df = data[columns].groupby(groupby, sort=False)

    num_points_per_group = grouped_df.size().values
    mean_df = grouped_df.mean()

    # NOTE(review): like the original, the statistics below assume both
    # angle columns were requested; other `columns` values raise KeyError.
    angle_cols = ['Left_Angle', 'Right_Angle', 'Mean_Angle']
    sem_cols = [c + '_SEM_68' for c in angle_cols]

    # standard deviation
    mean_df[[c + '_SDEV' for c in angle_cols]] = grouped_df[angle_cols].std()

    # standard error, 1 sigma confidence interval (std / sqrt(n))
    mean_df[sem_cols] = grouped_df[angle_cols].sem()

    # standard error, 2 sigma confidence interval - t distribution
    # factor according to https://en.wikipedia.org/wiki/Student%27s_t-distribution
    t_fac_95_conf_int = stats.t.ppf(0.95, num_points_per_group)
    mean_df[[c + '_SEM_95' for c in angle_cols]] = mean_df[sem_cols].multiply(t_fac_95_conf_int, axis=0)

    # standard error, 3 sigma confidence interval - t distribution
    t_fac_99_conf_int = stats.t.ppf(0.997, num_points_per_group)
    mean_df[[c + '_SEM_99' for c in angle_cols]] = mean_df[sem_cols].multiply(t_fac_99_conf_int, axis=0)

    mean_df = mean_df.reset_index()

    return mean_df

This is what I have so far:

def calc_mean_and_error(df: pl.DataFrame, columns=None, group_by="Magn_Pos") -> pl.DataFrame:
    """Attempted Polars port of the pandas function above.

    NOTE(review): as written this raises ``ColumnNotFoundError: Mean_Angle``
    because the ``.with_columns()`` result below is discarded — Polars
    DataFrames are never modified in place.
    """
    if columns is None:
        columns = ['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field']

    if 'Left_Angle' in columns and 'Right_Angle' in columns:
        # BUG: the returned frame is never assigned (needs
        # `df = df.with_columns(...)`), so 'Mean_Angle' never exists.
        df.with_columns(
            pl.struct('Left_Angle', 'Right_Angle').map_elements(lambda x: (x['Left_Angle'] + x['Right_Angle']) / 2).alias("Mean_Angle")
        )
        columns.append('Mean_Angle')
    grouped_df = df.select(columns).group_by(group_by)

    # NOTE(review): this grabs only the FIRST group's size — unlike the
    # pandas version, which kept the per-group sizes as an array; all
    # t-factors below therefore use a single group's n.
    num_points_per_group = grouped_df.count()['count'][0]
    mean_df = grouped_df.mean()

    # factor according to https://en.wikipedia.org/wiki/Student%27s_t-distribution
    t_fac_95_conf_int = stats.t.ppf(0.95, num_points_per_group)
    t_fac_99_conf_int = stats.t.ppf(0.997, num_points_per_group)
    # standard deviation
    # NOTE(review): group_by runs a second time here and mean_df above is
    # overwritten; the per-element map_elements lambdas could be plain
    # expressions instead (see the answer below).
    mean_df = df.select(columns).group_by(group_by).agg(
        pl.all().mean(),
        pl.all().std().name.suffix('_SDEV'),
        pl.all().std().map_elements(lambda x: x / np.sqrt(num_points_per_group)).name.suffix('_SEM_68'), # standard error
        pl.all().std().map_elements(lambda x: x*t_fac_95_conf_int / np.sqrt(num_points_per_group)).name.suffix('_SEM_95'),
        pl.all().std().map_elements(lambda x: x*t_fac_99_conf_int / np.sqrt(num_points_per_group)).name.suffix('_SEM_99'),
    )

    return mean_df

Example:

import polars as pl
from scipy import stats

# Tab-separated measurement dump (Time, Repetition, two angle readings,
# magnet position and field); encoded to bytes so pl.read_csv can consume
# it like an in-memory file.
data_raw = """Time\tRepetition\tLeft_Angle\tRight_Angle\tMagn_Pos\tMagn_Field
0.0\t0\t111.62539060014953\t111.65929559305457\t20.0\t0.05012
289.75\t1\t113.43406129503042\t113.29101205027376\t20.0\t0.05012
343.420999999973\t2\t113.21669960326668\t113.30918399000467\t20.0\t0.05012
397.68700000003446\t0\t114.50650196149256\t114.78488582815113\t10.0\t0.1317
456.10900000005495\t1\t114.7078936381882\t114.70239460290726\t10.0\t0.1317
507.8279999999795\t2\t115.71894177915732\t115.70104461571628\t10.0\t0.1317
565.3429999999935\t0\t121.71521327349599\t121.55379420624988\t5.0\t0.2276
612.045999999973\t1\t122.53171995914443\t122.4555143281342\t5.0\t0.2276
668.3120000000345\t2\t121.65748098845367\t121.60313424823333\t5.0\t0.2276
714.484000000055\t0\t130.88884567117995\t130.82365731381574\t2.5\t0.3011
774.9679999999935\t1\t132.72366563179372\t132.59019277520363\t2.5\t0.3011
817.765000000014\t2\t133.5549497954158\t133.4637401535662\t2.5\t0.3011
891.7029999999795\t0\t139.9155468732065\t139.78384156146674\t0.0\t0.3907
940.655999999959\t1\t143.34707217674438\t143.2278696177915\t0.0\t0.3907
984.125\t2\t144.30042471080577\t144.16800277145435\t0.0\t0.3907""".encode("utf8")

df = pl.read_csv(data_raw, separator='\t')
# With the broken port above, this call reproduces the reported error:
# ColumnNotFoundError: Mean_Angle
df = calc_mean_and_error(df, columns=['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field'])
print(df)

Error:

# ColumnNotFoundError: Mean_Angle

I'm not really sure about the last part though! I am not entirely familiar with the syntax of the expressions. And I am not sure how to prevent calling group_by twice. Can someone lead me in the right direction? Thanks!

Upvotes: 0

Views: 597

Answers (1)

jqurious
jqurious

Reputation: 21229

The "Mean_Angle not found" error occurs because .with_columns() returns a new dataframe - it does not modify the existing one in-place.

You need to assign the result.

df = df.with_columns(...)

After fixing that, you'll want to replace every .map_elements() usage with Expressions.

df = df.with_columns(
    ((pl.col("Left_Angle") + pl.col("Right_Angle")) / 2).alias("Mean_Angle")
)

This produces the same output as your pandas function.

def calc_mean_and_error(df: pl.DataFrame, columns=None, group_by="Magn_Pos") -> pl.DataFrame:
    """Polars port: per-group mean, SDEV, and t-scaled SEM columns.

    Matches the pandas original: for each group of *group_by*, computes
    column means plus standard deviation and standard errors at the 68%,
    95% and 99.7% levels for the angle columns.
    """
    if columns is None:
        columns = ["Left_Angle", "Right_Angle", "Magn_Pos", "Magn_Field"]

    # Prefixes of the angle columns that receive the error statistics.
    aggs = ["Left", "Right"]
    if "Left_Angle" in columns and "Right_Angle" in columns:
        # Assign the result: with_columns returns a new frame (the bug in
        # the question was dropping this return value).
        df = df.with_columns(Mean_Angle = (pl.col("Left_Angle") + pl.col("Right_Angle")) / 2)
        columns.append("Mean_Angle")
        aggs.append("Mean")

    aggs = [f"{agg}_Angle" for agg in aggs]

    mean_df = (
        df
        # Per-row group size via a window, so t-factors can differ by group.
        .with_columns(pl.len().over(group_by))
        .with_columns(
            # map_batches hands the whole "len" column to scipy in one call
            # (vectorized), instead of one Python call per element.
            t_fac_95_conf_int = pl.col("len").map_batches(lambda col: stats.t.ppf(0.95,  col)),
            t_fac_99_conf_int = pl.col("len").map_batches(lambda col: stats.t.ppf(0.997, col)),
            sqrt = pl.col("len").sqrt().alias("sqrt")
        )
        .group_by(group_by)
        .agg(
            pl.col(aggs + ["Magn_Field"]).mean(),
            pl.col(aggs).std().name.suffix("_SDEV"),
            # std() is one value per group while sqrt/t_fac are per-row
            # helper columns; the division broadcasts, so .first() collapses
            # the (identical) per-row results to a single value.
            (pl.col(aggs).std() / pl.col("sqrt")).first().name.suffix("_SEM_68"),
            (pl.col(aggs).std() * pl.col("t_fac_95_conf_int") / pl.col("sqrt")).first().name.suffix("_SEM_95"),
            (pl.col(aggs).std() * pl.col("t_fac_99_conf_int") / pl.col("sqrt")).first().name.suffix("_SEM_99"),
        )
    )

    return mean_df

I've used aggs here along with your initial columns list - but that could be cleaned up and a single list used instead.

Upvotes: 1

Related Questions