Raphael
Raphael

Reputation: 1166

How to convert multi column expressions from Pandas to Polars

I just found out about the Polars lib and I wanted to convert some old functions to get familiar.

However, I stumbled upon an issue with my code. The "Mean_Angle" column is not calculated, and I have no idea if the last part even works as intended; the code aborts during the group_by operation because the column is missing.

This is the pandas code I want to convert:

def calc_mean_and_error(df: pd.DataFrame, columns=None, groupby="Magn_Pos") -> pd.DataFrame:
    """Group *df* and compute per-group means plus spread/error estimates.

    For each group of ``groupby`` this computes the mean of every selected
    column, the standard deviation, the 1-sigma standard error, and
    t-distribution-scaled standard errors at the 95% and 99.7% levels.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; never modified.
    columns : list[str] | None
        Columns to aggregate.  Defaults to the four angle/magnet columns.
        The caller's list is copied, never mutated.
    groupby : str
        Column to group on; restored as a regular column via ``reset_index``.

    Returns
    -------
    pd.DataFrame
        One row per group, in first-appearance order (``sort=False``).
    """
    data = df.copy()
    if columns is None:
        columns = ['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field']
    else:
        # BUG FIX: the original appended 'Mean_Angle' to the caller's list,
        # mutating the argument as a side effect; work on a copy instead.
        columns = list(columns)

    if 'Left_Angle' in columns and 'Right_Angle' in columns:
        data['Mean_Angle'] = (data['Left_Angle'] + data['Right_Angle']) / 2
        columns.append('Mean_Angle')
    grouped_df = data[columns].groupby(groupby, sort=False)

    num_points_per_group = grouped_df.size().values
    mean_df = grouped_df.mean()

    # NOTE(review): like the original, the statistics below assume both
    # angle columns were requested; other `columns` values raise KeyError.
    angle_cols = ['Left_Angle', 'Right_Angle', 'Mean_Angle']
    sem_cols = [c + '_SEM_68' for c in angle_cols]

    # standard deviation
    mean_df[[c + '_SDEV' for c in angle_cols]] = grouped_df[angle_cols].std()

    # standard error, 1 sigma confidence interval (std / sqrt(n))
    mean_df[sem_cols] = grouped_df[angle_cols].sem()

    # standard error, 2 sigma confidence interval - t distribution
    # factor according to https://en.wikipedia.org/wiki/Student%27s_t-distribution
    t_fac_95_conf_int = stats.t.ppf(0.95, num_points_per_group)
    mean_df[[c + '_SEM_95' for c in angle_cols]] = mean_df[sem_cols].multiply(t_fac_95_conf_int, axis=0)

    # standard error, 3 sigma confidence interval - t distribution
    t_fac_99_conf_int = stats.t.ppf(0.997, num_points_per_group)
    mean_df[[c + '_SEM_99' for c in angle_cols]] = mean_df[sem_cols].multiply(t_fac_99_conf_int, axis=0)

    mean_df = mean_df.reset_index()

    return mean_df

This is what I have so far:

def calc_mean_and_error(df: pl.DataFrame, columns=None, group_by="Magn_Pos") -> pl.DataFrame:
    """Attempted Polars port of the pandas function above.

    NOTE(review): as written this raises ``ColumnNotFoundError: Mean_Angle``
    because the ``.with_columns()`` result below is discarded — Polars
    DataFrames are never modified in place.
    """
    if columns is None:
        columns = ['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field']

    if 'Left_Angle' in columns and 'Right_Angle' in columns:
        # BUG: the returned frame is never assigned (needs
        # `df = df.with_columns(...)`), so 'Mean_Angle' never exists.
        df.with_columns(
            pl.struct('Left_Angle', 'Right_Angle').map_elements(lambda x: (x['Left_Angle'] + x['Right_Angle']) / 2).alias("Mean_Angle")
        )
        columns.append('Mean_Angle')
    grouped_df = df.select(columns).group_by(group_by)

    # NOTE(review): this grabs only the FIRST group's size — unlike the
    # pandas version, which kept the per-group sizes as an array; all
    # t-factors below therefore use a single group's n.
    num_points_per_group = grouped_df.count()['count'][0]
    mean_df = grouped_df.mean()

    # factor according to https://en.wikipedia.org/wiki/Student%27s_t-distribution
    t_fac_95_conf_int = stats.t.ppf(0.95, num_points_per_group)
    t_fac_99_conf_int = stats.t.ppf(0.997, num_points_per_group)
    # standard deviation
    # NOTE(review): group_by runs a second time here and mean_df above is
    # overwritten; the per-element map_elements lambdas could be plain
    # expressions instead (see the answer below).
    mean_df = df.select(columns).group_by(group_by).agg(
        pl.all().mean(),
        pl.all().std().name.suffix('_SDEV'),
        pl.all().std().map_elements(lambda x: x / np.sqrt(num_points_per_group)).name.suffix('_SEM_68'), # standard error
        pl.all().std().map_elements(lambda x: x*t_fac_95_conf_int / np.sqrt(num_points_per_group)).name.suffix('_SEM_95'),
        pl.all().std().map_elements(lambda x: x*t_fac_99_conf_int / np.sqrt(num_points_per_group)).name.suffix('_SEM_99'),
    )

    return mean_df

Example:

import polars as pl
from scipy import stats

# Tab-separated measurement dump (Time, Repetition, two angle readings,
# magnet position and field); encoded to bytes so pl.read_csv can consume
# it like an in-memory file.
data_raw = """Time\tRepetition\tLeft_Angle\tRight_Angle\tMagn_Pos\tMagn_Field
0.0\t0\t111.62539060014953\t111.65929559305457\t20.0\t0.05012
289.75\t1\t113.43406129503042\t113.29101205027376\t20.0\t0.05012
343.420999999973\t2\t113.21669960326668\t113.30918399000467\t20.0\t0.05012
397.68700000003446\t0\t114.50650196149256\t114.78488582815113\t10.0\t0.1317
456.10900000005495\t1\t114.7078936381882\t114.70239460290726\t10.0\t0.1317
507.8279999999795\t2\t115.71894177915732\t115.70104461571628\t10.0\t0.1317
565.3429999999935\t0\t121.71521327349599\t121.55379420624988\t5.0\t0.2276
612.045999999973\t1\t122.53171995914443\t122.4555143281342\t5.0\t0.2276
668.3120000000345\t2\t121.65748098845367\t121.60313424823333\t5.0\t0.2276
714.484000000055\t0\t130.88884567117995\t130.82365731381574\t2.5\t0.3011
774.9679999999935\t1\t132.72366563179372\t132.59019277520363\t2.5\t0.3011
817.765000000014\t2\t133.5549497954158\t133.4637401535662\t2.5\t0.3011
891.7029999999795\t0\t139.9155468732065\t139.78384156146674\t0.0\t0.3907
940.655999999959\t1\t143.34707217674438\t143.2278696177915\t0.0\t0.3907
984.125\t2\t144.30042471080577\t144.16800277145435\t0.0\t0.3907""".encode("utf8")

df = pl.read_csv(data_raw, separator='\t')
# With the broken port above, this call reproduces the reported error:
# ColumnNotFoundError: Mean_Angle
df = calc_mean_and_error(df, columns=['Left_Angle', 'Right_Angle', 'Magn_Pos', 'Magn_Field'])
print(df)

Error:

# ColumnNotFoundError: Mean_Angle

I'm not really sure about the last part though! I am not entirely familiar with the syntax of the expressions. And I am not sure how to prevent calling group_by twice. Can someone lead me in the right direction? Thanks!

Upvotes: 0

Views: 597

Answers (1)

jqurious
jqurious

Reputation: 21229

The "Mean_Angle not found" error occurs because .with_columns() returns a new dataframe - it does not modify the existing one in-place.

You need to assign the result.

df = df.with_columns(...)

After fixing that, you'll want to replace every .map_elements() usage with Expressions.

df = df.with_columns(
    ((pl.col("Left_Angle") + pl.col("Right_Angle")) / 2).alias("Mean_Angle")
)

This produces the same output as your pandas function.

def calc_mean_and_error(df: pl.DataFrame, columns=None, group_by="Magn_Pos") -> pl.DataFrame:
    """Polars port: per-group mean, SDEV, and t-scaled SEM columns.

    Matches the pandas original: for each group of *group_by*, computes
    column means plus standard deviation and standard errors at the 68%,
    95% and 99.7% levels for the angle columns.
    """
    if columns is None:
        columns = ["Left_Angle", "Right_Angle", "Magn_Pos", "Magn_Field"]

    # Prefixes of the angle columns that receive the error statistics.
    aggs = ["Left", "Right"]
    if "Left_Angle" in columns and "Right_Angle" in columns:
        # Assign the result: with_columns returns a new frame (the bug in
        # the question was dropping this return value).
        df = df.with_columns(Mean_Angle = (pl.col("Left_Angle") + pl.col("Right_Angle")) / 2)
        columns.append("Mean_Angle")
        aggs.append("Mean")

    aggs = [f"{agg}_Angle" for agg in aggs]

    mean_df = (
        df
        # Per-row group size via a window, so t-factors can differ by group.
        .with_columns(pl.len().over(group_by))
        .with_columns(
            # map_batches hands the whole "len" column to scipy in one call
            # (vectorized), instead of one Python call per element.
            t_fac_95_conf_int = pl.col("len").map_batches(lambda col: stats.t.ppf(0.95,  col)),
            t_fac_99_conf_int = pl.col("len").map_batches(lambda col: stats.t.ppf(0.997, col)),
            sqrt = pl.col("len").sqrt().alias("sqrt")
        )
        .group_by(group_by)
        .agg(
            pl.col(aggs + ["Magn_Field"]).mean(),
            pl.col(aggs).std().name.suffix("_SDEV"),
            # std() is one value per group while sqrt/t_fac are per-row
            # helper columns; the division broadcasts, so .first() collapses
            # the (identical) per-row results to a single value.
            (pl.col(aggs).std() / pl.col("sqrt")).first().name.suffix("_SEM_68"),
            (pl.col(aggs).std() * pl.col("t_fac_95_conf_int") / pl.col("sqrt")).first().name.suffix("_SEM_95"),
            (pl.col(aggs).std() * pl.col("t_fac_99_conf_int") / pl.col("sqrt")).first().name.suffix("_SEM_99"),
        )
    )

    return mean_df

I've used aggs here along with your initial columns list - but that could be cleaned up and a single list used instead.

Upvotes: 1

Related Questions