Reputation: 11
I would like to replace with NaN
s values that are more than 0.99 quantile and less than 0.01 quantile in the whole dataframe.
For now I found a way of doing so with one column, so I can do it one-at-a-time, but maybe there is possibility to apply the function to all the columns without ugly for-loops?
I also tried numpy
implementation with masking, but since the length of the result is not constant, this does not seem like a proper solution to me.
Quantile replacer for one column that works:
train_pl.select(
pl.when(pl.col('B_14') > pl.col('B_14').quantile(0.99))
.then(float("nan"))
.otherwise(pl.col('B_14'))
)
And here are my numpy functions in case you need them:
def replace_high_quantile(arr, q = 0.99):
mask = arr <= np.quantile(arr, q)
return ma.masked_array(arr, ~mask).filled(np.nan)
def replace_low_quantile(arr, q = 0.01):
mask = arr >= np.quantile(arr, q)
return ma.masked_array(arr, ~mask).filled(np.nan)
def replace_both_quantiles(arr, low = 0.01, high = 0.99):
mask = (arr >= np.quantile(arr, low)) & (arr <= np.quantile(arr, high)
return ma.masked_array(arr, ~mask).filled(np.nan)
Upvotes: 1
Views: 186
Reputation:
Let's start with this data:
import polars as pl
train_pl = pl.DataFrame(
{
"B_14": pl.int_range(101, eager=True),
"B_15": pl.int_range(101, eager=True).reverse(),
"B_16": pl.int_range(101, eager=True),
"other": pl.int_range(101, eager=True).reverse(),
}
)
train_pl
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═══════╡
│ 0 ┆ 100 ┆ 0 ┆ 100 │
│ 1 ┆ 99 ┆ 1 ┆ 99 │
│ 2 ┆ 98 ┆ 2 ┆ 98 │
│ 3 ┆ 97 ┆ 3 ┆ 97 │
│ 4 ┆ 96 ┆ 4 ┆ 96 │
│ … ┆ … ┆ … ┆ … │
│ 96 ┆ 4 ┆ 96 ┆ 4 │
│ 97 ┆ 3 ┆ 97 ┆ 3 │
│ 98 ┆ 2 ┆ 98 ┆ 2 │
│ 99 ┆ 1 ┆ 99 ┆ 1 │
│ 100 ┆ 0 ┆ 100 ┆ 0 │
└──────┴──────┴──────┴───────┘
To replace all values above the 0.99 quantile and below the 0.01 quantile, simultaneously, and for all columns in parallel:
(
train_pl
.with_columns(
pl.when(
(pl.all() > pl.all().quantile(0.99)) |
(pl.all() < pl.all().quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.all())
.name.keep()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪═══════╡
│ NaN ┆ NaN ┆ NaN ┆ NaN │
│ 1.0 ┆ 99.0 ┆ 1.0 ┆ 99.0 │
│ 2.0 ┆ 98.0 ┆ 2.0 ┆ 98.0 │
│ 3.0 ┆ 97.0 ┆ 3.0 ┆ 97.0 │
│ 4.0 ┆ 96.0 ┆ 4.0 ┆ 96.0 │
│ … ┆ … ┆ … ┆ … │
│ 96.0 ┆ 4.0 ┆ 96.0 ┆ 4.0 │
│ 97.0 ┆ 3.0 ┆ 97.0 ┆ 3.0 │
│ 98.0 ┆ 2.0 ┆ 98.0 ┆ 2.0 │
│ 99.0 ┆ 1.0 ┆ 99.0 ┆ 1.0 │
│ NaN ┆ NaN ┆ NaN ┆ NaN │
└──────┴──────┴──────┴───────┘
To restrict the operation to only some columns:
only_these = ['B_14', 'B_15', 'other']
(
train_pl
.with_columns(
pl.when(
(pl.col(only_these) > pl.col(only_these).quantile(0.99)) |
(pl.col(only_these) < pl.col(only_these).quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.col(only_these))
.name.keep()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ i64 ┆ f64 │
╞══════╪══════╪══════╪═══════╡
│ NaN ┆ NaN ┆ 0 ┆ NaN │
│ 1.0 ┆ 99.0 ┆ 1 ┆ 99.0 │
│ 2.0 ┆ 98.0 ┆ 2 ┆ 98.0 │
│ 3.0 ┆ 97.0 ┆ 3 ┆ 97.0 │
│ 4.0 ┆ 96.0 ┆ 4 ┆ 96.0 │
│ … ┆ … ┆ … ┆ … │
│ 96.0 ┆ 4.0 ┆ 96 ┆ 4.0 │
│ 97.0 ┆ 3.0 ┆ 97 ┆ 3.0 │
│ 98.0 ┆ 2.0 ┆ 98 ┆ 2.0 │
│ 99.0 ┆ 1.0 ┆ 99 ┆ 1.0 │
│ NaN ┆ NaN ┆ 100 ┆ NaN │
└──────┴──────┴──────┴───────┘
I should also mention that you can use regex expressions in polars.col
. Just remember that the regex expression must start with ^
and end with $
. (These cannot be omitted.)
regex = r"^B_.*$"
(
train_pl
.with_columns(
pl.when(
(pl.col(regex) > pl.col(regex).quantile(0.99)) |
(pl.col(regex) < pl.col(regex).quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.col(regex))
.name.keep()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ i64 │
╞══════╪══════╪══════╪═══════╡
│ NaN ┆ NaN ┆ NaN ┆ 100 │
│ 1.0 ┆ 99.0 ┆ 1.0 ┆ 99 │
│ 2.0 ┆ 98.0 ┆ 2.0 ┆ 98 │
│ 3.0 ┆ 97.0 ┆ 3.0 ┆ 97 │
│ 4.0 ┆ 96.0 ┆ 4.0 ┆ 96 │
│ … ┆ … ┆ … ┆ … │
│ 96.0 ┆ 4.0 ┆ 96.0 ┆ 4 │
│ 97.0 ┆ 3.0 ┆ 97.0 ┆ 3 │
│ 98.0 ┆ 2.0 ┆ 98.0 ┆ 2 │
│ 99.0 ┆ 1.0 ┆ 99.0 ┆ 1 │
│ NaN ┆ NaN ┆ NaN ┆ 0 │
└──────┴──────┴──────┴───────┘
You can also select which columns by datatype.
these_types = [pl.Int64, pl.Float64]
(
train_pl
.with_columns(
pl.when(
(pl.col(these_types) > pl.col(these_types).quantile(0.99)) |
(pl.col(these_types) < pl.col(these_types).quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.col(these_types))
.name.keep()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪═══════╡
│ NaN ┆ NaN ┆ NaN ┆ NaN │
│ 1.0 ┆ 99.0 ┆ 1.0 ┆ 99.0 │
│ 2.0 ┆ 98.0 ┆ 2.0 ┆ 98.0 │
│ 3.0 ┆ 97.0 ┆ 3.0 ┆ 97.0 │
│ 4.0 ┆ 96.0 ┆ 4.0 ┆ 96.0 │
│ … ┆ … ┆ … ┆ … │
│ 96.0 ┆ 4.0 ┆ 96.0 ┆ 4.0 │
│ 97.0 ┆ 3.0 ┆ 97.0 ┆ 3.0 │
│ 98.0 ┆ 2.0 ┆ 98.0 ┆ 2.0 │
│ 99.0 ┆ 1.0 ┆ 99.0 ┆ 1.0 │
│ NaN ┆ NaN ┆ NaN ┆ NaN │
└──────┴──────┴──────┴───────┘
Sometimes it's easier to specify the columns that you do not want. The above methods will work with polars.exclude
, so that you can exclude columns that fit some criteria. For example, to select all columns except columns of strings (polars.String
):
these_types = [pl.String]
(
train_pl
.with_columns(
pl.when(
(pl.exclude(these_types) > pl.exclude(these_types).quantile(0.99)) |
(pl.exclude(these_types) < pl.exclude(these_types).quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.exclude(these_types))
.name.keep()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╪═══════╡
│ NaN ┆ NaN ┆ NaN ┆ NaN │
│ 1.0 ┆ 99.0 ┆ 1.0 ┆ 99.0 │
│ 2.0 ┆ 98.0 ┆ 2.0 ┆ 98.0 │
│ 3.0 ┆ 97.0 ┆ 3.0 ┆ 97.0 │
│ 4.0 ┆ 96.0 ┆ 4.0 ┆ 96.0 │
│ … ┆ … ┆ … ┆ … │
│ 96.0 ┆ 4.0 ┆ 96.0 ┆ 4.0 │
│ 97.0 ┆ 3.0 ┆ 97.0 ┆ 3.0 │
│ 98.0 ┆ 2.0 ┆ 98.0 ┆ 2.0 │
│ 99.0 ┆ 1.0 ┆ 99.0 ┆ 1.0 │
│ NaN ┆ NaN ┆ NaN ┆ NaN │
└──────┴──────┴──────┴───────┘
Or for example, to exclude any column that beings with B_
:
regex = r"^B_.*$"
(
train_pl
.with_columns(
pl.when(
(pl.exclude(regex) > pl.exclude(regex).quantile(0.99)) |
(pl.exclude(regex) < pl.exclude(regex).quantile(0.01))
)
.then(float("nan"))
.otherwise(pl.exclude(regex))
.name.keep()
)
)
shape: (101, 4)
┌──────┬──────┬──────┬───────┐
│ B_14 ┆ B_15 ┆ B_16 ┆ other │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ f64 │
╞══════╪══════╪══════╪═══════╡
│ 0 ┆ 100 ┆ 0 ┆ NaN │
│ 1 ┆ 99 ┆ 1 ┆ 99.0 │
│ 2 ┆ 98 ┆ 2 ┆ 98.0 │
│ 3 ┆ 97 ┆ 3 ┆ 97.0 │
│ 4 ┆ 96 ┆ 4 ┆ 96.0 │
│ … ┆ … ┆ … ┆ … │
│ 96 ┆ 4 ┆ 96 ┆ 4.0 │
│ 97 ┆ 3 ┆ 97 ┆ 3.0 │
│ 98 ┆ 2 ┆ 98 ┆ 2.0 │
│ 99 ┆ 1 ┆ 99 ┆ 1.0 │
│ 100 ┆ 0 ┆ 100 ┆ NaN │
└──────┴──────┴──────┴───────┘
Upvotes: 3