Robert
Robert

Reputation: 161

Peek at the next value in a rust-polars LazyFrame column while still working on the current one

I guess "peeking ahead in a LazyFrame column" is a conceptual oxymoron ... maybe one of you can enlighten me on how best to do it.

I want to put the result of this for each date into a new column:

Ok( (next_weekday_number - current_weekday_number) == 1 )

Here is the sample code to help me find an answer:

// NOTE: remember to enable the required polars feature flags in your Cargo.toml

use polars::export::arrow::temporal_conversions::date32_to_date;
use polars::prelude::*;

/// Builds a small sample frame of date strings, parses them into a `Date`
/// column and derives a numeric weekday column, all on a `LazyFrame`, then
/// prints the collected result.
///
/// # Errors
///
/// Returns an error if the sample `DataFrame` cannot be constructed.
fn main() -> Result<()> {
    let days = df!(
        "date_string" => &["1900-01-01", "1900-01-02", "1900-01-03", "1900-01-04", "1900-01-05",
        "1900-01-06", "1900-01-07", "1900-01-09", "1900-01-10"])?;

    let options = StrpTimeOptions {
        date_dtype: DataType::Date,   // the result column-datatype
        fmt: Some("%Y-%m-%d".into()), // the source format of the date-string
        strict: false,
        exact: true,
    };

    // convert date_string into dtype(date) and put into new column "date_type"
    // we convert the days DataFrame to a LazyFrame ...
    // because in my real-world example I am getting a LazyFrame
    let mut new_days = days.lazy().with_column(
        col("date_string")
            .alias("date_type")
            .str()
            .strptime(options),
    );

    // This is what I wanted to do ... but I get a string result .. need u32
    // let o = GetOutput::from_type(DataType::Date);
    // new_days = new_days.with_column(
    //     col("date_type")
    //         .alias("weekday_number")
    //         .map(|x| Ok(x.strftime("%w").unwrap()), o.clone()),
    // );

    // This is the convoluted workaround
    // The closure below produces a UInt32 series, so UInt32 (not Date) is the
    // output type that must be declared for the lazy schema.
    let o = GetOutput::from_type(DataType::UInt32);
    new_days = new_days.with_column(col("date_type").alias("weekday_number").map(
        |x| {
            // Propagate a dtype mismatch as an error instead of panicking.
            Ok(x.date()?
                .into_iter()
                .map(|opt_days: Option<i32>| {
                    opt_days.map(|datum: i32| {
                        // println!("{:?}", datum);
                        // "%w" formats the weekday as a number with Sunday = 0.
                        date32_to_date(datum)
                            .format("%w")
                            .to_string()
                            .parse::<u32>()
                            .expect("%w always formats as a decimal weekday number")
                    })
                })
                .collect::<UInt32Chunked>()
                .into_series())
        },
        o,
    ));

    // Here is where my challenge is ..
    // I need to get the weekday_number of the following day to determine a condition
    // my pseudo code:
    // new_days = new_days.with_column(
    //     col("weekday_number")
    //         .alias("cold_day")
    //         .map(|x| Ok( (next_weekday_number - current_weekday_number) == 1 ), o.clone()),
    // );

    // `new_days` is not used afterwards, so it can be consumed directly.
    println!("{:?}", new_days.collect());

    Ok(())
}

Upvotes: 0

Views: 321

Answers (1)

Robert
Robert

Reputation: 161

Ok, I could not find a way to do everything with a LazyFrame, so I converted the LazyFrame to an eager DataFrame, which let me process two columns at the same time.

So it's working for now. Maybe someone can help me implement a solution using only a LazyFrame.

Here is the working code:

use polars::export::arrow::temporal_conversions::date32_to_date;

use polars::prelude::*;

/// Parses sample date strings into a `Date` column, derives a numeric weekday
/// column plus a shifted "next weekday" column on a `LazyFrame`, then collects
/// to an eager `DataFrame` to compare the two columns row by row.
///
/// # Errors
///
/// Returns an error if the sample `DataFrame` cannot be built, the lazy query
/// fails to collect, a column lookup or cast fails, or the result column
/// cannot be added.
fn main() -> Result<()> {
    let days = df!(
        "date_string" => &["1900-01-01", "1900-01-02", "1900-01-03", "1900-01-04", "1900-01-05",
        "1900-01-06", "1900-01-07", "1900-01-09", "1900-01-10"])?;

    let options = StrpTimeOptions {
        date_dtype: DataType::Date,   // the result column-datatype
        fmt: Some("%Y-%m-%d".into()), // the source format of the date-string
        strict: false,
        exact: true,
    };

    // convert date_string into dtype(date) and put into new column "date_type"
    // we convert the days DataFrame to a LazyFrame ...
    // because in my real-world example I am getting a LazyFrame
    let mut new_days_lf = days.lazy().with_column(
        col("date_string")
            .alias("date_type")
            .str()
            .strptime(options),
    );

    // Getting the weekday as a number:
    // This is what I wanted to do ... but I get a string result .. need u32
    // let o = GetOutput::from_type(DataType::Date);
    // new_days_lf = new_days_lf.with_column(
    //     col("date_type")
    //         .alias("weekday_number")
    //         .map(|x| Ok(x.strftime("%w").unwrap()), o.clone()),
    // );

    // This is the convoluted workaround for getting the weekday as a number
    // The closure below produces a UInt32 series, so UInt32 (not Date) is the
    // output type that must be declared for the lazy schema.
    let o = GetOutput::from_type(DataType::UInt32);
    new_days_lf = new_days_lf.with_column(col("date_type").alias("weekday_number").map(
        |x| {
            // Propagate a dtype mismatch as an error instead of panicking.
            Ok(x.date()?
                .into_iter()
                .map(|opt_days: Option<i32>| {
                    opt_days.map(|datum: i32| {
                        // println!("{:?}", datum);
                        // "%w" formats the weekday as a number with Sunday = 0.
                        date32_to_date(datum)
                            .format("%w")
                            .to_string()
                            .parse::<u32>()
                            .expect("%w always formats as a decimal weekday number")
                    })
                })
                .collect::<UInt32Chunked>()
                .into_series())
        },
        o,
    ));

    // The "peek" ==> add a shifted column
    // The last row has no successor; it is filled with 9999 so that its
    // difference to the current weekday can never match the comparison below.
    new_days_lf = new_days_lf.with_column(
        col("weekday_number")
            .shift_and_fill(-1, 9999)
            .alias("next_weekday_number"),
    );

    // now we convert the LazyFrame into a normal DataFrame for further processing:
    let mut new_days_df = new_days_lf.collect()?;

    // to get a column by name we need to collect the LazyFrame into a normal DataFrame
    // Both weekday columns are unsigned. They are cast to a signed type before
    // subtracting because the difference is negative whenever the week wraps
    // around (e.g. Saturday = 6 followed by Sunday = 0), which would underflow
    // an unsigned subtraction.
    let current_weekday = new_days_df
        .column("weekday_number")?
        .cast(&DataType::Int64)?;
    let next_weekday = new_days_df
        .column("next_weekday_number")?
        .cast(&DataType::Int64)?;

    // now I can use series-arithmetics
    let diff = &next_weekday - &current_weekday;

    // create a bool column based on "element == 2"
    // add bool column to DataFrame
    // NOTE(review): the question's pseudo code compares the difference against 1,
    // while this answer compares against 2 -- confirm which one is intended.
    new_days_df.replace_or_add("weekday diff eq(2)", diff.equal(2)?.into_series())?;

    println!("{:?}", new_days_df);

    Ok(())
}

Upvotes: 0

Related Questions