YeO
YeO

Reputation: 1170

Reading a fixed width file with Rust

I am trying to read a fixed-width file with Rust, with the goal of loading the data into a Polars DataFrame; however, I can't parse the file correctly.
I am trying to use the fixed_width crate as polars lacks the utility to parse such files.
Unfortunately, the fixed_width documentation does not have any example where a file is read, all the examples read data from a string...

Here is my failed attempt (the commented-out code works, as it is a copy-paste from the docs); obviously, the file contains the same data as the string:

use serde::{Serialize, Deserialize};
use fixed_width::{FixedWidth, FieldSet, Reader};


// Entry point: declares a fixed-width record type (`SpsRecord`), tries to read
// every record of `r_file` through it, and prints each parsed record (or the
// error) to the terminal.
fn main() {
    // Path of the fixed-width input file (placeholder value).
    let r_file = "path/to/file.r01";
    // In-memory copy of the same data, used by the commented-out `from_string` variant below.
    // let data = String::from("R   5001.00   1001.00                          513777.5 2093285.7   0.0\nR   5001.00   1002.00                          513786.6 2093281.6   0.0\nR   5001.00   1003.00                          513795.7 2093277.4   0.0\nR   5001.00   1004.00                          513708.8 2093273.3   0.0\n");

    // One parsed record. The field names match the names given to the ranges
    // declared in `fields()` below.
    #[derive(Serialize, Deserialize, Debug)]
    struct SpsRecord {
        pub line: f32,
        pub point: f32,
        pub easting: f32,
        pub northing: f32
    }

    // Describes where each named field sits inside one record.
    impl FixedWidth for SpsRecord {
        fn fields() -> FieldSet {
            // Ranges are start..end positions within a record (presumably byte
            // offsets -- confirm against the fixed_width docs). Position 0 and
            // positions 21..46 are not mapped to any field.
            FieldSet::Seq(vec![
                FieldSet::new_field(1..11).name("line"),
                FieldSet::new_field(11..21).name("point"),
                FieldSet::new_field(46..55).name("easting"),
                FieldSet::new_field(55..65).name("northing")
            ])
        }
    }

    impl SpsRecord {
        // Opens `path` with a fixed_width `Reader` and deserializes each record
        // it yields into a `SpsRecord`.
        //
        // Returns the collected records, or the first deserialization error.
        // Failing to open the file is propagated by `?`.
        //
        // NOTE(review): the commented-out `from_string` variant below calls
        // `.width(72)` on its reader; this reader sets no width. That difference
        // is a likely cause of the empty output -- confirm against the
        // fixed_width `Reader` docs.
        fn from_file(path: &str) -> Result<Vec<Self>, fixed_width::Error> {
            let mut reader = Reader::from_file(path)?;
            let records: Result<Vec<Self>, fixed_width::Error> = reader
                .byte_reader()
                // Keeps only the `Ok` items: any record that failed to read is
                // dropped here, so read errors never reach the caller.
                .filter_map(Result::ok)
                .map(|bytes| fixed_width::from_bytes(&bytes))
                .collect();
            // `records` already has the function's return type; this match
            // passes it through unchanged.
            match records {
                Ok(records) => Ok(records),
                Err(err) => Err(fixed_width::Error::from(err))
            }
        }

        // Working variant copied from the docs: same pipeline, but reading from
        // a string and with an explicit record width of 72.
        // fn from_string(data: &str) -> Result<Vec<Self>, fixed_width::Error> {
        //     let mut reader = Reader::from_string(data).width(72);
        //     let records: Result<Vec<Self>, fixed_width::Error> = reader
        //         .byte_reader()
        //         .filter_map(Result::ok)
        //         .map(|bytes| fixed_width::from_bytes(&bytes))
        //         .collect();
        //     match records {
        //         Ok(records) => Ok(records),
        //         Err(err) => Err(fixed_width::Error::from(err))
        //     }
        // }
    }

    // Read the file and print one record per line, or the error on stderr.
    // An `Ok` holding an empty vector prints nothing at all.
    println!("Reading {}...", r_file);
    match SpsRecord::from_file(r_file) {
        Ok(records) => {
            for record in records {
                println!("{:?}", record);
            }
        }
        Err(err) => {
            eprintln!("{:#?}", err);
        }
    }

    // Same driver for the in-memory `from_string` variant.
    // println!("Reading\n{}...", data);
    // match SpsRecord::from_string(&data) {
    //     Ok(records) => {
    //         for record in records {
    //             println!("{:?}", record);
    //         }
    //     }
    //     Err(err) => {
    //         eprintln!("{:#?}", err);
    //     }
    // } 
}

The code runs, prints the "Reading..." line, and then does absolutely nothing else, so I don't know where to look.

Upvotes: 0

Views: 253

Answers (1)

Guillaume
Guillaume

Reputation: 681

Here is an implementation that uses rust-polars directly, inspired by the Python one.

The idea is to read the file as one column and split it with polars slice expressions.

As an example, only two types are generated here: integer and character. It is possible to extend the ugly if/else, but it would be better to use a match to infer the polars types from short type codes (c: char, i: int, d: date).

temp_file_fwf.txt

000000001Strawberryfieldsforever
000000002Strawberryfieldsforever
000000003Strawberryfieldsforever
000000004Strawberryfieldsforever
000000005Strawberryfieldsforever
000000006Strawberryfieldsforever
/// Reads a fixed-width text file with polars by loading every line into a
/// single string column named "l" and slicing that column into typed fields.
///
/// The layout is hard-coded below as four parallel lists. Prints the generated
/// expressions, then the collected result (or the error) in `Debug` form.
///
/// # Panics
///
/// Panics if the lazy CSV reader cannot be created for the input path.
pub fn main() {
    use polars::prelude::*;

    // Field layout: one entry per output column, kept as four parallel lists
    // (name, start offset, length, short type code).
    // Type codes handled below: "i" = integer, "c" = character.
    let column_names: Vec<&str> = vec!["col_1", "col_2", "col_3", "col_4"];
    let column_starts: Vec<i64> = vec![0, 9, 19, 25];
    let column_lengths: Vec<i64> = vec![9, 10, 6, 40];
    let column_types: Vec<&str> = vec!["i", "c", "c", "c"];

    // One polars expression per column, each cutting its field out of "l".
    let mut column_exprs: Vec<Expr> = Vec::with_capacity(column_names.len());

    let layout = column_names
        .iter()
        .zip(&column_starts)
        .zip(&column_lengths)
        .zip(&column_types);
    for (((&name, &start), &length), &kind) in layout {
        // The slice is the same for every type; only the cast differs.
        // The lengths above are non-negative literals, so the conversion cannot fail.
        let field = col("l")
            .str()
            .slice(start, Some(length.try_into().unwrap()));

        if kind == "c" {
            // Character field: keep the sliced text as is.
            column_exprs.push(field.alias(name));
        } else if kind == "i" {
            // Integer field: slice, then cast to a 32-bit integer.
            column_exprs.push(field.cast(DataType::Int32).alias(name));
        }
        // Any other type code produces no expression, i.e. the column is skipped.
    }

    println!("{:?}", column_exprs);

    let path = "temp_file_fwf.txt";
    // Read lazily with the CSV reader (if the file contains commas, change the
    // delimiter so each line still lands in one column).
    let lines = LazyCsvReader::new(path)
        // Single string column named "l" (for "line").
        .with_schema(Some(Arc::new(
            Schema::from_iter(vec![Field::new("l", DataType::Utf8)]).into(),
        )))
        .has_header(false)
        // Only the first 100 rows, for testing.
        .with_n_rows(Some(100))
        .finish()
        .unwrap();

    // Add the sliced columns next to the raw "l" column.
    let data = lines.with_columns(column_exprs);

    // Execute the lazy query and print the result.
    println!("{:?}", data.collect());
}
[col("l").str.slice().cast(Int32).alias("col_1"), col("l").str.slice().alias("col_2"), col("l").str.slice().alias("col_3"), col("l").str.slice().alias("col_4")]
Ok(shape: (6, 5)
┌──────────────────────────────────┬───────┬────────────┬────────┬─────────┐
│ l                                ┆ col_1 ┆ col_2      ┆ col_3  ┆ col_4   │
│ ---                              ┆ ---   ┆ ---        ┆ ---    ┆ ---     │
│ str                              ┆ i32   ┆ str        ┆ str    ┆ str     │
╞══════════════════════════════════╪═══════╪════════════╪════════╪═════════╡
│ 000000001Strawberryfieldsforever ┆ 1     ┆ Strawberry ┆ fields ┆ forever │
│ 000000002Strawberryfieldsforever ┆ 2     ┆ Strawberry ┆ fields ┆ forever │
│ 000000003Strawberryfieldsforever ┆ 3     ┆ Strawberry ┆ fields ┆ forever │
│ 000000004Strawberryfieldsforever ┆ 4     ┆ Strawberry ┆ fields ┆ forever │
│ 000000005Strawberryfieldsforever ┆ 5     ┆ Strawberry ┆ fields ┆ forever │
│ 000000006Strawberryfieldsforever ┆ 6     ┆ Strawberry ┆ fields ┆ forever │
└──────────────────────────────────┴───────┴────────────┴────────┴─────────┘)

Duplicated on the github polars repo: https://github.com/pola-rs/polars/issues/13632

Upvotes: 0

Related Questions