Reputation: 1170
I am trying to read a fixed-width file with Rust, with the goal of loading the data into a Polars dataframe; however, I can't parse the file correctly. I am trying to use the fixed_width crate, as Polars lacks a utility to parse such files. Unfortunately, the fixed_width documentation does not have any example where a file is read; all the examples read data from a string.
Here is my failed attempt (the commented code works, as it is a copy-paste from the docs); obviously, the file contains the same data as the string:
use serde::{Serialize, Deserialize};
use fixed_width::{FixedWidth, FieldSet, Reader};

fn main() {
    let r_file = "path/to/file.r01";
    // let data = String::from("R 5001.00 1001.00 513777.5 2093285.7 0.0\nR 5001.00 1002.00 513786.6 2093281.6 0.0\nR 5001.00 1003.00 513795.7 2093277.4 0.0\nR 5001.00 1004.00 513708.8 2093273.3 0.0\n");

    #[derive(Serialize, Deserialize, Debug)]
    struct SpsRecord {
        pub line: f32,
        pub point: f32,
        pub easting: f32,
        pub northing: f32
    }

    impl FixedWidth for SpsRecord {
        fn fields() -> FieldSet {
            FieldSet::Seq(vec![
                FieldSet::new_field(1..11).name("line"),
                FieldSet::new_field(11..21).name("point"),
                FieldSet::new_field(46..55).name("easting"),
                FieldSet::new_field(55..65).name("northing")
            ])
        }
    }

    impl SpsRecord {
        fn from_file(path: &str) -> Result<Vec<Self>, fixed_width::Error> {
            let mut reader = Reader::from_file(path)?;
            let records: Result<Vec<Self>, fixed_width::Error> = reader
                .byte_reader()
                .filter_map(Result::ok)
                .map(|bytes| fixed_width::from_bytes(&bytes))
                .collect();
            match records {
                Ok(records) => Ok(records),
                Err(err) => Err(fixed_width::Error::from(err))
            }
        }

        // fn from_string(data: &str) -> Result<Vec<Self>, fixed_width::Error> {
        //     let mut reader = Reader::from_string(data).width(72);
        //     let records: Result<Vec<Self>, fixed_width::Error> = reader
        //         .byte_reader()
        //         .filter_map(Result::ok)
        //         .map(|bytes| fixed_width::from_bytes(&bytes))
        //         .collect();
        //     match records {
        //         Ok(records) => Ok(records),
        //         Err(err) => Err(fixed_width::Error::from(err))
        //     }
        // }
    }

    println!("Reading {}...", r_file);
    match SpsRecord::from_file(r_file) {
        Ok(records) => {
            for record in records {
                println!("{:?}", record);
            }
        }
        Err(err) => {
            eprintln!("{:#?}", err);
        }
    }

    // println!("Reading\n{}...", data);
    // match SpsRecord::from_string(&data) {
    //     Ok(records) => {
    //         for record in records {
    //             println!("{:?}", record);
    //         }
    //     }
    //     Err(err) => {
    //         eprintln!("{:#?}", err);
    //     }
    // }
}
The code runs and prints the "Reading..." line, but then does absolutely nothing, so I don't know where to look.
Upvotes: 0
Views: 253
Reputation: 681
Here is an implementation directly with rust-polars, inspired by the Python one.
The idea is to read the file as a single column and split it with Polars slice expressions.
As an example, two types are handled here, integer and character, but the ugly if/else can be extended; it would be better to use a match that infers Polars types from short type codes (c: char, i: int, d: date). A sketch of such a match is shown after the output below.
temp_file_fwf.txt
000000001Strawberryfieldsforever
000000002Strawberryfieldsforever
000000003Strawberryfieldsforever
000000004Strawberryfieldsforever
000000005Strawberryfieldsforever
000000006Strawberryfieldsforever
pub fn main() {
    use polars::prelude::*;

    // Specify positions, lengths, names and types of the columns to parse
    let column_names: Vec<&str> = vec!["col_1", "col_2", "col_3", "col_4"];
    let column_starts: Vec<i64> = vec![0, 9, 19, 25];
    let column_lengths: Vec<i64> = vec![9, 10, 6, 40];
    let column_types: Vec<&str> = vec!["i", "c", "c", "c"];

    // initialize a vector to "list" polars expressions, one per column
    let mut _vec_expr: Vec<Expr> = vec![];
    let mut _i = 0;
    while _i < column_names.len() {
        // character first
        if column_types[_i] == "c" {
            // if character then slice
            _vec_expr.append(&mut
                vec![col("l").str()
                    .slice(column_starts[_i], Some(column_lengths[_i].try_into().unwrap()))
                    .alias(column_names[_i])]);
        } // if integer then slice and cast to integer (i32 here)
        else if ["i"].contains(&column_types[_i]) {
            // integer next
            _vec_expr.append(&mut
                vec![col("l").str()
                    .slice(column_starts[_i], Some(column_lengths[_i].try_into().unwrap()))
                    .cast(DataType::Int32)
                    .alias(column_names[_i])]);
        }
        _i += 1;
    }
    println!("{:?}", _vec_expr);

    let path = "temp_file_fwf.txt";
    // Read with the csv reader lazily (if you have commas in the file, change the delimiter)
    let data_ = LazyCsvReader::new(path)
        // read just one column named "l" for line
        .with_schema(Some(Arc::new(Schema::from_iter(vec![Field::new("l", DataType::Utf8)]).into())))
        .has_header(false)
        // test the first 100 lines
        .with_n_rows(Some(100))
        .finish()
        .unwrap();

    // append the polars lazyframe with the expressions generated above
    let data_ = data_
        .with_columns(
            _vec_expr
        );

    // collect
    println!("{:?}", data_.collect());
}
[col("l").str.slice().cast(Int32).alias("col_1"), col("l").str.slice().alias("col_2"), col("l").str.slice().alias("col_3"), col("l").str.slice().alias("col_4")]
Ok(shape: (6, 5)
┌──────────────────────────────────┬───────┬────────────┬────────┬─────────┐
│ l ┆ col_1 ┆ col_2 ┆ col_3 ┆ col_4 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i32 ┆ str ┆ str ┆ str │
╞══════════════════════════════════╪═══════╪════════════╪════════╪═════════╡
│ 000000001Strawberryfieldsforever ┆ 1 ┆ Strawberry ┆ fields ┆ forever │
│ 000000002Strawberryfieldsforever ┆ 2 ┆ Strawberry ┆ fields ┆ forever │
│ 000000003Strawberryfieldsforever ┆ 3 ┆ Strawberry ┆ fields ┆ forever │
│ 000000004Strawberryfieldsforever ┆ 4 ┆ Strawberry ┆ fields ┆ forever │
│ 000000005Strawberryfieldsforever ┆ 5 ┆ Strawberry ┆ fields ┆ forever │
│ 000000006Strawberryfieldsforever ┆ 6 ┆ Strawberry ┆ fields ┆ forever │
└──────────────────────────────────┴───────┴────────────┴────────┴─────────┘)
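As mentioned above, the if/else could be replaced by a match over the short type codes. Here is a minimal sketch, assuming the same polars version, the same str().slice(i64, Option<u64>) signature, and the same single "l" column as the code above; the fwf_expr helper name is hypothetical, not part of the code above:

use polars::prelude::*;

// Hypothetical helper: build one expression per fixed-width column,
// choosing the cast from a short type code ("i" = integer, "c" = character).
// Dates ("d") would need a real date parse rather than a plain cast,
// so anything other than "i" is left as a string here.
fn fwf_expr(start: i64, length: u64, name: &str, type_code: &str) -> Expr {
    // slice the raw line, then cast according to the type code, then name the column
    let sliced = col("l").str().slice(start, Some(length));
    let typed = match type_code {
        "i" => sliced.cast(DataType::Int32),
        _ => sliced, // "c" and anything unrecognised stay as strings
    };
    typed.alias(name)
}

// usage, replacing the while loop above:
// let _vec_expr: Vec<Expr> = (0..column_names.len())
//     .map(|i| fwf_expr(
//         column_starts[i],
//         column_lengths[i] as u64,
//         column_names[i],
//         column_types[i],
//     ))
//     .collect();

Adding a new type then only means adding one match arm instead of another else-if branch.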
Duplicated on the GitHub polars repo: https://github.com/pola-rs/polars/issues/13632
Upvotes: 0