Reputation: 119

Transform JSON Key into a Polars DataFrame

I was wondering how to read a JSON file into a Polars DataFrame in Rust, building the DataFrame from the contents of the "data" key. However, I suspect that the structure of my JSON file makes this hard to achieve.

Here is the first structure of the JSON file, in which each entry contains a dataType.

{
  "data": [
    {
      "dataItemName": "TICKER",
      "result": [
        "AAPL",
        "MSFT",
        "TSLA"
      ],
      "dataType": "STRING",
      "error": 0
    },
    {
      "dataItemName": "SALES",
      "result": [ 
        259968,
        143015,
        24578
      ],
      "dataType": "DOUBLE",
      "error": 0
    },
    {
      "dataItemName": "CNAME",
      "result": [
        "Apple Inc.",
        "Microsoft Corporation",
        "Tesla Inc"
      ],
      "dataType": "STRING",
      "error": 0
    },
    {
      "dataItemName": "PRICE",
      "result": [
        115.98,
        214.22,
        430.83
      ],
      "dataType": "DOUBLE",
      "error": 0
    },
    {
      "dataItemName": "ASSETS",
      "result": [
        338516,
        301311,
        34309
      ],
      "dataType": "DOUBLE",
      "error": 0
    }
  ]
}

Here is what I have tried in Rust.

use polars::prelude::*;


/// Opens `data/test_merged.json`, lets polars' `JsonReader` parse it as-is and
/// prints the resulting `DataFrame` with its `Debug` formatting.
///
/// Panics (via `unwrap`) if the file cannot be opened or the reader fails.
fn main() {
    let source = std::fs::File::open("data/test_merged.json").unwrap();

    // No schema or options are supplied; the reader decides the frame layout.
    let reader = JsonReader::new(source);
    let frame = reader.finish().unwrap();

    println!("{:?}", frame);
}

Here is the Rust output, which is a DataFrame with a single column and a single row:

shape: (1, 1)
┌───────────────────────────────────┐
│ data                              │
│ ---                               │
│ list[struct[63]]                  │
╞═══════════════════════════════════╡
│ [{0.0,0.530558,3.38631,"2023-06-… │
└───────────────────────────────────┘

There are only 3 data types: strings, floats, and integers.

Here is a similar question for a Python Version. transform json to polars dataframe

Upvotes: 3

Answers (3)

Chayim Friedman

Reputation: 70990

If the JSON is self-describing, i.e. it has no two data types represented with the same JSON type (for example, dates and strings represented as strings) - in other words, the dataType field is redundant, then the absolute fastest way is to deserialize directly into Series:

pub fn directly_into_series(json: &str) -> Result<DataFrame, Box<dyn Error>> {
    use serde::de::{DeserializeSeed, Deserializer, Error, SeqAccess, Visitor};
    use serde::Deserialize;

    #[derive(Debug, Deserialize)]
    #[serde(rename_all = "camelCase")]
    struct Column {
        data_item_name: String,
        #[serde(deserialize_with = "deserialize_values")]
        result: Series,
    }

    fn deserialize_values<'de, D: Deserializer<'de>>(deserializer: D) -> Result<Series, D::Error> {
        struct Builders {
            strings: StringChunkedBuilder,
            floats: PrimitiveChunkedBuilder<Float64Type>,
            has_strings: bool,
            has_floats: bool,
        }
        impl Default for Builders {
            fn default() -> Self {
                Self {
                    strings: StringChunkedBuilder::new("", 0),
                    floats: PrimitiveChunkedBuilder::new("", 0),
                    has_strings: false,
                    has_floats: false,
                }
            }
        }

        struct ElementDeserializer<'a>(&'a mut Builders);

        impl<'de, 'a> DeserializeSeed<'de> for ElementDeserializer<'a> {
            type Value = ();

            fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
            where
                D: Deserializer<'de>,
            {
                deserializer.deserialize_any(self)
            }
        }

        impl<'de, 'a> Visitor<'de> for ElementDeserializer<'a> {
            type Value = ();

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                write!(formatter, "expected a float or string")
            }

            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: Error,
            {
                self.0.strings.append_value(v);
                self.0.has_strings = true;
                Ok(())
            }

            fn visit_f64<E>(self, v: f64) -> Result<Self::Value, E>
            where
                E: Error,
            {
                self.0.floats.append_value(v);
                self.0.has_floats = true;
                Ok(())
            }
        }

        struct SeqVisitor;

        impl<'de> Visitor<'de> for SeqVisitor {
            type Value = Series;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                write!(formatter, "expected a sequence of floats or strings")
            }

            fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
            where
                A: SeqAccess<'de>,
            {
                let mut builders = Builders::default();
                while let Some(()) = seq.next_element_seed(ElementDeserializer(&mut builders))? {}

                match (builders.has_strings, builders.has_floats) {
                    (false, false) | (true, false) => Ok(builders.strings.finish().into_series()),
                    (false, true) => Ok(builders.floats.finish().into_series()),
                    (true, true) => Err(A::Error::custom("sequence with both floats and strings")),
                }
            }
        }

        deserializer.deserialize_seq(SeqVisitor)
    }

    #[derive(Debug, Deserialize)]
    struct Data {
        data: Vec<Column>,
    }

    let data = serde_json::from_str::<Data>(json)?;
    let df = data
        .data
        .into_iter()
        .map(|mut column| {
            column.result.rename(&column.data_item_name);
            column.result
        })
        .collect::<DataFrame>();

    Ok(df)
}

Benchmark with random 10,000 entries:

BallpointBen            time:   [3.2342 ms 3.2510 ms 3.2692 ms]
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) high mild

Benchmarking Mine (other answer): Warming up for 3.0000 s
Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 9.5s, enable flat sampling, or reduce sample count to 50.
Mine (other answer)     time:   [1.8601 ms 1.8670 ms 1.8745 ms]
Found 6 outliers among 100 measurements (6.00%)
  2 (2.00%) high mild
  4 (4.00%) high severe

Deserialize directly into `Series`
                        time:   [739.58 µs 741.60 µs 743.79 µs]
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high severe

Upvotes: 1

Chayim Friedman

Reputation: 70990

If performance is important, then @BallpointBen's version is not the fastest you can get; here's a more performant version:

pub fn convert(json: &str) -> Result<DataFrame, Box<dyn Error>> {
    use serde::Deserialize;

    #[derive(Debug, Deserialize)]
    #[serde(untagged)]
    enum Values {
        String(Vec<String>),
        Double(Vec<f64>),
    }

    #[derive(Debug, Deserialize)]
    #[serde(rename_all = "UPPERCASE")]
    enum DataType {
        String,
        Double,
    }

    #[derive(Debug, Deserialize)]
    #[serde(rename_all = "camelCase")]
    struct Column {
        data_item_name: String,
        result: Values,
        data_type: DataType,
    }

    #[derive(Debug, Deserialize)]
    struct Data {
        data: Vec<Column>,
    }

    let data = serde_json::from_str::<Data>(json)?;
    let df = data
        .data
        .into_iter()
        .map(|column| match column.data_type {
            DataType::String => {
                let Values::String(values) = column.result else {
                    return Err("column type mismatch");
                };
                Ok(Series::new(&column.data_item_name, values))
            }
            DataType::Double => {
                let Values::Double(values) = column.result else {
                    return Err("column type mismatch");
                };
                Ok(Series::from_vec(&column.data_item_name, values))
            }
        })
        .collect::<Result<DataFrame, _>>()?;

    Ok(df)
}

Benchmark with 1,000 random entries:

BallpointBen            time:   [338.41 µs 340.05 µs 341.85 µs]
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) high mild

Mine                    time:   [195.82 µs 196.79 µs 197.95 µs]
Found 11 outliers among 100 measurements (11.00%)
  8 (8.00%) high mild
  3 (3.00%) high severe

Upvotes: 2

BallpointBen

Reputation: 13750

First you should note that 1. in your Python code, Polars is not reading from json, but rather reading from an already-instantiated in-memory dict created from that json 2. the resulting df is almost surely not what you want.

Polars does support serde, but it has its own format, so it's not as simple as just massaging the incoming data. The easiest way is probably to create structs mimicking the structure that Polars expects and implementing all the necessary renaming of fields, then deserializing to make the renaming happen, re-serializing with the renamed fields, and then deserializing again into a DataFrame. The code below requires crates polars with feature serde enabled, serde, and serde_json.

use polars::prelude::*;
use serde::{Deserialize, Serialize};

/// Payload of one column: either all strings or all floating-point numbers.
///
/// The enum is untagged, so no variant name appears in the JSON; serde tries the
/// variants in declaration order and keeps the first one that deserializes.
#[derive(Debug, Deserialize, Serialize)]
#[serde(untagged)]
enum Values {
    /// A column whose `result` array holds strings.
    String(Vec<String>),
    /// A column whose `result` array holds numbers, stored as `f64`.
    Double(Vec<f64>),
}

/// Element type of a column, translated between the incoming JSON spelling and
/// the spelling written back out on serialization.
// NOTE(review): the serialized names are presumably the dtype names polars' own
// serde format uses for the targeted polars version -- confirm when upgrading.
#[derive(Debug, Deserialize, Serialize)]
enum DataType {
    /// Read from `"STRING"`, written as `"Utf8"`.
    #[serde(rename(deserialize = "STRING", serialize = "Utf8"))]
    String,
    /// Read from `"DOUBLE"`; written under the variant's own name, `"Float64"`,
    /// because no serialize rename is given.
    #[serde(rename(deserialize = "DOUBLE"))]
    Float64,
}

/// One entry of the incoming `data` array.
///
/// The renames apply to deserialization only: fields are read under the input's
/// keys (`dataItemName`, `result`, `dataType`) and written back under the Rust
/// field names (`name`, `values`, `datatype`).
#[derive(Debug, Deserialize, Serialize)]
struct Column {
    /// Column name, read from `dataItemName`.
    #[serde(rename(deserialize = "dataItemName"))]
    name: String,
    /// Column contents, read from `result`.
    #[serde(rename(deserialize = "result"))]
    values: Values,
    /// Declared element type, read from `dataType`.
    #[serde(rename(deserialize = "dataType"))]
    datatype: DataType,
}

/// Top-level document: read from an object with a `data` key, written back out
/// with that list under `columns`.
#[derive(Debug, Deserialize, Serialize)]
struct Data {
    /// The list of columns, read from `data`.
    #[serde(rename(deserialize = "data"))]
    columns: Vec<Column>,
}

/// Converts the embedded `DATA` JSON into a `DataFrame` and prints it.
///
/// The conversion is a round trip: parse into `Data` (which applies the
/// deserialize-side renames), serialize that back into a `serde_json::Value`
/// (which applies the serialize-side names), then deserialize the value as a
/// `DataFrame`. Any serde error is propagated to the caller.
fn main() -> anyhow::Result<()> {
    let parsed: Data = serde_json::from_str(DATA)?;
    let renamed = serde_json::to_value(parsed)?;
    let df: DataFrame = serde_json::from_value(renamed)?;

    println!("{df:?}");
    Ok(())
}

The result is

shape: (3, 5)
┌────────┬──────────┬───────────────────────┬────────┬──────────┐
│ TICKER ┆ SALES    ┆ CNAME                 ┆ PRICE  ┆ ASSETS   │
│ ---    ┆ ---      ┆ ---                   ┆ ---    ┆ ---      │
│ str    ┆ f64      ┆ str                   ┆ f64    ┆ f64      │
╞════════╪══════════╪═══════════════════════╪════════╪══════════╡
│ AAPL   ┆ 259968.0 ┆ Apple Inc.            ┆ 115.98 ┆ 338516.0 │
│ MSFT   ┆ 143015.0 ┆ Microsoft Corporation ┆ 214.22 ┆ 301311.0 │
│ TSLA   ┆ 24578.0  ┆ Tesla Inc             ┆ 430.83 ┆ 34309.0  │
└────────┴──────────┴───────────────────────┴────────┴──────────┘

Of course, DATA is the string you provided,

/// Sample input: a JSON object whose `data` array holds five entries (TICKER,
/// SALES, CNAME, PRICE, ASSETS), each with a `dataItemName`, a three-element
/// `result` array, a `dataType` of `"STRING"` or `"DOUBLE"`, and an `error` code.
const DATA: &str = r#"
{
  "data": [
    {
      "dataItemName": "TICKER",
      "result": [
        "AAPL",
        "MSFT",
        "TSLA"
      ],
      "dataType": "STRING",
      "error": 0
    },
    {
      "dataItemName": "SALES",
      "result": [
        259968,
        143015,
        24578
      ],
      "dataType": "DOUBLE",
      "error": 0
    },
    {
      "dataItemName": "CNAME",
      "result": [
        "Apple Inc.",
        "Microsoft Corporation",
        "Tesla Inc"
      ],
      "dataType": "STRING",
      "error": 0
    },
    {
      "dataItemName": "PRICE",
      "result": [
        115.98,
        214.22,
        430.83
      ],
      "dataType": "DOUBLE",
      "error": 0
    },
    {
      "dataItemName": "ASSETS",
      "result": [
        338516,
        301311,
        34309
      ],
      "dataType": "DOUBLE",
      "error": 0
    }
  ]
}
"#;

Upvotes: 1

Transform JSON Key into a Polars DataFrame

Answers (3)

Related Questions