anderwyang
anderwyang

Reputation: 2411

In arrow, how to create a schema according to the original variable types and avoid typing it in manually

In arrow, a schema can be created with map, as in my_schema_1 below. I want to create my_schema_2, with each field's type set to double or string according to the variable's type. However, the order of the fields in my_schema_2 does not match the order of the column names in the CSV file sample.csv, so arrow_data_2 %>% as.data.frame() fails. How can I fix it? Thanks!

library(tidyverse)
library(arrow)
library(lubridate)
# Load the `lakers` example data set (provided by lubridate) into the session.
data(lakers)

Create the file sample.csv:

# Export `lakers` to sample.csv without the row-name column.
write.csv(lakers, file = "sample.csv", row.names = FALSE)

Create my_schema_1; the following process succeeds:

# Declare every column of `lakers` as a string field, keeping the column order.
my_schema_1 <- schema(
  lapply(names(lakers), function(col_name) {
    Field$create(name = col_name, type = string())
  })
)

# Open the CSV with the all-string schema; skip = 1 skips the first line of
# the file, i.e. the header written by write.csv().
arrow_data_1 <- arrow::open_csv_dataset(
  "sample.csv",
  schema = my_schema_1,
  skip = 1
)
arrow_data_1 %>% as.data.frame()

Create my_schema_2; arrow_data_2 %>% as.data.frame() fails because the order of the fields in my_schema_2 does not match the order of the column names in sample.csv:

# Build a schema with double fields for the numeric columns of `lakers` and
# string fields for the character columns.
# NOTE: c() places all numeric fields first and all character fields second,
# so the field order follows the column type rather than the column order of
# `lakers` (the order in which the columns were written to sample.csv).
my_schema_2 <- schema(
  c(purrr::map(lakers %>% sapply(.,is.numeric) %>% which() %>% names(),
             ~ Field$create(name=.x,type = double())),
  purrr::map(lakers %>% sapply(.,is.character) %>% which() %>% names(),
             ~ Field$create(name=.x,type = string()))
  )
)

# Open the CSV with the type-aware schema, skipping the first (header) line.
arrow_data_2 <- arrow::open_csv_dataset(
  "sample.csv",
  schema = my_schema_2,
  skip = 1
)
arrow_data_2 %>% as.data.frame()

Upvotes: 0

Views: 125

Answers (2)

Edward
Edward

Reputation: 18493

Another way to create my_schema_2 is to iterate over the column names, which preserves the order of the variables.

# Build one Arrow field per column of `lakers`, iterating over the columns in
# their original order so the schema lines up with the columns of sample.csv.
# lapply() (rather than sapply()) always returns a plain list, and dispatching
# on is.numeric()/is.character() covers double as well as integer columns.
my_schema_2 <- schema(lapply(names(lakers), \(x) {
  column <- lakers[[x]]
  if (is.numeric(column)) {
    field(name = x, type = double())
  } else if (is.character(column)) {
    field(name = x, type = string())
  } else {
    # Fail loudly instead of silently leaving a NULL entry in the field list.
    stop(
      "Unsupported column type for '", x, "': ", typeof(column),
      call. = FALSE
    )
  }
}))

# Open the CSV with the ordered schema, skipping the first (header) line.
arrow_data_2 <- arrow::open_csv_dataset(
  "sample.csv",
  schema = my_schema_2,
  skip = 1
)
arrow_data_2 %>% as.data.frame()

# A tibble: 34,624 × 13
       date opponent game_type time  period etype      team  player              result points type      x     y
      <dbl> <chr>    <chr>     <chr>  <dbl> <chr>      <chr> <chr>               <chr>   <dbl> <chr> <dbl> <dbl>
 1 20081028 POR      home      12:00      1 jump ball  OFF   NA                  NA          0 NA       NA    NA
 2 20081028 POR      home      11:39      1 shot       LAL   Pau Gasol           missed      0 hook     23    13
 3 20081028 POR      home      11:37      1 rebound    LAL   Vladimir Radmanovic NA          0 off      NA    NA
 4 20081028 POR      home      11:25      1 shot       LAL   Derek Fisher        missed      0 layup    25     6
 5 20081028 POR      home      11:23      1 rebound    LAL   Pau Gasol           NA          0 off      NA    NA
 6 20081028 POR      home      11:22      1 shot       LAL   Pau Gasol           made        2 hook     25    10
 7 20081028 POR      home      11:22      1 foul       POR   Greg Oden           NA          0 shoo…    NA    NA
 8 20081028 POR      home      11:22      1 free throw LAL   Pau Gasol           made        1 NA       NA    NA
 9 20081028 POR      home      11:00      1 foul       LAL   Vladimir Radmanovic NA          0 pers…    NA    NA
10 20081028 POR      home      10:53      1 shot       POR   LaMarcus Aldridge   made        2 jump     36    21

Upvotes: 1

Edward
Edward

Reputation: 18493

The problem is that the order of the fields in my_schema_2 differs from the order of the variables in the CSV file. You could reorder the fields so that they match:

# Reorder the schema fields so they follow the column order of `lakers`:
# match() returns, for each column name, its position within the schema.
my_schema_2 <- my_schema_2[
  match(names(lakers), names(my_schema_2))
]
arrow_data_2 <- arrow::open_csv_dataset(
  "sample.csv",
  schema = my_schema_2,
  skip = 1
)
arrow_data_2 %>% as.data.frame()

# A tibble: 34,624 × 13
       date opponent game_type time  period etype      team  player result points type      x     y
      <dbl> <chr>    <chr>     <chr>  <dbl> <chr>      <chr> <chr>  <chr>   <dbl> <chr> <dbl> <dbl>
 1 20081028 POR      home      12:00      1 jump ball  OFF   NA     NA          0 NA       NA    NA
 2 20081028 POR      home      11:39      1 shot       LAL   Pau G… missed      0 hook     23    13
 3 20081028 POR      home      11:37      1 rebound    LAL   Vladi… NA          0 off      NA    NA
 4 20081028 POR      home      11:25      1 shot       LAL   Derek… missed      0 layup    25     6
 5 20081028 POR      home      11:23      1 rebound    LAL   Pau G… NA          0 off      NA    NA
 6 20081028 POR      home      11:22      1 shot       LAL   Pau G… made        2 hook     25    10
 7 20081028 POR      home      11:22      1 foul       POR   Greg … NA          0 shoo…    NA    NA
 8 20081028 POR      home      11:22      1 free throw LAL   Pau G… made        1 NA       NA    NA
 9 20081028 POR      home      11:00      1 foul       LAL   Vladi… NA          0 pers…    NA    NA
10 20081028 POR      home      10:53      1 shot       POR   LaMar… made        2 jump     36    21
# ℹ 34,614 more rows
# ℹ Use `print(n = ...)` to see more rows

Upvotes: 1

Related Questions