Tito Sanz
Tito Sanz

Reputation: 1362

From column of lists to a data frame in a tidyverse way in r

PROBLEM: I want to convert a column from named lists into a data frame. However now there are only 9 items, problems will appear in other cases when variables in lists are up to 120. So, is there an efficient way to extract all the variables in inside the lists without calling all one by one?

HERE IS THE DPUT FOR FOO:

foo <- structure(list(column_a = c("a", "b", "c"), column_b = list(list(country_code = "US", url = "https://api.twitter.com/1.1/geo/id/c3f37afa9efcf94b.json", country = "United States", place_type = "city", bounding_box = list(type = "Polygon", coordinates = structure(c(-97.928935, -97.928935, -97.580513, -97.580513, 30.127892, 30.518799, 30.518799, 30.127892), .Dim = c(1L, 4L, 2L))), full_name = "Austin, TX", attributes = structure(list(), .Names = character(0)), id = "c3f37afa9efcf94b", name = "Austin"), list(country_code = "UG", url = "https://api.twitter.com/1.1/geo/id/0092409a629e836c.json", country = "Uganda", place_type = "admin", bounding_box = list(type = "Polygon", coordinates = structure(c(32.192297, 32.192297, 32.683699, 32.683699, -0.147789, 0.585072, 0.585072, -0.147789), .Dim = c(1L, 4L, 2L))), full_name = "Wakiso, Uganda", attributes = structure(list(), .Names = character(0)), id = "0092409a629e836c", name = "Wakiso"), list(country_code = "US", url = "https://api.twitter.com/1.1/geo/id/080b8d8543aab399.json", country = "United States", place_type = "city", bounding_box = list(type = "Polygon", coordinates = structure(c(-93.399443, -93.399443, -93.203245, -93.203245, 44.78542, 44.863519, 44.863519, 44.78542), .Dim = c(1L, 4L, 2L))), full_name = "Bloomington, MN", attributes = structure(list(), .Names = character(0)), id = "080b8d8543aab399", name = "Bloomington"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -3L))

DATA: My foo data frame has two columns column_a with letters and column_b which is a column of lists. Each list has 9 items.

library(tidyverse)

foo
#> # A tibble: 3 x 2
#>   column_a column_b        
#>   <chr>    <list>          
#> 1 a        <named list [9]>
#> 2 b        <named list [9]>
#> 3 c        <named list [9]>

str(foo[1,])
#> Classes 'tbl_df', 'tbl' and 'data.frame':    1 obs. of  2 variables:
#>  $ column_a: chr "a"
#>  $ column_b:List of 1
#>   ..$ :List of 9
#>   .. ..$ country_code: chr "US"
#>   .. ..$ url         : chr "https://api.twitter.com/1.1/geo/id/c3f37afa9efcf94b.json"
#>   .. ..$ country     : chr "United States"
#>   .. ..$ place_type  : chr "city"
#>   .. ..$ bounding_box:List of 2
#>   .. .. ..$ type       : chr "Polygon"
#>   .. .. ..$ coordinates: num [1, 1:4, 1:2] -97.9 -97.9 -97.6 -97.6 30.1 ...
#>   .. ..$ full_name   : chr "Austin, TX"
#>   .. ..$ attributes  : Named list()
#>   .. ..$ id          : chr "c3f37afa9efcf94b"
#>   .. ..$ name        : chr "Austin"

SOLUTION A: A solution is to call all the names of the variables in the lists and extract them using map_chr one by one.

foo %>%
  mutate(
    country_code = map_chr(column_b, "country_code"),
    country = map_chr(column_b, "country")
  )
#> # A tibble: 3 x 4
#>   column_a column_b         country_code country      
#>   <chr>    <list>           <chr>        <chr>        
#> 1 a        <named list [9]> US           United States
#> 2 b        <named list [9]> UG           Uganda       
#> 3 c        <named list [9]> US           United States

SOLUTION B: Following this link, the solution is to call two times map function within mutate. But I had no success:

foo %>%
  mutate(repo_info = column_b %>%
    map(~ .x %>%
      map_df(`[`, c("country_code", "country")))) %>%
  select(-column_b) %>%
  unnest()
#> # A tibble: 6 x 10
#>   column_a country_code url   country place_type bounding_box full_name
#>   <chr>    <chr>        <chr> <chr>   <chr>      <list>       <chr>    
#> 1 a        <NA>         <NA>  <NA>    <NA>       <NULL>       <NA>     
#> 2 a        <NA>         <NA>  <NA>    <NA>       <NULL>       <NA>     
#> 3 b        <NA>         <NA>  <NA>    <NA>       <NULL>       <NA>     
#> 4 b        <NA>         <NA>  <NA>    <NA>       <NULL>       <NA>     
#> 5 c        <NA>         <NA>  <NA>    <NA>       <NULL>       <NA>     
#> 6 c        <NA>         <NA>  <NA>    <NA>       <NULL>       <NA>     
#> # … with 3 more variables: attributes <list>, id <chr>, name <chr>

Created on 2019-08-19 by the reprex package (v0.3.0)

Upvotes: 3

Views: 1033

Answers (2)

Aur&#232;le
Aur&#232;le

Reputation: 12839

Here is a jsonlite trick (converting back and forth to use the power of fromJSON).
Also, was the data originally JSON? If yes, fromJSON(flatten = TRUE) directly might be more sensible.

library(jsonlite)

foo$column_b %>% 
  toJSON(auto_unbox = TRUE) %>% 
  fromJSON(flatten = TRUE) 
# Observations: 3
# Variables: 9
# $ country_code             <chr> "US", "UG", "US"
# $ url                      <chr> "https://api.twitter.com/1.1/geo/id/c3f37afa…
# $ country                  <chr> "United States", "Uganda", "United States"
# $ place_type               <chr> "city", "admin", "city"
# $ full_name                <chr> "Austin, TX", "Wakiso, Uganda", "Bloomington…
# $ id                       <chr> "c3f37afa9efcf94b", "0092409a629e836c", "080…
# $ name                     <chr> "Austin", "Wakiso", "Bloomington"
# $ bounding_box.type        <chr> "Polygon", "Polygon", "Polygon"
# $ bounding_box.coordinates <list> [<array[1 x 4 x 2]>, <array[1 x 4 x 2]>, <a…

Upvotes: 0

akrun
akrun

Reputation: 887851

One option is after extracting the columns, convert to a tibble and then do the unnest

library(dplyr)
library(tidyr)
foo %>%
   mutate(out = map(column_b,   ~ .x[c( "country_code", "country")] %>% 
               as_tibble)) %>% 
   unnest(out)
# A tibble: 3 x 4
#  column_a column_b         country_code country      
#  <chr>    <list>           <chr>        <chr>        
#1 a        <named list [9]> US           United States
#2 b        <named list [9]> UG           Uganda       
#3 c        <named list [9]> US           United States

If we need to extract the full columns

library(tibble)
foo %>%
    mutate(out = map(column_b, enframe)) %>% 
    unnest(out) %>% 
    spread(name, value) %>%
    unnest(setdiff(names(.), c("column_b", "attributes","bounding_box")))
# A tibble: 3 x 11
#  column_a column_b      attributes     bounding_box    country    country_code full_name    id         name     place_type url                             
#  <chr>    <list>        <list>         <list>          <chr>      <chr>        <chr>        <chr>      <chr>    <chr>      <chr>                           
#1 a        <named list … <named list [… <named list [2… United St… US           Austin, TX   c3f37afa9… Austin   city       https://api.twitter.com/1.1/geo…
#2 b        <named list … <named list [… <named list [2… Uganda     UG           Wakiso, Uga… 0092409a6… Wakiso   admin      https://api.twitter.com/1.1/geo…
#3 c        <named list … <named list [… <named list [2… United St… US           Bloomington… 080b8d854… Bloomin… city       https://api.twitter.com/1.1/geo…

Upvotes: 2

Related Questions