Reputation: 1362
PROBLEM: I want to convert a column from named lists into a data frame. However now there are only 9 items, problems will appear in other cases when variables in lists are up to 120. So, is there an efficient way to extract all the variables in inside the lists without calling all one by one?
HERE IS THE DPUT
FOR FOO
:
foo <- structure(list(column_a = c("a", "b", "c"), column_b = list(list(country_code = "US", url = "https://api.twitter.com/1.1/geo/id/c3f37afa9efcf94b.json", country = "United States", place_type = "city", bounding_box = list(type = "Polygon", coordinates = structure(c(-97.928935, -97.928935, -97.580513, -97.580513, 30.127892, 30.518799, 30.518799, 30.127892), .Dim = c(1L, 4L, 2L))), full_name = "Austin, TX", attributes = structure(list(), .Names = character(0)), id = "c3f37afa9efcf94b", name = "Austin"), list(country_code = "UG", url = "https://api.twitter.com/1.1/geo/id/0092409a629e836c.json", country = "Uganda", place_type = "admin", bounding_box = list(type = "Polygon", coordinates = structure(c(32.192297, 32.192297, 32.683699, 32.683699, -0.147789, 0.585072, 0.585072, -0.147789), .Dim = c(1L, 4L, 2L))), full_name = "Wakiso, Uganda", attributes = structure(list(), .Names = character(0)), id = "0092409a629e836c", name = "Wakiso"), list(country_code = "US", url = "https://api.twitter.com/1.1/geo/id/080b8d8543aab399.json", country = "United States", place_type = "city", bounding_box = list(type = "Polygon", coordinates = structure(c(-93.399443, -93.399443, -93.203245, -93.203245, 44.78542, 44.863519, 44.863519, 44.78542), .Dim = c(1L, 4L, 2L))), full_name = "Bloomington, MN", attributes = structure(list(), .Names = character(0)), id = "080b8d8543aab399", name = "Bloomington"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -3L))
DATA:
My foo
data frame has two columns column_a
with letters and column_b
which is a column of lists. Each list has 9 items.
library(tidyverse)
foo
#> # A tibble: 3 x 2
#> column_a column_b
#> <chr> <list>
#> 1 a <named list [9]>
#> 2 b <named list [9]>
#> 3 c <named list [9]>
str(foo[1,])
#> Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. of 2 variables:
#> $ column_a: chr "a"
#> $ column_b:List of 1
#> ..$ :List of 9
#> .. ..$ country_code: chr "US"
#> .. ..$ url : chr "https://api.twitter.com/1.1/geo/id/c3f37afa9efcf94b.json"
#> .. ..$ country : chr "United States"
#> .. ..$ place_type : chr "city"
#> .. ..$ bounding_box:List of 2
#> .. .. ..$ type : chr "Polygon"
#> .. .. ..$ coordinates: num [1, 1:4, 1:2] -97.9 -97.9 -97.6 -97.6 30.1 ...
#> .. ..$ full_name : chr "Austin, TX"
#> .. ..$ attributes : Named list()
#> .. ..$ id : chr "c3f37afa9efcf94b"
#> .. ..$ name : chr "Austin"
SOLUTION A: A solution is to call all the names of the variables in the lists and extract them using map_chr
one by one.
foo %>%
mutate(
country_code = map_chr(column_b, "country_code"),
country = map_chr(column_b, "country")
)
#> # A tibble: 3 x 4
#> column_a column_b country_code country
#> <chr> <list> <chr> <chr>
#> 1 a <named list [9]> US United States
#> 2 b <named list [9]> UG Uganda
#> 3 c <named list [9]> US United States
SOLUTION B: Following this link, the solution is to call two times map
function within mutate
. But I had no success:
foo %>%
mutate(repo_info = column_b %>%
map(~ .x %>%
map_df(`[`, c("country_code", "country")))) %>%
select(-column_b) %>%
unnest()
#> # A tibble: 6 x 10
#> column_a country_code url country place_type bounding_box full_name
#> <chr> <chr> <chr> <chr> <chr> <list> <chr>
#> 1 a <NA> <NA> <NA> <NA> <NULL> <NA>
#> 2 a <NA> <NA> <NA> <NA> <NULL> <NA>
#> 3 b <NA> <NA> <NA> <NA> <NULL> <NA>
#> 4 b <NA> <NA> <NA> <NA> <NULL> <NA>
#> 5 c <NA> <NA> <NA> <NA> <NULL> <NA>
#> 6 c <NA> <NA> <NA> <NA> <NULL> <NA>
#> # … with 3 more variables: attributes <list>, id <chr>, name <chr>
Created on 2019-08-19 by the reprex package (v0.3.0)
Upvotes: 3
Views: 1033
Reputation: 12839
Here is a jsonlite
trick (converting back and forth to use the power of fromJSON
).
Also, was the data originally JSON? If yes, fromJSON(flatten = TRUE)
directly might be more sensible.
library(jsonlite)
foo$column_b %>%
toJSON(auto_unbox = TRUE) %>%
fromJSON(flatten = TRUE)
# Observations: 3
# Variables: 9
# $ country_code <chr> "US", "UG", "US"
# $ url <chr> "https://api.twitter.com/1.1/geo/id/c3f37afa…
# $ country <chr> "United States", "Uganda", "United States"
# $ place_type <chr> "city", "admin", "city"
# $ full_name <chr> "Austin, TX", "Wakiso, Uganda", "Bloomington…
# $ id <chr> "c3f37afa9efcf94b", "0092409a629e836c", "080…
# $ name <chr> "Austin", "Wakiso", "Bloomington"
# $ bounding_box.type <chr> "Polygon", "Polygon", "Polygon"
# $ bounding_box.coordinates <list> [<array[1 x 4 x 2]>, <array[1 x 4 x 2]>, <a…
Upvotes: 0
Reputation: 887851
One option is after extracting the columns, convert to a tibble
and then do the unnest
library(dplyr)
library(tidyr)
foo %>%
mutate(out = map(column_b, ~ .x[c( "country_code", "country")] %>%
as_tibble)) %>%
unnest(out)
# A tibble: 3 x 4
# column_a column_b country_code country
# <chr> <list> <chr> <chr>
#1 a <named list [9]> US United States
#2 b <named list [9]> UG Uganda
#3 c <named list [9]> US United States
If we need to extract the full columns
library(tibble)
foo %>%
mutate(out = map(column_b, enframe)) %>%
unnest(out) %>%
spread(name, value) %>%
unnest(setdiff(names(.), c("column_b", "attributes","bounding_box")))
# A tibble: 3 x 11
# column_a column_b attributes bounding_box country country_code full_name id name place_type url
# <chr> <list> <list> <list> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 a <named list … <named list [… <named list [2… United St… US Austin, TX c3f37afa9… Austin city https://api.twitter.com/1.1/geo…
#2 b <named list … <named list [… <named list [2… Uganda UG Wakiso, Uga… 0092409a6… Wakiso admin https://api.twitter.com/1.1/geo…
#3 c <named list … <named list [… <named list [2… United St… US Bloomington… 080b8d854… Bloomin… city https://api.twitter.com/1.1/geo…
Upvotes: 2