Luc
Luc

Reputation: 958

R Flatten nested lists of different lengths (Google geocode API output) in R

I have been using the Geocoding API from Google to geocode address lists. It returns results as nested lists. The elements in the lists may vary, and sometimes there are partial matches, which produce several result lists at the top level for a single address. So far, I have saved each GoogleResult into a single data frame cell.

Here is an example of my dataframe:

    df <- structure(list(address = structure(c(3L, 1L, 2L), .Label = c("115 Civic Parade, Altona VIC 3018", 
"Civic Parade, Altona VIC 3018", "EAST LA CLARKEFIELD 3430"), class = "factor"), 
    GoogleResult = list(list(list(access_points = list(), address_components = list(
        list(long_name = "Los Angeles", short_name = "Los Angeles", 
            types = list("locality", "political")), list(long_name = "Los Angeles County", 
            short_name = "Los Angeles County", types = list("administrative_area_level_2", 
                "political")), list(long_name = "California", 
            short_name = "CA", types = list("administrative_area_level_1", 
                "political")), list(long_name = "United States", 
            short_name = "US", types = list("country", "political"))), 
        formatted_address = "Los Angeles, CA, USA", geometry = list(
            bounds = list(northeast = list(lat = 34.3373061, 
                lng = -118.1552891), southwest = list(lat = 33.7036519, 
                lng = -118.6681759)), location = list(lat = 34.0522342, 
                lng = -118.2436849), location_type = "APPROXIMATE", 
            viewport = list(northeast = list(lat = 34.3373061, 
                lng = -118.1552891), southwest = list(lat = 33.7036519, 
                lng = -118.6681759))), partial_match = TRUE, 
        place_id = "ChIJE9on3F3HwoAR9AhGJW_fL-I", types = list(
            "locality", "political")), list(access_points = list(), 
        address_components = list(list(long_name = "3430", short_name = "3430", 
            types = list("postal_code")), list(long_name = "Clarkefield", 
            short_name = "Clarkefield", types = list("locality", 
                "political")), list(long_name = "Victoria", short_name = "VIC", 
            types = list("administrative_area_level_1", "political")), 
            list(long_name = "Australia", short_name = "AU", 
                types = list("country", "political"))), formatted_address = "Clarkefield VIC 3430, Australia", 
        geometry = list(bounds = list(northeast = list(lat = -37.4364578, 
            lng = 144.8986988), southwest = list(lat = -37.5280439, 
            lng = 144.7012193)), location = list(lat = -37.497542, 
            lng = 144.8071366), location_type = "APPROXIMATE", 
            viewport = list(northeast = list(lat = -37.4364578, 
                lng = 144.8986988), southwest = list(lat = -37.5280439, 
                lng = 144.7012193))), partial_match = TRUE, place_id = "ChIJS3IdP-xX1moRkD8uRnhWBBw", 
        types = list("postal_code"))), list(list(access_points = list(), 
        address_components = list(list(long_name = "115", short_name = "115", 
            types = list("street_number")), list(long_name = "Civic Parade", 
            short_name = "Civic Parade", types = list("route")), 
            list(long_name = "Altona", short_name = "Altona", 
                types = list("locality", "political")), list(
                long_name = "Hobsons Bay City", short_name = "Hobsons Bay", 
                types = list("administrative_area_level_2", "political")), 
            list(long_name = "Victoria", short_name = "VIC", 
                types = list("administrative_area_level_1", "political")), 
            list(long_name = "Australia", short_name = "AU", 
                types = list("country", "political")), list(long_name = "3018", 
                short_name = "3018", types = list("postal_code"))), 
        formatted_address = "115 Civic Parade, Altona VIC 3018, Australia", 
        geometry = list(bounds = list(northeast = list(lat = -37.8633208, 
            lng = 144.8316509), southwest = list(lat = -37.86409, 
            lng = 144.8303929)), location = list(lat = -37.863727, 
            lng = 144.8310159), location_type = "ROOFTOP", viewport = list(
            northeast = list(lat = -37.8623564197085, lng = 144.832370880292), 
            southwest = list(lat = -37.8650543802915, lng = 144.829672919709))), 
        place_id = "ChIJBXz75NRj1moRpVRt21nooQw", types = list(
            "premise"))), list(list(access_points = list(), address_components = list(
        list(long_name = "Civic Parade", short_name = "Civic Parade", 
            types = list("route")), list(long_name = "Altona", 
            short_name = "Altona", types = list("locality", "political")), 
        list(long_name = "Hobsons Bay City", short_name = "Hobsons Bay", 
            types = list("administrative_area_level_2", "political")), 
        list(long_name = "Victoria", short_name = "VIC", types = list(
            "administrative_area_level_1", "political")), list(
            long_name = "Australia", short_name = "AU", types = list(
                "country", "political")), list(long_name = "3018", 
            short_name = "3018", types = list("postal_code"))), 
        formatted_address = "Civic Parade, Altona VIC 3018, Australia", 
        geometry = list(bounds = list(northeast = list(lat = -37.8626502, 
            lng = 144.8449271), southwest = list(lat = -37.8661171, 
            lng = 144.81081)), location = list(lat = -37.864412, 
            lng = 144.8303004), location_type = "GEOMETRIC_CENTER", 
            viewport = list(northeast = list(lat = -37.8626502, 
                lng = 144.8449271), southwest = list(lat = -37.8661171, 
                lng = 144.81081))), place_id = "EihDaXZpYyBQYXJhZGUsIEFsdG9uYSBWSUMgMzAxOCwgQXVzdHJhbGlhIi4qLAoUChIJtbGXUCti1moRKcxHhdx2QrYSFAoSCSEyccGdYdZqEXDajCF1VgQF", 
        types = list("route"))))), row.names = c(NA, -3L), class = "data.frame")

The first case has a partial match, which results in two nested lists of results.

My expected output is:

I tried things like:

lapply(df$GoogleResult, data.frame, stringsAsFactors = FALSE)

but elements differ in length...resulting in:

arguments imply differing number of rows: 0, 1

In case of partial matches, the results could be shown as two rows in the dataframe, or as an additional set of columns.

Upvotes: 6

Views: 278

Answers (3)

user8118328
user8118328

Reputation: 703

# Flatten the nested geocoding results in `df$GoogleResult` into plain columns.

# Expand the list-column so that every match (including each partial match)
# sits on its own row next to its address.
df <- df %>% unnest(cols = GoogleResult)

# Take the results from the unnested `df`; no other data frame is defined at
# this point, so the lookup has to go through `df`.
GoogleResult <- as.list(df$GoogleResult)

# Remove one level of nesting inside each result so that as.data.frame() can
# spread the remaining elements across columns.
GoogleResult <- lapply(GoogleResult, function(i) as.list(unlist(i, recursive = FALSE)))

# Results do not all have the same elements; rbind.fill() pads the columns a
# result lacks with NA instead of failing.
GoogleResult <- plyr::rbind.fill(lapply(GoogleResult, as.data.frame))

# Re-attach the address; `df` and `GoogleResult` now have one row per match.
df <- cbind(address = df$address, GoogleResult)

This returns a data frame with the following properties (in line with the stated criteria). However it doesn't seem like very clean data.

  • a data frame with all elements of all lists as columns
  • all columns named according to the list element the value stems from
  • 1 row per match for partial matches

Upvotes: 0

TaylorV
TaylorV

Reputation: 906

I'm not able to follow how you got the df object in that form based on what is returned from the ggmap::geocode() function. Instead, I tried just taking a stab at flattening out the exact return object from running ggmap::geocode() from the documentation example, but making sure to use source="google" and output="all".

The function below won't accept vectors, but I think you should be able to edit it pretty easily to do so. Specifically, geo_mat <- matrix(geo2, nrow=1, byrow=T) is the line of code that makes this not very "vector-friendly." That is simply the method I chose to get the data.frame to be wide instead of long.

I suspect you're not super concerned about efficiency since the Google API is rate limited anyway, so this function might just work best in a loop or something.

It's also worth noting that there are two named components to the list object returned by the ggmap::geocode() function. One is results and the other is status. The status component will not be very interesting to you unless there is some type of error during the geocoding process. Otherwise, it will just be the string "OK". The function below does not use the status portion of the list object in any way.

I definitely see what you're saying about "lining up" the different column names based on the variable number of elements in the returned JSON. I use the google map API very often, and my strategy for this is to get a good sample of the addresses I'm geocoding, then get an idea of what fields I want to capture, then write a function to specifically hunt down those fields, returning NA if they don't exist.

Anyway, here's my attempt at flattening a single returned value from the geocode function.

library(jsonlite)
library(purrr)
library(ggmap)
register_google(key=key)  # <-- I stored my key in a variable called key...

#' Flatten one geocoding response into a single-row data frame
#'
#' Tested only with the output of `ggmap::geocode()` when `source = "google"`
#' and `output = "all"`.
#'
#' @param geocode_output A list with a `results` element. Any other element
#'   (such as `status`) is ignored.
#' @return A data frame with one row and one column per leaf value found in
#'   `results`. Column names are the flattened list names, lower-cased, with
#'   periods replaced by underscores. When `results` holds no values, a data
#'   frame with zero rows and zero columns is returned.
flatten_geocode <- function(geocode_output) {
  results <- geocode_output$results

  # Nothing to flatten (for instance, no match was found): unlist() would
  # give NULL, which matrix() rejects, so return an empty data frame instead.
  if (length(unlist(results)) == 0) {
    return(data.frame())
  }

  # isolate the results by removing one level of nesting
  geo1 <- purrr::flatten(results)

  # unlist (this retains the flattened names, which are used below)
  geo2 <- unlist(geo1)

  # convert to a one-row matrix, then to a data frame, so the output is wide
  # instead of long
  geo_mat <- matrix(geo2, nrow = 1, byrow = TRUE)
  geo_df <- data.frame(geo_mat, stringsAsFactors = FALSE)

  # Clean the column names up. The second data.frame() call is there so that
  # `check.names` de-duplicates repeated names before the periods are
  # replaced by underscores.
  names(geo_df) <- names(geo2)
  geo_df <- data.frame(geo_df, stringsAsFactors = FALSE, check.names = TRUE)
  names(geo_df) <- gsub("\\.", "_", tolower(names(geo_df)))

  geo_df
}



# Example: geocode a single address, asking for the full response
# (output="all") from the Google source.
this_geocode_output <- ggmap::geocode("1600 pennsylvania avenue, washington dc",
           source="google", output="all")

# Flatten the response with flatten_geocode() and print the result.
df_output <- flatten_geocode(this_geocode_output)
df_output

Upvotes: 0

Onyambu
Onyambu

Reputation: 79288

can you try something like:

# Pipeline over `df`: unnest the GoogleResult list-column twice, drop empty
# entries, turn every remaining entry into a data frame tagged with its
# address, and stack the pieces into one data frame.
# NOTE(review): toJSON()/fromJSON() are presumably jsonlite's; confirm that
# jsonlite, purrr, dplyr and tidyr are attached before running.
df %>% 
 unnest(col = GoogleResult) %>% unnest(col = GoogleResult)%>%
  # keep only the rows whose GoogleResult entry has non-zero length
  filter(lengths(GoogleResult)>0)%>%
  # for each entry (.x) and its address (.y): round-trip the entry through
  # JSON, wrap it in a data frame, prepend the address, then unnest again
  {map2(.$GoogleResult,.$address,
        ~cbind(address = .y,data.frame(fromJSON(toJSON(.x))))%>%unnest())}%>%
  # row-bind the list of data frames produced by map2()
  plyr::rbind.fill()

Upvotes: 1

Related Questions