Reputation: 4284
I need to read a bunch of files from a grouped list, and combine them based on the group (files from the same group will have the same columns and thus can be reduced with bind_rows()
.
I can't seem to get a handle on how the data is changing as I move through the purrr::map()
function as I keep geting warnings that I can't use the $
on atomic vectors.
The first thing I do is split by the group so that I get a list of a list of the files I want to read within each group. Then I use map to go through each item in that list, and a second map to go through the rows on each sublist to read the files. However something happens at that level where it no longer treats the data the same as if I was just working with a single group at the top level.
(The lack of being able to debug and look at my environment inside a map function is really an issue in understanding the mechanics.)
require(tidyverse)
#> Loading required package: tidyverse
x <- structure(list(survey = c("adm2014", "adm2015", "adm2016", "eap2008",
"eap2009", "eap2011", "eap2012", "eap2013", "eap2014", "eap2015",
"eap2016", "ef2008a", "ef2008b", "ef2008c", "ef2008cp", "ef2008d",
"ef2009a", "ef2009b", "ef2009c", "ef2009d", "ef2010a", "ef2010b",
"ef2010c", "ef2010cp", "ef2010d", "ef2011a", "ef2011b", "ef2011c",
"ef2011d", "ef2012a", "ef2012b", "ef2012c", "ef2012cp", "ef2012d",
"ef2013a", "ef2013b", "ef2013c", "ef2013d", "ef2014a", "ef2014b",
"ef2014c", "ef2014cp", "ef2014d", "ef2015a", "ef2015b", "ef2015c",
"ef2015d", "ef2016a", "ef2016b", "ef2016c", "ef2016cp", "ef2016d",
"efest2008", "efest2009", "effy2008", "effy2009", "effy2010",
"effy2011", "effy2012", "effy2013", "effy2014", "effy2015", "effy2016",
"effy2017", "efia2008", "efia2009", "efia2011", "efia2012", "efia2013",
"efia2014", "efia2015", "efia2016", "efia2017", "f0708_f1a",
"f0708_f2", "f0708_f3", "f0809_f1a", "f0809_f2", "f0809_f3",
"f0910_f1a", "f0910_f2", "f0910_f3", "f1011_f1a", "f1011_f2",
"f1011_f3", "f1112_f1a", "f1112_f2", "f1112_f3", "f1213_f1a",
"f1213_f2", "f1213_f3", "f1314_f1a", "f1314_f2", "f1314_f3",
"f1415_f1a", "f1415_f2", "f1415_f3", "f1516_f1a", "f1516_f2",
"f1516_f3", "gr2008", "gr2008_l2", "gr2009", "gr2009_l2", "gr200_08",
"gr200_09", "gr200_10", "gr200_11", "gr200_12", "gr200_13", "gr200_14",
"gr200_15", "gr200_16", "gr2010", "gr2010_l2", "gr2011", "gr2011_l2",
"gr2012", "gr2012_l2", "gr2013", "gr2013_l2", "gr2014", "gr2014_l2",
"gr2015", "gr2015_l2", "gr2016", "gr2016_l2", "hd2008", "hd2009",
"hd2010", "hd2011", "hd2012", "hd2013", "hd2014", "hd2015", "hd2017",
"ic2008", "ic2008_ay", "ic2008_py", "ic2009", "ic2009_ay", "ic2009_py",
"ic2010", "ic2010_ay", "ic2010_py", "ic2011", "ic2011_ay", "ic2011_py",
"ic2012", "ic2012_ay", "ic2012_py", "ic2013", "ic2013_ay", "ic2013_py",
"ic2014", "ic2014_ay", "ic2014_py", "ic2015", "ic2015_ay", "ic2015_py",
"ic2016", "ic2016_ay", "ic2016_py", "ic2017", "ic2017_ay", "ic2017_py",
"s2008_abd", "s2008_cn", "s2008_f", "s2008_g", "s2009_abd", "s2009_cn",
"s2009_f", "s2009_g", "s2010_abd", "s2010_cn", "s2010_f", "s2010_g",
"s2011_abd", "s2011_cn", "s2011_f", "s2011_g", "sal2008_a", "sal2008_a_lt9",
"sal2008_b", "sal2008_faculty", "sal2009_a", "sal2009_a_lt9",
"sal2009_b", "sal2009_faculty", "sal2010_a", "sal2010_a_lt9",
"sal2010_b", "sal2010_faculty", "sal2011_a", "sal2011_a_lt9",
"sal2011_faculty"), survgroup = c("adm", "adm", "adm", "eap",
"eap", "eap", "eap", "eap", "eap", "eap", "eap", "efa", "efb",
"efc", "efcp", "efd", "efa", "efb", "efc", "efd", "efa", "efb",
"efc", "efcp", "efd", "efa", "efb", "efc", "efd", "efa", "efb",
"efc", "efcp", "efd", "efa", "efb", "efc", "efd", "efa", "efb",
"efc", "efcp", "efd", "efa", "efb", "efc", "efd", "efa", "efb",
"efc", "efcp", "efd", "efest", "efest", "effy", "effy", "effy",
"effy", "effy", "effy", "effy", "effy", "effy", "effy", "efia",
"efia", "efia", "efia", "efia", "efia", "efia", "efia", "efia",
"f_f1a", "f_f2", "f_f3", "f_f1a", "f_f2", "f_f3", "f_f1a", "f_f2",
"f_f3", "f_f1a", "f_f2", "f_f3", "f_f1a", "f_f2", "f_f3", "f_f1a",
"f_f2", "f_f3", "f_f1a", "f_f2", "f_f3", "f_f1a", "f_f2", "f_f3",
"f_f1a", "f_f2", "f_f3", "gr", "gr_l2", "gr", "gr_l2", "gr_08",
"gr_09", "gr_10", "gr_11", "gr_12", "gr_13", "gr_14", "gr_15",
"gr_16", "gr", "gr_l2", "gr", "gr_l2", "gr", "gr_l2", "gr", "gr_l2",
"gr", "gr_l2", "gr", "gr_l2", "gr", "gr_l2", "hd", "hd", "hd",
"hd", "hd", "hd", "hd", "hd", "hd", "ic", "ic_ay", "ic_py", "ic",
"ic_ay", "ic_py", "ic", "ic_ay", "ic_py", "ic", "ic_ay", "ic_py",
"ic", "ic_ay", "ic_py", "ic", "ic_ay", "ic_py", "ic", "ic_ay",
"ic_py", "ic", "ic_ay", "ic_py", "ic", "ic_ay", "ic_py", "ic",
"ic_ay", "ic_py", "s_abd", "s_cn", "s_f", "s_g", "s_abd", "s_cn",
"s_f", "s_g", "s_abd", "s_cn", "s_f", "s_g", "s_abd", "s_cn",
"s_f", "s_g", "sal_a", "sal_a_lt9", "sal_b", "sal_faculty", "sal_a",
"sal_a_lt9", "sal_b", "sal_faculty", "sal_a", "sal_a_lt9", "sal_b",
"sal_faculty", "sal_a", "sal_a_lt9", "sal_faculty")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -197L))
x %>%
split(.$survgroup) %>%
map(function(currentgroup) {
#currentgroup should now be a tibble of each group.
currentgroup %>%
map(function(singlesurvey) { #singlesurvey should be each row in the group
x <- read_csv(path_expand(paste0("~data/IPEDS/API Pulls/datadownloaded/", singlesurvey$survey, ".csv")))
}) %>% bind_rows()
})
#> Error in path_expand(paste0("~data/IPEDS/API Pulls/datadownloaded/", singlesurvey$survey, : could not find function "path_expand"
Created on 2018-11-12 by the reprex package (v0.2.1)
Upvotes: 2
Views: 780
Reputation: 887088
The issue is that we need to loop through the individual files in the column instead of the looping through the columns in the dataset. In the OP's post, the second map
loops through the data.frame
with a single column. Here, the basic unit is a data.frame
with one column. If the column was extracted as a vector
, the unit becomes vector
and it loops through each element of the vector
x %>%
split(.$survgroup) %>%
map(~ .x %>%
pull(survey) %>%
map(~ .x %>%
paste0("~data/IPEDS/API Pulls/datadownloaded/", ., '.csv') %>%
path.expand %>%
read_csv)))
Upvotes: 3
Reputation: 1635
An alternative solution is to use list-columns to read the data frames into a column, and do the split afterwards.
x %>%
mutate(data = map(survey, ~ read_csv(path.expand(paste0("~data/IPEDS/API Pulls/datadownloaded/", .x, ".csv"))))) %>%
unnest() %>%
split(.$survgroup)
Upvotes: 1