Reputation: 1891
I am attempting to use the `drake` R package to process multiple file inputs across multiple plans, so that I can build up my targets iteratively and test what works at each stage. Below is a trivial reprex showing what I am trying to accomplish. The official docs show how to do this kind of thing within a single plan, but my difficulty is that I want to do it across multiple plans.
I cannot work out what the transform should be to get the correct input names (the targets from `read_in_plan`) into the `munge_plan`.
library(drake)
dplyr_version_dep = pkg_description = utils::packageDescription('dplyr')
munge_data = function(input){
message("I did something!")
}
file_inputs = c("file1.csv", "file2.csv")
# get my data in
read_in_plan = drake_plan(
# make the plan dependent on changes to dplyr
pkg = utils::packageDescription('dplyr'),
data = target(
read.csv(input),
transform = map(input = !!file_inputs)
)
)
read_in_plan
#> # A tibble: 3 x 2
#> target command
#> <chr> <expr>
#> 1 pkg utils::packageDescription("dplyr")
#> 2 data_file1.csv read.csv("file1.csv")
#> 3 data_file2.csv read.csv("file2.csv")
# now do something to each of those targets
munge_plan = drake_plan(
munged = munge_data(data_file1.csv)
)
munge_plan
#> # A tibble: 1 x 2
#> target command
#> <chr> <expr>
#> 1 munged munge_data(data_file1.csv)
# but really I want to do munge data on all of the
# data_file1.csv
# data_file2.csv
# munge_data_proper = drake_plan(
# munged = target(
# # some kind of transform here
# )
# )
full_plan = bind_plans(read_in_plan,
munge_plan)
# make(full_plan)
Created on 2019-05-23 by the reprex package (v0.2.1)
Upvotes: 1
Views: 257
Reputation: 5841
Transformations are designed to all happen inside a single call to `drake_plan()`, so it is difficult to split up the `data_*` targets and the `munged_*` targets below into different plans.
library(drake)
dplyr_version_dep = pkg_description = utils::packageDescription('dplyr')
munge_data = function(input){
message("I did something!")
}
file_inputs <- c("file1.csv", "file2.csv")
plan <- drake_plan(
pkg = target(
dplyr_version_dep,
# Triggers are always checked even though commands do not always run:
trigger = trigger(change = utils::packageDescription("dplyr"))
),
data = target(
read.csv(input),
transform = map(input = !!file_inputs, .id = FALSE)
),
# Borrow from the previous transform:
munged = target(
munge_data(data),
transform = map(data)
)
)
drake_plan_source(plan)
#> drake_plan(
#> pkg = target(
#> command = dplyr_version_dep,
#> trigger = trigger(
#> change = utils::packageDescription("dplyr")
#> )
#> ),
#> data = read.csv("file1.csv"),
#> data_2 = read.csv("file2.csv"),
#> munged_data = munge_data(data),
#> munged_data_2 = munge_data(data_2)
#> )
Created on 2019-05-23 by the reprex package (v0.3.0)
For you, one hack is to grab information from `drake_plan(trace = TRUE)`. Brittle, but useful in this small example.
library(drake)
dplyr_version_dep = pkg_description = utils::packageDescription('dplyr')
munge_data = function(input){
message("I did something!")
}
file_inputs <- c("file1.csv", "file2.csv")
plan1 <- drake_plan(
pkg = target(
dplyr_version_dep,
# Triggers are always checked even though commands do not always run:
trigger = trigger(change = utils::packageDescription("dplyr"))
),
data = target(
read.csv(input),
transform = map(input = !!file_inputs, .id = FALSE)
),
trace = TRUE
)
plan1
#> # A tibble: 3 x 5
#> target command trigger input data
#> <chr> <expr> <expr> <chr> <chr>
#> 1 pkg dplyr_version_… trigger(change = utils::packageD… <NA> <NA>
#> 2 data read.csv("file… NA … "\"file1.… data
#> 3 data_2 read.csv("file… NA … "\"file2.… data…
plan1$input
#> [1] NA "\"file1.csv\"" "\"file2.csv\""
plan1$data
#> [1] NA "data" "data_2"
# Put together the data manually for the next transformation.
library(magrittr)
data <- plan1$data %>%
na.omit() %>%
unique() %>%
rlang::syms()
str(data)
#> List of 2
#> $ : symbol data
#> $ : symbol data_2
plan2 <- drake_plan(
munged = target(
munge_data(d),
transform = map(d = !!data) # !! is key
)
)
plan2
#> # A tibble: 2 x 2
#> target command
#> <chr> <expr>
#> 1 munged_data munge_data(data)
#> 2 munged_data_2 munge_data(data_2)
full_plan <- bind_plans(dplyr::select(plan1, target, command), plan2)
full_plan
#> # A tibble: 5 x 2
#> target command
#> <chr> <expr>
#> 1 pkg dplyr_version_dep
#> 2 data read.csv("file1.csv")
#> 3 data_2 read.csv("file2.csv")
#> 4 munged_data munge_data(data)
#> 5 munged_data_2 munge_data(data_2)
Created on 2019-05-23 by the reprex package (v0.3.0)
Upvotes: 2