Working with multiple files across multiple plans in drake

Question

I am attempting to use the drake R package to process multiple file inputs across multiple plans, so that I can build up my targets iteratively and test what works at each stage. Below is a trivial reprex showing what I am trying to accomplish. The official docs show how to do this kind of thing within a single plan, but my difficulty is that I want to do it across multiple plans.

I cannot work out what the transform should be to get the correct input names (the targets from read_in_plan) into munge_plan.

library(drake)
# Look up dplyr's DESCRIPTION metadata once and keep it under both names.
pkg_description <- utils::packageDescription("dplyr")
dplyr_version_dep <- pkg_description

# Stand-in for a real munging step. `input` is accepted but never used; the
# function only emits a status message and returns message()'s invisible NULL.
munge_data <- function(input) {
  message("I did something!")
}

# CSV paths to ingest; each one becomes its own `data_*` target below.
file_inputs <- c("file1.csv", "file2.csv")

# Plan 1: read every input file.
read_in_plan <- drake_plan(
  # Track dplyr's package metadata as a target of its own.
  pkg = utils::packageDescription("dplyr"),
  # map() expands this into one read.csv() target per element of file_inputs;
  # `!!` splices the vector's values into the plan when it is built.
  data = target(
    read.csv(input),
    transform = map(input = !!file_inputs)
  )
)

# Print the expanded plan.
read_in_plan
#> # A tibble: 3 x 2
#>   target         command                           
#>   <chr>          <expr>
#> 1 pkg            utils::packageDescription("dplyr")
#> 2 data_file1.csv read.csv("file1.csv")             
#> 3 data_file2.csv read.csv("file2.csv")

# Plan 2: munge the targets produced by read_in_plan. Only data_file1.csv is
# wired in here, by hard-coding its target name.
munge_plan <- drake_plan(munged = munge_data(data_file1.csv))

# Print the plan.
munge_plan
#> # A tibble: 1 x 2
#>   target command                   
#>   <chr>  <expr>
#> 1 munged munge_data(data_file1.csv)

# but really I want to do munge data on all of the
# data_file1.csv
# data_file2.csv

# munge_data_proper = drake_plan(
#   munged = target(
#     # some kind of transform here
#   )
# )

# Stack both plans into a single plan; the build step is left commented out.
full_plan <- bind_plans(read_in_plan, munge_plan)
# make(full_plan)

Created on 2019-05-23 by the reprex package (v0.2.1)

landau · Accepted Answer

Transformations are designed to all happen inside a single call to drake_plan(), so it is difficult to split up the data_* targets and the munged_* targets below into different plans.

library(drake)
# Look up dplyr's DESCRIPTION metadata once and keep it under both names;
# `dplyr_version_dep` is the command of the `pkg` target below.
pkg_description <- utils::packageDescription("dplyr")
dplyr_version_dep <- pkg_description

# Stand-in for a real munging step. `input` is accepted but never used; the
# function only emits a status message and returns message()'s invisible NULL.
munge_data <- function(input) {
  message("I did something!")
}

# CSV paths to ingest; each one becomes its own data target.
file_inputs <- c("file1.csv", "file2.csv")

# Single plan holding both the read step and the munge step, so the second
# transform can refer to the targets generated by the first.
plan <- drake_plan(
  pkg = target(
    dplyr_version_dep,
    # Triggers are checked every time, even when the command is not re-run,
    # so a change in dplyr's package description is noticed.
    trigger = trigger(change = utils::packageDescription("dplyr"))
  ),
  # One read.csv() target per file; with .id = FALSE the targets are named
  # data, data_2 rather than after the file names.
  data = target(
    read.csv(input),
    transform = map(input = !!file_inputs, .id = FALSE)
  ),
  # map(data) reuses the targets generated by the `data` transform above,
  # giving one munged_* target per data target.
  munged = target(
    munge_data(data),
    transform = map(data)
  )
)

# Show the expanded plan as drake_plan() source code.
drake_plan_source(plan)
#> drake_plan(
#>   pkg = target(
#>     command = dplyr_version_dep,
#>     trigger = trigger(
#>       change = utils::packageDescription("dplyr")
#>     )
#>   ),
#>   data = read.csv("file1.csv"),
#>   data_2 = read.csv("file2.csv"),
#>   munged_data = munge_data(data),
#>   munged_data_2 = munge_data(data_2)
#> )

Created on 2019-05-23 by the reprex package (v0.3.0)

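If you do need separate plans, one hack is to grab the generated target names from drake_plan(trace = TRUE) and feed them into the second plan yourself. This approach is brittle, but it works for this small example.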

library(drake)
# Look up dplyr's DESCRIPTION metadata once and keep it under both names;
# `dplyr_version_dep` is the command of the `pkg` target below.
pkg_description <- utils::packageDescription("dplyr")
dplyr_version_dep <- pkg_description

# Stand-in for a real munging step. `input` is accepted but never used; the
# function only emits a status message and returns message()'s invisible NULL.
munge_data <- function(input) {
  message("I did something!")
}

# CSV paths to ingest; each one becomes its own data target.
file_inputs <- c("file1.csv", "file2.csv")

# First plan: the read step only.
plan1 <- drake_plan(
  pkg = target(
    dplyr_version_dep,
    # Triggers are checked every time, even when the command is not re-run,
    # so a change in dplyr's package description is noticed.
    trigger = trigger(change = utils::packageDescription("dplyr"))
  ),
  # One read.csv() target per file; with .id = FALSE the targets are named
  # data, data_2 rather than after the file names.
  data = target(
    read.csv(input),
    transform = map(input = !!file_inputs, .id = FALSE)
  ),
  # Keep the transform bookkeeping as extra plan columns (`input`, `data`)
  # so the generated target names can be recovered afterwards.
  trace = TRUE
)

# Print the plan, including the trace columns.
plan1
#> # A tibble: 3 x 5
#>   target command         trigger                           input      data 
#>   <chr>  <expr>          <expr>                            <chr>      <chr>
#> 1 pkg    dplyr_version_… trigger(change = utils::packageD… <NA>       <NA> 
#> 2 data   read.csv("file… NA                              … "\"file1.… data 
#> 3 data_2 read.csv("file… NA                              … "\"file2.… data…

plan1$input
#> [1] NA              "\"file1.csv\"" "\"file2.csv\""

plan1$data
#> [1] NA       "data"   "data_2"

# Collect the generated target names from plan1's `data` trace column and turn
# them into symbols for the next plan's transform.
library(magrittr)
data <- plan1[["data"]] %>%
  na.omit() %>% # rows without a `data` trace entry (here: `pkg`) are NA
  unique() %>%
  rlang::syms()

# Inspect the result: a list holding one symbol per data target.
str(data)
#> List of 2
#>  $ : symbol data
#>  $ : symbol data_2

# Second plan: one munge target per symbol collected from plan1.
plan2 <- drake_plan(
  munged = target(
    munge_data(d),
    # `!!` is the key step: it splices the list of symbols into map(), so `d`
    # takes each target name (data, data_2) in turn.
    transform = map(d = !!data)
  )
)

# Print the expanded plan.
plan2
#> # A tibble: 2 x 2
#>   target        command           
#>   <chr>         <expr>
#> 1 munged_data   munge_data(data)  
#> 2 munged_data_2 munge_data(data_2)

# Stack the two plans. Only plan1's `target` and `command` columns are kept,
# which removes the trace columns (`input`, `data`).
# NOTE(review): this also drops plan1's `trigger` column, so the change
# trigger on `pkg` is not carried into full_plan - confirm that is intended.
full_plan <- bind_plans(
  dplyr::select(plan1, target, command),
  plan2
)

# Print the combined plan.
full_plan
#> # A tibble: 5 x 2
#>   target        command              
#>   <chr>         <expr>
#> 1 pkg           dplyr_version_dep    
#> 2 data          read.csv("file1.csv")
#> 3 data_2        read.csv("file2.csv")
#> 4 munged_data   munge_data(data)     
#> 5 munged_data_2 munge_data(data_2)

Created on 2019-05-23 by the reprex package (v0.3.0)

Working with multiple files across multiple plans in drake

Answers (1)

Related Questions