tim_jc
tim_jc

Reputation: 13

Using purrr to extract values from nested dataframe based on condition

I'm working with a set of patient test results some of which are positive and negative. I'm reducing to individual patient level using dplyr::nest() and then extracting values for the first positive test only using purrr::map() and a function I've written. My dataset isn't huge - ~40k unique patients, ~110k test results - but I gave up running my script after 40 mins. I'm sure there's a better way of extracting these values but am struggling to work it out. The code chunk below illustrates the method I'm using (though obviously this runs in no time).

library(tidyverse)

example_data <- tribble(
  
  ~patient, ~is_first_positive, ~score_1, ~score_2,
  "A", F, 10, 45,
  "A", T, 16, 76,
  "A", F, 24, 86,
  "B", T, 17, 5,
  "B", F, 24, 22,
  "B", F, 55, 97,
  "C", F, 67, 48,
  "C", F, 23, 38,
  "C", F, 45, 16
  
)

example_data <- example_data %>% 
  group_by(patient) %>% 
  nest()

# function to extract values based on value of another column
get_field <- function(df, logical_field, rtn_field) {
  
  df <- df %>% filter_(logical_field)
  
  if(nrow(df)==0) {
    return(NA_integer_)
  } else {
    df %>% pull({{rtn_field}}) %>% as.integer() %>% return()
  }

}

# Use purrr to run function against each nested df
example_data <- example_data %>% 
  mutate(first_positive_score1 = map_int(data, ~get_field(., "is_first_positive", score_1)),
         first_positive_score2 = map_int(data, ~get_field(., "is_first_positive", score_2)))

Upvotes: 0

Views: 359

Answers (2)

rjen
rjen

Reputation: 1972

If you can forgive the long lines, you can use map() in the following way.

library(dplyr)
library(tibble)
library(purrr)

example_data %>%
  mutate(score_1 = as.double(map(data, ~ deframe(.x[2])[which(deframe(.x[1]) == TRUE)])),
         score_2 = as.double(map(data, ~ deframe(.x[3])[which(deframe(.x[1]) == TRUE)]))) 

#   patient data             score_1 score_2
#   <chr>   <list>             <dbl>   <dbl>
# 1 A       <tibble [3 × 3]>      16      76
# 2 B       <tibble [3 × 3]>      17       5
# 3 C       <tibble [3 × 3]>      NA      NA

Upvotes: 0

Vincent
Vincent

Reputation: 17725

Here's an alternative that should be quite fast:

  1. Filter to keep only first positive score
  2. Merge back in the nested data if you need it
library(tidyverse)

example_data <- tribble(
  ~patient, ~is_first_positive, ~score_1, ~score_2,
  "A", F, 10, 45,
  "A", T, 16, 76,
  "A", F, 24, 86,
  "B", T, 17, 5,
  "B", F, 24, 22,
  "B", F, 55, 97,
  "C", F, 67, 48,
  "C", F, 23, 38,
  "C", F, 45, 16
)

nested_data <- example_data %>% 
  group_by(patient) %>% 
  nest()

example_data %>% 
  filter(is_first_positive) %>%
  group_by(patient) %>%
  top_n(1) %>%
  full_join(nested_data)

#> Selecting by score_2
#> Joining, by = "patient"
#> # A tibble: 3 x 5
#> # Groups:   patient [3]
#>   patient is_first_positive score_1 score_2 data            
#>   <chr>   <lgl>               <dbl>   <dbl> <list>          
#> 1 A       TRUE                   16      76 <tibble [3 × 3]>
#> 2 B       TRUE                   17       5 <tibble [3 × 3]>
#> 3 C       NA                     NA      NA <tibble [3 × 3]>

Considering your comment, I re-wrote your get_field function using Base R functions and was able to get a 10x speed improvement:

get_field <- function(df, logical_field, rtn_field) {
  df <- df %>% filter_(logical_field)
  if(nrow(df)==0) {
    return(NA_integer_)
  } else {
    df %>% pull({{rtn_field}}) %>% as.integer() %>% return()
  }
}

get_field2 <- function(x, logical_field, rtn_field) {
  x <- x[x[[logical_field]], ]
  ifelse(nrow(x)==0, NA_integer_, x[[rtn_field]])
}

approach1 <- function() {
  example_data %>% 
  mutate(first_positive_score1 = map_int(data, ~get_field(., "is_first_positive", score_1)),
         first_positive_score2 = map_int(data, ~get_field(., "is_first_positive", score_2)))
}

approach2 <- function() {
  example_data %>% 
    mutate(first_positive_score1 = map_dbl(data, get_field2, "is_first_positive", "score_1"),
           first_positive_score2 = map_dbl(data, get_field2, "is_first_positive", "score_2"))
}

library(microbenchmark)
microbenchmark(approach1(), approach2())
#> Unit: milliseconds
#>         expr       min        lq      mean    median        uq       max neval
#>  approach1() 19.849982 24.047509 26.470304 25.001731 26.896622 95.951980   100
#>  approach2()  2.159769  2.587905  2.783555  2.648321  2.740863  7.620581   100

Upvotes: 1

Related Questions