Reputation: 13
I'm working with a set of patient test results some of which are positive and negative. I'm reducing to individual patient level using dplyr::nest()
and then extracting values for the first positive test only using purrr::map()
and a function I've written. My dataset isn't huge - ~40k unique patients, ~110k test results - but I gave up running my script after 40 mins. I'm sure there's a better way of extracting these values but am struggling to work it out. The code chunk below illustrates the method I'm using (though obviously this runs in no time).
library(tidyverse)
example_data <- tribble(
~patient, ~is_first_positive, ~score_1, ~score_2,
"A", F, 10, 45,
"A", T, 16, 76,
"A", F, 24, 86,
"B", T, 17, 5,
"B", F, 24, 22,
"B", F, 55, 97,
"C", F, 67, 48,
"C", F, 23, 38,
"C", F, 45, 16
)
example_data <- example_data %>%
group_by(patient) %>%
nest()
# function to extract values based on value of another column
get_field <- function(df, logical_field, rtn_field) {
df <- df %>% filter_(logical_field)
if(nrow(df)==0) {
return(NA_integer_)
} else {
df %>% pull({{rtn_field}}) %>% as.integer() %>% return()
}
}
# Use purrr to run function against each nested df
example_data <- example_data %>%
mutate(first_positive_score1 = map_int(data, ~get_field(., "is_first_positive", score_1)),
first_positive_score2 = map_int(data, ~get_field(., "is_first_positive", score_2)))
Upvotes: 0
Views: 359
Reputation: 1972
If you can forgive the long lines, you can use map()
in the following way.
library(dplyr)
library(tibble)
library(purrr)
example_data %>%
mutate(score_1 = as.double(map(data, ~ deframe(.x[2])[which(deframe(.x[1]) == TRUE)])),
score_2 = as.double(map(data, ~ deframe(.x[3])[which(deframe(.x[1]) == TRUE)])))
# patient data score_1 score_2
# <chr> <list> <dbl> <dbl>
# 1 A <tibble [3 × 3]> 16 76
# 2 B <tibble [3 × 3]> 17 5
# 3 C <tibble [3 × 3]> NA NA
Upvotes: 0
Reputation: 17725
Here's an alternative that should be quite fast:
library(tidyverse)
example_data <- tribble(
~patient, ~is_first_positive, ~score_1, ~score_2,
"A", F, 10, 45,
"A", T, 16, 76,
"A", F, 24, 86,
"B", T, 17, 5,
"B", F, 24, 22,
"B", F, 55, 97,
"C", F, 67, 48,
"C", F, 23, 38,
"C", F, 45, 16
)
nested_data <- example_data %>%
group_by(patient) %>%
nest()
example_data %>%
filter(is_first_positive) %>%
group_by(patient) %>%
top_n(1) %>%
full_join(nested_data)
#> Selecting by score_2
#> Joining, by = "patient"
#> # A tibble: 3 x 5
#> # Groups: patient [3]
#> patient is_first_positive score_1 score_2 data
#> <chr> <lgl> <dbl> <dbl> <list>
#> 1 A TRUE 16 76 <tibble [3 × 3]>
#> 2 B TRUE 17 5 <tibble [3 × 3]>
#> 3 C NA NA NA <tibble [3 × 3]>
Considering your comment, I re-wrote your get_field
function using Base R functions and was able to get a 10x speed improvement:
get_field <- function(df, logical_field, rtn_field) {
df <- df %>% filter_(logical_field)
if(nrow(df)==0) {
return(NA_integer_)
} else {
df %>% pull({{rtn_field}}) %>% as.integer() %>% return()
}
}
get_field2 <- function(x, logical_field, rtn_field) {
x <- x[x[[logical_field]], ]
ifelse(nrow(x)==0, NA_integer_, x[[rtn_field]])
}
approach1 <- function() {
example_data %>%
mutate(first_positive_score1 = map_int(data, ~get_field(., "is_first_positive", score_1)),
first_positive_score2 = map_int(data, ~get_field(., "is_first_positive", score_2)))
}
approach2 <- function() {
example_data %>%
mutate(first_positive_score1 = map_dbl(data, get_field2, "is_first_positive", "score_1"),
first_positive_score2 = map_dbl(data, get_field2, "is_first_positive", "score_2"))
}
library(microbenchmark)
microbenchmark(approach1(), approach2())
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> approach1() 19.849982 24.047509 26.470304 25.001731 26.896622 95.951980 100
#> approach2() 2.159769 2.587905 2.783555 2.648321 2.740863 7.620581 100
Upvotes: 1