neurosnap
neurosnap

Reputation: 5808

Combine two dataframes meeting conditional criteria in R

I have dataframe A, which contains dated events, and dataframe B, which contains date ranges (start and end) for patients. I want to add a row from dataframe A to dataframe B only if the event date does not fall within any of that patient's date ranges in dataframe B. If a patient in dataframe A does not exist in dataframe B at all, their events should be added to dataframe B as well.

Since the two dataframes do not have identical columns, each row added to dataframe B from dataframe A should use the event date as both its start and its end (start = date and end = date).

I was trying to figure out how to get this to work with dplyr, but it seems complicated. I managed to get it to work with a for-loop, but for my own education I was wondering how other people might accomplish the same task.

# Example data: `dfa` holds one dated event per row and `dfb` holds one
# date range (start/end) per row; both carry a `patient` column.
dfa <- data.frame(
  date = c("2021-01-01", "2021-02-02", "2021-02-05"),
  patient = c("one", "two", "three")
)
dfb <- data.frame(
  start = c("2020-12-31", "2021-02-01"),
  end = c("2021-01-02", "2021-02-03"),
  patient = c("one", "one")
)

# Parse the strings into Date objects so the range checks compare dates
# rather than text.
dfa$date <- as.Date(dfa$date, "%Y-%m-%d")
dfb$start <- as.Date(dfb$start, "%Y-%m-%d")
dfb$end <- as.Date(dfb$end, "%Y-%m-%d")

# For every event, check whether the same patient has a range in `dfb`
# that contains the event date (bounds inclusive). `seq_len()` keeps this
# safe when `dfa` has no rows, where `1:nrow(dfa)` would iterate over
# c(1, 0). Missing values in the comparison are ignored, so an event with
# an NA date is treated as not covered.
is_covered <- vapply(
  seq_len(nrow(dfa)),
  function(i) {
    any(
      dfb$patient == dfa$patient[i] &
        dfb$start <= dfa$date[i] &
        dfb$end >= dfa$date[i],
      na.rm = TRUE
    )
  },
  logical(1)
)

# Keep the uncovered events. `unique()` drops repeated (date, patient)
# pairs, because once an event is appended as a one-day range, an identical
# later event would fall inside that range.
new_events <- unique(dfa[!is_covered, c("date", "patient"), drop = FALSE])

# Append all new one-day ranges (start = end = event date) in a single
# rbind() instead of growing `dfb` inside a loop.
dfb <- rbind(
  dfb,
  data.frame(
    start = new_events$date,
    end = new_events$date,
    patient = new_events$patient
  )
)

print(dfb)

Results:

       start        end patient
1 2020-12-31 2021-01-02     one
2 2021-02-01 2021-02-03     one
3 2021-02-02 2021-02-02     two
4 2021-02-05 2021-02-05   three

Upvotes: 0

Views: 50

Answers (1)

Yuriy Saraykin
Yuriy Saraykin

Reputation: 8880

# Events: one row per dated event, with the date parsed at construction.
dfa <- data.frame(
  date = as.Date(
    c("2021-01-01", "2021-02-02", "2021-02-05"),
    format = "%Y-%m-%d"
  ),
  patient = c("one", "two", "three")
)

# Ranges: one row per start/end date range for a patient.
dfb <- data.frame(
  start = as.Date(c("2020-12-31", "2021-02-01"), format = "%Y-%m-%d"),
  end = as.Date(c("2021-01-02", "2021-02-03"), format = "%Y-%m-%d"),
  patient = c("one", "one")
)

# Echo both inputs so the example output below is reproducible.
dfa
#>         date patient
#> 1 2021-01-01     one
#> 2 2021-02-02     two
#> 3 2021-02-05   three
dfb
#>        start        end patient
#> 1 2020-12-31 2021-01-02     one
#> 2 2021-02-01 2021-02-03     one

library(tidyverse)
library(fuzzyjoin)

# Keep the events in `dfa` that have no row in `dfb` with the same patient
# and start <= date <= end. The three `by` pairs are matched position by
# position with the three comparison functions.
uncovered_events <- fuzzy_anti_join(
  x = dfa,
  y = dfb,
  by = c("patient", "date" = "start", "date" = "end"),
  match_fun = list(`==`, `>=`, `<=`)
)

# Reshape each remaining event into a one-day range (start = end = date).
new_ranges <- transmute(uncovered_events, patient, start = date, end = date)

# Stack the new one-day ranges on top of the existing ranges.
bind_rows(new_ranges, dfb)
#>   patient      start        end
#> 1     two 2021-02-02 2021-02-02
#> 2   three 2021-02-05 2021-02-05
#> 3     one 2020-12-31 2021-01-02
#> 4     one 2021-02-01 2021-02-03

Created on 2022-01-22 by the reprex package (v2.0.1)

data.table

library(magrittr)
library(data.table)

# Convert both data frames to data.tables by reference.
setDT(dfa)
setDT(dfb)

# Non-equi anti-join: keep the events in `dfa` that match no row of `dfb`
# on patient with start <= date <= end. The chained `[` then turns each
# remaining event into a one-day range and drops the `date` column.
new_ranges <- dfa[
  !dfb,
  on = list(patient, date >= start, date <= end)
][
  ,
  `:=`(start = date, end = date, date = NULL)
]

# Stack the new ranges with the existing ones, aligning columns by name
# because the two tables list their columns in a different order.
rbindlist(l = list(new_ranges, dfb), use.names = TRUE)
#>    patient      start        end
#> 1:     two 2021-02-02 2021-02-02
#> 2:   three 2021-02-05 2021-02-05
#> 3:     one 2020-12-31 2021-01-02
#> 4:     one 2021-02-01 2021-02-03

Created on 2022-01-22 by the reprex package (v2.0.1)

Upvotes: 2

Related Questions