Niro Mal
Niro Mal

Reputation: 117

Duplicate data within a start and end time interval when joining (merging) tables

I have two data frames that I have to join. But when joining the two data tables I want to duplicate each row of the df2 within the start time and the end time. Rest of the rows of the new data frame should display as NA.

I tried with left join but it doesn't duplicate the rows within the start and end time.

df <- dplyr::left_join(df1, df2, by = "Session_start")

The two data frames look like below.

head(df1)
#         Session_start Robot_ID
# 1 2022-07-07 00:05:19       R1
# 2 2022-07-07 00:05:20       R2
# 3 2022-07-07 00:05:21       R3
# 4 2022-07-07 00:05:22       R4
# 5 2022-07-07 00:05:23       R5
# 6 2022-07-07 00:05:24       R6

df2
#         Session_start         Session_End Animal_ID
# 1 2022-07-07 00:05:19 2022-07-07 00:05:21       ID1
# 2 2022-07-07 00:05:24 2022-07-07 00:05:26       ID2
# 3 2022-07-07 00:05:27 2022-07-07 00:05:31       ID3
# 4 2022-07-07 00:05:33 2022-07-07 00:05:34       ID4

Desired output will be:

Session_start Robot_ID Session_End Animal_ID
2022-07-07 00:05:19 R1 2022-07-07 00:05:21 ID1
2022-07-07 00:05:20 R2 2022-07-07 00:05:21 ID1
2022-07-07 00:05:21 R3 2022-07-07 00:05:21 ID1
2022-07-07 00:05:22 R4 NA NA
2022-07-07 00:05:23 R5 NA NA
2022-07-07 00:05:24 R6 2022-07-07 00:05:26 ID1
2022-07-07 00:05:25 R7 2022-07-07 00:05:26 ID2
2022-07-07 00:05:26 R8 2022-07-07 00:05:26 ID2
2022-07-07 00:05:27 R9 2022-07-07 00:05:31 ID3
2022-07-07 00:05:28 R10 2022-07-07 00:05:31 ID3
2022-07-07 00:05:29 R11 2022-07-07 00:05:31 ID3
2022-07-07 00:05:30 R12 2022-07-07 00:05:31 ID3
2022-07-07 00:05:31 R13 2022-07-07 00:05:31 ID3
2022-07-07 00:05:32 R14 NA NA
2022-07-07 00:05:33 R15 2022-07-07 00:05:34 ID4
2022-07-07 00:05:34 R16 2022-07-07 00:05:34 ID4

How I do this using R?

Data

df1 <- data.frame(Session_start=c("2022-07-07 00:05:19", "2022-07-07 00:05:20", "2022-07-07 00:05:21", 
                                  "2022-07-07 00:05:22", "2022-07-07 00:05:23", "2022-07-07 00:05:24", 
                                  "2022-07-07 00:05:25", "2022-07-07 00:05:26", "2022-07-07 00:05:27", 
                                  "2022-07-07 00:05:28", "2022-07-07 00:05:29", "2022-07-07 00:05:30", 
                                  "2022-07-07 00:05:31", "2022-07-07 00:05:32", "2022-07-07 00:05:33", 
                                  "2022-07-07 00:05:34"), 
                  Robot_ID =c("R1", "R2", "R3", "R4", "R5", "R6", "R7", "R8", "R9", "R10", 
                              "R11", "R12", "R13", "R14", "R15", "R16"))

df2 <- data.frame(Session_start=c("2022-07-07 00:05:19", "2022-07-07 00:05:24", 
                                  "2022-07-07 00:05:27", "2022-07-07 00:05:33"), 
                  Session_End=c("2022-07-07 00:05:21", "2022-07-07 00:05:26", 
                                "2022-07-07 00:05:31", "2022-07-07 00:05:34"), 
                  Animal_ID =c("ID1", "ID2", "ID3", "ID4"))

Upvotes: 1

Views: 152

Answers (4)

Esther
Esther

Reputation: 436

Here's my answer. It's a little simple, but it works fine with your dataset:

# Package needed
library(dplyr)

# First, preprocess the data
df1 <- df1 %>% 
  mutate_at(vars(Session_start), as.POSIXct)

df2 <- df2 %>% 
  mutate_at(vars(Session_start, Session_End), as.POSIXct)

df3 <- merge(df1, df2, all = TRUE)

# Then, fill the voids 
for (i in 1:nrow(df3)) {
  
  if (!is.na(df3$Session_End[i])) {
    
    session_end1 <- df3$Session_End[i]
    animal_id1 <- df3$Animal_ID[i]
    
  } else {
    
    if (i < nrow(df3)) {
      
      if (df3$Session_start[i] < df3$Session_start[i+1]) {
        
        df3$Session_End[i] <- session_end1
        df3$Animal_ID[i] <- animal_id1
        
      }
      
    } else if (i == nrow(df3)) {
      
      df3$Session_End[i] <- session_end1
      df3$Animal_ID[i] <- animal_id1
      
    }
    
  }
  
}

Upvotes: 0

This answer is much longer than thelatemail's and jay.sf's, but I'll still post it, so you have more ideas.

My approach was to use auxiliary variables, use lubridate to be sure I was working with the right format, and then start propagating Animal_ID and Session_End data.

# Loading libraries -------------------------------------------------------

library(dplyr)
library(lubridate)

# Defining datasets -------------------------------------------------------

Session_start <-
  c(
    "2022-07-07 00:05:19",
    "2022-07-07 00:05:24",
    "2022-07-07 00:05:27",
    "2022-07-07 00:05:33"
  )

Session_End <-
  c(
    "2022-07-07 00:05:21",
    "2022-07-07 00:05:26",
    "2022-07-07 00:05:31",
    "2022-07-07 00:05:34"
  )

Animal_ID <- c("ID1", "ID2", "ID3", "ID4")

df2 <- data.frame(Session_start, Session_End, Animal_ID)

Session_start <-
  c(
    "2022-07-07 00:05:19",
    "2022-07-07 00:05:20",
    "2022-07-07 00:05:21",
    "2022-07-07 00:05:22",
    "2022-07-07 00:05:23",
    "2022-07-07 00:05:24",
    "2022-07-07 00:05:25",
    "2022-07-07 00:05:26",
    "2022-07-07 00:05:27",
    "2022-07-07 00:05:28",
    "2022-07-07 00:05:29",
    "2022-07-07 00:05:30",
    "2022-07-07 00:05:31",
    "2022-07-07 00:05:32",
    "2022-07-07 00:05:33",
    "2022-07-07 00:05:34"
  )

Robot_ID <-
  c(
    "R1",
    "R2",
    "R3",
    "R4",
    "R5",
    "R6",
    "R7",
    "R8",
    "R9",
    "R10",
    "R11",
    "R12",
    "R13",
    "R14",
    "R15",
    "R16"
  )

df1 <- data.frame(Session_start, Robot_ID)

# Joining with data propagation -------------------------------------------

df <-
  dplyr::left_join(df1, df2, by = "Session_start") |>
  arrange(Session_start) |>
  mutate(
    Session_start =
      Session_start |>
      lubridate::as_datetime(),
    Session_End =
      Session_End |>
      lubridate::as_datetime()
  ) |>
  mutate(
    is_na_Session_End = if_else(
      condition = is.na(Session_End),
      true = FALSE,
      false = TRUE
    ),
    number_of_non_NA_Session_End = cumsum(is_na_Session_End)
  ) |>
  group_by(number_of_non_NA_Session_End) |>
  mutate(Session_End =
           Session_End |>
           first(),
         Animal_ID =
           Animal_ID |>
           first()) |>
  mutate(
    Session_End = if_else(
      condition = Session_start <= Session_End,
      true = Session_End,
      false = NA_POSIXct_
    ),
    Animal_ID = if_else(
      condition = Session_start <= Session_End,
      true = Animal_ID,
      false = NA_character_
    )
  ) |>
  ungroup() |>
  select(-is_na_Session_End,
         -number_of_non_NA_Session_End) |>
  as.data.frame()

df

Upvotes: 0

jay.sf
jay.sf

Reputation: 72828

First, find the indices w, where session start of df lie in between the session intervals of df2 using outer(). Next cbind them to the respective slices. Finally merge the remainder.

w <- outer(df1[, 1], as.data.frame(t(df2[1:2])), 
           Vectorize(\(x, y) x >= y[1] & x <= y[2])) |>
  apply(2, which)

Map(\(x, y) cbind(df1[x, ], df2[y, -1]), w, seq_len(nrow(df2))) |>
  do.call(what=rbind) |> merge(df1, all=TRUE)
#          Session_start Robot_ID         Session_End Animal_ID
# 1  2022-07-07 00:05:19       R1 2022-07-07 00:05:21       ID1
# 2  2022-07-07 00:05:20       R2 2022-07-07 00:05:21       ID1
# 3  2022-07-07 00:05:21       R3 2022-07-07 00:05:21       ID1
# 4  2022-07-07 00:05:22       R4                <NA>      <NA>
# 5  2022-07-07 00:05:23       R5                <NA>      <NA>
# 6  2022-07-07 00:05:24       R6 2022-07-07 00:05:26       ID2
# 7  2022-07-07 00:05:25       R7 2022-07-07 00:05:26       ID2
# 8  2022-07-07 00:05:26       R8 2022-07-07 00:05:26       ID2
# 9  2022-07-07 00:05:27       R9 2022-07-07 00:05:31       ID3
# 10 2022-07-07 00:05:28      R10 2022-07-07 00:05:31       ID3
# 11 2022-07-07 00:05:29      R11 2022-07-07 00:05:31       ID3
# 12 2022-07-07 00:05:30      R12 2022-07-07 00:05:31       ID3
# 13 2022-07-07 00:05:31      R13 2022-07-07 00:05:31       ID3
# 14 2022-07-07 00:05:32      R14                <NA>      <NA>
# 15 2022-07-07 00:05:33      R15 2022-07-07 00:05:34       ID4
# 16 2022-07-07 00:05:34      R16 2022-07-07 00:05:34       ID4

Note: Even though, the solution works without it (the dates are compared alphabetically), you should always use "POSIXct" format when you work with date-times. If you don't have it yet, convert it:

df1$Session_start <- as.POSIXct(df1$Session_start)
df2[1:2] <- lapply(df2[1:2], as.POSIXct)

Upvotes: 1

thelatemail
thelatemail

Reputation: 93813

data.table with a non-equi, update-join might make this considerably nicer:

library(data.table)
setDT(df1)
setDT(df2)
df1[
    df2,
    on=.(Session_start>=Session_start, Session_start<=Session_End),
    c("Animal_ID","Session_End") := .(i.Animal_ID, i.Session_End)
]
df1
##          Session_start Robot_ID Animal_ID         Session_End
## 1: 2022-07-07 08:05:19       R1       ID1 2022-07-07 08:05:21
## 2: 2022-07-07 08:05:20       R2       ID1 2022-07-07 08:05:21
## 3: 2022-07-07 08:05:21       R3       ID1 2022-07-07 08:05:21
## 4: 2022-07-07 08:05:22       R4      <NA>                <NA>
## 5: 2022-07-07 08:05:23       R5      <NA>                <NA>
## 6: 2022-07-07 08:05:24       R6       ID2 2022-07-07 08:05:26
## 7: 2022-07-07 08:05:25       R7       ID2 2022-07-07 08:05:26
## 8: 2022-07-07 08:05:26       R8       ID2 2022-07-07 08:05:26
## 9: 2022-07-07 08:05:27       R9       ID3 2022-07-07 08:05:31
##10: 2022-07-07 08:05:28      R10       ID3 2022-07-07 08:05:31
##11: 2022-07-07 08:05:29      R11       ID3 2022-07-07 08:05:31
##12: 2022-07-07 08:05:30      R12       ID3 2022-07-07 08:05:31
##13: 2022-07-07 08:05:31      R13       ID3 2022-07-07 08:05:31
##14: 2022-07-07 08:05:32      R14      <NA>                <NA>
##15: 2022-07-07 08:05:33      R15       ID4 2022-07-07 08:05:34
##16: 2022-07-07 08:05:34      R16       ID4 2022-07-07 08:05:34

Upvotes: 1

Related Questions