Reputation: 117
I have two data frames that I have to join. But when joining the two data tables I want to duplicate each row of the df2
within the start time and the end time. Rest of the rows of the new data frame should display as NA
.
I tried with left join
but it doesn't duplicate the rows within the start and end time.
df <- dplyr::left_join(df1, df2, by = "Session_start")
The two data frames look like below.
head(df1)
# Session_start Robot_ID
# 1 2022-07-07 00:05:19 R1
# 2 2022-07-07 00:05:20 R2
# 3 2022-07-07 00:05:21 R3
# 4 2022-07-07 00:05:22 R4
# 5 2022-07-07 00:05:23 R5
# 6 2022-07-07 00:05:24 R6
df2
# Session_start Session_End Animal_ID
# 1 2022-07-07 00:05:19 2022-07-07 00:05:21 ID1
# 2 2022-07-07 00:05:24 2022-07-07 00:05:26 ID2
# 3 2022-07-07 00:05:27 2022-07-07 00:05:31 ID3
# 4 2022-07-07 00:05:33 2022-07-07 00:05:34 ID4
Desired output will be:
Session_start | Robot_ID | Session_End | Animal_ID |
---|---|---|---|
2022-07-07 00:05:19 | R1 | 2022-07-07 00:05:21 | ID1 |
2022-07-07 00:05:20 | R2 | 2022-07-07 00:05:21 | ID1 |
2022-07-07 00:05:21 | R3 | 2022-07-07 00:05:21 | ID1 |
2022-07-07 00:05:22 | R4 | NA | NA |
2022-07-07 00:05:23 | R5 | NA | NA |
2022-07-07 00:05:24 | R6 | 2022-07-07 00:05:26 | ID1 |
2022-07-07 00:05:25 | R7 | 2022-07-07 00:05:26 | ID2 |
2022-07-07 00:05:26 | R8 | 2022-07-07 00:05:26 | ID2 |
2022-07-07 00:05:27 | R9 | 2022-07-07 00:05:31 | ID3 |
2022-07-07 00:05:28 | R10 | 2022-07-07 00:05:31 | ID3 |
2022-07-07 00:05:29 | R11 | 2022-07-07 00:05:31 | ID3 |
2022-07-07 00:05:30 | R12 | 2022-07-07 00:05:31 | ID3 |
2022-07-07 00:05:31 | R13 | 2022-07-07 00:05:31 | ID3 |
2022-07-07 00:05:32 | R14 | NA | NA |
2022-07-07 00:05:33 | R15 | 2022-07-07 00:05:34 | ID4 |
2022-07-07 00:05:34 | R16 | 2022-07-07 00:05:34 | ID4 |
How I do this using R?
df1 <- data.frame(Session_start=c("2022-07-07 00:05:19", "2022-07-07 00:05:20", "2022-07-07 00:05:21",
"2022-07-07 00:05:22", "2022-07-07 00:05:23", "2022-07-07 00:05:24",
"2022-07-07 00:05:25", "2022-07-07 00:05:26", "2022-07-07 00:05:27",
"2022-07-07 00:05:28", "2022-07-07 00:05:29", "2022-07-07 00:05:30",
"2022-07-07 00:05:31", "2022-07-07 00:05:32", "2022-07-07 00:05:33",
"2022-07-07 00:05:34"),
Robot_ID =c("R1", "R2", "R3", "R4", "R5", "R6", "R7", "R8", "R9", "R10",
"R11", "R12", "R13", "R14", "R15", "R16"))
df2 <- data.frame(Session_start=c("2022-07-07 00:05:19", "2022-07-07 00:05:24",
"2022-07-07 00:05:27", "2022-07-07 00:05:33"),
Session_End=c("2022-07-07 00:05:21", "2022-07-07 00:05:26",
"2022-07-07 00:05:31", "2022-07-07 00:05:34"),
Animal_ID =c("ID1", "ID2", "ID3", "ID4"))
Upvotes: 1
Views: 152
Reputation: 436
Here's my answer. It's a little simple, but it works fine with your dataset:
# Package needed
library(dplyr)
# First, preprocess the data
df1 <- df1 %>%
mutate_at(vars(Session_start), as.POSIXct)
df2 <- df2 %>%
mutate_at(vars(Session_start, Session_End), as.POSIXct)
df3 <- merge(df1, df2, all = TRUE)
# Then, fill the voids
for (i in 1:nrow(df3)) {
if (!is.na(df3$Session_End[i])) {
session_end1 <- df3$Session_End[i]
animal_id1 <- df3$Animal_ID[i]
} else {
if (i < nrow(df3)) {
if (df3$Session_start[i] < df3$Session_start[i+1]) {
df3$Session_End[i] <- session_end1
df3$Animal_ID[i] <- animal_id1
}
} else if (i == nrow(df3)) {
df3$Session_End[i] <- session_end1
df3$Animal_ID[i] <- animal_id1
}
}
}
Upvotes: 0
Reputation: 90
This answer is much longer than thelatemail's and jay.sf's, but I'll still post it, so you have more ideas.
My approach was to use auxiliary variables, use lubridate to be sure I was working with the right format, and then start propagating Animal_ID and Session_End data.
# Loading libraries -------------------------------------------------------
library(dplyr)
library(lubridate)
# Defining datasets -------------------------------------------------------
Session_start <-
c(
"2022-07-07 00:05:19",
"2022-07-07 00:05:24",
"2022-07-07 00:05:27",
"2022-07-07 00:05:33"
)
Session_End <-
c(
"2022-07-07 00:05:21",
"2022-07-07 00:05:26",
"2022-07-07 00:05:31",
"2022-07-07 00:05:34"
)
Animal_ID <- c("ID1", "ID2", "ID3", "ID4")
df2 <- data.frame(Session_start, Session_End, Animal_ID)
Session_start <-
c(
"2022-07-07 00:05:19",
"2022-07-07 00:05:20",
"2022-07-07 00:05:21",
"2022-07-07 00:05:22",
"2022-07-07 00:05:23",
"2022-07-07 00:05:24",
"2022-07-07 00:05:25",
"2022-07-07 00:05:26",
"2022-07-07 00:05:27",
"2022-07-07 00:05:28",
"2022-07-07 00:05:29",
"2022-07-07 00:05:30",
"2022-07-07 00:05:31",
"2022-07-07 00:05:32",
"2022-07-07 00:05:33",
"2022-07-07 00:05:34"
)
Robot_ID <-
c(
"R1",
"R2",
"R3",
"R4",
"R5",
"R6",
"R7",
"R8",
"R9",
"R10",
"R11",
"R12",
"R13",
"R14",
"R15",
"R16"
)
df1 <- data.frame(Session_start, Robot_ID)
# Joining with data propagation -------------------------------------------
df <-
dplyr::left_join(df1, df2, by = "Session_start") |>
arrange(Session_start) |>
mutate(
Session_start =
Session_start |>
lubridate::as_datetime(),
Session_End =
Session_End |>
lubridate::as_datetime()
) |>
mutate(
is_na_Session_End = if_else(
condition = is.na(Session_End),
true = FALSE,
false = TRUE
),
number_of_non_NA_Session_End = cumsum(is_na_Session_End)
) |>
group_by(number_of_non_NA_Session_End) |>
mutate(Session_End =
Session_End |>
first(),
Animal_ID =
Animal_ID |>
first()) |>
mutate(
Session_End = if_else(
condition = Session_start <= Session_End,
true = Session_End,
false = NA_POSIXct_
),
Animal_ID = if_else(
condition = Session_start <= Session_End,
true = Animal_ID,
false = NA_character_
)
) |>
ungroup() |>
select(-is_na_Session_End,
-number_of_non_NA_Session_End) |>
as.data.frame()
df
Upvotes: 0
Reputation: 72828
First, find the indices w
, where session start of df
lie in between the session intervals of df2
using outer()
. Next cbind
them to the respective slices. Finally merge
the remainder.
w <- outer(df1[, 1], as.data.frame(t(df2[1:2])),
Vectorize(\(x, y) x >= y[1] & x <= y[2])) |>
apply(2, which)
Map(\(x, y) cbind(df1[x, ], df2[y, -1]), w, seq_len(nrow(df2))) |>
do.call(what=rbind) |> merge(df1, all=TRUE)
# Session_start Robot_ID Session_End Animal_ID
# 1 2022-07-07 00:05:19 R1 2022-07-07 00:05:21 ID1
# 2 2022-07-07 00:05:20 R2 2022-07-07 00:05:21 ID1
# 3 2022-07-07 00:05:21 R3 2022-07-07 00:05:21 ID1
# 4 2022-07-07 00:05:22 R4 <NA> <NA>
# 5 2022-07-07 00:05:23 R5 <NA> <NA>
# 6 2022-07-07 00:05:24 R6 2022-07-07 00:05:26 ID2
# 7 2022-07-07 00:05:25 R7 2022-07-07 00:05:26 ID2
# 8 2022-07-07 00:05:26 R8 2022-07-07 00:05:26 ID2
# 9 2022-07-07 00:05:27 R9 2022-07-07 00:05:31 ID3
# 10 2022-07-07 00:05:28 R10 2022-07-07 00:05:31 ID3
# 11 2022-07-07 00:05:29 R11 2022-07-07 00:05:31 ID3
# 12 2022-07-07 00:05:30 R12 2022-07-07 00:05:31 ID3
# 13 2022-07-07 00:05:31 R13 2022-07-07 00:05:31 ID3
# 14 2022-07-07 00:05:32 R14 <NA> <NA>
# 15 2022-07-07 00:05:33 R15 2022-07-07 00:05:34 ID4
# 16 2022-07-07 00:05:34 R16 2022-07-07 00:05:34 ID4
Note: Even though, the solution works without it (the dates are compared alphabetically), you should always use "POSIXct"
format when you work with date-times. If you don't have it yet, convert it:
df1$Session_start <- as.POSIXct(df1$Session_start)
df2[1:2] <- lapply(df2[1:2], as.POSIXct)
Upvotes: 1
Reputation: 93813
data.table with a non-equi, update-join might make this considerably nicer:
library(data.table)
setDT(df1)
setDT(df2)
df1[
df2,
on=.(Session_start>=Session_start, Session_start<=Session_End),
c("Animal_ID","Session_End") := .(i.Animal_ID, i.Session_End)
]
df1
## Session_start Robot_ID Animal_ID Session_End
## 1: 2022-07-07 08:05:19 R1 ID1 2022-07-07 08:05:21
## 2: 2022-07-07 08:05:20 R2 ID1 2022-07-07 08:05:21
## 3: 2022-07-07 08:05:21 R3 ID1 2022-07-07 08:05:21
## 4: 2022-07-07 08:05:22 R4 <NA> <NA>
## 5: 2022-07-07 08:05:23 R5 <NA> <NA>
## 6: 2022-07-07 08:05:24 R6 ID2 2022-07-07 08:05:26
## 7: 2022-07-07 08:05:25 R7 ID2 2022-07-07 08:05:26
## 8: 2022-07-07 08:05:26 R8 ID2 2022-07-07 08:05:26
## 9: 2022-07-07 08:05:27 R9 ID3 2022-07-07 08:05:31
##10: 2022-07-07 08:05:28 R10 ID3 2022-07-07 08:05:31
##11: 2022-07-07 08:05:29 R11 ID3 2022-07-07 08:05:31
##12: 2022-07-07 08:05:30 R12 ID3 2022-07-07 08:05:31
##13: 2022-07-07 08:05:31 R13 ID3 2022-07-07 08:05:31
##14: 2022-07-07 08:05:32 R14 <NA> <NA>
##15: 2022-07-07 08:05:33 R15 ID4 2022-07-07 08:05:34
##16: 2022-07-07 08:05:34 R16 ID4 2022-07-07 08:05:34
Upvotes: 1