Reputation: 476
I have two data frames, df1
and df2
. I would like to join these two in a way, that I add the target value from df2
to df1
. df1
and df2
are related via the column group and value. While in df1
, I have a specific value, in df2
I have only lower and upper limit of the applicable value.
I believe the task should be clear if we look at df1
and df2
.
df1 <- data.frame(group = c("A","B","C","D"),
value = c(15, 0, 40, 70))
df2 <- data.frame(group = c("A","A","A","A",
"B","B","B","B",
"C","C","C","C",
"D","D","D","D"),
lower_limit = c(0, 25, 60, 91,
0, 35, 70, 92,
0, 45, 80, 93,
0, 55, 90, 94),
upper_limit = c(25, 60, 91, 100,
35, 70, 92, 100,
45, 80, 93, 100,
55, 90, 94, 100),
target = c("AGE0", "AGE1", "AGE3", "AGE4",
"AGE0", "AGE1", "AGE3", "AGE4",
"AGE0", "AGE1", "AGE3", "AGE4",
"AGE0", "AGE1", "AGE3", "AGE4"))
Using a nested for and if loops, I can perform this task. But my original data is much bigger, and I can’t use this loop. I am sure there is a much easier solution for my task. Any suggestions?
for (i in 1:nrow(df1)){
subset_string = df1[i, 1]
target_value = df1[i, 2]
df2_subset <- df2[df2$group == subset_string, ]
for (j in 1:nrow(df2_subset)){
temp_sequence <- seq(from = df2_subset[j, 2], to = df2_subset[j, 3] - 1)
if (target_value %in% temp_sequence){
target_string <- df2_subset[j, 4]
}
df1[i, 3] <- target_string
}
}
Upvotes: 3
Views: 97
Reputation: 11975
data.table
approach could be
library(data.table)
setDT(df2)[setDT(df1), .(group, value, target),
on = .(lower_limit <= value, upper_limit >= value, group)]
which gives
group value target
1: A 15 AGE0
2: B 0 AGE0
3: C 40 AGE0
4: D 70 AGE1
Sample data:
df1 <- structure(list(group = structure(1:4, .Label = c("A", "B", "C",
"D"), class = "factor"), value = c(15, 0, 40, 70)), .Names = c("group",
"value"), row.names = c(NA, -4L), class = "data.frame")
df2 <- structure(list(group = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L), .Label = c("A", "B", "C",
"D"), class = "factor"), lower_limit = c(0, 25, 60, 91, 0, 35,
70, 92, 0, 45, 80, 93, 0, 55, 90, 94), upper_limit = c(25, 60,
91, 100, 35, 70, 92, 100, 45, 80, 93, 100, 55, 90, 94, 100),
target = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L), .Label = c("AGE0", "AGE1", "AGE3",
"AGE4"), class = "factor")), .Names = c("group", "lower_limit",
"upper_limit", "target"), row.names = c(NA, -16L), class = "data.frame")
Update: as per OP's request dplyr
solution is
library(dplyr)
df1 %>%
left_join(df2, by = "group") %>%
filter(value >= lower_limit, value <= upper_limit) %>%
select(group, value, target)
# group value target
#1 A 15 AGE0
#2 B 0 AGE0
#3 C 40 AGE0
#4 D 70 AGE1
Upvotes: 3
Reputation: 5456
Not sure about wanted result. Maybe with sdqldf:
df1 <- data.frame(group = c("A","B","C","D"),
value = c(15, 0, 40, 70))
df2 <- data.frame(group = c("A","A","A","A",
"B","B","B","B",
"C","C","C","C",
"D","D","D","D"),
lower_limit = c(0, 25, 60, 91,
0, 35, 70, 92,
0, 45, 80, 93,
0, 55, 90, 94),
upper_limit = c(25, 60, 91, 100,
35, 70, 92, 100,
45, 80, 93, 100,
55, 90, 94, 100),
target = c("AGE0", "AGE1", "AGE3", "AGE4",
"AGE0", "AGE1", "AGE3", "AGE4",
"AGE0", "AGE1", "AGE3", "AGE4",
"AGE0", "AGE1", "AGE3", "AGE4"))
library(sqldf)
sqldf("select a.*, b.target
from df1 a
left join df2 b
on a.`group` = b.`group`
AND a.value >= b.lower_limit
AND a.value <= b.upper_limit")
# group value target
#1 A 15 AGE0
#2 B 0 AGE0
#3 C 40 AGE0
#4 D 70 AGE1
Upvotes: 3