Reputation: 71
I want to assign 1 in the response column of the "data1" by matching the altitude column of data2 with its closest altitude value within the corresponding ID1 and ID2.
here I chose the sample of a large dataset as for demo.
data1:
structure(list(ID1 = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2), ID2 = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 7L, 7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L, 9L), altitude = c(0L,
500L, 1000L, 1500L, 2000L, 0L, 500L, 1000L, 1500L, 2000L, 0L,
500L, 1000L, 1500L, 2000L, 0L, 500L, 1000L, 1500L, 2000L), response = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L)), row.names = c(NA, -20L), class = "data.frame")
data2
structure(list(ID1 = c(1, 1, 2, 2), ID2 = c(1L, 2L, 7L, 9L),
altitude = c(500L, 1920L, 128L, 261L), response = c(1L, 1L,
1L, 1L)), row.names = c(NA, -4L), class = "data.frame")
expected outcome:
Upvotes: 1
Views: 110
Reputation: 46908
A direct approach using base R is:
to_replace = sapply(1:nrow(data2),function(i){
#get row index
row_idx = with(data1,which(ID1 == data2$ID1[i] & ID2== data2$ID2[i]))
#find the difference
delta = abs(data2$altitude[i] - data1$altitude[row_idx])
row_idx[which.min(delta)]
})
data1$response[to_replace] = 1
ID1 ID2 altitude response
1 1 1 0 0
2 1 1 500 1
3 1 1 1000 0
4 1 1 1500 0
5 1 1 2000 0
6 1 2 0 0
7 1 2 500 0
8 1 2 1000 0
9 1 2 1500 0
10 1 2 2000 1
11 2 7 0 1
12 2 7 500 0
13 2 7 1000 0
14 2 7 1500 0
15 2 7 2000 0
16 2 9 0 0
17 2 9 500 1
18 2 9 1000 0
19 2 9 1500 0
20 2 9 2000 0
If you want to use dplyr, try something like this, where you rbind the two frames, then calculate the absolute difference to the second data.frame within the groups, replace the closest with 1s, and lastly throw out the original dataframe:
library(dplyr)
rbind(cbind(data1,data=1),cbind(data2,data=2)) %>%
group_by(ID1,ID2) %>%
mutate(delta = abs(altitude-altitude[response==1])) %>%
mutate(response=replace(response,delta==sort(delta)[2],1)) %>%
filter(data==1) %>%
select(-delta)
# A tibble: 20 x 4
# Groups: ID1, ID2 [4]
ID1 ID2 altitude response
<dbl> <int> <int> <dbl>
1 1 1 0 0
2 1 1 500 1
3 1 1 1000 0
4 1 1 1500 0
5 1 1 2000 0
6 1 2 0 0
7 1 2 500 0
8 1 2 1000 0
9 1 2 1500 0
10 1 2 2000 1
11 2 7 0 1
12 2 7 500 0
13 2 7 1000 0
14 2 7 1500 0
15 2 7 2000 0
16 2 9 0 0
17 2 9 500 1
18 2 9 1000 0
19 2 9 1500 0
20 2 9 2000 0
Upvotes: 1