Join based on a condition of another column

Question

So I want to join df_1 to df_2 based on id:

df_1 <- data.frame(id=c(1,2,3,4,5))


df_2<- data.frame(id=c(1,1,2,3,3,4,4,5,5,5,5),name=c("a","a1","b","c","c2","d","d1","e","e1","e2","e3"),status=c("IN","OUT","OUT","PENDING","OUT","OUT","REFER","IN","OUT","REFER","OUT"))

and this is what my resulting dataset should look like:

df_merge<-data.frame(id=c(1,2,3,4,5),name=c("a","b","c","d1","e"))

So I want to merge the name associated with "IN" and if that's not available then the name associated with "PENDING", and after that it should be "REFER", if none of these are there then match the id with the status of "OUT". How can I do this?

akrun · Accepted Answer

We do a join, then arrange by 'id' and factor converted 'status' with levels in the specified order, grouped by 'id', get the first row with slice

library(dplyr)
left_join(df_2, df_1) %>% 
  arrange(id, factor(status, levels = c('IN', 'PENDING', 'REFER', 'OUT'))) %>% 
  group_by(id) %>%
  slice(1) %>%
  ungroup %>%
  select(-status)
# A tibble: 5 x 2
#     id name 
#   
#1     1 a    
#2     2 b    
#3     3 c    
#4     4 d1   
#5     5 e

It the 'df_1' is duplicated, then do the left_join with the distinct rows, later do a right_join

df_1 <- data.frame(id=c(1,1,2,3,4,5,5)) 
left_join(df_2, distinct(df_1)) %>% 
    arrange(id, factor(status, levels = c('IN', 'PENDING', 'REFER', 'OUT'))) %>%  
    group_by(id) %>%
    slice(1) %>%
    ungroup %>%
    select(-status)  %>%
    right_join(df_1)
# A tibble: 7 x 2
#     id name 
#*  
#1     1 a    
#2     1 a    
#3     2 b    
#4     3 c    
#5     4 d1   
#6     5 e    
#7     5 e

Or we can have a list columnornestthe 'id' and theunnest`

library(tidyr)
df_1 %>% 
    group_by(id) %>% 
    nest %>%
    right_join(df_2) %>%
    arrange(id, factor(status, levels = c('IN', 'PENDING', 'REFER', 'OUT'))) %>%   
    group_by(id) %>%
    slice(1) %>%
    ungroup %>%
    select(-status) %>% 
    unnest

Or using data.table

library(data.table)
setDT(df_2)[df_1,  .(name = name[order(match(status,
      c('IN', 'PENDING', 'REFER', 'OUT')))[1]]), on = .(id), by = .EACHI]
#   id name
#1:  1    a
#2:  2    b
#3:  3    c
#4:  4   d1
#5:  5    e

Or a base R option is to first order the 'df_2' dataset, and then extract the elements of 'name' based on duplicated elements in 'id'

df_2n <- df_2[order(df_2$id, factor(df_2$status, levels = c('IN', 'PENDING', 'REFER', 'OUT'))),] 
df_1$name <-  df_2n$name[!duplicated(df_2$id)]

Join based on a condition of another column

Answers (1)

Related Questions