Hao WU
Hao WU

Reputation: 13

replace some value in dataframe from another dataframe

I have two dataframe:

df1 <- data.frame(id = c("LABEL1", "LABEL2", "LABEL3", "LABEL4", "LABEL5", "LABEL6"),matrix(1:60,6,10))
df1[c(4:6), c(2:4)] = NA

df2 = data.frame(id = c( "LABEL3", "LABEL4", "LABEL5", "LABEL6"),matrix(seq(100,10000, length.out = 32),4,8))

I would like to look up ONLY the missing values from DF1 in DF2 using a key value = 'id'. Here is the desired output: enter image description here

Here is the methods I tried: 1. merge: but I get duplicated columns for X1:X3. 2. match:

df1[,2]= df2[,2][match(df1$id, df2$id)] 

but I will get the lable 3 in DF1 covered. 3. lookup from qdap package:

library(qdap)
apply(df1, 2, lookup, df2)

same result as method 2.

Thanks!

Upvotes: 0

Views: 972

Answers (2)

ClancyStats
ClancyStats

Reputation: 1231

There may be someone with a better method than I have, but this should work. This does assume that the order of the columns are the same, so be careful of that.

row.matches = match(df1$id, df2$id)
nas = which(is.na(df1), arr.ind = TRUE)
replacements = nas
replacements[ ,1] = row.matches[nas[ ,1]]
df1[nas] = df2[replacements]

Essentially, all I've done is find the matching rows and the indices where NAs occur in df1. Replace the row index of those NA indices using the matching rows vector, and replace those values in df1 with their corresponding values in df2.

Upvotes: 0

cderv
cderv

Reputation: 6542

you could use tidyr to work in tidy data form then dplyr to combine table

library(dplyr)
library(tidyr)

In one way with pipe

df1 %>% 
  mutate(id = as.character(id)) %>%
  gather(key = "col", value = "val", -id) %>%
  left_join(df2 %>% 
                mutate(id = as.character(id)) %>%
                gather(key = "col", value = "val", -id), 
            by =c("id", "col")) %>%
  transmute(id, col, val = ifelse(is.na(val.x), val.y, val.x)) %>%
  spread(col, val) %>%
  select(id, num_range("X", 1:10))
#>       id        X1       X2       X3 X4 X5 X6 X7 X8 X9 X10
#> 1 LABEL1    1.0000    7.000   13.000 19 25 31 37 43 49  55
#> 2 LABEL2    2.0000    8.000   14.000 20 26 32 38 44 50  56
#> 3 LABEL3    3.0000    9.000   15.000 21 27 33 39 45 51  57
#> 4 LABEL4  419.3548 1696.774 2974.194 22 28 34 40 46 52  58
#> 5 LABEL5  738.7097 2016.129 3293.548 23 29 35 41 47 53  59
#> 6 LABEL6 1058.0645 2335.484 3612.903 24 30 36 42 48 54  60

Step by step for explanation

# id as character instead of factor
df1 <- df1 %>% mutate(id = as.character(id))
# tidy data
df1 <- df1 %>% gather(key = "col", value = "val", -id)
# print result as dplyr tbl
df1 %>% as.tbl()
#> Source: local data frame [60 x 3]
#> 
#>        id   col   val
#>     (chr) (chr) (int)
#> 1  LABEL1    X1     1
#> 2  LABEL2    X1     2
#> 3  LABEL3    X1     3
#> 4  LABEL4    X1    NA
#> 5  LABEL5    X1    NA
#> 6  LABEL6    X1    NA
#> 7  LABEL1    X2     7
#> 8  LABEL2    X2     8
#> 9  LABEL3    X2     9
#> 10 LABEL4    X2    NA
#> ..    ...   ...   ...
# idem on df2
df2 <- df2 %>% 
  mutate(id = as.character(id)) %>%
  tidyr::gather(key = "col", value = "val", -id)
# print result as dplyr tbl
df2 %>% as.tbl()
#> Source: local data frame [32 x 3]
#> 
#>        id   col       val
#>     (chr) (chr)     (dbl)
#> 1  LABEL3    X1  100.0000
#> 2  LABEL4    X1  419.3548
#> 3  LABEL5    X1  738.7097
#> 4  LABEL6    X1 1058.0645
#> 5  LABEL3    X2 1377.4194
#> 6  LABEL4    X2 1696.7742
#> 7  LABEL5    X2 2016.1290
#> 8  LABEL6    X2 2335.4839
#> 9  LABEL3    X3 2654.8387
#> 10 LABEL4    X3 2974.1935
#> ..    ...   ...       ...

# join only id and col level of df1 with df2
new.df <- left_join(df1, df2, by = c("id", "col"))
# print result as dplyr tbl
new.df %>% as.tbl()
#> Source: local data frame [60 x 4]
#> 
#>        id   col val.x     val.y
#>     (chr) (chr) (int)     (dbl)
#> 1  LABEL1    X1     1        NA
#> 2  LABEL2    X1     2        NA
#> 3  LABEL3    X1     3  100.0000
#> 4  LABEL4    X1    NA  419.3548
#> 5  LABEL5    X1    NA  738.7097
#> 6  LABEL6    X1    NA 1058.0645
#> 7  LABEL1    X2     7        NA
#> 8  LABEL2    X2     8        NA
#> 9  LABEL3    X2     9 1377.4194
#> 10 LABEL4    X2    NA 1696.7742
#> ..    ...   ...   ...       ...

#replace NA in col val.x from df1 by value val.y of df2
# and only keep id, col and new column val
new.df <- new.df %>% transmute(id, col, val = ifelse(is.na(val.x), val.y, val.x)) 
new.df %>% as.tbl()
#> Source: local data frame [60 x 3]
#> 
#>        id   col       val
#>     (chr) (chr)     (dbl)
#> 1  LABEL1    X1    1.0000
#> 2  LABEL2    X1    2.0000
#> 3  LABEL3    X1    3.0000
#> 4  LABEL4    X1  419.3548
#> 5  LABEL5    X1  738.7097
#> 6  LABEL6    X1 1058.0645
#> 7  LABEL1    X2    7.0000
#> 8  LABEL2    X2    8.0000
#> 9  LABEL3    X2    9.0000
#> 10 LABEL4    X2 1696.7742
#> ..    ...   ...       ...

# put back data in wide format
new.df %>% 
  spread(col, val) %>%
  select(id, num_range("X", 1:10)) # put column in same order as df1
#>       id        X1       X2       X3 X4 X5 X6 X7 X8 X9 X10
#> 1 LABEL1    1.0000    7.000   13.000 19 25 31 37 43 49  55
#> 2 LABEL2    2.0000    8.000   14.000 20 26 32 38 44 50  56
#> 3 LABEL3    3.0000    9.000   15.000 21 27 33 39 45 51  57
#> 4 LABEL4  419.3548 1696.774 2974.194 22 28 34 40 46 52  58
#> 5 LABEL5  738.7097 2016.129 3293.548 23 29 35 41 47 53  59
#> 6 LABEL6 1058.0645 2335.484 3612.903 24 30 36 42 48 54  60

Upvotes: 2

Related Questions