Using two dataframes but one Data Table

Question

dataHAVE1=data.frame("student"=c(1,2,3,4,5),
                      "v1"=c('a','a','b','b','b'),
                    "v2"=c('b','a','a','a','b'),
                    "v3"=c('a','a','a',NA,'b'))



dataHAVE2=data.frame("student"=c(1,2,3,4,5),
                     "score"=c(4,6,7,1,3),
                     "class"=c(10,4,3,66,2))


dataWANT=data.frame("student"=c(1,2,3,4,5),
                    "score"=c(4,6,7,1,3),
                    "class"=c(10,4,3,66,2),
                    "v1x"=c(1,1,0,0,0),
                    "v2x"=c(0,1,1,1,0),
                    "v3x"=c(1,1,1,NA,0))




setDT(dataHAVE1)
dataHAVE2=cbind(dataHAVE2, dataHAVE1[, c("v1","v2","v3") := lapply(.SD, function(x) as.integer(x =='a')), .SDcols = paste0(c("v1","v2","v3"))]

I have 'dataHAVE1' and 'dataHAVE2'. I hope not to merge because these are such big files. I want to add v1-v3 from dataHAVE1 to dataHAVE2 but I want to clean them before adding them and I do not want to clean them in 'dataHAVE1' and then bring them over. The rule is: if v1-v3 = a then v1x-v3x = 1, if b then 0.

say this process were repeated but the new variables q1 and q2 depend on c1/f1 and c2/f2

dataHAVE1=data.frame("student"=c(1,2,3,4,5),
                     "c1"=c('a','a','b','b','b'),
                     "f1"=c('b','a','a','a','b'),
                     "c2"=c('b','b','a',NA,'b'),
                     "f2"=c('b','a','a','a','b'))



dataHAVE2=data.frame("student"=c(1,2,3,4,5),
                     "score"=c(4,6,7,1,3),
                     "class"=c(10,4,3,66,2))


dataWANT=data.frame("student"=c(1,2,3,4,5),
                    "score"=c(4,6,7,1,3),
                    "class"=c(10,4,3,66,2),
                    "q1"=c(2,2,1,1,0),
                    "q2"=c(0,1,1,1,0))

if c1 = a and f1 = b, q1 = 2

if c1 = a and f1 = a, q1 = 2

if c1 = b and f1 = a, q1 = 1

if c1 = b and f1 = b, q1 = 0

Here is output of d1;

> d1
  student q1 q2
1       1  2  0
2       2  2  1
3       3  1  2
4       4  1  0
5       5  0  0
>

library(data.table)
dataHAVE=data.frame("student"=c(1,2,3,4,5),
                     "class"=c('a','a','b','c','c'),
                     "qq1scorep"=c(1,1,1,0,0),
                     "qq1scoref"=c(0,1,0,1,0),
                     "qq2scorep"=c(0,0,0,NA,1),
                     "qq2scoref"=c(0,0,NA,NA,1),
                     "qq3scorep"=c(0,0,0,0,1),
                     "qq3scoref"=c(1,1,0,1,1),
                     "qq4scorep"=c(0,0,1,0,1),
                     "qq4scoref"=c(1,1,1,1,1))




dataWANT=data.frame("student"=c(1,2,3,4,5),
                     "class"=c('a','a','b','c','c'),
                     "score1"=c(2,2,2,1,0),
                     "score2"=c(0,0,0,NA,2),
                     "score3"=c(1,1,0,1,2),
                     "score4"=c(1,1,2,0,2))



setDT(dataWANT)[, c(paste0('score',1:4)):= dcast(melt(setDT(dataHAVE), 
                                             measure = patterns('^scorep\d+', '^scoref\d+'), 
                                             value.name = c('p', 'f'))[, q := fifelse(p == 0 & !is.na(c) & f %in% c(0,1), 2, 
                                                                              fifelse(p == 1 & !is.na(c) & f == 1 , 1, 0))], 
                                        student ~ paste0('score',1:4), 
                                        value.var = 'score', fill = 0)[, .c(paste0('score',1:4)]]

Error: unexpected ']' in: " student ~ paste0('score',1:4), value.var = 'score', fill = 0)[, .c(paste0('score',1:4)]"

akrun · Accepted Answer

We can just assign (:=) the subset of 'dataHAVE1' to dataHAVE2 assuming the 'student' are the same in both datasets and in the same order

library(data.table)
setDT(dataHAVE2)[, paste0(names(dataHAVE1)[-1], "x") := lapply(dataHAVE1[-1], 
             function(x) +(x == 'a'))]
dataHAVE2
#   student score class v1x v2x v3x
#1:       1     4    10   1   0   1
#2:       2     6     4   1   1   1
#3:       3     7     3   0   1   1
#4:       4     1    66   0   1  NA
#5:       5     3     2   0   0   0

For the updated example

d1 <- dcast(melt(setDT(dataHAVE1), measure = patterns('^c\d+', 
 '^f\d+'), value.name = c('c', 'f'))[, q := fifelse(c == 'a' & 
  !is.na(c) &  f %in% c('a', 'b'), 2,
    fifelse(c == 'b' & !is.na(c) & f == 'a' , 1, 0))], 
     student ~ paste0("q", variable), value.var = 'q', fill = 0)

d1
#   student q1 q2
#1:       1  2  0
#2:       2  2  1
#3:       3  1  2
#4:       4  1  0
#5:       5  0  0



setDT(dataHAVE2)[, names(d1)[-1]:= d1[, .(q1, q2)]][]
#    student score class q1 q2
#1:       1     4    10  2  0
#2:       2     6     4  2  1
#3:       3     7     3  1  2
#4:       4     1    66  1  0
#5:       5     3     2  0  0

Using two dataframes but one Data Table

Answers (1)

Related Questions