Reputation: 3384
dataHAVE1=data.frame("student"=c(1,2,3,4,5),
"v1"=c('a','a','b','b','b'),
"v2"=c('b','a','a','a','b'),
"v3"=c('a','a','a',NA,'b'))
dataHAVE2=data.frame("student"=c(1,2,3,4,5),
"score"=c(4,6,7,1,3),
"class"=c(10,4,3,66,2))
dataWANT=data.frame("student"=c(1,2,3,4,5),
"score"=c(4,6,7,1,3),
"class"=c(10,4,3,66,2),
"v1x"=c(1,1,0,0,0),
"v2x"=c(0,1,1,1,0),
"v3x"=c(1,1,1,NA,0))
setDT(dataHAVE1)
dataHAVE2=cbind(dataHAVE2, dataHAVE1[, c("v1","v2","v3") := lapply(.SD, function(x) as.integer(x =='a')), .SDcols = paste0(c("v1","v2","v3"))]
I have 'dataHAVE1' and 'dataHAVE2'. I hope not to merge because these are such big files. I want to add v1-v3 from dataHAVE1 to dataHAVE2 but I want to clean them before adding them and I do not want to clean them in 'dataHAVE1' and then bring them over. The rule is: if v1-v3 = a then v1x-v3x = 1, if b then 0.
say this process were repeated but the new variables q1 and q2 depend on c1/f1 and c2/f2
dataHAVE1=data.frame("student"=c(1,2,3,4,5),
"c1"=c('a','a','b','b','b'),
"f1"=c('b','a','a','a','b'),
"c2"=c('b','b','a',NA,'b'),
"f2"=c('b','a','a','a','b'))
dataHAVE2=data.frame("student"=c(1,2,3,4,5),
"score"=c(4,6,7,1,3),
"class"=c(10,4,3,66,2))
dataWANT=data.frame("student"=c(1,2,3,4,5),
"score"=c(4,6,7,1,3),
"class"=c(10,4,3,66,2),
"q1"=c(2,2,1,1,0),
"q2"=c(0,1,1,1,0))
if c1 = a and f1 = b, q1 = 2
if c1 = a and f1 = a, q1 = 2
if c1 = b and f1 = a, q1 = 1
if c1 = b and f1 = b, q1 = 0
Here is output of d1;
> d1
student q1 q2
1 1 2 0
2 2 2 1
3 3 1 2
4 4 1 0
5 5 0 0
>
library(data.table)
dataHAVE=data.frame("student"=c(1,2,3,4,5),
"class"=c('a','a','b','c','c'),
"qq1scorep"=c(1,1,1,0,0),
"qq1scoref"=c(0,1,0,1,0),
"qq2scorep"=c(0,0,0,NA,1),
"qq2scoref"=c(0,0,NA,NA,1),
"qq3scorep"=c(0,0,0,0,1),
"qq3scoref"=c(1,1,0,1,1),
"qq4scorep"=c(0,0,1,0,1),
"qq4scoref"=c(1,1,1,1,1))
dataWANT=data.frame("student"=c(1,2,3,4,5),
"class"=c('a','a','b','c','c'),
"score1"=c(2,2,2,1,0),
"score2"=c(0,0,0,NA,2),
"score3"=c(1,1,0,1,2),
"score4"=c(1,1,2,0,2))
setDT(dataWANT)[, c(paste0('score',1:4)):= dcast(melt(setDT(dataHAVE),
measure = patterns('^scorep\\d+', '^scoref\\d+'),
value.name = c('p', 'f'))[, q := fifelse(p == 0 & !is.na(c) & f %in% c(0,1), 2,
fifelse(p == 1 & !is.na(c) & f == 1 , 1, 0))],
student ~ paste0('score',1:4),
value.var = 'score', fill = 0)[, .c(paste0('score',1:4)]]
Error: unexpected ']' in: " student ~ paste0('score',1:4), value.var = 'score', fill = 0)[, .c(paste0('score',1:4)]"
Upvotes: 1
Views: 50
Reputation: 887153
We can just assign (:=
) the subset of 'dataHAVE1' to dataHAVE2
assuming the 'student' are the same in both datasets and in the same order
library(data.table)
setDT(dataHAVE2)[, paste0(names(dataHAVE1)[-1], "x") := lapply(dataHAVE1[-1],
function(x) +(x == 'a'))]
dataHAVE2
# student score class v1x v2x v3x
#1: 1 4 10 1 0 1
#2: 2 6 4 1 1 1
#3: 3 7 3 0 1 1
#4: 4 1 66 0 1 NA
#5: 5 3 2 0 0 0
For the updated example
d1 <- dcast(melt(setDT(dataHAVE1), measure = patterns('^c\\d+',
'^f\\d+'), value.name = c('c', 'f'))[, q := fifelse(c == 'a' &
!is.na(c) & f %in% c('a', 'b'), 2,
fifelse(c == 'b' & !is.na(c) & f == 'a' , 1, 0))],
student ~ paste0("q", variable), value.var = 'q', fill = 0)
d1
# student q1 q2
#1: 1 2 0
#2: 2 2 1
#3: 3 1 2
#4: 4 1 0
#5: 5 0 0
setDT(dataHAVE2)[, names(d1)[-1]:= d1[, .(q1, q2)]][]
# student score class q1 q2
#1: 1 4 10 2 0
#2: 2 6 4 2 1
#3: 3 7 3 1 2
#4: 4 1 66 1 0
#5: 5 3 2 0 0
Upvotes: 2