Reputation: 31
Let's say one has the following data.table defined in R
Drug1 Dose1 Freq1 Drug2 Dose2 Freq2 Drug3 Dose3 Freq3
1234567890 2 1 1548768954 23 2 2222132435 2 2
4356678344 2 2 6547894356 3 1 2123456789 2 2
5673452976 4 1 1234567890 4 0.5 4568789076 33 4
How would one search for a particular integer value only in columns "Drug1"-"Drug[x]" and, if found, create a new variable that is the product of the values of the two following columns, corresponding to the same row where the integer value was found (all other values in this new variable should be NA)?
Note: the integer values in "Drug[x]" columns are all of length 10 (e.g. 1234567890, 4593480033, etc.) and the search term of interest is only of length 5, starting with the first 5 digits of the integer (e.g. 12345, 45934, etc).
So if my search term is integer value 12345, the result would look something like this:
Drug1 Dose1 Freq1 Newvar1 Drug2 Dose2 Freq2 Newvar2 Drug3 Dose3 Freq3
1234567890 2 1 2 1548768954 23 2 NA 2222132435 2 2
4356678344 2 2 NA 6547894356 3 1 NA 2123456789 2 2
5673452976 4 1 NA 1234567890 4 0.5 -2 4568789076 33 4
Thank you.
Upvotes: 3
Views: 88
Reputation: 887108
You could try Map
v1 <- grep("Drug", colnames(df))
m1 <- matrix(sort(v1+rep(1:2,each=3)),ncol=3)
df[paste0('NewVar',1:3)] <- Map(function(x,y) {
x1 <-substr(df[,x],1,5)==12345
Reduce(`*`,df[y]*(NA^!x1))}, v1, split(m1, col(m1)))
df
# Drug1 Dose1 Freq1 Drug2 Dose2 Freq2 Drug3 Dose3 Freq3 NewVar1
#1 1234567890 2 1 1548768954 23 2.0 2222132435 2 2 2
#2 4356678344 2 2 6547894356 3 1.0 2123456789 2 2 NA
#3 5673452976 4 1 1234567890 4 0.5 4568789076 33 4 NA
# NewVar2 NewVar3
#1 NA NA
#2 NA NA
#3 2 NA
May be you can try a for
loop with data.table
library(data.table)
DT <- as.data.table(df)
nm1 <- grep('Drug', colnames(DT))
nm2 <- lapply(nm1, function(x) c(x+1,x+2))
nm3 <- paste0('NewVar', seq_along(nm1))
for(j in seq_along(nm1)){
DT[, (nm3[j]):= Reduce(`*`,DT[,nm2[[j]],with=FALSE
]*NA^!substr(DT[[nm1[j]]],1,5)==12345)]
}
DT
# Drug1 Dose1 Freq1 Drug2 Dose2 Freq2 Drug3 Dose3 Freq3 NewVar1
#1: 1234567890 2 1 1548768954 23 2.0 2222132435 2 2 2
#2: 4356678344 2 2 6547894356 3 1.0 2123456789 2 2 NA
#3: 5673452976 4 1 1234567890 4 0.5 4568789076 33 4 NA
# NewVar2 NewVar3
#1: NA NA
#2: NA NA
#3: 2 NA
Or a slightly changed alternative based on the index
numbers using @nicola's approach
DT <- as.data.table(df)
indx <- 1:3
for(j in indx){
DT[, (paste0('NewVar', j)):= DT[[paste0("Dose",j)]]*
DT[[paste0("Freq",j)]]*(NA^!substr(DT[[paste0("Drug",j)]],1,5)==12345)]
}
DT
# Drug1 Dose1 Freq1 Drug2 Dose2 Freq2 Drug3 Dose3 Freq3 NewVar1
#1: 1234567890 2 1 1548768954 23 2.0 2222132435 2 2 2
#2: 4356678344 2 2 6547894356 3 1.0 2123456789 2 2 NA
#3: 5673452976 4 1 1234567890 4 0.5 4568789076 33 4 NA
# NewVar2 NewVar3
#1: NA NA
#2: NA NA
#3: 2 NA
df <- structure(list(Drug1 = c(1234567890, 4356678344, 5673452976),
Dose1 = c(2L, 2L, 4L), Freq1 = c(1L, 2L, 1L), Drug2 = c(1548768954,
6547894356, 1234567890), Dose2 = c(23L, 3L, 4L), Freq2 = c(2,
1, 0.5), Drug3 = c(2222132435, 2123456789, 4568789076), Dose3 = c(2L,
2L, 33L), Freq3 = c(2L, 2L, 4L)), .Names = c("Drug1", "Dose1",
"Freq1", "Drug2", "Dose2", "Freq2", "Drug3", "Dose3", "Freq3"
), class = "data.frame", row.names = c(NA, -3L))
Upvotes: 3
Reputation: 22293
If you really just have 3 drugs, you can just manually repeat the creation of Newvar
three times and then rearrange the columns:
drug.id <- 12345
df[, 'Newvar1'] <- ifelse(abs(df[, 'Drug1'] - drug.id*100000)<100000,
df[, 'Dose1'] * df[, 'Freq1'],
NA)
However, if this was just an example and your real data has more drugs, it will be easier to reshape the data to long format first and then do your calculations there. If you must, you can always go back to wide format.
# read data
df <- read.table(text='Drug1 Dose1 Freq1 Drug2 Dose2 Freq2 Drug3 Dose3 Freq3
1234567890 2 1 1548768954 23 2 2222132435 2 2
4356678344 2 2 6547894356 3 1 2123456789 2 2
5673452976 4 1 1234567890 4 0.5 4568789076 33 4', header=TRUE)
# reshape to long format
long.df <- reshape(df,
direction = 'long',
varying = list(paste0('Drug', 1:3),
paste0('Dose', 1:3),
paste0('Freq', 1:3)),
v.names = c('Drug', 'Dose', 'Freq'),
sep = '')
# calculation of Newvar
drug.id <- 12345
long.df[, 'Newvar'] <- ifelse(abs(long.df[, 'Drug'] - drug.id*100000)<100000,
long.df[, 'Dose'] * long.df[, 'Freq'],
NA)
# back to wide format
wide.df <- reshape(long.df,
direction = 'wide',
timevar = 'time',
idvar = 'id',
sep = '')
wide.df
Upvotes: 0