mazkaz
mazkaz

Reputation: 31

R's data.table - How to find an integer value and mutiply the values of the subsequent columns together?

Let's say one has the following data.table defined in R

Drug1   Dose1   Freq1   Drug2   Dose2   Freq2   Drug3   Dose3   Freq3
1234567890  2   1   1548768954  23  2   2222132435  2   2
4356678344  2   2   6547894356  3   1   2123456789  2   2
5673452976  4   1   1234567890  4   0.5 4568789076  33  4

How would one search for a particular integer value only in columns "Drug1"-"Drug[x]" and, if found, create a new variable that is the product of the values of the two following columns, corresponding to the same row where the integer value was found (all other values in this new variable should be NA)?

Note: the integer values in "Drug[x]" columns are all of length 10 (e.g. 1234567890, 4593480033, etc.) and the search term of interest is only of length 5, starting with the first 5 digits of the integer (e.g. 12345, 45934, etc).

So if my search term is integer value 12345, the result would look something like this:

Drug1   Dose1   Freq1   Newvar1 Drug2   Dose2   Freq2   Newvar2 Drug3   Dose3   Freq3
1234567890  2   1   2   1548768954  23  2   NA  2222132435  2   2
4356678344  2   2   NA  6547894356  3   1   NA  2123456789  2   2
5673452976  4   1   NA  1234567890  4   0.5 -2  4568789076  33  4

Thank you.

Upvotes: 3

Views: 88

Answers (2)

akrun
akrun

Reputation: 887108

You could try Map

v1 <- grep("Drug", colnames(df))
m1 <- matrix(sort(v1+rep(1:2,each=3)),ncol=3)
df[paste0('NewVar',1:3)] <- Map(function(x,y) {
      x1 <-substr(df[,x],1,5)==12345
     Reduce(`*`,df[y]*(NA^!x1))}, v1, split(m1, col(m1)))
df
#       Drug1 Dose1 Freq1      Drug2 Dose2 Freq2      Drug3 Dose3 Freq3 NewVar1
#1 1234567890     2     1 1548768954    23   2.0 2222132435     2     2       2
#2 4356678344     2     2 6547894356     3   1.0 2123456789     2     2      NA
#3 5673452976     4     1 1234567890     4   0.5 4568789076    33     4      NA
#  NewVar2 NewVar3
#1      NA      NA
#2      NA      NA
#3       2      NA

Update

May be you can try a for loop with data.table

 library(data.table)
 DT <- as.data.table(df)
 nm1 <- grep('Drug', colnames(DT))
 nm2 <- lapply(nm1, function(x) c(x+1,x+2))
 nm3 <- paste0('NewVar', seq_along(nm1))

 for(j in seq_along(nm1)){
     DT[, (nm3[j]):= Reduce(`*`,DT[,nm2[[j]],with=FALSE
         ]*NA^!substr(DT[[nm1[j]]],1,5)==12345)]
  }

 DT
 #        Drug1 Dose1 Freq1      Drug2 Dose2 Freq2      Drug3 Dose3 Freq3 NewVar1
 #1: 1234567890     2     1 1548768954    23   2.0 2222132435     2     2       2
 #2: 4356678344     2     2 6547894356     3   1.0 2123456789     2     2      NA
 #3: 5673452976     4     1 1234567890     4   0.5 4568789076    33     4      NA
  #   NewVar2 NewVar3
  #1:      NA      NA
  #2:      NA      NA
  #3:       2      NA

Or a slightly changed alternative based on the index numbers using @nicola's approach

 DT <- as.data.table(df)
 indx <- 1:3
  for(j in indx){
    DT[, (paste0('NewVar', j)):=  DT[[paste0("Dose",j)]]*
    DT[[paste0("Freq",j)]]*(NA^!substr(DT[[paste0("Drug",j)]],1,5)==12345)]
   }
 DT
 #        Drug1 Dose1 Freq1      Drug2 Dose2 Freq2      Drug3 Dose3 Freq3 NewVar1
 #1: 1234567890     2     1 1548768954    23   2.0 2222132435     2     2       2
 #2: 4356678344     2     2 6547894356     3   1.0 2123456789     2     2      NA
 #3: 5673452976     4     1 1234567890     4   0.5 4568789076    33     4      NA
 #   NewVar2 NewVar3
 #1:      NA      NA
 #2:      NA      NA
 #3:       2      NA

data

df <- structure(list(Drug1 = c(1234567890, 4356678344, 5673452976), 
Dose1 = c(2L, 2L, 4L), Freq1 = c(1L, 2L, 1L), Drug2 = c(1548768954, 
6547894356, 1234567890), Dose2 = c(23L, 3L, 4L), Freq2 = c(2, 
1, 0.5), Drug3 = c(2222132435, 2123456789, 4568789076), Dose3 = c(2L, 
2L, 33L), Freq3 = c(2L, 2L, 4L)), .Names = c("Drug1", "Dose1", 
"Freq1", "Drug2", "Dose2", "Freq2", "Drug3", "Dose3", "Freq3"
), class = "data.frame", row.names = c(NA, -3L))

Upvotes: 3

shadow
shadow

Reputation: 22293

If you really just have 3 drugs, you can just manually repeat the creation of Newvar three times and then rearrange the columns:

drug.id <- 12345
df[, 'Newvar1'] <- ifelse(abs(df[, 'Drug1'] - drug.id*100000)<100000, 
                          df[, 'Dose1'] * df[, 'Freq1'], 
                          NA)

However, if this was just an example and your real data has more drugs, it will be easier to reshape the data to long format first and then do your calculations there. If you must, you can always go back to wide format.

# read data
df <- read.table(text='Drug1   Dose1   Freq1   Drug2   Dose2   Freq2   Drug3   Dose3   Freq3
1234567890  2   1   1548768954  23  2   2222132435  2   2
4356678344  2   2   6547894356  3   1   2123456789  2   2
5673452976  4   1   1234567890  4   0.5 4568789076  33  4', header=TRUE)
# reshape to long format
long.df <- reshape(df, 
                   direction = 'long', 
                   varying = list(paste0('Drug', 1:3), 
                                  paste0('Dose', 1:3), 
                                  paste0('Freq', 1:3)), 
                   v.names = c('Drug', 'Dose', 'Freq'),
                   sep = '')
# calculation of Newvar
drug.id <- 12345
long.df[, 'Newvar'] <- ifelse(abs(long.df[, 'Drug'] - drug.id*100000)<100000, 
                              long.df[, 'Dose'] * long.df[, 'Freq'], 
                              NA)
# back to wide format
wide.df <- reshape(long.df, 
                   direction = 'wide', 
                   timevar = 'time', 
                   idvar = 'id', 
                   sep = '')
wide.df

Upvotes: 0

Related Questions