89_Simple
89_Simple

Reputation: 3805

R: using dplyr to do calculation row-wise

I have a vector:

vec <- c(44,0,13,18,32,13,25,42,13,24)

I want to calculate fT as follows:

fT <- ifelse(vec >= 10 & vec <= 20, min(vec) - max(vec), 
         ifelse(vec > 20 & vec <= 50, max(vec) - min(vec),0))

I want to extent this calculation for each row of a dataframe i.e. I have a dataframe and I want to calculate fT for each row.

A sample data:

dat <- data.frame(replicate(10,sample(0:50,1000,rep=TRUE)))

That means I will have another dataframe which will have the fT value for each value in dat.

To calculate fT for each row, I thought of using dplyr,

dat%>%
  rowwise()%>%
   mutate(fT = ifelse(dat[,1:10] >= 10 & dat[,1:10] <= 30, min(dat[,1:10]) - max(dat[,1:10]), 
                  ifelse(dat[,1:10] > 30 & dat[,1:10] <= 50, max(dat[,1:10]) - min(dat[,1:10]),0)))

I am stuck at this stage. I do not know how to index by column so that for each row of dat, I have a fT.

Upvotes: 2

Views: 4507

Answers (2)

acylam
acylam

Reputation: 18661

If you want the sums of fT, you can do this with apply:

dat$fT = apply(dat, 1, function(x) sum(ifelse(x >= 10 & x <= 20, min(x) - max(x), 
                                              ifelse(x > 20 & x <= 50, max(x) - min(x),0))))

Result:

  X1 X2 X3 X4 X5 X6 X7 X8 X9 X10  fT
1 14 13  8 10 15 12 22 47 29  40 -39
2 40 30  7 48 42 50 20 30 24  44 301
3 20  8  7 19 30 36 18  4 37  12 -33
4 45 43 26 31 41 33 26 43 11  28 272
5 47 43 25  9 14 12  3  1 38  46 138
6  2 24 31 33  7  4 36 41 42   0 252

Note:

1 in apply specifies the row margin. This loops through the rows of the input, dat, and output a single sum of fT for each row.

Edit:

If you actually want the value of fT (not the sum), you can still use apply, but wrap the output with matrix and specify ncol=10 and byrow=TRUE. This means that you want an output matrix with 10 columns (just like dat) and fill the matrix rowwise with the output of apply:

new_dat = matrix(apply(dat, 1, 
                       function(x) ifelse(x >= 10 & x <= 20, min(x) - max(x), 
                                          ifelse(x > 20 & x <= 50, max(x) - min(x),0))),
                 ncol = 10, byrow = TRUE)

Result:

> head(new_dat)
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,]  -39  -39    0  -39  -39  -39   39   39   39    39
[2,]   43   43    0   43   43   43  -43   43   43    43
[3,]  -33    0    0  -33   33   33  -33    0   33   -33
[4,]   34   34   34   34   34   34   34   34  -34    34
[5,]   46   46   46    0  -46  -46    0    0   46    46
[6,]    0   42   42   42    0    0   42   42   42     0

If you prefer sticking to dplyr, you can first transpose your dat and map it on the "columns", then transpose back:

library(dplyr)
library(purrr)

dat %>%
  transpose() %>%
  map_dfr(~ ifelse(. >= 10 & . <= 20, min(.) - max(.), 
                  ifelse(. > 20 & . <= 50, max(.) - min(.),0))) %>%
  transpose()

Result:

> head(new_dat2)
   V1  V2 V3  V4  V5  V6  V7 V8  V9 V10
1 -39 -39  0 -39 -39 -39  39 39  39  39
2  43  43  0  43  43  43 -43 43  43  43
3 -33   0  0 -33  33  33 -33  0  33 -33
4  34  34 34  34  34  34  34 34 -34  34
5  46  46 46   0 -46 -46   0  0  46  46
6   0  42 42  42   0   0  42 42  42   0

Note:

The advantage of using transpose instead of t in Base R is that you get a data.frame after transposing instead of a matrix.

Data:

set.seed(123)
dat <- data.frame(replicate(10,sample(0:50,1000,rep=TRUE)))

Upvotes: 4

akrun
akrun

Reputation: 886938

Here is one option with pmax/pmin which would be efficient

m1 <- (do.call(pmax, dat) - do.call(pmin, dat))[row(dat)] 
out <-  (-1*m1 *(dat >=10 & dat <=20)) +   (m1*(dat > 20 & dat <=50))
all.equal(new_dat, out, check.attributes = FALSE)   
#[1] TRUE

Benchmarks

set.seed(24)
dat <- data.frame(replicate(500,sample(0:50,15000,rep=TRUE)))

system.time({
new_dat = matrix(apply(dat, 1, 
                       function(x) ifelse(x >= 10 & x <= 20, min(x) - max(x), 
                                          ifelse(x > 20 & x <= 50, max(x) - min(x),0))),
                 ncol =  ncol(dat), byrow = TRUE)
  })
#user  system elapsed 
#   2.67    0.10    2.77 
system.time({
   m1 <- (do.call(pmax, dat) - do.call(pmin, dat))[row(dat)] 
   out <-  (-1*m1 *(dat >=10 & dat <=20)) +   (m1*(dat > 20 & dat <=50))
 })
#  user  system elapsed 
#   0.48    0.11    0.60 


#all.equal(new_dat, out, check.attributes = FALSE)   
#[1] TRUE

Upvotes: 0

Related Questions