hyhno01
hyhno01

Reputation: 177

R apply function on DF rowwise if condition is met

Assume the following dataframe and functions:

cond    var_x    var_y
cond1       2        3
cond1       3        0.1
cond2       2        2
cond2       1        0.2
my_fun1 <- function(x) {
  act_vec <- rnorm(500, x, x/4)
  RT = 0
  for (i in 1:length(act_vec)) {
      my_act = act_vec[i]
      my_rt = my_act
      RT = RT + my_rt
  }
  RT = RT/500
  return(RT)
} 
my_fun2 <- function(x,y) {
  act_vec1 <- rnorm(500, x, x/4)
  act_vec2 <- rnorm(500, y, y/4)
  RT = 0
  for (i in 1:length(act_vec1)) {
      my_act1 = act_vec1[i]
      my_act2 = act_vec1[i]*act_vec2[i]
      my_rt = min(my_act1, my_act2)
      RT = RT + my_rt
  }
  RT = RT/500
  return(RT)
} 

The following code applies the function my_fun1 to every row in DF if DF$cond == 'cond1' and otherwise my_fun2.

my_test_vec = c()
for (i in (1:nrow(DF))) {
  if (DF$cond[i] == 'cond1') {
    my_test = my_fun1(DF$var_x[i])
  } else {
    my_test = my_fun2(DF$var_x[i], DF$var_y[i])
  }
  my_test_vec = c(my_test_vec, my_test)
}

This for-loop, however, is computational inefficient. Because of that, I would like to implement it in a way such that all the operations are applied in one go. My approach so far doesn't work and looks like this:

DF$results <- ifelse(DF$cond == 'cond1', 
                           my_fun1(DF$var_x), 
                           my_fun2(DF$var_x, DF$var_y))

Any suggestions?

Upvotes: 1

Views: 88

Answers (1)

jay.sf
jay.sf

Reputation: 72683

You should improve your knowledge of the vectorization of R. Below are vectorized versions of your functions. First, the code applies vector arithmetic within the functions. Second, also the whole functions are vectorized; Vectorize() makes sure that they can work with vectors as input as it is the case in anifelse().

my_fun1A <- Vectorize(function(x) {
  act_vec <- rnorm(500, x, x/4)
  RT <- sum(act_vec) / 500
  return(RT)
}) 

my_fun2A <- Vectorize(function(x, y) {
  act_vec1 <- rnorm(500, x, x/4)
  act_vec2 <- rnorm(500, y, y/4)
  RT <- sum(pmin(act_vec1, act_vec1*act_vec2)) / 500
  return(RT)
}) 

Result

set.seed(42)
resA <- ifelse(DF$cond == 'cond1', my_fun1A(DF$var_x), my_fun2A(DF$var_x, DF$var_y))
resA
# [1] 1.9849769 2.9837980 1.9772901 0.2028555

Comparison with original version

Vectorizeing your original versions with the for loops inside:

my_fun1v <- Vectorize(my_fun1)
my_fun2v <- Vectorize(my_fun2)
set.seed(42)
res <- ifelse(DF$cond == 'cond1', my_fun1v(DF$var_x), my_fun2v(DF$var_x, DF$var_y))
stopifnot(all.equal(res, resA))  ## same result

Benchmark

fun <- function() ifelse(DF$cond == 'cond1', my_fun1v(DF$var_x), my_fun2v(DF$var_x, DF$var_y))
funA <- function() ifelse(DF$cond == 'cond1', my_fun1A(DF$var_x), my_fun2A(DF$var_x, DF$var_y))
set.seed(42)
DF <- DF[sample(1:nrow(DF), 1e4, replace=T), ]

set.seed(42)
microbenchmark::microbenchmark(fun(), funA(), times=3)
# Unit: seconds
# expr      min       lq     mean   median       uq      max neval cld
#  fun() 5.319736 5.352284 5.392068 5.384831 5.428234 5.471636     3   b
# funA() 1.793795 1.793863 1.835880 1.793931 1.856923 1.919914     3  a 

We can clearly see an improvement of over 60%.


Data:

DF <- read.table(header=T, text="cond    var_x    var_y
cond1       2        3
cond1       3        0.1
cond2       2        2
cond2       1        0.2")

Upvotes: 2

Related Questions