Reputation: 830
I have one huge dataset, which I simplified for this question, and I want to apply a function to each row of it based on one specific column.
I tried a for-loop approach and then did some profiling with Rprof and profvis. I know that I could try an apply-family or some other approach, but the profiling seems to say that the slowest parts are due to other steps.
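For anyone unfamiliar with profvis, here is a minimal, self-contained sketch of that kind of line-level profiling (it only assumes the profvis package is installed; the toy loop is unrelated to the data below):
library(profvis)
# profvis() wraps the code to inspect and opens an interactive flame graph
# showing time spent per line; Rprof()/summaryRprof() give a text version.
profvis({
  x <- numeric(0)
  for (i in 1:10000) x <- c(x, i^2)  # deliberately slow: grows x each iteration
})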
This is what I want to do:
library(dplyr)
# Example data frame
id <- rep(c(1:100), each = 5)
ab <- runif(length(id), 0, 1)
char1 <- runif(length(id), 0, 1)
char2 <- runif(length(id), 0, 1)
dat <- data.frame(id, ab, char1, char2)  # data.frame() directly; cbind() would first coerce everything through a matrix
dat$result <- NA
# Loop
com <- unique(id)
for (k in com) {
  dat_k <- filter(dat, id == k)  # slowest line
  dat_k_dist <- cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower") %>% as.matrix()  # pairwise Gower distances within the group
  num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))  # distance-weighted sums of ab
  denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]  # total ab minus the row's own ab
  dat_k[, "result"] <- as.numeric(num / denom)
  dat[which(dat$id == k), "result"] <- dat_k$result  # 2nd slowest line
}
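In other words, for each id group the loop computes, for every row i, result_i = sum_j d(i, j) * ab_j / (sum(ab) - ab_i), where d is the pairwise Gower distance within the group (the diagonal d(i, i) = 0, so a row never weights itself).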
The slowest parts of my code are the line with filter and the line where I write the result back into the original data frame. I tried replacing filter with subset or which, but it's even slower. So the organization of this code should be improved, but I don't really see how.
Upvotes: 0
Views: 249
Reputation: 4907
I get a minor speedup via lapply:
library(microbenchmark)
microbenchmark(
  OP = for (k in com) {
    dat_k <- filter(dat, id == k)  # slowest line
    dat_k_dist <- cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower") %>% as.matrix()
    num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))
    denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]
    dat_k[, "result"] <- as.numeric(num / denom)
    dat[which(dat$id == k), "result"] <- dat_k$result  # 2nd slowest line
  },
  phiver = for (k in com) {
    dat_k <- dat[id == k, ]  # no need for filter
    dat_k_dist <- cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower") %>% as.matrix()
    num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))
    denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]
    dat_k[, "result"] <- as.numeric(num / denom)
    dat[id == k, "result"] <- dat_k$result  # no need for which
  },
  alex = {
    dat2 <- split(dat, factor(dat$id))
    dat2 <- lapply(dat2, function(l) {
      dat_k_dist <- cluster::daisy(l[, c("char1", "char2")], metric = "gower") %>% as.matrix()
      num <- apply(dat_k_dist, 2, function(x) sum(x * l[, "ab"]))
      denom <- sum(l[, "ab"]) - l[, "ab"]
      l[, "result"] <- as.numeric(num / denom)
      return(l)
    })
    dat$result <- Reduce("c", lapply(dat2, function(l) l$result))
  }
)
Unit: milliseconds
   expr       min        lq      mean    median        uq       max neval cld
     OP 126.72184 129.94344 133.47666 132.11949 134.14558 196.44860   100   c
 phiver  73.78996  77.13434  79.61202  78.21638  79.81958 139.15854   100   b
   alex  67.86450  71.61277  73.26273  72.34813  73.50353  90.31229   100   a
But this is also an embarrassingly parallel problem, so we can parallelize it. Note: this WILL NOT be faster on the example data because of the overhead from the parallel package, but it should be faster on your so-called "huge dataset".
library(parallel)
cl <- makeCluster(detectCores())
dat2 <- split(dat, factor(dat$id))  # same split as above, repeated here so this chunk is self-contained
dat$result <- Reduce("c", parLapply(cl, dat2, fun = function(l) {
  dat_k_dist <- as.matrix(cluster::daisy(l[, c("char1", "char2")], metric = "gower"))
  num <- apply(dat_k_dist, 2, function(x) sum(x * l[, "ab"]))
  denom <- sum(l[, "ab"]) - l[, "ab"]
  return(as.numeric(num / denom))
}))
stopCluster(cl)
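On Unix-alikes (Linux/macOS), the same idea works without any cluster setup via forking; here is a minimal sketch using mclapply from the same parallel package (on Windows, mclapply only runs with mc.cores = 1, i.e. serially):
# Fork-based variant of the parLapply call above; reuses dat2 from the split.
dat$result <- Reduce("c", parallel::mclapply(dat2, function(l) {
  dat_k_dist <- as.matrix(cluster::daisy(l[, c("char1", "char2")], metric = "gower"))
  num <- apply(dat_k_dist, 2, function(x) sum(x * l[, "ab"]))
  denom <- sum(l[, "ab"]) - l[, "ab"]
  as.numeric(num / denom)
}, mc.cores = parallel::detectCores()))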
Upvotes: 2
Reputation: 23608
The following for loop is a bit faster: base subsetting means there is no need for dplyr's filter or for a which statement.
for (k in com) {
  dat_k <- dat[id == k, ]  # no need for filter
  dat_k_dist <- as.matrix(cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower"))  # as.matrix() directly, so %>% (and dplyr) is not needed at all
  num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))
  denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]
  dat_k[, "result"] <- as.numeric(num / denom)
  dat[id == k, "result"] <- dat_k$result  # no need for which
}
Upvotes: 1