AMfrn
AMfrn

Reputation: 11

R - How can I make this loop run faster?

The for loop below iterates over nodes in an igraph graph. There are 2048 of these, so it is very slow. I've tried to code as efficiently as possible (for example, by not growing vectors). How can I make the loop run faster?

Edit: I've also thought about writing this in C++ via Rcpp. I just don't know how I would use igraph in that case.

Edit 2: compatible_models actually depends on child_node. What I gave here is an example of what it could be for a particular value of child_node.

library(igraph)
library(Metrics)

set.seed(1234)
N <- 10000
A <- rnorm(N, 10, 2)
B <- rnorm(N, 9, 2)
C <- rnorm(N, 12, 1)
D <- rnorm(N, 7, 3)
Y <- A + B + A*B + D + A^2 + rnorm(N)
data <- data.frame(Y = Y, A = A, B = B, C = C, D = D)
partition <- sort(sample(N, 0.7*N))
data_train <- data[partition, ]
data_test <- data[-partition, ]

g <- make_empty_graph()
g <- g + vertices(1:2049)

generate_edges <- function(start_vertex, end_vertices) {
  edges <- c()
  for (i in 1:length(end_vertices)) {
    edges <- c(edges, start_vertex, end_vertices[i])
  }
  return(edges)
}

outward_edges <- generate_edges(V(g)[1], V(g)[2:vcount(g)])
g <- g + edges(outward_edges, attr1 = rep(0, length(outward_edges) / 2), attr2 = rep(0, length(outward_edges) / 2))


successors <- matrix(nrow = length(g[[1, ]][[1]]), ncol = 9) # nrow = number of successors of node 1
i <- 1

for (child_node in 2:2049) {
   # compatible_models <- lapply(...) # suppose this is a list of "formula" objects
   # like: 
   compatible_models <- list(Y ~ A + B + C, Y ~ I(A^2) + B + C + D, Y ~ B + D)
   compatible_models <- lapply(compatible_models, lm, data = data_train)
   predictions <- sapply(compatible_models, predict, newdata = data_test)
   successors[i, 1:3] <- c(edge_attr(g, name = "attr1", g[[1, V(g)[child_node], edges = TRUE]]), edge_attr(g, name = "attr2", g[[1, V(g)[child_node], edges = TRUE]]), 
                                     sum(apply(predictions, 2, rmse, actual = data_test$Y))/length(compatible_models))

   i <- i + 1

  }




Upvotes: 1

Views: 91

Answers (1)

user12256545
user12256545

Reputation: 3002

Correct me if i am wrong but i think you could evaluate the first three lines (or any lines that build model objects, but do not evaluate anything) outside of the loop, which ~ triples the performance of the code on my machine:

successors <- matrix(nrow = length(g[[1, ]][[1]]), ncol = 9) # nrow = number of successors of node 1

i <- 1

start_time <- Sys.time()
for (child_node in 2:2049) {
# build models inside loop:
  compatible_models <- list(Y ~ A + B + C, Y ~ I(A^2) + B + C + D, Y ~ B + D)
  compatible_models <- lapply(compatible_models, lm, data = data_train)
  predictions <- sapply(compatible_models, predict, newdata = data_test)
  successors[i, 1:3] <- c(edge_attr(g, name = "attr1", g[[1, V(g)[child_node], edges = TRUE]]), edge_attr(g, name = "attr2", g[[1, V(g)[child_node], edges = TRUE]]), 
                          sum(apply(predictions, 2, rmse, actual = data_test$Y))/length(compatible_models))
  
  i <- i + 1
  
}
Sys.time()-start_time
#Time difference of 26.69914 secs

Optimized code with model creation outside of loop:



## model building:
compatible_models <- list(Y ~ A + B + C, Y ~ I(A^2) + B + C + D, Y ~ B + D)
compatible_models <- lapply(compatible_models, lm, data = data_train)
predictions <- sapply(compatible_models, predict, newdata = data_test)
## initialisation:
successors2 <- matrix(nrow = length(g[[1, ]][[1]]), ncol = 9) 
i <- 1

start_time <- Sys.time()
for (child_node in 2:2049) {

  successors2[i, 1:3] <- c(edge_attr(g, name = "attr1", g[[1, V(g)[child_node], edges = TRUE]]), edge_attr(g, name = "attr2", g[[1, V(g)[child_node], edges = TRUE]]), 
                          sum(apply(predictions, 2, rmse, actual = data_test$Y))/length(compatible_models))
  
  i <- i + 1
  
}
Sys.time()-start_time
#Time difference of 8.885826 secs

all.equal(successors,successors2)
# [1] TRUE

Upvotes: 1

Related Questions