optimisation and parallelisation of the for loop

Question

I got some problem with my code. First of all,i it's, I think, a double code, as one is a set of the functions, which are used for the main code which is a nested for loop, smth like this:

hijos <- function(con, clasification, element){
  example <- sprintf("(select hcodniv2 from (select pcodniv1, hcodniv1, pcodniv2, hcodniv2, level from stock.clasificaciones where pcodniv1=%d and hcodniv1=pcodniv1 start with pcodniv2=%d connect by prior hcodniv2=pcodniv2)) minus (select pcodniv2 from (select pcodniv1, hcodniv1, pcodniv2, hcodniv2, level from stock.clasificaciones where pcodniv1=%d and hcodniv1=pcodniv1 start with pcodniv2=%d connect by prior hcodniv2=pcodniv2))", clasification, element, clasification, element)
  codhijos <- dbSendQuery(con, example)
  hijos_df <- fetch(codhijos, n=-1)
  if (length(hijos_df$HCODNIV2) == 0) {
    hijos_df <- data.frame("HCODNIV2" = element)
  }
  dbClearResult(codhijos)
  hijos_df
}

hijos_string <- function(con, clasification, element) {
  df1 <- hijos(con, clasification, element)
  df1 <- paste(df1$HCODNIV2,collapse=",")
  df1
}


resultado <- function(datos_inversion, classProduct, classBranch, year, product, branch, df1, df2) {
year_1 = year - 1
df5 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 76 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year_1, df1, df2))
df4 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year_1, df1, df2))
df3 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year, df1, df2))
df2 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 76 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year, df1, df2))
df7 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 = %d and CODNIV5= %d",classProduct, classBranch, year, product, branch))
df6 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 = %d and CODNIV5= %d",classProduct, classBranch,  year_1, product, branch))
colnames(df5)[1] <- 'VALOR_IR14'
colnames(df4)[1] <- 'VALOR_IN14'
colnames(df3)[1] <- 'VALOR_IN15'
colnames(df2)[1] <- 'VALOR_IR15'
colnames(df6)[1] <- 'VALOR_IN14_tot'
colnames(df7)[1] <- 'VALOR_IN15_tot'
colnames(df5)[3] <- 'CODUNIDAD_IR'
colnames(df2)[3] <- 'CODUNIDAD_IR'
colnames(df3)[3] <- 'CODUNIDAD_IN'
colnames(df4)[3] <- 'CODUNIDAD_IN'
df6 <- cbind(df6, df7[!names(df7) %in% names(df6)])
df5 <- cbind(df5, df2[!names(df2) %in% names(df5)])
df4 <- cbind(df4, df3[!names(df3) %in% names(df4)])
newdf <- cbind(df5, df4[!names(df4) %in% names(df5)])
newdf$RES1p = log(newdf$VALOR_IR14)
newdf$RES1p_15 = log(newdf$VALOR_IR15)
newdf$Res1p_tot = newdf$RES1p_15 - newdf$RES1p
newdf$Res2p = newdf$VALOR_IN14/ df6$VALOR_IN14
newdf$Res2p_15 = newdf$VALOR_IN15/ df6$VALOR_IN15
newdf$Res2p_tot = (newdf$Res2p + newdf$Res2p_15)/ 2
newdf$RESULTADO = newdf$Res1p_tot * newdf$Res2p_tot
df_resultados = sum(newdf$RESULTADO, na.rm = TRUE)
data_frame <- data.frame("ANYO" = year, "RESULTADO" = df_resultados, "PRODUCTO" = product, "RAMA" = branch, "CLASE_PR" = classProduct, "CLASE_R" = classBranch) 
data_frame
}

These functions are just making calculations from the data from the data frame from the Oracle.

The code itself this one:

x = 1965:2015
d = 2000
g = 1000
y = c(2026,2017,2019,2023)
z = c(1034,1002,1056,1057)
start <- proc.time() # Start clock
dat <- data.frame()
for (year in x) {
  for (classProduct in d){
   for (product in y) {
      prod_string <- hijos_string(con, classProduct, product)
  product_df <- c(product, prod_string)
  product_df <- paste(product_df, collapse = ",")
  for (classBranch in g){
    for (branch in z) {
      branch_string <- hijos_string(con, classBranch, branch)
      branch_df <- c(branch, branch_string)
      branch_df <- paste(branch_df, collapse = ",")

      consulta <- sprintf("select * from stock.V_CALCULOS where  CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and CODNIV3 in (%s) and CODNIV5 in (%s) order by codniv1, codniv2, codniv3, codniv4, codniv5, codlugar, codunidad, anyo",classProduct, classBranch, product_df, branch_df)
      resuni <- dbSendQuery(con, consulta)
      datos_inversion <- fetch(resuni, n=-1)
      dbClearResult(resuni)

      df<-resultado(datos_inversion, classProduct, classBranch, year, product, branch, prod_string, branch_string)
      dat <- rbind(dat, df)
        }
      }
    }
  }
}  
print(dat)
time_elapsed_parallel <- proc.time() - start # End clock
time_elapsed_parallel

dbDisconnect(con)

So, my problem is.. I need somehow to parallelize this code to work with multiple cores at the same time. Meanwhile the packages pbdR, multicore y etc are not available for the version of RStudio I'm using (3.4.4).
I'm thinking to optimize the for loop in order to use the parallel code more efficient, but have no idea how to do it.

So, finally, my question: Any ideas how to optimize the for loop and after how to parallelize it, in plan (maybe) how to split it or what to do for it to be executed with 4 cores at the same time and save the time, bc its take forever to execute and I need as less as possible (the data frame initial has more than 250 000 rows)? I use Windows virtual machine with 4 cores

Thanks for the help, in advance!

PS. it's precise to have the result added by rows in a final data frame

r2evans · Accepted Answer

Iteratively building a data.frame row-by-row scales horribly. Instead, I suggest creating each iteration's output into a list and then bind at the end. I don't have your data, so this is untested. I'm also going to assume that Matt's comment about 1,632 SQL queries is unavoidable (because that's likely a big bottleneck).

Bottom line: your problem areas are likely (1) thousand-plus SQL queries, and (2) iteratively building your results with rbind.

This pre-builds all of the iterations within your nested for loops.

eg <- expand.grid(Year=x, classProduct=d, product=y, classBranch=g, branch=z)

Now let's add the _df and _string variables; I'm using dplyr here for the grouping and such, though there are other ways (both base-R and data.table):

library(dplyr)
eg <- eg %>%
  group_by(classProduct, product) %>%
  mutate(
    prod_string = hijos_string(con, classProduct[1], product[1]),
    product_df = paste(product, prod_string[1], sep=",")
  ) %>%
  group_by(classBranch, branch) %>%
  mutate(
    branch_string = hijos_string(con, classProduct[1], branch[1]),
    branch_df = paste(branch, branch_string[1], sep=",")
  ) %>%
  ungroup()

This is still inefficient by Matt's comment, but far less inefficient from the iteratively-adding-rows perspective:

lst <- lapply(seq_len(nrow(eg)), function(i) {
  consulta <- sprintf("select * from stock.V_CALCULOS where  CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and CODNIV3 in (%s) and CODNIV5 in (%s) order by codniv1, codniv2, codniv3, codniv4, codniv5, codlugar, codunidad, anyo",
                      eg$classProduct[i], eg$classBranch[i], eg$product_df[i], eg$branch_df[i])
  resuni <- dbSendQuery(con, consulta)
  datos_inversion <- fetch(resuni, n=-1)
  dbClearResult(resuni)
  resultado(datos_inversion, eg$classProduct[i], eg$classBranch[i], eg$year[i], eg$product[i],
            eg$branch[i], eg$prod_string[i], eg$branch_string[i])
})
dat <- do.call(rbind, lst)

Again, I don't know if this'll work perfectly, but the rationale is sound and (I think) relatively straight-forward.

optimisation and parallelisation of the for loop

Answers (1)

Related Questions