Reputation: 15
I got some problem with my code. First of all,i it's, I think, a double code, as one is a set of the functions, which are used for the main code which is a nested for loop, smth like this:
hijos <- function(con, clasification, element){
example <- sprintf("(select hcodniv2 from (select pcodniv1, hcodniv1, pcodniv2, hcodniv2, level from stock.clasificaciones where pcodniv1=%d and hcodniv1=pcodniv1 start with pcodniv2=%d connect by prior hcodniv2=pcodniv2)) minus (select pcodniv2 from (select pcodniv1, hcodniv1, pcodniv2, hcodniv2, level from stock.clasificaciones where pcodniv1=%d and hcodniv1=pcodniv1 start with pcodniv2=%d connect by prior hcodniv2=pcodniv2))", clasification, element, clasification, element)
codhijos <- dbSendQuery(con, example)
hijos_df <- fetch(codhijos, n=-1)
if (length(hijos_df$HCODNIV2) == 0) {
hijos_df <- data.frame("HCODNIV2" = element)
}
dbClearResult(codhijos)
hijos_df
}
hijos_string <- function(con, clasification, element) {
df1 <- hijos(con, clasification, element)
df1 <- paste(df1$HCODNIV2,collapse=",")
df1
}
resultado <- function(datos_inversion, classProduct, classBranch, year, product, branch, df1, df2) {
year_1 = year - 1
df5 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 76 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year_1, df1, df2))
df4 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year_1, df1, df2))
df3 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year, df1, df2))
df2 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 76 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 in (%s) and CODNIV5 in (%s)",classProduct, classBranch, year, df1, df2))
df7 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 = %d and CODNIV5= %d",classProduct, classBranch, year, product, branch))
df6 <- sqldf(sprintf("select * from datos_inversion where CODUNIDAD = 54 and CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and ANYO = %d and CODNIV3 = %d and CODNIV5= %d",classProduct, classBranch, year_1, product, branch))
colnames(df5)[1] <- 'VALOR_IR14'
colnames(df4)[1] <- 'VALOR_IN14'
colnames(df3)[1] <- 'VALOR_IN15'
colnames(df2)[1] <- 'VALOR_IR15'
colnames(df6)[1] <- 'VALOR_IN14_tot'
colnames(df7)[1] <- 'VALOR_IN15_tot'
colnames(df5)[3] <- 'CODUNIDAD_IR'
colnames(df2)[3] <- 'CODUNIDAD_IR'
colnames(df3)[3] <- 'CODUNIDAD_IN'
colnames(df4)[3] <- 'CODUNIDAD_IN'
df6 <- cbind(df6, df7[!names(df7) %in% names(df6)])
df5 <- cbind(df5, df2[!names(df2) %in% names(df5)])
df4 <- cbind(df4, df3[!names(df3) %in% names(df4)])
newdf <- cbind(df5, df4[!names(df4) %in% names(df5)])
newdf$RES1p = log(newdf$VALOR_IR14)
newdf$RES1p_15 = log(newdf$VALOR_IR15)
newdf$Res1p_tot = newdf$RES1p_15 - newdf$RES1p
newdf$Res2p = newdf$VALOR_IN14/ df6$VALOR_IN14
newdf$Res2p_15 = newdf$VALOR_IN15/ df6$VALOR_IN15
newdf$Res2p_tot = (newdf$Res2p + newdf$Res2p_15)/ 2
newdf$RESULTADO = newdf$Res1p_tot * newdf$Res2p_tot
df_resultados = sum(newdf$RESULTADO, na.rm = TRUE)
data_frame <- data.frame("ANYO" = year, "RESULTADO" = df_resultados, "PRODUCTO" = product, "RAMA" = branch, "CLASE_PR" = classProduct, "CLASE_R" = classBranch)
data_frame
}
These functions are just making calculations from the data from the data frame from the Oracle.
The code itself this one:
x = 1965:2015
d = 2000
g = 1000
y = c(2026,2017,2019,2023)
z = c(1034,1002,1056,1057)
start <- proc.time() # Start clock
dat <- data.frame()
for (year in x) {
for (classProduct in d){
for (product in y) {
prod_string <- hijos_string(con, classProduct, product)
product_df <- c(product, prod_string)
product_df <- paste(product_df, collapse = ",")
for (classBranch in g){
for (branch in z) {
branch_string <- hijos_string(con, classBranch, branch)
branch_df <- c(branch, branch_string)
branch_df <- paste(branch_df, collapse = ",")
consulta <- sprintf("select * from stock.V_CALCULOS where CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and CODNIV3 in (%s) and CODNIV5 in (%s) order by codniv1, codniv2, codniv3, codniv4, codniv5, codlugar, codunidad, anyo",classProduct, classBranch, product_df, branch_df)
resuni <- dbSendQuery(con, consulta)
datos_inversion <- fetch(resuni, n=-1)
dbClearResult(resuni)
df<-resultado(datos_inversion, classProduct, classBranch, year, product, branch, prod_string, branch_string)
dat <- rbind(dat, df)
}
}
}
}
}
print(dat)
time_elapsed_parallel <- proc.time() - start # End clock
time_elapsed_parallel
dbDisconnect(con)
So, finally, my question: Any ideas how to optimize the for loop and after how to parallelize it, in plan (maybe) how to split it or what to do for it to be executed with 4 cores at the same time and save the time, bc its take forever to execute and I need as less as possible (the data frame initial has more than 250 000 rows)? I use Windows virtual machine with 4 cores
Thanks for the help, in advance!
PS. it's precise to have the result added by rows in a final data frame
Upvotes: 0
Views: 61
Reputation: 160417
Iteratively building a data.frame
row-by-row scales horribly. Instead, I suggest creating each iteration's output into a list
and then bind at the end. I don't have your data, so this is untested. I'm also going to assume that Matt's comment about 1,632 SQL queries is unavoidable (because that's likely a big bottleneck).
Bottom line: your problem areas are likely (1) thousand-plus SQL queries, and (2) iteratively building your results with rbind
.
This pre-builds all of the iterations within your nested for
loops.
eg <- expand.grid(Year=x, classProduct=d, product=y, classBranch=g, branch=z)
Now let's add the _df
and _string
variables; I'm using dplyr
here for the grouping and such, though there are other ways (both base-R and data.table
):
library(dplyr)
eg <- eg %>%
group_by(classProduct, product) %>%
mutate(
prod_string = hijos_string(con, classProduct[1], product[1]),
product_df = paste(product, prod_string[1], sep=",")
) %>%
group_by(classBranch, branch) %>%
mutate(
branch_string = hijos_string(con, classProduct[1], branch[1]),
branch_df = paste(branch, branch_string[1], sep=",")
) %>%
ungroup()
This is still inefficient by Matt's comment, but far less inefficient from the iteratively-adding-rows perspective:
lst <- lapply(seq_len(nrow(eg)), function(i) {
consulta <- sprintf("select * from stock.V_CALCULOS where CODNIV1 = 9 and CODNIV2 = %d and CODNIV4 = %d and CODNIV3 in (%s) and CODNIV5 in (%s) order by codniv1, codniv2, codniv3, codniv4, codniv5, codlugar, codunidad, anyo",
eg$classProduct[i], eg$classBranch[i], eg$product_df[i], eg$branch_df[i])
resuni <- dbSendQuery(con, consulta)
datos_inversion <- fetch(resuni, n=-1)
dbClearResult(resuni)
resultado(datos_inversion, eg$classProduct[i], eg$classBranch[i], eg$year[i], eg$product[i],
eg$branch[i], eg$prod_string[i], eg$branch_string[i])
})
dat <- do.call(rbind, lst)
Again, I don't know if this'll work perfectly, but the rationale is sound and (I think) relatively straight-forward.
Upvotes: 1