Reputation: 23
I have this huge loop, which I'm sure that can be optimized, but that works as it is. The problem is that it takes like 8 days to compute on a dataset of 45,000 rows. So I tried using foreach. I was quite happy in the beginning because all cores were used when I ran it, however, none of the variables calculated or iterators used inside the loop were actually computed (I cannot see the iterator "row", prov_list (in fourth line),... or any other object, in the environment).
I tried to include .export = "row", but it didn't make any difference. Funny thing is that the loop actually takes time to compute and the use of CPU jumps up to 100% during the process.
I'd appreciate if someone could give a hint on it. See loop below.
foreach(row = 1:50, .packages = c("dplyr", "hablar", "foreach")) %dopar% { # 1:nrow(data_QI)
# Prepare data to be used in the loop
obs <- data_QI[row,]
prov_list <- colnames(GTC_matrix[-1])
if (!is.na(obs$EXPIRATIONYEAR)) {
tm_years <- seq(obs$APPLICATIONYEAR, obs$EXPIRATIONYEAR)
# articles_provinces file only contains info until 1920, so we remove years higher than 1920
# only for searching in this file purposes (it is not changed in original data)
if (tm_years[length(tm_years)] >= 1920) {
tm_years <- tm_years[!(tm_years > 1920)]
}
} else {
obs$EXPIRATIONYEAR <- 1920
tm_years <- seq(obs$APPLICATIONYEAR, obs$EXPIRATIONYEAR)
}
tm_ap_appprov <- obs$APPPROV
tm_ap_appcoun <- obs$APPCOUN
# Extract all provinces except that of the observation at hand
tm_ap_appprov_not <- prov_list[-match(tm_ap_appprov, prov_list)]
tm_industries <- select(obs, starts_with("INDUSTRY_")) %>%
select_if(function(col) all(col > 0)) %>%
names()
# Initialize objects used in the loop
numart_ynp <- data.frame() # numerator of the numerator
numart_yp <- data.frame() # denominator of the numerator
numart_yn <- data.frame() # numerator of the denominator
numart_y <- data.frame() # denominator of the denominator
numart_ync <- data.frame() # numerator of the numerator
numart_yc <- data.frame() # denominator of the numerator
numart_yn_2 <- data.frame() # numerator of the denominator
numart_y_2 <- data.frame() # denominator of the denominator
numart_ynp_exp <- list() # numerator of the numerator
numart_yp_exp <- list() # denominator of the numerator
numart_yn_exp <- list() # numerator of the denominator
numart_y_exp <- list() # denominator of the denominator
RTMGS_noprov <- list()
RTMGD_noprov <- list()
data_QI$RTMGS_EXP <- 0
data_QI$RTMGD_EXP <- 0
# Get into the loop for indices calculation
if (data_QI$APPCOUN[row] == "SPAIN") {
print(paste(row, "- entra en == SPAIN"))
GTC_vector <- select(GTC_matrix, tm_ap_appprov)
for (y in tm_years) {
for (n in tm_industries) {
# This is the numerator of the numerator
numart_ynp <- bind_rows(numart_ynp, (filter(articles_provinces, Year == y, SECTOR == n) %>%
select(tm_ap_appprov)))
# This is the numerator of the denominator in RTMGS
numart_yn <- bind_rows(numart_yn, (filter(articles_provinces, Year == y, SECTOR == n)))
}
}
for (y in tm_years) { # File only contains info until 1920
# This is the denominator of the numerator in RTMGS
numart_yp <- bind_rows(numart_yp, (filter(articles_provinces, Year == y) %>%
select(tm_ap_appprov)))
# This is the denominator of the denominator in RTMGS
numart_y <- bind_rows(numart_y, (filter(articles_provinces, Year == y)))
# This loop serves also to calculate NUM_NUM and NUM_DEN of RTMGS_EXP and RTMGD_EXP
for (p in tm_ap_appprov_not) {
numart_ynp_exp <- append(numart_ynp_exp, (filter(articles_provinces, Year == y, SECTOR == n) %>%
select(p)))
numart_yp_exp <- append(numart_yp_exp, (filter(articles_provinces, Year == y) %>%
select(p)))
}
}
# El NUM_DEN y DEN_DEN de RTMGS_EXP y RTMGD_EXT solo hay que hacerlo una vez, no por cada p, porque en realidad va a salir la misma lista para cada p. Hay que sacarlos del loop.
numart_yn_exp <- append(numart_yn_exp, (filter(articles_provinces, Year == y, SECTOR == n) %>%
select(-Year, -SECTOR, -tm_ap_appprov)))
numart_y_exp <- append(numart_y_exp, (filter(articles_provinces, Year == y) %>%
select(-Year, -SECTOR, -tm_ap_appprov)))
# Final calculation of RTMGS and RTMGD and RTMGS_EXP and RTMGD_EXP
# RTMGS and RTMGD
data_QI[row,"RTMGS_NUM_NUM"] <- mean(unlist(numart_ynp))
data_QI[row,"RTMGS_DEN_NUM"] <- mean(unlist(numart_yn[3:ncol(numart_yn)]))
data_QI[row,"RTMGS_NUM_DEN"] <- mean(unlist(numart_yp))
data_QI[row,"RTMGS_DEN_DEN"] <- mean(unlist(numart_y[3:ncol(numart_y)]))
data_QI <- data_QI %>%
mutate(RTMGS = (RTMGS_NUM_NUM/RTMGS_DEN_NUM)/(RTMGS_NUM_DEN/RTMGS_DEN_DEN),
RTMGD = (1/(ncol(articles_provinces)-2))*((RTMGS-min(RTMGS, na.rm = T))/(max(RTMGS, na.rm = T)-min(RTMGS, na.rm = T)))
)
# RTMGS_EXP and RTMGD_EXP
# The RTMGS and RTMGD for each region are needed first. Note that the min/max fun is actually min_/max_ from hablar package
for (i in (1:54)) {
RTMGS_noprov[[i]] <- (numart_ynp_exp[[i]]/mean(unlist(numart_yp_exp[[i]])))/(numart_yn_exp[[i]]/mean(unlist(numart_y_exp[[i]]))) # Hay muchos NaN, porque en muchas de las otras provincias que no son la de la marca en cuestion, el numero de articulos es cero. Y cero/cero = NaN
# RTMGD_noprov must include max and min for ALL provinces, not n-1
RTMGS_allprov <- append(RTMGS_noprov, data_QI$RTMGS[row])
RTMGD_noprov[[i]] <- (1/(ncol(articles_provinces)-2))*((RTMGS_allprov[[i]]-min_(unlist(RTMGS_allprov)))/(max_(unlist(RTMGS_allprov))-min_(unlist(RTMGS_allprov))))
}
# Now the Exposure. Note that the max fun is actually max_ from hablar package
data_QI[row, "RTMGS_EXP"] <- max_(unlist(RTMGS_noprov) / GTC_vector)
data_QI[row, "RTMGD_EXP"] <- max_(unlist(RTMGD_noprov) / GTC_vector)
print(paste(row, "ha terminado == SPAIN"))
} else {
# Observations that are not SPAIN (this is the else clause of: (data_QI$APPCOUN[row] == "SPAIN"))
print(paste(row, "- entra en != SPAIN"))
for (y in tm_years) { # File only contains info until 1920
for (n in tm_industries) {
numart_ync <- bind_rows(numart_ync, (filter(articles_countries, Year == y, SECTOR == n) %>%
select(tm_ap_appcoun)))
numart_yn_2 <- bind_rows(numart_yn_2, (filter(articles_countries, Year == y, SECTOR == n) %>%
mutate(FOREINSP = (rowSums(across(where(is.numeric))) - Year) - SPAIN))) # not very elegant subtracting YEAR but it damn works
}
}
for (y in tm_years) { # File only contains info until 1920
numart_yc <- bind_rows(numart_yc, (filter(articles_countries, Year == y) %>%
select(tm_ap_appcoun)))
numart_y_2 <- bind_rows(numart_y_2, (filter(articles_countries, Year == y) %>%
mutate(FOREINSP = (rowSums(across(where(is.numeric))) - Year) - SPAIN))) # not very elegant subtracting YEAR but it damn works
}
# Final calculation of QIX_3 (RTMICS) and QIX_4 (RTMICD)
data_QI[row,"RTMICS_NUM_NUM"] <- mean(unlist(numart_ync), na.rm = T)
data_QI[row,"RTMICS_DEN_NUM"] <- mean(unlist(numart_yc))
data_QI[row,"RTMICS_NUM_DEN"] <- mean(numart_yn_2$FOREINSP)
data_QI[row,"RTMICS_DEN_DEN"] <- mean(numart_y_2$FOREINSP)
data_QI <- data_QI %>% mutate(RTMICS = (RTMICS_NUM_NUM/RTMICS_DEN_NUM)/(RTMICS_NUM_DEN/RTMICS_DEN_DEN),
RTMICD = (1/(ncol(articles_countries)-2))*((RTMICS-min(RTMICS, na.rm = T))/ (max(RTMICS, na.rm = T)-min(RTMICS, na.rm = T))))
print(paste(row,"ha terminado != SPAIN"))
}
}
Upvotes: 1
Views: 177
Reputation: 436
Each iteration of a foreach
loop should return an object (e.g. a dataframe or matrix). These are then combined as specified by the .combine
argument of the foreach
function. The foreach
function then returns the combined object. So you need something like:
combined_results <- foreach(i = 1:50, .combine=rbind) %dopar% {
# code that produces some_result
return(some_result)
}
Upvotes: 1