The Dark Knight
The Dark Knight

Reputation: 23

foreach loop not creating variables

I have this huge loop, which I'm sure that can be optimized, but that works as it is. The problem is that it takes like 8 days to compute on a dataset of 45,000 rows. So I tried using foreach. I was quite happy in the beginning because all cores were used when I ran it, however, none of the variables calculated or iterators used inside the loop were actually computed (I cannot see the iterator "row", prov_list (in fourth line),... or any other object, in the environment).

I tried to include .export = "row", but it didn't make any difference. Funny thing is that the loop actually takes time to compute and the use of CPU jumps up to 100% during the process.

I'd appreciate if someone could give a hint on it. See loop below.

foreach(row = 1:50, .packages = c("dplyr", "hablar", "foreach")) %dopar% { # 1:nrow(data_QI)
  # Prepare data to be used in the loop
  obs <- data_QI[row,]
  prov_list <- colnames(GTC_matrix[-1])
  if (!is.na(obs$EXPIRATIONYEAR)) {
    tm_years <- seq(obs$APPLICATIONYEAR, obs$EXPIRATIONYEAR)
    # articles_provinces file only contains info until 1920, so we remove years higher than 1920
    # only for searching in this file purposes (it is not changed in original data)
    if (tm_years[length(tm_years)] >= 1920) {
      tm_years <- tm_years[!(tm_years > 1920)]
    }
  } else {
    obs$EXPIRATIONYEAR <- 1920
    tm_years <- seq(obs$APPLICATIONYEAR, obs$EXPIRATIONYEAR)
  }
  tm_ap_appprov <- obs$APPPROV
  tm_ap_appcoun <- obs$APPCOUN
  # Extract all provinces except that of the observation at hand
  tm_ap_appprov_not <- prov_list[-match(tm_ap_appprov, prov_list)]
  tm_industries <- select(obs, starts_with("INDUSTRY_")) %>% 
    select_if(function(col) all(col > 0)) %>% 
    names()
  # Initialize objects used in the loop
  numart_ynp <- data.frame() # numerator of the numerator
  numart_yp <- data.frame() # denominator of the numerator
  numart_yn <- data.frame() # numerator of the denominator
  numart_y <- data.frame() # denominator of the denominator
  numart_ync <- data.frame() # numerator of the numerator
  numart_yc <- data.frame() # denominator of the numerator
  numart_yn_2 <- data.frame() # numerator of the denominator
  numart_y_2 <- data.frame() # denominator of the denominator
  numart_ynp_exp <- list() # numerator of the numerator
  numart_yp_exp <- list() # denominator of the numerator
  numart_yn_exp <- list() # numerator of the denominator
  numart_y_exp <- list() # denominator of the denominator
  RTMGS_noprov <- list()
  RTMGD_noprov <- list()
  data_QI$RTMGS_EXP <- 0
  data_QI$RTMGD_EXP <- 0
  
  # Get into the loop for indices calculation
  if (data_QI$APPCOUN[row] == "SPAIN") {
    print(paste(row, "- entra en == SPAIN"))
    GTC_vector <- select(GTC_matrix, tm_ap_appprov)
    for (y in tm_years) {
      for (n in tm_industries) {
        # This is the numerator of the numerator
        numart_ynp <- bind_rows(numart_ynp, (filter(articles_provinces, Year == y, SECTOR == n) %>%
                                               select(tm_ap_appprov)))
        # This is the numerator of the denominator in RTMGS
        numart_yn <- bind_rows(numart_yn, (filter(articles_provinces, Year == y, SECTOR == n)))
      }
    }
    
    for (y in tm_years) { # File only contains info until 1920
      # This is the denominator of the numerator in RTMGS
      numart_yp <- bind_rows(numart_yp, (filter(articles_provinces, Year == y) %>%
                                           select(tm_ap_appprov)))
      # This is the denominator of the denominator in RTMGS
      numart_y <- bind_rows(numart_y, (filter(articles_provinces, Year == y)))
      #  This loop serves also to calculate NUM_NUM and NUM_DEN of RTMGS_EXP and RTMGD_EXP
      for (p in tm_ap_appprov_not) {
        numart_ynp_exp <- append(numart_ynp_exp, (filter(articles_provinces, Year == y, SECTOR == n) %>%
                                                    select(p)))
        
        numart_yp_exp <- append(numart_yp_exp, (filter(articles_provinces, Year == y) %>%
                                                  select(p)))
      }
    }    
    
    # El NUM_DEN y DEN_DEN de RTMGS_EXP y RTMGD_EXT solo hay que hacerlo una vez, no por cada p, porque en realidad va a salir la misma lista para cada p. Hay que sacarlos del loop.
    numart_yn_exp <- append(numart_yn_exp, (filter(articles_provinces, Year == y, SECTOR == n) %>%
                                              select(-Year, -SECTOR, -tm_ap_appprov)))
    
    numart_y_exp <- append(numart_y_exp, (filter(articles_provinces, Year == y) %>%
                                            select(-Year, -SECTOR, -tm_ap_appprov)))
    
    
    # Final calculation of RTMGS and RTMGD and RTMGS_EXP and RTMGD_EXP
    # RTMGS and RTMGD
    data_QI[row,"RTMGS_NUM_NUM"] <- mean(unlist(numart_ynp)) 
    data_QI[row,"RTMGS_DEN_NUM"] <- mean(unlist(numart_yn[3:ncol(numart_yn)]))    
    data_QI[row,"RTMGS_NUM_DEN"] <- mean(unlist(numart_yp)) 
    data_QI[row,"RTMGS_DEN_DEN"] <- mean(unlist(numart_y[3:ncol(numart_y)]))
    data_QI <- data_QI %>%
      mutate(RTMGS = (RTMGS_NUM_NUM/RTMGS_DEN_NUM)/(RTMGS_NUM_DEN/RTMGS_DEN_DEN),
             RTMGD = (1/(ncol(articles_provinces)-2))*((RTMGS-min(RTMGS, na.rm = T))/(max(RTMGS, na.rm = T)-min(RTMGS, na.rm = T)))
      )
    # RTMGS_EXP and RTMGD_EXP
    # The RTMGS and RTMGD for each region are needed first. Note that the min/max fun is actually min_/max_ from hablar package
    for (i in (1:54)) {
      RTMGS_noprov[[i]] <-  (numart_ynp_exp[[i]]/mean(unlist(numart_yp_exp[[i]])))/(numart_yn_exp[[i]]/mean(unlist(numart_y_exp[[i]]))) # Hay muchos NaN, porque en muchas de las otras provincias que no son la de la marca en cuestion, el numero de articulos es cero. Y cero/cero = NaN
      # RTMGD_noprov must include max and min for ALL provinces, not n-1
      RTMGS_allprov <- append(RTMGS_noprov, data_QI$RTMGS[row])
      RTMGD_noprov[[i]] <- (1/(ncol(articles_provinces)-2))*((RTMGS_allprov[[i]]-min_(unlist(RTMGS_allprov)))/(max_(unlist(RTMGS_allprov))-min_(unlist(RTMGS_allprov))))
    }
    # Now the Exposure. Note that the max fun is actually max_ from hablar package
    data_QI[row, "RTMGS_EXP"] <- max_(unlist(RTMGS_noprov) / GTC_vector)
    data_QI[row, "RTMGD_EXP"] <- max_(unlist(RTMGD_noprov) / GTC_vector)
    print(paste(row, "ha terminado == SPAIN"))
  } else {
    # Observations that are not SPAIN (this is the else clause of: (data_QI$APPCOUN[row] == "SPAIN"))
    print(paste(row, "- entra en != SPAIN"))
    for (y in tm_years) { # File only contains info until 1920
      for (n in tm_industries) {
        numart_ync <- bind_rows(numart_ync, (filter(articles_countries, Year == y, SECTOR == n) %>%
                                               select(tm_ap_appcoun)))
        numart_yn_2 <- bind_rows(numart_yn_2, (filter(articles_countries, Year == y, SECTOR == n) %>%
                                                 mutate(FOREINSP = (rowSums(across(where(is.numeric))) - Year) - SPAIN))) # not very elegant subtracting YEAR but it damn works
      }
    }
    for (y in tm_years) { # File only contains info until 1920
      numart_yc <- bind_rows(numart_yc, (filter(articles_countries, Year == y) %>%
                                           select(tm_ap_appcoun)))
      numart_y_2 <- bind_rows(numart_y_2, (filter(articles_countries, Year == y)  %>% 
                                             mutate(FOREINSP = (rowSums(across(where(is.numeric))) - Year) - SPAIN))) # not very elegant subtracting YEAR but it damn works
    }
    # Final calculation of QIX_3 (RTMICS) and QIX_4 (RTMICD)
    data_QI[row,"RTMICS_NUM_NUM"] <- mean(unlist(numart_ync), na.rm = T)
    data_QI[row,"RTMICS_DEN_NUM"] <- mean(unlist(numart_yc))     
    data_QI[row,"RTMICS_NUM_DEN"] <- mean(numart_yn_2$FOREINSP)
    data_QI[row,"RTMICS_DEN_DEN"] <- mean(numart_y_2$FOREINSP) 
    data_QI <- data_QI %>% mutate(RTMICS = (RTMICS_NUM_NUM/RTMICS_DEN_NUM)/(RTMICS_NUM_DEN/RTMICS_DEN_DEN),
                                  RTMICD = (1/(ncol(articles_countries)-2))*((RTMICS-min(RTMICS, na.rm = T))/                                                                           (max(RTMICS, na.rm = T)-min(RTMICS, na.rm = T))))
    
    print(paste(row,"ha terminado != SPAIN"))
  }
}

Upvotes: 1

Views: 177

Answers (1)

Miguel
Miguel

Reputation: 436

Each iteration of a foreach loop should return an object (e.g. a dataframe or matrix). These are then combined as specified by the .combine argument of the foreach function. The foreach function then returns the combined object. So you need something like:

combined_results <- foreach(i = 1:50, .combine=rbind) %dopar% { 
         # code that produces some_result
         return(some_result)
}

Upvotes: 1

Related Questions