Erdne Htábrob
Erdne Htábrob

Reputation: 879

mclapply with lme4 and long vectors

I am using mclapply from the parallel package to estimate mixed glmer models using the lme4 package on a high performance cluster. I am having the issue described here. I apply the suggested fix of adding mc.preschedule=F, but the problem persists. The code is set up as described here.

I am not sure how to get around it. Any ideas? Should I switch to another method of parallelization? If so, how?

This is my code, but basically it follows the logic of the linked articles:

# Run this script in a fresh R session (e.g. via Rscript) instead of clearing
# the workspace with rm(list = ls()).

# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later.
library(lme4)
library(parallel)

# load() restores the objects stored in the file; the rest of the script
# expects one of them to be named `data`.
load(file="//share//home//eborbath//ess_rescaled.Rda") # load data

# parallelizing function

# Evaluate a batch of model calls in parallel, one data set per call.
#
# Args:
#   data:     a data frame (reused for every call) or a list of data sets,
#             one per element of `calls`.
#   calls:    character vector of R expressions; each one is evaluated with
#             its data set bound to the name `i`.
#   mc.cores: number of cores handed to parallel::mclapply().
#
# Returns:
#   A list holding the evaluated result of each call, in the order of `calls`.
f_lmer_mc <- function(data, calls, mc.cores) {
  if (is.data.frame(data)) {
    data <- replicate(length(calls), data, simplify = FALSE)
  }
  # A mismatch would otherwise attach NA as the call or silently drop calls.
  if (length(data) != length(calls)) {
    stop("`data` and `calls` must have the same length", call. = FALSE)
  }
  # seq_along() skips the loop when there are no calls; 1:length(data) would
  # iterate over c(1, 0) and fail on data[[1]].
  for (k in seq_along(data)) {
    attr(data[[k]], "cll") <- calls[k]
  }
  # The worker argument must stay named `i`: the call strings refer to the
  # data set by that name (e.g. "data=i").
  parallel::mclapply(
    data,
    function(i) eval(parse(text = attr(i, "cll"))),
    mc.cores = mc.cores,
    mc.preschedule = FALSE
  )
}

##########
# Models #
##########


# Predictor blocks that are added cumulatively to the model formulas.
controls <- c("gender", "agea", "eduyrs", "domicil", "unemployed", "rideol", "union", "pid", "hincfel")
values <- c("conformity", "universalism", "security")
issues <- c("gincdif", "freehms")
agr.ctrl <- c("gdp_wb_ppp", "wb_vae")
lr.agr <- c("lr_rsquar_std", "ri_l2_std")
val.agr <- c("mean_univ", "mean_conf", "mean_secur")
# Shared tail of every call: intercept, random effects and the remaining
# glmer() arguments. `data=i` refers to the worker argument of f_lmer_mc().
end <- "1 + (1|cntry/countryyear), data=i, control=glmerControl(optimizer='bobyqa', optCtrl = list(maxfun = 1e9)), family=binomial(link='logit'))"

# Join the given predictor blocks and the shared tail with "+".
rhs <- function(...) paste(c(..., end), collapse = "+")

# Right-hand side containing every predictor block.
rhs_full <- rhs(controls, values, issues, agr.ctrl, lr.agr, val.agr)

# Terms placed in front of the full right-hand side, one entry per model.
# Every entry must end in "+": without it the last term fuses with the first
# control (e.g. "gov_genlrgender"), which names a non-existent variable.
full_leads <- c(
  "region+", # until here it's only main effects
  "region*rideol + region+",
  "region*rideol*year + region+year+",
  "region*rideol*year_num + region+year_num+",
  "region*soc_pop_eleches + region+soc_pop_eleches+", # now come the expl. models
  "region*rideol*soc_pop_eleches + region+soc_pop_eleches+",
  "region*ri_l2_std + region+",
  "region*ri_l2_std*rideol + region+",
  "region*lr_rsquar_std + region+",
  "region*lr_rsquar_std*rideol + region+",
  "region+gov_genlr+",
  "region*gov_genlr + region+gov_genlr+",
  "region*gov_genlr*rideol + region+gov_genlr+",
  "region+pol_lrecon+",
  "region+pol_galtan+",
  "region+pol_galtan+pol_lrecon+",
  "region*pol_lrecon+region+pol_galtan+pol_lrecon+",
  "region*pol_galtan+region+pol_galtan+pol_lrecon+",
  "region*pol_lrecon*rideol+region+pol_galtan+pol_lrecon+",
  "region*pol_galtan*rideol+region+pol_galtan+pol_lrecon+"
)

# 26 call strings: six nested main-effect models that grow block by block,
# followed by one model per entry of `full_leads`.
models <- c(
  paste0("glmer(protest ~", rhs(controls)),
  paste0("glmer(protest ~", rhs(controls, values)),
  paste0("glmer(protest ~", rhs(controls, values, issues)),
  paste0("glmer(protest ~ region+", rhs(controls, values, issues)),
  paste0("glmer(protest ~ region+", rhs(controls, values, issues, agr.ctrl)),
  paste0(
    "glmer(protest ~ region+",
    rhs(controls, values, issues, agr.ctrl, lr.agr)
  ),
  paste0("glmer(protest ~ ", full_leads, rhs_full)
)

# Fit all models on 24 cores.
m.list <- f_lmer_mc(data, models, 24)

# Split the fitted models into consecutive batches (three per batch, two in
# the last one). Batch k becomes the object m.k and is written, together with
# the data, to m_k.RData.
batches <- list(1:3, 4:6, 7:9, 10:12, 13:15, 16:18, 19:21, 22:24, 25:26)
for (b in seq_along(batches)) {
  obj <- paste0("m.", b)
  assign(obj, m.list[batches[[b]]])
  save(list = c(obj, "data"), file = paste0("m_", b, ".RData"))
}

This is the relevant error message:

Error in sendMaster(try(eval(expr, env), silent = TRUE)) : 
  long vectors not supported yet: fork.c:378
Calls: f_lmer_mc ... mclapply -> lapply -> FUN -> mcparallel -> sendMaster

Thanks!

UPDATE:

The data is a cleaned version of the publicly available European Social Survey. You can download the file from here (1.8 MB)

Upvotes: 2

Views: 443

Answers (2)

Aaron - mostly inactive
Aaron - mostly inactive

Reputation: 37764

Expanding on my comment above:

I see you're replicating the data set and then sending it to all the processes. I haven't done parallel stuff in a while, but you might not need to do that; the vignette says "with mclapply all the packages and objects we use are automatically available on the workers." If so, that would take care of going to the processes, and Ralf Stubner's suggestion would hopefully take care of coming back.

To try not replicating the data, first have the calls use data as read in by your load call, instead of i; you'd change just this line.

# Same model tail as in the question, except that the calls use data=data
# (the data set read in by load()) instead of data=i.
end <- "1 + (1|cntry/countryyear), data=data, control=glmerControl(optimizer='bobyqa', optCtrl = list(maxfun = 1e9)), family=binomial(link='logit'))"

Then have mclapply just run those, without replicating the data.

library(parallel)
# Evaluate a single model-call string inside a worker.
run_call <- function(i) eval(parse(text = i))
# Run the call strings directly, without replicating the data per call.
m.list <- mclapply(calls, run_call, mc.cores = 2, mc.preschedule = FALSE)

To try not returning all the information in the models (in particular, the full data set with each model), after looking at the glmer output, I think it would be best to do whatever processing you want in the processes, instead of modifying the glmer output, as modifying the glmer output is likely to make it harder to get the summaries you want afterwards. Here I only get the summary, and put it in a list, so you could add other output easily as well.

library(parallel)
# Fit one model inside the worker and hand back only its summary, wrapped in
# a list so that further outputs can be added next to it.
summarise_call <- function(i) {
  fit <- eval(parse(text = i))
  list(summary = summary(fit))
}
m.list <- mclapply(calls, summarise_call, mc.cores = 2, mc.preschedule = FALSE)

Note that this is all untested...

Upvotes: 2

Steve Weston
Steve Weston

Reputation: 19677

I think this error happens because the forked worker processes are getting an error serializing very large result objects. I've been able to reproduce this error in R 3.3.2 with the following code:

library(parallel)
# Minimal reproduction: each of the two tasks returns the sequence 1:2^30 as
# its result, which has to be sent back from the worker to the master.
r <- mclapply(1:2, function(i) 1:2^30, mc.cores=2, mc.preschedule=FALSE)

However, this example worked for me using a 64-bit build of R 3.4.3, so the serialization limit seems to have been removed (or at least increased) in later versions of R.

I suggest that you either try to reduce the size of the result objects to less than 2GB, or use the most recent version of R.

Upvotes: 3

Related Questions