Reputation: 27
I am trying to run a bias-corrected percentile bootstrap for 3 different effect sizes and ran across some code from a predecessor. While this code functions, the run time takes FAR too long. It takes about 3 days to do each of the 3 different effect sizes.
It also shows each of the 1000 iterations which I do not need. Ideally, I would like to optimize run time maximally. I would also like all 3 of the outputs on the console or into a word/excel document rather than having to run them one at time.
I know the code is a bit clunky, but some help would be appreciated.
I am fairly new to R, and was looking for some advice on how to proceed appropriately. The code is presented below:
library(pscl)
library(boot)
# RNG MODULE FOR TWO_PART HURDLE MODEL
gen.hurdle = function(n, a, b1, b2, c1, c2, i0, i1, i2){
x = round(rnorm(n),3)
e = rnorm(n)
m = round(i0 + a*x + e, 3)
lambda = exp(i1 + b1*m + c1*x) # PUT REGRESSION TERMS FOR THE CONTINUUM PART HERE; KEEP exp()
ystar = qpois(runif(n, dpois(0, lambda), 1), lambda) # Zero-TRUNCATED POISSON DIST.; THE CONTINUUM PART
z = i2 + b2*m + c2*x # PUT REGRESSION TERMS FOR THE BINARY PART HERE
z_p = exp(z) / (1+exp(z)) # p(1) = 1-p(0)
tstar = rbinom(n, 1, z_p) # BINOMIAL DIST. ; THE BINARY PART
y= ystar*tstar # TWO-PART COUNT OUTCOME
return(cbind(x,m,y,z,z_p,tstar))
}
##################################################################################################
# MODEL ##########################################################################################
##################################################################################################
# i=1 ###################################
#small effect
seed=51; n=50 ;a=.18; b=.16; c=.25; i=1
seed=53; n=100 ;a=.18; b=.16; c=.25; i=1
seed=55; n=200 ;a=.18; b=.16; c=.25; i=1
seed=57; n=300 ;a=.18; b=.16; c=.25; i=1
seed=58; n=500 ;a=.18; b=.16; c=.25; i=1
seed=59; n=1000;a=.18; b=.16; c=.25; i=1
#medium effect
seed=61; n=50 ;a=.31; b=.35; c=.25; i=1
seed=63; n=100 ;a=.31; b=.35; c=.25; i=1
seed=65; n=200 ;a=.31; b=.35; c=.25; i=1
seed=67; n=300 ;a=.31; b=.35; c=.25; i=1
seed=68; n=500 ;a=.31; b=.35; c=.25; i=1
seed=69; n=1000;a=.31; b=.35; c=.25; i=1
#large effect
seed=81; n=50 ;a=.52; b=.49; c=.25; i=1
seed=73; n=100 ;a=.52; b=.49; c=.25; i=1
seed=75; n=200 ;a=.52; b=.49; c=.25; i=1
seed=77; n=300 ;a=.52; b=.49; c=.25; i=1
seed=78; n=500 ;a=.52; b=.49; c=.25; i=1
seed=79; n=1000;a=.52; b=.49; c=.25; i=1
#model
set.seed(seed)
iterations = 1000
r = 1000
results = matrix(,iterations,4)
for (iiii in 1:iterations){
data = data.frame(gen.hurdle(n, a, b, b, c, c, i, i, i))
data0 = data.frame(gen.hurdle(n, a, 0, 0, c, c, i, i, i))
p_0 =1-mean(data$z_p)
p_0_hat =1-mean(data$tstar)
p_0_b0 =1-mean(data0$z_p)
p_0_hat_b0 =1-mean(data0$tstar)
# power
boot= matrix(,r,8)
for(jjjj in 1:r){
#power
fit1 = lm(m ~ x, data=data)
fit2 = hurdle(formula = y ~ m + x, data=data, dist = "poisson", zero.dist = "binomial")
a_hat = summary(fit1)$coef[2,1]
b1_hat = summary(fit2)[[1]]$count[2,1]
b2_hat = summary(fit2)[[1]]$zero[2,1]
ab1_hat = prod(a_hat,b1_hat)
ab2_hat = prod(a_hat,b2_hat)
boot.data = data[sample(nrow(data), replace = TRUE), ]
boot.data$y[1] = if(prod(boot.data$y) > 0) 0 else boot.data$y[1]
boot.fit1 = lm(m ~ x, data=boot.data)
boot.fit2 = hurdle(formula = y ~ m + x, data=boot.data, dist = "poisson", zero.dist = "binomial")
boot.a = summary(boot.fit1)$coef[2,1]
boot.b1 = summary(boot.fit2)[[1]]$count[2,1]
boot.b2 = summary(boot.fit2)[[1]]$zero[2,1]
boot.ab1 = prod(boot.a,boot.b1)
boot.ab2 = prod(boot.a,boot.b2)
#type I error
fit3 = lm(m ~ x, data=data0)
fit4 = hurdle(formula = y ~ m + x, data=data0, dist = "poisson", zero.dist = "binomial")
a_hat_b0 = summary(fit3)$coef[2,1]
b1_hat_b0 = summary(fit4)[[1]]$count[2,1]
b2_hat_b0 = summary(fit4)[[1]]$zero[2,1]
ab1_hat_b0 = prod(a_hat_b0,b1_hat_b0)
ab2_hat_b0 = prod(a_hat_b0,b2_hat_b0)
boot.data0 = data0[sample(nrow(data0), replace = TRUE), ]
boot.data0$y[1] = if(prod(boot.data0$y) > 0) 0 else boot.data0$y[1]
boot.fit3 = lm(m ~ x, data=boot.data0)
boot.fit4 = hurdle(formula = y ~ m + x, data=boot.data0, dist = "poisson", zero.dist = "binomial")
boot.a_b0 = summary(boot.fit3)$coef[2,1]
boot.b1_b0 = summary(boot.fit4)[[1]]$count[2,1]
boot.b2_b0 = summary(boot.fit4)[[1]]$zero[2,1]
boot.ab1_b0 = prod(boot.a_b0,boot.b1_b0)
boot.ab2_b0 = prod(boot.a_b0,boot.b2_b0)
boot[jjjj,] = c(ab1_hat, ab2_hat, boot.ab1, boot.ab2,
ab1_hat_b0, ab2_hat_b0, boot.ab1_b0, boot.ab2_b0)
}
z0.1 = qnorm((sum(boot[,3] > boot[,1])+sum(boot[,3]==boot[,1])/2)/r)
z0.2 = qnorm((sum(boot[,4] > boot[,2])+sum(boot[,4]==boot[,2])/2)/r)
z0.1_b0 = qnorm((sum(boot[,7] > boot[,5])+sum(boot[,7]==boot[,5])/2)/r)
z0.2_b0 = qnorm((sum(boot[,8] > boot[,6])+sum(boot[,8]==boot[,6])/2)/r)
alpha=0.05 # 95% limits
z=qnorm(c(alpha/2,1-alpha/2)) # Std. norm. limits
p1 = pnorm(z-2*z0.1) # bias-correct & convert to proportions
p2 = pnorm(z-2*z0.2)
p1_b0 = pnorm(z-2*z0.1_b0)
p2_b0 = pnorm(z-2*z0.2_b0)
ci1 = quantile(boot[,3],p=p1) # Bias-corrected percentile lims
ci2 = quantile(boot[,4],p=p2)
ci1_b0 = quantile(boot[,7],p=p1_b0)
ci2_b0 = quantile(boot[,8],p=p2_b0)
sig.ab1 = if(prod(ci1) > 0) 1 else 0
sig.ab2 = if(prod(ci2) > 0) 1 else 0
sig.ab1_b0 = if(prod(ci1_b0) > 0) 1 else 0
sig.ab2_b0 = if(prod(ci2_b0) > 0) 1 else 0
#results
results[iiii,] = c(sig.ab1, sig.ab2, sig.ab1_b0, sig.ab2_b0)
message(paste0(iiii, " / iterations"))
flush.console()
}
i
n
a
b
iterations
#bootstrap how many
r
#power of ab1
mean(results[,1])
#power of ab2
mean(results[,2])
#type I error of ab1
mean(results[,3])
#type I error of ab2
mean(results[,4])
It would seem to me that the problem with each of the effects being ran separately is coming from naming the results "results" for each loop. I do not know the best way to print all results(for each effect size) without disturbing the loop.
The length is obviously coming from the sheer amount of iterations and lack of RAM to process them fast enough. Is this something that can even been remedied? The time isn't nearly as much of a concern as not getting results for all effect sizes when the program is run.
Upvotes: 1
Views: 272
Reputation: 72803
To partially give you an answer, here's how to bootstrap in a function.
Consider this data,
summary(beaver2[c("time", "activ")]); nrow(beaver2)
# time activ
# Min. : 0 Min. :0.00
# 1st Qu.:1128 1st Qu.:0.00
# Median :1535 Median :1.00
# Mean :1446 Mean :0.62
# 3rd Qu.:1942 3rd Qu.:1.00
# Max. :2350 Max. :1.00
# [1] 100
and this regression:
summary(with(beaver2, lm(temp ~ activ)))$coe
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 37.0968421 0.03456240 1073.32955 2.895092e-201
# activ 0.8062224 0.04389429 18.36736 1.682051e-33
With a for
loop we would bootstrap like this:
set.seed(528)
for.res <- rep(NA, 5e2)
for (i in 1:5e2) {
X <- beaver2[sample(seq(nrow(beaver2)), replace=TRUE), ]
y0 <- with(X, mean(temp[activ == 0]))
y1 <- with(X, mean(temp[activ == 1]))
for.res[i] <- y1 - y0
}
cat("beta:", b <- mean(for.res), "\tse:", s <- sd(for.res), "\tt:", t <- b/s,
"\tp:", 2 * pt(-abs(t), df = nrow(beaver2) - 1))
# beta: 0.8041428 se: 0.04089321 t: 19.66446 p: 5.761707e-36
Whereas with a function we would bootstrap like so:
bootFun <- function(X) {
y0 <- with(X, mean(temp[activ == 0]))
y1 <- with(X, mean(temp[activ == 1]))
return(yh1 - yh0)
}
set.seed(528)
boot.res <- replicate(5e2, bootFun(X=beaver2[sample(seq(nrow(beaver2)),
replace=TRUE), ]))
cat("beta:", b <- mean(boot.res), "\tse:", s <- sd(boot.res), "\tt:", t <- b/s,
"\tp:", 2 * pt(-abs(t), df = nrow(beaver2) - 1))
# beta: 0.8049935 s: 0.04255446 t: 18.91678 p: 1.203037e-34
In the above example, the speed increase of replicate
compared to for
is about 10%:
microbenchmark::microbenchmark(
replicate=replicate(5e2, bootFun(beaver2[sample(seq(nrow(beaver2)),
replace=TRUE), ])),
for.loop={
for (i in 1:5e2) {
X <- beaver2[sample(seq(nrow(beaver2)), replace=TRUE), ]
y0 <- with(X, mean(temp[activ == 0]))
y1 <- with(X, mean(temp[activ == 1]))
for.res[i] <- y1 - y0
}
})
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# replicate 88.37455 89.12988 93.36896 89.66325 99.8587 133.1689 100 a
# for.loop 95.88837 96.65481 102.40125 97.07489 107.5985 295.3379 100 b
Upvotes: 1