J Bacon
J Bacon

Reputation: 27

bias-corrected percentile bootstrap

I am trying to run a bias-corrected percentile bootstrap for 3 different effect sizes and ran across some code from a predecessor. While this code functions, the run time takes FAR too long. It takes about 3 days to do each of the 3 different effect sizes.

It also shows each of the 1000 iterations which I do not need. Ideally, I would like to optimize run time maximally. I would also like all 3 of the outputs on the console or into a word/excel document rather than having to run them one at time.

I know the code is a bit clunky, but some help would be appreciated.

I am fairly new to R, and was looking for some advice on how to proceed appropriately. The code is presented below:

library(pscl)
library(boot)

# RNG MODULE FOR TWO_PART HURDLE MODEL

gen.hurdle = function(n, a, b1, b2, c1, c2, i0, i1, i2){

  x = round(rnorm(n),3)
  e = rnorm(n)
  m = round(i0 + a*x + e, 3)

  lambda = exp(i1 + b1*m + c1*x)                       # PUT REGRESSION TERMS FOR THE CONTINUUM PART HERE; KEEP exp()
  ystar = qpois(runif(n, dpois(0, lambda), 1), lambda) # Zero-TRUNCATED POISSON DIST.; THE CONTINUUM PART

  z = i2 + b2*m  + c2*x                                # PUT REGRESSION TERMS FOR THE BINARY    PART HERE
  z_p = exp(z) / (1+exp(z))                            # p(1) = 1-p(0)
  tstar = rbinom(n, 1, z_p)                            # BINOMIAL DIST.         ; THE BINARY    PART

  y= ystar*tstar                                       # TWO-PART COUNT OUTCOME

  return(cbind(x,m,y,z,z_p,tstar))
}


##################################################################################################
# MODEL ##########################################################################################
##################################################################################################

# i=1  ###################################

#small effect
seed=51; n=50  ;a=.18; b=.16; c=.25; i=1
seed=53; n=100 ;a=.18; b=.16; c=.25; i=1
seed=55; n=200 ;a=.18; b=.16; c=.25; i=1
seed=57; n=300 ;a=.18; b=.16; c=.25; i=1
seed=58; n=500 ;a=.18; b=.16; c=.25; i=1
seed=59; n=1000;a=.18; b=.16; c=.25; i=1

#medium effect
seed=61; n=50  ;a=.31; b=.35; c=.25; i=1
seed=63; n=100 ;a=.31; b=.35; c=.25; i=1
seed=65; n=200 ;a=.31; b=.35; c=.25; i=1
seed=67; n=300 ;a=.31; b=.35; c=.25; i=1
seed=68; n=500 ;a=.31; b=.35; c=.25; i=1
seed=69; n=1000;a=.31; b=.35; c=.25; i=1

#large effect
seed=81; n=50  ;a=.52; b=.49; c=.25; i=1
seed=73; n=100 ;a=.52; b=.49; c=.25; i=1
seed=75; n=200 ;a=.52; b=.49; c=.25; i=1
seed=77; n=300 ;a=.52; b=.49; c=.25; i=1
seed=78; n=500 ;a=.52; b=.49; c=.25; i=1
seed=79; n=1000;a=.52; b=.49; c=.25; i=1


#model
set.seed(seed)
iterations = 1000
r = 1000

results = matrix(,iterations,4)
for (iiii in 1:iterations){

  data  = data.frame(gen.hurdle(n, a, b, b, c, c, i, i, i))
  data0 = data.frame(gen.hurdle(n, a, 0, 0, c, c, i, i, i))

  p_0     =1-mean(data$z_p)
  p_0_hat =1-mean(data$tstar)
  p_0_b0     =1-mean(data0$z_p)
  p_0_hat_b0 =1-mean(data0$tstar)

  # power

  boot= matrix(,r,8)
  for(jjjj in 1:r){

    #power
    fit1      = lm(m ~ x, data=data)
    fit2      = hurdle(formula = y ~ m + x, data=data, dist = "poisson", zero.dist = "binomial") 

    a_hat     = summary(fit1)$coef[2,1]
    b1_hat    = summary(fit2)[[1]]$count[2,1]
    b2_hat    = summary(fit2)[[1]]$zero[2,1]
    ab1_hat   = prod(a_hat,b1_hat)
    ab2_hat   = prod(a_hat,b2_hat)

    boot.data = data[sample(nrow(data), replace = TRUE), ]
    boot.data$y[1] = if(prod(boot.data$y) > 0) 0 else boot.data$y[1]

    boot.fit1 = lm(m ~ x, data=boot.data)
    boot.fit2 = hurdle(formula = y ~ m + x, data=boot.data, dist = "poisson", zero.dist = "binomial") 

    boot.a    = summary(boot.fit1)$coef[2,1]
    boot.b1   = summary(boot.fit2)[[1]]$count[2,1]
    boot.b2   = summary(boot.fit2)[[1]]$zero[2,1]
    boot.ab1  = prod(boot.a,boot.b1)
    boot.ab2  = prod(boot.a,boot.b2)

    #type I error
    fit3       = lm(m ~ x, data=data0)
    fit4       = hurdle(formula = y ~ m + x, data=data0, dist = "poisson", zero.dist = "binomial")  

    a_hat_b0   = summary(fit3)$coef[2,1]
    b1_hat_b0  = summary(fit4)[[1]]$count[2,1]
    b2_hat_b0  = summary(fit4)[[1]]$zero[2,1]
    ab1_hat_b0 = prod(a_hat_b0,b1_hat_b0)
    ab2_hat_b0 = prod(a_hat_b0,b2_hat_b0)

    boot.data0 = data0[sample(nrow(data0), replace = TRUE), ]
    boot.data0$y[1] = if(prod(boot.data0$y) > 0) 0 else boot.data0$y[1]

    boot.fit3  = lm(m ~ x, data=boot.data0)
    boot.fit4  = hurdle(formula = y ~ m + x, data=boot.data0, dist = "poisson", zero.dist = "binomial")  

    boot.a_b0   = summary(boot.fit3)$coef[2,1]
    boot.b1_b0  = summary(boot.fit4)[[1]]$count[2,1]
    boot.b2_b0  = summary(boot.fit4)[[1]]$zero[2,1]
    boot.ab1_b0 = prod(boot.a_b0,boot.b1_b0)
    boot.ab2_b0 = prod(boot.a_b0,boot.b2_b0)

    boot[jjjj,] = c(ab1_hat,    ab2_hat,    boot.ab1,    boot.ab2,
                    ab1_hat_b0, ab2_hat_b0, boot.ab1_b0, boot.ab2_b0)

  }

  z0.1 = qnorm((sum(boot[,3] > boot[,1])+sum(boot[,3]==boot[,1])/2)/r)
  z0.2 = qnorm((sum(boot[,4] > boot[,2])+sum(boot[,4]==boot[,2])/2)/r)
  z0.1_b0 = qnorm((sum(boot[,7] > boot[,5])+sum(boot[,7]==boot[,5])/2)/r)
  z0.2_b0 = qnorm((sum(boot[,8] > boot[,6])+sum(boot[,8]==boot[,6])/2)/r)

  alpha=0.05 # 95% limits
  z=qnorm(c(alpha/2,1-alpha/2)) # Std. norm. limits

  p1    = pnorm(z-2*z0.1) # bias-correct & convert to proportions
  p2    = pnorm(z-2*z0.2) 
  p1_b0 = pnorm(z-2*z0.1_b0)
  p2_b0 = pnorm(z-2*z0.2_b0) 

  ci1    = quantile(boot[,3],p=p1) # Bias-corrected percentile lims
  ci2    = quantile(boot[,4],p=p2)
  ci1_b0 = quantile(boot[,7],p=p1_b0)
  ci2_b0 = quantile(boot[,8],p=p2_b0)

  sig.ab1 = if(prod(ci1) > 0) 1 else 0
  sig.ab2 = if(prod(ci2) > 0) 1 else 0
  sig.ab1_b0 = if(prod(ci1_b0) > 0) 1 else 0
  sig.ab2_b0 = if(prod(ci2_b0) > 0) 1 else 0



  #results
  results[iiii,] = c(sig.ab1, sig.ab2, sig.ab1_b0, sig.ab2_b0)

  message(paste0(iiii, " / iterations"))
  flush.console()
}

i
n
a
b
iterations
#bootstrap how many
r

#power of ab1
mean(results[,1])
#power of ab2
mean(results[,2])
#type I error of ab1
mean(results[,3])
#type I error of ab2
mean(results[,4])

It would seem to me that the problem with each of the effects being ran separately is coming from naming the results "results" for each loop. I do not know the best way to print all results(for each effect size) without disturbing the loop.

The length is obviously coming from the sheer amount of iterations and lack of RAM to process them fast enough. Is this something that can even been remedied? The time isn't nearly as much of a concern as not getting results for all effect sizes when the program is run.

Upvotes: 1

Views: 272

Answers (1)

jay.sf
jay.sf

Reputation: 72803

To partially give you an answer, here's how to bootstrap in a function.

Consider this data,

summary(beaver2[c("time", "activ")]); nrow(beaver2)
#      time          activ     
# Min.   :   0   Min.   :0.00  
# 1st Qu.:1128   1st Qu.:0.00  
# Median :1535   Median :1.00  
# Mean   :1446   Mean   :0.62  
# 3rd Qu.:1942   3rd Qu.:1.00  
# Max.   :2350   Max.   :1.00  
# [1] 100

and this regression:

summary(with(beaver2, lm(temp ~ activ)))$coe
#               Estimate Std. Error    t value      Pr(>|t|)
# (Intercept) 37.0968421 0.03456240 1073.32955 2.895092e-201
# activ        0.8062224 0.04389429   18.36736  1.682051e-33

With a for loop we would bootstrap like this:

set.seed(528)

for.res <- rep(NA, 5e2)
for (i in 1:5e2) {
  X <- beaver2[sample(seq(nrow(beaver2)), replace=TRUE), ]
  y0 <- with(X, mean(temp[activ == 0]))
  y1 <- with(X, mean(temp[activ == 1]))
  for.res[i] <- y1 - y0
}

cat("beta:", b <- mean(for.res), "\tse:", s <-  sd(for.res), "\tt:", t <- b/s,
    "\tp:", 2 * pt(-abs(t), df = nrow(beaver2) - 1))
# beta: 0.8041428   se: 0.04089321  t: 19.66446     p: 5.761707e-36

Whereas with a function we would bootstrap like so:

bootFun <- function(X) {
  y0 <- with(X, mean(temp[activ == 0]))
  y1 <- with(X, mean(temp[activ == 1]))
  return(yh1 - yh0)
}

set.seed(528)
boot.res <- replicate(5e2, bootFun(X=beaver2[sample(seq(nrow(beaver2)), 
                                              replace=TRUE), ]))


cat("beta:", b <- mean(boot.res), "\tse:", s <-  sd(boot.res), "\tt:", t <- b/s,
    "\tp:", 2 * pt(-abs(t), df = nrow(beaver2) - 1))
# beta: 0.8049935   s: 0.04255446   t: 18.91678     p: 1.203037e-34

In the above example, the speed increase of replicate compared to for is about 10%:

microbenchmark::microbenchmark(
  replicate=replicate(5e2, bootFun(beaver2[sample(seq(nrow(beaver2)), 
                                              replace=TRUE), ])),
  for.loop={
    for (i in 1:5e2) {
      X <- beaver2[sample(seq(nrow(beaver2)), replace=TRUE), ]
      y0 <- with(X, mean(temp[activ == 0]))
      y1 <- with(X, mean(temp[activ == 1]))
      for.res[i] <- y1 - y0
    }
  })
# Unit: milliseconds
#      expr      min       lq      mean   median       uq      max neval cld
# replicate 88.37455 89.12988  93.36896 89.66325  99.8587 133.1689   100  a 
#  for.loop 95.88837 96.65481 102.40125 97.07489 107.5985 295.3379   100   b

Upvotes: 1

Related Questions