william zhang
william zhang

Reputation: 47

how can I speed up for loop in R?

I was wondering if there is a way to speed up this function (this will take 20s with those defaults), since normally sim would be a lot greater, like 10 thousands or 100 thousands


twosamp.fisher <- function(p1 = 0.1,
                           p2 = 0.05,
                           alpha = 0.025,
                           power = 0.9,
                           sim = 1000,
                           seed = 123) {
  z_c <- qnorm(1 - alpha) + qnorm(power)
  flag <- 0
  n_int <- round(z_c^2 * (p1 * (1 - p1) + p2 * (1 - p2)) / ((p1 - p2)^2))

    set.seed(seed)
    p.value <- numeric(sim)
    for (n in seq(n_int, 10000, 1)) {
      m <- matrix(NA, nrow = sim, ncol = 4)
      ka <- rbinom(sim, n, p1)
      kb <- rbinom(sim, n, p2)
      m[, 1] <- ka
      m[, 2] <- n - ka
      m[, 3] <- kb
      m[, 4] <- n - kb

      p.value <- apply(
        m,
        1,
        function(x) {
          fisher.test(
            matrix(c(x[1], x[2], x[3], x[4]),
                   nrow = 2,
                   dimnames = list(
                     trt = c("Yes", "No"),
                     ctrl = c("Yes", "No")
                   )
            ),
            alternative = "greater"
          )$p.value
        }
      )

      power.est <- mean(p.value < alpha)
      if (power.est >= power) {
        flag <- flag + 1
      } else if (power.est < power & flag >= 1) {
        flag <- 0
      }
      if ((flag == 10)) {
        break
      }
    }
  
  n <- n - flag + 1
  cat("sample needed in each arm:  ", n)
}

system.time(twosamp.fisher())

Upvotes: 2

Views: 83

Answers (1)

jblood94
jblood94

Reputation: 16981

Using Ritchie Sacramento's suggestion, you can replace the apply call with

phyper(m[,1] - 1, m[,1] + m[,3], m[,2] + m[,4], n, FALSE)

That speeds it up considerably:

system.time(twosamp.fisher())
#> sample needed in each arm:   622
#>    user  system elapsed 
#>    0.05    0.01    0.06

Alternatively, if n is not very large (e.g., < 5e3) you could calculate the exact power for a given n by finding the ka and kb that would result in the Fisher p-value being less than alpha. That can then be used to find the value of n that results in the desired power:

twosamp.fisher2 <- function(p1 = 0.1, p2 = 0.05, alpha = 0.025, power = 0.9) {
  fPow <- function(n) {
    n <- round(n)
    thresh <- 1:(n*2) - qhyper(alpha, 1:(n*2), (n*2 - 1):0, n, FALSE)
    sum(dbinom(sequence(thresh, from = 1:(n*2), by = -1L), n, p1)*dbinom(sequence(thresh, from = 0L), n, p2)) - power
  }
  
  z_c <- qnorm(1 - alpha) + qnorm(power)
  n0 <- n1 <- round(z_c^2 * (p1 * (1 - p1) + p2 * (1 - p2)) / ((p1 - p2)^2))
  
  if (fPow(n0) < 0) {
    while (fPow(n1 <- n1*1.2) < 0) {}
  } else {
    while (fPow(n0 <- n0/1.2) > 0) {}
  }
  ceiling(uniroot(fPow, c(n0, n1), tol = 1)$root)
}

The performance is better than the original function, but not as fast as estimating through sampling:

system.time(n <- twosamp.fisher2())
#>    user  system elapsed 
#>    1.36    0.02    1.37
n
#> [1] 606

Upvotes: 3

Related Questions