mert
mert

Reputation: 371

Reproduce lasso simulation example 1

I would like to reproduce the results of example 1 on the page 280 in the original lasso paper.

Any suggestion, please?

    library(MASS)
    library(glmnet)
    
    simfun <- function(trainn = 20, validationn = 20, testn = 200, corr =0.5, sigma = 3, beta) { 


      n <- trainn + testn + validationn
      p <- length(beta)
      Covmatrix <- outer(1:p, 1:p, function(x,y){corr^abs(x-y)})
      X <- mvrnorm(n, rep(0,p), Covmatrix) # MASS
      X <- X[sample(n),]
      y <- X%*%beta + rnorm(n,mean = 0,sd=sigma)
      trainx <- X[1:trainn,]
      validationx <- X[(trainn+1):(trainn+validationn),]
      testx <- X[(trainn+validationn+1):n,]
      trainy <- y[1:trainn,]
      validationy <- y[(trainn+1):(trainn+validationn),]
      testy <- y[(trainn+validationn+1):n,]
      list(trainx = trainx, validationx = validationx, testx = testx, 
           trainy = trainy, validationy = validationy, testy = testy)
    }
    
    beta <- c(3,1.5,0,0,2,0,0,0)
    data <- simfun(20,20,200,corr=0.5,sigma=3,beta)
    trainx <- data$trainx
    trainy <- data$trainy
    validationx <- data$validationx
    validationy <- data$validationy
    testx <- data$testx
    testy <- data$testy
    
    
    # training: find betas for all the lambdas
    betas <- coef(glmnet(trainx,trainy,alpha=1))
    
    # validation: compute validation test error for each lambda and find the minimum
    J.val <- colMeans((validationy-cbind(1,validationx)%*%betas)^2)
    beta.opt <- betas[, which.min(J.val)]
    
    # test
    test.mse <- mean((testy-cbind(1,testx)%*%beta.opt)^2)
    test.mse

Upvotes: 0

Views: 1124

Answers (1)

younggeun
younggeun

Reputation: 953

This is simulation study, so I think you don't have to use training-validation approach. It just cause variation due to its randomness. You can implement expected test error using its definition.

  1. Generate several training data sets following your construction
  2. Generate an independent test set
  3. Fit each model based on each training set
  4. Calculate error against the test set
  5. Take average

    set.seed(1)
    simpfun <- function(n_train = 20, n_test = 10, simul = 50, corr = .5, sigma = 3, beta = c(3, 1.5, 0, 0, 2, 0, 0, 0), lam_grid = 10^seq(-3, 5)) {
      require(foreach)
      require(tidyverse)
      # true model
      p <- length(beta)
      Covmatrix <- outer(
        1:p, 1:p,
        function(x, y) {
          corr^abs(x - y)
        }
      )
      X <- foreach(i = 1:simul, .combine = rbind) %do% {
        MASS::mvrnorm(n_train, rep(0, p), Covmatrix)
      }
      eps <- rnorm(n_train, mean = 0, sd = sigma)
      y <- X %*% beta + eps # generate true model
      # generate test set
      test <- MASS::mvrnorm(n_test, rep(0, p), Covmatrix)
      te_y <- test %*% beta + rnorm(n_test, mean = 0, sd = sigma) # test y
      simul_id <- gl(simul, k = n_train, labels = 1:n_train)
      # expected test error
      train <-
        y %>%
        as_tibble() %>%
        mutate(m_id = simul_id) %>%
        group_by(m_id) %>% # for each simulation
        do(yhat = predict(glmnet::cv.glmnet(X, y, alpha = 1, lambda = lam_grid), newx = test, s = "lambda.min")) # choose the smallest lambda
      MSE <- # (y0 - fhat0)^2
        sapply(train$yhat, function(x) {
          mean((x - te_y)^2)
        })
      mean(MSE) # 1/simul * MSE
    }
    simpfun()
    

Edit: for tuning parameter,

    find_lambda <- function(.data, x, y, lambda, x_val, y_val) {
      .data %>%
        do(
          tuning = predict(glmnet::glmnet(x, y, alpha = 1, lambda = lambda), newx = x_val)
        ) %>%
        do( # tuning parameter: validation set
          mse = apply(.$tuning, 2, function(yhat, y) {
            mean((y - yhat)^2)
          }, y = y_val)
        ) %>%
        mutate(mse_min = min(mse)) %>%
        mutate(lam_choose = lambda[mse == mse_min]) # minimize mse
    }

Using this function, it is possible to add validation step

    simpfun <- function(n_train = 20, n_val = 20, n_test = 10, simul = 50, corr = .5, sigma = 3, beta = c(3, 1.5, 0, 0, 2, 0, 0, 0), lam_grid = 10^seq(10, -1, length.out = 100)) {
    require(foreach)
    require(tidyverse)
    # true model
    p <- length(beta)
    Covmatrix <- outer(
      1:p, 1:p,
      function(x, y) {
        corr^abs(x - y)
      }
    )
    X <- foreach(i = 1:simul, .combine = rbind) %do% {
      MASS::mvrnorm(n_train, rep(0, p), Covmatrix)
    }
    eps <- rnorm(n_train, mean = 0, sd = sigma)
    y <- X %*% beta + eps # generate true model
    # generate validation set
    val <- MASS::mvrnorm(n_val, rep(0, p), Covmatrix)
    val_y <- val %*% beta + rnorm(n_val, mean = 0, sd = sigma) # validation y
    # generate test set
    test <- MASS::mvrnorm(n_test, rep(0, p), Covmatrix)
    te_y <- test %*% beta + rnorm(n_test, mean = 0, sd = sigma) # test y
    simul_id <- gl(simul, k = n_train, labels = 1:n_train)

    Y <-
      y %>%
      as_tibble() %>%
      mutate(m_id = simul_id) %>%
      group_by(m_id) %>% # for each simulation: repeat
      rename(y = V1)

    # Tuning parameter
    Tuning <-
      Y %>%
      find_lambda(x = X, y = y, lambda = lam_grid, x_val = val, y_val = val_y)

    # expected test error
    test_mse <-
      Tuning %>%
      mutate(
        test_err = mean(
          (predict(glmnet::glmnet(X, y, alpha = 1, lambda = lam_choose), newx = test) - te_y)^2
        )
      ) %>%
      select(test_err) %>%
      pull()
    mean(test_mse)
  }
  simpfun()

Upvotes: 1

Related Questions