Manu675
Manu675

Reputation: 21

Are these prediction intervals for bagged ETS models calculated correctly? (coded in R)

it would be greatly appreciated if people here could have a look at this code and try to help with assessing whether these prediction intervals are calculated correctly or what needs to be changed. I tried to produce a 48 step ahead forecast using a bagged ETS model using the forecast.baggedETS() function in R. The data are hourly traffic flows (counts) from New York City. My issue is that my point forecasts seem to be quite good, whereas the simulated prediction intervals seem to be a bit too wide and the PI means diverge from the point forecasts as well (but I suppose they can be asymmetric and they should also be wider than regular confidence intervals). To evaluate the prediction intervals I have computed the PI coverage probability (PICP) which is only a mere 71%, which seems to be quite low. I have also enclosed a graph of the point forecasts and of the PIs. data bagged model + PIs

Do these make sense or what should I change? Thank you very much in advance.

#original series
ts_ori <- ts(df_ori$V2, start = c(1,1), frequency = 24) # 

# train test split
train <- window(ts_ori, start = c(1,1), end = c(21,24))
test <- window(ts_ori, start = c(22,1), end = c(24,1))

L <- BoxCox.lambda(ts_ori)
L # the optimal lambda is 0.9176

# Box-Cox tranformed original series
ts_ori_bc <- BoxCox(x= ts_ori, lambda = L)

train_bc <- window(ts_ori_bc, start = c(1,1), end = c(21,24))
test_bc <- window(ts_ori_bc, start = c(22,1), end = c(24,1))


ets_ZZZ <- ets(y = train, model = "ZZZ")
summary(ets_ZZZ)
forecast(ets_ZZZ) %>% autoplot() + autolayer(test)


# Bergmeirs Bagging ETS function
# the function bld.mbb.bootstrap is used to calculate the bootstrapped series with the 
# Box-Cox and Loess-based decomposition (BLD) bootstrap

#num is the number of bootstrap versions to generate
#blocksize is the size of the MBB 


# ensembling

# the default for the ETS function is the "ZZZ" automated model selection 
# so the bootstrapped ETS models can theoretically choose a different ETS model everytime

# creating 48 step ahead forecasts from a regular automatically chosen ETS model
etsfc <- forecast(ets(train), h=48) 

# the bagged ETS() function implements the bagged model forecasting method from the 
# Bergmeir 2016 paper
# the default for num in the baggedETS() function is 100
baggedfc <- forecast(baggedETS(train), h=48)


# plotting the bagged forecasts against the normal ETS forecasts
# the last two lines before dev.off() remove grid and grey background color

autoplot(train, xlab = "Time", ylab = "Traffic flows") +
        autolayer(baggedfc, series="Bagged.BLD.MBB.ETS", PI=FALSE) +
        autolayer(etsfc, series="ETS", PI=FALSE) +
        guides(colour=guide_legend(title="Point Forecasts")) +
        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
              panel.background = element_blank(), axis.line = element_line(colour = "black"))


# the bagged ETS model is the average of 100  ETS models of the bootstrapped training data

# gives the point forecasts and the min/max of the 100 ETS models 
summary(baggedfc)

# prediction intervals for the bootstrapped series

# step 1: simulating nsim = 1000 series using the MBB procedure
numsim <- 1000
sim <- bld.mbb.bootstrap(train, numsim)

# step 2: For each of these series, we fit an ETS model and simulate one sample path from that model
# A different ETS model may be selected in each case, although it will most likely select the same 
# model because the series are similar. However, the estimated parameters will be different.

# the simulate() function simulates responses from the distribution of the fitted ets(sim) object

h <- 48
future <- matrix(0, nrow=numsim, ncol=h)
for(i in seq(numsim))
        future[i,] <- simulate(ets(sim[[i]]), nsim=h, seed = 123)



# step 3: we take the means and quantiles of these simulated sample paths to form point 
# forecasts and prediction intervals.

# tsp() gives the start time in time units the end time and the frequency of a time series

start <- tsp(train)[2]+1/24
simfc <- structure(list(
        mean = ts(colMeans(future), start=start, frequency=24),
        lower = ts(apply(future, 2, quantile, prob=0.025),
                   start=start, frequency=24),
        upper = ts(apply(future, 2, quantile, prob=0.975),
                   start=start, frequency=24),
        level=95),
        class="forecast")

# step 4: plotting the bagged ETS model and the prediction intervals

etsfc <- forecast(ets(train), h=48, level=95) 


autoplot(train, ylab = "Traffic flows", xlab = "Time", series = "Training data", color = "black") +
        autolayer(simfc, series="Simulated PIs") +
        autolayer(baggedfc, PI=FALSE, series = "Bagged.BLD.MBB.ETS") +
        guides(colour=guide_legend(title="Series")) +
        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                panel.background = element_blank(), axis.line = element_line(colour = "black"))


# PICP

for (i in 1:length(baggedfc$mean)) {
        # creating an indicator variable for every element of the mean column and store the 
        # elements in picp_dummy
        picp_dummy <- ifelse(baggedfc$mean[i] >= simfc$lower & baggedfc$mean[i] <= simfc$upper,1,0)
}
print(picp_dummy)

picp = 1/length(baggedfc$mean) *sum(picp_dummy)

print(picp)
# the PICP for the simulated 95% PI for the bagged ETS model is 0.7083333

# MPIW

length = length(simfc$upper)
length


mpiw = (1/length)*sum(simfc$upper - simfc$lower)
print(mpiw)
# the MPIW of the simulated 95% PI is 15437.7

update

I used my data and ran the modified code from Hyndman's answer. However, I got prediction intervals which are still very wide. In his sample code with the random normal numbers the width of the prediction intervals is roughly in the same ballpark as the variation in the training data. However, when using my data the PI width is extremely large.

New PIs obtained from Hyndman's code

Upvotes: 0

Views: 562

Answers (1)

Rob Hyndman
Rob Hyndman

Reputation: 31800

There are several issues here.

  1. Don't set the seed on every simulation or you end up with the same processes being generated. This will reduce the variation of your simulated series. If you want to set the seed, do it once at the start of all calculations.
  2. Your calculation for the PI coverage needs to compare the simulated quantiles against the test data, not against the forecast means.
  3. The prediction intervals need to be conditional on the training data. You can achieve this by refitting the bootstrapped models to the original training data (without re-estimating). This is not really needed when computing the forecast means as the differences will average out. It becomes important when estimating the prediction intervals as they will be too wide otherwise (with an additional source of variation).

Here is some cleaned up code to do what you want.

library(forecast)
library(ggplot2)

set.seed(666)

# original series (random data for the example)
ts_ori <- ts(rnorm(24*24), frequency=24)

# train test split
train <- window(ts_ori, start = c(1, 1), end = c(21, 24))
test <- window(ts_ori, start = c(22, 1), end = c(24, 1))

# creating 48 step ahead forecasts from ETS model
etsfc <- forecast(ets(train), h = 48)
# And from a bagged model
baggedfc <- forecast(baggedETS(train), h = 48)

# plotting the bagged forecasts against the normal ETS forecasts
autoplot(train, xlab = "Time", ylab = "Traffic flows") +
  autolayer(baggedfc, series = "Bagged ETS", PI = FALSE) +
  autolayer(etsfc, series = "ETS", PI = FALSE) +
  guides(colour = guide_legend(title = "Point Forecasts"))


# prediction intervals for the bootstrapped series

# step 1: simulate 1000 series using the MBB procedure
numsim <- 200
sim <- bld.mbb.bootstrap(train, numsim)

# step 2: Fit an ETS model to each bootstrapped series, then refit to
# training data and simulate from refit.
h <- 48
future <- matrix(0, nrow = numsim, ncol = h)
for (i in seq(numsim)) {
  model <- ets(sim[[i]])
  refit <- ets(train, model = model, use.initial.values = TRUE)
  future[i, ] <- simulate(refit, nsim = h)
}

# step 3: we take the means and quantiles of these simulated sample paths to
# form point forecasts and prediction intervals.

start <- tsp(train)[2] + 1 / 24
simfc <- structure(list(
  mean = ts(colMeans(future),
            start = start, frequency = 24),
  lower = ts(apply(future, 2, quantile, prob = 0.025),
             start = start, frequency = 24),
  upper = ts(apply(future, 2, quantile, prob = 0.975),
             start = start, frequency = 24),
  level = 95
), class = "forecast")

# step 4: plotting the bagged ETS model and the prediction intervals
autoplot(train, ylab = "Traffic flows") +
  autolayer(simfc, series = "Simulated PIs") +
  autolayer(test, series = "Test") +
  guides(colour = guide_legend(title = ""))


# PICP
mean(test >= simfc$lower & test <= simfc$upper)
#> [1] 0.9166667

# MPIW
mean(simfc$upper - simfc$lower)
#> [1] 4.079027

Created on 2020-05-04 by the reprex package (v0.3.0)

Upvotes: 0

Related Questions