Adjusting for an autoregressive component in a Poisson model

I have a dataset with the number of patients admitted for a given condition in an area on each day. To identify which ambient variables affect that number, I have fitted a Poisson model that includes several environmental variables (temperature, pressure, loudness, ...). I also included the sine and cosine terms with periods of 365, 180, 120 and 90 days for seasonality, and the day of the week, as fixed effects. All the environmental variables enter the model at first; whenever one of them is not significant, I remove it and include its next lag (+1) instead. I also want to adjust for the autoregressive component of the dependent variable. The partial autocorrelation function of that variable indicates that lags 1, 2, 3, 5, 6, 8 and 9 are significant.

This is the article whose analysis I am trying to emulate: https://www.sciencedirect.com/science/article/pii/S0013935124003438?via=ihub

How would you add the autoregressive component of the dependent variable? Do you think I should take anything else into account?

The code I have written so far is the following:

# Variables of interest (adjust to match your data)
variables <- c(
  "pm10", "pm2.5", "no2", "O3", "rh", "hPa", "tmax", "tmin",
  "viento", "insol", "Ld", "Ln", "Ltot", "Tcal", "Tfrio", "O3oct"
)

# Candidate lags, in days (a lag of 0 leaves the series unshifted)
lags <- 0:15

# Add one column per variable and lag, named "<variable>_lag<k>", holding the
# variable shifted down by k rows.
# NOTE(review): a k-row shift equals a k-day lag only if `data` has exactly one
# row per day in date order - confirm before fitting.
for (var in variables) {
  for (lag in lags) {
    lagged_name <- paste0(var, "_lag", lag)
    data[[lagged_name]] <- dplyr::lag(data[[var]], n = lag)
  }
}

#' Build the model formula for the current lag selection
#'
#' @param variable_lags Named numeric vector: names are base variable names and
#'   values the lag currently selected for each one. Every pair becomes the
#'   term `<name>_lag<lag>`.
#' @param fixed_effects Character vector of term names added unchanged.
#' @param type When equal to "zeroinf" the suffix "| 1" is appended to the
#'   right-hand side; any other value (default "poisson") appends nothing.
#' @return A formula with `frT` as the response. If there are neither lagged
#'   terms nor fixed effects, the right-hand side is the intercept only.
create_formula <- function(variable_lags, fixed_effects, type = "poisson") {
  zero_part <- if (type == "zeroinf") " | 1" else ""

  # paste0() recycles zero-length arguments to "", so an empty selection has to
  # be handled explicitly or it would yield the bogus term "_lag".
  lag_terms <- if (length(variable_lags) > 0) {
    paste0(names(variable_lags), "_lag", variable_lags)
  } else {
    character(0)
  }

  # Collect every term before joining so that an empty group never leaves a
  # dangling "+" (which as.formula() cannot parse).
  rhs_terms <- c(lag_terms, fixed_effects)
  if (length(rhs_terms) == 0) {
    rhs_terms <- "1"
  }

  as.formula(paste0("frT ~ ", paste(rhs_terms, collapse = " + "), zero_part))
}


# Terms that enter the model without a lag suffix.
fixed_effects <- c(
  # Day-of-week indicators (Spanish day names; "domingo" is not listed)
  "lunes", "martes", "miércoles", "jueves", "viernes", "sábado",
  # NOTE(review): presumably a public-holiday indicator - confirm
  "festivo",
  # Sine / cosine seasonality pairs; the numeric suffix is presumably the
  # period in days - confirm against how the columns were built
  "s365", "co365", "s180", "co180", "s120", "co120", "s90", "co90",
  # NOTE(review): the meaning of V1 is not visible here - confirm
  "V1"
)


#' Backward stepwise selection that tries later lags before dropping a term
#'
#' Repeatedly inspects the p-values of the fitted model. While the largest one
#' exceeds `threshold`, the corresponding term is changed and the model is
#' refitted:
#' * a lagged variable moves to its next lag (`_lag<k>` to `_lag<k + 1>`) and
#'   is dropped once it has already reached `max_lag`;
#' * a fixed effect is dropped.
#' The loop ends when no p-value exceeds `threshold`, when a refit reports
#' non-convergence, or when the worst coefficient cannot be mapped to a term.
#'
#' The refit builds its formula with `create_formula()` and reads the data
#' frame `data` from the enclosing environment (it is not an argument).
#' NOTE(review): `zeroinfl()` is called unqualified; presumably pscl must be
#' attached by the caller - confirm.
#'
#' @param model Starting fit whose terms correspond to `variable_lags` and
#'   `fixed_effects`.
#' @param variable_lags Named numeric vector (variable name -> current lag).
#' @param max_lag Largest lag a variable may be moved to.
#' @param fixed_effects Character vector with the names of the unlagged terms.
#' @param threshold p-value above which a term is adjusted or removed.
#' @param type "poisson" refits with `glm(family = poisson)`; any other value
#'   refits with `zeroinfl(dist = "poisson")`.
#' @return The last fitted model.
backward_stepwise_with_lags <- function(model, variable_lags, max_lag, fixed_effects, threshold = 0.05,
                                        type = "poisson") {
  # Largest non-missing p-value, or -Inf when there is nothing to compare.
  highest_p <- function(p) {
    if (any(!is.na(p))) max(p, na.rm = TRUE) else -Inf
  }

  repeat {
    coef_table <- coef(summary(model))
    # NOTE(review): for zeroinfl fits the summary is expected to hold one
    # coefficient table per component (pscl); only the count component is
    # screened here - confirm.
    if (is.list(coef_table)) {
      coef_table <- coef_table$count
    }

    # Re-attach the row names explicitly: a one-row table loses them when the
    # column is extracted.
    p_values <- setNames(coef_table[, "Pr(>|z|)"], rownames(coef_table))
    p_values <- p_values[names(p_values) != "(Intercept)"]

    # `fixed_effects` is an unnamed character vector, so membership has to be
    # tested against its values, not its names.
    is_fixed <- names(p_values) %in% fixed_effects
    p_values_lags <- p_values[!is_fixed]
    p_values_fixed <- p_values[is_fixed]

    max_p_value_lags <- highest_p(p_values_lags)
    max_p_value_fixed <- highest_p(p_values_fixed)
    max_p_value <- max(max_p_value_lags, max_p_value_fixed)

    # Every remaining term is at or below the threshold: selection is done.
    if (max_p_value <= threshold) {
      break
    }

    if (max_p_value == max_p_value_lags) {
      changed_term <- names(p_values_lags)[which.max(p_values_lags)]
      base_variable <- sub("_lag\\d+$", "", changed_term)

      # A coefficient that is neither a tracked lagged variable nor a fixed
      # effect (e.g. a factor-level dummy) cannot be shifted or removed;
      # refitting would reproduce the same model forever, so stop instead.
      if (!base_variable %in% names(variable_lags)) {
        warning(
          "El coeficiente '", changed_term,
          "' no corresponde a ninguna variable de variable_lags ni a un ",
          "efecto fijo; se detiene la selección.",
          call. = FALSE
        )
        break
      }

      current_lag <- variable_lags[[base_variable]]

      if (is.na(current_lag)) {
        cat("El lag actual para la variable", base_variable, "es NA. Eliminando la variable.\n")
        variable_lags <- variable_lags[names(variable_lags) != base_variable]
      } else if (current_lag < max_lag) {
        cat("Aumentando el lag de la variable:", base_variable, "de", current_lag, "a", current_lag + 1, "con p-valor de:", max_p_value, "\n")
        variable_lags[base_variable] <- current_lag + 1
      } else {
        cat("Eliminando la variable:", base_variable, "con p-valor de:", max_p_value, "\n")
        variable_lags <- variable_lags[names(variable_lags) != base_variable]
      }
    } else {
      changed_term <- names(p_values_fixed)[which.max(p_values_fixed)]
      cat("Eliminando el efecto fijo:", changed_term, "con p-valor de:", max_p_value, "\n")
      fixed_effects <- fixed_effects[fixed_effects != changed_term]
    }

    print(variable_lags)
    new_formula <- create_formula(variable_lags, fixed_effects, type)
    model <- if (type == "poisson") {
      glm(new_formula, data = data, family = poisson)
    } else {
      zeroinfl(new_formula, data = data, dist = "poisson")
    }

    # Stop on a non-converged refit; `changed_term` is set in both branches
    # above, so the message is valid whichever kind of term was changed.
    if (!model$converged) {
      cat("El modelo no ha convergido después de ajustar la variable:", changed_term, "\n")
      break
    }
  }

  model
}
# Largest lag the stepwise search may move a variable to.
max_lag <- 10

# Start every variable at lag 0 (the unshifted series) so that the search
# covers lags 0..max_lag; starting at 1 would never try the same-day value.
variable_lags <- setNames(rep(0, length(variables)), variables)
initial_formula <- create_formula(variable_lags, fixed_effects)
model <- glm(initial_formula, data = data, family = poisson)

final_model <- backward_stepwise_with_lags(model, variable_lags, max_lag, fixed_effects)

Upvotes: 0

Views: 11

Answers (0)

Related Questions