Reputation: 169
I have a dataset with the number of patients admitted for a given condition in an area on each day. To identify which ambient variables affect that number, I fitted a Poisson model that includes multiple environmental variables (temperature, pressure, loudness, ...). I also included the sine and cosine terms for 180-, 90- and 360-day periods for seasonality, and the day of the week, as constant effects. I first included all the environmental variables in the model; whenever one of them was not significant, I removed it and included its +1 lag instead. I also want to adjust for the autoregressive component of the dependent variable. The partial autocorrelation of the variable indicates that lags 1, 2, 3, 5, 6, 8 and 9 are significant.
This is the article whose analysis I am trying to emulate: https://www.sciencedirect.com/science/article/pii/S0013935124003438?via=ihub
How would you add the autoregressive component of the dependent variable? Do you think I should take anything else into account?
The code I have written so far is the following:
# Variables of interest (adjust to match your data)
variables <- c(
  "pm10", "pm2.5", "no2", "O3", "rh", "hPa", "tmax", "tmin",
  "viento", "insol", "Ld", "Ln", "Ltot", "Tcal", "Tfrio", "O3oct"
)

# Lags from 0 to 15 days
lags <- 0:15

# Add one column named "<variable>_lag<k>" to `data` for every variable and
# every lag k, holding the variable shifted by k rows with dplyr::lag().
# NOTE(review): this assumes the rows of `data` are already in chronological
# order with one row per day -- confirm before relying on the lagged columns.
for (var in variables) {
  for (lag in lags) {
    lagged_name <- paste0(var, "_lag", lag)
    data[[lagged_name]] <- dplyr::lag(data[[var]], n = lag)
  }
}
#' Build the model formula for the current lag selection
#'
#' Every entry of `variable_lags` contributes the term `<name>_lag<value>`;
#' the entries of `fixed_effects` are added unchanged. The response is always
#' `frT`.
#'
#' @param variable_lags Named vector whose names are base variable names and
#'   whose values are the lag currently selected for each of them. May be
#'   empty.
#' @param fixed_effects Character vector of term names added as-is. May be
#'   empty.
#' @param type `"poisson"` (default) or `"zeroinf"`. For `"zeroinf"` the
#'   suffix `| 1` is appended to the right-hand side (presumably an
#'   intercept-only zero-inflation part for the zero-inflated fitter -- confirm
#'   against the fitting function used).
#' @return A formula object.
create_formula <- function(variable_lags, fixed_effects, type = "poisson") {
  # Vectorised construction; yields character(0) when no lagged variable is
  # left instead of the empty list that sapply() would return.
  lag_terms <- if (length(variable_lags) > 0) {
    paste0(names(variable_lags), "_lag", variable_lags)
  } else {
    character(0)
  }

  # Joining a single term vector avoids a dangling "+" when either group is
  # empty, which previously produced a string that could not be parsed.
  rhs_terms <- c(lag_terms, fixed_effects)
  if (length(rhs_terms) == 0) {
    rhs_terms <- "1"  # intercept-only model when nothing is left
  }
  rhs <- paste(rhs_terms, collapse = " + ")

  if (type == "zeroinf") {
    rhs <- paste(rhs, "| 1")
  }

  as.formula(paste("frT ~", rhs))
}
# Terms that enter every candidate formula without a lag suffix.
fixed_effects <- c(
  # Day-of-week indicators (the remaining day is left out of the list)
  "lunes", "martes", "miércoles", "jueves", "viernes", "sábado",
  # Presumably a public-holiday indicator -- confirm against the data
  "festivo",
  # Sine / cosine pairs used for seasonality (365, 180, 120 and 90 days)
  "s365", "co365",
  "s180", "co180",
  "s120", "co120",
  "s90", "co90",
  # NOTE(review): meaning of V1 is not visible in this file
  "V1"
)
#' Backward stepwise selection that shifts lags before dropping variables
#'
#' Repeatedly looks at the coefficient with the largest p-value. While that
#' p-value exceeds `threshold`:
#'   * a lagged variable has its lag increased by one, or is removed once its
#'     lag has reached `max_lag`;
#'   * a fixed effect is removed.
#' The model is refitted after every change. The loop stops when no p-value
#' exceeds `threshold`, when a refit does not converge, or when the worst term
#' cannot be matched to any tracked variable.
#'
#' The data frame is not a parameter: the refits use the object called `data`
#' found from this function's enclosing environment, and the formula is built
#' with `create_formula()`.
#'
#' @param model Fitted starting model whose summary exposes a coefficient
#'   table with a "Pr(>|z|)" column.
#' @param variable_lags Named vector: base variable name -> current lag.
#' @param max_lag Largest lag tried before a variable is removed.
#' @param fixed_effects Character vector of non-lagged term names.
#' @param threshold p-value above which a term is adjusted or removed.
#' @param type "poisson" refits with glm(); any other value refits with
#'   zeroinfl().
#' @return The last fitted model (which may be a non-converged fit if the
#'   loop stopped for that reason).
backward_stepwise_with_lags <- function(model, variable_lags, max_lag,
                                        fixed_effects, threshold = 0.05,
                                        type = "poisson") {
  while (TRUE) {
    # Coefficient table of the current model
    coef_table <- coef(summary(model))
    # Zero-inflated summaries are expected to hold one table per model part
    # (count / zero) rather than a single matrix; only the count part is
    # screened here. NOTE(review): confirm against the zeroinfl() in use.
    if (is.list(coef_table) && !is.null(coef_table$count)) {
      coef_table <- coef_table$count
    }

    # Keep the row names explicitly: a one-row table would otherwise lose
    # them when the column is extracted.
    p_values <- setNames(coef_table[, "Pr(>|z|)"], rownames(coef_table))
    p_values <- p_values[names(p_values) != "(Intercept)"]
    p_values <- p_values[!is.na(p_values)]

    # Split into lagged variables and fixed effects. `fixed_effects` is an
    # unnamed character vector, so membership must be tested against its
    # values (testing against its names never matched anything).
    is_fixed <- names(p_values) %in% fixed_effects
    p_values_lags <- p_values[!is_fixed]
    p_values_fixed <- p_values[is_fixed]

    max_p_value_lags <- if (length(p_values_lags) > 0) {
      max(p_values_lags)
    } else {
      -Inf
    }
    max_p_value_fixed <- if (length(p_values_fixed) > 0) {
      max(p_values_fixed)
    } else {
      -Inf
    }
    max_p_value <- max(max_p_value_lags, max_p_value_fixed)

    # Nothing left above the threshold: selection is finished
    if (max_p_value <= threshold) {
      break
    }

    if (max_p_value == max_p_value_lags) {
      changed_term <- names(p_values_lags)[which.max(p_values_lags)]
      base_variable <- sub("_lag\\d+$", "", changed_term)

      # A coefficient that maps to no tracked variable cannot be shifted or
      # removed; refitting would reproduce the same model forever.
      if (!base_variable %in% names(variable_lags)) {
        warning(
          "El término ", changed_term,
          " no corresponde a ninguna variable con lag ni a un efecto fijo; ",
          "se detiene la selección.",
          call. = FALSE
        )
        break
      }

      current_lag <- variable_lags[[base_variable]]
      if (is.na(current_lag)) {
        cat("El lag actual para la variable", base_variable, "es NA. Eliminando la variable.\n")
        # Remove by name (comparing the lag values to the name removed nothing)
        variable_lags <- variable_lags[names(variable_lags) != base_variable]
      } else if (current_lag < max_lag) {
        cat("Aumentando el lag de la variable:", base_variable, "de", current_lag, "a", current_lag + 1, "con p-valor de:", max_p_value, "\n")
        variable_lags[base_variable] <- current_lag + 1
      } else {
        cat("Eliminando la variable:", base_variable, "con p-valor de:", max_p_value, "\n")
        variable_lags <- variable_lags[names(variable_lags) != base_variable]
      }
    } else {
      changed_term <- names(p_values_fixed)[which.max(p_values_fixed)]
      cat("Eliminando el efecto fijo:", changed_term, "con p-valor de:", max_p_value, "\n")
      fixed_effects <- fixed_effects[fixed_effects != changed_term]
    }

    print(variable_lags)

    # Refit with the updated term set
    new_formula <- create_formula(variable_lags, fixed_effects, type)
    if (type == "poisson") {
      model <- glm(new_formula, data = data, family = poisson)
    } else {
      model <- zeroinfl(new_formula, data = data, dist = "poisson")
    }

    # Stop if the refit did not converge. `changed_term` is set in both
    # branches above, so the message is valid for fixed effects as well.
    if (!isTRUE(model$converged)) {
      cat("El modelo no ha convergido después de ajustar la variable:", changed_term, "\n")
      break
    }
  }

  model
}
# Largest lag tried for a variable before it is dropped from the model
max_lag <- 10

# Start every variable at lag 0, so the same-day effect is tested first and
# the lag is only increased when that term is not significant. (The previous
# initial value of 1 meant the generated "_lag0" columns were never used.)
variable_lags <- setNames(rep(0, length(variables)), variables)

# Fit the starting Poisson model and run the lag-shifting backward selection
initial_formula <- create_formula(variable_lags, fixed_effects)
model <- glm(initial_formula, data = data, family = "poisson")
final_model <- backward_stepwise_with_lags(model, variable_lags, max_lag, fixed_effects)
Upvotes: 0
Views: 11