Reputation: 44527
This is a simple question but I couldn't find a clear and compelling answer anywhere. If I have a regression model with one or more interaction terms, like:
mod1 <- lm(mpg ~ factor(cyl) * factor(am), data = mtcars)
coef(summary(mod1))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.900000 1.750674 13.080673 0.0000000000006057324
## factor(cyl)6 -3.775000 2.315925 -1.630018 0.1151545663620229670
## factor(cyl)8 -7.850000 1.957314 -4.010599 0.0004547582690011110
## factor(am)1 5.175000 2.052848 2.520888 0.0181760532676256310
## factor(cyl)6:factor(am)1 -3.733333 3.094784 -1.206331 0.2385525615801434851
## factor(cyl)8:factor(am)1 -4.825000 3.094784 -1.559075 0.1310692573417492068
what is a sure fire way of identifying which coefficient estimates are for interaction terms? The obvious way is to grep()
for the colon symbol in the term names. But let's assume for a second that's not possible because of something like:
mtcars$cyl2 <- factor(mtcars$cyl, levels = c(4,6,8), labels = paste("Cyl:", unique(mtcars$cyl)))
mod2 <- lm(mpg ~ cyl2 * factor(am), data = mtcars)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.900000 1.750674 13.080673 0.0000000000006057324
## cyl2Cyl: 4 -3.775000 2.315925 -1.630018 0.1151545663620229670
## cyl2Cyl: 8 -7.850000 1.957314 -4.010599 0.0004547582690011110
## factor(am)1 5.175000 2.052848 2.520888 0.0181760532676256310
## cyl2Cyl: 4:factor(am)1 -3.733333 3.094784 -1.206331 0.2385525615801434851
## cyl2Cyl: 8:factor(am)1 -4.825000 3.094784 -1.559075 0.1310692573417492068
I thought perhaps the terms()
object would be useful but it isn't. I could also probably make some assumption about the ordering/numbering of terms to get the intended result:
coef(summary(mod2))[5:6,]
## Estimate Std. Error t value Pr(>|t|)
## cyl2Cyl: 4:factor(am)1 -3.733333 3.094784 -1.206331 0.2385526
## cyl2Cyl: 8:factor(am)1 -4.825000 3.094784 -1.559075 0.1310693
but I don't know how to do that in a general way.
What can be done?
Upvotes: 8
Views: 1803
Reputation: 142
I'd go for something tidy:
library(tidyverse)
library(broom)
mtcars$cyl2 <- factor(mtcars$cyl, levels = c(4,6,8), labels = paste("Cyl:", unique(mtcars$cyl)))
mod2 <- lm(mpg ~ cyl2 * factor(am), data = mtcars)
mod2 %>% tidy %>% as_tibble %>% filter(term %>% str_detect("\\w{1}:\\w{1}")) %>% pull(estimate)
[1] -3.733333 -4.825000
Update
The above fails, if there is no space after the colon, this can be resolved like so:
# Look for factor variables and replace ':' with '_'
clean_colons = function(x){
x %>% as_tibble %>%
mutate_if(is.factor, function(x){str_replace_all(x,":","_")}) %>%
return()
}
# Define the tricky cyl2 varible
mtcars$cyl2 = factor(mtcars$cyl, levels = c(4,6,8),
labels = paste0("Cyl:", unique(mtcars$cyl)))
# Create the model and output parameters
mtcars %>% clean_colons %>% lm(mpg ~ cyl2 * factor(am), data = .) %>% tidy %>%
as_tibble
# A tibble: 6 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 19.1 1.52 12.6 1.38e-12
2 cyl2Cyl_6 3.77 2.32 1.63 1.15e- 1
3 cyl2Cyl_8 -4.08 1.75 -2.33 2.80e- 2
4 factor(am)1 1.44 2.32 0.623 5.39e- 1
5 cyl2Cyl_6:factor(am)1 3.73 3.09 1.21 2.39e- 1
6 cyl2Cyl_8:factor(am)1 -1.09 3.28 -0.333 7.42e- 1
# Now we no longer have the colon-in-variable-name problem, so we can do
mtcars %>% clean_colons %>% lm(mpg ~ cyl2 * factor(am), data = .) %>% tidy %>%
as_tibble %>% filter(term %>% str_detect("\\w{1}:\\w{1}")) %>% pull(estimate)
[1] 3.733333 -1.091667
Upvotes: 1
Reputation: 3570
I built something that does just this in coefplot
.
#' @title matchCoefs.default
#' @description Match coefficients to predictors
#' @details Matches coefficients to predictors using information from model matrices
#' @author Jared P. Lander
#' @aliases matchCoefs.default
#' @import reshape2
#' @param model Fitted model
#' @param \dots Further arguments
#' @return a data.frame matching predictors to coefficients
matchCoefs.default <- function(model, ...)
{
# get the terms
theTerms <- model$terms
# get the assignment position
#thePos <- model$assign
thePos <- get.assign(model)
# get intercept indicator
inter <- attr(theTerms, "intercept")
# get coef names
coefNames <- names(stats::coef(model))
# get pred names
predNames <- attr(theTerms, "term.labels")
# expand out pred names to match coefficient names
predNames <- predNames[thePos]
# if there's an intercept term add it to the pred names
if(inter == 1)
{
predNames <- c("(Intercept)", predNames)
}
# build data.frame linking term to coefficient name
matching <- data.frame(Term=predNames, Coefficient=coefNames, stringsAsFactors=FALSE)
## now match individual predictor to term
# get matrix as data.frame
factorMat <- as.data.frame(attr(theTerms, "factor"))
# add column from rownames as identifier
factorMat$.Pred <- rownames(factorMat)
factorMat$.Type <- attr(theTerms, "dataClasses")
# melt it down for comparison
factorMelt <- melt(factorMat, id.vars=c(".Pred", ".Type"), variable.name="Term")
factorMelt$Term <- as.character(factorMelt$Term)
# only keep rows where there's a match
factorMelt <- factorMelt[factorMelt$value == 1, ]
# again, bring in coefficient if needed
if(inter == 1)
{
factorMelt <- rbind(data.frame(.Pred="(Intercept)", .Type="(Intercept)", Term="(Intercept)", value=1, stringsAsFactors=FALSE), factorMelt)
}
# join into the matching data.frame
matching <- join(matching, factorMelt, by="Term")
return(matching)
}
Then you can call matchCoefs.default(mod1)
.
Upvotes: 3
Reputation: 775
This is also convoluted, but at least you can track explicitly what you are doing:
# require package
require(Hmisc)
# load and process data
data(mtcars)
mtcars$cyl <- factor(mtcars$cyl)
mtcars$gear <- factor(mtcars$gear)
# fit model
m <- lm(mpg ~ hp*wt + disp + cyl*gear, data=mtcars)
# extract all variables used in right hand side of the model
variables <- as.character(attr(terms(m), "variables"))[-1L]
# extract model coefficients
s <- summary(m)$coeff
s
# discard unwanted rows corresponding to continuous predictors
s1 <- s[rownames(s) %nin% variables[-1], ]
s1
# discard unwanted rows corresponding to categorical predictor gear
s2 <- s1[rownames(s1) %nin% paste0("gear",levels(mtcars$gear)), ]
s2
The outputs would be as follows:
> s
Estimate Std. Error t value Pr(>|t|)
(Intercept) 43.80219399 5.977952455 7.3272905 4.404820e-07
hp -0.11303405 0.040769042 -2.7725462 1.174817e-02
wt -6.76725414 1.868428113 -3.6218970 1.699630e-03
disp -0.00332244 0.014012039 -0.2371133 8.149808e-01
cyl6 2.87780626 3.180169670 0.9049222 3.762789e-01
cyl8 2.76416050 3.605957805 0.7665538 4.523007e-01
gear4 3.66877217 2.529082701 1.4506335 1.623857e-01
gear5 4.25400864 2.931068582 1.4513508 1.621881e-01
hp:wt 0.02401629 0.009953734 2.4127921 2.555079e-02
cyl6:gear4 -4.65994590 3.331261825 -1.3988531 1.771739e-01
cyl6:gear5 -3.86789967 4.400309909 -0.8790062 3.898369e-01
cyl8:gear5 -2.08842224 3.980538289 -0.5246582 6.055881e-01
> s1
Estimate Std. Error t value Pr(>|t|)
(Intercept) 43.80219399 5.977952455 7.3272905 4.404820e-07
cyl6 2.87780626 3.180169670 0.9049222 3.762789e-01
cyl8 2.76416050 3.605957805 0.7665538 4.523007e-01
gear4 3.66877217 2.529082701 1.4506335 1.623857e-01
gear5 4.25400864 2.931068582 1.4513508 1.621881e-01
hp:wt 0.02401629 0.009953734 2.4127921 2.555079e-02
cyl6:gear4 -4.65994590 3.331261825 -1.3988531 1.771739e-01
cyl6:gear5 -3.86789967 4.400309909 -0.8790062 3.898369e-01
cyl8:gear5 -2.08842224 3.980538289 -0.5246582 6.055881e-01
> s2
Estimate Std. Error t value Pr(>|t|)
(Intercept) 43.80219399 5.977952455 7.3272905 4.404820e-07
cyl6 2.87780626 3.180169670 0.9049222 3.762789e-01
cyl8 2.76416050 3.605957805 0.7665538 4.523007e-01
hp:wt 0.02401629 0.009953734 2.4127921 2.555079e-02
cyl6:gear4 -4.65994590 3.331261825 -1.3988531 1.771739e-01
cyl6:gear5 -3.86789967 4.400309909 -0.8790062 3.898369e-01
cyl8:gear5 -2.08842224 3.980538289 -0.5246582 6.055881e-01
One could loop over all categorical predictors to automate the last two steps.
Upvotes: 1
Reputation: 44527
A possible one-liner solution is:
coef(summary(mod2))[1L + which(colSums(attr(terms(mod2), "factors")[,attr(model.matrix(mod2), "assign")[-1L]]) == 2L),]
## Estimate Std. Error t value Pr(>|t|)
## cyl2Cyl: 4:factor(am)1 -3.733333 3.094784 -1.206331 0.2385526
## cyl2Cyl: 8:factor(am)1 -4.825000 3.094784 -1.559075 0.1310693
This is messy,* but basically it uses the "assign"
attribute of the model matrix:
attr(model.matrix(mod2), "assign")
## [1] 0 1 1 2 3 3
to get the names indices (off-by-one) of the interaction term coefficients:
which(colSums(attr(terms(mod2), "factors")[,attr(model.matrix(mod2), "assign")[-1L]]) == 2L)
## cyl2:factor(am) cyl2:factor(am)
## 4 5
which can then be used to extract the relevant rows from coef()
. This works because the order of coefficients is dictated by the order of columns in the design matrix and the "assign"
attribute thereof maps those columns back to terms in the original formula.
It's not super clean but seems to work.
*PS. In this example, only second-order interaction terms are extracted. Changing == 2L
to >= 2L
or something would allow for grabbing higher order interactions.
Upvotes: 3
Reputation: 3647
Another approach (also a bit convoluted) is to go through the covariate names and see how many times the names of the other covariates appear:
mtcars$cyl2 <- factor(mtcars$cyl, levels = c(4,6,8), labels = paste("Cyl:", unique(mtcars$cyl)))
mod2 <- lm(mpg ~ cyl2 * factor(am), data = mtcars)
coef(summary(mod2))
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 22.900000 1.750674 13.080673 6.057324e-13
#> cyl2Cyl: 4 -3.775000 2.315925 -1.630018 1.151546e-01
#> cyl2Cyl: 8 -7.850000 1.957314 -4.010599 4.547583e-04
#> factor(am)1 5.175000 2.052848 2.520888 1.817605e-02
#> cyl2Cyl: 4:factor(am)1 -3.733333 3.094784 -1.206331 2.385526e-01
#> cyl2Cyl: 8:factor(am)1 -4.825000 3.094784 -1.559075 1.310693e-01
# Essentially an ugly double-loop
check.inter <- function(coefs){
which(lapply(coefs, function(co){
sum(unlist(lapply(coefs, function(co2){
grepl(co2, co, fixed = T)})))})>1)}
check.inter(names(coefficients(mod2)))
#> [1] 5 6
coef(summary(mod2))[check.inter(names(coefficients(mod2))),]
#> Estimate Std. Error t value Pr(>|t|)
#> cyl2Cyl: 4:factor(am)1 -3.733333 3.094784 -1.206331 0.2385526
#> cyl2Cyl: 8:factor(am)1 -4.825000 3.094784 -1.559075 0.1310693
Created on 2018-05-17 by the reprex package (v0.2.0).
Upvotes: 1
Reputation: 13108
This seems a little convoluted, but could we just enumerate all the main effects and then take the set difference?
mod2 <- lm(mpg ~ cyl2 * factor(am) + wt * disp, data = mtcars)
variables <- labels(mod2)[attr(terms(mod2), "order") == 1]
factors <- sapply(names(mod2$xlevels), function(x) paste0(x, mod2$xlevels[[x]])[-1])
setdiff(colnames(model.matrix(mod2)), c("(Intercept)", variables, unlist(factors)))
# [1] "cyl2Cyl: 4:factor(am)1" "cyl2Cyl: 8:factor(am)1" "wt:disp"
Upvotes: 4