Ben
Ben

Reputation: 21625

How to deal with perfect fit linear model

The data I'm dealing with occasionally has a "perfectly fitting" linear model. For each regression I run, I need to extract the r.squared value which I've been doing with summary(mymodel)$r.squared but this fails in the case of a perfectly fitting model (see below).

df <- data.frame(x = c(1,2,3,4,5), y = c(1,1,1,1,1))
      mymodel <- lm(y ~ x, data = df)
      summary(mymodel)$r.squared #This raises a warning
      0.5294

How can I handle these cases? Basically, I think I want to do something like

If(mymodel is a perfect fit)
    rsquared = 1
else
    rsquared = summary(mymodel)$r.squared

Upvotes: 3

Views: 3825

Answers (3)

LyzandeR
LyzandeR

Reputation: 37879

If you want to make sure that everything will be working perfect then you can just slightly modify the source code (type summary.lm to see the original code):

df <- data.frame(x = c(1,2,3,4,5), y = c(1,1,1,1,1))
mymodel <- lm(y ~ x, data = df)

This is how i modified it. All is the same as the original summary function apart from the bit at the bottom of the function.

summary2 <- function (object, correlation = FALSE, symbolic.cor = FALSE, 
                      ...) 
{
  z <- object
  p <- z$rank
  rdf <- z$df.residual
  if (p == 0) {
    r <- z$residuals
    n <- length(r)
    w <- z$weights
    if (is.null(w)) {
      rss <- sum(r^2)
    }
    else {
      rss <- sum(w * r^2)
      r <- sqrt(w) * r
    }
    resvar <- rss/rdf
    ans <- z[c("call", "terms", if (!is.null(z$weights)) "weights")]
    class(ans) <- "summary.lm"
    ans$aliased <- is.na(coef(object))
    ans$residuals <- r
    ans$df <- c(0L, n, length(ans$aliased))
    ans$coefficients <- matrix(NA, 0L, 4L)
    dimnames(ans$coefficients) <- list(NULL, c("Estimate", 
                                               "Std. Error", "t value", "Pr(>|t|)"))
    ans$sigma <- sqrt(resvar)
    ans$r.squared <- ans$adj.r.squared <- 0
    return(ans)
  }
  if (is.null(z$terms)) 
    stop("invalid 'lm' object:  no 'terms' component")
  if (!inherits(object, "lm")) 
    warning("calling summary.lm(<fake-lm-object>) ...")
  Qr <- qr(object)
  n <- NROW(Qr$qr)
  if (is.na(z$df.residual) || n - p != z$df.residual) 
    warning("residual degrees of freedom in object suggest this is not an \"lm\" fit")
  r <- z$residuals
  f <- z$fitted.values
  w <- z$weights
  if (is.null(w)) {
    mss <- if (attr(z$terms, "intercept")) 
      sum((f - mean(f))^2)
    else sum(f^2)
    rss <- sum(r^2)
  }
  else {
    mss <- if (attr(z$terms, "intercept")) {
      m <- sum(w * f/sum(w))
      sum(w * (f - m)^2)
    }
    else sum(w * f^2)
    rss <- sum(w * r^2)
    r <- sqrt(w) * r
  }
  resvar <- rss/rdf
  p1 <- 1L:p
  R <- chol2inv(Qr$qr[p1, p1, drop = FALSE])
  se <- sqrt(diag(R) * resvar)
  est <- z$coefficients[Qr$pivot[p1]]
  tval <- est/se
  ans <- z[c("call", "terms", if (!is.null(z$weights)) "weights")]
  ans$residuals <- r
  ans$coefficients <- cbind(est, se, tval, 2 * pt(abs(tval), 
                                                  rdf, lower.tail = FALSE))
  dimnames(ans$coefficients) <- list(names(z$coefficients)[Qr$pivot[p1]], 
                                     c("Estimate", "Std. Error", "t value", "Pr(>|t|)"))
  ans$aliased <- is.na(coef(object))
  ans$sigma <- sqrt(resvar)
  ans$df <- c(p, rdf, NCOL(Qr$qr))
  if (p != attr(z$terms, "intercept")) {
    df.int <- if (attr(z$terms, "intercept")) 
      1L
    else 0L
    ans$r.squared <- mss/(mss + rss)
    ans$adj.r.squared <- 1 - (1 - ans$r.squared) * ((n - 
                                                       df.int)/rdf)
    ans$fstatistic <- c(value = (mss/(p - df.int))/resvar, 
                        numdf = p - df.int, dendf = rdf)
  }
  else ans$r.squared <- ans$adj.r.squared <- 0
  ans$cov.unscaled <- R
  dimnames(ans$cov.unscaled) <- dimnames(ans$coefficients)[c(1, 
                                                             1)]

  #below is the only change to the code
  #instead of ans$r.squared <- 1 the original code had a warning
  if (is.finite(resvar) && resvar < (mean(f)^2 + var(f)) * 
        1e-30) {
    ans$r.squared <- 1 #this is practically the only change in the source code. Originally it had the warning here
  }
  #moved the above lower in the order of the code so as not to affect the original code
  #checked it and seems to be working properly

  if (correlation) {
    ans$correlation <- (R * resvar)/outer(se, se)
    dimnames(ans$correlation) <- dimnames(ans$cov.unscaled)
    ans$symbolic.cor <- symbolic.cor
  }
  if (!is.null(z$na.action)) 
    ans$na.action <- z$na.action
  class(ans) <- "summary.lm"
  ans

}

Run the new formula and see that it works now without any warnings. No other if or else if conditions are required.

> summary2(mymodel)$r.squared 
[1] 1

Upvotes: 1

rawr
rawr

Reputation: 20811

You can use tryCatch

df <- data.frame(x = c(1,2,3,4,5), y = c(1,1,1,1,1))
      mymodel <- lm(y ~ x, data = df)
      summary(mymodel)$r.squared #This raises a warning

tryCatch(summary(mymodel)$r.squared, warning = function(w) return(1))
# [1] 1

And with an added conditional to catch specific warnings

df <- data.frame(x = c(1,2,3,4,5), y = c(1,1,1,1,1))
mymodel <- lm(y ~ x, data = df)
summary(mymodel)$r.squared #This raises a warning

f <- function(expr) {
  tryCatch(expr, 
           warning = function(w) {
             if (grepl('perfect fit', w))
               return(1) 
             else return(w)
           })  
}

f(TRUE)
# [1] TRUE

f(sum(1:5))
# [1] 15

f(summary(mymodel)$r.squared)
# [1] 1

f(warning('this is not a fit warning'))
# <simpleWarning in doTryCatch(return(expr), name, parentenv, handler): this is not a fit warning>

Upvotes: 5

koekenbakker
koekenbakker

Reputation: 3604

One option to catch a perfect fit is to determine the residuals: if it is a perfect fit, the sum of residuals will be zero.

x = 1:5

# generate 3 sets of y values, last set is random values
y = matrix(data = c(rep(1,5),1:5,rnorm(5)), nrow = 5)
tolerance = 0.0001
r.sq = array(NA,ncol(y))

# check fit for three sets
for (i in 1:ncol(y)){
  fit = lm(y[,i]~x)

  # determine sum of residuals
  if (sum(abs(resid(fit))) < tolerance) {

    # perfect fit case
    r.sq[i] = 1 } else { 

      # non-perfect fit case
      r.sq[i] = summary(fit)$r.squared
  }
}

print(r.sq)
# [1] 1.0000000 1.0000000 0.7638879

Upvotes: 0

Related Questions