Predict with biglm factors

Question

Why does predict with biglm differ from predict with lm?

Here is my code to generate data: (admittedly sloppy)

require(data.table)
require(biganalytics)

#create a data set with factor variables and one continuous numerical variable 
LETTERS702 <- c(LETTERS, sapply(LETTERS, function(x) paste0(x, LETTERS)))
data = data.table(ded = runif(100), cont = runif(100), fac1 = as.factor(LETTERS702[1:10]))
data[, cont := as.factor(ifelse(cont < .25, 100, 
                  ifelse(cont < .5, 200, 
                         ifelse( cont < .75, 300,
                                 cont))))]
newdata = data.table(ded = runif(50), cont = runif(50), fac1 = as.factor(LETTERS702[7:12]))
newdata[, cont := as.factor(ifelse(cont < .25, 100, 
                     ifelse(cont < .5, 200, 
                            ifelse( cont < .75, 300,
                                    cont))))]

#add the levels from the old data to the new data
levels(newdata$fac1) = c(levels(newdata$fac1),levels(data$fac1))
levels(newdata$cont) = c(levels(newdata$cont),levels(data$cont))

levels(data$fac1) = c(levels(data$fac1),levels(newdata$fac1))
levels(data$cont) = c(levels(data$cont),levels(newdata$cont))

Now when I run an lm regression and try to create predicted values in the new data table, it works as expected:

reg =lm(ded ~ fac1 + cont, data = data)
summary(reg)

newdata[fac1 %in% data$fac1 & cont %in% data$cont, ded_hat := predict(reg, newdata[fac1 %in% data$fac1 & cont %in% data$cont, ])]

head(newdata)
         ded cont fac1   ded_hat
1: 0.1820960  200    G 0.4636306
2: 0.2561465  300    H 0.5412485
3: 0.8711127  300    I 0.5351451
4: 0.9839877  200    J 0.5737994
5: 0.8802973  200    K        NA
6: 0.1385486  200    L        NA

But when I try to use biglm, All of the new predicted values are NA:

reg = biglm(ded ~ fac1 + cont, data = data)
summary(reg)

newdata[fac1 %in% data$fac1 & cont %in% data$cont, ded_hat := predict(reg, newdata[fac1 %in% data$fac1 & cont %in% data$cont, ])]

head(newdata)
         ded cont fac1 ded_hat
1: 0.1820960  200    G      NA
2: 0.2561465  300    H      NA
3: 0.8711127  300    I      NA
4: 0.9839877  200    J      NA
5: 0.8802973  200    K      NA
6: 0.1385486  200    L      NA

What is going on here? What do I need to change to get predict to work for biglm?

gannawag · Accepted Answer

Here is the work around I ended up using:

Be sure the new data has the all the levels of the old data
Use the dummies package to expand into dummies for each level manually

Run the regression and make the prediction

require(data.table)
require(dummies)

#create a data set with factor variables and one continuous numerical variable 
LETTERS702 <- c(LETTERS, sapply(LETTERS, function(x) paste0(x, LETTERS)))
data = data.table(ded = runif(100), cont = runif(100), fac1 = as.factor(LETTERS702[1:10]))
data[, cont := as.factor(ifelse(cont < .25, 100, 
              ifelse(cont < .5, 200, 
                     ifelse( cont < .75, 300,
                             cont))))]

newdata = data.table(ded = runif(50), cont = runif(50), fac1 = as.factor(LETTERS702[7:12]))
newdata[, cont := as.factor(ifelse(cont < .25, 100, 
                 ifelse(cont < .5, 200, 
                        ifelse( cont < .75, 300,
                                cont))))]

#add the levels from the old data to the new data
levels(newdata$fac1) = c(levels(newdata$fac1),levels(data$fac1))
levels(newdata$cont) = c(levels(newdata$cont),levels(data$cont))

#generate a dummy for each fac1
data = as.data.table(dummy.data.frame(data, names = c("fac1","cont")))
newdata = as.data.table(dummy.data.frame(newdata, names = c("fac1","cont"), drop = F))

#a list of variable names in both data sets
datanames = (names(data))
newdatanames = (names(newdata))
varnames = subset(newdatanames, newdatanames %in% datanames)

#regression
reg = lm(ded ~ ., data = data)

#prediction
newdata[, ded_hat := predict(reg, newdata)]

Predict with biglm factors

Answers (2)

Related Questions