Reputation: 339
I keep running into a stubborn error trying to use the glmnet stats package as indicated below.
I've tried the limited suggestions listed here (including converting the data with data.matrix). I've also attempted to use the "penalty.factor" setting described in ?glmnet, with no positive results.
# Minimal reproduction of the glmnet error: a 50-row sample data frame with six
# predictor columns and an `outcome` column.
# Note: `emp_length` contains NA entries, and `outcome` is 0 in every row.
df = structure(list(term = c(0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), int_rate = c(10.65, 15.27, 15.96, 13.49, 12.69, 7.9, 15.96, 18.64, 21.28, 12.69, 14.65, 12.69, 13.49, 9.91, 10.65, 16.29, 15.27, 6.03, 11.71, 6.03, 15.27, 12.42, 11.71, 11.71, 11.71, 9.91, 16.77, 11.71, 11.71, 7.51, 7.9, 15.96, 8.9, 15.96, 10.65, 9.91, 7.9, 12.42, 12.69, 7.51, 7.9, 18.25, 16.77, 6.03, 9.91, 8.9, 10.65, 6.03, 6.62, 9.91), emp_length = c(NA, 1, NA, NA, 1, 3, 8, 9, 4, 1, 5, NA, 1, 3, 3, 1, 4, NA, 1, 6, 3, NA, NA, 5, 1, 2, 2, NA, 1, 7, 5, 2, 2, 7, NA, 2, 1, 1, 1, 4, NA, 9, NA, NA, 6, NA, 6, NA, 5, 8), annual_inc = c(24000, 30000, 12252, 49200, 80000, 36000, 47004, 48000, 40000, 15000, 72000, 75000, 30000, 15000, 1e+05, 28000, 42000, 110000, 84000, 77385.19, 43370, 105000, 50000, 50000, 76000, 92000, 50004, 106000, 25000, 17108, 75000, 29120, 24044, 34000, 41000, 55596, 45000, 36852, 27000, 68004, 62300, 65000, 55000, 45600, 0000, 1e+05, 27000, 60000, 70000, 80000), delinq_2yrs = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 3L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), inq_last_6mths = c(1L, 5L, 2L, 1L, 0L, 3L, 1L, 2L, 2L, 0L, 2L, 0L, 1L, 2L, 2L, 1L, 2L, 0L, 0L, 0L, 3L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 2L, 0L, 0L, 1L, 3L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 2L), outcome = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("term", "int_rate", "emp_length", "annual_inc", "delinq_2yrs", "inq_last_6mths", "outcome"), row.names = c(NA, 50L), class = "data.frame")
# Predictors: every column except `outcome`.
# NOTE(review): select() is not base R and no package providing it is loaded in
# this snippet -- presumably dplyr::select(); confirm.
X = select(df, -outcome)
# Response: the `outcome` column as a plain vector.
Y = df$outcome
# Rows 1..50 (all rows of this sample), converted to matrices.
X_train = as.matrix(X[1:50,])
Y_train = as.matrix(Y[1:50])
library(glmnet)
# Binomial fit; this is the call that raises the error quoted below.
# NOTE(review): likely tied to the NA predictors and/or the single-valued
# response noted above -- confirm.
model = glmnet(X_train, Y_train, family = "binomial")
summary(model)
Here's the error:
Error in drop(y %*% rep(1, nc)) :
error in evaluating the argument 'x' in selecting a method for
function 'drop': Error in y %*% rep(1, nc) : non-conformable arguments
The actual dataset has 110 variables and roughly 1 million observations, but the partial dataset above produces the same issue.
Any suggestions on debugging approaches for this?
Upvotes: 1
Views: 10068
Reputation: 7188
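The issue is that you have NA entries in the emp_length column, which you cannot include in the dataset that you pass to glmnet. You either need to drop rows that contain NA values from your dataset, or impute them. Note also that outcome is 0 for every row of the posted sample, so a binomial model has only one class to fit; the code below therefore substitutes a random 0/1 response purely so that the example runs.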
Here is some code that you can use to drop those rows and get glmnet to work properly.
# Fit a binomial glmnet model after dropping rows with missing predictors.
# Expects the data frame `df` from the question to be defined already.
library(glmnet)

# Predictors: every column except `outcome`. Base-R subsetting is used so the
# snippet does not depend on a select() that no loaded package provides.
X <- df[, names(df) != "outcome", drop = FALSE]
Y <- df$outcome  # real labels; all 0 in the posted sample, so unusable as-is

# Use at most the first 50 rows; seq_len() avoids NA rows on shorter data.
train_rows <- seq_len(min(50L, nrow(X)))
X_train <- as.matrix(X[train_rows, , drop = FALSE])

# Placeholder response: the sample's `outcome` has a single class, so a random
# 0/1 vector stands in for it. Seeded so the example is reproducible. Replace
# with Y[train_rows] once the real outcome contains both classes.
set.seed(1)
Y_train <- sample(0:1, length(train_rows), replace = TRUE)

# glmnet does not accept NA in x: flag rows with any missing predictor and
# drop them from both the predictors and the response.
has_NA <- !complete.cases(X_train)
X_train <- X_train[!has_NA, , drop = FALSE]
Y_train <- Y_train[!has_NA]

model <- glmnet(X_train, Y_train, family = "binomial")
Upvotes: 1