I'm having issues predicting with lda. I want to cross-validate my set, so I split my data into a training set (80%) and a testing set (20%) 5 times. This gives me two data frames of different lengths. I can fit the lda on the training set with no problem, but when I call predict I don't get predictions for the new data; it automatically returns the training set predictions instead. Any help for an R newbie?
df.test=structure(list(DEV.rabbit.Bi = c(0L, 1L, 1L, 0L, 0L, 0L, 1L,0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L,0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L), cytoP = c(0,0, 0, 0, 0, 0, 0, -2.1260048, 0, 0, 0, 2.83428136, 0, 0, NA,0, -2.33067135, -3.2528685, 0, 0, -3.9118235, 0, -2.12893162,0, -2.135834975, -3.38015, 0, 2.86341288, 0, -2.4050405, 0, -2.38829672,0, -2.24985834, 0, -2.2202064, -2.15253385, -2.2366473, -2.96851445,0, -0.743292433, 0, 0, 0, -2.61448215, 0, 0, 0, 0, -2.9443965,0, 0), GIP = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0.88683115, 0, 0, 0, 4.31335206, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4.900614, 0, 1.4537355, 0,6.168443, 3.872625, 3.1133642, 0, 2.3501405), neuroP = c(0, 0,2.0428646, 0, 0, 0, 0, 0, 0, 0, 5.165785, 0, 0, 0, NA, 0, 0,0, 0, 2.5078381, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.317407, 0, 0, 0,0, 0, 0, 0, 1.9766362, 0, 0, 0, 0, 4.6628686, 0, 0, 0, 4.6432279,4.586727, 0, 0, 0, 7.039145), ProlifP = c(0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, NA, 0, 3.562687467, 0, 0, 0, 0, 0, 0, 0,-2.12833253, 4.947180667, -2.04286463, 0, 0, 0, -2.562395, 0,0, 0, -2.346905, 0, 0, 0, 0, 0, 0, 2.005820067, -3.0411488, -1.885536,-3.2384957, 0, 0, 0, 0, 5.6344196, 0, -4.767982), reproP = c(0.018018017,0.418918933, 0.040540533, 0.018018017, 0.454954967, 0, 0, 0.049549533,0, 0, 0, 0.3963964, 0.058558567, 0.040540533, NA, 0.054054067,0.441441433, 0, 0, 0.040540533, 0.063063067, 0, 0.35135135, 0.058558567,0.018018017, 0, 0.027027027, 0.040540533, 0.1036036, 0.4, 0.2,0.018018017, 0.130630633, 0.018018017, 0.1, 0.054054067, 0.031531533,0.081081067, 0.1036036, 0.040540533, 0.0900901, 0.369369367,0.036036033, -1.1009885, -0.673395133, NA, 0.045045033, 0, 0,0.1036036, -0.984343, 0)), .Names = c("DEV.rabbit.Bi", "cytoP","GIP", "neuroP", "ProlifP", "reproP"), row.names = c(12L, 23L,24L, 27L, 38L, 56L, 59L, 61L, 63L, 65L, 71L, 81L, 128L, 131L,141L, 154L, 163L, 168L, 170L, 184L, 186L, 205L, 210L, 217L, 233L,236L, 253L, 268L, 276L, 293L, 302L, 303L, 312L, 314L, 322L, 326L,335L, 339L, 343L, 361L, 377L, 385L, 392L, 394L, 399L, 402L, 418L,419L, 422L, 427L, 438L, 453L), class = "data.frame")
df.train= structure(list(DEV.rabbit.Bi = c(0L, 1L, 0L, 0L, 0L, 1L, 1L,1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L,0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L,0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L,1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L,1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L,0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L,0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L,0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L,0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L,0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L,1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L), cytoP = c(0,NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,NA, 0, 0, 0, -2.648429, 0, 0, -2.1260048, 0, 0, 0, 0, 0, 0, 0,2.83428136, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, -3.126005, 0, 0,0, 7.0318728, 0, 0, 0, 0, 0, 0, 0, NA, NA, 0, 0, 3.024976, 0,0, 0, -2.33067135, 0, 0, NA, 0, 0, -3.3048862, 3.2453672, 0,NA, 0, -3.9118235, NA, 0, 0, 0, 0, 0, -3.3074869, 0, 0, 0, 0,0, NA, 0, 0, 0, -3.64705195, 0, 0, -2.6801575, 0, -2.32687549,0, 0, -3.38015, 0, 0, NA, 0, -2.4122793, 0, 0, 0, 0, 0, 0, -2.434712735,2.86341288, 0, 0, 0, 0, 0, 0, 0, 0, -3.73306513, 0, 0, 0, 0,0, -2.38829672, 0, 0, 0, -0.823873667, 0, 0, 0, -2.24985834,0, 0, 0, 0, 0, -2.2202064, 0, -2.34696895, NA, NA, 0, -2.15253385,-2.1856675, -2.2366473, 2.017460955, -2.96851445, 0, 0, 0, -3.0842214,0, -3.50124325, -5.794065, 0, NA, 0, -3.1539793, -2.5736979,0, 0, -2.3865695, 0, -2.710736745, 0, -0.743292433, 0, 2.373366367,0, -2.75693455, NA, NA, -2.61448215, NA, 0, 0, 0, -2.2124975,0, 0, 0, 0, 0, 0, 0, 0, -3.053354, NA, 5.428529647, -2.9443965,-3.8878643, -2.2083998, 0, 0, 0, NA, 0, NA, -2.13583495, 0, 0,0), GIP = c(0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,5.820918, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 3.73598124, 0, 0,4.588133, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0566821, 0,0, 0, 0, 0, 4.31335206, 0, 0, 0, 0, 0, 8.6651012, 0, 2.55087375,0, 0, 0, 0, 0, 0, 0, 0, 3.068526045, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 6.3948068, 0, 0,0, 0, 0, 0, 0, 0, 0, 3.3290915, 3.205779325, 0, 0, 0, 0, 0, 0,0, 0, 0, 1.01417725, 0, 0, 1.35015685, 0, 0, NA, 1.290875, 0,NA, 1.4537355, 0, 0, 0, 3.1133642, 0, 0, 0, 6.168443, 0, 6.26968469,3.872625, 0, 3.890076867, 0, 3.1133642, 2.250768067, 0, 0.97301535,4.8966569, 0, 8.487644, 0, 3.798781, 3.253654875, 4.960366, 0,2.3501405), neuroP = c(0, NA, NA, 0, 0, 0, 0, 0, 2.0428646, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11.03703, NA, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 5.165785, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA,0, 0, 0, 0, 0, 3.583922, 0, 0, 0, 0, 0, 0, 2.0009107, 0, NA,NA, 0, 0, 0, 0, 0, 2.55936099, 0, 0, 0, NA, 0, 0, 0, 0, 0, NA,2.5078381, 0, NA, 0, 3.872625, 0, 0, 0, 0, 0, 0, 3.97424399,0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.5064081, NA,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.16196, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 1.929947, 0, 2.000911, 0, 0, 0, 0, 0, 0, 0,0, 0, 2.247053, 0, 0, 0, NA, NA, 0, 0, 0, 1.9766362, 2.126448,0, 0, 0, 0, 4.130221, 0, 0, NA, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0,2.599616, 0, 0, 0, 0, 0, NA, NA, 0, NA, 0, 0, 
0, 0, 3.0913634,0, 0, 4.6432279, 4.586727, 0, 1.58651903, 0, 2.6652475, NA, 0,0, 0, 3.5208109, 4.2195317, 0, 0, NA, 10.5157265, NA, 0, 0, 2.8920614,7.039145), ProlifP = c(0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, -1.945246, 0, 0, 0, 0, NA, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, NA, 0, -11.05227, 0, 0, 0,0, 3.562687467, 0, 0, NA, 0, 0, 0, -2.02585, 3.887923007, NA,0, 0, NA, 0, 0, 0, 0, 0, 3.7865502, 0, 0, 0, 0, 0, NA, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 4.947180667, 0, 0, NA, 0, 0, 0, 0,-2.04286463, -2.0343177, 0, 0, 5.591507567, 0, -2.0868461, 0,0, 0, 0, 0, 0, 5.151728643, 4.936735813, 0, 0, 0, 0, -2.562395,0, -2.009148, -7.564251, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2.346905,3.207918667, 0, 0, -2.9254072, NA, NA, 0, 0, -2.5948795, 0, -2.060203,0, -4.14739583, -2.8027302, -4.487039, 0, 0, 0, NA, -2.8964375,5.003374, -2.263317, 0, 3.609647733, -2.6806902, 0, 3.505242133,0, 3.120921753, -3.445611, 0, 0, 5.147579867, 0, 0, NA, NA, -3.2384957,NA, 0, 0, 0, -2.798781, -1.6022584, 0, 0, 0, 0, 0, 0, 0, 4.713909533,4.4782686, -5.831885, 5.6344196, -6.8794451, 4.888960867, -3.1387679,-5.5994579, 0, NA, 0, NA, 0, 0, -4.6923589, -4.767982), reproP = c(0,0.58783785, 0.1486486, 0, 0.018018017, 0, 0.063063067, 0.418918933,0.040540533, 0, 0.4864865, 0.018018017, 0.0855856, 0.018018017,0.3963964, 0, 0, 0.0990991, 0.333333333, 0, 0, 0, 0, 0, 0, 0.076576567,0, 0.081081067, 0.049549533, 0.3873874, 0, 0, 0, 0.15, 0.06756755,0.0617284, 0.3963964, 0.383333333, 0.018018017, 0, 0.15, 0.031531533,0.3918919, 0.058558567, 0.0810811, 0, 0, 0.067567567, 0, 0, 0,0, 0, 0.06756755, 0, 0.516666667, NA, 0.058558567, 0.1621622,0.2567568, NA, NA, 0.1419753, 0, 0, 0.054054067, 0.040540533,0.018018017, 0.441441433, 0.031531533, 0, 0, 0, 0.1126126, 0.072072067,0, 0.35802469, 0.0472973, 0.040540533, 0.063063067, 0.16216215,0.083333333, 0.333333333, 0.018018017, 0.024691357, 0.0945946,0.0945946, 0.045045033, 0, 0.037037035, 0, 0.081081067, 0, 0.135135133,0.058558567, 0.081081067, 0.031531533, 0, 0.013513513, 0.063063067,0.333333333, 0.35802469, 0.1081081, 0.040540533, 0, 0.018018017,0.081081067, 0.075, 0.045045033, 0.067567567, 0.040540533, 0.031531533,0.027027027, 0.031531533, 0.036036033, 0.45, 0.018018017, 0.040540533,-0.7265556, 0.031531533, 0.4144144, 0.10185185, 0.067567567,0, 0.040540533, 0.018018017, 0.027027025, 0.0990991, 0.1036036,0.027027025, 0.054054067, 0.2, 0.018018017, 0, 0, 0.033333333,0, 0.031531533, 0.378378367, 0.130630633, 0.018018017, 0.1, 0,0, 0.1, 0, 0.054054067, 0.459459467, 0.031531533, 0.075, 0.5,0.364864867, 0.031531533, 0.06756755, 0.081081067, 0.6418919,0.1036036, 0.35135135, 0.054054067, -0.931616333, 0.3918919,0, 0.0855856, 0.1081081, 0.373873867, NA, 0.333333333, 0.0990991,-1.345913467, 0.040540533, 0.018018017, 0.081081067, 0.3963964,0.018018017, 0, 0.0900901, 0.2027027, 0.031531533, 0.3963964,0.364864867, 0.0743243, 0, -0.673395133, 0.06756755, NA, -0.316663167,0.031531533, 0, 0.031531533, 0.3873874, 0.0608108, 0.045045033,0, -1.004574, 0.018018017, 0, 0.4144144, 0.55405405, 0, 0.1036036,-1.646125933, -1.5806603, -0.9572768, -0.818359433, -0.984343,0.2, -4.2037963, 0, -1.2499105, 0.4, 0.0608108, 0)), .Names = c("DEV.rabbit.Bi","cytoP", "GIP", "neuroP", "ProlifP", "reproP"), row.names = c(2L,4L, 6L, 11L, 12L, 13L, 15L, 23L, 24L, 25L, 26L, 27L, 28L, 29L,30L, 34L, 35L, 39L, 40L, 43L, 44L, 48L, 55L, 56L, 57L, 58L, 59L,60L, 61L, 62L, 63L, 65L, 71L, 72L, 75L, 79L, 81L, 84L, 85L, 86L,87L, 91L, 92L, 93L, 94L, 97L, 100L, 101L, 102L, 
105L, 112L, 115L,118L, 119L, 120L, 121L, 126L, 128L, 129L, 132L, 136L, 141L, 144L,148L, 151L, 154L, 155L, 156L, 163L, 164L, 166L, 169L, 170L, 178L,179L, 180L, 181L, 183L, 184L, 186L, 188L, 190L, 191L, 193L, 194L,198L, 199L, 200L, 201L, 202L, 205L, 206L, 212L, 215L, 217L, 222L,223L, 224L, 228L, 229L, 230L, 231L, 232L, 235L, 236L, 238L, 239L,244L, 248L, 249L, 250L, 252L, 253L, 257L, 262L, 263L, 265L, 268L,271L, 272L, 275L, 279L, 282L, 285L, 286L, 287L, 289L, 290L, 291L,294L, 301L, 302L, 303L, 304L, 305L, 307L, 309L, 310L, 311L, 312L,314L, 315L, 317L, 319L, 322L, 323L, 326L, 327L, 329L, 331L, 333L,334L, 335L, 338L, 339L, 342L, 343L, 344L, 346L, 349L, 350L, 352L,353L, 354L, 356L, 359L, 360L, 363L, 365L, 366L, 368L, 370L, 371L,374L, 376L, 377L, 380L, 381L, 384L, 387L, 393L, 395L, 399L, 400L,402L, 403L, 408L, 409L, 414L, 415L, 417L, 418L, 419L, 420L, 421L,422L, 424L, 425L, 426L, 427L, 428L, 429L, 434L, 437L, 438L, 441L,442L, 443L, 444L, 448L, 451L, 453L), class = "data.frame")
lda.train= lda(df.train$DEV.rabbitBi~ df.train[,c(2)] +df.train[,c(3)]+df.train[,c(4)]+df.train[,c(5)]+df.train[,c(6)], data=df.train)
lda.pred= predict(lda.train, newdata=df.test)$class
The call to predict does not work because the variable names stored in the lda model output do not match the variable names in the newdata / test data (see the description of the newdata argument in ?predict.lda: "... a data frame with columns of the same names as the variables used"). Since they don't match, the predict method ignores the newdata argument and predicts on the original data instead.
You have used the same variables in the model and in the predict call, but it matters how they are passed to the lda model.
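Applied to your data, a minimal sketch of the fix (assuming MASS is loaded and that the intended response is the DEV.rabbit.Bi column shown in your dput output) is to refer to the columns by name in the formula and supply the data through the data argument:
library(MASS)
## Use plain column names in the formula so the fitted model stores the
## same variable names that also appear in df.test
lda.train <- lda(DEV.rabbit.Bi ~ cytoP + GIP + neuroP + ProlifP + reproP,
                 data = df.train)
## df.test has columns with matching names, so newdata is now used;
## rows with missing predictor values are dropped here to avoid NA
## posteriors -- adjust the NA handling as needed
lda.pred <- predict(lda.train, newdata = na.omit(df.test))$class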
Using the example from ?lda
library(MASS)
Iris <- data.frame(rbind(iris3[,,1], iris3[,,2], iris3[,,3]),
                   Sp = rep(c("s","c","v"), rep(50,3)))
train <- sample(1:150, 75)
names(Iris)
#[1] "Sepal.L." "Sepal.W." "Petal.L." "Petal.W." "Sp"
z <- lda(Sp ~ . , Iris, prior = c(1,1,1)/3, subset = train)
z
#...
#...
# Coefficients of linear discriminants:
# LD1 LD2
# Sepal.L. 0.5863648 0.7580133
# Sepal.W. 2.0444073 -2.5613102
# Petal.L. -1.8827963 0.9356446
# Petal.W. -3.5895106 -3.4927051
#...
#...
The variable names in the model match those found in the data, and they also match those in the newdata, so predict will work as intended:
pred <- predict(z, Iris[-train, ])
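As a quick follow-up (a hypothetical sanity check, not part of the help page example), the held-out predictions can be compared with the true species labels:
## Confusion table of true vs. predicted species for the held-out rows
table(true = Iris$Sp[-train], predicted = pred$class)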
When you pass the variables differently, you can run into problems:
z <- lda(Sp ~ Iris[,1] + Iris[,2] + Iris[,3] + Iris[,4] , Iris, prior = c(1,1,1)/3, subset = train)
z
# Coefficients of linear discriminants:
# LD1 LD2
# Iris[, 1] 0.5863648 0.7580133
# Iris[, 2] 2.0444073 -2.5613102
# Iris[, 3] -1.8827963 0.9356446
# Iris[, 4] -3.5895106 -3.4927051
Notice the coefficient names. Now when you use the predict function it expects variables called Iris[, 1], Iris[, 2], etc. to be present in newdata, which is not the case, so predict will ignore the newdata.
As for the ~ . form: this is just a quick way to include all variables from the data (given by data=) in the model, excluding the one on the left-hand side of the formula (lhs ~ rhs). Note that you can use the data argument to select only certain columns, as in the sketch below.
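For example, a hypothetical call that restricts the predictors to the two sepal measurements by subsetting the data frame passed to lda:
## Only Sepal.L. and Sepal.W. (plus the response Sp) are passed in,
## so ~ . picks up just those two predictors
z2 <- lda(Sp ~ ., Iris[, c("Sepal.L.", "Sepal.W.", "Sp")],
          prior = c(1,1,1)/3, subset = train)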
So the first lda function call above is equivalent to writing all the names out manually:
z <- lda(Sp ~ Sepal.L. + Sepal.W. + Petal.L. + Petal.W. ,
Iris, prior = c(1,1,1)/3, subset = train)
Note that it is common to most (if not all) predict methods that the names in any new data need to match those used in the model.
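As a hypothetical illustration of the same principle with lm (not part of the original answer):
## Fit on the training rows using column names, then predict on the rest;
## this works because newdata contains columns with the same names
fit <- lm(Sepal.L. ~ Sepal.W., data = Iris[train, ])
head(predict(fit, newdata = Iris[-train, ]))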