Peter Karoway
Peter Karoway

Reputation: 63

Predicting with LDA: testing set has a different length than the training set

I'm having issues predicting with lda. I want to cross-validate my set, so I split my data into a training set (80%) and a testing set (20%) 5 times. This gives me two data frames of different lengths. I can fit the lda on the training set with no problem, but when I predict I don't get the newdata predictions; it automatically returns the training set predictions instead. Any help for an R newbie?

 df.test=structure(list(DEV.rabbit.Bi = c(0L, 1L, 1L, 0L, 0L, 0L, 1L,0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L,0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L), cytoP = c(0,0, 0, 0, 0, 0, 0, -2.1260048, 0, 0, 0, 2.83428136, 0, 0, NA,0, -2.33067135, -3.2528685, 0, 0, -3.9118235, 0, -2.12893162,0, -2.135834975, -3.38015, 0, 2.86341288, 0, -2.4050405, 0, -2.38829672,0, -2.24985834, 0, -2.2202064, -2.15253385, -2.2366473, -2.96851445,0, -0.743292433, 0, 0, 0, -2.61448215, 0, 0, 0, 0, -2.9443965,0, 0), GIP = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0.88683115, 0, 0, 0, 4.31335206, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4.900614, 0, 1.4537355, 0,6.168443, 3.872625, 3.1133642, 0, 2.3501405), neuroP = c(0, 0,2.0428646, 0, 0, 0, 0, 0, 0, 0, 5.165785, 0, 0, 0, NA, 0, 0,0, 0, 2.5078381, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.317407, 0, 0, 0,0, 0, 0, 0, 1.9766362, 0, 0, 0, 0, 4.6628686, 0, 0, 0, 4.6432279,4.586727, 0, 0, 0, 7.039145), ProlifP = c(0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, NA, 0, 3.562687467, 0, 0, 0, 0, 0, 0, 0,-2.12833253, 4.947180667, -2.04286463, 0, 0, 0, -2.562395, 0,0, 0, -2.346905, 0, 0, 0, 0, 0, 0, 2.005820067, -3.0411488, -1.885536,-3.2384957, 0, 0, 0, 0, 5.6344196, 0, -4.767982), reproP = c(0.018018017,0.418918933, 0.040540533, 0.018018017, 0.454954967, 0, 0, 0.049549533,0, 0, 0, 0.3963964, 0.058558567, 0.040540533, NA, 0.054054067,0.441441433, 0, 0, 0.040540533, 0.063063067, 0, 0.35135135, 0.058558567,0.018018017, 0, 0.027027027, 0.040540533, 0.1036036, 0.4, 0.2,0.018018017, 0.130630633, 0.018018017, 0.1, 0.054054067, 0.031531533,0.081081067, 0.1036036, 0.040540533, 0.0900901, 0.369369367,0.036036033, -1.1009885, -0.673395133, NA, 0.045045033, 0, 0,0.1036036, -0.984343, 0)), .Names = c("DEV.rabbit.Bi", "cytoP","GIP", "neuroP", "ProlifP", "reproP"), row.names = c(12L, 23L,24L, 27L, 38L, 56L, 59L, 61L, 63L, 65L, 71L, 81L, 128L, 131L,141L, 154L, 
163L, 168L, 170L, 184L, 186L, 205L, 210L, 217L, 233L,236L, 253L, 268L, 276L, 293L, 302L, 303L, 312L, 314L, 322L, 326L,335L, 339L, 343L, 361L, 377L, 385L, 392L, 394L, 399L, 402L, 418L,419L, 422L, 427L, 438L, 453L), class = "data.frame")
df.train= structure(list(DEV.rabbit.Bi = c(0L, 1L, 0L, 0L, 0L, 1L, 1L,1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L,0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L,0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L,1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L,1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L,0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L,0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L,0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L,0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L,0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L,1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L), cytoP = c(0,NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,NA, 0, 0, 0, -2.648429, 0, 0, -2.1260048, 0, 0, 0, 0, 0, 0, 0,2.83428136, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, -3.126005, 0, 0,0, 7.0318728, 0, 0, 0, 0, 0, 0, 0, NA, NA, 0, 0, 3.024976, 0,0, 0, -2.33067135, 0, 0, NA, 0, 0, -3.3048862, 3.2453672, 0,NA, 0, -3.9118235, NA, 0, 0, 0, 0, 0, -3.3074869, 0, 0, 0, 0,0, NA, 0, 0, 0, -3.64705195, 0, 0, -2.6801575, 0, -2.32687549,0, 0, -3.38015, 0, 0, NA, 0, -2.4122793, 0, 0, 0, 0, 0, 0, -2.434712735,2.86341288, 0, 0, 0, 0, 0, 0, 0, 0, -3.73306513, 0, 0, 0, 0,0, -2.38829672, 0, 0, 0, -0.823873667, 0, 0, 0, -2.24985834,0, 0, 0, 0, 0, -2.2202064, 0, -2.34696895, NA, NA, 0, -2.15253385,-2.1856675, -2.2366473, 2.017460955, -2.96851445, 0, 0, 0, -3.0842214,0, -3.50124325, -5.794065, 0, NA, 0, -3.1539793, -2.5736979,0, 0, -2.3865695, 0, -2.710736745, 0, -0.743292433, 0, 2.373366367,0, -2.75693455, NA, NA, -2.61448215, NA, 0, 0, 0, -2.2124975,0, 0, 0, 0, 0, 0, 0, 0, -3.053354, NA, 5.428529647, -2.9443965,-3.8878643, -2.2083998, 0, 0, 0, NA, 0, NA, -2.13583495, 0, 0,0), GIP = c(0, 0, NA, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,5.820918, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 3.73598124, 0, 0,4.588133, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0566821, 0,0, 0, 0, 0, 4.31335206, 0, 0, 0, 0, 0, 8.6651012, 0, 2.55087375,0, 0, 0, 0, 0, 0, 0, 0, 3.068526045, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 6.3948068, 0, 0,0, 0, 0, 0, 0, 0, 0, 3.3290915, 3.205779325, 0, 0, 0, 0, 0, 0,0, 0, 0, 1.01417725, 0, 0, 1.35015685, 0, 0, NA, 1.290875, 0,NA, 1.4537355, 0, 0, 0, 3.1133642, 0, 0, 0, 6.168443, 0, 6.26968469,3.872625, 0, 3.890076867, 0, 3.1133642, 2.250768067, 0, 0.97301535,4.8966569, 0, 8.487644, 0, 3.798781, 3.253654875, 4.960366, 0,2.3501405), neuroP = c(0, NA, NA, 0, 0, 0, 0, 0, 2.0428646, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11.03703, NA, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 5.165785, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA,0, 0, 0, 0, 0, 3.583922, 0, 0, 0, 0, 0, 0, 2.0009107, 0, NA,NA, 0, 0, 0, 0, 0, 2.55936099, 0, 0, 0, NA, 0, 0, 0, 0, 0, NA,2.5078381, 0, NA, 0, 3.872625, 0, 0, 0, 0, 0, 0, 3.97424399,0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.5064081, NA,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.16196, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 1.929947, 0, 2.000911, 0, 0, 0, 0, 0, 0, 0,0, 0, 2.247053, 0, 0, 0, NA, NA, 0, 0, 0, 1.9766362, 2.126448,0, 0, 0, 0, 4.130221, 0, 0, NA, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0,2.599616, 0, 0, 0, 0, 0, NA, NA, 0, NA, 0, 0, 0, 0, 3.0913634,0, 0, 4.6432279, 4.586727, 0, 1.58651903, 0, 2.6652475, NA, 0,0, 0, 3.5208109, 4.2195317, 0, 0, NA, 10.5157265, NA, 0, 0, 2.8920614,7.039145), ProlifP = c(0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, -1.945246, 0, 0, 0, 0, NA, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, NA, 0, -11.05227, 0, 0, 0,0, 3.562687467, 0, 0, NA, 0, 0, 0, -2.02585, 
3.887923007, NA,0, 0, NA, 0, 0, 0, 0, 0, 3.7865502, 0, 0, 0, 0, 0, NA, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 4.947180667, 0, 0, NA, 0, 0, 0, 0,-2.04286463, -2.0343177, 0, 0, 5.591507567, 0, -2.0868461, 0,0, 0, 0, 0, 0, 5.151728643, 4.936735813, 0, 0, 0, 0, -2.562395,0, -2.009148, -7.564251, 0, 0, 0, 0, 0, 0, 0, 0, 0, -2.346905,3.207918667, 0, 0, -2.9254072, NA, NA, 0, 0, -2.5948795, 0, -2.060203,0, -4.14739583, -2.8027302, -4.487039, 0, 0, 0, NA, -2.8964375,5.003374, -2.263317, 0, 3.609647733, -2.6806902, 0, 3.505242133,0, 3.120921753, -3.445611, 0, 0, 5.147579867, 0, 0, NA, NA, -3.2384957,NA, 0, 0, 0, -2.798781, -1.6022584, 0, 0, 0, 0, 0, 0, 0, 4.713909533,4.4782686, -5.831885, 5.6344196, -6.8794451, 4.888960867, -3.1387679,-5.5994579, 0, NA, 0, NA, 0, 0, -4.6923589, -4.767982), reproP = c(0,0.58783785, 0.1486486, 0, 0.018018017, 0, 0.063063067, 0.418918933,0.040540533, 0, 0.4864865, 0.018018017, 0.0855856, 0.018018017,0.3963964, 0, 0, 0.0990991, 0.333333333, 0, 0, 0, 0, 0, 0, 0.076576567,0, 0.081081067, 0.049549533, 0.3873874, 0, 0, 0, 0.15, 0.06756755,0.0617284, 0.3963964, 0.383333333, 0.018018017, 0, 0.15, 0.031531533,0.3918919, 0.058558567, 0.0810811, 0, 0, 0.067567567, 0, 0, 0,0, 0, 0.06756755, 0, 0.516666667, NA, 0.058558567, 0.1621622,0.2567568, NA, NA, 0.1419753, 0, 0, 0.054054067, 0.040540533,0.018018017, 0.441441433, 0.031531533, 0, 0, 0, 0.1126126, 0.072072067,0, 0.35802469, 0.0472973, 0.040540533, 0.063063067, 0.16216215,0.083333333, 0.333333333, 0.018018017, 0.024691357, 0.0945946,0.0945946, 0.045045033, 0, 0.037037035, 0, 0.081081067, 0, 0.135135133,0.058558567, 0.081081067, 0.031531533, 0, 0.013513513, 0.063063067,0.333333333, 0.35802469, 0.1081081, 0.040540533, 0, 0.018018017,0.081081067, 0.075, 0.045045033, 0.067567567, 0.040540533, 0.031531533,0.027027027, 0.031531533, 0.036036033, 0.45, 0.018018017, 0.040540533,-0.7265556, 0.031531533, 0.4144144, 0.10185185, 0.067567567,0, 0.040540533, 0.018018017, 0.027027025, 0.0990991, 0.1036036,0.027027025, 
0.054054067, 0.2, 0.018018017, 0, 0, 0.033333333,0, 0.031531533, 0.378378367, 0.130630633, 0.018018017, 0.1, 0,0, 0.1, 0, 0.054054067, 0.459459467, 0.031531533, 0.075, 0.5,0.364864867, 0.031531533, 0.06756755, 0.081081067, 0.6418919,0.1036036, 0.35135135, 0.054054067, -0.931616333, 0.3918919,0, 0.0855856, 0.1081081, 0.373873867, NA, 0.333333333, 0.0990991,-1.345913467, 0.040540533, 0.018018017, 0.081081067, 0.3963964,0.018018017, 0, 0.0900901, 0.2027027, 0.031531533, 0.3963964,0.364864867, 0.0743243, 0, -0.673395133, 0.06756755, NA, -0.316663167,0.031531533, 0, 0.031531533, 0.3873874, 0.0608108, 0.045045033,0, -1.004574, 0.018018017, 0, 0.4144144, 0.55405405, 0, 0.1036036,-1.646125933, -1.5806603, -0.9572768, -0.818359433, -0.984343,0.2, -4.2037963, 0, -1.2499105, 0.4, 0.0608108, 0)), .Names = c("DEV.rabbit.Bi","cytoP", "GIP", "neuroP", "ProlifP", "reproP"), row.names = c(2L,4L, 6L, 11L, 12L, 13L, 15L, 23L, 24L, 25L, 26L, 27L, 28L, 29L,30L, 34L, 35L, 39L, 40L, 43L, 44L, 48L, 55L, 56L, 57L, 58L, 59L,60L, 61L, 62L, 63L, 65L, 71L, 72L, 75L, 79L, 81L, 84L, 85L, 86L,87L, 91L, 92L, 93L, 94L, 97L, 100L, 101L, 102L, 105L, 112L, 115L,118L, 119L, 120L, 121L, 126L, 128L, 129L, 132L, 136L, 141L, 144L,148L, 151L, 154L, 155L, 156L, 163L, 164L, 166L, 169L, 170L, 178L,179L, 180L, 181L, 183L, 184L, 186L, 188L, 190L, 191L, 193L, 194L,198L, 199L, 200L, 201L, 202L, 205L, 206L, 212L, 215L, 217L, 222L,223L, 224L, 228L, 229L, 230L, 231L, 232L, 235L, 236L, 238L, 239L,244L, 248L, 249L, 250L, 252L, 253L, 257L, 262L, 263L, 265L, 268L,271L, 272L, 275L, 279L, 282L, 285L, 286L, 287L, 289L, 290L, 291L,294L, 301L, 302L, 303L, 304L, 305L, 307L, 309L, 310L, 311L, 312L,314L, 315L, 317L, 319L, 322L, 323L, 326L, 327L, 329L, 331L, 333L,334L, 335L, 338L, 339L, 342L, 343L, 344L, 346L, 349L, 350L, 352L,353L, 354L, 356L, 359L, 360L, 363L, 365L, 366L, 368L, 370L, 371L,374L, 376L, 377L, 380L, 381L, 384L, 387L, 393L, 395L, 399L, 400L,402L, 403L, 408L, 409L, 414L, 415L, 417L, 418L, 419L, 420L, 421L,422L, 424L, 
425L, 426L, 427L, 428L, 429L, 434L, 437L, 438L, 441L,442L, 443L, 444L, 448L, 451L, 453L), class = "data.frame")

# Fit the LDA on the training frame. The response and all five predictors are
# referenced through `df.train` itself (`$` / `[, k]`) rather than by bare
# column name.
# NOTE(review): the data frame's column is named `DEV.rabbit.Bi`; the
# `DEV.rabbitBi` written below does not match it -- confirm which is intended.
lda.train <- lda(
  df.train$DEV.rabbitBi ~
    df.train[, c(2)] + df.train[, c(3)] + df.train[, c(4)] +
    df.train[, c(5)] + df.train[, c(6)],
  data = df.train
)

# Request class predictions for the held-out rows in `df.test`.
lda.pred <- predict(lda.train, newdata = df.test)$class

Upvotes: 2

Views: 764

Answers (1)

user20650
user20650

Reputation: 25844

The call to predict does not work because the variable names in the lda model output do not match the variable names in the newdata / test data (see the newdata argument description in ?predict.lda: "a data frame with columns of the same names as the variables used"). As they don't match, the predict method ignores the newdata argument and predicts on the original data. You have used the same variables in the model and in the predict call, but it matters how they are passed to the lda model.

Using the example from ?lda

library(MASS)

# Stack the three species slices of `iris3` into one data frame and add a
# species label column `Sp` ("s", "c", "v"; 50 rows each).
Iris <- data.frame(
  rbind(iris3[, , 1], iris3[, , 2], iris3[, , 3]),
  Sp = rep(c("s", "c", "v"), each = 50)
)

# Randomly pick 75 of the 150 row indices to use as the training subset.
train <- sample(seq_len(150), 75)

names(Iris)
#[1] "Sepal.L." "Sepal.W." "Petal.L." "Petal.W." "Sp"

# Fit the LDA on the training rows; `~ .` uses every column of `Iris` other
# than the response `Sp` as a predictor, referenced by column name.
z <- lda(Sp ~ ., data = Iris, prior = c(1, 1, 1) / 3, subset = train)
z
#...
#...
# Coefficients of linear discriminants:
#                 LD1        LD2
# Sepal.L.  0.5863648  0.7580133
# Sepal.W.  2.0444073 -2.5613102
# Petal.L. -1.8827963  0.9356446
# Petal.W. -3.5895106 -3.4927051
#...
#...

Variable names in the model match those found in the data, and match those in the newdata, so predict will work as intended

# Predict for the rows of `Iris` that were not part of the training subset.
pred <- predict(z, newdata = Iris[-train, ])

When you pass the variables differently you can run into problems.

# Refit the same model, but refer to each predictor by column position on
# `Iris` instead of by column name.
z <- lda(
  Sp ~ Iris[, 1] + Iris[, 2] + Iris[, 3] + Iris[, 4],
  Iris,
  prior = c(1, 1, 1) / 3,
  subset = train
)
z
# Coefficients of linear discriminants:
#                  LD1        LD2
# Iris[, 1]  0.5863648  0.7580133
# Iris[, 2]  2.0444073 -2.5613102
# Iris[, 3] -1.8827963  0.9356446
# Iris[, 4] -3.5895106 -3.4927051

Notice the coefficient names. Now when you use the predict function, it expects variables called Iris[, 1], ... etc. to be in newdata, which is not the case, so predict will ignore the newdata.

As for the ~ . form: this is just a quick way to include all variables from the data (given by data=), excluding those on the left-hand side of the formula (lhs ~ rhs), in the model. See here. Note, you can use the data argument to only select certain columns.

So the first lda function call above is equivalent to writing all the names out manually

# Same fit, with each predictor spelled out by its column name.
z <- lda(
  Sp ~ Sepal.L. + Sepal.W. + Petal.L. + Petal.W.,
  Iris,
  prior = c(1, 1, 1) / 3,
  subset = train
)

Note that it is common to most (all??) of the predict methods that the names in any new data need to match those from the model.

Upvotes: 2

Related Questions