Test data has new levels while doing a logit but doesn't give an error while predicting in C5

Question

I don't know how the two models handle factor levels, but the logit model won't predict and gives an error message about new factor levels, whereas predicting with C5 works fine. I created the train and test sets from a single data frame, and the levels in both match each other.

I am seeking an explanation of this behaviour and a solution for it. I understand that new levels in the test set cannot have coefficients calculated for them, but setting them to NULL (i.e. treating them as missing) should be okay, I think.

Here is a bit of the code. I used the loop below to match the factor levels of `hold` to those of `train`; `tr` is the data set that is split into train and test.

# Example data frame literal: 40 rows and 10 columns. `production_year` is an
# integer column; the other nine columns are factors, including the 0/1
# factor `Category1`. Row names are the original (non-consecutive) row ids.
tr=structure(
        list(
            production_year = c(
                2007L, 2010L, 2010L, 2008L,
                2007L, 2008L, 2008L, 2008L, 2007L, 2011L, 2009L, 2009L, 2009L,
                2008L, 2007L, 2007L, 2010L, 2009L, 2008L, 2008L, 2010L, 2010L,
                2007L, 2010L, 2009L, 2008L, 2007L, 2007L, 2008L, 2007L, 2010L,
                2011L, 2010L, 2007L, 2009L, 2009L, 2008L, 2008L, 2010L, 2011L
            ), movie_sequel = structure(
                c(
                    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
                    1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L,
                    1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
                ), .Label = c("0", "1"), class = "factor"
            ), creative_type = structure(
                c(
                    1L,
                    4L, 1L, 4L, 5L, 1L, 1L, 6L, 2L, 1L, 6L, 1L, 1L, 1L, 1L, 1L, 1L,
                    1L, 8L, 1L, 7L, 1L, 1L, 3L, 1L, 1L, 2L, 4L, 4L, 1L, 1L, 4L, 5L,
                    5L, 1L, 4L, 1L, 1L, 1L, 1L
                ), .Label = c(
                    "Contemporary Fiction",
                    "Dramatization", "Factual", "Fantasy", "Historical Fiction",
                    "Kids Fiction", "Science Fiction", "Super Hero"
                ), class = "factor"
            ),
            source = structure(
                c(
                    6L, 2L, 6L, 7L, 2L, 6L, 6L, 6L, 4L,
                    6L, 2L, 7L, 6L, 6L, 6L, 3L, 6L, 6L, 1L, 2L, 6L, 5L, 6L, 5L,
                    5L, 6L, 4L, 2L, 2L, 6L, 6L, 2L, 7L, 4L, 6L, 5L, 6L, 2L, 6L,
                    6L
                ), .Label = c(
                    "Based on Comic/Graphic Novel", "Based on Fiction Book/Short Story",
                    "Based on Folk Tale/Legend/Fairytale", "Based on Real Life Events",
                    "Based on TV", "Original Screenplay", "Remake"
                ), class = "factor"
            ),
            production_method = structure(
                c(
                    3L, 3L, 3L, 3L, 3L, 3L, 3L,
                    2L, 3L, 3L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L,
                    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
                    3L, 3L, 3L
                ), .Label = c(
                    "Animation/Live Action", "Digital Animation",
                    "Live Action", "Stop-Motion Animation"
                ), class = "factor"
            ),
            genre = structure(
                c(
                    3L, 1L, 4L, 5L, 1L, 4L, 3L, 3L, 4L, 5L,
                    2L, 7L, 6L, 5L, 7L, 3L, 3L, 7L, 1L, 7L, 7L, 3L, 4L, 3L, 3L,
                    6L, 4L, 2L, 1L, 2L, 6L, 4L, 7L, 1L, 4L, 2L, 3L, 7L, 7L, 5L
                ), .Label = c(
                    "Action", "Adventure", "Comedy", "Drama", "Horror",
                    "Romantic Comedy", "Thriller/Suspense"
                ), class = "factor"
            ),
            language = structure(
                c(
                    2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
                    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
                    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
                    2L
                ), .Label = c("Danish", "English"), class = "factor"
            ),
            movie_board_rating_display_name = structure(
                c(
                    3L, 3L, 3L,
                    2L, 2L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 3L, 3L, 2L, 3L, 3L,
                    3L, 3L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 1L, 2L, 3L, 2L, 2L, 3L,
                    2L, 3L, 1L, 2L, 3L, 3L, 2L
                ), .Label = c("PG", "PG-13", "R"), class = "factor"
            ), movie_release_pattern_display_name = structure(
                c(
                    4L,
                    4L, 3L, 4L, 4L, 3L, 4L, 4L, 3L, 4L, 4L, 4L, 4L, 4L, 3L, 4L,
                    3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L, 4L, 1L, 4L,
                    4L, 4L, 2L, 3L, 4L, 4L, 4L, 3L, 4L
                ), .Label = c("Exclusive",
                              "Expands Wide", "Limited", "Wide"), class = "factor"
            ), Category1 = structure(
                c(
                    1L,
                    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
                    1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
                    2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
                ), .Label = c("0", "1"), class = "factor"
            )
        ), .Names = c(
            "production_year",
            "movie_sequel", "creative_type", "source", "production_method",
            "genre", "language", "movie_board_rating_display_name", "movie_release_pattern_display_name",
            "Category1"
        ), row.names = c(
            506L, 474L, 1011L, 569L, 737L, 1124L,
            602L, 717L, 747L, 977L, 284L, 620L, 100L, 301L, 514L, 865L, 828L,
            283L, 921L, 839L, 15L, 937L, 931L, 201L, 273L, 507L, 1180L, 689L,
            276L, 649L, 603L, 22L, 555L, 974L, 552L, 500L, 216L, 312L, 796L,
            682L
        ), class = "data.frame"
    )

    # Split the example data: the first 25 rows are used for fitting and the
    # remaining 15 are held out for prediction.
    #
    # Row subsetting keeps every factor level of `tr`, including levels that no
    # longer occur in the subset. glm() ignores such unused levels when fitting,
    # and predict() then stops with "factor ... has new levels" when the held-out
    # rows contain them. Dropping the unused levels from `train` first makes
    # levels(train[[i]]) reflect what the model can actually estimate.
    train <- droplevels(tr[1:25, ])  # training data
    hold <- tr[26:40, ]              # test data

    # Re-code every factor in `hold` on the levels present in `train`. Values
    # that never occurred in the training rows become NA, so predict() returns
    # NA for those rows instead of raising an error.
    for (i in seq_along(train)) {
        if (is.factor(train[[i]])) {
            hold[[i]] <- factor(hold[[i]], levels = levels(train[[i]]))
        }
    }

    m.glm <- glm(Category1 ~ ., data = train, family = "binomial")

    # Keep the true labels aside and remove the response from the test data.
    labels <- hold$Category1
    hold$Category1 <- NULL

    # Predictions are on the link (log-odds) scale, the predict.glm default;
    # rows with a factor value unseen during fitting yield NA.
    p <- predict(m.glm, hold)

All the levels, shown via a single row of the full data set:

# Data-frame literal holding a single row (row name 304). Its factor columns
# list more levels than the 40-row `tr` sample carries -- presumably every
# level of the full data set (TODO confirm).
structure(list(production_year = 2011L, movie_sequel = structure(1L, .Label = c("0", 
"1"), class = "factor"), creative_type = structure(5L, .Label = c("Contemporary Fiction", 
"Dramatization", "Factual", "Fantasy", "Historical Fiction", 
"Kids Fiction", "Multiple Creative Types", "Science Fiction", 
"Super Hero"), class = "factor"), source = structure(14L, .Label = c("Based on Comic/Graphic Novel", 
"Based on Factual Book/Article", "Based on Fiction Book/Short Story", 
"Based on Folk Tale/Legend/Fairytale", "Based on Game", "Based on Musical or Opera", 
"Based on Play", "Based on Real Life Events", "Based on Short Film", 
"Based on Theme Park Ride", "Based on Toy", "Based on TV", "Compilation", 
"Original Screenplay", "Remake", "Spin-Off"), class = "factor"), 
    production_method = structure(4L, .Label = c("Animation/Live Action", 
    "Digital Animation", "Hand Animation", "Live Action", "Multiple Production Methods", 
    "Stop-Motion Animation"), class = "factor"), genre = structure(13L, .Label = c("Action", 
    "Adventure", "Black Comedy", "Comedy", "Concert/Performance", 
    "Documentary", "Drama", "Horror", "Multiple Genres", "Musical", 
    "Romantic Comedy", "Thriller/Suspense", "Western"), class = "factor"), 
    language = structure(3L, .Label = c("Arabic", "Danish", "English", 
    "Farsi", "French", "German", "Hebrew", "Hindi", "Italian", 
    "Japanese", "Norwegian", "Polish", "Portuguese", "Silent", 
    "Spanish", "Swedish"), class = "factor"), movie_board_rating_display_name = structure(6L, .Label = c("G", 
    "NC-17", "Not Rated", "PG", "PG-13", "R"), class = "factor"), 
    movie_release_pattern_display_name = structure(7L, .Label = c("Exclusive", 
    "Expands Wide", "IMAX", "Limited", "Oscar Qualifying Run", 
    "Special Engagement", "Wide"), class = "factor"), Category1 = structure(1L, .Label = c("0", 
    "1"), class = "factor")), .Names = c("production_year", "movie_sequel", 
"creative_type", "source", "production_method", "genre", "language", 
"movie_board_rating_display_name", "movie_release_pattern_display_name", 
"Category1"), row.names = 304L, class = "data.frame")

Roman Luštrik · Accepted Answer

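The way I see it, you will have to exclude the rows containing factor levels that were not used to fit the model. `glm()` drops factor levels that do not occur in the training rows, so `predict()` treats them as new even though `levels()` of the training data still lists them.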

# Predict only for the rows of `hold` whose factor values were all seen when
# the model was fitted. glm() records the levels it used for each factor
# predictor in `m.glm$xlevels`, so no level names need to be hard-coded.
seen <- Reduce(
  `&`,
  lapply(
    names(m.glm$xlevels),
    function(v) hold[[v]] %in% m.glm$xlevels[[v]]
  ),
  rep(TRUE, nrow(hold))  # start from "keep every row"
)
predict(m.glm, hold[seen, ])

Test data has new levels while doing a logit but doesn't give an error while predicting in C5

Answers (1)

Related Questions

Test data has new levels while doing a logit but doesn't give an error while predicting in C5

Answers (1)

Related Questions

Test data has new levels while doing a logit but doesn't give an error while predicting in C5