weizer

Reputation: 1127

How to do prediction with an SVM in R?

I have 2 datasets, train_val and test. I want to build 3 models and use them to predict the outcome. These are my 3 models:

#Model 1
rf.model <- randomForest(Survived ~ ., data = train_val, type = 'response')

#Model 2
svm.model.linear <- svm(Survived ~ ., data = train_val, kernel="linear", cost = 2, gamma = 0.1)

#Model 3
svm.model.radial <- svm(Survived ~ ., data = train_val, kernel="radial", cost = 10, gamma = 0.1)

After training the 3 models by using the code above, I use the following code to do the prediction:

prediction <- predict(rf.model, newdata = test)

The output for the prediction is:

[screenshot: prediction output from rf.model]

Then I put the outcome into a dataframe:

df <- data.frame(PassengerId = 892:1309, Survived = prediction)

The output for the df is:

[screenshot: the df data frame]

So far everything is working well. However, when I replace rf.model in the prediction with svm.model.linear, the output changes:

[screenshot: prediction output from svm.model.linear]

and hence the data frame also throws an error:

[screenshot: error from data.frame()]

May I know the reason for this, and how can I get the same output for df as when I was using rf.model? Any help will be greatly appreciated!

This is the full code from the post:

library(dplyr)
library(tidyr)
library(ggplot2)
library(Amelia)
library(corrgram)
library(caret)
library(randomForest)
library(e1071)


#Import train and test dataset
train <- read.csv("C:/Users/User/Desktop/Titanic/train.csv")
test <- read.csv("C:/Users/User/Desktop/Titanic/test.csv")

#Ensure dataset loaded correctly
head(train)
head(test)


train$set <- "train"
test$set <- "test"
test$Survived <- NA
full <- rbind(train,test)

summary(full)
str(full)


# Creating new training and testing data set
full <- dplyr::select(full,-PassengerId,-Ticket,-Cabin)

#Creating a new variable called Dependents
full$Dependents <- full$SibSp + full$Parch
head(full)


missing_values <- full %>% summarise(across(everything(), ~sum(is.na(.))/length(.)))
missing_values <- missing_values %>% pivot_longer(cols = everything(),names_to = "feature",values_to="missing_pct")

ggplot(missing_values,aes(x=reorder(feature,-missing_pct),y=missing_pct,label=missing_pct)) + geom_text(hjust = 0,aes(label=scales::percent(missing_pct))) +
  geom_col(fill='red') + coord_flip() + scale_y_continuous(labels=scales::percent)


#Overview of scatterplots between all variables
full$Sex <- factor(full$Sex)
full$Sex.factor <- as.numeric(full$Sex)
str(full)
full %>% filter(set == "train") %>% select(-Name,-Embarked,-set,-SibSp,-Parch) %>% corrgram(lower.panel = panel.shade,upper.panel = panel.pie)


full$Survived <- as.factor(full$Survived)
full$Pclass <- as.factor(full$Pclass)
full %>% filter(set == "train") %>% ggplot(aes(Sex)) + geom_bar(aes(fill=Survived))
full %>% filter(set == "train") %>% ggplot(aes(Pclass)) + geom_bar(aes(fill=Survived))
full %>% filter(set == "train") %>% ggplot(aes(Age)) + geom_histogram(bins=10,aes(fill=Survived))
full %>% filter(set == "train") %>% ggplot(aes(Dependents)) + geom_histogram(bins=10,aes(fill=Survived))


full %>% ggplot(aes(x=Pclass,y=Age)) + geom_boxplot(aes(fill=Pclass))
full %>% ggplot(aes(x=Sex,y=Age)) + geom_boxplot(aes(fill=Sex))


#Getting the median age based on Pclass
p1MedAge <- full %>% filter(Pclass == 1 & Age != "NA") %>% summarise(median(Age)) %>% as.numeric()
p2MedAge <- full %>% filter(Pclass == 2 & Age != "NA") %>% summarise(median(Age)) %>% as.numeric()
p3MedAge <- full %>% filter(Pclass == 3 & Age != "NA") %>% summarise(median(Age)) %>% as.numeric()


#Imputing median age to missing data for train data
full$Age[is.na(full$Age) & full$Pclass == 1] <- p1MedAge
full$Age[is.na(full$Age) & full$Pclass == 2] <- p2MedAge
full$Age[is.na(full$Age) & full$Pclass == 3] <- p3MedAge

#Checking for missing age data
any(is.na(full$Age))


#Check which passenger is missing a Fare value
full[is.na(full$Fare),]
    
#Imputing the mean fare for Pclass 3
full$Fare[is.na(full$Fare)] <- round(mean(subset(full$Fare, full$Pclass == 3),na.rm = T),0)

#Final check of missing data
any(is.na(subset(full,select=-c(Survived))))


full <- full %>% select(-Name,-SibSp,-Parch,-Sex.factor)
str(full)


full$Dependents <- factor(full$Dependents)
full$Embarked <- factor(full$Embarked)
str(full)


train <- full %>% filter(set == "train") %>% select(-set)
test <- full %>% filter (set == "test") %>% select(-set)

#Creating data partition to cross validation
ind = createDataPartition(train$Survived,times = 1,p = 0.8,list = FALSE)
train_val <- train[ind,]
test_val <- train[-ind,]

#Checking distribution of data partition
round(prop.table(table(train$Survived))*100, digits = 3)
round(prop.table(table(train_val$Survived))*100, digits = 3)
round(prop.table(table(test_val$Survived))*100, digits = 3)


log.model <- glm(formula = Survived ~ . - Fare - Embarked, data = train_val, family = binomial(link='logit'))
summary(log.model)


glm.prediction <- predict(log.model, newdata=test_val, type='response')
glm.prediction <- ifelse(glm.prediction >= 0.5, 1, 0)

table(test_val$Survived,glm.prediction)
sum(test_val$Survived==glm.prediction) / nrow(test_val)


rf.model <- randomForest(Survived ~ ., data = train_val)
print(rf.model$confusion)
importance(rf.model)


rf.prediction <- predict(rf.model, test_val)
table(test_val$Survived,rf.prediction)
sum(test_val$Survived==rf.prediction) / nrow(test_val)


tuned.svm.linear <- tune.svm(Survived ~., data = train, kernel = "linear", cost = c(0.01,0.1,0.2,0.5,0.8,1,2,3,5,10),gamma=c(0.1,0.5,1,2,5))
summary(tuned.svm.linear)
tuned.svm.radial <- tune.svm(Survived ~., data = train, kernel = "radial", cost = c(0.01,0.1,0.2,0.5,0.8,1,2,3,5,10),gamma=c(0.1,0.5,1,2,5))
summary(tuned.svm.radial)


svm.model.linear <- svm(Survived ~ ., data = train_val, kernel="linear", cost = 2, gamma = 0.1)
svm.model.radial <- svm(Survived ~ ., data = train_val, kernel="radial", cost = 10, gamma = 0.1)

svm.prediction.linear <- predict(svm.model.linear, test_val)
table(test_val$Survived,svm.prediction.linear)
sum(test_val$Survived==svm.prediction.linear) / nrow(test_val)

svm.prediction.radial <- predict(svm.model.radial, test_val)
table(test_val$Survived,svm.prediction.radial)
sum(test_val$Survived==svm.prediction.radial) / nrow(test_val)


#Predicting Survival on test data set using Random Forest model
rf.model <- randomForest(Survived ~ ., data = train, type = 'response')
prediction <- predict(svm.model.linear, newdata = test)

submission <- data.frame(PassengerId = 892:1309, Survived = prediction)
write.csv(submission, file = "submission.csv", row.names = FALSE)
paste("Your submission was successfully saved!")

This is where the error occurred when I ran the code:

[screenshot: error message]

As you can see, when I run the line submission <- data.frame(PassengerId = 892:1309, Survived = prediction), the error appears.

Upvotes: 0

Views: 2065

Answers (1)

rw2

Reputation: 1815

You're using the wrong table as your newdata.

You should be using test_val, which has gone through the same treatment as train_val. Instead, you are training on train_val but using test as your newdata.

If you make predictions for your test_val table, both the svm and random forest models will work, and will give you 177 predictions.

You will also need to change your submission data.frame to have 177 rows instead of 418.
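For example, a minimal sketch of that validation step, reusing the object names from your script (this assumes the fitted rf.model and svm.model.linear are still in the workspace):

svm.val.prediction <- predict(svm.model.linear, newdata = test_val)
rf.val.prediction <- predict(rf.model, newdata = test_val)

#Both give one prediction per row of test_val (177 rows)
length(svm.val.prediction)
length(rf.val.prediction)

#So a results frame built from test_val needs 177 rows, not 418
val.results <- data.frame(Actual = test_val$Survived,
                          SVM = svm.val.prediction,
                          RF = rf.val.prediction)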

EDIT: As discussed in the comments (although they've now been removed?), you want to predict for the test data using a model built on the train data.

Try this:

svm.model.linear <- svm(Survived ~ ., data = train, kernel="linear", cost = 2, gamma = 0.1)
svm.prediction.linear <- predict(svm.model.linear, test[,-1])

The predict function works slightly differently for different models in R, which can cause confusion. When you use it with an svm model it is actually calling predict.svm(). This particular function doesn't like that you are passing it newdata with an empty Survived column. If you remove that column by specifying newdata=test[,-1] then the prediction will work as expected.
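Putting that together with the code already in the question, a minimal sketch of the submission step might look like this (assuming Survived is still the first column of test, as it is in the script above):

svm.model.linear <- svm(Survived ~ ., data = train, kernel="linear", cost = 2, gamma = 0.1)

#Drop the all-NA Survived column before predicting
prediction <- predict(svm.model.linear, newdata = test[,-1])
length(prediction)  #418, matching PassengerId 892:1309

submission <- data.frame(PassengerId = 892:1309, Survived = prediction)
write.csv(submission, file = "submission.csv", row.names = FALSE)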

Upvotes: 1
