Reputation: 1127
I have two datasets, train_val and test. I want to build three models and use them to predict the outcome. These are my three models:
#Model 1
rf.model <- randomForest(Survived ~ ., data = train_val, type = 'response')
#Model 2
svm.model.linear <- svm(Survived ~ ., data = train_val, kernel="linear", cost = 2, gamma = 0.1)
#Model 3
svm.model.radial <- svm(Survived ~ ., data = train_val, kernel="radial", cost = 10, gamma = 0.1)
After training the three models with the code above, I use the following code to make the prediction:
prediction <- predict(rf.model, newdata = test)
The output of the prediction is:
Then I put the outcome into a dataframe:
df <- data.frame(PassengerId = 892:1309, Survived = prediction)
The output for df is:
So far everything works well. However, when I replace rf.model in prediction with svm.model.linear, the output changes:
and hence the data frame also gives an error:
May I know the reason for this, and how can I get the same output for df as when I was using rf.model previously? Any help will be greatly appreciated!
This is the full code from the post:
library(dplyr)
library(tidyr)
library(ggplot2)
library(Amelia)
library(corrgram)
library(caret)
library(randomForest)
library(e1071)
#Import train and test dataset
train <- read.csv("C:/Users/User/Desktop/Titanic/train.csv")
test <- read.csv("C:/Users/User/Desktop/Titanic/test.csv")
#Ensure dataset loaded correctly
head(train)
head(test)
train$set <- "train"
test$set <- "test"
test$Survived <- NA
full <- rbind(train,test)
summary(full)
str(full)
# Creating new training and testing data set
full <- dplyr::select(full,-PassengerId,-Ticket,-Cabin)
#Creating a new variable called Dependents
full$Dependents <- full$SibSp + full$Parch
head(full)
missing_values <- full %>% summarise(across(everything(), ~sum(is.na(.))/length(.)))
missing_values <- missing_values %>% pivot_longer(cols = everything(),names_to = "feature",values_to="missing_pct")
ggplot(missing_values,aes(x=reorder(feature,-missing_pct),y=missing_pct,label=missing_pct)) + geom_text(hjust = 0,aes(label=scales::percent(missing_pct))) +
geom_col(fill='red') + coord_flip() + scale_y_continuous(labels=scales::percent)
#Overview of scatterplots between all variables
full$Sex <- factor(full$Sex)
full$Sex.factor <- as.numeric(full$Sex)
str(full)
full %>% filter(set == "train") %>% select(-Name,-Embarked,-set,-SibSp,-Parch) %>% corrgram(lower.panel = panel.shade,upper.panel = panel.pie)
full$Survived <- as.factor(full$Survived)
full$Pclass <- as.factor(full$Pclass)
full %>% filter(set == "train") %>% ggplot(aes(Sex)) + geom_bar(aes(fill=Survived))
full %>% filter(set == "train") %>% ggplot(aes(Pclass)) + geom_bar(aes(fill=Survived))
full %>% filter(set == "train") %>% ggplot(aes(Age)) + geom_histogram(bins=10,aes(fill=Survived))
full %>% filter(set == "train") %>% ggplot(aes(Dependents)) + geom_histogram(bins=10,aes(fill=Survived))
full %>% ggplot(aes(x=Pclass,y=Age)) + geom_boxplot(aes(fill=Pclass))
full %>% ggplot(aes(x=Sex,y=Age)) + geom_boxplot(aes(fill=Sex))
#Getting the median age based on Pclass
p1MedAge <- full %>% filter(Pclass == 1 & Age != "NA") %>% summarise(median(Age)) %>% as.numeric()
p2MedAge <- full %>% filter(Pclass == 2 & Age != "NA") %>% summarise(median(Age)) %>% as.numeric()
p3MedAge <- full %>% filter(Pclass == 3 & Age != "NA") %>% summarise(median(Age)) %>% as.numeric()
#Imputing median age to missing data for train data
full$Age[is.na(full$Age) & full$Pclass == 1] <- p1MedAge
full$Age[is.na(full$Age) & full$Pclass == 2] <- p2MedAge
full$Age[is.na(full$Age) & full$Pclass == 3] <- p3MedAge
#Checking for missing age data
any(is.na(full$Age))
full[is.na(full$Fare),]
#Imputing the mean fare for Pclass 3
full$Fare[is.na(full$Fare)] <- round(mean(subset(full$Fare, full$Pclass == 3),na.rm = T),0)
#Final check of missing data
any(is.na(subset(full,select=-c(Survived))))
full <- full %>% select(-Name,-SibSp,-Parch,-Sex.factor)
str(full)
full$Dependents <- factor(full$Dependents)
full$Embarked <- factor(full$Embarked)
str(full)
train <- full %>% filter(set == "train") %>% select(-set)
test <- full %>% filter (set == "test") %>% select(-set)
#Creating data partition to cross validation
ind = createDataPartition(train$Survived,times = 1,p = 0.8,list = FALSE)
train_val <- train[ind,]
test_val <- train[-ind,]
#Checking distribution of data partition
round(prop.table(table(train$Survived))*100, digits = 3)
round(prop.table(table(train_val$Survived))*100, digits = 3)
round(prop.table(table(test_val$Survived))*100, digits = 3)
log.model <- glm(formula = Survived ~ . - Fare - Embarked, data = train_val, family = binomial(link='logit'))
summary(log.model)
glm.prediction <- predict(log.model, newdata=test_val, type='response')
glm.prediction <- ifelse(glm.prediction >= 0.5, 1, 0)
table(test_val$Survived,glm.prediction)
sum(test_val$Survived==glm.prediction) / nrow(test_val)
rf.model <- randomForest(Survived ~ ., data = train_val)
print(rf.model$confusion)
importance(rf.model)
rf.prediction <- predict(rf.model, test_val)
table(test_val$Survived,rf.prediction)
sum(test_val$Survived==rf.prediction) / nrow(test_val)
tuned.svm.linear <- tune.svm(Survived ~., data = train, kernel = "linear", cost = c(0.01,0.1,0.2,0.5,0.8,1,2,3,5,10),gamma=c(0.1,0.5,1,2,5))
summary(tuned.svm.linear)
tuned.svm.radial <- tune.svm(Survived ~., data = train, kernel = "radial", cost = c(0.01,0.1,0.2,0.5,0.8,1,2,3,5,10),gamma=c(0.1,0.5,1,2,5))
summary(tuned.svm.radial)
svm.model.linear <- svm(Survived ~ ., data = train_val, kernel="linear", cost = 2, gamma = 0.1)
svm.model.radial <- svm(Survived ~ ., data = train_val, kernel="radial", cost = 10, gamma = 0.1)
svm.prediction.linear <- predict(svm.model.linear, test_val)
table(test_val$Survived,svm.prediction.linear)
sum(test_val$Survived==svm.prediction.linear) / nrow(test_val)
svm.prediction.radial <- predict(svm.model.radial, test_val)
table(test_val$Survived,svm.prediction.radial)
sum(test_val$Survived==svm.prediction.radial) / nrow(test_val)
#Predicting Survival on test data set using Random Forest model
rf.model <- randomForest(Survived ~ ., data = train, type = 'response')
prediction <- predict(svm.model.linear, newdata = test)
submission <- data.frame(PassengerId = 892:1309, Survived = prediction)
write.csv(submission, file = "submission.csv", row.names = FALSE)
paste("Your submission was successfully saved!")
This is where the error occurred when I ran the code.
As you can see, when I run up to the line submission <- data.frame(PassengerId = 892:1309, Survived = prediction), the error appears.
Upvotes: 0
Views: 2065
Reputation: 1815
You're using the wrong table as your newdata. You should be using test_val, which has gone through the same treatment as train_val. Instead, you are training using train_val but using test as your newdata.
If you make predictions for your test_val table, both the svm and random forest models will work, and will give you 177 predictions.
You will also need to change your submission data.frame to have 177 rows instead of 418.
EDIT
As discussed in the comments (although they've now been removed?), you want to predict for the test data using a model built on the train data.
Try this:
svm.model.linear <- svm(Survived ~ ., data = train, kernel="linear", cost = 2, gamma = 0.1)
svm.prediction.linear <- predict(svm.model.linear, test[,-1])
The predict function works slightly differently for different models in R, which can cause confusion. When you use it with an svm model it is actually calling predict.svm(). This particular function doesn't like being passed newdata with an empty Survived column. If you remove that column by specifying newdata = test[,-1], then the prediction will work as expected.
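Putting that together with the submission step from the question, a minimal sketch (the PassengerId range 892:1309 is taken from the original code) would be:
svm.model.linear <- svm(Survived ~ ., data = train, kernel = "linear", cost = 2, gamma = 0.1)
prediction <- predict(svm.model.linear, newdata = test[,-1])   # drop the empty Survived column
submission <- data.frame(PassengerId = 892:1309, Survived = prediction)
write.csv(submission, file = "submission.csv", row.names = FALSE)
Dropping the column by name, e.g. subset(test, select = -Survived), does the same thing without relying on Survived being the first column.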
Upvotes: 1