Reputation: 176
I am new to Random Forest. I am trying to train a random forest model and then apply it to a testing dataset, but the predictions I get back are not the same length as the test data. I have trained a good model but need to see how it performs on my test data. Please see my code below. Any tips would be appreciated.
# Question script: download the weight-lifting data, split it 70/30, fit a
# random forest on the training part and tabulate predictions against the
# true `classe` labels of each part.
#Import Data
url <- "http://groupware.les.inf.puc-rio.br/static/WLE/WearableComputing_weight_lifting_exercises_biceps_curl_variations.csv"
# The strings "NA", "#DIV/0!" and "" are all read in as NA.
df <- read.csv(url, header = TRUE, na.strings=c("NA","#DIV/0!",""))
#Keep only the columns that contain no NA values at all
# (colSums(is.na(df)) == 0 drops a column as soon as it has a single NA)
df <- df[,colSums(is.na(df)) == 0]
#Remove all irrelevant columns that you will not need as predictors
# (the first seven columns are dropped by position)
df <- subset(df, select = -c(1:7))
#Create training and testing datasets
library(caret)
# NOTE(review): set.seed() is only called after this split, so the partition
# is presumably different on every run -- confirm and move the seed up if
# reproducibility matters.
inTrain <- createDataPartition(y = df$classe,
p=0.7, list = FALSE)
training <- df[inTrain,]
testing <- df[-inTrain,]
set.seed(2020)
# NOTE(review): no library(randomForest) call is visible in this script;
# presumably the package is attached elsewhere. `method="rf"` looks like a
# caret::train() argument -- confirm that randomForest() simply ignores it.
rfmodel <- randomForest(classe ~ ., data = training, method="rf", ntree=100, importance = TRUE)
print(rfmodel) #Error rate of 0.17% = good!
#validating that this method works on training set
# NOTE(review): the answers in this thread point out that predict() expects
# `newdata =`, not `data =`. With `data =` the data frame is apparently not
# used, so this table presumably shows the model's stored training-set
# predictions rather than fresh ones -- confirm against ?predict.randomForest.
prediction_train <- predict(rfmodel, data = training, type = "class")
table(prediction_train, training$classe)
#Cannot figure out what is going wrong here
# NOTE(review): same `data =` vs `newdata =` issue as above. The length
# reported below (27472) appears to be the number of training rows, not the
# 11770 test rows, which is why table() refuses to cross-tabulate them.
prediction_test <- predict(rfmodel, data = testing)
length(prediction_test) #27472
length(testing$classe) #11770
table(prediction_test, testing$classe) #ERROR (see below)
#Error in table(prediction_test, testing$classe) : all arguments must have the same length
Packages I am using:
version$version.string [1] "R version 3.5.3 (2019-03-11)" packageVersion("caret", lib.loc = NULL) [1] ‘6.0.85’ packageVersion("rattle", lib.loc = NULL) [1] ‘5.3.0’ packageVersion("randomForest", lib.loc = NULL) [1] ‘4.6.14’ packageVersion("randomForestExplainer", lib.loc = NULL) [1] ‘0.10.0’
Upvotes: 3
Views: 815
Reputation: 4243
Using sklearn I was able to get 96% accuracy. I used 400 trees and a max depth of 32. Deeper trees seem to be preferable here, as they allow for greater accuracy.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
# Train and evaluate a scaler -> PCA -> random-forest pipeline on the
# weight-lifting data, then plot the confusion matrix for the held-out split.
df = pd.read_csv('weight_training.csv')
#https://rstudio-pubs-static.s3.amazonaws.com/230066_902d438b84794c6cb5585496ebc82119.html
# Columns used as predictors (presumably chosen following the analysis linked
# above -- confirm).
LABELS = ['raw_timestamp_part_1',
          'yaw_belt',
          'num_window',
          'roll_belt',
          'magnet_dumbbell_z',
          'pitch_belt',
          'magnet_dumbbell_y',
          'accel_dumbbell_y',
          'pitch_forearm',
          'roll_arm',
          'roll_dumbbell',
          'accel_dumbbell_z']

# Encode the string class labels in `classe` as integers 0..k-1; the original
# names stay available in encoder.classes_ (same order as the integer codes).
encoder = LabelEncoder()
X = df[LABELS]
y = encoder.fit_transform(df['classe'])

# Correlation heatmap of the selected predictors.
corr = df[LABELS].corr()
plt.figure(figsize=(12, 12))
sns.heatmap(corr, annot=True)
plt.show()

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('clf', RandomForestClassifier(
        n_estimators=400,
        max_depth=32,
        min_samples_leaf=2,
        max_features=2,
        random_state=42))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Build the confusion matrix over every encoded class so its shape always
# matches the label names, even if a class is missing from the test split.
class_names = encoder.classes_
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(class_names)))

# Label rows/columns with the real class names and draw on the axes created
# here; the previous hard-coded [1, 2] ticks did not match the number of
# classes and were set before the heatmap was drawn.
fig, ax = plt.subplots()
sns.heatmap(pd.DataFrame(cm, index=class_names, columns=class_names),
            annot=True, cmap="coolwarm", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
print("Accuracy Score", accuracy_score(y_test, y_pred))
# Display the confusion-matrix figure (the correlation plot above is shown the
# same way); printed first so the score is not held back by the plot window.
plt.show()
output:
Accuracy Score 0.96
# Randomized hyperparameter search for the random-forest step.
# Candidate values sampled by the search.
parameter_grid = {
    'n_estimators': [1, 2, 100, 400, 450],
    'max_depth': [2, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10],
    'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
}
# Number of parameter combinations to try.
number_models = 8
# NOTE: only the classifier step is tuned, and it is fitted on the raw
# features; the pipeline's scaler and PCA steps are not part of this search.
random_RandomForest_class = RandomizedSearchCV(
    estimator=pipeline['clf'],
    param_distributions=parameter_grid,
    n_iter=number_models,
    scoring='accuracy',
    n_jobs=2,
    cv=4,
    refit=True,
    return_train_score=True,
    # Fix the sampling of candidates so repeated runs try the same settings.
    random_state=42)
random_RandomForest_class.fit(X_train, y_train)
# Score the refitted best model on the held-out rows only; predicting on the
# full X would mix in the rows the model was trained on.
predictions = random_RandomForest_class.predict(X_test)
print("Best params", random_RandomForest_class.best_params_)
print("Best score", random_RandomForest_class.best_score_)
print("Test accuracy", accuracy_score(y_test, predictions))
Upvotes: 0
Reputation: 8176
Use the `newdata =` argument (not `data =`) in the `predict()` function for both the training and the testing data, like this:
# Check the fitted model against the rows it was trained on, passing them
# explicitly through `newdata`.
prediction_train <- predict(
  rfmodel,
  newdata = training,
  type = "class"
)
# Cross-tabulate predicted against actual classes.
table(prediction_train, training$classe)
prediction_train A B C D E
A 7812 0 0 0 0
B 0 5316 0 0 0
C 0 0 4791 0 0
D 0 0 0 4503 0
E 0 0 0 0 5050
# Check the fitted model against the held-out test rows, passing them
# explicitly through `newdata`.
prediction_test <- predict(
  rfmodel,
  newdata = testing,
  type = "class"
)
# The two lengths are printed so they can be compared before tabulating.
length(prediction_test)
length(testing$classe)
# Cross-tabulate predicted against actual classes.
table(prediction_test, testing$classe)
prediction_test A B C D E
A 3346 7 0 0 0
B 1 2269 1 0 0
C 0 1 2052 4 0
D 0 0 0 1924 1
E 0 0 0 1 2163
Upvotes: 0
Reputation: 2709
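The problem was the `data =` argument in the `predict()` call for the test set: `predict()` does not use `data`, so the test data has to be supplied as the second argument (`newdata`) instead. Cheers.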
# Fit the forest on the training set. The response is named in the formula,
# so no hard-coded column index is needed to keep it out of the predictors.
# `method = "rf"` is dropped: it is a caret::train() argument, not one that
# randomForest() uses.
rfmodel <- randomForest(
  classe ~ .,
  data = training,
  ntree = 100,
  importance = TRUE
)
# Supply the test rows explicitly as `newdata`; the `classe` column may stay
# in the data frame because only the predictor columns are used.
prediction_test <- predict(rfmodel, newdata = testing)
# Cross-tabulate predicted against actual classes.
table(prediction_test, testing$classe)
prediction_test A B C D E
A 3346 3 0 0 0
B 1 2274 4 0 0
C 0 0 2049 15 0
D 0 0 0 1913 0
E 0 0 0 1 2164
Upvotes: 1