Reputation: 39
I am using this dataset to predict employee performance
using different ML algorithms such as DecisionTreeClassifier
, CategoricalNB
, LogisticRegression
, GaussianNB
. This is basically how the dataset is structured
df.head(5)
Age DailyRate DistanceFromHome EnvironmentSatisfaction HourlyRate ... EducationField Department BusinessTravel OverTime Over18
0 41 1102 1 2 94 ... 1 2 2 1 0
1 49 279 8 3 61 ... 1 1 1 0 0
2 37 1373 2 4 92 ... 4 1 2 1 0
3 33 1392 3 4 56 ... 1 1 1 1 0
4 27 591 2 1 40 ... 3 1 2 0 0
When I try to get the accuracy
of each model using
model.score(X_test, y_test)
it gives almost 100% accuracy (99.98...)
What could be the reason for this?
Here is my code:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import preprocessing
def readData(path):
    """Load the CSV at *path*, drop non-feature columns, and return the
    pre-processed ``[features, target]`` pair produced by ``pre_processing``.
    """
    frame = pd.read_csv(path)
    unused_columns = [
        'PerformanceRating', 'Attrition', 'Education', 'EmployeeCount',
        'EmployeeNumber', 'StockOptionLevel', 'WorkLifeBalance',
    ]
    # 'PerformanceRating' is the prediction target, so it must not stay
    # among the features.
    features = frame.drop(unused_columns, axis='columns')
    labels = frame['PerformanceRating']
    return pre_processing(features, labels)
def pre_processing(inputs_, target_):
    """Label-encode the categorical feature columns and min-max scale everything.

    Parameters
    ----------
    inputs_ : pandas.DataFrame
        Feature frame containing the categorical columns listed below.
        A copy is taken, so the caller's frame is NOT mutated (the original
        mutated it in place).
    target_ : pandas.Series
        Target column; passed through unchanged.

    Returns
    -------
    list
        ``[X_scaled, target_]`` where ``X_scaled`` is the min-max-scaled
        feature matrix (a list is kept for caller compatibility).
    """
    # Work on a copy so the caller's dataframe is left untouched.
    inputs = inputs_.copy()
    categorical_columns = [
        'MaritalStatus', 'JobRole', 'Gender', 'EducationField',
        'Department', 'BusinessTravel', 'OverTime', 'Over18',
    ]
    # One encoder per column, transformed in place — this replaces the
    # original's eight copy-pasted encode / drop / rename steps.
    for column in categorical_columns:
        inputs[column] = LabelEncoder().fit_transform(inputs[column])
    # BUG FIX: the original called fit_transform(inputs_) — the caller's
    # frame — and only worked because that frame had been mutated via
    # aliasing. Scale the locally processed frame instead.
    # NOTE(review): fitting the scaler on the full dataset before the
    # train/test split leaks test-set statistics into training; for strict
    # evaluation, fit the scaler on the training split only.
    X_scaled = preprocessing.MinMaxScaler().fit_transform(inputs)
    print(inputs.head(5))
    return [X_scaled, target_]
def decision_tree_classifier(inputs_, target_):
    """Train a DecisionTreeClassifier on an 80/20 split and report test accuracy.

    The original printed the score and returned ``None``; the score is now
    also returned so callers can use it programmatically (backward
    compatible — existing callers ignore the return value).

    Returns
    -------
    float
        Mean accuracy on the held-out test set.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        inputs_, target_, test_size=0.2)
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print(score)
    return score
def naive_bayes_gaussian(inputs_, target_):
    """Train a GaussianNB model on an 80/20 split and report test accuracy.

    The original printed the score and returned ``None``; the score is now
    also returned (backward compatible — existing callers ignore it).

    Returns
    -------
    float
        Mean accuracy on the held-out test set.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        inputs_, target_, test_size=0.2)
    model = GaussianNB()
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print(score)
    return score
def naive_bayes_categorical(inputs_, target_):
    """Train a CategoricalNB model on an 80/20 split and report test accuracy.

    The original printed the score and returned ``None``; the score is now
    also returned (backward compatible — existing callers ignore it).

    Returns
    -------
    float
        Mean accuracy on the held-out test set.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        inputs_, target_, test_size=0.2)
    model = CategoricalNB()
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print(score)
    return score
def logistic_regression(inputs_, target_):
    """Train a one-vs-rest LogisticRegression on an 80/20 split and report
    test accuracy.

    The original printed the score and returned ``None``; the score is now
    also returned (backward compatible — existing callers ignore it).

    Returns
    -------
    float
        Mean accuracy on the held-out test set.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        inputs_, target_, test_size=0.2)
    model = LogisticRegression(multi_class="ovr")
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print(score)
    return score
if __name__ == "__main__":
    # readData returns a two-element [features, target] pair; unpack it
    # directly instead of indexing.
    features, labels = readData("performance.csv")
    print(features)
    naive_bayes_gaussian(features, labels)
Upvotes: 0
Views: 146
Reputation: 1855
The dataset you are working on is probably highly imbalanced (the IBM HR Analytics dataset). You need to split your dataset with a stratified
approach (e.g. the `stratify` argument of `train_test_split`), and you might also use another metric instead of accuracy
such as F1
, recall
or precision
to understand your model performance.
Upvotes: 2