Escort Personal Adz
Escort Personal Adz

Reputation: 39

Why am I getting an almost perfect test accuracy?

I am using this dataset to predict employee performance using different ML algorithms such as DecisionTreeClassifier, CategoricalNB, LogisticRegression, GaussianNB. This is basically how the dataset is structured

df.head(5)

   Age  DailyRate  DistanceFromHome  EnvironmentSatisfaction  HourlyRate  ...  EducationField  Department  BusinessTravel  OverTime  Over18
0   41       1102                 1                        2          94  ...               1           2               2         1       0
1   49        279                 8                        3          61  ...               1           1               1         0       0
2   37       1373                 2                        4          92  ...               4           1               2         1       0
3   33       1392                 3                        4          56  ...               1           1               1         1       0
4   27        591                 2                        1          40  ...               3           1               2         0       0

When I try to get the accuracy of each model using

model.score(X_test, y_test)

it gives almost 100% accuracy (99.98...)

What could be the reason for this?

Here is my code:

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import preprocessing

def readData(path):
    """Load the employee-performance CSV and return pre-processed data.

    Args:
        path: path to the CSV file (IBM HR / employee performance data).

    Returns:
        list ``[X, y]`` as produced by ``pre_processing``: the scaled
        feature matrix and the 'PerformanceRating' target column.
    """
    dataframe = pd.read_csv(path)

    # Drop the target itself plus identifier / survey-style columns that
    # should not be used as predictors.
    drop_cols = ['PerformanceRating', 'Attrition', 'Education',
                 'EmployeeCount', 'EmployeeNumber', 'StockOptionLevel',
                 'WorkLifeBalance']
    inputs = dataframe.drop(drop_cols, axis='columns')

    target = dataframe['PerformanceRating']
    return pre_processing(inputs, target)


def pre_processing(inputs_, target_):
    """Integer-encode the categorical feature columns and min-max scale
    every feature to [0, 1].

    Args:
        inputs_: feature DataFrame; must contain the categorical columns
            listed below (plus any numeric feature columns). It is NOT
            modified — the original implementation mutated it in place,
            and its scaler was fit on ``inputs_`` instead of the encoded
            frame, which only worked because of that hidden mutation.
        target_: target Series, passed through unchanged.

    Returns:
        list ``[X_scaled, target]`` where ``X_scaled`` is a 2-D float
        numpy array of scaled features and ``target`` is *target_*.
    """
    categorical_cols = ['MaritalStatus', 'JobRole', 'Gender',
                        'EducationField', 'Department', 'BusinessTravel',
                        'OverTime', 'Over18']

    # Work on a copy so the caller's DataFrame is left untouched.
    inputs = inputs_.copy()

    # Integer-encode each categorical column. `.cat.codes` with inferred
    # (lexically sorted) categories produces the same codes as sklearn's
    # LabelEncoder, without eight copy-pasted encoder instances.
    encoded = inputs[categorical_cols].apply(
        lambda col: col.astype('category').cat.codes)

    # Drop the raw string columns and append the encoded ones at the end,
    # preserving the column order the original drop/rename dance produced.
    inputs = inputs.drop(columns=categorical_cols)
    inputs[categorical_cols] = encoded

    print(inputs.head(5))

    # Per-column min-max scaling to [0, 1] (same as MinMaxScaler's
    # default); constant columns map to 0 instead of dividing by zero.
    values = inputs.to_numpy(dtype=float)
    col_min = values.min(axis=0)
    col_range = values.max(axis=0) - col_min
    col_range[col_range == 0] = 1.0  # avoid 0/0 on constant columns
    X_scaled = (values - col_min) / col_range

    return [X_scaled, target_]



def decision_tree_classifier(inputs_, target_, random_state=None):
    """Train a DecisionTreeClassifier on an 80/20 split and print the
    test-set accuracy.

    Args:
        inputs_: feature matrix.
        target_: class labels.
        random_state: optional seed for a reproducible split/model.
    """
    # Stratify so the test split keeps the target's class distribution —
    # with a highly imbalanced target an unstratified split makes plain
    # accuracy misleading (near-100% by always predicting the majority).
    X_train, X_test, y_train, y_test = train_test_split(
        inputs_, target_, test_size=0.2, stratify=target_,
        random_state=random_state)

    model = DecisionTreeClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    # NOTE(review): with imbalanced classes, prefer F1/recall/precision
    # over raw accuracy.
    print(model.score(X_test, y_test))

def naive_bayes_gaussian(inputs_, target_, random_state=None):
    """Train a GaussianNB model on an 80/20 split and print the
    test-set accuracy.

    Args:
        inputs_: feature matrix.
        target_: class labels.
        random_state: optional seed for a reproducible split.
    """
    # Stratified split keeps the (imbalanced) class distribution in the
    # test set, so the printed accuracy is less misleading.
    X_train, X_test, y_train, y_test = train_test_split(
        inputs_, target_, test_size=0.2, stratify=target_,
        random_state=random_state)

    model = GaussianNB()
    model.fit(X_train, y_train)

    # NOTE(review): with imbalanced classes, prefer F1/recall/precision
    # over raw accuracy.
    print(model.score(X_test, y_test))

def naive_bayes_categorical(inputs_, target_, random_state=None):
    """Train a CategoricalNB model on an 80/20 split and print the
    test-set accuracy.

    Args:
        inputs_: feature matrix (CategoricalNB expects non-negative
            integer category indices per feature).
        target_: class labels.
        random_state: optional seed for a reproducible split.
    """
    # Stratified split keeps the (imbalanced) class distribution in the
    # test set, so the printed accuracy is less misleading.
    X_train, X_test, y_train, y_test = train_test_split(
        inputs_, target_, test_size=0.2, stratify=target_,
        random_state=random_state)

    model = CategoricalNB()
    model.fit(X_train, y_train)

    # NOTE(review): with imbalanced classes, prefer F1/recall/precision
    # over raw accuracy.
    print(model.score(X_test, y_test))


def logistic_regression(inputs_, target_, random_state=None):
    """Train a one-vs-rest LogisticRegression on an 80/20 split and
    print the test-set accuracy.

    Args:
        inputs_: feature matrix (already min-max scaled upstream).
        target_: class labels.
        random_state: optional seed for a reproducible split.
    """
    # Stratified split keeps the (imbalanced) class distribution in the
    # test set, so the printed accuracy is less misleading.
    X_train, X_test, y_train, y_test = train_test_split(
        inputs_, target_, test_size=0.2, stratify=target_,
        random_state=random_state)

    # max_iter raised from the default 100, which frequently stops with a
    # ConvergenceWarning before the solver converges.
    model = LogisticRegression(multi_class="ovr", max_iter=1000)
    model.fit(X_train, y_train)

    # NOTE(review): with imbalanced classes, prefer F1/recall/precision
    # over raw accuracy.
    print(model.score(X_test, y_test))


if __name__ == "__main__":
    # Load and pre-process the dataset, then evaluate one of the models.
    inputs, target = readData("performance.csv")

    print(inputs)

    naive_bayes_gaussian(inputs, target)

Upvotes: 0

Views: 146

Answers (1)

Batuhan B
Batuhan B

Reputation: 1855

The dataset you are working on is probably highly imbalanced (the IBM HR Analytics dataset). You need to split your test dataset with a stratified approach, and you could also use another metric instead of accuracy — such as F1, recall, or precision — to understand your model's performance.

Upvotes: 2

Related Questions