Reputation: 2053
I am using this dataset Weath Based on age and the documentation states that the accuracy should be around 84%
. Unfortunately, the accuracy of my program is at 25%
To process the data I did the following:
1. Loaded the .txt data file and converted it to a .csv
2. Removed data with missing values
3. Extracted the class values: <=50K >50 and convert it to 0 and 1 respectively
4. For each attribute and for each string value of that attribute I
mapped it to an integer value. Example att1{'cs':0, 'cs2':1},
att2{'usa':0, 'greece':1} ... and so on
5. Called naive bayes on the new integer data set
Python code:
import load_csv as load #my functions to do [1..5] of the list
import numpy as np
my_data = np.genfromtxt('out.csv', dtype = dt, delimiter = ',', skip_header = 1)
data = np.array(load.remove_missing_values(my_data)) #this funcion removes the missing data
features_train = np.array(load.remove_field_num(data, len(data[0]) - 1)) #this function extracts the data, e.g removes the class in the end of the data
label_train = np.array(load.create_labels(data))
features_train = np.array(load.convert_to_int(features_train))
my_data = np.genfromtxt('test.csv', dtype = dt, delimiter = ',', skip_header = 1)
data = np.array(load.remove_missing_values(my_data))
features_test = np.array(load.remove_field_num(data, len(data[0]) - 1))
label_test = np.array(load.create_labels(data)) #extracts the labels from the .csv data file
features_test = np.array(load.convert_to_int(features_test)) #converts the strings to ints(each unique string of an attribute is assigned a unique integer value
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.metrics import accuracy_score
clf = tree.DecisionTreeClassifier()
clf.fit(features_train, label_train)
predict = clf.predict(features_test)
score = accuracy_score(predict, label_test) #Low accuracy score
load_csv module:
import numpy as np
attributes = { 'Private':0, 'Self-emp-not-inc':1, 'Self-emp-inc':2, 'Federal-gov':3, 'Local-gov':4, 'State-gov':5, 'Without-pay':6, 'Never-worked':7,
'Bachelors':0, 'Some-college':1, '11th':2, 'HS-grad':3, 'Prof-school':4, 'Assoc-acdm':5, 'Assoc-voc':6, '9th':7, '7th-8th':8, '12th':9, 'Masters':10, '1st-4th':11, '10th':12, 'Doctorate':13, '5th-6th':14, 'Preschool':15,
'Married-civ-spouse':0, 'Divorced':1, 'Never-married':2, 'Separated':3, 'Widowed':4, 'Married-spouse-absent':5, 'Married-AF-spouse':6,
'Tech-support':0, 'Craft-repair':1, 'Other-service':2, 'Sales':3, 'Exec-managerial':4, 'Prof-specialty':5, 'Handlers-cleaners':6, 'Machine-op-inspct':7, 'Adm-clerical':8,
'Farming-fishing':9, 'Transport-moving':10, 'Priv-house-serv':11, 'Protective-serv':12, 'Armed-Forces':13,
'Wife':0, 'Own-child':1, 'Husband':2, 'Not-in-family':4, 'Other-relative':5, 'Unmarried':5,
'White':0, 'Asian-Pac-Islander':1, 'Amer-Indian-Eskimo':2, 'Other':3, 'Black':4,
'Female':0, 'Male':1,
'United-States':0, 'Cambodia':1, 'England':2, 'Puerto-Rico':3, 'Canada':4, 'Germany':5, 'Outlying-US(Guam-USVI-etc)':6, 'India':7, 'Japan':8, 'Greece':9, 'South':10, 'China':11, 'Cuba':12, 'Iran':13, 'Honduras':14, 'Philippines':15, 'Italy':16, 'Poland':17, 'Jamaica':18, 'Vietnam':19, 'Mexico':20, 'Portugal':21, 'Ireland':22, 'France':23, 'Dominican-Republic':24, 'Laos':25, 'Ecuador':26, 'Taiwan':27, 'Haiti':28, 'Columbia':29, 'Hungary':30, 'Guatemala':31, 'Nicaragua':32, 'Scotland':33, 'Thailand':34, 'Yugoslavia':35, 'El-Salvador':36, 'Trinadad&Tobago':37, 'Peru':38, 'Hong':39, 'Holand-Netherlands':40
}
def remove_field_num(a, i): #function to strip values
names = list(a.dtype.names)
new_names = names[:i] + names[i + 1:]
b = a[new_names]
return b
def remove_missing_values(data):
temp = []
for i in range(len(data)):
for j in range(len(data[i])):
if data[i][j] == '?': #If a missing value '?' is encountered do not append the line to temp
break;
if j == (len(data[i]) - 1) and len(data[i]) == 15:
temp.append(data[i]) #Append the lines that do not contain '?'
return temp
def create_labels(data):
temp = []
for i in range(len(data)): #Iterate through the data
j = len(data[i]) - 1 #Extract the labels
if data[i][j] == '<=50K':
temp.append(0)
else:
temp.append(1)
return temp
def convert_to_int(data):
my_lst = []
for i in range(len(data)):
lst = []
for j in range(len(data[i])):
key = data[i][j]
if j in (1, 3, 5, 6, 7, 8, 9, 13, 14):
lst.append(int(attributes[key]))
else:
lst.append(int(key))
my_lst.append(lst)
temp = np.array(my_lst)
return temp
I have tried to use both tree
and NaiveBayes
but the accuracy is very low. Any suggestions of what am I missing?
Upvotes: 1
Views: 2641
Reputation: 11160
I guess the problem is in preprocessing. It is better to encode the categorical variables as one_hot vectors (vectors with only zero or ones where one corresponds to the desired value for that class) instead of raw numbers. Sklearn DictVectorizer can help you in that. You can do the classification much more efficiently with the pandas
library.
The following shows how easily you can achieve that with help of pandas
library. It works very well along side scikit-learn. This achieves accuracy of 81.6 on a test set that is 20% of the entire data.
from __future__ import division
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.dict_vectorizer import DictVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.metrics.classification import classification_report, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
# Read the data into a pandas dataframe
df = pd.read_csv('adult.data.csv')
# Columns names
cols = np.array(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
'marital-status', 'occupation', 'relationship', 'race', 'sex',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
'target'])
# numeric columns
numeric_cols = ['age', 'fnlwgt', 'education-num',
'capital-gain', 'capital-loss', 'hours-per-week']
# assign names to the columns in the dataframe
df.columns = cols
# replace the target variable to 0 and 1 for <50K and >50k
df1 = df.copy()
df1.loc[df1['target'] == ' <=50K', 'target'] = 0
df1.loc[df1['target'] == ' >50K', 'target'] = 1
# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(
df1.drop('target', axis=1), df1['target'], test_size=0.2)
# numeric attributes
x_num_train = X_train[numeric_cols].as_matrix()
x_num_test = X_test[numeric_cols].as_matrix()
# scale to <0,1>
max_train = np.amax(x_num_train, 0)
max_test = np.amax(x_num_test, 0) # not really needed
x_num_train = x_num_train / max_train
x_num_test = x_num_test / max_train # scale test by max_train
# labels or target attribute
y_train = y_train.astype(int)
y_test = y_test.astype(int)
# categorical attributes
cat_train = X_train.drop(numeric_cols, axis=1)
cat_test = X_test.drop(numeric_cols, axis=1)
cat_train.fillna('NA', inplace=True)
cat_test.fillna('NA', inplace=True)
x_cat_train = cat_train.T.to_dict().values()
x_cat_test = cat_test.T.to_dict().values()
# vectorize (encode as one hot)
vectorizer = DictVectorizer(sparse=False)
vec_x_cat_train = vectorizer.fit_transform(x_cat_train)
vec_x_cat_test = vectorizer.transform(x_cat_test)
# build the feature vector
x_train = np.hstack((x_num_train, vec_x_cat_train))
x_test = np.hstack((x_num_test, vec_x_cat_test))
clf = LogisticRegression().fit(x_train, y_train.values)
pred = clf.predict(x_test)
print classification_report(y_test.values, pred, digits=4)
print accuracy_score(y_test.values, pred)
clf = DecisionTreeClassifier().fit(x_train, y_train)
predict = clf.predict(x_test)
print classification_report(y_test.values, pred, digits=4)
print accuracy_score(y_test.values, pred)
clf = GaussianNB().fit(x_train, y_train)
predict = clf.predict(x_test)
print classification_report(y_test.values, pred, digits=4)
print accuracy_score(y_test.values, pred)
Upvotes: 3