Reputation: 23
Data:
rid age income student credit_rating class_buy_computer
1 young high no fair no
2 young high no excellent no
3 middle high no fair yes
4 senior medium no fair yes
5 senior low yes fair yes
6 senior low yes excellent no
7 middle low yes excellent yes
8 young medium no fair yes
9 young low yes fair yes
10 senior medium yes fair yes
11 young medium yes excellent yes
12 middle medium no excellent yes
13 middle high yes fair yes
14 senior medium no excellent no
Code:
from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
from sklearn.externals.six import StringIO
# Script from the question: one-hot encode the categorical CSV columns and
# fit an entropy-based decision tree on them.
# NOTE(review): the indentation of this listing was lost; the loop nesting
# described in the comments below is inferred from the reported error.
myData = open(r'C:\Users\USER\Desktop\test.csv')
reader = csv.reader(myData)
# The first CSV row holds the column names.
headers=next(reader)
print (headers)
# One feature dict per sample, and the matching class label per sample.
featuelist=[]
labeList=[]
for row in reader:
# The last column of each row is the class label.
labeList.append(row[len(row)-1])
rowDict={}
# Skip column 0 (rid) and the last column (label); the rest are features.
for i in range(1,len(row)-1):
rowDict[headers[i]]=row[i]
# NOTE(review): if this append sits inside the inner loop, the same dict is
# appended once per feature column (4 times per row), which would give
# 14 * 4 = 56 samples against 14 labels -- matching the reported ValueError.
featuelist.append(rowDict)
print(featuelist)
# DictVectorizer turns the string-valued feature dicts into numeric columns.
vec=DictVectorizer()
dummyX=vec.fit_transform(featuelist).toarray()
print('dummyX:'+str(dummyX))
print(vec.get_feature_names())
print('labeList:'+str(labeList))
# Encode the class labels numerically for the classifier.
lb=preprocessing.LabelBinarizer()
dummyY=lb.fit_transform(labeList)
print('dummyY:'+str(dummyY))
# Use entropy as the split criterion.
clf=tree.DecisionTreeClassifier(criterion='entropy')
clf=clf.fit(dummyX,dummyY)
print('clf:'+str(clf))
I get this error:
File "<ipython-input-20-eacaea56a8a9>", line 1, in <module>
runfile('C:/Users/USER/Desktop/test.py', wdir='C:/Users/USER/Desktop')
File "D:\tools\python\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
File "D:\tools\python\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile
File "C:/Users/USER/Desktop/test.py", line 32, in <module>
clf=clf.fit(dummyX,dummyY)
File "D:\tools\python\lib\site-packages\sklearn\tree\tree.py", line 790, in fit
X_idx_sorted=X_idx_sorted)
File "D:\tools\python\lib\site-packages\sklearn\tree\tree.py", line 236, in fit
"number of samples=%d" % (len(y), n_samples))
ValueError: Number of labels=14 does not match number of samples=56
Upvotes: 0
Views: 1345
Reputation: 8811
It's simply because each row is appended 4 times (once per feature column)
to the featuelist list, so you end up with 14 x 4 = 56 samples but only
14 labels. The line featuelist.append(rowDict) should not be inside the
second loop.
from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
from sklearn.externals.six import StringIO
# Corrected script: build exactly one feature dict per CSV row, one-hot
# encode the categorical features, and fit an entropy-based decision tree.
#
# Expected CSV layout: a header row, then rows of the form
#   rid, <feature columns...>, <class label>

featuelist = []  # one {column name: value} dict per sample
labeList = []    # class label of each sample, in the same order

# Read inside a context manager so the file handle is always closed,
# even if parsing raises.
with open('/home/kashif/test.csv') as myData:
    reader = csv.reader(myData)
    headers = next(reader)
    print (headers)
    for row in reader:
        # csv.reader yields [] for blank lines (e.g. a trailing newline);
        # skip them instead of failing on the label lookup.
        if not row:
            continue
        # The last column holds the class label.
        labeList.append(row[-1])
        # Column 0 is the record id and the last column is the label;
        # everything in between is a feature.
        rowDict = {}
        for i in range(1, len(row) - 1):
            rowDict[headers[i]] = row[i]
        #Make sure the below line is not inside the second loop
        featuelist.append(rowDict)  # once per row, after the inner loop
print(featuelist)

# sparse=False makes fit_transform return a dense array directly.
vec = DictVectorizer(sparse=False)
dummyX = vec.fit_transform(featuelist)
print('dummyX:'+str(dummyX))
print(vec.get_feature_names())
print('labeList:'+str(labeList))

# Encode the class labels numerically for the classifier.
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labeList)
print('dummyY:'+str(dummyY))

# Use entropy as the split criterion.
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print('clf:'+str(clf))
Output :
clf:DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=None,
splitter='best')
Upvotes: 1