Reputation: 21
Here is the full error message:
AttributeErrorTraceback (most recent call last) in () 24 25 # train ---> 26 pipe.fit(train1, labelsTrain1) 27 28 # test
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\pipeline.pyc in fit(self, X, y, **fit_params) 246 This estimator 247 """ --> 248 Xt, fit_params = self._fit(X, y, **fit_params) 249 if self._final_estimator is not None: 250 self._final_estimator.fit(Xt, y, **fit_params)
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\pipeline.pyc in _fit(self, X, y, **fit_params) 211 Xt, fitted_transformer = fit_transform_one_cached( 212 cloned_transformer, None, Xt, y, --> 213 **fit_params_steps[name]) 214 # Replace the transformer of the step with the fitted 215 # transformer. This is necessary when loading the transformer
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\externals\joblib\memory.pyc in call(self, *args, **kwargs) 360 361 def call(self, *args, **kwargs): --> 362 return self.func(*args, **kwargs) 363 364 def call_and_shelve(self, *args, **kwargs):
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\pipeline.pyc in _fit_transform_one(transformer, weight, X, y, **fit_params) 579 **fit_params): 580 if hasattr(transformer, 'fit_transform'): --> 581 res = transformer.fit_transform(X, y, **fit_params) 582 else: 583 res = transformer.fit(X, y, **fit_params).transform(X)
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\feature_extraction\text.pyc in fit_transform(self, raw_documents, y) 867 868 vocabulary, X = self._count_vocab(raw_documents, --> 869 self.fixed_vocabulary_) 870 871 if self.binary:
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\feature_extraction\text.pyc in _count_vocab(self, raw_documents, fixed_vocab) 790 for doc in raw_documents: 791 feature_counter = {} --> 792 for feature in analyze(doc): 793 try: 794 feature_idx = vocabulary[feature]
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\feature_extraction\text.pyc in (doc) 264 265 return lambda doc: self._word_ngrams( --> 266 tokenize(preprocess(self.decode(doc))), stop_words) 267 268 else:
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\feature_extraction\text.pyc in (x) 230 231 if self.lowercase: --> 232 return lambda x: strip_accents(x.lower()) 233 else: 234 return strip_accents
AttributeError: 'NoneType' object has no attribute 'lower'
Here is the code:
def printNMostInformative(vectorizer, clf, N):
    """Print the N lowest- and N highest-weighted features of a fitted model.

    Each feature name is paired with its coefficient from ``clf.coef_[0]``
    and the pairs are sorted ascending by coefficient. The first N pairs are
    printed under "Class 1 best" and the last N pairs (largest coefficient
    first) under "Class 2 best".

    Args:
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``
            or the older ``get_feature_names()``.
        clf: Fitted estimator whose ``coef_[0]`` is aligned with the
            vectorizer's feature names.
        N: Number of (coefficient, feature) pairs to print for each end.
    """
    # Newer scikit-learn releases dropped get_feature_names() in favour of
    # get_feature_names_out(); fall back to the old accessor when needed.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    topClass1 = coefs_with_fns[:N]
    # Walk backwards from the end so the largest coefficient comes first.
    topClass2 = coefs_with_fns[:-(N + 1):-1]

    print("Class 1 best: ")
    for feat in topClass1:
        print(feat)
    print("Class 2 best: ")
    for feat in topClass2:
        print(feat)
# Unigram bag-of-words counts feeding a linear SVM, with a text-cleaning
# step in front of the vectorizer.
vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))
clf = LinearSVC()
pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])
# data
train1 = train['Title'].tolist()
labelsTrain1 = train['Conference'].tolist()
test1 = test['Title'].tolist()
labelsTest1 = test['Conference'].tolist()
# train
pipe.fit(train1, labelsTrain1)
# test
preds = pipe.predict(test1)
print("accuracy:", accuracy_score(labelsTest1, preds))
print("Top 10 features used to predict: ")
printNMostInformative(vectorizer, clf, 10)
# Re-run only the cleaning and vectorizing steps to inspect the per-document
# term counts.
pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer)])
transform = pipe.fit_transform(train1, labelsTrain1)
# NOTE: newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out().
vocab = vectorizer.get_feature_names()
for i in range(len(train1)):
    s = ""
    # Row i of the matrix is read through its indptr/indices/data arrays
    # (presumably a SciPy CSR matrix, as CountVectorizer returns).
    indexIntoVocab = transform.indices[transform.indptr[i]:transform.indptr[i+1]]
    numOccurences = transform.data[transform.indptr[i]:transform.indptr[i+1]]
    # s accumulates "(term, count)" pairs for document i; it is not printed
    # or stored anywhere in the code shown here.
    for idx, num in zip(indexIntoVocab, numOccurences):
        s += str((vocab[idx], num))
Looks like it has something to do with the train1 data. Not sure how to fix this.
This is after cleaning the data and now trying to use this function to print out the most important features, i.e. the features that have the highest coefficients:
Upvotes: 0
Views: 2302
Reputation: 41
For those looking for more information: this is based on a tutorial, https://towardsdatascience.com/machine-learning-for-text-classification-using-spacy-in-python-b276b4051a49. I was also getting the same error:
This is related to the cleanText()
function, which did not return anything for the pipeline to work with, hence the 'NoneType' object error in the traceback:
def cleanText(text):
    # Faulty version, kept as quoted to illustrate the error: it cleans the
    # text but never hands the result back to the caller.
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = text.lower()
    # BUG: no `return text` here, so every call evaluates to None and the
    # vectorizer later fails with "'NoneType' object has no attribute 'lower'".
If you add `return text`
at the end of the function, it should fix your error:
def cleanText(text):
    """Return *text* stripped, with line breaks turned into spaces, lowercased.

    Leading and trailing whitespace is removed first; each remaining "\\n"
    and "\\r" is then replaced by a single space before lowercasing.
    """
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = text.lower()
    # Returning the cleaned string is what lets later pipeline steps use it.
    return text
Upvotes: 3