Harnoor Singh
Harnoor Singh

Reputation: 1

Python3: Multi-label text classification with reuters 21578 data set

I am using the following code to classify documents into three categories: sports, politics and money. I can see that this code calculates precision, recall and F1, but I cannot find a way to use it to test against a custom document and predict its label.

from nltk.corpus import stopwords, reuters
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

cachedStopWords = stopwords.words("english")

# Built once at import time so tokenize() does not repeat this work for every
# document (or, for the stemmer, for every single token).
_STOP_WORDS = frozenset(cachedStopWords)
_STEMMER = PorterStemmer()
_STARTS_WITH_LETTERS = re.compile('[a-zA-Z]+')


def tokenize(text):
    """Turn raw text into a list of stemmed, filtered tokens.

    The text is split with NLTK's ``word_tokenize``, lower-cased, stripped of
    English stop words and Porter-stemmed.  A stem is kept only when it begins
    with an ASCII letter and is at least three characters long.

    Args:
        text: The raw document text.

    Returns:
        A list of stems, in the order the words appeared in ``text``.
    """
    min_length = 3
    lowered = (word.lower() for word in word_tokenize(text))
    stems = [_STEMMER.stem(word) for word in lowered if word not in _STOP_WORDS]
    # ``match`` only anchors at the start of the stem, so stems that begin
    # with letters but contain other characters later on are still kept.
    return [stem for stem in stems
            if _STARTS_WITH_LETTERS.match(stem) and len(stem) >= min_length]

def represent(documents, representer, label_binarizer=None):
    """Vectorise the Reuters train/test splits and binarise their labels.

    Args:
        documents: Reuters file ids.  Ids starting with "train" form the
            training split and ids starting with "test" form the test split.
        representer: Vectoriser exposing ``fit_transform`` and ``transform``.
            It is fitted on the training documents only.
        label_binarizer: Optional ``MultiLabelBinarizer`` to fit on the
            training labels.  Pass your own instance when you need the fitted
            binarizer afterwards, e.g. to call ``inverse_transform`` on
            predictions and get category names back.  When omitted, a fresh
            one is created and discarded, as before.

    Returns:
        A tuple ``(vectorised_train_documents, train_labels,
        vectorised_test_documents, test_labels)``.
    """
    train_docs_id = [doc for doc in documents if doc.startswith("train")]
    test_docs_id = [doc for doc in documents if doc.startswith("test")]

    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

    # Learn the vocabulary/weights from the training split only, then reuse
    # the fitted representer for the test split.
    vectorised_train_documents = representer.fit_transform(train_docs)
    vectorised_test_documents = representer.transform(test_docs)

    # Transform multilabel labels; the label set is learned from the training
    # split and reused for the test split.
    mlb = MultiLabelBinarizer() if label_binarizer is None else label_binarizer
    train_labels = mlb.fit_transform(
        [reuters.categories(doc_id) for doc_id in train_docs_id])
    test_labels = mlb.transform(
        [reuters.categories(doc_id) for doc_id in test_docs_id])

    return (vectorised_train_documents, train_labels,
            vectorised_test_documents, test_labels)

def evaluate(test_labels, predictions):
    """Print micro- and macro-averaged precision, recall and F1.

    Args:
        test_labels: Ground-truth label indicator matrix.
        predictions: Predicted label indicator matrix.
    """
    report_sections = (
        ('micro', "Micro-average quality numbers"),
        ('macro', "Macro-average quality numbers"),
    )
    for averaging, heading in report_sections:
        # Compute all three scores before printing anything for this section.
        scores = (
            precision_score(test_labels, predictions, average=averaging),
            recall_score(test_labels, predictions, average=averaging),
            f1_score(test_labels, predictions, average=averaging),
        )
        print(heading)
        print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(*scores))

# Every Reuters file id; represent() splits them into train/test by prefix.
documents = reuters.fileids()

# The text representation and the classifier being evaluated together.
candidate = dict(
    representer=TfidfVectorizer(tokenizer=tokenize),
    estimator=OneVsRestClassifier(LinearSVC(random_state=42)),
)
representer = candidate['representer']
estimator = candidate['estimator']

(train_docs, train_labels,
 test_docs, test_labels) = represent(documents, representer)

# Train on the training split, then score the predictions for the test split.
estimator.fit(train_docs, train_labels)
predictions = estimator.predict(test_docs)
evaluate(test_labels, predictions)

Credits: https://github.com/miguelmalvarez/reuters-tc/blob/master/notebook/Classification_Reuters.ipynb

Upvotes: 0

Views: 1641

Answers (1)

oldmonk
oldmonk

Reputation: 691

You can store your custom documents as text files in a folder, let's say `yourfoldername`. After that, you can use the code below to train on the Reuters data and predict labels for your text documents. `all_labels` will contain the predicted labels (as one tuple per document).

import os


classifier = OneVsRestClassifier(LinearSVC(random_state=42))
vectorizer = TfidfVectorizer(tokenizer=tokenize)

# LOAD AND TRANSFORM TRAINING DOCS
documents = reuters.fileids()
train_docs_id = [doc for doc in documents if doc.startswith("train")]

train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
vectorised_train_documents = vectorizer.fit_transform(train_docs)
# Keep the fitted binarizer: it is needed below to turn predicted indicator
# rows back into category names.
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(
    [reuters.categories(doc_id) for doc_id in train_docs_id])

# LEARN CLASSIFICATION MODEL
classifier = classifier.fit(vectorised_train_documents, train_labels)

# LOAD AND TRANSFORM TEST DOCS
# Only regular files are read (a sub-directory would make open() fail), and
# the names are sorted so that all_labels[i] reliably belongs to
# documents_yours[i] from run to run.
documents_yours = sorted(
    name for name in os.listdir('yourfoldername')
    if os.path.isfile(os.path.join('yourfoldername', name))
)
test_docs_yours = []
for doc_id in documents_yours:
    # The context manager closes each file as soon as it has been read.
    with open(os.path.join('yourfoldername', doc_id)) as doc_file:
        test_docs_yours.append(doc_file.read())
# Reuse the vectorizer fitted on the training data; do not refit it here.
vectorised_test_documents_yours = vectorizer.transform(test_docs_yours)

# MAKE PREDICTIONS
predictions_yours = classifier.predict(vectorised_test_documents_yours)
all_labels = mlb.inverse_transform(predictions_yours)
# Bare expression: shows the result when run as the last line of a notebook
# cell; in a plain script use print(all_labels) instead.
all_labels

Upvotes: 0

Related Questions