Reputation: 67
I'm building a text parser to identify the type of crime described in each text. My class was built to load the texts from 2 CSV files (one file for training and one for testing). The methods in my class pre-process the texts, remove the stop words, extract the feature vector, among other things. The code is below.
import re
import codecs
import csv
import nltk
import sklearn
from sklearn import cross_validation
import pandas as pd
# variaveis
tweets = []
caracteristicas = []
testBase = []
testset = []
# Tweet pre-processing
def preProcessamentoText(tweet):
# converte para minusculas
tweet = tweet.lower()
# remove URLs (www.* ou https?://*)
tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
# remove @username
tweet = re.sub('@[^\s]+','AT_USER',tweet)
# remove multiplos espacos em brancos
tweet = re.sub('[\s]+', ' ', tweet)
# substitui #work por work
tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
# trim
tweet = tweet.strip('\'"')
return tweet
#end
# list of stopWords
def getStopWords(stopWordListFileName):
stopWords = []
stopWords = nltk.corpus.stopwords.words('portuguese')
stopWords.append('AT_USER')
stopWords.append('URL')
fp = codecs.open(stopWordListFileName, encoding='utf-8')
line = fp.readline()
while line:
word = line.strip()
stopWords.append(word)
line = fp.readline()
fp.close()
return stopWords
#end
# Remove repeat letters. Ex.: leeeeento = lento
def removeRepeticao(s):
pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
return pattern.sub(r"\1\1", s)
#end
# Feature vector
def getVetorCaracteristicas(tweet):
featureVector = []
stopWords = getStopWords('data/stopwords_pt-BR.txt')
words = tweet.split()
for w in words:
# remove letras repetidas
w = removeRepeticao(w)
# remove sinais de pontuacao
w = w.strip('\'"?,.')
# verifica se a palavra inicia com numero
val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
# não adiciona se a palavra já existe na lista
# ou se a palavra começa com número
# ou tem tamanha menos que 2
if(w in stopWords or val is None or len(w) <= 2):
continue
else:
featureVector.append(w.lower())
return featureVector
#end
#load trainset
def carregarTextos():
global caracteristicas
inpTexts = csv.reader(open('data/baseTreino.csv', 'rb'), delimiter=',', quotechar='|')
for row in inpTexts:
#print row
sentimento = row[0]
tweet = row[1]
textoProcessado = preProcessamentoText(tweet)
vetorCaracteristicas = getVetorCaracteristicas(textoProcessado)
caracteristicas.extend(vetorCaracteristicas)
tweets.append((vetorCaracteristicas,sentimento))
#print tweets
#end loop
# remove entradas duplicadas
caracteristicas = list(set(caracteristicas))
#load testSet
def test_set():
global testBase
#Lendo o conjunto de testes
testTexts = csv.reader(open('data/baseTestes.csv', 'rb'), delimiter=',', quotechar='|')
for row in testTexts:
#print row
sentimento = row[0]
tweet = row[1]
textoProcessado = preProcessamentoText(tweet)
vetorCaracteristicas = getVetorCaracteristicas(textoProcessado)
testBase.extend(vetorCaracteristicas)
testset.append((vetorCaracteristicas,sentimento))
#print testset
testBase = list(set(testBase))
#Extraction of characteristics
def extracaoCaracteristicas(tweet):
#print tweet
palavras = set(tweet)
lista = {}
for palavra in caracteristicas:
lista['contains(%s)' % palavra] = (palavra in palavras)
#end loop
return lista
#Method to classify the text according to the feeling
def classificaTexto(tweet):
textoProcessado = preProcessamentoText(tweet)
result = NBClassifier.classify(extracaoCaracteristicas(getVetorCaracteristicas(textoProcessado)))
#print result
if (result == 4) :
print 'Crime não categorizado - ' + tweet
elif (result == 1):
print 'Roubo - ' + tweet
elif(result == 2):
print 'Homicídio - ' + tweet
elif(result== 3):
print 'Tráfico - ' + tweet
else :
print 'Não representa um crime - ' + tweet
# Main function
if __name__ == '__main__':
#load the 2 set (train and test)
carregarTextos()
test_set()
# Extract the feature vector of all tweets in one go
conjuntoTreino = nltk.classify.util.apply_features(extracaoCaracteristicas, tweets)
conjuntoTeste = nltk.classify.util.apply_features(extracaoCaracteristicas,testset)
# Train the classifier
#NBClassifier = nltk.NaiveBayesClassifier.train(conjuntoTreino)
#print 'accuracy:', (nltk.classify.util.accuracy(NBClassifier, conjuntoTeste))
#CrossValidation - Using ScikitLearn and NLTK
cv = cross_validation.KFold(len(conjuntoTreino), n_folds=10, shuffle=False, random_state=None)
for traincv, testcv in cv:
classifier = nltk.NaiveBayesClassifier.train(conjuntoTreino[traincv[0]:traincv[len(traincv)-1]])
print 'accuracy:', nltk.classify.util.accuracy(classifier, conjuntoTreino[testcv[0]:testcv[len(testcv)-1]])
In main I ran the plain Naive Bayes classifier and checked its accuracy, and then Naive Bayes with cross-validation and checked its accuracy. Now I want to apply the already-trained Naive Bayes classifier to the CSV containing the test texts — that is, run the classification on the test set.
My method def classificaTexto(tweet):
is meant to do exactly that job, but I'm not able to use it with the already-trained classifier. If I create a text such as
texto1 = 'Enviado por um seguidor: Carro roubado no conjunto Augusto Franco'
classificaTexto(texto1)
the method should do its job and classify the text.
Additional Information:
My csv are in this formed. An example:
The number before the text represents the crime type. It was formatted this way so that the method def classificaTexto(tweet): could be used.
|1|,|Enviado por um seguidor :Exclusivo.Bom dia.2 caras vestidos de palhaços ontem a noite roubaram as armas dos guardas municipais que faziam a segurança do posto médico aqui no bairro Coroa do Meio!! Polícia nas ruas a procura dos marginais !!! Surreal isso...|,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|2|,|Enviado por um seguidor :Segundo informações acaba de acontecer um homicídio na cidade de Malhador no povoado Boqueval \,vítima de pré nome Ronaldo.|,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Upvotes: 2
Views: 512
Reputation: 3842
You just need to call classify()
method from the same object that called train()
. One way to do it is by passing the object as method's argument:
#Method to classify the text according to the feeling
def classificaTexto(nbc, tweet):
textoProcessado = preProcessamentoText(tweet)
result = nbc.classify(extracaoCaracteristicas(getVetorCaracteristicas(textoProcessado)))
#print result
if (result == 4) :
print 'Crime não categorizado - ' + tweet
elif (result == 1):
print 'Roubo - ' + tweet
elif(result == 2):
print 'Homicídio - ' + tweet
elif(result== 3):
print 'Tráfico - ' + tweet
else :
print 'Não representa um crime - ' + tweet
then you should be able to use it like this:
# Main function
if __name__ == '__main__':
#load the 2 set (train and test)
carregarTextos()
test_set()
# Extract the feature vector of all tweets in one go
conjuntoTreino = nltk.classify.util.apply_features(extracaoCaracteristicas, tweets)
# Train the classifier
NBClassifier = nltk.NaiveBayesClassifier.train(conjuntoTreino)
# Classify tweet
texto1 = 'Enviado por um seguidor: Carro roubado no conjunto Augusto Franco'
classificaTexto(NBClassifier, texto1)
UPDATE
If you want to classify on the output of nltk.classify.util.apply_features()
, you can slightly modify classificaTexto()
:
def classificaTexto(nbc, data):
for features in data:
result = nbc.classify(features)
#print result
if (result == 4) :
print 'Crime não categorizado - ' + tweet
elif (result == 1):
print 'Roubo - ' + tweet
elif(result == 2):
print 'Homicídio - ' + tweet
elif(result== 3):
print 'Tráfico - ' + tweet
else :
print 'Não representa um crime - ' + tweet
and use it like this:
# Main function
if __name__ == '__main__':
#load the 2 set (train and test)
carregarTextos()
test_set()
# Extract the feature vector of all tweets in one go
conjuntoTreino = nltk.classify.util.apply_features(extracaoCaracteristicas, tweets)
conjuntoTeste = nltk.classify.util.apply_features(extracaoCaracteristicas,testset)
# Train the classifier
NBClassifier = nltk.NaiveBayesClassifier.train(conjuntoTreino)
# Classify testset
classificaTexto(NBClassifier, conjuntoTeste)
you can also use
results = nbc.classify_many(data)
if you wish to immediately store the results in a list
Upvotes: 1