Reputation: 608
I have taken this code from https://github.com/davidadamojr/TextRank and I am facing this problem. I tried to solve it by putting utf-8 in "keyphrases = decode('utf-8').extractKeyphrases(text)" but it failed.
Here is the code:
"""
From this paper: http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Mihalcea.pdf
External dependencies: nltk, numpy, networkx
Based on https://gist.github.com/voidfiles/1646117
"""
import nltk
import itertools
from operator import itemgetter
import networkx as nx
import sys
import os
#apply syntactic filters based on POS tags
def filter_for_tags(tagged, tags=['NN', 'JJ', 'NNP']):
    return [item for item in tagged if item[1] in tags]

def normalize(tagged):
    return [(item[0].replace('.', ''), item[1]) for item in tagged]
def unique_everseen(iterable, key=None):
    "List unique elements, preserving order. Remember all elements ever seen."
    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
    # unique_everseen('ABBCcAD', str.lower) --> A B C D
    seen = set()
    seen_add = seen.add
    if key is None:
        for element in itertools.ifilterfalse(seen.__contains__, iterable):
            seen_add(element)
            yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element
def lDistance(firstString, secondString):
    "Function to find the Levenshtein distance between two words/sentences - gotten from http://rosettacode.org/wiki/Levenshtein_distance#Python"
    if len(firstString) > len(secondString):
        firstString, secondString = secondString, firstString
    distances = range(len(firstString) + 1)
    for index2, char2 in enumerate(secondString):
        newDistances = [index2 + 1]
        for index1, char1 in enumerate(firstString):
            if char1 == char2:
                newDistances.append(distances[index1])
            else:
                newDistances.append(1 + min((distances[index1], distances[index1+1], newDistances[-1])))
        distances = newDistances
    return distances[-1]
def buildGraph(nodes):
    "nodes - list of hashables that represents the nodes of the graph"
    gr = nx.Graph()  #initialize an undirected graph
    gr.add_nodes_from(nodes)
    nodePairs = list(itertools.combinations(nodes, 2))

    #add edges to the graph (weighted by Levenshtein distance)
    for pair in nodePairs:
        firstString = pair[0]
        secondString = pair[1]
        levDistance = lDistance(firstString, secondString)
        gr.add_edge(firstString, secondString, weight=levDistance)

    return gr
def extractKeyphrases(text):
    #tokenize the text using nltk
    wordTokens = nltk.word_tokenize(text)

    #assign POS tags to the words in the text
    tagged = nltk.pos_tag(wordTokens)
    textlist = [x[0] for x in tagged]

    tagged = filter_for_tags(tagged)
    tagged = normalize(tagged)

    unique_word_set = unique_everseen([x[0] for x in tagged])
    word_set_list = list(unique_word_set)

    #this will be used to determine adjacent words in order to construct keyphrases with two words
    graph = buildGraph(word_set_list)

    #pageRank - initial value of 1.0, error tolerance of 0,0001,
    calculated_page_rank = nx.pagerank(graph, weight='weight')

    #most important words in ascending order of importance
    keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

    #the number of keyphrases returned will be relative to the size of the text (a third of the number of vertices)
    aThird = len(word_set_list) / 3
    keyphrases = keyphrases[0:aThird+1]

    #take keyphrases with multiple words into consideration as done in the paper -
    #if two words are adjacent in the text and are selected as keywords, join them together
    modifiedKeyphrases = set([])
    dealtWith = set([])  #keeps track of individual keywords that have been joined to form a keyphrase
    i = 0
    j = 1
    while j < len(textlist):
        firstWord = textlist[i]
        secondWord = textlist[j]
        if firstWord in keyphrases and secondWord in keyphrases:
            keyphrase = firstWord + ' ' + secondWord
            modifiedKeyphrases.add(keyphrase)
            dealtWith.add(firstWord)
            dealtWith.add(secondWord)
        else:
            if firstWord in keyphrases and firstWord not in dealtWith:
                modifiedKeyphrases.add(firstWord)
            #if this is the last word in the text, and it is a keyword,
            #it definitely has no chance of being a keyphrase at this point
            if j == len(textlist)-1 and secondWord in keyphrases and secondWord not in dealtWith:
                modifiedKeyphrases.add(secondWord)
        i = i + 1
        j = j + 1

    return modifiedKeyphrases
def extractSentences(text):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentenceTokens = sent_detector.tokenize(text.strip())
    graph = buildGraph(sentenceTokens)

    calculated_page_rank = nx.pagerank(graph, weight='weight')

    #most important sentences in ascending order of importance
    sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)

    #return a 100 word summary
    summary = ' '.join(sentences)
    summaryWords = summary.split()
    summaryWords = summaryWords[0:101]
    summary = ' '.join(summaryWords)

    return summary
def writeFiles(summary, keyphrases, fileName):
    "outputs the keyphrases and summaries to appropriate files"
    print "Generating output to " + 'keywords/' + fileName
    keyphraseFile = open('keywords/' + fileName, 'w')
    for keyphrase in keyphrases:
        keyphraseFile.write(keyphrase + '\n')
    keyphraseFile.close()

    print "Generating output to " + 'summaries/' + fileName
    summaryFile = open('summaries/' + fileName, 'w')
    summaryFile.write(summary)
    summaryFile.close()

    print "-"
#retrieve each of the articles
articles = os.listdir("articles")
for article in articles:
    print 'Reading articles/' + article
    articleFile = open('articles/' + article, 'r')
    text = articleFile.read()
    keyphrases = decode('utf-8').extractKeyphrases(text)
    summary = extractSentences(text)
    writeFiles(summary, keyphrases, article)
Here is the error:
Reading articles/1.txt
Traceback (most recent call last):
File "C:\Users\DELL\Desktop\python\s\fyp\Relevancy\test\TextRank-master\textrank.py", line 166, in <module>
keyphrases = extractKeyphrases(text).setdefaultencoding("utf-8")
File "C:\Users\DELL\Desktop\python\s\fyp\Relevancy\test\TextRank-master\textrank.py", line 72, in extractKeyphrases
wordTokens = nltk.word_tokenize(text)
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\__init__.py", line 93, in word_tokenize
return [token for sent in sent_tokenize(text)
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\__init__.py", line 82, in sent_tokenize
return tokenizer.tokenize(text)
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1270, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1318, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1309, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1348, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 354, in _pair_iter
prev = next(it)
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1324, in _slices_from_text
if self.text_contains_sentbreak(context):
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1369, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1504, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 354, in _pair_iter
prev = next(it)
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 621, in _annotate_first_pass
for aug_tok in tokens:
File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 586, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 4: ordinal not in range(128)
Any ideas? (Sorry for my bad English.)
Upvotes: 1
Views: 3414
Reputation: 2076
I think what you are looking for is:
# ...
text = articleFile.read().decode('utf-8')
keyphrases = extractKeyphrases(text)
# ...
Basically, you want to decode the contents of the file to a Unicode string as soon as you read it. Then the rest of your program is safe from conversion problems. Please also make sure the file is actually UTF-8 encoded. If you are unsure, try latin1 as the encoding, because it will never throw an exception while decoding (but it will, of course, produce wrong text when the file is not actually latin1 encoded).
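As an alternative, here is a minimal sketch of the reading loop (assuming Python 2.7 and UTF-8 encoded article files, using the same articles list as in your code) that lets io.open do the decoding for you instead of calling decode() yourself:

import io

for article in articles:
    print 'Reading articles/' + article
    # io.open with an explicit encoding returns unicode text,
    # so no manual .decode('utf-8') is needed afterwards
    with io.open('articles/' + article, 'r', encoding='utf-8') as articleFile:
        text = articleFile.read()
    keyphrases = extractKeyphrases(text)
    summary = extractSentences(text)
    writeFiles(summary, keyphrases, article)

Either way, the important part is that text is already a unicode object by the time it reaches nltk.word_tokenize.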
Upvotes: 3