Reputation: 3041
I've been following the NLTK cookbook for classification based chunking, and I came to the following error when trying to evaluate my classifier.
all the code that leads up to this error is posted below the traceback
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
<ipython-input-64-201b22386c9f> in <module>()
1 chunker = ClassifierChunker(train_chunks)
----> 2 score = chunker.evaluate(test_chunks)
3 score.accuracy()
//anaconda/lib/python2.7/site-packages/nltk/chunk/api.pyc in evaluate(self, gold)
47 chunkscore = ChunkScore()
48 for correct in gold:
---> 49 chunkscore.score(correct, self.parse(correct.leaves()))
50 return chunkscore
51
//anaconda/lib/python2.7/site-packages/nltk/chunk/api.pyc in parse(self, tokens)
32 :rtype: Tree
33 """
---> 34 raise NotImplementedError()
35
36 def evaluate(self, gold):
NotImplementedError:
#from chunkers import TagChunker
from nltk.corpus import treebank_chunk
train_chunks = treebank_chunk.chunked_sents()[:3000]
test_chunks = treebank_chunk.chunked_sents()[3000:]
import nltk.chunk
from nltk.tag import ClassifierBasedTagger
def chunk_trees2train_chunks(chunk_sents):
tag_sents = [nltk.chunk.tree2conlltags(sent) for sent in chunk_sents]
return [[((w,t),c) for (w,t,c) in sent] for sent in tag_sents]
def prev_next_pos_iob(tokens, index, history):
word, pos = tokens[index]
if index == 0:
prevword, prevpos, previob = ('<START>',)*3
else:
prevword, prevpos = tokens[index-1]
previob = history[index-1]
if index == len(tokens) - 1:
nextword, nextpos = ('<END>',)*2
else:
nextword, nextpos = tokens[index+1]
feats = {
'word': word,
'pos': pos,
'nextword': nextword,
'nextpos': nextpos,
'prevword': prevword,
'prevpos': prevpos,
'previob': previob
}
return feats
class ClassifierChunker(nltk.chunk.ChunkParserI):
def __init__(self, train_sents, feature_detector=prev_next_pos_iob,
**kwargs):
if not feature_detector:
feature_detector = self.feature_detector
train_chunks = chunk_trees2train_chunks(train_sents)
self.tagger = ClassifierBasedTagger(train=train_chunks,
feature_detector=feature_detector, **kwargs)
def parse(self, tagged_sent):
if not tagged_sent: return None
chunks = self.tagger.tag(tagged_sent)
return nltk.chunk.conlltags2tree([(w,t,c) for ((w,t),c) in
chunks])
#the following is copy/pasted from chunkers.py
import nltk.tag
from nltk.chunk import ChunkParserI
from nltk.chunk.util import conlltags2tree, tree2conlltags
from nltk.tag import UnigramTagger, BigramTagger, ClassifierBasedTagger
#from .transforms import node_label
#####################
## tree conversion ##
#####################
def chunk_trees2train_chunks(chunk_sents):
tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
return [[((w,t),c) for (w,t,c) in sent] for sent in tag_sents]
def conll_tag_chunks(chunk_sents):
'''Convert each chunked sentence to list of (tag, chunk_tag) tuples,
so the final result is a list of lists of (tag, chunk_tag) tuples.
>>> from nltk.tree import Tree
>>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])])
>>> conll_tag_chunks([t])
[[('DT', 'B-NP'), ('NN', 'I-NP')]]
'''
tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
def ieertree2conlltags(tree, tag=nltk.tag.pos_tag):
# tree.pos() flattens the tree and produces [(word, label)] where label is
# from the word's parent tree label. words in a chunk therefore get the
# chunk tag, while words outside a chunk get the same tag as the tree's
# top label
words, ents = zip(*tree.pos())
iobs = []
prev = None
# construct iob tags from entity names
for ent in ents:
# any entity that is the same as the tree's top label is outside a chunk
if ent == node_label(tree):
iobs.append('O')
prev = None
# have a previous entity that is equal so this is inside the chunk
elif prev == ent:
iobs.append('I-%s' % ent)
# no previous equal entity in the sequence, so this is the beginning of
# an entity chunk
else:
iobs.append('B-%s' % ent)
prev = ent
# get tags for each word, then construct 3-tuple for conll tags
words, tags = zip(*tag(words))
return zip(words, tags, iobs)
#################
## tag chunker ##
#################
class TagChunker(ChunkParserI):
'''Chunks tagged tokens using Ngram Tagging.'''
def __init__(self, train_chunks, tagger_classes=[UnigramTagger, BigramTagger]):
'''Train Ngram taggers on chunked sentences'''
train_sents = conll_tag_chunks(train_chunks)
self.tagger = None
for cls in tagger_classes:
self.tagger = cls(train_sents, backoff=self.tagger)
def parse(self, tagged_sent):
'''Parsed tagged tokens into parse Tree of chunks'''
if not tagged_sent: return None
(words, tags) = zip(*tagged_sent)
chunks = self.tagger.tag(tags)
# create conll str for tree parsing
return conlltags2tree([(w,t,c) for (w,(t,c)) in zip(words, chunks)])
########################
## classifier chunker ##
########################
def prev_next_pos_iob(tokens, index, history):
word, pos = tokens[index]
if index == 0:
prevword, prevpos, previob = ('<START>',)*3
else:
prevword, prevpos = tokens[index-1]
previob = history[index-1]
if index == len(tokens) - 1:
nextword, nextpos = ('<END>',)*2
else:
nextword, nextpos = tokens[index+1]
feats = {
'word': word,
'pos': pos,
'nextword': nextword,
'nextpos': nextpos,
'prevword': prevword,
'prevpos': prevpos,
'previob': previob
}
return feats
class ClassifierChunker(ChunkParserI):
def __init__(self, train_sents, feature_detector=prev_next_pos_iob, **kwargs):
if not feature_detector:
feature_detector = self.feature_detector
train_chunks = chunk_trees2train_chunks(train_sents)
self.tagger = ClassifierBasedTagger(train=train_chunks,
feature_detector=feature_detector, **kwargs)
def parse(self, tagged_sent):
if not tagged_sent: return None
chunks = self.tagger.tag(tagged_sent)
return conlltags2tree([(w,t,c) for ((w,t),c) in chunks])
#############
## pattern ##
#############
class PatternChunker(ChunkParserI):
def parse(self, tagged_sent):
# don't import at top since don't want to fail if not installed
from pattern.en import parse
s = ' '.join([word for word, tag in tagged_sent])
# not tokenizing ensures that the number of tagged tokens returned is
# the same as the number of input tokens
sents = parse(s, tokenize=False).split()
if not sents: return None
return conlltags2tree([(w, t, c) for w, t, c, p in sents[0]])
Upvotes: 0
Views: 1165
Reputation: 180471
You are meant to define a parse method yourself, you can see in the source that it is not implemented:
class ChunkParserI(ParserI):
"""
A processing interface for identifying non-overlapping groups in
unrestricted text. Typically, chunk parsers are used to find base
syntactic constituents, such as base noun phrases. Unlike
``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method
will always generate a parse.
"""
def parse(self, tokens):
"""
Return the best chunk structure for the given tokens
and return a tree.
:param tokens: The list of (word, tag) tokens to be chunked.
:type tokens: list(tuple)
:rtype: Tree
"""
raise NotImplementedError()
You actually have one defined, I think your indentation is the issue:
class ClassifierChunker(nltk.chunk.ChunkParserI):
def __init__(self, train_sents, feature_detector=prev_next_pos_iob,
**kwargs):
if not feature_detector:
feature_detector = self.feature_detector
train_chunks = chunk_trees2train_chunks(train_sents)
self.tagger = ClassifierBasedTagger(train=train_chunks,
feature_detector=feature_detector, **kwargs)
def parse(self, tagged_sent): # indent inside the class
if not tagged_sent: return None
chunks = self.tagger.tag(tagged_sent)
return nltk.chunk.conlltags2tree([(w,t,c) for ((w,t),c) in
chunks])
You do not have it inside the class
though so as far as nltk.chunk.ChunkParserI
is concerned you have no parse
method implemented
There is no method nltk.chunk.conlltags2tree
it is in nltk.chunk.util
return nltk.chunk.util.conlltags2tree([(w,t,c) for ((w,t),c) in
chunks])
Upvotes: 3