user3314418

Reputation: 3041

Classification Based Chunking - NLTK Cookbook - Evaluate() not working

I've been following the NLTK cookbook for classification-based chunking, and I ran into the following error when trying to evaluate my classifier.

All the code that leads up to this error is posted below the traceback.

---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
<ipython-input-64-201b22386c9f> in <module>()
      1 chunker = ClassifierChunker(train_chunks)
----> 2 score = chunker.evaluate(test_chunks)
      3 score.accuracy()

//anaconda/lib/python2.7/site-packages/nltk/chunk/api.pyc in evaluate(self, gold)
     47         chunkscore = ChunkScore()
     48         for correct in gold:
---> 49             chunkscore.score(correct, self.parse(correct.leaves()))
     50         return chunkscore
     51 

//anaconda/lib/python2.7/site-packages/nltk/chunk/api.pyc in parse(self, tokens)
     32         :rtype: Tree
     33         """
---> 34         raise NotImplementedError()
     35 
     36     def evaluate(self, gold):

NotImplementedError: 


#from chunkers import TagChunker
from nltk.corpus import treebank_chunk
train_chunks = treebank_chunk.chunked_sents()[:3000]
test_chunks = treebank_chunk.chunked_sents()[3000:]

import nltk.chunk
from nltk.tag import ClassifierBasedTagger

def chunk_trees2train_chunks(chunk_sents):
    tag_sents = [nltk.chunk.tree2conlltags(sent) for sent in chunk_sents]
    return [[((w,t),c) for (w,t,c) in sent] for sent in tag_sents]

def prev_next_pos_iob(tokens, index, history):
    word, pos = tokens[index]
    if index == 0:
        prevword, prevpos, previob = ('<START>',)*3
    else:
        prevword, prevpos = tokens[index-1]
        previob = history[index-1]
    if index == len(tokens) - 1:
        nextword, nextpos = ('<END>',)*2
    else:
        nextword, nextpos = tokens[index+1]

    feats = {
    'word': word,
    'pos': pos,
    'nextword': nextword,
    'nextpos': nextpos,
    'prevword': prevword,
    'prevpos': prevpos,
    'previob': previob
    }

    return feats

class ClassifierChunker(nltk.chunk.ChunkParserI):
    def __init__(self, train_sents, feature_detector=prev_next_pos_iob,
    **kwargs):
        if not feature_detector:
            feature_detector = self.feature_detector

        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                        feature_detector=feature_detector, **kwargs)


    def parse(self, tagged_sent):
       if not tagged_sent: return None
       chunks = self.tagger.tag(tagged_sent)
       return nltk.chunk.conlltags2tree([(w,t,c) for ((w,t),c) in
       chunks])


#the following is copy/pasted from chunkers.py 

import nltk.tag
from nltk.chunk import ChunkParserI
from nltk.chunk.util import conlltags2tree, tree2conlltags
from nltk.tag import UnigramTagger, BigramTagger, ClassifierBasedTagger
#from .transforms import node_label

#####################
## tree conversion ##
#####################

def chunk_trees2train_chunks(chunk_sents):
    tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
    return [[((w,t),c) for (w,t,c) in sent] for sent in tag_sents]

def conll_tag_chunks(chunk_sents):
    '''Convert each chunked sentence to list of (tag, chunk_tag) tuples,
    so the final result is a list of lists of (tag, chunk_tag) tuples.
    >>> from nltk.tree import Tree
    >>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])])
    >>> conll_tag_chunks([t])
    [[('DT', 'B-NP'), ('NN', 'I-NP')]]
    '''
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]

def ieertree2conlltags(tree, tag=nltk.tag.pos_tag):
    # tree.pos() flattens the tree and produces [(word, label)] where label is
    # from the word's parent tree label. words in a chunk therefore get the
    # chunk tag, while words outside a chunk get the same tag as the tree's
    # top label
    words, ents = zip(*tree.pos())
    iobs = []
    prev = None
    # construct iob tags from entity names
    for ent in ents:
        # any entity that is the same as the tree's top label is outside a chunk
        if ent == node_label(tree):
            iobs.append('O')
            prev = None
        # have a previous entity that is equal so this is inside the chunk
        elif prev == ent:
            iobs.append('I-%s' % ent)
        # no previous equal entity in the sequence, so this is the beginning of
        # an entity chunk
        else:
            iobs.append('B-%s' % ent)
            prev = ent
    # get tags for each word, then construct 3-tuple for conll tags
    words, tags = zip(*tag(words))
    return zip(words, tags, iobs)

#################
## tag chunker ##
#################

class TagChunker(ChunkParserI):
    '''Chunks tagged tokens using Ngram Tagging.'''
    def __init__(self, train_chunks, tagger_classes=[UnigramTagger, BigramTagger]):
        '''Train Ngram taggers on chunked sentences'''
        train_sents = conll_tag_chunks(train_chunks)
        self.tagger = None

        for cls in tagger_classes:
            self.tagger = cls(train_sents, backoff=self.tagger)

    def parse(self, tagged_sent):
        '''Parse tagged tokens into a parse Tree of chunks'''
        if not tagged_sent: return None
        (words, tags) = zip(*tagged_sent)
        chunks = self.tagger.tag(tags)
        # create conll str for tree parsing
        return conlltags2tree([(w,t,c) for (w,(t,c)) in zip(words, chunks)])

########################
## classifier chunker ##
########################

def prev_next_pos_iob(tokens, index, history):
    word, pos = tokens[index]

    if index == 0:
        prevword, prevpos, previob = ('<START>',)*3
    else:
        prevword, prevpos = tokens[index-1]
        previob = history[index-1]

    if index == len(tokens) - 1:
        nextword, nextpos = ('<END>',)*2
    else:
        nextword, nextpos = tokens[index+1]

    feats = {
        'word': word,
        'pos': pos,
        'nextword': nextword,
        'nextpos': nextpos,
        'prevword': prevword,
        'prevpos': prevpos,
        'previob': previob
    }

    return feats

class ClassifierChunker(ChunkParserI):
    def __init__(self, train_sents, feature_detector=prev_next_pos_iob, **kwargs):
        if not feature_detector:
            feature_detector = self.feature_detector

        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
            feature_detector=feature_detector, **kwargs)

    def parse(self, tagged_sent):
        if not tagged_sent: return None
        chunks = self.tagger.tag(tagged_sent)
        return conlltags2tree([(w,t,c) for ((w,t),c) in chunks])

#############
## pattern ##
#############

class PatternChunker(ChunkParserI):
    def parse(self, tagged_sent):
        # don't import at top since don't want to fail if not installed
        from pattern.en import parse
        s = ' '.join([word for word, tag in tagged_sent])
        # not tokenizing ensures that the number of tagged tokens returned is
        # the same as the number of input tokens
        sents = parse(s, tokenize=False).split()
        if not sents: return None
        return conlltags2tree([(w, t, c) for w, t, c, p in sents[0]])

Upvotes: 0

Views: 1165

Answers (1)

Padraic Cunningham

Reputation: 180471

You are meant to define a parse method yourself; you can see in the source that it is not implemented:

class ChunkParserI(ParserI):
    """
    A processing interface for identifying non-overlapping groups in
    unrestricted text.  Typically, chunk parsers are used to find base
    syntactic constituents, such as base noun phrases.  Unlike
    ``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method
    will always generate a parse.
    """
    def parse(self, tokens):
        """
        Return the best chunk structure for the given tokens
        and return a tree.

        :param tokens: The list of (word, tag) tokens to be chunked.
        :type tokens: list(tuple)
        :rtype: Tree
        """
        raise NotImplementedError()

You actually have one defined; I think your indentation is the issue:

class ClassifierChunker(nltk.chunk.ChunkParserI):
    def __init__(self, train_sents, feature_detector=prev_next_pos_iob,
    **kwargs):
        if not feature_detector:
            feature_detector = self.feature_detector

        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                        feature_detector=feature_detector, **kwargs)


    def parse(self, tagged_sent):  # indent inside the class
        if not tagged_sent: return None
        chunks = self.tagger.tag(tagged_sent)
        return nltk.chunk.conlltags2tree([(w,t,c) for ((w,t),c) in
        chunks])

You do not have it inside the class, though, so as far as nltk.chunk.ChunkParserI is concerned you have no parse method implemented.
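
To see why the base class's version ends up being called, here is a stripped-down illustration (hypothetical names, not the NLTK classes themselves):

class Base(object):
    def parse(self, tokens):
        raise NotImplementedError()

class Broken(Base):
    pass

# Defined at module level, not indented under Broken, so the class
# never overrides Base.parse.
def parse(self, tokens):
    return tokens

Broken().parse([])  # raises NotImplementedError, just like evaluate() does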

There is no method nltk.chunk.conlltags2tree; it is in nltk.chunk.util:

return nltk.chunk.util.conlltags2tree([(w,t,c) for ((w,t),c) in chunks])
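
Once parse is indented inside the class and the helper is imported from the right module, the evaluation from the question should run. A minimal sketch, assuming the corrected ClassifierChunker above and the same treebank_chunk split as in the question:

from nltk.corpus import treebank_chunk
from nltk.chunk.util import conlltags2tree  # or call nltk.chunk.util.conlltags2tree directly

train_chunks = treebank_chunk.chunked_sents()[:3000]
test_chunks = treebank_chunk.chunked_sents()[3000:]

chunker = ClassifierChunker(train_chunks)  # trains the ClassifierBasedTagger
score = chunker.evaluate(test_chunks)      # evaluate() now finds the overridden parse()
print(score.accuracy())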

Upvotes: 3
