codeforfun

Reputation: 187

Converting NLTK chunks to a list of dictionaries

I need to get a list of dictionaries from the example NLTK tree below:

   (S
      I/PRP
      'll/MD
      have/VB
      (amount 1/CD)
      (plate pizza/NN)
      and/CC
      (amount 4/CD)
      (plate sandwiches/NNS))

The desired output is as follows:

  [{amount: 1, plate: pizza}, {amount: 4, plate: sandwiches}]
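
For testing, the tree above can be rebuilt directly with Tree.fromstring. This is just a minimal sketch; the read_leaf argument is only there to split the word/TAG leaves back into (word, tag) tuples, the way the chunker returns them:

from nltk import Tree

tree = Tree.fromstring("""
(S I/PRP 'll/MD have/VB
   (amount 1/CD) (plate pizza/NN)
   and/CC
   (amount 4/CD) (plate sandwiches/NNS))""",
    read_leaf=lambda leaf: tuple(leaf.rsplit('/', 1)))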

I have tried the code below, but I only get a list with one dictionary: [{amount: 4, plate: sandwiches}]. It looks like the list does not append new entries; it only updates the same dictionary.

import nltk
from nltk.chunk import *
from nltk.chunk.util import *
from nltk.chunk.regexp import *
from nltk import Tree
from nltk.tag import HiddenMarkovModelTagger

training = []
hmm_tagger = HiddenMarkovModelTagger.train(training)
sentence = "I'll have 1 pizza and 4 sandwiches"
gram = r"""
plate:      {<NN|NNS>}                    
amount:    {<CD|DT>}
        """
cp = nltk.RegexpParser(gram)
for sent in sentence:
    tokens = nltk.word_tokenize(sent)
    taggex = hmm_tagger.tag(tokens)
treee = cp.parse(taggex)
iob_ts = tree2conlltags(treee)
tree = conlltags2tree(iob_ts)
def conversion(tree):
    dlist = []
    for leaf in tree:
        if type(leaf) == tuple:
            for leaf in tree:
                key = leaf.label()
                value = leaf[0][0]
                dlist =[dict(zip(key, value)) for leaf in tree]              
    return dlist

Upvotes: 0

Views: 337

Answers (1)

thorntonc

Reputation: 2126

The main issue here is that you never append to dlist after each loop iteration; the assignment inside the loop just rebuilds the list, so only the last group survives. The same applies outside the conversion function, where each sentence's parse overwrites treee instead of being collected.

from nltk.chunk.regexp import RegexpParser
from nltk import Tree, pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize

gram = r"""
plate:      {<NN|NNS>}                    
amount:    {<CD|DT>}
"""
cp = RegexpParser(gram)

text = "I'll have 1 pizza and 4 sandwiches"

def conversion(tree):
    dlist = []
    d = dict()
    for item in tree:
        if isinstance(item, Tree):
            # A chunk subtree: store its label (amount/plate) and its words.
            d[item.label()] = ' '.join(word for word, tag in item.leaves())
        else:
            # A plain (word, tag) token ends the current group.
            if d:
                dlist.append(d)
            d = dict()
    if d:
        dlist.append(d)
    return dlist

parsed_text = [cp.parse(pos_tag(word_tokenize(sent)))
               for sent in sent_tokenize(text)]

for tree in parsed_text:
    print(conversion(tree))
#[{'amount': '1', 'plate': 'pizza'}, {'amount': '4', 'plate': 'sandwiches'}]
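
Since you were already using tree2conlltags, the same grouping can also be built from the IOB triples it returns. This is a minimal sketch (conversion_iob is just an illustrative name); for this grammar and sentence it produces the same list:

from nltk.chunk.util import tree2conlltags

def conversion_iob(tree):
    dlist, d = [], {}
    for word, tag, iob in tree2conlltags(tree):
        if iob.startswith('B-'):
            d[iob[2:]] = word           # start of a new chunk, e.g. B-amount
        elif iob.startswith('I-'):
            d[iob[2:]] += ' ' + word    # continuation of the current chunk
        elif d:                         # an O token closes the current group
            dlist.append(d)
            d = {}
    if d:
        dlist.append(d)
    return dlist

for tree in parsed_text:
    print(conversion_iob(tree))
# same output: [{'amount': '1', 'plate': 'pizza'}, {'amount': '4', 'plate': 'sandwiches'}]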

Upvotes: 2
