Reputation: 187
I need to build a list of dictionaries from the example NLTK tree below:
(S
I/PRP
'll/MD
have/VB
(amount 1/CD)
(plate pizza/NN)
and/CC
(amount 4/CD)
(plate sandwiches/NNS))
The desired output is as follows:
[{amount: 1, plate: pizza}, {amount: 4, plate: sandwiches}]
I have tried the code below, but I only get a list containing a single dictionary: [{amount: 4, plate: sandwiches}]. It looks like the list does not append new entries; it only updates the same dictionary.
import nltk
from nltk.chunk import *
from nltk.chunk.util import *
from nltk.chunk.regexp import *
from nltk import Tree
# NOTE(review): the tagger is trained on an empty list here; presumably the
# real training data was omitted from the post -- confirm.
training = []
hmm_tagger = HiddenMarkovModelTagger.train(training)
sentence = "I'll have 1 pizza and 4 sandwiches"
# Chunk grammar: "plate" wraps one NN/NNS token, "amount" wraps one CD/DT token.
gram = r"""
plate: {<NN|NNS>}
amount: {<CD|DT>}
"""
cp = nltk.RegexpParser(gram)
# NOTE(review): `sentence` is a single str, so this loop visits it one
# character at a time. The snippet's indentation was lost, so the extent of
# the loop body cannot be determined from here.
for sent in sentence:
tokens = nltk.word_tokenize(sent)
taggex = hmm_tagger.tag(tokens)
treee = cp.parse(taggex)
# Judging by the helper names, the chunk tree is converted to CoNLL-style
# tags and then back into a tree -- TODO confirm why the round trip is needed.
iob_ts = tree2conlltags(treee)
tree = conlltags2tree(iob_ts)
# NOTE(review): indentation was lost when this snippet was pasted, so the
# intended nesting of the statements below is unclear.
def conversion(tree):
dlist = []
for leaf in tree:
if type(leaf) == tuple:
# NOTE(review): this inner loop walks the same `tree` again and reuses the
# name `leaf` of the outer loop.
for leaf in tree:
key = leaf.label()
value = leaf[0][0]
# NOTE(review): this rebinds `dlist` to a brand-new list instead of appending
# to it, and every dict in that list is built from the same `key`/`value`.
# If both are strings, zip() pairs up their characters rather than mapping
# the label to the word.
dlist =[dict(zip(key, value)) for leaf in tree]
return dlist
Upvotes: 0
Views: 337
Reputation: 2126
The main issue here is that you are not appending to the list on each loop iteration, neither inside the conversion
function nor in the code around it.
from nltk.chunk.regexp import RegexpParser
from nltk import Tree, pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
# Chunk grammar: each rule names a chunk label and the POS-tag pattern it wraps.
# "plate" chunks a single NN or NNS token; "amount" chunks a single CD or DT token.
gram = r"""
plate: {<NN|NNS>}
amount: {<CD|DT>}
"""
cp = RegexpParser(gram)
# Sample text that is split into sentences, tagged and chunked below.
text = "I'll have 1 pizza and 4 sandwiches"
def conversion(tree):
    """Group runs of adjacent chunk subtrees into a list of dictionaries.

    Walks the direct children of ``tree`` in order. Every child that is a
    ``Tree`` adds one entry to the dictionary currently being built: the
    subtree's label mapped to the first element of each of its leaves,
    joined with single spaces (assuming leaves are ``(word, tag)`` pairs,
    that is the chunk's words). Every child that is not a ``Tree`` ends the
    current run: the dictionary is appended to the result if it holds
    anything, and a fresh one is started.

    A label that occurs more than once within the same run overwrites the
    earlier value for that label.

    Args:
        tree: An iterable of children, typically a chunked sentence tree.

    Returns:
        A list of dictionaries, one per non-empty run of adjacent subtrees,
        in the order the runs appear in ``tree``.
    """
    dlist = []
    current = {}
    for item in tree:
        if isinstance(item, Tree):
            current[item.label()] = ' '.join(leaf[0] for leaf in item.leaves())
        elif current:
            # A plain token closes the run of chunks collected so far.
            dlist.append(current)
            current = {}
    # Flush the last run when the sentence ends with a chunk.
    if current:
        dlist.append(current)
    return dlist
# Split the text into sentences, then tokenize, POS-tag and chunk each one.
parsed_text = [
    cp.parse(pos_tag(word_tokenize(sent)))
    for sent in sent_tokenize(text)
]

# Print the list of dictionaries extracted from every chunked sentence.
for tree in parsed_text:
    print(conversion(tree))
#[{'amount': '1', 'plate': 'pizza'}, {'amount': '4', 'plate': 'sandwiches'}]
Upvotes: 2