Reputation: 159
I'm new to Python and need help with NLTK language modeling.
I'm trying to generate a sentence starting with "he said" using a trigram model, but I get the following error:
Traceback (most recent call last):
File "C:\Users\\PycharmProjects\main.py", line 72, in <module>
d[a, b] += freq_tri[a, b, c]
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2288.0_x64__qbz5n2kfra8p0\lib\collections\__init__.py", line 904, in __iadd__
for elem, count in other.items():
AttributeError: 'int' object has no attribute 'items'
The entire code is here:
# imports
import string
import random
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('reuters')
from nltk.corpus import reuters, stopwords
from collections import defaultdict, Counter
from nltk import FreqDist, ngrams
# Load the Reuters corpus as sentences (each one is iterated word by word below).
sents = reuters.sents()
# Collect everything to filter out: English stopwords, punctuation characters
# (the stdlib set extended with a few extra symbols) and the tokens 'lt' / 'rt'.
stop_words = set(stopwords.words('english'))
string.punctuation += '"' + '"' + '-' + '+' + '—'
removal_list = [*stop_words, *string.punctuation, 'lt', 'rt']
# generate unigrams bigrams trigrams
unigram = []
bigram = []
trigram = []
fourgram = []
tokenized_text = []
for sentence in sents:
    # Lower-case every token and drop full stops in one pass.  Building a new
    # list avoids calling list.remove() on the list being iterated, which made
    # the loop skip the token that followed each removed '.'.
    sentence = [word.lower() for word in sentence if word != '.']
    unigram.extend(sentence)
    tokenized_text.append(sentence)
    # ngrams() returns an iterator; extend() consumes it directly, so no
    # intermediate list is needed.
    bigram.extend(ngrams(sentence, 2, pad_left=True, pad_right=True))
    trigram.extend(ngrams(sentence, 3, pad_left=True, pad_right=True))
    fourgram.extend(ngrams(sentence, 4, pad_left=True, pad_right=True))
# remove the n-grams with removable words
def remove_stopwords(x, removal=None):
    """Keep only the items that contain at least one non-removable token.

    Args:
        x: Iterable of n-grams (tuples of tokens) or of bare string tokens.
        removal: Optional iterable of tokens to treat as removable.  When
            omitted, the module-level ``removal_list`` is used.

    Returns:
        A list holding, in their original order, the items of ``x`` that have
        at least one token outside the removal collection.  A token of
        ``None`` is never in the removal collection, so it counts as
        non-removable.
    """
    if removal is None:
        removal = removal_list
    # Hash the removable tokens once; testing membership against a list is a
    # linear scan repeated for every token of every n-gram.
    removable = frozenset(removal)
    kept = []
    for item in x:
        # A bare string is a single token.  Iterating it would test its
        # individual characters instead of the word itself.
        tokens = (item,) if isinstance(item, str) else item
        if any(token not in removable for token in tokens):
            kept.append(item)
    return kept
# Drop the n-grams made up solely of removable tokens, one n-gram order at a time.
unigram, bigram, trigram, fourgram = (
    remove_stopwords(grams) for grams in (unigram, bigram, trigram, fourgram)
)
# generate frequency of n-grams
freq_bi, freq_tri, freq_four = map(FreqDist, (bigram, trigram, fourgram))
# Presumably meant to map each (a, b) word pair to a Counter of the words
# that follow it; a missing pair starts out as an empty Counter.
d = defaultdict(Counter)
# Iterating freq_tri yields its keys, i.e. the (a, b, c) trigram tuples.
for a, b, c in freq_tri:
# Skip trigrams whose first or second slot is None (presumably ngrams padding).
if (a != None and b != None):
# NOTE(review): d[a, b] is a Counter and freq_tri[a, b, c] is an int, so `+=`
# calls Counter.__iadd__, which expects a mapping and fails on other.items().
# Presumably the count belongs under the third word: d[a, b][c] += ...
d[a, b] += freq_tri[a, b, c]
# Next word prediction
s = ''
def pick_word(counter):
    """Return one element of ``counter``, drawn with probability proportional to its count."""
    # elements() repeats every key as many times as its count, so a uniform
    # draw from the expanded list is a count-weighted draw over the keys.
    expanded = list(counter.elements())
    return random.choice(expanded)
# Generate up to 19 more words after the seed, printing the sentence as it grows.
# d is expected to map a (word, word) prefix to a Counter of following words.
prefix = "he", "said"
print(" ".join(prefix))
s = " ".join(prefix)
for _ in range(19):
    # .get() avoids inserting an empty Counter into the defaultdict for
    # prefixes that were never observed.
    candidates = d.get(prefix)
    if not candidates:
        # No known continuation: random.choice would raise IndexError.
        break
    suffix = pick_word(candidates)
    if suffix is None:
        # None is the right-padding marker, i.e. the end of a sentence; it
        # cannot be concatenated to the string, so stop here.
        break
    s = s + ' ' + suffix
    print(s)
    # Slide the two-word window forward by one word.
    prefix = prefix[1], suffix
I tried changing the for loop to the following:
# NOTE(review): .items() yields (key, count) pairs, so each item is a 2-tuple
# ((a, b, c), count); unpacking it into three names cannot succeed.
for a, b, c in freq_tri.items():
if (a != None and b != None):
d[a, b] += freq_tri[a, b, c]
I expected this to give me the items from freq_tri, but instead I got the following error:
Traceback (most recent call last):
File "C:\Users\PycharmProjects\main.py", line 70, in <module>
for a, b, c in freq_tri.items():
ValueError: not enough values to unpack (expected 3, got 2)
To be honest, I don't quite understand why it returns 2 values when using items().
Please point me in the right direction; I don't know what I'm missing.
Upvotes: 0
Views: 443