How to get the most frequent word from a corpus?

I am working with corpora, and I want to get the most and least used word and word class from a corpus. I have the beginning of the code, but I get some errors I don't know how to deal with. I want to get the most frequent word out of the Brown corpus, and then the most and least used word classes. I have this code:

import re
import nltk
import string
from collections import Counter
from nltk.corpus import stopwords
from collections import defaultdict, Counter
from nltk.corpus import brown

brown = nltk.corpus.brown
stoplist = stopwords.words('english')

from collections import defaultdict

def toptenwords(brown):
    words = brown.words()
    no_capitals = ([word.lower() for word in words])
    filtered = [word for word in no_capitals if word not in stoplist]
    translate_table = dict((ord(char), None) for char in string.punctuation)
    no_punct = [s.translate(translate_table) for s in filtered]
    wordcounter = defaultdict(int)
    for word in no_punct:
        if word in wordcounter:
            wordcounter[word] += 1
        else:
            wordcounter[word] = 1
    sorting = [(k, wordcounter[k])for k in sorted(wordcounter, key = wordcounter.get, reverse = True)]
    return sorting

print(toptenwords(brown))

words_2 = [word[0] for word in brown.tagged_words(categories="news")]
# the most frequent words
print Counter(words_2).most_common(10)

words_2 = [word[1] for word in brown.tagged_words(categories="news")]
# the most frequent word class
print Counter(words_2).most_common(10)


# Keeps words and pos into a dictionary
# where the key is a word and
# the value is a counter of POS and counts
word_tags = defaultdict(Counter)
for word, pos in brown.tagged_words():
    word_tags[word][pos] += 1

# To access the POS counter.
print 'Red', word_tags['Red']
print 'Marlowe', word_tags['Marlowe']
print

# Greatest number of distinct tag.
word_with_most_distinct_pos = sorted(word_tags, key=lambda x: len(word_tags[x]), reverse=True)[0]

print word_with_most_distinct_pos
print word_tags[word_with_most_distinct_pos]
print len(word_tags[word_with_most_distinct_pos])

# which word has the greatest number of distinct tags
word_tags_2 = nltk.defaultdict(lambda: set())
for word, token in tagged_words:
    word_tags[word].add(token)
    ambig_words = sorted([(k, len(v)) for (k, v) in word_tags.items()]),
    key=itemgetter(1), reverse=True)[:50]
  print [(word, numtoks, word_tags[word]) for (word, numtoks) in ambig_words]

When I run the code above, I get the error below:

File "Oblig2a.py", line 64
    key=itemgetter(1), reverse=True)[:50]
                               ^
SyntaxError: invalid syntax

From this code I would like to get:

  1. Most frequent word
  2. Most frequent word class
  3. Least frequent word class
  4. How many words with more than one word class
  5. Which word has the most tags, and how many distinct tags are there
  6. The last thing I need help with is to write a function that takes a specific word and reports how many times it appears with each of its tags. I am trying to do that above, but I can't get it to work...

It is numbers 3, 4, 5 and 6 I need help with... Any help would be most welcome.

Upvotes: 0

Views: 2722

Answers (1)

Jakub Macina

Reputation: 982

There are a few problems with the code:

  1. The error the interpreter is reporting: you should pass the name of the language to the stopwords function: stoplist = stopwords.words('english')
  2. Use the defaultdict's get method to sort the dict properly: [(k, wordcounter[k]) for k in sorted(wordcounter, key=wordcounter.get, reverse=True)]
  3. Use the translate table on Unicode data; see string.translate() with unicode data in python
  4. Brown tagged words are tuples in the format (word, part-of-speech); there is a short sketch of this right after the list
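
A minimal sketch of the last point, assuming the Brown corpus has already been downloaded with nltk.download('brown'):

import nltk
from collections import Counter

brown = nltk.corpus.brown

# Each element of tagged_words() is a (word, part-of-speech) tuple, e.g. ('The', 'AT')
pairs = brown.tagged_words(categories="news")
print(pairs[0])

# So index 0 selects the word and index 1 selects its tag
tag_counts = Counter(tag for word, tag in pairs)
print(tag_counts.most_common(5))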

Complete code:

import nltk
import string
from collections import Counter, defaultdict
from nltk.corpus import stopwords

brown = nltk.corpus.brown
stoplist = stopwords.words('english')

def toptenwords(brown):
    words = brown.words()
    # Lowercase every token; keep duplicates so the counts stay meaningful
    no_capitals = [word.lower() for word in words]
    filtered = [word for word in no_capitals if word not in stoplist]
    # Strip punctuation with a Unicode translate table
    translate_table = dict((ord(char), None) for char in string.punctuation)
    no_punct = [s.translate(translate_table) for s in filtered]
    wordcounter = defaultdict(int)
    for word in no_punct:
        if word:  # skip tokens that were nothing but punctuation
            wordcounter[word] += 1
    sorting = [(k, wordcounter[k]) for k in sorted(wordcounter, key=wordcounter.get, reverse=True)]
    return sorting


print(toptenwords(brown))

words_2 = [word[0] for word in brown.tagged_words(categories="news")]
# the most frequent words
print(Counter(words_2).most_common(10))

words_2 = [word[1] for word in brown.tagged_words(categories="news")]
# the most frequent word class
print(Counter(words_2).most_common(10))
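
That covers points 1 and 2 of the question. For points 3-6, a minimal sketch along the same lines, reusing the imports and the brown object from the code above (the helper name tag_counts_for is just illustrative):

tagged = brown.tagged_words(categories="news")

# 3. Least frequent word class: most_common() without an argument returns
#    every (tag, count) pair sorted by count, so the last entry is the rarest
tag_counts = Counter(tag for word, tag in tagged)
print(tag_counts.most_common()[-1])

# Map each word to a Counter of the tags it appears with
word_tags = defaultdict(Counter)
for word, tag in tagged:
    word_tags[word][tag] += 1

# 4. How many words occur with more than one word class
ambiguous = [w for w, tags in word_tags.items() if len(tags) > 1]
print(len(ambiguous))

# 5. The word with the most distinct tags, and how many distinct tags it has
most_ambiguous = max(word_tags, key=lambda w: len(word_tags[w]))
print(most_ambiguous, len(word_tags[most_ambiguous]))

# 6. How many times a specific word appears with each of its tags
def tag_counts_for(word):
    return dict(word_tags[word])

print(tag_counts_for('Fulton'))

Drop the categories="news" argument to run the same counts over the whole corpus, which is what the word_tags loop in the question does.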

Upvotes: 0
