How to get the most frequent word from a corpus?

I am working with corpora, and I want to get the most and least used word and word class from a corpus. I have the beginning of the code, but I get some errors I don't know how to deal with. I want to get the most frequent word out of the Brown corpus, and then the most and least used word classes. I have this code:

import re
import nltk
import string
from collections import Counter
from nltk.corpus import stopwords
from collections import defaultdict, Counter
from nltk.corpus import brown

brown = nltk.corpus.brown
stoplist = stopwords.words('english')

from collections import defaultdict

def toptenwords(brown):
    words = brown.words()
    no_capitals = ([word.lower() for word in words])
    filtered = [word for word in no_capitals if word not in stoplist]
    translate_table = dict((ord(char), None) for char in string.punctuation)
    no_punct = [s.translate(translate_table) for s in filtered]
    wordcounter = defaultdict(int)
    for word in no_punct:
        if word in wordcounter:
            wordcounter[word] += 1
        else:
            wordcounter[word] = 1
    sorting = [(k, wordcounter[k])for k in sorted(wordcounter, key = wordcounter.get, reverse = True)]
    return sorting

print(toptenwords(brown))

words_2 = [word[0] for word in brown.tagged_words(categories="news")]
# the most frequent words
print Counter(words_2).most_common(10)

words_2 = [word[1] for word in brown.tagged_words(categories="news")]
# the most frequent word class
print Counter(words_2).most_common(10)


# Keeps words and pos into a dictionary
# where the key is a word and
# the value is a counter of POS and counts
word_tags = defaultdict(Counter)
for word, pos in brown.tagged_words():
    word_tags[word][pos] += 1

# To access the POS counter.
print 'Red', word_tags['Red']
print 'Marlowe', word_tags['Marlowe']
print

# Greatest number of distinct tag.
word_with_most_distinct_pos = sorted(word_tags, key=lambda x: len(word_tags[x]), reverse=True)[0]

print word_with_most_distinct_pos
print word_tags[word_with_most_distinct_pos]
print len(word_tags[word_with_most_distinct_pos])

# which word has the greatest number of distinct tags
word_tags_2 = nltk.defaultdict(lambda: set())
for word, token in tagged_words:
    word_tags[word].add(token)
    ambig_words = sorted([(k, len(v)) for (k, v) in word_tags.items()]),
    key=itemgetter(1), reverse=True)[:50]
  print [(word, numtoks, word_tags[word]) for (word, numtoks) in ambig_words]

When I run the code above, I get the error below:

File "Oblig2a.py", line 64
    key=itemgetter(1), reverse=True)[:50]
                               ^
SyntaxError: invalid syntax

From this code I would like to get:

  1. Most frequent word
  2. Most frequent word class
  3. Least frequent word class
  4. How many words with more than one word class
  5. Which word has the most tags, and how many distinct tags are there
  6. The last thing I need help with is to write a function that takes a specific word and reports how many times it appears with each of its tags. I am trying to do that above, but I can't get it to work...

It is numbers 3, 4, 5 and 6 I need help with... Any help would be most welcome.

Upvotes: 0

Views: 2722

Answers (1)

Jakub Macina

Reputation: 982

There are a few problems with the code:

  1. The error the interpreter is reporting: you should pass the name of the language to the stopwords function: stoplist = stopwords.words('english')
  2. Use the defaultdict's get method to sort the dict properly: [(k, wordcounter[k]) for k in sorted(wordcounter, key=wordcounter.get, reverse=True)]
  3. Use the translate table on Unicode data; see string.translate() with unicode data in python
  4. Brown tagged words are tuples in the format (word, part-of-speech); there is a short sketch of this right after the list
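
A minimal sketch of the last point, assuming the Brown corpus has already been downloaded with nltk.download('brown'):

import nltk
from collections import Counter

brown = nltk.corpus.brown

# Each element of tagged_words() is a (word, part-of-speech) tuple, e.g. ('The', 'AT')
pairs = brown.tagged_words(categories="news")
print(pairs[0])

# So index 0 selects the word and index 1 selects its tag
tag_counts = Counter(tag for word, tag in pairs)
print(tag_counts.most_common(5))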

Complete code:

import nltk
import string
from collections import Counter, defaultdict
from nltk.corpus import stopwords

brown = nltk.corpus.brown
stoplist = stopwords.words('english')

def toptenwords(brown):
    words = brown.words()
    # Lowercase every token; keep duplicates so the counts stay meaningful
    no_capitals = [word.lower() for word in words]
    filtered = [word for word in no_capitals if word not in stoplist]
    # Strip punctuation with a Unicode translate table
    translate_table = dict((ord(char), None) for char in string.punctuation)
    no_punct = [s.translate(translate_table) for s in filtered]
    wordcounter = defaultdict(int)
    for word in no_punct:
        if word:  # skip tokens that were nothing but punctuation
            wordcounter[word] += 1
    sorting = [(k, wordcounter[k]) for k in sorted(wordcounter, key=wordcounter.get, reverse=True)]
    return sorting


print(toptenwords(brown))

words_2 = [word[0] for word in brown.tagged_words(categories="news")]
# the most frequent words
print(Counter(words_2).most_common(10))

words_2 = [word[1] for word in brown.tagged_words(categories="news")]
# the most frequent word class
print(Counter(words_2).most_common(10))
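
That covers points 1 and 2 of the question. For points 3-6, a minimal sketch along the same lines, reusing the imports and the brown object from the code above (the helper name tag_counts_for is just illustrative):

tagged = brown.tagged_words(categories="news")

# 3. Least frequent word class: most_common() without an argument returns
#    every (tag, count) pair sorted by count, so the last entry is the rarest
tag_counts = Counter(tag for word, tag in tagged)
print(tag_counts.most_common()[-1])

# Map each word to a Counter of the tags it appears with
word_tags = defaultdict(Counter)
for word, tag in tagged:
    word_tags[word][tag] += 1

# 4. How many words occur with more than one word class
ambiguous = [w for w, tags in word_tags.items() if len(tags) > 1]
print(len(ambiguous))

# 5. The word with the most distinct tags, and how many distinct tags it has
most_ambiguous = max(word_tags, key=lambda w: len(word_tags[w]))
print(most_ambiguous, len(word_tags[most_ambiguous]))

# 6. How many times a specific word appears with each of its tags
def tag_counts_for(word):
    return dict(word_tags[word])

print(tag_counts_for('Fulton'))

Drop the categories="news" argument to run the same counts over the whole corpus, which is what the word_tags loop in the question does.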

Upvotes: 0
