How to construct unigrams, bi-grams and tri-grams in Python

How do I construct unigrams, bi-grams and tri-grams for a large corpus, and then compute the frequency of each of them? The results should be arranged from the most frequent to the least frequent grams.

"""Tokenize a text and build its bigrams and trigrams with NLTK."""
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter

# Sample corpus text; in practice this would come from the txt files.
# (Original used backslash line-continuations inside the string with a
# trailing space after one backslash — a syntax error. Implicit string
# concatenation in parentheses is the safe idiom.)
text = ("I need to write a program in NLTK that breaks a corpus (a large "
        "collection of txt files) into unigrams, bigrams, trigrams, "
        "fourgrams and fivegrams. "
        "I need to write a program in NLTK that breaks a corpus")

# BUG FIX: original called nltk.word_tokenize(), but only the bare name
# word_tokenize was imported — that raised NameError: name 'nltk' is not defined.
token = word_tokenize(text)

# ngrams() returns lazy generators of n-length tuples over the token stream.
bigrams = ngrams(token, 2)
trigrams = ngrams(token, 3)

Upvotes: 0

Views: 422

Answers (2)

hidan
hidan

Reputation: 1

import nltk  # BUG FIX: nltk.bigrams / nltk.trigrams were used without importing nltk
from textblob import TextBlob


def ngram(data, from_column, to_column, t):
    """Append unigram, bigram and trigram columns to a pandas DataFrame.

    For each row, the text in ``data[from_column]`` is split into n-grams and
    three new columns named ``to_column + ' unigrams'`` / ``' bigrams'`` /
    ``' trigrams'`` are added in place.

    Parameters:
        data: pandas DataFrame holding the text column (mutated in place).
        from_column: name of the column containing the raw text.
        to_column: prefix used for the three new column names.
        t: extraction mode — 1 uses TextBlob, 2 uses str.split + nltk helpers.
           Any other value leaves the new columns filled with empty lists.
    """
    # One accumulator list per n-gram order (uni, bi, tri).
    new_columns = [[], [], []]

    # BUG FIX: in the original paste the if/elif branches and the final column
    # assignments were dedented out of the function body, which is an
    # IndentationError; they are restored to function scope here.
    if t == 1:
        # TextBlob mode: .ngrams(n) returns a list of WordList objects.
        for index, row in data.iterrows():
            ngram_object = TextBlob(row[from_column])
            new_columns[0].append(ngram_object.ngrams(n=1))
            new_columns[1].append(ngram_object.ngrams(n=2))
            new_columns[2].append(ngram_object.ngrams(n=3))
    elif t == 2:
        # Plain-split mode: whitespace tokenization plus nltk tuple helpers.
        for index, row in data.iterrows():
            tokens = row[from_column].split()
            new_columns[0].append(list(tokens))
            new_columns[1].append(list(nltk.bigrams(tokens)))
            new_columns[2].append(list(nltk.trigrams(tokens)))

    data[to_column + ' unigrams'] = new_columns[0]
    data[to_column + ' bigrams'] = new_columns[1]
    data[to_column + ' trigrams'] = new_columns[2]


# NOTE(review): `data` and `data_out` are not defined in this snippet —
# presumably DataFrames created earlier by the answerer; verify before running.
ngram(data, 'preprocessed text', 'preprocessed text', 2)
ngram(data_out, 'preprocessed text', 'preprocessed text', 2)

Upvotes: 0

Prateek Dewan
Prateek Dewan

Reputation: 1631

Try this:

"""Count unigram, bigram and trigram frequencies, most frequent first."""
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter

text = '''I need to write a program in NLTK that breaks a corpus (a large 
collection of txt files) into unigrams, bigrams, trigrams, fourgrams and 
fivegrams. I need to write a program in NLTK that breaks a corpus'''

token = nltk.word_tokenize(text)

# Counter accepts any iterable, so wrapping the ngrams generator in list()
# (as the original did) is unnecessary; most_common() with no argument
# returns ALL entries sorted from most to least frequent.
most_frequent_unigrams = Counter(token).most_common()  # added: question asks for unigrams too
most_frequent_bigrams = Counter(ngrams(token, 2)).most_common()
most_frequent_trigrams = Counter(ngrams(token, 3)).most_common()

for k, v in most_frequent_unigrams:
    print(k, v)
for k, v in most_frequent_bigrams:
    print(k, v)
for k, v in most_frequent_trigrams:
    print(k, v)

Upvotes: 1

Related Questions