Reputation: 196
This question is about classifying texts based on common words; I don't know if I am approaching the problem right. I have an Excel file with texts in the "Description" column and a unique ID in the "ID" column. I want to iterate through the descriptions and compare them based on the percentage (frequency) of common words, and then classify the descriptions and give them another ID. Please see the example below.
# importing pandas as pd
import pandas as pd

# creating a dataframe
df = pd.DataFrame({'ID': ['12', '54', '88', '9'],
                   'Description': ['Staphylococcus aureus is a Gram-positive, round-shaped bacterium that is a member of the Firmicutes',
                                   'Streptococcus pneumoniae, or pneumococcus, is a Gram-positive, alpha-hemolytic or beta-hemolytic',
                                   'Dicyemida, also known as Rhombozoa, is a phylum of tiny parasites',
                                   'A television set or television receiver, more commonly called a television, TV, TV set, or telly']})
ID Description
12 Staphylococcus aureus is a Gram-positive, round-shaped bacterium that is a member of the Firmicutes
54 Streptococcus pneumoniae, or pneumococcus, is a Gram-positive, round-shaped bacterium that is a member beta-hemolytic
88 Dicyemida, also known as Rhombozoa, is a phylum of tiny parasites
9 A television set or television receiver, more commonly called a television, TV, TV set, or telly
For example, the descriptions of 12 and 54 have more than 75% of their words in common, so they should get the same ID. The output would look like:
ID  Description
12  Staphylococcus aureus is a Gram-positive, round-shaped bacterium that is a member of the Firmicutes
12  Streptococcus pneumoniae, or pneumococcus, is a Gram-positive, round-shaped bacterium that is a member beta-hemolytic
88  Dicyemida, also known as Rhombozoa, is a phylum of tiny parasites
9   A television set or television receiver, more commonly called a television, TV, TV set, or telly
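To make the goal concrete, here is a rough sketch of the grouping I have in mind, using the df created above; overlap_ratio and new_ids are just illustrative names, and the 0.75 cutoff is my example threshold:

def overlap_ratio(text_a, text_b):
    # share of the shorter description's words that also appear in the other one
    words_a = set(text_a.lower().split())
    words_b = set(text_b.lower().split())
    return len(words_a & words_b) / min(len(words_a), len(words_b))

new_ids = []
for i, row in df.iterrows():
    assigned = row['ID']
    for j in range(i):
        # reuse the ID of the first earlier description that is similar enough
        if overlap_ratio(row['Description'], df.loc[j, 'Description']) > 0.75:
            assigned = new_ids[j]
            break
    new_ids.append(assigned)
df['ID'] = new_ids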
Here is what I tried. I worked with two separate dataframes, Risk1 and Risk2, and I am not yet iterating through the rows, which I still need to do:
import codecs
import re
import copy
import collections
import pandas as pd
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
import matplotlib.pyplot as plt
%matplotlib inline
nltk.download('stopwords')
from nltk.corpus import stopwords
# creating dataframe 1
Risk1 = pd.DataFrame({'ID': ['12'],
                      'Description': ['Staphylococcus aureus is a Gram-positive, round-shaped bacterium that is a member of the Firmicutes']})
# creating dataframe 2
Risk2 = pd.DataFrame({'ID': ['54'],
                      'Description': ['Streptococcus pneumoniae, or pneumococcus, is a Gram-positive, alpha-hemolytic or beta-hemolytic']})
esw = stopwords.words('english')
esw.append('would')
word_pattern = re.compile(r"^\w+$")
def get_text_counter(text):
    # tokenize and lowercase, drop stopwords and non-word tokens, then stem what is left
    tokens = WordPunctTokenizer().tokenize(text)
    tokens = [token.lower() for token in tokens]
    tokens = [token for token in tokens if re.match(word_pattern, token) and token not in esw]
    tokens = [PorterStemmer().stem(token) for token in tokens]
    return collections.Counter(tokens), len(tokens)
def make_df(counter, size):
    abs_freq = np.array([el[1] for el in counter])
    rel_freq = abs_freq / size
    index = [el[0] for el in counter]
    df = pd.DataFrame(data=np.array([abs_freq, rel_freq]).T, index=index,
                      columns=['Absolute Frequency', 'Relative Frequency'])
    df.index.name = 'Most_Common_Words'
    return df
Risk1_counter, Risk1_size = get_text_counter(Risk1['Description'].iloc[0])
make_df(Risk1_counter.most_common(500), Risk1_size)
Risk2_counter, Risk2_size = get_text_counter(Risk2['Description'].iloc[0])
make_df(Risk2_counter.most_common(500), Risk2_size)
all_counter = Risk1_counter + Risk2_counter
all_df = make_df(all_counter.most_common(1000), 1)
most_common_words = all_df.index.values
df_data = []
for word in most_common_words:
    Risk1_c = Risk1_counter.get(word, 0) / Risk1_size
    Risk2_c = Risk2_counter.get(word, 0) / Risk2_size
    d = abs(Risk1_c - Risk2_c)
    df_data.append([Risk1_c, Risk2_c, d])
dist_df = pd.DataFrame(data=df_data, index=most_common_words,
                       columns=['Risk1 Relative Freq', 'Risk2 Relative Freq', 'Relative Freq Difference'])
dist_df.index.name = 'Most Common Words'
dist_df.sort_values('Relative Freq Difference', ascending=False, inplace=True)
dist_df.head(500)
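What I still need from these two counters is a single percentage of common words, roughly like this sketch (the 0.75 cutoff is just my example threshold):

# share of the smaller cleaned vocabulary that also occurs in the other description
common_words = set(Risk1_counter) & set(Risk2_counter)
ratio = len(common_words) / min(len(set(Risk1_counter)), len(set(Risk2_counter)))
print(ratio)  # if this is above 0.75, description 54 should get ID 12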
Upvotes: 0
Views: 371
Reputation: 1882
A better approach might be to use sentence similarity algorithms from NLP. A good starting point would be the Universal Sentence Encoder (USE) from Google, as shown in this Python notebook. There are other sentence embeddings as well (e.g. InferSent from Facebook) if the pre-trained Google USE does not work. Another option is to use word2vec and average the vectors you get for each word in a sentence.
You want to find the cosine similarity between the sentence embeddings and then relabel the category where the similarity is above some threshold, like 0.8. You will have to try different similarity thresholds to get the best matching performance.
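For illustration, here is a minimal sketch of that idea using the pre-trained USE model from TensorFlow Hub and scikit-learn's cosine_similarity, applied to the df from your question; the 0.8 threshold and the relabelling loop are only examples to adapt, not a tuned solution:

import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity

# load the pre-trained Universal Sentence Encoder from TF Hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

descriptions = df['Description'].tolist()
embeddings = embed(descriptions).numpy()   # one 512-dimensional vector per description
sim = cosine_similarity(embeddings)        # pairwise similarity matrix

# give each description the ID of the first earlier description above the threshold
new_ids = df['ID'].tolist()
for i in range(len(descriptions)):
    for j in range(i):
        if sim[i, j] > 0.8:
            new_ids[i] = new_ids[j]
            break
df['ID'] = new_ids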
Upvotes: 2