Reputation: 13
First off, I'm a Python noob and I only half-understand how some of this stuff works. I've been trying to build word matrices for a tagging project and I hoped I could figure this out on my own, but I'm not seeing a lot of documentation around my particular error. So I apologize up front if this is something super-obvious.
I've tried to get a set of functions to work in a few different variations, but I keep getting "AttributeError: 'list' has no attribute definition."
import pandas as pd
from pandas import DataFrame, Series
import nltk.data
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import TreebankWordTokenizer
# Gets synsets for a given term.
def get_synset(word):
    # Returns the name of the first synset that wn.synsets() yields for `word`.
    # The loop returns on its first iteration; when wn.synsets(word) yields
    # nothing, the function falls through and implicitly returns None.
    for word in wn.synsets(word):
        return word.name()
#Gets definitions for a synset.
def get_def(syn):
    # NOTE(review): this is the call behind the AttributeError quoted in the
    # question. The error message reports that the object is a 'list', i.e.
    # wn.synsets() (plural) does not return a single synset, and the
    # attribute name `defnition` is also misspelled.
    return wn.synsets(syn).defnition()
# Creates a dataframe called sector_matrix based on another dataframe's column. Should be followed with an export.
def sector_tagger(frame):
    # `frame` must be iterable and support .tolist(); the call shown later in
    # the question passes a single DataFrame column (agri['Category']).
    sentences = frame.tolist()  # not used again in this function
    # Tokenize every entry with the module-level `tok` -> list of token lists.
    tok_list = [tok.tokenize(w) for w in frame]
    # Flatten into one list of lower-cased tokens.
    split_words = [w.lower() for sub in tok_list for w in sub]
    # Drop tokens that appear in `english_stops`.
    clean_words = [w for w in split_words if w not in english_stops]
    # One get_synset() result per remaining token (may be None).
    synset = [get_synset(w) for w in clean_words]
    sector_matrix = DataFrame({'Categories': clean_words,
                               'Synsets': synset})
    sec_syn = sector_matrix['Synsets'].tolist()  # not used again in this function
    # get_def() is called for every row; there is no guard for None synsets.
    sector_matrix['Definition'] = [get_def(w) for w in sector_matrix['Synsets']]
    return sector_matrix
The functions get called on a dataframe that I read in from excel:
test = pd.read_excel('data.xlsx')
And the sector_tagger function is called as such:
agri_matrix = sector_tagger(agri['Category'])
A previous version called wn.synsets(w).definition() in a list comprehension that populated the DataFrame. Another tried to call the definition after the fact in a Jupyter Notebook. I almost always get the Attribute Error. That said, when I call the datatype on sector_matrix['Synsets'] I get an "object" type, and when I print that column I don't see [] around the items.
I've tried:
Curiously enough, I was playing around with this yesterday and was able to make something work in my notebook directly, but (a) it's messy (b) there's no scalability, and (c) it doesn't work on other categories that I apply it to.
# Boolean mask: rows where both 'Agri-Food' and 'Total' equal 1.
agrimask = (df['Agri-Food']==1) & (df['Total']==1)
# Keep only the 'Category' column of the masked rows.
df_agri = df.loc[agrimask,['Category']]
# Tokenize each category string, then flatten into lower-cased tokens.
agri_words = [tok.tokenize(a) for a in df_agri['Category']]
agri_cip_words = [a.lower() for sub in agri_words for a in sub]
# Remove English stop words.
agri_clean = [w for w in agri_cip_words if w not in english_stops]
df_agri_clean = DataFrame({'Category': agri_clean})
# Blank out ',' tokens (removed by dropna), collapse one compound term,
# and drop duplicate rows.
df_agri_clean = df_agri_clean[df_agri_clean != ','].replace('horticulture/horticultural','horticulture').dropna().drop_duplicates()
# NOTE(review): `syn` is not defined in the code shown; presumably it wraps
# wn.synsets -- confirm. x[0] takes the first element of each result.
df_agri_clean['Synsets'] = [x[0].name() for x in df_agri_clean['Category'].apply(syn)]
# Look each synset up by name (wn.synset, singular) for its definition.
df_agri_clean['Definition'] = [wn.synset(x).definition() for x in df_agri_clean['Synsets']]
# Name of the first lemma of each synset.
df_agri_clean['Lemma'] = [wn.synset(x).lemmas()[0].name() for x in df_agri_clean['Synsets']]
df_agri_clean
Edit1: Here's a link to a sample of the data.
Edit2: Also, the static variables I'm using are here (all based around the standard NLTK library):
tok = TreebankWordTokenizer()
english_stops = set(stopwords.words('english'))
french_stops = set(stopwords.words('french'))
Edit3: You can see a working version of this code here: Working Code
Upvotes: 0
Views: 216
Reputation: 62383
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import TreebankWordTokenizer as tok
english_stops = set(stopwords.words('english'))
# Gets synsets for a given term.
def get_synset(word):
    """Return the name of the first WordNet synset found for ``word``.

    Returns ``None`` when ``wn.synsets(word)`` yields nothing.
    """
    # Take only the first match, without shadowing the `word` parameter.
    first_match = next(iter(wn.synsets(word)), None)
    if first_match is None:
        return None
    return first_match.name()
#Gets definitions for a synset.
def get_def(syn):
    """Return the WordNet definition for the synset named ``syn``.

    ``syn`` is a synset name as produced by ``Synset.name()``,
    e.g. ``'animal.n.01'``.
    """
    # Uses wn.synset (singular, lookup by name) and the correctly spelled
    # `definition`; the question's version misspelled it as `defnition`.
    synset = wn.synset(syn)
    return synset.definition()
# Creates a dataframe called sector_matrix based on another dataframe's column. Should be followed with an export.
def sector_tagger(frame):
    """Build a word / synset / definition table for one category phrase.

    Despite its name, ``frame`` is a single string here: the function is
    applied element-wise with ``df['Category'].apply(sector_tagger)``.

    Returns a DataFrame with the columns:
      * 'Categories' -- lower-cased tokens that are not English stop words
      * 'Synsets'    -- ``get_synset`` result for each token (may be None)
      * 'Definition' -- ``get_def`` result, or '' when no synset was found
    """
    # `tok` is the tokenizer class, so it has to be instantiated first.
    tok_list = tok().tokenize(frame)
    split_words = [w.lower() for w in tok_list]
    clean_words = [w for w in split_words if w not in english_stops]
    synset = [get_synset(w) for w in clean_words]
    sector_matrix = pd.DataFrame({'Categories': clean_words,
                                  'Synsets': synset})
    # Work from the original `synset` list rather than reading the column
    # back out of the DataFrame: pandas may store a missing value as NaN
    # instead of None, which would slip past the None check below.
    sector_matrix['Definition'] = [
        get_def(name) if name is not None else '' for name in synset
    ]
    return sector_matrix
agri_matrix = df['Category'].apply(sector_tagger)
If this answers your question, please mark it as the accepted answer.
The output of get_def
is a list of phrases
Alternate Approach
def sector_tagger(frame):
    """Tokenize one category phrase and look up a synset and definition per word.

    ``frame`` is a single string (the function is applied element-wise with
    ``df['Category'].apply(sector_tagger)``). Slashes become spaces, and
    parentheses and commas are removed before tokenizing.

    Returns a 3-tuple of parallel lists:
    ``(clean_words, synset, def_matrix)`` -- lower-cased non-stop-word
    tokens, the ``get_synset`` result for each token (may be None), and the
    ``get_def`` result for each synset ('' when the synset is None).
    """
    mapping = [('/', ' '), ('(', ''), (')', ''), (',', '')]
    for old, new in mapping:
        frame = frame.replace(old, new)
    tok_list = tok().tokenize(frame)  # note () after tok
    split_words = [w.lower() for w in tok_list]
    clean_words = [w for w in split_words if w not in english_stops]
    synset = [get_synset(w) for w in clean_words]
    # Identity comparison is the correct test for None (PEP 8).
    def_matrix = [get_def(w) if w is not None else '' for w in synset]
    return clean_words, synset, def_matrix
poo = df['Category'].apply(sector_tagger)
poo[0] =
(['agricultural', 'domestic', 'animal', 'services'],
['agricultural.a.01', 'domestic.n.01', 'animal.n.01', 'services.n.01'],
['relating to or used in or promoting agriculture or farming',
'a servant who is paid to perform menial tasks around the household',
'a living organism characterized by voluntary movement',
'performance of duties or provision of space and equipment helpful to others'])
# Each element of `poo` is a (clean_words, synset, def_matrix) tuple;
# split them into three parallel lists, one entry per original row.
list_clean_words = [row[0] for row in poo]
list_synset = [row[1] for row in poo]
list_def_matrix = [row[2] for row in poo]

# One row per category phrase; every cell holds a list.
agri_matrix = pd.DataFrame()
agri_matrix['Categories'] = list_clean_words
agri_matrix['Synsets'] = list_synset
agri_matrix['Definition'] = list_def_matrix
agri_matrix
Categories Synsets Definition
0 [agricultural, domestic, animal, services] [agricultural.a.01, domestic.n.01, animal.n.01... [relating to or used in or promoting agricultu...
1 [agricultural, food, products, processing] [agricultural.a.01, food.n.01, merchandise.n.0... [relating to or used in or promoting agricultu...
2 [agricultural, business, management] [agricultural.a.01, business.n.01, management.... [relating to or used in or promoting agricultu...
3 [agricultural, mechanization] [agricultural.a.01, mechanization.n.01] [relating to or used in or promoting agricultu...
4 [agricultural, production, operations] [agricultural.a.01, production.n.01, operation... [relating to or used in or promoting agricultu...
Split each list of lists into a long list (they're ordered)
def create_long_list_from_list_of_lists(list_of_lists):
    """Flatten one level of nesting, preserving order.

    Every item of every inner list is placed, in order, into a single new
    list, which is returned.
    """
    return [item for inner in list_of_lists for item in inner]
long_list_clean_words = create_long_list_from_list_of_lists(list_clean_words)
long_list_synset = create_long_list_from_list_of_lists(list_synset)
long_list_def_matrix = create_long_list_from_list_of_lists(list_def_matrix)
Turn it into a DataFrame of Unique Categories
# Build one row per word, then keep only the first occurrence of each
# distinct (word, synset, definition) row and renumber the index.
agri_df = (
    pd.DataFrame({
        'Categories': long_list_clean_words,
        'Synsets': long_list_synset,
        'Definitions': long_list_def_matrix,
    })
    .drop_duplicates()
    .reset_index(drop=True)
)
agri_df.head(4)
Categories Synsets Definitions
0 ceramic ceramic.n.01 an artifact made of hard brittle material prod...
1 horticultural horticultural.a.01 of or relating to the cultivation of plants
2 construction construction.n.01 the act of constructing something
3 building building.n.01 a structure that has a roof and walls and stan...
Final Note
from nltk.tokenize import TreebankWordTokenizer as tok
or:
from nltk.tokenize import word_tokenize
to use:
tok().tokenize(string_text_phrase) # text is a string phrase, not a list of words
or:
word_tokenize(string_text_phrase)
Both methods appear to produce the same output, which is a list of words.
input = "Agricultural and domestic animal services"
output_of_both_methods = ['Agricultural', 'and', 'domestic', 'animal', 'services']
Upvotes: 1