Reputation: 637
I'm trying to implement Peter Norvig's spell checker in a pandas class with words pulled from a SQL database. The data contains user queries, which often contain a number of spelling errors, and I'm hoping this class will return the most likely query (spelt correctly).
The class is initialized with a database query that returns a pandas dataframe. For example:
query count
0 foo bar 1864
1 super foo 73
2 bar of foos 1629
3 crazy foos 940
Most of the below is pulled directly from Peter's work, but the modifications I've made to the class don't seem to work correctly. My guess is that it has something to do with removing the Counter functionality (WORDS = Counter(words(open('big.txt').read()))), but I'm unsure of the best way to get this same functionality from a dataframe.
Current class below:
# NOTE(review): this is the non-working version from the question. As posted it
# has lost its indentation and does not parse; the code lines are left exactly
# as written and the notes below only flag what the visible code does.
class _SpellCheckClient(object):
"""Wraps functionality to check the spelling of a query."""
# NOTE(review): the parameter is named dremel_connection, but the body reads
# database_connection; team and table are not used in the visible code.
# NOTE(review): the ExecuteQuery( call opened below is never closed in this
# snippet, and the SQL text names a literal "table" rather than the argument.
def __init__(self, team, table, dremel_connection):
self.df = database_connection.ExecuteQuery(
'SELECT query, COUNT(query) AS count FROM table GROUP BY 1;'
# Picks, from the candidates for `word`, the one ranked highest by _probability.
def expected_word(self, word):
"""Most probable spelling correction for word."""
return max(self._candidates(word), key=self._probability)
# NOTE(review): the lookup matches entire values of the 'query' column; the
# sample data in the post shows multi-word queries there, so confirm that a
# single word can ever match.
# NOTE(review): .values yields an array of the matching rows' counts (empty
# when no row equals `query`), so the value returned is an array, not a scalar.
def _probability(self, query):
"""Probability of a given word within a query."""
query_count = self.df.loc[self.df['query'] == query]['count'].values
return query_count / self.df['count'].sum()
# Tries, in order: the word itself, its known one-edit variants, its known
# two-edit variants, and finally the unchanged word as a one-element list.
def _candidates(self, word):
"""Generate possible spelling corrections for word."""
return (self._known([word])
or self._known(self._one_edits_from_word(word))
or self._known(self._two_edits_from_word(word))
or [word])
# NOTE(review): value_counts is referenced without being called, so the `in`
# test below runs against the bound method rather than its result.
def _known(self, query):
"""The subset of `words` that appear in the dictionary of WORDS."""
# return set(w for w in query if w in WORDS)
return set(w for w in query if w in self.df['query'].value_counts)
# Builds every string reachable from `word` by one delete, one adjacent
# transpose, one replace, or one insert, and returns them as a set.
# NOTE(review): xrange exists only on Python 2.
# NOTE(review): LETTERS is not defined anywhere in this snippet; presumably the
# lowercase alphabet, as in Norvig's original. Confirm.
def _one_edits_from_word(self, word):
"""All edits that are one edit away from `word`."""
splits = [(word[:i], word[i:]) for i in xrange(len(word) + 1)]
deletes = [left + right[1:] for left, right in splits if right]
transposes = [left + right[1] + right[0] + right[2:]
for left, right in splits
if len(right) > 1]
replaces = [left + center + right[1:]
for left, right in splits
if right for center in LETTERS]
inserts = [left + center + right
for left, right in splits
for center in LETTERS]
return set(deletes + transposes + replaces + inserts)
# Returns a lazy generator (not a set): one-edit variants of every one-edit
# variant of `word`; duplicates are not removed here.
def _two_edits_from_word(self, word):
"""All edits that are two edits away from `word`."""
return (e2 for e1 in self._one_edits_from_word(word)
for e2 in self._one_edits_from_word(e1))
Thanks in advance!
Upvotes: 0
Views: 4314
Reputation: 21
import pandas as pd
from spellchecker import SpellChecker
# Misspelt sample inputs, held in a pandas Series (the name `df` is kept from
# the original snippet even though this is not a DataFrame).
df = pd.Series(['Customir', 'Tast', 'Hlp'])

# distance=1 is handed to the SpellChecker constructor; pyspellchecker
# documents this argument as the edit distance to use.
spell = SpellChecker(distance=1)


def Correct(x):
    """Return what `spell.correction` gives for the single string `x`."""
    best_guess = spell.correction(x)
    return best_guess


# Apply the correction element by element; the result replaces `df`.
df = df.apply(Correct)
df
0 customer
1 last
2 help
dtype: object
Upvotes: 2
Reputation: 637
For anyone looking for an answer to this, below is what worked for me:
def _words(df):
    """Add up, for every distinct word, the counts of the queries containing it.

    Args:
      df: Dataframe with a string column 'query' (words separated by single
        spaces) and a numeric column 'count'.

    Returns:
      A Series indexed by word. Each value is the sum of 'count' over the
      rows whose 'query' contains that word. Because the intermediate table
      holds 0/1 indicators, a word repeated inside one query is counted once
      for that query.
    """
    # One row per query, one 0/1 indicator column per space-separated word.
    membership = df['query'].str.get_dummies(sep=' ')
    # Flip to (word x query) so the product with the per-query counts leaves
    # one total per word.
    word_by_query = membership.T
    return word_by_query.dot(df['count'])
class _SpellCheckClient(object):
    """Suggests the most probable known word for a possibly misspelt word.

    The vocabulary is `self.words`, a Series mapping each word to its total
    count. Candidate generation follows Peter Norvig's spelling corrector:
    all strings one edit away, then all strings two edits away.

    The edit helpers read a module-level `LETTERS` constant that is defined
    outside this class (presumably the lowercase alphabet; confirm).
    """

    def __init__(self, team, table, database_connection):
        """Store the query data and derive the per-word counts from it.

        Args:
          team: Not used by this class.
          table: Not used by this class.
          database_connection: Despite its name, this is used directly as
            the dataframe: it is stored as `self.df` and passed to `_words`.
        """
        self.df = database_connection
        self.words = _words(self.df)

    def expected_word(self, query):
        """Return the candidate for `query` with the highest probability.

        `_candidates` always yields at least `query` itself, so `max` never
        sees an empty collection.
        """
        return max(self._candidates(query), key=self._probability)

    def _probability(self, query):
        """Return the count of `query` divided by the sum of all word counts.

        Words missing from `self.words` get 0.0.
        """
        count = self.words.get(query)
        if count is None:
            return 0.0
        # Divide the single count instead of normalising the whole Series on
        # every call; float() keeps this a true division on Python 2 as well.
        return float(count) / self.words.sum()

    def _candidates(self, query):
        """Return the known strings nearest to `query`, or `[query]` if none.

        Known one-edit variants win over known two-edit variants. Unlike
        Norvig's original, `query` is not short-circuited when it is itself
        known. It only competes as one of its own one-edit variants (a
        replace with the same letter regenerates it when that letter is in
        LETTERS).
        """
        return (self._known(self._one_edits_from_query(query))
                or self._known(self._two_edits_from_query(query))
                or [query])

    def _known(self, query):
        """Return the set of strings in `query` that have a non-zero count.

        Args:
          query: Iterable of candidate strings (despite the singular name).
        """
        counts = self.words
        vocabulary = counts.index
        # Check index membership first: most candidates are not real words,
        # and a failed hash lookup is far cheaper than a failed Series.get.
        return {w for w in query if w in vocabulary and counts[w]}

    def _one_edits_from_query(self, query):
        """Return the set of all strings exactly one edit away from `query`.

        An edit is deleting one character, swapping two adjacent characters,
        replacing one character with a letter from LETTERS, or inserting a
        letter from LETTERS at any position.
        """
        # range (not xrange) so this runs on both Python 2 and Python 3.
        splits = [(query[:i], query[i:]) for i in range(len(query) + 1)]
        deletes = [left + right[1:] for left, right in splits if right]
        transposes = [left + right[1] + right[0] + right[2:]
                      for left, right in splits
                      if len(right) > 1]
        replaces = [left + center + right[1:]
                    for left, right in splits
                    if right for center in LETTERS]
        inserts = [left + center + right
                   for left, right in splits
                   for center in LETTERS]
        return set(deletes + transposes + replaces + inserts)

    def _two_edits_from_query(self, query):
        """Lazily yield every string two edits away from `query`.

        This is a generator, so duplicates are not removed here; `_known`
        collapses them into a set.
        """
        return (e2 for e1 in self._one_edits_from_query(query)
                for e2 in self._one_edits_from_query(e1))
Upvotes: 0