sokeefe
sokeefe

Reputation: 637

Spell Checker in Pandas

I'm trying to implement Peter Norvig's spell checker in a pandas class with words pulled from a SQL database. The data contains user queries, which often contain a number of spelling errors, and I'm hoping this class will return the most likely query (spelt correctly).

The class is initialized with a database query that returns a pandas dataframe. For example:

  query     count
0 foo bar       1864
1 super foo      73
2 bar of foos    1629
3 crazy foos     940

Most of the code below is pulled directly from Peter's work, but the modifications I've made to the class don't seem to work correctly. My guess is that it has something to do with removing the Counter functionality (WORDS = Counter(words(open('big.txt').read()))) but I'm unsure of the best way to get this same functionality from a dataframe.

Current class below:

# Attempt at adapting Norvig's spell checker so that candidates are looked up
# in a query/count dataframe instead of a Counter of words.
class _SpellCheckClient(object):
  """Wraps functionality to check the spelling of a query."""

  # NOTE(review): the parameter is named `dremel_connection` but the body reads
  # `database_connection`, and the ExecuteQuery( call is never closed.
  # `team` and `table` are not used; 'table' inside the SQL text is a literal.
  def __init__(self, team, table, dremel_connection):
    self.df = database_connection.ExecuteQuery(
        'SELECT query, COUNT(query) AS count FROM table GROUP BY 1;' 

  # Picks the candidate that scores highest under _probability.
  def expected_word(self, word):
    """Most probable spelling correction for word."""
    return max(self._candidates(word), key=self._probability)

  # Compares `query` against whole rows of the 'query' column, so only exact
  # full-query matches contribute a count.
  def _probability(self, query):
    """Probability of a given word within a query."""
    # `.values` gives an array of the matching counts, not a scalar; it is
    # empty when nothing matches.
    query_count = self.df.loc[self.df['query'] == query]['count'].values
    return query_count / self.df['count'].sum()

  # The `or` chain returns the first non-empty result: the word itself if
  # known, then known one-edit variants, then known two-edit variants, and
  # finally the unchanged word wrapped in a list.
  def _candidates(self, word):
    """Generate possible spelling corrections for word."""
    return (self._known([word])
            or self._known(self._one_edits_from_word(word))
            or self._known(self._two_edits_from_word(word))
            or [word])

  def _known(self, query):
    """The subset of `words` that appear in the dictionary of WORDS."""
    # return set(w for w in query if w in WORDS)
    # NOTE(review): `value_counts` is referenced without being called, so the
    # membership test is made against the method object itself - confirm.
    return set(w for w in query if w in self.df['query'].value_counts)

  # NOTE(review): LETTERS is not defined anywhere in this snippet, and
  # `xrange` exists only on Python 2.
  def _one_edits_from_word(self, word):
    """All edits that are one edit away from `word`."""
    # Every way of cutting `word` into a left and a right part.
    splits = [(word[:i], word[i:]) for i in xrange(len(word) + 1)]
    # Drop the first character of the right part.
    deletes = [left + right[1:] for left, right in splits if right]
    # Swap the first two characters of the right part.
    transposes = [left + right[1] + right[0] + right[2:]
                  for left, right in splits
                  if len(right) > 1]
    # Substitute the first character of the right part with each letter.
    replaces = [left + center + right[1:]
                for left, right in splits
                if right for center in LETTERS]
    # Put each letter between the left and right parts.
    inserts = [left + center + right
               for left, right in splits
               for center in LETTERS]
    return set(deletes + transposes + replaces + inserts)

  # Returns a generator (not a set), applying the one-edit step twice;
  # duplicates are possible.
  def _two_edits_from_word(self, word):
    """All edits that are two edits away from `word`."""
    return (e2 for e1 in self._one_edits_from_word(word)
            for e2 in self._one_edits_from_word(e1))

Thanks in advance!

Upvotes: 0

Views: 4314

Answers (2)

MitchDesmond
MitchDesmond

Reputation: 21

import pandas as pd
from spellchecker import SpellChecker
# Sample input: one misspelt word per row.
df = pd.Series(['Customir','Tast','Hlp'])
# distance=1 limits the search to candidates a single edit away.
spell = SpellChecker(distance=1)


def Correct(x):
    """Return the most likely spelling of `x`, or `x` itself if none is found."""
    # Newer pyspellchecker releases return None when no candidate exists;
    # fall back to the input so the Series keeps string values throughout.
    return spell.correction(x) or x


# Apply the correction to every element of the Series.
df = df.apply(Correct)
df

0    customer
1        last
2        help
dtype: object

Upvotes: 2

sokeefe
sokeefe

Reputation: 637

For anyone looking for an answer to this, below is what worked for me:

def _words(df):
  """Returns the total count of each word within a dataframe."""
  return df['query'].str.get_dummies(sep=' ').T.dot(df['count'])


class _SpellCheckClient(object):
  """Wraps functionality to check the spelling of a query."""

  def __init__(self, team, table, database_connection):
    self.df = database_connection
    self.words = _words(self.df)

  def expected_word(self, query):
    """Most probable spelling correction for word."""
    return max(self._candidates(query), key=self._probability)

  def _probability(self, query):
    """Probability of a given word within a query."""
    return self.words.pipe(lambda x: x / x.sum()).get(query, 0.0)

  def _candidates(self, query):
    """Generate possible spelling corrections for word."""
    return (self._known(self._one_edits_from_query(query))
            or self._known(self._two_edits_from_query(query))
            or [query])

  def _known(self, query):
    """The subset of `query` that appear in the search console database."""
    return set(w for w in query if self.words.get(w))

  def _one_edits_from_query(self, query):
    """All edits that are one edit away from `query`."""
    splits = [(query[:i], query[i:]) for i in xrange(len(query) + 1)]
    deletes = [left + right[1:] for left, right in splits if right]
    transposes = [left + right[1] + right[0] + right[2:]
                  for left, right in splits
                  if len(right) > 1]
    replaces = [left + center + right[1:]
                for left, right in splits
                if right for center in LETTERS]
    inserts = [left + center + right
               for left, right in splits
               for center in LETTERS]
    return set(deletes + transposes + replaces + inserts)

  def _two_edits_from_query(self, query):
    """All edits that are two edits away from `query`."""
    return (e2 for e1 in self._one_edits_from_query(query)
            for e2 in self._one_edits_from_query(e1))

Upvotes: 0

Related Questions