Mike Zoucha

Reputation: 83

Faster Python Lemmatization

I have been testing different lemmatization methods since they will be used on a very large corpus. Below are my methods and results. Does anyone have any tips to speed any of these methods up? spaCy was the fastest with part-of-speech tags included (preferred), followed by lemminflect. Am I going about this the wrong way? These functions are applied with pandas .apply() on a dataframe containing the text.
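For reference, each function below is called like this (a minimal sketch; the dataframe and column names are just placeholders):

import pandas as pd

df = pd.DataFrame({'text': ["The striped bats were hanging on their feet"]})
df['clean'] = df['text'].apply(prepareString_spacy_pretrained)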

# Shared imports and setup for the snippets below
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def prepareString_nltk_current(x):
    lemmatizer = WordNetLemmatizer()  # note: re-created on every call
    x = re.sub(r"[^0-9a-z]", " ", x)
    if len(x) == 0:
        return ''
    tokens = word_tokenize(x)
    tokens = [lemmatizer.lemmatize(word).strip() for word in tokens if word not in stop_words]
    if len(tokens) == 0:
        return ''
    return ' '.join(tokens)  # tokens are already strings, no str() cast needed

from pattern.en import lemma, parsetree

def prepareString_pattern(x):
    error = 'Error'
    x = re.sub(r"[^0-9a-z.,;]", " ", x)
    if len(x) == 0:
        return ''
    try:
        return " ".join([lemma(wd) if wd not in ['this', 'his'] else wd for wd in x.split()])
    except StopIteration:  # pattern's lazy loading can raise this on first use under newer Python 3 versions
        return error

import spacy
nlp = spacy.load('en_core_web_sm')  # model assumed; shared by the spaCy-based functions below

def prepareString_spacy_pretrained(x):
    if len(x) == 0:
        return ''
    doc = nlp(x)
    # token.lemma is an integer hash; token.lemma_ is the string form
    return re.sub(r"[^0-9a-zA-Z]", " ", " ".join(token.lemma_ for token in doc)).lower()

def get_wordnet_pos(word):
    """Map the POS tag to the first character lemmatize() accepts, then return the lemma."""
    lemmatizer = WordNetLemmatizer()
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": 'a',
                "N": 'n',
                "V": 'v',
                "R": 'r'}
    return lemmatizer.lemmatize(word, tag_dict.get(tag, 'n'))

def prepareString_nltk_pos(x):
    if len(x) == 0:
        return ''
    tokens = word_tokenize(x)
    return " ".join(get_wordnet_pos(w) for w in tokens)

from textblob import TextBlob

def prepareString_textblob(x):
    sent = TextBlob(x)
    tag_dict = {"J": 'a',
                "N": 'n',
                "V": 'v',
                "R": 'r'}
    words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]
    return " ".join([wd.lemmatize(tag) for wd, tag in words_and_tags])

from gensim.utils import lemmatize  # returns byte strings like b'word/NN'; removed in gensim 4.x

def prepareString_genism(x):
    return " ".join([wd.decode('utf-8').split('/')[0] for wd in lemmatize(x)])

import lemminflect  # importing registers the Token._.lemma() extension on spaCy tokens

def prepareString_leminflect(x):
    doc = nlp(x)
    # lemminflect's extension is a method, token._.lemma(), not an attribute
    return " ".join(token._.lemma() for token in doc)


def prepareString_pattern_pos(x):
    s = parsetree(x, tags=True, lemmata=True)
    # with lemmata=True every Word in the parse tree carries a .lemma attribute
    lemmas = [word.lemma for sentence in s for word in sentence.words]
    return re.sub(r"[^0-9a-zA-Z]", " ", " ".join(lemmas)).lower()

[screenshot of benchmark timings for each method]

Upvotes: 1

Views: 803

Answers (1)

bivouac0

Reputation: 2560

I think it's the spaCy parsing (creating the POS tags, etc.) that takes the time, not the actual lemmatization. From lemminflect's README, that library takes on average 42 µs per lemma (not including parsing). It looks like you're spending more like 39 ms per lemma (i.e. 1044 s / 26,536 lemmas). This means you really need to speed up spaCy's parsing.

  1. Use the smallest spaCy model if you're not already: spacy.load('en_core_web_sm').
  2. Disable the NER and dependency-parser components to speed things up, since you don't need that info; spaCy's load() takes a disable argument for exactly this (see the sketch after this list).
  3. Parallelize the work, which will give you a speed-up almost linear in the number of cores your machine has; nlp.pipe() supports this directly.
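A minimal sketch combining both ideas (the model name, batch size, and process count are assumptions to tune for your machine):

import re
import spacy

# keep the tagger (needed for POS and lemmas) but drop the parser and NER
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatize_batch(texts, n_process=4, batch_size=1000):
    out = []
    # nlp.pipe() streams documents in batches across worker processes,
    # which is much faster than calling nlp() once per row via .apply()
    for doc in nlp.pipe(texts, n_process=n_process, batch_size=batch_size):
        out.append(re.sub(r"[^0-9a-zA-Z]", " ",
                          " ".join(tok.lemma_ for tok in doc)).lower())
    return out

# df['clean'] = lemmatize_batch(df['text'].tolist())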

You can also speed up lemminflect a bit by calling getLemma() with the param lemmatize_oov=False. This does dictionary lemma look-up only, which is very fast; it will not lemmatize out-of-vocab words (i.e. misspellings, rare words, ...), which is the much slower path. Note that you'll still have to parse the sentences to get the upos; in spaCy that's token.pos_. See lemminflect's Part-Of-Speech Tags documentation for what it expects.
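Something like this (a sketch; falling back to the surface form for OOV words is my choice, not required):

from lemminflect import getLemma

def fast_lemma(word, upos):
    lemmas = getLemma(word, upos, lemmatize_oov=False)  # empty tuple when out-of-vocab
    return lemmas[0] if lemmas else word  # fall back to the word itself

# with a parsed spaCy doc:
# " ".join(fast_lemma(t.text, t.pos_) for t in doc)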

However, I think your big issue is the parsing, and small changes in the lemmatization speed aren't going to impact you much.

I should also point out that POS tagging only works well if the word appears in a sentence. From your code it looks like you're doing this correctly, but I can't tell for sure. Make sure you are, since the parser can't select the correct POS if you only give it a single word or a small fragment of text.
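A quick illustration of why context matters (the exact output depends on the model, so treat this as indicative):

doc = nlp("I saw the saw.")
print([(t.text, t.pos_, t.lemma_) for t in doc])
# the first 'saw' should come out VERB -> 'see', the second NOUN -> 'saw';
# tagged in isolation there is no way to tell them apart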

Upvotes: 1
