Daniel Lopez
Daniel Lopez

Reputation: 41

Train new model with Spacy

I am using Spacy for a model of recognition of named entities, Update of the Recognition of the Named Entity the documentation provided me with this code to update an existing model, this code only specifies the model that will be used as a base, the place where it would be stored and the Number of iterations

from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from pathlib import Path

# training data
TRAIN_DATA = [
    ("', 137 S Wilmington Street Raleigh, NC 27601 919.239.4070\t, • Server: Brian 20/1 Guests: 8\t10/01/2018 1:11 PM 20014, L Chicken Arti Pizza\t10.99, Subtotal Tax\t10.99 0.91, Total\t11.90, Balance Due\t11.90, Gratuity Suggestions To Help:, 20% = 2.20 18% = 1.2L,, 115% = 1.65 |f ,9., '?", {"entities": [(3, 19, "ORG")]}),
    ("Carolina Ale House, G1enwood, 0516 Table 23 #Party 1 JORDYN M SvrCk: 27 7:42p 09/30/18, Separate checks: 3-of-7\t, 2 Carolina Hurrlca\t15.50, 1 Smoked Cheddar Burger\t9.79, Sub Total:\t25.29, Tax:\t2.08, Sub Total:\t27.37, 20X GRATUIT\t5.06, 09/30 10:36pTO TAI : 32\t, D i d you enjoy Every delicious Bite’? Come back to See us and bring your friends*, You are always Welcome at our, House>", {"entities": [(8, 18, "ORG")]}),
    (", P~ l-LMl NG *, PRIME STEAKHOUSE 8, WINE BAR, Kalelyh, nr 27612 919-571-6200, Sgj*1® IABIE 51\t6, UlER1 H SvrCk: 5 8:04p 10/02/18, 1\tBlueheny Lemon Drop, ^ Corona, 2\tCraft Beer 2 2 120 Tomahawk 1 Pork Chop 1 Scottish Salmon 4 Prime Dessert, 13.00, 35.00 14.50, 240.00, 40.00, 44.00 0.00, Sub Total: 386.50 „\tTax:\t31.89, 10/02 9:59pTOTAL :\t418.39, www.F1emingsSteakhouse.com ) rials'., Dine Rewards account not attached, Not a Dine Rewards member?, Join now at DINE-REWARDS.COM, ", {"entities": [(17, 35, "ORG")]}),
    ("Flying Saucer Draught Emporium, 328 Morgan Raleigh, NC, Server: Hope 10/30/7 Guests: 0, 10/04/2018 8:26 PM 20068, L10- Cocktail, 8.00, L10- Classic Daiquiri 1/2 Nacho Libre-r L10- Liqueur, L10- Baily’s Irish Cream L10- Rocks, Subtotal, Tax, Total, 5.50, 8.00, 21.50, 0.45, 21.95, Balance Due\t21., T»p: 3,zT., If you pay with debit card, your bank may hold additional funds temporarily. This is not a charge from Flying Saucer, www. beerknurd .com Taxi Taxi - 919.333.3333", {"entities": [(0, 30, "ORG")]}),
]


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)

def main(model='en_core_web_sm', output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])


if __name__ == "__main__":
    plac.call(main('en_core_web_sm', Path.cwd(), 100))

After executing the code he showed me this error which I could not find any reference, even though the new model was generated, but when I tried it I only recognized the entities that were used as training (TRAIN_DATA) and I should also have recognized the entities of the base model 'en_core_web_sm' by Spacy.

Traceback (most recent call last):
  File "train.py", line 105, in <module>
    plac.call(main('en_core_web_sm', Path.cwd(), 100))
  File "C:\ProgramData\Anaconda3\lib\site-packages\plac_core.py", line 324, in call
    parser = parser_from(obj)
  File "C:\ProgramData\Anaconda3\lib\site-packages\plac_core.py", line 133, in parser_from
    parser.populate_from(obj)
  File "C:\ProgramData\Anaconda3\lib\site-packages\plac_core.py", line 248, in populate_from
    self._set_func_argspec(func)
  File "C:\ProgramData\Anaconda3\lib\site-packages\plac_core.py", line 240, in _set_func_argspec
    self.argspec = getargspec(obj)
  File "C:\ProgramData\Anaconda3\lib\site-packages\plac_core.py", line 38, in getargspec
    str(callableobj))
TypeError: Could not determine the signature of None

Upvotes: 2

Views: 783

Answers (1)

Daniel Lopez
Daniel Lopez

Reputation: 41

I solve this removing plac.

if __name__ == "__main__":
    call(main('en_core_web_sm', Path.cwd(), 100))

Upvotes: 2

Related Questions