Reputation: 11
I've been stressing out on this problem for so long and I can't seem to find a solution.
I want to train a NER model to recognise animal and species names.
I created a mock training set to test it out. However, I keep getting a ValueError: [E973] Unexpected type for NER data
I have tried other solutions on other posts on StackOverflow, including:
spacy.load('en_core_web_sm')
instead of spacy.blank('en')
All of these result in the same error.
import os
import spacy
from spacy.lang.en import English
from spacy.training.example import Example
import random
def train_spacy(data, iterations = 30):
TRAIN_DATA = data
nlp = spacy.blank("en") #start with a blank model
if "ner" not in nlp.pipe_names:
ner = nlp.add_pipe("ner", last = True)
for _, annotations in TRAIN_DATA:
for ent in annotations.get("entities"):
ner.add_label(ent[2])
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
optimizer = nlp.begin_training()
for itn in range(iterations):
print ("Starting iterations "+str(itn))
random.shuffle(TRAIN_DATA)
losses = {}
for text, annotations in TRAIN_DATA:
doc = nlp.make_doc(text)
print(isinstance(annotations["entities"], (list,tuple))) #this prints True
example = Example.from_dict(doc, {"entities":annotations})
nlp.update(
[example],
drop = 0.2,
sgd = optimizer,
losses = losses
)
print(losses)
return (nlp)
if __name__ == "__main__":
#mock training set
TRAIN_DATA=[('Dog is an animal',{'entities':[(0,3,'ANIMAL')]}),
('Cat is on the table',{'entities':[(0,3,'ANIMAL')]}),
('Rats are pets',{'entities':[(0,4,'ANIMAL')]})]
nlp = train_spacy(TRAIN_DATA)
The error message
File "c:\...\summarizer\src\feature_extraction\feature_extraction.py", line 49, in <module>
nlp = train_spacy(TRAIN_DATA)
File "c:\...\summarizer\src\feature_extraction\feature_extraction.py", line 35, in train_spacy
example = Example.from_dict(doc, {"entities":annotations})
File "spacy\training\example.pyx", line 118, in spacy.training.example.Example.from_dict
File "spacy\training\example.pyx", line 24, in spacy.training.example.annotations_to_doc
File "spacy\training\example.pyx", line 388, in spacy.training.example._add_entities_to_doc
ValueError: [E973] Unexpected type for NER data```
Upvotes: 1
Views: 775
Reputation: 1709
I had the same problem when I migrated a code that I had from a 2.x version of spacy to a 3.x version since several things changed.
Also, in your case it looks like you have a mix of spacy 2.x and 3.x syntaxt. The next version of your code with a few changes work for me using spacy 3.2.1
import random
import spacy
from spacy.training import Example
def train_spacy(data, iterations=30):
TRAIN_DATA = data
# nlp = spacy.blank("en") # start with a blank model
nlp = spacy.load("en_core_web_lg")
if "ner" not in nlp.pipe_names:
ner = nlp.add_pipe("ner", last=True)
else:
ner = nlp.get_pipe("ner")
for _, annotations in TRAIN_DATA:
for ent in annotations.get("entities"):
ner.add_label(ent[2])
# other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
# with nlp.disable_pipes(*other_pipes):
losses = None
optimizer = nlp.create_optimizer()
for itn in range(iterations):
print("Starting iterations " + str(itn))
random.shuffle(TRAIN_DATA)
losses = {}
for text, annotations in TRAIN_DATA:
doc = nlp.make_doc(text)
print(isinstance(annotations["entities"], (list, tuple))) # this prints True
example = Example.from_dict(doc, annotations)
losses = nlp.update(
[example],
drop=0.2,
sgd=optimizer
)
print(losses)
return nlp
if __name__ == "__main__":
# mock training set
TRAIN_DATA = [('Dog is an animal', {'entities': [(0, 3, 'ANIMAL')]}),
('Cat is on the table', {'entities': [(0, 3, 'ANIMAL')]}),
('Rats are pets', {'entities': [(0, 4, 'ANIMAL')]})]
nlp = train_spacy(TRAIN_DATA)
Notice the following changes:
I changed your import of Example class to from spacy.training import Example
. I think you were importing the wrong clase.
I'm using en_core_web_lg
but with a blank model it should work too!
I commented other pipeline models disabling because in spacy 3.x pipeline is more complex and I think you can't disable the whole pipeline for NER task. How ever feel free to read official documentation and try if some of the other models are not needed.
Optimizer now is initialized using nlp.create_optimizer()
instead of nlp.begin_training()
Note that annotations are already a dictionary in the expected format so you don't need to wrap it in a new dictionary: Example.from_dict(doc, annotations)
should do the job.
Finally the loss now is returned as a result of model update instead of being passed as parameter.
I hope this help you and please ask questions if you need more help.
Best regards!
EDIT:
I also want to suggest some changes in your training script to take more advantage of spacy utils:
Use spacy.utilis.minibatch
util to create mini batches from your training data.
Pass a whole minibacth of examples to update method instead of a minibatch of only one example.
Your code including this improve among other minor changes would looks as follos:
import random
import spacy
from spacy.training import Example
def train_spacy(data, iterations=30):
TRAIN_DATA = data
# nlp = spacy.blank("en") # start with a blank model
nlp = spacy.load("en_core_web_lg")
if "ner" not in nlp.pipe_names:
ner = nlp.add_pipe("ner", last=True)
else:
ner = nlp.get_pipe("ner")
for _, annotations in TRAIN_DATA:
for ent in annotations.get("entities"):
ner.add_label(ent[2])
# Init loss
losses = None
# Init and configure optimizer
optimizer = nlp.create_optimizer()
optimizer.learn_rate = 0.001 # Change to some lr you prefers
batch_size = 32 # Choose batch size you prefers
for itn in range(iterations):
print("Starting iterations " + str(itn))
random.shuffle(TRAIN_DATA)
losses = {}
# Batch the examples and iterate over them
for batch in spacy.util.minibatch(TRAIN_DATA, size=batch_size):
# Create Example instance for each training example in mini batch
examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in batch]
# Update model with mini batch
losses = nlp.update(examples, drop=0.2, sgd=optimizer)
print(losses)
return nlp
if __name__ == "__main__":
# mock training set
TRAIN_DATA = [('Dog is an animal', {'entities': [(0, 3, 'ANIMAL')]}),
('Cat is on the table', {'entities': [(0, 3, 'ANIMAL')]}),
('Rats are pets', {'entities': [(0, 4, 'ANIMAL')]})]
nlp = train_spacy(TRAIN_DATA)
Upvotes: 2