shahid khan

Reputation: 439

Why does spaCy forget previously trained data, and how can this be solved?

I am trying to train a spaCy NER model. I have a dataset with 2,940 rows and trained a base model (call it current_model) on it. I then received another 10 distinct datasets, each with 200 to 530 rows, so I loaded current_model with spacy.load("current_model") and trained it on each dataset in turn. When I predict entities on test data, the model recognizes entities from the newest dataset but seems to have forgotten the entities from the oldest datasets. I took this incremental approach to reduce training time. Please see my code below to see what I have tried.

Code for base model training

import spacy
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse
from spacy import displacy
from cytoolz import partition_all
from bs4 import BeautifulSoup
from pathlib import Path
from datetime import datetime
from os import path
import pandas as pd
import random
import re
import time
import os
import shutil
import json


df = pd.read_csv("new_annotations/dataset_transfer_learning1.csv")


def populate_train_data(df):
    train_data = []
    i = 0
    for d_index, row in df.iterrows():
        print(row["annotations"])
        content = row["annotations"].replace("\\n", "\n").replace("\n", " ")
        content = re.sub(r"(?<=[:])(?=[^\s])", r" ", content)

        # Find the tags and entities and store the offsets in an entity list
        soup = BeautifulSoup(content, "html.parser")
        text = soup.get_text()
        entities = []
        for tag in soup.find_all():
            if tag.string is None:
                # fail silently for an invalid tag
                print(f'Tagging is invalid: {row["_id"], tag.name}, on row {i+2}, skipping..')
                continue

            tag_index = content.split(str(tag))[0].count(tag.string)
            try:
                for index, match in enumerate(re.finditer(tag.string.replace("*", " "), text)):
                    if index == tag_index:
                        entities.append((match.start(), match.end(), tag.name))
            except Exception as e:
                print(e, f"at line no {i+2}")
                continue
        i += 1
        if entities:
            train_data.append((text, {"entities": entities}))
    return train_data


def train(training_data, old_training_data=None, model_name=None):
    pretrained_weights = Path('weights/model999.bin')
    if model_name is not None:
        # note: spacy.load() does not take a weights argument; the tok2vec
        # weights would have to be loaded separately (see the commented block below)
        nlp = spacy.load(model_name)
    else:
        print("no model specified, using default model")
        nlp = spacy.load("en_core_web_sm")

    if "ner" not in nlp.pipe_names:
        print("there is no ner, creating ner")
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        print("there is ner")
        ner = nlp.get_pipe("ner")

    for _, annotations in training_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    start_time = time.time()
    if model_name is not None:
        # nlp.resume_training()
        # TRAINING_DATA = populate_train_data(pd.read_csv(old_training_data))
        TRAINING_DATA = old_training_data

        # re-annotate the old texts with the model's current predictions
        revision_data = []
        for doc in nlp.pipe(list(zip(*TRAINING_DATA))[0]):
            tags = [w.tag_ for w in doc]
            heads = [w.head.i for w in doc]
            deps = [w.dep_ for w in doc]
            entities = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
            revision_data.append((doc, GoldParse(doc, entities=entities)))

        fine_tune_data = []
        for raw_text, entity_offsets in training_data:
            doc = nlp.make_doc(raw_text)
            try:
                gold = GoldParse(doc, entities=entity_offsets['entities'])
            except ValueError:
                continue  # skip examples whose offsets do not align
            fine_tune_data.append((doc, gold))

        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
        optimizer = nlp.entity.create_optimizer()
        with nlp.disable_pipes(*other_pipes):
            # pretrained_weights = Path('weights/model999.bin')
            # with pretrained_weights.open("rb") as file_:
            #     ner.model.tok2vec.from_bytes(file_.read())
            for i in range(20):
                example_data = revision_data + fine_tune_data
                losses = {}
                random.shuffle(example_data)
                for batch in partition_all(2, example_data):
                    docs, golds = zip(*batch)
                    try:
                        nlp.update(docs, golds, sgd=optimizer, losses=losses)
                    except ValueError:
                        pass
                # print(losses)
    else:
        for i in range(20):
            random.shuffle(training_data)
            correct = 1
            for text, annotations in training_data:
                try:
                    nlp.update([text], [annotations])
                    print(correct)
                    correct += 1
                except ValueError:
                    pass  # print("skipping..")
            print("*" * i)

    end_time = time.time()
    print("this code took {}".format(end_time - start_time))
    return nlp


def save_to_directory(nlp, directory_names):
    for directory in directory_names:
        if directory is not None:
            directory_full_path = Path(directory + "_" + datetime.today().strftime('%Y_%m_%d'))
            if path.exists(directory_full_path):
                shutil.rmtree(directory_full_path)
                print("folder already existed so removed")
            if not directory_full_path.exists():
                directory_full_path.mkdir()
            nlp.to_disk(directory_full_path)
            print("Saved model to output directory", directory)


if __name__ == "__main__":
    training_data = populate_train_data(df)
    # training_data = [
    #     ("I Like Today and Evening", {"entities": [(7, 12, "DAY"), (17, 24, "DAY")]}),
    #     ("Today is my lucky day", {"entities": [(1, 5, "DAY")]}),
    #     ("Yesterday and Today are two same days of a month", {"entities": [(14, 19, "DAY")]}),
    #     ("May Today is Best Day", {"entities": [(4, 9, "DAY")]}),
    #     ("Have a Nice Today and Every Day", {"entities": [(12, 17, "DAY")]}),
    #     ("Hey How are feeling Today", {"entities": [(20, 25, "DAY")]}),
    # ]
    # print(training_data)
    nlp = train(training_data)
    save_to_directory(nlp, ["trained_model_with_transfer_learning"])



# TODO: train using batches
# TODO: add a dropout rate
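Both TODOs map onto spaCy v2 utilities that are already imported at the top of the file (minibatch and compounding). A minimal sketch of what a batched loop with dropout could look like, assuming the same nlp and training_data as above; train_batched is a hypothetical helper, not part of the original script:

from spacy.util import minibatch, compounding
import random

def train_batched(nlp, training_data, epochs=20):
    # create_optimizer (as the script above uses) keeps the loaded weights,
    # unlike nlp.begin_training(), which re-initialises them
    optimizer = nlp.entity.create_optimizer()
    for _ in range(epochs):
        random.shuffle(training_data)
        losses = {}
        # batch size grows from 4 to 32 as training progresses
        for batch in minibatch(training_data, size=compounding(4.0, 32.0, 1.001)):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=0.5, sgd=optimizer, losses=losses)
        print("Losses:", losses)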


Code for training with a new dataset and saving it to another directory

Note: the code below is in a new file.


import spacy
from spacy import displacy
import pandas as pd
from annotations_training_spacy_31_oct_2019 import populate_train_data, train, save_to_directory


test_texts = "I Like Today and Evening"

base_training_data = [
    ("I Like Today and Evening", {"entities": [(7, 12, "DAY"), (17, 24, "DAY")]}),
    ("Today is my lucky day", {"entities": [(1, 5, "DAY")]}),
    ("Yesterday and Today are two same days of a month", {"entities": [(14, 19, "DAY")]}),
    ("May Today is Best Day", {"entities": [(4, 9, "DAY")]}),
    ("Have a Nice Today and Every Day", {"entities": [(12, 17, "DAY")]}),
    ("Hey How are feeling Today", {"entities": [(20, 25, "DAY")]}),
]

test_text = test_texts

new_data_set = [
    ("Today is an Awsome Day", {"entities": [(1, 5, "DAY")]}),
]

nlp = train(training_data=new_data_set, old_training_data=base_training_data,
            model_name="trained_model_with_transfer_learning_8_2019_12_05")
save_to_directory(nlp, ["trained_model_with_transfer_learning_9"])

# the sentencizer has to be added before the text is processed (and ahead of
# the parser), otherwise doc.sents is computed without it
nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)

doc = nlp(test_text)
print("ENTITIES in '%s'" % test_text)
sentences = list(doc.sents)
for ent in doc.ents:
    print(ent.label_, ent.text)

displacy.serve(sentences, style='ent')


As you can see, I also tried loading the old datasets' tags (the revision_data step above), but I still have this problem. A minimal sketch of that idea is below.
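To make the idea explicit: the revision_data step is a form of pseudo-rehearsal. The old texts are labelled with the model's own current predictions, and those self-labelled examples are mixed into every update, so the weights are pulled back toward the old behaviour while the new labels are learned. A standalone sketch, assuming spaCy v2; old_texts and new_examples are hypothetical placeholders:

import random

def build_revision_data(nlp, old_texts):
    # use the model's own predictions on the old texts as pseudo-gold
    revision = []
    for doc in nlp.pipe(old_texts):
        ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
        revision.append((doc.text, {"entities": ents}))
    return revision

def update_with_rehearsal(nlp, old_texts, new_examples, epochs=20):
    # mix self-labelled old examples with the genuinely new annotations
    examples = build_revision_data(nlp, old_texts) + new_examples
    optimizer = nlp.entity.create_optimizer()
    for _ in range(epochs):
        random.shuffle(examples)
        losses = {}
        for text, annotations in examples:
            nlp.update([text], [annotations], drop=0.35, sgd=optimizer, losses=losses)
        print(losses)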

I know others have faced this problem; if anybody has solved it, please help me.

Thanks in advance for your help, friends.


Upvotes: 1

Views: 695

Answers (1)

Syenix

Reputation: 208

Are you training a new model or appending onto an existing spaCy model? If you are doing the latter, the network's learnt weights and features will be unlearnt and misaligned, resulting in a loss of accuracy. I am saying this from experience, from when I wanted to train Korean and Japanese names that spaCy could not identify. You can also try FastText, Flair, and Polyglot and see whether they achieve your purpose; combine the output sets of all these tools and you should get good results. That is the solution I used in the end.
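For what it's worth, spaCy v2.2 added experimental built-in support for exactly this problem: resume_training() together with rehearse() interleaves rehearsal updates on raw text, penalising the model for drifting away from its previous predictions. A rough sketch, not tested against the asker's data; train_data (new annotated examples) and raw_texts (unlabelled texts resembling the old data) are placeholders:

import random
import spacy

# train_data: list of (text, {"entities": [...]}) pairs for the new dataset
# raw_texts: list of plain strings similar to the original training data
nlp = spacy.load("current_model")   # the previously trained model
optimizer = nlp.resume_training()   # keeps the existing weights (v2.2+)

for epoch in range(20):
    random.shuffle(train_data)
    losses = {}
    for text, annotations in train_data:
        # regular update on the new annotations
        nlp.update([text], [annotations], drop=0.35, sgd=optimizer, losses=losses)
        # rehearsal update on raw text: nudges predictions back toward
        # what the model produced before fine-tuning
        raw_doc = nlp.make_doc(random.choice(raw_texts))
        nlp.rehearse([raw_doc], sgd=optimizer, losses=losses)
    print("Losses:", losses)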

Upvotes: 1
