Abe
Abe

Reputation: 41

How to merge same consecutive entity types using Spacy

This is a sample that uses entity_ruler to create patterns, but I want to merge consecutive entities of the same type into a single entity and a single token.

import spacy
from spacy.pipeline import EntityRuler
from spacy.util import filter_spans

# The pipeline has to exist before nlp.make_doc() is called in the loop below;
# without this line the script stops with a NameError on `nlp`.
nlp = spacy.load("en_core_web_sm")

ent_list_sample = ["brain", "ischimia", "heart failufe", "parenchyma"]


print("Adding patterns to EntityRuler:\n-----------")
patterns = []
for concept in ent_list_sample:
    # Tokenize only (no pipeline components run) to see how many tokens
    # the concept splits into.
    doc = nlp.make_doc(concept)
    if len(doc) > 1:
        # Multi-token concept: one token pattern entry per token, matched on
        # the lower-cased token text.
        patterns.append({"label": "SCI", "pattern":[{"LOWER":term.text.lower()} for term in doc]})
    else:
        # Single-token concept: the lower-cased text is used as a string pattern.
        patterns.append({"label": "SCI", "pattern":doc.text.lower()})
ruler = EntityRuler(nlp)
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)


doc = nlp("It has a brain and also might have brain parenchyma ")
print("Entities:")
print(doc.ents)

output: (brain, brain, parenchyma)
expected: (brain, brain parenchyma)

PS: How can we reach the expected output without adding an extra pattern for "brain parenchyma"?

Upvotes: 1

Views: 1851

Answers (1)

Abe
Abe

Reputation: 41

import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.pipeline import EntityRuler
from spacy.util import filter_spans
from spacy.pipeline import merge_entities

nlp = spacy.load("en_core_web_sm")


def _ruler_patterns(terms, label):
    """Build one EntityRuler pattern dict per term, all tagged with ``label``.

    A term that tokenizes into more than one token becomes a list of token
    patterns matched on the lower-cased token text; any other term becomes
    its lower-cased text as a plain string pattern.
    """
    built = []
    for term in terms:
        tokens = nlp.make_doc(term)
        if len(tokens) <= 1:
            spec = tokens.text.lower()
        else:
            spec = [{"LOWER": tok.text.lower()} for tok in tokens]
        built.append({"label": label, "pattern": spec})
    return built


ent_list_sample = ['algorithm', 'data', 'engineering', 'software']
patterns = _ruler_patterns(ent_list_sample, "SCI")

ent_list_sample1 = ["brain", "ischimia", "heart failufe", "parenchyma"]
patterns1 = _ruler_patterns(ent_list_sample1, "HE")

# One ruler carries both label sets and is inserted before the "ner" component.
ruler = EntityRuler(nlp)
ruler.add_patterns(patterns + patterns1)
nlp.add_pipe(ruler, before="ner")


class EntityRetokenizeComponent:
    """Pipeline component that fuses adjacent entities sharing a label.

    Two entities are fused when the second starts exactly where the first
    ends and both carry the same label. Runs of any length collapse into a
    single entity, and the fused span always begins at the start of the
    preceding entity, so multi-token predecessors are kept whole.
    """

    def __init__(self, nlp):
        """Accept the pipeline object for call compatibility; it is not used."""
        pass

    def __call__(self, doc):
        """Rewrite ``doc.ents`` with adjacent same-label entities merged.

        Args:
            doc: The document whose ``ents`` are read and then replaced.

        Returns:
            The same ``doc`` object, with ``doc.ents`` reassigned.
        """
        merged = []
        # Assumes doc.ents yields entities in document order without overlaps
        # (spaCy's documented behaviour) -- the run detection relies on it.
        for ent in doc.ents:
            previous = merged[-1] if merged else None
            if (
                previous is not None
                and previous.label == ent.label
                and previous.end == ent.start
            ):
                # Grow the current run in place. `previous` may itself be the
                # result of an earlier merge, which is what lets runs of three
                # or more entities end up as one span.
                merged[-1] = Span(doc, previous.start, ent.end, label=ent.label)
            else:
                merged.append(ent)
        doc.ents = merged
        return doc
    
    
# Register the same-label merger at the end of the pipeline as
# 'merge_phrases', followed by merge_entities so that it runs after it.
retokenizer = EntityRetokenizeComponent(nlp)
nlp.add_pipe(retokenizer, last=True, name='merge_phrases')
nlp.add_pipe(merge_entities, last=True)
nlp.pipe_names

doc = nlp("I love Ann is good as well  data software is good for brain parenchyma and Apple is good company")
# Collect (text, label) for every entity found, then print them as one list.
labelled_entities = []
for span in doc.ents:
    labelled_entities.append((span.text, span.label_))
print(labelled_entities)



This gave me the output I wanted:

[('Ann', 'PERSON'), ('data software', 'SCI'), ('brain parenchyma', 'HE'), ('Apple', 'ORG')]

Upvotes: 3

Related Questions