Reputation: 95
I am using spaCy's NER model "en_core_web_lg". This is the basic function that I have.
def find_pii_in_text(text):
    """Run spaCy NER over ``text`` and return the flagged entities.

    The text is first stripped of ``<...>`` tag-like spans and has each run
    of non-ASCII characters replaced by a single space; the cleaned text is
    then parsed and its entities are collected by ``process_flagged_data``.

    Args:
        text: Raw input string, possibly containing HTML tags.

    Returns:
        The dict populated by ``process_flagged_data``.
    """
    # The model is loaded once and kept in the module-level ``nlp``.
    # NOTE(review): assumes ``nlp`` is already defined (e.g. as None) at
    # module level -- confirm, otherwise the check below raises NameError.
    global nlp
    if not nlp:
        nlp = spacy.load("en_core_web_lg")

    found = {}

    # Drop tag-like spans, then squash non-ASCII runs to a space.
    without_tags = re.sub("<.*?>", "", text)
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", without_tags)

    parsed = nlp(ascii_only)
    flagged = process_flagged_data(parsed, found)
    found.update(flagged)
    return found
def process_flagged_data(doc_list, pii_dict=None):
    """Collect the named entities of a parsed document into ``pii_dict``.

    Args:
        doc_list: Object exposing an ``ents`` iterable whose items carry
            ``label_`` and ``text`` attributes.
        pii_dict: Optional dict to add entries to (mutated in place). A new
            dict is created when omitted.

    Returns:
        The dict, keyed by ``"<label> : <text>"``. Each value maps
        ``"n<idx>"`` (the entity's position in ``doc_list.ents``) to a
        ``{"text": ..., "type": ...}`` record.
    """
    # Create the dict per call: a ``{}`` default would be shared by every
    # call that omits the argument.
    if pii_dict is None:
        pii_dict = {}

    for idx, entity in enumerate(doc_list.ents):
        # Ignore some labels
        pii_info = f"{entity.label_} : {entity.text}"
        # Group records under the label/text pair.
        # NOTE(review): assumes ``pii_info`` is the intended grouping key --
        # confirm against the original design.
        pii_dict.setdefault(pii_info, {})[f"n{idx}"] = {
            "text": entity.text,
            "type": entity.label_,
        }
    return pii_dict
Now when I pass sentences to this function, it flags text like 'fingers', 'spine', 'groin', and 'torso' as PERSON (note that 'finger' is not flagged, only 'fingers'), which is obviously a false positive.
How do I resolve this? Any suggestions?
Upvotes: 0
Views: 29