Reputation:
I have a SQL table with 12,000 entries stored in a dataframe df1 which looks like this:
id | name |
---|---|
00001 | angiocarcoma |
00261 | shrimp allergy |
and i have another table with 20000 entries which is stored in dataframe df:
Entry_name | CA |
---|---|
TRGV2 | 3BHS1 HSD3B1 3BH HSDB3 |
TRGJ1 | 3BP1 SH3BP1 IF |
The aim is to match, within a sentence, each possible combination of a name from df1 with a CA token from df (the CA cell is split on " " spaces), with the condition that the length of a CA token must be greater than 2. The simplest logic would be to search for all the name values from df1 in the sentence and, if a match is found, then search for the CA values in the same sentence. But doing that I am limited by resource usage.
Following is the code which I have tried, and I can only think of nested loops to accomplish the task. If I use two functions then I am creating function-call overhead, and if I try to do it recursively then I exceed the recursion limit in Python, which forces the kernel to shut off. The following function is called by passing a sentence (I have to parse 500k sentences) to it:
def disease_search(nltk_tokens_sen):
    """Scan one sentence for every disease name in df1; when a name is
    found as a whole word, delegate biomarker matching to disease_marker.

    Parameters
    ----------
    nltk_tokens_sen : str
        The sentence (possibly multi-line) to search.
    """
    for dis_index in range(len(df1)):
        disease_name = df1.at[dis_index, 'name']
        # Fix: escape the name so regex metacharacters (e.g. parentheses in
        # a disease name) cannot break the pattern -- the sibling
        # disease_marker already escapes its tokens.
        regex_for_dis = rf"\b{re.escape(disease_name)}\b"
        matches_for_dis = re.findall(regex_for_dis, nltk_tokens_sen,
                                     re.IGNORECASE | re.MULTILINE)
        if matches_for_dis:
            disease_marker(nltk_tokens_sen, disease_name)
and this function is called if the above function finds a match:
def disease_marker(nltk_tokens_sen, disease_name):
    """Print every biomarker token from df['CA'] (length > 2) that occurs
    as a whole word in the sentence, paired with the matched disease."""
    for row in range(len(df)):
        for token in df.at[row, 'CA'].split(" "):
            if len(token) <= 2:
                continue  # tokens of 1-2 chars are not considered markers
            if re.findall(rf"\b{re.escape(token)}\b", nltk_tokens_sen):
                print("Match_found:", disease_name, token)
Do i need need to change my logic completely or is there a Pythonic runtime efficent way to achieve it?
Upvotes: 0
Views: 463
Reputation: 17156
Optimizations and Modifications
Result:
New Code
File: main.py
##################################################################
# Imports keyword data from simulate_data module and processes
# on nltk sentence
##################################################################
if __name__ == "__main__":
    from process import Finder, format_result
    from simulate_data import df_disease, df_biomarker

    # Example sentence to scan for disease/biomarker co-occurrences.
    sentence = 'very hard angiocarcoma diagnosed 3BHS1'

    # Build the automaton-backed search engine once, then stream the
    # (disease, biomarker) result pairs through the formatter.
    engine = Finder(df_disease, df_biomarker)
    for disease, marker in engine.process(sentence):
        format_result(disease, marker)
File: process.py
from itertools import product
import ahocorasick as ahc
def make_aho_automaton(keywords):
    """Build and return an Aho-Corasick automaton from (key, category) pairs.

    Each match later reports (category, key) so callers know both which
    keyword fired and what it maps to.
    """
    automaton = ahc.Automaton()
    for key, category in keywords:
        automaton.add_word(key, (category, key))
    automaton.make_automaton()  # finalize: builds the failure links
    return automaton
class Finder():
    """Finds disease names and biomarker CA tokens in sentences using two
    Aho-Corasick automatons, built once and reused for every sentence."""

    def __init__(self, df_disease, df_biomarker):
        """Build the disease and biomarker automatons from the dataframes."""
        # (name, id) pairs; names lowered to normalize the keys.
        # Underscored variable names avoid shadowing builtins such as `id`.
        disease_keys = ((_name.lower(), _id)
                        for _name, _id in zip(df_disease['name'], df_disease['id']))
        # (CA token, Entry_name) pairs; only tokens of length >= 3 qualify.
        biomarker_keys = ((token, _entry_name)
                          for _ca, _entry_name in zip(df_biomarker['CA'],
                                                      df_biomarker['Entry_name'])
                          for token in _ca.split() if len(token) >= 3)
        # Surround every key with spaces so matches land only on word boundaries.
        self.disease_automaton = make_aho_automaton(
            (f' {keyw} ', cat) for keyw, cat in disease_keys)
        self.biomarker_automaton = make_aho_automaton(
            (f' {keyw} ', cat) for keyw, cat in biomarker_keys)

    def find_keywords(self, line, A):
        """Yield (keyword, category) for each hit of automaton A in `line`."""
        # Pad the line to mirror the key padding; strip the spaces back off
        # the reported keyword before yielding it.
        return ((keyw.strip(), cat) for _end, (cat, keyw) in A.iter(f' {line} '))

    def process(self, nltk_sentence):
        """Yield every (disease, biomarker) pair co-occurring in the sentence."""
        sentence = f' {nltk_sentence} '  # boundary spaces for edge keywords
        yield from product(self.find_keywords(sentence, self.disease_automaton),
                           self.find_keywords(sentence, self.biomarker_automaton))
def format_result(d, m):
    """Print one match: d is a (name, id) pair, m is a (CA, Entry_name) pair."""
    name, _id = d
    ca, entry = m
    return print(f'Match_found: Disease: name {name.title()} with id {_id}, Biomarker CA {ca} with Entry_name {entry}')
File: simulate_data.py
import string
from random import randint, choice
import pandas as pd
def random_word(min_length=2, max_length=5, upper=False):
    """Return one random word of min_length..max_length characters.

    upper=True draws from uppercase letters plus digits (biomarker style);
    upper=False draws from lowercase letters only (disease style).
    """
    alphabet = string.ascii_uppercase + string.digits if upper else string.ascii_lowercase
    length = randint(min_length, max_length)
    return ''.join(choice(alphabet) for _ in range(length))
def random_sentence(min_length=1, max_length=3, upper=False):
    """Return min_length..max_length random words joined by single spaces.

    upper=True yields uppercase-letter/digit words; upper=False yields
    lowercase-letter words.
    """
    word_count = randint(min_length, max_length)
    return ' '.join(random_word(upper=upper) for _ in range(word_count))
# Seed data: the two example rows from the question.
diseases = {'id': ['00001', '00261'],
            'name': ['angiocarcoma', 'shrimp allergy']}
biomarkers = {'Entry_name': ['TRGV2', 'TRGJ1'],
              'CA': ['3BHS1 HSD3B1 3BH HSDB3', '3BP1 SH3BP1 IF']}

# Append N random records to each table. Disease ids continue from 00300
# upward, zero-padded to five digits. (The interleaved loop preserves the
# order of random draws per record.)
N = 10000
for record_id in range(300, N + 300):
    diseases['id'].append(str(record_id).zfill(5))
    diseases['name'].append(random_sentence(min_length=1, max_length=5, upper=False))
    biomarkers['Entry_name'].append(random_word(4, 5, True))
    biomarkers['CA'].append(random_sentence(2, 6, True))

df_disease = pd.DataFrame(diseases)
df_biomarker = pd.DataFrame(biomarkers)
Output
Example 1 -- use on a single sentence nltk sentence
# Example 1: scan a single nltk sentence.
nltk_sentence = 'allergy very hard angiocarcoma diagnosed 3BH 3BHS1'
finder = Finder(df_disease, df_biomarker)  # build the automatons once
for d, m in finder.process(nltk_sentence):  # d = (name, id), m = (CA, Entry_name)
    format_result(d, m)
# Out:
# Match_found: Disease: name Angiocarcoma with id 00001, Biomarker CA 3BH with Entry_name TRGV2
# Match_found: Disease: name Angiocarcoma with id 00001, Biomarker CA 3BHS1 with Entry_name TRGV2
Example 2--looping over multiple nltk sentences
# Example 2: loop over multiple nltk sentences, one per line.
sentences = ("Very hard angiocarcoma diagnosed 3BHS1\n"          # match: lowercase disease + known biomarker
             "very hard Angiocarcoma diagnosed IF\n"             # no match: biomarker IF is too short (< 3 chars)
             "very hard angiocarcoma diagnosed 3BP0\n"           # no match: 3BP0 is not among the seeded CA tokens
             "Very hard Angiocarcoma diagnosed 3BP0 3BHS1\n"     # no match: capitalized Angiocarcoma misses the lowercased key
             "Fish allergy very hard angiocarcoma diagnosed 3BP0 3BHS1"
            )
for sentence in sentences.split('\n'):
    print(sentence)
    for d, m in finder.process(sentence):
        format_result(d, m)
    print()
# Out:
# Very hard angiocarcoma diagnosed 3BHS1
# Match_found: Disease: name Angiocarcoma with id 00001, Biomarker CA 3BHS1 with Entry_name TRGV2
#
# very hard Angiocarcoma diagnosed IF
#
# very hard angiocarcoma diagnosed 3BP0
#
# Very hard Angiocarcoma diagnosed 3BP0 3BHS1
#
# Fish allergy very hard angiocarcoma diagnosed 3BP0 3BHS1
# Match_found: Disease: name Angiocarcoma with id 00001, Biomarker CA 3BHS1 with Entry_name TRGV2
Timing Test
Used a dataset with over 10K records (see simulated_data.py above)
Modification of OP code (to make timing comparison fair)
Modified OP code (used to time original)
def disease_search(nltk_tokens_sen):
    """Yield (disease_name, biomarker) pairs for each disease name found in
    the sentence (generator version of the OP's code, used for timing)."""
    # Fix: iterate over df_disease, not df1 -- the original kept the stale
    # name from the OP's code after the dataframe was renamed, so the loop
    # bound came from a different (possibly undefined) table.
    for dis_index in range(len(df_disease)):
        disease_name = df_disease.at[dis_index, 'name']
        # re.escape guards against regex metacharacters in disease names.
        regex_for_dis = rf"\b{re.escape(disease_name)}\b"
        matches_for_dis = re.findall(regex_for_dis, nltk_tokens_sen,
                                     re.IGNORECASE | re.MULTILINE)
        if matches_for_dis:
            yield from disease_marker(nltk_tokens_sen, disease_name)
def disease_marker(nltk_tokens_sen, disease_name):
    """Yield (disease_name, token) for each CA token of df_biomarkers with
    length > 2 that appears as a whole word in the sentence."""
    for row in range(len(df_biomarkers)):
        for token in df_biomarkers.at[row, 'CA'].split(" "):
            if len(token) <= 2:
                continue  # 1-2 char tokens are excluded by the requirement
            if re.findall(rf"\b{re.escape(token)}\b", nltk_tokens_sen):
                yield disease_name, token
Timing Test
Using Jupyter Notebook 'Magic commands'
Timing New Code
%timeit finder = Finder(df_disease, df_biomarker) # Initialization
107 ms ± 4.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit list(finder.process(nltk_sentence)) # Per nltk string
6.57 µs ± 618 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Total: ~107 ms to setup and 6.57 us per nltk sentence
Timing Original Code
%timeit list(disease_search(nltk_sentence))
3.54 s ± 446 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Upvotes: 2
Reputation: 340
Based on the link you pasted in the comments, you are trying to loop through all the available disease names to find the diseases in a given paragraph of words. I suggest looping through the words of the paragraph instead and finding the matches in the DataFrame.
You can try doing following steps,
Split the nltk_tokens_sen to list of words and lets name it as nltk_tokens_words
Instead of looping through entire DataFrame for finding the matched rows from a given list of words, you can use DF string filters like match & contains. This will reduce the looping of entire DF.
# One boolean mask per word; NOTE(review): str.contains treats each word as
# a regex pattern by default -- presumably plain substring matching is
# intended (regex=False); verify against the caller's data.
filtered_rows = (df1['name'].str.contains(string) for string in nltk_tokens_words)
Create a combined mask using numpy and apply it to get the filtered DF:
# Stack the per-word masks and keep rows where ALL words matched.
# NOTE(review): .all(axis=0) requires every sentence word to appear in a
# name; .any(axis=0) may be the intended semantics -- confirm.
combined_mask = np.vstack(filtered_rows).all(axis=0)
df1[combined_mask]
Repeat the same steps for the second DF.
Try this and let me know if this helps for you!!
Upvotes: 0
Reputation: 588
Try this one and let me know. It should be more time-efficient because it uses faster-access structures (lists and dicts) than a pandas DataFrame and performs a fast preliminary selection of valid items without using the re library.
# necessary imports
import pandas as pd
import itertools
import re
# test dataframes
# --- test dataframes -------------------------------------------------
df1 = pd.DataFrame({
    'id': ['00001', '00261', '00002'],
    'name': ['angiocarcoma', 'shrimp allergy', 'fish allergy'],
})
df = pd.DataFrame({
    'Entry_name': ['TRGV2', 'TRGJ1', 'TRGJ2'],
    'CA': ['3BHS1 HSD3B1 3BH HSDB3', '3BP1 SH3BP1 IF', '3BP0'],
})

# --- fast-lookup structures (set() deduplicates for us) --------------
disease_list = list(set(df1['name']))
CA_list = list(set(df['CA']))
# Flatten the space-separated CA cells, then keep tokens longer than 2 chars.
valid_CA_list_tmp = list(itertools.chain.from_iterable(x.split() for x in CA_list))
valid_CA_list = [x for x in valid_CA_list_tmp if len(x) > 2]
# the function
def disease_search_v2(nltk_tokens_sen):
    """Return {disease_name: [CA tokens]} for every disease and CA token
    found in the sentence, or {} when either side has no hit.

    Two-stage match: a cheap case-insensitive substring pre-filter, then a
    whole-word regex confirmation (case-sensitive, as in the original).
    """
    # Hoisted: the original recomputed nltk_tokens_sen.lower() once per
    # list element inside both comprehensions.
    sentence_lower = nltk_tokens_sen.lower()
    found_diseases_preliminary = [x for x in disease_list if x.lower() in sentence_lower]
    found_CA_preliminary = [x for x in valid_CA_list if x.lower() in sentence_lower]
    # re.escape protects against regex metacharacters in names/tokens.
    found_diseases = [x for x in found_diseases_preliminary
                      if re.search(rf"\b{re.escape(x)}\b", nltk_tokens_sen)]
    found_CA = [x for x in found_CA_preliminary
                if re.search(rf"\b{re.escape(x)}\b", nltk_tokens_sen)]
    if found_diseases and found_CA:
        return {x: found_CA for x in found_diseases}
    return {}
# Testing cases. These are bare expressions: in a script the return values
# are discarded -- wrap in print(...) to see them. Expected results below
# assume only the three seeded rows (CA order follows valid_CA_list).
disease_search_v2('very hard angiocarcoma diagnosed 3BHS1')        # {'angiocarcoma': ['3BHS1']}
disease_search_v2('very hard angiocarcoma diagnosed IF')           # {} -- 'IF' was filtered out (len <= 2)
disease_search_v2('very hard angiocarcoma diagnosed 3BP0')         # {'angiocarcoma': ['3BP0']}
disease_search_v2('very hard angiocarcoma diagnosed 3BP0 3BHS1')   # {'angiocarcoma': [both tokens]}
disease_search_v2('fish allergy very hard angiocarcoma diagnosed 3BP0 3BHS1')  # both diseases share the CA list
disease_search_v2('fish allergy very hard angiocarcoma diagnosed 3BP0 3BHS1\nfish allergy very hard angiocarcoma diagnosed 3BP0 3BHS1\n')
Upvotes: 1