Reputation: 311
Pandas takes about 25 milliseconds per call to the .loc[] lookup below. The dataframe has about 185,000 rows and the lookup runs for each of them, so my program takes about an hour and a half to run. How can I speed it up?
This is how I am using it and how I can tell how long it takes:
m_time = time.time()
element = dataframe.loc[(dataframe['text'] == t) & (dataframe['text1'] == t1)]
print(time.time() - m_time)
The terminal then prints:
0.026927947998046875
0.02593064308166504
0.026927709579467773
0.025930166244506836
0.026927471160888672
0.026927709579467773
0.026927947998046875
0.024932861328125
0.02593064308166504
0.02593088150024414
0.026927709579467773
0.02593088150024414
0.02593064308166504
...
Edit
Here is the entire function. The timing statements around the dataframe .loc[] operation have been removed.
import json
import time
import pandas
import xml.etree.ElementTree as ET

# conn is an already-open sqlite3 connection created elsewhere in the module.

def parse_jmdict(xml_file='JMdict_e.xml', furigana_file='JmdictFurigana.json'):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    c = conn.cursor()
    entries_processed = 0
    start_time = time.time()
    print("Starting parsing of {0}".format(furigana_file))
    json_file = open(furigana_file, encoding="utf-8-sig")
    json_str = json_file.read()
    furigana_list = json.loads(json_str)
    furigana_dataframe = pandas.DataFrame(furigana_list)
    print("Starting parsing of {0}".format(xml_file))
    # For every entry
    for entry in root.findall('entry'):
        entry_id = entry.find('ent_seq').text
        for k_ele in entry.findall('k_ele'):  # For every Kanji Element in an entry
            keb = k_ele.find('keb').text
            c.execute('INSERT INTO Jmdict_Kanji_Element (ENTRY_ID, VALUE) VALUES (?, ?)', (entry_id, keb))
            ke_pri = k_ele.findall('ke_pri')
            for priority in ke_pri:
                c.execute('INSERT INTO Jmdict_Priority (ENTRY_ID, VALUE, TYPE) VALUES (?, ?, ?)', (entry_id, priority.text, 'Kanji_Element'))
        for r_ele in entry.findall('r_ele'):  # For every Reading Element in an entry
            reb = r_ele.find('reb').text
            re_nokanji = r_ele.find('re_nokanji')
            re_nokanji_value = 0
            if re_nokanji is not None:
                re_nokanji_value = 1
            c.execute('INSERT INTO Jmdict_Reading_Element (ENTRY_ID, VALUE, NO_KANJI) VALUES (?, ?, ?)', (entry_id, reb, re_nokanji_value))
            c.execute('SELECT last_insert_rowid()')
            r_ele_id = c.fetchone()[0]
            for re_restr in r_ele.findall('re_restr'):
                c.execute('INSERT INTO Jmdict_Reading_Relation (ENTRY_ID, READING_ELEMENT_ID, VALUE) VALUES (?, ?, ?)', (entry_id, r_ele_id, re_restr.text))
            for k_ele in entry.findall('k_ele'):
                keb = k_ele.find('keb').text
                # The slow line: one boolean-mask scan of the whole dataframe per (keb, reb) pair
                element_furigana = furigana_dataframe.loc[(furigana_dataframe['text'] == keb) & (furigana_dataframe['reading'] == reb)]
                try:
                    element_furigana_dict_list = element_furigana.iloc[0]["furigana"]
                    for i in range(len(element_furigana_dict_list)):
                        c.execute('INSERT INTO Jmdict_Furigana_Bottom (ENTRY_ID, READING_ELEMENT_ID, FURIGANA_PORTION_INDEX, VALUE) VALUES (?, ?, ?, ?)', (entry_id, r_ele_id, i, element_furigana_dict_list[i]["ruby"]))
                        furigana_top = 0
                        if "rt" in element_furigana_dict_list[i].keys():
                            furigana_top = element_furigana_dict_list[i]["rt"]
                        c.execute('INSERT INTO Jmdict_Furigana_Top (ENTRY_ID, READING_ELEMENT_ID, FURIGANA_PORTION_INDEX, VALUE) VALUES (?, ?, ?, ?)', (entry_id, r_ele_id, i, furigana_top))
                except IndexError:
                    pass
            for priority in r_ele.findall('re_pri'):
                c.execute('INSERT INTO Jmdict_Priority (ENTRY_ID, VALUE, TYPE) VALUES (?, ?, ?)', (entry_id, priority.text, 'Reading_Element'))
        for sense in entry.findall('sense'):  # For every Sense element in an entry
            c.execute('INSERT INTO Jmdict_Sense_Element (ENTRY_ID) VALUES (?)', (entry_id,))
            c.execute('SELECT last_insert_rowid()')
            sense_id = c.fetchone()[0]
            pos_list = sense.findall('pos')
            for pos in pos_list:
                c.execute('INSERT INTO Jmdict_Sense_Pos (SENSE_ID, VALUE) VALUES (?, ?)', (sense_id, pos.text))
            gloss_list = sense.findall('gloss')
            for gloss in gloss_list:
                c.execute('INSERT INTO Jmdict_Gloss (ENTRY_ID, SENSE_ID, VALUE) VALUES (?, ?, ?)', (entry_id, sense_id, gloss.text))
            misc_list = sense.findall('misc')
            for misc in misc_list:
                c.execute('INSERT INTO Jmdict_Sense_Miscellaneous (SENSE_ID, VALUE) VALUES (?, ?)', (sense_id, misc.text))
            field_list = sense.findall('field')
            for field in field_list:
                c.execute('INSERT INTO Jmdict_Sense_Field (SENSE_ID, VALUE) VALUES (?, ?)', (sense_id, field.text))
            dialect_list = sense.findall('dial')
            for dial in dialect_list:
                c.execute('INSERT INTO Jmdict_Sense_Dialect (SENSE_ID, VALUE) VALUES (?, ?)', (sense_id, dial.text))
        entries_processed += 1
        if entries_processed % 5 == 0:
            print(str(entries_processed) + " entries processed")
    conn.commit()
    print("Completed parsing of {0}".format(xml_file))
    print("Number of entries parsed: {0}".format(entries_processed))
    print("--- %s seconds ---" % (time.time() - start_time))
Upvotes: 0
Views: 481
Reputation: 4673
You can improve the execution time of that line by setting an index, rather than using a boolean series lookup.
Here is a snippet from the function:
element_furigana = furigana_dataframe.loc[(furigana_dataframe['text'] == keb) &
                                          (furigana_dataframe['reading'] == reb)]
element_furigana_dict_list = element_furigana.iloc[0]["furigana"]
This is doing much more work than it needs to: it is filtering all columns of the entire dataframe, when it really needs one column value from the first (only?) row matching those criteria.
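To make that concrete, here is one way to grab only the needed value (an untested sketch; still a boolean mask, so it trims the work rather than removing the scan, and keb and reb stand for the values being looked up):
# Still a boolean-mask scan, but it extracts only the single needed value
# instead of materialising a filtered copy of every column.
mask = (furigana_dataframe['text'] == keb) & (furigana_dataframe['reading'] == reb)
element_furigana_dict_list = furigana_dataframe.loc[mask, 'furigana'].iloc[0]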
Since there is no reproducible way to test this, I generated some test data:
import itertools
import string
import timeit

import pandas

records = itertools.product(*[string.ascii_lowercase]*3, string.digits)
df = pandas.DataFrame.from_records([(''.join(t[0:2]), ''.join(t[2:4]), hash(t))
                                    for t in records],
                                   columns=['text', 'reading', 'furigana'])
df.shape is (175760, 3), which is close to the size of your dataframe (by design).
Printing the head and tail gives:
text reading furigana
0 aa a0 -5422069581675220239
1 aa a1 -653727266625336176
2 aa a2 3207829723004107725
3 aa a3 4314664729972606463
175756 zz z6 3818139299469014399
175757 zz z7 4401450686737676603
175758 zz z8 1773038865889527139
175759 zz z9 3066148500118677746
There are two approaches to try:
1. a plain dictionary lookup, where keys are (text, reading) tuples and values come from the furigana column (sketched just below); and
2. an indexed lookup, where the text and reading columns are turned into an index.
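Approach 1 is not timed below, but a minimal sketch of it, assuming the same test df as above, could look like this:
# Approach 1 (sketch): pay a one-time O(n) cost to build a plain dict,
# after which each lookup is a constant-time hash probe.
furigana_by_key = {(text, reading): furigana
                   for text, reading, furigana
                   in zip(df['text'], df['reading'], df['furigana'])}

furigana_by_key[('zz', 'a4')]  # raises KeyError if the pair is absent
For approach 2, we create a Series like this: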
s = df.set_index(['text', 'reading'])['furigana']
text reading
aa a0 -5422069581675220239
a1 -653727266625336176
a2 3207829723004107725
a3 4314664729972606463
zz z6 3818139299469014399
z7 4401450686737676603
z8 1773038865889527139
z9 3066148500118677746
Name: furigana, dtype: int64
The results on my machine:
keb = 'zz'
reb = 'a4'
scope = dict(df=df, s=s, keb=keb, reb=reb)

timeit.timeit("df[(df['text'] == keb) & (df['reading'] == reb)]",
              globals=scope, number=1000)
Result: 19.1124583, or about 0.019 seconds per call.
timeit.timeit("s.loc[(keb, reb)]", globals=scope, number=1000)
Result: 0.042754, or about 0.000043 seconds per call. That is roughly 450 times faster than the boolean-mask lookup.
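One caveat if you swap the indexed Series into parse_jmdict(): a missing (keb, reb) pair raises KeyError there, while the original boolean-mask code fell through to IndexError, so the except clause needs updating. An untested sketch of the adjustment:
# Build the indexed Series once, before the entry loop:
furigana_series = furigana_dataframe.set_index(['text', 'reading'])['furigana']

# Inside the loop, replace the boolean-mask lookup and catch KeyError:
try:
    element_furigana_dict_list = furigana_series.loc[(keb, reb)]
except KeyError:
    element_furigana_dict_list = None  # no furigana entry for this pair
Note that if the same (text, reading) pair can occur more than once in the real data, the .loc[] call returns a Series rather than a scalar; taking .iloc[0] of that result would mirror the original behaviour.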
Upvotes: 1