Aden Diamond
Aden Diamond

Reputation: 311

Python Pandas takes about 25 milliseconds to use the loc[] function. How can I speed it up?

The Python Pandas takes about 25 milliseconds to use the loc[] function. I have about 185,000 rows in the dataframe. Therefore, it takes about one and a half hours to run my program. How can I speed it up?

This is how I am using it and how I can tell how long it takes:

m_time = time.time()
element = dataframe.loc[(dataframe['text'] == t) & (dataframe['text1'] == t1)]
print(time.time() - m_time)

The terminal then prints:

0.026927947998046875
0.02593064308166504
0.026927709579467773
0.025930166244506836
0.026927471160888672
0.026927709579467773
0.026927947998046875
0.024932861328125
0.02593064308166504
0.02593088150024414
0.026927709579467773
0.02593088150024414
0.02593064308166504
...

Edit

Here is the entire function. The timing statements around the dataframe .loc[] operation have been removed.

def parse_jmdict(xml_file='JMdict_e.xml', furigana_file='JmdictFurigana.json'):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    c = conn.cursor()
    entries_processed = 0
    start_time = time.time()

    print("Starting parsing of {0}".format(furigana_file))
    json_file = open(furigana_file, encoding="utf-8-sig")
    json_str = json_file.read()
    furigana_list = json.loads(json_str)
    furigana_dataframe = pandas.DataFrame(furigana_list)

    print("Starting parsing of {0}".format(xml_file))

    # For every entry
    for entry in root.findall('entry'):
        entry_id = entry.find('ent_seq').text

        for k_ele in entry.findall('k_ele'):  # For every Kanji Element in an entry
            keb = k_ele.find('keb').text
            c.execute('INSERT INTO Jmdict_Kanji_Element (ENTRY_ID, VALUE) VALUES (?, ?)', (entry_id, keb))

            ke_pri = k_ele.findall('ke_pri')
            for priority in ke_pri:
                c.execute('INSERT INTO Jmdict_Priority (ENTRY_ID, VALUE, TYPE) VALUES (?, ?, ?)', (entry_id, priority.text, 'Kanji_Element'))

        for r_ele in entry.findall('r_ele'):  # For every Reading Element in an entry
            reb = r_ele.find('reb').text
            re_nokanji = r_ele.find('re_nokanji')
            re_nokanji_value = 0
            if re_nokanji is not None:
                re_nokanji_value = 1
            c.execute('INSERT INTO Jmdict_Reading_Element (ENTRY_ID, VALUE, NO_KANJI) VALUES (?, ?, ?)', (entry_id, reb, re_nokanji_value))

            c.execute('SELECT last_insert_rowid()')
            r_ele_id = c.fetchone()[0]
            for re_restr in r_ele.findall('re_restr'):
                c.execute('INSERT INTO Jmdict_Reading_Relation (ENTRY_ID, READING_ELEMENT_ID, VALUE) VALUES (?, ?, ?)', (entry_id, r_ele_id, re_restr.text))

            for k_ele in entry.findall('k_ele'):
                keb = k_ele.find('keb').text
                element_furigana = furigana_dataframe.loc[(furigana_dataframe['text'] == keb) & (furigana_dataframe['reading'] == reb)]
                try:
                    element_furigana_dict_list = element_furigana.iloc[0]["furigana"]
                    for i in range(len(element_furigana_dict_list)):
                        c.execute('INSERT INTO Jmdict_Furigana_Bottom (ENTRY_ID, READING_ELEMENT_ID, FURIGANA_PORTION_INDEX, VALUE) VALUES (?, ?, ?, ?)', (entry_id, r_ele_id, i, element_furigana_dict_list[i]["ruby"]))
                        furigana_top = 0
                        if "rt" in element_furigana_dict_list[i].keys():
                            furigana_top = element_furigana_dict_list[i]["rt"]
                        c.execute('INSERT INTO Jmdict_Furigana_Top (ENTRY_ID, READING_ELEMENT_ID, FURIGANA_PORTION_INDEX, VALUE) VALUES (?, ?, ?, ?)', (entry_id, r_ele_id, i, furigana_top))
                except IndexError:
                    pass

            for priority in r_ele.findall('re_pri'):
                c.execute('INSERT INTO Jmdict_Priority (ENTRY_ID, VALUE, TYPE) VALUES (?, ?, ?)', (entry_id, priority.text, 'Reading_Element'))

        for sense in entry.findall('sense'):  # For every Sense element in an entry
            c.execute('INSERT INTO Jmdict_Sense_Element (ENTRY_ID) VALUES (?)', (entry_id,))

            c.execute('SELECT last_insert_rowid()')
            sense_id = c.fetchone()[0]

            pos_list = sense.findall('pos')
            for pos in pos_list:
                c.execute('INSERT INTO Jmdict_Sense_Pos (SENSE_ID, VALUE) VALUES (?, ?)', (sense_id, pos.text))

            gloss_list = sense.findall('gloss')
            for gloss in gloss_list:
               c.execute('INSERT INTO Jmdict_Gloss (ENTRY_ID, SENSE_ID, VALUE) VALUES (?, ?, ?)', (entry_id, sense_id, gloss.text))

            misc_list = sense.findall('misc')
            for misc in misc_list:
                c.execute('INSERT INTO Jmdict_Sense_Miscellaneous (SENSE_ID, VALUE) VALUES (?, ?)', (sense_id, misc.text))

            field_list = sense.findall('field')
            for field in field_list:
                c.execute('INSERT INTO Jmdict_Sense_Field (SENSE_ID, VALUE) VALUES (?, ?)', (sense_id, field.text))

            dialect_list = sense.findall('dial')
            for dial in dialect_list:
                c.execute('INSERT INTO Jmdict_Sense_Dialect (SENSE_ID, VALUE) VALUES (?, ?)', (sense_id, dial.text))

        entries_processed += 1
        if entries_processed % 5 == 0:
            print(str(entries_processed) + " entries processed")

    conn.commit()
    print("Completed parsing of {0}".format(xml_file))
    print("Number of entries parsed: {0}".format(entries_processed))
    print("--- %s seconds ---" % (time.time() - start_time))

Upvotes: 0

Views: 481

Answers (1)

NicholasM
NicholasM

Reputation: 4673

You can improve the execution time of that line by setting an index, rather than using a boolean series lookup.

Here is a snippet from the function:

element_furigana = furigana_dataframe.loc[(furigana_dataframe['text'] == keb) & 
                                          (furigana_dataframe['reading'] == reb)]
element_furigana_dict_list = element_furigana.iloc[0]["furigana"]

This is doing much more work than it needs to: it is filtering all columns of the entire dataframe, when it really needs one column value from the first (only?) row matching those criteria.

Since there is no reproducible way to test this, I generated some test data:

import itertools
import string

import pandas

records = itertools.product(*[string.ascii_lowercase]*3, string.digits)
df = pandas.DataFrame.from_records([(''.join(t[0:2]), ''.join(t[2:4]), hash(t)) 
                                    for t in records],
                                   columns=['text', 'reading', 'furigana'])

df.shape is (175760, 3), which is close to your dataframe (by design).

Printing the head and tail gives:

       text reading             furigana
0        aa      a0 -5422069581675220239
1        aa      a1  -653727266625336176
2        aa      a2  3207829723004107725
3        aa      a3  4314664729972606463
175756   zz      z6  3818139299469014399
175757   zz      z7  4401450686737676603
175758   zz      z8  1773038865889527139
175759   zz      z9  3066148500118677746

There are two approaches to try:

  1. the current approach, of creating a boolean mask and getting the first element of the furigana column; and
  2. an approach in which the text and reading columns are turned into an index.

For approach 2, we create a Series like this:

s = df.set_index(['text', 'reading'])['furigana']
text  reading
aa    a0        -5422069581675220239
      a1         -653727266625336176
      a2         3207829723004107725
      a3         4314664729972606463
zz    z6         3818139299469014399
      z7         4401450686737676603
      z8         1773038865889527139
      z9         3066148500118677746
Name: furigana, dtype: int64

The results on my machine:

keb = 'zz'
reb = 'a4'
scope = dict(df=df, s=s, keb=keb, reb=reb)

timeit.timeit("df[(df['text'] == keb) & (df['reading'] == reb)]",
              globals=scope, number=1000)

Result: 19.1124583, or 0.19 seconds per call

timeit.timeit("s.loc[(keb, reb)]", globals=scope, number=1000)

Result: 0.042754, or 0.000042 seconds per call

Upvotes: 1

Related Questions