Reputation: 846
I am running a snippet of code that queries a database and then fills in a pandas dataframe with a value of 1 if that tuple is present in the query. it does this by running the query then iterates over the tuples and fills in the dataframe. However, the query returns almost 8 million rows of data.
My question is if anyone knows how to speed up a process like this. Here is the code below:
user_age = pd.read_sql_query(sql_age, datastore, index_col=['userid']).age.astype(np.int, copy=False)
x = pd.DataFrame(0, index=user_age.index, columns=range(366), dtype=np.int8)
for r in pd.read_sql_query(sql_active, datastore, chunksize=50000):
for userid, day in r.itertuples(index=False):
x.at[userid, day] = 1
Thank you in advance!
Upvotes: 3
Views: 338
Reputation: 880289
You could save some time by replacing the Python loop
for userid, day in r.itertuples(index=False):
x.at[userid, day] = 1
with a NumPy array assignment using "advanced integer indexing":
x[npidx[r['userid']], r['day']] = 1
On a 80000-row DataFrame, using_numpy
(below) is about 6x faster:
In [7]: %timeit orig()
1 loop, best of 3: 984 ms per loop
In [8]: %timeit using_numpy()
10 loops, best of 3: 162 ms per loop
import numpy as np
import pandas as pd
def mock_read_sql_query():
np.random.seed(2016)
for arr in np.array_split(index, N//M):
size = len(arr)
df = pd.DataFrame({'userid':arr , 'day':np.random.randint(366, size=size)})
df = df[['userid', 'day']]
yield df
N, M = 8*10**4, 5*10**2
index = np.arange(N)
np.random.shuffle(index)
columns = range(366)
def using_numpy():
npidx = np.empty_like(index)
npidx[index] = np.arange(len(index))
x = np.zeros((len(index), len(columns)), dtype=np.int8)
for r in mock_read_sql_query():
x[npidx[r['userid']], r['day']] = 1
x = pd.DataFrame(x, columns=columns, index=index)
return x
def orig():
x = pd.DataFrame(0, index=index, columns=columns, dtype=np.int8)
for r in mock_read_sql_query():
for userid, day in r.itertuples(index=False):
x.at[userid, day] = 1
return x
expected = orig()
result = using_numpy()
expected_index, expected_col = np.where(expected)
result_index, result_col = np.where(result)
assert np.equal(expected_index, result_index).all()
assert np.equal(expected_col, result_col).all()
Upvotes: 2