Reputation: 672
I have a python function with a nested for loop that is called thousands of times, and is too slow. From what I have read online, there should be a way to optimize it with numpy vectorization so that the iteration is done in much faster C code rather than python. But, I have never worked with numpy before and I can't figure it out.
The function is :First slice from the first row, then calculate the sum of s column 1 row from the back to the front, if it is greater than or equal to n, return the total number; then slice from the first two rows, then calculate the sum of s column 1 row from the back to the front, if it is less than n, then calculate the sum of s column 2 rows, if it is greater than or equal to n, return the number of rows; and so on, until all the rows are cycled.
The original code (python 3.8) is:
import pandas as pd
from time import time
import numpy as np
def sumbars(df, s, n):
start = time()
df2 = df[s].copy()
# df2.loc[0]=1000
df['sumbars'] = 0
for i in range(len(df)):
df3 = df2[:i + 1]
l = len(df3)
for j in range(l):
# if i>=8:
# print()
df4 = df3[l - j - 1:]
if df4.sum() >= n:
df.loc[i, 'sumbars'] = j + 1
# df5=df[['trade_date',s,'sumbars']]
break
stop = time()
global sumbarstime
sumbarstime = sumbarstime + stop - start
return df['sumbars']
def main():
# df=pd.read_csv('todo.csv')
df = pd.DataFrame({'ts_code':['IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX'],'jc':[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]})
df['jcts2'] = sumbars(df,'jc',2)
print('')
df.to_csv("result.csv",index=False,sep=',')
# print sec
return
main()
I need to optimize the double for loop is:
for i in range(len(df)):
df3 = df2[:i + 1]
l = len(df3)
for j in range(l):
# if i>=8:
# print()
df4 = df3[l - j - 1:]
if df4.sum() >= n:
df.loc[i, 'sumbars'] = j + 1
# df5=df[['trade_date',s,'sumbars']]
break
Someone helps me optimized my code once, and I need to optimize the last for loop once. After optimizing a for loop :
import pandas as pd
from time import time
import numpy as np
def sumbars(df, s, n, ):
start = time()
sdata = np.array(df[s])
sdata = np.flipud(sdata)
l = len(sdata)
sumbars = np.zeros(l)
for i in range(l):
n1 = sdata[i:]
cumsum = np.cumsum(n1)
k = np.argmax(cumsum >= n)
n2 = cumsum[k]
if (k != 0) | ((n2 != 0) & (n == 1)):
sumbars[l - i - 1] = k + 1
sumbars = sumbars.astype(int)
df['sumbars'] = sumbars
stop = time()
global sumbarstime
sumbarstime += stop - start
# df.to_csv("my.csv")
# print(df['sumbars'])
return df['sumbars']
def main():
# df=pd.read_csv('todo.csv')
df = pd.DataFrame({'ts_code':['IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX'],'jc':[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]})
df['jcts2'] = sumbars(df,'jc',2)
print('')
df.to_csv("result.csv",index=False,sep=',')
# print sec
return
main()
Help: I need to optimize the last for loop with numpy vectorization. Besides using cumsum, is there any other way to optimize my original double for loop?
for i in range(l):
n1 = sdata[i:]
cumsum = np.cumsum(n1)
k = np.argmax(cumsum >= n)
n2 = cumsum[k]
if (k != 0) | ((n2 != 0) & (n == 1)):
sumbars[l - i - 1] = k + 1
Please give me more time to reply after pointing out my problems.
Upvotes: 3
Views: 167
Reputation: 50956
Assuming the input values of df[s]
are always positives, you can significantly improve the algorithm. Indeed, the complexity of your current implementation is O(n**2)
while in this case, it is possible to write an algorithm with the complexity O(n log n)
(and probably even O(n)
).
The improved solution consist in computing the cumsum
only once and perform incremental updates.
Indeed, it is not needed to recompute it since np.cumsum(arr[i:])
is always equal to fullCumsum[i:]-m
with fullCumsum = np.cumsum(arr)
and m = np.sum(arr[:i])
.
fullCumsum
is independent of i
so it can be precomputed.
m
can be computed incrementally in the loop.
k
can be found using a binary search since values are sorted.
Here is the resulting (pure Python) implementation:
def sumbarsFast(df, s, n, ):
sdata = np.array(df[s])
sdata = np.flipud(sdata)
l = len(sdata)
sumbars = np.zeros(l)
fullCumsum = np.cumsum(sdata)
m = 0
for i in range(l):
k = np.searchsorted(fullCumsum[i:], n + m)
if k == len(fullCumsum)-i:
k = 0
if (k != 0) or ((fullCumsum[i+k] != m) and (n == 1)):
sumbars[l - i - 1] = k + 1
m += sdata[i]
sumbars = sumbars.astype(int)
df['sumbars'] = sumbars
return df['sumbars']
It is possible to be even much faster using the Numba JIT and parallelism:
from numba import njit, prange, int64
@njit(int64[:](int64[:], int64), parallel=True)
def sumbarsKernel(sdata, n):
l = len(sdata)
fullCumsum = np.cumsum(sdata)
sumbars = np.zeros(l, dtype=np.int64)
nShared = np.int64(n)
for i in prange(l):
m = fullCumsum[i-1] if i > 0 else 0
k = np.searchsorted(fullCumsum[i:], n + m)
if k == len(fullCumsum)-i:
k = 0
if (k != 0) or ((fullCumsum[i+k] != m) and (nShared == 1)):
sumbars[l - i - 1] = k + 1
return sumbars
def sumbarsSuperFast(df, s, n, ):
sdata = np.array(df[s], dtype=np.int64)
sdata = np.flipud(sdata)
sumbars = sumbarsKernel(sdata, n)
sumbars = sumbars.astype(int)
df['sumbars'] = sumbars
return df['sumbars']
Here are timings with a dataframe containing 200000 lines on my machine:
Your naive version: 589.764 seconds
Your "optimized" version: 121.683 seconds
sumbarsFast (pure Python): 1.081 second
sumbarsSuperFast (Numba): 0.004 second
The final implementation is about 150000 times faster than your naive version and 30000 times faster than your "optimized" version.
Upvotes: 4