jaried
jaried

Reputation: 672

I need to use numpy's vectorization to optimize my double for loop

I have a python function with a nested for loop that is called thousands of times, and is too slow. From what I have read online, there should be a way to optimize it with numpy vectorization so that the iteration is done in much faster C code rather than python. But, I have never worked with numpy before and I can't figure it out.

The function is :First slice from the first row, then calculate the sum of s column 1 row from the back to the front, if it is greater than or equal to n, return the total number; then slice from the first two rows, then calculate the sum of s column 1 row from the back to the front, if it is less than n, then calculate the sum of s column 2 rows, if it is greater than or equal to n, return the number of rows; and so on, until all the rows are cycled.

The original code (python 3.8) is:

import pandas as pd
from time import time
import numpy as np

def sumbars(df, s, n):
    start = time()
    df2 = df[s].copy()
    # df2.loc[0]=1000
    df['sumbars'] = 0
    for i in range(len(df)):
        df3 = df2[:i + 1]
        l = len(df3)
        for j in range(l):
            # if i>=8:
            #     print()
            df4 = df3[l - j - 1:]
            if df4.sum() >= n:
                df.loc[i, 'sumbars'] = j + 1
                # df5=df[['trade_date',s,'sumbars']]
                break

    stop = time()
    global sumbarstime
    sumbarstime = sumbarstime + stop - start
    return df['sumbars']

def main():
    # df=pd.read_csv('todo.csv')
    df = pd.DataFrame({'ts_code':['IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX'],'jc':[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]})
    df['jcts2'] = sumbars(df,'jc',2)

    
    print('')
    df.to_csv("result.csv",index=False,sep=',')

    # print sec

    return

main()

I need to optimize the double for loop is:

for i in range(len(df)):
        df3 = df2[:i + 1]
        l = len(df3)
        for j in range(l):
            # if i>=8:
            #     print()
            df4 = df3[l - j - 1:]
            if df4.sum() >= n:
                df.loc[i, 'sumbars'] = j + 1
                # df5=df[['trade_date',s,'sumbars']]
                break

Someone helps me optimized my code once, and I need to optimize the last for loop once. After optimizing a for loop :

import pandas as pd
from time import time
import numpy as np


def sumbars(df, s, n, ):
    start = time()
    sdata = np.array(df[s])
    sdata = np.flipud(sdata)
    l = len(sdata)
    sumbars = np.zeros(l)
    for i in range(l):
        n1 = sdata[i:]
        cumsum = np.cumsum(n1)
        k = np.argmax(cumsum >= n)
        n2 = cumsum[k]
        if (k != 0) | ((n2 != 0) & (n == 1)):
            sumbars[l - i - 1] = k + 1
    sumbars = sumbars.astype(int)
    df['sumbars'] = sumbars
    stop = time()
    global sumbarstime
    sumbarstime += stop - start
    # df.to_csv("my.csv")
    # print(df['sumbars'])
    return df['sumbars']

def main():
    # df=pd.read_csv('todo.csv')
    df = pd.DataFrame({'ts_code':['IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX','IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX', 'IC2103.CFX'],'jc':[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]})
    df['jcts2'] = sumbars(df,'jc',2)

    
    print('')
    df.to_csv("result.csv",index=False,sep=',')

    # print sec

    return

main()

Help: I need to optimize the last for loop with numpy vectorization. Besides using cumsum, is there any other way to optimize my original double for loop?

    for i in range(l):
        n1 = sdata[i:]
        cumsum = np.cumsum(n1)
        k = np.argmax(cumsum >= n)
        n2 = cumsum[k]
        if (k != 0) | ((n2 != 0) & (n == 1)):
            sumbars[l - i - 1] = k + 1

Please give me more time to reply after pointing out my problems.

Upvotes: 3

Views: 167

Answers (1)

Jérôme Richard
Jérôme Richard

Reputation: 50956

Assuming the input values of df[s] are always positives, you can significantly improve the algorithm. Indeed, the complexity of your current implementation is O(n**2) while in this case, it is possible to write an algorithm with the complexity O(n log n) (and probably even O(n)).

The improved solution consist in computing the cumsum only once and perform incremental updates. Indeed, it is not needed to recompute it since np.cumsum(arr[i:]) is always equal to fullCumsum[i:]-m with fullCumsum = np.cumsum(arr) and m = np.sum(arr[:i]). fullCumsum is independent of i so it can be precomputed. m can be computed incrementally in the loop. k can be found using a binary search since values are sorted.

Here is the resulting (pure Python) implementation:

def sumbarsFast(df, s, n, ):
    sdata = np.array(df[s])
    sdata = np.flipud(sdata)
    l = len(sdata)
    sumbars = np.zeros(l)
    fullCumsum = np.cumsum(sdata)
    m = 0
    for i in range(l):
        k = np.searchsorted(fullCumsum[i:], n + m)
        if k == len(fullCumsum)-i:
            k = 0
        if (k != 0) or ((fullCumsum[i+k] != m) and (n == 1)):
            sumbars[l - i - 1] = k + 1
        m += sdata[i]
    sumbars = sumbars.astype(int)
    df['sumbars'] = sumbars
    return df['sumbars']

It is possible to be even much faster using the Numba JIT and parallelism:

from numba import njit, prange, int64

@njit(int64[:](int64[:], int64), parallel=True)
def sumbarsKernel(sdata, n):
    l = len(sdata)
    fullCumsum = np.cumsum(sdata)
    sumbars = np.zeros(l, dtype=np.int64)
    nShared = np.int64(n)
    for i in prange(l):
        m = fullCumsum[i-1] if i > 0 else 0
        k = np.searchsorted(fullCumsum[i:], n + m)
        if k == len(fullCumsum)-i:
            k = 0
        if (k != 0) or ((fullCumsum[i+k] != m) and (nShared == 1)):
            sumbars[l - i - 1] = k + 1
    return sumbars

def sumbarsSuperFast(df, s, n, ):
    sdata = np.array(df[s], dtype=np.int64)
    sdata = np.flipud(sdata)
    sumbars = sumbarsKernel(sdata, n)
    sumbars = sumbars.astype(int)
    df['sumbars'] = sumbars
    return df['sumbars']

Here are timings with a dataframe containing 200000 lines on my machine:

Your naive version:         589.764 seconds
Your "optimized" version:   121.683 seconds
sumbarsFast (pure Python):    1.081 second
sumbarsSuperFast (Numba):     0.004 second

The final implementation is about 150000 times faster than your naive version and 30000 times faster than your "optimized" version.

Upvotes: 4

Related Questions