Rowandish

Reputation: 2735

Optimize NumPy sum of matrices iterated through every element

I'm working with numpy 1.9 and Python 2.7 (with OpenCV), dealing with big matrices, and I have to perform the following operation many times:

def sumShifted(A):  # A: numpy array 1000*1000*10
    return A[:, 0:-1] + A[:, 1:]    
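On a tiny array it is easy to see what this computes: each output column `j` is the sum of columns `j` and `j+1`, so the result has one fewer column along axis 1 (a minimal illustration, not part of the original code):

```python
import numpy as np

A = np.arange(8.0).reshape(2, 4)  # small stand-in for the 1000x1000x10 array
# output column j is A[:, j] + A[:, j+1]
S = A[:, 0:-1] + A[:, 1:]
print(S)  # [[ 1.  3.  5.]
          #  [ 9. 11. 13.]]
```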

I'd like to optimize this operation if possible. I tried Cython but got no significant improvement, though I can't rule out that this is due to my poor implementation.

Is there a way to make it faster?

EDIT: sumShifted is getting called in a for loop like this:

for i in xrange(0, 400):
    # ... Various operations on B
    A = sumShifted(B)
    # ... Other operations on B


#More detailed
for i in xrange(0, 400):
    A = sumShifted(a11)
    B = sumShifted(a12)
    C = sumShifted(b12)
    D = sumShifted(b22)

    v = -upQ12/upQ11

    W, X, Z = self.function1( input_matrix, v, A, C[:,:,4], D[:,:,4] )
    S, D, F = self.function2( input_matrix, v, A, C[:,:,5], D[:,:,5] )
    AA      = self.function3( input_matrix, v, A, C[:,:,6], D[:,:,6] )
    BB      = self.function4( input_matrix, v, A, C[:,:,7], D[:,:,7] )

EDIT2: Following your advice, I created these two runnable benchmarks (in Cython) that merge the four sumShifted calls into one function.

A, B, C, D = improvedSumShifted(E, F, G, H)
#E,F: 1000x1000 matrices
#G,H: 1000x1000x8 matrices

#first implementation
def improvedSumShifted(np.ndarray[dtype_t, ndim=2] a, np.ndarray[dtype_t, ndim=2] b, np.ndarray[dtype_t, ndim=3] c, np.ndarray[dtype_t, ndim=3] d):
  cdef unsigned int i,j,k;
  cdef unsigned int w = a.shape[0], h = a.shape[1]-1, z = c.shape[2]
  cdef np.ndarray[dtype_t, ndim=2] aa = np.empty((w, h))
  cdef np.ndarray[dtype_t, ndim=2] bb = np.empty((w, h))
  cdef np.ndarray[dtype_t, ndim=3] cc = np.empty((w, h, z))
  cdef np.ndarray[dtype_t, ndim=3] dd = np.empty((w, h, z))
  with cython.boundscheck(False), cython.wraparound(False), cython.overflowcheck(False), cython.nonecheck(False):
    for i in range(w):
      for j in range(h):
        aa[i,j] = a[i,j] + a[i,j+1]
        bb[i,j] = b[i,j] + b[i,j+1]
        for k in range(z):
          cc[i,j,k] = c[i,j,k] + c[i,j+1,k]
          dd[i,j,k] = d[i,j,k] + d[i,j+1,k]
  return aa, bb, cc, dd

#second implementation
def improvedSumShifted(np.ndarray[dtype_t, ndim=2] a, np.ndarray[dtype_t, ndim=2] b, np.ndarray[dtype_t, ndim=3] c, np.ndarray[dtype_t, ndim=3] d):
  cdef unsigned int i,j,k;
  cdef unsigned int w = a.shape[0], h = a.shape[1]-1, z = c.shape[2]
  cdef np.ndarray[dtype_t, ndim=2] aa = np.copy(a[:, 0:h])
  cdef np.ndarray[dtype_t, ndim=2] bb = np.copy(b[:, 0:h])
  cdef np.ndarray[dtype_t, ndim=3] cc = np.copy(c[:, 0:h])
  cdef np.ndarray[dtype_t, ndim=3] dd = np.copy(d[:, 0:h])
  with cython.boundscheck(False), cython.wraparound(False), cython.overflowcheck(False), cython.nonecheck(False):
    for i in range(w):
      for j in range(h):
        aa[i,j] += a[i,j+1]
        bb[i,j] += b[i,j+1]
        for k in range(z):
          cc[i,j,k] += c[i,j+1,k]
          dd[i,j,k] += d[i,j+1,k]
  return aa, bb, cc, dd

Upvotes: 2

Views: 349

Answers (1)

burnpanck

Reputation: 2196

It is unlikely that this function can be sped up much further: it really does just four operations at the Python level:

  1. (2x) Perform a slice on the input. These kinds of slices are very fast, as they only require a handful of integer operations to calculate new strides and sizes.
  2. Allocate a new array for the output. For such a simple function, this is a significant burden.
  3. Evaluate the np.add ufunc on the two slices, an operation that is highly optimised in numpy.
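The first two points can be checked directly: a basic slice is a view sharing the parent's buffer (no data is copied), and passing an output array to the `np.add` ufunc skips the allocation entirely (a small sketch, not part of the original answer):

```python
import numpy as np

A = np.zeros((1000, 1000, 10))
left = A[:, :-1, :]    # basic slice: a view, only strides/shape change
assert left.base is A  # shares A's memory, nothing was copied

out = np.empty((1000, 999, 10))
np.add(A[:, :-1, :], A[:, 1:, :], out)  # writes into the preallocated buffer
```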

Indeed, my benchmarks show no improvement from either numba or cython. On my machine, I consistently get ~30 ms per call if the output array is pre-allocated, or ~50 ms if the memory allocation is included.

The pure numpy versions:

import numpy as np

def ss1(A):
    return np.add(A[:,:-1,:],A[:,1:,:])

def ss2(A,output):
    return np.add(A[:,:-1,:],A[:,1:,:],output)
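A quick sanity check that the two variants agree, with `ss2` writing into a caller-supplied buffer (the definitions are repeated here so the snippet runs on its own, using a smaller array than the benchmark):

```python
import numpy as np

def ss1(A):
    return np.add(A[:, :-1, :], A[:, 1:, :])

def ss2(A, output):
    return np.add(A[:, :-1, :], A[:, 1:, :], output)

A = np.random.randn(100, 100, 10)
out = np.empty((100, 99, 10))
r1 = ss1(A)          # allocates a fresh result array
r2 = ss2(A, out)     # reuses the preallocated buffer
assert r2 is out
assert np.array_equal(r1, r2)
```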

The cython versions:

import numpy as np
cimport numpy as np
cimport cython

def ss3(np.float64_t[:,:,::1] A not None):
    cdef unsigned int i,j,k;
    cdef np.float64_t[:,:,::1] ret = np.empty((A.shape[0],A.shape[1]-1,A.shape[2]),'f8')
    with cython.boundscheck(False), cython.wraparound(False):
        for i in range(A.shape[0]):
            for j in range(A.shape[1]-1):
                for k in range(A.shape[2]):
                    ret[i,j,k] = A[i,j,k] + A[i,j+1,k]
    return ret

def ss4(np.float64_t[:,:,::1] A not None, np.float64_t[:,:,::1] ret not None):
    cdef unsigned int i,j,k;
    assert ret.shape[0]>=A.shape[0] and ret.shape[1]>=A.shape[1]-1 and ret.shape[2]>=A.shape[2]
    with cython.boundscheck(False), cython.wraparound(False):
        for i in range(A.shape[0]):
            for j in range(A.shape[1]-1):
                for k in range(A.shape[2]):
                    ret[i,j,k] = A[i,j,k] + A[i,j+1,k]
    return ret

The numba version (current numba 0.14.0 cannot allocate new arrays in optimised functions):

import numba

@numba.njit('f8[:,:,:](f8[:,:,:],f8[:,:,:])')
def ss5(A,output):
    for i in range(A.shape[0]):
        for j in range(A.shape[1]-1):
            for k in range(A.shape[2]):
                output[i,j,k] = A[i,j,k] + A[i,j+1,k]
    return output

Here are the timings:

>>> A = np.random.randn(1000, 1000, 10)
>>> output = np.empty((A.shape[0],A.shape[1]-1,A.shape[2]))

>>> %timeit ss1(A)
10 loops, best of 3: 50.2 ms per loop

>>> %timeit ss2(A,output)
10 loops, best of 3: 30.8 ms per loop

>>> %timeit ss3(A)
10 loops, best of 3: 50.8 ms per loop

>>> %timeit ss4(A,output)
10 loops, best of 3: 30.9 ms per loop

>>> %timeit ss5(A,output)
10 loops, best of 3: 31 ms per loop

Upvotes: 6
