Reputation: 2735
I'm working using numpy 1.9, python 2.7 with opencv, dealing with big matrices and I have to make the following operation many times
def sumShifted(A): # A: numpy array 1000*1000*10
    """Return the element-wise sum of each slice along axis 1 with its
    right-hand neighbour; output has one fewer element on axis 1."""
    left = A[:, 0:-1]
    right = A[:, 1:]
    return left + right
I'd like to optimize this operation, if possible; I tried with Cython, but I didn't get any significant improvement — though I do not exclude that this is because of my bad implementation.
Is there a way to make it faster?
EDIT:
sumShifted
is getting called in a for loop like this:
# Illustrative call pattern (Python 2, hence xrange): sumShifted is invoked
# once per iteration over 400 iterations, so its cost dominates the loop.
for i in xrange(0, 400):
    # ... Various operations on B
    A = sumShifted(B)
    # ... Other operations on B
#More detailed
# Four independent shifted sums are computed per iteration; they could be
# fused into a single pass over the data (see improvedSumShifted below).
for i in xrange(0, 400):
    A = sumShifted(a11)
    B = sumShifted(a12)
    C = sumShifted(b12)
    D = sumShifted(b22)
    v = -upQ12/upQ11
    W, X, Z = self.function1( input_matrix, v, A, C[:,:,4], D[:,:,4] )
    # NOTE(review): the next line rebinds D via tuple unpacking after its
    # right-hand side reads the old D; the function3/function4 calls below
    # therefore see function2's D, not sumShifted(b22)'s — confirm intended.
    S, D, F = self.function2( input_matrix, v, A, C[:,:,5], D[:,:,5] )
    AA = self.function3( input_matrix, v, A, C[:,:,6], D[:,:,6] )
    BB = self.function4( input_matrix, v, A, C[:,:,7], D[:,:,7] )
EDIT2: Following your advice I created these two runnable benchmarks (with Cython) that merge the four sumShifted
calls into one function.
A, B, C, D= improvedSumShifted(E, F, G, H)
#E,F: 1000x1000 matrices
#G,H: 1000x1000x8 matrices
#first implementation
# Fuses the four separate sumShifted calls into one pass: every input element
# is read inside a single set of loops instead of four slice additions.
def improvedSumShifted(np.ndarray[dtype_t, ndim=2] a, np.ndarray[dtype_t, ndim=2] b, np.ndarray[dtype_t, ndim=3] c, np.ndarray[dtype_t, ndim=3] d):
    cdef unsigned int i,j,k;
    # h is one less than the second dimension: output column j pairs input
    # columns j and j+1.
    cdef unsigned int w = a.shape[0], h = a.shape[1]-1, z = c.shape[2]
    # NOTE(review): np.empty defaults to float64 — this assumes dtype_t is
    # float64; otherwise the buffer dtype will not match the typed views.
    cdef np.ndarray[dtype_t, ndim=2] aa = np.empty((w, h))
    cdef np.ndarray[dtype_t, ndim=2] bb = np.empty((w, h))
    cdef np.ndarray[dtype_t, ndim=3] cc = np.empty((w, h, z))
    cdef np.ndarray[dtype_t, ndim=3] dd = np.empty((w, h, z))
    # Disable Cython's per-access safety checks inside the hot loops.
    with cython.boundscheck(False), cython.wraparound(False), cython.overflowcheck(False), cython.nonecheck(False):
        for i in range(w):
            for j in range(h):
                aa[i,j] = a[i,j] + a[i,j+1]
                bb[i,j] = b[i,j] + b[i,j+1]
                for k in range(z):
                    cc[i,j,k] = c[i,j,k] + c[i,j+1,k]
                    dd[i,j,k] = d[i,j,k] + d[i,j+1,k]
    return aa, bb, cc, dd
#second implementation
# Variant of the fused kernel: each output is seeded with a copy of the left
# slice, then the right-shifted neighbour is accumulated in place.
def improvedSumShifted(np.ndarray[dtype_t, ndim=2] a, np.ndarray[dtype_t, ndim=2] b, np.ndarray[dtype_t, ndim=3] c, np.ndarray[dtype_t, ndim=3] d):
    cdef unsigned int i,j,k;
    # h = columns - 1: output column j pairs input columns j and j+1.
    cdef unsigned int w = a.shape[0], h = a.shape[1]-1, z = c.shape[2]
    # The 2-D copies take the first h columns; the 3-D slices c[:, 0:h] and
    # d[:, 0:h] keep the whole third axis, so cc and dd come out (w, h, z).
    cdef np.ndarray[dtype_t, ndim=2] aa = np.copy(a[:, 0:h])
    cdef np.ndarray[dtype_t, ndim=2] bb = np.copy(b[:, 0:h])
    cdef np.ndarray[dtype_t, ndim=3] cc = np.copy(c[:, 0:h])
    cdef np.ndarray[dtype_t, ndim=3] dd = np.copy(d[:, 0:h])
    # Disable Cython's per-access safety checks inside the hot loops.
    with cython.boundscheck(False), cython.wraparound(False), cython.overflowcheck(False), cython.nonecheck(False):
        for i in range(w):
            for j in range(h):
                # Add the right-hand neighbour into the pre-copied left value.
                aa[i,j] += a[i,j+1]
                bb[i,j] += b[i,j+1]
                for k in range(z):
                    cc[i,j,k] += c[i,j+1,k]
                    dd[i,j,k] += d[i,j+1,k]
    return aa, bb, cc, dd
Upvotes: 2
Views: 349
Reputation: 2196
It is unlikely that this function can be sped up any further: It really does just four operations on python level:
np.add
ufunc on the two slices, an operation that is highly optimised in numpy. Indeed, my benchmarks show no improvement by either using numba or cython. On my machine, I consistently get ~30 ms per call if the output array is pre-allocated, or ~50 ms if the memory allocation is taken into account.
The pure numpy versions:
import numpy as np
def ss1(A):
    """Shifted sum along axis 1: returns A[:, j] + A[:, j+1] for every j,
    allocating a fresh output array on each call."""
    left, right = A[:, :-1, :], A[:, 1:, :]
    return left + right
def ss2(A, output):
    """Shifted sum along axis 1, written into the caller-supplied `output`
    buffer (avoids the per-call allocation of ss1). Returns `output`."""
    left, right = A[:, :-1, :], A[:, 1:, :]
    np.add(left, right, out=output)
    return output
The cython versions:
import numpy as np
cimport numpy as np
cimport cython
# Cython version that allocates its own output; the typed C-contiguous
# memoryviews ([:,:,::1]) let the compiler emit direct C indexing.
def ss3(np.float64_t[:,:,::1] A not None):
    cdef unsigned int i,j,k;
    # Output has one fewer element along axis 1 (pairs j with j+1).
    cdef np.float64_t[:,:,::1] ret = np.empty((A.shape[0],A.shape[1]-1,A.shape[2]),'f8')
    # Disable bounds and negative-index checks inside the triple loop.
    with cython.boundscheck(False), cython.wraparound(False):
        for i in range(A.shape[0]):
            for j in range(A.shape[1]-1):
                for k in range(A.shape[2]):
                    ret[i,j,k] = A[i,j,k] + A[i,j+1,k]
    return ret
# Variant of ss3 writing into a caller-supplied buffer, removing the per-call
# allocation (the difference between the ~50 ms and ~30 ms timings below).
def ss4(np.float64_t[:,:,::1] A not None, np.float64_t[:,:,::1] ret not None):
    cdef unsigned int i,j,k;
    # `>=` permits an oversized buffer; only the leading region is written.
    # NOTE(review): assert is compiled out when assertions are disabled —
    # confirm that is acceptable, or raise ValueError explicitly instead.
    assert ret.shape[0]>=A.shape[0] and ret.shape[1]>=A.shape[1]-1 and ret.shape[2]>=A.shape[2]
    with cython.boundscheck(False), cython.wraparound(False):
        for i in range(A.shape[0]):
            for j in range(A.shape[1]-1):
                for k in range(A.shape[2]):
                    ret[i,j,k] = A[i,j,k] + A[i,j+1,k]
    return ret
The numba version (current numba 0.14.0 cannot allocate new arrays in optimised functions):
# Numba-compiled version with an explicit signature. The caller must
# pre-allocate `output` (per the text above, numba 0.14.0 cannot allocate
# new arrays inside njit-compiled functions).
@numba.njit('f8[:,:,:](f8[:,:,:],f8[:,:,:])')
def ss5(A,output):
    for i in range(A.shape[0]):
        for j in range(A.shape[1]-1):
            for k in range(A.shape[2]):
                output[i,j,k] = A[i,j,k] + A[i,j+1,k]
    return output
Here are the timings:
>>> A = np.random.randn(1000, 1000, 10)  # randn takes dimensions as separate args, not a tuple
>>> output = np.empty((A.shape[0],A.shape[1]-1,A.shape[2]))
>>> %timeit ss1(A)
10 loops, best of 3: 50.2 ms per loop
>>> %timeit ss2(A,output)
10 loops, best of 3: 30.8 ms per loop
>>> %timeit ss3(A)
10 loops, best of 3: 50.8 ms per loop
>>> %timeit ss4(A,output)
10 loops, best of 3: 30.9 ms per loop
>>> %timeit ss5(A,output)
10 loops, best of 3: 31 ms per loop
Upvotes: 6