Reputation: 991
I am new to Cython. Why is my C function Numeraire
, which at this point simply wraps a built-in function, so much slower than calling the built-in function directly?
Thanks. Here's the Cython code (backward.pyx
):
import numpy as np
cimport numpy as np
from libc.math cimport exp
cdef double Numeraire(int i1, int i0, np.ndarray[np.int_t, ndim=1] j):
    # Returns exp(-rate/12 * (i1 - i0)) with the rate fixed at 0.05.
    # `j` is accepted as a typed buffer argument but is never read here.
    cdef float rate = 0.05
    cdef int steps = i1 - i0
    return exp(-rate / 12 * steps)
def Slow(np.ndarray[np.float_t, ndim=2] values, int i1, int i0):
    """Update ``values`` in place, scaling each cell via a call to ``Numeraire``.

    For every ``i`` from ``i1 - 1`` down to ``i0`` and every ``(j0, j1)`` with
    ``0 <= j0, j1 <= i``, the three neighbours ``[j0+1, j1]``, ``[j0, j1+1]``
    and ``[j0+1, j1+1]`` are added to ``values[j0, j1]``, and the result is
    multiplied by ``norm * Numeraire(i + 1, i, j)``.

    ``values`` is indexed up to ``[i1, i1]``, so it needs at least ``i1 + 1``
    rows and columns.  Nothing is returned; ``values`` is modified in place.
    """
    cdef float norm = 0.25
    cdef int i, j0, j1
    # np.int_ replaces the `np.int` alias (deprecated in NumPy 1.20, removed
    # in 1.24); the dtype has to match the np.int_t buffer declaration.
    cdef np.ndarray[np.int_t, ndim=1] j = np.empty(2, dtype=np.int_)
    for i in range(i1-1, i0-1, -1):
        for j0 in range(i+1):
            j[0] = j0
            for j1 in range(i+1):
                j[1] = j1
                values[j0, j1] += (
                    values[j0+1, j1  ] +
                    values[j0  , j1+1] +
                    values[j0+1, j1+1])
                # Passing the ndarray `j` to the cdef function happens once
                # per cell of the sweep.
                values[j0, j1] *= norm*Numeraire(i+1, i, j) #4.397s (!)
def Fast(np.ndarray[np.float_t, ndim=2] values, int i1, int i0):
    """Update ``values`` in place, scaling each cell with an inline ``exp``.

    For every ``i`` from ``i1 - 1`` down to ``i0`` and every ``(j0, j1)`` with
    ``0 <= j0, j1 <= i``, the three neighbours ``[j0+1, j1]``, ``[j0, j1+1]``
    and ``[j0+1, j1+1]`` are added to ``values[j0, j1]``, and the result is
    multiplied by ``norm * exp(-0.05/12 * ((i+1) - i))``.

    ``values`` is indexed up to ``[i1, i1]``, so it needs at least ``i1 + 1``
    rows and columns.  Nothing is returned; ``values`` is modified in place.
    """
    cdef float norm = 0.25
    cdef int i, j0, j1
    # np.int_ replaces the `np.int` alias (deprecated in NumPy 1.20, removed
    # in 1.24); the dtype has to match the np.int_t buffer declaration.
    cdef np.ndarray[np.int_t, ndim=1] j = np.empty(2, dtype=np.int_)
    for i in range(i1-1, i0-1, -1):
        for j0 in range(i+1):
            # `j` is written but never read in this function.
            j[0] = j0
            for j1 in range(i+1):
                j[1] = j1
                values[j0, j1] += (
                    values[j0+1, j1  ] +
                    values[j0  , j1+1] +
                    values[j0+1, j1+1])
                values[j0, j1] *= norm*exp(-0.05/12*((i+1) - i)) #0.327s
and here's the timing info:
In [1]: import numpy as np
In [2]: import backward
In [3]: factors=2
In [4]: i=360
In [5]: %timeit backward.Fast(np.ones([i+1]*factors), i, 0)
10 loops, best of 3: 104 ms per loop
In [6]: %timeit backward.Slow(np.ones([i+1]*factors), i, 0)
1 loops, best of 3: 4.67 s per loop
Upvotes: 2
Views: 793
Reputation: 30925
It has to do with the ndarray
you're passing to Numeraire and not using. If you run cython -a backward.pyx
and look at the code you see first that the cdef double Numeraire...
line is highlighted light yellow (showing that Cython is doing hidden work there) and when you click on the line you get the following code:
/* Cython-generated C for `cdef double Numeraire(int i1, int i0, ndarray j)`.
 * The listing shows the buffer handling emitted for the ndarray argument `j`
 * around the (elided) function body. */
static double __pyx_f_8backward_Numeraire(int __pyx_v_i1, int __pyx_v_i0, CYTHON_UNUSED PyArrayObject *__pyx_v_j) {
float __pyx_v_rate;
/* Per-call bookkeeping structures for the buffer view of `j`. */
__Pyx_LocalBuf_ND __pyx_pybuffernd_j;
__Pyx_Buffer __pyx_pybuffer_j;
double __pyx_r;
__Pyx_RefNannyDeclarations
__Pyx_RefNannySetupContext("Numeraire", 0);
/* Reset the buffer bookkeeping before acquiring the buffer. */
__pyx_pybuffer_j.pybuffer.buf = NULL;
__pyx_pybuffer_j.refcount = 0;
__pyx_pybuffernd_j.data = NULL;
__pyx_pybuffernd_j.rcbuffer = &__pyx_pybuffer_j;
{
__Pyx_BufFmt_StackElem __pyx_stack[1];
/* Acquire the buffer of `j` and validate it against the declared element
 * type; a failure jumps to the error label below. */
if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_j.rcbuffer->pybuffer, (PyObject*)__pyx_v_j, &__Pyx_TypeInfo_nn___pyx_t_5numpy_int_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
}
/* Copy the stride and shape of dimension 0 into the local dim info. */
__pyx_pybuffernd_j.diminfo[0].strides = __pyx_pybuffernd_j.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_j.diminfo[0].shape = __pyx_pybuffernd_j.rcbuffer->pybuffer.shape[0];
/* … */
/* function exit code */
/* Error path: release the buffer while keeping the pending exception intact,
 * report it through __Pyx_WriteUnraisable, and return 0. */
__pyx_L1_error:;
{ PyObject *__pyx_type, *__pyx_value, *__pyx_tb;
__Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb);
__Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_j.rcbuffer->pybuffer);
__Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);}
__Pyx_WriteUnraisable("backward.Numeraire", __pyx_clineno, __pyx_lineno, __pyx_filename, 0);
__pyx_r = 0;
goto __pyx_L2;
__pyx_L0:;
__pyx_L2:;
__Pyx_RefNannyFinishContext();
return __pyx_r;
}
where the main body of the function goes in the bit marked /* … */
.
Some of that work happens for every Cython call, but quite a bit of it is related to your unused ndarray, j
(e.g. __pyx_pybuffer_j
, __pyx_pybuffernd_j
)
If you remove j
from the list of arguments then the speed is the same with and without the function call. There are a number of options if you actually need j
for the non-trivial, non-example version of the function.
If you always know 'j' is going to be length 2, you could just have
cdef double Numeraire(int i1, int i0, double j0, double j1):
Alternatively you could pass it a C-style double*
, a length and possibly a stride (but if you declare j
as cdef ndarray[...,mode="c"]
you don't need that) which will probably be faster.
Best option: the easiest option is to use the new-style Cython typed memoryview interface instead of the ndarray
interface.
code:
cdef double Numeraire(int i1, int i0, long[::1] j):
# code as before
# then within your calling function
# ...
cdef long[::1] j = np.empty(2, dtype=np.int)
# ...
This seems to be almost overhead free in this case (however, in some other cases I've found the memoryview interface fractionally (~1%) slower, so it isn't always the best answer).
Upvotes: 3