Reputation: 13
I have a function that is the inner loop of some larger problem, so it will be called millions of times. I have tried to optimize it, but since this is my first numerical project, I am wondering whether there are other ways to improve its speed.
Cython does not seem to help — maybe NumPy is already close to C speed, or I am not writing efficient Cython code.
import numpy as np
import math
import numexpr as ne

# Correlation between the two mu shocks (epsilon) and between the two
# alpha shocks (nu).
par_mu_rho = 0.8
par_alpha_rho = 0.7
# 2x2 covariance matrices: unit variances on the diagonal,
# the correlation on the off-diagonal.
cov_epsilon = [[1, par_mu_rho], [par_mu_rho, 1]]
cov_nu = [[1, par_alpha_rho], [par_alpha_rho, 1]]
nrows = 10000  # number of simulation draws
np.random.seed(123)  # fixed seed so timings/results are reproducible
# Draw the correlated bivariate normal shocks.
epsilon_sim = np.random.multivariate_normal([0, 0], cov_epsilon, nrows)
nu_sim = np.random.multivariate_normal([0, 0], cov_nu, nrows)
# Columns: [eps1, eps2, nu1, nu2]; exponentiate so each column becomes a
# lognormal multiplicative error (always positive).
errors = np.concatenate((epsilon_sim, nu_sim), axis=1)
errors = np.exp(errors)
### the function to be optimized
def mktout(mean_mu_alpha, errors, par_gamma):
    """Vectorized (numpy + numexpr) simulation of market outcomes.

    Each of the n error draws falls into one of six cases (use neither
    product, use both, or use only the better one — split by which mu is
    larger); time allocations t0/t1/t2 and usage indicators are then
    averaged over the draws.

    Parameters
    ----------
    mean_mu_alpha : sequence of 4 floats
        Log-scale means; the first two shift mu1/mu2, the last two
        alpha1/alpha2.
    errors : (n, 4) ndarray
        exp() of correlated normal shocks; columns multiply mu1, mu2,
        alpha1, alpha2 respectively.
    par_gamma : float
        Log of the multiplier applied to mu when both products are used.

    Returns
    -------
    (t1_mean, t2_mean, p1_mean, p2_mean) : tuple of floats
        Averages over the n rows.
    """
    # Fix: average over however many draws were actually passed in, rather
    # than the hard-coded 10000 of the original (identical result for the
    # 10000-row simulation, correct for any other input size).
    n = errors.shape[0]
    mu10 = errors[:, 0] * math.exp(mean_mu_alpha[0])
    mu11 = math.exp(par_gamma) * mu10  # mu with gamma
    mu20 = errors[:, 1] * math.exp(mean_mu_alpha[1])
    mu21 = math.exp(par_gamma) * mu20
    alpha1 = errors[:, 2] * math.exp(mean_mu_alpha[2])
    alpha2 = errors[:, 3] * math.exp(mean_mu_alpha[3])
    j_is_larger = (mu10 > mu20)
    j_is_smaller = ~j_is_larger
    # Thresholds that mu21 (resp. mu11) must reach for both products to be used.
    threshold2 = (1 + mu10 * alpha1) / (168 + alpha1)
    threshold3 = (1 + mu20 * alpha2) / (168 + alpha2)
    case1 = j_is_larger * (mu10 < 1 / 168)      # use neither
    case2 = j_is_larger * (mu21 >= threshold2)  # use both
    # NOTE(review): case1 and case2 are not mutually exclusive (a tiny mu10
    # can coexist with mu21 >= threshold2), in which case t0 double-counts;
    # the later mktout_full gives case1 priority — confirm which is intended.
    case3 = j_is_larger ^ (case1 | case2)       # use only product 1
    case4 = j_is_smaller * (mu20 < 1 / 168)
    case5 = j_is_smaller * (mu11 >= threshold3)
    case6 = j_is_smaller ^ (case4 | case5)
    # numexpr resolves the names in these strings from the local frame, so
    # the variable names above must match the expression text exactly.
    t0 = ne.evaluate(
        "case1*168+case2 * (168 + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * 168 +case5 * (168 + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3")
    # for some cases, t1 would be 0 anyway, so they are omitted here.
    t1 = ne.evaluate(
        "case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)")
    t2 = 168 - t0 - t1
    p12 = case2 + case5  # both products used
    p1 = case3 + p12     # product 1 used
    p2 = case6 + p12     # product 2 used
    return t1.sum()/n, t2.sum()/n, p1.sum()/n, p2.sum()/n
timeit mktout([-6,-6,-1,-1], errors, -0.7)
On my old mac with 2.2GHz i7. the function runs at about 200µs.
Updates:
Based on suggestions and code from @CodeSurgeon and @GZ0, I settled on the following code
def mktout_full(double[:] mean_mu_alpha, double[:, ::1] errors, double par_gamma):
    """Fastest variant: per-row if/else so each case computes only what it needs.

    Same per-row model as mktout, with two notes:
      * ties are routed as mu10 >= mu20 (the numpy version uses strict >,
        a measure-zero difference for continuous draws);
      * the "use neither" cases contribute nothing to any sum (t0 = 168
        makes t1 = t2 = 0), so those rows fall through without work.

    Returns the means over the n rows of (t1, t2, p1, p2, p12); p12 (share
    of rows using both products) is extra relative to mktout's 4-tuple.
    """
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        double threshold2, threshold3
        double t0, t1, t2
        double t1_sum, t2_sum, p1_sum, p2_sum, p12_sum
        double c
    #compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(<double>mean_mu_alpha[0])
    exp[1] = cmath.exp(<double>mean_mu_alpha[1])
    exp[2] = cmath.exp(<double>mean_mu_alpha[2])
    exp[3] = cmath.exp(<double>mean_mu_alpha[3])
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0
    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0
    p12_sum = 0.0
    for i in range(n) :
        # Only mu10/mu20 are needed to pick the branch; everything else is
        # deferred until the branch that actually uses it.
        mu10 = errors[i, 0] * exp[0]
        # mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        # mu21 = exp_par_gamma * mu20
        # alpha1 = errors[i, 2] * exp[2]
        # alpha2 = errors[i, 3] * exp[3]
        # j_is_larger = mu10 > mu20
        # j_is_smaller = not j_is_larger
        if (mu10 >= mu20):
            if (mu10 >= 1/c) :
                # product 1 is used at least on its own
                mu21 = exp_par_gamma * mu20
                alpha1 = errors[i, 2] * exp[2]
                alpha2 = errors[i, 3] * exp[3]
                threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
                if (mu21 >= threshold2):
                    # both products used
                    mu11 = exp_par_gamma * mu10
                    t0 = (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
                    t1 = (t0 * alpha1 * mu11 - alpha1)
                    t1_sum += t1
                    t2_sum += c - t0 - t1
                    p1_sum += 1
                    p2_sum += 1
                    p12_sum += 1
                else :
                    # only product 1 used; t2 is algebraically 0 for this case
                    t1_sum += ((1/threshold2) * alpha1 * mu10 - alpha1)
                    p1_sum += 1
            # else: "use neither" — all contributions are exactly 0
        else :
            if (mu20 >= 1/c) :
                # product 2 is used at least on its own
                mu11 = exp_par_gamma * mu10
                alpha1 = errors[i, 2] * exp[2]
                alpha2 = errors[i, 3] * exp[3]
                threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
                if (mu11 >= threshold3):
                    # both products used
                    mu21 = exp_par_gamma * mu20
                    t0 = (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
                    t1 = (t0 * alpha1 * mu11 - alpha1)
                    t1_sum += t1
                    t2_sum += c - t0 - t1
                    p1_sum += 1
                    p2_sum += 1
                    p12_sum += 1
                else :
                    # only product 2 used; t1 is 0 for this case
                    t2_sum += ((1/threshold3) * alpha2 * mu20 - alpha2)
                    p2_sum += 1
            # else: "use neither" — all contributions are exactly 0
    return t1_sum/n, t2_sum/n, p1_sum/n, p2_sum/n, p12_sum/n
My original code runs at about 650µs. `mktout` and `mktout_if` by CodeSurgeon run at about 220µs and 120µs. The `mktout_full` above runs at about 68µs. What I do in `mktout_full` is to optimize the if-else logic of `mktout_if`. Perhaps surprisingly, the parallelized `outer_loop` by CodeSurgeon combined with the if-else logic of `mktout_full` is way slower (121ms).
Upvotes: 1
Views: 308
Reputation: 2465
Briefly looking at the code and attempting to cythonize it, simply adding ndarray types to all of the parameters and variables does not change performance meaningfully. If you are fighting for shaving off microseconds for this function in this tight inner loop, I would consider making the following modifications:
numpy
or numexpr
. While each of these operations alone are efficient, they all add some python overhead (which can be seen if you look at the annotated .html
files cython can produce).mktout
a cdef
function instead. Python function calls have significant overhead.math
module. You can replace this with from libc cimport math as cmath
and use cmath.exp
instead.mktout
function takes in a python list mean_mu_alpha
. You could consider using a cdef class
object to replace this parameter and type this instead. If you choose to make mktout
a cdef
function instead, this can become just a struct or double *
array. Either way, indexing into a python list (which can contain arbitrary python objects that need to be unboxed into the corresponding c-types) is going to be slow.
, you are allocating memory for lots of arrays (for each mu
, alpha
, threshold
, case
, t-
and p-
array). You then proceed to free all of this memory at the end of the function (through python's gc), only to likely use all of this space again the next call. If you can change the signature of mktout
, you can pass in all of these arrays as parameters so that the memory can be reused and overwritten across function calls. Another option, which is better for this case, would be to iterate through the array and do all of the calculations one element at a time.prange
function. I would reach for this after you have made all of the above changes, and I would do the multithreading outside of the mktout
function itself. That is, you would be multithreading calls to mktout rather than multithreading mktout
itself. Making the above changes will be a lot of work, and you will likely have to reimplement many of the functions provided by numpy and numexpr yourself to avoid the python overhead associated with each of them. Please let me know if any part of this is unclear.
Update #1: Implementing points #1, #3, and #5, I get about a 11x fold speed-up. Here is what this code looks like. I am sure it can go faster if you ditch the def
function, the list mean_mu_alpha
input, and the tuple
output. Note: I get slightly different results in the last decimal place compared to the original code, likely due to some floating point rules I do not understand.
from libc cimport math as cmath
from libc.stdint cimport *
from libc.stdlib cimport *
def mktout(list mean_mu_alpha, double[:, ::1] errors, double par_gamma):
    """Cython port of the numpy/numexpr mktout: one fused pass over the rows.

    Per-row arithmetic is identical to the python version (including the
    xor-based case3/case6), but the sums are accumulated in C doubles
    instead of allocating temporary arrays per call, which is where the
    ~11x speedup comes from.  Returns (t1_mean, t2_mean, p1_mean, p2_mean)
    over the n rows.
    """
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger, j_is_smaller
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c
    #compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(<double>mean_mu_alpha[0])
    exp[1] = cmath.exp(<double>mean_mu_alpha[1])
    exp[2] = cmath.exp(<double>mean_mu_alpha[2])
    exp[3] = cmath.exp(<double>mean_mu_alpha[3])
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0
    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0
    for i in range(n):
        # per-row mus and alphas: lognormal error times its scale factor
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]
        j_is_larger = mu10 > mu20
        j_is_smaller = not j_is_larger
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
        case1 = j_is_larger * (mu10 < 1 / c)        # use neither
        case2 = j_is_larger * (mu21 >= threshold2)  # use both
        case3 = j_is_larger ^ (case1 | case2)       # use product 1 only
        case4 = j_is_smaller * (mu20 < 1 / c)
        case5 = j_is_smaller * (mu11 >= threshold3)
        case6 = j_is_smaller ^ (case4 | case5)
        t0 = case1*c+case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * c +case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
        t1 = case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)
        t2 = c - t0 - t1
        p12 = case2 + case5
        p1 = case3 + p12
        p2 = case6 + p12
        t1_sum += t1
        t2_sum += t2
        p1_sum += p1
        p2_sum += p2
    return t1_sum/n, t2_sum/n, p1_sum/n, p2_sum/n
Update #2: Implemented the cdef
(#2), python object elimination (#4) and the multithreading (#6) ideas. #2 and #4 alone had negligible benefit, but were necessary for #6 since the GIL cannot be accessed in OpenMP prange
loops. With the multithreading, you get an additional 2.5x speed boost on my quad core laptop, amounting to code that is about 27.5x faster than the original. My outer_loop
function is not wholly accurate though since it just recalculates the same result over and over, but it should be enough for a test case. The complete code is below:
from libc cimport math as cmath
from libc.stdint cimport *
from libc.stdlib cimport *
from cython.parallel cimport prange
def mktout(list mean_mu_alpha, double[:, ::1] errors, double par_gamma):
    """Scalar-accumulating Cython version of the python mktout (repeated
    here as part of the complete listing).

    Row arithmetic matches the python version exactly; all sums are kept
    in C doubles so no temporary arrays are allocated per call.
    Returns (t1_mean, t2_mean, p1_mean, p2_mean) over the n rows.
    """
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger, j_is_smaller
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c
    #compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(<double>mean_mu_alpha[0])
    exp[1] = cmath.exp(<double>mean_mu_alpha[1])
    exp[2] = cmath.exp(<double>mean_mu_alpha[2])
    exp[3] = cmath.exp(<double>mean_mu_alpha[3])
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0
    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0
    for i in range(n):
        # per-row mus and alphas: lognormal error times its scale factor
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]
        j_is_larger = mu10 > mu20
        j_is_smaller = not j_is_larger
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
        case1 = j_is_larger * (mu10 < 1 / c)        # use neither
        case2 = j_is_larger * (mu21 >= threshold2)  # use both
        case3 = j_is_larger ^ (case1 | case2)       # use product 1 only
        case4 = j_is_smaller * (mu20 < 1 / c)
        case5 = j_is_smaller * (mu11 >= threshold3)
        case6 = j_is_smaller ^ (case4 | case5)
        t0 = case1*c+case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * c +case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
        t1 = case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)
        t2 = c - t0 - t1
        p12 = case2 + case5
        p1 = case3 + p12
        p2 = case6 + p12
        t1_sum += t1
        t2_sum += t2
        p1_sum += p1
        p2_sum += p2
    return t1_sum/n, t2_sum/n, p1_sum/n, p2_sum/n
# Plain C struct of four doubles; used both for the input means
# (a..d = the four mean_mu_alpha entries) and for the four output
# statistics, so it can be passed around without the GIL.
ctypedef struct Vec4:
    double a
    double b
    double c
    double d
def outer_loop(list mean_mu_alpha, double[:, ::1] errors, double par_gamma, size_t n):
    """Benchmark driver: run cy_mktout n times inside an OpenMP prange.

    NOTE(review): every iteration recomputes the same result and writes the
    shared `out` struct, so the parallel workers race on it — acceptable as
    a timing harness only, not as a real parallel reduction.
    """
    cdef:
        size_t i
        Vec4 mean_vec
        Vec4 out
    # unbox the python list into a C struct once, before dropping the GIL
    mean_vec.a = <double>(mean_mu_alpha[0])
    mean_vec.b = <double>(mean_mu_alpha[1])
    mean_vec.c = <double>(mean_mu_alpha[2])
    mean_vec.d = <double>(mean_mu_alpha[3])
    with nogil:
        for i in prange(n):
            cy_mktout(&out, &mean_vec, errors, par_gamma)
    return out
cdef void cy_mktout(Vec4 *out, Vec4 *mean_mu_alpha, double[:, ::1] errors, double par_gamma) nogil:
    # GIL-free C version of mktout: identical per-row arithmetic, but the
    # four results are written into *out (a/b/c/d = t1/t2/p1/p2 means)
    # instead of being returned as a tuple, so it is callable from prange.
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger, j_is_smaller
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c
    #compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(mean_mu_alpha.a)
    exp[1] = cmath.exp(mean_mu_alpha.b)
    exp[2] = cmath.exp(mean_mu_alpha.c)
    exp[3] = cmath.exp(mean_mu_alpha.d)
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0
    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0
    for i in range(n):
        # per-row mus and alphas: lognormal error times its scale factor
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]
        j_is_larger = mu10 > mu20
        j_is_smaller = not j_is_larger
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
        case1 = j_is_larger * (mu10 < 1 / c)        # use neither
        case2 = j_is_larger * (mu21 >= threshold2)  # use both
        case3 = j_is_larger ^ (case1 | case2)       # use product 1 only
        case4 = j_is_smaller * (mu20 < 1 / c)
        case5 = j_is_smaller * (mu11 >= threshold3)
        case6 = j_is_smaller ^ (case4 | case5)
        t0 = case1*c+case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * c +case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
        t1 = case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)
        t2 = c - t0 - t1
        p12 = case2 + case5
        p1 = case3 + p12
        p2 = case6 + p12
        t1_sum += t1
        t2_sum += t2
        p1_sum += p1
        p2_sum += p2
    out.a = t1_sum/n
    out.b = t2_sum/n
    out.c = p1_sum/n
    out.d = p2_sum/n
And the setup.py
file that I use is as follows (has all of the optimization and OpenMP flags):
from distutils.core import setup
from Cython.Build import cythonize
from distutils.core import Extension
import numpy as np
import os
import shutil
import platform
# Per-platform extra link libraries for the extension build.
libraries = {
    "Linux": [],
    "Windows": [],
    # Fix: macOS was missing, so libraries[system] raised KeyError on the
    # very machine the question was benchmarked on.
    "Darwin": [],
}
language = "c"
# Aggressive optimization plus OpenMP; "-w" silences warnings from the
# generated C.  Note -ffast-math/-march=native trade strict IEEE behavior
# and portability for speed.
args = ["-w", "-std=c11", "-O3", "-ffast-math", "-march=native", "-fopenmp"]
link_args = ["-std=c11", "-fopenmp"]
annotate = True
# Cython compiler directives: disable bounds/None checks and use C division
# so the inner loops compile to plain C.
directives = {
    "binding": True,
    "boundscheck": False,
    "wraparound": False,
    "initializedcheck": False,
    "cdivision": True,
    "nonecheck": False,
    "language_level": "3",
    #"c_string_type": "unicode",
    #"c_string_encoding": "utf-8",
}

if __name__ == "__main__":
    system = platform.system()
    # .get() keeps the build working on platforms not listed above.
    libs = libraries.get(system, [])
    extensions = []
    # create one Extension per .pyx file found under the current directory
    for path, dirs, file_names in os.walk("."):
        for file_name in file_names:
            if file_name.endswith("pyx"):
                ext_path = "{0}/{1}".format(path, file_name)
                # "./pkg/mod.pyx" -> dotted module name "pkg.mod"
                ext_name = ext_path \
                    .replace("./", "") \
                    .replace("/", ".") \
                    .replace(".pyx", "")
                ext = Extension(
                    name=ext_name,
                    sources=[ext_path],
                    libraries=libs,
                    language=language,
                    extra_compile_args=args,
                    extra_link_args=link_args,
                    include_dirs=[np.get_include()],
                )
                extensions.append(ext)
    # cythonize all extensions and build them
    ext_modules = cythonize(
        extensions,
        annotate=annotate,
        compiler_directives=directives,
    )
    setup(ext_modules=ext_modules)
    # Optional cleanup, deliberately disabled.
    """
    #immediately remove build directory
    build_dir = "./build"
    if os.path.exists(build_dir):
        shutil.rmtree(build_dir)
    """
Update #3: Per the advice of @GZ0, there were lots of conditions where expressions in the code would evaluate to zero and be wastefully calculated. I have attempted to eliminate these areas with the following code (after fixing both the case3
and case6
statements):
cdef void cy_mktout_if(Vec4 *out, Vec4 *mean_mu_alpha, double[:, ::1] errors, double par_gamma) nogil:
    # Branchy variant of cy_mktout: per row, only the arm matching
    # j_is_larger is evaluated, skipping expressions that would be zero.
    # Results are written into *out (a/b/c/d = t1/t2/p1/p2 means).
    #
    # Fixes vs the posted version (behavior unchanged — all moved
    # expressions are pure arithmetic with no side effects):
    #   * dropped the unused `j_is_smaller` assignment (dead work per row);
    #   * threshold2/threshold3 are each computed only inside the branch
    #     that actually reads them.
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c
    #compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(mean_mu_alpha.a)
    exp[1] = cmath.exp(mean_mu_alpha.b)
    exp[2] = cmath.exp(mean_mu_alpha.c)
    exp[3] = cmath.exp(mean_mu_alpha.d)
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0
    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0
    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]
        j_is_larger = mu10 > mu20
        if j_is_larger:
            threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
            case1 = mu10 < 1 / c
            case2 = mu21 >= threshold2
            case3 = not (case1 | case2)
            t0 = case1*c + case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case3 / threshold2
            t1 = case2 * (t0 * alpha1 * mu11 - alpha1) + case3 * (t0 * alpha1 * mu10 - alpha1)
            t2 = c - t0 - t1
            t1_sum += t1
            t2_sum += t2
            p1_sum += case2 + case3
            p2_sum += case2
        else:
            threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
            case4 = mu20 < 1 / c
            case5 = mu11 >= threshold3
            case6 = not (case4 | case5)
            t0 = case4 * c + case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
            t1 = case5 * (t0 * alpha1 * mu11 - alpha1)
            t2 = c - t0 - t1
            t1_sum += t1
            t2_sum += t2
            p1_sum += case5
            p2_sum += case5 + case6
    out.a = t1_sum/n
    out.b = t2_sum/n
    out.c = p1_sum/n
    out.d = p2_sum/n
For 10000 iterations, the current code performs as follows:
outer_loop: 0.5116949229995953 seconds
outer_loop_if: 0.617649456995423 seconds
mktout: 0.9221872320049442 seconds
mktout_if: 1.430276553001022 seconds
python: 10.116664300003322 seconds
I think the cost of the conditional and the branch misprediction that results is making the function surprisingly slower, but I would appreciate any help clearing this up for certain.
Upvotes: 2