Han Yuan
Han Yuan

Reputation: 13

Speed up an integration function in Python

I have a function that is the inner loop of some larger problem. SO it will be called millions of time. I have tried to optimize it. But since it is my first numerical project, I am wondering if there are other ways that can improve the speed.

cython does not seem to help. Maybe numpy is close to c already. or I don't write cython code efficiently.

import numpy as np
import math
import numexpr as ne


par_mu_rho = 0.8
par_alpha_rho = 0.7
# ' the first two are mean of mus and the '
# ' last two are the mean of alphas.'
cov_epsilon = [[1, par_mu_rho], [par_mu_rho, 1]]
cov_nu = [[1, par_alpha_rho], [par_alpha_rho, 1]]
nrows = 10000 
np.random.seed(123)
epsilon_sim = np.random.multivariate_normal([0, 0], cov_epsilon, nrows)
nu_sim = np.random.multivariate_normal([0, 0], cov_nu, nrows)
errors = np.concatenate((epsilon_sim, nu_sim), axis=1)
errors = np.exp(errors)


### the function to be optimized

def mktout(mean_mu_alpha, errors, par_gamma):
    mu10 = errors[:, 0] * math.exp(mean_mu_alpha[0])
    mu11 = math.exp(par_gamma) * mu10  # mu with gamma
    mu20 = errors[:, 1] * math.exp(mean_mu_alpha[1])
    mu21 = math.exp(par_gamma) * mu20
    alpha1 = errors[:, 2] * math.exp(mean_mu_alpha[2])
    alpha2 = errors[:, 3] * math.exp(mean_mu_alpha[3])

    j_is_larger = (mu10 > mu20)
    #     useneither1 = (mu10 < 1/168)
    threshold2 = (1 + mu10 * alpha1) / (168 + alpha1)
    #     useboth1 = (mu21 >= threshold2)
    j_is_smaller = ~j_is_larger
    #     useneither2 = (mu20 < 1/168)
    threshold3 = (1 + mu20 * alpha2) / (168 + alpha2)
    #     useboth2 = (mu11 >= threshold3)
    case1 = j_is_larger * (mu10 < 1 / 168)
    case2 = j_is_larger * (mu21 >= threshold2)
    #     case3 = j_is_larger * (1 - (useneither1 | useboth1))
    case3 = j_is_larger ^ (case1 | case2)
    case4 = j_is_smaller * (mu20 < 1 / 168)
    case5 = j_is_smaller * (mu11 >= threshold3)
    #     case6 = j_is_smaller * (1 - (useneither2 | useboth2))
    case6 = j_is_smaller ^ (case4 | case5)
    t0 = ne.evaluate(
        "case1*168+case2 * (168 + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * 168 +case5 * (168 + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3")
    # for some cases, t1 would be 0 anyway, so they are omitted here.
    t1 = ne.evaluate(
        "case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)")
    # t2 = (j_is_larger*useboth1*(t0*alpha2*mu21- alpha2) +
    #       j_is_smaller*useboth2*(t0*alpha2*mu21- alpha2) +
    #       j_is_smaller*(1- (useneither2|useboth2))*(t0*alpha2*mu20 - alpha2)
    #       )
    t2 = 168 - t0 - t1
    p12 = case2 + case5
    p1 = case3 + p12
    p2 = case6 + p12
    return t1.sum()/10000, t2.sum()/10000, p1.sum()/10000, p2.sum()/10000

timeit mktout([-6,-6,-1,-1], errors, -0.7)

On my old mac with 2.2GHz i7. the function runs at about 200µs.

Updates:

Based on suggestions and code from @CodeSurgeon and @GZ0, I settled on the following code

def mktout_full(double[:] mean_mu_alpha, double[:, ::1] errors, double par_gamma):
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        double threshold2, threshold3
        double t0, t1, t2
        double t1_sum, t2_sum, p1_sum, p2_sum, p12_sum
        double c

    #compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(<double>mean_mu_alpha[0])
    exp[1] = cmath.exp(<double>mean_mu_alpha[1])
    exp[2] = cmath.exp(<double>mean_mu_alpha[2])
    exp[3] = cmath.exp(<double>mean_mu_alpha[3])
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0

    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0
    p12_sum = 0.0

    for i in range(n) :
        mu10 = errors[i, 0] * exp[0]
    #         mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
    #         mu21 = exp_par_gamma * mu20
    #         alpha1 = errors[i, 2] * exp[2]
    #         alpha2 = errors[i, 3] * exp[3]
    #         j_is_larger = mu10 > mu20
    #         j_is_smaller = not j_is_larger

        if (mu10 >= mu20):
            if (mu10 >= 1/c) :
                mu21 = exp_par_gamma * mu20
                alpha1 = errors[i, 2] * exp[2]
                alpha2 = errors[i, 3] * exp[3]
                threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
                if (mu21 >= threshold2):
                    mu11 = exp_par_gamma * mu10
                    t0 =  (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
                    t1 = (t0 * alpha1 * mu11 - alpha1)
                    t1_sum += t1
                    t2_sum += c - t0 - t1
                    p1_sum += 1
                    p2_sum += 1
                    p12_sum += 1
                else :
                    t1_sum += ((1/threshold2) * alpha1 * mu10 - alpha1)
                    p1_sum += 1
        else :
            if (mu20 >= 1/c) :
                mu11 = exp_par_gamma * mu10
                alpha1 = errors[i, 2] * exp[2]
                alpha2 = errors[i, 3] * exp[3]
                threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
                if (mu11 >= threshold3):
                    mu21 = exp_par_gamma * mu20
                    t0 =  (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
                    t1 = (t0 * alpha1 * mu11 - alpha1)
                    t1_sum += t1
                    t2_sum += c - t0 - t1
                    p1_sum += 1
                    p2_sum += 1
                    p12_sum += 1
                else :
                    t2_sum += ((1/threshold3) * alpha2 * mu20 - alpha2)
                    p2_sum += 1

    return t1_sum/n, t2_sum/n, p1_sum/n, p2_sum/n, p12_sum/n

my original code runs at 650µs. mktout and mktout_if by codesurgeon run at about 220µs and 120µs. the above mktout_full runs at about 68 µs. What I do in mktout_full is to optimize the if-else logic in mktout_if. Perhaps surprisingly, parallelized out_loop by codesurgeon combined with if-else logic in mktout_full is way slower (121ms).

Upvotes: 1

Views: 308

Answers (1)

CodeSurgeon
CodeSurgeon

Reputation: 2465

Briefly looking at the code and attempting to cythonize it, simply adding ndarray types to all of the parameters and variables does not change performance meaningfully. If you are fighting for shaving off microseconds for this function in this tight inner loop, I would consider making the following modifications:

  1. The reason why this code is so tough to cythonize is that your code is vectorized. All of the operations are going through numpy or numexpr. While each of these operations alone are efficient, they all add some python overhead (which can be seen if you look at the annotated .html files cython can produce).
  2. If you are calling this function many times (as it appears based on your comments), you can save some time by making mktout a cdef function instead. Python function calls have significant overhead.
  3. Minor but you can try avoiding any functions from python's math module. You can replace this with from libc cimport math as cmath and use cmath.exp instead.
  4. I see your mktout function takes in a python list mean_mu_alpha. You could consider using a cdef class object to replace this parameter and type this instead. If you choose to make mktout a cdef function instead, this can become just a struct or double * array. Either way, indexing into a python list (which can be contain arbitrary python objects that need to be unboxed into corresponds c-types) is going to be slow.
  5. This is probably the most important part. For each call to mktout, you are allocating memory for lots of arrays (for each mu, alpha, threshold, case, t- and p- array). You then proceed to free all of this memory at the end of the function (through python's gc), only to likely use all of this space again the next call. If you can change the signature of mktout, you can pass in all of these arrays as parameters so that the memory can be reused and overwritten across function calls. Another option, which is better for this case, would be to iterate through the array and do all of the calculations one element at a time.
  6. You can multithread the code using cython's prange function. I would reach for this after you have made all of the above changes, and I would do the multithreading outside of the mktout function itself. That is, you would be multithreading calls to mktout rather than multithreading mktout itself.

Making the above changes will be a lot of work, and you will likely have to reimplement many of the functions provided by numpy and numexpr yourself to avoid the python overhead associated with each of time. Please let me know if any part of this is unclear.


Update #1: Implementing points #1, #3, and #5, I get about a 11x fold speed-up. Here is what this code looks like. I am sure it can go faster if you ditch the def function, the list mean_mu_alpha input, and the tuple output. Note: I get slightly different results in the last decimal place compared to the original code, likely due to some floating point rules I do not understand.

from libc cimport math as cmath
from libc.stdint cimport *
from libc.stdlib cimport *

def mktout(list mean_mu_alpha, double[:, ::1] errors, double par_gamma):
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger, j_is_smaller
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c

    #compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(<double>mean_mu_alpha[0])
    exp[1] = cmath.exp(<double>mean_mu_alpha[1])
    exp[2] = cmath.exp(<double>mean_mu_alpha[2])
    exp[3] = cmath.exp(<double>mean_mu_alpha[3])
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0

    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0

    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]

        j_is_larger = mu10 > mu20
        j_is_smaller = not j_is_larger
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)

        case1 = j_is_larger * (mu10 < 1 / c)
        case2 = j_is_larger * (mu21 >= threshold2)
        case3 = j_is_larger ^ (case1 | case2)
        case4 = j_is_smaller * (mu20 < 1 / c)
        case5 = j_is_smaller * (mu11 >= threshold3)
        case6 = j_is_smaller ^ (case4 | case5)

        t0 = case1*c+case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * c +case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
        t1 = case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)
        t2 = c - t0 - t1

        p12 = case2 + case5
        p1 = case3 + p12
        p2 = case6 + p12

        t1_sum += t1
        t2_sum += t2
        p1_sum += p1
        p2_sum += p2

    return t1_sum/n, t2_sum/n, p1_sum/n, p2_sum/n

Update #2: Implemented the cdef (#2), python object elimination (#4) and the multithreading (#6) ideas. #2 and #4 alone had negligible benefit, but were necessary for #6 since the GIL cannot be accessed in OpenMP prange loops. With the multithreading, you get an additional 2.5x speed boost on my quad core laptop, amounting to code that is about 27.5x faster than the original. My outer_loop function is not wholly accurate though since it just recalculates the same result over and over, but it should be enough for a test case. The complete code is below:

from libc cimport math as cmath
from libc.stdint cimport *
from libc.stdlib cimport *
from cython.parallel cimport prange

def mktout(list mean_mu_alpha, double[:, ::1] errors, double par_gamma):
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger, j_is_smaller
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c

    #compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(<double>mean_mu_alpha[0])
    exp[1] = cmath.exp(<double>mean_mu_alpha[1])
    exp[2] = cmath.exp(<double>mean_mu_alpha[2])
    exp[3] = cmath.exp(<double>mean_mu_alpha[3])
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0

    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0

    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]

        j_is_larger = mu10 > mu20
        j_is_smaller = not j_is_larger
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)

        case1 = j_is_larger * (mu10 < 1 / c)
        case2 = j_is_larger * (mu21 >= threshold2)
        case3 = j_is_larger ^ (case1 | case2)
        case4 = j_is_smaller * (mu20 < 1 / c)
        case5 = j_is_smaller * (mu11 >= threshold3)
        case6 = j_is_smaller ^ (case4 | case5)

        t0 = case1*c+case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * c +case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
        t1 = case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)
        t2 = c - t0 - t1

        p12 = case2 + case5
        p1 = case3 + p12
        p2 = case6 + p12

        t1_sum += t1
        t2_sum += t2
        p1_sum += p1
        p2_sum += p2

    return t1_sum/n, t2_sum/n, p1_sum/n, p2_sum/n

ctypedef struct Vec4:
    double a
    double b
    double c
    double d

def outer_loop(list mean_mu_alpha, double[:, ::1] errors, double par_gamma, size_t n):
    cdef:
        size_t i
        Vec4 mean_vec
        Vec4 out

    mean_vec.a = <double>(mean_mu_alpha[0])
    mean_vec.b = <double>(mean_mu_alpha[1])
    mean_vec.c = <double>(mean_mu_alpha[2])
    mean_vec.d = <double>(mean_mu_alpha[3])

    with nogil:
        for i in prange(n):
            cy_mktout(&out, &mean_vec, errors, par_gamma)
    return out

cdef void cy_mktout(Vec4 *out, Vec4 *mean_mu_alpha, double[:, ::1] errors, double par_gamma) nogil:
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger, j_is_smaller
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c

    #compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(mean_mu_alpha.a)
    exp[1] = cmath.exp(mean_mu_alpha.b)
    exp[2] = cmath.exp(mean_mu_alpha.c)
    exp[3] = cmath.exp(mean_mu_alpha.d)
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0

    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0

    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]

        j_is_larger = mu10 > mu20
        j_is_smaller = not j_is_larger
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)

        case1 = j_is_larger * (mu10 < 1 / c)
        case2 = j_is_larger * (mu21 >= threshold2)
        case3 = j_is_larger ^ (case1 | case2)
        case4 = j_is_smaller * (mu20 < 1 / c)
        case5 = j_is_smaller * (mu11 >= threshold3)
        case6 = j_is_smaller ^ (case4 | case5)

        t0 = case1*c+case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * c +case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
        t1 = case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)
        t2 = c - t0 - t1

        p12 = case2 + case5
        p1 = case3 + p12
        p2 = case6 + p12

        t1_sum += t1
        t2_sum += t2
        p1_sum += p1
        p2_sum += p2

    out.a = t1_sum/n
    out.b = t2_sum/n
    out.c = p1_sum/n
    out.d = p2_sum/n

And the setup.py file that I use is as follows (has all of the optimization and OpenMP flags):

from distutils.core import setup
from Cython.Build import cythonize
from distutils.core import Extension
import numpy as np
import os
import shutil
import platform

libraries = {
    "Linux": [],
    "Windows": [],
}
language = "c"
args = ["-w", "-std=c11", "-O3", "-ffast-math", "-march=native", "-fopenmp"]
link_args = ["-std=c11", "-fopenmp"]

annotate = True
directives = {
    "binding": True,
    "boundscheck": False,
    "wraparound": False,
    "initializedcheck": False,
    "cdivision": True,
    "nonecheck": False,
    "language_level": "3",
    #"c_string_type": "unicode",
    #"c_string_encoding": "utf-8",
}

if __name__ == "__main__":
    system = platform.system()
    libs = libraries[system]
    extensions = []
    ext_modules = []

    #create extensions
    for path, dirs, file_names in os.walk("."):
        for file_name in file_names:
            if file_name.endswith("pyx"):
                ext_path = "{0}/{1}".format(path, file_name)
                ext_name = ext_path \
                    .replace("./", "") \
                    .replace("/", ".") \
                    .replace(".pyx", "")
                ext = Extension(
                    name=ext_name, 
                    sources=[ext_path], 
                    libraries=libs,
                    language=language,
                    extra_compile_args=args,
                    extra_link_args=link_args,
                    include_dirs = [np.get_include()],
                )
                extensions.append(ext)

    #setup all extensions
    ext_modules = cythonize(
        extensions, 
        annotate=annotate, 
        compiler_directives=directives,
    )
    setup(ext_modules=ext_modules)

    """
    #immediately remove build directory
    build_dir = "./build"
    if os.path.exists(build_dir):
        shutil.rmtree(build_dir)
    """

Update #3: Per the advice of @GZ0, there were lots of conditions where expressions in the code would evaluate to zero and be wastefully calculated. I have attempted to eliminate these areas with the following code (after fixing both the case3 and case6 statements):

cdef void cy_mktout_if(Vec4 *out, Vec4 *mean_mu_alpha, double[:, ::1] errors, double par_gamma) nogil:
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c

    #compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(mean_mu_alpha.a)
    exp[1] = cmath.exp(mean_mu_alpha.b)
    exp[2] = cmath.exp(mean_mu_alpha.c)
    exp[3] = cmath.exp(mean_mu_alpha.d)
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0

    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0

    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]

        j_is_larger = mu10 > mu20
        j_is_smaller = not j_is_larger
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)

        if j_is_larger:
            case1 = mu10 < 1 / c
            case2 = mu21 >= threshold2
            case3 = not (case1 | case2)

            t0 = case1*c + case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case3 / threshold2
            t1 = case2 * (t0 * alpha1 * mu11 - alpha1) + case3 * (t0 * alpha1 * mu10 - alpha1)
            t2 = c - t0 - t1

            t1_sum += t1
            t2_sum += t2
            p1_sum += case2 + case3
            p2_sum += case2

        else:
            case4 = mu20 < 1 / c
            case5 = mu11 >= threshold3
            case6 = not (case4 | case5)

            t0 = case4 * c + case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
            t1 = case5 * (t0 * alpha1 * mu11 - alpha1)
            t2 = c - t0 - t1

            t1_sum += t1
            t2_sum += t2
            p1_sum += case5
            p2_sum += case5 + case6

    out.a = t1_sum/n
    out.b = t2_sum/n
    out.c = p1_sum/n
    out.d = p2_sum/n

For 10000 iterations, the current code performs as follows:

outer_loop: 0.5116949229995953 seconds
outer_loop_if: 0.617649456995423 seconds
mktout: 0.9221872320049442 seconds
mktout_if: 1.430276553001022 seconds
python: 10.116664300003322 seconds

I think the cost of the conditional and the branch misprediction that results is making the function surprisingly slower, but I would appreciate any help clearing this up for certain.

Upvotes: 2

Related Questions