velenos14

Reputation: 566

parallel numba code on 20 cores slower than serial

The function I want to profile, to see whether it improves after adding the parallel=True keyword (potentially together with nogil=True), is quite simple and is shown below the profiling harness. It assumes a preamble has been loaded: some function definitions, some global variables, and two arrays read from disk (two .npy files). I run the script on 20 cores as NUMBA_NUM_THREADS=20 python main.py and profile it with:

if __name__ == "__main__":
    from timeit import Timer
    t = Timer(lambda: get_Cilmoft(Psi))    
    print( min( t.repeat(repeat=2, number=2) ) )

And the function of interest inside main.py is:

@njit(parallel=True, fastmath=True)
def get_Cilmoft(Psi):

    Cilmoft = np.zeros( (gl.N_rs - 1,  gl.N_thetas,  2*gl.N_thetas-1,  2), dtype=np.complex128)
    for i in prange(Cilmoft.shape[0]):
        for l in prange(Cilmoft.shape[1]):
            for m in prange(-l, l+1, 1):

                exp_of_phis = np.exp(-1j * m * gl.phis)  # a temporary array? allocation contention problem?

                for contor in prange(Psi.shape[2]):
                    Psi[:, :, contor] = Psi[:, :, contor] * exp_of_phis[contor]

                F1 = np.zeros( (Psi.shape[0], Psi.shape[1]), dtype=np.complex128) # a temporary array? allocation contention problem?
                for a in prange(Psi.shape[0]):
                    for b in prange(Psi.shape[1]):
                        F1[a, b] = np.trapz(Psi[a, b, :])

                poly_results = eval_legendre_withNJIT(l)
                F2 = np.zeros( (gl.N_rs - 1, ), dtype=np.complex128) # a temporary array? allocation contention problem?

                for idx in prange(gl.N_rs - 1):
                    F2[idx] = np.sum(weights_here * poly_results * F1[idx, :])

                Cilmoft[i, l, m+l, 0] = np.complex128(np.sum(eigenvectors_memory[:, i, l] * F2 * lobatto_weights))

    return Cilmoft

The preamble for a MWE is:

import numpy as np
import numba
from numba import njit, vectorize, prange
from scipy.special import roots_legendre, legendre, eval_legendre, lpmv


import gl

Psi = np.empty((80, 20, 18), dtype=np.complex128)

eigenenergies_memory = np.load("eigenValues_N_rs80_alpha25.0_rmax200.0.npy") # shall be 2D array shape (gl.N_rs - 1,   gl.N_thetas)
eigenvectors_memory = np.load("eigenVectors_N_rs80_alpha25.0_rmax200.0.npy") # shall be 3D array shape  (gl.N_rs - 1,   gl.N_rs - 1,   gl.N_thetas), first index is the r-dep, second index is the i-index, third index is the l-index
def gLLNodesAndWeights (n, epsilon = 1e-15):
    x = np.empty (n)
    w = np.empty (n)
    x[0] = -1
    x[n - 1] = 1
    w[0] = 2.0 / ((n * (n - 1)))
    w[n - 1] = w[0]
    n_2 = n // 2
    for i in range (1, n_2):
        xi = (1 - (3 * (n - 2)) / (8 * (n - 1) ** 3)) *\
           np.cos ((4 * i + 1) * np.pi / (4 * (n - 1) + 1))
        error = 1.0
        while error > epsilon:
            y  =  dLgP (n - 1, xi)
            y1 = d2LgP (n - 1, xi)
            y2 = d3LgP (n - 1, xi)
            dx = 2 * y * y1 / (2 * y1 ** 2 - y * y2)
            xi -= dx
            error = abs (dx)
        x[i] = -xi
        x[n - i - 1] =  xi
        w[i] = 2 / (n * (n - 1) * lgP (n - 1, x[i]) ** 2)
        w[n - i - 1] = w[i]
    if n % 2 != 0:
        x[n_2] = 0
        w[n_2] = 2.0 / ((n * (n - 1)) * lgP (n - 1, np.array (x[n_2])) ** 2)
    return x, w
def dLgP (n, xi):
  return n * (lgP (n - 1, xi) - xi * lgP (n, xi))\
           / (1 - xi ** 2)
def d2LgP (n, xi):
  return (2 * xi * dLgP (n, xi) - n * (n + 1)\
                                    * lgP (n, xi)) / (1 - xi ** 2)
def d3LgP (n, xi):
  return (4 * xi * d2LgP (n, xi)\
                 - (n * (n + 1) - 2) * dLgP (n, xi)) / (1 - xi ** 2)
def lgP (n, xi):
  if n == 0:
    return np.ones (xi.size)
  elif n == 1:
    return xi
  else:
    fP = np.ones (xi.size); sP = xi.copy (); nP = np.empty (xi.size)
    for i in range (2, n + 1):

      nP = ((2 * i - 1) * xi * sP - (i - 1) * fP) / i
      fP = sP; sP = nP
    return nP
a = gLLNodesAndWeights(gl.N_rs + 1)[0]
roots = a[1:-1]
lobatto_nodes = roots
b =  roots_legendre(gl.N_thetas)
roots_here, weights_here = np.float64(b[0]), np.float64(b[1])

legendre_results_for_Psi_lm = np.zeros((roots.shape[0], ), dtype=np.complex128)
for j in range(roots.shape[0]):
    legendre_results_for_Psi_lm[j] = eval_legendre(gl.N_rs, roots[j])

@njit
def eval_legendre_for_Psi_lm(j):
    return legendre_results_for_Psi_lm[j]

legendre_results = np.zeros( (gl.N_thetas, roots_here.shape[0]) , dtype=np.complex128)
for l in range(gl.N_thetas):
    legendre_results[l, :] = eval_legendre(l, roots_here)
@njit
def eval_legendre_withNJIT(l):
    return legendre_results[l, :]

@njit
def r(x):
    return gl.L * (1 + x) / (1 - x + gl.alpha)
@njit
def r_prime(x):
    return gl.L * (2 + gl.alpha) / (1 - x + gl.alpha)**2

lobatto_weights = 2 / ( (gl.N_rs) * (gl.N_rs+1) * eval_legendre(gl.N_rs, lobatto_nodes)**2 )

The gl.py is:

import numpy as np

N_rs = np.int32(80)
N_thetas = 20
N_phis = 18
phis = np.linspace(0.0, 2*np.pi, num = N_phis)

delta_theta = np.float64(np.pi / N_thetas)
delta_phi = np.float64(2*np.pi / N_phis)

N_timesteps = int(500)  # np.int is deprecated/removed in recent NumPy versions

r_max = np.float64(200.0)
alpha = np.float64(25)
L = alpha * r_max / 2

Temp_Env = "sin_squared"
N = np.int32(12)

delta_t = 0.1
omega = 0.057 # equivalent to lambda = 800 nm

c_au = 137.036

The eigenenergies_memory and eigenvectors_memory arrays are obtained from the script below (just run it once and it will save them to disk):

import numpy as np
import scipy
from matplotlib import pyplot as plt

from scipy.special import roots_legendre, legendre
from numpy.linalg import eig

import gl

def gLLNodesAndWeights (n, epsilon = 1e-15):

    x = np.empty (n)
    w = np.empty (n)
    
    x[0] = -1 
    x[n - 1] = 1
    w[0] = 2.0 / ((n * (n - 1))) 
    w[n - 1] = w[0]
    
    n_2 = n // 2
    
    for i in range (1, n_2):
        xi = (1 - (3 * (n - 2)) / (8 * (n - 1) ** 3)) *\
           np.cos ((4 * i + 1) * np.pi / (4 * (n - 1) + 1))
      
        error = 1.0
      
        while error > epsilon:
            y  =  dLgP (n - 1, xi)
            y1 = d2LgP (n - 1, xi)
            y2 = d3LgP (n - 1, xi)
        
            dx = 2 * y * y1 / (2 * y1 ** 2 - y * y2)
        
            xi -= dx
            error = abs (dx)
      
        x[i] = -xi
        x[n - i - 1] =  xi
      
        w[i] = 2 / (n * (n - 1) * lgP (n - 1, x[i]) ** 2)
        w[n - i - 1] = w[i]

    if n % 2 != 0:
        x[n_2] = 0
        w[n_2] = 2.0 / ((n * (n - 1)) * lgP (n - 1, np.array (x[n_2])) ** 2)

    return x, w

def dLgP (n, xi):

  return n * (lgP (n - 1, xi) - xi * lgP (n, xi))\
           / (1 - xi ** 2)

def d2LgP (n, xi):

  return (2 * xi * dLgP (n, xi) - n * (n + 1)\
                                    * lgP (n, xi)) / (1 - xi ** 2)

def d3LgP (n, xi):

  return (4 * xi * d2LgP (n, xi)\
                 - (n * (n + 1) - 2) * dLgP (n, xi)) / (1 - xi ** 2)

def lgP (n, xi):

  if n == 0:
    return np.ones (xi.size)
  
  elif n == 1:
    return xi

  else:
    fP = np.ones (xi.size); sP = xi.copy (); nP = np.empty (xi.size)
    for i in range (2, n + 1):

      nP = ((2 * i - 1) * xi * sP - (i - 1) * fP) / i
      fP = sP; sP = nP

    return nP


a = gLLNodesAndWeights(gl.N_rs + 1)[0]
roots = a[1:-1] # has shape (gl.N_rs - 1) if input to gLLNodesAndWeights is equal to (gl.N_rs + 1)




# Non-Linear Mapping 
def r(x):
    return gl.L * (1 + x) / (1 - x + gl.alpha)

def r_prime(x):
    return gl.L * (2 + gl.alpha) / (1 - x + gl.alpha)**2


# H_{l}^{0}(x) assembly functions
def V_l(x, l, Z):
    return ( l*(l+1) / (2*r(x)**2) ) + Coulomb(r(x), Z)

def Coulomb(r, Z):
    return (-Z / r)


Z = 1
H = np.zeros(  (gl.N_rs - 1,  gl.N_rs - 1,  gl.N_thetas),   dtype=np.complex128)
pref = gl.N_rs * (gl.N_rs + 1) / 6
diag_values_Kinetic = np.array([  (1/r_prime(x)) * pref * (1/(1-x**2)) * (1/r_prime(x)) for x in roots])

for l in range(gl.N_thetas):
    diag_values_Coulomb_and_Laser = np.array([  V_l(x, l, Z) for x in roots ])
    np.fill_diagonal( H[:, :, l],   diag_values_Kinetic + diag_values_Coulomb_and_Laser )
    for i in range(H.shape[0]):
        for j in range(H.shape[1]):
            if (i != j):
                H[i, j, l] = (1/r_prime(roots[i])) * (1/r_prime(roots[j])) * (1/(roots[i]-roots[j])**2) 
            # else already filled by np.fill_diagonal() from above

eig_values = np.zeros( (gl.N_rs - 1,  gl.N_thetas), dtype=np.complex128)
eig_vectors = np.zeros( (gl.N_rs - 1,  gl.N_rs - 1,  gl.N_thetas), dtype = np.complex128)

for l in range(gl.N_thetas):
    eig_values[:, l], eig_vectors[:, :, l] = eig( H[:, :, l] )

np.save("eigenVectors_N_rs{}_alpha{}_rmax{}.npy".format(gl.N_rs, gl.alpha, gl.r_max), eig_vectors)
np.save("eigenValues_N_rs{}_alpha{}_rmax{}.npy".format(gl.N_rs, gl.alpha, gl.r_max), eig_values)

What I have tried in order to optimize get_Cilmoft(Psi):

  1. I read on Stack Overflow that allocating small arrays inside a prange for-loop causes contention on the standard allocator and should be avoided, so I moved the allocations of F1 and F2 to immediately below the allocation of Cilmoft. Hoisting those allocations out of the nested for-loops did not improve the time to solution of this function on 20 cores.
  2. I have played with the prange keyword, keeping prange only on the outermost for-loop and using plain range for all the others, i.e. for i in prange(Cilmoft.shape[0]): followed by ordinary range loops. No significant improvement.
  3. I have tried adding nogil=True; the change in execution time was very minor.
  4. I have tried removing parallel=True, and the time to solution improved massively when running the .py the same way as before. So it is clear that either I am doing something wrong, or I simply cannot make this piece of code run better in parallel.
  5. Might it be that @njit "does not like" np.sum() and np.trapz()? I tried to replace the two np.sum() calls by:
for idx in prange(80 - 1):
    # F2[idx] = np.sum(weights_here * poly_results * F1[idx, :])
    int_res = weights_here * poly_results * F1[idx, :]
    for n in range(int_res.shape[0]):
        F2[idx] += int_res[n]

and by:

intermediate_res  = eigenvectors_memory[:, i, l] * F2 * lobatto_weights
for k in range(intermediate_res.shape[0]):
    Cilmoft[i, l, m+l, 0] += np.complex128(intermediate_res[k])
  6. I made the number of iterations of the for-loops constants instead of Psi.shape[0], Psi.shape[1] or similar expressions. The loops now look like for i in range(80 - 1): ..., i.e. manually written ints.

  7. I no longer allocate the temporary arrays exp_of_phis and poly_results inside the triple nested (i, l, m) for-loop, but pre-compute them outside the function as:

exp_of_phis_matrix = np.zeros( (19*2+1, 18), dtype=np.complex128)
for m in range(-19, 20, 1):
    exp_of_phis_matrix[m+19, :] = np.exp(-1j * m * gl.phis)
poly_results_vector = eval_legendre_withNJIT(np.array([[l for l in range(20)]]).reshape(20, ))

Then use them as:

for contor in range(18):
    Psi[:, :, contor] = Psi[:, :, contor] * exp_of_phis_matrix[m+19, contor]

and as:

for idx in prange(80 - 1):
    F2[idx] = np.sum(weights_here * poly_results_vector[l] * F1[idx, :])

This gave the first visible improvement of everything I tried. Still, running with parallel=True is slower than running without it by at least one order of magnitude. A sketch combining the changes above is shown below.
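
For concreteness, here is a minimal sketch of the combined restructuring (an illustration, not my final code): only the outermost loop is a prange, the phi-rotation is applied on the fly without overwriting the input Psi, the reductions are written as explicit loops, and the precomputed exp_of_phis_matrix from point 7 and the legendre_results, weights_here, eigenvectors_memory and lobatto_weights globals from the preamble are reused. Unlike the original function, it always multiplies the unmodified Psi by the phase factors, which I believe is the intended behaviour:

@njit(parallel=True, fastmath=True)
def get_Cilmoft_outer_prange(Psi):
    # parallelize only over i; each i-iteration writes only to its own
    # temporaries and to the disjoint slice Cilmoft[i, ...]
    Cilmoft = np.zeros( (gl.N_rs - 1,  gl.N_thetas,  2*gl.N_thetas-1,  2), dtype=np.complex128)
    for i in prange(Cilmoft.shape[0]):
        F1 = np.zeros( (Psi.shape[0], Psi.shape[1]), dtype=np.complex128)
        F2 = np.zeros( (gl.N_rs - 1, ), dtype=np.complex128)
        for l in range(Cilmoft.shape[1]):
            for m in range(-l, l+1):
                # phi-integration (trapezoid rule, unit spacing) of the rotated Psi,
                # reading Psi without modifying it
                for a in range(Psi.shape[0]):
                    for b in range(Psi.shape[1]):
                        acc = 0.0 + 0.0j
                        for contor in range(Psi.shape[2] - 1):
                            y0 = Psi[a, b, contor]     * exp_of_phis_matrix[m + gl.N_thetas - 1, contor]
                            y1 = Psi[a, b, contor + 1] * exp_of_phis_matrix[m + gl.N_thetas - 1, contor + 1]
                            acc += 0.5 * (y0 + y1)
                        F1[a, b] = acc
                # theta-quadrature with the Gauss-Legendre weights
                for idx in range(gl.N_rs - 1):
                    s = 0.0 + 0.0j
                    for b in range(F1.shape[1]):
                        s += weights_here[b] * legendre_results[l, b] * F1[idx, b]
                    F2[idx] = s
                # projection onto the eigenvectors with the Lobatto weights
                c = 0.0 + 0.0j
                for k in range(F2.shape[0]):
                    c += eigenvectors_memory[k, i, l] * F2[k] * lobatto_weights[k]
                Cilmoft[i, l, m+l, 0] = c
    return Cilmoft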

Does anyone know what I could try or read as the next steps?

Edit after user_na's comment: I now profile like this:

if __name__ == "__main__":
    from timeit import Timer
    # call each function which will be used inside get_Cilmoft(Psi) once, to avoid timing their compilation

    eval_legendre_withNJIT(2)
    get_Cilmoft(Psi)

    t = Timer(lambda: get_Cilmoft(Psi))
    print( min( t.repeat(repeat=2, number=2) ) )

And I get the following. I varied gl.N_rs, i.e. Psi.shape[0], over {40, 80, 160, 320, 640} and ran both the parallel and the serial function on those Psi's. For the parallel runs I obtained better results when only the first for-loop kept the prange keyword. The first time is the parallel function with all its for-loops using prange, the second is the parallel function with only its first for-loop using prange, and the third is the serial function. An underscore means that I do not have those times yet (simulations are in progress).
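
The three variants are timed with the same harness for each loaded Psi. A minimal sketch, where get_Cilmoft_all_prange, get_Cilmoft_outer_prange and get_Cilmoft_serial are placeholder names for the three separately compiled copies of the function (each gl.N_rs value is a separate run, since the preamble reloads the .npy files):

from timeit import Timer

variants = {
    "parallel, all loops prange":  get_Cilmoft_all_prange,
    "parallel, outer loop prange": get_Cilmoft_outer_prange,
    "serial":                      get_Cilmoft_serial,
}

for name, func in variants.items():
    func(Psi)                                  # warm-up call so compilation is not timed
    t = Timer(lambda func=func: func(Psi))     # bind func explicitly inside the lambda
    print(name, min(t.repeat(repeat=2, number=2)))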

Update: I wrote my own versions of np.trapz and np.sum, @njit-ted them (with fastmath=True only) and used them wherever I was calling np.trapz and np.sum(). The times are now as follows (serial and parallel):
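
For reference, a minimal sketch of what those replacements look like; the names my_sum and my_trapz are mine, and my_trapz assumes unit spacing, matching how np.trapz(y) is called above without an x argument:

from numba import njit

@njit(fastmath=True)
def my_sum(arr):
    # plain accumulation loop over a 1D complex array
    s = 0.0 + 0.0j
    for k in range(arr.shape[0]):
        s += arr[k]
    return s

@njit(fastmath=True)
def my_trapz(y):
    # trapezoidal rule with unit spacing, like np.trapz(y) with no x or dx given
    s = 0.0 + 0.0j
    for k in range(y.shape[0] - 1):
        s += 0.5 * (y[k] + y[k + 1])
    return s

They are then called as F1[a, b] = my_trapz(Psi[a, b, :]) and F2[idx] = my_sum(weights_here * poly_results_vector[l] * F1[idx, :]).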

Thank you!

Upvotes: 1

Views: 390

Answers (1)

user_na

Reputation: 2273

My comment is getting lengthy, so I will write it as an answer instead.

Generally it is important to check whether there is a constant offset, i.e. something that scales like O(1), which makes your algorithm seem worse than it actually is.

The first thing is the constant compile time. To be 100% sure, you could explicitly call the function once before profiling, although this is probably already taken care of by your min().

get_Cilmoft(Psi)
t = Timer(lambda: get_Cilmoft(Psi))    
print( min( t.repeat(repeat=2, number=2) ) )

The second thing is the overhead the parallelized function has for setting up and organizing the threads. Repeating the measurement will not reduce this time, so use reasonably big data sets to keep this O(1) part small compared to the computing time. It is also advisable to check how the parallelized version scales with the size of Psi and compare that to the reference (original) version.

If your Psi is not big enough, it might be that you lose your performance to the overhead of the parallelized version.

Check the timings for different sizes of Psi: do the two versions scale similarly with the size of Psi? By "scaling similarly" I mean comparing the ratio time for Psi of size 2N / time for Psi of size N between the two implementations, with the same N and 2N for both.
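
For example, if you collect the measured runtimes in one dict per implementation, keyed by N = Psi.shape[0], the check is just (a sketch, to be filled with your own measurements):

def scaling_ratios(times):
    # times: dict mapping N (= Psi.shape[0]) to the measured runtime in seconds
    ns = sorted(times)
    return {(n1, n2): times[n2] / times[n1] for n1, n2 in zip(ns, ns[1:])}

# compare scaling_ratios(times_serial) with scaling_ratios(times_parallel),
# where each dict holds your own measurements for N = 40, 80, 160, ...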

If you share this info, we can dig deeper into the possible root cause of your observation.

[update]

The behavior is very strange indeed. It gets a factor of 2 faster, only to get slower again when you increase the data size once more. I am not sure I will be of big help, but here is one thing that is very odd in your code.

def get_Cilmoft(Psi):
    ...
    for i in prange(Cilmoft.shape[0]):
        for l in prange(Cilmoft.shape[1]):
            for m in prange(-l, l+1, 1):
                ...
                for contor in prange(Psi.shape[2]):
                    Psi[:, :, contor] = Psi[:, :, contor] * exp_of_phis[contor]

In each of the parallel loops you change the contents of your input variable. I don't know how the compiler deals with this, but it could very well be the problem. It might detect this as a dependency between the iterations of the loop and skip the parallelization here. Try using a temporary variable to save the result instead, as sketched below.
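
Something along these lines, i.e. a temporary array that is private to the iteration instead of writing back into Psi (only a sketch of the idea; the rest of the loop body stays as it is):

for i in prange(Cilmoft.shape[0]):
    for l in prange(Cilmoft.shape[1]):
        for m in prange(-l, l+1, 1):
            exp_of_phis = np.exp(-1j * m * gl.phis)
            # Psi is now only read, never written, so the iterations no longer
            # look data-dependent to the compiler
            Psi_rot = np.empty_like(Psi)
            for contor in prange(Psi.shape[2]):
                Psi_rot[:, :, contor] = Psi[:, :, contor] * exp_of_phis[contor]
            # ... compute F1 from Psi_rot instead of Psi; the rest stays as before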

I am also not sure how numba handles write access to shared variables. It could be that it creates semaphores, so all parallel threads queue up and wait to write their data back. This can happen if the individual calculations are too small. Try commenting out the writes to variables at different places in your code and check how this affects your calculation time.

Hope this helps at least a bit.

Upvotes: 2
