Endeavour

Reputation: 63

Why can't this code be parallelised with Numba?

I am working on a physics simulation: an implementation of the Wolff cluster algorithm for the Ising model, which on the programming side is mainly manipulation of NumPy arrays. I want to plot specific heat, energy and magnetization against temperature. The simulations for different temperatures are entirely independent, so I tried to parallelise the temperature loop with Numba's prange, but Numba refuses to compile it in nopython mode. The reason seems to be another function that I call from my main function.

My code is here:

# -*- coding: utf-8 -*-
"""
Created on Mon Mar 30 18:39:45 2020

@author: ENDEAVOUR
"""

#%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from numba import jit,prange
import timeit
#----------------------------------------------------------------------
##  semi-block codes################
#----------------------------------------------------------------------

start = timeit.default_timer()
error_model=True

@jit(error_model="numpy",nopython=True)
def initialstate(N):   
    ''' generates a random spin configuration for initial condition'''
    state =2*np.random.randint(0,2, size=(N,N))-1
    return state

@jit(error_model="numpy",nopython=True)
def calcEnergy(config):
    '''Energy of a given configuration'''
    N=len(config)
    energy = 0
    for i in range(len(config)):
        for j in range(len(config)):
               S = config[i,j]
               nb = config[(i+1)%N, j] + config[i,(j+1)%N]  + config[(i-1)%N, j] + config[i,(j-1)%N] 
               energy += -nb*S - H*S
    return energy/(4.)

@jit(error_model="numpy",nopython=True)
def calcMag(config):
    '''Magnetization of a given configuration'''
    mag = np.sum(config) + H
    return mag


def wolff(config,p):## Wolff cluster implementation

   ## the following is the function for a single run of cluster making and flipping.


   #print(config)
   '''Monte Carlo move using Wolff-Cluster Sampling'''

   que = []  ### "que" stores the list of coordinates of the neighbours added to the cluster
   (x0,y0) = np.random.randint(len(config)),np.random.randint(len(config)) ## a random spin is chosen at first
   que.append((x0,y0)) ## then added to the "que"


   while (len(que) > 0):## the loop must run until there is no spin left to be added to the cluster

       #print('Length of que ',len(que))

       q = que[np.random.randint(len(que))] ## a random element from que is chosen

       neighbours = [((q[0]+1)%N,q[1]), ((q[0]-1)%N,q[1]), (q[0],(q[1]+1)%N), (q[0],(q[1]-1)%N) ] ## neighbours to the selected spin are considered
       for c in neighbours:

           if all([config[q]==config[c]  , c not in que,np.random.rand() < p]):
## spins are added to the que according to Wolff's criterion if they have the same orientation
              que.append(c)


       config[q] *= -1 ## the spin 'q' selected from the que is flipped so that it cannot be selected again
       que.remove(q)  ## the spin 'q' is removed from the que

   return config



@jit(error_model="numpy",parallel=True)
def run_simulation(N,eqSteps,mcSteps,T,nt):

    E,M,C,X = np.empty(nt), np.empty(nt), np.empty(nt), np.empty(nt)

    n1, n2  = 1.0/(mcSteps*N*N), 1.0/(mcSteps*mcSteps*N*N) 


    for tt in prange(nt):

        E1 = M1 = E2 = M2 = 0
        config = initialstate(N)
        iT=1.0/T[tt]; iT2=iT*iT;
        p = 1 - np.exp(-2*J*iT)

        for i in range(eqSteps):         # equilibrate
            config=wolff(config,p)                # Monte Carlo moves



        for i in range(mcSteps):
            config=wolff(config,p)            
            Ene = calcEnergy(config)     # calculate the energy
            Mag = abs(calcMag(config))       # calculate the magnetisation



            E1 = E1 + Ene
            M1 = M1 + Mag
            M2 = M2 + Mag*Mag 
            E2 = E2 + Ene*Ene


        E[tt] = n1*E1
        M[tt] = n1*M1
        C[tt] = (n1*E2 - n2*E1*E1)*iT2
        X[tt] = (n1*M2 - n2*M1*M1)*iT

        print ("Temp:",T[tt],":", E[tt], M[tt],C[tt],X[tt])


    return E,M,C,X

#def control():
####################################
N  = 64       #N X N spin field
J  = 1
H  = 0
nt = 50
eqSteps = 150
mcSteps = 20

#############################################No change required here
T  = np.linspace(1.33, 4.8, nt) 
E,M,C,X=run_simulation(N,eqSteps,mcSteps,T,nt)



f = plt.figure(figsize=(25, 20)); # plot the calculated values    
plt.title("External Field :%5.2f"%(H))
sp =  f.add_subplot(3, 2, 1 );
plt.scatter(T, E, s=50, marker='o', color='Red')
plt.xlabel("Temperature (T)", fontsize=20);
plt.ylabel("Energy ", fontsize=20);         plt.axis('tight');

sp =  f.add_subplot(3, 2, 2 );
plt.scatter(T, M, s=50, marker='o', color='Blue')
plt.xlabel("Temperature (T)", fontsize=20); 
plt.ylabel("Magnetization ", fontsize=20);   plt.axis('tight');

sp =  f.add_subplot(3, 2, 3 );
plt.scatter(T, C, s=50, marker='o', color='Red')
plt.xlabel("Temperature (T)", fontsize=20);  
plt.ylabel("Specific Heat ", fontsize=20);   plt.axis('tight');   

sp =  f.add_subplot(3, 2, 4 );
plt.scatter(T, X, s=50, marker='o', color='RoyalBlue')
plt.xlabel("Temperature (T)", fontsize=20); 
plt.ylabel("Susceptibility", fontsize=20);   plt.axis('tight');

sp = f.add_subplot(3 ,2 ,5);
plt.scatter(E, M,s=50, marker='+', color='Red') 
plt.xlabel("Energy ", fontsize=20);         plt.axis('tight');
plt.ylabel("Magnetization ", fontsize=20);   plt.axis('tight');

plt.show()
stop = timeit.default_timer()

print('Time taken for this simulation:: ', stop - start)



The error messages shown are:

E:/Project/wolff_cluster.2.py:78: NumbaWarning: 
Compilation is falling back to object mode WITH looplifting enabled because Function "run_simulation" failed type inference due to: Untyped global name 'wolff': cannot determine Numba type of <class 'function'>

File "wolff_cluster.2.py", line 94:
def run_simulation(N,eqSteps,mcSteps,T,nt):
    <source elided>
        for i in range(eqSteps):         # equilibrate
            config=wolff(config,p)                # Monte Carlo moves
            ^

  @jit(error_model="numpy",parallel=True,nopython=True)
E:/Project/wolff_cluster.2.py:78: NumbaWarning: 
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "run_simulation" failed type inference due to: cannot determine Numba type of <class 'numba.dispatcher.LiftedLoop'>

File "wolff_cluster.2.py", line 86:
def run_simulation(N,eqSteps,mcSteps,T,nt):
    <source elided>

    for tt in prange(nt):
    ^

  @jit(error_model="numpy",parallel=True,nopython=True)
C:\Users\Sandipan\Anaconda3\lib\site-packages\numba\compiler.py:725: NumbaWarning: Function "run_simulation" was compiled in object mode without forceobj=True, but has lifted loops.

File "wolff_cluster.2.py", line 79:
@jit(error_model="numpy",parallel=True,nopython=True)
def run_simulation(N,eqSteps,mcSteps,T,nt):
^

  self.func_ir.loc))
C:\Users\Sandipan\Anaconda3\lib\site-packages\numba\compiler.py:734: NumbaDeprecationWarning: 
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit

File "wolff_cluster.2.py", line 79:
@jit(error_model="numpy",parallel=True,nopython=True)
def run_simulation(N,eqSteps,mcSteps,T,nt):
^

  warnings.warn(errors.NumbaDeprecationWarning(msg, self.func_ir.loc))
E:/Project/wolff_cluster.2.py:78: NumbaWarning: 
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "run_simulation" failed type inference due to: Untyped global name 'wolff': cannot determine Numba type of <class 'function'>

File "wolff_cluster.2.py", line 94:
def run_simulation(N,eqSteps,mcSteps,T,nt):
    <source elided>
        for i in range(eqSteps):         # equilibrate
            config=wolff(config,p)                # Monte Carlo moves
            ^

  @jit(error_model="numpy",parallel=True,nopython=True)
C:\Users\Sandipan\Anaconda3\lib\site-packages\numba\compiler.py:725: NumbaWarning: Function "run_simulation" was compiled in object mode without forceobj=True.

File "wolff_cluster.2.py", line 86:
def run_simulation(N,eqSteps,mcSteps,T,nt):
    <source elided>

    for tt in prange(nt):
    ^

  self.func_ir.loc))
C:\Users\Sandipan\Anaconda3\lib\site-packages\numba\compiler.py:734: NumbaDeprecationWarning: 
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit

File "wolff_cluster.2.py", line 86:
def run_simulation(N,eqSteps,mcSteps,T,nt):
    <source elided>

    for tt in prange(nt):
    ^

  warnings.warn(errors.NumbaDeprecationWarning(msg, self.func_ir.loc))
E:/Project/wolff_cluster.2.py:78: NumbaWarning: 
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "run_simulation" failed type inference due to: Untyped global name 'wolff': cannot determine Numba type of <class 'function'>

File "wolff_cluster.2.py", line 94:
def run_simulation(N,eqSteps,mcSteps,T,nt):
    <source elided>
        for i in range(eqSteps):         # equilibrate
            config=wolff(config,p)                # Monte Carlo moves
            ^

  @jit(error_model="numpy",parallel=True,nopython=True)
Temp: 1.33 : -0.9890869140625 0.9942626953125 0.041807132522607905 0.017099021969053343

After showing the warnings above it runs as usual, but takes about 3x as long as the pure Python version. I also tried leaving the wolff function un-jitted, with no success. It is really important to parallelise the for loop in run_simulation, as it is the most time-consuming part of the code.

I think the problem is the wolff function; Numba can't do anything to optimise it. But I have not used anything there that Numba does not support (only plain lists and NumPy). The warning points at the if condition with the 'all' keyword. I have tried 'and' inside the if instead, but then the program runs into an infinite loop: it keeps appending the same element to the que (it apparently overlooks the 'not in' condition). I know this should be technically impossible (both 'and' and 'all' ought to work), so I am doing something wrong, but I have no idea what.

Can you please help me optimise this code? It works fine without Numba, but with N > 100 it will be too slow for any useful work.

Upvotes: 0

Views: 1607

Answers (1)

max9111

Reputation: 6482

First, get a runnable code

Parallelization is normally the last thing to consider. At best you can gain a speedup of a bit less than the number of cores available (usually less than an order of magnitude). Quite a lot of features are not available in parallel mode, so it is usually best to keep the code as simple as possible (which is enough to do the job here).

When using Numba it is in most cases recommended to get rid of any global variables, which can lead to hard-to-debug errors or very time-consuming recompilation of the code. In a compiled function there are no real global variables; their values are hard-coded into the function at compile time.

Example

import numba as nb
@nb.njit()
def why_globals_have_to_be_avoided(A):
    print(A+B)

A=0
B=5
why_globals_have_to_be_avoided(A)
#5

A=0
B=8
why_globals_have_to_be_avoided(A)
#5

#If this function is buried in your code it will be extremely hard to debug
#If you turn caching on, it may even get harder to find an error.

This problem is also quite interesting from a parallelization point of view: the inputs are tiny, but the execution time of each single calculation depends heavily on the temperature T. The work is simply split into chunks and passed to a number of threads; if all but one thread finish much earlier, you will not see any speedup. Since randomly rearranging the inputs is quite cheap here, shuffling the inputs is an easy workaround for this issue, as sketched below and used in the working example.
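A minimal sketch of the shuffle/unshuffle idea (the array names and the placeholder result are illustrative only, not part of the simulation code):

import numpy as np

T = np.linspace(1.33, 4.8, 50)

# shuffle the work items so that slow and fast temperatures are mixed across threads
ind = np.arange(T.shape[0])
np.random.shuffle(ind)
T_shuffled = T[ind]

# ... run the (parallel) simulation over T_shuffled, producing e.g. E_shuffled ...
E_shuffled = T_shuffled**2   # placeholder for the real per-temperature result

# undo the shuffling so the results line up with the original temperature order
ind_rev = np.argsort(ind)
E = E_shuffled[ind_rev]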


Working Example

import numpy as np
import numba as nb
#----------------------------------------------------------------------
##  semi-block codes################
#----------------------------------------------------------------------

@nb.njit(error_model="numpy")
def initialstate(N):   
    ''' generates a random spin configuration for initial condition'''
    state =2*np.random.randint(0,2, size=(N,N))-1
    return state

@nb.njit(error_model="numpy")
def calcEnergy(config,H):
    '''Energy of a given configuration'''
    N=len(config)
    energy = 0
    for i in range(len(config)):
        for j in range(len(config)):
            S = config[i,j]
            nb = config[(i+1)%N, j] + config[i,(j+1)%N]  + config[(i-1)%N, j] + config[i,(j-1)%N] 
            energy += -nb*S - H*S
    return energy/(4.)

@nb.njit(error_model="numpy")
def calcMag(config,H):
    '''Magnetization of a given configuration'''
    mag = np.sum(config) + H
    return mag

@nb.njit(error_model="numpy")
def wolff(config,p,N):## Wolff cluster implementation
    ## the following is the function for a single run of cluster making and flipping.
    que = []  ### "que" stores the list of coordinates of the neighbours added to the cluster
    x0,y0 = np.random.randint(len(config)),np.random.randint(len(config)) ## a random spin is chosen at first
    que.append((x0,y0)) ## then added to the "que"


    while (len(que) > 0):## the loop must run until there is no spin left to be added to the cluster
        q = que[np.random.randint(len(que))] ## a random element from que is chosen

        neighbours = [((q[0]+1)%N,q[1]), ((q[0]-1)%N,q[1]), (q[0],(q[1]+1)%N), (q[0],(q[1]-1)%N) ] ## neighbours to the selected spin are considered
        for c in neighbours:
            if config[q]==config[c] and c not in que and np.random.rand() < p:## spins are added to the que according to Wolff's criterion if they have the same orientation
                que.append(c)


        config[q] *= -1 ## the spin 'q' selected from the que is flipped so that it cannot be selected again
        que.remove(q)  ## the spin 'q' is removed from the que

    return config


@nb.njit(error_model="numpy",parallel=True)
def run_simulation(N,eqSteps,mcSteps,T,J,H):
    nt=T.shape[0]

    E,M,C,X = np.empty(nt), np.empty(nt), np.empty(nt), np.empty(nt)

    n1, n2  = 1.0/(mcSteps*N*N), 1.0/(mcSteps*mcSteps*N*N) 

    #It looks like the calculation time heavily depends on the temperature
    #shuffle the values to get the work split more equally across the threads
    #np.random.shuffle isn't supported by Numba, but you can use a Python callback (objmode)
    with nb.objmode(ind='int32[::1]'): 
        ind = np.arange(nt)
        np.random.shuffle(ind)

    ind_rev=np.argsort(ind)
    T=T[ind]

    for tt in nb.prange(nt):

        E1 = M1 = E2 = M2 = 0
        config = initialstate(N)
        iT=1.0/T[tt]; iT2=iT*iT;
        p = 1 - np.exp(-2*J*iT)

        for i in range(eqSteps):           # equilibrate
            config=wolff(config,p,N)       # Monte Carlo moves

        for i in range(mcSteps):
            config=wolff(config,p,N)            
            Ene = calcEnergy(config,H)     # calculate the energy
            Mag = abs(calcMag(config,H))   # calculate the magnetisation

            E1 = E1 + Ene
            M1 = M1 + Mag
            M2 = M2 + Mag*Mag 
            E2 = E2 + Ene*Ene


        E[tt] = n1*E1
        M[tt] = n1*M1
        C[tt] = (n1*E2 - n2*E1*E1)*iT2
        X[tt] = (n1*M2 - n2*M1*M1)*iT

        #print ("Temp:",T[tt],":", E[tt], M[tt],C[tt],X[tt])

    #undo the shuffling
    return E[ind_rev],M[ind_rev],C[ind_rev],X[ind_rev]

#def control():
####################################
N  = 64       #N X N spin field
J  = 1
H  = 0
nt = 50
eqSteps = 150
mcSteps = 20

#############################################No change required here
T  = np.linspace(1.33, 4.8, nt) 

#You can measure the compilation time separately; it doesn't make sense to mix
#runtime and compilation time together.
#If the compilation time becomes relevant it also makes sense to use `cache=True`
E,M,C,X=run_simulation(N,eqSteps,mcSteps,T,J,H)

%timeit E,M,C,X=run_simulation(N,eqSteps,mcSteps,T,J,H)
1.17 s ± 74.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

#without parallelization
2.1 s ± 44 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

#without compilation
~130 s
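
Outside of IPython, a minimal sketch for keeping compilation time out of the measurement (reusing the run_simulation defined above) is to call the function once before timing it:

import timeit

# the first call triggers the JIT compilation (and runs the simulation once)
run_simulation(N,eqSteps,mcSteps,T,J,H)

# later calls measure pure runtime
start = timeit.default_timer()
E,M,C,X = run_simulation(N,eqSteps,mcSteps,T,J,H)
print('runtime per call:', timeit.default_timer() - start, 's')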

Upvotes: 2
