rxu
rxu

Reputation: 1407

Use more than one thread when calling intel's mkl directly from python

I call intel's math kernel library from python. So far, it is using just one cpu, instead of all 12 cpu, according to linux's top command. How to make it use all 12 cpu?

I have tried setting three environmental variables (OMP_NUM_THREADS, MKL_NUM_THREADS, MKL_DOMAIN_NUM_THREADS) to 12.

I have also tried mkl_set_num_threads.

numpy using mkl can use all 12 cpu in a dense matrix multiplication.

The mkl is 2016 version.

Any suggestion is welcomed.

Thanks.

Below is the test code:

from ctypes import *
import scipy.sparse as spsp
import numpy as np
import multiprocessing as mp

# Load the share library
mkl = cdll.LoadLibrary("libmkl_rt.so")


def get_csr_handle2(data, indices, indptr, shape):
    a_pointer   = data.ctypes.data_as(POINTER(c_float))
    ja_pointer  = indices.ctypes.data_as(POINTER(c_int))
    ia_pointer  = indptr.ctypes.data_as(POINTER(c_int))
    return (a_pointer, ja_pointer, ia_pointer, shape)


def get_csr_handle(A,clear=False):
    if clear == True:
        A.indptr[:] = 0
        A.indices[:] = 0
        A.data[:] = 0
    return get_csr_handle2(A.data, A.indices, A.indptr, A.shape)


def csr_t_dot_csr(A_handle, C_handle, nz=None):
    # Calculate (A.T).dot(A) and put result into C
    #
    # This uses one-based indexing
    #
    # Both C.data and A.data must be in np.float32 type.
    #
    # Number of nonzero elements in C must be greater than
    #     or equal to the size of C.data
    #
    # size of C.indptr must be greater than or equal to
    #     1 + (num rows of A).
    #
    # C_data    = np.zeros((nz), dtype=np.single)
    # C_indices = np.zeros((nz), dtype=np.int32)
    # C_indptr  = np.zeros((m+1),dtype=np.int32)

    (a_pointer, ja_pointer, ia_pointer, A_shape) = A_handle
    (c_pointer, jc_pointer, ic_pointer, C_shape) = C_handle

    trans_pointer   = byref(c_char('T'))
    sort_pointer    = byref(c_int(0))

    (m, n)          = A_shape
    sort_pointer        = byref(c_int(0))
    m_pointer           = byref(c_int(m))     # Number of rows of matrix A
    n_pointer           = byref(c_int(n))     # Number of columns of matrix A
    k_pointer           = byref(c_int(n))     # Number of columns of matrix B
                                              # should be n when trans='T'
                          # Otherwise, I guess should be m
    ###
    b_pointer   = a_pointer
    jb_pointer  = ja_pointer
    ib_pointer  = ia_pointer
    ###
    if nz == None:
        nz = n*n #*n # m*m # Number of nonzero elements expected
             # probably can use lower value for sparse
             # matrices.
    nzmax_pointer   = byref(c_int(nz))
     # length of arrays c and jc. (which are data and
     # indices of csr_matrix). So this is the number of
     # nonzero elements of matrix C
     #
     # This parameter is used only if request=0.
     # The routine stops calculation if the number of
     # elements in the result matrix C exceeds the
     # specified value of nzmax.

    info = c_int(-3)
    info_pointer = byref(info)
    request_pointer_list = [byref(c_int(0)), byref(c_int(1)), byref(c_int(2))]
    return_list = []
    for ii in [0]:
        request_pointer = request_pointer_list[ii]
        ret = mkl.mkl_scsrmultcsr(trans_pointer, request_pointer, sort_pointer,
                    m_pointer, n_pointer, k_pointer,
                    a_pointer, ja_pointer, ia_pointer,
                    b_pointer, jb_pointer, ib_pointer,
                    c_pointer, jc_pointer, ic_pointer,
                    nzmax_pointer, info_pointer)
        info_val = info.value
        return_list += [ (ret,info_val) ]
    return return_list

def test():
    num_cpu = 12
    mkl.mkl_set_num_threads(byref(c_int(num_cpu))) # try to set number of mkl threads
    print "mkl get max thread:", mkl.mkl_get_max_threads()
    test_csr_t_dot_csr()

def test_csr_t_dot_csr():
    AA = np.random.choice([0,1], size=(12,750000), replace=True, p=[0.99,0.01])
    A_original = spsp.csr_matrix(AA)
    A = A_original.astype(np.float32).tocsc()
    A = spsp.csr_matrix( (A.data, A.indices, A.indptr) )

    A.indptr  += 1 # convert to 1-based indexing
    A.indices += 1 # convert to 1-based indexing
    A_ptrs = get_csr_handle(A)

    C = spsp.csr_matrix( np.ones((12,12)), dtype=np.float32)
    C_ptrs = get_csr_handle(C, clear=True)

    print "=call mkl function=" 

    while (True):
        return_list = csr_t_dot_csr(A_ptrs, C_ptrs)

if __name__ == "__main__":
    test()

Upvotes: 2

Views: 907

Answers (1)

kangshiyin
kangshiyin

Reputation: 9781

It seems some of the python distributions (e.g., anaconda) will respond to the environmental variables, while others (CentOS 7 default) not. Never dig into it further.

Upvotes: 1

Related Questions