Sina

Reputation: 411

Trying to improve performance of some vectorized numpy operations via cython

I am trying to speed up a vectorized numpy operation by Cythonizing it (or at least to see whether I can). The code calculates a sort of stress given two distance matrices (target_distances, and map_distances computed from a flattened coordinate vector) and some information about distance types (a type can range from 0 to 3, but for this first attempt I have taken them all to be 0). The numpy version is pycalculus.py:


import scipy
import numpy as np
from scipy.spatial import distance

def decay(x, s=10):
    
    return scipy.special.expit(s*x)

def stress(z, target_distances, dim, distance_types, step,
           nrows, ncols):

    row_coords = np.reshape(z[:dim*nrows],(nrows,dim))
    col_coords = np.reshape(z[dim*nrows:dim*(nrows+ncols)],(ncols,dim))
    
    map_distances = distance.cdist(row_coords, col_coords).copy()
    error = target_distances - map_distances
    
    I0 = (distance_types==0) | (distance_types==3)
    I2 = distance_types == 2

    return np.sum(error[I0]**2) + np.sum((error[I2] + step)**2*decay(error[I2] + step))



In the Cythonized version, my pyx file (calculus.pyx) is as follows:

import cython
cimport cython
from libc.stdlib cimport malloc, free

cdef extern from "ctools.h":
    double stress (double*, double**, int**, double, int, int, int)

@cython.boundscheck(False) 
@cython.wraparound(False)
@cython.nonecheck(False)     
def stress_cython(double[:] z, double[:,:] target_distances, int [:,:]
           distance_types, double step, int dim, int nrows, int ncols):
    
    cdef int i
    cdef int j
    cdef int N = (nrows+ncols)*dim
    
    z_C = <double*>malloc(sizeof(double)*N)
    target_distances_C = <double **>malloc(sizeof(double*)*nrows)
    distance_types_C = <int **>malloc(sizeof(int*)*nrows)
    
    for i in range(N):
        z_C[i] = z[i]
    
    for i in range(nrows):
        target_distances_C[i] = <double *>malloc(sizeof(double)*ncols)
        distance_types_C[i] = <int *>malloc(sizeof(int)*ncols)
        
        for j in range(ncols):
            target_distances_C[i][j] = target_distances[i,j]
            distance_types_C[i][j] = distance_types[i,j]
        
   
    stress_val = stress(z_C, target_distances_C, 
                        distance_types_C, step, nrows, ncols, dim)
    
    for i in range(nrows):
        free(target_distances_C[i])
        free(distance_types_C[i])
        
    free(target_distances_C)
    free(distance_types_C)
    free(z_C)
    
    return stress_val
    
    

And the external ctools.c is:

#include<stdio.h>
#include<stdlib.h>
#include<math.h>

double decay(double x){

    return 1/(1+exp(-10*x));
    
    }

double** dist_pairs(double* z, int nrows, int ncols, int dims)
    {

    int i,j,d;
    double coord1, coord2;
    double** dist;
    
    dist = (double **)malloc(sizeof(double*)*nrows);
    for (i=0; i<nrows; i++){
        dist[i] = (double *)malloc(sizeof(double)*ncols);
        }
    
    for (i=0; i<nrows; i++){
       for (j=0; j<ncols; j++){
            
            dist[i][j] = 0;
            
            for (d=0; d<dims; d++){
            
                coord1 = z[d*(nrows+ncols) + i];
                coord2 = z[d*(nrows+ncols) + nrows + j];
                dist[i][j] += pow(coord1-coord2,2.0) ;
                }
                
            dist[i][j] = sqrt(dist[i][j]);
        }
    }
                   
    return dist;
    
}

double stress(double* z, double **target_distances, int** distance_types, 
              double step, int nrows, int ncols, int dim){

    int i,j;
    double stress = 0.0;
    double err = 0.0;
    double **map_distances = dist_pairs(z, nrows, ncols, dim);

    for (i=0; i<nrows; i++){
        for (j=0; j<ncols; j++){
        
            if (distance_types[i][j]==0 || distance_types[i][j]==3){
                stress += pow(target_distances[i][j] - map_distances[i][j],2.0);
                }
            else if (distance_types[i][j]==2){
                err = target_distances[i][j] - map_distances[i][j] + step;
                stress += pow(step,2)*decay(step);

            }
                
        }
    
    }

    for (i=0; i<nrows; i++){
        free(map_distances[i]);
    }
    free(map_distances);

    return stress;

}

I run the following test (test.py):


import numpy as np
import time 
from calculus import stress_cython
from pycalculus import stress

nrows = 3000
ncols = 2000
dim = 2
N = 100
is_discrete = 1.0

dt0 = 0
dt1 = 0
difs = []
for i in range(N):
    coordinates = np.random.rand(dim, nrows+ncols)
    coordinates_flat = coordinates.flatten()
    
    target_distances = np.random.rand(nrows, ncols)
    distance_types = np.zeros((nrows,ncols), dtype='i')
    
    t0 = time.time()
    stress1 = stress_cython(coordinates_flat, target_distances, distance_types, is_discrete, dim, nrows,
                   ncols)
    t1 = time.time()
    
    stress2 = stress(coordinates.T.flatten(), target_distances, dim, distance_types, is_discrete,
                     nrows,ncols)
    
    t2 = time.time()
    
    dt0 += t1-t0
    dt1 += t2-t1
    
    difs.append(stress1-stress2)

print(f'cython:{dt0:.2f} python:{dt1:.2f}')

and the timings end up pretty similar on my machine (~4.47 s vs ~4.62 s). I do not quite understand why. Is there some parallelised computation going on for numpy and scipy in the background?

When I check my core usage, it does not seem to be so. I tried turning off parallelism via

export MKL_NUM_THREADS=1
export NUMEXPR_NUM_THREADS=1
export OMP_NUM_THREADS=1

in the shell before running test.py, and it did not make a difference. Is parallelism during vectorized operations what I am missing here, or am I just trying to optimize routines that have already been optimized to the best possible extent in these libraries?

ps1: I compile the Cython code with the "-ffast-math" flag.

ps2: In the end I will have all the cores engaged with different initial starting conditions, so if numpy automatically parallelises vectorized operations, it won't help me. That is one of the reasons why I am trying to understand what is going on.

Upvotes: 1

Views: 315

Answers (1)

Jérôme Richard

Reputation: 50433

Compiler optimizations and instruction set

First of all, Cython is often compiled with -O2, which is generally suboptimal (vectorization is not enabled except in very recent versions of GCC). -O3 can give better performance. Moreover, the SSE instruction set is used by default on x86-64 processors, while nearly all of them now support AVX (which can generally compute 2x more items per cycle). Flags like -mavx or even -mavx2 / -mfma can improve performance when supported (otherwise the target program can simply crash). You can use -march=native to let the compiler generate a non-portable program optimized for your processor. Numpy can detect AVX support at runtime and use AVX instructions when needed (though this is not the case for all operations).

Code issues

Operations like pow(coord1-coord2, 2.0) are not guaranteed to be optimized by the target compiler. It is certainly better to use pow(coord1-coord2, 2), or even (coord1-coord2) * (coord1-coord2).
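For instance, a minimal sketch of the two forms (the helper names are only for illustration):

#include <math.h>

/* The multiplication form does not depend on the compiler recognizing
   pow(x, 2.0) as a plain square. */
double sq_diff_pow(double a, double b) { return pow(a - b, 2.0); }
double sq_diff_mul(double a, double b) { double t = a - b; return t * t; }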

A double** data structure is not efficient for storing matrices. It is better to allocate one big flat double* array of width * height * sizeof(double) bytes. Indeed, the latter has the benefit of being contiguous in memory, and reading/writing contiguous data is generally faster. Furthermore, dist[i][j] causes an inefficient indirection in memory, while dist[i*ncols+j] can result in fewer/faster instructions. In fact, this is even more critical in parallel code, since malloc tends not to scale with multiple cores, so you should really avoid calling it many times in parallel.
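For example, here is a hedged sketch of dist_pairs rewritten with a single flat allocation (same coordinate layout as the original code, indexing with i*ncols + j):

#include <stdlib.h>
#include <math.h>

/* Flat-array variant of dist_pairs: one contiguous allocation instead of
   nrows+1 separate mallocs, indexed as dist[i*ncols + j]. */
double* dist_pairs_flat(const double* z, int nrows, int ncols, int dims)
{
    double* dist = malloc(sizeof(double) * (size_t)nrows * (size_t)ncols);

    for (int i = 0; i < nrows; i++){
        for (int j = 0; j < ncols; j++){
            double acc = 0.0;

            for (int d = 0; d < dims; d++){
                double coord1 = z[d*(nrows+ncols) + i];
                double coord2 = z[d*(nrows+ncols) + nrows + j];
                acc += (coord1 - coord2) * (coord1 - coord2);
            }

            dist[i*ncols + j] = sqrt(acc);
        }
    }

    return dist;
}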

for (d=0; d<dims; d++) is not efficient if dims is very small (like <8). Unrolling the loop for specific values can significantly improve performance. Alternatively, this loop can be swapped with the j-based one so as to generate faster code (possibly vectorized). It might be even faster to put it first.
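As an illustration, here is a scalar sketch of the dims == 2 special case with the d-loop unrolled (the same idea as the AVX version further below, assuming the coordinate layout of the original code):

#include <math.h>

/* Unrolled d-loop for dims == 2: the inner j-loop then reads two contiguous
   slices of z, which is easier for the compiler to vectorize. */
void dist_pairs_dim2(const double* z, double* dist, int nrows, int ncols)
{
    for (int i = 0; i < nrows; i++){
        const double c1_d0 = z[i];                  /* row coordinate, dim 0 */
        const double c1_d1 = z[(nrows+ncols) + i];  /* row coordinate, dim 1 */

        for (int j = 0; j < ncols; j++){
            const double d0 = c1_d0 - z[nrows + j];
            const double d1 = c1_d1 - z[(nrows+ncols) + nrows + j];
            dist[i*ncols + j] = sqrt(d0*d0 + d1*d1);
        }
    }
}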

is there some parallelised computation going for numpy and scipy in the background?

Currently, Numpy should not parallelize any operation except those based on BLAS (e.g. matrix multiplications). The same goes for Scipy, except that some methods can use multiple workers (this is quite rare, tunable, and specified in the documentation).

Your code may be memory-bound (more specifically, due to inefficient cache accesses), so the way data is accessed in memory matters a lot. Numpy can make more efficient accesses than your code.

Note that if the code is bound by the memory bandwidth, then it will not scale well (since using more cores does not make memory accesses faster when the memory is already saturated).

Further optimizations (update)

The Cython code spends a lot of time copying arrays for no reason, since they are contiguous here. You can just pass them to the C function, since the arrays are not modified. Copying data is generally slow, especially for large arrays.

It turns out that the compiler (GCC) simply fails to vectorize dist_pairs in practice, even when -O3 is provided (while Numpy does vectorize the code). I tried to modify the code to make the vectorization trivial, but the generated code is inefficient even in this case. One solution is to write the vectorized code manually, even though it makes the code non-portable (it only runs on x86-64 processors supporting AVX, that is, nearly all desktop PCs less than 10 years old). Here is an example of (unchecked) code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <x86intrin.h>

static inline double decay(double x){
    return 1/(1+exp(-10*x));
}

double* dist_pairs(double* restrict z, int nrows, int ncols, int dims){
    double* dist = malloc(sizeof(double) * nrows * ncols);
    
    if (dims == 2){
        // Fast 2D case
        for (long i=0; i<nrows; i++){
            int j;
            int z_offset_d0 = nrows;
            int z_offset_d1 = (nrows+ncols) + nrows;
            __m256d v_coord1_d0 = _mm256_set1_pd(z[i]);
            __m256d v_coord1_d1 = _mm256_set1_pd(z[(nrows+ncols) + i]);

            // Vectorized code for x86-64 processors
            for (j=0; j<ncols-3; j+=4){
                __m256d v_coord2_d0 = _mm256_loadu_pd(&z[z_offset_d0 + j]);
                __m256d v_coord2_d1 = _mm256_loadu_pd(&z[z_offset_d1 + j]);
                __m256d v_tmp0 = _mm256_sub_pd(v_coord1_d0, v_coord2_d0);
                __m256d v_tmp1 = _mm256_sub_pd(v_coord1_d1, v_coord2_d1);
                v_tmp0 = _mm256_mul_pd(v_tmp0, v_tmp0);
                v_tmp1 = _mm256_mul_pd(v_tmp1, v_tmp1);
                __m256d v_res = _mm256_sqrt_pd(_mm256_add_pd(v_tmp0, v_tmp1));
                _mm256_storeu_pd(&dist[i*ncols + j], v_res);
            }

            // Remainder
            for (; j<ncols; j++){
                double coord1_d0 = z[i];
                double coord1_d1 = z[(nrows+ncols) + i];
                double coord2_d0 = z[nrows + j];
                double coord2_d1 = z[(nrows+ncols) + nrows + j];
                double tmp = (coord1_d0-coord2_d0) * (coord1_d0-coord2_d0);
                tmp += (coord1_d1-coord2_d1) * (coord1_d1-coord2_d1);
                dist[i*ncols+j] = sqrt(tmp);
            }
        }
    }
    else{
        // General slow implementation
        for (int i=0; i<nrows; i++){
            for (int j=0; j<ncols; j++){
                dist[i*ncols+j] = 0;
                
                for (int d=0; d<dims; d++){
                    double coord1 = z[d*(nrows+ncols) + i];
                    double coord2 = z[d*(nrows+ncols) + nrows + j];
                    dist[i*ncols+j] += (coord1-coord2) * (coord1-coord2);
                }

                dist[i*ncols+j] = sqrt(dist[i*ncols+j]);
            }
        }
    }

    return dist;
}

double stress(double* z, double* target_distances, int* distance_types, double step, int nrows, int ncols, int dim){
    double stress = 0.0;
    double err = 0.0;
    double* map_distances = dist_pairs(z, nrows, ncols, dim);
    double step_decay = decay(step);
    
    for (int i=0; i<nrows; i++){
        for (int j=0; j<ncols; j++){
            if (distance_types[i*ncols+j]==0 || distance_types[i*ncols+j]==3){
                double tmp = target_distances[i*ncols+j] - map_distances[i*ncols+j];
                stress += tmp * tmp;
            }
            else if (distance_types[i*ncols+j]==2){
                err = target_distances[i*ncols+j] - map_distances[i*ncols+j] + step;
                stress += step * step * step_decay;
            }
        }
    }

    (void)err; // Avoid a compiler warning
    free(map_distances);
    return stress;
}
And the corresponding calculus.pyx, which now passes the contiguous arrays directly to the C function:

import cython
cimport cython
from libc.stdlib cimport malloc, free

cdef extern from "ctools.h":
    double stress (double*, double*, int*, double, int, int, int)

# Assume Numpy arrays are contiguous
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def stress_cython(double[::1] z, double[:,::1] target_distances, int [:,::1]
           distance_types, double step, int dim, int nrows, int ncols):
    cdef int i
    cdef int j
    cdef int N = (nrows+ncols)*dim

    stress_val = stress(&z[0], &target_distances[0,0], &distance_types[0,0], step, nrows, ncols, dim)

    return stress_val

Here is the resulting time (compiled with -O3 -mavx -mfma):

cython:1.29 python:4.45

Note that dist_pairs spends 60% of its time computing the square root. The rest is basically loads/stores from/to memory. The only ways to speed up the square root computation are to 1. use an approximation or less precise numbers (i.e. float), or 2. use multiple threads.
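For the reduced-precision option, here is a hedged sketch of a single-precision variant (the inputs stay double; only the distance arithmetic and the square root are done in float, which trades accuracy for speed):

#include <math.h>

/* Single-precision distances: sqrtf is cheaper than sqrt, at the cost of
   roughly 7 significant digits instead of 15. */
void dist_pairs_dim2_f32(const double* z, float* dist, int nrows, int ncols)
{
    for (int i = 0; i < nrows; i++){
        const float c1_d0 = (float)z[i];
        const float c1_d1 = (float)z[(nrows+ncols) + i];

        for (int j = 0; j < ncols; j++){
            const float d0 = c1_d0 - (float)z[nrows + j];
            const float d1 = c1_d1 - (float)z[(nrows+ncols) + nrows + j];
            dist[i*ncols + j] = sqrtf(d0*d0 + d1*d1);
        }
    }
}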

You can make the overall computation less memory-bound by merging the loops of dist_pairs and stress and operating line by line (in an interleaved way). The computed lines can then be stored in the fast CPU cache instead of the slower main RAM. The resulting code should be compute-bound and scale very well.
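A minimal sketch of that fusion for the dims == 2 case (scalar, for clarity; each distance is consumed as soon as it is computed, so the full map_distances matrix is never written to memory):

#include <math.h>

static double decay(double x){
    return 1.0 / (1.0 + exp(-10.0 * x));
}

/* Fused version of dist_pairs + stress: no nrows*ncols temporary matrix. */
double stress_fused_dim2(const double* z, const double* target_distances,
                         const int* distance_types, double step,
                         int nrows, int ncols)
{
    const double step_decay = decay(step);
    double stress = 0.0;

    for (int i = 0; i < nrows; i++){
        const double c1_d0 = z[i];
        const double c1_d1 = z[(nrows+ncols) + i];

        for (int j = 0; j < ncols; j++){
            const double d0 = c1_d0 - z[nrows + j];
            const double d1 = c1_d1 - z[(nrows+ncols) + nrows + j];
            const double map_dist = sqrt(d0*d0 + d1*d1);
            const int t = distance_types[i*ncols + j];

            if (t == 0 || t == 3){
                const double e = target_distances[i*ncols + j] - map_dist;
                stress += e * e;
            }
            else if (t == 2){
                /* Mirrors the original code, which uses step (not err) here. */
                stress += step * step * step_decay;
            }
        }
    }

    return stress;
}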

If this is not enough, you can try to run this on server-side GPUs capable of computing double-precision operations efficiently, since the square root should be computed faster there and the device memory is also generally faster.

Upvotes: 3
