Siamak

Reputation: 51

My kernel's FLOP efficiency is very low

I implemented a kernel that computes pressure gradients. Because of the algorithm and the earlier stages of the pipeline, ngb_list[] is random, so I do not have coalesced memory access. Still, the double-precision FLOP efficiency of the kernel is only 0.2% of peak on a Tesla K40, which seems very low...!

Also:

Global Memory Load Efficiency: 45.05%
Global Memory Store Efficiency: 100.00%

Is there a way to improve the DP FLOP efficiency and Global Memory Load Efficiency?

Here you can find the code:

#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/sequence.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/execution_policy.h>
#include <iostream>
#include <time.h>
#include <cmath>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>

typedef double Float;

__global__ void pGrad_calculator(Float* pressure, Float* pressure_list,
                                 Float* interactionVectors_x, Float* interactionVectors_y, Float* interactionVectors_z,
                                 int* ngb_offset, int* ngb_size, int* ngb_list,
                                 Float* pressureGrad_x, Float* pressureGrad_y, Float* pressureGrad_z,
                                 int num){

    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < num){

        for (int j = ngb_offset[idx]; j < (ngb_offset[idx] + ngb_size[idx]); j++){
            Float pij = (pressure[idx] + pressure[ngb_list[j]]);
            pressureGrad_x[idx] += interactionVectors_x[j] * pij; 
            pressureGrad_y[idx] += interactionVectors_y[j] * pij;
            pressureGrad_z[idx] += interactionVectors_z[j] * pij;
        }
        pressureGrad_x[idx] *= 0.5;
        pressureGrad_y[idx] *= 0.5;
        pressureGrad_z[idx] *= 0.5;
    }
}

int main(){

    const int num = 1 << 20;
    const int tb = 1024;
    int bg = (num + tb - 1) / tb;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    //ngb_size
    thrust::device_vector<int> ngb_size(num,27);

    //ngb_offset
    thrust::device_vector<int> ngb_offset(num);
    thrust::exclusive_scan(ngb_size.begin(),ngb_size.end(), ngb_offset.begin());

    //ngh list
    int ngbSize = thrust::reduce(ngb_size.begin(),ngb_size.end());

    std::cout << "ngbSize" << ngbSize << std::endl;

    thrust::device_vector<int> ngb_list(ngbSize);

    srand((unsigned)time(NULL));
    for (int i = 0; i < ngbSize; i++){
        // random neighbor index in [0, num); note that each element
        // assignment to a device_vector is a separate host-to-device copy
        ngb_list[i] = rand() % num;
    }

    //pressure
    thrust::device_vector<Float> d_pressure(num);
    thrust::sequence(d_pressure.begin(),d_pressure.end(),1);

    //interaction vectors
    thrust::device_vector<Float> d_xInteractionVectors(ngbSize,1);
    thrust::device_vector<Float> d_yInteractionVectors(ngbSize,0);
    thrust::device_vector<Float> d_zInteractionVectors(ngbSize,0);

    //pressure gradients
    thrust::device_vector<Float> pGradx(num);
    thrust::device_vector<Float> pGrady(num);
    thrust::device_vector<Float> pGradz(num);

    //Pressure list
    thrust::device_vector<Float> pressure_list(ngbSize,0);

    cudaEventRecord(start);
    pGrad_calculator<<<bg,tb>>>(thrust::raw_pointer_cast(&d_pressure[0]),
                                thrust::raw_pointer_cast(&pressure_list[0]),
                                thrust::raw_pointer_cast(&d_xInteractionVectors[0]),
                                thrust::raw_pointer_cast(&d_yInteractionVectors[0]),
                                thrust::raw_pointer_cast(&d_zInteractionVectors[0]),
                                thrust::raw_pointer_cast(&ngb_offset[0]),
                                thrust::raw_pointer_cast(&ngb_size[0]),
                                thrust::raw_pointer_cast(&ngb_list[0]),
                                thrust::raw_pointer_cast(&pGradx[0]),
                                thrust::raw_pointer_cast(&pGrady[0]),
                                thrust::raw_pointer_cast(&pGradz[0]),
                                num);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    std::cout << "KERNEL TIME =  " << milliseconds << "  milliseconds" << std::endl;

    return 0;
}

Upvotes: 0

Views: 375

Answers (1)

Jan Lucas

Reputation: 691

Likely the compiler cannot optimize your code properly, so many extra loads and stores are generated: because it has to assume the output pointers may alias the inputs, every += in your loop re-reads and re-writes pressureGrad_*[idx] through global memory instead of keeping the sums in registers.

Try writing the code like this:

__global__ void pGrad_calculator(Float* pressure, Float* pressure_list,
                                 Float* interactionVectors_x, 
                                 Float* interactionVectors_y, 
                                 Float* interactionVectors_z,
                                 int* ngb_offset, int* ngb_size, 
                                 int* ngb_list, Float* pressureGrad_x, 
                                 Float* pressureGrad_y, Float* pressureGrad_z,
                                 int num)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num){
        Float x=pressureGrad_x[idx];
        Float y=pressureGrad_y[idx];
        Float z=pressureGrad_z[idx];
        Float pressure_local=pressure[idx];
        int offset=ngb_offset[idx];
        int end=offset+ngb_size[idx];
        for (int j = offset; j < end; j++){
            Float pij = (pressure_local + pressure[ngb_list[j]]);
            x += interactionVectors_x[j] * pij; 
            y += interactionVectors_y[j] * pij;
            z += interactionVectors_z[j] * pij;
        }
        pressureGrad_x[idx] = 0.5*x;
        pressureGrad_y[idx] = 0.5*y;
        pressureGrad_z[idx] = 0.5*z;
    }
}

But even with this optimization you will most likely get nowhere near peak FLOPs, because you will be limited by memory bandwidth. You are doing fewer than two floating-point operations per 8-byte double loaded. That puts an upper bound on your throughput of 288 GB/s * 2 FLOP / 8 bytes = 72 GFLOP/s, or about 5% of the K40's double-precision peak.
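
One further tweak worth trying (a sketch under the assumption that nothing else aliases these arrays; I have not profiled it on your code): qualify the kernel's pointer parameters with const and __restrict__. That guarantees to the compiler that the gradient arrays cannot alias the inputs, and on Kepler GPUs such as the K40 the const __restrict__ loads become eligible for the read-only (texture) cache, which can soften the penalty of the scattered pressure[ngb_list[j]] gather:

__global__ void pGrad_calculator(const Float* __restrict__ pressure,
                                 const Float* __restrict__ pressure_list, // unused, kept so the call site is unchanged
                                 const Float* __restrict__ interactionVectors_x,
                                 const Float* __restrict__ interactionVectors_y,
                                 const Float* __restrict__ interactionVectors_z,
                                 const int* __restrict__ ngb_offset,
                                 const int* __restrict__ ngb_size,
                                 const int* __restrict__ ngb_list,
                                 Float* __restrict__ pressureGrad_x,
                                 Float* __restrict__ pressureGrad_y,
                                 Float* __restrict__ pressureGrad_z,
                                 int num)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num){
        // accumulate in registers, as before
        Float x = pressureGrad_x[idx];
        Float y = pressureGrad_y[idx];
        Float z = pressureGrad_z[idx];
        Float pressure_local = pressure[idx];
        int offset = ngb_offset[idx];
        int end = offset + ngb_size[idx];
        for (int j = offset; j < end; j++){
            // const __restrict__ loads can be served by the read-only cache
            Float pij = pressure_local + pressure[ngb_list[j]];
            x += interactionVectors_x[j] * pij;
            y += interactionVectors_y[j] * pij;
            z += interactionVectors_z[j] * pij;
        }
        pressureGrad_x[idx] = 0.5 * x;
        pressureGrad_y[idx] = 0.5 * y;
        pressureGrad_z[idx] = 0.5 * z;
    }
}

Since the parameter list is unchanged, you can launch it exactly as before. It will not change the bandwidth bound above, only how close you get to it.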

Upvotes: 1
