Reputation: 51
I implemented a kernel for computing pressure gradients. Because of the algorithm used and the earlier parts of the code, ngb_list[]
is random, so I do not have coalesced memory access. Even so, the double-precision FLOP efficiency of the kernel is only 0.2% of peak performance on a Tesla K40. That seems very low...!
Also:
Global Memory Load Efficiency: 45.05%
Global Memory Store Efficiency: 100.00%
Is there a way to improve the DP FLOP efficiency and Global Memory Load Efficiency?
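(These numbers come from the profiler; in nvprof terms they correspond to the flop_dp_efficiency, gld_efficiency, and gst_efficiency metrics, collected with something like nvprof --metrics flop_dp_efficiency,gld_efficiency,gst_efficiency ./app, where ./app stands in for the compiled binary.)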
Here you can find the code:
#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/sequence.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/execution_policy.h>
#include <iostream>
#include <time.h>
#include <cmath>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>
typedef double Float;
__global__ void pGrad_calculator(Float* pressure, Float* pressure_list,
                                 Float* interactionVectors_x, Float* interactionVectors_y, Float* interactionVectors_z,
                                 int* ngb_offset, int* ngb_size, int* ngb_list,
                                 Float* pressureGrad_x, Float* pressureGrad_y, Float* pressureGrad_z,
                                 int num){
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num){
        // Accumulate the symmetrized pressure over this particle's neighbor list.
        for (int j = ngb_offset[idx]; j < (ngb_offset[idx] + ngb_size[idx]); j++){
            Float pij = (pressure[idx] + pressure[ngb_list[j]]);
            pressureGrad_x[idx] += interactionVectors_x[j] * pij;
            pressureGrad_y[idx] += interactionVectors_y[j] * pij;
            pressureGrad_z[idx] += interactionVectors_z[j] * pij;
        }
        pressureGrad_x[idx] *= 0.5;
        pressureGrad_y[idx] *= 0.5;
        pressureGrad_z[idx] *= 0.5;
    }
}
int main(){
    const int num = 1 << 20;
    const int tb = 1024;
    int bg = (num + tb - 1) / tb;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    //ngb_size
    thrust::device_vector<int> ngb_size(num, 27);
    //ngb_offset
    thrust::device_vector<int> ngb_offset(num);
    thrust::exclusive_scan(ngb_size.begin(), ngb_size.end(), ngb_offset.begin());
    //ngb list
    int ngbSize = thrust::reduce(ngb_size.begin(), ngb_size.end());
    std::cout << "ngbSize = " << ngbSize << std::endl;
    thrust::device_vector<int> ngb_list(ngbSize);
    srand((unsigned)time(NULL));
    // Fill the whole neighbor list (all ngbSize entries, not just the first num)
    // with random indices; build it on the host first so we avoid one device
    // transfer per element.
    thrust::host_vector<int> h_ngb_list(ngbSize);
    for (int i = 0; i < ngbSize; i++){
        h_ngb_list[i] = rand() % num;
    }
    ngb_list = h_ngb_list;
    //pressure
    thrust::device_vector<Float> d_pressure(num);
    thrust::sequence(d_pressure.begin(), d_pressure.end(), 1);
    //interaction vectors
    thrust::device_vector<Float> d_xInteractionVectors(ngbSize, 1);
    thrust::device_vector<Float> d_yInteractionVectors(ngbSize, 0);
    thrust::device_vector<Float> d_zInteractionVectors(ngbSize, 0);
    //pressure gradients
    thrust::device_vector<Float> pGradx(num);
    thrust::device_vector<Float> pGrady(num);
    thrust::device_vector<Float> pGradz(num);
    //Pressure list
    thrust::device_vector<Float> pressure_list(ngbSize, 0);
    cudaEventRecord(start);
    pGrad_calculator<<<bg, tb>>>(thrust::raw_pointer_cast(&d_pressure[0]),
                                 thrust::raw_pointer_cast(&pressure_list[0]),
                                 thrust::raw_pointer_cast(&d_xInteractionVectors[0]),
                                 thrust::raw_pointer_cast(&d_yInteractionVectors[0]),
                                 thrust::raw_pointer_cast(&d_zInteractionVectors[0]),
                                 thrust::raw_pointer_cast(&ngb_offset[0]),
                                 thrust::raw_pointer_cast(&ngb_size[0]),
                                 thrust::raw_pointer_cast(&ngb_list[0]),
                                 thrust::raw_pointer_cast(&pGradx[0]),
                                 thrust::raw_pointer_cast(&pGrady[0]),
                                 thrust::raw_pointer_cast(&pGradz[0]),
                                 num);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "KERNEL TIME = " << milliseconds << " milliseconds" << std::endl;
    return 0;
}
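Compiled with something like nvcc -O3 -arch=sm_35 pgrad.cu -o app (sm_35 being the K40's compute capability; pgrad.cu and app are placeholder names).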
Upvotes: 0
Views: 375
Reputation: 691
Most likely the compiler cannot optimize your code properly, so many extra loads and stores are generated: because the output pointers pressureGrad_x/y/z could alias the input arrays, the running sums have to be re-read from and re-written to global memory on every loop iteration instead of being kept in registers.
Try writing the code like this, caching everything in local variables:
__global__ void pGrad_calculator(Float* pressure, Float* pressure_list,
                                 Float* interactionVectors_x,
                                 Float* interactionVectors_y,
                                 Float* interactionVectors_z,
                                 int* ngb_offset, int* ngb_size,
                                 int* ngb_list, Float* pressureGrad_x,
                                 Float* pressureGrad_y, Float* pressureGrad_z,
                                 int num)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num){
        // Keep the running sums and loop bounds in registers so the
        // compiler never has to touch global memory inside the loop.
        Float x = pressureGrad_x[idx];
        Float y = pressureGrad_y[idx];
        Float z = pressureGrad_z[idx];
        Float pressure_local = pressure[idx];
        int offset = ngb_offset[idx];
        int end = offset + ngb_size[idx];
        for (int j = offset; j < end; j++){
            Float pij = (pressure_local + pressure[ngb_list[j]]);
            x += interactionVectors_x[j] * pij;
            y += interactionVectors_y[j] * pij;
            z += interactionVectors_z[j] * pij;
        }
        // Write each result back exactly once.
        pressureGrad_x[idx] = 0.5 * x;
        pressureGrad_y[idx] = 0.5 * y;
        pressureGrad_z[idx] = 0.5 * z;
    }
}
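A related option (a sketch of an alternative, not a drop-in tested replacement) is to give the compiler the aliasing information directly with const and __restrict__ qualifiers. That permits the same register caching, and on the K40 (sm_35) it also lets nvcc route the read-only loads through the texture cache via __ldg:
// Sketch: same kernel, but with the aliasing guarantees in the signature.
__global__ void pGrad_calculator(const Float* __restrict__ pressure,
                                 const Float* __restrict__ pressure_list,
                                 const Float* __restrict__ interactionVectors_x,
                                 const Float* __restrict__ interactionVectors_y,
                                 const Float* __restrict__ interactionVectors_z,
                                 const int*   __restrict__ ngb_offset,
                                 const int*   __restrict__ ngb_size,
                                 const int*   __restrict__ ngb_list,
                                 Float* __restrict__ pressureGrad_x,
                                 Float* __restrict__ pressureGrad_y,
                                 Float* __restrict__ pressureGrad_z,
                                 int num)
{
    // ...body as above...
}
With that, the loop body can even stay as in your original version, since the compiler is now allowed to do the register caching itself.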
But even with this optimization, you will most likely get nowhere near peak FLOPs, because the kernel is limited by memory bandwidth: it performs fewer than two floating-point operations per 8-byte double loaded. That gives an upper bound of 288 GB/s * 2 FLOPs / 8 bytes = 72 GFLOP/s, or about 5% of the K40's ~1.43 TFLOP/s double-precision peak.
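Spelled out as a quick host-side sanity check (the 288 GB/s bandwidth and ~1.43 TFLOP/s DP peak are the K40's published specs; the 2-ops-per-double arithmetic intensity is an estimate from the kernel above):
// Back-of-the-envelope roofline bound for this kernel on a K40.
#include <iostream>
int main(){
    const double bw_GBps   = 288.0;     // K40 peak memory bandwidth (GB/s)
    const double peak_GF   = 1430.0;    // K40 peak DP throughput (GFLOP/s)
    const double intensity = 2.0 / 8.0; // <2 DP ops per 8-byte double loaded
    double bound = bw_GBps * intensity; // = 72 GFLOP/s
    std::cout << bound << " GFLOP/s = "
              << 100.0 * bound / peak_GF << "% of peak" << std::endl;
    return 0;
}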
Upvotes: 1