Reputation: 172
I'm trying to calculate the effective bandwidth of this kernel:
// Kernel under test: for each int4 of input, record whether each of its four
// lanes occurs anywhere in arr.
// One thread handles one int4; d_flag[gi].{x,y,z,w} becomes 1 if
// dat_arr[gi].{x,y,z,w} is found in arr, else 0.
__global__ void compute_flags(int nquarts, int4 *d_flag, int4 * dat_arr, int * arr, int array_size){
// flat 1D global thread index
int gi = blockIdx.x * blockDim.x + threadIdx.x;
// bounds guard: the grid is rounded up, so trailing threads must exit
if (gi < nquarts) {
int4 res = {0, 0, 0, 0};
// one coalesced 16-byte load per thread
int4 dat = dat_arr[gi];
for (int i = 0; i < array_size; i++){
// warp-uniform read: every thread in the warp loads the same arr[i],
// so this is served as a cached broadcast, not 32 separate DRAM reads
int p = arr[i];
res.x = res.x | (p == dat.x);
res.y = res.y | (p == dat.y);
res.z = res.z | (p == dat.z);
res.w = res.w | (p == dat.w);
}
// one coalesced 16-byte store per thread
d_flag[gi] = res;
}
}
But I am getting unrealistic numbers: 1.5TB/s for a theoretical bandwidth of 448 GB/s.
Currently I am calculating the number of accesses as:
2 * n_kernel_calls * sizeof(int4) + n_kernel_calls * array_size * sizeof(int)
The first term accounts for the accesses that read `dat` and write `d_flag`; the second term counts the reads of the second array inside the for loop.
The accesses to the `res` and `dat` variables are not counted because they are held in registers.
The complete code I am using is the following:
#include <cstdint>
#include <iostream>
#include <stdlib.h>
#include <vector>
#define arr_size 1000000
// Integer ceiling division: number of groups of size b needed to cover a
// elements (assumes a >= 0 and b > 0; same formula as the classic
// (a + b - 1) / b idiom).
int round_div_up (int a, int b){
int biased = a + b - 1;  // bias so truncating division rounds up
return biased / b;
}
// If a CUDA API call failed, print a diagnostic and abort the program.
// file/line identify the call site (callers pass __FILE__, __LINE__).
void cuda_err_check (cudaError_t err, const char *file, int line)
{
if (err == cudaSuccess)
return;  // fast path: nothing to report
fprintf (stderr, "CUDA error: %s (%s:%d)\n", cudaGetErrorString (err), file, line);
exit (EXIT_FAILURE);
}
// For each int4 of input, set a flag word marking which of its four lanes
// occur anywhere in arr.
//
// Launch: 1D grid, one thread per int4 of dat_arr; threads with gi >= nquarts
// exit immediately. d_flag[gi].{x,y,z,w} is set to 1 if dat_arr[gi].{x,y,z,w}
// appears in arr[0..array_size), else 0.
//
// const/__restrict__ on the read-only inputs lets the compiler route the
// warp-uniform arr[i] loads through the read-only data cache; this is
// backward-compatible for existing callers (int* converts implicitly).
//
// NOTE on bandwidth accounting: every thread in a warp reads the same arr[i]
// each iteration, so those loads are broadcast from cache. Counting them as
// per-thread DRAM traffic makes the "effective bandwidth" figure exceed the
// card's theoretical DRAM bandwidth.
__global__ void compute_flags(int nquarts, int4 * __restrict__ d_flag,
                              const int4 * __restrict__ dat_arr,
                              const int * __restrict__ arr, int array_size){
    int gi = blockIdx.x * blockDim.x + threadIdx.x;
    if (gi < nquarts) {
        int4 res = {0, 0, 0, 0};
        int4 dat = dat_arr[gi];            // one coalesced 16-byte load
        for (int i = 0; i < array_size; i++){
            int p = arr[i];                // warp-uniform, cache-broadcast
            res.x = res.x | (p == dat.x);  // (p == dat.x) is 0 or 1
            res.y = res.y | (p == dat.y);
            res.z = res.z | (p == dat.z);
            res.w = res.w | (p == dat.w);
        }
        d_flag[gi] = res;                  // one coalesced 16-byte store
    }
}
using namespace std;
// Host driver: fill two arrays with random values, run compute_flags once,
// and report the kernel time and an "effective bandwidth" figure.
//
// NOTE(review): byte_accesses counts bytes *requested* by threads, not DRAM
// transactions. The arr[i] reads are warp-uniform and served from cache, so
// this figure can legitimately exceed the device's theoretical DRAM
// bandwidth -- which is why 1.5 TB/s shows up on a 448 GB/s card.
int main(void){
    // Heap-allocate the host arrays: two 4 MB stack locals would overflow
    // a typical 1-8 MB thread stack.
    vector<int> V1(arr_size);
    vector<int> V2(arr_size);
    // fill with random numbers (unseeded rand() -> same sequence every run)
    for(int i = 0; i < arr_size; i++){
        V1[i] = rand() % 100;
        V2[i] = rand() % 100;
    }
    int4 *d_flag;
    int * d_V1;
    int * d_V2;
    cudaError_t err;
    cudaEvent_t start, stop;
    err = cudaEventCreate(&start); cuda_err_check(err, __FILE__, __LINE__);
    err = cudaEventCreate(&stop); cuda_err_check(err, __FILE__, __LINE__);
    // d_flag holds nquarts int4 values = arr_size ints' worth of bytes.
    err = cudaMalloc((void **)&d_flag, arr_size * sizeof(int)); cuda_err_check(err, __FILE__, __LINE__);
    err = cudaMalloc((void **)&d_V1, arr_size * sizeof(int)); cuda_err_check(err, __FILE__, __LINE__);
    err = cudaMalloc((void **)&d_V2, arr_size * sizeof(int)); cuda_err_check(err, __FILE__, __LINE__);
    err = cudaMemcpy(d_V1, V1.data(), arr_size * sizeof(int), cudaMemcpyHostToDevice); cuda_err_check(err, __FILE__, __LINE__);
    err = cudaMemcpy(d_V2, V2.data(), arr_size * sizeof(int), cudaMemcpyHostToDevice); cuda_err_check(err, __FILE__, __LINE__);
    uint64_t nquarts = round_div_up(arr_size, 4);  // one thread per int4
    uint64_t lws = 256;                            // threads per block
    uint64_t gws = round_div_up(nquarts, lws);     // blocks in the grid
    err = cudaEventRecord(start); cuda_err_check(err, __FILE__, __LINE__);
    compute_flags<<<gws, lws>>>(nquarts, d_flag, (int4*)d_V1, d_V2, arr_size);
    // Check launch-configuration errors immediately after the launch, before
    // any other API call can clear or mask them.
    err = cudaGetLastError(); cuda_err_check(err, __FILE__, __LINE__);
    err = cudaEventRecord(stop); cuda_err_check(err, __FILE__, __LINE__);
    err = cudaEventSynchronize(stop); cuda_err_check(err, __FILE__, __LINE__);
    err = cudaDeviceSynchronize(); cuda_err_check(err, __FILE__, __LINE__);
    // Requested bytes: one int4 read + one int4 write per thread, plus
    // arr_size int reads per thread inside the loop (cache-served, see NOTE).
    uint64_t byte_accesses = 2 * nquarts * sizeof(int4) + nquarts * arr_size * sizeof(int);
    float time;
    err = cudaEventElapsedTime(&time, start, stop); cuda_err_check(err, __FILE__, __LINE__);  // ms
    cout << "Time: " << time << " ms" << endl;
    // bytes / ms / 1e6 == GB/s (decimal GB)
    cout << "Bandwidth: " << byte_accesses / time / 1e6 << " GB/s" << endl;
    // Release device resources (the original leaked until process exit).
    err = cudaFree(d_flag); cuda_err_check(err, __FILE__, __LINE__);
    err = cudaFree(d_V1); cuda_err_check(err, __FILE__, __LINE__);
    err = cudaFree(d_V2); cuda_err_check(err, __FILE__, __LINE__);
    err = cudaEventDestroy(start); cuda_err_check(err, __FILE__, __LINE__);
    err = cudaEventDestroy(stop); cuda_err_check(err, __FILE__, __LINE__);
    return 0;
}
Is someone able to point me to the correct calculation and/or where my logic fails?
Upvotes: 2
Views: 77