Cuda: XOR single bitset with array of bitsets

Question

I want to XOR a single bitset with a bunch of other bitsets (~100k) and count the set bits of every xor-result. The size of a single bitset is around 20k bits.

The bitsets are already converted to arrays of unsigned int to be able to use the intrinsic __popc()-function. The 'bunch' is already residing contiguously in device-memory.

My current kernel code looks like this:

// Launch configuration for kernelXOR: 32 threads per block, and a block count
// of bunch_size/31 + 32 (the bounds check inside the kernel idles any thread
// whose index is >= bunch_size).
dim3 block = dim3(32);
dim3 grid = dim3(32 + (bunch_size / 31));

// XORs `bitset` against every bitset stored back to back in `bunch` and writes
// the number of set bits of each XOR result to set_bits.
//   bitset      : reference bitset, bitset_size words
//   bunch       : bunch_size bitsets of bitset_size words each, contiguous
//   set_bits    : output, one count per bitset in the bunch
// One thread handles one complete bitset of the bunch.
__global__ void kernelXOR(uint * bitset, uint * bunch, int * set_bits, int bitset_size, int bunch_size) {

    // Flat global thread index selects which bitset of the bunch this thread owns.
    const int set_idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (set_idx >= bunch_size) return;   // surplus threads have nothing to do

    const int first_word = bitset_size * set_idx;   // offset of this thread's bitset in bunch
    int differing_bits = 0;
    int word = 0;
    while (word < bitset_size){
        // popcount of the XOR = number of bit positions where the two words differ
        differing_bits += __popc(bitset[word] ^ bunch[first_word + word]);
        ++word;
    }
    set_bits[set_idx] = differing_bits;
}

However, compared to a parallelized c++/boost version, I see no benefit using Cuda.

Is there any potential in optimizing this kernel?

Robert Crovella · Accepted Answer

Is there any potential in optimizing this kernel?

I see 2 problems here (and they correspond to the first two classical primary optimization objectives for any CUDA programmer):

You want to try to efficiently use global memory. Your accesses to bitset and bunch are not coalesced. (efficiently use the memory subsystems)
The use of 32 threads per block is generally not recommended and could limit your overall occupancy. One thread per bitset is also potentially problematic. (expose enough parallelism)

Whether addressing those issues will meet your definition of benefit is impossible to say without a comparison test case. Furthermore, simple memory-bound problems like this are rarely interesting in CUDA when considered by themselves. However, we can (probably) improve the performance of your kernel.

We'll use a laundry list of ideas:

have each block handle a bitset, rather than each thread, to enable coalescing
use shared memory to load the comparison bitset, and reuse it
use just enough blocks to saturate the GPU, along with striding loops
use const ... __restrict__ style decoration to possibly benefit from RO cache

Here's a worked example:

$ cat t1649.cu
#include <cstdlib>
#include <iostream>
#include <vector>

// 32-bit word type used to store the bitsets (the width __popc() operates on).
typedef unsigned int uint;
// Words per bitset: 20000 bits packed into 32-bit words.
const int my_bitset_size = 20000 / 32;
// Number of bitsets in the bunch that is compared against the reference bitset.
const int my_bunch_size = 100000;

// Baseline kernel: one thread per bitset in the bunch.
// XORs `bitset` against every bitset stored back to back in `bunch` and writes
// the number of set bits of each XOR result to set_bits[<index of that bitset>].
__global__ void kernelXOR(uint * bitset, uint * bunch, int * set_bits, int bitset_size, int bunch_size) {

    // Flat global thread index selects which bitset of the bunch this thread owns.
    const int set_idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (set_idx >= bunch_size) return;   // surplus threads have nothing to do

    const int first_word = bitset_size * set_idx;   // offset of this thread's bitset in bunch
    int differing_bits = 0;
    int word = 0;
    while (word < bitset_size){
        // popcount of the XOR = number of bit positions where the two words differ
        differing_bits += __popc(bitset[word] ^ bunch[first_word + word]);
        ++word;
    }
    set_bits[set_idx] = differing_bits;
}

const int nTPB = 256;   // threads per block used by the host code for kernelXOR_imp
// one block per bitset, multiple bitsets per block
//
// XORs `bitset` against every bitset in `bunch` and stores the popcount of each
// XOR result in set_bits[<index of that bitset>].
//
//   bitset      : reference bitset, bitset_size 32-bit words
//   bunch       : bunch_size bitsets of bitset_size words each, stored contiguously
//   set_bits    : output, one count per bitset in the bunch
//
// Launch requirements: 1D grid and 1D block; blockDim.x must be a multiple of 32
// (the warp shuffles below use a full-warp mask). Any grid size is valid because
// blocks stride over the bunch. Static shared memory: my_bitset_size words plus
// 32 ints. Uses __shfl_down_sync, i.e. requires CUDA 9 or newer.
__global__ void kernelXOR_imp(const uint * __restrict__  bitset, const uint * __restrict__  bunch, int * __restrict__  set_bits, int bitset_size, int bunch_size) {

    __shared__ uint sbitset[my_bitset_size];  // could also be dynamically allocated for varying bitset sizes
    __shared__ int ssum[32];                  // one partial sum per warp (a block has at most 1024/32 warps)

    // Only the part of the reference bitset that fits the static buffer is cached;
    // any remaining words are read straight from global memory further down, so a
    // bitset_size larger than my_bitset_size no longer overruns sbitset.
    const int cached = bitset_size < my_bitset_size ? bitset_size : my_bitset_size;
    // load shared, block-stride loop
    for (int idx = threadIdx.x; idx < cached; idx += blockDim.x) sbitset[idx] = bitset[idx];
    __syncthreads();

    const unsigned full_mask = 0xffffffffu;
    const int lane   = threadIdx.x & 31;
    const int warp   = threadIdx.x >> 5;
    const int nwarps = (blockDim.x + 31) >> 5;

    // stride across all bitsets in bunch
    for (int bidx = blockIdx.x; bidx < bunch_size; bidx += gridDim.x){
      // 64-bit offset: bidx*bitset_size can exceed the int range for large inputs
      const uint * row = bunch + (size_t)bidx * bitset_size;
      int my_sum = 0;
      // consecutive threads read consecutive words of the row
      for (int idx = threadIdx.x; idx < bitset_size; idx += blockDim.x){
        const uint ref = (idx < cached) ? sbitset[idx] : bitset[idx];
        my_sum += __popc(ref ^ row[idx]);}

      // stage 1: reduce within each warp using register shuffles
      for (int off = 16; off > 0; off >>= 1) my_sum += __shfl_down_sync(full_mask, my_sum, off);
      if (lane == 0) ssum[warp] = my_sum;
      __syncthreads();

      // stage 2: the first warp combines the per-warp partial sums
      if (warp == 0){
        int total = (lane < nwarps) ? ssum[lane] : 0;
        for (int off = 16; off > 0; off >>= 1) total += __shfl_down_sync(full_mask, total, off);
        if (lane == 0) set_bits[bidx] = total;}

      // ssum is rewritten in the next iteration; wait until the first warp has read it
      __syncthreads();}
}



// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what){
  if (status != cudaSuccess){
    std::cerr << "CUDA error during " << what << ": " << cudaGetErrorString(status) << std::endl;
    std::exit(EXIT_FAILURE);
  }
}

// Test driver: fills a reference bitset and a bunch of bitsets with rand() data,
// runs the baseline kernel (kernelXOR) and the improved kernel (kernelXOR_imp)
// on the same input and compares their per-bitset counts.
// Returns 0 when all results match, 1 on the first mismatch.
int main(){

// data setup

  const size_t bitset_words = my_bitset_size;
  const size_t total_words  = bitset_words*my_bunch_size;
  const size_t bitset_bytes = bitset_words*sizeof(uint);
  const size_t bunch_bytes  = total_words*sizeof(uint);
  const size_t result_bytes = my_bunch_size*sizeof(int);

  // std::vector releases the host buffers automatically
  std::vector<uint> h_cbitset(bitset_words), h_bitsets(total_words);
  std::vector<int>  h_r(my_bunch_size), h_ri(my_bunch_size);
  for (size_t i = 0; i < total_words; i++){
    h_bitsets[i] = rand();
    if (i < bitset_words) h_cbitset[i] = rand();}

  uint *d_cbitset = NULL, *d_bitsets = NULL;
  int *d_r = NULL;
  checkCuda(cudaMalloc(&d_cbitset, bitset_bytes), "cudaMalloc(d_cbitset)");
  checkCuda(cudaMalloc(&d_bitsets, bunch_bytes), "cudaMalloc(d_bitsets)");
  checkCuda(cudaMalloc(&d_r, result_bytes), "cudaMalloc(d_r)");
  checkCuda(cudaMemcpy(d_cbitset, h_cbitset.data(), bitset_bytes, cudaMemcpyHostToDevice), "copy of reference bitset to device");
  checkCuda(cudaMemcpy(d_bitsets, h_bitsets.data(), bunch_bytes, cudaMemcpyHostToDevice), "copy of bunch to device");
// original

// Grid/Blocks used for kernel invocation
// (this formula yields more blocks than strictly needed; the bounds check in
//  kernelXOR idles the surplus threads)
  dim3 block(32);
  dim3 grid((my_bunch_size / 31) + 32);

  kernelXOR<<<grid, block>>>(d_cbitset, d_bitsets, d_r, my_bitset_size, my_bunch_size);
  checkCuda(cudaGetLastError(), "kernelXOR launch");
  checkCuda(cudaMemcpy(h_r.data(), d_r, result_bytes, cudaMemcpyDeviceToHost), "copy of baseline results to host");

  // Overwrite the result buffer with -1 (every byte 0xFF) so that results left
  // over from the baseline run cannot make the comparison pass by accident.
  checkCuda(cudaMemset(d_r, 0xFF, result_bytes), "cudaMemset(d_r)");

// improved
  // Grid size = resident blocks per SM * number of SMs on the current device.
  int device = 0, sm_count = 0, blocks_per_sm = 0;
  checkCuda(cudaGetDevice(&device), "cudaGetDevice");
  checkCuda(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device), "query of SM count");
  checkCuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, kernelXOR_imp, nTPB, 0), "occupancy query");
  int grid_blocks = sm_count*blocks_per_sm;
  if (grid_blocks < 1) grid_blocks = 1;
  dim3 iblock(nTPB);
  dim3 igrid(grid_blocks);
  kernelXOR_imp<<<igrid, iblock>>>(d_cbitset, d_bitsets, d_r, my_bitset_size, my_bunch_size);
  checkCuda(cudaGetLastError(), "kernelXOR_imp launch");
  checkCuda(cudaMemcpy(h_ri.data(), d_r, result_bytes, cudaMemcpyDeviceToHost), "copy of improved results to host");

  bool results_match = true;
  for (int i = 0; i < my_bunch_size; i++)
    if (h_r[i] != h_ri[i]) {std::cout << "mismatch at i: " << i << " was: " << h_ri[i] << " should be: " << h_r[i] << std::endl; results_match = false; break;}
  if (results_match) std::cout << "Results match." << std::endl;

  checkCuda(cudaFree(d_r), "cudaFree(d_r)");
  checkCuda(cudaFree(d_bitsets), "cudaFree(d_bitsets)");
  checkCuda(cudaFree(d_cbitset), "cudaFree(d_cbitset)");
  return results_match ? 0 : 1;
}
$ nvcc -o t1649 t1649.cu
$ cuda-memcheck ./t1649
========= CUDA-MEMCHECK
Results match.
========= ERROR SUMMARY: 0 errors
$ nvprof ./t1649
==18868== NVPROF is profiling process 18868, command: ./t1649
Results match.
==18868== Profiling application: ./t1649
==18868== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   97.06%  71.113ms         2  35.557ms  2.3040us  71.111ms  [CUDA memcpy HtoD]
                    2.26%  1.6563ms         1  1.6563ms  1.6563ms  1.6563ms  kernelXOR(unsigned int*, unsigned int*, int*, int, int)
                    0.59%  432.68us         1  432.68us  432.68us  432.68us  kernelXOR_imp(unsigned int const *, unsigned int const *, int*, int, int)
                    0.09%  64.770us         2  32.385us  31.873us  32.897us  [CUDA memcpy DtoH]
      API calls:   78.20%  305.44ms         3  101.81ms  11.373us  304.85ms  cudaMalloc
                   18.99%  74.161ms         4  18.540ms  31.554us  71.403ms  cudaMemcpy
                    1.39%  5.4121ms         4  1.3530ms  675.30us  3.3410ms  cuDeviceTotalMem
                    1.26%  4.9393ms       388  12.730us     303ns  530.95us  cuDeviceGetAttribute
                    0.11%  442.37us         4  110.59us  102.61us  125.59us  cuDeviceGetName
                    0.03%  128.18us         2  64.088us  21.789us  106.39us  cudaLaunchKernel
                    0.01%  35.764us         4  8.9410us  2.9670us  18.982us  cuDeviceGetPCIBusId
                    0.00%  8.3090us         8  1.0380us     540ns  1.3870us  cuDeviceGet
                    0.00%  5.9530us         3  1.9840us     310ns  3.9900us  cuDeviceGetCount
                    0.00%  2.8800us         4     720ns     574ns     960ns  cuDeviceGetUuid
$

In this case, on my Tesla V100, for your problem size, I witness about a 4x improvement in kernel performance. However, the kernel execution time here is tiny compared to the cost of data movement. So it's unlikely that these sorts of optimizations would make a significant difference in your comparison test case, if this is the only thing you are doing on the GPU.

The code above uses striding-loops at the block level and at the grid level, which means it should behave correctly for almost any choice of threadblock size (multiple of 32 please) as well as grid size. That doesn't mean that any/all choices will perform equally. The choice of the threadblock size is to allow the possibility for nearly full occupancy (so don't choose 32). The choice of the grid size is the number of blocks to achieve full occupancy per SM, times the number of SMs. These should be nearly optimal choices, but according to my testing e.g. a larger number of blocks doesn't really reduce performance, and the performance should be roughly constant for nearly any threadblock size (except 32), assuming the number of blocks is calculated accordingly.

Cuda: XOR single bitset with array of bitsets

Answers (1)

Related Questions