RoyAbr121

Reputation: 11

CUDA Histogram Issue

I have an issue with a simple CUDA code to produce a histogram:

#include <math.h>
#include <numeric>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256

__global__ void kernel_histogram(int* dev_histogram, int* dev_values_arr, unsigned int size) {

    __shared__ int temp[BLOCK_SIZE + 1];
    int thread_id, thread_value;

    thread_id = threadIdx.x + blockIdx.x * blockDim.x;

    if (thread_id >= size) {
        return;
    }

    temp[threadIdx.x + 1] = 0;
    __syncthreads();

    thread_value = dev_values_arr[thread_id];
    atomicAdd(&temp[thread_value], 1);
    __syncthreads();

    atomicAdd(&(dev_histogram[threadIdx.x + 1]), temp[threadIdx.x + 1]);
}

int* histogram_cuda(int* values_arr, int size) {

    int num_blocks = size / BLOCK_SIZE;
    int* dev_histogram = 0;
    int* dev_values_arr = 0;
    int* histogram = (int*)malloc((BLOCK_SIZE + 1) * sizeof(int));

    cudaError_t cudaStatus;

    if (size % BLOCK_SIZE != 0) {
        num_blocks = num_blocks + 1;
    }

    // allocate histogram and values_arr device memories
    cudaStatus = cudaMalloc((void**)&dev_histogram,
        (BLOCK_SIZE + 1) * sizeof(int));

    if (cudaStatus != cudaSuccess) {
        printf("ERROR: CUDA cudaMalloc() operation failed - %s\n",
            cudaGetErrorString(cudaStatus));
        exit(-1);
    }

    cudaStatus = cudaMemset(dev_histogram, 0, (BLOCK_SIZE + 1) * sizeof(int));

    if (cudaStatus != cudaSuccess) {
        printf("ERROR: CUDA cudaMemset() operation failed - %s\n",
            cudaGetErrorString(cudaStatus));
        exit(-1);
    }

    cudaStatus = cudaMalloc((void**)&dev_values_arr, size * sizeof(int));

    if (cudaStatus != cudaSuccess) {
        printf("ERROR: CUDA cudaMalloc() operation failed - %s\n",
            cudaGetErrorString(cudaStatus));
        exit(-1);
    }

    // copy values_arr memory in host to device
    cudaStatus = cudaMemcpy(dev_values_arr, values_arr, size * sizeof(int),
        cudaMemcpyHostToDevice);

    if (cudaStatus != cudaSuccess) {
        printf("ERROR: CUDA cudaMemcpy() operation failed - %s\n",
            cudaGetErrorString(cudaStatus));
        exit(-1);
    }


    printf("the number of blocks is %d\n\n", num_blocks);

    // calculate histogram on the gpu
    kernel_histogram<<<num_blocks, BLOCK_SIZE>>>(dev_histogram, dev_values_arr,
        size);

    // copy histogram memory in device to host
    cudaStatus = cudaMemcpy(histogram, dev_histogram,
        (BLOCK_SIZE + 1) * sizeof(int), cudaMemcpyDeviceToHost);

    if (cudaStatus != cudaSuccess) {
        printf("ERROR: CUDA cudaMemcpy() operation failed - %s\n",
            cudaGetErrorString(cudaStatus));
        exit(-1);
    }

    // free device memory
    cudaFree(dev_histogram);
    cudaFree(dev_values_arr);

    return histogram;
}

int main(int argc, char* argv[]) {

    unsigned int size = 21;
    int* histogram;
    int values_arr[] = { 2, 2, 2, 2, 2, 2, 2, 4, 5, 5, 5, 5, 7, 7, 7, 7, 19, 20, 21, 100, 256 };

    histogram = histogram_cuda(values_arr, size);

    for (int i = 1; i < BLOCK_SIZE + 1; i++) {
        if (histogram[i] > 0) {
            printf("%d : %d\n", i, histogram[i]);
        }
    }
}

The histogram is meant to record the number of occurrences of each value present in the input, with the allowed values being 1 to 256. Each block is to have a maximum of 256 threads. I am trying to limit the overall number of threads across the blocks so that each thread records the occurrence of one value in the histogram.

If I use "values_arr = { 2, 2, 2, 2, 2, 2, 2, 4, 5, 5, 5, 5, 7, 7, 7, 7, 19, 20, 21, 100, 256 }", which means the size is 21, I get:

2 : 7
4 : 1
5 : 4
7 : 4
19 : 1
20 : 1
21 : 1

I am trying to make it so that each value is recorded by one thread and all unneeded threads are discarded. Also, any other problems you spot and any suggestions for doing this in the best possible way would be appreciated. Thanks!

Upvotes: 0

Views: 519

Answers (1)

talonmies

Reputation: 72349

In the new version of the code in your question, you have two conditionally executed __syncthreads() calls. Conditional execution of __syncthreads() is illegal in CUDA and liable either to deadlock or to produce undefined behaviour, depending on your hardware and the use case.

If I modify the kernel like this:

__global__ void kernel_histogram(int* dev_histogram, int* dev_values_arr, unsigned int size) {

    __shared__ int temp[BLOCK_SIZE + 1];
    int thread_id, thread_value;

    thread_id = threadIdx.x + blockIdx.x * blockDim.x;

    temp[threadIdx.x + 1] = 0;
    // Synchronization is unconditional
    __syncthreads();

    // Load is performed conditionally
    if (thread_id < size) {
        thread_value = dev_values_arr[thread_id];
        atomicAdd(&temp[thread_value], 1);
    }

    // Synchronization is unconditional
    __syncthreads();

    atomicAdd(&(dev_histogram[threadIdx.x + 1]), temp[threadIdx.x + 1]);
}

I get this output:

the number of blocks is 1

2 : 7
4 : 1
5 : 4
7 : 4
19 : 1
20 : 1
21 : 1
100 : 1
256 : 1

This looks much more like what I would expect.
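Since you also asked for other suggestions: the host code checks every cudaMalloc/cudaMemset/cudaMemcpy call, but it never checks whether the kernel launch itself succeeded. A minimal sketch of a check you could place directly after the launch in histogram_cuda, reusing the existing cudaStatus variable (the error message wording is just a placeholder), might look like this:

    // check that the kernel launch was accepted by the runtime
    cudaStatus = cudaGetLastError();

    if (cudaStatus != cudaSuccess) {
        printf("ERROR: CUDA kernel launch failed - %s\n",
            cudaGetErrorString(cudaStatus));
        exit(-1);
    }

    // block until the kernel has finished and surface any execution errors
    cudaStatus = cudaDeviceSynchronize();

    if (cudaStatus != cudaSuccess) {
        printf("ERROR: CUDA kernel execution failed - %s\n",
            cudaGetErrorString(cudaStatus));
        exit(-1);
    }

The cudaMemcpy that follows the launch will synchronize anyway, but checking explicitly makes launch failures (for example, an invalid grid configuration) much easier to diagnose.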

Upvotes: 1
