Guilherme Torres Castro
Guilherme Torres Castro

Reputation: 15350

Cuda atomicInc not working

I'm implementing k-means, on GPU and for now i have the folowing code:

__device__ unsigned int cuda_delta = 0;

__global__ void kmeans_kernel(const sequence_t *data,
                          const sequence_t *centroids,
                          int * membership,
                          unsigned int n,
                          unsigned int numClusters )
{
int index = blockIdx.x * blockDim.x  + threadIdx.x;
if (index < n){

    int min_distance = INT_MAX;
    int nearest = -1;

    for (int i = 0; i < numClusters; i++){
        sequence_t centroid = centroids[i];
        int distance = distance(centroid, data[index]);
        if(distance < min_distance) {
            nearest = i;
            min_distance = distance;
        }
    }

    if(membership[index] != nearest) {
        membership[index]=nearest;
        atomicInc(&cuda_delta,n);
    }
}

As you can see, there is no data dependency on the algorithm, only in the variable cuda_delta, stored on global memory. According com the documentation:

An atomic function performs a read-modify-write atomic operation on one 32-bit or 64-bit word residing in global or shared memory

It is exactly what i need. Edit - here is all my host code

unsigned int delta=0; //Number of objects has diverged in current iteration

label = (int*)calloc(data_size,sizeof(int));
centroids = (sequence_t*)calloc(clusters,sizeof(sequence_t));

// cuda variables
sequence_t * cuda_data = NULL;
sequence_t * cuda_centroids = NULL;
int *cuda_membership = NULL;
unsigned int *cuda_tmp_centroidCount = NULL;

const unsigned int threadsPerBlock = 1024;
const unsigned int numBlocks = (data_size + threadsPerBlock - 1) / threadsPerBlock;
const unsigned int numBlocks2 = (clusters + threadsPerBlock - 1) / threadsPerBlock;

for(unsigned int i = 0;i < clusters;i++) {
    int h = i * data_size / clusters;
    centroids[i] = make_ulong3(data[h].x,data[h].y,data[h].z);
}

memset (label,-1,data_size * sizeof(int));

checkCuda(cudaMalloc(&cuda_data, data_size * sizeof(sequence_t)));
checkCuda(cudaMalloc(&cuda_centroids, clusters * sizeof(sequence_t)));
checkCuda(cudaMalloc(&cuda_membership, data_size * sizeof(int)));
checkCuda(cudaMalloc(&cuda_tmp_centroidCount, clusters * BIT_SIZE_OF(sequence_t) *sizeof(unsigned int)));


checkCuda(cudaMemcpy(cuda_data,data, data_size *sizeof(sequence_t) , cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(cuda_centroids, centroids, clusters *sizeof(sequence_t) , cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(cuda_membership, label, clusters *sizeof(int) , cudaMemcpyHostToDevice));
int pc = 0;

do {

    cudaMemset (cuda_tmp_centroidCount,0,clusters * BIT_SIZE_OF(sequence_t) *sizeof(unsigned int));
    delta = 0;
    checkCuda(cudaMemcpyToSymbol(cuda_delta, &delta,sizeof(unsigned int),0,cudaMemcpyHostToDevice));
    kmeans_kernel <<< numBlocks,threadsPerBlock>>>(cuda_data,
                                                   cuda_centroids,
                                                   cuda_membership,
                                                   data_size,
                                                   clusters); 
    cudaDeviceSynchronize();
    checkCuda(cudaMemcpyFromSymbol(&delta,cuda_delta,sizeof(unsigned int)));
    printf ("%d - delta = %d\n",pc,delta);
    checkCuda(cudaGetLastError());
    pc++;
}
while(delta > 0);
// copy output
checkCuda(cudaMemcpy(label,cuda_membership, clusters *sizeof(int) , cudaMemcpyDeviceToHost));
checkCuda(cudaMemcpy(centroids,cuda_centroids, clusters *sizeof(sequence_t) , cudaMemcpyDeviceToHost));

// free cuda memory
checkCuda(cudaFree(cuda_data));
checkCuda(cudaFree(cuda_centroids));
checkCuda(cudaFree(cuda_membership));
checkCuda(cudaFree(cuda_tmp_centroidCount));
checkCuda(cudaDeviceReset());

The delta value printed on the first iteration changes if i run the code multiple times, and it shouldn't. Most of the time the values printed are:

0 - delta = 18630
0 - delta = 859

The expected value is 18634. Am i missing something here ?

Edit The full code is available on github, to run the example just compile using make. And run the program using the following arguments, multiple times and you will see the delta value for the first iteration is not always the expected.

./cuda-means mus_musmusculus.dat 859

Thanks in advanced!

Upvotes: 0

Views: 1089

Answers (2)

Guilherme Torres Castro
Guilherme Torres Castro

Reputation: 15350

Shame on me! The atomic operation was working perfectly.

I was not "memseting" membership array. After i fix it, everything is working.

Upvotes: 0

Michael Haidl
Michael Haidl

Reputation: 5482

cudaMemcpyToSymbol(cuda_delta, &delta,sizeof(unsigned int));

and

cudaMemcpyFromSymbol(&delta,cuda_delta,sizeof(unsigned int));

are your problems.

From the documentation:

    cudaError_t cudaMemcpyFromSymbol ( void* dst, const void* symbol, size_t count, size_t offset = 0, cudaMemcpyKind kind = cudaMemcpyDeviceToHost )
Copies data from the given symbol on the device.
    Parameters

dst
    - Destination memory address 
symbol
    - Device symbol address 
count
    - Size in bytes to copy 
offset
    - Offset from start of symbol in bytes 
kind
    - Type of transfer

cudaMemcpyFromSymbol expects the adress the symbole as second parameter not the device symbol.

You can optain the address of a symbol using cudaGetSymbolAddress ( void** devPtr, const void* symbol )

void* is pure evil...

Upvotes: 1

Related Questions