Reputation: 15350
I'm implementing k-means, on GPU and for now i have the folowing code:
__device__ unsigned int cuda_delta = 0;
__global__ void kmeans_kernel(const sequence_t *data,
const sequence_t *centroids,
int * membership,
unsigned int n,
unsigned int numClusters )
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < n){
int min_distance = INT_MAX;
int nearest = -1;
for (int i = 0; i < numClusters; i++){
sequence_t centroid = centroids[i];
int distance = distance(centroid, data[index]);
if(distance < min_distance) {
nearest = i;
min_distance = distance;
}
}
if(membership[index] != nearest) {
membership[index]=nearest;
atomicInc(&cuda_delta,n);
}
}
As you can see, there is no data dependency on the algorithm, only in the variable cuda_delta, stored on global memory. According com the documentation:
An atomic function performs a read-modify-write atomic operation on one 32-bit or 64-bit word residing in global or shared memory
It is exactly what i need. Edit - here is all my host code
unsigned int delta=0; //Number of objects has diverged in current iteration
label = (int*)calloc(data_size,sizeof(int));
centroids = (sequence_t*)calloc(clusters,sizeof(sequence_t));
// cuda variables
sequence_t * cuda_data = NULL;
sequence_t * cuda_centroids = NULL;
int *cuda_membership = NULL;
unsigned int *cuda_tmp_centroidCount = NULL;
const unsigned int threadsPerBlock = 1024;
const unsigned int numBlocks = (data_size + threadsPerBlock - 1) / threadsPerBlock;
const unsigned int numBlocks2 = (clusters + threadsPerBlock - 1) / threadsPerBlock;
for(unsigned int i = 0;i < clusters;i++) {
int h = i * data_size / clusters;
centroids[i] = make_ulong3(data[h].x,data[h].y,data[h].z);
}
memset (label,-1,data_size * sizeof(int));
checkCuda(cudaMalloc(&cuda_data, data_size * sizeof(sequence_t)));
checkCuda(cudaMalloc(&cuda_centroids, clusters * sizeof(sequence_t)));
checkCuda(cudaMalloc(&cuda_membership, data_size * sizeof(int)));
checkCuda(cudaMalloc(&cuda_tmp_centroidCount, clusters * BIT_SIZE_OF(sequence_t) *sizeof(unsigned int)));
checkCuda(cudaMemcpy(cuda_data,data, data_size *sizeof(sequence_t) , cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(cuda_centroids, centroids, clusters *sizeof(sequence_t) , cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(cuda_membership, label, clusters *sizeof(int) , cudaMemcpyHostToDevice));
int pc = 0;
do {
cudaMemset (cuda_tmp_centroidCount,0,clusters * BIT_SIZE_OF(sequence_t) *sizeof(unsigned int));
delta = 0;
checkCuda(cudaMemcpyToSymbol(cuda_delta, &delta,sizeof(unsigned int),0,cudaMemcpyHostToDevice));
kmeans_kernel <<< numBlocks,threadsPerBlock>>>(cuda_data,
cuda_centroids,
cuda_membership,
data_size,
clusters);
cudaDeviceSynchronize();
checkCuda(cudaMemcpyFromSymbol(&delta,cuda_delta,sizeof(unsigned int)));
printf ("%d - delta = %d\n",pc,delta);
checkCuda(cudaGetLastError());
pc++;
}
while(delta > 0);
// copy output
checkCuda(cudaMemcpy(label,cuda_membership, clusters *sizeof(int) , cudaMemcpyDeviceToHost));
checkCuda(cudaMemcpy(centroids,cuda_centroids, clusters *sizeof(sequence_t) , cudaMemcpyDeviceToHost));
// free cuda memory
checkCuda(cudaFree(cuda_data));
checkCuda(cudaFree(cuda_centroids));
checkCuda(cudaFree(cuda_membership));
checkCuda(cudaFree(cuda_tmp_centroidCount));
checkCuda(cudaDeviceReset());
The delta value printed on the first iteration changes if i run the code multiple times, and it shouldn't. Most of the time the values printed are:
0 - delta = 18630
0 - delta = 859
The expected value is 18634. Am i missing something here ?
Edit The full code is available on github, to run the example just compile using make. And run the program using the following arguments, multiple times and you will see the delta value for the first iteration is not always the expected.
./cuda-means mus_musmusculus.dat 859
Thanks in advanced!
Upvotes: 0
Views: 1089
Reputation: 15350
Shame on me! The atomic operation was working perfectly.
I was not "memseting" membership array. After i fix it, everything is working.
Upvotes: 0
Reputation: 5482
cudaMemcpyToSymbol(cuda_delta, &delta,sizeof(unsigned int));
and
cudaMemcpyFromSymbol(&delta,cuda_delta,sizeof(unsigned int));
are your problems.
From the documentation:
cudaError_t cudaMemcpyFromSymbol ( void* dst, const void* symbol, size_t count, size_t offset = 0, cudaMemcpyKind kind = cudaMemcpyDeviceToHost )
Copies data from the given symbol on the device.
Parameters
dst
- Destination memory address
symbol
- Device symbol address
count
- Size in bytes to copy
offset
- Offset from start of symbol in bytes
kind
- Type of transfer
cudaMemcpyFromSymbol
expects the adress the symbole as second parameter not the device symbol.
You can optain the address of a symbol using cudaGetSymbolAddress ( void** devPtr, const void* symbol )
void*
is pure evil...
Upvotes: 1