Reputation: 1
I'm having a bit of an issue understanding how I could implement CUDA shared memory, since I'm not using the thread number for anything other than to check which calculations a given thread should do.
/* Applies a precomputed lookup table (lut) to img_in, writing the result
   clamped to [0, 255] into img_out. Each thread processes a contiguous
   chunk of img_size/numOfThreads pixels.
   NOTE(review): hist_in and nbr_bin are not used in this kernel. When
   img_size is not divisible by numOfThreads, the trailing
   img_size % numOfThreads pixels are never written — confirm that is
   intended. The numOfThreads == 1 special case appears redundant: with a
   single thread x is 0, so the else branch would yield the same end. */
__global__ void gpu_histogram_equalization(unsigned char * img_out, unsigned char * img_in,
int * hist_in, int img_size, int nbr_bin, int numOfThreads, int * lut){
int i = 0;
/* Flat global thread index. */
int x = threadIdx.x + blockDim.x*blockIdx.x;
int start;
int end;
/* Get the result image */
/* Guard: threads past the image size have no chunk to process. */
if(x >= img_size) {
return;
}
/* Thread x handles the half-open pixel range [start, end). */
start = ((img_size/numOfThreads) * x);
if(numOfThreads == 1) {
end = (img_size/numOfThreads);
}
else {
end = ((img_size/numOfThreads) * (x+1));
}
for(i = start; i < end; i ++){
/* Clamp the LUT output to the 8-bit pixel range. */
if(lut[img_in[i]] > 255){
img_out[i] = 255;
}
else{
img_out[i] = (unsigned char)lut[img_in[i]];
}
}
}
Can anyone confirm my speculation that it is not possible to make use of shared memory here?
Upvotes: 0
Views: 142
Reputation: 2870
Using shared memory gives you a performance increase only if you reuse the same data multiple times — which this kernel does not. Instead, the code can be rewritten as one thread per pixel, which improves memory-access patterns (coalescing) and makes shared memory unnecessary.
Something like this:
/* Applies a precomputed lookup table (lut) to img_in with one thread per
   pixel, writing the value clamped to [0, 255] into img_out.
   Launch with enough threads to cover every pixel, e.g.
   blocks = (img_size + threads - 1) / threads.
   hist_in, nbr_bin and numOfThreads are unused here; they are kept so the
   signature stays compatible with the original kernel's callers. */
__global__ void gpu_histogram_equalization(unsigned char * img_out, unsigned char * img_in,
int * hist_in, int img_size, int nbr_bin, int numOfThreads, int * lut){
/* Flat global thread index: adjacent threads touch adjacent pixels,
   so global loads/stores are coalesced. */
int x = threadIdx.x + blockDim.x*blockIdx.x;
/* Get the result image */
/* Guard the grid tail: threads past the last pixel do nothing. */
if(x >= img_size) {
return;
}
/* Read the LUT value once. Fix: the original stored through img_out[i]
   with an undeclared variable i — the intended index is x. */
int lutval = lut[img_in[x]];
/* Branchless clamp to the 8-bit output range. */
img_out[x] = (unsigned char)(lutval > 255 ? 255 : lutval);
}
Upvotes: 1