Reputation: 202
I am new to CUDA and am trying to parallelize the very simple program shown below, which was inspired by this link: https://devblogs.nvidia.com/even-easier-introduction-cuda/
/* One element of the array processed by add(). */
typedef struct{
/* Current value; add() reads this field of each neighbour when summing. */
int temp;
/* Result written by add(); main() copies it back into temp afterwards. */
int newtemp;
/* Indices of 20 other elements of the same array (main() fills them with rand()%n). */
int neighbors[20];
} S;
/*
 * For each of the first n elements of s, add up the temp field of the 20
 * elements named in its neighbors array, divide that sum by 3 (integer
 * division) and store the quotient in the element's newtemp field.
 *
 * Only temp is read and only newtemp is written, so the outcome does not
 * depend on the order in which elements are visited.
 */
void add(int n, S * s){
    S *cell = s;
    for (int remaining = n; remaining > 0; --remaining, ++cell) {
        int sum = 0;
        const int *nb = cell->neighbors;
        const int *const nbEnd = nb + 20;
        while (nb != nbEnd) {
            sum += s[*nb].temp;
            ++nb;
        }
        cell->newtemp = sum / 3;
    }
}
int main(int argc, char *argv[]){
int n = 1<<21;
S grid[n];
for(int i = 0; i < n; i++){
S tmp1;
tmp1.temp = rand();
for(int j = 0; j<20; j++){
tmp1.neighbors[j] = rand()%n;
}
grid[i] = tmp1;
}
struct timespec start, end;
double gettime_diff, time_diff;
clock_t t, starttime, endtime;
clock_gettime(CLOCK_REALTIME, &start);
t = clock();
time(&starttime);
add(n,grid);
for(int i = 0; i < n; i++){
grid[i].temp = grid[i].newtemp;
if(i%83940==1)printf("%d\n",grid[i].temp);
}
return 0;
}
However, I am not getting the desired results: after updating temp, all the new values are 0. I think the issue is that the array of structs I pass to my add function cannot be accessed from the device. I am, however, having a hard time figuring out how to fix this. I found this post on Stack Overflow, but I am unsure what the suggested answer did to fix the issue: Array of structs of arrays CUDA C
The cuda code I have for reference is here:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define SIZE 1000
#define NS_PER_US 1000
// One element of the array processed by the add kernel.
typedef struct{
// Current value; the kernel reads this field of each neighbour when summing.
int temp;
// Result written by the kernel; main() copies it back into temp afterwards.
int newtemp;
// Indices of 20 other elements of the same array (main() fills them with rand()%n).
int neighbors[20];
} S;
// Kernel: for every element i visited by this thread, sums the temp field of
// the 20 elements listed in s[i].neighbors, divides the sum by 3 (integer
// division) and stores the result in s[i].newtemp.
//
// n: number of elements in s.
// s: array of n elements; must be memory the device can dereference.
//
// NOTE(review): the start index and the stride below are derived only from
// threadIdx.x and blockDim.x. blockIdx.x is never used, so every block of the
// launch visits exactly the same set of i values.
__global__ void add(int n, S * s){
// Position of this thread inside its own block (not unique across blocks).
int index = threadIdx.x;
// Step between successive elements handled by one thread: threads per block.
int stride = blockDim.x;
//printf("%d\n",(n-index)/stride);
//printf("%d\n",s[0].temp);
for(int i = index; i < n; i+=stride){
// Debug output: prints the thread index once per loop iteration.
printf("%d\n",index);
int newTemp = 0;
for(int j = 0; j < 20; j++){
// neighbors[j] is used directly as an index into s; no range check is done.
newTemp += s[s[i].neighbors[j]].temp;
}
// Debug output: second print per iteration, after the neighbour sum.
printf("%d\n",index);
newTemp /= 3;
s[i].newtemp = newTemp;
}
}
// Driver for the CUDA attempt: fills an array of S with random data, launches
// the add kernel on it, copies newtemp back into temp on the host, prints a
// sample of the values and three timing measurements.
// No return value of any CUDA runtime call in this function is checked.
int main(int argc, char *argv[]){
// h_a / d_a: host and device scratch buffers. They are allocated and copied
// below, but the kernel never receives d_a and h_a is never read.
int *h_a;
int *d_a;
int num_blocks= 2;
int num_th_per_blk= 5;
int n = 1<<21;
// grid is an array with automatic storage whose length is the run-time value n.
S grid[n];
for(int i = 0; i < n; i++){
// Only temp and neighbors are assigned; tmp1.newtemp is left unset here.
S tmp1;
tmp1.temp = rand();
for(int j = 0; j<20; j++){
// rand()%n keeps every neighbour index inside [0, n).
tmp1.neighbors[j] = rand()%n;
}
grid[i] = tmp1;
}
// Three clocks are started here and read again after the result loop.
struct timespec start, end;
double gettime_diff, time_diff;
clock_t t, starttime, endtime;
clock_gettime(CLOCK_REALTIME, &start);
t = clock();
// NOTE(review): starttime is declared clock_t but is passed to time().
time(&starttime);
size_t memSize;
// One int per launched thread (num_blocks * num_th_per_blk).
memSize = num_blocks* num_th_per_blk* sizeof(int);
h_a= (int*) malloc(memSize);
// NOTE(review): grid is an array, not a pointer variable. This call is given
// the address of the array's own storage, so it cannot make the name `grid`
// refer to the managed allocation, and nothing copies the values initialised
// above into the memory that cudaMallocManaged returns.
cudaMallocManaged((void **)&grid, n * sizeof(S));
cudaMalloc( (void**) &d_a, memSize);
dim3 dimGrid(num_blocks);
dim3 dimBlock(num_th_per_blk);
// Launches 2 blocks of 5 threads; the array `grid` is passed as the S* argument.
add<<< dimGrid, dimBlock >>>(n,grid);
// Copies d_a to h_a; the kernel above does not write d_a.
cudaMemcpy( h_a, d_a, memSize,cudaMemcpyDeviceToHost);
// Promote the computed values to temp and print one value per 83940 elements.
for(int i = 0; i < n; i++){
grid[i].temp = grid[i].newtemp;
if(i%83940==1)printf("%d\n",grid[i].newtemp);
}
clock_gettime(CLOCK_REALTIME, &end);
t = clock() - t;
time(&endtime);
// Seconds difference is scaled by CLOCKS_PER_SEC and the nanosecond difference
// is divided by NS_PER_US (1000) using integer division.
// NOTE(review): the sum is in microseconds only if CLOCKS_PER_SEC is 1000000 - confirm.
gettime_diff = (double) ((end.tv_sec - start.tv_sec)*CLOCKS_PER_SEC) + (double)((end.tv_nsec - start.tv_nsec)/NS_PER_US);
time_diff = difftime(endtime, starttime);
printf("\ttime (clock_gettime) %f\n", gettime_diff);
printf("\ttime (clock) %f\n", ((float)t)/CLOCKS_PER_SEC);
printf("\ttime (time) %f\n", time_diff);
// h_a, d_a and the managed allocation are not released before returning.
return 0;
}
I feel like there is a simple fix here that I am not seeing, or maybe I am missing a key concept. Whatever the case any help would be greatly appreciated.
Upvotes: 1
Views: 330
Reputation: 72349
There is actually a lot wrong in your code, so much so that it is easier to post a working version than point out all the individual mistakes:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define NS_PER_US 1000
// One element of the array processed by the add kernel.
typedef struct{
// Current value; the kernel reads this field of each neighbour when summing.
int temp;
// Result written by the kernel; main() copies it back into temp afterwards.
int newtemp;
// Indices of 20 other elements of the same array (main() fills them with rand()%n).
int neighbors[20];
} S;
// Kernel: for each element assigned to this thread, sums the temp field of
// the 20 elements listed in its neighbors array, divides the sum by 3
// (integer division) and writes the quotient to the element's newtemp field.
//
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
// index (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total
// number of launched threads (gridDim.x * blockDim.x), so any grid size
// covers all n elements.
//
// n: number of elements in s.
// s: array of n elements dereferenced on the device. Only temp is read and
//    only newtemp is written.
__global__
void add(int n, S * s)
{
    const int first = threadIdx.x + blockIdx.x * blockDim.x;
    const int step = blockDim.x * gridDim.x;

    int cell = first;
    while (cell < n) {
        int acc = 0;
        for (int k = 0; k != 20; ++k) {
            const int nb = s[cell].neighbors[k];
            acc += s[nb].temp;
        }
        s[cell].newtemp = acc / 3;
        cell += step;
    }
}
// Prints a diagnostic and exits if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Driver: allocates n elements of managed memory, fills them with random
// values and neighbour indices, runs the add kernel, copies newtemp back into
// temp on the host, prints every tenth result and three timing measurements.
//
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main(int argc, char *argv[]){
    (void)argc;
    (void)argv;

    const int n = 1<<10;

    // Managed memory is reachable from both host and device, so it is
    // allocated first and initialised in place.
    S* grid = NULL;
    checkCuda(cudaMallocManaged((void **)&grid, n * sizeof(S)), "cudaMallocManaged");

    for (int i = 0; i < n; i++) {
        // temp is kept below n so that the sum of 20 neighbours fits in an int.
        grid[i].temp = rand() % n;
        for (int j = 0; j < 20; j++) {
            grid[i].neighbors[j] = rand() % n;   // always a valid index
        }
    }

    struct timespec start, end;
    time_t starttime, endtime;   // time() works on time_t, not clock_t
    clock_gettime(CLOCK_REALTIME, &start);
    clock_t t = clock();
    time(&starttime);

    const int num_th_per_blk = 32;
    // Ceiling division. The previous `a + (b > 0) ? 1 : 0` parsed as
    // `(a + (b > 0)) ? 1 : 0` and therefore always produced a single block.
    const int num_blocks = (n + num_th_per_blk - 1) / num_th_per_blk;
    dim3 dimGrid(num_blocks);
    dim3 dimBlock(num_th_per_blk);

    add<<< dimGrid, dimBlock >>>(n, grid);
    checkCuda(cudaGetLastError(), "kernel launch");          // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "kernel execution");  // faults raised while running

    // The kernel has finished, so the host may touch the managed memory again.
    for (int i = 0; i < n; i++) {
        grid[i].temp = grid[i].newtemp;
        if (i % 10 == 1) printf("%d %d\n", i, grid[i].temp);
    }

    clock_gettime(CLOCK_REALTIME, &end);
    t = clock() - t;
    time(&endtime);

    // Elapsed wall-clock time in microseconds.
    const double us_per_sec = 1000000.0;
    double gettime_diff = (double)(end.tv_sec - start.tv_sec) * us_per_sec
                        + (double)(end.tv_nsec - start.tv_nsec) / NS_PER_US;
    double time_diff = difftime(endtime, starttime);

    printf("\ttime (clock_gettime) %f\n", gettime_diff);
    printf("\ttime (clock) %f\n", ((float)t)/CLOCKS_PER_SEC);
    printf("\ttime (time) %f\n", time_diff);

    checkCuda(cudaFree(grid), "cudaFree");
    return 0;
}
The most egregious error is how you handle grid
in the host code. Doing this:
S grid[n];
// code initializing grid
cudaMallocManaged((void **)&grid, n * sizeof(S));
is both illegal (you shouldn't try to set grid to another pointer value; it is an array, not a pointer) and nonsensical. cudaMallocManaged
allocates new memory, so all you are doing is initializing grid
, then throwing away all the carefully initialized memory and replacing it with uninitialized memory, which you pass to the kernel. The kernel then operates on random data. Note that the grid-stride loop within the kernel is also incorrect: it derives the index and the stride from threadIdx.x and blockDim.x only, so every block repeats the same work instead of covering a different part of the array. Finally, both the original code and the CUDA version potentially suffer from integer overflow, because twenty temp members are summed into an int and each of them is initialized using rand()
.
Upvotes: 2