Cuda passing an array of structs

Question

I am new to cuda and am trying to parallelize a very simple program shown below that was inspired from this link: https://devblogs.nvidia.com/even-easier-introduction-cuda/

typedef struct{
    int temp;
    int newtemp;
    int neighbors[20];
} S;
void add(int n, S * s){
    for(int i = 0; i < n; i++){
        int newTemp = 0;
        for(int j = 0; j < 20; j++){
            newTemp += s[s[i].neighbors[j]].temp;
        }
        newTemp /= 3;
        s[i].newtemp = newTemp;
    }
}
int main(int argc, char *argv[]){
    int n = 1<<21;
    S grid[n];

    for(int i = 0; i < n; i++){
        S tmp1;
        tmp1.temp = rand();
        for(int j = 0; j<20; j++){
            tmp1.neighbors[j] = rand()%n;
        }
        grid[i] = tmp1;
    }
    struct timespec start, end;
    double gettime_diff, time_diff; 
    clock_t t, starttime, endtime; 

    clock_gettime(CLOCK_REALTIME, &start);
    t = clock(); 
    time(&starttime);

    add(n,grid);

    for(int i = 0; i < n; i++){
        grid[i].temp = grid[i].newtemp;
        if(i%83940==1)printf("%d
",grid[i].temp);
    }
    return 0;
}

I am not getting desired results however as when I am updating temp all the new values are 0. I think that the issue is because the array of structs I am passing to my add function cannot be accessed in device memory. I, however, am having a hard time figuring out how to fix this. I found this post on stackoverflow and am a little unsure what the suggested answer did to fix the issue: Array of structs of arrays CUDA C

The cuda code I have for reference is here:

#include 
#include 
#include 
#include 
#define SIZE 1000
#define NS_PER_US 1000

typedef struct{
    int temp;
    int newtemp;
    int neighbors[20];
} S;
__global__ void add(int n, S * s){
    int index = threadIdx.x;
    int stride = blockDim.x;
    //printf("%d
",(n-index)/stride);
    //printf("%d
",s[0].temp);
    for(int i = index; i < n; i+=stride){
        printf("%d
",index);
        int newTemp = 0;
        for(int j = 0; j < 20; j++){
            newTemp += s[s[i].neighbors[j]].temp;
        }
        printf("%d
",index);
        newTemp /= 3;
        s[i].newtemp = newTemp;
    }
}

int main(int argc, char *argv[]){
    int  *h_a;
    int  *d_a;
    int  num_blocks= 2;
    int  num_th_per_blk= 5;

    int n = 1<<21;
    S grid[n];

    for(int i = 0; i < n; i++){
        S tmp1;
        tmp1.temp = rand();
        for(int j = 0; j<20; j++){
            tmp1.neighbors[j] = rand()%n;
        }
        grid[i] = tmp1;
    }
    struct timespec start, end;
    double gettime_diff, time_diff; 
    clock_t t, starttime, endtime; 

    clock_gettime(CLOCK_REALTIME, &start);
    t = clock(); 
    time(&starttime);

    size_t  memSize;
    memSize = num_blocks* num_th_per_blk* sizeof(int);
    h_a= (int*) malloc(memSize);

    cudaMallocManaged((void **)&grid, n * sizeof(S));
    cudaMalloc( (void**) &d_a, memSize);
    dim3  dimGrid(num_blocks);
    dim3  dimBlock(num_th_per_blk);    

    add<<< dimGrid, dimBlock >>>(n,grid);

    cudaMemcpy( h_a, d_a, memSize,cudaMemcpyDeviceToHost);

    for(int i = 0; i < n; i++){
        grid[i].temp = grid[i].newtemp;
        if(i%83940==1)printf("%d
",grid[i].newtemp);
    }
    clock_gettime(CLOCK_REALTIME, &end); 
    t = clock() - t; 
    time(&endtime);
    gettime_diff = (double) ((end.tv_sec - start.tv_sec)*CLOCKS_PER_SEC) + (double)((end.tv_nsec - start.tv_nsec)/NS_PER_US);
    time_diff = difftime(endtime, starttime);

    printf("	time (clock_gettime) %f
", gettime_diff);
    printf("	time (clock) %f
", ((float)t)/CLOCKS_PER_SEC);
    printf("	time (time) %f
", time_diff); 

    return 0;
}

I feel like there is a simple fix here that I am not seeing, or maybe I am missing a key concept. Whatever the case any help would be greatly appreciated.

talonmies · Accepted Answer

There is actually a lot wrong in your code, so much so that it is easier to post a working version than point out all the individual mistakes:

#include 
#include 
#include 
#include 
#define NS_PER_US 1000

typedef struct{
    int temp;
    int newtemp;
    int neighbors[20];
} S;

__global__ 
void add(int n, S * s)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    for(int i = index; i < n; i+=stride){
        int newTemp = 0;
        for(int j = 0; j < 20; j++){
            newTemp += s[s[i].neighbors[j]].temp;
        }
        newTemp /= 3;
        s[i].newtemp = newTemp;
    }
}

int main(int argc, char *argv[]){
    int n = 1<<10;
    S* grid;
    cudaMallocManaged((void **)&grid, n * sizeof(S));

    for(int i = 0; i < n; i++){
        S tmp1;
        tmp1.temp = rand()%n;
        for(int j = 0; j<20; j++){
            tmp1.neighbors[j] = rand()%n;
        }
        grid[i] = tmp1;
    }
    struct timespec start, end;
    double gettime_diff, time_diff; 
    clock_t t, starttime, endtime; 

    clock_gettime(CLOCK_REALTIME, &start);
    t = clock(); 
    time(&starttime);

    int  num_th_per_blk= 32;
    int  num_blocks= (n / num_th_per_blk) + (n % num_th_per_blk > 0) ? 1 : 0;

    dim3  dimGrid(num_blocks);
    dim3  dimBlock(num_th_per_blk);    

    add<<< dimGrid, dimBlock >>>(n,grid);
    cudaDeviceSynchronize();

    for(int i = 0; i < n; i++){
        grid[i].temp = grid[i].newtemp;
        if(i%10==1)printf("%d %d
",i,grid[i].temp);
    }
    clock_gettime(CLOCK_REALTIME, &end); 
    t = clock() - t; 
    time(&endtime);
    gettime_diff = (double) ((end.tv_sec - start.tv_sec)*CLOCKS_PER_SEC) + (double)((end.tv_nsec - start.tv_nsec)/NS_PER_US);
    time_diff = difftime(endtime, starttime);

    printf("	time (clock_gettime) %f
", gettime_diff);
    printf("	time (clock) %f
", ((float)t)/CLOCKS_PER_SEC);
    printf("	time (time) %f
", time_diff); 

    return 0;
}

The most egregious error is how you handle grid in the host code. Doing this:

S grid[n];

// code initializing grid

cudaMallocManaged((void **)&grid, n * sizeof(S));

is both illegal (you shouldn't try and set grid to another pointer value, it isn't a pointer), and nonsensical. cudaMallocManaged allocates new memory, so all you are doing is initializing grid, then throwing away all the carefully initialized memory and replacing it with uninitialized memory which you pass to the kernel. The kernel then operates on random data. Note also that the grid stride loop within the kernel is also incorrect, and both the original code and CUDA version potentially suffer from integer overflow due to how you initialize the temp members of the structure in both versions using rand().

Cuda passing an array of structs

Answers (1)

Related Questions