CUDA triple loop

Question

I am pretty new to CUDA and I'm really struggling to convert some C code to CUDA C. It builds successfully, but it keeps crashing. The triple-loop function is certainly wrong, and I have no idea what I should change.

Function call:

/* Evaluate the correlation sum once for every epsilon in [0, max] and print it.
   The result is stored in and printed from the same variable, and the format
   string ends with an escaped newline instead of a raw line break. */
for (z=0;z<=max;z++)
    {
    correlation_sum=coefficient(x, n, dim, z);
    printf("result for epsilon %d returns %d\n", z, correlation_sum);
    }

Function

long coefficient(int vctr[40000], long numberofpoints, int coefficientrow, int epsilon)
{
long i, j, k, sum, numberofpairs;
long sq_epsilon;
sq_epsilon=epsilon*epsilon;
numberofpairs=0;
for (i=1;i<=numberofpoints-coefficientrow;i++)
    {
    sum=0;
    for (j=i+1;j<=numberofpoints+1-coefficientrow;j++)
        {
        for (k=0;k





I have problems bounding the work in the GPU part so that it doesn't go out of bounds (e.g. k must stay less than coefficientrow, as in the loop above). I saw that it is possible to derive indices from the block/thread IDs and guard them with an if statement. I have tried that, but for a triple for loop the result looks kind of... strange.

Here is almost full code.

    #define THREADS 1024
// Kernel from the question. Each thread derives one (i, j, k) triple from its
// x/y/z block and thread indices and evaluates the guarded body once; the
// kernel itself contains no loops.
//   vctr            - array indexed as vctr[i + k] and vctr[j + k]
//   numberofpoints  - used with coefficient_row to bound i and j
//   coefficient_row - upper bound (exclusive) for k
//   epsilon         - squared and compared against sum
//   numbofpairs     - pointer parameter, presumably intended as the output
//                     counter (see the NOTE(review) comments below)
// NOTE(review): main() launches this with <<<1, THREADS>>>, i.e. a
// one-dimensional configuration, so the y and z components that feed j and k
// are 0 for every thread of that launch.
__global__ void coefficient(int *vctr, int numberofpoints, int coefficient_row, int epsilon, int *numbofpairs){
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        int j = blockIdx.y * blockDim.y + threadIdx.y;
        int k = blockIdx.z * blockDim.z + threadIdx.z;
        int sum;
        // NOTE(review): this assigns 0 to the local pointer parameter itself;
        // it does not store 0 into the int that the caller's pointer refers to.
        numbofpairs = 0;
        int sq_epsilon = epsilon*epsilon;


        // The bounds use <=, so i can equal numberofpoints - coefficient_row
        // and j can equal numberofpoints + 1 - coefficient_row.
        if (i <= numberofpoints - coefficient_row)
        {
            sum = 0;
            if (j <= numberofpoints + 1 - coefficient_row)
            {
                // Only this single statement is guarded by the k test; each
                // thread contributes at most one squared difference to its
                // private sum.
                if (k < coefficient_row)
                    sum = sum + (vctr[i + k] - vctr[j + k])*(vctr[i + k] - vctr[j + k]);
                if (sum < sq_epsilon){
                    // NOTE(review): this advances the local pointer; nothing is
                    // written through it, so the caller's counter is untouched.
                    numbofpairs++;
                    sum = 0;
    }}}}    

// Host driver from the question: allocates device buffers, copies inputs,
// launches the coefficient kernel once per epsilon value z in [0, max], and
// prints the value copied back from d_numbofpairs.
// NOTE(review): no return value of any CUDA runtime call is checked here.
int main()
{
int n, dim, max, z;
int *d_n, *d_dim, *d_z, *d_x, *d_numbofpairs;
// x is a 40000-element local array; the visible code never assigns its
// elements before it is copied to the device.
int x[40000], correlation_sum = 0;
    n=10;  
    max=10;
    dim=3;  

    cudaMalloc((void **)&d_n, sizeof(int));
    cudaMalloc((void **)&d_dim, sizeof(int));
    cudaMalloc((void **)&d_z, sizeof(int));
    // NOTE(review): d_x receives room for a single int, while the kernel
    // indexes its first argument as vctr[i + k] and vctr[j + k].
    cudaMalloc((void **)&d_x, sizeof(int));
    cudaMalloc((void **)&d_numbofpairs, sizeof(int));

    cudaMemcpy(d_n, &n, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_dim, &dim, sizeof(int), cudaMemcpyHostToDevice);
    // NOTE(review): only sizeof(int) bytes of x are copied.
    cudaMemcpy(d_x, &x, sizeof(int), cudaMemcpyHostToDevice);

    for (z = 0; z <= max; z++)
    {
        cudaMemcpy(d_z, &z, sizeof(int), cudaMemcpyHostToDevice);
        // NOTE(review): *d_n, *d_dim and *d_z dereference, in host code,
        // pointers that were filled in by cudaMalloc above.
        coefficient << <1, THREADS >> >(d_x, *d_n, *d_dim, *d_z, d_numbofpairs);
        cudaMemcpy(&correlation_sum, d_numbofpairs, sizeof(int), cudaMemcpyDeviceToHost);
        // NOTE(review): the format string below is split across two source
        // lines as shown; presumably a newline escape was intended.
        printf("result for epsilon %d returns %d
", z, correlation_sum);
    }
    cudaFree(d_n);
    cudaFree(d_dim);
    cudaFree(d_z);
    cudaFree(d_x);
    cudaFree(d_numbofpairs);
    return 0;
}


I would like some help or tips what to change, what is wrong and why it keeps crashing so I could fix it. Thank you!

EDIT: I have filled in some missing parts of the code, sorry, my bad. As for threads and blocks, I am very confused: the GPU reports 1024 threads per block, and I'm not sure whether that is the value I should use.

Robert Crovella · Accepted Answer

So the "crash" is a seg fault. A seg fault is a problem in host code, not kernel code (although it could be in your usage of the CUDA API).

Your code has a variety of problems.

This might cause trouble:
```
int x[40000]
```
this creates a large stack-based allocation. Instead I suggest doing a dynamic allocation:
```
int *x = (int *)malloc(40000*sizeof(int));
```
dynamic allocations have much higher size limits.
It's fairly clear from your kernel usage that you intend to use the whole x vector. Therefore, this allocation on the device for d_x is not correct:
```
cudaMalloc((void **)&d_x, sizeof(int));
```
we need the same size allocation on the device as what we have on the host:
```
cudaMalloc((void **)&d_x, 40000*sizeof(int));
```
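Following on from the previous point, you probably want to copy the entire x vector to the device (it's not really clear, since your code doesn't show the initialization of x). You have also needlessly taken the address of x here: x already designates the start of the array, and once x becomes a malloc'ed pointer, &x would be the address of the pointer variable rather than of the data: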
```
cudaMemcpy(d_x, &x, sizeof(int), cudaMemcpyHostToDevice);
```
so we want something like this instead:
```
cudaMemcpy(d_x, x, 40000*sizeof(int), cudaMemcpyHostToDevice);
```
Your other kernel parameters appear to be scalar parameters. You're mostly handling those incorrectly as well:
```
__global__ void coefficient(int *vctr, int numberofpoints, int coefficient_row, int epsilon, int *numbofpairs){
```
for a parameter like numberofpoints specified as above (one-way pass to function), we simply pass by value the host quantity we want when calling the kernel, just like we would with an ordinary C function. So this kernel invocation is not correct (even though it appears to compile):
```
coefficient << <1, THREADS >> >(d_x, *d_n, *d_dim, *d_z, d_numbofpairs);
```
instead we want to pass just the host variables, by value:
```
coefficient << <1, THREADS >> >(d_x, n, dim, z, d_numbofpairs);
```
since d_numbofpairs is going both ways, your usage is correct there.
I would also recommend adding proper cuda error checking to your code.

Here is a fully worked example with the above errors fixed. I think the results are bogus of course because the input data (e.g. x) is not initialized.

$ cat t724.cu
#include <stdio.h>
#include <stdlib.h>

// Reports and aborts on any pending CUDA runtime error.
// Reads (and clears) the runtime's last-error state via cudaGetLastError(); if
// it is not cudaSuccess, prints msg, the CUDA error string and the call site
// to stderr, then exits with status 1. Place it right after the API call or
// kernel launch to be checked.
//   msg - C string identifying the step being checked
// The do { ... } while (0) wrapper makes the macro behave as one statement.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


#define THREADS 1024

// Kernel reproduced unchanged from the question. Each thread derives one
// (i, j, k) triple from its x/y/z block and thread indices and evaluates the
// guarded body once; the kernel itself contains no loops.
//   vctr            - array indexed as vctr[i + k] and vctr[j + k]
//   numberofpoints  - used with coefficient_row to bound i and j
//   coefficient_row - upper bound (exclusive) for k
//   epsilon         - squared and compared against sum
//   numbofpairs     - pointer parameter, presumably intended as the output
//                     counter (see the NOTE(review) comments below)
// NOTE(review): main() launches this with <<<1, THREADS>>>, i.e. a
// one-dimensional configuration, so the y and z components that feed j and k
// are 0 for every thread of that launch.
__global__ void coefficient(int *vctr, int numberofpoints, int coefficient_row, int epsilon, int *numbofpairs){
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        int j = blockIdx.y * blockDim.y + threadIdx.y;
        int k = blockIdx.z * blockDim.z + threadIdx.z;
        int sum;
        // NOTE(review): this assigns 0 to the local pointer parameter itself;
        // it does not store 0 into the int that the caller's pointer refers to.
        numbofpairs = 0;
        int sq_epsilon = epsilon*epsilon;


        // The bounds use <=, so i can equal numberofpoints - coefficient_row
        // and j can equal numberofpoints + 1 - coefficient_row.
        if (i <= numberofpoints - coefficient_row)
        {
            sum = 0;
            if (j <= numberofpoints + 1 - coefficient_row)
            {
                // Only this single statement is guarded by the k test; each
                // thread contributes at most one squared difference to its
                // private sum.
                if (k < coefficient_row)
                    sum = sum + (vctr[i + k] - vctr[j + k])*(vctr[i + k] - vctr[j + k]);
                if (sum < sq_epsilon){
                    // NOTE(review): this advances the local pointer; nothing is
                    // written through it, so the caller's counter is untouched.
                    numbofpairs++;
                    sum = 0;
    }}}}

// Host driver for the worked example.
// Allocates a 40000-int host vector and a device buffer of the same size,
// copies the whole vector to the device, then launches the coefficient kernel
// once per epsilon value z in [0, max] and prints the int copied back from
// d_numbofpairs. Scalar kernel arguments (n, dim, z) are passed by value.
// Returns 0 on success, -1 if the host allocation fails; CUDA failures abort
// through cudaCheckErrors.
int main()
{
  // Byte size shared by the host allocation, the device allocation and the
  // host-to-device copy, so the three can never disagree.
  const size_t x_bytes = 40000*sizeof(int);
  int n, dim, max, z;
  int  *d_x, *d_numbofpairs;
  int correlation_sum = 0;
  // Heap allocation instead of a 40000-int stack array.
  // NOTE: x is deliberately left uninitialized here, as in the question; fill
  // it with real input data before trusting any printed result.
  int *x = (int *)malloc(x_bytes);
  if (x == NULL) {printf("malloc fail\n"); return -1;}
    n=10;
    max=10;
    dim=3;

    // The device buffer must hold the full vector, not a single int, because
    // the kernel indexes it as vctr[i + k] / vctr[j + k].
    cudaMalloc((void **)&d_x, x_bytes);
    cudaCheckErrors("cudaMalloc 1 fail");
    cudaMalloc((void **)&d_numbofpairs, sizeof(int));
    cudaCheckErrors("cudaMalloc 2 fail");
    // Copy the entire vector; x is already a pointer, so no address-of here.
    cudaMemcpy(d_x, x, x_bytes, cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy 1 fail");

    for (z = 0; z <= max; z++)
    {
        coefficient<<<1, THREADS>>>(d_x, n, dim, z, d_numbofpairs);
        // This blocking copy also waits for the kernel, so the check that
        // follows covers both the launch and the copy.
        cudaMemcpy(&correlation_sum, d_numbofpairs, sizeof(int), cudaMemcpyDeviceToHost);
        cudaCheckErrors("cudaMemcpy 2/kernel fail");
        printf("result for epsilon %d returns %d\n", z, correlation_sum);
    }
    cudaFree(d_x);
    cudaFree(d_numbofpairs);
    free(x);
    return 0;
}
$ nvcc -o t724 t724.cu
$ ./t724
result for epsilon 0 returns 3
result for epsilon 1 returns 3
result for epsilon 2 returns 3
result for epsilon 3 returns 3
result for epsilon 4 returns 3
result for epsilon 5 returns 3
result for epsilon 6 returns 3
result for epsilon 7 returns 3
result for epsilon 8 returns 3
result for epsilon 9 returns 3
result for epsilon 10 returns 3
$

Note that I didn't make any changes to your kernel code.

CUDA triple loop

Answers (1)

Related Questions