Stephen
Stephen

Reputation: 1518

Duplicate values in array after CUDA calculations

I copied one of the asynchronous CUDA/C++ examples and modified it to evaluate primality. My problem is that for every printed prime, the next value in the array is a duplicate of that value. Is this intended behavior or is it a problem with the way I programmed the example?

The Code:

////////////////////////////////////////////////////////////////////////////
//
// Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////

//
// This sample illustrates the usage of CUDA events for both GPU timing and
// overlapping CPU and GPU execution.  Events are inserted into a stream
// of CUDA calls.  Since CUDA stream calls are asynchronous, the CPU can
// perform computations while GPU is executing (including DMA memcopies
// between the host and device).  CPU can query CUDA events to determine
// whether GPU has completed tasks.
//

// includes, system
#include <stdio.h>

// includes CUDA Runtime
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper utility functions 


//set matrix to possible prime values
//evaluate if input is prime, sets variable to 0 if not prime
// Expects a 1D launch whose grid*block product exactly equals the array
// length: there is no `if (idx < n)` bounds guard, so any excess threads
// would write out of bounds.
__global__ void testPrimality(int * g_data) {
    // Canonical globally unique 1D thread index: 0,1,2,3,...
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // NOTE(review): integer division — idx 0 and 1 both produce 3, idx 2 and 3
    // both produce 4, and so on, so every candidate (and therefore every
    // surviving prime) appears twice in the array. This is the duplication
    // asked about; `3 + idx * 2` would generate the distinct odds 3,5,7,...
    g_data[idx] = 3 + idx / 2;

    if (g_data[idx] <= 3) {
        // Candidates 2 and 3 are prime and kept; anything <= 1 is zeroed.
        if (g_data[idx] <= 1) {
            g_data[idx] = 0;
        }
    }

    // Quick rejection of multiples of 2 and 3.
    else if (g_data[idx] % 2 == 0 || g_data[idx] % 3 == 0) {
        g_data[idx] = 0;
    }

    else {
        // Trial division by 6k±1 candidates up to sqrt(value). There is no
        // explicit break after zeroing: the loop only stops on the next
        // condition check (i*i <= 0 is false). `unsigned short i` is safe
        // only while candidates stay below ~65535^2 — true for the sizes
        // launched from main, but worth confirming if n grows.
        // Perf note: each comparison re-reads g_data[idx] from global memory;
        // a local copy would avoid the repeated loads.
        for (unsigned short i = 5; i * i <= g_data[idx]; i += 6) {
            if (g_data[idx] % i == 0 || g_data[idx] % (i + 2) == 0) {
                g_data[idx] = 0;
            }
        }
    }

}

// Check that all n elements of data hold the expected value x.
// Reports the first mismatch via printf and returns false; returns true
// when every element matches (vacuously true for n == 0).
bool correct_output(int *data, const int n, const int x)
{
    for (int idx = 0; idx < n; ++idx)
    {
        if (data[idx] == x)
            continue;

        printf("Error! data[%d] = %d, ref = %d\n", idx, data[idx], x);
        return false;
    }

    return true;
}

int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;

    printf("[%s] - Starting...\n", argv[0]);

    // This will pick the best possible CUDA capable device
    devID = findCudaDevice(argc, (const char **)argv);

    // get device name
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s]\n", deviceProps.name);

    // Problem size: 16M ints (64 MB). NOTE(review): `value` is only read by
    // the commented-out correct_output() check near the bottom.
    const int n = 16 * 1024 * 1024;
    int nbytes = n * sizeof(int);
    int value = 1;

    // allocate host memory
    // Pinned (page-locked) via cudaMallocHost so the cudaMemcpyAsync calls
    // below can actually run asynchronously with respect to the host.
    int *a = 0;
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
    memset(a, 0, nbytes);



    // allocate device memory
    int *d_a=0;
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    // Byte-wise fill with 0xFF; the kernel overwrites every element anyway.
    checkCudaErrors(cudaMemset(d_a, 255, nbytes));

    // set kernel launch configuration
    // 512 threads/block; n is a multiple of 512, so the grid exactly covers
    // the array (the kernel has no bounds guard and relies on this).
    dim3 threads = dim3(512, 1);
    dim3 blocks  = dim3(n / threads.x, 1);

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    checkCudaErrors(cudaDeviceSynchronize());
    float gpu_time = 0.0f;

    // asynchronously issue work to the GPU (all to stream 0)
    // All four calls are queued on the default stream and return immediately;
    // the CPU timer below therefore measures issue time, not execution time.
    // NOTE(review): no cudaGetLastError() after the launch, so a bad launch
    // configuration would go unreported.
    sdkStartTimer(&timer);
    cudaEventRecord(start, 0);
    cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
    //increment_kernel<<<blocks, threads, 0, 0>>>(d_a);
    testPrimality<<<blocks, threads, 0, 0 >>>(d_a);
    cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
    cudaEventRecord(stop, 0);
    sdkStopTimer(&timer);

    // have CPU do some work while waiting for stage 1 to finish
    unsigned long int counter=0;

    // Busy-wait polling the stop event; exits when the event completes or
    // when cudaEventQuery returns a real error (anything != cudaErrorNotReady).
    while (cudaEventQuery(stop) == cudaErrorNotReady)
    {
        counter++;
    }

    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

    // print the cpu and gpu times
    printf("time spent executing by the GPU: %.2f\n", gpu_time);
    printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

    //print values for all allocated memory space
    // Zero means "composite"; only surviving primes are printed.
    // NOTE(review): std::cout needs <iostream>, which is not among the listed
    // includes — presumably pulled in transitively by the helper headers.
    for (int i = 0; i < n; i++) {
        if (a[i] != 0) {
            std::cout << a[i]<< " : " << i << std::endl;
        }
    }

    // check the output for correctness
    //bool bFinalResults = correct_output(a, n, value);
    bool bFinalResults = true;



    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFree(d_a));

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}

Upvotes: 0

Views: 700

Answers (1)

Robert Crovella
Robert Crovella

Reputation: 151849

The duplication is arising from your actual "input" values to the operation. It's unclear to me what numerical sequence you were wanting but this line of code:

g_data[idx] = 3 + idx / 2;

does integer division (idx is of type int, and so is g_data[idx]).

The result of integer division by two means that each value in the "input" will be duplicated, therefore so will each value in the output. If you'd like to see the input values, modify your last cout statement like so:

        std::cout << a[i]<< " : " << i << " " << 3+i/2 << std::endl;

to "mimic" the input data generation you are doing in the kernel. If you do so, you'll see duplications in the final column of numbers.

EDIT: based on comments below, it seems there was some uncertainty about how the idx variable will generate numbers. This is a canonical method for generating a globally unique thread ID:

int idx = blockIdx.x * blockDim.x + threadIdx.x;

and in typical usage each thread will get a unique positive index that is one higher than the "previous" thread:

0,1,2,3,... 

It seems that the desired case was to create an "input" data set that looked like this:

3,5,7,9,...

Therefore the correct arithmetic in place of this:

g_data[idx] = 3 + idx / 2;

is this:

g_data[idx] = 3 + idx * 2;

Here is a fully worked example with that change and with the previous cout change I suggested:

$ cat t1119.cu
////////////////////////////////////////////////////////////////////////////
//
// Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////

//
// This sample illustrates the usage of CUDA events for both GPU timing and
// overlapping CPU and GPU execution.  Events are inserted into a stream
// of CUDA calls.  Since CUDA stream calls are asynchronous, the CPU can
// perform computations while GPU is executing (including DMA memcopies
// between the host and device).  CPU can query CUDA events to determine
// whether GPU has completed tasks.
//

// includes, system
#include <stdio.h>

// includes CUDA Runtime
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper utility functions


//set matrix to possible prime values
//evaluate if input is prime, sets variable to 0 if not prime
// Corrected version: generates the distinct odd candidates 3,5,7,... so no
// value appears twice. Expects a 1D launch whose grid*block product exactly
// equals the array length (no `if (idx < n)` bounds guard).
__global__ void testPrimality(int * g_data) {
    // Canonical globally unique 1D thread index: 0,1,2,3,...
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Multiplication (not the original division) gives each thread a unique
    // odd candidate: idx 0 -> 3, idx 1 -> 5, idx 2 -> 7, ...
    g_data[idx] = 3 + idx * 2;

    if (g_data[idx] <= 3) {
        // Candidate 3 is prime and kept; anything <= 1 would be zeroed
        // (unreachable with this generator, which starts at 3).
        if (g_data[idx] <= 1) {
            g_data[idx] = 0;
        }
    }

    // Quick rejection of multiples of 2 and 3.
    else if (g_data[idx] % 2 == 0 || g_data[idx] % 3 == 0) {
        g_data[idx] = 0;
    }

    else {
        // Trial division by 6k±1 candidates up to sqrt(value). No explicit
        // break after zeroing — the loop stops on the next condition check
        // (i*i <= 0 is false). `unsigned short i` is safe only while
        // candidates stay below ~65535^2; fine for n = 1024 used below.
        for (unsigned short i = 5; i * i <= g_data[idx]; i += 6) {
            if (g_data[idx] % i == 0 || g_data[idx] % (i + 2) == 0) {
                g_data[idx] = 0;
            }
        }
    }

}

// Check that all n elements of data hold the expected value x.
// Reports the first mismatch via printf and returns false; returns true
// when every element matches (vacuously true for n == 0).
bool correct_output(int *data, const int n, const int x)
{
    for (int idx = 0; idx < n; ++idx)
    {
        if (data[idx] == x)
            continue;

        printf("Error! data[%d] = %d, ref = %d\n", idx, data[idx], x);
        return false;
    }

    return true;
}

int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;

    printf("[%s] - Starting...\n", argv[0]);

    // This will pick the best possible CUDA capable device
    devID = findCudaDevice(argc, (const char **)argv);

    // get device name
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s]\n", deviceProps.name);

    // Problem size reduced from 16M to 1024 so the printed output is short.
    //const int n = 16 * 1024 * 1024;
    const int n = 1024;
    int nbytes = n * sizeof(int);
    //int value = 1;

    // allocate host memory
    // Pinned (page-locked) via cudaMallocHost so the cudaMemcpyAsync calls
    // below can actually run asynchronously with respect to the host.
    int *a = 0;
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
    memset(a, 0, nbytes);



    // allocate device memory
    int *d_a=0;
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    // Byte-wise fill with 0xFF; the kernel overwrites every element anyway.
    checkCudaErrors(cudaMemset(d_a, 255, nbytes));

    // set kernel launch configuration
    // 512 threads/block; n (1024) is a multiple of 512, so the grid exactly
    // covers the array (the kernel has no bounds guard and relies on this).
    dim3 threads = dim3(512, 1);
    dim3 blocks  = dim3(n / threads.x, 1);

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);

    checkCudaErrors(cudaDeviceSynchronize());
    float gpu_time = 0.0f;

    // asynchronously issue work to the GPU (all to stream 0)
    // All four calls are queued on the default stream and return immediately;
    // the CPU timer below therefore measures issue time, not execution time.
    sdkStartTimer(&timer);
    cudaEventRecord(start, 0);
    cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
    //increment_kernel<<<blocks, threads, 0, 0>>>(d_a);
    testPrimality<<<blocks, threads, 0, 0 >>>(d_a);
    cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
    cudaEventRecord(stop, 0);
    sdkStopTimer(&timer);

    // have CPU do some work while waiting for stage 1 to finish
    unsigned long int counter=0;

    // Busy-wait polling the stop event; exits when the event completes or
    // when cudaEventQuery returns a real error (anything != cudaErrorNotReady).
    while (cudaEventQuery(stop) == cudaErrorNotReady)
    {
        counter++;
    }

    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

    // print the cpu and gpu times
    printf("time spent executing by the GPU: %.2f\n", gpu_time);
    printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

    //print values for all allocated memory space
    // Zero means "composite"; only surviving primes are printed. The trailing
    // column re-derives the kernel's input value (3 + i*2) so input and output
    // can be compared side by side.
    for (int i = 0; i < n; i++) {
        if (a[i] != 0) {
            std::cout << a[i]<< " : " << i << " " << 3 + i * 2 << std::endl;
        }
    }

    // check the output for correctness
    //bool bFinalResults = correct_output(a, n, value);
    bool bFinalResults = true;



    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFree(d_a));

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
$ nvcc -I/usr/local/cuda/samples/common/inc t1119.cu -o t1119
$ cuda-memcheck ./t1119

(excerpted output:)

337 : 167 337
347 : 172 347
349 : 173 349
353 : 175 353
359 : 178 359
367 : 182 367
373 : 185 373
379 : 188 379
383 : 190 383
389 : 193 389
397 : 197 397
401 : 199 401
409 : 203 409
419 : 208 419
421 : 209 421
431 : 214 431
433 : 215 433
439 : 218 439
443 : 220 443
449 : 223 449
457 : 227 457
461 : 229 461
463 : 230 463
467 : 232 467
479 : 238 479
487 : 242 487
491 : 244 491
499 : 248 499
503 : 250 503
509 : 253 509
521 : 259 521
523 : 260 523
541 : 269 541
547 : 272 547
557 : 277 557
563 : 280 563
569 : 283 569
571 : 284 571
577 : 287 577
587 : 292 587
593 : 295 593
599 : 298 599
601 : 299 601
607 : 302 607
613 : 305 613
617 : 307 617
619 : 308 619

As can be seen above, there are no duplicates in the output sequence.

Upvotes: 2

Related Questions