CUDA 2D layered Texture from 3D array (float vs int)

Question

I got the following program which is pretty much the SDK Sample "Simple Layered Texture".

// includes, system
#include 
#include 
#include 
#include 

// includes, kernels
#include 

// includes, project
#include 
#include   // helper for shared that are common to CUDA SDK samples

#define EXIT_WAIVED 2

static char *sSDKname = "simpleLayeredTexture";

// includes, kernels
// declare texture reference for layered 2D float texture
// Note: The "dim" field in the texture reference template is now deprecated.
// Instead, please use a texture type macro such as cudaTextureType1D, etc.

typedef int TYPE;

texture tex;

////////////////////////////////////////////////////////////////////////////////
//! Transform a layer of a layered 2D texture using texture lookups
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
transformKernel(TYPE *g_odata, int width, int height, int layer)
{
    // calculate this thread's data point
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    // 0.5f offset and division are necessary to access the original data points
    // in the texture (such that bilinear interpolation will not be activated).
    // For details, see also CUDA Programming Guide, Appendix D
    float u = (x+0.5f) / (float) width;
    float v = (y+0.5f) / (float) height;

    // read from texture, do expected transformation and write to global memory
    TYPE sample = tex2DLayered(tex, u, v, layer);
    g_odata[layer*width*height + y*width + x] = sample;

    printf("Sample %d
", sample);
}


////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    printf("[%s] - Starting...
", sSDKname);

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

    bool bResult = true;

    // get number of SMs on this GPU
    cudaDeviceProp deviceProps;

    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
    printf("SM %d.%d
", deviceProps.major, deviceProps.minor);

    if (deviceProps.major < 2)
    {
        printf("%s requires SM >= 2.0 to support Texture Arrays.  Test will be waived... 
", sSDKname);
        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }

    // generate input data for layered texture
    unsigned int width=16, height=16, num_layers = 5;
    unsigned int size = width * height * num_layers * sizeof(TYPE);
    TYPE *h_data = (TYPE *) malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++)
        {
            h_data[layer*width*height + i] = 15;//(float)i;
        }

    // this is the expected transformation of the input data (the expected output)
    TYPE *h_data_ref = (TYPE *) malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++)
        {
            h_data_ref[layer*width*height + i] = h_data[layer*width*height + i];
        }

    // allocate device memory for result
    TYPE *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **) &d_data, size));

    // allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc();
    cudaArray *cu_3darray;
    checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));
    cudaMemcpy3DParms myparms = {0};
    myparms.srcPos = make_cudaPos(0,0,0);
    myparms.dstPos = make_cudaPos(0,0,0);
    myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(TYPE), width, height);
    myparms.dstArray = cu_3darray;
    myparms.extent = make_cudaExtent(width, height, num_layers);
    myparms.kind = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&myparms));

    // set texture parameters
    tex.addressMode[0] = cudaAddressModeWrap;
    tex.addressMode[1] = cudaAddressModeWrap;
//    tex.filterMode = cudaFilterModeLinear;
    tex.filterMode = cudaFilterModePoint;
    tex.normalized = true;  // access with normalized texture coordinates

    // Bind the array to the texture
    checkCudaErrors(cudaBindTextureToArray(tex, cu_3darray, channelDesc));

    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

    printf("Covering 2D data array of %d x %d: Grid size is %d x %d, each block has 8 x 8 threads
",
           width, height, dimGrid.x, dimGrid.y);

    transformKernel<<< dimGrid, dimBlock >>>(d_data, width, height, 0);  // warmup (for better timing)

    // check if kernel execution generated an error
    getLastCudaError("warmup Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // execute the kernel
    for (unsigned int layer = 0; layer < num_layers; layer++)
        transformKernel<<< dimGrid, dimBlock, 0 >>>(d_data, width, height, layer);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %.3f msec
", sdkGetTimerValue(&timer));
    printf("%.2f Mtexlookups/sec
", (width *height *num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
    sdkDeleteTimer(&timer);

    // allocate mem for the result on host side
    TYPE *h_odata = (TYPE *) malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

    printf("Comparing kernel output to expected data
");

#define MIN_EPSILON_ERROR 5e-3f
    bResult = compareData(h_odata, h_data_ref, width*height*num_layers, MIN_EPSILON_ERROR, 0.0f);

    printf("Host sample: %d == %d
", h_data_ref[0], h_odata[0]);

    // cleanup memory
    free(h_data);
    free(h_data_ref);
    free(h_odata);

    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cu_3darray));

    cudaDeviceReset();

    if (bResult)
        printf("Success!");
    else
        printf("Failure!");

    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

The output is correct if I use int (or uint) as TYPE. For float it produces wrong results i.e. always 0 (eventhough the SDK compareData function says everything is fine!?). I'm starting to believe that there is a bug in CUDA. I'm using version 5.0 on a Kepler K20.

Any suggestions and test results are appreciated. The code should be runnable as is.

Thanks in advance, Ben

Edit: OS is Linux (Ubuntu 12.04.2 LTS) x86_64 3.2.0-38-generic

CUDA 2D layered Texture from 3D array (float vs int)

Answers (1)

Related Questions