Reputation: 51

How to create and use a 1D layered texture in CUDA

I am new to CUDA. I have figured out how to use 1D and 2D textures in CUDA. However, I am struggling with how to use a 1D layered texture. The output of my kernel, which uses the texture, is all zeros, which is definitely incorrect. However, I am not sure what I am doing wrong. I have serious doubts that I set up this texture correctly, but I checked for CUDA errors everywhere and couldn't find any issues. Can someone show me how to correctly set up a 1D layered texture and use it? Here is my code. Thanks in advance:

// To Compile: nvcc backproj.cu -o backproj.out
// To Run: ./backproj.out

// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

// pi as a double, obtained from acos(-1); the macro expands to the call at every use
#define pi acos(-1)

// 1D layered float texture reference; the kernel below fetches from it with tex1DLayered
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;

// 1D interpolation kernel (intended to behave much like 1D interpolation in MATLAB).
// Each thread handles one entry of d_locations: it fetches the texture value at that
// position from the requested layer of texRef and stores it in d_output at the same index.
__global__ void interp1Kernel(float* d_output, float* d_locations, int numlocations, int layer) {
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads beyond the end of the location list have nothing to do
    if (tid >= numlocations) {
        return;
    }

    // Offset the requested position by 0.5f before the fetch
    // (presumably so integer positions coincide with texel centres -- confirm against the texture-fetch rules)
    const float texCoord = (float) d_locations[tid] + 0.5f;

    // Texture fetch, then store to global memory
    d_output[tid] = tex1DLayered(texRef, texCoord, layer);
}

// Host code
// Builds a small multi-layer test signal, copies it into a CUDA array bound to texRef,
// runs interp1Kernel on layer 0 at upsampled positions and prints input and output.
int main()
{
    // Setup h_data and locations to interpolate from
    const unsigned int len = 10;          // samples per layer
    const unsigned int numlayers = 3;
    const unsigned int upsamp = 3;        // query points per original sample interval
    const unsigned int loclen = 1 + (len - 1) * upsamp;   // number of query positions
    float idx_spacing = 1/(float)upsamp;
    // h_data is indexed [sample][layer], so consecutive floats in memory belong to different layers
    float h_data[len][numlayers], h_loc[loclen];
    for (int i = 0; i < len; i++) 
        for (int j = 0; j < numlayers; j++)
            h_data[i][j] = 1+cosf((float) pi*i/(j+1.0f));
    // Query positions 0, 1/upsamp, 2/upsamp, ...
    for (int i = 0; i < loclen; i ++) 
        h_loc[i] = i*idx_spacing;

    // Get the memory locations you want
    // (device copy of the query positions; note that d_loc is not freed at the end of main)
    float* d_loc;
    cudaMalloc(&d_loc, loclen * sizeof(float));
    cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice);

    // Allocate CUDA array in device memory
    // Single-channel 32-bit float format; numlayers is passed as the array height here.
    // NOTE(review): per the answers below, a layered texture needs cudaMalloc3DArray with the
    // cudaArrayLayered flag -- this allocation is the likely cause of the all-zero output.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, len, numlayers);

    // Copy to device memory some data located at address h_data in host memory
    // (all len * numlayers floats in one linear copy starting at array offset (0, 0))
    cudaMemcpyToArray(cuArray, 0, 0, h_data, len * numlayers * sizeof(float), cudaMemcpyHostToDevice);

    // Set texture reference parameters
    texRef.addressMode[0] = cudaAddressModeBorder;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = false;

    // Bind the array to the texture reference
    cudaBindTextureToArray(texRef, cuArray, channelDesc);

    // Allocate result of transformation in device memory
    float* d_output;
    cudaMalloc(&d_output, loclen * sizeof(float));

    // Invoke kernel
    // 1D launch with one thread per query position; the last argument selects layer 0
    int thdsPerBlk = 256;
    int blksPerGrid = (int) (loclen / thdsPerBlk) + 1;
    printf("Threads Per Block: %d, Blocks Per Grid: %d\n", thdsPerBlk, blksPerGrid);
    interp1Kernel <<<blksPerGrid, thdsPerBlk >>>(d_output, d_loc, loclen, 0);

    // Print Results
    printf("\n Original Indices \n");
    for (int i = 0; i < len; i++) printf("    %d ", i);
    printf("\n Original array \n");
    for (int i = 0; i < len; i++) printf("%5.3f ", h_data[i][0]);
    printf("\n Output Indices \n");
    for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
    printf("\n Output Array \n");
    // h_loc is reused as the host buffer for the kernel output (its positions were printed above)
    cudaMemcpy(h_loc, d_output, loclen * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
    printf("\n");

    // Free device memory
    cudaFreeArray(cuArray);
    cudaFree(d_output);

    return 0;
}

Upvotes: 2

Answers (2)

Rehman Ali

Reputation: 51

Unfortunately, the CUDA SDK only shows how to do this for a 2D layered texture. There is some extra trickiness when it comes to 1D layered textures. It turns out you have to pass 0 as the second argument to make_cudaExtent when creating extentDesc, as follows:

cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers);  // <-- 0 height required for 1Dlayered

However, when using make_cudaExtent for mParams.extent for cudaMemcpy3D, you still need to put a 1 for the second argument:

mParams.extent = make_cudaExtent(len, 1, numlayers);  // <<-- non zero height required for memcpy to do anything

Furthermore, there are some other non-obvious details such as the pitch for make_cudaPitchedPtr. So I have included my complete and functioning code for the 1D layered texture. I couldn't find an example of this anywhere. So hopefully this will help out others who are in the same boat:

// To Compile: nvcc layeredTexture1D.cu -o layeredTexture1D.out
// To Run: ./layeredTexture1D.out

// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// Includes CUDA
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

// pi as a double, obtained from acos(-1); the macro expands to the call at every use
#define pi acos(-1)

// 1D layered float texture reference: fetched in the kernel below with tex1DLayered(texRef, x, layer)
texture<float, cudaTextureType1DLayered, cudaReadModeElementType> texRef;

// 1D interpolation kernel: should be very similar to what you get from 1D interpolation in MATLAB,
// applied independently to every layer of texRef.
//
// Launch layout: 2D grid of 2D blocks. The x dimension indexes query locations and the
// y dimension indexes layers; surplus threads in either dimension exit through the bounds check.
//
// Parameters:
//   d_output     - device buffer of numlocations * numlayers floats; the value for
//                  (location, layer) is written to d_output[location + layer * numlocations]
//   d_locations  - device buffer of numlocations query positions (read-only)
//   numlocations - number of query positions
//   numlayers    - number of layers to sample
__global__ void interp1Kernel(float* d_output, const float* d_locations, int numlocations, int numlayers) {
    // Signed indices so the comparisons against the signed counts below are not subject to
    // signed-to-unsigned conversion (a negative count must reject every thread).
    const int location_idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int layer = blockIdx.y * blockDim.y + threadIdx.y;
    if (location_idx < numlocations && layer < numlayers) {
        // Get the location you want to interpolate from the array, offset by 0.5f
        // (presumably so integer positions coincide with texel centres -- confirm against the texture-fetch rules)
        const float loc2find = d_locations[location_idx] + 0.5f;
        // Read from texture and write to global memory
        d_output[location_idx + layer*numlocations] = tex1DLayered(texRef, loc2find, layer);
    }
}

// Abort with file/line context as soon as a CUDA runtime call reports an error,
// so that a failed allocation, copy or bind cannot silently produce wrong output later on.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Host code
// Builds numlayers test signals of len samples each, uploads them into a 1D layered CUDA
// array bound to texRef, samples every layer at upsampled positions with interp1Kernel,
// and prints the original data next to the sampled output.
int main()
{
    // Setup h_data and locations to interpolate from
    const unsigned int len = 7;           // samples per layer
    const unsigned int numlayers = 3;
    const unsigned int upsamp = 4;        // query points per original sample interval
    const unsigned int loclen = 1 + (len - 1) * upsamp;   // number of query positions
    float idx_spacing = 1 / (float)upsamp;
    // h_data is layer-major: layer j occupies h_data[len*j] .. h_data[len*j + len - 1]
    float h_data[numlayers*len], h_loc[loclen];
    for (int i = 0; i < len; i++)
        for (int j = 0; j < numlayers; j++)
            h_data[len*j + i] = 1 + cosf((float)pi*i / (j + 1.0f));
    for (int i = 0; i < loclen; i++)
        h_loc[i] = i*idx_spacing;

    // Get the memory locations you want
    float* d_loc;
    CUDA_CHECK(cudaMalloc(&d_loc, loclen * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_loc, h_loc, loclen*sizeof(float), cudaMemcpyHostToDevice));

    // Allocate CUDA array in device memory
    cudaExtent extentDesc = make_cudaExtent(len, 0, numlayers);  // <-- 0 height required for 1Dlayered
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray* cuArray;
    CUDA_CHECK(cudaMalloc3DArray(&cuArray, &channelDesc, extentDesc, cudaArrayLayered));

    // Copy the host data into the layered array: each layer is one row of len floats,
    // so the source pitch is len*sizeof(float) and the source height is 1.
    cudaMemcpy3DParms mParams = { 0 };
    mParams.srcPtr = make_cudaPitchedPtr(h_data, len*sizeof(float), len, 1);
    mParams.dstArray = cuArray;
    mParams.kind = cudaMemcpyHostToDevice;
    mParams.extent = make_cudaExtent(len, 1, numlayers);  // <<-- non zero height required for memcpy to do anything
    CUDA_CHECK(cudaMemcpy3D(&mParams));

    // Set texture reference parameters
    texRef.addressMode[0] = cudaAddressModeBorder;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = false;

    // Bind the array to the texture reference
    CUDA_CHECK(cudaBindTextureToArray(texRef, cuArray, channelDesc));

    // Allocate result of transformation in device memory
    float *d_output;
    CUDA_CHECK(cudaMalloc(&d_output, loclen * numlayers * sizeof(float)));
    float h_output[loclen * numlayers];

    // Invoke kernel: x dimension covers query positions, y dimension covers layers
    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid((loclen + dimBlock.x - 1) / dimBlock.x,
        (numlayers + dimBlock.y - 1) / dimBlock.y, 1);
    interp1Kernel<<<dimGrid, dimBlock>>>(d_output, d_loc, loclen, numlayers);
    // A launch does not return a status itself; pick up launch-configuration errors here.
    CUDA_CHECK(cudaGetLastError());

    // Print Results
    printf("\n Original Indices \n");
    for (int i = 0; i < len; i++) printf("    %d ", i);
    printf("\n Original array \n");
    for (int j = 0; j < numlayers; j++) {
        for (int i = 0; i < len; i++) {
            printf("%5.3f ", h_data[i + j*len]);
        }
        printf("\n");
    }
    printf("\n Output Indices \n");
    for (int i = 0; i < loclen; i++) printf("%5.3f ", h_loc[i]);
    printf("\n Output Array \n");
    // Blocking copy: waits for the kernel and reports any error raised while it ran
    CUDA_CHECK(cudaMemcpy(h_output, d_output, loclen * numlayers * sizeof(float), cudaMemcpyDeviceToHost));
    for (int j = 0; j < numlayers; j++) {
        for (int i = 0; i < loclen; i++) {
            printf("%5.3f ", h_output[i + j*loclen]);
        }
        printf("\n");
    }
    printf("\n");

    // Free device memory: unbind the texture before releasing the array it refers to,
    // and release the query-position buffer as well.
    CUDA_CHECK(cudaUnbindTexture(texRef));
    CUDA_CHECK(cudaFreeArray(cuArray));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaFree(d_loc));

    return 0;
}

Upvotes: 3

talonmies

Reputation: 72348

You must use cudaMalloc3DArray with the cudaArrayLayered flag set to allocate memory for layered textures. There is a complete example of layered texture usage in the toolkit samples which you can study to see how they work.

Upvotes: 3

How to create and use a 1D layered texture in CUDA

Answers (2)

Related Questions