Ander Biguri

Reputation: 35525

3D array writing and reading as texture in CUDA

Due to the nature of the algorithm I am programming, I need to write/fill a 3D matrix with some specific maths, and then read from that matrix (in a separate kernel) as a 3D linearly interpolated texture.

As a texture is a read mode, I am assuming I can somehow write to the global memory bound to the texture and, in a separate kernel, read from it, without needing double the memory and copying the values from the write matrix to the read matrix. However, I can't figure out how to do this.

My problem is that I don't know how to define this global read/write array. In the sample below I have created a 3D texture using cudaExtent and cudaArray, but I don't seem to be able to write to those from a kernel, nor can I create them from a float* or the like.

It may be that I can't do this and need a memcpy somewhere in the middle, but as these arrays are generally big, I'd like to save the memory.

Sample code (it doesn't compile, but it clearly shows the structure of what I am trying to do). It uses a 100x100x100 volume by default.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_runtime_api.h>
#include <cuda.h>

#define MAXTREADS 1024

texture<float, cudaTextureType3D, cudaReadModeElementType> tex;

__global__ void readKernel(float* imageend )
{
    int indY = blockIdx.y * blockDim.y + threadIdx.y;
    int indX = blockIdx.x * blockDim.x + threadIdx.x;
    int indZ = blockIdx.z * blockDim.z + threadIdx.z;
    // Make sure we don't go out of bounds
    if (indX >= 100 || indY >= 100 || indZ >= 100)
        return;
    size_t idx = indZ * 100 * 100 + indY * 100 + indX;
    imageend[idx] = tex3D(tex, indX + 0.5, indY + 0.5, indZ + 0.5);

}
__global__ void writeKernel(float* imageaux){
    int indY = blockIdx.y * blockDim.y + threadIdx.y;
    int indX = blockIdx.x * blockDim.x + threadIdx.x;
    int indZ = blockIdx.z * blockDim.z + threadIdx.z;
    // Make sure we don't go out of bounds
    if (indX >= 100 || indY >= 100 || indZ >= 100)
        return;
    size_t idx = indZ * 100 * 100 + indY * 100 + indX;
    imageaux[idx] = (float)idx;

}
int main()
{

    cudaArray *d_image_aux= 0;
    const cudaExtent extent = make_cudaExtent(100, 100, 100);
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaMalloc3DArray(&d_image_aux, &channelDesc, extent);

    // Configure texture options
    tex.normalized = false;
    tex.filterMode = cudaFilterModeLinear;
    tex.addressMode[0] = cudaAddressModeBorder;
    tex.addressMode[1] = cudaAddressModeBorder;
    tex.addressMode[2] = cudaAddressModeBorder;

    cudaBindTextureToArray(tex, d_image_aux, channelDesc);

    float *d_image_end = 0;
    size_t num_bytes = 100 * 100 * 100 * sizeof(float);
    cudaMalloc((void**)&d_image_end, num_bytes);
    cudaMemset(d_image_end, 0, num_bytes);

    int divx, divy, divz; //Irrelevant for the demo, important for the main code
    divx = 32;
    divy = 32;
    divz = 1;
    dim3 grid((100 + divx - 1) / divx,
        (100 + divy - 1) / divy,
        (100 + divz - 1) / divz);
    dim3 block(divx, divy, divz);

    // Kernels
    writeKernel<<<grid, block>>>(d_image_aux);
    readKernel<<<grid, block>>>(d_image_end);


    cudaUnbindTexture(tex);
    cudaFreeArray(d_image_aux);
    cudaFree(d_image_end);

    return 0;
}

NOTE: I am aware that I cannot write "interpolated" values, or whatever that would mean. The write operation will always use integer indices, while the read operation needs to use trilinear interpolation.

Upvotes: 2

Views: 3666

Answers (1)

Robert Crovella

Reputation: 152173

I believe all of the necessary pieces to demonstrate a kernel writing to a 3D surface (bound to an underlying 3D cudaArray), followed by another kernel texturing (i.e. with auto interpolation) from the same data (a 3D texture bound to the same underlying 3D cudaArray) are contained in the volumeFiltering CUDA sample code.

The only conceptual difference is that the sample code has two different underlying 3D cudaArrays (one for the texture, one for the surface), but we can combine these, so that the data written to the surface is subsequently read during the texturing operation.
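
The worked example below uses the texture/surface reference API; the same "one cudaArray, two views" idea can also be written with texture and surface objects. Here is a rough, untested sketch of just that part (the kernel and helper names are made up, and it assumes the cudaArray was allocated with the cudaArraySurfaceLoadStore flag):

#include <cstdio>

// Sketch: write through a surface object, read back through a linearly
// filtered texture object, both created over the SAME 3D cudaArray.
__global__ void surf_write_obj(cudaSurfaceObject_t surf, const float *data, cudaExtent vol)
{
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int y = blockIdx.y*blockDim.y + threadIdx.y;
    int z = blockIdx.z*blockDim.z + threadIdx.z;
    if (x >= vol.width || y >= vol.height || z >= vol.depth) return;
    float v = data[z*vol.width*vol.height + y*vol.width + x];
    surf3Dwrite(v, surf, x * sizeof(float), y, z);   // x offset is in bytes
}

__global__ void tex_read_obj(cudaTextureObject_t tex, float x, float y, float z)
{
    printf("val: %f\n", tex3D<float>(tex, x, y, z)); // hardware trilinear interpolation
}

// Host-side setup: both objects reference the same underlying cudaArray,
// so whatever surf_write_obj stores is what tex_read_obj later samples.
void make_views(cudaArray_t content, cudaTextureObject_t *texObj, cudaSurfaceObject_t *surfObj)
{
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = content;               // shared 3D cudaArray

    cudaTextureDesc texDesc = {};
    texDesc.filterMode       = cudaFilterModeLinear;  // trilinear filtering on reads
    texDesc.readMode         = cudaReadModeElementType;
    texDesc.normalizedCoords = 0;                     // element-index coordinates
    texDesc.addressMode[0]   = cudaAddressModeClamp;
    texDesc.addressMode[1]   = cudaAddressModeClamp;
    texDesc.addressMode[2]   = cudaAddressModeClamp;

    cudaCreateTextureObject(texObj, &resDesc, &texDesc, NULL);
    cudaCreateSurfaceObject(surfObj, &resDesc);
}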

Here's a fully worked example:

$ cat texsurf.cu
#include <stdio.h>
#include <helper_cuda.h>

texture<float, cudaTextureType3D, cudaReadModeElementType>  volumeTexIn;
surface<void,  3>                                    volumeTexOut;

__global__ void
surf_write(float *data,cudaExtent volumeSize)
{
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int y = blockIdx.y*blockDim.y + threadIdx.y;
    int z = blockIdx.z*blockDim.z + threadIdx.z;

    if (x >= volumeSize.width || y >= volumeSize.height || z >= volumeSize.depth)
    {
        return;
    }
    float output = data[z*(volumeSize.width*volumeSize.height)+y*(volumeSize.width)+x];
    // surface writes need byte offsets for x!
    surf3Dwrite(output,volumeTexOut,x * sizeof(float),y,z);

}

__global__ void
tex_read(float x, float y, float z){
    printf("x: %f, y: %f, z:%f, val: %f\n", x,y,z,tex3D(volumeTexIn,x,y,z));
}

void runtest(float *data, cudaExtent vol, float x, float y, float z)
{
    // create 3D array
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaArray_t content;
    checkCudaErrors(cudaMalloc3DArray(&content, &channelDesc, vol, cudaArraySurfaceLoadStore));

    // copy data to device
    float *d_data;
    checkCudaErrors(cudaMalloc(&d_data, vol.width*vol.height*vol.depth*sizeof(float)));
    checkCudaErrors(cudaMemcpy(d_data, data, vol.width*vol.height*vol.depth*sizeof(float), cudaMemcpyHostToDevice));

    dim3 blockSize(8,8,8);
    dim3 gridSize((vol.width+7)/8,(vol.height+7)/8,(vol.depth+7)/8);
    volumeTexIn.filterMode     = cudaFilterModeLinear;
    checkCudaErrors(cudaBindSurfaceToArray(volumeTexOut,content));
    surf_write<<<gridSize, blockSize>>>(d_data, vol);
    // bind array to 3D texture
    checkCudaErrors(cudaBindTextureToArray(volumeTexIn, content));
    tex_read<<<1,1>>>(x, y, z);
    checkCudaErrors(cudaDeviceSynchronize());
    cudaFreeArray(content);
    cudaFree(d_data);
    return;
}

int main(){
   const int dim = 8;
   float *data = (float *)malloc(dim*dim*dim*sizeof(float));
   for (int z = 0; z < dim; z++)
     for (int y = 0; y < dim; y++)
       for (int x = 0; x < dim; x++)
         data[z*dim*dim+y*dim+x] = z*100+y*10+x;
   cudaExtent vol = {dim,dim,dim};
   runtest(data, vol, 1.5, 1.5, 1.5);
   runtest(data, vol, 1.6, 1.6, 1.6);
   return 0;
}


$ nvcc -I/usr/local/cuda/samples/common/inc texsurf.cu -o texsurf
$ cuda-memcheck ./texsurf
========= CUDA-MEMCHECK
x: 1.500000, y: 1.500000, z:1.500000, val: 111.000000
x: 1.600000, y: 1.600000, z:1.600000, val: 122.234375
========= ERROR SUMMARY: 0 errors
$

I'm not going to try to give a full tutorial on linear texture filtering here. There are plenty of other example questions here which cover the details of indexing and filtering, and it doesn't seem to be the crux of this question. I've chosen the points (1.5, 1.5, 1.5) and (1.6, 1.6, 1.6) for easy verification of the underlying data; the results make sense to me.
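
As a sanity check on those numbers, a full-precision trilinear interpolation of the same 8x8x8 volume done on the host (a rough sketch; trilerp is a made-up helper, not part of the build above) reproduces 111.0 at (1.5, 1.5, 1.5) and gives roughly 122.1 at (1.6, 1.6, 1.6); the small difference from the printed 122.234375 is consistent with the 9-bit fixed-point weights (8 fractional bits) that CUDA texture filtering uses.

#include <stdio.h>
#include <math.h>

// Full-precision trilinear interpolation matching the unnormalized tex3D()
// addressing convention: element centers sit at integer index + 0.5.
static float trilerp(const float *data, int dim, float x, float y, float z)
{
    float xb = x - 0.5f, yb = y - 0.5f, zb = z - 0.5f;
    int   i  = (int)floorf(xb), j = (int)floorf(yb), k = (int)floorf(zb);
    float a  = xb - i,          b = yb - j,          c = zb - k;
    float v = 0.0f;
    for (int dk = 0; dk < 2; dk++)
      for (int dj = 0; dj < 2; dj++)
        for (int di = 0; di < 2; di++)
          v += (di ? a : 1.0f - a) * (dj ? b : 1.0f - b) * (dk ? c : 1.0f - c)
               * data[(k + dk)*dim*dim + (j + dj)*dim + (i + di)];
    return v;
}

int main()
{
    const int dim = 8;
    float data[dim * dim * dim];
    for (int z = 0; z < dim; z++)
      for (int y = 0; y < dim; y++)
        for (int x = 0; x < dim; x++)
          data[z*dim*dim + y*dim + x] = z*100 + y*10 + x;
    printf("%f\n", trilerp(data, dim, 1.5f, 1.5f, 1.5f));  // 111.000000
    printf("%f\n", trilerp(data, dim, 1.6f, 1.6f, 1.6f));  // ~122.1
    return 0;
}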

Upvotes: 5
