Fabian

Reputation: 23

Why does my data not fit into a CUDA Texture Object?

I'm trying to fill a CUDA Texture Object with some data, but the call to cudaCreateTextureObject fails with the following error (edit: on both a GTX 1080TI and an RTX 2080TI):

GPU ERROR! 'invalid argument' (err code 11)

It works if I put less data into my texture, so my guess is that my computation of how much data I can fit into a texture is off.

My thought process is as follows: (executable code follows below)

My data comes in the form of (76,76) images where each pixel is a float. What I would like to do is to store a column of images in a Texture Object; as I understand it, cudaMallocPitch is the way to do this.

When computing the number of images I can store in one texture, I'm using the following formula to determine how much space a single image needs:

GTX_1080TI_MEM_PITCH * img_dim_y * sizeof(float)

Where the first factor is the memory pitch on a GTX 1080TI card (512 bytes). The number of bytes that I can store in a 1D texture is given as 2^27 here. When I divide the latter by the former I get 862.3, which I take to be the number of images I can store in one Texture Object. However, when I try to store more than 855 images in my buffer, the program crashes with the error above.
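Spelled out, that computation is:

GTX_1080TI_1DTEX_WIDTH / (GTX_1080TI_MEM_PITCH * img_dim_y * sizeof(float))
    = 2^27 / (512 * 76 * 4)
    = 134217728 / 155648
    ≈ 862.3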

Here's the code:

In the following the main function (a) sets up all the relevant parameters, (b) allocates the memory using cudaMallocPitch, and (c) configures and creates a CUDA Texture Object:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cassert>

#define GTX_1080TI_MEM_PITCH   512
#define GTX_1080TI_1DTEX_WIDTH 134217728 // 2^27

//=====================================================================[ util ]

// CUDA error checking for library functions
#define CUDA_ERR_CHK(func){ cuda_assert( (func), __FILE__, __LINE__ ); }
inline void cuda_assert( const cudaError_t cu_err, const char* file, int line ){
    if( cu_err != cudaSuccess ){
        fprintf( stderr, "\nGPU ERROR! \'%s\' (err code %d) in file %s, line %d.\n\n", cudaGetErrorString(cu_err), cu_err, file, line );
        exit( EXIT_FAILURE );
    }
}

// CUDA generic error checking (used after kernel calls)
#define GPU_ERR_CHK(){ gpu_assert(__FILE__, __LINE__); }
inline void gpu_assert( const char* file, const int line ){
    cudaError cu_err = cudaGetLastError();
    if( cu_err != cudaSuccess ){
        fprintf( stderr, "\nGPU KERNEL ERROR! \'%s\' (err code %d) in file %s, line %d.\n\n", cudaGetErrorString(cu_err), cu_err, file, line );
        exit(EXIT_FAILURE);
    }
}

//=====================================================================[ main ]

int main(){

    // setup
    unsigned int img_dim_x = 76;
    unsigned int img_dim_y = 76;
    unsigned int img_num   = 856;  // <-- NOTE: set this to 855 and it should work - but we should be able to put 862 here?

    unsigned int pitched_img_size = GTX_1080TI_MEM_PITCH * img_dim_y * sizeof(float);
    unsigned int img_num_per_tex  = GTX_1080TI_1DTEX_WIDTH / pitched_img_size;

    fprintf( stderr, "We should be able to stuff %d images into one texture.\n", img_num_per_tex );
    fprintf( stderr, "We use %d (more than 855 leads to a crash).\n", img_num );

    // allocate pitched memory
    size_t img_tex_pitch;
    float* d_img_tex_data;

    CUDA_ERR_CHK( cudaMallocPitch( &d_img_tex_data, &img_tex_pitch, img_dim_x*sizeof(float), img_dim_y*img_num ) );

    assert( img_tex_pitch == GTX_1080TI_MEM_PITCH );
    fprintf( stderr, "Asking for %zd bytes allocates %zd bytes using pitch %zd. Available: %zd/%d\n", 
        img_num*img_dim_x*img_dim_y*sizeof(float), 
        img_num*img_tex_pitch*img_dim_y*sizeof(float), 
        img_tex_pitch,
        GTX_1080TI_1DTEX_WIDTH - img_num*img_tex_pitch*img_dim_y*sizeof(float),
        GTX_1080TI_1DTEX_WIDTH );

    // generic resource descriptor
    cudaResourceDesc res_desc;
    memset(&res_desc, 0, sizeof(res_desc));
    res_desc.resType = cudaResourceTypePitch2D;
    res_desc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
    res_desc.res.pitch2D.devPtr = d_img_tex_data;
    res_desc.res.pitch2D.width  = img_dim_x;
    res_desc.res.pitch2D.height = img_dim_y*img_num;
    res_desc.res.pitch2D.pitchInBytes = img_tex_pitch;

    // texture descriptor
    cudaTextureDesc tex_desc;
    memset(&tex_desc, 0, sizeof(tex_desc));
    tex_desc.addressMode[0] = cudaAddressModeClamp;
    tex_desc.addressMode[1] = cudaAddressModeClamp;
    tex_desc.filterMode     = cudaFilterModeLinear;  // for linear interpolation (NOTE: this breaks normal integer indexing!)
    tex_desc.readMode       = cudaReadModeElementType;
    tex_desc.normalizedCoords = false;  // we want to index using [0;img_dim] rather than [0;1]              

    // make sure there are no lingering errors
    GPU_ERR_CHK();
    fprintf(stderr, "No CUDA error until now..\n");

    // create texture object
    cudaTextureObject_t img_tex_obj;
    CUDA_ERR_CHK( cudaCreateTextureObject(&img_tex_obj, &res_desc, &tex_desc, NULL) );

    fprintf(stderr, "bluppi\n");
}
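
For completeness, here is a sketch of how I intend to read from the texture in a kernel later (illustrative only; the kernel and output buffer below are not part of the failing code). Because of cudaFilterModeLinear, coordinates must be offset by 0.5f to sample texel centers:

// Sketch: read image img_idx back out of the texture object.
// With cudaFilterModeLinear and unnormalized coordinates, sampling at
// (x + 0.5f, y + 0.5f) hits the texel center, i.e. integer-style indexing.
__global__ void read_img( cudaTextureObject_t tex, float* out,
                          unsigned int img_dim_x, unsigned int img_dim_y,
                          unsigned int img_idx )
{
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
    if( x >= img_dim_x || y >= img_dim_y ) return;

    // image img_idx starts img_idx*img_dim_y rows into the texture
    out[ y*img_dim_x + x ] = tex2D<float>( tex, x + 0.5f, img_idx*img_dim_y + y + 0.5f );
}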

This should crash when cudaCreateTextureObject is called. If the img_num parameter (at the start of main) is changed from 856 to 855, however, the code should execute successfully. (edit: The expected behavior would be that the code succeeds with a value of 862 but fails with 863, since only the latter requires more bytes than the documented buffer size offers.)

Any help would be appreciated!

Upvotes: 1

Views: 1704

Answers (1)

Robert Crovella

Reputation: 152113

Since you're working with a 2D texture here, the number of bytes you can store in a 1D texture (the "width") is of no relevance.

2D textures may have different characteristics depending on the type of memory that provides the backing for the texture. Two examples are linear memory and a CUDA array. You have chosen a linear memory backing (the kind provided by cudaMalloc* operations other than cudaMallocArray).
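
As an aside, here is a minimal sketch of what the CUDA array backing could look like instead (illustrative only: h_img_data is a hypothetical host buffer, and error checking is omitted). Array-backed 2D textures are governed by the generally larger maxTexture2D limits rather than maxTexture2DLinear:

// Sketch: back the texture with a CUDA array instead of pitched linear memory.
cudaChannelFormatDesc ch_desc = cudaCreateChannelDesc<float>();
cudaArray_t d_img_array;
cudaMallocArray( &d_img_array, &ch_desc, img_dim_x, img_dim_y*img_num );

// copy the (tightly packed) host data into the array
cudaMemcpy2DToArray( d_img_array, 0, 0, h_img_data,
                     img_dim_x*sizeof(float),   // source pitch in bytes
                     img_dim_x*sizeof(float),   // width of copy in bytes
                     img_dim_y*img_num,         // height in rows
                     cudaMemcpyHostToDevice );

// the resource descriptor then points at the array
cudaResourceDesc res_desc;
memset( &res_desc, 0, sizeof(res_desc) );
res_desc.resType = cudaResourceTypeArray;
res_desc.res.array.array = d_img_array;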

The primary problem you are running into is the maximum texture height. To discover what this is, we can refer to Table 14 in the programming guide, which lists:

Maximum width and height for a 2D texture reference bound to linear memory 65000 x 65000

You exceed this 65000 limit when going from 855 to 856 images at an image height of 76 rows: 856*76 = 65056 > 65000, whereas 855*76 = 64980.

"But wait" you say, that table 14 entry says texture reference, and I am using a texture object.

You are correct, and Table 14 doesn't explicitly list the corresponding limit for texture objects. In that case, we have to refer to the device properties readable from the device at runtime, using cudaGetDeviceProperties(). If we review the data available there, we see this item:

maxTexture2DLinear[3] contains the maximum 2D texture dimensions for 2D textures bound to pitch linear memory.

(I suspect the 3 is a typo, but no matter, we only need the first 2 values).

This is the limit we must not exceed. If we modify your code to obey that limit, there are no problems:

$ cat t382.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cassert>

#define GTX_1080TI_MEM_PITCH   512
#define GTX_1080TI_1DTEX_WIDTH 134217728 // 2^27

//=====================================================================[ util ]

// CUDA error checking for library functions
#define CUDA_ERR_CHK(func){ cuda_assert( (func), __FILE__, __LINE__ ); }
inline void cuda_assert( const cudaError_t cu_err, const char* file, int line ){
    if( cu_err != cudaSuccess ){
        fprintf( stderr, "\nGPU ERROR! \'%s\' (err code %d) in file %s, line %d.\n\n", cudaGetErrorString(cu_err), cu_err, file, line );
        exit( EXIT_FAILURE );
    }
}

// CUDA generic error checking (used after kernel calls)
#define GPU_ERR_CHK(){ gpu_assert(__FILE__, __LINE__); }
inline void gpu_assert( const char* file, const int line ){
    cudaError cu_err = cudaGetLastError();
    if( cu_err != cudaSuccess ){
        fprintf( stderr, "\nGPU KERNEL ERROR! \'%s\' (err code %d) in file %s, line %d.\n\n", cudaGetErrorString(cu_err), cu_err, file, line );
        exit(EXIT_FAILURE);
    }
}

//=====================================================================[ main ]

int main(){

    cudaDeviceProp prop;
    CUDA_ERR_CHK( cudaGetDeviceProperties(&prop, 0) );
    size_t max2Dtexturelinearwidth = prop.maxTexture2DLinear[0];  // texture x dimension
    size_t max2Dtexturelinearheight = prop.maxTexture2DLinear[1]; // texture y dimension
    fprintf( stderr, "maximum 2D linear texture dimensions (width,height): %lu,%lu\n", max2Dtexturelinearwidth, max2Dtexturelinearheight);



    // setup
    unsigned int img_dim_x = 76;
    unsigned int img_dim_y = 76;
    //unsigned int img_num   = 856;  // <-- NOTE: set this to 855 and it should work - but we should be able to put 862 here?
    unsigned int img_num = max2Dtexturelinearheight/img_dim_y;
    fprintf( stderr, "maximum number of images per texture: %u\n", img_num);

    unsigned int pitched_img_size = GTX_1080TI_MEM_PITCH * img_dim_y * sizeof(float);
    unsigned int img_num_per_tex  = GTX_1080TI_1DTEX_WIDTH / pitched_img_size;

    fprintf( stderr, "We should be able to stuff %d images into one texture.\n", img_num_per_tex );
    fprintf( stderr, "We use %d (more than 855 leads to a crash).\n", img_num );

    // allocate pitched memory
    size_t img_tex_pitch;
    float* d_img_tex_data;

    CUDA_ERR_CHK( cudaMallocPitch( &d_img_tex_data, &img_tex_pitch, img_dim_x*sizeof(float), img_dim_y*img_num ) );

    assert( img_tex_pitch == GTX_1080TI_MEM_PITCH );
    fprintf( stderr, "Asking for %zd bytes allocates %zd bytes using pitch %zd. Available: %zd/%d\n",
        img_num*img_dim_x*img_dim_y*sizeof(float),
        img_num*img_tex_pitch*img_dim_y*sizeof(float),
        img_tex_pitch,
        GTX_1080TI_1DTEX_WIDTH - img_num*img_tex_pitch*img_dim_y*sizeof(float),
        GTX_1080TI_1DTEX_WIDTH );

    // generic resource descriptor
    cudaResourceDesc res_desc;
    memset(&res_desc, 0, sizeof(res_desc));
    res_desc.resType = cudaResourceTypePitch2D;
    res_desc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
    res_desc.res.pitch2D.devPtr = d_img_tex_data;
    res_desc.res.pitch2D.width  = img_dim_x;
    res_desc.res.pitch2D.height = img_dim_y*img_num;
    res_desc.res.pitch2D.pitchInBytes = img_tex_pitch;

    // texture descriptor
    cudaTextureDesc tex_desc;
    memset(&tex_desc, 0, sizeof(tex_desc));
    tex_desc.addressMode[0] = cudaAddressModeClamp;
    tex_desc.addressMode[1] = cudaAddressModeClamp;
    tex_desc.filterMode     = cudaFilterModeLinear;  // for linear interpolation (NOTE: this breaks normal integer indexing!)
    tex_desc.readMode       = cudaReadModeElementType;
    tex_desc.normalizedCoords = false;  // we want to index using [0;img_dim] rather than [0;1]

    // make sure there are no lingering errors
    GPU_ERR_CHK();
    fprintf(stderr, "No CUDA error until now..\n");

    // create texture object
    cudaTextureObject_t img_tex_obj;
    CUDA_ERR_CHK( cudaCreateTextureObject(&img_tex_obj, &res_desc, &tex_desc, NULL) );

    fprintf(stderr, "bluppi\n");
}
$ nvcc -o t382 t382.cu
$ cuda-memcheck ./t382
========= CUDA-MEMCHECK
maximum 2D linear texture dimensions (width,height): 131072,65000
maximum number of images per texture: 855
We should be able to stuff 862 images into one texture.
We use 855 (more than 855 leads to a crash).
Asking for 19753920 bytes allocates 133079040 bytes using pitch 512. Available: 1138688/134217728
No CUDA error until now..
bluppi
========= ERROR SUMMARY: 0 errors
$

Upvotes: 3
