Reputation: 451
I have code that uses cooperative groups to perform some operations. Therefore I compile my code with:
/usr/local/cuda/bin/nvcc -arch=sm_61 -gencode=arch=compute_61,code=sm_61, --device-c -g -O2 foo.cu
Then I try to invoke the device linker:
/usr/local/cuda/bin/nvcc -arch=sm_61 -gencode=arch=compute_61,code=sm_61, -g -dlink foo.o
It then yields the error:
ptxas error : File uses too much global constant data (0x10100 bytes, 0x10000 max)
The problem is caused by the way I allocated constant memory:
__constant__ float d_cnst_centers[CONST_MEM / sizeof(float)];
where CONST_MEM = 65536 bytes, which I got from device query for SM_61. However, if I reduce the constant memory to something like 64536, the problem disappeared. It's almost as if the constant memory is "reserved" for some purposes during compilation. I've searched through CUDA documentation but didn't find a satisfying answer. Is it safe to use the maximum amount of constant memory available to you? Why would this problem happen?
EDIT: this is the code snippet that triggers the error on SM_61:
#include <algorithm>
#include <vector>
#include <type_traits>
#include <cuda_runtime.h>
#include <cfloat>
#include <iostream>
#include <cooperative_groups.h>
using namespace cooperative_groups;
// Kernel parameter bundle, passed by value to getNeatestCenter via
// cudaLaunchCooperativeKernel.
struct foo_params {
float * points;        // device array of points, R floats per point
float * centers;       // device array of centers (unused in this repro)
int * centersDist;     // per-point distance output (unused in this repro)
int * centersIndex;    // per-point index of the nearest center (output)
int numPoints;         // number of points in `points`
};
// Claims the entire 64kb (65536 byte) user constant memory bank reported for
// SM_61 -- this is the allocation that collides with the device runtime's own
// constant data when the module is device-linked (-dlink).
__constant__ float d_cnst_centers[65536 / sizeof(float)];
// Returns the index (0..C-1) of the center nearest to `points` among the C
// centers stored contiguously in `pC` (R floats per center), by Euclidean
// distance. Truncated repro code: the /* ... */ elides the mindist/minidx
// update, so as posted minidx is always 0.
template <int R, int C>
__device__ int
nearestCenter(float * points, float * pC) {
float mindist = FLT_MAX;   // best distance so far (update elided below)
int minidx = 0;            // index of best center so far
int clistidx = 0;          // offset of center i within pC
for(int i=0; i<C;i++) {
clistidx = i*R;
float dist;
{
float *point = points;
float *center = &pC[clistidx];
// NOTE(review): `accum` is read before it is ever written (the += below
// uses its indeterminate initial value) -- it should be initialised to
// 0.f. Irrelevant to the link error under discussion, but a real bug.
float accum;
// NOTE(review): this inner `i` shadows the outer loop variable `i`.
for(int i = 0; i<R; i++) {
float delta = point[i] - center[i];
accum += delta*delta;
}
dist = sqrt(accum);
}
/* ... */
}
return minidx;
}
// Cooperative kernel: each thread computes the nearest center for one point
// and writes its index into params.centersIndex. Must be launched with
// cudaLaunchCooperativeKernel because it calls this_grid().
// (The name is presumably a typo for "getNearestCenter"; kept as-is since the
// mangled names in the quoted build logs depend on it.)
template<int R, int C, bool bRO, bool ROWMAJ=true>
__global__ void getNeatestCenter(struct foo_params params) {
float * points = params.points;
float * centers = params.centers;        // unused here (see compiler warning)
int * centersDist = params.centersDist;  // unused here (see compiler warning)
int * centersIndex = params.centersIndex;
int numPoints = params.numPoints;
// This is the line under investigation: using this_grid() pulls the device
// runtime's cooperative-groups support code -- and its constant memory
// footprint -- into the device link.
grid_group grid = this_grid();
{
int idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < numPoints) {
centersIndex[idx] = nearestCenter<R,C>(&points[idx*R], d_cnst_centers);
}
}
/* ... other code */
}
// Host driver for the repro: fills a dummy parameter struct and launches the
// cooperative kernel. Pointers are intentionally NULL and error checking is
// omitted -- only compilation and device linking are under test here.
int main () {
// foo parameters, for illustration purposes
struct foo_params param;
param.points = NULL;
param.centers = NULL;
param.centersDist = NULL;
param.centersIndex = NULL;
param.numPoints = 1000000;
// kernelParams for cudaLaunchCooperativeKernel is an array of pointers to
// the kernel arguments; &p_params below acts as a one-element array whose
// single entry points at `param`.
// (Fixed: the original listing contained the HTML-mangled token "¶m"
// where "&param" was intended.)
void *p_params = &param;
int minGridSize = 0, blockSize = 0;
// NOTE(review): occupancy is queried for the <128, 64> instantiation while
// the <32, 32> instantiation is launched below -- confirm the mismatch is
// intentional (it does force both templates to be instantiated, which the
// quoted build logs show).
cudaOccupancyMaxPotentialBlockSize(
&minGridSize,
&blockSize,
(void*)getNeatestCenter<128, 64, true>,
0,
0);
dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);
cudaLaunchCooperativeKernel((void *)getNeatestCenter<32, 32, true>, dimGrid, dimBlock, &p_params);
}
The problem seems to be caused by the line:
grid_group grid = this_grid();
which seems to use approximately 0x100 bytes of constant memory for no obvious reason.
Upvotes: 2
Views: 1086
Reputation: 72372
In order to document what exactly is happening in this use case, I have cobbled together the following work through of the compilation process. Hopefully it will shed some light on how this problem arises, and some useful diagnostic tools, and dispel a few misconceptions at the same time.
Note this is a work in progress and may be updated periodically as more information comes to light. Please edit and contribute as you see fit
To start, as noted in comments, it is perfectly possible to allocate every byte of constant memory up until the 64kb limit. This example is pretty much the use case described in the original question:
// Fill the entire 64kb user constant bank and actually consume it in a
// kernel: every thread folds an alternating-sign sine series over the whole
// buffer into its output element.
const int sz = 65536;                  // full 64kb constant bank, in bytes
const int NMax = sz / sizeof(float);   // number of floats that fit
__constant__ float buffer[NMax];
__global__
void akernel(const float* __restrict__ arg1, float* __restrict__ arg2, int N)
{
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= N) return;              // guard the grid tail

    float acc = 0;
#pragma unroll 128
    for (int k = 0; k < NMax; k++) {
        float sign = (k & 1) ? -1.f : 1.f;   // + for even k, - for odd k
        acc = acc + sign * sinf(buffer[k] / 255.f);
    }
    arg2[gid] = acc + arg1[gid];
}
and it compiles without a problem (Godbolt link here). This proves that the linker phase in the question must be pulling in additional constant memory allocations from other code, whether that is user code, other device libraries, or device runtime support.
So let's turn our attention to the repro case posted in the updated question, mildly modified so that it will pass the compilation and link phase by reducing the constant memory footprint slightly, with a buffer of 64536 bytes:
$ nvcc -arch=sm_61 --device-c -g -O2 -Xptxas="-v" -o constmemuse.cu.o constmemuse.cu
constmemuse.cu(51): warning: variable "centers" was declared but never referenced
constmemuse.cu(52): warning: variable "centersDist" was declared but never referenced
constmemuse.cu(31): warning: variable "dist" was set but never used
detected during instantiation of "void getNeatestCenter<R,C,bRO,ROWMAJ>(foo_params) [with R=128, C=64, bRO=true, ROWMAJ=true]"
constmemuse.cu(26): warning: variable "mindist" was declared but never referenced
detected during instantiation of "void getNeatestCenter<R,C,bRO,ROWMAJ>(foo_params) [with R=128, C=64, bRO=true, ROWMAJ=true]"
ptxas info : 0 bytes gmem, 64536 bytes cmem[3]
ptxas info : Function properties for cudaDeviceGetAttribute
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Compiling entry function '_Z16getNeatestCenterILi128ELi64ELb1ELb1EEv10foo_params' for 'sm_61'
ptxas info : Function properties for _Z16getNeatestCenterILi128ELi64ELb1ELb1EEv10foo_params
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 5 registers, 360 bytes cmem[0]
ptxas info : Function properties for cudaMalloc
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessor
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for cudaGetDevice
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Compiling entry function '_Z16getNeatestCenterILi32ELi32ELb1ELb1EEv10foo_params' for 'sm_61'
ptxas info : Function properties for _Z16getNeatestCenterILi32ELi32ELb1ELb1EEv10foo_params
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 5 registers, 360 bytes cmem[0]
ptxas info : Function properties for cudaFuncGetAttributes
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
A few points:
64536 bytes cmem[3]
shows the size of the user controllable constant memory bank, as we specified it.
ptxas info : Used 5 registers, 360 bytes cmem[0]
shows the register usage of the function, and cmem[0] is the internal reserved constant memory bank which is used for holding kernel arguments and anything else which the compiler puts into constant memory. Note that register spilling goes to local memory, not constant memory. So now let's run the device linking step:
$ nvcc -arch=sm_61 -gencode=arch=compute_61,code=sm_61, -g -dlink -Xnvlink="-v" -o constmemuse.o constmemuse.cu.o
nvlink info : 9944 bytes gmem, 64792 bytes cmem[3] (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 10 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 10 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 10 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 10 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 20 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 23 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 28 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 23 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 10 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 10 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 10 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 10 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 12 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 17 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 14 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 4 bytes cmem[2], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info : used 16 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 4 bytes cmem[2], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 6 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 6 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 6 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 6 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 16 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 14 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 17 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 6 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 6 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 6 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 6 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 8 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 11 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 12 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 4 bytes cmem[2], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 11 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 4 bytes cmem[2], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info : used 21 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '_Z16getNeatestCenterILi32ELi32ELb1ELb1EEv10foo_params': (target: sm_61)
nvlink info : used 6 registers, 0 stack, 0 bytes smem, 360 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info : Function properties for '_Z16getNeatestCenterILi128ELi64ELb1ELb1EEv10foo_params': (target: sm_61)
nvlink info : used 6 registers, 0 stack, 0 bytes smem, 360 bytes cmem[0], 0 bytes lmem (target: sm_61)
Some more remarks:
9944 bytes gmem, 64792 bytes cmem[3]
now shows the global and constant memory reservations for the linked module. As you can see, we have inherited 256 additional bytes in constant bank 3, which is the user modifiable bank, plus 9944 bytes of statically reserved global memory. If the array allocation had been 65536 bytes, as in the question, the linkage would fail because it exceeds the 64kb limit. It is clear that the additional constant memory usage comes from linking the device runtime; this can be confirmed with cuobjdump
post hoc. The object from compilation:
$ cuobjdump -res-usage constmemuse.cu.o
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
compressed
Resource usage:
Common:
GLOBAL:0 CONSTANT[3]:64536
Function cudaDeviceGetAttribute:
REG:5 STACK:0 SHARED:0 LOCAL:0 TEXTURE:0 SURFACE:0 SAMPLER:0
Function _Z16getNeatestCenterILi128ELi64ELb1ELb1EEv10foo_params:
REG:5 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:360 TEXTURE:0 SURFACE:0 SAMPLER:0
Function cudaMalloc:
REG:5 STACK:0 SHARED:0 LOCAL:0 TEXTURE:0 SURFACE:0 SAMPLER:0
Function cudaOccupancyMaxActiveBlocksPerMultiprocessor:
REG:5 STACK:0 SHARED:0 LOCAL:0 TEXTURE:0 SURFACE:0 SAMPLER:0
Function cudaGetDevice:
REG:5 STACK:0 SHARED:0 LOCAL:0 TEXTURE:0 SURFACE:0 SAMPLER:0
Function _Z16getNeatestCenterILi32ELi32ELb1ELb1EEv10foo_params:
REG:5 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:360 TEXTURE:0 SURFACE:0 SAMPLER:0
Function cudaFuncGetAttributes:
REG:5 STACK:0 SHARED:0 LOCAL:0 TEXTURE:0 SURFACE:0 SAMPLER:0
Function cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags:
REG:5 STACK:0 SHARED:0 LOCAL:0 TEXTURE:0 SURFACE:0 SAMPLER:0
Fatbin ptx code:
================
arch = sm_61
code version = [6,4]
producer = <unknown>
host = linux
compile_size = 64bit
compressed
ptxasOptions = -v --compile-only
and the object after linking:
$ cuobjdump -res-usage constmemuse.o
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
Resource usage:
Common:
GLOBAL:9944 CONSTANT[3]:64792
Function _Z16getNeatestCenterILi128ELi64ELb1ELb1EEv10foo_params:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:360 TEXTURE:0 SURFACE:0 SAMPLER:0
Function _Z16getNeatestCenterILi32ELi32ELb1ELb1EEv10foo_params:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:360 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:21 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:11 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 CONSTANT[2]:4 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:12 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 CONSTANT[2]:4 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:11 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:8 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:17 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:14 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:16 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:16 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 CONSTANT[2]:4 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:14 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 CONSTANT[2]:4 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:17 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:12 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:23 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:28 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:23 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:20 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function cudaCGGetIntrinsicHandle:
REG:6 STACK:0 SHARED:0 LOCAL:0 TEXTURE:0 SURFACE:0 SAMPLER:0
It has been demonstrated in the accepted answer that the math library can reserve constant memory for coefficients and lookup tables for some trigonometric and transcendental functions. However, in this case, the cause seems to be the support boilerplate emitted by the use of cooperative groups in the kernel. Delving further into the exact source of the additional bank 3 constant memory would require disassembly and reverse engineering of that code, which I am not going to do for now.
Upvotes: 3
Reputation: 26205
This answer is speculative, because minimal but complete repro code was not provided by OP.
GPUs contain multiple constant memory banks used for different parts of program storage. One of those banks is for use by the programmer. Importantly, CUDA standard math library code uses the same bank, because the math library code becomes part of the programmer's code by function inlining. In the past, this was blatantly obvious, as the entire CUDA math library initially was just a couple of header files.
Some math functions need small tables of constant data internally. Particular examples are sin
, cos
, tan
. When these math functions are used, the amount of __constant__
data available to programmers is reduced from 64KB by a small amount. Here are some example programs for demonstration purposes, compiled with the CUDA 8 toolchain and -arch=sm_61
:
#include <stdio.h>
#include <stdlib.h>

// Claim every byte of the 64kb user __constant__ bank.
#define CONST_MEM (65536)
__constant__ float d_cnst_centers[CONST_MEM / sizeof(float)] = {1};

// Scales one constant-memory element by expf(f) and prints it. expf() needs
// no constant-memory tables of its own, so this still fits in the bank.
__global__ void kernel (int i, float f)
{
    const float scale = expf(f);
    float result = d_cnst_centers[i] * scale;
    printf ("r=%15.8f\n", result);
}

int main (void)
{
    kernel<<<1,1>>>(0, 25.0f);    // prints r = 1 * expf(25)
    cudaDeviceSynchronize();      // flush device-side printf before exit
    return EXIT_SUCCESS;
}
This compiles fine and prints r=72004902912.00000000
at run time. Now let's change expf
into sinf
:
#include <stdio.h>
#include <stdlib.h>

// Claim every byte of the 64kb user __constant__ bank.
#define CONST_MEM (65536)
__constant__ float d_cnst_centers[CONST_MEM / sizeof(float)] = {1};

// Identical to the previous example except sinf() replaces expf(). sinf()
// needs small constant-data tables in the same bank, so the program no
// longer fits and ptxas rejects it.
__global__ void kernel (int i, float f)
{
    const float s = sinf(f);
    float result = d_cnst_centers[i] * s;
    printf ("r=%15.8f\n", result);
}

int main (void)
{
    kernel<<<1,1>>>(0, 25.0f);
    cudaDeviceSynchronize();      // flush device-side printf before exit
    return EXIT_SUCCESS;
}
This throws an error during compilation:
ptxas error : File uses too much global constant data (0x10018 bytes, 0x10000 max)
If we use the double-precision function sin
instead, even more constant memory is needed:
#include <stdio.h>
#include <stdlib.h>

// Claim every byte of the 64kb user __constant__ bank.
#define CONST_MEM (65536)
__constant__ float d_cnst_centers[CONST_MEM / sizeof(float)] = {1};

// Same again, but with double-precision sin(). Its constant-data tables are
// larger still, overflowing the bank by 0x110 bytes in total.
__global__ void kernel (int i, float f)
{
    const double s = sin((double)f);
    float result = d_cnst_centers[i] * s;
    printf ("r=%15.8f\n", result);
}

int main (void)
{
    kernel<<<1,1>>>(0, 25.0f);
    cudaDeviceSynchronize();      // flush device-side printf before exit
    return EXIT_SUCCESS;
}
We get the error message:
ptxas error : File uses too much global constant data (0x10110 bytes, 0x10000 max)
Upvotes: 5