Reputation: 1526
As an exercise, I am trying to implement a basic device array type in CUDA. As a design goal, it should mimic the std::array interface. While implementing operator+, I am getting an illegal memory access error and I can't decipher why.
Here is the code:
#include <iostream>
#include <array>

enum class memcpy_t {
    host_to_host,
    host_to_device,
    device_to_host,
    device_to_device
};

bool check_cuda_err() {
    cudaError_t err = cudaGetLastError();
    if(err == cudaSuccess) {
        return true;
    }
    else {
        std::cerr << "Cuda Error: " << cudaGetErrorString(err) << "\n" << std::flush;
        return false;
    }
}

template <typename T, std::size_t N>
struct cuda_allocator {
    using pointer = T*;

    static void allocate(T *&dev_mem) {
        cudaMalloc(&dev_mem, N * sizeof(T));
    }

    static void deallocate(T *dev_mem) {
        cudaFree(dev_mem);
    }

    template <memcpy_t ct>
    static void copy(T *dst, T *src) {
        switch(ct) {
        case memcpy_t::host_to_host:
            cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyHostToHost);
            break;
        case memcpy_t::host_to_device:
            cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyHostToDevice);
            break;
        case memcpy_t::device_to_host:
            cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyDeviceToHost);
            break;
        case memcpy_t::device_to_device:
            cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyDeviceToDevice);
            break;
        default:
            break;
        }
    }
};

template <typename T, std::size_t N>
struct gpu_array {
    using allocator = cuda_allocator<T, N>;
    using pointer = typename allocator::pointer;
    using value_type = T;
    using iterator = T*;
    using const_iterator = T const*;

    gpu_array() {
        allocator::allocate(data);
    }

    gpu_array(std::array<T, N> host_arr) {
        allocator::allocate(data);
        allocator::template copy<memcpy_t::host_to_device>(data, host_arr.begin());
    }

    gpu_array& operator=(gpu_array const& o) {
        //allocator::allocate(data);
        allocator::template copy<memcpy_t::device_to_device>(data, o.begin());
    }

    operator std::array<T, N>() {
        std::array<T, N> res;
        allocator::template copy<memcpy_t::device_to_host>(res.begin(), data);
        return res;
    }

    ~gpu_array() {
        allocator::deallocate(data);
    }

    __device__ iterator begin() { return data; }
    __device__ iterator end() { return data + N; }
    __device__ const_iterator begin() const { return data; }
    __device__ const_iterator end() const { return data + N; }

private:
    T* data;
};

template <typename T, std::size_t N>
__global__ void add_kernel(gpu_array<T,N> **r,
                           gpu_array<T,N> const* a1,
                           gpu_array<T,N> const* a2) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    printf("Index: %d\n", i);
    (*r)->begin()[i] = a1->begin()[i] + a2->begin()[i];
}

template <typename T, std::size_t N>
gpu_array<T, N> operator+(gpu_array<T,N> const& a1,
                          gpu_array<T,N> const& a2)
{
    gpu_array<T, N> *res = new gpu_array<T, N>;
    add_kernel<<<(N+3)/4, 4>>>(&res, &a1, &a2);
    cudaDeviceSynchronize();
    check_cuda_err();
    // ignore memory leak for now
    return *res;
}

const int N = 1<<3;

int main() {
    std::array<float, N> x, y;
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }
    gpu_array<float, N> dx{x};
    gpu_array<float, N> dy{y};
    check_cuda_err(); // shows no error for memcpy
    std::array<float, N> res = dx + dy;
    for(const auto& elem : res) {
        std::cout << elem << ", ";
    }
}
I am creating a size-8 array to test things. As you can see, check_cuda_err() shows no error after the gpu_array initialization from the host arrays, so I am guessing the data is copied correctly. But in the kernel, when I index the device arrays, I get an illegal memory access error. Here is the output:
Index: 0
Index: 1
Index: 2
Index: 3
Index: 4
Index: 5
Index: 6
Index: 7
Cuda Error: an illegal memory access was encountered
9.45143e-39, 0, 6.39436e-39, 0, 0, 0, 0, 0,
As you can see, I've printed the computed index for each thread and nothing seems to be out of bounds. So what might be causing this illegal memory access error? By the way, cuda-memcheck says:

Invalid global read of size 8

and later:

Address 0x7fff9f4c6ec0 is out of bounds

but since I've printed the indices, I don't see why any access would be out of bounds.
Upvotes: 1
Views: 4621
Reputation: 72342
We have seen two versions of code in this question, and unfortunately both have different versions of the same problem.
The first used references as arguments to the kernel:
template <typename T, std::size_t N>
__global__ void add_kernel(gpu_array<T,N> &r,
                           gpu_array<T,N> const& a1,
                           gpu_array<T,N> const& a2) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    printf("Index: %d\n", i);
    r.begin()[i] = a1.begin()[i] + a2.begin()[i];
}

template <typename T, std::size_t N>
gpu_array<T, N> operator+(gpu_array<T,N> const& a1,
                          gpu_array<T,N> const& a2)
{
    gpu_array<T, N> res;
    add_kernel<<<(N+3)/4, 4>>>(res, a1, a2);
    cudaDeviceSynchronize();
    check_cuda_err();
    return res;
}
While this is clean and elegant, and references are fully supported in CUDA device code, passing kernel arguments by reference from the host leaves the device holding host addresses: the CUDA toolchain, like every other C++ compiler I am aware of, implements references as pointers under the hood. The result is an illegal address error at runtime when the kernel dereferences them.
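To see the mechanism in isolation: assuming a toolchain that accepts reference kernel parameters at all (as the code above evidently did; newer toolkits may reject this at compile time), a minimal hypothetical reproduction looks like this:

#include <cstdio>

// Hypothetical minimal repro: the reference parameter is lowered to a
// pointer, so the launch passes the *host* address of host_val to the GPU.
__global__ void set_to_42(int &x) {
    x = 42; // dereferences a host address on the device -> illegal address
}

int main() {
    int host_val = 0;
    set_to_42<<<1, 1>>>(host_val); // effectively passes &host_val
    cudaDeviceSynchronize();
    printf("%s\n", cudaGetErrorString(cudaGetLastError())); // reports the fault
}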
The second version uses pointer indirection instead of references, and winds up passing host pointers to the GPU, which fails in almost exactly the same way:
template <typename T, std::size_t N>
__global__ void add_kernel(gpu_array<T,N> **r,
                           gpu_array<T,N> const* a1,
                           gpu_array<T,N> const* a2) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    printf("Index: %d\n", i);
    (*r)->begin()[i] = a1->begin()[i] + a2->begin()[i];
}

template <typename T, std::size_t N>
gpu_array<T, N> operator+(gpu_array<T,N> const& a1,
                          gpu_array<T,N> const& a2)
{
    gpu_array<T, N> *res = new gpu_array<T, N>;
    add_kernel<<<(N+3)/4, 4>>>(&res, &a1, &a2);
    cudaDeviceSynchronize();
    check_cuda_err();
    // ignore memory leak for now
    return *res;
}
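For completeness, one hypothetical way to make the pointer variant work (a sketch, not something from the question) is to first copy the objects themselves into device memory, so the addresses handed to the kernel are device addresses. A shallow byte copy suffices here because the only member, data, already holds a device pointer. Note the kernel then takes plain pointers rather than a double pointer:

template <typename T, std::size_t N>
__global__ void add_kernel(gpu_array<T,N> *r,
                           gpu_array<T,N> const* a1,
                           gpu_array<T,N> const* a2) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    r->begin()[i] = a1->begin()[i] + a2->begin()[i];
}

template <typename T, std::size_t N>
gpu_array<T, N> operator+(gpu_array<T,N> const& a1,
                          gpu_array<T,N> const& a2)
{
    gpu_array<T, N> *res = new gpu_array<T, N>; // leaked, as in the question
    gpu_array<T, N> *d_res, *d_a1, *d_a2;
    cudaMalloc(&d_res, sizeof(*res));
    cudaMalloc(&d_a1,  sizeof(a1));
    cudaMalloc(&d_a2,  sizeof(a2));
    // Shallow-copy the objects; their data members already hold device addresses.
    cudaMemcpy(d_res, res, sizeof(*res), cudaMemcpyHostToDevice);
    cudaMemcpy(d_a1,  &a1, sizeof(a1),  cudaMemcpyHostToDevice);
    cudaMemcpy(d_a2,  &a2, sizeof(a2),  cudaMemcpyHostToDevice);
    add_kernel<<<(N+3)/4, 4>>>(d_res, d_a1, d_a2); // all device addresses now
    cudaDeviceSynchronize();
    check_cuda_err();
    cudaFree(d_res); cudaFree(d_a1); cudaFree(d_a2); // free the shells only
    return *res;
}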
The only safe way to pass this structure directly to device kernels is by value. However, that means the by-value copies will fall out of scope after the launch and trigger destruction, which will deallocate the memory backing the arrays and result in unexpected errors of a different kind.
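An alternative that sidesteps the ownership problem entirely (a standard owner/view split, not something from the original code) is to keep gpu_array as the owner on the host and pass a trivially copyable, non-owning view by value; copying or destroying the view never touches the allocation:

#include <cstddef>

// Non-owning view over device memory: safe to pass to kernels by value,
// since it has no destructor that could free the owner's allocation.
template <typename T, std::size_t N>
struct gpu_span {
    T* data;
    __device__ T* begin() const { return data; }
    __device__ T* end()   const { return data + N; }
};

// gpu_array would expose a hypothetical accessor such as:
//     gpu_span<T, N> view() const { return { data }; }

template <typename T, std::size_t N>
__global__ void add_kernel(gpu_span<T,N> r,
                           gpu_span<T,N> a1,
                           gpu_span<T,N> a2) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    r.begin()[i] = a1.begin()[i] + a2.begin()[i];
}

// Launch with: add_kernel<<<(N+3)/4, 4>>>(res.view(), a1.view(), a2.view());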
Upvotes: 3