Reputation: 1526
As an exercise, I am trying to implement a basic device array type in CUDA. As a design goal, it should mimic the std::array interface. While implementing operator+, I am getting an illegal memory access error and I can't decipher why.
Here is the code:
#include <iostream>
#include <array>

enum class memcpy_t {
    host_to_host,
    host_to_device,
    device_to_host,
    device_to_device
};

bool check_cuda_err() {
    cudaError_t err = cudaGetLastError();
    if(err == cudaSuccess) {
        return true;
    }
    else {
        std::cerr << "Cuda Error: " << cudaGetErrorString(err) << "\n" << std::flush;
        return false;
    }
}

template <typename T, std::size_t N>
struct cuda_allocator {
    using pointer = T*;

    static void allocate(T *&dev_mem) {
        cudaMalloc(&dev_mem, N * sizeof(T));
    }

    static void deallocate(T *dev_mem) {
        cudaFree(dev_mem);
    }

    template <memcpy_t ct>
    static void copy(T *dst, T *src) {
        switch(ct) {
        case memcpy_t::host_to_host:
            cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyHostToHost);
            break;
        case memcpy_t::host_to_device:
            cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyHostToDevice);
            break;
        case memcpy_t::device_to_host:
            cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyDeviceToHost);
            break;
        case memcpy_t::device_to_device:
            cudaMemcpy(dst, src, N * sizeof(T), cudaMemcpyDeviceToDevice);
            break;
        default:
            break;
        }
    }
};

template <typename T, std::size_t N>
struct gpu_array {
    using allocator = cuda_allocator<T, N>;
    using pointer = typename allocator::pointer;
    using value_type = T;
    using iterator = T*;
    using const_iterator = T const*;

    gpu_array() {
        allocator::allocate(data);
    }

    gpu_array(std::array<T, N> host_arr) {
        allocator::allocate(data);
        allocator::template copy<memcpy_t::host_to_device>(data, host_arr.begin());
    }

    gpu_array& operator=(gpu_array const& o) {
        //allocator::allocate(data);
        allocator::template copy<memcpy_t::device_to_device>(data, o.begin());
    }

    operator std::array<T, N>() {
        std::array<T, N> res;
        allocator::template copy<memcpy_t::device_to_host>(res.begin(), data);
        return res;
    }

    ~gpu_array() {
        allocator::deallocate(data);
    }

    __device__ iterator begin() { return data; }
    __device__ iterator end() { return data + N; }
    __device__ const_iterator begin() const { return data; }
    __device__ const_iterator end() const { return data + N; }

private:
    T* data;
};

template <typename T, std::size_t N>
__global__ void add_kernel(gpu_array<T,N> **r,
                           gpu_array<T,N> const* a1,
                           gpu_array<T,N> const* a2) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    printf("Index: %d\n", i);
    (*r)->begin()[i] = a1->begin()[i] + a2->begin()[i];
}

template <typename T, std::size_t N>
gpu_array<T, N> operator+(gpu_array<T,N> const& a1,
                          gpu_array<T,N> const& a2)
{
    gpu_array<T, N> *res = new gpu_array<T, N>;
    add_kernel<<<(N+3)/4, 4>>>(&res, &a1, &a2);
    cudaDeviceSynchronize();
    check_cuda_err();
    // ignore memory leak for now
    return *res;
}

const int N = 1<<3;

int main() {
    std::array<float, N> x, y;
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }
    gpu_array<float, N> dx{x};
    gpu_array<float, N> dy{y};
    check_cuda_err(); // shows no error for memcpy
    std::array<float, N> res = dx + dy;
    for(const auto& elem : res) {
        std::cout << elem << ", ";
    }
}
I am creating a size-8 array to test things. As you can see, check_cuda_err() shows no error after the gpu_array initialization from the host arrays, so I am guessing the data is copied correctly. But in the kernel, when I index the device arrays, I get an illegal memory access error. Here is the output:
Index: 0
Index: 1
Index: 2
Index: 3
Index: 4
Index: 5
Index: 6
Index: 7
Cuda Error: an illegal memory access was encountered
9.45143e-39, 0, 6.39436e-39, 0, 0, 0, 0, 0,
As you can see, I've printed the computed index for each thread and nothing seems to be out of bounds. So what might be causing this illegal memory access error? By the way, cuda-memcheck says:

Invalid global read of size 8

and later:

Address 0x7fff9f4c6ec0 is out of bounds

but since I've printed the indices, I don't see why any access would be out of bounds.
Upvotes: 1
Views: 4621
Reputation: 72342
We have seen two versions of code in this question, and unfortunately both have different versions of the same problem.
The first used references as arguments to the kernel:
template <typename T, std::size_t N>
__global__ void add_kernel(gpu_array<T,N> &r,
                           gpu_array<T,N> const& a1,
                           gpu_array<T,N> const& a2) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    printf("Index: %d\n", i);
    r.begin()[i] = a1.begin()[i] + a2.begin()[i];
}

template <typename T, std::size_t N>
gpu_array<T, N> operator+(gpu_array<T,N> const& a1,
                          gpu_array<T,N> const& a2)
{
    gpu_array<T, N> res;
    add_kernel<<<(N+3)/4, 4>>>(res, a1, a2);
    cudaDeviceSynchronize();
    check_cuda_err();
    return res;
}
While this is clean and elegant, and references are fully supported in CUDA device code, passing kernel arguments by reference from the host leaves the device holding host addresses: the CUDA toolchain, like every other C++ compiler I am aware of, implements references as pointers under the hood. The result is an illegal address error at runtime when the kernel dereferences them.
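To see the mechanism in isolation: assuming a toolchain that accepts reference kernel parameters at all (as the code above evidently did; newer toolkits may reject this at compile time), a minimal hypothetical reproduction looks like this:

#include <cstdio>

// Hypothetical minimal repro: the reference parameter is lowered to a
// pointer, so the launch passes the *host* address of host_val to the GPU.
__global__ void set_to_42(int &x) {
    x = 42; // dereferences a host address on the device -> illegal address
}

int main() {
    int host_val = 0;
    set_to_42<<<1, 1>>>(host_val); // effectively passes &host_val
    cudaDeviceSynchronize();
    printf("%s\n", cudaGetErrorString(cudaGetLastError())); // reports the fault
}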
The second version uses pointer indirection instead of references, and winds up passing host pointers to the GPU, which fails in almost exactly the same way:
template <typename T, std::size_t N>
__global__ void add_kernel(gpu_array<T,N> **r,
                           gpu_array<T,N> const* a1,
                           gpu_array<T,N> const* a2) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    printf("Index: %d\n", i);
    (*r)->begin()[i] = a1->begin()[i] + a2->begin()[i];
}

template <typename T, std::size_t N>
gpu_array<T, N> operator+(gpu_array<T,N> const& a1,
                          gpu_array<T,N> const& a2)
{
    gpu_array<T, N> *res = new gpu_array<T, N>;
    add_kernel<<<(N+3)/4, 4>>>(&res, &a1, &a2);
    cudaDeviceSynchronize();
    check_cuda_err();
    // ignore memory leak for now
    return *res;
}
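For completeness, one hypothetical way to make the pointer variant work (a sketch, not something from the question) is to first copy the objects themselves into device memory, so the addresses handed to the kernel are device addresses. A shallow byte copy suffices here because the only member, data, already holds a device pointer. Note the kernel then takes plain pointers rather than a double pointer:

template <typename T, std::size_t N>
__global__ void add_kernel(gpu_array<T,N> *r,
                           gpu_array<T,N> const* a1,
                           gpu_array<T,N> const* a2) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    r->begin()[i] = a1->begin()[i] + a2->begin()[i];
}

template <typename T, std::size_t N>
gpu_array<T, N> operator+(gpu_array<T,N> const& a1,
                          gpu_array<T,N> const& a2)
{
    gpu_array<T, N> *res = new gpu_array<T, N>; // leaked, as in the question
    gpu_array<T, N> *d_res, *d_a1, *d_a2;
    cudaMalloc(&d_res, sizeof(*res));
    cudaMalloc(&d_a1,  sizeof(a1));
    cudaMalloc(&d_a2,  sizeof(a2));
    // Shallow-copy the objects; their data members already hold device addresses.
    cudaMemcpy(d_res, res, sizeof(*res), cudaMemcpyHostToDevice);
    cudaMemcpy(d_a1,  &a1, sizeof(a1),  cudaMemcpyHostToDevice);
    cudaMemcpy(d_a2,  &a2, sizeof(a2),  cudaMemcpyHostToDevice);
    add_kernel<<<(N+3)/4, 4>>>(d_res, d_a1, d_a2); // all device addresses now
    cudaDeviceSynchronize();
    check_cuda_err();
    cudaFree(d_res); cudaFree(d_a1); cudaFree(d_a2); // free the shells only
    return *res;
}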
The only safe way to pass this structure directly to device kernels is by value. However, that means the by-value copies will fall out of scope after the launch and trigger destruction, which will deallocate the memory backing the arrays and result in unexpected errors of a different kind.
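An alternative that sidesteps the ownership problem entirely (a standard owner/view split, not something from the original code) is to keep gpu_array as the owner on the host and pass a trivially copyable, non-owning view by value; copying or destroying the view never touches the allocation:

#include <cstddef>

// Non-owning view over device memory: safe to pass to kernels by value,
// since it has no destructor that could free the owner's allocation.
template <typename T, std::size_t N>
struct gpu_span {
    T* data;
    __device__ T* begin() const { return data; }
    __device__ T* end()   const { return data + N; }
};

// gpu_array would expose a hypothetical accessor such as:
//     gpu_span<T, N> view() const { return { data }; }

template <typename T, std::size_t N>
__global__ void add_kernel(gpu_span<T,N> r,
                           gpu_span<T,N> a1,
                           gpu_span<T,N> a2) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    r.begin()[i] = a1.begin()[i] + a2.begin()[i];
}

// Launch with: add_kernel<<<(N+3)/4, 4>>>(res.view(), a1.view(), a2.view());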
Upvotes: 3