Reputation: 1060
I am attempting to allocate device memory and store the pointer as a global variable. However, when I attempt to access the memory from a kernel, I receive this error from cudaDeviceSynchronize(): cudaErrorIllegalAddress. I've checked the cudaStatus codes returned from cudaMalloc and cudaMemcpy and they are both successful.
I hope the following example is straightforward enough to demonstrate what I want to do. Basically, I have a large array of sample data that I want all the kernels to be able to read from, but I don't want to have to pass the pointer into the kernel call every time.
I'm using Windows 8 x64, compiling code with Visual Studio 2012 and nvcc (via VS integration). Target is x64 Debug executable. My device is a GTX 780.
#include "cuda_runtime.h"
#include <stdio.h>
#define SIZE (1024 * 1024 * 10)
__device__ int* cData;
void Init()
{
int* data = new int[SIZE];
cudaError_t cudaStatus;
cudaStatus = cudaMalloc(&cData, SIZE * sizeof(int));
for (int i = 0; i < SIZE; i++)
data[i] = i;
cudaStatus = cudaMemcpy(cData, data, SIZE * sizeof(int), cudaMemcpyHostToDevice);
delete data;
}
__global__ void kernel(int i, int* output)
{
*output = cData[i];
}
int main()
{
cudaError_t cudaStatus = cudaSetDevice(0);
cudaDeviceProp properties;
int* result;
cudaStatus = cudaMallocManaged(&result, sizeof(int));
Init();
kernel<<<1, 1>>>(1000, result); // invoke a single thread, expecting the value of *result to be 1000 afterwards
cudaStatus = cudaGetLastError();
cudaStatus = cudaDeviceSynchronize(); // returns cudaErrorIllegalAddress
printf("Value is: %d", *result); // crashes the program, "In page error reading location 0x0000000D00000000"
cudaFree(result);
cudaStatus = cudaDeviceReset();
return 0;
}
Upvotes: 5
Views: 3497
Reputation: 151869
We don't use cudaMalloc and cudaMemcpy on __device__ variables.

Read the documentation for __device__ variables, where it states the API calls to be used:
cudaMemcpyToSymbol();
cudaMemcpyFromSymbol();
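As a quick illustration of how those calls are used (the dVal variable and Example function below are just made up for this sketch), copying a value into and back out of a __device__ variable from host code looks like this:

__device__ int dVal;

void Example()
{
    int h = 5;
    // copy the host value into the __device__ variable
    cudaMemcpyToSymbol(dVal, &h, sizeof(int));
    // ... launch kernels that read or write dVal ...
    // copy the (possibly updated) value back to the host
    cudaMemcpyFromSymbol(&h, dVal, sizeof(int));
}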
If you want to use cudaMalloc on a dynamically allocated device array, but store the returned pointer in a __device__ variable, you'll have to do something like this:
void Init()
{
    int* data = new int[SIZE];
    int* d_data;
    cudaError_t cudaStatus;

    cudaStatus = cudaMalloc(&d_data, SIZE * sizeof(int));

    for (int i = 0; i < SIZE; i++)
        data[i] = i;

    cudaStatus = cudaMemcpy(d_data, data, SIZE * sizeof(int), cudaMemcpyHostToDevice);

    // copy the pointer value (not the data) into the __device__ variable cData
    cudaMemcpyToSymbol(cData, &d_data, sizeof(int *));

    delete[] data;
}
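Note that the host copy of the pointer (d_data) goes out of scope when Init returns. If you need it again on the host later, for example to cudaFree the allocation, you can read it back out of the __device__ variable with cudaMemcpyFromSymbol. A rough sketch (the Cleanup name here is just for illustration):

void Cleanup()
{
    int* d_data = NULL;
    // retrieve the pointer value stored in the __device__ variable cData
    cudaMemcpyFromSymbol(&d_data, cData, sizeof(int *));
    cudaFree(d_data);
}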
When I compile your code as-is, I get the following compiler warning from CUDA 6 nvcc:
t411.cu(15): warning: a __device__ variable "cData" cannot be directly read in a host function
Those warnings should not be ignored.
If SIZE is known at compile time, as it is in your example, you can also do something like this:
__device__ int cData[SIZE];

void Init()
{
    int* data = new int[SIZE];
    cudaError_t cudaStatus;

    for (int i = 0; i < SIZE; i++)
        data[i] = i;

    // copy the host data directly into the statically sized __device__ array
    cudaStatus = cudaMemcpyToSymbol(cData, data, SIZE * sizeof(int));

    delete[] data;
}
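With either version of Init, the kernel and the rest of main from your question need no changes, and the illegal address error should go away. As a condensed sanity check of the host side (assuming the includes, SIZE, cData, kernel, and one of the Init versions above):

int main()
{
    int* result;
    cudaSetDevice(0);
    cudaMallocManaged(&result, sizeof(int));

    Init();
    kernel<<<1, 1>>>(1000, result);

    cudaError_t cudaStatus = cudaDeviceSynchronize(); // should now return cudaSuccess
    printf("Value is: %d\n", *result);                // prints "Value is: 1000"

    cudaFree(result);
    cudaDeviceReset();
    return 0;
}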
Upvotes: 6