Reputation: 197
In my code, I have a statically allocated array in global memory (i.e., allocated using __device__
), which I want to sort using thrust::sort
, which isn't working. All of the examples on this topic are using CUDA runtime allocated arrays (using cudaMalloc
). Is there any way I can sort a statically allocated array?
I guess it has something to do with statically allocated memory not being accessible from the host. Using cudaMalloc
-allocated arrays, it is working fine. However, I want to avoid using this type of allocation since static allocation allows for easier access to the data from device code (doesn't it?).
#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#define N 4
typedef struct element {
int key;
int value;
__host__ __device__ bool operator<(element e) const
{ return key > e.key; }
} element;
__device__ element array[N];
__global__ void init() {
for (int i = 0; i < N; ++i) {
array[N - i - 1].key = i;
}
}
__global__ void print_array() {
for (int i = 0; i < N; ++i) {
printf("%d ", array[i].key);
}
printf("\n");
}
int main(void) {
thrust::device_ptr<element> array_first(array);
init<<<1,1>>>();
printf("unsorted: ");
print_array<<<1, 1>>>();
cudaDeviceSynchronize();
thrust::sort(array_first, array_first + N);
printf("sorted: ");
print_array<<<1, 1>>>();
cudaDeviceSynchronize();
}
Upvotes: 1
Views: 627
Reputation: 72349
This:
__device__ element array[N];
...
thrust::device_ptr<element> array_first(array);
is illegal. In host code, array
is a host address and can't be passed to device code. Do something like this instead:
element* array_d;
cudaGetSymbolAddress((void **)&array_d, array);
thrust::device_ptr<element> array_first(array_d);
i.e. you need to use cudaGetSymbolAddress to read the address from the GPU context at runtime, then you can use the result of that call in GPU code.
Upvotes: 2
Reputation: 11406
Use cudaGetSymbolAddress
to take the address of the array
variable from a __host__
function:
void* array_ptr = 0;
cudaGetSymbolAddress(&array_ptr, array);
thrust::device_ptr<element> array_first(reinterpret_cast<element*>(array_ptr));
Here's the complete program:
#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#define N 4
typedef struct element {
int key;
int value;
__host__ __device__ bool operator<(element e) const
{ return key > e.key; }
} element;
__device__ element array[N];
__global__ void init() {
for (int i = 0; i < N; ++i) {
array[N - i - 1].key = i;
}
}
__global__ void print_array() {
for (int i = 0; i < N; ++i) {
printf("%d ", array[i].key);
}
printf("\n");
}
int main(void) {
cudaError_t error;
void* array_ptr = 0;
if(error = cudaGetSymbolAddress(&array_ptr, array))
{
throw thrust::system_error(error, thrust::cuda_category());
}
thrust::device_ptr<element> array_first(reinterpret_cast<element*>(array_ptr));
init<<<1,1>>>();
printf("unsorted: ");
print_array<<<1, 1>>>();
if(error = cudaDeviceSynchronize())
{
throw thrust::system_error(error, thrust::cuda_category());
}
thrust::sort(array_first, array_first + N);
if(error = cudaDeviceSynchronize())
{
throw thrust::system_error(error, thrust::cuda_category());
}
printf("sorted: ");
print_array<<<1, 1>>>();
if(error = cudaDeviceSynchronize())
{
throw thrust::system_error(error, thrust::cuda_category());
}
return 0;
}
Here's the output on my system:
$ nvcc test.cu -run
unsorted: 3 2 1 0
sorted: 3 2 1 0
The sorted output is the same as the unsorted output, but I guess that is intentional given the way the data is generated and the definition of element::operator<
.
Upvotes: 2