J0hn D0e
J0hn D0e

Reputation: 197

Sorting statically allocated array using Thrust

In my code, I have a statically allocated array in global memory (i.e., allocated using __device__), which I want to sort using thrust::sort, which isn't working. All of the examples on this topic are using CUDA runtime allocated arrays (using cudaMalloc). Is there any way I can sort a statically allocated array?

I guess it has something to do with statically allocated memory not being accessible from the host. Using cudaMalloc-allocated arrays, it is working fine. However, I want to avoid using this type of allocation since static allocation allows for easier access to the data from device code (doesn't it?).

Minimal (not-) working example:

#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#define N 4

typedef struct element {
  int key;
  int value;
  __host__ __device__ bool operator<(element e) const
  { return key > e.key; }
} element;

__device__ element array[N];

__global__ void init() {
  for (int i = 0; i < N; ++i) {
    array[N - i - 1].key = i;
  }
}

__global__ void print_array() {
  for (int i = 0; i < N; ++i) {
    printf("%d ", array[i].key);
  }
  printf("\n");
}

int main(void) {
  thrust::device_ptr<element> array_first(array);

  init<<<1,1>>>();

  printf("unsorted: ");
  print_array<<<1, 1>>>();
  cudaDeviceSynchronize();

  thrust::sort(array_first, array_first + N);

  printf("sorted: ");
  print_array<<<1, 1>>>();
  cudaDeviceSynchronize();
}

Upvotes: 1

Views: 627

Answers (2)

talonmies
talonmies

Reputation: 72349

This:

__device__ element array[N];

...

thrust::device_ptr<element> array_first(array);

is illegal. In host code, array is a host address and can't be passed to device code. Do something like this instead:

element* array_d;
cudaGetSymbolAddress((void **)&array_d, array);
thrust::device_ptr<element> array_first(array_d);

i.e. you need to use cudaGetSymbolAddress to read the address from the GPU context at runtime, then you can use the result of that call in GPU code.

Upvotes: 2

Jared Hoberock
Jared Hoberock

Reputation: 11406

Use cudaGetSymbolAddress to take the address of the array variable from a __host__ function:

void* array_ptr = 0;
cudaGetSymbolAddress(&array_ptr, array);
thrust::device_ptr<element> array_first(reinterpret_cast<element*>(array_ptr));

Here's the complete program:

#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>

#define N 4

typedef struct element {
  int key;
  int value;
  __host__ __device__ bool operator<(element e) const
  { return key > e.key; }
} element;

__device__ element array[N];

__global__ void init() {
  for (int i = 0; i < N; ++i) {
    array[N - i - 1].key = i;
  }
}

__global__ void print_array() {
  for (int i = 0; i < N; ++i) {
    printf("%d ", array[i].key);
  }
  printf("\n");
}

int main(void) {
  cudaError_t error;

  void* array_ptr = 0;
  if(error = cudaGetSymbolAddress(&array_ptr, array))
  {
    throw thrust::system_error(error, thrust::cuda_category());
  }

  thrust::device_ptr<element> array_first(reinterpret_cast<element*>(array_ptr));

  init<<<1,1>>>();

  printf("unsorted: ");
  print_array<<<1, 1>>>();

  if(error = cudaDeviceSynchronize())
  {
    throw thrust::system_error(error, thrust::cuda_category());
  }

  thrust::sort(array_first, array_first + N);

  if(error = cudaDeviceSynchronize())
  {
    throw thrust::system_error(error, thrust::cuda_category());
  }

  printf("sorted: ");
  print_array<<<1, 1>>>();

  if(error = cudaDeviceSynchronize())
  {
    throw thrust::system_error(error, thrust::cuda_category());
  }

  return 0;
}

Here's the output on my system:

$ nvcc test.cu -run
unsorted: 3 2 1 0 
sorted: 3 2 1 0 

The sorted output is the same as the unsorted output, but I guess that is intentional given the way the data is generated and the definition of element::operator<.

Upvotes: 2

Related Questions