Reputation: 11
I'm looking to sort a large 3D array along the z-axis.
Example array is X x Y x Z (1000x1000x5)
I'd like to sort along the z-axis, so I'd perform 1000x1000 independent sorts of 5 elements each along the z-axis.
Edit Update: Tried an attempt to use thrust below. It's functional and I'd store the output back, but this is very slow since I'm sorting 5 elements at a time per (x,y) location:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>
// Demo: build a 1000x1000x5 cube on the host and sort each (x, y) column of
// 5 values along z on the GPU with Thrust — one tiny device sort per column.
// NOTE(review): one cudaMemcpy + thrust::sort per (x, y) cell is extremely
// slow; this is a correctness demo only, not a performant approach.
int main() {
    const int x = 1000, y = 1000, z = 5;

    // Allocate the cube as pointer-to-pointer-to-row (host memory).
    float*** unsorted_cube = new float**[x];
    for (int i = 0; i < x; i++) {
        // Allocate memory blocks for rows of each 2D array.
        unsorted_cube[i] = new float*[y];
        for (int j = 0; j < y; j++) {
            // Allocate memory blocks for columns of each 2D array.
            unsorted_cube[i][j] = new float[z];
        }
    }

    // Fill every z-column with the same unsorted test values.
    for (int i = 0; i < x; i++) {
        for (int j = 0; j < y; j++) {
            unsorted_cube[i][j][0] = 4.0f;
            unsorted_cube[i][j][1] = 3.0f;
            unsorted_cube[i][j][2] = 1.0f;
            unsorted_cube[i][j][3] = 5.0f;
            unsorted_cube[i][j][4] = 2.0f;
        }
    }

    for (int k = 0; k < z; k++) {
        printf("unsorted_cube first 5 elements to sort at (0,0): %f\n",
               unsorted_cube[0][0][k]);
    }

    // Bug fixes vs. the original: temp_input/temp_output were uninitialized
    // pointers (writing through them was undefined behavior) and N_Size was
    // never defined. Use stack arrays and allocate exactly z device floats.
    float temp_input[5];
    float temp_output[5];
    float* raw_ptr = nullptr;
    cudaMalloc((void**)&raw_ptr, z * sizeof(float));

    for (int i = 0; i < x; i++) {
        for (int j = 0; j < y; j++) {
            // Stage one z-column, copy to the device, sort, copy back.
            for (int k = 0; k < z; k++) {
                temp_input[k] = unsorted_cube[i][j][k];
            }
            cudaMemcpy(raw_ptr, temp_input, z * sizeof(float),
                       cudaMemcpyHostToDevice);
            thrust::device_ptr<float> dev_ptr =
                thrust::device_pointer_cast(raw_ptr);
            thrust::sort(dev_ptr, dev_ptr + z);
            thrust::copy(dev_ptr, dev_ptr + z, temp_output);

            // Store the sorted column back into the cube.
            for (int k = 0; k < z; k++) {
                unsorted_cube[i][j][k] = temp_output[k];
            }

            if (i == 0 && j == 0) {
                for (int k = 0; k < z; k++) {
                    printf("sorted_cube[0,0,%d] : %f\n", k, temp_output[k]);
                }
            }
        }
    }

    // Release device and host memory (the original leaked everything).
    cudaFree(raw_ptr);
    for (int i = 0; i < x; i++) {
        for (int j = 0; j < y; j++) {
            delete[] unsorted_cube[i][j];
        }
        delete[] unsorted_cube[i];
    }
    delete[] unsorted_cube;
    return 0;
}
Upvotes: 1
Views: 368
Reputation: 3095
Assuming that the data is in a format where the values in each xy-plane are consecutive in memory: data[((z * y_length) + y) * x_length + x]
(which is best for coalescing memory accesses on the GPU, as well)
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/zip_function.h>
// Sorts each z-column of length 5 independently, in place, assuming the
// plane-major layout data[((z * y_length) + y) * x_length + x] (all values of
// one xy-plane are consecutive in memory, which also coalesces GPU accesses).
// data: device buffer of x_length * y_length * 5 floats, sorted in place.
void sort_in_z_dir(thrust::device_vector<float> &data, int x_length,
                   int y_length) { // z_length == 5
  auto z_stride = x_length * y_length;
  thrust::for_each_n(
      // Zip the five plane-offset iterators so each position idx sees the
      // five elements of one z-column.
      thrust::make_zip_iterator(
          data.begin() + 0 * z_stride,
          data.begin() + 1 * z_stride,
          data.begin() + 2 * z_stride,
          data.begin() + 3 * z_stride,
          data.begin() + 4 * z_stride),
      z_stride,
      // Bug fix: the elements must be taken by reference — with by-value
      // parameters the write-back below mutates copies and the whole
      // function is a silent no-op.
      thrust::make_zip_function([] __host__ __device__(float &a, float &b,
                                                       float &c, float &d,
                                                       float &e) {
        // Sort the five values in a small thread-local buffer, then write
        // them back through the references into the zipped storage.
        float local_data[] = {a, b, c, d, e};
        thrust::sort(
            thrust::seq,
            local_data,
            local_data + sizeof(local_data) / sizeof(local_data[0]));
        a = local_data[0];
        b = local_data[1];
        c = local_data[2];
        d = local_data[3];
        e = local_data[4];
      }));
}
This solution is certainly very ugly in terms of hard-coding z_length
. One can use some C++ template-"magic" to make z_length
into a template parameter, but this seemed to be overkill for this answer about Thrust.
See Convert std::tuple to std::array C++11 and How to convert std::array to std::tuple? for examples on interfacing between arrays and tuples (I avoided the explicit tuple here by using zip_function
. Without that the functor/lambda would be passed a tuple of five floats instead of five floats as different arguments).
The advantage of this solution is that up to the sorting algorithm itself it should be pretty much optimal performance-wise. I don't know if thrust::sort
is optimized for such small input arrays, but you can replace it by any self written sorting algorithm as I proposed in the comments.
If you want to be able to use different z_length
without all this hassle, you might prefer this solution, which sorts in global memory, which is far from optimal, and feels a bit hacky because it uses Thrust pretty much only to launch a kernel. Here you want to have the data ordered the other way around: data[((x * y_length) + y) * z_length + z]
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sort.h>
// Sorts each z-column in place, assuming the column-contiguous layout
// data[((x * y_length) + y) * z_length + z] (each segment of z_length values
// is consecutive in memory). The sort runs directly in global memory, so
// z_length is fully runtime-flexible but performance is far from optimal.
void sort_in_z_dir_alternative(thrust::device_vector<float> &data, int x_length,
                               int y_length, int z_length) {
  const int num_columns = x_length * y_length;
  float *base = thrust::raw_pointer_cast(data.data());
  // One iteration per (x, y) column; each sorts its own contiguous segment
  // of z_length elements sequentially.
  thrust::for_each_n(thrust::make_counting_iterator(0), num_columns,
                     [base, z_length] __host__ __device__(int idx) {
                       float *seg = base + z_length * idx;
                       thrust::sort(thrust::seq, seg, seg + z_length);
                     });
}
If you are ok with z_length
being a template parameter, this might be a solution that combines the best from both worlds (data format like in the first example):
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sort.h>
// Sorts each z-column in place for the plane-major layout
// data[((z * y_length) + y) * x_length + x]. z_length is a compile-time
// template parameter so each column fits in a small fixed-size thread-local
// buffer, combining the coalesced layout of the first version with the
// configurable column length of the second.
template <int z_length>
void sort_in_z_dir_middle_ground(thrust::device_vector<float> &data,
                                 int x_length, int y_length) {
  const int plane_size = x_length * y_length; // also the z-stride
  float *base = thrust::raw_pointer_cast(data.data());
  thrust::for_each_n(
      thrust::make_counting_iterator(0), plane_size,
      [base, plane_size] __host__ __device__(int idx) {
        // Gather the strided column into the local buffer, sort it
        // sequentially, then scatter it back.
        float buf[z_length];
        #pragma unroll
        for (int k = 0; k < z_length; ++k) {
          buf[k] = base[idx + k * plane_size];
        }
        thrust::sort(thrust::seq, buf, buf + z_length);
        #pragma unroll
        for (int k = 0; k < z_length; ++k) {
          base[idx + k * plane_size] = buf[k];
        }
      });
}
If the code doesn't have to be able to run on the CPU I would recommend using the CUB library (it is used in the CUDA backend of Thrust) because it actually has segmented sort algorithms readily available. For these very small (5) segments cub::DeviceSegmentedSort::SortKeys
seems to be the right tool (cub::DeviceSegmentedRadixSort
aims at much bigger segments). Just use fancy iterators to avoid memory access for the segment-offsets (The algorithm expects the elements of each segment to be consecutive in memory like in the second Thrust implementation above):
// Segment-offset iterators computed on the fly: segment i begins at
// i * z_length, so no offsets array has to live in memory.
// NOTE(review): z_length is used inside a lambda with an empty capture
// list — this compiles only if z_length is a constexpr variable or a
// template parameter in the enclosing scope; otherwise it must be
// captured, e.g. [z_length]. Confirm against the surrounding code.
auto begin_offsets_iter = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
cuda::proclaim_return_type<int>(
[] __device__(int idx) { return idx * z_length; }));
// End offset of segment i is the begin offset of segment i + 1.
auto end_offsets_iter = begin_offsets_iter + 1;
cuda::proclaim_return_type
is just a tool for making the return type of a pure __device__
lambda available on the host. It is not needed if you use a normal functor with a __device__
operator()
for technical reasons. cuda::proclaim_return_type
needs #include <cuda/functional>
and comes with libcu++ which is the third library in the CCCL with Thrust and CUB.
Upvotes: 1