Reputation: 974
How do I convert a device array that contains interleaved floats to a CUDA thrust tuple for thrust vector operations.
Purpose : I generate a crude list of vertices using Marching Cubes on CUDA. The output is a list of vertices, with redundancy and no connectivity. I wish to get a list of unique vertices and then an index buffer to these unique vertices, so I can perform some operations such as mesh simplification, etc...
float *devPtr; //this is device pointer that holds an array of floats
//6 floats represent a vertex, array size is vertsCount*6*sizeof(float).
//format is [v0x, v0y, v0z, n0x, n0y, n0z, v1x, v1y, v1z, n1x, ...]
typedef thrust::tuple<float, float, float, float, float, float> MCVertex;
thrust::device_vector<MCVertex> inputVertices(vertsCount);
//copy from *devPtr to inputVertices.
//use something like unique to get rid of redundancies.
thrust::unique(inputVertices.begin(), inputVertices.end());
how do I achieve the copy, or is there some other better way of doing this?
Upvotes: 2
Views: 1880
Reputation: 974
Firstly, thanks to m.s. for his answer as it pointed me in the right direction.
Please bear in mind though if you are using Microsoft Visual Studio, only VS2013 supports variadic tuples.
For c++11 feature support list for host compiler (cl.exe as in VS2013) use the link below. https://msdn.microsoft.com/en-us/library/hh567368.aspx
PS : make sure you are building for v120 platform toolset to avail the variadic template feature.
Thanks to @Robert Crovella, [-std=c++11] is set by default with VS2013 so the flag need not be set.
Back to the problem at hand, here is how I solved it using code from m.s. but using thrust::device_ptr instead of raw pointers.
#include <iostream>
#include "thrust\host_vector.h"
#include "thrust\device_vector.h"
#include "thrust\sort.h"
#include "thrust\unique.h"
#include "thrust\binary_search.h"
#include "thrust\iterator\zip_iterator.h"
#include "thrust\execution_policy.h"
template <typename Iterator>
struct strided_range
{
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function < difference_type, difference_type >
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator, TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
//forcing it to be a 3-tuple one instead of using variadic templates
template<typename Iterator>
__host__ __device__
thrust::zip_iterator<thrust::tuple<Iterator, Iterator, Iterator>> zip(const Iterator& sr1, const Iterator& sr2, const Iterator& sr3)
{
return thrust::make_zip_iterator(thrust::make_tuple(sr1, sr2, sr3));
}
int main()
{
const int stride = 3;
const int num = 6;
const int size = stride * num;
//values on host
float values[size] = { 1, 2, 3,
4, 5, 6,
1, 2, 3,
4, 5, 6,
1, 2, 3,
7, 8, 9 };
//ptr for device
float *d_data;
//allocate memory on the device
cudaMalloc((void**)&d_data, size*sizeof(float));
//copy from host to device
cudaMemcpy(d_data, values, size*sizeof(float), cudaMemcpyHostToDevice);
//a typedef for device_ptr<float>
typedef thrust::device_ptr<float> floatdevptr;
//cast our raw pointer to device pointer
floatdevptr dev_dataptr = thrust::device_pointer_cast(d_data);
//create a device_vector from the dev_dataptr
thrust::device_vector<float> d_vec(dev_dataptr, dev_dataptr + size);
//make a copy
thrust::device_vector<float> d_veccopy = d_vec;
//create a device_vector to hold indices (6 indices for 6 vertices)
thrust::device_vector<unsigned int> indices( num );
//print input values
std::cout << "Input Values : ";
thrust::copy(d_vec.begin(), d_vec.begin() + size, std::ostream_iterator<float>(std::cout, " "));
std::cout << std::endl;
//a typedef for our strided_range<device_ptr<float>>
typedef strided_range<floatdevptr>::iterator floatdevptr_stridedrangeiterator;
//create the strided_range for x, y and z;
strided_range<floatdevptr> dvx = strided_range<floatdevptr>(dev_dataptr + 0, dev_dataptr + size - stride + 1, stride);
strided_range<floatdevptr> dvy = strided_range<floatdevptr>(dev_dataptr + 1, dev_dataptr + size - stride + 2, stride);
strided_range<floatdevptr> dvz = strided_range<floatdevptr>(dev_dataptr + 2, dev_dataptr + size - stride + 3, stride);
//create zip_iterator for the vertex
auto zip_dv_first = zip<floatdevptr_stridedrangeiterator>(dvx.begin(), dvy.begin(), dvz.begin());
auto zip_dv_last = zip<floatdevptr_stridedrangeiterator>(dvx.end(), dvy.end(), dvz.end());
//sort
thrust::sort(zip_dv_first, zip_dv_last);
//remove duplicates
auto new_dv_last = thrust::unique(zip_dv_first, zip_dv_last);
//compute new size
std::size_t new_dv_size = stride * (new_dv_last - zip_dv_first);
//create the same for the copy.
strided_range<floatdevptr> dvcpyx = strided_range<floatdevptr>(d_veccopy.data() + 0, d_veccopy.data() + size - stride + 1, stride);
strided_range<floatdevptr> dvcpyy = strided_range<floatdevptr>(d_veccopy.data() + 1, d_veccopy.data() + size - stride + 2, stride);
strided_range<floatdevptr> dvcpyz = strided_range<floatdevptr>(d_veccopy.data() + 2, d_veccopy.data() + size - stride + 3, stride);
auto zip_dvcpy_first = zip<floatdevptr_stridedrangeiterator>(dvcpyx.begin(), dvcpyy.begin(), dvcpyz.begin());
auto zip_dvcpy_last = zip<floatdevptr_stridedrangeiterator>(dvcpyx.end(), dvcpyy.end(), dvcpyz.end());
//find index of each input vertex in the list of unique vertices
thrust::lower_bound(zip_dv_first, new_dv_last,
zip_dvcpy_first, zip_dvcpy_last,
indices.begin());
// print unique vertex data
std::cout << "Output Values : ";
thrust::copy(d_vec.begin(), d_vec.begin() + new_dv_size, std::ostream_iterator<float>(std::cout, " "));
std::cout << std::endl;
// print the indices
std::cout << "Index Values : ";
thrust::copy(indices.begin(), indices.end(), std::ostream_iterator<float>(std::cout, " "));
std::cout << std::endl;
}
Output is :
Input Values : 1 2 3 4 5 6 1 2 3 4 5 6 1 2 3 7 8 9
Output Values : 1 2 3 4 5 6 7 8 9
Index Values : 0 1 0 1 0 2
Upvotes: 0
Reputation: 16334
There is no need to copy, you can use a combination of thrust::zip_iterator
and a strided_range
iterator.
The following example works for a list of floats where 3 consecutive values belong to each other. It can of course be extended to support more than that, it is just a matter of typing.
The first step is to load some demo data on to the GPU, this uses a thrust::device_vector
, but this results in a float*
pointer just like you have.
Based on the strided_range
iterator and the thrust::zip_iterator
the data is first sorted and then compacted.
This code uses C++11 features, so compile it using:
nvcc -std=c++11 unique.cu -o unique
The output when running ./unique
is:
1 2 3 4 5 6
unique.cu
#include <thrust/device_vector.h>
#include <iostream>
#include <thrust/unique.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
template<typename... Iterators>
__host__ __device__
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
{
return thrust::make_zip_iterator(thrust::make_tuple(its...));
}
template <typename Iterator>
struct strided_range
{
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
int main()
{
const int stride = 3;
const int num = 3;
const int size = stride * num;
float values[size] = {1,2,3,
4,5,6,
1,2,3};
// in this example I use thrust vectors to simplify copying from host to device
thrust::host_vector<float> h_vec (values, values+size);
thrust::device_vector<float> d_vec = h_vec;
// in your case, dev_ptr is your input pointer
float* dev_ptr = thrust::raw_pointer_cast(d_vec.data());
auto first = strided_range<float*>(dev_ptr, dev_ptr + size+1-stride, stride);
auto second = strided_range<float*>(dev_ptr+1, dev_ptr + size+1-stride+1, stride);
auto third = strided_range<float*>(dev_ptr+2, dev_ptr + size+1-stride+2, stride);
auto zip_begin = zip(first.begin(),second.begin(), third.begin());
auto zip_end = zip(first.end(), second.end(), third.end());
thrust::sort(thrust::device, zip_begin, zip_end);
auto new_end = thrust::unique(thrust::device, zip_begin,zip_end);
std::size_t new_size = stride * (new_end - zip_begin);
// use the underlying thrust::device_vector again to simplify printing
thrust::copy(d_vec.begin(), d_vec.begin()+new_size, std::ostream_iterator<float>(std::cout, " "));
std::cout << std::endl;
return 0;
}
By the way: Be aware of floating point inaccuracies when trying to get unique values.
I also created a generic version of the example above which builds the zip_iterator
automatically and works for any number of consecutive elements. Since the official thrust version unfortunately does not yet support variadic tuples, we use a std::tuple
to build the desired tuple type and then convert it into a thrust::tuple
. If Andrew Corrigan's branch of thrust (which adds support for variadic tuples) was merged into thrust master, we could avoid using std::tuple at all.
Compile this example using:
nvcc generic_unique.cu -std=c++11 -o generic_unique
The output when running ./generic_unique
is:
input data: 1 2 3 4 5 6 0 0 0 0 0 0 1 2 3 4 5 6 0 0 0 0 0 0 1 2 3 4 5 6 0 0 0 0 0 0 0 0 0 0 0 0
after sort: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 4 5 6 1 2 3 4 5 6 1 2 3 4 5 6
after unique: 0 0 0 0 0 0 1 2 3 4 5 6
generic_unique.cu
#include <tuple>
#include <thrust/tuple.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <thrust/unique.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
// adapted from https://github.com/thrust/thrust/blob/master/examples/strided_range.cu
template <typename Iterator, typename thrust::iterator_difference<Iterator>::type stride>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
//template <difference_type stride>
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last)
: first(first), last(last) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor()));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
};
// copied from http://stackoverflow.com/a/16853775/678093
template<typename, typename>
struct append_to_type_seq { };
template<typename T, typename... Ts, template<typename...> class TT>
struct append_to_type_seq<T, TT<Ts...>>
{
using type = TT<Ts..., T>;
};
template<typename T, unsigned int N, template<typename...> class TT>
struct repeat
{
using type = typename
append_to_type_seq<
T,
typename repeat<T, N-1, TT>::type
>::type;
};
template<typename T, template<typename...> class TT>
struct repeat<T, 0, TT>
{
using type = TT<>;
};
template<typename Tuple> struct std_to_thrust_tuple;
template<typename...T> struct std_to_thrust_tuple<std::tuple<T...>> {
using type = thrust::tuple<T...>;
};
template<typename IteratorType, std::size_t stride>
class zipped_strided_range
{
public:
typedef typename strided_range<IteratorType, stride>::iterator SingleIterator;
typedef typename repeat<SingleIterator, stride, std::tuple>::type StdIteratorTuple;
typedef typename std_to_thrust_tuple<StdIteratorTuple>::type IteratorTuple;
typedef decltype(thrust::make_zip_iterator(IteratorTuple())) ZipIterator;
zipped_strided_range(IteratorType first, IteratorType last) : first(first), last(last)
{
assign<0>();
}
ZipIterator begin() const
{
return thrust::make_zip_iterator(begin_tuple);
}
ZipIterator end() const
{
return thrust::make_zip_iterator(end_tuple);
}
protected:
template <std::size_t index>
void assign(typename std::enable_if< (index < stride) >::type* = 0)
{
strided_range<IteratorType,stride> strided_range_iterator(first+index, last-(stride-1)+index);
thrust::get<index>(begin_tuple) = strided_range_iterator.begin();
thrust::get<index>(end_tuple) = strided_range_iterator.end();
assign<index+1>();
}
template <std::size_t index>
void assign(typename std::enable_if< (index == stride) >::type* = 0)
{
// end recursion
}
IteratorType first;
IteratorType last;
IteratorTuple begin_tuple;
IteratorTuple end_tuple;
};
int main()
{
const int stride = 6;
const int num = 6;
const int size = stride * num;
float values[size] = {1,2,3,4,5,6,
0,0,0,0,0,0,
1,2,3,4,5,6,
0,0,0,0,0,0,
1,2,3,4,5,6,
0,0,0,0,0,0
};
// in this example I use thrust vectors to simplify copying from host to device
// it also simplifies printing
thrust::host_vector<float> h_vec (values, values+size);
thrust::device_vector<float> d_vec = h_vec;
std::cout << "input data: ";
thrust::copy(d_vec.begin(), d_vec.end(), std::ostream_iterator<float>(std::cout, " "));
std::cout << std::endl;
// in your case, dev_ptr is your input pointer
float* dev_ptr = thrust::raw_pointer_cast(d_vec.data());
zipped_strided_range<float*, stride> zipped(dev_ptr, dev_ptr+size);
thrust::sort(thrust::device, zipped.begin(), zipped.end());
std::cout << "after sort: ";
thrust::copy(d_vec.begin(), d_vec.end(), std::ostream_iterator<float>(std::cout, " "));
std::cout << std::endl;
auto new_end = thrust::unique(thrust::device, zipped.begin(), zipped.end());
std::size_t new_size = stride * (new_end - zipped.begin());
std::cout << "after unique: ";
d_vec.resize(new_size);
thrust::copy(d_vec.begin(), d_vec.end(), std::ostream_iterator<float>(std::cout, " "));
std::cout << std::endl;
return 0;
}
Upvotes: 6