convert CUDA device interleaved array to tuple for vector operations

Question

How do I convert a device array that contains interleaved floats to a CUDA thrust tuple for thrust vector operations.

Purpose : I generate a crude list of vertices using Marching Cubes on CUDA. The output is a list of vertices, with redundancy and no connectivity. I wish to get a list of unique vertices and then an index buffer to these unique vertices, so I can perform some operations such as mesh simplification, etc...

float *devPtr; //this is device pointer that holds an array of floats
//6 floats represent a vertex, array size is vertsCount*6*sizeof(float).
//format is [v0x, v0y, v0z, n0x, n0y, n0z, v1x, v1y, v1z, n1x, ...]

typedef thrust::tuple MCVertex;

thrust::device_vector inputVertices(vertsCount);

//copy from *devPtr to inputVertices.

//use something like unique to get rid of redundancies.
thrust::unique(inputVertices.begin(), inputVertices.end());

how do I achieve the copy, or is there some other better way of doing this?

m.s. · Accepted Answer

There is no need to copy, you can use a combination of thrust::zip_iterator and a strided_range iterator.

The following example works for a list of floats where 3 consecutive values belong to each other. It can of course be extended to support more than that, it is just a matter of typing.

The first step is to load some demo data on to the GPU, this uses a thrust::device_vector, but this results in a float* pointer just like you have.

Based on the strided_range iterator and the thrust::zip_iterator the data is first sorted and then compacted. This code uses C++11 features, so compile it using:

nvcc -std=c++11 unique.cu -o unique

The output when running ./unique is:

1 2 3 4 5 6

unique.cu

#include 
#include 
#include 
#include 
#include 
#include 

template
__host__ __device__
thrust::zip_iterator> zip(Iterators... its)
{
    return thrust::make_zip_iterator(thrust::make_tuple(its...));
}

template 
struct strided_range
{
    typedef typename thrust::iterator_difference::type difference_type;

    struct stride_functor : public thrust::unary_function
    {
        difference_type stride;

        stride_functor(difference_type stride)
            : stride(stride) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        { 
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator                   CountingIterator;
    typedef typename thrust::transform_iterator TransformIterator;
    typedef typename thrust::permutation_iterator     PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
        : first(first), last(last), stride(stride) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

    protected:
    Iterator first;
    Iterator last;
    difference_type stride;
};

int main()
{
    const int stride = 3;
    const int num = 3;

    const int size = stride * num;

    float values[size] = {1,2,3,
                          4,5,6,
                          1,2,3};


    // in this example I use thrust vectors to simplify copying from host to device
    thrust::host_vector h_vec (values, values+size);
    thrust::device_vector d_vec = h_vec;

    // in your case, dev_ptr is your input pointer
    float* dev_ptr = thrust::raw_pointer_cast(d_vec.data());

    auto first =  strided_range(dev_ptr,   dev_ptr + size+1-stride,   stride);
    auto second = strided_range(dev_ptr+1, dev_ptr + size+1-stride+1, stride);
    auto third =  strided_range(dev_ptr+2, dev_ptr + size+1-stride+2, stride);

    auto zip_begin = zip(first.begin(),second.begin(), third.begin());
    auto zip_end = zip(first.end(), second.end(), third.end());

    thrust::sort(thrust::device, zip_begin, zip_end);
    auto new_end = thrust::unique(thrust::device, zip_begin,zip_end);
    std::size_t new_size = stride * (new_end - zip_begin);

    // use the underlying thrust::device_vector again to simplify printing
    thrust::copy(d_vec.begin(), d_vec.begin()+new_size, std::ostream_iterator(std::cout, " "));
    std::cout << std::endl;

    return 0;
}

By the way: Be aware of floating point inaccuracies when trying to get unique values.

I also created a generic version of the example above which builds the zip_iterator automatically and works for any number of consecutive elements. Since the official thrust version unfortunately does not yet support variadic tuples, we use a std::tuple to build the desired tuple type and then convert it into a thrust::tuple. If Andrew Corrigan's branch of thrust (which adds support for variadic tuples) was merged into thrust master, we could avoid using std::tuple at all.

Compile this example using:

nvcc generic_unique.cu -std=c++11 -o generic_unique

The output when running ./generic_unique is:

input data: 1 2 3 4 5 6 0 0 0 0 0 0 1 2 3 4 5 6 0 0 0 0 0 0 1 2 3 4 5 6 0 0 0 0 0 0 0 0 0 0 0 0 
after sort: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 4 5 6 1 2 3 4 5 6 1 2 3 4 5 6 
after unique: 0 0 0 0 0 0 1 2 3 4 5 6

generic_unique.cu

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

// adapted from https://github.com/thrust/thrust/blob/master/examples/strided_range.cu
template ::type stride>
class strided_range
{
public:
    typedef typename thrust::iterator_difference::type difference_type;

    //template 
    struct stride_functor : public thrust::unary_function
    {
        __host__ __device__
        difference_type operator()(const difference_type& i) const
        { 
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator                           CountingIterator;
    typedef typename thrust::transform_iterator TransformIterator;
    typedef typename thrust::permutation_iterator             PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last)
        : first(first), last(last) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor()));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

protected:
    Iterator first;
    Iterator last;
};

// copied from http://stackoverflow.com/a/16853775/678093
template
struct append_to_type_seq { };

template class TT>
struct append_to_type_seq>
{
    using type = TT;
};

template class TT>
struct repeat
{
    using type = typename
        append_to_type_seq<
            T,
            typename repeat::type
            >::type;
};

template class TT>
struct repeat
{
    using type = TT<>;
};

template struct std_to_thrust_tuple;
template struct std_to_thrust_tuple> {
  using type = thrust::tuple;
};

template
class zipped_strided_range
{
public:

    typedef typename strided_range::iterator SingleIterator;
    typedef typename repeat::type StdIteratorTuple;
    typedef typename std_to_thrust_tuple::type IteratorTuple;
    typedef decltype(thrust::make_zip_iterator(IteratorTuple())) ZipIterator;

    zipped_strided_range(IteratorType first, IteratorType last) : first(first), last(last)
    {
        assign<0>();
    }

    ZipIterator begin() const
    {
        return thrust::make_zip_iterator(begin_tuple);
    }

    ZipIterator end() const
    {
        return thrust::make_zip_iterator(end_tuple);
    }

protected:

    template 
    void assign(typename std::enable_if< (index < stride) >::type* = 0)
    {
        strided_range strided_range_iterator(first+index, last-(stride-1)+index);

        thrust::get(begin_tuple) = strided_range_iterator.begin();
        thrust::get(end_tuple) = strided_range_iterator.end();
        assign();
    }

    template 
    void assign(typename std::enable_if< (index == stride) >::type* = 0)
    {
        // end recursion
    }

    IteratorType first;
    IteratorType last;

    IteratorTuple begin_tuple;
    IteratorTuple end_tuple;
};


int main()
{

    const int stride = 6;
    const int num = 6;

    const int size = stride * num;

    float values[size] = {1,2,3,4,5,6,
                          0,0,0,0,0,0,
                          1,2,3,4,5,6,
                          0,0,0,0,0,0,
                          1,2,3,4,5,6,
                          0,0,0,0,0,0
    };


    // in this example I use thrust vectors to simplify copying from host to device
    // it also simplifies printing
    thrust::host_vector h_vec (values, values+size);
    thrust::device_vector d_vec = h_vec;

    std::cout << "input data: ";
    thrust::copy(d_vec.begin(), d_vec.end(), std::ostream_iterator(std::cout, " "));
    std::cout << std::endl;

    // in your case, dev_ptr is your input pointer
    float* dev_ptr = thrust::raw_pointer_cast(d_vec.data());

    zipped_strided_range zipped(dev_ptr, dev_ptr+size);


    thrust::sort(thrust::device, zipped.begin(), zipped.end());

    std::cout << "after sort: ";
    thrust::copy(d_vec.begin(), d_vec.end(), std::ostream_iterator(std::cout, " "));
    std::cout << std::endl;

    auto new_end = thrust::unique(thrust::device, zipped.begin(), zipped.end());
    std::size_t new_size = stride * (new_end - zipped.begin());

    std::cout << "after unique: ";
    d_vec.resize(new_size);
    thrust::copy(d_vec.begin(), d_vec.end(), std::ostream_iterator(std::cout, " "));
    std::cout << std::endl;

    return 0;
}

convert CUDA device interleaved array to tuple for vector operations

Answers (2)

Related Questions