Harish
Harish

Reputation: 974

convert CUDA device interleaved array to tuple for vector operations

How do I convert a device array that contains interleaved floats to a CUDA thrust tuple for thrust vector operations.

Purpose : I generate a crude list of vertices using Marching Cubes on CUDA. The output is a list of vertices, with redundancy and no connectivity. I wish to get a list of unique vertices and then an index buffer to these unique vertices, so I can perform some operations such as mesh simplification, etc...

float *devPtr; //this is device pointer that holds an array of floats
//6 floats represent a vertex, array size is vertsCount*6*sizeof(float).
//format is [v0x, v0y, v0z, n0x, n0y, n0z, v1x, v1y, v1z, n1x, ...]

typedef thrust::tuple<float, float, float, float, float, float> MCVertex;

thrust::device_vector<MCVertex> inputVertices(vertsCount);

//copy from *devPtr to inputVertices.

//use something like unique to get rid of redundancies.
thrust::unique(inputVertices.begin(), inputVertices.end());

how do I achieve the copy, or is there some other better way of doing this?

Upvotes: 2

Views: 1880

Answers (2)

Harish
Harish

Reputation: 974

Firstly, thanks to m.s. for his answer as it pointed me in the right direction.

Please bear in mind though if you are using Microsoft Visual Studio, only VS2013 supports variadic tuples.

For c++11 feature support list for host compiler (cl.exe as in VS2013) use the link below. https://msdn.microsoft.com/en-us/library/hh567368.aspx

PS : make sure you are building for v120 platform toolset to avail the variadic template feature.

Thanks to @Robert Crovella, [-std=c++11] is set by default with VS2013 so the flag need not be set.

Back to the problem at hand, here is how I solved it using code from m.s. but using thrust::device_ptr instead of raw pointers.

#include <iostream>
#include "thrust\host_vector.h"
#include "thrust\device_vector.h"
#include "thrust\sort.h"
#include "thrust\unique.h"
#include "thrust\binary_search.h"
#include "thrust\iterator\zip_iterator.h"
#include "thrust\execution_policy.h"

template <typename Iterator>
struct strided_range
{
    typedef typename thrust::iterator_difference<Iterator>::type     difference_type;

    struct stride_functor : public thrust::unary_function < difference_type, difference_type >
    {
        difference_type stride;

        stride_functor(difference_type stride)
            : stride(stride) {}

        __host__ __device__
            difference_type operator()(const difference_type& i) const
        {
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator, TransformIterator>     PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
        : first(first), last(last), stride(stride) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

protected:
    Iterator first;
    Iterator last;
    difference_type stride;
};

//forcing it to be a 3-tuple one instead of using variadic templates
template<typename Iterator>
__host__ __device__
thrust::zip_iterator<thrust::tuple<Iterator, Iterator, Iterator>> zip(const         Iterator& sr1, const Iterator& sr2, const Iterator& sr3)
{
    return thrust::make_zip_iterator(thrust::make_tuple(sr1, sr2, sr3));
}

int main()
{

    const int stride = 3;
    const int num = 6;

    const int size = stride * num;

    //values on host
    float values[size] = { 1, 2, 3,
        4, 5, 6,
        1, 2, 3,
        4, 5, 6,
        1, 2, 3,
        7, 8, 9 };
    //ptr for device
    float *d_data;
    //allocate memory on the device
    cudaMalloc((void**)&d_data, size*sizeof(float));
    //copy from host to device
    cudaMemcpy(d_data, values, size*sizeof(float), cudaMemcpyHostToDevice);

    //a typedef for device_ptr<float>
    typedef thrust::device_ptr<float> floatdevptr;

    //cast our raw pointer to device pointer
    floatdevptr dev_dataptr = thrust::device_pointer_cast(d_data);

    //create a device_vector from the dev_dataptr
    thrust::device_vector<float> d_vec(dev_dataptr, dev_dataptr + size);
    //make a copy
    thrust::device_vector<float> d_veccopy = d_vec;

    //create a device_vector to hold indices (6 indices for 6 vertices)
    thrust::device_vector<unsigned int> indices( num );

    //print input values
    std::cout << "Input Values : ";
    thrust::copy(d_vec.begin(), d_vec.begin() + size, std::ostream_iterator<float>(std::cout, " "));
    std::cout << std::endl;

    //a typedef for our strided_range<device_ptr<float>>
    typedef strided_range<floatdevptr>::iterator floatdevptr_stridedrangeiterator;

    //create the strided_range for x, y and z;
    strided_range<floatdevptr> dvx = strided_range<floatdevptr>(dev_dataptr + 0, dev_dataptr + size - stride + 1, stride);
    strided_range<floatdevptr> dvy = strided_range<floatdevptr>(dev_dataptr + 1, dev_dataptr + size - stride + 2, stride);
    strided_range<floatdevptr> dvz = strided_range<floatdevptr>(dev_dataptr + 2, dev_dataptr + size - stride + 3, stride);

    //create zip_iterator for the vertex
    auto zip_dv_first = zip<floatdevptr_stridedrangeiterator>(dvx.begin(), dvy.begin(), dvz.begin());
    auto zip_dv_last = zip<floatdevptr_stridedrangeiterator>(dvx.end(), dvy.end(), dvz.end());

    //sort
    thrust::sort(zip_dv_first, zip_dv_last);
    //remove duplicates
    auto new_dv_last = thrust::unique(zip_dv_first, zip_dv_last);
    //compute new size
    std::size_t new_dv_size = stride * (new_dv_last - zip_dv_first);

    //create the same for the copy.
    strided_range<floatdevptr> dvcpyx = strided_range<floatdevptr>(d_veccopy.data() + 0, d_veccopy.data() + size - stride + 1, stride);
    strided_range<floatdevptr> dvcpyy = strided_range<floatdevptr>(d_veccopy.data() + 1, d_veccopy.data() + size - stride + 2, stride);
    strided_range<floatdevptr> dvcpyz = strided_range<floatdevptr>(d_veccopy.data() + 2, d_veccopy.data() + size - stride + 3, stride);

    auto zip_dvcpy_first = zip<floatdevptr_stridedrangeiterator>(dvcpyx.begin(), dvcpyy.begin(), dvcpyz.begin());
    auto zip_dvcpy_last = zip<floatdevptr_stridedrangeiterator>(dvcpyx.end(), dvcpyy.end(), dvcpyz.end());

    //find index of each input vertex in the list of unique vertices
    thrust::lower_bound(zip_dv_first, new_dv_last,
        zip_dvcpy_first, zip_dvcpy_last,
        indices.begin());

    // print unique vertex data
    std::cout << "Output Values : ";
    thrust::copy(d_vec.begin(), d_vec.begin() + new_dv_size, std::ostream_iterator<float>(std::cout, " "));
    std::cout << std::endl;
    // print the indices
    std::cout << "Index Values : ";
    thrust::copy(indices.begin(), indices.end(), std::ostream_iterator<float>(std::cout, " "));
    std::cout << std::endl;
}

Output is :

Input Values : 1 2 3 4 5 6 1 2 3 4 5 6 1 2 3 7 8 9 
Output Values : 1 2 3 4 5 6 7 8 9 
Index Values : 0 1 0 1 0 2 

Upvotes: 0

m.s.
m.s.

Reputation: 16334

There is no need to copy, you can use a combination of thrust::zip_iterator and a strided_range iterator.

The following example works for a list of floats where 3 consecutive values belong to each other. It can of course be extended to support more than that, it is just a matter of typing.

The first step is to load some demo data on to the GPU, this uses a thrust::device_vector, but this results in a float* pointer just like you have.

Based on the strided_range iterator and the thrust::zip_iterator the data is first sorted and then compacted. This code uses C++11 features, so compile it using:

nvcc -std=c++11 unique.cu -o unique

The output when running ./unique is:

1 2 3 4 5 6 

unique.cu

#include <thrust/device_vector.h>
#include <iostream>
#include <thrust/unique.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>

template<typename... Iterators>
__host__ __device__
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
{
    return thrust::make_zip_iterator(thrust::make_tuple(its...));
}

template <typename Iterator>
struct strided_range
{
    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type stride;

        stride_functor(difference_type stride)
            : stride(stride) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        { 
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
        : first(first), last(last), stride(stride) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

    protected:
    Iterator first;
    Iterator last;
    difference_type stride;
};

int main()
{
    const int stride = 3;
    const int num = 3;

    const int size = stride * num;

    float values[size] = {1,2,3,
                          4,5,6,
                          1,2,3};


    // in this example I use thrust vectors to simplify copying from host to device
    thrust::host_vector<float> h_vec (values, values+size);
    thrust::device_vector<float> d_vec = h_vec;

    // in your case, dev_ptr is your input pointer
    float* dev_ptr = thrust::raw_pointer_cast(d_vec.data());

    auto first =  strided_range<float*>(dev_ptr,   dev_ptr + size+1-stride,   stride);
    auto second = strided_range<float*>(dev_ptr+1, dev_ptr + size+1-stride+1, stride);
    auto third =  strided_range<float*>(dev_ptr+2, dev_ptr + size+1-stride+2, stride);

    auto zip_begin = zip(first.begin(),second.begin(), third.begin());
    auto zip_end = zip(first.end(), second.end(), third.end());

    thrust::sort(thrust::device, zip_begin, zip_end);
    auto new_end = thrust::unique(thrust::device, zip_begin,zip_end);
    std::size_t new_size = stride * (new_end - zip_begin);

    // use the underlying thrust::device_vector again to simplify printing
    thrust::copy(d_vec.begin(), d_vec.begin()+new_size, std::ostream_iterator<float>(std::cout, " "));
    std::cout << std::endl;

    return 0;
}

By the way: Be aware of floating point inaccuracies when trying to get unique values.


I also created a generic version of the example above which builds the zip_iterator automatically and works for any number of consecutive elements. Since the official thrust version unfortunately does not yet support variadic tuples, we use a std::tuple to build the desired tuple type and then convert it into a thrust::tuple. If Andrew Corrigan's branch of thrust (which adds support for variadic tuples) was merged into thrust master, we could avoid using std::tuple at all.

Compile this example using:

nvcc generic_unique.cu -std=c++11 -o generic_unique

The output when running ./generic_unique is:

input data: 1 2 3 4 5 6 0 0 0 0 0 0 1 2 3 4 5 6 0 0 0 0 0 0 1 2 3 4 5 6 0 0 0 0 0 0 0 0 0 0 0 0 
after sort: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 4 5 6 1 2 3 4 5 6 1 2 3 4 5 6 
after unique: 0 0 0 0 0 0 1 2 3 4 5 6 

generic_unique.cu

#include <tuple>
#include <thrust/tuple.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <thrust/unique.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>

// adapted from https://github.com/thrust/thrust/blob/master/examples/strided_range.cu
template <typename Iterator, typename thrust::iterator_difference<Iterator>::type stride>
class strided_range
{
public:
    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    //template <difference_type stride>
    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        __host__ __device__
        difference_type operator()(const difference_type& i) const
        { 
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                           CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>             PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last)
        : first(first), last(last) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor()));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

protected:
    Iterator first;
    Iterator last;
};

// copied from http://stackoverflow.com/a/16853775/678093
template<typename, typename>
struct append_to_type_seq { };

template<typename T, typename... Ts, template<typename...> class TT>
struct append_to_type_seq<T, TT<Ts...>>
{
    using type = TT<Ts..., T>;
};

template<typename T, unsigned int N, template<typename...> class TT>
struct repeat
{
    using type = typename
        append_to_type_seq<
            T,
            typename repeat<T, N-1, TT>::type
            >::type;
};

template<typename T, template<typename...> class TT>
struct repeat<T, 0, TT>
{
    using type = TT<>;
};

template<typename Tuple> struct std_to_thrust_tuple;
template<typename...T> struct std_to_thrust_tuple<std::tuple<T...>> {
  using type = thrust::tuple<T...>;
};

template<typename IteratorType, std::size_t stride>
class zipped_strided_range
{
public:

    typedef typename strided_range<IteratorType, stride>::iterator SingleIterator;
    typedef typename repeat<SingleIterator, stride, std::tuple>::type StdIteratorTuple;
    typedef typename std_to_thrust_tuple<StdIteratorTuple>::type IteratorTuple;
    typedef decltype(thrust::make_zip_iterator(IteratorTuple())) ZipIterator;

    zipped_strided_range(IteratorType first, IteratorType last) : first(first), last(last)
    {
        assign<0>();
    }

    ZipIterator begin() const
    {
        return thrust::make_zip_iterator(begin_tuple);
    }

    ZipIterator end() const
    {
        return thrust::make_zip_iterator(end_tuple);
    }

protected:

    template <std::size_t index>
    void assign(typename std::enable_if< (index < stride) >::type* = 0)
    {
        strided_range<IteratorType,stride> strided_range_iterator(first+index, last-(stride-1)+index);

        thrust::get<index>(begin_tuple) = strided_range_iterator.begin();
        thrust::get<index>(end_tuple) = strided_range_iterator.end();
        assign<index+1>();
    }

    template <std::size_t index>
    void assign(typename std::enable_if< (index == stride) >::type* = 0)
    {
        // end recursion
    }

    IteratorType first;
    IteratorType last;

    IteratorTuple begin_tuple;
    IteratorTuple end_tuple;
};


int main()
{

    const int stride = 6;
    const int num = 6;

    const int size = stride * num;

    float values[size] = {1,2,3,4,5,6,
                          0,0,0,0,0,0,
                          1,2,3,4,5,6,
                          0,0,0,0,0,0,
                          1,2,3,4,5,6,
                          0,0,0,0,0,0
    };


    // in this example I use thrust vectors to simplify copying from host to device
    // it also simplifies printing
    thrust::host_vector<float> h_vec (values, values+size);
    thrust::device_vector<float> d_vec = h_vec;

    std::cout << "input data: ";
    thrust::copy(d_vec.begin(), d_vec.end(), std::ostream_iterator<float>(std::cout, " "));
    std::cout << std::endl;

    // in your case, dev_ptr is your input pointer
    float* dev_ptr = thrust::raw_pointer_cast(d_vec.data());

    zipped_strided_range<float*, stride> zipped(dev_ptr, dev_ptr+size);


    thrust::sort(thrust::device, zipped.begin(), zipped.end());

    std::cout << "after sort: ";
    thrust::copy(d_vec.begin(), d_vec.end(), std::ostream_iterator<float>(std::cout, " "));
    std::cout << std::endl;

    auto new_end = thrust::unique(thrust::device, zipped.begin(), zipped.end());
    std::size_t new_size = stride * (new_end - zipped.begin());

    std::cout << "after unique: ";
    d_vec.resize(new_size);
    thrust::copy(d_vec.begin(), d_vec.end(), std::ostream_iterator<float>(std::cout, " "));
    std::cout << std::endl;

    return 0;
}

Upvotes: 6

Related Questions