Reputation: 1261
It is not obvious how to use std::vector in CUDA, so I have designed my own Vector class:
#ifndef VECTORHEADERDEF
#define VECTORHEADERDEF
#include <cmath>
#include <iostream>
#include <cassert>
template <typename T>
class Vector
{
private:
T* mData; // raw host buffer owning the elements (allocated with new[])
int mSize; // number of elements
public:
Vector(const Vector& otherVector); // copy constructor (deep copy, host memory)
Vector(int size); // sized constructor; zero-initialises entries
~Vector(); // Destructor — releases mData with delete[]
__host__ __device__ int GetSize() const; // get size of the vector
T& operator[](int i); // bounds-checked element access (host-only: no __device__)
// change element i
// NOTE(review): no bounds check here, unlike operator[]
__host__ __device__ void set(size_t i, T value) {
mData[i] = value;
}
// NOTE(review): operator= is defined below but never declared in this class —
// a compile error, per the accepted answer's remarks.
template <class S> // output vector
friend std::ostream& operator<<(std::ostream& output, Vector<S>& v);
};
// Overridden copy constructor
// Allocates memory for new vector, and copies entries of other vector into it
// (host-only: uses new[], so the copy always lives in host memory)
template <typename T>
Vector<T>::Vector(const Vector& otherVector)
{
mSize = otherVector.GetSize();
mData = new T [mSize];
for (int i=0; i<mSize; i++)
{
mData[i] = otherVector.mData[i];
}
}
// Constructor for vector of a given size
// Allocates memory, and initialises entries to zero
// NOTE(review): `mData[i] = 0.0` assigns a double literal to a generic T —
// fine for arithmetic types, ill-formed for types not convertible from double.
template <typename T>
Vector<T>::Vector(int size)
{
assert(size > 0); // empty vectors are not supported
mSize = size;
mData = new T [mSize];
for (int i=0; i<mSize; i++)
{
mData[i] = 0.0;
}
}
// Overridden destructor to correctly free memory
// (host-only: delete[] releases the host heap allocation made in the constructors)
template <typename T>
Vector<T>::~Vector()
{
delete[] mData;
}
// Method to get the size of a vector
// Marked __host__ __device__ so it is callable from kernels as well as host code.
template <typename T>
__host__ __device__ int Vector<T>::GetSize() const
{
return mSize;
}
// Overloading square brackets
// Note that this uses `zero-based' indexing, and a check on the validity of the index
// NOTE(review): host-only — unlike set()/GetSize() it lacks __host__ __device__,
// so device code cannot use it (the accepted answer adds the decoration).
template <typename T>
T& Vector<T>::operator[](int i)
{
assert(i > -1);
assert(i < mSize);
return mData[i];
}
// Overloading the assignment operator
// Element-wise copy; both vectors must already have equal sizes (no reallocation).
// NOTE(review): this member is never declared inside the class definition above,
// which is a compile error — the accepted answer mentions having to fix compile
// errors in the posted class.
template <typename T>
Vector<T>& Vector<T>::operator=(const Vector& otherVector)
{
assert(mSize == otherVector.mSize);
for (int i=0; i<mSize; i++)
{
mData[i] = otherVector.mData[i];
}
return *this;
}
// Overloading the insertion << operator
// Prints elements space-separated; host-only (std::ostream is unusable on the device).
template <typename T>
std::ostream& operator<<(std::ostream& output, Vector<T>& v) {
for (int i=0; i<v.mSize; i++) {
output << v[i] << " ";
}
return output;
}
My main function - where I just pass a vector to the device, modify it and pass it back - is as follows (with the kernel designed just for testing purposes):
#include <iostream>
#include "Vector.hpp"
// Test kernel (from the question): every launched thread writes the same
// values to elements 0 and 2; myId is computed but never used.
__global__ void alpha(Vector<int>* d_num)
{
int myId = threadIdx.x + blockDim.x * blockIdx.x;
d_num->set(0,100);
d_num->set(2,11);
}
int main()
{
Vector<int> num(10);
for (int i=0; i < num.GetSize(); ++i) num.set(i,i); // initialize elements to 0:9
std::cout << "Size of vector: " << num.GetSize() << "\n";
std::cout << num << "\n"; // print vector
Vector<int>* d_num;
// allocate global memory on the device
// BUG (subject of the question): allocates 10*sizeof(int) raw bytes, but
// d_num is a Vector<int>* — the kernel will treat those bytes as a Vector object
cudaMalloc((void **) &d_num, num.GetSize()*sizeof(int));
// copy data from host memory to the device memory
cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(int), cudaMemcpyHostToDevice);
// launch the kernel
alpha<<<1,100>>>(d_num);
// copy the modified array back to the host, overwriting the contents of h_arr
// BUG: `num` is a Vector<int> object, not a pointer to its first element —
// the destination should be &num[0] and the source d_num (see answers below)
cudaMemcpy(num, &d_num[0], num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);
std::cout << num << "\n";
// free GPU memory allocation and exit
cudaFree(d_num);
return 0;
}
The problem I encounter is with cudaMemcpyDeviceToHost. It does not really copy the device vector to the num vector as can be seen from the output.
How should I deal with that? (Please be explicit, I am fairly new to CUDA).
Upvotes: 0
Views: 1922
Reputation: 1933
It is not just the cudaMemcpyDeviceToHost part that you're having trouble with.
Vector<int> num(10);
Vector<int>* d_num;
cudaMalloc(&d_num, num.GetSize()*sizeof(int));
This will allocate 40 bytes in CUDA global memory (assuming sizeof(int) is 4), pointed to by d_num, which has type Vector<int>*. I don't think you are expecting the Vector<int> object itself to be 40 bytes.
Let's try another way.
cudaMalloc(&d_num, sizeof(Vector<int>));
cudaMalloc(&d_num->mData, num.GetSize()*sizeof(int)); // assume mData is a public attribute
Unfortunately, the second line will emit a segmentation fault, because you are accessing device memory (d_num->mData) from host code.
So your implementation of the Vector class has several flaws. If you're planning to have a fixed-size array, just declare d_num as a plain pointer.
int* d_num;
cudaMalloc(&d_num, num.GetSize()*sizeof(int));
cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(int), cudaMemcpyHostToDevice);
// .. some kernel operations
cudaMemcpy(&num[0], d_num, num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);
Upvotes: 1
Reputation: 151849
This will create a valid pointer to the first element of the vector num
:
cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(int), cudaMemcpyHostToDevice);
^^^^^^^
This will not:
cudaMemcpy(num, &d_num[0], num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);
^^^
The name of your Vector object is not a pointer to its first data element. Instead, you should write that line in a similar fashion to the first one you wrote, like this:
cudaMemcpy(&num[0], d_num, num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);
However this by itself is not a fix. Note that d_num
is not a Vector
, but is already a pointer, so we can use it directly in these operations. Although it is not wrong to use &(d_num[0])
, it is unnecessary to do so.
Because d_num
is not a Vector
(as you have allocated it - it is a bare pointer to a set of int
quantities), your usage of Vector
methods in the kernel is also broken. If you want to use Vector
methods in the kernel, you will need to pass it an actual Vector
object, not just the data. Since passing an object will require device data handling within the object (data accessible on the host is not accessible on the device, and vice-versa), it is an extensive re-write of your Vector
class. I've made a limited attempt at that, showing one possible way forward. The basic methodology (ie. one possible approach) is as follows:
1. Before the kernel launch, copy the vector's element data from host to device; a to_device() method is used for this purpose. This method also switches our "reference" pointer (mData) to refer to the device-side copy of the Vector data.
2. Copy the Vector object itself to the device (d_num).
3. Decorate the methods that must run on the device with __device__ (here, __host__ __device__, so they remain usable on the host).
4. After the kernel, copy the modified element data back; a to_host() method is provided for this purpose.

Here is a worked example:
$ cat t101.cu
#include <iostream>
#include <cmath>
#include <iostream>
#include <cassert>
template <typename T>
class Vector
{
private:
// hData: host-side buffer (new[]); dData: device-side buffer (cudaMalloc);
// mData: the "active" pointer — aliases hData or dData depending on whether
// to_host() or to_device() was called last, so the same accessors work on
// whichever side currently owns the data.
T* mData, *hData, *dData; // data stored in vector
int mSize; // size of vector
public:
Vector(const Vector& otherVector); // Constructor
Vector(int size); // Constructor
~Vector(); // Destructor — frees both buffers
__host__ __device__ int GetSize() const; // get size of the vector
__host__ __device__ T& operator[](int i); // see element
// change element i (no bounds check, unlike operator[])
__host__ __device__ void set(size_t i, T value) {
mData[i] = value;
};
__host__ __device__ Vector<T>& operator=(const Vector<T>& otherVector);
void to_device(); // copy hData -> dData, point mData at the device buffer
void to_host(); // copy dData -> hData, point mData back at the host buffer
template <class S> // output vector
friend std::ostream& operator<<(std::ostream& output, Vector<S>& v);
};
// Overridden copy constructor
// Allocates host and device memory for the new vector, and copies entries of
// the other vector into the new host buffer.
// NOTE(review): reads otherVector.mData element-wise on the host — if
// otherVector is currently in device mode (after to_device()), that pointer
// refers to device memory and this loop would fault. Only copy vectors that
// are in host mode; the answer deliberately avoids copy construction by
// passing the object to the kernel via pointer.
template <typename T>
Vector<T>::Vector(const Vector& otherVector)
{
mSize = otherVector.GetSize();
hData = new T [mSize];
cudaMalloc(&dData, mSize*sizeof(T)); // NOTE(review): return status unchecked
mData = hData;
for (int i=0; i<mSize; i++)
{
mData[i] = otherVector.mData[i];
}
}
// Constructor for vector of a given size
// Allocates both a host buffer (new[]) and a device buffer (cudaMalloc) of
// `size` elements, points mData at the host copy, and value-initialises
// every element.
template <typename T>
Vector<T>::Vector(int size)
{
assert(size > 0); // empty vectors are not supported
mSize = size;
hData = new T [mSize];
cudaMalloc(&dData, mSize*sizeof(T)); // NOTE(review): return status unchecked
mData = hData; // start in host mode
for (int i=0; i<mSize; i++)
{
// T() instead of the double literal 0.0: value-initialisation works for
// any default-constructible T and avoids a double->T conversion
// (identical result for T = int: zero)
mData[i] = T();
}
}
// Overridden destructor to correctly free memory
// Frees the host buffer and, if set, the device buffer.
// NOTE(review): dData is only ever assigned by cudaMalloc in the constructors,
// so the null check is defensive; cudaFree(nullptr) is also a harmless no-op.
template <typename T>
Vector<T>::~Vector()
{
delete[] hData;
if (dData) cudaFree(dData);
}
// Method to get the size of a vector
// Returns the element count; callable from both host and device code.
template <typename T>
__host__ __device__
int Vector<T>::GetSize() const
{
return this->mSize;
}
// Bounds-checked element access using zero-based indexing.
// Callable on host and device; asserts trap on an out-of-range index.
template <typename T>
__host__ __device__
T& Vector<T>::operator[](int index)
{
assert(index >= 0);
assert(index < mSize);
return mData[index];
}
// Copy assignment: element-wise copy between equally-sized vectors.
// Sizes must already match; no reallocation or pointer-mode change occurs.
template <typename T>
__host__ __device__
Vector<T>& Vector<T>::operator=(const Vector<T>& otherVector)
{
assert(mSize == otherVector.mSize);
int idx = 0;
while (idx < mSize)
{
mData[idx] = otherVector.mData[idx];
++idx;
}
return *this;
}
// Stream-insertion operator: writes elements separated by single spaces.
// not callable on the device!
template <typename T>
std::ostream& operator<<(std::ostream& output, Vector<T>& v) {
const int count = v.mSize;
for (int idx = 0; idx < count; ++idx)
output << v[idx] << " ";
return output;
}
// Copy the element data from the host buffer to the device buffer and switch
// mData to refer to the device copy, so a subsequently-memcpy'd object uses
// device data inside kernels. Host-only; call before copying the object itself.
template <typename T>
void Vector<T>::to_device(){
cudaMemcpy(dData, hData, mSize*sizeof(T), cudaMemcpyHostToDevice); // NOTE(review): status unchecked
mData = dData;
}
// Copy the element data from the device buffer back to the host buffer and
// switch mData back to the host copy. Host-only; the blocking cudaMemcpy also
// synchronizes with prior kernel work on the default stream.
template <typename T>
void Vector<T>::to_host(){
cudaMemcpy(hData, dData, mSize*sizeof(T), cudaMemcpyDeviceToHost); // NOTE(review): status unchecked
mData = hData;
}
// Test kernel: mutates three elements of the device-resident Vector.
// Expects d_num to point at a Vector object in device memory whose mData
// already refers to device data (i.e. to_device() was called before the
// object was copied over). Launched with a single thread in main().
__global__ void alpha(Vector<int> *d_num)
{
d_num->set(0,100);
d_num->set(2,11);
(*d_num)[1] = 50;
}
int main()
{
Vector<int> num(10);
for (int i=0; i < num.GetSize(); ++i) num.set(i,i); // initialize elements to 0:9
std::cout << "Size of vector: " << num.GetSize() << "\n";
std::cout << num << "\n"; // print vector
// device-side copy of the Vector *object* itself (a shell holding size + pointers)
Vector<int> *d_num;
cudaMalloc(&d_num, sizeof(Vector<int>));
// push element data to the device; num's mData now points at device memory
num.to_device();
// copy the object (with its device data pointer) to the device
cudaMemcpy(d_num, &(num), sizeof(Vector<int>), cudaMemcpyHostToDevice);
// launch the kernel
alpha<<<1,1>>>(d_num);
// copy the modified array back to the host, overwriting the contents of h_arr
// (the blocking cudaMemcpy inside to_host also synchronizes with the kernel)
num.to_host();
std::cout << num << "\n";
// free GPU memory allocation and exit
// FIX: the object shell allocated above was previously leaked; num's own
// destructor frees only its hData/dData buffers, not this allocation
cudaFree(d_num);
return 0;
}
$ nvcc -arch=sm_61 -o t101 t101.cu
$ cuda-memcheck ./t101
========= CUDA-MEMCHECK
Size of vector: 10
0 1 2 3 4 5 6 7 8 9
100 50 11 3 4 5 6 7 8 9
========= ERROR SUMMARY: 0 errors
$
Notes:
According to my testing, your posted code had various compile errors so I had to make other changes to your Vector
class just to get it to compile.
Passing an object by value to the kernel will invoke the copy constructor, and subsequently the destructor, which makes things more difficult, therefore I have elected to pass the object via pointer (which is how you originally had it), to avoid this.
Your kernel call is launching 100 threads. Since they are all doing precisely the same thing, without any read activity going on, there's nothing particularly wrong with this, but I have changed it to just a single thread. It still demonstrates the same capability.
Upvotes: 1
Reputation: 71
Thrust is a library written for CUDA, and it has vectors: http://docs.nvidia.com/cuda/thrust/ Maybe it has all the functions you need, so why reinvent the wheel if you don't have to?
Upvotes: 0