Reputation: 135
I am trying to use the vector::data() pointer when using cudaMalloc, cudaMemcpy, and cublasSgemm but I can't seem to get it to work. If I am not mistaken, vector::data() should return a pointer to the actual array stored in memory for that vector so it should be the same as having a T* aArray pointer to an array of type T stored in memory. Using the latter does work, but not the data() pointer.
Here is the code I am working on:
Matrix<T> Matrix<T>::cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C)
{
C = Matrix<T>(A.height, B.width); //resizing of the vector of elements for Matrix C
//A[m][n]*B[n][k]=C[m][k]
int m = A.height;
int n = B.height;
int k = B.width;
float alpha = 1.0f;
float beta = 0.0f;
T* d_a = A.GetPointer();
T* d_b = B.GetPointer();
T* d_c = C.GetPointer();
cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);
cudaMemcpy(d_a,A.GetPointer(),A.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,B.GetPointer(),B.size,cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! CUBLAS initialization error\n";
}
status = cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,k,m,n,&alpha,d_b,k,d_a,n,&beta,d_c,k);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! kernel execution error.\n";
}
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! shutdown error (A)\n";
}
cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
The GetPointer() member function returns vector::data() of the vector of elements for that Matrix object. Size is the vector element's size in memory.
The vector of Matrix C returns all zeros when using the data() pointer, and returns the product of Matrix A and B when using T* aArray pointers without vectors.
Is it actually possible to use vectors to store the array of elements and then the data() pointer to initialize the device copy of the array, or am I forced to use the C style array storage on the host? Also, I have tried using thrust::device_vector and that works but I would like to stay away from creating raw_pointer_casts.
Thanks for your help!
Edit: For those having trouble with copy and pasting, here is the complete example:
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_device_runtime_api.h>
#include <cublas_v2.h>
#include <vector>
#include <iostream>
using namespace std;
template<typename T> class Matrix
{
public:
~Matrix();
Matrix();
Matrix(int rows, int columns);
int width;
int height;
int stride;
size_t size;
T &GetElement(int row, int column);
void SetElement(int row, int column, T value);
void SetElements(vector<T> value);
vector<T>& GetElements();
T* GetPointer();
Matrix<T> cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C);
private:
vector<T> elements;
T* firstElement;
};
template<typename T>
Matrix<T>::~Matrix()
{
}
template<typename T>
Matrix<T>::Matrix()
{
}
template<typename T>
Matrix<T>::Matrix(int rows, int columns)
{
height = rows;
width = columns;
stride = columns; //in row major order this is equal to the # of columns
elements.resize(rows*columns);
firstElement = elements.data();
size = height*width*sizeof(T);
}
template<typename T>
T &Matrix<T>::GetElement(int row, int column)
{
return elements[row*width + column]; //row major order return
}
template<typename T>
vector<T>& Matrix<T>::GetElements()
{
return elements; //row major order return
}
template<typename T>
void Matrix<T>::SetElement(int row, int column, T value)
{
elements[row*width + column] = value; //row major order return
}
template<typename T>
void Matrix<T>::SetElements(vector<T> value)
{
elements = value;
}
template<typename T>
T* Matrix<T>::GetPointer()
{
return firstElement;
}
template<typename T>
//Matrix Multiplication using CUDA
Matrix<T> Matrix<T>::cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C)
{
C = Matrix<T>(A.height, B.width);
//A[m][n]*B[n][k]=C[m][k]
int m = A.height;
int n = B.height;
int k = B.width;
float alpha = 1.0f;
float beta = 0.0f;
//Thrust usage
/*thrust::device_vector<T> d_A = A.GetElements();
T* d_a = thrust::raw_pointer_cast(&d_A[0]);
thrust::device_vector<T> d_B = B.GetElements();
T* d_b = thrust::raw_pointer_cast(&d_B[0]);
thrust::device_vector<T> d_C = C.GetElements();
T* d_c = thrust::raw_pointer_cast(&d_C[0]);*/
T* d_a = A.GetPointer();
T* d_b = B.GetPointer();
T* d_c = C.GetPointer();
cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);
cudaMemcpy(d_a,A.GetPointer(),A.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,B.GetPointer(),B.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_c,C.GetPointer(),C.size,cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! CUBLAS initialization error\n";
}
status = cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,k,m,n,&alpha,d_b,k,d_a,n,&beta,d_c,k);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! kernel execution error.\n";
}
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
std::cerr << "!!!! shutdown error (A)\n";
}
//thrust::copy(d_C.begin(), d_C.end(), C.GetElements().begin());
cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return C;
}
int main()
{
Matrix<float> A(2,2);
Matrix<float> B(2,2);
Matrix<float> C;
vector<float> aE(4,2);
vector<float> bE(4,4);
A.SetElements(aE);
B.SetElements(bE);
C = C.cudaProd(A, B, C); //function call to cudaProd()
for(int row = 0; row < A.height; ++row)
{
for(int col = 0; col < A.width; ++col)
{
cout<<A.GetElement(row, col)<<" "; //h_c is stored on device in column major order, need to switch to row major order
}
printf("\n");
}
printf("\n");
for(int row = 0; row < B.height; ++row)
{
for(int col = 0; col < B.width; ++col)
{
cout<<B.GetElement(row, col)<<" "; //h_c is stored on device in column major order, need to switch to row major order
}
printf("\n");
}
printf("\n");
for(int row = 0; row < C.height; ++row)
{
for(int col = 0; col < C.width; ++col)
{
cout<<C.GetElement(row, col)<<" "; //h_c is stored on device in column major order, need to switch to row major order
}
printf("\n");
}
printf("\n");
}
Upvotes: 0
Views: 353
Reputation: 76270
If I am not mistaken,
vector::data()
should return a pointer to the actual array stored in memory for that vector so it should be the same as having aT*
aArray pointer to an array of typeT
stored in memory.
The std::vector
class is an owning resource class. It means that trying to manage the underlying resource yourself with the data
pointer will make you enter a world of pain.
For this very same reason:
cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);
and:
cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);
and:
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
cannot possibly work.
Upvotes: 1
Reputation: 21495
From the std::vector::data documentation, data()
returns both const
and non-const
qualified pointers, depending on the fact that the vector
is qualified as const
or not. Quoting the documentation
If the vector object is const-qualified, the function returns a pointer to const value_type. Otherwise, it returns a pointer to value_type.
Accordingly, using
firstElement = elements.data();
in the Matrix
constructor is fine to read/write the data.
The main problem with your code is that you are declaring C
in the main
, passing a reference to C
to the cudaProd
method and then internally using
C = Matrix<T>(A.height, B.width);
which will redeclare the Matrix
.
If you change the definition of the cudaProd
method to
template<typename T>
void cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C)
remove the
return C;
statement and allocate space for C
in the main as
Matrix<float> C(2,2);
vector<float> cE(4,10);
C.SetElements(cE);
your code should work correctly.
Upvotes: 0