using a pointer to vector::data() for cublasSgemm

Question

I am trying to use the vector::data() pointer when using cudaMalloc, cudaMemcpy, and cublasSgemm but I can't seem to get it to work. If I am not mistaken, vector::data() should return a pointer to the actual array stored in memory for that vector so it should be the same as having a T* aArray pointer to an array of type T stored in memory. Using the latter does work, but not the data() pointer.

Here is the code I am working on:

Matrix Matrix::cudaProd(Matrix&A,Matrix&B, Matrix&C)
{
C = Matrix(A.height, B.width); //resizing of the vector of elements for Matrix C
//A[m][n]*B[n][k]=C[m][k]
int m = A.height;
int n = B.height;
int k = B.width;
float alpha = 1.0f;
float beta = 0.0f;

T* d_a = A.GetPointer();
T* d_b = B.GetPointer();
T* d_c = C.GetPointer();

cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);

cudaMemcpy(d_a,A.GetPointer(),A.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,B.GetPointer(),B.size,cudaMemcpyHostToDevice);

cublasHandle_t handle;

cublasStatus_t status = cublasCreate(&handle);

if (status != CUBLAS_STATUS_SUCCESS) 
{
    std::cerr << "!!!! CUBLAS initialization error
";
}

status = cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,k,m,n,&alpha,d_b,k,d_a,n,&beta,d_c,k);

if (status != CUBLAS_STATUS_SUCCESS) 
{
    std::cerr << "!!!! kernel execution error.
";
}

status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS) 
{
    std::cerr << "!!!! shutdown error (A)
";
}

cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);

cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

The GetPointer() member function returns vector::data() of the vector of elements for that Matrix object. Size is the vector element's size in memory.

The vector of Matrix C returns all zeros when using the data() pointer, and returns the product of Matrix A and B when using T* aArray pointers without vectors.

Is it actually possible to use vectors to store the array of elements and then the data() pointer to initialize the device copy of the array, or am I forced to use the C style array storage on the host? Also, I have tried using thrust::device_vector and that works but I would like to stay away from creating raw_pointer_casts.

Thanks for your help!

Edit: For those having trouble with copy and pasting, here is the complete example:

#include 
#include 
#include 
#include 
#include 
#include 

using namespace std;

template class Matrix
{
public:
~Matrix();
Matrix();
Matrix(int rows, int columns);
int width;
int height;
int stride;
size_t size;

T &GetElement(int row, int column);
void SetElement(int row, int column, T value);
void SetElements(vector value);
vector& GetElements();
T* GetPointer();
Matrix cudaProd(Matrix&A,Matrix&B, Matrix&C);
private:
vector elements;
T* firstElement;
};

template
Matrix::~Matrix()
{
}

template
Matrix::Matrix()
{
}

template
Matrix::Matrix(int rows, int columns)
{
height = rows;
width = columns;
stride = columns; //in row major order this is equal to the # of columns
elements.resize(rows*columns);
firstElement = elements.data();
size = height*width*sizeof(T);
}

template
T &Matrix::GetElement(int row, int column)
{
return elements[row*width + column]; //row major order return
}

template
vector& Matrix::GetElements()
{
return elements; //row major order return
}

template
void Matrix::SetElement(int row, int column, T value)
{
elements[row*width + column] = value; //row major order return
}

template
void Matrix::SetElements(vector value)
{
elements = value;
}

template
T* Matrix::GetPointer()
{
return firstElement;
}


template
//Matrix Multiplication using CUDA
Matrix Matrix::cudaProd(Matrix&A,Matrix&B, Matrix&C)
{
C = Matrix(A.height, B.width);
//A[m][n]*B[n][k]=C[m][k]
int m = A.height;
int n = B.height;
int k = B.width;
float alpha = 1.0f;
float beta = 0.0f;


//Thrust usage

/*thrust::device_vector d_A = A.GetElements();
T* d_a = thrust::raw_pointer_cast(&d_A[0]);
thrust::device_vector d_B = B.GetElements();
T* d_b = thrust::raw_pointer_cast(&d_B[0]);
thrust::device_vector d_C = C.GetElements();
T* d_c = thrust::raw_pointer_cast(&d_C[0]);*/

T* d_a = A.GetPointer();
T* d_b = B.GetPointer();
T* d_c = C.GetPointer();

cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);

cudaMemcpy(d_a,A.GetPointer(),A.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,B.GetPointer(),B.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_c,C.GetPointer(),C.size,cudaMemcpyHostToDevice);

cublasHandle_t handle;

cublasStatus_t status = cublasCreate(&handle);

if (status != CUBLAS_STATUS_SUCCESS) 
{
    std::cerr << "!!!! CUBLAS initialization error
";
}

status = cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,k,m,n,&alpha,d_b,k,d_a,n,&beta,d_c,k);

if (status != CUBLAS_STATUS_SUCCESS) 
{
    std::cerr << "!!!! kernel execution error.
";
}

status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS) 
{
    std::cerr << "!!!! shutdown error (A)
";
}

//thrust::copy(d_C.begin(), d_C.end(), C.GetElements().begin());

cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);

cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

return C;

}

int main()
{
    Matrix A(2,2);
Matrix B(2,2);
Matrix C;

vector aE(4,2);
vector bE(4,4);
A.SetElements(aE);
B.SetElements(bE);

C = C.cudaProd(A, B, C);  //function call to cudaProd()

for(int row = 0; row < A.height; ++row)
{
    for(int col = 0; col < A.width; ++col)
    {       
        cout<

Vitality · Accepted Answer

From the std::vector::data documentation, data() returns both const and non-const qualified pointers, depending on the fact that the vector is qualified as const or not. Quoting the documentation

If the vector object is const-qualified, the function returns a pointer to const value_type. Otherwise, it returns a pointer to value_type.

Accordingly, using

firstElement = elements.data();

in the Matrix constructor is fine to read/write the data.

The main problem with your code is that you are declaring C in the main, passing a reference to C to the cudaProd method and then internally using

C = Matrix(A.height, B.width);

which will redeclare the Matrix.

If you change the definition of the cudaProd method to

template
void cudaProd(Matrix&A,Matrix&B, Matrix&C)

remove the

return C;

statement and allocate space for C in the main as

Matrix C(2,2);
vector cE(4,10);
C.SetElements(cE);

your code should work correctly.

using a pointer to vector<T>::data() for cublasSgemm

Answers (2)

Related Questions

using a pointer to vector&lt;T&gt;::data() for cublasSgemm

Answers (2)

Related Questions

using a pointer to vector<T>::data() for cublasSgemm