Reputation: 29
I am looking for a way to perform operations over the columns of a matrix. I have an MxN matrix, and I want to apply a cuBLAS function (for example nrm2) to each column.
The result I expect to get is: M x 1
How can I do that?
Upvotes: 0
Views: 497
Reputation: 72339
CUBLAS has no batched Level 1 routines, so there is no direct way to compute the column or row norms in a single call. You can do it by calling nrm2 many times in a loop over all the rows or columns of the matrix, for example:
#include <cublas_v2.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include <cstdlib>
#include <iostream>
#include <vector>
// Functor mapping an index n to a pseudo-random float.
// Each call default-constructs a thrust::default_random_engine, skips n
// values with discard(n), and then samples once from
// thrust::uniform_real_distribution<float>(a, b).
struct prg
{
    float a, b;  // bounds handed to the uniform distribution

    // Stores the distribution bounds; defaults are 0 and 1.
    __host__ __device__
    prg(float _a = 0.f, float _b = 1.f)
    {
        a = _a;
        b = _b;
    }

    // Returns the sample obtained after advancing a fresh engine by n steps.
    __host__ __device__
    float operator()(const unsigned int n) const
    {
        thrust::default_random_engine engine;
        engine.discard(n);

        thrust::uniform_real_distribution<float> uniform(a, b);
        return uniform(engine);
    }
};
// Fills an M x N matrix with pseudo-random values on the device, computes the
// Euclidean norm of each of its N columns with one cublasSnrm2 call per
// column, and prints the N norms, one per line.
//
// Column `col` is taken to be the M contiguous floats starting at
// m_d + col * M. Every CUDA / cuBLAS status is checked; on failure a
// diagnostic is written to std::cerr and the program exits with EXIT_FAILURE.
int main(void)
{
    const int M = 1024, N = M;  // M entries per column, N columns
    const int num = N * M;

    // Abort with a readable message if a CUDA runtime call failed.
    auto checkCuda = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    // Abort if a cuBLAS call failed. The numeric status is printed because
    // cublasGetStatusString is not available in older toolkits.
    auto checkCublas = [](cublasStatus_t status, const char* what) {
        if (status != CUBLAS_STATUS_SUCCESS) {
            std::cerr << what << " failed with cuBLAS status "
                      << static_cast<int>(status) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    thrust::device_vector<float> matrix(num);
    // Results are pre-filled with -1 so a norm that was never written stands out.
    thrust::device_vector<float> norms_d(N, -1.0f);

    // Fill the matrix: element i receives prg(1, 2) evaluated at index i.
    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
    thrust::transform(index_sequence_begin,
                      index_sequence_begin + num,
                      matrix.begin(),
                      prg(1.f, 2.f));

    float* m_d = thrust::raw_pointer_cast(matrix.data());
    float* v_d = thrust::raw_pointer_cast(norms_d.data());

    cudaStream_t stream;
    checkCuda(cudaStreamCreate(&stream), "cudaStreamCreate");

    cublasHandle_t handle;
    checkCublas(cublasCreate(&handle), "cublasCreate");
    // Device pointer mode: nrm2 writes its scalar result straight to device
    // memory (v_d + col) instead of a host variable.
    checkCublas(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE),
                "cublasSetPointerMode");
    checkCublas(cublasSetStream(handle, stream), "cublasSetStream");

    for (int col = 0; col < N; col++) {
        // size_t offset so col * M cannot overflow int for larger matrices.
        const float* column = m_d + static_cast<size_t>(col) * M;
        checkCublas(cublasSnrm2(handle, M, column, 1, v_d + col), "cublasSnrm2");
    }

    // Wait for all queued nrm2 calls and surface any asynchronous error.
    checkCuda(cudaStreamSynchronize(stream), "cudaStreamSynchronize");

    // Fetch all norms with one transfer rather than one per element.
    std::vector<float> norms(N);
    checkCuda(cudaMemcpy(norms.data(), v_d, norms.size() * sizeof(float),
                         cudaMemcpyDeviceToHost),
              "cudaMemcpy");

    // Release the library handle and the stream before leaving.
    checkCublas(cublasDestroy(handle), "cublasDestroy");
    checkCuda(cudaStreamDestroy(stream), "cudaStreamDestroy");

    for (float normval : norms) {
        std::cout << normval << std::endl;
    }

    return 0;
}
Unless you have very large rows or columns, there is little scope to exploit streams to run simultaneous kernels and reduce the overall runtime, because each nrm2 call will be too short. So there is a lot of latency in running lots of individual kernels, which will negatively affect performance.
A much better alternative would be to write your own kernel to do this.
Upvotes: 2