Reputation: 1
When I run the following code to compute the matrix multiplication y = X * B:
#include <iostream>
#include <Eigen/Dense>
#include <cuda_runtime.h>
#include "cublas_v2.h"
using namespace Eigen;
int main(){
    int N = 240000;
    int K = 3;
    int p = 9700;
    MatrixXf X_host = MatrixXf::Zero(N, p);
    MatrixXf B_host = MatrixXf::Zero(p, K);
    MatrixXf y_host(N, K);
    float *X_dev;
    float *B_dev;
    float *y_dev;
    cudaMalloc((void**)&X_dev, sizeof(float) * p * N);
    cudaMalloc((void**)&B_dev, sizeof(float) * p * K);
    cudaMalloc((void**)&y_dev, sizeof(float) * N * K);
    cudaMemcpy(X_dev, X_host.data(), sizeof(float)*p*N, cudaMemcpyHostToDevice);
    cudaMemcpy(B_dev, B_host.data(), sizeof(float)*p*K, cudaMemcpyHostToDevice);
    cublasHandle_t handle;
    cublasCreate(&handle);
    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess)
    {
        std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
    } else {
        std::cout << "No problem before cublas call\n";
    }
    float alpha = 1.0;
    float beta = 0.0;
    // y = X * B: (N x p) * (p x K), column-major (Eigen default), so lda = N, ldb = p, ldc = N
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                N, K, p, &alpha,
                X_dev, N, B_dev, p, &beta, y_dev, N);
    cudaDeviceSynchronize();
    error = cudaGetLastError();
    if(error != cudaSuccess)
    {
        std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
    }
    cublasDestroy(handle);
    cudaFree(X_dev);
    cudaFree(B_dev);
    cudaFree(y_dev);
    return 0;
}
I got this error from cuda-memcheck:
========= Invalid __global__ read of size 4
========= at 0x00000a88 in void gemmSN_NN_kernel<float, int=256, int=4, int=2, int=8, int=4, int=4, cublasGemvTensorStridedBatched<float const >, cublasGemvTensorStridedBatched<float>>(cublasGemmSmallNParams<float const , cublasGemvTensorStridedBatched<float const >, float>)
========= by thread (223,0,0) in block (190,0,0)
========= Address 0x2b660269807c is out of bounds
There are about 100 such out-of-bounds errors, and their number varies between runs. The problem disappears when I set K to a larger number (for example, 10). Does anyone have an idea what might be going on? I'm using CUDA 10.1 on a P100, CentOS 7. Thanks!
Update on September 21, 2020: This issue is gone after I updated to CUDA 11.
Upvotes: 0
Views: 160
Reputation: 72349
As mentioned in the comments, this would appear to be an internal issue in the CUBLAS library. I would editorialize and guess that they don't have test coverage for this unusual problem shape, with such a small output dimension (K = 3), and that this bug passed through pre-release testing undetected.
As is usual with likely bugs, your best bet is to submit the code in your question as a repro case in a ticket on the NVIDIA developer portal.
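If you do file a ticket, a repro that also checks the cuBLAS status codes and compares the device result against the same product computed on the host tends to be more useful, since with all-zero inputs a wrong numerical result would be invisible. Below is a minimal sketch along those lines; the Random initialization and the host-side Eigen comparison are my own additions, not part of your original code:

#include <iostream>
#include <Eigen/Dense>
#include <cuda_runtime.h>
#include "cublas_v2.h"

int main() {
    const int N = 240000, K = 3, p = 9700;
    // Random data so that a wrong GPU result is actually detectable.
    Eigen::MatrixXf X = Eigen::MatrixXf::Random(N, p);
    Eigen::MatrixXf B = Eigen::MatrixXf::Random(p, K);
    Eigen::MatrixXf y(N, K);

    float *X_dev, *B_dev, *y_dev;
    cudaMalloc((void**)&X_dev, sizeof(float) * N * p);
    cudaMalloc((void**)&B_dev, sizeof(float) * p * K);
    cudaMalloc((void**)&y_dev, sizeof(float) * N * K);
    cudaMemcpy(X_dev, X.data(), sizeof(float) * N * p, cudaMemcpyHostToDevice);
    cudaMemcpy(B_dev, B.data(), sizeof(float) * p * K, cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    float alpha = 1.0f, beta = 0.0f;
    // Same GEMM call as in the question, but this time capture the cuBLAS status too.
    cublasStatus_t stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                      N, K, p, &alpha,
                                      X_dev, N, B_dev, p, &beta, y_dev, N);
    if (stat != CUBLAS_STATUS_SUCCESS)
        std::cout << "cublasSgemm returned status " << static_cast<int>(stat) << std::endl;

    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl;

    // Copy the result back and compare against Eigen's host-side product.
    cudaMemcpy(y.data(), y_dev, sizeof(float) * N * K, cudaMemcpyDeviceToHost);
    Eigen::MatrixXf y_ref = X * B;
    std::cout << "max abs difference vs. Eigen: "
              << (y - y_ref).cwiseAbs().maxCoeff() << std::endl;

    cublasDestroy(handle);
    cudaFree(X_dev);
    cudaFree(B_dev);
    cudaFree(y_dev);
    return 0;
}

Run under cuda-memcheck as before; having both the out-of-bounds reports and a concrete numerical discrepancy (or the absence of one) in the same log makes the ticket easier for NVIDIA to triage.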
Upvotes: 1