gr1ph00n
gr1ph00n

Reputation: 65

Numerical error in cuda/cublas simple kernel using particular input

I am working with cuda and cublas and I was trying to implement simple operations like matrix element-wise multiplication/division. I am using only float for my experiments. I know the most obvious way to do it is to write a kernel like this one:

// Elementwise scale: dest[i] = source[i] * value for every i in [0, n).
// Uses a grid-stride loop, so any launch configuration covers the whole
// array; pass value = 1/x to divide instead of multiply.
__global__ void mul_elementwise(const unsigned int n, float* source, float* dest, const float value)
{
    const unsigned int start = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int step  = blockDim.x * gridDim.x;
    for (unsigned int idx = start; idx < n; idx += step)
        dest[idx] = source[idx] * value;
}

This kernel can work both for multiplication and division (just using 1/x as value). But this can be achieved using cublas library too: suppose we have a matrix A m x n stored in column-major style and a scalar x, then setting alpha = x or alpha = 1/x and d_ones as a vector of m*n 1s, we can invoke and obtain the same result

cublasSaxpy(cublas_handle, m * n, &alpha, d_ones, 1, A_dev, 1);

Both methods work just fine, but I am facing a few problems with one particular matrix, for which both methods do not work. I isolated this big matrix and built an MCVE, available here (you can compile it with nvcc mcve.cu -lcublas). As you can see, the results in both cases are totally wrong: the host result is completely different, and I am trying to figure out what's going on. I do not see any error in the code, but maybe I should try to use double instead of float and see what happens. Any opinions on this situation? Thanks in advance!

EDIT #1: I tried using doubles, but nothing changes if I use cublasDaxpy, whereas it works perfectly with the custom kernel. I think the values are too small, so single floating-point precision is not enough.

Upvotes: 0

Views: 220

Answers (1)

Robert Crovella
Robert Crovella

Reputation: 151829

Interesting MCVE. Wouldn't it have been possible to shrink your vector down to just a few elements? Isn't it possible to show the calculation discrepancy based on just 1 vector element?

Anyway I see several problems.

  1. Your kernel implements the following function: y=alpha*x. But SAXPY implements y=alpha*x+y. Now, if y started out as (all) zero, then these two would be the same. But that's not what you have:

             CUBLAS    Your Kernel
             ---------------------------
    alpha:   alpha     alpha
        x:       1     ahost    (ahost is  your huge data array)
        y:   ahost         -
    

    So your kernel is computing y=alpha * ahost, but your CUBLAS call is computing y = alpha*1 + ahost. I wouldn't expect the same result from these, in general.

  2. Your analysis of error seems flawed in a few ways. First, you are computing the absolute error in a float variable (a number which will always be positive, since it's the absolute value), but then you're comparing it against a negative number:

                    float diff = abs(host[i]-dev[i]);
                    ...
                    if (diff > (-1e12))
    

    won't that if test always be true? Perhaps you meant 1e-12 although that would still be flawed. Looking for a fixed error threshold on a floating point comparison should be scaled to the size of the numbers being compared. float quantities only contain about 6-7 accurate decimal digits. (And summing these errors is also troublesome.)

Here is a complete code that has the above issues fixed, and produces zero sum error for all the comparisons (host<->kernel and host<->cublas):

// Test vector for the MCVE: the first entries are zero, followed by small
// nonzero values.
// NOTE(review): these hex literals are *integer* constants converted to
// float by value (e.g. 0xB58DA1CF becomes ~3.05e9f); C++ does NOT
// reinterpret them as IEEE-754 bit patterns here. Host and device both
// consume the same converted values, so the comparison below remains
// self-consistent — confirm this encoding is what was intended.
static float array[] = {0x00000000,
0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0xB58DA1CF,0xB50D2FEC,0x34A48536,0xB4A1D5BC,0x358E1345,0x35943AAC,0xB5983F40,0xB43628BB,0xB4A95348,0xB4DB751C,0xB50C8D1A,0xB3EFCBB5,0x3552B8CD,0x3538A167,0x358FDE0D,0xB4D54CE9,0xB5D29BB7,0xB4A234EE,0x346EF2F4,0x35B5D9F2,0xB40F1487,0x3554BC20,0x33FD9466,0xB536D37D,0xB3C2E594,0xB59DA581,0x3584FC87,0x34438F09,0x35D293CB,0xB4FBB002,0xB59F41E9};

#include <iostream>
#include <stdio.h>
#include <cublas_v2.h>
#include <assert.h>

// Relative-error tolerance for the host/device comparison. float carries
// only ~6-7 accurate decimal digits, so an absolute threshold is wrong —
// errors must be judged relative to the magnitude of the values compared.
#define TOL 0.0001

typedef unsigned int u32;

// Grid-stride-loop helpers: total number of threads in the grid, and this
// thread's flat global index within the grid.
#define GET_STRIDE() u32(blockDim.x * gridDim.x)
#define GET_OFFSET() u32(blockIdx.x * blockDim.x + threadIdx.x)

// Wrap a CUDA runtime call: report any failure to stderr, and in debug
// builds also assert so execution stops at the faulting call. Returns the
// original status unchanged so call sites can still inspect it.
inline
cudaError_t checkCuda(cudaError_t result)
{
  // Report unconditionally: the original guarded this whole body with
  // DEBUG/_DEBUG, so release builds silently discarded every CUDA error.
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
#if defined(DEBUG) || defined(_DEBUG)
    assert(result == cudaSuccess);
#endif
  }
  return result;
}

// Grid-stride elementwise scale: dest[i] = source[i] * value for all n
// elements. Despite the name, it multiplies — pass value = 1/x to divide.
__global__ void div_elementwise(const u32 n, float* source, float* dest, const float value)
{
        const u32 start  = GET_OFFSET();
        const u32 stride = GET_STRIDE();
        for (u32 idx = start; idx < n; idx += stride)
                dest[idx] = source[idx] * value;
}

// Compare device results against the host reference element by element.
// Accumulates and returns the sum of relative errors |(host - dev)/host|,
// printing any element whose relative error exceeds TOL.
float check_eq(float* dev, float* host, u32 len)
{
        float sum = 0.0f;
        for (u32 i = 0; i < len; ++i)
        {
                if (dev[i] != host[i])
                {
                        // fabsf, not abs: plain abs can bind to the integer
                        // overload and truncate the error to zero.
                        float denom = fabsf(host[i]);
                        // Guard the division: if the reference value is 0 the
                        // relative error is undefined, so fall back to the
                        // absolute deviation instead of dividing by zero.
                        float diff = (denom > 0.0f)
                                ? fabsf(host[i] - dev[i]) / denom
                                : fabsf(dev[i]);
                        sum += diff;
                        if (diff > TOL)
                                printf("diff %u %f\n", i, diff);  // %u: i is unsigned
                }
        }
        printf("%f\n", sum);
        return sum;
}

// CPU reference: scale the array in place, a[i] *= v for all len elements.
// Division by x is obtained by passing v = 1/x.
void div_host(float* a, float v, u32 len)
{
        for (u32 idx = 0; idx < len; ++idx)
        {
                a[idx] *= v;
        }
}

int main()
{
        u32 len = sizeof(array)/sizeof(float);
        printf("array len = %u\n", len);  // %u: len is unsigned

        // Reject NaN inputs up front: NaN compares unequal to everything and
        // would poison the error sums below. (u32 index avoids the original
        // signed/unsigned comparison warning.)
        for (u32 i = 0; i < len; i++)
                if (isnan(array[i])) { printf("nan value at %u\n", i); return -1; }

        float *adev, *adevcublas, *d_zero;
        float* ahost = (float*) malloc(len * sizeof(float));

        checkCuda(cudaMalloc(&adev, len * sizeof(float)));
        checkCuda(cudaMalloc(&adevcublas, len * sizeof(float)));
        checkCuda(cudaMalloc(&d_zero, len * sizeof(float)));

        memcpy(ahost, &array[0], len * sizeof(float));
        checkCuda(cudaMemcpy(adev, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
        checkCuda(cudaMemcpy(adevcublas, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
        // The SAXPY y operand must start at zero so y = alpha*x + y == alpha*x.
        checkCuda(cudaMemset(d_zero, 0, len * sizeof(float)));

        float alpha = 1/2494.f;
        printf("%f\n", alpha);

        // CPU reference result (in place in ahost).
        div_host(ahost, alpha, len);

        // Custom kernel: adev = adev * alpha (in place).
        u32 tb = 256;
        div_elementwise<<<((len + tb - 1) / tb), tb>>>(len, adev, adev, alpha);
        // Kernel launches return no status directly; query it explicitly so a
        // bad launch configuration does not pass silently.
        checkCuda(cudaGetLastError());

        float* r = (float*) malloc(len * sizeof(float));
        checkCuda(cudaMemcpy(r, adev, len * sizeof(float), cudaMemcpyDeviceToHost));
        check_eq(r, ahost, len);

        cublasHandle_t ch;
        cublasStatus_t stat = cublasCreate(&ch);
        if (stat != CUBLAS_STATUS_SUCCESS) {std::cout << "CUBLAS init error: " << (int)stat << std::endl; return 1;}

        float* r0 = (float*) malloc(len * sizeof(float));

        // CUBLAS path: d_zero = alpha * adevcublas + d_zero, with d_zero all
        // zeros, so the result matches the kernel's alpha * ahost.
        stat = cublasSaxpy(ch, len, &alpha, adevcublas, 1, d_zero, 1);
        if (stat != CUBLAS_STATUS_SUCCESS) {std::cout << "CUBLAS error: " << (int)stat << std::endl; return 1;}

        checkCuda(cudaMemcpy(r0, d_zero, len * sizeof(float), cudaMemcpyDeviceToHost));
        check_eq(r0, ahost, len);

        // Release everything acquired above (the original leaked adevcublas,
        // d_zero, and the cublas handle).
        cublasDestroy(ch);
        free(r);
        free(r0);
        free(ahost);
        cudaFree(adev);
        cudaFree(adevcublas);
        cudaFree(d_zero);

        return 0;
}

Upvotes: 2

Related Questions