Why does my CUDA C code not become faster with single precision?

Question

On Fermi-generation GPUs, single-precision calculation should be 2 times faster than double precision. However, although I rewrote every 'double' declaration as 'float', I got no speed-up. Is there a mistake somewhere, e.g. in the compile options?

GPU:Tesla C2075 OS:win7 pro Compiler:VS2013(nvcc) CUDA:v.7.5 Command line:nvcc test.cu

I wrote test code:

#include
#include
#include
#include
#include

#include
#include 
#include
#include

#define DOUBLE 1

#define MAXI 10

// Multiplies one element of `a` in place by the double-precision constant 1.0;
// each thread handles the element at blockIdx.x * nthreadx + threadIdx.x.
//   a        - buffer to update; it is indexed without a bounds check, so it
//              must hold an element for every index the launch produces.
//   nthreadx - per-block index stride; presumably the launch's block size
//              (blockDim.x) -- confirm against the caller.
__global__ void Kernel_double(double*a,int nthreadx)
{
    const int idx = threadIdx.x + blockIdx.x * nthreadx;
    const double unit = 1.e0;
    a[idx] = a[idx] * unit;
}
// Multiplies one element of `a` in place by the single-precision constant
// 1.0F; each thread handles the element at blockIdx.x * nthreadx + threadIdx.x.
//   a        - buffer to update; it is indexed without a bounds check, so it
//              must hold an element for every index the launch produces.
//   nthreadx - per-block index stride; presumably the launch's block size
//              (blockDim.x) -- confirm against the caller.
__global__ void Kernel_float(float*a,int nthreadx)
{
    const int idx = threadIdx.x + blockIdx.x * nthreadx;
    const float unit = 1.0F;
    a[idx] = a[idx] * unit;
}

// Benchmark driver for the question: launches Kernel_double or Kernel_float
// (selected at compile time by DOUBLE), copies MAXI elements back to the host
// after the launch, and prints the elapsed time measured with clock().
int main()
{
#if DOUBLE
    double a[10];
    // NOTE(review): the next line looks truncated by text extraction -- the
    // text between `i` and `> >` is missing. Presumably it held the
    // initialisation loop, the d_a/grid/block/clock declarations, the device
    // allocation, the start_clock assignment, the timing-loop header and the
    // Kernel_double launch. Recover it from the original post before compiling.
    for(int i=0;i> > (d_a, 2);
        // Copy the MAXI results back to the host array after the launch.
        cudaMemcpy(a, d_a, sizeof(double)*(MAXI), cudaMemcpyDeviceToHost);
#else
        // Single-precision variant: the value 2 is passed as nthreadx.
        Kernel_float << < grid, block >> > (d_a, 2);
        cudaMemcpy(a, d_a, sizeof(float)*(MAXI), cudaMemcpyDeviceToHost);
#endif
    } // presumably closes the timing loop whose header was lost above

    // Stop the timer. The cudaMemcpy calls above run before this point, so
    // their cost is part of the measured interval.
    end_clock = clock();
    // Convert clock ticks to seconds.
    sec_clock = (end_clock - start_clock) / (double)CLOCKS_PER_SEC;
    // NOTE(review): both format strings below are split across two source
    // lines, presumably a "\n" escape that extraction turned into a real
    // line break -- confirm against the original post.
    printf("[%d] %f[s]
", __LINE__, sec_clock);
    printf("[%d] end
", __LINE__);

    return 0;
}

Regis Portalez · Accepted Answer

Well, after some investigation, that's because you just perform a multiplication by the constant 1, which gets optimized to "do nothing" in the binary:

If instead you square the array (to prevent this trivial optimization), you get the following assembly:

and the performance gains are restored with the (simplified) piece of code below, in which I changed a few things:

way larger array (100M)
using blockDim.x instead of an argument parameter
use better kernel configuration for my machine (GTX 980)
allocate input array on heap instead of stack (to allow more than 1M)

here is the code:

#include
#include
#include
#include
#include

#include
#include 
#include
#include

#define DOUBLE float

#define ITER 10
#define MAXI 100000000

// Squares the elements of `a` in place. Each thread starts at index
// blockIdx.x * blockDim.x + threadIdx.x and advances by
// blockDim.x * gridDim.x until the index reaches MAXI.
//   a - buffer of DOUBLE elements; every index below MAXI may be accessed, so
//       it must hold at least MAXI elements.
__global__ void kernel(DOUBLE*a)
{
    // Total number of threads in the launch; used as the per-iteration step.
    const unsigned int step = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    while (idx < MAXI)
    {
        const DOUBLE value = a[idx];
        a[idx] = value * value;
        idx += step;
    }
}

// Benchmark driver for the answer: allocates MAXI elements of type DOUBLE on
// the host heap, runs `kernel` on d_a, waits for the device to finish, and
// prints the elapsed time measured with clock().
int main()
{
    // Heap allocation rather than a stack array, so MAXI can be very large.
    DOUBLE* a = (DOUBLE*) malloc(MAXI*sizeof(DOUBLE));
    // NOTE(review): the next line looks truncated by text extraction -- the
    // text between `i` and `>>>` is missing. Presumably it held the
    // initialisation loop, the d_a/clock declarations, the device allocation
    // and host-to-device copy, the start_clock assignment, the timing-loop
    // header and the kernel launch configuration. Recover it from the
    // original post before compiling.
    for(int i=0;i>>> (d_a);
    } // presumably closes the timing loop whose header was lost above
    // Block until all queued device work has finished, so the timer below
    // is stopped only after the kernels have run.
    cudaDeviceSynchronize();

    end_clock = clock();
    // The copy back to the host is issued after end_clock is taken, so it is
    // not part of the measured interval.
    cudaMemcpy(a, d_a, sizeof(DOUBLE)*(MAXI), cudaMemcpyDeviceToHost);
    // Convert clock ticks to seconds.
    sec_clock = (end_clock - start_clock) / (double)CLOCKS_PER_SEC;
    // NOTE(review): both format strings below are split across two source
    // lines, presumably a "\n" escape that extraction turned into a real
    // line break -- confirm against the original post.
    // NOTE(review): CLOCKS_PER_SEC is printed with %d; confirm that its type
    // is int on the target platform.
    printf("[%d] %f/%d[s]
", __LINE__, sec_clock, CLOCKS_PER_SEC);
    printf("[%d] end
", __LINE__);

    // NOTE(review): no free(a) or cudaFree(d_a) is visible in this listing.
    return 0;
}

(You'll notice I allocate an array of length 100M to get measurable performance.)

Why does my CUDA C code not become faster with single precision?

Answers (1)

Related Questions