CUDA kernel only launches and runs at some grid sizes

Question

I'm actually working with CUDA and i'm trying to optimize a program using this technology. So i've got a big kernel that I must launch between 100k+ time and 100M+ time maybe billions ?

So I read using a dim3 variable allow to launch that amount of threads(cf : https://devtalk.nvidia.com/default/topic/621867/size-limitation-for-1d-arrays-in-cuda-/?offset=7)

I got a sample code that (on my gtx970) run sometime and sometime doesn't.

#ifndef PROPAGATORSAT_CUH_
# define PROPAGATORSAT_CUH
# define M_PI (3.14159265358979323846)
# define TWO_PI (2 * M_PI)
# define TOTAL_TIME (615359.772)
# define STEP (0.771)
# define NB_IT (TOTAL_TIME / (double)STEP)
# define NB_THREADS (1024)
# define NB_BLOCKS (int)((NB_IT + NB_THREADS - 1) / NB_THREADS)

# include 
# include 
# include 
# include "../common/book.h"
# include "cuda_runtime.h"
# include "device_launch_parameters.h"
# define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d
", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

class                           Global
{
public:
    const double                _ITURadEarth = 6378145.0;
    const double                _ITUGravCst = 3.986012E5;
    const double                _ITUJ2 = 0.001082636;
    const double                _J2000AngleDeg = 0;//-79.8058;
    const double                _J2000AngleRad = 0;//TO_RAD(_J2000AngleDeg);
    const double                _ITUAngleRateEarthRot = 4.1780745823E-3;
    const double                _ITUAngleRateEarthRotRad = degToRad(_ITUAngleRateEarthRot);

public:
             __device__ double  myAsin(double angle);
    __host__ __device__ double  myAcos(double angle);
    __host__ __device__ double  negPiToPi(double angle);
    __host__ __device__ double  degToRad(double angle);
             __device__ double  radToDeg(double angle);
};

class                               Cartesian
{
public:
    double                          _X;
    double                          _Y;
    double                          _Z;

private:
    double                          _m;

public:
    __host__ __device__             Cartesian(double x, double y, double z) : _X(x), _Y(y), _Z(z), _m(-1) {}
};

class                       Propagator
{
public:
    double                  _iDeg;
    double                  _a;
    double                  _omega_0;
    double                  _OMEGA_0;
    double                  _omega_r;
    double                  _OMEGA_r;
    double                  _rho;
    double                  _SinI;
    double                  _CosI;
    double                  _p;
    double                  _e;
    double                  _ReKm;
    double                  _n0;
    double                  _n_bar;
    double                  _M0;
    double                  _sqrt_e;
    int                     _orbitCase = -1;
    double                  _WdeltaRad;
    double                  _precessionRateRad;
    double                  _artificialPrecessionRad = DBL_MIN;
    double                  _simulationDuration = DBL_MIN;
    double                  _incrementWdeltaRad;

    void                    propagator(double               smaKm,
                                       double               incDeg,
                                       double               e,
                                       double               raanDeg,
                                       double               aopDeg,
                                       double               trueAnomalyDeg,
                                       bool                 stationKeeping,
                                       double               WdeltaDeg,
                                       bool                 precessionMechanismSupplied,
                                       double               precessionRateDeg);
    __device__ Cartesian    evaluate(double                 timeSec,
                                     double                 simulationDuration,
                                     double                 artificialPrecessionRad,
                                     bool                   ECImode);
    __device__ double       solveKepler(double              M,
                                        double              e,
                                        double              epsilon);
    __device__ Cartesian    rotateOrbitalElements(Cartesian pq0,
                                                  double    omega,
                                                  double    OMEGA,
                                                  double    CosI,
                                                  double    SinI);
};

#endif /* !PROPAGATORSAT_CUH_ */

__host__ __device__ double  Global::myAcos(double angle)
{
    return (acos(((angle > 1) ? (1) : (angle < -1) ? (-1) : (angle))));
}

__device__ double   Global::myAsin(double angle)
{
    return (asin(((angle > 1) ? (1) : (angle < -1) ? (-1) : (angle))));
}

__host__ __device__ double  Global::degToRad(double angle)
{
    return (angle * M_PI / 180.0);
}

__device__ double   Global::radToDeg(double angle)
{
    return (angle * 180.0 / M_PI);
}

__host__ __device__ double  Global::negPiToPi(double angle)
{
    double          output;

    output = fmod(angle, TWO_PI);
    output = fmod(angle + TWO_PI, TWO_PI);
    return ((output > M_PI) ? (output - TWO_PI) : (output));
}

void        Propagator::propagator(double smaKm, double incDeg, double e, double raanDeg, double aopDeg, double trueAnomalyDeg, bool stationKeeping, double WdeltaDeg, bool precessionMechanismSupplied, double precessionRateDeg)
{
    double          iRad, trueAnomalyRad, cosV, E, mu;
    Global          global;

    _iDeg = incDeg;
    iRad = global.degToRad(_iDeg);
    _CosI = cos(iRad);
    _SinI = sin(iRad);
    _e = e;
    _a = smaKm;
    trueAnomalyRad = global.degToRad(trueAnomalyDeg);
    if (e == 0)
        _M0 = trueAnomalyRad;
    else
    {
        cosV = cos(trueAnomalyRad);
        E = global.myAcos((e + cosV) / (1 + e * cosV));
        if (global.negPiToPi(trueAnomalyRad) < 0)
            E = M_PI * 2 - E;
        _M0 = E - e * sin(E);
    }
    _OMEGA_0 = global.degToRad(raanDeg);
    _omega_0 = global.degToRad(aopDeg);
    _p = _a * (1 - e * e);
    _ReKm = global._ITURadEarth / 1000;
    mu = global._ITUGravCst;
    _n0 = sqrt(mu / pow(_a, 3));
    _n_bar = _n0 * (1.0 + 1.5 * global._ITUJ2 * pow(_ReKm, 2) / pow(_p, 2) * (1.0 - 1.5 * pow(_SinI, 2)) * pow(1.0 - pow(e, 2), 0.5));
    _OMEGA_r = -1.5 * global._ITUJ2 * pow(_ReKm, 2) / pow(_p, 2) * _n_bar * _CosI;
    _omega_r = 1.5 * global._ITUJ2 * pow(_ReKm, 2) / pow(_p, 2) * _n_bar * (2.0 - 2.5 * pow(_SinI, 2));
    _sqrt_e = sqrt((1 + e) / (1 - e));
    _WdeltaRad = global.degToRad(WdeltaDeg);
    _precessionRateRad = global.degToRad(precessionRateDeg);
    if (stationKeeping == false)
        _orbitCase = 1;
    else if (precessionMechanismSupplied == false)
        _orbitCase = 2;
    else
        _orbitCase = 3;
}

__device__ Cartesian    Propagator::rotateOrbitalElements(Cartesian pq0, double omega, double OMEGA, double CosI, double SinI)
{
    double              CosOMEGA, SinOMEGA, CosOmega, SinOmega, R11, R12, R13, R21, R22, R23, R31, R32, R33, x, y, z;

    CosOMEGA = cos(OMEGA);
    SinOMEGA = sin(OMEGA);
    CosOmega = cos(omega);
    SinOmega = sin(omega);
    R11 = CosOMEGA * CosOmega - SinOMEGA * SinOmega * CosI;
    R12 = -CosOMEGA * SinOmega - SinOMEGA * CosOmega * CosI;
    R13 = SinOMEGA * SinI;
    R21 = SinOMEGA * CosOmega + CosOMEGA * SinOmega * CosI;
    R22 = -SinOMEGA * SinOmega + CosOMEGA * CosOmega * CosI;
    R23 = -CosOMEGA * SinI;
    R31 = SinOmega * SinI;
    R32 = CosOmega * SinI;
    R33 = CosI;
    x = R11 * pq0._X + R12 * pq0._Y + R13 * pq0._Z;
    y = R21 * pq0._X + R22 * pq0._Y + R23 * pq0._Z;
    z = R31 * pq0._X + R32 * pq0._Y + R33 * pq0._Z;
    Cartesian           cart = Cartesian(x, y, z);

    return (cart);
}
__device__ Cartesian    Propagator::evaluate(double timeSec, double simulationDuration, double artificialPrecessionRad, bool ECImode = true)
{
    double              M, E, v, cosV, sinV, rotationAngleECF, omega, OMEGA;
    Global              global;

    if (_simulationDuration != simulationDuration || _artificialPrecessionRad != artificialPrecessionRad)
    {
        _simulationDuration = simulationDuration;
        _artificialPrecessionRad = artificialPrecessionRad;
        _incrementWdeltaRad = (_WdeltaRad * 2) / _simulationDuration;
    }
    M = _M0 + ((_orbitCase == 3) ? _n0 : _n_bar) * timeSec;
    E = E = (_e == 0) ? M : solveKepler(M, _e, 1e-8);
    v = 2.0 * atan(_sqrt_e * tan(E / 2));
    cosV = cos(v);
    sinV = sin(v);
    _rho = _p / (1 + _e * cosV);
    rotationAngleECF = (ECImode) ? 0 : -1 * (global._J2000AngleRad + timeSec * global._ITUAngleRateEarthRotRad);
    omega = _omega_0 + ((_orbitCase == 3) ? 0 : _omega_r * timeSec);
    OMEGA = _OMEGA_0 + rotationAngleECF + ((_orbitCase == 3) ? 0 : _OMEGA_r * timeSec);
    if (_orbitCase == 1)
        OMEGA += artificialPrecessionRad * timeSec;
    else if (_orbitCase == 2)
        OMEGA += _WdeltaRad * ((2.0 * timeSec / _simulationDuration) - 1);
    else if (_orbitCase == 3)
        OMEGA += _precessionRateRad * timeSec - _WdeltaRad + _incrementWdeltaRad * timeSec;
    Cartesian pq0 = Cartesian(1000 * _rho * cosV, 1000 * _rho * sinV, 0);

    Cartesian positionECI = Propagator::rotateOrbitalElements(pq0, omega, OMEGA, _CosI, _SinI);

    return (positionECI);
}
__device__ double   Propagator::solveKepler(double M, double e, double epsilon)
{
    double          En, Ens;

    En = M;
    Ens = En - (En - e * sin(En) - M) / (1 - e * cos(En));
    while (abs(Ens - En) > epsilon)
    {
        En = Ens;
        Ens = En - (En - e * sin(En) - M) / (1 - e * cos(En));
    }
    return (Ens);
}

__global__ void kernel(Propagator *CUDA_prop)
{
    size_t      tid;

    tid = (blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z) * blockDim.x + threadIdx.x;
    //if (tid < NB_IT)
    Cartesian positionNGSOsatECI = CUDA_prop[0].evaluate(STEP * tid, 615359.772, 0);
}

int     main(void)
{
    cudaEvent_t     start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));
    Propagator  prop[1], *CUDA_prop;
    dim3        block(1000, 1, 1);
    dim3        thread(1024, 1, 1);

    prop[0].propagator(7847.3, 53, 0, 18, 0, 67.5, true, 5, true, 3.4000000596279278E-05);
    HANDLE_ERROR(cudaMalloc((void **)&CUDA_prop, sizeof(Propagator)));
    HANDLE_ERROR(cudaMemcpy(CUDA_prop, prop, sizeof(Propagator), cudaMemcpyHostToDevice));
    kernel <<< block, thread >>> (CUDA_prop);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    HANDLE_ERROR(cudaFree(CUDA_prop));
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    float           elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("time : %f ms
", elapsedTime);
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    return (0);
}

If i launch that amount of "threads ?" it work until approximately 300k blocks. But sometime for the same amount it doesn't work. I get an error : "unknown error" from the line :

gpuErrchk(cudaDeviceSynchronize());

or cudaFree, or some function after the kernel call. enter code here

if I launch with only 1k blocks and 1k threads and use cuda-memcheck I get the same error as before but without cuda-memcheck it run great.

I don't know what cause this problem and how to solve it

NB : HANDLE_ERROR macro can be changed by gpuErrchk maccro, it's a define from a library that do exactly the same thing

And I wanted to know also how to determine the maximum amount of threads I can launch with spec' of the hardware or anything.

tera · Accepted Answer

On Windows using the WDDM driver multiple kernel launches can get batched up to reduce launch overhead. As the watchdog timer applies to an entire batch, this can trigger a timeout even if each kernel by itself finishes within the chosen timeout value.

A cheap way of enforcing immediate execution of all kernels batched up so far is a call to cudaStreamQuery(0). Unlike calling cudaDeviceSynchronize() this will return immediately and not wait for the kernels to complete.

Scattering cudaStreamQuery(0) between kernel invocations thus ensures the WDDM timeout only applies to the kernels between two cudaStreamQuery(0) calls.

If even a single kernel takes too long and triggers the watchdog, try splitting it up into multiple invocations with fewer blocks each, and again call cudaStreamQuery(0) in between. This not only makes the watchdog happy, but also keeps the GUI somewhat reactive.

CUDA kernel only launches and runs at some grid sizes

Answers (1)

Related Questions