Reputation: 75
I'm actually working with CUDA and i'm trying to optimize a program using this technology. So i've got a big kernel that I must launch between 100k+ time and 100M+ time maybe billions ?
So I read using a dim3 variable allow to launch that amount of threads(cf : https://devtalk.nvidia.com/default/topic/621867/size-limitation-for-1d-arrays-in-cuda-/?offset=7)
I got a sample code that (on my gtx970) run sometime and sometime doesn't.
#ifndef PROPAGATORSAT_CUH_
# define PROPAGATORSAT_CUH
# define M_PI (3.14159265358979323846)
# define TWO_PI (2 * M_PI)
# define TOTAL_TIME (615359.772)
# define STEP (0.771)
# define NB_IT (TOTAL_TIME / (double)STEP)
# define NB_THREADS (1024)
# define NB_BLOCKS (int)((NB_IT + NB_THREADS - 1) / NB_THREADS)
# include <cmath>
# include <cfloat>
# include <stdio.h>
# include "../common/book.h"
# include "cuda_runtime.h"
# include "device_launch_parameters.h"
# define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
class Global
{
public:
const double _ITURadEarth = 6378145.0;
const double _ITUGravCst = 3.986012E5;
const double _ITUJ2 = 0.001082636;
const double _J2000AngleDeg = 0;//-79.8058;
const double _J2000AngleRad = 0;//TO_RAD(_J2000AngleDeg);
const double _ITUAngleRateEarthRot = 4.1780745823E-3;
const double _ITUAngleRateEarthRotRad = degToRad(_ITUAngleRateEarthRot);
public:
__device__ double myAsin(double angle);
__host__ __device__ double myAcos(double angle);
__host__ __device__ double negPiToPi(double angle);
__host__ __device__ double degToRad(double angle);
__device__ double radToDeg(double angle);
};
class Cartesian
{
public:
double _X;
double _Y;
double _Z;
private:
double _m;
public:
__host__ __device__ Cartesian(double x, double y, double z) : _X(x), _Y(y), _Z(z), _m(-1) {}
};
class Propagator
{
public:
double _iDeg;
double _a;
double _omega_0;
double _OMEGA_0;
double _omega_r;
double _OMEGA_r;
double _rho;
double _SinI;
double _CosI;
double _p;
double _e;
double _ReKm;
double _n0;
double _n_bar;
double _M0;
double _sqrt_e;
int _orbitCase = -1;
double _WdeltaRad;
double _precessionRateRad;
double _artificialPrecessionRad = DBL_MIN;
double _simulationDuration = DBL_MIN;
double _incrementWdeltaRad;
void propagator(double smaKm,
double incDeg,
double e,
double raanDeg,
double aopDeg,
double trueAnomalyDeg,
bool stationKeeping,
double WdeltaDeg,
bool precessionMechanismSupplied,
double precessionRateDeg);
__device__ Cartesian evaluate(double timeSec,
double simulationDuration,
double artificialPrecessionRad,
bool ECImode);
__device__ double solveKepler(double M,
double e,
double epsilon);
__device__ Cartesian rotateOrbitalElements(Cartesian pq0,
double omega,
double OMEGA,
double CosI,
double SinI);
};
#endif /* !PROPAGATORSAT_CUH_ */
__host__ __device__ double Global::myAcos(double angle)
{
return (acos(((angle > 1) ? (1) : (angle < -1) ? (-1) : (angle))));
}
__device__ double Global::myAsin(double angle)
{
return (asin(((angle > 1) ? (1) : (angle < -1) ? (-1) : (angle))));
}
__host__ __device__ double Global::degToRad(double angle)
{
return (angle * M_PI / 180.0);
}
__device__ double Global::radToDeg(double angle)
{
return (angle * 180.0 / M_PI);
}
__host__ __device__ double Global::negPiToPi(double angle)
{
double output;
output = fmod(angle, TWO_PI);
output = fmod(angle + TWO_PI, TWO_PI);
return ((output > M_PI) ? (output - TWO_PI) : (output));
}
void Propagator::propagator(double smaKm, double incDeg, double e, double raanDeg, double aopDeg, double trueAnomalyDeg, bool stationKeeping, double WdeltaDeg, bool precessionMechanismSupplied, double precessionRateDeg)
{
double iRad, trueAnomalyRad, cosV, E, mu;
Global global;
_iDeg = incDeg;
iRad = global.degToRad(_iDeg);
_CosI = cos(iRad);
_SinI = sin(iRad);
_e = e;
_a = smaKm;
trueAnomalyRad = global.degToRad(trueAnomalyDeg);
if (e == 0)
_M0 = trueAnomalyRad;
else
{
cosV = cos(trueAnomalyRad);
E = global.myAcos((e + cosV) / (1 + e * cosV));
if (global.negPiToPi(trueAnomalyRad) < 0)
E = M_PI * 2 - E;
_M0 = E - e * sin(E);
}
_OMEGA_0 = global.degToRad(raanDeg);
_omega_0 = global.degToRad(aopDeg);
_p = _a * (1 - e * e);
_ReKm = global._ITURadEarth / 1000;
mu = global._ITUGravCst;
_n0 = sqrt(mu / pow(_a, 3));
_n_bar = _n0 * (1.0 + 1.5 * global._ITUJ2 * pow(_ReKm, 2) / pow(_p, 2) * (1.0 - 1.5 * pow(_SinI, 2)) * pow(1.0 - pow(e, 2), 0.5));
_OMEGA_r = -1.5 * global._ITUJ2 * pow(_ReKm, 2) / pow(_p, 2) * _n_bar * _CosI;
_omega_r = 1.5 * global._ITUJ2 * pow(_ReKm, 2) / pow(_p, 2) * _n_bar * (2.0 - 2.5 * pow(_SinI, 2));
_sqrt_e = sqrt((1 + e) / (1 - e));
_WdeltaRad = global.degToRad(WdeltaDeg);
_precessionRateRad = global.degToRad(precessionRateDeg);
if (stationKeeping == false)
_orbitCase = 1;
else if (precessionMechanismSupplied == false)
_orbitCase = 2;
else
_orbitCase = 3;
}
__device__ Cartesian Propagator::rotateOrbitalElements(Cartesian pq0, double omega, double OMEGA, double CosI, double SinI)
{
double CosOMEGA, SinOMEGA, CosOmega, SinOmega, R11, R12, R13, R21, R22, R23, R31, R32, R33, x, y, z;
CosOMEGA = cos(OMEGA);
SinOMEGA = sin(OMEGA);
CosOmega = cos(omega);
SinOmega = sin(omega);
R11 = CosOMEGA * CosOmega - SinOMEGA * SinOmega * CosI;
R12 = -CosOMEGA * SinOmega - SinOMEGA * CosOmega * CosI;
R13 = SinOMEGA * SinI;
R21 = SinOMEGA * CosOmega + CosOMEGA * SinOmega * CosI;
R22 = -SinOMEGA * SinOmega + CosOMEGA * CosOmega * CosI;
R23 = -CosOMEGA * SinI;
R31 = SinOmega * SinI;
R32 = CosOmega * SinI;
R33 = CosI;
x = R11 * pq0._X + R12 * pq0._Y + R13 * pq0._Z;
y = R21 * pq0._X + R22 * pq0._Y + R23 * pq0._Z;
z = R31 * pq0._X + R32 * pq0._Y + R33 * pq0._Z;
Cartesian cart = Cartesian(x, y, z);
return (cart);
}
__device__ Cartesian Propagator::evaluate(double timeSec, double simulationDuration, double artificialPrecessionRad, bool ECImode = true)
{
double M, E, v, cosV, sinV, rotationAngleECF, omega, OMEGA;
Global global;
if (_simulationDuration != simulationDuration || _artificialPrecessionRad != artificialPrecessionRad)
{
_simulationDuration = simulationDuration;
_artificialPrecessionRad = artificialPrecessionRad;
_incrementWdeltaRad = (_WdeltaRad * 2) / _simulationDuration;
}
M = _M0 + ((_orbitCase == 3) ? _n0 : _n_bar) * timeSec;
E = E = (_e == 0) ? M : solveKepler(M, _e, 1e-8);
v = 2.0 * atan(_sqrt_e * tan(E / 2));
cosV = cos(v);
sinV = sin(v);
_rho = _p / (1 + _e * cosV);
rotationAngleECF = (ECImode) ? 0 : -1 * (global._J2000AngleRad + timeSec * global._ITUAngleRateEarthRotRad);
omega = _omega_0 + ((_orbitCase == 3) ? 0 : _omega_r * timeSec);
OMEGA = _OMEGA_0 + rotationAngleECF + ((_orbitCase == 3) ? 0 : _OMEGA_r * timeSec);
if (_orbitCase == 1)
OMEGA += artificialPrecessionRad * timeSec;
else if (_orbitCase == 2)
OMEGA += _WdeltaRad * ((2.0 * timeSec / _simulationDuration) - 1);
else if (_orbitCase == 3)
OMEGA += _precessionRateRad * timeSec - _WdeltaRad + _incrementWdeltaRad * timeSec;
Cartesian pq0 = Cartesian(1000 * _rho * cosV, 1000 * _rho * sinV, 0);
Cartesian positionECI = Propagator::rotateOrbitalElements(pq0, omega, OMEGA, _CosI, _SinI);
return (positionECI);
}
__device__ double Propagator::solveKepler(double M, double e, double epsilon)
{
double En, Ens;
En = M;
Ens = En - (En - e * sin(En) - M) / (1 - e * cos(En));
while (abs(Ens - En) > epsilon)
{
En = Ens;
Ens = En - (En - e * sin(En) - M) / (1 - e * cos(En));
}
return (Ens);
}
__global__ void kernel(Propagator *CUDA_prop)
{
size_t tid;
tid = (blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z) * blockDim.x + threadIdx.x;
//if (tid < NB_IT)
Cartesian positionNGSOsatECI = CUDA_prop[0].evaluate(STEP * tid, 615359.772, 0);
}
int main(void)
{
cudaEvent_t start, stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start, 0));
Propagator prop[1], *CUDA_prop;
dim3 block(1000, 1, 1);
dim3 thread(1024, 1, 1);
prop[0].propagator(7847.3, 53, 0, 18, 0, 67.5, true, 5, true, 3.4000000596279278E-05);
HANDLE_ERROR(cudaMalloc((void **)&CUDA_prop, sizeof(Propagator)));
HANDLE_ERROR(cudaMemcpy(CUDA_prop, prop, sizeof(Propagator), cudaMemcpyHostToDevice));
kernel <<< block, thread >>> (CUDA_prop);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
HANDLE_ERROR(cudaFree(CUDA_prop));
HANDLE_ERROR(cudaEventRecord(stop, 0));
HANDLE_ERROR(cudaEventSynchronize(stop));
float elapsedTime;
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
printf("time : %f ms\n", elapsedTime);
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
return (0);
}
If i launch that amount of "threads ?" it work until approximately 300k blocks. But sometime for the same amount it doesn't work. I get an error : "unknown error" from the line :
gpuErrchk(cudaDeviceSynchronize());
or cudaFree, or some function after the kernel call. enter code here
if I launch with only 1k blocks and 1k threads and use cuda-memcheck I get the same error as before but without cuda-memcheck it run great.
I don't know what cause this problem and how to solve it
NB : HANDLE_ERROR macro can be changed by gpuErrchk maccro, it's a define from a library that do exactly the same thing
And I wanted to know also how to determine the maximum amount of threads I can launch with spec' of the hardware or anything.
Upvotes: 1
Views: 650
Reputation: 7255
On Windows using the WDDM driver multiple kernel launches can get batched up to reduce launch overhead. As the watchdog timer applies to an entire batch, this can trigger a timeout even if each kernel by itself finishes within the chosen timeout value.
A cheap way of enforcing immediate execution of all kernels batched up so far is a call to cudaStreamQuery(0)
. Unlike calling cudaDeviceSynchronize()
this will return immediately and not wait for the kernels to complete.
Scattering cudaStreamQuery(0)
between kernel invocations thus ensures the WDDM timeout only applies to the kernels between two cudaStreamQuery(0)
calls.
If even a single kernel takes too long and triggers the watchdog, try splitting it up into multiple invocations with fewer blocks each, and again call cudaStreamQuery(0)
in between. This not only makes the watchdog happy, but also keeps the GUI somewhat reactive.
Upvotes: 1