Reputation: 7699
Update: The title is now misleading. Originally I could make the error disappear by unrolling the block loop in the code below. Now even a simpler code change makes it disappear (see the code example below).
Background:
A CUDA C kernel implementing the Cholesky decomposition of a 12x12 matrix results in a fairly large kernel (280 lines of code, lots of loops).
I reproduced the error with a reduced setup (code below). NVCC (CUDA 4.2), invoked with
nvcc -arch sm_20 -o main main.cu
and executed on a Fermi card under Linux, reports:
kernel call: unspecified launch failure
The kernel body contains a conditional preprocessor block (#if 1, #else, #endif). I inserted it so I can easily switch between a working and a non-working version. Compiling the first alternative results in the unspecified launch failure, whereas the second alternative runs fine.
The tricky part is that the code actually executed should be identical in either case (hasOrderedRep is true!).
One can still make the error disappear even when the #if 1 statement is left untouched: for that, the block loop must be unrolled. This is where the title comes from.
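To make clear what I mean by unrolling, here is a minimal toy example (not the actual kernel below, just an illustration of the rolled vs. manually unrolled form):

__global__ void toy(float* out)
{
    // rolled form, analogous to the block loop in the kernel below
    for (int block = 0; block < 2; block++) {
        out[block] = 2.0f * block;
    }
}

__global__ void toy_unrolled(float* out)
{
    // manually unrolled form; doing this to the real block loop
    // makes the launch failure disappear
    { const int block = 0; out[block] = 2.0f * block; }
    { const int block = 1; out[block] = 2.0f * block; }
}

The full reproducer follows.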
#include "cuda.h"
#include "stdio.h"
#include <iostream>
#include <string>
using namespace std;
/////////////////////////////////////
//
// First some basic types I need
// Implementation of a templated
// scalar and complex number
template<class T> class RScalar
{
public:
__device__ RScalar() {}
__device__ ~RScalar() {}
template<class T1> __device__
RScalar(const RScalar<T1>& rhs) : F(rhs.elem()) {}
template<class T1> __device__
RScalar(const T1& rhs) : F(rhs) {}
template<class T1> __device__ inline
RScalar& operator=(const RScalar<T1>& rhs) {
elem() = rhs.elem();
return *this;
}
public:
__device__ T& elem() {return F;}
__device__ const T& elem() const {return F;}
private:
T F;
};
template<class T> class RComplex
{
public:
__device__ RComplex() {}
__device__ ~RComplex() {}
template<class T1, class T2> __device__
RComplex(const RScalar<T1>& _re, const RScalar<T2>& _im):
re(_re.elem()), im(_im.elem()) {}
template<class T1, class T2> __device__
RComplex(const T1& _re, const T2& _im): re(_re), im(_im) {}
template<class T1> __device__
RComplex(const T1& _re): re(_re), im() {}
template<class T1>
__device__ inline
RComplex& operator*=(const RScalar<T1>& rhs)
{
real() *= rhs.elem();
imag() *= rhs.elem();
return *this;
}
template<class T1>
__device__ inline
RComplex& operator-=(const RComplex<T1>& rhs)
{
real() -= rhs.real();
imag() -= rhs.imag();
return *this;
}
template<class T1> __device__ inline
RComplex& operator/=(const RComplex<T1>& rhs)
{
RComplex<T> d;
d = *this / rhs;
real() = d.real();
imag() = d.imag();
return *this;
}
public:
__device__ T& real() {return re;}
__device__ const T& real() const {return re;}
__device__ T& imag() {return im;}
__device__ const T& imag() const {return im;}
private:
T re;
T im;
};
template<class T> __device__ RComplex<T>
operator*(const RComplex<T>& __restrict__ l,
const RComplex<T>& __restrict__ r)
{
return RComplex<T>(l.real()*r.real() - l.imag()*r.imag(),
l.real()*r.imag() + l.imag()*r.real());
}
template<class T> __device__ RComplex<T>
operator/(const RComplex<T>& l, const RComplex<T>& r)
{
T tmp = T(1.0) / (r.real()*r.real() + r.imag()*r.imag());
return RComplex<T>((l.real()*r.real() + l.imag()*r.imag()) * tmp,
(l.imag()*r.real() - l.real()*r.imag()) * tmp);
}
template<class T> __device__ RComplex<T>
operator*(const RComplex<T>& l, const RScalar<T>& r)
{
return RComplex<T>(l.real()*r.elem(),
l.imag()*r.elem());
}
//
//
//////////////////////////////////////////////
#define REALT float
#define Nc 3
struct PrimitiveClovTriang
{
RScalar<REALT> diag[2][2*Nc];
RComplex<REALT> offd[2][2*Nc*Nc-Nc];
};
__global__ void kernel(bool hasOrderedRep, int * siteTable,
PrimitiveClovTriang* tri)
{
RScalar<REALT> zip=0;
int N = 2*Nc;
int site;
//
// First if-block results in an error,
// second, runs fine! Since hasOrderedRep
// is true, the code blocks should be
// identical.
//
#if 1
if (hasOrderedRep) {
site = blockDim.x * blockIdx.x +
blockDim.x * gridDim.x * blockIdx.y +
threadIdx.x;
} else {
int idx0 = blockDim.x * blockIdx.x +
blockDim.x * gridDim.x * blockIdx.y +
threadIdx.x;
site = ((int*)(siteTable))[idx0];
}
#else
site = blockDim.x * blockIdx.x + blockDim.x * gridDim.x * blockIdx.y + threadIdx.x;
#endif
int site_neg_logdet=0;
for(int block=0; block < 2; block++) {
RScalar<REALT> inv_d[6];
RComplex<REALT> inv_offd[15];
RComplex<REALT> v[6];
RScalar<REALT> diag_g[6];
for(int i=0; i < N; i++) {
inv_d[i] = tri[site].diag[block][i];
}
for(int i=0; i < 15; i++) {
inv_offd[i] =tri[site].offd[block][i];
}
for(int j=0; j < N; ++j) {
for(int i=0; i < j; i++) {
int elem_ji = j*(j-1)/2 + i;
RComplex<REALT> A_ii = RComplex<REALT>( inv_d[i], zip );
v[i] = A_ii*RComplex<REALT>(inv_offd[elem_ji].real(),-inv_offd[elem_ji].imag());
}
v[j] = RComplex<REALT>(inv_d[j],zip);
for(int k=0; k < j; k++) {
int elem_jk = j*(j-1)/2 + k;
v[j] -= inv_offd[elem_jk]*v[k];
}
inv_d[j].elem() = v[j].real();
for(int k=j+1; k < N; k++) {
int elem_kj = k*(k-1)/2 + j;
for(int l=0; l < j; l++) {
int elem_kl = k*(k-1)/2 + l;
inv_offd[elem_kj] -= inv_offd[elem_kl] * v[l];
}
inv_offd[elem_kj] /= v[j];
}
}
RScalar<REALT> one;
one.elem() = (REALT)1;
for(int i=0; i < N; i++) {
diag_g[i].elem() = one.elem()/inv_d[i].elem();
// ((PScalar<PScalar<RScalar<float> > > *)(args->dev_ptr[ 1 ] ))[site] .elem().elem().elem() += log(fabs(inv_d[i].elem()));
if( inv_d[i].elem() < 0 ) {
site_neg_logdet++;
}
}
RComplex<REALT> sum;
for(int k = 0; k < N; ++k) {
for(int i = 0; i < k; ++i) {
v[i].real()=v[i].imag()=0;
}
v[k] = RComplex<REALT>(diag_g[k],zip);
for(int i = k+1; i < N; ++i) {
v[i].real()=v[i].imag()=0;
for(int j = k; j < i; ++j) {
int elem_ij = i*(i-1)/2+j;
v[i] -= inv_offd[elem_ij] *inv_d[j]*v[j];
}
v[i] *= diag_g[i];
}
for(int i = N-2; (int)i >= (int)k; --i) {
for(int j = i+1; j < N; ++j) {
int elem_ji = j*(j-1)/2 + i;
v[i] -= RComplex<REALT>(inv_offd[elem_ji].real(),-inv_offd[elem_ji].imag()) * v[j];
}
}
inv_d[k].elem() = v[k].real();
for(int i = k+1; i < N; ++i) {
int elem_ik = i*(i-1)/2+k;
inv_offd[elem_ik] = v[i];
}
}
for(int i=0; i < N; i++) {
tri[site].diag[block][i] = inv_d[i];
}
for(int i=0; i < 15; i++) {
tri[site].offd[block][i] = inv_offd[i];
}
}
if( site_neg_logdet != 0 ) {
}
}
int main()
{
int sites=1;
dim3 blocksPerGrid( 1 , 1 , 1 );
dim3 threadsPerBlock( sites , 1, 1);
PrimitiveClovTriang* tri_dev;
int * siteTable;
cudaMalloc( (void**)&tri_dev , sizeof(PrimitiveClovTriang) * sites );
cudaMalloc( (void**)&siteTable , sizeof(int) * sites );
bool ord=true;
kernel<<< blocksPerGrid , threadsPerBlock , 0 >>>( ord , siteTable , tri_dev );
cudaDeviceSynchronize();
cudaError_t kernel_call = cudaGetLastError();
cout << "kernel call: " << string(cudaGetErrorString(kernel_call)) << endl;
cudaFree(tri_dev);
cudaFree(siteTable);
return(0);
}
Note that siteTable contains uninitialized data. That's fine, since it's not used; I only need it to make the error appear.
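If one wants to double-check that the uninitialized buffer is not the cause, it can be zeroed before the launch (a sketch using the same pointers as in main above; since the siteTable branch is never taken, this should not change the behaviour):

cudaMemset(siteTable, 0, sizeof(int) * sites);
cudaMemset(tri_dev, 0, sizeof(PrimitiveClovTriang) * sites);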
Update:
I just tried this on a different machine with CUDA 4.0 installed (same Fermi card model); there the error does not appear. It might really be an NVCC bug in CUDA 4.2. Since the toolchain switched to an LLVM-based front end with CUDA 4.1, a compiler bug seems plausible.
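To be sure which toolkit the binary was actually built against and which driver it runs on, one can query both at runtime (standard runtime API calls; e.g. 4020 corresponds to CUDA 4.2):

int runtimeVersion = 0, driverVersion = 0;
cudaRuntimeGetVersion(&runtimeVersion);  // version of the CUDA runtime the binary uses
cudaDriverGetVersion(&driverVersion);    // latest CUDA version supported by the driver
cout << "runtime: " << runtimeVersion << ", driver: " << driverVersion << endl;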
Upvotes: 2
Views: 278
Reputation: 72349
This problem appeared to be caused by a subtle compiler bug in an early release of the LLVM-based CUDA toolchain. The error doesn't reproduce on more modern versions of the toolchain (CUDA 6.x and 7.x).
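If upgrading the toolkit is not an option, the usual mitigations for suspected code-generation bugs of that era are to restructure the affected code (as the question itself demonstrates) or, purely as a diagnostic, to lower the device-side optimisation level, e.g.:

nvcc -arch sm_20 -Xptxas -O0 -o main main.cu

Whether this actually helps depends on where the miscompilation happens (front end vs. ptxas), so it is only a way to narrow the problem down, not a fix.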
[This answer has been added as a community wiki entry from comments and edits to get the question off the unanswered list for the CUDA tag.]
Upvotes: 1