NVCC doesn't unroll small nested loop

Question

I was wondering, why NVCC isn't able to unroll the following Cholesky factorization kernel for small matrices (N=4).

template
__device__ inline
void choleskyKernel2(T* C){
#pragma unroll
    for (int i = 0; i < N; i++){
#pragma unroll
        for (int j = 0; j <= i; j++) {
            double s = 0;
#pragma unroll
            for (int k = 0; k < j; k++){
                s += C[i*N+k] * C[j*N+k];
            }
            s = C[i*N+j] - s;
            C[i*N+j] = (i == j) ?
                        sqrt(s) :
                        (1.0 / C[j*N+j] * (s));
        }
    }
}

The generated PTX code looks like this:

sqrt.rn.f64     %fd12, %fd29;
    st.local.f64    [%rd1], %fd12;
    rcp.rn.f64  %fd34, %fd12;
    mul.f64     %fd13, %fd30, %fd34;
    st.local.f64    [%rd1+32], %fd13;
    fma.rn.f64  %fd35, %fd13, %fd13, 0d0000000000000000;
    sub.f64     %fd36, %fd31, %fd35;
    sqrt.rn.f64     %fd14, %fd36;
    st.local.f64    [%rd1+40], %fd14;
    mul.f64     %fd15, %fd32, %fd34;
    st.local.f64    [%rd1+64], %fd15;
    ld.local.f64    %fd37, [%rd1+32];
    fma.rn.f64  %fd38, %fd15, %fd37, 0d0000000000000000;
    sub.f64     %fd39, %fd33, %fd38;
    rcp.rn.f64  %fd40, %fd14;
    mul.f64     %fd16, %fd39, %fd40;
    st.local.f64    [%rd1+72], %fd16;
    mov.f64     %fd58, 0d0000000000000000;
    mov.u32     %r58, -2;
    mov.u64     %rd40, -8;

BB1_5:
    shl.b64     %rd23, %rd40, 3;
    sub.s64     %rd24, %rd1, %rd23;
    ld.local.f64    %fd41, [%rd24];
    fma.rn.f64  %fd58, %fd41, %fd41, %fd58;
    add.s64     %rd40, %rd40, -1;
    add.s32     %r58, %r58, 1;
    setp.ne.s32 %p3, %r58, 0;
    @%p3 bra    BB1_5;

    sub.f64     %fd43, %fd6, %fd58;
    sqrt.rn.f64     %fd19, %fd43;
    st.local.f64    [%rd1+80], %fd19;
    mul.f64     %fd20, %fd8, %fd34;
    st.local.f64    [%rd1+96], %fd20;
    ld.local.f64    %fd45, [%rd1+32];
    fma.rn.f64  %fd46, %fd20, %fd45, 0d0000000000000000;
    sub.f64     %fd47, %fd9, %fd46;
    mul.f64     %fd21, %fd47, %fd40;
    st.local.f64    [%rd1+104], %fd21;
    mov.f64     %fd59, 0d0000000000000000;
    mov.u32     %r59, -2;
    mov.u64     %rd41, %rd1;

BB1_7:
    mov.u64     %rd5, %rd41;
    ld.local.f64    %fd49, [%rd5+64];
    ld.local.f64    %fd50, [%rd5+96];
    fma.rn.f64  %fd59, %fd50, %fd49, %fd59;
    add.s64     %rd6, %rd5, 8;
    add.s32     %r59, %r59, 1;
    setp.ne.s32 %p4, %r59, 0;
    mov.u64     %rd41, %rd6;
    @%p4 bra    BB1_7;

    sub.f64     %fd52, %fd10, %fd59;
    rcp.rn.f64  %fd53, %fd19;
    mul.f64     %fd24, %fd52, %fd53;
    st.local.f64    [%rd1+112], %fd24;
    mov.f64     %fd60, 0d0000000000000000;
    mov.u32     %r60, -3;
    mov.u64     %rd42, -12;

BB1_9:
    shl.b64     %rd26, %rd42, 3;
    sub.s64     %rd27, %rd1, %rd26;
    ld.local.f64    %fd54, [%rd27];
    fma.rn.f64  %fd60, %fd54, %fd54, %fd60;
    add.s64     %rd42, %rd42, -1;
    add.s32     %r60, %r60, 1;
    setp.ne.s32 %p5, %r60, 0;
    @%p5 bra    BB1_9;

Clearly the loops aren't unrolled and expensive local memory is used (My goal is to do everything in registers only).

I'm calling the function like so:

    T l[N*N];
    for(int i = 0; i < N*N; ++i){
        l[i] = buffer[offset+i];
    }

    choleskyKernel2(l);

    for(int i = 0; i < N*N; ++i){
        buffer[offset+i] =  l[i];
    }

Is there a way to properly unroll this loops so everything can be done in registers?

EDIT:

Full code:

#include 

template
__device__ inline
void choleskyKernel2(T* C){
#pragma unroll
    for (int i = 0; i < N; i++){
#pragma unroll
        for (int j = 0; j <= i; j++) {
            double s = 0;
#pragma unroll
            for (int k = 0; k < j; k++){
                s += C[i*N+k] * C[j*N+k];
            }
            s = C[i*N+j] - s;
            C[i*N+j] = (i == j) ?
                        sqrt(s) :
                        (1.0 / C[j*N+j] * (s));
        }
    }
}

template
__global__ static
void test3(T* buffer){
    const int matrixElements = N * N;

    T l[matrixElements];
    for(int i = 0; i < matrixElements; ++i){
        l[i] = buffer[i];
    }

    choleskyKernel2(l);

    for(int i = 0; i < matrixElements; ++i){
        buffer[i] =  l[i];
    }
}

int main(){
     thrust::device_vector d_data(16);
     test3 <<< 1,1 >>>(thrust::raw_pointer_cast(d_data.data()));
}

tera · Accepted Answer

While I can't tell you why nvcc (or indeed cicc which performs the device code compilation on behalf of nvcc) doesn't unroll your loops, I can show you how to change the code so that it does.

Turn

#pragma unroll
    for (int i = 0; i < N; i++){
#pragma unroll
        for (int j = 0; j <= i; j++) {

into

#pragma unroll
    for (int i = 0; i < N; i++) {
#pragma unroll
        for (int j = 0; j < N; j++)
            if (j <= i) {

and you will find all loops unrolled with no use of any local memory.

This is even though you didn't ask for the load and store loops to be unrolled. In fact, with my change above you don't need any #pragma unroll directives at all.

NVCC doesn't unroll small nested loop

Answers (1)

Related Questions

NVCC doesn&#39;t unroll small nested loop

Answers (1)

Related Questions

NVCC doesn't unroll small nested loop