Reputation: 805
I don't know if NVCC will be smart enough to automatically expose Instruction Level Parallelism (ILP) in a loop like this:
for (int i = 0; i < 8; i++) {
    if (somethingHappens) {
        someVar = someVar & 1 << i;
    }
}
or should I rewrite it to expose the ILP explicitly, like this:
char someVar[8];
for (int i = 0; i < 8; i++) {
    if (somethingHappens) {
        someVar[i] = 1 << i;
    }
}
// reduce someVar using vaddus4 and 3 logical ANDs
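For concreteness, the reduction I have in mind would look roughly like this (only a sketch, assuming the eight flag bytes are packed into two 32-bit words and that __vaddus4 is available on the target architecture):

// Pack the 8 flag bytes into two words, add them byte-wise, then sum the 4 byte lanes
unsigned int lo = 0, hi = 0;
for (int i = 0; i < 4; i++) {
    lo |= (unsigned int)(unsigned char)someVar[i]     << (8 * i);
    hi |= (unsigned int)(unsigned char)someVar[i + 4] << (8 * i);
}
unsigned int s = __vaddus4(lo, hi);   // 4 independent byte-wise saturated adds
int total = (s & 0xff) + ((s >> 8) & 0xff) + ((s >> 16) & 0xff) + (s >> 24);   // the 3 logical ANDs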
Upvotes: 4
Views: 1302
Reputation: 21465
To answer your question, I'm considering four different kernels in which each thread performs a for loop over n_loop iterations. The four kernels implement four different possible situations:

1. n_loop is known at compile-time;
2. n_loop is known at compile-time and the summation is conditional;
3. n_loop is known at run-time;
4. n_loop is known at run-time and a manual loop unroll of 4 is performed.

The full code is the following:
#include <stdio.h>
#include <time.h>
#define BLOCKSIZE 512
#define epsilon 0.5
#define n_loop 8
/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/****************************************************/
/* KERNEL #1: NUMBER OF LOOPS KNOWN AT COMPILE-TIME */
/****************************************************/
__global__ void testKernel1(float* input, float* output, int N) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        float accum = 0.f;
        for (int i = 0; i < n_loop; i++) {
            accum = accum + input[n_loop*tid+i];
        }
        output[tid] = accum;
    }
}
/******************************************************************/
/* KERNEL #2: NUMBER OF LOOPS KNOWN AT COMPILE-TIME - CONDITIONAL */
/******************************************************************/
__global__ void testKernel2(float* input, float* output, int N) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        float accum = 0.f;
        for (int i = 0; i < n_loop; i++) if (input[n_loop*tid+i] < epsilon) accum = accum + input[n_loop*tid+i];
        output[tid] = accum;
    }
}
/************************************************/
/* KERNEL #3: NUMBER OF LOOPS KNOWN AT RUN-TIME */
/************************************************/
__global__ void testKernel3(float* input, float* output, int N_loop, int N) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        float accum = 0.f;
        for (int i = 0; i < N_loop; i++) accum = accum + input[N_loop*tid+i];
        output[tid] = accum;
    }
}
/*******************************************************************/
/* KERNEL #4: NUMBER OF LOOPS KNOWN AT RUN-TIME - LOOP UNROLL OF 4 */
/*******************************************************************/
__global__ void testKernel4(float* input, float* output, int N_loop, int N) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        float accum1 = 0.f;
        float accum2 = 0.f;
        float accum3 = 0.f;
        float accum4 = 0.f;
        for (int i = 0; i < N_loop/4; i++) {
            accum1 = accum1 + input[N_loop*tid+i];
            accum2 = accum2 + input[N_loop*tid+i+N_loop/4];
            accum3 = accum3 + input[N_loop*tid+i+2*N_loop/4];
            accum4 = accum4 + input[N_loop*tid+i+3*N_loop/4];
        }
        output[tid] = accum1 + accum2 + accum3 + accum4;
    }
}
int main() {
const int N = 512*512*32;
float* input = (float*) malloc(n_loop*N*sizeof(float));
float* output = (float*) malloc(N*sizeof(float));
float* output2 = (float*) malloc(N*sizeof(float));
float* outputif = (float*) malloc(N*sizeof(float));
float* d_input; gpuErrchk(cudaMalloc((void**)&d_input, n_loop*N*sizeof(float)));
float* d_output; gpuErrchk(cudaMalloc((void**)&d_output, N*sizeof(float)));
srand(time(NULL));
for (int i=0; i<n_loop*N; i++) input[i] = rand() / (float)RAND_MAX;
gpuErrchk(cudaMemcpy(d_input, input, n_loop*N*sizeof(float), cudaMemcpyHostToDevice));
// --- Host-side computations
for (int k = 0; k < N; k++) {
float accum1 = 0.f;
float accum2 = 0.f;
for (int i = 0; i < n_loop; i++) {
accum1 = accum1 + input[n_loop*k+i];
if (input[n_loop*k+i] < epsilon) accum2 = accum2 + input[n_loop*k+i];
}
output[k] = accum1;
outputif[k] = accum2;
}
// --- Device-side computation - kernel1
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
testKernel1<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel1 elapsed time: %3.4f ms \n", time);
gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Check CPU and GPU results
for (int i=0; i<N; i++)
if (output[i] != output2[i]) {
printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, output[i], output2[i]);
return 1;
}
printf("kernel1: results match!\n");
// --- Device-side computation - kernel2
cudaEventRecord(start, 0);
testKernel2<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel1 elapsed time: %3.4f ms \n", time);
gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Check CPU and GPU results
for (int i=0; i<N; i++)
if (outputif[i] != output2[i]) {
printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, outputif[i], output2[i]);
return 1;
}
printf("kernel2: results match!\n");
// --- Device-side computation - kernel3
cudaEventRecord(start, 0);
testKernel3<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, n_loop, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel3 elapsed time: %3.4f ms \n", time);
gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Check CPU and GPU results
for (int i=0; i<N; i++)
if (output[i] != output2[i]) {
printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, output[i], output2[i]);
return 1;
}
printf("kernel3: results match!\n");
// --- Device-side computation - kernel4
cudaEventRecord(start, 0);
testKernel4<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, n_loop, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel4 elapsed time: %3.4f ms \n", time);
gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Check CPU and GPU results
for (int i=0; i<N; i++)
if (fabs(output[i] - output2[i]) > 0.0001) {
printf("Mismatch at i = %d, Host= %f, Device = %f, difference = %f\n", i, output[i], output2[i], output2[i] - output[i]);
return 1;
}
printf("kernel4: results match!\n");
return 0;
}
Let us now analyze the disassembled code (compiled with CUDA 6.0) for the four different cases. I'm considering compilation for the Fermi architecture.
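Listings like the ones below can be obtained by dumping the SASS of the compiled binary with cuobjdump; assuming the source file is called test.cu, the commands would be along the lines of:

# hypothetical file name; sm_20 targets the Fermi architecture
nvcc -arch=sm_20 -o test test.cu
cuobjdump --dump-sass test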
KERNEL 1
MOV R1, c[0x1][0x100];
S2R R0, SR_CTAID.X;
IMUL R2, R0, c[0x0][0x8];
S2R R3, SR_TID.X;
IADD R0, R2, R3;
ISETP.GE.AND P0, PT, R0, c[0x0][0x28], PT;
@P0 BRA.U 0xd8;
@!P0 IADD R2, R3, R2;
@!P0 ISCADD R2, R2, c[0x0][0x20], 0x5;
@!P0 ISCADD R0, R0, c[0x0][0x24], 0x2;
@!P0 LD R9, [R2];
@!P0 LD R8, [R2+0x4];
@!P0 LD R7, [R2+0x8];
@!P0 LD R6, [R2+0xc];
@!P0 LD R5, [R2+0x10];
@!P0 LD R4, [R2+0x14];
@!P0 LD R3, [R2+0x18];
@!P0 LD R2, [R2+0x1c];
@!P0 F2F.F32.F32 R9, R9;
@!P0 FADD R8, R9, R8;
@!P0 FADD R7, R8, R7;
@!P0 FADD R6, R7, R6;
@!P0 FADD R5, R6, R5;
@!P0 FADD R4, R5, R4;
@!P0 FADD R3, R4, R3;
@!P0 FADD R2, R3, R2;
@!P0 ST [R0], R2;
EXIT;
In this case, the compiler fully unrolls the loop. You will see 8 different load (LD) instructions and 7 different add (FADD) instructions.
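As a side remark, when the trip count is a compile-time constant like this, the unroll can also be requested explicitly instead of relying on the optimizer; a minimal sketch (not part of the benchmark above):

// Inner loop of testKernel1 with an explicit unroll request; here the
// compiler already unrolls on its own, so the pragma is optional.
#pragma unroll
for (int i = 0; i < n_loop; i++) {
    accum = accum + input[n_loop*tid+i];
}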
KERNEL 2
MOV R1, c[0x1][0x100];
S2R R0, SR_CTAID.X;
IMUL R0, R0, c[0x0][0x8];
S2R R2, SR_TID.X;
IADD R3, R0, R2;
ISETP.GE.AND P0, PT, R3, c[0x0][0x28], PT;
@P0 EXIT;
IADD R0, R2, R0;
ISCADD R9, R0, c[0x0][0x20], 0x5;
LD R0, [R9];
LD R2, [R9+0x4];
LD R4, [R9+0x8];
LD R5, [R9+0xc];
LD R6, [R9+0x10];
LD R7, [R9+0x14];
LD R8, [R9+0x18];
LD R9, [R9+0x1c];
FSETP.LT.AND P0, PT, R0, 0.5, PT;
FSETP.LT.AND P1, PT, R4, 0.5, PT;
F2F.F32.F32 R0, R0;
SEL R0, R0, RZ, P0;
FSETP.LT.AND P0, PT, R2, 0.5, PT;
@P0 FADD R0, R0, R2;
FSETP.LT.AND P0, PT, R5, 0.5, PT;
@P1 FADD R0, R0, R4;
@P0 FADD R0, R0, R5;
FSETP.LT.AND P1, PT, R8, 0.5, PT;
FSETP.LT.AND P0, PT, R6, 0.5, PT;
FADD R2, R0, R6;
SEL R2, R2, R0, P0;
FSETP.LT.AND P0, PT, R7, 0.5, PT;
ISCADD R0, R3, c[0x0][0x24], 0x2;
@P0 FADD R2, R2, R7;
FSETP.LT.AND P0, PT, R9, 0.5, PT;
@P1 FADD R2, R2, R8;
@P0 FADD R2, R2, R9;
ST [R0], R2;
EXIT;
Also in this case, the compiler fully unrolls the loop. You will again see 8 different load (LD) instructions and 7 different add (FADD) instructions.
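Note also that the conditional does not generate divergent branches: the compiler turns it into predicated FADDs and SEL instructions, which at the source level corresponds roughly to the following branchless form (a sketch, not the actual source):

// Branchless equivalent of the conditional accumulation
float x = input[n_loop*tid+i];
accum = accum + ((x < epsilon) ? x : 0.f);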
KERNEL 3
c[0x0][0x34] = N
c[0x1][0x100] = BLOCKSIZE
c[0x0][0x8] = blockDim.x
c[0x0][0x30] = N_loop
c[0x0][0x20] = input
/*0000*/ MOV R1, c[0x1][0x100]; R1 = BLOCKSIZE = 256
/*0008*/ S2R R0, SR_CTAID.X; R0 = blockIdx.x
/*0010*/ S2R R2, SR_TID.X; R2 = threadIdx.x
/*0018*/ IMAD R0, R0, c[0x0][0x8], R2; R0 = tid = blockIdx.x * blockDim.x + threadIdx.x
/*0020*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x34], PT; P0 = (tid >= N) then EXIT
/*0028*/ @P0 EXIT;
/*0030*/ ISETP.LT.AND P0, PT, RZ, c[0x0][0x30], PT; P0 = (0 < N_loop)
/*0038*/ @P0 BRA 0x60;
/*0040*/ MOV R4, RZ;
/*0048*/ BRA 0x170;
/*0050*/ NOP;
/*0058*/ NOP;
/*0060*/ MOV R2, c[0x0][0x30]; R2 = N_loop
/*0068*/ IMUL R3, R0, c[0x0][0x30]; R3 = tid * N_loop
/*0070*/ MOV32I R6, 0x4; R6 = sizeof(float) = 4
/*0078*/ ISETP.GT.AND P0, PT, R2, 0x3, PT; P0 = (N_loop > 3)
/*0080*/ IMAD R2.CC, R3, R6, c[0x0][0x20]; R2 = R3 * R6 + input = tid * N_loop * 4 + input
/*0088*/ MOV R4, RZ; R4 = 0
/*0090*/ MOV R5, RZ; R5 = 0
/*0098*/ IMAD.HI.X R3, R3, R6, c[0x0][0x24];
/*00a0*/ @!P0 BRA 0x128;
/*00a8*/ MOV R6, c[0x0][0x30]; R6 = N_loop
/*00b0*/ IADD R10, R6, -0x3; R10 = N_loop - 3
/*00b8*/ NOP;
/*00c0*/ IADD R5, R5, 0x4; R5 = R5 + 4 = 4
/*00c8*/ LD.E R6, [R2]; R6 = input[tid * N_loop]
/*00d0*/ ISETP.LT.AND P0, PT, R5, R10, PT; P0 = (4 < (N_loop - 3))
/*00d8*/ LD.E R7, [R2+0x4]; R7 = input[tid * N_loop + 1]
/*00e0*/ LD.E R8, [R2+0x8]; R8 = input[tid * N_loop + 2]
/*00e8*/ LD.E R9, [R2+0xc]; R9 = input[tid * N_loop + 3]
/*00f0*/ IADD R2.CC, R2, 0x10; R2 = R2 + 16 = R2 + 4 * sizeof(float)
/*00f8*/ IADD.X R3, R3, RZ;
/*0100*/ FADD R6, R4, R6; R6 = 0 + input[tid * N_loop]
/*0108*/ FADD R4, R6, R7; R4 = input[tid * N_loop] + input[tid * N_loop + 1]
/*0110*/ FADD R8, R4, R8; R8 = input[tid * N_loop] + input[tid * N_loop + 1] + input[tid * N_loop + 2]
/*0118*/ FADD R4, R8, R9; R4 = input[tid * N_loop] + input[tid * N_loop + 1] + input[tid * N_loop + 2] + input[tid * N_loop + 3]
/*0120*/ @P0 BRA 0xc0; ...
/*0128*/ ISETP.LT.AND P0, PT, R5, c[0x0][0x30], PT;
/*0130*/ @!P0 BRA 0x170;
/*0138*/ IADD R5, R5, 0x1;
/*0140*/ LD.E R6, [R2];
/*0148*/ ISETP.LT.AND P0, PT, R5, c[0x0][0x30], PT;
/*0150*/ IADD R2.CC, R2, 0x4;
/*0158*/ IADD.X R3, R3, RZ;
/*0160*/ FADD R4, R4, R6;
/*0168*/ @P0 BRA 0x138;
/*0170*/ MOV32I R3, 0x4;
/*0178*/ IMAD R2.CC, R0, R3, c[0x0][0x28];
/*0180*/ IMAD.HI.X R3, R0, R3, c[0x0][0x2c];
/*0188*/ ST.E [R2], R4;
/*0190*/ EXIT;
As can be seen, the compiler automatically performs a loop unroll of 4: the unrolled loop body contains 4 load (LD) operations and 4 adds (FADD), followed by a remainder loop that handles any leftover iterations one at a time.
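In other words, the transformation the compiler applies is roughly equivalent to rewriting the source loop of testKernel3 as follows (a sketch of the equivalent C code, not what was actually compiled):

// Main loop unrolled by 4, plus a remainder loop for the leftover iterations
float accum = 0.f;
int i = 0;
for (; i < N_loop - 3; i += 4) {
    accum = accum + input[N_loop*tid+i];
    accum = accum + input[N_loop*tid+i+1];
    accum = accum + input[N_loop*tid+i+2];
    accum = accum + input[N_loop*tid+i+3];
}
for (; i < N_loop; i++)
    accum = accum + input[N_loop*tid+i];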
KERNEL 4
/*0000*/ MOV R1, c[0x1][0x100];
/*0008*/ S2R R0, SR_CTAID.X;
/*0010*/ S2R R2, SR_TID.X;
/*0018*/ IMAD R13, R0, c[0x0][0x8], R2;
/*0020*/ ISETP.GE.AND P0, PT, R13, c[0x0][0x34], PT;
/*0028*/ @P0 EXIT;
/*0030*/ MOV R2, c[0x0][0x30];
/*0038*/ SHR R0, R2, 0x1f;
/*0040*/ ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*0048*/ IMAD.U32.U32.HI R0, R0, 0x4, R2;
/*0050*/ SHR R0, R0, 0x2;
/*0058*/ @P0 BRA 0x98;
/*0060*/ MOV R18, RZ;
/*0068*/ MOV R19, RZ;
/*0070*/ MOV R10, RZ;
/*0078*/ MOV R11, RZ;
/*0080*/ BRA 0x308;
/*0088*/ NOP;
/*0090*/ NOP;
/*0098*/ MOV R3, c[0x0][0x30];
/*00a0*/ IMUL R4, R13, c[0x0][0x30];
/*00a8*/ MOV32I R5, 0x4;
/*00b0*/ IMUL R2, R3, 0x3;
/*00b8*/ SHL R6, R3, 0x1;
/*00c0*/ IADD R10, R0, R4;
/*00c8*/ SHR R3, R2, 0x1f;
/*00d0*/ IMAD R8.CC, R4, R5, c[0x0][0x20];
/*00d8*/ SHR R7, R6, 0x1f;
/*00e0*/ IMAD.U32.U32.HI R2, R3, 0x4, R2;
/*00e8*/ IMAD.HI.X R9, R4, R5, c[0x0][0x24];
/*00f0*/ IMAD.U32.U32.HI R7, R7, 0x4, R6;
/*00f8*/ IMAD.HI R3, R2, c[0x10][0x0], R4;
/*0100*/ IMAD R6.CC, R10, R5, c[0x0][0x20];
/*0108*/ ISETP.GT.AND P0, PT, R0, 0x1, PT;
/*0110*/ IMAD.HI R14, R7, c[0x10][0x0], R4;
/*0118*/ MOV R18, RZ;
/*0120*/ IMAD.HI.X R7, R10, R5, c[0x0][0x24];
/*0128*/ MOV R19, RZ;
/*0130*/ IMAD R2.CC, R3, R5, c[0x0][0x20];
/*0138*/ MOV R10, RZ;
/*0140*/ IMAD.HI.X R3, R3, R5, c[0x0][0x24];
/*0148*/ MOV R11, RZ;
/*0150*/ IMAD R4.CC, R14, R5, c[0x0][0x20];
/*0158*/ MOV R12, RZ;
/*0160*/ IMAD.HI.X R5, R14, R5, c[0x0][0x24];
/*0168*/ @!P0 BRA 0x260;
/*0170*/ IADD R16, R0, -0x1;
/*0178*/ NOP;
/*0180*/ IADD R12, R12, 0x2;
/*0188*/ LD.E R15, [R8];
/*0190*/ ISETP.LT.AND P0, PT, R12, R16, PT;
/*0198*/ LD.E R20, [R6];
/*01a0*/ FADD R17, R18, R15;
/*01a8*/ LD.E R14, [R4];
/*01b0*/ FADD R19, R19, R20;
/*01b8*/ LD.E R15, [R2];
/*01c0*/ LD.E R18, [R8+0x4];
/*01c8*/ LD.E R20, [R6+0x4];
/*01d0*/ IADD R6.CC, R6, 0x8;
/*01d8*/ NOP;
/*01e0*/ FADD R14, R10, R14;
/*01e8*/ FADD R15, R11, R15;
/*01f0*/ IADD.X R7, R7, RZ;
/*01f8*/ LD.E R10, [R4+0x4];
/*0200*/ IADD R4.CC, R4, 0x8;
/*0208*/ LD.E R11, [R2+0x4];
/*0210*/ IADD.X R5, R5, RZ;
/*0218*/ FADD R18, R17, R18;
/*0220*/ IADD R2.CC, R2, 0x8;
/*0228*/ FADD R19, R19, R20;
/*0230*/ IADD.X R3, R3, RZ;
/*0238*/ IADD R8.CC, R8, 0x8;
/*0240*/ IADD.X R9, R9, RZ;
/*0248*/ FADD R10, R14, R10;
/*0250*/ FADD R11, R15, R11;
/*0258*/ @P0 BRA 0x180;
/*0260*/ ISETP.LT.AND P0, PT, R12, R0, PT;
/*0268*/ @!P0 BRA 0x308;
/*0270*/ IADD R12, R12, 0x1;
/*0278*/ LD.E R17, [R8];
/*0280*/ ISETP.LT.AND P0, PT, R12, R0, PT;
/*0288*/ LD.E R16, [R6];
/*0290*/ IADD R6.CC, R6, 0x4;
/*0298*/ LD.E R15, [R4];
/*02a0*/ IADD.X R7, R7, RZ;
/*02a8*/ LD.E R14, [R2];
/*02b0*/ IADD R4.CC, R4, 0x4;
/*02b8*/ IADD.X R5, R5, RZ;
/*02c0*/ IADD R2.CC, R2, 0x4;
/*02c8*/ IADD.X R3, R3, RZ;
/*02d0*/ IADD R8.CC, R8, 0x4;
/*02d8*/ IADD.X R9, R9, RZ;
/*02e0*/ FADD R18, R18, R17;
/*02e8*/ FADD R19, R19, R16;
/*02f0*/ FADD R10, R10, R15;
/*02f8*/ FADD R11, R11, R14;
/*0300*/ @P0 BRA 0x270;
/*0308*/ FADD R0, R18, R19;
/*0310*/ MOV32I R3, 0x4;
/*0318*/ FADD R0, R0, R10;
/*0320*/ IMAD R2.CC, R13, R3, c[0x0][0x28];
/*0328*/ FADD R0, R0, R11;
/*0330*/ IMAD.HI.X R3, R13, R3, c[0x0][0x2c];
/*0338*/ ST.E [R2], R0;
/*0340*/ EXIT;
In this case, the compiler applies a further automatic loop unroll of 2 on top of the manual loop unroll of 4: the main loop body contains 8 load operations (LD) and 8 adds (FADD), that is, two per accumulator.
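Should one want the manual unroll alone, without this additional automatic unroll, the compiler's unroller can be suppressed on that particular loop; a sketch:

// Prevent the compiler from further unrolling the already manually unrolled loop
#pragma unroll 1
for (int i = 0; i < N_loop/4; i++) {
    accum1 = accum1 + input[N_loop*tid+i];
    accum2 = accum2 + input[N_loop*tid+i+N_loop/4];
    accum3 = accum3 + input[N_loop*tid+i+2*N_loop/4];
    accum4 = accum4 + input[N_loop*tid+i+3*N_loop/4];
}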
Although the disassembled code differs from that shown above for the Fermi architecture, the compiler behavior is similar for the Kepler architecture.
Due to the automatic loop unrolling capabilities, there is not much difference in performance between the different kernels:
GT 210 (c.c. 1.2)
Kernel 1 = 111ms
Kernel 2 = 108ms
Kernel 3 = 107ms
Kernel 4 = 110ms
Kepler K20c (c.c. 3.5)
Kernel 1 = 1.8ms
Kernel 2 = 1.8ms
Kernel 3 = 1.8ms
Kernel 4 = 1.8ms
I'm not explicitly providing results for the Fermi architecture, but the timings are approximately the same for the four considered kernels.
Upvotes: 5