Reputation: 21465
I have the following code:
#include<stdio.h>
#define N_ITERATIONS 2048
#define UNROLL 32
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/********************************************************/
/* KERNEL0 - NO INSTRUCTION LEVEL PARALLELISM (ILP = 0) */
/********************************************************/
__global__ void kernel0(int *d_a, int *d_b, int *d_c, unsigned int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x ;
if (tid < N) {
int a = d_a[tid];
int b = d_b[tid];
int c = d_c[tid];
#pragma unroll UNROLL
for(unsigned int i = 0; i < N_ITERATIONS; i++) {
a = a * b + c;
}
d_a[tid] = a;
}
}
/*****************************************************/
/* KERNEL1 - INSTRUCTION LEVEL PARALLELISM (ILP = 2) */
/*****************************************************/
__global__ void kernel1(int *d_a, int *d_b, int *d_c, unsigned int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N/2) {
int a1 = d_a[tid];
int b1 = d_b[tid];
int c1 = d_c[tid];
int a2 = d_a[tid+N/2];
int b2 = d_b[tid+N/2];
int c2 = d_c[tid+N/2];
#pragma unroll UNROLL
for(unsigned int i = 0; i < N_ITERATIONS; i++) {
a1 = a1 * b1 + c1;
a2 = a2 * b2 + c2;
}
d_a[tid] = a1;
d_a[tid+N/2] = a2;
}
}
/*****************************************************/
/* KERNEL2 - INSTRUCTION LEVEL PARALLELISM (ILP = 4) */
/*****************************************************/
__global__ void kernel2(int *d_a, int *d_b, int *d_c, unsigned int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N/4) {
int a1 = d_a[tid];
int b1 = d_b[tid];
int c1 = d_c[tid];
int a2 = d_a[tid+N/4];
int b2 = d_b[tid+N/4];
int c2 = d_c[tid+N/4];
int a3 = d_a[tid+N/2];
int b3 = d_b[tid+N/2];
int c3 = d_c[tid+N/2];
int a4 = d_a[tid+3*N/4];
int b4 = d_b[tid+3*N/4];
int c4 = d_c[tid+3*N/4];
#pragma unroll UNROLL
for(unsigned int i = 0; i < N_ITERATIONS; i++) {
a1 = a1 * b1 + c1;
//if (tid==0) printf("iteration %i %i\n",i,a1);
a2 = a2 * b2 + c2;
a3 = a3 * b3 + c3;
a4 = a4 * b4 + c4;
}
//if (tid==0) printf("last iteration %i\n",a1);
d_a[tid] = a1;
d_a[tid+N/4] = a2;
d_a[tid+N/2] = a3;
d_a[tid+3*N/4] = a4;
}
}
/********/
/* MAIN */
/********/
void main() {
const int N = 1024;
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the maximum occupancy for a full device launch
int *h_a = (int*)malloc(N*sizeof(int));
int *h_a_result_host = (int*)malloc(N*sizeof(int));
int *h_a_result_device = (int*)malloc(N*sizeof(int));
int *h_b = (int*)malloc(N*sizeof(int));
int *h_c = (int*)malloc(N*sizeof(int));
for (int i=0; i<N; i++) {
h_a[i] = 2;
h_b[i] = 1;
h_c[i] = 2;
h_a_result_host[i] = h_a[i];
for(unsigned int k = 0; k < N_ITERATIONS; k++) {
h_a_result_host[i] = h_a_result_host[i] * h_b[i] + h_c[i];
}
}
int *d_a; gpuErrchk(cudaMalloc((void**)&d_a, N*sizeof(int)));
int *d_b; gpuErrchk(cudaMalloc((void**)&d_b, N*sizeof(int)));
int *d_c; gpuErrchk(cudaMalloc((void**)&d_c, N*sizeof(int)));
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_c, h_c, N*sizeof(int), cudaMemcpyHostToDevice));
// --- Creating events for timing
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
/***********/
/* KERNEL2 */
/***********/
cudaEventRecord(start, 0);
kernel2<<<1, N/4>>>(d_a, d_b, d_c, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) if(h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]); return; }
cudaDeviceReset();
}
I'm compiling it using CUDA 6.5 for compute_20, sm_21
(GT540M) and it works correctly for UNROLL = 2, 4, 8, 16
, but it does not work anymore for UNROLL = 32
, in the sense that the result is incorrect already for the first element of the output array.
The error message is:
Error at i=0! Host = 4098; Device = 4036
If I monitor the processing for tid = 0
, i.e., I uncomment the printf
lines, I see that a1
holds the correct value until the last for
iteration, while, immediately after exiting the for
loop, the variable becomes incorrect. In other words, by uncommenting the two printf
's, the output is
....
iteration 2047 4098
last iteration 4036
which means that the for
loop is executed correctly, but when exiting it, the value of the a1
variable changes.
If I compile the code for UNROLL = 32
with CUDA 5.5
and 6.0
, it works.
Having a look at the disassembled code, I see
CUDA 5.5
Very similar to CUDA 6.0
- not shown here
CUDA 6.0
Function : _Z7kernel2PiS_S_j
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100];
/*0008*/ NOP;
/*0010*/ MOV R3, c[0x0][0x38];
/*0018*/ S2R R0, SR_CTAID.X;
/*0020*/ SHR.U32 R3, R3, 0x2;
/*0028*/ S2R R2, SR_TID.X;
/*0030*/ IMAD R15, R0, c[0x0][0x8], R2;
/*0038*/ ISETP.GE.U32.AND P0, PT, R15, R3, PT;
/*0040*/ @P0 EXIT;
/*0048*/ MOV32I R17, 0x4;
/*0050*/ IADD R7, R15, R3;
/*0058*/ IMAD R8.CC, R15, R17, c[0x0][0x20];
/*0060*/ IMAD.HI.X R9, R15, R17, c[0x0][0x24];
/*0068*/ IMAD R20.CC, R15, R17, c[0x0][0x28];
/*0070*/ LD.E R0, [R8];
/*0078*/ IMAD.HI.X R21, R15, R17, c[0x0][0x2c];
/*0080*/ IMAD R2.CC, R15, R17, c[0x0][0x30];
/*0088*/ MOV R12, c[0x0][0x38];
/*0090*/ IMAD.HI.X R3, R15, R17, c[0x0][0x34];
/*0098*/ IMUL R13, R12, 0x3;
/*00a0*/ IMAD.U32.U32 R10.CC, R7, R17, c[0x0][0x30];
/*00a8*/ LD.E R14, [R2];
/*00b0*/ IMAD.U32.U32.HI.X R11, R7, R17, c[0x0][0x34];
/*00b8*/ LD.E R16, [R20];
/*00c0*/ IMAD.U32.U32 R4.CC, R7, R17, c[0x0][0x20];
/*00c8*/ SHR.U32 R3, R12, 0x1;
/*00d0*/ IMAD.U32.U32.HI.X R5, R7, R17, c[0x0][0x24];
/*00d8*/ IADD R20, R15, R3;
/*00e0*/ IMAD.U32.U32 R6.CC, R7, R17, c[0x0][0x28];
/*00e8*/ LD.E R18, [R4];
/*00f0*/ IMAD.U32.U32.HI R13, R13, c[0x10][0x0], R15;
/*00f8*/ LD.E R21, [R10];
/*0100*/ IMAD.U32.U32.HI.X R7, R7, R17, c[0x0][0x2c];
/*0108*/ IMAD.U32.U32 R8.CC, R13, R17, c[0x0][0x28];
/*0110*/ LD.E R19, [R6];
/*0118*/ IMAD.U32.U32.HI.X R9, R13, R17, c[0x0][0x2c];
/*0120*/ IMAD.U32.U32 R2.CC, R13, R17, c[0x0][0x20];
/*0128*/ LD.E R9, [R8];
/*0130*/ IMAD.U32.U32.HI.X R3, R13, R17, c[0x0][0x24];
/*0138*/ IMAD.U32.U32 R4.CC, R20, R17, c[0x0][0x28];
/*0140*/ IMAD.U32.U32.HI.X R5, R20, R17, c[0x0][0x2c];
/*0148*/ LD.E R8, [R2];
/*0150*/ IMAD.U32.U32 R6.CC, R20, R17, c[0x0][0x30];
/*0158*/ LD.E R5, [R4];
/*0160*/ IMAD.U32.U32.HI.X R7, R20, R17, c[0x0][0x34];
/*0168*/ MOV32I R4, 0xfffff800;
/*0170*/ IMAD.U32.U32 R12.CC, R13, R17, c[0x0][0x30];
/*0178*/ LD.E R6, [R6];
/*0180*/ IMAD.U32.U32.HI.X R13, R13, R17, c[0x0][0x34];
/*0188*/ IMAD.U32.U32 R10.CC, R20, R17, c[0x0][0x20];
/*0190*/ LD.E R13, [R12];
/*0198*/ IMAD.U32.U32.HI.X R11, R20, R17, c[0x0][0x24];
/*01a0*/ LD.E R17, [R10];
/*01a8*/ IMAD R0, R0, R16, R14;
/*01b0*/ IMAD R7, R18, R19, R21;
/*01b8*/ IMAD R12, R17, R5, R6;
/*01c0*/ IMAD R8, R8, R9, R13;
/*01c8*/ IMAD R0, R0, R16, R14;
/*01d0*/ IMAD R7, R7, R19, R21;
/*01d8*/ IMAD R12, R12, R5, R6;
/*01e0*/ IMAD R8, R8, R9, R13;
/*01e8*/ IMAD R0, R0, R16, R14;
/*01f0*/ IMAD R7, R7, R19, R21;
/*01f8*/ IMAD R12, R12, R5, R6;
/*0200*/ IMAD R8, R8, R9, R13;
/*0208*/ IMAD R0, R0, R16, R14;
/*0210*/ IMAD R7, R7, R19, R21;
/*0218*/ IMAD R12, R12, R5, R6;
/*0220*/ IMAD R8, R8, R9, R13;
/*0228*/ IMAD R0, R0, R16, R14;
/*0230*/ IMAD R7, R7, R19, R21;
/*0238*/ IMAD R12, R12, R5, R6;
/*0240*/ IMAD R8, R8, R9, R13;
/*0248*/ IMAD R0, R0, R16, R14;
/*0250*/ IMAD R7, R7, R19, R21;
/*0258*/ IMAD R12, R12, R5, R6;
/*0260*/ IMAD R8, R8, R9, R13;
/*0268*/ IMAD R0, R0, R16, R14;
/*0270*/ IMAD R7, R7, R19, R21;
/*0278*/ IMAD R12, R12, R5, R6;
/*0280*/ IMAD R8, R8, R9, R13;
/*0288*/ IMAD R0, R0, R16, R14;
/*0290*/ IMAD R7, R7, R19, R21;
/*0298*/ IMAD R12, R12, R5, R6;
/*02a0*/ IMAD R8, R8, R9, R13;
/*02a8*/ IMAD R0, R0, R16, R14;
/*02b0*/ IMAD R7, R7, R19, R21;
/*02b8*/ IMAD R12, R12, R5, R6;
/*02c0*/ IMAD R8, R8, R9, R13;
/*02c8*/ IMAD R0, R0, R16, R14;
/*02d0*/ IMAD R7, R7, R19, R21;
/*02d8*/ IMAD R12, R12, R5, R6;
/*02e0*/ IMAD R8, R8, R9, R13;
/*02e8*/ IMAD R0, R0, R16, R14;
/*02f0*/ IMAD R7, R7, R19, R21;
/*02f8*/ IMAD R12, R12, R5, R6;
/*0300*/ IMAD R8, R8, R9, R13;
/*0308*/ IMAD R0, R0, R16, R14;
/*0310*/ IMAD R7, R7, R19, R21;
/*0318*/ IMAD R12, R12, R5, R6;
/*0320*/ IMAD R8, R8, R9, R13;
/*0328*/ IMAD R0, R0, R16, R14;
/*0330*/ IMAD R7, R7, R19, R21;
/*0338*/ IMAD R12, R12, R5, R6;
/*0340*/ IMAD R8, R8, R9, R13;
/*0348*/ IMAD R0, R0, R16, R14;
/*0350*/ IMAD R7, R7, R19, R21;
/*0358*/ IMAD R12, R12, R5, R6;
/*0360*/ IMAD R8, R8, R9, R13;
/*0368*/ IMAD R0, R0, R16, R14;
/*0370*/ IMAD R7, R7, R19, R21;
/*0378*/ IMAD R12, R12, R5, R6;
/*0380*/ IMAD R8, R8, R9, R13;
/*0388*/ IMAD R0, R0, R16, R14;
/*0390*/ IMAD R7, R7, R19, R21;
/*0398*/ IMAD R12, R12, R5, R6;
/*03a0*/ IMAD R8, R8, R9, R13;
/*03a8*/ IMAD R0, R0, R16, R14;
/*03b0*/ IMAD R7, R7, R19, R21;
/*03b8*/ IMAD R12, R12, R5, R6;
/*03c0*/ IMAD R8, R8, R9, R13;
/*03c8*/ IMAD R0, R0, R16, R14;
/*03d0*/ IMAD R7, R7, R19, R21;
/*03d8*/ IMAD R12, R12, R5, R6;
/*03e0*/ IMAD R8, R8, R9, R13;
/*03e8*/ IMAD R0, R0, R16, R14;
/*03f0*/ IMAD R7, R7, R19, R21;
/*03f8*/ IMAD R12, R12, R5, R6;
/*0400*/ IMAD R8, R8, R9, R13;
/*0408*/ IMAD R0, R0, R16, R14;
/*0410*/ IMAD R7, R7, R19, R21;
/*0418*/ IMAD R12, R12, R5, R6;
/*0420*/ IMAD R8, R8, R9, R13;
/*0428*/ IMAD R0, R0, R16, R14;
/*0430*/ IMAD R7, R7, R19, R21;
/*0438*/ IMAD R12, R12, R5, R6;
/*0440*/ IMAD R8, R8, R9, R13;
/*0448*/ IMAD R0, R0, R16, R14;
/*0450*/ IMAD R7, R7, R19, R21;
/*0458*/ IMAD R12, R12, R5, R6;
/*0460*/ IMAD R8, R8, R9, R13;
/*0468*/ IMAD R0, R0, R16, R14;
/*0470*/ IMAD R7, R7, R19, R21;
/*0478*/ IMAD R12, R12, R5, R6;
/*0480*/ IMAD R8, R8, R9, R13;
/*0488*/ IMAD R0, R0, R16, R14;
/*0490*/ IMAD R7, R7, R19, R21;
/*0498*/ IMAD R12, R12, R5, R6;
/*04a0*/ IMAD R8, R8, R9, R13;
/*04a8*/ IMAD R0, R0, R16, R14;
/*04b0*/ IMAD R7, R7, R19, R21;
/*04b8*/ IMAD R12, R12, R5, R6;
/*04c0*/ IMAD R8, R8, R9, R13;
/*04c8*/ IMAD R0, R0, R16, R14;
/*04d0*/ IMAD R7, R7, R19, R21;
/*04d8*/ IMAD R12, R12, R5, R6;
/*04e0*/ IMAD R8, R8, R9, R13;
/*04e8*/ IMAD R0, R0, R16, R14;
/*04f0*/ IMAD R7, R7, R19, R21;
/*04f8*/ IMAD R12, R12, R5, R6;
/*0500*/ IMAD R8, R8, R9, R13;
/*0508*/ IMAD R0, R0, R16, R14;
/*0510*/ IMAD R7, R7, R19, R21;
/*0518*/ IMAD R12, R12, R5, R6;
/*0520*/ IMAD R8, R8, R9, R13;
/*0528*/ IMAD R0, R0, R16, R14;
/*0530*/ IMAD R7, R7, R19, R21;
/*0538*/ IMAD R12, R12, R5, R6;
/*0540*/ IMAD R8, R8, R9, R13;
/*0548*/ IADD R4, R4, 0x20;
/*0550*/ IMAD R0, R0, R16, R14;
/*0558*/ IMAD R7, R7, R19, R21;
/*0560*/ IMAD R12, R12, R5, R6;
/*0568*/ IMAD R8, R8, R9, R13;
/*0570*/ ISETP.NE.AND P0, PT, R4, RZ, PT;
/*0578*/ IMAD R0, R0, R16, R14;
/*0580*/ IMAD R18, R7, R19, R21;
/*0588*/ IMAD R17, R12, R5, R6;
/*0590*/ IMAD R8, R8, R9, R13;
/*0598*/ IMAD R0, R0, R16, R14;
/*05a0*/ IMAD R18, R18, R19, R21;
/*05a8*/ IMAD R17, R17, R5, R6;
/*05b0*/ IMAD R8, R8, R9, R13;
/*05b8*/ @P0 BRA 0x1a8;
/*05c0*/ MOV32I R4, 0x40000000;
/*05c8*/ MOV32I R9, 0x4;
/*05d0*/ IMAD.U32.U32.HI R7, R4, c[0x0][0x38], R15;
/*05d8*/ IMAD R4.CC, R15, R9, c[0x0][0x20];
/*05e0*/ IMAD.HI.X R5, R15, R9, c[0x0][0x24];
/*05e8*/ IMAD.U32.U32 R6.CC, R7, R9, c[0x0][0x20];
/*05f0*/ IMAD.U32.U32.HI.X R7, R7, R9, c[0x0][0x24];
/*05f8*/ ST.E [R4], R0;
/*0600*/ ST.E [R6], R18;
/*0608*/ ST.E [R10], R17;
/*0610*/ ST.E [R2], R8;
/*0618*/ EXIT;
CUDA 6.5
Function : _Z7kernel2PiS_S_j
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100];
/*0008*/ NOP;
/*0010*/ MOV R3, c[0x0][0x38];
/*0018*/ S2R R0, SR_CTAID.X;
/*0020*/ SHR.U32 R3, R3, 0x2;
/*0028*/ S2R R2, SR_TID.X;
/*0030*/ IMAD R0, R0, c[0x0][0x8], R2;
/*0038*/ ISETP.GE.U32.AND P0, PT, R0, R3, PT;
/*0040*/ @P0 EXIT;
/*0048*/ MOV32I R14, 0x4;
/*0050*/ IADD R21, R0, R3;
/*0058*/ IMAD R18.CC, R0, R14, c[0x0][0x20];
/*0060*/ IMAD.HI.X R19, R0, R14, c[0x0][0x24];
/*0068*/ IMAD R10.CC, R0, R14, c[0x0][0x28];
/*0070*/ LD.E R15, [R18];
/*0078*/ IMAD.HI.X R11, R0, R14, c[0x0][0x2c];
/*0080*/ IMAD R12.CC, R0, R14, c[0x0][0x30];
/*0088*/ MOV R22, c[0x0][0x38];
/*0090*/ IMAD.HI.X R13, R0, R14, c[0x0][0x34];
/*0098*/ IMUL R2, R22, 0x3;
/*00a0*/ IMAD.U32.U32 R8.CC, R21, R14, c[0x0][0x20];
/*00a8*/ LD.E R17, [R10];
/*00b0*/ IMAD.U32.U32.HI.X R9, R21, R14, c[0x0][0x24];
/*00b8*/ LD.E R20, [R12];
/*00c0*/ IMAD.U32.U32 R4.CC, R21, R14, c[0x0][0x28];
/*00c8*/ SHR.U32 R13, R22, 0x1;
/*00d0*/ IMAD.U32.U32.HI R16, R2, c[0x10][0x0], R0;
/*00d8*/ LD.E R23, [R8];
/*00e0*/ IMAD.U32.U32.HI.X R5, R21, R14, c[0x0][0x2c];
/*00e8*/ IADD R19, R0, R13;
/*00f0*/ IMAD.U32.U32 R2.CC, R16, R14, c[0x0][0x28];
/*00f8*/ LD.E R22, [R4];
/*0100*/ IMAD.U32.U32.HI.X R3, R16, R14, c[0x0][0x2c];
/*0108*/ IMAD.U32.U32 R6.CC, R16, R14, c[0x0][0x20];
/*0110*/ LD.E R2, [R2];
/*0118*/ IMAD.U32.U32.HI.X R7, R16, R14, c[0x0][0x24];
/*0120*/ MOV32I R3, 0xfffff800;
/*0128*/ IMAD.U32.U32 R10.CC, R21, R14, c[0x0][0x30];
/*0130*/ IMAD.U32.U32.HI.X R11, R21, R14, c[0x0][0x34];
/*0138*/ IMAD.U32.U32 R12.CC, R16, R14, c[0x0][0x30];
/*0140*/ IMAD.U32.U32.HI.X R13, R16, R14, c[0x0][0x34];
/*0148*/ LD.E R10, [R10];
/*0150*/ IMAD.U32.U32 R8.CC, R19, R14, c[0x0][0x28];
/*0158*/ LD.E R16, [R6];
/*0160*/ IMAD.U32.U32.HI.X R9, R19, R14, c[0x0][0x2c];
/*0168*/ LD.E R12, [R12];
/*0170*/ IMAD.U32.U32 R4.CC, R19, R14, c[0x0][0x30];
/*0178*/ LD.E R8, [R8];
/*0180*/ IMAD.U32.U32.HI.X R5, R19, R14, c[0x0][0x34];
/*0188*/ IMAD.U32.U32 R18.CC, R19, R14, c[0x0][0x20];
/*0190*/ LD.E R4, [R4];
/*0198*/ IMAD.U32.U32.HI.X R19, R19, R14, c[0x0][0x24];
/*01a0*/ LD.E R14, [R18];
/*01a8*/ IMAD R5, R15, R17, R20;
/*01b0*/ IMAD R9, R23, R22, R10;
/*01b8*/ IMAD R11, R14, R8, R4;
/*01c0*/ IMAD R13, R16, R2, R12;
/*01c8*/ IMAD R15, R5, R17, R20;
/*01d0*/ IMAD R21, R9, R22, R10;
/*01d8*/ IMAD R14, R11, R8, R4;
/*01e0*/ IMAD R16, R13, R2, R12;
/*01e8*/ IMAD R15, R15, R17, R20;
/*01f0*/ IMAD R21, R21, R22, R10;
/*01f8*/ IMAD R14, R14, R8, R4;
/*0200*/ IMAD R16, R16, R2, R12;
/*0208*/ IMAD R15, R15, R17, R20;
/*0210*/ IMAD R21, R21, R22, R10;
/*0218*/ IMAD R14, R14, R8, R4;
/*0220*/ IMAD R16, R16, R2, R12;
/*0228*/ IMAD R15, R15, R17, R20;
/*0230*/ IMAD R21, R21, R22, R10;
/*0238*/ IMAD R14, R14, R8, R4;
/*0240*/ IMAD R16, R16, R2, R12;
/*0248*/ IMAD R15, R15, R17, R20;
/*0250*/ IMAD R21, R21, R22, R10;
/*0258*/ IMAD R14, R14, R8, R4;
/*0260*/ IMAD R16, R16, R2, R12;
/*0268*/ IMAD R15, R15, R17, R20;
/*0270*/ IMAD R21, R21, R22, R10;
/*0278*/ IMAD R14, R14, R8, R4;
/*0280*/ IMAD R16, R16, R2, R12;
/*0288*/ IMAD R15, R15, R17, R20;
/*0290*/ IMAD R21, R21, R22, R10;
/*0298*/ IMAD R14, R14, R8, R4;
/*02a0*/ IMAD R16, R16, R2, R12;
/*02a8*/ IMAD R15, R15, R17, R20;
/*02b0*/ IMAD R21, R21, R22, R10;
/*02b8*/ IMAD R14, R14, R8, R4;
/*02c0*/ IMAD R16, R16, R2, R12;
/*02c8*/ IMAD R15, R15, R17, R20;
/*02d0*/ IMAD R21, R21, R22, R10;
/*02d8*/ IMAD R14, R14, R8, R4;
/*02e0*/ IMAD R16, R16, R2, R12;
/*02e8*/ IMAD R15, R15, R17, R20;
/*02f0*/ IMAD R21, R21, R22, R10;
/*02f8*/ IMAD R14, R14, R8, R4;
/*0300*/ IMAD R16, R16, R2, R12;
/*0308*/ IMAD R15, R15, R17, R20;
/*0310*/ IMAD R21, R21, R22, R10;
/*0318*/ IMAD R14, R14, R8, R4;
/*0320*/ IMAD R16, R16, R2, R12;
/*0328*/ IMAD R15, R15, R17, R20;
/*0330*/ IMAD R21, R21, R22, R10;
/*0338*/ IMAD R14, R14, R8, R4;
/*0340*/ IMAD R16, R16, R2, R12;
/*0348*/ IMAD R15, R15, R17, R20;
/*0350*/ IMAD R21, R21, R22, R10;
/*0358*/ IMAD R14, R14, R8, R4;
/*0360*/ IMAD R16, R16, R2, R12;
/*0368*/ IMAD R15, R15, R17, R20;
/*0370*/ IMAD R21, R21, R22, R10;
/*0378*/ IMAD R14, R14, R8, R4;
/*0380*/ IMAD R16, R16, R2, R12;
/*0388*/ IMAD R15, R15, R17, R20;
/*0390*/ IMAD R21, R21, R22, R10;
/*0398*/ IMAD R14, R14, R8, R4;
/*03a0*/ IMAD R16, R16, R2, R12;
/*03a8*/ IMAD R15, R15, R17, R20;
/*03b0*/ IMAD R21, R21, R22, R10;
/*03b8*/ IMAD R14, R14, R8, R4;
/*03c0*/ IMAD R16, R16, R2, R12;
/*03c8*/ IMAD R15, R15, R17, R20;
/*03d0*/ IMAD R21, R21, R22, R10;
/*03d8*/ IMAD R14, R14, R8, R4;
/*03e0*/ IMAD R16, R16, R2, R12;
/*03e8*/ IMAD R15, R15, R17, R20;
/*03f0*/ IMAD R21, R21, R22, R10;
/*03f8*/ IMAD R14, R14, R8, R4;
/*0400*/ IMAD R16, R16, R2, R12;
/*0408*/ IMAD R15, R15, R17, R20;
/*0410*/ IMAD R21, R21, R22, R10;
/*0418*/ IMAD R14, R14, R8, R4;
/*0420*/ IMAD R16, R16, R2, R12;
/*0428*/ IMAD R15, R15, R17, R20;
/*0430*/ IMAD R21, R21, R22, R10;
/*0438*/ IMAD R14, R14, R8, R4;
/*0440*/ IMAD R16, R16, R2, R12;
/*0448*/ IMAD R15, R15, R17, R20;
/*0450*/ IMAD R21, R21, R22, R10;
/*0458*/ IMAD R14, R14, R8, R4;
/*0460*/ IMAD R16, R16, R2, R12;
/*0468*/ IMAD R15, R15, R17, R20;
/*0470*/ IMAD R21, R21, R22, R10;
/*0478*/ IMAD R14, R14, R8, R4;
/*0480*/ IMAD R16, R16, R2, R12;
/*0488*/ IMAD R15, R15, R17, R20;
/*0490*/ IMAD R21, R21, R22, R10;
/*0498*/ IMAD R14, R14, R8, R4;
/*04a0*/ IMAD R16, R16, R2, R12;
/*04a8*/ IMAD R15, R15, R17, R20;
/*04b0*/ IMAD R21, R21, R22, R10;
/*04b8*/ IMAD R14, R14, R8, R4;
/*04c0*/ IMAD R16, R16, R2, R12;
/*04c8*/ IMAD R15, R15, R17, R20;
/*04d0*/ IMAD R21, R21, R22, R10;
/*04d8*/ IMAD R14, R14, R8, R4;
/*04e0*/ IMAD R16, R16, R2, R12;
/*04e8*/ IMAD R15, R15, R17, R20;
/*04f0*/ IMAD R21, R21, R22, R10;
/*04f8*/ IMAD R14, R14, R8, R4;
/*0500*/ IMAD R16, R16, R2, R12;
/*0508*/ IMAD R15, R15, R17, R20;
/*0510*/ IMAD R21, R21, R22, R10;
/*0518*/ IMAD R14, R14, R8, R4;
/*0520*/ IMAD R16, R16, R2, R12;
/*0528*/ IMAD R15, R15, R17, R20;
/*0530*/ IMAD R21, R21, R22, R10;
/*0538*/ IMAD R14, R14, R8, R4;
/*0540*/ IMAD R16, R16, R2, R12;
/*0548*/ IADD R3, R3, 0x20;
/*0550*/ IMAD R15, R15, R17, R20;
/*0558*/ IMAD R21, R21, R22, R10;
/*0560*/ IMAD R14, R14, R8, R4;
/*0568*/ IMAD R16, R16, R2, R12;
/*0570*/ ISETP.NE.AND P0, PT, R3, RZ, PT;
/*0578*/ IMAD R15, R15, R17, R20;
/*0580*/ IMAD R23, R21, R22, R10;
/*0588*/ IMAD R14, R14, R8, R4;
/*0590*/ IMAD R16, R16, R2, R12;
/*0598*/ IMAD R15, R15, R17, R20;
/*05a0*/ IMAD R23, R23, R22, R10;
/*05a8*/ IMAD R14, R14, R8, R4;
/*05b0*/ IMAD R16, R16, R2, R12;
/*05b8*/ @P0 BRA 0x1a8;
/*05c0*/ MOV32I R2, 0x40000000;
/*05c8*/ MOV32I R8, 0x4;
/*05d0*/ IMAD.U32.U32.HI R4, R2, c[0x0][0x38], R0;
/*05d8*/ IMAD R2.CC, R0, R8, c[0x0][0x20];
/*05e0*/ IMAD.HI.X R3, R0, R8, c[0x0][0x24];
/*05e8*/ IMAD.U32.U32 R14.CC, R4, R8, c[0x0][0x20];
/*05f0*/ IMAD.U32.U32.HI.X R15, R4, R8, c[0x0][0x24];
/*05f8*/ ST.E [R2], R5;
/*0600*/ ST.E [R14], R9;
/*0608*/ ST.E [R18], R11;
/*0610*/ ST.E [R6], R13;
/*0618*/ EXIT;
The ST
operations for both CUDA 5.5
and 6.0
operate on R10
, R18
, R17
and R8
which are updated by the last IMAD
's, while for CUDA 6.5 they operate on R5
, R9
, R11
and R13
which are not updated by the last IMAD
's.
What I'm doing wrong?
My set up: core i7 laptop with Windows 7, compiling a release project, either 32
-bit or 64
-bit (same problem). The above refers to 64
-bit.
Command line:
# (Approximate command-line. Settings inherited from host are not visible below.)
# (Please see the output window after a build for the full command-line)
# Driver API (NVCC Compilation Type is .cubin, .gpu, or .ptx)
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\"
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\nvcc.exe" --use-local-env --cl- version 2010 -ccbin "c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\x86_amd64" -- keep --keep-dir x64\Release -maxrregcount=0 --ptxas-options=-v --machine 32 --compile -cudart static -o x64\Release\%(Filename)%(Extension).obj "%(FullPath)"
# Runtime API (NVCC Compilation Type is hybrid object or .c file)
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\"
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\nvcc.exe" --use-local-env --cl- version 2010 -ccbin "c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin\x86_amd64" -- keep --keep-dir x64\Release -maxrregcount=0 --ptxas-options=-v --machine 32 --compile -cudart static -Xcompiler "/EHsc /nologo /Zi " -o x64\Release\%(Filename)%(Extension).obj "%(FullPath)"
EDIT
The problem only occurs when the two additional kernels are present. If I comment them out, everything works. The disassembled codes for the working and non-working cases are reported here and here, respectively.
Upvotes: 2
Views: 255
Reputation: 151799
I have confirmed that the problem is fixed in CUDA 7 EA. When CUDA 7 RC or CUDA 7 production release are available, the issue should be fixed there as well.
Upvotes: 1