Reputation: 9176
I encountered a strange problem where increasing my occupancy by increasing the number of threads reduced performance.
I created the following program to illustrate the problem:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
__global__ void less_threads(float *d_out) {
    int num_inliers;
    for (int j = 0; j < 800; ++j) {
        // Do 12 computations
        num_inliers += j*(j+1);
        num_inliers += j*(j+2);
        num_inliers += j*(j+3);
        num_inliers += j*(j+4);
        num_inliers += j*(j+5);
        num_inliers += j*(j+6);
        num_inliers += j*(j+7);
        num_inliers += j*(j+8);
        num_inliers += j*(j+9);
        num_inliers += j*(j+10);
        num_inliers += j*(j+11);
        num_inliers += j*(j+12);
    }
    if (threadIdx.x == -1)
        d_out[threadIdx.x] = num_inliers;
}

__global__ void more_threads(float *d_out) {
    int num_inliers;
    for (int j = 0; j < 800; ++j) {
        // Do 4 computations
        num_inliers += j*(j+1);
        num_inliers += j*(j+2);
        num_inliers += j*(j+3);
        num_inliers += j*(j+4);
    }
    if (threadIdx.x == -1)
        d_out[threadIdx.x] = num_inliers;
}

int main(int argc, char* argv[])
{
    float *d_out = NULL;
    cudaMalloc((void**)&d_out, sizeof(float) * 25000);
    more_threads<<<780,128>>>(d_out);
    less_threads<<<780,32>>>(d_out);
    return 0;
}
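As an aside, the kernels can also be timed directly with CUDA events instead of relying only on the profiler. This is just a sketch using the standard runtime event API (error checking omitted), not part of the original measurement setup:

cudaEvent_t start, stop;
float ms = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
more_threads<<<780,128>>>(d_out);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);           // wait for the kernel to finish
cudaEventElapsedTime(&ms, start, stop);
printf("more_threads: %f ms\n", ms);

cudaEventRecord(start, 0);
less_threads<<<780,32>>>(d_out);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);
printf("less_threads: %f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);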
And the PTX output is:
.entry _Z12less_threadsPf (
.param .u32 __cudaparm__Z12less_threadsPf_d_out)
{
.reg .u32 %r<35>;
.reg .f32 %f<3>;
.reg .pred %p<4>;
.loc 17 6 0
// 2 #include <stdlib.h>
// 3 #include <cuda_runtime.h>
// 4 #include <cutil.h>
// 5
// 6 __global__ void less_threads(float * d_out) {
$LBB1__Z12less_threadsPf:
mov.s32 %r1, 0;
mov.s32 %r2, 0;
mov.s32 %r3, 0;
mov.s32 %r4, 0;
mov.s32 %r5, 0;
mov.s32 %r6, 0;
mov.s32 %r7, 0;
mov.s32 %r8, 0;
mov.s32 %r9, 0;
mov.s32 %r10, 0;
mov.s32 %r11, 0;
mov.s32 %r12, %r13;
mov.s32 %r14, 0;
$Lt_0_2562:
//<loop> Loop body line 6, nesting depth: 1, iterations: 800
.loc 17 10 0
// 7 int num_inliers;
// 8 for (int j=0;j<800;++j) {
// 9 //Do 12 computations
// 10 num_inliers += j*(j+1);
mul.lo.s32 %r15, %r14, %r14;
add.s32 %r16, %r12, %r14;
add.s32 %r12, %r15, %r16;
.loc 17 11 0
// 11 num_inliers += j*(j+2);
add.s32 %r17, %r15, %r12;
add.s32 %r12, %r1, %r17;
.loc 17 12 0
// 12 num_inliers += j*(j+3);
add.s32 %r18, %r15, %r12;
add.s32 %r12, %r2, %r18;
.loc 17 13 0
// 13 num_inliers += j*(j+4);
add.s32 %r19, %r15, %r12;
add.s32 %r12, %r3, %r19;
.loc 17 14 0
// 14 num_inliers += j*(j+5);
add.s32 %r20, %r15, %r12;
add.s32 %r12, %r4, %r20;
.loc 17 15 0
// 15 num_inliers += j*(j+6);
add.s32 %r21, %r15, %r12;
add.s32 %r12, %r5, %r21;
.loc 17 16 0
// 16 num_inliers += j*(j+7);
add.s32 %r22, %r15, %r12;
add.s32 %r12, %r6, %r22;
.loc 17 17 0
// 17 num_inliers += j*(j+8);
add.s32 %r23, %r15, %r12;
add.s32 %r12, %r7, %r23;
.loc 17 18 0
// 18 num_inliers += j*(j+9);
add.s32 %r24, %r15, %r12;
add.s32 %r12, %r8, %r24;
.loc 17 19 0
// 19 num_inliers += j*(j+10);
add.s32 %r25, %r15, %r12;
add.s32 %r12, %r9, %r25;
.loc 17 20 0
// 20 num_inliers += j*(j+11);
add.s32 %r26, %r15, %r12;
add.s32 %r12, %r10, %r26;
.loc 17 21 0
// 21 num_inliers += j*(j+12);
add.s32 %r27, %r15, %r12;
add.s32 %r12, %r11, %r27;
add.s32 %r14, %r14, 1;
add.s32 %r11, %r11, 12;
add.s32 %r10, %r10, 11;
add.s32 %r9, %r9, 10;
add.s32 %r8, %r8, 9;
add.s32 %r7, %r7, 8;
add.s32 %r6, %r6, 7;
add.s32 %r5, %r5, 6;
add.s32 %r4, %r4, 5;
add.s32 %r3, %r3, 4;
add.s32 %r2, %r2, 3;
add.s32 %r1, %r1, 2;
mov.u32 %r28, 1600;
setp.ne.s32 %p1, %r1, %r28;
@%p1 bra $Lt_0_2562;
cvt.u32.u16 %r29, %tid.x;
mov.u32 %r30, -1;
setp.ne.u32 %p2, %r29, %r30;
@%p2 bra $Lt_0_3074;
.loc 17 25 0
// 22 }
// 23
// 24 if (threadIdx.x == -1)
// 25 d_out[threadIdx.x] = num_inliers;
cvt.rn.f32.s32 %f1, %r12;
ld.param.u32 %r31, [__cudaparm__Z12less_threadsPf_d_out];
mul24.lo.u32 %r32, %r29, 4;
add.u32 %r33, %r31, %r32;
st.global.f32 [%r33+0], %f1;
$Lt_0_3074:
.loc 17 26 0
// 26 }
exit;
$LDWend__Z12less_threadsPf:
} // _Z12less_threadsPf
.entry _Z12more_threadsPf (
.param .u32 __cudaparm__Z12more_threadsPf_d_out)
{
.reg .u32 %r<19>;
.reg .f32 %f<3>;
.reg .pred %p<4>;
.loc 17 28 0
// 27
// 28 __global__ void more_threads(float *d_out) {
$LBB1__Z12more_threadsPf:
mov.s32 %r1, 0;
mov.s32 %r2, 0;
mov.s32 %r3, 0;
mov.s32 %r4, %r5;
mov.s32 %r6, 0;
$Lt_1_2562:
//<loop> Loop body line 28, nesting depth: 1, iterations: 800
.loc 17 32 0
// 29 int num_inliers;
// 30 for (int j=0;j<800;++j) {
// 31 // Do 4 computations
// 32 num_inliers += j*(j+1);
mul.lo.s32 %r7, %r6, %r6;
add.s32 %r8, %r4, %r6;
add.s32 %r4, %r7, %r8;
.loc 17 33 0
// 33 num_inliers += j*(j+2);
add.s32 %r9, %r7, %r4;
add.s32 %r4, %r1, %r9;
.loc 17 34 0
// 34 num_inliers += j*(j+3);
add.s32 %r10, %r7, %r4;
add.s32 %r4, %r2, %r10;
.loc 17 35 0
// 35 num_inliers += j*(j+4);
add.s32 %r11, %r7, %r4;
add.s32 %r4, %r3, %r11;
add.s32 %r6, %r6, 1;
add.s32 %r3, %r3, 4;
add.s32 %r2, %r2, 3;
add.s32 %r1, %r1, 2;
mov.u32 %r12, 1600;
setp.ne.s32 %p1, %r1, %r12;
@%p1 bra $Lt_1_2562;
cvt.u32.u16 %r13, %tid.x;
mov.u32 %r14, -1;
setp.ne.u32 %p2, %r13, %r14;
@%p2 bra $Lt_1_3074;
.loc 17 38 0
// 36 }
// 37 if (threadIdx.x == -1)
// 38 d_out[threadIdx.x] = num_inliers;
cvt.rn.f32.s32 %f1, %r4;
ld.param.u32 %r15, [__cudaparm__Z12more_threadsPf_d_out];
mul24.lo.u32 %r16, %r13, 4;
add.u32 %r17, %r15, %r16;
st.global.f32 [%r17+0], %f1;
$Lt_1_3074:
.loc 17 39 0
// 39 }
exit;
$LDWend__Z12more_threadsPf:
} // _Z12more_threadsPf
Note that both kernels should do the same amount of work in total (the if (threadIdx.x == -1) test is a trick to stop the compiler optimising everything away and leaving an empty kernel). The work should be the same because more_threads uses 4 times as many threads, but each thread does a quarter of the work.
Significant results from the profiler are as follows:
more_threads: GPU runtime = 1474 us, reg per thread = 6, occupancy = 1, branch = 83746, divergent_branch = 26, instructions = 584065, gst request = 1084552
less_threads: GPU runtime = 921 us, reg per thread = 14, occupancy = 0.25, branch = 20956, divergent_branch = 26, instructions = 312663, gst request = 677381
As I said previously, the run time of the kernel using more threads is longer; this could be due to the increased number of instructions.
Why are there more instructions?
Why is there any branching, let alone divergent branching, considering there is no conditional code?
Why are there any gst requests when there is no global memory access?
What is going on here?
Thanks
Update
Added PTX code and fixed CUDA C so it should compile
Upvotes: 2
Views: 2225
Reputation: 7383
The two functions are not doing the same amount of work.
more_threads<<<780, 128>>>(): 780 blocks * 128 threads * 800 iterations * 4 computations * 3 operations per computation (an add, a multiply and an add) = 958,464,000 operations.
less_threads<<<780, 32>>>(): 780 blocks * 32 threads * 800 iterations * 12 computations * 3 operations per computation = 718,848,000 operations.
So more_threads is doing more work than less_threads, which is why the number of instructions goes up and why more_threads is slower. To fix more_threads, do only 3 computations inside the loop: 780*128*800*(3*3) = 718,848,000, the same total as less_threads.
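That is, the loop in more_threads would become (the same code as in the question, just one computation dropped):

for (int j = 0; j < 800; ++j) {
    // Do 3 computations so the total work matches less_threads
    num_inliers += j*(j+1);
    num_inliers += j*(j+2);
    num_inliers += j*(j+3);
}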
Upvotes: 4
Reputation: 21108
Since your code has only arithmetic instructions, you don't need very high occupancy to hide the latency of the arithmetic units. Indeed, even if you do have memory instructions you can maximise performance with ~50% occupancy provided your reads/writes are efficient. See the recorded Advanced CUDA C presentation for more information on occupancy and performance.
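For what it's worth, the occupancy figures you report are consistent with the per-SM limits of a compute 1.2/1.3 part (assuming the usual limits of at most 8 resident blocks and 32 resident warps per SM):

less_threads: 32 threads/block  = 1 warp/block  -> 8 blocks/SM * 1 warp = 8 warps  -> 8/32  = 0.25 occupancy
more_threads: 128 threads/block = 4 warps/block -> 8 blocks/SM * 4 warps = 32 warps -> 32/32 = 1.0  occupancy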
In your case, given that your kernel doesn't need high occupancy to saturate the arithmetic units, you will get better performance using fewer, larger blocks rather than more, smaller blocks, since there is a cost to launching blocks. In general, however, the cost of launching blocks is negligible compared with the time spent actually running the code.
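For example (a hypothetical reconfiguration, keeping the total thread count of 780*32 = 24,960 unchanged), less_threads could be launched with a quarter as many blocks, each four times larger:

less_threads<<<195, 128>>>(d_out);   // 195 blocks * 128 threads = 24,960 threads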
Why are there more instructions?
Remember that the counters are not counting per block (aka CTA) but instead per SM (Streaming Multiprocessor) or per TPC (Texture Processing Cluster) which is a group of two or three SMs depending on your device. The instructions count is per SM.
It is fair to expect the less_threads kernel to execute fewer instructions; however, more_threads launches four times as many warps per block, which means each SM executes the kernel code roughly four times as many times. Taking the shorter kernel body into account, your measurement doesn't seem unreasonable.
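As a rough sanity check against the PTX above (my own counting, so treat the exact numbers loosely):

less_threads: ~40 PTX instructions per loop iteration * 1 warp/block  = ~40 instructions per block per iteration
more_threads: ~16 PTX instructions per loop iteration * 4 warps/block = ~64 instructions per block per iteration

That is a ratio of about 64/40 = 1.6x, in the same ballpark as the measured 584065 / 312663 ≈ 1.9x (the counters also include the code outside the loop).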
Why is there any branching?
Actually you do have conditional code:
for (int j=0;j<800;++j)
This has a condition; however, all threads within a warp execute the same path, so it is not divergent. My guess is that the divergence is in the administration code somewhere; you could look at the PTX to analyse this if you were worried. 26 is very low compared with the number of instructions executed, so it will not affect your performance.
Why are there any gst requests?
In your code you have:
if (threadIdx.x == -1)
d_out[blockIdx.x*blockDim.x+threadIdx.x] = num_inliers;
This will be handled by the load/store unit and hence counted, even though it results in no actual transaction. The gst_32/gst_64/gst_128 counters indicate actual memory transfers (your device has compute capability 1.2 or 1.3; older devices have different sets of counters).
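If you want to convince yourself that nothing is actually written, a quick host-side check (a sketch using the standard cudaMemset/cudaMemcpy calls; h_out is a name introduced here for illustration) is:

float *h_out = (float*)malloc(sizeof(float) * 25000);
cudaMemset(d_out, 0, sizeof(float) * 25000);
more_threads<<<780, 128>>>(d_out);
cudaMemcpy(h_out, d_out, sizeof(float) * 25000, cudaMemcpyDeviceToHost);
// No thread ever has threadIdx.x == -1, so every element of h_out is still 0.0f
free(h_out);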
Upvotes: 4
Reputation: 51465
The two functions have a different number of code lines, so a different number of instructions.
The for loop is implemented using branches; the last line of code is always divergent.
A global store request is not the same as a global store: the operation is set up, but never committed.
Upvotes: 0