Reputation: 2455
I was wondering why NVCC isn't able to unroll the following Cholesky factorization kernel for small matrices (N=4).
// In-place Cholesky-style factorization of a small row-major NxN matrix C
// (presumably C = L*L^T with L stored in the lower triangle — standard
// Cholesky–Banachiewicz ordering; the upper triangle is never touched).
// NOTE(review): the triangular bounds (`j <= i`, `k < j`) make the inner trip
// counts depend on the outer index, which is what the question is about.
template<typename T, int N>
__device__ inline
void choleskyKernel2(T* C){
#pragma unroll
for (int i = 0; i < N; i++){
#pragma unroll
// Triangular inner loop: trip count varies with i.
for (int j = 0; j <= i; j++) {
double s = 0; // dot-product accumulator, always double regardless of T
#pragma unroll
for (int k = 0; k < j; k++){
s += C[i*N+k] * C[j*N+k];
}
s = C[i*N+j] - s;
C[i*N+j] = (i == j) ?
sqrt(s) : // diagonal entry: sqrt of the reduced pivot
(1.0 / C[j*N+j] * (s)); // off-diagonal: scale by reciprocal of row-j pivot
}
}
}
The generated PTX code looks like this:
sqrt.rn.f64 %fd12, %fd29;
st.local.f64 [%rd1], %fd12;
rcp.rn.f64 %fd34, %fd12;
mul.f64 %fd13, %fd30, %fd34;
st.local.f64 [%rd1+32], %fd13;
fma.rn.f64 %fd35, %fd13, %fd13, 0d0000000000000000;
sub.f64 %fd36, %fd31, %fd35;
sqrt.rn.f64 %fd14, %fd36;
st.local.f64 [%rd1+40], %fd14;
mul.f64 %fd15, %fd32, %fd34;
st.local.f64 [%rd1+64], %fd15;
ld.local.f64 %fd37, [%rd1+32];
fma.rn.f64 %fd38, %fd15, %fd37, 0d0000000000000000;
sub.f64 %fd39, %fd33, %fd38;
rcp.rn.f64 %fd40, %fd14;
mul.f64 %fd16, %fd39, %fd40;
st.local.f64 [%rd1+72], %fd16;
mov.f64 %fd58, 0d0000000000000000;
mov.u32 %r58, -2;
mov.u64 %rd40, -8;
BB1_5:
shl.b64 %rd23, %rd40, 3;
sub.s64 %rd24, %rd1, %rd23;
ld.local.f64 %fd41, [%rd24];
fma.rn.f64 %fd58, %fd41, %fd41, %fd58;
add.s64 %rd40, %rd40, -1;
add.s32 %r58, %r58, 1;
setp.ne.s32 %p3, %r58, 0;
@%p3 bra BB1_5;
sub.f64 %fd43, %fd6, %fd58;
sqrt.rn.f64 %fd19, %fd43;
st.local.f64 [%rd1+80], %fd19;
mul.f64 %fd20, %fd8, %fd34;
st.local.f64 [%rd1+96], %fd20;
ld.local.f64 %fd45, [%rd1+32];
fma.rn.f64 %fd46, %fd20, %fd45, 0d0000000000000000;
sub.f64 %fd47, %fd9, %fd46;
mul.f64 %fd21, %fd47, %fd40;
st.local.f64 [%rd1+104], %fd21;
mov.f64 %fd59, 0d0000000000000000;
mov.u32 %r59, -2;
mov.u64 %rd41, %rd1;
BB1_7:
mov.u64 %rd5, %rd41;
ld.local.f64 %fd49, [%rd5+64];
ld.local.f64 %fd50, [%rd5+96];
fma.rn.f64 %fd59, %fd50, %fd49, %fd59;
add.s64 %rd6, %rd5, 8;
add.s32 %r59, %r59, 1;
setp.ne.s32 %p4, %r59, 0;
mov.u64 %rd41, %rd6;
@%p4 bra BB1_7;
sub.f64 %fd52, %fd10, %fd59;
rcp.rn.f64 %fd53, %fd19;
mul.f64 %fd24, %fd52, %fd53;
st.local.f64 [%rd1+112], %fd24;
mov.f64 %fd60, 0d0000000000000000;
mov.u32 %r60, -3;
mov.u64 %rd42, -12;
BB1_9:
shl.b64 %rd26, %rd42, 3;
sub.s64 %rd27, %rd1, %rd26;
ld.local.f64 %fd54, [%rd27];
fma.rn.f64 %fd60, %fd54, %fd54, %fd60;
add.s64 %rd42, %rd42, -1;
add.s32 %r60, %r60, 1;
setp.ne.s32 %p5, %r60, 0;
@%p5 bra BB1_9;
Clearly, the loops aren't unrolled and expensive local memory is used (my goal is to do everything in registers only).
I'm calling the function like so:
// Stage the matrix into a thread-local array (intended to end up in
// registers), factor it in place, then copy it back to global memory.
T l[N*N];
for(int i = 0; i < N*N; ++i){
l[i] = buffer[offset+i];
}
choleskyKernel2<T,N>(l);
for(int i = 0; i < N*N; ++i){
buffer[offset+i] = l[i];
}
Is there a way to properly unroll these loops so everything can be done in registers?
EDIT:
Full code:
#include <thrust/device_vector.h>
// In-place factorization of a small row-major NxN matrix C (Cholesky-style:
// sqrt on the diagonal, division by the row-j pivot off the diagonal).
// Only the lower triangle (j <= i) is read and written.
template<typename T, int N>
__device__ inline
void choleskyKernel2(T* C){
#pragma unroll
for (int i = 0; i < N; i++){
#pragma unroll
// Triangular bound: inner trip count depends on the outer index i.
for (int j = 0; j <= i; j++) {
double s = 0; // accumulate the k-dot-product in double
#pragma unroll
for (int k = 0; k < j; k++){
s += C[i*N+k] * C[j*N+k];
}
s = C[i*N+j] - s;
C[i*N+j] = (i == j) ?
sqrt(s) : // diagonal: sqrt of reduced pivot
(1.0 / C[j*N+j] * (s)); // off-diagonal: multiply by reciprocal pivot
}
}
}
// Minimal test kernel: copy one NxN matrix from global memory into a
// thread-local array, factor it, and write it back. It is launched with a
// single thread (<<<1,1>>> in main below), so no thread-index guard is used
// and every thread-local access targets the same single matrix.
template<typename T, int N>
__global__ static
void test3(T* buffer){
const int matrixElements = N * N; // compile-time-constant element count
T l[matrixElements]; // intended to be promoted to registers, not local memory
for(int i = 0; i < matrixElements; ++i){
l[i] = buffer[i];
}
choleskyKernel2<T,N>(l);
for(int i = 0; i < matrixElements; ++i){
buffer[i] = l[i];
}
}
// Host driver: allocate a 4x4 double matrix on the device and launch the
// single-thread test kernel on it.
// NOTE(review): there is no cudaGetLastError()/cudaDeviceSynchronize() after
// the launch — acceptable for inspecting generated PTX, but any launch or
// execution error would go unnoticed; add checks before reading results.
int main(){
thrust::device_vector<double> d_data(16); // 16 = 4*4 elements
test3<double,4> <<< 1,1 >>>(thrust::raw_pointer_cast(d_data.data()));
}
Upvotes: 4
Views: 1541
Reputation: 7245
While I can't tell you why nvcc (or indeed cicc which performs the device code compilation on behalf of nvcc) doesn't unroll your loops, I can show you how to change the code so that it does.
Turn
#pragma unroll
for (int i = 0; i < N; i++){
#pragma unroll
for (int j = 0; j <= i; j++) {
into
#pragma unroll
for (int i = 0; i < N; i++) {
#pragma unroll
for (int j = 0; j < N; j++)
if (j <= i) {
and you will find all loops unrolled with no use of any local memory.
This happens even though you didn't ask for the load and store loops to be unrolled. In fact, with the change above, you don't need any #pragma unroll
directives at all.
Upvotes: 6