Reputation: 6861
I'm trying to improve the performance of my function. The profiler points to the code in the inner loop. Can I improve the performance of that code, maybe using SSE intrinsics?
void ConvertImageFrom_R16_FLOAT_To_R32_FLOAT(char* buffer, void* convertedData, DWORD width, DWORD height, UINT rowPitch)
{
    struct SINGLE_FLOAT
    {
        union {
            struct {
                unsigned __int32 R_m : 23;
                unsigned __int32 R_e : 8;
                unsigned __int32 R_s : 1;
            };
            struct {
                float r;
            };
        };
    };
    C_ASSERT(sizeof(SINGLE_FLOAT) == 4); // 4 bytes
    struct HALF_FLOAT
    {
        unsigned __int16 R_m : 10;
        unsigned __int16 R_e : 5;
        unsigned __int16 R_s : 1;
    };
    C_ASSERT(sizeof(HALF_FLOAT) == 2);
    SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;
    for(DWORD j = 0; j < height; j++)
    {
        HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
        for(DWORD i = 0; i < width; i++)
        {
            d->R_s = s->R_s;
            d->R_e = s->R_e - 15 + 127;
            d->R_m = s->R_m << (23-10);
            d++;
            s++;
        }
    }
}
Update:
Disassembly
; Listing generated by Microsoft (R) Optimizing Compiler Version 16.00.40219.01
TITLE Utils.cpp
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES
PUBLIC ?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z ; ConvertImageFrom_R16_FLOAT_To_R32_FLOAT
; Function compile flags: /Ogtp
; COMDAT ?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z
_TEXT SEGMENT
_buffer$ = 8 ; size = 4
tv83 = 12 ; size = 4
_convertedData$ = 12 ; size = 4
_width$ = 16 ; size = 4
_height$ = 20 ; size = 4
_rowPitch$ = 24 ; size = 4
?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z PROC ; ConvertImageFrom_R16_FLOAT_To_R32_FLOAT, COMDAT
; 323 : {
push ebp
mov ebp, esp
; 343 : for(DWORD j = 0; j< height; j++)
mov eax, DWORD PTR _height$[ebp]
push esi
mov esi, DWORD PTR _convertedData$[ebp]
test eax, eax
je SHORT $LN4@ConvertIma
; 324 : union SINGLE_FLOAT {
; 325 : struct {
; 326 : unsigned __int32 R_m : 23;
; 327 : unsigned __int32 R_e : 8;
; 328 : unsigned __int32 R_s : 1;
; 329 : };
; 330 : struct {
; 331 : float r;
; 332 : };
; 333 : };
; 334 : C_ASSERT(sizeof(SINGLE_FLOAT) == 4);
; 335 : struct HALF_FLOAT
; 336 : {
; 337 : unsigned __int16 R_m : 10;
; 338 : unsigned __int16 R_e : 5;
; 339 : unsigned __int16 R_s : 1;
; 340 : };
; 341 : C_ASSERT(sizeof(HALF_FLOAT) == 2);
; 342 : SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;
push ebx
mov ebx, DWORD PTR _buffer$[ebp]
push edi
mov DWORD PTR tv83[ebp], eax
$LL13@ConvertIma:
; 344 : {
; 345 : HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
; 346 : for(DWORD i = 0; i< width; i++)
mov edi, DWORD PTR _width$[ebp]
mov edx, ebx
test edi, edi
je SHORT $LN5@ConvertIma
npad 1
$LL3@ConvertIma:
; 347 : {
; 348 : d->R_s = s->R_s;
movzx ecx, WORD PTR [edx]
movzx eax, WORD PTR [edx]
shl ecx, 16 ; 00000010H
xor ecx, DWORD PTR [esi]
shl eax, 16 ; 00000010H
and ecx, 2147483647 ; 7fffffffH
xor ecx, eax
mov DWORD PTR [esi], ecx
; 349 : d->R_e = s->R_e - 15 + 127;
movzx eax, WORD PTR [edx]
shr eax, 10 ; 0000000aH
and eax, 31 ; 0000001fH
add eax, 112 ; 00000070H
shl eax, 23 ; 00000017H
xor eax, ecx
and eax, 2139095040 ; 7f800000H
xor eax, ecx
mov DWORD PTR [esi], eax
; 350 : d->R_m = s->R_m << (23-10);
movzx ecx, WORD PTR [edx]
and ecx, 1023 ; 000003ffH
shl ecx, 13 ; 0000000dH
and eax, -8388608 ; ff800000H
or ecx, eax
mov DWORD PTR [esi], ecx
; 351 : d++;
add esi, 4
; 352 : s++;
add edx, 2
dec edi
jne SHORT $LL3@ConvertIma
$LN5@ConvertIma:
; 343 : for(DWORD j = 0; j< height; j++)
add ebx, DWORD PTR _rowPitch$[ebp]
dec DWORD PTR tv83[ebp]
jne SHORT $LL13@ConvertIma
pop edi
pop ebx
$LN4@ConvertIma:
pop esi
; 353 : }
; 354 : }
; 355 : }
pop ebp
ret 0
?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z ENDP ; ConvertImageFrom_R16_FLOAT_To_R32_FLOAT
_TEXT ENDS
Upvotes: 3
Views: 2628
Reputation: 365267
The x86 F16C instruction-set extension adds hardware support for converting single-precision float vectors to/from vectors of half-precision float.
The format is the same IEEE 754 half-precision binary16 that you describe. I didn't check that the endianness is the same as your struct, but that's easy to fix if needed (with a pshufb).
F16C is supported starting from Intel Ivy Bridge and AMD Piledriver. (It has its own CPUID feature bit, which your code should check for; otherwise fall back to SIMD integer shifts and shuffles.)
The intrinsics for VCVTPS2PH are:
__m128i _mm_cvtps_ph(__m128 m1, const int imm);
__m128i _mm256_cvtps_ph(__m256 m1, const int imm);
The immediate byte is a rounding control. The compiler can use it as a convert-and-store directly to memory (unlike most instructions that can optionally use a memory operand, where it's the source operand that can be memory instead of a register.)
VCVTPH2PS goes the other way, and is just like most other SSE instructions (can be used between registers or as a load).
__m128 _mm_cvtph_ps(__m128i m1);
__m256 _mm256_cvtph_ps(__m128i m1);
F16C is so efficient that you might want to consider leaving your image in half-precision format, and converting on the fly every time you need a vector of data from it. This is great for your cache footprint.
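A minimal sketch of the conversion loop using these intrinsics (the loop and function name are mine, not from the original; it assumes width is a multiple of 4 and a compiler with F16C support, e.g. -mf16c or /arch:AVX2, plus the runtime CPUID check mentioned above):
#include <immintrin.h>

void ConvertRows_F16C(const char* buffer, float* dst,
                      unsigned width, unsigned height, unsigned rowPitch)
{
    for (unsigned j = 0; j < height; j++)
    {
        const char* row = buffer + (size_t)rowPitch * j;
        for (unsigned i = 0; i < width; i += 4)
        {
            // Load 4 packed half floats (64 bits) and widen to 4 floats.
            __m128i h = _mm_loadl_epi64((const __m128i*)(row + i * 2));
            _mm_storeu_ps(dst, _mm_cvtph_ps(h));
            dst += 4;
        }
    }
}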
Upvotes: 7
Reputation:
My suspicion is that this operation will already be bottlenecked on memory access, and making it more efficient (e.g., using SSE) would not make it execute more quickly. However, this is only a suspicion.
Other things to try, assuming x86/x64, might be:
- Don't use d++ and s++, but use d[i] and s[i] on each iteration, as sketched below. (Then of course bump d after each scanline.) Since the elements of d are 4 bytes and those of s are 2, this operation can be folded into the address calculation. (Unfortunately I can't guarantee that this would necessarily make execution more efficient.)
- Count width down to zero. This stops the compiler having to fetch width each time round. Probably more important for x86, because it has so few registers. (If the CPU likes my "d[i] and s[i]" suggestion, you could make width signed, count from width-1 instead, and walk backwards.)

These would all be quicker to try than converting to SSE and would hopefully make it memory-bound, if it isn't already, at which point you can give up.
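A minimal sketch of the indexed version (this reuses the structs from the question; whether it helps is compiler- and CPU-dependent, so measure):
SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;
for(DWORD j = 0; j < height; j++)
{
    HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
    for(DWORD i = 0; i < width; i++) // d[i]/s[i] fold into the address calculation
    {
        d[i].R_s = s[i].R_s;
        d[i].R_e = s[i].R_e - 15 + 127;
        d[i].R_m = s[i].R_m << (23-10);
    }
    d += width; // bump d once per scanline
}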
Finally, if the output is in write-combined memory (e.g., it's a texture or vertex buffer or something accessed over AGP, or PCI Express, or whatever it is PCs have these days) then this could well result in poor performance, depending on what code the compiler has generated for the inner loop. So if that is the case, you may get better results converting each scanline into a local buffer then using memcpy to copy it to its final destination.
Upvotes: 0
Reputation: 57729
Here are some ideas:
- Put the constants into const register variables. Some processors don't like fetching constants from memory; it is awkward and may take many instruction cycles.
- Repeat the statements in the loop and increase the increment (i.e. unroll it). Processors prefer continuous instructions; jumps and branches anger them.
- Use more variables in the loop and declare them as volatile so the compiler doesn't optimize them:
SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;
for(DWORD j = 0; j < height; j++)
{
    // Four pointers into the same row, one element apart, so the inner
    // loop converts four pixels per iteration (assumes width % 4 == 0).
    HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
    HALF_FLOAT* s1 = s + 1;
    HALF_FLOAT* s2 = s + 2;
    HALF_FLOAT* s3 = s + 3;
    SINGLE_FLOAT* d1 = d + 1;
    SINGLE_FLOAT* d2 = d + 2;
    SINGLE_FLOAT* d3 = d + 3;
    for(DWORD i = 0; i < width; i += 4)
    {
        d->R_s = s->R_s;
        d->R_e = s->R_e - 15 + 127;
        d->R_m = s->R_m << (23-10);
        d1->R_s = s1->R_s;
        d1->R_e = s1->R_e - 15 + 127;
        d1->R_m = s1->R_m << (23-10);
        d2->R_s = s2->R_s;
        d2->R_e = s2->R_e - 15 + 127;
        d2->R_m = s2->R_m << (23-10);
        d3->R_s = s3->R_s;
        d3->R_e = s3->R_e - 15 + 127;
        d3->R_m = s3->R_m << (23-10);
        d += 4; d1 += 4; d2 += 4; d3 += 4;
        s += 4; s1 += 4; s2 += 4; s3 += 4;
    }
}
Upvotes: 1
Reputation: 7348
SSE intrinsics seem like an excellent idea. Before you go down that road, you should:
- look at the assembly code generated by the compiler (is there potential for optimization?),
- search your compiler documentation for how to generate SSE code automatically,
- search your software library's documentation (or wherever the 16-bit float type originated) for a function to bulk-convert this type (a conversion to 64-bit floating point could be helpful too). You are very likely not the first person to encounter this problem!

If all that fails, go and try your luck with some SSE intrinsics. To get some idea, here is some SSE code to convert from 32- to 16-bit floating point (you want the reverse).
Besides SSE you should also consider multi-threading and offloading the task to the GPU.
Upvotes: 1
Reputation: 7939
The function is only doing a few small things. It is going to be tough to shave much off the time by optimisation, but as somebody already said, parallelisation has promise.
Check how many cache misses you are getting. If the data is paging in and out, you might be able to speed it up by applying more intelligence to the ordering to minimise cache swaps.
Also consider macro-optimisations. Are there any redundancies in the data computation that might be avoided (e.g. caching old results instead of recomputing them when needed)? Do you really need to convert the whole data set or could you just convert the bits you need? I don't know your application so I'm just guessing wildly here, but there might be scope for that kind of optimisation.
Upvotes: 0
Reputation: 5414
You should be able to reduce this to a single instruction on chips which support the upcoming CVT16 instruction set. According to that Wikipedia article:
The CVT16 instructions allow conversion of floating point vectors between single precision and half precision.
Upvotes: 1
Reputation: 14471
You're processing the data as a two-dimensional array. If you consider how it's laid out in memory, you may be able to process it as a one-dimensional array and save a little overhead by having one loop instead of nested loops.
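A sketch of the flattened version (only valid when the rows are tightly packed, i.e. rowPitch == width * sizeof(HALF_FLOAT); with row padding you still need the nested loops):
HALF_FLOAT* s = (HALF_FLOAT*)buffer;
SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;
DWORD count = width * height; // one loop over all pixels
for(DWORD i = 0; i < count; i++)
{
    d[i].R_s = s[i].R_s;
    d[i].R_e = s[i].R_e - 15 + 127;
    d[i].R_m = s[i].R_m << (23-10);
}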
I'd also compile to assembly code and make sure the compiler optimization worked and it isn't recalculating (15 + 127) hundreds of times.
Upvotes: 1
Reputation: 26124
Accessing bitfields in memory can be really tricky, depending on the architecture, of course.
You might achieve better performance if you made a union of a float and a 32-bit integer, and simply performed all decomposition and composition using local variables. That way the generated code could perform the entire operation using only processor registers.
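A sketch of that idea (the helper name is mine; the bit layout matches the structs from the question, and, like the original, it ignores zeros, denormals, and Inf/NaN):
union FloatBits { float f; unsigned __int32 u; };

float HalfToFloat(unsigned __int16 h) // hypothetical helper
{
    unsigned __int32 s = (h >> 15) & 0x1;   // sign
    unsigned __int32 e = (h >> 10) & 0x1F;  // 5-bit exponent
    unsigned __int32 m = h & 0x3FF;         // 10-bit mantissa

    FloatBits out;
    // Compose the whole 32-bit pattern in a register, then store once.
    out.u = (s << 31) | ((e - 15 + 127) << 23) | (m << 13);
    return out.f;
}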
Upvotes: 2
Reputation: 15400
I don't know about SSE intrinsics, but it would be interesting to see a disassembly of your inner loop. An old-school way (that may not help much, but that would be easy to try out) would be to reduce the number of iterations by doing two inner loops: one that does N (say 32) repeats of the processing (loop count of width/N) and then one to finish the remainder (loop count of width%N)... with those divs and modulos calculated outside the first loop to avoid recalculating them. Apologies if that sounds obvious!
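A sketch of that structure, reusing the structs from the question (N = 32 is an arbitrary choice here):
const DWORD N = 32;        // block size; tune to taste
DWORD blocks = width / N;  // div and mod computed once, outside the loops
DWORD rem = width % N;
for(DWORD b = 0; b < blocks; b++)
{
    for(DWORD k = 0; k < N; k++) // fixed trip count, friendly to unrolling
    {
        d->R_s = s->R_s;
        d->R_e = s->R_e - 15 + 127;
        d->R_m = s->R_m << (23-10);
        d++; s++;
    }
}
for(DWORD k = 0; k < rem; k++) // finish the remainder
{
    d->R_s = s->R_s;
    d->R_e = s->R_e - 15 + 127;
    d->R_m = s->R_m << (23-10);
    d++; s++;
}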
Upvotes: 0
Reputation: 26181
The loops are independent of each other, so you could easily parallelize this code, either by using SIMD or OpenMP. A simple version would be to split the top half and the bottom half of the image between two threads, running concurrently.
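A minimal OpenMP sketch of the row-split idea (this lets OpenMP distribute the rows across threads rather than hard-coding two halves; it reuses the question's structs, needs OpenMP enabled, e.g. /openmp with MSVC, and uses a signed loop variable as older OpenMP versions require):
#pragma omp parallel for
for(int j = 0; j < (int)height; j++)
{
    HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
    SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData + (size_t)j * width;
    for(DWORD i = 0; i < width; i++)
    {
        d[i].R_s = s[i].R_s;
        d[i].R_e = s[i].R_e - 15 + 127;
        d[i].R_m = s[i].R_m << (23-10);
    }
}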
Upvotes: 1