Reputation: 6861
I'm trying to improve the performance of my function. The profiler points to the code in the inner loop. Can I improve the performance of that code, maybe using SSE intrinsics?
void ConvertImageFrom_R16_FLOAT_To_R32_FLOAT(char* buffer, void* convertedData, DWORD width, DWORD height, UINT rowPitch)
{
    struct SINGLE_FLOAT
    {
        union {
            struct {
                unsigned __int32 R_m : 23;
                unsigned __int32 R_e : 8;
                unsigned __int32 R_s : 1;
            };
            struct {
                float r;
            };
        };
    };
    C_ASSERT(sizeof(SINGLE_FLOAT) == 4); // 4 bytes
    struct HALF_FLOAT
    {
        unsigned __int16 R_m : 10;
        unsigned __int16 R_e : 5;
        unsigned __int16 R_s : 1;
    };
    C_ASSERT(sizeof(HALF_FLOAT) == 2);
    SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;
    for(DWORD j = 0; j < height; j++)
    {
        HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
        for(DWORD i = 0; i < width; i++)
        {
            d->R_s = s->R_s;
            d->R_e = s->R_e - 15 + 127;
            d->R_m = s->R_m << (23-10);
            d++;
            s++;
        }
    }
}
Update:
Disassembly
; Listing generated by Microsoft (R) Optimizing Compiler Version 16.00.40219.01
TITLE Utils.cpp
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES
PUBLIC ?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z ; ConvertImageFrom_R16_FLOAT_To_R32_FLOAT
; Function compile flags: /Ogtp
; COMDAT ?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z
_TEXT SEGMENT
_buffer$ = 8 ; size = 4
tv83 = 12 ; size = 4
_convertedData$ = 12 ; size = 4
_width$ = 16 ; size = 4
_height$ = 20 ; size = 4
_rowPitch$ = 24 ; size = 4
?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z PROC ; ConvertImageFrom_R16_FLOAT_To_R32_FLOAT, COMDAT
; 323 : {
push ebp
mov ebp, esp
; 343 : for(DWORD j = 0; j< height; j++)
mov eax, DWORD PTR _height$[ebp]
push esi
mov esi, DWORD PTR _convertedData$[ebp]
test eax, eax
je SHORT $LN4@ConvertIma
; 324 : union SINGLE_FLOAT {
; 325 : struct {
; 326 : unsigned __int32 R_m : 23;
; 327 : unsigned __int32 R_e : 8;
; 328 : unsigned __int32 R_s : 1;
; 329 : };
; 330 : struct {
; 331 : float r;
; 332 : };
; 333 : };
; 334 : C_ASSERT(sizeof(SINGLE_FLOAT) == 4);
; 335 : struct HALF_FLOAT
; 336 : {
; 337 : unsigned __int16 R_m : 10;
; 338 : unsigned __int16 R_e : 5;
; 339 : unsigned __int16 R_s : 1;
; 340 : };
; 341 : C_ASSERT(sizeof(HALF_FLOAT) == 2);
; 342 : SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;
push ebx
mov ebx, DWORD PTR _buffer$[ebp]
push edi
mov DWORD PTR tv83[ebp], eax
$LL13@ConvertIma:
; 344 : {
; 345 : HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
; 346 : for(DWORD i = 0; i< width; i++)
mov edi, DWORD PTR _width$[ebp]
mov edx, ebx
test edi, edi
je SHORT $LN5@ConvertIma
npad 1
$LL3@ConvertIma:
; 347 : {
; 348 : d->R_s = s->R_s;
movzx ecx, WORD PTR [edx]
movzx eax, WORD PTR [edx]
shl ecx, 16 ; 00000010H
xor ecx, DWORD PTR [esi]
shl eax, 16 ; 00000010H
and ecx, 2147483647 ; 7fffffffH
xor ecx, eax
mov DWORD PTR [esi], ecx
; 349 : d->R_e = s->R_e - 15 + 127;
movzx eax, WORD PTR [edx]
shr eax, 10 ; 0000000aH
and eax, 31 ; 0000001fH
add eax, 112 ; 00000070H
shl eax, 23 ; 00000017H
xor eax, ecx
and eax, 2139095040 ; 7f800000H
xor eax, ecx
mov DWORD PTR [esi], eax
; 350 : d->R_m = s->R_m << (23-10);
movzx ecx, WORD PTR [edx]
and ecx, 1023 ; 000003ffH
shl ecx, 13 ; 0000000dH
and eax, -8388608 ; ff800000H
or ecx, eax
mov DWORD PTR [esi], ecx
; 351 : d++;
add esi, 4
; 352 : s++;
add edx, 2
dec edi
jne SHORT $LL3@ConvertIma
$LN5@ConvertIma:
; 343 : for(DWORD j = 0; j< height; j++)
add ebx, DWORD PTR _rowPitch$[ebp]
dec DWORD PTR tv83[ebp]
jne SHORT $LL13@ConvertIma
pop edi
pop ebx
$LN4@ConvertIma:
pop esi
; 353 : }
; 354 : }
; 355 : }
pop ebp
ret 0
?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z ENDP ; ConvertImageFrom_R16_FLOAT_To_R32_FLOAT
_TEXT ENDS
Upvotes: 3
Views: 2628
Reputation: 365267
The x86 F16C instruction-set extension adds hardware support for converting single-precision float vectors to/from vectors of half-precision float.
The format is the same IEEE 754 half-precision binary16 that you describe. I didn't check that the endianness is the same as your struct, but that's easy to fix if needed (with a pshufb).
F16C is supported starting from Intel Ivy Bridge and AMD Piledriver. (It has its own CPUID feature bit, which your code should check for; otherwise fall back to SIMD integer shifts and shuffles.)
The intrinsics for VCVTPS2PH are:
__m128i _mm_cvtps_ph(__m128 m1, const int imm);
__m128i _mm256_cvtps_ph(__m256 m1, const int imm);
The immediate byte is a rounding control. The compiler can use it as a convert-and-store directly to memory (unlike most instructions that can optionally use a memory operand, where it's the source operand that can be memory instead of a register.)
VCVTPH2PS goes the other way, and is just like most other SSE instructions (can be used between registers or as a load).
__m128 _mm_cvtph_ps(__m128i m1);
__m256 _mm256_cvtph_ps(__m128i m1);
F16C is so efficient that you might want to consider leaving your image in half-precision format, and converting on the fly every time you need a vector of data from it. This is great for your cache footprint.
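A minimal sketch of the conversion loop using these intrinsics (the loop and function name are mine, not from the original; it assumes width is a multiple of 4 and a compiler with F16C support, e.g. -mf16c or /arch:AVX2, plus the runtime CPUID check mentioned above):
#include <immintrin.h>

void ConvertRows_F16C(const char* buffer, float* dst,
                      unsigned width, unsigned height, unsigned rowPitch)
{
    for (unsigned j = 0; j < height; j++)
    {
        const char* row = buffer + (size_t)rowPitch * j;
        for (unsigned i = 0; i < width; i += 4)
        {
            // Load 4 packed half floats (64 bits) and widen to 4 floats.
            __m128i h = _mm_loadl_epi64((const __m128i*)(row + i * 2));
            _mm_storeu_ps(dst, _mm_cvtph_ps(h));
            dst += 4;
        }
    }
}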
Upvotes: 7
Reputation:
My suspicion is that this operation will already be bottlenecked on memory access, and making it more efficient (e.g., using SSE) would not make it execute more quickly. However, this is only a suspicion.
Other things to try, assuming x86/x64, might be:
- Don't use d++ and s++, but use d[i] and s[i] on each iteration, as sketched below. (Then of course bump d after each scanline.) Since the elements of d are 4 bytes and those of s are 2, this operation can be folded into the address calculation. (Unfortunately I can't guarantee that this would necessarily make execution more efficient.)
- Count width down to zero. This stops the compiler having to fetch width each time round. Probably more important for x86, because it has so few registers. (If the CPU likes my "d[i] and s[i]" suggestion, you could make width signed, count from width-1 instead, and walk backwards.)

These would all be quicker to try than converting to SSE and would hopefully make it memory-bound, if it isn't already, at which point you can give up.
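A minimal sketch of the indexed version (this reuses the structs from the question; whether it helps is compiler- and CPU-dependent, so measure):
SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;
for(DWORD j = 0; j < height; j++)
{
    HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
    for(DWORD i = 0; i < width; i++) // d[i]/s[i] fold into the address calculation
    {
        d[i].R_s = s[i].R_s;
        d[i].R_e = s[i].R_e - 15 + 127;
        d[i].R_m = s[i].R_m << (23-10);
    }
    d += width; // bump d once per scanline
}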
Finally, if the output is in write-combined memory (e.g., it's a texture or vertex buffer or something accessed over AGP, or PCI Express, or whatever it is PCs have these days) then this could well result in poor performance, depending on what code the compiler has generated for the inner loop. So if that is the case, you may get better results converting each scanline into a local buffer then using memcpy to copy it to its final destination.
Upvotes: 0
Reputation: 57729
Here are some ideas:
- Put the constants into const register variables. Some processors don't like fetching constants from memory; it is awkward and may take many instruction cycles.
- Repeat the statements in the loop and increase the increment (i.e. unroll it). Processors prefer continuous instructions; jumps and branches anger them.
- Use more variables in the loop and declare them as volatile so the compiler doesn't optimize them:
SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;
for(DWORD j = 0; j < height; j++)
{
    // Four pointers into the same row, one element apart, so the inner
    // loop converts four pixels per iteration (assumes width % 4 == 0).
    HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
    HALF_FLOAT* s1 = s + 1;
    HALF_FLOAT* s2 = s + 2;
    HALF_FLOAT* s3 = s + 3;
    SINGLE_FLOAT* d1 = d + 1;
    SINGLE_FLOAT* d2 = d + 2;
    SINGLE_FLOAT* d3 = d + 3;
    for(DWORD i = 0; i < width; i += 4)
    {
        d->R_s = s->R_s;
        d->R_e = s->R_e - 15 + 127;
        d->R_m = s->R_m << (23-10);
        d1->R_s = s1->R_s;
        d1->R_e = s1->R_e - 15 + 127;
        d1->R_m = s1->R_m << (23-10);
        d2->R_s = s2->R_s;
        d2->R_e = s2->R_e - 15 + 127;
        d2->R_m = s2->R_m << (23-10);
        d3->R_s = s3->R_s;
        d3->R_e = s3->R_e - 15 + 127;
        d3->R_m = s3->R_m << (23-10);
        d += 4; d1 += 4; d2 += 4; d3 += 4;
        s += 4; s1 += 4; s2 += 4; s3 += 4;
    }
}
Upvotes: 1
Reputation: 7348
SSE intrinsics seem like an excellent idea. Before you go down that road, you should:
- look at the assembly code generated by the compiler (is there potential for optimization?),
- search your compiler documentation for how to generate SSE code automatically,
- search your software library's documentation (or wherever the 16-bit float type originated) for a function to bulk-convert this type (a conversion to 64-bit floating point could be helpful too). You are very likely not the first person to encounter this problem!

If all that fails, go and try your luck with some SSE intrinsics. To get some idea, here is some SSE code to convert from 32- to 16-bit floating point (you want the reverse).
Besides SSE you should also consider multi-threading and offloading the task to the GPU.
Upvotes: 1
Reputation: 7939
The function is only doing a few small things. It is going to be tough to shave much off the time by optimisation, but as somebody already said, parallelisation has promise.
Check how many cache misses you are getting. If the data is paging in and out, you might be able to speed it up by applying more intelligence to the ordering to minimise cache swaps.
Also consider macro-optimisations. Are there any redundancies in the data computation that might be avoided (e.g. caching old results instead of recomputing them when needed)? Do you really need to convert the whole data set or could you just convert the bits you need? I don't know your application so I'm just guessing wildly here, but there might be scope for that kind of optimisation.
Upvotes: 0
Reputation: 5414
You should be able to reduce this to a single instruction on chips which support the upcoming CVT16 instruction set. According to that Wikipedia article:
The CVT16 instructions allow conversion of floating point vectors between single precision and half precision.
Upvotes: 1
Reputation: 14471
You're processing the data as a two-dimensional array. If you consider how it's laid out in memory, you may be able to process it as a one-dimensional array and save a little overhead by having one loop instead of nested loops.
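A sketch of the flattened version (only valid when the rows are tightly packed, i.e. rowPitch == width * sizeof(HALF_FLOAT); with row padding you still need the nested loops):
HALF_FLOAT* s = (HALF_FLOAT*)buffer;
SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;
DWORD count = width * height; // one loop over all pixels
for(DWORD i = 0; i < count; i++)
{
    d[i].R_s = s[i].R_s;
    d[i].R_e = s[i].R_e - 15 + 127;
    d[i].R_m = s[i].R_m << (23-10);
}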
I'd also compile to assembly code and make sure the compiler optimization worked and it isn't recalculating (15 + 127) hundreds of times.
Upvotes: 1
Reputation: 26124
Accessing bitfields in memory can be really tricky, depending on the architecture, of course.
You might achieve better performance if you made a union of a float and a 32-bit integer, and simply performed all decomposition and composition using local variables. That way the generated code could perform the entire operation using only processor registers.
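A sketch of that idea (the helper name is mine; the bit layout matches the structs from the question, and, like the original, it ignores zeros, denormals, and Inf/NaN):
union FloatBits { float f; unsigned __int32 u; };

float HalfToFloat(unsigned __int16 h) // hypothetical helper
{
    unsigned __int32 s = (h >> 15) & 0x1;   // sign
    unsigned __int32 e = (h >> 10) & 0x1F;  // 5-bit exponent
    unsigned __int32 m = h & 0x3FF;         // 10-bit mantissa

    FloatBits out;
    // Compose the whole 32-bit pattern in a register, then store once.
    out.u = (s << 31) | ((e - 15 + 127) << 23) | (m << 13);
    return out.f;
}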
Upvotes: 2
Reputation: 15400
I don't know about SSE intrinsics, but it would be interesting to see a disassembly of your inner loop. An old-school way (that may not help much, but that would be easy to try out) would be to reduce the number of iterations by doing two inner loops: one that does N (say 32) repeats of the processing (loop count of width/N) and then one to finish the remainder (loop count of width%N)... with those divs and modulos calculated outside the first loop to avoid recalculating them. Apologies if that sounds obvious!
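A sketch of that structure, reusing the structs from the question (N = 32 is an arbitrary choice here):
const DWORD N = 32;        // block size; tune to taste
DWORD blocks = width / N;  // div and mod computed once, outside the loops
DWORD rem = width % N;
for(DWORD b = 0; b < blocks; b++)
{
    for(DWORD k = 0; k < N; k++) // fixed trip count, friendly to unrolling
    {
        d->R_s = s->R_s;
        d->R_e = s->R_e - 15 + 127;
        d->R_m = s->R_m << (23-10);
        d++; s++;
    }
}
for(DWORD k = 0; k < rem; k++) // finish the remainder
{
    d->R_s = s->R_s;
    d->R_e = s->R_e - 15 + 127;
    d->R_m = s->R_m << (23-10);
    d++; s++;
}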
Upvotes: 0
Reputation: 26181
The loops are independent of each other, so you could easily parallelize this code, either by using SIMD or OpenMP. A simple version would be to split the top half and the bottom half of the image between two threads, running concurrently.
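A minimal OpenMP sketch of the row-split idea (this lets OpenMP distribute the rows across threads rather than hard-coding two halves; it reuses the question's structs, needs OpenMP enabled, e.g. /openmp with MSVC, and uses a signed loop variable as older OpenMP versions require):
#pragma omp parallel for
for(int j = 0; j < (int)height; j++)
{
    HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
    SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData + (size_t)j * width;
    for(DWORD i = 0; i < width; i++)
    {
        d[i].R_s = s[i].R_s;
        d[i].R_e = s[i].R_e - 15 + 127;
        d[i].R_m = s[i].R_m << (23-10);
    }
}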
Upvotes: 1