Reputation:
I have a number of sample processing primitives like:
function Add8(A, B: Byte): Byte; {$IFDEF CODEINLINING}inline;{$ENDIF}
begin
Result := A + B;
end;
function Sub16(A, B: Word): Word; {$IFDEF CODEINLINING}inline;{$ENDIF}
begin
Result := A - B;
end;
{ et cetera }
These functions are workhorses of the data processing and to be invoked for every input sample (millions of them). By design Result
type has to be the same size as arguments (operands).
The problem arises then the result of operation exceed the defined range of Low(Result)..High(Result)
, truncating most significant bits and effectively making result incorrect. For example:
Adding low value to the peak value Add8(240, 22)
gives eliminates the peak, I'd better have 255
.
For subtracting two values of near baseline level Sub16(32000, 33000)
I'd better have 0
.
My questions are: how to have such operations to clamp a result values to the range in the performance-wise manner? Is there a generalized solution for all arithmetics and all base types (8 bit, 16 bit, unsigned, signed)?
Upvotes: 2
Views: 1450
Reputation: 2930
If you'd care for a Generics-based comparer (probably not optimal since it resorts to using TComparer)
unit Zoomicon.Generics.Functors;
type
TF = class
class function Clamp<T>(const Value: T; const MinValue, MaxValue: T): T; inline; //EnsureRange
//...
end;
implementation
uses System.Generics.Defaults; //for TComparer
class function TF.Clamp<T>(const Value: T; const MinValue, MaxValue: T): T;
begin
result := Value;
var Comparer := TComparer<T>.Default; //TODO: document if throws exception if T is something not comparable and/or for "nil" value or bounds
if Comparer.Compare(result, MinValue) < 0 then //if (Value < MinValue)
result := MinValue
else if Comparer.Compare(result, MaxValue) > 0 then //if (Value > MaxValue)
result := MaxValue;
end;
//...
end.
How to use:
Value := TF.Clamp(Value, Min, Max);
Upvotes: 0
Reputation: 167
If the values to be clamped are just above 255 or below zero, you can use a lookup table. Such lookup table approaches are very fast as long as the tables fit inside L1 cache for commonly occuring values. And the byte_clamp_lut+256 is translated by compiler into single memory reference into the middle of the table. I don't know if Pascal/Delphi allows macros, but in C/C++ that is done like that:
#define byte_clamp(v) ((uint8_t*)(byte_clamp_lut+256))[v]
static uint8_t byte_clamp768[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,
256,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
};
Upvotes: 0
Reputation: 80197
Because you deals with large-size data processing, I'd recommend to try some assembler - MMX, SSE2 commands are specially intended for such tasks. For example, paddusb instruction can add 16 pairs of bytes at once with saturation (clamps results in byte range). (Don't forget about proper aligning of data chunks)
Example (not thoroughly tested) for 32-bit compiler. It works 9x faster than pascal-version for treatment of 256M-arrays (604 vs 5100 ms with 10 repeats). Note that for reasonable data size pascal version is pretty fast as well.
program Project1;
{$APPTYPE CONSOLE}
uses SysUtils;
procedure AddBytesSat(const A, B, Res: PByteArray; Len: Integer);
//adds byte arrays Res[i] := A[i] + B[i] with saturation
//arrays should be aligned to 16-byte border, length divisible by 16
//three parameters in eax, edx, ecx registers, fourth on the stack
asm
push esi
mov esi, ecx // save Res pointer
mov ecx, Len
shr ecx, 4 // Len div 16
@@start:
movdqa xmm0, [eax] //copies 16 bytes (aligned) to sse register
paddusb xmm0, [edx] // adds 16 unsigned values with saturation
movdqa [esi], xmm0 // move result bytes back to memory
add eax, 16 // move array pointers
add edx, 16
add esi, 16
loop @@start //go to next iteration
pop esi
end;
var
A, B, C: PByteArray;
i: integer;
begin
//ensure that memory manager returns properly aligned blocks
SetMinimumBlockAlignment(System.mba16Byte);
GetMem(A, 32);
GetMem(B, 32);
GetMem(C, 32);
for i := 0 to 31 do begin
A[i] := 8 * i;
B[i] := 200;
end;
AddBytesSat(A, B, C, 32);
//clamping demonstration
for i := 0 to 15 do
Writeln(C[i]);
Readln;
end.
Upvotes: 5