Reputation: 912
I've written quite a big library for matrix operations for Delphi and FPC. There exists now an extension for this library for the Intel AVX extension but I could only manage to get that compiled in FPC. My idea was to create .o files in FPC which contains the AVX assembler codes and include these files in Delphi. I tried to follow this question here: Linking FPC .o files into Delphi
but without success. I was able to dump the function names and tried to import these in the Delphi unit. The problem is that I always get an error saying that the .o files is in the wrong format.
I use CodeTyphoon for compilation which internally uses FPC 3.1.1 and Delphi2010 as a first try.
The code is once compiled in FPC and one time in Delphi using the approriate ifdefs.
My base code looks like this (just an excerpt):
// ###################################################################
// #### This file is part of the mathematics library project, and is
// #### offered under the licence agreement described on
// #### http://www.mrsoft.org/
// ####
// #### Copyright:(c) 2011, Michael R. . All rights reserved.
// ####
// #### Unless required by applicable law or agreed to in writing, software
// #### distributed under the License is distributed on an "AS IS" BASIS,
// #### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// #### See the License for the specific language governing permissions and
// #### limitations under the License.
// ###################################################################
unit AVXMatrixMultOperations;
interface
{$IFDEF CPUX64}
{$DEFINE x64}
{$ENDIF}
{$IFDEF cpux86_64}
{$DEFINE x64}
{$ENDIF}
{$IFNDEF x64}
uses MatrixConst;
{$IFNDEF FPC}
// this fails -> wrong object format
{$L '.\AVXPrecompiled\win32\AVXMatrixMultOperations.o'}
{$ENDIF}
// full matrix operations
procedure AVXMatrixMultAligned(dest : PDouble; const destLineWidth : TASMNativeInt; mt1, mt2 : PDouble; width1, height1, width2, height2 : TASMNativeInt; const LineWidth1, LineWidth2 : TASMNativeInt);
{$IFNDEF FPC} external '' name 'AVXMATRIXMULTOPERATIONS_$$_AVXMATRIXMULTALIGNED$crc2A67AB04'; {$ENDIF}
{$ENDIF}
implementation
{$IFDEF FPC} {$ASMMODE intel} {$ENDIF}
{$IFNDEF x64}
{$IFDEF FPC}
procedure AVXMatrixMultAligned(dest : PDouble; const destLineWidth : TASMNativeInt; mt1, mt2 : PDouble; width1, height1, width2, height2 : TASMNativeInt; const LineWidth1, LineWidth2 : TASMNativeInt);
var bytesWidth2, destOffset : TASMNativeInt;
iter : TASMNativeInt;
{$IFDEF FPC}
begin
{$ENDIF}
asm
// prolog - simulate stack
push ebx;
push edi;
push esi;
mov ecx, dest;
mov edi, width1;
imul edi, -8;
mov iter, edi;
sub mt1, edi;
//destOffset := destLineWidth - Width2*sizeof(double);
mov ebx, Width2;
shl ebx, 3;
mov eax, destLineWidth;
sub eax, ebx;
mov destOffset, eax;
//bytesWidth2 := width2*sizeof(double);
mov bytesWidth2, ebx;
// for y := 0 to height1 - 1 do
@@foryloop:
// r12 -> counter to width2
mov esi, width2;
sub esi, 2;
jl @LastXColumn;
@@forxloop:
// for x := 0 to width2 div 2 - 1
// esi: mt1 - width1*sizeof(double)
// mt2: mt2
mov edx, mt1;
mov ebx, mt2;
mov eax, iter;
mov edi, LineWidth2;
vxorpd ymm0, ymm0, ymm0;
vxorpd ymm1, ymm1, ymm1;
cmp eax, -32;
jg @@Innerloop2Begin;
// for z := 0 to width1 - 1do
// AVX part:
@@InnerLoop1:
// 4x4 block
vmovapd xmm2, [ebx];
add ebx, edi;
vmovapd xmm4, xmm2;
vmovapd xmm3, [ebx];
add ebx, edi;
// shuffle so we can multiply
// swap such that we can immediately multiply
vmovlhps xmm2, xmm2, xmm3;
vmovhlps xmm3, xmm3, xmm4;
// next 4 elements
vmovapd xmm4, [ebx];
add ebx, edi;
vmovapd xmm6, xmm4;
vmovapd xmm5, [ebx];
add ebx, edi;
vmovapd ymm7, [edx + eax]
vmovlhps xmm4, xmm4, xmm5;
vmovhlps xmm5, xmm5, xmm6;
vinsertf128 ymm2, ymm2, xmm4, 1;
vinsertf128 ymm3, ymm3, xmm5, 1;
// now multiply and add
vmulpd ymm2, ymm2, ymm7;
vmulpd ymm3, ymm3, ymm7;
vaddpd ymm0, ymm0, ymm2;
vaddpd ymm1, ymm1, ymm3;
add eax, 32;
jl @@InnerLoop1;
vextractf128 xmm2, ymm0, 1;
vextractf128 xmm3, ymm1, 1;
vhaddpd xmm0, xmm0, xmm2;
vhaddpd xmm1, xmm1, xmm3;
test eax, eax;
jz @@InnerLoopEnd2;
@@Innerloop2Begin:
// rest in single elements
@@InnerLoop2:
vmovapd xmm2, [ebx];
add ebx, edi;
vmovddup xmm3, [edx + eax];
vmulpd xmm2, xmm2, xmm3;
vmovhlps xmm4, xmm4, xmm2;
vaddsd xmm0, xmm0, xmm2;
vaddsd xmm1, xmm1, xmm4;
add eax, 8;
jnz @@InnerLoop2;
@@InnerLoopEnd2:
// finall horizontal addition
vhaddpd xmm0, xmm0, xmm1;
vmovapd [ecx], xmm0;
// increment the pointers
// inc(mt2), inc(dest);
//add dword ptr [mt2], 8;
add mt2, 16;
add ecx, 16;
// end for x := 0 to width2 div 2 - 1
sub esi, 2;
jge @@forxloop;
@LastXColumn:
cmp esi, -1;
jne @NextLine;
// last column of mt2
mov eax, iter;
mov ebx, mt2;
vxorpd xmm0, xmm0, xmm0;
@InnerLoop2:
vmovsd xmm1, [edx + eax];
vmovsd xmm2, [ebx];
vmulsd xmm1, xmm1, xmm2;
vaddsd xmm0, xmm0, xmm1;
add ebx, edi;
add eax, 8;
jnz @InnerLoop2;
vmovsd [ecx], xmm0;
add ecx, 8;
add mt2, 8;
@NextLine:
// dec(mt2, Width2);
// inc(PByte(mt1), LineWidth1);
// inc(PByte(dest), destOffset);
//mov ebx, bytesWidth2;
//sub dword ptr [mt2], ebx;
mov eax, bytesWidth2;
sub mt2, eax;
mov eax, LineWidth1;
add mt1, eax;
add ecx, destOffset;
// end for y := 0 to height1 - 1
//dec eax;
dec height1;
jnz @@foryloop;
// epilog
vzeroupper;
pop esi;
pop edi;
pop ebx;
end;
{$IFDEF FPC}
end;
{$ENDIF}
{$ENDIF}
{$ENDIF}
end.
Upvotes: 1
Views: 317
Reputation: 43033
Since there is a single function involved here, the easiest is IMHO to convert the FPC AVXMatrixMultOperations.o file directly.
Use the great Object file converter tool.
You may try to convert from one binary format to another, accepted by Delphi.
But I guess that the cleanest way is to convert it to asm:
objconv -fasm AVXMatrixMultOperations.o
It will create a AVXMatrixMultOperations.asm
file, which could be used to replace the unknown AVX instructions by simple db ..,..,..,..
bytes. Typically, the generated .asm
file has the assembler on the left side, and the raw hexadecimal bytes on the right side.
This is how I dealt with old Delphi compilers in my libraries, for instance:
function crc32csse42(crc: cardinal; buf: PAnsiChar; len: cardinal): cardinal;
asm // eax=crc, edx=buf, ecx=len
not eax
test ecx, ecx
jz @0
test edx, edx
jz @0
@3: test edx, 3
jz @8 // align to 4 bytes boundary
{$ifdef ISDELPHI2010}
crc32 eax, byte ptr[edx]
{$else}
db $F2, $0F, $38, $F0, $02
{$endif}
inc edx
....
So in your case, something like
{$ifdef FPC}
vinsertf128 ymm2, ymm2, xmm4, 1;
vinsertf128 ymm3, ymm3, xmm5, 1;
{$else}
db $xx,$yy,$zz
db $xx,$yy,$zz
{$endif}
Upvotes: 2