Reputation: 19
This is my .CPP file
#include <iostream>
using namespace std;
extern "C" void KeysAsm(int arr[], int n, int thetha, int rho);
// Keep this and call it from assembler
extern "C"
void crim(int *xp, int *yp) {
int temp = *xp;
*xp = *yp;
*yp = temp+2;
}
// Translate this into Intel assembler
void KeysCpp(int arr[], int n, int thetha, int rho){
for (int i = 0; i < n - 1; i++) {
for (int j = 0; j < n - i - 1; j++) {
if (arr[j] > arr[j + 1]) {
crim(&arr[j], &arr[j + 1]);
}
}
arr[i]= arr[i] + thetha / rho * 2 - 4;
}
}
// Function to print an array
void printArray(int arr[], int size){
int i;
for (i = 0; i < size; i++)
cout << arr[i] << "\n";
cout << endl;
}
int main() {
int gamma1[]{
9,
270,
88,
-12,
456,
80,
45,
123,
427,
999
};
int gamma2[]{
900,
312,
542,
234,
234,
1,
566,
123,
427,
111
};
printf("Array:\n");
printArray(gamma1, 10);
KeysAsm(gamma1, 10, 5, 6);
printf("Array Result Asm:\n");
printArray(gamma1, 10);
KeysCpp(gamma2, 10, 5, 6);
printf("Array Result Cpp:\n");
printArray(gamma2, 10);
}
What I want to do is, convert the KeysCpp function into assembly language and call it from this very .CPP file. I want to keep the crim function as it is in .CPP, while only converting the KeysCpp.
Here is my .ASM file
PUBLIC KeysAsm
includelib kernel32.lib
_DATA SEGMENT
EXTERN crim:PROC
_DATA ENDS
_TEXT SEGMENT
KeysAsm PROC
push rbp
mov rbp, rsp
sub rsp, 40
mov QWORD PTR [rbp-24], rdi
mov DWORD PTR [rbp-28], esi
mov DWORD PTR [rbp-32], edx
mov DWORD PTR [rbp-36], ecx
mov DWORD PTR [rbp-4], 0
jmp L3
L3:
mov eax, DWORD PTR [rbp-28]
sub eax, 1
cmp DWORD PTR [rbp-4], eax
jl L7
L4:
mov eax, DWORD PTR [rbp-28]
sub eax, DWORD PTR [rbp-4]
sub eax, 1
cmp DWORD PTR [rbp-8], eax
jl L6
L5:
add DWORD PTR [rbp-8], 1
L6:
mov eax, DWORD PTR [rbp-8]
cdqe
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rdx
mov edx, DWORD PTR [rax]
mov eax, DWORD PTR [rbp-8]
cdqe
add rax, 1
lea rcx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rcx
mov eax, DWORD PTR [rax]
cmp edx, eax
jle L5
mov eax, DWORD PTR [rbp-8]
cdqe
add rax, 1
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rdx, rax
mov eax, DWORD PTR [rbp-8]
cdqe
lea rcx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rcx
mov rsi, rdx
mov rdi, rax
call crim
L7:
mov DWORD PTR [rbp-8], 0
jmp L4
KeysAsm ENDP
_TEXT ENDS
END
I am using Visual Studio 2017 to run this project.
I am getting next error when I run this code.
Unhandled exception at 0x00007FF74B0E429C in MatrixMultiplication.exe: Stack cookie instrumentation code detected a stack-based buffer overrun. occurred
Upvotes: 0
Views: 302
Reputation: 365981
Your asm looks like it's expecting the x86-64 System V calling convention, with args in RDI, ESI, EDX, ECX. But you said you're compiling with Visual Studio, so the compiler-generated code will use the Windows x64 calling convention: RCX, EDX, R8D, R9D.
And when you call crim
, it can use shadow space (32 bytes above its return address, which you didn't reserve space for).
It looks like you got this asm from un-optimized compiler output, probably from https://godbolt.org/z/ea4MPh81r using GCC for Linux, without using -mabi=ms
to override the default -mabi=sysv
when compiling for non-Windows targets. And then you modified it to make the loop infinite, with a jmp
at the bottom instead of a ret
? Maybe a different GCC version than 12.2 since the label numbers and code don't match exactly.
(The signs of being un-optimized compiler output are all the reloads from [rbp-whatever]
, and redoing sign-extension before using an int
to index an array with cdqe
. A human would know the int
must be non-negative. And being GCC specifically, the numbered label like .L1:
etc. where you just removed the .
, and of heavily using RAX
for as much as possible in a debug build. And choices like lea rdx, [0+rax*4]
to copy-and-shift, and the exact syntax it used to print that instruction in Intel syntax match GCC.)
extern "C" void crim(int *xp, int *yp); // prototype only
void KeysCpp(int arr[], int n, int thetha, int rho){
for (int i = 0; i < n - 1; i++) {
for (int j = 0; j < n - i - 1; j++) {
if (arr[j] > arr[j + 1]) {
crim(&arr[j], &arr[j + 1]);
}
}
arr[i]= arr[i] + thetha / rho * 2 - 4;
}
}
Then on Godbolt, use gcc -O3 -mabi=ms
, or use MSVC which always targets Windows. https://godbolt.org/z/Mj5Gb54b5 shows both GCC and MSVC with optimization enabled.
KeysCpp(int*, int, int, int): ; demangled name
cmp edx, 1
jle .L11 ; "shrink wrap" optimization: early-out on n<=1 before saving regs
push r15 ; save some call-preserved regs
push r14
lea r14, [rcx+4] ; arr + 1
push r13
mov r13, rcx
Unfortunately GCC fails to hoist the thetha / rho * 2 - 4
loop-invariant, instead redoing idiv
every time through the loop. Seems like an obvious optimization since those are local vars whose address hasn't been taken at all, and it keeps thetha
(typo for theta
?) and rho
in registers. So MSVC is much more efficient here. Clang also misses this optimization.
Upvotes: 2