Reputation: 33678
I have some code in an ISR. The code is given for completeness; the question is only about the commented-out `_asm` block.
Without the `_asm` block, this is compiled into 82 instructions. With the `_asm` block, the result is 107 instructions long. Why the big difference?
Here's the C code:
if (PIR1bits.SSPIF)
{
spi_rec_buffer.read_cursor = 0;
spi_rec_buffer.write_cursor = 0;
LATAbits.LATA4 ^= 1;
// _asm nop nop _endasm
LATAbits.LATA4 ^= 1;
while (!PORTAbits.NOT_SS && spi_rec_buffer.write_cursor < spi_rec_buffer.size)
{
spi_rec_buffer.data[spi_rec_buffer.write_cursor] = SSPBUF;
SSPBUF = spi_out_msg_buffer.data[spi_out_msg_buffer.read_cursor];
PIR1bits.SSPIF = 0;
spi_rec_buffer.write_cursor++;
spi_out_msg_buffer.read_cursor++;
if (spi_out_msg_buffer.read_cursor == spi_out_msg_buffer.write_cursor)
LATAbits.LATA4 = 0;
LATBbits.LATB1 = 1;
while (!PORTAbits.NOT_SS && !PIR1bits.SSPIF);
LATBbits.LATB1 = 0;
}
spi_message_locked = true;
spi_message_received = true;
}
Without NOPs:
BTFSS 0x9e,0x3,0x0 if (PIR1bits.SSPIF)
BRA 0x2ba
{
MOVLB 0xf spi_rec_buffer.read_cursor = 0;
CLRF 0x4,0x1
CLRF 0x5,0x1
CLRF 0x6,0x1 spi_rec_buffer.write_cursor = 0;
CLRF 0x7,0x1
BTG 0x89,0x4,0x0 LATAbits.LATA4 ^= 1;
BTG 0x89,0x4,0x0 LATAbits.LATA4 ^= 1;
MOVF 0x80,0x0,0x0 while (!PORTAbits.NOT_SS && spi_rec_buffer.write_cursor < spi_rec_buffer.size)
ANDLW 0x20
BNZ 0x2b0
MOVLB 0xf
MOVF 0x7,0x0,0x1
XORWF 0x3,0x0,0x1
BTFSS 0xe8,0x7,0x0
BRA 0x254
RLCF 0x3,0x0,0x1
BRA 0x25c
MOVF 0x2,0x0,0x1
SUBWF 0x6,0x0,0x1
MOVF 0x3,0x0,0x1
SUBWFB 0x7,0x0,0x1
BC 0x2b0
BRA 0x240
{
MOVF 0x0,0x0,0x1 spi_rec_buffer.data[spi_rec_buffer.write_cursor] = SSPBUF;
ADDWF 0x6,0x0,0x1
MOVWF 0xe9,0x0
MOVF 0x1,0x0,0x1
ADDWFC 0x7,0x0,0x1
MOVWF 0xea,0x0
MOVFF 0xfc9,0xfef
MOVLB 0xf SSPBUF = spi_out_msg_buffer.data[spi_out_msg_buffer.read_cursor];
MOVF 0x10,0x0,0x1
ADDWF 0x14,0x0,0x1
MOVWF 0xe9,0x0
MOVF 0x11,0x0,0x1
ADDWFC 0x15,0x0,0x1
MOVWF 0xea,0x0
MOVF 0xef,0x0,0x0
MOVWF 0xc9,0x0
BCF 0x9e,0x3,0x0 PIR1bits.SSPIF = 0;
MOVLB 0xf spi_rec_buffer.write_cursor++;
INCF 0x6,0x1,0x1
MOVLW 0x0
ADDWFC 0x7,0x1,0x1
MOVLB 0xf spi_out_msg_buffer.read_cursor++;
INCF 0x14,0x1,0x1
ADDWFC 0x15,0x1,0x1
MOVF 0x16,0x0,0x1 if (spi_out_msg_buffer.read_cursor == spi_out_msg_buffer.write_cursor)
XORWF 0x14,0x0,0x1
BNZ 0x29e
MOVF 0x17,0x0,0x1
XORWF 0x15,0x0,0x1
BNZ 0x29e
BCF 0x89,0x4,0x0 LATAbits.LATA4 = 0;
BSF 0x8a,0x1,0x0 LATBbits.LATB1 = 1;
MOVF 0x80,0x0,0x0 while (!PORTAbits.NOT_SS && !PIR1bits.SSPIF);
ANDLW 0x20
BNZ 0x2ac
MOVF 0x9e,0x0,0x0
ANDLW 0x8
BZ 0x2a0
BCF 0x8a,0x1,0x0 LATBbits.LATB1 = 0;
}
MOVLB 0xf spi_message_locked = true;
MOVLW 0x1
MOVWF 0x18,0x1
MOVLB 0xf spi_message_received = true;
MOVWF 0x19,0x1
}
MOVLW 0x4 }
SUBWF 0xe1,0x0,0x0
BC 0x2c4
CLRF 0xe1,0x0
MOVF 0xe5,0x1,0x0
MOVWF 0xe1,0x0
MOVF 0xe5,0x1,0x0
MOVFF 0xfe7,0xfd9
MOVF 0xe5,0x1,0x0
MOVFF 0xfe5,0xfea
MOVFF 0xfe5,0xfe9
MOVFF 0xfe5,0xfda
RETFIE 0x1
With NOPs:
BTFSS 0x9e,0x3,0x0 if (PIR1bits.SSPIF)
BRA 0x30e
{
MOVLB 0xf spi_rec_buffer.read_cursor = 0;
CLRF 0x4,0x1
CLRF 0x5,0x1
MOVLB 0xf spi_rec_buffer.write_cursor = 0;
CLRF 0x6,0x1
CLRF 0x7,0x1
BTG 0x89,0x4,0x0 LATAbits.LATA4 ^= 1;
NOP _asm nop nop _endasm
NOP
BTG 0x89,0x4,0x0 LATAbits.LATA4 ^= 1;
MOVF 0x80,0x0,0x0 while (!PORTAbits.NOT_SS && spi_rec_buffer.write_cursor < spi_rec_buffer.size)
ANDLW 0x20
BNZ 0x302
MOVLB 0xf
MOVF 0x7,0x0,0x1
MOVLB 0xf
XORWF 0x3,0x0,0x1
BTFSS 0xe8,0x7,0x0
BRA 0x27e
RLCF 0x3,0x0,0x1
BRA 0x28c
MOVF 0x2,0x0,0x1
MOVLB 0xf
SUBWF 0x6,0x0,0x1
MOVLB 0xf
MOVF 0x3,0x0,0x1
MOVLB 0xf
SUBWFB 0x7,0x0,0x1
BC 0x302
BRA 0x268
{
MOVLB 0xf spi_rec_buffer.data[spi_rec_buffer.write_cursor] = SSPBUF;
MOVLB 0xf
MOVF 0x0,0x0,0x1
MOVLB 0xf
ADDWF 0x6,0x0,0x1
MOVWF 0xe9,0x0
MOVLB 0xf
MOVLB 0xf
MOVF 0x1,0x0,0x1
MOVLB 0xf
ADDWFC 0x7,0x0,0x1
MOVWF 0xea,0x0
MOVFF 0xfc9,0xfef
MOVLB 0xf SSPBUF = spi_out_msg_buffer.data[spi_out_msg_buffer.read_cursor];
MOVLB 0xf
MOVF 0x10,0x0,0x1
MOVLB 0xf
ADDWF 0x14,0x0,0x1
MOVWF 0xe9,0x0
MOVLB 0xf
MOVLB 0xf
MOVF 0x11,0x0,0x1
MOVLB 0xf
ADDWFC 0x15,0x0,0x1
MOVWF 0xea,0x0
MOVF 0xef,0x0,0x0
MOVWF 0xc9,0x0
BCF 0x9e,0x3,0x0 PIR1bits.SSPIF = 0; // Interruptflag löschen...
MOVLB 0xf spi_rec_buffer.write_cursor++;
INCF 0x6,0x1,0x1
MOVLW 0x0
ADDWFC 0x7,0x1,0x1
MOVLB 0xf spi_out_msg_buffer.read_cursor++;
INCF 0x14,0x1,0x1
MOVLW 0x0
ADDWFC 0x15,0x1,0x1
MOVLB 0xf if (spi_out_msg_buffer.read_cursor == spi_out_msg_buffer.write_cursor)
MOVF 0x16,0x0,0x1
MOVLB 0xf
XORWF 0x14,0x0,0x1
BNZ 0x2ea
MOVLB 0xf
MOVF 0x17,0x0,0x1
MOVLB 0xf
XORWF 0x15,0x0,0x1
BNZ 0x2ee
BCF 0x89,0x4,0x0 LATAbits.LATA4 = 0;
BSF 0x8a,0x1,0x0 LATBbits.LATB1 = 1;
MOVF 0x80,0x0,0x0 while (!PORTAbits.NOT_SS && !PIR1bits.SSPIF);
ANDLW 0x20
BNZ 0x2fe
MOVF 0x9e,0x0,0x0
ANDLW 0x8
BNZ 0x2fe
BRA 0x2f0
BCF 0x8a,0x1,0x0 LATBbits.LATB1 = 0;
}
MOVLB 0xf spi_message_locked = true;
MOVLW 0x1
MOVWF 0x18,0x1
MOVLB 0xf spi_message_received = true;
MOVLW 0x1
MOVWF 0x19,0x1
}
MOVLW 0x4 }
SUBWF 0xe1,0x0,0x0
BC 0x318
CLRF 0xe1,0x0
MOVF 0xe5,0x1,0x0
MOVWF 0xe1,0x0
MOVF 0xe5,0x1,0x0
MOVFF 0xfe7,0xfd9
MOVF 0xe5,0x1,0x0
MOVFF 0xfe5,0xfea
MOVFF 0xfe5,0xfe9
MOVFF 0xfe5,0xfda
RETFIE 0x1
Here's a screenshot of a partial diff (click to enlarge):
Upvotes: 12
Views: 2780
Reputation: 146093
It seems that the compiler emits the MOVLB instructions before any access to "banked RAM".
The optimizer takes the extra ones out. (And some other stuff.)
The optimizer doesn't run when you have inline assembly.
So adding that inline block is the same thing as turning off optimization.
Upvotes: 3
Reputation: 78923
Your compiler seems to have a relatively poor extension for including assembler. Basically, it gives the compiler no hint at all about which registers you are using or perhaps modifying. To produce consistent code, the assembly that the compiler generates then has to be significantly different: it has to re-initialize all its registers to known values.
Other compilers, e.g. gcc, have an `asm` extension that lets you be more specific on these things. In particular, you have effective ways of telling the compiler which memory and registers are affected by your assembler code. For them, such a `NOP` instruction would introduce not much more than an "optimization barrier".
Upvotes: 1
Reputation: 340218
So that people don't have to guess, here's a statement from the Microchip C18 manual (emphasis added):
It is generally recommended to limit the use of inline assembly to a minimum. Any functions containing inline assembly will not be optimized by the compiler. To write large fragments of assembly code, use the MPASM assembler and link the modules to the C modules using the MPLINK linker.
I think that this is a common situation with inline asm. GCC is an exception - it will optimize the inline assembly along with the surrounding C code; in order to do this correctly, GCC's inline assembly is quite complex (you have to let it know which registers and memory are clobbered).
Upvotes: 10
Reputation: 109159
As MRAB has mentioned in his answer, this is likely an optimization issue. Try moving the assembly instructions into their own function.
A function call is probably going to add more overhead than 2 `NOP`s, so you can try messing with the function once you've figured out whether it makes a difference. For instance, try declaring the function `inline`, or write the function as a C-callable assembly function (assuming this is possible with your compiler).
Upvotes: 0
Reputation: 20664
I suspect that it's to do with optimisation.
The compiler sees that you're inserting a chunk of assembly language, it doesn't know what effect it will have, so it just acts more cautiously.
Upvotes: 2