Reputation: 61
%include "asm_io.inc"
segment .data
outmsg1 db "Input Integer: ", 0
outmsg2 db "After Operation": ", 0
segment .bss
input1 resd 1
input2 resd 1
segment .text
global asm_main
asm_main:
enter 0,0
pusha
mov eax, outmsg1
call print_string
call read_int
call print_nl
dump_regs 1
rol eax, 8
rol ax, 8
ror eax,8
mov ebx, 0
mov ebx, eax
dump_regs 2
popa
mov eax, 0
leave
ret
given the above assembly program which swaps the highest value Byte with the lowest value byte of a given integer. im trying to figure out how to make it swap the highest value BIT with the lowest value BIT.
im somewhat stuck, maybe you can help me out
Upvotes: 2
Views: 1099
Reputation: 16596
Ok, so as there are already different answers, I will make my "comment" official one with some extensions to it:
rol eax,1 ; get the top bit down into low 8 bits
test al,3 ; now test the two bits, setting parity flag
jpe to_ror ; if "00" or "11", skip the bit swap
xor eax,3 ; flip the two lowest bits (top and bottom original position)
to_ror:
ror eax,1 ; restore the positions of bits (top back to b31)
This is single conditional jump variant, i.e. probably not performance optimal, but should be reasonably easy to understand and doesn't use any other resource except original eax
value and flag register.
Another option is to avoid conditional branch for the price of few more instructions and registers used (but should be still faster in many cases on modern CPU, because the mispredicted branching is usually real hog of CPU resources) (this is basically what OP has come with, and what I mentioned in my original comment as "to extract either each bit separately and recompose back"):
mov ebx,eax ; copy the original value into two new regs
mov ecx,eax
shr ebx, 31 ; b31 bit into b0 position (others cleared)
shl ecx, 31 ; b0 bit into b31 position (others cleared)
and eax, 0x7FFFFFFE ; clear b0 and b31 in original value
or eax,ebx ; combining the swapped bits back into value
or eax,ecx
Upvotes: 1
Reputation: 61
i am positively astonished by the amount and detail of the replies that have been posted here. i am very grateful for all the information you have provided, it will take some time to study and understand some of it. - in the meantime i came up with another solution myself. its probably not as effective as your solutions but i still wanted to post it and read about what you think about it.
%include "asm_io.inc"
segment .data
outmsg1 db "Enter integer: ", 0
outmsg2 db "Before Operation: ", 0
outmsg3 db "After Operation: ", 0
segment .bss
input1 resd 1
input2 resd 1
segment .text
global asm_main
asm_main:
enter 0,0
pusha
mov eax, outmsg1
call print_string
call read_int
xor esi, esi
mov esi, eax
mov eax, outmsg2
call print_string
call print_nl
mov eax, esi
dump_regs 1
mov ebx,eax
mov ecx,eax
shr ebx, 31
shl ecx, 31
shl eax, 1
shr eax, 2
shl eax, 1
or eax,ebx
or eax,ecx
mov ebx,eax
mov eax, outmsg3
call print_string
call print_nl
dump_regs 2
popa
mov eax, 0
leave
ret
Upvotes: 2
Reputation: 364160
Use a temporary register (other than EFLAGS) to make this lower latency on CPUs without single-cycle adc
:
mov ecx, eax
bswap eax
shl eax, 7 ; top bit in place
shr ax, 7+7 ; bottom bit in place (without disturbing top bit)
and ecx, 0x7ffffffe ; could optimize mov+and with BMI1 andn
and eax, 0x80000001
or eax, ecx ; merge the non-moving bits with the swapped bits
On Intel CPUs before Sandybridge, shr ax
and then reading EAX will suck (partial register stall).
This looks like 5 cycle latency from input to output, same as the adc
/adc
version of @Fuz's on CPUs where that's single-cycle latency. (AMD, and Intel since Broadwell). But on Haswell and earlier, this may be better.
We could save the mov
using either BMI1 andn
with a constant in a register, or maybe BMI2 rorx ecx, eax, 16
to copy-and-swap instead of doing bswap
in place. But then the bits are in less convenient places.
@rkhb's idea to check if the bits differ and flip them is good, especially using PF to check for 0 or 2 bits set vs. 1. PF is only set based on the low byte of a result, so we can't just and 0x80000001
without rotating first.
You can do this branchlessly with cmov
; untested, but I think I have the parity correct
rorx ecx, eax, 31 ; ecx = rotate left by 1. low 2 bits are the ones we want
xor edx,edx
test cl, 3 ; sets PF=1 iff they're the same: even parity
mov ecx, 0x80000001
cmovpo edx, ecx ; edx=0 if bits match, 0x80000001 if they need swapping
xor eax, edx
With single-uop cmov
(Broadwell and later, or AMD), this is 4 cycle latency. The xor-zeroing and mov-immediate are off the critical path. The mov-immediate can be hoisted out of a loop, if you use a register other than ECX.
Or with setcc
, but it's worse (more uops), or tied on CPUs with 2-uop cmov
:
; untested, I might have the parity reversed.
rorx ecx, eax, 31 ; ecx = rotate left by 1. low 2 bits are the ones we want
xor edx,edx
and ecx, 3 ; sets PF=1 iff they're the same: even parity
setpe dl
dec edx ; 0 or -1
and edx, 0x80000001
xor eax, edx
Upvotes: 2
Reputation: 14399
You only have to toggle the both bits, if they differ. You don't need to do anything, if the bits both are set or both are cleared:
%include "asm_io.inc"
segment .text
global asm_main
asm_main:
enter 0,0
pusha
; Test values, comment it as needed
; mov eax, 0x00123400 ; Bit0 and Bit31 are cleared
mov eax, 0x00123401 ; Bit0 is set, Bit 31 is cleared
; mov eax, 0x80123400 ; Bit0 is cleared, Bit31 is set
; mov eax, 0x80123401 ; Bit0 and Bit31 are set
dump_regs 1
bt eax, 0 ; Copy the least significant bit into CF
setc cl ; Copy CF into register CL
bt eax, 31 ; Copy the most significant bit into CF
setc ch ; Copy CF into register CH
cmp cl, ch ; Compare the bits
je skip ; No operation, if they don't differ
btc eax, 0 ; Toggle the least significant bit
btc eax, 31 ; Toggle the most significant bit
skip:
dump_regs 2
popa
mov eax, 0
leave
ret
Another idea is to use TEST
and to operate according to the flags - advantage: you don't need an additional register:
%include "asm_io.inc"
segment .text
global asm_main
asm_main:
enter 0,0
pusha
; Test values, comment it as needed
; mov eax, 0x00123400 ; ZF PF
mov eax, 0x00123401 ; - -
; mov eax, 0x80123400 ; SF PF
; mov eax, 0x80123401 ; SF
test eax, 0x80000001
dump_regs 1
jns j1
jnp skip
j1:
jz skip
doit: ; Touch the bits if (SF & PF) or (!SF & !PF)
btc eax, 0 ; Toggle the least significant bit
btc eax, 31 ; Toggle the most significant bit
skip:
dump_regs 2
popa
mov eax, 0
leave
ret
Upvotes: 2
Reputation: 92986
How about this? The input is in ecx
or any other register you like.
// initial state: ECX = A...B
ror ecx // ECX = BA..., CF = B
bt ecx, 30 // ECX = BA..., CF = A
rcl ecx, 2 // ECX = ...AB, CF = A
ror ecx // ECX = B...A
As Peter Cordes indicated, if you are optimizing for performance you might want to change the code like this:
ror ecx
bt ecx, 30
adc ecx, ecx
adc ecx, ecx
ror ecx
This is because adc r,r
performs better than rcl r,imm
or rcl r
on modern microarchitectures.
Upvotes: 3