Reputation: 61

BITswap assembly program

%include "asm_io.inc"
segment .data
outmsg1 db    "Input Integer: ", 0
outmsg2 db    "After Operation": ", 0

segment .bss

input1  resd 1
input2  resd 1

segment .text
    global  asm_main
asm_main:
    enter 0,0
    pusha

    mov eax, outmsg1
    call print_string
    call read_int
    call print_nl
    dump_regs 1

    rol eax, 8
    rol ax, 8
    ror eax,8
    mov ebx, 0
    mov ebx, eax
    dump_regs 2

    popa
    mov eax, 0
    leave
    ret

given the above assembly program which swaps the highest value Byte with the lowest value byte of a given integer. im trying to figure out how to make it swap the highest value BIT with the lowest value BIT.

im somewhat stuck, maybe you can help me out

Upvotes: 2

Answers (5)

Ped7g

Reputation: 16596

Ok, so as there are already different answers, I will make my "comment" official one with some extensions to it:

    rol    eax,1        ; get the top bit down into low 8 bits
    test   al,3         ; now test the two bits, setting parity flag
    jpe    to_ror       ; if "00" or "11", skip the bit swap
    xor    eax,3        ; flip the two lowest bits (top and bottom original position)
to_ror:
    ror    eax,1        ; restore the positions of bits (top back to b31)

This is single conditional jump variant, i.e. probably not performance optimal, but should be reasonably easy to understand and doesn't use any other resource except original eax value and flag register.

Another option is to avoid conditional branch for the price of few more instructions and registers used (but should be still faster in many cases on modern CPU, because the mispredicted branching is usually real hog of CPU resources) (this is basically what OP has come with, and what I mentioned in my original comment as "to extract either each bit separately and recompose back"):

mov   ebx,eax           ; copy the original value into two new regs
mov   ecx,eax
shr   ebx, 31           ; b31 bit into b0 position (others cleared)
shl   ecx, 31           ; b0 bit into b31 position (others cleared)
and   eax, 0x7FFFFFFE   ; clear b0 and b31 in original value
or    eax,ebx           ; combining the swapped bits back into value
or    eax,ecx

Upvotes: 1

tamut

Reputation: 61

i am positively astonished by the amount and detail of the replies that have been posted here. i am very grateful for all the information you have provided, it will take some time to study and understand some of it. - in the meantime i came up with another solution myself. its probably not as effective as your solutions but i still wanted to post it and read about what you think about it.

%include "asm_io.inc"
segment .data
outmsg1 db    "Enter integer: ", 0
outmsg2 db    "Before Operation: ", 0
outmsg3 db    "After Operation: ", 0

segment .bss

input1  resd 1
input2  resd 1

segment .text
    global  asm_main
asm_main:
    enter 0,0
    pusha

    mov eax, outmsg1
    call print_string
    call read_int
    xor esi, esi    
    mov esi, eax

    mov eax, outmsg2
    call print_string
    call print_nl
    mov eax, esi
    dump_regs 1

    mov ebx,eax
    mov ecx,eax

    shr ebx, 31
    shl ecx, 31

    shl eax, 1
    shr eax, 2
    shl eax, 1
    or eax,ebx
    or eax,ecx

    mov ebx,eax
    mov eax, outmsg3
    call print_string
    call print_nl
    dump_regs 2


    popa
    mov eax, 0
    leave
    ret

Upvotes: 2

Peter Cordes

Reputation: 364160

Use a temporary register (other than EFLAGS) to make this lower latency on CPUs without single-cycle adc:

mov    ecx, eax

bswap  eax
shl    eax, 7             ; top bit in place
shr    ax, 7+7            ; bottom bit in place (without disturbing top bit)

and    ecx, 0x7ffffffe    ; could optimize mov+and with BMI1 andn
and    eax, 0x80000001
or     eax, ecx           ; merge the non-moving bits with the swapped bits

On Intel CPUs before Sandybridge, shr ax and then reading EAX will suck (partial register stall).

This looks like 5 cycle latency from input to output, same as the adc/adc version of @Fuz's on CPUs where that's single-cycle latency. (AMD, and Intel since Broadwell). But on Haswell and earlier, this may be better.

We could save the mov using either BMI1 andn with a constant in a register, or maybe BMI2 rorx ecx, eax, 16 to copy-and-swap instead of doing bswap in place. But then the bits are in less convenient places.

@rkhb's idea to check if the bits differ and flip them is good, especially using PF to check for 0 or 2 bits set vs. 1. PF is only set based on the low byte of a result, so we can't just and 0x80000001 without rotating first.

You can do this branchlessly with cmov

; untested, but I think I have the parity correct
rorx    ecx, eax, 31     ; ecx = rotate left by 1.  low 2 bits are the ones we want
xor     edx,edx
test    cl, 3            ; sets PF=1 iff they're the same: even parity
mov     ecx, 0x80000001
cmovpo  edx, ecx         ; edx=0 if bits match, 0x80000001 if they need swapping
xor     eax, edx

With single-uop cmov (Broadwell and later, or AMD), this is 4 cycle latency. The xor-zeroing and mov-immediate are off the critical path. The mov-immediate can be hoisted out of a loop, if you use a register other than ECX.

Or with setcc, but it's worse (more uops), or tied on CPUs with 2-uop cmov:

; untested, I might have the parity reversed.
rorx    ecx, eax, 31     ; ecx = rotate left by 1.  low 2 bits are the ones we want
xor     edx,edx
and     ecx, 3           ; sets PF=1 iff they're the same: even parity
setpe   dl
dec     edx              ; 0 or -1
and     edx, 0x80000001
xor     eax, edx

Upvotes: 2

rkhb

Reputation: 14399

You only have to toggle the both bits, if they differ. You don't need to do anything, if the bits both are set or both are cleared:

%include "asm_io.inc"
segment .text
    global  asm_main
asm_main:
    enter 0,0
    pusha

    ; Test values, comment it as needed
;   mov eax, 0x00123400         ; Bit0 and Bit31 are cleared
    mov eax, 0x00123401         ; Bit0 is set, Bit 31 is cleared
;   mov eax, 0x80123400         ; Bit0 is cleared, Bit31 is set
;   mov eax, 0x80123401         ; Bit0 and Bit31 are set

    dump_regs 1

    bt eax, 0                   ; Copy the least significant bit into CF
    setc cl                     ; Copy CF into register CL
    bt eax, 31                  ; Copy the most significant bit into CF
    setc ch                     ; Copy CF into register CH
    cmp cl, ch                  ; Compare the bits
    je skip                     ; No operation, if they don't differ
    btc eax, 0                  ; Toggle the least significant bit
    btc eax, 31                 ; Toggle the most significant bit
    skip:

    dump_regs 2

    popa
    mov eax, 0
    leave
    ret

Another idea is to use TEST and to operate according to the flags - advantage: you don't need an additional register:

%include "asm_io.inc"

segment .text
    global  asm_main
asm_main:
    enter 0,0
    pusha

    ; Test values, comment it as needed
;   mov eax, 0x00123400         ; ZF PF
    mov eax, 0x00123401         ; -  -
;   mov eax, 0x80123400         ; SF PF
;   mov eax, 0x80123401         ; SF


    test eax, 0x80000001

    dump_regs 1

    jns j1
    jnp skip
    j1:
    jz skip

    doit:                       ; Touch the bits if (SF & PF) or (!SF & !PF)
    btc eax, 0                  ; Toggle the least significant bit
    btc eax, 31                 ; Toggle the most significant bit
    skip:

    dump_regs 2

    popa
    mov eax, 0
    leave
    ret

Upvotes: 2

fuz

Reputation: 92986

How about this? The input is in ecx or any other register you like.

                   // initial state: ECX = A...B
ror ecx            // ECX = BA..., CF = B
bt ecx, 30         // ECX = BA..., CF = A
rcl ecx, 2         // ECX = ...AB, CF = A
ror ecx            // ECX = B...A

As Peter Cordes indicated, if you are optimizing for performance you might want to change the code like this:

ror ecx
bt ecx, 30
adc ecx, ecx
adc ecx, ecx
ror ecx

This is because adc r,r performs better than rcl r,imm or rcl r on modern microarchitectures.

Upvotes: 3

BITswap assembly program

Answers (5)

Related Questions