greybeard

Reputation: 2467

non-restoring division: how to avoid code bloat for divisor MSB set?

Revisiting 32-bit integer division on AVR 8-bit processors, I tried to code unsigned non-restoring division. I don't think it looks too bad, but there is one problem:
With a divisor whose highest bit is set, it does not readily produce a quotient of 0 where necessary.

Boiled down to 8-bit division, followed by libgcc code for comparison:

#define bits __zero_reg__
#define R r25
#define D r22
#define NQ r24
    ldi NQ, 25              ; test dividend
    ldi D, 129              ; test divisor with MSB set
__udivmodqi4NR:             ; 19 instructions NQ, R = divmod(NQ, D)
    clr     R
    ldi     bits, 8
    lsl     NQ
_udivmodqi4_pos:
    rol     R               ;   1
    sub     R, D            ;   2 diminish remainder
    brcs    _udivmodqi4_nep ; 4/3 remainder got < 0, was < divisor
    sec                     ;   4 How to obviate this?
_udivmodqi4_ep:
    rol     NQ              ;   5 rotate 1 quotient bit into dendQuot
    dec     bits            ;   6 decrement loop counter
    brne    _udivmodqi4_pos ; 8/7
    ret  
_udivmodqi4_neg:
    rol     R               ;   1
    add     R, D            ;   2 increase remainder
    brcs    _udivmodqi4_ep  ; 4/3 remainder restored to >= 0
_udivmodqi4_nep:
    lsl     NQ              ; 4/5 shift 0 quotient bit into dendQuot
    dec     bits            ; 5/6 decrement loop counter
    brne    _udivmodqi4_neg ; 7/6/8
    add     R, D            ; final restore of remainder
    ret
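
In C, the loop above corresponds to roughly the following model (my sketch, my names; the partial remainder r is widened to a signed 16-bit variable here, while the AVR routine only has 8 bits plus the carry flag for it - which is exactly what the shifted negative remainder overflows once the divisor's MSB is set):

#include <stdint.h>

/* Sketch of unsigned 8-bit non-restoring division. */
uint8_t udivmodqi4_nr(uint8_t n, uint8_t d, uint8_t *rem)
{
    int16_t r = 0;                          /* signed partial remainder */
    uint8_t q = 0;

    for (uint8_t bit = 0; bit < 8; bit++) {
        r = 2 * r + (n >> 7);               /* shift next dividend bit in */
        n <<= 1;
        if (r >= 0)
            r -= d;                         /* diminish remainder */
        else
            r += d;                         /* increase remainder */
        q = (uint8_t)((q << 1) | (r >= 0)); /* quotient bit */
    }
    if (r < 0)
        r += d;                             /* final restore */
    *rem = (uint8_t)r;
    return q;
}
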
; libgcc (or close enough), 12 instructions, worst case not significantly slower:
#define r_rem   r25 /* remainder */
#define r_arg1  r24 /* dividend, quotient */
#define r_arg2  r22 /* divisor */
#define q_cnt   r23 /* loop count */
__udivmodqi4:               ; r24, r25 = divmod(r24, r22)
    sub     r_rem, r_rem    ; clear remainder and carry
    ldi     q_cnt, 8        ; init loop counter
    lsl     r_arg1
__udivmodqi4_loop:
    rol     r_rem           ;   1 shift dividend into remainder
    cp      r_rem, r_arg2   ;   2 compare remainder & divisor
    brcs    __udivmodqi4_ep ; 4/3 remainder < divisor
    sub     r_rem, r_arg2   ;   4 reduce remainder
__udivmodqi4_ep:
    rol     r_arg1          ;   5 shift dividend (with CARRY)
    dec     q_cnt           ;   6 decrement loop counter
    brne    __udivmodqi4_loop;8/7
    com     r_arg1          ;  67 complement result 
                        ; because C flag was complemented in loop
    ret
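
In the same C terms, the libgcc loop is plain conditional-subtract division; the carry-complement trick that spares an instruction in the assembly is left out of this sketch (my names):

#include <stdint.h>

/* Sketch of libgcc's conditional-subtract loop: the divisor is only
   subtracted when it fits, so the remainder never goes negative and
   a divisor with MSB set needs no special case. */
uint8_t udivmodqi4_cs(uint8_t n, uint8_t d, uint8_t *rem)
{
    uint8_t r = 0, q = 0;

    for (uint8_t bit = 0; bit < 8; bit++) {
        /* r never exceeds the dividend prefix consumed so far,
           so this shift cannot lose a bit, even for d >= 128 */
        r = (uint8_t)((r << 1) | (n >> 7));
        n <<= 1;
        q = (uint8_t)(q << 1);
        if (r >= d) {                       /* cp + brcs */
            r -= d;
            q |= 1;
        }
    }
    *rem = r;
    return q;
}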

#define quotient    __tmp_reg__
#define byteCnt     __zero_reg__  /* loop count (0 after the loop!) */
#define dividend    r_arg1HH
zeroOne:; 7(movw)/11(mov) + 2 instructions just to handle divisor MSB == 1
    mov_l   r_remL , r_arg1L    ; remainder = dividend, part one
    mov_h   r_remH , r_arg1H
    mov_l   r_arg1L, r_remHL    ; quotient = 0
    mov_h   r_arg1H, r_remHH
    mov_l   r_remHL , r_arg1HL  ; part two
    mov_h   r_remHH , r_arg1HH
    mov_l   r_arg1HL, r_arg1L
    mov_h   r_arg1HH, r_arg1H
    ldi     byteCnt, 1          ; initialise loop control variables
    ldi     quotient, 0x80      ;  for a single/final iteration
    rjmp    subtract

__udivmodsi4NonRestoring:       ; arg2, arg1 = divmod(arg1, arg2)
    clr     r_remHH
    clr     r_remHL
    cpi     r_arg2HH, 128       ; sbrc r_arg2HH, 7   rjmp
    brsh    zeroOne             ; how to avoid special casing divisor MSB set?
    mov_l   r_remL , r_remHL
    mov_h   r_remH , r_remHH
    ldi     byteCnt, 4
    ldi     quotient, 1         ; takes NBBY(8) rol instructions to appear as carry

_udivmodsi4_pos:
    lsl     dividend            ;  1
    rol     r_remL              ;  2 shift dividend into remainder
    rol     r_remH              ;  3
    rol     r_remHL             ;  4
    rol     r_remHH             ;  5
subtract:
    sub     r_remL, r_arg2L     ;  6 diminish remainder
    sbc     r_remH, r_arg2H     ;  7
    sbc     r_remHL, r_arg2HL   ;  8
    sbc     r_remHH, r_arg2HH   ;  9
    brcs    _udivmodsi4_nep     ; 11/10  remainder got < 0, was < divisor
    sec                         ; 11
_udivmodsi4_ep:
    rol     quotient            ; 12 rotate 1 into quotient
    brcc    _udivmodsi4_pos     ; 14/13
    dec     byteCnt
    breq    bitsDone

byteLoop:
    mov     r_arg1HH, r_arg1HL  ; shift by bytes
    mov     r_arg1HL, r_arg1H
    mov     r_arg1H , r_arg1L
    mov     r_arg1L , quotient
    ldi     quotient, 1
    sbrc    r_arg1L, 0
    rjmp    _udivmodsi4_pos
_udivmodsi4_neg:
    lsl     dividend            ;  1
    rol     r_remL              ;  2 shift dividend into remainder
    rol     r_remH              ;  3
    rol     r_remHL             ;  4
    rol     r_remHH             ;  5
    add     r_remL, r_arg2L     ;  6 raise remainder
    adc     r_remH, r_arg2H     ;  7
    adc     r_remHL, r_arg2HL   ;  8
    adc     r_remHH, r_arg2HH   ;  9
    brcs    _udivmodsi4_ep      ; 11/10 remainder restored to >= 0
_udivmodsi4_nep:
    lsl     quotient            ; 12/11 shift 0 into quotient
    brcc    _udivmodsi4_neg     ; 14/13/12
    dec     byteCnt
    brne    byteLoop
    add     r_remL , r_arg2L    ; restore remainder
    adc     r_remH , r_arg2H
    adc     r_remHL, r_arg2HL
    adc     r_remHH, r_arg2HH
bitsDone:
    mov     r_arg2L , quotient
    mov     r_arg2H , r_arg1L
    mov     r_arg2HL, r_arg1H
    mov     r_arg2HH, r_arg1HL
    mov_l   r_arg1L , r_remL
    mov_h   r_arg1H , r_remH
    mov_l   r_arg1HL, r_remHL
    mov_h   r_arg1HH, r_remHH
    ret                         ; < 500 cycles worst case
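
The loop control of the 32-bit routine can be sketched in C like this (my sketch, my names; a conditional-subtract step stands in for the non-restoring arithmetic, since only the sentinel-bit byte handling is being illustrated):

#include <stdint.h>

uint32_t udivmodsi4_bytewise(uint32_t n, uint32_t d, uint32_t *rem)
{
    uint32_t r = 0;

    for (uint8_t byte = 0; byte < 4; byte++) {
        uint8_t qbyte = 1;              /* sentinel, as in ldi quotient, 1 */
        for (;;) {
            /* r never exceeds the consumed dividend prefix,
               so this shift cannot lose a bit */
            r = (r << 1) | (n >> 31);
            n <<= 1;
            uint8_t qbit = (r >= d);
            if (qbit)
                r -= d;
            uint8_t carry = qbyte >> 7; /* what rol would shift out */
            qbyte = (uint8_t)((qbyte << 1) | qbit);
            if (carry)                  /* sentinel emerged: 8 bits done */
                break;
        }
        n |= qbyte;                     /* finished quotient byte slides into
                                           the space freed by dividend shifts */
    }
    *rem = r;
    return n;                           /* dividend register holds the quotient */
}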

How can I keep the speed of non-restoring division without that many additional instructions for divisors with the MSB set? (As is, their number grows with operand size.)

Upvotes: 3

Views: 135

Answers (1)

greybeard

Reputation: 2467

The (obvious?) non-answer of sorts:
One can avoid the code bloat apparently necessary for one special case in non-restoring division by resorting to conditional-subtract division instead.
See A faster drop-in replacement for libgcc's AVR division below.

The way to avoid a considerable number of additional instructions for a divisor with MSB set is to prevent the process from ever returning to subtraction while processing dividend bits, taking a hit in expected and even in worst-case cycle count.
AVR's skip instruction (sbrs) allows skipping the conditional branch that would return to subtraction. One more conditional fix-up is needed before returning. This overhead does not grow with operand size.

Below is non-restoring division disentangled from the byte-wise dividend & quotient shift. Worst-case execution times are very similar; but due to the duplicated & bigger bit loop, non-restoring takes about 59 instructions without movw where the byte-shifting version takes 39. The combined byte-shifting non-restoring division from the question's code block can be modified the same way, raising its worst-case cycle count back above 500.

__udivmodsi4NonRestoring:       ; arg2, arg1 = divmod(arg1, arg2)
    clr     r_remHH
    clr     r_remHL
    mov_l   r_remL , r_remHL
    mov_h   r_remH , r_remHH
    ldi     r_cnt, 33           ; 32 bits + 1 priming rotation
    rjmp    _udivmodsi4_ep

_udivmodsi4_pos:
    rol     r_remL              ;  1 shift dividend into remainder
    rol     r_remH              ;  2
    rol     r_remHL             ;  3
    rol     r_remHH             ;  4
    sub     r_remL, r_arg2L     ;  5 diminish remainder
    sbc     r_remH, r_arg2H     ;  6
    sbc     r_remHL, r_arg2HL   ;  7
    sbc     r_remHH, r_arg2HH   ;  8
    brcs    zero_bit            ; 10/9  remainder got < 0, was < divisor
    sec                         ; 10
_udivmodsi4_ep:
one_bit:
    rol     r_arg1L             ; 12/11 rotate 1 into quotient
    rol     r_arg1H
    rol     r_arg1HL
    rol     r_arg1HH
    dec     r_cnt
    brne    _udivmodsi4_pos     ; 18/17/16
    rjmp    bitsDone

_udivmodsi4_neg:
    rol     r_remL              ;  1 shift dividend into remainder
    rol     r_remH              ;  2
    rol     r_remHL             ;  3
    rol     r_remHH             ;  4
    add     r_remL, r_arg2L     ;  5 raise remainder
    adc     r_remH, r_arg2H     ;  6
    adc     r_remHL, r_arg2HL   ;  7
    adc     r_remHH, r_arg2HH   ;  8
    sbrs    r_arg2HH, 7         ; 10/9 divisor MSB set: stay in the addition loop
    brcs    one_bit             ; 11/10 remainder restored to >= 0
zero_bit:
    lsl     r_arg1L             ; 11 shift 0 into quotient
    rol     r_arg1H
    rol     r_arg1HL
    rol     r_arg1HH
    dec     r_cnt
    brne    _udivmodsi4_neg     ; 17/16
    cp      r_remHH, r_arg2HH   ; if r_remHH smaller than r_arg2HH
    brcc    restore             ;     there was a carry
    inc     r_arg1L
    rjmp    bitsDone
restore:
    add     r_remL , r_arg2L    ; restore remainder
    adc     r_remH , r_arg2H
    adc     r_remHL, r_arg2HL
    adc     r_remHH, r_arg2HH
bitsDone:
    mov_l   r_arg2L,  r_arg1L   ; quotient
    mov_h   r_arg2H,  r_arg1H
    mov_l   r_arg2HL, r_arg1HL
    mov_h   r_arg2HH, r_arg1HH
    mov_l   r_arg1L , r_remL
    mov_h   r_arg1H , r_remH
    mov_l   r_arg1HL, r_remHL
    mov_h   r_arg1HH, r_remHH
    ret                         ; ~587 cycles worst case
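
In C, the intended control flow reads roughly as below (my sketch, my names; a 64-bit signed partial remainder stands in for the 32-bit register set plus the sign information the assembly reconstructs with the final compare):

#include <stdint.h>

uint32_t udivmodsi4_nr(uint32_t n, uint32_t d, uint32_t *rem)
{
    int64_t r = 0;                      /* wide signed partial remainder */
    uint32_t q = 0;
    int negative = 0;                   /* in the addition loop? */
    const int msb_set = (d >> 31) != 0;

    for (uint8_t bit = 0; bit < 32; bit++) {
        r = 2 * r + (n >> 31);          /* shift dividend bit into remainder */
        n <<= 1;
        if (negative)
            r += d;                     /* raise remainder */
        else
            r -= d;                     /* diminish remainder */
        if (r >= 0 && !(negative && msb_set)) {
            q = (q << 1) | 1;           /* 1 bit, (back) to subtraction */
            negative = 0;
        } else {
            q = q << 1;                 /* 0 bit; with divisor MSB set we
                                           never return to subtraction */
            negative = r < 0 || (negative && msb_set);
        }
    }
    /* one conditional fix-up before returning */
    if (negative && msb_set && r >= 0)
        q += 1;                         /* the suppressed 1 bit surfaces here */
    else if (r < 0)
        r += d;                         /* restore remainder */
    *rem = (uint32_t)r;
    return q;
}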

A faster drop-in replacement for libgcc's AVR division
processing dividend bytes in a nested loop:

#define quotient    __tmp_reg__
#define dividend    __zero_reg__
#undef  r_cnt
#define r_cnt       r_arg1HH
__udivmodsi4B:
; no immediates for r0...r15 can be infuriating, especially with
;  callee save for remaining upper registers (r28,r29 + r16,r17 on non-RC)
    mov     dividend, r_arg1HH
    ldi     r_cnt, 0x40-2       ; init loop counters to 4 bytes of 8 bits*2
    clr     r_remL              ; clear remainder
    clr     r_remH
    mov_l   r_remHL, r_remL
    mov_h   r_remHH, r_remH
byteLoop:
__udivmodsi4_loopB:
; shift dividend into remainder
    lsl     dividend
    rol     r_remL
    rol     r_remH
    rol     r_remHL
    rol     r_remHH
; compare remainder & divisor
    cp      r_remL , r_arg2L
    cpc     r_remH , r_arg2H
    cpc     r_remHL, r_arg2HL
    cpc     r_remHH, r_arg2HH
    brcs    __udivmodsi4_epB; remainder < divisor
; reduce remainder
    sub     r_remL , r_arg2L
    sbc     r_remH , r_arg2H
    sbc     r_remHL, r_arg2HL
    sbc     r_remHH, r_arg2HH
__udivmodsi4_epB:
    rol     quotient            ; (with CARRY)
    subi    r_cnt, 2            ; half carry every 8 iterations
    brhc    __udivmodsi4_loopB
; __zero_reg__ now restored (dividend == 0)
;   (com quotient would set Carry instead of keeping or complementing it)
    mov     dividend, r_arg1HL
    mov     r_arg1HL, r_arg1H
    mov     r_arg1H, r_arg1L
    ser     r_arg1L             ; ldi Rd, 0xFF; available for r16...r31 only
    eor     r_arg1L, quotient   ; touches neither Carry nor Half-Carry
    brcc    byteLoop
; div/mod results to return registers, as for the ldiv() function
    mov_l   r_arg2L,  r_arg1L   ; quotient
    mov_h   r_arg2H,  r_arg1H
    mov_l   r_arg2HL, r_arg1HL
    mov_h   r_arg2HH, dividend  ; the last byte shift put the quotient MSB here

    mov_l   r_arg1L,  r_remL    ; remainder
    mov_h   r_arg1H,  r_remH
    mov_l   r_arg1HL, r_remHL
    mov_h   r_arg1HH, r_remHH
    ret
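
Reduced to a C sketch, the nested loop structure is (my names; the complemented-carry quotient and the combined half-carry loop counter are register-level tricks with no C counterpart, so a plain quotient and two plain counters appear instead):

#include <stdint.h>

uint32_t udivmodsi4_b(uint32_t n, uint32_t d, uint32_t *rem)
{
    uint32_t r = 0, q = 0;

    for (uint8_t byte = 0; byte < 4; byte++) {
        uint8_t dividend = (uint8_t)(n >> 24);  /* one dividend byte */
        n <<= 8;
        for (uint8_t bit = 0; bit < 8; bit++) {
            /* only the single dividend byte shifts, not all four */
            r = (r << 1) | (dividend >> 7);
            dividend <<= 1;
            q <<= 1;
            if (r >= d) {               /* conditional subtract */
                r -= d;
                q |= 1;
            }
        }
    }
    *rem = r;
    return q;
}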

Shifting one byte of the dividend instead of all four saves three cycles in each iteration of the bit loop.
Alas, the compromise I chose to keep strictly to libgcc's size & "binary interface" expectations as well as to the "one binary fits all" philosophy adds one cycle back to the bit loop, reducing the worst-case speed-up from more than 80 cycles to about 53.
(The code ended up one instruction shorter than the original.
 That didn't help me find a way to recover the cycle lost to the lack of ldi for __tmp_reg__ / __zero_reg__ - yet.)

Conceivably, the speed-up will be smaller for a variant handling "partial single" ints,
and such a variant would take two instructions more than __udivmodpsi4.

Byte-wise dividend shifting and non-restoring division are independent, so the variant with slicker loop control (almost 30 cycles less) and non-restoring division look like candidates for the first and second stops on the way to fast but large implementations; combining both, as in the question, is probably the third.

Upvotes: 2
