Reputation: 2467
Revisiting 32-bit integer division on AVR 8-bit processors, I tried to code unsigned non-restoring division. I don't think it looks too bad, but there is one problem:
With a divisor whose highest bit is set, it does not readily produce a quotient of 0 where necessary.
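For reference, a plain C sketch of the algorithm (of the algorithm only, not of the register-level code below): the remainder is widened to 16 bits so its sign is always representable; the trouble in the 8-bit registers arises exactly when the divisor's MSB is set. The test values match the ldi lines further down; 25 / 129 should give quotient 0, remainder 25.
#include <stdint.h>
#include <stdio.h>

/* Widened C model of 8-bit unsigned non-restoring division (illustration
 * only): the remainder r lives in int16_t, so going negative and coming
 * back is trivial here, while the AVR code has to track that sign with
 * the carry flag in an 8-bit register. */
static void udivmod8_nonrestoring(uint8_t n, uint8_t d, uint8_t *q, uint8_t *rem)
{
    int16_t r = 0;
    uint8_t quot = 0;

    for (int i = 7; i >= 0; i--) {
        int bit = (n >> i) & 1;
        if (r >= 0)
            r = 2 * r + bit - d;   /* subtract while the remainder is non-negative */
        else
            r = 2 * r + bit + d;   /* add back while the remainder is negative */
        quot = (uint8_t)((quot << 1) | (r >= 0));   /* quotient bit = new sign */
    }
    if (r < 0)                     /* final fix-up */
        r += d;

    *q = quot;
    *rem = (uint8_t)r;
}

int main(void)
{
    uint8_t q, r;
    udivmod8_nonrestoring(25, 129, &q, &r);   /* divisor MSB set: expect 0 rem 25 */
    printf("25 / 129 = %u rem %u\n", (unsigned)q, (unsigned)r);
    return 0;
}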
Here is the assembly, boiled down to 8-bit division and followed by some libgcc code:
#define bits __zero_reg__
#define R r25
#define D r22
#define NQ r24
ldi NQ, 25 ; test dividend
ldi D, 129 ; test divisor with MSB set
__udivmodqi4NR: ; 19 instructions NQ, R = divmod(NQ, D)
clr R
ldi bits, 8
lsl NQ
_udivmodqi4_pos:
rol R ; 1
sub R, D ; 2 diminish remainder
brcs _udivmodqi4_nep ; 4/3 remainder got < 0, was < divisor
sec ; 4 How to obviate this?
_udivmodqi4_ep:
rol NQ ; 5 rotate 1 quotient bit into dendQuot
dec bits ; 6 decrement loop counter
brne _udivmodqi4_pos ; 8/7
ret
_udivmodqi4_neg:
rol R ; 1
add R, D ; 2 increase remainder
brcs _udivmodqi4_ep ; 4/3 remainder restored to >= 0
_udivmodqi4_nep:
lsl NQ ; 4/5 shift 0 quotient bit into dendQuot
dec bits ; 5/6 decrement loop counter
brne _udivmodqi4_neg ; 7/6/8
add R, D
ret
sleep
; libgcc (or close enough), 12 instructions, worst case not significantly slower:
#define r_rem r25 /* remainder */
#define r_arg1 r24 /* dividend, quotient */
#define r_arg2 r22 /* divisor */
#define q_cnt r23 /* loop count */
__udivmodqi4: ; r24, r25 = divmod(r24, r22)
sub r_rem, r_rem ; clear remainder and carry
ldi q_cnt, 8 ; init loop counter
lsl r_arg1
__udivmodqi4_loop:
rol r_rem ; 1 shift dividend into remainder
cp r_rem, r_arg2 ; 2 compare remainder & divisor
brcs __udivmodqi4_ep ; 4/3 remainder <= divisor
sub r_rem, r_arg2 ; 4 restore remainder
__udivmodqi4_ep:
rol r_arg1 ; 5 shift dividend (with CARRY)
dec q_cnt ; 6 decrement loop counter
brne __udivmodqi4_loop;8/7
com r_arg1 ; 67 complement result
; because C flag was complemented in loop
ret
sleep
#define quotient __tmp_reg__
#define byteCnt __zero_reg__ /* loop count (0 after the loop!) */
#define dividend r_arg1HH
zeroOne: ; 7(movw)/11(mov) + 2 instructions just to handle divisor MSB == 1
mov_l r_remL , r_arg1L ; remainder = dividend, part one
mov_h r_remH , r_arg1H
mov_l r_arg1L, r_remHL ; quotient = 0
mov_h r_arg1H, r_remHH
mov_l r_remHL , r_arg1HL ; part two
mov_h r_remHH , r_arg1HH
mov_l r_arg1HL, r_arg1L
mov_h r_arg1HH, r_arg1H
ldi byteCnt, 1 ; initialise loop control variables
ldi quotient, 0x80 ; for a single/final iteration
rjmp subtract
__udivmodsi4NonRestoring: ; arg2, arg1 = divmod(arg1, arg2)
clr r_remHH
clr r_remHL
cpi r_arg2HH, 128 ; sbrc r_arg2HH, 7 rjmp
brsh zeroOne ; how to avoid special casing divisor MSB set?
mov_l r_remL , r_remHL
mov_h r_remH , r_remHH
ldi byteCnt, 4
ldi quotient, 1 ; takes NBBY(8) rol instructions to appear as carry
_udivmodsi4_pos:
lsl dividend ; 1
rol r_remL ; 2 shift dividend into remainder
rol r_remH ; 3
rol r_remHL ; 4
rol r_remHH ; 5
subtract:
sub r_remL, r_arg2L ; 6 diminish remainder
sbc r_remH, r_arg2H ; 7
sbc r_remHL, r_arg2HL ; 8
sbc r_remHH, r_arg2HH ; 9
brcs _udivmodsi4_nep ; 11/10 remainder got < 0, was < divisor
sec ; 11
_udivmodsi4_ep:
rol quotient ; 12 rotate 1 into quotient
brcc _udivmodsi4_pos ; 14/13
dec byteCnt
breq bitsDone
byteLoop:
mov r_arg1HH, r_arg1HL ; shift by bytes
mov r_arg1HL, r_arg1H
mov r_arg1H , r_arg1L
mov r_arg1L , quotient
ldi quotient, 1
sbrc r_arg1L, 0
rjmp _udivmodsi4_pos
_udivmodsi4_neg:
lsl dividend ; 1
rol r_remL ; 2 shift dividend into remainder
rol r_remH ; 3
rol r_remHL ; 4
rol r_remHH ; 5
add r_remL, r_arg2L ; 6 raise remainder
adc r_remH, r_arg2H ; 7
adc r_remHL, r_arg2HL ; 8
adc r_remHH, r_arg2HH ; 9
brcs _udivmodsi4_ep ; 11/10 remainder restored to >= 0
_udivmodsi4_nep:
lsl quotient ; 12/11 shift 0 into quotient
brcc _udivmodsi4_neg ; 14/13/12
dec byteCnt
brne byteLoop
add r_remL , r_arg2L ; restore remainder
adc r_remH , r_arg2H
adc r_remHL, r_arg2HL
adc r_remHH, r_arg2HH
bitsDone:
mov r_arg2L , quotient
mov r_arg2H , r_arg1L
mov r_arg2HL, r_arg1H
mov r_arg2HH, r_arg1HL
mov_l r_arg1L , r_remL
mov_h r_arg1H , r_remH
mov_l r_arg1HL, r_remHL
mov_h r_arg1HH, r_remHH
ret ; < 500 cycles worst case
sleep
How can I keep the speed of non-restoring division without that many additional instructions for divisors with the MSB set? (As is, their number increases with operand size.)
Upvotes: 3
Views: 135
Reputation: 2467
The (obvious?) non‑answer of sorts:
One can avoid the code bloat apparently necessary for one special case in non-restoring division by resorting to conditional-subtraction division (as in libgcc) instead.
See "A faster drop-in replacement for libgcc's AVR division" below the horizontal line.
The way to avoid a considerable number of additional instructions for a divisor with its MSB set is to simply prevent the process from ever returning to subtraction while processing dividend bits, taking a hit in expected and even in worst-case cycle count.
AVR's bit-test skip instruction (sbrs) allows skipping the conditional branch that would return to subtraction. One more conditional fix-up is needed before the return. This overhead does not grow with operand size.
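A C sketch of the idea (a model only; the names and the widened remainder are for illustration, not taken from the code below): since a divisor with its top bit set can only produce a quotient of 0 or 1, the bit loop may simply refuse to switch back to subtracting once the remainder has gone negative, and a single check before returning decides between quotient 0 and 1 and fixes up the remainder.
#include <stdint.h>

/* C model of 32-bit unsigned non-restoring division with the
 * "never return to subtraction when the divisor's MSB is set" variant.
 * The remainder is widened to int64_t purely for clarity. */
static void udivmod32_nr(uint32_t n, uint32_t d, uint32_t *q, uint32_t *rem)
{
    int64_t r = 0;
    uint32_t quot = 0;
    const int stick_to_add = (d & 0x80000000u) != 0;  /* divisor MSB set? */
    int adding = 0;                                   /* currently in the add path? */

    for (int i = 31; i >= 0; i--) {
        int64_t bit = (n >> i) & 1;
        r = adding ? 2 * r + bit + d : 2 * r + bit - d;
        if (r >= 0 && !(stick_to_add && adding)) {
            quot = (quot << 1) | 1;   /* stay in / return to the subtract path */
            adding = 0;
        } else {
            quot <<= 1;               /* shift in a 0 bit, take the add path */
            adding = 1;
        }
    }
    if (adding) {
        /* Finished in the add path: either the remainder is still negative
         * and needs one final add of d, or, in the MSB-set case where we
         * refused to switch back, it already equals n - d >= 0 and the
         * quotient's last bit has to become 1 instead. */
        if (r < 0)
            r += d;
        else
            quot |= 1;
    }
    *q = quot;
    *rem = (uint32_t)r;
}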
Below is non-restoring division disentangled from the dividend/quotient shift by bytes. Worst-case execution times are very similar; but due to the duplicated & bigger bit loop, the non-restoring version takes about 59 instructions without movw where the byte-shift version takes 39. The combined byte-shifting non-restoring division from the question's code block can be modified the same way, raising its worst-case cycle count back above 500.
__udivmodsi4NonRestoring: ; arg2, arg1 = divmod(arg1, arg2)
clr r_remHH
clr r_remHL
mov_l r_remL , r_remHL
mov_h r_remH , r_remHH
ldi r_cnt, 33
rjmp _udivmodsi4_ep
_udivmodsi4_pos:
rol r_remL ; 1 shift dividend into remainder
rol r_remH ; 2
rol r_remHL ; 3
rol r_remHH ; 4
sub r_remL, r_arg2L ; 5 diminish remainder
sbc r_remH, r_arg2H ; 6
sbc r_remHL, r_arg2HL ; 7
sbc r_remHH, r_arg2HH ; 8
brcs zero_bit ; 10/9 remainder got < 0, was < divisor
sec ; 10
_udivmodsi4_ep:
one_bit:
rol r_arg1L ; 12/11 rotate 1 into quotient
rol r_arg1H
rol r_arg1HL
rol r_arg1HH
dec r_cnt
brne _udivmodsi4_pos ; 18/17/16
rjmp bitsDone
_udivmodsi4_neg:
rol r_remL ; 1 shift dividend into remainder
rol r_remH ; 2
rol r_remHL ; 3
rol r_remHH ; 4
add r_remL, r_arg2L ; 5 raise remainder
adc r_remH, r_arg2H ; 6
adc r_remHL, r_arg2HL ; 7
adc r_remHH, r_arg2HH ; 8
sbrs r_arg2HH, 7 ; 10/9
brcs one_bit ; 11/10 remainder restored to >= 0
zero_bit:
lsl r_arg1L ; 11 shift 0 into quotient
rol r_arg1H
rol r_arg1HL
rol r_arg1HH
dec r_cnt
brne _udivmodsi4_neg ; 17/16
cp r_remHH, r_arg2HH ; if r_remHH smaller than r_arg2HH
brcc restore ; there was a carry
inc r_arg1L
rjmp bitsDone
restore:
add r_remL , r_arg2L ; restore remainder
adc r_remH , r_arg2H
adc r_remHL, r_arg2HL
adc r_remHH, r_arg2HH
bitsDone:
mov_l r_arg2L, r_arg1L ; quotient
mov_h r_arg2H, r_arg1H
mov_l r_arg2HL, r_arg1HL
mov_h r_arg2HH, r_arg1HH
mov_l r_arg1L , r_remL
mov_h r_arg1H , r_remH
mov_l r_arg1HL, r_remHL
mov_h r_arg1HH, r_remHH
ret ; ~587 cycles worst case
sleep
---

A faster drop-in replacement for libgcc's AVR division
processing dividend bytes in a nested loop:
#define quotient __tmp_reg__
#define dividend __zero_reg__
#undef r_cnt
#define r_cnt r_arg1HH
__udivmodsi4B:
; no immediates for r0...r15 can be infuriating, especially with
; callee save for remaining upper registers (r28,r29 + r16,r17 on non-RC)
mov dividend, r_arg1HH
ldi r_cnt, 0x40-2 ; init loop counters to 4 bytes of 8 bits*2
clr r_remL
clr r_remH ; clear remainder
mov_l r_remHL, r_remL
mov_h r_remHH, r_remH
byteLoop:
__udivmodsi4_loopB:
; shift dividend into remainder
lsl dividend
rol r_remL
rol r_remH
rol r_remHL
rol r_remHH
; compare remainder & divisor
cp r_remL, r_arg2L
cpc r_remH, r_arg2H
cpc r_remHL, r_arg2HL
cpc r_remHH, r_arg2HH
brcs __udivmodsi4_epB; remainder <= divisor
; reduce remainder
sub r_remL, r_arg2L
sbc r_remH, r_arg2H
sbc r_remHL, r_arg2HL
sbc r_remHH, r_arg2HH
__udivmodsi4_epB:
rol quotient ; (with CARRY)
subi r_cnt, 2 ; half carry every 8 iterations
brhc __udivmodsi4_loopB
; __zero_reg__ now restored (dividend == 0)
; com quotient sets Carry instead of keeping or complementing it
mov dividend, r_arg1HL
mov r_arg1HL, r_arg1H
mov r_arg1H, r_arg1L
ser r_arg1L ; ldi rd, ~0; r16...r31, only
eor r_arg1L, quotient ; touches neither Carry nor Half-Carry
brcc byteLoop
; div/mod results to return registers, as for the ldiv() function
mov_l r_arg2L, r_arg1L ; quotient
mov_h r_arg2H, r_arg1H
mov_l r_arg2HL, r_arg1HL
mov_h r_arg2HH, dividend ; the last bytes shift put the quotient MSB here
mov_l r_arg1L, r_remL ; remainder
mov_h r_arg1H, r_remH
mov_l r_arg1HL, r_remHL
mov_h r_arg1HH, r_remHH
ret
sleep
Shifting one byte of the dividend instead of all four saves three cycles in each iteration of the bit loop.
Alas, the compromise I chose in order to stay strictly within libgcc's size & "binary interface" expectations, as well as the "one binary fits all" philosophy, adds one cycle back to the bit loop, reducing the worst-case speed-up from > 80 cycles to about 53.
(The code ended up one instruction shorter than the original. That didn't help me find a way to recover the cycle lost to the lack of ldi for __tmp_reg__/__zero_reg__ - yet.)
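To make the structure easier to see, here is a rough C model of the nested byte/bit loop (structure only; the carry and half-carry loop-control tricks of the AVR version have no direct C equivalent, and the names are mine): one dividend byte at a time is shifted into the remainder, the bit step is the usual compare-and-conditionally-subtract, and each finished quotient byte takes the place of the dividend byte just consumed.
#include <stdint.h>

/* Illustrative C model of byte-wise 32-bit unsigned division:
 * outer loop over the 4 dividend bytes (most significant first),
 * inner loop over the 8 bits of the current byte. */
static void udivmod32_bytewise(uint32_t n, uint32_t d, uint32_t *q, uint32_t *rem)
{
    uint32_t r = 0;
    uint8_t chain[4] = {                    /* dividend bytes, MSB first */
        (uint8_t)(n >> 24), (uint8_t)(n >> 16),
        (uint8_t)(n >> 8),  (uint8_t)n
    };

    for (int byte = 0; byte < 4; byte++) {
        uint8_t dividend = chain[byte];     /* only this byte gets shifted */
        uint8_t quotient = 0;
        for (int bit = 0; bit < 8; bit++) {
            r = (r << 1) | (uint32_t)(dividend >> 7);  /* shift one dividend bit into the remainder */
            dividend = (uint8_t)(dividend << 1);
            quotient = (uint8_t)(quotient << 1);
            if (r >= d) {                   /* compare & conditionally subtract */
                r -= d;
                quotient |= 1;
            }
        }
        chain[byte] = quotient;             /* quotient byte replaces the consumed dividend byte */
    }

    *q = ((uint32_t)chain[0] << 24) | ((uint32_t)chain[1] << 16)
       | ((uint32_t)chain[2] << 8)  |  (uint32_t)chain[3];
    *rem = r;
}
In the listing above, the chain[] shuffle corresponds roughly to the mov sequence plus the ser/eor step, which also undoes the complemented-carry trick.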
Conceivably, the speed-up will be smaller for a variant handling "partial single" ints, and such a variant would take two instructions more than __udivmodpsi4.
Since shifting dividend bytes and non-restoring division are independent, the variant with slicker loop control (almost 30 cycles less) and non-restoring division look like candidates for the first and second stops on the way to fast but large implementations; combining both, as in the question, is probably the third.
Upvotes: 2