This implements an assembler drop-in replacement for the 64-bit division/remainder routines.
The original libgcc implementation is extremely resource-hungry because the worker code is inlined in several places and DImode is expensive, anyway.  With the patch, the sizes (accumulated over all modules with the same name) are:

    Col #2 = Original libgcc (C code)
    Col #3 = New implementation in asm
    Col #4 = Change (relative)
    Col #5 = Change (absolute)

    _udivmod64.o          :       0     1362   100000.0%      1362
    _negdi2.o             :    1296      288      -77.8%     -1008
    _umoddi3.o            :   22326        0     -100.0%    -22326
    _udivdi3.o            :   26878      246      -99.1%    -26632
    _moddi3.o             :   28986        0     -100.0%    -28986
    _divdi3.o             :   33076      840      -97.5%    -32236
    :::::: Total :::::::  :  362188   252362      -30.3%   -109826

The detailed size report:

    avr3/libgcc.a!_udivmod64.o   :       0      174   100000.0%       174
    avr31/libgcc.a!_udivmod64.o  :       0      174   100000.0%       174
    avr6/libgcc.a!_udivmod64.o   :       0      162   100000.0%       162
    avr5/libgcc.a!_udivmod64.o   :       0      162   100000.0%       162
    avr35/libgcc.a!_udivmod64.o  :       0      162   100000.0%       162
    avr51/libgcc.a!_udivmod64.o  :       0      162   100000.0%       162
    avr25/libgcc.a!_udivmod64.o  :       0      126   100000.0%       126
    avr4/libgcc.a!_udivmod64.o   :       0      126   100000.0%       126
    libgcc.a!_udivmod64.o        :       0      114   100000.0%       114
    avr4/libgcc.a!_negdi2.o      :     140       32      -77.1%      -108
    avr25/libgcc.a!_negdi2.o     :     140       32      -77.1%      -108
    avr5/libgcc.a!_negdi2.o      :     144       32      -77.8%      -112
    avr6/libgcc.a!_negdi2.o      :     144       32      -77.8%      -112
    libgcc.a!_negdi2.o           :     144       32      -77.8%      -112
    avr35/libgcc.a!_negdi2.o     :     144       32      -77.8%      -112
    avr51/libgcc.a!_negdi2.o     :     144       32      -77.8%      -112
    avr31/libgcc.a!_negdi2.o     :     148       32      -78.4%      -116
    avr3/libgcc.a!_negdi2.o      :     148       32      -78.4%      -116
    avr4/libgcc.a!_umoddi3.o     :    2304        0     -100.0%     -2304
    avr6/libgcc.a!_umoddi3.o     :    2360        0     -100.0%     -2360
    avr51/libgcc.a!_umoddi3.o    :    2360        0     -100.0%     -2360
    avr5/libgcc.a!_umoddi3.o     :    2360        0     -100.0%     -2360
    avr25/libgcc.a!_umoddi3.o    :    2364        0     -100.0%     -2364
    avr35/libgcc.a!_umoddi3.o    :    2420        0     -100.0%     -2420
    libgcc.a!_umoddi3.o          :    2682        0     -100.0%     -2682
    avr3/libgcc.a!_umoddi3.o     :    2738        0     -100.0%     -2738
    avr31/libgcc.a!_umoddi3.o    :    2738        0     -100.0%     -2738
    avr4/libgcc.a!_udivdi3.o     :    2784       26      -99.1%     -2758
    avr25/libgcc.a!_udivdi3.o    :    2828       26      -99.1%     -2802
    avr5/libgcc.a!_udivdi3.o     :    2852       28      -99.0%     -2824
    avr6/libgcc.a!_udivdi3.o     :    2852       28      -99.0%     -2824
    avr51/libgcc.a!_udivdi3.o    :    2852       28      -99.0%     -2824
    avr35/libgcc.a!_udivdi3.o    :    2896       28      -99.0%     -2868
    avr4/libgcc.a!_moddi3.o      :    3016        0     -100.0%     -3016
    avr5/libgcc.a!_moddi3.o      :    3072        0     -100.0%     -3072
    avr6/libgcc.a!_moddi3.o      :    3072        0     -100.0%     -3072
    avr51/libgcc.a!_moddi3.o     :    3072        0     -100.0%     -3072
    avr25/libgcc.a!_moddi3.o     :    3124        0     -100.0%     -3124
    avr35/libgcc.a!_moddi3.o     :    3180        0     -100.0%     -3180
    libgcc.a!_udivdi3.o          :    3226       26      -99.2%     -3200
    avr31/libgcc.a!_udivdi3.o    :    3294       28      -99.1%     -3266
    avr3/libgcc.a!_udivdi3.o     :    3294       28      -99.1%     -3266
    avr4/libgcc.a!_divdi3.o      :    3396       86      -97.5%     -3310
    avr5/libgcc.a!_divdi3.o      :    3464       98      -97.2%     -3366
    avr51/libgcc.a!_divdi3.o     :    3464       98      -97.2%     -3366
    avr6/libgcc.a!_divdi3.o      :    3464       98      -97.2%     -3366
    libgcc.a!_moddi3.o           :    3446        0     -100.0%     -3446
    avr25/libgcc.a!_divdi3.o     :    3578       86      -97.6%     -3492
    avr31/libgcc.a!_moddi3.o     :    3502        0     -100.0%     -3502
    avr3/libgcc.a!_moddi3.o      :    3502        0     -100.0%     -3502
    avr35/libgcc.a!_divdi3.o     :    3646       98      -97.3%     -3548
    libgcc.a!_divdi3.o           :    3976       78      -98.0%     -3898
    avr31/libgcc.a!_divdi3.o     :    4044      100      -97.5%     -3944
    avr3/libgcc.a!_divdi3.o      :    4044       98      -97.6%     -3946
    :::::: Total :::::::        :   362188   252362      -30.3%   -109826

The implementation is basically the same as the division/modulo routines already present in lib1funcs.S.  However, the algorithm does not compute the quotient as a complement by recycling the carry bit; instead, the quotient bits are set directly.  That way, n instructions can be saved when dealing with n-byte values.
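To illustrate the approach (this is not part of the patch, and the function name udivmod64_model is made up), here is a rough C model of the plain shift-and-subtract loop that __udivmod64 performs for SPEED_DIV = 0; the SPEED_DIV variants merely skip leading 8-bit or 32-bit chunks before entering this loop:

    #include <stdint.h>

    /* Rough C model of the bitwise restoring division in __udivmod64.
       The dividend is shifted left bit by bit into the remainder; whenever
       the divisor fits, it is subtracted and a 1-bit is placed at the low
       end of the dividend register, from where it travels up and finally
       forms the quotient.  */
    static uint64_t
    udivmod64_model (uint64_t a, uint64_t b, uint64_t *rem)
    {
      uint64_t r = 0;                   /* remainder, expanded bit by bit */

      for (int i = 0; i < 64; i++)
        {
          r = (r << 1) | (a >> 63);     /* shift dividend left into remainder */
          a <<= 1;
          if (r >= b)                   /* divisor fits into remainder ...     */
            {
              r -= b;                   /* ... subtract it ...                 */
              a |= 1;                   /* ... and set the upcoming quotient bit */
            }
        }

      *rem = r;
      return a;                         /* dividend register now holds the quotient */
    }

Setting the quotient bit directly (a |= 1) is what avoids the final per-byte complement of the quotient that the carry-recycling scheme needs.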
The implementation provides a speed-up of the algorithm for the case when there is enough flash; the assumption is that arithmetic speed matters on such devices.  As you can see above, the size of __udivmod64 varies from 114 to 174 bytes:

    114      = small devices without MOVW  (no speed-up:    SPEED_DIV = 0)
    126      = small devices with MOVW     (small speed-up: SPEED_DIV = 16)
    162, 174 = devices >= 16k              (best speed-up:  SPEED_DIV = 8)

The testsuite passed without regressions.  Moreover, the algorithm was tested individually against the old implementation; the only difference I observed was for divisor = 0.

Ok for trunk?

Johann

libgcc/
        PR target/49313
        * config/avr/t-avr (LIB2FUNCS_EXCLUDE): Add _moddi3, _umoddi3.
        (LIB1ASMFUNCS): Add _divdi3, _udivdi3, _udivmod64, _negdi2.
        * config/avr/lib1funcs.S (wmov): New assembler macro.
        (__umoddi3, __udivdi3, __udivdi3_umoddi3): New functions.
        (__moddi3, __divdi3, __divdi3_moddi3): New functions.
        (__udivmod64): New function.
        (__negdi2): New function.
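For reference (not part of the patch): 64-bit division and modulo lower to the libcalls touched here, so a small C translation unit like the following pulls in the new routines; the function names are arbitrary.

    #include <stdint.h>

    uint64_t udiv64 (uint64_t a, uint64_t b) { return a / b; }   /* calls __udivdi3 */
    uint64_t umod64 (uint64_t a, uint64_t b) { return a % b; }   /* calls __umoddi3 */
    int64_t  sdiv64 (int64_t a, int64_t b)   { return a / b; }   /* calls __divdi3  */
    int64_t  smod64 (int64_t a, int64_t b)   { return a % b; }   /* calls __moddi3  */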
Index: libgcc/config/avr/lib1funcs.S
===================================================================
--- libgcc/config/avr/lib1funcs.S   (revision 181482)
+++ libgcc/config/avr/lib1funcs.S   (working copy)
@@ -61,6 +61,15 @@ see the files COPYING3 and COPYING.RUNTI
 #endif
 .endm
 
+.macro wmov r_dest, r_src
+#if defined (__AVR_HAVE_MOVW__)
+    movw \r_dest,   \r_src
+#else
+    mov \r_dest,    \r_src
+    mov \r_dest+1,  \r_src+1
+#endif
+.endm
+
 #if defined (__AVR_HAVE_JMP_CALL__)
 #define XCALL call
 #define XJMP  jmp
@@ -846,6 +855,352 @@ __divmodsi4_exit:
 ENDF __divmodsi4
 #endif /* defined (L_divmodsi4) */
 
+
+/*******************************************************
+       Division 64 / 64
+       Modulo   64 % 64
+*******************************************************/
+
+;; Use Speed-optimized Version on "big" Devices, i.e. Devices with
+;; at least 16k of Program Memory.  For smaller Devices, depend
+;; on MOVW.
+
+#if defined (__AVR_HAVE_JMP_CALL__)
+#   define SPEED_DIV 8
+#elif defined (__AVR_HAVE_MOVW__)
+#   define SPEED_DIV 16
+#else
+#   define SPEED_DIV 0
+#endif
+
+;; A[0..7]: In: Dividend;
+;;          Out: Quotient  (T = 0)
+;;          Out: Remainder (T = 1)
+#define A0  18
+#define A1  A0+1
+#define A2  A0+2
+#define A3  A0+3
+#define A4  A0+4
+#define A5  A0+5
+#define A6  A0+6
+#define A7  A0+7
+
+;; B[0..7]: In: Divisor;   Out: Clobber
+#define B0  10
+#define B1  B0+1
+#define B2  B0+2
+#define B3  B0+3
+#define B4  B0+4
+#define B5  B0+5
+#define B6  B0+6
+#define B7  B0+7
+
+;; C[0..7]: Expand remainder;  Out: Remainder (unused)
+#define C0  8
+#define C1  C0+1
+#define C2  30
+#define C3  C2+1
+#define C4  28
+#define C5  C4+1
+#define C6  26
+#define C7  C6+1
+
+;; Holds Signs during Division Routine
+#define SS __tmp_reg__
+
+;; Bit-Counter in Division Routine
+#define R_cnt __zero_reg__
+
+;; Scratch Register for Negation
+#define NN r31
+
+#if defined (L_udivdi3)
+
+;; R25:R18 = R25:R18  umod  R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __umoddi3
+    set
+    rjmp __udivdi3_umoddi3
+ENDF __umoddi3
+
+;; R25:R18 = R25:R18  udiv  R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __udivdi3
+    clt
+ENDF __udivdi3
+
+DEFUN __udivdi3_umoddi3
+    push C0
+    push C1
+    push C4
+    push C5
+    XCALL __udivmod64
+    pop C5
+    pop C4
+    pop C1
+    pop C0
+    ret
+ENDF __udivdi3_umoddi3
+#endif /* L_udivdi3 */
+
+#if defined (L_udivmod64)
+
+;; Worker Routine for 64-Bit unsigned Quotient and Remainder Computation
+;; No Registers saved/restored; the Callers will take Care.
+;; Preserves B[] and T-flag
+;; T = 0: Compute Quotient  in A[]
+;; T = 1: Compute Remainder in A[] and shift SS one Bit left
+
+DEFUN __udivmod64
+
+    ;; Clear Remainder (C6, C7 will follow)
+    clr  C0
+    clr  C1
+    wmov C2, C0
+    wmov C4, C0
+    ldi  C7, 64
+
+#if SPEED_DIV == 0 || SPEED_DIV == 16
+    ;; Initialize Loop-Counter
+    mov  R_cnt, C7
+    wmov C6, C0
+#endif /* SPEED_DIV */
+
+#if SPEED_DIV == 8
+
+    push A7
+    clr  C6
+
+1:  ;; Compare shifted Dividend against Divisor
+    ;; If -- even after Shifting -- it is smaller...
+    CP  A7,B0  $  cpc C0,B1  $  cpc C1,B2  $  cpc C2,B3
+    cpc C3,B4  $  cpc C4,B5  $  cpc C5,B6  $  cpc C6,B7
+    brcc 2f
+
+    ;; ...then we can subtract it.  Thus, it is legal to shift left
+               $  mov C6,C5  $  mov C5,C4  $  mov C4,C3
+    mov C3,C2  $  mov C2,C1  $  mov C1,C0  $  mov C0,A7
+    mov A7,A6  $  mov A6,A5  $  mov A5,A4  $  mov A4,A3
+    mov A3,A2  $  mov A2,A1  $  mov A1,A0  $  clr A0
+
+    ;; 8 Bits are done
+    subi C7, 8
+    brne 1b
+
+    ;; Shifted 64 Bits:  A7 has traveled to C7
+    pop C7
+    ;; Divisor is greater than Dividend. We have:
+    ;; A[] % B[] = A[]
+    ;; A[] / B[] = 0
+    ;; Thus, we can return immediately
+    rjmp 5f
+
+2:  ;; Initialize Bit-Counter with Number of Bits still to be performed
+    mov  R_cnt, C7
+
+    ;; Push of A7 is not needed because C7 is still 0
+    pop  C7
+    clr  C7
+
+#elif SPEED_DIV == 16
+
+    ;; Compare shifted Dividend against Divisor
+    cp   A7, B3
+    cpc  C0, B4
+    cpc  C1, B5
+    cpc  C2, B6
+    cpc  C3, B7
+    brcc 2f
+
+    ;; Divisor is greater than shifted Dividend: We can shift the Dividend
+    ;; and it is still smaller than the Divisor --> Shift one 32-Bit Chunk
+    wmov C2,A6  $  wmov C0,A4
+    wmov A6,A2  $  wmov A4,A0
+    wmov A2,C6  $  wmov A0,C4
+
+    ;; Set Bit Counter to 32
+    lsr  R_cnt
+2:
+#elif SPEED_DIV
+#error SPEED_DIV = ?
+#endif /* SPEED_DIV */
+
+;; The very Division + Remainder Routine
+
+3:  ;; Left-shift Dividend...
+    lsl A0     $  rol A1     $  rol A2     $  rol A3
+    rol A4     $  rol A5     $  rol A6     $  rol A7
+
+    ;; ...into Remainder
+    rol C0     $  rol C1     $  rol C2     $  rol C3
+    rol C4     $  rol C5     $  rol C6     $  rol C7
+
+    ;; Compare Remainder and Divisor
+    CP  C0,B0  $  cpc C1,B1  $  cpc C2,B2  $  cpc C3,B3
+    cpc C4,B4  $  cpc C5,B5  $  cpc C6,B6  $  cpc C7,B7
+
+    brcs 4f
+
+    ;; Divisor fits into Remainder:  Subtract it from Remainder...
+    SUB C0,B0  $  sbc C1,B1  $  sbc C2,B2  $  sbc C3,B3
+    sbc C4,B4  $  sbc C5,B5  $  sbc C6,B6  $  sbc C7,B7
+
+    ;; ...and set according Bit in the upcoming Quotient
+    ;; The Bit will travel to its final Position
+    ori A0, 1
+
+4:  ;; This Bit is done
+    dec R_cnt
+    brne 3b
+    ;; __zero_reg__ is 0 again
+
+    ;; T = 0: We are fine with the Quotient in A[]
+    ;; T = 1: Copy Remainder to A[]
+5:  brtc 6f
+    wmov A0, C0
+    wmov A2, C2
+    wmov A4, C4
+    wmov A6, C6
+    ;; Move the Sign of the Result to SS.7
+    lsl SS
+
+6:  ret
+
+ENDF __udivmod64
+#endif /* L_udivmod64 */
+
+
+#if defined (L_divdi3)
+
+;; R25:R18 = R25:R18  mod  R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __moddi3
+    set
+    rjmp __divdi3_moddi3
+ENDF __moddi3
+
+;; R25:R18 = R25:R18  div  R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __divdi3
+    clt
+ENDF __divdi3
+
+DEFUN __divdi3_moddi3
+#if SPEED_DIV
+    mov  r31, A7
+    or   r31, B7
+    brmi 0f
+    ;; Both Signs are 0:  the following Complexity is not needed
+    XJMP __udivdi3_umoddi3
+#endif /* SPEED_DIV */
+
+0:  ;; The Prologue
+    ;; Save Z = 12 Registers:  Y, 17...8
+    ;; No Frame needed (X = 0)
+    clr r26
+    clr r27
+    ldi r30, lo8(gs(1f))
+    ldi r31, hi8(gs(1f))
+    XJMP __prologue_saves__ + ((18 - 12) * 2)
+
+1:  ;; SS.7 will contain the Sign of the Quotient (A.sign * B.sign)
+    ;; SS.6 will contain the Sign of the Remainder (A.sign)
+    mov SS, A7
+    asr SS
+    ;; Adjust Dividend's Sign as needed
+#if SPEED_DIV
+    ;; Compiling for Speed we know that at least one Sign must be < 0
+    ;; Thus, if A[] >= 0 then we know B[] < 0
+    brpl 22f
+#else
+    brpl 21f
+#endif /* SPEED_DIV */
+
+    XCALL __negdi2
+
+    ;; Adjust Divisor's Sign and SS.7 as needed
+21: tst B7
+    brpl 3f
+22: ldi NN, 1 << 7
+    eor SS, NN
+
+    ldi NN, -1
+    com B4     $  com B5     $  com B6     $  com B7
+               $  com B1     $  com B2     $  com B3
+    NEG B0
+               $  sbc B1,NN  $  sbc B2,NN  $  sbc B3,NN
+    sbc B4,NN  $  sbc B5,NN  $  sbc B6,NN  $  sbc B7,NN
+
+3:  ;; Do the unsigned 64-Bit Division/Modulo (depending on T-flag)
+    XCALL __udivmod64
+
+    ;; Adjust Result's Sign
+#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
+    tst  SS
+    brpl 4f
+#else
+    sbrc SS, 7
+#endif /* __AVR_ERRATA_SKIP_JMP_CALL__ */
+    XCALL __negdi2
+
+4:  ;; Epilogue: Restore the Z = 12 Registers and return
+    in  r28, __SP_L__
+    in  r29, __SP_H__
+    ldi r30, 12
+    XJMP __epilogue_restores__ + ((18 - 12) * 2)
+
+ENDF __divdi3_moddi3
+
+#undef R_cnt
+#undef SS
+#undef NN
+
+#endif /* L_divdi3 */
+
+#if defined (L_negdi2)
+DEFUN __negdi2
+
+    com  A4    $  com  A5    $  com  A6    $  com  A7
+               $  com  A1    $  com  A2    $  com  A3
+    NEG  A0
+               $  sbci A1,-1 $  sbci A2,-1 $  sbci A3,-1
+    sbci A4,-1 $  sbci A5,-1 $  sbci A6,-1 $  sbci A7,-1
+    ret
+
+ENDF __negdi2
+#endif /* L_negdi2 */
+
+#undef C7
+#undef C6
+#undef C5
+#undef C4
+#undef C3
+#undef C2
+#undef C1
+#undef C0
+
+#undef B7
+#undef B6
+#undef B5
+#undef B4
+#undef B3
+#undef B2
+#undef B1
+#undef B0
+
+#undef A7
+#undef A6
+#undef A5
+#undef A4
+#undef A3
+#undef A2
+#undef A1
+#undef A0
+
 
 .section .text.libgcc.prologue, "ax", @progbits
 
@@ -854,6 +1209,7 @@ ENDF __divmodsi4
 **********************************/
 
 #if defined (L_prologue)
+;; This function does not clobber T-flag; 64-bit division relies on it
 DEFUN __prologue_saves__
     push r2
     push r3
Index: libgcc/config/avr/t-avr
===================================================================
--- libgcc/config/avr/t-avr (revision 181482)
+++ libgcc/config/avr/t-avr (working copy)
@@ -15,6 +15,9 @@ LIB1ASMFUNCS = \
    _divmodpsi4 _udivmodpsi4 \
    _udivmodsi4 \
    _divmodsi4 \
+   _divdi3 _udivdi3 \
+   _udivmod64 \
+   _negdi2 \
    _prologue \
    _epilogue \
    _exit \
@@ -50,6 +53,7 @@ LIB1ASMFUNCS = \
    _fmul _fmuls _fmulsu
 
 LIB2FUNCS_EXCLUDE = \
+   _moddi3 _umoddi3 \
    _clz
 
 # We do not have the DF type.