This implements an assembler drop-in replacement for the 64-bit division/remainder routines.
The original libgcc implementation is extremely resource-hungry because the worker code is inlined in several places and DImode is expensive, anyway.  With the patch, the sizes (accumulated over all modules with the same name) are:

    Col #2 = Original libgcc (C code)
    Col #3 = New implementation in asm
    Col #4 = Change (relative)
    Col #5 = Change (absolute)

    _udivmod64.o          :       0     1362   100000.0%      1362
    _negdi2.o             :    1296      288      -77.8%     -1008
    _umoddi3.o            :   22326        0     -100.0%    -22326
    _udivdi3.o            :   26878      246      -99.1%    -26632
    _moddi3.o             :   28986        0     -100.0%    -28986
    _divdi3.o             :   33076      840      -97.5%    -32236
    :::::: Total :::::::  :  362188   252362      -30.3%   -109826

The detailed size report:

    avr3/libgcc.a!_udivmod64.o   :       0      174   100000.0%       174
    avr31/libgcc.a!_udivmod64.o  :       0      174   100000.0%       174
    avr6/libgcc.a!_udivmod64.o   :       0      162   100000.0%       162
    avr5/libgcc.a!_udivmod64.o   :       0      162   100000.0%       162
    avr35/libgcc.a!_udivmod64.o  :       0      162   100000.0%       162
    avr51/libgcc.a!_udivmod64.o  :       0      162   100000.0%       162
    avr25/libgcc.a!_udivmod64.o  :       0      126   100000.0%       126
    avr4/libgcc.a!_udivmod64.o   :       0      126   100000.0%       126
    libgcc.a!_udivmod64.o        :       0      114   100000.0%       114
    avr4/libgcc.a!_negdi2.o      :     140       32      -77.1%      -108
    avr25/libgcc.a!_negdi2.o     :     140       32      -77.1%      -108
    avr5/libgcc.a!_negdi2.o      :     144       32      -77.8%      -112
    avr6/libgcc.a!_negdi2.o      :     144       32      -77.8%      -112
    libgcc.a!_negdi2.o           :     144       32      -77.8%      -112
    avr35/libgcc.a!_negdi2.o     :     144       32      -77.8%      -112
    avr51/libgcc.a!_negdi2.o     :     144       32      -77.8%      -112
    avr31/libgcc.a!_negdi2.o     :     148       32      -78.4%      -116
    avr3/libgcc.a!_negdi2.o      :     148       32      -78.4%      -116
    avr4/libgcc.a!_umoddi3.o     :    2304        0     -100.0%     -2304
    avr6/libgcc.a!_umoddi3.o     :    2360        0     -100.0%     -2360
    avr51/libgcc.a!_umoddi3.o    :    2360        0     -100.0%     -2360
    avr5/libgcc.a!_umoddi3.o     :    2360        0     -100.0%     -2360
    avr25/libgcc.a!_umoddi3.o    :    2364        0     -100.0%     -2364
    avr35/libgcc.a!_umoddi3.o    :    2420        0     -100.0%     -2420
    libgcc.a!_umoddi3.o          :    2682        0     -100.0%     -2682
    avr3/libgcc.a!_umoddi3.o     :    2738        0     -100.0%     -2738
    avr31/libgcc.a!_umoddi3.o    :    2738        0     -100.0%     -2738
    avr4/libgcc.a!_udivdi3.o     :    2784       26      -99.1%     -2758
    avr25/libgcc.a!_udivdi3.o    :    2828       26      -99.1%     -2802
    avr5/libgcc.a!_udivdi3.o     :    2852       28      -99.0%     -2824
    avr6/libgcc.a!_udivdi3.o     :    2852       28      -99.0%     -2824
    avr51/libgcc.a!_udivdi3.o    :    2852       28      -99.0%     -2824
    avr35/libgcc.a!_udivdi3.o    :    2896       28      -99.0%     -2868
    avr4/libgcc.a!_moddi3.o      :    3016        0     -100.0%     -3016
    avr5/libgcc.a!_moddi3.o      :    3072        0     -100.0%     -3072
    avr6/libgcc.a!_moddi3.o      :    3072        0     -100.0%     -3072
    avr51/libgcc.a!_moddi3.o     :    3072        0     -100.0%     -3072
    avr25/libgcc.a!_moddi3.o     :    3124        0     -100.0%     -3124
    avr35/libgcc.a!_moddi3.o     :    3180        0     -100.0%     -3180
    libgcc.a!_udivdi3.o          :    3226       26      -99.2%     -3200
    avr31/libgcc.a!_udivdi3.o    :    3294       28      -99.1%     -3266
    avr3/libgcc.a!_udivdi3.o     :    3294       28      -99.1%     -3266
    avr4/libgcc.a!_divdi3.o      :    3396       86      -97.5%     -3310
    avr5/libgcc.a!_divdi3.o      :    3464       98      -97.2%     -3366
    avr51/libgcc.a!_divdi3.o     :    3464       98      -97.2%     -3366
    avr6/libgcc.a!_divdi3.o      :    3464       98      -97.2%     -3366
    libgcc.a!_moddi3.o           :    3446        0     -100.0%     -3446
    avr25/libgcc.a!_divdi3.o     :    3578       86      -97.6%     -3492
    avr31/libgcc.a!_moddi3.o     :    3502        0     -100.0%     -3502
    avr3/libgcc.a!_moddi3.o      :    3502        0     -100.0%     -3502
    avr35/libgcc.a!_divdi3.o     :    3646       98      -97.3%     -3548
    libgcc.a!_divdi3.o           :    3976       78      -98.0%     -3898
    avr31/libgcc.a!_divdi3.o     :    4044      100      -97.5%     -3944
    avr3/libgcc.a!_divdi3.o      :    4044       98      -97.6%     -3946
    :::::: Total :::::::        :   362188   252362      -30.3%   -109826

The implementation is basically the same as the division/modulo routines already present in lib1funcs.S.  However, the algorithm does not compute the quotient as a complement by recycling the carry bit; instead, the quotient bits are set directly.  That way, n instructions can be saved when dealing with n-byte values.
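To illustrate the approach (this is not part of the patch, and the function name udivmod64_model is made up), here is a rough C model of the plain shift-and-subtract loop that __udivmod64 performs for SPEED_DIV = 0; the SPEED_DIV variants merely skip leading 8-bit or 32-bit chunks before entering this loop:

    #include <stdint.h>

    /* Rough C model of the bitwise restoring division in __udivmod64.
       The dividend is shifted left bit by bit into the remainder; whenever
       the divisor fits, it is subtracted and a 1-bit is placed at the low
       end of the dividend register, from where it travels up and finally
       forms the quotient.  */
    static uint64_t
    udivmod64_model (uint64_t a, uint64_t b, uint64_t *rem)
    {
      uint64_t r = 0;                   /* remainder, expanded bit by bit */

      for (int i = 0; i < 64; i++)
        {
          r = (r << 1) | (a >> 63);     /* shift dividend left into remainder */
          a <<= 1;
          if (r >= b)                   /* divisor fits into remainder ...     */
            {
              r -= b;                   /* ... subtract it ...                 */
              a |= 1;                   /* ... and set the upcoming quotient bit */
            }
        }

      *rem = r;
      return a;                         /* dividend register now holds the quotient */
    }

Setting the quotient bit directly (a |= 1) is what avoids the final per-byte complement of the quotient that the carry-recycling scheme needs.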
The implementation provides a speed-up of the algorithm for the case when there is enough flash; the assumption is that arithmetic speed matters on such devices.  As you can see above, the size of __udivmod64 varies from 114 to 174 bytes:

    114      = small devices without MOVW  (no speed-up:    SPEED_DIV = 0)
    126      = small devices with MOVW     (small speed-up: SPEED_DIV = 16)
    162, 174 = devices >= 16k              (best speed-up:  SPEED_DIV = 8)

The testsuite passed without regressions.  Moreover, the algorithm was tested individually against the old implementation; the only difference I observed was for divisor = 0.

Ok for trunk?

Johann

libgcc/
        PR target/49313
        * config/avr/t-avr (LIB2FUNCS_EXCLUDE): Add _moddi3, _umoddi3.
        (LIB1ASMFUNCS): Add _divdi3, _udivdi3, _udivmod64, _negdi2.
        * config/avr/lib1funcs.S (wmov): New assembler macro.
        (__umoddi3, __udivdi3, __udivdi3_umoddi3): New functions.
        (__moddi3, __divdi3, __divdi3_moddi3): New functions.
        (__udivmod64): New function.
        (__negdi2): New function.
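For reference (not part of the patch): 64-bit division and modulo lower to the libcalls touched here, so a small C translation unit like the following pulls in the new routines; the function names are arbitrary.

    #include <stdint.h>

    uint64_t udiv64 (uint64_t a, uint64_t b) { return a / b; }   /* calls __udivdi3 */
    uint64_t umod64 (uint64_t a, uint64_t b) { return a % b; }   /* calls __umoddi3 */
    int64_t  sdiv64 (int64_t a, int64_t b)   { return a / b; }   /* calls __divdi3  */
    int64_t  smod64 (int64_t a, int64_t b)   { return a % b; }   /* calls __moddi3  */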
Index: libgcc/config/avr/lib1funcs.S
===================================================================
--- libgcc/config/avr/lib1funcs.S   (revision 181482)
+++ libgcc/config/avr/lib1funcs.S   (working copy)
@@ -61,6 +61,15 @@ see the files COPYING3 and COPYING.RUNTI
 #endif
 .endm
 
+.macro wmov r_dest, r_src
+#if defined (__AVR_HAVE_MOVW__)
+    movw \r_dest,   \r_src
+#else
+    mov \r_dest,    \r_src
+    mov \r_dest+1,  \r_src+1
+#endif
+.endm
+
 #if defined (__AVR_HAVE_JMP_CALL__)
 #define XCALL call
 #define XJMP  jmp
@@ -846,6 +855,352 @@ __divmodsi4_exit:
 ENDF __divmodsi4
 #endif /* defined (L_divmodsi4) */
 
+
+/*******************************************************
+       Division 64 / 64
+       Modulo   64 % 64
+*******************************************************/
+
+;; Use Speed-optimized Version on "big" Devices, i.e. Devices with
+;; at least 16k of Program Memory.  For smaller Devices, depend
+;; on MOVW.
+
+#if defined (__AVR_HAVE_JMP_CALL__)
+#   define SPEED_DIV 8
+#elif defined (__AVR_HAVE_MOVW__)
+#   define SPEED_DIV 16
+#else
+#   define SPEED_DIV 0
+#endif
+
+;; A[0..7]: In: Dividend;
+;;          Out: Quotient  (T = 0)
+;;          Out: Remainder (T = 1)
+#define A0  18
+#define A1  A0+1
+#define A2  A0+2
+#define A3  A0+3
+#define A4  A0+4
+#define A5  A0+5
+#define A6  A0+6
+#define A7  A0+7
+
+;; B[0..7]: In: Divisor;   Out: Clobber
+#define B0  10
+#define B1  B0+1
+#define B2  B0+2
+#define B3  B0+3
+#define B4  B0+4
+#define B5  B0+5
+#define B6  B0+6
+#define B7  B0+7
+
+;; C[0..7]: Expand remainder;  Out: Remainder (unused)
+#define C0  8
+#define C1  C0+1
+#define C2  30
+#define C3  C2+1
+#define C4  28
+#define C5  C4+1
+#define C6  26
+#define C7  C6+1
+
+;; Holds Signs during Division Routine
+#define SS __tmp_reg__
+
+;; Bit-Counter in Division Routine
+#define R_cnt __zero_reg__
+
+;; Scratch Register for Negation
+#define NN r31
+
+#if defined (L_udivdi3)
+
+;; R25:R18 = R25:R18  umod  R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __umoddi3
+    set
+    rjmp __udivdi3_umoddi3
+ENDF __umoddi3
+
+;; R25:R18 = R25:R18  udiv  R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __udivdi3
+    clt
+ENDF __udivdi3
+
+DEFUN __udivdi3_umoddi3
+    push C0
+    push C1
+    push C4
+    push C5
+    XCALL __udivmod64
+    pop C5
+    pop C4
+    pop C1
+    pop C0
+    ret
+ENDF __udivdi3_umoddi3
+#endif /* L_udivdi3 */
+
+#if defined (L_udivmod64)
+
+;; Worker Routine for 64-Bit unsigned Quotient and Remainder Computation
+;; No Registers saved/restored; the Callers will take Care.
+;; Preserves B[] and T-flag
+;; T = 0: Compute Quotient  in A[]
+;; T = 1: Compute Remainder in A[] and shift SS one Bit left
+
+DEFUN __udivmod64
+
+    ;; Clear Remainder (C6, C7 will follow)
+    clr  C0
+    clr  C1
+    wmov C2, C0
+    wmov C4, C0
+    ldi  C7, 64
+
+#if SPEED_DIV == 0 || SPEED_DIV == 16
+    ;; Initialize Loop-Counter
+    mov  R_cnt, C7
+    wmov C6, C0
+#endif /* SPEED_DIV */
+
+#if SPEED_DIV == 8
+
+    push A7
+    clr  C6
+
+1:  ;; Compare shifted Dividend against Divisor
+    ;; If -- even after Shifting -- it is smaller...
+    CP  A7,B0  $  cpc C0,B1  $  cpc C1,B2  $  cpc C2,B3
+    cpc C3,B4  $  cpc C4,B5  $  cpc C5,B6  $  cpc C6,B7
+    brcc 2f
+
+    ;; ...then we can subtract it.  Thus, it is legal to shift left
+               $  mov C6,C5  $  mov C5,C4  $  mov C4,C3
+    mov C3,C2  $  mov C2,C1  $  mov C1,C0  $  mov C0,A7
+    mov A7,A6  $  mov A6,A5  $  mov A5,A4  $  mov A4,A3
+    mov A3,A2  $  mov A2,A1  $  mov A1,A0  $  clr A0
+
+    ;; 8 Bits are done
+    subi C7, 8
+    brne 1b
+
+    ;; Shifted 64 Bits:  A7 has traveled to C7
+    pop C7
+    ;; Divisor is greater than Dividend. We have:
+    ;; A[] % B[] = A[]
+    ;; A[] / B[] = 0
+    ;; Thus, we can return immediately
+    rjmp 5f
+
+2:  ;; Initialize Bit-Counter with Number of Bits still to be performed
+    mov  R_cnt, C7
+
+    ;; Push of A7 is not needed because C7 is still 0
+    pop  C7
+    clr  C7
+
+#elif SPEED_DIV == 16
+
+    ;; Compare shifted Dividend against Divisor
+    cp   A7, B3
+    cpc  C0, B4
+    cpc  C1, B5
+    cpc  C2, B6
+    cpc  C3, B7
+    brcc 2f
+
+    ;; Divisor is greater than shifted Dividend: We can shift the Dividend
+    ;; and it is still smaller than the Divisor --> Shift one 32-Bit Chunk
+    wmov C2,A6  $  wmov C0,A4
+    wmov A6,A2  $  wmov A4,A0
+    wmov A2,C6  $  wmov A0,C4
+
+    ;; Set Bit Counter to 32
+    lsr  R_cnt
+2:
+#elif SPEED_DIV
+#error SPEED_DIV = ?
+#endif /* SPEED_DIV */
+
+;; The very Division + Remainder Routine
+
+3:  ;; Left-shift Dividend...
+    lsl A0     $  rol A1     $  rol A2     $  rol A3
+    rol A4     $  rol A5     $  rol A6     $  rol A7
+
+    ;; ...into Remainder
+    rol C0     $  rol C1     $  rol C2     $  rol C3
+    rol C4     $  rol C5     $  rol C6     $  rol C7
+
+    ;; Compare Remainder and Divisor
+    CP  C0,B0  $  cpc C1,B1  $  cpc C2,B2  $  cpc C3,B3
+    cpc C4,B4  $  cpc C5,B5  $  cpc C6,B6  $  cpc C7,B7
+
+    brcs 4f
+
+    ;; Divisor fits into Remainder:  Subtract it from Remainder...
+    SUB C0,B0  $  sbc C1,B1  $  sbc C2,B2  $  sbc C3,B3
+    sbc C4,B4  $  sbc C5,B5  $  sbc C6,B6  $  sbc C7,B7
+
+    ;; ...and set according Bit in the upcoming Quotient
+    ;; The Bit will travel to its final Position
+    ori A0, 1
+
+4:  ;; This Bit is done
+    dec R_cnt
+    brne 3b
+    ;; __zero_reg__ is 0 again
+
+    ;; T = 0: We are fine with the Quotient in A[]
+    ;; T = 1: Copy Remainder to A[]
+5:  brtc 6f
+    wmov A0, C0
+    wmov A2, C2
+    wmov A4, C4
+    wmov A6, C6
+    ;; Move the Sign of the Result to SS.7
+    lsl SS
+
+6:  ret
+
+ENDF __udivmod64
+#endif /* L_udivmod64 */
+
+
+#if defined (L_divdi3)
+
+;; R25:R18 = R25:R18  mod  R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __moddi3
+    set
+    rjmp __divdi3_moddi3
+ENDF __moddi3
+
+;; R25:R18 = R25:R18  div  R17:R10
+;; Ordinary ABI-Function
+
+DEFUN __divdi3
+    clt
+ENDF __divdi3
+
+DEFUN __divdi3_moddi3
+#if SPEED_DIV
+    mov  r31, A7
+    or   r31, B7
+    brmi 0f
+    ;; Both Signs are 0:  the following Complexity is not needed
+    XJMP __udivdi3_umoddi3
+#endif /* SPEED_DIV */
+
+0:  ;; The Prologue
+    ;; Save Z = 12 Registers:  Y, 17...8
+    ;; No Frame needed (X = 0)
+    clr r26
+    clr r27
+    ldi r30, lo8(gs(1f))
+    ldi r31, hi8(gs(1f))
+    XJMP __prologue_saves__ + ((18 - 12) * 2)
+
+1:  ;; SS.7 will contain the Sign of the Quotient (A.sign * B.sign)
+    ;; SS.6 will contain the Sign of the Remainder (A.sign)
+    mov SS, A7
+    asr SS
+    ;; Adjust Dividend's Sign as needed
+#if SPEED_DIV
+    ;; Compiling for Speed we know that at least one Sign must be < 0
+    ;; Thus, if A[] >= 0 then we know B[] < 0
+    brpl 22f
+#else
+    brpl 21f
+#endif /* SPEED_DIV */
+
+    XCALL __negdi2
+
+    ;; Adjust Divisor's Sign and SS.7 as needed
+21: tst B7
+    brpl 3f
+22: ldi NN, 1 << 7
+    eor SS, NN
+
+    ldi NN, -1
+    com B4     $  com B5     $  com B6     $  com B7
+               $  com B1     $  com B2     $  com B3
+    NEG B0
+               $  sbc B1,NN  $  sbc B2,NN  $  sbc B3,NN
+    sbc B4,NN  $  sbc B5,NN  $  sbc B6,NN  $  sbc B7,NN
+
+3:  ;; Do the unsigned 64-Bit Division/Modulo (depending on T-flag)
+    XCALL __udivmod64
+
+    ;; Adjust Result's Sign
+#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
+    tst  SS
+    brpl 4f
+#else
+    sbrc SS, 7
+#endif /* __AVR_ERRATA_SKIP_JMP_CALL__ */
+    XCALL __negdi2
+
+4:  ;; Epilogue: Restore the Z = 12 Registers and return
+    in  r28, __SP_L__
+    in  r29, __SP_H__
+    ldi r30, 12
+    XJMP __epilogue_restores__ + ((18 - 12) * 2)
+
+ENDF __divdi3_moddi3
+
+#undef R_cnt
+#undef SS
+#undef NN
+
+#endif /* L_divdi3 */
+
+#if defined (L_negdi2)
+DEFUN __negdi2
+
+    com  A4    $  com  A5    $  com  A6    $  com  A7
+               $  com  A1    $  com  A2    $  com  A3
+    NEG  A0
+               $  sbci A1,-1 $  sbci A2,-1 $  sbci A3,-1
+    sbci A4,-1 $  sbci A5,-1 $  sbci A6,-1 $  sbci A7,-1
+    ret
+
+ENDF __negdi2
+#endif /* L_negdi2 */
+
+#undef C7
+#undef C6
+#undef C5
+#undef C4
+#undef C3
+#undef C2
+#undef C1
+#undef C0
+
+#undef B7
+#undef B6
+#undef B5
+#undef B4
+#undef B3
+#undef B2
+#undef B1
+#undef B0
+
+#undef A7
+#undef A6
+#undef A5
+#undef A4
+#undef A3
+#undef A2
+#undef A1
+#undef A0
+
 
 .section .text.libgcc.prologue, "ax", @progbits
 
@@ -854,6 +1209,7 @@ ENDF __divmodsi4
 **********************************/
 
 #if defined (L_prologue)
+;; This function does not clobber T-flag; 64-bit division relies on it
 DEFUN __prologue_saves__
     push r2
     push r3
Index: libgcc/config/avr/t-avr
===================================================================
--- libgcc/config/avr/t-avr (revision 181482)
+++ libgcc/config/avr/t-avr (working copy)
@@ -15,6 +15,9 @@ LIB1ASMFUNCS = \
    _divmodpsi4 _udivmodpsi4 \
    _udivmodsi4 \
    _divmodsi4 \
+   _divdi3 _udivdi3 \
+   _udivmod64 \
+   _negdi2 \
    _prologue \
    _epilogue \
    _exit \
@@ -50,6 +53,7 @@ LIB1ASMFUNCS = \
    _fmul _fmuls _fmulsu
 
 LIB2FUNCS_EXCLUDE = \
+   _moddi3 _umoddi3 \
    _clz
 
 # We do not have the DF type.