Hi all, Attached is the latest version of a patch we use at FreeBSD to add optimized multiply/divide functions on SPARC64. Description from the original bug report[1]:
"According to a developer at the FreeBSD project, FreeBSD's total compilation time increases by 2.6% when the host system is built against compiler-rt instead of libgcc. This is likely due to the fact that GCC has assembly-written versions of the division and modulo routines, while compiler-rt does not. The division and modulo routines used by GCC can easily be re-used by compiler-rt. They are provided for free in The SPARC Architecture Manual Version 8. Attached to this bug report is a patch that I have written for compiler-rt. It contains the M4 file that is listed in the manual, with some small modifications: - The M4 file uses exponentiation (2^N). This seems to be a Sun-specific extension to M4, as I cannot reproduce it with GNU and BSD m4. Fix this similar to OpenBSD's version by replacing 2^N with TWOSUPN. - Use the same register layout as GCC's version. - Integrate into compiler-rt's codebase by using DEFINE_COMPILERRT_FUNCTION()." The diff includes a `generate.sh', which generates the actual assembly files. I guess we don't want to depend on M4 to build compiler-rt, so it may be easier to run generate.sh and store the resulting C files in the repository as well. Sorry for not providing a CMakefile. At FreeBSD, we never build compiler-rt with CMake, as we imported compiler-rt into our own build infrastructure. -- Ed Schouten <[email protected]> [1] http://llvm.org/bugs/show_bug.cgi?id=11667
Index: lib/sparc64/divmod.m4 =================================================================== --- lib/sparc64/divmod.m4 (Revision 0) +++ lib/sparc64/divmod.m4 (Arbeitskopie) @@ -0,0 +1,259 @@ +//===-- lib/sparc64/divmod.m4 - Integer division and modulo ----*- asm -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements division and modulo routines in SPARC64 +// assembly for the compiler-rt library. This m4 code has been taken +// from The SPARC Architecture Manual Version 8. +// +//===----------------------------------------------------------------------===// + +/* + * Division/Remainder + * + * Input is: + * dividend -- the thing being divided + * divisor -- how many ways to divide it + * Important parameters: + * N -- how many bits per iteration we try to get + * as our current guess: define(N, 4) define(TWOSUPN, 16) + * WORDSIZE -- how many bits altogether we're talking about: + * obviously: define(WORDSIZE, 32) + * A derived constant: + * TOPBITS -- how many bits are in the top "decade" of a number: + * define(TOPBITS, eval( WORDSIZE - N*((WORDSIZE-1)/N) ) ) + * Important variables are: + * Q -- the partial quotient under development -- initially 0 + * R -- the remainder so far -- initially == the dividend + * ITER -- number of iterations of the main division loop which will + * be required. Equal to CEIL( lg2(quotient)/N ) + * Note that this is log_base_(2ˆN) of the quotient. + * V -- the current comparand -- initially divisor*2ˆ(ITER*N-1) + * Cost: + * current estimate for non-large dividend is + * CEIL( lg2(quotient) / N ) x ( 10 + 7N/2 ) + C + * a large dividend is one greater than 2ˆ(31-TOPBITS) and takes a + * different path, as the upper bits of the quotient must be developed + * one bit at a time. + * This uses the m4 and cpp macro preprocessors. + */ + +define(dividend, `%o0') +define(divisor,`%o1') +define(Q, `%o2') +define(R, `%o3') +define(ITER, `%o4') +define(V, `%o5') +define(SIGN, `%g3') +define(T, `%g1') +define(SC,`%g2') +/* + * This is the recursive definition of how we develop quotient digits. + * It takes three important parameters: + * $1 -- the current depth, 1<=$1<=N + * $2 -- the current accumulation of quotient bits + * N -- max depth + * We add a new bit to $2 and either recurse or insert the bits in the quotient. + * Dynamic input: + * R -- current remainder + * Q -- current quotient + * V -- current comparand + * cc -- set on current value of R + * Dynamic output: + * R', Q', V', cc' + */ + +#include "../assembly.h" + +define(DEVELOP_QUOTIENT_BITS, +` !depth $1, accumulated bits $2 + bl L.$1.eval(TWOSUPN+$2) + srl V,1,V + ! remainder is nonnegative + subcc R,V,R + ifelse( $1, N, + ` b 9f + add Q, ($2*2+1), Q + ',` DEVELOP_QUOTIENT_BITS( incr($1), `eval(2*$2+1)') + ') +L.$1.eval(TWOSUPN+$2): + ! remainder is negative + addcc R,V,R + ifelse( $1, N, + ` b 9f + add Q, ($2*2-1), Q + ',` DEVELOP_QUOTIENT_BITS( incr($1), `eval(2*$2-1)') + ') + ifelse( $1, 1, `9:') +') +ifelse( ANSWER, `quotient', ` +.text + .align 32 +DEFINE_COMPILERRT_FUNCTION(__udivsi3) + b divide + mov 0,SIGN ! result always nonnegative +.text + .align 32 +DEFINE_COMPILERRT_FUNCTION(__divsi3) + orcc divisor,dividend,%g0 ! are either dividend or divisor negative + bge divide ! if not, skip this junk + xor divisor,dividend,SIGN ! record sign of result in sign of SIGN + tst divisor + bge 2f + tst dividend + ! divisor < 0 + bge divide + neg divisor + 2: + ! dividend < 0 + neg dividend + ! FALL THROUGH +',` +.text + .align 32 +DEFINE_COMPILERRT_FUNCTION(__umodsi3) + b divide + mov 0,SIGN ! result always nonnegative +.text + .align 32 +DEFINE_COMPILERRT_FUNCTION(__modsi3) + orcc divisor,dividend,%g0 ! are either dividend or divisor negative + bge divide ! if not, skip this junk + mov dividend,SIGN ! record sign of result in sign of SIGN + tst divisor + bge 2f + tst dividend + ! divisor < 0 + bge divide + neg divisor + 2: + ! dividend < 0 + neg dividend + ! FALL THROUGH +') + +divide: + ! Compute size of quotient, scale comparand. + orcc divisor,%g0,V ! movcc divisor,V + te 2 ! if divisor = 0 + mov dividend,R + mov 0,Q + sethi %hi(1<<(WORDSIZE-TOPBITS-1)),T + cmp R,T + blu not_really_big + mov 0,ITER + ! + ! Here, the dividend is >= 2ˆ(31-N) or so. We must be careful here, + ! as our usual N-at-a-shot divide step will cause overflow and havoc. + ! The total number of bits in the result here is N*ITER+SC, where + ! SC <= N. + ! Compute ITER in an unorthodox manner: know we need to Shift V into +! the top decade: so don't even bother to compare to R. +1: + cmp V,T + bgeu 3f + mov 1,SC + sll V,N,V + b 1b + inc ITER +! Now compute SC +2: addcc V,V,V + bcc not_too_big + add SC,1,SC + ! We're here if the divisor overflowed when Shifting. + ! This means that R has the high-order bit set. + ! Restore V and subtract from R. + sll T,TOPBITS,T ! high order bit + srl V,1,V ! rest of V + add V,T,V + b do_single_div + dec SC +not_too_big: +3: cmp V,R + blu 2b + nop + be do_single_div + nop +! V > R: went too far: back up 1 step +! srl V,1,V +! dec SC +! do single-bit divide steps +! +! We have to be careful here. We know that R >= V, so we can do the +! first divide step without thinking. BUT, the others are conditional, +! and are only done if R >= 0. Because both R and V may have the high- +! order bit set in the first step, just falling into the regular +! division loop will mess up the first time around. +! So we unroll slightly... +do_single_div: + deccc SC + bl end_regular_divide + nop + sub R,V,R + mov 1,Q + b,a end_single_divloop + ! EMPTY +single_divloop: + sll Q,1,Q + bl 1f + srl V,1,V + ! R >= 0 + sub R,V,R + b 2f + inc Q + 1: ! R < 0 + add R,V,R + dec Q + 2: + end_single_divloop: + deccc SC + bge single_divloop + tst R + b,a end_regular_divide + ! EMPTY + +not_really_big: +1: + sll V,N,V + cmp V,R + bleu 1b + inccc ITER + be got_result + dec ITER +do_regular_divide: + ! Do the main division iteration + tst R + ! Fall through into divide loop +divloop: + sll Q,N,Q + DEVELOP_QUOTIENT_BITS( 1, 0 ) +end_regular_divide: + deccc ITER + bge divloop + tst R + bl,a got_result + ! non-restoring fixup if remainder < 0, otherwise annulled +ifelse( ANSWER, `quotient', +` dec Q +',` add R,divisor,R +') + +got_result: + tst SIGN + bl,a 1f + ! negate for answer < 0, otherwise annulled +ifelse( ANSWER, `quotient', +` neg %o2,%o2 ! Q <- -Q +',` neg %o3,%o3 ! R <- -R +') +1: + retl ! leaf-routine return +ifelse( ANSWER, `quotient', +` mov %o2,%o0 ! quotient <- Q +',` mov %o3,%o0 ! remainder <- R +') Index: lib/sparc64/generate.sh =================================================================== --- lib/sparc64/generate.sh (Revision 0) +++ lib/sparc64/generate.sh (Arbeitskopie) @@ -0,0 +1,4 @@ +#!/bin/sh + +m4 divmod.m4 | sed -e 's/[[:space:]]*$//' | grep -v '^$' > modsi3.S +m4 -DANSWER=quotient divmod.m4 | sed -e 's/[[:space:]]*$//' | grep -v '^$' > divsi3.S Index: lib/sparc64/udivsi3.S =================================================================== --- lib/sparc64/udivsi3.S (Revision 0) +++ lib/sparc64/udivsi3.S (Arbeitskopie) @@ -0,0 +1,5 @@ +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. + +// This file is intentionally left blank, as __udivsi3 is already +// provided by divsi3.S. Index: lib/sparc64/umodsi3.S =================================================================== --- lib/sparc64/umodsi3.S (Revision 0) +++ lib/sparc64/umodsi3.S (Arbeitskopie) @@ -0,0 +1,5 @@ +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. + +// This file is intentionally left blank, as __umodsi3 is already +// provided by modsi3.S.
_______________________________________________ cfe-commits mailing list [email protected] http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits
