[cfe-commits] [Patch] compiler-rt: SPARC64-optimized multiply/divide

Ed Schouten Thu, 18 Oct 2012 01:02:06 -0700

Hi all,

Attached is the latest version of a patch we use at FreeBSD to add
optimized multiply/divide functions on SPARC64. Description from the
original bug report[1]:


"According to a developer at the FreeBSD project, FreeBSD's total compilation
time increases by 2.6% when the host system is built against compiler-rt
instead of libgcc. This is likely due to the fact that GCC has assembly-written
versions of the division and modulo routines, while compiler-rt does not.

The division and modulo routines used by GCC can easily be re-used by
compiler-rt. They are provided for free in The SPARC Architecture Manual
Version 8. Attached to this bug report is a patch that I have written for
compiler-rt. It contains the M4 file that is listed in the manual, with some
small modifications:

- The M4 file uses exponentiation (2^N). This seems to be a Sun-specific
extension to M4, as I cannot reproduce it with GNU and BSD m4. Fix this similar
to OpenBSD's version by replacing 2^N with TWOSUPN.

- Use the same register layout as GCC's version.

- Integrate into compiler-rt's codebase by using DEFINE_COMPILERRT_FUNCTION()."

The diff includes a `generate.sh', which generates the actual assembly
files. I guess we don't want to depend on M4 to build compiler-rt, so
it may be easier to run generate.sh and store the resulting C files in
the repository as well. Sorry for not providing a CMakefile. At
FreeBSD, we never build compiler-rt with CMake, as we imported
compiler-rt into our own build infrastructure.

-- 
Ed Schouten <[email protected]>

[1] http://llvm.org/bugs/show_bug.cgi?id=11667

Index: lib/sparc64/divmod.m4
===================================================================
--- lib/sparc64/divmod.m4       (Revision 0)
+++ lib/sparc64/divmod.m4       (Arbeitskopie)
@@ -0,0 +1,259 @@
+//===-- lib/sparc64/divmod.m4 - Integer division and modulo ----*- asm -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements division and modulo routines in SPARC64
+// assembly for the compiler-rt library.  This m4 code has been taken
+// from The SPARC Architecture Manual Version 8.
+//
+//===----------------------------------------------------------------------===//
+
+/*
+ * Division/Remainder
+ *
+ * Input is:
+ *   dividend -- the thing being divided
+ *   divisor -- how many ways to divide it
+ * Important parameters:
+ *   N -- how many bits per iteration we try to get
+ *        as our current guess: define(N, 4) define(TWOSUPN, 16)
+ *   WORDSIZE -- how many bits altogether we're talking about:
+ *               obviously: define(WORDSIZE, 32)
+ * A derived constant:
+ *   TOPBITS -- how many bits are in the top "decade" of a number:
+ *        define(TOPBITS, eval( WORDSIZE - N*((WORDSIZE-1)/N) ) )
+ * Important variables are:
+ *   Q -- the partial quotient under development -- initially 0
+ *   R -- the remainder so far -- initially == the dividend
+ *   ITER -- number of iterations of the main division loop which will
+ *           be required. Equal to CEIL( lg2(quotient)/N )
+ *           Note that this is log_base_(2ˆN) of the quotient.
+ *   V -- the current comparand -- initially divisor*2ˆ(ITER*N-1)
+ * Cost:
+ *   current estimate for non-large dividend is
+ *        CEIL( lg2(quotient) / N ) x ( 10 + 7N/2 ) + C
+ *   a large dividend is one greater than 2ˆ(31-TOPBITS) and takes a
+ *   different path, as the upper bits of the quotient must be developed
+ *   one bit at a time.
+ *   This uses the m4 and cpp macro preprocessors.
+ */
+
+define(dividend, `%o0')
+define(divisor,`%o1')
+define(Q, `%o2')
+define(R, `%o3')
+define(ITER, `%o4')
+define(V, `%o5')
+define(SIGN, `%g3')
+define(T, `%g1')
+define(SC,`%g2')
+/*
+ * This is the recursive definition of how we develop quotient digits.
+ * It takes three important parameters:
+ *   $1 -- the current depth, 1<=$1<=N
+ *   $2 -- the current accumulation of quotient bits
+ *   N -- max depth
+ * We add a new bit to $2 and either recurse or insert the bits in the 
quotient.
+ * Dynamic input:
+ *   R -- current remainder
+ *   Q -- current quotient
+ *   V -- current comparand
+ *   cc -- set on current value of R
+ * Dynamic output:
+ *   R', Q', V', cc'
+ */
+
+#include "../assembly.h"
+
+define(DEVELOP_QUOTIENT_BITS,
+`      !depth $1, accumulated bits $2
+       bl      L.$1.eval(TWOSUPN+$2)
+       srl     V,1,V
+       ! remainder is nonnegative
+       subcc   R,V,R
+       ifelse( $1, N,
+       `       b       9f
+               add     Q, ($2*2+1), Q
+       ',` DEVELOP_QUOTIENT_BITS( incr($1), `eval(2*$2+1)')
+       ')
+L.$1.eval(TWOSUPN+$2):
+       ! remainder is negative
+       addcc   R,V,R
+       ifelse( $1, N,
+       `       b       9f
+               add     Q, ($2*2-1), Q
+       ',` DEVELOP_QUOTIENT_BITS( incr($1), `eval(2*$2-1)')
+       ')
+       ifelse( $1, 1, `9:')
+')
+ifelse( ANSWER, `quotient', `
+.text
+       .align 32
+DEFINE_COMPILERRT_FUNCTION(__udivsi3)
+       b       divide
+       mov     0,SIGN                  ! result always nonnegative
+.text
+       .align 32
+DEFINE_COMPILERRT_FUNCTION(__divsi3)
+       orcc    divisor,dividend,%g0    ! are either dividend or divisor 
negative
+       bge     divide                  ! if not, skip this junk
+       xor     divisor,dividend,SIGN   ! record sign of result in sign of SIGN
+       tst     divisor
+       bge     2f
+       tst     dividend
+       ! divisor < 0
+       bge     divide
+       neg     divisor
+       2:
+       ! dividend < 0
+       neg     dividend
+       ! FALL THROUGH
+',`
+.text
+       .align 32
+DEFINE_COMPILERRT_FUNCTION(__umodsi3)
+       b       divide
+       mov     0,SIGN                  ! result always nonnegative
+.text
+       .align 32
+DEFINE_COMPILERRT_FUNCTION(__modsi3)
+       orcc    divisor,dividend,%g0    ! are either dividend or divisor 
negative
+       bge     divide                  ! if not, skip this junk
+       mov     dividend,SIGN           ! record sign of result in sign of SIGN
+       tst     divisor
+       bge     2f
+       tst     dividend
+       ! divisor < 0
+       bge     divide
+       neg     divisor
+       2:
+       ! dividend < 0
+       neg     dividend
+       ! FALL THROUGH
+')
+
+divide:
+       ! Compute size of quotient, scale comparand.
+       orcc    divisor,%g0,V           ! movcc divisor,V
+       te      2                       ! if divisor = 0
+       mov     dividend,R
+       mov     0,Q
+       sethi   %hi(1<<(WORDSIZE-TOPBITS-1)),T
+       cmp     R,T
+       blu     not_really_big
+       mov     0,ITER
+       !
+       ! Here, the dividend is >= 2ˆ(31-N) or so. We must be careful here,
+       ! as our usual N-at-a-shot divide step will cause overflow and havoc.
+       ! The total number of bits in the result here is N*ITER+SC, where
+       ! SC <= N.
+       ! Compute ITER in an unorthodox manner: know we need to Shift V into
+! the top decade: so don't even bother to compare to R.
+1:
+       cmp     V,T
+       bgeu    3f
+       mov     1,SC
+       sll     V,N,V
+       b       1b
+       inc     ITER
+! Now compute SC
+2:     addcc   V,V,V
+       bcc     not_too_big
+       add     SC,1,SC
+               ! We're here if the divisor overflowed when Shifting.
+               ! This means that R has the high-order bit set.
+               ! Restore V and subtract from R.
+               sll     T,TOPBITS,T     ! high order bit
+               srl     V,1,V           ! rest of V
+               add     V,T,V
+               b       do_single_div
+               dec     SC
+not_too_big:
+3:     cmp     V,R
+       blu     2b
+       nop
+       be      do_single_div
+       nop
+! V > R: went too far: back up 1 step
+!     srl V,1,V
+!      dec SC
+! do single-bit divide steps
+!
+! We have to be careful here. We know that R >= V, so we can do the
+! first divide step without thinking. BUT, the others are conditional,
+! and are only done if R >= 0. Because both R and V may have the high-
+! order bit set in the first step, just falling into the regular
+! division loop will mess up the first time around.
+! So we unroll slightly...
+do_single_div:
+       deccc   SC
+       bl      end_regular_divide
+       nop
+       sub     R,V,R
+       mov     1,Q
+       b,a     end_single_divloop
+       ! EMPTY
+single_divloop:
+       sll     Q,1,Q
+       bl      1f
+       srl     V,1,V
+       ! R >= 0
+               sub     R,V,R
+               b       2f
+               inc     Q
+       1:      ! R < 0
+               add     R,V,R
+               dec     Q
+       2:
+       end_single_divloop:
+               deccc   SC
+               bge     single_divloop
+               tst     R
+               b,a     end_regular_divide
+               ! EMPTY
+
+not_really_big:
+1:
+       sll     V,N,V
+       cmp     V,R
+       bleu    1b
+       inccc   ITER
+       be      got_result
+       dec     ITER
+do_regular_divide:
+       ! Do the main division iteration
+       tst     R
+       ! Fall through into divide loop
+divloop:
+       sll     Q,N,Q
+       DEVELOP_QUOTIENT_BITS( 1, 0 )
+end_regular_divide:
+       deccc   ITER
+       bge     divloop
+       tst     R
+       bl,a    got_result
+       ! non-restoring fixup if remainder < 0, otherwise annulled
+ifelse( ANSWER, `quotient',
+`      dec     Q
+',`    add     R,divisor,R
+')
+
+got_result:
+       tst     SIGN
+       bl,a    1f
+       ! negate for answer < 0, otherwise annulled
+ifelse( ANSWER, `quotient',
+`      neg     %o2,%o2                 ! Q <- -Q
+',`    neg     %o3,%o3                 ! R <- -R
+')
+1:
+       retl                            ! leaf-routine return
+ifelse( ANSWER, `quotient',
+`      mov     %o2,%o0                 ! quotient <- Q
+',`    mov     %o3,%o0                 ! remainder <- R
+')
Index: lib/sparc64/generate.sh
===================================================================
--- lib/sparc64/generate.sh     (Revision 0)
+++ lib/sparc64/generate.sh     (Arbeitskopie)
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+m4 divmod.m4 | sed -e 's/[[:space:]]*$//' | grep -v '^$' > modsi3.S
+m4 -DANSWER=quotient divmod.m4 | sed -e 's/[[:space:]]*$//' | grep -v '^$' > 
divsi3.S
Index: lib/sparc64/udivsi3.S
===================================================================
--- lib/sparc64/udivsi3.S       (Revision 0)
+++ lib/sparc64/udivsi3.S       (Arbeitskopie)
@@ -0,0 +1,5 @@
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+
+// This file is intentionally left blank, as __udivsi3 is already
+// provided by divsi3.S.
Index: lib/sparc64/umodsi3.S
===================================================================
--- lib/sparc64/umodsi3.S       (Revision 0)
+++ lib/sparc64/umodsi3.S       (Arbeitskopie)
@@ -0,0 +1,5 @@
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+
+// This file is intentionally left blank, as __umodsi3 is already
+// provided by modsi3.S.

_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits

[cfe-commits] [Patch] compiler-rt: SPARC64-optimized multiply/divide

Reply via email to