A general comment before we start:

CLZ was added to the Arm ISA in Armv5, so all subsequent Arm versions
(and all versions implementing Thumb-2) will have this instruction.
The only cases where you'll need a fallback are Armv6-M (and
derivatives) and pre-Armv5 (Arm or Thumb-1).  So there's no need in
your code to try to use a synthesized CLZ operation when compiling for
Thumb-2.
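
In other words, at most two cases are ever needed.  Roughly (a sketch,
not the exact conditions from your patch):

  #ifdef __ARM_FEATURE_CLZ
        @ Armv5+ Arm state, or any Thumb-2 core: the instruction exists.
        clz     r0,     r0
  #else
        @ Armv6-M or pre-Armv5 (Arm or Thumb-1): synthesize the count.
  #endif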


On 11/01/2021 11:10, g...@danielengel.com wrote:
> From: Daniel Engel <g...@danielengel.com>
> 
> On architectures with no clz instruction, this version combines __clzdi2()
> with __clzsi2() into a single object with an efficient tail call.  Also, this
> version merges the formerly separate Thumb and ARM implementations
> into a unified instruction sequence.  This change significantly
> improves Thumb performance without affecting ARM performance.
> Finally, this version adds
> a new __OPTIMIZE_SIZE__ build option (using a loop).
> 
> On architectures with a clz instruction, functionality is unchanged.
> 
> gcc/libgcc/ChangeLog:
> 2021-01-07 Daniel Engel <g...@danielengel.com>
> 
>       * config/arm/bits/clz2.S: Size-optimized bitwise versions of __clzsi2()
>       and __clzdi2() (i.e. __ARM_FEATURE_CLZ not available).
>       * config/arm/lib1funcs.S: Moved CFI_FUNCTION macros, added
>       __ARM_FEATURE_IT.
>       * config/arm/t-elf: Move _clzsi2 to new group of weak LIB1ASMFUNCS.
> ---
>  libgcc/config/arm/bits/clz2.S | 342 ++++++++++++++++++++++------------
>  libgcc/config/arm/lib1funcs.S |  25 ++-
>  libgcc/config/arm/t-elf       |   8 +-
>  3 files changed, 248 insertions(+), 127 deletions(-)
> 
> diff --git a/libgcc/config/arm/bits/clz2.S b/libgcc/config/arm/bits/clz2.S
> index 1c8f10a5b29..d0a1fbec4d0 100644
> --- a/libgcc/config/arm/bits/clz2.S
> +++ b/libgcc/config/arm/bits/clz2.S
> @@ -1,124 +1,234 @@
> +/* clz2.S: Cortex M0 optimized 'clz' functions
> +
> +   Copyright (C) 2018-2021 Free Software Foundation, Inc.
> +   Contributed by Daniel Engel, Senva Inc (g...@danielengel.com)
> +
> +   This file is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by the
> +   Free Software Foundation; either version 3, or (at your option) any
> +   later version.
> +
> +   This file is distributed in the hope that it will be useful, but
> +   WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   General Public License for more details.
> +
> +   Under Section 7 of GPL version 3, you are granted additional
> +   permissions described in the GCC Runtime Library Exception, version
> +   3.1, as published by the Free Software Foundation.
> +
> +   You should have received a copy of the GNU General Public License and
> +   a copy of the GCC Runtime Library Exception along with this program;
> +   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +
> +#if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ

Writing the test this way is pointless.  Either test for
__ARM_FEATURE_CLZ being defined, or test for it being non-zero; but not
both.  In an #if expression, an undefined macro evaluates to zero
anyway.

In this case #ifdef is just fine - the macro won't be defined if the
instruction doesn't exist.

The same simplification should be used everywhere else you've used this
type of construct.
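
That is:

  #if defined(__ARM_FEATURE_CLZ) && __ARM_FEATURE_CLZ  /* redundant */
  #ifdef __ARM_FEATURE_CLZ                              /* sufficient */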

> +
> +#ifdef L_clzdi2
> +
> +// int __clzdi2(long long)
> +// Counts leading zero bits in $r1:$r0.
> +// Returns the result in $r0.
> +FUNC_START_SECTION clzdi2 .text.sorted.libgcc.clz2.clzdi2
> +    CFI_START_FUNCTION
> +
> +        // Moved here from lib1funcs.S
> +        cmp     xxh,    #0
> +        do_it   eq,     et
> +        clzeq   r0,     xxl
> +        clzne   r0,     xxh
> +        addeq   r0,     #32
> +        RET
> +
> +    CFI_END_FUNCTION
> +FUNC_END clzdi2
> +
> +#endif /* L_clzdi2 */
> +
>  
>  #ifdef L_clzsi2
> -#ifdef NOT_ISA_TARGET_32BIT
> -FUNC_START clzsi2
> -     movs    r1, #28
> -     movs    r3, #1
> -     lsls    r3, r3, #16
> -     cmp     r0, r3 /* 0x10000 */
> -     bcc     2f
> -     lsrs    r0, r0, #16
> -     subs    r1, r1, #16
> -2:   lsrs    r3, r3, #8
> -     cmp     r0, r3 /* #0x100 */
> -     bcc     2f
> -     lsrs    r0, r0, #8
> -     subs    r1, r1, #8
> -2:   lsrs    r3, r3, #4
> -     cmp     r0, r3 /* #0x10 */
> -     bcc     2f
> -     lsrs    r0, r0, #4
> -     subs    r1, r1, #4
> -2:   adr     r2, 1f
> -     ldrb    r0, [r2, r0]
> -     adds    r0, r0, r1
> -     bx lr
> -.align 2
> -1:
> -.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
> -     FUNC_END clzsi2
> -#else
> -ARM_FUNC_START clzsi2
> -# if defined (__ARM_FEATURE_CLZ)
> -     clz     r0, r0
> -     RET
> -# else
> -     mov     r1, #28
> -     cmp     r0, #0x10000
> -     do_it   cs, t
> -     movcs   r0, r0, lsr #16
> -     subcs   r1, r1, #16
> -     cmp     r0, #0x100
> -     do_it   cs, t
> -     movcs   r0, r0, lsr #8
> -     subcs   r1, r1, #8
> -     cmp     r0, #0x10
> -     do_it   cs, t
> -     movcs   r0, r0, lsr #4
> -     subcs   r1, r1, #4
> -     adr     r2, 1f
> -     ldrb    r0, [r2, r0]
> -     add     r0, r0, r1
> -     RET
> -.align 2
> -1:
> -.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
> -# endif /* !defined (__ARM_FEATURE_CLZ) */
> -     FUNC_END clzsi2
> -#endif
> +
> +// int __clzsi2(int)
> +// Counts leading zero bits in $r0.
> +// Returns the result in $r0.
> +FUNC_START_SECTION clzsi2 .text.sorted.libgcc.clz2.clzsi2
> +    CFI_START_FUNCTION
> +
> +        // Moved here from lib1funcs.S
> +        clz     r0,     r0
> +        RET
> +
> +    CFI_END_FUNCTION
> +FUNC_END clzsi2
> +
>  #endif /* L_clzsi2 */
>  
> +#else /* !__ARM_FEATURE_CLZ */
> +
>  #ifdef L_clzdi2
> -#if !defined (__ARM_FEATURE_CLZ)
> -
> -# ifdef NOT_ISA_TARGET_32BIT
> -FUNC_START clzdi2
> -     push    {r4, lr}
> -     cmp     xxh, #0
> -     bne     1f
> -#  ifdef __ARMEB__
> -     movs    r0, xxl
> -     bl      __clzsi2
> -     adds    r0, r0, #32
> -     b 2f
> -1:
> -     bl      __clzsi2
> -#  else
> -     bl      __clzsi2
> -     adds    r0, r0, #32
> -     b 2f
> -1:
> -     movs    r0, xxh
> -     bl      __clzsi2
> -#  endif
> -2:
> -     pop     {r4, pc}
> -# else /* NOT_ISA_TARGET_32BIT */
> -ARM_FUNC_START clzdi2
> -     do_push {r4, lr}
> -     cmp     xxh, #0
> -     bne     1f
> -#  ifdef __ARMEB__
> -     mov     r0, xxl
> -     bl      __clzsi2
> -     add     r0, r0, #32
> -     b 2f
> -1:
> -     bl      __clzsi2
> -#  else
> -     bl      __clzsi2
> -     add     r0, r0, #32
> -     b 2f
> -1:
> -     mov     r0, xxh
> -     bl      __clzsi2
> -#  endif
> -2:
> -     RETLDM  r4
> -     FUNC_END clzdi2
> -# endif /* NOT_ISA_TARGET_32BIT */
> -
> -#else /* defined (__ARM_FEATURE_CLZ) */
> -
> -ARM_FUNC_START clzdi2
> -     cmp     xxh, #0
> -     do_it   eq, et
> -     clzeq   r0, xxl
> -     clzne   r0, xxh
> -     addeq   r0, r0, #32
> -     RET
> -     FUNC_END clzdi2
>  
> -#endif
> +// int __clzdi2(long long)
> +// Counts leading zero bits in $r1:$r0.
> +// Returns the result in $r0.
> +// Uses $r2 and possibly $r3 as scratch space.
> +FUNC_START_SECTION clzdi2 .text.sorted.libgcc.clz2.clzdi2
> +    CFI_START_FUNCTION
> +
> +  #if defined(__ARMEB__) && __ARMEB__
> +        // Check if the upper word is zero.
> +        cmp     r0,     #0
> +
> +        // The upper word is non-zero, so calculate __clzsi2(upper).
> +        bne     SYM(__clzsi2)
> +
> +        // The upper word is zero, so calculate 32 + __clzsi2(lower).
> +        movs    r2,     #64
> +        movs    r0,     r1
> +        b       SYM(__internal_clzsi2)
> +
> +  #else /* !__ARMEB__ */
> +        // Assume all the bits in the argument are zero.
> +        movs    r2,     #64
> +
> +        // Check if the upper word is zero.
> +        cmp     r1,     #0
> +
> +        // The upper word is zero, so calculate 32 + __clzsi2(lower).
> +        beq     SYM(__internal_clzsi2)
> +
> +        // The upper word is non-zero, so set up __clzsi2(upper).
> +        // Then fall through.
> +        movs    r0,     r1

So this falls through to some code that is expected to be immediately
below?  If so, then that seems highly fragile.  Wouldn't it be better to
have an implementation of __clzsi2 here and then arrange (as you've done
elsewhere) for the base version to be discarded if a program needs both?

Something else to beware of: when building libgcc.a we make sure that
each shared library (in a sysv-like environment) gets its own
implementations of the functions in the library (so that we can avoid
PLT overheads for these small, critical functions).  If you're exporting
additional symbols beyond the ones the libgcc build system normally
expects, we need to ensure that these symbols are similarly hidden from
the shared library's export tables, or we will potentially get conflicts
when programs with multiple such definitions are used at run time.  I'm
not entirely sure how that magic is done, so I'm noting it here as
something that needs checking.  It might also apply to your weak
function definitions.
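
For example, if the extra internal entry point ends up as a global
symbol, I'd expect something along these lines to be wanted (a sketch
only - as I say, the exact mechanism needs checking):

  .global __internal_clzsi2
  .hidden __internal_clzsi2  @ keep out of shared-object export tables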


> +
> +  #endif /* !__ARMEB__ */
> +
>  #endif /* L_clzdi2 */
>  
> +
> +// The bitwise implementation of __clzdi2() tightly couples with __clzsi2(),
> +//  such that instructions must appear consecutively in the same memory
> +//  section for proper flow control.  However, this construction inhibits
> +//  the ability to discard __clzdi2() when only using __clzsi2().
> +// Therefore, this block configures __clzsi2() for compilation twice.
> +// The first version is a minimal standalone implementation, and the second
> +//  version is the continuation of __clzdi2().  The standalone version must
> +//  be declared WEAK, so that the combined version can supersede it and
> +//  provide both symbols when required.
> +// '_clzsi2' should appear before '_clzdi2' in LIB1ASMFUNCS.
> +#if defined(L_clzsi2) || defined(L_clzdi2)
> +
> +#ifdef L_clzsi2
> +// int __clzsi2(int)
> +// Counts leading zero bits in $r0.
> +// Returns the result in $r0.
> +// Uses $r2 and possibly $r3 as scratch space.
> +WEAK_START_SECTION clzsi2 .text.sorted.libgcc.clz2.clzsi2
> +    CFI_START_FUNCTION
> +
> +#else /* L_clzdi2 */
> +FUNC_ENTRY clzsi2
> +
> +#endif
> +
> +        // Assume all the bits in the argument are zero
> +        movs    r2,     #32
> +
> +#ifdef L_clzsi2
> +    WEAK_ENTRY internal_clzsi2
> +#else /* L_clzdi2 */
> +    FUNC_ENTRY internal_clzsi2
> +#endif
> +
> +        // Size optimized: 22 bytes, 51 cycles
> +        // Speed optimized: 50 bytes, 20 cycles
> +
> +  #if defined(__OPTIMIZE_SIZE__) && __OPTIMIZE_SIZE__
> +
> +        // Binary search starts at half the word width.
> +        movs    r3,     #16
> +
> +    LLSYM(__clz_loop):
> +        // Test the upper 'n' bits of the operand for ZERO.
> +        movs    r1,     r0
> +        lsrs    r1,     r3
> +
> +        // When the test fails, discard the lower bits of the register,
> +        //  and deduct the count of discarded bits from the result.
> +      #ifdef __ARM_FEATURE_IT
> +        do_it   ne, t
> +      #else
> +        beq     LLSYM(__clz_skip)
> +      #endif
> +
> +     IT(mov,ne) r0,     r1
> +     IT(sub,ne) r2,     r3

No, please don't write it this way - see my earlier comment on IT when
you defined it.  Put the code in the appropriate #if block.
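
I.e. something more like this (untested sketch, keeping the macro name
used in this version of the patch):

      #ifdef __ARM_FEATURE_IT
        do_it   ne,     t
        movne   r0,     r1
        subne   r2,     r3
      #else
        beq     LLSYM(__clz_skip)
        movs    r0,     r1
        subs    r2,     r3
      #endif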

> +
> +    LLSYM(__clz_skip):
> +        // Decrease the shift distance for the next test.
> +        lsrs    r3,     #1
> +        bne     LLSYM(__clz_loop)
> +
> +  #else /* __OPTIMIZE_SIZE__ */
> +
> +        // Unrolled binary search.
> +        lsrs    r1,     r0,     #16
> +      #ifdef __ARM_FEATURE_IT
> +        do_it   ne,t
> +      #else
> +        beq     LLSYM(__clz8)
> +      #endif
> +
> +     IT(mov,ne) r0,     r1
> +     IT(sub,ne) r2,     #16
> +
> +    LLSYM(__clz8):
> +        lsrs    r1,     r0,     #8
> +      #ifdef __ARM_FEATURE_IT
> +        do_it   ne,t
> +      #else
> +        beq     LLSYM(__clz4)
> +      #endif
> +
> +     IT(mov,ne) r0,     r1
> +     IT(sub,ne) r2,     #8
> +
> +    LLSYM(__clz4):
> +        lsrs    r1,     r0,     #4
> +      #ifdef __ARM_FEATURE_IT
> +        do_it   ne,t
> +      #else
> +        beq     LLSYM(__clz2)
> +      #endif
> +
> +     IT(mov,ne) r0,     r1
> +     IT(sub,ne) r2,     #4
> +
> +    LLSYM(__clz2):
> +        // Load the remainder by index
> +        adr     r1,     LLSYM(__clz_remainder)
> +        ldrb    r0,     [r1, r0]
> +
> +  #endif /* !__OPTIMIZE_SIZE__ */
> +
> +        // Account for the remainder.
> +        subs    r0,     r2,     r0
> +        RET
> +
> +  #if !defined(__OPTIMIZE_SIZE__) || !__OPTIMIZE_SIZE__
> +        .align 2
> +    LLSYM(__clz_remainder):
> +        .byte 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4
> +  #endif
> +
> +    CFI_END_FUNCTION
> +FUNC_END clzsi2
> +
> +#ifdef L_clzdi2
> +FUNC_END clzdi2
> +#endif
> +
> +#endif /* L_clzsi2 || L_clzdi2 */
> +
> +#endif /* !__ARM_FEATURE_CLZ */
> +
> diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
> index c7a3b85bf2b..e6cb2570c8d 100644
> --- a/libgcc/config/arm/lib1funcs.S
> +++ b/libgcc/config/arm/lib1funcs.S
> @@ -96,6 +96,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>  #define NOT_ISA_TARGET_32BIT 1
>  #endif
>  
> +
>  /* How to return from a function call depends on the architecture variant.  */
>  
>  #if (__ARM_ARCH > 4) || defined(__ARM_ARCH_4T__)
> @@ -184,6 +185,16 @@ LSYM(Lend_fde):
>  #endif
>  .endm
>  
> +.macro CFI_START_FUNCTION
> +     .cfi_startproc
> +     .cfi_remember_state
> +.endm
> +
> +.macro CFI_END_FUNCTION
> +     .cfi_restore_state
> +     .cfi_endproc
> +.endm
> +
>  /* Don't pass dirn, it's there just to get token pasting right.  */
>  
>  .macro       RETLDM  regs=, cond=, unwind=, dirn=ia
> @@ -220,6 +231,7 @@ LSYM(Lend_fde):
>     ARM and Thumb-2.  However this is only supported by recent gas, so define
>     a set of macros to allow ARM code on older assemblers.  */
>  #if defined(__thumb2__)
> +#define __ARM_FEATURE_IT

Please use a different name for this.  The __ARM_FEATURE_* macros are
defined by the Arm ACLE (Arm C Language Extensions) and are (in theory)
common across a number of compilers supporting Arm.  Better to use
something like __HAVE_CONDEXEC.
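
E.g. (sketch - a single guard could replace the two separate definition
sites in this patch):

  #if defined(__thumb2__) || !defined(__thumb__)
  /* Conditional execution is available: IT blocks, or ARM state.  */
  #define __HAVE_CONDEXEC
  #endif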

>  .macro do_it cond, suffix=""
>       it\suffix       \cond
>  .endm
> @@ -235,6 +247,9 @@ LSYM(Lend_fde):
>       \name \dest, \src1, \tmp
>  .endm
>  #else
> +#if !defined(__thumb__)
> +#define __ARM_FEATURE_IT
> +#endif
>  .macro do_it cond, suffix=""
>  .endm
>  .macro shift1 op, arg0, arg1, arg2
> @@ -1899,16 +1914,6 @@ LSYM(Lchange_\register):
>  
>  #endif /* Arch supports thumb.  */
>  
> -.macro CFI_START_FUNCTION
> -     .cfi_startproc
> -     .cfi_remember_state
> -.endm
> -
> -.macro CFI_END_FUNCTION
> -     .cfi_restore_state
> -     .cfi_endproc
> -.endm
> -
>  #ifndef __symbian__
>  /* The condition here must match the one in gcc/config/arm/elf.h and
>     libgcc/config/arm/t-elf.  */

I think all the changes to lib1funcs.S here should be a separate
precursor patch, as they are used by multiple subsequent patches.

> diff --git a/libgcc/config/arm/t-elf b/libgcc/config/arm/t-elf
> index 6a31a328073..8f1a4614904 100644
> --- a/libgcc/config/arm/t-elf
> +++ b/libgcc/config/arm/t-elf
> @@ -19,13 +19,19 @@ endif # !__symbian__
>  # implementation.  The soft-fp code is only build for ARMv6M.  This pulls
>  # in the asm implementation for other CPUs.
>  
> +
> +# Group 0: WEAK overridable function objects.
> +# See respective sources for rationale.
> +LIB1ASMFUNCS += \
> +        _clzsi2 \
> +
> +
>  # Group 1: Integer functions
>  LIB1ASMFUNCS += \
>       _ashldi3 \
>       _ashrdi3 \
>       _lshrdi3 \
>       _clzdi2 \
> -     _clzsi2 \
>       _ctzsi2 \
>       _dvmd_tls \
>       _divsi3 \
> 
