On 01/07/16 14:40, Ramana Radhakrishnan wrote: > > > On 13/10/15 18:01, Andre Vieira wrote: >> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings >> (https://git.linaro.org/toolchain/cortex-strings.git), which was contributed >> by ARM under Free BSD license. >> >> The new aeabi_idiv routine is used to replace the one in >> libgcc/config/arm/lib1funcs.S. This replacement happens within the Thumb1 >> wrapper. The new routine is under LGPLv3 license. > > This is not under LGPLv3 . It is under GPLv3 with the runtime library > exception license, there's a difference. Assuming your licensing expectation > is ok .... read on for more of a review. > >> >> The main advantage of this version is that it can improve the performance of >> the aeabi_idiv function for Thumb1. This solution will also increase the >> code size. So it will only be used if __OPTIMIZE_SIZE__ is not defined. >> >> Make check passed for armv6-m. >> >> libgcc/ChangeLog: >> 2015-08-10 Hale Wang <hale.w...@arm.com> >> Andre Vieira <andre.simoesdiasvie...@arm.com> >> >> * config/arm/lib1funcs.S: Add new wrapper. 
>> >> 0001-integer-division.patch >> >> >> From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001 >> From: Andre Simoes Dias Vieira <andsi...@arm.com> >> Date: Fri, 21 Aug 2015 14:23:28 +0100 >> Subject: [PATCH] new wrapper idivmod >> >> --- >> libgcc/config/arm/lib1funcs.S | 250 >> ++++++++++++++++++++++++++++++++++++------ >> 1 file changed, 217 insertions(+), 33 deletions(-) >> >> diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S >> index >> 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816 >> 100644 >> --- a/libgcc/config/arm/lib1funcs.S >> +++ b/libgcc/config/arm/lib1funcs.S >> @@ -306,34 +306,12 @@ LSYM(Lend_fde): >> #ifdef __ARM_EABI__ >> .macro THUMB_LDIV0 name signed >> #if defined(__ARM_ARCH_6M__) >> - .ifc \signed, unsigned >> - cmp r0, #0 >> - beq 1f >> - mov r0, #0 >> - mvn r0, r0 @ 0xffffffff >> -1: >> - .else >> - cmp r0, #0 >> - beq 2f >> - blt 3f >> + >> + push {r0, lr} >> mov r0, #0 >> - mvn r0, r0 >> - lsr r0, r0, #1 @ 0x7fffffff >> - b 2f >> -3: mov r0, #0x80 >> - lsl r0, r0, #24 @ 0x80000000 >> -2: >> - .endif >> - push {r0, r1, r2} >> - ldr r0, 4f >> - adr r1, 4f >> - add r0, r1 >> - str r0, [sp, #8] >> - @ We know we are not on armv4t, so pop pc is safe. >> - pop {r0, r1, pc} >> - .align 2 >> -4: >> - .word __aeabi_idiv0 - 4b >> + bl SYM(__aeabi_idiv0) >> + pop {r1, pc} >> + > > I'd still retain the comment about pop pc here because there's often a > misconception of merging armv4t and armv6m code. > >> #elif defined(__thumb2__) >> .syntax unified >> .ifc \signed, unsigned >> @@ -945,7 +923,170 @@ LSYM(Lover7): >> add dividend, work >> .endif >> LSYM(Lgot_result): >> -.endm >> +.endm >> + >> +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__) >> +/* If performance is preferred, the following functions are provided. */ >> + > > Comment above #if please and also check elsewhere in patch. > >> +/* Branch to div(n), and jump to label if curbit is lo than divisior. 
*/ >> +.macro BranchToDiv n, label >> + lsr curbit, dividend, \n >> + cmp curbit, divisor >> + blo \label >> +.endm >> + >> +/* Body of div(n). Shift the divisor in n bits and compare the divisor >> + and dividend. Update the dividend as the substruction result. */ >> +.macro DoDiv n >> + lsr curbit, dividend, \n >> + cmp curbit, divisor >> + bcc 1f >> + lsl curbit, divisor, \n >> + sub dividend, dividend, curbit >> + >> +1: adc result, result >> +.endm >> + >> +/* The body of division with positive divisor. Unless the divisor is very >> + big, shift it up in multiples of four bits, since this is the amount of >> + unwinding in the main division loop. Continue shifting until the divisor >> + is larger than the dividend. */ >> +.macro THUMB1_Div_Positive >> + mov result, #0 >> + BranchToDiv #1, LSYM(Lthumb1_div1) >> + BranchToDiv #4, LSYM(Lthumb1_div4) >> + BranchToDiv #8, LSYM(Lthumb1_div8) >> + BranchToDiv #12, LSYM(Lthumb1_div12) >> + BranchToDiv #16, LSYM(Lthumb1_div16) >> +LSYM(Lthumb1_div_large_positive): >> + mov result, #0xff >> + lsl divisor, divisor, #8 >> + rev result, result >> + lsr curbit, dividend, #16 >> + cmp curbit, divisor >> + blo 1f >> + asr result, #8 >> + lsl divisor, divisor, #8 >> + beq LSYM(Ldivbyzero_waypoint) >> + >> +1: lsr curbit, dividend, #12 >> + cmp curbit, divisor >> + blo LSYM(Lthumb1_div12) >> + b LSYM(Lthumb1_div16) >> +LSYM(Lthumb1_div_loop): >> + lsr divisor, divisor, #8 >> +LSYM(Lthumb1_div16): >> + Dodiv #15 >> + Dodiv #14 >> + Dodiv #13 >> + Dodiv #12 >> +LSYM(Lthumb1_div12): >> + Dodiv #11 >> + Dodiv #10 >> + Dodiv #9 >> + Dodiv #8 >> + bcs LSYM(Lthumb1_div_loop) >> +LSYM(Lthumb1_div8): >> + Dodiv #7 >> + Dodiv #6 >> + Dodiv #5 >> +LSYM(Lthumb1_div5): >> + Dodiv #4 >> +LSYM(Lthumb1_div4): >> + Dodiv #3 >> +LSYM(Lthumb1_div3): >> + Dodiv #2 >> +LSYM(Lthumb1_div2): >> + Dodiv #1 >> +LSYM(Lthumb1_div1): >> + sub divisor, dividend, divisor >> + bcs 1f >> + cpy divisor, dividend >> + >> +1: adc result, result >> + cpy dividend, 
result >> + RET >> + >> +LSYM(Ldivbyzero_waypoint): >> + b LSYM(Ldiv0) >> +.endm >> + >> +/* The body of division with negative divisor. Similar with >> + THUMB1_Div_Positive except that the shift steps are in multiples >> + of six bits. */ >> +.macro THUMB1_Div_Negative >> + lsr result, divisor, #31 >> + beq 1f >> + neg divisor, divisor >> + >> +1: asr curbit, dividend, #32 >> + bcc 2f >> + neg dividend, dividend >> + >> +2: eor curbit, result >> + mov result, #0 >> + cpy ip, curbit >> + BranchToDiv #4, LSYM(Lthumb1_div_negative4) >> + BranchToDiv #8, LSYM(Lthumb1_div_negative8) >> +LSYM(Lthumb1_div_large): >> + mov result, #0xfc >> + lsl divisor, divisor, #6 >> + rev result, result >> + lsr curbit, dividend, #8 >> + cmp curbit, divisor >> + blo LSYM(Lthumb1_div_negative8) >> + >> + lsl divisor, divisor, #6 >> + asr result, result, #6 >> + cmp curbit, divisor >> + blo LSYM(Lthumb1_div_negative8) >> + >> + lsl divisor, divisor, #6 >> + asr result, result, #6 >> + cmp curbit, divisor >> + blo LSYM(Lthumb1_div_negative8) >> + >> + lsl divisor, divisor, #6 >> + beq LSYM(Ldivbyzero_negative) >> + asr result, result, #6 >> + b LSYM(Lthumb1_div_negative8) >> +LSYM(Lthumb1_div_negative_loop): >> + lsr divisor, divisor, #6 >> +LSYM(Lthumb1_div_negative8): >> + DoDiv #7 >> + DoDiv #6 >> + DoDiv #5 >> + DoDiv #4 >> +LSYM(Lthumb1_div_negative4): >> + DoDiv #3 >> + DoDiv #2 >> + bcs LSYM(Lthumb1_div_negative_loop) >> + DoDiv #1 >> + sub divisor, dividend, divisor >> + bcs 1f >> + cpy divisor, dividend >> + >> +1: cpy curbit, ip >> + adc result, result >> + asr curbit, curbit, #1 >> + cpy dividend, result >> + bcc 2f >> + neg dividend, dividend >> + cmp curbit, #0 >> + >> +2: bpl 3f >> + neg divisor, divisor >> + >> +3: RET >> + >> +LSYM(Ldivbyzero_negative): >> + cpy curbit, ip >> + asr curbit, curbit, #1 >> + bcc LSYM(Ldiv0) >> + neg dividend, dividend >> +.endm >> +#endif /* ARM Thumb version. 
*/ >> + >> /* ------------------------------------------------------------------------ >> */ >> /* Start of the Real Functions */ >> /* ------------------------------------------------------------------------ >> */ >> @@ -955,6 +1096,7 @@ LSYM(Lgot_result): >> >> FUNC_START udivsi3 >> FUNC_ALIAS aeabi_uidiv udivsi3 >> +#if defined(__OPTIMIZE_SIZE__) >> >> cmp divisor, #0 >> beq LSYM(Ldiv0) >> @@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test): >> pop { work } >> RET >> >> +#else >> + /* Implementation of aeabi_uidiv for ARMv6m. This version is only >> + used in ARMv6-M when we need an efficient implementation. */ >> +LSYM(udivsi3_skip_div0_test): >> + THUMB1_Div_Positive >> + >> +#endif /* __OPTIMIZE_SIZE__ */ >> + >> #elif defined(__ARM_ARCH_EXT_IDIV__) >> >> ARM_FUNC_START udivsi3 >> @@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test): >> FUNC_START aeabi_uidivmod >> cmp r1, #0 >> beq LSYM(Ldiv0) >> +# if defined(__OPTIMIZE_SIZE__) >> push {r0, r1, lr} >> bl LSYM(udivsi3_skip_div0_test) >> POP {r1, r2, r3} >> mul r2, r0 >> sub r1, r1, r2 >> bx r3 >> +# else >> + /* Both the quotient and remainder are calculated simultaneously >> + in THUMB1_Div_Positive. There is no need to calculate the >> + remainder again here. */ >> + b LSYM(udivsi3_skip_div0_test) >> + RET >> +# endif /* __OPTIMIZE_SIZE__ */ >> + >> #elif defined(__ARM_ARCH_EXT_IDIV__) >> ARM_FUNC_START aeabi_uidivmod >> cmp r1, #0 >> @@ -1084,7 +1243,7 @@ LSYM(Lover10): >> RET >> >> #else /* ARM version. 
*/ >> - >> + >> FUNC_START umodsi3 >> >> subs r2, r1, #1 @ compare divisor with 1 >> @@ -1109,8 +1268,9 @@ LSYM(Lover10): >> >> #if defined(__prefer_thumb__) >> >> - FUNC_START divsi3 >> + FUNC_START divsi3 >> FUNC_ALIAS aeabi_idiv divsi3 >> +#if defined(__OPTIMIZE_SIZE__) >> >> cmp divisor, #0 >> beq LSYM(Ldiv0) >> @@ -1133,7 +1293,7 @@ LSYM(Lover11): >> blo LSYM(Lgot_result) >> >> THUMB_DIV_MOD_BODY 0 >> - >> + >> mov r0, result >> mov work, ip >> cmp work, #0 >> @@ -1142,6 +1302,21 @@ LSYM(Lover11): >> LSYM(Lover12): >> pop { work } >> RET >> +#else >> + /* Implementation of aeabi_idiv for ARMv6m. This version is only >> + used in ARMv6-M when we need an efficient implementation. */ >> +LSYM(divsi3_skip_div0_test): >> + cpy curbit, dividend >> + orr curbit, divisor >> + bmi LSYM(Lthumb1_div_negative) >> + >> +LSYM(Lthumb1_div_positive): >> + THUMB1_Div_Positive >> + >> +LSYM(Lthumb1_div_negative): >> + THUMB1_Div_Negative >> + >> +#endif /* __OPTIMIZE_SIZE__ */ >> >> #elif defined(__ARM_ARCH_EXT_IDIV__) >> >> @@ -1154,8 +1329,8 @@ LSYM(Lover12): >> RET >> >> #else /* ARM/Thumb-2 version. */ >> - >> - ARM_FUNC_START divsi3 >> + >> + ARM_FUNC_START divsi3 >> ARM_FUNC_ALIAS aeabi_idiv divsi3 >> >> cmp r1, #0 >> @@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test): >> FUNC_START aeabi_idivmod >> cmp r1, #0 >> beq LSYM(Ldiv0) >> +# if defined(__OPTIMIZE_SIZE__) >> push {r0, r1, lr} >> bl LSYM(divsi3_skip_div0_test) >> POP {r1, r2, r3} >> mul r2, r0 >> sub r1, r1, r2 >> bx r3 >> +# else >> + /* Both the quotient and remainder are calculated simultaneously >> + in THUMB1_Div_Positive and THUMB1_Div_Negative. There is no >> + need to calculate the remainder again here. */ >> + b LSYM(divsi3_skip_div0_test) >> + RET >> +# endif /* __OPTIMIZE_SIZE__ */ >> + >> #elif defined(__ARM_ARCH_EXT_IDIV__) >> ARM_FUNC_START aeabi_idivmod >> cmp r1, #0 >> -- 1.9.1 >> > > Otherwise OK if no regressions and the following request passes. 
> > Can you ensure that libgcc for one ARM state and one Thumb2 state non-v6m > configuration gives identical binaries with and without your patch? > > regards > Ramana > Hi Ramana,
Thank you for the comments. Sorry about the license; it must have been a mix-up somewhere. I put back the 'pop pc is safe' assembly comment, and I moved some comments before the #if and #else as requested. I left some in place because they did not apply to the whole block, but only to the first assembly instruction after the #if/else. I checked that the assembly generated for libgcc was the same with and without the patch for armv7-a in ARM mode and armv7-m in Thumb mode. Is this OK? Cheers, Andre libgcc/ChangeLog: 2016-07-06 Hale Wang <hale.w...@arm.com> Andre Vieira <andre.simoesdiasvie...@arm.com> * config/arm/lib1funcs.S: Add new wrapper.
>From b5b129e698b9e7446907d3da1fbce0236b09b67c Mon Sep 17 00:00:00 2001 From: Andre Simoes Dias Vieira <andsi...@arm.com> Date: Mon, 16 May 2016 18:34:52 +0100 Subject: [PATCH 1/2] integer division --- libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 218 insertions(+), 32 deletions(-) diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S index 375a5135110895faa44267ebee045fd315515027..8a245b7a9333b249f120cd7e7ecc77248b150610 100644 --- a/libgcc/config/arm/lib1funcs.S +++ b/libgcc/config/arm/lib1funcs.S @@ -306,34 +306,13 @@ LSYM(Lend_fde): #ifdef __ARM_EABI__ .macro THUMB_LDIV0 name signed #if defined(__ARM_ARCH_6M__) - .ifc \signed, unsigned - cmp r0, #0 - beq 1f - mov r0, #0 - mvn r0, r0 @ 0xffffffff -1: - .else - cmp r0, #0 - beq 2f - blt 3f + + push {r0, lr} mov r0, #0 - mvn r0, r0 - lsr r0, r0, #1 @ 0x7fffffff - b 2f -3: mov r0, #0x80 - lsl r0, r0, #24 @ 0x80000000 -2: - .endif - push {r0, r1, r2} - ldr r0, 4f - adr r1, 4f - add r0, r1 - str r0, [sp, #8] + bl SYM(__aeabi_idiv0) @ We know we are not on armv4t, so pop pc is safe. - pop {r0, r1, pc} - .align 2 -4: - .word __aeabi_idiv0 - 4b + pop {r1, pc} + #elif defined(__thumb2__) .syntax unified .ifc \signed, unsigned @@ -945,7 +924,170 @@ LSYM(Lover7): add dividend, work .endif LSYM(Lgot_result): -.endm +.endm + +/* If performance is preferred, the following functions are provided. */ +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__) + +/* Branch to div(n), and jump to label if curbit is lo than divisior. */ +.macro BranchToDiv n, label + lsr curbit, dividend, \n + cmp curbit, divisor + blo \label +.endm + +/* Body of div(n). Shift the divisor in n bits and compare the divisor + and dividend. Update the dividend as the substruction result. 
*/ +.macro DoDiv n + lsr curbit, dividend, \n + cmp curbit, divisor + bcc 1f + lsl curbit, divisor, \n + sub dividend, dividend, curbit + +1: adc result, result +.endm + +/* The body of division with positive divisor. Unless the divisor is very + big, shift it up in multiples of four bits, since this is the amount of + unwinding in the main division loop. Continue shifting until the divisor + is larger than the dividend. */ +.macro THUMB1_Div_Positive + mov result, #0 + BranchToDiv #1, LSYM(Lthumb1_div1) + BranchToDiv #4, LSYM(Lthumb1_div4) + BranchToDiv #8, LSYM(Lthumb1_div8) + BranchToDiv #12, LSYM(Lthumb1_div12) + BranchToDiv #16, LSYM(Lthumb1_div16) +LSYM(Lthumb1_div_large_positive): + mov result, #0xff + lsl divisor, divisor, #8 + rev result, result + lsr curbit, dividend, #16 + cmp curbit, divisor + blo 1f + asr result, #8 + lsl divisor, divisor, #8 + beq LSYM(Ldivbyzero_waypoint) + +1: lsr curbit, dividend, #12 + cmp curbit, divisor + blo LSYM(Lthumb1_div12) + b LSYM(Lthumb1_div16) +LSYM(Lthumb1_div_loop): + lsr divisor, divisor, #8 +LSYM(Lthumb1_div16): + Dodiv #15 + Dodiv #14 + Dodiv #13 + Dodiv #12 +LSYM(Lthumb1_div12): + Dodiv #11 + Dodiv #10 + Dodiv #9 + Dodiv #8 + bcs LSYM(Lthumb1_div_loop) +LSYM(Lthumb1_div8): + Dodiv #7 + Dodiv #6 + Dodiv #5 +LSYM(Lthumb1_div5): + Dodiv #4 +LSYM(Lthumb1_div4): + Dodiv #3 +LSYM(Lthumb1_div3): + Dodiv #2 +LSYM(Lthumb1_div2): + Dodiv #1 +LSYM(Lthumb1_div1): + sub divisor, dividend, divisor + bcs 1f + cpy divisor, dividend + +1: adc result, result + cpy dividend, result + RET + +LSYM(Ldivbyzero_waypoint): + b LSYM(Ldiv0) +.endm + +/* The body of division with negative divisor. Similar with + THUMB1_Div_Positive except that the shift steps are in multiples + of six bits. 
*/ +.macro THUMB1_Div_Negative + lsr result, divisor, #31 + beq 1f + neg divisor, divisor + +1: asr curbit, dividend, #32 + bcc 2f + neg dividend, dividend + +2: eor curbit, result + mov result, #0 + cpy ip, curbit + BranchToDiv #4, LSYM(Lthumb1_div_negative4) + BranchToDiv #8, LSYM(Lthumb1_div_negative8) +LSYM(Lthumb1_div_large): + mov result, #0xfc + lsl divisor, divisor, #6 + rev result, result + lsr curbit, dividend, #8 + cmp curbit, divisor + blo LSYM(Lthumb1_div_negative8) + + lsl divisor, divisor, #6 + asr result, result, #6 + cmp curbit, divisor + blo LSYM(Lthumb1_div_negative8) + + lsl divisor, divisor, #6 + asr result, result, #6 + cmp curbit, divisor + blo LSYM(Lthumb1_div_negative8) + + lsl divisor, divisor, #6 + beq LSYM(Ldivbyzero_negative) + asr result, result, #6 + b LSYM(Lthumb1_div_negative8) +LSYM(Lthumb1_div_negative_loop): + lsr divisor, divisor, #6 +LSYM(Lthumb1_div_negative8): + DoDiv #7 + DoDiv #6 + DoDiv #5 + DoDiv #4 +LSYM(Lthumb1_div_negative4): + DoDiv #3 + DoDiv #2 + bcs LSYM(Lthumb1_div_negative_loop) + DoDiv #1 + sub divisor, dividend, divisor + bcs 1f + cpy divisor, dividend + +1: cpy curbit, ip + adc result, result + asr curbit, curbit, #1 + cpy dividend, result + bcc 2f + neg dividend, dividend + cmp curbit, #0 + +2: bpl 3f + neg divisor, divisor + +3: RET + +LSYM(Ldivbyzero_negative): + cpy curbit, ip + asr curbit, curbit, #1 + bcc LSYM(Ldiv0) + neg dividend, dividend +.endm +#endif /* ARM Thumb version. */ + /* ------------------------------------------------------------------------ */ /* Start of the Real Functions */ /* ------------------------------------------------------------------------ */ @@ -955,6 +1097,7 @@ LSYM(Lgot_result): FUNC_START udivsi3 FUNC_ALIAS aeabi_uidiv udivsi3 +#if defined(__OPTIMIZE_SIZE__) cmp divisor, #0 beq LSYM(Ldiv0) @@ -972,6 +1115,14 @@ LSYM(udivsi3_skip_div0_test): pop { work } RET +/* Implementation of aeabi_uidiv for ARMv6m. 
This version is only + used in ARMv6-M when we need an efficient implementation. */ +#else +LSYM(udivsi3_skip_div0_test): + THUMB1_Div_Positive + +#endif /* __OPTIMIZE_SIZE__ */ + #elif defined(__ARM_ARCH_EXT_IDIV__) ARM_FUNC_START udivsi3 @@ -1023,12 +1174,21 @@ LSYM(udivsi3_skip_div0_test): FUNC_START aeabi_uidivmod cmp r1, #0 beq LSYM(Ldiv0) +# if defined(__OPTIMIZE_SIZE__) push {r0, r1, lr} bl LSYM(udivsi3_skip_div0_test) POP {r1, r2, r3} mul r2, r0 sub r1, r1, r2 bx r3 +# else + /* Both the quotient and remainder are calculated simultaneously + in THUMB1_Div_Positive. There is no need to calculate the + remainder again here. */ + b LSYM(udivsi3_skip_div0_test) + RET +# endif /* __OPTIMIZE_SIZE__ */ + #elif defined(__ARM_ARCH_EXT_IDIV__) ARM_FUNC_START aeabi_uidivmod cmp r1, #0 @@ -1084,7 +1244,7 @@ LSYM(Lover10): RET #else /* ARM version. */ - + FUNC_START umodsi3 subs r2, r1, #1 @ compare divisor with 1 @@ -1109,8 +1269,9 @@ LSYM(Lover10): #if defined(__prefer_thumb__) - FUNC_START divsi3 + FUNC_START divsi3 FUNC_ALIAS aeabi_idiv divsi3 +#if defined(__OPTIMIZE_SIZE__) cmp divisor, #0 beq LSYM(Ldiv0) @@ -1133,7 +1294,7 @@ LSYM(Lover11): blo LSYM(Lgot_result) THUMB_DIV_MOD_BODY 0 - + mov r0, result mov work, ip cmp work, #0 @@ -1143,6 +1304,22 @@ LSYM(Lover12): pop { work } RET +/* Implementation of aeabi_idiv for ARMv6m. This version is only + used in ARMv6-M when we need an efficient implementation. */ +#else +LSYM(divsi3_skip_div0_test): + cpy curbit, dividend + orr curbit, divisor + bmi LSYM(Lthumb1_div_negative) + +LSYM(Lthumb1_div_positive): + THUMB1_Div_Positive + +LSYM(Lthumb1_div_negative): + THUMB1_Div_Negative + +#endif /* __OPTIMIZE_SIZE__ */ + #elif defined(__ARM_ARCH_EXT_IDIV__) ARM_FUNC_START divsi3 @@ -1154,8 +1331,8 @@ LSYM(Lover12): RET #else /* ARM/Thumb-2 version. 
*/ - - ARM_FUNC_START divsi3 + + ARM_FUNC_START divsi3 ARM_FUNC_ALIAS aeabi_idiv divsi3 cmp r1, #0 @@ -1209,12 +1386,21 @@ LSYM(divsi3_skip_div0_test): FUNC_START aeabi_idivmod cmp r1, #0 beq LSYM(Ldiv0) +# if defined(__OPTIMIZE_SIZE__) push {r0, r1, lr} bl LSYM(divsi3_skip_div0_test) POP {r1, r2, r3} mul r2, r0 sub r1, r1, r2 bx r3 +# else + /* Both the quotient and remainder are calculated simultaneously + in THUMB1_Div_Positive and THUMB1_Div_Negative. There is no + need to calculate the remainder again here. */ + b LSYM(divsi3_skip_div0_test) + RET +# endif /* __OPTIMIZE_SIZE__ */ + #elif defined(__ARM_ARCH_EXT_IDIV__) ARM_FUNC_START aeabi_idivmod cmp r1, #0 -- 1.9.1