Re: [PATCH] Improved strlen for ARM, around 29% faster

Gabriel Gonzalez Wed, 03 Oct 2012 13:50:31 -0700

Great, I will test it as well.


On Wed, Oct 3, 2012 at 8:59 PM, Khem Raj <[email protected]> wrote:
> On Fri, Sep 28, 2012 at 5:48 PM, Gabriel Gonzalez
> <[email protected]> wrote:
>> This version for ARM improves performance mainly unrolling the loop for 
>> iterations
>> and reducing the instructions need to look for the null character.
>> A deeper analysis of this can be found at 
>> http://www.gabrielgonzalezgarcia.com/2012/10/02/mystrlen-vs-android-bionics-strlen-on-arm-cpu/
>> where you can find some data which back up the performance improvement.
>> I have only tested it on a little endian CPU so the BIG ENDIAN chunk might 
>> need some testing
>>
>> Signed-off-by: Gabriel Gonzalez <[email protected]>
>> ---
>>  libc/string/arm/strlen.S |  123 
>> +++++++++++++++++++++++++++-------------------
>>  1 file changed, 72 insertions(+), 51 deletions(-)
>>
>> diff --git a/libc/string/arm/strlen.S b/libc/string/arm/strlen.S
>> index 949e918..b87fbb4 100644
>> --- a/libc/string/arm/strlen.S
>> +++ b/libc/string/arm/strlen.S
>> @@ -46,63 +46,84 @@ strlen:
>>         bx lr
>>  #else
>>  strlen:
>> -       bic     r1, r0, $3              @ addr of word containing first byte
>> -       ldr     r2, [r1], $4            @ get the first word
>> -       ands    r3, r0, $3              @ how many bytes are duff?
>> -       rsb     r0, r3, $0              @ get - that number into counter.
>> -       beq     Laligned                @ skip into main check routine if no
>> -                                       @ more
>> -#if __BYTE_ORDER == __BIG_ENDIAN
>> -       orr     r2, r2, $0xff000000     @ set this byte to non-zero
>> -       subs    r3, r3, $1              @ any more to do?
>> -       IT(t, gt)
>> -       orrgt   r2, r2, $0x00ff0000     @ if so, set this byte
>> -       subs    r3, r3, $1              @ more?
>> -       IT(t, gt)
>> -       orrgt   r2, r2, $0x0000ff00     @ then set.
>> -#else
>> -       orr     r2, r2, $0x000000ff     @ set this byte to non-zero
>> -       subs    r3, r3, $1              @ any more to do?
>> -       IT(t, gt)
>> -       orrgt   r2, r2, $0x0000ff00     @ if so, set this byte
>> -       subs    r3, r3, $1              @ more?
>> -       IT(t, gt)
>> -       orrgt   r2, r2, $0x00ff0000     @ then set.
>> -#endif
>> -Laligned:                              @ here, we have a word in r2.  Does 
>> it
>> -       tst     r2, $0x000000ff         @ contain any zeroes?
>> -       IT(tttt, ne)
>> -       tstne   r2, $0x0000ff00         @
>> -       tstne   r2, $0x00ff0000         @
>> -       tstne   r2, $0xff000000         @
>> -       addne   r0, r0, $4              @ if not, the string is 4 bytes 
>> longer
>> -       IT(t, ne)
>> -       ldrne   r2, [r1], $4            @ and we continue to the next word
>> -       bne     Laligned                @
>> -Llastword:                             @ drop through to here once we find a
>> +       stmfd   sp!, {v1, v2, v3, v4, v5, v6, v7, lr}
>> +
>> +       mov     v7, a1
>> +       ldr     v6,  =0x80808080
>> +
>> +       ##
>> +       ## un-aligned address till we get aligned
>> +       ##
>> +1:     tst     v7, #3
>> +       beq     0f
>> +       ldrb    v1, [v7], #1
>> +       tst     v1, #0xFF
>> +       beq     4f
>> +       bal     1b
>> +
>> +
>> +       ## un-rolling strings
>> +       ## as few instructions in the loop
>> +       ## as possible
>> +       ## - Check whether any position equals 0
>> +0:     ldmfd   v7!, {v1, v2, v3, v4}
>> +       sub     v5, v1, v6, LSR #7
>> +       and     v5, v5, v6
>> +       bics    v5, v5, v1
>> +       bne     1f
>> +       sub     v5, v2, v6, LSR #7
>> +       and     v5, v5, v6
>> +       bics    v5, v5, v2
>> +       bne     2f
>> +       sub     v5, v3, v6, LSR #7
>> +       and     v5, v5, v6
>> +       bics    v5, v5, v3
>> +       bne     3f
>> +       sub     v5, v4, v6, LSR #7
>> +       and     v5, v5, v6
>> +       bics    v5, v5, v4
>> +       bne     4f
>> +       beq     0b
>> +
>> +4:     mov     v1, v4
>> +       bal     0f
>> +3:     mov     v1, v3
>> +       sub     v7, v7, #4
>> +       bal     0f
>> +2:     mov     v1, v2
>> +       sub     v7, v7, #8
>> +       bal     0f
>> +1:     sub     v7, v7, #12
>> +       ## After the loop we calculate the diff
>> +       ## between the end and the begining of the str
>> +       ##
>>  #if __BYTE_ORDER == __BIG_ENDIAN
>> -       tst     r2, $0xff000000         @ word that has a zero byte in it
>> -       IT(tttt, ne)
>> -       addne   r0, r0, $1              @
>> -       tstne   r2, $0x00ff0000         @ and add up to 3 bytes on to it
>> -       addne   r0, r0, $1              @
>> -       tstne   r2, $0x0000ff00         @ (if first three all non-zero, 4th
>> -       IT(t, ne)
>> -       addne   r0, r0, $1              @  must be zero)
>> +0:     tst     v1, #0xFF000000
>> +       subeq   v7, v7, #1
>> +       tst     v1, #0x00FF0000
>> +       subeq   v7, v7, #1
>> +       tst     v1, #0x0000FF00
>> +       subeq   v7, v7, #1
>> +       tst     v1, #0x000000FF
>> +4:     subeq   v7, v7, #1
>>  #else
>> -       tst     r2, $0x000000ff         @
>> -       IT(tttt, ne)
>> -       addne   r0, r0, $1              @
>> -       tstne   r2, $0x0000ff00         @ and add up to 3 bytes on to it
>> -       addne   r0, r0, $1              @
>> -       tstne   r2, $0x00ff0000         @ (if first three all non-zero, 4th
>> -       IT(t, ne)
>> -       addne   r0, r0, $1              @  must be zero)
>> +0:     tst     v1, #0xFF
>> +       subeq   v7, v7, #1
>> +       tst     v1, #0xFF00
>> +       subeq   v7, v7, #1
>> +       tst     v1, #0xFF0000
>> +       subeq   v7, v7, #1
>> +       tst     v1, #0xFF000000
>> +4:     subeq   v7, v7, #1
>>  #endif
>> +
>> +       sub     a1, v7, a1
>> +
>>  #if defined(__USE_BX__)
>> +       ldmfd   sp!, {v1, v2, v3, v4, v5, v6, v7, lr}
>>          bx      lr
>>  #else
>> -       mov     pc,lr
>> +       ldmfd   sp!, {v1, v2, v3, v4, v5, v6, v7, pc}
>>  #endif
>>  #endif
>>
>> --
>> 1.7.9.5
>
> This looks good. I would like that people try it on various arm's
> and give it a good test. Current algorithm may be suboptimal but
> it has been time tested.
_______________________________________________
uClibc mailing list
[email protected]
http://lists.busybox.net/mailman/listinfo/uclibc

Re: [PATCH] Improved strlen for ARM, around 29% faster

Reply via email to