This version for ARM improves performance mainly unrolling the loop for iterations and reducing the instructions need to look for the null character. A deeper analysis of this can be found at http://www.gabrielgonzalezgarcia.com/2012/10/02/mystrlen-vs-android-bionics-strlen-on-arm-cpu/ where you can find some data which back up the performance improvement. I have only tested it on a little endian CPU so the BIG ENDIAN chunk might need some testing
Signed-off-by: Gabriel Gonzalez <[email protected]> --- libc/string/arm/strlen.S | 123 +++++++++++++++++++++++++++------------------- 1 file changed, 72 insertions(+), 51 deletions(-) diff --git a/libc/string/arm/strlen.S b/libc/string/arm/strlen.S index 949e918..b87fbb4 100644 --- a/libc/string/arm/strlen.S +++ b/libc/string/arm/strlen.S @@ -46,63 +46,84 @@ strlen: bx lr #else strlen: - bic r1, r0, $3 @ addr of word containing first byte - ldr r2, [r1], $4 @ get the first word - ands r3, r0, $3 @ how many bytes are duff? - rsb r0, r3, $0 @ get - that number into counter. - beq Laligned @ skip into main check routine if no - @ more -#if __BYTE_ORDER == __BIG_ENDIAN - orr r2, r2, $0xff000000 @ set this byte to non-zero - subs r3, r3, $1 @ any more to do? - IT(t, gt) - orrgt r2, r2, $0x00ff0000 @ if so, set this byte - subs r3, r3, $1 @ more? - IT(t, gt) - orrgt r2, r2, $0x0000ff00 @ then set. -#else - orr r2, r2, $0x000000ff @ set this byte to non-zero - subs r3, r3, $1 @ any more to do? - IT(t, gt) - orrgt r2, r2, $0x0000ff00 @ if so, set this byte - subs r3, r3, $1 @ more? - IT(t, gt) - orrgt r2, r2, $0x00ff0000 @ then set. -#endif -Laligned: @ here, we have a word in r2. Does it - tst r2, $0x000000ff @ contain any zeroes? - IT(tttt, ne) - tstne r2, $0x0000ff00 @ - tstne r2, $0x00ff0000 @ - tstne r2, $0xff000000 @ - addne r0, r0, $4 @ if not, the string is 4 bytes longer - IT(t, ne) - ldrne r2, [r1], $4 @ and we continue to the next word - bne Laligned @ -Llastword: @ drop through to here once we find a + stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, lr} + + mov v7, a1 + ldr v6, =0x80808080 + + ## + ## un-aligned address till we get aligned + ## +1: tst v7, #3 + beq 0f + ldrb v1, [v7], #1 + tst v1, #0xFF + beq 4f + bal 1b + + + ## un-rolling strings + ## as few instructions in the loop + ## as possible + ## - Check whether any position equals 0 +0: ldmfd v7!, {v1, v2, v3, v4} + sub v5, v1, v6, LSR #7 + and v5, v5, v6 + bics v5, v5, v1 + bne 1f + sub v5, v2, v6, LSR #7 + and v5, v5, v6 + bics v5, v5, v2 + bne 2f + sub v5, v3, v6, LSR #7 + and v5, v5, v6 + bics v5, v5, v3 + bne 3f + sub v5, v4, v6, LSR #7 + and v5, v5, v6 + bics v5, v5, v4 + bne 4f + beq 0b + +4: mov v1, v4 + bal 0f +3: mov v1, v3 + sub v7, v7, #4 + bal 0f +2: mov v1, v2 + sub v7, v7, #8 + bal 0f +1: sub v7, v7, #12 + ## After the loop we calculate the diff + ## between the end and the begining of the str + ## #if __BYTE_ORDER == __BIG_ENDIAN - tst r2, $0xff000000 @ word that has a zero byte in it - IT(tttt, ne) - addne r0, r0, $1 @ - tstne r2, $0x00ff0000 @ and add up to 3 bytes on to it - addne r0, r0, $1 @ - tstne r2, $0x0000ff00 @ (if first three all non-zero, 4th - IT(t, ne) - addne r0, r0, $1 @ must be zero) +0: tst v1, #0xFF000000 + subeq v7, v7, #1 + tst v1, #0x00FF0000 + subeq v7, v7, #1 + tst v1, #0x0000FF00 + subeq v7, v7, #1 + tst v1, #0x000000FF +4: subeq v7, v7, #1 #else - tst r2, $0x000000ff @ - IT(tttt, ne) - addne r0, r0, $1 @ - tstne r2, $0x0000ff00 @ and add up to 3 bytes on to it - addne r0, r0, $1 @ - tstne r2, $0x00ff0000 @ (if first three all non-zero, 4th - IT(t, ne) - addne r0, r0, $1 @ must be zero) +0: tst v1, #0xFF + subeq v7, v7, #1 + tst v1, #0xFF00 + subeq v7, v7, #1 + tst v1, #0xFF0000 + subeq v7, v7, #1 + tst v1, #0xFF000000 +4: subeq v7, v7, #1 #endif + + sub a1, v7, a1 + #if defined(__USE_BX__) + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, lr} bx lr #else - mov pc,lr + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, pc} #endif #endif -- 1.7.9.5 _______________________________________________ uClibc mailing list [email protected] http://lists.busybox.net/mailman/listinfo/uclibc
