Module Name: src Committed By: christos Date: Mon Jul 20 15:21:00 UTC 2009
Modified Files: src/common/lib/libc/arch/x86_64/string: ffs.S memchr.S strchr.S Log Message: Put back dsl's string changes, but fix memchr.S to use cmp so that the condition code is set (and fix the comments 0x10->0x01). From Anon Ymous We need a test for memchr(x, -1)... To generate a diff of this commit: cvs rdiff -u -r1.3 -r1.4 src/common/lib/libc/arch/x86_64/string/ffs.S \ src/common/lib/libc/arch/x86_64/string/memchr.S cvs rdiff -u -r1.5 -r1.6 src/common/lib/libc/arch/x86_64/string/strchr.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/common/lib/libc/arch/x86_64/string/ffs.S diff -u src/common/lib/libc/arch/x86_64/string/ffs.S:1.3 src/common/lib/libc/arch/x86_64/string/ffs.S:1.4 --- src/common/lib/libc/arch/x86_64/string/ffs.S:1.3 Sun Jul 19 19:45:29 2009 +++ src/common/lib/libc/arch/x86_64/string/ffs.S Mon Jul 20 11:21:00 2009 @@ -7,15 +7,14 @@ #include <machine/asm.h> #if defined(LIBC_SCCS) - RCSID("$NetBSD: ffs.S,v 1.3 2009/07/19 23:45:29 christos Exp $") + RCSID("$NetBSD: ffs.S,v 1.4 2009/07/20 15:21:00 christos Exp $") #endif ENTRY(ffs) bsfl %edi,%eax - jz L1 /* ZF is set if all bits are 0 */ + jz 1f /* ZF is set if all bits are 0 */ incl %eax /* bits numbered from 1, not 0 */ ret - _ALIGN_TEXT -L1: xorl %eax,%eax /* clear result */ +1: xorl %eax,%eax /* clear result */ ret Index: src/common/lib/libc/arch/x86_64/string/memchr.S diff -u src/common/lib/libc/arch/x86_64/string/memchr.S:1.3 src/common/lib/libc/arch/x86_64/string/memchr.S:1.4 --- src/common/lib/libc/arch/x86_64/string/memchr.S:1.3 Sun Jul 19 19:45:29 2009 +++ src/common/lib/libc/arch/x86_64/string/memchr.S Mon Jul 20 11:21:00 2009 @@ -1,111 +1,115 @@ -/* - * Written by J.T. Conklin <j...@acorntoolworks.com> - * Public domain. +/* $NetBSD: memchr.S,v 1.4 2009/07/20 15:21:00 christos Exp $ */ + +/*- + * Copyright (c) 2009 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by David Laight. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. */ #include <machine/asm.h> #if defined(LIBC_SCCS) - RCSID("$NetBSD: memchr.S,v 1.3 2009/07/19 23:45:29 christos Exp $") + RCSID("$NetBSD: memchr.S,v 1.4 2009/07/20 15:21:00 christos Exp $") #endif -ENTRY(memchr) - movzbq %sil,%rcx - - /* - * Align to word boundary. - * Consider unrolling loop? - */ - testq %rdx,%rdx /* nbytes == 0? */ - je .Lzero -.Lalign: - testb $7,%dil - je .Lword_aligned - movq %rdi,%rax - cmpb (%rdi),%cl - je .Ldone - incq %rdi - decq %rdx - jnz .Lalign - jmp .Lzero - -.Lword_aligned: - /* copy char to all bytes in word */ - movb %cl,%ch - movq %rcx,%rsi - salq $16,%rcx - orq %rsi,%rcx - movq %rcx,%rsi - salq $32,%rcx - orq %rsi,%rcx +/* + * The instruction sequences used try to avoid data dependencies + * between adjacent instructions (to allow parallel execution). + * The 'imul' for %r9 could be put into the delay following the + * memory read (ie inside the loop) at no obvious cost - except + * that the loop is currently exactly 32 bytes - 2 fetch blocks!. + * + * I don't think aligning any of the other branch targets is useful. + */ +ENTRY(memchr) movabsq $0x0101010101010101,%r8 - movabsq $0x8080808080808080,%r9 + lea (%rdi,%rdx),%r10 /* limit of buffer to scan */ + movzbq %sil,%rsi /* mask high bits! */ - _ALIGN_TEXT -.Lloop: - cmpq $7,%rdx /* nbytes > 8 */ - jbe .Lbyte - movq (%rdi),%rsi - addq $8,%rdi - xorq %rcx,%rsi - subq $8,%rdx - subq %r8,%rsi - testq %r9,%rsi - je .Lloop - - /* - * In rare cases, the above loop may exit prematurely. We must - * return to the loop if none of the bytes in the word are - * equal to ch. - */ - - leaq -8(%rdi),%rax - cmpb -8(%rdi),%cl /* 1st byte == ch? */ - je .Ldone - - leaq -7(%rdi),%rax - cmpb -7(%rdi),%cl /* 2nd byte == ch? */ - je .Ldone - - leaq -6(%rdi),%rax - cmpb -6(%rdi),%cl /* 3rd byte == ch? */ - je .Ldone - - leaq -5(%rdi),%rax - cmpb -5(%rdi),%cl /* 4th byte == ch? */ - je .Ldone - - leaq -4(%rdi),%rax - cmpb -4(%rdi),%cl /* 5th byte == ch? */ - je .Ldone - - leaq -3(%rdi),%rax - cmpb -3(%rdi),%cl /* 6th byte == ch? */ - je .Ldone - - leaq -2(%rdi),%rax - cmpb -2(%rdi),%cl /* 7th byte == ch? */ - je .Ldone - - leaq -1(%rdi),%rax - cmpb -1(%rdi),%cl /* 7th byte == ch? */ - jne .Lloop - ret + /* 'directpath' imuls can execute 3 at a time ... (amd) */ + imul %r8,%rsi /* search byte replicated in word */ + imul $0x80,%r8,%r9 /* 0x8080808080808080 */ + test $7,%dil + jnz 20f /* jump if misaligned */ + jmp 1f /* jump to avoid 4 nops (13 bytes) in gap */ + + _ALIGN_TEXT /* entire loop now in 32 aligned bytes */ +1: + cmpq %r10,%rdi /* end of buffer ? */ + jae 30f /* jump if so */ -.Lbyte: - testq %rdx,%rdx - je .Lzero -.Lbyte_loop: - movq %rdi,%rax - cmpb (%rdi),%cl - je .Ldone - incq %rdi - decq %rdx - jnz .Lbyte_loop - -.Lzero: - xorq %rax,%rax + movq (%rdi),%rax /* value to check */ +2: + addq $8,%rdi + xorq %rsi,%rax /* now looking for zeros */ + mov %rax,%rcx + subq %r8,%rax /* x - 0x01 */ + not %rcx + andq %r9,%rax /* (x - 0x01) & 0x80 */ + andq %rcx,%rax /* ((x - 0x01) & 0x80) & ~x */ + je 1b /* jump if not found */ + +/* Found byte in word, get its address */ + bsf %rax,%rax + shr $3,%eax + lea -8(%rax,%rdi),%rax + cmpq %r10,%rax /* need to check not beyond buffer */ + jae 30f + rep + ret /* amd - no ret after jmp */ + +/* Input misaligned, read aligned and kill low bits */ +/* (Getting a -1 is surprisingly hard work!) */ +20: + xor %eax,%eax /* zeros all 64 bits */ + mov %dil,%cl /* misalignment amount 1..7 (+high bits )*/ + test %rdx,%rdx /* zero length, don't read */ + jz 30f + + and $~7,%dil /* %rdi now start of word */ + lea -1(%rax),%r11 /* all 0xff */ + and $7,%cl /* 1..7 */ + + mov (%rdi),%rax /* word containing first byte */ + shl $3,%cl /* 8..56 */ + cmp %r11,%rsi /* searching for 0xff */ + jz 25f + + /* Searching for other than 0xff, set low bytes */ + shl %cl,%r11 /* 0xff in high (wanted) bytes */ + not %r11 /* 0xff in low (unwanted) bytes */ + or %r11,%rax /* low bytes now set */ + jmp 2b + +25: /* Searching for 0xff, clear low bytes */ + shl %cl,%r11 /* 0xff in high (wanted) bytes */ + and %r11,%rax /* low bytes now zero */ + jmp 2b -.Ldone: +/* Not found */ +30: xorq %rax,%rax ret Index: src/common/lib/libc/arch/x86_64/string/strchr.S diff -u src/common/lib/libc/arch/x86_64/string/strchr.S:1.5 src/common/lib/libc/arch/x86_64/string/strchr.S:1.6 --- src/common/lib/libc/arch/x86_64/string/strchr.S:1.5 Sun Jul 19 19:45:29 2009 +++ src/common/lib/libc/arch/x86_64/string/strchr.S Mon Jul 20 11:21:00 2009 @@ -1,133 +1,153 @@ -/* - * Written by J.T. Conklin <j...@acorntoolworks.com> - * Public domain. +/* $NetBSD: strchr.S,v 1.6 2009/07/20 15:21:00 christos Exp $ */ + +/*- + * Copyright (c) 2009 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by David Laight. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. */ +/* See comments in strlen.S about checking words for byte values */ + #include <machine/asm.h> #if defined(LIBC_SCCS) - RCSID("$NetBSD: strchr.S,v 1.5 2009/07/19 23:45:29 christos Exp $") + RCSID("$NetBSD: strchr.S,v 1.6 2009/07/20 15:21:00 christos Exp $") #endif -ENTRY(strchr) - movzbq %sil,%rcx - - /* - * Align to word boundary. - * Consider unrolling loop? - */ -.Lalign: - testb $7,%dil - je .Lword_aligned - movb (%rdi),%dl - cmpb %cl,%dl - je .Ldone - incq %rdi - testb %dl,%dl - jne .Lalign - jmp .Lzero - -.Lword_aligned: - /* copy char to all bytes in word */ - movb %cl,%ch - movq %rcx,%rdx - salq $16,%rcx - orq %rdx,%rcx - movq %rcx,%rdx - salq $32,%rcx - orq %rdx,%rcx +/* + * On entry %rdi is the buffer and the low byte of %rsi (%sil) the + * character to search for. + * + * Registers %rdx, %rcx, %r8-%r11 and %rax are also usable + */ +/* Uncomment below to get regression test to run this version but + * have everything else use the trivial one below. */ +/* #define TEST_STRCHR */ + +#ifdef TEST_STRCHR +ENTRY(test_strchr) +#else +ENTRY(strchr) +#endif movabsq $0x0101010101010101,%r8 - movabsq $0x8080808080808080,%r9 - /* Check whether any byte in the word is equal to ch or 0. */ - _ALIGN_TEXT -.Lloop: - movq (%rdi),%rdx + movzbq %sil,%rdx /* value to search for (c) */ + /* These imul are 'directpath' on athlons, so are fast */ + imul $0x80,%r8,%r9 /* 0x8080808080808080 */ + imul %r8,%rdx /* (c) copied to all bytes */ + test $7,%dil + jnz 20f /* jump if misaligned */ + + _ALIGN_TEXT /* one byte nop */ +1: + movq (%rdi),%rax /* bytes to check (x) */ +2: addq $8,%rdi - movq %rdx,%rsi - subq %r8,%rdx - xorq %rcx,%rsi - subq %r8,%rsi - orq %rsi,%rdx - testq %r9,%rdx - je .Lloop - - /* - * In rare cases, the above loop may exit prematurely. We must - * return to the loop if none of the bytes in the word match - * ch or are equal to 0. - */ - - movb -8(%rdi),%dl - cmpb %cl,%dl /* 1st byte == ch? */ - jne 1f - subq $8,%rdi - jmp .Ldone -1: testb %dl,%dl /* 1st byte == 0? */ - je .Lzero - - movb -7(%rdi),%dl - cmpb %cl,%dl /* 2nd byte == ch? */ - jne 1f - subq $7,%rdi - jmp .Ldone -1: testb %dl,%dl /* 2nd byte == 0? */ - je .Lzero - - movb -6(%rdi),%dl - cmpb %cl,%dl /* 3rd byte == ch? */ - jne 1f - subq $6,%rdi - jmp .Ldone -1: testb %dl,%dl /* 3rd byte == 0? */ - je .Lzero - - movb -5(%rdi),%dl - cmpb %cl,%dl /* 4th byte == ch? */ - jne 1f - subq $5,%rdi - jmp .Ldone -1: testb %dl,%dl /* 4th byte == 0? */ - je .Lzero - - movb -4(%rdi),%dl - cmpb %cl,%dl /* 5th byte == ch? */ - jne 1f - subq $4,%rdi - jmp .Ldone -1: testb %dl,%dl /* 5th byte == 0? */ - je .Lzero - - movb -3(%rdi),%dl - cmpb %cl,%dl /* 6th byte == ch? */ - jne 1f - subq $3,%rdi - jmp .Ldone -1: testb %dl,%dl /* 6th byte == 0? */ - je .Lzero - - movb -2(%rdi),%dl - cmpb %cl,%dl /* 7th byte == ch? */ - jne 1f - subq $2,%rdi - jmp .Ldone -1: testb %dl,%dl /* 7th byte == 0? */ - je .Lzero - - movb -1(%rdi),%dl - cmpb %cl,%dl /* 8th byte == ch? */ - jne 1f - subq $1,%rdi - jmp .Ldone -1: testb %dl,%dl /* 8th byte == 0? */ - jne .Lloop - -.Lzero: - /* If a ch wasn't found, return 0. */ - xorq %rdi,%rdi + mov %rax,%r10 + mov %rax,%r11 /* for 'char' check */ + not %r10 /* invert of data (~x) */ + + xorq %rdx,%r11 /* convert 'char' test to one for NUL */ + subq %r8,%rax /* x - 0x10 */ + movq %r10,%rsi /* ~x */ + subq %r8,%r11 /* (x ^ c) - 0x10 */ +/* + * Here we could check ((x - 0x10) | ((x ^ c) - 0x10)) & 0x80 + * and short-circuit the case where no top bits are set, and + * we continue the loop. + * However it needs 3 more clocks that are difficult to interleave + * in the existing dependency chain ... + */ + andq %r9,%rax /* (x - 0x10) & 0x80 */ + xorq %rdx,%rsi /* c ^ ~x == ~(c ^ x) */ + andq %r9,%r11 /* ((x ^ c) - 0x10) & 0x80 */ + andq %r10,%rax /* (x - 0x10) & 0x80 & ~x */ + jne 10f /* jump if string ends */ + andq %rsi,%r11 /* ((x ^ c) - 0x10) & 0x80 & ~(x ^ c) */ + je 1b /* jump if no match */ + + /* Found char, since LE can use bit scan */ + bsf %r11,%r11 /* 7, 15, 23 ... 63 */ +8: shr $3,%r11 /* 0, 1, 2 .. 7 */ + lea -8(%r11,%rdi),%rax + ret -.Ldone: - movq %rdi,%rax +/* End of string, check whether char is before NUL */ + _ALIGN_TEXT /* adds three byte nop */ +10: + bsf %rax,%rax /* count to NUL */ + andq %rsi,%r11 /* check for char in last 8 bytes */ + je 11f + bsf %r11,%r11 /* NUL and char - see which was first */ + cmp %r11,%rax + jae 8b /* return 'found' if same - searching for NUL */ +11: xor %eax,%eax /* char not found */ ret + +/* Source misaligned: read aligned word and make low bytes invalid */ +/* I (dsl) think a _ALIGN_TEXT here will slow things down! */ +20: + xor %rcx,%rcx + sub %dil,%cl /* Convert low address values 1..7 ... */ + sbb %rsi,%rsi /* carry was set, so %rsi now ~0u! */ + and $7,%cl /* ... to 7..1 */ + and $~7,%dil /* move address to start of word */ + shl $3,%cl /* now 56, 48 ... 16, 8 */ + movq (%rdi),%rax /* aligned word containing first data */ + xor %rdx,%rsi /* invert of search pattern (~c) */ + je 22f /* searching for 0xff */ +21: shr %cl,%rsi /* ~c in low bytes */ + or %rsi,%rax /* set some bits making low bytes invalid */ + jmp 2b + +/* We are searching for 0xff, so can't use ~pattern for invalid value */ +22: + mov %r8,%r10 /* 0x01 pattern */ + lea (%r8,%r8),%rsi /* 0x02 - bits gets set (above) */ + not %r10 /* now 0xfe */ + sar %cl,%r10 /* top bytes 0xff */ + and %r10,%rax /* clear lsb from unwanted low bytes */ + jmp 21b + +#ifdef TEST_STRCHR +/* Trivial version for bug-fixing above */ +ENTRY(strchr) + movq %rsi,%rdx + movq %rdi,%rsi +1: + lodsb + cmp %al,%dl + je 2f + test %al,%al + jne 1b + xor %eax,%eax + ret +2: lea -1(%rsi),%rax + ret +#endif + STRONG_ALIAS(index,strchr)