Module Name: src Committed By: christos Date: Sun Jul 19 23:45:29 UTC 2009
Modified Files: src/common/lib/libc/arch/x86_64/string: ffs.S memchr.S strchr.S Log Message: revert changes that made new kernels hang in ACPI detection To generate a diff of this commit: cvs rdiff -u -r1.2 -r1.3 src/common/lib/libc/arch/x86_64/string/ffs.S \ src/common/lib/libc/arch/x86_64/string/memchr.S cvs rdiff -u -r1.4 -r1.5 src/common/lib/libc/arch/x86_64/string/strchr.S Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/common/lib/libc/arch/x86_64/string/ffs.S diff -u src/common/lib/libc/arch/x86_64/string/ffs.S:1.2 src/common/lib/libc/arch/x86_64/string/ffs.S:1.3 --- src/common/lib/libc/arch/x86_64/string/ffs.S:1.2 Sat Jul 18 08:03:31 2009 +++ src/common/lib/libc/arch/x86_64/string/ffs.S Sun Jul 19 19:45:29 2009 @@ -7,14 +7,15 @@ #include <machine/asm.h> #if defined(LIBC_SCCS) - RCSID("$NetBSD: ffs.S,v 1.2 2009/07/18 12:03:31 dsl Exp $") + RCSID("$NetBSD: ffs.S,v 1.3 2009/07/19 23:45:29 christos Exp $") #endif ENTRY(ffs) bsfl %edi,%eax - jz 1f /* ZF is set if all bits are 0 */ + jz L1 /* ZF is set if all bits are 0 */ incl %eax /* bits numbered from 1, not 0 */ ret -1: xorl %eax,%eax /* clear result */ + _ALIGN_TEXT +L1: xorl %eax,%eax /* clear result */ ret Index: src/common/lib/libc/arch/x86_64/string/memchr.S diff -u src/common/lib/libc/arch/x86_64/string/memchr.S:1.2 src/common/lib/libc/arch/x86_64/string/memchr.S:1.3 --- src/common/lib/libc/arch/x86_64/string/memchr.S:1.2 Sat Jul 18 14:06:56 2009 +++ src/common/lib/libc/arch/x86_64/string/memchr.S Sun Jul 19 19:45:29 2009 @@ -1,115 +1,111 @@ -/* $NetBSD: memchr.S,v 1.2 2009/07/18 18:06:56 dsl Exp $ */ - -/*- - * Copyright (c) 2009 The NetBSD Foundation, Inc. - * All rights reserved. - * - * This code is derived from software contributed to The NetBSD Foundation - * by David Laight. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. +/* + * Written by J.T. Conklin <j...@acorntoolworks.com> + * Public domain. */ #include <machine/asm.h> #if defined(LIBC_SCCS) - RCSID("$NetBSD: memchr.S,v 1.2 2009/07/18 18:06:56 dsl Exp $") + RCSID("$NetBSD: memchr.S,v 1.3 2009/07/19 23:45:29 christos Exp $") #endif -/* - * The instruction sequences used try to avoid data dependencies - * between adjacent instructions (to allow parallel execution). - * The 'imul' for %r9 could be put into the delay following the - * memory read (ie inside the loop) at no obvious cost - except - * that the loop is currently exactly 32 bytes - 2 fetch blocks!. - * - * I don't think aligning any of the other branch targets is useful. - */ - ENTRY(memchr) - movabsq $0x0101010101010101,%r8 - lea (%rdi,%rdx),%r10 /* limit of buffer to scan */ - movzbq %sil,%rsi /* mask high bits! */ + movzbq %sil,%rcx - /* 'directpath' imuls can execute 3 at a time ... (amd) */ - imul %r8,%rsi /* search byte replicated in word */ - imul $0x80,%r8,%r9 /* 0x8080808080808080 */ - test $7,%dil - jnz 20f /* jump if misaligned */ - jmp 1f /* jump to avoid 4 nops (13 bytes) in gap */ - - _ALIGN_TEXT /* entire loop now in 32 aligned bytes */ -1: - cmpq %r10,%rdi /* end of buffer ? */ - jae 30f /* jump if so */ + /* + * Align to word boundary. + * Consider unrolling loop? + */ + testq %rdx,%rdx /* nbytes == 0? */ + je .Lzero +.Lalign: + testb $7,%dil + je .Lword_aligned + movq %rdi,%rax + cmpb (%rdi),%cl + je .Ldone + incq %rdi + decq %rdx + jnz .Lalign + jmp .Lzero + +.Lword_aligned: + /* copy char to all bytes in word */ + movb %cl,%ch + movq %rcx,%rsi + salq $16,%rcx + orq %rsi,%rcx + movq %rcx,%rsi + salq $32,%rcx + orq %rsi,%rcx - movq (%rdi),%rax /* value to check */ -2: + movabsq $0x0101010101010101,%r8 + movabsq $0x8080808080808080,%r9 + + _ALIGN_TEXT +.Lloop: + cmpq $7,%rdx /* nbytes > 8 */ + jbe .Lbyte + movq (%rdi),%rsi addq $8,%rdi - xorq %rsi,%rax /* now looking for zeros */ - mov %rax,%rcx - subq %r8,%rax /* x - 0x10 */ - not %rcx - andq %r9,%rax /* (x - 0x10) & 0x80 */ - andq %rcx,%rax /* ((x - 0x10) & 0x80) ^ ~x */ - je 1b /* jump if not found */ - -/* Found byte in word, get its address */ - bsf %rax,%rax - shr $3,%eax - lea -8(%rax,%rdi),%rax - cmpq %r10,%rax /* need to check not beyond buffer */ - jae 30f - rep - ret /* amd - no ret after jmp */ - -/* Input misaligned, read aligned and kill low bits */ -/* (Getting a -1 is surprisingly hard work!) */ -20: - xor %eax,%eax /* zeros all 64 bits */ - mov %dil,%cl /* misalignment amount 1..7 (+high bits )*/ - test %rdx,%rdx /* zero length, don't read */ - jz 30f - - and $~7,%dil /* %rdi now start of word */ - lea -1(%rax),%r11 /* all 0xff */ - and $7,%cl /* 1..7 */ - - mov (%rdi),%rax /* word containing first byte */ - shl $3,%cl /* 8..56 */ - test %r11,%rsi /* searching for 0xff */ - jz 25f - - /* Searching for other than 0xff, set low bytes */ - shl %cl,%r11 /* 0xff in high (wanted) bytes */ - not %r11 /* 0xff in low (unwanted) bytes */ - or %r11,%rax /* low bytes now set */ - jmp 2b - -25: /* Searching for 0xff, clear low bytes */ - shl %cl,%r11 /* 0xff in high (wanted) bytes */ - and %r11,%rax /* low bytes now zero */ - jmp 2b + xorq %rcx,%rsi + subq $8,%rdx + subq %r8,%rsi + testq %r9,%rsi + je .Lloop + + /* + * In rare cases, the above loop may exit prematurely. We must + * return to the loop if none of the bytes in the word are + * equal to ch. + */ + + leaq -8(%rdi),%rax + cmpb -8(%rdi),%cl /* 1st byte == ch? */ + je .Ldone + + leaq -7(%rdi),%rax + cmpb -7(%rdi),%cl /* 2nd byte == ch? */ + je .Ldone + + leaq -6(%rdi),%rax + cmpb -6(%rdi),%cl /* 3rd byte == ch? */ + je .Ldone + + leaq -5(%rdi),%rax + cmpb -5(%rdi),%cl /* 4th byte == ch? */ + je .Ldone + + leaq -4(%rdi),%rax + cmpb -4(%rdi),%cl /* 5th byte == ch? */ + je .Ldone + + leaq -3(%rdi),%rax + cmpb -3(%rdi),%cl /* 6th byte == ch? */ + je .Ldone + + leaq -2(%rdi),%rax + cmpb -2(%rdi),%cl /* 7th byte == ch? */ + je .Ldone + + leaq -1(%rdi),%rax + cmpb -1(%rdi),%cl /* 7th byte == ch? */ + jne .Lloop + ret + +.Lbyte: + testq %rdx,%rdx + je .Lzero +.Lbyte_loop: + movq %rdi,%rax + cmpb (%rdi),%cl + je .Ldone + incq %rdi + decq %rdx + jnz .Lbyte_loop + +.Lzero: + xorq %rax,%rax -/* Not found */ -30: xorq %rax,%rax +.Ldone: ret Index: src/common/lib/libc/arch/x86_64/string/strchr.S diff -u src/common/lib/libc/arch/x86_64/string/strchr.S:1.4 src/common/lib/libc/arch/x86_64/string/strchr.S:1.5 --- src/common/lib/libc/arch/x86_64/string/strchr.S:1.4 Sat Jul 18 12:40:31 2009 +++ src/common/lib/libc/arch/x86_64/string/strchr.S Sun Jul 19 19:45:29 2009 @@ -1,153 +1,133 @@ -/* $NetBSD: strchr.S,v 1.4 2009/07/18 16:40:31 dsl Exp $ */ - -/*- - * Copyright (c) 2009 The NetBSD Foundation, Inc. - * All rights reserved. - * - * This code is derived from software contributed to The NetBSD Foundation - * by David Laight. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. +/* + * Written by J.T. Conklin <j...@acorntoolworks.com> + * Public domain. */ -/* See comments in strlen.S about checking words for byte values */ - #include <machine/asm.h> #if defined(LIBC_SCCS) - RCSID("$NetBSD: strchr.S,v 1.4 2009/07/18 16:40:31 dsl Exp $") + RCSID("$NetBSD: strchr.S,v 1.5 2009/07/19 23:45:29 christos Exp $") #endif -/* - * On entry %rdi is the buffer and the low byte of %rsi (%sil) the - * character to search for. - * - * Registers %rdx, %rcx, %r8-%r11 and %rax are also usable - */ - -/* Uncomment below to get regression test to run this version but - * have everything else use the trivial one below. */ -/* #define TEST_STRCHR */ - -#ifdef TEST_STRCHR -ENTRY(test_strchr) -#else ENTRY(strchr) -#endif - movabsq $0x0101010101010101,%r8 + movzbq %sil,%rcx - movzbq %sil,%rdx /* value to search for (c) */ - /* These imul are 'directpath' on athlons, so are fast */ - imul $0x80,%r8,%r9 /* 0x8080808080808080 */ - imul %r8,%rdx /* (c) copied to all bytes */ - test $7,%dil - jnz 20f /* jump if misaligned */ - - _ALIGN_TEXT /* one byte nop */ -1: - movq (%rdi),%rax /* bytes to check (x) */ -2: - addq $8,%rdi - mov %rax,%r10 - mov %rax,%r11 /* for 'char' check */ - not %r10 /* invert of data (~x) */ - - xorq %rdx,%r11 /* convert 'char' test to one for NUL */ - subq %r8,%rax /* x - 0x10 */ - movq %r10,%rsi /* ~x */ - subq %r8,%r11 /* (x ^ c) - 0x10 */ -/* - * Here we could check ((x - 0x10) | ((x ^ c) - 0x10)) & 0x80 - * and short-circuit the case where no top bits are set, and - * we continue the loop. - * However it needs 3 more clocks that are difficult to interleave - * in the existing dependency chain ... - */ - andq %r9,%rax /* (x - 0x10) & 0x80 */ - xorq %rdx,%rsi /* c ^ ~x == ~(c ^ x) */ - andq %r9,%r11 /* ((x ^ c) - 0x10) & 0x80 */ - andq %r10,%rax /* (x - 0x10) & 0x80 & ~x */ - jne 10f /* jump if string ends */ - andq %rsi,%r11 /* ((x ^ c) - 0x10) & 0x80 & ~(x ^ c) */ - je 1b /* jump if no match */ - - /* Found char, since LE can use bit scan */ - bsf %r11,%r11 /* 7, 15, 23 ... 63 */ -8: shr $3,%r11 /* 0, 1, 2 .. 7 */ - lea -8(%r11,%rdi),%rax - ret + /* + * Align to word boundary. + * Consider unrolling loop? + */ +.Lalign: + testb $7,%dil + je .Lword_aligned + movb (%rdi),%dl + cmpb %cl,%dl + je .Ldone + incq %rdi + testb %dl,%dl + jne .Lalign + jmp .Lzero + +.Lword_aligned: + /* copy char to all bytes in word */ + movb %cl,%ch + movq %rcx,%rdx + salq $16,%rcx + orq %rdx,%rcx + movq %rcx,%rdx + salq $32,%rcx + orq %rdx,%rcx -/* End of string, check whether char is before NUL */ - _ALIGN_TEXT /* adds three byte nop */ -10: - bsf %rax,%rax /* count to NUL */ - andq %rsi,%r11 /* check for char in last 8 bytes */ - je 11f - bsf %r11,%r11 /* NUL and char - see which was first */ - cmp %r11,%rax - jae 8b /* return 'found' if same - searching for NUL */ -11: xor %eax,%eax /* char not found */ - ret + movabsq $0x0101010101010101,%r8 + movabsq $0x8080808080808080,%r9 -/* Source misaligned: read aligned word and make low bytes invalid */ -/* I (dsl) think a _ALIGN_TEXT here will slow things down! */ -20: - xor %rcx,%rcx - sub %dil,%cl /* Convert low address values 1..7 ... */ - sbb %rsi,%rsi /* carry was set, so %rsi now ~0u! */ - and $7,%cl /* ... to 7..1 */ - and $~7,%dil /* move address to start of word */ - shl $3,%cl /* now 56, 48 ... 16, 8 */ - movq (%rdi),%rax /* aligned word containing first data */ - xor %rdx,%rsi /* invert of search pattern (~c) */ - je 22f /* searching for 0xff */ -21: shr %cl,%rsi /* ~c in low bytes */ - or %rsi,%rax /* set some bits making low bytes invalid */ - jmp 2b - -/* We are searching for 0xff, so can't use ~pattern for invalid value */ -22: - mov %r8,%r10 /* 0x01 pattern */ - lea (%r8,%r8),%rsi /* 0x02 - bits gets set (above) */ - not %r10 /* now 0xfe */ - sar %cl,%r10 /* top bytes 0xff */ - and %r10,%rax /* clear lsb from unwanted low bytes */ - jmp 21b + /* Check whether any byte in the word is equal to ch or 0. */ + _ALIGN_TEXT +.Lloop: + movq (%rdi),%rdx + addq $8,%rdi + movq %rdx,%rsi + subq %r8,%rdx + xorq %rcx,%rsi + subq %r8,%rsi + orq %rsi,%rdx + testq %r9,%rdx + je .Lloop + + /* + * In rare cases, the above loop may exit prematurely. We must + * return to the loop if none of the bytes in the word match + * ch or are equal to 0. + */ + + movb -8(%rdi),%dl + cmpb %cl,%dl /* 1st byte == ch? */ + jne 1f + subq $8,%rdi + jmp .Ldone +1: testb %dl,%dl /* 1st byte == 0? */ + je .Lzero + + movb -7(%rdi),%dl + cmpb %cl,%dl /* 2nd byte == ch? */ + jne 1f + subq $7,%rdi + jmp .Ldone +1: testb %dl,%dl /* 2nd byte == 0? */ + je .Lzero + + movb -6(%rdi),%dl + cmpb %cl,%dl /* 3rd byte == ch? */ + jne 1f + subq $6,%rdi + jmp .Ldone +1: testb %dl,%dl /* 3rd byte == 0? */ + je .Lzero + + movb -5(%rdi),%dl + cmpb %cl,%dl /* 4th byte == ch? */ + jne 1f + subq $5,%rdi + jmp .Ldone +1: testb %dl,%dl /* 4th byte == 0? */ + je .Lzero + + movb -4(%rdi),%dl + cmpb %cl,%dl /* 5th byte == ch? */ + jne 1f + subq $4,%rdi + jmp .Ldone +1: testb %dl,%dl /* 5th byte == 0? */ + je .Lzero + + movb -3(%rdi),%dl + cmpb %cl,%dl /* 6th byte == ch? */ + jne 1f + subq $3,%rdi + jmp .Ldone +1: testb %dl,%dl /* 6th byte == 0? */ + je .Lzero + + movb -2(%rdi),%dl + cmpb %cl,%dl /* 7th byte == ch? */ + jne 1f + subq $2,%rdi + jmp .Ldone +1: testb %dl,%dl /* 7th byte == 0? */ + je .Lzero + + movb -1(%rdi),%dl + cmpb %cl,%dl /* 8th byte == ch? */ + jne 1f + subq $1,%rdi + jmp .Ldone +1: testb %dl,%dl /* 8th byte == 0? */ + jne .Lloop + +.Lzero: + /* If a ch wasn't found, return 0. */ + xorq %rdi,%rdi -#ifdef TEST_STRCHR -/* Trivial version for bug-fixing above */ -ENTRY(strchr) - movq %rsi,%rdx - movq %rdi,%rsi -1: - lodsb - cmp %al,%dl - je 2f - test %al,%al - jne 1b - xor %eax,%eax - ret -2: lea -1(%rsi),%rax +.Ldone: + movq %rdi,%rax ret -#endif - STRONG_ALIAS(index,strchr)