Author: kib
Date: Wed Jun 23 10:40:28 2010
New Revision: 209460
URL: http://svn.freebsd.org/changeset/base/209460

Log:
  Remove unused i586 optimized bcopy/bzero/etc implementations that utilize
  FPU registers for copying. Remove the switch table and jumps from
  bcopy/bzero/... to the actual implementation.
  As a side-effect, i486-optimized bzero is removed.
  
  Reviewed by:  bde
  Tested by:    pho (previous version)

Modified:
  head/sys/dev/fb/fbreg.h
  head/sys/i386/i386/identcpu.c
  head/sys/i386/i386/support.s
  head/sys/i386/include/md_var.h
  head/sys/i386/isa/npx.c

Modified: head/sys/dev/fb/fbreg.h
==============================================================================
--- head/sys/dev/fb/fbreg.h     Wed Jun 23 10:06:57 2010        (r209459)
+++ head/sys/dev/fb/fbreg.h     Wed Jun 23 10:40:28 2010        (r209460)
@@ -34,16 +34,7 @@
 #define V_MAX_ADAPTERS         8               /* XXX */
 
 /* some macros */
-#ifdef __i386__
-#define bcopy_io(s, d, c)      generic_bcopy((void *)(s), (void *)(d), (c))
-#define bcopy_toio(s, d, c)    generic_bcopy((void *)(s), (void *)(d), (c))
-#define bcopy_fromio(s, d, c)  generic_bcopy((void *)(s), (void *)(d), (c))
-#define bzero_io(d, c)         generic_bzero((void *)(d), (c))
-#define fill_io(p, d, c)       fill((p), (void *)(d), (c))
-#define fillw_io(p, d, c)      fillw((p), (void *)(d), (c))
-void generic_bcopy(const void *s, void *d, size_t c);
-void generic_bzero(void *d, size_t c);
-#elif defined(__amd64__)
+#if defined(__amd64__) || defined(__i386__)
 #define bcopy_io(s, d, c)      bcopy((void *)(s), (void *)(d), (c))
 #define bcopy_toio(s, d, c)    bcopy((void *)(s), (void *)(d), (c))
 #define bcopy_fromio(s, d, c)  bcopy((void *)(s), (void *)(d), (c))

Modified: head/sys/i386/i386/identcpu.c
==============================================================================
--- head/sys/i386/i386/identcpu.c       Wed Jun 23 10:06:57 2010        (r209459)
+++ head/sys/i386/i386/identcpu.c       Wed Jun 23 10:40:28 2010        (r209460)
@@ -634,7 +634,6 @@ printcpuinfo(void)
 #if defined(I486_CPU)
        case CPUCLASS_486:
                printf("486");
-               bzero_vector = i486_bzero;
                break;
 #endif
 #if defined(I586_CPU)

Modified: head/sys/i386/i386/support.s
==============================================================================
--- head/sys/i386/i386/support.s        Wed Jun 23 10:06:57 2010        (r209459)
+++ head/sys/i386/i386/support.s        Wed Jun 23 10:40:28 2010        (r209460)
@@ -42,23 +42,6 @@
 #define IDXSHIFT       10
 
        .data
-       .globl  bcopy_vector
-bcopy_vector:
-       .long   generic_bcopy
-       .globl  bzero_vector
-bzero_vector:
-       .long   generic_bzero
-       .globl  copyin_vector
-copyin_vector:
-       .long   generic_copyin
-       .globl  copyout_vector
-copyout_vector:
-       .long   generic_copyout
-#if defined(I586_CPU) && defined(DEV_NPX)
-kernel_fpu_lock:
-       .byte   0xfe
-       .space  3
-#endif
        ALIGN_DATA
        .globl  intrcnt, eintrcnt
 intrcnt:
@@ -76,13 +59,7 @@ eintrnames:
  * bcopy family
  * void bzero(void *buf, u_int len)
  */
-
 ENTRY(bzero)
-       MEXITCOUNT
-       jmp     *bzero_vector
-END(bzero)
-
-ENTRY(generic_bzero)
        pushl   %edi
        movl    8(%esp),%edi
        movl    12(%esp),%ecx
@@ -97,270 +74,8 @@ ENTRY(generic_bzero)
        stosb
        popl    %edi
        ret
-END(generic_bzero)     
+END(bzero)     
        
-#ifdef I486_CPU
-ENTRY(i486_bzero)
-       movl    4(%esp),%edx
-       movl    8(%esp),%ecx
-       xorl    %eax,%eax
-/*
- * do 64 byte chunks first
- *
- * XXX this is probably over-unrolled at least for DX2's
- */
-2:
-       cmpl    $64,%ecx
-       jb      3f
-       movl    %eax,(%edx)
-       movl    %eax,4(%edx)
-       movl    %eax,8(%edx)
-       movl    %eax,12(%edx)
-       movl    %eax,16(%edx)
-       movl    %eax,20(%edx)
-       movl    %eax,24(%edx)
-       movl    %eax,28(%edx)
-       movl    %eax,32(%edx)
-       movl    %eax,36(%edx)
-       movl    %eax,40(%edx)
-       movl    %eax,44(%edx)
-       movl    %eax,48(%edx)
-       movl    %eax,52(%edx)
-       movl    %eax,56(%edx)
-       movl    %eax,60(%edx)
-       addl    $64,%edx
-       subl    $64,%ecx
-       jnz     2b
-       ret
-
-/*
- * do 16 byte chunks
- */
-       SUPERALIGN_TEXT
-3:
-       cmpl    $16,%ecx
-       jb      4f
-       movl    %eax,(%edx)
-       movl    %eax,4(%edx)
-       movl    %eax,8(%edx)
-       movl    %eax,12(%edx)
-       addl    $16,%edx
-       subl    $16,%ecx
-       jnz     3b
-       ret
-
-/*
- * do 4 byte chunks
- */
-       SUPERALIGN_TEXT
-4:
-       cmpl    $4,%ecx
-       jb      5f
-       movl    %eax,(%edx)
-       addl    $4,%edx
-       subl    $4,%ecx
-       jnz     4b
-       ret
-
-/*
- * do 1 byte chunks
- * a jump table seems to be faster than a loop or more range reductions
- *
- * XXX need a const section for non-text
- */
-       .data
-jtab:
-       .long   do0
-       .long   do1
-       .long   do2
-       .long   do3
-
-       .text
-       SUPERALIGN_TEXT
-5:
-       jmp     *jtab(,%ecx,4)
-
-       SUPERALIGN_TEXT
-do3:
-       movw    %ax,(%edx)
-       movb    %al,2(%edx)
-       ret
-
-       SUPERALIGN_TEXT
-do2:
-       movw    %ax,(%edx)
-       ret
-
-       SUPERALIGN_TEXT
-do1:
-       movb    %al,(%edx)
-       ret
-
-       SUPERALIGN_TEXT
-do0:
-       ret
-END(i486_bzero)
-#endif
-
-#if defined(I586_CPU) && defined(DEV_NPX)
-ENTRY(i586_bzero)
-       movl    4(%esp),%edx
-       movl    8(%esp),%ecx
-
-       /*
-        * The FPU register method is twice as fast as the integer register
-        * method unless the target is in the L1 cache and we pre-allocate a
-        * cache line for it (then the integer register method is 4-5 times
-        * faster).  However, we never pre-allocate cache lines, since that
-        * would make the integer method 25% or more slower for the common
-        * case when the target isn't in either the L1 cache or the L2 cache.
-        * Thus we normally use the FPU register method unless the overhead
-        * would be too large.
-        */
-       cmpl    $256,%ecx       /* empirical; clts, fninit, smsw cost a lot */
-       jb      intreg_i586_bzero
-
-       /*
-        * The FPU registers may belong to an application or to fastmove()
-        * or to another invocation of bcopy() or ourself in a higher level
-        * interrupt or trap handler.  Preserving the registers is
-        * complicated since we avoid it if possible at all levels.  We
-        * want to localize the complications even when that increases them.
-        * Here the extra work involves preserving CR0_TS in TS.
-        * `fpcurthread != NULL' is supposed to be the condition that all the
-        * FPU resources belong to an application, but fpcurthread and CR0_TS
-        * aren't set atomically enough for this condition to work in
-        * interrupt handlers.
-        *
-        * Case 1: FPU registers belong to the application: we must preserve
-        * the registers if we use them, so we only use the FPU register
-        * method if the target size is large enough to amortize the extra
-        * overhead for preserving them.  CR0_TS must be preserved although
-        * it is very likely to end up as set.
-        *
-        * Case 2: FPU registers belong to fastmove(): fastmove() currently
-        * makes the registers look like they belong to an application so
-        * that cpu_switch() and savectx() don't have to know about it, so
-        * this case reduces to case 1.
-        *
-        * Case 3: FPU registers belong to the kernel: don't use the FPU
-        * register method.  This case is unlikely, and supporting it would
-        * be more complicated and might take too much stack.
-        *
-        * Case 4: FPU registers don't belong to anyone: the FPU registers
-        * don't need to be preserved, so we always use the FPU register
-        * method.  CR0_TS must be preserved although it is very likely to
-        * always end up as clear.
-        */
-       cmpl    $0,PCPU(FPCURTHREAD)
-       je      i586_bz1
-
-       /*
-        * XXX don't use the FPU for cases 1 and 2, since preemptive
-        * scheduling of ithreads broke these cases.  Note that we can
-        * no longer get here from an interrupt handler, since the
-        * context sitch to the interrupt handler will have saved the
-        * FPU state.
-        */
-       jmp     intreg_i586_bzero
-
-       cmpl    $256+184,%ecx           /* empirical; not quite 2*108 more */
-       jb      intreg_i586_bzero
-       sarb    $1,kernel_fpu_lock
-       jc      intreg_i586_bzero
-       smsw    %ax
-       clts
-       subl    $108,%esp
-       fnsave  0(%esp)
-       jmp     i586_bz2
-
-i586_bz1:
-       sarb    $1,kernel_fpu_lock
-       jc      intreg_i586_bzero
-       smsw    %ax
-       clts
-       fninit                          /* XXX should avoid needing this */
-i586_bz2:
-       fldz
-
-       /*
-        * Align to an 8 byte boundary (misalignment in the main loop would
-        * cost a factor of >= 2).  Avoid jumps (at little cost if it is
-        * already aligned) by always zeroing 8 bytes and using the part up
-        * to the _next_ alignment position.
-        */
-       fstl    0(%edx)
-       addl    %edx,%ecx               /* part of %ecx -= new_%edx - %edx */
-       addl    $8,%edx
-       andl    $~7,%edx
-       subl    %edx,%ecx
-
-       /*
-        * Similarly align `len' to a multiple of 8.
-        */
-       fstl    -8(%edx,%ecx)
-       decl    %ecx
-       andl    $~7,%ecx
-
-       /*
-        * This wouldn't be any faster if it were unrolled, since the loop
-        * control instructions are much faster than the fstl and/or done
-        * in parallel with it so their overhead is insignificant.
-        */
-fpureg_i586_bzero_loop:
-       fstl    0(%edx)
-       addl    $8,%edx
-       subl    $8,%ecx
-       cmpl    $8,%ecx
-       jae     fpureg_i586_bzero_loop
-
-       cmpl    $0,PCPU(FPCURTHREAD)
-       je      i586_bz3
-
-       /* XXX check that the condition for cases 1-2 stayed false. */
-i586_bzero_oops:
-       int     $3
-       jmp     i586_bzero_oops
-
-       frstor  0(%esp)
-       addl    $108,%esp
-       lmsw    %ax
-       movb    $0xfe,kernel_fpu_lock
-       ret
-
-i586_bz3:
-       fstp    %st(0)
-       lmsw    %ax
-       movb    $0xfe,kernel_fpu_lock
-       ret
-
-intreg_i586_bzero:
-       /*
-        * `rep stos' seems to be the best method in practice for small
-        * counts.  Fancy methods usually take too long to start up due
-        * to cache and BTB misses.
-        */
-       pushl   %edi
-       movl    %edx,%edi
-       xorl    %eax,%eax
-       shrl    $2,%ecx
-       cld
-       rep
-       stosl
-       movl    12(%esp),%ecx
-       andl    $3,%ecx
-       jne     1f
-       popl    %edi
-       ret
-
-1:
-       rep
-       stosb
-       popl    %edi
-       ret
-END(i586_bzero)
-#endif /* I586_CPU && defined(DEV_NPX) */
-
 ENTRY(sse2_pagezero)
        pushl   %ebx
        movl    8(%esp),%ecx
@@ -473,16 +188,11 @@ ENTRY(bcopyb)
        ret
 END(bcopyb)
 
-ENTRY(bcopy)
-       MEXITCOUNT
-       jmp     *bcopy_vector
-END(bcopy)
-
 /*
- * generic_bcopy(src, dst, cnt)
+ * bcopy(src, dst, cnt)
  *  w...@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
  */
-ENTRY(generic_bcopy)
+ENTRY(bcopy)
        pushl   %esi
        pushl   %edi
        movl    12(%esp),%esi
@@ -526,157 +236,7 @@ ENTRY(generic_bcopy)
        popl    %esi
        cld
        ret
-END(generic_bcopy)
-
-#if defined(I586_CPU) && defined(DEV_NPX)
-ENTRY(i586_bcopy)
-       pushl   %esi
-       pushl   %edi
-       movl    12(%esp),%esi
-       movl    16(%esp),%edi
-       movl    20(%esp),%ecx
-
-       movl    %edi,%eax
-       subl    %esi,%eax
-       cmpl    %ecx,%eax                       /* overlapping && src < dst? */
-       jb      1f
-
-       cmpl    $1024,%ecx
-       jb      small_i586_bcopy
-
-       sarb    $1,kernel_fpu_lock
-       jc      small_i586_bcopy
-       cmpl    $0,PCPU(FPCURTHREAD)
-       je      i586_bc1
-
-       /* XXX turn off handling of cases 1-2, as above. */
-       movb    $0xfe,kernel_fpu_lock
-       jmp     small_i586_bcopy
-
-       smsw    %dx
-       clts
-       subl    $108,%esp
-       fnsave  0(%esp)
-       jmp     4f
-
-i586_bc1:
-       smsw    %dx
-       clts
-       fninit                          /* XXX should avoid needing this */
-
-       ALIGN_TEXT
-4:
-       pushl   %ecx
-#define        DCACHE_SIZE     8192
-       cmpl    $(DCACHE_SIZE-512)/2,%ecx
-       jbe     2f
-       movl    $(DCACHE_SIZE-512)/2,%ecx
-2:
-       subl    %ecx,0(%esp)
-       cmpl    $256,%ecx
-       jb      5f                      /* XXX should prefetch if %ecx >= 32 */
-       pushl   %esi
-       pushl   %ecx
-       ALIGN_TEXT
-3:
-       movl    0(%esi),%eax
-       movl    32(%esi),%eax
-       movl    64(%esi),%eax
-       movl    96(%esi),%eax
-       movl    128(%esi),%eax
-       movl    160(%esi),%eax
-       movl    192(%esi),%eax
-       movl    224(%esi),%eax
-       addl    $256,%esi
-       subl    $256,%ecx
-       cmpl    $256,%ecx
-       jae     3b
-       popl    %ecx
-       popl    %esi
-5:
-       ALIGN_TEXT
-large_i586_bcopy_loop:
-       fildq   0(%esi)
-       fildq   8(%esi)
-       fildq   16(%esi)
-       fildq   24(%esi)
-       fildq   32(%esi)
-       fildq   40(%esi)
-       fildq   48(%esi)
-       fildq   56(%esi)
-       fistpq  56(%edi)
-       fistpq  48(%edi)
-       fistpq  40(%edi)
-       fistpq  32(%edi)
-       fistpq  24(%edi)
-       fistpq  16(%edi)
-       fistpq  8(%edi)
-       fistpq  0(%edi)
-       addl    $64,%esi
-       addl    $64,%edi
-       subl    $64,%ecx
-       cmpl    $64,%ecx
-       jae     large_i586_bcopy_loop
-       popl    %eax
-       addl    %eax,%ecx
-       cmpl    $64,%ecx
-       jae     4b
-
-       cmpl    $0,PCPU(FPCURTHREAD)
-       je      i586_bc2
-
-       /* XXX check that the condition for cases 1-2 stayed false. */
-i586_bcopy_oops:
-       int     $3
-       jmp     i586_bcopy_oops
-
-       frstor  0(%esp)
-       addl    $108,%esp
-i586_bc2:
-       lmsw    %dx
-       movb    $0xfe,kernel_fpu_lock
-
-/*
- * This is a duplicate of the main part of generic_bcopy.  See the comments
- * there.  Jumping into generic_bcopy would cost a whole 0-1 cycles and
- * would mess up high resolution profiling.
- */
-       ALIGN_TEXT
-small_i586_bcopy:
-       shrl    $2,%ecx
-       cld
-       rep
-       movsl
-       movl    20(%esp),%ecx
-       andl    $3,%ecx
-       rep
-       movsb
-       popl    %edi
-       popl    %esi
-       ret
-
-       ALIGN_TEXT
-1:
-       addl    %ecx,%edi
-       addl    %ecx,%esi
-       decl    %edi
-       decl    %esi
-       andl    $3,%ecx
-       std
-       rep
-       movsb
-       movl    20(%esp),%ecx
-       shrl    $2,%ecx
-       subl    $3,%esi
-       subl    $3,%edi
-       rep
-       movsl
-       popl    %edi
-       popl    %esi
-       cld
-       ret
-END(i586_bcopy)
-#endif /* I586_CPU && defined(DEV_NPX) */
+END(bcopy)
 
 /*
  * Note: memcpy does not support overlapping copies
@@ -723,11 +283,6 @@ END(memcpy)
  * copyout(from_kernel, to_user, len)  - MP SAFE
  */
 ENTRY(copyout)
-       MEXITCOUNT
-       jmp     *copyout_vector
-END(copyout)
-
-ENTRY(generic_copyout)
        movl    PCPU(CURPCB),%eax
        movl    $copyout_fault,PCB_ONFAULT(%eax)
        pushl   %esi
@@ -764,10 +319,6 @@ ENTRY(generic_copyout)
        /* bcopy(%esi, %edi, %ebx) */
        movl    %ebx,%ecx
 
-#if defined(I586_CPU) && defined(DEV_NPX)
-       ALIGN_TEXT
-slow_copyout:
-#endif
        shrl    $2,%ecx
        cld
        rep
@@ -785,7 +336,7 @@ done_copyout:
        movl    PCPU(CURPCB),%edx
        movl    %eax,PCB_ONFAULT(%edx)
        ret
-END(generic_copyout)
+END(copyout)
 
        ALIGN_TEXT
 copyout_fault:
@@ -797,70 +348,10 @@ copyout_fault:
        movl    $EFAULT,%eax
        ret
 
-#if defined(I586_CPU) && defined(DEV_NPX)
-ENTRY(i586_copyout)
-       /*
-        * Duplicated from generic_copyout.  Could be done a bit better.
-        */
-       movl    PCPU(CURPCB),%eax
-       movl    $copyout_fault,PCB_ONFAULT(%eax)
-       pushl   %esi
-       pushl   %edi
-       pushl   %ebx
-       movl    16(%esp),%esi
-       movl    20(%esp),%edi
-       movl    24(%esp),%ebx
-       testl   %ebx,%ebx                       /* anything to do? */
-       jz      done_copyout
-
-       /*
-        * Check explicitly for non-user addresses.  If 486 write protection
-        * is being used, this check is essential because we are in kernel
-        * mode so the h/w does not provide any protection against writing
-        * kernel addresses.
-        */
-
-       /*
-        * First, prevent address wrapping.
-        */
-       movl    %edi,%eax
-       addl    %ebx,%eax
-       jc      copyout_fault
-/*
- * XXX STOP USING VM_MAXUSER_ADDRESS.
- * It is an end address, not a max, so every time it is used correctly it
- * looks like there is an off by one error, and of course it caused an off
- * by one error in several places.
- */
-       cmpl    $VM_MAXUSER_ADDRESS,%eax
-       ja      copyout_fault
-
-       /* bcopy(%esi, %edi, %ebx) */
-3:
-       movl    %ebx,%ecx
-       /*
-        * End of duplicated code.
-        */
-
-       cmpl    $1024,%ecx
-       jb      slow_copyout
-
-       pushl   %ecx
-       call    fastmove
-       addl    $4,%esp
-       jmp     done_copyout
-END(i586_copyout)
-#endif /* I586_CPU && defined(DEV_NPX) */
-
 /*
  * copyin(from_user, to_kernel, len) - MP SAFE
  */
 ENTRY(copyin)
-       MEXITCOUNT
-       jmp     *copyin_vector
-END(copyin)
-
-ENTRY(generic_copyin)
        movl    PCPU(CURPCB),%eax
        movl    $copyin_fault,PCB_ONFAULT(%eax)
        pushl   %esi
@@ -878,10 +369,6 @@ ENTRY(generic_copyin)
        cmpl    $VM_MAXUSER_ADDRESS,%edx
        ja      copyin_fault
 
-#if defined(I586_CPU) && defined(DEV_NPX)
-       ALIGN_TEXT
-slow_copyin:
-#endif
        movb    %cl,%al
        shrl    $2,%ecx                         /* copy longword-wise */
        cld
@@ -892,17 +379,13 @@ slow_copyin:
        rep
        movsb
 
-#if defined(I586_CPU) && defined(DEV_NPX)
-       ALIGN_TEXT
-done_copyin:
-#endif
        popl    %edi
        popl    %esi
        xorl    %eax,%eax
        movl    PCPU(CURPCB),%edx
        movl    %eax,PCB_ONFAULT(%edx)
        ret
-END(generic_copyin)
+END(copyin)
 
        ALIGN_TEXT
 copyin_fault:
@@ -913,250 +396,6 @@ copyin_fault:
        movl    $EFAULT,%eax
        ret
 
-#if defined(I586_CPU) && defined(DEV_NPX)
-ENTRY(i586_copyin)
-       /*
-        * Duplicated from generic_copyin.  Could be done a bit better.
-        */
-       movl    PCPU(CURPCB),%eax
-       movl    $copyin_fault,PCB_ONFAULT(%eax)
-       pushl   %esi
-       pushl   %edi
-       movl    12(%esp),%esi                   /* caddr_t from */
-       movl    16(%esp),%edi                   /* caddr_t to */
-       movl    20(%esp),%ecx                   /* size_t  len */
-
-       /*
-        * make sure address is valid
-        */
-       movl    %esi,%edx
-       addl    %ecx,%edx
-       jc      copyin_fault
-       cmpl    $VM_MAXUSER_ADDRESS,%edx
-       ja      copyin_fault
-       /*
-        * End of duplicated code.
-        */
-
-       cmpl    $1024,%ecx
-       jb      slow_copyin
-
-       pushl   %ebx                    /* XXX prepare for fastmove_fault */
-       pushl   %ecx
-       call    fastmove
-       addl    $8,%esp
-       jmp     done_copyin
-END(i586_copyin)
-#endif /* I586_CPU && defined(DEV_NPX) */
-
-#if defined(I586_CPU) && defined(DEV_NPX)
-/* fastmove(src, dst, len)
-       src in %esi
-       dst in %edi
-       len in %ecx             XXX changed to on stack for profiling
-       uses %eax and %edx for tmp. storage
- */
-/* XXX use ENTRY() to get profiling.  fastmove() is actually a non-entry. */
-ENTRY(fastmove)
-       pushl   %ebp
-       movl    %esp,%ebp
-       subl    $PCB_SAVEFPU_SIZE+3*4,%esp
-
-       movl    8(%ebp),%ecx
-       cmpl    $63,%ecx
-       jbe     fastmove_tail
-
-       testl   $7,%esi /* check if src addr is multiple of 8 */
-       jnz     fastmove_tail
-
-       testl   $7,%edi /* check if dst addr is multiple of 8 */
-       jnz     fastmove_tail
-
-       /* XXX grab FPU context atomically. */
-       cli
-
-/* if (fpcurthread != NULL) { */
-       cmpl    $0,PCPU(FPCURTHREAD)
-       je      6f
-/*    fnsave(&curpcb->pcb_savefpu); */
-       movl    PCPU(CURPCB),%eax
-       fnsave  PCB_SAVEFPU(%eax)
-/*   FPCURTHREAD = NULL; */
-       movl    $0,PCPU(FPCURTHREAD)
-/* } */
-6:
-/* now we own the FPU. */
-
-/*
- * The process' FP state is saved in the pcb, but if we get
- * switched, the cpu_switch() will store our FP state in the
- * pcb.  It should be possible to avoid all the copying for
- * this, e.g., by setting a flag to tell cpu_switch() to
- * save the state somewhere else.
- */
-/* tmp = curpcb->pcb_savefpu; */
-       movl    %ecx,-12(%ebp)
-       movl    %esi,-8(%ebp)
-       movl    %edi,-4(%ebp)
-       movl    %esp,%edi
-       movl    PCPU(CURPCB),%esi
-       addl    $PCB_SAVEFPU,%esi
-       cld
-       movl    $PCB_SAVEFPU_SIZE>>2,%ecx
-       rep
-       movsl
-       movl    -12(%ebp),%ecx
-       movl    -8(%ebp),%esi
-       movl    -4(%ebp),%edi
-/* stop_emulating(); */
-       clts
-/* fpcurthread = curthread; */
-       movl    PCPU(CURTHREAD),%eax
-       movl    %eax,PCPU(FPCURTHREAD)
-       movl    PCPU(CURPCB),%eax
-
-       /* XXX end of atomic FPU context grab. */
-       sti
-
-       movl    $fastmove_fault,PCB_ONFAULT(%eax)
-4:
-       movl    %ecx,-12(%ebp)
-       cmpl    $1792,%ecx
-       jbe     2f
-       movl    $1792,%ecx
-2:
-       subl    %ecx,-12(%ebp)
-       cmpl    $256,%ecx
-       jb      5f
-       movl    %ecx,-8(%ebp)
-       movl    %esi,-4(%ebp)
-       ALIGN_TEXT
-3:
-       movl    0(%esi),%eax
-       movl    32(%esi),%eax
-       movl    64(%esi),%eax
-       movl    96(%esi),%eax
-       movl    128(%esi),%eax
-       movl    160(%esi),%eax
-       movl    192(%esi),%eax
-       movl    224(%esi),%eax
-       addl    $256,%esi
-       subl    $256,%ecx
-       cmpl    $256,%ecx
-       jae     3b
-       movl    -8(%ebp),%ecx
-       movl    -4(%ebp),%esi
-5:
-       ALIGN_TEXT
-fastmove_loop:
-       fildq   0(%esi)
-       fildq   8(%esi)
-       fildq   16(%esi)
-       fildq   24(%esi)
-       fildq   32(%esi)
-       fildq   40(%esi)
-       fildq   48(%esi)
-       fildq   56(%esi)
-       fistpq  56(%edi)
-       fistpq  48(%edi)
-       fistpq  40(%edi)
-       fistpq  32(%edi)
-       fistpq  24(%edi)
-       fistpq  16(%edi)
-       fistpq  8(%edi)
-       fistpq  0(%edi)
-       addl    $-64,%ecx
-       addl    $64,%esi
-       addl    $64,%edi
-       cmpl    $63,%ecx
-       ja      fastmove_loop
-       movl    -12(%ebp),%eax
-       addl    %eax,%ecx
-       cmpl    $64,%ecx
-       jae     4b
-
-       /* XXX ungrab FPU context atomically. */
-       cli
-
-/* curpcb->pcb_savefpu = tmp; */
-       movl    %ecx,-12(%ebp)
-       movl    %esi,-8(%ebp)
-       movl    %edi,-4(%ebp)
-       movl    PCPU(CURPCB),%edi
-       addl    $PCB_SAVEFPU,%edi
-       movl    %esp,%esi
-       cld
-       movl    $PCB_SAVEFPU_SIZE>>2,%ecx
-       rep
-       movsl
-       movl    -12(%ebp),%ecx
-       movl    -8(%ebp),%esi
-       movl    -4(%ebp),%edi
-
-/* start_emulating(); */
-       smsw    %ax
-       orb     $CR0_TS,%al
-       lmsw    %ax
-/* fpcurthread = NULL; */
-       movl    $0,PCPU(FPCURTHREAD)
-
-       /* XXX end of atomic FPU context ungrab. */
-       sti
-
-       ALIGN_TEXT
-fastmove_tail:
-       movl    PCPU(CURPCB),%eax
-       movl    $fastmove_tail_fault,PCB_ONFAULT(%eax)
-
-       movb    %cl,%al
-       shrl    $2,%ecx                         /* copy longword-wise */
-       cld
-       rep
-       movsl
-       movb    %al,%cl
-       andb    $3,%cl                          /* copy remaining bytes */
-       rep
-       movsb
-
-       movl    %ebp,%esp
-       popl    %ebp
-       ret
-
-       ALIGN_TEXT
-fastmove_fault:
-       /* XXX ungrab FPU context atomically. */
-       cli
-
-       movl    PCPU(CURPCB),%edi
-       addl    $PCB_SAVEFPU,%edi
-       movl    %esp,%esi
-       cld
-       movl    $PCB_SAVEFPU_SIZE>>2,%ecx
-       rep
-       movsl
-
-       smsw    %ax
-       orb     $CR0_TS,%al
-       lmsw    %ax
-       movl    $0,PCPU(FPCURTHREAD)
-
-       /* XXX end of atomic FPU context ungrab. */
-       sti
-
-fastmove_tail_fault:
-       movl    %ebp,%esp
-       popl    %ebp
-       addl    $8,%esp
-       popl    %ebx
-       popl    %edi
-       popl    %esi
-       movl    PCPU(CURPCB),%edx
-       movl    $0,PCB_ONFAULT(%edx)
-       movl    $EFAULT,%eax
-       ret
-END(fastmove)
-#endif /* I586_CPU && defined(DEV_NPX) */
-
 /*
  * casuword.  Compare and set user word.  Returns -1 or the current value.
  */

Modified: head/sys/i386/include/md_var.h
==============================================================================
--- head/sys/i386/include/md_var.h      Wed Jun 23 10:06:57 2010        (r209459)
+++ head/sys/i386/include/md_var.h      Wed Jun 23 10:40:28 2010        (r209460)
@@ -36,11 +36,6 @@
  * Miscellaneous machine-dependent declarations.
  */
 
-extern void    (*bcopy_vector)(const void *from, void *to, size_t len);
-extern void    (*bzero_vector)(void *buf, size_t len);
-extern int     (*copyin_vector)(const void *udaddr, void *kaddr, size_t len);
-extern int     (*copyout_vector)(const void *kaddr, void *udaddr, size_t len);
-
 extern long    Maxmem;
 extern u_int   basemem;        /* PA of original top of base memory */
 extern int     busdma_swi_pending;
@@ -98,11 +93,6 @@ void dump_add_page(vm_paddr_t);
 void   dump_drop_page(vm_paddr_t);
 void   enable_sse(void);
 void   fillw(int /*u_short*/ pat, void *base, size_t cnt);
-void   i486_bzero(void *buf, size_t len);
-void   i586_bcopy(const void *from, void *to, size_t len);
-void   i586_bzero(void *buf, size_t len);
-int    i586_copyin(const void *udaddr, void *kaddr, size_t len);
-int    i586_copyout(const void *kaddr, void *udaddr, size_t len);
 void   i686_pagezero(void *addr);
 void   sse2_pagezero(void *addr);
 void   init_AMD_Elan_sc520(void);

Modified: head/sys/i386/isa/npx.c
==============================================================================
--- head/sys/i386/isa/npx.c     Wed Jun 23 10:06:57 2010        (r209459)
+++ head/sys/i386/isa/npx.c     Wed Jun 23 10:40:28 2010        (r209460)
@@ -85,11 +85,6 @@ __FBSDID("$FreeBSD$");
  * 387 and 287 Numeric Coprocessor Extension (NPX) Driver.
  */
 
-/* Configuration flags. */
-#define        NPX_DISABLE_I586_OPTIMIZED_BCOPY        (1 << 0)
-#define        NPX_DISABLE_I586_OPTIMIZED_BZERO        (1 << 1)
-#define        NPX_DISABLE_I586_OPTIMIZED_COPYIO       (1 << 2)
-
 #if defined(__GNUCLIKE_ASM) && !defined(lint)
 
 #define        fldcw(addr)             __asm("fldcw %0" : : "m" (*(addr)))
@@ -168,10 +163,6 @@ static     int     npx_attach(device_t dev);
 static void    npx_identify(driver_t *driver, device_t parent);
 static int     npx_intr(void *);
 static int     npx_probe(device_t dev);
-#ifdef I586_CPU_XXX
-static long    timezero(const char *funcname,
-                   void (*func)(void *buf, size_t len));
-#endif /* I586_CPU */
 
 int    hw_float;               /* XXX currently just alias for npx_exists */
 
@@ -442,22 +433,8 @@ npx_attach(dev)
                bzero(npx_initialstate.sv_87.sv_ac,
                    sizeof(npx_initialstate.sv_87.sv_ac));
        intr_restore(s);
-#ifdef I586_CPU_XXX
-       if (cpu_class == CPUCLASS_586 && npx_ex16 &&
-           timezero("i586_bzero()", i586_bzero) <
-           timezero("bzero()", bzero) * 4 / 5) {
-               if (!(flags & NPX_DISABLE_I586_OPTIMIZED_BCOPY))
-                       bcopy_vector = i586_bcopy;
-               if (!(flags & NPX_DISABLE_I586_OPTIMIZED_BZERO))
-                       bzero_vector = i586_bzero;
-               if (!(flags & NPX_DISABLE_I586_OPTIMIZED_COPYIO)) {
-                       copyin_vector = i586_copyin;
-                       copyout_vector = i586_copyout;
-               }
-       }
-#endif
 
-       return (0);             /* XXX unused */
+       return (0);
 }
 
 /*
@@ -1085,36 +1062,6 @@ fpurstor(addr)
                frstor(addr);
 }
 
-#ifdef I586_CPU_XXX
-static long
-timezero(funcname, func)
-       const char *funcname;
-       void (*func)(void *buf, size_t len);
-
-{
-       void *buf;
-#define        BUFSIZE         1048576
-       long usec;
-       struct timeval finish, start;
-
-       buf = malloc(BUFSIZE, M_TEMP, M_NOWAIT);
-       if (buf == NULL)
-               return (BUFSIZE);
-       microtime(&start);
-       (*func)(buf, BUFSIZE);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscribe@freebsd.org"

Reply via email to