For anyone who cares.

--
Mitch Allmond
Georgia Institute of Technology, Physics & Computer Science major
[EMAIL PROTECTED]
lca13.eastnet.gatech.edu
"God does not play dice, but I do"




mitch wrote:
> 
> Where can I get the SSE P3 kernel patch for 2.2.13? Thanks.

I've attached the latest version of the patch here.  The xor patch goes on top
of the RAID-0.90 patch (Ingo Molnar's raid code).  When using this version of
the PIII patch you need to make sure you have both this and the RAID code
installed (there are some FPU use interactions that have to be taken care
of).  In case you don't have it, I've attached Ingo's last RAID patch.  The
order of application should be the RAID patch, followed by the two PIII patches.
These should all apply cleanly to a 2.2.14 kernel.  The RAID patch will have
problems applying to a 2.2.13 kernel.

-- 
  Doug Ledford   <[EMAIL PROTECTED]>
   Opinions expressed are my own, but
      they should be everybody's.
--- linux/init/main.c.PIII      Sun Dec  5 14:23:13 1999
+++ linux/init/main.c   Sun Dec  5 14:23:14 1999
@@ -99,6 +99,7 @@
 #ifdef __i386__
 extern void ioapic_pirq_setup(char *str, int *ints);
 extern void ioapic_setup(char *str, int *ints);
+extern void x86_serial_nr_setup(char *str, int *ints);
 #endif
 extern void no_scroll(char *str, int *ints);
 extern void kbd_reset_setup(char *str, int *ints);
@@ -581,6 +582,9 @@
        { "noapic", ioapic_setup },
        { "pirq=", ioapic_pirq_setup },
 #endif
+#endif
+#ifdef __i386__
+       { "x86_serial_nr", x86_serial_nr_setup },
 #endif
 #ifdef CONFIG_BLK_DEV_RAM
        { "ramdisk_start=", ramdisk_start_setup },
--- linux/include/asm-i386/bugs.h.PIII  Mon Aug  9 15:04:57 1999
+++ linux/include/asm-i386/bugs.h       Sun Dec  5 14:23:14 1999
@@ -18,6 +18,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/stddef.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 
@@ -69,6 +70,45 @@
 #endif
                return;
        }
+#ifdef CONFIG_X86_FX
+       /*
+        * If we got so far we can safely turn on FXSAVE/FXRESTORE,
+        * but make sure we are 16-byte aligned first.
+        */
+       if (offsetof(struct task_struct, tss.i387.hard.fxsave.fxcwd) & 15) {
+               /*
+                * This triggers a link-time error if we manage to
+                * break alignment somehow.
+                */
+               extern void __buggy_fxsr_alignment(void);
+
+               __buggy_fxsr_alignment();
+       }
+       if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) {
+               printk("Enabling extended fast FPU save and restore...");
+               set_in_cr4(X86_CR4_OSFXSR);
+               printk("done.\n");
+       }
+       /*
+        * Note, Katmai instructions are enabled as soon as you start
+        * using the FXSAVE/RESTORE stuff.  This setting only
+        * indicates support for the masked/unmasked exceptions on
+        * the new PIII cpus. We don't have an exception 19 handler
+        * for this yet, but we set this bit anyway.  It'll kill us
+        * the first time we take an unmasked KNI exception, but since
+        * no userland apps currently use KNI, it isn't an issue yet.
+        * We should have the handler added by then.
+        */
+       if (boot_cpu_data.x86_capability & X86_FEATURE_XMM) {
+               printk("Not enabling KNI unmasked exception support\n");
+               printk("Exception 19 error handler not integrated yet\n");
+#if 0
+               set_in_cr4(X86_CR4_OSXMMEXCPT);
+               printk("done.\n");
+#endif
+       }
+#endif
+       disable_serial_nr();
        if (mca_pentium_flag) {
                /* The IBM Model 95 machines with pentiums lock up on
                 * fpu test, so we avoid it. All pentiums have inbuilt
@@ -117,23 +157,23 @@
                return;
        if (!ignore_irq13) {
                printk("OK, FPU using old IRQ 13 error reporting\n");
-               return;
+       } else {
+               __asm__("fninit\n\t"
+                       "fldl %1\n\t"
+                       "fdivl %2\n\t"
+                       "fmull %2\n\t"
+                       "fldl %1\n\t"
+                       "fsubp %%st,%%st(1)\n\t"
+                       "fistpl %0\n\t"
+                       "fwait\n\t"
+                       "fninit"
+                       : "=m" (*&boot_cpu_data.fdiv_bug)
+                       : "m" (*&x), "m" (*&y));
+               if (!boot_cpu_data.fdiv_bug)
+                       printk("OK, FPU using exception 16 error reporting.\n");
+               else
+                       printk("Hmm, FPU using exception 16 error reporting with FDIV bug.\n");
        }
-       __asm__("fninit\n\t"
-               "fldl %1\n\t"
-               "fdivl %2\n\t"
-               "fmull %2\n\t"
-               "fldl %1\n\t"
-               "fsubp %%st,%%st(1)\n\t"
-               "fistpl %0\n\t"
-               "fwait\n\t"
-               "fninit"
-               : "=m" (*&boot_cpu_data.fdiv_bug)
-               : "m" (*&x), "m" (*&y));
-       if (!boot_cpu_data.fdiv_bug)
-               printk("OK, FPU using exception 16 error reporting.\n");
-       else
-               printk("Hmm, FPU using exception 16 error reporting with FDIV bug.\n");
 }
 
 __initfunc(static void check_hlt(void))
@@ -419,5 +459,7 @@
        check_amd_k6();
        check_pentium_f00f();
        check_cyrix_coma();
+       boot_cpu_data.enable_fixups = 1; /* should be safe to use MMX/MMX2 */
+                                        /* kernel functions now */
        system_utsname.machine[1] = '0' + boot_cpu_data.x86;
 }
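
A note on the alignment check added near the top of the bugs.h hunk above: the call to the undefined __buggy_fxsr_alignment() is a link-time assertion.  Here's a minimal sketch of the idiom with hypothetical names (not part of the patch); it relies on the tested expression being a compile-time constant so gcc can drop the dead branch:

/*
 * The function is declared but never defined.  When the constant
 * condition is false, the compiler eliminates the branch and no
 * reference to it is ever emitted.  If the condition ever becomes
 * true, the build fails at link time with an "undefined reference"
 * error instead of silently running with a misaligned fxsave area.
 */
extern void __layout_assumption_broken(void);   /* deliberately undefined */

#define ASSERT_AT_BUILD_TIME(cond)              \
do {                                            \
        if (!(cond))                            \
                __layout_assumption_broken();   \
} while (0)

/* e.g. ASSERT_AT_BUILD_TIME((offsetof(struct task_struct, tss.i387) & 15) == 0); */
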
--- linux/include/asm-i386/i387.h.PIII  Sun Dec  5 14:23:14 1999
+++ linux/include/asm-i386/i387.h       Sun Dec  5 14:23:14 1999
@@ -0,0 +1,313 @@
+/*
+ * include/asm-i386/i387.h
+ *
+ * Copyright (c) 1999 Doug Ledford <[EMAIL PROTECTED]>
+ *
+ * Made from various code bits pulled from other files
+ * in order to put things together in a way that made
+ * sense.
+ *
+ * FX/FPU support:
+ * Copyright (c) 1999 Ingo Molnar <[EMAIL PROTECTED]>,
+ *                   Gabriel Paubert <[EMAIL PROTECTED]>
+ */
+
+#ifndef __ASM_I386_I387_H
+#define __ASM_I386_I387_H
+
+extern int i387_hard_to_user ( struct user_i387_struct * user,
+       union i387_hard_union * hard);
+extern int i387_user_to_hard ( union i387_hard_union * hard,
+       struct user_i387_struct * user);
+
+/*
+ * Fill out the reserved bits, treat it as an fsave struct since the
+ * union makes this work for both fsave and fxsave structs.
+ */
+#ifdef CONFIG_X86_FX
+
+#define i387_save_hard(x) \
+do { \
+       if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \
+               __asm__ __volatile__("fxsave %0" \
+                                    : "=m" ((x).hard.fxsave.fxcwd)); \
+       } else { \
+               __asm__ __volatile__("fnsave %0; fwait;" \
+                                    : "=m" ((x).hard.fsave.cwd)); \
+       } \
+} while(0)
+
+#define i387_restore_hard(x) \
+do { \
+       if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \
+                __asm__ __volatile__("fxrstor %0" \
+                                    : \
+                                    : "m" ((x).hard.fxsave.fxcwd)); \
+       } else { \
+               __asm__ __volatile__("frstor %0" \
+                                    : \
+                                    :"m" ((x).hard.fsave.cwd)); \
+       } \
+} while(0)
+
+#define i387_set_cwd(x,v) \
+do { \
+       if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \
+               (x).fxsave.fxcwd = (short)(v); \
+       } else { \
+               (x).fsave.cwd = ((long)(v) | 0xffff0000); \
+       } \
+} while(0)
+
+#define i387_set_swd(x,v) \
+do { \
+       if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \
+               (x).fxsave.fxswd = (short)(v); \
+       } else { \
+               (x).fsave.swd = ((long)(v) | 0xffff0000); \
+       } \
+} while(0)
+
+#define i387_set_twd(x,v) \
+do { \
+       if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \
+               (x).fxsave.fxtwd = (short)(v); \
+       } else { \
+               (x).fsave.twd = ((long)(v) | 0xffff0000); \
+       } \
+} while(0)
+
+static inline unsigned short fputag_KNI_to_387(unsigned char tb) {
+       unsigned short tw = tb;
+       tw = (tw | (tw << 4)) & 0x0f0f; /* zzzz7654zzzz3210 */
+       tw = (tw | (tw << 2)) & 0x3333; /* zz76zz54zz32zz10 */
+       tw = (tw | (tw << 1)) & 0x5555; /* z7z6z5z4z3z2z1z0 */
+       tw = ~(tw * 3);
+       return tw;
+}
+
+static inline unsigned char fputag_387_to_KNI(unsigned short tw) {
+       tw = ~tw & 0x5555;              /* z7z6z5z4z3z2z1z0 */
+       tw = (tw | (tw >> 1)) & 0x3333; /* zz76zz54zz32zz10 */
+       tw = (tw | (tw >> 2)) & 0x0f0f; /* zzzz7654zzzz3210 */
+       tw = (tw | (tw >> 4)) & 0x00ff; /* zzzzzzzz76543210 */
+       return tw;
+}
+
+#else /* CONFIG_X86_FX */
+
+#define i387_save_hard(x) \
+do { \
+       __asm__ __volatile__("fnsave %0; fwait;" \
+                            : "=m" ((x).hard.fsave.cwd)); \
+} while(0)
+
+#define i387_restore_hard(x) \
+do { \
+       __asm__ __volatile__("frstor %0" \
+                            : \
+                            :"m" ((x).hard.fsave.cwd)); \
+} while(0)
+
+#define i387_set_cwd(x,v) \
+do { (x).fsave.cwd = ((long)(v) | 0xffff0000); } while(0)
+
+#define i387_set_swd(x,v) \
+do { (x).fsave.swd = ((long)(v) | 0xffff0000); } while(0)
+
+#define i387_set_twd(x,v) \
+do { (x).fsave.twd = ((long)(v) | 0xffff0000); } while(0)
+
+#endif /* CONFIG_X86_FX */
+
+/*
+ * FPU lazy state save handling..
+ */
+#define save_kern_fpu(tsk) do { \
+       if(tsk->tss.mmx_reg_space != NULL) \
+               __asm__("movq %%mm0, 0x00(%0)\n\t" \
+                       "movq %%mm1, 0x08(%0)\n\t" \
+                       "movq %%mm2, 0x10(%0)\n\t" \
+                       "movq %%mm3, 0x18(%0)\n\t" \
+                       :: "r" (tsk->tss.mmx_reg_space):"memory"); \
+       if(tsk->tss.kni_reg_space != NULL) \
+               __asm__("movups %%xmm0, 0x00(%0)\n\t" \
+                       "movups %%xmm1, 0x10(%0)\n\t" \
+                       "movups %%xmm2, 0x20(%0)\n\t" \
+                       "movups %%xmm3, 0x30(%0)\n\t" \
+                       :: "r" (tsk->tss.kni_reg_space):"memory"); \
+} while (0)
+
+#define unlazy_fpu(tsk) do { \
+       if (tsk->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY) { \
+               save_kern_fpu(tsk); \
+               if (!(tsk->flags & PF_USEDFPU)) { \
+                       stts(); \
+               } \
+       } \
+       if (tsk->flags & PF_USEDFPU) { \
+               if (!(tsk->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED)) { \
+                       i387_save_hard(tsk->tss.i387); \
+               } \
+               tsk->flags &= ~PF_USEDFPU; \
+               stts(); \
+       } \
+} while (0)
+
+#define clear_fpu(tsk) do { \
+       if ( (tsk->flags & PF_USEDFPU) || \
+            (tsk->tss.x86_fpustate) ) { \
+               tsk->flags &= ~PF_USEDFPU; \
+               tsk->tss.x86_fpustate = 0; \
+               stts(); \
+       } \
+} while (0)
+
+/*
+ * For when we want to use the FPU in kernel code
+ * 
+ * These functions allow the use of up to 4 KNI based xmm registers on the
+ * Pentium III processors or up to 4 MMX registers on Pentium MMX and above
+ * or compatible processors.  Pick the routines that you need based on the
+ * regs you are going to use.  Keep in mind that these are intended to be
+ * used only after you've verified that the processor supports these
+ * operations.  Use them before you've done that and watch your machine go
+ * boom.  Take a look in arch/i386/lib/best_function.c for an example of
+ * how to fixup the kernel with kni/mmx using functions once the CPU
+ * capabilities have been determined.
+ *
+ * In all of these functions:
+ *
+ *   recursive - int, used to determine what the state is at restore time
+ *   regs - char * to an array that is 32 bytes for mmx and 64 bytes for kni
+ *          which is then used to save off the contents of the current
+ *          regs to be recursively safe
+ *   task_switch_regs - char * to another array of the same size as the one
+ *          above, but this array is optional.  If your function might get 
+ *          pre-empted by another task then this pointer should be non-NULL
+ *          so that at unlazy_fpu() time in the switch_to() function we
+ *          can save your register state (copy_*_user functions are an example
+ *          of functions that need this, since they can take a page fault and
+ *          while that fault is being serviced the scheduler is free to run
+ *          another task entirely).
+ *   irqflags - unsigned long used to store IRQ state
+ */
+
+#define SAVE_MMX_REGS(regs) \
+       __asm__ __volatile__("movq %%mm0, 0x00(%0)\n\t" \
+                            "movq %%mm1, 0x08(%0)\n\t" \
+                            "movq %%mm2, 0x10(%0)\n\t" \
+                            "movq %%mm3, 0x18(%0)\n\t" \
+                            : : "r" ((regs)) : "memory" );
+
+#define RESTORE_MMX_REGS(regs) \
+       __asm__ __volatile__("movq 0x00(%0), %%mm0\n\t" \
+                            "movq 0x08(%0), %%mm1\n\t" \
+                            "movq 0x10(%0), %%mm2\n\t" \
+                            "movq 0x18(%0), %%mm3\n\t" \
+                            : : "r" ((regs)));
+
+#define SAVE_KNI_REGS(regs) \
+       __asm__ __volatile__("movups %%xmm0, 0x00(%0)\n\t" \
+                            "movups %%xmm1, 0x10(%0)\n\t" \
+                            "movups %%xmm2, 0x20(%0)\n\t" \
+                            "movups %%xmm3, 0x30(%0)\n\t" \
+                            : : "r" ((regs)) : "memory" );
+
+#define RESTORE_KNI_REGS(regs) \
+       __asm__ __volatile__("movups 0x00(%0), %%xmm0\n\t" \
+                            "movups 0x10(%0), %%xmm1\n\t" \
+                            "movups 0x20(%0), %%xmm2\n\t" \
+                            "movups 0x30(%0), %%xmm3\n\t" \
+                            : : "r" ((regs)));
+
+#define SFENCE() \
+       __asm__ __volatile__("sfence":::"memory")
+
+
+extern spinlock_t kern_fpu_lock;
+
+/*
+ * Although it seems wasteful to do a unilateral clts() in the take_fpu
+ * functions, the reason I did it that way is because the alternative is
+ * to test for:
+ *
+ * if ( ( (current->flags & PF_USEDFPU) &&
+ *        (current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) ) ||
+ *      ( !(current->flags & PF_USEDFPU) &&
+ *        !(current->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY) ) )
+ *
+ */
+
+#define kernel_take_fpu_mmx(recursive, regs, task_switch_regs, irqflags) do { \
+       spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \
+       clts(); \
+       (recursive) = (current->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY); \
+       if ( (current->flags & PF_USEDFPU) && \
+           !(current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) ){ \
+               i387_save_hard(current->tss.i387); \
+               current->tss.x86_fpustate |= X86_FPUSTATE_USER_SAVED; \
+       } \
+       if ((recursive) & X86_FPUSTATE_KERN_MMX) { \
+               SAVE_MMX_REGS((regs)); \
+       } else { \
+               current->tss.mmx_reg_space = (task_switch_regs); \
+               current->tss.x86_fpustate |= X86_FPUSTATE_KERN_MMX; \
+       } \
+       spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \
+} while (0)
+
+#define kernel_release_fpu_mmx(recursive, regs, irqflags) do { \
+       spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \
+       if ((recursive) & X86_FPUSTATE_KERN_MMX) { \
+               RESTORE_MMX_REGS((regs)); \
+       } else { \
+               current->tss.x86_fpustate &= ~X86_FPUSTATE_KERN_MMX; \
+               current->tss.mmx_reg_space = NULL; \
+       } \
+       if ((recursive) == 0) { \
+               stts(); \
+       } \
+       spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \
+} while (0)
+
+#define kernel_take_fpu_kni(recursive, regs, task_switch_regs, irqflags) do { \
+       spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \
+       clts(); \
+       (recursive) = current->tss.x86_fpustate; \
+       if ( (current->flags & PF_USEDFPU) || \
+            (current->tss.x86_fpustate & X86_FPUSTATE_KERN_KNI) ) { \
+               SAVE_KNI_REGS((regs)); \
+       } \
+       if (!(current->tss.x86_fpustate & X86_FPUSTATE_KERN_KNI)) { \
+               current->tss.kni_reg_space = (task_switch_regs); \
+               current->tss.x86_fpustate |= X86_FPUSTATE_KERN_KNI; \
+       } \
+       spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \
+} while (0)
+               
+       
+#define kernel_release_fpu_kni(recursive, regs, irqflags) do { \
+       spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \
+       if ( (current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) && \
+            !(((recursive) & X86_FPUSTATE_USER_SAVED) && \
+               (current->flags & PF_USEDFPU)) ) { \
+               i387_restore_hard(current->tss.i387); \
+               current->tss.x86_fpustate &= ~X86_FPUSTATE_USER_SAVED; \
+       } \
+       if ( ((recursive) & X86_FPUSTATE_KERN_KNI) || \
+            (current->flags & PF_USEDFPU) ) { \
+               RESTORE_KNI_REGS((regs)); \
+       } \
+       if (((recursive) & X86_FPUSTATE_KERN_KNI) == 0) { \
+               current->tss.x86_fpustate &= ~X86_FPUSTATE_KERN_KNI; \
+               current->tss.kni_reg_space = NULL; \
+       } \
+       if ( ((recursive) == 0) && ((current->flags & PF_USEDFPU) == 0) ) { \
+               stts(); \
+       } \
+       spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \
+} while (0)
+
+
+#endif /* __ASM_I386_I387_H */
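
The long comment block in i387.h above lays out the calling convention for the kernel_take_fpu_*/kernel_release_fpu_* pairs; the real users live in arch/i386/lib/ further down.  Purely as an illustration of that convention (a hypothetical function, not part of the patch, assuming <asm/i387.h> is included and that the caller has already checked boot_cpu_data.x86_capability for X86_FEATURE_MMX):

static void example_mmx_zero(char *dst, size_t len)
{
        unsigned long irqflags;
        int recursive = 0;
        char mmx_space[32];     /* scratch for the 4 MMX regs, per the comment above */

        /*
         * task_switch_regs is NULL here because nothing in the loop can
         * fault, so switch_to() never has to save our registers for us.
         */
        kernel_take_fpu_mmx(recursive, &mmx_space[0], NULL, irqflags);

        __asm__ __volatile__("pxor %%mm0,%%mm0" : : );
        for ( ; len >= 8; dst += 8, len -= 8)
                __asm__ __volatile__("movq %%mm0,(%0)" : : "r" (dst) : "memory");

        kernel_release_fpu_mmx(recursive, &mmx_space[0], irqflags);
}
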
--- linux/include/asm-i386/processor.h.PIII     Tue May 11 13:35:44 1999
+++ linux/include/asm-i386/processor.h  Sun Dec  5 14:23:14 1999
@@ -7,10 +7,11 @@
 #ifndef __ASM_I386_PROCESSOR_H
 #define __ASM_I386_PROCESSOR_H
 
+#include <linux/config.h>
 #include <asm/vm86.h>
 #include <asm/math_emu.h>
-#include <asm/segment.h>
 #include <asm/page.h>
+#include <asm/user.h>
 
 /*
  *  CPU type and hardware bug flags. Kept separately for each CPU.
@@ -29,6 +30,7 @@
        char    rfu;
        int     cpuid_level;    /* Maximum supported CPUID level, -1=no CPUID */
        __u32   x86_capability;
+       __u32   mmu_cr4_features;
        char    x86_vendor_id[16];
        char    x86_model_id[64];
        int     x86_cache_size;  /* in KB - valid for CPUS which support this
@@ -36,6 +38,7 @@
        int     fdiv_bug;
        int     f00f_bug;
        int     coma_bug;
+       int     enable_fixups;
        unsigned long loops_per_sec;
        unsigned long *pgd_quick;
        unsigned long *pte_quick;
@@ -70,16 +73,16 @@
 #define X86_FEATURE_PGE                0x00002000      /* Page Global Enable */
 #define X86_FEATURE_MCA                0x00004000      /* Machine Check Architecture */
 #define X86_FEATURE_CMOV       0x00008000      /* CMOV instruction (FCMOVCC and FCOMI too if FPU present) */
-#define X86_FEATURE_PAT        0x00010000      /* Page Attribute Table */
+#define X86_FEATURE_PAT                0x00010000      /* Page Attribute Table */
 #define X86_FEATURE_PSE36      0x00020000      /* 36-bit PSEs */
-#define X86_FEATURE_18         0x00040000
+#define X86_FEATURE_PN         0x00040000      /* 96 bit CPU serial # */
 #define X86_FEATURE_19         0x00080000
 #define X86_FEATURE_20         0x00100000
 #define X86_FEATURE_21         0x00200000
 #define X86_FEATURE_22         0x00400000
 #define X86_FEATURE_MMX                0x00800000      /* multimedia extensions */
 #define X86_FEATURE_FXSR       0x01000000      /* FXSAVE and FXRSTOR instructions (fast save and restore of FPU context), and CR4.OSFXSR (OS uses these instructions) available */
-#define X86_FEATURE_25         0x02000000
+#define X86_FEATURE_XMM                0x02000000      /* Intel MMX2 instruction set */
 #define X86_FEATURE_26         0x04000000
 #define X86_FEATURE_27         0x08000000
 #define X86_FEATURE_28         0x10000000
@@ -89,6 +92,82 @@
 
 extern struct cpuinfo_x86 boot_cpu_data;
 
+#define X86_CR4_VME            0x0001  /* enable vm86 extensions */
+#define X86_CR4_PVI            0x0002  /* virtual interrupts flag enable */
+#define X86_CR4_TSD            0x0004  /* disable time stamp at ipl 3 */
+#define X86_CR4_DE             0x0008  /* enable debugging extensions */
+#define X86_CR4_PSE            0x0010  /* enable page size extensions */
+#define X86_CR4_PAE            0x0020  /* enable physical address extensions */
+#define X86_CR4_MCE            0x0040  /* Machine check enable */
+#define X86_CR4_PGE            0x0080  /* enable global pages */
+#define X86_CR4_PCE            0x0100  /* enable performance counters at ipl 3 */
+#define X86_CR4_OSFXSR         0x0200  /* fast FPU save/restore */
+#define X86_CR4_OSXMMEXCPT     0x0400  /* KNI (MMX2) unmasked exception 16 */
+                                       /* handler is available */
+
+/*
+ * Some defines for using with the x86_fpu_state variable in the new
+ * thread struct.  We use these because the rest of the kernel doesn't
+ * like us messing with current->flags at arbitrary times ;-)
+ */
+#define X86_FPUSTATE_USER_SAVED        0x0001
+#define X86_FPUSTATE_KERN_ANY  0x0006
+#define X86_FPUSTATE_KERN_MMX  0x0002
+#define X86_FPUSTATE_KERN_KNI  0x0004
+
+/*
+ * Save the cr4 feature set we're using (ie
+ * Pentium 4MB enable and PPro Global page
+ * enable), so that any CPU's that boot up
+ * after us can get the correct flags.
+ */
+
+static inline void set_in_cr4(unsigned long mask)
+{
+       boot_cpu_data.mmu_cr4_features |= mask;
+       __asm__("movl %%cr4,%%eax\n\t"
+               "orl %0,%%eax\n\t"
+               "movl %%eax,%%cr4\n"
+               : : "irg" (mask)
+               :"ax");
+}
+
+extern int disable_x86_serial_nr;
+
+static inline void disable_serial_nr(void)
+{
+       if ( disable_x86_serial_nr && 
+           (boot_cpu_data.x86_capability & X86_FEATURE_PN) ) {
+               printk("Disabling CPUID Serial number...");
+               __asm__ __volatile__( "movl $0x119,%%ecx\n\t"
+                               "rdmsr\n\t"
+                               "orl $0x00200000,%%eax\n\t"
+                               "wrmsr":::"ax","dx","cx","memory");
+               /*
+                * We might need to re-read the x86 capability set now to
+                * make sure that the PN bit has been turned off so
+                * we know that the serial number stuff is disabled
+                *
+                * Note: we don't need to re-read the registers.  We can tell
+                * by rebooting that the flag is off since on reboots that
+                * don't power the machine down the serial number doesn't
+                * get disabled any more because it already is disabled.
+                */
+               printk("done.\n");
+       }
+}
+
+static inline void load_default_mxcsr(void)
+{
+       long mxcsr = 0x1f80;
+
+       if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+            (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+               __asm__("ldmxcsr %0": :"m" (mxcsr));
+       }
+}
+
+
 #ifdef __SMP__
 extern struct cpuinfo_x86 cpu_data[];
 #define current_cpu_data cpu_data[smp_processor_id()]
@@ -171,36 +250,61 @@
  */
 #define IO_BITMAP_SIZE 32
 
-struct i387_hard_struct {
-       long    cwd;
-       long    swd;
-       long    twd;
-       long    fip;
-       long    fcs;
-       long    foo;
-       long    fos;
-       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
-       long    status;         /* software status information */
+struct i387_hard_fsave {
+       long     cwd;
+       long     swd;
+       long     twd;
+       long     fip;
+       long     fcs;
+       long     foo;
+       long     fos;
+       long     st_space[20];     /* 8*10 bytes for each FP-reg = 80 bytes */
+};
+
+/*
+ * has to be 128-bit aligned
+ */
+struct i387_hard_fxsave {
+       unsigned short fxcwd;
+       unsigned short fxswd;
+       unsigned short fxtwd;
+       unsigned short fxfopcode;
+       long     fxfip;
+       short    fxfcs;
+       short    __reserved_00;
+       long     fxfoo;
+       short    fxfos;
+       short    __reserved_01;
+       long     mxcsr;
+       long     __reserved_02;
+       long     st_space[32];     /* 8*16 bytes for each FP/MMX-reg = 128 bytes */
+       long     xmm_space[32];    /* 8*16 bytes for each XMM-reg = 128 bytes */
+       long     __reserved_03 [14*4]; /* 14 16byte lines for remainder */
+} __attribute__ ((aligned (16)));
+
+union i387_hard_union {
+       struct i387_hard_fxsave    fxsave;
+       struct i387_hard_fsave     fsave;
 };
 
 struct i387_soft_struct {
-       long    cwd;
-       long    swd;
-       long    twd;
-       long    fip;
-       long    fcs;
-       long    foo;
-       long    fos;
-       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
-       unsigned char   ftop, changed, lookahead, no_update, rm, alimit;
-       struct info     *info;
-       unsigned long   entry_eip;
+       long     cwd;
+       long     swd;
+       long     twd;
+       long     fip;
+       long     fcs;
+       long     foo;
+       long     fos;
+       long     st_space[20];     /* 8*10 bytes for each FP-reg = 80 bytes */
+       unsigned char     ftop, changed, lookahead, no_update, rm, alimit;
+       struct info       *info;
+       unsigned long     entry_eip;
 };
 
 union i387_union {
-       struct i387_hard_struct hard;
+       union i387_hard_union hard;
        struct i387_soft_struct soft;
-};
+} __attribute__ ((aligned(16)));
 
 typedef struct {
        unsigned long seg;
@@ -242,6 +346,10 @@
        struct vm86_struct * vm86_info;
        unsigned long screen_bitmap;
        unsigned long v86flags, v86mask, v86mode, saved_esp0;
+       volatile long x86_fpustate;
+       char *mmx_reg_space;
+       char *kni_reg_space;
+
 };
 
 #define INIT_MMAP \
@@ -263,8 +371,9 @@
        {~0, }, /* ioperm */                                    \
        _TSS(0), 0, 0, 0, (mm_segment_t) { 0 }, /* obsolete */  \
        { 0, },                                                 \
-       { { 0, }, },  /* 387 state */                           \
+       { { { 0, }, }, },  /* 387 state */                      \
        NULL, 0, 0, 0, 0, 0, /* vm86_info */                    \
+       0, NULL, NULL /* fpustate, mmx, and xmm_reg_space */    \
 }
 
 #define start_thread(regs, new_eip, new_esp) do {              \
@@ -289,27 +398,6 @@
 extern void copy_segments(int nr, struct task_struct *p, struct mm_struct * mm);
 extern void release_segments(struct mm_struct * mm);
 extern void forget_segments(void);
-
-/*
- * FPU lazy state save handling..
- */
-#define save_fpu(tsk) do { \
-       asm volatile("fnsave %0\n\tfwait":"=m" (tsk->tss.i387)); \
-       tsk->flags &= ~PF_USEDFPU; \
-       stts(); \
-} while (0)
-
-#define unlazy_fpu(tsk) do { \
-       if (tsk->flags & PF_USEDFPU) \
-               save_fpu(tsk); \
-} while (0)
-
-#define clear_fpu(tsk) do { \
-       if (tsk->flags & PF_USEDFPU) { \
-               tsk->flags &= ~PF_USEDFPU; \
-               stts(); \
-       } \
-} while (0)
 
 /*
  * Return saved PC of a blocked thread.
--- linux/include/asm-i386/string.h.PIII        Thu Apr 22 12:59:46 1999
+++ linux/include/asm-i386/string.h     Sun Dec  5 14:23:14 1999
@@ -14,6 +14,10 @@
 #include <asm/string-486.h>
 #else
 
+#ifndef _LINUX_CONFIG_H
+#include <linux/config.h>
+#endif
+
 /*
  * This string-include defines all string functions as inline
  * functions. Use gcc. It also assumes ds=es=data space, this should be
@@ -293,10 +297,21 @@
 }
 
 #define __HAVE_ARCH_MEMCPY
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+extern void * __kni_memcpy(void * to, const void * from, size_t n);
+extern void * best_memcpy(void * to, const void * from, size_t n);
+#define memcpy(t, f, n) \
+(__builtin_constant_p(n) ? \
+ (((n) < 128) ? \
+ __constant_memcpy((t),(f),(n)) : \
+ best_memcpy((t),(f),(n))) : \
+ best_memcpy((t),(f),(n)))
+#else
 #define memcpy(t, f, n) \
 (__builtin_constant_p(n) ? \
  __constant_memcpy((t),(f),(n)) : \
  __memcpy((t),(f),(n)))
+#endif
 
 #define __HAVE_ARCH_MEMMOVE
 extern inline void * memmove(void * dest,const void * src, size_t n)
@@ -449,21 +464,32 @@
 #undef COMMON
 }
 
-#define __constant_c_x_memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_c_and_count_memset((s),(c),(count)) : \
- __constant_c_memset((s),(c),(count)))
+#define __constant_x_count_memset(s, c, count) \
+(__builtin_constant_p(c) ? \
+ __constant_c_and_count_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) :\
+ __constant_count_memset((s),(c),(count)))
 
 #define __memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_count_memset((s),(c),(count)) : \
+(__builtin_constant_p(c) ? \
+ __constant_c_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) : \
  __memset_generic((s),(c),(count)))
 
 #define __HAVE_ARCH_MEMSET
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+extern void * __kni_memset(void * s, char c, size_t count);
+extern void * best_memset(void * s, char c, size_t count);
 #define memset(s, c, count) \
-(__builtin_constant_p(c) ? \
- __constant_c_x_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) : \
+(__builtin_constant_p(count) ? \
+ (((count) < 128) ? \
+ __constant_x_count_memset((s),(c),(count)) : \
+ best_memset((s),(c),(count))) : \
+ best_memset((s),(c),(count)))
+#else
+#define memset(s, c, count) \
+(__builtin_constant_p(count) ? \
+ __constant_x_count_memset((s),(c),(count)) : \
  __memset((s),(c),(count)))
+#endif
 
 /*
  * find the first occurrence of byte 'c', or 1 past the area if none
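
To make the new dispatch above concrete, here is how a hypothetical caller (illustration only, not part of the patch, assuming <linux/string.h> is included) hits the three cases when CONFIG_X86_CPU_OPTIMIZATIONS is enabled; the 128-byte cutoff matches the early-out inside kni_memcpy()/kni_memset() below, so small copies never pay the FPU save/restore cost:

void dispatch_example(void *dst, const void *src, size_t runtime_len)
{
        static char small_buf[64], big_buf[512];

        memcpy(dst, small_buf, sizeof(small_buf)); /* constant, < 128: inline __constant_memcpy() */
        memcpy(dst, big_buf, sizeof(big_buf));     /* constant, >= 128: best_memcpy() */
        memcpy(dst, src, runtime_len);             /* size unknown at compile time: best_memcpy() */
}
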
--- linux/include/asm-i386/uaccess.h.PIII       Tue Oct 19 20:14:02 1999
+++ linux/include/asm-i386/uaccess.h    Sun Dec  5 14:23:14 1999
@@ -571,19 +571,61 @@
        return n;
 }
 
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+
+/*
+ * The XMM based copy_*_user() function declarations...the best_*_user()
+ * routines need this
+ */
+unsigned long kni_copy_to_user(void *, const void *, unsigned long);
+unsigned long kni_copy_from_user(void *, const void *, unsigned long);
+unsigned long __kni_copy_to_user_nocheck(void *, const void *, unsigned long);
+unsigned long __kni_copy_from_user_nocheck(void *, const void *, unsigned long);
+
+unsigned long best_copy_to_user(void *, const void *, unsigned long);
+unsigned long best_copy_from_user(void *, const void *, unsigned long);
+unsigned long __best_copy_to_user(void *, const void *, unsigned long);
+unsigned long __best_copy_from_user(void *, const void *, unsigned long);
+
 #define copy_to_user(to,from,n)                                \
        (__builtin_constant_p(n) ?                      \
+       (((n) < 128) ?                                  \
         __constant_copy_to_user((to),(from),(n)) :     \
-        __generic_copy_to_user((to),(from),(n)))
+        best_copy_to_user((to),(from),(n))) :          \
+        best_copy_to_user((to),(from),(n)))
 
 #define copy_from_user(to,from,n)                      \
        (__builtin_constant_p(n) ?                      \
+       (((n) < 128) ?                                  \
         __constant_copy_from_user((to),(from),(n)) :   \
-        __generic_copy_from_user((to),(from),(n)))
+        best_copy_from_user((to),(from),(n))) :        \
+        best_copy_from_user((to),(from),(n)))
 
-#define copy_to_user_ret(to,from,n,retval) ({ if (copy_to_user(to,from,n)) return retval; })
+#define __copy_to_user(to,from,n)                      \
+       (__builtin_constant_p(n) ?                      \
+       (((n) < 128) ?                                  \
+        __constant_copy_to_user_nocheck((to),(from),(n)) :     \
+        __best_copy_to_user((to),(from),(n))) :        \
+        __best_copy_to_user((to),(from),(n)))
 
-#define copy_from_user_ret(to,from,n,retval) ({ if (copy_from_user(to,from,n)) return retval; })
+#define __copy_from_user(to,from,n)                    \
+       (__builtin_constant_p(n) ?                      \
+       (((n) < 128) ?                                  \
+        __constant_copy_from_user_nocheck((to),(from),(n)) :   \
+        __best_copy_from_user((to),(from),(n))) :      \
+        __best_copy_from_user((to),(from),(n)))
+
+#else /* CONFIG_X86_CPU_OPTIMIZATIONS */
+
+#define copy_to_user(to,from,n)                                \
+       (__builtin_constant_p(n) ?                      \
+        __constant_copy_to_user((to),(from),(n)) :     \
+        __generic_copy_to_user((to),(from),(n)))
+
+#define copy_from_user(to,from,n)                      \
+       (__builtin_constant_p(n) ?                      \
+        __constant_copy_from_user((to),(from),(n)) :   \
+        __generic_copy_from_user((to),(from),(n)))
 
 #define __copy_to_user(to,from,n)                      \
        (__builtin_constant_p(n) ?                      \
@@ -594,6 +636,11 @@
        (__builtin_constant_p(n) ?                      \
         __constant_copy_from_user_nocheck((to),(from),(n)) :   \
         __generic_copy_from_user_nocheck((to),(from),(n)))
+#endif
+
+#define copy_to_user_ret(to,from,n,retval) ({ if (copy_to_user(to,from,n)) return retval; })
+
+#define copy_from_user_ret(to,from,n,retval) ({ if (copy_from_user(to,from,n)) return retval; })
 
 long strncpy_from_user(char *dst, const char *src, long count);
 long __strncpy_from_user(char *dst, const char *src, long count);
--- linux/include/asm-i386/io.h.PIII    Tue May 11 13:36:03 1999
+++ linux/include/asm-i386/io.h Sun Dec  5 14:23:14 1999
@@ -157,9 +157,9 @@
 #define writew(b,addr) (*(volatile unsigned short *) __io_virt(addr) = (b))
 #define writel(b,addr) (*(volatile unsigned int *) __io_virt(addr) = (b))
 
-#define memset_io(a,b,c)       memset(__io_virt(a),(b),(c))
-#define memcpy_fromio(a,b,c)   memcpy((a),__io_virt(b),(c))
-#define memcpy_toio(a,b,c)     memcpy(__io_virt(a),(b),(c))
+#define memset_io(a,b,c)       __memset_generic(__io_virt(a),(b),(c))
+#define memcpy_fromio(a,b,c)   __memcpy((a),__io_virt(b),(c))
+#define memcpy_toio(a,b,c)     __memcpy(__io_virt(a),(b),(c))
 
 /*
  * Again, i386 does not require mem IO specific function.
--- linux/arch/i386/mm/init.c.PIII      Tue Oct 19 20:14:00 1999
+++ linux/arch/i386/mm/init.c   Sun Dec  5 14:23:14 1999
@@ -184,34 +184,6 @@
 extern char _text, _etext, _edata, __bss_start, _end;
 extern char __init_begin, __init_end;
 
-#define X86_CR4_VME            0x0001          /* enable vm86 extensions */
-#define X86_CR4_PVI            0x0002          /* virtual interrupts flag enable */
-#define X86_CR4_TSD            0x0004          /* disable time stamp at ipl 3 */
-#define X86_CR4_DE             0x0008          /* enable debugging extensions */
-#define X86_CR4_PSE            0x0010          /* enable page size extensions */
-#define X86_CR4_PAE            0x0020          /* enable physical address extensions */
-#define X86_CR4_MCE            0x0040          /* Machine check enable */
-#define X86_CR4_PGE            0x0080          /* enable global pages */
-#define X86_CR4_PCE            0x0100          /* enable performance counters at ipl 3 */
-
-/*
- * Save the cr4 feature set we're using (ie
- * Pentium 4MB enable and PPro Global page
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
- */
-unsigned long mmu_cr4_features __initdata = 0;
-
-static inline void set_in_cr4(unsigned long mask)
-{
-       mmu_cr4_features |= mask;
-       __asm__("movl %%cr4,%%eax\n\t"
-               "orl %0,%%eax\n\t"
-               "movl %%eax,%%cr4\n"
-               : : "irg" (mask)
-               :"ax");
-}
-
 /*
  * allocate page table(s) for compile-time fixed mappings
  */
--- linux/arch/i386/lib/Makefile.PIII   Sun Dec 27 13:33:13 1998
+++ linux/arch/i386/lib/Makefile        Sun Dec  5 14:23:14 1999
@@ -9,4 +9,8 @@
 L_OBJS  = checksum.o old-checksum.o semaphore.o delay.o \
        usercopy.o getuser.o putuser.o
 
+ifeq ($(CONFIG_X86_CPU_OPTIMIZATIONS),y)
+       L_OBJS += best_function.o simd.o
+endif
+
 include $(TOPDIR)/Rules.make
--- linux/arch/i386/lib/best_function.c.PIII    Sun Dec  5 14:23:14 1999
+++ linux/arch/i386/lib/best_function.c Sun Dec  5 14:23:31 1999
@@ -0,0 +1,196 @@
+/*
+ * SIMD functions.  These replace the functions in asm-i386/string.h
+ * whenever it makes sense.  These also un-inline those functions.
+ *
+ * Copyright 1999, Doug Ledford <[EMAIL PROTECTED]>
+ *
+ * These functions are simple and trivial, consider them to be
+ * public domain
+ */
+
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+
+/*
+ * We declare our accelerator functions here since this is the only place
+ * that needs the declarations which makes a header file a pain to deal
+ * with
+ */
+extern void * kni_memcpy(void *, const void *, size_t);
+extern void * kni_memset(void *, char, size_t);
+extern unsigned long kni_copy_to_user(void *, const void *, unsigned long);
+extern unsigned long kni_copy_from_user(void *, const void *, unsigned long);
+extern unsigned long __kni_copy_to_user_nocheck(void *, const void *, unsigned long);
+extern unsigned long __kni_copy_from_user_nocheck(void *, const void *, unsigned long);
+
+static void * best_memcpy_final(void *, const void *, size_t);
+static void * best_memset_final(void *, char, size_t);
+static unsigned long best_copy_to_user_final(void *, const void *, unsigned long);
+static unsigned long best_copy_from_user_final(void *, const void *, unsigned long);
+static unsigned long __best_copy_to_user_final(void *, const void *, unsigned long);
+static unsigned long __best_copy_from_user_final(void *, const void *, unsigned long);
+
+void * best_memcpy(void * to, const void * from, size_t n)
+{
+       int BAR = (int)__builtin_return_address(0);
+       int *caller = (int *)BAR - 1;
+       if(boot_cpu_data.enable_fixups) {
+           if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+                (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+                       *caller = (int)kni_memcpy - BAR;
+                       return(kni_memcpy(to, from, n));
+               } else {
+                       *caller = (int)best_memcpy_final - BAR;
+                       return(__memcpy(to, from, n));
+               }
+       } else {
+               return(__memcpy(to, from, n));
+       }
+}
+
+static void * best_memcpy_final(void * to, const void * from, size_t n)
+{
+       return(__memcpy(to, from, n));
+}
+
+void * best_memset(void * s, char c, size_t count)
+{
+       int BAR = (int)__builtin_return_address(0);
+       int *caller = (int *)BAR - 1;
+       if(boot_cpu_data.enable_fixups) {
+           if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+                (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+                       *caller = (int)kni_memset - BAR;
+                       return(kni_memset(s, c, count));
+               } else {
+                       *caller = (int)best_memset_final - BAR;
+                       return(__memset_generic(s, c, count));
+               }
+       } else {
+               return(__memset_generic(s, c, count));
+       }
+}
+
+static void * best_memset_final(void * s, char c, size_t count)
+{
+       return(__memset_generic(s, c, count));
+}
+
+unsigned long
+best_copy_to_user(void *to, const void *from, unsigned long n)
+{
+       int BAR = (int)__builtin_return_address(0);
+       int *caller = (int *)BAR - 1;
+       if(boot_cpu_data.enable_fixups) {
+           if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+                (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+                       *caller = (int)kni_copy_to_user - BAR;
+                       return(kni_copy_to_user(to, from, n));
+               } else {
+                       *caller = (int)best_copy_to_user_final - BAR;
+                       return(best_copy_to_user_final(to, from, n));
+               }
+       } else {
+               if (access_ok(VERIFY_WRITE, to, n)) {
+                       __copy_user(to,from,n);
+               }
+               return n;
+       }
+}
+
+static unsigned long
+best_copy_to_user_final(void *to, const void *from, unsigned long n)
+{
+       if (access_ok(VERIFY_WRITE, to, n)) {
+               __copy_user(to,from,n);
+       }
+       return n;
+}
+
+unsigned long
+best_copy_from_user(void *to, const void *from, unsigned long n)
+{
+       int BAR = (int)__builtin_return_address(0);
+       int *caller = (int *)BAR - 1;
+       if(boot_cpu_data.enable_fixups) {
+           if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+                (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+                       *caller = (int)kni_copy_from_user - BAR;
+                       return(kni_copy_from_user(to, from, n));
+               } else {
+                       *caller = (int)best_copy_from_user_final - BAR;
+                       return(best_copy_from_user_final(to, from, n));
+               }
+       } else {
+               if (access_ok(VERIFY_READ, from, n)) {
+                       __copy_user_zeroing(to,from,n);
+               }
+               return n;
+       }
+}
+
+static unsigned long
+best_copy_from_user_final(void *to, const void *from, unsigned long n)
+{
+       if (access_ok(VERIFY_READ, from, n)) {
+               __copy_user_zeroing(to,from,n);
+       }
+       return n;
+}
+
+unsigned long
+__best_copy_to_user(void *to, const void *from, unsigned long n)
+{
+       int BAR = (int)__builtin_return_address(0);
+       int *caller = (int *)BAR - 1;
+       if(boot_cpu_data.enable_fixups) {
+           if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+                (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+                       *caller = (int)__kni_copy_to_user_nocheck - BAR;
+                       return(__kni_copy_to_user_nocheck(to, from, n));
+               } else {
+                       *caller = (int)__best_copy_to_user_final - BAR;
+                       return(__best_copy_to_user_final(to, from, n));
+               }
+       } else {
+               __copy_user(to,from,n);
+               return n;
+       }
+}
+
+static unsigned long
+__best_copy_to_user_final(void *to, const void *from, unsigned long n)
+{
+       __copy_user(to,from,n);
+       return n;
+}
+
+unsigned long
+__best_copy_from_user(void *to, const void *from, unsigned long n)
+{
+       int BAR = (int)__builtin_return_address(0);
+       int *caller = (int *)BAR - 1;
+       if(boot_cpu_data.enable_fixups) {
+           if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+                (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+                       *caller = (int)__kni_copy_from_user_nocheck - BAR;
+                       return(__kni_copy_from_user_nocheck(to, from, n));
+               } else {
+                       *caller = (int)__best_copy_from_user_final - BAR;
+                       return(__best_copy_from_user_final(to, from, n));
+               }
+       } else {
+               __copy_user_zeroing(to,from,n);
+               return n;
+       }
+}
+
+static unsigned long
+__best_copy_from_user_final(void *to, const void *from, unsigned long n)
+{
+       __copy_user_zeroing(to,from,n);
+       return n;
+}
+
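
One non-obvious trick in best_memcpy() and friends above: on the first call from a given call site, the function rewrites the rel32 displacement of the call instruction that invoked it, so every later call from that site jumps straight to the routine chosen for this CPU.  A minimal sketch of the mechanism under the same assumptions the patch makes (i386, direct near call; the names here are illustrative only):

/*
 * A near "call rel32" stores a 32-bit displacement in the 4 bytes just
 * before the return address, and the CPU computes
 * target = return_address + displacement, which the arithmetic below
 * reproduces when redirecting our own call site to new_target.
 */
static void redirect_caller_to(void *new_target)
{
        int ret_addr = (int)__builtin_return_address(0);
        int *rel32 = (int *)ret_addr - 1;       /* the call's displacement field */

        *rel32 = (int)new_target - ret_addr;
}
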
--- linux/arch/i386/lib/simd.c.PIII     Sun Dec  5 14:23:14 1999
+++ linux/arch/i386/lib/simd.c  Sun Dec  5 14:23:14 1999
@@ -0,0 +1,435 @@
+/*
+ * SIMD functions.  These replace the functions in asm-i386/string.h
+ * whenever it makes sense.  These also un-inline those functions.
+ *
+ * Copyright 1999, Doug Ledford <[EMAIL PROTECTED]>
+ *
+ * These functions are simple and trivial, consider them to be
+ * public domain
+ */
+
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+
+extern void * kni_memcpy(void * to, const void * from, size_t n)
+{
+       unsigned long flags;
+       void *ret=to;
+       size_t size;
+       int recursive = 0;
+       char xmm_space[64];
+
+       /*
+        * If the transfer is too small, then use the generic routine.
+        */
+       if (n < 128) {
+               return(__memcpy(to, from, n));
+       }
+       kernel_take_fpu_kni(recursive,&xmm_space[0],NULL,flags);
+
+       /*
+        * Align the destination on a 16byte boundary.
+        * The source doesn't have to be aligned.
+        */
+       if ( (unsigned long)to & 0xf ) {
+               size = 0x10 - ((unsigned long)to & 0xf);
+               __asm__ __volatile__("movups (%0),%%xmm0\n\t"
+                                    "movups %%xmm0,(%1)\n\t"
+                                    :
+                                    : "r" (from),
+                                      "r" (to));
+               n -= size;
+               from += size;
+               to += size;
+       }
+       /*
+        * If the copy would have tailings, take care of them
+        * now instead of later
+        */
+       if(n & 0xf) {
+               size = n - 0x10;
+               __asm__ __volatile__("movups (%0),%%xmm0\n\t"
+                                    "movups %%xmm0,(%1)\n\t"
+                                    :
+                                    : "r" (from + size),
+                                      "r" (to + size));
+               n &= ~0xf;
+       }
+       /*
+        * Prefetch the first two cachelines now.
+        */
+       __asm__ __volatile__("prefetchnta 0x00(%0)\n\t"
+                            "prefetchnta 0x20(%0)\n\t"
+                            :
+                            : "r" (from));
+       /*
+        * Copy 32 bytes at a time.  The single unroll is good
+        * for a 30% performance boost in the copy.  Additional
+        * unrolls are not productive.  We are guaranteed to
+        * have at least 32 bytes of data to copy since the
+        * macro in string.h doesn't call into this function
+        * with less than 64 bytes of copy and we lost < 32
+        * bytes to alignment earlier.
+        */
+       while (n >= 0x20) {
+               __asm__ __volatile__(
+                                    "movups 0x00(%0),%%xmm0\n\t"
+                                    "movups 0x10(%0),%%xmm1\n\t"
+                                    "movntps %%xmm0,0x00(%1)\n\t"
+                                    "movntps %%xmm1,0x10(%1)\n\t"
+                                    : 
+                                    : "r" (from), "r" (to)
+                                    : "memory");
+               from += 0x20;
+               /*
+                * Note: Intermixing the prefetch at *exactly* this point
+                * in time has been shown to be the fastest possible.
+                * Timing these prefetch instructions is a complete black
+                * art with nothing but trial and error showing the way.
+                * To that extent, this optimum version was found by using
+                * a userland version of this routine that we clocked for
+                * lots of runs.  We then fiddled with ordering until we
+                * settled on our highest speed routines.  So, the long
+                * and short of this is, don't mess with instruction ordering
+                * here or suffer performance penalties you will.
+                */
+               __asm__ __volatile__(
+                                    "prefetchnta 0x20(%0)\n\t"
+                                    : 
+                                    : "r" (from));
+               to += 0x20;
+               n -= 0x20;
+       }
+       if (n) {
+               __asm__ __volatile__("movups 0x00(%0),%%xmm0\n\t"
+                                    "movntps %%xmm0,0x00(%1)\n\t"
+                                    : 
+                                    : "r" (from), "r" (to)
+                                    : "memory");
+       }
+       SFENCE();
+       kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+       return(ret);
+}
+
+extern void * kni_memset(void * s, char c, size_t count)
+{
+       unsigned long flags;
+       size_t size;
+       void *ret=s;
+       int recursive = 0;
+       char xmm_space[64];
+
+       /*
+        * If the transfer is too small, then use the generic routine.
+        */
+       if (count < 128) {
+               return(__memset_generic(s, c, count));
+       }
+       kernel_take_fpu_kni(recursive,&xmm_space[0],NULL,flags);
+       /*
+        * Load up our XMM register with the stuff to set mem with
+        */
+       if(c == '\0') {
+               __asm__ __volatile__("xorps %%xmm0,%%xmm0\n\t"
+                                    "movups %%xmm0,(%0)\n\t"
+                                    :
+                                    : "r" (s));
+       } else {
+               __memset_generic(s, c, 0x10);
+               __asm__ __volatile__("movups (%0),%%xmm0"
+                                    :
+                                    : "r" (s));
+       }
+       /*
+        * align the destination on a 16 byte boundary, we can simply
+        * do the math to align things since we already populated the
+        * first 16 bytes.
+        */
+       size = (0x10 - ((unsigned long)s & 0xf));
+       count -= size;
+       s += size;
+       /*
+        * On the off chance we have tailings due to alignment issues,
+        * do them now to make later more efficient
+        */
+       if(count & 0xf) {
+               __asm__ __volatile__("movups %%xmm0,(%0)"
+                                    :
+                                    : "r" (s + (count - 0x10))
+                                    : "memory");
+               count &= ~0xf;
+       }
+       /*
+        * Do the copy by plopping out the register to memory.
+        * Note: Unrolling this was *totally* unproductive.  My benchmark
+        * showed that one or two plops per iteration produced the same
+        * speed to within .06 MByte/s of speed.  Considering that the
+        * routine benchmarked at over 3000 MByte/s, .06 is not statistically
+        * significant and only doing one drop per loop simplifies 
+        * overhead of book keeping.
+        */
+       while(count) {
+               __asm__ __volatile__("movntps %%xmm0,0x00(%0)\n\t"
+                                    :
+                                    : "r" (s));
+               s += 0x10;
+               count -= 0x10;
+       }
+       SFENCE();
+       kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+       return(ret);
+}
+
+#define __kni_copy_to_user(to,from,size)                               \
+do {                                                                   \
+       int __d0, __d1, tmp, tmp2;                                      \
+       __asm__ __volatile__(                                           \
+               "       movl %1,%4\n"                                   \
+               "       andl $0xf,%4\n"                                 \
+               "       movups (%2),%%xmm0\n"                           \
+               "1:     movups %%xmm0,(%1)\n"                           \
+               "       movl $0x10,%3\n"                                \
+               "       subl %4,%3\n"                                   \
+               "       addl %3,%2\n"                                   \
+               "       addl %3,%1\n"                                   \
+               "       subl %3,%0\n"                                   \
+               "       prefetchnta 0x00(%2)\n"                         \
+               "       prefetchnta 0x20(%2)\n"                         \
+               "       jmp 200f\n"                                     \
+               "100:   movups 0x00(%2),%%xmm0\n"                       \
+               "       movups 0x10(%2),%%xmm1\n"                       \
+               "2:     movntps %%xmm0,0x00(%1)\n"                      \
+               "3:     movntps %%xmm1,0x10(%1)\n"                      \
+               "       addl $0x20,%2\n"                                \
+               "       prefetchnta 0x20(%2)\n"                         \
+               "       addl $0x20,%1\n"                                \
+               "       subl $0x20,%0\n"                                \
+               "200:   cmpl $0x1f,%0\n"                                \
+               "       ja 100b\n"                                      \
+               "       cmpl $0xf,%0\n"                                 \
+               "       jbe 300f\n"                                     \
+               "       movups 0x00(%2),%%xmm0\n"                       \
+               "4:     movntps %%xmm0,0x00(%1)\n"                      \
+               "       addl $0x10,%2\n"                                \
+               "       addl $0x10,%1\n"                                \
+               "       subl $0x10,%0\n"                                \
+               "300:   testl %0,%0\n"                                  \
+               "       je 400f\n"                                      \
+               "       movl $0x10,%3\n"                                \
+               "       subl %0,%3\n"                                   \
+               "       subl %3,%1\n"                                   \
+               "       subl %3,%2\n"                                   \
+               "       movups 0x00(%2),%%xmm0\n"                       \
+               "5:     movups %%xmm0,0x00(%1)\n"                       \
+               "       addl $0x10,%2\n"                                \
+               "       addl $0x10,%1\n"                                \
+               "       xorl %0,%0\n"                                   \
+               "400:\n"                                                \
+               ".section .fixup,\"ax\"\n"                              \
+               "6:     jmp 400b\n"                                     \
+               "7:     addl $0x10,%1\n"                                \
+               "       addl $0x10,%2\n"                                \
+               "       subl $0x10,%0\n"                                \
+               "       jmp 400b\n"                                     \
+               "8:     addl %3,%1\n"                                   \
+               "       addl %3,%2\n"                                   \
+               "       jmp 400b\n"                                     \
+               ".previous\n"                                           \
+               ".section __ex_table,\"a\"\n"                           \
+               "       .align 4\n"                                     \
+               "       .long 1b,6b\n"                                  \
+               "       .long 2b,6b\n"                                  \
+               "       .long 3b,7b\n"                                  \
+               "       .long 4b,6b\n"                                  \
+               "       .long 5b,8b\n"                                  \
+               ".previous"                                             \
+               : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(tmp),   \
+                 "=r"(tmp2)                                            \
+               : "0"(size), "1"(to), "2"(from)                         \
+               : "memory");                                            \
+} while (0)
+
+#define __kni_copy_from_user(to,from,size)                             \
+do {                                                                   \
+       int __d0, __d1, tmp, tmp2;                                      \
+       __asm__ __volatile__(                                           \
+               "       movl %1,%4\n"                                   \
+               "       andl $0xf,%4\n"                                 \
+               "1:     movups (%2),%%xmm0\n"                           \
+               "       movups %%xmm0,(%1)\n"                           \
+               "       movl $0x10,%3\n"                                \
+               "       subl %4,%3\n"                                   \
+               "       addl %3,%2\n"                                   \
+               "       addl %3,%1\n"                                   \
+               "       subl %3,%0\n"                                   \
+               "       prefetchnta 0x00(%2)\n"                         \
+               "       prefetchnta 0x20(%2)\n"                         \
+               "       jmp 100f\n"                                     \
+               "2:     movups 0x00(%2),%%xmm0\n"                       \
+               "3:     movups 0x10(%2),%%xmm1\n"                       \
+               "       movntps %%xmm0,0x00(%1)\n"                      \
+               "       movntps %%xmm1,0x10(%1)\n"                      \
+               "       addl $0x20,%2\n"                                \
+               "       prefetchnta 0x20(%2)\n"                         \
+               "       addl $0x20,%1\n"                                \
+               "       subl $0x20,%0\n"                                \
+               "100:   cmpl $0x1f,%0\n"                                \
+               "       ja 2b\n"                                        \
+               "       cmpl $0xf,%0\n"                                 \
+               "       jbe 200f\n"                                     \
+               "4:     movups 0x00(%2),%%xmm0\n"                       \
+               "       movntps %%xmm0,0x00(%1)\n"                      \
+               "       addl $0x10,%2\n"                                \
+               "       addl $0x10,%1\n"                                \
+               "       subl $0x10,%0\n"                                \
+               "200:   testl %0,%0\n"                                  \
+               "       je 300f\n"                                      \
+               "       movl $0x10,%3\n"                                \
+               "       subl %0,%3\n"                                   \
+               "       subl %3,%1\n"                                   \
+               "       subl %3,%2\n"                                   \
+               "5:     movups 0x00(%2),%%xmm0\n"                       \
+               "       movups %%xmm0,0x00(%1)\n"                       \
+               "       addl $0x10,%2\n"                                \
+               "       addl $0x10,%1\n"                                \
+               "       xorl %0,%0\n"                                   \
+               "300:\n"                                                \
+               ".section .fixup,\"ax\"\n"                              \
+               "6:     xorps %%xmm0,%%xmm0\n"                          \
+               "       movups %%xmm0,(%1)\n"                           \
+               "       movl $0x10,%3\n"                                \
+               "       subl %4,%3\n"                                   \
+               "       addl %3,%1\n"                                   \
+               "       movl %3,%4\n"                                   \
+               "       movl %0,%3\n"                                   \
+               "       subl %4,%3\n"                                   \
+               "       jmp 600f\n"                                     \
+               "7:     subl $0x10,%0\n"                                \
+               "       addl $0x10,%1\n"                                \
+               "400:   movl %0,%3\n"                                   \
+               "       xorps %%xmm0,%%xmm0\n"                          \
+               "       jmp 600f\n"                                     \
+               "500:   movntps %%xmm0,0x00(%1)\n"                      \
+               "       movntps %%xmm0,0x10(%1)\n"                      \
+               "       addl $0x20,%1\n"                                \
+               "       subl $0x20,%3\n"                                \
+               "600:   cmpl $0x1f,%3\n"                                \
+               "       ja 500b\n"                                      \
+               "       cmpl $0xf,%3\n"                                 \
+               "       jbe 700f\n"                                     \
+               "       movntps %%xmm0,0x00(%1)\n"                      \
+               "       addl $0x10,%1\n"                                \
+               "       subl $0x10,%3\n"                                \
+               "700:   testl %3,%3\n"                                  \
+               "       je 300b\n"                                      \
+               "       xorl %4,%4\n"                                   \
+               "       movb %4,(%1)\n"                                 \
+               "       inc %1\n"                                       \
+               "       dec %3\n"                                       \
+               "       jmp 700b\n"                                     \
+               "8:     addl %3,%1\n"                                   \
+               "       movl %0,%3\n"                                   \
+               "       jmp 700b\n"                                     \
+               ".previous\n"                                           \
+               ".section __ex_table,\"a\"\n"                           \
+               "       .align 4\n"                                     \
+               "       .long 1b,6b\n"                                  \
+               "       .long 2b,400b\n"                                \
+               "       .long 3b,7b\n"                                  \
+               "       .long 4b,400b\n"                                \
+               "       .long 5b,8b\n"                                  \
+               ".previous"                                             \
+               : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(tmp),   \
+                 "=q"(tmp2)                                            \
+               : "0"(size), "1"(to), "2"(from)                         \
+               : "memory");                                            \
+} while (0)
+
+
+unsigned long
+__kni_copy_to_user_nocheck(void *to, const void *from, unsigned long n)
+{
+       unsigned long flags;
+       int recursive = 0;
+       char xmm_space[64];
+       char xmm_reg_space[64]; /* in case we switch context */
+
+       if (n >= 128) {
+               kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags);
+               __kni_copy_to_user(to,from,n);
+               SFENCE();
+               kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+       } else {
+               __copy_user(to,from,n);
+       }
+       return n;
+}
+
+unsigned long
+__kni_copy_from_user_nocheck(void *to, const void *from, unsigned long n)
+{
+       unsigned long flags;
+       int recursive = 0;
+       char xmm_space[64];
+       char xmm_reg_space[64]; /* in case we switch context */
+
+       if (n >= 128) {
+               kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags);
+               __kni_copy_from_user(to,from,n);
+               SFENCE();
+               kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+       } else {
+               __copy_user_zeroing(to,from,n);
+       }
+       return n;
+}
+
+
+
+unsigned long
+kni_copy_to_user(void *to, const void *from, unsigned long n)
+{
+       unsigned long flags;
+       int recursive = 0;
+       char xmm_space[64];
+       char xmm_reg_space[64]; /* in case we switch context */
+
+       if (access_ok(VERIFY_WRITE, to, n)) {
+               if (n >= 128) {
+                       kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags);
+                       __kni_copy_to_user(to,from,n);
+                       SFENCE();
+                       kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+               } else {
+                       __copy_user(to,from,n);
+               }
+       }
+       return n;
+}
+
+unsigned long
+kni_copy_from_user(void *to, const void *from, unsigned long n)
+{
+       unsigned long flags;
+       int recursive = 0;
+       char xmm_space[64];
+       char xmm_reg_space[64]; /* in case we switch context */
+
+       if (access_ok(VERIFY_READ, from, n)) {
+               if (n >= 128) {
+                       kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags);
+                       __kni_copy_from_user(to,from,n);
+                       SFENCE();
+                       kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+               } else {
+                       __copy_user_zeroing(to,from,n);
+               }
+       }
+       return n;
+}
+
+
--- linux/arch/i386/kernel/head.S.PIII  Fri Jan 15 01:57:25 1999
+++ linux/arch/i386/kernel/head.S       Sun Dec  5 14:23:14 1999
@@ -14,7 +14,6 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
-
 #define CL_MAGIC_ADDR  0x90020
 #define CL_MAGIC       0xA33F
 #define CL_BASE_ADDR   0x90000
@@ -32,7 +31,8 @@
 #define X86_HARD_MATH  CPU_PARAMS+6
 #define X86_CPUID      CPU_PARAMS+8
 #define X86_CAPABILITY CPU_PARAMS+12
-#define X86_VENDOR_ID  CPU_PARAMS+16
+#define X86_MMU_CR4    CPU_PARAMS+16
+#define X86_VENDOR_ID  CPU_PARAMS+20
 
 /*
  * swapper_pg_dir is the main page directory, address 0x00101000
@@ -59,9 +59,8 @@
  *     NOTE! We have to correct for the fact that we're
  *     not yet offset PAGE_OFFSET..
  */
-#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
        movl %cr4,%eax          # Turn on 4Mb pages
-       orl cr4_bits,%eax
+       orl X86_MMU_CR4-__PAGE_OFFSET,%eax
        movl %eax,%cr4
 #endif
 /*
--- linux/arch/i386/kernel/process.c.PIII       Tue Oct 19 20:14:00 1999
+++ linux/arch/i386/kernel/process.c    Sun Dec  5 14:23:14 1999
@@ -42,6 +42,7 @@
 #include <asm/ldt.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
+#include <asm/i387.h>
 #ifdef CONFIG_MATH_EMULATION
 #include <asm/math_emu.h>
 #endif
@@ -582,6 +583,106 @@
 }
 
 /*
+ * FPU state handling functions
+ */
+
+int i387_hard_to_user ( struct user_i387_struct * user,
+                               union i387_hard_union * hard)
+{
+#ifdef CONFIG_X86_FX
+       int i, err = 0;
+       short *tmp, *tmp2;
+       union i387_hard_union hard2;
+#else
+       int err = 0;
+#endif
+
+       if (!access_ok(VERIFY_WRITE, user, sizeof(*user)))
+               return -EFAULT;
+#ifdef CONFIG_X86_FX
+       if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) {
+               hard2.fsave.cwd = 0xffff0000 | hard->fxsave.fxcwd;
+               hard2.fsave.swd = 0xffff0000 | hard->fxsave.fxswd;
+               hard2.fsave.twd = fputag_KNI_to_387(hard->fxsave.fxtwd);
+               hard2.fsave.fip = hard->fxsave.fxfip;
+               hard2.fsave.fcs = hard->fxsave.fxfcs;
+               hard2.fsave.foo = hard->fxsave.fxfoo;
+               hard2.fsave.fos = hard->fxsave.fxfos;
+
+               tmp = (short *)&hard2.fsave.st_space[0];
+               tmp2 = (short *)&hard->fxsave.st_space[0];
+
+               /*
+                * Transform the two layouts:
+                * (we do not mix 32-bit access with 16-bit access because
+                * that's suboptimal on PPros)
+                */
+
+               for (i = 0; i < 8; i++) {
+                       *tmp = *tmp2; tmp++; tmp2++;
+                       *tmp = *tmp2; tmp++; tmp2++;
+                       *tmp = *tmp2; tmp++; tmp2++;
+                       *tmp = *tmp2; tmp++; tmp2++;
+                       *tmp = *tmp2; tmp++; tmp2 += 4;
+               }
+               err = copy_to_user((void *)(user),(&(hard2)),
+                                  sizeof(struct i387_hard_fsave));
+       } else
+#endif
+               err = copy_to_user((void *)(user),
+                                  (&(hard->fsave.cwd)),
+                                  sizeof(struct i387_hard_fsave));
+       return err;
+}
+
+int i387_user_to_hard (union i387_hard_union * hard,
+                       struct user_i387_struct * user)
+{
+#ifdef CONFIG_X86_FX
+       int i, err = 0;
+       short *tmp, *tmp2;
+       union i387_hard_union hard2;
+#else
+       int err = 0;
+#endif
+
+       if (!access_ok(VERIFY_READ, user, sizeof(*user)))
+               return -EFAULT;
+#ifdef CONFIG_X86_FX
+       if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) {
+               err = copy_from_user((&(hard2)),(void *)(user),
+                                    sizeof(struct i387_hard_fsave));
+               hard->fxsave.fxcwd = hard2.fsave.cwd & 0xffff;
+               hard->fxsave.fxswd = hard2.fsave.swd & 0xffff;
+               hard->fxsave.fxtwd = fputag_387_to_KNI(hard2.fsave.twd);
+               hard->fxsave.fxfip = hard2.fsave.fip;
+               hard->fxsave.fxfcs = hard2.fsave.fcs & 0xffff;
+               hard->fxsave.fxfoo = hard2.fsave.foo;
+               hard->fxsave.fxfos = hard2.fsave.fos & 0xffff;
+
+               tmp2 = (short *)&hard->fxsave.st_space[0];
+               tmp = (short *)&hard2.fsave.st_space[0];
+
+               for (i = 0; i < 8; i++) {
+                       *tmp2 = *tmp; tmp++; tmp2++;
+                       *tmp2 = *tmp; tmp++; tmp2++;
+                       *tmp2 = *tmp; tmp++; tmp2++;
+                       *tmp2 = *tmp; tmp++; tmp2++;
+                       *tmp2 = *tmp; tmp++; tmp2++;
+                       *tmp2 = 0; tmp2++;
+                       *tmp2 = 0; tmp2++;
+                       *tmp2 = 0; tmp2++;
+               }
+       } else
+#endif
+               err = copy_from_user((&(hard->fsave.cwd)),
+                                    (void *)(user),
+                                    sizeof(struct i387_hard_fsave));
+       return err;
+}
+
+
+/*
  * Save a segment.
  */
 #define savesegment(seg,value) \
@@ -626,13 +727,43 @@
  */
 int dump_fpu (struct pt_regs * regs, struct user_i387_struct* fpu)
 {
+#ifdef CONFIG_X86_FX
+       int fpvalid, i;
+       short *tmp, *tmp2;
+       struct task_struct *tsk = current;
+       union i387_hard_union *hard;
+#else
        int fpvalid;
        struct task_struct *tsk = current;
-
+#endif
        fpvalid = tsk->used_math;
        if (fpvalid) {
                unlazy_fpu(tsk);
-               memcpy(fpu,&tsk->tss.i387.hard,sizeof(*fpu));
+#ifdef CONFIG_X86_FX
+               if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) {
+                       hard = &tsk->tss.i387.hard;
+
+                       fpu->cwd = 0xffff0000 | hard->fxsave.fxcwd;
+                       fpu->swd = 0xffff0000 | hard->fxsave.fxswd;
+                       fpu->twd = fputag_KNI_to_387(hard->fxsave.fxtwd);
+                       fpu->fip = hard->fxsave.fxfip;
+                       fpu->fcs = hard->fxsave.fxfcs;
+                       fpu->foo = hard->fxsave.fxfoo;
+                       fpu->fos = hard->fxsave.fxfos;
+
+                       tmp = (short *)&fpu->st_space[0];
+                       tmp2 = (short *)&hard->fxsave.st_space[0];
+
+                       for (i = 0; i < 8; i++) {
+                               *tmp = *tmp2; tmp++; tmp2++;
+                               *tmp = *tmp2; tmp++; tmp2++;
+                               *tmp = *tmp2; tmp++; tmp2++;
+                               *tmp = *tmp2; tmp++; tmp2++;
+                               *tmp = *tmp2; tmp++; tmp2+=4;
+                       }
+               } else
+#endif
+                       memcpy(fpu,&tsk->tss.i387.hard.fsave,sizeof(*fpu));
        }
 
        return fpvalid;
@@ -692,8 +823,8 @@
 /*
  *     switch_to(x,yn) should switch tasks from x to y.
  *
- * We fsave/fwait so that an exception goes off at the right time
- * (as a call from the fsave or fwait in effect) rather than to
+ * We fpu_save so that an exception goes off at the right time
+ * (as a call from the f*save or fwait in effect) rather than to
  * the wrong process. Lazy FP saving no longer makes any sense
  * with modern CPU's, and this simplifies a lot of things (SMP
  * and UP become the same).
--- linux/arch/i386/kernel/ptrace.c.PIII        Sun Dec  5 14:23:09 1999
+++ linux/arch/i386/kernel/ptrace.c     Sun Dec  5 14:23:14 1999
@@ -17,6 +17,7 @@
 #include <asm/system.h>
 #include <asm/processor.h>
 #include <asm/debugreg.h>
+#include <asm/i387.h>
 
 /*
  * does not yet catch signals sent when the child dies.
@@ -646,6 +647,9 @@
                  };
 
                case PTRACE_GETFPREGS: { /* Get the child FPU state. */
+                       /*
+                        * user-space expects an 'old-style' FPU dump.
+                        */
                        if (!access_ok(VERIFY_WRITE, (unsigned *)data,
                                       sizeof(struct user_i387_struct)))
                          {
@@ -655,15 +659,17 @@
                        ret = 0;
                        if ( !child->used_math ) {
                          /* Simulate an empty FPU. */
-                         child->tss.i387.hard.cwd = 0xffff037f;
-                         child->tss.i387.hard.swd = 0xffff0000;
-                         child->tss.i387.hard.twd = 0xffffffff;
+                         i387_set_cwd(child->tss.i387.hard, 0x037f);
+                         i387_set_swd(child->tss.i387.hard, 0x0000);
+                         i387_set_twd(child->tss.i387.hard, 0xffff);
                        }
 #ifdef CONFIG_MATH_EMULATION
                        if ( boot_cpu_data.hard_math ) {
 #endif
-                               __copy_to_user((void *)data, &child->tss.i387.hard,
-                                               sizeof(struct user_i387_struct));
+                               i387_hard_to_user(
+                                       (struct user_i387_struct *)data,
+                                       &child->tss.i387.hard
+                               );
 #ifdef CONFIG_MATH_EMULATION
                        } else {
                          save_i387_soft(&child->tss.i387.soft,
@@ -684,8 +690,10 @@
 #ifdef CONFIG_MATH_EMULATION
                        if ( boot_cpu_data.hard_math ) {
 #endif
-                         __copy_from_user(&child->tss.i387.hard, (void *)data,
-                                          sizeof(struct user_i387_struct));
+                               i387_user_to_hard(
+                                       &child->tss.i387.hard,
+                                       (struct user_i387_struct *)data
+                               );
 #ifdef CONFIG_MATH_EMULATION
                        } else {
                          restore_i387_soft(&child->tss.i387.soft,
--- linux/arch/i386/kernel/signal.c.PIII        Sun Dec  5 14:23:09 1999
+++ linux/arch/i386/kernel/signal.c     Sun Dec  5 14:23:14 1999
@@ -21,6 +21,7 @@
 #include <linux/stddef.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
+#include <asm/i387.h>
 
 #define DEBUG_SIG 0
 
@@ -153,9 +154,14 @@
 
 static inline int restore_i387_hard(struct _fpstate *buf)
 {
+       int err = 0;
        struct task_struct *tsk = current;
        clear_fpu(tsk);
-       return __copy_from_user(&tsk->tss.i387.hard, buf, sizeof(*buf));
+
+       err = i387_user_to_hard(&tsk->tss.i387.hard,
+                               (struct user_i387_struct *)buf);
+       err |= get_user(tsk->tss.i387.hard.fsave.swd, &buf->status);
+       return err;
 }
 
 static inline int restore_i387(struct _fpstate *buf)
@@ -305,11 +311,14 @@
 
 static inline int save_i387_hard(struct _fpstate * buf)
 {
+       int err = 0;
        struct task_struct *tsk = current;
 
        unlazy_fpu(tsk);
-       tsk->tss.i387.hard.status = tsk->tss.i387.hard.swd;
-       if (__copy_to_user(buf, &tsk->tss.i387.hard, sizeof(*buf)))
+       err = i387_hard_to_user((struct user_i387_struct *)buf,
+                       &tsk->tss.i387.hard);
+       err |= put_user(tsk->tss.i387.hard.fsave.swd, &buf->status);
+       if (err)
                return -1;
        return 1;
 }
--- linux/arch/i386/kernel/smp.c.PIII   Sun Dec  5 14:23:09 1999
+++ linux/arch/i386/kernel/smp.c        Sun Dec  5 14:23:14 1999
@@ -890,6 +890,8 @@
  */
 int __init start_secondary(void *unused)
 {
+       disable_serial_nr();
+       load_default_mxcsr();
        /*
         * Dont put anything before smp_callin(), SMP
         * booting is too fragile that we want to limit the
--- linux/arch/i386/kernel/traps.c.PIII Tue Feb 16 17:20:05 1999
+++ linux/arch/i386/kernel/traps.c      Sun Dec  5 14:23:14 1999
@@ -33,6 +33,7 @@
 #include <asm/atomic.h>
 #include <asm/debugreg.h>
 #include <asm/desc.h>
+#include <asm/i387.h>
 
 #include <asm/smp.h>
 
@@ -421,7 +422,9 @@
         * (this will also clear the error)
         */
        task = current;
-       save_fpu(task);
+       i387_save_hard(task->tss.i387);
+       task->flags &= ~PF_USEDFPU;
+       stts();
        task->tss.trap_no = 16;
        task->tss.error_code = 0;
        force_sig(SIGFPE, task);
@@ -452,17 +455,44 @@
 asmlinkage void math_state_restore(struct pt_regs regs)
 {
        __asm__ __volatile__("clts");           /* Allow maths ops (or we recurse) */
-       if(current->used_math)
-               __asm__("frstor %0": :"m" (current->tss.i387));
-       else
-       {
+       /*
+        * If we have either of the kernel FPU use states set in the
+        * fpustate variable, then this will be a kernel math trap.
+        * Otherwise, this is userspace trying to use the FPU.
+        */
+       if(current->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY) {
+               load_default_mxcsr(); /* we don't ever mess with this in
+                                        kernel space, so just make sure
+                                        we have a reasonable one so we
+                                        don't start taking unmasked
+                                        exceptions by accident */
+               if(current->tss.mmx_reg_space != NULL)
+                       __asm__("movq 0x00(%0), %%mm0\n\t"
+                               "movq 0x08(%0), %%mm1\n\t"
+                               "movq 0x10(%0), %%mm2\n\t"
+                               "movq 0x18(%0), %%mm3\n\t"
+                               :: "r" (current->tss.mmx_reg_space));
+               if(current->tss.kni_reg_space != NULL)
+                       __asm__("movups 0x00(%0), %%xmm0\n\t"
+                               "movups 0x10(%0), %%xmm1\n\t"
+                               "movups 0x20(%0), %%xmm2\n\t"
+                               "movups 0x30(%0), %%xmm3\n\t"
+                               :: "r" (current->tss.kni_reg_space));
+       } else if(current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) {
+               i387_restore_hard(current->tss.i387);
+               current->tss.x86_fpustate = 0;
+       } else if(current->used_math) {
+               i387_restore_hard(current->tss.i387);
+               current->flags|=PF_USEDFPU;     /* make switch_to() work */
+       } else {
                /*
                 *      Our first FPU usage, clean the chip.
                 */
                __asm__("fninit");
+               load_default_mxcsr();
                current->used_math = 1;
+               current->flags|=PF_USEDFPU;     /* make switch_to() work */
        }
-       current->flags|=PF_USEDFPU;             /* So we fnsave on switch_to() */
 }
 
 #ifndef CONFIG_MATH_EMULATION
--- linux/arch/i386/kernel/i386_ksyms.c.PIII    Tue Oct 19 20:14:00 1999
+++ linux/arch/i386/kernel/i386_ksyms.c Sun Dec  5 14:23:14 1999
@@ -119,3 +119,13 @@
 #ifdef CONFIG_VT
 EXPORT_SYMBOL(screen_info);
 #endif
+
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+EXPORT_SYMBOL(best_memcpy);
+EXPORT_SYMBOL(best_memset);
+EXPORT_SYMBOL(best_copy_to_user);
+EXPORT_SYMBOL(best_copy_from_user);
+EXPORT_SYMBOL(__best_copy_to_user);
+EXPORT_SYMBOL(__best_copy_from_user);
+#endif
+
--- linux/arch/i386/kernel/setup.c.PIII Sun Dec  5 14:23:09 1999
+++ linux/arch/i386/kernel/setup.c      Sun Dec  5 14:23:14 1999
@@ -104,6 +104,17 @@
 extern int _etext, _edata, _end;
 extern unsigned long cpu_hz;
 
+#ifdef CONFIG_X86_PN_OFF
+int disable_x86_serial_nr = 1;
+#else
+int disable_x86_serial_nr = 0;
+#endif
+
+/*
+ * For the various FPU-using kernel accelerator routines
+ */
+spinlock_t kern_fpu_lock = SPIN_LOCK_UNLOCKED;
+
 /*
  * This is set up by the setup-routine at boot-time
  */
@@ -809,20 +820,6 @@
 
        if (c->x86_vendor == X86_VENDOR_AMD && amd_model(c))
                return;
-               
-       if (c->cpuid_level > 0 && c->x86_vendor == X86_VENDOR_INTEL)
-       {
-               if(c->x86_capability&(1<<18))
-               {
-                       /* Disable processor serial number on Intel Pentium III 
-                          from code by Phil Karn */
-                       unsigned long lo,hi;
-                       rdmsr(0x119,lo,hi);
-                       lo |= 0x200000;
-                       wrmsr(0x119,lo,hi);
-                       printk(KERN_INFO "Pentium-III serial number disabled.\n");
-               }
-       }
 
        if (c->cpuid_level > 1) {
                /* supports eax=2  call */
@@ -909,7 +906,15 @@
        }
        cyrix_model(&boot_cpu_data);
 }
-       
+
+/*
+ * Setup function for serial number stuff
+ */
+
+__initfunc(void x86_serial_nr_setup(char *str, int *ints))
+{
+       disable_x86_serial_nr = !disable_x86_serial_nr;
+}
        
 
 static char *cpu_vendor_names[] __initdata = {
--- linux/arch/i386/Makefile.PIII       Tue Oct 19 20:14:00 1999
+++ linux/arch/i386/Makefile    Sun Dec  5 14:23:14 1999
@@ -43,6 +43,10 @@
 CFLAGS := $(CFLAGS) -m486 -malign-loops=2 -malign-jumps=2 -malign-functions=2 -DCPU=686
 endif
 
+ifdef CONFIG_M686FX
+CFLAGS := $(CFLAGS) -m486 -malign-loops=0 -malign-jumps=0 -malign-functions=0 -DCPU=686
+endif
+
 HEAD := arch/i386/kernel/head.o arch/i386/kernel/init_task.o
 
 SUBDIRS := $(SUBDIRS) arch/i386/kernel arch/i386/mm arch/i386/lib
--- linux/arch/i386/config.in.PIII      Mon Aug  9 15:04:38 1999
+++ linux/arch/i386/config.in   Sun Dec  5 14:23:14 1999
@@ -16,7 +16,8 @@
         486/Cx486              CONFIG_M486     \
         586/K5/5x86/6x86       CONFIG_M586     \
         Pentium/K6/TSC         CONFIG_M586TSC  \
-        PPro/6x86MX            CONFIG_M686" PPro
+        PPro/6x86MX/PII        CONFIG_M686 \
+        PIII/Xeon/Deschutes    CONFIG_M686FX" PIII
 #
 # Define implied options from the CPU selection here
 #
@@ -26,20 +27,24 @@
   define_bool CONFIG_X86_BSWAP y
   define_bool CONFIG_X86_POPAD_OK y
 fi
-if [ "$CONFIG_M686" = "y" -o "$CONFIG_M586TSC" = "y" ]; then
+if [ "$CONFIG_M686FX" = "y" -o "$CONFIG_M686" = "y" \
+                               -o "$CONFIG_M586TSC" = "y" ]; then
   define_bool CONFIG_X86_TSC y
 fi
-if [ "$CONFIG_M686" = "y" ]; then
+if [ "$CONFIG_M686FX" = "y" -o "$CONFIG_M686" = "y" ]; then
   define_bool CONFIG_X86_GOOD_APIC y
 fi
+bool 'Disable the PII/PIII Serial Number at bootup' CONFIG_X86_PN_OFF
+bool 'Enable PII/PIII Extended/Fast FPU save and restore support' CONFIG_X86_FX
+bool 'Enable CPU Specific (MMX/MMX2) Optimization Functions' CONFIG_X86_CPU_OPTIMIZATIONS
+bool 'Math emulation' CONFIG_MATH_EMULATION
+bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
+bool 'Symmetric multi-processing support' CONFIG_SMP
 
 choice 'Maximum Physical Memory' \
        "1GB            CONFIG_1GB \
         2GB            CONFIG_2GB" 1GB
 
-bool 'Math emulation' CONFIG_MATH_EMULATION
-bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
-bool 'Symmetric multi-processing support' CONFIG_SMP
 endmenu
 
 mainmenu_option next_comment
--- linux/arch/i386/defconfig.PIII      Sun Dec  5 14:23:13 1999
+++ linux/arch/i386/defconfig   Sun Dec  5 14:23:14 1999
@@ -21,11 +21,14 @@
 CONFIG_X86_POPAD_OK=y
 CONFIG_X86_TSC=y
 CONFIG_X86_GOOD_APIC=y
-CONFIG_1GB=y
-# CONFIG_2GB is not set
+CONFIG_X86_PN_OFF=y
+CONFIG_X86_FX=y
+CONFIG_X86_CPU_OPTIMIZATIONS=y
 # CONFIG_MATH_EMULATION is not set
 # CONFIG_MTRR is not set
 CONFIG_SMP=y
+CONFIG_1GB=y
+# CONFIG_2GB is not set
 
 #
 # Loadable module support
--- linux/Documentation/Configure.help.PIII     Sun Dec  5 14:23:14 1999
+++ linux/Documentation/Configure.help  Sun Dec  5 14:23:14 1999
@@ -1659,10 +1659,10 @@
   all x86 CPU types (albeit not optimally fast), you can specify
   "386" here.
 
-  If you specify one of "486" or "586" or "Pentium" or "PPro", then
-  the kernel will not necessarily run on earlier architectures (e.g. a
-  Pentium optimized kernel will run on a PPro, but not necessarily on
-  a i486).
+  If you specify one of "486" or "586" or "Pentium" or "PPro" or "PIII",
+  then the kernel will not necessarily run on earlier architectures 
+  (e.g. a Pentium optimized kernel will run on a PPro, but not necessarily
+  on a i486).
 
   Here are the settings recommended for greatest speed:
    - "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI
@@ -1676,8 +1676,30 @@
      K6-3D.
    - "PPro" for the Cyrix/IBM/National Semiconductor 6x86MX, MII and
      Intel Pentium II/Pentium Pro.
+   - "PIII/Xeon/Deschutes" for the PIII (Katmai), Xeon and later PIIs
+     with the Deschutes or Mendocino core. You have to choose this for
+     MMX2 support.
 
   If you don't know what to do, choose "386".
+
+Disable PII/PIII Serial Number at bootup
+CONFIG_X86_PN_OFF
+  This makes the kernel disable, at bootup, the CPUID serial number that is
+  embedded in the new PIII CPUs.
+
+Enable PII/PIII Extended Fast FPU save and restore support
+CONFIG_X86_FX
+  This enables use of the new PII/PIII FXSAVE/FXRSTOR support.  This item
+  is required to make use of the new PIII 128bit XMM registers.  It is safe
+  to leave this enabled all the time.
+
+Enable CPU Specific (MMX/MMX2) Optimizations
+CONFIG_X86_CPU_OPTIMIZATIONS
+  This enables use of the MMX registers and 128bit MMX2 registers on CPUs
+  that can support the new instructions (Pentium/AMD K6 or newer).  In
+  order to support the Pentium III 128 bit XMM registers you must enable
+  both this and PII/PIII Extended Fast FPU save support.  It is safe to
+  leave this enabled all the time.
 
 VGA text console
 CONFIG_VGA_CONSOLE
--- linux/drivers/block/xor.c.pIII-2    Tue Nov 23 14:02:17 1999
+++ linux/drivers/block/xor.c   Tue Nov 23 14:03:21 1999
@@ -22,6 +22,10 @@
 #include <asm/asi.h>
 #include <asm/visasm.h>
 #endif
+#ifdef __i386__
+#include <asm/processor.h>
+#include <asm/i387.h>
+#endif
 
 /*
  * we use the 'XOR function template' to register multiple xor
@@ -66,7 +70,7 @@
 
 #ifdef __i386__
 
-#ifdef CONFIG_X86_XMM
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
 /*
  * Cache avoiding checksumming functions utilizing KNI instructions
  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
@@ -74,20 +78,12 @@
 
 XORBLOCK_TEMPLATE(pIII_kni)
 {
-       char xmm_save[16*4];
-       int cr0;
-        int lines = (bh_ptr[0]->b_size>>8);
-
-       __asm__ __volatile__ ( 
-               "movl %%cr0,%0          ;\n\t"
-               "clts                   ;\n\t"
-               "movups %%xmm0,(%1)     ;\n\t"
-               "movups %%xmm1,0x10(%1) ;\n\t"
-               "movups %%xmm2,0x20(%1) ;\n\t"
-               "movups %%xmm3,0x30(%1) ;\n\t"
-               : "=r" (cr0)
-               : "r" (xmm_save) 
-               : "memory" );
+       char xmm_space[64];
+       int lines = (bh_ptr[0]->b_size>>8);
+       int recursive = 0;
+       unsigned long flags;
+
+       kernel_take_fpu_kni(recursive,&xmm_space[0],NULL,flags);
 
 #define OFFS(x) "8*("#x"*2)"
 #define        PF0(x) \
@@ -157,7 +153,7 @@
         "       jnz 1b                  ;\n"
 
                        :
-                       : "r" (lines),
+                       : "m" (lines),
                          "r" (bh_ptr[0]->b_data),
                          "r" (bh_ptr[1]->b_data)
                        : "memory" );
@@ -207,7 +203,7 @@
         "       decl %0                 ;\n"
         "       jnz 1b                  ;\n"
                        :
-                       : "r" (lines),
+                       : "m" (lines),
                          "r" (bh_ptr[0]->b_data),
                          "r" (bh_ptr[1]->b_data),
                          "r" (bh_ptr[2]->b_data)
@@ -266,7 +262,7 @@
         "       jnz 1b                  ;\n"
 
                        :
-                       : "r" (lines),
+                       : "m" (lines),
                          "r" (bh_ptr[0]->b_data),
                          "r" (bh_ptr[1]->b_data),
                          "r" (bh_ptr[2]->b_data),
@@ -333,7 +329,7 @@
         "       jnz 1b                  ;\n"
 
                        :
-                       : "r" (lines),
+                       : "m" (lines),
                          "r" (bh_ptr[0]->b_data),
                          "r" (bh_ptr[1]->b_data),
                          "r" (bh_ptr[2]->b_data),
@@ -343,16 +339,7 @@
                        break;
        }
 
-       __asm__ __volatile__ ( 
-               "sfence                 ;\n\t"
-               "movups (%1),%%xmm0     ;\n\t"
-               "movups 0x10(%1),%%xmm1 ;\n\t"
-               "movups 0x20(%1),%%xmm2 ;\n\t"
-               "movups 0x30(%1),%%xmm3 ;\n\t"
-               "movl   %0,%%cr0        ;\n\t"
-               :
-               : "r" (cr0), "r" (xmm_save)
-               : "memory" );
+       kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
 }
 
 #undef OFFS
@@ -371,7 +358,7 @@
 #undef XO5
 #undef BLOCK
 
-#endif /* CONFIG_X86_XMM */
+#endif /* CONFIG_X86_CPU_OPTIMIZATIONS */
 
 /*
  * high-speed RAID5 checksumming functions utilizing MMX instructions
@@ -379,13 +366,12 @@
  */
 XORBLOCK_TEMPLATE(pII_mmx)
 {
-       char fpu_save[108];
         int lines = (bh_ptr[0]->b_size>>7);
+       char mmx_space[32];
+       int recursive = 0;
+       unsigned long flags;
 
-       if (!(current->flags & PF_USEDFPU))
-               __asm__ __volatile__ ( " clts;\n");
-
-       __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
+       kernel_take_fpu_mmx(recursive,&mmx_space[0],NULL,flags);
 
 #define LD(x,y) \
         "       movq   8*("#x")(%1), %%mm"#y"   ;\n"
@@ -431,7 +417,7 @@
                        "       decl %0               ;\n"
                        "       jnz 1b                ;\n"
                        :
-                       : "r" (lines),
+                       : "m" (lines),
                          "r" (bh_ptr[0]->b_data),
                          "r" (bh_ptr[1]->b_data)
                        : "memory");
@@ -471,7 +457,7 @@
                        "       decl %0               ;\n"
                        "       jnz 1b                ;\n"
                        :
-                       : "r" (lines),
+                       : "m" (lines),
                          "r" (bh_ptr[0]->b_data),
                          "r" (bh_ptr[1]->b_data),
                          "r" (bh_ptr[2]->b_data)
@@ -517,7 +503,7 @@
                        "       decl %0               ;\n"
                        "       jnz 1b                ;\n"
                        :
-                       : "r" (lines),
+                       : "m" (lines),
                          "r" (bh_ptr[0]->b_data),
                          "r" (bh_ptr[1]->b_data),
                          "r" (bh_ptr[2]->b_data),
@@ -569,7 +555,7 @@
                        "       decl %0               ;\n"
                        "       jnz 1b                ;\n"
                        :
-                       : "r" (lines),
+                       : "m" (lines),
                          "r" (bh_ptr[0]->b_data),
                          "r" (bh_ptr[1]->b_data),
                          "r" (bh_ptr[2]->b_data),
@@ -579,10 +565,7 @@
                        break;
        }
 
-       __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
-
-       if (!(current->flags & PF_USEDFPU))
-               stts();
+       kernel_release_fpu_mmx(recursive,&mmx_space[0],flags);
 }
 
 #undef LD
@@ -595,13 +578,12 @@
 
 XORBLOCK_TEMPLATE(p5_mmx)
 {
-       char fpu_save[108];
         int lines = (bh_ptr[0]->b_size>>6);
+       char mmx_space[32];
+       int recursive = 0;
+       unsigned long flags;
 
-       if (!(current->flags & PF_USEDFPU))
-               __asm__ __volatile__ ( " clts;\n");
-
-       __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
+       kernel_take_fpu_mmx(recursive,&mmx_space[0],NULL,flags);
 
        switch(count) {
                case 2:
@@ -618,21 +600,21 @@
                                "       movq 24(%1), %%mm3   ;\n"
                                "       movq %%mm1,  8(%1)   ;\n"
                                "       pxor 16(%2), %%mm2   ;\n"
-                               "       movq 32(%1), %%mm4   ;\n"
+                               "       movq 32(%1), %%mm0   ;\n"
                                "       movq %%mm2, 16(%1)   ;\n"
                                "       pxor 24(%2), %%mm3   ;\n"
-                               "       movq 40(%1), %%mm5   ;\n"
+                               "       movq 40(%1), %%mm1   ;\n"
                                "       movq %%mm3, 24(%1)   ;\n"
-                               "       pxor 32(%2), %%mm4   ;\n"
-                               "       movq 48(%1), %%mm6   ;\n"
-                               "       movq %%mm4, 32(%1)   ;\n"
-                               "       pxor 40(%2), %%mm5   ;\n"
-                               "       movq 56(%1), %%mm7   ;\n"
-                               "       movq %%mm5, 40(%1)   ;\n"
-                               "       pxor 48(%2), %%mm6   ;\n"
-                               "       pxor 56(%2), %%mm7   ;\n"
-                               "       movq %%mm6, 48(%1)   ;\n"
-                               "       movq %%mm7, 56(%1)   ;\n"
+                               "       pxor 32(%2), %%mm0   ;\n"
+                               "       movq 48(%1), %%mm2   ;\n"
+                               "       movq %%mm0, 32(%1)   ;\n"
+                               "       pxor 40(%2), %%mm1   ;\n"
+                               "       movq 56(%1), %%mm3   ;\n"
+                               "       movq %%mm1, 40(%1)   ;\n"
+                               "       pxor 48(%2), %%mm2   ;\n"
+                               "       pxor 56(%2), %%mm3   ;\n"
+                               "       movq %%mm2, 48(%1)   ;\n"
+                               "       movq %%mm3, 56(%1)   ;\n"
         
                                "       addl $64, %1         ;\n"
                                "       addl $64, %2         ;\n"
@@ -640,7 +622,7 @@
                                "       jnz 1b               ;\n"
 
                                : 
-                               : "r" (lines),
+                               : "m" (lines),
                                  "r" (bh_ptr[0]->b_data),
                                  "r" (bh_ptr[1]->b_data)
                                : "memory" );
@@ -662,26 +644,26 @@
                                "       pxor 16(%3), %%mm2   ;\n"
                                "       movq 24(%1), %%mm3   ;\n"
                                "       movq %%mm1,  8(%1)   ;\n"
-                               "       movq 32(%1), %%mm4   ;\n"
-                               "       movq 40(%1), %%mm5   ;\n"
+                               "       movq 32(%1), %%mm0   ;\n"
+                               "       movq 40(%1), %%mm1   ;\n"
                                "       pxor 24(%2), %%mm3   ;\n"
                                "       movq %%mm2, 16(%1)   ;\n"
-                               "       pxor 32(%2), %%mm4   ;\n"
+                               "       pxor 32(%2), %%mm0   ;\n"
                                "       pxor 24(%3), %%mm3   ;\n"
-                               "       pxor 40(%2), %%mm5   ;\n"
+                               "       pxor 40(%2), %%mm1   ;\n"
                                "       movq %%mm3, 24(%1)   ;\n"
-                               "       pxor 32(%3), %%mm4   ;\n"
-                               "       pxor 40(%3), %%mm5   ;\n"
-                               "       movq 48(%1), %%mm6   ;\n"
-                               "       movq %%mm4, 32(%1)   ;\n"
-                               "       movq 56(%1), %%mm7   ;\n"
-                               "       pxor 48(%2), %%mm6   ;\n"
-                               "       movq %%mm5, 40(%1)   ;\n"
-                               "       pxor 56(%2), %%mm7   ;\n"
-                               "       pxor 48(%3), %%mm6   ;\n"
-                               "       pxor 56(%3), %%mm7   ;\n"
-                               "       movq %%mm6, 48(%1)   ;\n"
-                               "       movq %%mm7, 56(%1)   ;\n"
+                               "       pxor 32(%3), %%mm0   ;\n"
+                               "       pxor 40(%3), %%mm1   ;\n"
+                               "       movq 48(%1), %%mm2   ;\n"
+                               "       movq %%mm0, 32(%1)   ;\n"
+                               "       movq 56(%1), %%mm3   ;\n"
+                               "       pxor 48(%2), %%mm2   ;\n"
+                               "       movq %%mm1, 40(%1)   ;\n"
+                               "       pxor 56(%2), %%mm3   ;\n"
+                               "       pxor 48(%3), %%mm2   ;\n"
+                               "       pxor 56(%3), %%mm3   ;\n"
+                               "       movq %%mm2, 48(%1)   ;\n"
+                               "       movq %%mm3, 56(%1)   ;\n"
         
                                "       addl $64, %1         ;\n"
                                "       addl $64, %2         ;\n"
@@ -690,7 +672,7 @@
                                "       jnz 1b               ;\n"
 
                                : 
-                               : "r" (lines),
+                               : "m" (lines),
                                  "r" (bh_ptr[0]->b_data),
                                  "r" (bh_ptr[1]->b_data),
                                  "r" (bh_ptr[2]->b_data)
@@ -714,33 +696,33 @@
                                "       pxor 16(%3), %%mm2   ;\n"
                                "       pxor  8(%4), %%mm1   ;\n"
                                "       movq %%mm0,   (%1)   ;\n"
-                               "       movq 32(%1), %%mm4   ;\n"
+                               "       movq 32(%1), %%mm0   ;\n"
                                "       pxor 24(%2), %%mm3   ;\n"
                                "       pxor 16(%4), %%mm2   ;\n"
                                "       movq %%mm1,  8(%1)   ;\n"
-                               "       movq 40(%1), %%mm5   ;\n"
-                               "       pxor 32(%2), %%mm4   ;\n"
+                               "       movq 40(%1), %%mm1   ;\n"
+                               "       pxor 32(%2), %%mm0   ;\n"
                                "       pxor 24(%3), %%mm3   ;\n"
                                "       movq %%mm2, 16(%1)   ;\n"
-                               "       pxor 40(%2), %%mm5   ;\n"
-                               "       pxor 32(%3), %%mm4   ;\n"
+                               "       pxor 40(%2), %%mm1   ;\n"
+                               "       pxor 32(%3), %%mm0   ;\n"
                                "       pxor 24(%4), %%mm3   ;\n"
                                "       movq %%mm3, 24(%1)   ;\n"
-                               "       movq 56(%1), %%mm7   ;\n"
-                               "       movq 48(%1), %%mm6   ;\n"
-                               "       pxor 40(%3), %%mm5   ;\n"
-                               "       pxor 32(%4), %%mm4   ;\n"
-                               "       pxor 48(%2), %%mm6   ;\n"
-                               "       movq %%mm4, 32(%1)   ;\n"
-                               "       pxor 56(%2), %%mm7   ;\n"
-                               "       pxor 40(%4), %%mm5   ;\n"
-                               "       pxor 48(%3), %%mm6   ;\n"
-                               "       pxor 56(%3), %%mm7   ;\n"
-                               "       movq %%mm5, 40(%1)   ;\n"
-                               "       pxor 48(%4), %%mm6   ;\n"
-                               "       pxor 56(%4), %%mm7   ;\n"
-                               "       movq %%mm6, 48(%1)   ;\n"
-                               "       movq %%mm7, 56(%1)   ;\n"
+                               "       movq 56(%1), %%mm3   ;\n"
+                               "       movq 48(%1), %%mm2   ;\n"
+                               "       pxor 40(%3), %%mm1   ;\n"
+                               "       pxor 32(%4), %%mm0   ;\n"
+                               "       pxor 48(%2), %%mm2   ;\n"
+                               "       movq %%mm0, 32(%1)   ;\n"
+                               "       pxor 56(%2), %%mm3   ;\n"
+                               "       pxor 40(%4), %%mm1   ;\n"
+                               "       pxor 48(%3), %%mm2   ;\n"
+                               "       pxor 56(%3), %%mm3   ;\n"
+                               "       movq %%mm1, 40(%1)   ;\n"
+                               "       pxor 48(%4), %%mm2   ;\n"
+                               "       pxor 56(%4), %%mm3   ;\n"
+                               "       movq %%mm2, 48(%1)   ;\n"
+                               "       movq %%mm3, 56(%1)   ;\n"
         
                                "       addl $64, %1         ;\n"
                                "       addl $64, %2         ;\n"
@@ -750,7 +732,7 @@
                                "       jnz 1b               ;\n"
 
                                : 
-                               : "r" (lines),
+                               : "m" (lines),
                                  "r" (bh_ptr[0]->b_data),
                                  "r" (bh_ptr[1]->b_data),
                                  "r" (bh_ptr[2]->b_data),
@@ -782,34 +764,34 @@
                                "       movq %%mm1,  8(%1)   ;\n"
                                "       pxor 16(%5), %%mm2   ;\n"
                                "       pxor 24(%3), %%mm3   ;\n"
-                               "       movq 32(%1), %%mm4   ;\n"
+                               "       movq 32(%1), %%mm0   ;\n"
                                "       movq %%mm2, 16(%1)   ;\n"
                                "       pxor 24(%4), %%mm3   ;\n"
-                               "       pxor 32(%2), %%mm4   ;\n"
-                               "       movq 40(%1), %%mm5   ;\n"
+                               "       pxor 32(%2), %%mm0   ;\n"
+                               "       movq 40(%1), %%mm1   ;\n"
                                "       pxor 24(%5), %%mm3   ;\n"
-                               "       pxor 32(%3), %%mm4   ;\n"
-                               "       pxor 40(%2), %%mm5   ;\n"
+                               "       pxor 32(%3), %%mm0   ;\n"
+                               "       pxor 40(%2), %%mm1   ;\n"
                                "       movq %%mm3, 24(%1)   ;\n"
-                               "       pxor 32(%4), %%mm4   ;\n"
-                               "       pxor 40(%3), %%mm5   ;\n"
-                               "       movq 48(%1), %%mm6   ;\n"
-                               "       movq 56(%1), %%mm7   ;\n"
-                               "       pxor 32(%5), %%mm4   ;\n"
-                               "       pxor 40(%4), %%mm5   ;\n"
-                               "       pxor 48(%2), %%mm6   ;\n"
-                               "       pxor 56(%2), %%mm7   ;\n"
-                               "       movq %%mm4, 32(%1)   ;\n"
-                               "       pxor 48(%3), %%mm6   ;\n"
-                               "       pxor 56(%3), %%mm7   ;\n"
-                               "       pxor 40(%5), %%mm5   ;\n"
-                               "       pxor 48(%4), %%mm6   ;\n"
-                               "       pxor 56(%4), %%mm7   ;\n"
-                               "       movq %%mm5, 40(%1)   ;\n"
-                               "       pxor 48(%5), %%mm6   ;\n"
-                               "       pxor 56(%5), %%mm7   ;\n"
-                               "       movq %%mm6, 48(%1)   ;\n"
-                               "       movq %%mm7, 56(%1)   ;\n"
+                               "       pxor 32(%4), %%mm0   ;\n"
+                               "       pxor 40(%3), %%mm1   ;\n"
+                               "       movq 48(%1), %%mm2   ;\n"
+                               "       movq 56(%1), %%mm3   ;\n"
+                               "       pxor 32(%5), %%mm0   ;\n"
+                               "       pxor 40(%4), %%mm1   ;\n"
+                               "       pxor 48(%2), %%mm2   ;\n"
+                               "       pxor 56(%2), %%mm3   ;\n"
+                               "       movq %%mm0, 32(%1)   ;\n"
+                               "       pxor 48(%3), %%mm2   ;\n"
+                               "       pxor 56(%3), %%mm3   ;\n"
+                               "       pxor 40(%5), %%mm1   ;\n"
+                               "       pxor 48(%4), %%mm2   ;\n"
+                               "       pxor 56(%4), %%mm3   ;\n"
+                               "       movq %%mm1, 40(%1)   ;\n"
+                               "       pxor 48(%5), %%mm2   ;\n"
+                               "       pxor 56(%5), %%mm3   ;\n"
+                               "       movq %%mm2, 48(%1)   ;\n"
+                               "       movq %%mm3, 56(%1)   ;\n"
         
                                "       addl $64, %1         ;\n"
                                "       addl $64, %2         ;\n"
@@ -820,7 +802,7 @@
                                "       jnz 1b               ;\n"
 
                                : 
-                               : "r" (lines),
+                               : "m" (lines),
                                  "r" (bh_ptr[0]->b_data),
                                  "r" (bh_ptr[1]->b_data),
                                  "r" (bh_ptr[2]->b_data),
@@ -830,10 +812,7 @@
                        break;
        }
 
-       __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
-
-       if (!(current->flags & PF_USEDFPU))
-               stts();
+       kernel_release_fpu_mmx(recursive,&mmx_space[0],flags);
 }
 #endif /* __i386__ */
 #endif /* !__sparc_v9__ */
@@ -1811,11 +1790,12 @@
                if (f->speed > fastest->speed)
                        fastest = f;
        }
-#ifdef CONFIG_X86_XMM 
-       if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+       if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+            (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
                fastest = &t_xor_block_pIII_kni;
        }
-#endif
+#endif /* CONFIG_X86_CPU_OPTIMIZATIONS */
        xor_block = fastest->xor_block;
        printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
                fastest->speed / 1000, fastest->speed % 1000);
@@ -1847,8 +1827,9 @@
        xor_speed(&t_xor_block_SPARC,&b1,&b2);
 #endif
 
-#ifdef CONFIG_X86_XMM 
-       if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+       if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+            (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
                printk(KERN_INFO
                        "raid5: KNI detected, trying cache-avoiding KNI checksum 
routine\n");
                /* we force the use of the KNI xor block because it
@@ -1859,7 +1840,7 @@
                */
                xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
        }
-#endif /* CONFIG_X86_XMM */
+#endif /* CONFIG_X86_CPU_OPTIMIZATIONS */
 
 #ifdef __i386__
 

raid-2.2.14-B1.gz
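
A note for anyone wanting to use the XMM registers from other kernel code:
the pattern used throughout the patch (the RAID xor routines and the
kni_copy_* functions) is to bracket the SSE section with the
kernel_take_fpu_kni()/kernel_release_fpu_kni() helpers from the new
<asm/i387.h>.  Below is a minimal sketch of that usage, not part of the
patch itself; do_kni_copy16() and its dst/src arguments are made-up names
for illustration only.

#include <asm/i387.h>

/* Hypothetical example: copy 16 bytes through xmm0 from kernel context. */
static void do_kni_copy16(void *dst, const void *src)
{
        unsigned long flags;
        int recursive = 0;
        char xmm_space[64];     /* save area for the xmm regs we clobber */

        /* Grab the FPU for kernel use; the helper deals with any live
         * user FPU/XMM state.  The NULL third argument means we have no
         * private register image to reload, same as the xor.c routines. */
        kernel_take_fpu_kni(recursive, &xmm_space[0], NULL, flags);

        __asm__ __volatile__(
                "       movups (%1),%%xmm0\n"
                "       movups %%xmm0,(%0)\n"
                : : "r" (dst), "r" (src) : "memory");

        /* If this used movntps non-temporal stores (as the copy routines
         * above do), an SFENCE() would go here before releasing the FPU. */
        kernel_release_fpu_kni(recursive, &xmm_space[0], flags);
}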


