mitch wrote:
>
> Where can I get the SSE P3 kernel patch for 2.2.13? Thanks.
I've attached the latest version of the patch here. The xor patch goes on top
of the RAID-0.90 patch (Ingo Molnar's raid code). When using this version of
the PIII patch, make sure you have both it and the RAID code installed (there
are some FPU-use interactions that have to be taken care of). In case you
don't have it, I've also attached Ingo's latest RAID patch. The order of
application should be the RAID patch first, followed by the two PIII patches.
These should all apply cleanly to a 2.2.14 kernel; the RAID patch will have
problems applying to a 2.2.13 kernel.
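
If you want to call the new in-kernel FPU interface yourself (the
kernel_take_fpu_kni()/kernel_release_fpu_kni() macros added in
include/asm-i386/i387.h below), the call pattern looks roughly like the
sketch here. This is only an illustration, not part of the patch -- the
function name is made up, and it assumes you've already verified that the
CPU supports KNI (see arch/i386/lib/best_function.c for how the fixups do
that). kni_memcpy() in arch/i386/lib/simd.c is a real user of the same
pattern.

	#include <asm/i387.h>

	/* hypothetical example, not part of the patch */
	static void example_xmm_user(void)
	{
		unsigned long flags;
		int recursive = 0;
		char xmm_space[64];	/* saves xmm0-xmm3 if we nest */

		/* NULL task_switch_regs: this code never faults or sleeps */
		kernel_take_fpu_kni(recursive, &xmm_space[0], NULL, flags);

		/* ... use up to four XMM registers (xmm0-xmm3) here ... */

		SFENCE();
		kernel_release_fpu_kni(recursive, &xmm_space[0], flags);
	}
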
--
Doug Ledford <[EMAIL PROTECTED]>
Opinions expressed are my own, but
they should be everybody's.
--- linux/init/main.c.PIII Sun Dec 5 14:23:13 1999
+++ linux/init/main.c Sun Dec 5 14:23:14 1999
@@ -99,6 +99,7 @@
#ifdef __i386__
extern void ioapic_pirq_setup(char *str, int *ints);
extern void ioapic_setup(char *str, int *ints);
+extern void x86_serial_nr_setup(char *str, int *ints);
#endif
extern void no_scroll(char *str, int *ints);
extern void kbd_reset_setup(char *str, int *ints);
@@ -581,6 +582,9 @@
{ "noapic", ioapic_setup },
{ "pirq=", ioapic_pirq_setup },
#endif
+#endif
+#ifdef __i386__
+ { "x86_serial_nr", x86_serial_nr_setup },
#endif
#ifdef CONFIG_BLK_DEV_RAM
{ "ramdisk_start=", ramdisk_start_setup },
--- linux/include/asm-i386/bugs.h.PIII Mon Aug 9 15:04:57 1999
+++ linux/include/asm-i386/bugs.h Sun Dec 5 14:23:14 1999
@@ -18,6 +18,7 @@
*/
#include <linux/config.h>
+#include <linux/stddef.h>
#include <asm/processor.h>
#include <asm/msr.h>
@@ -69,6 +70,45 @@
#endif
return;
}
+#ifdef CONFIG_X86_FX
+ /*
+ * If we got this far we can safely turn on FXSAVE/FXRSTOR,
+ * but make sure we are 16-byte aligned first.
+ */
+ if (offsetof(struct task_struct, tss.i387.hard.fxsave.fxcwd) & 15) {
+ /*
+ * This triggers a link-time error if we manage to
+ * break alignment somehow.
+ */
+ extern void __buggy_fxsr_alignment(void);
+
+ __buggy_fxsr_alignment();
+ }
+ if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) {
+ printk("Enabling extended fast FPU save and restore...");
+ set_in_cr4(X86_CR4_OSFXSR);
+ printk("done.\n");
+ }
+ /*
+ * Note, Katmai instructions are enabled as soon as you start
+ * using the FXSAVE/RESTORE stuff. This setting only
+ * indicates support for the masked/unmasked exceptions on
+ * the new PIII cpus. We don't have an exception 19 handler
+ * for this yet, but we set this bit anyway. It'll kill us
+ * the first time we take an unmasked KNI exception, but since
+ * no userland apps currently use KNI, it isn't an issue yet.
+ * We should have the handler added by then.
+ */
+ if (boot_cpu_data.x86_capability & X86_FEATURE_XMM) {
+ printk("Not enabling KNI unmasked exception support\n");
+ printk("Exception 19 error handler not integrated yet\n");
+#if 0
+ set_in_cr4(X86_CR4_OSXMMEXCPT);
+ printk("done.\n");
+#endif
+ }
+#endif
+ disable_serial_nr();
if (mca_pentium_flag) {
/* The IBM Model 95 machines with pentiums lock up on
* fpu test, so we avoid it. All pentiums have inbuilt
@@ -117,23 +157,23 @@
return;
if (!ignore_irq13) {
printk("OK, FPU using old IRQ 13 error reporting\n");
- return;
+ } else {
+ __asm__("fninit\n\t"
+ "fldl %1\n\t"
+ "fdivl %2\n\t"
+ "fmull %2\n\t"
+ "fldl %1\n\t"
+ "fsubp %%st,%%st(1)\n\t"
+ "fistpl %0\n\t"
+ "fwait\n\t"
+ "fninit"
+ : "=m" (*&boot_cpu_data.fdiv_bug)
+ : "m" (*&x), "m" (*&y));
+ if (!boot_cpu_data.fdiv_bug)
+ printk("OK, FPU using exception 16 error reporting.\n");
+ else
+ printk("Hmm, FPU using exception 16 error reporting with FDIV
+bug.\n");
}
- __asm__("fninit\n\t"
- "fldl %1\n\t"
- "fdivl %2\n\t"
- "fmull %2\n\t"
- "fldl %1\n\t"
- "fsubp %%st,%%st(1)\n\t"
- "fistpl %0\n\t"
- "fwait\n\t"
- "fninit"
- : "=m" (*&boot_cpu_data.fdiv_bug)
- : "m" (*&x), "m" (*&y));
- if (!boot_cpu_data.fdiv_bug)
- printk("OK, FPU using exception 16 error reporting.\n");
- else
- printk("Hmm, FPU using exception 16 error reporting with FDIV bug.\n");
}
__initfunc(static void check_hlt(void))
@@ -419,5 +459,7 @@
check_amd_k6();
check_pentium_f00f();
check_cyrix_coma();
+ boot_cpu_data.enable_fixups = 1; /* should be safe to use MMX/MMX2 */
+ /* kernel functions now */
system_utsname.machine[1] = '0' + boot_cpu_data.x86;
}
--- linux/include/asm-i386/i387.h.PIII Sun Dec 5 14:23:14 1999
+++ linux/include/asm-i386/i387.h Sun Dec 5 14:23:14 1999
@@ -0,0 +1,313 @@
+/*
+ * include/asm-i386/i387.h
+ *
+ * Copyright (c) 1999 Doug Ledford <[EMAIL PROTECTED]>
+ *
+ * Made from various code bits pulled from other files
+ * in order to put things together in a way that made
+ * sense.
+ *
+ * FX/FPU support:
+ * Copyright (c) 1999 Ingo Molnar <[EMAIL PROTECTED]>,
+ * Gabriel Paubert <[EMAIL PROTECTED]>
+ */
+
+#ifndef __ASM_I386_I387_H
+#define __ASM_I386_I387_H
+
+extern int i387_hard_to_user ( struct user_i387_struct * user,
+ union i387_hard_union * hard);
+extern int i387_user_to_hard ( union i387_hard_union * hard,
+ struct user_i387_struct * user);
+
+/*
+ * Fill out the reserved bits, treat it as an fsave struct since the
+ * union makes this work for both fsave and fxsave structs.
+ */
+#ifdef CONFIG_X86_FX
+
+#define i387_save_hard(x) \
+do { \
+ if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \
+ __asm__ __volatile__("fxsave %0" \
+ : "=m" ((x).hard.fxsave.fxcwd)); \
+ } else { \
+ __asm__ __volatile__("fnsave %0; fwait;" \
+ : "=m" ((x).hard.fsave.cwd)); \
+ } \
+} while(0)
+
+#define i387_restore_hard(x) \
+do { \
+ if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \
+ __asm__ __volatile__("fxrstor %0" \
+ : \
+ : "m" ((x).hard.fxsave.fxcwd)); \
+ } else { \
+ __asm__ __volatile__("frstor %0" \
+ : \
+ :"m" ((x).hard.fsave.cwd)); \
+ } \
+} while(0)
+
+#define i387_set_cwd(x,v) \
+do { \
+ if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \
+ (x).fxsave.fxcwd = (short)(v); \
+ } else { \
+ (x).fsave.cwd = ((long)(v) | 0xffff0000); \
+ } \
+} while(0)
+
+#define i387_set_swd(x,v) \
+do { \
+ if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \
+ (x).fxsave.fxswd = (short)(v); \
+ } else { \
+ (x).fsave.swd = ((long)(v) | 0xffff0000); \
+ } \
+} while(0)
+
+#define i387_set_twd(x,v) \
+do { \
+ if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) { \
+ (x).fxsave.fxtwd = (short)(v); \
+ } else { \
+ (x).fsave.twd = ((long)(v) | 0xffff0000); \
+ } \
+} while(0)
+
+static inline unsigned short fputag_KNI_to_387(unsigned char tb) {
+ unsigned short tw = tb;
+ tw = (tw | (tw << 4)) & 0x0f0f; /* zzzz7654zzzz3210 */
+ tw = (tw | (tw << 2)) & 0x3333; /* zz76zz54zz32zz10 */
+ tw = (tw | (tw << 1)) & 0x5555; /* z7z6z5z4z3z2z1z0 */
+ tw = ~(tw * 3);
+ return tw;
+}
+
+static inline unsigned char fputag_387_to_KNI(unsigned short tw) {
+ tw = ~tw & 0x5555; /* z7z6z5z4z3z2z1z0 */
+ tw = (tw | (tw >> 1)) & 0x3333; /* zz76zz54zz32zz10 */
+ tw = (tw | (tw >> 2)) & 0x0f0f; /* zzzz7654zzzz3210 */
+ tw = (tw | (tw >> 4)) & 0x00ff; /* zzzzzzzz76543210 */
+ return tw;
+}
+
+#else /* CONFIG_X86_FX */
+
+#define i387_save_hard(x) \
+do { \
+ __asm__ __volatile__("fnsave %0; fwait;" \
+ : "=m" ((x).hard.fsave.cwd)); \
+} while(0)
+
+#define i387_restore_hard(x) \
+do { \
+ __asm__ __volatile__("frstor %0" \
+ : \
+ :"m" ((x).hard.fsave.cwd)); \
+} while(0)
+
+#define i387_set_cwd(x,v) \
+do { (x).fsave.cwd = ((long)(v) | 0xffff0000); } while(0)
+
+#define i387_set_swd(x,v) \
+do { (x).fsave.swd = ((long)(v) | 0xffff0000); } while(0)
+
+#define i387_set_twd(x,v) \
+do { (x).fsave.twd = ((long)(v) | 0xffff0000); } while(0)
+
+#endif /* CONFIG_X86_FX */
+
+/*
+ * FPU lazy state save handling..
+ */
+#define save_kern_fpu(tsk) do { \
+ if(tsk->tss.mmx_reg_space != NULL) \
+ __asm__("movq %%mm0, 0x00(%0)\n\t" \
+ "movq %%mm1, 0x08(%0)\n\t" \
+ "movq %%mm2, 0x10(%0)\n\t" \
+ "movq %%mm3, 0x18(%0)\n\t" \
+ :: "r" (tsk->tss.mmx_reg_space):"memory"); \
+ if(tsk->tss.kni_reg_space != NULL) \
+ __asm__("movups %%xmm0, 0x00(%0)\n\t" \
+ "movups %%xmm1, 0x10(%0)\n\t" \
+ "movups %%xmm2, 0x20(%0)\n\t" \
+ "movups %%xmm3, 0x30(%0)\n\t" \
+ :: "r" (tsk->tss.kni_reg_space):"memory"); \
+} while (0)
+
+#define unlazy_fpu(tsk) do { \
+ if (tsk->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY) { \
+ save_kern_fpu(tsk); \
+ if (!(tsk->flags & PF_USEDFPU)) { \
+ stts(); \
+ } \
+ } \
+ if (tsk->flags & PF_USEDFPU) { \
+ if (!(tsk->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED)) { \
+ i387_save_hard(tsk->tss.i387); \
+ } \
+ tsk->flags &= ~PF_USEDFPU; \
+ stts(); \
+ } \
+} while (0)
+
+#define clear_fpu(tsk) do { \
+ if ( (tsk->flags & PF_USEDFPU) || \
+ (tsk->tss.x86_fpustate) ) { \
+ tsk->flags &= ~PF_USEDFPU; \
+ tsk->tss.x86_fpustate = 0; \
+ stts(); \
+ } \
+} while (0)
+
+/*
+ * For when we want to use the FPU in kernel code
+ *
+ * These functions allow the use of up to 4 KNI based xmm registers on the
+ * Pentium III processors or up to 4 MMX registers on Pentium MMX and above
+ * or compatible processors. Pick the routines that you need based on the
+ * regs you are going to use. Keep in mind that these are intended to be
+ * used only after you've verified that the processor supports these
+ * operations. Use them before you've done that and watch your machine go
+ * boom. Take a look in arch/i386/lib/best_function.c for an example of
+ * how to fixup the kernel with kni/mmx using functions once the CPU
+ * capabilities have been determined.
+ *
+ * In all of these functions:
+ *
+ * recursive - int, used to determine what the state is at restore time
+ * regs - char * to an array that is 32 bytes for mmx and 64 bytes for kni
+ * which is then used to save off the contents of the current
+ * regs to be recursively safe
+ * task_switch_regs - char * to another array of the same size as the one
+ * above, but this array is optional. If your function might get
+ * pre-empted by another task then this pointer should be non-NULL
+ * so that at unlazy_fpu() time in the switch_to() function we
+ * can save your register state (copy_*_user functions are an example
+ * of functions that need this, since they can take a page fault and
+ * while that fault is being serviced the scheduler is free to run
+ * another task entirely).
+ * irqflags - unsigned long used to store IRQ state
+ */
+
+#define SAVE_MMX_REGS(regs) \
+ __asm__ __volatile__("movq %%mm0, 0x00(%0)\n\t" \
+ "movq %%mm1, 0x08(%0)\n\t" \
+ "movq %%mm2, 0x10(%0)\n\t" \
+ "movq %%mm3, 0x18(%0)\n\t" \
+ : : "r" ((regs)) : "memory" );
+
+#define RESTORE_MMX_REGS(regs) \
+ __asm__ __volatile__("movq 0x00(%0), %%mm0\n\t" \
+ "movq 0x08(%0), %%mm1\n\t" \
+ "movq 0x10(%0), %%mm2\n\t" \
+ "movq 0x18(%0), %%mm3\n\t" \
+ : : "r" ((regs)));
+
+#define SAVE_KNI_REGS(regs) \
+ __asm__ __volatile__("movups %%xmm0, 0x00(%0)\n\t" \
+ "movups %%xmm1, 0x10(%0)\n\t" \
+ "movups %%xmm2, 0x20(%0)\n\t" \
+ "movups %%xmm3, 0x30(%0)\n\t" \
+ : : "r" ((regs)) : "memory" );
+
+#define RESTORE_KNI_REGS(regs) \
+ __asm__ __volatile__("movups 0x00(%0), %%xmm0\n\t" \
+ "movups 0x10(%0), %%xmm1\n\t" \
+ "movups 0x20(%0), %%xmm2\n\t" \
+ "movups 0x30(%0), %%xmm3\n\t" \
+ : : "r" ((regs)));
+
+#define SFENCE() \
+ __asm__ __volatile__("sfence":::"memory")
+
+
+extern spinlock_t kern_fpu_lock;
+
+/*
+ * Although it seems wasteful to do a unilateral clts() in the take_fpu
+ * functions, the reason I did it that way is because the alternative is
+ * to test for:
+ *
+ * if ( ( (current->flags & PF_USEDFPU) &&
+ * (current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) ) ||
+ * ( !(current->flags & PF_USEDFPU) &&
+ * !(current->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY) ) )
+ *
+ */
+
+#define kernel_take_fpu_mmx(recursive, regs, task_switch_regs, irqflags) do { \
+ spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \
+ clts(); \
+ (recursive) = (current->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY); \
+ if ( (current->flags & PF_USEDFPU) && \
+ !(current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) ){ \
+ i387_save_hard(current->tss.i387); \
+ current->tss.x86_fpustate |= X86_FPUSTATE_USER_SAVED; \
+ } \
+ if ((recursive) & X86_FPUSTATE_KERN_MMX) { \
+ SAVE_MMX_REGS((regs)); \
+ } else { \
+ current->tss.mmx_reg_space = (task_switch_regs); \
+ current->tss.x86_fpustate |= X86_FPUSTATE_KERN_MMX; \
+ } \
+ spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \
+} while (0)
+
+#define kernel_release_fpu_mmx(recursive, regs, irqflags) do { \
+ spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \
+ if ((recursive) & X86_FPUSTATE_KERN_MMX) { \
+ RESTORE_MMX_REGS((regs)); \
+ } else { \
+ current->tss.x86_fpustate &= ~X86_FPUSTATE_KERN_MMX; \
+ current->tss.mmx_reg_space = NULL; \
+ } \
+ if ((recursive) == 0) { \
+ stts(); \
+ } \
+ spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \
+} while (0)
+
+#define kernel_take_fpu_kni(recursive, regs, task_switch_regs, irqflags) do { \
+ spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \
+ clts(); \
+ (recursive) = current->tss.x86_fpustate; \
+ if ( (current->flags & PF_USEDFPU) || \
+ (current->tss.x86_fpustate & X86_FPUSTATE_KERN_KNI) ) { \
+ SAVE_KNI_REGS((regs)); \
+ } \
+ if (!(current->tss.x86_fpustate & X86_FPUSTATE_KERN_KNI)) { \
+ current->tss.kni_reg_space = (task_switch_regs); \
+ current->tss.x86_fpustate |= X86_FPUSTATE_KERN_KNI; \
+ } \
+ spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \
+} while (0)
+
+
+#define kernel_release_fpu_kni(recursive, regs, irqflags) do { \
+ spin_lock_irqsave(&kern_fpu_lock, (irqflags)); \
+ if ( (current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) && \
+ !(((recursive) & X86_FPUSTATE_USER_SAVED) && \
+ (current->flags & PF_USEDFPU)) ) { \
+ i387_restore_hard(current->tss.i387); \
+ current->tss.x86_fpustate &= ~X86_FPUSTATE_USER_SAVED; \
+ } \
+ if ( ((recursive) & X86_FPUSTATE_KERN_KNI) || \
+ (current->flags & PF_USEDFPU) ) { \
+ RESTORE_KNI_REGS((regs)); \
+ } \
+ if (((recursive) & X86_FPUSTATE_KERN_KNI) == 0) { \
+ current->tss.x86_fpustate &= ~X86_FPUSTATE_KERN_KNI; \
+ current->tss.kni_reg_space = NULL; \
+ } \
+ if ( ((recursive) == 0) && ((current->flags & PF_USEDFPU) == 0) ) { \
+ stts(); \
+ } \
+ spin_unlock_irqrestore(&kern_fpu_lock, (irqflags)); \
+} while (0)
+
+
+#endif /* __ASM_I386_I387_H */
--- linux/include/asm-i386/processor.h.PIII Tue May 11 13:35:44 1999
+++ linux/include/asm-i386/processor.h Sun Dec 5 14:23:14 1999
@@ -7,10 +7,11 @@
#ifndef __ASM_I386_PROCESSOR_H
#define __ASM_I386_PROCESSOR_H
+#include <linux/config.h>
#include <asm/vm86.h>
#include <asm/math_emu.h>
-#include <asm/segment.h>
#include <asm/page.h>
+#include <asm/user.h>
/*
* CPU type and hardware bug flags. Kept separately for each CPU.
@@ -29,6 +30,7 @@
char rfu;
int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
__u32 x86_capability;
+ __u32 mmu_cr4_features;
char x86_vendor_id[16];
char x86_model_id[64];
int x86_cache_size; /* in KB - valid for CPUS which support this
@@ -36,6 +38,7 @@
int fdiv_bug;
int f00f_bug;
int coma_bug;
+ int enable_fixups;
unsigned long loops_per_sec;
unsigned long *pgd_quick;
unsigned long *pte_quick;
@@ -70,16 +73,16 @@
#define X86_FEATURE_PGE 0x00002000 /* Page Global Enable */
#define X86_FEATURE_MCA 0x00004000 /* Machine Check Architecture */
#define X86_FEATURE_CMOV 0x00008000 /* CMOV instruction (FCMOVCC and FCOMI too if FPU present) */
-#define X86_FEATURE_PAT 0x00010000 /* Page Attribute Table */
+#define X86_FEATURE_PAT 0x00010000 /* Page Attribute Table */
#define X86_FEATURE_PSE36 0x00020000 /* 36-bit PSEs */
-#define X86_FEATURE_18 0x00040000
+#define X86_FEATURE_PN 0x00040000 /* 96 bit CPU serial # */
#define X86_FEATURE_19 0x00080000
#define X86_FEATURE_20 0x00100000
#define X86_FEATURE_21 0x00200000
#define X86_FEATURE_22 0x00400000
#define X86_FEATURE_MMX 0x00800000 /* multimedia extensions */
#define X86_FEATURE_FXSR 0x01000000 /* FXSAVE and FXRSTOR instructions (fast save and restore of FPU context), and CR4.OSFXSR (OS uses these instructions) available */
-#define X86_FEATURE_25 0x02000000
+#define X86_FEATURE_XMM 0x02000000 /* Intel MMX2 instruction set */
#define X86_FEATURE_26 0x04000000
#define X86_FEATURE_27 0x08000000
#define X86_FEATURE_28 0x10000000
@@ -89,6 +92,82 @@
extern struct cpuinfo_x86 boot_cpu_data;
+#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
+#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
+#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
+#define X86_CR4_DE 0x0008 /* enable debugging extensions */
+#define X86_CR4_PSE 0x0010 /* enable page size extensions */
+#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
+#define X86_CR4_MCE 0x0040 /* Machine check enable */
+#define X86_CR4_PGE 0x0080 /* enable global pages */
+#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
+#define X86_CR4_OSFXSR 0x0200 /* fast FPU save/restore */
+#define X86_CR4_OSXMMEXCPT 0x0400 /* KNI (MMX2) unmasked exception 16 */
+ /* handler is available */
+
+/*
+ * Some defines for use with the x86_fpustate variable in the new
+ * thread struct. We use these because the rest of the kernel doesn't
+ * like us messing with current->flags at arbitrary times ;-)
+ */
+#define X86_FPUSTATE_USER_SAVED 0x0001
+#define X86_FPUSTATE_KERN_ANY 0x0006
+#define X86_FPUSTATE_KERN_MMX 0x0002
+#define X86_FPUSTATE_KERN_KNI 0x0004
+
+/*
+ * Save the cr4 feature set we're using (ie
+ * Pentium 4MB enable and PPro Global page
+ * enable), so that any CPU's that boot up
+ * after us can get the correct flags.
+ */
+
+static inline void set_in_cr4(unsigned long mask)
+{
+ boot_cpu_data.mmu_cr4_features |= mask;
+ __asm__("movl %%cr4,%%eax\n\t"
+ "orl %0,%%eax\n\t"
+ "movl %%eax,%%cr4\n"
+ : : "irg" (mask)
+ :"ax");
+}
+
+extern int disable_x86_serial_nr;
+
+static inline void disable_serial_nr(void)
+{
+ if ( disable_x86_serial_nr &&
+ (boot_cpu_data.x86_capability & X86_FEATURE_PN) ) {
+ printk("Disabling CPUID Serial number...");
+ __asm__ __volatile__( "movl $0x119,%%ecx\n\t"
+ "rdmsr\n\t"
+ "orl $0x00200000,%%eax\n\t"
+ "wrmsr":::"ax","dx","cx","memory");
+ /*
+ * We might want to re-read the x86 capability set now to
+ * make sure that the PN bit has been turned off so
+ * we know that the serial number stuff is disabled.
+ *
+ * Note: in practice we don't need to re-read the registers.
+ * On a warm reboot (one that doesn't power the machine down)
+ * the serial number no longer gets disabled here because it
+ * already is disabled, which tells us the MSR write stuck.
+ */
+ printk("done.\n");
+ }
+}
+
+static inline void load_default_mxcsr(void)
+{
+ long mxcsr = 0x1f80;
+
+ if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+ (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+ __asm__("ldmxcsr %0": :"m" (mxcsr));
+ }
+}
+
+
#ifdef __SMP__
extern struct cpuinfo_x86 cpu_data[];
#define current_cpu_data cpu_data[smp_processor_id()]
@@ -171,36 +250,61 @@
*/
#define IO_BITMAP_SIZE 32
-struct i387_hard_struct {
- long cwd;
- long swd;
- long twd;
- long fip;
- long fcs;
- long foo;
- long fos;
- long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
- long status; /* software status information */
+struct i387_hard_fsave {
+ long cwd;
+ long swd;
+ long twd;
+ long fip;
+ long fcs;
+ long foo;
+ long fos;
+ long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
+};
+
+/*
+ * has to be 128-bit aligned
+ */
+struct i387_hard_fxsave {
+ unsigned short fxcwd;
+ unsigned short fxswd;
+ unsigned short fxtwd;
+ unsigned short fxfopcode;
+ long fxfip;
+ short fxfcs;
+ short __reserved_00;
+ long fxfoo;
+ short fxfos;
+ short __reserved_01;
+ long mxcsr;
+ long __reserved_02;
+ long st_space[32]; /* 8*16 bytes for each FP/MMX-reg = 128 bytes */
+ long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
+ long __reserved_03 [14*4]; /* 14 16byte lines for remainder */
+} __attribute__ ((aligned (16)));
+
+union i387_hard_union {
+ struct i387_hard_fxsave fxsave;
+ struct i387_hard_fsave fsave;
};
struct i387_soft_struct {
- long cwd;
- long swd;
- long twd;
- long fip;
- long fcs;
- long foo;
- long fos;
- long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
- unsigned char ftop, changed, lookahead, no_update, rm, alimit;
- struct info *info;
- unsigned long entry_eip;
+ long cwd;
+ long swd;
+ long twd;
+ long fip;
+ long fcs;
+ long foo;
+ long fos;
+ long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
+ unsigned char ftop, changed, lookahead, no_update, rm, alimit;
+ struct info *info;
+ unsigned long entry_eip;
};
union i387_union {
- struct i387_hard_struct hard;
+ union i387_hard_union hard;
struct i387_soft_struct soft;
-};
+} __attribute__ ((aligned(16)));
typedef struct {
unsigned long seg;
@@ -242,6 +346,10 @@
struct vm86_struct * vm86_info;
unsigned long screen_bitmap;
unsigned long v86flags, v86mask, v86mode, saved_esp0;
+ volatile long x86_fpustate;
+ char *mmx_reg_space;
+ char *kni_reg_space;
+
};
#define INIT_MMAP \
@@ -263,8 +371,9 @@
{~0, }, /* ioperm */ \
_TSS(0), 0, 0, 0, (mm_segment_t) { 0 }, /* obsolete */ \
{ 0, }, \
- { { 0, }, }, /* 387 state */ \
+ { { { 0, }, }, }, /* 387 state */ \
NULL, 0, 0, 0, 0, 0, /* vm86_info */ \
+ 0, NULL, NULL /* fpustate, mmx, and xmm_reg_space */ \
}
#define start_thread(regs, new_eip, new_esp) do { \
@@ -289,27 +398,6 @@
extern void copy_segments(int nr, struct task_struct *p, struct mm_struct * mm);
extern void release_segments(struct mm_struct * mm);
extern void forget_segments(void);
-
-/*
- * FPU lazy state save handling..
- */
-#define save_fpu(tsk) do { \
- asm volatile("fnsave %0\n\tfwait":"=m" (tsk->tss.i387)); \
- tsk->flags &= ~PF_USEDFPU; \
- stts(); \
-} while (0)
-
-#define unlazy_fpu(tsk) do { \
- if (tsk->flags & PF_USEDFPU) \
- save_fpu(tsk); \
-} while (0)
-
-#define clear_fpu(tsk) do { \
- if (tsk->flags & PF_USEDFPU) { \
- tsk->flags &= ~PF_USEDFPU; \
- stts(); \
- } \
-} while (0)
/*
* Return saved PC of a blocked thread.
--- linux/include/asm-i386/string.h.PIII Thu Apr 22 12:59:46 1999
+++ linux/include/asm-i386/string.h Sun Dec 5 14:23:14 1999
@@ -14,6 +14,10 @@
#include <asm/string-486.h>
#else
+#ifndef _LINUX_CONFIG_H
+#include <linux/config.h>
+#endif
+
/*
* This string-include defines all string functions as inline
* functions. Use gcc. It also assumes ds=es=data space, this should be
@@ -293,10 +297,21 @@
}
#define __HAVE_ARCH_MEMCPY
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+extern void * __kni_memcpy(void * to, const void * from, size_t n);
+extern void * best_memcpy(void * to, const void * from, size_t n);
+#define memcpy(t, f, n) \
+(__builtin_constant_p(n) ? \
+ (((n) < 128) ? \
+ __constant_memcpy((t),(f),(n)) : \
+ best_memcpy((t),(f),(n))) : \
+ best_memcpy((t),(f),(n)))
+#else
#define memcpy(t, f, n) \
(__builtin_constant_p(n) ? \
__constant_memcpy((t),(f),(n)) : \
__memcpy((t),(f),(n)))
+#endif
#define __HAVE_ARCH_MEMMOVE
extern inline void * memmove(void * dest,const void * src, size_t n)
@@ -449,21 +464,32 @@
#undef COMMON
}
-#define __constant_c_x_memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_c_and_count_memset((s),(c),(count)) : \
- __constant_c_memset((s),(c),(count)))
+#define __constant_x_count_memset(s, c, count) \
+(__builtin_constant_p(c) ? \
+ __constant_c_and_count_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) :\
+ __constant_count_memset((s),(c),(count)))
#define __memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_count_memset((s),(c),(count)) : \
+(__builtin_constant_p(c) ? \
+ __constant_c_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) : \
__memset_generic((s),(c),(count)))
#define __HAVE_ARCH_MEMSET
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+extern void * __kni_memset(void * s, char c, size_t count);
+extern void * best_memset(void * s, char c, size_t count);
#define memset(s, c, count) \
-(__builtin_constant_p(c) ? \
- __constant_c_x_memset((s),(0x01010101UL*(unsigned char)(c)),(count)) : \
+(__builtin_constant_p(count) ? \
+ (((count) < 128) ? \
+ __constant_x_count_memset((s),(c),(count)) : \
+ best_memset((s),(c),(count))) : \
+ best_memset((s),(c),(count)))
+#else
+#define memset(s, c, count) \
+(__builtin_constant_p(count) ? \
+ __constant_x_count_memset((s),(c),(count)) : \
__memset((s),(c),(count)))
+#endif
/*
* find the first occurrence of byte 'c', or 1 past the area if none
--- linux/include/asm-i386/uaccess.h.PIII Tue Oct 19 20:14:02 1999
+++ linux/include/asm-i386/uaccess.h Sun Dec 5 14:23:14 1999
@@ -571,19 +571,61 @@
return n;
}
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+
+/*
+ * The XMM based copy_*_user() function declarations...the best_*_user()
+ * routines need this
+ */
+unsigned long kni_copy_to_user(void *, const void *, unsigned long);
+unsigned long kni_copy_from_user(void *, const void *, unsigned long);
+unsigned long __kni_copy_to_user_nocheck(void *, const void *, unsigned long);
+unsigned long __kni_copy_from_user_nocheck(void *, const void *, unsigned long);
+
+unsigned long best_copy_to_user(void *, const void *, unsigned long);
+unsigned long best_copy_from_user(void *, const void *, unsigned long);
+unsigned long __best_copy_to_user(void *, const void *, unsigned long);
+unsigned long __best_copy_from_user(void *, const void *, unsigned long);
+
#define copy_to_user(to,from,n) \
(__builtin_constant_p(n) ? \
+ (((n) < 128) ? \
__constant_copy_to_user((to),(from),(n)) : \
- __generic_copy_to_user((to),(from),(n)))
+ best_copy_to_user((to),(from),(n))) : \
+ best_copy_to_user((to),(from),(n)))
#define copy_from_user(to,from,n) \
(__builtin_constant_p(n) ? \
+ (((n) < 128) ? \
__constant_copy_from_user((to),(from),(n)) : \
- __generic_copy_from_user((to),(from),(n)))
+ best_copy_from_user((to),(from),(n))) : \
+ best_copy_from_user((to),(from),(n)))
-#define copy_to_user_ret(to,from,n,retval) ({ if (copy_to_user(to,from,n)) return retval; })
+#define __copy_to_user(to,from,n) \
+ (__builtin_constant_p(n) ? \
+ (((n) < 128) ? \
+ __constant_copy_to_user_nocheck((to),(from),(n)) : \
+ __best_copy_to_user((to),(from),(n))) : \
+ __best_copy_to_user((to),(from),(n)))
-#define copy_from_user_ret(to,from,n,retval) ({ if (copy_from_user(to,from,n)) return retval; })
+#define __copy_from_user(to,from,n) \
+ (__builtin_constant_p(n) ? \
+ (((n) < 128) ? \
+ __constant_copy_from_user_nocheck((to),(from),(n)) : \
+ __best_copy_from_user((to),(from),(n))) : \
+ __best_copy_from_user((to),(from),(n)))
+
+#else /* CONFIG_X86_CPU_OPTIMIZATIONS */
+
+#define copy_to_user(to,from,n) \
+ (__builtin_constant_p(n) ? \
+ __constant_copy_to_user((to),(from),(n)) : \
+ __generic_copy_to_user((to),(from),(n)))
+
+#define copy_from_user(to,from,n) \
+ (__builtin_constant_p(n) ? \
+ __constant_copy_from_user((to),(from),(n)) : \
+ __generic_copy_from_user((to),(from),(n)))
#define __copy_to_user(to,from,n) \
(__builtin_constant_p(n) ? \
@@ -594,6 +636,11 @@
(__builtin_constant_p(n) ? \
__constant_copy_from_user_nocheck((to),(from),(n)) : \
__generic_copy_from_user_nocheck((to),(from),(n)))
+#endif
+
+#define copy_to_user_ret(to,from,n,retval) ({ if (copy_to_user(to,from,n)) return retval; })
+
+#define copy_from_user_ret(to,from,n,retval) ({ if (copy_from_user(to,from,n)) return retval; })
long strncpy_from_user(char *dst, const char *src, long count);
long __strncpy_from_user(char *dst, const char *src, long count);
--- linux/include/asm-i386/io.h.PIII Tue May 11 13:36:03 1999
+++ linux/include/asm-i386/io.h Sun Dec 5 14:23:14 1999
@@ -157,9 +157,9 @@
#define writew(b,addr) (*(volatile unsigned short *) __io_virt(addr) = (b))
#define writel(b,addr) (*(volatile unsigned int *) __io_virt(addr) = (b))
-#define memset_io(a,b,c) memset(__io_virt(a),(b),(c))
-#define memcpy_fromio(a,b,c) memcpy((a),__io_virt(b),(c))
-#define memcpy_toio(a,b,c) memcpy(__io_virt(a),(b),(c))
+#define memset_io(a,b,c) __memset_generic(__io_virt(a),(b),(c))
+#define memcpy_fromio(a,b,c) __memcpy((a),__io_virt(b),(c))
+#define memcpy_toio(a,b,c) __memcpy(__io_virt(a),(b),(c))
/*
* Again, i386 does not require mem IO specific function.
--- linux/arch/i386/mm/init.c.PIII Tue Oct 19 20:14:00 1999
+++ linux/arch/i386/mm/init.c Sun Dec 5 14:23:14 1999
@@ -184,34 +184,6 @@
extern char _text, _etext, _edata, __bss_start, _end;
extern char __init_begin, __init_end;
-#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
-#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
-#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
-#define X86_CR4_DE 0x0008 /* enable debugging extensions */
-#define X86_CR4_PSE 0x0010 /* enable page size extensions */
-#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
-#define X86_CR4_MCE 0x0040 /* Machine check enable */
-#define X86_CR4_PGE 0x0080 /* enable global pages */
-#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
-
-/*
- * Save the cr4 feature set we're using (ie
- * Pentium 4MB enable and PPro Global page
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
- */
-unsigned long mmu_cr4_features __initdata = 0;
-
-static inline void set_in_cr4(unsigned long mask)
-{
- mmu_cr4_features |= mask;
- __asm__("movl %%cr4,%%eax\n\t"
- "orl %0,%%eax\n\t"
- "movl %%eax,%%cr4\n"
- : : "irg" (mask)
- :"ax");
-}
-
/*
* allocate page table(s) for compile-time fixed mappings
*/
--- linux/arch/i386/lib/Makefile.PIII Sun Dec 27 13:33:13 1998
+++ linux/arch/i386/lib/Makefile Sun Dec 5 14:23:14 1999
@@ -9,4 +9,8 @@
L_OBJS = checksum.o old-checksum.o semaphore.o delay.o \
usercopy.o getuser.o putuser.o
+ifeq ($(CONFIG_X86_CPU_OPTIMIZATIONS),y)
+ L_OBJS += best_function.o simd.o
+endif
+
include $(TOPDIR)/Rules.make
--- linux/arch/i386/lib/best_function.c.PIII Sun Dec 5 14:23:14 1999
+++ linux/arch/i386/lib/best_function.c Sun Dec 5 14:23:31 1999
@@ -0,0 +1,196 @@
+/*
+ * SIMD functions. These replace the functions in asm-i386/string.h
+ * whenever it makes sense. These also un-inline those functions.
+ *
+ * Copyright 1999, Doug Ledford <[EMAIL PROTECTED]>
+ *
+ * These functions are simple and trivial, consider them to be
+ * public domain
+ */
+
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+
+/*
+ * We declare our accelerator functions here since this is the only place
+ * that needs the declarations, so a separate header file isn't worth
+ * the trouble.
+ */
+extern void * kni_memcpy(void *, const void *, size_t);
+extern void * kni_memset(void *, char, size_t);
+extern unsigned long kni_copy_to_user(void *, const void *, unsigned long);
+extern unsigned long kni_copy_from_user(void *, const void *, unsigned long);
+extern unsigned long __kni_copy_to_user_nocheck(void *, const void *, unsigned long);
+extern unsigned long __kni_copy_from_user_nocheck(void *, const void *, unsigned long);
+
+static void * best_memcpy_final(void *, const void *, size_t);
+static void * best_memset_final(void *, char, size_t);
+static unsigned long best_copy_to_user_final(void *, const void *, unsigned long);
+static unsigned long best_copy_from_user_final(void *, const void *, unsigned long);
+static unsigned long __best_copy_to_user_final(void *, const void *, unsigned long);
+static unsigned long __best_copy_from_user_final(void *, const void *, unsigned long);
+
+void * best_memcpy(void * to, const void * from, size_t n)
+{
+ int BAR = (int)__builtin_return_address(0);
+ int *caller = (int *)BAR - 1;
+ if(boot_cpu_data.enable_fixups) {
+ if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+ (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+ *caller = (int)kni_memcpy - BAR;
+ return(kni_memcpy(to, from, n));
+ } else {
+ *caller = (int)best_memcpy_final - BAR;
+ return(__memcpy(to, from, n));
+ }
+ } else {
+ return(__memcpy(to, from, n));
+ }
+}
+
+static void * best_memcpy_final(void * to, const void * from, size_t n)
+{
+ return(__memcpy(to, from, n));
+}
+
+void * best_memset(void * s, char c, size_t count)
+{
+ int BAR = (int)__builtin_return_address(0);
+ int *caller = (int *)BAR - 1;
+ if(boot_cpu_data.enable_fixups) {
+ if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+ (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+ *caller = (int)kni_memset - BAR;
+ return(kni_memset(s, c, count));
+ } else {
+ *caller = (int)best_memset_final - BAR;
+ return(__memset_generic(s, c, count));
+ }
+ } else {
+ return(__memset_generic(s, c, count));
+ }
+}
+
+static void * best_memset_final(void * s, char c, size_t count)
+{
+ return(__memset_generic(s, c, count));
+}
+
+unsigned long
+best_copy_to_user(void *to, const void *from, unsigned long n)
+{
+ int BAR = (int)__builtin_return_address(0);
+ int *caller = (int *)BAR - 1;
+ if(boot_cpu_data.enable_fixups) {
+ if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+ (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+ *caller = (int)kni_copy_to_user - BAR;
+ return(kni_copy_to_user(to, from, n));
+ } else {
+ *caller = (int)best_copy_to_user_final - BAR;
+ return(best_copy_to_user_final(to, from, n));
+ }
+ } else {
+ if (access_ok(VERIFY_WRITE, to, n)) {
+ __copy_user(to,from,n);
+ }
+ return n;
+ }
+}
+
+static unsigned long
+best_copy_to_user_final(void *to, const void *from, unsigned long n)
+{
+ if (access_ok(VERIFY_WRITE, to, n)) {
+ __copy_user(to,from,n);
+ }
+ return n;
+}
+
+unsigned long
+best_copy_from_user(void *to, const void *from, unsigned long n)
+{
+ int BAR = (int)__builtin_return_address(0);
+ int *caller = (int *)BAR - 1;
+ if(boot_cpu_data.enable_fixups) {
+ if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+ (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+ *caller = (int)kni_copy_from_user - BAR;
+ return(kni_copy_from_user(to, from, n));
+ } else {
+ *caller = (int)best_copy_from_user_final - BAR;
+ return(best_copy_from_user_final(to, from, n));
+ }
+ } else {
+ if (access_ok(VERIFY_READ, from, n)) {
+ __copy_user_zeroing(to,from,n);
+ }
+ return n;
+ }
+}
+
+static unsigned long
+best_copy_from_user_final(void *to, const void *from, unsigned long n)
+{
+ if (access_ok(VERIFY_READ, from, n)) {
+ __copy_user_zeroing(to,from,n);
+ }
+ return n;
+}
+
+unsigned long
+__best_copy_to_user(void *to, const void *from, unsigned long n)
+{
+ int BAR = (int)__builtin_return_address(0);
+ int *caller = (int *)BAR - 1;
+ if(boot_cpu_data.enable_fixups) {
+ if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+ (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+ *caller = (int)__kni_copy_to_user_nocheck - BAR;
+ return(__kni_copy_to_user_nocheck(to, from, n));
+ } else {
+ *caller = (int)__best_copy_to_user_final - BAR;
+ return(__best_copy_to_user_final(to, from, n));
+ }
+ } else {
+ __copy_user(to,from,n);
+ return n;
+ }
+}
+
+static unsigned long
+__best_copy_to_user_final(void *to, const void *from, unsigned long n)
+{
+ __copy_user(to,from,n);
+ return n;
+}
+
+unsigned long
+__best_copy_from_user(void *to, const void *from, unsigned long n)
+{
+ int BAR = (int)__builtin_return_address(0);
+ int *caller = (int *)BAR - 1;
+ if(boot_cpu_data.enable_fixups) {
+ if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+ (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
+ *caller = (int)__kni_copy_from_user_nocheck - BAR;
+ return(__kni_copy_from_user_nocheck(to, from, n));
+ } else {
+ *caller = (int)__best_copy_from_user_final - BAR;
+ return(__best_copy_from_user_final(to, from, n));
+ }
+ } else {
+ __copy_user_zeroing(to,from,n);
+ return n;
+ }
+}
+
+static unsigned long
+__best_copy_from_user_final(void *to, const void *from, unsigned long n)
+{
+ __copy_user_zeroing(to,from,n);
+ return n;
+}
+
--- linux/arch/i386/lib/simd.c.PIII Sun Dec 5 14:23:14 1999
+++ linux/arch/i386/lib/simd.c Sun Dec 5 14:23:14 1999
@@ -0,0 +1,435 @@
+/*
+ * SIMD functions. These replace the functions in asm-i386/string.h
+ * whenever it makes sense. These also un-inline those functions.
+ *
+ * Copyright 1999, Doug Ledford <[EMAIL PROTECTED]>
+ *
+ * These functions are simple and trivial, consider them to be
+ * public domain
+ */
+
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+
+extern void * kni_memcpy(void * to, const void * from, size_t n)
+{
+ unsigned long flags;
+ void *ret=to;
+ size_t size;
+ int recursive = 0;
+ char xmm_space[64];
+
+ /*
+ * If the transfer is too small, then use the generic routine.
+ */
+ if (n < 128) {
+ return(__memcpy(to, from, n));
+ }
+ kernel_take_fpu_kni(recursive,&xmm_space[0],NULL,flags);
+
+ /*
+ * Align the destination on a 16byte boundary.
+ * The source doesn't have to be aligned.
+ */
+ if ( (unsigned long)to & 0xf ) {
+ size = 0x10 - ((unsigned long)to & 0xf);
+ __asm__ __volatile__("movups (%0),%%xmm0\n\t"
+ "movups %%xmm0,(%1)\n\t"
+ :
+ : "r" (from),
+ "r" (to));
+ n -= size;
+ from += size;
+ to += size;
+ }
+ /*
+ * If the copy would have tailings, take care of them
+ * now instead of later
+ */
+ if(n & 0xf) {
+ size = n - 0x10;
+ __asm__ __volatile__("movups (%0),%%xmm0\n\t"
+ "movups %%xmm0,(%1)\n\t"
+ :
+ : "r" (from + size),
+ "r" (to + size));
+ n &= ~0xf;
+ }
+ /*
+ * Prefetch the first two cachelines now.
+ */
+ __asm__ __volatile__("prefetchnta 0x00(%0)\n\t"
+ "prefetchnta 0x20(%0)\n\t"
+ :
+ : "r" (from));
+ /*
+ * Copy 32 bytes at a time. The single unroll is good
+ * for a 30% performance boost in the copy. Additional
+ * unrolls are not productive. We are guaranteed to
+ * have at least 32 bytes of data to copy since the
+ * macro in string.h doesn't call into this function
+ * with less than 64 bytes of copy and we lost < 32
+ * bytes to alignment earlier.
+ */
+ while (n >= 0x20) {
+ __asm__ __volatile__(
+ "movups 0x00(%0),%%xmm0\n\t"
+ "movups 0x10(%0),%%xmm1\n\t"
+ "movntps %%xmm0,0x00(%1)\n\t"
+ "movntps %%xmm1,0x10(%1)\n\t"
+ :
+ : "r" (from), "r" (to)
+ : "memory");
+ from += 0x20;
+ /*
+ * Note: Intermixing the prefetch at *exactly* this point
+ * in time has been shown to be the fastest possible.
+ * Timing these prefetch instructions is a complete black
+ * art with nothing but trial and error showing the way.
+ * To that extent, this optimum version was found by using
+ * a userland version of this routine that we clocked for
+ * lots of runs. We then fiddled with ordering until we
+ * settled on our highest speen routines. So, the long
+ * and short of this is, don't mess with instruction ordering
+ * here or suffer permance penalties you will.
+ */
+ __asm__ __volatile__(
+ "prefetchnta 0x20(%0)\n\t"
+ :
+ : "r" (from));
+ to += 0x20;
+ n -= 0x20;
+ }
+ if (n) {
+ __asm__ __volatile__("movups 0x00(%0),%%xmm0\n\t"
+ "movntps %%xmm0,0x00(%1)\n\t"
+ :
+ : "r" (from), "r" (to)
+ : "memory");
+ }
+ SFENCE();
+ kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+ return(ret);
+}
+
+extern void * kni_memset(void * s, char c, size_t count)
+{
+ unsigned long flags;
+ size_t size;
+ void *ret=s;
+ int recursive = 0;
+ char xmm_space[64];
+
+ /*
+ * If the transfer is too small, then use the generic routine.
+ */
+ if (count < 128) {
+ return(__memset_generic(s, c, count));
+ }
+ kernel_take_fpu_kni(recursive,&xmm_space[0],NULL,flags);
+ /*
+ * Load up our XMM register with the stuff to set mem with
+ */
+ if(c == '\0') {
+ __asm__ __volatile__("xorps %%xmm0,%%xmm0\n\t"
+ "movups %%xmm0,(%0)\n\t"
+ :
+ : "r" (s));
+ } else {
+ __memset_generic(s, c, 0x10);
+ __asm__ __volatile__("movups (%0),%%xmm0"
+ :
+ : "r" (s));
+ }
+ /*
+ * align the destination on a 16 byte boundary, we can simply
+ * do the math to align things since we already populated the
+ * first 16 bytes.
+ */
+ size = (0x10 - ((unsigned long)s & 0xf));
+ count -= size;
+ s += size;
+ /*
+ * On the off chance we have tailings due to alignment issues,
+ * do them now to make later more efficient
+ */
+ if(count & 0xf) {
+ __asm__ __volatile__("movups %%xmm0,(%0)"
+ :
+ : "r" (s + (count - 0x10))
+ : "memory");
+ count &= ~0xf;
+ }
+ /*
+ * Do the copy by plopping out the register to memory.
+ * Note: Unrolling this was *totally* unproductive. My benchmark
+ * showed that one or two plops per iteration produced the same
+ * speed to within .06 MByte/s of speed. Considering that the
+ * routine benchmarked at over 3000 MByte/s, .06 is not statistically
+ * significant, and only doing one drop per loop simplifies
+ * the bookkeeping.
+ */
+ while(count) {
+ __asm__ __volatile__("movntps %%xmm0,0x00(%0)\n\t"
+ :
+ : "r" (s));
+ s += 0x10;
+ count -= 0x10;
+ }
+ SFENCE();
+ kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+ return(ret);
+}
+
+#define __kni_copy_to_user(to,from,size) \
+do { \
+ int __d0, __d1, tmp, tmp2; \
+ __asm__ __volatile__( \
+ " movl %1,%4\n" \
+ " andl $0xf,%4\n" \
+ " movups (%2),%%xmm0\n" \
+ "1: movups %%xmm0,(%1)\n" \
+ " movl $0x10,%3\n" \
+ " subl %4,%3\n" \
+ " addl %3,%2\n" \
+ " addl %3,%1\n" \
+ " subl %3,%0\n" \
+ " prefetchnta 0x00(%2)\n" \
+ " prefetchnta 0x20(%2)\n" \
+ " jmp 200f\n" \
+ "100: movups 0x00(%2),%%xmm0\n" \
+ " movups 0x10(%2),%%xmm1\n" \
+ "2: movntps %%xmm0,0x00(%1)\n" \
+ "3: movntps %%xmm1,0x10(%1)\n" \
+ " addl $0x20,%2\n" \
+ " prefetchnta 0x20(%2)\n" \
+ " addl $0x20,%1\n" \
+ " subl $0x20,%0\n" \
+ "200: cmpl $0x1f,%0\n" \
+ " ja 100b\n" \
+ " cmpl $0xf,%0\n" \
+ " jbe 300f\n" \
+ " movups 0x00(%2),%%xmm0\n" \
+ "4: movntps %%xmm0,0x00(%1)\n" \
+ " addl $0x10,%2\n" \
+ " addl $0x10,%1\n" \
+ " subl $0x10,%0\n" \
+ "300: testl %0,%0\n" \
+ " je 400f\n" \
+ " movl $0x10,%3\n" \
+ " subl %0,%3\n" \
+ " subl %3,%1\n" \
+ " subl %3,%2\n" \
+ " movups 0x00(%2),%%xmm0\n" \
+ "5: movups %%xmm0,0x00(%1)\n" \
+ " addl $0x10,%2\n" \
+ " addl $0x10,%1\n" \
+ " xorl %0,%0\n" \
+ "400:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "6: jmp 400b\n" \
+ "7: addl $0x10,%1\n" \
+ " addl $0x10,%2\n" \
+ " subl $0x10,%0\n" \
+ " jmp 400b\n" \
+ "8: addl %3,%1\n" \
+ " addl %3,%2\n" \
+ " jmp 400b\n" \
+ ".previous\n" \
+ ".section __ex_table,\"a\"\n" \
+ " .align 4\n" \
+ " .long 1b,6b\n" \
+ " .long 2b,6b\n" \
+ " .long 3b,7b\n" \
+ " .long 4b,6b\n" \
+ " .long 5b,8b\n" \
+ ".previous" \
+ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(tmp), \
+ "=r"(tmp2) \
+ : "0"(size), "1"(to), "2"(from) \
+ : "memory"); \
+} while (0)
+
+#define __kni_copy_from_user(to,from,size) \
+do { \
+ int __d0, __d1, tmp, tmp2; \
+ __asm__ __volatile__( \
+ " movl %1,%4\n" \
+ " andl $0xf,%4\n" \
+ "1: movups (%2),%%xmm0\n" \
+ " movups %%xmm0,(%1)\n" \
+ " movl $0x10,%3\n" \
+ " subl %4,%3\n" \
+ " addl %3,%2\n" \
+ " addl %3,%1\n" \
+ " subl %3,%0\n" \
+ " prefetchnta 0x00(%2)\n" \
+ " prefetchnta 0x20(%2)\n" \
+ " jmp 100f\n" \
+ "2: movups 0x00(%2),%%xmm0\n" \
+ "3: movups 0x10(%2),%%xmm1\n" \
+ " movntps %%xmm0,0x00(%1)\n" \
+ " movntps %%xmm1,0x10(%1)\n" \
+ " addl $0x20,%2\n" \
+ " prefetchnta 0x20(%2)\n" \
+ " addl $0x20,%1\n" \
+ " subl $0x20,%0\n" \
+ "100: cmpl $0x1f,%0\n" \
+ " ja 2b\n" \
+ " cmpl $0xf,%0\n" \
+ " jbe 200f\n" \
+ "4: movups 0x00(%2),%%xmm0\n" \
+ " movntps %%xmm0,0x00(%1)\n" \
+ " addl $0x10,%2\n" \
+ " addl $0x10,%1\n" \
+ " subl $0x10,%0\n" \
+ "200: testl %0,%0\n" \
+ " je 300f\n" \
+ " movl $0x10,%3\n" \
+ " subl %0,%3\n" \
+ " subl %3,%1\n" \
+ " subl %3,%2\n" \
+ "5: movups 0x00(%2),%%xmm0\n" \
+ " movups %%xmm0,0x00(%1)\n" \
+ " addl $0x10,%2\n" \
+ " addl $0x10,%1\n" \
+ " xorl %0,%0\n" \
+ "300:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "6: xorps %%xmm0,%%xmm0\n" \
+ " movups %%xmm0,(%1)\n" \
+ " movl $0x10,%3\n" \
+ " subl %4,%3\n" \
+ " addl %3,%1\n" \
+ " movl %3,%4\n" \
+ " movl %0,%3\n" \
+ " subl %4,%3\n" \
+ " jmp 600f\n" \
+ "7: subl $0x10,%0\n" \
+ " addl $0x10,%1\n" \
+ "400: movl %0,%3\n" \
+ " xorps %%xmm0,%%xmm0\n" \
+ " jmp 600f\n" \
+ "500: movntps %%xmm0,0x00(%1)\n" \
+ " movntps %%xmm0,0x10(%1)\n" \
+ " addl $0x20,%1\n" \
+ " subl $0x20,%3\n" \
+ "600: cmpl $0x1f,%3\n" \
+ " ja 500b\n" \
+ " cmpl $0xf,%3\n" \
+ " jbe 700f\n" \
+ " movntps %%xmm0,0x00(%1)\n" \
+ " addl $0x10,%1\n" \
+ " subl $0x10,%3\n" \
+ "700: testl %3,%3\n" \
+ " je 300b\n" \
+ " xorl %4,%4\n" \
+ " movb %4,(%1)\n" \
+ " inc %1\n" \
+ " dec %3\n" \
+ " jmp 700b\n" \
+ "8: addl %3,%1\n" \
+ " movl %0,%3\n" \
+ " jmp 700b\n" \
+ ".previous\n" \
+ ".section __ex_table,\"a\"\n" \
+ " .align 4\n" \
+ " .long 1b,6b\n" \
+ " .long 2b,400b\n" \
+ " .long 3b,7b\n" \
+ " .long 4b,400b\n" \
+ " .long 5b,8b\n" \
+ ".previous" \
+ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(tmp), \
+ "=q"(tmp2) \
+ : "0"(size), "1"(to), "2"(from) \
+ : "memory"); \
+} while (0)
+
+
+unsigned long
+__kni_copy_to_user_nocheck(void *to, const void *from, unsigned long n)
+{
+ unsigned long flags;
+ int recursive = 0;
+ char xmm_space[64];
+ char xmm_reg_space[64]; /* in case we switch context */
+
+ if (n >= 128) {
+ kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags);
+ __kni_copy_to_user(to,from,n);
+ SFENCE();
+ kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+ } else {
+ __copy_user(to,from,n);
+ }
+ return n;
+}
+
+unsigned long
+__kni_copy_from_user_nocheck(void *to, const void *from, unsigned long n)
+{
+ unsigned long flags;
+ int recursive = 0;
+ char xmm_space[64];
+ char xmm_reg_space[64]; /* in case we switch context */
+
+ if (n >= 128) {
+ kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags);
+ __kni_copy_from_user(to,from,n);
+ SFENCE();
+ kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+ } else {
+ __copy_user_zeroing(to,from,n);
+ }
+ return n;
+}
+
+
+
+unsigned long
+kni_copy_to_user(void *to, const void *from, unsigned long n)
+{
+ unsigned long flags;
+ int recursive = 0;
+ char xmm_space[64];
+ char xmm_reg_space[64]; /* in case we switch context */
+
+ if (access_ok(VERIFY_WRITE, to, n)) {
+ if (n >= 128) {
+ kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags);
+ __kni_copy_to_user(to,from,n);
+ SFENCE();
+ kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+ } else {
+ __copy_user(to,from,n);
+ }
+ }
+ return n;
+}
+
+unsigned long
+kni_copy_from_user(void *to, const void *from, unsigned long n)
+{
+ unsigned long flags;
+ int recursive = 0;
+ char xmm_space[64];
+ char xmm_reg_space[64]; /* in case we switch context */
+
+ if (access_ok(VERIFY_READ, from, n)) {
+ if (n >= 128) {
+ kernel_take_fpu_kni(recursive,&xmm_space[0],&xmm_reg_space[0],flags);
+ __kni_copy_from_user(to,from,n);
+ SFENCE();
+ kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
+ } else {
+ __copy_user_zeroing(to,from,n);
+ }
+ }
+ return n;
+}
+
+
--- linux/arch/i386/kernel/head.S.PIII Fri Jan 15 01:57:25 1999
+++ linux/arch/i386/kernel/head.S Sun Dec 5 14:23:14 1999
@@ -14,7 +14,6 @@
#include <asm/page.h>
#include <asm/pgtable.h>
-
#define CL_MAGIC_ADDR 0x90020
#define CL_MAGIC 0xA33F
#define CL_BASE_ADDR 0x90000
@@ -32,7 +31,8 @@
#define X86_HARD_MATH CPU_PARAMS+6
#define X86_CPUID CPU_PARAMS+8
#define X86_CAPABILITY CPU_PARAMS+12
-#define X86_VENDOR_ID CPU_PARAMS+16
+#define X86_MMU_CR4 CPU_PARAMS+16
+#define X86_VENDOR_ID CPU_PARAMS+20
/*
* swapper_pg_dir is the main page directory, address 0x00101000
@@ -59,9 +59,8 @@
* NOTE! We have to correct for the fact that we're
* not yet offset PAGE_OFFSET..
*/
-#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
movl %cr4,%eax # Turn on 4Mb pages
- orl cr4_bits,%eax
+ orl X86_MMU_CR4-__PAGE_OFFSET,%eax
movl %eax,%cr4
#endif
/*
--- linux/arch/i386/kernel/process.c.PIII Tue Oct 19 20:14:00 1999
+++ linux/arch/i386/kernel/process.c Sun Dec 5 14:23:14 1999
@@ -42,6 +42,7 @@
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/desc.h>
+#include <asm/i387.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>
#endif
@@ -582,6 +583,106 @@
}
/*
+ * FPU state handling functions
+ */
+
+int i387_hard_to_user ( struct user_i387_struct * user,
+ union i387_hard_union * hard)
+{
+#ifdef CONFIG_X86_FX
+ int i, err = 0;
+ short *tmp, *tmp2;
+ union i387_hard_union hard2;
+#else
+ int err = 0;
+#endif
+
+ if (!access_ok(VERIFY_WRITE, user, sizeof(*user)))
+ return -EFAULT;
+#ifdef CONFIG_X86_FX
+ if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) {
+ hard2.fsave.cwd = 0xffff0000 | hard->fxsave.fxcwd;
+ hard2.fsave.swd = 0xffff0000 | hard->fxsave.fxswd;
+ hard2.fsave.twd = fputag_KNI_to_387(hard->fxsave.fxtwd);
+ hard2.fsave.fip = hard->fxsave.fxfip;
+ hard2.fsave.fcs = hard->fxsave.fxfcs;
+ hard2.fsave.foo = hard->fxsave.fxfoo;
+ hard2.fsave.fos = hard->fxsave.fxfos;
+
+ tmp = (short *)&hard2.fsave.st_space[0];
+ tmp2 = (short *)&hard->fxsave.st_space[0];
+
+ /*
+ * Transform the two layouts:
+ * (we do not mix 32-bit access with 16-bit access because
+ * that's suboptimal on PPros)
+ */
+
+ for (i = 0; i < 8; i++) {
+ *tmp = *tmp2; tmp++; tmp2++;
+ *tmp = *tmp2; tmp++; tmp2++;
+ *tmp = *tmp2; tmp++; tmp2++;
+ *tmp = *tmp2; tmp++; tmp2++;
+ *tmp = *tmp2; tmp++; tmp2 += 4;
+ }
+ err = copy_to_user((void *)(user),(&(hard2)),
+ sizeof(struct i387_hard_fsave));
+ } else
+#endif
+ err = copy_to_user((void *)(user),
+ (&(hard->fsave.cwd)),
+ sizeof(struct i387_hard_fsave));
+ return err;
+}
+
+int i387_user_to_hard (union i387_hard_union * hard,
+ struct user_i387_struct * user)
+{
+#ifdef CONFIG_X86_FX
+ int i, err = 0;
+ short *tmp, *tmp2;
+ union i387_hard_union hard2;
+#else
+ int err = 0;
+#endif
+
+ if (!access_ok(VERIFY_READ, user, sizeof(*user)))
+ return -EFAULT;
+#ifdef CONFIG_X86_FX
+ if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) {
+ err = copy_from_user((&(hard2)),(void *)(user),
+ sizeof(struct i387_hard_fsave));
+ hard->fxsave.fxcwd = hard2.fsave.cwd & 0xffff;
+ hard->fxsave.fxswd = hard2.fsave.swd & 0xffff;
+ hard->fxsave.fxtwd = fputag_387_to_KNI(hard2.fsave.twd);
+ hard->fxsave.fxfip = hard2.fsave.fip;
+ hard->fxsave.fxfcs = hard2.fsave.fcs & 0xffff;
+ hard->fxsave.fxfoo = hard2.fsave.foo;
+ hard->fxsave.fxfos = hard2.fsave.fos & 0xffff;
+
+ tmp2 = (short *)&hard->fxsave.st_space[0];
+ tmp = (short *)&hard2.fsave.st_space[0];
+
+ for (i = 0; i < 8; i++) {
+ *tmp2 = *tmp; tmp++; tmp2++;
+ *tmp2 = *tmp; tmp++; tmp2++;
+ *tmp2 = *tmp; tmp++; tmp2++;
+ *tmp2 = *tmp; tmp++; tmp2++;
+ *tmp2 = *tmp; tmp++; tmp2++;
+ *tmp2 = 0; tmp2++;
+ *tmp2 = 0; tmp2++;
+ *tmp2 = 0; tmp2++;
+ }
+ } else
+#endif
+ err = copy_from_user((&(hard->fsave.cwd)),
+ (void *)(user),
+ sizeof(struct i387_hard_fsave));
+ return err;
+}
+
+
+/*
* Save a segment.
*/
#define savesegment(seg,value) \
@@ -626,13 +727,43 @@
*/
int dump_fpu (struct pt_regs * regs, struct user_i387_struct* fpu)
{
+#ifdef CONFIG_X86_FX
+ int fpvalid, i;
+ short *tmp, *tmp2;
+ struct task_struct *tsk = current;
+ union i387_hard_union *hard;
+#else
int fpvalid;
struct task_struct *tsk = current;
-
+#endif
fpvalid = tsk->used_math;
if (fpvalid) {
unlazy_fpu(tsk);
- memcpy(fpu,&tsk->tss.i387.hard,sizeof(*fpu));
+#ifdef CONFIG_X86_FX
+ if (boot_cpu_data.x86_capability & X86_FEATURE_FXSR) {
+ hard = &tsk->tss.i387.hard;
+
+ fpu->cwd = 0xffff0000 | hard->fxsave.fxcwd;
+ fpu->swd = 0xffff0000 | hard->fxsave.fxswd;
+ fpu->twd = fputag_KNI_to_387(hard->fxsave.fxtwd);
+ fpu->fip = hard->fxsave.fxfip;
+ fpu->fcs = hard->fxsave.fxfcs;
+ fpu->foo = hard->fxsave.fxfoo;
+ fpu->fos = hard->fxsave.fxfos;
+
+ tmp = (short *)&fpu->st_space[0];
+ tmp2 = (short *)&hard->fxsave.st_space[0];
+
+ for (i = 0; i < 8; i++) {
+ *tmp = *tmp2; tmp++; tmp2++;
+ *tmp = *tmp2; tmp++; tmp2++;
+ *tmp = *tmp2; tmp++; tmp2++;
+ *tmp = *tmp2; tmp++; tmp2++;
+ *tmp = *tmp2; tmp++; tmp2+=4;
+ }
+ } else
+#endif
+ memcpy(fpu,&tsk->tss.i387.hard.fsave,sizeof(*fpu));
}
return fpvalid;
@@ -692,8 +823,8 @@
/*
* switch_to(x,yn) should switch tasks from x to y.
*
- * We fsave/fwait so that an exception goes off at the right time
- * (as a call from the fsave or fwait in effect) rather than to
+ * We fpu_save so that an exception goes off at the right time
+ * (as a call from the f*save or fwait in effect) rather than to
* the wrong process. Lazy FP saving no longer makes any sense
* with modern CPU's, and this simplifies a lot of things (SMP
* and UP become the same).
--- linux/arch/i386/kernel/ptrace.c.PIII Sun Dec 5 14:23:09 1999
+++ linux/arch/i386/kernel/ptrace.c Sun Dec 5 14:23:14 1999
@@ -17,6 +17,7 @@
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
+#include <asm/i387.h>
/*
* does not yet catch signals sent when the child dies.
@@ -646,6 +647,9 @@
};
case PTRACE_GETFPREGS: { /* Get the child FPU state. */
+ /*
+ * user-space expects an 'old-style' FPU dump.
+ */
if (!access_ok(VERIFY_WRITE, (unsigned *)data,
sizeof(struct user_i387_struct)))
{
@@ -655,15 +659,17 @@
ret = 0;
if ( !child->used_math ) {
/* Simulate an empty FPU. */
- child->tss.i387.hard.cwd = 0xffff037f;
- child->tss.i387.hard.swd = 0xffff0000;
- child->tss.i387.hard.twd = 0xffffffff;
+ i387_set_cwd(child->tss.i387.hard, 0x037f);
+ i387_set_swd(child->tss.i387.hard, 0x0000);
+ i387_set_twd(child->tss.i387.hard, 0xffff);
}
#ifdef CONFIG_MATH_EMULATION
if ( boot_cpu_data.hard_math ) {
#endif
- __copy_to_user((void *)data, &child->tss.i387.hard,
- sizeof(struct user_i387_struct));
+ i387_hard_to_user(
+ (struct user_i387_struct *)data,
+ &child->tss.i387.hard
+ );
#ifdef CONFIG_MATH_EMULATION
} else {
save_i387_soft(&child->tss.i387.soft,
@@ -684,8 +690,10 @@
#ifdef CONFIG_MATH_EMULATION
if ( boot_cpu_data.hard_math ) {
#endif
- __copy_from_user(&child->tss.i387.hard, (void *)data,
- sizeof(struct user_i387_struct));
+ i387_user_to_hard(
+ &child->tss.i387.hard,
+ (struct user_i387_struct *)data
+ );
#ifdef CONFIG_MATH_EMULATION
} else {
restore_i387_soft(&child->tss.i387.soft,
--- linux/arch/i386/kernel/signal.c.PIII Sun Dec 5 14:23:09 1999
+++ linux/arch/i386/kernel/signal.c Sun Dec 5 14:23:14 1999
@@ -21,6 +21,7 @@
#include <linux/stddef.h>
#include <asm/ucontext.h>
#include <asm/uaccess.h>
+#include <asm/i387.h>
#define DEBUG_SIG 0
@@ -153,9 +154,14 @@
static inline int restore_i387_hard(struct _fpstate *buf)
{
+ int err = 0;
struct task_struct *tsk = current;
clear_fpu(tsk);
- return __copy_from_user(&tsk->tss.i387.hard, buf, sizeof(*buf));
+
+ err = i387_user_to_hard(&tsk->tss.i387.hard,
+ (struct user_i387_struct *)buf);
+ err |= get_user(tsk->tss.i387.hard.fsave.swd, &buf->status);
+ return err;
}
static inline int restore_i387(struct _fpstate *buf)
@@ -305,11 +311,14 @@
static inline int save_i387_hard(struct _fpstate * buf)
{
+ int err = 0;
struct task_struct *tsk = current;
unlazy_fpu(tsk);
- tsk->tss.i387.hard.status = tsk->tss.i387.hard.swd;
- if (__copy_to_user(buf, &tsk->tss.i387.hard, sizeof(*buf)))
+ err = i387_hard_to_user((struct user_i387_struct *)buf,
+ &tsk->tss.i387.hard);
+ err |= put_user(tsk->tss.i387.hard.fsave.swd, &buf->status);
+ if (err)
return -1;
return 1;
}
--- linux/arch/i386/kernel/smp.c.PIII Sun Dec 5 14:23:09 1999
+++ linux/arch/i386/kernel/smp.c Sun Dec 5 14:23:14 1999
@@ -890,6 +890,8 @@
*/
int __init start_secondary(void *unused)
{
+ disable_serial_nr();
+ load_default_mxcsr();
/*
* Dont put anything before smp_callin(), SMP
* booting is too fragile that we want to limit the
--- linux/arch/i386/kernel/traps.c.PIII Tue Feb 16 17:20:05 1999
+++ linux/arch/i386/kernel/traps.c Sun Dec 5 14:23:14 1999
@@ -33,6 +33,7 @@
#include <asm/atomic.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
+#include <asm/i387.h>
#include <asm/smp.h>
@@ -421,7 +422,9 @@
* (this will also clear the error)
*/
task = current;
- save_fpu(task);
+ i387_save_hard(task->tss.i387);
+ task->flags &= ~PF_USEDFPU;
+ stts();
task->tss.trap_no = 16;
task->tss.error_code = 0;
force_sig(SIGFPE, task);
@@ -452,17 +455,44 @@
asmlinkage void math_state_restore(struct pt_regs regs)
{
__asm__ __volatile__("clts"); /* Allow maths ops (or we recurse) */
- if(current->used_math)
- __asm__("frstor %0": :"m" (current->tss.i387));
- else
- {
+ /*
+ * If we have either of the kernel FPU use states set in the
+ * fpustate variable, then this will be a kernel math trap.
+ * Otherwise, this is userspace trying to use the FPU.
+ */
+ if(current->tss.x86_fpustate & X86_FPUSTATE_KERN_ANY) {
+ load_default_mxcsr(); /* we don't ever mess with this in
+ kernel space, so just make sure
+ we have a reasonable one so we
+ don't start taking unmasked
+ exceptions by accident */
+ if(current->tss.mmx_reg_space != NULL)
+ __asm__("movq 0x00(%0), %%mm0\n\t"
+ "movq 0x08(%0), %%mm1\n\t"
+ "movq 0x10(%0), %%mm2\n\t"
+ "movq 0x18(%0), %%mm3\n\t"
+ :: "r" (current->tss.mmx_reg_space));
+ if(current->tss.kni_reg_space != NULL)
+ __asm__("movups 0x00(%0), %%xmm0\n\t"
+ "movups 0x10(%0), %%xmm1\n\t"
+ "movups 0x20(%0), %%xmm2\n\t"
+ "movups 0x30(%0), %%xmm3\n\t"
+ :: "r" (current->tss.kni_reg_space));
+ } else if(current->tss.x86_fpustate & X86_FPUSTATE_USER_SAVED) {
+ i387_restore_hard(current->tss.i387);
+ current->tss.x86_fpustate = 0;
+ } else if(current->used_math) {
+ i387_restore_hard(current->tss.i387);
+ current->flags|=PF_USEDFPU; /* make switch_to() work */
+ } else {
/*
* Our first FPU usage, clean the chip.
*/
__asm__("fninit");
+ load_default_mxcsr();
current->used_math = 1;
+ current->flags|=PF_USEDFPU; /* make switch_to() work */
}
- current->flags|=PF_USEDFPU; /* So we fnsave on switch_to() */
}
#ifndef CONFIG_MATH_EMULATION
--- linux/arch/i386/kernel/i386_ksyms.c.PIII Tue Oct 19 20:14:00 1999
+++ linux/arch/i386/kernel/i386_ksyms.c Sun Dec 5 14:23:14 1999
@@ -119,3 +119,13 @@
#ifdef CONFIG_VT
EXPORT_SYMBOL(screen_info);
#endif
+
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+EXPORT_SYMBOL(best_memcpy);
+EXPORT_SYMBOL(best_memset);
+EXPORT_SYMBOL(best_copy_to_user);
+EXPORT_SYMBOL(best_copy_from_user);
+EXPORT_SYMBOL(__best_copy_to_user);
+EXPORT_SYMBOL(__best_copy_from_user);
+#endif
+
--- linux/arch/i386/kernel/setup.c.PIII Sun Dec 5 14:23:09 1999
+++ linux/arch/i386/kernel/setup.c Sun Dec 5 14:23:14 1999
@@ -104,6 +104,17 @@
extern int _etext, _edata, _end;
extern unsigned long cpu_hz;
+#ifdef CONFIG_X86_PN_OFF
+int disable_x86_serial_nr = 1;
+#else
+int disable_x86_serial_nr = 0;
+#endif
+
+/*
+ * For the various FPU using kernel accelerator routines
+ */
+spinlock_t kern_fpu_lock = SPIN_LOCK_UNLOCKED;
+
/*
* This is set up by the setup-routine at boot-time
*/
@@ -809,20 +820,6 @@
if (c->x86_vendor == X86_VENDOR_AMD && amd_model(c))
return;
-
- if (c->cpuid_level > 0 && c->x86_vendor == X86_VENDOR_INTEL)
- {
- if(c->x86_capability&(1<<18))
- {
- /* Disable processor serial number on Intel Pentium III
- from code by Phil Karn */
- unsigned long lo,hi;
- rdmsr(0x119,lo,hi);
- lo |= 0x200000;
- wrmsr(0x119,lo,hi);
- printk(KERN_INFO "Pentium-III serial number disabled.\n");
- }
- }
if (c->cpuid_level > 1) {
/* supports eax=2 call */
@@ -909,7 +906,15 @@
}
cyrix_model(&boot_cpu_data);
}
-
+
+/*
+ * Setup function for serial number stuff
+ */
+
+__initfunc(void x86_serial_nr_setup(char *str, int *ints))
+{
+ disable_x86_serial_nr = !disable_x86_serial_nr;
+}
static char *cpu_vendor_names[] __initdata = {
--- linux/arch/i386/Makefile.PIII Tue Oct 19 20:14:00 1999
+++ linux/arch/i386/Makefile Sun Dec 5 14:23:14 1999
@@ -43,6 +43,10 @@
CFLAGS := $(CFLAGS) -m486 -malign-loops=2 -malign-jumps=2 -malign-functions=2 -DCPU=686
endif
+ifdef CONFIG_M686FX
+CFLAGS := $(CFLAGS) -m486 -malign-loops=0 -malign-jumps=0 -malign-functions=0 -DCPU=686
+endif
+
HEAD := arch/i386/kernel/head.o arch/i386/kernel/init_task.o
SUBDIRS := $(SUBDIRS) arch/i386/kernel arch/i386/mm arch/i386/lib
--- linux/arch/i386/config.in.PIII Mon Aug 9 15:04:38 1999
+++ linux/arch/i386/config.in Sun Dec 5 14:23:14 1999
@@ -16,7 +16,8 @@
486/Cx486 CONFIG_M486 \
586/K5/5x86/6x86 CONFIG_M586 \
Pentium/K6/TSC CONFIG_M586TSC \
- PPro/6x86MX CONFIG_M686" PPro
+ PPro/6x86MX/PII CONFIG_M686 \
+ PIII/Xeon/Deschutes CONFIG_M686FX" PIII
#
# Define implied options from the CPU selection here
#
@@ -26,20 +27,24 @@
define_bool CONFIG_X86_BSWAP y
define_bool CONFIG_X86_POPAD_OK y
fi
-if [ "$CONFIG_M686" = "y" -o "$CONFIG_M586TSC" = "y" ]; then
+if [ "$CONFIG_M686FX" = "y" -o "$CONFIG_M686" = "y" \
+ -o "$CONFIG_M586TSC" = "y" ]; then
define_bool CONFIG_X86_TSC y
fi
-if [ "$CONFIG_M686" = "y" ]; then
+if [ "$CONFIG_M686FX" = "y" -o "$CONFIG_M686" = "y" ]; then
define_bool CONFIG_X86_GOOD_APIC y
fi
+bool 'Disable the PII/PIII Serial Number at bootup' CONFIG_X86_PN_OFF
+bool 'Enable PII/PIII Extended/Fast FPU save and restore support' CONFIG_X86_FX
+bool 'Enable CPU Specific (MMX/MMX2) Optimization Functions' CONFIG_X86_CPU_OPTIMIZATIONS
+bool 'Math emulation' CONFIG_MATH_EMULATION
+bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
+bool 'Symmetric multi-processing support' CONFIG_SMP
choice 'Maximum Physical Memory' \
"1GB CONFIG_1GB \
2GB CONFIG_2GB" 1GB
-bool 'Math emulation' CONFIG_MATH_EMULATION
-bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
-bool 'Symmetric multi-processing support' CONFIG_SMP
endmenu
mainmenu_option next_comment
--- linux/arch/i386/defconfig.PIII Sun Dec 5 14:23:13 1999
+++ linux/arch/i386/defconfig Sun Dec 5 14:23:14 1999
@@ -21,11 +21,14 @@
CONFIG_X86_POPAD_OK=y
CONFIG_X86_TSC=y
CONFIG_X86_GOOD_APIC=y
-CONFIG_1GB=y
-# CONFIG_2GB is not set
+CONFIG_X86_PN_OFF=y
+CONFIG_X86_FX=y
+CONFIG_X86_CPU_OPTIMIZATIONS=y
# CONFIG_MATH_EMULATION is not set
# CONFIG_MTRR is not set
CONFIG_SMP=y
+CONFIG_1GB=y
+# CONFIG_2GB is not set
#
# Loadable module support
--- linux/Documentation/Configure.help.PIII Sun Dec 5 14:23:14 1999
+++ linux/Documentation/Configure.help Sun Dec 5 14:23:14 1999
@@ -1659,10 +1659,10 @@
all x86 CPU types (albeit not optimally fast), you can specify
"386" here.
- If you specify one of "486" or "586" or "Pentium" or "PPro", then
- the kernel will not necessarily run on earlier architectures (e.g. a
- Pentium optimized kernel will run on a PPro, but not necessarily on
- a i486).
+ If you specify one of "486" or "586" or "Pentium" or "PPro" or "PIII",
+ then the kernel will not necessarily run on earlier architectures
+ (e.g. a Pentium optimized kernel will run on a PPro, but not necessarily
+ on an i486).
Here are the settings recommended for greatest speed:
- "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI
@@ -1676,8 +1676,30 @@
K6-3D.
- "PPro" for the Cyrix/IBM/National Semiconductor 6x86MX, MII and
Intel Pentium II/Pentium Pro.
+ - "PIII/Xeon/Deschutes" for the PIII (Katmai), Xeon and later PIIs
+ with the Deschutes or Mendocino core. You have to choose this for
+ MMX2 support.
If you don't know what to do, choose "386".
+
+Disable PII/PIII Serial Number at bootup
+CONFIG_X86_PN_OFF
+ This makes the kernel disable the CPUID serial number that is embedded in
+ the new PIII CPUs at bootup.
+
+Enable PII/PIII Extended Fast FPU save and restore support
+CONFIG_X86_FX
+ This enables use of the new PII/PIII FXSAVE/FXRSTOR support. This item
+ is required to make use of the new PIII 128-bit XMM registers. It is safe
+ to leave this enabled all the time.
+
+Enable CPU Specific (MMX/MMX2) Optimizations
+CONFIG_X86_CPU_OPTIMIZATIONS
+ This enables use of the MMX registers and 128-bit MMX2 registers on CPUs
+ that can support the new instructions (Pentium/AMD K6 or newer). In
+ order to support the Pentium III 128-bit XMM registers you must enable
+ both this and PII/PIII Extended Fast FPU save support. It is safe to
+ leave this enabled all the time.
VGA text console
CONFIG_VGA_CONSOLE
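
[Editor's note: taken together, these help entries mean full PIII support wants all three new options switched on, in addition to selecting the new CPU type. An illustrative .config fragment only, mirroring the defconfig change earlier in this patch (CONFIG_M686FX comes from the new CPU choice in arch/i386/config.in):

CONFIG_M686FX=y
CONFIG_X86_PN_OFF=y
CONFIG_X86_FX=y
CONFIG_X86_CPU_OPTIMIZATIONS=y
]
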
--- linux/drivers/block/xor.c.pIII-2 Tue Nov 23 14:02:17 1999
+++ linux/drivers/block/xor.c Tue Nov 23 14:03:21 1999
@@ -22,6 +22,10 @@
#include <asm/asi.h>
#include <asm/visasm.h>
#endif
+#ifdef __i386__
+#include <asm/processor.h>
+#include <asm/i387.h>
+#endif
/*
* we use the 'XOR function template' to register multiple xor
@@ -66,7 +70,7 @@
#ifdef __i386__
-#ifdef CONFIG_X86_XMM
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
/*
* Cache avoiding checksumming functions utilizing KNI instructions
* Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
@@ -74,20 +78,12 @@
XORBLOCK_TEMPLATE(pIII_kni)
{
- char xmm_save[16*4];
- int cr0;
- int lines = (bh_ptr[0]->b_size>>8);
-
- __asm__ __volatile__ (
- "movl %%cr0,%0 ;\n\t"
- "clts ;\n\t"
- "movups %%xmm0,(%1) ;\n\t"
- "movups %%xmm1,0x10(%1) ;\n\t"
- "movups %%xmm2,0x20(%1) ;\n\t"
- "movups %%xmm3,0x30(%1) ;\n\t"
- : "=r" (cr0)
- : "r" (xmm_save)
- : "memory" );
+ char xmm_space[64];
+ int lines = (bh_ptr[0]->b_size>>8);
+ int recursive = 0;
+ unsigned long flags;
+
+ kernel_take_fpu_kni(recursive,&xmm_space[0],NULL,flags);
#define OFFS(x) "8*("#x"*2)"
#define PF0(x) \
@@ -157,7 +153,7 @@
" jnz 1b ;\n"
:
- : "r" (lines),
+ : "m" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data)
: "memory" );
@@ -207,7 +203,7 @@
" decl %0 ;\n"
" jnz 1b ;\n"
:
- : "r" (lines),
+ : "m" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data)
@@ -266,7 +262,7 @@
" jnz 1b ;\n"
:
- : "r" (lines),
+ : "m" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
@@ -333,7 +329,7 @@
" jnz 1b ;\n"
:
- : "r" (lines),
+ : "m" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
@@ -343,16 +339,7 @@
break;
}
- __asm__ __volatile__ (
- "sfence ;\n\t"
- "movups (%1),%%xmm0 ;\n\t"
- "movups 0x10(%1),%%xmm1 ;\n\t"
- "movups 0x20(%1),%%xmm2 ;\n\t"
- "movups 0x30(%1),%%xmm3 ;\n\t"
- "movl %0,%%cr0 ;\n\t"
- :
- : "r" (cr0), "r" (xmm_save)
- : "memory" );
+ kernel_release_fpu_kni(recursive,&xmm_space[0],flags);
}
#undef OFFS
@@ -371,7 +358,7 @@
#undef XO5
#undef BLOCK
-#endif /* CONFIG_X86_XMM */
+#endif /* CONFIG_X86_CPU_OPTIMIZATIONS */
/*
* high-speed RAID5 checksumming functions utilizing MMX instructions
@@ -379,13 +366,12 @@
*/
XORBLOCK_TEMPLATE(pII_mmx)
{
- char fpu_save[108];
int lines = (bh_ptr[0]->b_size>>7);
+ char mmx_space[32];
+ int recursive = 0;
+ unsigned long flags;
- if (!(current->flags & PF_USEDFPU))
- __asm__ __volatile__ ( " clts;\n");
-
- __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
+ kernel_take_fpu_mmx(recursive,&mmx_space[0],NULL,flags);
#define LD(x,y) \
" movq 8*("#x")(%1), %%mm"#y" ;\n"
@@ -431,7 +417,7 @@
" decl %0 ;\n"
" jnz 1b ;\n"
:
- : "r" (lines),
+ : "m" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data)
: "memory");
@@ -471,7 +457,7 @@
" decl %0 ;\n"
" jnz 1b ;\n"
:
- : "r" (lines),
+ : "m" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data)
@@ -517,7 +503,7 @@
" decl %0 ;\n"
" jnz 1b ;\n"
:
- : "r" (lines),
+ : "m" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
@@ -569,7 +555,7 @@
" decl %0 ;\n"
" jnz 1b ;\n"
:
- : "r" (lines),
+ : "m" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
@@ -579,10 +565,7 @@
break;
}
- __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
-
- if (!(current->flags & PF_USEDFPU))
- stts();
+ kernel_release_fpu_mmx(recursive,&mmx_space[0],flags);
}
#undef LD
@@ -595,13 +578,12 @@
XORBLOCK_TEMPLATE(p5_mmx)
{
- char fpu_save[108];
int lines = (bh_ptr[0]->b_size>>6);
+ char mmx_space[32];
+ int recursive = 0;
+ unsigned long flags;
- if (!(current->flags & PF_USEDFPU))
- __asm__ __volatile__ ( " clts;\n");
-
- __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
+ kernel_take_fpu_mmx(recursive,&mmx_space[0],NULL,flags);
switch(count) {
case 2:
@@ -618,21 +600,21 @@
" movq 24(%1), %%mm3 ;\n"
" movq %%mm1, 8(%1) ;\n"
" pxor 16(%2), %%mm2 ;\n"
- " movq 32(%1), %%mm4 ;\n"
+ " movq 32(%1), %%mm0 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 24(%2), %%mm3 ;\n"
- " movq 40(%1), %%mm5 ;\n"
+ " movq 40(%1), %%mm1 ;\n"
" movq %%mm3, 24(%1) ;\n"
- " pxor 32(%2), %%mm4 ;\n"
- " movq 48(%1), %%mm6 ;\n"
- " movq %%mm4, 32(%1) ;\n"
- " pxor 40(%2), %%mm5 ;\n"
- " movq 56(%1), %%mm7 ;\n"
- " movq %%mm5, 40(%1) ;\n"
- " pxor 48(%2), %%mm6 ;\n"
- " pxor 56(%2), %%mm7 ;\n"
- " movq %%mm6, 48(%1) ;\n"
- " movq %%mm7, 56(%1) ;\n"
+ " pxor 32(%2), %%mm0 ;\n"
+ " movq 48(%1), %%mm2 ;\n"
+ " movq %%mm0, 32(%1) ;\n"
+ " pxor 40(%2), %%mm1 ;\n"
+ " movq 56(%1), %%mm3 ;\n"
+ " movq %%mm1, 40(%1) ;\n"
+ " pxor 48(%2), %%mm2 ;\n"
+ " pxor 56(%2), %%mm3 ;\n"
+ " movq %%mm2, 48(%1) ;\n"
+ " movq %%mm3, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
@@ -640,7 +622,7 @@
" jnz 1b ;\n"
:
- : "r" (lines),
+ : "m" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data)
: "memory" );
@@ -662,26 +644,26 @@
" pxor 16(%3), %%mm2 ;\n"
" movq 24(%1), %%mm3 ;\n"
" movq %%mm1, 8(%1) ;\n"
- " movq 32(%1), %%mm4 ;\n"
- " movq 40(%1), %%mm5 ;\n"
+ " movq 32(%1), %%mm0 ;\n"
+ " movq 40(%1), %%mm1 ;\n"
" pxor 24(%2), %%mm3 ;\n"
" movq %%mm2, 16(%1) ;\n"
- " pxor 32(%2), %%mm4 ;\n"
+ " pxor 32(%2), %%mm0 ;\n"
" pxor 24(%3), %%mm3 ;\n"
- " pxor 40(%2), %%mm5 ;\n"
+ " pxor 40(%2), %%mm1 ;\n"
" movq %%mm3, 24(%1) ;\n"
- " pxor 32(%3), %%mm4 ;\n"
- " pxor 40(%3), %%mm5 ;\n"
- " movq 48(%1), %%mm6 ;\n"
- " movq %%mm4, 32(%1) ;\n"
- " movq 56(%1), %%mm7 ;\n"
- " pxor 48(%2), %%mm6 ;\n"
- " movq %%mm5, 40(%1) ;\n"
- " pxor 56(%2), %%mm7 ;\n"
- " pxor 48(%3), %%mm6 ;\n"
- " pxor 56(%3), %%mm7 ;\n"
- " movq %%mm6, 48(%1) ;\n"
- " movq %%mm7, 56(%1) ;\n"
+ " pxor 32(%3), %%mm0 ;\n"
+ " pxor 40(%3), %%mm1 ;\n"
+ " movq 48(%1), %%mm2 ;\n"
+ " movq %%mm0, 32(%1) ;\n"
+ " movq 56(%1), %%mm3 ;\n"
+ " pxor 48(%2), %%mm2 ;\n"
+ " movq %%mm1, 40(%1) ;\n"
+ " pxor 56(%2), %%mm3 ;\n"
+ " pxor 48(%3), %%mm2 ;\n"
+ " pxor 56(%3), %%mm3 ;\n"
+ " movq %%mm2, 48(%1) ;\n"
+ " movq %%mm3, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
@@ -690,7 +672,7 @@
" jnz 1b ;\n"
:
- : "r" (lines),
+ : "m" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data)
@@ -714,33 +696,33 @@
" pxor 16(%3), %%mm2 ;\n"
" pxor 8(%4), %%mm1 ;\n"
" movq %%mm0, (%1) ;\n"
- " movq 32(%1), %%mm4 ;\n"
+ " movq 32(%1), %%mm0 ;\n"
" pxor 24(%2), %%mm3 ;\n"
" pxor 16(%4), %%mm2 ;\n"
" movq %%mm1, 8(%1) ;\n"
- " movq 40(%1), %%mm5 ;\n"
- " pxor 32(%2), %%mm4 ;\n"
+ " movq 40(%1), %%mm1 ;\n"
+ " pxor 32(%2), %%mm0 ;\n"
" pxor 24(%3), %%mm3 ;\n"
" movq %%mm2, 16(%1) ;\n"
- " pxor 40(%2), %%mm5 ;\n"
- " pxor 32(%3), %%mm4 ;\n"
+ " pxor 40(%2), %%mm1 ;\n"
+ " pxor 32(%3), %%mm0 ;\n"
" pxor 24(%4), %%mm3 ;\n"
" movq %%mm3, 24(%1) ;\n"
- " movq 56(%1), %%mm7 ;\n"
- " movq 48(%1), %%mm6 ;\n"
- " pxor 40(%3), %%mm5 ;\n"
- " pxor 32(%4), %%mm4 ;\n"
- " pxor 48(%2), %%mm6 ;\n"
- " movq %%mm4, 32(%1) ;\n"
- " pxor 56(%2), %%mm7 ;\n"
- " pxor 40(%4), %%mm5 ;\n"
- " pxor 48(%3), %%mm6 ;\n"
- " pxor 56(%3), %%mm7 ;\n"
- " movq %%mm5, 40(%1) ;\n"
- " pxor 48(%4), %%mm6 ;\n"
- " pxor 56(%4), %%mm7 ;\n"
- " movq %%mm6, 48(%1) ;\n"
- " movq %%mm7, 56(%1) ;\n"
+ " movq 56(%1), %%mm3 ;\n"
+ " movq 48(%1), %%mm2 ;\n"
+ " pxor 40(%3), %%mm1 ;\n"
+ " pxor 32(%4), %%mm0 ;\n"
+ " pxor 48(%2), %%mm2 ;\n"
+ " movq %%mm0, 32(%1) ;\n"
+ " pxor 56(%2), %%mm3 ;\n"
+ " pxor 40(%4), %%mm1 ;\n"
+ " pxor 48(%3), %%mm2 ;\n"
+ " pxor 56(%3), %%mm3 ;\n"
+ " movq %%mm1, 40(%1) ;\n"
+ " pxor 48(%4), %%mm2 ;\n"
+ " pxor 56(%4), %%mm3 ;\n"
+ " movq %%mm2, 48(%1) ;\n"
+ " movq %%mm3, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
@@ -750,7 +732,7 @@
" jnz 1b ;\n"
:
- : "r" (lines),
+ : "m" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
@@ -782,34 +764,34 @@
" movq %%mm1, 8(%1) ;\n"
" pxor 16(%5), %%mm2 ;\n"
" pxor 24(%3), %%mm3 ;\n"
- " movq 32(%1), %%mm4 ;\n"
+ " movq 32(%1), %%mm0 ;\n"
" movq %%mm2, 16(%1) ;\n"
" pxor 24(%4), %%mm3 ;\n"
- " pxor 32(%2), %%mm4 ;\n"
- " movq 40(%1), %%mm5 ;\n"
+ " pxor 32(%2), %%mm0 ;\n"
+ " movq 40(%1), %%mm1 ;\n"
" pxor 24(%5), %%mm3 ;\n"
- " pxor 32(%3), %%mm4 ;\n"
- " pxor 40(%2), %%mm5 ;\n"
+ " pxor 32(%3), %%mm0 ;\n"
+ " pxor 40(%2), %%mm1 ;\n"
" movq %%mm3, 24(%1) ;\n"
- " pxor 32(%4), %%mm4 ;\n"
- " pxor 40(%3), %%mm5 ;\n"
- " movq 48(%1), %%mm6 ;\n"
- " movq 56(%1), %%mm7 ;\n"
- " pxor 32(%5), %%mm4 ;\n"
- " pxor 40(%4), %%mm5 ;\n"
- " pxor 48(%2), %%mm6 ;\n"
- " pxor 56(%2), %%mm7 ;\n"
- " movq %%mm4, 32(%1) ;\n"
- " pxor 48(%3), %%mm6 ;\n"
- " pxor 56(%3), %%mm7 ;\n"
- " pxor 40(%5), %%mm5 ;\n"
- " pxor 48(%4), %%mm6 ;\n"
- " pxor 56(%4), %%mm7 ;\n"
- " movq %%mm5, 40(%1) ;\n"
- " pxor 48(%5), %%mm6 ;\n"
- " pxor 56(%5), %%mm7 ;\n"
- " movq %%mm6, 48(%1) ;\n"
- " movq %%mm7, 56(%1) ;\n"
+ " pxor 32(%4), %%mm0 ;\n"
+ " pxor 40(%3), %%mm1 ;\n"
+ " movq 48(%1), %%mm2 ;\n"
+ " movq 56(%1), %%mm3 ;\n"
+ " pxor 32(%5), %%mm0 ;\n"
+ " pxor 40(%4), %%mm1 ;\n"
+ " pxor 48(%2), %%mm2 ;\n"
+ " pxor 56(%2), %%mm3 ;\n"
+ " movq %%mm0, 32(%1) ;\n"
+ " pxor 48(%3), %%mm2 ;\n"
+ " pxor 56(%3), %%mm3 ;\n"
+ " pxor 40(%5), %%mm1 ;\n"
+ " pxor 48(%4), %%mm2 ;\n"
+ " pxor 56(%4), %%mm3 ;\n"
+ " movq %%mm1, 40(%1) ;\n"
+ " pxor 48(%5), %%mm2 ;\n"
+ " pxor 56(%5), %%mm3 ;\n"
+ " movq %%mm2, 48(%1) ;\n"
+ " movq %%mm3, 56(%1) ;\n"
" addl $64, %1 ;\n"
" addl $64, %2 ;\n"
@@ -820,7 +802,7 @@
" jnz 1b ;\n"
:
- : "r" (lines),
+ : "m" (lines),
"r" (bh_ptr[0]->b_data),
"r" (bh_ptr[1]->b_data),
"r" (bh_ptr[2]->b_data),
@@ -830,10 +812,7 @@
break;
}
- __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
-
- if (!(current->flags & PF_USEDFPU))
- stts();
+ kernel_release_fpu_mmx(recursive,&mmx_space[0],flags);
}
#endif /* __i386__ */
#endif /* !__sparc_v9__ */
@@ -1811,11 +1790,12 @@
if (f->speed > fastest->speed)
fastest = f;
}
-#ifdef CONFIG_X86_XMM
- if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+ if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+ (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
fastest = &t_xor_block_pIII_kni;
}
-#endif
+#endif /* CONFIG_X86_CPU_OPTIMIZATIONS */
xor_block = fastest->xor_block;
printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name,
fastest->speed / 1000, fastest->speed % 1000);
@@ -1847,8 +1827,9 @@
xor_speed(&t_xor_block_SPARC,&b1,&b2);
#endif
-#ifdef CONFIG_X86_XMM
- if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) {
+#ifdef CONFIG_X86_CPU_OPTIMIZATIONS
+ if ( (boot_cpu_data.mmu_cr4_features & X86_CR4_OSFXSR) &&
+ (boot_cpu_data.x86_capability & X86_FEATURE_XMM) ) {
printk(KERN_INFO
"raid5: KNI detected, trying cache-avoiding KNI checksum
routine\n");
/* we force the use of the KNI xor block because it
@@ -1859,7 +1840,7 @@
*/
xor_speed(&t_xor_block_pIII_kni,&b1,&b2);
}
-#endif /* CONFIG_X86_XMM */
+#endif /* CONFIG_X86_CPU_OPTIMIZATIONS */
#ifdef __i386__
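
[Editor's note: every xor_block variant touched above (the p5/pII MMX versions and the pIII KNI version) computes the same result: the first buffer is replaced by the XOR of all count buffers, block by block. A minimal portable sketch of that operation in plain C, outside the kernel types, just to make the intent of the assembly explicit; the function name and signature are illustrative, not part of the patch.

#include <stddef.h>

/* Reference semantics of the xor_block() routines: dst ^= each src. */
static void xor_blocks_ref(size_t len, unsigned long *dst,
			   unsigned long **srcs, int nsrc)
{
	size_t i;
	int j;

	for (i = 0; i < len / sizeof(unsigned long); i++)
		for (j = 0; j < nsrc; j++)
			dst[i] ^= srcs[j][i];
}

In the kernel routines, count includes the destination, so count == 2 corresponds to nsrc == 1 here, with bh_ptr[0]->b_data as dst and b_size as len.]
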
raid-2.2.14-B1.gz