Hi,
Here is another attempt at sending the diff for the fast memcpy.
This time I attached the complete diff (made with "diff -urN" against a CVS
checkout that was updated 10 minutes ago).
FreeDOS and Win95 boot with this new code.
It still needs some attention, though, because there is a catch.
MMX instructions use the same registers as the FPU instructions.
Because of this, every application that uses MMX has to keep track of the
state of these registers (is the FPU in use? is MMX code running?).
If a code segment uses both techniques (MMX and FPU), code must
be added to save and restore all the registers at the right moments.
The code I used comes from the Linux kernel. The Linux kernel keeps the
state of FPU use in a global variable. If the FPU is in use, an
extra routine that does all the register saving/restoring is called from
within the MMX memcpy function. If the kernel is in an interrupt, the MMX
functions are not used at all; instead it uses the slow memcpy. I don't know
why.
I changed the code to assume that no FPU code is running at the same time
(or that the multitasking code takes care of saving the registers), and I
also assume we are not in an interrupt. Because I don't know whether this
code runs in the real world, in the virtual world, or in both, I can't
quite see whether there could be problems here.
I could of course implement code that always saves the state, but
then the speed improvement of the MMX code would be smaller. This should
only be done when needed.
Is there already some status information within plex86 about MMX/FPU
use?
Well, here comes the diff.
Greetings,
Martin DvH
diff -urN ../plex86_orig2/config.h.in ./config.h.in
--- ../plex86_orig2/config.h.in Tue May 15 18:49:56 2001
+++ ./config.h.in Mon May 28 00:30:03 2001
@@ -135,6 +135,8 @@
/* limited i440FX PCI support */
#define BX_PCI_SUPPORT 0
+#define USE_MMX 0
+
#define BX_SUPPORT_CDROM 0
#if BX_SUPPORT_CDROM
/* This is the C++ class name to use if we are supporting */
diff -urN ../plex86_orig2/configure.in ./configure.in
--- ../plex86_orig2/configure.in Tue May 15 18:49:56 2001
+++ ./configure.in Mon May 28 00:30:03 2001
@@ -103,6 +103,24 @@
]
)
+dnl MMX memcpy support is on by default.  Only an explicit "no"
+dnl (--disable-mmx or --enable-mmx=no) turns it off; every other value
+dnl given to --enable-mmx is treated the same as "yes".
+AC_MSG_CHECKING(for mmx support)
+AC_ARG_ENABLE(mmx,
+  [ --enable-mmx enable use of mmx instructions],
+  [if test "$enableval" != no; then
+     AC_MSG_RESULT(yes)
+     AC_DEFINE(USE_MMX, 1)
+   else
+     AC_MSG_RESULT(no)
+     AC_DEFINE(USE_MMX, 0)
+   fi],
+  [
+     AC_MSG_RESULT(yes)
+     AC_DEFINE(USE_MMX, 1)
+  ]
+  )
AC_ARG_WITH(WinNT,
diff -urN ../plex86_orig2/kernel/Makefile.in ./kernel/Makefile.in
--- ../plex86_orig2/kernel/Makefile.in Sat May 26 07:07:11 2001
+++ ./kernel/Makefile.in Mon May 28 01:02:58 2001
@@ -64,7 +64,7 @@
$(KERNEL_TARGET): $(HOST_O) monitor-host.o \
nexus.o flag-nexus.o print-nexus.o segment-nexus.o \
- mode-nexus.o util-nexus.o system-nexus.o \
+ mode-nexus.o util-nexus.o util-nexus-mmx.o system-nexus.o \
fault-mon.o phymem-mon.o panic-mon.o \
paging-mon.o system-mon.o mode-mon.o instrument-mon.o \
emulation/emu.o dt/dt.o
diff -urN ../plex86_orig2/kernel/include/monitor.h ./kernel/include/monitor.h
--- ../plex86_orig2/kernel/include/monitor.h Sat May 26 07:07:11 2001
+++ ./kernel/include/monitor.h Mon May 28 00:50:56 2001
@@ -811,6 +811,8 @@
void mon_memzero(void *ptr, int size);
void mon_memcpy(void *dst, void *src, int size);
+void mon_slow_memcpy(void *dst, void *src, int size);
+void *mon_mmx_memcpy(void *dst, const void *src, int size);
void *mon_memset(void *s, unsigned c, unsigned n);
unsigned isV86MCompatible(vm_t *);
diff -urN ../plex86_orig2/kernel/util-nexus-mmx.c ./kernel/util-nexus-mmx.c
--- ../plex86_orig2/kernel/util-nexus-mmx.c Thu Jan 1 01:00:00 1970
+++ ./kernel/util-nexus-mmx.c Mon May 28 00:52:11 2001
@@ -0,0 +1,120 @@
+/*include <linux/types.h>
+include <linux/string.h>
+include <linux/sched.h>*/
+
+/*include <asm/i387.h>*/
+
+/*
+ * CR0 access helpers (GCC inline assembly, privileged instructions).
+ *
+ *   clts()        - clear the TS bit in CR0
+ *   read_cr0()    - evaluate to the current CR0 value (unsigned int)
+ *   write_cr0(x)  - load CR0 with x
+ *   stts()        - set the TS bit in CR0 (read-modify-write)
+ *   mon_fpu_end() - alias for stts()
+ *
+ * None of the macros ends in a semicolon, so each one behaves like an
+ * ordinary expression statement and can be the body of an unbraced
+ * if/else.  The CR0 read is marked volatile so the compiler can neither
+ * merge two reads nor move a read across a CR0 write.
+ */
+#define MON_CR0_TS 0x00000008	/* the TS bit, formerly a bare 8 */
+
+#define clts() __asm__ __volatile__ ("clts")
+
+#define read_cr0() ({ \
+	unsigned int mon_cr0_value; \
+	__asm__ __volatile__ ( \
+		"movl %%cr0,%0\n\t" \
+		: "=r" (mon_cr0_value)); \
+	mon_cr0_value; \
+})
+
+#define write_cr0(x) \
+	__asm__ __volatile__ ("movl %0,%%cr0" : : "r" (x))
+
+#define stts() write_cr0(MON_CR0_TS | read_cr0())
+
+#define mon_fpu_end() stts()
+/*
+ * mon_fpu_begin: make the FPU/MMX unit usable by clearing CR0.TS.
+ *
+ * Takes no arguments and returns nothing.  It does not save any
+ * register contents.
+ */
+void
+mon_fpu_begin(void)
+{
+  /*
+   * The Linux routine this was taken from first looked at the current
+   * task: when its flags had PF_USEDFPU set, it saved the task's FPU
+   * state with __save_init_fpu() and returned without reaching the clts
+   * below.  That step is left out here on the assumption that the FPU
+   * is not being used anywhere -- an assumption the original comment
+   * itself questions ("do we ?").
+   */
+  clts();
+}
+
+/*
+ * MMX block-copy helper for the monitor.
+ *
+ * Derived from the Linux kernel's MMX 3DNow! library helper functions.
+ *
+ *	22/09/2000 - Arjan van de Ven
+ *		Improved for non-engineering-sample Athlons
+ *	28/05/2001 - Martin Dudok van Heel
+ *		Changed this linux kernel-code so it can be used in plex86
+ *
+ * Differences from the code as first ported:
+ *
+ *  - The 3DNow! `prefetch' instructions are gone, together with the
+ *    .fixup/__ex_table self-patching fallback that guarded them.
+ *    `prefetch' is an AMD extension rather than part of MMX and raises
+ *    an invalid-opcode fault on CPUs without it; the fallback only helps
+ *    where a fault handler consults __ex_table, and no such handler is
+ *    visible in this patch.  Only plain MMX `movq' is used now.
+ *
+ *  - The x87/MMX state is saved with FNSAVE before the copy and reloaded
+ *    with FRSTOR afterwards.  The MMX registers alias the x87 data
+ *    registers, so without this the copy would overwrite whatever
+ *    floating-point state was live.  FRSTOR also reloads the tag word,
+ *    so no separate EMMS is needed.
+ *
+ *  - CR0.TS is put back the way it was found instead of being forced on.
+ *
+ *  - Copies with no complete 64-byte block never touch the FPU or CR0.
+ *
+ * To do:
+ *	Prefetching may be a win (reported so on K6-III); if it is brought
+ *	back it should sit behind a CPU-feature check.
+ *
+ * NOTE(review): this assumes CR0.EM is clear and that nothing running
+ * between the save and the restore (e.g. an interrupt handler) uses the
+ * FPU -- confirm for both host and monitor context.
+ */
+
+/* Byte-wise copy from util-nexus.c; used for the tail of the block. */
+void mon_slow_memcpy(void *dst, void *src, int size);
+
+/* The TS bit in CR0. */
+#define MON_MMX_CR0_TS 0x00000008
+
+/* Room for the 108-byte image FNSAVE writes in 32-bit protected mode. */
+struct mon_mmx_fpu_image {
+  unsigned char bytes[108];
+};
+
+/* Read CR0; volatile so the read stays ordered with the other CR0 asms. */
+static unsigned int mon_mmx_read_cr0(void)
+{
+  unsigned int value;
+
+  __asm__ __volatile__ ("movl %%cr0,%0" : "=r" (value));
+  return value;
+}
+
+/*
+ * Clear CR0.TS and park the current x87/MMX state in *image.
+ * Returns non-zero when TS was set on entry.
+ */
+static unsigned int mon_mmx_fpu_take(struct mon_mmx_fpu_image *image)
+{
+  unsigned int had_ts = mon_mmx_read_cr0() & MON_MMX_CR0_TS;
+
+  __asm__ __volatile__ ("clts");
+  /* FNSAVE stores the state and then reinitialises the FPU. */
+  __asm__ __volatile__ ("fnsave %0\n\tfwait" : "=m" (*image));
+  return had_ts;
+}
+
+/* Reload the parked x87/MMX state and set CR0.TS again if it was set. */
+static void mon_mmx_fpu_give_back(const struct mon_mmx_fpu_image *image,
+                                  unsigned int had_ts)
+{
+  __asm__ __volatile__ ("frstor %0" : : "m" (*image));
+  if (had_ts) {
+    __asm__ __volatile__ ("movl %0,%%cr0"
+                          : : "r" (mon_mmx_read_cr0() | MON_MMX_CR0_TS));
+  }
+}
+
+/*
+ * mon_mmx_memcpy: copy `size' bytes from `src' to `dst'.
+ *
+ * Whole 64-byte blocks go through four MMX registers; the remaining
+ * (size & 63) bytes are handed to mon_slow_memcpy().  The regions are
+ * expected not to overlap.  A `size' of zero or less copies nothing.
+ *
+ * Returns `dst'.
+ */
+void *mon_mmx_memcpy(void *dst, const void *src, int size)
+{
+  struct mon_mmx_fpu_image parked;
+  unsigned char *to = dst;
+  const unsigned char *from = src;
+  unsigned int had_ts;
+  int blocks;
+
+  if (size <= 0)
+    return dst;
+
+  blocks = size >> 6;	/* size/64 */
+
+  if (blocks > 0) {
+    had_ts = mon_mmx_fpu_take(&parked);
+
+    for (; blocks > 0; blocks--) {
+      __asm__ __volatile__ (
+        "  movq   (%0), %%mm0\n"
+        "  movq  8(%0), %%mm1\n"
+        "  movq 16(%0), %%mm2\n"
+        "  movq 24(%0), %%mm3\n"
+        "  movq %%mm0,   (%1)\n"
+        "  movq %%mm1,  8(%1)\n"
+        "  movq %%mm2, 16(%1)\n"
+        "  movq %%mm3, 24(%1)\n"
+        "  movq 32(%0), %%mm0\n"
+        "  movq 40(%0), %%mm1\n"
+        "  movq 48(%0), %%mm2\n"
+        "  movq 56(%0), %%mm3\n"
+        "  movq %%mm0, 32(%1)\n"
+        "  movq %%mm1, 40(%1)\n"
+        "  movq %%mm2, 48(%1)\n"
+        "  movq %%mm3, 56(%1)\n"
+        : : "r" (from), "r" (to) : "memory");
+      from += 64;
+      to += 64;
+    }
+
+    mon_mmx_fpu_give_back(&parked, had_ts);
+  }
+
+  /*
+   * Now do the tail of the block.  mon_slow_memcpy() only reads through
+   * its source pointer, so dropping the const qualifier is harmless.
+   */
+  mon_slow_memcpy(to, (void *) from, size & 63);
+  return dst;
+}
+
+
diff -urN ../plex86_orig2/kernel/util-nexus.c ./kernel/util-nexus.c
--- ../plex86_orig2/kernel/util-nexus.c Tue May 15 18:49:56 2001
+++ ./kernel/util-nexus.c Mon May 28 00:30:03 2001
@@ -26,7 +26,6 @@
#include "monitor.h"
-
void
mon_memzero(void *ptr, int size)
{
@@ -36,13 +35,25 @@
}
void
-mon_memcpy(void *dst, void *src, int size)
+mon_slow_memcpy(void *dst, void *src, int size)
{
char *d = dst;
char *s = src;
while (size--)
*d++ = *s++;
}
+
+
+/*
+ * mon_memcpy: copy `size' bytes from `src' to `dst'.
+ *
+ * The implementation is chosen at build time: mon_mmx_memcpy() when
+ * USE_MMX is non-zero, mon_slow_memcpy() otherwise.  Nothing is
+ * returned; the pointer handed back by mon_mmx_memcpy() is dropped.
+ */
+void
+mon_memcpy(void *dst, void *src, int size)
+{
+#if USE_MMX
+  (void) mon_mmx_memcpy(dst, src, size);
+#else
+  mon_slow_memcpy(dst, src, size);
+#endif
+}
+
void *
mon_memset(void *dst, unsigned c, unsigned n)