Michael,

Tres asked me to help with the configure.in and Makefile.am bits for his SSE2 code for x86_64. I've attached patches against Eterm CVS with changes to: configure.in, src/Makefile.am, src/pixmap.c, and the new file: src/sse2_cmod.c (changed since Tres's earlier version)

The configure.in changes add "--enable-sse2", which will be on by default for x86_64. The incomplete tests for MMX_64 have been removed. There was an odd "dps_snprintf_oflow()" line in configure.in which was causing problems on my i686 system, and which I think was a typo, so I removed it.

The src/Makefile.am changes just add the new source file sse2_cmod.c and conditionally compile it when HAVE_SSE2 is defined.

The src/pixmap.c changes add externs for the new sse2 functions, and conditionals to use them if HAVE_SSE2 is defined. Tres also added 8 or 16 byte alignment conditionally to colormod_trans() - this should perhaps be done only for gcc?

Tres indicated that adding "-mpreferred-stack-boundary=16" might still be beneficial on x86_64, but that it might consume extra space at run time. I'm not in a position to make that call, but if you agree it's a good idea I can probably make the configure.in change.

I've tested the changes on i686 and x86_64 and verified that the correct routines get compiled in. I verified that the modified Eterms run and that the Brightness control does something reasonable on both systems. I have not done any performance tests.

All credit for the sse2_cmod.c code goes to Tres. I just did the easy bits.

John




Index: configure.in
===================================================================
RCS file: /cvsroot/enlightenment/eterm/Eterm/configure.in,v
retrieving revision 1.92
diff -u -r1.92 configure.in
--- configure.in	1 May 2005 07:16:51 -0000	1.92
+++ configure.in	6 Jun 2005 14:44:01 -0000
@@ -220,7 +220,7 @@
 seteuid memmove putenv strsep setresuid setresgid \
 memmem usleep snprintf strcasestr strcasechr \
 strcasepbrk strrev nl_langinfo)
-dps_snprintf_oflow()
+
 AC_CHECK_LIB(m, pow)
 
 dnl# Portability checks for various functions
@@ -500,9 +500,11 @@
                 AC_DEFINE(PIXMAP_OFFSET, , [Define for pseudo-transparency support.])
 ])
 
+dnl#
+dnl# MMX support
+dnl#
 AC_MSG_CHECKING(for MMX support)
 HAVE_MMX=""
-HAVE_MMX_64=""
 AC_ARG_ENABLE(mmx, [  --enable-mmx            enable MMX assembly routines], [
                   test "x$enableval" = "xyes" && HAVE_MMX="yes"
               ], [
@@ -510,25 +512,39 @@
                       i*86)
                           grep mmx /proc/cpuinfo >/dev/null 2>&1 && HAVE_MMX="yes"
                           ;;
-                      x86_64)
-                          grep mmx /proc/cpuinfo >/dev/null 2>&1 && HAVE_MMX_64="yes"
-                          ;;
                   esac
               ])
 if test "x$HAVE_MMX" = "xyes"; then
     AC_MSG_RESULT([yes (32-bit)])
     AC_DEFINE(HAVE_MMX, , [Define for 32-bit MMX support.])
-elif test "x$HAVE_MMX_64" = "xyes"; then
-    dnl# AC_MSG_RESULT([yes (64-bit)])
-    dnl# AC_DEFINE(HAVE_MMX_64, , [Define for 64-bit MMX support.])
-    AC_MSG_RESULT([no (64-bit MMX not yet supported)])
 else
     AC_MSG_RESULT([no (no MMX detected)])
 fi
-dnl# AM_CONDITIONAL(HAVE_MMX, test "x$HAVE_MMX" = "xyes" -o "x$HAVE_MMX_64" = "xyes")
 AM_CONDITIONAL(HAVE_MMX, test "x$HAVE_MMX" = "xyes")
 
 dnl#
+dnl# SSE2 support
+dnl#
+AC_MSG_CHECKING(for SSE2 support)
+HAVE_SSE2=""
+AC_ARG_ENABLE(sse2, [  --enable-sse2            enable SSE2 assembly routines], [
+                  test "x$enableval" = "xyes" && HAVE_SSE2="yes"
+              ], [
+                  case $host_cpu in
+                      x86_64)
+                          grep sse2 /proc/cpuinfo >/dev/null 2>&1 && HAVE_SSE2="yes"
+                          ;;
+                  esac
+              ])
+if test "x$HAVE_SSE2" = "xyes"; then
+    AC_MSG_RESULT([yes])
+    AC_DEFINE(HAVE_SSE2, , [Define for 64-bit SSE2 support.])
+else
+    AC_MSG_RESULT([no (no SSE2 detected)])
+fi
+AM_CONDITIONAL(HAVE_SSE2, test "x$HAVE_SSE2" = "xyes")
+
+dnl#
 dnl# LibAST
 dnl#
 LIBAST_MIN=5
Index: src/Makefile.am
===================================================================
RCS file: /cvsroot/enlightenment/eterm/Eterm/src/Makefile.am,v
retrieving revision 1.29
diff -u -r1.29 Makefile.am
--- src/Makefile.am	15 Mar 2005 21:48:01 -0000	1.29
+++ src/Makefile.am	6 Jun 2005 14:44:01 -0000
@@ -6,6 +6,9 @@
 MMX_SRCS = mmx_cmod.S
 MMX_OBJS = mmx_cmod.lo
 
+SSE2_SRCS = sse2_cmod.c
+SSE2_OBJS = sse2_cmod.lo
+
 libEterm_la_SOURCES = actions.c actions.h buttons.c buttons.h command.c          \
                       command.h draw.c draw.h e.c e.h eterm_debug.h eterm_utmp.h \
                       events.c events.h feature.h font.c font.h grkelot.c        \
@@ -16,22 +19,27 @@
                       timer.c timer.h utmp.c windows.c windows.h defaultfont.c   \
                       defaultfont.h libscream.c scream.h screamcfg.h
 
-EXTRA_libEterm_la_SOURCES = $(MMX_SRCS)
+EXTRA_libEterm_la_SOURCES = $(MMX_SRCS) $(SSE2_SRCS)
 
 libEterm_la_LDFLAGS = -release $(VERSION)
+if HAVE_SSE2
+libEterm_la_DEPENDENCIES = feature.h $(SSE2_OBJS)
+libEterm_la_LIBADD = $(SSE2_OBJS)
+else
 if HAVE_MMX
 libEterm_la_DEPENDENCIES = feature.h $(MMX_OBJS)
 libEterm_la_LIBADD = $(MMX_OBJS)
 else
 libEterm_la_DEPENDENCIES = feature.h
 endif
+endif
 
 Eterm_SOURCES = main.c
 Eterm_DEPENDENCIES = libEterm.la
 Eterm_LDFLAGS = -rpath $(libdir):$(pkglibdir)
 Eterm_LDADD = libEterm.la 
 
-EXTRA_DIST = mmx_cmod.S
+EXTRA_DIST = mmx_cmod.S sse2_cmod.c
 
 install-exec-hook:
 	$(mkinstalldirs) $(DESTDIR)$(pkgdatadir)
Index: src/pixmap.c
===================================================================
RCS file: /cvsroot/enlightenment/eterm/Eterm/src/pixmap.c,v
retrieving revision 1.112
diff -u -r1.112 pixmap.c
--- src/pixmap.c	10 May 2005 18:59:50 -0000	1.112
+++ src/pixmap.c	6 Jun 2005 14:44:03 -0000
@@ -57,11 +57,16 @@
 #  define WORDS_BIGENDIAN 0
 #endif
 
-/* Assembler routines */
+/* Assembler routines for 32 bit cpu with mmx */
 extern void shade_ximage_15_mmx(void *data, int bpl, int w, int h, int rm, int gm, int bm);
 extern void shade_ximage_16_mmx(void *data, int bpl, int w, int h, int rm, int gm, int bm);
 extern void shade_ximage_32_mmx(void *data, int bpl, int w, int h, int rm, int gm, int bm);
 
+/* Assembler routines for 64 bit cpu with sse2 */
+extern void shade_ximage_15_sse2(void *data, int bpl, int w, int h, int rm, int gm, int bm);
+extern void shade_ximage_16_sse2(void *data, int bpl, int w, int h, int rm, int gm, int bm);
+extern void shade_ximage_32_sse2(void *data, int bpl, int w, int h, int rm, int gm, int bm);
+
 #ifdef PIXMAP_SUPPORT
 static Imlib_Border bord_none = { 0, 0, 0, 0 };
 #endif
@@ -1548,7 +1553,7 @@
 
 /* New optimized routines for tinting XImages written by Willem Monsuwe <[EMAIL PROTECTED]> */
 
-#ifndef HAVE_MMX
+#if !defined HAVE_MMX && !defined HAVE_SSE2
 /* RGB 15 */
 static void
 shade_ximage_15(void *data, int bpl, int w, int h, int rm, int gm, int bm)
@@ -1740,7 +1745,13 @@
 colormod_trans(Pixmap p, imlib_t *iml, GC gc, unsigned short w, unsigned short h)
 {
 
+#ifdef HAVE_SSE2
+    XImage * __attribute__ ((aligned (16))) ximg;
+#elif defined HAVE_MMX
+    XImage * __attribute__ ((aligned (8))) ximg;
+#else
     XImage *ximg;
+#endif
     register unsigned long i;
 
 #if 0
@@ -1845,7 +1856,10 @@
         /* Determine bitshift and bitmask values */
         switch (real_depth) {
             case 15:
-#ifdef HAVE_MMX
+#ifdef HAVE_SSE2
+                D_PIXMAP(("Using SSE2 - 15 bit\n"));
+                shade_ximage_15_sse2(ximg->data, ximg->bytes_per_line, w, h, rm, gm, bm);
+#elif defined HAVE_MMX
                 D_PIXMAP(("Using MMX - 15 bit\n"));
                 shade_ximage_15_mmx(ximg->data, ximg->bytes_per_line, w, h, rm, gm, bm);
 #else
@@ -1854,7 +1868,10 @@
 #endif
                 break;
             case 16:
-#ifdef HAVE_MMX
+#ifdef HAVE_SSE2
+                D_PIXMAP(("Using SSE2 - 16 bit\n"));
+                shade_ximage_16_sse2(ximg->data, ximg->bytes_per_line, w, h, rm, gm, bm);
+#elif defined HAVE_MMX
                 D_PIXMAP(("Using MMX - 16 bit\n"));
                 shade_ximage_16_mmx(ximg->data, ximg->bytes_per_line, w, h, rm, gm, bm);
 #else
@@ -1869,7 +1886,10 @@
                 }
                 /* drop */
             case 32:
-#ifdef HAVE_MMX
+#ifdef HAVE_SSE2
+                D_PIXMAP(("Using SSE2 - 32 bit\n"));
+                shade_ximage_32_sse2(ximg->data, ximg->bytes_per_line, w, h, rm, gm, bm);
+#elif defined HAVE_MMX
                 D_PIXMAP(("Using MMX - 32 bit\n"));
                 shade_ximage_32_mmx(ximg->data, ximg->bytes_per_line, w, h, rm, gm, bm);
 #else
/*  File:  sse2_cmod.c
 *  Written and Copyright (C) 2005 by Tres Melton
 *
 *  Permission is hereby granted to Michael Jennings to license this code as
 *  he sees fit.  I'd prefer the GPL but he will choose the BSD. The debate
 *  is moot as this is to become a part of the Eterm project, for which he is 
 *  the primary author.  For users of this code I ask that any modifications
 *  be released back into the community, but if Michael Jennings chooses the
 *  BSD license then that request has no backing in law.
 *
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,  
 *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL   
 *  THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER  
 *  IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN   
 *  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 *
 *  Much inspiration was drawn from the original x86 MMX port written by
 *	Willem Monsuwe <[EMAIL PROTECTED]> in pure x86/MMX Assembly.  The MMX
 *	instructions are taken almost verbatim but the memory and parameter
 *	accessing had to be completely reworked for the x86_64 ABI and to 
 *	ensure they worked with various gcc options.  Further the code was
 *	extended to take advantage of the 128 bit xmm registers in SSE2.
 *
 *  Manuals used in this port:
 *      The Gnu Assembler
 *              http://www.gnu.org/software/binutils/manual/gas-2.9.1/html_mono/as.html
 *      AMD64 Architecture Programmer's Manual Volume 1: Application Programming
 *              http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24592.pdf
 *      AMD64 Architecture Programmer's Manual Volume 2: System Programming
 *              http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
 *      AMD64 Architecture Programmer's Manual Volume 3: General-Purpose and System Instructions
 *              http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24594.pdf
 *      AMD64 Architecture Programmer's Manual Volume 4: 128-Bit Media Instructions
 *              http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26568.pdf
 *      AMD64 Architecture Programmer's Manual Volume 5: 64-Bit Media and x87 Floating-Point Instructions
 *              http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26569.pdf
 *	AMD64 Application Binary Interface (v. 0.95)
 *		http://www.x86-64.org/documentation/documentation/abi-0.95.pdf
 *
 *  The 32 bit color modification algorithm is simple but so optimized 
 *    (even the C version) that it is almost unreadable.  
 *    Therefore the pseudo code is:
 *
 *	for each color of each pixel
 *		new_color = color * modifier
 *		if ( new_color > max_color_value )
 *			new_color = max_color_value
 *		end if
 *	end for
 *
 *  The AMD64 ABI is at version 0.95 and might change in the future.  Further it has changed a
 *	number of times in the past (although mostly in 2002-2003) as evidenced by the mailing
 *	list on http://www.x86-64.org.  The GCC, Glibc, and Linux kernel have changed as well
 *	during this time to keep up.  The standard C definition states that function parameters
 *	are to be passed on the stack but that can be very inefficient compared to passing them
 *	in registers so gcc tries to use registers.  This is very different than on the register
 *	starved i386 architecture (AMD64 adds 8 general purpose registers: %r8-%r15, and 
 *	MMX/SSE2/x87 adds 16 64/128/80bit registers: %xmm0-%xmm15).  The x86_64 version of GCC uses
 *	registers as efficiently as possible and as a result exactly which registers are used
 *	for which parameters has evolved.  Since all of these tools change simultaneously inline 
 *	assembly code in C functions is the only way to ensure that this code will continue to 
 *	function through a (however unlikely) change.  If pure assembly were to be used as the
 *	original MMX author, Willem Monsuwe, did and the ABI changed then this code would cease
 *	to function properly.  After examination of Willem's code I'm wondering if he
 *	wrote it for GNU/Linux originally.  The ENTER and LEAVE macros put all of the 
 *	parameters on the stack so that they can be accessed by references to the Base_Pointer
 *	the way that ANSI C is defined.  If he originally wrote this for GNU/Linux then he most
 *	likely would have just used the registers instead of unwinding those optimizations
 * 	manually by pushing them to the stack.  And if he explicitly wanted to use the stack
 *	then there are parameters to gcc that would have performed those operations for him:
 *	-mregparm/-mmemparm.  Other gcc options that can tweak with the stack and the number
 *	of registers available for function parameters are: -fcall-used/-fcall-saved,
 *	-fcaller-saves, -fstack-protector, -fPIC/-fpic, -mno-push-args, etc..  It might be
 *	advisable to check for these switches when using the original MMX code and emit a
 *	warning if any are enabled.  I know that the PIC option trashes the BX register and
 *	that both Willem and I use that register.  In other words If you do manage to get it
 *	to compile & run w/ -fpic it WILL break. On the plus side, you can keep the pieces!  :-)
 *	On the other hand I could be wrong about everything
 *
 *  In Conclusion:
 *	Using C functions and inline assembly code should alleviate all of the concerns as the
 *	C compiler will ensure that the parameters get to the function in a guaranteed manner
 *	and the inline assembly explicitly loads them into the desired registers for the assembly
 *	code.  This might seem like a lot of overhead but great care has been taken to adhere to
 *	the x86_64 ABI so that gcc/gas/ld will not perform any unneeded operations even when no
 *	optimizations have been enabled (-O[123]).
 */

#include "config.h"

#ifdef HAVE_SSE2

void shade_ximage_15_sse2( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
{
  __asm__ __volatile__ (
	".align 16                      \n\t"   /* SIMD instructions should be aligned on 16 byte (128 bit) boundraries for performance reasons.*/
	"leaq -14(%%rsi, %%rbx, 2), %%rsi\n\t"	/* Load the stack index register with a pointer to data + ( width * bytes/pixel ) -6		*/
	"negq %%rbx			\n\t"	/* Negate the width to that we can increment the counter					*/
	"jz 10f				\n\t"	/* Jump to end if the line count is zero							*/
	"movd %[red_mod], %%xmm5	\n\t"	/* Load the color modifiers into mmx registers							*/
	"movd %[green_mod], %%xmm6	\n\t"	/* " "												*/
	"movd %[blue_mod], %%xmm7	\n\t"	/* " "												*/
	"punpcklwd %%xmm5, %%xmm5	\n\t"	/* Unpack and Interleave low words.  From A64_128bit_Media_Programming (p. 380)			*/
	"punpcklwd %%xmm6, %%xmm6	\n\t"	/* Duplicate the bottom 16 bits into the next 16 bits (both operands are the same)		*/
	"punpcklwd %%xmm7, %%xmm7	\n\t"
	"punpckldq %%xmm5, %%xmm5	\n\t"	/* Unpack and Interleave low double words.  From A64_128bit_Media_Programming (p. 376)		*/
	"punpckldq %%xmm6, %%xmm6	\n\t"	/* Duplicate the bottom 32 bits into the next 32 bits (both operands are the same)		*/
	"punpckldq %%xmm7, %%xmm7	\n\t"
	"punpcklqdq %%xmm5, %%xmm5	\n\t"	/* Unpack and Interleave low quad words.  From A64_128bit_Media_Programming (p. 378)		*/
	"punpcklqdq %%xmm6, %%xmm6	\n\t"	/* Duplicate the bottom 64 bits into the next 64 bits (both operands are the same)		*/
	"punpcklqdq %%xmm7, %%xmm7	\n\t"
	"or %[red_mod], %[green_mod]	\n\t"	/* This, and the following 4 instructions, check to see if all three colormodifiers are		*/
	"or %[blue_mod], %[green_mod]	\n\t"	/* less than 256.  If any of the modifiers are > 256 then they will have the 9th, or higher,	*/
	"sar $8, %[green_mod]		\n\t"	/* bit set.  Then we shift off eight bits, leaving something set if a modifier > 256. 		*/
	"movq %%rax, %[blue_mod]	\n\t"	/* Use the register named blue_mod to now store bytes_per_line.					*/
	"xor %[red_mod], %[red_mod]	\n\t"	/* zero red so we don't have to load an immediate value for the following compare.		*/
	"cmp %[red_mod], %[green_mod]	\n\t"	/* Compare the left over bits to zero								*/
	"jg 5f				\n\t"	/* If one of the colors (might) need saturated then jump to the secondary set of loops.		*/
	"1:				\n\t"	/* Start of the outer loop (lines).								*/
	"movq %%rbx, %%rcx		\n\t"	/* Move the width into the count register							*/
	"addq $7, %%rcx			\n\t"	
	"jns 3f				\n\t"
	"2:				\n\t"	/* Start of the inner loop (pixels 8 at a time --> 8 * 16 = 128bits/xmm register )		*/
	"movdqu (%%rsi, %%rcx, 2), %%xmm0\n\t"	/* Load the 16 bits of the pixel (5 bits for red, 6 bits for green, 5 bits for blue)		*/
	"movdqu %%xmm0, %%xmm1		\n\t"	/* Create a copy of the pixel for the green color						*/
	"movdqu %%xmm0, %%xmm2		\n\t"	/* Create a copy of the pixel for the blue color						*/
	"psrlw $5, %%xmm1		\n\t"	/* Packed Shift Right Logical Words								*/
						/* From A64_128bit_Media_Programming (p. 347)							*/
						/* Shifts the blue off of the green color							*/
	"psrlw $10, %%xmm0		\n\t"	/* Shifts the blue & green off of the red color							*/
	"psllw $11, %%xmm2		\n\t"	/* Packed Shift Left Logical Words								*/
						/* From A64_128bit_Media_Programming (p. 330)							*/
						/* Shifts the red & green off of the blue color							*/
	"psllw $11, %%xmm1		\n\t"	/* Shifts the red off of the green color							*/
	"psllw $8, %%xmm0		\n\t"	/* Shifts the red color into position								*/
	"psrlw $3, %%xmm1		\n\t"	/* Shifts the green color into position								*/
	"psrlw $3, %%xmm2		\n\t"	/* Shifts the blue color into position								*/
	"pmulhw %%xmm5, %%xmm0		\n\t"	/* color *= modifier										*/
	"pmulhw %%xmm6, %%xmm1		\n\t"
	"pmulhw %%xmm7, %%xmm2		\n\t"
	"psllw $10, %%xmm0		\n\t"	/* Shift red back into its original position							*/
	"psllw $5, %%xmm1		\n\t"	/* Shift green back into its original position							*/
	"por %%xmm2, %%xmm0		\n\t"	/* Mesh the colors back together								*/
	"por %%xmm1, %%xmm0		\n\t"
	"movdqu %%xmm0, (%%rsi, %%rcx, 2)\n\t"	/* Place the shaded 8 pixels back into the image map						*/
	"addq $8, %%rcx			\n\t"	
	"js 2b				\n\t"
	"jmp 4f				\n\t"
	"3:				\n\t"	/* Deal with pixels one at a time here.								 */
	"movw (%%rsi, %%rcx, 2), %%ax	\n\t"
	"movd %%eax, %%xmm0		\n\t"
	"movq %%xmm0, %%xmm1		\n\t"
	"movq %%xmm0, %%xmm2		\n\t"
	"psrlw $5, %%xmm1		\n\t"
	"psrlw $10, %%xmm0		\n\t"
	"psllw $11, %%xmm2		\n\t"
	"psllw $11, %%xmm1		\n\t"
	"psllw $8, %%xmm0		\n\t"
	"psrlw $3, %%xmm1		\n\t"
	"psrlw $3, %%xmm2		\n\t"
	"pmulhw %%xmm5, %%xmm0		\n\t"
	"pmulhw %%xmm6, %%xmm1		\n\t"
	"pmulhw %%xmm7, %%xmm2		\n\t"
	"psllw $10, %%xmm0		\n\t"
	"psllw $5, %%xmm1		\n\t"
	"por %%xmm2, %%xmm0		\n\t"
	"por %%xmm1, %%xmm0		\n\t"
	"movd %%xmm0, %%eax		\n\t"
	"movw %%ax, (%%rsi, %%rcx, 2)	\n\t"
	"incq %%rcx			\n\t"
	"4:				\n\t"
	"cmpq $6, %%rcx			\n\t"
	"jng 3b				\n\t"
	"addq %[blue_mod], %%rsi	\n\t"	/* Blue_mod is the name of a register that now contains bytes_per_line.				*/
	"decq %%rdx			\n\t"
	"jnz 1b				\n\t"
	"jmp 10f			\n\t"	/* We're done!											*/

	"5:				\n\t"	/*  Saturation is required									*/
	"pcmpeqw %%xmm3, %%xmm3		\n\t"	/* Packed Compare Equal Words									*/
						/* From A64_128bit_Media_Programming (p. 276)							*/
						/* This sets xmm3 to 128 1's (since mm6 = mm6)							*/
	"psllw $5, %%xmm3		\n\t"	/* xmm3 = 8 copies of 1111 1111 1110 0000							*/
	"6:				\n\t"
	"movq %%rbx, %%rcx		\n\t"
	"addq $7, %%rcx			\n\t"
	"jns 8f				\n\t"
	"7:				\n\t"
	"movdqu (%%rsi, %%rcx, 2), %%xmm0\n\t"
	"movdqu %%xmm0, %%xmm1		\n\t"
	"movdqu %%xmm0, %%xmm2		\n\t"
	"psrlw $5, %%xmm1		\n\t"
	"psrlw $10, %%xmm0		\n\t"
	"psllw $11, %%xmm2		\n\t"
	"psllw $11, %%xmm1		\n\t"
	"psllw $8, %%xmm0		\n\t"
	"psrlw $3, %%xmm1		\n\t"
	"psrlw $3, %%xmm2		\n\t"
	"pmulhw %%xmm5, %%xmm0		\n\t"
	"pmulhw %%xmm6, %%xmm1		\n\t"
	"pmulhw %%xmm7, %%xmm2		\n\t"
	"paddusw %%xmm3, %%xmm0		\n\t"
	"paddusw %%xmm3, %%xmm1		\n\t"
	"paddusw %%xmm3, %%xmm2		\n\t"
	"psubw %%xmm3, %%xmm0		\n\t"	/* FIXME: This line needs added to the original asm code					*/
	"psubw %%xmm3, %%xmm1		\n\t"
	"psubw %%xmm3, %%xmm2		\n\t"
	"psllw $10, %%xmm0		\n\t"
	"psllw $5, %%xmm1		\n\t"
	"por %%xmm2, %%xmm0		\n\t"
	"por %%xmm1, %%xmm0		\n\t"
	"movdqu %%xmm0, (%%rsi, %%rcx, 2)\n\t"
	"addq $8, %%rcx			\n\t"
	"js 7b				\n\t"
	"jmp 9f				\n\t"
	"8:				\n\t"
	"movw (%%rsi, %%rcx, 2), %%ax	\n\t"
	"movd %%eax, %%xmm0		\n\t"
	"movq %%xmm0, %%xmm1		\n\t"
	"movq %%xmm0, %%xmm2		\n\t"
	"psrlw $5, %%xmm1		\n\t"
	"psrlw $10, %%xmm0		\n\t"
	"psllw $11, %%xmm2		\n\t"
	"psllw $11, %%xmm1		\n\t"
	"psllw $8, %%xmm0		\n\t"
	"psrlw $3, %%xmm1		\n\t"
	"psrlw $3, %%xmm2		\n\t"
	"pmulhw %%xmm5, %%xmm0		\n\t"
	"pmulhw %%xmm6, %%xmm1		\n\t"
	"pmulhw %%xmm7, %%xmm2		\n\t"
	"paddusw %%xmm3, %%xmm0		\n\t"
	"paddusw %%xmm3, %%xmm1		\n\t"
	"paddusw %%xmm3, %%xmm2		\n\t"
	"psubw %%xmm3, %%xmm0		\n\t"	/* FIXME: This line needs added to the original asm code					*/
	"psubw %%xmm3, %%xmm1		\n\t"
	"psubw %%xmm3, %%xmm2		\n\t"
	"psllw $10, %%xmm0		\n\t"
	"psllw $5, %%xmm1		\n\t"
	"por %%xmm2, %%xmm0		\n\t"
	"por %%xmm1, %%xmm0		\n\t"
	"movd %%xmm0, %%eax		\n\t"
	"movw %%ax, (%%rsi, %%rcx, 2)	\n\t"
	"incq %%rcx			\n\t"
	"9:				\n\t"
	"cmpq $6, %%rcx			\n\t"
	"jng 8b				\n\t"
	"addq %[blue_mod], %%rsi	\n\t"	/* Blue_mod is the name of a register that now contains bytes_per_line.				*/
	"decq %%rdx			\n\t"
	"jnz 6b				\n\t"
	"10:				\n\t"	/* This is the end.  Jump here if the line count is zero.					*/
	"emms				\n\t"	/* exit multi-media state (last asm instruction)						*/
	: 					/* outputs: none										*/
						/* inputs: (many operations cannot be performed with a mix of 32bit & 64bit operands directly)	*/
						/*	(however the compiler/assembler can preload 32bit values into 64bit registers)		*/
						/*	(that is why certain variables cannot be referenced by name -- use their register)	*/
	: [data]       "S" (data), 		/*   put the pointer data into the rsi register							*/
	  [width]      "b" (w),			/*   put the width in the %rbx register	(cannot be referenced by name)				*/
	  [height]     "d" (h),			/*   put the heigth in the %rdx register (cannot be referenced by name)				*/
	  [red_mod]    "r" ((unsigned long)(rm)),/*  put the red_modifier   in a register (referenced by name)					*/
	  [green_mod]  "r" ((unsigned long)(gm)),/*  put the green_modifier in a register (referenced by name)					*/
	  [blue_mod]   "r" ((unsigned long)(bm)),/*  put the blue_modifier  in a register (referenced by name)	Later store the bytes_line here	*/
	  [bytes_line] "a" (bpl)		/*   put the bytes_per_line in the %rax register (cannot be referenced by name)			*/
	: "memory"				/* clobbers: (memory includes all the registers)						*/
  );	/*  End of Assembly  */
}


void shade_ximage_16_sse2( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
{
  __asm__ __volatile__ (
	".align 16                      \n\t"   /* SIMD instructions should be aligned on 16 byte (128 bit) boundraries for performance reasons.*/
	"leaq -14(%%rsi, %%rbx, 2), %%rsi\n\t"	/* Load the stack index register with a pointer to data + ( width * bytes/pixel ) -6		*/
	"negq %%rbx			\n\t"	/* Negate the width to that we can increment the counter					*/
	"jz 10f				\n\t"	/* Jump to end if the line count is zero							*/
	"movd %[red_mod], %%xmm5	\n\t"	/* Load the color modifiers into mmx registers							*/
	"movd %[green_mod], %%xmm6	\n\t"	/* " "												*/
	"movd %[blue_mod], %%xmm7	\n\t"	/* " "												*/
	"punpcklwd %%xmm5, %%xmm5	\n\t"	/* Unpack and Interleave low words.  From A64_128bit_Media_Programming (p. 380)			*/
	"punpcklwd %%xmm6, %%xmm6	\n\t"	/* Duplicate the bottom 16 bits into the next 16 bits (both operands are the same)		*/
	"punpcklwd %%xmm7, %%xmm7	\n\t"
	"punpckldq %%xmm5, %%xmm5	\n\t"	/* Unpack and Interleave low double words.  From A64_128bit_Media_Programming (p. 376)		*/
	"punpckldq %%xmm6, %%xmm6	\n\t"	/* Duplicate the bottom 32 bits into the next 32 bits (both operands are the same)		*/
	"punpckldq %%xmm7, %%xmm7	\n\t"
	"punpcklqdq %%xmm5, %%xmm5	\n\t"	/* Unpack and Interleave low quad words.  From A64_128bit_Media_Programming (p. 378)		*/
	"punpcklqdq %%xmm6, %%xmm6	\n\t"	/* Duplicate the bottom 64 bits into the next 64 bits (both operands are the same)		*/
	"punpcklqdq %%xmm7, %%xmm7	\n\t"
	"or %[red_mod], %[green_mod]	\n\t"	/* This, and the following 4 instructions, check to see if all three colormodifiers are		*/
	"or %[blue_mod], %[green_mod]	\n\t"	/* less than 256.  If any of the modifiers are > 256 then they will have the 9th, or higher,	*/
	"sar $8, %[green_mod]		\n\t"	/* bit set.  Then we shift off eight bits, leaving something set if a modifier > 256. 		*/
	"movq %%rax, %[blue_mod]	\n\t"	/* Use the register named blue_mod to now store bytes_per_line.		*/
	"xor %[red_mod], %[red_mod]	\n\t"	/* zero red so we don't have to load an immediate value for the following compare.		*/
	"cmp %[red_mod], %[green_mod]	\n\t"	/* Compare the left over bits to zero								*/
	"jg 5f				\n\t"	/* If one of the colors (might) need saturated then jump to the secondary set of loops.		*/
	"1:				\n\t"	/* Start of the outer loop (lines).								*/
	"movq %%rbx, %%rcx		\n\t"	/* Move the width into the count register							*/
	"addq $7, %%rcx			\n\t"	
	"jns 3f				\n\t"
	"2:				\n\t"	/* Start of the inner loop (pixels 8 at a time --> 8 * 16 = 128bits/xmm register )		*/
	"movdqu (%%rsi, %%rcx, 2), %%xmm0\n\t"	/* Load the 16 bits of the pixel (5 bits for red, 6 bits for green, 5 bits for blue)		*/
	"movdqu %%xmm0, %%xmm1		\n\t"	/* Create a copy of the pixel for the green color						*/
	"movdqu %%xmm0, %%xmm2		\n\t"	/* Create a copy of the pixel for the blue color						*/
	"psrlw $5, %%xmm1		\n\t"	/* Packed Shift Right Logical Words								*/
						/* From A64_128bit_Media_Programming (p. 347)							*/
						/* Shifts the blue off of the green color							*/
	"psrlw $11, %%xmm0		\n\t"	/* Shifts the blue & green off of the red color							*/
	"psllw $11, %%xmm2		\n\t"	/* Packed Shift Left Logical Words								*/
						/* From A64_128bit_Media_Programming (p. 330)							*/
						/* Shifts the red & green off of the blue color							*/
	"psllw $10, %%xmm1		\n\t"	/* Shifts the red off of the green color							*/
	"psllw $8, %%xmm0		\n\t"	/* Shifts the red color into position								*/
	"psrlw $2, %%xmm1		\n\t"	/* Shifts the green color into position								*/
	"psrlw $3, %%xmm2		\n\t"	/* Shifts the blue color into position								*/
	"pmulhw %%xmm5, %%xmm0		\n\t"	/* color *= modifier										*/
	"pmulhw %%xmm6, %%xmm1		\n\t"
	"pmulhw %%xmm7, %%xmm2		\n\t"
	"psllw $11, %%xmm0		\n\t"	/* Shift red back into its original position							*/
	"psllw $5, %%xmm1		\n\t"	/* Shift green back into its original position							*/
	"por %%xmm2, %%xmm0		\n\t"	/* Mesh the colors back together								*/
	"por %%xmm1, %%xmm0		\n\t"
	"movdqu %%xmm0, (%%rsi, %%rcx, 2)\n\t"	/* Place the shaded 8 pixels back into the image map						*/
	"addq $8, %%rcx			\n\t"	
	"js 2b				\n\t"
	"jmp 4f				\n\t"
	"3:				\n\t"	/* Deal with pixels one at a time here.								 */
	"movw (%%rsi, %%rcx, 2), %%ax	\n\t"
	"movd %%eax, %%xmm0		\n\t"
	"movq %%xmm0, %%xmm1		\n\t"
	"movq %%xmm0, %%xmm2		\n\t"
	"psrlw $5, %%xmm1		\n\t"
	"psrlw $11, %%xmm0		\n\t"
	"psllw $11, %%xmm2		\n\t"
	"psllw $10, %%xmm1		\n\t"
	"psllw $8, %%xmm0		\n\t"
	"psrlw $2, %%xmm1		\n\t"
	"psrlw $3, %%xmm2		\n\t"
	"pmulhw %%xmm5, %%xmm0		\n\t"
	"pmulhw %%xmm6, %%xmm1		\n\t"
	"pmulhw %%xmm7, %%xmm2		\n\t"
	"psllw $11, %%xmm0		\n\t"
	"psllw $5, %%xmm1		\n\t"
	"por %%xmm2, %%xmm0		\n\t"
	"por %%xmm1, %%xmm0		\n\t"
	"movd %%xmm0, %%eax		\n\t"
	"movw %%ax, (%%rsi, %%rcx, 2)	\n\t"
	"incq %%rcx			\n\t"
	"4:				\n\t"
	"cmpq $6, %%rcx			\n\t"
	"jng 3b				\n\t"
	"addq %[blue_mod], %%rsi	\n\t"	/* Blue_mod is the name of a register that now contains bytes_per_line.				*/
	"decq %%rdx			\n\t"
	"jnz 1b				\n\t"
	"jmp 10f			\n\t"	/* We're done!											*/

	"5:				\n\t"	/*  Saturation is required									*/
	"pcmpeqw %%xmm3, %%xmm3		\n\t"	/* Packed Compare Equal Words									*/
						/* From A64_128bit_Media_Programming (p. 276)							*/
						/* This sets xmm3 to 128 1's (since mm6 = mm6)							*/
	"movdqu %%xmm3, %%xmm4		\n\t"	/* Make copy of 128 ones									*/
	"psllw $5, %%xmm3		\n\t"	/* xmm3 = 8 copies of 1111 1111 1110 0000							*/
	"psllw $6, %%xmm4		\n\t"	/* xmm4 = 8 copies of 1111 1111 1100 0000							*/
	"6:				\n\t"
	"movq %%rbx, %%rcx		\n\t"
	"addq $7, %%rcx			\n\t"
	"jns 8f				\n\t"
	"7:				\n\t"
	"movdqu (%%rsi, %%rcx, 2), %%xmm0\n\t"
	"movdqu %%xmm0, %%xmm1		\n\t"
	"movdqu %%xmm0, %%xmm2		\n\t"
	"psrlw $5, %%xmm1		\n\t"
	"psrlw $11, %%xmm0		\n\t"
	"psllw $11, %%xmm2		\n\t"
	"psllw $10, %%xmm1		\n\t"
	"psllw $8, %%xmm0		\n\t"
	"psrlw $2, %%xmm1		\n\t"
	"psrlw $3, %%xmm2		\n\t"
	"pmulhw %%xmm5, %%xmm0		\n\t"
	"pmulhw %%xmm6, %%xmm1		\n\t"
	"pmulhw %%xmm7, %%xmm2		\n\t"
	"paddusw %%xmm3, %%xmm0		\n\t"
	"paddusw %%xmm4, %%xmm1		\n\t"
	"paddusw %%xmm3, %%xmm2		\n\t"
	"psubw %%xmm4, %%xmm1		\n\t"
	"psubw %%xmm3, %%xmm2		\n\t"
	"psllw $11, %%xmm0		\n\t"
	"psllw $5, %%xmm1		\n\t"
	"por %%xmm2, %%xmm0		\n\t"
	"por %%xmm1, %%xmm0		\n\t"
	"movdqu %%xmm0, (%%rsi, %%rcx, 2)\n\t"
	"addq $8, %%rcx			\n\t"
	"js 7b				\n\t"
	"jmp 9f				\n\t"
	"8:				\n\t"
	"movw (%%rsi, %%rcx, 2), %%ax	\n\t"
	"movd %%eax, %%xmm0		\n\t"
	"movq %%xmm0, %%xmm1		\n\t"
	"movq %%xmm0, %%xmm2		\n\t"
	"psrlw $5, %%xmm1		\n\t"
	"psrlw $11, %%xmm0		\n\t"
	"psllw $11, %%xmm2		\n\t"
	"psllw $10, %%xmm1		\n\t"
	"psllw $8, %%xmm0		\n\t"
	"psrlw $2, %%xmm1		\n\t"
	"psrlw $3, %%xmm2		\n\t"
	"		\n\t"
	"pmulhw %%xmm5, %%xmm0		\n\t"
	"pmulhw %%xmm6, %%xmm1		\n\t"
	"pmulhw %%xmm7, %%xmm2		\n\t"
	"		\n\t"
	"paddusw %%xmm3, %%xmm0		\n\t"
	"paddusw %%xmm4, %%xmm1		\n\t"
	"paddusw %%xmm3, %%xmm2		\n\t"
	"		\n\t"
	"psubw %%xmm4, %%xmm1		\n\t"
	"psubw %%xmm3, %%xmm2		\n\t"
	"		\n\t"
	"psllw $11, %%xmm0		\n\t"
	"psllw $5, %%xmm1		\n\t"
	"por %%xmm2, %%xmm0		\n\t"
	"por %%xmm1, %%xmm0		\n\t"
	"movd %%xmm0, %%eax		\n\t"
	"movw %%ax, (%%rsi, %%rcx, 2)	\n\t"
	"incq %%rcx			\n\t"
	"9:				\n\t"
	"cmpq $6, %%rcx			\n\t"
	"jng 8b				\n\t"
	"addq %[blue_mod], %%rsi	\n\t"	/* Blue_mod is the name of a register that now contains bytes_per_line.				*/
	"decq %%rdx			\n\t"
	"jnz 6b				\n\t"
	"10:				\n\t"	/* This is the end.  Jump here if the line count is zero.					*/
	"emms				\n\t"	/* exit multi-media state (last asm instruction)						*/
	: 					/* outputs: none										*/
						/* inputs: (many operations cannot be performed with a mix of 32bit & 64bit operands directly)	*/
						/*	(however the compiler/assembler can preload 32bit values into 64bit registers)		*/
						/*	(that is why certain variables cannot be referenced by name -- use their register)	*/
	: [data]       "S" (data), 		/*   put the pointer data into the rsi register							*/
	  [width]      "b" (w),			/*   put the width in the %rbx register	(cannot be referenced by name)				*/
	  [height]     "d" (h),			/*   put the heigth in the %rdx register (cannot be referenced by name)				*/
	  [red_mod]    "r" ((unsigned long)(rm)),/*  put the red_modifier   in a register (referenced by name)					*/
	  [green_mod]  "r" ((unsigned long)(gm)),/*  put the green_modifier in a register (referenced by name)					*/
	  [blue_mod]   "r" ((unsigned long)(bm)),/*  put the blue_modifier  in a register (referenced by name)	Later store the bytes_line here	*/
	  [bytes_line] "a" (bpl)		/*   put the bytes_per_line in the %rax register (cannot be referenced by name)			*/
	: "memory"				/* clobbers: (memory includes all the registers)						*/
  );	/*  End of Assembly  */
}


/*
 * shade_ximage_32_sse2 -- scale the colour bytes of a rectangle of 32-bit
 * pixels in place, using SSE2 (x86-64, GCC extended asm).
 *
 *   data  address of the first pixel of the first row to shade
 *   bpl   byte distance from the start of one row to the start of the next
 *   w     number of pixels to shade in each row
 *   h     number of rows to shade
 *   rm    multiplier applied to byte 2 of every pixel
 *   gm    multiplier applied to byte 1 of every pixel
 *   bm    multiplier applied to byte 0 of every pixel
 *
 * Each affected byte c is replaced by roughly (c * m) / 256, saturated to
 * 0..255, so a multiplier of 256 leaves the byte unchanged.  Byte 3 of every
 * shaded pixel is written as zero because its multiplier word is zero.
 * The multipliers are combined by shifting and OR-ing 16-bit lanes, so a
 * value that does not fit in 16 bits spills into the neighbouring channel.
 *
 * Nothing is touched when w <= 0 or h <= 0.
 */
void shade_ximage_32_sse2( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
{
  /* 64-bit working copies: the asm updates these registers, so they must be  */
  /* real in/out operands rather than read-only inputs.                       */
  unsigned char *row_end = (unsigned char *) data;
  long cols   = w;
  long rows   = h;
  long stride = bpl;			/* sign-extended so a negative stride steps backwards */

  /* The row loop below is a do/while; it must never be entered with an empty rectangle. */
  if (cols <= 0 || rows <= 0)
    return;

  __asm__ __volatile__ (
	"leaq -4(%%rsi, %%rbx, 4), %%rsi\n\t"	/* rsi = address of the LAST pixel of the current row (4 bytes/pixel)	*/
	"negq %%rbx			\n\t"	/* rbx = -w; columns are indexed with negative offsets from rsi		*/

	/* Build the modifier vector: each 64-bit half of xmm4 = words { bm, gm, rm, 0 }.				*/
	"movd %[red_mod], %%xmm4	\n\t"
	"psllq $16, %%xmm4		\n\t"
	"movd %[green_mod], %%xmm5	\n\t"
	"por %%xmm5, %%xmm4		\n\t"
	"psllq $16, %%xmm4		\n\t"
	"movd %[blue_mod], %%xmm5	\n\t"
	"por %%xmm5, %%xmm4		\n\t"
	"punpcklqdq %%xmm4, %%xmm4	\n\t"	/* copy the low quadword into the high quadword (two pixels at once)	*/

	/* xmm6 = 0x8000 in every word (sign-bias mask); xmm5 = pmulhw(0x8000, modifier) = bias correction.		*/
	"pcmpeqw %%xmm6, %%xmm6		\n\t"
	"psllw $15, %%xmm6		\n\t"
	"movdqa %%xmm6, %%xmm5		\n\t"
	"pmulhw %%xmm4, %%xmm5		\n\t"

	"1:				\n\t"	/* ---- row loop ----							*/
	"movq %%rbx, %%rcx		\n\t"
	"incq %%rcx			\n\t"	/* rcx = -(w - 1): offset of the first pixel relative to rsi		*/
	"jns 5f				\n\t"	/* fewer than two pixels in the row: skip the pair loop entirely	*/

	"2:				\n\t"	/* ---- pair loop: two pixels (8 bytes) per iteration ----		*/
	"movq (%%rsi, %%rcx, 4), %%xmm1	\n\t"	/* load two pixels into the low quadword				*/
	"pxor %%xmm0, %%xmm0		\n\t"
	"punpcklbw %%xmm1, %%xmm0	\n\t"	/* every byte c becomes the word c << 8					*/
	"pxor %%xmm6, %%xmm0		\n\t"	/* subtract the 0x8000 bias so pmulhw sees a signed value		*/
	"pmulhw %%xmm4, %%xmm0		\n\t"	/* high word of (biased value * modifier)				*/
	"psubw %%xmm5, %%xmm0		\n\t"	/* undo the bias								*/
	"packuswb %%xmm0, %%xmm0	\n\t"	/* words -> bytes with unsigned saturation (0..255)			*/
	"movq %%xmm0, (%%rsi, %%rcx, 4)	\n\t"	/* store the two shaded pixels						*/
	"addq $2, %%rcx			\n\t"
	"js 2b				\n\t"	/* still at least two pixels left in this row				*/
	"jmp 5f				\n\t"

	"4:				\n\t"	/* ---- single pixel: the odd last column of a row ----			*/
	"movd (%%rsi, %%rcx, 4), %%xmm1	\n\t"
	"pxor %%xmm0, %%xmm0		\n\t"
	"punpcklbw %%xmm1, %%xmm0	\n\t"
	"pxor %%xmm6, %%xmm0		\n\t"
	"pmulhw %%xmm4, %%xmm0		\n\t"
	"psubw %%xmm5, %%xmm0		\n\t"
	"packuswb %%xmm0, %%xmm0	\n\t"
	"movd %%xmm0, (%%rsi, %%rcx, 4)	\n\t"
	"incq %%rcx			\n\t"
	"5:				\n\t"
	"cmpq $0, %%rcx			\n\t"
	"jng 4b				\n\t"	/* rcx == 0 means the last pixel of the row is still unprocessed	*/

	"addq %%rax, %%rsi		\n\t"	/* advance to the last pixel of the next row				*/
	"decq %%rdx			\n\t"
	"jnz 1b				\n\t"	/* more rows left							*/
	/* No emms here: only XMM registers were used, so the MMX/x87 state is untouched.				*/
	: "+&S" (row_end),			/* in/out: rsi, pixel pointer (modified)				*/
	  "+&b" (cols),				/* in/out: rbx, width (negated)						*/
	  "+&d" (rows)				/* in/out: rdx, remaining row count (decremented)			*/
	: [red_mod]    "r" (rm),
	  [green_mod]  "r" (gm),
	  [blue_mod]   "r" (bm),
	  [bytes_line] "a" (stride)		/* rax, read-only							*/
	: "rcx", "xmm0", "xmm1", "xmm4", "xmm5", "xmm6", "cc", "memory"
  );	/*  End of Assembly  */
}

#endif
























Reply via email to