On Tue, 2006-01-03 at 15:49 -0700, Tres Melton wrote:
> Michael,
> 
Crap,  Wrong damn tree.  Try this instead of the last one please.


-- 
Tres Melton
IRC & Gentoo: RiverRat
Index: eterm/Eterm/src/pixmap.c
===================================================================
RCS file: /cvsroot/enlightenment/eterm/Eterm/src/pixmap.c,v
retrieving revision 1.115
diff -u -b -B -u -r1.115 pixmap.c
--- eterm/Eterm/src/pixmap.c	22 Dec 2005 23:31:33 -0000	1.115
+++ eterm/Eterm/src/pixmap.c	3 Jan 2006 22:59:40 -0000
@@ -66,10 +66,30 @@
 extern void shade_ximage_32_mmx(void *data, int bpl, int w, int h, int rm, int gm, int bm);
 
 /* Assembler routines for 64 bit cpu with sse2 */
-extern void shade_ximage_15_sse2(void *data, int bpl, int w, int h, int rm, int gm, int bm);
-extern void shade_ximage_16_sse2(void *data, int bpl, int w, int h, int rm, int gm, int bm);
+#ifdef HAVE_SSE2
+extern void shade_ximage_15_sse2_A(void *data, int bpl, int w, int h, int rm, int gm, int bm );	/* 15bpp, uses movdqa: every row must start on a 16-byte boundary */
+extern void shade_ximage_15_sse2_U(void *data, int bpl, int w, int h, int rm, int gm, int bm );	/* 15bpp, presumably the unaligned-safe variant (body not visible here -- confirm) */
+extern void shade_ximage_16_sse2_A(void *data, int bpl, int w, int h, int rm, int gm, int bm );	/* 16bpp, uses movdqa: every row must start on a 16-byte boundary */
+extern void shade_ximage_16_sse2_U(void *data, int bpl, int w, int h, int rm, int gm, int bm );	/* 16bpp, presumably the unaligned-safe variant (body not visible here -- confirm) */
 extern void shade_ximage_32_sse2(void *data, int bpl, int w, int h, int rm, int gm, int bm);
 
+#define ETERM_ALIGNMENT 16	/* Byte alignment the movdqa loads/stores in the _A routines require; must be a power of two because it is used as a mask. */
+
+/* Dispatch to the movdqa (_A) shader only when data and bpl are BOTH multiples of  */
+/* ETERM_ALIGNMENT, otherwise to _U.  Note: data and bpl are evaluated twice.       */
+#define shade_ximage_15_sse2( data, bpl, w, h, rm, gm, bm )                   \
+  (((((long) ( data )) | ((long) ( bpl ))) & ((long) ( ETERM_ALIGNMENT - 1 ))) \
+    ? shade_ximage_15_sse2_U((data), (bpl), (w), (h), (rm), (gm), (bm))       \
+    : shade_ximage_15_sse2_A((data), (bpl), (w), (h), (rm), (gm), (bm)))
+
+/* Dispatch to the movdqa (_A) shader only when data and bpl are BOTH multiples of  */
+/* ETERM_ALIGNMENT, otherwise to _U.  Note: data and bpl are evaluated twice.       */
+#define shade_ximage_16_sse2( data, bpl, w, h, rm, gm, bm )                   \
+  (((((long) ( data )) | ((long) ( bpl ))) & ((long) ( ETERM_ALIGNMENT - 1 ))) \
+    ? shade_ximage_16_sse2_U((data), (bpl), (w), (h), (rm), (gm), (bm))       \
+    : shade_ximage_16_sse2_A((data), (bpl), (w), (h), (rm), (gm), (bm)))
+#endif
+
 #ifdef PIXMAP_SUPPORT
 static Imlib_Border bord_none = { 0, 0, 0, 0 };
 #endif
Index: eterm/Eterm/src/sse2_cmod.c
===================================================================
RCS file: /cvsroot/enlightenment/eterm/Eterm/src/sse2_cmod.c,v
retrieving revision 1.1
diff -u -b -B -u -r1.1 sse2_cmod.c
--- eterm/Eterm/src/sse2_cmod.c	14 Jun 2005 19:39:01 -0000	1.1
+++ eterm/Eterm/src/sse2_cmod.c	3 Jan 2006 22:59:41 -0000
@@ -94,7 +88,7 @@
 
 #ifdef HAVE_SSE2
 
-void shade_ximage_15_sse2( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
+void shade_ximage_15_sse2_U( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
 {
   __asm__ __volatile__ (
 	".align 16                      \n\t"   /* SIMD instructions should be aligned on 16 byte (128 bit) boundraries for performance reasons.*/
@@ -269,7 +263,7 @@
 }
 
 
-void shade_ximage_16_sse2( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
+void shade_ximage_16_sse2_U( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
 {
   __asm__ __volatile__ (
 	".align 16                      \n\t"   /* SIMD instructions should be aligned on 16 byte (128 bit) boundraries for performance reasons.*/
@@ -447,6 +441,359 @@
   );	/*  End of Assembly  */
 }
 
+void shade_ximage_15_sse2_A( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
+{	/* Shades a 15bpp (5-5-5) image in place, 8 pixels per movdqa; data and every row start (data + n * bpl) must be 16-byte aligned.	*/
+  __asm__ __volatile__ (	/* Each colour field becomes field * modifier / 256; the loops after label 5 additionally clamp each field to 31.	*/
+	".align 16                      \n\t"   /* SIMD instructions should be aligned on 16 byte (128 bit) boundaries for performance reasons.	*/
+	"leaq -14(%%rsi, %%rbx, 2), %%rsi\n\t"	/* rsi = data + width * 2 - 14, so that (rsi, rcx, 2) with rcx = 7 - width addresses pixel 0	*/
+	"negq %%rbx			\n\t"	/* Negate the width so that we can increment the counter					*/
+	"jz 10f				\n\t"	/* Jump to end if the width is zero (flags come from negq; h == 0 is NOT checked)		*/
+	"movd %[red_mod], %%xmm5	\n\t"	/* Load the color modifiers into xmm registers							*/
+	"movd %[green_mod], %%xmm6	\n\t"	/* " "												*/
+	"movd %[blue_mod], %%xmm7	\n\t"	/* " "												*/
+	"punpcklwd %%xmm5, %%xmm5	\n\t"	/* Unpack and Interleave low words.  From A64_128bit_Media_Programming (p. 380)			*/
+	"punpcklwd %%xmm6, %%xmm6	\n\t"	/* Duplicate the bottom 16 bits into the next 16 bits (both operands are the same)		*/
+	"punpcklwd %%xmm7, %%xmm7	\n\t"
+	"punpckldq %%xmm5, %%xmm5	\n\t"	/* Unpack and Interleave low double words.  From A64_128bit_Media_Programming (p. 376)		*/
+	"punpckldq %%xmm6, %%xmm6	\n\t"	/* Duplicate the bottom 32 bits into the next 32 bits (both operands are the same)		*/
+	"punpckldq %%xmm7, %%xmm7	\n\t"
+	"punpcklqdq %%xmm5, %%xmm5	\n\t"	/* Unpack and Interleave low quad words.  From A64_128bit_Media_Programming (p. 378)		*/
+	"punpcklqdq %%xmm6, %%xmm6	\n\t"	/* Duplicate the bottom 64 bits into the next 64 bits (both operands are the same)		*/
+	"punpcklqdq %%xmm7, %%xmm7	\n\t"
+	"or %[red_mod], %[green_mod]	\n\t"	/* This and the following or/sar/xor/cmp check whether all three color modifiers are		*/
+	"or %[blue_mod], %[green_mod]	\n\t"	/* less than 256.  A modifier >= 256 has the 9th, or a higher, bit set.				*/
+	"sar $8, %[green_mod]		\n\t"	/* Shift off eight bits, leaving something set if any modifier is >= 256.			*/
+	"movq %%rax, %[blue_mod]	\n\t"	/* Use the register named blue_mod to now store bytes_per_line.					*/
+	"xor %[red_mod], %[red_mod]	\n\t"	/* zero red so we don't have to load an immediate value for the following compare.		*/
+	"cmp %[red_mod], %[green_mod]	\n\t"	/* Compare the left over bits to zero								*/
+	"jg 5f				\n\t"	/* If one of the colors (might) need saturated then jump to the secondary set of loops.		*/
+	"1:				\n\t"	/* Start of the outer loop (lines).								*/
+	"movq %%rbx, %%rcx		\n\t"	/* Move the (negated) width into the count register						*/
+	"addq $7, %%rcx			\n\t"	/* rcx = 7 - width: negative while at least 8 pixels remain in the row				*/
+	"jns 3f				\n\t"	/* Fewer than 8 pixels in the row: go straight to the one-pixel loop				*/
+	"2:				\n\t"	/* Start of the inner loop (pixels 8 at a time --> 8 * 16 = 128bits/xmm register )		*/
+	"movdqa (%%rsi, %%rcx, 2), %%xmm0\n\t"	/* Aligned load of 8 pixels (16 bits each: 5 bits red, 5 bits green, 5 bits blue)		*/
+	"movdqa %%xmm0, %%xmm1		\n\t"	/* Create a copy of the pixel for the green color						*/
+	"movdqa %%xmm0, %%xmm2		\n\t"	/* Create a copy of the pixel for the blue color						*/
+	"psrlw $5, %%xmm1		\n\t"	/* Packed Shift Right Logical Words								*/
+						/* From A64_128bit_Media_Programming (p. 347)							*/
+						/* Shifts the blue off of the green color							*/
+	"psrlw $10, %%xmm0		\n\t"	/* Shifts the blue & green off of the red color							*/
+	"psllw $11, %%xmm2		\n\t"	/* Packed Shift Left Logical Words								*/
+						/* From A64_128bit_Media_Programming (p. 330)							*/
+						/* Shifts the red & green off of the blue color							*/
+	"psllw $11, %%xmm1		\n\t"	/* Shifts the red off of the green color							*/
+	"psllw $8, %%xmm0		\n\t"	/* Shifts the red color into position								*/
+	"psrlw $3, %%xmm1		\n\t"	/* Shifts the green color into position								*/
+	"psrlw $3, %%xmm2		\n\t"	/* Shifts the blue color into position								*/
+	"pmulhw %%xmm5, %%xmm0		\n\t"	/* color *= modifier (high word of (color << 8) * modifier, i.e. color * modifier / 256)	*/
+	"pmulhw %%xmm6, %%xmm1		\n\t"
+	"pmulhw %%xmm7, %%xmm2		\n\t"
+	"psllw $10, %%xmm0		\n\t"	/* Shift red back into its original position							*/
+	"psllw $5, %%xmm1		\n\t"	/* Shift green back into its original position							*/
+	"por %%xmm2, %%xmm0		\n\t"	/* Mesh the colors back together								*/
+	"por %%xmm1, %%xmm0		\n\t"
+	"movdqa %%xmm0, (%%rsi, %%rcx, 2)\n\t"	/* Place the shaded 8 pixels back into the image map						*/
+	"addq $8, %%rcx			\n\t"	/* Advance 8 pixels										*/
+	"js 2b				\n\t"	/* Still 8 or more pixels left in this row							*/
+	"jmp 4f				\n\t"	/* Finish the row with the one-pixel loop							*/
+	"3:				\n\t"	/* Deal with pixels one at a time here.								*/
+	"movw (%%rsi, %%rcx, 2), %%ax	\n\t"	/* Same arithmetic as loop 2, applied to a single pixel that goes through ax			*/
+	"movd %%eax, %%xmm0		\n\t"
+	"movq %%xmm0, %%xmm1		\n\t"
+	"movq %%xmm0, %%xmm2		\n\t"
+	"psrlw $5, %%xmm1		\n\t"
+	"psrlw $10, %%xmm0		\n\t"
+	"psllw $11, %%xmm2		\n\t"
+	"psllw $11, %%xmm1		\n\t"
+	"psllw $8, %%xmm0		\n\t"
+	"psrlw $3, %%xmm1		\n\t"
+	"psrlw $3, %%xmm2		\n\t"
+	"pmulhw %%xmm5, %%xmm0		\n\t"
+	"pmulhw %%xmm6, %%xmm1		\n\t"
+	"pmulhw %%xmm7, %%xmm2		\n\t"
+	"psllw $10, %%xmm0		\n\t"
+	"psllw $5, %%xmm1		\n\t"
+	"por %%xmm2, %%xmm0		\n\t"
+	"por %%xmm1, %%xmm0		\n\t"
+	"movd %%xmm0, %%eax		\n\t"
+	"movw %%ax, (%%rsi, %%rcx, 2)	\n\t"
+	"incq %%rcx			\n\t"
+	"4:				\n\t"
+	"cmpq $6, %%rcx			\n\t"	/* rcx == 6 addresses the last pixel of the row							*/
+	"jng 3b				\n\t"
+	"addq %[blue_mod], %%rsi	\n\t"	/* Blue_mod is the name of a register that now contains bytes_per_line.				*/
+	"decq %%rdx			\n\t"	/* One line done										*/
+	"jnz 1b				\n\t"
+	"jmp 10f			\n\t"	/* We're done!											*/
+
+	"5:				\n\t"	/*  Saturation is required									*/
+	"pcmpeqw %%xmm3, %%xmm3		\n\t"	/* Packed Compare Equal Words									*/
+						/* From A64_128bit_Media_Programming (p. 276)							*/
+						/* This sets xmm3 to 128 1's (a register always compares equal to itself)			*/
+	"psllw $5, %%xmm3		\n\t"	/* xmm3 = 8 copies of 1111 1111 1110 0000							*/
+	"6:				\n\t"	/* Outer loop (lines), saturating version							*/
+	"movq %%rbx, %%rcx		\n\t"
+	"addq $7, %%rcx			\n\t"
+	"jns 8f				\n\t"
+	"7:				\n\t"	/* Inner loop, 8 pixels at a time, saturating version						*/
+	"movdqa (%%rsi, %%rcx, 2), %%xmm0\n\t"
+	"movdqa %%xmm0, %%xmm1		\n\t"
+	"movdqa %%xmm0, %%xmm2		\n\t"
+	"psrlw $5, %%xmm1		\n\t"
+	"psrlw $10, %%xmm0		\n\t"
+	"psllw $11, %%xmm2		\n\t"
+	"psllw $11, %%xmm1		\n\t"
+	"psllw $8, %%xmm0		\n\t"
+	"psrlw $3, %%xmm1		\n\t"
+	"psrlw $3, %%xmm2		\n\t"
+	"pmulhw %%xmm5, %%xmm0		\n\t"
+	"pmulhw %%xmm6, %%xmm1		\n\t"
+	"pmulhw %%xmm7, %%xmm2		\n\t"
+	"paddusw %%xmm3, %%xmm0		\n\t"	/* Clamp to 31: adding 0xFFE0 with unsigned saturation turns anything above 31 into 0xFFFF...	*/
+	"paddusw %%xmm3, %%xmm1		\n\t"
+	"paddusw %%xmm3, %%xmm2		\n\t"
+	"psubw %%xmm3, %%xmm0		\n\t"	/* ...and subtracting 0xFFE0 again leaves min(value, 31).  FIXME: This line needs added to the original asm code	*/
+	"psubw %%xmm3, %%xmm1		\n\t"
+	"psubw %%xmm3, %%xmm2		\n\t"
+	"psllw $10, %%xmm0		\n\t"
+	"psllw $5, %%xmm1		\n\t"
+	"por %%xmm2, %%xmm0		\n\t"
+	"por %%xmm1, %%xmm0		\n\t"
+	"movdqa %%xmm0, (%%rsi, %%rcx, 2)\n\t"
+	"addq $8, %%rcx			\n\t"
+	"js 7b				\n\t"
+	"jmp 9f				\n\t"
+	"8:				\n\t"	/* One pixel at a time, saturating version							*/
+	"movw (%%rsi, %%rcx, 2), %%ax	\n\t"
+	"movd %%eax, %%xmm0		\n\t"
+	"movq %%xmm0, %%xmm1		\n\t"
+	"movq %%xmm0, %%xmm2		\n\t"
+	"psrlw $5, %%xmm1		\n\t"
+	"psrlw $10, %%xmm0		\n\t"
+	"psllw $11, %%xmm2		\n\t"
+	"psllw $11, %%xmm1		\n\t"
+	"psllw $8, %%xmm0		\n\t"
+	"psrlw $3, %%xmm1		\n\t"
+	"psrlw $3, %%xmm2		\n\t"
+	"pmulhw %%xmm5, %%xmm0		\n\t"
+	"pmulhw %%xmm6, %%xmm1		\n\t"
+	"pmulhw %%xmm7, %%xmm2		\n\t"
+	"paddusw %%xmm3, %%xmm0		\n\t"
+	"paddusw %%xmm3, %%xmm1		\n\t"
+	"paddusw %%xmm3, %%xmm2		\n\t"
+	"psubw %%xmm3, %%xmm0		\n\t"	/* FIXME: This line needs added to the original asm code					*/
+	"psubw %%xmm3, %%xmm1		\n\t"
+	"psubw %%xmm3, %%xmm2		\n\t"
+	"psllw $10, %%xmm0		\n\t"
+	"psllw $5, %%xmm1		\n\t"
+	"por %%xmm2, %%xmm0		\n\t"
+	"por %%xmm1, %%xmm0		\n\t"
+	"movd %%xmm0, %%eax		\n\t"
+	"movw %%ax, (%%rsi, %%rcx, 2)	\n\t"
+	"incq %%rcx			\n\t"
+	"9:				\n\t"
+	"cmpq $6, %%rcx			\n\t"
+	"jng 8b				\n\t"
+	"addq %[blue_mod], %%rsi	\n\t"	/* Blue_mod is the name of a register that now contains bytes_per_line.				*/
+	"decq %%rdx			\n\t"
+	"jnz 6b				\n\t"
+	"10:				\n\t"	/* This is the end.  Jump here if the width is zero.						*/
+	"emms				\n\t"	/* exit multi-media state (last asm instruction)						*/
+	: 					/* outputs: none										*/
+						/* inputs: (many operations cannot be performed with a mix of 32bit & 64bit operands directly)	*/
+						/*	(the compiler preloads the 32bit values into 64bit registers; those are used by register name)	*/
+						/*	NOTE(review): rsi, rbx, rdx, rax and the three *_mod registers are overwritten although they are input-only operands; GCC does not support that -- convert them to "+" operands.	*/
+	: [data]       "S" (data), 		/*   put the pointer data into the rsi register							*/
+	  [width]      "b" (w),			/*   put the width in the %rbx register	(cannot be referenced by name)				*/
+	  [height]     "d" (h),			/*   put the height in the %rdx register (cannot be referenced by name)				*/
+	  [red_mod]    "r" ((unsigned long)(rm)),/*  put the red_modifier   in a register (referenced by name)					*/
+	  [green_mod]  "r" ((unsigned long)(gm)),/*  put the green_modifier in a register (referenced by name)					*/
+	  [blue_mod]   "r" ((unsigned long)(bm)),/*  put the blue_modifier  in a register (referenced by name)	Later store the bytes_line here	*/
+	  [bytes_line] "a" (bpl)		/*   put the bytes_per_line in the %rax register (cannot be referenced by name)			*/
+	: "memory", "cc", "rcx", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7"	/* clobbers: pixel memory, flags, the rcx counter and every xmm register written above; without "rcx" a *_mod operand could be allocated to it */
+  );	/*  End of Assembly  */
+}
+
+
+void shade_ximage_16_sse2_A( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
+{	/* Shades a 16bpp (5-6-5) image in place, 8 pixels per movdqa; data and every row start (data + n * bpl) must be 16-byte aligned.	*/
+  __asm__ __volatile__ (	/* Each colour field becomes field * modifier / 256; the loops after label 5 additionally clamp red/blue to 31 and green to 63.	*/
+	".align 16                      \n\t"   /* SIMD instructions should be aligned on 16 byte (128 bit) boundaries for performance reasons.	*/
+	"leaq -14(%%rsi, %%rbx, 2), %%rsi\n\t"	/* rsi = data + width * 2 - 14, so that (rsi, rcx, 2) with rcx = 7 - width addresses pixel 0	*/
+	"negq %%rbx			\n\t"	/* Negate the width so that we can increment the counter					*/
+	"jz 10f				\n\t"	/* Jump to end if the width is zero (flags come from negq; h == 0 is NOT checked)		*/
+	"movd %[red_mod], %%xmm5	\n\t"	/* Load the color modifiers into xmm registers							*/
+	"movd %[green_mod], %%xmm6	\n\t"	/* " "												*/
+	"movd %[blue_mod], %%xmm7	\n\t"	/* " "												*/
+	"punpcklwd %%xmm5, %%xmm5	\n\t"	/* Unpack and Interleave low words.  From A64_128bit_Media_Programming (p. 380)			*/
+	"punpcklwd %%xmm6, %%xmm6	\n\t"	/* Duplicate the bottom 16 bits into the next 16 bits (both operands are the same)		*/
+	"punpcklwd %%xmm7, %%xmm7	\n\t"
+	"punpckldq %%xmm5, %%xmm5	\n\t"	/* Unpack and Interleave low double words.  From A64_128bit_Media_Programming (p. 376)		*/
+	"punpckldq %%xmm6, %%xmm6	\n\t"	/* Duplicate the bottom 32 bits into the next 32 bits (both operands are the same)		*/
+	"punpckldq %%xmm7, %%xmm7	\n\t"
+	"punpcklqdq %%xmm5, %%xmm5	\n\t"	/* Unpack and Interleave low quad words.  From A64_128bit_Media_Programming (p. 378)		*/
+	"punpcklqdq %%xmm6, %%xmm6	\n\t"	/* Duplicate the bottom 64 bits into the next 64 bits (both operands are the same)		*/
+	"punpcklqdq %%xmm7, %%xmm7	\n\t"
+	"or %[red_mod], %[green_mod]	\n\t"	/* This and the following or/sar/xor/cmp check whether all three color modifiers are		*/
+	"or %[blue_mod], %[green_mod]	\n\t"	/* less than 256.  A modifier >= 256 has the 9th, or a higher, bit set.				*/
+	"sar $8, %[green_mod]		\n\t"	/* Shift off eight bits, leaving something set if any modifier is >= 256.			*/
+	"movq %%rax, %[blue_mod]	\n\t"	/* Use the register named blue_mod to now store bytes_per_line.		*/
+	"xor %[red_mod], %[red_mod]	\n\t"	/* zero red so we don't have to load an immediate value for the following compare.		*/
+	"cmp %[red_mod], %[green_mod]	\n\t"	/* Compare the left over bits to zero								*/
+	"jg 5f				\n\t"	/* If one of the colors (might) need saturated then jump to the secondary set of loops.		*/
+	"1:				\n\t"	/* Start of the outer loop (lines).								*/
+	"movq %%rbx, %%rcx		\n\t"	/* Move the (negated) width into the count register						*/
+	"addq $7, %%rcx			\n\t"	/* rcx = 7 - width: negative while at least 8 pixels remain in the row				*/
+	"jns 3f				\n\t"	/* Fewer than 8 pixels in the row: go straight to the one-pixel loop				*/
+	"2:				\n\t"	/* Start of the inner loop (pixels 8 at a time --> 8 * 16 = 128bits/xmm register )		*/
+	"movdqa (%%rsi, %%rcx, 2), %%xmm0\n\t"	/* Aligned load of 8 pixels (16 bits each: 5 bits red, 6 bits green, 5 bits blue)		*/
+	"movdqa %%xmm0, %%xmm1		\n\t"	/* Create a copy of the pixel for the green color						*/
+	"movdqa %%xmm0, %%xmm2		\n\t"	/* Create a copy of the pixel for the blue color						*/
+	"psrlw $5, %%xmm1		\n\t"	/* Packed Shift Right Logical Words								*/
+						/* From A64_128bit_Media_Programming (p. 347)							*/
+						/* Shifts the blue off of the green color							*/
+	"psrlw $11, %%xmm0		\n\t"	/* Shifts the blue & green off of the red color							*/
+	"psllw $11, %%xmm2		\n\t"	/* Packed Shift Left Logical Words								*/
+						/* From A64_128bit_Media_Programming (p. 330)							*/
+						/* Shifts the red & green off of the blue color							*/
+	"psllw $10, %%xmm1		\n\t"	/* Shifts the red off of the green color							*/
+	"psllw $8, %%xmm0		\n\t"	/* Shifts the red color into position								*/
+	"psrlw $2, %%xmm1		\n\t"	/* Shifts the green color into position								*/
+	"psrlw $3, %%xmm2		\n\t"	/* Shifts the blue color into position								*/
+	"pmulhw %%xmm5, %%xmm0		\n\t"	/* color *= modifier (high word of (color << 8) * modifier, i.e. color * modifier / 256)	*/
+	"pmulhw %%xmm6, %%xmm1		\n\t"
+	"pmulhw %%xmm7, %%xmm2		\n\t"
+	"psllw $11, %%xmm0		\n\t"	/* Shift red back into its original position							*/
+	"psllw $5, %%xmm1		\n\t"	/* Shift green back into its original position							*/
+	"por %%xmm2, %%xmm0		\n\t"	/* Mesh the colors back together								*/
+	"por %%xmm1, %%xmm0		\n\t"
+	"movdqa %%xmm0, (%%rsi, %%rcx, 2)\n\t"	/* Place the shaded 8 pixels back into the image map						*/
+	"addq $8, %%rcx			\n\t"	/* Advance 8 pixels										*/
+	"js 2b				\n\t"	/* Still 8 or more pixels left in this row							*/
+	"jmp 4f				\n\t"	/* Finish the row with the one-pixel loop							*/
+	"3:				\n\t"	/* Deal with pixels one at a time here.								*/
+	"movw (%%rsi, %%rcx, 2), %%ax	\n\t"	/* Same arithmetic as loop 2, applied to a single pixel that goes through ax			*/
+	"movd %%eax, %%xmm0		\n\t"
+	"movq %%xmm0, %%xmm1		\n\t"
+	"movq %%xmm0, %%xmm2		\n\t"
+	"psrlw $5, %%xmm1		\n\t"
+	"psrlw $11, %%xmm0		\n\t"
+	"psllw $11, %%xmm2		\n\t"
+	"psllw $10, %%xmm1		\n\t"
+	"psllw $8, %%xmm0		\n\t"
+	"psrlw $2, %%xmm1		\n\t"
+	"psrlw $3, %%xmm2		\n\t"
+	"pmulhw %%xmm5, %%xmm0		\n\t"
+	"pmulhw %%xmm6, %%xmm1		\n\t"
+	"pmulhw %%xmm7, %%xmm2		\n\t"
+	"psllw $11, %%xmm0		\n\t"
+	"psllw $5, %%xmm1		\n\t"
+	"por %%xmm2, %%xmm0		\n\t"
+	"por %%xmm1, %%xmm0		\n\t"
+	"movd %%xmm0, %%eax		\n\t"
+	"movw %%ax, (%%rsi, %%rcx, 2)	\n\t"
+	"incq %%rcx			\n\t"
+	"4:				\n\t"
+	"cmpq $6, %%rcx			\n\t"	/* rcx == 6 addresses the last pixel of the row							*/
+	"jng 3b				\n\t"
+	"addq %[blue_mod], %%rsi	\n\t"	/* Blue_mod is the name of a register that now contains bytes_per_line.				*/
+	"decq %%rdx			\n\t"	/* One line done										*/
+	"jnz 1b				\n\t"
+	"jmp 10f			\n\t"	/* We're done!											*/
+
+	"5:				\n\t"	/*  Saturation is required									*/
+	"pcmpeqw %%xmm3, %%xmm3		\n\t"	/* Packed Compare Equal Words									*/
+						/* From A64_128bit_Media_Programming (p. 276)							*/
+						/* This sets xmm3 to 128 1's (a register always compares equal to itself)			*/
+	"movdqa %%xmm3, %%xmm4		\n\t"	/* Make copy of 128 ones									*/
+	"psllw $5, %%xmm3		\n\t"	/* xmm3 = 8 copies of 1111 1111 1110 0000 (clamp mask for the 5 bit fields)			*/
+	"psllw $6, %%xmm4		\n\t"	/* xmm4 = 8 copies of 1111 1111 1100 0000 (clamp mask for the 6 bit green field)		*/
+	"6:				\n\t"	/* Outer loop (lines), saturating version							*/
+	"movq %%rbx, %%rcx		\n\t"
+	"addq $7, %%rcx			\n\t"
+	"jns 8f				\n\t"
+	"7:				\n\t"	/* Inner loop, 8 pixels at a time, saturating version						*/
+	"movdqa (%%rsi, %%rcx, 2), %%xmm0\n\t"
+	"movdqa %%xmm0, %%xmm1		\n\t"
+	"movdqa %%xmm0, %%xmm2		\n\t"
+	"psrlw $5, %%xmm1		\n\t"
+	"psrlw $11, %%xmm0		\n\t"
+	"psllw $11, %%xmm2		\n\t"
+	"psllw $10, %%xmm1		\n\t"
+	"psllw $8, %%xmm0		\n\t"
+	"psrlw $2, %%xmm1		\n\t"
+	"psrlw $3, %%xmm2		\n\t"
+	"pmulhw %%xmm5, %%xmm0		\n\t"
+	"pmulhw %%xmm6, %%xmm1		\n\t"
+	"pmulhw %%xmm7, %%xmm2		\n\t"
+	"paddusw %%xmm3, %%xmm0		\n\t"	/* Saturating add of the mask; red needs no psubw because psllw $11 below discards the mask bits	*/
+	"paddusw %%xmm4, %%xmm1		\n\t"
+	"paddusw %%xmm3, %%xmm2		\n\t"
+	"psubw %%xmm4, %%xmm1		\n\t"	/* Subtracting the mask again leaves min(green, 63)						*/
+	"psubw %%xmm3, %%xmm2		\n\t"	/* Subtracting the mask again leaves min(blue, 31)						*/
+	"psllw $11, %%xmm0		\n\t"
+	"psllw $5, %%xmm1		\n\t"
+	"por %%xmm2, %%xmm0		\n\t"
+	"por %%xmm1, %%xmm0		\n\t"
+	"movdqa %%xmm0, (%%rsi, %%rcx, 2)\n\t"
+	"addq $8, %%rcx			\n\t"
+	"js 7b				\n\t"
+	"jmp 9f				\n\t"
+	"8:				\n\t"	/* One pixel at a time, saturating version							*/
+	"movw (%%rsi, %%rcx, 2), %%ax	\n\t"
+	"movd %%eax, %%xmm0		\n\t"
+	"movq %%xmm0, %%xmm1		\n\t"
+	"movq %%xmm0, %%xmm2		\n\t"
+	"psrlw $5, %%xmm1		\n\t"
+	"psrlw $11, %%xmm0		\n\t"
+	"psllw $11, %%xmm2		\n\t"
+	"psllw $10, %%xmm1		\n\t"
+	"psllw $8, %%xmm0		\n\t"
+	"psrlw $2, %%xmm1		\n\t"
+	"psrlw $3, %%xmm2		\n\t"
+	"		\n\t"
+	"pmulhw %%xmm5, %%xmm0		\n\t"
+	"pmulhw %%xmm6, %%xmm1		\n\t"
+	"pmulhw %%xmm7, %%xmm2		\n\t"
+	"		\n\t"
+	"paddusw %%xmm3, %%xmm0		\n\t"
+	"paddusw %%xmm4, %%xmm1		\n\t"
+	"paddusw %%xmm3, %%xmm2		\n\t"
+	"		\n\t"
+	"psubw %%xmm4, %%xmm1		\n\t"
+	"psubw %%xmm3, %%xmm2		\n\t"
+	"		\n\t"
+	"psllw $11, %%xmm0		\n\t"
+	"psllw $5, %%xmm1		\n\t"
+	"por %%xmm2, %%xmm0		\n\t"
+	"por %%xmm1, %%xmm0		\n\t"
+	"movd %%xmm0, %%eax		\n\t"
+	"movw %%ax, (%%rsi, %%rcx, 2)	\n\t"
+	"incq %%rcx			\n\t"
+	"9:				\n\t"
+	"cmpq $6, %%rcx			\n\t"
+	"jng 8b				\n\t"
+	"addq %[blue_mod], %%rsi	\n\t"	/* Blue_mod is the name of a register that now contains bytes_per_line.				*/
+	"decq %%rdx			\n\t"
+	"jnz 6b				\n\t"
+	"10:				\n\t"	/* This is the end.  Jump here if the width is zero.						*/
+	"emms				\n\t"	/* exit multi-media state (last asm instruction)						*/
+	: 					/* outputs: none										*/
+						/* inputs: (many operations cannot be performed with a mix of 32bit & 64bit operands directly)	*/
+						/*	(the compiler preloads the 32bit values into 64bit registers; those are used by register name)	*/
+						/*	NOTE(review): rsi, rbx, rdx, rax and the three *_mod registers are overwritten although they are input-only operands; GCC does not support that -- convert them to "+" operands.	*/
+	: [data]       "S" (data), 		/*   put the pointer data into the rsi register							*/
+	  [width]      "b" (w),			/*   put the width in the %rbx register	(cannot be referenced by name)				*/
+	  [height]     "d" (h),			/*   put the height in the %rdx register (cannot be referenced by name)				*/
+	  [red_mod]    "r" ((unsigned long)(rm)),/*  put the red_modifier   in a register (referenced by name)					*/
+	  [green_mod]  "r" ((unsigned long)(gm)),/*  put the green_modifier in a register (referenced by name)					*/
+	  [blue_mod]   "r" ((unsigned long)(bm)),/*  put the blue_modifier  in a register (referenced by name)	Later store the bytes_line here	*/
+	  [bytes_line] "a" (bpl)		/*   put the bytes_per_line in the %rax register (cannot be referenced by name)			*/
+	: "memory", "cc", "rcx", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"	/* clobbers: pixel memory, flags, the rcx counter and every xmm register written above; without "rcx" a *_mod operand could be allocated to it */
+  );	/*  End of Assembly  */
+}
+
 
 void shade_ximage_32_sse2( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
 {

Attachment: signature.asc
Description: This is a digitally signed message part

Reply via email to