On Tue, 2006-01-03 at 15:49 -0700, Tres Melton wrote: > Michael, > Crap, Wrong damn tree. Try this instead of the last one please.
-- Tres Melton IRC & Gentoo: RiverRat
Index: eterm/Eterm/src/pixmap.c
===================================================================
RCS file: /cvsroot/enlightenment/eterm/Eterm/src/pixmap.c,v
retrieving revision 1.115
diff -u -b -B -u -r1.115 pixmap.c
--- eterm/Eterm/src/pixmap.c 22 Dec 2005 23:31:33 -0000 1.115
+++ eterm/Eterm/src/pixmap.c 3 Jan 2006 22:59:40 -0000
@@ -66,10 +66,42 @@
 extern void shade_ximage_32_mmx(void *data, int bpl, int w, int h, int rm, int gm, int bm);
 
 /* Assembler routines for 64 bit cpu with sse2 */
-extern void shade_ximage_15_sse2(void *data, int bpl, int w, int h, int rm, int gm, int bm);
-extern void shade_ximage_16_sse2(void *data, int bpl, int w, int h, int rm, int gm, int bm);
+#ifdef HAVE_SSE2
+extern void shade_ximage_15_sse2_A(void *data, int bpl, int w, int h, int rm, int gm, int bm );
+extern void shade_ximage_15_sse2_U(void *data, int bpl, int w, int h, int rm, int gm, int bm );
+extern void shade_ximage_16_sse2_A(void *data, int bpl, int w, int h, int rm, int gm, int bm );
+extern void shade_ximage_16_sse2_U(void *data, int bpl, int w, int h, int rm, int gm, int bm );
 extern void shade_ximage_32_sse2(void *data, int bpl, int w, int h, int rm, int gm, int bm);
 
+#define ETERM_ALIGNMENT 16
+
+/*
+ * Non-zero when either the buffer address or the bytes-per-line stride is not
+ * a multiple of ETERM_ALIGNMENT.  The two values are OR-ed (not AND-ed): the
+ * aligned (_A) routines use movdqa on every row, so a misaligned stride alone
+ * is enough to require the unaligned (_U) routines.
+ */
+#define ETERM_SSE2_UNALIGNED( data, bpl ) \
+    ((((unsigned long) ( data )) | ((unsigned long) ( bpl ))) & ((unsigned long) ( ETERM_ALIGNMENT - 1 )))
+
+/* Statement-like wrappers: do/while(0) keeps them safe inside if/else. */
+#define shade_ximage_15_sse2( data, bpl, w, h, rm, gm, bm ) \
+do { \
+    if (ETERM_SSE2_UNALIGNED((data), (bpl))) \
+        shade_ximage_15_sse2_U((data), (bpl), (w), (h), (rm), (gm), (bm)); \
+    else \
+        shade_ximage_15_sse2_A((data), (bpl), (w), (h), (rm), (gm), (bm)); \
+} while (0)
+
+#define shade_ximage_16_sse2( data, bpl, w, h, rm, gm, bm ) \
+do { \
+    if (ETERM_SSE2_UNALIGNED((data), (bpl))) \
+        shade_ximage_16_sse2_U((data), (bpl), (w), (h), (rm), (gm), (bm)); \
+    else \
+        shade_ximage_16_sse2_A((data), (bpl), (w), (h), (rm), (gm), (bm)); \
+} while (0)
+#endif
+
 #ifdef PIXMAP_SUPPORT
 static Imlib_Border bord_none = { 0, 0, 0, 0 };
 #endif
Index: eterm/Eterm/src/sse2_cmod.c
===================================================================
RCS file: /cvsroot/enlightenment/eterm/Eterm/src/sse2_cmod.c,v
retrieving revision 1.1
diff -u -b -B -u -r1.1 sse2_cmod.c
--- eterm/Eterm/src/sse2_cmod.c 14 Jun 2005 19:39:01 -0000 1.1
+++ eterm/Eterm/src/sse2_cmod.c 3 Jan 2006 22:59:41 -0000
@@ -94,7 +88,7 @@
 
 #ifdef HAVE_SSE2
 
-void shade_ximage_15_sse2( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
+void shade_ximage_15_sse2_U( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
 {
     __asm__ __volatile__ (
         ".align 16 \n\t" /* SIMD instructions should be aligned on 16 byte (128 bit) boundraries for performance reasons.*/
@@ -269,7 +263,7 @@
 }
 
 
-void shade_ximage_16_sse2( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
+void shade_ximage_16_sse2_U( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
 {
     __asm__ __volatile__ (
         ".align 16 \n\t" /* SIMD instructions should be aligned on 16 byte (128 bit) boundraries for performance reasons.*/
@@ -447,6 +441,359 @@
     ); /* End of Assembly */
 }
 
+void shade_ximage_15_sse2_A( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
+{
+    __asm__ __volatile__ (
+        ".align 16                        \n\t" /* SIMD instructions should be aligned on 16 byte (128 bit) boundaries for performance reasons. */
+        "leaq -14(%%rsi, %%rbx, 2), %%rsi \n\t" /* Point %rsi at data + ( width * bytes/pixel ) - 14 */
+        "negq %%rbx                       \n\t" /* Negate the width so that we can increment the counter */
+        "jz 10f                           \n\t" /* Jump to end if the width is zero */
+        "movd %[red_mod], %%xmm5          \n\t" /* Load the color modifiers into xmm registers */
+        "movd %[green_mod], %%xmm6        \n\t" /* " " */
+        "movd %[blue_mod], %%xmm7         \n\t" /* " " */
+        "punpcklwd %%xmm5, %%xmm5         \n\t" /* Unpack and Interleave low words. From A64_128bit_Media_Programming (p. 380) */
+        "punpcklwd %%xmm6, %%xmm6         \n\t" /* Duplicate the bottom 16 bits into the next 16 bits (both operands are the same) */
+        "punpcklwd %%xmm7, %%xmm7         \n\t"
+        "punpckldq %%xmm5, %%xmm5         \n\t" /* Unpack and Interleave low double words. From A64_128bit_Media_Programming (p. 376) */
+        "punpckldq %%xmm6, %%xmm6         \n\t" /* Duplicate the bottom 32 bits into the next 32 bits (both operands are the same) */
+        "punpckldq %%xmm7, %%xmm7         \n\t"
+        "punpcklqdq %%xmm5, %%xmm5        \n\t" /* Unpack and Interleave low quad words. From A64_128bit_Media_Programming (p. 378) */
+        "punpcklqdq %%xmm6, %%xmm6        \n\t" /* Duplicate the bottom 64 bits into the next 64 bits (both operands are the same) */
+        "punpcklqdq %%xmm7, %%xmm7        \n\t"
+        "or %[red_mod], %[green_mod]      \n\t" /* This, and the following 4 instructions, check to see if all three color modifiers are */
+        "or %[blue_mod], %[green_mod]     \n\t" /* less than 256. If any of the modifiers are >= 256 then they will have the 9th, or higher, */
+        "sar $8, %[green_mod]             \n\t" /* bit set. Then we shift off eight bits, leaving something set if a modifier >= 256. */
+        "movq %%rax, %[blue_mod]          \n\t" /* Use the register named blue_mod to now store bytes_per_line. */
+        "xor %[red_mod], %[red_mod]       \n\t" /* zero red so we don't have to load an immediate value for the following compare. */
+        "cmp %[red_mod], %[green_mod]     \n\t" /* Compare the left over bits to zero */
+        "jg 5f                            \n\t" /* If one of the colors (might) need saturated then jump to the secondary set of loops. */
+        "1:                               \n\t" /* Start of the outer loop (lines). */
+        "movq %%rbx, %%rcx                \n\t" /* Move the (negated) width into the count register */
+        "addq $7, %%rcx                   \n\t"
+        "jns 3f                           \n\t"
+        "2:                               \n\t" /* Start of the inner loop (pixels 8 at a time --> 8 * 16 = 128bits/xmm register ) */
+        "movdqa (%%rsi, %%rcx, 2), %%xmm0 \n\t" /* Load 8 pixels, 16 bits each (5 bits for red, 5 bits for green, 5 bits for blue); address must be 16-byte aligned */
+        "movdqa %%xmm0, %%xmm1            \n\t" /* Create a copy of the pixel for the green color */
+        "movdqa %%xmm0, %%xmm2            \n\t" /* Create a copy of the pixel for the blue color */
+        "psrlw $5, %%xmm1                 \n\t" /* Packed Shift Right Logical Words */
+                                                /* From A64_128bit_Media_Programming (p. 347) */
+                                                /* Shifts the blue off of the green color */
+        "psrlw $10, %%xmm0                \n\t" /* Shifts the blue & green off of the red color */
+        "psllw $11, %%xmm2                \n\t" /* Packed Shift Left Logical Words */
+                                                /* From A64_128bit_Media_Programming (p. 330) */
+                                                /* Shifts the red & green off of the blue color */
+        "psllw $11, %%xmm1                \n\t" /* Shifts the red off of the green color */
+        "psllw $8, %%xmm0                 \n\t" /* Shifts the red color into position */
+        "psrlw $3, %%xmm1                 \n\t" /* Shifts the green color into position */
+        "psrlw $3, %%xmm2                 \n\t" /* Shifts the blue color into position */
+        "pmulhw %%xmm5, %%xmm0            \n\t" /* color *= modifier */
+        "pmulhw %%xmm6, %%xmm1            \n\t"
+        "pmulhw %%xmm7, %%xmm2            \n\t"
+        "psllw $10, %%xmm0                \n\t" /* Shift red back into its original position */
+        "psllw $5, %%xmm1                 \n\t" /* Shift green back into its original position */
+        "por %%xmm2, %%xmm0               \n\t" /* Mesh the colors back together */
+        "por %%xmm1, %%xmm0               \n\t"
+        "movdqa %%xmm0, (%%rsi, %%rcx, 2) \n\t" /* Place the shaded 8 pixels back into the image map */
+        "addq $8, %%rcx                   \n\t"
+        "js 2b                            \n\t"
+        "jmp 4f                           \n\t"
+        "3:                               \n\t" /* Deal with pixels one at a time here. */
+        "movw (%%rsi, %%rcx, 2), %%ax     \n\t"
+        "movd %%eax, %%xmm0               \n\t"
+        "movq %%xmm0, %%xmm1              \n\t"
+        "movq %%xmm0, %%xmm2              \n\t"
+        "psrlw $5, %%xmm1                 \n\t"
+        "psrlw $10, %%xmm0                \n\t"
+        "psllw $11, %%xmm2                \n\t"
+        "psllw $11, %%xmm1                \n\t"
+        "psllw $8, %%xmm0                 \n\t"
+        "psrlw $3, %%xmm1                 \n\t"
+        "psrlw $3, %%xmm2                 \n\t"
+        "pmulhw %%xmm5, %%xmm0            \n\t"
+        "pmulhw %%xmm6, %%xmm1            \n\t"
+        "pmulhw %%xmm7, %%xmm2            \n\t"
+        "psllw $10, %%xmm0                \n\t"
+        "psllw $5, %%xmm1                 \n\t"
+        "por %%xmm2, %%xmm0               \n\t"
+        "por %%xmm1, %%xmm0               \n\t"
+        "movd %%xmm0, %%eax               \n\t"
+        "movw %%ax, (%%rsi, %%rcx, 2)     \n\t"
+        "incq %%rcx                       \n\t"
+        "4:                               \n\t"
+        "cmpq $6, %%rcx                   \n\t"
+        "jng 3b                           \n\t"
+        "addq %[blue_mod], %%rsi          \n\t" /* Blue_mod is the name of a register that now contains bytes_per_line. */
+        "decq %%rdx                       \n\t"
+        "jnz 1b                           \n\t"
+        "jmp 10f                          \n\t" /* We're done! */
+
+        "5:                               \n\t" /* Saturation is required */
+        "pcmpeqw %%xmm3, %%xmm3           \n\t" /* Packed Compare Equal Words */
+                                                /* From A64_128bit_Media_Programming (p. 276) */
+                                                /* This sets xmm3 to 128 1's (since xmm3 == xmm3) */
+        "psllw $5, %%xmm3                 \n\t" /* xmm3 = 8 copies of 1111 1111 1110 0000 */
+        "6:                               \n\t"
+        "movq %%rbx, %%rcx                \n\t"
+        "addq $7, %%rcx                   \n\t"
+        "jns 8f                           \n\t"
+        "7:                               \n\t"
+        "movdqa (%%rsi, %%rcx, 2), %%xmm0 \n\t"
+        "movdqa %%xmm0, %%xmm1            \n\t"
+        "movdqa %%xmm0, %%xmm2            \n\t"
+        "psrlw $5, %%xmm1                 \n\t"
+        "psrlw $10, %%xmm0                \n\t"
+        "psllw $11, %%xmm2                \n\t"
+        "psllw $11, %%xmm1                \n\t"
+        "psllw $8, %%xmm0                 \n\t"
+        "psrlw $3, %%xmm1                 \n\t"
+        "psrlw $3, %%xmm2                 \n\t"
+        "pmulhw %%xmm5, %%xmm0            \n\t"
+        "pmulhw %%xmm6, %%xmm1            \n\t"
+        "pmulhw %%xmm7, %%xmm2            \n\t"
+        "paddusw %%xmm3, %%xmm0           \n\t"
+        "paddusw %%xmm3, %%xmm1           \n\t"
+        "paddusw %%xmm3, %%xmm2           \n\t"
+        "psubw %%xmm3, %%xmm0             \n\t" /* FIXME: This line needs added to the original asm code */
+        "psubw %%xmm3, %%xmm1             \n\t"
+        "psubw %%xmm3, %%xmm2             \n\t"
+        "psllw $10, %%xmm0                \n\t"
+        "psllw $5, %%xmm1                 \n\t"
+        "por %%xmm2, %%xmm0               \n\t"
+        "por %%xmm1, %%xmm0               \n\t"
+        "movdqa %%xmm0, (%%rsi, %%rcx, 2) \n\t"
+        "addq $8, %%rcx                   \n\t"
+        "js 7b                            \n\t"
+        "jmp 9f                           \n\t"
+        "8:                               \n\t"
+        "movw (%%rsi, %%rcx, 2), %%ax     \n\t"
+        "movd %%eax, %%xmm0               \n\t"
+        "movq %%xmm0, %%xmm1              \n\t"
+        "movq %%xmm0, %%xmm2              \n\t"
+        "psrlw $5, %%xmm1                 \n\t"
+        "psrlw $10, %%xmm0                \n\t"
+        "psllw $11, %%xmm2                \n\t"
+        "psllw $11, %%xmm1                \n\t"
+        "psllw $8, %%xmm0                 \n\t"
+        "psrlw $3, %%xmm1                 \n\t"
+        "psrlw $3, %%xmm2                 \n\t"
+        "pmulhw %%xmm5, %%xmm0            \n\t"
+        "pmulhw %%xmm6, %%xmm1            \n\t"
+        "pmulhw %%xmm7, %%xmm2            \n\t"
+        "paddusw %%xmm3, %%xmm0           \n\t"
+        "paddusw %%xmm3, %%xmm1           \n\t"
+        "paddusw %%xmm3, %%xmm2           \n\t"
+        "psubw %%xmm3, %%xmm0             \n\t" /* FIXME: This line needs added to the original asm code */
+        "psubw %%xmm3, %%xmm1             \n\t"
+        "psubw %%xmm3, %%xmm2             \n\t"
+        "psllw $10, %%xmm0                \n\t"
+        "psllw $5, %%xmm1                 \n\t"
+        "por %%xmm2, %%xmm0               \n\t"
+        "por %%xmm1, %%xmm0               \n\t"
+        "movd %%xmm0, %%eax               \n\t"
+        "movw %%ax, (%%rsi, %%rcx, 2)     \n\t"
+        "incq %%rcx                       \n\t"
+        "9:                               \n\t"
+        "cmpq $6, %%rcx                   \n\t"
+        "jng 8b                           \n\t"
+        "addq %[blue_mod], %%rsi          \n\t" /* Blue_mod is the name of a register that now contains bytes_per_line. */
+        "decq %%rdx                       \n\t"
+        "jnz 6b                           \n\t"
+        "10:                              \n\t" /* This is the end. Jump here if the width is zero. */
+        "emms                             \n\t" /* exit multi-media state (last asm instruction) */
+    : /* outputs: none */
+      /* inputs: (many operations cannot be performed with a mix of 32bit & 64bit operands directly) */
+      /*         (so the 32bit ints are widened to 64bit here, before they are loaded into their registers) */
+      /*         (that is why certain variables cannot be referenced by name -- use their register) */
+    : [data] "S" (data),                          /* put the pointer data into the rsi register */
+      [width] "b" ((long)(w)),                    /* put the width in the %rbx register (cannot be referenced by name) */
+      [height] "d" ((long)(h)),                   /* put the height in the %rdx register (cannot be referenced by name) */
+      [red_mod] "r" ((unsigned long)(rm)),        /* put the red_modifier in a register (referenced by name) */
+      [green_mod] "r" ((unsigned long)(gm)),      /* put the green_modifier in a register (referenced by name) */
+      [blue_mod] "r" ((unsigned long)(bm)),       /* put the blue_modifier in a register (referenced by name) Later store the bytes_line here */
+      [bytes_line] "a" ((long)(bpl))              /* put the bytes_per_line in the %rax register (cannot be referenced by name) */
+    : "memory", "cc", "rcx", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7" /* clobbers: scratch registers must be listed ("memory" does not cover them). NOTE(review): the input registers are also modified; nothing may use them after the asm */
+    ); /* End of Assembly */
+}
+
+
+void shade_ximage_16_sse2_A( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
+{
+    __asm__ __volatile__ (
+        ".align 16                        \n\t" /* SIMD instructions should be aligned on 16 byte (128 bit) boundaries for performance reasons. */
+        "leaq -14(%%rsi, %%rbx, 2), %%rsi \n\t" /* Point %rsi at data + ( width * bytes/pixel ) - 14 */
+        "negq %%rbx                       \n\t" /* Negate the width so that we can increment the counter */
+        "jz 10f                           \n\t" /* Jump to end if the width is zero */
+        "movd %[red_mod], %%xmm5          \n\t" /* Load the color modifiers into xmm registers */
+        "movd %[green_mod], %%xmm6        \n\t" /* " " */
+        "movd %[blue_mod], %%xmm7         \n\t" /* " " */
+        "punpcklwd %%xmm5, %%xmm5         \n\t" /* Unpack and Interleave low words. From A64_128bit_Media_Programming (p. 380) */
+        "punpcklwd %%xmm6, %%xmm6         \n\t" /* Duplicate the bottom 16 bits into the next 16 bits (both operands are the same) */
+        "punpcklwd %%xmm7, %%xmm7         \n\t"
+        "punpckldq %%xmm5, %%xmm5         \n\t" /* Unpack and Interleave low double words. From A64_128bit_Media_Programming (p. 376) */
+        "punpckldq %%xmm6, %%xmm6         \n\t" /* Duplicate the bottom 32 bits into the next 32 bits (both operands are the same) */
+        "punpckldq %%xmm7, %%xmm7         \n\t"
+        "punpcklqdq %%xmm5, %%xmm5        \n\t" /* Unpack and Interleave low quad words. From A64_128bit_Media_Programming (p. 378) */
+        "punpcklqdq %%xmm6, %%xmm6        \n\t" /* Duplicate the bottom 64 bits into the next 64 bits (both operands are the same) */
+        "punpcklqdq %%xmm7, %%xmm7        \n\t"
+        "or %[red_mod], %[green_mod]      \n\t" /* This, and the following 4 instructions, check to see if all three color modifiers are */
+        "or %[blue_mod], %[green_mod]     \n\t" /* less than 256. If any of the modifiers are >= 256 then they will have the 9th, or higher, */
+        "sar $8, %[green_mod]             \n\t" /* bit set. Then we shift off eight bits, leaving something set if a modifier >= 256. */
+        "movq %%rax, %[blue_mod]          \n\t" /* Use the register named blue_mod to now store bytes_per_line. */
+        "xor %[red_mod], %[red_mod]       \n\t" /* zero red so we don't have to load an immediate value for the following compare. */
+        "cmp %[red_mod], %[green_mod]     \n\t" /* Compare the left over bits to zero */
+        "jg 5f                            \n\t" /* If one of the colors (might) need saturated then jump to the secondary set of loops. */
+        "1:                               \n\t" /* Start of the outer loop (lines). */
+        "movq %%rbx, %%rcx                \n\t" /* Move the (negated) width into the count register */
+        "addq $7, %%rcx                   \n\t"
+        "jns 3f                           \n\t"
+        "2:                               \n\t" /* Start of the inner loop (pixels 8 at a time --> 8 * 16 = 128bits/xmm register ) */
+        "movdqa (%%rsi, %%rcx, 2), %%xmm0 \n\t" /* Load 8 pixels, 16 bits each (5 bits for red, 6 bits for green, 5 bits for blue); address must be 16-byte aligned */
+        "movdqa %%xmm0, %%xmm1            \n\t" /* Create a copy of the pixel for the green color */
+        "movdqa %%xmm0, %%xmm2            \n\t" /* Create a copy of the pixel for the blue color */
+        "psrlw $5, %%xmm1                 \n\t" /* Packed Shift Right Logical Words */
+                                                /* From A64_128bit_Media_Programming (p. 347) */
+                                                /* Shifts the blue off of the green color */
+        "psrlw $11, %%xmm0                \n\t" /* Shifts the blue & green off of the red color */
+        "psllw $11, %%xmm2                \n\t" /* Packed Shift Left Logical Words */
+                                                /* From A64_128bit_Media_Programming (p. 330) */
+                                                /* Shifts the red & green off of the blue color */
+        "psllw $10, %%xmm1                \n\t" /* Shifts the red off of the green color */
+        "psllw $8, %%xmm0                 \n\t" /* Shifts the red color into position */
+        "psrlw $2, %%xmm1                 \n\t" /* Shifts the green color into position */
+        "psrlw $3, %%xmm2                 \n\t" /* Shifts the blue color into position */
+        "pmulhw %%xmm5, %%xmm0            \n\t" /* color *= modifier */
+        "pmulhw %%xmm6, %%xmm1            \n\t"
+        "pmulhw %%xmm7, %%xmm2            \n\t"
+        "psllw $11, %%xmm0                \n\t" /* Shift red back into its original position */
+        "psllw $5, %%xmm1                 \n\t" /* Shift green back into its original position */
+        "por %%xmm2, %%xmm0               \n\t" /* Mesh the colors back together */
+        "por %%xmm1, %%xmm0               \n\t"
+        "movdqa %%xmm0, (%%rsi, %%rcx, 2) \n\t" /* Place the shaded 8 pixels back into the image map */
+        "addq $8, %%rcx                   \n\t"
+        "js 2b                            \n\t"
+        "jmp 4f                           \n\t"
+        "3:                               \n\t" /* Deal with pixels one at a time here. */
+        "movw (%%rsi, %%rcx, 2), %%ax     \n\t"
+        "movd %%eax, %%xmm0               \n\t"
+        "movq %%xmm0, %%xmm1              \n\t"
+        "movq %%xmm0, %%xmm2              \n\t"
+        "psrlw $5, %%xmm1                 \n\t"
+        "psrlw $11, %%xmm0                \n\t"
+        "psllw $11, %%xmm2                \n\t"
+        "psllw $10, %%xmm1                \n\t"
+        "psllw $8, %%xmm0                 \n\t"
+        "psrlw $2, %%xmm1                 \n\t"
+        "psrlw $3, %%xmm2                 \n\t"
+        "pmulhw %%xmm5, %%xmm0            \n\t"
+        "pmulhw %%xmm6, %%xmm1            \n\t"
+        "pmulhw %%xmm7, %%xmm2            \n\t"
+        "psllw $11, %%xmm0                \n\t"
+        "psllw $5, %%xmm1                 \n\t"
+        "por %%xmm2, %%xmm0               \n\t"
+        "por %%xmm1, %%xmm0               \n\t"
+        "movd %%xmm0, %%eax               \n\t"
+        "movw %%ax, (%%rsi, %%rcx, 2)     \n\t"
+        "incq %%rcx                       \n\t"
+        "4:                               \n\t"
+        "cmpq $6, %%rcx                   \n\t"
+        "jng 3b                           \n\t"
+        "addq %[blue_mod], %%rsi          \n\t" /* Blue_mod is the name of a register that now contains bytes_per_line. */
+        "decq %%rdx                       \n\t"
+        "jnz 1b                           \n\t"
+        "jmp 10f                          \n\t" /* We're done! */
+
+        "5:                               \n\t" /* Saturation is required */
+        "pcmpeqw %%xmm3, %%xmm3           \n\t" /* Packed Compare Equal Words */
+                                                /* From A64_128bit_Media_Programming (p. 276) */
+                                                /* This sets xmm3 to 128 1's (since xmm3 == xmm3) */
+        "movdqa %%xmm3, %%xmm4            \n\t" /* Make copy of 128 ones */
+        "psllw $5, %%xmm3                 \n\t" /* xmm3 = 8 copies of 1111 1111 1110 0000 */
+        "psllw $6, %%xmm4                 \n\t" /* xmm4 = 8 copies of 1111 1111 1100 0000 */
+        "6:                               \n\t"
+        "movq %%rbx, %%rcx                \n\t"
+        "addq $7, %%rcx                   \n\t"
+        "jns 8f                           \n\t"
+        "7:                               \n\t"
+        "movdqa (%%rsi, %%rcx, 2), %%xmm0 \n\t"
+        "movdqa %%xmm0, %%xmm1            \n\t"
+        "movdqa %%xmm0, %%xmm2            \n\t"
+        "psrlw $5, %%xmm1                 \n\t"
+        "psrlw $11, %%xmm0                \n\t"
+        "psllw $11, %%xmm2                \n\t"
+        "psllw $10, %%xmm1                \n\t"
+        "psllw $8, %%xmm0                 \n\t"
+        "psrlw $2, %%xmm1                 \n\t"
+        "psrlw $3, %%xmm2                 \n\t"
+        "pmulhw %%xmm5, %%xmm0            \n\t"
+        "pmulhw %%xmm6, %%xmm1            \n\t"
+        "pmulhw %%xmm7, %%xmm2            \n\t"
+        "paddusw %%xmm3, %%xmm0           \n\t"
+        "paddusw %%xmm4, %%xmm1           \n\t"
+        "paddusw %%xmm3, %%xmm2           \n\t"
+        "psubw %%xmm4, %%xmm1             \n\t"
+        "psubw %%xmm3, %%xmm2             \n\t"
+        "psllw $11, %%xmm0                \n\t"
+        "psllw $5, %%xmm1                 \n\t"
+        "por %%xmm2, %%xmm0               \n\t"
+        "por %%xmm1, %%xmm0               \n\t"
+        "movdqa %%xmm0, (%%rsi, %%rcx, 2) \n\t"
+        "addq $8, %%rcx                   \n\t"
+        "js 7b                            \n\t"
+        "jmp 9f                           \n\t"
+        "8:                               \n\t"
+        "movw (%%rsi, %%rcx, 2), %%ax     \n\t"
+        "movd %%eax, %%xmm0               \n\t"
+        "movq %%xmm0, %%xmm1              \n\t"
+        "movq %%xmm0, %%xmm2              \n\t"
+        "psrlw $5, %%xmm1                 \n\t"
+        "psrlw $11, %%xmm0                \n\t"
+        "psllw $11, %%xmm2                \n\t"
+        "psllw $10, %%xmm1                \n\t"
+        "psllw $8, %%xmm0                 \n\t"
+        "psrlw $2, %%xmm1                 \n\t"
+        "psrlw $3, %%xmm2                 \n\t"
+        "                                 \n\t"
+        "pmulhw %%xmm5, %%xmm0            \n\t"
+        "pmulhw %%xmm6, %%xmm1            \n\t"
+        "pmulhw %%xmm7, %%xmm2            \n\t"
+        "                                 \n\t"
+        "paddusw %%xmm3, %%xmm0           \n\t"
+        "paddusw %%xmm4, %%xmm1           \n\t"
+        "paddusw %%xmm3, %%xmm2           \n\t"
+        "                                 \n\t"
+        "psubw %%xmm4, %%xmm1             \n\t"
+        "psubw %%xmm3, %%xmm2             \n\t"
+        "                                 \n\t"
+        "psllw $11, %%xmm0                \n\t"
+        "psllw $5, %%xmm1                 \n\t"
+        "por %%xmm2, %%xmm0               \n\t"
+        "por %%xmm1, %%xmm0               \n\t"
+        "movd %%xmm0, %%eax               \n\t"
+        "movw %%ax, (%%rsi, %%rcx, 2)     \n\t"
+        "incq %%rcx                       \n\t"
+        "9:                               \n\t"
+        "cmpq $6, %%rcx                   \n\t"
+        "jng 8b                           \n\t"
+        "addq %[blue_mod], %%rsi          \n\t" /* Blue_mod is the name of a register that now contains bytes_per_line. */
+        "decq %%rdx                       \n\t"
+        "jnz 6b                           \n\t"
+        "10:                              \n\t" /* This is the end. Jump here if the width is zero. */
+        "emms                             \n\t" /* exit multi-media state (last asm instruction) */
+    : /* outputs: none */
+      /* inputs: (many operations cannot be performed with a mix of 32bit & 64bit operands directly) */
+      /*         (so the 32bit ints are widened to 64bit here, before they are loaded into their registers) */
+      /*         (that is why certain variables cannot be referenced by name -- use their register) */
+    : [data] "S" (data),                          /* put the pointer data into the rsi register */
+      [width] "b" ((long)(w)),                    /* put the width in the %rbx register (cannot be referenced by name) */
+      [height] "d" ((long)(h)),                   /* put the height in the %rdx register (cannot be referenced by name) */
+      [red_mod] "r" ((unsigned long)(rm)),        /* put the red_modifier in a register (referenced by name) */
+      [green_mod] "r" ((unsigned long)(gm)),      /* put the green_modifier in a register (referenced by name) */
+      [blue_mod] "r" ((unsigned long)(bm)),       /* put the blue_modifier in a register (referenced by name) Later store the bytes_line here */
+      [bytes_line] "a" ((long)(bpl))              /* put the bytes_per_line in the %rax register (cannot be referenced by name) */
+    : "memory", "cc", "rcx", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" /* clobbers: scratch registers must be listed ("memory" does not cover them). NOTE(review): the input registers are also modified; nothing may use them after the asm */
+    ); /* End of Assembly */
+}
+
 
 void shade_ximage_32_sse2( volatile void *data, volatile int bpl, volatile int w, volatile int h, volatile int rm, volatile int gm, volatile int bm )
 {
signature.asc
Description: This is a digitally signed message part