Michael, I have completed the x86_64 mmx port of the 32bpp shader. I have included it for you to review. I have cut and pasted it into Eterm's pixmap.c and used it and it seems to work well. I'm not asking you to add it yet, just give me your thoughts. First I have to add some hooks to pixmap.c, adjust the Makefile.in, add the HAVE_MMX_64 macro, and a few other things before we can smash it in there.
What I have included is a standalone program (that contains the function in question) that will create two 5000x5000 pixel images, fill each of them with random colors (each image is identical), shade one with and one without MMX, compare the two for differences (and print the differences), then exit. On my AMD64 3500+ this takes just under a second for 25 million pixels (both were identical over many runs). I would like to discuss the best way to add this into Eterm so that the adventurous can test it but not plague the stable users. It would also be great if we could find an EM64T guinea pig to test this! Hopefully Mike Frysinger will add a ~AMD64 package to Gentoo soon after we finish so that it really gets tested. ;-) How go the configure.in modifications? Does it properly sense the correct processor like we were discussing last week? -- Tres
/* File: tst.c
 * Written & Copyright (c) 2005 by Tres Melton
 * Donated to the Eterm project.  http://www.Eterm.org
 * To be licensed under the GNU Public License v2.0
 *
 * Much inspiration was drawn from the original x86 MMX port written by
 * Willem Monsuwe <[EMAIL PROTECTED]> in pure x86/MMX Assembly
 *
 * Manuals used in this port:
 *   The Gnu Assembler
 *     http://www.gnu.org/software/binutils/manual/gas-2.9.1/html_mono/as.html
 *   AMD64 Architecture Programmer's Manual Volume 1: Application Programming
 *     http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24592.pdf
 *   AMD64 Architecture Programmer's Manual Volume 2: System Programming
 *     http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
 *   AMD64 Architecture Programmer's Manual Volume 3: General-Purpose and System Instructions
 *     http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24594.pdf
 *   AMD64 Architecture Programmer's Manual Volume 4: 128-Bit Media Instructions
 *     http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26568.pdf
 *   AMD64 Architecture Programmer's Manual Volume 5: 64-Bit Media and x87 Floating-Point Instructions
 *     http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26569.pdf
 *
 * The 32 bit color modification algorithm is simple but so optimized
 * (even the C version) that it is almost unreadable.
 * Therefore the pseudo code is:
 *
 *   for each color of each pixel
 *     new_color = color * modifier
 *     if ( new_color >= 256 )
 *       new_color = 255
 *     end if
 *   end for
 *
 */

/* The following includes are for the testing portion only (main). */
/* #include "tst.h" -- not needed */
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>

/*
 * Shade a 32 bits-per-pixel image in place using x86_64 MMX instructions.
 *
 *   data        first pixel of the first row; every pixel is 4 bytes
 *   bpl         bytes per line (distance in bytes between two row starts)
 *   w, h        width and height of the area to shade, in pixels
 *   rm, gm, bm  red, green and blue modifiers
 *
 * Bytes 2, 1 and 0 of each pixel are scaled by rm, gm and bm respectively
 * (roughly (color * modifier) >> 8) and saturated to 0..255.  The fourth
 * byte of every pixel is multiplied by zero, i.e. it is written back as 0.
 *
 * Nothing is done when data is NULL or when w or h is not positive.
 *
 * NOTE(review): for odd modifiers the MMX arithmetic appears to round to
 * nearest where the plain C reference in main() truncates; the self test
 * only exercises an even modifier (0x30).  Confirm before relying on
 * bit-exact agreement for arbitrary modifiers.
 * NOTE(review): pmulhw is a signed 16 bit multiply, so the modifiers are
 * presumably expected to stay well below 0x8000 -- verify against callers.
 */
void shade_ximage_32_mmx_64( void *data, int bpl, int w, int h, int rm, int gm, int bm )
{
    /*
     * The assembly does 64 bit pointer arithmetic.  Widen the 32 bit
     * arguments in C so the upper register halves are well defined.
     */
    unsigned char *pix = ( unsigned char * ) data;
    long long cols = w;
    long long rows = h;
    long long stride = bpl;
    long long col;                      /* scratch column counter, written by the asm */

    /*
     * A zero height would make the row counter wrap around and a negative
     * width would do the same to the column counter, so refuse both.
     */
    if( data == NULL || w <= 0 || h <= 0 )
        return;

    __asm__ __volatile__ (
        /* pix = end of the first row, cols = -width.  The inner loop then   */
        /* counts a negative index up towards zero: pix[ col * 4 ].          */
        "leaq (%[pix], %[cols], 4), %[pix]   \n\t"
        "negq %[cols]                        \n\t"

        /* Build the color modifier: mm4 = 0000 : rm : gm : bm (16 bit words) */
        "movd %[red_mod], %%mm4              \n\t"
        "psllq $16, %%mm4                    \n\t"
        "movd %[green_mod], %%mm5            \n\t"
        "por %%mm5, %%mm4                    \n\t"
        "psllq $16, %%mm4                    \n\t"
        "movd %[blue_mod], %%mm5             \n\t"
        "por %%mm5, %%mm4                    \n\t"

        /* mm6 = four words of 0x8000 (all ones shifted left by 15)          */
        "pcmpeqw %%mm6, %%mm6                \n\t"
        "psllw $15, %%mm6                    \n\t"

        /* mm5 = ( 0x8000 * modifier ) >> 16 per word (signed).  This is the */
        /* bias introduced by the sign flip below; it is subtracted again    */
        /* after the multiply.                                               */
        "movq %%mm6, %%mm5                   \n\t"
        "pmulhw %%mm4, %%mm5                 \n\t"

        "1:                                  \n\t"  /* outer loop: one row       */
        "movq %[cols], %[col]                \n\t"  /* restart at -width         */
        "2:                                  \n\t"  /* inner loop: one pixel     */
        "movd (%[pix], %[col], 4), %%mm1     \n\t"  /* load one 32 bit pixel     */
        "pxor %%mm0, %%mm0                   \n\t"  /* mm0 = 0                   */
        "punpcklbw %%mm1, %%mm0              \n\t"  /* each byte -> word << 8    */
        "pxor %%mm6, %%mm0                   \n\t"  /* flip sign bit of each word */
        "pmulhw %%mm4, %%mm0                 \n\t"  /* ( word * modifier ) >> 16 */
        "psubw %%mm5, %%mm0                  \n\t"  /* remove the sign flip bias */
        "packuswb %%mm0, %%mm0               \n\t"  /* saturate words to bytes   */
        "movd %%mm0, (%[pix], %[col], 4)     \n\t"  /* store the shaded pixel    */
        "incq %[col]                         \n\t"
        "jnz 2b                              \n\t"  /* more pixels in this row   */
        "addq %[stride], %[pix]              \n\t"  /* advance to the next row   */
        "decq %[rows]                        \n\t"
        "jnz 1b                              \n\t"  /* more rows left            */

        "emms                                \n\t"  /* leave MMX state           */

        : /* read-write operands: all three are modified by the loops */
          [pix]  "+r" ( pix ),
          [cols] "+r" ( cols ),
          [rows] "+r" ( rows ),
          /* early-clobber: written while [stride] is still needed */
          [col]  "=&r" ( col )
        : [stride]    "r" ( stride ),
          [red_mod]   "r" ( rm ),
          [green_mod] "r" ( gm ),
          [blue_mod]  "r" ( bm )
        : /* flags, the image memory and every MMX register touched above */
          "cc", "memory", "mm0", "mm1", "mm4", "mm5", "mm6"
    );
}

/* Plain C reference for one channel: ( color * modifier ) >> 8, forced to 255 when it does not fit in a byte. */
static unsigned char shade_channel_c( unsigned char color, int modifier )
{
    int scaled = ( color * modifier ) >> 8;

    return ( unsigned char ) ( ( scaled >> 8 ) != 0 ? 255 : scaled );
}

/*
 * Self test: fill two identical 5000x5000 images with random colors, shade
 * one with the MMX routine and the other with the C reference, then report
 * every pixel that differs.
 *
 * Returns EXIT_SUCCESS when both images match, EXIT_FAILURE on a mismatch
 * or when the images cannot be allocated.
 */
int main( void )
{
    const int width = 5000, height = 5000;
    const int rm = 0x30, gm = 0x30, bm = 0x30;
    const size_t pixel_count = ( size_t ) width * ( size_t ) height;
    unsigned int *mmx_image = malloc( pixel_count * sizeof *mmx_image );
    unsigned int *c_image = malloc( pixel_count * sizeof *c_image );
    int mismatches = 0;
    size_t i;

    if( mmx_image == NULL || c_image == NULL )
    {
        fprintf( stderr, "Unable to allocate two %dx%d pixel test images.\n", width, height );
        free( mmx_image );
        free( c_image );
        return EXIT_FAILURE;
    }

    /* Identical random colors in both images; the top byte stays zero. */
    srand( ( unsigned int ) time( NULL ) );
    for( i = 0; i < pixel_count; i++ )
        mmx_image[ i ] = c_image[ i ] = 0x00ffffff & ( unsigned int ) rand();

    shade_ximage_32_mmx_64( mmx_image, width * ( int ) sizeof *mmx_image, width, height, rm, gm, bm );

    /* Reference shading, one byte per channel: byte 2 = red, 1 = green, 0 = blue. */
    for( i = 0; i < pixel_count; i++ )
    {
        unsigned char *channel = ( unsigned char * ) &c_image[ i ];

        channel[ 2 ] = shade_channel_c( channel[ 2 ], rm );
        channel[ 1 ] = shade_channel_c( channel[ 1 ], gm );
        channel[ 0 ] = shade_channel_c( channel[ 0 ], bm );
    }

    for( i = 0; i < pixel_count; i++ )
    {
        if( mmx_image[ i ] == c_image[ i ] )
            continue;

        if( !mismatches )
        {
            mismatches++;
            printf( "\nData Check: (w) (h) (Assembly) (C-code)\n" );
        }
        printf( " Mismatch: index1: %-3d index2: %-3d data_ptr1: 0x%08x data_ptr2: 0x%08x\n",
                ( int ) ( i % ( size_t ) width ), ( int ) ( i / ( size_t ) width ),
                mmx_image[ i ], c_image[ i ] );
    }

    if( !mismatches )
        printf( "\nData Compared Identically. (%dx%d pixels).\n", width, height );

    free( mmx_image );
    free( c_image );
    return mismatches ? EXIT_FAILURE : EXIT_SUCCESS;
}