Michael,

        I have completed the x86_64 mmx port of the 32bpp shader.  I have
included it for you to review.  I have cut and pasted it into Eterm's
pixmap.c and used it and it seems to work well.  I'm not asking you to
add it yet, just give me your thoughts.  First I have to add some hooks
to pixmap.c, adjust the Makefile.in, add the HAVE_MMX_64 macro, and a
few other things before we can smash it in there.  

        What I have included is a standalone program (that contains the
function in question) that will create two 5000x5000 pixel images, fill
each of them with random colors (each image is identical), shade one
with and one without mmx, compare the two for differences (and print the
differences), then exit.  On my AMD64 3500+ this takes just under a
second for 25 million pixels (both were identical over many runs).

        I would like to discuss the best way to add this into Eterm so that the
adventurous can test it but not plague the stable users.  It would also
be great if we could find an EM64T guinea pig to test this!  Hopefully
Mike Frysinger will add a ~AMD64 package to Gentoo soon after we finish
so that it really gets tested.  ;-)

        How are the configure.in modifications going?  Does configure properly
sense the correct processor, as we were discussing last week?

-- 
Tres
/*  File:  tst.c
 *  Written & Copyright (c) 2005 by Tres Melton
 *  Donated to the Eterm project.  http://www.Eterm.org
 *  To be licensed under the GNU Public License v2.0
 *
 *  Much inspiration was drawn from the original x86 MMX port written by
 *    Willem Monsuwe <[EMAIL PROTECTED]> in pure x86/MMX Assembly
 *
 *  Manuals used in this port:
 *      The GNU Assembler
 *              http://www.gnu.org/software/binutils/manual/gas-2.9.1/html_mono/as.html
 *      AMD64 Architecture Programmer's Manual Volume 1: Application Programming
 *              http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24592.pdf
 *      AMD64 Architecture Programmer's Manual Volume 2: System Programming
 *              http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
 *      AMD64 Architecture Programmer's Manual Volume 3: General-Purpose and System Instructions
 *              http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24594.pdf
 *      AMD64 Architecture Programmer's Manual Volume 4: 128-Bit Media Instructions
 *              http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26568.pdf
 *      AMD64 Architecture Programmer's Manual Volume 5: 64-Bit Media and x87 Floating-Point Instructions
 *              http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26569.pdf
 *
 *  The 32 bit color modification algorithm is simple but so optimized 
 *    (even the C version) that it is almost unreadable.  
 *    Therefore the pseudo code is:
 *
 *      for each color of each pixel
 *              new_color = ( color * modifier ) >> 8
 *              if ( new_color > 255 )
 *                      new_color = 255
 *              end if
 *      end for
 *
 */

//  The following includes are for the testing portion only (main)
// #include "tst.h"     // Not Needed
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>

/*
 * shade_ximage_32_mmx_64 -- shade a 32 bits-per-pixel image in place using
 * x86-64 MMX instructions.
 *
 *   data        pointer to the first pixel of the first line
 *   bpl         bytes per line (distance in bytes from one line to the next)
 *   w, h        width and height of the area to shade, in pixels
 *   rm, gm, bm  red / green / blue modifiers; each colour byte becomes
 *               min( 255, ( colour * modifier ) >> 8 ), so 256 means
 *               "unchanged".  Modifiers are multiplied as signed 16 bit
 *               words, so they must lie in 0..32767.  The result is exact
 *               for even modifiers.
 *
 * Each pixel is handled as four bytes in memory order: byte 0 is scaled by
 * bm, byte 1 by gm, byte 2 by rm, and byte 3 is always written back as 0.
 *
 * Nothing is done (and no MMX instruction is executed) when data is null or
 * when w or h is not positive.
 */
void shade_ximage_32_mmx_64( void *data, int bpl, int w, int h, int rm, int gm, int bm )
{
  /* The assembly does 64 bit arithmetic on these values, so widen the int   */
  /* arguments to long here; the compiler then sign extends them properly.   */
  unsigned char *line   = data;   /* becomes "one past the end of the current line" */
  long           width  = w;      /* negated inside the asm                         */
  long           rows   = h;      /* counted down to zero inside the asm            */
  long           stride = bpl;
  long           col;             /* scratch: negative column index                 */

  /* A zero row count would make the decq/jnz pair below wrap around and run */
  /* far past the end of the image, so reject empty areas up front.          */
  if( !data || w <= 0 || h <= 0 )
    return;

  __asm__ __volatile__ (
        /* AT&T operand order is disp(base, index, scale).                   */
        "leaq (%[line], %[width], 4), %[line]  \n\t"   /* line += width * 4 (end of first line, 4 bytes per pixel)  */
        "negq %[width]                         \n\t"   /* width = -width: columns count up from -w to 0             */

        /* Build the colour modifier vector mm4 = 0000 : rm : gm : bm        */
        "movd %[red_mod], %%mm4                \n\t"   /* mm4 = rm (zero extended to 64 bits)                       */
        "psllq $16, %%mm4                      \n\t"   /* make room for the next 16 bit modifier                    */
        "movd %[green_mod], %%mm5              \n\t"
        "por %%mm5, %%mm4                      \n\t"   /* mm4 = rm : gm                                             */
        "psllq $16, %%mm4                      \n\t"
        "movd %[blue_mod], %%mm5               \n\t"
        "por %%mm5, %%mm4                      \n\t"   /* mm4 = rm : gm : bm                                        */

        /* mm6 = four words of 0x8000 (sign bit mask)                        */
        "pcmpeqw %%mm6, %%mm6                  \n\t"   /* mm6 == mm6 is always true, so mm6 = all ones              */
        "psllw $15, %%mm6                      \n\t"   /* each 16 bit word = 0x8000                                 */

        /* mm5 = ( -32768 * cm ) >> 16 = -( cm / 2 ), the bias correction    */
        "movq %%mm6, %%mm5                     \n\t"
        "pmulhw %%mm4, %%mm5                   \n\t"   /* signed multiply, keep the high 16 bits                    */

        "1:                                    \n\t"   /* outer loop: one iteration per line                        */
        "movq %[width], %[col]                 \n\t"   /* col = -w                                                  */
        "2:                                    \n\t"   /* inner loop: one iteration per pixel                       */
        "movd (%[line], %[col], 4), %%mm1      \n\t"   /* load exactly one 32 bit pixel                             */
        "pxor %%mm0, %%mm0                     \n\t"   /* mm0 = 0                                                   */
        "punpcklbw %%mm1, %%mm0                \n\t"   /* interleave: each word = colour byte << 8                  */
        "pxor %%mm6, %%mm0                     \n\t"   /* flip the sign bit: word = ( colour - 128 ) * 256, signed  */
        "pmulhw %%mm4, %%mm0                   \n\t"   /* word = ( ( colour - 128 ) * 256 * cm ) >> 16              */
        "psubw %%mm5, %%mm0                    \n\t"   /* add cm / 2 back: word = ( colour * cm ) >> 8              */
        "packuswb %%mm0, %%mm0                 \n\t"   /* saturate each signed word into an unsigned byte (0..255)  */
        "movd %%mm0, (%[line], %[col], 4)      \n\t"   /* store the shaded pixel                                    */
        "incq %[col]                           \n\t"   /* next column; reaches zero at the end of the line          */
        "jnz 2b                                \n\t"
        "addq %[stride], %[line]               \n\t"   /* advance to the end of the next line                       */
        "decq %[rows]                          \n\t"
        "jnz 1b                                \n\t"
        "emms                                  \n\t"   /* leave MMX state so x87 code works again                   */

        /* Outputs.  Every register the assembly writes must be declared     */
        /* here; "&" (early clobber) keeps the compiler from sharing one of  */
        /* these registers with an input that is still read later on.        */
        : [line]       "+&r" (line),
          [width]      "+&r" (width),
          [rows]       "+&r" (rows),
          [col]        "=&r" (col)
        /* Inputs (read only).                                               */
        : [red_mod]    "r" (rm),
          [green_mod]  "r" (gm),
          [blue_mod]   "r" (bm),
          [stride]     "r" (stride)
        /* Clobbers: the image memory, the flags, the MMX registers used,    */
        /* and the x87 stack, which the MMX registers alias.                 */
        : "memory", "cc",
          "mm0", "mm1", "mm4", "mm5", "mm6",
          "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
  );    /*  End of Assembly  */
}
                          


/*
 * Self test for shade_ximage_32_mmx_64().
 *
 * Fills two identical width x height images with random 24 bit colours,
 * shades the first with the MMX routine and the second with a plain C
 * reference, then reports every pixel that differs.
 *
 * Returns EXIT_SUCCESS when both images match, EXIT_FAILURE on a mismatch
 * or when the images cannot be allocated.
 */
int main( void )
{
  unsigned int   *data_ptr1, *data_ptr2;
  unsigned char  *ptr;
  int            width = 5000, height = 5000;
  int            rm, gm, bm;
  int            index1, index2;
  int            error = 0;
  int            temp;
  size_t         pixels = ( size_t ) width * ( size_t ) height;
  size_t         pixel;

  rm = gm = bm = 0x30;
  srand(( unsigned int ) time( NULL ));

  data_ptr1 = malloc( pixels * sizeof *data_ptr1 );
  data_ptr2 = malloc( pixels * sizeof *data_ptr2 );
  if( !data_ptr1 || !data_ptr2 )
  {
    fprintf( stderr, "Unable to allocate two %dx%d pixel images.\n", width, height );
    free( data_ptr1 );
    free( data_ptr2 );
    return EXIT_FAILURE;
  }

  /* Same random colour in both images; the top byte is kept at zero         */
  /* because the MMX routine always writes it back as zero.                  */
  for( pixel = 0; pixel < pixels; pixel++ )
    data_ptr1[ pixel ] = data_ptr2[ pixel ] = 0x00ffffff & rand();

  /* Image 1: the routine under test.                                        */
  shade_ximage_32_mmx_64( data_ptr1, width * 4, width, height, rm, gm, bm );

  /* Image 2: C reference.  Byte 2 is red, byte 1 green, byte 0 blue         */
  /* (little endian pixel layout, as on x86-64).  Each channel is scaled by  */
  /* its OWN modifier and clamped to 255, which also covers modifiers > 256. */
  for( pixel = 0; pixel < pixels; pixel++ )
  {
    ptr = ( unsigned char* ) &data_ptr2[ pixel ];

    temp = ( ptr[ 2 ] * rm ) >> 8;
    ptr[ 2 ] = ( unsigned char ) ( temp > 255 ? 255 : temp );
    temp = ( ptr[ 1 ] * gm ) >> 8;
    ptr[ 1 ] = ( unsigned char ) ( temp > 255 ? 255 : temp );
    temp = ( ptr[ 0 ] * bm ) >> 8;
    ptr[ 0 ] = ( unsigned char ) ( temp > 255 ? 255 : temp );
  }

  /* Compare the two results pixel by pixel.                                 */
  for( index2 = 0; index2 < height; index2++ )
    for( index1 = 0; index1 < width; index1++ )
    {
      pixel = ( size_t ) index1 + ( size_t ) width * ( size_t ) index2;
      if( data_ptr1[ pixel ] != data_ptr2[ pixel ] )
      {
        if( !error )
        {
          error++;
          printf( "\nData Check:            (w)           (h)               (Assembly)                 (C-code)\n" );
        }
        printf( "    Mismatch:  index1:  %-3d  index2:  %-3d  data_ptr1:  0x%08x   data_ptr2:  0x%08x\n",
                 index1, index2, data_ptr1[ pixel ], data_ptr2[ pixel ] );
      }
    }
  if( !error )
    printf( "\nData Compared Identically.  (%dx%d pixels).\n", width, height );

  free( data_ptr1 );
  free( data_ptr2 );
  return error ? EXIT_FAILURE : EXIT_SUCCESS;
}




Reply via email to