inline assembly

Eero Pajarre Tue, 23 Feb 1999 04:38:43 -0500
(No I don't want to get into that discussion, and actually
my case is mostly about hacked C code, and only small assembly
additions.)

Anyway, I have been playing with doing the perspective division
and clip testing in parallel by running the clip testing
with integer instructions. This might be useful for architectures
which have relatively slow floating point division and which
cannot by default run floating point comparison code
in parallel with the division.
(The Intel Pentium family is one example.)

The included file can be used as a replacement for the
clip_tmp.h (which is used by xform.c)
in the latest mesa-3.1-kw-beta. The only hacked
function is cliptest_points4. The code will obviously
fail horribly if the floating point bit patterns
don't match what I expect, so it cannot be used
in any real Mesa release as it is.
(It would need a real FP fallback for when the format is not compatible.)

Based on my experiments, my integer version of floating point
comparison works on 32-bit IEEE floating point, except with NaNs.

The inline assembly is for Visual C++, and it is there so that
the fdiv would be detached from the integer execution.
There is also a C version of the lines in question, and as
far as I could see my version of GNU C (egcs-2.91.57)
did compile the C to suitable code without the need for
assembly.

I have problems with profiling of the code. I have an open
case with Intel Vtune people regarding the timing of the
division on Pentium PRO. I hope they can solve the Vtune issue,
it would make the tuning easier.

BTW, the code is probably slower than the original if a large portion
of the triangles are clipped. It performs division for every
triangle whereas the original code only did division for
non-clipped triangles.

                Eero Pajarre

clip_tmp.h.mod

#include <float.h>
/* KW: a clever asm implementation would nestle integer versions
 * of the outcode calculation underneath the division.  Gcc won't
 * do this, strangely enough, so I only do the divide in
 * the case where the cliptest passes.  This isn't essential,
 * and an asm implementation needn't replicate that behaviour.
 *   
 * For clipped primitives with W all +ve, we can ignore planes not
 * in the union of the bitmasks of the vertices.  With mixed +,-
 * this doesn't seem to be possible.  Thus the test for cw < 0 and
 * the new flag.  This test could alternately be done in 
 * viewclip_polygon_4.
 */

/*
 * Clip-test 4-component clip-space points and perspective-divide the ones
 * that pass.
 *
 * The reciprocal 1/w is started first so that the outcode computation, which
 * is done entirely with integer instructions on the raw float bit patterns,
 * can overlap the floating point divide.
 *
 * clip_vec  - input points (x, y, z, w), walked via STRIDE_LOOP.
 * proj_vec  - receives (x/w, y/w, z/w, 1/w) for every point whose outcode is
 *             zero; entries for clipped points are left untouched.
 * clipMask  - clipMask[i] is written only for points with a non-zero
 *             outcode.
 * orMask    - in/out: OR of all outcodes is accumulated into it.
 * andMask   - in/out: set to 0 unless every point was clipped, in which
 *             case it is the incoming value ANDed with all outcodes.
 *
 * Returns proj_vec.
 *
 * NOTE: the integer comparison assumes GLfloat is a 32-bit IEEE-754 single
 * and int is 32 bits wide.  NaN inputs are not ordered correctly.
 */
static GLvector4f * TAG(cliptest_points4)( GLvector4f *clip_vec,
                                           GLvector4f *proj_vec,
                                           GLubyte clipMask[],
                                           GLubyte *orMask,
                                           GLubyte *andMask )
{
   const GLuint stride = clip_vec->stride;
   const GLfloat *from = (GLfloat *)clip_vec->start;
   const GLuint count = clip_vec->count;
   GLuint c = 0;
   GLfloat (*vProj)[4] = (GLfloat (*)[4])proj_vec->start;
   GLubyte tmpAndMask = *andMask;
   GLubyte tmpOrMask = *orMask;
   GLuint i;
   /* Everything except the sign bit of a 32-bit float. */
   const int cmp_fill = 0x7fffffff;
   STRIDE_LOOP {
      GLubyte mask = 0;
      const GLfloat cx = from[0];
      const GLfloat cy = from[1];
      const GLfloat cz = from[2];
      const GLfloat cw = from[3];
      GLfloat oow = 1.0 ;
      /* Reading the bits through a union avoids the strict-aliasing
       * violation of casting a GLfloat pointer to an int pointer.
       */
      union { GLfloat f; int bits; } ux, uy, uz, uw;
      int icx, icy, icz, icw;
#ifdef __WIN32__
      /* Start the divide and leave the quotient on the x87 stack; it is
       * stored after the integer outcode computation below.
       */
      _asm{
        fld oow
        fdiv cw
     }
#else
      oow /= cw;
#endif
      ux.f = from[0];
      uy.f = from[1];
      uz.f = from[2];
      uw.f = from[3];
      icx = ux.bits;
      icy = uy.bits;
      icz = uz.bits;
      icw = uw.bits;

      /* IEEE floats are sign-magnitude.  Turn negative values into the
       * two's complement negation of their magnitude so that signed
       * integer order matches float order, +0 and -0 both map to 0, and
       * unary minus on a key is exactly the key of the negated float.
       * (XORing with cmp_fill instead would give a ones' complement key,
       * for which -icw is off by one and points lying exactly on the
       * left/bottom/near planes get clipped.)
       */
      if (icx < 0)
        icx = -(icx & cmp_fill);
      if (icy < 0)
        icy = -(icy & cmp_fill);
      if (icz < 0)
        icz = -(icz & cmp_fill);
      if (icw < 0)
        icw = -(icw & cmp_fill);

      if (icx >  icw)       mask |= CLIP_RIGHT_BIT;
      else if (icx < -icw)  mask |= CLIP_LEFT_BIT;
      if (icy >  icw)       mask |= CLIP_TOP_BIT;
      else if (icy < -icw)  mask |= CLIP_BOTTOM_BIT;
      if (icz >  icw)       mask |= CLIP_FAR_BIT;
      else if (icz < -icw)  mask |= CLIP_NEAR_BIT;
#ifdef __WIN32__
      /* Collect the quotient started before the integer work. */
      _asm{
        fstp oow
      }
#endif
      if (mask) {
         if (cw <= 0.0) mask |= CLIP_4D; /* can't skip planes in clipping */
         c++;
         tmpAndMask &= mask;
         clipMask[i] = mask;
         tmpOrMask |= mask;
#if 0
      } else if (cw == 0) {
         /* only get here for 0,0,0,0 - not really sure what
          * the correct behaviour should be - at the moment 
          * I'm inclined to ignore it.
          */
         clipMask[i] = tmpOrMask = CLIP_ALL_BITS|CLIP_4D;
#endif
      } else {
         /* oow already holds 1/cw, computed alongside the clip test. */
         vProj[i][3] = oow;
         vProj[i][0] = cx * oow;
         vProj[i][1] = cy * oow;
         vProj[i][2] = cz * oow;
      }
   }
   *orMask = tmpOrMask;
   /* If any point was unclipped, the AND of all outcodes is zero. */
   *andMask = (c < count ? 0 : tmpAndMask);

   proj_vec->flags |= VEC_SIZE_4;
   proj_vec->size = 3;
   proj_vec->count = clip_vec->count;
   return proj_vec;
}

/*
 * Clip-test 3-component points.  No w is read: x, y and z are each compared
 * against the fixed limits -1.0 and 1.0.
 *
 * clipMask[i] receives the outcode of every point (zero when inside), and
 * the outcodes are accumulated into *orMask and *andMask.  No projected
 * coordinates are written; proj_vec is only passed to gl_clean_elem() when
 * its VEC_DIRTY_3 flag is set.
 *
 * Returns clip_vec.
 */
static GLvector4f * TAG(cliptest_points3)( GLvector4f *clip_vec, 
                                           GLvector4f *proj_vec, 
                                           GLubyte clipMask[],
                                           GLubyte *orMask, 
                                           GLubyte *andMask )
{
   const GLfloat *from = (GLfloat *)clip_vec->start;
   const GLuint count = clip_vec->count;
   const GLuint stride = clip_vec->stride;
   GLubyte accumOr = *orMask;
   GLubyte accumAnd = *andMask;
   GLuint i;

   STRIDE_LOOP {
      const GLfloat vx = from[0];
      const GLfloat vy = from[1];
      const GLfloat vz = from[2];
      GLubyte outcode = 0;

      /* X axis: right / left planes */
      if (vx > 1.0) {
         outcode |= CLIP_RIGHT_BIT;
      } else if (vx < -1.0) {
         outcode |= CLIP_LEFT_BIT;
      }

      /* Y axis: top / bottom planes */
      if (vy > 1.0) {
         outcode |= CLIP_TOP_BIT;
      } else if (vy < -1.0) {
         outcode |= CLIP_BOTTOM_BIT;
      }

      /* Z axis: far / near planes */
      if (vz > 1.0) {
         outcode |= CLIP_FAR_BIT;
      } else if (vz < -1.0) {
         outcode |= CLIP_NEAR_BIT;
      }

      clipMask[i] = outcode;
      accumAnd &= outcode;
      accumOr |= outcode;
   }

   if (proj_vec->flags & VEC_DIRTY_3) {
      gl_clean_elem(3, proj_vec);
   }

   *orMask = accumOr;
   *andMask = accumAnd;
   return clip_vec;
}

/*
 * Clip-test 2-component points.  Only x and y are read, each compared
 * against the fixed limits -1.0 and 1.0; the near/far bits are never set.
 *
 * clipMask[i] receives the outcode of every point (zero when inside), and
 * the outcodes are accumulated into *orMask and *andMask.  No projected
 * coordinates are written; proj_vec is only passed to gl_clean_elem() when
 * its VEC_DIRTY_3 flag is set.
 *
 * Returns clip_vec.
 */
static GLvector4f * TAG(cliptest_points2)( GLvector4f *clip_vec, 
                                           GLvector4f *proj_vec, 
                                           GLubyte clipMask[],
                                           GLubyte *orMask, 
                                           GLubyte *andMask )
{
   const GLfloat *from = (GLfloat *)clip_vec->start;
   const GLuint count = clip_vec->count;
   const GLuint stride = clip_vec->stride;
   GLubyte accumOr = *orMask;
   GLubyte accumAnd = *andMask;
   GLuint i;

   STRIDE_LOOP {
      const GLfloat vx = from[0];
      const GLfloat vy = from[1];
      GLubyte outcode = 0;

      /* X axis: right / left planes */
      if (vx > 1.0) {
         outcode |= CLIP_RIGHT_BIT;
      } else if (vx < -1.0) {
         outcode |= CLIP_LEFT_BIT;
      }

      /* Y axis: top / bottom planes */
      if (vy > 1.0) {
         outcode |= CLIP_TOP_BIT;
      } else if (vy < -1.0) {
         outcode |= CLIP_BOTTOM_BIT;
      }

      clipMask[i] = outcode;
      accumAnd &= outcode;
      accumOr |= outcode;
   }

   if (proj_vec->flags & VEC_DIRTY_3) {
      gl_clean_elem(3, proj_vec);
   }

   *orMask = accumOr;
   *andMask = accumAnd;
   return clip_vec;
}





/*
 * Install the clip-test routines of this file in gl_clip_tab.  Each routine
 * goes in the slot whose index matches the number in its name (presumably
 * the component count of the points it handles).
 */
static void TAG(init_c_cliptest)()
{
   gl_clip_tab[2] = TAG(cliptest_points2);
   gl_clip_tab[3] = TAG(cliptest_points3);
   gl_clip_tab[4] = TAG(cliptest_points4);
}
inline assembly

Reply via email to