> --- coregrind/m_oset.c        (revision 7273)
> +++ coregrind/m_oset.c        (working copy)
> @@ -175,7 +175,15 @@
>  // Compare the first word of each element.  Inlining is *crucial*.
>  static inline Word fast_cmp(void* k, AvlNode* n)
>  {
> -   return ( *(Word*)k - *(Word*)elem_of_node(n) );
> +   Word w1 = *(Word*)k;
> +   Word w2 = *(Word*)elem_of_node(n);
> +   // In previous versions, we tried to do this faster by doing
> +   // "return w1 - w2".  But it didn't always work, due to the carry bit
> +   // when the subtraction overflowed(?)  The branching version is slightly
> +   // slows, but safer and easier to understand.
> +   if (w1 > w2) return  1;
> +   if (w1 < w2) return -1;
> +   return 0;
>  }

For a closed subroutine of which the callers have no knowledge other than
arguments and result, and the specific values -1, 0, +1 are important:
        return (w1 > w2) - (w1 < w2);
which looks cute, and fully exploits the value of a relational expression
(1 if true, 0 if false).  On i686 a good compiler should generate
        movl w1,%eax
        cmpl w2,%eax  # w1 vs w2  (AT&T syntax reverses args to 'cmp')
        sgt %al  # %al= (w1 > w2);  /* use 'sa' for unsigned */
        slt %cl  # %cl= (w1 < w2);  /* use 'sb' for unsigned */
        subb %cl,%al  # %al -= %cl; /* 8-bit arithmetic */
        movsbl %al,%eax  # sign extend 8 to 32 bits /* does not change CC */
featuring no branches (therefore no penalties for mis-prediction)
but requiring two registers.

If you are restricted to only one register, but addressing is inexpensive
and the CPU+compiler has a conditional move instruction:
        Word         rv =  0;
        if (w1 > w2) rv =  1;
        if (w1 < w2) rv = -1;
        return       rv;
which becomes
        .rodata
zero:   Int  0
pos1:   Int +1
neg1:   Int -1
        .text
        movl w1,%eax
        cmpl w2,%eax  # w1 vs w2
        movl   zero,%eax  # assume (w1==w2)              /* no change in CC */
        cmovgt pos1,%eax  # if (w1 > w2) then %eax = +1; /* no change in CC */
        cmovlt neg1,%eax  # if (w1 < w2) then %eax = -1; /* no change in CC */

If the result of fast_cmp is used to index an array, then either method
above works well.

However, the declaration says "static inline" and the caller uses the result
as the decision variable for a branch tree.  Gcc knows how to expand the
function inline and propagate the constant values across 'return',
so why was there any need for "fast" cmp in the first place?
Indeed, a single compare instruction 'cmp' ( with no set-characteristic-value,
and no conditional-move) is enough.  If the usage of the specific function
is that important, then prepare a separate version of the caller which
has the comparison function written literally, fully integrated with
the caller's decision tree.

-- 

-------------------------------------------------------------------------
SF.Net email is sponsored by:
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://sourceforge.net/services/buy/index.php
_______________________________________________
Valgrind-developers mailing list
Valgrind-developers@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/valgrind-developers

Reply via email to