Ping? Am 09.08.2015 um 17:28 schrieb srol...@vmware.com: > From: Roland Scheidegger <srol...@vmware.com> > > I used this as some testing ground for investigating some compiler > bits initially (e.g. lrint calls etc.), figured I could do much better > in the end just for fun... > This is mathematically equivalent, but uses some tricks to avoid > doubles and also replaces some float math with ints. Good for another > performance doubling or so. As a side note, some quick tests show that > llvm's loop vectorizer would be able to properly vectorize this version > (which it failed to do earlier due to doubles, producing a mess), giving > another 3 times performance increase with sse2 (more with sse4.1), but this > may not apply to mesa. > --- > src/gallium/auxiliary/util/u_format_rgb9e5.h | 87 > ++++++++++++++-------------- > 1 file changed, 42 insertions(+), 45 deletions(-) > > diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h > b/src/gallium/auxiliary/util/u_format_rgb9e5.h > index d11bfa8..21feba7 100644 > --- a/src/gallium/auxiliary/util/u_format_rgb9e5.h > +++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h > @@ -74,62 +74,59 @@ typedef union { > } field; > } rgb9e5; > > -static inline float rgb9e5_ClampRange(float x) > -{ > - if (x > 0.0f) { > - if (x >= MAX_RGB9E5) { > - return MAX_RGB9E5; > - } else { > - return x; > - } > - } else { > - /* NaN gets here too since comparisons with NaN always fail! */ > - return 0.0f; > - } > -} > > -/* Ok, FloorLog2 is not correct for the denorm and zero values, but we > - are going to do a max of this value with the minimum rgb9e5 exponent > - that will hide these problem cases. */ > -static inline int rgb9e5_FloorLog2(float x) > +static inline int rgb9e5_ClampRange(float x) > { > float754 f; > - > + float754 max; > f.value = x; > - return (f.field.biasedexponent - 127); > + max.value = MAX_RGB9E5; > + > + if (f.raw > 0x7f800000) > + /* catches neg, NaNs */ > + return 0; > + else if (f.raw >= max.raw) > + return max.raw; > + else > + return f.raw; > } > > static inline unsigned float3_to_rgb9e5(const float rgb[3]) > { > rgb9e5 retval; > - float maxrgb; > - int rm, gm, bm; > - float rc, gc, bc; > - int exp_shared, maxm; > + int rm, gm, bm, exp_shared; > float754 revdenom = {0}; > - > - rc = rgb9e5_ClampRange(rgb[0]); > - gc = rgb9e5_ClampRange(rgb[1]); > - bc = rgb9e5_ClampRange(rgb[2]); > - > - maxrgb = MAX3(rc, gc, bc); > - exp_shared = MAX2(-RGB9E5_EXP_BIAS - 1, rgb9e5_FloorLog2(maxrgb)) + 1 + > RGB9E5_EXP_BIAS; > + float754 rc, bc, gc, maxrgb; > + > + rc.raw = rgb9e5_ClampRange(rgb[0]); > + gc.raw = rgb9e5_ClampRange(rgb[1]); > + bc.raw = rgb9e5_ClampRange(rgb[2]); > + maxrgb.raw = MAX3(rc.raw, gc.raw, bc.raw); > + > + /* > + * Compared to what the spec suggests, instead of conditionally adjusting > + * the exponent after the fact do it here by doing the equivalent of +0.5 > - > + * the int add will spill over into the exponent in this case. > + */ > + maxrgb.raw += maxrgb.raw & (1 << (23-9)); > + exp_shared = MAX2((maxrgb.raw >> 23), -RGB9E5_EXP_BIAS - 1 + 127) + > + 1 + RGB9E5_EXP_BIAS - 127; > + revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS - > + RGB9E5_MANTISSA_BITS) + 1; > assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP); > - assert(exp_shared >= 0); > - revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS - > RGB9E5_MANTISSA_BITS); > - > - maxm = (int) (maxrgb * revdenom.value + 0.5); > - if (maxm == MAX_RGB9E5_MANTISSA + 1) { > - revdenom.value *= 0.5f; > - exp_shared += 1; > - assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP); > - } else { > - assert(maxm <= MAX_RGB9E5_MANTISSA); > - } > - > - rm = (int) (rc * revdenom.value + 0.5); > - gm = (int) (gc * revdenom.value + 0.5); > - bm = (int) (bc * revdenom.value + 0.5); > + > + /* > + * The spec uses strict round-up behavior (d3d10 disagrees, but in any > case > + * must match what is done above for figuring out exponent). > + * We avoid the doubles ((int) rc * revdenom + 0.5) by doing the rounding > + * ourselves (revdenom was adjusted by +1, above). > + */ > + rm = (int) (rc.value * revdenom.value); > + gm = (int) (gc.value * revdenom.value); > + bm = (int) (bc.value * revdenom.value); > + rm = (rm & 1) + (rm >> 1); > + gm = (gm & 1) + (gm >> 1); > + bm = (bm & 1) + (bm >> 1); > > assert(rm <= MAX_RGB9E5_MANTISSA); > assert(gm <= MAX_RGB9E5_MANTISSA); >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev