If there are no piglit regressions: Acked-by: Marek Olšák <marek.ol...@amd.com>
Marek On Fri, Aug 21, 2015 at 12:06 AM, Roland Scheidegger <srol...@vmware.com> wrote: > Ping? > > Am 09.08.2015 um 17:28 schrieb srol...@vmware.com: >> From: Roland Scheidegger <srol...@vmware.com> >> >> I used this as some testing ground for investigating some compiler >> bits initially (e.g. lrint calls etc.), figured I could do much better >> in the end just for fun... >> This is mathematically equivalent, but uses some tricks to avoid >> doubles and also replaces some float math with ints. Good for another >> performance doubling or so. As a side note, some quick tests show that >> llvm's loop vectorizer would be able to properly vectorize this version >> (which it failed to do earlier due to doubles, producing a mess), giving >> another 3 times performance increase with sse2 (more with sse4.1), but this >> may not apply to mesa. >> --- >> src/gallium/auxiliary/util/u_format_rgb9e5.h | 87 >> ++++++++++++++-------------- >> 1 file changed, 42 insertions(+), 45 deletions(-) >> >> diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h >> b/src/gallium/auxiliary/util/u_format_rgb9e5.h >> index d11bfa8..21feba7 100644 >> --- a/src/gallium/auxiliary/util/u_format_rgb9e5.h >> +++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h >> @@ -74,62 +74,59 @@ typedef union { >> } field; >> } rgb9e5; >> >> -static inline float rgb9e5_ClampRange(float x) >> -{ >> - if (x > 0.0f) { >> - if (x >= MAX_RGB9E5) { >> - return MAX_RGB9E5; >> - } else { >> - return x; >> - } >> - } else { >> - /* NaN gets here too since comparisons with NaN always fail! */ >> - return 0.0f; >> - } >> -} >> >> -/* Ok, FloorLog2 is not correct for the denorm and zero values, but we >> - are going to do a max of this value with the minimum rgb9e5 exponent >> - that will hide these problem cases. */ >> -static inline int rgb9e5_FloorLog2(float x) >> +static inline int rgb9e5_ClampRange(float x) >> { >> float754 f; >> - >> + float754 max; >> f.value = x; >> - return (f.field.biasedexponent - 127); >> + max.value = MAX_RGB9E5; >> + >> + if (f.raw > 0x7f800000) >> + /* catches neg, NaNs */ >> + return 0; >> + else if (f.raw >= max.raw) >> + return max.raw; >> + else >> + return f.raw; >> } >> >> static inline unsigned float3_to_rgb9e5(const float rgb[3]) >> { >> rgb9e5 retval; >> - float maxrgb; >> - int rm, gm, bm; >> - float rc, gc, bc; >> - int exp_shared, maxm; >> + int rm, gm, bm, exp_shared; >> float754 revdenom = {0}; >> - >> - rc = rgb9e5_ClampRange(rgb[0]); >> - gc = rgb9e5_ClampRange(rgb[1]); >> - bc = rgb9e5_ClampRange(rgb[2]); >> - >> - maxrgb = MAX3(rc, gc, bc); >> - exp_shared = MAX2(-RGB9E5_EXP_BIAS - 1, rgb9e5_FloorLog2(maxrgb)) + 1 + >> RGB9E5_EXP_BIAS; >> + float754 rc, bc, gc, maxrgb; >> + >> + rc.raw = rgb9e5_ClampRange(rgb[0]); >> + gc.raw = rgb9e5_ClampRange(rgb[1]); >> + bc.raw = rgb9e5_ClampRange(rgb[2]); >> + maxrgb.raw = MAX3(rc.raw, gc.raw, bc.raw); >> + >> + /* >> + * Compared to what the spec suggests, instead of conditionally adjusting >> + * the exponent after the fact do it here by doing the equivalent of >> +0.5 - >> + * the int add will spill over into the exponent in this case. >> + */ >> + maxrgb.raw += maxrgb.raw & (1 << (23-9)); >> + exp_shared = MAX2((maxrgb.raw >> 23), -RGB9E5_EXP_BIAS - 1 + 127) + >> + 1 + RGB9E5_EXP_BIAS - 127; >> + revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS - >> + RGB9E5_MANTISSA_BITS) + 1; >> assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP); >> - assert(exp_shared >= 0); >> - revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS - >> RGB9E5_MANTISSA_BITS); >> - >> - maxm = (int) (maxrgb * revdenom.value + 0.5); >> - if (maxm == MAX_RGB9E5_MANTISSA + 1) { >> - revdenom.value *= 0.5f; >> - exp_shared += 1; >> - assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP); >> - } else { >> - assert(maxm <= MAX_RGB9E5_MANTISSA); >> - } >> - >> - rm = (int) (rc * revdenom.value + 0.5); >> - gm = (int) (gc * revdenom.value + 0.5); >> - bm = (int) (bc * revdenom.value + 0.5); >> + >> + /* >> + * The spec uses strict round-up behavior (d3d10 disagrees, but in any >> case >> + * must match what is done above for figuring out exponent). >> + * We avoid the doubles ((int) rc * revdenom + 0.5) by doing the rounding >> + * ourselves (revdenom was adjusted by +1, above). >> + */ >> + rm = (int) (rc.value * revdenom.value); >> + gm = (int) (gc.value * revdenom.value); >> + bm = (int) (bc.value * revdenom.value); >> + rm = (rm & 1) + (rm >> 1); >> + gm = (gm & 1) + (gm >> 1); >> + bm = (bm & 1) + (bm >> 1); >> >> assert(rm <= MAX_RGB9E5_MANTISSA); >> assert(gm <= MAX_RGB9E5_MANTISSA); >> > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev