----- Original Message ----- > Am 11.07.2013 18:54, schrieb Jose Fonseca: > > ----- Original Message ----- > >> From: Roland Scheidegger <srol...@vmware.com> > >> > >> srgb-to-linear is using 3rd degree polynomial for now which should be > >> _just_ > >> good enough. Reverse is using some rational polynomials and is quite > >> accurate, > >> though not hooked into llvmpipe's blend code yet and hence unused > >> (untested). > >> Using a table might also be an option (for srgb-to-linear especially). > >> This does not enable any new features yet because EXT_texture_srgb was > >> already > >> supported via util_format fallbacks, but performance was lacking probably > >> due > >> to the external function call (the table used by the util_format_srgb code > >> may > >> not be all that much slower on its own). > >> Some performance figures (taken from modified gloss, replaced both base > >> and > >> sphere texture to use GL_SRGB instead of GL_RGB, measured on 1Ghz Sandy > >> Bridge, > >> the numbers aren't terribly accurate): > >> > >> normal gloss, aos, 8-wide: 47 fps > >> normal gloss, aos, 4-wide: 48 fps > >> > >> normal gloss, forced to soa, 8-wide: 48 fps > >> normal gloss, forced to soa, 4-wide: 47 fps > >> > >> patched gloss, old code, soa, 8-wide: 21 fps > >> patched gloss, old code, soa, 4-wide: 24 fps > >> > >> patched gloss, new code, soa, 8-wide: 41 fps > >> patched gloss, new code, soa, 4-wide: 38 fps > >> > >> So there's a performance hit but it seems acceptable, certainly better > >> than using the fallback. > >> Note the new code only works for 4x8bit srgb formats, others (L8/L8A8) > >> will > >> continue to use the old util_format fallback, because I can't be bothered > >> to write code for formats noone uses anyway (as decoding is done as part > >> of > >> lp_build_unpack_rgba_soa which can only handle block type width of 32). 
> >> Compressed srgb formats should get their own path though eventually (it is > >> going to be expensive in any case, first decompress, then convert). > >> No piglit regressions. > >> --- > >> src/gallium/auxiliary/Makefile.sources | 1 + > >> src/gallium/auxiliary/gallivm/lp_bld_format.h | 11 + > >> src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 25 +- > >> src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c | 308 > >> ++++++++++++++++++++ > >> 4 files changed, 339 insertions(+), 6 deletions(-) > >> create mode 100644 src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c > >> > >> diff --git a/src/gallium/auxiliary/Makefile.sources > >> b/src/gallium/auxiliary/Makefile.sources > >> index 4751762..8cffeb0 100644 > >> --- a/src/gallium/auxiliary/Makefile.sources > >> +++ b/src/gallium/auxiliary/Makefile.sources > >> @@ -172,6 +172,7 @@ GALLIVM_SOURCES := \ > >> gallivm/lp_bld_format_aos.c \ > >> gallivm/lp_bld_format_aos_array.c \ > >> gallivm/lp_bld_format_float.c \ > >> + gallivm/lp_bld_format_srgb.c \ > >> gallivm/lp_bld_format_soa.c \ > >> gallivm/lp_bld_format_yuv.c \ > >> gallivm/lp_bld_gather.c \ > >> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h > >> b/src/gallium/auxiliary/gallivm/lp_bld_format.h > >> index 12a0318..744d002 100644 > >> --- a/src/gallium/auxiliary/gallivm/lp_bld_format.h > >> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h > >> @@ -158,4 +158,15 @@ lp_build_rgb9e5_to_float(struct gallivm_state > >> *gallivm, > >> LLVMValueRef src, > >> LLVMValueRef *dst); > >> > >> +LLVMValueRef > >> +lp_build_linear_to_srgb(struct gallivm_state *gallivm, > >> + struct lp_type src_type, > >> + LLVMValueRef src); > >> + > >> +LLVMValueRef > >> +lp_build_srgb_to_linear(struct gallivm_state *gallivm, > >> + struct lp_type src_type, > >> + LLVMValueRef src); > >> + > >> + > >> #endif /* !LP_BLD_FORMAT_H */ > >> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c > >> b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c > >> index 
4c6bd81..114ce03 100644 > >> --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c > >> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c > >> @@ -163,11 +163,23 @@ lp_build_unpack_rgba_soa(struct gallivm_state > >> *gallivm, > >> */ > >> > >> if (type.floating) { > >> - if(format_desc->channel[chan].normalized) > >> - input = lp_build_unsigned_norm_to_float(gallivm, width, > >> type, > >> input); > >> - else > >> - input = LLVMBuildSIToFP(builder, input, > >> - lp_build_vec_type(gallivm, type), > >> ""); > >> + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { > >> + assert(width == 8); > >> + if (format_desc->swizzle[3] == chan) { > >> + input = lp_build_unsigned_norm_to_float(gallivm, width, > >> type, input); > >> + } > >> + else { > >> + struct lp_type conv_type = lp_uint_type(type); > >> + input = lp_build_srgb_to_linear(gallivm, conv_type, > >> input); > >> + } > >> + } > >> + else { > >> + if(format_desc->channel[chan].normalized) > >> + input = lp_build_unsigned_norm_to_float(gallivm, width, > >> type, input); > >> + else > >> + input = LLVMBuildSIToFP(builder, input, > >> + lp_build_vec_type(gallivm, > >> type), > >> ""); > >> + } > >> } > >> else if (format_desc->channel[chan].pure_integer) { > >> /* Nothing to do */ > >> @@ -344,6 +356,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, > >> > >> if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && > >> (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || > >> + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB || > >> format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && > >> format_desc->block.width == 1 && > >> format_desc->block.height == 1 && > >> @@ -394,7 +407,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, > >> packed = lp_build_gather(gallivm, type.length, > >> format_desc->block.bits, > >> type.width, base_ptr, offset, > >> - FALSE); > >> + FALSE); > >> if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) { > >> 
lp_build_r11g11b10_to_float(gallivm, packed, rgba_out); > >> } > >> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c > >> b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c > >> new file mode 100644 > >> index 0000000..2422817 > >> --- /dev/null > >> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c > >> @@ -0,0 +1,308 @@ > >> +/************************************************************************** > >> + * > >> + * Copyright 2013 VMware, Inc. > >> + * All Rights Reserved. > >> + * > >> + * Permission is hereby granted, free of charge, to any person obtaining > >> a > >> + * copy of this software and associated documentation files (the > >> + * "Software"), to deal in the Software without restriction, including > >> + * without limitation the rights to use, copy, modify, merge, publish, > >> + * distribute, sub license, and/or sell copies of the Software, and to > >> + * permit persons to whom the Software is furnished to do so, subject to > >> + * the following conditions: > >> + * > >> + * The above copyright notice and this permission notice (including the > >> + * next paragraph) shall be included in all copies or substantial > >> portions > >> + * of the Software. > >> + * > >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > >> EXPRESS > >> + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > >> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND > >> NON-INFRINGEMENT. > >> + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR > >> + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF > >> CONTRACT, > >> + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE > >> + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. > >> + * > >> + > >> **************************************************************************/ > >> + > >> + > >> +/** > >> + * @file > >> + * Format conversion code for srgb formats. 
> >> + * > >> + * Functions for converting from srgb to linear and vice versa. > >> + * From http://www.opengl.org/registry/specs/EXT/texture_sRGB.txt: > >> + * > >> + * srgb->linear: > >> + * cl = cs / 12.92, cs <= 0.04045 > >> + * cl = ((cs + 0.055)/1.055)^2.4, cs > 0.04045 > >> + * > >> + * linear->srgb: > >> + * if (isnan(cl)) { > >> + * Map IEEE-754 Not-a-number to zero. > >> + * cs = 0.0; > >> + * } else if (cl > 1.0) { > >> + * cs = 1.0; > >> + * } else if (cl < 0.0) { > >> + * cs = 0.0; > >> + * } else if (cl < 0.0031308) { > >> + * cs = 12.92 * cl; > >> + * } else { > >> + * cs = 1.055 * pow(cl, 0.41666) - 0.055; > >> + * } > >> + * > >> + * This does not need to be accurate, however at least for d3d10 > >> + * > >> (http://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx): > >> + * 1) For srgb->linear, it is required that the error on the srgb side is > >> + * not larger than 0.5f, which I interpret that if you map the value > >> back > >> + * to srgb from linear using the ideal conversion, it would not be off > >> by > >> + * more than 0.5f (that is, it would map to the same 8-bit integer > >> value > >> + * as it was before conversion to linear). > >> + * 2) linear->srgb is permitted 0.6f which luckily looks like quite a > >> large > >> + * error is allowed. > >> + * 3) Additionally, all srgb values converted to linear and back must > >> result > >> + * in the same value as they were originally. > >> + * > >> + * @author Roland Scheidegger <srol...@vmware.com> > >> + */ > >> + > >> + > >> +#include "util/u_debug.h" > >> + > >> +#include "lp_bld_type.h" > >> +#include "lp_bld_const.h" > >> +#include "lp_bld_arit.h" > >> +#include "lp_bld_bitarit.h" > >> +#include "lp_bld_logic.h" > >> +#include "lp_bld_format.h" > >> + > >> + > >> + > >> +/** > >> + * Convert srgb int values to linear float values. > >> + * Several possibilities how to do this, e.g. 
> >> + * - table > >> + * - doing the pow() with int-to-float and float-to-int tricks > >> + * > >> (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent) > >> + * - just using standard polynomial approximation > >> + * (3rd order polynomial is required for crappy but just sufficient > >> accuracy) > >> + * > >> + * @param src integer (vector) value(s) to convert > >> + * (8 bit values unpacked to 32 bit already). > >> + */ > >> +LLVMValueRef > >> +lp_build_srgb_to_linear(struct gallivm_state *gallivm, > >> + struct lp_type src_type, > >> + LLVMValueRef src) > >> +{ > >> + struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32); > >> + struct lp_build_context f32_bld; > >> + LLVMValueRef srcf, part_lin, part_pow, tmp, is_linear; > >> + LLVMValueRef lin_const, tmp_const, lin_thresh; > >> + > >> + assert(src_type.width == 32); > >> + lp_build_context_init(&f32_bld, gallivm, f32_type); > >> + > >> + /* > >> + * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + > >> 0.0023) > >> + * (found with octave polyfit and some magic as I couldn't get the > >> error > >> + * function right). Using the above mentioned error function, the > >> values > >> stay > >> + * within +-0.35, except for the lowest values - hence tweaking linear > >> segment > >> + * to cover the first 16 instead of the first 11 values (the error > >> stays > >> + * just about acceptable there too). > >> + * Hence: lin = src > 15 ? > >> + * (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023) > >> : > >> + * src / 12.6; > >> + * This function really only makes sense for vectors, should use LUT > >> otherwise. > >> + * All in all (including float conversion) 10 instructions (with > >> sse4.1), > >> + * 6 constants. Bad dependency chains though, but FMA should help > >> (minus > >> 3 > >> + * instructions). > > > > Please use lp_build_polynomial. It tries to avoid data dependency. 
> > Furthermore, if we start using FMA, then it's one less place to update. > Ok. Are you sure it's worth avoiding data dependency at the cost of extra > instructions (the way I built the polynomial, it's 6 instructions, and with > lp_build_polynomial it would be 7)?
I'm not sure for this particular polynomial order (you could benchmark). It did make a significant improvement for log2/exp2's polynomials at the time James did this. If it's not worth it, then lp_build_polynomial should do a straight polynomial for that order and lower. But lp_build_polynomial should still be used no matter what. The expectation is that lp_build_polynomial will emit the best code possible for any polynomial. > I thought because r/g/b will be done in > parallel anyway it wouldn't be much of an issue. Didn't measure it, though. > I am actually not really sure if fma isn't already used, while this is > a non-conformant optimization to optimize mul+add into fma some > compilers do it by default anyway IIRC. If you want to add a new flag to lp_build_polynomial to force a straightforward polynomial expansion, that's fine too. > >> + */ > >> + /* doing the 1/255 mul as part of the approximation */ > >> + srcf = lp_build_int_to_float(&f32_bld, src); > >> + lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f * > >> 255.0f)); > >> + part_lin = lp_build_mul(&f32_bld, srcf, lin_const); > >> + > >> + tmp_const = lp_build_const_vec(gallivm, f32_type, 0.3012f / (255.0f * > >> 255.0f * 255.0f)); > >> + tmp = lp_build_mul(&f32_bld, srcf, tmp_const); > >> + tmp_const = lp_build_const_vec(gallivm, f32_type, 0.6935f / (255.0f * > >> 255.0f)); > >> + tmp = lp_build_add(&f32_bld, tmp, tmp_const); > >> + tmp = lp_build_mul(&f32_bld, srcf, tmp); > >> + tmp_const = lp_build_const_vec(gallivm, f32_type, 0.0030f / 255.0f); > >> + tmp = lp_build_add(&f32_bld, tmp, tmp_const); > >> + tmp = lp_build_mul(&f32_bld, srcf, tmp); > >> + tmp_const = lp_build_const_vec(gallivm, f32_type, 0.0023f); > >> + part_pow = lp_build_add(&f32_bld, tmp, tmp_const); > >> + > >> + lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f); > >> + is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, > >> srcf, > >> lin_thresh); > >> + return lp_build_select(&f32_bld, 
is_linear, part_lin, part_pow); > >> +} > >> + > >> + > >> +/** > >> + * Convert linear float values to srgb int values. > >> + * Several possibilities how to do this, e.g. > >> + * - use table (based on exponent/highest order mantissa bits) and do > >> + * linear interpolation (https://gist.github.com/rygorous/2203834) > >> + * - Chebyshev polynomial > >> + * - Approximation using reciprocals > >> + * - using int-to-float and float-to-int tricks for pow() > >> + * > >> (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent) > >> + * > >> + * @param src float (vector) value(s) to convert. > >> + */ > >> +#if 0 > > > > I'd prefer you used if (0) instead of #if 0, so that we can ensure all > > these variants staying working and buidling in the future. > > > > Something like > > > > lp_build_linear_to_srgb_foo() {} > > lp_build_linear_to_srgb_boo() {} > > > > lp_build_linear_to_srgb() { > > if(0) > > return lp_build_linear_to_srgb_foo() > > else > > return lp_build_linear_to_srgb_boo() > > } > > > > And you could probably refactor so that the linear part is only implemented > > in one place. Note I don't think that leaving multiple implementations > > of this is bad, as the optimal way of doing this sort of things tends to > > vary with introduction of newer processor generations. > Ok. I did it that way because I only really planned for one method then > figured the > quite a bit simpler one should do, but of course didn't want to nuke the > other one before I'm really sure it works... But when keeping both it > certainly makes sense to actually compile them both. > The more complex one cannot be faster though than the simpler regardless > of cpu I'm quite sure. The dependency chains are similar as are the > instructions used except that the complex version has an additional 4 > instructions (also in the longest dependency chain) more or less added > on top of it. 
> > >> +/* XXX remove this after verifying less accurate method is really good > >> enough */ > >> +LLVMValueRef > >> +lp_build_linear_to_srgb(struct gallivm_state *gallivm, > >> + struct lp_type src_type, > >> + LLVMValueRef src) > >> +{ > >> + LLVMBuilderRef builder = gallivm->builder; > >> + struct lp_build_context f32_bld; > >> + float exp_f = 2.0f/3.0f; > >> + float coeff_f = 0.62996f; > >> + LLVMValueRef pow_approx, coeff, x2, exponent, tmp, pow_1, pow_2, > >> pow_final; > >> + LLVMValueRef lin_thresh, lin, lin_const, is_linear; > >> + struct lp_type int_type = lp_int_type(src_type); > >> + > >> + lp_build_context_init(&f32_bld, gallivm, src_type); > >> + > >> + src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one); > >> + > >> + /* > >> + * using int-to-float and float-to-int trick for pow(). > >> + * This is much more accurate than necessary thanks to the correction, > >> + * but it most certainly makes no sense without rsqrt available. > >> + * Bonus points if you understand how this works... > >> + * All in all (including min/max clamp, conversion) 19 instructions. 
> >> + */ > >> + > >> + /* > >> + * First calculate approx x^8/12 > >> + */ > >> + exponent = lp_build_const_vec(gallivm, src_type, exp_f); > >> + coeff = lp_build_const_vec(gallivm, src_type, > >> + exp2f(127 / exp_f - 127) * powf(coeff_f, > >> 1.0 > > > > 1.0 -> 1.0f > > > >> /exp_f)); > >> + > >> + /* premultiply src */ > >> + tmp = lp_build_mul(&f32_bld, coeff, src); > >> + /* "log2" */ > >> + tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm, > >> int_type), ""); > >> + tmp = lp_build_int_to_float(&f32_bld, tmp); > >> + /* multiply for pow */ > >> + tmp = lp_build_mul(&f32_bld, tmp, exponent); > >> + /* "exp2" */ > >> + pow_approx = lp_build_itrunc(&f32_bld, tmp); > >> + pow_approx = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm, > >> src_type), ""); > >> + > >> + /* > >> + * Since that pow was inaccurate (like 3 bits, though each sqrt step > >> would > >> + * give another bit), compensate the error (which is why we chose > >> another > >> + * exponent in the first place). 
> >> + */ > >> + /* x * x^(8/12) = x^(20/12) */ > >> + pow_1 = lp_build_mul(&f32_bld, pow_approx, src); > >> + /* x * x * x^(-4/12) = x^(20/12) */ > >> + tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx); > >> + x2 = lp_build_mul(&f32_bld, src, src); > >> + pow_2 = lp_build_mul(&f32_bld, x2, tmp); > >> + > >> + /* average the values so the errors cancel out, compensate bias, > >> + * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 > >> mul > >> + * for conversion to int in here */ > >> + tmp = lp_build_add(&f32_bld, pow_1, pow_2); > >> + coeff = lp_build_const_vec(gallivm, src_type, > >> + 1.0f/(3.0f*coeff_f) * 0.999852f * > >> powf(1.055f > >> * 255.0f, 4.0f)); > >> + pow_final = lp_build_mul(&f32_bld, tmp, coeff); > >> + > >> + /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */ > >> + pow_final = lp_build_fast_rsqrt(&f32_bld, pow_final); > >> + pow_final = lp_build_fast_rsqrt(&f32_bld, pow_final); > >> + pow_final = lp_build_add(&f32_bld, pow_final, > >> + lp_build_const_vec(gallivm, src_type, > >> -0.055f)); > >> + > >> + /* linear part is child's play */ > >> + lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f); > >> + lin = lp_build_mul(&f32_bld, src, lin_const); > >> + > >> + lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f); > >> + is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, > >> lin_thresh); > >> + tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final); > >> + > >> + f32_bld.type.sign = 0; > >> + return lp_build_iround(&f32_bld, tmp); > >> +} > >> + > >> +#else > >> +LLVMValueRef > >> +lp_build_linear_to_srgb(struct gallivm_state *gallivm, > >> + struct lp_type src_type, > >> + LLVMValueRef src) > >> +{ > >> + struct lp_build_context f32_bld; > >> + LLVMValueRef pow_final, tmp, tmp1, tmp2, x05, x0375, a_const, b_const, > >> c_const; > >> + LLVMValueRef lin_thresh, lin, lin_const, is_linear; > >> + > >> + lp_build_context_init(&f32_bld, gallivm, src_type); > >> + > >> + /* > >> + * using 
"rational polynomial" approximation here. > >> + * Essentially y = a*x^0.375 + b*x^0.5 + c, with also > >> + * factoring in the 255.0 mul and the scaling mul. > >> + * (a is closer to actual value so has higher weight than b.) > >> + * Note: the constants are magic values. They were found empirically, > >> + * possibly could be improved but good enough (be VERY careful with > >> + * error metric if you'd want to tweak them, they also MUST fit with > >> + * the crappy polynomial above for srgb->linear since it is required > >> + * that each srgb value maps back to the same value). > >> + * This function has an error of max +-0.17 (and we'd only require > >> +-0.6), > >> + * for the approximated srgb->linear values the error is naturally > >> larger > >> + * (+-0.42) but still accurate enough (required +-0.5 essentially). > >> + * All in all (including min/max clamp, conversion) 15 instructions. > >> + * FMA would help (minus 2 instructions). > >> + */ > >> + > >> + if (lp_build_fast_rsqrt_available(src_type)) { > >> + tmp = lp_build_fast_rsqrt(&f32_bld, src); > >> + x05 = lp_build_mul(&f32_bld, src, tmp); > >> + } > >> + else { > >> + /* > >> + * I don't really expect this to be practical without rsqrt > >> + * but there's no reason for triple punishment so at least > >> + * save the otherwise resulting division and unnecessary mul... 
> >> + */ > >> + x05 = lp_build_sqrt(&f32_bld, src); > >> + } > >> + > >> + tmp = lp_build_mul(&f32_bld, x05, src); > >> + if (lp_build_fast_rsqrt_available(src_type)) { > >> + x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, > >> tmp)); > >> + } > >> + else { > >> + x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp)); > >> + } > >> + > >> + a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * > >> 255.0f); > >> + b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * > >> 255.0f); > >> + c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f); > >> + > >> + tmp1 = lp_build_mul(&f32_bld, a_const, x0375); > >> + tmp2 = lp_build_mul(&f32_bld, b_const, x05); > >> + tmp2 = lp_build_add(&f32_bld, tmp2, c_const); > >> + pow_final = lp_build_add(&f32_bld, tmp1, tmp2); > > > > Again, please use lp_build_polynomial for polynomial construction. > No that can't work here (only works for ordinary polynomials but not for > something of the sort here which is a^0.5 + b^0.375 + c). Right. > > > >> + > >> + /* linear part */ > >> + lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f); > >> + lin = lp_build_mul(&f32_bld, src, lin_const); > >> + > >> + lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f); > >> + is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, > >> lin_thresh); > >> + tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final); > >> + > >> + f32_bld.type.sign = 0; > >> + return lp_build_iround(&f32_bld, tmp); > >> +} > >> +#endif > > > > Otherwise looks good. I'm trusting you on the maths side though! > The math is funky I don't even trust it! For some reason though it seems to > give the right results :-). > > > Roland > > > > Jose > > > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev