On Mon, Jan 21, 2013 at 12:49 AM, Chad Versace <chad.vers...@linux.intel.com> wrote: > Lower them to arithmetic and bit manipulation expressions. > > v2: > - Rewrite using ir_builder. [for idr] > - In lowering packHalf2x16, don't truncate subnormal float16 values to zero. > And round to even rather than to zero. [for stereotype441] > > CC: Ian Romanick <i...@freedesktop.org> > CC: Paul Berry <stereotype...@gmail.com> > Signed-off-by: Chad Versace <chad.vers...@linux.intel.com> > --- > src/glsl/Makefile.sources | 1 + > src/glsl/ir_optimization.h | 20 + > src/glsl/lower_packing_builtins.cpp | 1043 > +++++++++++++++++++++++++++++++++++ > 3 files changed, 1064 insertions(+) > create mode 100644 src/glsl/lower_packing_builtins.cpp > > diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources > index 2227c5e..ef71715 100644 > --- a/src/glsl/Makefile.sources > +++ b/src/glsl/Makefile.sources > @@ -59,6 +59,7 @@ LIBGLSL_FILES = \ > $(GLSL_SRCDIR)/lower_mat_op_to_vec.cpp \ > $(GLSL_SRCDIR)/lower_noise.cpp \ > $(GLSL_SRCDIR)/lower_packed_varyings.cpp \ > + $(GLSL_SRCDIR)/lower_packing_builtins.cpp \ > $(GLSL_SRCDIR)/lower_texture_projection.cpp \ > $(GLSL_SRCDIR)/lower_variable_index_to_cond_assign.cpp \ > $(GLSL_SRCDIR)/lower_vec_index_to_cond_assign.cpp \ > diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h > index 6b95191..ac90b87 100644 > --- a/src/glsl/ir_optimization.h > +++ b/src/glsl/ir_optimization.h > @@ -37,6 +37,25 @@ > #define MOD_TO_FRACT 0x20 > #define INT_DIV_TO_MUL_RCP 0x40 > > +/** > + * \see class lower_packing_builtins_visitor > + */ > +enum lower_packing_builtins_op { > + LOWER_PACK_UNPACK_NONE = 0x0000, > + > + LOWER_PACK_SNORM_2x16 = 0x0001, > + LOWER_UNPACK_SNORM_2x16 = 0x0002, > + > + LOWER_PACK_UNORM_2x16 = 0x0004, > + LOWER_UNPACK_UNORM_2x16 = 0x0008, > + > + LOWER_PACK_HALF_2x16 = 0x0010, > + LOWER_UNPACK_HALF_2x16 = 0x0020, > + > + LOWER_PACK_HALF_2x16_TO_SPLIT = 0x0040, > + LOWER_UNPACK_HALF_2x16_TO_SPLIT = 0x0080, > 
+}; > + > bool do_common_optimization(exec_list *ir, bool linked, > bool uniform_locations_assigned, > unsigned max_unroll_iterations); > @@ -74,6 +93,7 @@ bool lower_variable_index_to_cond_assign(exec_list > *instructions, > bool lower_quadop_vector(exec_list *instructions, bool dont_lower_swz); > bool lower_clip_distance(gl_shader *shader); > void lower_output_reads(exec_list *instructions); > +bool lower_packing_builtins(exec_list *instructions, int op_mask); > void lower_ubo_reference(struct gl_shader *shader, exec_list *instructions); > void lower_packed_varyings(void *mem_ctx, unsigned location_base, > unsigned locations_used, ir_variable_mode mode, > diff --git a/src/glsl/lower_packing_builtins.cpp > b/src/glsl/lower_packing_builtins.cpp > new file mode 100644 > index 0000000..49176cc > --- /dev/null > +++ b/src/glsl/lower_packing_builtins.cpp > @@ -0,0 +1,1043 @@ > +/* > + * Copyright © 2012 Intel Corporation > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice (including the next > + * paragraph) shall be included in all copies or substantial portions of the > + * Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > + * DEALINGS IN THE SOFTWARE. > + */ > + > +#include "ir.h" > +#include "ir_builder.h" > +#include "ir_optimization.h" > +#include "ir_rvalue_visitor.h" > + > +namespace { > + > +using namespace ir_builder; > + > +/** > + * A visitor that lowers built-in floating-point pack/unpack expressions > + * such packSnorm2x16. > + */ > +class lower_packing_builtins_visitor : public ir_rvalue_visitor { > +public: > + /** > + * \param op_mask is a bitmask of `enum lower_packing_builtins_op` > + */ > + explicit lower_packing_builtins_visitor(int op_mask) > + : op_mask(op_mask), > + progress(false) > + { > + /* Mutually exclusive options. */ > + assert(!((op_mask & LOWER_PACK_HALF_2x16) && > + (op_mask & LOWER_PACK_HALF_2x16_TO_SPLIT))); > + > + assert(!((op_mask & LOWER_UNPACK_HALF_2x16) && > + (op_mask & LOWER_UNPACK_HALF_2x16_TO_SPLIT))); > + } > + > + virtual ~lower_packing_builtins_visitor() > + { > + assert(factory.instructions == NULL || > + factory.instructions->is_empty()); > + > + delete factory.instructions; > + } > + > + bool get_progress() { return progress; } > + > + void handle_rvalue(ir_rvalue **rvalue) > + { > + if (!*rvalue) > + return; > + > + ir_expression *expr = (*rvalue)->as_expression(); > + if (!expr) > + return; > + > + enum lower_packing_builtins_op lowering_op = > + choose_lowering_op(expr->operation); > + > + if (lowering_op == LOWER_PACK_UNPACK_NONE) > + return; > + > + setup_factory(ralloc_parent(expr)); > + > + ir_rvalue *op0 = expr->operands[0]; > + ralloc_steal(factory.mem_ctx, op0); > + > + switch (lowering_op) { > + case LOWER_PACK_SNORM_2x16: > + *rvalue = lower_pack_snorm_2x16(op0); > + break; > + case LOWER_PACK_UNORM_2x16: > + *rvalue = lower_pack_unorm_2x16(op0); > + break; > + case 
LOWER_PACK_HALF_2x16: > + *rvalue = lower_pack_half_2x16(op0); > + break; > + case LOWER_PACK_HALF_2x16_TO_SPLIT: > + *rvalue = split_pack_half_2x16(op0); > + break; > + case LOWER_UNPACK_SNORM_2x16: > + *rvalue = lower_unpack_snorm_2x16(op0); > + break; > + case LOWER_UNPACK_UNORM_2x16: > + *rvalue = lower_unpack_unorm_2x16(op0); > + break; > + case LOWER_UNPACK_HALF_2x16: > + *rvalue = lower_unpack_half_2x16(op0); > + break; > + case LOWER_UNPACK_HALF_2x16_TO_SPLIT: > + *rvalue = split_unpack_half_2x16(op0); > + break; > + case LOWER_PACK_UNPACK_NONE: > + assert(!"not reached"); > + break; > + } > + > + teardown_factory(); > + progress = true; > + } > + > +private: > + const int op_mask; > + bool progress; > + ir_factory factory; > + > + /** > + * Determine the needed lowering operation by filtering \a expr_op > + * through \ref op_mask. > + */ > + enum lower_packing_builtins_op > + choose_lowering_op(ir_expression_operation expr_op) > + { > + /* C++ regards int and enum as fundamentally different types. > + * So, we can't simply return from each case; we must cast the return > + * value. 
> + */ > + int result; > + > + switch (expr_op) { > + case ir_unop_pack_snorm_2x16: > + result = op_mask & LOWER_PACK_SNORM_2x16; > + break; > + case ir_unop_pack_unorm_2x16: > + result = op_mask & LOWER_PACK_UNORM_2x16; > + break; > + case ir_unop_pack_half_2x16: > + result = op_mask & (LOWER_PACK_HALF_2x16 | > LOWER_PACK_HALF_2x16_TO_SPLIT); > + break; > + case ir_unop_unpack_snorm_2x16: > + result = op_mask & LOWER_UNPACK_SNORM_2x16; > + break; > + case ir_unop_unpack_unorm_2x16: > + result = op_mask & LOWER_UNPACK_UNORM_2x16; > + break; > + case ir_unop_unpack_half_2x16: > + result = op_mask & (LOWER_UNPACK_HALF_2x16 | > LOWER_UNPACK_HALF_2x16_TO_SPLIT); > + break; > + default: > + result = LOWER_PACK_UNPACK_NONE; > + break; > + } > + > + return static_cast<enum lower_packing_builtins_op>(result); > + } > + > + void > + setup_factory(void *mem_ctx) > + { > + assert(factory.mem_ctx == NULL); > + factory.mem_ctx = mem_ctx; > + > + /* Avoid making a new list for each call to handle_rvalue(). Make a > + * single list and reuse it. > + */ > + if (factory.instructions == NULL) { > + factory.instructions = new(NULL) exec_list(); > + } else { > + assert(factory.instructions->is_empty()); > + } > + } > + > + void > + teardown_factory() > + { > + base_ir->insert_before(factory.instructions); > + assert(factory.instructions->is_empty()); > + factory.mem_ctx = NULL; > + } > + > + template <typename T> > + ir_constant* > + constant(T x) > + { > + return factory.constant(x); > + } > + > + /** > + * \brief Pack two uint16's into a single uint32. > + * > + * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32 > + * where the least significant bits specify the first element of the pair. > + * Return the uint32. 
> + */ > + ir_rvalue* > + pack_uvec2_to_uint(ir_rvalue *uvec2_rval) > + { > + assert(uvec2_rval->type == glsl_type::uvec2_type); > + > + /* uvec2 u = UVEC2_RVAL; */ > + ir_variable *u = factory.make_temp(glsl_type::uvec2_type, > + "tmp_pack_uvec2_to_uint"); > + factory.emit(assign(u, uvec2_rval)); > + > + /* return (u.y << 16) | (u.x & 0xffff); */ > + return bit_or(lshift(swizzle_y(u), constant(16u)), > + bit_and(swizzle_x(u), constant(0xffffu))); > + } > + > + /** > + * \brief Unpack a uint32 into two uint16's. > + * > + * Interpret the given uint32 as a uint16 pair where the uint32's least > + * significant bits specify the pair's first element. Return the uint16 > + * pair as a uvec2. > + */ > + ir_rvalue* > + unpack_uint_to_uvec2(ir_rvalue *uint_rval) > + { > + assert(uint_rval->type == glsl_type::uint_type); > + > + /* uint u = UINT_RVAL; */ > + ir_variable *u = factory.make_temp(glsl_type::uint_type, > + "tmp_unpack_uint_to_uvec2_u"); > + factory.emit(assign(u, uint_rval)); > + > + /* uvec2 u2; */ > + ir_variable *u2 = factory.make_temp(glsl_type::uvec2_type, > + "tmp_unpack_uint_to_uvec2_u2"); > + > + /* u2.x = u & 0xffffu; */ > + factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X)); > + > + /* u2.y = u >> 16u; */ > + factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y)); > + > + return deref(u2).val; > + } > + > + /** > + * \brief Lower a packSnorm2x16 expression. > + * > + * \param vec2_rval is packSnorm2x16's input > + * \return packSnorm2x16's output as a uint rvalue > + */ > + ir_rvalue* > + lower_pack_snorm_2x16(ir_rvalue *vec2_rval) > + { > + /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec: > + * > + * hihgp uint packSnorm2x16(vec2 v)
high. > + * -------------------------------- > + * First, converts each component of the normalized floating-point > value > + * v into 16-bit integer values. Then, the results are packed into > the > + * returned 32-bit unsigned integer. > + * > + * The conversion for component c of v to fixed point is done as > + * follows: > + * > + * packSnorm2x16: round(clamp(c, -1, +1) * 32767.0) > + * > + * The first component of the vector will be written to the least > + * significant bits of the output; the last component will be > written to > + * the most significant bits. > + * > + * This function generates IR that approximates the following > pseudo-GLSL: > + * > + * return pack_uvec2_to_uint( > + * uvec2(ivec2( > + * round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f)))); > + * > + * It is necessary to first convert the vec2 to ivec2 rather than > directly > + * converting vec2 to uvec2 because the latter conversion is undefined. > + * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined > to > + * convert a negative floating point value to an uint". > + */ > + assert(vec2_rval->type == glsl_type::vec2_type); > + > + ir_rvalue *result = pack_uvec2_to_uint( > + i2u(f2i(round_even(mul(clamp(vec2_rval, > + constant(-1.0f), > + constant(1.0f)), > + constant(32767.0f)))))); > + > + assert(result->type == glsl_type::uint_type); > + return result; > + } > + > + /** > + * \brief Lower an unpackSnorm2x16 expression. > + * > + * \param uint_rval is unpackSnorm2x16's input > + * \return unpackSnorm2x16's output as a vec2 rvalue > + */ > + ir_rvalue* > + lower_unpack_snorm_2x16(ir_rvalue *uint_rval) > + { > + /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec: > + * > + * highp vec2 unpackSnorm2x16 (highp uint p) > + * ----------------------------------------- > + * First, unpacks a single 32-bit unsigned integer p into a pair of > + * 16-bit unsigned integers. 
Then, each component is converted to > + * a normalized floating-point value to generate the returned > + * two-component vector. > + * > + * The conversion for unpacked fixed-point value f to floating > point is > + * done as follows: > + * > + * unpackSnorm2x16: clamp(f / 32767.0, -1,+1) > + * > + * The first component of the returned vector will be extracted > from the > + * least significant bits of the input; the last component will be > + * extracted from the most significant bits. > + * > + * This function generates IR that approximates the following > pseudo-GLSL: > + * > + * return clamp( > + * ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / > 32767.0f, > + * -1.0f, 1.0f); > + * > + * The above IR may appear unnecessarily complex, but the intermediate > + * conversion to ivec2 and the bit shifts are necessary to correctly > unpack > + * negative floats. > + * > + * To see why, consider packing and then unpacking vec2(-1.0, 0.0). > + * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we > + * place that int16 into an int32, which results in the *positive* > integer > + * 0x0000ffff. The int16's sign bit becomes, in the int32, the rather > + * unimportant bit 16. We must now extend the int16's sign bit into > bits > + * 17-32, which is accomplished by left-shifting then right-shifting. > + */ > + > + assert(uint_rval->type == glsl_type::uint_type); > + > + ir_rvalue *result = > + clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)), > + constant(16)), constant(16u) instead of constant(16)? All other cases use 16u. > + constant(16u))), > + constant(32767.0f)), > + constant(-1.0f), > + constant(1.0f)); > + > + assert(result->type == glsl_type::vec2_type); > + return result; > + } > + > + /** > + * \brief Lower a packUnorm2x16 expression. > + * > + * \param vec2_rval is packUnorm2x16's input > + * \return packUnorm2x16's as a uint rvalue Missing the word 'output'. 
> + */ > + ir_rvalue* > + lower_pack_unorm_2x16(ir_rvalue *vec2_rval) > + { > + /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec: > + * > + * highp uint packUnorm2x16 (vec2 v) > + * --------------------------------- > + * First, converts each component of the normalized floating-point > value > + * v into 16-bit integer values. Then, the results are packed into > the > + * returned 32-bit unsigned integer. > + * > + * The conversion for component c of v to fixed point is done as > + * follows: > + * > + * packUnorm2x16: round(clamp(c, 0, +1) * 65535.0) > + * > + * The first component of the vector will be written to the least > + * significant bits of the output; the last component will be > written to > + * the most significant bits. > + * > + * This function generates IR that approximates the following > pseudo-GLSL: > + * > + * return pack_uvec2_to_uint(uvec2( > + * round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f))); > + * > + * Here it is safe to directly convert the vec2 to uvec2 because the > + * vec2 has been clamped to a non-negative range. > + */ > + > + assert(vec2_rval->type == glsl_type::vec2_type); > + > + ir_rvalue *result = pack_uvec2_to_uint( > + f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f))))); > + > + assert(result->type == glsl_type::uint_type); > + return result; > + } > + > + /** > + * \brief Lower an unpackUnorm2x16 expression. > + * > + * \param uint_rval is unpackUnorm2x16's input > + * \return unpackUnorm2x16's output as a vec2 rvalue > + */ > + ir_rvalue* > + lower_unpack_unorm_2x16(ir_rvalue *uint_rval) > + { > + /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec: > + * > + * highp vec2 unpackUnorm2x16 (highp uint p) > + * ----------------------------------------- > + * First, unpacks a single 32-bit unsigned integer p into a pair of > + * 16-bit unsigned integers. Then, each component is converted to > + * a normalized floating-point value to generate the returned > + * two-component vector. 
> + * > + * The conversion for unpacked fixed-point value f to floating > point is > + * done as follows: > + * > + * unpackUnorm2x16: f / 65535.0 > + * > + * The first component of the returned vector will be extracted > from the > + * least significant bits of the input; the last component will be > + * extracted from the most significant bits. > + * > + * This function generates IR that approximates the following > pseudo-GLSL: > + * > + * return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0; > + */ > + > + assert(uint_rval->type == glsl_type::uint_type); > + > + ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)), > + constant(65535.0f)); > + > + assert(result->type == glsl_type::vec2_type); > + return result; > + } > + > + /** > + * \brief Lower the component-wise calculation of packHalf2x16. > + * > + * \param f_rval is one component of packHalf2x16's input > + * \param e_rval is the unshifted exponent bits of f_rval > + * \param m_rval is the unshifted mantissa bits of f_rval > + * > + * \return a uint rvalue that encodes a float16 in its lower 16 bits > + */ > + ir_rvalue* > + pack_half_1x16_nosign(ir_rvalue *f_rval, > + ir_rvalue *e_rval, > + ir_rvalue *m_rval) > + { > + assert(e_rval->type == glsl_type::uint_type); > + assert(m_rval->type == glsl_type::uint_type); > + > + /* uint u16; */ > + ir_variable *u16 = factory.make_temp(glsl_type::uint_type, > + "tmp_pack_half_1x16_u16"); > + > + /* float f = FLOAT_RVAL; */ > + ir_variable *f = factory.make_temp(glsl_type::float_type, > + "tmp_pack_half_1x16_f"); > + factory.emit(assign(f, f_rval)); > + > + /* uint e = E_RVAL; */ > + ir_variable *e = factory.make_temp(glsl_type::uint_type, > + "tmp_pack_half_1x16_e"); > + factory.emit(assign(e, e_rval)); > + > + /* uint m = M_RVAL; */ > + ir_variable *m = factory.make_temp(glsl_type::uint_type, > + "tmp_pack_half_1x16_m"); > + factory.emit(assign(m, m_rval)); > + > + /* Preliminaries > + * ------------- > + * > + * For a float16, the bit layout is: > 
+ * > + * sign: 15 > + * exponent: 10:14 > + * mantissa: 0:9 > + * > + * Let f16 be a float16 value. The sign, exponent, and mantissa > + * determine its value thus: > + * > + * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 > (1) > + * if e16 = 0 and m16 != 0, then subnormal: (-1)^s16 * 2^(e16 - 14) > * (m16 / 2^10) (2) > + * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) > * (1 + m16 / 2^10) (3) > + * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf > (4) > + * if e16 = 31 and m16 != 0, then NaN > (5) > + * > + * where 0 <= m16 < 2^10. > + * > + * For a float32, the bit layout is: > + * > + * sign: 31 > + * exponent: 23:30 > + * mantissa: 0:22 > + * > + * Let f32 be a float32 value. The sign, exponent, and mantissa > + * determine its value thus: > + * > + * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 > (10) > + * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) > * (m32 / 2^23) (11) > + * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) > * (1 + m32 / 2^23) (12) > + * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf > (13) > + * if e32 = 255 and m32 != 0, then NaN > (14) > + * > + * where 0 <= m32 < 2^23. > + * > + * The minimum and maximum normal float16 values are > + * > + * min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14) (20) > + * max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10) (21) > + * > + * The step at max_norm16 is > + * > + * max_step16 = 2^5 (22) > + * > + * Observe that the float16 boundary values in equations 20-21 lie in > the > + * range of normal float32 values. > + * > + * > + * Rounding Behavior > + * ----------------- > + * Not all float32 values can be exactly represented as a float16. We > + * round all such intermediate float32 values to the nearest float16; > if > + * the float32 is exactly between two float16 values, we round to the > one > + * with an even mantissa. This rounding behavior has several benefits: > + * > + * - It has no sign bias. 
> + * > + * - It reproduces the behavior of real hardware: opcode F32TO16 in > Intel's > + * GPU ISA. > + * > + * - By reproducing the behavior of the GPU (at least on Intel > hardware), > + * compile-time evaluation of constant packHalf2x16 GLSL > expressions will > + * result in the same value as if the expression were executed on > the > + * GPU. > + * > + * Calculation > + * ----------- > + * Our task is to compute s16, e16, m16 given f32. Since this function > + * ignores the sign bit, assume that s32 = s16 = 0. There are several > + * cases consider. > + */ > + > + factory.emit( > + > + /* Case 1) f32 is NaN > + * > + * The resultant f16 will also be NaN. > + */ > + > + /* if (e32 == 255 && m32 != 0) { */ > + if_tree(logic_and(equal(e, constant(0xffu << 23u)), > + logic_not(equal(m, constant(0u)))), > + > + assign(u16, constant(0x7fffu)), > + > + /* Case 2) f32 lies in the range [0, min_norm16). > + * > + * The resultant float16 will be either zero, subnormal, or > normal. > + * > + * Solving > + * > + * f32 = min_norm16 (30) > + * > + * gives > + * > + * e32 = 113 and m32 = 0 (31) > + * > + * Therefore this case occurs if and only if > + * > + * e32 < 113 (32) > + */ > + > + /* } else if (e32 < 113) { */ > + if_tree(less(e, constant(113u << 23u)), > + > + /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */ > + assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f), > + constant((float) (1 << 24)))))), > + > + /* Case 3) f32 lies in the range > + * [min_norm16, max_norm16 + max_step16). > + * > + * The resultant float16 will be either normal or infinite. > + * > + * Solving > + * > + * f32 = max_norm16 + max_step16 (40) > + * = 2^15 * (1 + 1023 / 2^10) + 2^5 (41) > + * = 2^16 (42) > + * gives > + * > + * e32 = 142 and m32 = 0 (43) > + * > + * We already solved the boundary condition f32 = min_norm16 above > + * in equation 31. 
Therefore this case occurs if and only if > + * > + * 113 <= e32 and e32 < 142 > + */ > + > + /* } else if (e32 < 142) { */ > + if_tree(lequal(e, constant(142u << 23u)), > + > + /* The addition below handles the case where the mantissa rounds > + * up to 1024 and bumps the exponent. > + * > + * u16 = ((e - (112u << 23u)) >> 13u) > + * + round_to_even((float(m) / (1u << 13u)); > + */ > + assign(u16, add(rshift(sub(e, constant(112u << 23u)), > + constant(13u)), > + f2u(round_even( > + div(u2f(m), constant((float) (1 << > 13))))))), > + > + /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf]. > + * > + * The resultant float16 will be infinite. > + * > + * The cases above caught all float32 values in the range > + * [0, max_norm16 + max_step16), so this is the fall-through case. > + */ > + > + /* } else { */ > + > + assign(u16, constant(31u << 10u)))))); > + > + /* } */ > + > + return deref(u16).val; > + } > + > + /** > + * \brief Lower a packHalf2x16 expression. > + * > + * \param vec2_rval is packHalf2x16's input > + * \return packHalf2x16's output as a uint rvalue > + */ > + ir_rvalue* > + lower_pack_half_2x16(ir_rvalue *vec2_rval) > + { > + /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec: > + * > + * highp uint packHalf2x16 (mediump vec2 v) > + * ---------------------------------------- > + * Returns an unsigned integer obtained by converting the > components of > + * a two-component floating-point vector to the 16-bit > floating-point > + * representation found in the OpenGL ES Specification, and then > packing > + * these two 16-bit integers into a 32-bit unsigned integer. > + * > + * The first vector component specifies the 16 least- significant > bits > + * of the result; the second component specifies the 16 > most-significant > + * bits. 
> + */ > + > + assert(vec2_rval->type == glsl_type::vec2_type); > + > + /* vec2 f = VEC2_RVAL; */ > + ir_variable *f = factory.make_temp(glsl_type::vec2_type, > + "tmp_pack_half_2x16_f"); > + factory.emit(assign(f, vec2_rval)); > + > + /* uvec2 f32 = bitcast_f2u(f); */ > + ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type, > + "tmp_pack_half_2x16_f32"); > + factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f))); > + > + /* uvec2 f16; */ > + ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type, > + "tmp_pack_half_2x16_f16"); > + > + /* Get f32's unshifted exponent bits. > + * > + * uvec2 e = f32 & 0x7f800000u; > + */ > + ir_variable *e = factory.make_temp(glsl_type::uvec2_type, > + "tmp_pack_half_2x16_e"); > + factory.emit(assign(e, bit_and(f32, constant(0x7f800000u)))); > + > + /* Get f32's unshifted mantissa bits. > + * > + * uvec2 m = f32 & 0x007fffffu; > + */ > + ir_variable *m = factory.make_temp(glsl_type::uvec2_type, > + "tmp_pack_half_2x16_m"); > + factory.emit(assign(m, bit_and(f32, constant(0x007fffffu)))); > + > + /* Set f16's exponent and mantissa bits. > + * > + * f16.x = pack_half_1x16_nosign(e.x, m.x); > + * f16.y = pack_half_1y16_nosign(e.y, m.y); > + */ > + factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f), > + swizzle_x(e), > + swizzle_x(m)), > + WRITEMASK_X)); > + factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f), > + swizzle_y(e), > + swizzle_y(m)), > + WRITEMASK_Y)); > + > + /* Set f16's sign bits. > + * > + * f16 |= (f32 & (1u << 31u) >> 16u; > + */ > + factory.emit( > + assign(f16, bit_or(f16, > + rshift(bit_and(f32, constant(1u << 31u)), > + constant(16u))))); > + > + > + /* return (f16.y << 16u) | f16.x; */ > + ir_rvalue *result = bit_or(lshift(swizzle_y(f16), > + constant(16u)), > + swizzle_x(f16)); > + > + assert(result->type == glsl_type::uint_type); > + return result; > + } > + > + /** > + * \brief Split packHalf2x16's vec2 operand into two floats. 
> + * > + * \param vec2_rval is packHalf2x16's input > + * \return a uint rvalue > + * > + * Some code generators, such as the i965 fragment shader, require that > all > + * vector expressions be lowered to a sequence of scalar expressions. > + * However, packHalf2x16 cannot be scalarized by the same mechanism as > + * a true vector operation because its input and output have a differing > + * number of vector components. > + * > + * This method scalarizes packHalf2x16 by transforming it from an unary > + * operation having vector input to a binary operation having scalar > input. > + * That is, it transforms > + * > + * packHalf2x16(VEC2_RVAL); > + * > + * into > + * > + * vec2 v = VEC2_RVAL; > + * return packHalf2x16_split(v.x, v.y); > + */ > + ir_rvalue* > + split_pack_half_2x16(ir_rvalue *vec2_rval) > + { > + assert(vec2_rval->type == glsl_type::vec2_type); > + > + ir_variable *v = factory.make_temp(glsl_type::vec2_type, > + "tmp_split_pack_half_2x16_v"); > + factory.emit(assign(v, vec2_rval)); > + > + return expr(ir_binop_pack_half_2x16_split, swizzle_x(v), swizzle_y(v)); > + } > + > + /** > + * \brief Lower the component-wise calculation of unpackHalf2x16. > + * > + * Given a uint that encodes a float16 in its lower 16 bits, this function > + * returns a uint that encodes a float32 with the same value. The sign bit > + * of the float16 is ignored. 
> + * > + * \param e_rval is the unshifted exponent bits of a float16 > + * \param m_rval is the unshifted mantissa bits of a float16 > + * \return a uint rvalue that encodes a float32 > + */ > + ir_rvalue* > + unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval) > + { > + assert(e_rval->type == glsl_type::uint_type); > + assert(m_rval->type == glsl_type::uint_type); > + > + /* uint u32; */ > + ir_variable *u32 = factory.make_temp(glsl_type::uint_type, > + "tmp_unpack_half_1x16_u32"); > + > + /* uint e = E_RVAL; */ > + ir_variable *e = factory.make_temp(glsl_type::uint_type, > + "tmp_unpack_half_1x16_e"); > + factory.emit(assign(e, e_rval)); > + > + /* uint m = M_RVAL; */ > + ir_variable *m = factory.make_temp(glsl_type::uint_type, > + "tmp_unpack_half_1x16_m"); > + factory.emit(assign(m, m_rval)); > + > + /* Preliminaries > + * ------------- > + * > + * For a float16, the bit layout is: > + * > + * sign: 15 > + * exponent: 10:14 > + * mantissa: 0:9 > + * > + * Let f16 be a float16 value. The sign, exponent, and mantissa > + * determine its value thus: > + * > + * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 > (1) > + * if e16 = 0 and m16!= 0, then subnormal: (-1)^s16 * 2^(e16 - 14) > * (m16 / 2^10) (2) > + * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) > * (1 + m16 / 2^10) (3) > + * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf > (4) > + * if e16 = 31 and m16 != 0, then NaN > (5) > + * > + * where 0 <= m16 < 2^10. > + * > + * For a float32, the bit layout is: > + * > + * sign: 31 > + * exponent: 23:30 > + * mantissa: 0:22 > + * > + * Let f32 be a float32 value. 
The sign, exponent, and mantissa > + * determine its value thus: > + * > + * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 > (10) > + * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) > * (m32 / 2^23) (11) > + * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) > * (1 + m32 / 2^23) (12) > + * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf > (13) > + * if e32 = 255 and m32 != 0, then NaN > (14) > + * > + * where 0 <= m32 < 2^23. > + * > + * Calculation > + * ----------- > + * Our task is to compute s32, e32, m32 given f16. Since this function > + * ignores the sign bit, assume that s32 = s16 = 0. There are several > + * cases to consider. > + */ > + > + factory.emit( > + > + /* Case 1) f16 is zero or subnormal. > + * > + * The simplest method of calculating f32 in this case is > + * > + * f32 = f16 (20) > + * = 2^(-14) * (m16 / 2^10) (21) > + * = m16 / 2^24 (22) > + */ > + > + /* if (e16 == 0) { */ > + if_tree(equal(e, constant(0u)), > + > + /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */ > + assign(u32, expr(ir_unop_bitcast_f2u, > + div(u2f(m), constant((float)(1 << 24))))), > + > + /* Case 2) f16 is normal. > + * > + * The equation > + * > + * f32 = f16 (30) > + * 2^(e32 - 127) * (1 + m32 / 2^23) = (31) > + * 2^(e16 - 15) * (1 + m16 / 2^10) > + * > + * can be decomposed into two > + * > + * 2^(e32 - 127) = 2^(e16 - 15) (32) > + * 1 + m32 / 2^23 = 1 + m16 / 2^10 (33) > + * > + * which solve to > + * > + * e32 = e16 + 112 (34) > + * m32 = m16 * 2^13 (35) > + */ > + > + /* } else if (e16 < 31) { */ > + if_tree(less(e, constant(31u << 10u)), > + > + /* u32 = ((e << 13) + (112 << 23)) > + * | (m << 13); > + */ > + assign(u32, bit_or(add(lshift(e, constant(13u)), > + constant(112u << 23u)), > + lshift(m, constant(13u)))), > + > + /* Case 3) f16 is infinite. */ > + if_tree(equal(m, constant(0u)), > + > + assign(u32, constant(255u << 23u)), > + > + /* Case 4) f16 is NaN. 
*/ > + /* } else { */ > + > + assign(u32, constant(0x7fffffffu)))))); > + > + /* } */ > + > + return deref(u32).val; > + } > + > + /** > + * \brief Lower an unpackHalf2x16 expression. > + * > + * \param uint_rval is unpackHalf2x16's input > + * \return unpackHalf2x16's output as a vec2 rvalue > + */ > + ir_rvalue* > + lower_unpack_half_2x16(ir_rvalue *uint_rval) > + { > + /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec: > + * > + * mediump vec2 unpackHalf2x16 (highp uint v) > + * ------------------------------------------ > + * Returns a two-component floating-point vector with components > + * obtained by unpacking a 32-bit unsigned integer into a pair of > 16-bit > + * values, interpreting those values as 16-bit floating-point > numbers > + * according to the OpenGL ES Specification, and converting them to > + * 32-bit floating-point values. > + * > + * The first component of the vector is obtained from the > + * 16 least-significant bits of v; the second component is obtained > + * from the 16 most-significant bits of v. > + */ > + assert(uint_rval->type == glsl_type::uint_type); > + > + /* uint u = RVALUE; > + * uvec2 f16 = uvec2(u.x & 0xffff, u.y >> 16); > + */ > + ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type, > + "tmp_unpack_half_2x16_f16"); > + factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval))); > + > + /* uvec2 f32; */ > + ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type, > + "tmp_unpack_half_2x16_f32"); > + > + /* Get f16's unshifted exponent bits. > + * > + * uvec2 e = f16 & 0x7c00u; > + */ > + ir_variable *e = factory.make_temp(glsl_type::uvec2_type, > + "tmp_unpack_half_2x16_e"); > + factory.emit(assign(e, bit_and(f16, constant(0x7c00u)))); > + > + /* Get f16's unshifted mantissa bits. 
> + * > + * uvec2 m = f16 & 0x03ffu; > + */ > + ir_variable *m = factory.make_temp(glsl_type::uvec2_type, > + "tmp_unpack_half_2x16_m"); > + factory.emit(assign(m, bit_and(f16, constant(0x03ffu)))); > + > + /* Set f32's exponent and mantissa bits. > + * > + * f32.x = unpack_half_1x16_nosign(e.x, m.x); > + * f32.y = unpack_half_1x16_nosign(e.y, m.y); > + */ > + factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e), > + swizzle_x(m)), > + WRITEMASK_X)); > + factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e), > + swizzle_y(m)), > + WRITEMASK_Y)); > + > + /* Set f32's sign bit. > + * > + * f32 |= (f16 & 0x8000u) << 16u; > + */ > + factory.emit(assign(f32, bit_or(f32, > + lshift(bit_and(f16, > + constant(0x8000u)), > + constant(16u))))); > + > + /* return bitcast_u2f(f32); */ > + ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32); > + assert(result->type == glsl_type::vec2_type); > + return result; > + } > + > + /** > + * \brief Split unpackHalf2x16 into two operations. > + * > + * \param uint_rval is unpackHalf2x16's input > + * \return a vec2 rvalue > + * > + * Some code generators, such as the i965 fragment shader, require that > all > + * vector expressions be lowered to a sequence of scalar expressions. > + * However, unpackHalf2x16 cannot be scalarized by the same method as > + * a true vector operation because the number of components of its input > + * and output differ. > + * > + * This method scalarizes unpackHalf2x16 by transforming it from a single > + * operation having vec2 output to a pair of operations each having float > + * output. 
That is, it transforms > + * > + * unpackHalf2x16(UINT_RVAL) > + * > + * into > + * > + * uint u = UINT_RVAL; > + * vec2 v; > + * > + * v.x = unpackHalf2x16_split_x(u); > + * v.y = unpackHalf2x16_split_y(u); > + * > + * return v; > + */ > + ir_rvalue* > + split_unpack_half_2x16(ir_rvalue *uint_rval) > + { > + assert(uint_rval->type == glsl_type::uint_type); > + > + /* uint u = uint_rval; */ > + ir_variable *u = factory.make_temp(glsl_type::uint_type, > + "tmp_split_unpack_half_2x16_u"); > + factory.emit(assign(u, uint_rval)); > + > + /* vec2 v; */ > + ir_variable *v = factory.make_temp(glsl_type::vec2_type, > + "tmp_split_unpack_half_2x16_v"); > + > + /* v.x = unpack_half_2x16_split_x(u); */ > + factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_x, u), > + WRITEMASK_X)); > + > + /* v.y = unpack_half_2x16_split_y(u); */ > + factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_y, u), > + WRITEMASK_Y)); > + > + return deref(v).val; > + } > +}; > + > +} // namespace anonymous > + > +/** > + * \brief Lower the builtin packing functions. > + * > + * \param op_mask is a bitmask of `enum lower_packing_builtins_op`. > + */ > +bool > +lower_packing_builtins(exec_list *instructions, int op_mask) > +{ > + lower_packing_builtins_visitor v(op_mask); > + visit_list_elements(&v, instructions, true); > + return v.get_progress(); > +} > -- > 1.8.1.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev