On 24 January 2013 19:47, Matt Turner <matts...@gmail.com> wrote: > Lower them to arithmetic and bit manipulation expressions. > --- > src/glsl/ir_optimization.h | 6 + > src/glsl/lower_packing_builtins.cpp | 279 > +++++++++++++++++++++++++++++++++++ > 2 files changed, 285 insertions(+), 0 deletions(-) > > diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h > index ac90b87..8f33018 100644 > --- a/src/glsl/ir_optimization.h > +++ b/src/glsl/ir_optimization.h > @@ -54,6 +54,12 @@ enum lower_packing_builtins_op { > > LOWER_PACK_HALF_2x16_TO_SPLIT = 0x0040, > LOWER_UNPACK_HALF_2x16_TO_SPLIT = 0x0080, > + > + LOWER_PACK_SNORM_4x8 = 0x0100, > + LOWER_UNPACK_SNORM_4x8 = 0x0200, > + > + LOWER_PACK_UNORM_4x8 = 0x0400, > + LOWER_UNPACK_UNORM_4x8 = 0x0800, > }; > > bool do_common_optimization(exec_list *ir, bool linked, > diff --git a/src/glsl/lower_packing_builtins.cpp > b/src/glsl/lower_packing_builtins.cpp > index 49176cc..aa6765f 100644 > --- a/src/glsl/lower_packing_builtins.cpp > +++ b/src/glsl/lower_packing_builtins.cpp > @@ -85,9 +85,15 @@ public: > case LOWER_PACK_SNORM_2x16: > *rvalue = lower_pack_snorm_2x16(op0); > break; > + case LOWER_PACK_SNORM_4x8: > + *rvalue = lower_pack_snorm_4x8(op0); > + break; > case LOWER_PACK_UNORM_2x16: > *rvalue = lower_pack_unorm_2x16(op0); > break; > + case LOWER_PACK_UNORM_4x8: > + *rvalue = lower_pack_unorm_4x8(op0); > + break; > case LOWER_PACK_HALF_2x16: > *rvalue = lower_pack_half_2x16(op0); > break; > @@ -97,9 +103,15 @@ public: > case LOWER_UNPACK_SNORM_2x16: > *rvalue = lower_unpack_snorm_2x16(op0); > break; > + case LOWER_UNPACK_SNORM_4x8: > + *rvalue = lower_unpack_snorm_4x8(op0); > + break; > case LOWER_UNPACK_UNORM_2x16: > *rvalue = lower_unpack_unorm_2x16(op0); > break; > + case LOWER_UNPACK_UNORM_4x8: > + *rvalue = lower_unpack_unorm_4x8(op0); > + break; > case LOWER_UNPACK_HALF_2x16: > *rvalue = lower_unpack_half_2x16(op0); > break; > @@ -137,18 +149,30 @@ private: > case ir_unop_pack_snorm_2x16: > 
result = op_mask & LOWER_PACK_SNORM_2x16; > break; > + case ir_unop_pack_snorm_4x8: > + result = op_mask & LOWER_PACK_SNORM_4x8; > + break; > case ir_unop_pack_unorm_2x16: > result = op_mask & LOWER_PACK_UNORM_2x16; > break; > + case ir_unop_pack_unorm_4x8: > + result = op_mask & LOWER_PACK_UNORM_4x8; > + break; > case ir_unop_pack_half_2x16: > result = op_mask & (LOWER_PACK_HALF_2x16 | > LOWER_PACK_HALF_2x16_TO_SPLIT); > break; > case ir_unop_unpack_snorm_2x16: > result = op_mask & LOWER_UNPACK_SNORM_2x16; > break; > + case ir_unop_unpack_snorm_4x8: > + result = op_mask & LOWER_UNPACK_SNORM_4x8; > + break; > case ir_unop_unpack_unorm_2x16: > result = op_mask & LOWER_UNPACK_UNORM_2x16; > break; > + case ir_unop_unpack_unorm_4x8: > + result = op_mask & LOWER_UNPACK_UNORM_4x8; > + break; > case ir_unop_unpack_half_2x16: > result = op_mask & (LOWER_UNPACK_HALF_2x16 | > LOWER_UNPACK_HALF_2x16_TO_SPLIT); > break; > @@ -214,6 +238,30 @@ private: > } > > /** > + * \brief Pack four uint8's into a single uint32. > + * > + * Interpret the given uvec4 as a uint32 quad. Pack the quad into a > uint32 > + * where the least significant bits specify the first element of the > quad. > + * Return the uint32. > + */ > + ir_rvalue* > + pack_uvec4_to_uint(ir_rvalue *uvec4_rval) > + { > + assert(uvec4_rval->type == glsl_type::uvec4_type); > + > + /* uvec4 u = UVEC4_RVAL; */ > + ir_variable *u = factory.make_temp(glsl_type::uvec4_type, > + "tmp_pack_uvec4_to_uint"); > + factory.emit(assign(u, uvec4_rval)); >
Rather than do four scalar bit_and(..., constant(0xffu)) instructions below, how about changing the above line to: factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu)))); That way we take advantage of vector processing in the GPU to do all four bit_ands at once. With that fixed (as well as the copy/paste errors Ian spotted), this patch is: Reviewed-by: Paul Berry <stereotype...@gmail.com> > + > + /* return ((u.w & 0xff) << 24) | ((u.z & 0xff) << 16) | ((u.y & 0xff) > << 8) | (u.x & 0xff); */ > + return bit_or(bit_or(lshift(bit_and(swizzle_w(u), constant(0xffu)), > constant(24u)), > + lshift(bit_and(swizzle_z(u), constant(0xffu)), > constant(16u))), > + bit_or(lshift(bit_and(swizzle_y(u), constant(0xffu)), > constant(8u)), > + bit_and(swizzle_x(u), constant(0xffu)))); > + } > + > + /** > * \brief Unpack a uint32 into two uint16's. > * > * Interpret the given uint32 as a uint16 pair where the uint32's least > @@ -244,6 +292,44 @@ private: > } > > /** > + * \brief Unpack a uint32 into four uint8's. > + * > + * Interpret the given uint32 as a uint8 quad where the uint32's least > + * significant bits specify the quad's first element. Return the uint8 > + * quad as a uvec4. 
> + */ > + ir_rvalue* > + unpack_uint_to_uvec4(ir_rvalue *uint_rval) > + { > + assert(uint_rval->type == glsl_type::uint_type); > + > + /* uint u = UINT_RVAL; */ > + ir_variable *u = factory.make_temp(glsl_type::uint_type, > + "tmp_unpack_uint_to_uvec4_u"); > + factory.emit(assign(u, uint_rval)); > + > + /* uvec4 u4; */ > + ir_variable *u4 = factory.make_temp(glsl_type::uvec4_type, > + "tmp_unpack_uint_to_uvec4_u4"); > + > + /* u4.x = u & 0xffu; */ > + factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X)); > + > + /* u4.y = (u >> 8u) & 0xffu; */ > + factory.emit(assign(u4, bit_and(rshift(u, constant(8u)), > + constant(0xffu)), WRITEMASK_Y)); > + > + /* u4.z = (u >> 16u) & 0xffu; */ > + factory.emit(assign(u4, bit_and(rshift(u, constant(16u)), > + constant(0xffu)), WRITEMASK_Z)); > + > + /* u4.w = (u >> 24u) */ > + factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W)); > + > + return deref(u4).val; > + } > + > + /** > * \brief Lower a packSnorm2x16 expression. > * > * \param vec2_rval is packSnorm2x16's input > @@ -293,6 +379,55 @@ private: > } > > /** > + * \brief Lower a packSnorm4x8 expression. > + * > + * \param vec4_rval is packSnorm4x8's input > + * \return packSnorm4x8's output as a uint rvalue > + */ > + ir_rvalue* > + lower_pack_snorm_4x8(ir_rvalue *vec4_rval) > + { > + /* From page 137 (143 of pdf) of the GLSL 4.30 spec: > + * > + * highp uint packSnorm4x8(vec4 v) > + * ------------------------------- > + * First, converts each component of the normalized > floating-point value > + * v into 8-bit integer values. Then, the results are packed > into the > + * returned 32-bit unsigned integer. > + * > + * The conversion for component c of v to fixed point is done as > + * follows: > + * > + * packSnorm4x8: round(clamp(c, -1, +1) * 127.0) > + * > + * The first component of the vector will be written to the least > + * significant bits of the output; the last component will be > written to > + * the most significant bits. 
> + * > + * This function generates IR that approximates the following > pseudo-GLSL: > + * > + * return pack_uvec4_to_uint( > + * uvec4(ivec4( > + * round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f)))); > + * > + * It is necessary to first convert the vec4 to ivec4 rather than > directly > + * converting vec4 to uvec4 because the latter conversion is > undefined. > + * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined > to > + * convert a negative floating point value to an uint". > + */ > + assert(vec4_rval->type == glsl_type::vec4_type); > + > + ir_rvalue *result = pack_uvec4_to_uint( > + i2u(f2i(round_even(mul(clamp(vec4_rval, > + constant(-1.0f), > + constant(1.0f)), > + constant(127.0f)))))); > + > + assert(result->type == glsl_type::uint_type); > + return result; > + } > + > + /** > * \brief Lower an unpackSnorm2x16 expression. > * > * \param uint_rval is unpackSnorm2x16's input > @@ -352,6 +487,65 @@ private: > } > > /** > + * \brief Lower an unpackSnorm4x8 expression. > + * > + * \param uint_rval is unpackSnorm4x8's input > + * \return unpackSnorm4x8's output as a vec4 rvalue > + */ > + ir_rvalue* > + lower_unpack_snorm_4x8(ir_rvalue *uint_rval) > + { > + /* From page 137 (143 of pdf) of the GLSL 4.30 spec: > + * > + * highp vec4 unpackSnorm4x8 (highp uint p) > + * ---------------------------------------- > + * First, unpacks a single 32-bit unsigned integer p into four > + * 8-bit unsigned integers. Then, each component is converted to > + * a normalized floating-point value to generate the returned > + * four-component vector. > + * > + * The conversion for unpacked fixed-point value f to floating > point is > + * done as follows: > + * > + * unpackSnorm4x8: clamp(f / 127.0, -1, +1) > + * > + * The first component of the returned vector will be extracted > from the > + * least significant bits of the input; the last component will > be > + * extracted from the most significant bits. 
> + * > + * This function generates IR that approximates the following > pseudo-GLSL: > + * > + * return clamp( > + * ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / > 127.0f, > + * -1.0f, 1.0f); > + * > + * The above IR may appear unnecessarily complex, but the > intermediate > + * conversion to ivec4 and the bit shifts are necessary to > correctly unpack > + * negative floats. > + * > + * To see why, consider packing and then unpacking vec4(-1.0, 0.0, > 0.0, > + * 0.0). packSnorm4x8 encodes -1.0 as the int8 0x81. During > unpacking, we > + * place that int8 into an int32, which results in the *positive* > integer > + * 0x00000081. The int8's sign bit becomes, in the int32, the > rather > + * unimportant bit 8. We must now extend the int8's sign bit into > bits > + * 9-32, which is accomplished by left-shifting then right-shifting. > + */ > + > + assert(uint_rval->type == glsl_type::uint_type); > + > + ir_rvalue *result = > + clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)), > + constant(24u)), > + constant(24u))), > + constant(127.0f)), > + constant(-1.0f), > + constant(1.0f)); > + > + assert(result->type == glsl_type::vec4_type); > + return result; > + } > + > + /** > * \brief Lower a packUnorm2x16 expression. > * > * \param vec2_rval is packUnorm2x16's input > @@ -396,6 +590,50 @@ private: > } > > /** > + * \brief Lower a packUnorm4x8 expression. > + * > + * \param vec4_rval is packUnorm4x8's input > + * \return packUnorm4x8's output as a uint rvalue > + */ > + ir_rvalue* > + lower_pack_unorm_4x8(ir_rvalue *vec4_rval) > + { > + /* From page 137 (143 of pdf) of the GLSL 4.30 spec: > + * > + * highp uint packUnorm4x8 (vec4 v) > + * -------------------------------- > + * First, converts each component of the normalized > floating-point value > + * v into 8-bit integer values. Then, the results are packed > into the > + * returned 32-bit unsigned integer. 
> + * > + * The conversion for component c of v to fixed point is done as > + * follows: > + * > + * packUnorm4x8: round(clamp(c, 0, +1) * 255.0) > + * > + * The first component of the vector will be written to the least > + * significant bits of the output; the last component will be > written to > + * the most significant bits. > + * > + * This function generates IR that approximates the following > pseudo-GLSL: > + * > + * return pack_uvec4_to_uint(uvec4( > + * round(clamp(VEC4_RVALUE, 0.0f, 1.0f) * 255.0f))); > + * > + * Here it is safe to directly convert the vec4 to uvec4 because > the > + * vec4 has been clamped to a non-negative range. > + */ > + > + assert(vec4_rval->type == glsl_type::vec4_type); > + > + ir_rvalue *result = pack_uvec4_to_uint( > + f2u(round_even(mul(saturate(vec4_rval), constant(255.0f))))); > + > + assert(result->type == glsl_type::uint_type); > + return result; > + } > + > + /** > * \brief Lower an unpackUnorm2x16 expression. > * > * \param uint_rval is unpackUnorm2x16's input > @@ -437,6 +675,47 @@ private: > } > > /** > + * \brief Lower an unpackUnorm4x8 expression. > + * > + * \param uint_rval is unpackUnorm4x8's input > + * \return unpackUnorm4x8's output as a vec4 rvalue > + */ > + ir_rvalue* > + lower_unpack_unorm_4x8(ir_rvalue *uint_rval) > + { > + /* From page 137 (143 of pdf) of the GLSL 4.30 spec: > + * > + * highp vec4 unpackUnorm4x8 (highp uint p) > + * ---------------------------------------- > + * First, unpacks a single 32-bit unsigned integer p into four > + * 8-bit unsigned integers. Then, each component is converted to > + * a normalized floating-point value to generate the returned > + * four-component vector. 
> + * > + * The conversion for unpacked fixed-point value f to floating > point is > + * done as follows: > + * > + * unpackUnorm4x8: f / 255.0 > + * > + * The first component of the returned vector will be extracted > from the > + * least significant bits of the input; the last component will > be > + * extracted from the most significant bits. > + * > + * This function generates IR that approximates the following > pseudo-GLSL: > + * > + * return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0; > + */ > + > + assert(uint_rval->type == glsl_type::uint_type); > + > + ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)), > + constant(255.0f)); > + > + assert(result->type == glsl_type::vec4_type); > + return result; > + } > + > + /** > * \brief Lower the component-wise calculation of packHalf2x16. > * > * \param f_rval is one component of packHafl2x16's input > -- > 1.7.8.6 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev