This patch defines the SQRT instruction as TGSI_OUTPUT_REPLICATE (REPL in the opcode_info table), but the implementation appears to be component-wise, which is much slower (4 sqrt calls instead of 1).
For example, r600g implements such instructions by computing func(src.x) in a temporary register and then replicating the result in the destination register. The same applies to other instructions marked as REPL. tgsi_exec seems to be implemented rather inefficiently. Marek On Fri, Feb 1, 2013 at 7:29 PM, Brian Paul <bri...@vmware.com> wrote: > --- > src/gallium/auxiliary/tgsi/tgsi_exec.c | 14 ++++++++++++++ > src/gallium/auxiliary/tgsi/tgsi_exec.h | 2 ++ > src/gallium/auxiliary/tgsi/tgsi_info.c | 2 +- > src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h | 1 + > 4 files changed, 18 insertions(+), 1 deletions(-) > > diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c > b/src/gallium/auxiliary/tgsi/tgsi_exec.c > index 9f226c4..1220478 100644 > --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c > +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c > @@ -343,6 +343,16 @@ micro_rsq(union tgsi_exec_channel *dst, > } > > static void > +micro_sqrt(union tgsi_exec_channel *dst, > + const union tgsi_exec_channel *src) > +{ > + dst->f[0] = sqrtf(fabsf(src->f[0])); > + dst->f[1] = sqrtf(fabsf(src->f[1])); > + dst->f[2] = sqrtf(fabsf(src->f[2])); > + dst->f[3] = sqrtf(fabsf(src->f[3])); > +} > + > +static void > micro_seq(union tgsi_exec_channel *dst, > const union tgsi_exec_channel *src0, > const union tgsi_exec_channel *src1) > @@ -3562,6 +3572,10 @@ exec_instruction( > exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, > TGSI_EXEC_DATA_FLOAT); > break; > > + case TGSI_OPCODE_SQRT: > + exec_vector_unary(mach, inst, micro_sqrt, TGSI_EXEC_DATA_FLOAT, > TGSI_EXEC_DATA_FLOAT); > + break; > + > case TGSI_OPCODE_DP2A: > exec_dp2a(mach, inst); > break; > diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h > b/src/gallium/auxiliary/tgsi/tgsi_exec.h > index fbd28a2..1a7d979 100644 > --- a/src/gallium/auxiliary/tgsi/tgsi_exec.h > +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h > @@ -441,6 +441,8 @@ tgsi_exec_get_shader_param(enum pipe_shader_cap param) > return 1; > case 
PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: > return PIPE_MAX_SAMPLERS; > + case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: > + return 1; > default: > return 0; > } > diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c > b/src/gallium/auxiliary/tgsi/tgsi_info.c > index 458bc69..94b6f60 100644 > --- a/src/gallium/auxiliary/tgsi/tgsi_info.c > +++ b/src/gallium/auxiliary/tgsi/tgsi_info.c > @@ -57,7 +57,7 @@ static const struct tgsi_opcode_info > opcode_info[TGSI_OPCODE_LAST] = > { 1, 2, 0, 0, 0, 0, COMP, "SUB", TGSI_OPCODE_SUB }, > { 1, 3, 0, 0, 0, 0, COMP, "LRP", TGSI_OPCODE_LRP }, > { 1, 3, 0, 0, 0, 0, COMP, "CND", TGSI_OPCODE_CND }, > - { 0, 0, 0, 0, 0, 0, NONE, "", 20 }, /* removed */ > + { 1, 1, 0, 0, 0, 0, REPL, "SQRT", TGSI_OPCODE_SQRT }, > { 1, 3, 0, 0, 0, 0, REPL, "DP2A", TGSI_OPCODE_DP2A }, > { 0, 0, 0, 0, 0, 0, NONE, "", 22 }, /* removed */ > { 0, 0, 0, 0, 0, 0, NONE, "", 23 }, /* removed */ > diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h > b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h > index 96b864f..75e27a6 100644 > --- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h > +++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h > @@ -60,6 +60,7 @@ OP13(MAD) > OP12(SUB) > OP13(LRP) > OP13(CND) > +OP11(SQRT) > OP13(DP2A) > OP11(FRC) > OP13(CLAMP) > -- > 1.7.3.4 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev