RSQ results

Roland Scheidegger Mon, 23 Feb 2015 05:27:57 -0800

Does this give correct results for special floats (0, infs)?
We tried to improve the x86 rcp (for single floats) in llvmpipe with
Newton-Raphson, but unfortunately it could not give correct results
for these two cases (without even more additional code), so it all got
disabled in the end (you can still see that code in the driver), since
the resulting problems are at least as bad as those due to bad accuracy...


Roland

Am 23.02.2015 um 05:01 schrieb Ilia Mirkin:
> Signed-off-by: Ilia Mirkin <[email protected]>
> ---
> 
> Not sure how many steps are needed for the necessary accuracy. Just
> doing 2 because that seems like a reasonable number.
> 
>  .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp      | 42 ++++++++++++++++++++--
>  1 file changed, 39 insertions(+), 3 deletions(-)
> 
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> index 87e75e1..9767566 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>     bld.setPosition(i, false);
>  
>     // 1. Take the source and it up.
> -   Value *src[2], *dst[2], *def = i->getDef(0);
> -   bld.mkSplit(src, 4, i->getSrc(0));
> +   Value *input = i->getSrc(0);
> +   Value *src[2], *dst[2], *guess, *def = i->getDef(0);
> +   bld.mkSplit(src, 4, input);
>  
>     // 2. We don't care about the low 32 bits of the destination. Stick a 0 
> in.
>     dst[0] = bld.loadImm(NULL, 0);
> @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>  
>     // 4. Recombine the two dst pieces back into the original destination.
>     bld.setPosition(i, true);
> -   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
> +   guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]);
> +
> +   // 5. Perform 2 Newton-Raphson steps
> +   if (i->op == OP_RCP) {
> +      // RCP: x_{n+1} = 2 * x_n - input * x_n^2
> +      Value *two = bld.getSSA(8);
> +
> +      bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f));
> +
> +      guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, 
> guess),
> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, 
> bld.getSSA(8), guess, guess)));
> +      guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, 
> guess),
> +                         bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, 
> bld.getSSA(8), guess, guess)));
> +   } else {
> +      // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2)
> +      Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8);
> +      bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, 
> -0.5f));
> +      bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 
> 1.5f));
> +
> +      half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, 
> input);
> +      // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2)
> +      guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
> +                         bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), 
> half_input,
> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, 
> bld.getSSA(8), guess, guess),
> +                                    three_half));
> +      guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
> +                         bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), 
> half_input,
> +                                    bld.mkOp2v(OP_MUL, TYPE_F64, 
> bld.getSSA(8), guess, guess),
> +                                    three_half));
> +   }
> +
> +   bld.mkMov(def, guess);
>  }
>  
>  bool
> 

_______________________________________________
Nouveau mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/nouveau

Re: [Nouveau] [Mesa-dev] [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results

Reply via email to