Does this give correct results for special float values (0, infinities)? We tried to improve x86 rcp (for single-precision floats) in llvmpipe with Newton-Raphson, but unfortunately it could not give correct results for these two cases without even more additional code, so in the end it all got disabled (you can still see that code in the driver), since the resulting problems are at least as bad as those caused by poor accuracy...
Roland Am 23.02.2015 um 05:01 schrieb Ilia Mirkin: > Signed-off-by: Ilia Mirkin <[email protected]> > --- > > Not sure how many steps are needed for the necessary accuracy. Just > doing 2 because that seems like a reasonable number. > > .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 42 > ++++++++++++++++++++-- > 1 file changed, 39 insertions(+), 3 deletions(-) > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > index 87e75e1..9767566 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) > bld.setPosition(i, false); > > // 1. Take the source and it up. > - Value *src[2], *dst[2], *def = i->getDef(0); > - bld.mkSplit(src, 4, i->getSrc(0)); > + Value *input = i->getSrc(0); > + Value *src[2], *dst[2], *guess, *def = i->getDef(0); > + bld.mkSplit(src, 4, input); > > // 2. We don't care about the low 32 bits of the destination. Stick a 0 > in. > dst[0] = bld.loadImm(NULL, 0); > @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) > > // 4. Recombine the two dst pieces back into the original destination. > bld.setPosition(i, true); > - bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]); > + guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]); > + > + // 5. 
Perform 2 Newton-Raphson steps > + if (i->op == OP_RCP) { > + // RCP: x_{n+1} = 2 * x_n - input * x_n^2 > + Value *two = bld.getSSA(8); > + > + bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f)); > + > + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, > guess), > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, > + bld.mkOp2v(OP_MUL, TYPE_F64, > bld.getSSA(8), guess, guess))); > + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, > guess), > + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, > + bld.mkOp2v(OP_MUL, TYPE_F64, > bld.getSSA(8), guess, guess))); > + } else { > + // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2) > + Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8); > + bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, > -0.5f)); > + bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, > 1.5f)); > + > + half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, > input); > + // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2) > + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, > + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), > half_input, > + bld.mkOp2v(OP_MUL, TYPE_F64, > bld.getSSA(8), guess, guess), > + three_half)); > + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, > + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), > half_input, > + bld.mkOp2v(OP_MUL, TYPE_F64, > bld.getSSA(8), guess, guess), > + three_half)); > + } > + > + bld.mkMov(def, guess); > } > > bool > _______________________________________________ Nouveau mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/nouveau
