Just to follow up, this is the code the nvidia blob emits for rsq(). The argument is in c1[0] + c1[4]. I _think_ that c3 contains:
PB: 0x7fffffff GF100_M2MF.DATA = 0x7fffffff PB: 0x7ff00000 GF100_M2MF.DATA = 0x7ff00000 PB: 0x00000000 GF100_M2MF.DATA = 0 PB: 0x3fe00000 GF100_M2MF.DATA = 0x3fe00000 PB: 0x3ff00000 GF100_M2MF.DATA = 0x3ff00000 00000000: 10005de4 28004400 mov b32 $r1 c1[0x4] 00000008: 10109c03 68004c00 and b32 $r2 $r1 c3[0x4] r2 = exponent bits 00000010: 00101c03 68004c00 and b32 $r0 $r1 c3[0x0] r0 = non-sign bits (of upper word) 00000018: 1021dc03 1a8e4c00 set $p0 0x1 ne u32 $r2 c3[0x4] p0 = exponent == 0x7ff (i.e. inf/nan) 00000020: 00001c43 68004400 or b32 $r0 $r0 c1[0x0] r0 = all non-sign bits or'd 00000028: fc001c04 20000000 selp b32 $r0 $r0 0x0 $p0 if (exponent == 0x7ff) r0 = all mantissa bits else r0 = 0 00000030: fc01dc23 190e0000 set $p0 0x1 eq s32 $r0 0x0 p0 = is input a nan (i.e. exponent = 0x7ff and mantissa bits set) 00000038: 00001de4 28004400 mov b32 $r0 c1[0x0] 00000040: 200081e7 40000001 $p0 bra allwarp 0x90 So all of the (not $p0) stuff happens for all non-nan's, including infinities. r0d = input 00000048: 0000a1e2 19ff0000 (not $p0) mov b32 $r2 0x7fc00000 00000050: fc0121e4 28000000 (not $p0) mov b32 $r4 0x0 00000058: 1020a103 68004400 (not $p0) and b32 $r2 $r2 not c1[0x4] 00000060: 00216002 08004000 (not $p0) add b32 $r5 $r2 0x100000 00000068: 0450e203 5800c000 (not $p0) shr u32 $r3 $r5 wrap 0x1 00000070: 10002001 50000000 (not $p0) mul rn f64 $r0d $r0d $r4d 00000078: fc00a1e4 28000000 (not $p0) mov b32 $r2 0x0 00000080: 0030e002 087fe000 (not $p0) add b32 $r3 $r3 0x1ff80000 I tried decoding this, but it's some crazy business -- futzing with the exponent, I think it's trying to compress more into the high 32-bits so that rsqrt64h below has more precision. 00000088: 00001de4 40000000 nop 00000090: fc011de4 28000000 B mov b32 $r4 0x0 00000098: 1c115c00 c8000000 rsqrt64h $r5 $r1 000000a0: 200081e7 40000001 $p0 bra allwarp 0xf0 000000a8: 00002001 5000cff8 (not $p0) mul rn f64 $r0d $r0d 0x3fe0000000000000 r0d = input * 0.5 000000b0: 1001a001 50000000 (not $p0) mul rn f64 $r6d $r0d $r4d r6d = 0.5 * input * guess 000000b8: 3061a201 20088c00 (not $p0) fma rn f64 $r6d neg $r6d $r4d c3[0xc] r6d = -0.5 * input * guess * guess + 0.5 (? actually 0.5000001190928742, i.e. 0x3fe000003ff00000... seems like a bug, but maybe it'll only read it as a f32, not f64 like I'm assuming) 000000c0: 18412001 20080000 (not $p0) fma rn f64 $r4d $r4d $r6d $r4d guess = guess * (0.5 - 0.5 * input * guess * guess) + guess i.e. newton-raphson step. [why didn't they just throw in a 1.5 instead of 0.5? who knows. maybe something to do with numeric stability. or perhaps a bug in their const upload logic which got papered over with the extra + guess.] 000000c8: 10002001 50000000 (not $p0) mul rn f64 $r0d $r0d $r4d 000000d0: 30002201 20088c00 (not $p0) fma rn f64 $r0d neg $r0d $r4d c3[0xc] 000000d8: 00402001 20080000 (not $p0) fma rn f64 $r0d $r4d $r0d $r4d 000000e0: 08012001 50000000 (not $p0) mul rn f64 $r4d $r0d $r2d And this is another step. Not sure why they're so careful to still go through the motions for infinity and only skip for nan -- seems like they could just as well skip it for inf as well, with less code. That's what I plan on doing. On Mon, Feb 23, 2015 at 10:40 AM, Ilia Mirkin <[email protected]> wrote: > Oh right. I think the NVIDIA blob executes those steps conditionally > based on the upper bits not being 0x7ff (== infinity/nan). I should do > the same thing here. [FWIW I was able to test the nv50 code last night > and that one's a total fail for rcp/rsq... will need to port that over > to my nvc0 and debug there.] > > On Mon, Feb 23, 2015 at 8:24 AM, Roland Scheidegger <[email protected]> > wrote: >> Does this give correct results for special floats (0, infs)? >> We tried to improve (for single floats) x86 rcp in llvmpipe with >> newton-raphson, but unfortunately not being able to give correct results >> for these two cases (without even more additional code) meant it got all >> disabled in the end (you can still see that code in the driver) since >> the problems are at least as bad as those due to bad accuracy... >> >> Roland >> >> Am 23.02.2015 um 05:01 schrieb Ilia Mirkin: >>> Signed-off-by: Ilia Mirkin <[email protected]> >>> --- >>> >>> Not sure how many steps are needed for the necessary accuracy. Just >>> doing 2 because that seems like a reasonable number. >>> >>> .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 42 >>> ++++++++++++++++++++-- >>> 1 file changed, 39 insertions(+), 3 deletions(-) >>> >>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >>> b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >>> index 87e75e1..9767566 100644 >>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp >>> @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) >>> bld.setPosition(i, false); >>> >>> // 1. Take the source and it up. >>> - Value *src[2], *dst[2], *def = i->getDef(0); >>> - bld.mkSplit(src, 4, i->getSrc(0)); >>> + Value *input = i->getSrc(0); >>> + Value *src[2], *dst[2], *guess, *def = i->getDef(0); >>> + bld.mkSplit(src, 4, input); >>> >>> // 2. We don't care about the low 32 bits of the destination. Stick a 0 >>> in. >>> dst[0] = bld.loadImm(NULL, 0); >>> @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) >>> >>> // 4. Recombine the two dst pieces back into the original destination. >>> bld.setPosition(i, true); >>> - bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]); >>> + guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]); >>> + >>> + // 5. Perform 2 Newton-Raphson steps >>> + if (i->op == OP_RCP) { >>> + // RCP: x_{n+1} = 2 * x_n - input * x_n^2 >>> + Value *two = bld.getSSA(8); >>> + >>> + bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f)); >>> + >>> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), >>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, >>> guess), >>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, >>> + bld.mkOp2v(OP_MUL, TYPE_F64, >>> bld.getSSA(8), guess, guess))); >>> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), >>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, >>> guess), >>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input, >>> + bld.mkOp2v(OP_MUL, TYPE_F64, >>> bld.getSSA(8), guess, guess))); >>> + } else { >>> + // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2) >>> + Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8); >>> + bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, >>> -0.5f)); >>> + bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, >>> 1.5f)); >>> + >>> + half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, >>> input); >>> + // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2) >>> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, >>> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), >>> half_input, >>> + bld.mkOp2v(OP_MUL, TYPE_F64, >>> bld.getSSA(8), guess, guess), >>> + three_half)); >>> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, >>> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), >>> half_input, >>> + bld.mkOp2v(OP_MUL, TYPE_F64, >>> bld.getSSA(8), guess, guess), >>> + three_half)); >>> + } >>> + >>> + bld.mkMov(def, guess); >>> } >>> >>> bool >>> >> _______________________________________________ Nouveau mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/nouveau
