Re: [Mesa-dev] [PATCH] nv50/ir: Split 64-bit integer MAD/MUL operations
Hello Ian, Since I am working on a direct SPIR-V to NV50 IR translator, ultimately to be used for OpenCL kernels, I will still need the patch for that work. (I even wrote that patch because I needed it when handling 64-bit addresses. :-) ) But thanks for the heads-up! Pierre On 02:07 pm - Oct 17 2016, Ian Romanick wrote: > I know know if it will make this patch unnecessary, but I have a GLSL > IR-level lowering pass for 64-bit multiplication. I'm going to send > that out with the rest of the GL_ARB_gpu_shader_int64 series within the > next day or so. > > On 10/15/2016 03:24 PM, Pierre Moreau wrote: > > Hardware does not support 64-bit integers MAD and MUL operations, so we need > > to transform them in 32-bit operations. > > > > Signed-off-by: Pierre Moreau> > --- > > .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 121 > > + > > 1 file changed, 121 insertions(+) > > > > Tested with (the GPU result was compared to the CPU result): > > * 0xfff3lu * 0xfff2lu + 0x80070002lu > > * 0xfff3lu * 0x80070002lu + 0x80070002lu > > * 0x80010003lu * 0xfff2lu + 0x80070002lu > > * 0x80010003lu * 0x80070002lu + 0x80070002lu > > > > * -523456791234l * 929835793793l + -15793793l > > * 523456791234l * 929835793793l + -15793793l > > * -523456791234l * -929835793793l + -15793793l > > * 523456791234l * -929835793793l + -15793793l > > > > v2: > > * Completely re-write the patch, as it was completely flawed (Ilia Mirkin) > > * Move pass prior to Register Allocation, as some temporaries need to > > be created. > > > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > > b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > > index d88bb34..a610eb5 100644 > > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > > @@ -2218,6 +2218,126 @@ LateAlgebraicOpt::visit(Instruction *i) > > > > // > > = > > > > +// Split 64-bit MUL and MAD > > +class Split64BitOpPreRA : public Pass > > +{ > > +private: > > + virtual bool visit(BasicBlock *); > > + void split64BitReg(Function *, Instruction *, Instruction *, > > + Instruction *, Value *, int); > > + void split64MulMad(Function *, Instruction *, DataType); > > + > > + BuildUtil bld; > > +}; > > + > > +bool > > +Split64BitOpPreRA::visit(BasicBlock *bb) > > +{ > > + Instruction *i, *next; > > + Modifier mod; > > + > > + for (i = bb->getEntry(); i; i = next) { > > + next = i->next; > > + > > + if (typeSizeof(i->dType) != 8) > > + continue; > > + > > + DataType hTy; > > + switch (i->dType) { > > + case TYPE_U64: hTy = TYPE_U32; break; > > + case TYPE_S64: hTy = TYPE_S32; break; > > + default: > > + continue; > > + } > > + > > + if (i->op == OP_MAD || i->op == OP_MUL) > > + split64MulMad(bb->getFunction(), i, hTy); > > + } > > + > > + return true; > > +} > > + > > +void > > +Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType > > hTy) > > +{ > > + assert(i->op == OP_MAD || i->op == OP_MUL); > > + if (isFloatType(i->dType) || isFloatType(i->sType)) > > + return; > > + > > + bld.setPosition(i, true); > > + > > + Value *zero = bld.mkImm(0u); > > + Value *carry = bld.getSSA(1, FILE_FLAGS); > > + > > + // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit > > + // values (a, b and c might be 32-bit values), using 32-bit operations. > > This > > + // gives the following operations: > > + // * `d.low = low(a.low * b.low) (+ c.low)?` > > + // * `d.high = low(a.high * b.low) + low(a.low * b.high) > > + // + high(a.low * b.low) (+ c.high)?` > > + // > > + // To compute the high bits, we can split in the following operations: > > + // * `tmp1 = low(a.high * b.low) (+ c.high)?` > > + // * `tmp2 = low(a.low * b.high) + tmp1` > > + // * `d.high = high(a.low * b.low) + tmp2` > > + // > > + // mkSplit put lower bits at index 0 and higher bits at index 1 > > + > > + Value *op1[2]; > > + if (i->getSrc(0)->reg.size == 8) > > + bld.mkSplit(op1, typeSizeof(hTy), i->getSrc(0)); > > + else { > > + op1[0] = i->getSrc(0); > > + op1[1] = zero; > > + } > > + Value *op2[2]; > > + if (i->getSrc(1)->reg.size == 8) > > + bld.mkSplit(op2, typeSizeof(hTy), i->getSrc(1)); > > + else { > > + op2[0] = i->getSrc(1); > > + op2[1] = zero; > > + } > > + > > + Value *op3[2] = { NULL, NULL }; > > + if (i->op == OP_MAD) { > > + if (i->getSrc(2)->reg.size == 8) > > + bld.mkSplit(op3, typeSizeof(hTy), i->getSrc(2)); > > + else { > > + op3[0] = i->getSrc(2); > > + op3[1] = zero; > > +
Re: [Mesa-dev] [PATCH] nv50/ir: Split 64-bit integer MAD/MUL operations
I know know if it will make this patch unnecessary, but I have a GLSL IR-level lowering pass for 64-bit multiplication. I'm going to send that out with the rest of the GL_ARB_gpu_shader_int64 series within the next day or so. On 10/15/2016 03:24 PM, Pierre Moreau wrote: > Hardware does not support 64-bit integers MAD and MUL operations, so we need > to transform them in 32-bit operations. > > Signed-off-by: Pierre Moreau> --- > .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 121 > + > 1 file changed, 121 insertions(+) > > Tested with (the GPU result was compared to the CPU result): > * 0xfff3lu * 0xfff2lu + 0x80070002lu > * 0xfff3lu * 0x80070002lu + 0x80070002lu > * 0x80010003lu * 0xfff2lu + 0x80070002lu > * 0x80010003lu * 0x80070002lu + 0x80070002lu > > * -523456791234l * 929835793793l + -15793793l > * 523456791234l * 929835793793l + -15793793l > * -523456791234l * -929835793793l + -15793793l > * 523456791234l * -929835793793l + -15793793l > > v2: > * Completely re-write the patch, as it was completely flawed (Ilia Mirkin) > * Move pass prior to Register Allocation, as some temporaries need to > be created. > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > index d88bb34..a610eb5 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > @@ -2218,6 +2218,126 @@ LateAlgebraicOpt::visit(Instruction *i) > > // > = > > +// Split 64-bit MUL and MAD > +class Split64BitOpPreRA : public Pass > +{ > +private: > + virtual bool visit(BasicBlock *); > + void split64BitReg(Function *, Instruction *, Instruction *, > + Instruction *, Value *, int); > + void split64MulMad(Function *, Instruction *, DataType); > + > + BuildUtil bld; > +}; > + > +bool > +Split64BitOpPreRA::visit(BasicBlock *bb) > +{ > + Instruction *i, *next; > + Modifier mod; > + > + for (i = bb->getEntry(); i; i = next) { > + next = i->next; > + > + if (typeSizeof(i->dType) != 8) > + continue; > + > + DataType hTy; > + switch (i->dType) { > + case TYPE_U64: hTy = TYPE_U32; break; > + case TYPE_S64: hTy = TYPE_S32; break; > + default: > + continue; > + } > + > + if (i->op == OP_MAD || i->op == OP_MUL) > + split64MulMad(bb->getFunction(), i, hTy); > + } > + > + return true; > +} > + > +void > +Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy) > +{ > + assert(i->op == OP_MAD || i->op == OP_MUL); > + if (isFloatType(i->dType) || isFloatType(i->sType)) > + return; > + > + bld.setPosition(i, true); > + > + Value *zero = bld.mkImm(0u); > + Value *carry = bld.getSSA(1, FILE_FLAGS); > + > + // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit > + // values (a, b and c might be 32-bit values), using 32-bit operations. > This > + // gives the following operations: > + // * `d.low = low(a.low * b.low) (+ c.low)?` > + // * `d.high = low(a.high * b.low) + low(a.low * b.high) > + // + high(a.low * b.low) (+ c.high)?` > + // > + // To compute the high bits, we can split in the following operations: > + // * `tmp1 = low(a.high * b.low) (+ c.high)?` > + // * `tmp2 = low(a.low * b.high) + tmp1` > + // * `d.high = high(a.low * b.low) + tmp2` > + // > + // mkSplit put lower bits at index 0 and higher bits at index 1 > + > + Value *op1[2]; > + if (i->getSrc(0)->reg.size == 8) > + bld.mkSplit(op1, typeSizeof(hTy), i->getSrc(0)); > + else { > + op1[0] = i->getSrc(0); > + op1[1] = zero; > + } > + Value *op2[2]; > + if (i->getSrc(1)->reg.size == 8) > + bld.mkSplit(op2, typeSizeof(hTy), i->getSrc(1)); > + else { > + op2[0] = i->getSrc(1); > + op2[1] = zero; > + } > + > + Value *op3[2] = { NULL, NULL }; > + if (i->op == OP_MAD) { > + if (i->getSrc(2)->reg.size == 8) > + bld.mkSplit(op3, typeSizeof(hTy), i->getSrc(2)); > + else { > + op3[0] = i->getSrc(2); > + op3[1] = zero; > + } > + } > + > + Value *tmpRes1Hi = bld.getSSA(); > + if (i->op == OP_MAD) > + bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]); > + else > + bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]); > + > + Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], > tmpRes1Hi); > + > + Value *def[2] = { bld.getSSA(), bld.getSSA() }; > + > + // If it was a MAD, add the carry from the low bits > + // It is not needed if it was a MUL, since we added high(a.low * b.low) to > + // d.high > + if (i->op == OP_MAD) > +
Re: [Mesa-dev] [PATCH] nv50/ir: Split 64-bit integer MAD/MUL operations
On 06:38 pm - Oct 15 2016, Ilia Mirkin wrote: > On Sat, Oct 15, 2016 at 6:24 PM, Pierre Moreauwrote: > > Hardware does not support 64-bit integers MAD and MUL operations, so we need > > to transform them in 32-bit operations. > > > > Signed-off-by: Pierre Moreau > > --- > > .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 121 > > + > > 1 file changed, 121 insertions(+) > > > > Tested with (the GPU result was compared to the CPU result): > > * 0xfff3lu * 0xfff2lu + 0x80070002lu > > * 0xfff3lu * 0x80070002lu + 0x80070002lu > > * 0x80010003lu * 0xfff2lu + 0x80070002lu > > * 0x80010003lu * 0x80070002lu + 0x80070002lu > > > > * -523456791234l * 929835793793l + -15793793l > > * 523456791234l * 929835793793l + -15793793l > > * -523456791234l * -929835793793l + -15793793l > > * 523456791234l * -929835793793l + -15793793l > > > > v2: > > * Completely re-write the patch, as it was completely flawed (Ilia Mirkin) > > * Move pass prior to Register Allocation, as some temporaries need to > > be created. > > In principle I like this approach. I don't remember what your old one > was, but this is good. I think that nearly all of our "legalize" step > items, including the gpu-family specific ones, need to be moved to > this type of pass. The old one inserted itself within the existing `BuildUtil::split64BitOpPostRA()`. > > > > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > > b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > > index d88bb34..a610eb5 100644 > > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > > @@ -2218,6 +2218,126 @@ LateAlgebraicOpt::visit(Instruction *i) > > > > // > > = > > > > +// Split 64-bit MUL and MAD > > +class Split64BitOpPreRA : public Pass > > +{ > > +private: > > + virtual bool visit(BasicBlock *); > > + void split64BitReg(Function *, Instruction *, Instruction *, > > + Instruction *, Value *, int); Oops, forgot to remove the above prototype, will do it for v3. > > + void split64MulMad(Function *, Instruction *, DataType); > > + > > + BuildUtil bld; > > +}; > > + > > +bool > > +Split64BitOpPreRA::visit(BasicBlock *bb) > > +{ > > + Instruction *i, *next; > > + Modifier mod; > > + > > + for (i = bb->getEntry(); i; i = next) { > > + next = i->next; > > + > > + if (typeSizeof(i->dType) != 8) > > + continue; > > Is this necessary? You exclusively operate on U64/S64 below. The above was added as I thought this pass could be reused for other 64-bit operations that need to be split, while the below switch statement is more of a remaining from when the code was in `BuildUtil::split64BitOpPostRA()`. I guess that even if the pass gets support for more operations, FP64 are not going to be part of it as the hardware supports them. In which case, only 64-bit integers are left, and the below switch statement would indeed be enough. > > > + > > + DataType hTy; > > + switch (i->dType) { > > + case TYPE_U64: hTy = TYPE_U32; break; > > + case TYPE_S64: hTy = TYPE_S32; break; > > + default: > > + continue; > > + } > > + > > + if (i->op == OP_MAD || i->op == OP_MUL) > > + split64MulMad(bb->getFunction(), i, hTy); > > There's an instance variable "func" (and "prog") you can use. Oh, nice! Will use it. > > > + } > > + > > + return true; > > +} > > + > > +void > > +Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType > > hTy) > > +{ > > + assert(i->op == OP_MAD || i->op == OP_MUL); > > + if (isFloatType(i->dType) || isFloatType(i->sType)) > > + return; > > I'd make this into an assert. Given the checks before calling this > function, it can't really happen. True, I’ll change that. > > > + > > + bld.setPosition(i, true); > > + > > + Value *zero = bld.mkImm(0u); > > + Value *carry = bld.getSSA(1, FILE_FLAGS); > > + > > + // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit > > + // values (a, b and c might be 32-bit values), using 32-bit operations. > > This > > + // gives the following operations: > > + // * `d.low = low(a.low * b.low) (+ c.low)?` > > + // * `d.high = low(a.high * b.low) + low(a.low * b.high) > > + // + high(a.low * b.low) (+ c.high)?` > > + // > > + // To compute the high bits, we can split in the following operations: > > + // * `tmp1 = low(a.high * b.low) (+ c.high)?` > > + // * `tmp2 = low(a.low * b.high) + tmp1` > > + // * `d.high = high(a.low * b.low) + tmp2` > > + // > > + // mkSplit put lower bits at index 0 and higher bits at index 1 > > + > > + Value *op1[2]; > > + if
Re: [Mesa-dev] [PATCH] nv50/ir: Split 64-bit integer MAD/MUL operations
On Sat, Oct 15, 2016 at 6:24 PM, Pierre Moreauwrote: > Hardware does not support 64-bit integers MAD and MUL operations, so we need > to transform them in 32-bit operations. > > Signed-off-by: Pierre Moreau > --- > .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 121 > + > 1 file changed, 121 insertions(+) > > Tested with (the GPU result was compared to the CPU result): > * 0xfff3lu * 0xfff2lu + 0x80070002lu > * 0xfff3lu * 0x80070002lu + 0x80070002lu > * 0x80010003lu * 0xfff2lu + 0x80070002lu > * 0x80010003lu * 0x80070002lu + 0x80070002lu > > * -523456791234l * 929835793793l + -15793793l > * 523456791234l * 929835793793l + -15793793l > * -523456791234l * -929835793793l + -15793793l > * 523456791234l * -929835793793l + -15793793l > > v2: > * Completely re-write the patch, as it was completely flawed (Ilia Mirkin) > * Move pass prior to Register Allocation, as some temporaries need to > be created. In principle I like this approach. I don't remember what your old one was, but this is good. I think that nearly all of our "legalize" step items, including the gpu-family specific ones, need to be moved to this type of pass. > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > index d88bb34..a610eb5 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > @@ -2218,6 +2218,126 @@ LateAlgebraicOpt::visit(Instruction *i) > > // > = > > +// Split 64-bit MUL and MAD > +class Split64BitOpPreRA : public Pass > +{ > +private: > + virtual bool visit(BasicBlock *); > + void split64BitReg(Function *, Instruction *, Instruction *, > + Instruction *, Value *, int); > + void split64MulMad(Function *, Instruction *, DataType); > + > + BuildUtil bld; > +}; > + > +bool > +Split64BitOpPreRA::visit(BasicBlock *bb) > +{ > + Instruction *i, *next; > + Modifier mod; > + > + for (i = bb->getEntry(); i; i = next) { > + next = i->next; > + > + if (typeSizeof(i->dType) != 8) > + continue; Is this necessary? You exclusively operate on U64/S64 below. > + > + DataType hTy; > + switch (i->dType) { > + case TYPE_U64: hTy = TYPE_U32; break; > + case TYPE_S64: hTy = TYPE_S32; break; > + default: > + continue; > + } > + > + if (i->op == OP_MAD || i->op == OP_MUL) > + split64MulMad(bb->getFunction(), i, hTy); There's an instance variable "func" (and "prog") you can use. > + } > + > + return true; > +} > + > +void > +Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy) > +{ > + assert(i->op == OP_MAD || i->op == OP_MUL); > + if (isFloatType(i->dType) || isFloatType(i->sType)) > + return; I'd make this into an assert. Given the checks before calling this function, it can't really happen. > + > + bld.setPosition(i, true); > + > + Value *zero = bld.mkImm(0u); > + Value *carry = bld.getSSA(1, FILE_FLAGS); > + > + // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit > + // values (a, b and c might be 32-bit values), using 32-bit operations. > This > + // gives the following operations: > + // * `d.low = low(a.low * b.low) (+ c.low)?` > + // * `d.high = low(a.high * b.low) + low(a.low * b.high) > + // + high(a.low * b.low) (+ c.high)?` > + // > + // To compute the high bits, we can split in the following operations: > + // * `tmp1 = low(a.high * b.low) (+ c.high)?` > + // * `tmp2 = low(a.low * b.high) + tmp1` > + // * `d.high = high(a.low * b.low) + tmp2` > + // > + // mkSplit put lower bits at index 0 and higher bits at index 1 > + > + Value *op1[2]; > + if (i->getSrc(0)->reg.size == 8) > + bld.mkSplit(op1, typeSizeof(hTy), i->getSrc(0)); > + else { > + op1[0] = i->getSrc(0); > + op1[1] = zero; > + } > + Value *op2[2]; > + if (i->getSrc(1)->reg.size == 8) > + bld.mkSplit(op2, typeSizeof(hTy), i->getSrc(1)); > + else { > + op2[0] = i->getSrc(1); > + op2[1] = zero; > + } > + > + Value *op3[2] = { NULL, NULL }; > + if (i->op == OP_MAD) { > + if (i->getSrc(2)->reg.size == 8) > + bld.mkSplit(op3, typeSizeof(hTy), i->getSrc(2)); > + else { > + op3[0] = i->getSrc(2); > + op3[1] = zero; > + } > + } > + > + Value *tmpRes1Hi = bld.getSSA(); > + if (i->op == OP_MAD) > + bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]); > + else > + bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]); > + > + Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], > tmpRes1Hi); > + > + Value
Re: [Mesa-dev] [PATCH] nv50/ir: Split 64-bit integer MAD/MUL operations
Sorry, there should have been a v2 next to PATCH in the subject… Pierre On 12:24 am - Oct 16 2016, Pierre Moreau wrote: > Hardware does not support 64-bit integers MAD and MUL operations, so we need > to transform them in 32-bit operations. > > Signed-off-by: Pierre Moreau> --- > .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 121 > + > 1 file changed, 121 insertions(+) > > Tested with (the GPU result was compared to the CPU result): > * 0xfff3lu * 0xfff2lu + 0x80070002lu > * 0xfff3lu * 0x80070002lu + 0x80070002lu > * 0x80010003lu * 0xfff2lu + 0x80070002lu > * 0x80010003lu * 0x80070002lu + 0x80070002lu > > * -523456791234l * 929835793793l + -15793793l > * 523456791234l * 929835793793l + -15793793l > * -523456791234l * -929835793793l + -15793793l > * 523456791234l * -929835793793l + -15793793l > > v2: > * Completely re-write the patch, as it was completely flawed (Ilia Mirkin) > * Move pass prior to Register Allocation, as some temporaries need to > be created. > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > index d88bb34..a610eb5 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp > @@ -2218,6 +2218,126 @@ LateAlgebraicOpt::visit(Instruction *i) > > // > = > > +// Split 64-bit MUL and MAD > +class Split64BitOpPreRA : public Pass > +{ > +private: > + virtual bool visit(BasicBlock *); > + void split64BitReg(Function *, Instruction *, Instruction *, > + Instruction *, Value *, int); > + void split64MulMad(Function *, Instruction *, DataType); > + > + BuildUtil bld; > +}; > + > +bool > +Split64BitOpPreRA::visit(BasicBlock *bb) > +{ > + Instruction *i, *next; > + Modifier mod; > + > + for (i = bb->getEntry(); i; i = next) { > + next = i->next; > + > + if (typeSizeof(i->dType) != 8) > + continue; > + > + DataType hTy; > + switch (i->dType) { > + case TYPE_U64: hTy = TYPE_U32; break; > + case TYPE_S64: hTy = TYPE_S32; break; > + default: > + continue; > + } > + > + if (i->op == OP_MAD || i->op == OP_MUL) > + split64MulMad(bb->getFunction(), i, hTy); > + } > + > + return true; > +} > + > +void > +Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy) > +{ > + assert(i->op == OP_MAD || i->op == OP_MUL); > + if (isFloatType(i->dType) || isFloatType(i->sType)) > + return; > + > + bld.setPosition(i, true); > + > + Value *zero = bld.mkImm(0u); > + Value *carry = bld.getSSA(1, FILE_FLAGS); > + > + // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit > + // values (a, b and c might be 32-bit values), using 32-bit operations. > This > + // gives the following operations: > + // * `d.low = low(a.low * b.low) (+ c.low)?` > + // * `d.high = low(a.high * b.low) + low(a.low * b.high) > + // + high(a.low * b.low) (+ c.high)?` > + // > + // To compute the high bits, we can split in the following operations: > + // * `tmp1 = low(a.high * b.low) (+ c.high)?` > + // * `tmp2 = low(a.low * b.high) + tmp1` > + // * `d.high = high(a.low * b.low) + tmp2` > + // > + // mkSplit put lower bits at index 0 and higher bits at index 1 > + > + Value *op1[2]; > + if (i->getSrc(0)->reg.size == 8) > + bld.mkSplit(op1, typeSizeof(hTy), i->getSrc(0)); > + else { > + op1[0] = i->getSrc(0); > + op1[1] = zero; > + } > + Value *op2[2]; > + if (i->getSrc(1)->reg.size == 8) > + bld.mkSplit(op2, typeSizeof(hTy), i->getSrc(1)); > + else { > + op2[0] = i->getSrc(1); > + op2[1] = zero; > + } > + > + Value *op3[2] = { NULL, NULL }; > + if (i->op == OP_MAD) { > + if (i->getSrc(2)->reg.size == 8) > + bld.mkSplit(op3, typeSizeof(hTy), i->getSrc(2)); > + else { > + op3[0] = i->getSrc(2); > + op3[1] = zero; > + } > + } > + > + Value *tmpRes1Hi = bld.getSSA(); > + if (i->op == OP_MAD) > + bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]); > + else > + bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]); > + > + Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], > tmpRes1Hi); > + > + Value *def[2] = { bld.getSSA(), bld.getSSA() }; > + > + // If it was a MAD, add the carry from the low bits > + // It is not needed if it was a MUL, since we added high(a.low * b.low) to > + // d.high > + if (i->op == OP_MAD) > + bld.mkOp3(OP_MAD, hTy, def[0], op1[0], op2[0], op3[0])->setFlagsDef(1, > carry); > + else > + bld.mkOp2(OP_MUL, hTy, def[0], op1[0], op2[0]); > + > +
[Mesa-dev] [PATCH] nv50/ir: Split 64-bit integer MAD/MUL operations
Hardware does not support 64-bit integers MAD and MUL operations, so we need to transform them in 32-bit operations. Signed-off-by: Pierre Moreau--- .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 121 + 1 file changed, 121 insertions(+) Tested with (the GPU result was compared to the CPU result): * 0xfff3lu * 0xfff2lu + 0x80070002lu * 0xfff3lu * 0x80070002lu + 0x80070002lu * 0x80010003lu * 0xfff2lu + 0x80070002lu * 0x80010003lu * 0x80070002lu + 0x80070002lu * -523456791234l * 929835793793l + -15793793l * 523456791234l * 929835793793l + -15793793l * -523456791234l * -929835793793l + -15793793l * 523456791234l * -929835793793l + -15793793l v2: * Completely re-write the patch, as it was completely flawed (Ilia Mirkin) * Move pass prior to Register Allocation, as some temporaries need to be created. diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index d88bb34..a610eb5 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -2218,6 +2218,126 @@ LateAlgebraicOpt::visit(Instruction *i) // = +// Split 64-bit MUL and MAD +class Split64BitOpPreRA : public Pass +{ +private: + virtual bool visit(BasicBlock *); + void split64BitReg(Function *, Instruction *, Instruction *, + Instruction *, Value *, int); + void split64MulMad(Function *, Instruction *, DataType); + + BuildUtil bld; +}; + +bool +Split64BitOpPreRA::visit(BasicBlock *bb) +{ + Instruction *i, *next; + Modifier mod; + + for (i = bb->getEntry(); i; i = next) { + next = i->next; + + if (typeSizeof(i->dType) != 8) + continue; + + DataType hTy; + switch (i->dType) { + case TYPE_U64: hTy = TYPE_U32; break; + case TYPE_S64: hTy = TYPE_S32; break; + default: + continue; + } + + if (i->op == OP_MAD || i->op == OP_MUL) + split64MulMad(bb->getFunction(), i, hTy); + } + + return true; +} + +void +Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy) +{ + assert(i->op == OP_MAD || i->op == OP_MUL); + if (isFloatType(i->dType) || isFloatType(i->sType)) + return; + + bld.setPosition(i, true); + + Value *zero = bld.mkImm(0u); + Value *carry = bld.getSSA(1, FILE_FLAGS); + + // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit + // values (a, b and c might be 32-bit values), using 32-bit operations. This + // gives the following operations: + // * `d.low = low(a.low * b.low) (+ c.low)?` + // * `d.high = low(a.high * b.low) + low(a.low * b.high) + // + high(a.low * b.low) (+ c.high)?` + // + // To compute the high bits, we can split in the following operations: + // * `tmp1 = low(a.high * b.low) (+ c.high)?` + // * `tmp2 = low(a.low * b.high) + tmp1` + // * `d.high = high(a.low * b.low) + tmp2` + // + // mkSplit put lower bits at index 0 and higher bits at index 1 + + Value *op1[2]; + if (i->getSrc(0)->reg.size == 8) + bld.mkSplit(op1, typeSizeof(hTy), i->getSrc(0)); + else { + op1[0] = i->getSrc(0); + op1[1] = zero; + } + Value *op2[2]; + if (i->getSrc(1)->reg.size == 8) + bld.mkSplit(op2, typeSizeof(hTy), i->getSrc(1)); + else { + op2[0] = i->getSrc(1); + op2[1] = zero; + } + + Value *op3[2] = { NULL, NULL }; + if (i->op == OP_MAD) { + if (i->getSrc(2)->reg.size == 8) + bld.mkSplit(op3, typeSizeof(hTy), i->getSrc(2)); + else { + op3[0] = i->getSrc(2); + op3[1] = zero; + } + } + + Value *tmpRes1Hi = bld.getSSA(); + if (i->op == OP_MAD) + bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]); + else + bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]); + + Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], tmpRes1Hi); + + Value *def[2] = { bld.getSSA(), bld.getSSA() }; + + // If it was a MAD, add the carry from the low bits + // It is not needed if it was a MUL, since we added high(a.low * b.low) to + // d.high + if (i->op == OP_MAD) + bld.mkOp3(OP_MAD, hTy, def[0], op1[0], op2[0], op3[0])->setFlagsDef(1, carry); + else + bld.mkOp2(OP_MUL, hTy, def[0], op1[0], op2[0]); + + Instruction *hiPart3 = bld.mkOp3(OP_MAD, hTy, def[1], op1[0], op2[0], tmpRes2Hi); + hiPart3->subOp = NV50_IR_SUBOP_MUL_HIGH; + if (i->op == OP_MAD) + hiPart3->setFlagsSrc(3, carry); + + bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), def[0], def[1]); + + delete_Instruction(fn->getProgram(), i); +} + +// = + static inline void