Re: [Mesa-dev] [PATCH] nv50/ir: Split 64-bit integer MAD/MUL operations

2016-10-18 Thread Pierre Moreau
Hello Ian,

Since I am working on a direct SPIR-V to NV50 IR translator, ultimately to be
used for OpenCL kernels, I will still need the patch for that work. (I even
wrote that patch because I needed it when handling 64-bit addresses. :-) )
But thanks for the heads-up!

Pierre


On 02:07 pm - Oct 17 2016, Ian Romanick wrote:
> I know know if it will make this patch unnecessary, but I have a GLSL
> IR-level lowering pass for 64-bit multiplication.  I'm going to send
> that out with the rest of the GL_ARB_gpu_shader_int64 series within the
> next day or so.
> 
> On 10/15/2016 03:24 PM, Pierre Moreau wrote:
> > Hardware does not support 64-bit integers MAD and MUL operations, so we need
> > to transform them in 32-bit operations.
> > 
> > Signed-off-by: Pierre Moreau 
> > ---
> >  .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 121 
> > +
> >  1 file changed, 121 insertions(+)
> > 
> > Tested with (the GPU result was compared to the CPU result):
> > * 0xfff3lu * 0xfff2lu + 0x80070002lu
> > * 0xfff3lu * 0x80070002lu + 0x80070002lu
> > * 0x80010003lu * 0xfff2lu + 0x80070002lu
> > * 0x80010003lu * 0x80070002lu + 0x80070002lu
> > 
> > * -523456791234l * 929835793793l + -15793793l
> > *  523456791234l * 929835793793l + -15793793l
> > * -523456791234l * -929835793793l + -15793793l
> > *  523456791234l * -929835793793l + -15793793l
> > 
> > v2:
> > * Completely re-write the patch, as it was completely flawed (Ilia Mirkin)
> > * Move pass prior to Register Allocation, as some temporaries need to
> >   be created.
> > 
> > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
> > b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> > index d88bb34..a610eb5 100644
> > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> > @@ -2218,6 +2218,126 @@ LateAlgebraicOpt::visit(Instruction *i)
> >  
> >  // 
> > =
> >  
> > +// Split 64-bit MUL and MAD
> > +class Split64BitOpPreRA : public Pass
> > +{
> > +private:
> > +   virtual bool visit(BasicBlock *);
> > +   void split64BitReg(Function *, Instruction *, Instruction *,
> > +  Instruction *, Value *, int);
> > +   void split64MulMad(Function *, Instruction *, DataType);
> > +
> > +   BuildUtil bld;
> > +};
> > +
> > +bool
> > +Split64BitOpPreRA::visit(BasicBlock *bb)
> > +{
> > +   Instruction *i, *next;
> > +   Modifier mod;
> > +
> > +   for (i = bb->getEntry(); i; i = next) {
> > +  next = i->next;
> > +
> > +  if (typeSizeof(i->dType) != 8)
> > + continue;
> > +
> > +  DataType hTy;
> > +  switch (i->dType) {
> > +  case TYPE_U64: hTy = TYPE_U32; break;
> > +  case TYPE_S64: hTy = TYPE_S32; break;
> > +  default:
> > + continue;
> > +  }
> > +
> > +  if (i->op == OP_MAD || i->op == OP_MUL)
> > + split64MulMad(bb->getFunction(), i, hTy);
> > +   }
> > +
> > +   return true;
> > +}
> > +
> > +void
> > +Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType 
> > hTy)
> > +{
> > +   assert(i->op == OP_MAD || i->op == OP_MUL);
> > +   if (isFloatType(i->dType) || isFloatType(i->sType))
> > +  return;
> > +
> > +   bld.setPosition(i, true);
> > +
> > +   Value *zero = bld.mkImm(0u);
> > +   Value *carry = bld.getSSA(1, FILE_FLAGS);
> > +
> > +   // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
> > +   // values (a, b and c might be 32-bit values), using 32-bit operations. 
> > This
> > +   // gives the following operations:
> > +   // * `d.low = low(a.low * b.low) (+ c.low)?`
> > +   // * `d.high = low(a.high * b.low) + low(a.low * b.high)
> > +   //   + high(a.low * b.low) (+ c.high)?`
> > +   //
> > +   // To compute the high bits, we can split in the following operations:
> > +   // * `tmp1   = low(a.high * b.low) (+ c.high)?`
> > +   // * `tmp2   = low(a.low * b.high) + tmp1`
> > +   // * `d.high = high(a.low * b.low) + tmp2`
> > +   //
> > +   // mkSplit put lower bits at index 0 and higher bits at index 1
> > +
> > +   Value *op1[2];
> > +   if (i->getSrc(0)->reg.size == 8)
> > +  bld.mkSplit(op1, typeSizeof(hTy), i->getSrc(0));
> > +   else {
> > +  op1[0] = i->getSrc(0);
> > +  op1[1] = zero;
> > +   }
> > +   Value *op2[2];
> > +   if (i->getSrc(1)->reg.size == 8)
> > +  bld.mkSplit(op2, typeSizeof(hTy), i->getSrc(1));
> > +   else {
> > +  op2[0] = i->getSrc(1);
> > +  op2[1] = zero;
> > +   }
> > +
> > +   Value *op3[2] = { NULL, NULL };
> > +   if (i->op == OP_MAD) {
> > +  if (i->getSrc(2)->reg.size == 8)
> > + bld.mkSplit(op3, typeSizeof(hTy), i->getSrc(2));
> > +  else {
> > + op3[0] = i->getSrc(2);
> > + op3[1] = zero;
> > + 

Re: [Mesa-dev] [PATCH] nv50/ir: Split 64-bit integer MAD/MUL operations

2016-10-17 Thread Ian Romanick
I know know if it will make this patch unnecessary, but I have a GLSL
IR-level lowering pass for 64-bit multiplication.  I'm going to send
that out with the rest of the GL_ARB_gpu_shader_int64 series within the
next day or so.

On 10/15/2016 03:24 PM, Pierre Moreau wrote:
> Hardware does not support 64-bit integers MAD and MUL operations, so we need
> to transform them in 32-bit operations.
> 
> Signed-off-by: Pierre Moreau 
> ---
>  .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 121 
> +
>  1 file changed, 121 insertions(+)
> 
> Tested with (the GPU result was compared to the CPU result):
> * 0xfff3lu * 0xfff2lu + 0x80070002lu
> * 0xfff3lu * 0x80070002lu + 0x80070002lu
> * 0x80010003lu * 0xfff2lu + 0x80070002lu
> * 0x80010003lu * 0x80070002lu + 0x80070002lu
> 
> * -523456791234l * 929835793793l + -15793793l
> *  523456791234l * 929835793793l + -15793793l
> * -523456791234l * -929835793793l + -15793793l
> *  523456791234l * -929835793793l + -15793793l
> 
> v2:
> * Completely re-write the patch, as it was completely flawed (Ilia Mirkin)
> * Move pass prior to Register Allocation, as some temporaries need to
>   be created.
> 
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
> b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> index d88bb34..a610eb5 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> @@ -2218,6 +2218,126 @@ LateAlgebraicOpt::visit(Instruction *i)
>  
>  // 
> =
>  
> +// Split 64-bit MUL and MAD
> +class Split64BitOpPreRA : public Pass
> +{
> +private:
> +   virtual bool visit(BasicBlock *);
> +   void split64BitReg(Function *, Instruction *, Instruction *,
> +  Instruction *, Value *, int);
> +   void split64MulMad(Function *, Instruction *, DataType);
> +
> +   BuildUtil bld;
> +};
> +
> +bool
> +Split64BitOpPreRA::visit(BasicBlock *bb)
> +{
> +   Instruction *i, *next;
> +   Modifier mod;
> +
> +   for (i = bb->getEntry(); i; i = next) {
> +  next = i->next;
> +
> +  if (typeSizeof(i->dType) != 8)
> + continue;
> +
> +  DataType hTy;
> +  switch (i->dType) {
> +  case TYPE_U64: hTy = TYPE_U32; break;
> +  case TYPE_S64: hTy = TYPE_S32; break;
> +  default:
> + continue;
> +  }
> +
> +  if (i->op == OP_MAD || i->op == OP_MUL)
> + split64MulMad(bb->getFunction(), i, hTy);
> +   }
> +
> +   return true;
> +}
> +
> +void
> +Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy)
> +{
> +   assert(i->op == OP_MAD || i->op == OP_MUL);
> +   if (isFloatType(i->dType) || isFloatType(i->sType))
> +  return;
> +
> +   bld.setPosition(i, true);
> +
> +   Value *zero = bld.mkImm(0u);
> +   Value *carry = bld.getSSA(1, FILE_FLAGS);
> +
> +   // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
> +   // values (a, b and c might be 32-bit values), using 32-bit operations. 
> This
> +   // gives the following operations:
> +   // * `d.low = low(a.low * b.low) (+ c.low)?`
> +   // * `d.high = low(a.high * b.low) + low(a.low * b.high)
> +   //   + high(a.low * b.low) (+ c.high)?`
> +   //
> +   // To compute the high bits, we can split in the following operations:
> +   // * `tmp1   = low(a.high * b.low) (+ c.high)?`
> +   // * `tmp2   = low(a.low * b.high) + tmp1`
> +   // * `d.high = high(a.low * b.low) + tmp2`
> +   //
> +   // mkSplit put lower bits at index 0 and higher bits at index 1
> +
> +   Value *op1[2];
> +   if (i->getSrc(0)->reg.size == 8)
> +  bld.mkSplit(op1, typeSizeof(hTy), i->getSrc(0));
> +   else {
> +  op1[0] = i->getSrc(0);
> +  op1[1] = zero;
> +   }
> +   Value *op2[2];
> +   if (i->getSrc(1)->reg.size == 8)
> +  bld.mkSplit(op2, typeSizeof(hTy), i->getSrc(1));
> +   else {
> +  op2[0] = i->getSrc(1);
> +  op2[1] = zero;
> +   }
> +
> +   Value *op3[2] = { NULL, NULL };
> +   if (i->op == OP_MAD) {
> +  if (i->getSrc(2)->reg.size == 8)
> + bld.mkSplit(op3, typeSizeof(hTy), i->getSrc(2));
> +  else {
> + op3[0] = i->getSrc(2);
> + op3[1] = zero;
> +  }
> +   }
> +
> +   Value *tmpRes1Hi = bld.getSSA();
> +   if (i->op == OP_MAD)
> +  bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]);
> +   else
> +  bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]);
> +
> +   Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], 
> tmpRes1Hi);
> +
> +   Value *def[2] = { bld.getSSA(), bld.getSSA() };
> +
> +   // If it was a MAD, add the carry from the low bits
> +   // It is not needed if it was a MUL, since we added high(a.low * b.low) to
> +   // d.high
> +   if (i->op == OP_MAD)
> +  

Re: [Mesa-dev] [PATCH] nv50/ir: Split 64-bit integer MAD/MUL operations

2016-10-16 Thread Pierre Moreau
On 06:38 pm - Oct 15 2016, Ilia Mirkin wrote:
> On Sat, Oct 15, 2016 at 6:24 PM, Pierre Moreau  wrote:
> > Hardware does not support 64-bit integers MAD and MUL operations, so we need
> > to transform them in 32-bit operations.
> >
> > Signed-off-by: Pierre Moreau 
> > ---
> >  .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 121 
> > +
> >  1 file changed, 121 insertions(+)
> >
> > Tested with (the GPU result was compared to the CPU result):
> > * 0xfff3lu * 0xfff2lu + 0x80070002lu
> > * 0xfff3lu * 0x80070002lu + 0x80070002lu
> > * 0x80010003lu * 0xfff2lu + 0x80070002lu
> > * 0x80010003lu * 0x80070002lu + 0x80070002lu
> >
> > * -523456791234l * 929835793793l + -15793793l
> > *  523456791234l * 929835793793l + -15793793l
> > * -523456791234l * -929835793793l + -15793793l
> > *  523456791234l * -929835793793l + -15793793l
> >
> > v2:
> > * Completely re-write the patch, as it was completely flawed (Ilia Mirkin)
> > * Move pass prior to Register Allocation, as some temporaries need to
> >   be created.
> 
> In principle I like this approach. I don't remember what your old one
> was, but this is good. I think that nearly all of our "legalize" step
> items, including the gpu-family specific ones, need to be moved to
> this type of pass.

The old one inserted itself within the existing
`BuildUtil::split64BitOpPostRA()`.

> 
> >
> > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
> > b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> > index d88bb34..a610eb5 100644
> > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> > @@ -2218,6 +2218,126 @@ LateAlgebraicOpt::visit(Instruction *i)
> >
> >  // 
> > =
> >
> > +// Split 64-bit MUL and MAD
> > +class Split64BitOpPreRA : public Pass
> > +{
> > +private:
> > +   virtual bool visit(BasicBlock *);
> > +   void split64BitReg(Function *, Instruction *, Instruction *,
> > +  Instruction *, Value *, int);

Oops, forgot to remove the above prototype, will do it for v3.

> > +   void split64MulMad(Function *, Instruction *, DataType);
> > +
> > +   BuildUtil bld;
> > +};
> > +
> > +bool
> > +Split64BitOpPreRA::visit(BasicBlock *bb)
> > +{
> > +   Instruction *i, *next;
> > +   Modifier mod;
> > +
> > +   for (i = bb->getEntry(); i; i = next) {
> > +  next = i->next;
> > +
> > +  if (typeSizeof(i->dType) != 8)
> > + continue;
> 
> Is this necessary? You exclusively operate on U64/S64 below.

The above was added as I thought this pass could be reused for other 64-bit
operations that need to be split, while the below switch statement is more of a
remaining from when the code was in `BuildUtil::split64BitOpPostRA()`.
I guess that even if the pass gets support for more operations, FP64 are not
going to be part of it as the hardware supports them. In which case, only
64-bit integers are left, and the below switch statement would indeed be
enough.

> 
> > +
> > +  DataType hTy;
> > +  switch (i->dType) {
> > +  case TYPE_U64: hTy = TYPE_U32; break;
> > +  case TYPE_S64: hTy = TYPE_S32; break;
> > +  default:
> > + continue;
> > +  }
> > +
> > +  if (i->op == OP_MAD || i->op == OP_MUL)
> > + split64MulMad(bb->getFunction(), i, hTy);
> 
> There's an instance variable "func" (and "prog") you can use.

Oh, nice! Will use it.

> 
> > +   }
> > +
> > +   return true;
> > +}
> > +
> > +void
> > +Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType 
> > hTy)
> > +{
> > +   assert(i->op == OP_MAD || i->op == OP_MUL);
> > +   if (isFloatType(i->dType) || isFloatType(i->sType))
> > +  return;
> 
> I'd make this into an assert. Given the checks before calling this
> function, it can't really happen.

True, I’ll change that.

> 
> > +
> > +   bld.setPosition(i, true);
> > +
> > +   Value *zero = bld.mkImm(0u);
> > +   Value *carry = bld.getSSA(1, FILE_FLAGS);
> > +
> > +   // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
> > +   // values (a, b and c might be 32-bit values), using 32-bit operations. 
> > This
> > +   // gives the following operations:
> > +   // * `d.low = low(a.low * b.low) (+ c.low)?`
> > +   // * `d.high = low(a.high * b.low) + low(a.low * b.high)
> > +   //   + high(a.low * b.low) (+ c.high)?`
> > +   //
> > +   // To compute the high bits, we can split in the following operations:
> > +   // * `tmp1   = low(a.high * b.low) (+ c.high)?`
> > +   // * `tmp2   = low(a.low * b.high) + tmp1`
> > +   // * `d.high = high(a.low * b.low) + tmp2`
> > +   //
> > +   // mkSplit put lower bits at index 0 and higher bits at index 1
> > +
> > +   Value *op1[2];
> > +   if 

Re: [Mesa-dev] [PATCH] nv50/ir: Split 64-bit integer MAD/MUL operations

2016-10-15 Thread Ilia Mirkin
On Sat, Oct 15, 2016 at 6:24 PM, Pierre Moreau  wrote:
> Hardware does not support 64-bit integers MAD and MUL operations, so we need
> to transform them in 32-bit operations.
>
> Signed-off-by: Pierre Moreau 
> ---
>  .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 121 
> +
>  1 file changed, 121 insertions(+)
>
> Tested with (the GPU result was compared to the CPU result):
> * 0xfff3lu * 0xfff2lu + 0x80070002lu
> * 0xfff3lu * 0x80070002lu + 0x80070002lu
> * 0x80010003lu * 0xfff2lu + 0x80070002lu
> * 0x80010003lu * 0x80070002lu + 0x80070002lu
>
> * -523456791234l * 929835793793l + -15793793l
> *  523456791234l * 929835793793l + -15793793l
> * -523456791234l * -929835793793l + -15793793l
> *  523456791234l * -929835793793l + -15793793l
>
> v2:
> * Completely re-write the patch, as it was completely flawed (Ilia Mirkin)
> * Move pass prior to Register Allocation, as some temporaries need to
>   be created.

In principle I like this approach. I don't remember what your old one
was, but this is good. I think that nearly all of our "legalize" step
items, including the gpu-family specific ones, need to be moved to
this type of pass.

>
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
> b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> index d88bb34..a610eb5 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> @@ -2218,6 +2218,126 @@ LateAlgebraicOpt::visit(Instruction *i)
>
>  // 
> =
>
> +// Split 64-bit MUL and MAD
> +class Split64BitOpPreRA : public Pass
> +{
> +private:
> +   virtual bool visit(BasicBlock *);
> +   void split64BitReg(Function *, Instruction *, Instruction *,
> +  Instruction *, Value *, int);
> +   void split64MulMad(Function *, Instruction *, DataType);
> +
> +   BuildUtil bld;
> +};
> +
> +bool
> +Split64BitOpPreRA::visit(BasicBlock *bb)
> +{
> +   Instruction *i, *next;
> +   Modifier mod;
> +
> +   for (i = bb->getEntry(); i; i = next) {
> +  next = i->next;
> +
> +  if (typeSizeof(i->dType) != 8)
> + continue;

Is this necessary? You exclusively operate on U64/S64 below.

> +
> +  DataType hTy;
> +  switch (i->dType) {
> +  case TYPE_U64: hTy = TYPE_U32; break;
> +  case TYPE_S64: hTy = TYPE_S32; break;
> +  default:
> + continue;
> +  }
> +
> +  if (i->op == OP_MAD || i->op == OP_MUL)
> + split64MulMad(bb->getFunction(), i, hTy);

There's an instance variable "func" (and "prog") you can use.

> +   }
> +
> +   return true;
> +}
> +
> +void
> +Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy)
> +{
> +   assert(i->op == OP_MAD || i->op == OP_MUL);
> +   if (isFloatType(i->dType) || isFloatType(i->sType))
> +  return;

I'd make this into an assert. Given the checks before calling this
function, it can't really happen.

> +
> +   bld.setPosition(i, true);
> +
> +   Value *zero = bld.mkImm(0u);
> +   Value *carry = bld.getSSA(1, FILE_FLAGS);
> +
> +   // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
> +   // values (a, b and c might be 32-bit values), using 32-bit operations. 
> This
> +   // gives the following operations:
> +   // * `d.low = low(a.low * b.low) (+ c.low)?`
> +   // * `d.high = low(a.high * b.low) + low(a.low * b.high)
> +   //   + high(a.low * b.low) (+ c.high)?`
> +   //
> +   // To compute the high bits, we can split in the following operations:
> +   // * `tmp1   = low(a.high * b.low) (+ c.high)?`
> +   // * `tmp2   = low(a.low * b.high) + tmp1`
> +   // * `d.high = high(a.low * b.low) + tmp2`
> +   //
> +   // mkSplit put lower bits at index 0 and higher bits at index 1
> +
> +   Value *op1[2];
> +   if (i->getSrc(0)->reg.size == 8)
> +  bld.mkSplit(op1, typeSizeof(hTy), i->getSrc(0));
> +   else {
> +  op1[0] = i->getSrc(0);
> +  op1[1] = zero;
> +   }
> +   Value *op2[2];
> +   if (i->getSrc(1)->reg.size == 8)
> +  bld.mkSplit(op2, typeSizeof(hTy), i->getSrc(1));
> +   else {
> +  op2[0] = i->getSrc(1);
> +  op2[1] = zero;
> +   }
> +
> +   Value *op3[2] = { NULL, NULL };
> +   if (i->op == OP_MAD) {
> +  if (i->getSrc(2)->reg.size == 8)
> + bld.mkSplit(op3, typeSizeof(hTy), i->getSrc(2));
> +  else {
> + op3[0] = i->getSrc(2);
> + op3[1] = zero;
> +  }
> +   }
> +
> +   Value *tmpRes1Hi = bld.getSSA();
> +   if (i->op == OP_MAD)
> +  bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]);
> +   else
> +  bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]);
> +
> +   Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], 
> tmpRes1Hi);
> +
> +   Value 

Re: [Mesa-dev] [PATCH] nv50/ir: Split 64-bit integer MAD/MUL operations

2016-10-15 Thread Pierre Moreau
Sorry, there should have been a v2 next to PATCH in the subject…

Pierre


On 12:24 am - Oct 16 2016, Pierre Moreau wrote:
> Hardware does not support 64-bit integers MAD and MUL operations, so we need
> to transform them in 32-bit operations.
> 
> Signed-off-by: Pierre Moreau 
> ---
>  .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 121 
> +
>  1 file changed, 121 insertions(+)
> 
> Tested with (the GPU result was compared to the CPU result):
> * 0xfff3lu * 0xfff2lu + 0x80070002lu
> * 0xfff3lu * 0x80070002lu + 0x80070002lu
> * 0x80010003lu * 0xfff2lu + 0x80070002lu
> * 0x80010003lu * 0x80070002lu + 0x80070002lu
> 
> * -523456791234l * 929835793793l + -15793793l
> *  523456791234l * 929835793793l + -15793793l
> * -523456791234l * -929835793793l + -15793793l
> *  523456791234l * -929835793793l + -15793793l
> 
> v2:
> * Completely re-write the patch, as it was completely flawed (Ilia Mirkin)
> * Move pass prior to Register Allocation, as some temporaries need to
>   be created.
> 
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
> b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> index d88bb34..a610eb5 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
> @@ -2218,6 +2218,126 @@ LateAlgebraicOpt::visit(Instruction *i)
>  
>  // 
> =
>  
> +// Split 64-bit MUL and MAD
> +class Split64BitOpPreRA : public Pass
> +{
> +private:
> +   virtual bool visit(BasicBlock *);
> +   void split64BitReg(Function *, Instruction *, Instruction *,
> +  Instruction *, Value *, int);
> +   void split64MulMad(Function *, Instruction *, DataType);
> +
> +   BuildUtil bld;
> +};
> +
> +bool
> +Split64BitOpPreRA::visit(BasicBlock *bb)
> +{
> +   Instruction *i, *next;
> +   Modifier mod;
> +
> +   for (i = bb->getEntry(); i; i = next) {
> +  next = i->next;
> +
> +  if (typeSizeof(i->dType) != 8)
> + continue;
> +
> +  DataType hTy;
> +  switch (i->dType) {
> +  case TYPE_U64: hTy = TYPE_U32; break;
> +  case TYPE_S64: hTy = TYPE_S32; break;
> +  default:
> + continue;
> +  }
> +
> +  if (i->op == OP_MAD || i->op == OP_MUL)
> + split64MulMad(bb->getFunction(), i, hTy);
> +   }
> +
> +   return true;
> +}
> +
> +void
> +Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy)
> +{
> +   assert(i->op == OP_MAD || i->op == OP_MUL);
> +   if (isFloatType(i->dType) || isFloatType(i->sType))
> +  return;
> +
> +   bld.setPosition(i, true);
> +
> +   Value *zero = bld.mkImm(0u);
> +   Value *carry = bld.getSSA(1, FILE_FLAGS);
> +
> +   // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
> +   // values (a, b and c might be 32-bit values), using 32-bit operations. 
> This
> +   // gives the following operations:
> +   // * `d.low = low(a.low * b.low) (+ c.low)?`
> +   // * `d.high = low(a.high * b.low) + low(a.low * b.high)
> +   //   + high(a.low * b.low) (+ c.high)?`
> +   //
> +   // To compute the high bits, we can split in the following operations:
> +   // * `tmp1   = low(a.high * b.low) (+ c.high)?`
> +   // * `tmp2   = low(a.low * b.high) + tmp1`
> +   // * `d.high = high(a.low * b.low) + tmp2`
> +   //
> +   // mkSplit put lower bits at index 0 and higher bits at index 1
> +
> +   Value *op1[2];
> +   if (i->getSrc(0)->reg.size == 8)
> +  bld.mkSplit(op1, typeSizeof(hTy), i->getSrc(0));
> +   else {
> +  op1[0] = i->getSrc(0);
> +  op1[1] = zero;
> +   }
> +   Value *op2[2];
> +   if (i->getSrc(1)->reg.size == 8)
> +  bld.mkSplit(op2, typeSizeof(hTy), i->getSrc(1));
> +   else {
> +  op2[0] = i->getSrc(1);
> +  op2[1] = zero;
> +   }
> +
> +   Value *op3[2] = { NULL, NULL };
> +   if (i->op == OP_MAD) {
> +  if (i->getSrc(2)->reg.size == 8)
> + bld.mkSplit(op3, typeSizeof(hTy), i->getSrc(2));
> +  else {
> + op3[0] = i->getSrc(2);
> + op3[1] = zero;
> +  }
> +   }
> +
> +   Value *tmpRes1Hi = bld.getSSA();
> +   if (i->op == OP_MAD)
> +  bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]);
> +   else
> +  bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]);
> +
> +   Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], 
> tmpRes1Hi);
> +
> +   Value *def[2] = { bld.getSSA(), bld.getSSA() };
> +
> +   // If it was a MAD, add the carry from the low bits
> +   // It is not needed if it was a MUL, since we added high(a.low * b.low) to
> +   // d.high
> +   if (i->op == OP_MAD)
> +  bld.mkOp3(OP_MAD, hTy, def[0], op1[0], op2[0], op3[0])->setFlagsDef(1, 
> carry);
> +   else
> +  bld.mkOp2(OP_MUL, hTy, def[0], op1[0], op2[0]);
> +
> +   

[Mesa-dev] [PATCH] nv50/ir: Split 64-bit integer MAD/MUL operations

2016-10-15 Thread Pierre Moreau
Hardware does not support 64-bit integers MAD and MUL operations, so we need
to transform them in 32-bit operations.

Signed-off-by: Pierre Moreau 
---
 .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 121 +
 1 file changed, 121 insertions(+)

Tested with (the GPU result was compared to the CPU result):
* 0xfff3lu * 0xfff2lu + 0x80070002lu
* 0xfff3lu * 0x80070002lu + 0x80070002lu
* 0x80010003lu * 0xfff2lu + 0x80070002lu
* 0x80010003lu * 0x80070002lu + 0x80070002lu

* -523456791234l * 929835793793l + -15793793l
*  523456791234l * 929835793793l + -15793793l
* -523456791234l * -929835793793l + -15793793l
*  523456791234l * -929835793793l + -15793793l

v2:
* Completely re-write the patch, as it was completely flawed (Ilia Mirkin)
* Move pass prior to Register Allocation, as some temporaries need to
  be created.

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index d88bb34..a610eb5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -2218,6 +2218,126 @@ LateAlgebraicOpt::visit(Instruction *i)
 
 // 
=
 
+// Split 64-bit MUL and MAD
+class Split64BitOpPreRA : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+   void split64BitReg(Function *, Instruction *, Instruction *,
+  Instruction *, Value *, int);
+   void split64MulMad(Function *, Instruction *, DataType);
+
+   BuildUtil bld;
+};
+
+bool
+Split64BitOpPreRA::visit(BasicBlock *bb)
+{
+   Instruction *i, *next;
+   Modifier mod;
+
+   for (i = bb->getEntry(); i; i = next) {
+  next = i->next;
+
+  if (typeSizeof(i->dType) != 8)
+ continue;
+
+  DataType hTy;
+  switch (i->dType) {
+  case TYPE_U64: hTy = TYPE_U32; break;
+  case TYPE_S64: hTy = TYPE_S32; break;
+  default:
+ continue;
+  }
+
+  if (i->op == OP_MAD || i->op == OP_MUL)
+ split64MulMad(bb->getFunction(), i, hTy);
+   }
+
+   return true;
+}
+
+void
+Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy)
+{
+   assert(i->op == OP_MAD || i->op == OP_MUL);
+   if (isFloatType(i->dType) || isFloatType(i->sType))
+  return;
+
+   bld.setPosition(i, true);
+
+   Value *zero = bld.mkImm(0u);
+   Value *carry = bld.getSSA(1, FILE_FLAGS);
+
+   // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
+   // values (a, b and c might be 32-bit values), using 32-bit operations. This
+   // gives the following operations:
+   // * `d.low = low(a.low * b.low) (+ c.low)?`
+   // * `d.high = low(a.high * b.low) + low(a.low * b.high)
+   //   + high(a.low * b.low) (+ c.high)?`
+   //
+   // To compute the high bits, we can split in the following operations:
+   // * `tmp1   = low(a.high * b.low) (+ c.high)?`
+   // * `tmp2   = low(a.low * b.high) + tmp1`
+   // * `d.high = high(a.low * b.low) + tmp2`
+   //
+   // mkSplit put lower bits at index 0 and higher bits at index 1
+
+   Value *op1[2];
+   if (i->getSrc(0)->reg.size == 8)
+  bld.mkSplit(op1, typeSizeof(hTy), i->getSrc(0));
+   else {
+  op1[0] = i->getSrc(0);
+  op1[1] = zero;
+   }
+   Value *op2[2];
+   if (i->getSrc(1)->reg.size == 8)
+  bld.mkSplit(op2, typeSizeof(hTy), i->getSrc(1));
+   else {
+  op2[0] = i->getSrc(1);
+  op2[1] = zero;
+   }
+
+   Value *op3[2] = { NULL, NULL };
+   if (i->op == OP_MAD) {
+  if (i->getSrc(2)->reg.size == 8)
+ bld.mkSplit(op3, typeSizeof(hTy), i->getSrc(2));
+  else {
+ op3[0] = i->getSrc(2);
+ op3[1] = zero;
+  }
+   }
+
+   Value *tmpRes1Hi = bld.getSSA();
+   if (i->op == OP_MAD)
+  bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]);
+   else
+  bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]);
+
+   Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], 
tmpRes1Hi);
+
+   Value *def[2] = { bld.getSSA(), bld.getSSA() };
+
+   // If it was a MAD, add the carry from the low bits
+   // It is not needed if it was a MUL, since we added high(a.low * b.low) to
+   // d.high
+   if (i->op == OP_MAD)
+  bld.mkOp3(OP_MAD, hTy, def[0], op1[0], op2[0], op3[0])->setFlagsDef(1, 
carry);
+   else
+  bld.mkOp2(OP_MUL, hTy, def[0], op1[0], op2[0]);
+
+   Instruction *hiPart3 = bld.mkOp3(OP_MAD, hTy, def[1], op1[0], op2[0], 
tmpRes2Hi);
+   hiPart3->subOp = NV50_IR_SUBOP_MUL_HIGH;
+   if (i->op == OP_MAD)
+  hiPart3->setFlagsSrc(3, carry);
+
+   bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), def[0], def[1]);
+
+   delete_Instruction(fn->getProgram(), i);
+}
+
+// 
=
+
 static inline void