[Bug target/102974] GCC optimization is very poor for add carry and multiplication combos

2023-06-06 Thread slash.tmp at free dot fr via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #16 from Mason  ---
For the record, the example I provided was intended to show that, with some
help, GCC can generate good code for bigint multiplication. In this situation,
"help" means a short asm template.

[Bug target/102974] GCC optimization is very poor for add carry and multiplication combos

2023-06-06 Thread unlvsur at live dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #15 from cqwrteur  ---
template<::std::unsigned_integral T>
inline constexpr T add_carry_no_carry_in(T a,T b,T& carryout) noexcept
{
T res{a+b};
carryout=res<a; // unsigned wrap-around: the sum carried iff it is smaller than an addend
return res;
}

template<::std::unsigned_integral T>
inline constexpr T add_carry(T a,T b,T carryin,T& carryout) noexcept
{
assume(carryin==0||carryin==1);
a=add_carry_no_carry_in(carryin,a,carryout);
a=add_carry_no_carry_in(a,b,carryin);
carryout+=carryin;
assume(carryout==0||carryout==1);
return a;
}
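
(As posted, the snippet needs <concepts> for ::std::unsigned_integral and a
definition of assume(), which is not a GCC built-in under that name. A
minimal sketch of those missing pieces, plus a chained two-limb use; the
names assume() and add_u128 here are stand-ins:)

#include <concepts>

inline constexpr void assume(bool b) noexcept
{
if(!b) __builtin_unreachable(); // lets the optimizer treat b as always true
}

// ... add_carry_no_carry_in / add_carry from above go here ...

using u64 = unsigned long long;

// 128-bit addition over two 64-bit limbs (little-endian limb order).
inline constexpr void add_u128(u64& lo,u64& hi,u64 blo,u64 bhi) noexcept
{
u64 carry;
lo=add_carry_no_carry_in(lo,blo,carry); // limb 0: no carry in
hi=add_carry(hi,bhi,carry,carry); // propagate the carry into limb 1
}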

[Bug target/102974] GCC optimization is very poor for add carry and multiplication combos

2023-06-06 Thread unlvsur at live dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #14 from cqwrteur  ---
template<::std::unsigned_integral T>
inline constexpr T add_carry_no_carry_in(T a,T b,T& carryout) noexcept
{
T res{a+b};
carryout=res<a; // carry out iff the unsigned sum wrapped around
return res;
}

template<::std::unsigned_integral T>
inline constexpr T add_carry(T a,T b,T carryin,T& carryout) noexcept
{
assume(carryin==0||carryin==1);
a=add_carry_no_carry_in(carryin,a,carryout);
a=add_carry_no_carry_in(a,b,carryin);
carryout+=carryin;
assume(carryout==0||carryout==1);
return a;
}

[Bug target/102974] GCC optimization is very poor for add carry and multiplication combos

2023-06-06 Thread unlvsur at live dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #13 from cqwrteur  ---
Hi, the problem is that GCC does not do a very good job with crypto
computations, which usually exploit all sorts of patterns.


template<::std::unsigned_integral T>
inline constexpr T add_carry_no_carry_in(T a,T b,T& carryout) noexcept
{
T res{a+b};
carryout=res<a; // carry out iff the unsigned sum wrapped around
return res;
}

template<::std::unsigned_integral T>
inline constexpr T add_carry(T a,T b,T carryin,T& carryout) noexcept
{
assume(carryout==0||carryout==1);
a=add_carry_no_carry_in(carryin,a,carryout);
a=add_carry_no_carry_in(a,b,carryin);
carryout+=carryin;
return a;
}

See this pattern
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106865

I suggest just adding this addc pattern to GCC instead of adding a built-in
like clang does. This would improve existing code. It would, however,
require adding a backend hook to deal with it.
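
(For comparison, a sketch of the built-in form clang offers;
__builtin_addc is a clang extension, and add_carry_u32 is a hypothetical
wrapper name:)

unsigned add_carry_u32(unsigned a, unsigned b,
                       unsigned carryin, unsigned *carryout)
{
unsigned c;
unsigned r = __builtin_addc(a, b, carryin, &c); /* c is 0 or 1 */
*carryout = c;
return r;
}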

[Bug target/102974] GCC optimization is very poor for add carry and multiplication combos

2023-06-06 Thread slash.tmp at free dot fr via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #12 from Mason  ---
Actually, in this case, we don't need to propagate the carry over 3 limbs.

typedef unsigned int u32;
typedef unsigned long long u64;

/* u32 acc[2], a[1], b[1] */
static void mul_add_32x32(u32 *acc, const u32 *a, const u32 *b)
{
  u64 res = (u64)a[0] * b[0];
  u32 lo = res, hi = res >> 32;
  asm("add %[LO], %[D0]\n\t" "adc %[HI], %[D1]" :
  [D0] "+m" (acc[0]), [D1] "+m" (acc[1]) :
  [LO] "r" (lo), [HI] "r" (hi) : "cc");
}

/* u32 acc[4], a[2], b[2] */
void mul_64x64_128(u32 *acc, const u32 *a, const u32 *b)
{
  mul_add_32x32(acc+0, a+0, b+0);
  mul_add_32x32(acc+1, a+0, b+1);
  mul_add_32x32(acc+1, a+1, b+0);
  mul_add_32x32(acc+2, a+1, b+1);
}

gcc-trunk -O3 -m32

mul_64x64_128:
  pushl %esi
  pushl %ebx
  movl 16(%esp), %ebx
  movl 20(%esp), %esi
  movl 12(%esp), %ecx
  movl (%esi), %eax
  mull (%ebx)
  add %eax, (%ecx)
  adc %edx, 4(%ecx)
  movl 4(%esi), %eax
  mull (%ebx)
  add %eax, 4(%ecx)
  adc %edx, 8(%ecx)
  movl (%esi), %eax
  mull 4(%ebx)
  add %eax, 4(%ecx)
  adc %edx, 8(%ecx)
  movl 4(%esi), %eax
  mull 4(%ebx)
  add %eax, 8(%ecx)
  adc %edx, 12(%ecx)
  popl %ebx
  popl %esi
  ret
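
(A quick self-check sketch, not part of the bug report: ref_mul_64x64_128
and main are hypothetical names, the code must be appended to the file
above for the typedefs and mul_64x64_128, and it needs an x86 target built
with -m32 since the template uses 32-bit add/adc. Note that acc must start
out zeroed, and that for extreme inputs such as a = b =
{0xFFFFFFFF,0xFFFFFFFF} the third partial product can carry out of the adc
into acc[2], a carry this two-limb variant would drop; the three-limb
variant in comment #11 below is the conservative choice:)

#include <stdio.h>
#include <string.h>

/* Plain-C schoolbook reference for 64x64 -> 128 as four 32-bit limbs. */
static void ref_mul_64x64_128(u32 *r, const u32 *a, const u32 *b)
{
  u64 t0 = (u64)a[0] * b[0], t1 = (u64)a[0] * b[1];
  u64 t2 = (u64)a[1] * b[0], t3 = (u64)a[1] * b[1];
  u64 c;
  r[0] = (u32)t0;
  c = (t0 >> 32) + (u32)t1 + (u32)t2;                /* fits in a u64 */
  r[1] = (u32)c;
  c = (c >> 32) + (t1 >> 32) + (t2 >> 32) + (u32)t3;
  r[2] = (u32)c;
  r[3] = (u32)((c >> 32) + (t3 >> 32));
}

int main(void)
{
  u32 a[2] = { 0x89ABCDEF, 0x01234567 };
  u32 b[2] = { 0x12345678, 0x9ABCDEF0 };
  u32 acc[4] = { 0 }, ref[4];
  mul_64x64_128(acc, a, b);
  ref_mul_64x64_128(ref, a, b);
  puts(memcmp(acc, ref, sizeof acc) ? "MISMATCH" : "ok");
  return 0;
}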

[Bug target/102974] GCC optimization is very poor for add carry and multiplication combos

2023-06-03 Thread slash.tmp at free dot fr via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #11 from Mason  ---
Here's umul_least_64() rewritten as mul_64x64_128() in C

typedef unsigned int u32;
typedef unsigned long long u64;

/* u32 acc[3], a[1], b[1] */
static void mul_add_32x32(u32 *acc, const u32 *a, const u32 *b)
{
  u64 res = (u64)a[0] * b[0];
  u32 lo = res, hi = res >> 32;
  asm("add %[LO], %[D0]\n\t" "adc %[HI], %[D1]\n\t" "adc $0, %[D2]" :
  [D0] "+m" (acc[0]), [D1] "+m" (acc[1]), [D2] "+m" (acc[2]) :
  [LO] "r" (lo), [HI] "r" (hi) : "cc");
}

/* u32 acc[5], a[2], b[2] */
void mul_64x64_128(u32 *acc, const u32 *a, const u32 *b)
{
  mul_add_32x32(acc+0, a+0, b+0);
  mul_add_32x32(acc+1, a+0, b+1);
  mul_add_32x32(acc+1, a+1, b+0);
  mul_add_32x32(acc+2, a+1, b+1);
}

gcc-trunk -O3 -m32

mul_64x64_128:
  pushl %esi
  pushl %ebx
  movl 16(%esp), %ebx   ; ebx = a
  movl 20(%esp), %esi   ; esi = b
  movl 12(%esp), %ecx   ; ecx = acc
  movl (%esi), %eax ; b0
  mull (%ebx)   ; a0*b0
  add %eax, (%ecx)
  adc %edx, 4(%ecx)
  adc $0, 8(%ecx)
  movl 4(%esi), %eax; b1
  mull (%ebx)   ; a0*b1
  add %eax, 4(%ecx)
  adc %edx, 8(%ecx)
  adc $0, 12(%ecx)
  movl (%esi), %eax ; b0
  mull 4(%ebx)  ; a1*b0
  add %eax, 4(%ecx)
  adc %edx, 8(%ecx)
  adc $0, 12(%ecx)
  movl 4(%esi), %eax; b1
  mull 4(%ebx)  ; a1*b1
  add %eax, 8(%ecx)
  adc %edx, 12(%ecx)
  adc $0, 16(%ecx)
  popl %ebx
  popl %esi
  ret

[Bug target/102974] GCC optimization is very poor for add carry and multiplication combos

2021-11-01 Thread marxin at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

Martin Liška  changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |marxin at gcc dot gnu.org

--- Comment #10 from Martin Liška  ---
(In reply to cqwrteur from comment #9)
> (In reply to Richard Biener from comment #8)
> > As mentioned in the other bug STV might make things only worse.
> 
> what is stv?

$ gcc --help=target | grep mstv
  -mstv   Disable Scalar to Vector optimization pass
transforming 64-bit integer computations into a vector ones.
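
(For a given testcase, the pass can be toggled off with -mno-stv to compare
the generated code, e.g.:)

gcc-trunk -O2 -m32 -mno-stv test.c -S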

[Bug target/102974] GCC optimization is very poor for add carry and multiplication combos

2021-10-28 Thread unlvsur at live dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #9 from cqwrteur  ---
(In reply to Richard Biener from comment #8)
> As mentioned in the other bug STV might make things only worse.

what is stv?

[Bug target/102974] GCC optimization is very poor for add carry and multiplication combos

2021-10-28 Thread rguenth at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #8 from Richard Biener  ---
As mentioned in the other bug STV might make things only worse.

[Bug target/102974] GCC optimization is very poor for add carry and multiplication combos

2021-10-27 Thread unlvsur at live dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #7 from cqwrteur  ---
(In reply to cqwrteur from comment #6)
> (In reply to Andrew Pinski from comment #5)
> > (In reply to cqwrteur from comment #4)
> > > (In reply to cqwrteur from comment #3)
> > > > (In reply to Andrew Pinski from comment #2)
> > > > > There might be another bug about _addcarryx_u64 already.
> > > > 
> > > > This is 32 bit addcarry.
> > > 
> > > But yeah, GCC does not optimize add-carry and mul combinations very
> > > well, nor does it recognize the >>64u <<64u patterns.
> > 
> > I mean all of _addcarryx_* intrinsics.
> 
> https://godbolt.org/z/qq3nb49Eq
> https://godbolt.org/z/cqoYG35jx
> Also, this is weird: just extracting part of the code into a function
> generates different assembly for __builtin_bit_cast. It must be an inliner bug.

My fault for misreading.

(In reply to Andrew Pinski from comment #5)
> (In reply to cqwrteur from comment #4)
> > (In reply to cqwrteur from comment #3)
> > > (In reply to Andrew Pinski from comment #2)
> > > > There might be another bug about _addcarryx_u64 already.
> > > 
> > > This is 32 bit addcarry.
> > 
> > But yeah, GCC does not optimize add-carry and mul combinations very
> > well, nor does it recognize the >>64u <<64u patterns.
> 
> I mean all of _addcarryx_* intrinsics.

This example is also interesting in that -O2, -O3, and -Ofast generate much
worse assembly than -O1. There is no point in using SIMD for things like this.

[Bug target/102974] GCC optimization is very poor for add carry and multiplication combos

2021-10-27 Thread unlvsur at live dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #6 from cqwrteur  ---
(In reply to Andrew Pinski from comment #5)
> (In reply to cqwrteur from comment #4)
> > (In reply to cqwrteur from comment #3)
> > > (In reply to Andrew Pinski from comment #2)
> > > > There might be another bug about _addcarryx_u64 already.
> > > 
> > > This is 32 bit addcarry.
> > 
> > But yeah, GCC does not optimize add-carry and mul combinations very
> > well, nor does it recognize the >>64u <<64u patterns.
> 
> I mean all of _addcarryx_* intrinsics.

https://godbolt.org/z/qq3nb49Eq
https://godbolt.org/z/cqoYG35jx
Also, this is weird: just extracting part of the code into a function
generates different assembly for __builtin_bit_cast. It must be an inliner bug.

[Bug target/102974] GCC optimization is very poor for add carry and multiplication combos

2021-10-27 Thread pinskia at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #5 from Andrew Pinski  ---
(In reply to cqwrteur from comment #4)
> (In reply to cqwrteur from comment #3)
> > (In reply to Andrew Pinski from comment #2)
> > > There might be another bug about _addcarryx_u64 already.
> > 
> > This is 32 bit addcarry.
> 
> But yeah, GCC does not optimize add-carry and mul combinations very
> well, nor does it recognize the >>64u <<64u patterns.

I mean all of _addcarryx_* intrinsics.

[Bug target/102974] GCC optimization is very poor for add carry and multiplication combos

2021-10-27 Thread unlvsur at live dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #4 from cqwrteur  ---
(In reply to cqwrteur from comment #3)
> (In reply to Andrew Pinski from comment #2)
> > There might be another bug about _addcarryx_u64 already.
> 
> This is 32 bit addcarry.

But yeah, GCC does not optimize add-carry and mul combinations very well,
nor does it recognize the >>64u <<64u patterns.
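
(For concreteness, a sketch of the kind of pattern meant; mul_hi_lo is a
hypothetical name, and unsigned __int128 assumes a 64-bit target:)

typedef unsigned long long u64;

/* Ideally this lowers to a single widening multiply, with no actual
   128-bit shift instructions. */
u64 mul_hi_lo(u64 a, u64 b, u64 *hi)
{
  unsigned __int128 p = (unsigned __int128)a * b;
  *hi = (u64)(p >> 64);   /* high half */
  return (u64)p;          /* low half, i.e. (p << 64) >> 64 */
}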

[Bug target/102974] GCC optimization is very poor for add carry and multiplication combos

2021-10-27 Thread unlvsur at live dot com via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

--- Comment #3 from cqwrteur  ---
(In reply to Andrew Pinski from comment #2)
> There might be another bug about _addcarryx_u64 already.

This is 32 bit addcarry.
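
(For reference, a sketch of the 32-bit intrinsic in question, using
_addcarry_u32 from <immintrin.h>; _addcarryx_u32 is the ADX variant, and
add2_u32 is a hypothetical name:)

#include <immintrin.h>

/* Add two 64-bit values held as 32-bit lo/hi pairs, chaining the carry. */
unsigned add2_u32(unsigned a_lo, unsigned a_hi,
                  unsigned b_lo, unsigned b_hi, unsigned *sum_hi)
{
  unsigned lo, hi;
  unsigned char c = _addcarry_u32(0, a_lo, b_lo, &lo);
  (void)_addcarry_u32(c, a_hi, b_hi, &hi);
  *sum_hi = hi;
  return lo;
}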

[Bug target/102974] GCC optimization is very poor for add carry and multiplication combos

2021-10-27 Thread pinskia at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102974

Andrew Pinski  changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
          Component|tree-optimization           |target

--- Comment #2 from Andrew Pinski  ---
There might be another bug about _addcarryx_u64 already.