Re: gcd_22

2019-08-31 Thread Marco Bodrato
Ciao,

On Wed, 28 August 2019, 11:39 am, Torbjörn Granlund wrote:
> Marco Bodrato  writes:

>   or maybe simply a loop that knows that U will be smaller until also V
>   will be one (some) limb shorter...
>
>   ...and finally tail-call to the smaller fixed-size asm function?
>
> I think this is a good proposal.

I implemented it, you can find it attached.

The funny thing is that with the attached code, the whole chain gcd_55 ->
lowz -> gcd_44 -> lowz -> gcd_33 -> lowz is handled with tail calls.
On the other hand, a great part of the code is duplicated, yet it cannot
exploit the fixed-size shortcuts... I'm not so sure the resulting code
is useful.

Ĝis,
m

-- 
http://bodrato.it/papers/

#include "gmp-impl.h"
#include "longlong.h"

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define CMP_SWAP(ap,bp,n)		\
  do {	\
mp_limb_t tmp;			\
__asm__ ("cmp\t%5, %6"	"\n\t"	\
	 "mov\t%1, %0"	"\n\t"	\
	 "cmovae\t%2, %1"	"\n\t"	\
	 "cmovae\t%0, %2"		\
	 : "=," (tmp), "=r,r" (ap), "=r,r" (bp)			\
	 : "1,1" (ap), "2,2" (bp), "r,rm" (ap[n-1]), "rm,r" (bp[n-1])); \
  } while (0)
#define badCMP_SWAP(ap,bp,n)		\
  do {	\
mp_limb_t tmp;			\
__asm__ ("cmp\t%5, %6"	"\n\t"	\
	 "mov\t%3, %0"	"\n\t"	\
	 "cmovae\t%4, %1"	"\n\t"	\
	 "cmovae\t%0, %2"		\
	 : "=," (tmp), "=r,r" (ap), "=r,r" (bp)			\
	 : "r,r" (ap), "r,r" (bp), "r,rm" (ap[n-1]), "rm,r" (bp[n-1])); \
  } while (0)
#endif

#ifndef CMP_SWAP
#define CMP_SWAP(ap,bp,n)		\
  do {	\
    mp_limb_t m = -(mp_limb_t) (ap[n - 1] < bp[n - 1]);		\
    mp_limb_t *_tp = ap;						\
    ap = (mp_ptr) (((size_t) ap & ~m) | ((size_t) bp & m));		\
    bp = (mp_ptr) (((size_t) bp & ~m) | ((size_t) _tp & m));		\
  } while (0)
#endif

#ifndef CMP_SWAP
#define CMP_SWAP(ap,bp,n)		\
  do {	\
    if (ap[n - 1] < bp[n - 1])		\
      {					\
        mp_limb_t *_tp = ap; ap = bp; bp = _tp;	\
      }					\
  } while (0)
#endif


#define SMASH(a,b) a##b

void mpn_gcd_33 (mp_limb_t *, mp_limb_t *, mp_limb_t *);
void mpn_gcd_44 (mp_limb_t *, mp_limb_t *, mp_limb_t *);
void mpn_gcd_55 (mp_limb_t *, mp_limb_t *, mp_limb_t *);

mp_limb_t mpn_sub_3 (mp_ptr, mp_srcptr, mp_srcptr);
mp_limb_t mpn_sub_4 (mp_ptr, mp_srcptr, mp_srcptr);
mp_limb_t mpn_sub_5 (mp_ptr, mp_srcptr, mp_srcptr);
void mpn_rshift_3 (mp_ptr, mp_srcptr, int);
void mpn_rshift_4 (mp_ptr, mp_srcptr, int);
void mpn_rshift_5 (mp_ptr, mp_srcptr, int);

static void
lowz (mp_ptr rp, mp_ptr up, mp_ptr vp, mp_size_t n)
{
  // Handle any number of low U zeros, including the case U = 0
  mp_size_t i;
  ASSERT (up[0] == 0);
  ASSERT (n > 1);
  for (i = 1; up[i] == 0;)
    if (++i == n)
      {
        mpn_copyi (rp, vp, n);
        return;
      }

  if ((up[i] & 1) == 0)
    {
      int cnt;
      count_trailing_zeros (cnt, up[i]);
      mpn_rshift (up, up + i, n - i, cnt);
    }
  else
    {
      mpn_copyi (up, up + i, n - i);
    }
  MPN_FILL (up + n - i, i, 0);

  while (1)
    {
      if (UNLIKELY (vp[n - 1] == 0))
        {
          rp[--n] = 0;
          if (--i == 0)
            break;
        }
      else
        {
          vp[n - 1] -= mpn_sub_n (vp, vp, up, n - 1);

          if (UNLIKELY (vp[0] == 0))
            {
              lowz (rp, vp, up, n);
              return;
            }
          else
            {
              int cnt;
              count_trailing_zeros (cnt, vp[0]);
              mpn_rshift (vp, vp, n, cnt);
            }
        }
    }

  // We've reduced the operands by at least a full limb.  Call the next smaller primitive.
  switch (n)
    {
    case 1:
      {
        rp[0] = mpn_gcd_11 (up[0], vp[0]);
        break;
      }
    case 2:
      {
        mp_double_limb_t g = mpn_gcd_22 (up[1], up[0], vp[1], vp[0]);
        rp[0] = g.d0;
        rp[1] = g.d1;
        break;
      }
    case 3:
      mpn_gcd_33 (rp, up, vp);
      break;
    default:
      ASSERT (n == 4);
      mpn_gcd_44 (rp, up, vp);
      break;
    }
}

static inline void
mpn_gcd_NN (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp, size_t N)
{
  mp_limb_t cy;

  do
    {
      CMP_SWAP (up, vp, N);

#if LATER
      if ((vp[0] - up[0] & 3) != 0
          && LIKELY (((up[N - 1] | vp[N - 1]) & GMP_NUMB_HIGHBIT) == 0))
        {
          cy = mpn_add_n (up, up, vp, N);
        }
      else
#endif
#if HAVE_NATIVE_mpn_sub_3
      if (N == 3)
        cy = mpn_sub_3 (up, up, vp);
      else
#endif
#if HAVE_NATIVE_mpn_sub_4
      if (N == 4)
        cy = mpn_sub_4 (up, up, vp);
      else
#endif
#if HAVE_NATIVE_mpn_sub_5
      if (N == 5)
        cy = mpn_sub_5 (up, up, vp);
      else
#endif
        cy = mpn_sub_n (up, up, vp, N);

      if (UNLIKELY (cy))
        mpn_neg (up, up, N);

      if (UNLIKELY (up[0] == 0))
        {
          lowz (rp, up, vp, N);
          return;
        }
      else
        {
          int cnt;
          count_trailing_zeros (cnt, up[0]);
#if HAVE_NATIVE_mpn_rshift_3
          if (N == 3)
            mpn_rshift_3 (up, up, cnt);
          else
#endif
#if HAVE_NATIVE_mpn_rshift_4
          if (N == 4)
            mpn_rshift_4 (up, up, cnt);
          else
#endif
#if HAVE_NATIVE_mpn_rshift_5
          if (N == 5)
            mpn_rshift_5 (up, up, cnt);
          else
#endif
            mpn_rshift (up, up, N, cnt);
        }
    }
  while ((up[N - 1] |

Re: gcd_22

2019-08-28 Thread Torbjörn Granlund
Marco Bodrato  writes:

  Maybe tail-call?

  When up[0] == 0 but U is not zero and we can not return the
  result... That's an unlikely case, but it means that one of the
  operands gets much smaller than the other. Maybe a special strategy
  can be used in this case? Maybe a division, maybe a 2-adic division,
  or maybe simply a loop that knows that U will be smaller until also V
  will be one (some) limb shorter...

  ...and finally tail-call to the smaller fixed-size asm function?

I think this is a good proposal.

It is not certain that u-v with its new low-order limb zero is smaller
than v.  But if it is not, we can go to the next smaller algorithm
directly, I assume.
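A concrete example, writing B for the limb base: U = 5*B + 3 and V = B + 3
give U - V = 4*B, whose low limb is zero but which is still larger than V.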

Here is what I have now.  Feel free to improve it!



gcd-mpn.c
Description: Binary data


-- 
Torbjörn
Please encrypt, key id 0xC8601622


Re: gcd_22

2019-08-28 Thread Marco Bodrato

Ciao,

On 2019-08-28 00:09, t...@gmplib.org wrote:
I broke out the unlikely up[0] code into a separate function, a function
which can be common for any size.  A function which we might call from
asm.


Maybe tail-call?

When up[0] == 0 but U is not zero and we can not return the result... 
That's an unlikely case, but it means that one of the operands gets much 
smaller than the other. Maybe a special strategy can be used in this 
case? Maybe a division, maybe a 2-adic division, or maybe simply a loop 
that knows that U will be smaller until also V will be one (some) limb 
shorter...


...and finally tail-call to the smaller fixed-size asm function?

m


Re: gcd_22

2019-08-27 Thread Torbjörn Granlund
Marco Bodrato  writes:

  For generic code with variable N, one may prefer code that chooses
  whether a copy or a shorter shift is needed. But this means more code, and
  the shift could not be an inlined fixed-size version...

I broke out the unlikely up[0] code into a separate function, a function
which can be common for any size.  A function which we might call from
asm.

I agree that the copy and the shift could be combined.

-- 
Torbjörn
Please encrypt, key id 0xC8601622


Re: gcd_22

2019-08-27 Thread Marco Bodrato

On 2019-08-27 21:10, t...@gmplib.org wrote:

Marco Bodrato  writes:



  ... and on some platform mpn_rshift may not support cnt==0.

That was taken care of in my last version.


I wrote my message earlier, and did not realize before sending it that
you had sent a new version :-)



I added a goto, I think that is an OK solution.


It is correct.
For generic code with variable N, one may prefer code that chooses
whether a copy or a shorter shift is needed. But this means more code, and the
shift could not be an inlined fixed-size version...


m



Re: gcd_22

2019-08-27 Thread Marco Bodrato

Ciao,

On 2019-08-27 16:35, t...@gmplib.org wrote:

I got something working.  It runs quite well, and seems to beat the


Great!


static inline void
mpn_gcd_NN (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp, size_t N)


I see that your idea is to obtain a N-loop-unrolled version...


  if (UNLIKELY (cy))
mpn_neg (up, up, N);


This is an unlikely branch, but I'd really suggest
 {mpn_neg (up, up, N -1); up[N-1] = 0;}
because mpn_neg is full of branches :-)


  if (UNLIKELY (up[0] == 0))
{


This unlikely branch may end up with an odd up[0]...


}
  int cnt;
  count_trailing_zeros (cnt, up[0]);
  mpn_rshift (up, up, N, cnt);


... and on some platform mpn_rshift may not support cnt==0.

Maybe (for the C version) we should duplicate the ctz/shift instruction, 
both inside the UNLIKELY (*up == 0) branch (deciding whether to shift or 
copy) and in an else (likely) branch.
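
Roughly like this (just a sketch of the structure I have in mind, with
cnt an int and the U == 0 case left out):

  if (UNLIKELY (up[0] == 0))
    {
      mp_size_t i = 1;
      while (up[i] == 0)        /* U == 0 must be caught before this */
        i++;
      if ((up[i] & 1) == 0)     /* decide whether to shift or just copy */
        {
          count_trailing_zeros (cnt, up[i]);
          mpn_rshift (up, up + i, N - i, cnt);
        }
      else
        mpn_copyi (up, up + i, N - i);
      MPN_FILL (up + N - i, i, 0);
    }
  else
    {
      /* Here up[0] is even and non-zero, so cnt >= 1 and mpn_rshift
         never sees cnt == 0.  */
      count_trailing_zeros (cnt, up[0]);
      mpn_rshift (up, up, N, cnt);
    }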


m

--
http://bodrato.it/papers/


Re: gcd_22

2019-08-27 Thread Torbjörn Granlund
Some cleanups and tweaks later.  The gcd_33 based on this, compiled with
gcc 8.3, runs at 30 cycles per iteration.  (Note, not cycles per bit!)

My best gcd_33 in assembly runs at 10 cycles per iteration.

The former uses memory based operands.  The latter keeps everything in
registers.

If we wrote an assembly variant of this, and inlined sub_3 and rshift_3,
I expect it to run at about 15 cycles per iteration.

(Timings are for AMD Ryzen.)



gcd-mpn.c
Description: Binary data


x64-mpn_N.asm
Description: Binary data

-- 
Torbjörn
Please encrypt, key id 0xC8601622


Re: gcd_22

2019-08-27 Thread Torbjörn Granlund
I got something working.  It runs quite well, and seems to beat the
performance of mpn_gcd.  Here is the code:

#include "gmp-impl.h"
#include "longlong.h"

#ifndef CMP_SWAP
#define CMP_SWAP(ap,bp,n)   \
  do {  \
if (ap[n - 1] < bp[n - 1])  \
  { \
mp_limb_t *_tp = ap; ap = bp; bp = _tp; \
  } \
  } while (0)
#endif
// We should do CMP_SWAP something like this:
// cmp
// mov up, tmp
// cmov vp, up
// cmov tmp, vp

#define SMASH(a,b) a##b

void mpn_gcd_33 (mp_limb_t *, mp_limb_t *, mp_limb_t *);
void mpn_gcd_44 (mp_limb_t *, mp_limb_t *, mp_limb_t *);
void mpn_gcd_55 (mp_limb_t *, mp_limb_t *, mp_limb_t *);

mp_limb_t mpn_sub_3 (mp_ptr, mp_srcptr, mp_srcptr);
mp_limb_t mpn_rshift_3 (mp_ptr, mp_srcptr, int);

static inline void
mpn_gcd_NN (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp, size_t N)
{
  mp_limb_t cy;

  do
{
  // Conditionally swap U and V.  This must be done without jumps.
  CMP_SWAP(up,vp,N);

#if HAVE_NATIVE_mpn_sub_3
  cy = mpn_sub_3 (up, up, vp);
#else
  cy = mpn_sub_n (up, up, vp, N);
#endif
  if (UNLIKELY (cy))
mpn_neg (up, up, N);

  if (UNLIKELY (up[0] == 0))
{
  // Handle any number of low U zeros including that U = 0
  int all_zeros = 1;
  mp_size_t i;
  for (i = 1; i < N && all_zeros; i++)
all_zeros = (up[i] == 0);

  if (all_zeros)
{
  mpn_copyi (rp, vp, N);
  return;
}
  mpn_copyi (up, up + (i - 1), N - (i - 1));
  mpn_zero (up + N - (i - 1), i - 1);
}
  int cnt;
  count_trailing_zeros (cnt, up[0]);
#if HAVE_NATIVE_mpn_rshift_3
  mpn_rshift_3 (up, up, cnt);
#else
  mpn_rshift (up, up, N, cnt);
#endif
}
  while ((up[N - 1] | vp[N - 1]) != 0);

  switch (N)
{
case 3:
  {
mp_double_limb_t g = mpn_gcd_22 (up[1], up[0], vp[1], vp[0]);
rp[0] = g.d0;
rp[1] = g.d1;
rp[2] = 0;
break;
  }
case 4:
  mpn_gcd_33 (rp, up, vp);
  rp[3] = 0;
  break;
case 5:
  mpn_gcd_44 (rp, up, vp);
  rp[4] = 0;
  break;
}
}

void
SMASH(mpn_gcd_,33) (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp)
{
  mpn_gcd_NN (rp, up, vp, 3);
}

void
SMASH(mpn_gcd_,44) (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp)
{
  mpn_gcd_NN (rp, up, vp, 4);
}

void
SMASH(mpn_gcd_,55) (mp_limb_t *rp, mp_limb_t *up, mp_limb_t *vp)
{
  mpn_gcd_NN (rp, up, vp, 5);
}

-- 
Torbjörn
Please encrypt, key id 0xC8601622


Re: gcd_22

2019-08-27 Thread Torbjörn Granlund
ni...@lysator.liu.se (Niels Möller) writes:

  And to make the loop work, it needs some condition to decrement N and
  maintain non-zero high limbs (if both up[N-1] and vp[N-1] are zero,
  comparison is no good). So that would be something like

Since N in my proposal is a constant, it is considered bad practice to
decrement it.

Instead I expect deflection to a loop handling N-1.

Besides, I am not sure you got the decrementation criteria exactly
right. ;-)
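
By deflection I mean something like this, in a gcd_33-style loop with
result area rp (sketch only, not tested):

  /* When both high limbs have become zero, do not decrement N;
     instead hand the remaining two limbs to the fixed-size routine.  */
  if ((up[2] | vp[2]) == 0)
    {
      mp_double_limb_t g = mpn_gcd_22 (up[1], up[0], vp[1], vp[0]);
      rp[0] = g.d0;
      rp[1] = g.d1;
      rp[2] = 0;
      return;
    }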

  Back when the current gcd code was written, I think it won against the
  accelerated binary gcd also for small sizes, and that's why we didn't
  introduce any lehmer-vs-binary threshold.

I believe we will be able to beat the old "accelerated binary" code and
the present code with a branch-reduced approach.

Reducing the branches should be done next, but it will take some work to
reduce them enough for a significant speedup.  At least that's my
understanding.

-- 
Torbjörn
Please encrypt, key id 0xC8601622


Re: gcd_22

2019-08-27 Thread Marco Bodrato
Ciao,

On Sun, 25 August 2019, 2:28 am, Torbjörn Granlund wrote:
> Now we have a nice set of x86_64 gcd_22.  The code is not as well tuned
> as the gcd_11 code, but it runs somewhat fast.

So if I suggest reordering some instructions in the loop, you will not
be upset :-)
If we can change the order of the cmovc instructions, there are a couple of
instructions that can be removed from the "unlikely" branch in gcd_22:

*** /tmp/extdiff.1GZGxB/gmp.746b5528f6a5/mpn/x86_64/core2/gcd_22.asm   2019-08-25 10:32:39.0 +0200
--- /home/bodrato/gmp/mpn/x86_64/core2/gcd_22.asm   2019-08-26 23:23:30.321470451 +0200
***************
*** 94,108 ****

bsf t0, cnt

sub v0, u0
sbb v1, u1

- L(bck):   cmovc   t0, u0  C u = |u - v|
cmovc   t1, u1  C u = |u - v|
cmovc   s0, v0  C v = min(u,v)
cmovc   s1, v1  C v = min(u,v)

shrdR8(cnt), u1, u0
shr R8(cnt), u1

mov v1, t1
--- 94,108 ----

bsf t0, cnt

sub v0, u0
sbb v1, u1

cmovc   t1, u1  C u = |u - v|
cmovc   s0, v0  C v = min(u,v)
+ L(bck):   cmovc   t0, u0  C u = |u - v|
cmovc   s1, v1  C v = min(u,v)

shrdR8(cnt), u1, u0
shr R8(cnt), u1

mov v1, t1
***************
*** 118,131 ****
C 1. If v1 - u1 = 0, then gcd is u = v.
C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
mov v1, t0
sub u1, t0
je  L(end)

!   xor t1, t1
!   mov u0, s0
mov u1, s1
bsf t0, cnt
mov u1, u0
xor u1, u1
sub v1, u0
jmp L(bck)
--- 118,131 ----
C 1. If v1 - u1 = 0, then gcd is u = v.
C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
mov v1, t0
sub u1, t0
je  L(end)

! C xor t1, t1
! C mov u0, s0
mov u1, s1
bsf t0, cnt
mov u1, u0
xor u1, u1
sub v1, u0
jmp L(bck)

The same applies to other x86_64 gcd_22...

> I haven't explored the table based variant which gives 3 bits of
> progress per iteration.  It might make the new code obsolete for
> machines with fast multiply.

I'm not sure it's easy to handle the case with u+v that overflows...

Ĝis,
m

-- 
http://bodrato.it/papers/



Re: gcd_22

2019-08-26 Thread Niels Möller
"Marco Bodrato"  writes:

> Some messages ago, Niels suggested that discarding the smallest number
> keeping the largest one in some unlikely case could be a good idea.

I think that is ok provided the numbers are close...

> I did not completely understand what he meant... But here I can see his
> suggestion was quite clever.
> I'd suggest
>   if (UNLIKELY (cy)) { mpn_neg(up,up, N-1); up[N-1] = 0; }
> without jumping back... :-)

Including in this case, since, if I understand it correctly, it can
happen only when the high limbs are equal, up[N-1] == vp[N-1]; and as
that's "close enough", progress should be almost the same.

And to make the loop work, it needs some condition to decrement N and
maintain non-zero high limbs (if both up[N-1] and vp[N-1] are zero,
comparison is no good). So that would be something like

 while (N > 2)
   {
 if (up[N-1] < vp[N-1])
   swap up,vp
 cy = mpn_sub_n (up, up, vp, N);
 if (UNLIKELY (cy)) { mpn_neg(up,up, N-1); up[N-1] = 0; }
 if (UNLIKELY (up[0] == 0)) {
   // handle any number zeros including that U = 0
 }
 count_trailing_zeros (cnt, up[0]);
 mpn_rshift (up, up, N, cnt);
 N -= ((up[n-1] | vp[N-1]) > 0);
   }
 mpn_gcd_22(...);

It would absolutely be interesting to benchmark that against the current
Lehmer code based on hgcd2, to see where the cross-over is. And
preferably benchmarked with respect to size in bits, not size in limbs
(for Lehmer, I suspect three full limbs may need two hgcd2 iterations,
and hence be significantly slower than 3 limbs minus a few bits).

Back when the current gcd code was written, I think it won against the
accelerated binary gcd also for small sizes, and that's why we didn't
introduce any lehmer-vs-binary threshold.

Regards,
/Niels

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.


Re: gcd_22

2019-08-26 Thread Marco Bodrato
Ciao,

On Mon, 26 August 2019, 4:36 pm, Torbjörn Granlund wrote:
>for (;;) {
>  if (up[N-1] < vp[N-1])
>  swap up,vp
>   back:
>cy = mpn_sub_n (up, up, vp, N);
>if (UNLIKELY (cy)) { swap up,vp; goto back }
>if (UNLIKELY (up[0] == 0)) {
>  // handle any number zeros including that U = 0
>}
>count_trailing_zeros (cnt, up[0]);
>mpn_rshift (up, up, N, cnt);
>}

Some messages ago, Niels suggested that discarding the smallest number
and keeping the largest one in some unlikely case could be a good idea.

I did not completely understand what he meant... But here I can see his
suggestion was quite clever.
I'd suggest
  if (UNLIKELY (cy)) { mpn_neg(up,up, N-1); up[N-1] = 0; }
without jumping back... :-)


Ĝis,
m

-- 
http://bodrato.it/papers/



Re: gcd_22

2019-08-26 Thread Torbjörn Granlund
I wonder if not something like this,

   for (;;) {
 if (up[N-1] < vp[N-1])
   swap up,vp
back:
 cy = mpn_sub_n (up, up, vp, N);
 if (UNLIKELY (cy)) { swap up,vp; goto back }
 if (UNLIKELY (up[0] == 0)) {
   // handle any number zeros including that U = 0
 }
 count_trailing_zeros (cnt, up[0]);
 mpn_rshift (up, up, N, cnt);
   }

would fit between our gcd_NN and the hgcd2 code?

The calls to mpn_sub_n and mpn_rshift should probably really be inlined
for specific values of N, but generic code for variable N might also be
useful.

-- 
Torbjörn
Please encrypt, key id 0xC8601622


Re: gcd_22

2019-08-26 Thread Torbjörn Granlund
"Marco Bodrato"  writes:

  If I'm not reading this timings wrongly, this means that with the current
  code (disregarding the overhead, for those 64-bits limbs)
  the bits in the limb 1 require 4 cycles each;
  the bits in the limb 2 require 8 cycles each;
  the bits in the limb 3 require 54 cycles each;
  the bits in the limb 4 require 33 cycles each...

Timing is confusing, and I cannot tell if you're right.  Perhaps Niels
can?!

For good measure, I wrote a gcd_33 in the style of gcd_22.  It runs
only about 20% slower than gcd_22 on AMD Ryzen, which is really good!  The code
runs on Intel Haswell and later and on AMD Excavator and later.
Attached below.

I agree with Niels that we should optimise hgcd2 and its div1 and div2
callees.  They are chock-full of unpredictable branches.  But it might
also make sense to provide a set of gcd_kk for small k, as gcd is an
important operation which is also very slow compared to most other GMP
operations.



x64-zny-gcd_33.asm
Description: Binary data


-- 
Torbjörn
Please encrypt, key id 0xC8601622


Re: gcd_22

2019-08-25 Thread Niels Möller
Victor Shoup  writes:

> Regarding the so-called doc bug, if I understand the issue correctly,
> I don’t think it’s a good idea to add more preconditions to the
> documentation. In fact, I think that would be a really bad idea.

I agree it's usually a bad idea, but may be ok under the circumstances.
What's happened:

1. There used to be rather specific input requirements on mpn_gcd
   inputs.

2. Implementation was rewritten, making most of those requirements
   irrelevant.

3. Documentation was updated to relax requirements. Unfortunately,
   documentation was relaxed a bit too far, since the implementation
   never worked for both inputs being even. (commit
   https://gmplib.org/repo/gmp/rev/71efb0367192, 2011).

   It would crash with an assert in gcd_22, if asserts are enabled. I'm
   not sure in which way it would fail if asserts are disabled, but I
   think it would both produce a wrong result and do that very slowly.

4. There's been no bug reports since, about mpn_gcd not working properly
   for two even inputs.
 
Regards,
/Niels

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.


Re: gcd_22

2019-08-25 Thread Marco Bodrato
Ciao,

On Sun, 25 August 2019, 9:34 am, Niels Möller wrote:
> t...@gmplib.org (Torbjörn Granlund) writes:

>> Should we have gcd_33, gcd_44, and gcd_55 also?

> I think it would be more beneficial to optimize hgcd2.

> E.g., 3-limb gcd should typically boil down to one hgcd2, a couple of
> 3:1 mul_1/addmul_1, and one call to gcd_22. It's not obvious that a
> gcd_33 using the binary algorithm will be faster.

On shell I get:
$ tune/speed -CD -s16-64 -t48 mpn_gcd_11 mpn_gcd_22
overhead 6.84 cycles, precision 1 units of 2.86e-10 secs, CPU freq 3500.09 MHz
        mpn_gcd_11    mpn_gcd_22
16      (#78.3223)    (134.4004)
64         #4.0877        7.8639

$ tune/speed -CD -s1-4 mpn_gcd
overhead 6.83 cycles, precision 1 units of 2.86e-10 secs, CPU freq 3500.09 MHz
        mpn_gcd
1       (350.7188)
2        519.4961
3       3465.1028
4       2168.9167

If I'm not reading this timings wrongly, this means that with the current
code (disregarding the overhead, for those 64-bits limbs)
the bits in the limb 1 require 4 cycles each;
the bits in the limb 2 require 8 cycles each;
the bits in the limb 3 require 54 cycles each;
the bits in the limb 4 require 33 cycles each...
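(My arithmetic, in case it is not obvious: the mpn_gcd figures above
divided by the 64 bits in a limb, 519.4961/64 ≈ 8, 3465.1028/64 ≈ 54,
2168.9167/64 ≈ 34; the 1-limb figure is just the gcd_11 measurement.)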


On the same machine, with ABI=32, gcd_22.c is used:

$ tune/speed -CD -s8-32 -t24 mpn_gcd_11 mpn_gcd_22
overhead 6.86 cycles, precision 1 units of 2.86e-10 secs, CPU freq 3500.09 MHz
        mpn_gcd_11    mpn_gcd_22
8       (#45.9492)    (126.3672)
32         #4.5485       17.2290

$ tune/speed -CD -s1-4 mpn_gcd
overhead 6.85 cycles, precision 1 units of 2.86e-10 secs, CPU freq 3500.09 MHz
        mpn_gcd
1       (270.6172)
2        538.6328
3       1797.6500
4       1258.4594

the bits in the limb 1 require 4.5 cycles each;
the bits in the limb 2 require 17 cycles each;
the bits in the limb 3 require 56 cycles each;
the bits in the limb 4 require 39 cycles each...


It really seems that the _33 case needs some work.

Ĝis,
m

-- 
http://bodrato.it/papers/



Re: gcd_22

2019-08-25 Thread Victor Shoup
Regarding the so-called doc bug, if I understand the issue correctly, I don’t 
think it’s a good idea to add more preconditions to the documentation. In fact, 
I think that would be a really bad idea.  

> On Aug 25, 2019, at 2:34 AM, Niels Möller wrote:
> 
> t...@gmplib.org (Torbjörn Granlund) writes:
> 
>> Now what?
> 
> 1. Update mpn_gcd.c and other callers of gcd_11 to call the new
>   functions (like in the patch I posted few days ago). Watch out for
>   performance regressions.
> 
> 2. Consider alternative entry points. Allowing one even operand would
>   simplify mpn_gcd, at least. Entrypoints with initial reduction (like
>   current mpn_gcd_1, but with a more carefully tuned threshold).
> 
> 3. Fix the doc(?) bug in mpn_gcd, regarding even inputs.
> 
>> Should we have gcd_33, gcd_44, and gcd_55 also?
> 
> I think it would be more beneficial to optimize hgcd2. I think one could
> have a loop use table lookup on the most significant bits, to get an
> approximate quotient. Not sure if it's reasonable to do a
> count_leading_zeros per iteration, or if one should have some other
> book-keeping if (approximate) leading bits, or maintain normalization
> within the loop.
> 
> E.g., 3-limb gcd should typically boil down to one hgcd2, a couple of
> 3:1 mul_1/addmul_1, and one call to gcd_22. It's not obvious that a
> gcd_33 using the binary algorithm will be faster.
> 
> Regards,
> /Niels
> 
> -- 
> Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
> Internet email is subject to wholesale government surveillance.



Re: gcd_22

2019-08-25 Thread Niels Möller
t...@gmplib.org (Torbjörn Granlund) writes:

> Now what?

1. Update mpn_gcd.c and other callers of gcd_11 to call the new
   functions (like in the patch I posted few days ago). Watch out for
   performance regressions.

2. Consider alternative entry points. Allowing one even operand would
   simplify mpn_gcd, at least. Entrypoints with initial reduction (like
   current mpn_gcd_1, but with a more carefully tuned threshold).

3. Fix the doc(?) bug in mpn_gcd, regarding even inputs.

> Should we have gcd_33, gcd_44, and gcd_55 also?

I think it would be more beneficial to optimize hgcd2. I think one could
have a loop use table lookup on the most significant bits, to get an
approximate quotient. Not sure if it's reasonable to do a
count_leading_zeros per iteration, or if one should have some other
book-keeping if (approximate) leading bits, or maintain normalization
within the loop.

E.g., 3-limb gcd should typically boil down to one hgcd2, a couple of
3:1 mul_1/addmul_1, and one call to gcd_22. It's not obvious that a
gcd_33 using the binary algorithm will be faster.

Regards,
/Niels

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.


Re: gcd_22

2019-08-24 Thread Torbjörn Granlund
Now we have a nice set of x86_64 gcd_22.  The code is not as well tuned
as the gcd_11 code, but it runs somewhat fast.

I haven't explored the table based variant which gives 3 bits of
progress per iteration.  It might make the new code obsolete for
machines with fast multiply.

Now what?  Should we have gcd_33, gcd_44, and gcd_55 also?  :-) (It is
clear that these could improve speed greatly, and with the gcd_22 code
as a starting point they would not be hard to write.  Well, gcd_44 and
above would not be able to keep everything in the 15 usable registers of x86_64.)
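
(A rough register count: an inner loop in the gcd_22 style keeps U, V,
the difference and a saved copy plus a shift count live, which for
gcd_44 is about 4*4 + 1 = 17 values against the 15 available.)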

-- 
Torbjörn
Please encrypt, key id 0xC8601622


Re: gcd_22

2019-08-23 Thread Torbjörn Granlund
ni...@lysator.liu.se (Niels Möller) writes:

  The below implementation appears to pass tests, and give a modest
  speedup of 0.2 cycles per input bit, or 2.5%. Benchmark, comparing C
  implementations of gcd_11 and gcd_22:

Beware of "turbo" when counting cycles!  (Relative measurements like
gcd_11 vs gcd_22 should be fine!)

The speed difference between C gcd_11 and gcd_22 is surprisingly small!
Perhaps gcd_11 should be rewritten in the style of gcd_22?


I did not provide a top-level gcd_22 for x86_64 as you might have seen.
The one similar to x86_64/gcd_11.asm is probably x86_64/k8/gcd_22.asm.
Perhaps it should be moved.

But as far as I can tell, that function is slower than your C gcd_22 on
some platforms, such as Intel Haswell.

I'm curious if your C code could be made into competitive asm.  One
usually can beat the compiler some 10-30%.

Measurements for gcd_11/22 for most of our machines are in.  See
https://gmplib.org/devel/tm/gmp/date.html and click on any HOSTgentoo64
tuneup link.  Scroll down; after the normal *_THRESHOLD stuff comes
comparative measurements of asm code.  (The mpn/generic code is not
usually measured; the exception is when it appears in the default
column.  I plan to fix this some day, and have a few columns "gcc -O",
"gcc -Os", "gcc -O2".)

-- 
Torbjörn
Please encrypt, key id 0xC8601622


Re: gcd_22

2019-08-20 Thread Niels Möller
t...@gmplib.org (Torbjörn Granlund) writes:

> Why would it be faster?  Because while cnd1 needs to wait for the
> previous iteration's double-limb shifting, cnd2 depends on just the
> single-limb shifting of the high U word.

The below implementation appears to pass tests, and give a modest
speedup of 0.2 cycles per input bit, or 2.5%. Benchmark, comparing C
implementations of gcd_11 and gcd_22:

$ ./tune/speed -p 10 -s 1-64 -t 3 -C mpn_gcd_11 mpn_gcd_22
overhead 4.01 cycles, precision 10 units of 1.03e-09 secs, CPU freq 974.93 MHz
       mpn_gcd_11    mpn_gcd_22
1         #7.0542       12.8931
4         #2.8895        6.7366
7         #3.2135        7.0463
10        #3.8417        7.2487
13        #4.4648        7.4802
16        #5.0947        7.4609
19        #5.2140        7.6955
22        #5.3684        7.5914
25        #5.2278        7.7002
28        #5.4690        7.6998
31        #5.3778        7.7374
34        #5.4362        7.7808
37        #5.5201        7.7397
40        #5.3979        7.7096
43        #5.4339        7.7887
46        #5.4229        7.7318
49        #5.4557        7.7160
52        #5.4047        7.6908
55        #5.4043        7.6791
58        #5.3859        7.6680
61        #5.4084        7.7529
64        #5.4277        7.6582

Regards,
/Niels

mp_double_limb_t
mpn_gcd_22 (mp_limb_t u1, mp_limb_t u0, mp_limb_t v1, mp_limb_t v0)
{
  mp_double_limb_t g;
  ASSERT_ALWAYS (u0 & v0 & 1);

  /* Implicit least significant bit */
  u0 = (u0 >> 1) | (u1 << (GMP_LIMB_BITS - 1));
  u1 >>= 1;

  v0 = (v0 >> 1) | (v1 << (GMP_LIMB_BITS - 1));
  v1 >>= 1;

  while (u1 || v1) /* u1 == 0 can happen at most twice per call */
{
  mp_limb_t vgtu, t1, t0;
  sub_ddmmss (t1, t0, u1, u0, v1, v0);
  vgtu = LIMB_HIGHBIT_TO_MASK(u1 - v1);

  if (UNLIKELY (t0 == 0))
{
  if (t1 == 0)
{
  g.d1 = (u1 << 1) | (u0 >> (GMP_LIMB_BITS - 1));
  g.d0 = (u0 << 1) | 1;
  return g;
}
  int c;
  count_trailing_zeros (c, t1);

  /* v1 = min (u1, v1) */
  v1 += (vgtu & t1);
  /* u0 = |u1 - v1| */
  u0 = (t1 ^ vgtu) - vgtu;
  ASSERT (c < GMP_LIMB_BITS - 1);
  u0 >>= c + 1;
  u1 = 0;
}
  else
{
  int c;
  count_trailing_zeros (c, t0);
  c++;
  /* V <-- min (U, V).

 Assembly version should use cmov. Another alternative,
 avoiding carry propagation, would be

	 v0 += vgtu & t0; v1 += vgtu & (u1 - v1);
  */
      add_ssaaaa (v1, v0, v1, v0, vgtu & t1, vgtu & t0);
  /* U  <--  |U - V|
 No carry handling needed in this conditional negation,
 since t0 != 0. */
  u0 = (t0 ^ vgtu) - vgtu;
  u1 = t1 ^ vgtu;
  if (UNLIKELY ((mp_limb_signed_t) u1 < 0))
{
  u0 = -u0;
  u1 = ~u1;
}
  if (UNLIKELY (c == GMP_LIMB_BITS))
{
  u0 = u1;
  u1 = 0;
}
  else
{
  u0 = (u0 >> c) | (u1 << (GMP_LIMB_BITS - c));
  u1 >>= c;
}
}
}
  while ((v0 | u0) & GMP_LIMB_HIGHBIT)
{ /* At most two iterations */
  mp_limb_t vgtu, t0;
  int c;
  sub_ddmmss (vgtu, t0, 0, u0, 0, v0);
  if (UNLIKELY (t0 == 0))
{
  g.d1 = u0 >> (GMP_LIMB_BITS - 1);
  g.d0 = (u0 << 1) | 1;
  return g;
}

  /* v <-- min (u, v) */
  v0 += (vgtu & t0);

  /* u <-- |u - v| */
  u0 = (t0 ^ vgtu) - vgtu;

  count_trailing_zeros (c, t0);
  u0 = (u0 >> 1) >> c;
}

  g.d0 = mpn_gcd_11 ((u0 << 1) + 1, (v0 << 1) + 1);
  g.d1 = 0;
  return g;
}

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.


Re: gcd_22

2019-08-20 Thread Torbjörn Granlund
ni...@lysator.liu.se (Niels Möller) writes:

  I take it the idea is to have an unlikely branch for this case;
  attempting a branchless correction of vgtu would miss the point?

Yes.

-- 
Torbjörn
Please encrypt, key id 0xC8601622


Re: gcd_22

2019-08-20 Thread Niels Möller
t...@gmplib.org (Torbjörn Granlund) writes:

> I think we might get more speed from this algorithm:
>
>   while (highlimb(U) | highlimb(V)) != 0
> S = U - V
> T = V - U
> cnt = count_trailing_zeros(lowlimb(S))
> cnd1 = U < V
> cnd2 = highlimb(U) < highlimb(V)
> if (cnd2)
>   V = U
>   U = T
> else
>   U = S
> if (cnd1 != cnd2) { fix mistake! }
> U >>= cnt;
>
> Why would it be faster?  Because while cnd1 needs to wait for the
> previous iteration's double-limb shifting, cnd2 depends on just the
> single-limb shifting of the high U world.

Interesting. I did something related in the u±v gcd_22 variant, for
other reasons. We probably don't need any fixup for the V assignment, I
would expect that

   V <-- min(U,V)

can be replaced with

   V <-- (highlimb(U) < highlimb(V)) ? U : V

with no measurable impact on performance. Since it will make a
difference for at most one iteration, and we still make reasonable
progress when it happens.


--- a/mpn/generic/gcd_22.c Sat Aug 17 00:59:04 2019 +0200
+++ b/mpn/generic/gcd_22.c Tue Aug 20 07:51:25 2019 +0200
@@ -52,7 +53,7 @@ mpn_gcd_22 (mp_limb_t u1, mp_limb_t u0, 
 {
   mp_limb_t vgtu, t1, t0;
   sub_ddmmss (t1, t0, u1, u0, v1, v0);
-  vgtu = LIMB_HIGHBIT_TO_MASK(t1);
+  vgtu = LIMB_HIGHBIT_TO_MASK(u1 - v1);
 
   if (UNLIKELY (t0 == 0))
  {
@@ -91,6 +92,9 @@ mpn_gcd_22 (mp_limb_t u1, mp_limb_t u0, 
   since t0 != 0. */
u0 = (t0 ^ vgtu) - vgtu;
u1 = t1 ^ vgtu;
+   if (UNLIKELY ((mp_limb_signed_t) u1 < 0))
+ { 
+   u0 = ~u0;
+   u1 = -u1;
+ }
if (UNLIKELY (c == GMP_LIMB_BITS))
  {
u0 = u1;

I take it the idea is to have an unlikely branch for this case;
attempting a branchless correction of vgtu would miss the point?

Regards,
/Niels

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.


Re: gcd_22

2019-08-19 Thread Torbjörn Granlund
We now have the algorithm

  while (highlimb(U) | highlimb(V)) != 0
S = U - V
T = V - U
cnt = count_trailing_zeros(lowlimb(S))
cnd1 = U < V
if (cnd1)
  V = U
  U = T
else
  U = S
U >>= cnt

with some variation.  The "if" statement is performed with masking
tricks or conditional move/conditional select.
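
In C, the masked variant of that "if" is roughly this for the two-limb
case (a sketch, not the exact code in gcd_22.c):

  mp_limb_t m = -(mp_limb_t) cnd1;        /* 0 if U >= V, all ones if U < V */
  mp_limb_t w0 = (v0 & ~m) | (u0 & m);    /* V = cnd1 ? U : V */
  mp_limb_t w1 = (v1 & ~m) | (u1 & m);
  u0 = (s0 & ~m) | (t0 & m);              /* U = cnd1 ? T : S */
  u1 = (s1 & ~m) | (t1 & m);
  v0 = w0;
  v1 = w1;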

The main performance problem with that code is the dependency on
lowlimb(U) in the conditional assignment, through the double-limb shift,
to the S and T assignments.  The looping criterion is a smaller problem,
as branch prediction takes care of that.

I think we might get more speed from this algorithm:

  while (highlimb(U) | highlimb(V)) != 0
S = U - V
T = V - U
cnt = count_trailing_zeros(lowlimb(S))
cnd1 = U < V
cnd2 = highlimb(U) < highlimb(V)
if (cnd2)
  V = U
  U = T
else
  U = S
if (cnd1 != cnd2) { fix mistake! }
U >>= cnt;

Why would it be faster?  Because while cnd1 needs to wait for the
previous iteration's double-limb shifting, cnd2 depends on just the
single-limb shifting of the high U word.

We might make the wrong decision iff highlimb(U) = highlimb(V) in
which case cnd1 might be true (cnd2 is false of course).

But when highlimb(U) = highlimb(V) then either highlimb(U-V) = 0 or
highlimb(V-U) = 0, so we're about to go into a gcd_21 situation.  (We
might also have highlimb(U-V) = highlimb(V-U) = 0, in which case we're
done completely.)
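(When the high limbs are equal they cancel in the subtraction, so the
true difference |U - V| is just |lowlimb(U) - lowlimb(V)| and fits in a
single limb.)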

With some inspiration from Richard Sites, the Alpha architect, I say: It's
the critical path, stupid!  :-)

-- 
Torbjörn
Please encrypt, key id 0xC8601622


Re: gcd_22

2019-08-19 Thread Torbjörn Granlund
t...@gmplib.org (Torbjörn Granlund) writes:

  Your gcd_22.c triggered a lingering bug in longlong.h:

  ARM targets which default to Thumb code cannot do the rsc instruction
  (reverse subtract with carry).  We will need to write a separate
  sub_ddmmss for Thumb.

  The problem statement is sub_ddmmss (vgtu, d0, 0, u0, 0, v0) which just
  is there to generate a mask.

I checked in a fix.  Not too pretty, since it did lots of code
duplication.

-- 
Torbjörn
Please encrypt, key id 0xC8601622


Re: gcd_22

2019-08-18 Thread Torbjörn Granlund
Your gcd_22.c triggered a lingering bug in longlong.h:

ARM targets which default to Thumb code cannot do the rsc instruction
(reverse subtract with carry).  We will need to write a separate
sub_ddmmss for Thumb.

The problem statement is sub_ddmmss (vgtu, d0, 0, u0, 0, v0) which just
is there to generate a mask.
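
(In plain C that mask is simply vgtu = -(mp_limb_t) (u0 < v0); the
sub_ddmmss form computes the same thing as the high word of the
double-limb difference 0:u0 - 0:v0, which is 0 when u0 >= v0 and all
ones when u0 < v0.)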

(We may want to add specific support for that idiom in many sub_ddmmss
implementations; subtract-with-borrow rx,rx,rx would work for any
platform.  We could do that for the ARM sub_ddmmss as a stop-gap fix,
hiding this bug.  But I think I prefer a proper fix.  GCC seems to
define various __thumb symbols for our needs.)


-- 
Torbjörn
Please encrypt, key id 0xC8601622


Re: gcd_22

2019-08-17 Thread Niels Möller
I've now tried the u + v variant for gcd_22. I've strived to minimize
the number of carry propagations, and I see no obvious
micro-optimizations. And it's slower: time increases from 7.8 cycles per
input bit to 9.8 when I measure.

Unclear if it's because the critical path gets longer, or if it's just
too many instructions.

We get a sequence of dependent instructions (xor, and, neg, xor) before
we get to the sub_ddmmss that cancels some low bits for us.

Regards,
/Niels

mp_double_limb_t
mpn_gcd_22 (mp_limb_t u1, mp_limb_t u0, mp_limb_t v1, mp_limb_t v0)
{
  mp_double_limb_t g;
  ASSERT (u0 & v0 & 1);

  /* Implicit least significant bit */
  u0 = (u0 >> 1) | (u1 << (GMP_LIMB_BITS - 1));
  u1 >>= 1;

  v0 = (v0 >> 1) | (v1 << (GMP_LIMB_BITS - 1));
  v1 >>= 1;

  while (u1 || v1) /* u1 == 0 can happen at most twice per call */
{
  mp_limb_t x1, x0, vgtu;
  mp_limb_t t1, t0, use_add;
  int c;
  x1 = u1 ^ v1;
  x0 = u0 ^ v0;

      /* Incorrectly false if u1 == v1 and v0 > u0, but that's good
         enough to select the "smallest" of u, v.  */
  vgtu = LIMB_HIGHBIT_TO_MASK (u1 - v1);

  use_add = - (x0 & 1);
  sub_ddmmss (t1, t0, u1, u0, use_add ^ v1, use_add ^ v0);
  if (UNLIKELY (t0 == 0))
{
  if (t1 == 0)
{
  g.d1 = (u1 << 1) | (u0 >> (GMP_LIMB_BITS - 1));
  g.d0 = (u0 << 1) | 1;
  return g;
}
  int c;
  count_trailing_zeros (c, t1);

  /* v1 = min (u1, v1) */
  v1 ^= (vgtu & x1);
  /* u0 = |u1 - v1| */
      vgtu = ~use_add & LIMB_HIGHBIT_TO_MASK(t1);
  u0 = (t1 ^ vgtu) - vgtu;
  ASSERT (c < GMP_LIMB_BITS - 1);
  u0 >>= c + 1;
  u1 = 0;
}
  else
{
  count_trailing_zeros (c, t0);
  c++;
  /* V <-- min (U, V). */
  v1 ^= vgtu & x1;
  v0 ^= vgtu & x0;

  vgtu = ~use_add & LIMB_HIGHBIT_TO_MASK (t1);

  /* U  <--  |U - V|
 No carry handling needed in this conditional negation,
 since t0 != 0. */
  u0 = (t0 ^ vgtu) - vgtu;
  u1 = t1 ^ vgtu;
  if (UNLIKELY (c == GMP_LIMB_BITS))
{
  u0 = u1;
  u1 = 0;
}
  else
{
  u0 = (u0 >> c) | (u1 << (GMP_LIMB_BITS - c));
  u1 >>= c;
}
}
}
  while ((v0 | u0) & GMP_LIMB_HIGHBIT)
{ /* At most two iterations */
  mp_limb_t vgtu, d0;
  int c;
  sub_ddmmss (vgtu, d0, 0, u0, 0, v0);
  if (UNLIKELY (d0 == 0))
{
  g.d1 = u0 >> (GMP_LIMB_BITS - 1);
  g.d0 = (u0 << 1) | 1;
  return g;
}

  /* v <-- min (u, v) */
  v0 += (vgtu & d0);

  /* u <-- |u - v| */
  u0 = (d0 ^ vgtu) - vgtu;

  count_trailing_zeros (c, d0);
  u0 = (u0 >> 1) >> c;
}

  g.d0 = mpn_gcd_11 ((u0 << 1) + 1, (v0 << 1) + 1);
  g.d1 = 0;
  return g;
}


-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.


Re: gcd_22

2019-08-16 Thread Niels Möller
t...@gmplib.org (Torbjörn Granlund) writes:

> ni...@lysator.liu.se (Niels Möller) writes:
>
>   $ ./tune/speed -p 10 -s 1-64 -t 3 -C mpn_gcd_11 mpn_gcd_22
>   overhead 4.01 cycles, precision 10 units of 5.08e-10 secs, CPU freq 1966.75 MHz
>          mpn_gcd_11    mpn_gcd_22
>   1         #7.0459       11.0729
>   4         #2.8721        6.6946
>   7         #2.8415        7.3658
>   10        #3.3039        7.4346
>   13        #3.6084        7.8537
>   16        #4.0952        7.8193
>   19        #4.0620        8.0769
>   22        #3.9624        7.9849
>   25        #3.9169        8.0956
>   28        #4.0359        8.2084
>   31        #3.9816        8.0920
>   34        #3.9748        8.0933
>   37        #4.0041        8.0792
>   40        #3.9327        8.0656
>   43        #3.9239        8.1070
>   46        #3.9064        8.0289
>   49        #3.9143        8.0191
>   52        #4.0642        9.5547
>   55        #4.1251        9.8507
>   58        #4.2523        7.9096
>   61        #3.8440        7.8590
>   64        #3.8325        7.8398
>
> Is that right column for C code?  That's pretty good!

Yep. I take it branches in the old code were expensive...

> I reach performance of 1.5t and 2t where t is the gcd_11 time.

The above is also about 1.5t, if compared to the C gcd_11, which runs at
5.4 cycles per input bit on my machine.
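(Roughly 7.8 / 5.4 ≈ 1.45, so just under 1.5t.)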

I've just pushed the new gcd_22.

Regards,
/Niels

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.


Re: gcd_22

2019-08-16 Thread Torbjörn Granlund
ni...@lysator.liu.se (Niels Möller) writes:

  $ ./tune/speed -p 10 -s 1-64 -t 3 -C mpn_gcd_11 mpn_gcd_22
  overhead 4.01 cycles, precision 10 units of 5.08e-10 secs, CPU freq 1966.75 MHz
         mpn_gcd_11    mpn_gcd_22
  1         #7.0459       11.0729
  4         #2.8721        6.6946
  7         #2.8415        7.3658
  10        #3.3039        7.4346
  13        #3.6084        7.8537
  16        #4.0952        7.8193
  19        #4.0620        8.0769
  22        #3.9624        7.9849
  25        #3.9169        8.0956
  28        #4.0359        8.2084
  31        #3.9816        8.0920
  34        #3.9748        8.0933
  37        #4.0041        8.0792
  40        #3.9327        8.0656
  43        #3.9239        8.1070
  46        #3.9064        8.0289
  49        #3.9143        8.0191
  52        #4.0642        9.5547
  55        #4.1251        9.8507
  58        #4.2523        7.9096
  61        #3.8440        7.8590
  64        #3.8325        7.8398

Is that right column for C code?  That's pretty good!

I now have two gcd_22 asm variants for x86-64, one based on negation with
an sbb-generated mask, and the other based on x86_64/core2/gcd_11.asm.

I reach performance of 1.5t and 2t where t is the gcd_11 time.

(Cycle counting is tricky now with chips overclocking themselves while
still counting cycles as per the base clock.)

-- 
Torbjörn
Please encrypt, key id 0xC8601622
___
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel


Re: gcd_22

2019-08-16 Thread Niels Möller
ni...@lysator.liu.se (Niels Möller) writes:

> So the gcd_22 code with all the branches needs about 11 cycles per input
> bit. gcd_11 is coreihwl/gcd_11.asm in this build.

I had to do a quick try with the masking version before leaving for work:

$ ./tune/speed -p 10 -s 1-64 -t 3 -C mpn_gcd_11 mpn_gcd_22
overhead 4.01 cycles, precision 10 units of 5.08e-10 secs, CPU freq 1966.75 MHz
       mpn_gcd_11    mpn_gcd_22
1         #7.0459       11.0729
4         #2.8721        6.6946
7         #2.8415        7.3658
10        #3.3039        7.4346
13        #3.6084        7.8537
16        #4.0952        7.8193
19        #4.0620        8.0769
22        #3.9624        7.9849
25        #3.9169        8.0956
28        #4.0359        8.2084
31        #3.9816        8.0920
34        #3.9748        8.0933
37        #4.0041        8.0792
40        #3.9327        8.0656
43        #3.9239        8.1070
46        #3.9064        8.0289
49        #3.9143        8.0191
52        #4.0642        9.5547
55        #4.1251        9.8507
58        #4.2523        7.9096
61        #3.8440        7.8590
64        #3.8325        7.8398

So down to around 8 cycles per input bit.

Regards,
/Niels

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.


Re: gcd_22

2019-08-16 Thread Niels Möller
t...@gmplib.org (Torbjörn Granlund) writes:

> ni...@lysator.liu.se (Niels Möller) writes:
>
>   Attached is a patch to move the current (slow) gcd_2 to a separate file, 
> rename
>   it mpn_gcd_22, change argument and return type, and add a basic test.
>
> Nice!

Pushed now.

> Speed support would be very welcome!

Done! To get to measure only the gcd_22 loop, I reused the speed macros
for gcd_11, but call gcd_22 (al, al, bl, bl), in effect premultiplying
both inputs with a common factor (B+1). On my machine,

$ ./tune/speed -s 1-64 -t 3 -C mpn_gcd_11 mpn_gcd_22
overhead 4.05 cycles, precision 1 units of 5.87e-10 secs, CPU freq 1703.77 MHz
       mpn_gcd_11    mpn_gcd_22
1         #9.9349       11.4688
4         #3.7217        6.5098
7         #3.6897        6.7366
10        #3.8477        7.9195
13        #4.0583        9.3407
16        #4.1460        9.9268
19        #4.4428       10.8306
22        #4.3512       10.9613
25        #4.3475       10.9706
28        #4.3407       11.3027
31        #4.3135       10.9393
34        #4.2116       11.6335
37        #4.3125       11.5507
40        #4.1918       11.9219
43        #4.2716       10.2703
46        #3.9071       10.2429
49        #3.9190       10.2698
52        #4.0637       10.3419
55        #4.0631       11.6783
58        #4.1228       11.8280
61        #4.0512       10.3701
64        #3.8345       10.3486

So the gcd_22 code with all the branches needs about 11 cycles per input
bit. gcd_11 is coreihwl/gcd_11.asm in this build.

Regards,
/Niels

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.


Re: gcd_22

2019-08-15 Thread Torbjörn Granlund
ni...@lysator.liu.se (Niels Möller) writes:

  Attached is a patch to move the current (slow) gcd_2 to a separate file, 
rename
  it mpn_gcd_22, change argument and return type, and add a basic test.

Nice!

  Do you agree with the naming in 

  typedef struct
  {
mp_limb_t d0, d1;
  } mp_double_limb_t;

  or should it be something else?

That's a good name.  (I doubt we will need structs for 3 and 4 limbs, so
we don't need a fancier naming scheme.)

  Next would be speed support, and then it's easier to try ways to make it
  faster.

Speed support would be very welcome!

-- 
Torbjörn
Please encrypt, key id 0xC8601622