t...@gmplib.org (Torbjörn Granlund) writes:

> I had not realised you meant left-to-right. I thought you talked about
> right-to-left. Interesting!
Below is one variant that seems to pass tests. I haven't done any
benchmarks yet.

All the SWAP_MASK calls look a bit expensive; they should be 4
instructions each. I'm not sure there's any way to do the swap in
assembly that's much cheaper; the obvious alternative would be one mov
and two cmov (a C sketch of that shape is at the end of this message).

When doing this, I realized one could do a kind of SIMD optimization
for the double-precision loop (not yet tried). In the double-precision
loop, I think the matrix elements u00, u01, u10, u11 all fit in half a
limb each. So one could let u00, u10 share one limb, say u0010, using
the low and high halves, and let u01, u11 share a single limb u0111.
Then

  u01 += q * u00;
  u11 += q * u10;

could be replaced with the single multiply

  u0111 += q * u0010;

(there's a small standalone sketch of this at the end of this
message). And for the binary version below, we'd then need only three
SWAP_MASK per iteration, in both the double-limb and the single-limb
loops.

Regards,
/Niels

#define SWAP_MASK(mask, x, y) do {			\
    mp_limb_t swap_xor = ((x) ^ (y)) & mask;		\
    (x) ^= swap_xor;					\
    (y) ^= swap_xor;					\
  } while (0)

int
mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
	   struct hgcd_matrix1 *M)
{
  mp_limb_t u00, u01, u10, u11, swapped;

  if (ah < 2 || bh < 2)
    return 0;

  if (ah > bh || (ah == bh && al > bl))
    {
      sub_ddmmss (ah, al, ah, al, bh, bl);
      if (ah < 2)
	return 0;

      u00 = u01 = u11 = 1;
      u10 = 0;
    }
  else
    {
      sub_ddmmss (bh, bl, bh, bl, ah, al);
      if (bh < 2)
	return 0;

      u00 = u10 = u11 = 1;
      u01 = 0;
    }

  swapped = 0;

  /* Double-precision loop */
  for (;;)
    {
      mp_limb_t mask, dh, dl;
      int acnt, bcnt, shift;

      if (ah == bh)
	goto done;

      /* Keep A as the larger number; mask is all ones if a swap is
	 needed, and swapped records the parity of the swaps. */
      mask = -(mp_limb_t) (ah < bh);
      swapped ^= mask;
      SWAP_MASK (mask, ah, bh);
      SWAP_MASK (mask, al, bl);
      SWAP_MASK (mask, u00, u01);
      SWAP_MASK (mask, u10, u11);

      ASSERT_ALWAYS (ah > bh);

      if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
	{
	  /* Drop to single precision, keeping the most significant
	     bits of both numbers. */
	  ah = (ah << (GMP_LIMB_BITS / 2)) + (al >> (GMP_LIMB_BITS / 2));
	  bh = (bh << (GMP_LIMB_BITS / 2)) + (bl >> (GMP_LIMB_BITS / 2));
	  break;
	}

      count_leading_zeros (acnt, ah);
      count_leading_zeros (bcnt, bh);
      shift = bcnt - acnt;
      ASSERT_ALWAYS (shift >= 0);

      /* Avoid subtraction underflow */
      shift -= (shift > 0);

      dh = (bh << shift) | ((bl >> 1) >> (GMP_LIMB_BITS - 1 - shift));
      dl = bl << shift;

      sub_ddmmss (ah, al, ah, al, dh, dl);
      if (ah < 2)
	{
	  /* A is too small, so decrement q. */
	  mp_limb_t q = ((mp_limb_t) 1 << shift) - 1;
	  u01 += q * u00;
	  u11 += q * u10;

	  goto done;
	}

      u01 += (u00 << shift);
      u11 += (u10 << shift);
    }

  /* Single-precision loop */
  for (;;)
    {
      mp_limb_t mask;
      int acnt, bcnt, shift;

      if (ah == bh)
	break;

      mask = -(mp_limb_t) (ah < bh);
      swapped ^= mask;
      SWAP_MASK (mask, ah, bh);
      SWAP_MASK (mask, u00, u01);
      SWAP_MASK (mask, u10, u11);

      ASSERT_ALWAYS (ah > bh);

      count_leading_zeros (acnt, ah);
      count_leading_zeros (bcnt, bh);
      shift = bcnt - acnt;
      ASSERT_ALWAYS (shift >= 0);

      /* Avoid subtraction underflow */
      shift -= (shift > 0);

      ah -= bh << shift;
      if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
	{
	  /* A is too small, so decrement q. */
	  mp_limb_t q = ((mp_limb_t) 1 << shift) - 1;
	  u01 += q * u00;
	  u11 += q * u10;

	  break;
	}

      u01 += (u00 << shift);
      u11 += (u10 << shift);
    }

 done:
  /* Undo the column swaps if A and B ended up swapped an odd number
     of times. */
  SWAP_MASK (swapped, u00, u01);
  SWAP_MASK (swapped, u10, u11);

  ASSERT_ALWAYS (u00 * u11 - u01 * u10 == 1);

  M->u[0][0] = u00; M->u[0][1] = u01;
  M->u[1][0] = u10; M->u[1][1] = u11;

  return 1;
}

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.
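
Here's a minimal sketch of the shape of the mov-and-two-cmov
alternative mentioned above. The ternaries are the pattern compilers
commonly turn into cmov on x86-64, though that they actually do so is
not guaranteed. Note it takes a 0/1 condition, e.g. (ah < bh), rather
than the all-ones mask that SWAP_MASK uses.

#define SWAP_CMOV(cond, x, y) do {		\
    /* One mov for the temporary... */		\
    mp_limb_t swap_tmp = (x);			\
    /* ...and ideally one cmov per select. */	\
    (x) = (cond) ? (y) : (x);			\
    (y) = (cond) ? swap_tmp : (y);		\
  } while (0)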
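
And a small standalone sketch of the packed matrix update for the
double-precision loop. It's untested in context: it uses unsigned long
in place of mp_limb_t so it compiles on its own, the names u0010 and
u0111 are just the ones from the description above, and it assumes the
updated entries still fit in half a limb, so that no carry crosses the
half-limb boundary.

#include <assert.h>
#include <limits.h>

typedef unsigned long limb;
#define LIMB_BITS (CHAR_BIT * sizeof (limb))
#define HALF_BITS (LIMB_BITS / 2)
#define LOW_HALF(x) ((x) & (((limb) 1 << HALF_BITS) - 1))

int
main (void)
{
  limb u00 = 5, u01 = 3, u10 = 7, u11 = 4;
  limb q = 6;

  /* Pack u10:u00 and u11:u01, high:low halves. */
  limb u0010 = (u10 << HALF_BITS) | u00;
  limb u0111 = (u11 << HALF_BITS) | u01;

  /* The single multiply replacing u01 += q * u00; u11 += q * u10; */
  u0111 += q * u0010;

  /* Unpack and compare with the two separate updates. */
  assert (LOW_HALF (u0111) == u01 + q * u00);
  assert ((u0111 >> HALF_BITS) == u11 + q * u10);

  return 0;
}

In the loop itself, u0010 and u0111 would stay packed throughout, so a
single SWAP_MASK (mask, u0010, u0111) would replace the two matrix
swaps, which is where the count of three SWAP_MASK per iteration comes
from.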