Re: [13/13] [AArch64] Use vec_perm_indices helper routines

James Greenhalgh Tue, 09 Jan 2018 04:18:07 -0800

On Thu, Jan 04, 2018 at 11:27:56AM +0000, Richard Sandiford wrote:
> Ping**2


This is OK.

It took me a while to get the hang of the interface - a worked example
in the comment in vec-perm-indices.c would probably have been helpful.
It took until your code for REV for this to really make sense to me; so
perhaps that make for a good example.

James

> 
> Richard Sandiford <richard.sandif...@linaro.org> writes:
> > Ping
> >
> > Richard Sandiford <richard.sandif...@linaro.org> writes:
> >> This patch makes the AArch64 vec_perm_const code use the new
> >> vec_perm_indices routines, instead of checking each element individually.
> >> This means that they extend naturally to variable-length vectors.
> >>
> >> Also, aarch64_evpc_dup was the only function that generated rtl when
> >> testing_p is true, and that looked accidental.  The patch adds the
> >> missing check and then replaces the gen_rtx_REG/start_sequence/
> >> end_sequence stuff with an assert that no rtl is generated.
> >>
> >> Tested on aarch64-linux-gnu.  Also tested by making sure that there
> >> were no assembly output differences for aarch64_be-linux-gnu or
> >> aarch64_be-linux-gnu.  OK to install?
> >>
> >> Richard
> >>
> >>
> >> 2017-12-09  Richard Sandiford  <richard.sandif...@linaro.org>
> >>
> >> gcc/
> >>    * config/aarch64/aarch64.c (aarch64_evpc_trn): Use d.perm.series_p
> >>    instead of checking each element individually.
> >>    (aarch64_evpc_uzp): Likewise.
> >>    (aarch64_evpc_zip): Likewise.
> >>    (aarch64_evpc_ext): Likewise.
> >>    (aarch64_evpc_rev): Likewise.
> >>    (aarch64_evpc_dup): Test the encoding for a single duplicated element,
> >>    instead of checking each element individually.  Return true without
> >>    generating rtl if
> >>    (aarch64_vectorize_vec_perm_const): Use all_from_input_p to test
> >>    whether all selected elements come from the same input, instead of
> >>    checking each element individually.  Remove calls to gen_rtx_REG,
> >>    start_sequence and end_sequence and instead assert that no rtl is
> >>    generated.
> >>
> >> Index: gcc/config/aarch64/aarch64.c
> >> ===================================================================
> >> --- gcc/config/aarch64/aarch64.c   2017-12-09 22:48:47.535824832 +0000
> >> +++ gcc/config/aarch64/aarch64.c   2017-12-09 22:49:00.139270410 +0000
> >> @@ -13295,7 +13295,7 @@ aarch64_expand_vec_perm (rtx target, rtx
> >>  static bool
> >>  aarch64_evpc_trn (struct expand_vec_perm_d *d)
> >>  {
> >> -  unsigned int i, odd, mask, nelt = d->perm.length ();
> >> +  unsigned int odd, nelt = d->perm.length ();
> >>    rtx out, in0, in1, x;
> >>    machine_mode vmode = d->vmode;
> >>  
> >> @@ -13304,21 +13304,11 @@ aarch64_evpc_trn (struct expand_vec_perm
> >>  
> >>    /* Note that these are little-endian tests.
> >>       We correct for big-endian later.  */
> >> -  if (d->perm[0] == 0)
> >> -    odd = 0;
> >> -  else if (d->perm[0] == 1)
> >> -    odd = 1;
> >> -  else
> >> +  odd = d->perm[0];
> >> +  if ((odd != 0 && odd != 1)
> >> +      || !d->perm.series_p (0, 2, odd, 2)
> >> +      || !d->perm.series_p (1, 2, nelt + odd, 2))
> >>      return false;
> >> -  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
> >> -
> >> -  for (i = 0; i < nelt; i += 2)
> >> -    {
> >> -      if (d->perm[i] != i + odd)
> >> -  return false;
> >> -      if (d->perm[i + 1] != ((i + nelt + odd) & mask))
> >> -  return false;
> >> -    }
> >>  
> >>    /* Success!  */
> >>    if (d->testing_p)
> >> @@ -13342,7 +13332,7 @@ aarch64_evpc_trn (struct expand_vec_perm
> >>  static bool
> >>  aarch64_evpc_uzp (struct expand_vec_perm_d *d)
> >>  {
> >> -  unsigned int i, odd, mask, nelt = d->perm.length ();
> >> +  unsigned int odd;
> >>    rtx out, in0, in1, x;
> >>    machine_mode vmode = d->vmode;
> >>  
> >> @@ -13351,20 +13341,10 @@ aarch64_evpc_uzp (struct expand_vec_perm
> >>  
> >>    /* Note that these are little-endian tests.
> >>       We correct for big-endian later.  */
> >> -  if (d->perm[0] == 0)
> >> -    odd = 0;
> >> -  else if (d->perm[0] == 1)
> >> -    odd = 1;
> >> -  else
> >> +  odd = d->perm[0];
> >> +  if ((odd != 0 && odd != 1)
> >> +      || !d->perm.series_p (0, 1, odd, 2))
> >>      return false;
> >> -  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
> >> -
> >> -  for (i = 0; i < nelt; i++)
> >> -    {
> >> -      unsigned elt = (i * 2 + odd) & mask;
> >> -      if (d->perm[i] != elt)
> >> -  return false;
> >> -    }
> >>  
> >>    /* Success!  */
> >>    if (d->testing_p)
> >> @@ -13388,7 +13368,7 @@ aarch64_evpc_uzp (struct expand_vec_perm
> >>  static bool
> >>  aarch64_evpc_zip (struct expand_vec_perm_d *d)
> >>  {
> >> -  unsigned int i, high, mask, nelt = d->perm.length ();
> >> +  unsigned int high, nelt = d->perm.length ();
> >>    rtx out, in0, in1, x;
> >>    machine_mode vmode = d->vmode;
> >>  
> >> @@ -13397,25 +13377,11 @@ aarch64_evpc_zip (struct expand_vec_perm
> >>  
> >>    /* Note that these are little-endian tests.
> >>       We correct for big-endian later.  */
> >> -  high = nelt / 2;
> >> -  if (d->perm[0] == high)
> >> -    /* Do Nothing.  */
> >> -    ;
> >> -  else if (d->perm[0] == 0)
> >> -    high = 0;
> >> -  else
> >> +  high = d->perm[0];
> >> +  if ((high != 0 && high * 2 != nelt)
> >> +      || !d->perm.series_p (0, 2, high, 1)
> >> +      || !d->perm.series_p (1, 2, high + nelt, 1))
> >>      return false;
> >> -  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
> >> -
> >> -  for (i = 0; i < nelt / 2; i++)
> >> -    {
> >> -      unsigned elt = (i + high) & mask;
> >> -      if (d->perm[i * 2] != elt)
> >> -  return false;
> >> -      elt = (elt + nelt) & mask;
> >> -      if (d->perm[i * 2 + 1] != elt)
> >> -  return false;
> >> -    }
> >>  
> >>    /* Success!  */
> >>    if (d->testing_p)
> >> @@ -13440,23 +13406,14 @@ aarch64_evpc_zip (struct expand_vec_perm
> >>  static bool
> >>  aarch64_evpc_ext (struct expand_vec_perm_d *d)
> >>  {
> >> -  unsigned int i, nelt = d->perm.length ();
> >> +  unsigned int nelt = d->perm.length ();
> >>    rtx offset;
> >>  
> >>    unsigned int location = d->perm[0]; /* Always < nelt.  */
> >>  
> >>    /* Check if the extracted indices are increasing by one.  */
> >> -  for (i = 1; i < nelt; i++)
> >> -    {
> >> -      unsigned int required = location + i;
> >> -      if (d->one_vector_p)
> >> -        {
> >> -          /* We'll pass the same vector in twice, so allow indices to 
> >> wrap.  */
> >> -    required &= (nelt - 1);
> >> -  }
> >> -      if (d->perm[i] != required)
> >> -        return false;
> >> -    }
> >> +  if (!d->perm.series_p (0, 1, location, 1))
> >> +    return false;
> >>  
> >>    /* Success! */
> >>    if (d->testing_p)
> >> @@ -13488,7 +13445,7 @@ aarch64_evpc_ext (struct expand_vec_perm
> >>  static bool
> >>  aarch64_evpc_rev (struct expand_vec_perm_d *d)
> >>  {
> >> -  unsigned int i, j, diff, size, unspec, nelt = d->perm.length ();
> >> +  unsigned int i, diff, size, unspec;
> >>  
> >>    if (!d->one_vector_p)
> >>      return false;
> >> @@ -13504,18 +13461,10 @@ aarch64_evpc_rev (struct expand_vec_perm
> >>    else
> >>      return false;
> >>  
> >> -  for (i = 0; i < nelt ; i += diff + 1)
> >> -    for (j = 0; j <= diff; j += 1)
> >> -      {
> >> -  /* This is guaranteed to be true as the value of diff
> >> -     is 7, 3, 1 and we should have enough elements in the
> >> -     queue to generate this.  Getting a vector mask with a
> >> -     value of diff other than these values implies that
> >> -     something is wrong by the time we get here.  */
> >> -  gcc_assert (i + j < nelt);
> >> -  if (d->perm[i + j] != i + diff - j)
> >> -    return false;
> >> -      }
> >> +  unsigned int step = diff + 1;
> >> +  for (i = 0; i < step; ++i)
> >> +    if (!d->perm.series_p (i, step, diff - i, step))
> >> +      return false;
> >>  
> >>    /* Success! */
> >>    if (d->testing_p)
> >> @@ -13532,15 +13481,17 @@ aarch64_evpc_dup (struct expand_vec_perm
> >>    rtx out = d->target;
> >>    rtx in0;
> >>    machine_mode vmode = d->vmode;
> >> -  unsigned int i, elt, nelt = d->perm.length ();
> >> +  unsigned int elt;
> >>    rtx lane;
> >>  
> >> +  if (d->perm.encoding ().encoded_nelts () != 1)
> >> +    return false;
> >> +
> >> +  /* Success! */
> >> +  if (d->testing_p)
> >> +    return true;
> >> +
> >>    elt = d->perm[0];
> >> -  for (i = 1; i < nelt; i++)
> >> -    {
> >> -      if (elt != d->perm[i])
> >> -  return false;
> >> -    }
> >>  
> >>    /* The generic preparation in aarch64_expand_vec_perm_const_1
> >>       swaps the operand order and the permute indices if it finds
> >> @@ -13628,61 +13579,37 @@ aarch64_vectorize_vec_perm_const (machin
> >>                              rtx op1, const vec_perm_indices &sel)
> >>  {
> >>    struct expand_vec_perm_d d;
> >> -  unsigned int i, which;
> >>  
> >> -  d.vmode = vmode;
> >> -  d.target = target;
> >> -  d.op0 = op0;
> >> -  d.op1 = op1;
> >> -  d.testing_p = !target;
> >> -
> >> -  /* Calculate whether all elements are in one vector.  */
> >> -  unsigned int nelt = sel.length ();
> >> -  for (i = which = 0; i < nelt; ++i)
> >> +  /* Check whether the mask can be applied to a single vector.  */
> >> +  if (op0 && rtx_equal_p (op0, op1))
> >> +    d.one_vector_p = true;
> >> +  else if (sel.all_from_input_p (0))
> >>      {
> >> -      unsigned int ei = sel[i] & (2 * nelt - 1);
> >> -      which |= (ei < nelt ? 1 : 2);
> >> +      d.one_vector_p = true;
> >> +      op1 = op0;
> >>      }
> >> -
> >> -  switch (which)
> >> +  else if (sel.all_from_input_p (1))
> >>      {
> >> -    default:
> >> -      gcc_unreachable ();
> >> -
> >> -    case 3:
> >> -      d.one_vector_p = false;
> >> -      if (d.testing_p || !rtx_equal_p (op0, op1))
> >> -  break;
> >> -
> >> -      /* The elements of PERM do not suggest that only the first operand
> >> -   is used, but both operands are identical.  Allow easier matching
> >> -   of the permutation by folding the permutation into the single
> >> -   input vector.  */
> >> -      /* Fall Through.  */
> >> -    case 2:
> >> -      d.op0 = op1;
> >> -      d.one_vector_p = true;
> >> -      break;
> >> -
> >> -    case 1:
> >> -      d.op1 = op0;
> >>        d.one_vector_p = true;
> >> -      break;
> >> +      op0 = op1;
> >>      }
> >> +  else
> >> +    d.one_vector_p = false;
> >>  
> >> -  d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2, nelt);
> >> +  d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
> >> +               sel.nelts_per_input ());
> >> +  d.vmode = vmode;
> >> +  d.target = target;
> >> +  d.op0 = op0;
> >> +  d.op1 = op1;
> >> +  d.testing_p = !target;
> >>  
> >>    if (!d.testing_p)
> >>      return aarch64_expand_vec_perm_const_1 (&d);
> >>  
> >> -  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
> >> -  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
> >> -  if (!d.one_vector_p)
> >> -    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
> >> -
> >> -  start_sequence ();
> >> +  rtx_insn *last = get_last_insn ();
> >>    bool ret = aarch64_expand_vec_perm_const_1 (&d);
> >> -  end_sequence ();
> >> +  gcc_assert (last == get_last_insn ());
> >>  
> >>    return ret;
> >>  }

Re: [13/13] [AArch64] Use vec_perm_indices helper routines

Reply via email to