Richard Biener <rguent...@suse.de> writes:
> The following teaches VN to handle reads from .MASK_STORE and
> .LEN_STORE.  For this, push_partial_def is first extended so that
> the caller no longer has to handle the full-def case itself
> (possibly other paths can be simplified then).  Also, the stored
> value of a partial definition can now have an offset applied, so we
> don't have to build a fake RHS when we register the pieces of an
> existing store.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu; Kewen is going
> to test on powerpc.
>
> I'm not sure whether it's worth (or easily possible) to handle
> .MASK_STORE_LANES.  I think handling the constant def case might be
> possible, but since it has an intrinsic permute it might make more
> sense to rewrite the constant def case into a .MASK_STORE?  (Does
> the mask apply to the destination memory order or the source lane
> order?)

There's one mask bit for each structure, rather than one for each
element.  So converting a constant .MASK_STORE_LANES to a constant
.MASK_STORE would require some massaging of the predicate.
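
Roughly this kind of replication would be needed (a sketch in plain C
with made-up names, purely to illustrate the massaging; it's not an
existing GCC helper):

  /* .MASK_STORE_LANES has one mask bit per structure of GROUP_SIZE
     elements while .MASK_STORE wants one bit per element, so each
     bit has to be repeated GROUP_SIZE times.  */
  static void
  lanes_mask_to_elem_mask (const unsigned char *lanes_mask,
                           unsigned nstructs, unsigned group_size,
                           unsigned char *elem_mask)
  {
    for (unsigned i = 0; i < nstructs; ++i)
      for (unsigned j = 0; j < group_size; ++j)
        elem_mask[i * group_size + j] = lanes_mask[i];
  }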

Thanks,
Richard

>
>       PR tree-optimization/106365
>       * tree-ssa-sccvn.cc (pd_data::rhs_off): New field determining
>       the offset to start encoding of RHS from.
>       (vn_walk_cb_data::vn_walk_cb_data): Initialize it.
>       (vn_walk_cb_data::push_partial_def): Allow the first partial
>       definition to be fully providing the def.  Offset RHS
>       before encoding if requested.
>       (vn_reference_lookup_3): Initialize def_rhs everywhere.
>       Add support for .MASK_STORE and .LEN_STORE (partial) definitions.
>
>       * gcc.target/i386/vec-maskstore-vn.c: New testcase.
> ---
>  .../gcc.target/i386/vec-maskstore-vn.c        |  30 +++
>  gcc/tree-ssa-sccvn.cc                         | 255 ++++++++++++++----
>  2 files changed, 228 insertions(+), 57 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/vec-maskstore-vn.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/vec-maskstore-vn.c b/gcc/testsuite/gcc.target/i386/vec-maskstore-vn.c
> new file mode 100644
> index 00000000000..98213905ece
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vec-maskstore-vn.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -fdump-tree-fre5" } */
> +
> +void __attribute__((noinline,noclone))
> +foo (int *out, int *res)
> +{
> +  int mask[] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 };
> +  int i;
> +  for (i = 0; i < 16; ++i)
> +    {
> +      if (mask[i])
> +        out[i] = i;
> +    }
> +  int o0 = out[0];
> +  int o7 = out[7];
> +  int o14 = out[14];
> +  int o15 = out[15];
> +  res[0] = o0;
> +  res[2] = o7;
> +  res[4] = o14;
> +  res[6] = o15;
> +}
> +
> +/* Vectorization produces .MASK_STORE, unrolling will unroll the two
> +   vector iterations.  FRE5 after that should be able to CSE
> +   out[7] and out[15], but leave out[0] and out[14] alone.  */
> +/* { dg-final { scan-tree-dump " = o0_\[0-9\]+;" "fre5" } } */
> +/* { dg-final { scan-tree-dump " = 7;" "fre5" } } */
> +/* { dg-final { scan-tree-dump " = o14_\[0-9\]+;" "fre5" } } */
> +/* { dg-final { scan-tree-dump " = 15;" "fre5" } } */
> diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
> index f41d5031365..7d947b55a27 100644
> --- a/gcc/tree-ssa-sccvn.cc
> +++ b/gcc/tree-ssa-sccvn.cc
> @@ -1790,6 +1790,7 @@ struct pd_range
>  struct pd_data
>  {
>    tree rhs;
> +  HOST_WIDE_INT rhs_off;
>    HOST_WIDE_INT offset;
>    HOST_WIDE_INT size;
>  };
> @@ -1816,6 +1817,7 @@ struct vn_walk_cb_data
>       unsigned int pos = 0, prec = w.get_precision ();
>       pd_data pd;
>       pd.rhs = build_constructor (NULL_TREE, NULL);
> +     pd.rhs_off = 0;
>       /* When bitwise and with a constant is done on a memory load,
>          we don't really need all the bits to be defined or defined
>          to constants, we don't really care what is in the position
> @@ -1976,6 +1978,7 @@ vn_walk_cb_data::push_partial_def (pd_data pd,
>  
>    bool pd_constant_p = (TREE_CODE (pd.rhs) == CONSTRUCTOR
>                       || CONSTANT_CLASS_P (pd.rhs));
> +  pd_range *r;
>    if (partial_defs.is_empty ())
>      {
>        /* If we get a clobber upfront, fail.  */
> @@ -1989,65 +1992,70 @@ vn_walk_cb_data::push_partial_def (pd_data pd,
>        first_set = set;
>        first_base_set = base_set;
>        last_vuse_ptr = NULL;
> -      /* Continue looking for partial defs.  */
> -      return NULL;
> -    }
> -
> -  if (!known_ranges)
> -    {
> -      /* ???  Optimize the case where the 2nd partial def completes things.  */
> -      gcc_obstack_init (&ranges_obstack);
> -      known_ranges = splay_tree_new_with_allocator (pd_range_compare, 0, 0,
> -                                                 pd_tree_alloc,
> -                                                 pd_tree_dealloc, this);
> -      splay_tree_insert (known_ranges,
> -                      (splay_tree_key)&first_range.offset,
> -                      (splay_tree_value)&first_range);
> -    }
> -
> -  pd_range newr = { pd.offset, pd.size };
> -  splay_tree_node n;
> -  pd_range *r;
> -  /* Lookup the predecessor of offset + 1 and see if we need to merge.  */
> -  HOST_WIDE_INT loffset = newr.offset + 1;
> -  if ((n = splay_tree_predecessor (known_ranges, (splay_tree_key)&loffset))
> -      && ((r = (pd_range *)n->value), true)
> -      && ranges_known_overlap_p (r->offset, r->size + 1,
> -                              newr.offset, newr.size))
> -    {
> -      /* Ignore partial defs already covered.  Here we also drop shadowed
> -         clobbers arriving here at the floor.  */
> -      if (known_subrange_p (newr.offset, newr.size, r->offset, r->size))
> -     return NULL;
> -      r->size = MAX (r->offset + r->size, newr.offset + newr.size) - r->offset;
> +      r = &first_range;
> +      /* Go check if the first partial definition was a full one in case
> +      the caller didn't optimize for this.  */
>      }
>    else
>      {
> -      /* newr.offset wasn't covered yet, insert the range.  */
> -      r = XOBNEW (&ranges_obstack, pd_range);
> -      *r = newr;
> -      splay_tree_insert (known_ranges, (splay_tree_key)&r->offset,
> -                      (splay_tree_value)r);
> -    }
> -  /* Merge r which now contains newr and is a member of the splay tree with
> -     adjacent overlapping ranges.  */
> -  pd_range *rafter;
> -  while ((n = splay_tree_successor (known_ranges, (splay_tree_key)&r->offset))
> -      && ((rafter = (pd_range *)n->value), true)
> -      && ranges_known_overlap_p (r->offset, r->size + 1,
> -                                 rafter->offset, rafter->size))
> -    {
> -      r->size = MAX (r->offset + r->size,
> -                  rafter->offset + rafter->size) - r->offset;
> -      splay_tree_remove (known_ranges, (splay_tree_key)&rafter->offset);
> -    }
> -  /* If we get a clobber, fail.  */
> -  if (TREE_CLOBBER_P (pd.rhs))
> -    return (void *)-1;
> -  /* Non-constants are OK as long as they are shadowed by a constant.  */
> -  if (!pd_constant_p)
> -    return (void *)-1;
> -  partial_defs.safe_push (pd);
> +      if (!known_ranges)
> +     {
> +       /* ???  Optimize the case where the 2nd partial def completes
> +          things.  */
> +       gcc_obstack_init (&ranges_obstack);
> +       known_ranges = splay_tree_new_with_allocator (pd_range_compare, 0, 0,
> +                                                     pd_tree_alloc,
> +                                                     pd_tree_dealloc, this);
> +       splay_tree_insert (known_ranges,
> +                          (splay_tree_key)&first_range.offset,
> +                          (splay_tree_value)&first_range);
> +     }
> +
> +      pd_range newr = { pd.offset, pd.size };
> +      splay_tree_node n;
> +      /* Lookup the predecessor of offset + 1 and see if we need to merge.  */
> +      HOST_WIDE_INT loffset = newr.offset + 1;
> +      if ((n = splay_tree_predecessor (known_ranges, (splay_tree_key)&loffset))
> +       && ((r = (pd_range *)n->value), true)
> +       && ranges_known_overlap_p (r->offset, r->size + 1,
> +                                  newr.offset, newr.size))
> +     {
> +       /* Ignore partial defs already covered.  Here we also drop shadowed
> +          clobbers arriving here at the floor.  */
> +       if (known_subrange_p (newr.offset, newr.size, r->offset, r->size))
> +         return NULL;
> +       r->size
> +         = MAX (r->offset + r->size, newr.offset + newr.size) - r->offset;
> +     }
> +      else
> +     {
> +       /* newr.offset wasn't covered yet, insert the range.  */
> +       r = XOBNEW (&ranges_obstack, pd_range);
> +       *r = newr;
> +       splay_tree_insert (known_ranges, (splay_tree_key)&r->offset,
> +                          (splay_tree_value)r);
> +     }
> +      /* Merge r which now contains newr and is a member of the splay tree with
> +      adjacent overlapping ranges.  */
> +      pd_range *rafter;
> +      while ((n = splay_tree_successor (known_ranges,
> +                                     (splay_tree_key)&r->offset))
> +          && ((rafter = (pd_range *)n->value), true)
> +          && ranges_known_overlap_p (r->offset, r->size + 1,
> +                                     rafter->offset, rafter->size))
> +     {
> +       r->size = MAX (r->offset + r->size,
> +                      rafter->offset + rafter->size) - r->offset;
> +       splay_tree_remove (known_ranges, (splay_tree_key)&rafter->offset);
> +     }
> +      /* If we get a clobber, fail.  */
> +      if (TREE_CLOBBER_P (pd.rhs))
> +     return (void *)-1;
> +      /* Non-constants are OK as long as they are shadowed by a constant.  */
> +      if (!pd_constant_p)
> +     return (void *)-1;
> +      partial_defs.safe_push (pd);
> +    }
>  
>    /* Now we have merged newr into the range tree.  When we have covered
>       [offseti, sizei] then the tree will contain exactly one node which has
> @@ -2081,7 +2089,8 @@ vn_walk_cb_data::push_partial_def (pd_data pd,
>        else
>       {
>         len = native_encode_expr (pd.rhs, this_buffer, bufsize,
> -                                 MAX (0, -pd.offset) / BITS_PER_UNIT);
> +                                 (MAX (0, -pd.offset)
> +                                  + pd.rhs_off) / BITS_PER_UNIT);
>         if (len <= 0
>             || len < (ROUND_UP (pd.size, BITS_PER_UNIT) / BITS_PER_UNIT
>                       - MAX (0, -pd.offset) / BITS_PER_UNIT))
> @@ -2906,6 +2915,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>       {
>         pd_data pd;
>         pd.rhs = build_constructor (NULL_TREE, NULL);
> +       pd.rhs_off = 0;
>         pd.offset = offset2i;
>         pd.size = leni << LOG2_BITS_PER_UNIT;
>         return data->push_partial_def (pd, 0, 0, offseti, maxsizei);
> @@ -2955,6 +2965,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>                by a later def.  */
>             pd_data pd;
>             pd.rhs = gimple_assign_rhs1 (def_stmt);
> +           pd.rhs_off = 0;
>             pd.offset = offset2i;
>             pd.size = size2i;
>             return data->push_partial_def (pd, ao_ref_alias_set (&lhs_ref),
> @@ -3107,6 +3118,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>             if (TREE_CODE (rhs) == SSA_NAME)
>               rhs = SSA_VAL (rhs);
>             pd.rhs = rhs;
> +           pd.rhs_off = 0;
>             pd.offset = offset2i;
>             pd.size = size2i;
>             return data->push_partial_def (pd, ao_ref_alias_set (&lhs_ref),
> @@ -3186,6 +3198,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>           {
>             pd_data pd;
>             pd.rhs = SSA_VAL (def_rhs);
> +           pd.rhs_off = 0;
>             pd.offset = offset2i;
>             pd.size = size2i;
>             return data->push_partial_def (pd, ao_ref_alias_set (&lhs_ref),
> @@ -3195,6 +3208,133 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>       }
>      }
>  
> +  /* 4b) Assignment done via one of the vectorizer internal store
> +     functions where we may be able to access pieces from or we can
> +     combine to a larger entity.  */
> +  else if (known_eq (ref->size, maxsize)
> +        && is_gimple_reg_type (vr->type)
> +        && !reverse_storage_order_for_component_p (vr->operands)
> +        && !contains_storage_order_barrier_p (vr->operands)
> +        && is_gimple_call (def_stmt)
> +        && gimple_call_internal_p (def_stmt)
> +        && internal_store_fn_p (gimple_call_internal_fn (def_stmt)))
> +    {
> +      gcall *call = as_a <gcall *> (def_stmt);
> +      internal_fn fn = gimple_call_internal_fn (call);
> +      tree def_rhs = gimple_call_arg (call,
> +                                   internal_fn_stored_value_index (fn));
> +      def_rhs = vn_valueize (def_rhs);
> +      if (TREE_CODE (def_rhs) != VECTOR_CST)
> +     return (void *)-1;
> +
> +      tree mask = NULL_TREE, len = NULL_TREE, bias = NULL_TREE;
> +      switch (fn)
> +     {
> +     case IFN_MASK_STORE:
> +       mask = gimple_call_arg (call, internal_fn_mask_index (fn));
> +       mask = vn_valueize (mask);
> +       if (TREE_CODE (mask) != VECTOR_CST)
> +         return (void *)-1;
> +       break;
> +     case IFN_LEN_STORE:
> +       len = gimple_call_arg (call, 2);
> +       bias = gimple_call_arg (call, 4);
> +       if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
> +         return (void *)-1;
> +       break;
> +     default:
> +       return (void *)-1;
> +     }
> +      ao_ref_init_from_ptr_and_size (&lhs_ref,
> +                                  vn_valueize (gimple_call_arg (call, 0)),
> +                                  TYPE_SIZE_UNIT (TREE_TYPE (def_rhs)));
> +      tree base2;
> +      poly_int64 offset2, size2, maxsize2;
> +      HOST_WIDE_INT offset2i, size2i, offseti;
> +      base2 = ao_ref_base (&lhs_ref);
> +      offset2 = lhs_ref.offset;
> +      size2 = lhs_ref.size;
> +      maxsize2 = lhs_ref.max_size;
> +      if (known_size_p (maxsize2)
> +       && known_eq (maxsize2, size2)
> +       && adjust_offsets_for_equal_base_address (base, &offset,
> +                                                 base2, &offset2)
> +       && maxsize.is_constant (&maxsizei)
> +       && offset.is_constant (&offseti)
> +       && offset2.is_constant (&offset2i)
> +       && size2.is_constant (&size2i))
> +     {
> +       if (!ranges_maybe_overlap_p (offset, maxsize, offset2, size2))
> +         /* Poor-mans disambiguation.  */
> +         return NULL;
> +       else if (ranges_known_overlap_p (offset, maxsize, offset2, size2))
> +         {
> +           pd_data pd;
> +           pd.rhs = def_rhs;
> +           tree aa = gimple_call_arg (call, 1);
> +           alias_set_type set = get_deref_alias_set (TREE_TYPE (aa));
> +           tree vectype = TREE_TYPE (def_rhs);
> +           unsigned HOST_WIDE_INT elsz
> +             = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
> +           if (mask)
> +             {
> +               HOST_WIDE_INT start = 0, len = 0;
> +               unsigned mask_idx = 0;
> +               do
> +                 {
> +                   if (integer_zerop (VECTOR_CST_ELT (mask, mask_idx)))
> +                     {
> +                       if (len != 0)
> +                         {
> +                           pd.rhs_off = start;
> +                           pd.offset = offset2i + start;
> +                           pd.size = len;
> +                           if (ranges_known_overlap_p
> +                                 (offset, maxsize, pd.offset, pd.size))
> +                             {
> +                               void *res = data->push_partial_def
> +                                           (pd, set, set, offseti, maxsizei);
> +                               if (res != NULL)
> +                                 return res;
> +                             }
> +                         }
> +                       start = (mask_idx + 1) * elsz;
> +                       len = 0;
> +                     }
> +                   else
> +                     len += elsz;
> +                   mask_idx++;
> +                 }
> +               while (known_lt (mask_idx, TYPE_VECTOR_SUBPARTS (vectype)));
> +               if (len != 0)
> +                 {
> +                   pd.rhs_off = start;
> +                   pd.offset = offset2i + start;
> +                   pd.size = len;
> +                   if (ranges_known_overlap_p (offset, maxsize,
> +                                               pd.offset, pd.size))
> +                     return data->push_partial_def (pd, set, set,
> +                                                    offseti, maxsizei);
> +                 }
> +             }
> +           else if (fn == IFN_LEN_STORE)
> +             {
> +               pd.rhs_off = 0;
> +               pd.offset = offset2i;
> +               pd.size = (tree_to_uhwi (len)
> +                          + -tree_to_shwi (bias)) * BITS_PER_UNIT;
> +               if (ranges_known_overlap_p (offset, maxsize,
> +                                           pd.offset, pd.size))
> +                 return data->push_partial_def (pd, set, set,
> +                                                offseti, maxsizei);
> +             }
> +           else
> +             gcc_unreachable ();
> +           return NULL;
> +         }
> +     }
> +    }
> +
>    /* 5) For aggregate copies translate the reference through them if
>       the copy kills ref.  */
>    else if (data->vn_walk_kind == VN_WALKREWRITE
> @@ -3327,6 +3467,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
>           {
>             pd_data pd;
>             pd.rhs = val;
> +           pd.rhs_off = 0;
>             pd.offset = 0;
>             pd.size = maxsizei;
>             return data->push_partial_def (pd, ao_ref_alias_set (&lhs_ref),

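To make the new rhs_off field concrete, here is roughly what the 4b)
case in the quoted patch records for the vec-maskstore-vn.c testcase
(shown schematically for one V8SI .MASK_STORE; values and offsets are
illustrative, not verbatim dump output):

  mask  = { 0, -1, 0, -1, 0, -1, 0, -1 }   /* VECTOR_CST predicate */
  value = { 0,  1, 2,  3, 4,  5, 6,  7 }   /* VECTOR_CST stored value */

  /* Each maximal run of set mask elements becomes one partial def
     that reuses a slice of the stored vector (all offsets in bits):
       lane 1: pd.rhs = value, pd.rhs_off = 32, pd.offset = offset2i + 32, pd.size = 32
       lane 3: pd.rhs = value, pd.rhs_off = 96, pd.offset = offset2i + 96, pd.size = 32
       ...  */

push_partial_def then has native_encode_expr start encoding pd.rhs at
rhs_off bits into the vector, so no fake RHS needs to be built.  That
is what lets FRE5 CSE out[7] and out[15] to 7 and 15 in the testcase
while leaving the masked-off out[0] and out[14] alone.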