On 04/25/12 06:45, Richard Guenther wrote:
On Tue, Apr 24, 2012 at 7:43 PM, Aldy Hernandez <al...@redhat.com> wrote:
On 04/13/12 03:46, Richard Guenther wrote:

On Fri, Apr 13, 2012 at 12:11 AM, Aldy Hernandez <al...@redhat.com> wrote:

+  /* ?? Perhaps we should cache this somewhere in the BB, or are
+     multiple levels of empty BB's common. ??  */
+  FOR_EACH_EDGE (e, ei, bb->preds)
+    {
+      int res = bb_in_transaction_1 (e->src);
+      if (res != -1)
+       return (bool) res;
+    }
+  return false;

that's weird code - if the predecessors are not all in (or all not in) a
transaction, you return an arbitrary value.  So it seems this is somewhat
fragile.

If bb status can be derived from looking at a single stmt, why is the
transaction-ness not recorded as a basic-block flag then?  That is,
why do we have a bit in gimple stmts?

How about we get rid of the GIMPLE bit altogether and keep it in the BB flags like you mention? See patch.
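
With the flag in place, the per-statement query collapses to a bit test on the statement's basic block; see the gimple.h hunk at the end of the patch:

  static inline bool
  gimple_in_transaction (gimple stmt)
  {
    return gimple_bb (stmt)->flags & BB_IN_TRANSACTION;
  }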

+  bool single_exit_p = single_pred_p (ex->dest);

that's a strange variable name ;)

How about loop_has_only_one_exit? :)


+/* Helper function for execute_sm.  On every path that sets REF, set
+   an appropriate flag indicating the store.  */
+
+static tree
+execute_sm_if_changed_flag_set (struct loop *loop, mem_ref_p ref)
+{
...
+  FOR_EACH_VEC_ELT (mem_ref_loc_p, locs, i, loc)
+    {
+      gimple_stmt_iterator gsi;
+      gimple stmt;
+
+      gsi = gsi_start_bb (gimple_bb (loc->stmt));
+      for (; !gsi_end_p (gsi); gsi_next (&gsi))
+       if (gsi_stmt (gsi) == loc->stmt)

does not seem to do it on every path but on every REF setter.  And instead
of the innermost loop just use gsi_for_stmt (loc->stmt).

Updated comment.  Fixed.
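
Each REF setter is now handled with a direct lookup instead of scanning the block (from the execute_sm_if_changed_flag_set hunk below):

      gsi = gsi_for_stmt (loc->stmt);
      stmt = gimple_build_assign (flag, boolean_true_node);
      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);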


+  /* Emit the load code into the latch, so that we are sure it will
+     be processed after all dependencies.  */
+  latch_edge = loop_latch_edge (loop);
+  load = gimple_build_assign (mem_ssa, unshare_expr (ref->mem));
    lim_data = init_lim_data (load);
    lim_data->max_loop = loop;
    lim_data->tgt_loop = loop;
+  gsi_insert_on_edge (latch_edge, load);
+  load = gimple_build_assign (tmp_var, mem_ssa);
+  lim_data = init_lim_data (load);
+  lim_data->max_loop = loop;
+  lim_data->tgt_loop = loop;
+  gsi_insert_on_edge (latch_edge, load);

the 2nd assign is no longer a "load", so I suppose you don't need any lim_data
for it?

Re-did all this.  See patch.


+      else if (store == STORE_IF_CHANGED_FLAG)
+       execute_sm_if_changed (ex, ref->mem, mem_ssa, tmp_var,
+                              store_flag);

and this can pass NULL instead of mem_ssa?

Got rid of mem_ssa altogether, as well as the if-changed approach, which proved inferior to the flag-based approach.

Overall this looks reasonable.  With also handling trapping things I meant
that we should be able to apply store-motion to

int a[256];
void foo (int *p)
{
   int i;
   for (i = 0; i < 256; i++)
     if (flag)
       *p = a[i];
}

but we cannot, because the transform

   lsm = *p;
   for (i = 0; i < 256; ++i)
     if (flag)
       lsm = a[i];
   *p = lsm;

leaves the load unconditional even when the store is properly conditional
(and you do not change that).  The unconditional load may trap here.  What I
meant to say is that, with the flag-based approach, we do not need the load at
all unless there is also a load inside the loop.  If there is a load inside
the loop we'd need to perform the load there (or not handle this case).

Ahh! Yes, this sounds very profitable. Would you mind me doing this in a separate patch? I am already handling loads in this incarnation, so this should be straightforward.
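
To make sure we mean the same thing, the flag-based version of your example with no load would look roughly like this (a sketch; lsm and lsm_flag stand for the temporaries the patch generates):

    lsm_flag = false;
    for (i = 0; i < 256; ++i)
      if (flag)
        {
          lsm = a[i];
          lsm_flag = true;
        }
    if (lsm_flag)
      *p = lsm;   /* Only reached if the loop actually stored; *p is never loaded.  */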

Speaking of loads, I am keeping the read information as an additional bitmap in `memory_accesses'; since ->refs_in_loop is set for stores as well, I couldn't depend on it. Let me know if you have another idea.

Mildly tested.  Wadaya think?

Thanks again.
Aldy
        * gimple.h (gimple_set_in_transaction): Remove.
        (gimple_in_transaction): Look in BB instead.
        (gimple_statement_base): Remove in_transaction field.
        * basic-block.h (enum bb_flags): Add BB_IN_TRANSACTION.
        * trans-mem.c (compute_transaction_bits): Place transaction bit
        information into basic blocks.
        * Makefile.in (tree-ssa-loop-im.o): Depend on TARGET_H.
        * doc/tm.texi: Regenerate.
        * tree-ssa-loop-im.c (execute_sm_if_changed): New.
        (execute_sm_if_changed_flag): New.
        (execute_sm_if_changed_flag_set): New.
        (execute_sm): Do not generate data races unless requested.
        * targhooks.c (default_can_compare_bitwise_p): New.
        * targhooks.h (default_can_compare_bitwise_p): Protoize.
        * target.def (can_compare_bitwise_p): New hook.
        * tree-ssa-loop-im.c (execute_sm): Guard store motion with a
        conditional.
        * opts.c (finish_options): Do not allow store or load data races
        with -fgnu-tm.

Index: trans-mem.c
===================================================================
--- trans-mem.c (revision 186108)
+++ trans-mem.c (working copy)
@@ -2449,13 +2449,15 @@ compute_transaction_bits (void)
   struct tm_region *region;
   VEC (basic_block, heap) *queue;
   unsigned int i;
-  gimple_stmt_iterator gsi;
   basic_block bb;
 
   /* ?? Perhaps we need to abstract gate_tm_init further, because we
      certainly don't need it to calculate CDI_DOMINATOR info.  */
   gate_tm_init ();
 
+  FOR_EACH_BB (bb)
+    bb->flags &= ~BB_IN_TRANSACTION;
+
   for (region = all_tm_regions; region; region = region->next)
     {
       queue = get_tm_region_blocks (region->entry_block,
@@ -2464,11 +2466,7 @@ compute_transaction_bits (void)
                                    NULL,
                                    /*stop_at_irr_p=*/true);
       for (i = 0; VEC_iterate (basic_block, queue, i, bb); ++i)
-       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
-         {
-           gimple stmt = gsi_stmt (gsi);
-           gimple_set_in_transaction (stmt, true);
-         }
+       bb->flags |= BB_IN_TRANSACTION;
       VEC_free (basic_block, heap, queue);
     }
 
Index: basic-block.h
===================================================================
--- basic-block.h       (revision 186108)
+++ basic-block.h       (working copy)
@@ -256,7 +256,12 @@ enum bb_flags
      df_set_bb_dirty, but not cleared by df_analyze, so it can be used
      to test whether a block has been modified prior to a df_analyze
      call.  */
-  BB_MODIFIED = 1 << 12
+  BB_MODIFIED = 1 << 12,
+
+  /* Set on blocks that are in a transaction.  This is calculated on
+     demand, and is available after calling
+     compute_transaction_bits().  */
+  BB_IN_TRANSACTION = 1 << 13
 };
 
 /* Dummy flag for convenience in the hot/cold partitioning code.  */
Index: tree-ssa-loop-im.c
===================================================================
--- tree-ssa-loop-im.c  (revision 186108)
+++ tree-ssa-loop-im.c  (working copy)
@@ -52,7 +52,7 @@ along with GCC; see the file COPYING3.
         }
      }
 
-   Where COND and INV are is invariants, but evaluating INV may trap or be
+   Where COND and INV are invariants, but evaluating INV may trap or be
    invalid from some other reason if !COND.  This may be transformed to
 
    if (cond)
@@ -175,6 +175,9 @@ static struct
   /* The set of memory references accessed in each loop.  */
   VEC (bitmap, heap) *refs_in_loop;
 
+  /* The set of memory references read in each loop.  */
+  VEC (bitmap, heap) *reads_in_loop;
+
   /* The set of memory references accessed in each loop, including
      subloops.  */
   VEC (bitmap, heap) *all_refs_in_loop;
@@ -1555,6 +1558,13 @@ record_mem_ref_loc (mem_ref_p ref, struc
 
   VEC_safe_push (mem_ref_loc_p, heap, accs->locs, aref);
   bitmap_set_bit (ril, ref->id);
+  if (gimple_assign_single_p (stmt)
+      && gimple_assign_rhs1 (stmt) == ref->mem)
+    {
+      bitmap reads;
+      reads = VEC_index (bitmap, memory_accesses.reads_in_loop, loop->num);
+      bitmap_set_bit (reads, ref->id);
+    }
 }
 
 /* Marks reference REF as stored in LOOP.  */
@@ -1726,6 +1736,8 @@ analyze_memory_references (void)
   memory_accesses.refs_list = NULL;
   memory_accesses.refs_in_loop = VEC_alloc (bitmap, heap,
                                            number_of_loops ());
+  memory_accesses.reads_in_loop = VEC_alloc (bitmap, heap,
+                                            number_of_loops ());
   memory_accesses.all_refs_in_loop = VEC_alloc (bitmap, heap,
                                                number_of_loops ());
   memory_accesses.all_refs_stored_in_loop = VEC_alloc (bitmap, heap,
@@ -1736,6 +1748,8 @@ analyze_memory_references (void)
       empty = BITMAP_ALLOC (NULL);
       VEC_quick_push (bitmap, memory_accesses.refs_in_loop, empty);
       empty = BITMAP_ALLOC (NULL);
+      VEC_quick_push (bitmap, memory_accesses.reads_in_loop, empty);
+      empty = BITMAP_ALLOC (NULL);
       VEC_quick_push (bitmap, memory_accesses.all_refs_in_loop, empty);
       empty = BITMAP_ALLOC (NULL);
       VEC_quick_push (bitmap, memory_accesses.all_refs_stored_in_loop, empty);
@@ -1956,6 +1970,133 @@ get_lsm_tmp_name (tree ref, unsigned n)
   return lsm_tmp_name;
 }
 
+/* Helper function for execute_sm.  Emit code to store TMP_VAR into
+   MEM along edge EX.
+
+   The store is only done if MEM has changed.  We do this so no
+   changes to MEM occur on code paths that did not originally store
+   into it.
+
+   The common case for execute_sm will transform:
+
+     for (...) {
+       if (foo)
+         stuff;
+       else
+         MEM = TMP_VAR;
+     }
+
+   into:
+
+     lsm = MEM;
+     for (...) {
+       if (foo)
+         stuff;
+       else
+         lsm = TMP_VAR;
+     }
+     MEM = lsm;
+
+  This function will generate:
+
+     // Note: this load can be avoided unless there is a load already
+     //       present in the loop.
+     lsm = MEM;
+
+     flag = false;
+     ...
+     for (...) {
+       if (foo)
+         stuff;
+       else {
+         lsm = TMP_VAR;
+         flag = true;
+       }
+     }
+     if (flag)         <--
+       MEM = lsm;      <--
+*/
+
+static void
+execute_sm_if_changed (edge ex, tree mem, tree tmp_var, tree flag)
+{
+  basic_block new_bb, then_bb, old_dest = ex->dest;
+  bool loop_has_only_one_exit = single_pred_p (ex->dest);
+  edge then_old_edge;
+  gimple_stmt_iterator gsi;
+  gimple stmt;
+
+  if (loop_has_only_one_exit)
+    ex = split_block_after_labels (old_dest);
+
+  old_dest = ex->dest;
+  new_bb = split_edge (ex);
+  then_bb = create_empty_bb (new_bb);
+  if (current_loops && new_bb->loop_father)
+    add_bb_to_loop (then_bb, new_bb->loop_father);
+
+  gsi = gsi_start_bb (new_bb);
+  stmt = gimple_build_cond (NE_EXPR, flag, boolean_false_node,
+                           NULL_TREE, NULL_TREE);
+  gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+  gsi = gsi_start_bb (then_bb);
+  /* Insert actual store.  */
+  stmt = gimple_build_assign (unshare_expr (mem), tmp_var);
+  gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+
+  make_edge (new_bb, then_bb, EDGE_TRUE_VALUE);
+  make_edge (new_bb, old_dest, EDGE_FALSE_VALUE);
+  then_old_edge = make_edge (then_bb, old_dest, EDGE_FALLTHRU);
+  set_immediate_dominator (CDI_DOMINATORS, then_bb, new_bb);
+
+  if (!loop_has_only_one_exit)
+    for (gsi = gsi_start_phis (old_dest); !gsi_end_p (gsi); gsi_next (&gsi))
+      {
+       gimple phi = gsi_stmt (gsi);
+       unsigned i;
+
+       for (i = 0; i < gimple_phi_num_args (phi); i++)
+         if (gimple_phi_arg_edge (phi, i)->src == new_bb)
+           {
+             tree arg = gimple_phi_arg_def (phi, i);
+             add_phi_arg (phi, arg, then_old_edge, UNKNOWN_LOCATION);
+             update_stmt (phi);
+           }
+      }
+  /* Remove the original fall through edge.  This was the
+     single_succ_edge (new_bb).  */
+  EDGE_SUCC (new_bb, 0)->flags &= ~EDGE_FALLTHRU;
+}
+
+/* Helper function for execute_sm.  On every location where REF is
+   set, set an appropriate flag indicating the store.  */
+
+static tree
+execute_sm_if_changed_flag_set (struct loop *loop, mem_ref_p ref)
+{
+  unsigned i;
+  mem_ref_loc_p loc;
+  tree flag;
+  VEC (mem_ref_loc_p, heap) *locs = NULL;
+  char *str = get_lsm_tmp_name (ref->mem, ~0);
+
+  lsm_tmp_name_add ("_flag");
+  flag = make_rename_temp (boolean_type_node, str);
+  get_all_locs_in_loop (loop, ref, &locs);
+  FOR_EACH_VEC_ELT (mem_ref_loc_p, locs, i, loc)
+    {
+      gimple_stmt_iterator gsi;
+      gimple stmt;
+
+      gsi = gsi_for_stmt (loc->stmt);
+      stmt = gimple_build_assign (flag, boolean_true_node);
+      gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+    }
+  VEC_free (mem_ref_loc_p, heap, locs);
+  return flag;
+}
+
 /* Executes store motion of memory reference REF from LOOP.
    Exits from the LOOP are stored in EXITS.  The initialization of the
    temporary variable is put to the preheader of the loop, and assignments
@@ -1964,12 +2105,14 @@ get_lsm_tmp_name (tree ref, unsigned n)
 static void
 execute_sm (struct loop *loop, VEC (edge, heap) *exits, mem_ref_p ref)
 {
-  tree tmp_var;
+  tree tmp_var, store_flag;
   unsigned i;
-  gimple load, store;
+  gimple load;
   struct fmt_data fmt_data;
-  edge ex;
+  edge ex, latch_edge;
   struct lim_aux_data *lim_data;
+  bool multi_threaded_model_p = false;
+  bool ref_read_in_loop_p = false;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
     {
@@ -1985,23 +2128,55 @@ execute_sm (struct loop *loop, VEC (edge
   fmt_data.orig_loop = loop;
   for_each_index (&ref->mem, force_move_till, &fmt_data);
 
+  if ((flag_tm && loop_preheader_edge (loop)->src->flags & BB_IN_TRANSACTION)
+      || !PARAM_VALUE (PARAM_ALLOW_STORE_DATA_RACES))
+    multi_threaded_model_p = true;
+
+  if (multi_threaded_model_p)
+    store_flag = execute_sm_if_changed_flag_set (loop, ref);
+
   rewrite_mem_refs (loop, ref, tmp_var);
 
-  /* Emit the load & stores.  */
-  load = gimple_build_assign (tmp_var, unshare_expr (ref->mem));
-  lim_data = init_lim_data (load);
-  lim_data->max_loop = loop;
-  lim_data->tgt_loop = loop;
-
-  /* Put this into the latch, so that we are sure it will be processed after
-     all dependencies.  */
-  gsi_insert_on_edge (loop_latch_edge (loop), load);
 
-  FOR_EACH_VEC_ELT (edge, exits, i, ex)
-    {
-      store = gimple_build_assign (unshare_expr (ref->mem), tmp_var);
-      gsi_insert_on_edge (ex, store);
+  /* Emit the load code into the latch, so that we are sure it will
+     be processed after all dependencies.  */
+  latch_edge = loop_latch_edge (loop);
+  if (multi_threaded_model_p)
+    {
+      bitmap reads = VEC_index (bitmap, memory_accesses.reads_in_loop,
+                               loop->num);
+      ref_read_in_loop_p = bitmap_bit_p (reads, ref->id);
+    }
+  /* For the multi-threaded variant, we can avoid the load altogether,
+     since the store is predicated by a flag.  However, do the load if it
+     was originally in the loop.  */
+  if (!multi_threaded_model_p || ref_read_in_loop_p)
+    {
+      load = gimple_build_assign (tmp_var, unshare_expr (ref->mem));
+      lim_data = init_lim_data (load);
+      lim_data->max_loop = loop;
+      lim_data->tgt_loop = loop;
+      gsi_insert_on_edge (latch_edge, load);
+    }
+  if (multi_threaded_model_p)
+    {
+      load = gimple_build_assign (store_flag, boolean_false_node);
+      lim_data = init_lim_data (load);
+      lim_data->max_loop = loop;
+      lim_data->tgt_loop = loop;
+      gsi_insert_on_edge (latch_edge, load);
     }
+
+  /* Sink the store to every exit from the loop.  */
+  FOR_EACH_VEC_ELT (edge, exits, i, ex)
+    if (!multi_threaded_model_p)
+      {
+       gimple store;
+       store = gimple_build_assign (unshare_expr (ref->mem), tmp_var);
+       gsi_insert_on_edge (ex, store);
+      }
+    else
+      execute_sm_if_changed (ex, ref->mem, tmp_var, store_flag);
 }
 
 /* Hoists memory references MEM_REFS out of LOOP.  EXITS is the list of exit
@@ -2433,6 +2608,10 @@ tree_ssa_lim_finalize (void)
     BITMAP_FREE (b);
   VEC_free (bitmap, heap, memory_accesses.refs_in_loop);
 
+  FOR_EACH_VEC_ELT (bitmap, memory_accesses.reads_in_loop, i, b)
+    BITMAP_FREE (b);
+  VEC_free (bitmap, heap, memory_accesses.reads_in_loop);
+
   FOR_EACH_VEC_ELT (bitmap, memory_accesses.all_refs_in_loop, i, b)
     BITMAP_FREE (b);
   VEC_free (bitmap, heap, memory_accesses.all_refs_in_loop);
Index: testsuite/gcc.dg/pr52558-1.c
===================================================================
--- testsuite/gcc.dg/pr52558-1.c        (revision 0)
+++ testsuite/gcc.dg/pr52558-1.c        (revision 0)
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "--param allow-store-data-races=0 -O2 -fdump-tree-lim1" } */
+
+int count;
+
+struct obj {
+    int data;
+    struct obj *next;
+} *q;
+
+void func()
+{
+  struct obj *p;
+  for (p = q; p; p = p->next)
+    if (p->data > 0)
+      count++;
+}
+
+/* { dg-final { scan-tree-dump-times "MEM count_lsm.. count_lsm_flag" 1 "lim1" } } */
+/* { dg-final { cleanup-tree-dump "lim1" } } */
Index: testsuite/gcc.dg/pr52558-2.c
===================================================================
--- testsuite/gcc.dg/pr52558-2.c        (revision 0)
+++ testsuite/gcc.dg/pr52558-2.c        (revision 0)
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "--param allow-store-data-races=0 -O2 -fdump-tree-lim1" } */
+
+int g_1 = 1;
+int g_2 = 0;
+
+int func_1(void)
+{
+ int l;
+ for (l = 0; l < 1234; l++)
+ {
+   if (g_1)
+     return l;
+   else
+     g_2 = 0;
+ }
+ return 999;
+}
+
+/* { dg-final { scan-tree-dump-times "MEM.*g_2_lsm_flag" 1 "lim1" } } */
+/* { dg-final { cleanup-tree-dump "lim1" } } */
Index: testsuite/gcc.dg/tm/reg-promotion.c
===================================================================
--- testsuite/gcc.dg/tm/reg-promotion.c (revision 0)
+++ testsuite/gcc.dg/tm/reg-promotion.c (revision 0)
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-fgnu-tm -O2 -fdump-tree-lim1" } */
+
+int count;
+
+struct obj {
+    int data;
+    struct obj *next;
+} *q;
+
+void func()
+{
+  struct obj *p;
+  __transaction_atomic {
+    for (p = q; p; p = p->next)
+      if (p->data > 0)
+       count++;
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "MEM count_lsm.. count_lsm_flag" 1 "lim1" } } */
+/* { dg-final { cleanup-tree-dump "lim1" } } */
Index: gimple.h
===================================================================
--- gimple.h    (revision 186108)
+++ gimple.h    (working copy)
@@ -305,11 +305,6 @@ struct GTY(()) gimple_statement_base {
   /* Nonzero if this statement contains volatile operands.  */
   unsigned has_volatile_ops    : 1;
 
-  /* Nonzero if this statement appears inside a transaction.  This bit
-     is calculated on de-mand and has relevant information only after
-     it has been calculated with compute_transaction_bits.  */
-  unsigned in_transaction      : 1;
-
   /* The SUBCODE field can be used for tuple-specific flags for tuples
      that do not require subcodes.  Note that SUBCODE should be at
      least as wide as tree codes, as several tuples store tree codes
@@ -1591,15 +1586,7 @@ gimple_set_has_volatile_ops (gimple stmt
 static inline bool
 gimple_in_transaction (gimple stmt)
 {
-  return stmt->gsbase.in_transaction;
-}
-
-/* Set the IN_TRANSACTION flag to TRANSACTIONP.  */
-
-static inline void
-gimple_set_in_transaction (gimple stmt, bool transactionp)
-{
-  stmt->gsbase.in_transaction = (unsigned) transactionp;
+  return gimple_bb (stmt)->flags & BB_IN_TRANSACTION;
 }
 
 /* Return true if statement STMT may access memory.  */
