Hi,

recently, I came across a problem that keeps a load instruction in a
loop although it is loop-invariant.

A simple example is:

#include <stdio.h>

#define SZ 256
int a[SZ], b[SZ], c[SZ];

int main() {

  int i;
  for (i = 0; i < SZ; i++) {
    a[i] = b[i] + c[i];
  }

  printf("%d\n", a[0]);
}

The resulting loop code in s390 assembly is:
(built with -march=z196 -O3)

.L2:
  l     %r3, 0(%r1, %r5)  # load b[i]
  a     %r3, 0(%r1, %r4)  # add c[i]
  st    %r3, 0(%r1, %r12) # store a[i]
  larl  %r3, a            # <-- load a, why?
  aghi  %r1, 4            # incr i
  brctg %r2, .L2          # branch

followed by the printf call:

  lgf   %r3, 0(%r3)       # load a[0]
  larl  %r2, .LC1         # load "%d\n"
  brasl %r14, printf      # call printf

The cse1 pass detects that "larl %r3, a" and "lgf %r3, 0(%r3)"
can use the same internal register and unifies them.

loop2_invariant then realizes that "larl  %r3, a" is loop-invariant and
moves it out of the loop leaving a register-register move in the loop
that should be removed in some subsequent pass.
However, cprop3 replaces the source register of this move by the
constant symbolref "a" again, the moved "larl  %r3, a" is removed by
cse2 directly afterwards and we are back to square one...
(I could also post the corresponding RTL if requested)

The problem begins, imho, when cse1 extends the BB of the loop to the BB
after the loop and uses the same register for the load in the loop as
well as after the loop. This prevents PRE, cprop and loop2_invariant
from correctly moving the load outside of the loop.

I wonder if this behavior is intended in general or if I'm missing
something? On i386, my example works because the load of "a" before
printf is handled differently.

I wrote a patch that disables basic block extension out of loops in the
first CSE pass for s390 which fixes the problem for me. It uses a target
hook and should only affect s390 but is not yet thoroughly tested.

Any opinions/remarks on this?

Regards
 Robin
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 538587e..c4be0b9 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -13871,6 +13871,11 @@ s390_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1, const_tree ty
   return NULL;
 }
 
+static inline bool
+s390_cse_extend_out_of_loops (void)
+{
+  return false;
+}
 
 /* Initialize GCC target structure.  */
 
@@ -14092,6 +14097,9 @@ s390_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1, const_tree ty
 #undef TARGET_ASM_FILE_END
 #define TARGET_ASM_FILE_END s390_asm_file_end
 
+#undef TARGET_CSE_EXTEND_OUT_OF_LOOPS
+#define TARGET_CSE_EXTEND_OUT_OF_LOOPS s390_cse_extend_out_of_loops
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-s390.h"
diff --git a/gcc/cse.c b/gcc/cse.c
index 2a33827..1aa1539 100644
--- a/gcc/cse.c
+++ b/gcc/cse.c
@@ -69,6 +69,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "df.h"
 #include "dbgcnt.h"
 #include "rtl-iter.h"
+#include "cfgloop.h"
 
 /* The basic idea of common subexpression elimination is to go
    through the code, keeping a record of expressions that would
@@ -451,6 +452,8 @@ struct table_elt
   char flag;
 };
 
+static bool extend_out_of_loops;
+
 /* We don't want a lot of buckets, because we rarely have very many
    things stored in the hash table, and a lot of buckets slows
    down a lot of loops that happen frequently.  */
@@ -6344,7 +6347,11 @@ cse_find_path (basic_block first_bb, struct cse_basic_block_data *data,
 	      && single_pred_p (e->dest)
 	      /* Avoid visiting basic blocks twice.  The large comment
 		 above explains why this can happen.  */
-	      && !bitmap_bit_p (cse_visited_basic_blocks, e->dest->index))
+	      && !bitmap_bit_p (cse_visited_basic_blocks, e->dest->index)
+	      /* Extend the basic block out of a loop if allowed */
+	      && (!loop_exit_edge_p( e->src->loop_father, e)
+	      || (loop_exit_edge_p (e->src->loop_father, e)
+	          && extend_out_of_loops)))
 	    {
 	      basic_block bb2 = e->dest;
 	      bitmap_set_bit (cse_visited_basic_blocks, bb2->index);
@@ -7521,6 +7528,8 @@ rest_of_handle_cse (void)
   if (dump_file)
     dump_flow_info (dump_file, dump_flags);
 
+  extend_out_of_loops = targetm.cse_extend_out_of_loops();
+
   tem = cse_main (get_insns (), max_reg_num ());
 
   /* If we are not running more CSE passes, then we are no longer
@@ -7582,10 +7591,14 @@ static unsigned int
 rest_of_handle_cse2 (void)
 {
   int tem;
+  int save_cel;
 
   if (dump_file)
     dump_flow_info (dump_file, dump_flags);
 
+  save_cel = extend_out_of_loops;
+  extend_out_of_loops = true;
+
   tem = cse_main (get_insns (), max_reg_num ());
 
   /* Run a pass to eliminate duplicated assignments to condition code
@@ -7606,6 +7619,7 @@ rest_of_handle_cse2 (void)
   else if (tem == 1)
     cleanup_cfg (0);
 
+  extend_out_of_loops = save_cel;
   cse_not_expected = 1;
   return 0;
 }
@@ -7656,12 +7670,16 @@ static unsigned int
 rest_of_handle_cse_after_global_opts (void)
 {
   int save_cfj;
+  int save_cel;
   int tem;
 
   /* We only want to do local CSE, so don't follow jumps.  */
   save_cfj = flag_cse_follow_jumps;
   flag_cse_follow_jumps = 0;
 
+  save_cel = extend_out_of_loops;
+  extend_out_of_loops = true;
+
   rebuild_jump_labels (get_insns ());
   tem = cse_main (get_insns (), max_reg_num ());
   purge_all_dead_edges ();
@@ -7681,6 +7699,7 @@ rest_of_handle_cse_after_global_opts (void)
     cleanup_cfg (0);
 
   flag_cse_follow_jumps = save_cfj;
+  extend_out_of_loops = save_cel;
   return 0;
 }
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 6c5bfab..661841c 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5448,6 +5448,13 @@ The typical use of this macro is to handle addresses containing
 a label_ref or symbol_ref within an UNSPEC@.
 @end defmac
 
+@deftypefn {Target Hook} bool TARGET_CSE_EXTEND_OUT_OF_LOOPS (void)
+This hook returns true if the first CSE pass may extend basic blocks
+across loop boundaries i.e. from inside a loop out of it.
+
+By default, it returns true.
+@end deftypefn
+
 @deftypefn {Target Hook} rtx TARGET_LEGITIMIZE_ADDRESS (rtx @var{x}, rtx @var{oldx}, machine_mode @var{mode})
 This hook is given an invalid memory address @var{x} for an
 operand of mode @var{mode} and should try to return a valid memory
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 8d6dfbc..cabd9f9 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4142,6 +4142,8 @@ The typical use of this macro is to handle addresses containing
 a label_ref or symbol_ref within an UNSPEC@.
 @end defmac
 
+@hook TARGET_CSE_EXTEND_OUT_OF_LOOPS
+
 @hook TARGET_LEGITIMIZE_ADDRESS
 
 @defmac LEGITIMIZE_RELOAD_ADDRESS (@var{x}, @var{mode}, @var{opnum}, @var{type}, @var{ind_levels}, @var{win})
diff --git a/gcc/target.def b/gcc/target.def
index a00181a..1a86f4a 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -920,6 +920,16 @@ DEFHOOK
 
 HOOK_VECTOR_END (asm_out)
 
+/* Hook to control if CSE may extend basic blocks over loop ends*/
+DEFHOOK
+(cse_extend_out_of_loops,
+ "This hook returns true if the first CSE pass may extend basic blocks\n\
+across loop boundaries i.e. from inside a loop out of it.\n\
+\n\
+By default, it returns true.",
+ bool, (void),
+ hook_bool_void_true)
+
 /* Functions relating to instruction scheduling.  All of these
    default to null pointers, which haifa-sched.c looks for and handles.  */
 #undef HOOK_PREFIX

Reply via email to