Handle store cacheline hazard for A700 cpus by inserting two NOP_S
between ST ST LD or their logical equivalent (like ST ST NOP_S NOP_S
J_L.D LD)

gcc/
2016-08-01  Claudiu Zissulescu  <claz...@synopsys.com>

        * config/arc/arc-arch.h (ARC_TUNE_ARC7XX): New tune value.
        * config/arc/arc.c (arc_active_insn): New function.
        (check_store_cacheline_hazard): Likewise.
        (workaround_arc_anomaly): Use check_store_cacheline_hazard.
        (arc_override_options): Disable delay slot scheduler for older
        A7.
        (arc_store_addr_hazard_p): New implementation, old one renamed to
        ...
        (arc_store_addr_hazard_internal_p): Renamed.
        (arc_reorg): Don't combine into brcc instructions which are part
        of hardware hazard solution.
        * config/arc/arc.md (attr tune): Consider new arc7xx tune value.
        (tune_arc700): Likewise.
        * config/arc/arc.opt (arc7xx): New tune value.
        * config/arc/arc700.md: Improve A7 scheduler.
---
 gcc/config/arc/arc-arch.h |   1 +
 gcc/config/arc/arc.c      | 142 ++++++++++++++++++++++++++++++++------
 gcc/config/arc/arc.md     |   8 ++-
 gcc/config/arc/arc.opt    |   3 +
 gcc/config/arc/arc700.md  |  18 +----
 5 files changed, 132 insertions(+), 40 deletions(-)

diff --git a/gcc/config/arc/arc-arch.h b/gcc/config/arc/arc-arch.h
index 859af0684b8..ad540607e55 100644
--- a/gcc/config/arc/arc-arch.h
+++ b/gcc/config/arc/arc-arch.h
@@ -71,6 +71,7 @@ enum arc_tune_attr
   {
     ARC_TUNE_NONE,
     ARC_TUNE_ARC600,
+    ARC_TUNE_ARC7XX,
     ARC_TUNE_ARC700_4_2_STD,
     ARC_TUNE_ARC700_4_2_XMAC,
     ARC_TUNE_CORE_3,
diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
index ab7735d6b38..90454928379 100644
--- a/gcc/config/arc/arc.c
+++ b/gcc/config/arc/arc.c
@@ -1308,6 +1308,10 @@ arc_override_options (void)
   if (TARGET_LONG_CALLS_SET)
     target_flags &= ~MASK_MILLICODE_THUNK_SET;
 
+  /* A7 has an issue with delay slots.  */
+  if (TARGET_ARC700 && (arc_tune != ARC_TUNE_ARC7XX))
+    flag_delayed_branch = 0;
+
   /* These need to be done at start up.  It's convenient to do them here.  */
   arc_init ();
 }
@@ -7529,11 +7533,91 @@ arc_invalid_within_doloop (const rtx_insn *insn)
   return NULL;
 }
 
+static rtx_insn *
+arc_active_insn (rtx_insn *insn)
+{
+  rtx_insn *nxt = next_active_insn (insn);
+
+  if (nxt && GET_CODE (PATTERN (nxt)) == ASM_INPUT)
+    nxt = next_active_insn (nxt);
+  return nxt;
+}
+
+/* Search for a sequence made out of two stores and a given number of
+   loads, insert a nop if required.  */
+
+static void
+check_store_cacheline_hazard (void)
+{
+  rtx_insn *insn, *succ0, *insn1;
+  bool found = false;
+
+  for (insn = get_insns (); insn; insn = arc_active_insn (insn))
+    {
+      succ0 = arc_active_insn (insn);
+
+      if (!succ0)
+       return;
+
+      if (!single_set (insn) || !single_set (succ0))
+       continue;
+
+      if ((get_attr_type (insn) != TYPE_STORE)
+         || (get_attr_type (succ0) != TYPE_STORE))
+       continue;
+
+      /* Found at least two consecutive stores.  Goto the end of the
+        store sequence.  */
+      for (insn1 = succ0; insn1; insn1 = arc_active_insn (insn1))
+       if (!single_set (insn1) || get_attr_type (insn1) != TYPE_STORE)
+         break;
+
+      /* Now, check the next two instructions for the following cases:
+         1. next instruction is a LD => insert 2 nops between store
+           sequence and load.
+        2. next-next instruction is a LD => inset 1 nop after the store
+           sequence.  */
+      if (insn1 && single_set (insn1)
+         && (get_attr_type (insn1) == TYPE_LOAD))
+       {
+         found = true;
+         emit_insn_before (gen_nopv (), insn1);
+         emit_insn_before (gen_nopv (), insn1);
+       }
+      else
+       {
+         if (insn1 && (get_attr_type (insn1) == TYPE_COMPARE))
+           {
+             /* REG_SAVE_NOTE is used by Haifa scheduler, we are in
+                reorg, so it is safe to reuse it for avoiding the
+                current compare insn to be part of a BRcc
+                optimization.  */
+             add_reg_note (insn1, REG_SAVE_NOTE, GEN_INT (3));
+           }
+         insn1 = arc_active_insn (insn1);
+         if (insn1 && single_set (insn1)
+             && (get_attr_type (insn1) == TYPE_LOAD))
+           {
+             found = true;
+             emit_insn_before (gen_nopv (), insn1);
+           }
+       }
+
+      insn = insn1;
+      if (found)
+       {
+         /* warning (0, "Potential lockup sequence found, patching"); */
+         found = false;
+       }
+    }
+}
+
 /* Return true if a load instruction (CONSUMER) uses the same address as a
    store instruction (PRODUCER).  This function is used to avoid st/ld
    address hazard in ARC700 cores.  */
-bool
-arc_store_addr_hazard_p (rtx_insn* producer, rtx_insn* consumer)
+
+static bool
+arc_store_addr_hazard_internal_p (rtx_insn* producer, rtx_insn* consumer)
 {
   rtx in_set, out_set;
   rtx out_addr, in_addr;
@@ -7581,6 +7665,14 @@ arc_store_addr_hazard_p (rtx_insn* producer, rtx_insn* 
consumer)
   return false;
 }
 
+bool
+arc_store_addr_hazard_p (rtx_insn* producer, rtx_insn* consumer)
+{
+  if (TARGET_ARC700 && (arc_tune != ARC_TUNE_ARC7XX))
+    return true;
+  return arc_store_addr_hazard_internal_p (producer, consumer);
+}
+
 /* The same functionality as arc_hazard.  It is called in machine
    reorg before any other optimization.  Hence, the NOP size is taken
    into account when doing branch shortening.  */
@@ -7589,6 +7681,7 @@ static void
 workaround_arc_anomaly (void)
 {
   rtx_insn *insn, *succ0;
+  rtx_insn *succ1;
 
   /* For any architecture: call arc_hazard here.  */
   for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
@@ -7600,27 +7693,30 @@ workaround_arc_anomaly (void)
        }
     }
 
-  if (TARGET_ARC700)
-    {
-      rtx_insn *succ1;
+  if (!TARGET_ARC700)
+    return;
 
-      for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
-       {
-         succ0 = next_real_insn (insn);
-         if (arc_store_addr_hazard_p (insn, succ0))
-           {
-             emit_insn_after (gen_nopv (), insn);
-             emit_insn_after (gen_nopv (), insn);
-             continue;
-           }
+  /* Old A7 are suffering of a cache hazard, and we need to insert two
+     nops between any sequence of stores and a load.  */
+  if (arc_tune != ARC_TUNE_ARC7XX)
+    check_store_cacheline_hazard ();
 
-         /* Avoid adding nops if the instruction between the ST and LD is
-            a call or jump.  */
-         succ1 = next_real_insn (succ0);
-         if (succ0 && !JUMP_P (succ0) && !CALL_P (succ0)
-             && arc_store_addr_hazard_p (insn, succ1))
-           emit_insn_after (gen_nopv (), insn);
+  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+    {
+      succ0 = next_real_insn (insn);
+      if (arc_store_addr_hazard_internal_p (insn, succ0))
+       {
+         emit_insn_after (gen_nopv (), insn);
+         emit_insn_after (gen_nopv (), insn);
+         continue;
        }
+
+      /* Avoid adding nops if the instruction between the ST and LD is
+        a call or jump.  */
+      succ1 = next_real_insn (succ0);
+      if (succ0 && !JUMP_P (succ0) && !CALL_P (succ0)
+         && arc_store_addr_hazard_internal_p (insn, succ1))
+       emit_insn_after (gen_nopv (), insn);
     }
 }
 
@@ -8291,11 +8387,15 @@ arc_reorg (void)
              if (!link_insn)
                continue;
              else
-               /* Check if this is a data dependency.  */
                {
+                 /* Check if this is a data dependency.  */
                  rtx op, cc_clob_rtx, op0, op1, brcc_insn, note;
                  rtx cmp0, cmp1;
 
+                 /* Make sure we can use it for brcc insns.  */
+                 if (find_reg_note (link_insn, REG_SAVE_NOTE, GEN_INT (3)))
+                   continue;
+
                  /* Ok this is the set cc. copy args here.  */
                  op = XEXP (pc_target, 0);
 
diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md
index fb8a1c9ee09..caf7deda505 100644
--- a/gcc/config/arc/arc.md
+++ b/gcc/config/arc/arc.md
@@ -600,11 +600,13 @@
 ;;   somehow modify them to become inelegible for delay slots if a decision
 ;;   is made that makes conditional execution required.
 
-(define_attr "tune" "none,arc600,arc700_4_2_std,arc700_4_2_xmac, core_3, \
-archs4x, archs4xd, archs4xd_slow"
+(define_attr "tune" "none,arc600,arc7xx,arc700_4_2_std,arc700_4_2_xmac, \
+core_3, archs4x, archs4xd, archs4xd_slow"
   (const
    (cond [(symbol_ref "arc_tune == TUNE_ARC600")
          (const_string "arc600")
+         (symbol_ref "arc_tune == ARC_TUNE_ARC7XX")
+         (const_string "arc7xx")
          (symbol_ref "arc_tune == TUNE_ARC700_4_2_STD")
          (const_string "arc700_4_2_std")
          (symbol_ref "arc_tune == TUNE_ARC700_4_2_XMAC")
@@ -619,7 +621,7 @@ archs4x, archs4xd, archs4xd_slow"
         (const_string "none"))))
 
 (define_attr "tune_arc700" "false,true"
-  (if_then_else (eq_attr "tune" "arc700_4_2_std, arc700_4_2_xmac")
+  (if_then_else (eq_attr "tune" "arc7xx, arc700_4_2_std, arc700_4_2_xmac")
                (const_string "true")
                (const_string "false")))
 
diff --git a/gcc/config/arc/arc.opt b/gcc/config/arc/arc.opt
index 93e18af1d27..bcffb2720ba 100644
--- a/gcc/config/arc/arc.opt
+++ b/gcc/config/arc/arc.opt
@@ -262,6 +262,9 @@ Enum(arc_tune_attr) String(arc600) Value(ARC_TUNE_ARC600)
 EnumValue
 Enum(arc_tune_attr) String(arc601) Value(ARC_TUNE_ARC600)
 
+EnumValue
+Enum(arc_tune_attr) String(arc7xx) Value(ARC_TUNE_ARC7XX)
+
 EnumValue
 Enum(arc_tune_attr) String(arc700) Value(ARC_TUNE_ARC700_4_2_STD)
 
diff --git a/gcc/config/arc/arc700.md b/gcc/config/arc/arc700.md
index a0f9f74a9f2..cbb868d8dcd 100644
--- a/gcc/config/arc/arc700.md
+++ b/gcc/config/arc/arc700.md
@@ -145,28 +145,14 @@
 ; no functional unit runs when blockage is reserved
 (exclusion_set "blockage" "core, multiplier")
 
-(define_insn_reservation "data_load_DI" 4
-  (and (eq_attr "tune_arc700" "true")
-       (eq_attr "type" "load")
-       (match_operand:DI 0 "" ""))
-  "issue+dmp, issue+dmp, dmp_write_port, dmp_write_port")
-
 (define_insn_reservation "data_load" 3
   (and (eq_attr "tune_arc700" "true")
-       (eq_attr "type" "load")
-       (not (match_operand:DI 0 "" "")))
+       (eq_attr "type" "load"))
   "issue+dmp, nothing, dmp_write_port")
 
-(define_insn_reservation "data_store_DI" 2
-  (and (eq_attr "tune_arc700" "true")
-       (eq_attr "type" "store")
-       (match_operand:DI 0 "" ""))
-  "issue+dmp_write_port, issue+dmp_write_port")
-
 (define_insn_reservation "data_store" 1
   (and (eq_attr "tune_arc700" "true")
-       (eq_attr "type" "store")
-       (not (match_operand:DI 0 "" "")))
+       (eq_attr "type" "store"))
   "issue+dmp_write_port")
 
 (define_bypass 3 "data_store" "data_load" "arc_store_addr_hazard_p")
-- 
2.17.1

Reply via email to