From: liuhongt <hongtao....@intel.com>

When a hot loop is small enough to fit into one cache line, we should align
the loop to a (1 << ceil_log2 (loop_size)) byte boundary without limiting the
maximum skip bytes.  This helps code prefetch.
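
For illustration only (a minimal standalone sketch, not part of the patch;
the loop size and cache line size below are made-up values): aligning the
loop to the next power of two of its size keeps the whole body inside a
single cache line whenever the body is no larger than the line.

    /* Sketch of the alignment the pass computes; not GCC code.  */
    #include <cstdio>

    static unsigned
    ceil_log2_u (unsigned x)
    {
      unsigned l = 0;
      while ((1u << l) < x)
        l++;
      return l;
    }

    int
    main ()
    {
      const unsigned cache_line_size = 64; /* e.g. ix86_cost->prefetch_block.  */
      const unsigned loop_size = 24;       /* sum of ix86_min_insn_size ().  */

      if (loop_size <= cache_line_size)
        /* The patch emits gen_max_skip_align (GEN_INT (ceil_log2 (size)),
           GEN_INT (0)) before the loop label; with a max skip of 0 the gas
           ASM_OUTPUT_MAX_SKIP_ALIGN macro prints a plain directive,
           ".p2align 5" in this example.  */
        printf (".p2align %u\n", ceil_log2_u (loop_size));
      return 0;
    }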

gcc/ChangeLog:

        * config/i386/i386.cc (ix86_avoid_jump_mispredicts): Change
        gen_pad to gen_max_skip_align.
        (ix86_align_loops): New function.
        (ix86_reorg): Call ix86_align_loops.
        * config/i386/i386.md (pad): Rename to ..
        (max_skip_align): .. this, and accept 2 operands for align and
        skip.
---
 gcc/config/i386/i386.cc | 148 +++++++++++++++++++++++++++++++++++++++-
 gcc/config/i386/i386.md |  10 +--
 2 files changed, 153 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index e67e5f62533..c617091c8e1 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23137,7 +23137,7 @@ ix86_avoid_jump_mispredicts (void)
          if (dump_file)
            fprintf (dump_file, "Padding insn %i by %i bytes!\n",
                     INSN_UID (insn), padsize);
-          emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
+         emit_insn_before (gen_max_skip_align (GEN_INT (4), GEN_INT (padsize)), insn);
        }
     }
 }
@@ -23410,6 +23410,150 @@ ix86_split_stlf_stall_load ()
     }
 }
 
+/* When a hot loop fits into one cache line,
+   force-align the loop without considering the max skip.  */
+static void
+ix86_align_loops ()
+{
+  basic_block bb;
+
+  /* Don't do this when we don't know cache line size.  */
+  if (ix86_cost->prefetch_block == 0)
+    return;
+
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      rtx_insn *label = BB_HEAD (bb);
+      bool has_fallthru = 0;
+      edge e;
+      edge_iterator ei;
+
+      if (!LABEL_P (label))
+       continue;
+
+      profile_count fallthru_count = profile_count::zero ();
+      profile_count branch_count = profile_count::zero ();
+
+      FOR_EACH_EDGE (e, ei, bb->preds)
+       {
+         if (e->flags & EDGE_FALLTHRU)
+           has_fallthru = 1, fallthru_count += e->count ();
+         else
+           branch_count += e->count ();
+       }
+
+      if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
+       continue;
+
+      if (bb->loop_father
+         && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
+         && (has_fallthru
+             ? (!(single_succ_p (bb)
+                  && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
+                && optimize_bb_for_speed_p (bb)
+                && branch_count + fallthru_count > count_threshold
+                && (branch_count > fallthru_count * param_align_loop_iterations))
+             /* In case there's no fallthru into the loop,
+                the inserted nops won't be executed.  */
+             : (branch_count > count_threshold
+                || (bb->count > bb->prev_bb->count * 10
+                    && (bb->prev_bb->count
+                        <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
+       {
+         rtx_insn* insn, *end_insn;
+         HOST_WIDE_INT size = 0;
+         bool padding_p = true;
+         basic_block tbb = bb;
+         unsigned cond_branch_num = 0;
+         bool detect_tight_loop_p = false;
+
+         for (unsigned int i = 0; i != bb->loop_father->num_nodes;
+              i++, tbb = tbb->next_bb)
+           {
+             /* Only handle contiguous cfg layout.  */
+             if (bb->loop_father != tbb->loop_father)
+               {
+                 padding_p = false;
+                 break;
+               }
+
+             FOR_BB_INSNS (tbb, insn)
+               {
+                 if (!NONDEBUG_INSN_P (insn))
+                   continue;
+                 size += ix86_min_insn_size (insn);
+
+                 /* We don't know the size of inline asm;
+                    don't align loops containing asm or calls.  */
+                 if (asm_noperands (PATTERN (insn)) >= 0
+                     || CALL_P (insn))
+                   {
+                     size = -1;
+                     break;
+                   }
+               }
+
+             if (size == -1 || size > ix86_cost->prefetch_block)
+               {
+                 padding_p = false;
+                 break;
+               }
+
+             FOR_EACH_EDGE (e, ei, tbb->succs)
+               {
+                 /* An edge back to the loop header: this is a tight loop.  */
+                 if (e->dest == bb)
+                   {
+                     detect_tight_loop_p = true;
+                     break;
+                   }
+               }
+
+             if (detect_tight_loop_p)
+               break;
+
+             end_insn = BB_END (tbb);
+             if (JUMP_P (end_insn))
+               {
+                 /* For the decoded icache (DSB):
+                    1. Up to two branches are allowed per Way.
+                    2. An unconditional branch is the last micro-op
+                       in a Way.  */
+                 if (onlyjump_p (end_insn)
+                     && (any_uncondjump_p (end_insn)
+                         || single_succ_p (tbb)))
+                   {
+                     padding_p = false;
+                     break;
+                   }
+                 else if (++cond_branch_num >= 2)
+                   {
+                     padding_p = false;
+                     break;
+                   }
+               }
+
+           }
+
+         if (padding_p && detect_tight_loop_p)
+           {
+             emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
+                                                   GEN_INT (0)), label);
+             /* End of function.  */
+             if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
+               break;
+             /* Skip bb which already fits into one cacheline.  */
+             bb = tbb;
+           }
+       }
+    }
+
+  loop_optimizer_finalize ();
+  free_dominance_info (CDI_DOMINATORS);
+}
+
 /* Implement machine specific optimizations.  We implement padding of returns
    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
 static void
@@ -23433,6 +23577,8 @@ ix86_reorg (void)
 #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
       if (TARGET_FOUR_JUMP_LIMIT)
        ix86_avoid_jump_mispredicts ();
+
+      ix86_align_loops ();
 #endif
     }
 }
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 764bfe20ff2..686de0bf2ff 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -19150,16 +19150,18 @@
    (set_attr "length_immediate" "0")
    (set_attr "modrm" "0")])
 
-;; Pad to 16-byte boundary, max skip in op0.  Used to avoid
+;; Pad to 1 << op0 byte boundary, max skip in op1.  Used to avoid
 ;; branch prediction penalty for the third jump in a 16-byte
 ;; block on K8.
+;; It is also used to align tight loops that fit into one cache line.
+;; This helps code prefetch and reduces DSB misses.
 
-(define_insn "pad"
-  [(unspec_volatile [(match_operand 0)] UNSPECV_ALIGN)]
+(define_insn "max_skip_align"
+  [(unspec_volatile [(match_operand 0) (match_operand 1)] UNSPECV_ALIGN)]
   ""
 {
 #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
-  ASM_OUTPUT_MAX_SKIP_ALIGN (asm_out_file, 4, (int)INTVAL (operands[0]));
+  ASM_OUTPUT_MAX_SKIP_ALIGN (asm_out_file, (int)INTVAL (operands[0]), (int)INTVAL (operands[1]));
 #else
   /* It is tempting to use ASM_OUTPUT_ALIGN here, but we don't want to do that.
      The align insn is used to avoid 3 jump instructions in the row to improve
-- 
2.31.1
