Here is a new patch to avoid LCP (length-changing prefix) stalls, incorporating
the feedback on the earlier patch. I modified H.J.'s old patch so that the
peephole2 splits immediate moves to HImode memory. This is now enabled for
Core2, Corei7 and Generic.
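
For illustration, here is roughly the transformation on a minimal (hypothetical)
test case; the exact scratch register depends on what is free at peephole2 time:

    /* test.c, compiled with -O2 -mtune=corei7 */
    void
    set16 (short *p)
    {
      *p = 0x1234;
    }

Before, the store is a single instruction whose 0x66 operand-size prefix plus
16-bit immediate forms a length-changing prefix that can stall the decoders on
Core2/Corei7:

    movw    $0x1234, (%rdi)

After the split, the immediate goes through a scratch register (the HImode
register load should be emitted as a 32-bit movl, which has no LCP), and the
store itself no longer carries an immediate:

    movl    $0x1234, %eax
    movw    %ax, (%rdi)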

I verified that this enables the splitting to occur in the case that originally
motivated the optimization. If we subsequently find situations where LCP stalls
hurt performance but the extra register required for the splitting is not
available at peephole2 time, then we can revisit whether the splitting should
be performed earlier.

I also measured SPEC 2000/2006 performance using Generic64 on an AMD Opteron
and the results were neutral.

Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk?

Thanks,
Teresa

2012-04-04  Teresa Johnson  <tejohn...@google.com>

        * config/i386/i386.h (ix86_tune_indices): Add
        X86_TUNE_LCP_STALL.
        (TARGET_LCP_STALL): New define.
        * config/i386/i386.md (move immediate to memory peephole2):
        Add cases for HImode move when LCP stall avoidance is needed.
        * config/i386/i386.c (initial_ix86_tune_features): Initialize
        X86_TUNE_LCP_STALL entry.

Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h  (revision 185920)
+++ config/i386/i386.h  (working copy)
@@ -262,6 +262,7 @@ enum ix86_tune_indices {
   X86_TUNE_MOVX,
   X86_TUNE_PARTIAL_REG_STALL,
   X86_TUNE_PARTIAL_FLAG_REG_STALL,
+  X86_TUNE_LCP_STALL,
   X86_TUNE_USE_HIMODE_FIOP,
   X86_TUNE_USE_SIMODE_FIOP,
   X86_TUNE_USE_MOV0,
@@ -340,6 +341,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_L
 #define TARGET_PARTIAL_REG_STALL ix86_tune_features[X86_TUNE_PARTIAL_REG_STALL]
 #define TARGET_PARTIAL_FLAG_REG_STALL \
        ix86_tune_features[X86_TUNE_PARTIAL_FLAG_REG_STALL]
+#define TARGET_LCP_STALL \
+       ix86_tune_features[X86_TUNE_LCP_STALL]
 #define TARGET_USE_HIMODE_FIOP ix86_tune_features[X86_TUNE_USE_HIMODE_FIOP]
 #define TARGET_USE_SIMODE_FIOP ix86_tune_features[X86_TUNE_USE_SIMODE_FIOP]
 #define TARGET_USE_MOV0                ix86_tune_features[X86_TUNE_USE_MOV0]
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md (revision 185920)
+++ config/i386/i386.md (working copy)
@@ -16977,9 +16977,11 @@
    (set (match_operand:SWI124 0 "memory_operand")
         (const_int 0))]
   "optimize_insn_for_speed_p ()
-   && !TARGET_USE_MOV0
-   && TARGET_SPLIT_LONG_MOVES
-   && get_attr_length (insn) >= ix86_cur_cost ()->large_insn
+   && ((TARGET_LCP_STALL
+       && GET_MODE (operands[0]) == HImode)
+       || (!TARGET_USE_MOV0
+          && TARGET_SPLIT_LONG_MOVES
+          && get_attr_length (insn) >= ix86_cur_cost ()->large_insn))
    && peep2_regno_dead_p (0, FLAGS_REG)"
   [(parallel [(set (match_dup 2) (const_int 0))
              (clobber (reg:CC FLAGS_REG))])
@@ -16991,8 +16993,10 @@
    (set (match_operand:SWI124 0 "memory_operand")
         (match_operand:SWI124 1 "immediate_operand"))]
   "optimize_insn_for_speed_p ()
-   && TARGET_SPLIT_LONG_MOVES
-   && get_attr_length (insn) >= ix86_cur_cost ()->large_insn"
+   && ((TARGET_LCP_STALL
+       && GET_MODE (operands[0]) == HImode)
+       || (TARGET_SPLIT_LONG_MOVES
+          && get_attr_length (insn) >= ix86_cur_cost ()->large_insn))"
   [(set (match_dup 2) (match_dup 1))
    (set (match_dup 0) (match_dup 2))])
 
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c  (revision 185920)
+++ config/i386/i386.c  (working copy)
@@ -1964,6 +1964,10 @@ static unsigned int initial_ix86_tune_features[X86
   /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
   m_CORE2I7 | m_GENERIC,
 
+  /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
+     on 16-bit immediate moves into memory on Core2 and Corei7.  */
+  m_CORE2I7 | m_GENERIC,
+
   /* X86_TUNE_USE_HIMODE_FIOP */
   m_386 | m_486 | m_K6_GEODE,
 

--
This patch is available for review at http://codereview.appspot.com/5975045
