Hi,
this patch adds code to produce prologues/epilogues as suggested by Ondrej Bilka
(I described the approach in more detail in
http://gcc.gnu.org/ml/gcc-patches/2013-09/msg02082.html)
This patch is an updated and cleaned-up version, following Mikhail's changes
that merged the memset/memcpy generation code. (I will continue with some
incremental cleanups of the code duplication we ended up with.)
For now the value range code is not included, but all the logic is in place for when
http://gcc.gnu.org/ml/gcc-patches/2013-09/msg02011.html
gets reviewed.
Bootstrapped/regtested on x86_64-linux, also with -minline-all-stringops, and
tested on SPEC2k6.
I will commit it later today after more testing.
Honza
* i386.h (TARGET_MISALIGNED_MOVE_STRING_PROLOGUES_EPILOGUES): New
tuning flag.
* x86-tune.def (TARGET_MISALIGNED_MOVE_STRING_PROLOGUES): Define it.
* i386.c (expand_small_movmem_or_setmem): New function.
(expand_set_or_movmem_prologue_epilogue_by_misaligned_moves): New
function.
(alg_usable_p): Add support for value ranges; cleanup.
(ix86_expand_set_or_movmem): Add support for misaligned moves.
Index: i386.h
===
--- i386.h (revision 203888)
+++ i386.h (working copy)
@@ -350,6 +350,8 @@ extern unsigned char ix86_tune_features[
#define TARGET_PROMOTE_QImode ix86_tune_features[X86_TUNE_PROMOTE_QIMODE]
#define TARGET_FAST_PREFIX ix86_tune_features[X86_TUNE_FAST_PREFIX]
#define TARGET_SINGLE_STRINGOP ix86_tune_features[X86_TUNE_SINGLE_STRINGOP]
+#define TARGET_MISALIGNED_MOVE_STRING_PROLOGUES_EPILOGUES \
+ ix86_tune_features[TARGET_MISALIGNED_MOVE_STRING_PROLOGUES]
#define TARGET_QIMODE_MATH ix86_tune_features[X86_TUNE_QIMODE_MATH]
#define TARGET_HIMODE_MATH ix86_tune_features[X86_TUNE_HIMODE_MATH]
#define TARGET_PROMOTE_QI_REGS ix86_tune_features[X86_TUNE_PROMOTE_QI_REGS]
Index: x86-tune.def
===
--- x86-tune.def(revision 203888)
+++ x86-tune.def(working copy)
@@ -239,6 +239,15 @@ DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CM
as MOVS and STOS (without a REP prefix) to move/set sequences of bytes. */
DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, single_stringop, m_386 | m_P4_NOCONA)
+/* TARGET_MISALIGNED_MOVE_STRING_PROLOGUES: Enable generation of compact
+ prologues and epilogues by issuing misaligned moves. This requires
+ the target to handle misaligned moves and partial memory stalls reasonably
+ well.
+ FIXME: This actually may be a win on more targets than listed here. */
+DEF_TUNE (TARGET_MISALIGNED_MOVE_STRING_PROLOGUES,
+ misaligned_move_string_prologues,
+ m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)
+
/* X86_TUNE_USE_SAHF: Controls use of SAHF. */
DEF_TUNE (X86_TUNE_USE_SAHF, use_sahf,
m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
Index: i386.c
===
--- i386.c (revision 203888)
+++ i386.c (working copy)
@@ -22734,6 +22734,314 @@ expand_set_or_movmem_prologue (rtx destm
return destmem;
}
+/* Test if COUNT & SIZE is nonzero and if so, expand a movmem
+ or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
+ and jump to DONE_LABEL. */
+static void
+expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
+ rtx destptr, rtx srcptr,
+ rtx value, rtx vec_value,
+ rtx count, int size,
+ rtx done_label, bool issetmem)
+{
+ rtx label = ix86_expand_aligntest (count, size, false);
+ enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
+ rtx modesize;
+ int n;
+
+ /* If we do not have vector value to copy, we must reduce size. */
+ if (issetmem)
+{
+ if (!vec_value)
+ {
+ if (GET_MODE (value) == VOIDmode size 8)
+ mode = Pmode;
+ else if (GET_MODE_SIZE (mode) GET_MODE_SIZE (GET_MODE (value)))
+ mode = GET_MODE (value);
+ }
+ else
+ mode = GET_MODE (vec_value), value = vec_value;
+}
+ else
+{
+ /* Choose appropriate vector mode. */
+ if (size = 32)
+ mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
+ else if (size = 16)
+ mode = TARGET_SSE ? V16QImode : DImode;
+ srcmem = change_address (srcmem, mode, srcptr);
+}
+ destmem = change_address (destmem, mode, destptr);
+ modesize = GEN_INT (GET_MODE_SIZE (mode));
+ gcc_assert (GET_MODE_SIZE (mode) = size);
+ for (n = 0; n * GET_MODE_SIZE (mode) size; n++)
+{
+ if (issetmem)
+ emit_move_insn (destmem, gen_lowpart (mode, value));
+ else
+ {
+ emit_move_insn (destmem, srcmem);
+ srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
+ }
+ destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));