Hi, this patch makes ACCUMULATE_OUTGOING_ARGS disable itself when the function is cold. I did some extra testing and to my amusement we now seem to output more compact unwind info when ACCUMULATE_OUTGOING_ARGS is disabled, so this seems to be a quite consistent code size win.
We actually can do better and enable ACCUMULATE_OUTGOING_ARGS only when the function contains hot calls. This should also avoid the need for frame allocation in prologue/epilogue on the hot path then. I will look into this incrementally. I also noticed that we still have some tuning flags in i386.c rather than in x86-tune.def so I moved them there. Testing x86_64-linux and will commit it once testing converges. Honza * config/i386/i386.h (ACCUMULATE_OUTGOING_ARGS): Disable accumulation for cold functions. * x86-tune.def (X86_TUNE_USE_LEAVE): Update comment. (X86_TUNE_PUSH_MEMORY): Likewise. (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL): New. (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, X86_TUNE_ALWAYS_FANCY_MATH_387): New. * i386.c (x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387, x86_avx256_split_unaligned_load, x86_avx256_split_unaligned_store): Remove. (ix86_option_override_internal): Update to use tune features instead of variables. Index: config/i386/i386.h =================================================================== --- config/i386/i386.h (revision 203380) +++ config/i386/i386.h (working copy) @@ -1492,13 +1492,26 @@ enum reg_class will be computed and placed into the variable `crtl->outgoing_args_size'. No space will be pushed onto the stack for each call; instead, the function prologue should increase the stack frame size by this amount. + + In 32bit mode enabling argument accumulation results in about 5% code size + growth because move instructions are less compact than push. In 64bit + mode the difference is less drastic but visible. + + FIXME: Unlike earlier implementations, the size of unwind info seems to + actually grow with accumulation. Is that because accumulated args + unwind info became unnecessarily bloated? 64-bit MS ABI seem to require 16 byte alignment everywhere except for - function prologue and apilogue. This is not possible without + function prologue and epilogue. 
This is not possible without + ACCUMULATE_OUTGOING_ARGS. + + If stack probes are required, the space used for large function + arguments on the stack must also be probed, so enable + -maccumulate-outgoing-args so this happens in the prologue. */ #define ACCUMULATE_OUTGOING_ARGS \ - (TARGET_ACCUMULATE_OUTGOING_ARGS || TARGET_64BIT_MS_ABI) + ((TARGET_ACCUMULATE_OUTGOING_ARGS && optimize_function_for_speed_p (cfun)) \ + || TARGET_STACK_PROBE || TARGET_64BIT_MS_ABI) /* If defined, a C expression whose value is nonzero when we want to use PUSH instructions to pass outgoing arguments. */ Index: config/i386/x86-tune.def =================================================================== --- config/i386/x86-tune.def (revision 203387) +++ config/i386/x86-tune.def (working copy) @@ -18,15 +18,13 @@ a copy of the GCC Runtime Library Except see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -/* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results - negatively, so enabling for Generic64 seems like good code size - tradeoff. We can't enable it for 32bit generic because it does not - work well with PPro base chips. */ +/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits. */ DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave", m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC) /* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions. - Some chips, like 486 and Pentium have problems with these sequences. */ + Some chips, like 486 and Pentium, work faster with separate load + and push instructions. */ DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory", m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC) @@ -210,6 +208,16 @@ DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OP DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal", m_COREI7 | m_BDVER | m_SLM | m_GENERIC) +/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are + split. 
*/ +DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal", + ~(m_COREI7 | m_GENERIC)) + +/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are + split. */ +DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal", + ~(m_COREI7 | m_BDVER | m_GENERIC)) + /* Use packed single precision instructions where posisble. I.e. movups instead of movupd. */ DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal", @@ -398,3 +406,24 @@ DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CM fp converts to destination register. */ DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts", m_SLM) + +/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing + arguments in prologue/epilogue instead of separately for each call + by push/pop instructions. + This increases code size by about 5% in 32bit mode, less so in 64bit mode + because parameters are passed in registers. It is a considerable + win for targets without a stack engine that prevents multiple push operations + from happening in parallel. + + FIXME: the flag is incorrectly enabled for amdfam10, Bulldozer, + Bobcat and Generic. This is because disabling it causes a large + regression on mgrid due to an IRA limitation leading to unnecessary + use of the frame pointer in 32bit mode. */ +DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args", + m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC) + +/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations, + such as fsqrt, fprem, fsin, fcos, fsincos etc. + Should be enabled for all targets that always have a coprocessor. 
*/ +DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387", + ~(m_386 | m_486)) Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 203380) +++ config/i386/i386.c (working copy) @@ -1898,18 +1898,6 @@ static unsigned int initial_ix86_arch_fe ~m_386, }; -static const unsigned int x86_accumulate_outgoing_args - = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC; - -static const unsigned int x86_arch_always_fancy_math_387 - = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC; - -static const unsigned int x86_avx256_split_unaligned_load - = m_COREI7 | m_GENERIC; - -static const unsigned int x86_avx256_split_unaligned_store - = m_COREI7 | m_BDVER | m_GENERIC; - /* In case the average insn count for single function invocation is lower than this constant, emit fast (but longer) prologue and epilogue code. */ @@ -2920,7 +2908,7 @@ static void ix86_option_override_internal (bool main_args_p) { int i; - unsigned int ix86_arch_mask, ix86_tune_mask; + unsigned int ix86_arch_mask; const bool ix86_tune_specified = (ix86_tune_string != NULL); const char *prefix; const char *suffix; @@ -3673,7 +3661,7 @@ ix86_option_override_internal (bool main /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387, since the insns won't need emulation. 
*/ - if (x86_arch_always_fancy_math_387 & ix86_arch_mask) + if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387]) target_flags &= ~MASK_NO_FANCY_MATH_387; /* Likewise, if the target doesn't have a 387, or we've specified @@ -3805,8 +3793,7 @@ ix86_option_override_internal (bool main gcc_unreachable (); } - ix86_tune_mask = 1u << ix86_tune; - if ((x86_accumulate_outgoing_args & ix86_tune_mask) + if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS] && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS) && !optimize_size) target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; @@ -3946,10 +3933,10 @@ ix86_option_override_internal (bool main if (flag_expensive_optimizations && !(target_flags_explicit & MASK_VZEROUPPER)) target_flags |= MASK_VZEROUPPER; - if ((x86_avx256_split_unaligned_load & ix86_tune_mask) + if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL] && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD; - if ((x86_avx256_split_unaligned_store & ix86_tune_mask) + if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL] && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE)) target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE; /* Enable 128-bit AVX instruction generation