On 04/08/16 16:56, Richard Earnshaw (lists) wrote: > On 04/08/16 12:06, Wilco Dijkstra wrote: >> Improve stack adjustment by reusing a temporary move immediate >> from the epilog if the register is still valid in the epilog. This generates >> smaller code for leaf functions: >> >> mov x16, 40000 >> sub sp, sp, x16 >> ldr w0, [sp, w0, sxtw 2] >> add sp, sp, x16 >> ret >> >> Passes GCC regression tests. >> >> ChangeLog: >> 2016-08-04 Wilco Dijkstra <wdijk...@arm.com> >> >> gcc/ >> * config/aarch64/aarch64.c (aarch64_add_constant): >> Add extra argument to allow emitting the move immediate. >> Use add/sub with positive immediate. >> (aarch64_expand_epilogue): Decide when to leave out move. >> >> testsuite/ >> * gcc.target/aarch64/test_frame_17.c: New test. >> -- >> > > I see you've added a default argument for your new parameter. I think > doing that is fine, but I have two comments about how we might use that > in this case. > > Firstly, if this parameter is suitable for having a default value, then > I think the preceding one should also be treated in the same way. > > Secondly, I think (due to these parameters being BOOL with no useful > context to make it clear which is which) that having wrapper functions > (inlined, of course) that describe the intended behaviour more clearly > would be useful. So, defining > > static inline void > aarch64_add_frame_constant (mode, regnum, scratchreg, delta) > { > aarch64_add_frame_constant (mode, regnum, scratchreg, delta, true);
The inner function should be aarch64_add_constant of course! R. > } > > would make it clearer at the call point than having a lot of true and > false parameters scattered round the code. > > Alternatively we might remove all the default parameters and require > wrappers like the above to make it more explicit which API is intended - > this might make more sense if not all combinations make sense. > > R. > >> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c >> index >> ce2cc5ae3e1291f4ef4a8408461678c9397b06bd..5b59e4dd157351f301fc563a724cefe8a9be132c >> 100644 >> --- a/gcc/config/aarch64/aarch64.c >> +++ b/gcc/config/aarch64/aarch64.c >> @@ -1941,15 +1941,21 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm) >> } >> >> /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to held >> - intermediate value if necessary. >> + intermediate value if necessary. FRAME_RELATED_P should be true if >> + the RTX_FRAME_RELATED flag should be set and CFA adjustments added >> + to the generated instructions. If SCRATCHREG is known to hold >> + abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the >> + immediate again. >> >> - This function is sometimes used to adjust the stack pointer, so we must >> - ensure that it can never cause transient stack deallocation by writing an >> - invalid value into REGNUM. */ >> + Since this function may be used to adjust the stack pointer, we must >> + ensure that it cannot cause transient stack deallocation (for example >> + by first incrementing SP and then decrementing when adjusting by a >> + large immediate). */ >> >> static void >> aarch64_add_constant (machine_mode mode, int regnum, int scratchreg, >> - HOST_WIDE_INT delta, bool frame_related_p) >> + HOST_WIDE_INT delta, bool frame_related_p, >> + bool emit_move_imm = true) >> { >> HOST_WIDE_INT mdelta = abs_hwi (delta); >> rtx this_rtx = gen_rtx_REG (mode, regnum); >> @@ -1967,11 +1973,11 @@ aarch64_add_constant (machine_mode mode, int regnum, >> int scratchreg, >> return; >> } >> >> - /* We need two add/sub instructions, each one performing part of the >> - calculation. Don't do this if the addend can be loaded into register >> with >> - a single instruction, in that case we prefer a move to a scratch >> register >> - following by an addition. */ >> - if (mdelta < 0x1000000 && !aarch64_move_imm (delta, mode)) >> + /* We need two add/sub instructions, each one perform part of the >> + addition/subtraction, but don't this if the addend can be loaded into >> + register by single instruction, in that case we prefer a move to >> scratch >> + register following by addition. */ >> + if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode)) >> { >> HOST_WIDE_INT low_off = mdelta & 0xfff; >> >> @@ -1985,8 +1991,10 @@ aarch64_add_constant (machine_mode mode, int regnum, >> int scratchreg, >> >> /* Otherwise use generic function to handle all other situations. */ >> rtx scratch_rtx = gen_rtx_REG (mode, scratchreg); >> - aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode); >> - insn = emit_insn (gen_add2_insn (this_rtx, scratch_rtx)); >> + if (emit_move_imm) >> + aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, >> mode); >> + insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx) >> + : gen_add2_insn (this_rtx, scratch_rtx)); >> if (frame_related_p) >> { >> RTX_FRAME_RELATED_P (insn) = frame_related_p; >> @@ -3288,7 +3296,8 @@ aarch64_expand_epilogue (bool for_sibcall) >> RTX_FRAME_RELATED_P (insn) = callee_adjust == 0; >> } >> else >> - aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, final_adjust, true); >> + aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, final_adjust, true, >> + df_regs_ever_live_p (IP1_REGNUM)); >> >> aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, >> R30_REGNUM, >> callee_adjust != 0, &cfi_ops); >> @@ -3311,7 +3320,8 @@ aarch64_expand_epilogue (bool for_sibcall) >> cfi_ops = NULL; >> } >> >> - aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, initial_adjust, true); >> + aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, initial_adjust, true, >> + df_regs_ever_live_p (IP0_REGNUM)); >> >> if (cfi_ops) >> { >> diff --git a/gcc/testsuite/gcc.target/aarch64/test_frame_17.c >> b/gcc/testsuite/gcc.target/aarch64/test_frame_17.c >> new file mode 100644 >> index >> 0000000000000000000000000000000000000000..c214431999b60cce8a75204876a8c73ec6304128 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/aarch64/test_frame_17.c >> @@ -0,0 +1,21 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-O2 --save-temps" } */ >> + >> +/* Test reuse of stack adjustment temporaries. */ >> + >> +void foo (); >> + >> +int reuse_mov (int i) >> +{ >> + int arr[1025]; >> + return arr[i]; >> +} >> + >> +int no_reuse_mov (int i) >> +{ >> + int arr[1025]; >> + foo (); >> + return arr[i]; >> +} >> + >> +/* { dg-final { scan-assembler-times "mov\tx16, \[0-9\]+" 3 } } */ >> >