On Tue, Dec 10, 2019 at 10:57 AM Jakub Jelinek <ja...@redhat.com> wrote: > > Hi! > > The stack_protect_set_1_<mode> pattern intentionally clears the register it > used as a temporary to read the canary from the register and push it back > on the stack for security reasons, to make sure the stack canary isn't > spilled somewhere else etc. On the following testcase, we end up with a > weird: > movq %fs:40, %rax > movq %rax, 24(%rsp) > xorl %eax, %eax > movl $30, %eax > sequence though, where the reporter rightfully complains it is a waste > to clear the register and immediately set it to something else. > > We really don't want to split this into two patterns, because then the > scheduler and whatever other post-RA passes could stick some further > code in between and increase the lifetime of the security sensitive > data in the register. > > So, the following patch uses peephole2 to merge the > stack_protect_set_1_<mode> insn with following setter of the same register. > Only SImode and for TARGET_64BIT DImode moves are considered, as QI/HImode > (unless actually emitted as SImode) moves don't overwrite the whole > register, and for simplicity only the most common cases (no XMM/MM etc. > sources). > > Bootstrapped/regtested on x86_64-linux and i686-linux, additionally tested > (just check-gcc check-c++-all check-target-libstdc++-v3 with > --target_board=unix/-fstack-protector-strong). The -fstack-protector-strong > testing was done with additional logging whenever the peephole2's kick in. > During the -m64 testing, it kicked in 17682 times, during -m32 testing > 6344 times. Ok for trunk? > > 2019-12-10 Jakub Jelinek <ja...@redhat.com> > > PR target/92841 > * config/i386/i386.md (*stack_protect_set_2_<mode>, > *stack_protect_set_3): New define_insns and corresponding > define_peephole2s. > > * gcc.target/i386/pr92841.c: New test.
OK with a couple of changes below. Thanks, Uros. > --- gcc/config/i386/i386.md.jj 2019-12-05 10:04:05.357235555 +0100 > +++ gcc/config/i386/i386.md 2019-12-09 19:29:31.578885594 +0100 > @@ -19732,6 +19732,98 @@ (define_insn "@stack_protect_set_1_<mode > "mov{<imodesuffix>}\t{%1, %2|%2, %1}\;mov{<imodesuffix>}\t{%2, %0|%0, > %2}\;xor{l}\t%k2, %k2" > [(set_attr "type" "multi")]) > > +;; Patterns and peephole2s to optimize stack_protect_set_1_<mode> > +;; immediately followed by *mov{s,d}i_internal to the same register, > +;; where we can avoid the xor{l} above. We don't split this, so that > +;; scheduling or anything else doesn't separate the *stack_protect_set* > +;; pattern from the set of the register that overwrites the register > +;; with a new value. > +(define_insn "*stack_protect_set_2_<mode>" > + [(set (match_operand:PTR 0 "memory_operand" "=m") > + (unspec:PTR [(match_operand:PTR 3 "memory_operand" "m")] > + UNSPEC_SP_SET)) > + (set (match_operand:SI 1 "register_operand" "=&r") > + (match_operand:SI 2 "general_operand" "g")) > + (clobber (reg:CC FLAGS_REG))] > + "reload_completed > + && !reg_overlap_mentioned_p (operands[1], operands[2])" > +{ > + output_asm_insn ("mov{<imodesuffix>}\t{%3, %<k>1|%<k>1, %3}", operands); > + output_asm_insn ("mov{<imodesuffix>}\t{%<k>1, %0|%0, %<k>1}", operands); This looks much better than the current asm template in stack_protect_set_1_<mode> and stack_protect_test_1_<mode>. Can you maybe also change templates there? > + if (pic_32bit_operand (operands[2], SImode) > + || ix86_use_lea_for_mov (insn, operands + 1)) > + return "lea{l}\t{%E2, %1|%1, %E2}"; > + else > + return "mov{l}\t{%2, %1|%1, %2}"; > +} > + [(set_attr "type" "multi") > + (set_attr "length" "24")]) > + > +(define_peephole2 > + [(parallel [(set (match_operand:PTR 0 "memory_operand") > + (unspec:PTR [(match_operand:PTR 1 "memory_operand")] > + UNSPEC_SP_SET)) > + (set (match_operand:PTR 2 "general_reg_operand") (const_int 0)) > + (clobber (reg:CC FLAGS_REG))]) > + (set (match_operand:SI 3 "general_reg_operand") > + (match_operand:SI 4 "general_operand"))] > + "reload_completed > + && REGNO (operands[2]) == REGNO (operands[3]) > + && !reg_overlap_mentioned_p (operands[3], operands[4]) > + && (general_reg_operand (operands[4], SImode) > + || !register_operand (operands[4], SImode))" > + [(parallel [(set (match_dup 0) > + (unspec:PTR [(match_dup 1)] UNSPEC_SP_SET)) > + (set (match_dup 3) (match_dup 4)) > + (clobber (reg:CC FLAGS_REG))])]) No need for "reload_completed" in peephole2 patterns. Also (IIRC), operand predicate is not needed for operand 4 when it is mentioned in the insn condition. > +(define_insn "*stack_protect_set_3" > + [(set (match_operand:DI 0 "memory_operand" "=m,m,m") > + (unspec:DI [(match_operand:DI 3 "memory_operand" "m,m,m")] > + UNSPEC_SP_SET)) > + (set (match_operand:DI 1 "register_operand" "=&r,r,r") > + (match_operand:DI 2 "general_operand" "Z,rem,i")) > + (clobber (reg:CC FLAGS_REG))] > + "TARGET_64BIT > + && reload_completed > + && !reg_overlap_mentioned_p (operands[1], operands[2])" > +{ > + output_asm_insn ("mov{q}\t{%3, %1|%1, %3}", operands); > + output_asm_insn ("mov{q}\t{%1, %0|%0, %1}", operands); > + if (which_alternative == 0) > + return "mov{l}\t{%k2, %k1|%k1, %k2}"; > + else if (which_alternative == 2) > + return "movabs{q}\t{%2, %1|%1, %2}"; > + else if (pic_32bit_operand (operands[2], DImode) > + || ix86_use_lea_for_mov (insn, operands + 1)) > + return "lea{q}\t{%E2, %1|%1, %E2}"; > + else > + return "mov{q}\t{%2, %1|%1, %2}"; > +} > + [(set_attr "type" "multi") > + (set_attr "length" "24")]) > + > +(define_peephole2 > + [(parallel [(set (match_operand:DI 0 "memory_operand") > + (unspec:DI [(match_operand:DI 1 "memory_operand")] > + UNSPEC_SP_SET)) > + (set (match_operand:DI 2 "general_reg_operand") (const_int 0)) > + (clobber (reg:CC FLAGS_REG))]) > + (set (match_dup 2) (match_operand:DI 3 "general_operand"))] > + "TARGET_64BIT > + && reload_completed > + && !reg_overlap_mentioned_p (operands[2], operands[3]) > + && (general_reg_operand (operands[3], DImode) > + || memory_operand (operands[3], DImode) > + || x86_64_zext_immediate_operand (operands[3], DImode) > + || x86_64_immediate_operand (operands[3], DImode) > + || (CONSTANT_P (operands[3]) > + && (!flag_pic || LEGITIMATE_PIC_OPERAND_P (operands[3]))))" > + [(parallel [(set (match_dup 0) > + (unspec:PTR [(match_dup 1)] UNSPEC_SP_SET)) > + (set (match_dup 2) (match_dup 3)) > + (clobber (reg:CC FLAGS_REG))])]) Same here. > (define_expand "stack_protect_test" > [(match_operand 0 "memory_operand") > (match_operand 1 "memory_operand") > --- gcc/testsuite/gcc.target/i386/pr92841.c.jj 2019-12-09 19:38:29.572759215 > +0100 > +++ gcc/testsuite/gcc.target/i386/pr92841.c 2019-12-09 19:40:59.642492417 > +0100 > @@ -0,0 +1,17 @@ > +/* PR target/92841 */ > +/* { dg-do compile { target fstack_protector } } */ > +/* { dg-options "-O2 -fstack-protector-strong -masm=att" } */ > +/* { dg-final { scan-assembler-not "xor\[lq]\t%(\[re]\[a-z0-9]*), > %\\1\[\n\r]*\tmov\[lq]\t\[^\n\r]*, %\\1" } } */ > + > +const struct S { int b; } c[] = {30, 12, 20, 0, 11}; > +void bar (int *); > + > +void > +foo (void) > +{ > + int e[4]; > + const struct S *a; > + for (a = c; a < c + sizeof (c); a++) > + if (a->b) > + bar (e); > +} > > Jakub >