Since there is

/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
   directly to memory. */
DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)

to avoid long immediate store instructions, like

   c7 02 00 00 00 00       movl   $0x0,(%rdx)
   c7 02 ff ff ff ff       movl   $0xffffffff,(%rdx)

add TARGET_USE_AND0_ORM1_STORE and enable *mov<mode>_(and|or) for
TARGET_USE_AND0_ORM1_STORE, which is true for TARGET_SPLIT_LONG_MOVES
or -Oz, to also generate:

   83 22 00                andl   $0x0,(%rdx)
   83 0a ff                orl    $0xffffffff,(%rdx)

for TARGET_SPLIT_LONG_MOVES.

gcc/

	PR target/120734
	* config/i386/i386.h (TARGET_USE_AND0_ORM1_STORE): New.
	* config/i386/i386.md (*mov<mode>_and): Replace not -Oz split
	condition with !TARGET_USE_AND0_ORM1_STORE.
	(*mov<mode>_or): Likewise.
	(peephole2): Transform "mov $0,mem" to "*mov<mode>_and" and
	"mov $-1,mem" to "*mov<mode>_or" for TARGET_USE_AND0_ORM1_STORE.

gcc/testsuite/

	PR target/120734
	* gcc.target/i386/pr120734a.c: New test.
	* gcc.target/i386/pr120734b.c: Likewise.
	* gcc.target/i386/pr120734c.c: Likewise.

OK for master?

Thanks.

-- 
H.J.
From 1e3c5540a7c57db91d8d65a98aa9b378b506b62c Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.to...@gmail.com>
Date: Sat, 21 Jun 2025 09:10:07 +0800
Subject: [PATCH] x86: Enable *mov<mode>_(and|or) for TARGET_SPLIT_LONG_MOVES

Since there is

/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
   directly to memory. */
DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)

to avoid long immediate store instructions, like

   c7 02 00 00 00 00       movl   $0x0,(%rdx)
   c7 02 ff ff ff ff       movl   $0xffffffff,(%rdx)

add TARGET_USE_AND0_ORM1_STORE and enable *mov<mode>_(and|or) for
TARGET_USE_AND0_ORM1_STORE, which is true for TARGET_SPLIT_LONG_MOVES
or -Oz, to also generate:

   83 22 00                andl   $0x0,(%rdx)
   83 0a ff                orl    $0xffffffff,(%rdx)

for TARGET_SPLIT_LONG_MOVES.

gcc/

	PR target/120734
	* config/i386/i386.h (TARGET_USE_AND0_ORM1_STORE): New.
	* config/i386/i386.md (*mov<mode>_and): Replace not -Oz split
	condition with !TARGET_USE_AND0_ORM1_STORE.
	(*mov<mode>_or): Likewise.
	(peephole2): Transform "mov $0,mem" to "*mov<mode>_and" and
	"mov $-1,mem" to "*mov<mode>_or" for TARGET_USE_AND0_ORM1_STORE.

gcc/testsuite/

	PR target/120734
	* gcc.target/i386/pr120734a.c: New test.
	* gcc.target/i386/pr120734b.c: Likewise.
	* gcc.target/i386/pr120734c.c: Likewise.

Signed-off-by: H.J. 
Lu <hjl.to...@gmail.com> --- gcc/config/i386/i386.h | 6 +++++ gcc/config/i386/i386.md | 19 +++++++------- gcc/testsuite/gcc.target/i386/pr120734a.c | 32 +++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr120734b.c | 10 +++++++ gcc/testsuite/gcc.target/i386/pr120734c.c | 10 +++++++ 5 files changed, 68 insertions(+), 9 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr120734a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr120734b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr120734c.c diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 7c16eac7700..f3e30932526 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -494,6 +494,12 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; #define TARGET_SSE_REDUCTION_PREFER_PSHUF \ ix86_tune_features[X86_TUNE_SSE_REDUCTION_PREFER_PSHUF] +/* Generate "and $0,mem" and "or $-1,mem", instead of "mov $0,mem" and + "mov $-1,mem" with shorter encoding for TARGET_SPLIT_LONG_MOVES or + -Oz. */ +#define TARGET_USE_AND0_ORM1_STORE \ + (TARGET_SPLIT_LONG_MOVES \ + || (optimize_insn_for_size_p () && optimize_size > 1)) /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 423ef48e518..7b3857f972c 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -2438,30 +2438,30 @@ (define_insn "*mov<mode>_xor" (set_attr "mode" "SI") (set_attr "length_immediate" "0")]) -;; Generate shorter "and $0,mem" for -Oz. Split it to "mov $0,mem" -;; otherwise. +;; Generate shorter "and $0,mem" for TARGET_USE_AND0_ORM1_STORE. Split +;; it to "mov $0,mem" otherwise. 
(define_insn_and_split "*mov<mode>_and" [(set (match_operand:SWI248 0 "memory_operand" "=m") (match_operand:SWI248 1 "const0_operand")) (clobber (reg:CC FLAGS_REG))] "reload_completed" "and{<imodesuffix>}\t{%1, %0|%0, %1}" - "&& !(optimize_insn_for_size_p () && optimize_size > 1)" + "&& !TARGET_USE_AND0_ORM1_STORE" [(set (match_dup 0) (match_dup 1))] "" [(set_attr "type" "alu1") (set_attr "mode" "<MODE>") (set_attr "length_immediate" "1")]) -;; Generate shorter "or $-1,mem" for -Oz. Split it to "mov $-1,mem" -;; otherwise. +;; Generate shorter "or $-1,mem" for TARGET_USE_AND0_ORM1_STORE. Split +;; it to "mov $-1,mem" otherwise. (define_insn_and_split "*mov<mode>_or" [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm") (match_operand:SWI248 1 "constm1_operand")) (clobber (reg:CC FLAGS_REG))] "reload_completed" "or{<imodesuffix>}\t{%1, %0|%0, %1}" - "&& !(optimize_insn_for_size_p () && optimize_size > 1)" + "&& !TARGET_USE_AND0_ORM1_STORE" [(set (match_dup 0) (match_dup 1))] "" [(set_attr "type" "alu1") @@ -2984,13 +2984,14 @@ (define_peephole2 gen_rtx_POST_INC (Pmode, stack_pointer_rtx)); }) -;; With -Oz, transform mov $0,mem to the shorter and $0,mem. -;; Likewise, transform mov $-1,mem to the shorter or $-1,mem. +;; With TARGET_USE_AND0_ORM1_STORE, transform "mov $0,mem" to the +;; shorter "and $0,mem". Likewise, transform "mov $-1,mem" to the +;; shorter "or $-1,mem". 
(define_peephole2 [(set (match_operand:SWI248 0 "memory_operand") (match_operand:SWI248 1 "const_int_operand"))] "(operands[1] == const0_rtx || operands[1] == constm1_rtx) - && optimize_insn_for_size_p () && optimize_size > 1 + && TARGET_USE_AND0_ORM1_STORE && peep2_regno_dead_p (0, FLAGS_REG)" [(parallel [(set (match_dup 0) (match_dup 1)) (clobber (reg:CC FLAGS_REG))])]) diff --git a/gcc/testsuite/gcc.target/i386/pr120734a.c b/gcc/testsuite/gcc.target/i386/pr120734a.c new file mode 100644 index 00000000000..4dd2f4ded5a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr120734a.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mtune-ctrl=split_long_moves,^lcp_stall" } */ +/* { dg-final { scan-assembler-not "mov\[wlq\]\[\t \]+\\\$0, " } } */ +/* { dg-final { scan-assembler-not "mov\[wlq\]\[\t \]+\\\$-1, " } } */ +/* { dg-final { scan-assembler-times "and(?:l|w|q)\[\\t ]+\\\$0, " 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "or(?:l|w|q)\[\\t ]+\\\$-1, " 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "and(?:l|w)\[\\t ]+\\\$0, " 2 { target ia32 } } } */ +/* { dg-final { scan-assembler-times "or(?:l|w)\[\\t ]+\\\$-1, " 2 { target ia32 } } } */ + +extern short s; +extern int i; +extern long long int ll; + +void +zero (void) +{ + s = 0; + i = 0; +#ifdef __x86_64__ + ll = 0; +#endif +} + +void +m1 (void) +{ + s = -1; + i = -1; +#ifdef __x86_64__ + ll = -1; +#endif +} diff --git a/gcc/testsuite/gcc.target/i386/pr120734b.c b/gcc/testsuite/gcc.target/i386/pr120734b.c new file mode 100644 index 00000000000..b261af52925 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr120734b.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mtune-ctrl=^split_long_moves,^lcp_stall" } */ +/* { dg-final { scan-assembler-not "and\[wlq\]\[\t \]+\\\$0, " } } */ +/* { dg-final { scan-assembler-not "or\[wlq\]\[\t \]+\\\$-1, " } } */ +/* { dg-final { scan-assembler-times "mov(?:w|l|q)\[\\t \]+\\\$0, " 3 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "mov(?:w|l|q)\[\\t \]+\\\$-1, " 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "mov(?:w|l)\[\\t \]+\\\$0, " 2 { target ia32 } } } */ +/* { dg-final { scan-assembler-times "mov(?:w|l)\[\\t \]+\\\$-1, " 2 { target ia32 } } } */ + +#include "pr120734a.c" diff --git a/gcc/testsuite/gcc.target/i386/pr120734c.c b/gcc/testsuite/gcc.target/i386/pr120734c.c new file mode 100644 index 00000000000..e29d3df7fe6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr120734c.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-Oz -mtune-ctrl=^split_long_moves,^lcp_stall" } */ +/* { dg-final { scan-assembler-not "mov\[wlq\]\[\t \]+\\\$0, " } } */ +/* { dg-final { scan-assembler-not "mov\[wlq\]\[\t \]+\\\$-1, " } } */ +/* { dg-final { scan-assembler-times "and(?:l|w|q)\[\\t ]+\\\$0, " 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "or(?:l|w|q)\[\\t ]+\\\$-1, " 3 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "and(?:l|w)\[\\t ]+\\\$0, " 2 { target ia32 } } } */ +/* { dg-final { scan-assembler-times "or(?:l|w)\[\\t ]+\\\$-1, " 2 { target ia32 } } } */ + +#include "pr120734a.c" -- 2.49.0