Hi Richard,
This is the updated version that could pass all regression test on
Power9-LE.
Just need another "maybe_lt (GET_MODE_SIZE (new_mode), access_size)"
before generating shift for store_info->const_rhs to ensure correct
constant is generated, take testsuite/gfortran1/equiv_2.x for example:
equiv_2.x-equiv_2.f90.264r.cse2:
5: r119:DI=0x6867666564636261
6: [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=[sfp:DI+0x21]
20: r125:CC=cmp(r124:SI,r123:SI)
Bad code due to get SImode subreg0 and shift right 8 bits got 0x646362 in
insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
5: r119:DI=0x6867666564636261
32: r126:SI=0x646362
6: [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=r126:SI
20: r125:CC=cmp(r124:SI,r123:SI)
Good code, get 0x65646362 in insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
5: r119:DI=0x6867666564636261
32: r126:SI=0x65646362
6: [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=r126:SI
20: r125:CC=cmp(r124:SI,r123:SI)
So the new_mode size should also be GE than access_size for const_rhs shift.
It seems a bit more complicated now...
[PATCH v4] dse: Remove partial load after full store for high part
access[PR71309]
This patch could optimize(works for char/short/int/void*):
6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]
=>
6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8
Final ASM will be as below without partial load after full store(stxv+ld):
ld 10,16(3)
mr 9,3
ld 3,24(3)
std 10,0(9)
std 3,8(9)
blr
It could achieve ~25% performance improvement for typical cases on
Power9. Bootstrap and regression tested on Power9-LE.
For AArch64, one ldr is replaced by mov with this patch:
ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]
=>
mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]
gcc/ChangeLog:
2020-07-24 Xionghu Luo <luo...@linux.ibm.com>
PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.
gcc/testsuite/ChangeLog:
2020-07-24 Xionghu Luo <luo...@linux.ibm.com>
PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
---
gcc/dse.c | 25 ++++++++++++++--
gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++++++++++++++++++++++
2 files changed, 56 insertions(+), 2 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c
diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..a7cdc43a182 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1728,8 +1728,7 @@ find_shift_sequence (poly_int64 access_size,
the machine. */
opt_scalar_int_mode new_mode_iter;
- FOR_EACH_MODE_FROM (new_mode_iter,
- smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+ FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
{
rtx target, new_reg, new_lhs;
rtx_insn *shift_seq, *insn;
@@ -1744,6 +1743,9 @@ find_shift_sequence (poly_int64 access_size,
e.g. at -Os, even when no actual shift will be needed. */
if (store_info->const_rhs)
{
+ if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+ continue;
+
poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
store_mode, byte);
@@ -1779,6 +1781,25 @@ find_shift_sequence (poly_int64 access_size,
&& !targetm.modes_tieable_p (new_mode, store_mode))
continue;
+ if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
+ && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+ {
+ /* Try to implement the shift using a subreg. */
+ poly_int64 offset = subreg_offset_from_lsb (new_mode,
+ store_mode, shift);
+ rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
+ store_mode, offset);
+ if (rhs_subreg)
+ {
+ read_reg
+ = extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+ break;
+ }
+ }
+
+ if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+ continue;
+
new_reg = gen_reg_rtx (new_mode);
start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c
b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 00000000000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+ TYPE2 mnt;
+ TYPE dentry;
+};
+
+struct nameidata {
+ struct path path;
+ struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+ TYPE d;
+ TYPE2 d2;
+
+ nd->path = nd->root;
+ d = nd->path.dentry;
+ d2 = nd->path.mnt;
+ return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */