This patch addresses an issue where GCC generates sub-optimal code for
64-bit (DImode) loads and stores on RV32 targets when the 'zilsd'
extension is enabled. Instead of using the 'ld'/'sd' instructions
provided by zilsd, the compiler was splitting each access into two
32-bit 'lw'/'sw' instructions.
The issue was caused by:
1. The cost model underestimating the cost of splitting DImode
operations relative to keeping them whole.
2. The lack of specific shift patterns for RV32 DImode, causing early
splitting in the expand pass.
3. Strict address validity checks rejecting DImode LO_SUM addresses.
This patch:
1. Adjusts riscv_rtx_costs to favor ZILSD DImode operations.
2. Relaxes riscv_valid_lo_sum_p for ZILSD.
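3. Implements deferred splitting for DImode shifts (ashl, ashr, lshr)
using a new helper riscv_expand_di_shift_32bit. Because the split runs
after reload, the helper takes a scratch register rather than creating
new pseudos, and loads the constant 32 into it before subtracting, which
ensures valid immediate handling and avoids ICEs.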
Regression tested on riscv-sim (rv32i_zilsd).
Fixed gcc.target/riscv/zilsd-code-gen-split-subreg-2.c.
gcc/ChangeLog:
* config/riscv/riscv-protos.h (riscv_expand_di_shift_32bit): New
prototype.
* config/riscv/riscv.cc (riscv_valid_lo_sum_p): Allow DImode LO_SUM for
ZILSD.
(riscv_address_insns): Adjust cost calculation for ZILSD DImode.
(riscv_rtx_costs): Adjust costs to favor DImode operations on ZILSD;
explicitly set SImode move costs.
(riscv_expand_di_shift_32bit): New helper function to expand 64-bit
shifts into 32-bit instructions using a scratch register.
* config/riscv/riscv.md (ashrdi3): New define_insn_and_split for ZILSD.
(ashldi3): Likewise.
(lshrdi3): Likewise.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/zilsd-code-gen-split-subreg-2.c: Remove XFAIL and
TODO.
---
gcc/config/riscv/riscv-protos.h | 2 +-
gcc/config/riscv/riscv.cc | 137 +++++++++---------
gcc/config/riscv/riscv.md | 33 +++--
.../riscv/zilsd-code-gen-split-subreg-2.c | 7 +-
4 files changed, 96 insertions(+), 83 deletions(-)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index ce8b9c44019..c4f03b1d799 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -861,7 +861,7 @@ extern void riscv_reset_previous_fndecl (void);
extern rtx riscv_prefetch_cookie (rtx, rtx);
extern bool riscv_prefetch_offset_address_p (rtx, machine_mode);
-extern void riscv_expand_di_shift_32bit (rtx, rtx, rtx, enum rtx_code);
+extern void riscv_expand_di_shift_32bit (rtx, rtx, rtx, rtx, enum rtx_code);
struct riscv_tune_param;
/* Information about one micro-arch we know about. */
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 244e95b508f..b5e6688d9d9 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -16766,107 +16766,114 @@ riscv_prefetch_offset_address_p (rtx x,
machine_mode mode)
/* Helper to expand 64-bit shifts on RV32 (ZILSD) using 32-bit operations. */
void
-riscv_expand_di_shift_32bit (rtx dest, rtx src, rtx count, enum rtx_code code)
+riscv_expand_di_shift_32bit (rtx dest, rtx src, rtx count, rtx scratch,
+ enum rtx_code code)
{
rtx dest_lo = gen_lowpart (SImode, dest);
rtx dest_hi = gen_highpart (SImode, dest);
rtx src_lo = gen_lowpart (SImode, src);
rtx src_hi = gen_highpart (SImode, src);
- rtx sh_reg = gen_lowpart (SImode, count); /* Shift amount as SImode register
*/
+ /* Shift amount as SImode register. */
+ rtx sh_reg = gen_lowpart (SImode, count);
/* Create temporaries for shift count comparison. */
rtx sh_lt_32_label = gen_label_rtx ();
rtx done_label = gen_label_rtx ();
- rtx sh_minus_32 = gen_reg_rtx (SImode);
- /* sh_minus_32 = sh_reg - 32 */
+ /* Use scratch for sh_minus_32. */
+ rtx sh_minus_32 = scratch;
+
+ /* sh_minus_32 = sh_reg - 32. */
emit_insn (gen_addsi3 (sh_minus_32, sh_reg, GEN_INT (-32)));
- /* if (sh_minus_32 < 0) goto sh_lt_32_label (i.e., if sh_reg < 32) */
+ /* if (sh_minus_32 < 0) goto sh_lt_32_label (i.e., if sh_reg < 32). */
emit_cmp_and_jump_insns (sh_minus_32, const0_rtx, LT, NULL_RTX, SImode, 0,
- sh_lt_32_label);
+ sh_lt_32_label);
/* --- Case: sh_reg >= 32 --- */
if (code == ASHIFTRT)
{
- /* dest_hi = src_hi >> 31 (sign extend) */
+ /* dest_hi = src_hi >> 31 (sign extend). */
emit_insn (gen_ashrsi3 (dest_hi, src_hi, GEN_INT (31)));
- /* dest_lo = src_hi >> (sh_reg - 32) */
- emit_insn (gen_ashrsi3 (dest_lo, src_hi, sh_minus_32));
+ /* dest_lo = src_hi >> (sh_reg - 32). */
+ emit_insn (gen_ashrsi3 (dest_lo, src_hi,
+ gen_lowpart (QImode, sh_minus_32)));
}
else if (code == LSHIFTRT)
{
- /* dest_hi = 0 */
+ /* dest_hi = 0. */
emit_move_insn (dest_hi, const0_rtx);
- /* dest_lo = src_hi >> (sh_reg - 32) */
- emit_insn (gen_lshrsi3 (dest_lo, src_hi, sh_minus_32));
+ /* dest_lo = src_hi >> (sh_reg - 32). */
+ emit_insn (gen_lshrsi3 (dest_lo, src_hi,
+ gen_lowpart (QImode, sh_minus_32)));
}
- else /* ASHIFT (Left Shift) */
+ else /* ASHIFT (Left Shift). */
{
- /* dest_lo = 0 */
+ /* dest_lo = 0. */
emit_move_insn (dest_lo, const0_rtx);
- /* dest_hi = src_lo << (sh_reg - 32) */
- emit_insn (gen_ashlsi3 (dest_hi, src_lo, sh_minus_32));
+ /* dest_hi = src_lo << (sh_reg - 32). */
+ emit_insn (gen_ashlsi3 (dest_hi, src_lo,
+ gen_lowpart (QImode, sh_minus_32)));
}
- emit_jump (done_label); /* Jump to end */
+ emit_jump (done_label); /* Jump to end. */
/* --- Case: sh_reg < 32 (sh_lt_32_label) --- */
emit_label (sh_lt_32_label);
- rtx sh_complement_32 = gen_reg_rtx (SImode);
- /* sh_complement_32 = 32 - sh_reg */
- emit_insn (gen_subsi3 (sh_complement_32, GEN_INT (32), sh_reg));
+ rtx sh_complement_32 = scratch;
+ /* sh_complement_32 = 32 - sh_reg. */
+ emit_move_insn (sh_complement_32, GEN_INT (32));
+ emit_insn (gen_subsi3 (sh_complement_32, sh_complement_32, sh_reg));
if (code == ASHIFTRT)
{
- /* dest_hi = src_hi >> sh_reg */
- emit_insn (gen_ashrsi3 (dest_hi, src_hi, sh_reg));
-
- rtx tmp1 = gen_reg_rtx (SImode);
- /* tmp1 = src_lo >> sh_reg */
- emit_insn (gen_lshrsi3 (tmp1, src_lo, sh_reg));
-
- rtx tmp2 = gen_reg_rtx (SImode);
- /* tmp2 = src_hi << (32 - sh_reg) */
- emit_insn (gen_ashlsi3 (tmp2, src_hi, sh_complement_32));
-
- /* dest_lo = tmp1 | tmp2 */
- emit_insn (gen_iorsi3 (dest_lo, tmp1, tmp2));
+ /* dest_hi = src_hi >> sh_reg. */
+ emit_insn (gen_ashrsi3 (dest_hi, src_hi, gen_lowpart (QImode, sh_reg)));
+
+ /* Reuse dest_lo for tmp1 = src_lo >> sh_reg. */
+ emit_insn (gen_lshrsi3 (dest_lo, src_lo, gen_lowpart (QImode, sh_reg)));
+
+ /* Reuse scratch for tmp2 = src_hi << (32 - sh_reg).
+ Note: scratch holds sh_complement_32 (shift amount) as input,
+ and is overwritten with result. */
+ emit_insn (gen_ashlsi3 (scratch, src_hi,
+ gen_lowpart (QImode, sh_complement_32)));
+
+ /* dest_lo = tmp1 | tmp2. */
+ emit_insn (gen_iorsi3 (dest_lo, dest_lo, scratch));
}
else if (code == LSHIFTRT)
{
- /* dest_hi = src_hi >>> sh_reg */
- emit_insn (gen_lshrsi3 (dest_hi, src_hi, sh_reg));
-
- rtx tmp1 = gen_reg_rtx (SImode);
- /* tmp1 = src_lo >>> sh_reg */
- emit_insn (gen_lshrsi3 (tmp1, src_lo, sh_reg));
-
- rtx tmp2 = gen_reg_rtx (SImode);
- /* tmp2 = src_hi << (32 - sh_reg) */
- emit_insn (gen_ashlsi3 (tmp2, src_hi, sh_complement_32));
-
- /* dest_lo = tmp1 | tmp2 */
- emit_insn (gen_iorsi3 (dest_lo, tmp1, tmp2));
- }
- else /* ASHIFT (Left Shift) */
- {
- /* dest_lo = src_lo << sh_reg */
- emit_insn (gen_ashlsi3 (dest_lo, src_lo, sh_reg));
-
- rtx tmp1 = gen_reg_rtx (SImode);
- /* tmp1 = src_hi << sh_reg */
- emit_insn (gen_ashlsi3 (tmp1, src_hi, sh_reg));
-
- rtx tmp2 = gen_reg_rtx (SImode);
- /* tmp2 = src_lo >>> (32 - sh_reg) */
- emit_insn (gen_lshrsi3 (tmp2, src_lo, sh_complement_32));
-
- /* dest_hi = tmp1 | tmp2 */
- emit_insn (gen_iorsi3 (dest_hi, tmp1, tmp2));
- }
-
- emit_label (done_label); /* End of shift logic */
+ /* dest_hi = src_hi >>> sh_reg. */
+ emit_insn (gen_lshrsi3 (dest_hi, src_hi, gen_lowpart (QImode, sh_reg)));
+
+ /* Reuse dest_lo for tmp1 = src_lo >>> sh_reg. */
+ emit_insn (gen_lshrsi3 (dest_lo, src_lo, gen_lowpart (QImode, sh_reg)));
+
+ /* Reuse scratch for tmp2 = src_hi << (32 - sh_reg). */
+ emit_insn (gen_ashlsi3 (scratch, src_hi,
+ gen_lowpart (QImode, sh_complement_32)));
+
+ /* dest_lo = tmp1 | tmp2. */
+ emit_insn (gen_iorsi3 (dest_lo, dest_lo, scratch));
+ }
+ else /* ASHIFT (Left Shift). */
+ {
+ /* dest_lo = src_lo << sh_reg. */
+ emit_insn (gen_ashlsi3 (dest_lo, src_lo, gen_lowpart (QImode, sh_reg)));
+
+ /* Reuse dest_hi for tmp1 = src_hi << sh_reg. */
+ emit_insn (gen_ashlsi3 (dest_hi, src_hi, gen_lowpart (QImode, sh_reg)));
+
+ /* Reuse scratch for tmp2 = src_lo >>> (32 - sh_reg). */
+ emit_insn (gen_lshrsi3 (scratch, src_lo,
+ gen_lowpart (QImode, sh_complement_32)));
+
+ /* dest_hi = tmp1 | tmp2. */
+ emit_insn (gen_iorsi3 (dest_hi, dest_hi, scratch));
+ }
+
+ emit_label (done_label); /* End of shift logic. */
}
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 2984732eb4c..9db889b8544 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -779,42 +779,51 @@
(define_insn_and_split "ashrdi3"
[(set (match_operand:DI 0 "register_operand" "=&r")
- (ashiftrt:DI (match_operand:DI 1 "register_operand" "r")
- (match_operand:QI 2 "register_operand" "r")))]
+ (ashiftrt:DI (match_operand:DI 1 "register_operand" "r")
+ (match_operand:QI 2 "register_operand" "r")))
+ (clobber (match_scratch:SI 3 "=&r"))]
"TARGET_ZILSD && !TARGET_64BIT"
"#"
"&& reload_completed"
[(const_int 0)]
{
- riscv_expand_di_shift_32bit (operands[0], operands[1], operands[2],
ASHIFTRT);
+ riscv_expand_di_shift_32bit (operands[0], operands[1], operands[2],
+ operands[3], ASHIFTRT);
DONE;
- })
+ }
+ [(set_attr "type" "shift")])
(define_insn_and_split "ashldi3"
[(set (match_operand:DI 0 "register_operand" "=&r")
- (ashift:DI (match_operand:DI 1 "register_operand" "r")
- (match_operand:QI 2 "register_operand" "r")))]
+ (ashift:DI (match_operand:DI 1 "register_operand" "r")
+ (match_operand:QI 2 "register_operand" "r")))
+ (clobber (match_scratch:SI 3 "=&r"))]
"TARGET_ZILSD && !TARGET_64BIT"
"#"
"&& reload_completed"
[(const_int 0)]
{
- riscv_expand_di_shift_32bit (operands[0], operands[1], operands[2],
ASHIFT);
+ riscv_expand_di_shift_32bit (operands[0], operands[1], operands[2],
+ operands[3], ASHIFT);
DONE;
- })
+ }
+ [(set_attr "type" "shift")])
(define_insn_and_split "lshrdi3"
[(set (match_operand:DI 0 "register_operand" "=&r")
- (lshiftrt:DI (match_operand:DI 1 "register_operand" "r")
- (match_operand:QI 2 "register_operand" "r")))]
+ (lshiftrt:DI (match_operand:DI 1 "register_operand" "r")
+ (match_operand:QI 2 "register_operand" "r")))
+ (clobber (match_scratch:SI 3 "=&r"))]
"TARGET_ZILSD && !TARGET_64BIT"
"#"
"&& reload_completed"
[(const_int 0)]
{
- riscv_expand_di_shift_32bit (operands[0], operands[1], operands[2],
LSHIFTRT);
+ riscv_expand_di_shift_32bit (operands[0], operands[1], operands[2],
+ operands[3], LSHIFTRT);
DONE;
- })
+ }
+ [(set_attr "type" "shift")])
(define_expand "addv<mode>4"
[(set (match_operand:GPR 0 "register_operand" "=r,r")
diff --git a/gcc/testsuite/gcc.target/riscv/zilsd-code-gen-split-subreg-2.c
b/gcc/testsuite/gcc.target/riscv/zilsd-code-gen-split-subreg-2.c
index 3adcd21ea06..8e68efa702a 100644
--- a/gcc/testsuite/gcc.target/riscv/zilsd-code-gen-split-subreg-2.c
+++ b/gcc/testsuite/gcc.target/riscv/zilsd-code-gen-split-subreg-2.c
@@ -6,11 +6,8 @@ long long foo(long long x)
{
return y >> x;
}
-/* TODO: We should not split that 64 bit load into two 32 bit load if we have
- zilsd, but we split that during the expand time, so it's hard to fix via
cost
- model turning, we could either fix that for expander, or...combine those two
- 32 bit load back later. */
-/* { dg-final { scan-assembler-times "ld\t" 1 { xfail riscv*-*-* } } } */
+
+/* { dg-final { scan-assembler-times "ld\t" 1 } } */
/* Os and Oz will use libcall, so the 64 bit load won't be split. */
/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Oz" } } */
--
2.43.0