From: Christoph Müllner <[email protected]>
The current implementation of riscv_block_move_straight() emits a couple
of load-store pairs with maximum width (e.g. 8-byte for RV64).
The remainder is handed over to move_by_pieces(), which emits code
based on target settings like slow_unaligned_access and
overlap_op_by_pieces.
move_by_pieces() will emit overlapping memory accesses with maximum
width only if the given length exceeds the size of one such access
(e.g. 15 bytes for 8-byte accesses).
This patch changes the implementation of riscv_block_move_straight()
such that it preserves a remainder within the interval
[delta..2*delta) instead of [0..delta), so that overlapping memory
accesses may be emitted (if the target's requirements for them are
met).
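For illustration only (not part of the patch), here is a minimal
standalone sketch that mimics the changed loop bound, assuming an
8-byte access width (RV64) and the lengths used in
memcpy-overlapping.c; show_split() is a hypothetical helper, not code
from the compiler:

#include <stdio.h>

/* Illustrative only: mirror the straight-loop bound before and after
   this change and print how many full-width chunks the straight loop
   would copy and how many bytes are left for move_by_pieces().  */
static void
show_split (unsigned length, int keep_large_remainder)
{
  const unsigned delta = 8;	/* access width, 8 bytes on RV64 */
  unsigned offset = 0;

  if (keep_large_remainder)
    /* New bound: stop early, leaving a remainder in [delta..2*delta).  */
    while (offset + 2 * delta <= length)
      offset += delta;
  else
    /* Old bound: copy as much as possible, remainder in [0..delta).  */
    while (offset + delta <= length)
      offset += delta;

  printf ("length %2u: %u full chunks, %2u-byte remainder\n",
	  length, offset / delta, length - offset);
}

int
main (void)
{
  /* Lengths taken from gcc.target/riscv/memcpy-overlapping.c.  */
  show_split (27, 0);	/* old: 3 chunks,  3-byte remainder */
  show_split (27, 1);	/* new: 2 chunks, 11-byte remainder */
  show_split (31, 0);	/* old: 3 chunks,  7-byte remainder */
  show_split (31, 1);	/* new: 2 chunks, 15-byte remainder */
  return 0;
}

The larger remainder is what allows move_by_pieces() to cover the tail
with full-width overlapping accesses instead of a sequence of narrower
loads and stores.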
gcc/ChangeLog:
	* config/riscv/riscv-string.cc (riscv_block_move_straight):
	Adjust range for emitted load/store pairs.
gcc/testsuite/ChangeLog:
	* gcc.target/riscv/memcpy-overlapping.c: Adjust expected
	instruction counts.
Signed-off-by: Christoph Müllner <[email protected]>
---
gcc/config/riscv/riscv-string.cc | 8 ++++----
.../gcc.target/riscv/memcpy-overlapping.c | 19 ++++++++-----------
2 files changed, 12 insertions(+), 15 deletions(-)
diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 6882f0be269..1137df475be 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -57,18 +57,18 @@ riscv_block_move_straight (rtx dest, rtx src, unsigned HOST_WIDE_INT length)
delta = bits / BITS_PER_UNIT;
/* Allocate a buffer for the temporary registers. */
- regs = XALLOCAVEC (rtx, length / delta);
+ regs = XALLOCAVEC (rtx, length / delta - 1);
/* Load as many BITS-sized chunks as possible. Use a normal load if
the source has enough alignment, otherwise use left/right pairs. */
- for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
+ for (offset = 0, i = 0; offset + 2 * delta <= length; offset += delta, i++)
{
regs[i] = gen_reg_rtx (mode);
riscv_emit_move (regs[i], adjust_address (src, mode, offset));
}
/* Copy the chunks to the destination. */
- for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
+ for (offset = 0, i = 0; offset + 2 * delta <= length; offset += delta, i++)
riscv_emit_move (adjust_address (dest, mode, offset), regs[i]);
/* Mop up any left-over bytes. */
@@ -166,7 +166,7 @@ riscv_expand_block_move (rtx dest, rtx src, rtx length)
if (hwi_length <= (RISCV_MAX_MOVE_BYTES_STRAIGHT / factor))
{
- riscv_block_move_straight (dest, src, INTVAL (length));
+ riscv_block_move_straight (dest, src, hwi_length);
return true;
}
else if (optimize && align >= BITS_PER_WORD)
diff --git a/gcc/testsuite/gcc.target/riscv/memcpy-overlapping.c b/gcc/testsuite/gcc.target/riscv/memcpy-overlapping.c
index ffb7248bfd1..ef95bfb879b 100644
--- a/gcc/testsuite/gcc.target/riscv/memcpy-overlapping.c
+++ b/gcc/testsuite/gcc.target/riscv/memcpy-overlapping.c
@@ -25,26 +25,23 @@ COPY_N(15)
/* Emits 2x {ld,sd} and 1x {lw,sw}. */
COPY_N(19)
-/* Emits 3x ld and 3x sd. */
+/* Emits 3x {ld,sd}. */
COPY_N(23)
/* The by-pieces infrastructure handles up to 24 bytes.
So the code below is emitted via cpymemsi/block_move_straight. */
-/* Emits 3x {ld,sd} and 1x {lhu,lbu,sh,sb}. */
+/* Emits 3x {ld,sd} and 1x {lw,sw}. */
COPY_N(27)
-/* Emits 3x {ld,sd} and 1x {lw,lbu,sw,sb}. */
+/* Emits 4x {ld,sd}. */
COPY_N(29)
-/* Emits 3x {ld,sd} and 2x {lw,sw}. */
+/* Emits 4x {ld,sd}. */
COPY_N(31)
-/* { dg-final { scan-assembler-times "ld\t" 21 } } */
-/* { dg-final { scan-assembler-times "sd\t" 21 } } */
+/* { dg-final { scan-assembler-times "ld\t" 23 } } */
+/* { dg-final { scan-assembler-times "sd\t" 23 } } */
-/* { dg-final { scan-assembler-times "lw\t" 5 } } */
-/* { dg-final { scan-assembler-times "sw\t" 5 } } */
-
-/* { dg-final { scan-assembler-times "lbu\t" 2 } } */
-/* { dg-final { scan-assembler-times "sb\t" 2 } } */
+/* { dg-final { scan-assembler-times "lw\t" 3 } } */
+/* { dg-final { scan-assembler-times "sw\t" 3 } } */
--
2.38.1