In the usercopy fixups, the intermediate in-order copy step can add significant overhead when the fault occurs a large number of bytes into the buffer. On inspection of the copy routine, it appears possible to exploit the property that, at the point of a fault, all bytes below the fault address minus N bytes (128 for this algorithm) have already been copied.
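For illustration only (this is not part of the patch, and the names here are hypothetical), the intended computation performed by the subs/ccmp/csel sequence below is roughly the following C sketch: at a fault on fault_addr while copying from a user buffer starting at src, every byte below max(fault_addr - N, src) is already in place, so the byte-wise fixup scan can resume from that address instead of the start of the buffer.

  #define FIXUP_GRANULE 128  /* N for this copy routine */

  /*
   * Lowest address the byte-wise fixup scan needs to revisit; all
   * bytes below the returned address have already been copied.
   */
  static unsigned long fixup_resume(unsigned long fault_addr,
                                    unsigned long src)
  {
          /* Clamp to the start of the buffer. */
          if (fault_addr < src + FIXUP_GRANULE)
                  return src;
          return fault_addr - FIXUP_GRANULE;
  }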
This adds a preprocessor directive for defining the value that should be
subtracted from the intermediate fault address by the first-stage fixup
routine. This is the fixup's only dependency on the copy routine, so the
value should be re-evaluated when importing new optimized copy routines,
to determine whether the property still holds or whether N needs to be
increased, so that the fixup remains precise.

Signed-off-by: Oliver Swede <oli.sw...@arm.com>
---
 arch/arm64/lib/copy_user_fixup.S | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/arch/arm64/lib/copy_user_fixup.S b/arch/arm64/lib/copy_user_fixup.S
index 4858edd55994..970370b5b0a5 100644
--- a/arch/arm64/lib/copy_user_fixup.S
+++ b/arch/arm64/lib/copy_user_fixup.S
@@ -1,5 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 
+#define FIXUP_GRANULE 128
+
 addr .req x15
 .section .fixup,"ax"
 .align 2
@@ -36,6 +38,13 @@ L(src_fault):
 	mov	x4, x0		// x4: initial target store address
 	add	x5, x1, x2	// x5: srcend
 
+	subs	x3, x15, FIXUP_GRANULE
+	ccmp	x3, x1, #0, pl
+	csel	x3, x3, x1, ge	// x3: initial target (user) load address
+	sub	x4, x3, x1
+	add	x4, x0, x4	// x4: initial target store address
+	add	x5, x1, x2	// x5: srcend
+
 L(src_buf_scan):
 	ldrb2_post	w6, x3, #1
 	strb2_post	w6, x4, #1
@@ -52,6 +61,13 @@ L(dst_fault):
 	mov	x4, x1		// x4: initial target load address
 	add	x5, x0, x2	// x5: dstend
 
+	subs	x3, x15, FIXUP_GRANULE
+	ccmp	x3, x0, #0, pl
+	csel	x3, x3, x0, ge	// x3: initial target (user) store address
+	sub	x4, x3, x0
+	add	x4, x1, x4	// x4: initial target load address
+	add	x5, x0, x2	// x5: dstend
+
 L(dst_buf_scan):
 	ldrb2_post	w6, x4, #1
 	strb2_post	w6, x3, #1
-- 
2.17.1