When the copy length is less than 128 bytes but at least 64 bytes, load
and store the 64-byte block with immediate offsets and advance src/dst
with a single add each afterwards, instead of post-incrementing on every
access. This improves performance.
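
For illustration only (not part of the patch), a minimal sketch of the
two addressing styles using plain AArch64 ldp/stp rather than the
template's ldp1/ldp2 macros; register numbers are arbitrary and not
taken from the patch:

	/* before: post-index addressing, writeback on every access */
	ldp	x6, x7, [x1], #16	// load 16 bytes, then x1 += 16
	stp	x6, x7, [x0], #16	// store 16 bytes, then x0 += 16

	/* after: immediate-offset addressing, pointers advanced once */
	ldp	x6, x7, [x1, #0]	// load 16 bytes from x1 + 0
	stp	x6, x7, [x0, #0]	// store 16 bytes to  x0 + 0
	ldp	x6, x7, [x1, #16]	// load 16 bytes from x1 + 16
	stp	x6, x7, [x0, #16]	// store 16 bytes to  x0 + 16
	add	x1, x1, #64		// advance src once for the whole block
	add	x0, x0, #64		// advance dst once for the whole block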

Cost of copying 127 bytes on Kunpeng920 (ms):
Without this patch:
memcpy: 14.62 copy_from_user: 14.23 copy_to_user: 14.42

With this patch:
memcpy: 13.85 copy_from_user: 13.26 copy_to_user: 13.84

This is about a 5.27% improvement in memcpy().

Signed-off-by: Yang Yingliang <yangyingli...@huawei.com>
---
 arch/arm64/lib/copy_template.S | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index c3cd6f84c9c0..a9cbd47473f0 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -132,14 +132,16 @@ D_h       .req    x14
        * Less than 128 bytes to copy, so handle 64 here and then jump
        * to the tail.
        */
-       ldp1    A_l, A_h, src, #16
-       stp1    A_l, A_h, dst, #16
-       ldp1    B_l, B_h, src, #16
-       ldp1    C_l, C_h, src, #16
-       stp1    B_l, B_h, dst, #16
-       stp1    C_l, C_h, dst, #16
-       ldp1    D_l, D_h, src, #16
-       stp1    D_l, D_h, dst, #16
+       ldp2    A_l, A_h, src, #0,  #8
+       stp2    A_l, A_h, dst, #0,  #8
+       ldp2    B_l, B_h, src, #16, #24
+       ldp2    C_l, C_h, src, #32, #40
+       stp2    B_l, B_h, dst, #16, #24
+       stp2    C_l, C_h, dst, #32, #40
+       ldp2    D_l, D_h, src, #48, #56
+       stp2    D_l, D_h, dst, #48, #56
+       add     src, src, #64
+       add     dst, dst, #64
 
        tst     count, #0x3f
        b.ne    .Ltail63
-- 
2.25.1
