From: "Bryan O'Sullivan" <[EMAIL PROTECTED]>

This copy routine is memcpy-compatible, but on some architectures it uses
cache-bypassing loads to avoid pulling the source data into the cache.

One case where this is useful is when a device issues a DMA to a memory
region, and the CPU must copy the DMAed data elsewhere before doing any work
with it.  Since the source data is read-once, write-never from the CPU's
perspective, caching the data at those addresses can only evict potentially
useful data.
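
As a purely illustrative sketch of that pattern (the device structure and
field names below are made up for this example, not part of this patch), a
driver's receive path might use the routine like this:

	#include <linux/types.h>
	#include <linux/string.h>

	/* Hypothetical device state; all names here are illustrative. */
	struct mydev {
		void *staging_buf;	/* destination; read again soon */
		void *dma_rx_buf;	/* DMA target; read once by the CPU */
		size_t rx_len;
	};

	static void mydev_handle_rx(struct mydev *dev)
	{
		/* Copy the DMAed bytes out without caching the source. */
		memcpy_uncached_read(dev->staging_buf, dev->dma_rx_buf,
				     dev->rx_len);
	}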

We provide an x86_64 implementation that uses SSE non-temporal prefetches
(prefetchnta) on the source, and a generic version that falls back to plain
memcpy.
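
For reference, the main loop boils down to the following rough C rendering
(a sketch only, not the code in this patch): GCC's __builtin_prefetch with
locality 0 emits prefetchnta on x86, and the stores stay ordinary cached
stores.

	#include <stddef.h>
	#include <stdint.h>

	/* Sketch: copy 64-byte blocks, hinting the source as non-temporal. */
	static void copy_blocks_nta(uint64_t *d, const uint64_t *s, size_t bytes)
	{
		while (bytes >= 64) {
			/* rw=0 (read), locality=0 -> prefetchnta on x86 */
			__builtin_prefetch((const char *)s + 128, 0, 0);
			d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
			d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
			d += 8;
			s += 8;
			bytes -= 64;
		}
		/* the real routine also handles the 0..63 byte tail */
	}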

Implementations for other arches should not use cache-bypassing stores to the
destination: in most cases the destination is accessed again almost
immediately after a copy finishes, so it should be left in the cache.
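
For contrast, a minimal x86_64 sketch (not part of this patch) of the
store-side pattern to avoid here: a non-temporal store such as movnti
bypasses the cache on the store side, so the consumer's very next read of
the destination would miss.

	/* Anti-pattern for this use case: cache-bypassing stores */
	movq	(%rsi), %rax
	movnti	%rax, (%rdi)	/* destination line is not cached; the
				   copy's consumer will miss on it */
	sfence			/* weakly-ordered stores also need a fence
				   before the data is handed off */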

[EMAIL PROTECTED]: add module export]
[EMAIL PROTECTED]: remove an ARCH_HAS_foo]
Signed-off-by: Bryan O'Sullivan <[EMAIL PROTECTED]>
Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>
Cc: Andi Kleen <[EMAIL PROTECTED]>
Cc: Roland Dreier <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
---

 arch/x86_64/kernel/x8664_ksyms.c       |    2 
 arch/x86_64/lib/Makefile               |    1 
 arch/x86_64/lib/memcpy_uncached_read.S |  142 +++++++++++++++++++++++++++++++++
 include/asm-x86_64/string.h            |    2 
 include/linux/string.h                 |    3 
 5 files changed, 150 insertions(+)

Index: linux/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux.orig/arch/x86_64/kernel/x8664_ksyms.c
+++ linux/arch/x86_64/kernel/x8664_ksyms.c
@@ -8,6 +8,7 @@
 #include <asm/processor.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
+#include <asm/string.h>
 
 EXPORT_SYMBOL(kernel_thread);
 
@@ -54,6 +55,7 @@ extern void * __memcpy(void *,const void
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(memcpy_uncached_read);
 
 EXPORT_SYMBOL(empty_zero_page);
 EXPORT_SYMBOL(init_level4_pgt);
Index: linux/arch/x86_64/lib/Makefile
===================================================================
--- linux.orig/arch/x86_64/lib/Makefile
+++ linux/arch/x86_64/lib/Makefile
@@ -11,3 +11,4 @@ lib-y := csum-partial.o csum-copy.o csum
        usercopy.o getuser.o putuser.o  \
        thunk.o clear_page.o copy_page.o bitstr.o bitops.o
 lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o
+lib-y += memcpy_uncached_read.o
Index: linux/arch/x86_64/lib/memcpy_uncached_read.S
===================================================================
--- /dev/null
+++ linux/arch/x86_64/lib/memcpy_uncached_read.S
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2006 QLogic Corporation.  All Rights Reserved.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * memcpy_uncached_read - memcpy-compatible copy routine, using streaming loads
+ * @dest: destination address
+ * @src: source address (will not be cached)
+ * @count: number of bytes to copy
+ *
+ * Use streaming loads and normal stores for a special-case copy where
+ * we know we won't be reading the source again, but will be reading the
+ * destination again soon.
+ */
+       .text
+       .p2align 4,,15
+       /* rdi  destination, rsi source, rdx count */
+       .globl  memcpy_uncached_read
+       .type   memcpy_uncached_read, @function
+memcpy_uncached_read:
+       movq    %rdi, %rax
+.L5:
+       cmpq    $15, %rdx
+       ja      .L34
+.L3:
+       cmpl    $8, %edx        /* rdx is 0..15 */
+       jbe     .L9
+.L6:
+       testb   $8, %dl         /* rdx is 3,5,6,7,9..15 */
+       je      .L13
+       movq    (%rsi), %rcx
+       addq    $8, %rsi
+       movq    %rcx, (%rdi)
+       addq    $8, %rdi
+.L13:
+       testb   $4, %dl
+       je      .L15
+       movl    (%rsi), %ecx
+       addq    $4, %rsi
+       movl    %ecx, (%rdi)
+       addq    $4, %rdi
+.L15:
+       testb   $2, %dl
+       je      .L17
+       movzwl  (%rsi), %ecx
+       addq    $2, %rsi
+       movw    %cx, (%rdi)
+       addq    $2, %rdi
+.L17:
+       testb   $1, %dl
+       je      .L33
+.L1:
+       movzbl  (%rsi), %ecx
+       movb    %cl, (%rdi)
+.L33:
+       ret
+.L34:
+       cmpq    $63, %rdx       /* rdx is > 15 */
+       ja      .L64
+       movl    $16, %ecx       /* rdx is 16..63 */
+.L25:
+       movq    8(%rsi), %r8
+       movq    (%rsi), %r9
+       addq    %rcx, %rsi
+       movq    %r8, 8(%rdi)
+       movq    %r9, (%rdi)
+       addq    %rcx, %rdi
+       subq    %rcx, %rdx
+       cmpl    %edx, %ecx      /* is rdx >= 16? */
+       jbe     .L25
+       jmp     .L3             /* rdx is 0..15 */
+       .p2align 4,,7
+.L64:
+       movl    $64, %ecx
+.L42:
+       prefetchnta     128(%rsi)
+       movq    (%rsi), %r8
+       movq    8(%rsi), %r9
+       movq    16(%rsi), %r10
+       movq    24(%rsi), %r11
+       subq    %rcx, %rdx
+       movq    %r8, (%rdi)
+       movq    32(%rsi), %r8
+       movq    %r9, 8(%rdi)
+       movq    40(%rsi), %r9
+       movq    %r10, 16(%rdi)
+       movq    48(%rsi), %r10
+       movq    %r11, 24(%rdi)
+       movq    56(%rsi), %r11
+       addq    %rcx, %rsi
+       movq    %r8, 32(%rdi)
+       movq    %r9, 40(%rdi)
+       movq    %r10, 48(%rdi)
+       movq    %r11, 56(%rdi)
+       addq    %rcx, %rdi
+       cmpq    %rdx, %rcx      /* is rdx >= 64? */
+       jbe     .L42
+       sfence
+       orl     %edx, %edx
+       je      .L33
+       jmp     .L5
+.L9:
+       jmp     *.L12(,%rdx,8)  /* rdx is 0..8 */
+       .section        .rodata
+       .align 8
+       .align 4
+.L12:
+       .quad   .L33
+       .quad   .L1
+       .quad   .L2
+       .quad   .L6
+       .quad   .L4
+       .quad   .L6
+       .quad   .L6
+       .quad   .L6
+       .quad   .L8
+       .text
+.L2:
+       movzwl  (%rsi), %ecx
+       movw    %cx, (%rdi)
+       ret
+.L4:
+       movl    (%rsi), %ecx
+       movl    %ecx, (%rdi)
+       ret
+.L8:
+       movq    (%rsi), %rcx
+       movq    %rcx, (%rdi)
+       ret
Index: linux/include/asm-x86_64/string.h
===================================================================
--- linux.orig/include/asm-x86_64/string.h
+++ linux/include/asm-x86_64/string.h
@@ -39,6 +39,8 @@ extern void *__memcpy(void *to, const vo
                 __ret = __builtin_memcpy((dst),(src),__len);   \
           __ret; }) 
 
+extern void *memcpy_uncached_read(void *to, const void *from, size_t len);
+#define memcpy_uncached_read memcpy_uncached_read
 
 #define __HAVE_ARCH_MEMSET
 void *memset(void *s, int c, size_t n);
Index: linux/include/linux/string.h
===================================================================
--- linux.orig/include/linux/string.h
+++ linux/include/linux/string.h
@@ -85,6 +85,9 @@ extern void * memset(void *,int,__kernel
 #ifndef __HAVE_ARCH_MEMCPY
 extern void * memcpy(void *,const void *,__kernel_size_t);
 #endif
+#ifndef memcpy_uncached_read
+#define memcpy_uncached_read(dest, src, count) memcpy((dest), (src), (count))
+#endif
 #ifndef __HAVE_ARCH_MEMMOVE
 extern void * memmove(void *,const void *,__kernel_size_t);
 #endif