[PATCH] x86-64: Add __copy_from_user_nocache

Linux Kernel Mailing List Thu, 15 Feb 2007 00:59:11 -0800

Gitweb:     http://git.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=0812a579c92fefa57506821fa08e90f47cb6dbdd
Commit:     0812a579c92fefa57506821fa08e90f47cb6dbdd
Parent:     ee55c0be30429d7c3e61fa26c7f7e323c80e14f0
Author:     Andi Kleen <[EMAIL PROTECTED]>
AuthorDate: Tue Feb 13 13:26:19 2007 +0100
Committer:  Andi Kleen <[EMAIL PROTECTED]>
CommitDate: Tue Feb 13 13:26:19 2007 +0100


    [PATCH] x86-64: Add __copy_from_user_nocache
    
    This does user copies in fs write() into the page cache with write combining.
    This pushes the destination out of the CPU's cache, but allows higher bandwidth
    in some cases.
    
    The theory is that the page cache data is usually not touched by the
    CPU again and it's better to not pollute the cache with it. Also it is a little
    faster.
    
    Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>
---
 arch/x86_64/kernel/x8664_ksyms.c    |    1 +
 arch/x86_64/lib/Makefile            |    2 +-
 arch/x86_64/lib/copy_user_nocache.S |  217 +++++++++++++++++++++++++++++++++++
 include/asm-x86_64/uaccess.h        |   14 +++
 4 files changed, 233 insertions(+), 1 deletions(-)

diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 6d77e47..23a7da3 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4);
 EXPORT_SYMBOL(__put_user_8);
 
 EXPORT_SYMBOL(copy_user_generic);
+EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(copy_from_user);
 EXPORT_SYMBOL(copy_to_user);
 EXPORT_SYMBOL(__copy_from_user_inatomic);
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
index b78d417..8d5f835 100644
--- a/arch/x86_64/lib/Makefile
+++ b/arch/x86_64/lib/Makefile
@@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o
 lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
        usercopy.o getuser.o putuser.o  \
        thunk.o clear_page.o copy_page.o bitstr.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o
diff --git a/arch/x86_64/lib/copy_user_nocache.S b/arch/x86_64/lib/copy_user_nocache.S
new file mode 100644
index 0000000..4620efb
--- /dev/null
+++ b/arch/x86_64/lib/copy_user_nocache.S
@@ -0,0 +1,217 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#define FIX_ALIGNMENT 1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+/*
+ * __copy_user_nocache - memory copy with exception handling whose 8-byte
+ * stores use movnti (non-temporal hint), to keep the destination out of cache.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count in bytes
+ * rcx zero flag       when nonzero, zero the rest of destination on exception
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(__copy_user_nocache)
+       CFI_STARTPROC
+       pushq %rbx              /* rbx is used as scratch (ebx/bl) below */
+       CFI_ADJUST_CFA_OFFSET 8
+       CFI_REL_OFFSET rbx, 0
+       pushq %rcx              /* save zero flag; read back as (%rsp) in .Lzero_rest */
+       CFI_ADJUST_CFA_OFFSET 8
+       CFI_REL_OFFSET rcx, 0
+
+       xorl %eax,%eax          /* zero for the exception handler; also the success return value */
+
+#ifdef FIX_ALIGNMENT
+       /* check for bad alignment of destination */
+       movl %edi,%ecx
+       andl $7,%ecx            /* ecx = destination offset within an 8-byte word */
+       jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+       movq %rdx,%rcx          /* rcx = byte count, kept for the tail code */
+
+       movl $64,%ebx           /* block size, consumed by the .Ls1e fixup chain */
+       shrq $6,%rdx            /* rdx = number of whole 64-byte blocks */
+       decq %rdx
+       js   .Lhandle_tail      /* fewer than 64 bytes: skip the block loop */
+
+       .p2align 4
+.Lloop:                        /* one iteration copies 64 bytes as two groups of four quadwords */
+.Ls1:  movq (%rsi),%r11        /* every .LsN load and .LdN store has an __ex_table entry below */
+.Ls2:  movq 1*8(%rsi),%r8
+.Ls3:  movq 2*8(%rsi),%r9
+.Ls4:  movq 3*8(%rsi),%r10
+.Ld1:  movnti %r11,(%rdi)
+.Ld2:  movnti %r8,1*8(%rdi)
+.Ld3:  movnti %r9,2*8(%rdi)
+.Ld4:  movnti %r10,3*8(%rdi)
+
+.Ls5:  movq 4*8(%rsi),%r11
+.Ls6:  movq 5*8(%rsi),%r8
+.Ls7:  movq 6*8(%rsi),%r9
+.Ls8:  movq 7*8(%rsi),%r10
+.Ld5:  movnti %r11,4*8(%rdi)
+.Ld6:  movnti %r8,5*8(%rdi)
+.Ld7:  movnti %r9,6*8(%rdi)
+.Ld8:  movnti %r10,7*8(%rdi)
+
+       dec  %rdx               /* sets SF for the jns below; the leaq's do not modify flags */
+
+       leaq 64(%rsi),%rsi
+       leaq 64(%rdi),%rdi
+
+       jns  .Lloop
+
+       .p2align 4
+.Lhandle_tail:
+       movl %ecx,%edx          /* edx = byte count, kept for .Lhandle_7 and the fixups */
+       andl $63,%ecx
+       shrl $3,%ecx            /* ecx = whole quadwords left after the 64-byte blocks */
+       jz   .Lhandle_7
+       movl $8,%ebx            /* NOTE(review): no reader of this value is visible in this routine */
+       .p2align 4
+.Lloop_8:
+.Ls9:  movq (%rsi),%r8
+.Ld9:  movnti %r8,(%rdi)
+       decl %ecx               /* flags from this decl drive the jnz; the leaq's leave them intact */
+       leaq 8(%rdi),%rdi
+       leaq 8(%rsi),%rsi
+       jnz .Lloop_8
+
+.Lhandle_7:
+       movl %edx,%ecx
+       andl $7,%ecx            /* ecx = trailing bytes (count mod 8) */
+       jz   .Lende
+       .p2align 4
+.Lloop_1:
+.Ls10: movb (%rsi),%bl
+.Ld10: movb %bl,(%rdi)         /* plain byte store, not movnti */
+       incq %rdi
+       incq %rsi
+       decl %ecx
+       jnz .Lloop_1
+
+       CFI_REMEMBER_STATE
+.Lende:
+       popq %rcx
+       CFI_ADJUST_CFA_OFFSET -8
+       CFI_RESTORE %rcx
+       popq %rbx
+       CFI_ADJUST_CFA_OFFSET -8
+       CFI_RESTORE rbx
+       ret                     /* NOTE(review): no sfence is issued after the movnti stores -- confirm callers do not need one */
+       CFI_RESTORE_STATE
+
+#ifdef FIX_ALIGNMENT
+       /* align destination */
+       .p2align 4
+.Lbad_alignment:
+       movl $8,%r9d
+       subl %ecx,%r9d          /* r9d = bytes needed to reach 8-byte alignment */
+       movl %r9d,%ecx
+       cmpq %r9,%rdx
+       jz   .Lhandle_7         /* count == r9: the byte loop alone finishes the copy */
+       js   .Lhandle_7         /* count - r9 negative: same */
+.Lalign_1:
+.Ls11: movb (%rsi),%bl
+.Ld11: movb %bl,(%rdi)
+       incq %rsi
+       incq %rdi
+       decl %ecx
+       jnz .Lalign_1
+       subq %r9,%rdx           /* count -= bytes just copied */
+       jmp .Lafter_bad_alignment
+#endif
+
+       /* table sorted by exception address */
+       .section __ex_table,"a"
+       .align 8
+       .quad .Ls1,.Ls1e        /* each pair: faulting instruction, fixup target */
+       .quad .Ls2,.Ls2e
+       .quad .Ls3,.Ls3e
+       .quad .Ls4,.Ls4e
+       .quad .Ld1,.Ls1e        /* store .LdN shares the fixup of load .LsN */
+       .quad .Ld2,.Ls2e
+       .quad .Ld3,.Ls3e
+       .quad .Ld4,.Ls4e
+       .quad .Ls5,.Ls5e
+       .quad .Ls6,.Ls6e
+       .quad .Ls7,.Ls7e
+       .quad .Ls8,.Ls8e
+       .quad .Ld5,.Ls5e
+       .quad .Ld6,.Ls6e
+       .quad .Ld7,.Ls7e
+       .quad .Ld8,.Ls8e
+       .quad .Ls9,.Le_quad
+       .quad .Ld9,.Le_quad
+       .quad .Ls10,.Le_byte
+       .quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+       .quad .Ls11,.Lzero_rest
+       .quad .Ld11,.Lzero_rest
+#endif
+       .quad .Le5,.Le_zero     /* fault while zeroing: just return */
+       .previous
+
+       /* compute 64-offset for main loop. 8 bytes accuracy with error on the
+          pessimistic side. this is gross. it would be better to fix the
+          interface. */
+       /* eax: zero, ebx: 64 */
+.Ls1e:         addl $8,%eax     /* entries fall through: entering at .LsNe yields eax = 8*(9-N) */
+.Ls2e:         addl $8,%eax
+.Ls3e:         addl $8,%eax
+.Ls4e:         addl $8,%eax
+.Ls5e:         addl $8,%eax
+.Ls6e:         addl $8,%eax
+.Ls7e:         addl $8,%eax
+.Ls8e:         addl $8,%eax
+       addq %rbx,%rdi  /* +64 */
+       subq %rax,%rdi  /* correct destination with computed offset */
+
+       shlq $6,%rdx    /* loop counter * 64 (stride length) */
+       addq %rax,%rdx  /* add offset to loopcnt */
+       andl $63,%ecx   /* remaining bytes */
+       addq %rcx,%rdx  /* add them */
+       jmp .Lzero_rest
+
+       /* exception on quad word loop in tail handling */
+       /* ecx: loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+       shll $3,%ecx            /* quadwords left -> bytes */
+       andl $7,%edx            /* trailing bytes */
+       addl %ecx,%edx
+       /* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+       cmpl $0,(%rsp)  /* zero flag set? */
+       jz   .Le_zero
+       movq %rdx,%rcx
+.Le_byte:                      /* .Ls10/.Ld10 fixups enter here, past the zero-flag test, with ecx = bytes left in .Lloop_1 */
+       xorl %eax,%eax
+.Le5:  rep
+       stosb                   /* with rep: store al (0) at (%rdi), rcx times */
+       /* when there is another exception while zeroing the rest just return */
+.Le_zero:
+       movq %rdx,%rax          /* return value; NOTE(review): on the .Le_byte path rdx was never reduced by the bytes already copied -- confirm this overestimate is intended */
+       jmp .Lende
+       CFI_ENDPROC
+ENDPROC(__copy_user_nocache)
+
+
diff --git a/include/asm-x86_64/uaccess.h b/include/asm-x86_64/uaccess.h
index 8079e29..1981f70 100644
--- a/include/asm-x86_64/uaccess.h
+++ b/include/asm-x86_64/uaccess.h
@@ -367,4 +367,18 @@ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
        return copy_user_generic((__force void *)dst, src, size);
 }
 
+#define ARCH_HAS_NOCACHE_UACCESS 1
+extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); /* asm routine in copy_user_nocache.S; returns uncopied bytes, 0 on success */
+
+static inline int __copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
+{
+       might_sleep();  /* defined elsewhere; presumably flags use from atomic context -- confirm */
+       return __copy_user_nocache(dst, (__force void *)src, size, 1);  /* zerorest=1: zero the uncopied rest of dst on a fault */
+}
+
+static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size)
+{
+       return __copy_user_nocache(dst, (__force void *)src, size, 0);  /* zerorest=0: on a fault the uncopied rest of dst is not zeroed */
+}
+
 #endif /* __X86_64_UACCESS_H */
-
To unsubscribe from this list: send the line "unsubscribe git-commits-head" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] x86-64: Add __copy_from_user_nocache

Reply via email to