Alternatives allow picking the faster code at boot: REP STOSQ, REP STOSB, or a
plain MOVQ loop. Default to REP STOSQ (as memset() does).
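For review purposes only (not part of the patch), the ALTERNATIVE_2 dispatch is
roughly equivalent to the following C sketch; __clear_user_rep_stosq is a
made-up name for the default REP STOSQ body that the real code simply falls
through to:

	/*
	 * Hypothetical C equivalent of the boot-time patched entry point.
	 * ERMS wins over REP_GOOD because the later alternative is applied last.
	 */
	unsigned long __clear_user(void __user *to, unsigned long n)
	{
		if (static_cpu_has(X86_FEATURE_ERMS))
			return __clear_user_rep_stosb(to, n);	/* REP STOSB */
		if (static_cpu_has(X86_FEATURE_REP_GOOD))
			return __clear_user_rep_stosq(to, n);	/* REP STOSQ, the default */
		return __clear_user_movq(to, n);		/* open-coded MOVQ loop */
	}

In the real assembly there is no extra call for the REP STOSQ case, of course.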
Sticking printk() into clear_user() and booting Debian showed that it is
usually called with 4-8 byte alignment and lengths of roughly 2000-4000 bytes.
At this scale the difference should be noticeable.

Also make __clear_user() more, shall we say, "modern": stop storing zeroes
through a zeroed register in the MOVQ fallback (store immediate $0 instead),
and replace INC/DEC with ADD/SUB to avoid flag dependencies.

Signed-off-by: Alexey Dobriyan <adobri...@gmail.com>
---
 arch/x86/lib/Makefile        |    1 
 arch/x86/lib/clear_user_64.S |   93 +++++++++++++++++++++++++++++++++++++++++++
 arch/x86/lib/usercopy_64.c   |   35 ----------------
 3 files changed, 95 insertions(+), 34 deletions(-)

--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
 lib-y := delay.o misc.o cmdline.o
 lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
+lib-$(CONFIG_X86_64) += clear_user_64.o
 lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
--- /dev/null
+++ b/arch/x86/lib/clear_user_64.S
@@ -0,0 +1,93 @@
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+#include <asm/smap.h>
+
+# unsigned long __clear_user(void __user *, unsigned long);
+ENTRY(__clear_user)
+	CFI_STARTPROC
+
+	ALTERNATIVE_2 "jmp __clear_user_movq", \
+		"", X86_FEATURE_REP_GOOD, \
+		"jmp __clear_user_rep_stosb", X86_FEATURE_ERMS
+
+	ASM_STAC
+	xor %eax, %eax
+	mov %rsi, %rcx
+	and $7, %esi
+	shr $3, %rcx
+1:	rep stosq
+	mov %esi, %ecx
+2:	rep stosb
+3:
+	mov %rcx, %rax
+	ASM_CLAC
+	ret
+
+	.section .fixup,"ax"
+4:	lea (%rsi,%rcx,8),%rcx
+	jmp 3b
+	.previous
+
+	_ASM_EXTABLE(1b,4b)
+	_ASM_EXTABLE(2b,3b)
+
+	CFI_ENDPROC
+ENDPROC(__clear_user)
+
+ENTRY(__clear_user_movq)
+	CFI_STARTPROC
+
+	ASM_STAC
+	mov %rsi, %rcx
+	and $7, %esi
+	shr $3, %rcx
+	jz 2f
+	.p2align 4
+1:
+	movq $0, (%rdi)
+	add $8, %rdi
+	sub $1, %rcx
+	jnz 1b
+2:
+	mov %esi, %ecx
+	test %ecx, %ecx
+	jz 4f
+	.p2align 4
+3:
+	movb $0, (%rdi)
+	add $1, %rdi
+	sub $1, %ecx
+	jnz 3b
+4:
+	mov %rcx, %rax
+	ASM_CLAC
+	ret
+
+	.section .fixup,"ax"
+5:	lea (%rsi,%rcx,8),%rcx
+	jmp 4b
+	.previous
+
+	_ASM_EXTABLE(1b,5b)
+	_ASM_EXTABLE(3b,4b)
+
+	CFI_ENDPROC
+ENDPROC(__clear_user_movq)
+
+ENTRY(__clear_user_rep_stosb)
+	CFI_STARTPROC
+
+	ASM_STAC
+	xor %eax, %eax
+	mov %rsi, %rcx
+1:	rep stosb
+2:
+	mov %rcx, %rax
+	ASM_CLAC
+	ret
+
+	_ASM_EXTABLE(1b,2b)
+
+	CFI_ENDPROC
+ENDPROC(__clear_user_rep_stosb)
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -12,40 +12,6 @@
  * Zero Userspace
  */
 
-unsigned long __clear_user(void __user *addr, unsigned long size)
-{
-	long __d0;
-	might_fault();
-	/* no memory constraint because it doesn't change any memory gcc knows
-	   about */
-	stac();
-	asm volatile(
-		"	testq %[size8],%[size8]\n"
-		"	jz 4f\n"
-		"0:	movq %[zero],(%[dst])\n"
-		"	addq %[eight],%[dst]\n"
-		"	decl %%ecx ; jnz 0b\n"
-		"4:	movq %[size1],%%rcx\n"
-		"	testl %%ecx,%%ecx\n"
-		"	jz 2f\n"
-		"1:	movb %b[zero],(%[dst])\n"
-		"	incq %[dst]\n"
-		"	decl %%ecx ; jnz 1b\n"
-		"2:\n"
-		".section .fixup,\"ax\"\n"
-		"3:	lea 0(%[size1],%[size8],8),%[size8]\n"
-		"	jmp 2b\n"
-		".previous\n"
-		_ASM_EXTABLE(0b,3b)
-		_ASM_EXTABLE(1b,2b)
-		: [size8] "=&c"(size), [dst] "=&D" (__d0)
-		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
-		  [zero] "r" (0UL), [eight] "r" (8UL));
-	clac();
-	return size;
-}
-EXPORT_SYMBOL(__clear_user);
-
 unsigned long clear_user(void __user *to, unsigned long n)
 {
 	if (access_ok(VERIFY_WRITE, to, n))
@@ -53,6 +19,7 @@ unsigned long clear_user(void __user *to, unsigned long n)
 	return n;
 }
 EXPORT_SYMBOL(clear_user);
+EXPORT_SYMBOL(__clear_user);
 
 unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
 {
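The alignment/length figures quoted above came from ad-hoc instrumentation
roughly like this (hypothetical sketch, not part of the patch;
printk_ratelimited only keeps the log readable under load):

	/* Hypothetical instrumentation used only to collect the statistics above */
	unsigned long clear_user(void __user *to, unsigned long n)
	{
		printk_ratelimited(KERN_DEBUG "clear_user: align=%lu len=%lu\n",
				   (unsigned long)to & 7, n);
		if (access_ok(VERIFY_WRITE, to, n))
			return __clear_user(to, n);
		return n;
	}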