The new x86 memcpy_nt() helper in this series maps to
memcpy_flushcache(), and the ZONE_DEVICE fast path uses that primitive
for constant-sized struct page template copies.

memcpy_flushcache() currently inlines only the 4, 8, and 16-byte
cases. Larger constant-sized copies fall back to __memcpy_flushcache()
even when the destination is naturally aligned. Extend the inline
movnti coverage to 32, 48, 64, 80, and 96 bytes so the struct
page-sized copies used by that path can stay on the inline
non-temporal store path instead of dropping into the out-of-line
helper.

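Factor the store sequences into 8/16/32/64-byte helpers, keep the
existing 4/8/16-byte cases handled directly in memcpy_flushcache(),
issue the stores in ascending address order, and leave all other sizes
on __memcpy_flushcache(). The new sizes take the inline path only when
the destination is 8-byte aligned; otherwise they also fall back to
__memcpy_flushcache(). memcpy_flushcache() now also returns early when
cnt is zero.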

Signed-off-by: Li Zhe <[email protected]>
---
 arch/x86/include/asm/string_64.h | 80 +++++++++++++++++++++++++++++++-
 1 file changed, 79 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 6f36abedc56a..95ef2d481418 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -4,6 +4,7 @@
 
 #ifdef __KERNEL__
 #include <linux/jump_label.h>
+#include <linux/align.h>
 
 /* Written 2002 by Andi Kleen */
 
@@ -82,8 +83,81 @@ int strcmp(const char *cs, const char *ct);
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
 void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
-static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+
+static __always_inline void memcpy_flushcache_8(void *dst, const void *src)
+{
+       asm volatile("movntiq %1, %0" /* 8-byte movnti: *src -> *dst */
+                    : "=m"(*(u64 *)dst) /* %0: the 8 bytes at dst, written */
+                    : "r"(*(const u64 *)src) /* %1: value loaded from src */
+                    : "memory"); /* compiler barrier; the old 16-byte asm has none */
+}
+
+static __always_inline void memcpy_flushcache_16(void *dst,
+                                                const void *src)
+{
+       memcpy_flushcache_8(dst, src); /* 16 bytes = 2 x 8, low half first */
+       memcpy_flushcache_8(dst + 8, src + 8); /* void * math is in bytes (GNU C) */
+}
+
+static __always_inline void memcpy_flushcache_32(void *dst,
+                                                const void *src)
+{
+       memcpy_flushcache_16(dst, src); /* 32 bytes = 2 x 16, low half first */
+       memcpy_flushcache_16(dst + 16, src + 16); /* then bytes 16..31 */
+}
+
+static __always_inline void memcpy_flushcache_64(void *dst,
+                                                const void *src)
 {
+       memcpy_flushcache_32(dst, src); /* 64 bytes = 2 x 32, low half first */
+       memcpy_flushcache_32(dst + 32, src + 32); /* then bytes 32..63 */
+}
+
+/*
+ * Inline movnti copy for cnt of 32/48/64/80/96 when dst is 8-byte aligned;
+ * returns 1 if copied here, 0 if the caller must fall back. The 4/8/16-byte
+ * cases stay in memcpy_flushcache() so their code generation stays unchanged.
+ */
+static __always_inline int memcpy_flushcache_large(void *dst,
+                                                  const void *src,
+                                                  size_t cnt)
+{
+       char *dptr = dst; /* byte pointers for the offset math below */
+       const char *sptr = src;
+
+       if (!IS_ALIGNED((unsigned long)dst, 8)) /* only dst is checked, not src */
+               return 0; /* caller falls back to __memcpy_flushcache() */
+
+       switch (cnt) {
+       case 32:
+               memcpy_flushcache_32(dptr, sptr);
+               return 1;
+       case 48: /* 32 + 16, ascending addresses */
+               memcpy_flushcache_32(dptr, sptr);
+               memcpy_flushcache_16(dptr + 32, sptr + 32);
+               return 1;
+       case 64:
+               memcpy_flushcache_64(dptr, sptr);
+               return 1;
+       case 80: /* 64 + 16, ascending addresses */
+               memcpy_flushcache_64(dptr, sptr);
+               memcpy_flushcache_16(dptr + 64, sptr + 64);
+               return 1;
+       case 96: /* 64 + 32, ascending addresses */
+               memcpy_flushcache_64(dptr, sptr);
+               memcpy_flushcache_32(dptr + 64, sptr + 64);
+               return 1;
+       }
+
+       return 0; /* any other size is not handled inline */
+}
+
+static __always_inline void memcpy_flushcache(void *dst, const void *src,
+                                             size_t cnt)
+{
+       if (!cnt)
+               return;
+
        if (__builtin_constant_p(cnt)) {
                switch (cnt) {
                        case 4:
@@ -97,7 +171,11 @@ static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t
                                asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
                                return;
                }
+
+               if (memcpy_flushcache_large(dst, src, cnt))
+                       return;
        }
+
        __memcpy_flushcache(dst, src, cnt);
 }
 
-- 
2.20.1

Reply via email to