Small constant-sized flushcache copies currently fall back to
__memcpy_flushcache() unless they are exactly 4, 8, or 16 bytes.

Factor the existing inline movnti sequences into small helpers and
extend the fixed-size fast path to 8-byte multiples from 24 to 96 bytes
for naturally aligned transfers. This keeps common struct-page-sized
copies on the inline path for the upcoming memcpy_streaming() user, while
still falling back to __memcpy_flushcache() for unaligned or uncommon sizes.
Zero-length copies return immediately.

Issue the fixed-size stores in ascending address order so
write-combining sees a forward stream.

Signed-off-by: Li Zhe <[email protected]>
---
 arch/x86/include/asm/string_64.h | 125 ++++++++++++++++++++++++++-----
 1 file changed, 107 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 0b57e9e6f3db..8e6fca0185ee 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -82,24 +82,6 @@ int strcmp(const char *cs, const char *ct);
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
 void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
-static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
-{
-       if (__builtin_constant_p(cnt)) {
-               switch (cnt) {
-                       case 4:
-                               asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src));
-                               return;
-                       case 8:
-                               asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
-                               return;
-                       case 16:
-                               asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
-                               asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
-                               return;
-               }
-       }
-       __memcpy_flushcache(dst, src, cnt);
-}
 
 /*
  * Only reuse memcpy_flushcache() for transfers that can stay entirely
@@ -123,6 +105,113 @@ static __always_inline int memcpy_flushcache_nt_safe(const void *dst,
        return cnt == 4 && !(d & 3) && !(s & 3);
 }
 
+static __always_inline void memcpy_flushcache_4(void *dst, const void *src)
+{      /* Copy one u32 from @src to @dst with a single movntil store. */
+       asm volatile("movntil %1, %0"
+                    : "=m"(*(u32 *)dst)        /* %0: destination, memory operand */
+                    : "r"(*(const u32 *)src)   /* %1: source value, held in a register */
+                    : "memory");       /* asm may access memory beyond its operands */
+}
+
+static __always_inline void memcpy_flushcache_8(void *dst, const void *src)
+{      /* Copy one u64 from @src to @dst with a single movntiq store. */
+       asm volatile("movntiq %1, %0"
+                    : "=m"(*(u64 *)dst)        /* %0: destination, memory operand */
+                    : "r"(*(const u64 *)src)   /* %1: source value, held in a register */
+                    : "memory");       /* asm may access memory beyond its operands */
+}
+
+static __always_inline void memcpy_flushcache_16(void *dst,
+                                                const void *src)
+{      /* Copy 16 bytes as two 8-byte stores, lower address first. */
+       memcpy_flushcache_8(dst, src);          /* bytes 0..7 */
+       memcpy_flushcache_8(dst + 8, src + 8);  /* bytes 8..15; void * arithmetic is a GNU C extension */
+}
+
+static __always_inline void memcpy_flushcache_32(void *dst,
+                                                const void *src)
+{      /* Copy 32 bytes as two 16-byte halves, lower address first. */
+       memcpy_flushcache_16(dst, src);                 /* bytes 0..15 */
+       memcpy_flushcache_16(dst + 16, src + 16);       /* bytes 16..31 */
+}
+
+static __always_inline void memcpy_flushcache_64(void *dst,
+                                                const void *src)
+{      /* Copy 64 bytes as two 32-byte halves, lower address first. */
+       memcpy_flushcache_32(dst, src);                 /* bytes 0..31 */
+       memcpy_flushcache_32(dst + 32, src + 32);       /* bytes 32..63 */
+}
+
+/*
+ * Keep common fixed-size copies on the inline movnti path when they can
+ * stay entirely on aligned non-temporal stores. Issue the stores in
+ * ascending address order so write-combining sees a forward stream.
+ *
+ * Returns 1 when the copy was completed inline and 0 when the caller has
+ * to fall back to another copy routine. Nothing is stored unless
+ * memcpy_flushcache_nt_safe() accepts (dst, src, cnt).
+ *
+ * Sizes 4 and 8 are a single store. Any other size first peels one leading
+ * 8-byte store if bit 3 of cnt is set, and the rest must then be 16, 32,
+ * 48, 64, 80 or 96 bytes.
+ *
+ * NOTE(review): when bit 3 is set but the rest matches no case (e.g. cnt
+ * of 12 or 120), the peeled 8 bytes have already been stored by the time 0
+ * is returned. Presumably memcpy_flushcache_nt_safe() rejects such sizes;
+ * confirm against its full definition. Likewise cnt == 104 (8 + 96) would
+ * be accepted by the second switch if it is allowed to get that far.
+ */
+static __always_inline int memcpy_flushcache_small(void *dst,
+                                                  const void *src,
+                                                  size_t cnt)
+{
+       char *d = dst;          /* byte cursor into the destination */
+       const char *s = src;    /* byte cursor into the source */
+
+       if (!memcpy_flushcache_nt_safe(dst, src, cnt))
+               return 0;       /* rejected before any store is issued */
+
+       switch (cnt) {  /* sizes that are exactly one store */
+       case 4:
+               memcpy_flushcache_4(d, s);
+               return 1;
+       case 8:
+               memcpy_flushcache_8(d, s);
+               return 1;
+       }
+
+       if (cnt & 8) {  /* odd multiple of 8: peel the leading 8 bytes */
+               memcpy_flushcache_8(d, s);
+               d += 8;
+               s += 8;
+               cnt -= 8;       /* remainder now has bit 3 clear */
+       }
+
+       switch (cnt) {  /* remainder, built from 64/32/16-byte pieces in address order */
+       case 16:
+               memcpy_flushcache_16(d, s);
+               return 1;
+       case 32:
+               memcpy_flushcache_32(d, s);
+               return 1;
+       case 48:        /* 32 + 16 */
+               memcpy_flushcache_32(d, s);
+               memcpy_flushcache_16(d + 32, s + 32);
+               return 1;
+       case 64:
+               memcpy_flushcache_64(d, s);
+               return 1;
+       case 80:        /* 64 + 16 */
+               memcpy_flushcache_64(d, s);
+               memcpy_flushcache_16(d + 64, s + 64);
+               return 1;
+       case 96:        /* 64 + 32 */
+               memcpy_flushcache_64(d, s);
+               memcpy_flushcache_32(d + 64, s + 64);
+               return 1;
+       }
+
+       return 0;       /* size not covered here; caller falls back */
+}
+
+static __always_inline void memcpy_flushcache(void *dst, const void *src,
+                                             size_t cnt)
+{      /* Copy cnt bytes from src to dst, using inline stores for some constant sizes. */
+       if (!cnt)
+               return; /* nothing to copy */
+
+       if (__builtin_constant_p(cnt) && memcpy_flushcache_small(dst, src, cnt))
+               return; /* cnt is a compile-time constant and the inline path did the copy */
+
+       __memcpy_flushcache(dst, src, cnt);     /* out-of-line routine for everything else */
+}
+
 #define __HAVE_ARCH_MEMCPY_STREAMING 1
 static __always_inline void memcpy_streaming(void *dst, const void *src,
                                             size_t cnt)
-- 
2.20.1

Reply via email to