[RESEND PATCH v2 1/5] lib/sort: Make swap functions more generic

2019-03-19 Thread George Spelvin
Rather than having special-case swap functions for 4- and 8-byte objects,
special-case aligned multiples of 4 or 8 bytes.  This speeds up most
users of sort() by avoiding fallback to the byte copy loop.

Despite what commit ca96ab859ab4 ("lib/sort: Add 64 bit swap function")
claims, very few users of sort() sort pointers (or pointer-sized
objects); most sort structures containing at least two words.
(E.g. drivers/acpi/fan.c:acpi_fan_get_fps() sorts an array of 40-byte
struct acpi_fan_fps.)

The functions also got renamed to reflect the fact that they support
multiple words.  In the great tradition of bikeshedding, the names were
by far the most contentious issue during review of this patch series.

x86-64 code size 872 -> 886 bytes (+14)

Signed-off-by: George Spelvin 
Acked-by: Andrey Abramov 
Feedback-from: Andy Shevchenko 
Feedback-from: Rasmus Villemoes 
Feedback-from: Geert Uytterhoeven 
---
 lib/sort.c | 135 +
 1 file changed, 105 insertions(+), 30 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index d6b7a202b0b6..ec79eac85e21 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -11,35 +11,108 @@
 #include 
 #include 
 
-static int alignment_ok(const void *base, int align)
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required aignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
 {
-   return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
-   ((unsigned long)base & (align - 1)) == 0;
+   unsigned char lsbits = (unsigned char)size;
+
+   (void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+   lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+   return (lsbits & (align - 1)) == 0;
 }
 
-static void u32_swap(void *a, void *b, int size)
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory.  This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is stil valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, int size)
 {
-   u32 t = *(u32 *)a;
-   *(u32 *)a = *(u32 *)b;
-   *(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, int size)
-{
-   u64 t = *(u64 *)a;
-   *(u64 *)a = *(u64 *)b;
-   *(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, int size)
-{
-   char t;
+   size_t n = (unsigned int)size;
 
do {
-   t = *(char *)a;
-   *(char *)a++ = *(char *)b;
-   *(char *)b++ = t;
-   } while (--size > 0);
+   u32 t = *(u32 *)(a + (n -= 4));
+   *(u32 *)(a + n) = *(u32 *)(b + n);
+   *(u32 *)(b + n) = t;
+   } while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory.  This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible.  If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI).  Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, int size)
+{
+   size_t n = (unsigned int)size;
+
+   do {
+#ifdef CONFIG_64BIT
+   u64 t = *(u64 *)(a + (n -= 8));
+   *(u64 *)(a + n) = *(u64 *)(b + n);
+   *(u64 *)(b + n) = t;
+#else
+   /* Use two 32-bit transfers to avoid base+index+4 addressing */
+   u32 t = *(u32 *)(a + (n -= 4));
+   *(u32 *)(a + n) = *(u32 *)(b + n);
+   *(u32 *)(b + n) = t;
+
+   t = *(u32 *)(a + (n -= 4));
+   *(u32 *)(a + n) = *(u32 *)(b + n);
+   *(u32 *)(b + n) = t;
+#endif
+   } while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a, @b: 

[PATCH v2 1/5] lib/sort: Make swap functions more generic

2019-03-15 Thread George Spelvin
Rather than having special-case swap functions for 4- and 8-byte objects,
special-case aligned multiples of 4 or 8 bytes.  This speeds up most
users of sort() by avoiding fallback to the byte copy loop.

Despite what commit ca96ab859ab4 ("lib/sort: Add 64 bit swap function")
claims, very few users of sort() sort pointers (or pointer-sized
objects); most sort structures containing at least two words.
(E.g. drivers/acpi/fan.c:acpi_fan_get_fps() sorts an array of 40-byte
struct acpi_fan_fps.)

The functions also got renamed to reflect the fact that they support
multiple words.  In the great tradition of bikeshedding, the names were
by far the most contentious issue during review of this patch series.

x86-64 code size 872 -> 886 bytes (+14)

Signed-off-by: George Spelvin 
Acked-by: Andrey Abramov 
Feedback-from: Andy Shevchenko 
Feedback-from: Rasmus Villemoes 
Feedback-from: Geert Uytterhoeven 
---
 lib/sort.c | 135 +
 1 file changed, 105 insertions(+), 30 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index d6b7a202b0b6..ec79eac85e21 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -11,35 +11,108 @@
 #include 
 #include 
 
-static int alignment_ok(const void *base, int align)
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required aignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
 {
-   return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
-   ((unsigned long)base & (align - 1)) == 0;
+   unsigned char lsbits = (unsigned char)size;
+
+   (void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+   lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+   return (lsbits & (align - 1)) == 0;
 }
 
-static void u32_swap(void *a, void *b, int size)
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory.  This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is stil valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, int size)
 {
-   u32 t = *(u32 *)a;
-   *(u32 *)a = *(u32 *)b;
-   *(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, int size)
-{
-   u64 t = *(u64 *)a;
-   *(u64 *)a = *(u64 *)b;
-   *(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, int size)
-{
-   char t;
+   size_t n = (unsigned int)size;
 
do {
-   t = *(char *)a;
-   *(char *)a++ = *(char *)b;
-   *(char *)b++ = t;
-   } while (--size > 0);
+   u32 t = *(u32 *)(a + (n -= 4));
+   *(u32 *)(a + n) = *(u32 *)(b + n);
+   *(u32 *)(b + n) = t;
+   } while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory.  This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible.  If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI).  Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, int size)
+{
+   size_t n = (unsigned int)size;
+
+   do {
+#ifdef CONFIG_64BIT
+   u64 t = *(u64 *)(a + (n -= 8));
+   *(u64 *)(a + n) = *(u64 *)(b + n);
+   *(u64 *)(b + n) = t;
+#else
+   /* Use two 32-bit transfers to avoid base+index+4 addressing */
+   u32 t = *(u32 *)(a + (n -= 4));
+   *(u32 *)(a + n) = *(u32 *)(b + n);
+   *(u32 *)(b + n) = t;
+
+   t = *(u32 *)(a + (n -= 4));
+   *(u32 *)(a + n) = *(u32 *)(b + n);
+   *(u32 *)(b + n) = t;
+#endif
+   } while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a, @b: