If we use __builtin_bswap32() et al. instead of manually coded assembler, the compiler actually knows what's going on and can optimise it for us.
Tested with mpc5200 target, this cuts the size of the PPC kernel by about 10kB (mostly in ext4 which is a heavy user of little-endian data). And shaves a bit off a bunch of modules too. No size win on x86 — the movbe instruction takes as many bytes to encode as the equivalent bswap + mov. But it's allegedly a significant *performance* win on Atom, which is why it was added there in the first place. Signed-off-by: David Woodhouse <[email protected]> --- .../linux/generic/patches-3.3/150-byteswap.patch | 151 ++++++++++++++++++++ .../linux/generic/patches-3.6/150-byteswap.patch | 135 +++++++++++++++++ .../linux/generic/patches-3.7/150-byteswap.patch | 124 ++++++++++++++++ 3 files changed, 410 insertions(+), 0 deletions(-) create mode 100644 target/linux/generic/patches-3.3/150-byteswap.patch create mode 100644 target/linux/generic/patches-3.6/150-byteswap.patch create mode 100644 target/linux/generic/patches-3.7/150-byteswap.patch diff --git a/target/linux/generic/patches-3.3/150-byteswap.patch b/target/linux/generic/patches-3.3/150-byteswap.patch new file mode 100644 index 0000000..5a97d52 --- /dev/null +++ b/target/linux/generic/patches-3.3/150-byteswap.patch @@ -0,0 +1,151 @@ +diff --git a/arch/Kconfig b/arch/Kconfig +index 4f55c73..dc774e0 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -84,6 +84,25 @@ config HAVE_EFFICIENT_UNALIGNED_ACCESS + See Documentation/unaligned-memory-access.txt for more + information on the topic of unaligned memory accesses. + ++config ARCH_USE_BUILTIN_BSWAP ++ bool ++ help ++ Modern versions of GCC (since 4.4) have builtin functions ++ for handling byte-swapping. Using these, instead of the old ++ inline assembler that the architecture code provides in the ++ __arch_bswapXX() macros, allows the compiler to see what's ++ happening and offers more opportunity for optimisation. 
In ++ particular, the compiler will be able to combine the byteswap ++ with a nearby load or store and use load-and-swap or ++ store-and-swap instructions if the architecture has them. It ++ should almost *never* result in code which is worse than the ++ hand-coded assembler in <asm/swab.h>. But just in case it ++ does, the use of the builtins is optional. ++ ++ Any architecture with load-and-swap or store-and-swap ++ instructions should set this. And it shouldn't hurt to set it ++ on architectures that don't have such instructions. ++ + config HAVE_SYSCALL_WRAPPERS + bool + +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index 1919634..f6013bd 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -143,6 +143,7 @@ config PPC + select HAVE_BPF_JIT if (PPC64 && NET) + select HAVE_ARCH_JUMP_LABEL + select ARCH_HAVE_NMI_SAFE_CMPXCHG ++ select ARCH_USE_BUILTIN_BSWAP + + config EARLY_PRINTK + bool +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 5bed94e..e632151 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -82,6 +82,7 @@ config X86 + select CLKEVT_I8253 + select ARCH_HAVE_NMI_SAFE_CMPXCHG + select GENERIC_IOMAP ++ select ARCH_USE_BUILTIN_BSWAP + + config INSTRUCTION_DECODER + def_bool (KPROBES || PERF_EVENTS) +diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu +index 3c57033..71d7e0e 100644 +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -353,6 +353,10 @@ config X86_BSWAP + def_bool y + depends on X86_32 && !M386 + ++config X86_MOVBE ++ def_bool y ++ depends on MATOM && !X86_GENERIC ++ + config X86_POPAD_OK + def_bool y + depends on X86_32 && !M386 +diff --git a/arch/x86/Makefile b/arch/x86/Makefile +index 209ba12..5c32564 100644 +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -57,6 +57,7 @@ else + $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) + cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \ + $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) ++ 
cflags-$(CONFIG_X86_MOVBE) += $(call cc-option,-mmovbe) + cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) + KBUILD_CFLAGS += $(cflags-y) + +diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h +index 2f40791..86b0852 100644 +--- a/include/linux/compiler-gcc4.h ++++ b/include/linux/compiler-gcc4.h +@@ -56,3 +56,13 @@ + #define __compiletime_warning(message) __attribute__((warning(message))) + #define __compiletime_error(message) __attribute__((error(message))) + #endif ++ ++#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP ++#if __GNUC_MINOR__ >= 4 ++#define __HAVE_BUILTIN_BSWAP32__ ++#define __HAVE_BUILTIN_BSWAP64__ ++#endif ++#if __GNUC_MINOR__ >= 8 || (defined(__powerpc__) && __GNUC_MINOR__ >= 6) ++#define __HAVE_BUILTIN_BSWAP16__ ++#endif ++#endif +diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h +index d8e636e..973ce10 100644 +--- a/include/linux/compiler-intel.h ++++ b/include/linux/compiler-intel.h +@@ -29,3 +29,10 @@ + #endif + + #define uninitialized_var(x) x ++ ++#ifndef __HAVE_BUILTIN_BSWAP16__ ++/* icc has this, but it's called _bswap16 */ ++#define __HAVE_BUILTIN_BSWAP16__ ++#define __builtin_bswap16 _bswap16 ++#endif ++ +diff --git a/include/linux/swab.h b/include/linux/swab.h +index ea0c02f..2defbc6 100644 +--- a/include/linux/swab.h ++++ b/include/linux/swab.h +@@ -45,7 +45,9 @@ + + static inline __attribute_const__ __u16 __fswab16(__u16 val) + { +-#ifdef __arch_swab16 ++#ifdef __HAVE_BUILTIN_BSWAP16__ ++ return __builtin_bswap16(val); ++#elif defined (__arch_swab16) + return __arch_swab16(val); + #else + return ___constant_swab16(val); +@@ -54,7 +56,9 @@ static inline __attribute_const__ __u16 __fswab16(__u16 val) + + static inline __attribute_const__ __u32 __fswab32(__u32 val) + { +-#ifdef __arch_swab32 ++#ifdef __HAVE_BUILTIN_BSWAP32__ ++ return __builtin_bswap32(val); ++#elif defined(__arch_swab32) + return __arch_swab32(val); + #else + return ___constant_swab32(val); +@@ -63,7 +67,9 @@ static 
inline __attribute_const__ __u32 __fswab32(__u32 val) + + static inline __attribute_const__ __u64 __fswab64(__u64 val) + { +-#ifdef __arch_swab64 ++#ifdef __HAVE_BUILTIN_BSWAP64__ ++ return __builtin_bswap64(val); ++#elif defined (__arch_swab64) + return __arch_swab64(val); + #elif defined(__SWAB_64_THRU_32__) + __u32 h = val >> 32; diff --git a/target/linux/generic/patches-3.6/150-byteswap.patch b/target/linux/generic/patches-3.6/150-byteswap.patch new file mode 100644 index 0000000..9a6f306 --- /dev/null +++ b/target/linux/generic/patches-3.6/150-byteswap.patch @@ -0,0 +1,135 @@ +--- linux-3.6.10/arch/x86/Kconfig.bswap 2012-12-12 23:32:33.000000000 +0000 ++++ linux-3.6.10/arch/x86/Kconfig 2012-12-12 23:35:33.516959401 +0000 +@@ -97,6 +97,7 @@ config X86 + select KTIME_SCALAR if X86_32 + select GENERIC_STRNCPY_FROM_USER + select GENERIC_STRNLEN_USER ++ select ARCH_USE_BUILTIN_BSWAP + + config INSTRUCTION_DECODER + def_bool (KPROBES || PERF_EVENTS || UPROBES) +--- linux-3.6.10/arch/x86/Kconfig.cpu.bswap 2012-12-10 19:13:27.000000000 +0000 ++++ linux-3.6.10/arch/x86/Kconfig.cpu 2012-12-12 23:34:50.262959728 +0000 +@@ -352,6 +352,10 @@ config X86_BSWAP + def_bool y + depends on X86_32 && !M386 + ++config X86_MOVBE ++ def_bool y ++ depends on MATOM && !X86_GENERIC ++ + config X86_POPAD_OK + def_bool y + depends on X86_32 && !M386 +--- linux-3.6.10/arch/x86/Makefile.bswap 2012-12-10 19:13:27.000000000 +0000 ++++ linux-3.6.10/arch/x86/Makefile 2012-12-12 23:34:50.262959728 +0000 +@@ -64,6 +64,7 @@ else + $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) + cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \ + $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) ++ cflags-$(CONFIG_X86_MOVBE) += $(call cc-option,-mmovbe) + cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) + KBUILD_CFLAGS += $(cflags-y) + +--- linux-3.6.10/arch/Kconfig.bswap 2012-12-10 19:13:27.000000000 +0000 ++++ linux-3.6.10/arch/Kconfig 2012-12-12 
23:34:50.260959794 +0000 +@@ -112,6 +112,25 @@ config HAVE_EFFICIENT_UNALIGNED_ACCESS + See Documentation/unaligned-memory-access.txt for more + information on the topic of unaligned memory accesses. + ++config ARCH_USE_BUILTIN_BSWAP ++ bool ++ help ++ Modern versions of GCC (since 4.4) have builtin functions ++ for handling byte-swapping. Using these, instead of the old ++ inline assembler that the architecture code provides in the ++ __arch_bswapXX() macros, allows the compiler to see what's ++ happening and offers more opportunity for optimisation. In ++ particular, the compiler will be able to combine the byteswap ++ with a nearby load or store and use load-and-swap or ++ store-and-swap instructions if the architecture has them. It ++ should almost *never* result in code which is worse than the ++ hand-coded assembler in <asm/swab.h>. But just in case it ++ does, the use of the builtins is optional. ++ ++ Any architecture with load-and-swap or store-and-swap ++ instructions should set this. And it shouldn't hurt to set it ++ on architectures that don't have such instructions. 
++ + config HAVE_SYSCALL_WRAPPERS + bool + +--- linux-3.6.10/arch/powerpc/Kconfig.bswap 2012-12-10 19:13:27.000000000 +0000 ++++ linux-3.6.10/arch/powerpc/Kconfig 2012-12-12 23:35:24.340958729 +0000 +@@ -139,6 +139,7 @@ config PPC + select GENERIC_CLOCKEVENTS + select GENERIC_STRNCPY_FROM_USER + select GENERIC_STRNLEN_USER ++ select ARCH_USE_BUILTIN_BSWAP + + config EARLY_PRINTK + bool +--- linux-3.6.10/include/linux/compiler-gcc4.h.bswap 2012-12-10 19:13:27.000000000 +0000 ++++ linux-3.6.10/include/linux/compiler-gcc4.h 2012-12-12 23:34:50.262959728 +0000 +@@ -63,3 +63,13 @@ + #define __compiletime_warning(message) __attribute__((warning(message))) + #define __compiletime_error(message) __attribute__((error(message))) + #endif ++ ++#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP ++#if __GNUC_MINOR__ >= 4 ++#define __HAVE_BUILTIN_BSWAP32__ ++#define __HAVE_BUILTIN_BSWAP64__ ++#endif ++#if __GNUC_MINOR__ >= 8 || (defined(__powerpc__) && __GNUC_MINOR__ >= 6) ++#define __HAVE_BUILTIN_BSWAP16__ ++#endif ++#endif +--- linux-3.6.10/include/linux/compiler-intel.h.bswap 2012-12-10 19:13:27.000000000 +0000 ++++ linux-3.6.10/include/linux/compiler-intel.h 2012-12-12 23:34:50.263959688 +0000 +@@ -29,3 +29,10 @@ + #endif + + #define uninitialized_var(x) x ++ ++#ifndef __HAVE_BUILTIN_BSWAP16__ ++/* icc has this, but it's called _bswap16 */ ++#define __HAVE_BUILTIN_BSWAP16__ ++#define __builtin_bswap16 _bswap16 ++#endif ++ +--- linux-3.6.10/include/linux/swab.h.bswap 2012-12-10 19:13:27.000000000 +0000 ++++ linux-3.6.10/include/linux/swab.h 2012-12-12 23:34:57.228971261 +0000 +@@ -45,7 +45,9 @@ + + static inline __attribute_const__ __u16 __fswab16(__u16 val) + { +-#ifdef __arch_swab16 ++#ifdef __HAVE_BUILTIN_BSWAP16__ ++ return __builtin_bswap16(val); ++#elif defined (__arch_swab16) + return __arch_swab16(val); + #else + return ___constant_swab16(val); +@@ -54,7 +56,9 @@ static inline __attribute_const__ __u16 + + static inline __attribute_const__ __u32 __fswab32(__u32 val) + { +-#ifdef 
__arch_swab32 ++#ifdef __HAVE_BUILTIN_BSWAP32__ ++ return __builtin_bswap32(val); ++#elif defined(__arch_swab32) + return __arch_swab32(val); + #else + return ___constant_swab32(val); +@@ -63,7 +67,9 @@ static inline __attribute_const__ __u32 + + static inline __attribute_const__ __u64 __fswab64(__u64 val) + { +-#ifdef __arch_swab64 ++#ifdef __HAVE_BUILTIN_BSWAP64__ ++ return __builtin_bswap64(val); ++#elif defined (__arch_swab64) + return __arch_swab64(val); + #elif defined(__SWAB_64_THRU_32__) + __u32 h = val >> 32; diff --git a/target/linux/generic/patches-3.7/150-byteswap.patch b/target/linux/generic/patches-3.7/150-byteswap.patch new file mode 100644 index 0000000..b5ab601 --- /dev/null +++ b/target/linux/generic/patches-3.7/150-byteswap.patch @@ -0,0 +1,124 @@ +diff --git a/arch/Kconfig b/arch/Kconfig +index 366ec06..c31416b 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -112,6 +112,25 @@ config HAVE_EFFICIENT_UNALIGNED_ACCESS + See Documentation/unaligned-memory-access.txt for more + information on the topic of unaligned memory accesses. + ++config ARCH_USE_BUILTIN_BSWAP ++ bool ++ help ++ Modern versions of GCC (since 4.4) have builtin functions ++ for handling byte-swapping. Using these, instead of the old ++ inline assembler that the architecture code provides in the ++ __arch_bswapXX() macros, allows the compiler to see what's ++ happening and offers more opportunity for optimisation. In ++ particular, the compiler will be able to combine the byteswap ++ with a nearby load or store and use load-and-swap or ++ store-and-swap instructions if the architecture has them. It ++ should almost *never* result in code which is worse than the ++ hand-coded assembler in <asm/swab.h>. But just in case it ++ does, the use of the builtins is optional. ++ ++ Any architecture with load-and-swap or store-and-swap ++ instructions should set this. And it shouldn't hurt to set it ++ on architectures that don't have such instructions. 
++ + config HAVE_SYSCALL_WRAPPERS + bool + +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index a902a5c..02a9698 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -144,6 +144,7 @@ config PPC + select GENERIC_KERNEL_THREAD + select HAVE_MOD_ARCH_SPECIFIC + select MODULES_USE_ELF_RELA ++ select ARCH_USE_BUILTIN_BSWAP + + config EARLY_PRINTK + bool +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 46c3bff..bbf5ff8 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -112,6 +112,7 @@ config X86 + select GENERIC_KERNEL_EXECVE + select MODULES_USE_ELF_REL if X86_32 + select MODULES_USE_ELF_RELA if X86_64 ++ select ARCH_USE_BUILTIN_BSWAP + + config INSTRUCTION_DECODER + def_bool y +diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h +index 412bc6c..dc16a85 100644 +--- a/include/linux/compiler-gcc4.h ++++ b/include/linux/compiler-gcc4.h +@@ -63,3 +63,13 @@ + #define __compiletime_warning(message) __attribute__((warning(message))) + #define __compiletime_error(message) __attribute__((error(message))) + #endif ++ ++#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP ++#if __GNUC_MINOR__ >= 4 ++#define __HAVE_BUILTIN_BSWAP32__ ++#define __HAVE_BUILTIN_BSWAP64__ ++#endif ++#if __GNUC_MINOR__ >= 8 || (defined(__powerpc__) && __GNUC_MINOR__ >= 6) ++#define __HAVE_BUILTIN_BSWAP16__ ++#endif ++#endif +diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h +index d8e636e..973ce10 100644 +--- a/include/linux/compiler-intel.h ++++ b/include/linux/compiler-intel.h +@@ -29,3 +29,10 @@ + #endif + + #define uninitialized_var(x) x ++ ++#ifndef __HAVE_BUILTIN_BSWAP16__ ++/* icc has this, but it's called _bswap16 */ ++#define __HAVE_BUILTIN_BSWAP16__ ++#define __builtin_bswap16 _bswap16 ++#endif ++ +diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h +index e811474..0e011eb 100644 +--- a/include/uapi/linux/swab.h ++++ b/include/uapi/linux/swab.h +@@ -45,7 +45,9 @@ + + static inline __attribute_const__ 
__u16 __fswab16(__u16 val) + { +-#ifdef __arch_swab16 ++#ifdef __HAVE_BUILTIN_BSWAP16__ ++ return __builtin_bswap16(val); ++#elif defined (__arch_swab16) + return __arch_swab16(val); + #else + return ___constant_swab16(val); +@@ -54,7 +56,9 @@ static inline __attribute_const__ __u16 __fswab16(__u16 val) + + static inline __attribute_const__ __u32 __fswab32(__u32 val) + { +-#ifdef __arch_swab32 ++#ifdef __HAVE_BUILTIN_BSWAP32__ ++ return __builtin_bswap32(val); ++#elif defined(__arch_swab32) + return __arch_swab32(val); + #else + return ___constant_swab32(val); +@@ -63,7 +67,9 @@ static inline __attribute_const__ __u32 __fswab32(__u32 val) + + static inline __attribute_const__ __u64 __fswab64(__u64 val) + { +-#ifdef __arch_swab64 ++#ifdef __HAVE_BUILTIN_BSWAP64__ ++ return __builtin_bswap64(val); ++#elif defined (__arch_swab64) + return __arch_swab64(val); + #elif defined(__SWAB_64_THRU_32__) + __u32 h = val >> 32; -- 1.7.7.6 -- dwmw2
smime.p7s
Description: S/MIME cryptographic signature
_______________________________________________ openwrt-devel mailing list [email protected] https://lists.openwrt.org/mailman/listinfo/openwrt-devel
