[tip:x86/asm] x86/asm/bitops: Force inlining of test_and_set_bit and friends

2016-02-09 Thread tip-bot for Denys Vlasenko
Commit-ID:  8dd5032d9c540111dd673078738d137a998d6c3f
Gitweb: http://git.kernel.org/tip/8dd5032d9c540111dd673078738d137a998d6c3f
Author: Denys Vlasenko 
AuthorDate: Sun, 7 Feb 2016 22:51:27 +0100
Committer:  Ingo Molnar 
CommitDate: Tue, 9 Feb 2016 10:31:54 +0100

x86/asm/bitops: Force inlining of test_and_set_bit and friends

Sometimes GCC mysteriously doesn't inline very small functions
that we expect to be inlined; see:

  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

Arguably, GCC should do better, but the GCC developers aren't
willing to invest time into it and ask that __always_inline be
used instead.
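
For illustration, a minimal standalone sketch of the difference (this is
not kernel code; the file and function names are made up). The kernel's
__always_inline boils down to "inline" plus GCC's always_inline
attribute, which takes the inlining decision out of GCC's hands:

  /* inline_demo.c -- build with e.g.:  gcc -Os -c inline_demo.c */
  #include <stdbool.h>

  /* Plain "inline" is only a hint; at -Os GCC may still emit an
     out-of-line copy of this function and call it: */
  static inline bool bit_is_set(unsigned long word, int nr)
  {
          return (word >> nr) & 1UL;
  }

  /* always_inline forces the body into every call site: */
  static inline __attribute__((always_inline))
  bool bit_is_set_forced(unsigned long word, int nr)
  {
          return (word >> nr) & 1UL;
  }

  bool demo(unsigned long w)
  {
          return bit_is_set(w, 3) || bit_is_set_forced(w, 5);
  }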

With this .config:

  http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os

here's an example of functions getting deinlined many times:

  test_and_set_bit (166 copies, ~1260 calls)
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       f0 48 0f ab 3e          lock bts %rdi,(%rsi)
       72 04                   jb
       31 c0                   xor    %eax,%eax
       eb 05                   jmp
       b8 01 00 00 00          mov    $0x1,%eax
       5d                      pop    %rbp
       c3                      retq

  test_and_clear_bit (124 copies, ~1000 calls)
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       f0 48 0f b3 3e          lock btr %rdi,(%rsi)
       72 04                   jb
       31 c0                   xor    %eax,%eax
       eb 05                   jmp
       b8 01 00 00 00          mov    $0x1,%eax
       5d                      pop    %rbp
       c3                      retq

  change_bit (3 copies, 8 calls)
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       f0 48 0f bb 3e          lock btc %rdi,(%rsi)
       5d                      pop    %rbp
       c3                      retq

  clear_bit_unlock (2 copies, 11 calls)
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       f0 48 0f b3 3e          lock btr %rdi,(%rsi)
       5d                      pop    %rbp
       c3                      retq
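
For reference, a standalone sketch of what such a test-and-set helper
looks like in C (illustrative only, not the kernel's exact header;
demo_test_and_set_bit is a made-up name): "lock bts" atomically sets the
bit and leaves its old value in CF, and "sbb %0,%0" turns CF into 0 or
-1. Once the helper is forced inline, the push/mov/pop/retq wrapper seen
above disappears into each caller:

  static inline __attribute__((always_inline))
  int demo_test_and_set_bit(long nr, volatile unsigned long *addr)
  {
          int oldbit;

          asm volatile("lock; bts %2,%1\n\t"
                       "sbb %0,%0"
                       : "=r" (oldbit), "+m" (*addr)
                       : "Ir" (nr)
                       : "memory");
          return oldbit;  /* non-zero iff the bit was already set */
  }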

This patch works around the issue via s/inline/__always_inline/.

Code size decreases by ~13.5k after the patch:

      text     data      bss       dec  filename
  92110727 20826144 36417536 149354407  vmlinux.before
  92097234 20826176 36417536 149340946  vmlinux.after
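
(Text shrinks by 92110727 - 92097234 = 13493 bytes; the total 'dec'
column shrinks by 149354407 - 149340946 = 13461 bytes, while data grows
by 32 bytes.)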

Signed-off-by: Denys Vlasenko 
Cc: Andrew Morton 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: David Rientjes 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Thomas Graf 
Link: http://lkml.kernel.org/r/1454881887-1367-1-git-send-email-dvlas...@redhat.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/bitops.h | 36 ++--
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index cfe3b95..7766d1c 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -91,7 +91,7 @@ set_bit(long nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __set_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
 {
asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
 }
@@ -128,13 +128,13 @@ clear_bit(long nr, volatile unsigned long *addr)
  * clear_bit() is atomic and implies release semantics before the memory
  * operation. It can be used for an unlock.
  */
-static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
+static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
 {
barrier();
clear_bit(nr, addr);
 }
 
-static inline void __clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
 {
asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
 }
@@ -151,7 +151,7 @@ static inline void __clear_bit(long nr, volatile unsigned long *addr)
  * No memory barrier is required here, because x86 cannot reorder stores past
  * older loads. Same principle as spin_unlock.
  */
-static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
+static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
 {
barrier();
__clear_bit(nr, addr);
@@ -166,7 +166,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __change_bit(long nr, volatile unsigned long *addr)
+static __always_inline void __change_bit(long nr, volatile unsigned long *addr)
 {
	asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
 }