Use CC_SET(z)/CC_OUT(z) instead of an explicit setz instruction.
Using these two defines, a compiler that supports condition-code
outputs from inline assembly flags generates a single conditional jump
instruction, e.g.:

        [lea    (%rdi),%rsi
         callq  this_cpu_cmpxchg16b_emu]
        -- or --
        [cmpxchg16b %gs:(%rdi)]
        jne    199764 <kmem_cache_alloc+0x44>

instead of

        [lea    (%rdi),%rsi
         callq  this_cpu_cmpxchg16b_emu]
        -- or --
        [cmpxchg16b %gs:(%rdi)]
        sete   %cl
        test   %cl,%cl
        je     19ae04 <kmem_cache_alloc+0x44>

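For reference, a minimal sketch of how CC_SET()/CC_OUT() from
<asm/asm.h> are used (the helper below is illustrative only; it is not
part of this patch or of the tree).  With compiler flag-output support,
CC_OUT(z) expands to "=@ccz" and CC_SET(z) to a comment; without it,
they expand to a "=qm" output operand and a setz into that operand:

        #include <linux/types.h>
        #include <asm/asm.h>

        /* Illustrative helper, not taken from the kernel sources. */
        static inline bool zf_cmpxchg8(u8 *ptr, u8 old, u8 new)
        {
                bool ok;

                asm volatile ("cmpxchgb %[new], %[ptr]"
                              CC_SET(z)
                              : CC_OUT(z) (ok), [ptr] "+m" (*ptr), "+a" (old)
                              : [new] "q" (new)
                              : "memory");
                return ok;
        }
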
The complication with percpu_cmpxchg16b_double is that its definition
defaults to a call to the this_cpu_cmpxchg16b_emu library function,
which (depending on the X86_FEATURE_CX16 flag) is later patched with
the real cmpxchg16b instruction.  To resolve this, the patch changes
the this_cpu_cmpxchg16b_emu library function to return its result in
the ZF flag of the %rflags register instead of in the %al register, so
both alternatives report the result the same way.  Note also that a
popf instruction (which restores the flags register to its previously
saved state) can no longer be used, because it would clobber the ZF
result; the patched function instead re-enables interrupts with sti
followed by a nop, where the nop ends the one-instruction interrupt
inhibition ("interrupt shadow") of sti before the ret.
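
A minimal illustration of that epilogue, expressed as inline asm (this
helper is illustrative only and not part of the patch): sti keeps
interrupts inhibited until the instruction following it has completed,
and neither sti nor nop modifies ZF, so the result computed before the
epilogue survives:

        /* Illustrative only: ZF-preserving "enable interrupts" epilogue. */
        static inline void sti_end_shadow(void)
        {
                asm volatile ("sti\n\t"
                              "nop"     /* the sti interrupt shadow ends here */
                              : : : "memory");
        }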

The patch also introduces the alternative_io_tail definition.  This
definition takes a tail instruction that is common to all alternatives
and emits it only once, in the main instruction stream, after whichever
alternative gets selected.  Using it makes it possible to drop the setz
from the cmpxchg16b alternative stored in the .altinstr_replacement
section, saving a few bytes in the binary.
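
A hypothetical example of the new macro (X86_FEATURE_XYZ and the helper
below are placeholders, not real kernel names): both variants leave
their result in the carry flag, so the CC_SET(c) tail is shared instead
of being duplicated in the .altinstr_replacement copy:

        /* Placeholder example, not from the tree. */
        static inline bool flag_probe(unsigned long *p)
        {
                bool ret;

                alternative_io_tail("btl $0, %[val]",   /* default variant */
                                    "btl $1, %[val]",   /* patched-in variant */
                                    X86_FEATURE_XYZ,    /* placeholder feature bit */
                                    CC_SET(c),          /* common tail, emitted once */
                                    CC_OUT(c) (ret),
                                    [val] "m" (*p));
                return ret;
        }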

Signed-off-by: Uros Bizjak <ubiz...@gmail.com>
Cc: x...@kernel.org
---
 arch/x86/include/asm/alternative.h |  6 ++++++
 arch/x86/include/asm/percpu.h      | 14 ++++++++------
 arch/x86/lib/cmpxchg16b_emu.S      | 16 ++++++----------
 3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 0660e14690c8..49b29990950a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -205,6 +205,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
        asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)          \
                : output : "i" (0), ## input)
 
+/* Like alternative_io, but with a common tail instruction. */
+#define alternative_io_tail(oldinstr, newinstr, feature, tail,         \
+                           output, input...)                           \
+       asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) tail     \
+               : output : "i" (0), ## input)
+
 /* Like alternative_io, but for replacing a direct call with another one. */
 #define alternative_call(oldfunc, newfunc, feature, output, input...)  \
        asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 1a19d11cfbbd..9cf2e78eb4b3 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -495,12 +495,14 @@ do {                                                    \
        bool __ret;                                                     \
        typeof(pcp1) __o1 = (o1), __n1 = (n1);                          \
        typeof(pcp2) __o2 = (o2), __n2 = (n2);                          \
-       alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \
-                      "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t", \
-                      X86_FEATURE_CX16,                                \
-                      ASM_OUTPUT2("=a" (__ret), "+m" (pcp1),           \
-                                  "+m" (pcp2), "+d" (__o2)),           \
-                      "b" (__n1), "c" (__n2), "a" (__o1) : "rsi");     \
+       alternative_io_tail("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu", \
+                           "cmpxchg16b "__percpu_arg(1),               \
+                           X86_FEATURE_CX16,                           \
+                           CC_SET(z),                                  \
+                           ASM_OUTPUT2(CC_OUT(z) (__ret),              \
+                                       "+m" (pcp1), "+m" (pcp2),       \
+                                       "+a" (__o1), "+d" (__o2)),      \
+                           "b" (__n1), "c" (__n2) : "rsi");            \
        __ret;                                                          \
 })
 
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
index 9b330242e740..d8c1ae48e0d9 100644
--- a/arch/x86/lib/cmpxchg16b_emu.S
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -17,20 +17,20 @@
  * %rdx : high 64 bits of old value
  * %rbx : low 64 bits of new value
  * %rcx : high 64 bits of new value
- * %al  : Operation successful
+ *
+ * Outputs:
+ * %rflags.zf: set if the destination operand and %rdx:%rax are equal
  */
 ENTRY(this_cpu_cmpxchg16b_emu)
 
 #
-# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
-# via the ZF.  Caller will access %al to get result.
+# Emulate 'cmpxchg16b %gs:(%rsi)'
 #
 # Note that this is only useful for a cpuops operation.  Meaning that we
 # do *not* have a fully atomic operation but just an operation that is
 # *atomic* on a single cpu (as provided by the this_cpu_xx class of
 # macros).
 #
-       pushfq
        cli
 
        cmpq PER_CPU_VAR((%rsi)), %rax
@@ -41,13 +41,9 @@ ENTRY(this_cpu_cmpxchg16b_emu)
        movq %rbx, PER_CPU_VAR((%rsi))
        movq %rcx, PER_CPU_VAR(8(%rsi))
 
-       popfq
-       mov $1, %al
-       ret
-
 .Lnot_same:
-       popfq
-       xor %al,%al
+       sti
+       nop
        ret
 
 ENDPROC(this_cpu_cmpxchg16b_emu)
-- 
2.20.1
