Allow the optimiser to elimiate the call completely, and use the compiler
builtin by default.  Architectures should only proide arch_ffs() if they think
they can do better than the compiler.

Confirm the expected behaviour with compile time and boot time tests.

For x86, correct the prototype, and simplify the asm() with the statement
given by the Intel architects to Linux about the behaviour on processors newer
than the 486.

For PPC, __builtin_ffs() is 1/3 of the size of size of the transform to
generic_fls().  Drop the definition entirely.

For ARM, simply rename ffs() to arch_ffs().  It appears that the
transformation to __builtin_clz() still makes better code than
__builtin_ffs().

Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
---
CC: Jan Beulich <jbeul...@suse.com>
CC: Roger Pau Monné <roger....@citrix.com>
CC: Wei Liu <w...@xen.org>
CC: Stefano Stabellini <sstabell...@kernel.org>
CC: Julien Grall <jul...@xen.org>
CC: Volodymyr Babchuk <volodymyr_babc...@epam.com>
CC: Bertrand Marquis <bertrand.marq...@arm.com>
CC: Michal Orzel <michal.or...@amd.com>
CC: Oleksii Kurochko <oleksii.kuroc...@gmail.com>
CC: Shawn Anastasio <sanasta...@raptorengineering.com>
CC: consult...@bugseng.com <consult...@bugseng.com>
CC: Simone Ballarin <simone.balla...@bugseng.com>
CC: Federico Serafini <federico.seraf...@bugseng.com>
CC: Nicola Vetrini <nicola.vetr...@bugseng.com>
---
 xen/arch/arm/include/asm/bitops.h |  2 +-
 xen/arch/ppc/include/asm/bitops.h |  1 -
 xen/arch/x86/include/asm/bitops.h | 19 +++++++++++++------
 xen/common/bitops.c               | 10 ++++++++++
 xen/include/xen/bitops.h          | 15 +++++++++++++++
 5 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/xen/arch/arm/include/asm/bitops.h 
b/xen/arch/arm/include/asm/bitops.h
index ab030b6cb032..09c6064274a7 100644
--- a/xen/arch/arm/include/asm/bitops.h
+++ b/xen/arch/arm/include/asm/bitops.h
@@ -157,7 +157,7 @@ static inline int fls(unsigned int x)
 }
 
 
-#define ffs(x) ({ unsigned int __t = (x); fls(ISOLATE_LSB(__t)); })
+#define arch_ffs(x) ({ unsigned int __t = (x); fls(ISOLATE_LSB(__t)); })
 #define ffsl(x) ({ unsigned long __t = (x); flsl(ISOLATE_LSB(__t)); })
 
 /**
diff --git a/xen/arch/ppc/include/asm/bitops.h 
b/xen/arch/ppc/include/asm/bitops.h
index 5820b9ce7bb5..635a3b4e3e33 100644
--- a/xen/arch/ppc/include/asm/bitops.h
+++ b/xen/arch/ppc/include/asm/bitops.h
@@ -173,7 +173,6 @@ static inline int __test_and_clear_bit(int nr, volatile 
void *addr)
 
 #define flsl(x) generic_flsl(x)
 #define fls(x) generic_fls(x)
-#define ffs(x) ({ unsigned int t_ = (x); fls(t_ & -t_); })
 #define ffsl(x) ({ unsigned long t_ = (x); flsl(t_ & -t_); })
 
 /* Based on linux/include/asm-generic/bitops/ffz.h */
diff --git a/xen/arch/x86/include/asm/bitops.h 
b/xen/arch/x86/include/asm/bitops.h
index 5a71afbc89d5..2c5b103cbbd9 100644
--- a/xen/arch/x86/include/asm/bitops.h
+++ b/xen/arch/x86/include/asm/bitops.h
@@ -430,16 +430,23 @@ static inline int ffsl(unsigned long x)
     return (int)r+1;
 }
 
-static inline int ffs(unsigned int x)
+static inline unsigned int arch_ffs(unsigned int x)
 {
-    int r;
+    int r = -1;
+
+    /*
+     * The AMD manual states that BSF won't modify the destination register if
+     * x=0.  The Intel manual states that the result is undefined, but the
+     * architects have said that the register is written back with it's old
+     * value, possibly zero extended above 32 bits.
+     */
+    asm ( "bsf %[val], %[res]"
+          : [res] "+r" (r)
+          : [val] "rm" (x) );
 
-    asm ( "bsf %1,%0\n\t"
-          "jnz 1f\n\t"
-          "mov $-1,%0\n"
-          "1:" : "=r" (r) : "rm" (x));
     return r + 1;
 }
+#define arch_ffs arch_ffs
 
 /**
  * fls - find last bit set
diff --git a/xen/common/bitops.c b/xen/common/bitops.c
index 4c07191b4030..484df68768ad 100644
--- a/xen/common/bitops.c
+++ b/xen/common/bitops.c
@@ -34,8 +34,18 @@
         RUNTIME_CHECK(fn, val, res);            \
     } while ( 0 )
 
+static void test_ffs(void)
+{
+    /* unsigned int ffs(unsigned int) */
+    CHECK(ffs, 0, 0);
+    CHECK(ffs, 1, 1);
+    CHECK(ffs, 0x80000000U, 32);
+}
+
 static int __init cf_check test_bitops(void)
 {
+    test_ffs();
+
     return 0;
 }
 __initcall(test_bitops);
diff --git a/xen/include/xen/bitops.h b/xen/include/xen/bitops.h
index 9b40f20381a2..fb3645d9cf87 100644
--- a/xen/include/xen/bitops.h
+++ b/xen/include/xen/bitops.h
@@ -110,6 +110,21 @@ static inline int generic_flsl(unsigned long x)
 
 #include <asm/bitops.h>
 
+/*
+ * Find First Set bit.  Bits are labelled from 1.
+ */
+static always_inline __pure unsigned int ffs(unsigned int x)
+{
+    if ( __builtin_constant_p(x) )
+        return __builtin_ffs(x);
+
+#ifndef arch_ffs
+#define arch_ffs __builtin_ffs
+#endif
+
+    return arch_ffs(x);
+}
+
 /* --------------------- Please tidy below here --------------------- */
 
 #ifndef find_next_bit
-- 
2.30.2


Reply via email to