Re: [PATCH v4 1/3] lib: find_*_bit reimplementation

2015-02-18 Thread Rasmus Villemoes
On Tue, Feb 17 2015, Yury Norov  wrote:

> The new implementation takes less space in the sources
> (see diffstat) and in the object. For me it's 710 vs 453
> bytes of text. It also shows a better performance.
>
> find_last_bit description fixed due to obvious typo.
>
> In this patch 2 macros were introduced: {LOW,HIGH}_BITS_MASK,
> that are doing almost the same as BITMAP_{FIRST,LAST}_WORD_MASK
> in include/linux/bitmap.h. But 'LAST' macro is potentially less
> effective, because it issues a conditional branch that can be
> omitted.  If it is replaced one day by a more effective
> implementation, {LOW,HIGH}_BITS_MASK can be removed.
>

I think it's better to use the existing macros and then improve them
instead of duplicating the functionality. I'll submit a patch for that
later tonight (that should then make it to 3.21 [or whatever 3.19+2 will
be called] together with this series). More on this issue below.

>
> Signed-off-by: Yury Norov 
> ---
>  include/linux/bitops.h |   4 +-
>  lib/find_last_bit.c|  37 +++
>  lib/find_next_bit.c| 269 
> ++---
>  3 files changed, 94 insertions(+), 216 deletions(-)
>
> diff --git a/include/linux/bitops.h b/include/linux/bitops.h
> index 5d858e0..297f5bd 100644
> --- a/include/linux/bitops.h
> +++ b/include/linux/bitops.h
> @@ -218,9 +218,9 @@ static inline unsigned long __ffs64(u64 word)
>  /**
>   * find_last_bit - find the last set bit in a memory region
>   * @addr: The address to start the search at
> - * @size: The maximum size to search
> + * @size: The number of bits to search
>   *
> - * Returns the bit number of the first set bit, or size.
> + * Returns the bit number of the last set bit, or size.
>   */
>  extern unsigned long find_last_bit(const unsigned long *addr,
>  unsigned long size);
> diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
> index 91ca09f..edbb281 100644
> --- a/lib/find_last_bit.c
> +++ b/lib/find_last_bit.c
> @@ -4,6 +4,9 @@
>   * Written by Rusty Russell 
>   * (Inspired by David Howell's find_next_bit implementation)
>   *
> + * Rewritten by Yury Norov  to decrease
> + * size and improve performance, 2015.
> + *
>   * This program is free software; you can redistribute it and/or
>   * modify it under the terms of the GNU General Public License
>   * as published by the Free Software Foundation; either version
> @@ -12,36 +15,26 @@
>  
>  #include 
>  #include 
> -#include 
> -#include 
> +#include 
> +
> +#define LOW_BITS_MASK(nr) (~0UL >> -(nr))

This is technically wrong, and may not even work on architectures that
are not as forgiving as x86: Whatever type and value nr has, -(nr) is
almost guaranteed not to be a number between 0 and BITS_PER_LONG-1. And
even on x86, gcc doesn't generate as good code as it could:

 163:   49 c7 c0 ff ff ff ffmov$0x,%r8
 16a:   83 e1 3fand$0x3f,%ecx
 16d:   f7 d9   neg%ecx
 16f:   48 c1 ea 06 shr$0x6,%rdx
 173:   49 d3 e8shr%cl,%r8

It doesn't realize that pre-masking %ecx with 0x3f is redundant when we
then negate it and use it as a shift amount.

So a better definition of the macro is

#define BITMAP_LAST_WORD_MASK(nr) (~0UL >> (-(nr) & (BITS_PER_LONG-1)))

and then callers shouldn't do the modulo. On x86, gcc knows that the &
is redundant. I use & instead of % so that nr may also have signed type
(otherwise we're again in UB land, since -(nr) % BITS_PER_LONG is then,
by the broken C standard, a negative number).


>  #include 
>  #include 
> -#include 
> -#include 
> +#include 
>  
> -#define BITOP_WORD(nr)   ((nr) / BITS_PER_LONG)
> +#define HIGH_BITS_MASK(nr)   (~0UL << (nr))
> +
> +#if !defined(find_next_bit) || !defined(find_next_zero_bit)
>  
> -#ifndef find_next_bit
>  /*
> - * Find the next set bit in a memory region.
> + * This is a common helper function for find_next_bit and
> + * find_next_zero_bit.  The difference is the "invert" argument, which
> + * is XORed with each fetched word before searching it for one bits.
>   */
> -unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
> - unsigned long offset)
> +static unsigned long _find_next_bit(const unsigned long *addr,
> + unsigned long nbits, unsigned long start, unsigned long invert)
>  {
> - const unsigned long *p = addr + BITOP_WORD(offset);
> - unsigned long result = offset & ~(BITS_PER_LONG-1);
>   unsigned long tmp;
>  
> - if (offset >= size)
> - return size;
> - size -= result;
> - offset %= BITS_PER_LONG;
> - if (offset) {
> - tmp = *(p++);
> - tmp &= (~0UL << offset);
> - if (size < BITS_PER_LONG)
> - goto found_first;
> - if (tmp)
> - goto found_middle;
> - size -= BITS_PER_LONG;
> - result += BITS_PER_LONG;
> 

Re: [PATCH v4 1/3] lib: find_*_bit reimplementation

2015-02-18 Thread Rasmus Villemoes
On Tue, Feb 17 2015, Yury Norov yury.no...@gmail.com wrote:

 The new implementation takes less space in the sources
 (see diffstat) and in the object. For me it's 710 vs 453
 bytes of text. It also shows a better performance.

 find_last_bit description fixed due to obvious typo.

 In this patch 2 macros were introduced: {LOW,HIGH}_BITS_MASK,
 that are doing almost the same as BITMAP_{FIRST,LAST}_WORD_MASK
 in include/linux/bitmap.h. But 'LAST' macro is potentially less
 effective, because it issues a conditional branch that can be
 omitted.  If it is replaced one day by a more effective
 implementation, {LOW,HIGH}_BITS_MASK can be removed.


I think it's better to use the existing macros and then improve them
instead of duplicating the functionality. I'll submit a patch for that
later tonight (that should then make it to 3.21 [or whatever 3.19+2 will
be called] together with this series). More on this issue below.


 Signed-off-by: Yury Norov yury.no...@gmail.com
 ---
  include/linux/bitops.h |   4 +-
  lib/find_last_bit.c|  37 +++
  lib/find_next_bit.c| 269 
 ++---
  3 files changed, 94 insertions(+), 216 deletions(-)

 diff --git a/include/linux/bitops.h b/include/linux/bitops.h
 index 5d858e0..297f5bd 100644
 --- a/include/linux/bitops.h
 +++ b/include/linux/bitops.h
 @@ -218,9 +218,9 @@ static inline unsigned long __ffs64(u64 word)
  /**
   * find_last_bit - find the last set bit in a memory region
   * @addr: The address to start the search at
 - * @size: The maximum size to search
 + * @size: The number of bits to search
   *
 - * Returns the bit number of the first set bit, or size.
 + * Returns the bit number of the last set bit, or size.
   */
  extern unsigned long find_last_bit(const unsigned long *addr,
  unsigned long size);
 diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
 index 91ca09f..edbb281 100644
 --- a/lib/find_last_bit.c
 +++ b/lib/find_last_bit.c
 @@ -4,6 +4,9 @@
   * Written by Rusty Russell ru...@rustcorp.com.au
   * (Inspired by David Howell's find_next_bit implementation)
   *
 + * Rewritten by Yury Norov yury.no...@gmail.com to decrease
 + * size and improve performance, 2015.
 + *
   * This program is free software; you can redistribute it and/or
   * modify it under the terms of the GNU General Public License
   * as published by the Free Software Foundation; either version
 @@ -12,36 +15,26 @@
  
  #include linux/bitops.h
  #include linux/export.h
 -#include asm/types.h
 -#include asm/byteorder.h
 +#include linux/kernel.h
 +
 +#define LOW_BITS_MASK(nr) (~0UL  -(nr))

This is technically wrong, and may not even work on architectures that
are not as forgiving as x86: Whatever type and value nr has, -(nr) is
almost guaranteed not to be a number between 0 and BITS_PER_LONG-1. And
even on x86, gcc doesn't generate as good code as it could:

 163:   49 c7 c0 ff ff ff ffmov$0x,%r8
 16a:   83 e1 3fand$0x3f,%ecx
 16d:   f7 d9   neg%ecx
 16f:   48 c1 ea 06 shr$0x6,%rdx
 173:   49 d3 e8shr%cl,%r8

It doesn't realize that pre-masking %ecx with 0x3f is redundant when we
then negate it and use it as a shift amount.

So a better definition of the macro is

#define BITMAP_LAST_WORD_MASK(nr) (~0UL  (-(nr)  (BITS_PER_LONG-1)))

and then callers shouldn't do the modulo. On x86, gcc knows that the 
is redundant. I use  instead of % so that nr may also have signed type
(otherwise we're again in UB land, since -(nr) % BITS_PER_LONG is then,
by the broken C standard, a negative number).


  #include linux/bitops.h
  #include linux/export.h
 -#include asm/types.h
 -#include asm/byteorder.h
 +#include linux/kernel.h
  
 -#define BITOP_WORD(nr)   ((nr) / BITS_PER_LONG)
 +#define HIGH_BITS_MASK(nr)   (~0UL  (nr))
 +
 +#if !defined(find_next_bit) || !defined(find_next_zero_bit)
  
 -#ifndef find_next_bit
  /*
 - * Find the next set bit in a memory region.
 + * This is a common helper function for find_next_bit and
 + * find_next_zero_bit.  The difference is the invert argument, which
 + * is XORed with each fetched word before searching it for one bits.
   */
 -unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
 - unsigned long offset)
 +static unsigned long _find_next_bit(const unsigned long *addr,
 + unsigned long nbits, unsigned long start, unsigned long invert)
  {
 - const unsigned long *p = addr + BITOP_WORD(offset);
 - unsigned long result = offset  ~(BITS_PER_LONG-1);
   unsigned long tmp;
  
 - if (offset = size)
 - return size;
 - size -= result;
 - offset %= BITS_PER_LONG;
 - if (offset) {
 - tmp = *(p++);
 - tmp = (~0UL  offset);
 - if (size  BITS_PER_LONG)
 - goto found_first;
 - if (tmp)
 - 

[PATCH v4 1/3] lib: find_*_bit reimplementation

2015-02-16 Thread Yury Norov
The new implementation takes less space in the sources
(see diffstat) and in the object. For me it's 710 vs 453
bytes of text. It also shows a better performance.

find_last_bit description fixed due to obvious typo.

In this patch 2 macros were introduced: {LOW,HIGH}_BITS_MASK,
that are doing almost the same as BITMAP_{FIRST,LAST}_WORD_MASK
in include/linux/bitmap.h. But 'LAST' macro is potentially less
effective, because it issues a conditional branch that can be
omitted.  If it is replaced one day by a more effective
implementation, {LOW,HIGH}_BITS_MASK can be removed.

Signed-off-by: Yury Norov 
---
 include/linux/bitops.h |   4 +-
 lib/find_last_bit.c|  37 +++
 lib/find_next_bit.c| 269 ++---
 3 files changed, 94 insertions(+), 216 deletions(-)

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 5d858e0..297f5bd 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -218,9 +218,9 @@ static inline unsigned long __ffs64(u64 word)
 /**
  * find_last_bit - find the last set bit in a memory region
  * @addr: The address to start the search at
- * @size: The maximum size to search
+ * @size: The number of bits to search
  *
- * Returns the bit number of the first set bit, or size.
+ * Returns the bit number of the last set bit, or size.
  */
 extern unsigned long find_last_bit(const unsigned long *addr,
   unsigned long size);
diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
index 91ca09f..edbb281 100644
--- a/lib/find_last_bit.c
+++ b/lib/find_last_bit.c
@@ -4,6 +4,9 @@
  * Written by Rusty Russell 
  * (Inspired by David Howell's find_next_bit implementation)
  *
+ * Rewritten by Yury Norov  to decrease
+ * size and improve performance, 2015.
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
@@ -12,36 +15,26 @@
 
 #include 
 #include 
-#include 
-#include 
+#include 
+
+#define LOW_BITS_MASK(nr) (~0UL >> -(nr))
 
 #ifndef find_last_bit
 
 unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
 {
-   unsigned long words;
-   unsigned long tmp;
+   if (size) {
+   unsigned long val = LOW_BITS_MASK(size % BITS_PER_LONG);
+   unsigned long idx = (size-1) / BITS_PER_LONG;
 
-   /* Start at final word. */
-   words = size / BITS_PER_LONG;
+   do {
+   val &= addr[idx];
+   if (val)
+   return idx * BITS_PER_LONG + __fls(val);
 
-   /* Partial final word? */
-   if (size & (BITS_PER_LONG-1)) {
-   tmp = (addr[words] & (~0UL >> (BITS_PER_LONG
-- (size & (BITS_PER_LONG-1);
-   if (tmp)
-   goto found;
+   val = ~0ul;
+   } while (idx--);
}
-
-   while (words) {
-   tmp = addr[--words];
-   if (tmp) {
-found:
-   return words * BITS_PER_LONG + __fls(tmp);
-   }
-   }
-
-   /* Not found */
return size;
 }
 EXPORT_SYMBOL(find_last_bit);
diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
index 0cbfc0b..1f5f108 100644
--- a/lib/find_next_bit.c
+++ b/lib/find_next_bit.c
@@ -3,6 +3,9 @@
  * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowe...@redhat.com)
  *
+ * Rewritten by Yury Norov  to decrease
+ * size and improve performance, 2015.
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
@@ -11,98 +14,60 @@
 
 #include 
 #include 
-#include 
-#include 
+#include 
 
-#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+#define HIGH_BITS_MASK(nr) (~0UL << (nr))
+
+#if !defined(find_next_bit) || !defined(find_next_zero_bit)
 
-#ifndef find_next_bit
 /*
- * Find the next set bit in a memory region.
+ * This is a common helper function for find_next_bit and
+ * find_next_zero_bit.  The difference is the "invert" argument, which
+ * is XORed with each fetched word before searching it for one bits.
  */
-unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
-   unsigned long offset)
+static unsigned long _find_next_bit(const unsigned long *addr,
+   unsigned long nbits, unsigned long start, unsigned long invert)
 {
-   const unsigned long *p = addr + BITOP_WORD(offset);
-   unsigned long result = offset & ~(BITS_PER_LONG-1);
unsigned long tmp;
 
-   if (offset >= size)
-   return size;
-   size -= result;
-   offset %= BITS_PER_LONG;
-   if (offset) {
-   tmp = *(p++);
-   tmp &= (~0UL 

[PATCH v4 1/3] lib: find_*_bit reimplementation

2015-02-16 Thread Yury Norov
The new implementation takes less space in the sources
(see diffstat) and in the object. For me it's 710 vs 453
bytes of text. It also shows a better performance.

find_last_bit description fixed due to obvious typo.

In this patch 2 macros were introduced: {LOW,HIGH}_BITS_MASK,
that are doing almost the same as BITMAP_{FIRST,LAST}_WORD_MASK
in include/linux/bitmap.h. But 'LAST' macro is potentially less
effective, because it issues a conditional branch that can be
omitted.  If it is replaced one day by a more effective
implementation, {LOW,HIGH}_BITS_MASK can be removed.

Signed-off-by: Yury Norov yury.no...@gmail.com
---
 include/linux/bitops.h |   4 +-
 lib/find_last_bit.c|  37 +++
 lib/find_next_bit.c| 269 ++---
 3 files changed, 94 insertions(+), 216 deletions(-)

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 5d858e0..297f5bd 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -218,9 +218,9 @@ static inline unsigned long __ffs64(u64 word)
 /**
  * find_last_bit - find the last set bit in a memory region
  * @addr: The address to start the search at
- * @size: The maximum size to search
+ * @size: The number of bits to search
  *
- * Returns the bit number of the first set bit, or size.
+ * Returns the bit number of the last set bit, or size.
  */
 extern unsigned long find_last_bit(const unsigned long *addr,
   unsigned long size);
diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
index 91ca09f..edbb281 100644
--- a/lib/find_last_bit.c
+++ b/lib/find_last_bit.c
@@ -4,6 +4,9 @@
  * Written by Rusty Russell ru...@rustcorp.com.au
  * (Inspired by David Howell's find_next_bit implementation)
  *
+ * Rewritten by Yury Norov yury.no...@gmail.com to decrease
+ * size and improve performance, 2015.
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
@@ -12,36 +15,26 @@
 
 #include linux/bitops.h
 #include linux/export.h
-#include asm/types.h
-#include asm/byteorder.h
+#include linux/kernel.h
+
+#define LOW_BITS_MASK(nr) (~0UL  -(nr))
 
 #ifndef find_last_bit
 
 unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
 {
-   unsigned long words;
-   unsigned long tmp;
+   if (size) {
+   unsigned long val = LOW_BITS_MASK(size % BITS_PER_LONG);
+   unsigned long idx = (size-1) / BITS_PER_LONG;
 
-   /* Start at final word. */
-   words = size / BITS_PER_LONG;
+   do {
+   val = addr[idx];
+   if (val)
+   return idx * BITS_PER_LONG + __fls(val);
 
-   /* Partial final word? */
-   if (size  (BITS_PER_LONG-1)) {
-   tmp = (addr[words]  (~0UL  (BITS_PER_LONG
-- (size  (BITS_PER_LONG-1);
-   if (tmp)
-   goto found;
+   val = ~0ul;
+   } while (idx--);
}
-
-   while (words) {
-   tmp = addr[--words];
-   if (tmp) {
-found:
-   return words * BITS_PER_LONG + __fls(tmp);
-   }
-   }
-
-   /* Not found */
return size;
 }
 EXPORT_SYMBOL(find_last_bit);
diff --git a/lib/find_next_bit.c b/lib/find_next_bit.c
index 0cbfc0b..1f5f108 100644
--- a/lib/find_next_bit.c
+++ b/lib/find_next_bit.c
@@ -3,6 +3,9 @@
  * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowe...@redhat.com)
  *
+ * Rewritten by Yury Norov yury.no...@gmail.com to decrease
+ * size and improve performance, 2015.
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
@@ -11,98 +14,60 @@
 
 #include linux/bitops.h
 #include linux/export.h
-#include asm/types.h
-#include asm/byteorder.h
+#include linux/kernel.h
 
-#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+#define HIGH_BITS_MASK(nr) (~0UL  (nr))
+
+#if !defined(find_next_bit) || !defined(find_next_zero_bit)
 
-#ifndef find_next_bit
 /*
- * Find the next set bit in a memory region.
+ * This is a common helper function for find_next_bit and
+ * find_next_zero_bit.  The difference is the invert argument, which
+ * is XORed with each fetched word before searching it for one bits.
  */
-unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
-   unsigned long offset)
+static unsigned long _find_next_bit(const unsigned long *addr,
+   unsigned long nbits, unsigned long start, unsigned long invert)
 {
-   const unsigned long *p = addr + BITOP_WORD(offset);
-   unsigned long result = offset  ~(BITS_PER_LONG-1);
unsigned long