Re: [PATCH] treewide: remove duplicate includes

2017-12-04 Thread Darrick J. Wong
On Mon, Dec 04, 2017 at 03:19:39AM +0530, Pravin Shedge wrote:
> These duplicate includes have been found with scripts/checkincludes.pl but
> they have been removed manually to avoid removing false positives.
> 
> Unit Testing:
> 
> - build successful
> - LTP testsuite passes.
> - checkpatch.pl passes
> 
> Signed-off-by: Pravin Shedge 



> diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
> index 9c42c4e..ab3aef2 100644
> --- a/fs/xfs/scrub/scrub.c
> +++ b/fs/xfs/scrub/scrub.c

These look reasonable, but please send me (and linux-xfs) the three
xfs changes separately so that I can add them to the xfs tree.

(Also, thank you for cc'ing the xfs list for this treewide change...)

--D


Re: crypto: Work around deallocated stack frame reference gcc bug on sparc.

2017-06-02 Thread Darrick J. Wong
[add ext4 list to cc]

On Fri, Jun 02, 2017 at 11:28:54AM -0400, David Miller wrote:
> 
> On sparc, if we have an alloca() like situation, as is the case with
> SHASH_DESC_ON_STACK(), we can end up referencing deallocated stack
> memory.  The result can be that the value is clobbered if a trap
> or interrupt arrives at just the right instruction.
> 
> It only occurs if the function ends up returning a value from that
> alloca() area and that value can be placed into the return value
> register using a single instruction.
> 
> For example, in lib/libcrc32c.c:crc32c() we end up with a return
> sequence like:
> 
> return  %i7+8
>  lduw   [%o5+16], %o0   ! MEM[(u32 *)__shash_desc.1_10 + 16B],
> 
> %o5 holds the base of the on-stack area allocated for the shash
> descriptor.  But the return released the stack frame and the
> register window.
> 
> So if an interrupt arrives between 'return' and 'lduw', then
> the value read at %o5+16 can be corrupted.
> 
> Add a data compiler barrier to work around this problem.  This is
> exactly what the gcc fix will end up doing as well, and it absolutely
> should not change the code generated for other cpus (unless gcc
> on them has the same bug :-)
> 
> With crucial insight from Eric Sandeen.
> 
> Reported-by: Anatoly Pugachev 
> Signed-off-by: David S. Miller 
> ---
> 
> See the thread anchored at:
> 
>   http://marc.info/?l=linux-sparc&m=149623182616944&w=2
> 
> for discussion, it has a reproducer module.  The problem was
> first noticed as occasional XFS checksum corruptions.
> 
> Herbert, I don't expect you to like this but it is the best we can do
> I think.  It should not pessimize code on other architectures at all.
> I will work on fixing the gcc bug but it's been around forever and all
> versions are affected.
> 
> I noticed while working on this that at least btrfs duplicates the
> facilities provided by lib/libcrc32c.c and therefore should probably
> be converted over to straight crc32c() calls if possible.

ext4/jbd2's crc32c implementations will also need a fix like this for
{ext4,jbd2}_chksum.  Note that both of these modules call the crypto api
directly to avoid a static dependence on libcrc32c; this was done to
reduce kernel footprint for applications that don't need it.  (ext2,
ext3, and ext4 before the metadata_csum feature existed).

--D

> 
> Thanks!
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
> index ecdba2f..1ac5b85 100644
> --- a/drivers/infiniband/sw/rxe/rxe.h
> +++ b/drivers/infiniband/sw/rxe/rxe.h
> @@ -68,6 +68,7 @@
>  static inline u32 rxe_crc32(struct rxe_dev *rxe,
>   u32 crc, void *next, size_t len)
>  {
> + u32 retval;
>   int err;
>  
>   SHASH_DESC_ON_STACK(shash, rxe->tfm);
> @@ -81,7 +82,9 @@ static inline u32 rxe_crc32(struct rxe_dev *rxe,
>   return crc32_le(crc, next, len);
>   }
>  
> - return *(u32 *)shash_desc_ctx(shash);
> + retval = *(u32 *)shash_desc_ctx(shash);
> + barrier_data(shash_desc_ctx(shash));
> + return retval;
>  }
>  
>  int rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
> diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
> index a97fdc1..baacc18 100644
> --- a/fs/btrfs/hash.c
> +++ b/fs/btrfs/hash.c
> @@ -38,6 +38,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int 
> length)
>  {
>   SHASH_DESC_ON_STACK(shash, tfm);
>   u32 *ctx = (u32 *)shash_desc_ctx(shash);
> + u32 retval;
>   int err;
>  
>   shash->tfm = tfm;
> @@ -47,5 +48,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int 
> length)
>   err = crypto_shash_update(shash, address, length);
>   BUG_ON(err);
>  
> - return *ctx;
> + retval = *ctx;
> + barrier_data(ctx);
> + return retval;
>  }
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 2185c7a..fd2e651 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -1078,6 +1078,7 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, 
> const void *address,
>  {
>   SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver);
>   u32 *ctx = (u32 *)shash_desc_ctx(shash);
> + u32 retval;
>   int err;
>  
>   shash->tfm = sbi->s_chksum_driver;
> @@ -1087,7 +1088,9 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, 
> const void *address,
>   err = crypto_shash_update(shash, address, length);
>   BUG_ON(err);
>  
> - return *ctx;
> + retval = *ctx;
> + barrier_data(ctx);
> + return retval;
>  }
>  
>  static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc,
> diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c
> index 74a54b7..9f79547 100644
> --- a/lib/libcrc32c.c
> +++ b/lib/libcrc32c.c
> @@ -43,7 +43,7 @@ static struct crypto_shash *tfm;
>  u32 crc32c(u32 crc, const void *address, unsigned int length)
>  {
>   SHASH_DESC_ON_STACK(shash, tfm);
> - u32 *ctx = (u32 *)shash_desc_ctx(shash);
> + u32 ret, *ctx 

[PATCH 01/13] crc32: removed two instances of trailing whitespaces

2012-01-18 Thread Darrick J. Wong
- remove trailing whitespace from lib/crc32.c
- remove trailing whitespace from lib/crc32defs.h

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |2 +-
 lib/crc32defs.h |2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 4b35d2b..ffea0c9 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -317,7 +317,7 @@ EXPORT_SYMBOL(crc32_be);
  * in the correct multiple to subtract, we can shift a byte at a time.
  * This produces a 40-bit (rather than a 33-bit) intermediate remainder,
  * but again the multiple of the polynomial to subtract depends only on
- * the high bits, the high 8 bits in this case.  
+ * the high bits, the high 8 bits in this case.
  *
  * The multiple we need in that case is the low 32 bits of a 40-bit
  * value whose high 8 bits are given, and which is a multiple of the
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index 9b6773d..f5a5401 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -8,7 +8,7 @@
 
 /* How many bits at a time to use.  Requires a table of 4CRC_xx_BITS bytes. 
*/
 /* For less performance-sensitive, use 4 */
-#ifndef CRC_LE_BITS 
+#ifndef CRC_LE_BITS
 # define CRC_LE_BITS 8
 #endif
 #ifndef CRC_BE_BITS

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/13] crc32: Simplify unit test code

2012-01-18 Thread Darrick J. Wong
Replaced the unit test provided in crc32.c, which doesn't have a
makefile and doesn't compile with current headers, with a simpler
self test routine that also gives a measure of performance and
runs at module init time. The self test option can be enabled
through a configuration option CONFIG_CRC32_SELFTEST.

The test stresses the pre and post loops and is thus not very
realistic since actual uses will likely have addresses and lengths
that are at least 4 byte aligned. However, the main loop is long
enough so that the performance is dominated by that loop.

The expected values for crc32_le and crc32_be were generated
with the original version of crc32.c using CRC_LE_BITS = 8 and
CRC_BE_BITS = 8. These values were then used to check all the
values of the BITS parameters in both the original and new versions.

The performance results show some variability from run to run
in spite of attempts to both warm the cache and reduce the amount
of OS noise by limiting interrupts during the test. To get comparable
results and to analyse options wrt performance the best time
reported over a small sample of runs has been taken.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/Kconfig |   10 +
 lib/crc32.c |  798 ++-
 2 files changed, 691 insertions(+), 117 deletions(-)


diff --git a/lib/Kconfig b/lib/Kconfig
index 201e1b3..4656dff 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -67,6 +67,16 @@ config CRC32
  kernel tree does. Such modules that use library CRC32 functions
  require M here.
 
+config CRC32_SELFTEST
+   bool CRC32 perform self test on init
+   default n
+   depends on CRC32
+   help
+ This option enables the CRC32 library functions to perform a
+ self test on initialization. The self test computes crc32_le
+ and crc32_be over byte strings with random alignment and length
+ and computes the total elapsed time and number of bytes processed.
+
 config CRC7
tristate CRC7 functions
help
diff --git a/lib/crc32.c b/lib/crc32.c
index c3ce94a..996115d 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -211,137 +211,701 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, 
size_t len)
 EXPORT_SYMBOL(crc32_le);
 EXPORT_SYMBOL(crc32_be);
 
-#ifdef UNITTEST
+#ifdef CONFIG_CRC32_SELFTEST
 
-#include stdlib.h
-#include stdio.h
-
-#if 0  /*Not used at present */
-static void
-buf_dump(char const *prefix, unsigned char const *buf, size_t len)
+/* 4096 random bytes */
+static u8 __attribute__((__aligned__(8))) test_buf[] =
 {
-   fputs(prefix, stdout);
-   while (len--)
-   printf( %02x, *buf++);
-   putchar('\n');
-
-}
-#endif
-
-static void bytereverse(unsigned char *buf, size_t len)
+   0x5b, 0x85, 0x21, 0xcb, 0x09, 0x68, 0x7d, 0x30,
+   0xc7, 0x69, 0xd7, 0x30, 0x92, 0xde, 0x59, 0xe4,
+   0xc9, 0x6e, 0x8b, 0xdb, 0x98, 0x6b, 0xaa, 0x60,
+   0xa8, 0xb5, 0xbc, 0x6c, 0xa9, 0xb1, 0x5b, 0x2c,
+   0xea, 0xb4, 0x92, 0x6a, 0x3f, 0x79, 0x91, 0xe4,
+   0xe9, 0x70, 0x51, 0x8c, 0x7f, 0x95, 0x6f, 0x1a,
+   0x56, 0xa1, 0x5c, 0x27, 0x03, 0x67, 0x9f, 0x3a,
+   0xe2, 0x31, 0x11, 0x29, 0x6b, 0x98, 0xfc, 0xc4,
+   0x53, 0x24, 0xc5, 0x8b, 0xce, 0x47, 0xb2, 0xb9,
+   0x32, 0xcb, 0xc1, 0xd0, 0x03, 0x57, 0x4e, 0xd4,
+   0xe9, 0x3c, 0xa1, 0x63, 0xcf, 0x12, 0x0e, 0xca,
+   0xe1, 0x13, 0xd1, 0x93, 0xa6, 0x88, 0x5c, 0x61,
+   0x5b, 0xbb, 0xf0, 0x19, 0x46, 0xb4, 0xcf, 0x9e,
+   0xb6, 0x6b, 0x4c, 0x3a, 0xcf, 0x60, 0xf9, 0x7a,
+   0x8d, 0x07, 0x63, 0xdb, 0x40, 0xe9, 0x0b, 0x6f,
+   0xad, 0x97, 0xf1, 0xed, 0xd0, 0x1e, 0x26, 0xfd,
+   0xbf, 0xb7, 0xc8, 0x04, 0x94, 0xf8, 0x8b, 0x8c,
+   0xf1, 0xab, 0x7a, 0xd4, 0xdd, 0xf3, 0xe8, 0x88,
+   0xc3, 0xed, 0x17, 0x8a, 0x9b, 0x40, 0x0d, 0x53,
+   0x62, 0x12, 0x03, 0x5f, 0x1b, 0x35, 0x32, 0x1f,
+   0xb4, 0x7b, 0x93, 0x78, 0x0d, 0xdb, 0xce, 0xa4,
+   0xc0, 0x47, 0xd5, 0xbf, 0x68, 0xe8, 0x5d, 0x74,
+   0x8f, 0x8e, 0x75, 0x1c, 0xb2, 0x4f, 0x9a, 0x60,
+   0xd1, 0xbe, 0x10, 0xf4, 0x5c, 0xa1, 0x53, 0x09,
+   0xa5, 0xe0, 0x09, 0x54, 0x85, 0x5c, 0xdc, 0x07,
+   0xe7, 0x21, 0x69, 0x7b, 0x8a, 0xfd, 0x90, 0xf1,
+   0x22, 0xd0, 0xb4, 0x36, 0x28, 0xe6, 0xb8, 0x0f,
+   0x39, 0xde, 0xc8, 0xf3, 0x86, 0x60, 0x34, 0xd2,
+   0x5e, 0xdf, 0xfd, 0xcf, 0x0f, 0xa9, 0x65, 0xf0,
+   0xd5, 0x4d, 0x96, 0x40, 0xe3, 0xdf, 0x3f, 0x95,
+   0x5a, 0x39, 0x19, 0x93, 0xf4, 0x75, 0xce, 0x22,
+   0x00, 0x1c, 0x93, 0xe2, 0x03, 0x66, 0xf4, 0x93,
+   0x73, 0x86, 0x81, 0x8e, 0x29, 0x44, 0x48, 0x86,
+   0x61, 0x7c, 0x48, 0xa3, 0x43, 0xd2, 0x9c, 0x8d,
+   0xd4, 0x95, 0xdd, 0xe1, 0x22, 0x89, 0x3a, 0x40,
+   0x4c, 0x1b, 0x8a, 0x04, 0xa8, 0x09, 0x69, 0x8b,
+   0xea, 0xc6, 0x55

[PATCH 04/13] crc32: Miscellaneous cleanups

2012-01-18 Thread Darrick J. Wong
Misc cleanup of lib/crc32.c and related files
- removed unnecessary header files.
- straightened out some convoluted ifdef's
- rewrote some references to 2 dimensional arrays as 1 dimensional
  arrays to make them correct. I.e. replaced tab[i] with tab[0][i].
- a few trivial whitespace changes
- fixed a warning in gen_crc32tables.c caused by a mismatch in the
  type of the pointer passed to output table. Since the table is
  only used at kernel compile time, it is simpler to make the table
  big enough to hold the largest column size used. One cannot make the
  column size smaller in output_table because it has to be used by
  both the le and be tables and they can have different column sizes.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c  |  104 +-
 lib/gen_crc32table.c |6 +--
 2 files changed, 39 insertions(+), 71 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 996115d..bf03922 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -23,13 +23,10 @@
 /* see: Documentation/crc32.txt for a description of algorithms */
 
 #include linux/crc32.h
-#include linux/kernel.h
 #include linux/module.h
-#include linux/compiler.h
 #include linux/types.h
-#include linux/init.h
-#include linux/atomic.h
 #include crc32defs.h
+
 #if CRC_LE_BITS == 8
 # define tole(x) __constant_cpu_to_le32(x)
 #else
@@ -41,6 +38,7 @@
 #else
 # define tobe(x) (x)
 #endif
+
 #include crc32table.h
 
 MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com);
@@ -96,6 +94,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
 #undef DO_CRC4
 }
 #endif
+
 /**
  * crc32_le() - Calculate bitwise little-endian Ethernet AUTODIN II CRC32
  * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for
@@ -103,53 +102,39 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len);
-
-#if CRC_LE_BITS == 1
-/*
- * In fact, the table-based code will work in this case, but it can be
- * simplified by inlining the table in ?: form.
- */
-
 u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
 {
+#if CRC_LE_BITS == 1
int i;
while (len--) {
crc ^= *p++;
for (i = 0; i  8; i++)
crc = (crc  1) ^ ((crc  1) ? CRCPOLY_LE : 0);
}
-   return crc;
-}
-#else  /* Table-based approach */
-
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
-{
-# if CRC_LE_BITS == 8
-   const u32  (*tab)[] = crc32table_le;
-
-   crc = __cpu_to_le32(crc);
-   crc = crc32_body(crc, p, len, tab);
-   return __le32_to_cpu(crc);
-# elif CRC_LE_BITS == 4
+# elif CRC_LE_BITS == 2
while (len--) {
crc ^= *p++;
-   crc = (crc  4) ^ crc32table_le[crc  15];
-   crc = (crc  4) ^ crc32table_le[crc  15];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
}
-   return crc;
-# elif CRC_LE_BITS == 2
+# elif CRC_LE_BITS == 4
while (len--) {
crc ^= *p++;
-   crc = (crc  2) ^ crc32table_le[crc  3];
-   crc = (crc  2) ^ crc32table_le[crc  3];
-   crc = (crc  2) ^ crc32table_le[crc  3];
-   crc = (crc  2) ^ crc32table_le[crc  3];
+   crc = (crc  4) ^ crc32table_le[0][crc  15];
+   crc = (crc  4) ^ crc32table_le[0][crc  15];
}
+# elif CRC_LE_BITS == 8
+   const u32  (*tab)[] = crc32table_le;
+
+   crc = __cpu_to_le32(crc);
+   crc = crc32_body(crc, p, len, tab);
+   crc = __le32_to_cpu(crc);
+#endif
return crc;
-# endif
 }
-#endif
+EXPORT_SYMBOL(crc32_le);
 
 /**
  * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
@@ -158,16 +143,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, 
size_t len)
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len);
-
-#if CRC_BE_BITS == 1
-/*
- * In fact, the table-based code will work in this case, but it can be
- * simplified by inlining the table in ?: form.
- */
-
 u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
 {
+#if CRC_BE_BITS == 1
int i;
while (len--) {
crc ^= *p++  24;
@@ -176,39 +154,29 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, 
size_t len

[PATCH 06/13] crc32: Make CRC_*_BITS definition correspond to actual bit counts

2012-01-18 Thread Darrick J. Wong
crc32.c provides a choice of one of several algorithms for
computing the LSB and MSB versions of the CRC32 checksum
based on the parameters CRC_LE_BITS and CRC_BE_BITS. In the
original version the values 1, 2, 4 and 8 respectively selected
versions of the algorithm that computed the crc 1, 2, 4 and 32
bits at a time. This patch series adds a new version that computes
the CRC 64 bits at a time. To make things easier to understand
the parameter has been reinterpreted to actually stand for the
number of bits processed in each step of the algorithm so that
the old value 8 has been replaced with the value 32. This also
allows us to add in a widely used crc algorithm that
computes the crc 8 bits at a time called the Sarwate algorithm.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c  |   17 ++---
 lib/crc32defs.h  |   18 ++
 lib/gen_crc32table.c |   11 ++-
 3 files changed, 34 insertions(+), 12 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 7394288..5971f2a 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -27,13 +27,13 @@
 #include linux/types.h
 #include crc32defs.h
 
-#if CRC_LE_BITS == 8
+#if CRC_LE_BITS  8
 # define tole(x) ((__force u32) __constant_cpu_to_le32(x))
 #else
 # define tole(x) (x)
 #endif
 
-#if CRC_BE_BITS == 8
+#if CRC_BE_BITS  8
 # define tobe(x) ((__force u32) __constant_cpu_to_be32(x))
 #else
 # define tobe(x) (x)
@@ -45,7 +45,7 @@ MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com);
 MODULE_DESCRIPTION(Ethernet CRC32 calculations);
 MODULE_LICENSE(GPL);
 
-#if CRC_LE_BITS == 8 || CRC_BE_BITS == 8
+#if CRC_LE_BITS  8 || CRC_BE_BITS  8
 
 static inline u32
 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 
(*tab)[256])
@@ -126,6 +126,12 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, 
size_t len)
crc = (crc  4) ^ crc32table_le[0][crc  15];
}
 # elif CRC_LE_BITS == 8
+   /* aka Sarwate algorithm */
+   while (len--) {
+   crc ^= *p++;
+   crc = (crc  8) ^ crc32table_le[0][crc  255];
+   }
+# else
const u32  (*tab)[] = crc32table_le;
 
crc = (__force u32) __cpu_to_le32(crc);
@@ -169,6 +175,11 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, 
size_t len)
crc = (crc  4) ^ crc32table_be[0][crc  28];
}
 # elif CRC_BE_BITS == 8
+   while (len--) {
+   crc ^= *p++  24;
+   crc = (crc  8) ^ crc32table_be[0][crc  24];
+   }
+# else
const u32  (*tab)[] = crc32table_be;
 
crc = (__force u32) __cpu_to_be32(crc);
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index f5a5401..daa3a5e 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -6,27 +6,29 @@
 #define CRCPOLY_LE 0xedb88320
 #define CRCPOLY_BE 0x04c11db7
 
-/* How many bits at a time to use.  Requires a table of 4CRC_xx_BITS bytes. 
*/
-/* For less performance-sensitive, use 4 */
+/* How many bits at a time to use.  Valid values are 1, 2, 4, 8, and 32. */
+/* For less performance-sensitive, use 4 or 8 */
 #ifndef CRC_LE_BITS
-# define CRC_LE_BITS 8
+# define CRC_LE_BITS 32
 #endif
 #ifndef CRC_BE_BITS
-# define CRC_BE_BITS 8
+# define CRC_BE_BITS 32
 #endif
 
 /*
  * Little-endian CRC computation.  Used with serial bit streams sent
  * lsbit-first.  Be sure to use cpu_to_le32() to append the computed CRC.
  */
-#if CRC_LE_BITS  8 || CRC_LE_BITS  1 || CRC_LE_BITS  CRC_LE_BITS-1
-# error CRC_LE_BITS must be a power of 2 between 1 and 8
+#if CRC_LE_BITS  32 || CRC_LE_BITS  1 || CRC_LE_BITS == 16 || \
+   CRC_LE_BITS  CRC_LE_BITS-1
+# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32}
 #endif
 
 /*
  * Big-endian CRC computation.  Used with serial bit streams sent
  * msbit-first.  Be sure to use cpu_to_be32() to append the computed CRC.
  */
-#if CRC_BE_BITS  8 || CRC_BE_BITS  1 || CRC_BE_BITS  CRC_BE_BITS-1
-# error CRC_BE_BITS must be a power of 2 between 1 and 8
+#if CRC_BE_BITS  32 || CRC_BE_BITS  1 || CRC_BE_BITS == 16 || \
+   CRC_BE_BITS  CRC_BE_BITS-1
+# error CRC_BE_BITS must be one of {1, 2, 4, 8, 32}
 #endif
diff --git a/lib/gen_crc32table.c b/lib/gen_crc32table.c
index eced769..99ac744 100644
--- a/lib/gen_crc32table.c
+++ b/lib/gen_crc32table.c
@@ -4,8 +4,17 @@
 
 #define ENTRIES_PER_LINE 4
 
+#if CRC_LE_BITS = 8
 #define LE_TABLE_SIZE (1  CRC_LE_BITS)
+#else
+#define LE_TABLE_SIZE 256
+#endif
+
+#if CRC_BE_BITS = 8
 #define BE_TABLE_SIZE (1  CRC_BE_BITS)
+#else
+#define BE_TABLE_SIZE 256
+#endif
 
 static uint32_t crc32table_le[4][256];
 static uint32_t crc32table_be[4][256];
@@ -24,7 +33,7 @@ static void crc32init_le(void)
 
crc32table_le[0][0] = 0;
 
-   for (i = 1  (CRC_LE_BITS - 1); i; i = 1) {
+   for (i = LE_TABLE_SIZE  1; i; i = 1) {
crc = (crc  1) ^ ((crc  1) ? CRCPOLY_LE : 0

[PATCH 07/13] crc32: Add slice-by-8 algorithm to existing code

2012-01-18 Thread Darrick J. Wong
add slicing-by-8 algorithm to the existing
slicing-by-4 algorithm. This consists of:
- extend largest BITS size from 32 to 64
- extend tables from tab[4][256] to up to tab[8][256]
- Add code for inner loop.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c  |   38 +++---
 lib/crc32defs.h  |   29 +
 lib/gen_crc32table.c |   43 +++
 3 files changed, 75 insertions(+), 35 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 5971f2a..826e163 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -47,25 +47,28 @@ MODULE_LICENSE(GPL);
 
 #if CRC_LE_BITS  8 || CRC_BE_BITS  8
 
+/* implements slicing-by-4 or slicing-by-8 algorithm */
 static inline u32
 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 
(*tab)[256])
 {
 # ifdef __LITTLE_ENDIAN
 #  define DO_CRC(x) crc = t0[(crc ^ (x))  255] ^ (crc  8)
-#  define DO_CRC4 crc = t3[(crc)  255] ^ \
-   t2[(crc  8)  255] ^ \
-   t1[(crc  16)  255] ^ \
-   t0[(crc  24)  255]
+#  define DO_CRC4 (t3[(q)  255] ^ t2[(q  8)  255] ^ \
+  t1[(q  16)  255] ^ t0[(q  24)  255])
+#  define DO_CRC8 (t7[(q)  255] ^ t6[(q  8)  255] ^ \
+  t5[(q  16)  255] ^ t4[(q  24)  255])
 # else
 #  define DO_CRC(x) crc = t0[((crc  24) ^ (x))  255] ^ (crc  8)
-#  define DO_CRC4 crc = t0[(crc)  255] ^ \
-   t1[(crc  8)  255] ^  \
-   t2[(crc  16)  255] ^ \
-   t3[(crc  24)  255]
+#  define DO_CRC4 (t0[(q)  255] ^ t1[(q  8)  255] ^ \
+  t2[(q  16)  255] ^ t3[(q  24)  255])
+#  define DO_CRC8 (t4[(q)  255] ^ t5[(q  8)  255] ^ \
+  t6[(q  16)  255] ^ t7[(q  24)  255])
 # endif
const u32 *b;
size_trem_len;
const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3];
+   const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7];
+   u32 q;
 
/* Align it */
if (unlikely((long)buf  3  len)) {
@@ -73,13 +76,25 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
DO_CRC(*buf++);
} while ((--len)  ((long)buf)3);
}
+
+# if CRC_LE_BITS == 32
rem_len = len  3;
-   /* load data 32 bits wide, xor data 32 bits wide. */
len = len  2;
+# else
+   rem_len = len  7;
+   len = len  3;
+# endif
+
b = (const u32 *)buf;
for (--b; len; --len) {
-   crc ^= *++b; /* use pre increment for speed */
-   DO_CRC4;
+   q = crc ^ *++b; /* use pre increment for speed */
+# if CRC_LE_BITS == 32
+   crc = DO_CRC4;
+# else
+   crc = DO_CRC8;
+   q = *++b;
+   crc ^= DO_CRC4;
+# endif
}
len = rem_len;
/* And the last few bytes */
@@ -92,6 +107,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
return crc;
 #undef DO_CRC
 #undef DO_CRC4
+#undef DO_CRC8
 }
 #endif
 
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index daa3a5e..8181592 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -6,29 +6,42 @@
 #define CRCPOLY_LE 0xedb88320
 #define CRCPOLY_BE 0x04c11db7
 
-/* How many bits at a time to use.  Valid values are 1, 2, 4, 8, and 32. */
-/* For less performance-sensitive, use 4 or 8 */
+/*
+ * How many bits at a time to use.  Valid values are 1, 2, 4, 8, 32 and 64.
+ * For less performance-sensitive, use 4 or 8 to save table size.
+ * For larger systems choose same as CPU architecture as default.
+ * This works well on X86_64, SPARC64 systems. This may require some
+ * elaboration after experiments with other architectures.
+ */
 #ifndef CRC_LE_BITS
-# define CRC_LE_BITS 32
+#  ifdef CONFIG_64BIT
+#  define CRC_LE_BITS 64
+#  else
+#  define CRC_LE_BITS 32
+#  endif
 #endif
 #ifndef CRC_BE_BITS
-# define CRC_BE_BITS 32
+#  ifdef CONFIG_64BIT
+#  define CRC_BE_BITS 64
+#  else
+#  define CRC_BE_BITS 32
+#  endif
 #endif
 
 /*
  * Little-endian CRC computation.  Used with serial bit streams sent
  * lsbit-first.  Be sure to use cpu_to_le32() to append the computed CRC.
  */
-#if CRC_LE_BITS  32 || CRC_LE_BITS  1 || CRC_LE_BITS == 16 || \
+#if CRC_LE_BITS  64 || CRC_LE_BITS  1 || CRC_LE_BITS == 16 || \
CRC_LE_BITS  CRC_LE_BITS-1
-# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32}
+# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32, 64}
 #endif
 
 /*
  * Big-endian CRC computation.  Used with serial bit streams sent
  * msbit-first.  Be sure to use cpu_to_be32() to append the computed CRC.
  */
-#if CRC_BE_BITS  32 || CRC_BE_BITS  1 || CRC_BE_BITS == 16 || \
+#if CRC_BE_BITS  64 || CRC_BE_BITS  1 || CRC_BE_BITS == 16 || \
CRC_BE_BITS  CRC_BE_BITS-1

[PATCH 09/13] crc32: Add note about this patchset to crc32.c

2012-01-18 Thread Darrick J. Wong
Some final changes
- added a comment at the top of crc32.c

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 4eac9c7..a1a5145 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -1,4 +1,8 @@
 /*
+ * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
+ * cleaned up code to current version of sparse and added the slicing-by-8
+ * algorithm to the closely similar existing slicing-by-4 algorithm.
+ *
  * Oct 15, 2000 Matt Domsch matt_dom...@dell.com
  * Nicer crc32 functions/docs submitted by li...@horizon.com.  Thanks!
  * Code was from the public domain, copyright abandoned.  Code was

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5.4 00/13] crc32c: Add faster algorithm and self-test code

2012-01-18 Thread Darrick J. Wong
Hi all,

This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a
software crc32c implementation.  It removes the crc32c implementation in
crypto/ in favor of using the stamped-out one in lib/.  There is also a change
to Kconfig so that the kernel builder can pick an implementation best suited
for the hardware.

The motivation for this patchset is that I am working on adding full metadata
checksumming to ext4.  As far as performance impact of adding checksumming
goes, I see nearly no change with a standard mail server ffsb simulation.  On a
test that involves only file creation and deletion and extent tree writes, I
see a drop of about 50 percent with the current kernel crc32c implementation;
this improves to a drop of about 20 percent with the enclosed crc32c code.

For typical workloads this new implementation doesn't help much, because
metadata is usually a small fraction of total IO.
However, when we are doing IO that is almost all metadata (such as rm -rf'ing a
tree), then this patch speeds up the operation substantially.

Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this patchset
should improve their speed as well.  I have not yet quantified that, however.
This latest submission combines Bob's patches from late August 2011 with mine
so that they can be one coherent patch set.  Please excuse my inability to
combine some of the patches; I've been advised to leave Bob's patches alone and
build atop them instead. :/

Since the last posting, I've also collected some crc32c test results on a bunch
of different x86/powerpc/sparc platforms.  The results can be viewed here:
http://goo.gl/sgt3i ; the crc32-kern-le and crc32c columns describe the
performance of the kernel's current crc32 and crc32c software implementations.
The crc32c-by8-le column shows crc32c performance with this patchset applied.
I expect crc32 performance to be roughly the same.

The two _boost columns at the right side of the spreadsheet show how much
faster the new implementation is over the old one.  As you can see, crc32 rises
substantially, and crc32c experiences a huge increase.

v2: Use the crypto testmgr api for self-test.
v3: Get rid of the -be version, which had no users.
v4: Allow kernel builder a choice of speed vs. space optimization.
v5: Reuse lib/crc32 for crc32c as well, and make crypto/crc32c use lib/crc32.c.
v5.1: Include Bob Pearson's patches in submission request.
v5.2: Fix changelogs for Bob's patches per akpm request.
v5.3: Fix from header bug in patch mail generation scripts.
v5.4: Rebase against next-20120118 per akpm request.  One patch was already
  committed, which shortens this patchset.

--D

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 05/13] crc32: Fix mixing of endian-specific types

2012-01-18 Thread Darrick J. Wong
crc32.c in its original version freely mixed u32, __le32 and __be32 types
which caused warnings from sparse with __CHECK_ENDIAN__.
This patch fixes these by forcing the types to u32.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |   12 ++--
 1 files changed, 6 insertions(+), 6 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index bf03922..7394288 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -28,13 +28,13 @@
 #include crc32defs.h
 
 #if CRC_LE_BITS == 8
-# define tole(x) __constant_cpu_to_le32(x)
+# define tole(x) ((__force u32) __constant_cpu_to_le32(x))
 #else
 # define tole(x) (x)
 #endif
 
 #if CRC_BE_BITS == 8
-# define tobe(x) __constant_cpu_to_be32(x)
+# define tobe(x) ((__force u32) __constant_cpu_to_be32(x))
 #else
 # define tobe(x) (x)
 #endif
@@ -128,9 +128,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t 
len)
 # elif CRC_LE_BITS == 8
const u32  (*tab)[] = crc32table_le;
 
-   crc = __cpu_to_le32(crc);
+   crc = (__force u32) __cpu_to_le32(crc);
crc = crc32_body(crc, p, len, tab);
-   crc = __le32_to_cpu(crc);
+   crc = __le32_to_cpu((__force __le32)crc);
 #endif
return crc;
 }
@@ -171,9 +171,9 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t 
len)
 # elif CRC_BE_BITS == 8
const u32  (*tab)[] = crc32table_be;
 
-   crc = __cpu_to_be32(crc);
+   crc = (__force u32) __cpu_to_be32(crc);
crc = crc32_body(crc, p, len, tab);
-   crc = __be32_to_cpu(crc);
+   crc = __be32_to_cpu((__force __be32)crc);
 # endif
return crc;
 }

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 13/13] crc32: Select an algorithm via kconfig

2012-01-18 Thread Darrick J. Wong
Allow the kernel builder to choose a crc32* algorithm for the kernel.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/Kconfig |   43 +++
 lib/crc32defs.h |   18 ++
 2 files changed, 61 insertions(+), 0 deletions(-)


diff --git a/lib/Kconfig b/lib/Kconfig
index 58da52d..13e1afa 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -77,6 +77,49 @@ config CRC32_SELFTEST
  and crc32_be over byte strings with random alignment and length
  and computes the total elapsed time and number of bytes processed.
 
+choice
+   prompt CRC32 implementation
+   depends on CRC32
+   default CRC32_SLICEBY8
+
+config CRC32_SLICEBY8
+   bool Slice by 8 bytes
+   help
+ Calculate checksum 8 bytes at a time with a clever slicing algorithm.
+ This is the fastest algorithm, but comes with a 8KiB lookup table.
+ Most modern processors have enough cache to hold this table without
+ thrashing the cache.
+
+ This is the default implementation choice.  Choose this one unless
+ you have a good reason not to.
+
+config CRC32_SLICEBY4
+   bool Slice by 4 bytes
+   help
+ Calculate checksum 4 bytes at a time with a clever slicing algorithm.
+ This is a bit slower than slice by 8, but has a smaller 4KiB lookup
+ table.
+
+ Only choose this option if you know what you are doing.
+
+config CRC32_SARWATE
+   bool Sarwate's Algorithm (one byte at a time)
+   help
+ Calculate checksum a byte at a time using Sarwate's algorithm.  This
+ is not particularly fast, but has a small 256 byte lookup table.
+
+ Only choose this option if you know what you are doing.
+
+config CRC32_BIT
+   bool Classic Algorithm (one bit at a time)
+   help
+ Calculate checksum one bit at a time.  This is VERY slow, but has
+ no lookup table.  This is provided as a debugging option.
+
+ Only choose this option if you are debugging crc32.
+
+endchoice
+
 config CRC7
tristate CRC7 functions
help
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index 6fd1917..64cba2c 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -13,6 +13,24 @@
  */
 #define CRC32C_POLY_LE 0x82F63B78
 
+/* Try to choose an implementation variant via Kconfig */
+#ifdef CONFIG_CRC32_SLICEBY8
+# define CRC_LE_BITS 64
+# define CRC_BE_BITS 64
+#endif
+#ifdef CONFIG_CRC32_SLICEBY4
+# define CRC_LE_BITS 32
+# define CRC_BE_BITS 32
+#endif
+#ifdef CONFIG_CRC32_SARWATE
+# define CRC_LE_BITS 8
+# define CRC_BE_BITS 8
+#endif
+#ifdef CONFIG_CRC32_BIT
+# define CRC_LE_BITS 1
+# define CRC_BE_BITS 1
+#endif
+
 /*
  * How many bits at a time to use.  Valid values are 1, 2, 4, 8, 32 and 64.
  * For less performance-sensitive, use 4 or 8 to save table size.

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 11/13] crypto: crc32c should use library implementation

2012-01-18 Thread Darrick J. Wong
Since lib/crc32.c now provides crc32c, remove the software implementation here
and call the library function instead.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 crypto/Kconfig  |1 +
 crypto/crc32c.c |   94 ++-
 2 files changed, 4 insertions(+), 91 deletions(-)


diff --git a/crypto/Kconfig b/crypto/Kconfig
index e6cfe1a..29f4d73 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -308,6 +308,7 @@ comment Digest
 config CRYPTO_CRC32C
tristate CRC32c CRC algorithm
select CRYPTO_HASH
+   select CRC32
help
  Castagnoli, et al Cyclic Redundancy-Check Algorithm.  Used
  by iSCSI for header and data digests and by others.
diff --git a/crypto/crc32c.c b/crypto/crc32c.c
index 3f9ad28..06f7018 100644
--- a/crypto/crc32c.c
+++ b/crypto/crc32c.c
@@ -40,6 +40,7 @@
 #include linux/module.h
 #include linux/string.h
 #include linux/kernel.h
+#include linux/crc32.h
 
 #define CHKSUM_BLOCK_SIZE  1
 #define CHKSUM_DIGEST_SIZE 4
@@ -53,95 +54,6 @@ struct chksum_desc_ctx {
 };
 
 /*
- * This is the CRC-32C table
- * Generated with:
- * width = 32 bits
- * poly = 0x1EDC6F41
- * reflect input bytes = true
- * reflect output bytes = true
- */
-
-static const u32 crc32c_table[256] = {
-   0xL, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
-   0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
-   0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
-   0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
-   0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
-   0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
-   0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
-   0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
-   0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
-   0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
-   0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
-   0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
-   0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
-   0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
-   0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
-   0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
-   0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
-   0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
-   0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
-   0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
-   0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
-   0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
-   0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
-   0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
-   0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
-   0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
-   0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
-   0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
-   0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
-   0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
-   0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
-   0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
-   0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
-   0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
-   0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
-   0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
-   0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
-   0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
-   0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
-   0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
-   0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
-   0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
-   0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
-   0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
-   0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
-   0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
-   0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
-   0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
-   0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
-   0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
-   0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
-   0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
-   0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
-   0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
-   0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
-   0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
-   0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
-   0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
-   0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
-   0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
-   0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
-   0x34F4F86AL, 0xC69F7B69L

[PATCH 12/13] crc32: Add self-test code for crc32c

2012-01-18 Thread Darrick J. Wong
Add self-test code for crc32c.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |  363 ++-
 1 files changed, 261 insertions(+), 102 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 22e3643..ebc5911 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -765,113 +765,265 @@ static struct crc_test {
u32 length; /* random 11 bit length of test */
u32 crc_le; /* expected crc32_le result */
u32 crc_be; /* expected crc32_be result */
+   u32 crc32c_le;  /* expected crc32c_le result */
 } test[] =
 {
-   {0x674bf11d, 0x0038, 0x0542, 0x0af6d466, 0xd8b6e4c1},
-   {0x35c672c6, 0x003a, 0x01aa, 0xc6d3dfba, 0x28aaf3ad},
-   {0x496da28e, 0x0039, 0x05af, 0xd933660f, 0x5d57e81f},
-   {0x09a9b90e, 0x0027, 0x01f8, 0xb45fe007, 0xf45fca9a},
-   {0xdc97e5a9, 0x0025, 0x03b6, 0xf81a3562, 0xe0126ba2},
-   {0x47c58900, 0x000a, 0x00b9, 0x8e58eccf, 0xf3afc793},
-   {0x292561e8, 0x000c, 0x0403, 0xa2ba8aaf, 0x0b797aed},
-   {0x415037f6, 0x0003, 0x0676, 0xa17d52e8, 0x7f0fdf35},
-   {0x3466e707, 0x0026, 0x0042, 0x258319be, 0x75c484a2},
-   {0xafd1281b, 0x0023, 0x02ee, 0x4428eaf8, 0x06c7ad10},
-   {0xd3857b18, 0x0028, 0x04a2, 0x5c430821, 0xb062b7cb},
-   {0x1d825a8f, 0x002b, 0x050b, 0xd2c45f0c, 0xd68634e0},
-   {0x5033e3bc, 0x000b, 0x0078, 0xa3ea4113, 0xac6d31fb},
-   {0x94f1fb5e, 0x000f, 0x03a2, 0xfbfc50b1, 0x3cfe50ed},
-   {0xc9a0fe14, 0x0009, 0x0473, 0x5fb61894, 0x87070591},
-   {0x88a034b1, 0x001c, 0x05ad, 0xc1b16053, 0x46f95c67},
-   {0xf0f72239, 0x0020, 0x026d, 0xa6fa58f3, 0xf8c2c1dd},
-   {0xcc20a5e3, 0x003b, 0x067a, 0x7740185a, 0x308b979a},
-   {0xce589c95, 0x002b, 0x0641, 0xd055e987, 0x40aae25b},
-   {0x78edc885, 0x0035, 0x05be, 0xa39cb14b, 0x035b0d1f},
-   {0x9d40a377, 0x003b, 0x0038, 0x1f47ccd2, 0x197fbc9d},
-   {0x703d0e01, 0x003c, 0x06f1, 0x88735e7c, 0xfed57c5a},
-   {0x776bf505, 0x000f, 0x05b2, 0x5cc4fc01, 0xf32efb97},
-   {0x4a3e7854, 0x0027, 0x04b8, 0x8d923c82, 0x0cbfb4a2},
-   {0x209172dd, 0x003b, 0x0356, 0xb89e9c2b, 0xd7868138},
-   {0x3ba4cc5b, 0x002f, 0x0203, 0xe51601a9, 0x5b2a1032},
-   {0xfc62f297, 0x, 0x0079, 0x71a8e1a2, 0x5d88685f},
-   {0x64280b8b, 0x0016, 0x07ab, 0x0fa7a30c, 0xda3a455f},
-   {0x97dd724b, 0x0033, 0x07ad, 0x5788b2f4, 0xd7326d32},
-   {0x61394b52, 0x0035, 0x0571, 0xc66525f1, 0xcabe7fef},
-   {0x29b4faff, 0x0024, 0x006e, 0xca13751e, 0x993648e0},
-   {0x29bfb1dc, 0x000b, 0x0244, 0x436c43f7, 0x429f7a59},
-   {0x86ae934b, 0x0035, 0x0104, 0x0760ec93, 0x9cf7d0f4},
-   {0xc4c1024e, 0x002e, 0x06b1, 0x6516a3ec, 0x19321f9c},
-   {0x3287a80a, 0x0026, 0x0496, 0x0b257eb1, 0x754ebd51},
-   {0xa4db423e, 0x0023, 0x045d, 0x9b3a66dc, 0x873e9f11},
-   {0x7a1078df, 0x0015, 0x014a, 0x8c2484c5, 0x6a628659},
-   {0x6048bd5b, 0x0006, 0x006a, 0x897e3559, 0xac9961af},
-   {0xd8f9ea20, 0x003d, 0x0277, 0x60eb905b, 0xed2aaf99},
-   {0xea5ec3b4, 0x002a, 0x04fe, 0x869965dc, 0x6c1f833b},
-   {0x2dfb005d, 0x0016, 0x0345, 0x6a3b117e, 0xf05e8521},
-   {0x5a214ade, 0x0020, 0x05b6, 0x467f70be, 0xcb22ccd3},
-   {0xf0ab9cca, 0x0032, 0x0515, 0xed223df3, 0x7f3ef01d},
-   {0x91b444f9, 0x002e, 0x07f8, 0x84e9a983, 0x5676756f},
-   {0x1b5d2ddb, 0x002e, 0x012c, 0xba638c4c, 0x3f42047b},
-   {0xd824d1bb, 0x003a, 0x07b5, 0x6288653b, 0x3a3ebea0},
-   {0x0470180c, 0x0034, 0x01f0, 0x9d5b80d6, 0x3de08195},
-   {0xffaa3a3f, 0x0036, 0x0299, 0xf3a82ab8, 0x53e0c13d},
-   {0x6406cfeb, 0x0023, 0x0600, 0xa920b8e8, 0xe4e2acf4},
-   {0xb24aaa38, 0x003e, 0x04a1, 0x657cc328, 0x5077b2c3},
-   {0x58b2ab7c, 0x0039, 0x02b4, 0x3a17ee7e, 0x9dcb3643},
-   {0x3db85970, 0x0006, 0x02b6, 0x95268b59, 0xb9812c10},
-   {0x857830c5, 0x0003, 0x0590, 0x4ef439d5, 0xf042161d},
-   {0xe1fcd978, 0x003e, 0x07d8, 0xae8d8699, 0xce0a1ef5},
-   {0xb982a768, 0x0016, 0x06e0, 0x62fad3df, 0x5f8a067b},
-   {0x1d581ce8, 0x001e, 0x058b, 0xf0f5da53, 0x26e39eee},
-   {0x2456719b, 0x0025, 0x0503, 0x4296ac64, 0xd50e4c14},
-   {0xfae6d8f2, 0x, 0x055d, 0x057fdf2e, 0x2a31391a},
-   {0xcba828e3, 0x0039, 0x02ce, 0xe3f22351, 0x8f00877b},
-   {0x13d25952, 0x000a, 0x072d, 0x76d4b4cc, 0x5eb67ec3},
-   {0x0342be3f, 0x0015, 0x0599, 0xec75d9f1, 0x9d4d2826},
-   {0xeaa344e0, 0x0014, 0x04d8, 0x72a4c981, 0x2064ea06},
-   {0xbbb52021, 0x003b, 0x0272

[PATCH 10/13] crc32: Bolt on crc32c

2012-01-18 Thread Darrick J. Wong
Reuse the existing crc32 code to stamp out a crc32c implementation.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 include/linux/crc32.h |2 ++
 lib/Kconfig   |8 +++---
 lib/crc32.c   |   62 +++--
 lib/crc32defs.h   |7 ++
 lib/gen_crc32table.c  |   35 ++--
 5 files changed, 80 insertions(+), 34 deletions(-)


diff --git a/include/linux/crc32.h b/include/linux/crc32.h
index 391a259..68267b6 100644
--- a/include/linux/crc32.h
+++ b/include/linux/crc32.h
@@ -11,6 +11,8 @@
 extern u32  crc32_le(u32 crc, unsigned char const *p, size_t len);
 extern u32  crc32_be(u32 crc, unsigned char const *p, size_t len);
 
+extern u32  __crc32c_le(u32 crc, unsigned char const *p, size_t len);
+
 #define crc32(seed, data, length)  crc32_le(seed, (unsigned char const 
*)(data), length)
 
 /*
diff --git a/lib/Kconfig b/lib/Kconfig
index 4656dff..58da52d 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -58,14 +58,14 @@ config CRC_ITU_T
  functions require M here.
 
 config CRC32
-   tristate CRC32 functions
+   tristate CRC32/CRC32c functions
default y
select BITREVERSE
help
  This option is provided for the case where no in-kernel-tree
- modules require CRC32 functions, but a module built outside the
- kernel tree does. Such modules that use library CRC32 functions
- require M here.
+ modules require CRC32/CRC32c functions, but a module built outside
+ the kernel tree does. Such modules that use library CRC32/CRC32c
+ functions require M here.
 
 config CRC32_SELFTEST
bool CRC32 perform self test on init
diff --git a/lib/crc32.c b/lib/crc32.c
index a1a5145..22e3643 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -46,7 +46,7 @@
 #include crc32table.h
 
 MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com);
-MODULE_DESCRIPTION(Ethernet CRC32 calculations);
+MODULE_DESCRIPTION(Various CRC32 calculations);
 MODULE_LICENSE(GPL);
 
 #if CRC_LE_BITS  8 || CRC_BE_BITS  8
@@ -135,46 +135,57 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
+static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
+ size_t len, const u32 (*tab)[256],
+ u32 polynomial)
 {
 #if CRC_LE_BITS == 1
int i;
while (len--) {
crc ^= *p++;
for (i = 0; i  8; i++)
-   crc = (crc  1) ^ ((crc  1) ? CRCPOLY_LE : 0);
+   crc = (crc  1) ^ ((crc  1) ? polynomial : 0);
}
 # elif CRC_LE_BITS == 2
while (len--) {
crc ^= *p++;
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
}
 # elif CRC_LE_BITS == 4
while (len--) {
crc ^= *p++;
-   crc = (crc  4) ^ crc32table_le[0][crc  15];
-   crc = (crc  4) ^ crc32table_le[0][crc  15];
+   crc = (crc  4) ^ tab[0][crc  15];
+   crc = (crc  4) ^ tab[0][crc  15];
}
 # elif CRC_LE_BITS == 8
/* aka Sarwate algorithm */
while (len--) {
crc ^= *p++;
-   crc = (crc  8) ^ crc32table_le[0][crc  255];
+   crc = (crc  8) ^ tab[0][crc  255];
}
 # else
-   const u32  (*tab)[] = crc32table_le;
-
crc = (__force u32) __cpu_to_le32(crc);
crc = crc32_body(crc, p, len, tab);
crc = __le32_to_cpu((__force __le32)crc);
 #endif
return crc;
 }
+
+u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
+{
+   return crc32_le_generic(crc, p, len, crc32table_le, CRCPOLY_LE);
+}
 EXPORT_SYMBOL(crc32_le);
 
+u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
+{
+   return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE);
+}
+EXPORT_SYMBOL(__crc32c_le);
+
 /**
  * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
  * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for
@@ -182,7 +193,9 @@ EXPORT_SYMBOL(crc32_le);
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
+static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p,
+ size_t len, const u32 (*tab)[256

[PATCH 02/13] crc32: Move long comment about crc32 fundamentals to Documentation/

2012-01-18 Thread Darrick J. Wong
Moved a long comment from lib/crc32.c to Documentation/crc32.txt
where it will more likely get read.
- Edited the resulting document to add an explanation of the slicing-by-n
  algorithm.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: George Spelvin li...@horizon.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 Documentation/00-INDEX  |2 +
 Documentation/crc32.txt |  183 +++
 lib/crc32.c |  129 +
 3 files changed, 187 insertions(+), 127 deletions(-)
 create mode 100644 Documentation/crc32.txt


diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 65bbd26..e7b38a0 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -104,6 +104,8 @@ cpuidle/
- info on CPU_IDLE, CPU idle state management subsystem.
 cputopology.txt
- documentation on how CPU topology info is exported via sysfs.
+crc32.txt
+   - brief tutorial on CRC computation
 cris/
- directory with info about Linux on CRIS architecture.
 crypto/
diff --git a/Documentation/crc32.txt b/Documentation/crc32.txt
new file mode 100644
index 000..3d74ba4
--- /dev/null
+++ b/Documentation/crc32.txt
@@ -0,0 +1,183 @@
+A brief CRC tutorial.
+
+A CRC is a long-division remainder.  You add the CRC to the message,
+and the whole thing (message+CRC) is a multiple of the given
+CRC polynomial.  To check the CRC, you can either check that the
+CRC matches the recomputed value, *or* you can check that the
+remainder computed on the message+CRC is 0.  This latter approach
+is used by a lot of hardware implementations, and is why so many
+protocols put the end-of-frame flag after the CRC.
+
+It's actually the same long division you learned in school, except that
+- We're working in binary, so the digits are only 0 and 1, and
+- When dividing polynomials, there are no carries.  Rather than add and
+  subtract, we just xor.  Thus, we tend to get a bit sloppy about
+  the difference between adding and subtracting.
+
+Like all division, the remainder is always smaller than the divisor.
+To produce a 32-bit CRC, the divisor is actually a 33-bit CRC polynomial.
+Since it's 33 bits long, bit 32 is always going to be set, so usually the
+CRC is written in hex with the most significant bit omitted.  (If you're
+familiar with the IEEE 754 floating-point format, it's the same idea.)
+
+Note that a CRC is computed over a string of *bits*, so you have
+to decide on the endianness of the bits within each byte.  To get
+the best error-detecting properties, this should correspond to the
+order they're actually sent.  For example, standard RS-232 serial is
+little-endian; the most significant bit (sometimes used for parity)
+is sent last.  And when appending a CRC word to a message, you should
+do it in the right order, matching the endianness.
+
+Just like with ordinary division, you proceed one digit (bit) at a time.
+Each step of the division, you take one more digit (bit) of the
+dividend and append it to the current remainder.  Then you figure out the
+appropriate multiple of the divisor to subtract to bring the remainder
+back into range.  In binary, this is easy - it has to be either 0 or 1,
+and to make the XOR cancel, it's just a copy of bit 32 of the remainder.
+
+When computing a CRC, we don't care about the quotient, so we can
+throw the quotient bit away, but subtract the appropriate multiple of
+the polynomial from the remainder and we're back to where we started,
+ready to process the next bit.
+
+A big-endian CRC written this way would be coded like:
+for (i = 0; i < input_bits; i++) {
+   multiple = remainder & 0x80000000 ? CRCPOLY : 0;
+   remainder = (remainder << 1 | next_input_bit()) ^ multiple;
+}
+
+Notice how, to get at bit 32 of the shifted remainder, we look
+at bit 31 of the remainder *before* shifting it.
+
+But also notice how the next_input_bit() bits we're shifting into
+the remainder don't actually affect any decision-making until
+32 bits later.  Thus, the first 32 cycles of this are pretty boring.
+Also, to add the CRC to a message, we need a 32-bit-long hole for it at
+the end, so we have to add 32 extra cycles shifting in zeros at the
+end of every message.
+
+These details lead to a standard trick: rearrange merging in the
+next_input_bit() until the moment it's needed.  Then the first 32 cycles
+can be precomputed, and merging in the final 32 zero bits to make room
+for the CRC can be skipped entirely.  This changes the code to:
+
+for (i = 0; i < input_bits; i++) {
+   remainder ^= next_input_bit() << 31;
+   multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
+   remainder = (remainder << 1) ^ multiple;
+}
+
+With this optimization, the little-endian code is particularly simple:
+for (i = 0; i < input_bits; i++) {
+   remainder

[PATCH v5.3 00/14] crc32c: Add faster algorithm and self-test code

2012-01-06 Thread Darrick J. Wong
Hi all,

This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a
software crc32c implementation.  It removes the crc32c implementation in
crypto/ in favor of using the stamped-out one in lib/.  There is also a change
to Kconfig so that the kernel builder can pick an implementation best suited
for the hardware.

The motivation for this patchset is that I am working on adding full metadata
checksumming to ext4.  As far as performance impact of adding checksumming
goes, I see nearly no change with a standard mail server ffsb simulation.  On a
test that involves only file creation and deletion and extent tree writes, I
see a drop of about 50 percent with the current kernel crc32c implementation;
this improves to a drop of about 20 percent with the enclosed crc32c code.

For typical workloads, this new implementation doesn't help much because
metadata is usually a small fraction of total IO.
However, when we are doing IO that is almost all metadata (such as rm -rf'ing a
tree), then this patch speeds up the operation substantially.

Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this patchset
should improve their speed as well.  I have not yet quantified that, however.
This latest submission combines Bob's patches from late August 2011 with mine
so that they can be one coherent patch set.  Please excuse my inability to
combine some of the patches; I've been advised to leave Bob's patches alone and
build atop them instead. :/

Since the last posting, I've also collected some crc32c test results on a bunch
of different x86/powerpc/sparc platforms.  The results can be viewed here:
http://goo.gl/sgt3i ; the crc32-kern-le and crc32c columns describe the
performance of the kernel's current crc32 and crc32c software implementations.
The crc32c-by8-le column shows crc32c performance with this patchset applied.
I expect crc32 performance to be roughly the same.

The two _boost columns at the right side of the spreadsheet show how much
faster the new implementation is over the old one.  As you can see, crc32 rises
substantially, and crc32c experiences a huge increase.

Since this patch has been out for review for several weeks now without
objections, can this go into 3.3, please?

v2: Use the crypto testmgr api for self-test.
v3: Get rid of the -be version, which had no users.
v4: Allow kernel builder a choice of speed vs. space optimization.
v5: Reuse lib/crc32 for crc32c as well, and make crypto/crc32c use lib/crc32.c.
v5.1: Include Bob Pearson's patches in submission request.
v5.2: Fix changelogs for Bob's patches per akpm request.
v5.3: Fix from header bug in patch mail generation scripts.

--D

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/14] crc32: Move long comment about crc32 fundamentals to Documentation/

2012-01-06 Thread Darrick J. Wong
Moved a long comment from lib/crc32.c to Documentation/crc32.txt
where it will more likely get read.
- Edited the resulting document to add an explanation of the slicing-by-n
  algorithm.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: George Spelvin li...@horizon.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 Documentation/00-INDEX  |2 +
 Documentation/crc32.txt |  183 +++
 lib/crc32.c |  129 +
 3 files changed, 187 insertions(+), 127 deletions(-)
 create mode 100644 Documentation/crc32.txt


diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 65bbd26..e7b38a0 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -104,6 +104,8 @@ cpuidle/
- info on CPU_IDLE, CPU idle state management subsystem.
 cputopology.txt
- documentation on how CPU topology info is exported via sysfs.
+crc32.txt
+   - brief tutorial on CRC computation
 cris/
- directory with info about Linux on CRIS architecture.
 crypto/
diff --git a/Documentation/crc32.txt b/Documentation/crc32.txt
new file mode 100644
index 000..3d74ba4
--- /dev/null
+++ b/Documentation/crc32.txt
@@ -0,0 +1,183 @@
+A brief CRC tutorial.
+
+A CRC is a long-division remainder.  You add the CRC to the message,
+and the whole thing (message+CRC) is a multiple of the given
+CRC polynomial.  To check the CRC, you can either check that the
+CRC matches the recomputed value, *or* you can check that the
+remainder computed on the message+CRC is 0.  This latter approach
+is used by a lot of hardware implementations, and is why so many
+protocols put the end-of-frame flag after the CRC.
+
+It's actually the same long division you learned in school, except that
+- We're working in binary, so the digits are only 0 and 1, and
+- When dividing polynomials, there are no carries.  Rather than add and
+  subtract, we just xor.  Thus, we tend to get a bit sloppy about
+  the difference between adding and subtracting.
+
+Like all division, the remainder is always smaller than the divisor.
+To produce a 32-bit CRC, the divisor is actually a 33-bit CRC polynomial.
+Since it's 33 bits long, bit 32 is always going to be set, so usually the
+CRC is written in hex with the most significant bit omitted.  (If you're
+familiar with the IEEE 754 floating-point format, it's the same idea.)
+
+Note that a CRC is computed over a string of *bits*, so you have
+to decide on the endianness of the bits within each byte.  To get
+the best error-detecting properties, this should correspond to the
+order they're actually sent.  For example, standard RS-232 serial is
+little-endian; the most significant bit (sometimes used for parity)
+is sent last.  And when appending a CRC word to a message, you should
+do it in the right order, matching the endianness.
+
+Just like with ordinary division, you proceed one digit (bit) at a time.
+Each step of the division, you take one more digit (bit) of the
+dividend and append it to the current remainder.  Then you figure out the
+appropriate multiple of the divisor to subtract to bring the remainder
+back into range.  In binary, this is easy - it has to be either 0 or 1,
+and to make the XOR cancel, it's just a copy of bit 32 of the remainder.
+
+When computing a CRC, we don't care about the quotient, so we can
+throw the quotient bit away, but subtract the appropriate multiple of
+the polynomial from the remainder and we're back to where we started,
+ready to process the next bit.
+
+A big-endian CRC written this way would be coded like:
+for (i = 0; i < input_bits; i++) {
+   multiple = remainder & 0x80000000 ? CRCPOLY : 0;
+   remainder = (remainder << 1 | next_input_bit()) ^ multiple;
+}
+
+Notice how, to get at bit 32 of the shifted remainder, we look
+at bit 31 of the remainder *before* shifting it.
+
+But also notice how the next_input_bit() bits we're shifting into
+the remainder don't actually affect any decision-making until
+32 bits later.  Thus, the first 32 cycles of this are pretty boring.
+Also, to add the CRC to a message, we need a 32-bit-long hole for it at
+the end, so we have to add 32 extra cycles shifting in zeros at the
+end of every message.
+
+These details lead to a standard trick: rearrange merging in the
+next_input_bit() until the moment it's needed.  Then the first 32 cycles
+can be precomputed, and merging in the final 32 zero bits to make room
+for the CRC can be skipped entirely.  This changes the code to:
+
+for (i = 0; i < input_bits; i++) {
+   remainder ^= next_input_bit() << 31;
+   multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
+   remainder = (remainder << 1) ^ multiple;
+}
+
+With this optimization, the little-endian code is particularly simple:
+for (i = 0; i < input_bits; i++) {
+   remainder

[PATCH 10/14] crc32: Add note about this patchset to crc32.c

2012-01-06 Thread Darrick J. Wong
Some final changes
- added a comment at the top of crc32.c

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 2c8e8c0..d56516d 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -1,4 +1,8 @@
 /*
+ * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
+ * cleaned up code to current version of sparse and added the slicing-by-8
+ * algorithm to the closely similar existing slicing-by-4 algorithm.
+ *
  * Oct 15, 2000 Matt Domsch matt_dom...@dell.com
  * Nicer crc32 functions/docs submitted by li...@horizon.com.  Thanks!
  * Code was from the public domain, copyright abandoned.  Code was

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 09/14] crc32: Optimize loop counter for x86

2012-01-06 Thread Darrick J. Wong
Add two changes that improve the performance of x86 systems
1. replace main loop with incrementing counter
   this change improves the performance of the selftest
   by about 5-6% on Nehalem CPUs. The apparent
   reason is that the compiler can use the loop index
   to perform an indexed memory access. This is
   reported to make the performance of PowerPC CPUs
   worse.
2. replace the rem_len loop with incrementing counter
   this change improves the performance of the selftest,
   which has more than the usual number of occurrences,
   by about 1-2% on x86 CPUs. In actual work loads
   the length is most often a multiple of 4 bytes and
   this code does not get executed as often if at all.
   Again this change is reported to make the performance
   of PowerPC get worse.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |   13 +
 1 files changed, 13 insertions(+), 0 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 6311712..2c8e8c0 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -66,6 +66,9 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
 # endif
const u32 *b;
size_t rem_len;
+# ifdef CONFIG_X86
+   size_t i;
+# endif
const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3];
const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7];
u32 q;
@@ -86,7 +89,12 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
 # endif
 
b = (const u32 *)buf;
+# ifdef CONFIG_X86
+   --b;
+   for (i = 0; i  len; i++) {
+# else
for (--b; len; --len) {
+# endif
q = crc ^ *++b; /* use pre increment for speed */
 # if CRC_LE_BITS == 32
crc = DO_CRC4;
@@ -100,9 +108,14 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
/* And the last few bytes */
if (len) {
u8 *p = (u8 *)(b + 1) - 1;
+# ifdef CONFIG_X86
+   for (i = 0; i  len; i++)
+   DO_CRC(*++p); /* use pre increment for speed */
+# else
do {
DO_CRC(*++p); /* use pre increment for speed */
} while (--len);
+# endif
}
return crc;
 #undef DO_CRC

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 14/14] crc32: Select an algorithm via kconfig

2012-01-06 Thread Darrick J. Wong
Allow the kernel builder to choose a crc32* algorithm for the kernel.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/Kconfig |   43 +++
 lib/crc32defs.h |   18 ++
 2 files changed, 61 insertions(+), 0 deletions(-)


diff --git a/lib/Kconfig b/lib/Kconfig
index cfddafc..029c0e3 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -70,6 +70,49 @@ config CRC32_SELFTEST
  and crc32_be over byte strings with random alignment and length
  and computes the total elapsed time and number of bytes processed.
 
+choice
+   prompt CRC32 implementation
+   depends on CRC32
+   default CRC32_SLICEBY8
+
+config CRC32_SLICEBY8
+   bool Slice by 8 bytes
+   help
+ Calculate checksum 8 bytes at a time with a clever slicing algorithm.
+ This is the fastest algorithm, but comes with a 8KiB lookup table.
+ Most modern processors have enough cache to hold this table without
+ thrashing the cache.
+
+ This is the default implementation choice.  Choose this one unless
+ you have a good reason not to.
+
+config CRC32_SLICEBY4
+   bool Slice by 4 bytes
+   help
+ Calculate checksum 4 bytes at a time with a clever slicing algorithm.
+ This is a bit slower than slice by 8, but has a smaller 4KiB lookup
+ table.
+
+ Only choose this option if you know what you are doing.
+
+config CRC32_SARWATE
+   bool Sarwate's Algorithm (one byte at a time)
+   help
+ Calculate checksum a byte at a time using Sarwate's algorithm.  This
+ is not particularly fast, but has a small 256 byte lookup table.
+
+ Only choose this option if you know what you are doing.
+
+config CRC32_BIT
+   bool Classic Algorithm (one bit at a time)
+   help
+ Calculate checksum one bit at a time.  This is VERY slow, but has
+ no lookup table.  This is provided as a debugging option.
+
+ Only choose this option if you are debugging crc32.
+
+endchoice
+
 config CRC7
tristate CRC7 functions
help
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index 6fd1917..64cba2c 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -13,6 +13,24 @@
  */
 #define CRC32C_POLY_LE 0x82F63B78
 
+/* Try to choose an implementation variant via Kconfig */
+#ifdef CONFIG_CRC32_SLICEBY8
+# define CRC_LE_BITS 64
+# define CRC_BE_BITS 64
+#endif
+#ifdef CONFIG_CRC32_SLICEBY4
+# define CRC_LE_BITS 32
+# define CRC_BE_BITS 32
+#endif
+#ifdef CONFIG_CRC32_SARWATE
+# define CRC_LE_BITS 8
+# define CRC_BE_BITS 8
+#endif
+#ifdef CONFIG_CRC32_BIT
+# define CRC_LE_BITS 1
+# define CRC_BE_BITS 1
+#endif
+
 /*
  * How many bits at a time to use.  Valid values are 1, 2, 4, 8, 32 and 64.
  * For less performance-sensitive, use 4 or 8 to save table size.

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 13/14] crc32: Add self-test code for crc32c

2012-01-06 Thread Darrick J. Wong
Add self-test code for crc32c.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |  363 ++-
 1 files changed, 261 insertions(+), 102 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 8df9561..382fa76 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -765,113 +765,265 @@ static struct crc_test {
u32 length; /* random 11 bit length of test */
u32 crc_le; /* expected crc32_le result */
u32 crc_be; /* expected crc32_be result */
+   u32 crc32c_le;  /* expected crc32c_le result */
 } test[] =
 {
-   {0x674bf11d, 0x0038, 0x0542, 0x0af6d466, 0xd8b6e4c1},
-   {0x35c672c6, 0x003a, 0x01aa, 0xc6d3dfba, 0x28aaf3ad},
-   {0x496da28e, 0x0039, 0x05af, 0xd933660f, 0x5d57e81f},
-   {0x09a9b90e, 0x0027, 0x01f8, 0xb45fe007, 0xf45fca9a},
-   {0xdc97e5a9, 0x0025, 0x03b6, 0xf81a3562, 0xe0126ba2},
-   {0x47c58900, 0x000a, 0x00b9, 0x8e58eccf, 0xf3afc793},
-   {0x292561e8, 0x000c, 0x0403, 0xa2ba8aaf, 0x0b797aed},
-   {0x415037f6, 0x0003, 0x0676, 0xa17d52e8, 0x7f0fdf35},
-   {0x3466e707, 0x0026, 0x0042, 0x258319be, 0x75c484a2},
-   {0xafd1281b, 0x0023, 0x02ee, 0x4428eaf8, 0x06c7ad10},
-   {0xd3857b18, 0x0028, 0x04a2, 0x5c430821, 0xb062b7cb},
-   {0x1d825a8f, 0x002b, 0x050b, 0xd2c45f0c, 0xd68634e0},
-   {0x5033e3bc, 0x000b, 0x0078, 0xa3ea4113, 0xac6d31fb},
-   {0x94f1fb5e, 0x000f, 0x03a2, 0xfbfc50b1, 0x3cfe50ed},
-   {0xc9a0fe14, 0x0009, 0x0473, 0x5fb61894, 0x87070591},
-   {0x88a034b1, 0x001c, 0x05ad, 0xc1b16053, 0x46f95c67},
-   {0xf0f72239, 0x0020, 0x026d, 0xa6fa58f3, 0xf8c2c1dd},
-   {0xcc20a5e3, 0x003b, 0x067a, 0x7740185a, 0x308b979a},
-   {0xce589c95, 0x002b, 0x0641, 0xd055e987, 0x40aae25b},
-   {0x78edc885, 0x0035, 0x05be, 0xa39cb14b, 0x035b0d1f},
-   {0x9d40a377, 0x003b, 0x0038, 0x1f47ccd2, 0x197fbc9d},
-   {0x703d0e01, 0x003c, 0x06f1, 0x88735e7c, 0xfed57c5a},
-   {0x776bf505, 0x000f, 0x05b2, 0x5cc4fc01, 0xf32efb97},
-   {0x4a3e7854, 0x0027, 0x04b8, 0x8d923c82, 0x0cbfb4a2},
-   {0x209172dd, 0x003b, 0x0356, 0xb89e9c2b, 0xd7868138},
-   {0x3ba4cc5b, 0x002f, 0x0203, 0xe51601a9, 0x5b2a1032},
-   {0xfc62f297, 0x, 0x0079, 0x71a8e1a2, 0x5d88685f},
-   {0x64280b8b, 0x0016, 0x07ab, 0x0fa7a30c, 0xda3a455f},
-   {0x97dd724b, 0x0033, 0x07ad, 0x5788b2f4, 0xd7326d32},
-   {0x61394b52, 0x0035, 0x0571, 0xc66525f1, 0xcabe7fef},
-   {0x29b4faff, 0x0024, 0x006e, 0xca13751e, 0x993648e0},
-   {0x29bfb1dc, 0x000b, 0x0244, 0x436c43f7, 0x429f7a59},
-   {0x86ae934b, 0x0035, 0x0104, 0x0760ec93, 0x9cf7d0f4},
-   {0xc4c1024e, 0x002e, 0x06b1, 0x6516a3ec, 0x19321f9c},
-   {0x3287a80a, 0x0026, 0x0496, 0x0b257eb1, 0x754ebd51},
-   {0xa4db423e, 0x0023, 0x045d, 0x9b3a66dc, 0x873e9f11},
-   {0x7a1078df, 0x0015, 0x014a, 0x8c2484c5, 0x6a628659},
-   {0x6048bd5b, 0x0006, 0x006a, 0x897e3559, 0xac9961af},
-   {0xd8f9ea20, 0x003d, 0x0277, 0x60eb905b, 0xed2aaf99},
-   {0xea5ec3b4, 0x002a, 0x04fe, 0x869965dc, 0x6c1f833b},
-   {0x2dfb005d, 0x0016, 0x0345, 0x6a3b117e, 0xf05e8521},
-   {0x5a214ade, 0x0020, 0x05b6, 0x467f70be, 0xcb22ccd3},
-   {0xf0ab9cca, 0x0032, 0x0515, 0xed223df3, 0x7f3ef01d},
-   {0x91b444f9, 0x002e, 0x07f8, 0x84e9a983, 0x5676756f},
-   {0x1b5d2ddb, 0x002e, 0x012c, 0xba638c4c, 0x3f42047b},
-   {0xd824d1bb, 0x003a, 0x07b5, 0x6288653b, 0x3a3ebea0},
-   {0x0470180c, 0x0034, 0x01f0, 0x9d5b80d6, 0x3de08195},
-   {0xffaa3a3f, 0x0036, 0x0299, 0xf3a82ab8, 0x53e0c13d},
-   {0x6406cfeb, 0x0023, 0x0600, 0xa920b8e8, 0xe4e2acf4},
-   {0xb24aaa38, 0x003e, 0x04a1, 0x657cc328, 0x5077b2c3},
-   {0x58b2ab7c, 0x0039, 0x02b4, 0x3a17ee7e, 0x9dcb3643},
-   {0x3db85970, 0x0006, 0x02b6, 0x95268b59, 0xb9812c10},
-   {0x857830c5, 0x0003, 0x0590, 0x4ef439d5, 0xf042161d},
-   {0xe1fcd978, 0x003e, 0x07d8, 0xae8d8699, 0xce0a1ef5},
-   {0xb982a768, 0x0016, 0x06e0, 0x62fad3df, 0x5f8a067b},
-   {0x1d581ce8, 0x001e, 0x058b, 0xf0f5da53, 0x26e39eee},
-   {0x2456719b, 0x0025, 0x0503, 0x4296ac64, 0xd50e4c14},
-   {0xfae6d8f2, 0x, 0x055d, 0x057fdf2e, 0x2a31391a},
-   {0xcba828e3, 0x0039, 0x02ce, 0xe3f22351, 0x8f00877b},
-   {0x13d25952, 0x000a, 0x072d, 0x76d4b4cc, 0x5eb67ec3},
-   {0x0342be3f, 0x0015, 0x0599, 0xec75d9f1, 0x9d4d2826},
-   {0xeaa344e0, 0x0014, 0x04d8, 0x72a4c981, 0x2064ea06},
-   {0xbbb52021, 0x003b, 0x0272

[PATCH 07/14] crc32: Make CRC_*_BITS definition correspond to actual bit counts

2012-01-06 Thread Darrick J. Wong
crc32.c provides a choice of one of several algorithms for
computing the LSB and MSB versions of the CRC32 checksum
based on the parameters CRC_LE_BITS and CRC_BE_BITS. In the
original version the values 1, 2, 4 and 8 respectively selected
versions of the algorithm that computed the crc 1, 2, 4 and 32
bits at a time. This patch series adds a new version that computes
the CRC 64 bits at a time. To make things easier to understand
the parameter has been reinterpreted to actually stand for the
number of bits processed in each step of the algorithm so that
the old value 8 has been replaced with the value 32. This also
allows us to add in a widely used crc algorithm that
computes the crc 8 bits at a time called the Sarwate algorithm.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c  |   17 ++---
 lib/crc32defs.h  |   18 ++
 lib/gen_crc32table.c |   11 ++-
 3 files changed, 34 insertions(+), 12 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index ff6bb9a..157b35f 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -27,13 +27,13 @@
 #include linux/types.h
 #include crc32defs.h
 
-#if CRC_LE_BITS == 8
+#if CRC_LE_BITS  8
 # define tole(x) (__force u32) __constant_cpu_to_le32(x)
 #else
 # define tole(x) (x)
 #endif
 
-#if CRC_BE_BITS == 8
+#if CRC_BE_BITS  8
 # define tobe(x) (__force u32) __constant_cpu_to_be32(x)
 #else
 # define tobe(x) (x)
@@ -45,7 +45,7 @@ MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com);
 MODULE_DESCRIPTION(Ethernet CRC32 calculations);
 MODULE_LICENSE(GPL);
 
-#if CRC_LE_BITS == 8 || CRC_BE_BITS == 8
+#if CRC_LE_BITS  8 || CRC_BE_BITS  8
 
 static inline u32
 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 
(*tab)[256])
@@ -126,6 +126,12 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, 
size_t len)
crc = (crc  4) ^ crc32table_le[0][crc  15];
}
 # elif CRC_LE_BITS == 8
+   /* aka Sarwate algorithm */
+   while (len--) {
+   crc ^= *p++;
+   crc = (crc  8) ^ crc32table_le[0][crc  255];
+   }
+# else
const u32  (*tab)[] = crc32table_le;
 
crc = (__force u32) __cpu_to_le32(crc);
@@ -169,6 +175,11 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, 
size_t len)
crc = (crc  4) ^ crc32table_be[0][crc  28];
}
 # elif CRC_BE_BITS == 8
+   while (len--) {
+   crc ^= *p++  24;
+   crc = (crc  8) ^ crc32table_be[0][crc  24];
+   }
+# else
const u32  (*tab)[] = crc32table_be;
 
crc = (__force u32) __cpu_to_be32(crc);
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index f5a5401..daa3a5e 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -6,27 +6,29 @@
 #define CRCPOLY_LE 0xedb88320
 #define CRCPOLY_BE 0x04c11db7
 
-/* How many bits at a time to use.  Requires a table of 4CRC_xx_BITS bytes. 
*/
-/* For less performance-sensitive, use 4 */
+/* How many bits at a time to use.  Valid values are 1, 2, 4, 8, and 32. */
+/* For less performance-sensitive, use 4 or 8 */
 #ifndef CRC_LE_BITS
-# define CRC_LE_BITS 8
+# define CRC_LE_BITS 32
 #endif
 #ifndef CRC_BE_BITS
-# define CRC_BE_BITS 8
+# define CRC_BE_BITS 32
 #endif
 
 /*
  * Little-endian CRC computation.  Used with serial bit streams sent
  * lsbit-first.  Be sure to use cpu_to_le32() to append the computed CRC.
  */
-#if CRC_LE_BITS  8 || CRC_LE_BITS  1 || CRC_LE_BITS  CRC_LE_BITS-1
-# error CRC_LE_BITS must be a power of 2 between 1 and 8
+#if CRC_LE_BITS  32 || CRC_LE_BITS  1 || CRC_LE_BITS == 16 || \
+   CRC_LE_BITS  CRC_LE_BITS-1
+# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32}
 #endif
 
 /*
  * Big-endian CRC computation.  Used with serial bit streams sent
  * msbit-first.  Be sure to use cpu_to_be32() to append the computed CRC.
  */
-#if CRC_BE_BITS  8 || CRC_BE_BITS  1 || CRC_BE_BITS  CRC_BE_BITS-1
-# error CRC_BE_BITS must be a power of 2 between 1 and 8
+#if CRC_BE_BITS  32 || CRC_BE_BITS  1 || CRC_BE_BITS == 16 || \
+   CRC_BE_BITS  CRC_BE_BITS-1
+# error CRC_BE_BITS must be one of {1, 2, 4, 8, 32}
 #endif
diff --git a/lib/gen_crc32table.c b/lib/gen_crc32table.c
index eced769..99ac744 100644
--- a/lib/gen_crc32table.c
+++ b/lib/gen_crc32table.c
@@ -4,8 +4,17 @@
 
 #define ENTRIES_PER_LINE 4
 
+#if CRC_LE_BITS = 8
 #define LE_TABLE_SIZE (1  CRC_LE_BITS)
+#else
+#define LE_TABLE_SIZE 256
+#endif
+
+#if CRC_BE_BITS = 8
 #define BE_TABLE_SIZE (1  CRC_BE_BITS)
+#else
+#define BE_TABLE_SIZE 256
+#endif
 
 static uint32_t crc32table_le[4][256];
 static uint32_t crc32table_be[4][256];
@@ -24,7 +33,7 @@ static void crc32init_le(void)
 
crc32table_le[0][0] = 0;
 
-   for (i = 1  (CRC_LE_BITS - 1); i; i = 1) {
+   for (i = LE_TABLE_SIZE  1; i; i = 1) {
crc = (crc  1) ^ ((crc  1) ? CRCPOLY_LE : 0

[PATCH 03/14] crc32: Simplify unit test code

2012-01-06 Thread Darrick J. Wong
Replaced the unit test provided in crc32.c, which doesn't have a
makefile and doesn't compile with current headers, with a simpler
self test routine that also gives a measure of performance and
runs at module init time. The self test option can be enabled
through a configuration option CONFIG_CRC32_SELFTEST.

The test stresses the pre and post loops and is thus not very
realistic since actual uses will likely have addresses and lengths
that are at least 4 byte aligned. However, the main loop is long
enough so that the performance is dominated by that loop.

The expected values for crc32_le and crc32_be were generated
with the original version of crc32.c using CRC_LE_BITS = 8 and
CRC_BE_BITS = 8. These values were then used to check all the
values of the BITS parameters in both the original and new versions.

The performance results show some variability from run to run
in spite of attempts to both warm the cache and reduce the amount
of OS noise by limiting interrupts during the test. To get comparable
results and to analyse options with respect to performance the best time
reported over a small sample of runs has been taken.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/Kconfig |   10 +
 lib/crc32.c |  798 ++-
 2 files changed, 691 insertions(+), 117 deletions(-)


diff --git a/lib/Kconfig b/lib/Kconfig
index 32f3e5a..2bc5834 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -60,6 +60,16 @@ config CRC32
  kernel tree does. Such modules that use library CRC32 functions
  require M here.
 
+config CRC32_SELFTEST
+   bool CRC32 perform self test on init
+   default n
+   depends on CRC32
+   help
+ This option enables the CRC32 library functions to perform a
+ self test on initialization. The self test computes crc32_le
+ and crc32_be over byte strings with random alignment and length
+ and computes the total elapsed time and number of bytes processed.
+
 config CRC7
tristate CRC7 functions
help
diff --git a/lib/crc32.c b/lib/crc32.c
index 7ac8b0d..7a0e5a9 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -210,137 +210,701 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, 
size_t len)
 EXPORT_SYMBOL(crc32_le);
 EXPORT_SYMBOL(crc32_be);
 
-#ifdef UNITTEST
+#ifdef CONFIG_CRC32_SELFTEST
 
-#include stdlib.h
-#include stdio.h
-
-#if 0  /*Not used at present */
-static void
-buf_dump(char const *prefix, unsigned char const *buf, size_t len)
+/* 4096 random bytes */
+static u8 __attribute__((__aligned__(8))) test_buf[] =
 {
-   fputs(prefix, stdout);
-   while (len--)
-   printf( %02x, *buf++);
-   putchar('\n');
-
-}
-#endif
-
-static void bytereverse(unsigned char *buf, size_t len)
+   0x5b, 0x85, 0x21, 0xcb, 0x09, 0x68, 0x7d, 0x30,
+   0xc7, 0x69, 0xd7, 0x30, 0x92, 0xde, 0x59, 0xe4,
+   0xc9, 0x6e, 0x8b, 0xdb, 0x98, 0x6b, 0xaa, 0x60,
+   0xa8, 0xb5, 0xbc, 0x6c, 0xa9, 0xb1, 0x5b, 0x2c,
+   0xea, 0xb4, 0x92, 0x6a, 0x3f, 0x79, 0x91, 0xe4,
+   0xe9, 0x70, 0x51, 0x8c, 0x7f, 0x95, 0x6f, 0x1a,
+   0x56, 0xa1, 0x5c, 0x27, 0x03, 0x67, 0x9f, 0x3a,
+   0xe2, 0x31, 0x11, 0x29, 0x6b, 0x98, 0xfc, 0xc4,
+   0x53, 0x24, 0xc5, 0x8b, 0xce, 0x47, 0xb2, 0xb9,
+   0x32, 0xcb, 0xc1, 0xd0, 0x03, 0x57, 0x4e, 0xd4,
+   0xe9, 0x3c, 0xa1, 0x63, 0xcf, 0x12, 0x0e, 0xca,
+   0xe1, 0x13, 0xd1, 0x93, 0xa6, 0x88, 0x5c, 0x61,
+   0x5b, 0xbb, 0xf0, 0x19, 0x46, 0xb4, 0xcf, 0x9e,
+   0xb6, 0x6b, 0x4c, 0x3a, 0xcf, 0x60, 0xf9, 0x7a,
+   0x8d, 0x07, 0x63, 0xdb, 0x40, 0xe9, 0x0b, 0x6f,
+   0xad, 0x97, 0xf1, 0xed, 0xd0, 0x1e, 0x26, 0xfd,
+   0xbf, 0xb7, 0xc8, 0x04, 0x94, 0xf8, 0x8b, 0x8c,
+   0xf1, 0xab, 0x7a, 0xd4, 0xdd, 0xf3, 0xe8, 0x88,
+   0xc3, 0xed, 0x17, 0x8a, 0x9b, 0x40, 0x0d, 0x53,
+   0x62, 0x12, 0x03, 0x5f, 0x1b, 0x35, 0x32, 0x1f,
+   0xb4, 0x7b, 0x93, 0x78, 0x0d, 0xdb, 0xce, 0xa4,
+   0xc0, 0x47, 0xd5, 0xbf, 0x68, 0xe8, 0x5d, 0x74,
+   0x8f, 0x8e, 0x75, 0x1c, 0xb2, 0x4f, 0x9a, 0x60,
+   0xd1, 0xbe, 0x10, 0xf4, 0x5c, 0xa1, 0x53, 0x09,
+   0xa5, 0xe0, 0x09, 0x54, 0x85, 0x5c, 0xdc, 0x07,
+   0xe7, 0x21, 0x69, 0x7b, 0x8a, 0xfd, 0x90, 0xf1,
+   0x22, 0xd0, 0xb4, 0x36, 0x28, 0xe6, 0xb8, 0x0f,
+   0x39, 0xde, 0xc8, 0xf3, 0x86, 0x60, 0x34, 0xd2,
+   0x5e, 0xdf, 0xfd, 0xcf, 0x0f, 0xa9, 0x65, 0xf0,
+   0xd5, 0x4d, 0x96, 0x40, 0xe3, 0xdf, 0x3f, 0x95,
+   0x5a, 0x39, 0x19, 0x93, 0xf4, 0x75, 0xce, 0x22,
+   0x00, 0x1c, 0x93, 0xe2, 0x03, 0x66, 0xf4, 0x93,
+   0x73, 0x86, 0x81, 0x8e, 0x29, 0x44, 0x48, 0x86,
+   0x61, 0x7c, 0x48, 0xa3, 0x43, 0xd2, 0x9c, 0x8d,
+   0xd4, 0x95, 0xdd, 0xe1, 0x22, 0x89, 0x3a, 0x40,
+   0x4c, 0x1b, 0x8a, 0x04, 0xa8, 0x09, 0x69, 0x8b,
+   0xea, 0xc6, 0x55

[PATCH 04/14] crc32: Speed up memory table access on powerpc

2012-01-06 Thread Darrick J. Wong
Replace 2D array references by pointer references in loops.
This change has no effect on X86 code but improves PPC
performance.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |   21 +++--
 1 files changed, 11 insertions(+), 10 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 7a0e5a9..c93c9ae 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -53,20 +53,21 @@ static inline u32
 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 
(*tab)[256])
 {
 # ifdef __LITTLE_ENDIAN
-#  define DO_CRC(x) crc = tab[0][(crc ^ (x))  255] ^ (crc  8)
-#  define DO_CRC4 crc = tab[3][(crc)  255] ^ \
-   tab[2][(crc  8)  255] ^ \
-   tab[1][(crc  16)  255] ^ \
-   tab[0][(crc  24)  255]
+#  define DO_CRC(x) (crc = t0[(crc ^ (x))  255] ^ (crc  8))
+#  define DO_CRC4 crc = t3[(crc)  255] ^ \
+   t2[(crc  8)  255] ^ \
+   t1[(crc  16)  255] ^ \
+   t0[(crc  24)  255]
 # else
-#  define DO_CRC(x) crc = tab[0][((crc  24) ^ (x))  255] ^ (crc  8)
-#  define DO_CRC4 crc = tab[0][(crc)  255] ^ \
-   tab[1][(crc  8)  255] ^ \
-   tab[2][(crc  16)  255] ^ \
-   tab[3][(crc  24)  255]
+#  define DO_CRC(x) (crc = t0[((crc  24) ^ (x))  255] ^ (crc  8))
+#  define DO_CRC4 crc = t0[(crc)  255] ^ \
+   t1[(crc  8)  255] ^ \
+   t2[(crc  16)  255] ^ \
+   t3[(crc  24)  255]
 # endif
const u32 *b;
size_trem_len;
+   const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3];
 
/* Align it */
if (unlikely((long)buf  3  len)) {

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 05/14] crc32: Miscellaneous cleanups

2012-01-06 Thread Darrick J. Wong
Misc cleanup of lib/crc32.c and related files
- removed unnecessary header files.
- straightened out some convoluted ifdef's
- rewrote some references that indexed 2 dimensional arrays as if they
  were 1 dimensional arrays to make them correct, i.e. replaced tab[i]
  with tab[0][i].
- a few trivial whitespace changes
- fixed a warning in gen_crc32table.c caused by a mismatch in the
  type of the pointer passed to output_table. Since the table is
  only used at kernel compile time, it is simpler to make the table
  big enough to hold the largest column size used. One cannot make the
  column size smaller in output_table because it has to be used by
  both the le and be tables and they can have different column sizes.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c  |  104 +-
 lib/gen_crc32table.c |6 +--
 2 files changed, 39 insertions(+), 71 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index c93c9ae..2a87ea2 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -23,13 +23,10 @@
 /* see: Documentation/crc32.txt for a description of algorithms */
 
 #include linux/crc32.h
-#include linux/kernel.h
 #include linux/module.h
-#include linux/compiler.h
 #include linux/types.h
-#include linux/init.h
-#include linux/atomic.h
 #include crc32defs.h
+
 #if CRC_LE_BITS == 8
 # define tole(x) __constant_cpu_to_le32(x)
 #else
@@ -41,6 +38,7 @@
 #else
 # define tobe(x) (x)
 #endif
+
 #include crc32table.h
 
 MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com);
@@ -96,6 +94,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
 #undef DO_CRC4
 }
 #endif
+
 /**
  * crc32_le() - Calculate bitwise little-endian Ethernet AUTODIN II CRC32
  * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for
@@ -103,53 +102,39 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len);
-
-#if CRC_LE_BITS == 1
-/*
- * In fact, the table-based code will work in this case, but it can be
- * simplified by inlining the table in ?: form.
- */
-
 u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
 {
+#if CRC_LE_BITS == 1
int i;
while (len--) {
crc ^= *p++;
for (i = 0; i  8; i++)
crc = (crc  1) ^ ((crc  1) ? CRCPOLY_LE : 0);
}
-   return crc;
-}
-#else  /* Table-based approach */
-
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
-{
-# if CRC_LE_BITS == 8
-   const u32  (*tab)[] = crc32table_le;
-
-   crc = __cpu_to_le32(crc);
-   crc = crc32_body(crc, p, len, tab);
-   return __le32_to_cpu(crc);
-# elif CRC_LE_BITS == 4
+# elif CRC_LE_BITS == 2
while (len--) {
crc ^= *p++;
-   crc = (crc  4) ^ crc32table_le[crc  15];
-   crc = (crc  4) ^ crc32table_le[crc  15];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
}
-   return crc;
-# elif CRC_LE_BITS == 2
+# elif CRC_LE_BITS == 4
while (len--) {
crc ^= *p++;
-   crc = (crc  2) ^ crc32table_le[crc  3];
-   crc = (crc  2) ^ crc32table_le[crc  3];
-   crc = (crc  2) ^ crc32table_le[crc  3];
-   crc = (crc  2) ^ crc32table_le[crc  3];
+   crc = (crc  4) ^ crc32table_le[0][crc  15];
+   crc = (crc  4) ^ crc32table_le[0][crc  15];
}
+# elif CRC_LE_BITS == 8
+   const u32  (*tab)[] = crc32table_le;
+
+   crc = __cpu_to_le32(crc);
+   crc = crc32_body(crc, p, len, tab);
+   crc = __le32_to_cpu(crc);
+#endif
return crc;
-# endif
 }
-#endif
+EXPORT_SYMBOL(crc32_le);
 
 /**
  * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
@@ -158,16 +143,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, 
size_t len)
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len);
-
-#if CRC_BE_BITS == 1
-/*
- * In fact, the table-based code will work in this case, but it can be
- * simplified by inlining the table in ?: form.
- */
-
 u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
 {
+#if CRC_BE_BITS == 1
int i;
while (len--) {
crc ^= *p++  24;
@@ -176,39 +154,29 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, 
size_t len

[PATCH 06/14] crc32: Fix mixing of endian-specific types

2012-01-06 Thread Darrick J. Wong
crc32.c in its original version freely mixed u32, __le32 and __be32 types
which caused warnings from sparse with __CHECK_ENDIAN__.
This patch fixes these by forcing the types to u32.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |   12 ++--
 1 files changed, 6 insertions(+), 6 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 2a87ea2..ff6bb9a 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -28,13 +28,13 @@
 #include crc32defs.h
 
 #if CRC_LE_BITS == 8
-# define tole(x) __constant_cpu_to_le32(x)
+# define tole(x) (__force u32) __constant_cpu_to_le32(x)
 #else
 # define tole(x) (x)
 #endif
 
 #if CRC_BE_BITS == 8
-# define tobe(x) __constant_cpu_to_be32(x)
+# define tobe(x) (__force u32) __constant_cpu_to_be32(x)
 #else
 # define tobe(x) (x)
 #endif
@@ -128,9 +128,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t 
len)
 # elif CRC_LE_BITS == 8
const u32  (*tab)[] = crc32table_le;
 
-   crc = __cpu_to_le32(crc);
+   crc = (__force u32) __cpu_to_le32(crc);
crc = crc32_body(crc, p, len, tab);
-   crc = __le32_to_cpu(crc);
+   crc = __le32_to_cpu((__force __le32)crc);
 #endif
return crc;
 }
@@ -171,9 +171,9 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t 
len)
 # elif CRC_BE_BITS == 8
const u32  (*tab)[] = crc32table_be;
 
-   crc = __cpu_to_be32(crc);
+   crc = (__force u32) __cpu_to_be32(crc);
crc = crc32_body(crc, p, len, tab);
-   crc = __be32_to_cpu(crc);
+   crc = __be32_to_cpu((__force __be32)crc);
 # endif
return crc;
 }

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 01/14] crc32: removed two instances of trailing whitespaces

2012-01-06 Thread Darrick J. Wong
- remove trailing whitespace from lib/crc32.c
- remove trailing whitespace from lib/crc32defs.h

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |2 +-
 lib/crc32defs.h |2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index a6e633a..23b08ba 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -316,7 +316,7 @@ EXPORT_SYMBOL(crc32_be);
  * in the correct multiple to subtract, we can shift a byte at a time.
  * This produces a 40-bit (rather than a 33-bit) intermediate remainder,
  * but again the multiple of the polynomial to subtract depends only on
- * the high bits, the high 8 bits in this case.  
+ * the high bits, the high 8 bits in this case.
  *
  * The multiple we need in that case is the low 32 bits of a 40-bit
  * value whose high 8 bits are given, and which is a multiple of the
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index 9b6773d..f5a5401 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -8,7 +8,7 @@
 
 /* How many bits at a time to use.  Requires a table of 4CRC_xx_BITS bytes. 
*/
 /* For less performance-sensitive, use 4 */
-#ifndef CRC_LE_BITS 
+#ifndef CRC_LE_BITS
 # define CRC_LE_BITS 8
 #endif
 #ifndef CRC_BE_BITS

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 11/14] crc32: Bolt on crc32c

2012-01-06 Thread Darrick J. Wong
Reuse the existing crc32 code to stamp out a crc32c implementation.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 include/linux/crc32.h |2 ++
 lib/Kconfig   |8 +++---
 lib/crc32.c   |   62 +++--
 lib/crc32defs.h   |7 ++
 lib/gen_crc32table.c  |   35 ++--
 5 files changed, 80 insertions(+), 34 deletions(-)


diff --git a/include/linux/crc32.h b/include/linux/crc32.h
index 391a259..68267b6 100644
--- a/include/linux/crc32.h
+++ b/include/linux/crc32.h
@@ -11,6 +11,8 @@
 extern u32  crc32_le(u32 crc, unsigned char const *p, size_t len);
 extern u32  crc32_be(u32 crc, unsigned char const *p, size_t len);
 
+extern u32  __crc32c_le(u32 crc, unsigned char const *p, size_t len);
+
 #define crc32(seed, data, length)  crc32_le(seed, (unsigned char const 
*)(data), length)
 
 /*
diff --git a/lib/Kconfig b/lib/Kconfig
index 2bc5834..cfddafc 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -51,14 +51,14 @@ config CRC_ITU_T
  functions require M here.
 
 config CRC32
-   tristate CRC32 functions
+   tristate CRC32/CRC32c functions
default y
select BITREVERSE
help
  This option is provided for the case where no in-kernel-tree
- modules require CRC32 functions, but a module built outside the
- kernel tree does. Such modules that use library CRC32 functions
- require M here.
+ modules require CRC32/CRC32c functions, but a module built outside
+ the kernel tree does. Such modules that use library CRC32/CRC32c
+ functions require M here.
 
 config CRC32_SELFTEST
bool CRC32 perform self test on init
diff --git a/lib/crc32.c b/lib/crc32.c
index d56516d..8df9561 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -46,7 +46,7 @@
 #include crc32table.h
 
 MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com);
-MODULE_DESCRIPTION(Ethernet CRC32 calculations);
+MODULE_DESCRIPTION(Various CRC32 calculations);
 MODULE_LICENSE(GPL);
 
 #if CRC_LE_BITS  8 || CRC_BE_BITS  8
@@ -135,46 +135,57 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
+static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
+ size_t len, const u32 (*tab)[256],
+ u32 polynomial)
 {
 #if CRC_LE_BITS == 1
int i;
while (len--) {
crc ^= *p++;
for (i = 0; i  8; i++)
-   crc = (crc  1) ^ ((crc  1) ? CRCPOLY_LE : 0);
+   crc = (crc  1) ^ ((crc  1) ? polynomial : 0);
}
 # elif CRC_LE_BITS == 2
while (len--) {
crc ^= *p++;
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
}
 # elif CRC_LE_BITS == 4
while (len--) {
crc ^= *p++;
-   crc = (crc  4) ^ crc32table_le[0][crc  15];
-   crc = (crc  4) ^ crc32table_le[0][crc  15];
+   crc = (crc  4) ^ tab[0][crc  15];
+   crc = (crc  4) ^ tab[0][crc  15];
}
 # elif CRC_LE_BITS == 8
/* aka Sarwate algorithm */
while (len--) {
crc ^= *p++;
-   crc = (crc  8) ^ crc32table_le[0][crc  255];
+   crc = (crc  8) ^ tab[0][crc  255];
}
 # else
-   const u32  (*tab)[] = crc32table_le;
-
crc = (__force u32) __cpu_to_le32(crc);
crc = crc32_body(crc, p, len, tab);
crc = __le32_to_cpu((__force __le32)crc);
 #endif
return crc;
 }
+
+u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
+{
+   return crc32_le_generic(crc, p, len, crc32table_le, CRCPOLY_LE);
+}
 EXPORT_SYMBOL(crc32_le);
 
+u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
+{
+   return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE);
+}
+EXPORT_SYMBOL(__crc32c_le);
+
 /**
  * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
  * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for
@@ -182,7 +193,9 @@ EXPORT_SYMBOL(crc32_le);
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
+static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p,
+ size_t len, const u32 (*tab)[256

[PATCH 12/14] crypto: crc32c should use library implementation

2012-01-06 Thread Darrick J. Wong
Since lib/crc32.c now provides crc32c, remove the software implementation here
and call the library function instead.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 crypto/Kconfig  |1 +
 crypto/crc32c.c |   94 ++-
 2 files changed, 4 insertions(+), 91 deletions(-)


diff --git a/crypto/Kconfig b/crypto/Kconfig
index 527a857..4c9e93a 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -310,6 +310,7 @@ comment Digest
 config CRYPTO_CRC32C
tristate CRC32c CRC algorithm
select CRYPTO_HASH
+   select CRC32
help
  Castagnoli, et al Cyclic Redundancy-Check Algorithm.  Used
  by iSCSI for header and data digests and by others.
diff --git a/crypto/crc32c.c b/crypto/crc32c.c
index 3f9ad28..06f7018 100644
--- a/crypto/crc32c.c
+++ b/crypto/crc32c.c
@@ -40,6 +40,7 @@
 #include linux/module.h
 #include linux/string.h
 #include linux/kernel.h
+#include linux/crc32.h
 
 #define CHKSUM_BLOCK_SIZE  1
 #define CHKSUM_DIGEST_SIZE 4
@@ -53,95 +54,6 @@ struct chksum_desc_ctx {
 };
 
 /*
- * This is the CRC-32C table
- * Generated with:
- * width = 32 bits
- * poly = 0x1EDC6F41
- * reflect input bytes = true
- * reflect output bytes = true
- */
-
-static const u32 crc32c_table[256] = {
-   0xL, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
-   0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
-   0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
-   0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
-   0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
-   0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
-   0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
-   0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
-   0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
-   0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
-   0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
-   0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
-   0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
-   0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
-   0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
-   0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
-   0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
-   0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
-   0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
-   0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
-   0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
-   0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
-   0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
-   0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
-   0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
-   0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
-   0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
-   0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
-   0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
-   0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
-   0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
-   0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
-   0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
-   0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
-   0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
-   0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
-   0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
-   0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
-   0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
-   0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
-   0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
-   0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
-   0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
-   0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
-   0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
-   0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
-   0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
-   0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
-   0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
-   0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
-   0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
-   0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
-   0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
-   0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
-   0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
-   0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
-   0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
-   0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
-   0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
-   0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
-   0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
-   0x34F4F86AL, 0xC69F7B69L

[PATCH 08/14] crc32: Add slice-by-8 algorithm to existing code

2012-01-06 Thread Darrick J. Wong
Add the slicing-by-8 algorithm to the existing
slicing-by-4 algorithm. This consists of:
- extending the largest BITS size from 32 to 64
- extending the tables from tab[4][256] to up to tab[8][256]
- adding code for the inner loop.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c  |   40 
 lib/crc32defs.h  |   29 +
 lib/gen_crc32table.c |   43 +++
 3 files changed, 76 insertions(+), 36 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 157b35f..6311712 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -47,25 +47,28 @@ MODULE_LICENSE(GPL);
 
 #if CRC_LE_BITS  8 || CRC_BE_BITS  8
 
+/* implements slicing-by-4 or slicing-by-8 algorithm */
 static inline u32
 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 
(*tab)[256])
 {
 # ifdef __LITTLE_ENDIAN
 #  define DO_CRC(x) (crc = t0[(crc ^ (x))  255] ^ (crc  8))
-#  define DO_CRC4 crc = t3[(crc)  255] ^ \
-   t2[(crc  8)  255] ^ \
-   t1[(crc  16)  255] ^ \
-   t0[(crc  24)  255]
+#  define DO_CRC4 (t3[(q)  255] ^ t2[(q  8)  255] ^ \
+  t1[(q  16)  255] ^ t0[(q  24)  255])
+#  define DO_CRC8 (t7[(q)  255] ^ t6[(q  8)  255] ^ \
+  t5[(q  16)  255] ^ t4[(q  24)  255])
 # else
 #  define DO_CRC(x) (crc = t0[((crc  24) ^ (x))  255] ^ (crc  8))
-#  define DO_CRC4 crc = t0[(crc)  255] ^ \
-   t1[(crc  8)  255] ^ \
-   t2[(crc  16)  255] ^ \
-   t3[(crc  24)  255]
+#  define DO_CRC4 (t0[(q)  255] ^ t1[(q  8)  255] ^ \
+  t2[(q  16)  255] ^ t3[(q  24)  255])
+#  define DO_CRC8 (t4[(q)  255] ^ t5[(q  8)  255] ^ \
+  t6[(q  16)  255] ^ t7[(q  24)  255])
 # endif
const u32 *b;
-   size_trem_len;
+   size_t rem_len;
const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3];
+   const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7];
+   u32 q;
 
/* Align it */
if (unlikely((long)buf  3  len)) {
@@ -73,13 +76,25 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
DO_CRC(*buf++);
} while ((--len)  ((long)buf)3);
}
+
+# if CRC_LE_BITS == 32
rem_len = len  3;
-   /* load data 32 bits wide, xor data 32 bits wide. */
len = len  2;
+# else
+   rem_len = len  7;
+   len = len  3;
+# endif
+
b = (const u32 *)buf;
for (--b; len; --len) {
-   crc ^= *++b; /* use pre increment for speed */
-   DO_CRC4;
+   q = crc ^ *++b; /* use pre increment for speed */
+# if CRC_LE_BITS == 32
+   crc = DO_CRC4;
+# else
+   crc = DO_CRC8;
+   q = *++b;
+   crc ^= DO_CRC4;
+# endif
}
len = rem_len;
/* And the last few bytes */
@@ -92,6 +107,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
return crc;
 #undef DO_CRC
 #undef DO_CRC4
+#undef DO_CRC8
 }
 #endif
 
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index daa3a5e..8181592 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -6,29 +6,42 @@
 #define CRCPOLY_LE 0xedb88320
 #define CRCPOLY_BE 0x04c11db7
 
-/* How many bits at a time to use.  Valid values are 1, 2, 4, 8, and 32. */
-/* For less performance-sensitive, use 4 or 8 */
+/*
+ * How many bits at a time to use.  Valid values are 1, 2, 4, 8, 32 and 64.
+ * For less performance-sensitive, use 4 or 8 to save table size.
+ * For larger systems choose same as CPU architecture as default.
+ * This works well on X86_64, SPARC64 systems. This may require some
+ * elaboration after experiments with other architectures.
+ */
 #ifndef CRC_LE_BITS
-# define CRC_LE_BITS 32
+#  ifdef CONFIG_64BIT
+#  define CRC_LE_BITS 64
+#  else
+#  define CRC_LE_BITS 32
+#  endif
 #endif
 #ifndef CRC_BE_BITS
-# define CRC_BE_BITS 32
+#  ifdef CONFIG_64BIT
+#  define CRC_BE_BITS 64
+#  else
+#  define CRC_BE_BITS 32
+#  endif
 #endif
 
 /*
  * Little-endian CRC computation.  Used with serial bit streams sent
  * lsbit-first.  Be sure to use cpu_to_le32() to append the computed CRC.
  */
-#if CRC_LE_BITS  32 || CRC_LE_BITS  1 || CRC_LE_BITS == 16 || \
+#if CRC_LE_BITS  64 || CRC_LE_BITS  1 || CRC_LE_BITS == 16 || \
CRC_LE_BITS  CRC_LE_BITS-1
-# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32}
+# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32, 64}
 #endif
 
 /*
  * Big-endian CRC computation.  Used with serial bit streams sent
  * msbit-first.  Be sure to use cpu_to_be32() to append the computed CRC.
  */
-#if CRC_BE_BITS  32 || CRC_BE_BITS  1 || CRC_BE_BITS == 16

Re: [PATCH 14/14] crc32: Select an algorithm via kconfig

2011-12-13 Thread Darrick J. Wong
On Tue, Dec 13, 2011 at 09:27:10AM +0100, Joakim Tjernlund wrote:
 Darrick J. Wong djw...@us.ibm.com wrote on 2011/12/13 07:32:28:
 
  On Mon, Dec 12, 2011 at 05:10:45PM -0600, Bob Pearson wrote:
   That choice was for Joakim who measured better performance on his 32 bit 
   PPC
   platform with by 4.
 
  Ok.  On my 1.33GHz PowerBook I get ~255MB/s with slice by 4 and ~270MB/s 
  with
  slice by 8.  I think it's a PPC 7447, and definitely 32-bit.  In any case, 
  it
  reports having 32K of L1D cache.
 
 I tested Bobs early version on my mpc8321(266MHz, embedded CPU) and it was 
 just
 half the speed compared with current crc32.

I wonder, given the patch "crc32: Speed up memory table access on powerpc",
would you mind retesting to see if slice by 8 still trails slice by 4 on your
powerpc?  I see that your mpc8321 has 16K of L1D cache and a 32-bit memory bus
whereas my 7447 has a 64-bit memory bus.  I wonder if memory bus size could be
a defining characteristic...?

I tried out the crc32c code on an s390x today; apparently by-8 trails by-4
there too.  It's unfortunately difficult to figure out the hardware details of
whatever's going on underneath that VM.

--D
 
  Jocke
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 14/14] crc32: Select an algorithm via kconfig

2011-12-12 Thread Darrick J. Wong
On Fri, Dec 02, 2011 at 06:36:46PM -0800, Darrick J. Wong wrote:
 On Fri, Dec 02, 2011 at 08:25:05AM +0800, Herbert Xu wrote:
  On Thu, Dec 01, 2011 at 12:15:17PM -0800, Darrick J. Wong wrote:
   Allow the kernel builder to choose a crc32* algorithm for the kernel.
   
   Signed-off-by: Darrick J. Wong djw...@us.ibm.com
  
  I don't like this at all.  How do you expect distros or indeed
  anyone to make this choice? For generic C implementations like
  this we should only have one, and not many.
 
 Slice-by-8 should be picked automatically if the builder doesn't explicitly
 pick another one.  The other choices are provided for people who want a 
 slimmer
 cache footprint.  I guess I could make the Kconfig file a bit more explicit
 about slice-by-8 being default, or I guess we could just ignore this one patch
 and thereby keeping us with the old method where anyone who wants the slimmer
 implementations patches the #defines.

Ok, here's a patch that makes it more explicit that sliceby8 is the default.
I expect distros and anyone else to simply hit Enter.  The only people who
should do otherwise are people who know they are building for machines that
have small cache sizes such that the crc table fights for cache lines with the
data being checksummed.

I made a quick survey of CPU L1 cache quantities:

All Intel CPUs since the Pentium MMX have > 8KiB of L1.
All AMD CPUs since the K5 have had > 8KiB of L1.
Most SPARC64 CPUs except the UltraSparc T1 and T2 CPUs have > 8KiB of L1.
Most PowerPC CPUs since the 601 seem to have > 8KiB of L1.
All IBM POWER CPUs since at least the POWER2 have had > 8KiB of L1.
There are too many different ARM cores for me to track.  My smartphones and
embedded ARM controllers all have > 8KiB of L1, but that's not enough to
generalize.

While I might've been tempted to agree with Herbert and hardwire the code to
use slice by 8, there are enough CPUs out there that *could* have too-small L1
caches that I'm not comfortable with _removing_ the Kconfig option to use a
slimmer algorithm.  I can't gate the decision on 64-bitness either, since I've
seen plenty of i386 CPUs that benefit from slice by 8, and the UltraSparc T2 is
a 64-bit processor that seems likely to suffer cache thrashing.

I think having a configurable menu that steers people towards slice by 8 is
fine.  Bob, was there a reason for picking slice by 4 for 32-bit machines?

D
---
Allow the kernel builder to choose a crc32* algorithm for the kernel.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---

 lib/Kconfig |   43 +++
 lib/crc32defs.h |   18 ++
 2 files changed, 61 insertions(+), 0 deletions(-)

diff --git a/lib/Kconfig b/lib/Kconfig
index cfddafc..029c0e3 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -70,6 +70,49 @@ config CRC32_SELFTEST
  and crc32_be over byte strings with random alignment and length
  and computes the total elapsed time and number of bytes processed.
 
+choice
+   prompt CRC32 implementation
+   depends on CRC32
+   default CRC32_SLICEBY8
+
+config CRC32_SLICEBY8
+   bool Slice by 8 bytes
+   help
+ Calculate checksum 8 bytes at a time with a clever slicing algorithm.
+ This is the fastest algorithm, but comes with a 8KiB lookup table.
+ Most modern processors have enough cache to hold this table without
+ thrashing the cache.
+
+ This is the default implementation choice.  Choose this one unless
+ you have a good reason not to.
+
+config CRC32_SLICEBY4
+   bool Slice by 4 bytes
+   help
+ Calculate checksum 4 bytes at a time with a clever slicing algorithm.
+ This is a bit slower than slice by 8, but has a smaller 4KiB lookup
+ table.
+
+ Only choose this option if you know what you are doing.
+
+config CRC32_SARWATE
+   bool Sarwate's Algorithm (one byte at a time)
+   help
+ Calculate checksum a byte at a time using Sarwate's algorithm.  This
+ is not particularly fast, but has a small 256 byte lookup table.
+
+ Only choose this option if you know what you are doing.
+
+config CRC32_BIT
+   bool Classic Algorithm (one bit at a time)
+   help
+ Calculate checksum one bit at a time.  This is VERY slow, but has
+ no lookup table.  This is provided as a debugging option.
+
+ Only choose this option if you are debugging crc32.
+
+endchoice
+
 config CRC7
tristate CRC7 functions
help
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index 6fd1917..64cba2c 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -13,6 +13,24 @@
  */
 #define CRC32C_POLY_LE 0x82F63B78
 
+/* Try to choose an implementation variant via Kconfig */
+#ifdef CONFIG_CRC32_SLICEBY8
+# define CRC_LE_BITS 64
+# define CRC_BE_BITS 64
+#endif
+#ifdef CONFIG_CRC32_SLICEBY4
+# define CRC_LE_BITS 32
+# define CRC_BE_BITS 32
+#endif
+#ifdef CONFIG_CRC32_SARWATE
+# define CRC_LE_BITS 8

Re: [PATCH v5.2 00/14] crc32c: Add faster algorithm and self-test code

2011-12-02 Thread Darrick J. Wong
On Fri, Dec 02, 2011 at 08:23:58AM +0800, Herbert Xu wrote:
 On Thu, Dec 01, 2011 at 12:31:22PM -0800, Darrick J. Wong wrote:
 .
  They seem to call crc32c(), which is in crypto/crc32c.  If you're 
  interested in
 
 Nope, the crypto API layer will use the SSE implementation
 where available.  Only when it isn't available will the C version
 in crypto/ be used.

There's an SSE version other than what's in crc32c-intel?

(I suspect we're talking about the same thing?)

--D
 
 Cheers,
 -- 
 Email: Herbert Xu herb...@gondor.apana.org.au
 Home Page: http://gondor.apana.org.au/~herbert/
 PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
 --
 To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 14/14] crc32: Select an algorithm via kconfig

2011-12-02 Thread Darrick J. Wong
On Fri, Dec 02, 2011 at 08:25:05AM +0800, Herbert Xu wrote:
 On Thu, Dec 01, 2011 at 12:15:17PM -0800, Darrick J. Wong wrote:
  Allow the kernel builder to choose a crc32* algorithm for the kernel.
  
  Signed-off-by: Darrick J. Wong djw...@us.ibm.com
 
 I don't like this at all.  How do you expect distros or indeed
 anyone to make this choice? For generic C implementations like
 this we should only have one, and not many.

Slice-by-8 should be picked automatically if the builder doesn't explicitly
pick another one.  The other choices are provided for people who want a slimmer
cache footprint.  I guess I could make the Kconfig file a bit more explicit
about slice-by-8 being default, or I guess we could just ignore this one patch
and thereby keep the old method, where anyone who wants the slimmer
implementations patches the #defines.

--D
 
 Cheers,
 -- 
 Email: Herbert Xu herb...@gondor.apana.org.au
 Home Page: http://gondor.apana.org.au/~herbert/
 PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
 --
 To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5.1 00/14] crc32c: Add faster algorithm and self-test code

2011-12-01 Thread Darrick J. Wong
On Wed, Nov 30, 2011 at 02:29:11PM -0800, Andrew Morton wrote:
 On Mon, 28 Nov 2011 14:36:59 -0800
 Darrick J. Wong djw...@us.ibm.com wrote:
 
  This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a
  software crc32c implementation.
 
 I think the attributions here are all messed up.  As the patches stand,
 it appears that you wrote all of them.  But I don't think that is the
 case.
 
 If Bob wrote a particular patch then that patch should be sent with a
 From: Bob Pearson rpear...@systemfabricworks.com right at the start
 of the changelog so that he is recorded as the primary author.  If the
 email sender (ie: you) was the primary author then this attribution can
 be omitted and we fall back to using the From: from the email headers.
 
 Also, every one of these patches should have you own signed-off-by,
 regardless of its authorship.  For reasons explained in
 Documentation/SubmittingPatches, section 12.
 
 Please fix these things up and resend.
 
 Also, it would be conventional and useful if each patch title was
 prefixed by its subsystem identifier.  ie, "removed two instances of
 trailing whitespaces" should be titled "crc32: remove two instances of
 trailing whitespace" or "lib/crc32.c: remove two instances of trailing
 whitespaces".

Okay, I'll massage the changelogs to give them more descriptive subjects, and
fix the attribution chain.  Thank you for the feedback.

--D
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-ext4 in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 07/14] crc32: Make CRC_*_BITS definition correspond to actual bit counts

2011-12-01 Thread Darrick J. Wong
crc32.c provides a choice of one of several algorithms for
computing the LSB and MSB versions of the CRC32 checksum
based on the parameters CRC_LE_BITS and CRC_BE_BITS. In the
original version the values 1, 2, 4 and 8 respectively selected
versions of the algorithm that computed the crc 1, 2, 4 and 32
bits at a time. This patch series adds a new version that computes
the CRC 64 bits at a time. To make things easier to understand
the parameter has been reinterpreted to actually stand for the
number of bits processed in each step of the algorithm so that
the old value 8 has been replaced with the value 32. This also
allows us to add in a widely used crc algorithm that
computes the crc 8 bits at a time called the Sarwate algorithm.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c  |   17 ++---
 lib/crc32defs.h  |   18 ++
 lib/gen_crc32table.c |   11 ++-
 3 files changed, 34 insertions(+), 12 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index ff6bb9a..157b35f 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -27,13 +27,13 @@
 #include linux/types.h
 #include crc32defs.h
 
-#if CRC_LE_BITS == 8
+#if CRC_LE_BITS  8
 # define tole(x) (__force u32) __constant_cpu_to_le32(x)
 #else
 # define tole(x) (x)
 #endif
 
-#if CRC_BE_BITS == 8
+#if CRC_BE_BITS  8
 # define tobe(x) (__force u32) __constant_cpu_to_be32(x)
 #else
 # define tobe(x) (x)
@@ -45,7 +45,7 @@ MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com);
 MODULE_DESCRIPTION(Ethernet CRC32 calculations);
 MODULE_LICENSE(GPL);
 
-#if CRC_LE_BITS == 8 || CRC_BE_BITS == 8
+#if CRC_LE_BITS  8 || CRC_BE_BITS  8
 
 static inline u32
 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 
(*tab)[256])
@@ -126,6 +126,12 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, 
size_t len)
crc = (crc  4) ^ crc32table_le[0][crc  15];
}
 # elif CRC_LE_BITS == 8
+   /* aka Sarwate algorithm */
+   while (len--) {
+   crc ^= *p++;
+   crc = (crc  8) ^ crc32table_le[0][crc  255];
+   }
+# else
const u32  (*tab)[] = crc32table_le;
 
crc = (__force u32) __cpu_to_le32(crc);
@@ -169,6 +175,11 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, 
size_t len)
crc = (crc  4) ^ crc32table_be[0][crc  28];
}
 # elif CRC_BE_BITS == 8
+   while (len--) {
+   crc ^= *p++  24;
+   crc = (crc  8) ^ crc32table_be[0][crc  24];
+   }
+# else
const u32  (*tab)[] = crc32table_be;
 
crc = (__force u32) __cpu_to_be32(crc);
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index f5a5401..daa3a5e 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -6,27 +6,29 @@
 #define CRCPOLY_LE 0xedb88320
 #define CRCPOLY_BE 0x04c11db7
 
-/* How many bits at a time to use.  Requires a table of 4CRC_xx_BITS bytes. 
*/
-/* For less performance-sensitive, use 4 */
+/* How many bits at a time to use.  Valid values are 1, 2, 4, 8, and 32. */
+/* For less performance-sensitive, use 4 or 8 */
 #ifndef CRC_LE_BITS
-# define CRC_LE_BITS 8
+# define CRC_LE_BITS 32
 #endif
 #ifndef CRC_BE_BITS
-# define CRC_BE_BITS 8
+# define CRC_BE_BITS 32
 #endif
 
 /*
  * Little-endian CRC computation.  Used with serial bit streams sent
  * lsbit-first.  Be sure to use cpu_to_le32() to append the computed CRC.
  */
-#if CRC_LE_BITS  8 || CRC_LE_BITS  1 || CRC_LE_BITS  CRC_LE_BITS-1
-# error CRC_LE_BITS must be a power of 2 between 1 and 8
+#if CRC_LE_BITS  32 || CRC_LE_BITS  1 || CRC_LE_BITS == 16 || \
+   CRC_LE_BITS  CRC_LE_BITS-1
+# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32}
 #endif
 
 /*
  * Big-endian CRC computation.  Used with serial bit streams sent
  * msbit-first.  Be sure to use cpu_to_be32() to append the computed CRC.
  */
-#if CRC_BE_BITS  8 || CRC_BE_BITS  1 || CRC_BE_BITS  CRC_BE_BITS-1
-# error CRC_BE_BITS must be a power of 2 between 1 and 8
+#if CRC_BE_BITS  32 || CRC_BE_BITS  1 || CRC_BE_BITS == 16 || \
+   CRC_BE_BITS  CRC_BE_BITS-1
+# error CRC_BE_BITS must be one of {1, 2, 4, 8, 32}
 #endif
diff --git a/lib/gen_crc32table.c b/lib/gen_crc32table.c
index eced769..99ac744 100644
--- a/lib/gen_crc32table.c
+++ b/lib/gen_crc32table.c
@@ -4,8 +4,17 @@
 
 #define ENTRIES_PER_LINE 4
 
+#if CRC_LE_BITS = 8
 #define LE_TABLE_SIZE (1  CRC_LE_BITS)
+#else
+#define LE_TABLE_SIZE 256
+#endif
+
+#if CRC_BE_BITS = 8
 #define BE_TABLE_SIZE (1  CRC_BE_BITS)
+#else
+#define BE_TABLE_SIZE 256
+#endif
 
 static uint32_t crc32table_le[4][256];
 static uint32_t crc32table_be[4][256];
@@ -24,7 +33,7 @@ static void crc32init_le(void)
 
crc32table_le[0][0] = 0;
 
-   for (i = 1  (CRC_LE_BITS - 1); i; i = 1) {
+   for (i = LE_TABLE_SIZE  1; i; i = 1) {
crc = (crc  1) ^ ((crc  1) ? CRCPOLY_LE : 0

[PATCH 10/14] crc32: Add note about this patchset to crc32.c

2011-12-01 Thread Darrick J. Wong
Some final changes
- added a comment at the top of crc32.c

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 2c8e8c0..d56516d 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -1,4 +1,8 @@
 /*
+ * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
+ * cleaned up code to current version of sparse and added the slicing-by-8
+ * algorithm to the closely similar existing slicing-by-4 algorithm.
+ *
  * Oct 15, 2000 Matt Domsch matt_dom...@dell.com
  * Nicer crc32 functions/docs submitted by li...@horizon.com.  Thanks!
  * Code was from the public domain, copyright abandoned.  Code was

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 13/14] crc32: Add self-test code for crc32c

2011-12-01 Thread Darrick J. Wong
Add self-test code for crc32c.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |  363 ++-
 1 files changed, 261 insertions(+), 102 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 8df9561..382fa76 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -765,113 +765,265 @@ static struct crc_test {
u32 length; /* random 11 bit length of test */
u32 crc_le; /* expected crc32_le result */
u32 crc_be; /* expected crc32_be result */
+   u32 crc32c_le;  /* expected crc32c_le result */
 } test[] =
 {
-   {0x674bf11d, 0x0038, 0x0542, 0x0af6d466, 0xd8b6e4c1},
-   {0x35c672c6, 0x003a, 0x01aa, 0xc6d3dfba, 0x28aaf3ad},
-   {0x496da28e, 0x0039, 0x05af, 0xd933660f, 0x5d57e81f},
-   {0x09a9b90e, 0x0027, 0x01f8, 0xb45fe007, 0xf45fca9a},
-   {0xdc97e5a9, 0x0025, 0x03b6, 0xf81a3562, 0xe0126ba2},
-   {0x47c58900, 0x000a, 0x00b9, 0x8e58eccf, 0xf3afc793},
-   {0x292561e8, 0x000c, 0x0403, 0xa2ba8aaf, 0x0b797aed},
-   {0x415037f6, 0x0003, 0x0676, 0xa17d52e8, 0x7f0fdf35},
-   {0x3466e707, 0x0026, 0x0042, 0x258319be, 0x75c484a2},
-   {0xafd1281b, 0x0023, 0x02ee, 0x4428eaf8, 0x06c7ad10},
-   {0xd3857b18, 0x0028, 0x04a2, 0x5c430821, 0xb062b7cb},
-   {0x1d825a8f, 0x002b, 0x050b, 0xd2c45f0c, 0xd68634e0},
-   {0x5033e3bc, 0x000b, 0x0078, 0xa3ea4113, 0xac6d31fb},
-   {0x94f1fb5e, 0x000f, 0x03a2, 0xfbfc50b1, 0x3cfe50ed},
-   {0xc9a0fe14, 0x0009, 0x0473, 0x5fb61894, 0x87070591},
-   {0x88a034b1, 0x001c, 0x05ad, 0xc1b16053, 0x46f95c67},
-   {0xf0f72239, 0x0020, 0x026d, 0xa6fa58f3, 0xf8c2c1dd},
-   {0xcc20a5e3, 0x003b, 0x067a, 0x7740185a, 0x308b979a},
-   {0xce589c95, 0x002b, 0x0641, 0xd055e987, 0x40aae25b},
-   {0x78edc885, 0x0035, 0x05be, 0xa39cb14b, 0x035b0d1f},
-   {0x9d40a377, 0x003b, 0x0038, 0x1f47ccd2, 0x197fbc9d},
-   {0x703d0e01, 0x003c, 0x06f1, 0x88735e7c, 0xfed57c5a},
-   {0x776bf505, 0x000f, 0x05b2, 0x5cc4fc01, 0xf32efb97},
-   {0x4a3e7854, 0x0027, 0x04b8, 0x8d923c82, 0x0cbfb4a2},
-   {0x209172dd, 0x003b, 0x0356, 0xb89e9c2b, 0xd7868138},
-   {0x3ba4cc5b, 0x002f, 0x0203, 0xe51601a9, 0x5b2a1032},
-   {0xfc62f297, 0x, 0x0079, 0x71a8e1a2, 0x5d88685f},
-   {0x64280b8b, 0x0016, 0x07ab, 0x0fa7a30c, 0xda3a455f},
-   {0x97dd724b, 0x0033, 0x07ad, 0x5788b2f4, 0xd7326d32},
-   {0x61394b52, 0x0035, 0x0571, 0xc66525f1, 0xcabe7fef},
-   {0x29b4faff, 0x0024, 0x006e, 0xca13751e, 0x993648e0},
-   {0x29bfb1dc, 0x000b, 0x0244, 0x436c43f7, 0x429f7a59},
-   {0x86ae934b, 0x0035, 0x0104, 0x0760ec93, 0x9cf7d0f4},
-   {0xc4c1024e, 0x002e, 0x06b1, 0x6516a3ec, 0x19321f9c},
-   {0x3287a80a, 0x0026, 0x0496, 0x0b257eb1, 0x754ebd51},
-   {0xa4db423e, 0x0023, 0x045d, 0x9b3a66dc, 0x873e9f11},
-   {0x7a1078df, 0x0015, 0x014a, 0x8c2484c5, 0x6a628659},
-   {0x6048bd5b, 0x0006, 0x006a, 0x897e3559, 0xac9961af},
-   {0xd8f9ea20, 0x003d, 0x0277, 0x60eb905b, 0xed2aaf99},
-   {0xea5ec3b4, 0x002a, 0x04fe, 0x869965dc, 0x6c1f833b},
-   {0x2dfb005d, 0x0016, 0x0345, 0x6a3b117e, 0xf05e8521},
-   {0x5a214ade, 0x0020, 0x05b6, 0x467f70be, 0xcb22ccd3},
-   {0xf0ab9cca, 0x0032, 0x0515, 0xed223df3, 0x7f3ef01d},
-   {0x91b444f9, 0x002e, 0x07f8, 0x84e9a983, 0x5676756f},
-   {0x1b5d2ddb, 0x002e, 0x012c, 0xba638c4c, 0x3f42047b},
-   {0xd824d1bb, 0x003a, 0x07b5, 0x6288653b, 0x3a3ebea0},
-   {0x0470180c, 0x0034, 0x01f0, 0x9d5b80d6, 0x3de08195},
-   {0xffaa3a3f, 0x0036, 0x0299, 0xf3a82ab8, 0x53e0c13d},
-   {0x6406cfeb, 0x0023, 0x0600, 0xa920b8e8, 0xe4e2acf4},
-   {0xb24aaa38, 0x003e, 0x04a1, 0x657cc328, 0x5077b2c3},
-   {0x58b2ab7c, 0x0039, 0x02b4, 0x3a17ee7e, 0x9dcb3643},
-   {0x3db85970, 0x0006, 0x02b6, 0x95268b59, 0xb9812c10},
-   {0x857830c5, 0x0003, 0x0590, 0x4ef439d5, 0xf042161d},
-   {0xe1fcd978, 0x003e, 0x07d8, 0xae8d8699, 0xce0a1ef5},
-   {0xb982a768, 0x0016, 0x06e0, 0x62fad3df, 0x5f8a067b},
-   {0x1d581ce8, 0x001e, 0x058b, 0xf0f5da53, 0x26e39eee},
-   {0x2456719b, 0x0025, 0x0503, 0x4296ac64, 0xd50e4c14},
-   {0xfae6d8f2, 0x, 0x055d, 0x057fdf2e, 0x2a31391a},
-   {0xcba828e3, 0x0039, 0x02ce, 0xe3f22351, 0x8f00877b},
-   {0x13d25952, 0x000a, 0x072d, 0x76d4b4cc, 0x5eb67ec3},
-   {0x0342be3f, 0x0015, 0x0599, 0xec75d9f1, 0x9d4d2826},
-   {0xeaa344e0, 0x0014, 0x04d8, 0x72a4c981, 0x2064ea06},
-   {0xbbb52021, 0x003b, 0x0272

[PATCH 14/14] crc32: Select an algorithm via kconfig

2011-12-01 Thread Darrick J. Wong
Allow the kernel builder to choose a crc32* algorithm for the kernel.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/Kconfig |   36 
 lib/crc32defs.h |   18 ++
 2 files changed, 54 insertions(+), 0 deletions(-)


diff --git a/lib/Kconfig b/lib/Kconfig
index cfddafc..e9b9134 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -70,6 +70,42 @@ config CRC32_SELFTEST
  and crc32_be over byte strings with random alignment and length
  and computes the total elapsed time and number of bytes processed.
 
+choice
+   prompt CRC32 implementation
+   depends on CRC32
+   default CRC32_SLICEBY8
+
+config CRC32_SLICEBY8
+   bool Slice by 8 bytes
+   help
+ Calculate checksum 8 bytes at a time with a clever slicing algorithm.
+ This is the fastest algorithm, but comes with an 8KiB lookup table.
+ Most modern processors have enough cache that this shouldn't be
+ a problem.
+
+ If you don't know which to choose, choose this one.
+
+config CRC32_SLICEBY4
+   bool Slice by 4 bytes
+   help
+ Calculate checksum 4 bytes at a time with a clever slicing algorithm.
+ This is a bit slower than slice by 8, but has a smaller 4KiB lookup
+ table.
+
+config CRC32_SARWATE
+   bool Sarwate's Algorithm (one byte at a time)
+   help
+ Calculate checksum a byte at a time using Sarwate's algorithm.  This
+ is not particularly fast, but has a small 256 byte lookup table.
+
+config CRC32_BIT
+   bool Classic Algorithm (one bit at a time)
+   help
+ Calculate checksum one bit at a time.  This is VERY slow, but has
+ no lookup table.  This is provided as a debugging option.
+
+endchoice
+
 config CRC7
tristate CRC7 functions
help
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index 6fd1917..64cba2c 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -13,6 +13,24 @@
  */
 #define CRC32C_POLY_LE 0x82F63B78
 
+/* Try to choose an implementation variant via Kconfig */
+#ifdef CONFIG_CRC32_SLICEBY8
+# define CRC_LE_BITS 64
+# define CRC_BE_BITS 64
+#endif
+#ifdef CONFIG_CRC32_SLICEBY4
+# define CRC_LE_BITS 32
+# define CRC_BE_BITS 32
+#endif
+#ifdef CONFIG_CRC32_SARWATE
+# define CRC_LE_BITS 8
+# define CRC_BE_BITS 8
+#endif
+#ifdef CONFIG_CRC32_BIT
+# define CRC_LE_BITS 1
+# define CRC_BE_BITS 1
+#endif
+
 /*
  * How many bits at a time to use.  Valid values are 1, 2, 4, 8, 32 and 64.
  * For less performance-sensitive, use 4 or 8 to save table size.

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 11/14] crc32: Bolt on crc32c

2011-12-01 Thread Darrick J. Wong
Reuse the existing crc32 code to stamp out a crc32c implementation.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 include/linux/crc32.h |2 ++
 lib/Kconfig   |8 +++---
 lib/crc32.c   |   62 +++--
 lib/crc32defs.h   |7 ++
 lib/gen_crc32table.c  |   35 ++--
 5 files changed, 80 insertions(+), 34 deletions(-)


diff --git a/include/linux/crc32.h b/include/linux/crc32.h
index 391a259..68267b6 100644
--- a/include/linux/crc32.h
+++ b/include/linux/crc32.h
@@ -11,6 +11,8 @@
 extern u32  crc32_le(u32 crc, unsigned char const *p, size_t len);
 extern u32  crc32_be(u32 crc, unsigned char const *p, size_t len);
 
+extern u32  __crc32c_le(u32 crc, unsigned char const *p, size_t len);
+
 #define crc32(seed, data, length)  crc32_le(seed, (unsigned char const 
*)(data), length)
 
 /*
diff --git a/lib/Kconfig b/lib/Kconfig
index 2bc5834..cfddafc 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -51,14 +51,14 @@ config CRC_ITU_T
  functions require M here.
 
 config CRC32
-   tristate CRC32 functions
+   tristate CRC32/CRC32c functions
default y
select BITREVERSE
help
  This option is provided for the case where no in-kernel-tree
- modules require CRC32 functions, but a module built outside the
- kernel tree does. Such modules that use library CRC32 functions
- require M here.
+ modules require CRC32/CRC32c functions, but a module built outside
+ the kernel tree does. Such modules that use library CRC32/CRC32c
+ functions require M here.
 
 config CRC32_SELFTEST
bool CRC32 perform self test on init
diff --git a/lib/crc32.c b/lib/crc32.c
index d56516d..8df9561 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -46,7 +46,7 @@
 #include crc32table.h
 
 MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com);
-MODULE_DESCRIPTION(Ethernet CRC32 calculations);
+MODULE_DESCRIPTION(Various CRC32 calculations);
 MODULE_LICENSE(GPL);
 
 #if CRC_LE_BITS  8 || CRC_BE_BITS  8
@@ -135,46 +135,57 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
+static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
+ size_t len, const u32 (*tab)[256],
+ u32 polynomial)
 {
 #if CRC_LE_BITS == 1
int i;
while (len--) {
crc ^= *p++;
for (i = 0; i  8; i++)
-   crc = (crc  1) ^ ((crc  1) ? CRCPOLY_LE : 0);
+   crc = (crc  1) ^ ((crc  1) ? polynomial : 0);
}
 # elif CRC_LE_BITS == 2
while (len--) {
crc ^= *p++;
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
}
 # elif CRC_LE_BITS == 4
while (len--) {
crc ^= *p++;
-   crc = (crc  4) ^ crc32table_le[0][crc  15];
-   crc = (crc  4) ^ crc32table_le[0][crc  15];
+   crc = (crc  4) ^ tab[0][crc  15];
+   crc = (crc  4) ^ tab[0][crc  15];
}
 # elif CRC_LE_BITS == 8
/* aka Sarwate algorithm */
while (len--) {
crc ^= *p++;
-   crc = (crc  8) ^ crc32table_le[0][crc  255];
+   crc = (crc  8) ^ tab[0][crc  255];
}
 # else
-   const u32  (*tab)[] = crc32table_le;
-
crc = (__force u32) __cpu_to_le32(crc);
crc = crc32_body(crc, p, len, tab);
crc = __le32_to_cpu((__force __le32)crc);
 #endif
return crc;
 }
+
+u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
+{
+   return crc32_le_generic(crc, p, len, crc32table_le, CRCPOLY_LE);
+}
 EXPORT_SYMBOL(crc32_le);
 
+u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
+{
+   return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE);
+}
+EXPORT_SYMBOL(__crc32c_le);
+
 /**
  * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
  * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for
@@ -182,7 +193,9 @@ EXPORT_SYMBOL(crc32_le);
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
+static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p,
+ size_t len, const u32 (*tab)[256

[PATCH 02/14] crc32: Move long comment about crc32 fundamentals to Documentation/

2011-12-01 Thread Darrick J. Wong
Moved a long comment from lib/crc32.c to Documentation/crc32.txt
where it will more likely get read.
- Edited the resulting document to add an explanation of the slicing-by-n
  algorithm.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: George Spelvin li...@horizon.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 Documentation/00-INDEX  |2 +
 Documentation/crc32.txt |  183 +++
 lib/crc32.c |  129 +
 3 files changed, 187 insertions(+), 127 deletions(-)
 create mode 100644 Documentation/crc32.txt


diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 65bbd26..e7b38a0 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -104,6 +104,8 @@ cpuidle/
- info on CPU_IDLE, CPU idle state management subsystem.
 cputopology.txt
- documentation on how CPU topology info is exported via sysfs.
+crc32.txt
+   - brief tutorial on CRC computation
 cris/
- directory with info about Linux on CRIS architecture.
 crypto/
diff --git a/Documentation/crc32.txt b/Documentation/crc32.txt
new file mode 100644
index 0000000..3d74ba4
--- /dev/null
+++ b/Documentation/crc32.txt
@@ -0,0 +1,183 @@
+A brief CRC tutorial.
+
+A CRC is a long-division remainder.  You add the CRC to the message,
+and the whole thing (message+CRC) is a multiple of the given
+CRC polynomial.  To check the CRC, you can either check that the
+CRC matches the recomputed value, *or* you can check that the
+remainder computed on the message+CRC is 0.  This latter approach
+is used by a lot of hardware implementations, and is why so many
+protocols put the end-of-frame flag after the CRC.
+
+It's actually the same long division you learned in school, except that
+- We're working in binary, so the digits are only 0 and 1, and
+- When dividing polynomials, there are no carries.  Rather than add and
+  subtract, we just xor.  Thus, we tend to get a bit sloppy about
+  the difference between adding and subtracting.
+
+Like all division, the remainder is always smaller than the divisor.
+To produce a 32-bit CRC, the divisor is actually a 33-bit CRC polynomial.
+Since it's 33 bits long, bit 32 is always going to be set, so usually the
+CRC is written in hex with the most significant bit omitted.  (If you're
+familiar with the IEEE 754 floating-point format, it's the same idea.)
+
+Note that a CRC is computed over a string of *bits*, so you have
+to decide on the endianness of the bits within each byte.  To get
+the best error-detecting properties, this should correspond to the
+order they're actually sent.  For example, standard RS-232 serial is
+little-endian; the most significant bit (sometimes used for parity)
+is sent last.  And when appending a CRC word to a message, you should
+do it in the right order, matching the endianness.
+
+Just like with ordinary division, you proceed one digit (bit) at a time.
+Each step of the division, you take one more digit (bit) of the
+dividend and append it to the current remainder.  Then you figure out the
+appropriate multiple of the divisor to subtract to bring the remainder
+back into range.  In binary, this is easy - it has to be either 0 or 1,
+and to make the XOR cancel, it's just a copy of bit 32 of the remainder.
+
+When computing a CRC, we don't care about the quotient, so we can
+throw the quotient bit away, but subtract the appropriate multiple of
+the polynomial from the remainder and we're back to where we started,
+ready to process the next bit.
+
+A big-endian CRC written this way would be coded like:
+for (i = 0; i < input_bits; i++) {
+   multiple = remainder & 0x80000000 ? CRCPOLY : 0;
+   remainder = (remainder << 1 | next_input_bit()) ^ multiple;
+}
+
+Notice how, to get at bit 32 of the shifted remainder, we look
+at bit 31 of the remainder *before* shifting it.
+
+But also notice how the next_input_bit() bits we're shifting into
+the remainder don't actually affect any decision-making until
+32 bits later.  Thus, the first 32 cycles of this are pretty boring.
+Also, to add the CRC to a message, we need a 32-bit-long hole for it at
+the end, so we have to add 32 extra cycles shifting in zeros at the
+end of every message.
+
+These details lead to a standard trick: rearrange merging in the
+next_input_bit() until the moment it's needed.  Then the first 32 cycles
+can be precomputed, and merging in the final 32 zero bits to make room
+for the CRC can be skipped entirely.  This changes the code to:
+
+for (i = 0; i < input_bits; i++) {
+   remainder ^= next_input_bit() << 31;
+   multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
+   remainder = (remainder << 1) ^ multiple;
+}
+
+With this optimization, the little-endian code is particularly simple:
+for (i = 0; i < input_bits; i++) {
+   remainder

[PATCH 04/14] crc32: Speed up memory table access on powerpc

2011-12-01 Thread Darrick J. Wong
Replace 2D array references by pointer references in loops.
This change has no effect on X86 code but improves PPC
performance.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |   21 +++--
 1 files changed, 11 insertions(+), 10 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 7a0e5a9..c93c9ae 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -53,20 +53,21 @@ static inline u32
 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 
(*tab)[256])
 {
 # ifdef __LITTLE_ENDIAN
-#  define DO_CRC(x) crc = tab[0][(crc ^ (x))  255] ^ (crc  8)
-#  define DO_CRC4 crc = tab[3][(crc)  255] ^ \
-   tab[2][(crc  8)  255] ^ \
-   tab[1][(crc  16)  255] ^ \
-   tab[0][(crc  24)  255]
+#  define DO_CRC(x) (crc = t0[(crc ^ (x))  255] ^ (crc  8))
+#  define DO_CRC4 crc = t3[(crc)  255] ^ \
+   t2[(crc  8)  255] ^ \
+   t1[(crc  16)  255] ^ \
+   t0[(crc  24)  255]
 # else
-#  define DO_CRC(x) crc = tab[0][((crc  24) ^ (x))  255] ^ (crc  8)
-#  define DO_CRC4 crc = tab[0][(crc)  255] ^ \
-   tab[1][(crc  8)  255] ^ \
-   tab[2][(crc  16)  255] ^ \
-   tab[3][(crc  24)  255]
+#  define DO_CRC(x) (crc = t0[((crc  24) ^ (x))  255] ^ (crc  8))
+#  define DO_CRC4 crc = t0[(crc)  255] ^ \
+   t1[(crc  8)  255] ^ \
+   t2[(crc  16)  255] ^ \
+   t3[(crc  24)  255]
 # endif
const u32 *b;
size_trem_len;
+   const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3];
 
/* Align it */
if (unlikely((long)buf  3  len)) {

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5.2 00/14] crc32c: Add faster algorithm and self-test code

2011-12-01 Thread Darrick J. Wong
Hi all,

This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a
software crc32c implementation.  It removes the crc32c implementation in
crypto/ in favor of using the stamped-out one in lib/.  There is also a change
to Kconfig so that the kernel builder can pick an implementation best suited
for the hardware.

The motivation for this patchset is that I am working on adding full metadata
checksumming to ext4.  As far as performance impact of adding checksumming
goes, I see nearly no change with a standard mail server ffsb simulation.  On a
test that involves only file creation and deletion and extent tree writes, I
see a drop of about 50 percent with the current kernel crc32c implementation;
this improves to a drop of about 20 percent with the enclosed crc32c code.

In typical workloads, this new implementation doesn't help much, because
metadata is usually a small fraction of total IO.
However, when we are doing IO that is almost all metadata (such as rm -rf'ing a
tree), then this patch speeds up the operation substantially.

Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this patchset
should improve their speed as well.  I have not yet quantified that, however.
This latest submission combines Bob's patches from late August 2011 with mine
so that they can be one coherent patch set.  Please excuse my inability to
combine some of the patches; I've been advised to leave Bob's patches alone and
build atop them instead. :/

Since the last posting, I've also collected some crc32c test results on a bunch
of different x86/powerpc/sparc platforms.  The results can be viewed here:
http://goo.gl/sgt3i ; the crc32-kern-le and crc32c columns describe the
performance of the kernel's current crc32 and crc32c software implementations.
The crc32c-by8-le column shows crc32c performance with this patchset applied.
I expect crc32 performance to be roughly the same.

The two _boost columns at the right side of the spreadsheet show how much
faster the new implementation is over the old one.  As you can see, crc32 rises
substantially, and crc32c experiences a huge increase.  I'm hoping this patch
set meets with everyone's approval and can go in soon.  Herbert Xu didn't
appear to have any strong objections to last month's posting, so I'm wondering
if Andrew has an opinion?

v2: Use the crypto testmgr api for self-test.
v3: Get rid of the -be version, which had no users.
v4: Allow kernel builder a choice of speed vs. space optimization.
v5: Reuse lib/crc32 for crc32c as well, and make crypto/crc32c use lib/crc32.c.
v5.1: Include Bob Pearson's patches in submission request.
v5.2: Fix changelogs for Bob's patches per akpm request.

--D

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 08/14] crc32: Add slice-by-8 algorithm to existing code

2011-12-01 Thread Darrick J. Wong
Add slicing-by-8 algorithm to the existing
slicing-by-4 algorithm. This consists of:
- extend largest BITS size from 32 to 64
- extend tables from tab[4][256] to up to tab[8][256]
- Add code for inner loop.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c  |   40 
 lib/crc32defs.h  |   29 +
 lib/gen_crc32table.c |   43 +++
 3 files changed, 76 insertions(+), 36 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 157b35f..6311712 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -47,25 +47,28 @@ MODULE_LICENSE(GPL);
 
 #if CRC_LE_BITS  8 || CRC_BE_BITS  8
 
+/* implements slicing-by-4 or slicing-by-8 algorithm */
 static inline u32
 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 
(*tab)[256])
 {
 # ifdef __LITTLE_ENDIAN
 #  define DO_CRC(x) (crc = t0[(crc ^ (x))  255] ^ (crc  8))
-#  define DO_CRC4 crc = t3[(crc)  255] ^ \
-   t2[(crc  8)  255] ^ \
-   t1[(crc  16)  255] ^ \
-   t0[(crc  24)  255]
+#  define DO_CRC4 (t3[(q)  255] ^ t2[(q  8)  255] ^ \
+  t1[(q  16)  255] ^ t0[(q  24)  255])
+#  define DO_CRC8 (t7[(q)  255] ^ t6[(q  8)  255] ^ \
+  t5[(q  16)  255] ^ t4[(q  24)  255])
 # else
 #  define DO_CRC(x) (crc = t0[((crc  24) ^ (x))  255] ^ (crc  8))
-#  define DO_CRC4 crc = t0[(crc)  255] ^ \
-   t1[(crc  8)  255] ^ \
-   t2[(crc  16)  255] ^ \
-   t3[(crc  24)  255]
+#  define DO_CRC4 (t0[(q)  255] ^ t1[(q  8)  255] ^ \
+  t2[(q  16)  255] ^ t3[(q  24)  255])
+#  define DO_CRC8 (t4[(q)  255] ^ t5[(q  8)  255] ^ \
+  t6[(q  16)  255] ^ t7[(q  24)  255])
 # endif
const u32 *b;
-   size_trem_len;
+   size_t rem_len;
const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3];
+   const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7];
+   u32 q;
 
/* Align it */
if (unlikely((long)buf  3  len)) {
@@ -73,13 +76,25 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
DO_CRC(*buf++);
} while ((--len)  ((long)buf)3);
}
+
+# if CRC_LE_BITS == 32
rem_len = len  3;
-   /* load data 32 bits wide, xor data 32 bits wide. */
len = len  2;
+# else
+   rem_len = len  7;
+   len = len  3;
+# endif
+
b = (const u32 *)buf;
for (--b; len; --len) {
-   crc ^= *++b; /* use pre increment for speed */
-   DO_CRC4;
+   q = crc ^ *++b; /* use pre increment for speed */
+# if CRC_LE_BITS == 32
+   crc = DO_CRC4;
+# else
+   crc = DO_CRC8;
+   q = *++b;
+   crc ^= DO_CRC4;
+# endif
}
len = rem_len;
/* And the last few bytes */
@@ -92,6 +107,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
return crc;
 #undef DO_CRC
 #undef DO_CRC4
+#undef DO_CRC8
 }
 #endif
 
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index daa3a5e..8181592 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -6,29 +6,42 @@
 #define CRCPOLY_LE 0xedb88320
 #define CRCPOLY_BE 0x04c11db7
 
-/* How many bits at a time to use.  Valid values are 1, 2, 4, 8, and 32. */
-/* For less performance-sensitive, use 4 or 8 */
+/*
+ * How many bits at a time to use.  Valid values are 1, 2, 4, 8, 32 and 64.
+ * For less performance-sensitive, use 4 or 8 to save table size.
+ * For larger systems choose same as CPU architecture as default.
+ * This works well on X86_64, SPARC64 systems. This may require some
+ * elaboration after experiments with other architectures.
+ */
 #ifndef CRC_LE_BITS
-# define CRC_LE_BITS 32
+#  ifdef CONFIG_64BIT
+#  define CRC_LE_BITS 64
+#  else
+#  define CRC_LE_BITS 32
+#  endif
 #endif
 #ifndef CRC_BE_BITS
-# define CRC_BE_BITS 32
+#  ifdef CONFIG_64BIT
+#  define CRC_BE_BITS 64
+#  else
+#  define CRC_BE_BITS 32
+#  endif
 #endif
 
 /*
  * Little-endian CRC computation.  Used with serial bit streams sent
  * lsbit-first.  Be sure to use cpu_to_le32() to append the computed CRC.
  */
-#if CRC_LE_BITS  32 || CRC_LE_BITS  1 || CRC_LE_BITS == 16 || \
+#if CRC_LE_BITS  64 || CRC_LE_BITS  1 || CRC_LE_BITS == 16 || \
CRC_LE_BITS  CRC_LE_BITS-1
-# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32}
+# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32, 64}
 #endif
 
 /*
  * Big-endian CRC computation.  Used with serial bit streams sent
  * msbit-first.  Be sure to use cpu_to_be32() to append the computed CRC.
  */
-#if CRC_BE_BITS  32 || CRC_BE_BITS  1 || CRC_BE_BITS == 16

[PATCH 06/14] crc32: Fix mixing of endian-specific types

2011-12-01 Thread Darrick J. Wong
crc32.c in its original version freely mixed u32, __le32 and __be32 types
which caused warnings from sparse with __CHECK_ENDIAN__.
This patch fixes these by forcing the types to u32.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |   12 ++--
 1 files changed, 6 insertions(+), 6 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 2a87ea2..ff6bb9a 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -28,13 +28,13 @@
 #include crc32defs.h
 
 #if CRC_LE_BITS == 8
-# define tole(x) __constant_cpu_to_le32(x)
+# define tole(x) (__force u32) __constant_cpu_to_le32(x)
 #else
 # define tole(x) (x)
 #endif
 
 #if CRC_BE_BITS == 8
-# define tobe(x) __constant_cpu_to_be32(x)
+# define tobe(x) (__force u32) __constant_cpu_to_be32(x)
 #else
 # define tobe(x) (x)
 #endif
@@ -128,9 +128,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t 
len)
 # elif CRC_LE_BITS == 8
const u32  (*tab)[] = crc32table_le;
 
-   crc = __cpu_to_le32(crc);
+   crc = (__force u32) __cpu_to_le32(crc);
crc = crc32_body(crc, p, len, tab);
-   crc = __le32_to_cpu(crc);
+   crc = __le32_to_cpu((__force __le32)crc);
 #endif
return crc;
 }
@@ -171,9 +171,9 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t 
len)
 # elif CRC_BE_BITS == 8
const u32  (*tab)[] = crc32table_be;
 
-   crc = __cpu_to_be32(crc);
+   crc = (__force u32) __cpu_to_be32(crc);
crc = crc32_body(crc, p, len, tab);
-   crc = __be32_to_cpu(crc);
+   crc = __be32_to_cpu((__force __be32)crc);
 # endif
return crc;
 }

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 05/14] crc32: Miscellaneous cleanups

2011-12-01 Thread Darrick J. Wong
Misc cleanup of lib/crc32.c and related files
- removed unnecessary header files.
- straightened out some convoluted ifdef's
- rewrote some references to 2 dimensional arrays as 1 dimensional
  arrays to make them correct. I.e. replaced tab[i] with tab[0][i].
- a few trivial whitespace changes
- fixed a warning in gen_crc32tables.c caused by a mismatch in the
  type of the pointer passed to output table. Since the table is
  only used at kernel compile time, it is simpler to make the table
  big enough to hold the largest column size used. One cannot make the
  column size smaller in output_table because it has to be used by
  both the le and be tables and they can have different column sizes.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c  |  104 +-
 lib/gen_crc32table.c |6 +--
 2 files changed, 39 insertions(+), 71 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index c93c9ae..2a87ea2 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -23,13 +23,10 @@
 /* see: Documentation/crc32.txt for a description of algorithms */
 
 #include linux/crc32.h
-#include linux/kernel.h
 #include linux/module.h
-#include linux/compiler.h
 #include linux/types.h
-#include linux/init.h
-#include linux/atomic.h
 #include crc32defs.h
+
 #if CRC_LE_BITS == 8
 # define tole(x) __constant_cpu_to_le32(x)
 #else
@@ -41,6 +38,7 @@
 #else
 # define tobe(x) (x)
 #endif
+
 #include crc32table.h
 
 MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com);
@@ -96,6 +94,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
 #undef DO_CRC4
 }
 #endif
+
 /**
  * crc32_le() - Calculate bitwise little-endian Ethernet AUTODIN II CRC32
  * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for
@@ -103,53 +102,39 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len);
-
-#if CRC_LE_BITS == 1
-/*
- * In fact, the table-based code will work in this case, but it can be
- * simplified by inlining the table in ?: form.
- */
-
 u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
 {
+#if CRC_LE_BITS == 1
int i;
while (len--) {
crc ^= *p++;
for (i = 0; i  8; i++)
crc = (crc  1) ^ ((crc  1) ? CRCPOLY_LE : 0);
}
-   return crc;
-}
-#else  /* Table-based approach */
-
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
-{
-# if CRC_LE_BITS == 8
-   const u32  (*tab)[] = crc32table_le;
-
-   crc = __cpu_to_le32(crc);
-   crc = crc32_body(crc, p, len, tab);
-   return __le32_to_cpu(crc);
-# elif CRC_LE_BITS == 4
+# elif CRC_LE_BITS == 2
while (len--) {
crc ^= *p++;
-   crc = (crc  4) ^ crc32table_le[crc  15];
-   crc = (crc  4) ^ crc32table_le[crc  15];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
}
-   return crc;
-# elif CRC_LE_BITS == 2
+# elif CRC_LE_BITS == 4
while (len--) {
crc ^= *p++;
-   crc = (crc  2) ^ crc32table_le[crc  3];
-   crc = (crc  2) ^ crc32table_le[crc  3];
-   crc = (crc  2) ^ crc32table_le[crc  3];
-   crc = (crc  2) ^ crc32table_le[crc  3];
+   crc = (crc  4) ^ crc32table_le[0][crc  15];
+   crc = (crc  4) ^ crc32table_le[0][crc  15];
}
+# elif CRC_LE_BITS == 8
+   const u32  (*tab)[] = crc32table_le;
+
+   crc = __cpu_to_le32(crc);
+   crc = crc32_body(crc, p, len, tab);
+   crc = __le32_to_cpu(crc);
+#endif
return crc;
-# endif
 }
-#endif
+EXPORT_SYMBOL(crc32_le);
 
 /**
  * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
@@ -158,16 +143,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, 
size_t len)
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len);
-
-#if CRC_BE_BITS == 1
-/*
- * In fact, the table-based code will work in this case, but it can be
- * simplified by inlining the table in ?: form.
- */
-
 u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
 {
+#if CRC_BE_BITS == 1
int i;
while (len--) {
crc ^= *p++  24;
@@ -176,39 +154,29 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, 
size_t len

[PATCH 03/14] crc32: Simplify unit test code

2011-12-01 Thread Darrick J. Wong
Replaced the unit test provided in crc32.c, which doesn't have a
makefile and doesn't compile with current headers, with a simpler
self test routine that also gives a measure of performance and
runs at module init time. The self test option can be enabled
through a configuration option CONFIG_CRC32_SELFTEST.

The test stresses the pre and post loops and is thus not very
realistic since actual uses will likely have addresses and lengths
that are at least 4 byte aligned. However, the main loop is long
enough so that the performance is dominated by that loop.

The expected values for crc32_le and crc32_be were generated
with the original version of crc32.c using CRC_LE_BITS = 8 and
CRC_BE_BITS = 8. These values were then used to check all the
values of the BITS parameters in both the original and new versions.

The performance results show some variability from run to run
in spite of attempts to both warm the cache and reduce the amount
of OS noise by limiting interrupts during the test. To get comparable
results and to analyse options wrt performance the best time
reported over a small sample of runs has been taken.

From: Bob Pearson rpear...@systemfabricworks.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
[djw...@us.ibm.com: Minor changelog tweaks]
Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/Kconfig |   10 +
 lib/crc32.c |  798 ++-
 2 files changed, 691 insertions(+), 117 deletions(-)


diff --git a/lib/Kconfig b/lib/Kconfig
index 32f3e5a..2bc5834 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -60,6 +60,16 @@ config CRC32
  kernel tree does. Such modules that use library CRC32 functions
  require M here.
 
+config CRC32_SELFTEST
+   bool CRC32 perform self test on init
+   default n
+   depends on CRC32
+   help
+ This option enables the CRC32 library functions to perform a
+ self test on initialization. The self test computes crc32_le
+ and crc32_be over byte strings with random alignment and length
+ and computes the total elapsed time and number of bytes processed.
+
 config CRC7
tristate CRC7 functions
help
diff --git a/lib/crc32.c b/lib/crc32.c
index 7ac8b0d..7a0e5a9 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -210,137 +210,701 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, 
size_t len)
 EXPORT_SYMBOL(crc32_le);
 EXPORT_SYMBOL(crc32_be);
 
-#ifdef UNITTEST
+#ifdef CONFIG_CRC32_SELFTEST
 
-#include stdlib.h
-#include stdio.h
-
-#if 0  /*Not used at present */
-static void
-buf_dump(char const *prefix, unsigned char const *buf, size_t len)
+/* 4096 random bytes */
+static u8 __attribute__((__aligned__(8))) test_buf[] =
 {
-   fputs(prefix, stdout);
-   while (len--)
-   printf( %02x, *buf++);
-   putchar('\n');
-
-}
-#endif
-
-static void bytereverse(unsigned char *buf, size_t len)
+   0x5b, 0x85, 0x21, 0xcb, 0x09, 0x68, 0x7d, 0x30,
+   0xc7, 0x69, 0xd7, 0x30, 0x92, 0xde, 0x59, 0xe4,
+   0xc9, 0x6e, 0x8b, 0xdb, 0x98, 0x6b, 0xaa, 0x60,
+   0xa8, 0xb5, 0xbc, 0x6c, 0xa9, 0xb1, 0x5b, 0x2c,
+   0xea, 0xb4, 0x92, 0x6a, 0x3f, 0x79, 0x91, 0xe4,
+   0xe9, 0x70, 0x51, 0x8c, 0x7f, 0x95, 0x6f, 0x1a,
+   0x56, 0xa1, 0x5c, 0x27, 0x03, 0x67, 0x9f, 0x3a,
+   0xe2, 0x31, 0x11, 0x29, 0x6b, 0x98, 0xfc, 0xc4,
+   0x53, 0x24, 0xc5, 0x8b, 0xce, 0x47, 0xb2, 0xb9,
+   0x32, 0xcb, 0xc1, 0xd0, 0x03, 0x57, 0x4e, 0xd4,
+   0xe9, 0x3c, 0xa1, 0x63, 0xcf, 0x12, 0x0e, 0xca,
+   0xe1, 0x13, 0xd1, 0x93, 0xa6, 0x88, 0x5c, 0x61,
+   0x5b, 0xbb, 0xf0, 0x19, 0x46, 0xb4, 0xcf, 0x9e,
+   0xb6, 0x6b, 0x4c, 0x3a, 0xcf, 0x60, 0xf9, 0x7a,
+   0x8d, 0x07, 0x63, 0xdb, 0x40, 0xe9, 0x0b, 0x6f,
+   0xad, 0x97, 0xf1, 0xed, 0xd0, 0x1e, 0x26, 0xfd,
+   0xbf, 0xb7, 0xc8, 0x04, 0x94, 0xf8, 0x8b, 0x8c,
+   0xf1, 0xab, 0x7a, 0xd4, 0xdd, 0xf3, 0xe8, 0x88,
+   0xc3, 0xed, 0x17, 0x8a, 0x9b, 0x40, 0x0d, 0x53,
+   0x62, 0x12, 0x03, 0x5f, 0x1b, 0x35, 0x32, 0x1f,
+   0xb4, 0x7b, 0x93, 0x78, 0x0d, 0xdb, 0xce, 0xa4,
+   0xc0, 0x47, 0xd5, 0xbf, 0x68, 0xe8, 0x5d, 0x74,
+   0x8f, 0x8e, 0x75, 0x1c, 0xb2, 0x4f, 0x9a, 0x60,
+   0xd1, 0xbe, 0x10, 0xf4, 0x5c, 0xa1, 0x53, 0x09,
+   0xa5, 0xe0, 0x09, 0x54, 0x85, 0x5c, 0xdc, 0x07,
+   0xe7, 0x21, 0x69, 0x7b, 0x8a, 0xfd, 0x90, 0xf1,
+   0x22, 0xd0, 0xb4, 0x36, 0x28, 0xe6, 0xb8, 0x0f,
+   0x39, 0xde, 0xc8, 0xf3, 0x86, 0x60, 0x34, 0xd2,
+   0x5e, 0xdf, 0xfd, 0xcf, 0x0f, 0xa9, 0x65, 0xf0,
+   0xd5, 0x4d, 0x96, 0x40, 0xe3, 0xdf, 0x3f, 0x95,
+   0x5a, 0x39, 0x19, 0x93, 0xf4, 0x75, 0xce, 0x22,
+   0x00, 0x1c, 0x93, 0xe2, 0x03, 0x66, 0xf4, 0x93,
+   0x73, 0x86, 0x81, 0x8e, 0x29, 0x44, 0x48, 0x86,
+   0x61, 0x7c, 0x48, 0xa3, 0x43, 0xd2, 0x9c, 0x8d,
+   0xd4, 0x95, 0xdd, 0xe1, 0x22, 0x89, 0x3a, 0x40,
+   0x4c, 0x1b, 0x8a, 0x04, 0xa8, 0x09, 0x69, 0x8b,
+   0xea, 0xc6, 0x55

Re: [PATCH v5.2 00/14] crc32c: Add faster algorithm and self-test code

2011-12-01 Thread Darrick J. Wong
On Thu, Dec 01, 2011 at 12:20:53PM -0800, Joel Becker wrote:
 On Thu, Dec 01, 2011 at 12:13:41PM -0800, Darrick J. Wong wrote:
  Hi all,
  
  This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a
  software crc32c implementation.  It removes the crc32c implementation in
  crypto/ in favor of using the stamped-out one in lib/.  There is also a 
  change
  to Kconfig so that the kernel builder can pick an implementation best suited
  for the hardware.
  
  The motivation for this patchset is that I am working on adding full 
  metadata
  checksumming to ext4.  As far as performance impact of adding checksumming
  goes, I see nearly no change with a standard mail server ffsb simulation.  
  On a
  test that involves only file creation and deletion and extent tree writes, I
  see a drop of about 50 percent with the current kernel crc32c 
  implementation;
  this improves to a drop of about 20 percent with the enclosed crc32c code.
  
  For typical workloads, this new implementation
  doesn't help much because metadata is usually a small fraction of total IO.
  However, when we are doing IO that is almost all metadata (such as rm 
  -rf'ing a
  tree), then this patch speeds up the operation substantially.
  
  Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this 
  patchset
  should improve their speed as well.  I have not yet quantified that, 
  however.
 
   I thought they usually used the SSE instruction for crc32 or
 equivalent.

They seem to call crc32c(), which is in crypto/crc32c.  If you're interested in
hardware accelerated crc32c on Intel, it is still the case that the wrapper for
that can be loaded via crc32c-intel.

--D
 
 Joel
 
 -- 
 
 I almost ran over an angel
  He had a nice big fat cigar.
  'In a sense,' he said, 'You're alone here
  So if you jump, you'd best jump far.'
 
   http://www.jlbec.org/
   jl...@evilplan.org
 

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/14] Moved a long comment from lib/crc32.c to Documentation/crc32.txt

2011-11-28 Thread Darrick J. Wong
where it will more likely get read.
- Edited the resulting document to add an explanation of the 
slicing-by-n
  algorithm.

Signed-off-by: George Spelvin li...@horizon.com
Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
---
 Documentation/00-INDEX  |2 +
 Documentation/crc32.txt |  183 +++
 lib/crc32.c |  129 +
 3 files changed, 187 insertions(+), 127 deletions(-)
 create mode 100644 Documentation/crc32.txt


diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 65bbd26..e7b38a0 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -104,6 +104,8 @@ cpuidle/
- info on CPU_IDLE, CPU idle state management subsystem.
 cputopology.txt
- documentation on how CPU topology info is exported via sysfs.
+crc32.txt
+   - brief tutorial on CRC computation
 cris/
- directory with info about Linux on CRIS architecture.
 crypto/
diff --git a/Documentation/crc32.txt b/Documentation/crc32.txt
new file mode 100644
index 000..3d74ba4
--- /dev/null
+++ b/Documentation/crc32.txt
@@ -0,0 +1,183 @@
+A brief CRC tutorial.
+
+A CRC is a long-division remainder.  You add the CRC to the message,
+and the whole thing (message+CRC) is a multiple of the given
+CRC polynomial.  To check the CRC, you can either check that the
+CRC matches the recomputed value, *or* you can check that the
+remainder computed on the message+CRC is 0.  This latter approach
+is used by a lot of hardware implementations, and is why so many
+protocols put the end-of-frame flag after the CRC.
+
+It's actually the same long division you learned in school, except that
+- We're working in binary, so the digits are only 0 and 1, and
+- When dividing polynomials, there are no carries.  Rather than add and
+  subtract, we just xor.  Thus, we tend to get a bit sloppy about
+  the difference between adding and subtracting.
+
+Like all division, the remainder is always smaller than the divisor.
+To produce a 32-bit CRC, the divisor is actually a 33-bit CRC polynomial.
+Since it's 33 bits long, bit 32 is always going to be set, so usually the
+CRC is written in hex with the most significant bit omitted.  (If you're
+familiar with the IEEE 754 floating-point format, it's the same idea.)
+
+Note that a CRC is computed over a string of *bits*, so you have
+to decide on the endianness of the bits within each byte.  To get
+the best error-detecting properties, this should correspond to the
+order they're actually sent.  For example, standard RS-232 serial is
+little-endian; the most significant bit (sometimes used for parity)
+is sent last.  And when appending a CRC word to a message, you should
+do it in the right order, matching the endianness.
+
+Just like with ordinary division, you proceed one digit (bit) at a time.
+Each step of the division, you take one more digit (bit) of the
+dividend and append it to the current remainder.  Then you figure out the
+appropriate multiple of the divisor to subtract to bring the remainder
+back into range.  In binary, this is easy - it has to be either 0 or 1,
+and to make the XOR cancel, it's just a copy of bit 32 of the remainder.
+
+When computing a CRC, we don't care about the quotient, so we can
+throw the quotient bit away, but subtract the appropriate multiple of
+the polynomial from the remainder and we're back to where we started,
+ready to process the next bit.
+
+A big-endian CRC written this way would be coded like:
+for (i = 0; i < input_bits; i++) {
+   multiple = remainder & 0x80000000 ? CRCPOLY : 0;
+   remainder = (remainder << 1 | next_input_bit()) ^ multiple;
+}
+
+Notice how, to get at bit 32 of the shifted remainder, we look
+at bit 31 of the remainder *before* shifting it.
+
+But also notice how the next_input_bit() bits we're shifting into
+the remainder don't actually affect any decision-making until
+32 bits later.  Thus, the first 32 cycles of this are pretty boring.
+Also, to add the CRC to a message, we need a 32-bit-long hole for it at
+the end, so we have to add 32 extra cycles shifting in zeros at the
+end of every message.
+
+These details lead to a standard trick: postpone merging in the
+next_input_bit() until the moment it's needed.  Then the first 32 cycles
+can be precomputed, and merging in the final 32 zero bits to make room
+for the CRC can be skipped entirely.  This changes the code to:
+
+for (i = 0; i < input_bits; i++) {
+   remainder ^= next_input_bit() << 31;
+   multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
+   remainder = (remainder << 1) ^ multiple;
+}
+
+With this optimization, the little-endian code is particularly simple:
+for (i = 0; i < input_bits; i++) {
+   remainder ^= next_input_bit();
+   multiple = (remainder & 1) ? CRCPOLY : 0;
+   remainder = (remainder >> 1) ^ multiple;
+}
+
+The most significant coefficient of the remainder polynomial is stored
+in the least 

[PATCH 05/14] Misc cleanup of lib/crc32.c and related files

2011-11-28 Thread Darrick J. Wong
- removed unnecessary header files.
- straightened out some convoluted ifdef's
- rewrote some references to 2 dimensional arrays as 1 dimensional
  arrays to make them correct. I.e. replaced tab[i] with tab[0][i].
- a few trivial whitespace changes
- fixed a warning in gen_crc32tables.c caused by a mismatch in the
  type of the pointer passed to output table. Since the table is
  only used at kernel compile time, it is simpler to make the table
  big enough to hold the largest column size used. One cannot make the
  column size smaller in output_table because it has to be used by
  both the le and be tables and they can have different column sizes.

Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
---
 lib/crc32.c  |  104 +-
 lib/gen_crc32table.c |6 +--
 2 files changed, 39 insertions(+), 71 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index c93c9ae..2a87ea2 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -23,13 +23,10 @@
 /* see: Documentation/crc32.txt for a description of algorithms */
 
 #include linux/crc32.h
-#include linux/kernel.h
 #include linux/module.h
-#include linux/compiler.h
 #include linux/types.h
-#include linux/init.h
-#include linux/atomic.h
 #include crc32defs.h
+
 #if CRC_LE_BITS == 8
 # define tole(x) __constant_cpu_to_le32(x)
 #else
@@ -41,6 +38,7 @@
 #else
 # define tobe(x) (x)
 #endif
+
 #include crc32table.h
 
 MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com);
@@ -96,6 +94,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
 #undef DO_CRC4
 }
 #endif
+
 /**
  * crc32_le() - Calculate bitwise little-endian Ethernet AUTODIN II CRC32
  * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for
@@ -103,53 +102,39 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len);
-
-#if CRC_LE_BITS == 1
-/*
- * In fact, the table-based code will work in this case, but it can be
- * simplified by inlining the table in ?: form.
- */
-
 u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
 {
+#if CRC_LE_BITS == 1
int i;
while (len--) {
crc ^= *p++;
for (i = 0; i  8; i++)
crc = (crc  1) ^ ((crc  1) ? CRCPOLY_LE : 0);
}
-   return crc;
-}
-#else  /* Table-based approach */
-
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
-{
-# if CRC_LE_BITS == 8
-   const u32  (*tab)[] = crc32table_le;
-
-   crc = __cpu_to_le32(crc);
-   crc = crc32_body(crc, p, len, tab);
-   return __le32_to_cpu(crc);
-# elif CRC_LE_BITS == 4
+# elif CRC_LE_BITS == 2
while (len--) {
crc ^= *p++;
-   crc = (crc  4) ^ crc32table_le[crc  15];
-   crc = (crc  4) ^ crc32table_le[crc  15];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ crc32table_le[0][crc  3];
}
-   return crc;
-# elif CRC_LE_BITS == 2
+# elif CRC_LE_BITS == 4
while (len--) {
crc ^= *p++;
-   crc = (crc  2) ^ crc32table_le[crc  3];
-   crc = (crc  2) ^ crc32table_le[crc  3];
-   crc = (crc  2) ^ crc32table_le[crc  3];
-   crc = (crc  2) ^ crc32table_le[crc  3];
+   crc = (crc  4) ^ crc32table_le[0][crc  15];
+   crc = (crc  4) ^ crc32table_le[0][crc  15];
}
+# elif CRC_LE_BITS == 8
+   const u32  (*tab)[] = crc32table_le;
+
+   crc = __cpu_to_le32(crc);
+   crc = crc32_body(crc, p, len, tab);
+   crc = __le32_to_cpu(crc);
+#endif
return crc;
-# endif
 }
-#endif
+EXPORT_SYMBOL(crc32_le);
 
 /**
  * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
@@ -158,16 +143,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, 
size_t len)
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len);
-
-#if CRC_BE_BITS == 1
-/*
- * In fact, the table-based code will work in this case, but it can be
- * simplified by inlining the table in ?: form.
- */
-
 u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
 {
+#if CRC_BE_BITS == 1
int i;
while (len--) {
crc ^= *p++  24;
@@ -176,39 +154,29 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, 
size_t len)
(crc  1) ^ ((crc  0x8000) ? CRCPOLY_BE :
  0);
}
-   return crc;
-}
-
-#else  /* 

[PATCH 03/14] Replaced the unit test provided in crc32.c, which doesn't have a

2011-11-28 Thread Darrick J. Wong
makefile and doesn't compile with current headers, with a simpler
self test routine that also gives a measure of performance and
runs at module init time. The self test option can be enabled
through a configuration option CONFIG_CRC32_SELFTEST.

The test stresses the pre and post loops and is thus not very
realistic since actual uses will likely have addresses and lengths
that are at least 4 byte aligned. However, the main loop is long
enough so that the performance is dominated by that loop.

The expected values for crc32_le and crc32_be were generated
with the original version of crc32.c using CRC_BITS_LE = 8 and
CRC_BITS_BE = 8. These values were then used to check all the
values of the BITS parameters in both the original and new versions.

The performance results show some variability from run to run
in spite of attempts to both warm the cache and reduce the amount
of OS noise by limiting interrupts during the test. To get comparable
results and to analyse options wrt performance the best time
reported over a small sample of runs has been taken.

Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
---
 lib/Kconfig |   10 +
 lib/crc32.c |  798 ++-
 2 files changed, 691 insertions(+), 117 deletions(-)


diff --git a/lib/Kconfig b/lib/Kconfig
index 32f3e5a..2bc5834 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -60,6 +60,16 @@ config CRC32
  kernel tree does. Such modules that use library CRC32 functions
  require M here.
 
+config CRC32_SELFTEST
+   bool CRC32 perform self test on init
+   default n
+   depends on CRC32
+   help
+ This option enables the CRC32 library functions to perform a
+ self test on initialization. The self test computes crc32_le
+ and crc32_be over byte strings with random alignment and length
+ and computes the total elapsed time and number of bytes processed.
+
 config CRC7
tristate CRC7 functions
help
diff --git a/lib/crc32.c b/lib/crc32.c
index 7ac8b0d..7a0e5a9 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -210,137 +210,701 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, 
size_t len)
 EXPORT_SYMBOL(crc32_le);
 EXPORT_SYMBOL(crc32_be);
 
-#ifdef UNITTEST
+#ifdef CONFIG_CRC32_SELFTEST
 
-#include stdlib.h
-#include stdio.h
-
-#if 0  /*Not used at present */
-static void
-buf_dump(char const *prefix, unsigned char const *buf, size_t len)
+/* 4096 random bytes */
+static u8 __attribute__((__aligned__(8))) test_buf[] =
 {
-   fputs(prefix, stdout);
-   while (len--)
-   printf( %02x, *buf++);
-   putchar('\n');
-
-}
-#endif
-
-static void bytereverse(unsigned char *buf, size_t len)
+   0x5b, 0x85, 0x21, 0xcb, 0x09, 0x68, 0x7d, 0x30,
+   0xc7, 0x69, 0xd7, 0x30, 0x92, 0xde, 0x59, 0xe4,
+   0xc9, 0x6e, 0x8b, 0xdb, 0x98, 0x6b, 0xaa, 0x60,
+   0xa8, 0xb5, 0xbc, 0x6c, 0xa9, 0xb1, 0x5b, 0x2c,
+   0xea, 0xb4, 0x92, 0x6a, 0x3f, 0x79, 0x91, 0xe4,
+   0xe9, 0x70, 0x51, 0x8c, 0x7f, 0x95, 0x6f, 0x1a,
+   0x56, 0xa1, 0x5c, 0x27, 0x03, 0x67, 0x9f, 0x3a,
+   0xe2, 0x31, 0x11, 0x29, 0x6b, 0x98, 0xfc, 0xc4,
+   0x53, 0x24, 0xc5, 0x8b, 0xce, 0x47, 0xb2, 0xb9,
+   0x32, 0xcb, 0xc1, 0xd0, 0x03, 0x57, 0x4e, 0xd4,
+   0xe9, 0x3c, 0xa1, 0x63, 0xcf, 0x12, 0x0e, 0xca,
+   0xe1, 0x13, 0xd1, 0x93, 0xa6, 0x88, 0x5c, 0x61,
+   0x5b, 0xbb, 0xf0, 0x19, 0x46, 0xb4, 0xcf, 0x9e,
+   0xb6, 0x6b, 0x4c, 0x3a, 0xcf, 0x60, 0xf9, 0x7a,
+   0x8d, 0x07, 0x63, 0xdb, 0x40, 0xe9, 0x0b, 0x6f,
+   0xad, 0x97, 0xf1, 0xed, 0xd0, 0x1e, 0x26, 0xfd,
+   0xbf, 0xb7, 0xc8, 0x04, 0x94, 0xf8, 0x8b, 0x8c,
+   0xf1, 0xab, 0x7a, 0xd4, 0xdd, 0xf3, 0xe8, 0x88,
+   0xc3, 0xed, 0x17, 0x8a, 0x9b, 0x40, 0x0d, 0x53,
+   0x62, 0x12, 0x03, 0x5f, 0x1b, 0x35, 0x32, 0x1f,
+   0xb4, 0x7b, 0x93, 0x78, 0x0d, 0xdb, 0xce, 0xa4,
+   0xc0, 0x47, 0xd5, 0xbf, 0x68, 0xe8, 0x5d, 0x74,
+   0x8f, 0x8e, 0x75, 0x1c, 0xb2, 0x4f, 0x9a, 0x60,
+   0xd1, 0xbe, 0x10, 0xf4, 0x5c, 0xa1, 0x53, 0x09,
+   0xa5, 0xe0, 0x09, 0x54, 0x85, 0x5c, 0xdc, 0x07,
+   0xe7, 0x21, 0x69, 0x7b, 0x8a, 0xfd, 0x90, 0xf1,
+   0x22, 0xd0, 0xb4, 0x36, 0x28, 0xe6, 0xb8, 0x0f,
+   0x39, 0xde, 0xc8, 0xf3, 0x86, 0x60, 0x34, 0xd2,
+   0x5e, 0xdf, 0xfd, 0xcf, 0x0f, 0xa9, 0x65, 0xf0,
+   0xd5, 0x4d, 0x96, 0x40, 0xe3, 0xdf, 0x3f, 0x95,
+   0x5a, 0x39, 0x19, 0x93, 0xf4, 0x75, 0xce, 0x22,
+   0x00, 0x1c, 0x93, 0xe2, 0x03, 0x66, 0xf4, 0x93,
+   0x73, 0x86, 0x81, 0x8e, 0x29, 0x44, 0x48, 0x86,
+   0x61, 0x7c, 0x48, 0xa3, 0x43, 0xd2, 0x9c, 0x8d,
+   0xd4, 0x95, 0xdd, 0xe1, 0x22, 0x89, 0x3a, 0x40,
+   0x4c, 0x1b, 0x8a, 0x04, 0xa8, 0x09, 0x69, 0x8b,
+   0xea, 0xc6, 0x55, 0x8e, 0x57, 0xe6, 0x64, 0x35,
+   0xf0, 0xc7, 0x16, 0x9f, 0x5d, 0x5e, 0x86, 0x40,
+   0x46, 0xbb, 0xe5, 0x45, 0x88, 0xfe, 0xc9, 0x63,
+   0x15, 0xfb, 0xf5, 0xbd, 0x71, 0x61, 0xeb, 0x7b,
+   

[PATCH 10/14] Some final changes

2011-11-28 Thread Darrick J. Wong
- added a comment at the top of crc32.c

Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
---
 lib/crc32.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 2c8e8c0..d56516d 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -1,4 +1,8 @@
 /*
+ * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
+ * cleaned up code to current version of sparse and added the slicing-by-8
+ * algorithm to the closely similar existing slicing-by-4 algorithm.
+ *
  * Oct 15, 2000 Matt Domsch matt_dom...@dell.com
  * Nicer crc32 functions/docs submitted by li...@horizon.com.  Thanks!
  * Code was from the public domain, copyright abandoned.  Code was

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 13/14] crc32: Add self-test code for crc32c

2011-11-28 Thread Darrick J. Wong
Add self-test code for crc32c.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |  363 ++-
 1 files changed, 261 insertions(+), 102 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 8df9561..382fa76 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -765,113 +765,265 @@ static struct crc_test {
u32 length; /* random 11 bit length of test */
u32 crc_le; /* expected crc32_le result */
u32 crc_be; /* expected crc32_be result */
+   u32 crc32c_le;  /* expected crc32c_le result */
 } test[] =
 {
-   {0x674bf11d, 0x0038, 0x0542, 0x0af6d466, 0xd8b6e4c1},
-   {0x35c672c6, 0x003a, 0x01aa, 0xc6d3dfba, 0x28aaf3ad},
-   {0x496da28e, 0x0039, 0x05af, 0xd933660f, 0x5d57e81f},
-   {0x09a9b90e, 0x0027, 0x01f8, 0xb45fe007, 0xf45fca9a},
-   {0xdc97e5a9, 0x0025, 0x03b6, 0xf81a3562, 0xe0126ba2},
-   {0x47c58900, 0x000a, 0x00b9, 0x8e58eccf, 0xf3afc793},
-   {0x292561e8, 0x000c, 0x0403, 0xa2ba8aaf, 0x0b797aed},
-   {0x415037f6, 0x0003, 0x0676, 0xa17d52e8, 0x7f0fdf35},
-   {0x3466e707, 0x0026, 0x0042, 0x258319be, 0x75c484a2},
-   {0xafd1281b, 0x0023, 0x02ee, 0x4428eaf8, 0x06c7ad10},
-   {0xd3857b18, 0x0028, 0x04a2, 0x5c430821, 0xb062b7cb},
-   {0x1d825a8f, 0x002b, 0x050b, 0xd2c45f0c, 0xd68634e0},
-   {0x5033e3bc, 0x000b, 0x0078, 0xa3ea4113, 0xac6d31fb},
-   {0x94f1fb5e, 0x000f, 0x03a2, 0xfbfc50b1, 0x3cfe50ed},
-   {0xc9a0fe14, 0x0009, 0x0473, 0x5fb61894, 0x87070591},
-   {0x88a034b1, 0x001c, 0x05ad, 0xc1b16053, 0x46f95c67},
-   {0xf0f72239, 0x0020, 0x026d, 0xa6fa58f3, 0xf8c2c1dd},
-   {0xcc20a5e3, 0x003b, 0x067a, 0x7740185a, 0x308b979a},
-   {0xce589c95, 0x002b, 0x0641, 0xd055e987, 0x40aae25b},
-   {0x78edc885, 0x0035, 0x05be, 0xa39cb14b, 0x035b0d1f},
-   {0x9d40a377, 0x003b, 0x0038, 0x1f47ccd2, 0x197fbc9d},
-   {0x703d0e01, 0x003c, 0x06f1, 0x88735e7c, 0xfed57c5a},
-   {0x776bf505, 0x000f, 0x05b2, 0x5cc4fc01, 0xf32efb97},
-   {0x4a3e7854, 0x0027, 0x04b8, 0x8d923c82, 0x0cbfb4a2},
-   {0x209172dd, 0x003b, 0x0356, 0xb89e9c2b, 0xd7868138},
-   {0x3ba4cc5b, 0x002f, 0x0203, 0xe51601a9, 0x5b2a1032},
-   {0xfc62f297, 0x, 0x0079, 0x71a8e1a2, 0x5d88685f},
-   {0x64280b8b, 0x0016, 0x07ab, 0x0fa7a30c, 0xda3a455f},
-   {0x97dd724b, 0x0033, 0x07ad, 0x5788b2f4, 0xd7326d32},
-   {0x61394b52, 0x0035, 0x0571, 0xc66525f1, 0xcabe7fef},
-   {0x29b4faff, 0x0024, 0x006e, 0xca13751e, 0x993648e0},
-   {0x29bfb1dc, 0x000b, 0x0244, 0x436c43f7, 0x429f7a59},
-   {0x86ae934b, 0x0035, 0x0104, 0x0760ec93, 0x9cf7d0f4},
-   {0xc4c1024e, 0x002e, 0x06b1, 0x6516a3ec, 0x19321f9c},
-   {0x3287a80a, 0x0026, 0x0496, 0x0b257eb1, 0x754ebd51},
-   {0xa4db423e, 0x0023, 0x045d, 0x9b3a66dc, 0x873e9f11},
-   {0x7a1078df, 0x0015, 0x014a, 0x8c2484c5, 0x6a628659},
-   {0x6048bd5b, 0x0006, 0x006a, 0x897e3559, 0xac9961af},
-   {0xd8f9ea20, 0x003d, 0x0277, 0x60eb905b, 0xed2aaf99},
-   {0xea5ec3b4, 0x002a, 0x04fe, 0x869965dc, 0x6c1f833b},
-   {0x2dfb005d, 0x0016, 0x0345, 0x6a3b117e, 0xf05e8521},
-   {0x5a214ade, 0x0020, 0x05b6, 0x467f70be, 0xcb22ccd3},
-   {0xf0ab9cca, 0x0032, 0x0515, 0xed223df3, 0x7f3ef01d},
-   {0x91b444f9, 0x002e, 0x07f8, 0x84e9a983, 0x5676756f},
-   {0x1b5d2ddb, 0x002e, 0x012c, 0xba638c4c, 0x3f42047b},
-   {0xd824d1bb, 0x003a, 0x07b5, 0x6288653b, 0x3a3ebea0},
-   {0x0470180c, 0x0034, 0x01f0, 0x9d5b80d6, 0x3de08195},
-   {0xffaa3a3f, 0x0036, 0x0299, 0xf3a82ab8, 0x53e0c13d},
-   {0x6406cfeb, 0x0023, 0x0600, 0xa920b8e8, 0xe4e2acf4},
-   {0xb24aaa38, 0x003e, 0x04a1, 0x657cc328, 0x5077b2c3},
-   {0x58b2ab7c, 0x0039, 0x02b4, 0x3a17ee7e, 0x9dcb3643},
-   {0x3db85970, 0x0006, 0x02b6, 0x95268b59, 0xb9812c10},
-   {0x857830c5, 0x0003, 0x0590, 0x4ef439d5, 0xf042161d},
-   {0xe1fcd978, 0x003e, 0x07d8, 0xae8d8699, 0xce0a1ef5},
-   {0xb982a768, 0x0016, 0x06e0, 0x62fad3df, 0x5f8a067b},
-   {0x1d581ce8, 0x001e, 0x058b, 0xf0f5da53, 0x26e39eee},
-   {0x2456719b, 0x0025, 0x0503, 0x4296ac64, 0xd50e4c14},
-   {0xfae6d8f2, 0x, 0x055d, 0x057fdf2e, 0x2a31391a},
-   {0xcba828e3, 0x0039, 0x02ce, 0xe3f22351, 0x8f00877b},
-   {0x13d25952, 0x000a, 0x072d, 0x76d4b4cc, 0x5eb67ec3},
-   {0x0342be3f, 0x0015, 0x0599, 0xec75d9f1, 0x9d4d2826},
-   {0xeaa344e0, 0x0014, 0x04d8, 0x72a4c981, 0x2064ea06},
-   {0xbbb52021, 0x003b, 0x0272

[PATCH 06/14] crc32.c in its original version freely mixed u32, __le32 and __be32 types

2011-11-28 Thread Darrick J. Wong
which caused warnings from sparse with __CHECK_ENDIAN__.
This patch fixes these by forcing the types to u32.

Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
---
 lib/crc32.c |   12 ++--
 1 files changed, 6 insertions(+), 6 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 2a87ea2..ff6bb9a 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -28,13 +28,13 @@
 #include crc32defs.h
 
 #if CRC_LE_BITS == 8
-# define tole(x) __constant_cpu_to_le32(x)
+# define tole(x) (__force u32) __constant_cpu_to_le32(x)
 #else
 # define tole(x) (x)
 #endif
 
 #if CRC_BE_BITS == 8
-# define tobe(x) __constant_cpu_to_be32(x)
+# define tobe(x) (__force u32) __constant_cpu_to_be32(x)
 #else
 # define tobe(x) (x)
 #endif
@@ -128,9 +128,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t 
len)
 # elif CRC_LE_BITS == 8
const u32  (*tab)[] = crc32table_le;
 
-   crc = __cpu_to_le32(crc);
+   crc = (__force u32) __cpu_to_le32(crc);
crc = crc32_body(crc, p, len, tab);
-   crc = __le32_to_cpu(crc);
+   crc = __le32_to_cpu((__force __le32)crc);
 #endif
return crc;
 }
@@ -171,9 +171,9 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t 
len)
 # elif CRC_BE_BITS == 8
const u32  (*tab)[] = crc32table_be;
 
-   crc = __cpu_to_be32(crc);
+   crc = (__force u32) __cpu_to_be32(crc);
crc = crc32_body(crc, p, len, tab);
-   crc = __be32_to_cpu(crc);
+   crc = __be32_to_cpu((__force __be32)crc);
 # endif
return crc;
 }

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 12/14] crypto: crc32c should use library implementation

2011-11-28 Thread Darrick J. Wong
Since lib/crc32.c now provides crc32c, remove the software implementation here
and call the library function instead.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 crypto/Kconfig  |1 +
 crypto/crc32c.c |   94 ++-
 2 files changed, 4 insertions(+), 91 deletions(-)


diff --git a/crypto/Kconfig b/crypto/Kconfig
index 527a857..4c9e93a 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -310,6 +310,7 @@ comment Digest
 config CRYPTO_CRC32C
tristate CRC32c CRC algorithm
select CRYPTO_HASH
+   select CRC32
help
  Castagnoli, et al Cyclic Redundancy-Check Algorithm.  Used
  by iSCSI for header and data digests and by others.
diff --git a/crypto/crc32c.c b/crypto/crc32c.c
index 3f9ad28..06f7018 100644
--- a/crypto/crc32c.c
+++ b/crypto/crc32c.c
@@ -40,6 +40,7 @@
 #include linux/module.h
 #include linux/string.h
 #include linux/kernel.h
+#include linux/crc32.h
 
 #define CHKSUM_BLOCK_SIZE  1
 #define CHKSUM_DIGEST_SIZE 4
@@ -53,95 +54,6 @@ struct chksum_desc_ctx {
 };
 
 /*
- * This is the CRC-32C table
- * Generated with:
- * width = 32 bits
- * poly = 0x1EDC6F41
- * reflect input bytes = true
- * reflect output bytes = true
- */
-
-static const u32 crc32c_table[256] = {
-   0xL, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
-   0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
-   0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
-   0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
-   0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
-   0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
-   0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
-   0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
-   0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
-   0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
-   0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
-   0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
-   0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
-   0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
-   0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
-   0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
-   0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
-   0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
-   0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
-   0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
-   0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
-   0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
-   0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
-   0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
-   0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
-   0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
-   0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
-   0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
-   0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
-   0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
-   0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
-   0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
-   0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
-   0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
-   0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
-   0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
-   0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
-   0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
-   0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
-   0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
-   0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
-   0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
-   0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
-   0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
-   0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
-   0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
-   0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
-   0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
-   0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
-   0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
-   0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
-   0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
-   0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
-   0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
-   0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
-   0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
-   0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
-   0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
-   0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
-   0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
-   0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
-   0x34F4F86AL, 0xC69F7B69L

[PATCH 11/14] crc32: Bolt on crc32c

2011-11-28 Thread Darrick J. Wong
Reuse the existing crc32 code to stamp out a crc32c implementation.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 include/linux/crc32.h |2 ++
 lib/Kconfig   |8 +++---
 lib/crc32.c   |   62 +++--
 lib/crc32defs.h   |7 ++
 lib/gen_crc32table.c  |   35 ++--
 5 files changed, 80 insertions(+), 34 deletions(-)


diff --git a/include/linux/crc32.h b/include/linux/crc32.h
index 391a259..68267b6 100644
--- a/include/linux/crc32.h
+++ b/include/linux/crc32.h
@@ -11,6 +11,8 @@
 extern u32  crc32_le(u32 crc, unsigned char const *p, size_t len);
 extern u32  crc32_be(u32 crc, unsigned char const *p, size_t len);
 
+extern u32  __crc32c_le(u32 crc, unsigned char const *p, size_t len);
+
 #define crc32(seed, data, length)  crc32_le(seed, (unsigned char const 
*)(data), length)
 
 /*
diff --git a/lib/Kconfig b/lib/Kconfig
index 2bc5834..cfddafc 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -51,14 +51,14 @@ config CRC_ITU_T
  functions require M here.
 
 config CRC32
-   tristate CRC32 functions
+   tristate CRC32/CRC32c functions
default y
select BITREVERSE
help
  This option is provided for the case where no in-kernel-tree
- modules require CRC32 functions, but a module built outside the
- kernel tree does. Such modules that use library CRC32 functions
- require M here.
+ modules require CRC32/CRC32c functions, but a module built outside
+ the kernel tree does. Such modules that use library CRC32/CRC32c
+ functions require M here.
 
 config CRC32_SELFTEST
bool CRC32 perform self test on init
diff --git a/lib/crc32.c b/lib/crc32.c
index d56516d..8df9561 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -46,7 +46,7 @@
 #include crc32table.h
 
 MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com);
-MODULE_DESCRIPTION(Ethernet CRC32 calculations);
+MODULE_DESCRIPTION(Various CRC32 calculations);
 MODULE_LICENSE(GPL);
 
 #if CRC_LE_BITS  8 || CRC_BE_BITS  8
@@ -135,46 +135,57 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
+static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
+ size_t len, const u32 (*tab)[256],
+ u32 polynomial)
 {
 #if CRC_LE_BITS == 1
int i;
while (len--) {
crc ^= *p++;
for (i = 0; i  8; i++)
-   crc = (crc  1) ^ ((crc  1) ? CRCPOLY_LE : 0);
+   crc = (crc  1) ^ ((crc  1) ? polynomial : 0);
}
 # elif CRC_LE_BITS == 2
while (len--) {
crc ^= *p++;
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
}
 # elif CRC_LE_BITS == 4
while (len--) {
crc ^= *p++;
-   crc = (crc  4) ^ crc32table_le[0][crc  15];
-   crc = (crc  4) ^ crc32table_le[0][crc  15];
+   crc = (crc  4) ^ tab[0][crc  15];
+   crc = (crc  4) ^ tab[0][crc  15];
}
 # elif CRC_LE_BITS == 8
/* aka Sarwate algorithm */
while (len--) {
crc ^= *p++;
-   crc = (crc  8) ^ crc32table_le[0][crc  255];
+   crc = (crc  8) ^ tab[0][crc  255];
}
 # else
-   const u32  (*tab)[] = crc32table_le;
-
crc = (__force u32) __cpu_to_le32(crc);
crc = crc32_body(crc, p, len, tab);
crc = __le32_to_cpu((__force __le32)crc);
 #endif
return crc;
 }
+
+u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
+{
+   return crc32_le_generic(crc, p, len, crc32table_le, CRCPOLY_LE);
+}
 EXPORT_SYMBOL(crc32_le);
 
+u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
+{
+   return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE);
+}
+EXPORT_SYMBOL(__crc32c_le);
+
 /**
  * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
  * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for
@@ -182,7 +193,9 @@ EXPORT_SYMBOL(crc32_le);
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
+static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p,
+ size_t len, const u32 (*tab)[256

[PATCH 08/14] add slicing-by-8 algorithm to the existing

2011-11-28 Thread Darrick J. Wong
slicing-by-4 algorithm. This consists of:
- extend largest BITS size from 32 to 64
- extend tables from tab[4][256] to up to tab[8][256]
- Add code for inner loop.

Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
---
 lib/crc32.c  |   40 
 lib/crc32defs.h  |   29 +
 lib/gen_crc32table.c |   43 +++
 3 files changed, 76 insertions(+), 36 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 157b35f..6311712 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -47,25 +47,28 @@ MODULE_LICENSE(GPL);
 
 #if CRC_LE_BITS  8 || CRC_BE_BITS  8
 
+/* implements slicing-by-4 or slicing-by-8 algorithm */
 static inline u32
 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 
(*tab)[256])
 {
 # ifdef __LITTLE_ENDIAN
 #  define DO_CRC(x) (crc = t0[(crc ^ (x))  255] ^ (crc  8))
-#  define DO_CRC4 crc = t3[(crc)  255] ^ \
-   t2[(crc  8)  255] ^ \
-   t1[(crc  16)  255] ^ \
-   t0[(crc  24)  255]
+#  define DO_CRC4 (t3[(q)  255] ^ t2[(q  8)  255] ^ \
+  t1[(q  16)  255] ^ t0[(q  24)  255])
+#  define DO_CRC8 (t7[(q)  255] ^ t6[(q  8)  255] ^ \
+  t5[(q  16)  255] ^ t4[(q  24)  255])
 # else
 #  define DO_CRC(x) (crc = t0[((crc  24) ^ (x))  255] ^ (crc  8))
-#  define DO_CRC4 crc = t0[(crc)  255] ^ \
-   t1[(crc  8)  255] ^ \
-   t2[(crc  16)  255] ^ \
-   t3[(crc  24)  255]
+#  define DO_CRC4 (t0[(q)  255] ^ t1[(q  8)  255] ^ \
+  t2[(q  16)  255] ^ t3[(q  24)  255])
+#  define DO_CRC8 (t4[(q)  255] ^ t5[(q  8)  255] ^ \
+  t6[(q  16)  255] ^ t7[(q  24)  255])
 # endif
const u32 *b;
-   size_trem_len;
+   size_t rem_len;
const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3];
+   const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7];
+   u32 q;
 
/* Align it */
if (unlikely((long)buf  3  len)) {
@@ -73,13 +76,25 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
DO_CRC(*buf++);
} while ((--len)  ((long)buf)3);
}
+
+# if CRC_LE_BITS == 32
rem_len = len  3;
-   /* load data 32 bits wide, xor data 32 bits wide. */
len = len  2;
+# else
+   rem_len = len  7;
+   len = len  3;
+# endif
+
b = (const u32 *)buf;
for (--b; len; --len) {
-   crc ^= *++b; /* use pre increment for speed */
-   DO_CRC4;
+   q = crc ^ *++b; /* use pre increment for speed */
+# if CRC_LE_BITS == 32
+   crc = DO_CRC4;
+# else
+   crc = DO_CRC8;
+   q = *++b;
+   crc ^= DO_CRC4;
+# endif
}
len = rem_len;
/* And the last few bytes */
@@ -92,6 +107,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
return crc;
 #undef DO_CRC
 #undef DO_CRC4
+#undef DO_CRC8
 }
 #endif
 
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index daa3a5e..8181592 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -6,29 +6,42 @@
 #define CRCPOLY_LE 0xedb88320
 #define CRCPOLY_BE 0x04c11db7
 
-/* How many bits at a time to use.  Valid values are 1, 2, 4, 8, and 32. */
-/* For less performance-sensitive, use 4 or 8 */
+/*
+ * How many bits at a time to use.  Valid values are 1, 2, 4, 8, 32 and 64.
+ * For less performance-sensitive, use 4 or 8 to save table size.
+ * For larger systems choose same as CPU architecture as default.
+ * This works well on X86_64, SPARC64 systems. This may require some
+ * elaboration after experiments with other architectures.
+ */
 #ifndef CRC_LE_BITS
-# define CRC_LE_BITS 32
+#  ifdef CONFIG_64BIT
+#  define CRC_LE_BITS 64
+#  else
+#  define CRC_LE_BITS 32
+#  endif
 #endif
 #ifndef CRC_BE_BITS
-# define CRC_BE_BITS 32
+#  ifdef CONFIG_64BIT
+#  define CRC_BE_BITS 64
+#  else
+#  define CRC_BE_BITS 32
+#  endif
 #endif
 
 /*
  * Little-endian CRC computation.  Used with serial bit streams sent
  * lsbit-first.  Be sure to use cpu_to_le32() to append the computed CRC.
  */
-#if CRC_LE_BITS  32 || CRC_LE_BITS  1 || CRC_LE_BITS == 16 || \
+#if CRC_LE_BITS  64 || CRC_LE_BITS  1 || CRC_LE_BITS == 16 || \
CRC_LE_BITS  CRC_LE_BITS-1
-# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32}
+# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32, 64}
 #endif
 
 /*
  * Big-endian CRC computation.  Used with serial bit streams sent
  * msbit-first.  Be sure to use cpu_to_be32() to append the computed CRC.
  */
-#if CRC_BE_BITS  32 || CRC_BE_BITS  1 || CRC_BE_BITS == 16 || \
+#if CRC_BE_BITS  64 || CRC_BE_BITS  1 || CRC_BE_BITS == 16 || \
CRC_BE_BITS  CRC_BE_BITS-1
-# error CRC_BE_BITS must be one of {1, 2, 4, 8, 32}
+# error CRC_BE_BITS must be one of {1, 2, 4, 

[PATCH 09/14] Add two changes that improve the performance of x86 systems

2011-11-28 Thread Darrick J. Wong
1. replace main loop with incrementing counter
   this change improves the performance of the selftest
   by about 5-6% on Nehalem CPUs. The apparent
   reason is that the compiler can use the loop index
   to perform an indexed memory access. This is
   reported to make the performance of PowerPC CPUs
   worse.
2. replace the rem_len loop with incrementing counter
   this change improves the performance of the selftest,
   which has more than the usual number of occurrences,
   by about 1-2% on x86 CPUs. In actual work loads
   the length is most often a multiple of 4 bytes and
   this code does not get executed as often if at all.
   Again this change is reported to make the performance
   of PowerPC get worse.

Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
---
 lib/crc32.c |   13 +
 1 files changed, 13 insertions(+), 0 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 6311712..2c8e8c0 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -66,6 +66,9 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
 # endif
const u32 *b;
size_t rem_len;
+# ifdef CONFIG_X86
+   size_t i;
+# endif
const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3];
const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7];
u32 q;
@@ -86,7 +89,12 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
 # endif
 
b = (const u32 *)buf;
+# ifdef CONFIG_X86
+   --b;
+   for (i = 0; i  len; i++) {
+# else
for (--b; len; --len) {
+# endif
q = crc ^ *++b; /* use pre increment for speed */
 # if CRC_LE_BITS == 32
crc = DO_CRC4;
@@ -100,9 +108,14 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
/* And the last few bytes */
if (len) {
u8 *p = (u8 *)(b + 1) - 1;
+# ifdef CONFIG_X86
+   for (i = 0; i  len; i++)
+   DO_CRC(*++p); /* use pre increment for speed */
+# else
do {
DO_CRC(*++p); /* use pre increment for speed */
} while (--len);
+# endif
}
return crc;
 #undef DO_CRC

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5.1 00/14] crc32c: Add faster algorithm and self-test code

2011-11-28 Thread Darrick J. Wong
Hi all,

This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a
software crc32c implementation.  It removes the crc32c implementation in
crypto/ in favor of using the stamped-out one in lib/.  There is also a change
to Kconfig so that the kernel builder can pick an implementation best suited
for the hardware.

The motivation for this patchset is that I am working on adding full metadata
checksumming to ext4.  As far as performance impact of adding checksumming
goes, I see nearly no change with a standard mail server ffsb simulation.  On a
test that involves only file creation and deletion and extent tree writes, I
see a drop of about 50 percent with the current kernel crc32c implementation;
this improves to a drop of about 20 percent with the enclosed crc32c code.

In typical workloads this new implementation doesn't help much, because
metadata is usually a small fraction of total IO.
However, when we are doing IO that is almost all metadata (such as rm -rf'ing a
tree), then this patch speeds up the operation substantially.

Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this patchset
should improve their speed as well.  I have not yet quantified that, however.
This latest submission combines Bob's patches from late August 2011 with mine
so that they can be one coherent patch set.  Please excuse my inability to
combine some of the patches; I've been advised to leave Bob's patches alone and
build atop them instead. :/

Since the last posting, I've also collected some crc32c test results on a bunch
of different x86/powerpc/sparc platforms.  The results can be viewed here:
http://goo.gl/sgt3i ; the crc32-kern-le and crc32c columns describe the
performance of the kernel's current crc32 and crc32c software implementations.
The crc32c-by8-le column shows crc32c performance with this patchset applied.
I expect crc32 performance to be roughly the same.

The two _boost columns at the right side of the spreadsheet show how much
faster the new implementation is over the old one.  As you can see, crc32 rises
substantially, and crc32c experiences a huge increase.  I'm hoping this patch
set meets with everyone's approval and can go in soon.  Herbert Xu didn't
appear to have any strong objections to last month's posting, so I'm wondering
if Andrew has an opinion?

--D

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 01/14] removed two instances of trailing whitespaces

2011-11-28 Thread Darrick J. Wong
- remove trailing whitespace from lib/crc32.c
- remove trailing whitespace from lib/crc32defs.h

Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
---
 lib/crc32.c |2 +-
 lib/crc32defs.h |2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index a6e633a..23b08ba 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -316,7 +316,7 @@ EXPORT_SYMBOL(crc32_be);
  * in the correct multiple to subtract, we can shift a byte at a time.
  * This produces a 40-bit (rather than a 33-bit) intermediate remainder,
  * but again the multiple of the polynomial to subtract depends only on
- * the high bits, the high 8 bits in this case.  
+ * the high bits, the high 8 bits in this case.
  *
  * The multiple we need in that case is the low 32 bits of a 40-bit
  * value whose high 8 bits are given, and which is a multiple of the
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index 9b6773d..f5a5401 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -8,7 +8,7 @@
 
 /* How many bits at a time to use.  Requires a table of 4CRC_xx_BITS bytes. 
*/
 /* For less performance-sensitive, use 4 */
-#ifndef CRC_LE_BITS 
+#ifndef CRC_LE_BITS
 # define CRC_LE_BITS 8
 #endif
 #ifndef CRC_BE_BITS

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 07/14] crc32.c provides a choice of one of several algorithms for

2011-11-28 Thread Darrick J. Wong
computing the LSB and MSB versions of the CRC32 checksum
based on the parameters CRC_LE_BITS and CRC_BE_BITS. In the
original version the values 1, 2, 4 and 8 respectively selected
versions of the algorithm that computed the crc 1, 2, 4 and 32
bits at a time. This patch series adds a new version that computes
the CRC 64 bits at a time. To make things easier to understand
the parameter has been reinterpreted to actually stand for the
number of bits processed in each step of the algorithm so that
the old value 8 has been replaced with the value 32. This also
allows us to add in a widely used crc algorithm that
computes the crc 8 bits at a time called the Sarwate algorithm.

Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
---
 lib/crc32.c  |   17 ++---
 lib/crc32defs.h  |   18 ++
 lib/gen_crc32table.c |   11 ++-
 3 files changed, 34 insertions(+), 12 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index ff6bb9a..157b35f 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -27,13 +27,13 @@
 #include linux/types.h
 #include crc32defs.h
 
-#if CRC_LE_BITS == 8
+#if CRC_LE_BITS  8
 # define tole(x) (__force u32) __constant_cpu_to_le32(x)
 #else
 # define tole(x) (x)
 #endif
 
-#if CRC_BE_BITS == 8
+#if CRC_BE_BITS  8
 # define tobe(x) (__force u32) __constant_cpu_to_be32(x)
 #else
 # define tobe(x) (x)
@@ -45,7 +45,7 @@ MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com);
 MODULE_DESCRIPTION(Ethernet CRC32 calculations);
 MODULE_LICENSE(GPL);
 
-#if CRC_LE_BITS == 8 || CRC_BE_BITS == 8
+#if CRC_LE_BITS  8 || CRC_BE_BITS  8
 
 static inline u32
 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 
(*tab)[256])
@@ -126,6 +126,12 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, 
size_t len)
crc = (crc  4) ^ crc32table_le[0][crc  15];
}
 # elif CRC_LE_BITS == 8
+   /* aka Sarwate algorithm */
+   while (len--) {
+   crc ^= *p++;
+   crc = (crc  8) ^ crc32table_le[0][crc  255];
+   }
+# else
const u32  (*tab)[] = crc32table_le;
 
crc = (__force u32) __cpu_to_le32(crc);
@@ -169,6 +175,11 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, 
size_t len)
crc = (crc  4) ^ crc32table_be[0][crc  28];
}
 # elif CRC_BE_BITS == 8
+   while (len--) {
+   crc ^= *p++  24;
+   crc = (crc  8) ^ crc32table_be[0][crc  24];
+   }
+# else
const u32  (*tab)[] = crc32table_be;
 
crc = (__force u32) __cpu_to_be32(crc);
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index f5a5401..daa3a5e 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -6,27 +6,29 @@
 #define CRCPOLY_LE 0xedb88320
 #define CRCPOLY_BE 0x04c11db7
 
-/* How many bits at a time to use.  Requires a table of 4CRC_xx_BITS bytes. 
*/
-/* For less performance-sensitive, use 4 */
+/* How many bits at a time to use.  Valid values are 1, 2, 4, 8, and 32. */
+/* For less performance-sensitive, use 4 or 8 */
 #ifndef CRC_LE_BITS
-# define CRC_LE_BITS 8
+# define CRC_LE_BITS 32
 #endif
 #ifndef CRC_BE_BITS
-# define CRC_BE_BITS 8
+# define CRC_BE_BITS 32
 #endif
 
 /*
  * Little-endian CRC computation.  Used with serial bit streams sent
  * lsbit-first.  Be sure to use cpu_to_le32() to append the computed CRC.
  */
-#if CRC_LE_BITS  8 || CRC_LE_BITS  1 || CRC_LE_BITS  CRC_LE_BITS-1
-# error CRC_LE_BITS must be a power of 2 between 1 and 8
+#if CRC_LE_BITS  32 || CRC_LE_BITS  1 || CRC_LE_BITS == 16 || \
+   CRC_LE_BITS  CRC_LE_BITS-1
+# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32}
 #endif
 
 /*
  * Big-endian CRC computation.  Used with serial bit streams sent
  * msbit-first.  Be sure to use cpu_to_be32() to append the computed CRC.
  */
-#if CRC_BE_BITS  8 || CRC_BE_BITS  1 || CRC_BE_BITS  CRC_BE_BITS-1
-# error CRC_BE_BITS must be a power of 2 between 1 and 8
+#if CRC_BE_BITS  32 || CRC_BE_BITS  1 || CRC_BE_BITS == 16 || \
+   CRC_BE_BITS  CRC_BE_BITS-1
+# error CRC_BE_BITS must be one of {1, 2, 4, 8, 32}
 #endif
diff --git a/lib/gen_crc32table.c b/lib/gen_crc32table.c
index eced769..99ac744 100644
--- a/lib/gen_crc32table.c
+++ b/lib/gen_crc32table.c
@@ -4,8 +4,17 @@
 
 #define ENTRIES_PER_LINE 4
 
+#if CRC_LE_BITS = 8
 #define LE_TABLE_SIZE (1  CRC_LE_BITS)
+#else
+#define LE_TABLE_SIZE 256
+#endif
+
+#if CRC_BE_BITS = 8
 #define BE_TABLE_SIZE (1  CRC_BE_BITS)
+#else
+#define BE_TABLE_SIZE 256
+#endif
 
 static uint32_t crc32table_le[4][256];
 static uint32_t crc32table_be[4][256];
@@ -24,7 +33,7 @@ static void crc32init_le(void)
 
crc32table_le[0][0] = 0;
 
-   for (i = 1  (CRC_LE_BITS - 1); i; i = 1) {
+   for (i = LE_TABLE_SIZE  1; i; i = 1) {
crc = (crc  1) ^ ((crc  1) ? CRCPOLY_LE : 0);
for (j = 0; j  LE_TABLE_SIZE; j += 2 * i)
crc32table_le[0][i + j] = crc ^ crc32table_le[0][j];

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in

[PATCH 04/14] Replace 2D array references by pointer references in loops.

2011-11-28 Thread Darrick J. Wong
This change has no effect on X86 code but improves PPC
performance.

Signed-off-by: Bob Pearson rpear...@systemfabricworks.com
---
 lib/crc32.c |   21 +++--
 1 files changed, 11 insertions(+), 10 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 7a0e5a9..c93c9ae 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -53,20 +53,21 @@ static inline u32
 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 
(*tab)[256])
 {
 # ifdef __LITTLE_ENDIAN
-#  define DO_CRC(x) crc = tab[0][(crc ^ (x))  255] ^ (crc  8)
-#  define DO_CRC4 crc = tab[3][(crc)  255] ^ \
-   tab[2][(crc  8)  255] ^ \
-   tab[1][(crc  16)  255] ^ \
-   tab[0][(crc  24)  255]
+#  define DO_CRC(x) (crc = t0[(crc ^ (x))  255] ^ (crc  8))
+#  define DO_CRC4 crc = t3[(crc)  255] ^ \
+   t2[(crc  8)  255] ^ \
+   t1[(crc  16)  255] ^ \
+   t0[(crc  24)  255]
 # else
-#  define DO_CRC(x) crc = tab[0][((crc  24) ^ (x))  255] ^ (crc  8)
-#  define DO_CRC4 crc = tab[0][(crc)  255] ^ \
-   tab[1][(crc  8)  255] ^ \
-   tab[2][(crc  16)  255] ^ \
-   tab[3][(crc  24)  255]
+#  define DO_CRC(x) (crc = t0[((crc  24) ^ (x))  255] ^ (crc  8))
+#  define DO_CRC4 crc = t0[(crc)  255] ^ \
+   t1[(crc  8)  255] ^ \
+   t2[(crc  16)  255] ^ \
+   t3[(crc  24)  255]
 # endif
const u32 *b;
size_trem_len;
+   const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3];
 
/* Align it */
if (unlikely((long)buf  3  len)) {

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/4] crc32: Bolt on crc32c

2011-11-15 Thread Darrick J. Wong
On Fri, Oct 21, 2011 at 09:15:14PM +0200, Herbert Xu wrote:
 On Fri, Oct 21, 2011 at 09:57:03AM -0700, Darrick J. Wong wrote:
  
  My patchset builds upon Bob Pearson's crc32 patchset from early September.  
  Do
  my patches fail to apply after applying his patchset?
  
  Or, to speed things along, should I simply repost both Bob's and my patches 
  as
  one big series?
  
  Bob, have you sent out a new iteration of your patches since September 6th?
 
 I'm fine with you pushing this through whichever tree that Bob's
 patches are going through.

Well... it's been 2.5 weeks since I last asked about this.  No reply, afaict.
I haven't seen any complaints about Bob's latest patchset, nor any complaints
about my set that sits atop his.  On the other hand, I'm pretty sure I haven't
seen Bob's patches appear in any trees, and Google shows no recent progress.

Herbert, would you object to pushing the whole patchset through the crypto
tree?

--D
 
 Cheers,
 -- 
 Email: Herbert Xu herb...@gondor.apana.org.au
 Home Page: http://gondor.apana.org.au/~herbert/
 PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
 --
 To unsubscribe from this list: send the line unsubscribe linux-ext4 in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/4] crc32: Bolt on crc32c

2011-10-26 Thread Darrick J. Wong
On Fri, Oct 21, 2011 at 09:15:14PM +0200, Herbert Xu wrote:
 On Fri, Oct 21, 2011 at 09:57:03AM -0700, Darrick J. Wong wrote:
  
  My patchset builds upon Bob Pearson's crc32 patchset from early September.  
  Do
  my patches fail to apply after applying his patchset?
  
  Or, to speed things along, should I simply repost both Bob's and my patches 
  as
  one big series?
  
  Bob, have you sent out a new iteration of your patches since September 6th?
 
 I'm fine with you pushing this through whichever tree that Bob's
 patches are going through.

Bob,

Which tree (if any) are your patches going through?  

--D

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/4] crc32: Bolt on crc32c

2011-10-21 Thread Darrick J. Wong
On Fri, Oct 21, 2011 at 02:28:03PM +0200, Herbert Xu wrote:
 On Tue, Oct 04, 2011 at 04:54:03PM -0700, Darrick J. Wong wrote:
  Reuse the existing crc32 code to stamp out a crc32c implementation.
  
  Signed-off-by: Darrick J. Wong djw...@us.ibm.com
 
 Did you want this to go through my tree? If so then there is a
 problem since it doesn't apply at all.

My patchset builds upon Bob Pearson's crc32 patchset from early September.  Do
my patches fail to apply after applying his patchset?

Or, to speed things along, should I simply repost both Bob's and my patches as
one big series?

Bob, have you sent out a new iteration of your patches since September 6th?

--D
 
 Cheers,
 -- 
 Email: Herbert Xu herb...@gondor.apana.org.au
 Home Page: http://gondor.apana.org.au/~herbert/
 PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
 --
 To unsubscribe from this list: send the line unsubscribe linux-ext4 in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5.1 4/4] crc32: Select an algorithm via kconfig

2011-10-08 Thread Darrick J. Wong
Oops, the description of CRC32_SLICEBY4 is a bit screwy.  Let's try that again.
---
Allow the kernel builder to choose a crc32* algorithm for the kernel.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---

 lib/Kconfig |   36 
 lib/crc32defs.h |   18 ++
 2 files changed, 54 insertions(+), 0 deletions(-)

diff --git a/lib/Kconfig b/lib/Kconfig
index 477be04..27881d9 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -70,6 +70,42 @@ config CRC32_SELFTEST
  and crc32_be over byte strings with random alignment and length
  and computes the total elapsed time and number of bytes processed.
 
+choice
+   prompt CRC32 implementation
+   depends on CRC32
+   default CRC32_SLICEBY8
+
+config CRC32_SLICEBY8
+   bool Slice by 8 bytes
+   help
+ Calculate checksum 8 bytes at a time with a clever slicing algorithm.
+ This is the fastest algorithm, but comes with a 8KiB lookup table.
+ Most modern processors have enough cache that this shouldn't be
+ a problem.
+
+ If you don't know which to choose, choose this one.
+
+config CRC32_SLICEBY4
+   bool Slice by 4 bytes
+   help
+ Calculate checksum 4 bytes at a time with a clever slicing algorithm.
+ This is a bit slower than slice by 8, but has a smaller 4KiB lookup
+ table.
+
+config CRC32_SARWATE
+   bool Sarwate's Algorithm (one byte at a time)
+   help
+ Calculate checksum a byte at a time using Sarwate's algorithm.  This
+ is not particularly fast, but has a small 256 byte lookup table.
+
+config CRC32_BIT
+   bool Classic Algorithm (one bit at a time)
+   help
+ Calculate checksum one bit at a time.  This is VERY slow, but has
+ no lookup table.  This is provided as a debugging option.
+
+endchoice
+
 config CRC7
tristate CRC7 functions
help
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index 6fd1917..64cba2c 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -13,6 +13,24 @@
  */
 #define CRC32C_POLY_LE 0x82F63B78
 
+/* Try to choose an implementation variant via Kconfig */
+#ifdef CONFIG_CRC32_SLICEBY8
+# define CRC_LE_BITS 64
+# define CRC_BE_BITS 64
+#endif
+#ifdef CONFIG_CRC32_SLICEBY4
+# define CRC_LE_BITS 32
+# define CRC_BE_BITS 32
+#endif
+#ifdef CONFIG_CRC32_SARWATE
+# define CRC_LE_BITS 8
+# define CRC_BE_BITS 8
+#endif
+#ifdef CONFIG_CRC32_BIT
+# define CRC_LE_BITS 1
+# define CRC_BE_BITS 1
+#endif
+
 /*
  * How many bits at a time to use.  Valid values are 1, 2, 4, 8, 32 and 64.
  * For less performance-sensitive, use 4 or 8 to save table size.
--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 0/4] crc32c: Add faster algorithm and self-test code

2011-10-06 Thread Darrick J. Wong
On Tue, Oct 04, 2011 at 04:53:57PM -0700, Darrick J. Wong wrote:
 Hi all,
 
 This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a
 software crc32c implementation.  It requires that all ten of his patches (at
 least the ones dated 31 Aug 2011) be applied.  It removes the crc32c
 implementation in crypto/ in favor of using the stamped-out one in lib/.  
 There
 is also a change to Kconfig so that the kernel builder can pick an
 implementation best suited for the hardware.
 
 The motivation for this patchset is that I am working on adding full metadata
 checksumming to ext4.  As far as performance impact of adding checksumming
 goes, I see nearly no change with a standard mail server ffsb simulation.  On 
 a
 test that involves only file creation and deletion and extent tree writes, I
 see a drop of about 50 percent with the current kernel crc32c implementation;
 this improves to a drop of about 20 percent with the enclosed crc32c code.
 
 In typical workloads this new implementation doesn't help much, because
 metadata is usually a small fraction of total IO.
 However, when we are doing IO that is almost all metadata (such as rm -rf'ing 
 a
 tree), then this patch speeds up the operation substantially.
 
 Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this patchset
 should improve their speed as well.  I have not yet quantified that, however.

As for Mr. Tjernlund's unresolved questions regarding the v4 patch, I have
tested this new code on x64/x32/ppc32/ppc64 and it seems to work fine, both
with the crc32c selftest and also on a practical level with ext4 metadata
checksumming enabled.  Updating to Bob's newest calculation code brings about a
10-15% speedup on the ppc64 box.  I also see that slice-by-8 is about 20%
faster than slice-by-4 on my ppc32 box.

I did _not_ see any failures on ppc32 when running an extended ext4+checksum
test suite.

Details of the ppc32 box:
root@dyn9047029101:~# cat /proc/cpuinfo 
processor   : 0
cpu : 740/750
temperature : 45 C (uncalibrated)
clock   : 500.00MHz
revision: 131.0 (pvr 0008 8300)
bogomips: 49.86

total bogomips  : 49.86
timebase: 24934966
platform: PowerMac
model   : PowerMac1,1
machine : PowerMac1,1
motherboard : PowerMac1,1 MacRISC Power Macintosh
detected as : 66 (BlueWhite G3)
pmac flags  : 
L2 cache: 1024K unified
pmac-generation : NewWorld
Memory  : 896 MB
root@dyn9047029101:~# gcc --version
gcc-4.4.real (Ubuntu 4.4.3-4ubuntu5) 4.4.3
Copyright (C) 2009 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
root@dyn9047029101:~# for i in /sys/devices/system/cpu/cpu0/cache/*/*; do echo 
$i $(cat $i); done
/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size 32
/sys/devices/system/cpu/cpu0/cache/index0/level 1
/sys/devices/system/cpu/cpu0/cache/index0/number_of_sets 128
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_map 
,,,0001
/sys/devices/system/cpu/cpu0/cache/index0/size 32K
/sys/devices/system/cpu/cpu0/cache/index0/type Data
/sys/devices/system/cpu/cpu0/cache/index0/ways_of_associativity 8
/sys/devices/system/cpu/cpu0/cache/index1/coherency_line_size 32
/sys/devices/system/cpu/cpu0/cache/index1/level 1
/sys/devices/system/cpu/cpu0/cache/index1/number_of_sets 128
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_map 
,,,0001
/sys/devices/system/cpu/cpu0/cache/index1/size 32K
/sys/devices/system/cpu/cpu0/cache/index1/type Instruction
/sys/devices/system/cpu/cpu0/cache/index1/ways_of_associativity 8
/sys/devices/system/cpu/cpu0/cache/index2/coherency_line_size 128
/sys/devices/system/cpu/cpu0/cache/index2/level 2
/sys/devices/system/cpu/cpu0/cache/index2/number_of_sets 4096
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_map 
,,,0001
/sys/devices/system/cpu/cpu0/cache/index2/size 1024K
/sys/devices/system/cpu/cpu0/cache/index2/type Unified
/sys/devices/system/cpu/cpu0/cache/index2/ways_of_associativity 2

The ppc64 box:
root@elm3c7:~# cat /proc/cpuinfo 
processor   : 0
cpu : POWER5+ (gs)
clock   : 1900.098000MHz
revision: 2.0 (pvr 003b 0200)

(the rest is omitted for brevity)

root@elm3c7:~# gcc --version
gcc-4.4.real (Ubuntu 4.4.3-4ubuntu5) 4.4.3
Copyright (C) 2009 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

root@elm3c7:~# for i in /sys/devices/system/cpu/cpu0/cache/*/*; do echo $i 
$(cat $i); done
/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size 128
/sys/devices/system/cpu/cpu0/cache/index0/level 1
/sys/devices/system/cpu/cpu0/cache/index0/number_of_sets 64
/sys

[PATCH v5 0/4] crc32c: Add faster algorithm and self-test code

2011-10-04 Thread Darrick J. Wong
Hi all,

This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a
software crc32c implementation.  It requires that all ten of his patches (at
least the ones dated 31 Aug 2011) be applied.  It removes the crc32c
implementation in crypto/ in favor of using the stamped-out one in lib/.  There
is also a change to Kconfig so that the kernel builder can pick an
implementation best suited for the hardware.

The motivation for this patchset is that I am working on adding full metadata
checksumming to ext4.  As far as performance impact of adding checksumming
goes, I see nearly no change with a standard mail server ffsb simulation.  On a
test that involves only file creation and deletion and extent tree writes, I
see a drop of about 50 percent with the current kernel crc32c implementation;
this improves to a drop of about 20 percent with the enclosed crc32c code.

In typical workloads this new implementation doesn't help much, because
metadata is usually a small fraction of total IO.
However, when we are doing IO that is almost all metadata (such as rm -rf'ing a
tree), then this patch speeds up the operation substantially.

Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this patchset
should improve their speed as well.  I have not yet quantified that, however.

--D
--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/4] crc32: Bolt on crc32c

2011-10-04 Thread Darrick J. Wong
Reuse the existing crc32 code to stamp out a crc32c implementation.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 include/linux/crc32.h |2 ++
 lib/Kconfig   |8 +++---
 lib/crc32.c   |   62 +++--
 lib/crc32defs.h   |7 ++
 lib/gen_crc32table.c  |   35 ++--
 5 files changed, 80 insertions(+), 34 deletions(-)


diff --git a/include/linux/crc32.h b/include/linux/crc32.h
index 391a259..68267b6 100644
--- a/include/linux/crc32.h
+++ b/include/linux/crc32.h
@@ -11,6 +11,8 @@
 extern u32  crc32_le(u32 crc, unsigned char const *p, size_t len);
 extern u32  crc32_be(u32 crc, unsigned char const *p, size_t len);
 
+extern u32  __crc32c_le(u32 crc, unsigned char const *p, size_t len);
+
 #define crc32(seed, data, length)  crc32_le(seed, (unsigned char const 
*)(data), length)
 
 /*
diff --git a/lib/Kconfig b/lib/Kconfig
index 8e0bcbd..477be04 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -51,14 +51,14 @@ config CRC_ITU_T
  functions require M here.
 
 config CRC32
-   tristate CRC32 functions
+   tristate CRC32/CRC32c functions
default y
select BITREVERSE
help
  This option is provided for the case where no in-kernel-tree
- modules require CRC32 functions, but a module built outside the
- kernel tree does. Such modules that use library CRC32 functions
- require M here.
+ modules require CRC32/CRC32c functions, but a module built outside
+ the kernel tree does. Such modules that use library CRC32/CRC32c
+ functions require M here.
 
 config CRC32_SELFTEST
bool CRC32 perform self test on init
diff --git a/lib/crc32.c b/lib/crc32.c
index d56516d..8df9561 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -46,7 +46,7 @@
 #include crc32table.h
 
 MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com);
-MODULE_DESCRIPTION(Ethernet CRC32 calculations);
+MODULE_DESCRIPTION(Various CRC32 calculations);
 MODULE_LICENSE(GPL);
 
 #if CRC_LE_BITS  8 || CRC_BE_BITS  8
@@ -135,46 +135,57 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, 
const u32 (*tab)[256])
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
+static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
+ size_t len, const u32 (*tab)[256],
+ u32 polynomial)
 {
 #if CRC_LE_BITS == 1
int i;
while (len--) {
crc ^= *p++;
for (i = 0; i  8; i++)
-   crc = (crc  1) ^ ((crc  1) ? CRCPOLY_LE : 0);
+   crc = (crc  1) ^ ((crc  1) ? polynomial : 0);
}
 # elif CRC_LE_BITS == 2
while (len--) {
crc ^= *p++;
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
-   crc = (crc  2) ^ crc32table_le[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
+   crc = (crc  2) ^ tab[0][crc  3];
}
 # elif CRC_LE_BITS == 4
while (len--) {
crc ^= *p++;
-   crc = (crc  4) ^ crc32table_le[0][crc  15];
-   crc = (crc  4) ^ crc32table_le[0][crc  15];
+   crc = (crc  4) ^ tab[0][crc  15];
+   crc = (crc  4) ^ tab[0][crc  15];
}
 # elif CRC_LE_BITS == 8
/* aka Sarwate algorithm */
while (len--) {
crc ^= *p++;
-   crc = (crc  8) ^ crc32table_le[0][crc  255];
+   crc = (crc  8) ^ tab[0][crc  255];
}
 # else
-   const u32  (*tab)[] = crc32table_le;
-
crc = (__force u32) __cpu_to_le32(crc);
crc = crc32_body(crc, p, len, tab);
crc = __le32_to_cpu((__force __le32)crc);
 #endif
return crc;
 }
+
+u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
+{
+   return crc32_le_generic(crc, p, len, crc32table_le, CRCPOLY_LE);
+}
 EXPORT_SYMBOL(crc32_le);
 
+u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
+{
+   return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE);
+}
+EXPORT_SYMBOL(__crc32c_le);
+
 /**
  * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
  * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for
@@ -182,7 +193,9 @@ EXPORT_SYMBOL(crc32_le);
  * @p: pointer to buffer over which CRC is run
  * @len: length of buffer @p
  */
-u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
+static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p,
+ size_t len, const u32 (*tab)[256

[PATCH 2/4] crypto: crc32c should use library implementation

2011-10-04 Thread Darrick J. Wong
Since lib/crc32.c now provides crc32c, remove the software implementation here
and call the library function instead.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 crypto/Kconfig  |1 +
 crypto/crc32c.c |   94 ++-
 2 files changed, 4 insertions(+), 91 deletions(-)


diff --git a/crypto/Kconfig b/crypto/Kconfig
index ae27b75..28fdbf6 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -302,6 +302,7 @@ comment Digest
 config CRYPTO_CRC32C
tristate CRC32c CRC algorithm
select CRYPTO_HASH
+   select CRC32
help
  Castagnoli, et al Cyclic Redundancy-Check Algorithm.  Used
  by iSCSI for header and data digests and by others.
diff --git a/crypto/crc32c.c b/crypto/crc32c.c
index 3f9ad28..06f7018 100644
--- a/crypto/crc32c.c
+++ b/crypto/crc32c.c
@@ -40,6 +40,7 @@
 #include linux/module.h
 #include linux/string.h
 #include linux/kernel.h
+#include linux/crc32.h
 
 #define CHKSUM_BLOCK_SIZE  1
 #define CHKSUM_DIGEST_SIZE 4
@@ -53,95 +54,6 @@ struct chksum_desc_ctx {
 };
 
 /*
- * This is the CRC-32C table
- * Generated with:
- * width = 32 bits
- * poly = 0x1EDC6F41
- * reflect input bytes = true
- * reflect output bytes = true
- */
-
-static const u32 crc32c_table[256] = {
-   0xL, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
-   0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
-   0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
-   0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
-   0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
-   0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
-   0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
-   0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
-   0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
-   0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
-   0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
-   0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
-   0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
-   0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
-   0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
-   0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
-   0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
-   0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
-   0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
-   0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
-   0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
-   0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
-   0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
-   0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
-   0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
-   0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
-   0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
-   0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
-   0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
-   0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
-   0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
-   0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
-   0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
-   0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
-   0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
-   0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
-   0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
-   0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
-   0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
-   0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
-   0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
-   0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
-   0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
-   0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
-   0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
-   0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
-   0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
-   0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
-   0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
-   0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
-   0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
-   0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
-   0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
-   0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
-   0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
-   0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
-   0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
-   0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
-   0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
-   0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
-   0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
-   0x34F4F86AL, 0xC69F7B69L

[PATCH 3/4] crc32: Add self-test code for crc32c

2011-10-04 Thread Darrick J. Wong
Add self-test code for crc32c.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/crc32.c |  363 ++-
 1 files changed, 261 insertions(+), 102 deletions(-)


diff --git a/lib/crc32.c b/lib/crc32.c
index 8df9561..382fa76 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -765,113 +765,265 @@ static struct crc_test {
u32 length; /* random 11 bit length of test */
u32 crc_le; /* expected crc32_le result */
u32 crc_be; /* expected crc32_be result */
+   u32 crc32c_le;  /* expected crc32c_le result */
 } test[] =
 {
-   {0x674bf11d, 0x0038, 0x0542, 0x0af6d466, 0xd8b6e4c1},
-   {0x35c672c6, 0x003a, 0x01aa, 0xc6d3dfba, 0x28aaf3ad},
-   {0x496da28e, 0x0039, 0x05af, 0xd933660f, 0x5d57e81f},
-   {0x09a9b90e, 0x0027, 0x01f8, 0xb45fe007, 0xf45fca9a},
-   {0xdc97e5a9, 0x0025, 0x03b6, 0xf81a3562, 0xe0126ba2},
-   {0x47c58900, 0x000a, 0x00b9, 0x8e58eccf, 0xf3afc793},
-   {0x292561e8, 0x000c, 0x0403, 0xa2ba8aaf, 0x0b797aed},
-   {0x415037f6, 0x0003, 0x0676, 0xa17d52e8, 0x7f0fdf35},
-   {0x3466e707, 0x0026, 0x0042, 0x258319be, 0x75c484a2},
-   {0xafd1281b, 0x0023, 0x02ee, 0x4428eaf8, 0x06c7ad10},
-   {0xd3857b18, 0x0028, 0x04a2, 0x5c430821, 0xb062b7cb},
-   {0x1d825a8f, 0x002b, 0x050b, 0xd2c45f0c, 0xd68634e0},
-   {0x5033e3bc, 0x000b, 0x0078, 0xa3ea4113, 0xac6d31fb},
-   {0x94f1fb5e, 0x000f, 0x03a2, 0xfbfc50b1, 0x3cfe50ed},
-   {0xc9a0fe14, 0x0009, 0x0473, 0x5fb61894, 0x87070591},
-   {0x88a034b1, 0x001c, 0x05ad, 0xc1b16053, 0x46f95c67},
-   {0xf0f72239, 0x0020, 0x026d, 0xa6fa58f3, 0xf8c2c1dd},
-   {0xcc20a5e3, 0x003b, 0x067a, 0x7740185a, 0x308b979a},
-   {0xce589c95, 0x002b, 0x0641, 0xd055e987, 0x40aae25b},
-   {0x78edc885, 0x0035, 0x05be, 0xa39cb14b, 0x035b0d1f},
-   {0x9d40a377, 0x003b, 0x0038, 0x1f47ccd2, 0x197fbc9d},
-   {0x703d0e01, 0x003c, 0x06f1, 0x88735e7c, 0xfed57c5a},
-   {0x776bf505, 0x000f, 0x05b2, 0x5cc4fc01, 0xf32efb97},
-   {0x4a3e7854, 0x0027, 0x04b8, 0x8d923c82, 0x0cbfb4a2},
-   {0x209172dd, 0x003b, 0x0356, 0xb89e9c2b, 0xd7868138},
-   {0x3ba4cc5b, 0x002f, 0x0203, 0xe51601a9, 0x5b2a1032},
-   {0xfc62f297, 0x, 0x0079, 0x71a8e1a2, 0x5d88685f},
-   {0x64280b8b, 0x0016, 0x07ab, 0x0fa7a30c, 0xda3a455f},
-   {0x97dd724b, 0x0033, 0x07ad, 0x5788b2f4, 0xd7326d32},
-   {0x61394b52, 0x0035, 0x0571, 0xc66525f1, 0xcabe7fef},
-   {0x29b4faff, 0x0024, 0x006e, 0xca13751e, 0x993648e0},
-   {0x29bfb1dc, 0x000b, 0x0244, 0x436c43f7, 0x429f7a59},
-   {0x86ae934b, 0x0035, 0x0104, 0x0760ec93, 0x9cf7d0f4},
-   {0xc4c1024e, 0x002e, 0x06b1, 0x6516a3ec, 0x19321f9c},
-   {0x3287a80a, 0x0026, 0x0496, 0x0b257eb1, 0x754ebd51},
-   {0xa4db423e, 0x0023, 0x045d, 0x9b3a66dc, 0x873e9f11},
-   {0x7a1078df, 0x0015, 0x014a, 0x8c2484c5, 0x6a628659},
-   {0x6048bd5b, 0x0006, 0x006a, 0x897e3559, 0xac9961af},
-   {0xd8f9ea20, 0x003d, 0x0277, 0x60eb905b, 0xed2aaf99},
-   {0xea5ec3b4, 0x002a, 0x04fe, 0x869965dc, 0x6c1f833b},
-   {0x2dfb005d, 0x0016, 0x0345, 0x6a3b117e, 0xf05e8521},
-   {0x5a214ade, 0x0020, 0x05b6, 0x467f70be, 0xcb22ccd3},
-   {0xf0ab9cca, 0x0032, 0x0515, 0xed223df3, 0x7f3ef01d},
-   {0x91b444f9, 0x002e, 0x07f8, 0x84e9a983, 0x5676756f},
-   {0x1b5d2ddb, 0x002e, 0x012c, 0xba638c4c, 0x3f42047b},
-   {0xd824d1bb, 0x003a, 0x07b5, 0x6288653b, 0x3a3ebea0},
-   {0x0470180c, 0x0034, 0x01f0, 0x9d5b80d6, 0x3de08195},
-   {0xffaa3a3f, 0x0036, 0x0299, 0xf3a82ab8, 0x53e0c13d},
-   {0x6406cfeb, 0x0023, 0x0600, 0xa920b8e8, 0xe4e2acf4},
-   {0xb24aaa38, 0x003e, 0x04a1, 0x657cc328, 0x5077b2c3},
-   {0x58b2ab7c, 0x0039, 0x02b4, 0x3a17ee7e, 0x9dcb3643},
-   {0x3db85970, 0x0006, 0x02b6, 0x95268b59, 0xb9812c10},
-   {0x857830c5, 0x0003, 0x0590, 0x4ef439d5, 0xf042161d},
-   {0xe1fcd978, 0x003e, 0x07d8, 0xae8d8699, 0xce0a1ef5},
-   {0xb982a768, 0x0016, 0x06e0, 0x62fad3df, 0x5f8a067b},
-   {0x1d581ce8, 0x001e, 0x058b, 0xf0f5da53, 0x26e39eee},
-   {0x2456719b, 0x0025, 0x0503, 0x4296ac64, 0xd50e4c14},
-   {0xfae6d8f2, 0x, 0x055d, 0x057fdf2e, 0x2a31391a},
-   {0xcba828e3, 0x0039, 0x02ce, 0xe3f22351, 0x8f00877b},
-   {0x13d25952, 0x000a, 0x072d, 0x76d4b4cc, 0x5eb67ec3},
-   {0x0342be3f, 0x0015, 0x0599, 0xec75d9f1, 0x9d4d2826},
-   {0xeaa344e0, 0x0014, 0x04d8, 0x72a4c981, 0x2064ea06},
-   {0xbbb52021, 0x003b, 0x0272

[PATCH 4/4] crc32: Select an algorithm via kconfig

2011-10-04 Thread Darrick J. Wong
Allow the kernel builder to choose a crc32* algorithm for the kernel.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/Kconfig |   35 +++
 lib/crc32defs.h |   18 ++
 2 files changed, 53 insertions(+), 0 deletions(-)


diff --git a/lib/Kconfig b/lib/Kconfig
index 477be04..9f08b64 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -70,6 +70,41 @@ config CRC32_SELFTEST
  and crc32_be over byte strings with random alignment and length
  and computes the total elapsed time and number of bytes processed.
 
+choice
+   prompt CRC32 implementation
+   depends on CRC32
+   default CRC32_SLICEBY8
+
+config CRC32_SLICEBY8
+   bool Slice by 8 bytes
+   help
+ Calculate checksum 8 bytes at a time with a clever slicing algorithm.
+ This is the fastest algorithm, but comes with a 8KiB lookup table.
+ Most modern processors have enough cache that this shouldn't be
+ a problem.
+
+ If you don't know which to choose, choose this one.
+
+config CRC32_SLICEBY4
+   bool Slice by 4 bytes
+   help
+ Calculate checksum 4 bytes at a time with a clever slicing algorithm.
+ This is reasonably fast, but has a 4KiB lookup table.
+
+config CRC32_SARWATE
+   bool Sarwate's Algorithm (one byte at a time)
+   help
+ Calculate checksum a byte at a time using Sarwate's algorithm.  This
+ is not particularly fast, but has a small 256 byte lookup table.
+
+config CRC32_BIT
+   bool Classic Algorithm (one bit at a time)
+   help
+ Calculate checksum one bit at a time.  This is VERY slow, but has
+ no lookup table.  This is provided as a debugging option.
+
+endchoice
+
 config CRC7
tristate CRC7 functions
help
diff --git a/lib/crc32defs.h b/lib/crc32defs.h
index 6fd1917..64cba2c 100644
--- a/lib/crc32defs.h
+++ b/lib/crc32defs.h
@@ -13,6 +13,24 @@
  */
 #define CRC32C_POLY_LE 0x82F63B78
 
+/* Try to choose an implementation variant via Kconfig */
+#ifdef CONFIG_CRC32_SLICEBY8
+# define CRC_LE_BITS 64
+# define CRC_BE_BITS 64
+#endif
+#ifdef CONFIG_CRC32_SLICEBY4
+# define CRC_LE_BITS 32
+# define CRC_BE_BITS 32
+#endif
+#ifdef CONFIG_CRC32_SARWATE
+# define CRC_LE_BITS 8
+# define CRC_BE_BITS 8
+#endif
+#ifdef CONFIG_CRC32_BIT
+# define CRC_LE_BITS 1
+# define CRC_BE_BITS 1
+#endif
+
 /*
  * How many bits at a time to use.  Valid values are 1, 2, 4, 8, 32 and 64.
  * For less performance-sensitive, use 4 or 8 to save table size.

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4] crc32c: Implement CRC32c with slicing-by-8 algorithm

2011-10-04 Thread Darrick J. Wong
On Tue, Oct 04, 2011 at 07:59:53AM +0100, Herbert Xu wrote:
 On Mon, Oct 03, 2011 at 05:55:10PM -0700, Darrick J. Wong wrote:
 
  So what I think I'm hearing is...
  
  1. Apply Bob's slice-by-8 algorithm patch to regular crc32.
  2. Adapt crc32's build code to generate crc32c as well.
  3. Remove crypto/crc32c.c's implementation and have it wrap the code 
  generated
 by #2.
  4. Retain the current libcrc32c.  I guess if you don't configure CRYPTO and
 CRYPTO_CRC32C then it could also just reference the generated crc32c 
  functions
 directly.
  
  Is this a satisfactory way to move forward?
 
 All good except that you don't really have to touch libcrc32c
 at all.

Ok, let's see what you think of my v5 patchset. :)

--D
 
 Cheers,
 -- 
 Email: Herbert Xu herb...@gondor.apana.org.au
 Home Page: http://gondor.apana.org.au/~herbert/
 PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
 --
 To unsubscribe from this list: send the line unsubscribe linux-ext4 in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3] crc32c: Implement CRC32c with slicing-by-8 algorithm

2011-10-03 Thread Darrick J. Wong
On Sat, Oct 01, 2011 at 03:52:00PM +0200, Joakim Tjernlund wrote:
 Darrick J. Wong djw...@us.ibm.com wrote on 2011/09/30 18:12:23:
 
  [putting mailing lists on cc]
 
  On Fri, Sep 30, 2011 at 08:01:36AM +0200, Joakim Tjernlund wrote:
  
   (Just happen to see this patch in the archives)
  
   - This is basically an copy of Bobs crc32 work and duplicates code, this
 code needs to move into /lib/crc32.c and use the existing framework.
 
  Which framework are you talking about?  lib/crc32.c appears to be a simple
  module that exports a utility function.  Do you mean that you want to merge 
  the
  crc32{,c}defs.h and gen_crc32{,c}table.c code?  Do you want a build script 
  that
  starts with only a crc${ALG}_defs.h file and stamps out gencrc${ALG}_table.c
  and crc${ALG}.c boilerplate code and then builds it?
 
 I meant adding a crc32c_le in crc32.c and extend gen_table to generate the 
 crc32c table.
 
 
  I really don't know; from my perspective there was a slow implementation in
  crypto/crc32c.c and I wanted to speed it up.  crc32c seems to be in crypto/ 
  and
  not lib/ so that the implementation can be replaced with a hardware 
  accelerated
  version at runtime (crc32c-intel).
 
 It was a mistake to place it there IMHO.
 
 
  For crc32 which has no such hw replacement (as far as I know), moving it 
  into
  crypto/ would incur the overhead of going through the cryptoapi for not much
  benefit.  On the other hand it wouldn't be hard to put the crc32 code into
  crypto/.
 
 No, CRC is not a crypto. It is used by other subsystems like file systems that
 has nothing to do with crypto. Compare with the internet checksum, I think you
 will have a hard time moving it to crypto.

Yes, crc32* are not crypto hashes; crc32c is merely using the framework.  I'm
not inclined to tear it out of there unless the crypto maintainers tell me to
move it, which seems unlikely since Herbert made the move in the first place
for reasons I noted in my other reply.

   - Slice by 8 is just half the speed on my ppc32 compared to slice by 4 so
 it can't be enabled for all archs. Best to start with all 64 bit archs
 
  shrug I suppose I could make CRC32C_BITS configurable.  What is the 
  hardware
  profile of your ppc32 processor?  How much L1D/L2 cache?  slice-by-8 does 
  have
  a big cache footprint.  On the other hand it's faster than the slice-by-4
  (crc32) and Sarwate (crc32c) code in the kernel, even on old slow 32-bit x86
  processors (PII, PIII, P4).
 
 It is a low end embedded 333 MHz CPU with only L1 cache. How much faster
 is slice by 8 than slice by 4 on these old x86 machines?

How much L1 cache?  Or, if you'd rather not give away specifics, has the CPU
more than 8KB L1 cache?  I'm willing to concede that with little cache the
added memory pressure could be painful.

As for the old x86 machines, please have a look at:
http://djwong.org/docs/ext4_metadata_checksums.html#Benchmarking

~15% faster on a 2GHz Via C7
~20% faster on a 2.7GHz P4
~25% faster on a 500MHz P3

I vaguely recall it was ~20% faster on a 400MHz P2, but all the kernel.org
wikis are still down. :(

So I suspect the key factor here is memory hierarchy, since all of those systems
have at least 16K of L1 cache.  Slice by 8 might actually suck on a Pentium
Pro or earlier.  Unfortunately I don't have anything older than a PII...

 Bobs last version tested for 64/32 bits arch and selected slice by 8/slice by 
 4 based
 on that.
 
 
   - Last time I tested Bobs slice by 8 on ppc32 it didn't work.
 
  ... is crc32c broken *now*?  It seems fine on x86/amd64/ppc64.
 
 Don't know, I haven't tested it. Don't have much time ATM and I don't
 want to test something I don't agree with.

It seems fine on a ppc64 running in 32bit mode too.  I'll go find an old ppc32
and see how it fares.  I think it's a G3 500MHz.

--D
--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4] crc32c: Implement CRC32c with slicing-by-8 algorithm

2011-10-03 Thread Darrick J. Wong
On Sat, Oct 01, 2011 at 04:02:10PM +0200, Joakim Tjernlund wrote:
 
 Darrick J. Wong djw...@us.ibm.com wrote on 2011/09/30 21:29:56:
 
  The existing CRC32c implementation uses Sarwate's algorithm to calculate the
  code one byte at a time.  Using a slicing-by-8 algorithm adapted from Bob
  Pearson, we can process buffers 8 bytes at a time, for a substantial 
  increase
  in performance.
 
  The motivation for this patchset is that I am working on adding full 
  metadata
  checksumming to ext4 and jbd2.  As far as performance impact of adding
  checksumming goes, I see nearly no change with a standard mail server ffsb
  simulation.  On a test that involves only metadata operations (file creation
  and deletion, and fallocate/truncate), I see a drop of about 50 percent 
  with
  the current kernel crc32c implementation; this improves to a drop of about 
  20
  percent with the enclosed crc32c code.
 
  When metadata is usually a small fraction of total IO, this new 
  implementation
  doesn't help much because metadata is usually a small fraction of total IO.
  However, when we are doing IO that is almost all metadata (such as rm 
  -rf'ing a
  tree), then this patch speeds up the operation substantially.
 
  Given that iscsi, sctp, and btrfs also use crc32c, this patchset should 
  improve
  their speed as well.  I have some preliminary results[1] that show the
  difference in various crc algorithms that I've come across: the 
  crc32c-by8-le
  column is the new algorithm in the patch; the crc32c column is the current
  crc32c kernel implementation; and the crc32-kern-le column is the current
  crc32 kernel implementation, which is similar to the results one gets for
  CONFIG_CRC32C_SLICEBY4=y.  As you can see, the new implementation runs at
  nearly 4x the speed of the current implementation; even the slimmer 
  slice-by-4
  implementation is generally 2-3x faster.
 
  However, the implementation allows the kernel builder to select from a 
  variety
  of space-speed tradeoffs, should my results not hold true on a particular
  class of system.
 
  v2: Use the crypto testmgr api for self-test.
  v3: Get rid of the -be version, which had no users.
  v4: Allow kernel builder a choice of speed vs. space optimization.
 
  [1]http://djwong.org/docs/ext4_metadata_checksums.html
  (cached copy of the ext4 wiki)
 
  Signed-off-by: Darrick J. Wong djw...@us.ibm.com
 
 This is based on an old version of Bobs slice by 8 that has lots duplication 
 and
 hard to maintain.

Are you referring to [PATCH v6 05/10] crc32-misc-cleanup.diff from 8/31?  I
haven't seen that one, so I'll go comb the internet.  Thank you for the
pointer, I'll update my patch.

 Start from Bobs latest patches and add crc32c to lib/crc32.c

If I did that, how should I handle patching in the hardware accelerated version
on Intel systems?  That switcheroo ability seems to have been Herbert Xu's
motivation for moving crc32c into crypto/ in the first place:

libcrc32c: Move implementation to crypto crc32c

This patch swaps the role of libcrc32c and crc32c.  Previously
the implementation was in libcrc32c and crc32c was a wrapper.
Now the code is in crc32c and libcrc32c just calls the crypto
layer.

The reason for the change is to tap into the algorithm selection
capability of the crypto API so that optimised implementations
such as the one utilising Intel's CRC32C instruction can be
used where available.

 Also, for crc32c I think you only need slice by 4 and slice by 8

Yes.  The lookup table option is only for people with extremely small systems,
and the per-bit option is usable only for debugging.  They could go away if
anyone's really offended by them. :)

--D
 
  Jocke
 

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4] crc32c: Implement CRC32c with slicing-by-8 algorithm

2011-10-03 Thread Darrick J. Wong
On Mon, Oct 03, 2011 at 09:35:13PM +0100, Herbert Xu wrote:
 On Mon, Oct 03, 2011 at 10:27:03PM +0200, Joakim Tjernlund wrote:
 
Start from Bobs latest patches and add crc32c to lib/crc32.c
  
   If I did that, how should I handle patching in the hardware accelerated 
   version
   on Intel systems?  That switcheroo ability seems to have been Herbert Xu's
   motivation for moving crc32c into crypto/ in the first place:
  
  I don't know, I haven't looked at that problem. I suspect it moved because 
  that
  was the easiest solution. Having an identical impl. of crc32(only the table 
  values differ)
  in crypto compared to the one in lib is not the way forward though.
 
 You can always get crypto/crc32c.c to use call helpers from
 lib/crc32.c.

So what I think I'm hearing is...

1. Apply Bob's slice-by-8 algorithm patch to regular crc32.
2. Adapt crc32's build code to generate crc32c as well.
3. Remove crypto/crc32c.c's implementation and have it wrap the code generated
   by #2.
4. Retain the current libcrc32c.  I guess if you don't configure CRYPTO and
   CRYPTO_CRC32C then it could also just reference the generated crc32c 
functions
   directly.

Is this a satisfactory way to move forward?

--D
--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3] crc32c: Implement CRC32c with slicing-by-8 algorithm

2011-09-30 Thread Darrick J. Wong
[putting mailing lists on cc]

On Fri, Sep 30, 2011 at 08:01:36AM +0200, Joakim Tjernlund wrote:
 
 (Just happen to see this patch in the archives)
 
 - This is basically an copy of Bobs crc32 work and duplicates code, this
   code needs to move into /lib/crc32.c and use the existing framework.

Which framework are you talking about?  lib/crc32.c appears to be a simple
module that exports a utility function.  Do you mean that you want to merge the
crc32{,c}defs.h and gen_crc32{,c}table.c code?  Do you want a build script that
starts with only a crc${ALG}_defs.h file and stamps out gencrc${ALG}_table.c
and crc${ALG}.c boilerplate code and then builds it?

I really don't know; from my perspective there was a slow implementation in
crypto/crc32c.c and I wanted to speed it up.  crc32c seems to be in crypto/ and
not lib/ so that the implementation can be replaced with a hardware accelerated
version at runtime (crc32c-intel).

For crc32 which has no such hw replacement (as far as I know), moving it into
crypto/ would incur the overhead of going through the cryptoapi for not much
benefit.  On the other hand it wouldn't be hard to put the crc32 code into
crypto/.
 
 - Slice by 8 is just half the speed on my ppc32 compared to slice by 4 so
   it can't be enabled for all archs. Best to start with all 64 bit archs

*shrug* I suppose I could make CRC32C_BITS configurable.  What is the hardware
profile of your ppc32 processor?  How much L1D/L2 cache?  slice-by-8 does have
a big cache footprint.  On the other hand it's faster than the slice-by-4
(crc32) and Sarwate (crc32c) code in the kernel, even on old slow 32-bit x86
processors (PII, PIII, P4).

 - Last time I tested Bobs slice by 8 on ppc32 it didn't work.

... is crc32c broken *now*?  It seems fine on x86/amd64/ppc64.

--D
--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] libcrc32c: Expose big-endian version of crc32c

2011-09-28 Thread Darrick J. Wong
On Wed, Sep 28, 2011 at 01:53:59PM +1000, Herbert Xu wrote:
 On Tue, Sep 27, 2011 at 03:12:53PM -0700, Darrick J. Wong wrote:
  Provide a big-endian version of crc32c for modules that want it.
 
 Who is going to use this?

Well, I was using it for jbd2 ... but since you ask, it seems to work just as
well with crc32c-le, so I think I'll just drop the -be version.

--D
 
 Thanks,
 -- 
 Email: Herbert Xu herb...@gondor.apana.org.au
 Home Page: http://gondor.apana.org.au/~herbert/
 PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
 --
 To unsubscribe from this list: send the line unsubscribe linux-ext4 in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] libcrc32c: Expose big-endian version of crc32c

2011-09-28 Thread Darrick J. Wong
On Wed, Sep 28, 2011 at 09:51:45AM -0700, Darrick J. Wong wrote:
 On Wed, Sep 28, 2011 at 01:53:59PM +1000, Herbert Xu wrote:
  On Tue, Sep 27, 2011 at 03:12:53PM -0700, Darrick J. Wong wrote:
   Provide a big-endian version of crc32c for modules that want it.
  
  Who is going to use this?
 
 Well, I was using it for jbd2 ... but since you ask, it seems to work just as
 well with crc32c-le, so I think I'll just drop the -be version.

Drat, it's also missing the gen_crc32ctable program.  Sorry for the noise; I'll
resend it.  With the -be parts stripped out I can remove all but the first
patch, which cuts down the code changes considerably.

--D
--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3] crc32c: Implement CRC32c with slicing-by-8 algorithm

2011-09-28 Thread Darrick J. Wong
The existing CRC32c implementation uses Sarwate's algorithm to calculate the
code one byte at a time.  Using slicing-by-8, we can process buffers 8 bytes at
a time, for a substantial increase in performance.

v2: Use the crypto testmgr api for self-test.
v3: Get rid of the -be version, which had no users.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 crypto/Makefile  |   11 ++
 crypto/crc32c.c  |  305 ++
 crypto/crc32c_defs.h |   26 
 crypto/gen_crc32ctable.c |   79 
 4 files changed, 340 insertions(+), 81 deletions(-)
 create mode 100644 crypto/crc32c_defs.h
 create mode 100644 crypto/gen_crc32ctable.c


diff --git a/crypto/Makefile b/crypto/Makefile
index ce5a813..00811ef 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -94,3 +94,14 @@ obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
 #
 obj-$(CONFIG_XOR_BLOCKS) += xor.o
 obj-$(CONFIG_ASYNC_CORE) += async_tx/
+
+hostprogs-y:= gen_crc32ctable
+clean-files:= crc32ctable.h
+
+$(obj)/crc32c.o: $(obj)/crc32c_table.h
+
+quiet_cmd_crc32c = GEN $@
+  cmd_crc32c = $  $@
+
+$(obj)/crc32c_table.h: $(obj)/gen_crc32ctable
+   $(call cmd,crc32c)
diff --git a/crypto/crc32c.c b/crypto/crc32c.c
index 3f9ad28..d510ec8 100644
--- a/crypto/crc32c.c
+++ b/crypto/crc32c.c
@@ -33,6 +33,35 @@
  * Software Foundation; either version 2 of the License, or (at your option)
  * any later version.
  *
+ * The current crc32c implementation is adapted from Bob Pearson's slice-by-8
+ * crc32 kernel patch from mid-2011.
+ *
+ * August 26, 2011 Darrick J. Wong djwong at us.ibm.com
+ * Reuse Bob Pearson's slice-by-8 implementation for e2fsprogs.
+ *
+ * July 20, 2011 Bob Pearson rpearson at systemfabricworks.com
+ * added slice by 8 algorithm to the existing conventional and
+ * slice by 4 algorithms.
+ *
+ * Oct 15, 2000 Matt Domsch matt_dom...@dell.com
+ * Nicer crc32 functions/docs submitted by li...@horizon.com.  Thanks!
+ * Code was from the public domain, copyright abandoned.  Code was
+ * subsequently included in the kernel, thus was re-licensed under the
+ * GNU GPL v2.
+ *
+ * Oct 12, 2000 Matt Domsch matt_dom...@dell.com
+ * Same crc32 function was used in 5 other places in the kernel.
+ * I made one version, and deleted the others.
+ * There are various incantations of crc32().  Some use a seed of 0 or ~0.
+ * Some xor at the end with ~0.  The generic crc32() function takes
+ * seed as an argument, and doesn't xor at the end.  Then individual
+ * users can do whatever they need.
+ *   drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
+ *   fs/jffs2 uses seed 0, doesn't xor with ~0.
+ *   fs/partitions/efi.c uses seed ~0, xor's with ~0.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
  */
 
 #include crypto/internal/hash.h
@@ -40,6 +69,7 @@
 #include linux/module.h
 #include linux/string.h
 #include linux/kernel.h
+#include crc32c_defs.h
 
 #define CHKSUM_BLOCK_SIZE  1
 #define CHKSUM_DIGEST_SIZE 4
@@ -52,92 +82,205 @@ struct chksum_desc_ctx {
u32 crc;
 };
 
-/*
- * This is the CRC-32C table
- * Generated with:
- * width = 32 bits
- * poly = 0x1EDC6F41
- * reflect input bytes = true
- * reflect output bytes = true
- */
+#if CRC32C_BITS  8
+# define tole(x) (__force u32) __constant_cpu_to_le32(x)
+#else
+# define tole(x) (x)
+#endif
 
-static const u32 crc32c_table[256] = {
-   0xL, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
-   0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
-   0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
-   0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
-   0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
-   0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
-   0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
-   0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
-   0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
-   0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
-   0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
-   0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
-   0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
-   0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
-   0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
-   0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
-   0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
-   0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
-   0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
-   0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
-   0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
-   0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
-   0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
-   0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
-   0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L

[PATCH 1/3] crc32c: Implement CRC32c with slicing-by-8 algorithm

2011-09-27 Thread Darrick J. Wong
The existing CRC32c implementation uses Sarwate's algorithm to calculate the
code one byte at a time.  Using slicing-by-8, we can process buffers 8 bytes at
a time, for a substantial increase in performance.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 crypto/Makefile  |   11 +
 crypto/crc32c.c  |  635 ++
 crypto/crc32c_defs.h |   34 +++
 3 files changed, 576 insertions(+), 104 deletions(-)
 create mode 100644 crypto/crc32c_defs.h


diff --git a/crypto/Makefile b/crypto/Makefile
index ce5a813..00811ef 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -94,3 +94,14 @@ obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
 #
 obj-$(CONFIG_XOR_BLOCKS) += xor.o
 obj-$(CONFIG_ASYNC_CORE) += async_tx/
+
+hostprogs-y:= gen_crc32ctable
+clean-files:= crc32ctable.h
+
+$(obj)/crc32c.o: $(obj)/crc32c_table.h
+
+quiet_cmd_crc32c = GEN $@
+  cmd_crc32c = $  $@
+
+$(obj)/crc32c_table.h: $(obj)/gen_crc32ctable
+   $(call cmd,crc32c)
diff --git a/crypto/crc32c.c b/crypto/crc32c.c
index 3f9ad28..d18f6a1 100644
--- a/crypto/crc32c.c
+++ b/crypto/crc32c.c
@@ -33,6 +33,35 @@
  * Software Foundation; either version 2 of the License, or (at your option)
  * any later version.
  *
+ * The current crc32c implementation is adapted from Bob Pearson's slice-by-8
+ * crc32 kernel patch from mid-2011.
+ *
+ * August 26, 2011 Darrick J. Wong djwong at us.ibm.com
+ * Reuse Bob Pearson's slice-by-8 implementation for e2fsprogs.
+ *
+ * July 20, 2011 Bob Pearson rpearson at systemfabricworks.com
+ * added slice by 8 algorithm to the existing conventional and
+ * slice by 4 algorithms.
+ *
+ * Oct 15, 2000 Matt Domsch matt_dom...@dell.com
+ * Nicer crc32 functions/docs submitted by li...@horizon.com.  Thanks!
+ * Code was from the public domain, copyright abandoned.  Code was
+ * subsequently included in the kernel, thus was re-licensed under the
+ * GNU GPL v2.
+ *
+ * Oct 12, 2000 Matt Domsch matt_dom...@dell.com
+ * Same crc32 function was used in 5 other places in the kernel.
+ * I made one version, and deleted the others.
+ * There are various incantations of crc32().  Some use a seed of 0 or ~0.
+ * Some xor at the end with ~0.  The generic crc32() function takes
+ * seed as an argument, and doesn't xor at the end.  Then individual
+ * users can do whatever they need.
+ *   drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
+ *   fs/jffs2 uses seed 0, doesn't xor with ~0.
+ *   fs/partitions/efi.c uses seed ~0, xor's with ~0.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
  */
 
 #include crypto/internal/hash.h
@@ -40,6 +69,7 @@
 #include linux/module.h
 #include linux/string.h
 #include linux/kernel.h
+#include crc32c_defs.h
 
 #define CHKSUM_BLOCK_SIZE  1
 #define CHKSUM_DIGEST_SIZE 4
@@ -52,92 +82,398 @@ struct chksum_desc_ctx {
u32 crc;
 };
 
-/*
- * This is the CRC-32C table
- * Generated with:
- * width = 32 bits
- * poly = 0x1EDC6F41
- * reflect input bytes = true
- * reflect output bytes = true
- */
+#if CRC_LE_BITS  8
+# define tole(x) (__force u32) __constant_cpu_to_le32(x)
+#else
+# define tole(x) (x)
+#endif
 
-static const u32 crc32c_table[256] = {
-   0xL, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
-   0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
-   0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
-   0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
-   0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
-   0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
-   0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
-   0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
-   0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
-   0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
-   0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
-   0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
-   0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
-   0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
-   0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
-   0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
-   0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
-   0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
-   0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
-   0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
-   0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
-   0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
-   0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
-   0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
-   0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
-   0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
-   0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
-   0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
-   0x7198540DL

[PATCH 3/3] crc32c: Implement a self-test for CRC32c

2011-09-27 Thread Darrick J. Wong
This is a self-test for the CRC32c code.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 crypto/tcrypt.c  |6 ++
 crypto/testmgr.c |   36 +--
 crypto/testmgr.h |  177 +-
 3 files changed, 211 insertions(+), 8 deletions(-)


diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 617..73c10f8 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -64,7 +64,7 @@ static char *check[] = {
cast6, arc4, michael_mic, deflate, crc32c, tea, xtea,
khazad, wp512, wp384, wp256, tnepres, xeta,  fcrypt,
camellia, seed, salsa20, rmd128, rmd160, rmd256, rmd320,
-   lzo, cts, zlib, NULL
+   lzo, cts, zlib, crc32c-be, NULL
 };
 
 static int test_cipher_jiffies(struct blkcipher_desc *desc, int enc,
@@ -944,6 +944,10 @@ static int do_test(int m)
ret += tcrypt_test(rfc4309(ccm(aes)));
break;
 
+   case 46:
+   ret += tcrypt_test(crc32c-be);
+   break;
+
case 100:
ret += tcrypt_test(hmac(md5));
break;
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index b6b93d4..738b79f 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1457,7 +1457,8 @@ static int alg_test_hash(const struct alg_test_desc 
*desc, const char *driver,
 }
 
 static int alg_test_crc32c(const struct alg_test_desc *desc,
-  const char *driver, u32 type, u32 mask)
+  const char *driver, u32 type, u32 mask,
+  int big_endian)
 {
struct crypto_shash *tfm;
u32 val;
@@ -1484,7 +1485,10 @@ static int alg_test_crc32c(const struct alg_test_desc 
*desc,
sdesc.shash.tfm = tfm;
sdesc.shash.flags = 0;
 
-   *(u32 *)sdesc.ctx = le32_to_cpu(420553207);
+   if (big_endian)
+   *(u32 *)sdesc.ctx = be32_to_cpu(420553207);
+   else
+   *(u32 *)sdesc.ctx = le32_to_cpu(420553207);
err = crypto_shash_final(sdesc.shash, (u8 *)val);
if (err) {
printk(KERN_ERR alg: crc32c: Operation failed for 
@@ -1505,6 +1509,18 @@ out:
return err;
 }
 
+static int alg_test_crc32c_be(const struct alg_test_desc *desc,
+ const char *driver, u32 type, u32 mask)
+{
+   return alg_test_crc32c(desc, driver, type, mask, 1);
+}
+
+static int alg_test_crc32c_le(const struct alg_test_desc *desc,
+ const char *driver, u32 type, u32 mask)
+{
+   return alg_test_crc32c(desc, driver, type, mask, 0);
+}
+
 static int alg_test_cprng(const struct alg_test_desc *desc, const char *driver,
  u32 type, u32 mask)
 {
@@ -1707,12 +1723,22 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}, {
.alg = crc32c,
-   .test = alg_test_crc32c,
+   .test = alg_test_crc32c_le,
+   .fips_allowed = 1,
+   .suite = {
+   .hash = {
+   .vecs = crc32c_le_tv_template,
+   .count = CRC32C_LE_TEST_VECTORS
+   }
+   }
+   }, {
+   .alg = crc32c-be,
+   .test = alg_test_crc32c_be,
.fips_allowed = 1,
.suite = {
.hash = {
-   .vecs = crc32c_tv_template,
-   .count = CRC32C_TEST_VECTORS
+   .vecs = crc32c_be_tv_template,
+   .count = CRC32C_BE_TEST_VECTORS
}
}
}, {
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 27adc92..8223738 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -10172,9 +10172,10 @@ static struct hash_testvec michael_mic_tv_template[] = 
{
 /*
  * CRC32C test vectors
  */
-#define CRC32C_TEST_VECTORS 14
+#define CRC32C_LE_TEST_VECTORS 14
+#define CRC32C_BE_TEST_VECTORS 14
 
-static struct hash_testvec crc32c_tv_template[] = {
+static struct hash_testvec crc32c_le_tv_template[] = {
{
.psize = 0,
.digest = \x00\x00\x00\x00,
@@ -10346,4 +10347,176 @@ static struct hash_testvec crc32c_tv_template[] = {
},
 };
 
+static struct hash_testvec crc32c_be_tv_template[] = {
+   {
+   .psize = 0,
+   .digest = \x00\x00\x00\x00,
+   },
+   {
+   .key = \x87\xa9\xcb\xed,
+   .ksize = 4,
+   .psize = 0,
+   .digest = \x78\x56\x34\x12,
+   },
+   {
+   .key = \xff\xff\xff\xff,
+   .ksize = 4,
+   .plaintext = \x01\x02\x03\x04\x05\x06\x07\x08
+\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10
+\x11\x12\x13\x14\x15\x16\x17\x18

[PATCH 2/3] libcrc32c: Expose big-endian version of crc32c

2011-09-27 Thread Darrick J. Wong
Provide a big-endian version of crc32c for modules that want it.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 include/linux/crc32c.h |5 +++--
 lib/libcrc32c.c|   43 ++-
 2 files changed, 37 insertions(+), 11 deletions(-)


diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h
index bd8b44d..33320e1 100644
--- a/include/linux/crc32c.h
+++ b/include/linux/crc32c.h
@@ -3,9 +3,10 @@
 
 #include linux/types.h
 
-extern u32 crc32c(u32 crc, const void *address, unsigned int length);
+extern u32 crc32c_le(u32 crc, const void *address, unsigned int length);
+extern u32 crc32c_be(u32 crc, const void *address, unsigned int length);
 
 /* This macro exists for backwards-compatibility. */
-#define crc32c_le crc32c
+#define crc32c crc32c_le
 
 #endif /* _LINUX_CRC32C_H */
diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c
index 244f548..e421ff5 100644
--- a/lib/libcrc32c.c
+++ b/lib/libcrc32c.c
@@ -37,17 +37,17 @@
 #include linux/kernel.h
 #include linux/module.h
 
-static struct crypto_shash *tfm;
+static struct crypto_shash *tfm_le, *tfm_be;
 
-u32 crc32c(u32 crc, const void *address, unsigned int length)
+u32 crc32c_le(u32 crc, const void *address, unsigned int length)
 {
struct {
struct shash_desc shash;
-   char ctx[crypto_shash_descsize(tfm)];
+   char ctx[crypto_shash_descsize(tfm_le)];
} desc;
int err;
 
-   desc.shash.tfm = tfm;
+   desc.shash.tfm = tfm_le;
desc.shash.flags = 0;
*(u32 *)desc.ctx = crc;
 
@@ -56,21 +56,46 @@ u32 crc32c(u32 crc, const void *address, unsigned int 
length)
 
return *(u32 *)desc.ctx;
 }
+EXPORT_SYMBOL(crc32c_le);
 
-EXPORT_SYMBOL(crc32c);
+u32 crc32c_be(u32 crc, const void *address, unsigned int length)
+{
+   struct {
+   struct shash_desc shash;
+   char ctx[crypto_shash_descsize(tfm_be)];
+   } desc;
+   int err;
+
+   desc.shash.tfm = tfm_be;
+   desc.shash.flags = 0;
+   *(u32 *)desc.ctx = crc;
+
+   err = crypto_shash_update(desc.shash, address, length);
+   BUG_ON(err);
+
+   return *(u32 *)desc.ctx;
+}
+EXPORT_SYMBOL(crc32c_be);
 
 static int __init libcrc32c_mod_init(void)
 {
-   tfm = crypto_alloc_shash(crc32c, 0, 0);
-   if (IS_ERR(tfm))
-   return PTR_ERR(tfm);
+   tfm_le = crypto_alloc_shash(crc32c, 0, 0);
+   if (IS_ERR(tfm_le))
+   return PTR_ERR(tfm_le);
+
+   tfm_be = crypto_alloc_shash(crc32c-be, 0, 0);
+   if (IS_ERR(tfm_be)) {
+   crypto_free_shash(tfm_le);
+   return PTR_ERR(tfm_be);
+   }
 
return 0;
 }
 
 static void __exit libcrc32c_mod_fini(void)
 {
-   crypto_free_shash(tfm);
+   crypto_free_shash(tfm_be);
+   crypto_free_shash(tfm_le);
 }
 
 module_init(libcrc32c_mod_init);

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 0/3] crc32c: Add faster algorithm and self-test code

2011-09-27 Thread Darrick J. Wong
On Tue, Sep 27, 2011 at 03:12:39PM -0700, Darrick J. Wong wrote:
 Hi all,
 
 This patchset replaces the current crc32c software implementation, which uses
 a slow per-byte lookup table algorithm, with a faster implementation that uses
 an adaptation of the slice-by-8 algorithm that Bob Pearson has been pushing
 for crc32.
 
 The motivation for this patchset is that I am working on adding full metadata
 checksumming to ext4[1].  As far as performance impact of adding checksumming
 goes, I see nearly no change with a standard mail server ffsb simulation.  On
 a test that involves only file creation and deletion and extent tree writes, I
 see a drop of about 50 percent with the current kernel crc32c implementation;
 this improves to a drop of about 20 percent with the enclosed crc32c code.
 
 For typical workloads, this new implementation doesn't help much because
 metadata is usually a small fraction of total IO.  However, when we are doing
 IO that is almost all metadata (such as rm -rf'ing a tree), then this patch
 speeds up the operation substantially.
 
 Please have a look at the patches, and please feel free to suggest any changes.
 I will be at LPC next week if anyone wishes to discuss, debate, or protest.

Oops, ignore that sentence, since LPC has long passed. :(

--D
 
 Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this patchset
 should improve their speed as well.  I have not yet quantified that, however.
 
 v2: Use the crypto test manager code to check crc32c operation.
 
 --D
 
 [1] https://ext4.wiki.kernel.org/index.php/Ext4_Metadata_Checksums
--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] libcrc32c: Expose big-endian version of crc32c

2011-08-31 Thread Darrick J. Wong
Provide a big-endian version of crc32c for modules that want it.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 include/linux/crc32c.h |5 +++--
 lib/libcrc32c.c|   43 ++-
 2 files changed, 37 insertions(+), 11 deletions(-)


diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h
index bd8b44d..33320e1 100644
--- a/include/linux/crc32c.h
+++ b/include/linux/crc32c.h
@@ -3,9 +3,10 @@
 
 #include linux/types.h
 
-extern u32 crc32c(u32 crc, const void *address, unsigned int length);
+extern u32 crc32c_le(u32 crc, const void *address, unsigned int length);
+extern u32 crc32c_be(u32 crc, const void *address, unsigned int length);
 
 /* This macro exists for backwards-compatibility. */
-#define crc32c_le crc32c
+#define crc32c crc32c_le
 
 #endif /* _LINUX_CRC32C_H */
diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c
index 244f548..e421ff5 100644
--- a/lib/libcrc32c.c
+++ b/lib/libcrc32c.c
@@ -37,17 +37,17 @@
 #include linux/kernel.h
 #include linux/module.h
 
-static struct crypto_shash *tfm;
+static struct crypto_shash *tfm_le, *tfm_be;
 
-u32 crc32c(u32 crc, const void *address, unsigned int length)
+u32 crc32c_le(u32 crc, const void *address, unsigned int length)
 {
struct {
struct shash_desc shash;
-   char ctx[crypto_shash_descsize(tfm)];
+   char ctx[crypto_shash_descsize(tfm_le)];
} desc;
int err;
 
-   desc.shash.tfm = tfm;
+   desc.shash.tfm = tfm_le;
desc.shash.flags = 0;
*(u32 *)desc.ctx = crc;
 
@@ -56,21 +56,46 @@ u32 crc32c(u32 crc, const void *address, unsigned int 
length)
 
return *(u32 *)desc.ctx;
 }
+EXPORT_SYMBOL(crc32c_le);
 
-EXPORT_SYMBOL(crc32c);
+u32 crc32c_be(u32 crc, const void *address, unsigned int length)
+{
+   struct {
+   struct shash_desc shash;
+   char ctx[crypto_shash_descsize(tfm_be)];
+   } desc;
+   int err;
+
+   desc.shash.tfm = tfm_be;
+   desc.shash.flags = 0;
+   *(u32 *)desc.ctx = crc;
+
+   err = crypto_shash_update(desc.shash, address, length);
+   BUG_ON(err);
+
+   return *(u32 *)desc.ctx;
+}
+EXPORT_SYMBOL(crc32c_be);
 
 static int __init libcrc32c_mod_init(void)
 {
-   tfm = crypto_alloc_shash(crc32c, 0, 0);
-   if (IS_ERR(tfm))
-   return PTR_ERR(tfm);
+   tfm_le = crypto_alloc_shash(crc32c, 0, 0);
+   if (IS_ERR(tfm_le))
+   return PTR_ERR(tfm_le);
+
+   tfm_be = crypto_alloc_shash(crc32c-be, 0, 0);
+   if (IS_ERR(tfm_be)) {
+   crypto_free_shash(tfm_le);
+   return PTR_ERR(tfm_be);
+   }
 
return 0;
 }
 
 static void __exit libcrc32c_mod_fini(void)
 {
-   crypto_free_shash(tfm);
+   crypto_free_shash(tfm_be);
+   crypto_free_shash(tfm_le);
 }
 
 module_init(libcrc32c_mod_init);

--
To unsubscribe from this list: send the line unsubscribe linux-crypto in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] crc32c: Implement CRC32c with slicing-by-8 algorithm

2011-08-31 Thread Darrick J. Wong
The existing CRC32c implementation uses Sarwate's algorithm to calculate the
code one byte at a time.  Using slicing-by-8, we can process buffers 8 bytes at
a time, for a substantial increase in performance.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 crypto/Makefile  |   11 +
 crypto/crc32c.c  |  635 ++
 crypto/crc32c_defs.h |   34 +++
 3 files changed, 576 insertions(+), 104 deletions(-)
 create mode 100644 crypto/crc32c_defs.h


diff --git a/crypto/Makefile b/crypto/Makefile
index ce5a813..00811ef 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -94,3 +94,14 @@ obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
 #
 obj-$(CONFIG_XOR_BLOCKS) += xor.o
 obj-$(CONFIG_ASYNC_CORE) += async_tx/
+
+hostprogs-y:= gen_crc32ctable
+clean-files:= crc32ctable.h
+
+$(obj)/crc32c.o: $(obj)/crc32c_table.h
+
+quiet_cmd_crc32c = GEN $@
+  cmd_crc32c = $  $@
+
+$(obj)/crc32c_table.h: $(obj)/gen_crc32ctable
+   $(call cmd,crc32c)
diff --git a/crypto/crc32c.c b/crypto/crc32c.c
index 3f9ad28..d18f6a1 100644
--- a/crypto/crc32c.c
+++ b/crypto/crc32c.c
@@ -33,6 +33,35 @@
  * Software Foundation; either version 2 of the License, or (at your option)
  * any later version.
  *
+ * The current crc32c implementation is adapted from Bob Pearson's slice-by-8
+ * crc32 kernel patch from mid-2011.
+ *
+ * August 26, 2011 Darrick J. Wong djwong at us.ibm.com
+ * Reuse Bob Pearson's slice-by-8 implementation for e2fsprogs.
+ *
+ * July 20, 2011 Bob Pearson rpearson at systemfabricworks.com
+ * added slice by 8 algorithm to the existing conventional and
+ * slice by 4 algorithms.
+ *
+ * Oct 15, 2000 Matt Domsch matt_dom...@dell.com
+ * Nicer crc32 functions/docs submitted by li...@horizon.com.  Thanks!
+ * Code was from the public domain, copyright abandoned.  Code was
+ * subsequently included in the kernel, thus was re-licensed under the
+ * GNU GPL v2.
+ *
+ * Oct 12, 2000 Matt Domsch matt_dom...@dell.com
+ * Same crc32 function was used in 5 other places in the kernel.
+ * I made one version, and deleted the others.
+ * There are various incantations of crc32().  Some use a seed of 0 or ~0.
+ * Some xor at the end with ~0.  The generic crc32() function takes
+ * seed as an argument, and doesn't xor at the end.  Then individual
+ * users can do whatever they need.
+ *   drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
+ *   fs/jffs2 uses seed 0, doesn't xor with ~0.
+ *   fs/partitions/efi.c uses seed ~0, xor's with ~0.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
  */
 
 #include crypto/internal/hash.h
@@ -40,6 +69,7 @@
 #include linux/module.h
 #include linux/string.h
 #include linux/kernel.h
+#include crc32c_defs.h
 
 #define CHKSUM_BLOCK_SIZE  1
 #define CHKSUM_DIGEST_SIZE 4
@@ -52,92 +82,398 @@ struct chksum_desc_ctx {
u32 crc;
 };
 
-/*
- * This is the CRC-32C table
- * Generated with:
- * width = 32 bits
- * poly = 0x1EDC6F41
- * reflect input bytes = true
- * reflect output bytes = true
- */
+#if CRC_LE_BITS  8
+# define tole(x) (__force u32) __constant_cpu_to_le32(x)
+#else
+# define tole(x) (x)
+#endif
 
-static const u32 crc32c_table[256] = {
-   0xL, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
-   0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
-   0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
-   0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
-   0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
-   0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
-   0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
-   0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
-   0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
-   0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
-   0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
-   0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
-   0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
-   0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
-   0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
-   0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
-   0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
-   0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
-   0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
-   0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
-   0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
-   0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
-   0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
-   0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
-   0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
-   0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
-   0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
-   0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
-   0x7198540DL

[PATCH 3/3] crc32c: Implement a self-test for CRC32c

2011-08-31 Thread Darrick J. Wong
This is a loadable module that will self-test the CRC32c code.

Signed-off-by: Darrick J. Wong djw...@us.ibm.com
---
 lib/Kconfig  |7 +
 lib/Makefile |1 
 lib/libcrc32c_test.c |  694 ++
 3 files changed, 702 insertions(+), 0 deletions(-)
 create mode 100644 lib/libcrc32c_test.c


diff --git a/lib/Kconfig b/lib/Kconfig
index 6c695ff..2bfdde8 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -79,6 +79,13 @@ config LIBCRC32C
  require M here.  See Castagnoli93.
  Module will be libcrc32c.
 
+config LIBCRC32C_SELFTEST
+   tristate CRC32c Self-Test
+   depends on CRYPTO_CRC32C
+   help
+ This is a testing module that ensures that a crc32c implementation
+ is working correctly.
+
 config CRC8
tristate CRC8 function
help
diff --git a/lib/Makefile b/lib/Makefile
index 3f5bc6d..79ca5ed 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -61,6 +61,7 @@ obj-$(CONFIG_CRC_ITU_T)   += crc-itu-t.o
 obj-$(CONFIG_CRC32)+= crc32.o
 obj-$(CONFIG_CRC7) += crc7.o
 obj-$(CONFIG_LIBCRC32C)+= libcrc32c.o
+obj-$(CONFIG_LIBCRC32C_SELFTEST) += libcrc32c_test.o
 obj-$(CONFIG_CRC8) += crc8.o
 obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
 
diff --git a/lib/libcrc32c_test.c b/lib/libcrc32c_test.c
new file mode 100644
index 000..8b5c75f
--- /dev/null
+++ b/lib/libcrc32c_test.c
@@ -0,0 +1,694 @@
+/*
+ * libcrc32c_test.c: Test buffer and checksums for crc32c.
+ */
+#include linux/module.h
+#include linux/crc32c.h
+
+static u8 test_buf[] = {
+   0xd9, 0xd7, 0x6a, 0x13, 0x3a, 0xb1, 0x05, 0x48,
+   0xda, 0xad, 0x14, 0xbd, 0x03, 0x3a, 0x58, 0x5e,
+   0x6e, 0xd1, 0x56, 0xc9, 0x2e, 0xc4, 0xcb, 0x6b,
+   0xe8, 0x77, 0x52, 0x37, 0x4e, 0x0f, 0x55, 0xd2,
+   0x12, 0x65, 0x90, 0xc2, 0x41, 0x49, 0x81, 0x01,
+   0xf5, 0x01, 0xeb, 0x2d, 0x78, 0x74, 0x23, 0x5d,
+   0x84, 0x5c, 0x81, 0x92, 0x21, 0xe9, 0x8d, 0x1d,
+   0x89, 0xf2, 0x4a, 0xac, 0xdd, 0xf9, 0xaf, 0xee,
+   0x44, 0xe7, 0x6e, 0xed, 0xfb, 0xd8, 0x89, 0x0e,
+   0x96, 0x62, 0xcd, 0xa4, 0x4b, 0xa9, 0xe5, 0x45,
+   0xb1, 0x29, 0x9b, 0x0f, 0xfc, 0xbd, 0x83, 0xab,
+   0xa8, 0x54, 0x96, 0x44, 0x2c, 0x7f, 0xbb, 0xe7,
+   0x52, 0x29, 0x08, 0xee, 0x14, 0xc5, 0xc2, 0xec,
+   0x5a, 0xeb, 0x40, 0x40, 0xea, 0xd1, 0x3d, 0x15,
+   0x73, 0xaa, 0x8c, 0x73, 0xfc, 0xf2, 0x2b, 0x49,
+   0x0b, 0x13, 0x96, 0xd9, 0x8e, 0x4b, 0xbc, 0xe0,
+   0xf4, 0xd2, 0xe0, 0x2e, 0x7a, 0xf0, 0x5d, 0x1f,
+   0xd2, 0x92, 0x97, 0xe0, 0xaa, 0x59, 0xab, 0xc9,
+   0x5c, 0xa6, 0x51, 0x1a, 0xe3, 0xd6, 0x06, 0xb9,
+   0xae, 0xb8, 0x76, 0x36, 0x79, 0x37, 0x52, 0xf6,
+   0x34, 0xaf, 0x27, 0x19, 0xe1, 0xc0, 0x2b, 0xdd,
+   0x01, 0x15, 0xcd, 0xce, 0x44, 0xf6, 0x4c, 0x18,
+   0x92, 0x69, 0xbe, 0x8a, 0x76, 0x23, 0x52, 0x13,
+   0x3f, 0xf9, 0xe0, 0xf5, 0x06, 0x28, 0x7c, 0xc7,
+   0xf3, 0x42, 0x0f, 0xdd, 0x40, 0x33, 0xf7, 0x99,
+   0xe2, 0xad, 0x26, 0xd9, 0x53, 0x10, 0x72, 0x0c,
+   0x4e, 0x43, 0x4c, 0x61, 0xfe, 0xd9, 0xc1, 0x16,
+   0xa1, 0x93, 0xca, 0x3c, 0x75, 0x7f, 0x07, 0x7a,
+   0x65, 0xb3, 0x53, 0x2a, 0x52, 0x00, 0xa0, 0x62,
+   0xe0, 0xa3, 0x1f, 0xad, 0xd7, 0xbb, 0xc0, 0x83,
+   0x5d, 0x54, 0x87, 0x5f, 0xc8, 0x2f, 0xc8, 0xbf,
+   0x69, 0x04, 0x91, 0xc8, 0xa6, 0x1d, 0x4d, 0x46,
+   0x91, 0xfc, 0x26, 0xf4, 0x16, 0xd1, 0xa4, 0xbf,
+   0x5c, 0xa2, 0x6c, 0xdd, 0xb4, 0x40, 0xf2, 0x2e,
+   0xa2, 0xad, 0xf7, 0xf4, 0xa5, 0x8a, 0x3e, 0x23,
+   0x64, 0x08, 0xc8, 0xa1, 0xa0, 0xf0, 0x5d, 0x70,
+   0xd2, 0x77, 0xfd, 0xc8, 0x50, 0x83, 0x0f, 0xd6,
+   0x2b, 0xe4, 0x1f, 0x52, 0x34, 0x33, 0x68, 0xfd,
+   0x92, 0xbe, 0x9f, 0x97, 0x6b, 0x8d, 0x81, 0x91,
+   0x0f, 0xef, 0x65, 0xc8, 0x0d, 0x15, 0x01, 0x77,
+   0x58, 0xb2, 0xf4, 0x1b, 0x06, 0x7e, 0xf5, 0xca,
+   0x15, 0x2e, 0x38, 0xd8, 0x81, 0x1c, 0x1c, 0xa0,
+   0xb6, 0x13, 0x6a, 0x2b, 0x71, 0x34, 0x52, 0xd7,
+   0x1d, 0xbd, 0x37, 0x59, 0xbc, 0x86, 0x25, 0x2b,
+   0xa8, 0x93, 0xce, 0x1a, 0x03, 0x16, 0xfe, 0x01,
+   0x57, 0x99, 0x24, 0x25, 0x2c, 0xb3, 0xab, 0x1e,
+   0x2d, 0x65, 0x20, 0x89, 0x17, 0x02, 0x0e, 0x0a,
+   0xf5, 0x1e, 0xc7, 0xff, 0x1f, 0x61, 0xa9, 0x54,
+   0x18, 0xd4, 0xba, 0x50, 0x57, 0x02, 0xa1, 0xab,
+   0x22, 0x2e, 0x07, 0xea, 0xa9, 0xa3, 0x83, 0x4f,
+   0x27, 0xf5, 0xc5, 0xee, 0x3c, 0x3b, 0x10, 0xad,
+   0x32, 0x2b, 0x1c, 0x03, 0xcb, 0xaf, 0x98, 0x83,
+   0x54, 0xc3, 0x68, 0x63, 0xd4, 0xe0, 0x0e, 0x3c,
+   0x1a, 0x4e, 0xc0, 0x81, 0xd0, 0xe8, 0x6a, 0x62,
+   0x6b, 0x3e, 0x6f, 0xc4, 0xc6, 0x33, 0x4e, 0x26,
+   0x21, 0xf5, 0x04, 0xdf, 0xfa, 0xce, 0x45, 0xaf,
+   0xdc, 0x5e, 0x1b, 0xad, 0x93, 0xca, 0xf5, 0xcf,
+   0xd7, 0xee, 0x0c, 0x5c, 0x5e, 0xb4, 0xf0, 0x92,
+   0xd2, 0xf2, 0xf0, 0xa9, 0x1e, 0xab, 0x80, 0x68,
+   0x46, 0xef, 0xcc, 0x26, 0x0c, 0x5c, 0xdd, 0x4e,
+   0x83, 0xb8, 0xb9, 0x53, 0x6e, 0xf8, 0x93, 0x38