Re: [PATCH] treewide: remove duplicate includes
On Mon, Dec 04, 2017 at 03:19:39AM +0530, Pravin Shedge wrote: > These duplicate includes have been found with scripts/checkincludes.pl but > they have been removed manually to avoid removing false positives. > > Unit Testing: > > - build successful > - LTP testsuite passes. > - checkpatch.pl passes > > Signed-off-by: Pravin Shedge> diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c > index 9c42c4e..ab3aef2 100644 > --- a/fs/xfs/scrub/scrub.c > +++ b/fs/xfs/scrub/scrub.c These look reasonable, but please send me (and linux-xfs) the three xfs changes separately so that I can add them to the xfs tree. (Also, thank you for cc'ing the xfs list for this treewide change...) --D
Re: crypto: Work around deallocated stack frame reference gcc bug on sparc.
[add ext4 list to cc] On Fri, Jun 02, 2017 at 11:28:54AM -0400, David Miller wrote: > > On sparc, if we have an alloca() like situation, as is the case with > SHASH_DESC_ON_STACK(), we can end up referencing deallocated stack > memory. The result can be that the value is clobbered if a trap > or interrupt arrives at just the right instruction. > > It only occurs if the function ends returning a value from that > alloca() area and that value can be placed into the return value > register using a single instruction. > > For example, in lib/libcrc32c.c:crc32c() we end up with a return > sequence like: > > return %i7+8 > lduw [%o5+16], %o0 ! MEM[(u32 *)__shash_desc.1_10 + 16B], > > %o5 holds the base of the on-stack area allocated for the shash > descriptor. But the return released the stack frame and the > register window. > > So if an interrupt arrives between 'return' and 'lduw', then > the value read at %o5+16 can be corrupted. > > Add a data compiler barrier to work around this problem. This is > exactly what the gcc fix will end up doing as well, and it absolutely > should not change the code generated for other cpus (unless gcc > on them has the same bug :-) > > With crucial insight from Eric Sandeen. > > Reported-by: Anatoly Pugachev> Signed-off-by: David S. Miller > --- > > See the thread anchored at: > > http://marc.info/?l=linux-sparc=149623182616944=2 > > for discussion, it has a reproducer module. The problem was > first noticed as occasional XFS checksum corruptions. > > Herbert, I don't expect you to like this but it is the best we can do > I think. It should not pessimize code on other architectures at all. > I will work on fixing the gcc bug but it's been around forever and all > versions are affected. > > I noticed while working on this that at least btrfs duplicates the > facilities provided by lib/libcrc32c.c and therefore should probably > be converted over to straight crc32c() calls if possible. 
ext4/jbd2's crc32c implementations will also need a fix like this for {ext4,jbd2}_chksum. Note that both of these modules call the crypto api directly to avoid a static dependence on libcrc32c; this was done to reduce kernel footprint for applications that don't need it. (ext2, ext3, and ext4 before the metadata_csum feature existed). --D > > Thanks! > > diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h > index ecdba2f..1ac5b85 100644 > --- a/drivers/infiniband/sw/rxe/rxe.h > +++ b/drivers/infiniband/sw/rxe/rxe.h > @@ -68,6 +68,7 @@ > static inline u32 rxe_crc32(struct rxe_dev *rxe, > u32 crc, void *next, size_t len) > { > + u32 retval; > int err; > > SHASH_DESC_ON_STACK(shash, rxe->tfm); > @@ -81,7 +82,9 @@ static inline u32 rxe_crc32(struct rxe_dev *rxe, > return crc32_le(crc, next, len); > } > > - return *(u32 *)shash_desc_ctx(shash); > + retval = *(u32 *)shash_desc_ctx(shash); > + barrier_data(shash_desc_ctx(shash)); > + return retval; > } > > int rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); > diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c > index a97fdc1..baacc18 100644 > --- a/fs/btrfs/hash.c > +++ b/fs/btrfs/hash.c > @@ -38,6 +38,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int > length) > { > SHASH_DESC_ON_STACK(shash, tfm); > u32 *ctx = (u32 *)shash_desc_ctx(shash); > + u32 retval; > int err; > > shash->tfm = tfm; > @@ -47,5 +48,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int > length) > err = crypto_shash_update(shash, address, length); > BUG_ON(err); > > - return *ctx; > + retval = *ctx; > + barrier_data(ctx); > + return retval; > } > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h > index 2185c7a..fd2e651 100644 > --- a/fs/f2fs/f2fs.h > +++ b/fs/f2fs/f2fs.h > @@ -1078,6 +1078,7 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, > const void *address, > { > SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver); > u32 *ctx = (u32 *)shash_desc_ctx(shash); > + u32 retval; > int err; > > 
shash->tfm = sbi->s_chksum_driver; > @@ -1087,7 +1088,9 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, > const void *address, > err = crypto_shash_update(shash, address, length); > BUG_ON(err); > > - return *ctx; > + retval = *ctx; > + barrier_data(ctx); > + return retval; > } > > static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, > diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c > index 74a54b7..9f79547 100644 > --- a/lib/libcrc32c.c > +++ b/lib/libcrc32c.c > @@ -43,7 +43,7 @@ static struct crypto_shash *tfm; > u32 crc32c(u32 crc, const void *address, unsigned int length) > { > SHASH_DESC_ON_STACK(shash, tfm); > - u32 *ctx = (u32 *)shash_desc_ctx(shash); > + u32 ret, *ctx
[PATCH 01/13] crc32: removed two instances of trailing whitespaces
- remove trailing whitespace from lib/crc32.c - remove trailing whitespace from lib/crc32defs.h From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c |2 +- lib/crc32defs.h |2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 4b35d2b..ffea0c9 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -317,7 +317,7 @@ EXPORT_SYMBOL(crc32_be); * in the correct multiple to subtract, we can shift a byte at a time. * This produces a 40-bit (rather than a 33-bit) intermediate remainder, * but again the multiple of the polynomial to subtract depends only on - * the high bits, the high 8 bits in this case. + * the high bits, the high 8 bits in this case. * * The multiple we need in that case is the low 32 bits of a 40-bit * value whose high 8 bits are given, and which is a multiple of the diff --git a/lib/crc32defs.h b/lib/crc32defs.h index 9b6773d..f5a5401 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -8,7 +8,7 @@ /* How many bits at a time to use. Requires a table of 4CRC_xx_BITS bytes. */ /* For less performance-sensitive, use 4 */ -#ifndef CRC_LE_BITS +#ifndef CRC_LE_BITS # define CRC_LE_BITS 8 #endif #ifndef CRC_BE_BITS -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/13] crc32: Simplify unit test code
Replaced the unit test provided in crc32.c, which doesn't have a makefile and doesn't compile with current headers, with a simpler self test routine that also gives a measure of performance and runs at module init time. The self test option can be enabled through a configuration option CONFIG_CRC32_SELFTEST. The test stresses the pre and post loops and is thus not very realistic since actual uses will likely have addresses and lengths that are at least 4 byte aligned. However, the main loop is long enough so that the performance is dominated by that loop. The expected values for crc32_le and crc32_be were generated with the original version of crc32.c using CRC_BITS_LE = 8 and CRC_BITS_BE = 8. These values were then used to check all the values of the BITS parameters in both the original and new versions. The performance results show some variability from run to run in spite of attempts to both warm the cache and reduce the amount of OS noise by limiting interrupts during the test. To get comparable results and to analyse options wrt performance the best time reported over a small sample of runs has been taken. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/Kconfig | 10 + lib/crc32.c | 798 ++- 2 files changed, 691 insertions(+), 117 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index 201e1b3..4656dff 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -67,6 +67,16 @@ config CRC32 kernel tree does. Such modules that use library CRC32 functions require M here. +config CRC32_SELFTEST + bool CRC32 perform self test on init + default n + depends on CRC32 + help + This option enables the CRC32 library functions to perform a + self test on initialization. 
The self test computes crc32_le + and crc32_be over byte strings with random alignment and length + and computes the total elapsed time and number of bytes processed. + config CRC7 tristate CRC7 functions help diff --git a/lib/crc32.c b/lib/crc32.c index c3ce94a..996115d 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -211,137 +211,701 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) EXPORT_SYMBOL(crc32_le); EXPORT_SYMBOL(crc32_be); -#ifdef UNITTEST +#ifdef CONFIG_CRC32_SELFTEST -#include stdlib.h -#include stdio.h - -#if 0 /*Not used at present */ -static void -buf_dump(char const *prefix, unsigned char const *buf, size_t len) +/* 4096 random bytes */ +static u8 __attribute__((__aligned__(8))) test_buf[] = { - fputs(prefix, stdout); - while (len--) - printf( %02x, *buf++); - putchar('\n'); - -} -#endif - -static void bytereverse(unsigned char *buf, size_t len) + 0x5b, 0x85, 0x21, 0xcb, 0x09, 0x68, 0x7d, 0x30, + 0xc7, 0x69, 0xd7, 0x30, 0x92, 0xde, 0x59, 0xe4, + 0xc9, 0x6e, 0x8b, 0xdb, 0x98, 0x6b, 0xaa, 0x60, + 0xa8, 0xb5, 0xbc, 0x6c, 0xa9, 0xb1, 0x5b, 0x2c, + 0xea, 0xb4, 0x92, 0x6a, 0x3f, 0x79, 0x91, 0xe4, + 0xe9, 0x70, 0x51, 0x8c, 0x7f, 0x95, 0x6f, 0x1a, + 0x56, 0xa1, 0x5c, 0x27, 0x03, 0x67, 0x9f, 0x3a, + 0xe2, 0x31, 0x11, 0x29, 0x6b, 0x98, 0xfc, 0xc4, + 0x53, 0x24, 0xc5, 0x8b, 0xce, 0x47, 0xb2, 0xb9, + 0x32, 0xcb, 0xc1, 0xd0, 0x03, 0x57, 0x4e, 0xd4, + 0xe9, 0x3c, 0xa1, 0x63, 0xcf, 0x12, 0x0e, 0xca, + 0xe1, 0x13, 0xd1, 0x93, 0xa6, 0x88, 0x5c, 0x61, + 0x5b, 0xbb, 0xf0, 0x19, 0x46, 0xb4, 0xcf, 0x9e, + 0xb6, 0x6b, 0x4c, 0x3a, 0xcf, 0x60, 0xf9, 0x7a, + 0x8d, 0x07, 0x63, 0xdb, 0x40, 0xe9, 0x0b, 0x6f, + 0xad, 0x97, 0xf1, 0xed, 0xd0, 0x1e, 0x26, 0xfd, + 0xbf, 0xb7, 0xc8, 0x04, 0x94, 0xf8, 0x8b, 0x8c, + 0xf1, 0xab, 0x7a, 0xd4, 0xdd, 0xf3, 0xe8, 0x88, + 0xc3, 0xed, 0x17, 0x8a, 0x9b, 0x40, 0x0d, 0x53, + 0x62, 0x12, 0x03, 0x5f, 0x1b, 0x35, 0x32, 0x1f, + 0xb4, 0x7b, 0x93, 0x78, 0x0d, 0xdb, 0xce, 0xa4, + 0xc0, 0x47, 0xd5, 0xbf, 0x68, 0xe8, 0x5d, 0x74, + 0x8f, 
0x8e, 0x75, 0x1c, 0xb2, 0x4f, 0x9a, 0x60, + 0xd1, 0xbe, 0x10, 0xf4, 0x5c, 0xa1, 0x53, 0x09, + 0xa5, 0xe0, 0x09, 0x54, 0x85, 0x5c, 0xdc, 0x07, + 0xe7, 0x21, 0x69, 0x7b, 0x8a, 0xfd, 0x90, 0xf1, + 0x22, 0xd0, 0xb4, 0x36, 0x28, 0xe6, 0xb8, 0x0f, + 0x39, 0xde, 0xc8, 0xf3, 0x86, 0x60, 0x34, 0xd2, + 0x5e, 0xdf, 0xfd, 0xcf, 0x0f, 0xa9, 0x65, 0xf0, + 0xd5, 0x4d, 0x96, 0x40, 0xe3, 0xdf, 0x3f, 0x95, + 0x5a, 0x39, 0x19, 0x93, 0xf4, 0x75, 0xce, 0x22, + 0x00, 0x1c, 0x93, 0xe2, 0x03, 0x66, 0xf4, 0x93, + 0x73, 0x86, 0x81, 0x8e, 0x29, 0x44, 0x48, 0x86, + 0x61, 0x7c, 0x48, 0xa3, 0x43, 0xd2, 0x9c, 0x8d, + 0xd4, 0x95, 0xdd, 0xe1, 0x22, 0x89, 0x3a, 0x40, + 0x4c, 0x1b, 0x8a, 0x04, 0xa8, 0x09, 0x69, 0x8b, + 0xea, 0xc6, 0x55
[PATCH 04/13] crc32: Miscellaneous cleanups
Misc cleanup of lib/crc32.c and related files - removed unnecessary header files. - straightened out some convoluted ifdef's - rewrote some references to 2 dimensional arrays as 1 dimensional arrays to make them correct. I.e. replaced tab[i] with tab[0][i]. - a few trivial whitespace changes - fixed a warning in gen_crc32tables.c caused by a mismatch in the type of the pointer passed to output table. Since the table is only used at kernel compile time, it is simpler to make the table big enough to hold the largest column size used. One cannot make the column size smaller in output_table because it has to be used by both the le and be tables and they can have different column sizes. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 104 +- lib/gen_crc32table.c |6 +-- 2 files changed, 39 insertions(+), 71 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 996115d..bf03922 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -23,13 +23,10 @@ /* see: Documentation/crc32.txt for a description of algorithms */ #include linux/crc32.h -#include linux/kernel.h #include linux/module.h -#include linux/compiler.h #include linux/types.h -#include linux/init.h -#include linux/atomic.h #include crc32defs.h + #if CRC_LE_BITS == 8 # define tole(x) __constant_cpu_to_le32(x) #else @@ -41,6 +38,7 @@ #else # define tobe(x) (x) #endif + #include crc32table.h MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com); @@ -96,6 +94,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) #undef DO_CRC4 } #endif + /** * crc32_le() - Calculate bitwise little-endian Ethernet AUTODIN II CRC32 * @crc: seed value for computation. 
~0 for Ethernet, sometimes 0 for @@ -103,53 +102,39 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) * @p: pointer to buffer over which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len); - -#if CRC_LE_BITS == 1 -/* - * In fact, the table-based code will work in this case, but it can be - * simplified by inlining the table in ?: form. - */ - u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) { +#if CRC_LE_BITS == 1 int i; while (len--) { crc ^= *p++; for (i = 0; i 8; i++) crc = (crc 1) ^ ((crc 1) ? CRCPOLY_LE : 0); } - return crc; -} -#else /* Table-based approach */ - -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) -{ -# if CRC_LE_BITS == 8 - const u32 (*tab)[] = crc32table_le; - - crc = __cpu_to_le32(crc); - crc = crc32_body(crc, p, len, tab); - return __le32_to_cpu(crc); -# elif CRC_LE_BITS == 4 +# elif CRC_LE_BITS == 2 while (len--) { crc ^= *p++; - crc = (crc 4) ^ crc32table_le[crc 15]; - crc = (crc 4) ^ crc32table_le[crc 15]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; } - return crc; -# elif CRC_LE_BITS == 2 +# elif CRC_LE_BITS == 4 while (len--) { crc ^= *p++; - crc = (crc 2) ^ crc32table_le[crc 3]; - crc = (crc 2) ^ crc32table_le[crc 3]; - crc = (crc 2) ^ crc32table_le[crc 3]; - crc = (crc 2) ^ crc32table_le[crc 3]; + crc = (crc 4) ^ crc32table_le[0][crc 15]; + crc = (crc 4) ^ crc32table_le[0][crc 15]; } +# elif CRC_LE_BITS == 8 + const u32 (*tab)[] = crc32table_le; + + crc = __cpu_to_le32(crc); + crc = crc32_body(crc, p, len, tab); + crc = __le32_to_cpu(crc); +#endif return crc; -# endif } -#endif +EXPORT_SYMBOL(crc32_le); /** * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 @@ -158,16 +143,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) * @p: pointer to buffer over 
which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len); - -#if CRC_BE_BITS == 1 -/* - * In fact, the table-based code will work in this case, but it can be - * simplified by inlining the table in ?: form. - */ - u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) { +#if CRC_BE_BITS == 1 int i; while (len--) { crc ^= *p++ 24; @@ -176,39 +154,29 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len
[PATCH 06/13] crc32: Make CRC_*_BITS definition correspond to actual bit counts
crc32.c provides a choice of one of several algorithms for computing the LSB and MSB versions of the CRC32 checksum based on the parameters CRC_LE_BITS and CRC_BE_BITS. In the original version the values 1, 2, 4 and 8 respectively selected versions of the algorithm that computed the crc 1, 2, 4 and 32 bits at a time. This patch series adds a new version that computes the CRC 64 bits at a time. To make things easier to understand the parameter has been reinterpreted to actually stand for the number of bits processed in each step of the algorithm so that the old value 8 has been replaced with the value 32. This also allows us to add in a widely used crc algorithm that computes the crc 8 bits at a time called the Sarwate algorithm. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 17 ++--- lib/crc32defs.h | 18 ++ lib/gen_crc32table.c | 11 ++- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 7394288..5971f2a 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -27,13 +27,13 @@ #include linux/types.h #include crc32defs.h -#if CRC_LE_BITS == 8 +#if CRC_LE_BITS 8 # define tole(x) ((__force u32) __constant_cpu_to_le32(x)) #else # define tole(x) (x) #endif -#if CRC_BE_BITS == 8 +#if CRC_BE_BITS 8 # define tobe(x) ((__force u32) __constant_cpu_to_be32(x)) #else # define tobe(x) (x) @@ -45,7 +45,7 @@ MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com); MODULE_DESCRIPTION(Ethernet CRC32 calculations); MODULE_LICENSE(GPL); -#if CRC_LE_BITS == 8 || CRC_BE_BITS == 8 +#if CRC_LE_BITS 8 || CRC_BE_BITS 8 static inline u32 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) @@ -126,6 +126,12 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) crc = (crc 4) ^ crc32table_le[0][crc 15]; } # elif CRC_LE_BITS == 8 + /* aka Sarwate algorithm */ 
+ while (len--) { + crc ^= *p++; + crc = (crc 8) ^ crc32table_le[0][crc 255]; + } +# else const u32 (*tab)[] = crc32table_le; crc = (__force u32) __cpu_to_le32(crc); @@ -169,6 +175,11 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) crc = (crc 4) ^ crc32table_be[0][crc 28]; } # elif CRC_BE_BITS == 8 + while (len--) { + crc ^= *p++ 24; + crc = (crc 8) ^ crc32table_be[0][crc 24]; + } +# else const u32 (*tab)[] = crc32table_be; crc = (__force u32) __cpu_to_be32(crc); diff --git a/lib/crc32defs.h b/lib/crc32defs.h index f5a5401..daa3a5e 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -6,27 +6,29 @@ #define CRCPOLY_LE 0xedb88320 #define CRCPOLY_BE 0x04c11db7 -/* How many bits at a time to use. Requires a table of 4CRC_xx_BITS bytes. */ -/* For less performance-sensitive, use 4 */ +/* How many bits at a time to use. Valid values are 1, 2, 4, 8, and 32. */ +/* For less performance-sensitive, use 4 or 8 */ #ifndef CRC_LE_BITS -# define CRC_LE_BITS 8 +# define CRC_LE_BITS 32 #endif #ifndef CRC_BE_BITS -# define CRC_BE_BITS 8 +# define CRC_BE_BITS 32 #endif /* * Little-endian CRC computation. Used with serial bit streams sent * lsbit-first. Be sure to use cpu_to_le32() to append the computed CRC. */ -#if CRC_LE_BITS 8 || CRC_LE_BITS 1 || CRC_LE_BITS CRC_LE_BITS-1 -# error CRC_LE_BITS must be a power of 2 between 1 and 8 +#if CRC_LE_BITS 32 || CRC_LE_BITS 1 || CRC_LE_BITS == 16 || \ + CRC_LE_BITS CRC_LE_BITS-1 +# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32} #endif /* * Big-endian CRC computation. Used with serial bit streams sent * msbit-first. Be sure to use cpu_to_be32() to append the computed CRC. 
*/ -#if CRC_BE_BITS 8 || CRC_BE_BITS 1 || CRC_BE_BITS CRC_BE_BITS-1 -# error CRC_BE_BITS must be a power of 2 between 1 and 8 +#if CRC_BE_BITS 32 || CRC_BE_BITS 1 || CRC_BE_BITS == 16 || \ + CRC_BE_BITS CRC_BE_BITS-1 +# error CRC_BE_BITS must be one of {1, 2, 4, 8, 32} #endif diff --git a/lib/gen_crc32table.c b/lib/gen_crc32table.c index eced769..99ac744 100644 --- a/lib/gen_crc32table.c +++ b/lib/gen_crc32table.c @@ -4,8 +4,17 @@ #define ENTRIES_PER_LINE 4 +#if CRC_LE_BITS = 8 #define LE_TABLE_SIZE (1 CRC_LE_BITS) +#else +#define LE_TABLE_SIZE 256 +#endif + +#if CRC_BE_BITS = 8 #define BE_TABLE_SIZE (1 CRC_BE_BITS) +#else +#define BE_TABLE_SIZE 256 +#endif static uint32_t crc32table_le[4][256]; static uint32_t crc32table_be[4][256]; @@ -24,7 +33,7 @@ static void crc32init_le(void) crc32table_le[0][0] = 0; - for (i = 1 (CRC_LE_BITS - 1); i; i = 1) { + for (i = LE_TABLE_SIZE 1; i; i = 1) { crc = (crc 1) ^ ((crc 1) ? CRCPOLY_LE : 0
[PATCH 07/13] crc32: Add slice-by-8 algorithm to existing code
add slicing-by-8 algorithm to the existing slicing-by-4 algorithm. This consists of: - extend largest BITS size from 32 to 64 - extend tables from tab[4][256] to up to tab[8][256] - Add code for inner loop. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 38 +++--- lib/crc32defs.h | 29 + lib/gen_crc32table.c | 43 +++ 3 files changed, 75 insertions(+), 35 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 5971f2a..826e163 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -47,25 +47,28 @@ MODULE_LICENSE(GPL); #if CRC_LE_BITS 8 || CRC_BE_BITS 8 +/* implements slicing-by-4 or slicing-by-8 algorithm */ static inline u32 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) { # ifdef __LITTLE_ENDIAN # define DO_CRC(x) crc = t0[(crc ^ (x)) 255] ^ (crc 8) -# define DO_CRC4 crc = t3[(crc) 255] ^ \ - t2[(crc 8) 255] ^ \ - t1[(crc 16) 255] ^ \ - t0[(crc 24) 255] +# define DO_CRC4 (t3[(q) 255] ^ t2[(q 8) 255] ^ \ + t1[(q 16) 255] ^ t0[(q 24) 255]) +# define DO_CRC8 (t7[(q) 255] ^ t6[(q 8) 255] ^ \ + t5[(q 16) 255] ^ t4[(q 24) 255]) # else # define DO_CRC(x) crc = t0[((crc 24) ^ (x)) 255] ^ (crc 8) -# define DO_CRC4 crc = t0[(crc) 255] ^ \ - t1[(crc 8) 255] ^ \ - t2[(crc 16) 255] ^ \ - t3[(crc 24) 255] +# define DO_CRC4 (t0[(q) 255] ^ t1[(q 8) 255] ^ \ + t2[(q 16) 255] ^ t3[(q 24) 255]) +# define DO_CRC8 (t4[(q) 255] ^ t5[(q 8) 255] ^ \ + t6[(q 16) 255] ^ t7[(q 24) 255]) # endif const u32 *b; size_trem_len; const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3]; + const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7]; + u32 q; /* Align it */ if (unlikely((long)buf 3 len)) { @@ -73,13 +76,25 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) DO_CRC(*buf++); } while ((--len) ((long)buf)3); } + +# if CRC_LE_BITS == 32 rem_len = len 3; - /* 
load data 32 bits wide, xor data 32 bits wide. */ len = len 2; +# else + rem_len = len 7; + len = len 3; +# endif + b = (const u32 *)buf; for (--b; len; --len) { - crc ^= *++b; /* use pre increment for speed */ - DO_CRC4; + q = crc ^ *++b; /* use pre increment for speed */ +# if CRC_LE_BITS == 32 + crc = DO_CRC4; +# else + crc = DO_CRC8; + q = *++b; + crc ^= DO_CRC4; +# endif } len = rem_len; /* And the last few bytes */ @@ -92,6 +107,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) return crc; #undef DO_CRC #undef DO_CRC4 +#undef DO_CRC8 } #endif diff --git a/lib/crc32defs.h b/lib/crc32defs.h index daa3a5e..8181592 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -6,29 +6,42 @@ #define CRCPOLY_LE 0xedb88320 #define CRCPOLY_BE 0x04c11db7 -/* How many bits at a time to use. Valid values are 1, 2, 4, 8, and 32. */ -/* For less performance-sensitive, use 4 or 8 */ +/* + * How many bits at a time to use. Valid values are 1, 2, 4, 8, 32 and 64. + * For less performance-sensitive, use 4 or 8 to save table size. + * For larger systems choose same as CPU architecture as default. + * This works well on X86_64, SPARC64 systems. This may require some + * elaboration after experiments with other architectures. + */ #ifndef CRC_LE_BITS -# define CRC_LE_BITS 32 +# ifdef CONFIG_64BIT +# define CRC_LE_BITS 64 +# else +# define CRC_LE_BITS 32 +# endif #endif #ifndef CRC_BE_BITS -# define CRC_BE_BITS 32 +# ifdef CONFIG_64BIT +# define CRC_BE_BITS 64 +# else +# define CRC_BE_BITS 32 +# endif #endif /* * Little-endian CRC computation. Used with serial bit streams sent * lsbit-first. Be sure to use cpu_to_le32() to append the computed CRC. */ -#if CRC_LE_BITS 32 || CRC_LE_BITS 1 || CRC_LE_BITS == 16 || \ +#if CRC_LE_BITS 64 || CRC_LE_BITS 1 || CRC_LE_BITS == 16 || \ CRC_LE_BITS CRC_LE_BITS-1 -# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32} +# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32, 64} #endif /* * Big-endian CRC computation. 
Used with serial bit streams sent * msbit-first. Be sure to use cpu_to_be32() to append the computed CRC. */ -#if CRC_BE_BITS 32 || CRC_BE_BITS 1 || CRC_BE_BITS == 16 || \ +#if CRC_BE_BITS 64 || CRC_BE_BITS 1 || CRC_BE_BITS == 16 || \ CRC_BE_BITS CRC_BE_BITS-1
[PATCH 09/13] crc32: Add note about this patchset to crc32.c
Some final changes - added a comment at the top of crc32.c From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 4eac9c7..a1a5145 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -1,4 +1,8 @@ /* + * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin + * cleaned up code to current version of sparse and added the slicing-by-8 + * algorithm to the closely similar existing slicing-by-4 algorithm. + * * Oct 15, 2000 Matt Domsch matt_dom...@dell.com * Nicer crc32 functions/docs submitted by li...@horizon.com. Thanks! * Code was from the public domain, copyright abandoned. Code was -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5.4 00/13] crc32c: Add faster algorithm and self-test code
Hi all, This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a software crc32c implementation. It removes the crc32c implementation in crypto/ in favor of using the stamped-out one in lib/. There is also a change to Kconfig so that the kernel builder can pick an implementation best suited for the hardware. The motivation for this patchset is that I am working on adding full metadata checksumming to ext4. As far as performance impact of adding checksumming goes, I see nearly no change with a standard mail server ffsb simulation. On a test that involves only file creation and deletion and extent tree writes, I see a drop of about 50 percent with the current kernel crc32c implementation; this improves to a drop of about 20 percent with the enclosed crc32c code. For typical workloads, this new implementation doesn't help much because metadata is usually a small fraction of total IO. However, when we are doing IO that is almost all metadata (such as rm -rf'ing a tree), then this patch speeds up the operation substantially. Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this patchset should improve their speed as well. I have not yet quantified that, however. This latest submission combines Bob's patches from late August 2011 with mine so that they can be one coherent patch set. Please excuse my inability to combine some of the patches; I've been advised to leave Bob's patches alone and build atop them instead. :/ Since the last posting, I've also collected some crc32c test results on a bunch of different x86/powerpc/sparc platforms. The results can be viewed here: http://goo.gl/sgt3i ; the crc32-kern-le and crc32c columns describe the performance of the kernel's current crc32 and crc32c software implementations. The crc32c-by8-le column shows crc32c performance with this patchset applied. I expect crc32 performance to be roughly the same. 
The two _boost columns at the right side of the spreadsheet show how much faster the new implementation is over the old one. As you can see, crc32 rises substantially, and crc32c experiences a huge increase. v2: Use the crypto testmgr api for self-test. v3: Get rid of the -be version, which had no users. v4: Allow kernel builder a choice of speed vs. space optimization. v5: Reuse lib/crc32 for crc32c as well, and make crypto/crc32c use lib/crc32.c. v5.1: Include Bob Pearson's patches in submission request. v5.2: Fix changelogs for Bob's patches per akpm request. v5.3: Fix from header bug in patch mail generation scripts. v5.4: Rebase against next-20120118 per akpm request. One patch was already committed, which shortens this patchset. --D -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 05/13] crc32: Fix mixing of endian-specific types
crc32.c in its original version freely mixed u32, __le32 and __be32 types which caused warnings from sparse with __CHECK_ENDIAN__. This patch fixes these by forcing the types to u32. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 12 ++-- 1 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index bf03922..7394288 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -28,13 +28,13 @@ #include crc32defs.h #if CRC_LE_BITS == 8 -# define tole(x) __constant_cpu_to_le32(x) +# define tole(x) ((__force u32) __constant_cpu_to_le32(x)) #else # define tole(x) (x) #endif #if CRC_BE_BITS == 8 -# define tobe(x) __constant_cpu_to_be32(x) +# define tobe(x) ((__force u32) __constant_cpu_to_be32(x)) #else # define tobe(x) (x) #endif @@ -128,9 +128,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) # elif CRC_LE_BITS == 8 const u32 (*tab)[] = crc32table_le; - crc = __cpu_to_le32(crc); + crc = (__force u32) __cpu_to_le32(crc); crc = crc32_body(crc, p, len, tab); - crc = __le32_to_cpu(crc); + crc = __le32_to_cpu((__force __le32)crc); #endif return crc; } @@ -171,9 +171,9 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) # elif CRC_BE_BITS == 8 const u32 (*tab)[] = crc32table_be; - crc = __cpu_to_be32(crc); + crc = (__force u32) __cpu_to_be32(crc); crc = crc32_body(crc, p, len, tab); - crc = __be32_to_cpu(crc); + crc = __be32_to_cpu((__force __be32)crc); # endif return crc; } -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 13/13] crc32: Select an algorithm via kconfig
Allow the kernel builder to choose a crc32* algorithm for the kernel. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/Kconfig | 43 +++ lib/crc32defs.h | 18 ++ 2 files changed, 61 insertions(+), 0 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index 58da52d..13e1afa 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -77,6 +77,49 @@ config CRC32_SELFTEST and crc32_be over byte strings with random alignment and length and computes the total elapsed time and number of bytes processed. +choice + prompt CRC32 implementation + depends on CRC32 + default CRC32_SLICEBY8 + +config CRC32_SLICEBY8 + bool Slice by 8 bytes + help + Calculate checksum 8 bytes at a time with a clever slicing algorithm. + This is the fastest algorithm, but comes with a 8KiB lookup table. + Most modern processors have enough cache to hold this table without + thrashing the cache. + + This is the default implementation choice. Choose this one unless + you have a good reason not to. + +config CRC32_SLICEBY4 + bool Slice by 4 bytes + help + Calculate checksum 4 bytes at a time with a clever slicing algorithm. + This is a bit slower than slice by 8, but has a smaller 4KiB lookup + table. + + Only choose this option if you know what you are doing. + +config CRC32_SARWATE + bool Sarwate's Algorithm (one byte at a time) + help + Calculate checksum a byte at a time using Sarwate's algorithm. This + is not particularly fast, but has a small 256 byte lookup table. + + Only choose this option if you know what you are doing. + +config CRC32_BIT + bool Classic Algorithm (one bit at a time) + help + Calculate checksum one bit at a time. This is VERY slow, but has + no lookup table. This is provided as a debugging option. + + Only choose this option if you are debugging crc32. 
+ +endchoice + config CRC7 tristate CRC7 functions help diff --git a/lib/crc32defs.h b/lib/crc32defs.h index 6fd1917..64cba2c 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -13,6 +13,24 @@ */ #define CRC32C_POLY_LE 0x82F63B78 +/* Try to choose an implementation variant via Kconfig */ +#ifdef CONFIG_CRC32_SLICEBY8 +# define CRC_LE_BITS 64 +# define CRC_BE_BITS 64 +#endif +#ifdef CONFIG_CRC32_SLICEBY4 +# define CRC_LE_BITS 32 +# define CRC_BE_BITS 32 +#endif +#ifdef CONFIG_CRC32_SARWATE +# define CRC_LE_BITS 8 +# define CRC_BE_BITS 8 +#endif +#ifdef CONFIG_CRC32_BIT +# define CRC_LE_BITS 1 +# define CRC_BE_BITS 1 +#endif + /* * How many bits at a time to use. Valid values are 1, 2, 4, 8, 32 and 64. * For less performance-sensitive, use 4 or 8 to save table size. -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 11/13] crypto: crc32c should use library implementation
Since lib/crc32.c now provides crc32c, remove the software implementation here and call the library function instead. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- crypto/Kconfig |1 + crypto/crc32c.c | 94 ++- 2 files changed, 4 insertions(+), 91 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index e6cfe1a..29f4d73 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -308,6 +308,7 @@ comment Digest config CRYPTO_CRC32C tristate CRC32c CRC algorithm select CRYPTO_HASH + select CRC32 help Castagnoli, et al Cyclic Redundancy-Check Algorithm. Used by iSCSI for header and data digests and by others. diff --git a/crypto/crc32c.c b/crypto/crc32c.c index 3f9ad28..06f7018 100644 --- a/crypto/crc32c.c +++ b/crypto/crc32c.c @@ -40,6 +40,7 @@ #include linux/module.h #include linux/string.h #include linux/kernel.h +#include linux/crc32.h #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -53,95 +54,6 @@ struct chksum_desc_ctx { }; /* - * This is the CRC-32C table - * Generated with: - * width = 32 bits - * poly = 0x1EDC6F41 - * reflect input bytes = true - * reflect output bytes = true - */ - -static const u32 crc32c_table[256] = { - 0xL, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, - 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, - 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, - 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, - 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, - 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, - 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L, - 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, - 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL, - 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, - 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, - 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, - 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, - 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL, - 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, - 0x7DA08661L, 0x8FCB0562L, 
0x9C9BF696L, 0x6EF07595L, - 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, - 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L, - 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, - 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, - 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, - 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, - 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L, - 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, - 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L, - 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, - 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, - 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L, - 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, - 0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L, - 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, - 0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L, - 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, - 0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L, - 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L, - 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL, - 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L, - 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, - 0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL, - 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, - 0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L, - 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL, - 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL, - 0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L, - 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL, - 0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L, - 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, - 0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, - 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L, - 0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL, - 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL, - 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L, - 0xD3D3E1ABL, 0x21B862A8L, 
0x32E8915CL, 0xC083125FL, - 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L, - 0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L, - 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL, - 0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, - 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L, - 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L, - 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL, - 0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L, - 0x34F4F86AL, 0xC69F7B69L
[PATCH 12/13] crc32: Add self-test code for crc32c
Add self-test code for crc32c. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 363 ++- 1 files changed, 261 insertions(+), 102 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 22e3643..ebc5911 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -765,113 +765,265 @@ static struct crc_test { u32 length; /* random 11 bit length of test */ u32 crc_le; /* expected crc32_le result */ u32 crc_be; /* expected crc32_be result */ + u32 crc32c_le; /* expected crc32c_le result */ } test[] = { - {0x674bf11d, 0x0038, 0x0542, 0x0af6d466, 0xd8b6e4c1}, - {0x35c672c6, 0x003a, 0x01aa, 0xc6d3dfba, 0x28aaf3ad}, - {0x496da28e, 0x0039, 0x05af, 0xd933660f, 0x5d57e81f}, - {0x09a9b90e, 0x0027, 0x01f8, 0xb45fe007, 0xf45fca9a}, - {0xdc97e5a9, 0x0025, 0x03b6, 0xf81a3562, 0xe0126ba2}, - {0x47c58900, 0x000a, 0x00b9, 0x8e58eccf, 0xf3afc793}, - {0x292561e8, 0x000c, 0x0403, 0xa2ba8aaf, 0x0b797aed}, - {0x415037f6, 0x0003, 0x0676, 0xa17d52e8, 0x7f0fdf35}, - {0x3466e707, 0x0026, 0x0042, 0x258319be, 0x75c484a2}, - {0xafd1281b, 0x0023, 0x02ee, 0x4428eaf8, 0x06c7ad10}, - {0xd3857b18, 0x0028, 0x04a2, 0x5c430821, 0xb062b7cb}, - {0x1d825a8f, 0x002b, 0x050b, 0xd2c45f0c, 0xd68634e0}, - {0x5033e3bc, 0x000b, 0x0078, 0xa3ea4113, 0xac6d31fb}, - {0x94f1fb5e, 0x000f, 0x03a2, 0xfbfc50b1, 0x3cfe50ed}, - {0xc9a0fe14, 0x0009, 0x0473, 0x5fb61894, 0x87070591}, - {0x88a034b1, 0x001c, 0x05ad, 0xc1b16053, 0x46f95c67}, - {0xf0f72239, 0x0020, 0x026d, 0xa6fa58f3, 0xf8c2c1dd}, - {0xcc20a5e3, 0x003b, 0x067a, 0x7740185a, 0x308b979a}, - {0xce589c95, 0x002b, 0x0641, 0xd055e987, 0x40aae25b}, - {0x78edc885, 0x0035, 0x05be, 0xa39cb14b, 0x035b0d1f}, - {0x9d40a377, 0x003b, 0x0038, 0x1f47ccd2, 0x197fbc9d}, - {0x703d0e01, 0x003c, 0x06f1, 0x88735e7c, 0xfed57c5a}, - {0x776bf505, 0x000f, 0x05b2, 0x5cc4fc01, 0xf32efb97}, - {0x4a3e7854, 0x0027, 0x04b8, 0x8d923c82, 0x0cbfb4a2}, - {0x209172dd, 0x003b, 0x0356, 0xb89e9c2b, 0xd7868138}, - {0x3ba4cc5b, 0x002f, 0x0203, 0xe51601a9, 0x5b2a1032}, - {0xfc62f297, 0x, 0x0079, 
0x71a8e1a2, 0x5d88685f}, - {0x64280b8b, 0x0016, 0x07ab, 0x0fa7a30c, 0xda3a455f}, - {0x97dd724b, 0x0033, 0x07ad, 0x5788b2f4, 0xd7326d32}, - {0x61394b52, 0x0035, 0x0571, 0xc66525f1, 0xcabe7fef}, - {0x29b4faff, 0x0024, 0x006e, 0xca13751e, 0x993648e0}, - {0x29bfb1dc, 0x000b, 0x0244, 0x436c43f7, 0x429f7a59}, - {0x86ae934b, 0x0035, 0x0104, 0x0760ec93, 0x9cf7d0f4}, - {0xc4c1024e, 0x002e, 0x06b1, 0x6516a3ec, 0x19321f9c}, - {0x3287a80a, 0x0026, 0x0496, 0x0b257eb1, 0x754ebd51}, - {0xa4db423e, 0x0023, 0x045d, 0x9b3a66dc, 0x873e9f11}, - {0x7a1078df, 0x0015, 0x014a, 0x8c2484c5, 0x6a628659}, - {0x6048bd5b, 0x0006, 0x006a, 0x897e3559, 0xac9961af}, - {0xd8f9ea20, 0x003d, 0x0277, 0x60eb905b, 0xed2aaf99}, - {0xea5ec3b4, 0x002a, 0x04fe, 0x869965dc, 0x6c1f833b}, - {0x2dfb005d, 0x0016, 0x0345, 0x6a3b117e, 0xf05e8521}, - {0x5a214ade, 0x0020, 0x05b6, 0x467f70be, 0xcb22ccd3}, - {0xf0ab9cca, 0x0032, 0x0515, 0xed223df3, 0x7f3ef01d}, - {0x91b444f9, 0x002e, 0x07f8, 0x84e9a983, 0x5676756f}, - {0x1b5d2ddb, 0x002e, 0x012c, 0xba638c4c, 0x3f42047b}, - {0xd824d1bb, 0x003a, 0x07b5, 0x6288653b, 0x3a3ebea0}, - {0x0470180c, 0x0034, 0x01f0, 0x9d5b80d6, 0x3de08195}, - {0xffaa3a3f, 0x0036, 0x0299, 0xf3a82ab8, 0x53e0c13d}, - {0x6406cfeb, 0x0023, 0x0600, 0xa920b8e8, 0xe4e2acf4}, - {0xb24aaa38, 0x003e, 0x04a1, 0x657cc328, 0x5077b2c3}, - {0x58b2ab7c, 0x0039, 0x02b4, 0x3a17ee7e, 0x9dcb3643}, - {0x3db85970, 0x0006, 0x02b6, 0x95268b59, 0xb9812c10}, - {0x857830c5, 0x0003, 0x0590, 0x4ef439d5, 0xf042161d}, - {0xe1fcd978, 0x003e, 0x07d8, 0xae8d8699, 0xce0a1ef5}, - {0xb982a768, 0x0016, 0x06e0, 0x62fad3df, 0x5f8a067b}, - {0x1d581ce8, 0x001e, 0x058b, 0xf0f5da53, 0x26e39eee}, - {0x2456719b, 0x0025, 0x0503, 0x4296ac64, 0xd50e4c14}, - {0xfae6d8f2, 0x, 0x055d, 0x057fdf2e, 0x2a31391a}, - {0xcba828e3, 0x0039, 0x02ce, 0xe3f22351, 0x8f00877b}, - {0x13d25952, 0x000a, 0x072d, 0x76d4b4cc, 0x5eb67ec3}, - {0x0342be3f, 0x0015, 0x0599, 0xec75d9f1, 0x9d4d2826}, - {0xeaa344e0, 0x0014, 0x04d8, 0x72a4c981, 0x2064ea06}, - {0xbbb52021, 
0x003b, 0x0272
[PATCH 10/13] crc32: Bolt on crc32c
Reuse the existing crc32 code to stamp out a crc32c implementation. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- include/linux/crc32.h |2 ++ lib/Kconfig |8 +++--- lib/crc32.c | 62 +++-- lib/crc32defs.h |7 ++ lib/gen_crc32table.c | 35 ++-- 5 files changed, 80 insertions(+), 34 deletions(-) diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 391a259..68267b6 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -11,6 +11,8 @@ extern u32 crc32_le(u32 crc, unsigned char const *p, size_t len); extern u32 crc32_be(u32 crc, unsigned char const *p, size_t len); +extern u32 __crc32c_le(u32 crc, unsigned char const *p, size_t len); + #define crc32(seed, data, length) crc32_le(seed, (unsigned char const *)(data), length) /* diff --git a/lib/Kconfig b/lib/Kconfig index 4656dff..58da52d 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -58,14 +58,14 @@ config CRC_ITU_T functions require M here. config CRC32 - tristate CRC32 functions + tristate CRC32/CRC32c functions default y select BITREVERSE help This option is provided for the case where no in-kernel-tree - modules require CRC32 functions, but a module built outside the - kernel tree does. Such modules that use library CRC32 functions - require M here. + modules require CRC32/CRC32c functions, but a module built outside + the kernel tree does. Such modules that use library CRC32/CRC32c + functions require M here. 
config CRC32_SELFTEST bool CRC32 perform self test on init diff --git a/lib/crc32.c b/lib/crc32.c index a1a5145..22e3643 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -46,7 +46,7 @@ #include crc32table.h MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com); -MODULE_DESCRIPTION(Ethernet CRC32 calculations); +MODULE_DESCRIPTION(Various CRC32 calculations); MODULE_LICENSE(GPL); #if CRC_LE_BITS 8 || CRC_BE_BITS 8 @@ -135,46 +135,57 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) * @p: pointer to buffer over which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, + size_t len, const u32 (*tab)[256], + u32 polynomial) { #if CRC_LE_BITS == 1 int i; while (len--) { crc ^= *p++; for (i = 0; i 8; i++) - crc = (crc 1) ^ ((crc 1) ? CRCPOLY_LE : 0); + crc = (crc 1) ^ ((crc 1) ? polynomial : 0); } # elif CRC_LE_BITS == 2 while (len--) { crc ^= *p++; - crc = (crc 2) ^ crc32table_le[0][crc 3]; - crc = (crc 2) ^ crc32table_le[0][crc 3]; - crc = (crc 2) ^ crc32table_le[0][crc 3]; - crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; } # elif CRC_LE_BITS == 4 while (len--) { crc ^= *p++; - crc = (crc 4) ^ crc32table_le[0][crc 15]; - crc = (crc 4) ^ crc32table_le[0][crc 15]; + crc = (crc 4) ^ tab[0][crc 15]; + crc = (crc 4) ^ tab[0][crc 15]; } # elif CRC_LE_BITS == 8 /* aka Sarwate algorithm */ while (len--) { crc ^= *p++; - crc = (crc 8) ^ crc32table_le[0][crc 255]; + crc = (crc 8) ^ tab[0][crc 255]; } # else - const u32 (*tab)[] = crc32table_le; - crc = (__force u32) __cpu_to_le32(crc); crc = crc32_body(crc, p, len, tab); crc = __le32_to_cpu((__force __le32)crc); #endif return crc; } + +u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, 
crc32table_le, CRCPOLY_LE); +} EXPORT_SYMBOL(crc32_le); +u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE); +} +EXPORT_SYMBOL(__crc32c_le); + /** * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for @@ -182,7 +193,9 @@ EXPORT_SYMBOL(crc32_le); * @p: pointer to buffer over which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) +static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, + size_t len, const u32 (*tab)[256
[PATCH 02/13] crc32: Move long comment about crc32 fundamentals to Documentation/
Moved a long comment from lib/crc32.c to Documentation/crc32.txt where it will more likely get read. - Edited the resulting document to add an explanation of the slicing-by-n algorithm. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: George Spelvin li...@horizon.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- Documentation/00-INDEX |2 + Documentation/crc32.txt | 183 +++ lib/crc32.c | 129 + 3 files changed, 187 insertions(+), 127 deletions(-) create mode 100644 Documentation/crc32.txt diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 65bbd26..e7b38a0 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -104,6 +104,8 @@ cpuidle/ - info on CPU_IDLE, CPU idle state management subsystem. cputopology.txt - documentation on how CPU topology info is exported via sysfs. +crc32.txt + - brief tutorial on CRC computation cris/ - directory with info about Linux on CRIS architecture. crypto/ diff --git a/Documentation/crc32.txt b/Documentation/crc32.txt new file mode 100644 index 000..3d74ba4 --- /dev/null +++ b/Documentation/crc32.txt @@ -0,0 +1,183 @@ +A brief CRC tutorial. + +A CRC is a long-division remainder. You add the CRC to the message, +and the whole thing (message+CRC) is a multiple of the given +CRC polynomial. To check the CRC, you can either check that the +CRC matches the recomputed value, *or* you can check that the +remainder computed on the message+CRC is 0. This latter approach +is used by a lot of hardware implementations, and is why so many +protocols put the end-of-frame flag after the CRC. + +It's actually the same long division you learned in school, except that +- We're working in binary, so the digits are only 0 and 1, and +- When dividing polynomials, there are no carries. Rather than add and + subtract, we just xor. 
Thus, we tend to get a bit sloppy about + the difference between adding and subtracting. + +Like all division, the remainder is always smaller than the divisor. +To produce a 32-bit CRC, the divisor is actually a 33-bit CRC polynomial. +Since it's 33 bits long, bit 32 is always going to be set, so usually the +CRC is written in hex with the most significant bit omitted. (If you're +familiar with the IEEE 754 floating-point format, it's the same idea.) + +Note that a CRC is computed over a string of *bits*, so you have +to decide on the endianness of the bits within each byte. To get +the best error-detecting properties, this should correspond to the +order they're actually sent. For example, standard RS-232 serial is +little-endian; the most significant bit (sometimes used for parity) +is sent last. And when appending a CRC word to a message, you should +do it in the right order, matching the endianness. + +Just like with ordinary division, you proceed one digit (bit) at a time. +Each step of the division, you take one more digit (bit) of the +dividend and append it to the current remainder. Then you figure out the +appropriate multiple of the divisor to subtract to bring the remainder +back into range. In binary, this is easy - it has to be either 0 or 1, +and to make the XOR cancel, it's just a copy of bit 32 of the remainder. + +When computing a CRC, we don't care about the quotient, so we can +throw the quotient bit away, but subtract the appropriate multiple of +the polynomial from the remainder and we're back to where we started, +ready to process the next bit. + +A big-endian CRC written this way would be coded like: +for (i = 0; i < input_bits; i++) { + multiple = remainder & 0x80000000 ? CRCPOLY : 0; + remainder = (remainder << 1 | next_input_bit()) ^ multiple; +} + +Notice how, to get at bit 32 of the shifted remainder, we look +at bit 31 of the remainder *before* shifting it. 
+ +But also notice how the next_input_bit() bits we're shifting into +the remainder don't actually affect any decision-making until +32 bits later. Thus, the first 32 cycles of this are pretty boring. +Also, to add the CRC to a message, we need a 32-bit-long hole for it at +the end, so we have to add 32 extra cycles shifting in zeros at the +end of every message. + +These details lead to a standard trick: delay merging in the +next_input_bit() until the moment it's needed. Then the first 32 cycles +can be precomputed, and merging in the final 32 zero bits to make room +for the CRC can be skipped entirely. This changes the code to: + +for (i = 0; i < input_bits; i++) { + remainder ^= next_input_bit() << 31; + multiple = (remainder & 0x80000000) ? CRCPOLY : 0; + remainder = (remainder << 1) ^ multiple; +} + +With this optimization, the little-endian code is particularly simple: +for (i = 0; i < input_bits; i++) { + remainder
[PATCH v5.3 00/14] crc32c: Add faster algorithm and self-test code
Hi all, This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a software crc32c implementation. It removes the crc32c implementation in crypto/ in favor of using the stamped-out one in lib/. There is also a change to Kconfig so that the kernel builder can pick an implementation best suited for the hardware. The motivation for this patchset is that I am working on adding full metadata checksumming to ext4. As far as performance impact of adding checksumming goes, I see nearly no change with a standard mail server ffsb simulation. On a test that involves only file creation and deletion and extent tree writes, I see a drop of about 50 percent with the current kernel crc32c implementation; this improves to a drop of about 20 percent with the enclosed crc32c code. In typical workloads this new implementation doesn't help much, because metadata is usually a small fraction of total IO. However, when we are doing IO that is almost all metadata (such as rm -rf'ing a tree), then this patch speeds up the operation substantially. Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this patchset should improve their speed as well. I have not yet quantified that, however. This latest submission combines Bob's patches from late August 2011 with mine so that they can be one coherent patch set. Please excuse my inability to combine some of the patches; I've been advised to leave Bob's patches alone and build atop them instead. :/ Since the last posting, I've also collected some crc32c test results on a bunch of different x86/powerpc/sparc platforms. The results can be viewed here: http://goo.gl/sgt3i ; the crc32-kern-le and crc32c columns describe the performance of the kernel's current crc32 and crc32c software implementations. The crc32c-by8-le column shows crc32c performance with this patchset applied. I expect crc32 performance to be roughly the same. 
The two _boost columns at the right side of the spreadsheet show how much faster the new implementation is over the old one. As you can see, crc32 rises substantially, and crc32c experiences a huge increase. Since this patch has been out for review for several weeks now without objections, can this go into 3.3, please? v2: Use the crypto testmgr api for self-test. v3: Get rid of the -be version, which had no users. v4: Allow kernel builder a choice of speed vs. space optimization. v5: Reuse lib/crc32 for crc32c as well, and make crypto/crc32c use lib/crc32.c. v5.1: Include Bob Pearson's patches in submission request. v5.2: Fix changelogs for Bob's patches per akpm request. v5.3: Fix from header bug in patch mail generation scripts. --D -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 02/14] crc32: Move long comment about crc32 fundamentals to Documentation/
Moved a long comment from lib/crc32.c to Documentation/crc32.txt where it will more likely get read. - Edited the resulting document to add an explanation of the slicing-by-n algorithm. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: George Spelvin li...@horizon.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- Documentation/00-INDEX |2 + Documentation/crc32.txt | 183 +++ lib/crc32.c | 129 + 3 files changed, 187 insertions(+), 127 deletions(-) create mode 100644 Documentation/crc32.txt diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 65bbd26..e7b38a0 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -104,6 +104,8 @@ cpuidle/ - info on CPU_IDLE, CPU idle state management subsystem. cputopology.txt - documentation on how CPU topology info is exported via sysfs. +crc32.txt + - brief tutorial on CRC computation cris/ - directory with info about Linux on CRIS architecture. crypto/ diff --git a/Documentation/crc32.txt b/Documentation/crc32.txt new file mode 100644 index 000..3d74ba4 --- /dev/null +++ b/Documentation/crc32.txt @@ -0,0 +1,183 @@ +A brief CRC tutorial. + +A CRC is a long-division remainder. You add the CRC to the message, +and the whole thing (message+CRC) is a multiple of the given +CRC polynomial. To check the CRC, you can either check that the +CRC matches the recomputed value, *or* you can check that the +remainder computed on the message+CRC is 0. This latter approach +is used by a lot of hardware implementations, and is why so many +protocols put the end-of-frame flag after the CRC. + +It's actually the same long division you learned in school, except that +- We're working in binary, so the digits are only 0 and 1, and +- When dividing polynomials, there are no carries. Rather than add and + subtract, we just xor. 
Thus, we tend to get a bit sloppy about + the difference between adding and subtracting. + +Like all division, the remainder is always smaller than the divisor. +To produce a 32-bit CRC, the divisor is actually a 33-bit CRC polynomial. +Since it's 33 bits long, bit 32 is always going to be set, so usually the +CRC is written in hex with the most significant bit omitted. (If you're +familiar with the IEEE 754 floating-point format, it's the same idea.) + +Note that a CRC is computed over a string of *bits*, so you have +to decide on the endianness of the bits within each byte. To get +the best error-detecting properties, this should correspond to the +order they're actually sent. For example, standard RS-232 serial is +little-endian; the most significant bit (sometimes used for parity) +is sent last. And when appending a CRC word to a message, you should +do it in the right order, matching the endianness. + +Just like with ordinary division, you proceed one digit (bit) at a time. +Each step of the division, you take one more digit (bit) of the +dividend and append it to the current remainder. Then you figure out the +appropriate multiple of the divisor to subtract to bring the remainder +back into range. In binary, this is easy - it has to be either 0 or 1, +and to make the XOR cancel, it's just a copy of bit 32 of the remainder. + +When computing a CRC, we don't care about the quotient, so we can +throw the quotient bit away, but subtract the appropriate multiple of +the polynomial from the remainder and we're back to where we started, +ready to process the next bit. + +A big-endian CRC written this way would be coded like: +for (i = 0; i < input_bits; i++) { + multiple = remainder & 0x80000000 ? CRCPOLY : 0; + remainder = (remainder << 1 | next_input_bit()) ^ multiple; +} + +Notice how, to get at bit 32 of the shifted remainder, we look +at bit 31 of the remainder *before* shifting it. 
+ +But also notice how the next_input_bit() bits we're shifting into +the remainder don't actually affect any decision-making until +32 bits later. Thus, the first 32 cycles of this are pretty boring. +Also, to add the CRC to a message, we need a 32-bit-long hole for it at +the end, so we have to add 32 extra cycles shifting in zeros at the +end of every message. + +These details lead to a standard trick: delay merging in the +next_input_bit() until the moment it's needed. Then the first 32 cycles +can be precomputed, and merging in the final 32 zero bits to make room +for the CRC can be skipped entirely. This changes the code to: + +for (i = 0; i < input_bits; i++) { + remainder ^= next_input_bit() << 31; + multiple = (remainder & 0x80000000) ? CRCPOLY : 0; + remainder = (remainder << 1) ^ multiple; +} + +With this optimization, the little-endian code is particularly simple: +for (i = 0; i < input_bits; i++) { + remainder
[PATCH 10/14] crc32: Add note about this patchset to crc32.c
Some final changes - added a comment at the top of crc32.c From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 2c8e8c0..d56516d 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -1,4 +1,8 @@ /* + * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin + * cleaned up code to current version of sparse and added the slicing-by-8 + * algorithm to the closely similar existing slicing-by-4 algorithm. + * * Oct 15, 2000 Matt Domsch matt_dom...@dell.com * Nicer crc32 functions/docs submitted by li...@horizon.com. Thanks! * Code was from the public domain, copyright abandoned. Code was -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/14] crc32: Optimize loop counter for x86
Add two changes that improve the performance of x86 systems 1. replace main loop with incrementing counter this change improves the performance of the selftest by about 5-6% on Nehalem CPUs. The apparent reason is that the compiler can use the loop index to perform an indexed memory access. This is reported to make the performance of PowerPC CPUs worse. 2. replace the rem_len loop with incrementing counter this change improves the performance of the selftest, which has more than the usual number of occurrences, by about 1-2% on x86 CPUs. In actual workloads the length is most often a multiple of 4 bytes and this code does not get executed as often if at all. Again this change is reported to make the performance of PowerPC get worse. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 13 + 1 files changed, 13 insertions(+), 0 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 6311712..2c8e8c0 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -66,6 +66,9 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) # endif const u32 *b; size_t rem_len; +# ifdef CONFIG_X86 + size_t i; +# endif const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3]; const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7]; u32 q; @@ -86,7 +89,12 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) # endif b = (const u32 *)buf; +# ifdef CONFIG_X86 + --b; + for (i = 0; i len; i++) { +# else for (--b; len; --len) { +# endif q = crc ^ *++b; /* use pre increment for speed */ # if CRC_LE_BITS == 32 crc = DO_CRC4; @@ -100,9 +108,14 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) /* And the last few bytes */ if (len) { u8 *p = (u8 *)(b + 1) - 1; +# ifdef CONFIG_X86 + for (i = 0; i len; i++) + DO_CRC(*++p); /* use 
pre increment for speed */ +# else do { DO_CRC(*++p); /* use pre increment for speed */ } while (--len); +# endif } return crc; #undef DO_CRC -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 14/14] crc32: Select an algorithm via kconfig
Allow the kernel builder to choose a crc32* algorithm for the kernel. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/Kconfig | 43 +++ lib/crc32defs.h | 18 ++ 2 files changed, 61 insertions(+), 0 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index cfddafc..029c0e3 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -70,6 +70,49 @@ config CRC32_SELFTEST and crc32_be over byte strings with random alignment and length and computes the total elapsed time and number of bytes processed. +choice + prompt CRC32 implementation + depends on CRC32 + default CRC32_SLICEBY8 + +config CRC32_SLICEBY8 + bool Slice by 8 bytes + help + Calculate checksum 8 bytes at a time with a clever slicing algorithm. + This is the fastest algorithm, but comes with a 8KiB lookup table. + Most modern processors have enough cache to hold this table without + thrashing the cache. + + This is the default implementation choice. Choose this one unless + you have a good reason not to. + +config CRC32_SLICEBY4 + bool Slice by 4 bytes + help + Calculate checksum 4 bytes at a time with a clever slicing algorithm. + This is a bit slower than slice by 8, but has a smaller 4KiB lookup + table. + + Only choose this option if you know what you are doing. + +config CRC32_SARWATE + bool Sarwate's Algorithm (one byte at a time) + help + Calculate checksum a byte at a time using Sarwate's algorithm. This + is not particularly fast, but has a small 256 byte lookup table. + + Only choose this option if you know what you are doing. + +config CRC32_BIT + bool Classic Algorithm (one bit at a time) + help + Calculate checksum one bit at a time. This is VERY slow, but has + no lookup table. This is provided as a debugging option. + + Only choose this option if you are debugging crc32. 
+ +endchoice + config CRC7 tristate CRC7 functions help diff --git a/lib/crc32defs.h b/lib/crc32defs.h index 6fd1917..64cba2c 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -13,6 +13,24 @@ */ #define CRC32C_POLY_LE 0x82F63B78 +/* Try to choose an implementation variant via Kconfig */ +#ifdef CONFIG_CRC32_SLICEBY8 +# define CRC_LE_BITS 64 +# define CRC_BE_BITS 64 +#endif +#ifdef CONFIG_CRC32_SLICEBY4 +# define CRC_LE_BITS 32 +# define CRC_BE_BITS 32 +#endif +#ifdef CONFIG_CRC32_SARWATE +# define CRC_LE_BITS 8 +# define CRC_BE_BITS 8 +#endif +#ifdef CONFIG_CRC32_BIT +# define CRC_LE_BITS 1 +# define CRC_BE_BITS 1 +#endif + /* * How many bits at a time to use. Valid values are 1, 2, 4, 8, 32 and 64. * For less performance-sensitive, use 4 or 8 to save table size. -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 13/14] crc32: Add self-test code for crc32c
Add self-test code for crc32c. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 363 ++- 1 files changed, 261 insertions(+), 102 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 8df9561..382fa76 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -765,113 +765,265 @@ static struct crc_test { u32 length; /* random 11 bit length of test */ u32 crc_le; /* expected crc32_le result */ u32 crc_be; /* expected crc32_be result */ + u32 crc32c_le; /* expected crc32c_le result */ } test[] = { - {0x674bf11d, 0x0038, 0x0542, 0x0af6d466, 0xd8b6e4c1}, - {0x35c672c6, 0x003a, 0x01aa, 0xc6d3dfba, 0x28aaf3ad}, - {0x496da28e, 0x0039, 0x05af, 0xd933660f, 0x5d57e81f}, - {0x09a9b90e, 0x0027, 0x01f8, 0xb45fe007, 0xf45fca9a}, - {0xdc97e5a9, 0x0025, 0x03b6, 0xf81a3562, 0xe0126ba2}, - {0x47c58900, 0x000a, 0x00b9, 0x8e58eccf, 0xf3afc793}, - {0x292561e8, 0x000c, 0x0403, 0xa2ba8aaf, 0x0b797aed}, - {0x415037f6, 0x0003, 0x0676, 0xa17d52e8, 0x7f0fdf35}, - {0x3466e707, 0x0026, 0x0042, 0x258319be, 0x75c484a2}, - {0xafd1281b, 0x0023, 0x02ee, 0x4428eaf8, 0x06c7ad10}, - {0xd3857b18, 0x0028, 0x04a2, 0x5c430821, 0xb062b7cb}, - {0x1d825a8f, 0x002b, 0x050b, 0xd2c45f0c, 0xd68634e0}, - {0x5033e3bc, 0x000b, 0x0078, 0xa3ea4113, 0xac6d31fb}, - {0x94f1fb5e, 0x000f, 0x03a2, 0xfbfc50b1, 0x3cfe50ed}, - {0xc9a0fe14, 0x0009, 0x0473, 0x5fb61894, 0x87070591}, - {0x88a034b1, 0x001c, 0x05ad, 0xc1b16053, 0x46f95c67}, - {0xf0f72239, 0x0020, 0x026d, 0xa6fa58f3, 0xf8c2c1dd}, - {0xcc20a5e3, 0x003b, 0x067a, 0x7740185a, 0x308b979a}, - {0xce589c95, 0x002b, 0x0641, 0xd055e987, 0x40aae25b}, - {0x78edc885, 0x0035, 0x05be, 0xa39cb14b, 0x035b0d1f}, - {0x9d40a377, 0x003b, 0x0038, 0x1f47ccd2, 0x197fbc9d}, - {0x703d0e01, 0x003c, 0x06f1, 0x88735e7c, 0xfed57c5a}, - {0x776bf505, 0x000f, 0x05b2, 0x5cc4fc01, 0xf32efb97}, - {0x4a3e7854, 0x0027, 0x04b8, 0x8d923c82, 0x0cbfb4a2}, - {0x209172dd, 0x003b, 0x0356, 0xb89e9c2b, 0xd7868138}, - {0x3ba4cc5b, 0x002f, 0x0203, 0xe51601a9, 0x5b2a1032}, - {0xfc62f297, 0x, 0x0079, 
0x71a8e1a2, 0x5d88685f}, - {0x64280b8b, 0x0016, 0x07ab, 0x0fa7a30c, 0xda3a455f}, - {0x97dd724b, 0x0033, 0x07ad, 0x5788b2f4, 0xd7326d32}, - {0x61394b52, 0x0035, 0x0571, 0xc66525f1, 0xcabe7fef}, - {0x29b4faff, 0x0024, 0x006e, 0xca13751e, 0x993648e0}, - {0x29bfb1dc, 0x000b, 0x0244, 0x436c43f7, 0x429f7a59}, - {0x86ae934b, 0x0035, 0x0104, 0x0760ec93, 0x9cf7d0f4}, - {0xc4c1024e, 0x002e, 0x06b1, 0x6516a3ec, 0x19321f9c}, - {0x3287a80a, 0x0026, 0x0496, 0x0b257eb1, 0x754ebd51}, - {0xa4db423e, 0x0023, 0x045d, 0x9b3a66dc, 0x873e9f11}, - {0x7a1078df, 0x0015, 0x014a, 0x8c2484c5, 0x6a628659}, - {0x6048bd5b, 0x0006, 0x006a, 0x897e3559, 0xac9961af}, - {0xd8f9ea20, 0x003d, 0x0277, 0x60eb905b, 0xed2aaf99}, - {0xea5ec3b4, 0x002a, 0x04fe, 0x869965dc, 0x6c1f833b}, - {0x2dfb005d, 0x0016, 0x0345, 0x6a3b117e, 0xf05e8521}, - {0x5a214ade, 0x0020, 0x05b6, 0x467f70be, 0xcb22ccd3}, - {0xf0ab9cca, 0x0032, 0x0515, 0xed223df3, 0x7f3ef01d}, - {0x91b444f9, 0x002e, 0x07f8, 0x84e9a983, 0x5676756f}, - {0x1b5d2ddb, 0x002e, 0x012c, 0xba638c4c, 0x3f42047b}, - {0xd824d1bb, 0x003a, 0x07b5, 0x6288653b, 0x3a3ebea0}, - {0x0470180c, 0x0034, 0x01f0, 0x9d5b80d6, 0x3de08195}, - {0xffaa3a3f, 0x0036, 0x0299, 0xf3a82ab8, 0x53e0c13d}, - {0x6406cfeb, 0x0023, 0x0600, 0xa920b8e8, 0xe4e2acf4}, - {0xb24aaa38, 0x003e, 0x04a1, 0x657cc328, 0x5077b2c3}, - {0x58b2ab7c, 0x0039, 0x02b4, 0x3a17ee7e, 0x9dcb3643}, - {0x3db85970, 0x0006, 0x02b6, 0x95268b59, 0xb9812c10}, - {0x857830c5, 0x0003, 0x0590, 0x4ef439d5, 0xf042161d}, - {0xe1fcd978, 0x003e, 0x07d8, 0xae8d8699, 0xce0a1ef5}, - {0xb982a768, 0x0016, 0x06e0, 0x62fad3df, 0x5f8a067b}, - {0x1d581ce8, 0x001e, 0x058b, 0xf0f5da53, 0x26e39eee}, - {0x2456719b, 0x0025, 0x0503, 0x4296ac64, 0xd50e4c14}, - {0xfae6d8f2, 0x, 0x055d, 0x057fdf2e, 0x2a31391a}, - {0xcba828e3, 0x0039, 0x02ce, 0xe3f22351, 0x8f00877b}, - {0x13d25952, 0x000a, 0x072d, 0x76d4b4cc, 0x5eb67ec3}, - {0x0342be3f, 0x0015, 0x0599, 0xec75d9f1, 0x9d4d2826}, - {0xeaa344e0, 0x0014, 0x04d8, 0x72a4c981, 0x2064ea06}, - {0xbbb52021, 
0x003b, 0x0272
[PATCH 07/14] crc32: Make CRC_*_BITS definition correspond to actual bit counts
crc32.c provides a choice of one of several algorithms for computing the LSB and MSB versions of the CRC32 checksum based on the parameters CRC_LE_BITS and CRC_BE_BITS. In the original version the values 1, 2, 4 and 8 respectively selected versions of the algorithm that computed the crc 1, 2, 4 and 32 bits at a time. This patch series adds a new version that computes the CRC 64 bits at a time. To make things easier to understand, the parameter has been reinterpreted to actually stand for the number of bits processed in each step of the algorithm so that the old value 8 has been replaced with the value 32. This also allows us to add in a widely used crc algorithm that computes the crc 8 bits at a time called the Sarwate algorithm. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 17 ++--- lib/crc32defs.h | 18 ++ lib/gen_crc32table.c | 11 ++- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index ff6bb9a..157b35f 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -27,13 +27,13 @@ #include <linux/types.h> #include "crc32defs.h" -#if CRC_LE_BITS == 8 +#if CRC_LE_BITS > 8 # define tole(x) (__force u32) __constant_cpu_to_le32(x) #else # define tole(x) (x) #endif -#if CRC_BE_BITS == 8 +#if CRC_BE_BITS > 8 # define tobe(x) (__force u32) __constant_cpu_to_be32(x) #else # define tobe(x) (x) @@ -45,7 +45,7 @@ MODULE_AUTHOR("Matt Domsch <matt_dom...@dell.com>"); MODULE_DESCRIPTION("Ethernet CRC32 calculations"); MODULE_LICENSE("GPL"); -#if CRC_LE_BITS == 8 || CRC_BE_BITS == 8 +#if CRC_LE_BITS > 8 || CRC_BE_BITS > 8 static inline u32 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) @@ -126,6 +126,12 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) crc = (crc >> 4) ^ crc32table_le[0][crc & 15]; } # elif CRC_LE_BITS == 8 + /* aka Sarwate algorithm */ + 
while (len--) { + crc ^= *p++; + crc = (crc 8) ^ crc32table_le[0][crc 255]; + } +# else const u32 (*tab)[] = crc32table_le; crc = (__force u32) __cpu_to_le32(crc); @@ -169,6 +175,11 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) crc = (crc 4) ^ crc32table_be[0][crc 28]; } # elif CRC_BE_BITS == 8 + while (len--) { + crc ^= *p++ 24; + crc = (crc 8) ^ crc32table_be[0][crc 24]; + } +# else const u32 (*tab)[] = crc32table_be; crc = (__force u32) __cpu_to_be32(crc); diff --git a/lib/crc32defs.h b/lib/crc32defs.h index f5a5401..daa3a5e 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -6,27 +6,29 @@ #define CRCPOLY_LE 0xedb88320 #define CRCPOLY_BE 0x04c11db7 -/* How many bits at a time to use. Requires a table of 4CRC_xx_BITS bytes. */ -/* For less performance-sensitive, use 4 */ +/* How many bits at a time to use. Valid values are 1, 2, 4, 8, and 32. */ +/* For less performance-sensitive, use 4 or 8 */ #ifndef CRC_LE_BITS -# define CRC_LE_BITS 8 +# define CRC_LE_BITS 32 #endif #ifndef CRC_BE_BITS -# define CRC_BE_BITS 8 +# define CRC_BE_BITS 32 #endif /* * Little-endian CRC computation. Used with serial bit streams sent * lsbit-first. Be sure to use cpu_to_le32() to append the computed CRC. */ -#if CRC_LE_BITS 8 || CRC_LE_BITS 1 || CRC_LE_BITS CRC_LE_BITS-1 -# error CRC_LE_BITS must be a power of 2 between 1 and 8 +#if CRC_LE_BITS 32 || CRC_LE_BITS 1 || CRC_LE_BITS == 16 || \ + CRC_LE_BITS CRC_LE_BITS-1 +# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32} #endif /* * Big-endian CRC computation. Used with serial bit streams sent * msbit-first. Be sure to use cpu_to_be32() to append the computed CRC. 
*/ -#if CRC_BE_BITS 8 || CRC_BE_BITS 1 || CRC_BE_BITS CRC_BE_BITS-1 -# error CRC_BE_BITS must be a power of 2 between 1 and 8 +#if CRC_BE_BITS 32 || CRC_BE_BITS 1 || CRC_BE_BITS == 16 || \ + CRC_BE_BITS CRC_BE_BITS-1 +# error CRC_BE_BITS must be one of {1, 2, 4, 8, 32} #endif diff --git a/lib/gen_crc32table.c b/lib/gen_crc32table.c index eced769..99ac744 100644 --- a/lib/gen_crc32table.c +++ b/lib/gen_crc32table.c @@ -4,8 +4,17 @@ #define ENTRIES_PER_LINE 4 +#if CRC_LE_BITS = 8 #define LE_TABLE_SIZE (1 CRC_LE_BITS) +#else +#define LE_TABLE_SIZE 256 +#endif + +#if CRC_BE_BITS = 8 #define BE_TABLE_SIZE (1 CRC_BE_BITS) +#else +#define BE_TABLE_SIZE 256 +#endif static uint32_t crc32table_le[4][256]; static uint32_t crc32table_be[4][256]; @@ -24,7 +33,7 @@ static void crc32init_le(void) crc32table_le[0][0] = 0; - for (i = 1 (CRC_LE_BITS - 1); i; i = 1) { + for (i = LE_TABLE_SIZE 1; i; i = 1) { crc = (crc 1) ^ ((crc 1) ? CRCPOLY_LE : 0
[PATCH 03/14] crc32: Simplify unit test code
Replaced the unit test provided in crc32.c, which doesn't have a makefile and doesn't compile with current headers, with a simpler self test routine that also gives a measure of performance and runs at module init time. The self test option can be enabled through a configuration option CONFIG_CRC32_SELFTEST. The test stresses the pre and post loops and is thus not very realistic since actual uses will likely have addresses and lengths that are at least 4 byte aligned. However, the main loop is long enough so that the performance is dominated by that loop. The expected values for crc32_le and crc32_be were generated with the original version of crc32.c using CRC_LE_BITS = 8 and CRC_BE_BITS = 8. These values were then used to check all the values of the BITS parameters in both the original and new versions. The performance results show some variability from run to run in spite of attempts to both warm the cache and reduce the amount of OS noise by limiting interrupts during the test. To get comparable results and to analyse options wrt performance the best time reported over a small sample of runs has been taken. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/Kconfig | 10 + lib/crc32.c | 798 ++- 2 files changed, 691 insertions(+), 117 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index 32f3e5a..2bc5834 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -60,6 +60,16 @@ config CRC32 kernel tree does. Such modules that use library CRC32 functions require M here. +config CRC32_SELFTEST + bool CRC32 perform self test on init + default n + depends on CRC32 + help + This option enables the CRC32 library functions to perform a + self test on initialization. 
The self test computes crc32_le + and crc32_be over byte strings with random alignment and length + and computes the total elapsed time and number of bytes processed. + config CRC7 tristate CRC7 functions help diff --git a/lib/crc32.c b/lib/crc32.c index 7ac8b0d..7a0e5a9 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -210,137 +210,701 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) EXPORT_SYMBOL(crc32_le); EXPORT_SYMBOL(crc32_be); -#ifdef UNITTEST +#ifdef CONFIG_CRC32_SELFTEST -#include stdlib.h -#include stdio.h - -#if 0 /*Not used at present */ -static void -buf_dump(char const *prefix, unsigned char const *buf, size_t len) +/* 4096 random bytes */ +static u8 __attribute__((__aligned__(8))) test_buf[] = { - fputs(prefix, stdout); - while (len--) - printf( %02x, *buf++); - putchar('\n'); - -} -#endif - -static void bytereverse(unsigned char *buf, size_t len) + 0x5b, 0x85, 0x21, 0xcb, 0x09, 0x68, 0x7d, 0x30, + 0xc7, 0x69, 0xd7, 0x30, 0x92, 0xde, 0x59, 0xe4, + 0xc9, 0x6e, 0x8b, 0xdb, 0x98, 0x6b, 0xaa, 0x60, + 0xa8, 0xb5, 0xbc, 0x6c, 0xa9, 0xb1, 0x5b, 0x2c, + 0xea, 0xb4, 0x92, 0x6a, 0x3f, 0x79, 0x91, 0xe4, + 0xe9, 0x70, 0x51, 0x8c, 0x7f, 0x95, 0x6f, 0x1a, + 0x56, 0xa1, 0x5c, 0x27, 0x03, 0x67, 0x9f, 0x3a, + 0xe2, 0x31, 0x11, 0x29, 0x6b, 0x98, 0xfc, 0xc4, + 0x53, 0x24, 0xc5, 0x8b, 0xce, 0x47, 0xb2, 0xb9, + 0x32, 0xcb, 0xc1, 0xd0, 0x03, 0x57, 0x4e, 0xd4, + 0xe9, 0x3c, 0xa1, 0x63, 0xcf, 0x12, 0x0e, 0xca, + 0xe1, 0x13, 0xd1, 0x93, 0xa6, 0x88, 0x5c, 0x61, + 0x5b, 0xbb, 0xf0, 0x19, 0x46, 0xb4, 0xcf, 0x9e, + 0xb6, 0x6b, 0x4c, 0x3a, 0xcf, 0x60, 0xf9, 0x7a, + 0x8d, 0x07, 0x63, 0xdb, 0x40, 0xe9, 0x0b, 0x6f, + 0xad, 0x97, 0xf1, 0xed, 0xd0, 0x1e, 0x26, 0xfd, + 0xbf, 0xb7, 0xc8, 0x04, 0x94, 0xf8, 0x8b, 0x8c, + 0xf1, 0xab, 0x7a, 0xd4, 0xdd, 0xf3, 0xe8, 0x88, + 0xc3, 0xed, 0x17, 0x8a, 0x9b, 0x40, 0x0d, 0x53, + 0x62, 0x12, 0x03, 0x5f, 0x1b, 0x35, 0x32, 0x1f, + 0xb4, 0x7b, 0x93, 0x78, 0x0d, 0xdb, 0xce, 0xa4, + 0xc0, 0x47, 0xd5, 0xbf, 0x68, 0xe8, 0x5d, 0x74, + 0x8f, 
0x8e, 0x75, 0x1c, 0xb2, 0x4f, 0x9a, 0x60, + 0xd1, 0xbe, 0x10, 0xf4, 0x5c, 0xa1, 0x53, 0x09, + 0xa5, 0xe0, 0x09, 0x54, 0x85, 0x5c, 0xdc, 0x07, + 0xe7, 0x21, 0x69, 0x7b, 0x8a, 0xfd, 0x90, 0xf1, + 0x22, 0xd0, 0xb4, 0x36, 0x28, 0xe6, 0xb8, 0x0f, + 0x39, 0xde, 0xc8, 0xf3, 0x86, 0x60, 0x34, 0xd2, + 0x5e, 0xdf, 0xfd, 0xcf, 0x0f, 0xa9, 0x65, 0xf0, + 0xd5, 0x4d, 0x96, 0x40, 0xe3, 0xdf, 0x3f, 0x95, + 0x5a, 0x39, 0x19, 0x93, 0xf4, 0x75, 0xce, 0x22, + 0x00, 0x1c, 0x93, 0xe2, 0x03, 0x66, 0xf4, 0x93, + 0x73, 0x86, 0x81, 0x8e, 0x29, 0x44, 0x48, 0x86, + 0x61, 0x7c, 0x48, 0xa3, 0x43, 0xd2, 0x9c, 0x8d, + 0xd4, 0x95, 0xdd, 0xe1, 0x22, 0x89, 0x3a, 0x40, + 0x4c, 0x1b, 0x8a, 0x04, 0xa8, 0x09, 0x69, 0x8b, + 0xea, 0xc6, 0x55
[PATCH 04/14] crc32: Speed up memory table access on powerpc
Replace 2D array references by pointer references in loops. This change has no effect on X86 code but improves PPC performance. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 21 +++-- 1 files changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 7a0e5a9..c93c9ae 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -53,20 +53,21 @@ static inline u32 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) { # ifdef __LITTLE_ENDIAN -# define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8) -# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \ - tab[2][(crc >> 8) & 255] ^ \ - tab[1][(crc >> 16) & 255] ^ \ - tab[0][(crc >> 24) & 255] +# define DO_CRC(x) (crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8)) +# define DO_CRC4 crc = t3[(crc) & 255] ^ \ + t2[(crc >> 8) & 255] ^ \ + t1[(crc >> 16) & 255] ^ \ + t0[(crc >> 24) & 255] # else -# define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8) -# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \ - tab[1][(crc >> 8) & 255] ^ \ - tab[2][(crc >> 16) & 255] ^ \ - tab[3][(crc >> 24) & 255] +# define DO_CRC(x) (crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8)) +# define DO_CRC4 crc = t0[(crc) & 255] ^ \ + t1[(crc >> 8) & 255] ^ \ + t2[(crc >> 16) & 255] ^ \ + t3[(crc >> 24) & 255] # endif const u32 *b; size_t rem_len; + const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3]; /* Align it */ if (unlikely((long)buf & 3 && len)) { -- To unsubscribe from this list: send the line "unsubscribe linux-crypto" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 05/14] crc32: Miscellaneous cleanups
Misc cleanup of lib/crc32.c and related files - removed unnecessary header files. - straightened out some convoluted ifdef's - rewrote some references to 2 dimensional arrays as 1 dimensional arrays to make them correct. I.e. replaced tab[i] with tab[0][i]. - a few trivial whitespace changes - fixed a warning in gen_crc32table.c caused by a mismatch in the type of the pointer passed to output_table. Since the table is only used at kernel compile time, it is simpler to make the table big enough to hold the largest column size used. One cannot make the column size smaller in output_table because it has to be used by both the le and be tables and they can have different column sizes. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 104 +- lib/gen_crc32table.c |6 +-- 2 files changed, 39 insertions(+), 71 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index c93c9ae..2a87ea2 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -23,13 +23,10 @@ /* see: Documentation/crc32.txt for a description of algorithms */ #include linux/crc32.h -#include linux/kernel.h #include linux/module.h -#include linux/compiler.h #include linux/types.h -#include linux/init.h -#include linux/atomic.h #include crc32defs.h + #if CRC_LE_BITS == 8 # define tole(x) __constant_cpu_to_le32(x) #else @@ -41,6 +38,7 @@ #else # define tobe(x) (x) #endif + #include crc32table.h MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com); @@ -96,6 +94,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) #undef DO_CRC4 } #endif + /** * crc32_le() - Calculate bitwise little-endian Ethernet AUTODIN II CRC32 * @crc: seed value for computation. 
~0 for Ethernet, sometimes 0 for @@ -103,53 +102,39 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) * @p: pointer to buffer over which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len); - -#if CRC_LE_BITS == 1 -/* - * In fact, the table-based code will work in this case, but it can be - * simplified by inlining the table in ?: form. - */ - u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) { +#if CRC_LE_BITS == 1 int i; while (len--) { crc ^= *p++; for (i = 0; i 8; i++) crc = (crc 1) ^ ((crc 1) ? CRCPOLY_LE : 0); } - return crc; -} -#else /* Table-based approach */ - -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) -{ -# if CRC_LE_BITS == 8 - const u32 (*tab)[] = crc32table_le; - - crc = __cpu_to_le32(crc); - crc = crc32_body(crc, p, len, tab); - return __le32_to_cpu(crc); -# elif CRC_LE_BITS == 4 +# elif CRC_LE_BITS == 2 while (len--) { crc ^= *p++; - crc = (crc 4) ^ crc32table_le[crc 15]; - crc = (crc 4) ^ crc32table_le[crc 15]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; } - return crc; -# elif CRC_LE_BITS == 2 +# elif CRC_LE_BITS == 4 while (len--) { crc ^= *p++; - crc = (crc 2) ^ crc32table_le[crc 3]; - crc = (crc 2) ^ crc32table_le[crc 3]; - crc = (crc 2) ^ crc32table_le[crc 3]; - crc = (crc 2) ^ crc32table_le[crc 3]; + crc = (crc 4) ^ crc32table_le[0][crc 15]; + crc = (crc 4) ^ crc32table_le[0][crc 15]; } +# elif CRC_LE_BITS == 8 + const u32 (*tab)[] = crc32table_le; + + crc = __cpu_to_le32(crc); + crc = crc32_body(crc, p, len, tab); + crc = __le32_to_cpu(crc); +#endif return crc; -# endif } -#endif +EXPORT_SYMBOL(crc32_le); /** * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 @@ -158,16 +143,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) * @p: pointer to buffer over 
which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len); - -#if CRC_BE_BITS == 1 -/* - * In fact, the table-based code will work in this case, but it can be - * simplified by inlining the table in ?: form. - */ - u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) { +#if CRC_BE_BITS == 1 int i; while (len--) { crc ^= *p++ 24; @@ -176,39 +154,29 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len
[PATCH 06/14] crc32: Fix mixing of endian-specific types
crc32.c in its original version freely mixed u32, __le32 and __be32 types which caused warnings from sparse with __CHECK_ENDIAN__. This patch fixes these by forcing the types to u32. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 12 ++-- 1 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 2a87ea2..ff6bb9a 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -28,13 +28,13 @@ #include crc32defs.h #if CRC_LE_BITS == 8 -# define tole(x) __constant_cpu_to_le32(x) +# define tole(x) (__force u32) __constant_cpu_to_le32(x) #else # define tole(x) (x) #endif #if CRC_BE_BITS == 8 -# define tobe(x) __constant_cpu_to_be32(x) +# define tobe(x) (__force u32) __constant_cpu_to_be32(x) #else # define tobe(x) (x) #endif @@ -128,9 +128,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) # elif CRC_LE_BITS == 8 const u32 (*tab)[] = crc32table_le; - crc = __cpu_to_le32(crc); + crc = (__force u32) __cpu_to_le32(crc); crc = crc32_body(crc, p, len, tab); - crc = __le32_to_cpu(crc); + crc = __le32_to_cpu((__force __le32)crc); #endif return crc; } @@ -171,9 +171,9 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) # elif CRC_BE_BITS == 8 const u32 (*tab)[] = crc32table_be; - crc = __cpu_to_be32(crc); + crc = (__force u32) __cpu_to_be32(crc); crc = crc32_body(crc, p, len, tab); - crc = __be32_to_cpu(crc); + crc = __be32_to_cpu((__force __be32)crc); # endif return crc; } -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/14] crc32: remove two instances of trailing whitespace
- remove trailing whitespace from lib/crc32.c - remove trailing whitespace from lib/crc32defs.h From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c |2 +- lib/crc32defs.h |2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index a6e633a..23b08ba 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -316,7 +316,7 @@ EXPORT_SYMBOL(crc32_be); * in the correct multiple to subtract, we can shift a byte at a time. * This produces a 40-bit (rather than a 33-bit) intermediate remainder, * but again the multiple of the polynomial to subtract depends only on - * the high bits, the high 8 bits in this case. + * the high bits, the high 8 bits in this case. * * The multiple we need in that case is the low 32 bits of a 40-bit * value whose high 8 bits are given, and which is a multiple of the diff --git a/lib/crc32defs.h b/lib/crc32defs.h index 9b6773d..f5a5401 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -8,7 +8,7 @@ /* How many bits at a time to use. Requires a table of 4CRC_xx_BITS bytes. */ /* For less performance-sensitive, use 4 */ -#ifndef CRC_LE_BITS +#ifndef CRC_LE_BITS # define CRC_LE_BITS 8 #endif #ifndef CRC_BE_BITS -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 11/14] crc32: Bolt on crc32c
Reuse the existing crc32 code to stamp out a crc32c implementation. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- include/linux/crc32.h |2 ++ lib/Kconfig |8 +++--- lib/crc32.c | 62 +++-- lib/crc32defs.h |7 ++ lib/gen_crc32table.c | 35 ++-- 5 files changed, 80 insertions(+), 34 deletions(-) diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 391a259..68267b6 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -11,6 +11,8 @@ extern u32 crc32_le(u32 crc, unsigned char const *p, size_t len); extern u32 crc32_be(u32 crc, unsigned char const *p, size_t len); +extern u32 __crc32c_le(u32 crc, unsigned char const *p, size_t len); + #define crc32(seed, data, length) crc32_le(seed, (unsigned char const *)(data), length) /* diff --git a/lib/Kconfig b/lib/Kconfig index 2bc5834..cfddafc 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -51,14 +51,14 @@ config CRC_ITU_T functions require M here. config CRC32 - tristate CRC32 functions + tristate CRC32/CRC32c functions default y select BITREVERSE help This option is provided for the case where no in-kernel-tree - modules require CRC32 functions, but a module built outside the - kernel tree does. Such modules that use library CRC32 functions - require M here. + modules require CRC32/CRC32c functions, but a module built outside + the kernel tree does. Such modules that use library CRC32/CRC32c + functions require M here. 
config CRC32_SELFTEST bool CRC32 perform self test on init diff --git a/lib/crc32.c b/lib/crc32.c index d56516d..8df9561 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -46,7 +46,7 @@ #include crc32table.h MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com); -MODULE_DESCRIPTION(Ethernet CRC32 calculations); +MODULE_DESCRIPTION(Various CRC32 calculations); MODULE_LICENSE(GPL); #if CRC_LE_BITS 8 || CRC_BE_BITS 8 @@ -135,46 +135,57 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) * @p: pointer to buffer over which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, + size_t len, const u32 (*tab)[256], + u32 polynomial) { #if CRC_LE_BITS == 1 int i; while (len--) { crc ^= *p++; for (i = 0; i 8; i++) - crc = (crc 1) ^ ((crc 1) ? CRCPOLY_LE : 0); + crc = (crc 1) ^ ((crc 1) ? polynomial : 0); } # elif CRC_LE_BITS == 2 while (len--) { crc ^= *p++; - crc = (crc 2) ^ crc32table_le[0][crc 3]; - crc = (crc 2) ^ crc32table_le[0][crc 3]; - crc = (crc 2) ^ crc32table_le[0][crc 3]; - crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; } # elif CRC_LE_BITS == 4 while (len--) { crc ^= *p++; - crc = (crc 4) ^ crc32table_le[0][crc 15]; - crc = (crc 4) ^ crc32table_le[0][crc 15]; + crc = (crc 4) ^ tab[0][crc 15]; + crc = (crc 4) ^ tab[0][crc 15]; } # elif CRC_LE_BITS == 8 /* aka Sarwate algorithm */ while (len--) { crc ^= *p++; - crc = (crc 8) ^ crc32table_le[0][crc 255]; + crc = (crc 8) ^ tab[0][crc 255]; } # else - const u32 (*tab)[] = crc32table_le; - crc = (__force u32) __cpu_to_le32(crc); crc = crc32_body(crc, p, len, tab); crc = __le32_to_cpu((__force __le32)crc); #endif return crc; } + +u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, 
crc32table_le, CRCPOLY_LE); +} EXPORT_SYMBOL(crc32_le); +u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE); +} +EXPORT_SYMBOL(__crc32c_le); + /** * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for @@ -182,7 +193,9 @@ EXPORT_SYMBOL(crc32_le); * @p: pointer to buffer over which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) +static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, + size_t len, const u32 (*tab)[256
[PATCH 12/14] crypto: crc32c should use library implementation
Since lib/crc32.c now provides crc32c, remove the software implementation here and call the library function instead. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- crypto/Kconfig |1 + crypto/crc32c.c | 94 ++- 2 files changed, 4 insertions(+), 91 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index 527a857..4c9e93a 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -310,6 +310,7 @@ comment Digest config CRYPTO_CRC32C tristate CRC32c CRC algorithm select CRYPTO_HASH + select CRC32 help Castagnoli, et al Cyclic Redundancy-Check Algorithm. Used by iSCSI for header and data digests and by others. diff --git a/crypto/crc32c.c b/crypto/crc32c.c index 3f9ad28..06f7018 100644 --- a/crypto/crc32c.c +++ b/crypto/crc32c.c @@ -40,6 +40,7 @@ #include linux/module.h #include linux/string.h #include linux/kernel.h +#include linux/crc32.h #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -53,95 +54,6 @@ struct chksum_desc_ctx { }; /* - * This is the CRC-32C table - * Generated with: - * width = 32 bits - * poly = 0x1EDC6F41 - * reflect input bytes = true - * reflect output bytes = true - */ - -static const u32 crc32c_table[256] = { - 0xL, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, - 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, - 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, - 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, - 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, - 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, - 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L, - 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, - 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL, - 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, - 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, - 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, - 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, - 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL, - 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, - 0x7DA08661L, 0x8FCB0562L, 
0x9C9BF696L, 0x6EF07595L, - 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, - 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L, - 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, - 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, - 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, - 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, - 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L, - 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, - 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L, - 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, - 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, - 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L, - 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, - 0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L, - 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, - 0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L, - 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, - 0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L, - 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L, - 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL, - 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L, - 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, - 0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL, - 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, - 0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L, - 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL, - 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL, - 0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L, - 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL, - 0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L, - 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, - 0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, - 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L, - 0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL, - 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL, - 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L, - 0xD3D3E1ABL, 0x21B862A8L, 
0x32E8915CL, 0xC083125FL, - 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L, - 0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L, - 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL, - 0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, - 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L, - 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L, - 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL, - 0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L, - 0x34F4F86AL, 0xC69F7B69L
[PATCH 08/14] crc32: Add slice-by-8 algorithm to existing code
add slicing-by-8 algorithm to the existing slicing-by-4 algorithm. This consists of: - extend largest BITS size from 32 to 64 - extend tables from tab[4][256] to up to tab[8][256] - Add code for inner loop. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 40 lib/crc32defs.h | 29 + lib/gen_crc32table.c | 43 +++ 3 files changed, 76 insertions(+), 36 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 157b35f..6311712 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -47,25 +47,28 @@ MODULE_LICENSE(GPL); #if CRC_LE_BITS 8 || CRC_BE_BITS 8 +/* implements slicing-by-4 or slicing-by-8 algorithm */ static inline u32 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) { # ifdef __LITTLE_ENDIAN # define DO_CRC(x) (crc = t0[(crc ^ (x)) 255] ^ (crc 8)) -# define DO_CRC4 crc = t3[(crc) 255] ^ \ - t2[(crc 8) 255] ^ \ - t1[(crc 16) 255] ^ \ - t0[(crc 24) 255] +# define DO_CRC4 (t3[(q) 255] ^ t2[(q 8) 255] ^ \ + t1[(q 16) 255] ^ t0[(q 24) 255]) +# define DO_CRC8 (t7[(q) 255] ^ t6[(q 8) 255] ^ \ + t5[(q 16) 255] ^ t4[(q 24) 255]) # else # define DO_CRC(x) (crc = t0[((crc 24) ^ (x)) 255] ^ (crc 8)) -# define DO_CRC4 crc = t0[(crc) 255] ^ \ - t1[(crc 8) 255] ^ \ - t2[(crc 16) 255] ^ \ - t3[(crc 24) 255] +# define DO_CRC4 (t0[(q) 255] ^ t1[(q 8) 255] ^ \ + t2[(q 16) 255] ^ t3[(q 24) 255]) +# define DO_CRC8 (t4[(q) 255] ^ t5[(q 8) 255] ^ \ + t6[(q 16) 255] ^ t7[(q 24) 255]) # endif const u32 *b; - size_trem_len; + size_t rem_len; const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3]; + const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7]; + u32 q; /* Align it */ if (unlikely((long)buf 3 len)) { @@ -73,13 +76,25 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) DO_CRC(*buf++); } while ((--len) ((long)buf)3); } + +# if CRC_LE_BITS == 
32 rem_len = len 3; - /* load data 32 bits wide, xor data 32 bits wide. */ len = len 2; +# else + rem_len = len 7; + len = len 3; +# endif + b = (const u32 *)buf; for (--b; len; --len) { - crc ^= *++b; /* use pre increment for speed */ - DO_CRC4; + q = crc ^ *++b; /* use pre increment for speed */ +# if CRC_LE_BITS == 32 + crc = DO_CRC4; +# else + crc = DO_CRC8; + q = *++b; + crc ^= DO_CRC4; +# endif } len = rem_len; /* And the last few bytes */ @@ -92,6 +107,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) return crc; #undef DO_CRC #undef DO_CRC4 +#undef DO_CRC8 } #endif diff --git a/lib/crc32defs.h b/lib/crc32defs.h index daa3a5e..8181592 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -6,29 +6,42 @@ #define CRCPOLY_LE 0xedb88320 #define CRCPOLY_BE 0x04c11db7 -/* How many bits at a time to use. Valid values are 1, 2, 4, 8, and 32. */ -/* For less performance-sensitive, use 4 or 8 */ +/* + * How many bits at a time to use. Valid values are 1, 2, 4, 8, 32 and 64. + * For less performance-sensitive, use 4 or 8 to save table size. + * For larger systems choose same as CPU architecture as default. + * This works well on X86_64, SPARC64 systems. This may require some + * elaboration after experiments with other architectures. + */ #ifndef CRC_LE_BITS -# define CRC_LE_BITS 32 +# ifdef CONFIG_64BIT +# define CRC_LE_BITS 64 +# else +# define CRC_LE_BITS 32 +# endif #endif #ifndef CRC_BE_BITS -# define CRC_BE_BITS 32 +# ifdef CONFIG_64BIT +# define CRC_BE_BITS 64 +# else +# define CRC_BE_BITS 32 +# endif #endif /* * Little-endian CRC computation. Used with serial bit streams sent * lsbit-first. Be sure to use cpu_to_le32() to append the computed CRC. 
*/ -#if CRC_LE_BITS 32 || CRC_LE_BITS 1 || CRC_LE_BITS == 16 || \ +#if CRC_LE_BITS 64 || CRC_LE_BITS 1 || CRC_LE_BITS == 16 || \ CRC_LE_BITS CRC_LE_BITS-1 -# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32} +# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32, 64} #endif /* * Big-endian CRC computation. Used with serial bit streams sent * msbit-first. Be sure to use cpu_to_be32() to append the computed CRC. */ -#if CRC_BE_BITS 32 || CRC_BE_BITS 1 || CRC_BE_BITS == 16
Re: [PATCH 14/14] crc32: Select an algorithm via kconfig
On Tue, Dec 13, 2011 at 09:27:10AM +0100, Joakim Tjernlund wrote: Darrick J. Wong djw...@us.ibm.com wrote on 2011/12/13 07:32:28: On Mon, Dec 12, 2011 at 05:10:45PM -0600, Bob Pearson wrote: That choice was for Joakim who measured better performance on his 32 bit PPC platform with slice by 4. Ok. On my 1.33GHz PowerBook I get ~255MB/s with slice by 4 and ~270MB/s with slice by 8. I think it's a PPC 7447, and definitely 32-bit. In any case, it reports having 32K of L1D cache. I tested Bob's early version on my mpc8321 (266MHz, embedded CPU) and it was just half the speed compared with current crc32. I wonder, given the patch "crc32: Speed up memory table access on powerpc", would you mind retesting to see if slice by 8 still trails slice by 4 on your powerpc? I see that your mpc8321 has 16K of L1D cache and a 32-bit memory bus whereas my 7447 has a 64-bit memory bus. I wonder if memory bus size could be a defining characteristic...? I tried out the crc32c code on an s390x today; apparently by-8 trails by-4 there too. It's unfortunately difficult to figure out the hardware details of whatever's going on underneath that VM. --D Jocke -- To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 14/14] crc32: Select an algorithm via kconfig
On Fri, Dec 02, 2011 at 06:36:46PM -0800, Darrick J. Wong wrote: On Fri, Dec 02, 2011 at 08:25:05AM +0800, Herbert Xu wrote: On Thu, Dec 01, 2011 at 12:15:17PM -0800, Darrick J. Wong wrote: Allow the kernel builder to choose a crc32* algorithm for the kernel. Signed-off-by: Darrick J. Wong djw...@us.ibm.com I don't like this at all. How do you expect distros or indeed anyone to make this choice? For generic C implementations like this we should only have one, and not many. Slice-by-8 should be picked automatically if the builder doesn't explicitly pick another one. The other choices are provided for people who want a slimmer cache footprint. I guess I could make the Kconfig file a bit more explicit about slice-by-8 being default, or I guess we could just ignore this one patch and thereby keeping us with the old method where anyone who wants the slimmer implementations patches the #defines. Ok, here's a patch that makes it more explicit that sliceby8 is the default. I expect distros and anyone else to simply hit Enter. The only people who should do otherwise are people who know they are building for machines that have small cache sizes such that the crc table fights for cache lines with the data being checksummed. I made a quick survey of CPU L1 cache quantities: All Intel CPUs since the Pentium MMX have 8KiB of L1. All AMD CPUs since the K5 have had 8KiB of L1. Most SPARC64 CPUs except the UltraSparc T1 and T2 CPUs have 8KiB of L1. Most PowerPC CPUs since the 601 seem to have 8KiB of L1. All IBM POWER CPUs since at least the POWER2 have had 8KiB of L1. There are too many different ARM cores for me to track. My smartphones and embedded ARM controllers all have 8KIB of L1, but that's not enough to generalize. While I might've been tempted to agree with Herbert and hardwire the code to use slice by 8, there are enough CPUs out there that *could* have too-small L1 caches that I'm not comfortable with _removing_ the Kconfig option to use a slimmer algorithm. 
I can't gate the decision on 64-bitness either, since I've seen plenty of i386 CPUs that benefit from slice by 8, and the UltraSparc T2 is a 64-bit processor that seems likely to suffer cache thrashing. I think having a configurable menu that steers people towards slice by 8 is fine. Bob, was there a reason for picking slice by 4 for 32-bit machines? D --- Allow the kernel builder to choose a crc32* algorithm for the kernel. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/Kconfig | 43 +++ lib/crc32defs.h | 18 ++ 2 files changed, 61 insertions(+), 0 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index cfddafc..029c0e3 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -70,6 +70,49 @@ config CRC32_SELFTEST and crc32_be over byte strings with random alignment and length and computes the total elapsed time and number of bytes processed. +choice + prompt CRC32 implementation + depends on CRC32 + default CRC32_SLICEBY8 + +config CRC32_SLICEBY8 + bool Slice by 8 bytes + help + Calculate checksum 8 bytes at a time with a clever slicing algorithm. + This is the fastest algorithm, but comes with a 8KiB lookup table. + Most modern processors have enough cache to hold this table without + thrashing the cache. + + This is the default implementation choice. Choose this one unless + you have a good reason not to. + +config CRC32_SLICEBY4 + bool Slice by 4 bytes + help + Calculate checksum 4 bytes at a time with a clever slicing algorithm. + This is a bit slower than slice by 8, but has a smaller 4KiB lookup + table. + + Only choose this option if you know what you are doing. + +config CRC32_SARWATE + bool Sarwate's Algorithm (one byte at a time) + help + Calculate checksum a byte at a time using Sarwate's algorithm. This + is not particularly fast, but has a small 256 byte lookup table. + + Only choose this option if you know what you are doing. + +config CRC32_BIT + bool Classic Algorithm (one bit at a time) + help + Calculate checksum one bit at a time. 
This is VERY slow, but has + no lookup table. This is provided as a debugging option. + + Only choose this option if you are debugging crc32. + +endchoice + config CRC7 tristate CRC7 functions help diff --git a/lib/crc32defs.h b/lib/crc32defs.h index 6fd1917..64cba2c 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -13,6 +13,24 @@ */ #define CRC32C_POLY_LE 0x82F63B78 +/* Try to choose an implementation variant via Kconfig */ +#ifdef CONFIG_CRC32_SLICEBY8 +# define CRC_LE_BITS 64 +# define CRC_BE_BITS 64 +#endif +#ifdef CONFIG_CRC32_SLICEBY4 +# define CRC_LE_BITS 32 +# define CRC_BE_BITS 32 +#endif +#ifdef CONFIG_CRC32_SARWATE +# define CRC_LE_BITS 8
Re: [PATCH v5.2 00/14] crc32c: Add faster algorithm and self-test code
On Fri, Dec 02, 2011 at 08:23:58AM +0800, Herbert Xu wrote: On Thu, Dec 01, 2011 at 12:31:22PM -0800, Darrick J. Wong wrote: . They seem to call crc32c(), which is in crypto/crc32c. If you're interested in Nope, the crypto API layer will use the SSE implementation where available. Only when it isn't available will the C version in crypto/ be used. There's a SSE version other than what's in crc32c-intel? (I suspect we're talking about the same thing?) --D Cheers, -- Email: Herbert Xu herb...@gondor.apana.org.au Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 14/14] crc32: Select an algorithm via kconfig
On Fri, Dec 02, 2011 at 08:25:05AM +0800, Herbert Xu wrote: On Thu, Dec 01, 2011 at 12:15:17PM -0800, Darrick J. Wong wrote: Allow the kernel builder to choose a crc32* algorithm for the kernel. Signed-off-by: Darrick J. Wong djw...@us.ibm.com I don't like this at all. How do you expect distros or indeed anyone to make this choice? For generic C implementations like this we should only have one, and not many. Slice-by-8 should be picked automatically if the builder doesn't explicitly pick another one. The other choices are provided for people who want a slimmer cache footprint. I guess I could make the Kconfig file a bit more explicit about slice-by-8 being default, or I guess we could just ignore this one patch and thereby keeping us with the old method where anyone who wants the slimmer implementations patches the #defines. --D Cheers, -- Email: Herbert Xu herb...@gondor.apana.org.au Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5.1 00/14] crc32c: Add faster algorithm and self-test code
On Wed, Nov 30, 2011 at 02:29:11PM -0800, Andrew Morton wrote: On Mon, 28 Nov 2011 14:36:59 -0800 Darrick J. Wong djw...@us.ibm.com wrote: This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a software crc32c implementation. I think the attributions here are all messed up. As the patches stand, it appears that you wrote all of them. But I don't think that is the case. If Bob wrote a particular patch then that patch should be sent with a From: Bob Pearson rpear...@systemfabricworks.com right at the start of the changelog so that he is recorded as the primary author. If the email sender (ie: you) was the primary author then this attribution can be omitted and we fall back to using the From: from the email headers. Also, every one of these patches should have your own signed-off-by, regardless of its authorship. For reasons explained in Documentation/SubmittingPatches, section 12. Please fix these things up and resend. Also, it would be conventional and useful if each patch title was prefixed by its subsystem identifier. ie, "removed two instances of trailing whitespaces" should be titled "crc32: remove two instances of trailing whitespace" or "lib/crc32.c: remove two instances of trailing whitespaces". Okay, I'll massage the changelogs to give them more descriptive subjects, and fix the attribution chain. Thank you for the feedback. --D -- To unsubscribe from this list: send the line unsubscribe linux-ext4 in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 07/14] crc32: Make CRC_*_BITS definition correspond to actual bit counts
crc32.c provides a choice of one of several algorithms for computing the LSB and MSB versions of the CRC32 checksum based on the parameters CRC_LE_BITS and CRC_BE_BITS. In the original version the values 1, 2, 4 and 8 respectively selected versions of the algorithm that computed the crc 1, 2, 4 and 32 bits at a time. This patch series adds a new version that computes the CRC 64 bits at a time. To make things easier to understand, the parameter has been reinterpreted to actually stand for the number of bits processed in each step of the algorithm so that the old value 8 has been replaced with the value 32. This also allows us to add in a widely used crc algorithm that computes the crc 8 bits at a time called the Sarwate algorithm. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 17 ++--- lib/crc32defs.h | 18 ++ lib/gen_crc32table.c | 11 ++- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index ff6bb9a..157b35f 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -27,13 +27,13 @@ #include linux/types.h #include crc32defs.h -#if CRC_LE_BITS == 8 +#if CRC_LE_BITS 8 # define tole(x) (__force u32) __constant_cpu_to_le32(x) #else # define tole(x) (x) #endif -#if CRC_BE_BITS == 8 +#if CRC_BE_BITS 8 # define tobe(x) (__force u32) __constant_cpu_to_be32(x) #else # define tobe(x) (x) @@ -45,7 +45,7 @@ MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com); MODULE_DESCRIPTION(Ethernet CRC32 calculations); MODULE_LICENSE(GPL); -#if CRC_LE_BITS == 8 || CRC_BE_BITS == 8 +#if CRC_LE_BITS 8 || CRC_BE_BITS 8 static inline u32 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) @@ -126,6 +126,12 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) crc = (crc 4) ^ crc32table_le[0][crc 15]; } # elif CRC_LE_BITS == 8 + /* aka Sarwate algorithm */ + 
while (len--) { + crc ^= *p++; + crc = (crc 8) ^ crc32table_le[0][crc 255]; + } +# else const u32 (*tab)[] = crc32table_le; crc = (__force u32) __cpu_to_le32(crc); @@ -169,6 +175,11 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) crc = (crc 4) ^ crc32table_be[0][crc 28]; } # elif CRC_BE_BITS == 8 + while (len--) { + crc ^= *p++ 24; + crc = (crc 8) ^ crc32table_be[0][crc 24]; + } +# else const u32 (*tab)[] = crc32table_be; crc = (__force u32) __cpu_to_be32(crc); diff --git a/lib/crc32defs.h b/lib/crc32defs.h index f5a5401..daa3a5e 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -6,27 +6,29 @@ #define CRCPOLY_LE 0xedb88320 #define CRCPOLY_BE 0x04c11db7 -/* How many bits at a time to use. Requires a table of 4CRC_xx_BITS bytes. */ -/* For less performance-sensitive, use 4 */ +/* How many bits at a time to use. Valid values are 1, 2, 4, 8, and 32. */ +/* For less performance-sensitive, use 4 or 8 */ #ifndef CRC_LE_BITS -# define CRC_LE_BITS 8 +# define CRC_LE_BITS 32 #endif #ifndef CRC_BE_BITS -# define CRC_BE_BITS 8 +# define CRC_BE_BITS 32 #endif /* * Little-endian CRC computation. Used with serial bit streams sent * lsbit-first. Be sure to use cpu_to_le32() to append the computed CRC. */ -#if CRC_LE_BITS 8 || CRC_LE_BITS 1 || CRC_LE_BITS CRC_LE_BITS-1 -# error CRC_LE_BITS must be a power of 2 between 1 and 8 +#if CRC_LE_BITS 32 || CRC_LE_BITS 1 || CRC_LE_BITS == 16 || \ + CRC_LE_BITS CRC_LE_BITS-1 +# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32} #endif /* * Big-endian CRC computation. Used with serial bit streams sent * msbit-first. Be sure to use cpu_to_be32() to append the computed CRC. 
*/ -#if CRC_BE_BITS 8 || CRC_BE_BITS 1 || CRC_BE_BITS CRC_BE_BITS-1 -# error CRC_BE_BITS must be a power of 2 between 1 and 8 +#if CRC_BE_BITS 32 || CRC_BE_BITS 1 || CRC_BE_BITS == 16 || \ + CRC_BE_BITS CRC_BE_BITS-1 +# error CRC_BE_BITS must be one of {1, 2, 4, 8, 32} #endif diff --git a/lib/gen_crc32table.c b/lib/gen_crc32table.c index eced769..99ac744 100644 --- a/lib/gen_crc32table.c +++ b/lib/gen_crc32table.c @@ -4,8 +4,17 @@ #define ENTRIES_PER_LINE 4 +#if CRC_LE_BITS = 8 #define LE_TABLE_SIZE (1 CRC_LE_BITS) +#else +#define LE_TABLE_SIZE 256 +#endif + +#if CRC_BE_BITS = 8 #define BE_TABLE_SIZE (1 CRC_BE_BITS) +#else +#define BE_TABLE_SIZE 256 +#endif static uint32_t crc32table_le[4][256]; static uint32_t crc32table_be[4][256]; @@ -24,7 +33,7 @@ static void crc32init_le(void) crc32table_le[0][0] = 0; - for (i = 1 (CRC_LE_BITS - 1); i; i = 1) { + for (i = LE_TABLE_SIZE 1; i; i = 1) { crc = (crc 1) ^ ((crc 1) ? CRCPOLY_LE : 0
[PATCH 10/14] crc32: Add note about this patchset to crc32.c
Some final changes - added a comment at the top of crc32.c From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 2c8e8c0..d56516d 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -1,4 +1,8 @@ /* + * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin + * cleaned up code to current version of sparse and added the slicing-by-8 + * algorithm to the closely similar existing slicing-by-4 algorithm. + * * Oct 15, 2000 Matt Domsch matt_dom...@dell.com * Nicer crc32 functions/docs submitted by li...@horizon.com. Thanks! * Code was from the public domain, copyright abandoned. Code was -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 13/14] crc32: Add self-test code for crc32c
Add self-test code for crc32c. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 363 ++- 1 files changed, 261 insertions(+), 102 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 8df9561..382fa76 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -765,113 +765,265 @@ static struct crc_test { u32 length; /* random 11 bit length of test */ u32 crc_le; /* expected crc32_le result */ u32 crc_be; /* expected crc32_be result */ + u32 crc32c_le; /* expected crc32c_le result */ } test[] = { - {0x674bf11d, 0x0038, 0x0542, 0x0af6d466, 0xd8b6e4c1}, - {0x35c672c6, 0x003a, 0x01aa, 0xc6d3dfba, 0x28aaf3ad}, - {0x496da28e, 0x0039, 0x05af, 0xd933660f, 0x5d57e81f}, - {0x09a9b90e, 0x0027, 0x01f8, 0xb45fe007, 0xf45fca9a}, - {0xdc97e5a9, 0x0025, 0x03b6, 0xf81a3562, 0xe0126ba2}, - {0x47c58900, 0x000a, 0x00b9, 0x8e58eccf, 0xf3afc793}, - {0x292561e8, 0x000c, 0x0403, 0xa2ba8aaf, 0x0b797aed}, - {0x415037f6, 0x0003, 0x0676, 0xa17d52e8, 0x7f0fdf35}, - {0x3466e707, 0x0026, 0x0042, 0x258319be, 0x75c484a2}, - {0xafd1281b, 0x0023, 0x02ee, 0x4428eaf8, 0x06c7ad10}, - {0xd3857b18, 0x0028, 0x04a2, 0x5c430821, 0xb062b7cb}, - {0x1d825a8f, 0x002b, 0x050b, 0xd2c45f0c, 0xd68634e0}, - {0x5033e3bc, 0x000b, 0x0078, 0xa3ea4113, 0xac6d31fb}, - {0x94f1fb5e, 0x000f, 0x03a2, 0xfbfc50b1, 0x3cfe50ed}, - {0xc9a0fe14, 0x0009, 0x0473, 0x5fb61894, 0x87070591}, - {0x88a034b1, 0x001c, 0x05ad, 0xc1b16053, 0x46f95c67}, - {0xf0f72239, 0x0020, 0x026d, 0xa6fa58f3, 0xf8c2c1dd}, - {0xcc20a5e3, 0x003b, 0x067a, 0x7740185a, 0x308b979a}, - {0xce589c95, 0x002b, 0x0641, 0xd055e987, 0x40aae25b}, - {0x78edc885, 0x0035, 0x05be, 0xa39cb14b, 0x035b0d1f}, - {0x9d40a377, 0x003b, 0x0038, 0x1f47ccd2, 0x197fbc9d}, - {0x703d0e01, 0x003c, 0x06f1, 0x88735e7c, 0xfed57c5a}, - {0x776bf505, 0x000f, 0x05b2, 0x5cc4fc01, 0xf32efb97}, - {0x4a3e7854, 0x0027, 0x04b8, 0x8d923c82, 0x0cbfb4a2}, - {0x209172dd, 0x003b, 0x0356, 0xb89e9c2b, 0xd7868138}, - {0x3ba4cc5b, 0x002f, 0x0203, 0xe51601a9, 0x5b2a1032}, - {0xfc62f297, 0x, 0x0079, 
0x71a8e1a2, 0x5d88685f}, - {0x64280b8b, 0x0016, 0x07ab, 0x0fa7a30c, 0xda3a455f}, - {0x97dd724b, 0x0033, 0x07ad, 0x5788b2f4, 0xd7326d32}, - {0x61394b52, 0x0035, 0x0571, 0xc66525f1, 0xcabe7fef}, - {0x29b4faff, 0x0024, 0x006e, 0xca13751e, 0x993648e0}, - {0x29bfb1dc, 0x000b, 0x0244, 0x436c43f7, 0x429f7a59}, - {0x86ae934b, 0x0035, 0x0104, 0x0760ec93, 0x9cf7d0f4}, - {0xc4c1024e, 0x002e, 0x06b1, 0x6516a3ec, 0x19321f9c}, - {0x3287a80a, 0x0026, 0x0496, 0x0b257eb1, 0x754ebd51}, - {0xa4db423e, 0x0023, 0x045d, 0x9b3a66dc, 0x873e9f11}, - {0x7a1078df, 0x0015, 0x014a, 0x8c2484c5, 0x6a628659}, - {0x6048bd5b, 0x0006, 0x006a, 0x897e3559, 0xac9961af}, - {0xd8f9ea20, 0x003d, 0x0277, 0x60eb905b, 0xed2aaf99}, - {0xea5ec3b4, 0x002a, 0x04fe, 0x869965dc, 0x6c1f833b}, - {0x2dfb005d, 0x0016, 0x0345, 0x6a3b117e, 0xf05e8521}, - {0x5a214ade, 0x0020, 0x05b6, 0x467f70be, 0xcb22ccd3}, - {0xf0ab9cca, 0x0032, 0x0515, 0xed223df3, 0x7f3ef01d}, - {0x91b444f9, 0x002e, 0x07f8, 0x84e9a983, 0x5676756f}, - {0x1b5d2ddb, 0x002e, 0x012c, 0xba638c4c, 0x3f42047b}, - {0xd824d1bb, 0x003a, 0x07b5, 0x6288653b, 0x3a3ebea0}, - {0x0470180c, 0x0034, 0x01f0, 0x9d5b80d6, 0x3de08195}, - {0xffaa3a3f, 0x0036, 0x0299, 0xf3a82ab8, 0x53e0c13d}, - {0x6406cfeb, 0x0023, 0x0600, 0xa920b8e8, 0xe4e2acf4}, - {0xb24aaa38, 0x003e, 0x04a1, 0x657cc328, 0x5077b2c3}, - {0x58b2ab7c, 0x0039, 0x02b4, 0x3a17ee7e, 0x9dcb3643}, - {0x3db85970, 0x0006, 0x02b6, 0x95268b59, 0xb9812c10}, - {0x857830c5, 0x0003, 0x0590, 0x4ef439d5, 0xf042161d}, - {0xe1fcd978, 0x003e, 0x07d8, 0xae8d8699, 0xce0a1ef5}, - {0xb982a768, 0x0016, 0x06e0, 0x62fad3df, 0x5f8a067b}, - {0x1d581ce8, 0x001e, 0x058b, 0xf0f5da53, 0x26e39eee}, - {0x2456719b, 0x0025, 0x0503, 0x4296ac64, 0xd50e4c14}, - {0xfae6d8f2, 0x, 0x055d, 0x057fdf2e, 0x2a31391a}, - {0xcba828e3, 0x0039, 0x02ce, 0xe3f22351, 0x8f00877b}, - {0x13d25952, 0x000a, 0x072d, 0x76d4b4cc, 0x5eb67ec3}, - {0x0342be3f, 0x0015, 0x0599, 0xec75d9f1, 0x9d4d2826}, - {0xeaa344e0, 0x0014, 0x04d8, 0x72a4c981, 0x2064ea06}, - {0xbbb52021, 
0x003b, 0x0272
[PATCH 14/14] crc32: Select an algorithm via kconfig
Allow the kernel builder to choose a crc32* algorithm for the kernel. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/Kconfig | 36 lib/crc32defs.h | 18 ++ 2 files changed, 54 insertions(+), 0 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index cfddafc..e9b9134 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -70,6 +70,42 @@ config CRC32_SELFTEST and crc32_be over byte strings with random alignment and length and computes the total elapsed time and number of bytes processed. +choice + prompt CRC32 implementation + depends on CRC32 + default CRC32_SLICEBY8 + +config CRC32_SLICEBY8 + bool Slice by 8 bytes + help + Calculate checksum 8 bytes at a time with a clever slicing algorithm. + This is the fastest algorithm, but comes with a 8KiB lookup table. + Most modern processors have enough cache that this shouldn't be + a problem. + + If you don't know which to choose, choose this one. + +config CRC32_SLICEBY4 + bool Slice by 4 bytes + help + Calculate checksum 4 bytes at a time with a clever slicing algorithm. + This is a bit slower than slice by 8, but has a smaller 4KiB lookup + table. + +config CRC32_SARWATE + bool Sarwate's Algorithm (one byte at a time) + help + Calculate checksum a byte at a time using Sarwate's algorithm. This + is not particularly fast, but has a small 256 byte lookup table. + +config CRC32_BIT + bool Classic Algorithm (one bit at a time) + help + Calculate checksum one bit at a time. This is VERY slow, but has + no lookup table. This is provided as a debugging option. 
+ +endchoice + config CRC7 tristate CRC7 functions help diff --git a/lib/crc32defs.h b/lib/crc32defs.h index 6fd1917..64cba2c 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -13,6 +13,24 @@ */ #define CRC32C_POLY_LE 0x82F63B78 +/* Try to choose an implementation variant via Kconfig */ +#ifdef CONFIG_CRC32_SLICEBY8 +# define CRC_LE_BITS 64 +# define CRC_BE_BITS 64 +#endif +#ifdef CONFIG_CRC32_SLICEBY4 +# define CRC_LE_BITS 32 +# define CRC_BE_BITS 32 +#endif +#ifdef CONFIG_CRC32_SARWATE +# define CRC_LE_BITS 8 +# define CRC_BE_BITS 8 +#endif +#ifdef CONFIG_CRC32_BIT +# define CRC_LE_BITS 1 +# define CRC_BE_BITS 1 +#endif + /* * How many bits at a time to use. Valid values are 1, 2, 4, 8, 32 and 64. * For less performance-sensitive, use 4 or 8 to save table size. -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 11/14] crc32: Bolt on crc32c
Reuse the existing crc32 code to stamp out a crc32c implementation. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- include/linux/crc32.h |2 ++ lib/Kconfig |8 +++--- lib/crc32.c | 62 +++-- lib/crc32defs.h |7 ++ lib/gen_crc32table.c | 35 ++-- 5 files changed, 80 insertions(+), 34 deletions(-) diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 391a259..68267b6 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -11,6 +11,8 @@ extern u32 crc32_le(u32 crc, unsigned char const *p, size_t len); extern u32 crc32_be(u32 crc, unsigned char const *p, size_t len); +extern u32 __crc32c_le(u32 crc, unsigned char const *p, size_t len); + #define crc32(seed, data, length) crc32_le(seed, (unsigned char const *)(data), length) /* diff --git a/lib/Kconfig b/lib/Kconfig index 2bc5834..cfddafc 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -51,14 +51,14 @@ config CRC_ITU_T functions require M here. config CRC32 - tristate CRC32 functions + tristate CRC32/CRC32c functions default y select BITREVERSE help This option is provided for the case where no in-kernel-tree - modules require CRC32 functions, but a module built outside the - kernel tree does. Such modules that use library CRC32 functions - require M here. + modules require CRC32/CRC32c functions, but a module built outside + the kernel tree does. Such modules that use library CRC32/CRC32c + functions require M here. 
config CRC32_SELFTEST bool CRC32 perform self test on init diff --git a/lib/crc32.c b/lib/crc32.c index d56516d..8df9561 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -46,7 +46,7 @@ #include crc32table.h MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com); -MODULE_DESCRIPTION(Ethernet CRC32 calculations); +MODULE_DESCRIPTION(Various CRC32 calculations); MODULE_LICENSE(GPL); #if CRC_LE_BITS 8 || CRC_BE_BITS 8 @@ -135,46 +135,57 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) * @p: pointer to buffer over which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, + size_t len, const u32 (*tab)[256], + u32 polynomial) { #if CRC_LE_BITS == 1 int i; while (len--) { crc ^= *p++; for (i = 0; i 8; i++) - crc = (crc 1) ^ ((crc 1) ? CRCPOLY_LE : 0); + crc = (crc 1) ^ ((crc 1) ? polynomial : 0); } # elif CRC_LE_BITS == 2 while (len--) { crc ^= *p++; - crc = (crc 2) ^ crc32table_le[0][crc 3]; - crc = (crc 2) ^ crc32table_le[0][crc 3]; - crc = (crc 2) ^ crc32table_le[0][crc 3]; - crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; } # elif CRC_LE_BITS == 4 while (len--) { crc ^= *p++; - crc = (crc 4) ^ crc32table_le[0][crc 15]; - crc = (crc 4) ^ crc32table_le[0][crc 15]; + crc = (crc 4) ^ tab[0][crc 15]; + crc = (crc 4) ^ tab[0][crc 15]; } # elif CRC_LE_BITS == 8 /* aka Sarwate algorithm */ while (len--) { crc ^= *p++; - crc = (crc 8) ^ crc32table_le[0][crc 255]; + crc = (crc 8) ^ tab[0][crc 255]; } # else - const u32 (*tab)[] = crc32table_le; - crc = (__force u32) __cpu_to_le32(crc); crc = crc32_body(crc, p, len, tab); crc = __le32_to_cpu((__force __le32)crc); #endif return crc; } + +u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, 
crc32table_le, CRCPOLY_LE); +} EXPORT_SYMBOL(crc32_le); +u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE); +} +EXPORT_SYMBOL(__crc32c_le); + /** * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for @@ -182,7 +193,9 @@ EXPORT_SYMBOL(crc32_le); * @p: pointer to buffer over which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) +static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, + size_t len, const u32 (*tab)[256
[PATCH 02/14] crc32: Move long comment about crc32 fundamentals to Documentation/
Moved a long comment from lib/crc32.c to Documentation/crc32.txt where it will more likely get read. - Edited the resulting document to add an explanation of the slicing-by-n algorithm. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: George Spelvin li...@horizon.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- Documentation/00-INDEX |2 + Documentation/crc32.txt | 183 +++ lib/crc32.c | 129 + 3 files changed, 187 insertions(+), 127 deletions(-) create mode 100644 Documentation/crc32.txt diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 65bbd26..e7b38a0 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -104,6 +104,8 @@ cpuidle/ - info on CPU_IDLE, CPU idle state management subsystem. cputopology.txt - documentation on how CPU topology info is exported via sysfs. +crc32.txt + - brief tutorial on CRC computation cris/ - directory with info about Linux on CRIS architecture. crypto/ diff --git a/Documentation/crc32.txt b/Documentation/crc32.txt new file mode 100644 index 000..3d74ba4 --- /dev/null +++ b/Documentation/crc32.txt @@ -0,0 +1,183 @@ +A brief CRC tutorial. + +A CRC is a long-division remainder. You add the CRC to the message, +and the whole thing (message+CRC) is a multiple of the given +CRC polynomial. To check the CRC, you can either check that the +CRC matches the recomputed value, *or* you can check that the +remainder computed on the message+CRC is 0. This latter approach +is used by a lot of hardware implementations, and is why so many +protocols put the end-of-frame flag after the CRC. + +It's actually the same long division you learned in school, except that +- We're working in binary, so the digits are only 0 and 1, and +- When dividing polynomials, there are no carries. Rather than add and + subtract, we just xor. 
Thus, we tend to get a bit sloppy about + the difference between adding and subtracting. + +Like all division, the remainder is always smaller than the divisor. +To produce a 32-bit CRC, the divisor is actually a 33-bit CRC polynomial. +Since it's 33 bits long, bit 32 is always going to be set, so usually the +CRC is written in hex with the most significant bit omitted. (If you're +familiar with the IEEE 754 floating-point format, it's the same idea.) + +Note that a CRC is computed over a string of *bits*, so you have +to decide on the endianness of the bits within each byte. To get +the best error-detecting properties, this should correspond to the +order they're actually sent. For example, standard RS-232 serial is +little-endian; the most significant bit (sometimes used for parity) +is sent last. And when appending a CRC word to a message, you should +do it in the right order, matching the endianness. + +Just like with ordinary division, you proceed one digit (bit) at a time. +Each step of the division, you take one more digit (bit) of the +dividend and append it to the current remainder. Then you figure out the +appropriate multiple of the divisor to subtract to bring the remainder +back into range. In binary, this is easy - it has to be either 0 or 1, +and to make the XOR cancel, it's just a copy of bit 32 of the remainder. + +When computing a CRC, we don't care about the quotient, so we can +throw the quotient bit away, but subtract the appropriate multiple of +the polynomial from the remainder and we're back to where we started, +ready to process the next bit. + +A big-endian CRC written this way would be coded like: +for (i = 0; i < input_bits; i++) { + multiple = remainder & 0x80000000 ? CRCPOLY : 0; + remainder = (remainder << 1 | next_input_bit()) ^ multiple; +} + +Notice how, to get at bit 32 of the shifted remainder, we look +at bit 31 of the remainder *before* shifting it. 
+ +But also notice how the next_input_bit() bits we're shifting into +the remainder don't actually affect any decision-making until +32 bits later. Thus, the first 32 cycles of this are pretty boring. +Also, to add the CRC to a message, we need a 32-bit-long hole for it at +the end, so we have to add 32 extra cycles shifting in zeros at the +end of every message. + +These details lead to a standard trick: delay merging in the +next_input_bit() until the moment it's needed. Then the first 32 cycles +can be precomputed, and merging in the final 32 zero bits to make room +for the CRC can be skipped entirely. This changes the code to: + +for (i = 0; i < input_bits; i++) { + remainder ^= next_input_bit() << 31; + multiple = (remainder & 0x80000000) ? CRCPOLY : 0; + remainder = (remainder << 1) ^ multiple; +} + +With this optimization, the little-endian code is particularly simple: +for (i = 0; i < input_bits; i++) { + remainder
[PATCH 04/14] crc32: Speed up memory table access on powerpc
Replace 2D array references by pointer references in loops. This change has no effect on X86 code but improves PPC performance. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 21 +++-- 1 files changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 7a0e5a9..c93c9ae 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -53,20 +53,21 @@ static inline u32 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) { # ifdef __LITTLE_ENDIAN -# define DO_CRC(x) crc = tab[0][(crc ^ (x)) 255] ^ (crc 8) -# define DO_CRC4 crc = tab[3][(crc) 255] ^ \ - tab[2][(crc 8) 255] ^ \ - tab[1][(crc 16) 255] ^ \ - tab[0][(crc 24) 255] +# define DO_CRC(x) (crc = t0[(crc ^ (x)) 255] ^ (crc 8)) +# define DO_CRC4 crc = t3[(crc) 255] ^ \ + t2[(crc 8) 255] ^ \ + t1[(crc 16) 255] ^ \ + t0[(crc 24) 255] # else -# define DO_CRC(x) crc = tab[0][((crc 24) ^ (x)) 255] ^ (crc 8) -# define DO_CRC4 crc = tab[0][(crc) 255] ^ \ - tab[1][(crc 8) 255] ^ \ - tab[2][(crc 16) 255] ^ \ - tab[3][(crc 24) 255] +# define DO_CRC(x) (crc = t0[((crc 24) ^ (x)) 255] ^ (crc 8)) +# define DO_CRC4 crc = t0[(crc) 255] ^ \ + t1[(crc 8) 255] ^ \ + t2[(crc 16) 255] ^ \ + t3[(crc 24) 255] # endif const u32 *b; size_trem_len; + const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3]; /* Align it */ if (unlikely((long)buf 3 len)) { -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5.2 00/14] crc32c: Add faster algorithm and self-test code
Hi all, This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a software crc32c implementation. It removes the crc32c implementation in crypto/ in favor of using the stamped-out one in lib/. There is also a change to Kconfig so that the kernel builder can pick an implementation best suited for the hardware. The motivation for this patchset is that I am working on adding full metadata checksumming to ext4. As far as performance impact of adding checksumming goes, I see nearly no change with a standard mail server ffsb simulation. On a test that involves only file creation and deletion and extent tree writes, I see a drop of about 50 percent with the current kernel crc32c implementation; this improves to a drop of about 20 percent with the enclosed crc32c code. For typical workloads, this new implementation doesn't help much because metadata is usually a small fraction of total IO. However, when we are doing IO that is almost all metadata (such as rm -rf'ing a tree), then this patch speeds up the operation substantially. Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this patchset should improve their speed as well. I have not yet quantified that, however. This latest submission combines Bob's patches from late August 2011 with mine so that they can be one coherent patch set. Please excuse my inability to combine some of the patches; I've been advised to leave Bob's patches alone and build atop them instead. :/ Since the last posting, I've also collected some crc32c test results on a bunch of different x86/powerpc/sparc platforms. The results can be viewed here: http://goo.gl/sgt3i ; the crc32-kern-le and crc32c columns describe the performance of the kernel's current crc32 and crc32c software implementations. The crc32c-by8-le column shows crc32c performance with this patchset applied. I expect crc32 performance to be roughly the same.
The two _boost columns at the right side of the spreadsheet show how much faster the new implementation is over the old one. As you can see, crc32 rises substantially, and crc32c experiences a huge increase. I'm hoping this patch set meets with everyone's approval and can go in soon. Herbert Xu didn't appear to have any strong objections to last month's posting, so I'm wondering if Andrew has an opinion? v2: Use the crypto testmgr api for self-test. v3: Get rid of the -be version, which had no users. v4: Allow kernel builder a choice of speed vs. space optimization. v5: Reuse lib/crc32 for crc32c as well, and make crypto/crc32c use lib/crc32.c. v5.1: Include Bob Pearson's patches in submission request. v5.2: Fix changelogs for Bob's patches per akpm request. --D -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 08/14] crc32: Add slice-by-8 algorithm to existing code
add slicing-by-8 algorithm to the existing slicing-by-4 algorithm. This consists of: - extend largest BITS size from 32 to 64 - extend tables from tab[4][256] to up to tab[8][256] - Add code for inner loop. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 40 lib/crc32defs.h | 29 + lib/gen_crc32table.c | 43 +++ 3 files changed, 76 insertions(+), 36 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 157b35f..6311712 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -47,25 +47,28 @@ MODULE_LICENSE(GPL); #if CRC_LE_BITS 8 || CRC_BE_BITS 8 +/* implements slicing-by-4 or slicing-by-8 algorithm */ static inline u32 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) { # ifdef __LITTLE_ENDIAN # define DO_CRC(x) (crc = t0[(crc ^ (x)) 255] ^ (crc 8)) -# define DO_CRC4 crc = t3[(crc) 255] ^ \ - t2[(crc 8) 255] ^ \ - t1[(crc 16) 255] ^ \ - t0[(crc 24) 255] +# define DO_CRC4 (t3[(q) 255] ^ t2[(q 8) 255] ^ \ + t1[(q 16) 255] ^ t0[(q 24) 255]) +# define DO_CRC8 (t7[(q) 255] ^ t6[(q 8) 255] ^ \ + t5[(q 16) 255] ^ t4[(q 24) 255]) # else # define DO_CRC(x) (crc = t0[((crc 24) ^ (x)) 255] ^ (crc 8)) -# define DO_CRC4 crc = t0[(crc) 255] ^ \ - t1[(crc 8) 255] ^ \ - t2[(crc 16) 255] ^ \ - t3[(crc 24) 255] +# define DO_CRC4 (t0[(q) 255] ^ t1[(q 8) 255] ^ \ + t2[(q 16) 255] ^ t3[(q 24) 255]) +# define DO_CRC8 (t4[(q) 255] ^ t5[(q 8) 255] ^ \ + t6[(q 16) 255] ^ t7[(q 24) 255]) # endif const u32 *b; - size_trem_len; + size_t rem_len; const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3]; + const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7]; + u32 q; /* Align it */ if (unlikely((long)buf 3 len)) { @@ -73,13 +76,25 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) DO_CRC(*buf++); } while ((--len) ((long)buf)3); } + +# if CRC_LE_BITS == 
32 rem_len = len 3; - /* load data 32 bits wide, xor data 32 bits wide. */ len = len 2; +# else + rem_len = len 7; + len = len 3; +# endif + b = (const u32 *)buf; for (--b; len; --len) { - crc ^= *++b; /* use pre increment for speed */ - DO_CRC4; + q = crc ^ *++b; /* use pre increment for speed */ +# if CRC_LE_BITS == 32 + crc = DO_CRC4; +# else + crc = DO_CRC8; + q = *++b; + crc ^= DO_CRC4; +# endif } len = rem_len; /* And the last few bytes */ @@ -92,6 +107,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) return crc; #undef DO_CRC #undef DO_CRC4 +#undef DO_CRC8 } #endif diff --git a/lib/crc32defs.h b/lib/crc32defs.h index daa3a5e..8181592 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -6,29 +6,42 @@ #define CRCPOLY_LE 0xedb88320 #define CRCPOLY_BE 0x04c11db7 -/* How many bits at a time to use. Valid values are 1, 2, 4, 8, and 32. */ -/* For less performance-sensitive, use 4 or 8 */ +/* + * How many bits at a time to use. Valid values are 1, 2, 4, 8, 32 and 64. + * For less performance-sensitive, use 4 or 8 to save table size. + * For larger systems choose same as CPU architecture as default. + * This works well on X86_64, SPARC64 systems. This may require some + * elaboration after experiments with other architectures. + */ #ifndef CRC_LE_BITS -# define CRC_LE_BITS 32 +# ifdef CONFIG_64BIT +# define CRC_LE_BITS 64 +# else +# define CRC_LE_BITS 32 +# endif #endif #ifndef CRC_BE_BITS -# define CRC_BE_BITS 32 +# ifdef CONFIG_64BIT +# define CRC_BE_BITS 64 +# else +# define CRC_BE_BITS 32 +# endif #endif /* * Little-endian CRC computation. Used with serial bit streams sent * lsbit-first. Be sure to use cpu_to_le32() to append the computed CRC. 
*/ -#if CRC_LE_BITS 32 || CRC_LE_BITS 1 || CRC_LE_BITS == 16 || \ +#if CRC_LE_BITS 64 || CRC_LE_BITS 1 || CRC_LE_BITS == 16 || \ CRC_LE_BITS CRC_LE_BITS-1 -# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32} +# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32, 64} #endif /* * Big-endian CRC computation. Used with serial bit streams sent * msbit-first. Be sure to use cpu_to_be32() to append the computed CRC. */ -#if CRC_BE_BITS 32 || CRC_BE_BITS 1 || CRC_BE_BITS == 16
[PATCH 06/14] crc32: Fix mixing of endian-specific types
crc32.c in its original version freely mixed u32, __le32 and __be32 types which caused warnings from sparse with __CHECK_ENDIAN__. This patch fixes these by forcing the types to u32. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 12 ++-- 1 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 2a87ea2..ff6bb9a 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -28,13 +28,13 @@ #include crc32defs.h #if CRC_LE_BITS == 8 -# define tole(x) __constant_cpu_to_le32(x) +# define tole(x) (__force u32) __constant_cpu_to_le32(x) #else # define tole(x) (x) #endif #if CRC_BE_BITS == 8 -# define tobe(x) __constant_cpu_to_be32(x) +# define tobe(x) (__force u32) __constant_cpu_to_be32(x) #else # define tobe(x) (x) #endif @@ -128,9 +128,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) # elif CRC_LE_BITS == 8 const u32 (*tab)[] = crc32table_le; - crc = __cpu_to_le32(crc); + crc = (__force u32) __cpu_to_le32(crc); crc = crc32_body(crc, p, len, tab); - crc = __le32_to_cpu(crc); + crc = __le32_to_cpu((__force __le32)crc); #endif return crc; } @@ -171,9 +171,9 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) # elif CRC_BE_BITS == 8 const u32 (*tab)[] = crc32table_be; - crc = __cpu_to_be32(crc); + crc = (__force u32) __cpu_to_be32(crc); crc = crc32_body(crc, p, len, tab); - crc = __be32_to_cpu(crc); + crc = __be32_to_cpu((__force __be32)crc); # endif return crc; } -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 05/14] crc32: Miscellaneous cleanups
Misc cleanup of lib/crc32.c and related files - removed unnecessary header files. - straightened out some convoluted ifdef's - rewrote some references to 2 dimensional arrays as 1 dimensional arrays to make them correct. I.e. replaced tab[i] with tab[0][i]. - a few trivial whitespace changes - fixed a warning in gen_crc32tables.c caused by a mismatch in the type of the pointer passed to output table. Since the table is only used at kernel compile time, it is simpler to make the table big enough to hold the largest column size used. One cannot make the column size smaller in output_table because it has to be used by both the le and be tables and they can have different column sizes. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 104 +- lib/gen_crc32table.c |6 +-- 2 files changed, 39 insertions(+), 71 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index c93c9ae..2a87ea2 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -23,13 +23,10 @@ /* see: Documentation/crc32.txt for a description of algorithms */ #include linux/crc32.h -#include linux/kernel.h #include linux/module.h -#include linux/compiler.h #include linux/types.h -#include linux/init.h -#include linux/atomic.h #include crc32defs.h + #if CRC_LE_BITS == 8 # define tole(x) __constant_cpu_to_le32(x) #else @@ -41,6 +38,7 @@ #else # define tobe(x) (x) #endif + #include crc32table.h MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com); @@ -96,6 +94,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) #undef DO_CRC4 } #endif + /** * crc32_le() - Calculate bitwise little-endian Ethernet AUTODIN II CRC32 * @crc: seed value for computation. 
~0 for Ethernet, sometimes 0 for @@ -103,53 +102,39 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) * @p: pointer to buffer over which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len); - -#if CRC_LE_BITS == 1 -/* - * In fact, the table-based code will work in this case, but it can be - * simplified by inlining the table in ?: form. - */ - u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) { +#if CRC_LE_BITS == 1 int i; while (len--) { crc ^= *p++; for (i = 0; i 8; i++) crc = (crc 1) ^ ((crc 1) ? CRCPOLY_LE : 0); } - return crc; -} -#else /* Table-based approach */ - -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) -{ -# if CRC_LE_BITS == 8 - const u32 (*tab)[] = crc32table_le; - - crc = __cpu_to_le32(crc); - crc = crc32_body(crc, p, len, tab); - return __le32_to_cpu(crc); -# elif CRC_LE_BITS == 4 +# elif CRC_LE_BITS == 2 while (len--) { crc ^= *p++; - crc = (crc 4) ^ crc32table_le[crc 15]; - crc = (crc 4) ^ crc32table_le[crc 15]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; } - return crc; -# elif CRC_LE_BITS == 2 +# elif CRC_LE_BITS == 4 while (len--) { crc ^= *p++; - crc = (crc 2) ^ crc32table_le[crc 3]; - crc = (crc 2) ^ crc32table_le[crc 3]; - crc = (crc 2) ^ crc32table_le[crc 3]; - crc = (crc 2) ^ crc32table_le[crc 3]; + crc = (crc 4) ^ crc32table_le[0][crc 15]; + crc = (crc 4) ^ crc32table_le[0][crc 15]; } +# elif CRC_LE_BITS == 8 + const u32 (*tab)[] = crc32table_le; + + crc = __cpu_to_le32(crc); + crc = crc32_body(crc, p, len, tab); + crc = __le32_to_cpu(crc); +#endif return crc; -# endif } -#endif +EXPORT_SYMBOL(crc32_le); /** * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 @@ -158,16 +143,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) * @p: pointer to buffer over 
which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len); - -#if CRC_BE_BITS == 1 -/* - * In fact, the table-based code will work in this case, but it can be - * simplified by inlining the table in ?: form. - */ - u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) { +#if CRC_BE_BITS == 1 int i; while (len--) { crc ^= *p++ 24; @@ -176,39 +154,29 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len
[PATCH 03/14] crc32: Simplify unit test code
Replaced the unit test provided in crc32.c, which doesn't have a makefile and doesn't compile with current headers, with a simpler self test routine that also gives a measure of performance and runs at module init time. The self test option can be enabled through a configuration option CONFIG_CRC32_SELFTEST. The test stresses the pre and post loops and is thus not very realistic since actual uses will likely have addresses and lengths that are at least 4 byte aligned. However, the main loop is long enough so that the performance is dominated by that loop. The expected values for crc32_le and crc32_be were generated with the original version of crc32.c using CRC_BITS_LE = 8 and CRC_BITS_BE = 8. These values were then used to check all the values of the BITS parameters in both the original and new versions. The performance results show some variability from run to run in spite of attempts to both warm the cache and reduce the amount of OS noise by limiting interrupts during the test. To get comparable results and to analyse options wrt performance the best time reported over a small sample of runs has been taken. From: Bob Pearson rpear...@systemfabricworks.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com [djw...@us.ibm.com: Minor changelog tweaks] Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/Kconfig | 10 + lib/crc32.c | 798 ++- 2 files changed, 691 insertions(+), 117 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index 32f3e5a..2bc5834 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -60,6 +60,16 @@ config CRC32 kernel tree does. Such modules that use library CRC32 functions require M here. +config CRC32_SELFTEST + bool CRC32 perform self test on init + default n + depends on CRC32 + help + This option enables the CRC32 library functions to perform a + self test on initialization.
The self test computes crc32_le + and crc32_be over byte strings with random alignment and length + and computes the total elapsed time and number of bytes processed. + config CRC7 tristate CRC7 functions help diff --git a/lib/crc32.c b/lib/crc32.c index 7ac8b0d..7a0e5a9 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -210,137 +210,701 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) EXPORT_SYMBOL(crc32_le); EXPORT_SYMBOL(crc32_be); -#ifdef UNITTEST +#ifdef CONFIG_CRC32_SELFTEST -#include stdlib.h -#include stdio.h - -#if 0 /*Not used at present */ -static void -buf_dump(char const *prefix, unsigned char const *buf, size_t len) +/* 4096 random bytes */ +static u8 __attribute__((__aligned__(8))) test_buf[] = { - fputs(prefix, stdout); - while (len--) - printf( %02x, *buf++); - putchar('\n'); - -} -#endif - -static void bytereverse(unsigned char *buf, size_t len) + 0x5b, 0x85, 0x21, 0xcb, 0x09, 0x68, 0x7d, 0x30, + 0xc7, 0x69, 0xd7, 0x30, 0x92, 0xde, 0x59, 0xe4, + 0xc9, 0x6e, 0x8b, 0xdb, 0x98, 0x6b, 0xaa, 0x60, + 0xa8, 0xb5, 0xbc, 0x6c, 0xa9, 0xb1, 0x5b, 0x2c, + 0xea, 0xb4, 0x92, 0x6a, 0x3f, 0x79, 0x91, 0xe4, + 0xe9, 0x70, 0x51, 0x8c, 0x7f, 0x95, 0x6f, 0x1a, + 0x56, 0xa1, 0x5c, 0x27, 0x03, 0x67, 0x9f, 0x3a, + 0xe2, 0x31, 0x11, 0x29, 0x6b, 0x98, 0xfc, 0xc4, + 0x53, 0x24, 0xc5, 0x8b, 0xce, 0x47, 0xb2, 0xb9, + 0x32, 0xcb, 0xc1, 0xd0, 0x03, 0x57, 0x4e, 0xd4, + 0xe9, 0x3c, 0xa1, 0x63, 0xcf, 0x12, 0x0e, 0xca, + 0xe1, 0x13, 0xd1, 0x93, 0xa6, 0x88, 0x5c, 0x61, + 0x5b, 0xbb, 0xf0, 0x19, 0x46, 0xb4, 0xcf, 0x9e, + 0xb6, 0x6b, 0x4c, 0x3a, 0xcf, 0x60, 0xf9, 0x7a, + 0x8d, 0x07, 0x63, 0xdb, 0x40, 0xe9, 0x0b, 0x6f, + 0xad, 0x97, 0xf1, 0xed, 0xd0, 0x1e, 0x26, 0xfd, + 0xbf, 0xb7, 0xc8, 0x04, 0x94, 0xf8, 0x8b, 0x8c, + 0xf1, 0xab, 0x7a, 0xd4, 0xdd, 0xf3, 0xe8, 0x88, + 0xc3, 0xed, 0x17, 0x8a, 0x9b, 0x40, 0x0d, 0x53, + 0x62, 0x12, 0x03, 0x5f, 0x1b, 0x35, 0x32, 0x1f, + 0xb4, 0x7b, 0x93, 0x78, 0x0d, 0xdb, 0xce, 0xa4, + 0xc0, 0x47, 0xd5, 0xbf, 0x68, 0xe8, 0x5d, 0x74, + 0x8f, 
0x8e, 0x75, 0x1c, 0xb2, 0x4f, 0x9a, 0x60, + 0xd1, 0xbe, 0x10, 0xf4, 0x5c, 0xa1, 0x53, 0x09, + 0xa5, 0xe0, 0x09, 0x54, 0x85, 0x5c, 0xdc, 0x07, + 0xe7, 0x21, 0x69, 0x7b, 0x8a, 0xfd, 0x90, 0xf1, + 0x22, 0xd0, 0xb4, 0x36, 0x28, 0xe6, 0xb8, 0x0f, + 0x39, 0xde, 0xc8, 0xf3, 0x86, 0x60, 0x34, 0xd2, + 0x5e, 0xdf, 0xfd, 0xcf, 0x0f, 0xa9, 0x65, 0xf0, + 0xd5, 0x4d, 0x96, 0x40, 0xe3, 0xdf, 0x3f, 0x95, + 0x5a, 0x39, 0x19, 0x93, 0xf4, 0x75, 0xce, 0x22, + 0x00, 0x1c, 0x93, 0xe2, 0x03, 0x66, 0xf4, 0x93, + 0x73, 0x86, 0x81, 0x8e, 0x29, 0x44, 0x48, 0x86, + 0x61, 0x7c, 0x48, 0xa3, 0x43, 0xd2, 0x9c, 0x8d, + 0xd4, 0x95, 0xdd, 0xe1, 0x22, 0x89, 0x3a, 0x40, + 0x4c, 0x1b, 0x8a, 0x04, 0xa8, 0x09, 0x69, 0x8b, + 0xea, 0xc6, 0x55
Re: [PATCH v5.2 00/14] crc32c: Add faster algorithm and self-test code
On Thu, Dec 01, 2011 at 12:20:53PM -0800, Joel Becker wrote: On Thu, Dec 01, 2011 at 12:13:41PM -0800, Darrick J. Wong wrote: Hi all, This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a software crc32c implementation. It removes the crc32c implementation in crypto/ in favor of using the stamped-out one in lib/. There is also a change to Kconfig so that the kernel builder can pick an implementation best suited for the hardware. The motivation for this patchset is that I am working on adding full metadata checksumming to ext4. As far as performance impact of adding checksumming goes, I see nearly no change with a standard mail server ffsb simulation. On a test that involves only file creation and deletion and extent tree writes, I see a drop of about 50 pcercent with the current kernel crc32c implementation; this improves to a drop of about 20 percent with the enclosed crc32c code. When metadata is usually a small fraction of total IO, this new implementation doesn't help much because metadata is usually a small fraction of total IO. However, when we are doing IO that is almost all metadata (such as rm -rf'ing a tree), then this patch speeds up the operation substantially. Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this patchset should improve their speed as well. I have not yet quantified that, however. I thought they usually used the SSE instruction for crc32 or equivalent. They seem to call crc32c(), which is in crypto/crc32c. If you're interested in hardware accelerated crc32c on Intel, it is still the case that the wrapper for that can be loaded via crc32c-intel. --D Joel -- I almost ran over an angel He had a nice big fat cigar. 'In a sense,' he said, 'You're alone here So if you jump, you'd best jump far.' 
http://www.jlbec.org/ jl...@evilplan.org -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 02/14] Moved a long comment from lib/crc32.c to Documentation/crc32.txt
where it will more likely get read. - Edited the resulting document to add an explanation of the slicing-by-n algorithm. Signed-off-by: George Spelvin li...@horizon.com Signed-off-by: Bob Pearson rpear...@systemfabricworks.com --- Documentation/00-INDEX |2 + Documentation/crc32.txt | 183 +++ lib/crc32.c | 129 + 3 files changed, 187 insertions(+), 127 deletions(-) create mode 100644 Documentation/crc32.txt diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 65bbd26..e7b38a0 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -104,6 +104,8 @@ cpuidle/ - info on CPU_IDLE, CPU idle state management subsystem. cputopology.txt - documentation on how CPU topology info is exported via sysfs. +crc32.txt + - brief tutorial on CRC computation cris/ - directory with info about Linux on CRIS architecture. crypto/ diff --git a/Documentation/crc32.txt b/Documentation/crc32.txt new file mode 100644 index 000..3d74ba4 --- /dev/null +++ b/Documentation/crc32.txt @@ -0,0 +1,183 @@ +A brief CRC tutorial. + +A CRC is a long-division remainder. You add the CRC to the message, +and the whole thing (message+CRC) is a multiple of the given +CRC polynomial. To check the CRC, you can either check that the +CRC matches the recomputed value, *or* you can check that the +remainder computed on the message+CRC is 0. This latter approach +is used by a lot of hardware implementations, and is why so many +protocols put the end-of-frame flag after the CRC. + +It's actually the same long division you learned in school, except that +- We're working in binary, so the digits are only 0 and 1, and +- When dividing polynomials, there are no carries. Rather than add and + subtract, we just xor. Thus, we tend to get a bit sloppy about + the difference between adding and subtracting. + +Like all division, the remainder is always smaller than the divisor. +To produce a 32-bit CRC, the divisor is actually a 33-bit CRC polynomial. 
+Since it's 33 bits long, bit 32 is always going to be set, so usually the +CRC is written in hex with the most significant bit omitted. (If you're +familiar with the IEEE 754 floating-point format, it's the same idea.) + +Note that a CRC is computed over a string of *bits*, so you have +to decide on the endianness of the bits within each byte. To get +the best error-detecting properties, this should correspond to the +order they're actually sent. For example, standard RS-232 serial is +little-endian; the most significant bit (sometimes used for parity) +is sent last. And when appending a CRC word to a message, you should +do it in the right order, matching the endianness. + +Just like with ordinary division, you proceed one digit (bit) at a time. +Each step of the division, you take one more digit (bit) of the +dividend and append it to the current remainder. Then you figure out the +appropriate multiple of the divisor to subtract to bring the remainder +back into range. In binary, this is easy - it has to be either 0 or 1, +and to make the XOR cancel, it's just a copy of bit 32 of the remainder. + +When computing a CRC, we don't care about the quotient, so we can +throw the quotient bit away, but subtract the appropriate multiple of +the polynomial from the remainder and we're back to where we started, +ready to process the next bit. + +A big-endian CRC written this way would be coded like: +for (i = 0; i < input_bits; i++) { + multiple = remainder & 0x80000000 ? CRCPOLY : 0; + remainder = (remainder << 1 | next_input_bit()) ^ multiple; +} + +Notice how, to get at bit 32 of the shifted remainder, we look +at bit 31 of the remainder *before* shifting it. + +But also notice how the next_input_bit() bits we're shifting into +the remainder don't actually affect any decision-making until +32 bits later. Thus, the first 32 cycles of this are pretty boring.
+Also, to add the CRC to a message, we need a 32-bit-long hole for it at +the end, so we have to add 32 extra cycles shifting in zeros at the +end of every message. + +These details lead to a standard trick: delay merging in the +next_input_bit() until the moment it's needed. Then the first 32 cycles +can be precomputed, and merging in the final 32 zero bits to make room +for the CRC can be skipped entirely. This changes the code to: + +for (i = 0; i < input_bits; i++) { + remainder ^= next_input_bit() << 31; + multiple = (remainder & 0x80000000) ? CRCPOLY : 0; + remainder = (remainder << 1) ^ multiple; +} + +With this optimization, the little-endian code is particularly simple: +for (i = 0; i < input_bits; i++) { + remainder ^= next_input_bit(); + multiple = (remainder & 1) ? CRCPOLY : 0; + remainder = (remainder >> 1) ^ multiple; +} + +The most significant coefficient of the remainder polynomial is stored +in the least
[PATCH 05/14] Misc cleanup of lib/crc32.c and related files
- removed unnecessary header files. - straightened out some convoluted ifdef's - rewrote some references to 2 dimensional arrays as 1 dimensional arrays to make them correct. I.e. replaced tab[i] with tab[0][i]. - a few trivial whitespace changes - fixed a warning in gen_crc32tables.c caused by a mismatch in the type of the pointer passed to output table. Since the table is only used at kernel compile time, it is simpler to make the table big enough to hold the largest column size used. One cannot make the column size smaller in output_table because it has to be used by both the le and be tables and they can have different column sizes. Signed-off-by: Bob Pearson rpear...@systemfabricworks.com --- lib/crc32.c | 104 +- lib/gen_crc32table.c |6 +-- 2 files changed, 39 insertions(+), 71 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index c93c9ae..2a87ea2 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -23,13 +23,10 @@ /* see: Documentation/crc32.txt for a description of algorithms */ #include linux/crc32.h -#include linux/kernel.h #include linux/module.h -#include linux/compiler.h #include linux/types.h -#include linux/init.h -#include linux/atomic.h #include crc32defs.h + #if CRC_LE_BITS == 8 # define tole(x) __constant_cpu_to_le32(x) #else @@ -41,6 +38,7 @@ #else # define tobe(x) (x) #endif + #include crc32table.h MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com); @@ -96,6 +94,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) #undef DO_CRC4 } #endif + /** * crc32_le() - Calculate bitwise little-endian Ethernet AUTODIN II CRC32 * @crc: seed value for computation. 
~0 for Ethernet, sometimes 0 for @@ -103,53 +102,39 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) * @p: pointer to buffer over which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len); - -#if CRC_LE_BITS == 1 -/* - * In fact, the table-based code will work in this case, but it can be - * simplified by inlining the table in ?: form. - */ - u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) { +#if CRC_LE_BITS == 1 int i; while (len--) { crc ^= *p++; for (i = 0; i 8; i++) crc = (crc 1) ^ ((crc 1) ? CRCPOLY_LE : 0); } - return crc; -} -#else /* Table-based approach */ - -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) -{ -# if CRC_LE_BITS == 8 - const u32 (*tab)[] = crc32table_le; - - crc = __cpu_to_le32(crc); - crc = crc32_body(crc, p, len, tab); - return __le32_to_cpu(crc); -# elif CRC_LE_BITS == 4 +# elif CRC_LE_BITS == 2 while (len--) { crc ^= *p++; - crc = (crc 4) ^ crc32table_le[crc 15]; - crc = (crc 4) ^ crc32table_le[crc 15]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ crc32table_le[0][crc 3]; } - return crc; -# elif CRC_LE_BITS == 2 +# elif CRC_LE_BITS == 4 while (len--) { crc ^= *p++; - crc = (crc 2) ^ crc32table_le[crc 3]; - crc = (crc 2) ^ crc32table_le[crc 3]; - crc = (crc 2) ^ crc32table_le[crc 3]; - crc = (crc 2) ^ crc32table_le[crc 3]; + crc = (crc 4) ^ crc32table_le[0][crc 15]; + crc = (crc 4) ^ crc32table_le[0][crc 15]; } +# elif CRC_LE_BITS == 8 + const u32 (*tab)[] = crc32table_le; + + crc = __cpu_to_le32(crc); + crc = crc32_body(crc, p, len, tab); + crc = __le32_to_cpu(crc); +#endif return crc; -# endif } -#endif +EXPORT_SYMBOL(crc32_le); /** * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 @@ -158,16 +143,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) * @p: pointer to buffer over 
which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len); - -#if CRC_BE_BITS == 1 -/* - * In fact, the table-based code will work in this case, but it can be - * simplified by inlining the table in ?: form. - */ - u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) { +#if CRC_BE_BITS == 1 int i; while (len--) { crc ^= *p++ 24; @@ -176,39 +154,29 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) (crc 1) ^ ((crc 0x8000) ? CRCPOLY_BE : 0); } - return crc; -} - -#else /*
[PATCH 03/14] Replaced the unit test provided in crc32.c, which doesn't have a
makefile and doesn't compile with current headers, with a simpler self test routine that also gives a measure of performance and runs at module init time. The self test option can be enabled through a configuration option CONFIG_CRC32_SELFTEST. The test stresses the pre and post loops and is thus not very realistic since actual uses will likely have addresses and lengths that are at least 4 byte aligned. However, the main loop is long enough so that the performance is dominated by that loop. The expected values for crc32_le and crc32_be were generated with the original version of crc32.c using CRC_BITS_LE = 8 and CRC_BITS_BE = 8. These values were then used to check all the values of the BITS parameters in both the original and new versions. The performance results show some variability from run to run in spite of attempts to both warm the cache and reduce the amount of OS noise by limiting interrupts during the test. To get comparable results and to analyse options wrt performance the best time reported over a small sample of runs has been taken. Signed-off-by: Bob Pearson rpear...@systemfabricworks.com --- lib/Kconfig | 10 + lib/crc32.c | 798 ++- 2 files changed, 691 insertions(+), 117 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index 32f3e5a..2bc5834 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -60,6 +60,16 @@ config CRC32 kernel tree does. Such modules that use library CRC32 functions require M here. +config CRC32_SELFTEST + bool CRC32 perform self test on init + default n + depends on CRC32 + help + This option enables the CRC32 library functions to perform a + self test on initialization. The self test computes crc32_le + and crc32_be over byte strings with random alignment and length + and computes the total elapsed time and number of bytes processed.
+ config CRC7 tristate CRC7 functions help diff --git a/lib/crc32.c b/lib/crc32.c index 7ac8b0d..7a0e5a9 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -210,137 +210,701 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) EXPORT_SYMBOL(crc32_le); EXPORT_SYMBOL(crc32_be); -#ifdef UNITTEST +#ifdef CONFIG_CRC32_SELFTEST -#include stdlib.h -#include stdio.h - -#if 0 /*Not used at present */ -static void -buf_dump(char const *prefix, unsigned char const *buf, size_t len) +/* 4096 random bytes */ +static u8 __attribute__((__aligned__(8))) test_buf[] = { - fputs(prefix, stdout); - while (len--) - printf( %02x, *buf++); - putchar('\n'); - -} -#endif - -static void bytereverse(unsigned char *buf, size_t len) + 0x5b, 0x85, 0x21, 0xcb, 0x09, 0x68, 0x7d, 0x30, + 0xc7, 0x69, 0xd7, 0x30, 0x92, 0xde, 0x59, 0xe4, + 0xc9, 0x6e, 0x8b, 0xdb, 0x98, 0x6b, 0xaa, 0x60, + 0xa8, 0xb5, 0xbc, 0x6c, 0xa9, 0xb1, 0x5b, 0x2c, + 0xea, 0xb4, 0x92, 0x6a, 0x3f, 0x79, 0x91, 0xe4, + 0xe9, 0x70, 0x51, 0x8c, 0x7f, 0x95, 0x6f, 0x1a, + 0x56, 0xa1, 0x5c, 0x27, 0x03, 0x67, 0x9f, 0x3a, + 0xe2, 0x31, 0x11, 0x29, 0x6b, 0x98, 0xfc, 0xc4, + 0x53, 0x24, 0xc5, 0x8b, 0xce, 0x47, 0xb2, 0xb9, + 0x32, 0xcb, 0xc1, 0xd0, 0x03, 0x57, 0x4e, 0xd4, + 0xe9, 0x3c, 0xa1, 0x63, 0xcf, 0x12, 0x0e, 0xca, + 0xe1, 0x13, 0xd1, 0x93, 0xa6, 0x88, 0x5c, 0x61, + 0x5b, 0xbb, 0xf0, 0x19, 0x46, 0xb4, 0xcf, 0x9e, + 0xb6, 0x6b, 0x4c, 0x3a, 0xcf, 0x60, 0xf9, 0x7a, + 0x8d, 0x07, 0x63, 0xdb, 0x40, 0xe9, 0x0b, 0x6f, + 0xad, 0x97, 0xf1, 0xed, 0xd0, 0x1e, 0x26, 0xfd, + 0xbf, 0xb7, 0xc8, 0x04, 0x94, 0xf8, 0x8b, 0x8c, + 0xf1, 0xab, 0x7a, 0xd4, 0xdd, 0xf3, 0xe8, 0x88, + 0xc3, 0xed, 0x17, 0x8a, 0x9b, 0x40, 0x0d, 0x53, + 0x62, 0x12, 0x03, 0x5f, 0x1b, 0x35, 0x32, 0x1f, + 0xb4, 0x7b, 0x93, 0x78, 0x0d, 0xdb, 0xce, 0xa4, + 0xc0, 0x47, 0xd5, 0xbf, 0x68, 0xe8, 0x5d, 0x74, + 0x8f, 0x8e, 0x75, 0x1c, 0xb2, 0x4f, 0x9a, 0x60, + 0xd1, 0xbe, 0x10, 0xf4, 0x5c, 0xa1, 0x53, 0x09, + 0xa5, 0xe0, 0x09, 0x54, 0x85, 0x5c, 0xdc, 0x07, + 0xe7, 0x21, 0x69, 0x7b, 
0x8a, 0xfd, 0x90, 0xf1, + 0x22, 0xd0, 0xb4, 0x36, 0x28, 0xe6, 0xb8, 0x0f, + 0x39, 0xde, 0xc8, 0xf3, 0x86, 0x60, 0x34, 0xd2, + 0x5e, 0xdf, 0xfd, 0xcf, 0x0f, 0xa9, 0x65, 0xf0, + 0xd5, 0x4d, 0x96, 0x40, 0xe3, 0xdf, 0x3f, 0x95, + 0x5a, 0x39, 0x19, 0x93, 0xf4, 0x75, 0xce, 0x22, + 0x00, 0x1c, 0x93, 0xe2, 0x03, 0x66, 0xf4, 0x93, + 0x73, 0x86, 0x81, 0x8e, 0x29, 0x44, 0x48, 0x86, + 0x61, 0x7c, 0x48, 0xa3, 0x43, 0xd2, 0x9c, 0x8d, + 0xd4, 0x95, 0xdd, 0xe1, 0x22, 0x89, 0x3a, 0x40, + 0x4c, 0x1b, 0x8a, 0x04, 0xa8, 0x09, 0x69, 0x8b, + 0xea, 0xc6, 0x55, 0x8e, 0x57, 0xe6, 0x64, 0x35, + 0xf0, 0xc7, 0x16, 0x9f, 0x5d, 0x5e, 0x86, 0x40, + 0x46, 0xbb, 0xe5, 0x45, 0x88, 0xfe, 0xc9, 0x63, + 0x15, 0xfb, 0xf5, 0xbd, 0x71, 0x61, 0xeb, 0x7b, +
[PATCH 10/14] Some final changes
- added a comment at the top of crc32.c Signed-off-by: Bob Pearson rpear...@systemfabricworks.com --- lib/crc32.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 2c8e8c0..d56516d 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -1,4 +1,8 @@ /* + * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin + * cleaned up code to current version of sparse and added the slicing-by-8 + * algorithm to the closely similar existing slicing-by-4 algorithm. + * * Oct 15, 2000 Matt Domsch matt_dom...@dell.com * Nicer crc32 functions/docs submitted by li...@horizon.com. Thanks! * Code was from the public domain, copyright abandoned. Code was -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 13/14] crc32: Add self-test code for crc32c
Add self-test code for crc32c. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 363 ++- 1 files changed, 261 insertions(+), 102 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 8df9561..382fa76 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -765,113 +765,265 @@ static struct crc_test { u32 length; /* random 11 bit length of test */ u32 crc_le; /* expected crc32_le result */ u32 crc_be; /* expected crc32_be result */ + u32 crc32c_le; /* expected crc32c_le result */ } test[] = { - {0x674bf11d, 0x0038, 0x0542, 0x0af6d466, 0xd8b6e4c1}, - {0x35c672c6, 0x003a, 0x01aa, 0xc6d3dfba, 0x28aaf3ad}, - {0x496da28e, 0x0039, 0x05af, 0xd933660f, 0x5d57e81f}, - {0x09a9b90e, 0x0027, 0x01f8, 0xb45fe007, 0xf45fca9a}, - {0xdc97e5a9, 0x0025, 0x03b6, 0xf81a3562, 0xe0126ba2}, - {0x47c58900, 0x000a, 0x00b9, 0x8e58eccf, 0xf3afc793}, - {0x292561e8, 0x000c, 0x0403, 0xa2ba8aaf, 0x0b797aed}, - {0x415037f6, 0x0003, 0x0676, 0xa17d52e8, 0x7f0fdf35}, - {0x3466e707, 0x0026, 0x0042, 0x258319be, 0x75c484a2}, - {0xafd1281b, 0x0023, 0x02ee, 0x4428eaf8, 0x06c7ad10}, - {0xd3857b18, 0x0028, 0x04a2, 0x5c430821, 0xb062b7cb}, - {0x1d825a8f, 0x002b, 0x050b, 0xd2c45f0c, 0xd68634e0}, - {0x5033e3bc, 0x000b, 0x0078, 0xa3ea4113, 0xac6d31fb}, - {0x94f1fb5e, 0x000f, 0x03a2, 0xfbfc50b1, 0x3cfe50ed}, - {0xc9a0fe14, 0x0009, 0x0473, 0x5fb61894, 0x87070591}, - {0x88a034b1, 0x001c, 0x05ad, 0xc1b16053, 0x46f95c67}, - {0xf0f72239, 0x0020, 0x026d, 0xa6fa58f3, 0xf8c2c1dd}, - {0xcc20a5e3, 0x003b, 0x067a, 0x7740185a, 0x308b979a}, - {0xce589c95, 0x002b, 0x0641, 0xd055e987, 0x40aae25b}, - {0x78edc885, 0x0035, 0x05be, 0xa39cb14b, 0x035b0d1f}, - {0x9d40a377, 0x003b, 0x0038, 0x1f47ccd2, 0x197fbc9d}, - {0x703d0e01, 0x003c, 0x06f1, 0x88735e7c, 0xfed57c5a}, - {0x776bf505, 0x000f, 0x05b2, 0x5cc4fc01, 0xf32efb97}, - {0x4a3e7854, 0x0027, 0x04b8, 0x8d923c82, 0x0cbfb4a2}, - {0x209172dd, 0x003b, 0x0356, 0xb89e9c2b, 0xd7868138}, - {0x3ba4cc5b, 0x002f, 0x0203, 0xe51601a9, 0x5b2a1032}, - {0xfc62f297, 0x, 0x0079, 
0x71a8e1a2, 0x5d88685f}, - {0x64280b8b, 0x0016, 0x07ab, 0x0fa7a30c, 0xda3a455f}, - {0x97dd724b, 0x0033, 0x07ad, 0x5788b2f4, 0xd7326d32}, - {0x61394b52, 0x0035, 0x0571, 0xc66525f1, 0xcabe7fef}, - {0x29b4faff, 0x0024, 0x006e, 0xca13751e, 0x993648e0}, - {0x29bfb1dc, 0x000b, 0x0244, 0x436c43f7, 0x429f7a59}, - {0x86ae934b, 0x0035, 0x0104, 0x0760ec93, 0x9cf7d0f4}, - {0xc4c1024e, 0x002e, 0x06b1, 0x6516a3ec, 0x19321f9c}, - {0x3287a80a, 0x0026, 0x0496, 0x0b257eb1, 0x754ebd51}, - {0xa4db423e, 0x0023, 0x045d, 0x9b3a66dc, 0x873e9f11}, - {0x7a1078df, 0x0015, 0x014a, 0x8c2484c5, 0x6a628659}, - {0x6048bd5b, 0x0006, 0x006a, 0x897e3559, 0xac9961af}, - {0xd8f9ea20, 0x003d, 0x0277, 0x60eb905b, 0xed2aaf99}, - {0xea5ec3b4, 0x002a, 0x04fe, 0x869965dc, 0x6c1f833b}, - {0x2dfb005d, 0x0016, 0x0345, 0x6a3b117e, 0xf05e8521}, - {0x5a214ade, 0x0020, 0x05b6, 0x467f70be, 0xcb22ccd3}, - {0xf0ab9cca, 0x0032, 0x0515, 0xed223df3, 0x7f3ef01d}, - {0x91b444f9, 0x002e, 0x07f8, 0x84e9a983, 0x5676756f}, - {0x1b5d2ddb, 0x002e, 0x012c, 0xba638c4c, 0x3f42047b}, - {0xd824d1bb, 0x003a, 0x07b5, 0x6288653b, 0x3a3ebea0}, - {0x0470180c, 0x0034, 0x01f0, 0x9d5b80d6, 0x3de08195}, - {0xffaa3a3f, 0x0036, 0x0299, 0xf3a82ab8, 0x53e0c13d}, - {0x6406cfeb, 0x0023, 0x0600, 0xa920b8e8, 0xe4e2acf4}, - {0xb24aaa38, 0x003e, 0x04a1, 0x657cc328, 0x5077b2c3}, - {0x58b2ab7c, 0x0039, 0x02b4, 0x3a17ee7e, 0x9dcb3643}, - {0x3db85970, 0x0006, 0x02b6, 0x95268b59, 0xb9812c10}, - {0x857830c5, 0x0003, 0x0590, 0x4ef439d5, 0xf042161d}, - {0xe1fcd978, 0x003e, 0x07d8, 0xae8d8699, 0xce0a1ef5}, - {0xb982a768, 0x0016, 0x06e0, 0x62fad3df, 0x5f8a067b}, - {0x1d581ce8, 0x001e, 0x058b, 0xf0f5da53, 0x26e39eee}, - {0x2456719b, 0x0025, 0x0503, 0x4296ac64, 0xd50e4c14}, - {0xfae6d8f2, 0x, 0x055d, 0x057fdf2e, 0x2a31391a}, - {0xcba828e3, 0x0039, 0x02ce, 0xe3f22351, 0x8f00877b}, - {0x13d25952, 0x000a, 0x072d, 0x76d4b4cc, 0x5eb67ec3}, - {0x0342be3f, 0x0015, 0x0599, 0xec75d9f1, 0x9d4d2826}, - {0xeaa344e0, 0x0014, 0x04d8, 0x72a4c981, 0x2064ea06}, - {0xbbb52021, 
0x003b, 0x0272
[PATCH 06/14] crc32.c in its original version freely mixed u32, __le32 and __be32 types
which caused warnings from sparse with __CHECK_ENDIAN__. This patch fixes these by forcing the types to u32. Signed-off-by: Bob Pearson rpear...@systemfabricworks.com --- lib/crc32.c | 12 ++-- 1 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 2a87ea2..ff6bb9a 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -28,13 +28,13 @@ #include crc32defs.h #if CRC_LE_BITS == 8 -# define tole(x) __constant_cpu_to_le32(x) +# define tole(x) (__force u32) __constant_cpu_to_le32(x) #else # define tole(x) (x) #endif #if CRC_BE_BITS == 8 -# define tobe(x) __constant_cpu_to_be32(x) +# define tobe(x) (__force u32) __constant_cpu_to_be32(x) #else # define tobe(x) (x) #endif @@ -128,9 +128,9 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) # elif CRC_LE_BITS == 8 const u32 (*tab)[] = crc32table_le; - crc = __cpu_to_le32(crc); + crc = (__force u32) __cpu_to_le32(crc); crc = crc32_body(crc, p, len, tab); - crc = __le32_to_cpu(crc); + crc = __le32_to_cpu((__force __le32)crc); #endif return crc; } @@ -171,9 +171,9 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) # elif CRC_BE_BITS == 8 const u32 (*tab)[] = crc32table_be; - crc = __cpu_to_be32(crc); + crc = (__force u32) __cpu_to_be32(crc); crc = crc32_body(crc, p, len, tab); - crc = __be32_to_cpu(crc); + crc = __be32_to_cpu((__force __be32)crc); # endif return crc; } -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 12/14] crypto: crc32c should use library implementation
Since lib/crc32.c now provides crc32c, remove the software implementation here and call the library function instead. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- crypto/Kconfig |1 + crypto/crc32c.c | 94 ++- 2 files changed, 4 insertions(+), 91 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index 527a857..4c9e93a 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -310,6 +310,7 @@ comment Digest config CRYPTO_CRC32C tristate CRC32c CRC algorithm select CRYPTO_HASH + select CRC32 help Castagnoli, et al Cyclic Redundancy-Check Algorithm. Used by iSCSI for header and data digests and by others. diff --git a/crypto/crc32c.c b/crypto/crc32c.c index 3f9ad28..06f7018 100644 --- a/crypto/crc32c.c +++ b/crypto/crc32c.c @@ -40,6 +40,7 @@ #include linux/module.h #include linux/string.h #include linux/kernel.h +#include linux/crc32.h #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -53,95 +54,6 @@ struct chksum_desc_ctx { }; /* - * This is the CRC-32C table - * Generated with: - * width = 32 bits - * poly = 0x1EDC6F41 - * reflect input bytes = true - * reflect output bytes = true - */ - -static const u32 crc32c_table[256] = { - 0xL, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, - 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, - 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, - 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, - 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, - 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, - 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L, - 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, - 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL, - 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, - 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, - 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, - 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, - 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL, - 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, - 0x7DA08661L, 0x8FCB0562L, 
0x9C9BF696L, 0x6EF07595L, - 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, - 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L, - 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, - 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, - 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, - 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, - 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L, - 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, - 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L, - 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, - 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, - 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L, - 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, - 0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L, - 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, - 0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L, - 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, - 0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L, - 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L, - 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL, - 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L, - 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, - 0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL, - 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, - 0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L, - 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL, - 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL, - 0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L, - 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL, - 0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L, - 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, - 0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, - 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L, - 0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL, - 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL, - 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L, - 0xD3D3E1ABL, 0x21B862A8L, 
0x32E8915CL, 0xC083125FL, - 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L, - 0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L, - 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL, - 0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, - 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L, - 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L, - 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL, - 0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L, - 0x34F4F86AL, 0xC69F7B69L
[PATCH 11/14] crc32: Bolt on crc32c
Reuse the existing crc32 code to stamp out a crc32c implementation. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- include/linux/crc32.h |2 ++ lib/Kconfig |8 +++--- lib/crc32.c | 62 +++-- lib/crc32defs.h |7 ++ lib/gen_crc32table.c | 35 ++-- 5 files changed, 80 insertions(+), 34 deletions(-) diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 391a259..68267b6 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -11,6 +11,8 @@ extern u32 crc32_le(u32 crc, unsigned char const *p, size_t len); extern u32 crc32_be(u32 crc, unsigned char const *p, size_t len); +extern u32 __crc32c_le(u32 crc, unsigned char const *p, size_t len); + #define crc32(seed, data, length) crc32_le(seed, (unsigned char const *)(data), length) /* diff --git a/lib/Kconfig b/lib/Kconfig index 2bc5834..cfddafc 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -51,14 +51,14 @@ config CRC_ITU_T functions require M here. config CRC32 - tristate CRC32 functions + tristate CRC32/CRC32c functions default y select BITREVERSE help This option is provided for the case where no in-kernel-tree - modules require CRC32 functions, but a module built outside the - kernel tree does. Such modules that use library CRC32 functions - require M here. + modules require CRC32/CRC32c functions, but a module built outside + the kernel tree does. Such modules that use library CRC32/CRC32c + functions require M here. 
config CRC32_SELFTEST bool CRC32 perform self test on init diff --git a/lib/crc32.c b/lib/crc32.c index d56516d..8df9561 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -46,7 +46,7 @@ #include crc32table.h MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com); -MODULE_DESCRIPTION(Ethernet CRC32 calculations); +MODULE_DESCRIPTION(Various CRC32 calculations); MODULE_LICENSE(GPL); #if CRC_LE_BITS 8 || CRC_BE_BITS 8 @@ -135,46 +135,57 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) * @p: pointer to buffer over which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, + size_t len, const u32 (*tab)[256], + u32 polynomial) { #if CRC_LE_BITS == 1 int i; while (len--) { crc ^= *p++; for (i = 0; i 8; i++) - crc = (crc 1) ^ ((crc 1) ? CRCPOLY_LE : 0); + crc = (crc 1) ^ ((crc 1) ? polynomial : 0); } # elif CRC_LE_BITS == 2 while (len--) { crc ^= *p++; - crc = (crc 2) ^ crc32table_le[0][crc 3]; - crc = (crc 2) ^ crc32table_le[0][crc 3]; - crc = (crc 2) ^ crc32table_le[0][crc 3]; - crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; } # elif CRC_LE_BITS == 4 while (len--) { crc ^= *p++; - crc = (crc 4) ^ crc32table_le[0][crc 15]; - crc = (crc 4) ^ crc32table_le[0][crc 15]; + crc = (crc 4) ^ tab[0][crc 15]; + crc = (crc 4) ^ tab[0][crc 15]; } # elif CRC_LE_BITS == 8 /* aka Sarwate algorithm */ while (len--) { crc ^= *p++; - crc = (crc 8) ^ crc32table_le[0][crc 255]; + crc = (crc 8) ^ tab[0][crc 255]; } # else - const u32 (*tab)[] = crc32table_le; - crc = (__force u32) __cpu_to_le32(crc); crc = crc32_body(crc, p, len, tab); crc = __le32_to_cpu((__force __le32)crc); #endif return crc; } + +u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, 
crc32table_le, CRCPOLY_LE); +} EXPORT_SYMBOL(crc32_le); +u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE); +} +EXPORT_SYMBOL(__crc32c_le); + /** * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for @@ -182,7 +193,9 @@ EXPORT_SYMBOL(crc32_le); * @p: pointer to buffer over which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) +static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, + size_t len, const u32 (*tab)[256
[PATCH 08/14] add slicing-by-8 algorithm to the existing
slicing-by-4 algorithm. This consists of: - extend largest BITS size from 32 to 64 - extend tables from tab[4][256] to up to tab[8][256] - Add code for inner loop. Signed-off-by: Bob Pearson rpear...@systemfabricworks.com --- lib/crc32.c | 40 lib/crc32defs.h | 29 + lib/gen_crc32table.c | 43 +++ 3 files changed, 76 insertions(+), 36 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 157b35f..6311712 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -47,25 +47,28 @@ MODULE_LICENSE(GPL); #if CRC_LE_BITS 8 || CRC_BE_BITS 8 +/* implements slicing-by-4 or slicing-by-8 algorithm */ static inline u32 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) { # ifdef __LITTLE_ENDIAN # define DO_CRC(x) (crc = t0[(crc ^ (x)) 255] ^ (crc 8)) -# define DO_CRC4 crc = t3[(crc) 255] ^ \ - t2[(crc 8) 255] ^ \ - t1[(crc 16) 255] ^ \ - t0[(crc 24) 255] +# define DO_CRC4 (t3[(q) 255] ^ t2[(q 8) 255] ^ \ + t1[(q 16) 255] ^ t0[(q 24) 255]) +# define DO_CRC8 (t7[(q) 255] ^ t6[(q 8) 255] ^ \ + t5[(q 16) 255] ^ t4[(q 24) 255]) # else # define DO_CRC(x) (crc = t0[((crc 24) ^ (x)) 255] ^ (crc 8)) -# define DO_CRC4 crc = t0[(crc) 255] ^ \ - t1[(crc 8) 255] ^ \ - t2[(crc 16) 255] ^ \ - t3[(crc 24) 255] +# define DO_CRC4 (t0[(q) 255] ^ t1[(q 8) 255] ^ \ + t2[(q 16) 255] ^ t3[(q 24) 255]) +# define DO_CRC8 (t4[(q) 255] ^ t5[(q 8) 255] ^ \ + t6[(q 16) 255] ^ t7[(q 24) 255]) # endif const u32 *b; - size_trem_len; + size_t rem_len; const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3]; + const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7]; + u32 q; /* Align it */ if (unlikely((long)buf 3 len)) { @@ -73,13 +76,25 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) DO_CRC(*buf++); } while ((--len) ((long)buf)3); } + +# if CRC_LE_BITS == 32 rem_len = len 3; - /* load data 32 bits wide, xor data 32 bits wide. 
*/ len = len 2; +# else + rem_len = len 7; + len = len 3; +# endif + b = (const u32 *)buf; for (--b; len; --len) { - crc ^= *++b; /* use pre increment for speed */ - DO_CRC4; + q = crc ^ *++b; /* use pre increment for speed */ +# if CRC_LE_BITS == 32 + crc = DO_CRC4; +# else + crc = DO_CRC8; + q = *++b; + crc ^= DO_CRC4; +# endif } len = rem_len; /* And the last few bytes */ @@ -92,6 +107,7 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) return crc; #undef DO_CRC #undef DO_CRC4 +#undef DO_CRC8 } #endif diff --git a/lib/crc32defs.h b/lib/crc32defs.h index daa3a5e..8181592 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -6,29 +6,42 @@ #define CRCPOLY_LE 0xedb88320 #define CRCPOLY_BE 0x04c11db7 -/* How many bits at a time to use. Valid values are 1, 2, 4, 8, and 32. */ -/* For less performance-sensitive, use 4 or 8 */ +/* + * How many bits at a time to use. Valid values are 1, 2, 4, 8, 32 and 64. + * For less performance-sensitive, use 4 or 8 to save table size. + * For larger systems choose same as CPU architecture as default. + * This works well on X86_64, SPARC64 systems. This may require some + * elaboration after experiments with other architectures. + */ #ifndef CRC_LE_BITS -# define CRC_LE_BITS 32 +# ifdef CONFIG_64BIT +# define CRC_LE_BITS 64 +# else +# define CRC_LE_BITS 32 +# endif #endif #ifndef CRC_BE_BITS -# define CRC_BE_BITS 32 +# ifdef CONFIG_64BIT +# define CRC_BE_BITS 64 +# else +# define CRC_BE_BITS 32 +# endif #endif /* * Little-endian CRC computation. Used with serial bit streams sent * lsbit-first. Be sure to use cpu_to_le32() to append the computed CRC. */ -#if CRC_LE_BITS 32 || CRC_LE_BITS 1 || CRC_LE_BITS == 16 || \ +#if CRC_LE_BITS 64 || CRC_LE_BITS 1 || CRC_LE_BITS == 16 || \ CRC_LE_BITS CRC_LE_BITS-1 -# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32} +# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32, 64} #endif /* * Big-endian CRC computation. Used with serial bit streams sent * msbit-first. 
Be sure to use cpu_to_be32() to append the computed CRC. */ -#if CRC_BE_BITS 32 || CRC_BE_BITS 1 || CRC_BE_BITS == 16 || \ +#if CRC_BE_BITS 64 || CRC_BE_BITS 1 || CRC_BE_BITS == 16 || \ CRC_BE_BITS CRC_BE_BITS-1 -# error CRC_BE_BITS must be one of {1, 2, 4, 8, 32} +# error CRC_BE_BITS must be one of {1, 2, 4,
[PATCH 09/14] Add two changes that improve the performance of x86 systems
1. replace main loop with incrementing counter this change improves the performance of the selftest by about 5-6% on Nehalem CPUs. The apparent reason is that the compiler can use the loop index to perform an indexed memory access. This is reported to make the performance of PowerPC CPUs worse. 2. replace the rem_len loop with incrementing counter this change improves the performance of the selftest, which has more than the usual number of occurrences, by about 1-2% on x86 CPUs. In actual work loads the length is most often a multiple of 4 bytes and this code does not get executed as often if at all. Again this change is reported to make the performance of PowerPC worse. Signed-off-by: Bob Pearson rpear...@systemfabricworks.com --- lib/crc32.c | 13 + 1 files changed, 13 insertions(+), 0 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 6311712..2c8e8c0 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -66,6 +66,9 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) # endif const u32 *b; size_t rem_len; +# ifdef CONFIG_X86 + size_t i; +# endif const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3]; const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7]; u32 q; @@ -86,7 +89,12 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) # endif b = (const u32 *)buf; +# ifdef CONFIG_X86 + --b; + for (i = 0; i len; i++) { +# else for (--b; len; --len) { +# endif q = crc ^ *++b; /* use pre increment for speed */ # if CRC_LE_BITS == 32 crc = DO_CRC4; @@ -100,9 +108,14 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) /* And the last few bytes */ if (len) { u8 *p = (u8 *)(b + 1) - 1; +# ifdef CONFIG_X86 + for (i = 0; i len; i++) + DO_CRC(*++p); /* use pre increment for speed */ +# else do { DO_CRC(*++p); /* use pre increment for speed */ } while (--len); +# endif } return crc; #undef DO_CRC -- To unsubscribe from this list: send the line unsubscribe 
linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5.1 00/14] crc32c: Add faster algorithm and self-test code
Hi all, This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a software crc32c implementation. It removes the crc32c implementation in crypto/ in favor of using the stamped-out one in lib/. There is also a change to Kconfig so that the kernel builder can pick an implementation best suited for the hardware. The motivation for this patchset is that I am working on adding full metadata checksumming to ext4. As far as performance impact of adding checksumming goes, I see nearly no change with a standard mail server ffsb simulation. On a test that involves only file creation and deletion and extent tree writes, I see a drop of about 50 percent with the current kernel crc32c implementation; this improves to a drop of about 20 percent with the enclosed crc32c code. In typical workloads, this new implementation doesn't help much because metadata is usually a small fraction of total IO. However, when we are doing IO that is almost all metadata (such as rm -rf'ing a tree), then this patch speeds up the operation substantially. Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this patchset should improve their speed as well. I have not yet quantified that, however. This latest submission combines Bob's patches from late August 2011 with mine so that they can be one coherent patch set. Please excuse my inability to combine some of the patches; I've been advised to leave Bob's patches alone and build atop them instead. :/ Since the last posting, I've also collected some crc32c test results on a bunch of different x86/powerpc/sparc platforms. The results can be viewed here: http://goo.gl/sgt3i ; the crc32-kern-le and crc32c columns describe the performance of the kernel's current crc32 and crc32c software implementations. The crc32c-by8-le column shows crc32c performance with this patchset applied. I expect crc32 performance to be roughly the same. 
The two _boost columns at the right side of the spreadsheet show how much faster the new implementation is than the old one. As you can see, crc32 rises substantially, and crc32c experiences a huge increase. I'm hoping this patch set meets with everyone's approval and can go in soon. Herbert Xu didn't appear to have any strong objections to last month's posting, so I'm wondering if Andrew has an opinion? --D -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/14] removed two instances of trailing whitespaces
- remove trailing whitespace from lib/crc32.c - remove trailing whitespace from lib/crc32defs.h Signed-off-by: Bob Pearson rpear...@systemfabricworks.com --- lib/crc32.c |2 +- lib/crc32defs.h |2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index a6e633a..23b08ba 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -316,7 +316,7 @@ EXPORT_SYMBOL(crc32_be); * in the correct multiple to subtract, we can shift a byte at a time. * This produces a 40-bit (rather than a 33-bit) intermediate remainder, * but again the multiple of the polynomial to subtract depends only on - * the high bits, the high 8 bits in this case. + * the high bits, the high 8 bits in this case. * * The multiple we need in that case is the low 32 bits of a 40-bit * value whose high 8 bits are given, and which is a multiple of the diff --git a/lib/crc32defs.h b/lib/crc32defs.h index 9b6773d..f5a5401 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -8,7 +8,7 @@ /* How many bits at a time to use. Requires a table of 4CRC_xx_BITS bytes. */ /* For less performance-sensitive, use 4 */ -#ifndef CRC_LE_BITS +#ifndef CRC_LE_BITS # define CRC_LE_BITS 8 #endif #ifndef CRC_BE_BITS -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 07/14] crc32.c provides a choice of one of several algorithms for
computing the LSB and MSB versions of the CRC32 checksum based on the parameters CRC_LE_BITS and CRC_BE_BITS. In the original version the values 1, 2, 4 and 8 respectively selected versions of the algorithm that computed the crc 1, 2, 4 and 32 bits at a time. This patch series adds a new version that computes the CRC 64 bits at a time. To make things easier to understand the parameter has been reinterpreted to actually stand for the number of bits processed in each step of the algorithm so that the old value 8 has been replaced with the value 32. This also allows us to add in a widely used crc algorithm that computes the crc 8 bits at a time called the Sarwate algorithm. Signed-off-by: Bob Pearson rpear...@systemfabricworks.com --- lib/crc32.c | 17 ++--- lib/crc32defs.h | 18 ++ lib/gen_crc32table.c | 11 ++- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index ff6bb9a..157b35f 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -27,13 +27,13 @@ #include linux/types.h #include crc32defs.h -#if CRC_LE_BITS == 8 +#if CRC_LE_BITS 8 # define tole(x) (__force u32) __constant_cpu_to_le32(x) #else # define tole(x) (x) #endif -#if CRC_BE_BITS == 8 +#if CRC_BE_BITS 8 # define tobe(x) (__force u32) __constant_cpu_to_be32(x) #else # define tobe(x) (x) @@ -45,7 +45,7 @@ MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com); MODULE_DESCRIPTION(Ethernet CRC32 calculations); MODULE_LICENSE(GPL); -#if CRC_LE_BITS == 8 || CRC_BE_BITS == 8 +#if CRC_LE_BITS 8 || CRC_BE_BITS 8 static inline u32 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) @@ -126,6 +126,12 @@ u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) crc = (crc 4) ^ crc32table_le[0][crc 15]; } # elif CRC_LE_BITS == 8 + /* aka Sarwate algorithm */ + while (len--) { + crc ^= *p++; + crc = (crc 8) ^ crc32table_le[0][crc 255]; + } +# else const u32 (*tab)[] = crc32table_le; crc = (__force u32) __cpu_to_le32(crc); @@ -169,6 +175,11 @@ u32 __pure 
crc32_be(u32 crc, unsigned char const *p, size_t len) crc = (crc 4) ^ crc32table_be[0][crc 28]; } # elif CRC_BE_BITS == 8 + while (len--) { + crc ^= *p++ 24; + crc = (crc 8) ^ crc32table_be[0][crc 24]; + } +# else const u32 (*tab)[] = crc32table_be; crc = (__force u32) __cpu_to_be32(crc); diff --git a/lib/crc32defs.h b/lib/crc32defs.h index f5a5401..daa3a5e 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -6,27 +6,29 @@ #define CRCPOLY_LE 0xedb88320 #define CRCPOLY_BE 0x04c11db7 -/* How many bits at a time to use. Requires a table of 4CRC_xx_BITS bytes. */ -/* For less performance-sensitive, use 4 */ +/* How many bits at a time to use. Valid values are 1, 2, 4, 8, and 32. */ +/* For less performance-sensitive, use 4 or 8 */ #ifndef CRC_LE_BITS -# define CRC_LE_BITS 8 +# define CRC_LE_BITS 32 #endif #ifndef CRC_BE_BITS -# define CRC_BE_BITS 8 +# define CRC_BE_BITS 32 #endif /* * Little-endian CRC computation. Used with serial bit streams sent * lsbit-first. Be sure to use cpu_to_le32() to append the computed CRC. */ -#if CRC_LE_BITS 8 || CRC_LE_BITS 1 || CRC_LE_BITS CRC_LE_BITS-1 -# error CRC_LE_BITS must be a power of 2 between 1 and 8 +#if CRC_LE_BITS 32 || CRC_LE_BITS 1 || CRC_LE_BITS == 16 || \ + CRC_LE_BITS CRC_LE_BITS-1 +# error CRC_LE_BITS must be one of {1, 2, 4, 8, 32} #endif /* * Big-endian CRC computation. Used with serial bit streams sent * msbit-first. Be sure to use cpu_to_be32() to append the computed CRC. 
*/ -#if CRC_BE_BITS 8 || CRC_BE_BITS 1 || CRC_BE_BITS CRC_BE_BITS-1 -# error CRC_BE_BITS must be a power of 2 between 1 and 8 +#if CRC_BE_BITS 32 || CRC_BE_BITS 1 || CRC_BE_BITS == 16 || \ + CRC_BE_BITS CRC_BE_BITS-1 +# error CRC_BE_BITS must be one of {1, 2, 4, 8, 32} #endif diff --git a/lib/gen_crc32table.c b/lib/gen_crc32table.c index eced769..99ac744 100644 --- a/lib/gen_crc32table.c +++ b/lib/gen_crc32table.c @@ -4,8 +4,17 @@ #define ENTRIES_PER_LINE 4 +#if CRC_LE_BITS = 8 #define LE_TABLE_SIZE (1 CRC_LE_BITS) +#else +#define LE_TABLE_SIZE 256 +#endif + +#if CRC_BE_BITS = 8 #define BE_TABLE_SIZE (1 CRC_BE_BITS) +#else +#define BE_TABLE_SIZE 256 +#endif static uint32_t crc32table_le[4][256]; static uint32_t crc32table_be[4][256]; @@ -24,7 +33,7 @@ static void crc32init_le(void) crc32table_le[0][0] = 0; - for (i = 1 (CRC_LE_BITS - 1); i; i = 1) { + for (i = LE_TABLE_SIZE 1; i; i = 1) { crc = (crc 1) ^ ((crc 1) ? CRCPOLY_LE : 0); for (j = 0; j LE_TABLE_SIZE; j += 2 * i) crc32table_le[0][i + j] = crc ^ crc32table_le[0][j]; -- To unsubscribe from this list: send the line unsubscribe linux-crypto in
[PATCH 04/14] Replace 2D array references by pointer references in loops.
This change has no effect on X86 code but improves PPC performance. Signed-off-by: Bob Pearson rpear...@systemfabricworks.com --- lib/crc32.c | 21 +++-- 1 files changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 7a0e5a9..c93c9ae 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -53,20 +53,21 @@ static inline u32 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) { # ifdef __LITTLE_ENDIAN -# define DO_CRC(x) crc = tab[0][(crc ^ (x)) 255] ^ (crc 8) -# define DO_CRC4 crc = tab[3][(crc) 255] ^ \ - tab[2][(crc 8) 255] ^ \ - tab[1][(crc 16) 255] ^ \ - tab[0][(crc 24) 255] +# define DO_CRC(x) (crc = t0[(crc ^ (x)) 255] ^ (crc 8)) +# define DO_CRC4 crc = t3[(crc) 255] ^ \ + t2[(crc 8) 255] ^ \ + t1[(crc 16) 255] ^ \ + t0[(crc 24) 255] # else -# define DO_CRC(x) crc = tab[0][((crc 24) ^ (x)) 255] ^ (crc 8) -# define DO_CRC4 crc = tab[0][(crc) 255] ^ \ - tab[1][(crc 8) 255] ^ \ - tab[2][(crc 16) 255] ^ \ - tab[3][(crc 24) 255] +# define DO_CRC(x) (crc = t0[((crc 24) ^ (x)) 255] ^ (crc 8)) +# define DO_CRC4 crc = t0[(crc) 255] ^ \ + t1[(crc 8) 255] ^ \ + t2[(crc 16) 255] ^ \ + t3[(crc 24) 255] # endif const u32 *b; size_trem_len; + const u32 *t0 = tab[0], *t1 = tab[1], *t2 = tab[2], *t3 = tab[3]; /* Align it */ if (unlikely((long)buf 3 len)) { -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/4] crc32: Bolt on crc32c
On Fri, Oct 21, 2011 at 09:15:14PM +0200, Herbert Xu wrote: On Fri, Oct 21, 2011 at 09:57:03AM -0700, Darrick J. Wong wrote: My patchset builds upon Bob Pearson's crc32 patchset from early September. Do my patches fail to apply after applying his patchset? Or, to speed things along, should I simply repost both Bob's and my patches as one big series? Bob, have you sent out a new iteration of your patches since September 6th? I'm fine with you pushing this through whichever tree that Bob's patches are going through. Well... it's been 2.5 weeks since I last asked about this. No reply, afaict. I haven't seen any complaints about Bob's latest patchset, nor any complaints about my set that sits atop his. On the other hand, I'm pretty sure I haven't seen Bob's patches appear in any trees, and Google shows no recent progress. Herbert, would you object to pushing the whole patchset through the crypto tree? --D Cheers, -- Email: Herbert Xu herb...@gondor.apana.org.au Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- To unsubscribe from this list: send the line unsubscribe linux-ext4 in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/4] crc32: Bolt on crc32c
On Fri, Oct 21, 2011 at 09:15:14PM +0200, Herbert Xu wrote: On Fri, Oct 21, 2011 at 09:57:03AM -0700, Darrick J. Wong wrote: My patchset builds upon Bob Pearson's crc32 patchset from early September. Do my patches fail to apply after applying his patchset? Or, to speed things along, should I simply repost both Bob's and my patches as one big series? Bob, have you sent out a new iteration of your patches since September 6th? I'm fine with you pushing this through whichever tree that Bob's patches are going through. Bob, Which tree (if any) are your patches going through? --D -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/4] crc32: Bolt on crc32c
On Fri, Oct 21, 2011 at 02:28:03PM +0200, Herbert Xu wrote: On Tue, Oct 04, 2011 at 04:54:03PM -0700, Darrick J. Wong wrote: Reuse the existing crc32 code to stamp out a crc32c implementation. Signed-off-by: Darrick J. Wong djw...@us.ibm.com Did you want this to go through my tree? If so then there is a problem since it doesn't apply at all. My patchset builds upon Bob Pearson's crc32 patchset from early September. Do my patches fail to apply after applying his patchset? Or, to speed things along, should I simply repost both Bob's and my patches as one big series? Bob, have you sent out a new iteration of your patches since September 6th? --D Cheers, -- Email: Herbert Xu herb...@gondor.apana.org.au Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- To unsubscribe from this list: send the line unsubscribe linux-ext4 in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v5.1 4/4] crc32: Select an algorithm via kconfig
Oops, the description of CRC32_SLICEBY4 is a bit screwy. Let's try that again. --- Allow the kernel builder to choose a crc32* algorithm for the kernel. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/Kconfig | 36 lib/crc32defs.h | 18 ++ 2 files changed, 54 insertions(+), 0 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index 477be04..27881d9 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -70,6 +70,42 @@ config CRC32_SELFTEST and crc32_be over byte strings with random alignment and length and computes the total elapsed time and number of bytes processed. +choice + prompt CRC32 implementation + depends on CRC32 + default CRC32_SLICEBY8 + +config CRC32_SLICEBY8 + bool Slice by 8 bytes + help + Calculate checksum 8 bytes at a time with a clever slicing algorithm. + This is the fastest algorithm, but comes with a 8KiB lookup table. + Most modern processors have enough cache that this shouldn't be + a problem. + + If you don't know which to choose, choose this one. + +config CRC32_SLICEBY4 + bool Slice by 4 bytes + help + Calculate checksum 4 bytes at a time with a clever slicing algorithm. + This is a bit slower than slice by 8, but has a smaller 4KiB lookup + table. + +config CRC32_SARWATE + bool Sarwate's Algorithm (one byte at a time) + help + Calculate checksum a byte at a time using Sarwate's algorithm. This + is not particularly fast, but has a small 256 byte lookup table. + +config CRC32_BIT + bool Classic Algorithm (one bit at a time) + help + Calculate checksum one bit at a time. This is VERY slow, but has + no lookup table. This is provided as a debugging option. 
+ +endchoice + config CRC7 tristate CRC7 functions help diff --git a/lib/crc32defs.h b/lib/crc32defs.h index 6fd1917..64cba2c 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -13,6 +13,24 @@ */ #define CRC32C_POLY_LE 0x82F63B78 +/* Try to choose an implementation variant via Kconfig */ +#ifdef CONFIG_CRC32_SLICEBY8 +# define CRC_LE_BITS 64 +# define CRC_BE_BITS 64 +#endif +#ifdef CONFIG_CRC32_SLICEBY4 +# define CRC_LE_BITS 32 +# define CRC_BE_BITS 32 +#endif +#ifdef CONFIG_CRC32_SARWATE +# define CRC_LE_BITS 8 +# define CRC_BE_BITS 8 +#endif +#ifdef CONFIG_CRC32_BIT +# define CRC_LE_BITS 1 +# define CRC_BE_BITS 1 +#endif + /* * How many bits at a time to use. Valid values are 1, 2, 4, 8, 32 and 64. * For less performance-sensitive, use 4 or 8 to save table size. -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v5 0/4] crc32c: Add faster algorithm and self-test code
On Tue, Oct 04, 2011 at 04:53:57PM -0700, Darrick J. Wong wrote: Hi all, This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a software crc32c implementation. It requires that all ten of his patches (at least the ones dated 31 Aug 2011) be applied. It removes the crc32c implementation in crypto/ in favor of using the stamped-out one in lib/. There is also a change to Kconfig so that the kernel builder can pick an implementation best suited for the hardware. The motivation for this patchset is that I am working on adding full metadata checksumming to ext4. As far as performance impact of adding checksumming goes, I see nearly no change with a standard mail server ffsb simulation. On a test that involves only file creation and deletion and extent tree writes, I see a drop of about 50 percent with the current kernel crc32c implementation; this improves to a drop of about 20 percent with the enclosed crc32c code. For typical workloads, this new implementation doesn't help much because metadata is usually a small fraction of total IO. However, when we are doing IO that is almost all metadata (such as rm -rf'ing a tree), then this patch speeds up the operation substantially. Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this patchset should improve their speed as well. I have not yet quantified that, however. As for Mr. Tjernlund's unresolved questions regarding the v4 patch, I have tested this new code on x64/x32/ppc32/ppc64 and it seems to work fine, both with the crc32c selftest and also on a practical level with ext4 metadata checksumming enabled. Updating to Bob's newest calculation code brings about a 10-15% speedup on the ppc64 box. I also see that slice-by-8 is about 20% faster than slice-by-4 on my ppc32 box. I did _not_ see any failures on ppc32 when running an extended ext4+checksum test suite. 
Details of the ppc32 box: root@dyn9047029101:~# cat /proc/cpuinfo processor : 0 cpu : 740/750 temperature : 45 C (uncalibrated) clock : 500.00MHz revision: 131.0 (pvr 0008 8300) bogomips: 49.86 total bogomips : 49.86 timebase: 24934966 platform: PowerMac model : PowerMac1,1 machine : PowerMac1,1 motherboard : PowerMac1,1 MacRISC Power Macintosh detected as : 66 (BlueWhite G3) pmac flags : L2 cache: 1024K unified pmac-generation : NewWorld Memory : 896 MB root@dyn9047029101:~# gcc --version gcc-4.4.real (Ubuntu 4.4.3-4ubuntu5) 4.4.3 Copyright (C) 2009 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. root@dyn9047029101:~# for i in /sys/devices/system/cpu/cpu0/cache/*/*; do echo $i $(cat $i); done /sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size 32 /sys/devices/system/cpu/cpu0/cache/index0/level 1 /sys/devices/system/cpu/cpu0/cache/index0/number_of_sets 128 /sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_map ,,,0001 /sys/devices/system/cpu/cpu0/cache/index0/size 32K /sys/devices/system/cpu/cpu0/cache/index0/type Data /sys/devices/system/cpu/cpu0/cache/index0/ways_of_associativity 8 /sys/devices/system/cpu/cpu0/cache/index1/coherency_line_size 32 /sys/devices/system/cpu/cpu0/cache/index1/level 1 /sys/devices/system/cpu/cpu0/cache/index1/number_of_sets 128 /sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_map ,,,0001 /sys/devices/system/cpu/cpu0/cache/index1/size 32K /sys/devices/system/cpu/cpu0/cache/index1/type Instruction /sys/devices/system/cpu/cpu0/cache/index1/ways_of_associativity 8 /sys/devices/system/cpu/cpu0/cache/index2/coherency_line_size 128 /sys/devices/system/cpu/cpu0/cache/index2/level 2 /sys/devices/system/cpu/cpu0/cache/index2/number_of_sets 4096 /sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_map ,,,0001 /sys/devices/system/cpu/cpu0/cache/index2/size 1024K 
/sys/devices/system/cpu/cpu0/cache/index2/type Unified /sys/devices/system/cpu/cpu0/cache/index2/ways_of_associativity 2 The ppc64 box: root@elm3c7:~# cat /proc/cpuinfo processor : 0 cpu : POWER5+ (gs) clock : 1900.098000MHz revision: 2.0 (pvr 003b 0200) (the rest is omitted for brevity) root@elm3c7:~# gcc --version gcc-4.4.real (Ubuntu 4.4.3-4ubuntu5) 4.4.3 Copyright (C) 2009 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. root@elm3c7:~# for i in /sys/devices/system/cpu/cpu0/cache/*/*; do echo $i $(cat $i); done /sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size 128 /sys/devices/system/cpu/cpu0/cache/index0/level 1 /sys/devices/system/cpu/cpu0/cache/index0/number_of_sets 64 /sys
[PATCH v5 0/4] crc32c: Add faster algorithm and self-test code
Hi all, This patchset (re)uses Bob Pearson's crc32 slice-by-8 code to stamp out a software crc32c implementation. It requires that all ten of his patches (at least the ones dated 31 Aug 2011) be applied. It removes the crc32c implementation in crypto/ in favor of using the stamped-out one in lib/. There is also a change to Kconfig so that the kernel builder can pick an implementation best suited for the hardware. The motivation for this patchset is that I am working on adding full metadata checksumming to ext4. As far as performance impact of adding checksumming goes, I see nearly no change with a standard mail server ffsb simulation. On a test that involves only file creation and deletion and extent tree writes, I see a drop of about 50 percent with the current kernel crc32c implementation; this improves to a drop of about 20 percent with the enclosed crc32c code. For typical workloads, this new implementation doesn't help much because metadata is usually a small fraction of total IO. However, when we are doing IO that is almost all metadata (such as rm -rf'ing a tree), then this patch speeds up the operation substantially. Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this patchset should improve their speed as well. I have not yet quantified that, however. --D -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/4] crc32: Bolt on crc32c
Reuse the existing crc32 code to stamp out a crc32c implementation. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- include/linux/crc32.h |2 ++ lib/Kconfig |8 +++--- lib/crc32.c | 62 +++-- lib/crc32defs.h |7 ++ lib/gen_crc32table.c | 35 ++-- 5 files changed, 80 insertions(+), 34 deletions(-) diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 391a259..68267b6 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -11,6 +11,8 @@ extern u32 crc32_le(u32 crc, unsigned char const *p, size_t len); extern u32 crc32_be(u32 crc, unsigned char const *p, size_t len); +extern u32 __crc32c_le(u32 crc, unsigned char const *p, size_t len); + #define crc32(seed, data, length) crc32_le(seed, (unsigned char const *)(data), length) /* diff --git a/lib/Kconfig b/lib/Kconfig index 8e0bcbd..477be04 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -51,14 +51,14 @@ config CRC_ITU_T functions require M here. config CRC32 - tristate CRC32 functions + tristate CRC32/CRC32c functions default y select BITREVERSE help This option is provided for the case where no in-kernel-tree - modules require CRC32 functions, but a module built outside the - kernel tree does. Such modules that use library CRC32 functions - require M here. + modules require CRC32/CRC32c functions, but a module built outside + the kernel tree does. Such modules that use library CRC32/CRC32c + functions require M here. 
config CRC32_SELFTEST bool CRC32 perform self test on init diff --git a/lib/crc32.c b/lib/crc32.c index d56516d..8df9561 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -46,7 +46,7 @@ #include crc32table.h MODULE_AUTHOR(Matt Domsch matt_dom...@dell.com); -MODULE_DESCRIPTION(Ethernet CRC32 calculations); +MODULE_DESCRIPTION(Various CRC32 calculations); MODULE_LICENSE(GPL); #if CRC_LE_BITS 8 || CRC_BE_BITS 8 @@ -135,46 +135,57 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) * @p: pointer to buffer over which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, + size_t len, const u32 (*tab)[256], + u32 polynomial) { #if CRC_LE_BITS == 1 int i; while (len--) { crc ^= *p++; for (i = 0; i 8; i++) - crc = (crc 1) ^ ((crc 1) ? CRCPOLY_LE : 0); + crc = (crc 1) ^ ((crc 1) ? polynomial : 0); } # elif CRC_LE_BITS == 2 while (len--) { crc ^= *p++; - crc = (crc 2) ^ crc32table_le[0][crc 3]; - crc = (crc 2) ^ crc32table_le[0][crc 3]; - crc = (crc 2) ^ crc32table_le[0][crc 3]; - crc = (crc 2) ^ crc32table_le[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; + crc = (crc 2) ^ tab[0][crc 3]; } # elif CRC_LE_BITS == 4 while (len--) { crc ^= *p++; - crc = (crc 4) ^ crc32table_le[0][crc 15]; - crc = (crc 4) ^ crc32table_le[0][crc 15]; + crc = (crc 4) ^ tab[0][crc 15]; + crc = (crc 4) ^ tab[0][crc 15]; } # elif CRC_LE_BITS == 8 /* aka Sarwate algorithm */ while (len--) { crc ^= *p++; - crc = (crc 8) ^ crc32table_le[0][crc 255]; + crc = (crc 8) ^ tab[0][crc 255]; } # else - const u32 (*tab)[] = crc32table_le; - crc = (__force u32) __cpu_to_le32(crc); crc = crc32_body(crc, p, len, tab); crc = __le32_to_cpu((__force __le32)crc); #endif return crc; } + +u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, 
crc32table_le, CRCPOLY_LE); +} EXPORT_SYMBOL(crc32_le); +u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE); +} +EXPORT_SYMBOL(__crc32c_le); + /** * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for @@ -182,7 +193,9 @@ EXPORT_SYMBOL(crc32_le); * @p: pointer to buffer over which CRC is run * @len: length of buffer @p */ -u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) +static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, + size_t len, const u32 (*tab)[256
[PATCH 2/4] crypto: crc32c should use library implementation
Since lib/crc32.c now provides crc32c, remove the software implementation here and call the library function instead. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- crypto/Kconfig |1 + crypto/crc32c.c | 94 ++- 2 files changed, 4 insertions(+), 91 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index ae27b75..28fdbf6 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -302,6 +302,7 @@ comment Digest config CRYPTO_CRC32C tristate CRC32c CRC algorithm select CRYPTO_HASH + select CRC32 help Castagnoli, et al Cyclic Redundancy-Check Algorithm. Used by iSCSI for header and data digests and by others. diff --git a/crypto/crc32c.c b/crypto/crc32c.c index 3f9ad28..06f7018 100644 --- a/crypto/crc32c.c +++ b/crypto/crc32c.c @@ -40,6 +40,7 @@ #include linux/module.h #include linux/string.h #include linux/kernel.h +#include linux/crc32.h #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -53,95 +54,6 @@ struct chksum_desc_ctx { }; /* - * This is the CRC-32C table - * Generated with: - * width = 32 bits - * poly = 0x1EDC6F41 - * reflect input bytes = true - * reflect output bytes = true - */ - -static const u32 crc32c_table[256] = { - 0xL, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, - 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, - 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, - 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, - 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, - 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, - 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L, - 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, - 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL, - 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, - 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, - 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, - 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, - 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL, - 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, - 0x7DA08661L, 0x8FCB0562L, 
0x9C9BF696L, 0x6EF07595L, - 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, - 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L, - 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, - 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, - 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, - 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, - 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L, - 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, - 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L, - 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, - 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, - 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L, - 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, - 0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L, - 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, - 0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L, - 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, - 0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L, - 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L, - 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL, - 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L, - 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, - 0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL, - 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, - 0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L, - 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL, - 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL, - 0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L, - 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL, - 0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L, - 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, - 0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, - 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L, - 0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL, - 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL, - 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L, - 0xD3D3E1ABL, 0x21B862A8L, 
0x32E8915CL, 0xC083125FL, - 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L, - 0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L, - 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL, - 0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, - 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L, - 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L, - 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL, - 0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L, - 0x34F4F86AL, 0xC69F7B69L
[PATCH 3/4] crc32: Add self-test code for crc32c
Add self-test code for crc32c. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/crc32.c | 363 ++- 1 files changed, 261 insertions(+), 102 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 8df9561..382fa76 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -765,113 +765,265 @@ static struct crc_test { u32 length; /* random 11 bit length of test */ u32 crc_le; /* expected crc32_le result */ u32 crc_be; /* expected crc32_be result */ + u32 crc32c_le; /* expected crc32c_le result */ } test[] = { - {0x674bf11d, 0x0038, 0x0542, 0x0af6d466, 0xd8b6e4c1}, - {0x35c672c6, 0x003a, 0x01aa, 0xc6d3dfba, 0x28aaf3ad}, - {0x496da28e, 0x0039, 0x05af, 0xd933660f, 0x5d57e81f}, - {0x09a9b90e, 0x0027, 0x01f8, 0xb45fe007, 0xf45fca9a}, - {0xdc97e5a9, 0x0025, 0x03b6, 0xf81a3562, 0xe0126ba2}, - {0x47c58900, 0x000a, 0x00b9, 0x8e58eccf, 0xf3afc793}, - {0x292561e8, 0x000c, 0x0403, 0xa2ba8aaf, 0x0b797aed}, - {0x415037f6, 0x0003, 0x0676, 0xa17d52e8, 0x7f0fdf35}, - {0x3466e707, 0x0026, 0x0042, 0x258319be, 0x75c484a2}, - {0xafd1281b, 0x0023, 0x02ee, 0x4428eaf8, 0x06c7ad10}, - {0xd3857b18, 0x0028, 0x04a2, 0x5c430821, 0xb062b7cb}, - {0x1d825a8f, 0x002b, 0x050b, 0xd2c45f0c, 0xd68634e0}, - {0x5033e3bc, 0x000b, 0x0078, 0xa3ea4113, 0xac6d31fb}, - {0x94f1fb5e, 0x000f, 0x03a2, 0xfbfc50b1, 0x3cfe50ed}, - {0xc9a0fe14, 0x0009, 0x0473, 0x5fb61894, 0x87070591}, - {0x88a034b1, 0x001c, 0x05ad, 0xc1b16053, 0x46f95c67}, - {0xf0f72239, 0x0020, 0x026d, 0xa6fa58f3, 0xf8c2c1dd}, - {0xcc20a5e3, 0x003b, 0x067a, 0x7740185a, 0x308b979a}, - {0xce589c95, 0x002b, 0x0641, 0xd055e987, 0x40aae25b}, - {0x78edc885, 0x0035, 0x05be, 0xa39cb14b, 0x035b0d1f}, - {0x9d40a377, 0x003b, 0x0038, 0x1f47ccd2, 0x197fbc9d}, - {0x703d0e01, 0x003c, 0x06f1, 0x88735e7c, 0xfed57c5a}, - {0x776bf505, 0x000f, 0x05b2, 0x5cc4fc01, 0xf32efb97}, - {0x4a3e7854, 0x0027, 0x04b8, 0x8d923c82, 0x0cbfb4a2}, - {0x209172dd, 0x003b, 0x0356, 0xb89e9c2b, 0xd7868138}, - {0x3ba4cc5b, 0x002f, 0x0203, 0xe51601a9, 0x5b2a1032}, - {0xfc62f297, 0x, 0x0079, 
0x71a8e1a2, 0x5d88685f}, - {0x64280b8b, 0x0016, 0x07ab, 0x0fa7a30c, 0xda3a455f}, - {0x97dd724b, 0x0033, 0x07ad, 0x5788b2f4, 0xd7326d32}, - {0x61394b52, 0x0035, 0x0571, 0xc66525f1, 0xcabe7fef}, - {0x29b4faff, 0x0024, 0x006e, 0xca13751e, 0x993648e0}, - {0x29bfb1dc, 0x000b, 0x0244, 0x436c43f7, 0x429f7a59}, - {0x86ae934b, 0x0035, 0x0104, 0x0760ec93, 0x9cf7d0f4}, - {0xc4c1024e, 0x002e, 0x06b1, 0x6516a3ec, 0x19321f9c}, - {0x3287a80a, 0x0026, 0x0496, 0x0b257eb1, 0x754ebd51}, - {0xa4db423e, 0x0023, 0x045d, 0x9b3a66dc, 0x873e9f11}, - {0x7a1078df, 0x0015, 0x014a, 0x8c2484c5, 0x6a628659}, - {0x6048bd5b, 0x0006, 0x006a, 0x897e3559, 0xac9961af}, - {0xd8f9ea20, 0x003d, 0x0277, 0x60eb905b, 0xed2aaf99}, - {0xea5ec3b4, 0x002a, 0x04fe, 0x869965dc, 0x6c1f833b}, - {0x2dfb005d, 0x0016, 0x0345, 0x6a3b117e, 0xf05e8521}, - {0x5a214ade, 0x0020, 0x05b6, 0x467f70be, 0xcb22ccd3}, - {0xf0ab9cca, 0x0032, 0x0515, 0xed223df3, 0x7f3ef01d}, - {0x91b444f9, 0x002e, 0x07f8, 0x84e9a983, 0x5676756f}, - {0x1b5d2ddb, 0x002e, 0x012c, 0xba638c4c, 0x3f42047b}, - {0xd824d1bb, 0x003a, 0x07b5, 0x6288653b, 0x3a3ebea0}, - {0x0470180c, 0x0034, 0x01f0, 0x9d5b80d6, 0x3de08195}, - {0xffaa3a3f, 0x0036, 0x0299, 0xf3a82ab8, 0x53e0c13d}, - {0x6406cfeb, 0x0023, 0x0600, 0xa920b8e8, 0xe4e2acf4}, - {0xb24aaa38, 0x003e, 0x04a1, 0x657cc328, 0x5077b2c3}, - {0x58b2ab7c, 0x0039, 0x02b4, 0x3a17ee7e, 0x9dcb3643}, - {0x3db85970, 0x0006, 0x02b6, 0x95268b59, 0xb9812c10}, - {0x857830c5, 0x0003, 0x0590, 0x4ef439d5, 0xf042161d}, - {0xe1fcd978, 0x003e, 0x07d8, 0xae8d8699, 0xce0a1ef5}, - {0xb982a768, 0x0016, 0x06e0, 0x62fad3df, 0x5f8a067b}, - {0x1d581ce8, 0x001e, 0x058b, 0xf0f5da53, 0x26e39eee}, - {0x2456719b, 0x0025, 0x0503, 0x4296ac64, 0xd50e4c14}, - {0xfae6d8f2, 0x, 0x055d, 0x057fdf2e, 0x2a31391a}, - {0xcba828e3, 0x0039, 0x02ce, 0xe3f22351, 0x8f00877b}, - {0x13d25952, 0x000a, 0x072d, 0x76d4b4cc, 0x5eb67ec3}, - {0x0342be3f, 0x0015, 0x0599, 0xec75d9f1, 0x9d4d2826}, - {0xeaa344e0, 0x0014, 0x04d8, 0x72a4c981, 0x2064ea06}, - {0xbbb52021, 
0x003b, 0x0272
[PATCH 4/4] crc32: Select an algorithm via kconfig
Allow the kernel builder to choose a crc32* algorithm for the kernel. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/Kconfig | 35 +++ lib/crc32defs.h | 18 ++ 2 files changed, 53 insertions(+), 0 deletions(-) diff --git a/lib/Kconfig b/lib/Kconfig index 477be04..9f08b64 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -70,6 +70,41 @@ config CRC32_SELFTEST and crc32_be over byte strings with random alignment and length and computes the total elapsed time and number of bytes processed. +choice + prompt CRC32 implementation + depends on CRC32 + default CRC32_SLICEBY8 + +config CRC32_SLICEBY8 + bool Slice by 8 bytes + help + Calculate checksum 8 bytes at a time with a clever slicing algorithm. + This is the fastest algorithm, but comes with a 8KiB lookup table. + Most modern processors have enough cache that this shouldn't be + a problem. + + If you don't know which to choose, choose this one. + +config CRC32_SLICEBY4 + bool Slice by 4 bytes + help + Calculate checksum 8 bytes at a time with a clever slicing algorithm. + This is reasonably fast, but has a 4KiB lookup table. + +config CRC32_SARWATE + bool Sarwate's Algorithm (one byte at a time) + help + Calculate checksum a byte at a time using Sarwate's algorithm. This + is not particularly fast, but has a small 256 byte lookup table. + +config CRC32_BIT + bool Classic Algorithm (one bit at a time) + help + Calculate checksum one bit at a time. This is VERY slow, but has + no lookup table. This is provided as a debugging option. 
+ +endchoice + config CRC7 tristate CRC7 functions help diff --git a/lib/crc32defs.h b/lib/crc32defs.h index 6fd1917..64cba2c 100644 --- a/lib/crc32defs.h +++ b/lib/crc32defs.h @@ -13,6 +13,24 @@ */ #define CRC32C_POLY_LE 0x82F63B78 +/* Try to choose an implementation variant via Kconfig */ +#ifdef CONFIG_CRC32_SLICEBY8 +# define CRC_LE_BITS 64 +# define CRC_BE_BITS 64 +#endif +#ifdef CONFIG_CRC32_SLICEBY4 +# define CRC_LE_BITS 32 +# define CRC_BE_BITS 32 +#endif +#ifdef CONFIG_CRC32_SARWATE +# define CRC_LE_BITS 8 +# define CRC_BE_BITS 8 +#endif +#ifdef CONFIG_CRC32_BIT +# define CRC_LE_BITS 1 +# define CRC_BE_BITS 1 +#endif + /* * How many bits at a time to use. Valid values are 1, 2, 4, 8, 32 and 64. * For less performance-sensitive, use 4 or 8 to save table size. -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4] crc32c: Implement CRC32c with slicing-by-8 algorithm
On Tue, Oct 04, 2011 at 07:59:53AM +0100, Herbert Xu wrote: On Mon, Oct 03, 2011 at 05:55:10PM -0700, Darrick J. Wong wrote: So what I think I'm hearing is... 1. Apply Bob's slice-by-8 algorithm patch to regular crc32. 2. Adapt crc32's build code to generate crc32c as well. 3. Remove crypto/crc32c.c's implementation and have it wrap the code generated by #2. 4. Retain the current libcrc32c. I guess if you don't configure CRYPTO and CRYPTO_CRC32C then it could also just reference the generated crc32c functions directly. Is this a satisfactory way to move forward? All good except that you don't really have to touch libcrc32c at all. Ok, let's see what you think of my v5 patchset. :) --D Cheers, -- Email: Herbert Xu herb...@gondor.apana.org.au Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- To unsubscribe from this list: send the line unsubscribe linux-ext4 in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3] crc32c: Implement CRC32c with slicing-by-8 algorithm
On Sat, Oct 01, 2011 at 03:52:00PM +0200, Joakim Tjernlund wrote: Darrick J. Wong djw...@us.ibm.com wrote on 2011/09/30 18:12:23: [putting mailing lists on cc] On Fri, Sep 30, 2011 at 08:01:36AM +0200, Joakim Tjernlund wrote: (Just happen to see this patch in the archives) - This is basically an copy of Bobs crc32 work and duplicates code, this code needs to move into /lib/crc32.c and use the existing framework. Which framework are you talking about? lib/crc32.c appears to be a simple module that exports a utility function. Do you mean that you want to merge the crc32{,c}defs.h and gen_crc32{,c}table.c code? Do you want a build script that starts with only a crc${ALG}_defs.h file and stamps out gencrc${ALG}_table.c and crc${ALG}.c boilerplate code and then builds it? I meant adding a crc32c_le in crc32.c and extend gen_table to generate the crc32c table. I really don't know; from my perspective there was a slow implementation in crypto/crc32c.c and I wanted to speed it up. crc32c seems to be in crypto/ and not lib/ so that the implementation can be replaced with a hardware accelerated version at runtime (crc32c-intel). It was a mistake to place it there IMHO. For crc32 which has no such hw replacement (as far as I know), moving it into crypto/ would incur the overhead of going through the cryptoapi for not much benefit. On the other hand it wouldn't be hard to put the crc32 code into crypto/. No, CRC is not a crypto. It is used by other subsystems like file systems that has nothing to do with crypto. Compare with the internet checksum, I think you will have a hard time moving it to crypto. Yes, crc32* are not crypto hashes; crc32c is merely using the framework. I'm not inclined to tear it out of there unless the crypto maintainers tell me to move it, which seems unlikely since Herbert made the move in the first place for reasons I noted in my other reply. - Slice by 8 is just half the speed on my ppc32 compared to slice by 4 so it can't be enabled for all archs. 
Best to start with all 64 bit archs shrug I suppose I could make CRC32C_BITS configurable. What is the hardware profile of your ppc32 processor? How much L1D/L2 cache? slice-by-8 does have a big cache footprint. On the other hand it's faster than the slice-by-4 (crc32) and Sarwate (crc32c) code in the kernel, even on old slow 32-bit x86 processors (PII, PIII, P4). It is a low end embedded 333 MHz CPU with only L1 cache. How much faster is slice by 8 than slice by 4 on these old x86 machines? How much L1 cache? Or, if you'd rather not give away specifics, has the CPU more than 8KB L1 cache? I'm willing to concede that with little cache the added memory pressure could be painful. As for the old x86 machines, please have a look at: http://djwong.org/docs/ext4_metadata_checksums.html#Benchmarking ~15% faster on a 2GHz Via C7 ~20% faster on a 2.7GHz P4 ~25% faster on a 500MHz P3 I vaguely recall it was ~20% faster on a 400MHz P2, but all the kernel.org wikis are still down. :( So I suspect the key factor here is memory hierarchy, since all of those systems have at least 16K of L1 cache. Slice by 8 might actually suck on a Pentium Pro or earlier. Unfortunately I don't have anything older than a PII... Bobs last version tested for 64/32 bits arch and selected slice by 8/slice by 4 based on that. - Last time I tested Bobs slice by 8 on ppc32 it didn't work. ... is crc32c broken *now*? It seems fine on x86/amd64/ppc64. Don't know, I haven't tested it. Don't have much time ATM and I don't want to test something I don't agree with. It seems fine on a ppc64 running in 32bit mode too. I'll go find an old ppc32 and see how it fares. I think it's a G3 500MHz. --D -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4] crc32c: Implement CRC32c with slicing-by-8 algorithm
On Sat, Oct 01, 2011 at 04:02:10PM +0200, Joakim Tjernlund wrote: Darrick J. Wong djw...@us.ibm.com wrote on 2011/09/30 21:29:56: The existing CRC32c implementation uses Sarwate's algorithm to calculate the code one byte at a time. Using a slicing-by-8 algorithm adapted from Bob Pearson, we can process buffers 8 bytes at a time, for a substantial increase in performance. The motivation for this patchset is that I am working on adding full metadata checksumming to ext4 and jbd2. As far as performance impact of adding checksumming goes, I see nearly no change with a standard mail server ffsb simulation. On a test that involves only metadata operations (file creation and deletion, and fallocate/truncate), I see a drop of about 50 percent with the current kernel crc32c implementation; this improves to a drop of about 20 percent with the enclosed crc32c code. For typical workloads, this new implementation doesn't help much because metadata is usually a small fraction of total IO. However, when we are doing IO that is almost all metadata (such as rm -rf'ing a tree), then this patch speeds up the operation substantially. Given that iscsi, sctp, and btrfs also use crc32c, this patchset should improve their speed as well. I have some preliminary results[1] that show the difference in various crc algorithms that I've come across: the crc32c-by8-le column is the new algorithm in the patch; the crc32c column is the current crc32c kernel implementation; and the crc32-kern-le column is the current crc32 kernel implementation, which is similar to the results one gets for CONFIG_CRC32C_SLICEBY4=y. As you can see, the new implementation runs at nearly 4x the speed of the current implementation; even the slimmer slice-by-4 implementation is generally 2-3x faster. However, the implementation allows the kernel builder to select from a variety of space-speed tradeoffs, should my results not hold true on a particular class of system. 
v2: Use the crypto testmgr api for self-test. v3: Get rid of the -be version, which had no users. v4: Allow kernel builder a choice of speed vs. space optimization. [1]http://djwong.org/docs/ext4_metadata_checksums.html (cached copy of the ext4 wiki) Signed-off-by: Darrick J. Wong djw...@us.ibm.com This is based on an old version of Bobs slice by 8 that has lots of duplication and is hard to maintain. Are you referring to [PATCH v6 05/10] crc32-misc-cleanup.diff from 8/31? I haven't seen that one, so I'll go comb the internet. Thank you for the pointer, I'll update my patch. Start from Bobs latest patches and add crc32c to lib/crc32.c If I did that, how should I handle patching in the hardware accelerated version on Intel systems? That switcheroo ability seems to have been Herbert Xu's motivation for moving crc32c into crypto/ in the first place: libcrc32c: Move implementation to crypto crc32c This patch swaps the role of libcrc32c and crc32c. Previously the implementation was in libcrc32c and crc32c was a wrapper. Now the code is in crc32c and libcrc32c just calls the crypto layer. The reason for the change is to tap into the algorithm selection capability of the crypto API so that optimised implementations such as the one utilising Intel's CRC32C instruction can be used where available. Also, for crc32c I think you only need slice by 4 and slice by 8 Yes. The lookup table option is only for people with extremely small systems, and the per-bit option is usable only for debugging. They could go away if anyone's really offended by them. :) --D Jocke -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4] crc32c: Implement CRC32c with slicing-by-8 algorithm
On Mon, Oct 03, 2011 at 09:35:13PM +0100, Herbert Xu wrote: On Mon, Oct 03, 2011 at 10:27:03PM +0200, Joakim Tjernlund wrote: Start from Bobs latest patches and add crc32c to lib/crc32.c If I did that, how should I handle patching in the hardware accelerated version on Intel systems? That switcheroo ability seems to have been Herbert Xu's motivation for moving crc32c into crypto/ in the first place: I don't know, I haven't looked at that problem. I suspect it moved because that was the easiest solution. Having an identical impl. of crc32(only the table values differ) in crypto compared to the one in lib is not the way forward though. You can always get crypto/crc32c.c to use call helpers from lib/crc32.c. So what I think I'm hearing is... 1. Apply Bob's slice-by-8 algorithm patch to regular crc32. 2. Adapt crc32's build code to generate crc32c as well. 3. Remove crypto/crc32c.c's implementation and have it wrap the code generated by #2. 4. Retain the current libcrc32c. I guess if you don't configure CRYPTO and CRYPTO_CRC32C then it could also just reference the generated crc32c functions directly. Is this a satisfactory way to move forward? --D -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3] crc32c: Implement CRC32c with slicing-by-8 algorithm
[putting mailing lists on cc] On Fri, Sep 30, 2011 at 08:01:36AM +0200, Joakim Tjernlund wrote: (Just happen to see this patch in the archives) - This is basically an copy of Bobs crc32 work and duplicates code, this code needs to move into /lib/crc32.c and use the existing framework. Which framework are you talking about? lib/crc32.c appears to be a simple module that exports a utility function. Do you mean that you want to merge the crc32{,c}defs.h and gen_crc32{,c}table.c code? Do you want a build script that starts with only a crc${ALG}_defs.h file and stamps out gencrc${ALG}_table.c and crc${ALG}.c boilerplate code and then builds it? I really don't know; from my perspective there was a slow implementation in crypto/crc32c.c and I wanted to speed it up. crc32c seems to be in crypto/ and not lib/ so that the implementation can be replaced with a hardware accelerated version at runtime (crc32c-intel). For crc32 which has no such hw replacement (as far as I know), moving it into crypto/ would incur the overhead of going through the cryptoapi for not much benefit. On the other hand it wouldn't be hard to put the crc32 code into crypto/. - Slice by 8 is just half the speed on my ppc32 compared to slice by 4 so it can't be enabled for all archs. Best to start with all 64 bit archs shrug I suppose I could make CRC32C_BITS configurable. What is the hardware profile of your ppc32 processor? How much L1D/L2 cache? slice-by-8 does have a big cache footprint. On the other hand it's faster than the slice-by-4 (crc32) and Sarwate (crc32c) code in the kernel, even on old slow 32-bit x86 processors (PII, PIII, P4). - Last time I tested Bobs slice by 8 on ppc32 it didn't work. ... is crc32c broken *now*? It seems fine on x86/amd64/ppc64. --D -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] libcrc32c: Expose big-endian version of crc32c
On Wed, Sep 28, 2011 at 01:53:59PM +1000, Herbert Xu wrote: On Tue, Sep 27, 2011 at 03:12:53PM -0700, Darrick J. Wong wrote: Provide a big-endian version of crc32c for modules that want it. Who is going to use this? Well, I was using it for jbd2 ... but since you ask, it seems to work just as well with crc32c-le, so I think I'll just drop the -be version. --D Thanks, -- Email: Herbert Xu herb...@gondor.apana.org.au Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- To unsubscribe from this list: send the line unsubscribe linux-ext4 in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] libcrc32c: Expose big-endian version of crc32c
On Wed, Sep 28, 2011 at 09:51:45AM -0700, Darrick J. Wong wrote: On Wed, Sep 28, 2011 at 01:53:59PM +1000, Herbert Xu wrote: On Tue, Sep 27, 2011 at 03:12:53PM -0700, Darrick J. Wong wrote: Provide a big-endian version of crc32c for modules that want it. Who is going to use this? Well, I was using it for jbd2 ... but since you ask, it seems to work just as well with crc32c-le, so I think I'll just drop the -be version. Drat, it's also missing the gen_crc32ctable program. Sorry for the noise; I'll resend it. With the -be parts stripped out I can remove all but the first patch, which cuts down the code changes considerably. --D -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3] crc32c: Implement CRC32c with slicing-by-8 algorithm
The existing CRC32c implementation uses Sarwate's algorithm to calculate the code one byte at a time. Using slicing-by-8, we can process buffers 8 bytes at a time, for a substantial increase in performance. v2: Use the crypto testmgr api for self-test. v3: Get rid of the -be version, which had no users. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- crypto/Makefile | 11 ++ crypto/crc32c.c | 305 ++ crypto/crc32c_defs.h | 26 crypto/gen_crc32ctable.c | 79 4 files changed, 340 insertions(+), 81 deletions(-) create mode 100644 crypto/crc32c_defs.h create mode 100644 crypto/gen_crc32ctable.c diff --git a/crypto/Makefile b/crypto/Makefile index ce5a813..00811ef 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -94,3 +94,14 @@ obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o # obj-$(CONFIG_XOR_BLOCKS) += xor.o obj-$(CONFIG_ASYNC_CORE) += async_tx/ + +hostprogs-y:= gen_crc32ctable +clean-files:= crc32ctable.h + +$(obj)/crc32c.o: $(obj)/crc32c_table.h + +quiet_cmd_crc32c = GEN $@ + cmd_crc32c = $ $@ + +$(obj)/crc32c_table.h: $(obj)/gen_crc32ctable + $(call cmd,crc32c) diff --git a/crypto/crc32c.c b/crypto/crc32c.c index 3f9ad28..d510ec8 100644 --- a/crypto/crc32c.c +++ b/crypto/crc32c.c @@ -33,6 +33,35 @@ * Software Foundation; either version 2 of the License, or (at your option) * any later version. * + * The current crc32c implementation is adapted from Bob Pearson's slice-by-8 + * crc32 kernel patch from mid-2011. + * + * August 26, 2011 Darrick J. Wong djwong at us.ibm.com + * Reuse Bob Pearson's slice-by-8 implementation for e2fsprogs. + * + * July 20, 2011 Bob Pearson rpearson at systemfabricworks.com + * added slice by 8 algorithm to the existing conventional and + * slice by 4 algorithms. + * + * Oct 15, 2000 Matt Domsch matt_dom...@dell.com + * Nicer crc32 functions/docs submitted by li...@horizon.com. Thanks! + * Code was from the public domain, copyright abandoned. 
Code was + * subsequently included in the kernel, thus was re-licensed under the + * GNU GPL v2. + * + * Oct 12, 2000 Matt Domsch matt_dom...@dell.com + * Same crc32 function was used in 5 other places in the kernel. + * I made one version, and deleted the others. + * There are various incantations of crc32(). Some use a seed of 0 or ~0. + * Some xor at the end with ~0. The generic crc32() function takes + * seed as an argument, and doesn't xor at the end. Then individual + * users can do whatever they need. + * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0. + * fs/jffs2 uses seed 0, doesn't xor with ~0. + * fs/partitions/efi.c uses seed ~0, xor's with ~0. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. */ #include crypto/internal/hash.h @@ -40,6 +69,7 @@ #include linux/module.h #include linux/string.h #include linux/kernel.h +#include crc32c_defs.h #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -52,92 +82,205 @@ struct chksum_desc_ctx { u32 crc; }; -/* - * This is the CRC-32C table - * Generated with: - * width = 32 bits - * poly = 0x1EDC6F41 - * reflect input bytes = true - * reflect output bytes = true - */ +#if CRC32C_BITS 8 +# define tole(x) (__force u32) __constant_cpu_to_le32(x) +#else +# define tole(x) (x) +#endif -static const u32 crc32c_table[256] = { - 0xL, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, - 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, - 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, - 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, - 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, - 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, - 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L, - 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, - 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL, - 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, - 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, - 0x6DFE410EL, 0x9F95C20DL, 
0x8CC531F9L, 0x7EAEB2FAL, - 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, - 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL, - 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, - 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L, - 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, - 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L, - 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, - 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, - 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, - 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, - 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L, - 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, - 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L
[PATCH 1/3] crc32c: Implement CRC32c with slicing-by-8 algorithm
The existing CRC32c implementation uses Sarwate's algorithm to calculate the code one byte at a time. Using slicing-by-8, we can process buffers 8 bytes at a time, for a substantial increase in performance. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- crypto/Makefile | 11 + crypto/crc32c.c | 635 ++ crypto/crc32c_defs.h | 34 +++ 3 files changed, 576 insertions(+), 104 deletions(-) create mode 100644 crypto/crc32c_defs.h diff --git a/crypto/Makefile b/crypto/Makefile index ce5a813..00811ef 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -94,3 +94,14 @@ obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o # obj-$(CONFIG_XOR_BLOCKS) += xor.o obj-$(CONFIG_ASYNC_CORE) += async_tx/ + +hostprogs-y:= gen_crc32ctable +clean-files:= crc32ctable.h + +$(obj)/crc32c.o: $(obj)/crc32c_table.h + +quiet_cmd_crc32c = GEN $@ + cmd_crc32c = $ $@ + +$(obj)/crc32c_table.h: $(obj)/gen_crc32ctable + $(call cmd,crc32c) diff --git a/crypto/crc32c.c b/crypto/crc32c.c index 3f9ad28..d18f6a1 100644 --- a/crypto/crc32c.c +++ b/crypto/crc32c.c @@ -33,6 +33,35 @@ * Software Foundation; either version 2 of the License, or (at your option) * any later version. * + * The current crc32c implementation is adapted from Bob Pearson's slice-by-8 + * crc32 kernel patch from mid-2011. + * + * August 26, 2011 Darrick J. Wong djwong at us.ibm.com + * Reuse Bob Pearson's slice-by-8 implementation for e2fsprogs. + * + * July 20, 2011 Bob Pearson rpearson at systemfabricworks.com + * added slice by 8 algorithm to the existing conventional and + * slice by 4 algorithms. + * + * Oct 15, 2000 Matt Domsch matt_dom...@dell.com + * Nicer crc32 functions/docs submitted by li...@horizon.com. Thanks! + * Code was from the public domain, copyright abandoned. Code was + * subsequently included in the kernel, thus was re-licensed under the + * GNU GPL v2. + * + * Oct 12, 2000 Matt Domsch matt_dom...@dell.com + * Same crc32 function was used in 5 other places in the kernel. 
+ * I made one version, and deleted the others. + * There are various incantations of crc32(). Some use a seed of 0 or ~0. + * Some xor at the end with ~0. The generic crc32() function takes + * seed as an argument, and doesn't xor at the end. Then individual + * users can do whatever they need. + * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0. + * fs/jffs2 uses seed 0, doesn't xor with ~0. + * fs/partitions/efi.c uses seed ~0, xor's with ~0. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. */ #include crypto/internal/hash.h @@ -40,6 +69,7 @@ #include linux/module.h #include linux/string.h #include linux/kernel.h +#include crc32c_defs.h #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -52,92 +82,398 @@ struct chksum_desc_ctx { u32 crc; }; -/* - * This is the CRC-32C table - * Generated with: - * width = 32 bits - * poly = 0x1EDC6F41 - * reflect input bytes = true - * reflect output bytes = true - */ +#if CRC_LE_BITS 8 +# define tole(x) (__force u32) __constant_cpu_to_le32(x) +#else +# define tole(x) (x) +#endif -static const u32 crc32c_table[256] = { - 0xL, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, - 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, - 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, - 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, - 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, - 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, - 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L, - 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, - 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL, - 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, - 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, - 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, - 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, - 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL, - 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, - 0x7DA08661L, 0x8FCB0562L, 
0x9C9BF696L, 0x6EF07595L, - 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, - 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L, - 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, - 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, - 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, - 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, - 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L, - 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, - 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L, - 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, - 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, - 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L, - 0x7198540DL
[PATCH 3/3] crc32c: Implement a self-test for CRC32c
This is a self-test for the CRC32c code. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- crypto/tcrypt.c |6 ++ crypto/testmgr.c | 36 +-- crypto/testmgr.h | 177 +- 3 files changed, 211 insertions(+), 8 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 617..73c10f8 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -64,7 +64,7 @@ static char *check[] = { cast6, arc4, michael_mic, deflate, crc32c, tea, xtea, khazad, wp512, wp384, wp256, tnepres, xeta, fcrypt, camellia, seed, salsa20, rmd128, rmd160, rmd256, rmd320, - lzo, cts, zlib, NULL + lzo, cts, zlib, crc32c-be, NULL }; static int test_cipher_jiffies(struct blkcipher_desc *desc, int enc, @@ -944,6 +944,10 @@ static int do_test(int m) ret += tcrypt_test(rfc4309(ccm(aes))); break; + case 46: + ret += tcrypt_test(crc32c-be); + break; + case 100: ret += tcrypt_test(hmac(md5)); break; diff --git a/crypto/testmgr.c b/crypto/testmgr.c index b6b93d4..738b79f 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1457,7 +1457,8 @@ static int alg_test_hash(const struct alg_test_desc *desc, const char *driver, } static int alg_test_crc32c(const struct alg_test_desc *desc, - const char *driver, u32 type, u32 mask) + const char *driver, u32 type, u32 mask, + int big_endian) { struct crypto_shash *tfm; u32 val; @@ -1484,7 +1485,10 @@ static int alg_test_crc32c(const struct alg_test_desc *desc, sdesc.shash.tfm = tfm; sdesc.shash.flags = 0; - *(u32 *)sdesc.ctx = le32_to_cpu(420553207); + if (big_endian) + *(u32 *)sdesc.ctx = be32_to_cpu(420553207); + else + *(u32 *)sdesc.ctx = le32_to_cpu(420553207); err = crypto_shash_final(sdesc.shash, (u8 *)val); if (err) { printk(KERN_ERR alg: crc32c: Operation failed for @@ -1505,6 +1509,18 @@ out: return err; } +static int alg_test_crc32c_be(const struct alg_test_desc *desc, + const char *driver, u32 type, u32 mask) +{ + return alg_test_crc32c(desc, driver, type, mask, 1); +} + +static int alg_test_crc32c_le(const struct alg_test_desc *desc, + const char *driver, 
u32 type, u32 mask) +{ + return alg_test_crc32c(desc, driver, type, mask, 0); +} + static int alg_test_cprng(const struct alg_test_desc *desc, const char *driver, u32 type, u32 mask) { @@ -1707,12 +1723,22 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = crc32c, - .test = alg_test_crc32c, + .test = alg_test_crc32c_le, + .fips_allowed = 1, + .suite = { + .hash = { + .vecs = crc32c_le_tv_template, + .count = CRC32C_LE_TEST_VECTORS + } + } + }, { + .alg = crc32c-be, + .test = alg_test_crc32c_be, .fips_allowed = 1, .suite = { .hash = { - .vecs = crc32c_tv_template, - .count = CRC32C_TEST_VECTORS + .vecs = crc32c_be_tv_template, + .count = CRC32C_BE_TEST_VECTORS } } }, { diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 27adc92..8223738 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -10172,9 +10172,10 @@ static struct hash_testvec michael_mic_tv_template[] = { /* * CRC32C test vectors */ -#define CRC32C_TEST_VECTORS 14 +#define CRC32C_LE_TEST_VECTORS 14 +#define CRC32C_BE_TEST_VECTORS 14 -static struct hash_testvec crc32c_tv_template[] = { +static struct hash_testvec crc32c_le_tv_template[] = { { .psize = 0, .digest = \x00\x00\x00\x00, @@ -10346,4 +10347,176 @@ static struct hash_testvec crc32c_tv_template[] = { }, }; +static struct hash_testvec crc32c_be_tv_template[] = { + { + .psize = 0, + .digest = \x00\x00\x00\x00, + }, + { + .key = \x87\xa9\xcb\xed, + .ksize = 4, + .psize = 0, + .digest = \x78\x56\x34\x12, + }, + { + .key = \xff\xff\xff\xff, + .ksize = 4, + .plaintext = \x01\x02\x03\x04\x05\x06\x07\x08 +\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10 +\x11\x12\x13\x14\x15\x16\x17\x18
[PATCH 2/3] libcrc32c: Expose big-endian version of crc32c
Provide a big-endian version of crc32c for modules that want it. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- include/linux/crc32c.h |5 +++-- lib/libcrc32c.c| 43 ++- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h index bd8b44d..33320e1 100644 --- a/include/linux/crc32c.h +++ b/include/linux/crc32c.h @@ -3,9 +3,10 @@ #include linux/types.h -extern u32 crc32c(u32 crc, const void *address, unsigned int length); +extern u32 crc32c_le(u32 crc, const void *address, unsigned int length); +extern u32 crc32c_be(u32 crc, const void *address, unsigned int length); /* This macro exists for backwards-compatibility. */ -#define crc32c_le crc32c +#define crc32c crc32c_le #endif /* _LINUX_CRC32C_H */ diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c index 244f548..e421ff5 100644 --- a/lib/libcrc32c.c +++ b/lib/libcrc32c.c @@ -37,17 +37,17 @@ #include linux/kernel.h #include linux/module.h -static struct crypto_shash *tfm; +static struct crypto_shash *tfm_le, *tfm_be; -u32 crc32c(u32 crc, const void *address, unsigned int length) +u32 crc32c_le(u32 crc, const void *address, unsigned int length) { struct { struct shash_desc shash; - char ctx[crypto_shash_descsize(tfm)]; + char ctx[crypto_shash_descsize(tfm_le)]; } desc; int err; - desc.shash.tfm = tfm; + desc.shash.tfm = tfm_le; desc.shash.flags = 0; *(u32 *)desc.ctx = crc; @@ -56,21 +56,46 @@ u32 crc32c(u32 crc, const void *address, unsigned int length) return *(u32 *)desc.ctx; } +EXPORT_SYMBOL(crc32c_le); -EXPORT_SYMBOL(crc32c); +u32 crc32c_be(u32 crc, const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(tfm_be)]; + } desc; + int err; + + desc.shash.tfm = tfm_be; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} +EXPORT_SYMBOL(crc32c_be); static int __init libcrc32c_mod_init(void) { - tfm = 
crypto_alloc_shash(crc32c, 0, 0); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); + tfm_le = crypto_alloc_shash(crc32c, 0, 0); + if (IS_ERR(tfm_le)) + return PTR_ERR(tfm_le); + + tfm_be = crypto_alloc_shash(crc32c-be, 0, 0); + if (IS_ERR(tfm_be)) { + crypto_free_shash(tfm_le); + return PTR_ERR(tfm_be); + } return 0; } static void __exit libcrc32c_mod_fini(void) { - crypto_free_shash(tfm); + crypto_free_shash(tfm_be); + crypto_free_shash(tfm_le); } module_init(libcrc32c_mod_init); -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 0/3] crc32c: Add faster algorithm and self-test code
On Tue, Sep 27, 2011 at 03:12:39PM -0700, Darrick J. Wong wrote: Hi all, This patchset replaces the current crc32c software implementation, which uses a slow per-byte lookup table algorithm, with a faster implementation that uses an adaptation of the slice-by-8 algorithm that Bob Pearson has been pushing for crc32. The motivation for this patchset is that I am working on adding full metadata checksumming to ext4[1]. As far as performance impact of adding checksumming goes, I see nearly no change with a standard mail server ffsb simulation. On a test that involves only file creation and deletion and extent tree writes, I see a drop of about 50 percent with the current kernel crc32c implementation; this improves to a drop of about 20 percent with the enclosed crc32c code. For typical workloads, this new implementation doesn't help much because metadata is usually a small fraction of total IO. However, when we are doing IO that is almost all metadata (such as rm -rf'ing a tree), then this patch speeds up the operation substantially. Please have a look at the patches, and please feel free to suggest any changes. I will be at LPC next week if anyone wishes to discuss, debate, or protest. Oops, ignore that sentence, since LPC has long passed. :( --D Incidentally, given that iscsi, sctp, and btrfs also use crc32c, this patchset should improve their speed as well. I have not yet quantified that, however. v2: Use the crypto test manager code to check crc32c operation. --D [1] https://ext4.wiki.kernel.org/index.php/Ext4_Metadata_Checksums -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] libcrc32c: Expose big-endian version of crc32c
Provide a big-endian version of crc32c for modules that want it. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- include/linux/crc32c.h |5 +++-- lib/libcrc32c.c| 43 ++- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h index bd8b44d..33320e1 100644 --- a/include/linux/crc32c.h +++ b/include/linux/crc32c.h @@ -3,9 +3,10 @@ #include linux/types.h -extern u32 crc32c(u32 crc, const void *address, unsigned int length); +extern u32 crc32c_le(u32 crc, const void *address, unsigned int length); +extern u32 crc32c_be(u32 crc, const void *address, unsigned int length); /* This macro exists for backwards-compatibility. */ -#define crc32c_le crc32c +#define crc32c crc32c_le #endif /* _LINUX_CRC32C_H */ diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c index 244f548..e421ff5 100644 --- a/lib/libcrc32c.c +++ b/lib/libcrc32c.c @@ -37,17 +37,17 @@ #include linux/kernel.h #include linux/module.h -static struct crypto_shash *tfm; +static struct crypto_shash *tfm_le, *tfm_be; -u32 crc32c(u32 crc, const void *address, unsigned int length) +u32 crc32c_le(u32 crc, const void *address, unsigned int length) { struct { struct shash_desc shash; - char ctx[crypto_shash_descsize(tfm)]; + char ctx[crypto_shash_descsize(tfm_le)]; } desc; int err; - desc.shash.tfm = tfm; + desc.shash.tfm = tfm_le; desc.shash.flags = 0; *(u32 *)desc.ctx = crc; @@ -56,21 +56,46 @@ u32 crc32c(u32 crc, const void *address, unsigned int length) return *(u32 *)desc.ctx; } +EXPORT_SYMBOL(crc32c_le); -EXPORT_SYMBOL(crc32c); +u32 crc32c_be(u32 crc, const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(tfm_be)]; + } desc; + int err; + + desc.shash.tfm = tfm_be; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} +EXPORT_SYMBOL(crc32c_be); static int __init libcrc32c_mod_init(void) { - tfm = 
crypto_alloc_shash(crc32c, 0, 0); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); + tfm_le = crypto_alloc_shash(crc32c, 0, 0); + if (IS_ERR(tfm_le)) + return PTR_ERR(tfm_le); + + tfm_be = crypto_alloc_shash(crc32c-be, 0, 0); + if (IS_ERR(tfm_be)) { + crypto_free_shash(tfm_le); + return PTR_ERR(tfm_be); + } return 0; } static void __exit libcrc32c_mod_fini(void) { - crypto_free_shash(tfm); + crypto_free_shash(tfm_be); + crypto_free_shash(tfm_le); } module_init(libcrc32c_mod_init); -- To unsubscribe from this list: send the line unsubscribe linux-crypto in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] crc32c: Implement CRC32c with slicing-by-8 algorithm
The existing CRC32c implementation uses Sarwate's algorithm to calculate the code one byte at a time. Using slicing-by-8, we can process buffers 8 bytes at a time, for a substantial increase in performance. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- crypto/Makefile | 11 + crypto/crc32c.c | 635 ++ crypto/crc32c_defs.h | 34 +++ 3 files changed, 576 insertions(+), 104 deletions(-) create mode 100644 crypto/crc32c_defs.h diff --git a/crypto/Makefile b/crypto/Makefile index ce5a813..00811ef 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -94,3 +94,14 @@ obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o # obj-$(CONFIG_XOR_BLOCKS) += xor.o obj-$(CONFIG_ASYNC_CORE) += async_tx/ + +hostprogs-y:= gen_crc32ctable +clean-files:= crc32ctable.h + +$(obj)/crc32c.o: $(obj)/crc32c_table.h + +quiet_cmd_crc32c = GEN $@ + cmd_crc32c = $ $@ + +$(obj)/crc32c_table.h: $(obj)/gen_crc32ctable + $(call cmd,crc32c) diff --git a/crypto/crc32c.c b/crypto/crc32c.c index 3f9ad28..d18f6a1 100644 --- a/crypto/crc32c.c +++ b/crypto/crc32c.c @@ -33,6 +33,35 @@ * Software Foundation; either version 2 of the License, or (at your option) * any later version. * + * The current crc32c implementation is adapted from Bob Pearson's slice-by-8 + * crc32 kernel patch from mid-2011. + * + * August 26, 2011 Darrick J. Wong djwong at us.ibm.com + * Reuse Bob Pearson's slice-by-8 implementation for e2fsprogs. + * + * July 20, 2011 Bob Pearson rpearson at systemfabricworks.com + * added slice by 8 algorithm to the existing conventional and + * slice by 4 algorithms. + * + * Oct 15, 2000 Matt Domsch matt_dom...@dell.com + * Nicer crc32 functions/docs submitted by li...@horizon.com. Thanks! + * Code was from the public domain, copyright abandoned. Code was + * subsequently included in the kernel, thus was re-licensed under the + * GNU GPL v2. + * + * Oct 12, 2000 Matt Domsch matt_dom...@dell.com + * Same crc32 function was used in 5 other places in the kernel. 
+ * I made one version, and deleted the others. + * There are various incantations of crc32(). Some use a seed of 0 or ~0. + * Some xor at the end with ~0. The generic crc32() function takes + * seed as an argument, and doesn't xor at the end. Then individual + * users can do whatever they need. + * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0. + * fs/jffs2 uses seed 0, doesn't xor with ~0. + * fs/partitions/efi.c uses seed ~0, xor's with ~0. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. */ #include crypto/internal/hash.h @@ -40,6 +69,7 @@ #include linux/module.h #include linux/string.h #include linux/kernel.h +#include crc32c_defs.h #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -52,92 +82,398 @@ struct chksum_desc_ctx { u32 crc; }; -/* - * This is the CRC-32C table - * Generated with: - * width = 32 bits - * poly = 0x1EDC6F41 - * reflect input bytes = true - * reflect output bytes = true - */ +#if CRC_LE_BITS 8 +# define tole(x) (__force u32) __constant_cpu_to_le32(x) +#else +# define tole(x) (x) +#endif -static const u32 crc32c_table[256] = { - 0xL, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, - 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, - 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, - 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, - 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, - 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, - 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L, - 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, - 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL, - 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, - 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, - 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, - 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, - 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL, - 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, - 0x7DA08661L, 0x8FCB0562L, 
0x9C9BF696L, 0x6EF07595L, - 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, - 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L, - 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, - 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, - 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, - 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, - 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L, - 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, - 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L, - 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, - 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, - 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L, - 0x7198540DL
[PATCH 3/3] crc32c: Implement a self-test for CRC32c
This is a loadable module that will self-test the CRC32c code. Signed-off-by: Darrick J. Wong djw...@us.ibm.com --- lib/Kconfig |7 + lib/Makefile |1 lib/libcrc32c_test.c | 694 ++ 3 files changed, 702 insertions(+), 0 deletions(-) create mode 100644 lib/libcrc32c_test.c diff --git a/lib/Kconfig b/lib/Kconfig index 6c695ff..2bfdde8 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -79,6 +79,13 @@ config LIBCRC32C require M here. See Castagnoli93. Module will be libcrc32c. +config LIBCRC32C_SELFTEST + tristate "CRC32c Self-Test" + depends on CRYPTO_CRC32C + help + This is a testing module that ensures that a crc32c implementation + is working correctly. + config CRC8 tristate "CRC8 function" help diff --git a/lib/Makefile b/lib/Makefile index 3f5bc6d..79ca5ed 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -61,6 +61,7 @@ obj-$(CONFIG_CRC_ITU_T) += crc-itu-t.o obj-$(CONFIG_CRC32)+= crc32.o obj-$(CONFIG_CRC7) += crc7.o obj-$(CONFIG_LIBCRC32C)+= libcrc32c.o +obj-$(CONFIG_LIBCRC32C_SELFTEST) += libcrc32c_test.o obj-$(CONFIG_CRC8) += crc8.o obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o diff --git a/lib/libcrc32c_test.c b/lib/libcrc32c_test.c new file mode 100644 index 0000000..8b5c75f --- /dev/null +++ b/lib/libcrc32c_test.c @@ -0,0 +1,694 @@ +/* + * libcrc32c_test.c: Test buffer and checksums for crc32c. 
+ */ +#include linux/module.h +#include linux/crc32c.h + +static u8 test_buf[] = { + 0xd9, 0xd7, 0x6a, 0x13, 0x3a, 0xb1, 0x05, 0x48, + 0xda, 0xad, 0x14, 0xbd, 0x03, 0x3a, 0x58, 0x5e, + 0x6e, 0xd1, 0x56, 0xc9, 0x2e, 0xc4, 0xcb, 0x6b, + 0xe8, 0x77, 0x52, 0x37, 0x4e, 0x0f, 0x55, 0xd2, + 0x12, 0x65, 0x90, 0xc2, 0x41, 0x49, 0x81, 0x01, + 0xf5, 0x01, 0xeb, 0x2d, 0x78, 0x74, 0x23, 0x5d, + 0x84, 0x5c, 0x81, 0x92, 0x21, 0xe9, 0x8d, 0x1d, + 0x89, 0xf2, 0x4a, 0xac, 0xdd, 0xf9, 0xaf, 0xee, + 0x44, 0xe7, 0x6e, 0xed, 0xfb, 0xd8, 0x89, 0x0e, + 0x96, 0x62, 0xcd, 0xa4, 0x4b, 0xa9, 0xe5, 0x45, + 0xb1, 0x29, 0x9b, 0x0f, 0xfc, 0xbd, 0x83, 0xab, + 0xa8, 0x54, 0x96, 0x44, 0x2c, 0x7f, 0xbb, 0xe7, + 0x52, 0x29, 0x08, 0xee, 0x14, 0xc5, 0xc2, 0xec, + 0x5a, 0xeb, 0x40, 0x40, 0xea, 0xd1, 0x3d, 0x15, + 0x73, 0xaa, 0x8c, 0x73, 0xfc, 0xf2, 0x2b, 0x49, + 0x0b, 0x13, 0x96, 0xd9, 0x8e, 0x4b, 0xbc, 0xe0, + 0xf4, 0xd2, 0xe0, 0x2e, 0x7a, 0xf0, 0x5d, 0x1f, + 0xd2, 0x92, 0x97, 0xe0, 0xaa, 0x59, 0xab, 0xc9, + 0x5c, 0xa6, 0x51, 0x1a, 0xe3, 0xd6, 0x06, 0xb9, + 0xae, 0xb8, 0x76, 0x36, 0x79, 0x37, 0x52, 0xf6, + 0x34, 0xaf, 0x27, 0x19, 0xe1, 0xc0, 0x2b, 0xdd, + 0x01, 0x15, 0xcd, 0xce, 0x44, 0xf6, 0x4c, 0x18, + 0x92, 0x69, 0xbe, 0x8a, 0x76, 0x23, 0x52, 0x13, + 0x3f, 0xf9, 0xe0, 0xf5, 0x06, 0x28, 0x7c, 0xc7, + 0xf3, 0x42, 0x0f, 0xdd, 0x40, 0x33, 0xf7, 0x99, + 0xe2, 0xad, 0x26, 0xd9, 0x53, 0x10, 0x72, 0x0c, + 0x4e, 0x43, 0x4c, 0x61, 0xfe, 0xd9, 0xc1, 0x16, + 0xa1, 0x93, 0xca, 0x3c, 0x75, 0x7f, 0x07, 0x7a, + 0x65, 0xb3, 0x53, 0x2a, 0x52, 0x00, 0xa0, 0x62, + 0xe0, 0xa3, 0x1f, 0xad, 0xd7, 0xbb, 0xc0, 0x83, + 0x5d, 0x54, 0x87, 0x5f, 0xc8, 0x2f, 0xc8, 0xbf, + 0x69, 0x04, 0x91, 0xc8, 0xa6, 0x1d, 0x4d, 0x46, + 0x91, 0xfc, 0x26, 0xf4, 0x16, 0xd1, 0xa4, 0xbf, + 0x5c, 0xa2, 0x6c, 0xdd, 0xb4, 0x40, 0xf2, 0x2e, + 0xa2, 0xad, 0xf7, 0xf4, 0xa5, 0x8a, 0x3e, 0x23, + 0x64, 0x08, 0xc8, 0xa1, 0xa0, 0xf0, 0x5d, 0x70, + 0xd2, 0x77, 0xfd, 0xc8, 0x50, 0x83, 0x0f, 0xd6, + 0x2b, 0xe4, 0x1f, 0x52, 0x34, 0x33, 0x68, 0xfd, + 0x92, 0xbe, 
0x9f, 0x97, 0x6b, 0x8d, 0x81, 0x91, + 0x0f, 0xef, 0x65, 0xc8, 0x0d, 0x15, 0x01, 0x77, + 0x58, 0xb2, 0xf4, 0x1b, 0x06, 0x7e, 0xf5, 0xca, + 0x15, 0x2e, 0x38, 0xd8, 0x81, 0x1c, 0x1c, 0xa0, + 0xb6, 0x13, 0x6a, 0x2b, 0x71, 0x34, 0x52, 0xd7, + 0x1d, 0xbd, 0x37, 0x59, 0xbc, 0x86, 0x25, 0x2b, + 0xa8, 0x93, 0xce, 0x1a, 0x03, 0x16, 0xfe, 0x01, + 0x57, 0x99, 0x24, 0x25, 0x2c, 0xb3, 0xab, 0x1e, + 0x2d, 0x65, 0x20, 0x89, 0x17, 0x02, 0x0e, 0x0a, + 0xf5, 0x1e, 0xc7, 0xff, 0x1f, 0x61, 0xa9, 0x54, + 0x18, 0xd4, 0xba, 0x50, 0x57, 0x02, 0xa1, 0xab, + 0x22, 0x2e, 0x07, 0xea, 0xa9, 0xa3, 0x83, 0x4f, + 0x27, 0xf5, 0xc5, 0xee, 0x3c, 0x3b, 0x10, 0xad, + 0x32, 0x2b, 0x1c, 0x03, 0xcb, 0xaf, 0x98, 0x83, + 0x54, 0xc3, 0x68, 0x63, 0xd4, 0xe0, 0x0e, 0x3c, + 0x1a, 0x4e, 0xc0, 0x81, 0xd0, 0xe8, 0x6a, 0x62, + 0x6b, 0x3e, 0x6f, 0xc4, 0xc6, 0x33, 0x4e, 0x26, + 0x21, 0xf5, 0x04, 0xdf, 0xfa, 0xce, 0x45, 0xaf, + 0xdc, 0x5e, 0x1b, 0xad, 0x93, 0xca, 0xf5, 0xcf, + 0xd7, 0xee, 0x0c, 0x5c, 0x5e, 0xb4, 0xf0, 0x92, + 0xd2, 0xf2, 0xf0, 0xa9, 0x1e, 0xab, 0x80, 0x68, + 0x46, 0xef, 0xcc, 0x26, 0x0c, 0x5c, 0xdd, 0x4e, + 0x83, 0xb8, 0xb9, 0x53, 0x6e, 0xf8, 0x93, 0x38