Re: [RESEND PATCH] crypto: Add zstd support

2018-03-30 Thread Nick Terrell
> On Mar 30, 2018, at 10:14 AM, Herbert Xu <herb...@gondor.apana.org.au> wrote:
> 
> On Thu, Mar 22, 2018 at 01:32:30PM +0900, Sergey Senozhatsky wrote:
>> On (03/21/18 15:49), Nick Terrell wrote:
>>> depends on CONFIG_CRYPTO_ZSTD, which isn't defined until this patch is in
>> 
>> Yikes! How come I missed that... :)
>> 
>>> [0] 
>>> lkml.kernel.org/r/9c9416b2dff19f05fb4c35879aaa83d11ff72c92.1521626182.git.geliangt...@gmail.com
>> 
>> Signed-off-by: Nick Terrell <terre...@fb.com> ?
> 
> Please resend with sign-off.

Sorry about that, resent as 
http://lkml.kernel.org/r/20180330191453.3535287-1-terre...@fb.com.

> 
> Thanks,
> -- 
> Email: Herbert Xu <herb...@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt



[RESEND PATCH] crypto: Add zstd support

2018-03-30 Thread Nick Terrell
Adds zstd support to crypto and scompress. Only supports the default
level.

Previously we held off on this patch, since there weren't any users.
Now zram is ready for zstd support, but depends on CONFIG_CRYPTO_ZSTD,
which isn't defined until this patch is in. I also see a patch adding
zstd to pstore [0], which depends on crypto zstd.

[0] 
lkml.kernel.org/r/9c9416b2dff19f05fb4c35879aaa83d11ff72c92.1521626182.git.geliangt...@gmail.com
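
For reference, once this lands a consumer should be able to reach the
algorithm through the legacy crypto_comp interface roughly like this (a
minimal sketch, not taken from the patch; error handling trimmed and the
caller owns the buffers):

#include <linux/crypto.h>
#include <linux/err.h>

static int zstd_comp_example(const u8 *src, unsigned int slen,
                             u8 *dst, unsigned int *dlen)
{
        struct crypto_comp *tfm;
        int err;

        /* "zstd" is the algorithm name registered by crypto/zstd.c. */
        tfm = crypto_alloc_comp("zstd", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        /* On success *dlen is updated to the compressed length. */
        err = crypto_comp_compress(tfm, src, slen, dst, dlen);

        crypto_free_comp(tfm);
        return err;
}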

Signed-off-by: Nick Terrell <terre...@fb.com>
---
 crypto/Kconfig   |   9 ++
 crypto/Makefile  |   1 +
 crypto/testmgr.c |  10 +++
 crypto/testmgr.h |  71 +++
 crypto/zstd.c| 265 +++
 5 files changed, 356 insertions(+)
 create mode 100644 crypto/zstd.c

diff --git a/crypto/Kconfig b/crypto/Kconfig
index b75264b..500583c 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1677,6 +1677,15 @@ config CRYPTO_LZ4HC
help
  This is the LZ4 high compression mode algorithm.
 
+config CRYPTO_ZSTD
+   tristate "Zstd compression algorithm"
+   select CRYPTO_ALGAPI
+   select CRYPTO_ACOMP2
+   select ZSTD_COMPRESS
+   select ZSTD_DECOMPRESS
+   help
+ This is the zstd algorithm.
+
 comment "Random Number Generation"
 
 config CRYPTO_ANSI_CPRNG
diff --git a/crypto/Makefile b/crypto/Makefile
index cdbc03b..2900ab1 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -136,6 +136,7 @@ obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o
 obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
 obj-$(CONFIG_CRYPTO_USER_API_RNG) += algif_rng.o
 obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o
+obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o
 
 ecdh_generic-y := ecc.o
 ecdh_generic-y += ecdh.o
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index d5e23a1..71b2798 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -3576,6 +3576,16 @@ static const struct alg_test_desc alg_test_descs[] = {
.decomp = __VECS(zlib_deflate_decomp_tv_template)
}
}
+   }, {
+   .alg = "zstd",
+   .test = alg_test_comp,
+   .fips_allowed = 1,
+   .suite = {
+   .comp = {
+   .comp = __VECS(zstd_comp_tv_template),
+   .decomp = __VECS(zstd_decomp_tv_template)
+   }
+   }
}
 };
 
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 6044f69..9b5b930 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -35255,4 +35255,75 @@ static const struct comp_testvec lz4hc_decomp_tv_template[] = {
},
 };
 
+static const struct comp_testvec zstd_comp_tv_template[] = {
+   {
+   .inlen  = 68,
+   .outlen = 39,
+   .input  = "The algorithm is zstd. "
+ "The algorithm is zstd. "
+ "The algorithm is zstd.",
+   .output = "\x28\xb5\x2f\xfd\x00\x50\xf5\x00\x00\xb8\x54\x68\x65"
+ "\x20\x61\x6c\x67\x6f\x72\x69\x74\x68\x6d\x20\x69\x73"
+ "\x20\x7a\x73\x74\x64\x2e\x20\x01\x00\x55\x73\x36\x01"
+ ,
+   },
+   {
+   .inlen  = 244,
+   .outlen = 151,
+   .input  = "zstd, short for Zstandard, is a fast lossless "
+ "compression algorithm, targeting real-time "
+ "compression scenarios at zlib-level and better "
+ "compression ratios. The zstd compression library "
+ "provides in-memory compression and decompression "
+ "functions.",
+   .output = "\x28\xb5\x2f\xfd\x00\x50\x75\x04\x00\x42\x4b\x1e\x17"
+ "\x90\x81\x31\x00\xf2\x2f\xe4\x36\xc9\xef\x92\x88\x32"
+ "\xc9\xf2\x24\x94\xd8\x68\x9a\x0f\x00\x0c\xc4\x31\x6f"
+ "\x0d\x0c\x38\xac\x5c\x48\x03\xcd\x63\x67\xc0\xf3\xad"
+ "\x4e\x90\xaa\x78\xa0\xa4\xc5\x99\xda\x2f\xb6\x24\x60"
+ "\xe2\x79\x4b\xaa\xb6\x6b\x85\x0b\xc9\xc6\x04\x66\x86"
+ "\xe2\xcc\xe2\x25\x3f\x4f\x09\xcd\xb8\x9d\xdb\xc1\x90"
+ "\xa9\x11\xbc\x35\x44\x69\x2d\x9c\x64\x4f\x13\x31\x64"
+ "\xcc\xfb\x4d\x95\x93\x86\x7f\x33\x7f\x1a\xef\xe9\x30"
+ "\xf9\x67\xa1\x94\x0a\x69\x0f\x60\xcd\xc3\xab\x99\xdc"
+ "\x42\xed\x97\x05\x00\x33\xc3\x15\x95\x3a\x06\xa0\x0e"
+ "\x20\xa9\x0e\x82\xb9\x43\x45\x01",
+   },
+};
+
+s

[RESEND PATCH] crypto: Add zstd support

2018-03-21 Thread Nick Terrell
Adds zstd support to crypto and scompress. Only supports the default
level.

Previously we held off on this patch, since there weren't any users.
Now zram is ready for zstd support, but depends on CONFIG_CRYPTO_ZSTD,
which isn't defined until this patch is in. I also see a patch adding
zstd to pstore [0], which depends on crypto zstd.

[0] 
lkml.kernel.org/r/9c9416b2dff19f05fb4c35879aaa83d11ff72c92.1521626182.git.geliangt...@gmail.com
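
The same algorithm is also reachable through the newer acomp/scomp interface;
a rough sketch of that path (illustrative only: NULL/error checks are trimmed
and the buffers must be usable with scatterlists):

#include <crypto/acompress.h>
#include <linux/scatterlist.h>

static int zstd_acomp_example(void *src, unsigned int slen,
                              void *dst, unsigned int dlen)
{
        struct crypto_acomp *tfm = crypto_alloc_acomp("zstd", 0, 0);
        struct acomp_req *req = acomp_request_alloc(tfm);
        struct scatterlist sg_src, sg_dst;
        DECLARE_CRYPTO_WAIT(wait);
        int err;

        sg_init_one(&sg_src, src, slen);
        sg_init_one(&sg_dst, dst, dlen);
        acomp_request_set_params(req, &sg_src, &sg_dst, slen, dlen);
        acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
                                   crypto_req_done, &wait);

        err = crypto_wait_req(crypto_acomp_compress(req), &wait);

        acomp_request_free(req);
        crypto_free_acomp(tfm);
        return err;
}
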
---
 crypto/Kconfig   |   9 ++
 crypto/Makefile  |   1 +
 crypto/testmgr.c |  10 +++
 crypto/testmgr.h |  71 +++
 crypto/zstd.c| 265 +++
 5 files changed, 356 insertions(+)
 create mode 100644 crypto/zstd.c

diff --git a/crypto/Kconfig b/crypto/Kconfig
index b75264b..500583c 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1677,6 +1677,15 @@ config CRYPTO_LZ4HC
help
  This is the LZ4 high compression mode algorithm.
 
+config CRYPTO_ZSTD
+   tristate "Zstd compression algorithm"
+   select CRYPTO_ALGAPI
+   select CRYPTO_ACOMP2
+   select ZSTD_COMPRESS
+   select ZSTD_DECOMPRESS
+   help
+ This is the zstd algorithm.
+
 comment "Random Number Generation"
 
 config CRYPTO_ANSI_CPRNG
diff --git a/crypto/Makefile b/crypto/Makefile
index cdbc03b..2900ab1 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -136,6 +136,7 @@ obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o
 obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
 obj-$(CONFIG_CRYPTO_USER_API_RNG) += algif_rng.o
 obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o
+obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o
 
 ecdh_generic-y := ecc.o
 ecdh_generic-y += ecdh.o
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index d5e23a1..71b2798 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -3576,6 +3576,16 @@ static const struct alg_test_desc alg_test_descs[] = {
.decomp = __VECS(zlib_deflate_decomp_tv_template)
}
}
+   }, {
+   .alg = "zstd",
+   .test = alg_test_comp,
+   .fips_allowed = 1,
+   .suite = {
+   .comp = {
+   .comp = __VECS(zstd_comp_tv_template),
+   .decomp = __VECS(zstd_decomp_tv_template)
+   }
+   }
}
 };
 
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 6044f69..9b5b930 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -35255,4 +35255,75 @@ static const struct comp_testvec lz4hc_decomp_tv_template[] = {
},
 };
 
+static const struct comp_testvec zstd_comp_tv_template[] = {
+   {
+   .inlen  = 68,
+   .outlen = 39,
+   .input  = "The algorithm is zstd. "
+ "The algorithm is zstd. "
+ "The algorithm is zstd.",
+   .output = "\x28\xb5\x2f\xfd\x00\x50\xf5\x00\x00\xb8\x54\x68\x65"
+ "\x20\x61\x6c\x67\x6f\x72\x69\x74\x68\x6d\x20\x69\x73"
+ "\x20\x7a\x73\x74\x64\x2e\x20\x01\x00\x55\x73\x36\x01"
+ ,
+   },
+   {
+   .inlen  = 244,
+   .outlen = 151,
+   .input  = "zstd, short for Zstandard, is a fast lossless "
+ "compression algorithm, targeting real-time "
+ "compression scenarios at zlib-level and better "
+ "compression ratios. The zstd compression library "
+ "provides in-memory compression and decompression "
+ "functions.",
+   .output = "\x28\xb5\x2f\xfd\x00\x50\x75\x04\x00\x42\x4b\x1e\x17"
+ "\x90\x81\x31\x00\xf2\x2f\xe4\x36\xc9\xef\x92\x88\x32"
+ "\xc9\xf2\x24\x94\xd8\x68\x9a\x0f\x00\x0c\xc4\x31\x6f"
+ "\x0d\x0c\x38\xac\x5c\x48\x03\xcd\x63\x67\xc0\xf3\xad"
+ "\x4e\x90\xaa\x78\xa0\xa4\xc5\x99\xda\x2f\xb6\x24\x60"
+ "\xe2\x79\x4b\xaa\xb6\x6b\x85\x0b\xc9\xc6\x04\x66\x86"
+ "\xe2\xcc\xe2\x25\x3f\x4f\x09\xcd\xb8\x9d\xdb\xc1\x90"
+ "\xa9\x11\xbc\x35\x44\x69\x2d\x9c\x64\x4f\x13\x31\x64"
+ "\xcc\xfb\x4d\x95\x93\x86\x7f\x33\x7f\x1a\xef\xe9\x30"
+ "\xf9\x67\xa1\x94\x0a\x69\x0f\x60\xcd\xc3\xab\x99\xdc"
+ "\x42\xed\x97\x05\x00\x33\xc3\x15\x95\x3a\x06\xa0\x0e"
+ "\x20\xa9\x0e\x82\xb9\x43\x45\x01",
+   },
+};
+
+static const struct comp_testvec zstd_decomp_tv_template[] = {
+   {
+   .inlen  = 43,
+   .outlen = 68,
+   .input  = "\x28\xb5\x2f\xfd\x04\x50\xf5\x00\x00\xb8\x54\x68\x65"
+ "\x20\x61\x6c\x67\x6f\x72\x69\x74\x68\x6d\x20\x69\x73"
+ 

Re: [PATCH 1/1] lz4: Implement lz4 with dynamic offset length.

2018-03-21 Thread Nick Terrell
On (03/21/18 10:10), Maninder Singh wrote:
> diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
> index cc7b6d4..185c358 100644
> --- a/lib/lz4/lz4_compress.c
> +++ b/lib/lz4/lz4_compress.c
> @@ -183,7 +183,8 @@ static FORCE_INLINE int LZ4_compress_generic(
>   const tableType_t tableType,
>   const dict_directive dict,
>   const dictIssue_directive dictIssue,
> - const U32 acceleration)
> + const U32 acceleration,
> + const Dynamic_Offset dynOffset)
>  {
>   const BYTE *ip = (const BYTE *) source;
>   const BYTE *base;
> @@ -199,6 +200,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>
>   BYTE *op = (BYTE *) dest;
>   BYTE * const olimit = op + maxOutputSize;
> + int max_distance = dynOffset ? MAX_DISTANCE_DYN : MAX_DISTANCE;

Let's mark this variable `const`. If the compiler doesn't realize that this
variable and `dynOffset` are compile-time constants, I expect the speed to
be impacted.
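
i.e. something like (a sketch, relying on the surrounding definitions):

        const int max_distance = dynOffset ? MAX_DISTANCE_DYN : MAX_DISTANCE;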

>
>   U32 forwardH;
>   size_t refDelta = 0;
> @@ -245,6 +247,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>   for ( ; ; ) {
>   const BYTE *match;
>   BYTE *token;
> + int curr_offset;
>
>   /* Find a match */
>   {
> @@ -285,7 +288,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>   : 0)
>   || ((tableType == byU16)
>   ? 0
> - : (match + MAX_DISTANCE < ip))
> + : (match + max_distance < ip))
>   || (LZ4_read32(match + refDelta)
>   != LZ4_read32(ip)));
>   }
> @@ -328,8 +331,26 @@ static FORCE_INLINE int LZ4_compress_generic(
>
>  _next_match:
>   /* Encode Offset */
> - LZ4_writeLE16(op, (U16)(ip - match));
> - op += 2;
> + if (dynOffset) {
> + curr_offset = (U16)(ip - match);
> +
> + /*
> +  * If Offset is greater than 127, we need 2 bytes
> +  * to store it. Otherwise 1 byte is enough.
> +  */
> + if (curr_offset > 127) {
> + curr_offset = (curr_offset << 1) | DYN_BIT;
> + LZ4_writeLE16(op, (U16)curr_offset);
> + op += 2;
> + } else {
> + curr_offset = curr_offset << 1;
> + *op = (BYTE)curr_offset;
> + op++;
> + }

The standard way to encode variable-sized integers is to use the high bit as
the control bit, not the low bit. Do you have benchmarks showing that using
the low bit is faster than using the high bit? If so, let's note that in a
comment in the code; if not, let's use the high bit.
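
For reference, a high-bit encoding would look roughly like this (a sketch
only, reusing the names from your patch; it still covers offsets up to 32 KB):

        if (curr_offset < 128) {
                /* 0xxxxxxx: offset fits in one byte */
                *op++ = (BYTE)curr_offset;
        } else {
                /* 1xxxxxxx, plus one extra byte for the high bits */
                *op++ = (BYTE)(0x80 | (curr_offset & 0x7f));
                *op++ = (BYTE)(curr_offset >> 7);
        }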

> + } else {
> + LZ4_writeLE16(op, (U16)(ip - match));
> + op += 2;
> + }
>
>   /* Encode MatchLength */
>   {
> @@ -480,39 +501,70 @@ static int LZ4_compress_fast_extState(
>   return LZ4_compress_generic(ctx, source,
>   dest, inputSize, 0,
>   noLimit, byU16, noDict,
> - noDictIssue, acceleration);
> + noDictIssue, acceleration, NoDynOffset);
>   else
>   return LZ4_compress_generic(ctx, source,
>   dest, inputSize, 0,
>   noLimit, tableType, noDict,
> - noDictIssue, acceleration);
> + noDictIssue, acceleration, NoDynOffset);
>   } else {
>   if (inputSize < LZ4_64Klimit)
>   return LZ4_compress_generic(ctx, source,
>   dest, inputSize,
>   maxOutputSize, limitedOutput, byU16, noDict,
> - noDictIssue, acceleration);
> + noDictIssue, acceleration, NoDynOffset);
>   else
>   return LZ4_compress_generic(ctx, source,
>   dest, inputSize,
>   maxOutputSize, limitedOutput, tableType, noDict,
> - noDictIssue, acceleration);
> + noDictIssue, acceleration, NoDynOffset);
>   }
>  }
>
> +static int LZ4_compress_fast_extState_dynamic(
> + void *state,
> + const char *source,
> + char *dest,
> + int inputSize,
> + int maxOutputSize,
> + int acceleration)
> +{
> + LZ4_stream_t_internal *ctx = &((LZ4_stream_t *)state)->internal_donotuse;
> +
> + LZ4_resetStream((LZ4_stream_t *)state);
> +
> + if (acceleration < 1)
> +

Re: [PATCH 0/1] cover-letter/lz4: Implement lz4 with dynamic offset length.

2018-03-21 Thread Nick Terrell
On (03/21/18 10:10), Maninder Singh wrote:
> The LZ4 specification defines a 2-byte offset length for 64 KB data.
> But in the case of ZRAM we compress data per page, and on most
> architectures PAGE_SIZE is 4 KB, so we can decide the offset length based
> on the actual offset value. For this we reserve 1 bit to indicate the offset
> length (1 byte or 2 bytes): 2 bytes are required only if the offset is
> greater than 127, otherwise 1 byte is enough.
> 
> With this new implementation new offset value can be at MAX 32 KB.
> 
> Thus we can save more memory for compressed data.
> 
> results checked with the new implementation:
> 
> compression size for the same input source
> (LZ4_DYN < LZO < LZ4)
> 
> LZO
> ===
> orig_data_size: 78917632
> compr_data_size: 15894668
> mem_used_total: 17117184
> 
> LZ4
> 
> orig_data_size: 78917632
> compr_data_size: 16310717
> mem_used_total: 17592320
> 
> LZ4_DYN
> ===
> orig_data_size: 78917632
> compr_data_size: 15520506
> mem_used_total: 16748544

This seems like a reasonable extension to the algorithm, and it looks like
LZ4_DYN is about a 5% improvement to compression ratio on your benchmark.
The biggest question I have is whether it is worthwhile to maintain a
separate, incompatible variant of LZ4 in the kernel, without any upstream,
for a 5% gain. If we do want to go forward with this, we should perform more
benchmarks.

I commented in the patch, but because the `dynOffset` variable isn't a
compile time static in LZ4_decompress_generic(), I suspect that the patch
causes a regression in decompression speed for both LZ4 and LZ4_DYN. You'll
need to re-run the benchmarks to first show that LZ4 before the patch
performs the same as LZ4 after the patch. Then re-run the LZ4 vs LZ4_DYN
benchmarks.

I would also like to see a benchmark in user-space (with the code), so we
can see the performance of LZ4 before and after the patch, as well as LZ4
vs LZ4_DYN without anything else going on. I expect the extra branches in
the decoding loop to have an impact on speed, and I would like to see how
big the impact is without noise. 
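
Something along these lines would be enough to isolate the decode loop (a
sketch against a userspace build of the lz4 sources, run before and after
porting your change to them; the corpus and iteration count are placeholders):

#include <lz4.h>
#include <stdio.h>
#include <time.h>

static double now_sec(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec + ts.tv_nsec / 1e9;
}

int main(void)
{
        enum { CHUNK = 4096, ITERS = 100000 };
        static char src[CHUNK], dst[LZ4_COMPRESSBOUND(CHUNK)], out[CHUNK];
        double t0, t1;
        int i, csize;

        /* Compressible placeholder data, one page-sized chunk. */
        for (i = 0; i < CHUNK; i++)
                src[i] = (char)(i % 64);

        csize = LZ4_compress_default(src, dst, CHUNK, sizeof(dst));
        if (csize <= 0)
                return 1;

        t0 = now_sec();
        for (i = 0; i < ITERS; i++)
                if (LZ4_decompress_safe(dst, out, csize, CHUNK) != CHUNK)
                        return 1;
        t1 = now_sec();

        printf("decompress: %.1f MB/s\n",
               (double)CHUNK * ITERS / (t1 - t0) / 1e6);
        return 0;
}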

CC-ing Yann Collet, the author of LZ4



Re: [PATCH v5 2/5] lib: Add zstd modules

2017-08-10 Thread Nick Terrell
On 8/10/17, 10:48 AM, "Austin S. Hemmelgarn" <ahferro...@gmail.com> wrote:
>On 2017-08-10 13:24, Eric Biggers wrote:
>>On Thu, Aug 10, 2017 at 07:32:18AM -0400, Austin S. Hemmelgarn wrote:
>>>On 2017-08-10 04:30, Eric Biggers wrote:
>>>>On Wed, Aug 09, 2017 at 07:35:53PM -0700, Nick Terrell wrote:
>>>>>
>>>>> It can compress at speeds approaching lz4, and quality approaching lzma.
>>>>
>>>> Well, for a very loose definition of "approaching", and certainly not at the
>>>> same time.  I doubt there's a use case for using the highest compression
>>>> levels in kernel mode --- especially the ones using zstd_opt.h.
>>> Large data-sets with WORM access patterns and infrequent writes
>>> immediately come to mind as a use case for the highest compression
>>> level.
>>>
>>> As a more specific example, the company I work for has a very large
>>> amount of documentation, and we keep all old versions.  This is all
>>> stored on a file server which is currently using BTRFS.  Once a
>>> document is written, it's almost never rewritten, so write
>>> performance only matters for the first write.  However, they're read
>>> back pretty frequently, so we need good read performance.  As of
>>> right now, the system is set to use LZO compression by default, and
>>> then when a new document is added, the previous version of that
>>> document gets re-compressed using zlib compression, which actually
>>> results in pretty significant space savings most of the time.  I
>>> would absolutely love to use zstd compression with this system with
>>> the highest compression level, because most people don't care how
>>> long it takes to write the file out, but they do care how long it
>>> takes to read a file (even if it's an older version).
>> 
>> This may be a reasonable use case, but note this cannot just be the regular
>> "zstd" compression setting, since filesystem compression by default must
>> provide reasonable performance for many different access patterns.  See the
>> patch in this series which actually adds zstd compression to btrfs; it only
>> uses level 1.  I do not see a patch which adds a higher compression mode.
>> It would need to be a special setting like "zstdhc" that users could opt-in
>> to on specific directories.  It also would need to be compared to simply
>> compressing in userspace.  In many cases compressing in userspace is
>> probably the better solution for the use case in question because it works
>> on any filesystem, allows using any compression algorithm, and if random
>> access is not needed it is possible to compress each file as a single
>> stream (like a .xz file), which produces a much better compression ratio
>> than the block-by-block compression that filesystems have to use.
> There has been discussion as well as (I think) initial patches merged 
> for support of specifying the compression level for algorithms which 
> support multiple compression levels in BTRFS.  I was actually under the 
> impression that we had decided to use level 3 as the default for zstd, 
> but that apparently isn't the case, and with the benchmark issues, it 
> may not be once proper benchmarks are run.

There are some initial patches to add compression levels to BtrFS [1]. Once
it's ready, we can add compression levels to zstd. The default compression
level in the current patch is 3.

[1] https://lkml.kernel.org/r/20170724172939.24527-1-dste...@suse.com




Re: [PATCH v5 2/5] lib: Add zstd modules

2017-08-10 Thread Nick Terrell
On 8/10/17, 1:30 AM, "Eric Biggers" <ebigge...@gmail.com> wrote:
> On Wed, Aug 09, 2017 at 07:35:53PM -0700, Nick Terrell wrote:
>>
>> It can compress at speeds approaching lz4, and quality approaching lzma.
> 
> Well, for a very loose definition of "approaching", and certainly not at the
> same time.  I doubt there's a use case for using the highest compression
> levels in kernel mode --- especially the ones using zstd_opt.h.
> 
>> 
>> The code was ported from the upstream zstd source repository.
> 
> What version?

zstd-1.1.4 with patches applied from upstream. I'll include it in the
next patch version.

>> `linux/zstd.h` header was modified to match linux kernel style.
>> The cross-platform and allocation code was stripped out. Instead zstd
>> requires the caller to pass a preallocated workspace. The source files
>> were clang-formatted [1] to match the Linux Kernel style as much as
>> possible. 
> 
> It would be easier to compare to the upstream version if it was not all
> reformatted.  There is a chance that bugs were introduced by Linux-specific
> changes, and it would be nice if they could be easily reviewed.  (Also I don't
> know what clang-format settings you used, but there are still a lot of
> differences from the Linux coding style.)

The clang-format settings I used are available in the zstd repo [1]. I left
the line length long, since it looked terrible otherwise. I set up a branch
in my zstd GitHub fork called "original-formatted" [2]. I've taken the source
I based the kernel patches off of [3] and ran clang-format without any other
changes. If you have any suggestions to improve the clang-formatting
please let me know.
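
Regarding the preallocated-workspace convention mentioned above, a one-shot
compression call against the ported linux/zstd.h looks roughly like this (a
sketch from memory of the header, not a drop-in snippet; allocation and error
checks trimmed):

#include <linux/vmalloc.h>
#include <linux/zstd.h>

static ssize_t zstd_oneshot(void *dst, size_t dst_capacity,
                            const void *src, size_t src_len)
{
        const ZSTD_parameters params = ZSTD_getParams(3, src_len, 0);
        const size_t wksp_size = ZSTD_CCtxWorkspaceBound(params.cParams);
        void *wksp = vzalloc(wksp_size);
        ZSTD_CCtx *cctx = ZSTD_initCCtx(wksp, wksp_size);
        size_t out_len;

        out_len = ZSTD_compressCCtx(cctx, dst, dst_capacity, src, src_len, params);
        vfree(wksp);
        return ZSTD_isError(out_len) ? -EINVAL : (ssize_t)out_len;
}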

>> 
>> I benchmarked zstd compression as a special character device. I ran zstd
>> and zlib compression at several levels, as well as performing no
>> compression, which measures the time spent copying the data to kernel space.
>> Data is passed to the compressor 4096 B at a time. The benchmark file is
>> located in the upstream zstd source repository under
>> `contrib/linux-kernel/zstd_compress_test.c` [2].
>> 
>> I ran the benchmarks on a Ubuntu 14.04 VM with 2 cores and 4 GiB of RAM.
>> The VM is running on a MacBook Pro with a 3.1 GHz Intel Core i7 processor,
>> 16 GB of RAM, and a SSD. I benchmarked using `silesia.tar` [3], which is
>> 211,988,480 B large. Run the following commands for the benchmark:
>> 
>> sudo modprobe zstd_compress_test
>> sudo mknod zstd_compress_test c 245 0
>> sudo cp silesia.tar zstd_compress_test
>> 
>> The time is reported by the time of the userland `cp`.
>> The MB/s is computed with
>> 
>> 211,988,480 B / time(method)
>> 
>> which includes the time to copy from userland.
>> The Adjusted MB/s is computed with
>> 
>> 211,988,480 B / (time(method) - time(none)).
>> 
>> The memory reported is the amount of memory the compressor requests.
>> 
>> | Method   | Size (B)  | Time (s) | Ratio | MB/s    | Adj MB/s | Mem (MB) |
>> |----------|-----------|----------|-------|---------|----------|----------|
>> | none     | 211988480 |    0.100 | 1     | 2119.88 |        - |        - |
>> | zstd -1  |  73645762 |    1.044 | 2.878 |  203.05 |   224.56 |     1.23 |
>> | zstd -3  |  66988878 |    1.761 | 3.165 |  120.38 |   127.63 |     2.47 |
>> | zstd -5  |  65001259 |    2.563 | 3.261 |   82.71 |    86.07 |     2.86 |
>> | zstd -10 |  60165346 |   13.242 | 3.523 |   16.01 |    16.13 |    13.22 |
>> | zstd -15 |  58009756 |   47.601 | 3.654 |    4.45 |     4.46 |    21.61 |
>> | zstd -19 |  54014593 |  102.835 | 3.925 |    2.06 |     2.06 |    60.15 |
>> | zlib -1  |  77260026 |    2.895 | 2.744 |   73.23 |    75.85 |     0.27 |
>> | zlib -3  |  72972206 |    4.116 | 2.905 |   51.50 |    52.79 |     0.27 |
>> | zlib -6  |  68190360 |    9.633 | 3.109 |   22.01 |    22.24 |     0.27 |
>> | zlib -9  |  67613382 |   22.554 | 3.135 |    9.40 |     9.44 |     0.27 |
>> 
> 
> These benchmarks are misleading because they compress the whole file as a
> single stream without resetting the dictionary, which isn't how data will
> typically be compressed in kernel mode.  With filesystem compression the data
> has to be divided into small chunks that can each be decompressed 
> independently.
> That eliminates one of the primary advantages of Zstandard (support for large
> dictionary sizes).

This benchmark isn't meant to be representative of a filesystem scenario. I
wanted to show off zstd without anything else going on. Even in filesystems
where the data is chunked, zstd uses the whole chunk as the window.

[PATCH v5 5/5] crypto: Add zstd support

2017-08-09 Thread Nick Terrell
Adds zstd support to crypto and scompress. Only supports the default
level.

Signed-off-by: Nick Terrell <terre...@fb.com>
---
 crypto/Kconfig   |   9 ++
 crypto/Makefile  |   1 +
 crypto/testmgr.c |  10 +++
 crypto/testmgr.h |  71 +++
 crypto/zstd.c| 265 +++
 5 files changed, 356 insertions(+)
 create mode 100644 crypto/zstd.c

diff --git a/crypto/Kconfig b/crypto/Kconfig
index caa770e..4fc3936 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1662,6 +1662,15 @@ config CRYPTO_LZ4HC
help
  This is the LZ4 high compression mode algorithm.

+config CRYPTO_ZSTD
+   tristate "Zstd compression algorithm"
+   select CRYPTO_ALGAPI
+   select CRYPTO_ACOMP2
+   select ZSTD_COMPRESS
+   select ZSTD_DECOMPRESS
+   help
+ This is the zstd algorithm.
+
 comment "Random Number Generation"

 config CRYPTO_ANSI_CPRNG
diff --git a/crypto/Makefile b/crypto/Makefile
index d41f033..b22e1e8 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -133,6 +133,7 @@ obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o
 obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
 obj-$(CONFIG_CRYPTO_USER_API_RNG) += algif_rng.o
 obj-$(CONFIG_CRYPTO_USER_API_AEAD) += algif_aead.o
+obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o

 ecdh_generic-y := ecc.o
 ecdh_generic-y += ecdh.o
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 7125ba3..8a124d3 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -3603,6 +3603,16 @@ static const struct alg_test_desc alg_test_descs[] = {
.decomp = __VECS(zlib_deflate_decomp_tv_template)
}
}
+   }, {
+   .alg = "zstd",
+   .test = alg_test_comp,
+   .fips_allowed = 1,
+   .suite = {
+   .comp = {
+   .comp = __VECS(zstd_comp_tv_template),
+   .decomp = __VECS(zstd_decomp_tv_template)
+   }
+   }
}
 };

diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 6ceb0e2..e6b5920 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -34631,4 +34631,75 @@ static const struct comp_testvec lz4hc_decomp_tv_template[] = {
},
 };

+static const struct comp_testvec zstd_comp_tv_template[] = {
+   {
+   .inlen  = 68,
+   .outlen = 39,
+   .input  = "The algorithm is zstd. "
+ "The algorithm is zstd. "
+ "The algorithm is zstd.",
+   .output = "\x28\xb5\x2f\xfd\x00\x50\xf5\x00\x00\xb8\x54\x68\x65"
+ "\x20\x61\x6c\x67\x6f\x72\x69\x74\x68\x6d\x20\x69\x73"
+ "\x20\x7a\x73\x74\x64\x2e\x20\x01\x00\x55\x73\x36\x01"
+ ,
+   },
+   {
+   .inlen  = 244,
+   .outlen = 151,
+   .input  = "zstd, short for Zstandard, is a fast lossless "
+ "compression algorithm, targeting real-time "
+ "compression scenarios at zlib-level and better "
+ "compression ratios. The zstd compression library "
+ "provides in-memory compression and decompression "
+ "functions.",
+   .output = "\x28\xb5\x2f\xfd\x00\x50\x75\x04\x00\x42\x4b\x1e\x17"
+ "\x90\x81\x31\x00\xf2\x2f\xe4\x36\xc9\xef\x92\x88\x32"
+ "\xc9\xf2\x24\x94\xd8\x68\x9a\x0f\x00\x0c\xc4\x31\x6f"
+ "\x0d\x0c\x38\xac\x5c\x48\x03\xcd\x63\x67\xc0\xf3\xad"
+ "\x4e\x90\xaa\x78\xa0\xa4\xc5\x99\xda\x2f\xb6\x24\x60"
+ "\xe2\x79\x4b\xaa\xb6\x6b\x85\x0b\xc9\xc6\x04\x66\x86"
+ "\xe2\xcc\xe2\x25\x3f\x4f\x09\xcd\xb8\x9d\xdb\xc1\x90"
+ "\xa9\x11\xbc\x35\x44\x69\x2d\x9c\x64\x4f\x13\x31\x64"
+ "\xcc\xfb\x4d\x95\x93\x86\x7f\x33\x7f\x1a\xef\xe9\x30"
+ "\xf9\x67\xa1\x94\x0a\x69\x0f\x60\xcd\xc3\xab\x99\xdc"
+ "\x42\xed\x97\x05\x00\x33\xc3\x15\x95\x3a\x06\xa0\x0e"
+ "\x20\xa9\x0e\x82\xb9\x43\x45\x01",
+   },
+};
+
+static const struct comp_testvec zstd_decomp_tv_template[] = {
+   {
+   .inlen  = 43,
+   .outlen = 68,
+   .input  = "\x28\xb5\x2f\xfd\x04\x50\xf5\x00\x00\xb8\x54\x68\x65"
+ "\x20\x61\x6c\x67\x6f\x72\x69\x74\x68\x6d\x20\x69\x73"
+ "\x20\

[PATCH v5 1/5] lib: Add xxhash module

2017-08-09 Thread Nick Terrell
Adds xxhash kernel module with xxh32 and xxh64 hashes. xxhash is an
extremely fast non-cryptographic hash algorithm for checksumming.
The zstd compression and decompression modules added in the next patch
require xxhash. I extracted it out from zstd since it is useful on its
own. I copied the code from the upstream XXHash source repository and
translated it into kernel style. I ran benchmarks and tests in the kernel
and tests in userland.
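
For reference, the exported interface mirrors the upstream one; a minimal
usage sketch of the one-shot and streaming forms (seed and buffer are
arbitrary, return-value checks trimmed):

#include <linux/bug.h>
#include <linux/xxhash.h>

static void xxhash_example(const void *buf, size_t len)
{
        struct xxh64_state state;
        uint64_t one_shot, streamed;

        /* One-shot hash of a buffer with seed 0. */
        one_shot = xxh64(buf, len, 0);

        /* Streaming interface, for data that arrives in pieces. */
        xxh64_reset(&state, 0);
        xxh64_update(&state, buf, len);
        streamed = xxh64_digest(&state);

        /* Both forms must agree. */
        WARN_ON(one_shot != streamed);
}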

I benchmarked xxhash as a special character device. I ran in four modes,
no-op, xxh32, xxh64, and crc32. The no-op mode simply copies the data to
kernel space and ignores it. The xxh32, xxh64, and crc32 modes compute
hashes on the copied data. I also ran it with four different buffer sizes.
The benchmark file is located in the upstream zstd source repository under
`contrib/linux-kernel/xxhash_test.c` [1].

I ran the benchmarks on a Ubuntu 14.04 VM with 2 cores and 4 GiB of RAM.
The VM is running on a MacBook Pro with a 3.1 GHz Intel Core i7 processor,
16 GB of RAM, and a SSD. I benchmarked using the file `filesystem.squashfs`
from `ubuntu-16.10-desktop-amd64.iso`, which is 1,536,217,088 B large.
Run the following commands for the benchmark:

modprobe xxhash_test
mknod xxhash_test c 245 0
time cp filesystem.squashfs xxhash_test

The time is reported by the time of the userland `cp`.
The GB/s is computed with

1,536,217,088 B / time(buffer size, hash)

which includes the time to copy from userland.
The Adjusted GB/s is computed with

1,536,217,088 B / (time(buffer size, hash) - time(buffer size, none)).


| Buffer Size (B) | Hash  | Time (s) | GB/s | Adjusted GB/s |
|-----------------|-------|----------|------|---------------|
|            1024 | none  |    0.408 | 3.77 |             - |
|            1024 | xxh32 |    0.649 | 2.37 |          6.37 |
|            1024 | xxh64 |    0.542 | 2.83 |         11.46 |
|            1024 | crc32 |    1.290 | 1.19 |          1.74 |
|            4096 | none  |    0.380 | 4.04 |             - |
|            4096 | xxh32 |    0.645 | 2.38 |          5.79 |
|            4096 | xxh64 |    0.500 | 3.07 |         12.80 |
|            4096 | crc32 |    1.168 | 1.32 |          1.95 |
|            8192 | none  |    0.351 | 4.38 |             - |
|            8192 | xxh32 |    0.614 | 2.50 |          5.84 |
|            8192 | xxh64 |    0.464 | 3.31 |         13.60 |
|            8192 | crc32 |    1.163 | 1.32 |          1.89 |
|           16384 | none  |    0.346 | 4.43 |             - |
|           16384 | xxh32 |    0.590 | 2.60 |          6.30 |
|           16384 | xxh64 |    0.466 | 3.30 |         12.80 |
|           16384 | crc32 |    1.183 | 1.30 |          1.84 |

Tested in userland using the test-suite in the zstd repo under
`contrib/linux-kernel/test/XXHashUserlandTest.cpp` [2] by mocking the
kernel functions. A line in each branch of every function in `xxhash.c`
was commented out to ensure that the test-suite fails. Additionally
tested while testing zstd and with SMHasher [3].

[1] https://phabricator.intern.facebook.com/P57526246
[2] 
https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/test/XXHashUserlandTest.cpp
[3] https://github.com/aappleby/smhasher

zstd source repository: https://github.com/facebook/zstd
XXHash source repository: https://github.com/cyan4973/xxhash

Signed-off-by: Nick Terrell <terre...@fb.com>
---
v1 -> v2:
- Make pointer in lib/xxhash.c:394 non-const

 include/linux/xxhash.h | 236 +++
 lib/Kconfig|   3 +
 lib/Makefile   |   1 +
 lib/xxhash.c   | 500 +
 4 files changed, 740 insertions(+)
 create mode 100644 include/linux/xxhash.h
 create mode 100644 lib/xxhash.c

diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h
new file mode 100644
index 000..9e1f42c
--- /dev/null
+++ b/include/linux/xxhash.h
@@ -0,0 +1,236 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Copyright (C) 2012-2016, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, I

[PATCH v5 0/5] Add xxhash and zstd modules

2017-08-09 Thread Nick Terrell
Hi all,

This patch set adds xxhash, zstd compression, and zstd decompression
modules. It also adds zstd support to BtrFS and SquashFS.

Each patch has relevant summaries, benchmarks, and tests.

Best,
Nick Terrell

Changelog:

v1 -> v2:
- Make pointer in lib/xxhash.c:394 non-const (1/5)
- Use div_u64() for division of u64s (2/5)
- Reduce stack usage of ZSTD_compressSequences(), ZSTD_buildSeqTable(),
  ZSTD_decompressSequencesLong(), FSE_buildDTable(), FSE_decompress_wksp(),
  HUF_writeCTable(), HUF_readStats(), HUF_readCTable(),
  HUF_compressWeights(), HUF_readDTableX2(), and HUF_readDTableX4() (2/5)
- No zstd function uses more than 400 B of stack space (2/5)

v2 -> v3:
- Work around gcc-7 bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81388
  (2/5)
- Fix bug in dictionary compression from upstream commit cc1522351f (2/5)
- Port upstream BtrFS commits e1ddce71d6, 389a6cfc2a, and 6acafd1eff (3/5)
- Change default compression level for BtrFS to 3 (3/5)

v3 -> v4:
- Fix compiler warnings (2/5)
- Add missing includes (3/5)
- Fix minor linter warnings (3/5, 4/5)
- Add crypto patch (5/5)

v4 -> v5:
- Fix rare compression bug from upstream commit 308047eb5d (2/5)
- Fix bug introduced in v3 when working around the gcc-7 bug (2/5)
- Fix ZSTD_DStream initialization code in squashfs (4/5)
- Fix patch documentation for patches written by Sean Purcell (4/5)

Nick Terrell (5):
  lib: Add xxhash module
  lib: Add zstd modules
  btrfs: Add zstd support
  squashfs: Add zstd support
  crypto: Add zstd support

 crypto/Kconfig |9 +
 crypto/Makefile|1 +
 crypto/testmgr.c   |   10 +
 crypto/testmgr.h   |   71 +
 crypto/zstd.c  |  265 
 fs/btrfs/Kconfig   |2 +
 fs/btrfs/Makefile  |2 +-
 fs/btrfs/compression.c |1 +
 fs/btrfs/compression.h |6 +-
 fs/btrfs/ctree.h   |1 +
 fs/btrfs/disk-io.c |2 +
 fs/btrfs/ioctl.c   |6 +-
 fs/btrfs/props.c   |6 +
 fs/btrfs/super.c   |   12 +-
 fs/btrfs/sysfs.c   |2 +
 fs/btrfs/zstd.c|  432 ++
 fs/squashfs/Kconfig|   14 +
 fs/squashfs/Makefile   |1 +
 fs/squashfs/decompressor.c |7 +
 fs/squashfs/decompressor.h |4 +
 fs/squashfs/squashfs_fs.h  |1 +
 fs/squashfs/zstd_wrapper.c |  151 ++
 include/linux/xxhash.h |  236 +++
 include/linux/zstd.h   | 1157 +++
 include/uapi/linux/btrfs.h |8 +-
 lib/Kconfig|   11 +
 lib/Makefile   |3 +
 lib/xxhash.c   |  500 +++
 lib/zstd/Makefile  |   18 +
 lib/zstd/bitstream.h   |  374 +
 lib/zstd/compress.c| 3484 
 lib/zstd/decompress.c  | 2528 
 lib/zstd/entropy_common.c  |  243 +++
 lib/zstd/error_private.h   |   53 +
 lib/zstd/fse.h |  575 
 lib/zstd/fse_compress.c|  795 ++
 lib/zstd/fse_decompress.c  |  332 +
 lib/zstd/huf.h |  212 +++
 lib/zstd/huf_compress.c|  770 ++
 lib/zstd/huf_decompress.c  |  960 
 lib/zstd/mem.h |  151 ++
 lib/zstd/zstd_common.c |   75 +
 lib/zstd/zstd_internal.h   |  263 
 lib/zstd/zstd_opt.h| 1014 +
 44 files changed, 14756 insertions(+), 12 deletions(-)
 create mode 100644 crypto/zstd.c
 create mode 100644 fs/btrfs/zstd.c
 create mode 100644 fs/squashfs/zstd_wrapper.c
 create mode 100644 include/linux/xxhash.h
 create mode 100644 include/linux/zstd.h
 create mode 100644 lib/xxhash.c
 create mode 100644 lib/zstd/Makefile
 create mode 100644 lib/zstd/bitstream.h
 create mode 100644 lib/zstd/compress.c
 create mode 100644 lib/zstd/decompress.c
 create mode 100644 lib/zstd/entropy_common.c
 create mode 100644 lib/zstd/error_private.h
 create mode 100644 lib/zstd/fse.h
 create mode 100644 lib/zstd/fse_compress.c
 create mode 100644 lib/zstd/fse_decompress.c
 create mode 100644 lib/zstd/huf.h
 create mode 100644 lib/zstd/huf_compress.c
 create mode 100644 lib/zstd/huf_decompress.c
 create mode 100644 lib/zstd/mem.h
 create mode 100644 lib/zstd/zstd_common.c
 create mode 100644 lib/zstd/zstd_internal.h
 create mode 100644 lib/zstd/zstd_opt.h

--
2.9.3