From: Zeev Tarantov <[email protected]>

Google's Snappy data compression library is a faster alternative to LZO,
optimized for x86-64. On compressible input it compresses ~2.5x faster than
LZO and decompresses ~1.5-2x faster. On incompressible input it skips over
the input ~100x faster than LZO and decompresses ~4x faster.
It is released under the BSD license.
This is a kernel port of the user-space C++ code.
The current intended use is with zram (see the next patch in this series).
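A minimal in-kernel usage sketch of the exported API (illustrative only, not
part of this patch; the example function name and buffer handling are
assumptions):

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include "csnappy.h"

/* Hypothetical round-trip: compress src_len bytes, then decompress them. */
static int csnappy_example_roundtrip(const char *src, uint32_t src_len)
{
	void *workmem = kmalloc(CSNAPPY_WORKMEM_BYTES, GFP_KERNEL);
	char *comp = vmalloc(csnappy_max_compressed_length(src_len));
	char *out = vmalloc(src_len);
	uint32_t comp_len = 0;
	int ret = -ENOMEM;

	if (!workmem || !comp || !out)
		goto out;
	csnappy_compress(src, src_len, comp, &comp_len,
			 workmem, CSNAPPY_WORKMEM_BYTES_POWER_OF_TWO);
	/* comp[0..comp_len) now holds the length header plus compressed data */
	ret = csnappy_decompress(comp, comp_len, out, src_len);
out:
	kfree(workmem);
	vfree(comp);
	vfree(out);
	return ret;
}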

Signed-off-by: Zeev Tarantov <[email protected]>
---
 drivers/staging/Kconfig                     |    2 +
 drivers/staging/Makefile                    |    2 +
 drivers/staging/snappy/Makefile             |    5 +
 drivers/staging/snappy/csnappy.h            |  125 +++++++
 drivers/staging/snappy/csnappy_compress.c   |  497 +++++++++++++++++++++++++++
 drivers/staging/snappy/csnappy_decompress.c |  321 +++++++++++++++++
 drivers/staging/snappy/csnappy_internal.h   |   83 +++++
 7 files changed, 1035 insertions(+), 0 deletions(-)
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index dca4a0b..6c222f0 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -123,6 +123,8 @@ source "drivers/staging/iio/Kconfig"
 
 source "drivers/staging/cs5535_gpio/Kconfig"
 
+source "drivers/staging/snappy/Kconfig"
+
 source "drivers/staging/zram/Kconfig"
 
 source "drivers/staging/zcache/Kconfig"
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index eb93012..3b216a6 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -71,3 +71,5 @@ obj-$(CONFIG_ALTERA_STAPL)    +=altera-stapl/
 obj-$(CONFIG_TOUCHSCREEN_CLEARPAD_TM1217)      += cptm1217/
 obj-$(CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI4)   += ste_rmi4/
 obj-$(CONFIG_DRM_PSB)          += gma500/
+obj-$(CONFIG_SNAPPY_COMPRESS)  += snappy/
+obj-$(CONFIG_SNAPPY_DECOMPRESS)        += snappy/
diff --git a/drivers/staging/snappy/Makefile b/drivers/staging/snappy/Makefile
new file mode 100644
index 0000000..399d070
--- /dev/null
+++ b/drivers/staging/snappy/Makefile
@@ -0,0 +1,5 @@
+snappy_compress-objs := csnappy_compress.o
+snappy_decompress-objs := csnappy_decompress.o
+
+obj-$(CONFIG_SNAPPY_COMPRESS) += snappy_compress.o
+obj-$(CONFIG_SNAPPY_DECOMPRESS) += snappy_decompress.o
diff --git a/drivers/staging/snappy/csnappy.h b/drivers/staging/snappy/csnappy.h
new file mode 100644
index 0000000..af68232
--- /dev/null
+++ b/drivers/staging/snappy/csnappy.h
@@ -0,0 +1,125 @@
+#ifndef __CSNAPPY_H__
+#define __CSNAPPY_H__
+/*
+File modified for the Linux Kernel by
+Zeev Tarantov <[email protected]>
+*/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CSNAPPY_VERSION        4
+
+#define CSNAPPY_WORKMEM_BYTES_POWER_OF_TWO 15
+#define CSNAPPY_WORKMEM_BYTES (1 << CSNAPPY_WORKMEM_BYTES_POWER_OF_TWO)
+
+/*
+ * Returns the maximal size of the compressed representation of
+ * input data that is "source_len" bytes in length.
+ */
+uint32_t
+csnappy_max_compressed_length(uint32_t source_len) __attribute__((const));
+
+/*
+ * Flat array compression that does not emit the "uncompressed length"
+ * prefix. Compresses "input" array to the "output" array.
+ *
+ * REQUIRES: "input" is at most 32KiB long.
+ * REQUIRES: "output" points to an array of memory that is at least
+ * "csnappy_max_compressed_length(input_length)" in size.
+ * REQUIRES: working_memory has (1 << workmem_bytes_power_of_two) bytes.
+ * REQUIRES: 9 <= workmem_bytes_power_of_two <= 15.
+ *
+ * Returns an "end" pointer into "output" buffer.
+ * "end - output" is the compressed size of "input".
+ */
+char*
+csnappy_compress_fragment(
+       const char *input,
+       const uint32_t input_length,
+       char *output,
+       void *working_memory,
+       const int workmem_bytes_power_of_two);
+
+/*
+ * REQUIRES: "compressed" must point to an area of memory that is at
+ * least "csnappy_max_compressed_length(input_length)" bytes in length.
+ * REQUIRES: working_memory has (1 << workmem_bytes_power_of_two) bytes.
+ * REQUIRES: 9 <= workmem_bytes_power_of_two <= 15.
+ *
+ * Takes the data stored in "input[0..input_length]" and stores
+ * it in the array pointed to by "compressed".
+ *
+ * "*out_compressed_length" is set to the length of the compressed output.
+ */
+void
+csnappy_compress(
+       const char *input,
+       uint32_t input_length,
+       char *compressed,
+       uint32_t *out_compressed_length,
+       void *working_memory,
+       const int workmem_bytes_power_of_two);
+
+/*
+ * Reads header of compressed data to get stored length of uncompressed data.
+ * REQUIRES: start points to compressed data.
+ * REQUIRES: n is length of available compressed data.
+ *
+ * Returns SNAPPY_E_HEADER_BAD on error.
+ * Returns number of bytes read from input on success.
+ * Stores decoded length into *result.
+ */
+int
+csnappy_get_uncompressed_length(
+       const char *start,
+       uint32_t n,
+       uint32_t *result);
+
+/*
+ * Safely decompresses all data from array "src" of length "src_len" containing
+ * entire compressed stream (with header) into array "dst" of size "dst_len".
+ * REQUIRES: dst_len is at least csnappy_get_uncompressed_length(...).
+ *
+ * Iff successful, returns CSNAPPY_E_OK.
+ * If recorded length in header is greater than dst_len, returns
+ *  CSNAPPY_E_OUTPUT_INSUF.
+ * If compressed data is malformed, does not write more than dst_len into dst.
+ */
+int
+csnappy_decompress(
+       const char *src,
+       uint32_t src_len,
+       char *dst,
+       uint32_t dst_len);
+
+/*
+ * Safely decompresses a stream of src_len bytes read from src into dst.
+ * The amount of space available at dst must be provided by the caller
+ *  in *dst_len.
+ * If the compressed stream needs more space, the function does not overflow
+ *  dst and returns CSNAPPY_E_OUTPUT_OVERRUN.
+ * On success, sets *dst_len to the actual number of bytes decompressed.
+ * Iff successful, returns CSNAPPY_E_OK.
+ */
+int
+csnappy_decompress_noheader(
+       const char *src,
+       uint32_t src_len,
+       char *dst,
+       uint32_t *dst_len);
+
+/*
+ * Return values (< 0 = Error)
+ */
+#define CSNAPPY_E_OK                   0
+#define CSNAPPY_E_HEADER_BAD           (-1)
+#define CSNAPPY_E_OUTPUT_INSUF         (-2)
+#define CSNAPPY_E_OUTPUT_OVERRUN       (-3)
+#define CSNAPPY_E_INPUT_NOT_CONSUMED   (-4)
+#define CSNAPPY_E_DATA_MALFORMED       (-5)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/drivers/staging/snappy/csnappy_compress.c b/drivers/staging/snappy/csnappy_compress.c
new file mode 100644
index 0000000..40bb90d
--- /dev/null
+++ b/drivers/staging/snappy/csnappy_compress.c
@@ -0,0 +1,497 @@
+/*
+Copyright 2011, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+  * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+File modified for the Linux Kernel by
+Zeev Tarantov <[email protected]>
+*/
+
+#include "csnappy_internal.h"
+#ifdef __KERNEL__
+#include <linux/kernel.h>
+#include <linux/module.h>
+#endif
+#include "csnappy.h"
+
+
+static inline char*
+encode_varint32(char *sptr, uint32_t v)
+{
+       uint8_t* ptr = (uint8_t *)sptr;
+       static const int B = 128;
+       if (v < (1<<7)) {
+               *(ptr++) = v;
+       } else if (v < (1<<14)) {
+               *(ptr++) = v | B;
+               *(ptr++) = v>>7;
+       } else if (v < (1<<21)) {
+               *(ptr++) = v | B;
+               *(ptr++) = (v>>7) | B;
+               *(ptr++) = v>>14;
+       } else if (v < (1<<28)) {
+               *(ptr++) = v | B;
+               *(ptr++) = (v>>7) | B;
+               *(ptr++) = (v>>14) | B;
+               *(ptr++) = v>>21;
+       } else {
+               *(ptr++) = v | B;
+               *(ptr++) = (v>>7) | B;
+               *(ptr++) = (v>>14) | B;
+               *(ptr++) = (v>>21) | B;
+               *(ptr++) = v>>28;
+       }
+       return (char *)ptr;
+}
+
+
+/*
+ * Any hash function will produce a valid compressed bitstream, but a good
+ * hash function reduces the number of collisions and thus yields better
+ * compression for compressible input, and more speed for incompressible
+ * input. Of course, it doesn't hurt if the hash function is reasonably fast
+ * either, as it gets called a lot.
+ */
+static inline uint32_t HashBytes(uint32_t bytes, int shift)
+{
+       uint32_t kMul = 0x1e35a7bd;
+       return (bytes * kMul) >> shift;
+}
+static inline uint32_t Hash(const char *p, int shift)
+{
+       return HashBytes(UNALIGNED_LOAD32(p), shift);
+}
+
+
+/*
+ * *** DO NOT CHANGE THE VALUE OF kBlockSize ***
+
+ * New Compression code chops up the input into blocks of at most
+ * the following size.  This ensures that back-references in the
+ * output never cross kBlockSize block boundaries.  This can be
+ * helpful in implementing blocked decompression.  However the
+ * decompression code should not rely on this guarantee since older
+ * compression code may not obey it.
+ */
+#define kBlockLog 15
+#define kBlockSize (1 << kBlockLog)
+
+
+/*
+ * Return the largest n such that
+ *
+ *   s1[0,n-1] == s2[0,n-1]
+ *   and n <= (s2_limit - s2).
+ *
+ * Does not read *s2_limit or beyond.
+ * Does not read *(s1 + (s2_limit - s2)) or beyond.
+ * Requires that s2_limit >= s2.
+ *
+ * Separate implementation for x86_64, for speed.  Uses the fact that
+ * x86_64 is little endian.
+ */
+#if defined(__x86_64__)
+static inline int
+FindMatchLength(const char *s1, const char *s2, const char *s2_limit)
+{
+       uint64_t x;
+       int matched, matching_bits;
+       DCHECK_GE(s2_limit, s2);
+       matched = 0;
+       /*
+        * Find out how long the match is. We loop over the data 64 bits at a
+        * time until we find a 64-bit block that doesn't match; then we find
+        * the first non-matching bit and use that to calculate the total
+        * length of the match.
+        */
+       while (likely(s2 <= s2_limit - 8)) {
+               if (unlikely(UNALIGNED_LOAD64(s1 + matched) ==
+                               UNALIGNED_LOAD64(s2))) {
+                       s2 += 8;
+                       matched += 8;
+               } else {
+                       /*
+                        * On current (mid-2008) Opteron models there is a 3%
+                        * more efficient code sequence to find the first
+                        * non-matching byte. However, what follows is ~10%
+                        * better on Intel Core 2 and newer, and we expect AMD's
+                        * bsf instruction to improve.
+                        */
+                       x = UNALIGNED_LOAD64(s1 + matched) ^
+                               UNALIGNED_LOAD64(s2);
+                       matching_bits = FindLSBSetNonZero64(x);
+                       matched += matching_bits >> 3;
+                       return matched;
+               }
+       }
+       while (likely(s2 < s2_limit)) {
+               if (likely(s1[matched] == *s2)) {
+                       ++s2;
+                       ++matched;
+               } else {
+                       return matched;
+               }
+       }
+       return matched;
+}
+#else /* !defined(__x86_64__) */
+static inline int
+FindMatchLength(const char *s1, const char *s2, const char *s2_limit)
+{
+       /* Implementation based on the x86-64 version, above. */
+       int matched = 0;
+       DCHECK_GE(s2_limit, s2);
+
+       while (s2 <= s2_limit - 4 &&
+               UNALIGNED_LOAD32(s2) == UNALIGNED_LOAD32(s1 + matched)) {
+               s2 += 4;
+               matched += 4;
+       }
+#if defined(__LITTLE_ENDIAN)
+       if (s2 <= s2_limit - 4) {
+               uint32_t x = UNALIGNED_LOAD32(s1 + matched) ^
+                               UNALIGNED_LOAD32(s2);
+               int matching_bits = FindLSBSetNonZero(x);
+               matched += matching_bits >> 3;
+       } else {
+               while ((s2 < s2_limit) && (s1[matched] == *s2)) {
+                       ++s2;
+                       ++matched;
+               }
+       }
+#else
+       while ((s2 < s2_limit) && (s1[matched] == *s2)) {
+               ++s2;
+               ++matched;
+       }
+#endif
+       return matched;
+}
+#endif /* !defined(__x86_64__) */
+
+
+static inline char*
+EmitLiteral(char *op, const char *literal, int len, int allow_fast_path)
+{
+       int n = len - 1; /* Zero-length literals are disallowed */
+       if (n < 60) {
+               /* Fits in tag byte */
+               *op++ = LITERAL | (n << 2);
+               /*
+               The vast majority of copies are below 16 bytes, for which a
+               call to memcpy is overkill. This fast path can sometimes
+               copy up to 15 bytes too much, but that is okay in the
+               main loop, since we have a bit to go on for both sides:
+               - The input will always have kInputMarginBytes = 15 extra
+               available bytes, as long as we're in the main loop, and
+               if not, allow_fast_path = false.
+               - The output will always have 32 spare bytes (see
+               snappy_max_compressed_length).
+               */
+               if (allow_fast_path && len <= 16) {
+                       UNALIGNED_STORE64(op, UNALIGNED_LOAD64(literal));
+                       UNALIGNED_STORE64(op + 8,
+                                               UNALIGNED_LOAD64(literal + 8));
+                       return op + len;
+               }
+       } else {
+               /* Encode in upcoming bytes */
+               char *base = op;
+               int count = 0;
+               op++;
+               while (n > 0) {
+                       *op++ = n & 0xff;
+                       n >>= 8;
+                       count++;
+               }
+               DCHECK_GE(count, 1);
+               DCHECK_LE(count, 4);
+               *base = LITERAL | ((59+count) << 2);
+       }
+       memcpy(op, literal, len);
+       return op + len;
+}
+
+static inline char*
+EmitCopyLessThan64(char *op, int offset, int len)
+{
+       DCHECK_LE(len, 64);
+       DCHECK_GE(len, 4);
+       DCHECK_LT(offset, 65536);
+
+       if ((len < 12) && (offset < 2048)) {
+               int len_minus_4 = len - 4;
+               DCHECK_LT(len_minus_4, 8); /* Must fit in 3 bits */
+               *op++ = COPY_1_BYTE_OFFSET   |
+                       ((len_minus_4) << 2) |
+                       ((offset >> 8) << 5);
+               *op++ = offset & 0xff;
+       } else {
+               *op++ = COPY_2_BYTE_OFFSET | ((len-1) << 2);
+               put_unaligned_le16(offset, op);
+               op += 2;
+       }
+       return op;
+}
+
+static inline char*
+EmitCopy(char *op, int offset, int len)
+{
+       /* Emit 64 byte copies but make sure to keep at least four bytes
+        * reserved */
+       while (len >= 68) {
+               op = EmitCopyLessThan64(op, offset, 64);
+               len -= 64;
+       }
+
+       /* Emit an extra 60 byte copy if we have too much data to fit in one
+        * copy */
+       if (len > 64) {
+               op = EmitCopyLessThan64(op, offset, 60);
+               len -= 60;
+       }
+
+       /* Emit remainder */
+       op = EmitCopyLessThan64(op, offset, len);
+       return op;
+}
+
+
+/*
+ * For 0 <= offset <= 4, GetUint32AtOffset(UNALIGNED_LOAD64(p), offset) will
+ * equal UNALIGNED_LOAD32(p + offset).  Motivation: On x86-64 hardware we have
+ * empirically found that overlapping loads such as
+ *  UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
+ * are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32_t.
+ */
+static inline uint32_t
+GetUint32AtOffset(uint64_t v, int offset)
+{
+       DCHECK(0 <= offset && offset <= 4);
+#ifdef __LITTLE_ENDIAN
+       return v >> (8 * offset);
+#else
+       return v >> (32 - 8 * offset);
+#endif
+}
+
+#define kInputMarginBytes 15
+char*
+csnappy_compress_fragment(
+       const char *input,
+       const uint32_t input_size,
+       char *op,
+       void *working_memory,
+       const int workmem_bytes_power_of_two)
+{
+       const char *ip, *ip_end, *base_ip, *next_emit, *ip_limit, *next_ip,
+                       *candidate, *base;
+       uint16_t *table = (uint16_t *)working_memory;
+       uint64_t input_bytes;
+       uint32_t hash, next_hash, prev_hash, cur_hash, skip, candidate_bytes;
+       int shift, matched;
+
+       DCHECK_GE(workmem_bytes_power_of_two, 9);
+       DCHECK_LE(workmem_bytes_power_of_two, 15);
+       /* Table of 2^X bytes, need (X-1) bits to address table of uint16_t.
+        * How many bits of 32bit hash function result are discarded? */
+       shift = 33 - workmem_bytes_power_of_two;
+       /* "ip" is the input pointer, and "op" is the output pointer. */
+       ip = input;
+       DCHECK_LE(input_size, kBlockSize);
+       ip_end = input + input_size;
+       base_ip = ip;
+       /* Bytes in [next_emit, ip) will be emitted as literal bytes. Or
+          [next_emit, ip_end) after the main loop. */
+       next_emit = ip;
+
+       if (unlikely(input_size < kInputMarginBytes))
+               goto emit_remainder;
+
+       memset(working_memory, 0, 1 << workmem_bytes_power_of_two);
+
+       ip_limit = input + input_size - kInputMarginBytes;
+       next_hash = Hash(++ip, shift);
+
+main_loop:
+       DCHECK_LT(next_emit, ip);
+       /*
+       * The body of this loop calls EmitLiteral once and then EmitCopy one or
+       * more times. (The exception is that when we're close to exhausting
+       * the input we goto emit_remainder.)
+       *
+       * In the first iteration of this loop we're just starting, so
+       * there's nothing to copy, so calling EmitLiteral once is
+       * necessary. And we only start a new iteration when the
+       * current iteration has determined that a call to EmitLiteral will
+       * precede the next call to EmitCopy (if any).
+       *
+       * Step 1: Scan forward in the input looking for a 4-byte-long match.
+       * If we get close to exhausting the input then goto emit_remainder.
+       *
+       * Heuristic match skipping: If 32 bytes are scanned with no matches
+       * found, start looking only at every other byte. If 32 more bytes are
+       * scanned, look at every third byte, etc.. When a match is found,
+       * immediately go back to looking at every byte. This is a small loss
+       * (~5% performance, ~0.1% density) for compressible data due to more
+       * bookkeeping, but for non-compressible data (such as JPEG) it's a huge
+       * win since the compressor quickly "realizes" the data is incompressible
+       * and doesn't bother looking for matches everywhere.
+       *
+       * The "skip" variable keeps track of how many bytes there are since the
+       * last match; dividing it by 32 (ie. right-shifting by five) gives the
+       * number of bytes to move ahead for each iteration.
+       */
+       skip = 32;
+
+       next_ip = ip;
+       do {
+               ip = next_ip;
+               hash = next_hash;
+               DCHECK_EQ(hash, Hash(ip, shift));
+               next_ip = ip + (skip++ >> 5);
+               if (unlikely(next_ip > ip_limit))
+                       goto emit_remainder;
+               next_hash = Hash(next_ip, shift);
+               candidate = base_ip + table[hash];
+               DCHECK_GE(candidate, base_ip);
+               DCHECK_LT(candidate, ip);
+
+               table[hash] = ip - base_ip;
+       } while (likely(UNALIGNED_LOAD32(ip) !=
+                       UNALIGNED_LOAD32(candidate)));
+
+       /*
+       * Step 2: A 4-byte match has been found. We'll later see if more
+       * than 4 bytes match. But, prior to the match, input
+       * bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
+       */
+       DCHECK_LE(next_emit + 16, ip_end);
+       op = EmitLiteral(op, next_emit, ip - next_emit, 1);
+
+       /*
+       * Step 3: Call EmitCopy, and then see if another EmitCopy could
+       * be our next move. Repeat until we find no match for the
+       * input immediately after what was consumed by the last EmitCopy call.
+       *
+       * If we exit this loop normally then we need to call EmitLiteral next,
+       * though we don't yet know how big the literal will be. We handle that
+       * by proceeding to the next iteration of the main loop. We also can exit
+       * this loop via goto if we get close to exhausting the input.
+       */
+       input_bytes = 0;
+       candidate_bytes = 0;
+
+       do {
+               /* We have a 4-byte match at ip, and no need to emit any
+                "literal bytes" prior to ip. */
+               base = ip;
+               matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end);
+               ip += matched;
+               DCHECK_EQ(0, memcmp(base, candidate, matched));
+               op = EmitCopy(op, base - candidate, matched);
+               /* We could immediately start working at ip now, but to improve
+                compression we first update table[Hash(ip - 1, ...)]. */
+               next_emit = ip;
+               if (unlikely(ip >= ip_limit))
+                       goto emit_remainder;
+               input_bytes = UNALIGNED_LOAD64(ip - 1);
+               prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
+               table[prev_hash] = ip - base_ip - 1;
+               cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
+               candidate = base_ip + table[cur_hash];
+               candidate_bytes = UNALIGNED_LOAD32(candidate);
+               table[cur_hash] = ip - base_ip;
+       } while (GetUint32AtOffset(input_bytes, 1) == candidate_bytes);
+
+       next_hash = HashBytes(GetUint32AtOffset(input_bytes, 2), shift);
+       ++ip;
+       goto main_loop;
+
+emit_remainder:
+       /* Emit the remaining bytes as a literal */
+       if (next_emit < ip_end)
+               op = EmitLiteral(op, next_emit, ip_end - next_emit, 0);
+
+       return op;
+}
+#if defined(__KERNEL__) && !defined(STATIC)
+EXPORT_SYMBOL(csnappy_compress_fragment);
+#endif
+
+uint32_t __attribute__((const))
+csnappy_max_compressed_length(uint32_t source_len)
+{
+       return 32 + source_len + source_len/6;
+}
+#if defined(__KERNEL__) && !defined(STATIC)
+EXPORT_SYMBOL(csnappy_max_compressed_length);
+#endif
+
+void
+csnappy_compress(
+       const char *input,
+       uint32_t input_length,
+       char *compressed,
+       uint32_t *compressed_length,
+       void *working_memory,
+       const int workmem_bytes_power_of_two)
+{
+       int workmem_size;
+       int num_to_read;
+       uint32_t written = 0;
+       char *p = encode_varint32(compressed, input_length);
+       written += (p - compressed);
+       compressed = p;
+       while (input_length > 0) {
+               num_to_read = min(input_length, (uint32_t)kBlockSize);
+               workmem_size = workmem_bytes_power_of_two;
+               if (num_to_read < kBlockSize) {
+                       for (workmem_size = 9;
+                            workmem_size < workmem_bytes_power_of_two;
+                            ++workmem_size) {
+                               if ((1 << (workmem_size-1)) >= num_to_read)
+                                       break;
+                       }
+               }
+               p = csnappy_compress_fragment(
+                               input, num_to_read, compressed,
+                               working_memory, workmem_size);
+               written += (p - compressed);
+               compressed = p;
+               input_length -= num_to_read;
+               input += num_to_read;
+       }
+       *compressed_length = written;
+}
+#if defined(__KERNEL__) && !defined(STATIC)
+EXPORT_SYMBOL(csnappy_compress);
+
+MODULE_LICENSE("BSD");
+MODULE_DESCRIPTION("Snappy Compressor");
+#endif
diff --git a/drivers/staging/snappy/csnappy_decompress.c b/drivers/staging/snappy/csnappy_decompress.c
new file mode 100644
index 0000000..4d3a09b
--- /dev/null
+++ b/drivers/staging/snappy/csnappy_decompress.c
@@ -0,0 +1,321 @@
+/*
+Copyright 2011, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+  * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+File modified for the Linux Kernel by
+Zeev Tarantov <[email protected]>
+*/
+
+#include "csnappy_internal.h"
+#ifdef __KERNEL__
+#include <linux/kernel.h>
+#include <linux/module.h>
+#endif
+#include "csnappy.h"
+
+
+/* Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits */
+static const uint32_t wordmask[] = {
+       0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
+};
+
+/*
+ * Data stored per entry in lookup table:
+ *      Range   Bits-used       Description
+ *      ------------------------------------
+ *      1..64   0..7            Literal/copy length encoded in opcode byte
+ *      0..7    8..10           Copy offset encoded in opcode byte / 256
+ *      0..4    11..13          Extra bytes after opcode
+ *
+ * We use eight bits for the length even though 7 would have sufficed
+ * because of efficiency reasons:
+ *      (1) Extracting a byte is faster than a bit-field
+ *      (2) It properly aligns copy offset so we do not need a <<8
+ */
+static const uint16_t char_table[256] = {
+       0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
+       0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
+       0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
+       0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
+       0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
+       0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
+       0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
+       0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
+       0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
+       0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
+       0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
+       0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
+       0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
+       0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
+       0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
+       0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
+       0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
+       0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
+       0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
+       0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
+       0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
+       0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
+       0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
+       0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
+       0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
+       0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
+       0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
+       0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
+       0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
+       0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
+       0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
+       0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
+};
+
+/*
+ * Copy "len" bytes from "src" to "op", one byte at a time.  Used for
+ * handling COPY operations where the input and output regions may
+ * overlap.  For example, suppose:
+ *    src    == "ab"
+ *    op     == src + 2
+ *    len    == 20
+ * After IncrementalCopy(src, op, len), the result will have
+ * eleven copies of "ab"
+ *    ababababababababababab
+ * Note that this does not match the semantics of either memcpy()
+ * or memmove().
+ */
+static inline void IncrementalCopy(const char *src, char *op, int len)
+{
+       DCHECK_GT(len, 0);
+       do {
+               *op++ = *src++;
+       } while (--len > 0);
+}
+
+/*
+ * Equivalent to IncrementalCopy except that it can write up to ten extra
+ * bytes after the end of the copy, and that it is faster.
+ *
+ * The main part of this loop is a simple copy of eight bytes at a time until
+ * we've copied (at least) the requested amount of bytes.  However, if op and
+ * src are less than eight bytes apart (indicating a repeating pattern of
+ * length < 8), we first need to expand the pattern in order to get the correct
+ * results. For instance, if the buffer looks like this, with the eight-byte
+ * <src> and <op> patterns marked as intervals:
+ *
+ *    abxxxxxxxxxxxx
+ *    [------]           src
+ *      [------]         op
+ *
+ * a single eight-byte copy from <src> to <op> will repeat the pattern once,
+ * after which we can move <op> two bytes without moving <src>:
+ *
+ *    ababxxxxxxxxxx
+ *    [------]           src
+ *        [------]       op
+ *
+ * and repeat the exercise until the two no longer overlap.
+ *
+ * This allows us to do very well in the special case of one single byte
+ * repeated many times, without taking a big hit for more general cases.
+ *
+ * The worst case of extra writing past the end of the match occurs when
+ * op - src == 1 and len == 1; the last copy will read from byte positions
+ * [0..7] and write to [4..11], whereas it was only supposed to write to
+ * position 1. Thus, ten excess bytes.
+ */
+static const int kMaxIncrementCopyOverflow = 10;
+static inline void IncrementalCopyFastPath(const char *src, char *op, int len)
+{
+       while (op - src < 8) {
+               UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src));
+               len -= op - src;
+               op += op - src;
+       }
+       while (len > 0) {
+               UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src));
+               src += 8;
+               op += 8;
+               len -= 8;
+       }
+}
+
+
+/* A type that writes to a flat array. */
+struct SnappyArrayWriter {
+       char *base;
+       char *op;
+       char *op_limit;
+};
+
+static inline int
+SAW__Append(struct SnappyArrayWriter *this,
+           const char *ip, uint32_t len, int allow_fast_path)
+{
+       char *op = this->op;
+       const int space_left = this->op_limit - op;
+       /*Fast path, used for the majority (about 90%) of dynamic invocations.*/
+       if (allow_fast_path && len <= 16 && space_left >= 16) {
+               UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip));
+               UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8));
+       } else {
+               if (space_left < len)
+                       return CSNAPPY_E_OUTPUT_OVERRUN;
+               memcpy(op, ip, len);
+       }
+       this->op = op + len;
+       return CSNAPPY_E_OK;
+}
+
+static inline int
+SAW__AppendFromSelf(struct SnappyArrayWriter *this,
+                   uint32_t offset, uint32_t len)
+{
+       char *op = this->op;
+       const int space_left = this->op_limit - op;
+       /* -1u catches offset==0 */
+       if (op - this->base <= offset - 1u)
+               return CSNAPPY_E_DATA_MALFORMED;
+       /* Fast path, used for the majority (70-80%) of dynamic invocations. */
+       if (len <= 16 && offset >= 8 && space_left >= 16) {
+               UNALIGNED_STORE64(op, UNALIGNED_LOAD64(op - offset));
+               UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(op - offset + 8));
+       } else if (space_left >= len + kMaxIncrementCopyOverflow) {
+               IncrementalCopyFastPath(op - offset, op, len);
+       } else {
+               if (space_left < len)
+                       return CSNAPPY_E_OUTPUT_OVERRUN;
+               IncrementalCopy(op - offset, op, len);
+       }
+       this->op = op + len;
+       return CSNAPPY_E_OK;
+}
+
+
+int
+csnappy_get_uncompressed_length(
+       const char *src,
+       uint32_t src_len,
+       uint32_t *result)
+{
+       const char *src_base = src;
+       uint32_t shift = 0;
+       uint8_t c;
+       /* Length is encoded in 1..5 bytes */
+       *result = 0;
+       for (;;) {
+               if (shift >= 32)
+                       goto err_out;
+               if (src_len == 0)
+                       goto err_out;
+               c = *(const uint8_t *)src++;
+               src_len -= 1;
+               *result |= (uint32_t)(c & 0x7f) << shift;
+               if (c < 128)
+                       break;
+               shift += 7;
+       }
+       return src - src_base;
+err_out:
+       return CSNAPPY_E_HEADER_BAD;
+}
+#if defined(__KERNEL__) && !defined(STATIC)
+EXPORT_SYMBOL(csnappy_get_uncompressed_length);
+#endif
+
+int
+csnappy_decompress_noheader(
+       const char      *src,
+       uint32_t        src_remaining,
+       char            *dst,
+       uint32_t        *dst_len)
+{
+       struct SnappyArrayWriter writer;
+       uint32_t length, trailer, opword, extra_bytes;
+       int ret;
+       uint8_t opcode;
+       char scratch[5];
+       writer.op = writer.base = dst;
+       writer.op_limit = writer.op + *dst_len;
+       while (src_remaining) {
+               if (unlikely(src_remaining < 5)) {
+                       memcpy(scratch, src, src_remaining);
+                       src = scratch;
+               }
+               opcode = *(const uint8_t *)src++;
+               opword = char_table[opcode];
+               extra_bytes = opword >> 11;
+               trailer = get_unaligned_le32(src) & wordmask[extra_bytes];
+               src += extra_bytes;
+               src_remaining -= 1 + extra_bytes;
+               length = opword & 0xff;
+               if (opcode & 0x3) {
+                       trailer += opword & 0x700;
+                       ret = SAW__AppendFromSelf(&writer, trailer, length);
+                       if (ret < 0)
+                               return ret;
+               } else {
+                       length += trailer;
+                       if (unlikely(src_remaining < length))
+                               return CSNAPPY_E_DATA_MALFORMED;
+                       ret = src_remaining >= 16;
+                       ret = SAW__Append(&writer, src, length, ret);
+                       if (ret < 0)
+                               return ret;
+                       src += length;
+                       src_remaining -= length;
+               }
+       }
+       *dst_len = writer.op - writer.base;
+       return CSNAPPY_E_OK;
+}
+#if defined(__KERNEL__) && !defined(STATIC)
+EXPORT_SYMBOL(csnappy_decompress_noheader);
+#endif
+
+int
+csnappy_decompress(
+       const char *src,
+       uint32_t src_len,
+       char *dst,
+       uint32_t dst_len)
+{
+       int n;
+       uint32_t olen = 0;
+       /* Read uncompressed length from the front of the compressed input */
+       n = csnappy_get_uncompressed_length(src, src_len, &olen);
+       if (unlikely(n < CSNAPPY_E_OK))
+               return n;
+       /* Protect against possible DoS attack */
+       if (unlikely(olen > dst_len))
+               return CSNAPPY_E_OUTPUT_INSUF;
+       return csnappy_decompress_noheader(src + n, src_len - n, dst, &olen);
+}
+#if defined(__KERNEL__) && !defined(STATIC)
+EXPORT_SYMBOL(csnappy_decompress);
+
+MODULE_LICENSE("BSD");
+MODULE_DESCRIPTION("Snappy Decompressor");
+#endif
diff --git a/drivers/staging/snappy/csnappy_internal.h b/drivers/staging/snappy/csnappy_internal.h
new file mode 100644
index 0000000..6255a3f
--- /dev/null
+++ b/drivers/staging/snappy/csnappy_internal.h
@@ -0,0 +1,83 @@
+/*
+Copyright 2011 Google Inc. All Rights Reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Various stubs for the open-source version of Snappy.
+
+File modified for the Linux Kernel by
+Zeev Tarantov <[email protected]>
+*/
+
+#ifndef CSNAPPY_INTERNAL_H_
+#define CSNAPPY_INTERNAL_H_
+
+#ifndef __KERNEL__
+#include "csnappy_internal_userspace.h"
+#else
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/compiler.h>
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+
+#ifdef DEBUG
+#define DCHECK(cond)   if (!(cond)) \
+                       printk(KERN_DEBUG "assert failed @ %s:%i\n", \
+                               __FILE__, __LINE__)
+#else
+#define DCHECK(cond)
+#endif
+
+#define UNALIGNED_LOAD16(_p)           get_unaligned((const uint16_t *)(_p))
+#define UNALIGNED_LOAD32(_p)           get_unaligned((const uint32_t *)(_p))
+#define UNALIGNED_LOAD64(_p)           get_unaligned((const uint64_t *)(_p))
+#define UNALIGNED_STORE16(_p, _val)    put_unaligned((_val), (uint16_t *)(_p))
+#define UNALIGNED_STORE32(_p, _val)    put_unaligned((_val), (uint32_t *)(_p))
+#define UNALIGNED_STORE64(_p, _val)    put_unaligned((_val), (uint64_t *)(_p))
+
+#define FindLSBSetNonZero(n)           __builtin_ctz(n)
+#define FindLSBSetNonZero64(n)         __builtin_ctzll(n)
+
+#endif /* __KERNEL__ */
+
+#define DCHECK_EQ(a, b)        DCHECK(((a) == (b)))
+#define DCHECK_NE(a, b)        DCHECK(((a) != (b)))
+#define DCHECK_GT(a, b)        DCHECK(((a) >  (b)))
+#define DCHECK_GE(a, b)        DCHECK(((a) >= (b)))
+#define DCHECK_LT(a, b)        DCHECK(((a) <  (b)))
+#define DCHECK_LE(a, b)        DCHECK(((a) <= (b)))
+
+enum {
+       LITERAL = 0,
+       COPY_1_BYTE_OFFSET = 1,  /* 3 bit length + 3 bits of offset in opcode */
+       COPY_2_BYTE_OFFSET = 2,
+       COPY_4_BYTE_OFFSET = 3
+};
+
+#endif  /* CSNAPPY_INTERNAL_H_ */