Fix performance regressions compared to current kernel LZ4

Signed-off-by: Sven Schmidt <4ssch...@informatik.uni-hamburg.de>
---
 include/linux/lz4.h      |   2 +-
 lib/lz4/lz4_compress.c   | 157 +++++++++++++++++++++++-------------
 lib/lz4/lz4_decompress.c |  50 ++++++++----
 lib/lz4/lz4defs.h        | 203 ++++++++++++++++++++++++++++++++---------------
 lib/lz4/lz4hc_compress.c |   8 +-
 5 files changed, 281 insertions(+), 139 deletions(-)

diff --git a/include/linux/lz4.h b/include/linux/lz4.h
index a3912d7..394e3d9 100644
--- a/include/linux/lz4.h
+++ b/include/linux/lz4.h
@@ -82,7 +82,7 @@
 /*-************************************************************************
  *     STREAMING CONSTANTS AND STRUCTURES
  **************************************************************************/
-#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4)
+#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE - 3)) + 4)
 #define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(unsigned long long))

 #define LZ4_STREAMHCSIZE        262192
diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
index 697dbda..2cbbf99 100644
--- a/lib/lz4/lz4_compress.c
+++ b/lib/lz4/lz4_compress.c
@@ -39,27 +39,33 @@
 #include <linux/kernel.h>
 #include <asm/unaligned.h>

+static const int LZ4_minLength = (MFLIMIT + 1);
+static const int LZ4_64Klimit = ((64 * KB) + (MFLIMIT - 1));
+
 /*-******************************
  *     Compression functions
  ********************************/
-static U32 LZ4_hash4(U32 sequence, tableType_t const tableType)
+static FORCE_INLINE U32 LZ4_hash4(
+       U32 sequence,
+       tableType_t const tableType)
 {
        if (tableType == byU16)
                return ((sequence * 2654435761U)
-                       >> ((MINMATCH*8) - (LZ4_HASHLOG + 1)));
+                       >> ((MINMATCH * 8) - (LZ4_HASHLOG + 1)));
        else
                return ((sequence * 2654435761U)
-                       >> ((MINMATCH*8) - LZ4_HASHLOG));
+                       >> ((MINMATCH * 8) - LZ4_HASHLOG));
 }

-#if LZ4_ARCH64
-static U32 LZ4_hash5(U64 sequence, tableType_t const tableType)
+static FORCE_INLINE __maybe_unused U32 LZ4_hash5(
+       U64 sequence,
+       tableType_t const tableType)
 {
        const U32 hashLog = (tableType == byU16)
                ? LZ4_HASHLOG + 1
                : LZ4_HASHLOG;

-#ifdef __LITTLE_ENDIAN__
+#if LZ4_LITTLE_ENDIAN
        static const U64 prime5bytes = 889523592379ULL;

        return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog));
@@ -69,9 +75,10 @@ static U32 LZ4_hash5(U64 sequence, tableType_t const tableType)
        return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog));
 #endif
 }
-#endif

-static U32 LZ4_hashPosition(const void *p, tableType_t tableType)
+static FORCE_INLINE U32 LZ4_hashPosition(
+       const void *p,
+       tableType_t const tableType)
 {
 #if LZ4_ARCH64
        if (tableType == byU32)
@@ -81,8 +88,12 @@ static U32 LZ4_hashPosition(const void *p, tableType_t tableType)
        return LZ4_hash4(LZ4_read32(p), tableType);
 }

-static void LZ4_putPositionOnHash(const BYTE *p, U32 h, void *tableBase,
-       tableType_t const tableType, const BYTE *srcBase)
+static void LZ4_putPositionOnHash(
+       const BYTE *p,
+       U32 h,
+       void *tableBase,
+       tableType_t const tableType,
+       const BYTE *srcBase)
 {
        switch (tableType) {
        case byPtr:
@@ -109,16 +120,22 @@ static void LZ4_putPositionOnHash(const BYTE *p, U32 h, void *tableBase,
        }
 }

-static inline void LZ4_putPosition(const BYTE *p, void *tableBase,
-       tableType_t tableType, const BYTE *srcBase)
+static FORCE_INLINE void LZ4_putPosition(
+       const BYTE *p,
+       void *tableBase,
+       tableType_t tableType,
+       const BYTE *srcBase)
 {
        U32 const h = LZ4_hashPosition(p, tableType);

        LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase);
 }

-static const BYTE *LZ4_getPositionOnHash(U32 h, void *tableBase,
-       tableType_t tableType, const BYTE *srcBase)
+static const BYTE *LZ4_getPositionOnHash(
+       U32 h,
+       void *tableBase,
+       tableType_t tableType,
+       const BYTE *srcBase)
 {
        if (tableType == byPtr) {
                const BYTE **hashTable = (const BYTE **) tableBase;
@@ -135,12 +152,16 @@ static const BYTE *LZ4_getPositionOnHash(U32 h, void *tableBase,
        {
                /* default, to ensure a return */
                const U16 * const hashTable = (U16 *) tableBase;
+
                return hashTable[h] + srcBase;
        }
 }

-static inline const BYTE *LZ4_getPosition(const BYTE *p, void *tableBase,
-       tableType_t tableType, const BYTE *srcBase)
+static FORCE_INLINE const BYTE *LZ4_getPosition(
+       const BYTE *p,
+       void *tableBase,
+       tableType_t tableType,
+       const BYTE *srcBase)
 {
        U32 const h = LZ4_hashPosition(p, tableType);

@@ -152,7 +173,7 @@ static inline const BYTE *LZ4_getPosition(const BYTE *p, void *tableBase,
  * LZ4_compress_generic() :
  * inlined, to ensure branches are decided at compilation time
  */
-static inline int LZ4_compress_generic(
+static FORCE_INLINE int LZ4_compress_generic(
        LZ4_stream_t_internal * const dictPtr,
        const char * const source,
        char * const dest,
@@ -187,6 +208,7 @@ static inline int LZ4_compress_generic(
                /* Unsupported inputSize, too large (or negative) */
                return 0;
        }
+
        switch (dict) {
        case noDict:
        default:
@@ -216,7 +238,8 @@ static inline int LZ4_compress_generic(

        /* First Byte */
        LZ4_putPosition(ip, dictPtr->hashTable, tableType, base);
-       ip++; forwardH = LZ4_hashPosition(ip, tableType);
+       ip++;
+       forwardH = LZ4_hashPosition(ip, tableType);

        /* Main Loop */
        for ( ; ; ) {
@@ -227,15 +250,14 @@ static inline int LZ4_compress_generic(
                {
                        const BYTE *forwardIp = ip;
                        unsigned int step = 1;
-                       unsigned int searchMatchNb = acceleration
-                               << LZ4_skipTrigger;
+                       unsigned int searchMatchNb = acceleration << LZ4_SKIPTRIGGER;

                        do {
                                U32 const h = forwardH;

                                ip = forwardIp;
                                forwardIp += step;
-                               step = (searchMatchNb++ >> LZ4_skipTrigger);
+                               step = (searchMatchNb++ >> LZ4_SKIPTRIGGER);

                                if (unlikely(forwardIp > mflimit))
                                        goto _last_literals;
@@ -243,6 +265,7 @@ static inline int LZ4_compress_generic(
                                match = LZ4_getPositionOnHash(h,
                                        dictPtr->hashTable,
                                        tableType, base);
+
                                if (dict == usingExtDict) {
                                        if (match < (const BYTE *)source) {
                                                refDelta = dictDelta;
@@ -251,11 +274,12 @@ static inline int LZ4_compress_generic(
                                                refDelta = 0;
                                                lowLimit = (const BYTE *)source;
                                }        }
+
                                forwardH = LZ4_hashPosition(forwardIp,
                                        tableType);
+
                                LZ4_putPositionOnHash(ip, h, dictPtr->hashTable,
                                        tableType, base);
-
                        } while (((dictIssue == dictSmall)
                                        ? (match < lowRefLimit)
                                        : 0)
@@ -268,31 +292,34 @@ static inline int LZ4_compress_generic(

                /* Catch up */
                while (((ip > anchor) & (match + refDelta > lowLimit))
-                       && (unlikely(ip[-1] == match[refDelta - 1]))) {
+                               && (unlikely(ip[-1] == match[refDelta - 1]))) {
                        ip--;
                        match--;
-                       }
+               }

                /* Encode Literals */
                {
                        unsigned const int litLength = (unsigned int)(ip - anchor);

                        token = op++;
+
                        if ((outputLimited) &&
                                /* Check output buffer overflow */
                                (unlikely(op + litLength +
                                        (2 + 1 + LASTLITERALS) +
-                                       (litLength/255) > olimit)))
+                                       (litLength / 255) > olimit)))
                                return 0;
+
                        if (litLength >= RUN_MASK) {
                                int len = (int)litLength - RUN_MASK;

-                               *token = (RUN_MASK<<ML_BITS);
-                               for (; len >= 255 ; len -= 255)
+                               *token = (RUN_MASK << ML_BITS);
+
+                               for (; len >= 255; len -= 255)
                                        *op++ = 255;
                                *op++ = (BYTE)len;
                        } else
-                               *token = (BYTE)(litLength<<ML_BITS);
+                               *token = (BYTE)(litLength << ML_BITS);

                        /* Copy Literals */
                        LZ4_wildCopy(op, anchor, op + litLength);
@@ -301,7 +328,8 @@ static inline int LZ4_compress_generic(

 _next_match:
                /* Encode Offset */
-               LZ4_writeLE16(op, (U16)(ip - match)); op += 2;
+               LZ4_writeLE16(op, (U16)(ip - match));
+               op += 2;

                /* Encode MatchLength */
                {
@@ -313,11 +341,15 @@ static inline int LZ4_compress_generic(

                                match += refDelta;
                                limit = ip + (dictEnd - match);
+
                                if (limit > matchlimit)
                                        limit = matchlimit;
+
                                matchCode = LZ4_count(ip + MINMATCH,
                                        match + MINMATCH, limit);
+
                                ip += MINMATCH + matchCode;
+
                                if (ip == limit) {
                                        unsigned const int more = LZ4_count(ip,
                                                (const BYTE *)source,
@@ -336,17 +368,20 @@ static inline int LZ4_compress_generic(
                                /* Check output buffer overflow */
                                (unlikely(op +
                                        (1 + LASTLITERALS) +
-                                       (matchCode>>8) > olimit)))
+                                       (matchCode >> 8) > olimit)))
                                return 0;
+
                        if (matchCode >= ML_MASK) {
                                *token += ML_MASK;
                                matchCode -= ML_MASK;
                                LZ4_write32(op, 0xFFFFFFFF);
-                               while (matchCode >= 4*255) {
+
+                               while (matchCode >= 4 * 255) {
                                        op += 4;
                                        LZ4_write32(op, 0xFFFFFFFF);
-                                       matchCode -= 4*255;
+                                       matchCode -= 4 * 255;
                                }
+
                                op += matchCode / 255;
                                *op++ = (BYTE)(matchCode % 255);
                        } else
@@ -365,6 +400,7 @@ static inline int LZ4_compress_generic(
                /* Test next position */
                match = LZ4_getPosition(ip, dictPtr->hashTable,
                        tableType, base);
+
                if (dict == usingExtDict) {
                        if (match < (const BYTE *)source) {
                                refDelta = dictDelta;
@@ -374,7 +410,9 @@ static inline int LZ4_compress_generic(
                                lowLimit = (const BYTE *)source;
                        }
                }
+
                LZ4_putPosition(ip, dictPtr->hashTable, tableType, base);
+
                if (((dictIssue == dictSmall) ? (match >= lowRefLimit) : 1)
                        && (match + MAX_DISTANCE >= ip)
                        && (LZ4_read32(match + refDelta) == LZ4_read32(ip))) {
@@ -395,18 +433,21 @@ static inline int LZ4_compress_generic(
                if ((outputLimited) &&
                        /* Check output buffer overflow */
                        ((op - (BYTE *)dest) + lastRun + 1 +
-                       ((lastRun + 255 - RUN_MASK)/255) > (U32)maxOutputSize))
+                       ((lastRun + 255 - RUN_MASK) / 255) > (U32)maxOutputSize))
                        return 0;
+
                if (lastRun >= RUN_MASK) {
                        size_t accumulator = lastRun - RUN_MASK;
                        *op++ = RUN_MASK << ML_BITS;
-                       for (; accumulator >= 255 ; accumulator -= 255)
+                       for (; accumulator >= 255; accumulator -= 255)
                                *op++ = 255;
                        *op++ = (BYTE) accumulator;
                } else {
-                       *op++ = (BYTE)(lastRun<<ML_BITS);
+                       *op++ = (BYTE)(lastRun << ML_BITS);
                }
+
                memcpy(op, anchor, lastRun);
+
                op += lastRun;
        }

@@ -414,23 +455,27 @@ static inline int LZ4_compress_generic(
        return (int) (((char *)op) - dest);
 }

-static int LZ4_compress_fast_extState(void *state, const char *source, char *dest,
-       int inputSize, int maxOutputSize, int acceleration)
+static int LZ4_compress_fast_extState(
+       void *state,
+       const char *source,
+       char *dest,
+       int inputSize,
+       int maxOutputSize,
+       int acceleration)
 {
-       #if LZ4_ARCH64
-       tableType_t tableType = byU32;
-       #else
-       tableType_t tableType = byPtr;
-       #endif
-
        LZ4_stream_t_internal *ctx = &((LZ4_stream_t *)state)->internal_donotuse;
+#if LZ4_ARCH64
+       const tableType_t tableType = byU32;
+#else
+       const tableType_t tableType = byPtr;
+#endif

        LZ4_resetStream((LZ4_stream_t *)state);

        if (acceleration < 1)
                acceleration = LZ4_ACCELERATION_DEFAULT;

-       if (maxOutputSize >= LZ4_compressBound(inputSize)) {
+       if (maxOutputSize >= LZ4_COMPRESSBOUND(inputSize)) {
                if (inputSize < LZ4_64Klimit)
                        return LZ4_compress_generic(ctx, source,
                                dest, inputSize, 0,
@@ -474,7 +519,6 @@ EXPORT_SYMBOL(LZ4_compress_default);
 /*-******************************
  *     *_destSize() variant
  ********************************/
-
 static int LZ4_compress_destSize_generic(
        LZ4_stream_t_internal * const ctx,
        const char * const src,
@@ -529,14 +573,14 @@ static int LZ4_compress_destSize_generic(
                {
                        const BYTE *forwardIp = ip;
                        unsigned int step = 1;
-                       unsigned int searchMatchNb = 1 << LZ4_skipTrigger;
+                       unsigned int searchMatchNb = 1 << LZ4_SKIPTRIGGER;

                        do {
                                U32 h = forwardH;

                                ip = forwardIp;
                                forwardIp += step;
-                               step = (searchMatchNb++ >> LZ4_skipTrigger);
+                               step = (searchMatchNb++ >> LZ4_SKIPTRIGGER);

                                if (unlikely(forwardIp > mflimit))
                                        goto _last_literals;
@@ -559,8 +603,9 @@ static int LZ4_compress_destSize_generic(
                while ((ip > anchor)
                        && (match > lowLimit)
                        && (unlikely(ip[-1] == match[-1]))) {
-                       ip--; match--;
-                       }
+                       ip--;
+                       match--;
+               }

                /* Encode Literal length */
                {
@@ -644,11 +689,11 @@ static int LZ4_compress_destSize_generic(
                size_t lastRunSize = (size_t)(iend - anchor);

                if (op + 1 /* token */
-                       + ((lastRunSize + 240)/255) /* litLength */
+                       + ((lastRunSize + 240) / 255) /* litLength */
                        + lastRunSize /* literals */ > oend) {
                        /* adapt lastRunSize to fill 'dst' */
                        lastRunSize     = (oend - op) - 1;
-                       lastRunSize -= (lastRunSize + 240)/255;
+                       lastRunSize -= (lastRunSize + 240) / 255;
                }
                ip = anchor + lastRunSize;

@@ -656,7 +701,7 @@ static int LZ4_compress_destSize_generic(
                        size_t accumulator = lastRunSize - RUN_MASK;

                        *op++ = RUN_MASK << ML_BITS;
-                       for (; accumulator >= 255 ; accumulator -= 255)
+                       for (; accumulator >= 255; accumulator -= 255)
                                *op++ = 255;
                        *op++ = (BYTE) accumulator;
                } else {
@@ -675,14 +720,14 @@ static int LZ4_compress_destSize_extState(LZ4_stream_t *state, const char *src,
        char *dst, int *srcSizePtr, int targetDstSize)
 {
        #if LZ4_ARCH64
-       tableType_t tableType = byU32;
+               const tableType_t tableType = byU32;
        #else
-       tableType_t tableType = byPtr;
+               const tableType_t tableType = byPtr;
        #endif

        LZ4_resetStream(state);

-       if (targetDstSize >= LZ4_compressBound(*srcSizePtr)) {
+       if (targetDstSize >= LZ4_COMPRESSBOUND(*srcSizePtr)) {
                /* compression success is guaranteed */
                return LZ4_compress_fast_extState(
                        state, src, dst, *srcSizePtr,
@@ -847,7 +892,7 @@ int LZ4_compress_fast_continue(LZ4_stream_t *LZ4_stream, const char *source,
                        result = LZ4_compress_generic(
                                streamPtr, source, dest, inputSize,
                                maxOutputSize, limitedOutput, byU32,
-                               withPrefix64k, dictSmall,       acceleration);
+                               withPrefix64k, dictSmall, acceleration);
                } else {
                        result = LZ4_compress_generic(
                                streamPtr, source, dest, inputSize,
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index a7731ba..3bfc2f6 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -49,8 +49,8 @@
  * Note that it is important this generic function is really inlined,
  * in order to remove useless branches during compilation optimization.
  */
-static inline int LZ4_decompress_generic(
-        const char *const source,
+static FORCE_INLINE int LZ4_decompress_generic(
+        const char * const source,
         char * const dest,
         int inputSize,
                /*
@@ -180,22 +180,28 @@ static inline int LZ4_decompress_generic(
                                        goto _output_error;
                                }
                        }
+
                        memcpy(op, ip, length);
                        ip += length;
                        op += length;
                        /* Necessarily EOF, due to parsing restrictions */
                        break;
                }
+
                LZ4_wildCopy(op, ip, cpy);
-               ip += length; op = cpy;
+               ip += length;
+               op = cpy;

                /* get offset */
-               offset = LZ4_readLE16(ip); ip += 2;
+               offset = LZ4_readLE16(ip);
+               ip += 2;
                match = op - offset;
+
                if ((checkOffset) && (unlikely(match < lowLimit))) {
                        /* Error : offset outside buffers */
                        goto _output_error;
                }
+
                /* costs ~1%; silence an msan warning when offset == 0 */
                LZ4_write32(op, (U32)offset);

@@ -205,11 +211,14 @@ static inline int LZ4_decompress_generic(
                        unsigned int s;

                        do {
-                       s = *ip++;
-                       if ((endOnInput) && (ip > iend - LASTLITERALS))
-                               goto _output_error;
-                       length += s;
+                               s = *ip++;
+
+                               if ((endOnInput) && (ip > iend - LASTLITERALS))
+                                       goto _output_error;
+
+                               length += s;
                        } while (s == 255);
+
                        if ((safeDecode)
                                && unlikely(
                                        (size_t)(op + length) < (size_t)op)) {
@@ -217,6 +226,7 @@ static inline int LZ4_decompress_generic(
                                goto _output_error;
                        }
                }
+
                length += MINMATCH;

                /* check external dictionary */
@@ -227,12 +237,13 @@ static inline int LZ4_decompress_generic(
                        }

                        if (length <= (size_t)(lowPrefix - match)) {
-                       /*
-                        * match can be copied as a single segment
-                        * from external dictionary
-                        */
-                       memmove(op, dictEnd - (lowPrefix - match), length);
-                       op += length;
+                               /*
+                                * match can be copied as a single segment
+                                * from external dictionary
+                                */
+                               memmove(op, dictEnd - (lowPrefix - match),
+                                       length);
+                               op += length;
                        } else {
                                /*
                                 * match encompass external
@@ -256,11 +267,13 @@ static inline int LZ4_decompress_generic(
                                        op += restSize;
                                }
                        }
+
                        continue;
                }

                /* copy match within block */
                cpy = op + length;
+
                if (unlikely(offset < 8)) {
                        const int dec64 = dec64table[offset];

@@ -272,7 +285,8 @@ static inline int LZ4_decompress_generic(
                        memcpy(op + 4, match, 4);
                        match -= dec64;
                } else {
-                       LZ4_copy8(op, match); match += 8;
+                       LZ4_copy8(op, match);
+                       match += 8;
                }

                op += 8;
@@ -287,18 +301,22 @@ static inline int LZ4_decompress_generic(
                                 */
                                goto _output_error;
                        }
+
                        if (op < oCopyLimit) {
                                LZ4_wildCopy(op, match, oCopyLimit);
                                match += oCopyLimit - op;
                                op = oCopyLimit;
                        }
+
                        while (op < cpy)
                                *op++ = *match++;
                } else {
                        LZ4_copy8(op, match);
+
                        if (length > 16)
                                LZ4_wildCopy(op + 8, match + 8, cpy);
                }
+
                op = cpy; /* correction */
        }

@@ -438,7 +456,7 @@ int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode,
  * These decoding functions work the same as "_continue" ones,
  * the dictionary must be explicitly provided within parameters
  */
-static inline int LZ4_decompress_usingDict_generic(const char *source,
+static FORCE_INLINE int LZ4_decompress_usingDict_generic(const char *source,
        char *dest, int compressedSize, int maxOutputSize, int safe,
        const char *dictStart, int dictSize)
 {
diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
index 23e1a1b..47ef42b 100644
--- a/lib/lz4/lz4defs.h
+++ b/lib/lz4/lz4defs.h
@@ -38,14 +38,7 @@
 #include <asm/unaligned.h>
 #include <linux/string.h>       /* memset, memcpy */

-/*
- * Detects 64 bits mode
-*/
-#if defined(CONFIG_64BIT)
-#define LZ4_ARCH64 1
-#else
-#define LZ4_ARCH64 0
-#endif
+#define FORCE_INLINE __always_inline

 /*-************************************
  *     Basic Types
@@ -60,14 +53,38 @@ typedef uint64_t U64;
 typedef uintptr_t uptrval;

 /*-************************************
+ *     Architecture specifics
+ **************************************/
+#if defined(CONFIG_64BIT)
+#define LZ4_ARCH64 1
+#else
+#define LZ4_ARCH64 0
+#endif
+
+#if defined(__LITTLE_ENDIAN)
+#define LZ4_LITTLE_ENDIAN 1
+#else
+#define LZ4_LITTLE_ENDIAN 0
+#endif
+
+/*
+ * LZ4_FORCE_SW_BITCOUNT
+ * Define this parameter if your target system
+ * does not support hardware bit count
+ */
+/* #define LZ4_FORCE_SW_BITCOUNT */
+
+/*-************************************
  *     Constants
  **************************************/
 #define MINMATCH 4

 #define WILDCOPYLENGTH 8
 #define LASTLITERALS 5
-#define MFLIMIT (WILDCOPYLENGTH+MINMATCH)
-static const int LZ4_minLength = (MFLIMIT+1);
+#define MFLIMIT (WILDCOPYLENGTH + MINMATCH)
+
+/* Increase this value ==> compression run slower on incompressible data */
+#define LZ4_SKIPTRIGGER 6

 #define KB (1<<10)
 #define MB (1<<20)
@@ -82,53 +99,42 @@ static const int LZ4_minLength = (MFLIMIT+1);
 #define RUN_BITS (8-ML_BITS)
 #define RUN_MASK ((1U<<RUN_BITS)-1)

-static const int LZ4_64Klimit = ((64 * KB) + (MFLIMIT-1));
-static const U32 LZ4_skipTrigger = 6;
-
 /*-************************************
  *     Reading and writing into memory
  **************************************/
+typedef union {
+       U16 u16;
+       U32 u32;
+       size_t uArch;
+} __packed unalign;

-static inline U16 LZ4_read16(const void *memPtr)
+static FORCE_INLINE __maybe_unused U16 LZ4_read16(const void *ptr)
 {
-       U16 val;
-
-       memcpy(&val, memPtr, sizeof(val));
-
-       return val;
+       return ((const unalign *)ptr)->u16;
 }

-static inline U32 LZ4_read32(const void *memPtr)
+static FORCE_INLINE __maybe_unused U32 LZ4_read32(const void *ptr)
 {
-       U32 val;
-
-       memcpy(&val, memPtr, sizeof(val));
-
-       return val;
+       return ((const unalign *)ptr)->u32;
 }

-static inline size_t LZ4_read_ARCH(const void *memPtr)
+static FORCE_INLINE __maybe_unused size_t LZ4_read_ARCH(const void *ptr)
 {
-       size_t val;
-
-       memcpy(&val, memPtr, sizeof(val));
-
-       return val;
+       return ((const unalign *)ptr)->uArch;
 }

-static inline void LZ4_write16(void *memPtr, U16 value)
+static FORCE_INLINE __maybe_unused void LZ4_write16(void *memPtr, U16 value)
 {
-       memcpy(memPtr, &value, sizeof(value));
+       ((unalign *)memPtr)->u16 = value;
 }

-static inline void LZ4_write32(void *memPtr, U32 value)
-{
-       memcpy(memPtr, &value, sizeof(value));
+static FORCE_INLINE __maybe_unused void LZ4_write32(void *memPtr, U32 value) {
+       ((unalign *)memPtr)->u32 = value;
 }

-static inline U16 LZ4_readLE16(const void *memPtr)
+static FORCE_INLINE __maybe_unused U16 LZ4_readLE16(const void *memPtr)
 {
-#ifdef __LITTLE_ENDIAN__
+#if LZ4_LITTLE_ENDIAN
        return LZ4_read16(memPtr);
 #else
        const BYTE *p = (const BYTE *)memPtr;
@@ -137,19 +143,19 @@ static inline U16 LZ4_readLE16(const void *memPtr)
 #endif
 }

-static inline void LZ4_writeLE16(void *memPtr, U16 value)
+static FORCE_INLINE __maybe_unused void LZ4_writeLE16(void *memPtr, U16 value)
 {
-#ifdef __LITTLE_ENDIAN__
+#if LZ4_LITTLE_ENDIAN
        LZ4_write16(memPtr, value);
 #else
        BYTE *p = (BYTE *)memPtr;

        p[0] = (BYTE) value;
-       p[1] = (BYTE)(value>>8);
+       p[1] = (BYTE)(value >> 8);
 #endif
 }

-static inline void LZ4_copy8(void *dst, const void *src)
+static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
 {
        memcpy(dst, src, 8);
 }
@@ -158,7 +164,8 @@ static inline void LZ4_copy8(void *dst, const void *src)
  * customized variant of memcpy,
  * which can overwrite up to 7 bytes beyond dstEnd
  */
-static inline void LZ4_wildCopy(void *dstPtr, const void *srcPtr, void *dstEnd)
+static FORCE_INLINE void LZ4_wildCopy(void *dstPtr,
+       const void *srcPtr, void *dstEnd)
 {
        BYTE *d = (BYTE *)dstPtr;
        const BYTE *s = (const BYTE *)srcPtr;
@@ -171,49 +178,121 @@ static inline void LZ4_wildCopy(void *dstPtr, const void *srcPtr, void *dstEnd)
        } while (d < e);
 }

-#if LZ4_ARCH64
-#ifdef __BIG_ENDIAN__
-#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3)
+static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val)
+{
+#if LZ4_LITTLE_ENDIAN
+#if LZ4_ARCH64 /* 64 Bits Little Endian */
+#if defined(LZ4_FORCE_SW_BITCOUNT)
+       static const int DeBruijnBytePos[64] = {
+               0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7,
+               0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7,
+               7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6,
+               7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7
+       };
+
+       return DeBruijnBytePos[((U64)((val & -(long long)val)
+               * 0x0218A392CDABBD3FULL)) >> 58];
 #else
-#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3)
-#endif
+       return (__builtin_ctzll((U64)val) >> 3);
+#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
+#else /* 32 Bits Little Endian */
+#if defined(LZ4_FORCE_SW_BITCOUNT)
+       static const int DeBruijnBytePos[32] = {
+               0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1,
+               3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1
+       };
+
+       return DeBruijnBytePos[((U32)((val & -(S32)val)
+               * 0x077CB531U)) >> 27];
 #else
-#ifdef __BIG_ENDIAN__
-#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3)
+       return (__builtin_ctz((U32)val) >> 3);
+#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
+#endif /* LZ4_ARCH64 */
+#else /* Big Endian */
+#if LZ4_ARCH64 /* 64 Bits Big Endian */
+#if defined(LZ4_FORCE_SW_BITCOUNT)
+       unsigned int r;
+
+       if (!(val >> 32)) {
+               r = 4;
+       } else {
+               r = 0;
+               val >>= 32;
+       }
+
+       if (!(val >> 16)) {
+               r += 2;
+               val >>= 8;
+       } else {
+               val >>= 24;
+       }
+
+       r += (!val);
+
+       return r;
 #else
-#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3)
-#endif
-#endif
+       return (__builtin_clzll((U64)val) >> 3);
+#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
+#else /* 32 Bits Big Endian */
+#if defined(LZ4_FORCE_SW_BITCOUNT)
+       unsigned int r;
+
+       if (!(val >> 16)) {
+               r = 2;
+               val >>= 8;
+       } else {
+               r = 0;
+               val >>= 24;
+       }
+
+       r += (!val);
+
+       return r;
+#else
+       return (__builtin_clz((U32)val) >> 3);
+#endif /* defined(LZ4_FORCE_SW_BITCOUNT) */
+#endif /* LZ4_ARCH64 */
+#endif /* LZ4_LITTLE_ENDIAN */
+}

-static inline unsigned int LZ4_count(const BYTE *pIn, const BYTE *pMatch,
+static FORCE_INLINE __maybe_unused unsigned int LZ4_count(
+       const BYTE *pIn,
+       const BYTE *pMatch,
        const BYTE *pInLimit)
 {
        const BYTE *const pStart = pIn;

-       while (likely(pIn < pInLimit-(STEPSIZE-1))) {
-               size_t diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);
+       while (likely(pIn < pInLimit - (STEPSIZE - 1))) {
+               size_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);

                if (!diff) {
                        pIn += STEPSIZE;
                        pMatch += STEPSIZE;
                        continue;
                }
-               pIn += LZ4_NBCOMMONBYTES(diff);
+
+               pIn += LZ4_NbCommonBytes(diff);
+
                return (unsigned int)(pIn - pStart);
        }

-#ifdef LZ4_ARCH64
-       if ((pIn < (pInLimit-3))
+#if LZ4_ARCH64
+       if ((pIn < (pInLimit - 3))
                && (LZ4_read32(pMatch) == LZ4_read32(pIn))) {
-               pIn += 4; pMatch += 4;
+               pIn += 4;
+               pMatch += 4;
        }
 #endif
-       if ((pIn < (pInLimit-1))
+
+       if ((pIn < (pInLimit - 1))
                && (LZ4_read16(pMatch) == LZ4_read16(pIn))) {
-               pIn += 2; pMatch += 2;
+               pIn += 2;
+               pMatch += 2;
        }
+
        if ((pIn < pInLimit) && (*pMatch == *pIn))
                pIn++;
+
        return (unsigned int)(pIn - pStart);
 }

diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c
index 8363292..c7271a1 100644
--- a/lib/lz4/lz4hc_compress.c
+++ b/lib/lz4/lz4hc_compress.c
@@ -71,7 +71,7 @@ static void LZ4HC_init(LZ4HC_CCtx_internal *hc4, const BYTE *start)
 }

 /* Update chains up to ip (excluded) */
-static inline void LZ4HC_Insert(LZ4HC_CCtx_internal *hc4,
+static FORCE_INLINE void LZ4HC_Insert(LZ4HC_CCtx_internal *hc4,
        const BYTE *ip)
 {
        U16 * const chainTable = hc4->chainTable;
@@ -96,7 +96,7 @@ static inline void LZ4HC_Insert(LZ4HC_CCtx_internal *hc4,
        hc4->nextToUpdate = target;
 }

-static inline int LZ4HC_InsertAndFindBestMatch(
+static FORCE_INLINE int LZ4HC_InsertAndFindBestMatch(
        LZ4HC_CCtx_internal *hc4, /* Index table will be updated */
        const BYTE *ip,
        const BYTE * const iLimit,
@@ -165,7 +165,7 @@ static inline int LZ4HC_InsertAndFindBestMatch(
        return (int)ml;
 }

-static inline int LZ4HC_InsertAndGetWiderMatch(
+static FORCE_INLINE int LZ4HC_InsertAndGetWiderMatch(
        LZ4HC_CCtx_internal *hc4,
        const BYTE * const ip,
        const BYTE * const iLowLimit,
@@ -259,7 +259,7 @@ static inline int LZ4HC_InsertAndGetWiderMatch(
        return longest;
 }

-static inline int LZ4HC_encodeSequence(
+static FORCE_INLINE int LZ4HC_encodeSequence(
        const BYTE **ip,
        BYTE **op,
        const BYTE **anchor,
--
2.1.4

Reply via email to