Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64

Gao Xiang Thu, 16 May 2019 23:36:22 -0700

Hi Chenxi,

A few words about the patch format -- not critical, though.


One suggestion: the subject line would be better written as
"[PATCH v2/v3/...] title", since that makes it clear which patch
is the latest one among these emails.

On 2019/5/17 13:56, Chenxi Mao wrote:
> FAST_DEC_LOOP was introduced in LZ4 1.9.0[1].
> This change yields a 10% improvement in decompression speed
> according to LZ4 benchmark results on X86 devices.
> Meanwhile, LZ4 with FAST_DEC_LOOP also improves on ARM64;
> however, performance regresses with the clang compiler if FAST_DEC_LOOP is enabled.
> 
> So FAST_DEC_LOOP is only enabled on X86/X86-64, or on ARM64 with a GCC build.
> 
> LZ4 FAST_DEC_LOOP bug fixes are included as well.
> 1. fixed read-after input in LZ4_decompress_safe() (issue 681)
> 2. Fix out-of-bound read in LZ4_decompress_fast() (issue 676)
> 
> PS2:
> 1. Move common API to lz4defs.h
> 2. Add PPC related inline Macro definition.
> 3. Force inline new static apis.
> 
> Here is the test result on ARM64(cortex-A53)
> Benchmark via ZRAM:
> 
> Test case:
> taskset 03 /data/fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
> --buffer_compress_percentage=75  --size=700M \
> --scramble_buffers=1 --direct=1 --loops=100 --numjobs=1 \
> --filename=/data/test/test --name=seq-read --rw=read --stonewall
> 
> Patched:
>     READ: bw=150MiB/s (157MB/s)
> Vanilla:
>     READ: bw=135MiB/s (142MB/s)
> 
> [1] https://github.com/lz4/lz4/releases/tag/v1.9.0
> 
> Signed-off-by: chenxi.mao <chenxi....@sony.com>
> ---

It's preferred to move all changelogs here (below the "---" line) if you
don't want them to be part of the commit message. There are some patches
that can be used as a reference:
https://lore.kernel.org/lkml/

Thanks,
Gao Xiang

>  lib/lz4/lz4_compress.c   |   4 +-
>  lib/lz4/lz4_decompress.c | 397 ++++++++++++++++++++++++++++++++-------
>  lib/lz4/lz4defs.h        |  60 +++++-
>  lib/lz4/lz4hc_compress.c |   2 +-
>  4 files changed, 392 insertions(+), 71 deletions(-)
> 
> diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
> index cc7b6d4cc7c7..b703ed1ca57d 100644
> --- a/lib/lz4/lz4_compress.c
> +++ b/lib/lz4/lz4_compress.c
> @@ -322,7 +322,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>                               *token = (BYTE)(litLength << ML_BITS);
>  
>                       /* Copy Literals */
> -                     LZ4_wildCopy(op, anchor, op + litLength);
> +                     LZ4_wildCopy8(op, anchor, op + litLength);
>                       op += litLength;
>               }
>  
> @@ -628,7 +628,7 @@ static int LZ4_compress_destSize_generic(
>                               *token = (BYTE)(litLength << ML_BITS);
>  
>                       /* Copy Literals */
> -                     LZ4_wildCopy(op, anchor, op + litLength);
> +                     LZ4_wildCopy8(op, anchor, op + litLength);
>                       op += litLength;
>               }
>  
> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
> index 0c9d3ad17e0f..8622922304c3 100644
> --- a/lib/lz4/lz4_decompress.c
> +++ b/lib/lz4/lz4_decompress.c
> @@ -50,6 +50,96 @@
>  #define assert(condition) ((void)0)
>  #endif
>  
> +#ifndef LZ4_FAST_DEC_LOOP
> +#if defined(__i386__) || defined(__x86_64__)
> +#define LZ4_FAST_DEC_LOOP 1
> +#elif defined(__aarch64__) && !defined(__clang__)
> +     /* On aarch64, we disable this optimization for clang because on certain
> +      * mobile chipsets and clang, it reduces performance. For more 
> information
> +      * refer to https://github.com/lz4/lz4/pull/707. */
> +#define LZ4_FAST_DEC_LOOP 1
> +#else
> +#define LZ4_FAST_DEC_LOOP 0
> +#endif
> +#endif
> +
> +#if LZ4_FAST_DEC_LOOP
> +#define FASTLOOP_SAFE_DISTANCE 64
> +FORCE_O2_INLINE_GCC_PPC64LE void
> +LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * 
> dstEnd,
> +                          const size_t offset)
> +{
> +     if (offset < 8) {
> +             dstPtr[0] = srcPtr[0];
> +
> +             dstPtr[1] = srcPtr[1];
> +             dstPtr[2] = srcPtr[2];
> +             dstPtr[3] = srcPtr[3];
> +             srcPtr += inc32table[offset];
> +             memcpy(dstPtr + 4, srcPtr, 4);
> +             srcPtr -= dec64table[offset];
> +             dstPtr += 8;
> +     } else {
> +             memcpy(dstPtr, srcPtr, 8);
> +             dstPtr += 8;
> +             srcPtr += 8;
> +     }
> +
> +     LZ4_wildCopy8(dstPtr, srcPtr, dstEnd);
> +}
> +
> +/* customized variant of memcpy, which can overwrite up to 32 bytes beyond 
> dstEnd
> + * this version copies two times 16 bytes (instead of one time 32 bytes)
> + * because it must be compatible with offsets >= 16. */
> +FORCE_O2_INLINE_GCC_PPC64LE void
> +LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd)
> +{
> +     BYTE *d = (BYTE *) dstPtr;
> +     const BYTE *s = (const BYTE *)srcPtr;
> +     BYTE *const e = (BYTE *) dstEnd;
> +
> +     do {
> +             memcpy(d, s, 16);
> +             memcpy(d + 16, s + 16, 16);
> +             d += 32;
> +             s += 32;
> +     } while (d < e);
> +}
> +
> +FORCE_O2_INLINE_GCC_PPC64LE void
> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
> +                     const size_t offset)
> +{
> +     BYTE v[8];
> +     switch (offset) {
> +
> +     case 1:
> +             memset(v, *srcPtr, 8);
> +             goto copy_loop;
> +     case 2:
> +             memcpy(v, srcPtr, 2);
> +             memcpy(&v[2], srcPtr, 2);
> +             memcpy(&v[4], &v[0], 4);
> +             goto copy_loop;
> +     case 4:
> +             memcpy(v, srcPtr, 4);
> +             memcpy(&v[4], srcPtr, 4);
> +             goto copy_loop;
> +     default:
> +             LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
> +             return;
> +     }
> +
> +      copy_loop:
> +     memcpy(dstPtr, v, 8);
> +     dstPtr += 8;
> +     while (dstPtr < dstEnd) {
> +             memcpy(dstPtr, v, 8);
> +             dstPtr += 8;
> +     }
> +}
> +#endif
> +
>  /*
>   * LZ4_decompress_generic() :
>   * This generic decompression function covers all use cases.
> @@ -80,25 +170,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>        const size_t dictSize
>        )
>  {
> -     const BYTE *ip = (const BYTE *) src;
> -     const BYTE * const iend = ip + srcSize;
> +     const BYTE *ip = (const BYTE *)src;
> +     const BYTE *const iend = ip + srcSize;
>  
>       BYTE *op = (BYTE *) dst;
> -     BYTE * const oend = op + outputSize;
> +     BYTE *const oend = op + outputSize;
>       BYTE *cpy;
>  
> -     const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
> -     static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
> -     static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
> +     const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>  
>       const int safeDecode = (endOnInput == endOnInputSize);
>       const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB)));
>  
>       /* Set up the "end" pointers for the shortcut. */
>       const BYTE *const shortiend = iend -
> -             (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
> +         (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>       const BYTE *const shortoend = oend -
> -             (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
> +         (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
> +
> +     const BYTE *match;
> +     size_t offset;
> +     unsigned int token;
> +     size_t length;
>  
>       DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>                srcSize, outputSize);
> @@ -117,15 +210,195 @@ static FORCE_INLINE int LZ4_decompress_generic(
>       if ((endOnInput) && unlikely(srcSize == 0))
>               return -1;
>  
> -     /* Main Loop : decode sequences */
> +#if LZ4_FAST_DEC_LOOP
> +     if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
> +             DEBUGLOG(6, "skip fast decode loop");
> +             goto safe_decode;
> +     }
> +
> +     /* Fast loop : decode sequences as long as output < 
> iend-FASTLOOP_SAFE_DISTANCE */
>       while (1) {
> -             size_t length;
> -             const BYTE *match;
> -             size_t offset;
> +             /* Main fastloop assertion: We can always wildcopy 
> FASTLOOP_SAFE_DISTANCE */
> +             assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
> +             if (endOnInput) {
> +                     assert(ip < iend);
> +             }
> +             token = *ip++;
> +             length = token >> ML_BITS;      /* literal length */
> +
> +             assert(!endOnInput || ip <= iend);      /* ip < iend before the 
> increment */
> +
> +             /* decode literal length */
> +             if (length == RUN_MASK) {
> +                     variable_length_error error = ok;
> +                     length +=
> +                         read_variable_length(&ip, iend - RUN_MASK,
> +                                              endOnInput, endOnInput,
> +                                              &error);
> +                     if (error == initial_error) {
> +                             goto _output_error;
> +                     }
> +                     if ((safeDecode)
> +                         && unlikely((uptrval) (op) + length <
> +                                     (uptrval) (op))) {
> +                             goto _output_error;
> +                     }       /* overflow detection */
> +                     if ((safeDecode)
> +                         && unlikely((uptrval) (ip) + length <
> +                                     (uptrval) (ip))) {
> +                             goto _output_error;
> +                     }
>  
> -             /* get literal length */
> -             unsigned int const token = *ip++;
> -             length = token>>ML_BITS;
> +                     /* overflow detection */
> +                     /* copy literals */
> +                     cpy = op + length;
> +                     LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
> +                             if ((cpy > oend - 32)
> +                                 || (ip + length > iend - 32)) {
> +                                     goto safe_literal_copy;
> +                             }
> +                             LZ4_wildCopy32(op, ip, cpy);
> +                     } else {        /* LZ4_decompress_fast() */
> +                             if (cpy > oend - 8) {
> +                                     goto safe_literal_copy;
> +                             }
> +                             LZ4_wildCopy8(op, ip, cpy);
> +                             /* LZ4_decompress_fast() cannot copy more than 
> 8 bytes at a time */
> +                             /* it doesn't know input length, and only 
> relies on end-of-block */
> +                             /* properties */
> +                     }
> +                     ip += length;
> +                     op = cpy;
> +             } else {
> +                     cpy = op + length;
> +                     if (endOnInput) {       /* LZ4_decompress_safe() */
> +                             DEBUGLOG(7,
> +                                      "copy %u bytes in a 16-bytes stripe",
> +                                      (unsigned)length);
> +                             /* We don't need to check oend */
> +                             /* since we check it once for each loop below */
> +                             if (ip > iend - (16 + 1)) {     /*max lit + 
> offset + nextToken */
> +                                     goto safe_literal_copy;
> +                             }
> +                             /* Literals can only be 14, but hope compilers 
> optimize */
> +                             /*if we copy by a register size */
> +                             memcpy(op, ip, 16);
> +                     } else {
> +                             /* LZ4_decompress_fast() cannot copy more than 
> 8 bytes at a time */
> +                             /* it doesn't know input length, and relies on 
> end-of-block */
> +                             /* properties */
> +                             memcpy(op, ip, 8);
> +                             if (length > 8) {
> +                                     memcpy(op + 8, ip + 8, 8);
> +                             }
> +                     }
> +                     ip += length;
> +                     op = cpy;
> +             }
> +
> +             /* get offset */
> +             offset = LZ4_readLE16(ip);
> +             ip += 2;        /* end-of-block condition violated */
> +             match = op - offset;
> +
> +             /* get matchlength */
> +             length = token & ML_MASK;
> +
> +             if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
> +                     goto _output_error;
> +             }
> +             /* Error : offset outside buffers */
> +             if (length == ML_MASK) {
> +                     variable_length_error error = ok;
> +                     length +=
> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
> +                                              endOnInput, 0, &error);
> +                     if (error != ok) {
> +                             goto _output_error;
> +                     }
> +                     if ((safeDecode)
> +                         && unlikely((uptrval) (op) + length < (uptrval) 
> op)) {
> +                             goto _output_error;
> +                     }       /* overflow detection */
> +                     length += MINMATCH;
> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
> +                             goto safe_match_copy;
> +                     }
> +             } else {
> +                     length += MINMATCH;
> +                     if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
> +                             goto safe_match_copy;
> +                     }
> +
> +                     /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if 
> true */
> +                     if (!(dict == usingExtDict) || (match >= lowPrefix)) {
> +                             if (offset >= 8) {
> +                                     memcpy(op, match, 8);
> +                                     memcpy(op + 8, match + 8, 8);
> +                                     memcpy(op + 16, match + 16, 2);
> +                                     op += length;
> +                                     continue;
> +                             }
> +                     }
> +             }
> +
> +             /* match starting within external dictionary */
> +             if ((dict == usingExtDict) && (match < lowPrefix)) {
> +                     if (unlikely(op + length > oend - LASTLITERALS)) {
> +                             if (partialDecoding) {
> +                                     /* reach end of buffer */
> +                                     length =
> +                                         min(length, (size_t) (oend - op));
> +                             } else {
> +                                     /* end-of-block condition violated */
> +                                     goto _output_error;
> +                             }
> +                     }
> +
> +                     if (length <= (size_t) (lowPrefix - match)) {
> +                             /* match fits entirely within external 
> dictionary : just copy */
> +                             memmove(op, dictEnd - (lowPrefix - match),
> +                                     length);
> +                             op += length;
> +                     } else {
> +                             /* match stretches into both external dict and 
> current block */
> +                             size_t const copySize =
> +                                 (size_t) (lowPrefix - match);
> +                             size_t const restSize = length - copySize;
> +                             memcpy(op, dictEnd - copySize, copySize);
> +                             op += copySize;
> +                             if (restSize > (size_t) (op - lowPrefix)) {     
> /* overlap copy */
> +                                     BYTE *const endOfMatch = op + restSize;
> +                                     const BYTE *copyFrom = lowPrefix;
> +                                     while (op < endOfMatch) {
> +                                             *op++ = *copyFrom++;
> +                                     }
> +                             } else {
> +                                     memcpy(op, lowPrefix, restSize);
> +                                     op += restSize;
> +                             }
> +                     }
> +                     continue;
> +             }
> +
> +             /* copy match within block */
> +             cpy = op + length;
> +
> +             assert((op <= oend) && (oend - op >= 32));
> +             if (unlikely(offset < 16)) {
> +                     LZ4_memcpy_using_offset(op, match, cpy, offset);
> +             } else {
> +                     LZ4_wildCopy32(op, match, cpy);
> +             }
> +
> +             op = cpy;       /* wildcopy correction */
> +     }
> +      safe_decode:
> +#endif
> +     /* Main Loop : decode sequences */
> +     while (1) {
> +             length = token >> ML_BITS;
>  
>               /* ip < iend before the increment */
>               assert(!endOnInput || ip <= iend);
> @@ -143,26 +416,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>                * combined check for both stages).
>                */
>               if ((endOnInput ? length != RUN_MASK : length <= 8)
> -                /*
> -                 * strictly "less than" on input, to re-enter
> -                 * the loop with at least one byte
> -                 */
> -                && likely((endOnInput ? ip < shortiend : 1) &
> -                          (op <= shortoend))) {
> +                 /*
> +                  * strictly "less than" on input, to re-enter
> +                  * the loop with at least one byte
> +                  */
> +                 && likely((endOnInput ? ip < shortiend : 1) &
> +                           (op <= shortoend))) {
>                       /* Copy the literals */
>                       memcpy(op, ip, endOnInput ? 16 : 8);
> -                     op += length; ip += length;
> +                     op += length;
> +                     ip += length;
>  
>                       /*
>                        * The second stage:
>                        * prepare for match copying, decode full info.
>                        * If it doesn't work out, the info won't be wasted.
>                        */
> -                     length = token & ML_MASK; /* match length */
> +                     length = token & ML_MASK;       /* match length */
>                       offset = LZ4_readLE16(ip);
>                       ip += 2;
>                       match = op - offset;
> -                     assert(match <= op); /* check overflow */
> +                     assert(match <= op);    /* check overflow */
>  
>                       /* Do not deal with overlapping matches. */
>                       if ((length != ML_MASK) &&
> @@ -187,28 +461,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  
>               /* decode literal length */
>               if (length == RUN_MASK) {
> -                     unsigned int s;
>  
> -                     if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
> -                             /* overflow detection */
> +                     variable_length_error error = ok;
> +                     length +=
> +                         read_variable_length(&ip, iend - RUN_MASK,
> +                                              endOnInput, endOnInput,
> +                                              &error);
> +                     if (error == initial_error)
>                               goto _output_error;
> -                     }
> -                     do {
> -                             s = *ip++;
> -                             length += s;
> -                     } while (likely(endOnInput
> -                             ? ip < iend - RUN_MASK
> -                             : 1) & (s == 255));
>  
>                       if ((safeDecode)
> -                         && unlikely((uptrval)(op) +
> -                                     length < (uptrval)(op))) {
> +                         && unlikely((uptrval) (op) +
> +                                     length < (uptrval) (op))) {
>                               /* overflow detection */
>                               goto _output_error;
>                       }
>                       if ((safeDecode)
> -                         && unlikely((uptrval)(ip) +
> -                                     length < (uptrval)(ip))) {
> +                         && unlikely((uptrval) (ip) +
> +                                     length < (uptrval) (ip))) {
>                               /* overflow detection */
>                               goto _output_error;
>                       }
> @@ -216,11 +486,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  
>               /* copy literals */
>               cpy = op + length;
> +#if LZ4_FAST_DEC_LOOP
> +           safe_literal_copy:
> +#endif
>               LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>  
>               if (((endOnInput) && ((cpy > oend - MFLIMIT)
> -                     || (ip + length > iend - (2 + 1 + LASTLITERALS))))
> -                     || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
> +                                   || (ip + length >
> +                                       iend - (2 + 1 + LASTLITERALS))))
> +                 || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>                       if (partialDecoding) {
>                               if (cpy > oend) {
>                                       /*
> @@ -231,7 +505,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>                                       length = oend - op;
>                               }
>                               if ((endOnInput)
> -                                     && (ip + length > iend)) {
> +                                 && (ip + length > iend)) {
>                                       /*
>                                        * Error :
>                                        * read attempt beyond
> @@ -241,7 +515,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>                               }
>                       } else {
>                               if ((!endOnInput)
> -                                     && (cpy != oend)) {
> +                                 && (cpy != oend)) {
>                                       /*
>                                        * Error :
>                                        * block decoding must
> @@ -250,7 +524,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>                                       goto _output_error;
>                               }
>                               if ((endOnInput)
> -                                     && ((ip + length != iend)
> +                                 && ((ip + length != iend)
>                                       || (cpy > oend))) {
>                                       /*
>                                        * Error :
> @@ -269,7 +543,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>                               break;
>               } else {
>                       /* may overwrite up to WILDCOPYLENGTH beyond cpy */
> -                     LZ4_wildCopy(op, ip, cpy);
> +                     LZ4_wildCopy8(op, ip, cpy);
>                       ip += length;
>                       op = cpy;
>               }
> @@ -288,29 +562,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>                       goto _output_error;
>               }
>  
> -             /* costs ~1%; silence an msan warning when offset == 0 */
> -             /*
> -              * note : when partialDecoding, there is no guarantee that
> -              * at least 4 bytes remain available in output buffer
> -              */
> -             if (!partialDecoding) {
> -                     assert(oend > op);
> -                     assert(oend - op >= 4);
> -
> -                     LZ4_write32(op, (U32)offset);
> -             }
> -
>               if (length == ML_MASK) {
> -                     unsigned int s;
> -
> -                     do {
> -                             s = *ip++;
> -
> -                             if ((endOnInput) && (ip > iend - LASTLITERALS))
> -                                     goto _output_error;
>  
> -                             length += s;
> -                     } while (s == 255);
> +                     variable_length_error error = ok;
> +                     length +=
> +                         read_variable_length(&ip, iend - LASTLITERALS + 1,
> +                                              endOnInput, 0, &error);
> +                     if (error != ok)
> +                             goto _output_error;
>  
>                       if ((safeDecode)
>                               && unlikely(
> @@ -322,6 +581,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  
>               length += MINMATCH;
>  
> +#if LZ4_FAST_DEC_LOOP
> +safe_match_copy:
> +#endif
> +
>               /* match starting within external dictionary */
>               if ((dict == usingExtDict) && (match < lowPrefix)) {
>                       if (unlikely(op + length > oend - LASTLITERALS)) {
> @@ -418,7 +681,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>                       }
>  
>                       if (op < oCopyLimit) {
> -                             LZ4_wildCopy(op, match, oCopyLimit);
> +                             LZ4_wildCopy8(op, match, oCopyLimit);
>                               match += oCopyLimit - op;
>                               op = oCopyLimit;
>                       }
> @@ -427,7 +690,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>               } else {
>                       LZ4_copy8(op, match);
>                       if (length > 16)
> -                             LZ4_wildCopy(op + 8, match + 8, cpy);
> +                             LZ4_wildCopy8(op + 8, match + 8, cpy);
>               }
>               op = cpy; /* wildcopy correction */
>       }
> diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
> index 1a7fa9d9170f..4cc17cf589ed 100644
> --- a/lib/lz4/lz4defs.h
> +++ b/lib/lz4/lz4defs.h
> @@ -40,6 +40,28 @@
>  
>  #define FORCE_INLINE __always_inline
>  
> +/* LZ4_FORCE_O2_GCC_PPC64LE and LZ4_FORCE_O2_INLINE_GCC_PPC64LE
> + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8,
> + * together with a simple 8-byte copy loop as a fall-back path.
> + * However, this optimization hurts the decompression speed by >30%,
> + * because the execution does not go to the optimized loop
> + * for typical compressible data, and all of the preamble checks
> + * before going to the fall-back path become useless overhead.
> + * This optimization happens only with the -O3 flag, and -O2 generates
> + * a simple 8-byte copy loop.
> + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8
> + * functions are annotated with __attribute__((optimize("O2"))),
> + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute
> + * of LZ4_wildCopy8 does not affect the compression speed.
> + */
> +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && 
> !defined(__clang__)
> +#  define FORCE_O2_GCC_PPC64LE __attribute__((optimize("O2")))
> +#  define FORCE_O2_INLINE_GCC_PPC64LE (__attribute__((optimize("O2"))) 
> FORCE_INLINE)
> +#else
> +#  define FORCE_O2_GCC_PPC64LE               FORCE_INLINE
> +#  define FORCE_O2_INLINE_GCC_PPC64LE        FORCE_INLINE
> +#endif
> +
>  /*-************************************
>   *   Basic Types
>   **************************************/
> @@ -99,6 +121,9 @@ typedef uintptr_t uptrval;
>  #define RUN_BITS (8 - ML_BITS)
>  #define RUN_MASK ((1U << RUN_BITS) - 1)
>  
> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
> +
>  /*-************************************
>   *   Reading and writing into memory
>   **************************************/
> @@ -156,7 +181,7 @@ static FORCE_INLINE void LZ4_copy8(void *dst, const void 
> *src)
>   * customized variant of memcpy,
>   * which can overwrite up to 7 bytes beyond dstEnd
>   */
> -static FORCE_INLINE void LZ4_wildCopy(void *dstPtr,
> +static FORCE_O2_INLINE_GCC_PPC64LE void LZ4_wildCopy8(void *dstPtr,
>       const void *srcPtr, void *dstEnd)
>  {
>       BYTE *d = (BYTE *)dstPtr;
> @@ -220,6 +245,39 @@ static FORCE_INLINE unsigned int LZ4_count(
>       return (unsigned int)(pIn - pStart);
>  }
>  
> +/* Read the variable-length literal or match length.
> + *
> + * ip - pointer to use as input.
> + * lencheck - end ip.  Return an error if ip advances >= lencheck.
> + * loop_check - check ip >= lencheck in body of loop.  Returns loop_error if 
> so.
> + * initial_check - check ip >= lencheck before start of loop.  Returns 
> initial_error if so.
> + * error (output) - error code.  Should be set to 0 before call.
> + */
> +typedef enum { loop_error = -2, initial_error = -1, ok = 0} 
> variable_length_error;
> +static FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
> +                                        const BYTE *lencheck,
> +                                        int loop_check, int initial_check,
> +                                        variable_length_error *error)
> +{
> +     unsigned length = 0;
> +     unsigned s;
> +     if (initial_check && unlikely((*ip) >= lencheck)) {     /* overflow 
> detection */
> +             *error = initial_error;
> +             return length;
> +     }
> +     do {
> +             s = **ip;
> +             (*ip)++;
> +             length += s;
> +             if (loop_check && unlikely((*ip) >= lencheck)) {        /* 
> overflow detection */
> +                     *error = loop_error;
> +                     return length;
> +             }
> +     } while (s == 255);
> +
> +     return length;
> +}
> +
>  typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive;
>  typedef enum { byPtr, byU32, byU16 } tableType_t;
>  
> diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c
> index 176f03b83e56..e02e041a01d9 100644
> --- a/lib/lz4/lz4hc_compress.c
> +++ b/lib/lz4/lz4hc_compress.c
> @@ -293,7 +293,7 @@ static FORCE_INLINE int LZ4HC_encodeSequence(
>               *token = (BYTE)(length<<ML_BITS);
>  
>       /* Copy Literals */
> -     LZ4_wildCopy(*op, *anchor, (*op) + length);
> +     LZ4_wildCopy8(*op, *anchor, (*op) + length);
>       *op += length;
>  
>       /* Encode Offset */
>

Re: [PATCH 1/1] LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64

Reply via email to