Hi Chenxi,

Some words about the patch format; nothing major, though.

One suggestion: the subject line would be better written as "[PATCH v2/v3/...] title", since that makes it clear which of these emails carries the latest version of the patch.
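For example, a versioned resend could be laid out like this (illustrative placeholders only, not your actual patch):

	Subject: [PATCH v2] lib/lz4: enable LZ4_FAST_DEC_LOOP

	<commit message that should be recorded in git history>

	Signed-off-by: ...
	---
	v2: <per-version changelog that should NOT be recorded in git history>

	<diffstat and diff>

"git format-patch -v2" generates the "[PATCH v2]" prefix automatically, and "git am" drops everything between the "---" marker and the diffstat when applying, which is why per-version changelogs belong there.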
On 2019/5/17 13:56, Chenxi Mao wrote:
> FAST_DEC_LOOP was introduced in LZ4 1.9.0 [1].
> According to LZ4 benchmark results on x86 devices, this change
> brings a 10% improvement to the decompress operation.
> LZ4 with FAST_DEC_LOOP gains on ARM64 as well; however, clang
> builds show a regression when FAST_DEC_LOOP is enabled.
>
> So FAST_DEC_LOOP is only enabled on x86/x86-64, or on ARM64 with GCC builds.
>
> The LZ4 FAST_DEC_LOOP bug fixes are included as well:
> 1. Fix read-after-input in LZ4_decompress_safe() (issue 681)
> 2. Fix out-of-bound read in LZ4_decompress_fast() (issue 676)
>
> PS2:
> 1. Move common APIs to lz4defs.h
> 2. Add PPC-related inline macro definitions.
> 3. Force-inline the new static APIs.
>
> Here is the test result on ARM64 (Cortex-A53).
> Benchmark via ZRAM:
>
> Test case:
> taskset 03 /data/fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \
>	--buffer_compress_percentage=75 --size=700M \
>	--scramble_buffers=1 --direct=1 --loops=100 --numjobs=1 \
>	--filename=/data/test/test --name=seq-read --rw=read --stonewall
>
> Patched:
>    READ: bw=150MiB/s (157MB/s)
> Vanilla:
>    READ: bw=135MiB/s (142MB/s)
>
> [1] https://github.com/lz4/lz4/releases/tag/v1.9.0
>
> Signed-off-by: chenxi.mao <chenxi....@sony.com>
> ---

It's preferred to move all changelogs here (below the "---" line) if you don't want them to be part of the commit message; there are patches on https://lore.kernel.org/lkml/ which can be referenced.

Thanks,
Gao Xiang

>  lib/lz4/lz4_compress.c   |   4 +-
>  lib/lz4/lz4_decompress.c | 397 ++++++++++++++++++++++++++++++++-------
>  lib/lz4/lz4defs.h        |  60 +++++-
>  lib/lz4/lz4hc_compress.c |   2 +-
>  4 files changed, 392 insertions(+), 71 deletions(-)
>
> diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
> index cc7b6d4cc7c7..b703ed1ca57d 100644
> --- a/lib/lz4/lz4_compress.c
> +++ b/lib/lz4/lz4_compress.c
> @@ -322,7 +322,7 @@ static FORCE_INLINE int LZ4_compress_generic(
>  		*token = (BYTE)(litLength << ML_BITS);
>
>  		/* Copy Literals */
> -		LZ4_wildCopy(op, anchor, op + litLength);
> +		LZ4_wildCopy8(op, anchor, op + litLength);
>  		op += litLength;
>  	}
>
> @@ -628,7 +628,7 @@ static int LZ4_compress_destSize_generic(
>  		*token = (BYTE)(litLength << ML_BITS);
>
>  		/* Copy Literals */
> -		LZ4_wildCopy(op, anchor, op + litLength);
> +		LZ4_wildCopy8(op, anchor, op + litLength);
>  		op += litLength;
>  	}
>
> diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
> index 0c9d3ad17e0f..8622922304c3 100644
> --- a/lib/lz4/lz4_decompress.c
> +++ b/lib/lz4/lz4_decompress.c
> @@ -50,6 +50,96 @@
>  #define assert(condition) ((void)0)
>  #endif
>
> +#ifndef LZ4_FAST_DEC_LOOP
> +#if defined(__i386__) || defined(__x86_64__)
> +#define LZ4_FAST_DEC_LOOP 1
> +#elif defined(__aarch64__) && !defined(__clang__)
> +	/* On aarch64, we disable this optimization for clang because on certain
> +	 * mobile chipsets and clang, it reduces performance. For more information
> +	 * refer to https://github.com/lz4/lz4/pull/707.
> +	 */
> +#define LZ4_FAST_DEC_LOOP 1
> +#else
> +#define LZ4_FAST_DEC_LOOP 0
> +#endif
> +#endif
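As an aside, the way this gate resolves for a given toolchain is easy to check in userspace. A minimal probe, my own sketch rather than anything from the patch, that mirrors the #if chain above:

	/* fastloop_probe.c: report which decode path this compiler/target
	 * would select; build natively or with a cross compiler, e.g.
	 * "gcc fastloop_probe.c" vs. "clang --target=aarch64-linux-gnu ...".
	 */
	#include <stdio.h>

	#if defined(__i386__) || defined(__x86_64__)
	#define LZ4_FAST_DEC_LOOP 1
	#elif defined(__aarch64__) && !defined(__clang__)
	#define LZ4_FAST_DEC_LOOP 1	/* GCC-only on arm64; clang regresses */
	#else
	#define LZ4_FAST_DEC_LOOP 0
	#endif

	int main(void)
	{
		printf("LZ4_FAST_DEC_LOOP = %d\n", LZ4_FAST_DEC_LOOP);
		return 0;
	}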
> +
> +#if LZ4_FAST_DEC_LOOP
> +#define FASTLOOP_SAFE_DISTANCE 64
> +FORCE_O2_INLINE_GCC_PPC64LE void
> +LZ4_memcpy_using_offset_base(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
> +			     const size_t offset)
> +{
> +	if (offset < 8) {
> +		dstPtr[0] = srcPtr[0];
> +		dstPtr[1] = srcPtr[1];
> +		dstPtr[2] = srcPtr[2];
> +		dstPtr[3] = srcPtr[3];
> +		srcPtr += inc32table[offset];
> +		memcpy(dstPtr + 4, srcPtr, 4);
> +		srcPtr -= dec64table[offset];
> +		dstPtr += 8;
> +	} else {
> +		memcpy(dstPtr, srcPtr, 8);
> +		dstPtr += 8;
> +		srcPtr += 8;
> +	}
> +
> +	LZ4_wildCopy8(dstPtr, srcPtr, dstEnd);
> +}
> +
> +/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd
> + * this version copies two times 16 bytes (instead of one time 32 bytes)
> + * because it must be compatible with offsets >= 16. */
> +FORCE_O2_INLINE_GCC_PPC64LE void
> +LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd)
> +{
> +	BYTE *d = (BYTE *)dstPtr;
> +	const BYTE *s = (const BYTE *)srcPtr;
> +	BYTE *const e = (BYTE *)dstEnd;
> +
> +	do {
> +		memcpy(d, s, 16);
> +		memcpy(d + 16, s + 16, 16);
> +		d += 32;
> +		s += 32;
> +	} while (d < e);
> +}
> +
> +FORCE_O2_INLINE_GCC_PPC64LE void
> +LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
> +			const size_t offset)
> +{
> +	BYTE v[8];
> +
> +	switch (offset) {
> +	case 1:
> +		memset(v, *srcPtr, 8);
> +		goto copy_loop;
> +	case 2:
> +		memcpy(v, srcPtr, 2);
> +		memcpy(&v[2], srcPtr, 2);
> +		memcpy(&v[4], &v[0], 4);
> +		goto copy_loop;
> +	case 4:
> +		memcpy(v, srcPtr, 4);
> +		memcpy(&v[4], srcPtr, 4);
> +		goto copy_loop;
> +	default:
> +		LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
> +		return;
> +	}
> +
> +copy_loop:
> +	memcpy(dstPtr, v, 8);
> +	dstPtr += 8;
> +	while (dstPtr < dstEnd) {
> +		memcpy(dstPtr, v, 8);
> +		dstPtr += 8;
> +	}
> +}
> +#endif
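The offset < 8 handling above is subtle, so a worked example may help. This is a standalone userspace sketch of my own (only the tables and the offset < 8 branch are taken from the patch; the buffers and main() are invented) that expands an offset-3 match, i.e. replicates a 3-byte pattern on top of itself:

	/* offset_copy_demo.c: worked example of LZ4-style overlapped match copy */
	#include <stdio.h>
	#include <string.h>

	static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
	static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };

	int main(void)
	{
		/* "abc" is already decoded; a match with offset=3, length=13
		 * follows, so the decoder must replicate "abc" over itself. */
		unsigned char out[64] = "abc";
		unsigned char *dst = out + 3;		/* op */
		const unsigned char *src = out;		/* match = op - offset */
		size_t offset = 3, length = 13;
		unsigned char *end = dst + length;

		/* offset < 8: copy 4 bytes, then 4 more from an adjusted
		 * source, after which dst - src is a safe 8-byte stride */
		dst[0] = src[0];
		dst[1] = src[1];
		dst[2] = src[2];
		dst[3] = src[3];
		src += inc32table[offset];
		memcpy(dst + 4, src, 4);
		src -= dec64table[offset];
		dst += 8;
		while (dst < end) {	/* plain 8-byte strides from here on */
			memcpy(dst, src, 8);
			dst += 8;
			src += 8;
		}
		out[3 + length] = '\0';
		printf("%s\n", out);	/* prints "abcabcabcabcabca" */
		return 0;
	}

After the fixed 4+4 byte prologue, dst - src is at least 8 for every offset, so the remaining copies can proceed in 8-byte strides even though source and destination overlap (the loop may also write a few bytes past end, which is exactly the wildcopy overwrite the surrounding code has to budget for).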
> +
>  /*
>   * LZ4_decompress_generic() :
>   * This generic decompression function covers all use cases.
> @@ -80,25 +170,28 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  	const size_t dictSize
>  	)
>  {
> -	const BYTE *ip = (const BYTE *) src;
> -	const BYTE * const iend = ip + srcSize;
> +	const BYTE *ip = (const BYTE *)src;
> +	const BYTE *const iend = ip + srcSize;
>
>  	BYTE *op = (BYTE *) dst;
> -	BYTE * const oend = op + outputSize;
> +	BYTE *const oend = op + outputSize;
>  	BYTE *cpy;
>
> -	const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
> -	static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
> -	static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
> +	const BYTE *const dictEnd = (const BYTE *)dictStart + dictSize;
>
>  	const int safeDecode = (endOnInput == endOnInputSize);
>  	const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB)));
>
>  	/* Set up the "end" pointers for the shortcut. */
>  	const BYTE *const shortiend = iend -
> -		(endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
> +	    (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
>  	const BYTE *const shortoend = oend -
> -		(endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
> +	    (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
> +
> +	const BYTE *match;
> +	size_t offset;
> +	unsigned int token;
> +	size_t length;
>
>  	DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
>  		 srcSize, outputSize);
> @@ -117,15 +210,195 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  	if ((endOnInput) && unlikely(srcSize == 0))
>  		return -1;
>
> -	/* Main Loop : decode sequences */
> +#if LZ4_FAST_DEC_LOOP
> +	if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
> +		DEBUGLOG(6, "skip fast decode loop");
> +		goto safe_decode;
> +	}
> +
> +	/* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */
>  	while (1) {
> -		size_t length;
> -		const BYTE *match;
> -		size_t offset;
> +		/* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
> +		assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
> +		if (endOnInput) {
> +			assert(ip < iend);
> +		}
> +		token = *ip++;
> +		length = token >> ML_BITS;	/* literal length */
> +
> +		assert(!endOnInput || ip <= iend); /* ip < iend before the increment */
> +
> +		/* decode literal length */
> +		if (length == RUN_MASK) {
> +			variable_length_error error = ok;
> +			length += read_variable_length(&ip, iend - RUN_MASK,
> +						       endOnInput, endOnInput,
> +						       &error);
> +			if (error == initial_error) {
> +				goto _output_error;
> +			}
> +			if ((safeDecode)
> +			    && unlikely((uptrval)(op) + length < (uptrval)(op))) {
> +				goto _output_error;
> +			} /* overflow detection */
> +			if ((safeDecode)
> +			    && unlikely((uptrval)(ip) + length < (uptrval)(ip))) {
> +				goto _output_error;
> +			} /* overflow detection */
>
> -		/* get literal length */
> -		unsigned int const token = *ip++;
> -		length = token>>ML_BITS;
> +			/* copy literals */
> +			cpy = op + length;
> +			LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
> +			if (endOnInput) {	/* LZ4_decompress_safe() */
> +				if ((cpy > oend - 32)
> +				    || (ip + length > iend - 32)) {
> +					goto safe_literal_copy;
> +				}
> +				LZ4_wildCopy32(op, ip, cpy);
> +			} else {	/* LZ4_decompress_fast() */
> +				if (cpy > oend - 8) {
> +					goto safe_literal_copy;
> +				}
> +				LZ4_wildCopy8(op, ip, cpy);
> +				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
> +				/* it doesn't know input length, and only relies on end-of-block */
> +				/* properties */
> +			}
> +			ip += length;
> +			op = cpy;
> +		} else {
> +			cpy = op + length;
> +			if (endOnInput) {	/* LZ4_decompress_safe() */
> +				DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe",
> +					 (unsigned)length);
> +				/* We don't need to check oend */
> +				/* since we check it once for each loop below */
> +				if (ip > iend - (16 + 1)) { /* max lit + offset + nextToken */
> +					goto safe_literal_copy;
> +				}
> +				/* Literals can only be 14, but hope compilers optimize */
> +				/* if we copy by a register size */
> +				memcpy(op, ip, 16);
> +			} else {
> +				/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
> +				/* it doesn't know input length, and relies on end-of-block */
> +				/* properties */
> +				memcpy(op, ip, 8);
> +				if (length > 8) {
> +					memcpy(op + 8, ip + 8, 8);
> +				}
> +			}
> +			ip += length;
> +			op = cpy;
> +		}
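(A quick aside on the token handling above, since the same fields are decoded again below. Each sequence starts with one token byte: the literal length sits in the high 4 bits and the match length field in the low 4 bits; ML_BITS is 4 and RUN_MASK/ML_MASK are both 15, where 15 means "extra length bytes follow". A small illustration of mine, not from the patch:

	#include <assert.h>

	#define ML_BITS  4
	#define ML_MASK  ((1U << ML_BITS) - 1)	/* 15 */
	#define RUN_MASK 15			/* (1 << (8 - ML_BITS)) - 1 */

	int main(void)
	{
		unsigned int token = 0x53;	/* example token byte 0101 0011 */

		/* high nibble: 5 literal bytes follow the length fields */
		assert((token >> ML_BITS) == 5);
		/* low nibble: match length field 3, i.e. 3 + MINMATCH = 7 bytes */
		assert((token & ML_MASK) == 3);
		/* 15 in either field means read_variable_length() takes over;
		 * see the helper added to lz4defs.h later in this patch */
		assert(RUN_MASK == 15 && ML_MASK == 15);
		return 0;
	}

)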
> +
> +		/* get offset */
> +		offset = LZ4_readLE16(ip);
> +		ip += 2;
> +		match = op - offset;
> +
> +		/* get matchlength */
> +		length = token & ML_MASK;
> +
> +		if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
> +			goto _output_error;
> +		} /* Error : offset outside buffers */
> +
> +		if (length == ML_MASK) {
> +			variable_length_error error = ok;
> +			length += read_variable_length(&ip, iend - LASTLITERALS + 1,
> +						       endOnInput, 0, &error);
> +			if (error != ok) {
> +				goto _output_error;
> +			}
> +			if ((safeDecode)
> +			    && unlikely((uptrval)(op) + length < (uptrval)op)) {
> +				goto _output_error;
> +			} /* overflow detection */
> +			length += MINMATCH;
> +			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
> +				goto safe_match_copy;
> +			}
> +		} else {
> +			length += MINMATCH;
> +			if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
> +				goto safe_match_copy;
> +			}
> +
> +			/* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
> +			if (!(dict == usingExtDict) || (match >= lowPrefix)) {
> +				if (offset >= 8) {
> +					memcpy(op, match, 8);
> +					memcpy(op + 8, match + 8, 8);
> +					memcpy(op + 16, match + 16, 2);
> +					op += length;
> +					continue;
> +				}
> +			}
> +		}
> +
> +		/* match starting within external dictionary */
> +		if ((dict == usingExtDict) && (match < lowPrefix)) {
> +			if (unlikely(op + length > oend - LASTLITERALS)) {
> +				if (partialDecoding) {
> +					/* reach end of buffer */
> +					length = min(length, (size_t)(oend - op));
> +				} else {
> +					/* end-of-block condition violated */
> +					goto _output_error;
> +				}
> +			}
> +
> +			if (length <= (size_t)(lowPrefix - match)) {
> +				/* match fits entirely within external dictionary : just copy */
> +				memmove(op, dictEnd - (lowPrefix - match), length);
> +				op += length;
> +			} else {
> +				/* match stretches into both external dict and current block */
> +				size_t const copySize = (size_t)(lowPrefix - match);
> +				size_t const restSize = length - copySize;
> +
> +				memcpy(op, dictEnd - copySize, copySize);
> +				op += copySize;
> +				if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */
> +					BYTE *const endOfMatch = op + restSize;
> +					const BYTE *copyFrom = lowPrefix;
> +
> +					while (op < endOfMatch) {
> +						*op++ = *copyFrom++;
> +					}
> +				} else {
> +					memcpy(op, lowPrefix, restSize);
> +					op += restSize;
> +				}
> +			}
> +			continue;
> +		}
> +
> +		/* copy match within block */
> +		cpy = op + length;
> +
> +		assert((op <= oend) && (oend - op >= 32));
> +		if (unlikely(offset < 16)) {
> +			LZ4_memcpy_using_offset(op, match, cpy, offset);
> +		} else {
> +			LZ4_wildCopy32(op, match, cpy);
> +		}
> +
> +		op = cpy;	/* wildcopy correction */
> +	}
> +safe_decode:
> +#endif
> +	/* Main Loop : decode sequences */
> +	while (1) {
> +		length = token >> ML_BITS;
>
>  		/* ip < iend before the increment */
>  		assert(!endOnInput || ip <= iend);
> @@ -143,26 +416,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  		 * combined check for both stages).
>  		 */
>  		if ((endOnInput ? length != RUN_MASK : length <= 8)
> -		    /*
> -		     * strictly "less than" on input, to re-enter
> -		     * the loop with at least one byte
> -		     */
> -		    && likely((endOnInput ? ip < shortiend : 1) &
> -			      (op <= shortoend))) {
> +		    /*
> +		     * strictly "less than" on input, to re-enter
> +		     * the loop with at least one byte
> +		     */
> +		    && likely((endOnInput ? ip < shortiend : 1) &
> +			      (op <= shortoend))) {
>  			/* Copy the literals */
>  			memcpy(op, ip, endOnInput ? 16 : 8);
> -			op += length; ip += length;
> +			op += length;
> +			ip += length;
>
>  			/*
>  			 * The second stage:
>  			 * prepare for match copying, decode full info.
>  			 * If it doesn't work out, the info won't be wasted.
>  			 */
> -			length = token & ML_MASK; /* match length */
> +			length = token & ML_MASK;	/* match length */
>  			offset = LZ4_readLE16(ip);
>  			ip += 2;
>  			match = op - offset;
> -			assert(match <= op); /* check overflow */
> +			assert(match <= op);	/* check overflow */
>
>  			/* Do not deal with overlapping matches. */
>  			if ((length != ML_MASK) &&
> @@ -187,28 +461,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
>
>  		/* decode literal length */
>  		if (length == RUN_MASK) {
> -			unsigned int s;
>
> -			if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
> -				/* overflow detection */
> +			variable_length_error error = ok;
> +			length += read_variable_length(&ip, iend - RUN_MASK,
> +						       endOnInput, endOnInput,
> +						       &error);
> +			if (error == initial_error)
>  				goto _output_error;
> -			}
> -			do {
> -				s = *ip++;
> -				length += s;
> -			} while (likely(endOnInput
> -				? ip < iend - RUN_MASK
> -				: 1) & (s == 255));
>
>  			if ((safeDecode)
> -			    && unlikely((uptrval)(op) +
> -					length < (uptrval)(op))) {
> +			    && unlikely((uptrval) (op) +
> +					length < (uptrval) (op))) {
>  				/* overflow detection */
>  				goto _output_error;
>  			}
>  			if ((safeDecode)
> -			    && unlikely((uptrval)(ip) +
> -					length < (uptrval)(ip))) {
> +			    && unlikely((uptrval) (ip) +
> +					length < (uptrval) (ip))) {
>  				/* overflow detection */
>  				goto _output_error;
>  			}
> @@ -216,11 +486,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
>
>  		/* copy literals */
>  		cpy = op + length;
> +#if LZ4_FAST_DEC_LOOP
> +safe_literal_copy:
> +#endif
>  		LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
>
>  		if (((endOnInput) && ((cpy > oend - MFLIMIT)
> -		     || (ip + length > iend - (2 + 1 + LASTLITERALS))))
> -		    || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
> +		     || (ip + length > iend - (2 + 1 + LASTLITERALS))))
> +		    || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
>  			if (partialDecoding) {
>  				if (cpy > oend) {
>  					/*
> @@ -231,7 +505,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  					length = oend - op;
>  				}
>  				if ((endOnInput)
> -				    && (ip + length > iend)) {
> +				    && (ip + length > iend)) {
>  					/*
>  					 * Error :
>  					 * read attempt beyond
> @@ -241,7 +515,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  				}
>  			} else {
>  				if ((!endOnInput)
> -				    && (cpy != oend)) {
> +				    && (cpy != oend)) {
>  					/*
>  					 * Error :
>  					 * block decoding must
> @@ -250,7 +524,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  					goto _output_error;
>  				}
>  				if ((endOnInput)
> -				    && ((ip + length != iend)
> +				    && ((ip + length != iend)
>  					|| (cpy > oend))) {
>  					/*
>  					 * Error :
> @@ -269,7 +543,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  			break;
>  		} else {
>  			/* may overwrite up to WILDCOPYLENGTH beyond cpy */
> -			LZ4_wildCopy(op, ip, cpy);
> +			LZ4_wildCopy8(op, ip, cpy);
>  			ip += length;
>  			op = cpy;
>  		}
> @@ -288,29 +562,14 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  			goto _output_error;
>  		}
>
> -		/* costs ~1%; silence an msan warning when offset == 0 */
> -		/*
> -		 * note : when partialDecoding, there is no guarantee that
> -		 * at least 4 bytes remain available in output buffer
> -		 */
> -		if (!partialDecoding) {
> -			assert(oend > op);
> -			assert(oend - op >= 4);
> -
> -			LZ4_write32(op, (U32)offset);
> -		}
> -
>  		if (length == ML_MASK) {
> -			unsigned int s;
> -
> -			do {
> -				s = *ip++;
> -
> -				if ((endOnInput) && (ip > iend - LASTLITERALS))
> -					goto _output_error;
> -
> -				length += s;
> -			} while (s == 255);
> +			variable_length_error error = ok;
> +			length += read_variable_length(&ip, iend - LASTLITERALS + 1,
> +						       endOnInput, 0, &error);
> +			if (error != ok)
> +				goto _output_error;
>
>  			if ((safeDecode)
>  			    && unlikely(
> @@ -322,6 +581,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
>
>  		length += MINMATCH;
>
> +#if LZ4_FAST_DEC_LOOP
> +safe_match_copy:
> +#endif
> +
>  		/* match starting within external dictionary */
>  		if ((dict == usingExtDict) && (match < lowPrefix)) {
>  			if (unlikely(op + length > oend - LASTLITERALS)) {
> @@ -418,7 +681,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  			}
>
>  			if (op < oCopyLimit) {
> -				LZ4_wildCopy(op, match, oCopyLimit);
> +				LZ4_wildCopy8(op, match, oCopyLimit);
>  				match += oCopyLimit - op;
>  				op = oCopyLimit;
>  			}
> @@ -427,7 +690,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
>  		} else {
>  			LZ4_copy8(op, match);
>  			if (length > 16)
> -				LZ4_wildCopy(op + 8, match + 8, cpy);
> +				LZ4_wildCopy8(op + 8, match + 8, cpy);
>  		}
>  		op = cpy; /* wildcopy correction */
>  	}
> diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
> index 1a7fa9d9170f..4cc17cf589ed 100644
> --- a/lib/lz4/lz4defs.h
> +++ b/lib/lz4/lz4defs.h
> @@ -40,6 +40,28 @@
>
>  #define FORCE_INLINE __always_inline
>
> +/* FORCE_O2_GCC_PPC64LE and FORCE_O2_INLINE_GCC_PPC64LE
> + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8,
> + * together with a simple 8-byte copy loop as a fall-back path.
> + * However, this optimization hurts the decompression speed by >30%,
> + * because the execution does not go to the optimized loop
> + * for typical compressible data, and all of the preamble checks
> + * before going to the fall-back path become useless overhead.
> + * This optimization happens only with the -O3 flag, and -O2 generates
> + * a simple 8-byte copy loop.
> + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8
> + * functions are annotated with __attribute__((optimize("O2"))),
> + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute
> + * of LZ4_wildCopy8 does not affect the compression speed.
> + */
> +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__)
> +# define FORCE_O2_GCC_PPC64LE __attribute__((optimize("O2")))
> +# define FORCE_O2_INLINE_GCC_PPC64LE (__attribute__((optimize("O2"))) FORCE_INLINE)
> +#else
> +# define FORCE_O2_GCC_PPC64LE FORCE_INLINE
> +# define FORCE_O2_INLINE_GCC_PPC64LE FORCE_INLINE
> +#endif
> +
>  /*-************************************
>   *	Basic Types
>   **************************************/
> @@ -99,6 +121,9 @@ typedef uintptr_t uptrval;
>  #define RUN_BITS (8 - ML_BITS)
>  #define RUN_MASK ((1U << RUN_BITS) - 1)
>
> +static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
> +static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
> +
>  /*-************************************
>   *	Reading and writing into memory
>   **************************************/
> @@ -156,7 +181,7 @@ static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
>   * customized variant of memcpy,
>   * which can overwrite up to 7 bytes beyond dstEnd
>   */
> -static FORCE_INLINE void LZ4_wildCopy(void *dstPtr,
> +static FORCE_O2_INLINE_GCC_PPC64LE void LZ4_wildCopy8(void *dstPtr,
>  	const void *srcPtr, void *dstEnd)
>  {
>  	BYTE *d = (BYTE *)dstPtr;
> @@ -220,6 +245,39 @@ static FORCE_INLINE unsigned int LZ4_count(
>  	return (unsigned int)(pIn - pStart);
>  }
>
> +/* Read the variable-length literal or match length.
> + *
> + * ip - pointer to use as input.
> + * lencheck - end ip. Return an error if ip advances >= lencheck.
> + * loop_check - check ip >= lencheck in the body of the loop. Returns loop_error if so.
> + * initial_check - check ip >= lencheck before the start of the loop. Returns initial_error if so.
> + * error (output) - error code. Should be set to 0 before call.
> + */
> +typedef enum { loop_error = -2, initial_error = -1, ok = 0 } variable_length_error;
> +static FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
> +						  const BYTE *lencheck,
> +						  int loop_check, int initial_check,
> +						  variable_length_error *error)
> +{
> +	unsigned length = 0;
> +	unsigned s;
> +
> +	if (initial_check && unlikely((*ip) >= lencheck)) { /* overflow detection */
> +		*error = initial_error;
> +		return length;
> +	}
> +	do {
> +		s = **ip;
> +		(*ip)++;
> +		length += s;
> +		if (loop_check && unlikely((*ip) >= lencheck)) { /* overflow detection */
> +			*error = loop_error;
> +			return length;
> +		}
> +	} while (s == 255);
> +
> +	return length;
> +}
> +
>  typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive;
>  typedef enum { byPtr, byU32, byU16 } tableType_t;
>
> diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c
> index 176f03b83e56..e02e041a01d9 100644
> --- a/lib/lz4/lz4hc_compress.c
> +++ b/lib/lz4/lz4hc_compress.c
> @@ -293,7 +293,7 @@ static FORCE_INLINE int LZ4HC_encodeSequence(
>  		*token = (BYTE)(length<<ML_BITS);
>
>  		/* Copy Literals */
> -		LZ4_wildCopy(*op, *anchor, (*op) + length);
> +		LZ4_wildCopy8(*op, *anchor, (*op) + length);
>  		*op += length;
>
>  		/* Encode Offset */
>
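One more note on read_variable_length(): since it replaces two slightly different open-coded loops, it may be worth keeping a tiny userspace test next to the port. A sketch of mine (buffer contents invented; same logic as the helper above, minus the kernel-only unlikely() annotations):

	#include <assert.h>

	typedef unsigned char BYTE;
	typedef enum { loop_error = -2, initial_error = -1, ok = 0 } variable_length_error;

	static unsigned read_variable_length(const BYTE **ip, const BYTE *lencheck,
					     int loop_check, int initial_check,
					     variable_length_error *error)
	{
		unsigned length = 0;
		unsigned s;

		if (initial_check && *ip >= lencheck) {
			*error = initial_error;
			return length;
		}
		do {
			s = **ip;
			(*ip)++;
			length += s;
			if (loop_check && *ip >= lencheck) {
				*error = loop_error;
				return length;
			}
		} while (s == 255);

		return length;
	}

	int main(void)
	{
		/* a RUN_MASK/ML_MASK field of 15 followed by these bytes
		 * encodes 255 + 255 + 7 = 517 extra length bytes; 0x40 is
		 * just the next byte of the stream */
		const BYTE buf[] = { 255, 255, 7, 0x40 };
		const BYTE *ip = buf;
		variable_length_error err = ok;
		unsigned len = read_variable_length(&ip, buf + sizeof(buf), 1, 1, &err);

		assert(err == ok && len == 517 && ip == buf + 3);

		/* truncated input: the lencheck bound trips instead of overreading */
		ip = buf;
		err = ok;
		(void)read_variable_length(&ip, buf + 2, 1, 1, &err);
		assert(err == loop_error);
		return 0;
	}

The lencheck bound is what fixes the read-after-input problem mentioned in the commit message: the old loops could advance ip past the end of the block before noticing, while here every increment is checked against the caller-supplied limit.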