Script 'mail_helper' called by obssrc
Hello community,

here is the log from the commit of package zlib for openSUSE:Factory checked in at 2022-10-13 15:39:38
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/zlib (Old)
 and      /work/SRC/openSUSE:Factory/.zlib.new.2275 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "zlib" Thu Oct 13 15:39:38 2022 rev:84 rq:1009787 version:1.2.12 Changes: -------- --- /work/SRC/openSUSE:Factory/zlib/zlib.changes 2022-09-02 21:56:04.288228022 +0200 +++ /work/SRC/openSUSE:Factory/.zlib.new.2275/zlib.changes 2022-10-13 15:39:41.890435356 +0200 @@ -1,0 +2,10 @@ +Mon Oct 10 10:08:02 UTC 2022 - Danilo Spinella <danilo.spine...@suse.com> + +- Add Power8 optimizations: + * zlib-1.2.12-add-optimized-slide_hash-for-power.patch + * zlib-1.2.12-add-vectorized-longest_match-for-power.patch + * zlib-1.2.12-adler32-vector-optimizations-for-power.patch + * zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch +- Update zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch + +------------------------------------------------------------------- New: ---- zlib-1.2.12-add-optimized-slide_hash-for-power.patch zlib-1.2.12-add-vectorized-longest_match-for-power.patch zlib-1.2.12-adler32-vector-optimizations-for-power.patch zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ zlib.spec ++++++ --- /var/tmp/diff_new_pack.Ly3d2k/_old 2022-10-13 15:39:42.650437055 +0200 +++ /var/tmp/diff_new_pack.Ly3d2k/_new 2022-10-13 15:39:42.654437063 +0200 @@ -44,12 +44,18 @@ # The following patches are taken from https://github.com/iii-i/zlib/commits/crc32vx-v3 Patch7: zlib-1.2.5-minizip-fixuncrypt.patch Patch8: zlib-1.2.11-optimized-s390.patch +# https://github.com/iii-i/zlib/commit/171d0ff3c9ed40da0ac14085ab16b766b1162069 Patch9: zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch Patch10: zlib-1.2.11-covscan-issues.patch Patch11: zlib-1.2.11-covscan-issues-rhel9.patch Patch12: zlib-1.2.12-optimized-crc32-power8.patch Patch13: zlib-1.2.12-fix-configure.patch Patch14: zlib-1.2.12-s390-vectorize-crc32.patch +# The following patches are taken from https://github.com/mscastanho/zlib/commits/power-optimizations-1.2.12 +Patch15: zlib-1.2.12-adler32-vector-optimizations-for-power.patch +Patch16: zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch +Patch17: zlib-1.2.12-add-optimized-slide_hash-for-power.patch +Patch18: zlib-1.2.12-add-vectorized-longest_match-for-power.patch BuildRequires: autoconf BuildRequires: automake BuildRequires: libtool @@ -148,6 +154,10 @@ %patch12 -p1 %patch13 -p1 %patch14 -p1 +%patch15 -p1 +%patch16 -p1 +%patch17 -p1 +%patch18 -p1 cp %{SOURCE4} . 
%build @@ -167,10 +177,10 @@ # Profiling flags breaks tests, as of 1.2.12 # In particular, gzseek does not work as intended #%if %{do_profiling} -# #make %{?_smp_mflags} CFLAGS="%{optflags} %{cflags_profile_generate}" +# make %{?_smp_mflags} CFLAGS="%{optflags} %{cflags_profile_generate}" # make check %{?_smp_mflags} -# #make %{?_smp_mflags} clean -# #make %{?_smp_mflags} CFLAGS="%{optflags} %{cflags_profile_feedback}" +# make %{?_smp_mflags} clean +# make %{?_smp_mflags} CFLAGS="%{optflags} %{cflags_profile_feedback}" #%else make %{?_smp_mflags} #%endif ++++++ zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch ++++++ ++++ 682 lines (skipped) ++++ between /work/SRC/openSUSE:Factory/zlib/zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch ++++ and /work/SRC/openSUSE:Factory/.zlib.new.2275/zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch ++++++ zlib-1.2.12-add-optimized-slide_hash-for-power.patch ++++++ >From 4a8d89ae49aa17d1634a2816c8d159f533a07eae Mon Sep 17 00:00:00 2001 From: Matheus Castanho <m...@linux.ibm.com> Date: Wed, 27 Nov 2019 10:18:10 -0300 Subject: [PATCH] Add optimized slide_hash for Power Considerable time is spent on deflate.c:slide_hash() during deflate. This commit introduces a new slide_hash function that uses VSX vector instructions to slide 8 hash elements at a time, instead of just one as the standard code does. The choice between the optimized and default versions is made only on the first call to the function, enabling a fallback to standard behavior if the host processor does not support VSX instructions, so the same binary can be used for multiple Power processor versions. Author: Matheus Castanho <m...@linux.ibm.com> --- CMakeLists.txt | 3 +- Makefile.in | 8 ++++ configure | 4 +- contrib/power/power.h | 3 ++ contrib/power/slide_hash_power8.c | 63 +++++++++++++++++++++++++++++ contrib/power/slide_hash_resolver.c | 15 +++++++ deflate.c | 12 ++++++ 7 files changed, 105 insertions(+), 3 deletions(-) create mode 100644 contrib/power/slide_hash_power8.c create mode 100644 contrib/power/slide_hash_resolver.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 44de486f6..8208c626b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,7 +186,8 @@ if(CMAKE_COMPILER_IS_GNUCC) add_definitions(-DZ_POWER8) set(ZLIB_POWER8 contrib/power/adler32_power8.c - contrib/power/crc32_z_power8.c) + contrib/power/crc32_z_power8.c + contrib/power/slide_hash_power8.c) set_source_files_properties( ${ZLIB_POWER8} diff --git a/Makefile.in b/Makefile.in index 9ef9fa9b5..f71c6eae0 100644 --- a/Makefile.in +++ b/Makefile.in @@ -183,6 +183,9 @@ crc32_z_power8.o: $(SRCDIR)contrib/power/crc32_z_power8.c deflate.o: $(SRCDIR)deflate.c $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c +slide_hash_power8.o: $(SRCDIR)contrib/power/slide_hash_power8.c + $(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/slide_hash_power8.c + infback.o: $(SRCDIR)infback.c $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)infback.c @@ -245,6 +248,11 @@ deflate.lo: $(SRCDIR)deflate.c $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c -@mv objs/deflate.o $@ +slide_hash_power8.lo: $(SRCDIR)contrib/power/slide_hash_power8.c + -@mkdir objs 2>/dev/null || test -d objs + $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/slide_hash_power8.o $(SRCDIR)contrib/power/slide_hash_power8.c + -@mv objs/slide_hash_power8.o $@ + infback.lo: $(SRCDIR)infback.c -@mkdir objs 2>/dev/null || test -d objs $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/infback.o $(SRCDIR)infback.c diff --git a/configure 
b/configure index 810a7404d..d0dacf9c2 100755 --- a/configure +++ b/configure @@ -879,8 +879,8 @@ if tryboth $CC -c $CFLAGS $test.c; then if tryboth $CC -c $CFLAGS -mcpu=power8 $test.c; then POWER8="-DZ_POWER8" - PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo" - OBJC="${OBJC} adler32_power8.o crc32_z_power8.o" + PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo slide_hash_power8.lo" + OBJC="${OBJC} adler32_power8.o crc32_z_power8.o slide_hash_power8.o" echo "Checking for -mcpu=power8 support... Yes." | tee -a configure.log else echo "Checking for -mcpu=power8 support... No." | tee -a configure.log diff --git a/contrib/power/power.h b/contrib/power/power.h index f57c76167..28c8f78ca 100644 --- a/contrib/power/power.h +++ b/contrib/power/power.h @@ -4,7 +4,10 @@ */ #include "../../zconf.h" #include "../../zutil.h" +#include "../../deflate.h" uLong _adler32_power8(uLong adler, const Bytef* buf, uInt len); unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t); + +void _slide_hash_power8(deflate_state *s); diff --git a/contrib/power/slide_hash_power8.c b/contrib/power/slide_hash_power8.c new file mode 100644 index 000000000..c5a0eb5a6 --- /dev/null +++ b/contrib/power/slide_hash_power8.c @@ -0,0 +1,63 @@ + /* Copyright (C) 2019 Matheus Castanho <m...@linux.ibm.com>, IBM + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <altivec.h> +#include "../../deflate.h" + +local inline void slide_hash_power8_loop OF((deflate_state *s, + unsigned n_elems, Posf *table_end)) __attribute__((always_inline)); + +local void slide_hash_power8_loop( + deflate_state *s, + unsigned n_elems, + Posf *table_end) +{ + vector unsigned short vw, vm, *vp; + unsigned chunks; + + /* Each vector register (chunk) corresponds to 128 bits == 8 Posf, + * so instead of processing each of the n_elems in the hash table + * individually, we can do it in chunks of 8 with vector instructions. + * + * This function is only called from slide_hash_power8(), and both calls + * pass n_elems as a power of 2 higher than 2^7, as defined by + * deflateInit2_(), so n_elems will always be a multiple of 8. */ + chunks = n_elems >> 3; + Assert(n_elems % 8 == 0, "Weird hash table size!"); + + /* This type casting is safe since s->w_size is always <= 64KB + * as defined by deflateInit2_() and Posf == unsigned short */ + vw[0] = (Posf) s->w_size; + vw = vec_splat(vw,0); + + vp = (vector unsigned short *) table_end; + + do { + /* Processing 8 elements at a time */ + vp--; + vm = *vp; + + /* This is equivalent to: m >= w_size ? m - w_size : 0 + * Since we are using a saturated unsigned subtraction, any + * values that are > w_size will be set to 0, while the others + * will be subtracted by w_size. 
*/ + *vp = vec_subs(vm,vw); + } while (--chunks); +}; + +void ZLIB_INTERNAL _slide_hash_power8(deflate_state *s) +{ + unsigned n; + Posf *p; + + n = s->hash_size; + p = &s->head[n]; + slide_hash_power8_loop(s,n,p); + +#ifndef FASTEST + n = s->w_size; + p = &s->prev[n]; + slide_hash_power8_loop(s,n,p); +#endif +} diff --git a/contrib/power/slide_hash_resolver.c b/contrib/power/slide_hash_resolver.c new file mode 100644 index 000000000..54fa1eb21 --- /dev/null +++ b/contrib/power/slide_hash_resolver.c @@ -0,0 +1,15 @@ +/* Copyright (C) 2019 Matheus Castanho <m...@linux.ibm.com>, IBM + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "../gcc/zifunc.h" +#include "power.h" + +Z_IFUNC(slide_hash) { +#ifdef Z_POWER8 + if (__builtin_cpu_supports("arch_2_07")) + return _slide_hash_power8; +#endif + + return slide_hash_default; +} diff --git a/deflate.c b/deflate.c index 799fb93cc..b2db576dc 100644 --- a/deflate.c +++ b/deflate.c @@ -196,6 +196,13 @@ local const config configuration_table[10] = { (unsigned)(s->hash_size-1)*sizeof(*s->head)); \ } while (0) +#ifdef Z_POWER_OPT +/* Rename function so resolver can use its symbol. The default version will be + * returned by the resolver if the host has no support for an optimized version. + */ +#define slide_hash slide_hash_default +#endif /* Z_POWER_OPT */ + /* =========================================================================== * Slide the hash table when sliding the window down (could be avoided with 32 * bit values at the expense of memory usage). We slide even when level == 0 to @@ -227,6 +234,11 @@ local void slide_hash(s) #endif } +#ifdef Z_POWER_OPT +#undef slide_hash +#include "contrib/power/slide_hash_resolver.c" +#endif /* Z_POWER_OPT */ + /* ========================================================================= */ int ZEXPORT deflateInit_(strm, level, version, stream_size) z_streamp strm; ++++++ zlib-1.2.12-add-vectorized-longest_match-for-power.patch ++++++ >From aecdff0646c7e188b48f6db285d8d63a74f246c1 Mon Sep 17 00:00:00 2001 From: Matheus Castanho <m...@linux.ibm.com> Date: Tue, 29 Oct 2019 18:04:11 -0300 Subject: [PATCH] Add vectorized longest_match for Power This commit introduces an optimized version of the longest_match function for Power processors. It uses VSX instructions to match 16 bytes at a time on each comparison, instead of one by one. 
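Both of these Power patches defer the choice of implementation to a resolver built with the Z_IFUNC macro from contrib/gcc/zifunc.h, which is not part of this diff. A minimal standalone sketch of the GNU ifunc dispatch such a macro is assumed to generate (the stub bodies and the exact expansion are illustrative, not the real header):

/* The resolver runs once, at dynamic-link time; every later call to
 * slide_hash() jumps directly to whichever function it returned. */
typedef struct deflate_state_s deflate_state;      /* stand-in for zlib's type */

static void slide_hash_default(deflate_state *s) { (void)s; /* portable C loop */ }
static void slide_hash_power8(deflate_state *s)  { (void)s; /* VSX, 8 entries at a time */ }

static void (*resolve_slide_hash(void))(deflate_state *)
{
#ifdef Z_POWER8
    if (__builtin_cpu_supports("arch_2_07"))       /* ISA 2.07 => POWER8 or newer */
        return slide_hash_power8;
#endif
    return slide_hash_default;                     /* same binary runs on older CPUs */
}

void slide_hash(deflate_state *s) __attribute__((ifunc("resolve_slide_hash")));

In the patches themselves the resolver file is #include'd at the bottom of deflate.c after the #define/#undef rename, so it can return the file-static slide_hash_default (and longest_match_default below) without exporting them.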
Author: Matheus Castanho <m...@linux.ibm.com> --- CMakeLists.txt | 3 +- Makefile.in | 8 + configure | 4 +- contrib/power/longest_match_power9.c | 194 +++++++++++++++++++++++++ contrib/power/longest_match_resolver.c | 15 ++ contrib/power/power.h | 2 + deflate.c | 13 ++ 7 files changed, 236 insertions(+), 3 deletions(-) create mode 100644 contrib/power/longest_match_power9.c create mode 100644 contrib/power/longest_match_resolver.c Index: zlib-1.2.12/CMakeLists.txt =================================================================== --- zlib-1.2.12.orig/CMakeLists.txt +++ zlib-1.2.12/CMakeLists.txt @@ -199,7 +199,8 @@ if(CMAKE_COMPILER_IS_GNUCC) if(POWER9) add_definitions(-DZ_POWER9) - set(ZLIB_POWER9 ) + set(ZLIB_POWER9 + contrib/power/longest_match_power9.c) set_source_files_properties( ${ZLIB_POWER9} Index: zlib-1.2.12/Makefile.in =================================================================== --- zlib-1.2.12.orig/Makefile.in +++ zlib-1.2.12/Makefile.in @@ -189,6 +189,9 @@ crc32-vx.o: $(SRCDIR)contrib/s390/crc32- deflate.o: $(SRCDIR)deflate.c $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c +longest_match_power9.o: $(SRCDIR)contrib/power/longest_match_power9.c + $(CC) $(CFLAGS) -mcpu=power9 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/longest_match_power9.c + slide_hash_power8.o: $(SRCDIR)contrib/power/slide_hash_power8.c $(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/slide_hash_power8.c @@ -259,6 +262,11 @@ deflate.lo: $(SRCDIR)deflate.c $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c -@mv objs/deflate.o $@ +longest_match_power9.lo: $(SRCDIR)contrib/power/longest_match_power9.c + -@mkdir objs 2>/dev/null || test -d objs + $(CC) $(SFLAGS) -mcpu=power9 $(ZINC) -DPIC -c -o objs/longest_match_power9.o $(SRCDIR)contrib/power/longest_match_power9.c + -@mv objs/longest_match_power9.o $@ + slide_hash_power8.lo: $(SRCDIR)contrib/power/slide_hash_power8.c -@mkdir objs 2>/dev/null || test -d objs $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/slide_hash_power8.o $(SRCDIR)contrib/power/slide_hash_power8.c Index: zlib-1.2.12/configure =================================================================== --- zlib-1.2.12.orig/configure +++ zlib-1.2.12/configure @@ -915,8 +915,8 @@ if tryboth $CC -c $CFLAGS $test.c; then if tryboth $CC -c $CFLAGS -mcpu=power9 $test.c; then POWER9="-DZ_POWER9" - PIC_OBJC="${PIC_OBJC}" - OBJC="${OBJC}" + PIC_OBJC="$PIC_OBJC longest_match_power9.lo" + OBJC="$OBJC longest_match_power9.o" echo "Checking for -mcpu=power9 support... Yes." | tee -a configure.log else echo "Checking for -mcpu=power9 support... No." | tee -a configure.log Index: zlib-1.2.12/contrib/power/longest_match_power9.c =================================================================== --- /dev/null +++ zlib-1.2.12/contrib/power/longest_match_power9.c @@ -0,0 +1,194 @@ +/* Copyright (C) 2019 Matheus Castanho <m...@linux.ibm.com>, IBM + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <altivec.h> +#include "../../deflate.h" + +local inline int vec_match OF((Bytef* scan, Bytef* match)) + __attribute__((always_inline)); + +local inline int vec_match(Bytef* scan, Bytef* match) +{ + vector unsigned char vscan, vmatch, vc; + int len; + + vscan = *((vector unsigned char *) scan); + vmatch = *((vector unsigned char *) match); + + /* Compare 16 bytes at a time. 
+ * Each byte of vc will be either all ones or all zeroes, + * depending on the result of the comparison + */ + vc = (vector unsigned char) vec_cmpne(vscan,vmatch); + + /* Since the index of matching bytes will contain only zeroes + * on vc (since we used cmpne), counting the number of consecutive + * bytes where LSB == 0 is the same as counting the length of the match. + * + * There was an issue in the way the vec_cnttz_lsbb builtin was implemented + * that got fixed on GCC 12, but now we have to use different builtins + * depending on the compiler version. To avoid that, let's use inline asm to + * generate the exact instruction we need. + */ + #ifdef __LITTLE_ENDIAN__ + asm volatile("vctzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc)); + #else + asm volatile("vclzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc)); + #endif + + return len; +} + +uInt ZLIB_INTERNAL _longest_match_power9(deflate_state *s, IPos cur_match) +{ + unsigned chain_length = s->max_chain_length;/* max hash chain length */ + register Bytef *scan = s->window + s->strstart; /* current string */ + register Bytef *match; /* matched string */ + register int len; /* length of current match */ + int best_len = (int)s->prev_length; /* best match length so far */ + int nice_match = s->nice_match; /* stop if match long enough */ + int mbytes; /* matched bytes inside loop */ + IPos limit = s->strstart > (IPos)MAX_DIST(s) ? + s->strstart - (IPos)MAX_DIST(s) : 0; + /* Stop when cur_match becomes <= limit. To simplify the code, + * we prevent matches with the string of window index 0. + */ + Posf *prev = s->prev; + uInt wmask = s->w_mask; + +#if (MAX_MATCH == 258) + /* Compare the last two bytes at once. */ + register Bytef *strend2 = s->window + s->strstart + MAX_MATCH - 2; + register ush scan_end = *(ushf*)(scan+best_len-1); +#else + register Bytef *strend = s->window + s->strstart + MAX_MATCH; + register Byte scan_end1 = scan[best_len-1]; + register Byte scan_end = scan[best_len]; +#endif + + /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. + * It is easy to get rid of this optimization if necessary. + */ + Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); + + /* Do not waste too much time if we already have a good match: */ + if (s->prev_length >= s->good_match) { + chain_length >>= 2; + } + /* Do not look for matches beyond the end of the input. This is necessary + * to make deflate deterministic. + */ + if ((uInt)nice_match > s->lookahead) nice_match = (int)s->lookahead; + + Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); + + do { + Assert(cur_match < s->strstart, "no future"); + match = s->window + cur_match; + + /* Skip to next match if the match length cannot increase + * or if the match length is less than 2. Note that the checks below + * for insufficient lookahead only occur occasionally for performance + * reasons. Therefore uninitialized memory will be accessed, and + * conditional jumps will be made that depend on those values. + * However the length of the match is limited to the lookahead, so + * the output of deflate is not affected by the uninitialized values. + */ + +/* MAX_MATCH - 2 should be a multiple of 16 for this optimization to work. */ +#if (MAX_MATCH == 258) + + /* Compare ending (2 bytes) and beginning of potential match. + * + * On Power processors, loading a 16-byte vector takes only 1 extra + * cycle compared to a regular byte load. 
So instead of comparing the + * first two bytes and then the rest later if they match, we can compare + * the first 16 at once, and when we have a match longer than 2, we will + * already have the result of comparing the first 16 bytes saved in mbytes. + */ + if (*(ushf*)(match+best_len-1) != scan_end || + (mbytes = vec_match(scan,match)) < 3) continue; + + scan += mbytes; + match += mbytes; + + /* In case when we may have a match longer than 16, we perform further + * comparisons in chunks of 16 and keep going while all bytes match. + */ + while(mbytes == 16) { + mbytes = vec_match(scan,match); + scan += mbytes; + match += mbytes; + + /* We also have to limit the maximum match based on MAX_MATCH. + * Since we are comparing 16 bytes at a time and MAX_MATCH == 258 (to + * comply with default implementation), we should stop comparing when + * we have matched 256 bytes, which happens when scan == strend2. + * In this ("rare") case, we have to check the remaining 2 bytes + * individually using common load and compare operations. + */ + if(scan >= strend2) { + if(*scan == *match) { + if(*++scan == *++match) + scan++; + } + break; + } + } + + Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); + + len = (MAX_MATCH - 2) - (int)(strend2 - scan); + scan = strend2 - (MAX_MATCH - 2); + +#else /* MAX_MATCH == 258 */ + + if (match[best_len] != scan_end || + match[best_len-1] != scan_end1 || + *match != *scan || + *++match != scan[1]) continue; + + /* The check at best_len-1 can be removed because it will be made + * again later. (This heuristic is not always a win.) + * It is not necessary to compare scan[2] and match[2] since they + * are always equal when the other bytes match, given that + * the hash keys are equal and that HASH_BITS >= 8. + */ + scan += 2, match++; + Assert(*scan == *match, "match[2]?"); + + /* We check for insufficient lookahead only every 8th comparison; + * the 256th check will be made at strstart+258. 
+ */ + do { + } while (*++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + *++scan == *++match && *++scan == *++match && + scan < strend); + + Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); + + len = MAX_MATCH - (int)(strend - scan); + scan = strend - MAX_MATCH; + +#endif /* MAX_MATCH == 258 */ + + if (len > best_len) { + s->match_start = cur_match; + best_len = len; + if (len >= nice_match) break; +#if (MAX_MATCH == 258) + scan_end = *(ushf*)(scan+best_len-1); +#else + scan_end1 = scan[best_len-1]; + scan_end = scan[best_len]; +#endif + } + } while ((cur_match = prev[cur_match & wmask]) > limit + && --chain_length != 0); + + if ((uInt)best_len <= s->lookahead) return (uInt)best_len; + return s->lookahead; +} Index: zlib-1.2.12/contrib/power/longest_match_resolver.c =================================================================== --- /dev/null +++ zlib-1.2.12/contrib/power/longest_match_resolver.c @@ -0,0 +1,15 @@ +/* Copyright (C) 2019 Matheus Castanho <m...@linux.ibm.com>, IBM + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "../gcc/zifunc.h" +#include "power.h" + +Z_IFUNC(longest_match) { +#ifdef Z_POWER9 + if (__builtin_cpu_supports("arch_3_00")) + return _longest_match_power9; +#endif + + return longest_match_default; +} Index: zlib-1.2.12/contrib/power/power.h =================================================================== --- zlib-1.2.12.orig/contrib/power/power.h +++ zlib-1.2.12/contrib/power/power.h @@ -10,4 +10,6 @@ uLong _adler32_power8(uLong adler, const unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t); +uInt _longest_match_power9(deflate_state *s, IPos cur_match); + void _slide_hash_power8(deflate_state *s); Index: zlib-1.2.12/deflate.c =================================================================== --- zlib-1.2.12.orig/deflate.c +++ zlib-1.2.12/deflate.c @@ -1309,6 +1309,14 @@ local void lm_init (s) /* For 80x86 and 680x0, an optimized version will be provided in match.asm or * match.S. The code will be functionally equivalent. */ + +#ifdef Z_POWER_OPT +/* Rename function so resolver can use its symbol. The default version will be + * returned by the resolver if the host has no support for an optimized version. + */ +#define longest_match longest_match_default +#endif /* Z_POWER_OPT */ + local uInt longest_match(s, pcur_match) deflate_state *s; IPos pcur_match; /* current match */ @@ -1454,6 +1462,11 @@ local uInt longest_match(s, pcur_match) } #endif /* ASMV */ +#ifdef Z_POWER_OPT +#undef longest_match +#include "contrib/power/longest_match_resolver.c" +#endif /* Z_POWER_OPT */ + #else /* FASTEST */ /* --------------------------------------------------------------------------- ++++++ zlib-1.2.12-adler32-vector-optimizations-for-power.patch ++++++ >From 772f4bd0f880c4c193ab7da78728f38821572a02 Mon Sep 17 00:00:00 2001 From: Rogerio Alves <rcard...@linux.ibm.com> Date: Mon, 9 Dec 2019 14:40:53 -0300 Subject: [PATCH] Adler32 vector optimization for Power. This commit implements a Power (POWER8+) vector optimization for Adler32 checksum using VSX (vector) instructions. The VSX adler32 checksum is up to 10x fast than the adler32 baseline code. 
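The long comment at the top of adler32_power8.c below derives the identity the vector code relies on: over a 16-byte block, s1 grows by the plain sum of the bytes, while s2 grows by 16 times the old s1 plus a byte sum weighted 16,15,...,1 (the v_mul constant). A small scalar check of that identity, for illustration only and not part of the patch:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    const unsigned char c[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    uint32_t s1 = 1, s2 = 0;                 /* adler32 initial state */

    /* Reference: the byte-at-a-time definition. */
    uint32_t r1 = s1, r2 = s2;
    for (int i = 0; i < 16; i++) { r1 += c[i]; r2 += r1; }

    /* Blocked form exploited by the VSX code (reduction mod BASE comes later). */
    uint32_t sum = 0, wsum = 0;
    for (int i = 0; i < 16; i++) { sum += c[i]; wsum += (16 - i) * c[i]; }
    uint32_t b1 = s1 + sum;
    uint32_t b2 = s2 + 16 * s1 + wsum;

    assert(b1 == r1 && b2 == r2);            /* both formulations agree */
    return 0;
}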
Author: Rogerio Alves <rcard...@linux.ibm.com> --- CMakeLists.txt | 1 + Makefile.in | 8 ++ adler32.c | 11 ++ configure | 4 +- contrib/power/adler32_power8.c | 196 +++++++++++++++++++++++++++++++ contrib/power/adler32_resolver.c | 15 +++ contrib/power/power.h | 4 +- 7 files changed, 236 insertions(+), 3 deletions(-) create mode 100644 contrib/power/adler32_power8.c create mode 100644 contrib/power/adler32_resolver.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 581e1fa6d..c6296ee68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -185,6 +185,7 @@ if(CMAKE_COMPILER_IS_GNUCC) if(POWER8) add_definitions(-DZ_POWER8) set(ZLIB_POWER8 + contrib/power/adler32_power8.c contrib/power/crc32_z_power8.c) set_source_files_properties( diff --git a/Makefile.in b/Makefile.in index 16943044e..a0ffac860 100644 --- a/Makefile.in +++ b/Makefile.in @@ -165,6 +165,9 @@ minigzip64.o: $(SRCDIR)test/minigzip.c $(SRCDIR)zlib.h zconf.h adler32.o: $(SRCDIR)adler32.c $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)adler32.c +adler32_power8.o: $(SRCDIR)contrib/power/adler32_power8.c + $(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/adler32_power8.c + crc32.o: $(SRCDIR)crc32.c $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c @@ -216,6 +219,11 @@ adler32.lo: $(SRCDIR)adler32.c $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/adler32.o $(SRCDIR)adler32.c -@mv objs/adler32.o $@ +adler32_power8.lo: $(SRCDIR)contrib/power/adler32_power8.c + -@mkdir objs 2>/dev/null || test -d objs + $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/adler32_power8.o $(SRCDIR)contrib/power/adler32_power8.c + -@mv objs/adler32_power8.o $@ + crc32.lo: $(SRCDIR)crc32.c -@mkdir objs 2>/dev/null || test -d objs $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c diff --git a/adler32.c b/adler32.c index d0be4380a..4bde0fa18 100644 --- a/adler32.c +++ b/adler32.c @@ -131,6 +131,12 @@ uLong ZEXPORT adler32_z(adler, buf, len) } /* ========================================================================= */ + +#ifdef Z_POWER_OPT +/* Rename the default function to avoid naming conflicts */ +#define adler32 adler32_default +#endif /* Z_POWER_OPT */ + uLong ZEXPORT adler32(adler, buf, len) uLong adler; const Bytef *buf; @@ -139,6 +145,11 @@ uLong ZEXPORT adler32(adler, buf, len) return adler32_z(adler, buf, len); } +#ifdef Z_POWER_OPT +#undef adler32 +#include "contrib/power/adler32_resolver.c" +#endif /* Z_POWER_OPT */ + /* ========================================================================= */ local uLong adler32_combine_(adler1, adler2, len2) uLong adler1; diff --git a/configure b/configure index 914d9f4aa..810a7404d 100755 --- a/configure +++ b/configure @@ -879,8 +879,8 @@ if tryboth $CC -c $CFLAGS $test.c; then if tryboth $CC -c $CFLAGS -mcpu=power8 $test.c; then POWER8="-DZ_POWER8" - PIC_OBJC="${PIC_OBJC} crc32_z_power8.lo" - OBJC="${OBJC} crc32_z_power8.o" + PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo" + OBJC="${OBJC} adler32_power8.o crc32_z_power8.o" echo "Checking for -mcpu=power8 support... Yes." | tee -a configure.log else echo "Checking for -mcpu=power8 support... No." | tee -a configure.log diff --git a/contrib/power/adler32_power8.c b/contrib/power/adler32_power8.c new file mode 100644 index 000000000..473c39457 --- /dev/null +++ b/contrib/power/adler32_power8.c @@ -0,0 +1,196 @@ +/* + * Adler32 for POWER 8+ using VSX instructions. + * + * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector) + * instructions. 
+ * + * If adler32 do 1 byte at time on the first iteration s1 is s1_0 (_n means + * iteration n) is the initial value of adler - at start _0 is 1 unless + * adler initial value is different than 1. So s1_1 = s1_0 + c[0] after + * the first calculation. For the iteration s1_2 = s1_1 + c[1] and so on. + * Hence, for iteration N, s1_N = s1_(N-1) + c[N] is the value of s1 on + * after iteration N. + * + * Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_N + N*c[0] + + * N-1*c[1] + ... + c[N] + * + * In a more general way: + * + * s1_N = s1_0 + sum(i=1 to N)c[i] + * s2_N = s2_0 + N*s1 + sum (i=1 to N)(N-i+1)*c[i] + * + * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we + * can process N-bit at time we can do this at once. + * + * Since VSX can support 16-bit vector instructions, we can process + * 16-bit at time using N = 16 we have: + * + * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i] + * s2 = s2_16 = s2_0 + 16*s1 + sum(i=1 to 16)(16-i+1)*c[i] + * + * After the first iteration we calculate the adler32 checksum for 16 bytes. + * + * For more background about adler32 please check the RFC: + * https://www.ietf.org/rfc/rfc1950.txt + * + * Copyright (C) 2019 Rogerio Alves <rcard...@linux.ibm.com>, IBM + * For conditions of distribution and use, see copyright notice in zlib.h + * + */ + +#include "../../zutil.h" +#include <altivec.h> + +/* Largest prime smaller than 65536. */ +#define BASE 65521U +#define NMAX 5552 +/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1. */ + +#define DO1(s1,s2,buf,i) {(s1) += buf[(i)]; (s2) += (s1);} +#define DO2(s1,s2,buf,i) {DO1(s1,s2,buf,i); DO1(s1,s2,buf,i+1);} +#define DO4(s1,s2,buf,i) {DO2(s1,s2,buf,i); DO2(s1,s2,buf,i+2);} +#define DO8(s1,s2,buf,i) {DO4(s1,s2,buf,i); DO4(s1,s2,buf,i+4);} +#define DO16(s1,s2,buf) {DO8(s1,s2,buf,0); DO8(s1,s2,buf,8);} + +/* Vector across sum unsigned int (saturate). */ +inline vector unsigned int vec_sumsu (vector unsigned int __a, + vector unsigned int __b) +{ + __b = vec_sld(__a, __a, 8); + __b = vec_add(__b, __a); + __a = vec_sld(__b, __b, 4); + __a = vec_add(__a, __b); + + return __a; +} + +uLong ZLIB_INTERNAL _adler32_power8 (uLong adler, const Bytef* buf, uInt len) +{ + /* If buffer is empty or len=0 we need to return adler initial value. */ + if (buf == NULL) + return 1; + + unsigned int s1 = adler & 0xffff; + unsigned int s2 = (adler >> 16) & 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (len == 1) { + s1 += buf[0]; + if (s1 >= BASE) + s1 -= BASE; + s2 += s1; + if (s2 >= BASE) + s2 -= BASE; + return (s2 << 16) | s1; + } + + /* Keep it fast for short length buffers. */ + if (len < 16) { + while (len--) { + s1 += *buf++; + s2 += s1; + } + if (s1 >= BASE) + s1 -= BASE; + s2 %= BASE; + return (s2 << 16) | s1; + } + + /* This is faster than VSX code for len < 64. */ + if (len < 64) { + while (len >= 16) { + len -= 16; + DO16(s1,s2,buf); + buf += 16; + } + } else { + /* Use POWER VSX instructions for len >= 64. */ + const vector unsigned int v_zeros = { 0 }; + const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, + 6, 5, 4, 3, 2, 1}; + const vector unsigned char vsh = vec_splat_u8(4); + const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0}; + vector unsigned int vs1 = vec_xl(0, &s1); + vector unsigned int vs2 = vec_xl(0, &s2); + vector unsigned int vs1_save = { 0 }; + vector unsigned int vsum1, vsum2; + vector unsigned char vbuf; + int n; + + /* Zeros the undefined values of vectors vs1, vs2. 
*/ + vs1 = vec_and(vs1, vmask); + vs2 = vec_and(vs2, vmask); + + /* Do length bigger than NMAX in blocks of NMAX size. */ + while (len >= NMAX) { + len -= NMAX; + n = NMAX / 16; + do { + vbuf = vec_xl(0, (unsigned char *) buf); + vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */ + /* sum(i=1 to 16) buf[i]*(16-i+1). */ + vsum2 = vec_msum(vbuf, v_mul, v_zeros); + /* Save vs1. */ + vs1_save = vec_add(vs1_save, vs1); + /* Accumulate the sums. */ + vs1 = vec_add(vsum1, vs1); + vs2 = vec_add(vsum2, vs2); + + buf += 16; + } while (--n); + /* Once each block of NMAX size. */ + vs1 = vec_sumsu(vs1, vsum1); + vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */ + vs2 = vec_add(vs1_save, vs2); + vs2 = vec_sumsu(vs2, vsum2); + + /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */ + vs1[0] = vs1[0] % BASE; + /* vs2[0] = s2_i + 16*s1_save + + sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */ + vs2[0] = vs2[0] % BASE; + + vs1 = vec_and(vs1, vmask); + vs2 = vec_and(vs2, vmask); + vs1_save = v_zeros; + } + + /* len is less than NMAX one modulo is needed. */ + if (len >= 16) { + while (len >= 16) { + len -= 16; + + vbuf = vec_xl(0, (unsigned char *) buf); + + vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */ + /* sum(i=1 to 16) buf[i]*(16-i+1). */ + vsum2 = vec_msum(vbuf, v_mul, v_zeros); + /* Save vs1. */ + vs1_save = vec_add(vs1_save, vs1); + /* Accumulate the sums. */ + vs1 = vec_add(vsum1, vs1); + vs2 = vec_add(vsum2, vs2); + + buf += 16; + } + /* Since the size will be always less than NMAX we do this once. */ + vs1 = vec_sumsu(vs1, vsum1); + vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */ + vs2 = vec_add(vs1_save, vs2); + vs2 = vec_sumsu(vs2, vsum2); + } + /* Copy result back to s1, s2 (mod 65521). */ + s1 = vs1[0] % BASE; + s2 = vs2[0] % BASE; + } + + /* Process tail (len < 16). 
*/ + while (len--) { + s1 += *buf++; + s2 += s1; + } + s1 %= BASE; + s2 %= BASE; + + return (s2 << 16) | s1; +} diff --git a/contrib/power/adler32_resolver.c b/contrib/power/adler32_resolver.c new file mode 100644 index 000000000..07a1a2cb2 --- /dev/null +++ b/contrib/power/adler32_resolver.c @@ -0,0 +1,15 @@ +/* Copyright (C) 2019 Rogerio Alves <rcard...@linux.ibm.com>, IBM + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "../gcc/zifunc.h" +#include "power.h" + +Z_IFUNC(adler32) { +#ifdef Z_POWER8 + if (__builtin_cpu_supports("arch_2_07")) + return _adler32_power8; +#endif + + return adler32_default; +} diff --git a/contrib/power/power.h b/contrib/power/power.h index 79123aa90..f57c76167 100644 --- a/contrib/power/power.h +++ b/contrib/power/power.h @@ -2,7 +2,9 @@ * 2019 Rogerio Alves <rogerio.al...@ibm.com>, IBM * For conditions of distribution and use, see copyright notice in zlib.h */ - #include "../../zconf.h" +#include "../../zutil.h" + +uLong _adler32_power8(uLong adler, const Bytef* buf, uInt len); unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t); ++++++ zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch ++++++ >From 11b722e4ae91b611f605221587ec8e0829c27949 Mon Sep 17 00:00:00 2001 From: Matheus Castanho <m...@linux.ibm.com> Date: Tue, 23 Jun 2020 10:26:19 -0300 Subject: [PATCH] Fix invalid memory access on ppc and ppc64 --- contrib/power/adler32_power8.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/contrib/power/adler32_power8.c b/contrib/power/adler32_power8.c index 473c39457..fdd086453 100644 --- a/contrib/power/adler32_power8.c +++ b/contrib/power/adler32_power8.c @@ -110,16 +110,15 @@ uLong ZLIB_INTERNAL _adler32_power8 (uLong adler, const Bytef* buf, uInt len) 6, 5, 4, 3, 2, 1}; const vector unsigned char vsh = vec_splat_u8(4); const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0}; - vector unsigned int vs1 = vec_xl(0, &s1); - vector unsigned int vs2 = vec_xl(0, &s2); + vector unsigned int vs1 = { 0 }; + vector unsigned int vs2 = { 0 }; vector unsigned int vs1_save = { 0 }; vector unsigned int vsum1, vsum2; vector unsigned char vbuf; int n; - /* Zeros the undefined values of vectors vs1, vs2. */ - vs1 = vec_and(vs1, vmask); - vs2 = vec_and(vs2, vmask); + vs1[0] = s1; + vs2[0] = s2; /* Do length bigger than NMAX in blocks of NMAX size. */ while (len >= NMAX) {
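The defect addressed by this last patch is the pair of vec_xl(0, &s1) / vec_xl(0, &s2) loads: vec_xl always reads a full 16-byte vector, but s1 and s2 are 4-byte scalars, so each load reads 12 bytes past the variable (the stray lanes were masked off afterwards, but the overread itself is invalid and can be flagged by tools such as AddressSanitizer or valgrind). A minimal PowerPC-only sketch of the safe pattern the fix switches to, assuming a VSX-enabled build (e.g. -mcpu=power8); it is an illustration, not code from the patch:

#include <altivec.h>

vector unsigned int scalar_to_lane0(unsigned int s)
{
    /* old, unsafe: vector unsigned int v = vec_xl(0, &s);   16-byte read */
    vector unsigned int v = { 0 };   /* all four lanes start out as zero  */
    v[0] = s;                        /* only lane 0 carries the checksum  */
    return v;
}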