Script 'mail_helper' called by obssrc
Hello community,

here is the log from the commit of package zlib for openSUSE:Factory checked in at 2022-10-13 15:39:38
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/zlib (Old)
 and      /work/SRC/openSUSE:Factory/.zlib.new.2275 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "zlib"

Thu Oct 13 15:39:38 2022 rev:84 rq:1009787 version:1.2.12

Changes:
--------
--- /work/SRC/openSUSE:Factory/zlib/zlib.changes        2022-09-02 21:56:04.288228022 +0200
+++ /work/SRC/openSUSE:Factory/.zlib.new.2275/zlib.changes      2022-10-13 15:39:41.890435356 +0200
@@ -1,0 +2,10 @@
+Mon Oct 10 10:08:02 UTC 2022 - Danilo Spinella <danilo.spine...@suse.com>
+
+- Add Power8 optimizations:
+  * zlib-1.2.12-add-optimized-slide_hash-for-power.patch
+  * zlib-1.2.12-add-vectorized-longest_match-for-power.patch
+  * zlib-1.2.12-adler32-vector-optimizations-for-power.patch
+  * zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch
+- Update zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch
+
+-------------------------------------------------------------------

New:
----
  zlib-1.2.12-add-optimized-slide_hash-for-power.patch
  zlib-1.2.12-add-vectorized-longest_match-for-power.patch
  zlib-1.2.12-adler32-vector-optimizations-for-power.patch
  zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ zlib.spec ++++++
--- /var/tmp/diff_new_pack.Ly3d2k/_old  2022-10-13 15:39:42.650437055 +0200
+++ /var/tmp/diff_new_pack.Ly3d2k/_new  2022-10-13 15:39:42.654437063 +0200
@@ -44,12 +44,18 @@
 # The following patches are taken from https://github.com/iii-i/zlib/commits/crc32vx-v3
 Patch7:         zlib-1.2.5-minizip-fixuncrypt.patch
 Patch8:         zlib-1.2.11-optimized-s390.patch
+# https://github.com/iii-i/zlib/commit/171d0ff3c9ed40da0ac14085ab16b766b1162069
 Patch9:         zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch
 Patch10:        zlib-1.2.11-covscan-issues.patch
 Patch11:        zlib-1.2.11-covscan-issues-rhel9.patch
 Patch12:        zlib-1.2.12-optimized-crc32-power8.patch
 Patch13:        zlib-1.2.12-fix-configure.patch
 Patch14:        zlib-1.2.12-s390-vectorize-crc32.patch
+# The following patches are taken from https://github.com/mscastanho/zlib/commits/power-optimizations-1.2.12
+Patch15:        zlib-1.2.12-adler32-vector-optimizations-for-power.patch
+Patch16:        zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch
+Patch17:        zlib-1.2.12-add-optimized-slide_hash-for-power.patch
+Patch18:        zlib-1.2.12-add-vectorized-longest_match-for-power.patch
 BuildRequires:  autoconf
 BuildRequires:  automake
 BuildRequires:  libtool
@@ -148,6 +154,10 @@
 %patch12 -p1
 %patch13 -p1
 %patch14 -p1
+%patch15 -p1
+%patch16 -p1
+%patch17 -p1
+%patch18 -p1
 cp %{SOURCE4} .
 
 %build
@@ -167,10 +177,10 @@
 # Profiling flags break tests, as of 1.2.12
 # In particular, gzseek does not work as intended
 #%if %{do_profiling}
-#  #make %{?_smp_mflags} CFLAGS="%{optflags} %{cflags_profile_generate}"
+#  make %{?_smp_mflags} CFLAGS="%{optflags} %{cflags_profile_generate}"
 #  make check %{?_smp_mflags}
-#  #make %{?_smp_mflags} clean
-#  #make %{?_smp_mflags} CFLAGS="%{optflags} %{cflags_profile_feedback}"
+#  make %{?_smp_mflags} clean
+#  make %{?_smp_mflags} CFLAGS="%{optflags} %{cflags_profile_feedback}"
 #%else
   make %{?_smp_mflags}
 #%endif

++++++ zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch ++++++
++++ 682 lines (skipped)
++++ between /work/SRC/openSUSE:Factory/zlib/zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch
++++ and /work/SRC/openSUSE:Factory/.zlib.new.2275/zlib-1.2.12-IBM-Z-hw-accelerated-deflate-s390x.patch

++++++ zlib-1.2.12-add-optimized-slide_hash-for-power.patch ++++++
From 4a8d89ae49aa17d1634a2816c8d159f533a07eae Mon Sep 17 00:00:00 2001
From: Matheus Castanho <m...@linux.ibm.com>
Date: Wed, 27 Nov 2019 10:18:10 -0300
Subject: [PATCH] Add optimized slide_hash for Power

Considerable time is spent on deflate.c:slide_hash() during
deflate. This commit introduces a new slide_hash function that
uses VSX vector instructions to slide 8 hash elements at a time,
instead of just one as the standard code does.

The choice between the optimized and default versions is made only
on the first call to the function, enabling a fallback to standard
behavior if the host processor does not support VSX instructions,
so the same binary can be used for multiple Power processor
versions.
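
For orientation before reading the diff: in scalar form the operation being
vectorized is roughly the following (a minimal sketch; the names
slide_hash_scalar and wsize are illustrative, not taken from the patch):

    #include <stdint.h>

    typedef uint16_t Pos;   /* zlib's Posf is an unsigned 16-bit position */

    /* Slide every hash entry down by wsize, clamping at zero:
     * m >= wsize ? m - wsize : 0, one element per iteration. The patch
     * below performs this exact update on 8 entries at once with a
     * single VSX saturated subtraction (vec_subs). */
    static void slide_hash_scalar(Pos *table, unsigned n, unsigned wsize)
    {
        while (n--) {
            unsigned m = table[n];
            table[n] = (Pos)(m >= wsize ? m - wsize : 0);
        }
    }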

Author: Matheus Castanho <m...@linux.ibm.com>
---
 CMakeLists.txt                      |  3 +-
 Makefile.in                         |  8 ++++
 configure                           |  4 +-
 contrib/power/power.h               |  3 ++
 contrib/power/slide_hash_power8.c   | 63 +++++++++++++++++++++++++++++
 contrib/power/slide_hash_resolver.c | 15 +++++++
 deflate.c                           | 12 ++++++
 7 files changed, 105 insertions(+), 3 deletions(-)
 create mode 100644 contrib/power/slide_hash_power8.c
 create mode 100644 contrib/power/slide_hash_resolver.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 44de486f6..8208c626b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -186,7 +186,8 @@ if(CMAKE_COMPILER_IS_GNUCC)
                 add_definitions(-DZ_POWER8)
                 set(ZLIB_POWER8
                   contrib/power/adler32_power8.c
-                  contrib/power/crc32_z_power8.c)
+                  contrib/power/crc32_z_power8.c
+                  contrib/power/slide_hash_power8.c)
 
                 set_source_files_properties(
                     ${ZLIB_POWER8}
diff --git a/Makefile.in b/Makefile.in
index 9ef9fa9b5..f71c6eae0 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -183,6 +183,9 @@ crc32_z_power8.o: $(SRCDIR)contrib/power/crc32_z_power8.c
 deflate.o: $(SRCDIR)deflate.c
        $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
 
+slide_hash_power8.o: $(SRCDIR)contrib/power/slide_hash_power8.c
+       $(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/slide_hash_power8.c
+
 infback.o: $(SRCDIR)infback.c
        $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)infback.c
 
@@ -245,6 +248,11 @@ deflate.lo: $(SRCDIR)deflate.c
        $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
        -@mv objs/deflate.o $@
 
+slide_hash_power8.lo: $(SRCDIR)contrib/power/slide_hash_power8.c
+       -@mkdir objs 2>/dev/null || test -d objs
+       $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/slide_hash_power8.o $(SRCDIR)contrib/power/slide_hash_power8.c
+       -@mv objs/slide_hash_power8.o $@
+
 infback.lo: $(SRCDIR)infback.c
        -@mkdir objs 2>/dev/null || test -d objs
        $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/infback.o $(SRCDIR)infback.c
diff --git a/configure b/configure
index 810a7404d..d0dacf9c2 100755
--- a/configure
+++ b/configure
@@ -879,8 +879,8 @@ if tryboth $CC -c $CFLAGS $test.c; then
 
   if tryboth $CC -c $CFLAGS -mcpu=power8 $test.c; then
     POWER8="-DZ_POWER8"
-    PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo"
-    OBJC="${OBJC} adler32_power8.o crc32_z_power8.o"
+    PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo slide_hash_power8.lo"
+    OBJC="${OBJC} adler32_power8.o crc32_z_power8.o slide_hash_power8.o"
     echo "Checking for -mcpu=power8 support... Yes." | tee -a configure.log
   else
     echo "Checking for -mcpu=power8 support... No." | tee -a configure.log
diff --git a/contrib/power/power.h b/contrib/power/power.h
index f57c76167..28c8f78ca 100644
--- a/contrib/power/power.h
+++ b/contrib/power/power.h
@@ -4,7 +4,10 @@
  */
 #include "../../zconf.h"
 #include "../../zutil.h"
+#include "../../deflate.h"
 
 uLong _adler32_power8(uLong adler, const Bytef* buf, uInt len);
 
 unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t);
+
+void _slide_hash_power8(deflate_state *s);
diff --git a/contrib/power/slide_hash_power8.c b/contrib/power/slide_hash_power8.c
new file mode 100644
index 000000000..c5a0eb5a6
--- /dev/null
+++ b/contrib/power/slide_hash_power8.c
@@ -0,0 +1,63 @@
+ /* Copyright (C) 2019 Matheus Castanho <m...@linux.ibm.com>, IBM
+  * For conditions of distribution and use, see copyright notice in zlib.h
+  */
+
+#include <altivec.h>
+#include "../../deflate.h"
+
+local inline void slide_hash_power8_loop OF((deflate_state *s,
+      unsigned n_elems, Posf *table_end)) __attribute__((always_inline));
+
+local void slide_hash_power8_loop(
+    deflate_state *s,
+    unsigned n_elems,
+    Posf *table_end)
+{
+    vector unsigned short vw, vm, *vp;
+    unsigned chunks;
+
+    /* Each vector register (chunk) corresponds to 128 bits == 8 Posf,
+     * so instead of processing each of the n_elems in the hash table
+     * individually, we can do it in chunks of 8 with vector instructions.
+     *
+     * This function is only called from slide_hash_power8(), and both calls
+     * pass n_elems as a power of 2 higher than 2^7, as defined by
+     * deflateInit2_(), so n_elems will always be a multiple of 8. */
+    chunks = n_elems >> 3;
+    Assert(n_elems % 8 == 0, "Weird hash table size!");
+
+    /* This type casting is safe since s->w_size is always <= 64KB
+     * as defined by deflateInit2_() and Posf == unsigned short */
+    vw[0] = (Posf) s->w_size;
+    vw = vec_splat(vw,0);
+
+    vp = (vector unsigned short *) table_end;
+
+    do {
+        /* Processing 8 elements at a time */
+        vp--;
+        vm = *vp;
+
+        /* This is equivalent to: m >= w_size ? m - w_size : 0
+         * Since we are using a saturated unsigned subtraction, any
+         * values that are > w_size will be set to 0, while the others
+         * will be subtracted by w_size. */
+        *vp = vec_subs(vm,vw);
+    } while (--chunks);
+}
+
+void ZLIB_INTERNAL _slide_hash_power8(deflate_state *s)
+{
+    unsigned n;
+    Posf *p;
+
+    n = s->hash_size;
+    p = &s->head[n];
+    slide_hash_power8_loop(s,n,p);
+
+#ifndef FASTEST
+    n = s->w_size;
+    p = &s->prev[n];
+    slide_hash_power8_loop(s,n,p);
+#endif
+}
diff --git a/contrib/power/slide_hash_resolver.c b/contrib/power/slide_hash_resolver.c
new file mode 100644
index 000000000..54fa1eb21
--- /dev/null
+++ b/contrib/power/slide_hash_resolver.c
@@ -0,0 +1,15 @@
+/* Copyright (C) 2019 Matheus Castanho <m...@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../gcc/zifunc.h"
+#include "power.h"
+
+Z_IFUNC(slide_hash) {
+#ifdef Z_POWER8
+    if (__builtin_cpu_supports("arch_2_07"))
+        return _slide_hash_power8;
+#endif
+
+    return slide_hash_default;
+}
diff --git a/deflate.c b/deflate.c
index 799fb93cc..b2db576dc 100644
--- a/deflate.c
+++ b/deflate.c
@@ -196,6 +196,13 @@ local const config configuration_table[10] = {
                  (unsigned)(s->hash_size-1)*sizeof(*s->head)); \
     } while (0)
 
+#ifdef Z_POWER_OPT
+/* Rename function so resolver can use its symbol. The default version will be
+ * returned by the resolver if the host has no support for an optimized version.
+ */
+#define slide_hash slide_hash_default
+#endif /* Z_POWER_OPT */
+
 /* ===========================================================================
  * Slide the hash table when sliding the window down (could be avoided with 32
  * bit values at the expense of memory usage). We slide even when level == 0 to
@@ -227,6 +234,11 @@ local void slide_hash(s)
 #endif
 }
 
+#ifdef Z_POWER_OPT
+#undef slide_hash
+#include "contrib/power/slide_hash_resolver.c"
+#endif /* Z_POWER_OPT */
+
 /* ========================================================================= */
 int ZEXPORT deflateInit_(strm, level, version, stream_size)
     z_streamp strm;

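The Z_IFUNC macro used by slide_hash_resolver.c above comes from
contrib/gcc/zifunc.h, which is not part of this diff; it presumably expands
to GCC's ifunc machinery. A minimal self-contained sketch of the same
dispatch pattern, under that assumption (all names here are illustrative):

    /* Hypothetical stand-alone equivalent of the Z_IFUNC dispatch.
     * Assumes GCC on POWER (ifunc attribute + __builtin_cpu_supports). */
    typedef struct deflate_state deflate_state;

    static void slide_hash_c(deflate_state *s)   { (void)s; /* portable */ }
    static void slide_hash_vsx(deflate_state *s) { (void)s; /* VSX */ }

    /* Runs once when the symbol is first resolved; the dynamic linker
     * binds slide_hash to the returned pointer, so later calls pay no
     * dispatch cost. */
    static void (*resolve_slide_hash(void))(deflate_state *)
    {
        if (__builtin_cpu_supports("arch_2_07"))   /* POWER8, ISA 2.07 */
            return slide_hash_vsx;
        return slide_hash_c;
    }

    void slide_hash(deflate_state *s)
        __attribute__((ifunc("resolve_slide_hash")));

This also explains the #define slide_hash slide_hash_default dance in
deflate.c above: the portable function is renamed so the resolver can hand
out its symbol as the fallback.
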
++++++ zlib-1.2.12-add-vectorized-longest_match-for-power.patch ++++++
From aecdff0646c7e188b48f6db285d8d63a74f246c1 Mon Sep 17 00:00:00 2001
From: Matheus Castanho <m...@linux.ibm.com>
Date: Tue, 29 Oct 2019 18:04:11 -0300
Subject: [PATCH] Add vectorized longest_match for Power

This commit introduces an optimized version of the longest_match
function for Power processors. It uses VSX instructions to match
16 bytes at a time on each comparison, instead of one by one.

Author: Matheus Castanho <m...@linux.ibm.com>
---
 CMakeLists.txt                         |   3 +-
 Makefile.in                            |   8 +
 configure                              |   4 +-
 contrib/power/longest_match_power9.c   | 194 +++++++++++++++++++++++++
 contrib/power/longest_match_resolver.c |  15 ++
 contrib/power/power.h                  |   2 +
 deflate.c                              |  13 ++
 7 files changed, 236 insertions(+), 3 deletions(-)
 create mode 100644 contrib/power/longest_match_power9.c
 create mode 100644 contrib/power/longest_match_resolver.c

Index: zlib-1.2.12/CMakeLists.txt
===================================================================
--- zlib-1.2.12.orig/CMakeLists.txt
+++ zlib-1.2.12/CMakeLists.txt
@@ -199,7 +199,8 @@ if(CMAKE_COMPILER_IS_GNUCC)
 
             if(POWER9)
                 add_definitions(-DZ_POWER9)
-                set(ZLIB_POWER9 )
+                set(ZLIB_POWER9
+                    contrib/power/longest_match_power9.c)
 
                 set_source_files_properties(
                     ${ZLIB_POWER9}
Index: zlib-1.2.12/Makefile.in
===================================================================
--- zlib-1.2.12.orig/Makefile.in
+++ zlib-1.2.12/Makefile.in
@@ -189,6 +189,9 @@ crc32-vx.o: $(SRCDIR)contrib/s390/crc32-
 deflate.o: $(SRCDIR)deflate.c
        $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate.c
 
+longest_match_power9.o: $(SRCDIR)contrib/power/longest_match_power9.c
+       $(CC) $(CFLAGS) -mcpu=power9 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/longest_match_power9.c
+
 slide_hash_power8.o: $(SRCDIR)contrib/power/slide_hash_power8.c
       $(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/slide_hash_power8.c
 
@@ -259,6 +262,11 @@ deflate.lo: $(SRCDIR)deflate.c
        $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate.o $(SRCDIR)deflate.c
        -@mv objs/deflate.o $@
 
+longest_match_power9.lo: $(SRCDIR)contrib/power/longest_match_power9.c
+       -@mkdir objs 2>/dev/null || test -d objs
+       $(CC) $(SFLAGS) -mcpu=power9 $(ZINC) -DPIC -c -o objs/longest_match_power9.o $(SRCDIR)contrib/power/longest_match_power9.c
+       -@mv objs/longest_match_power9.o $@
+
 slide_hash_power8.lo: $(SRCDIR)contrib/power/slide_hash_power8.c
        -@mkdir objs 2>/dev/null || test -d objs
       $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/slide_hash_power8.o $(SRCDIR)contrib/power/slide_hash_power8.c
Index: zlib-1.2.12/configure
===================================================================
--- zlib-1.2.12.orig/configure
+++ zlib-1.2.12/configure
@@ -915,8 +915,8 @@ if tryboth $CC -c $CFLAGS $test.c; then
 
   if tryboth $CC -c $CFLAGS -mcpu=power9 $test.c; then
     POWER9="-DZ_POWER9"
-    PIC_OBJC="${PIC_OBJC}"
-    OBJC="${OBJC}"
+    PIC_OBJC="$PIC_OBJC longest_match_power9.lo"
+    OBJC="$OBJC longest_match_power9.o"
     echo "Checking for -mcpu=power9 support... Yes." | tee -a configure.log
   else
     echo "Checking for -mcpu=power9 support... No." | tee -a configure.log
Index: zlib-1.2.12/contrib/power/longest_match_power9.c
===================================================================
--- /dev/null
+++ zlib-1.2.12/contrib/power/longest_match_power9.c
@@ -0,0 +1,194 @@
+/* Copyright (C) 2019 Matheus Castanho <m...@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <altivec.h>
+#include "../../deflate.h"
+
+local inline int vec_match OF((Bytef* scan, Bytef* match))
+    __attribute__((always_inline));
+
+local inline int vec_match(Bytef* scan, Bytef* match)
+{
+    vector unsigned char vscan, vmatch, vc;
+    int len;
+
+    vscan  = *((vector unsigned char *) scan);
+    vmatch = *((vector unsigned char *) match);
+
+    /* Compare 16 bytes at a time.
+     * Each byte of vc will be either all ones or all zeroes,
+     * depending on the result of the comparison
+     */
+    vc = (vector unsigned char) vec_cmpne(vscan,vmatch);
+
+    /* Since matching bytes produce all-zero lanes in vc (because we used
+     * cmpne), counting the number of consecutive bytes whose LSB == 0 is
+     * the same as counting the length of the match.
+     *
+     * There was an issue in the way the vec_cnttz_lsbb builtin was
+     * implemented that was only fixed in GCC 12, so different builtins
+     * would be needed depending on the compiler version. To avoid that,
+     * let's use inline asm to generate the exact instruction we need.
+     */
+    #ifdef __LITTLE_ENDIAN__
+    asm volatile("vctzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc));
+    #else
+    asm volatile("vclzlsbb %0, %1\n\t" : "=r" (len) : "v" (vc));
+    #endif
+
+   return len;
+}
+
+uInt ZLIB_INTERNAL _longest_match_power9(deflate_state *s, IPos cur_match)
+{
+    unsigned chain_length = s->max_chain_length;/* max hash chain length */
+    register Bytef *scan = s->window + s->strstart; /* current string */
+    register Bytef *match;                      /* matched string */
+    register int len;                           /* length of current match */
+    int best_len = (int)s->prev_length;         /* best match length so far */
+    int nice_match = s->nice_match;             /* stop if match long enough */
+    int mbytes;                                 /* matched bytes inside loop */
+    IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
+        s->strstart - (IPos)MAX_DIST(s) : 0;
+    /* Stop when cur_match becomes <= limit. To simplify the code,
+     * we prevent matches with the string of window index 0.
+     */
+    Posf *prev = s->prev;
+    uInt wmask = s->w_mask;
+
+#if (MAX_MATCH == 258)
+    /* Compare the last two bytes at once. */
+    register Bytef *strend2 = s->window + s->strstart + MAX_MATCH - 2;
+    register ush scan_end   = *(ushf*)(scan+best_len-1);
+#else
+    register Bytef *strend = s->window + s->strstart + MAX_MATCH;
+    register Byte scan_end1  = scan[best_len-1];
+    register Byte scan_end   = scan[best_len];
+#endif
+
+    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
+     * It is easy to get rid of this optimization if necessary.
+     */
+    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
+
+    /* Do not waste too much time if we already have a good match: */
+    if (s->prev_length >= s->good_match) {
+        chain_length >>= 2;
+    }
+    /* Do not look for matches beyond the end of the input. This is necessary
+     * to make deflate deterministic.
+     */
+    if ((uInt)nice_match > s->lookahead) nice_match = (int)s->lookahead;
+
+    Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
+
+    do {
+        Assert(cur_match < s->strstart, "no future");
+        match = s->window + cur_match;
+
+        /* Skip to next match if the match length cannot increase
+         * or if the match length is less than 2.  Note that the checks below
+         * for insufficient lookahead only occur occasionally for performance
+         * reasons.  Therefore uninitialized memory will be accessed, and
+         * conditional jumps will be made that depend on those values.
+         * However the length of the match is limited to the lookahead, so
+         * the output of deflate is not affected by the uninitialized values.
+         */
+
+/* MAX_MATCH - 2 should be a multiple of 16 for this optimization to work. */
+#if (MAX_MATCH == 258)
+
+        /* Compare ending (2 bytes) and beginning of potential match.
+         *
+         * On Power processors, loading a 16-byte vector takes only 1 extra
+         * cycle compared to a regular byte load. So instead of comparing the
+         * first two bytes and then the rest later if they match, we can compare
+         * the first 16 at once, and when we have a match longer than 2, we will
+         * already have the result of comparing the first 16 bytes saved in mbytes.
+         */
+        if (*(ushf*)(match+best_len-1) != scan_end ||
+             (mbytes = vec_match(scan,match)) < 3) continue;
+
+        scan  += mbytes;
+        match += mbytes;
+
+        /* In case when we may have a match longer than 16, we perform further
+         * comparisons in chunks of 16 and keep going while all bytes match.
+         */
+        while(mbytes == 16) {
+            mbytes = vec_match(scan,match);
+            scan += mbytes;
+            match += mbytes;
+
+            /* We also have to limit the maximum match based on MAX_MATCH.
+             * Since we are comparing 16 bytes at a time and MAX_MATCH == 258 (to
+             * comply with the default implementation), we should stop comparing when
+             * we have matched 256 bytes, which happens when scan == strend2.
+             * In this ("rare") case, we have to check the remaining 2 bytes
+             * individually using common load and compare operations.
+             */
+            if(scan >= strend2) {
+                if(*scan == *match) {
+                    if(*++scan == *++match)
+                        scan++;
+                }
+                break;
+            }
+        }
+
+        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
+
+        len = (MAX_MATCH - 2) - (int)(strend2 - scan);
+        scan = strend2 - (MAX_MATCH - 2);
+
+#else /* MAX_MATCH == 258 */
+
+        if (match[best_len]   != scan_end  ||
+            match[best_len-1] != scan_end1 ||
+            *match            != *scan     ||
+            *++match          != scan[1])      continue;
+
+        /* The check at best_len-1 can be removed because it will be made
+         * again later. (This heuristic is not always a win.)
+         * It is not necessary to compare scan[2] and match[2] since they
+         * are always equal when the other bytes match, given that
+         * the hash keys are equal and that HASH_BITS >= 8.
+         */
+        scan += 2, match++;
+        Assert(*scan == *match, "match[2]?");
+
+        /* We check for insufficient lookahead only every 8th comparison;
+         * the 256th check will be made at strstart+258.
+         */
+        do {
+        } while (*++scan == *++match && *++scan == *++match &&
+                 *++scan == *++match && *++scan == *++match &&
+                 *++scan == *++match && *++scan == *++match &&
+                 *++scan == *++match && *++scan == *++match &&
+                 scan < strend);
+
+        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
+
+        len = MAX_MATCH - (int)(strend - scan);
+        scan = strend - MAX_MATCH;
+
+#endif /* MAX_MATCH == 258 */
+
+        if (len > best_len) {
+            s->match_start = cur_match;
+            best_len = len;
+            if (len >= nice_match) break;
+#if (MAX_MATCH == 258)
+            scan_end = *(ushf*)(scan+best_len-1);
+#else
+            scan_end1  = scan[best_len-1];
+            scan_end   = scan[best_len];
+#endif
+        }
+    } while ((cur_match = prev[cur_match & wmask]) > limit
+             && --chain_length != 0);
+
+    if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
+    return s->lookahead;
+}
Index: zlib-1.2.12/contrib/power/longest_match_resolver.c
===================================================================
--- /dev/null
+++ zlib-1.2.12/contrib/power/longest_match_resolver.c
@@ -0,0 +1,15 @@
+/* Copyright (C) 2019 Matheus Castanho <m...@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../gcc/zifunc.h"
+#include "power.h"
+
+Z_IFUNC(longest_match) {
+#ifdef Z_POWER9
+    if (__builtin_cpu_supports("arch_3_00"))
+        return _longest_match_power9;
+#endif
+
+    return longest_match_default;
+}
Index: zlib-1.2.12/contrib/power/power.h
===================================================================
--- zlib-1.2.12.orig/contrib/power/power.h
+++ zlib-1.2.12/contrib/power/power.h
@@ -10,4 +10,6 @@ uLong _adler32_power8(uLong adler, const
 
 unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t);
 
+uInt _longest_match_power9(deflate_state *s, IPos cur_match);
+
 void _slide_hash_power8(deflate_state *s);
Index: zlib-1.2.12/deflate.c
===================================================================
--- zlib-1.2.12.orig/deflate.c
+++ zlib-1.2.12/deflate.c
@@ -1309,6 +1309,14 @@ local void lm_init (s)
 /* For 80x86 and 680x0, an optimized version will be provided in match.asm or
  * match.S. The code will be functionally equivalent.
  */
+
+#ifdef Z_POWER_OPT
+/* Rename function so resolver can use its symbol. The default version will be
+ * returned by the resolver if the host has no support for an optimized version.
+ */
+#define longest_match longest_match_default
+#endif /* Z_POWER_OPT */
+
 local uInt longest_match(s, pcur_match)
     deflate_state *s;
     IPos pcur_match;                             /* current match */
@@ -1454,6 +1462,11 @@ local uInt longest_match(s, pcur_match)
 }
 #endif /* ASMV */
 
+#ifdef Z_POWER_OPT
+#undef longest_match
+#include "contrib/power/longest_match_resolver.c"
+#endif /* Z_POWER_OPT */
+
 #else /* FASTEST */
 
 /* ---------------------------------------------------------------------------

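The net effect of the vec_cmpne/vctzlsbb pair above is to compute the
length of the common prefix of two 16-byte blocks in two instructions
(vctzlsbb on little-endian, vclzlsbb on big-endian). A plain-C sketch of
the same computation, for reference only (not taken from the patch):

    /* Length of the common prefix of two 16-byte blocks, the scalar way.
     * In the patch, vec_cmpne yields 0x00 in every lane where the bytes
     * agree and 0xff where they differ, and v{ct,cl}zlsbb counts the
     * consecutive matching bytes from the start of the block in O(1). */
    static int match_len16(const unsigned char *scan,
                           const unsigned char *match)
    {
        int len = 0;
        while (len < 16 && scan[len] == match[len])
            len++;
        return len;
    }
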
++++++ zlib-1.2.12-adler32-vector-optimizations-for-power.patch ++++++
From 772f4bd0f880c4c193ab7da78728f38821572a02 Mon Sep 17 00:00:00 2001
From: Rogerio Alves <rcard...@linux.ibm.com>
Date: Mon, 9 Dec 2019 14:40:53 -0300
Subject: [PATCH] Adler32 vector optimization for Power.

This commit implements a Power (POWER8+) vector optimization for Adler32
checksum using VSX (vector) instructions. The VSX adler32 checksum is up
to 10x faster than the adler32 baseline code.

Author: Rogerio Alves <rcard...@linux.ibm.com>
---
 CMakeLists.txt                   |   1 +
 Makefile.in                      |   8 ++
 adler32.c                        |  11 ++
 configure                        |   4 +-
 contrib/power/adler32_power8.c   | 196 +++++++++++++++++++++++++++++++
 contrib/power/adler32_resolver.c |  15 +++
 contrib/power/power.h            |   4 +-
 7 files changed, 236 insertions(+), 3 deletions(-)
 create mode 100644 contrib/power/adler32_power8.c
 create mode 100644 contrib/power/adler32_resolver.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 581e1fa6d..c6296ee68 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -185,6 +185,7 @@ if(CMAKE_COMPILER_IS_GNUCC)
             if(POWER8)
                 add_definitions(-DZ_POWER8)
                 set(ZLIB_POWER8
+                  contrib/power/adler32_power8.c
                   contrib/power/crc32_z_power8.c)
 
                 set_source_files_properties(
diff --git a/Makefile.in b/Makefile.in
index 16943044e..a0ffac860 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -165,6 +165,9 @@ minigzip64.o: $(SRCDIR)test/minigzip.c $(SRCDIR)zlib.h zconf.h
 adler32.o: $(SRCDIR)adler32.c
        $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)adler32.c
 
+adler32_power8.o: $(SRCDIR)contrib/power/adler32_power8.c
+       $(CC) $(CFLAGS) -mcpu=power8 $(ZINC) -c -o $@ $(SRCDIR)contrib/power/adler32_power8.c
+
 crc32.o: $(SRCDIR)crc32.c
        $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c
 
@@ -216,6 +219,11 @@ adler32.lo: $(SRCDIR)adler32.c
        $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/adler32.o $(SRCDIR)adler32.c
        -@mv objs/adler32.o $@
 
+adler32_power8.lo: $(SRCDIR)contrib/power/adler32_power8.c
+       -@mkdir objs 2>/dev/null || test -d objs
+       $(CC) $(SFLAGS) -mcpu=power8 $(ZINC) -DPIC -c -o objs/adler32_power8.o $(SRCDIR)contrib/power/adler32_power8.c
+       -@mv objs/adler32_power8.o $@
+
 crc32.lo: $(SRCDIR)crc32.c
        -@mkdir objs 2>/dev/null || test -d objs
        $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c
diff --git a/adler32.c b/adler32.c
index d0be4380a..4bde0fa18 100644
--- a/adler32.c
+++ b/adler32.c
@@ -131,6 +131,12 @@ uLong ZEXPORT adler32_z(adler, buf, len)
 }
 
 /* ========================================================================= */
+
+#ifdef Z_POWER_OPT
+/* Rename the default function to avoid naming conflicts */
+#define adler32 adler32_default
+#endif /* Z_POWER_OPT */
+
 uLong ZEXPORT adler32(adler, buf, len)
     uLong adler;
     const Bytef *buf;
@@ -139,6 +145,11 @@ uLong ZEXPORT adler32(adler, buf, len)
     return adler32_z(adler, buf, len);
 }
 
+#ifdef Z_POWER_OPT
+#undef adler32
+#include "contrib/power/adler32_resolver.c"
+#endif /* Z_POWER_OPT */
+
 /* ========================================================================= */
 local uLong adler32_combine_(adler1, adler2, len2)
     uLong adler1;
diff --git a/configure b/configure
index 914d9f4aa..810a7404d 100755
--- a/configure
+++ b/configure
@@ -879,8 +879,8 @@ if tryboth $CC -c $CFLAGS $test.c; then
 
   if tryboth $CC -c $CFLAGS -mcpu=power8 $test.c; then
     POWER8="-DZ_POWER8"
-    PIC_OBJC="${PIC_OBJC} crc32_z_power8.lo"
-    OBJC="${OBJC} crc32_z_power8.o"
+    PIC_OBJC="${PIC_OBJC} adler32_power8.lo crc32_z_power8.lo"
+    OBJC="${OBJC} adler32_power8.o crc32_z_power8.o"
     echo "Checking for -mcpu=power8 support... Yes." | tee -a configure.log
   else
     echo "Checking for -mcpu=power8 support... No." | tee -a configure.log
diff --git a/contrib/power/adler32_power8.c b/contrib/power/adler32_power8.c
new file mode 100644
index 000000000..473c39457
--- /dev/null
+++ b/contrib/power/adler32_power8.c
@@ -0,0 +1,196 @@
+/*
+ * Adler32 for POWER 8+ using VSX instructions.
+ *
+ * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
+ * instructions.
+ *
+ * The scalar adler32 consumes one byte per iteration. Write s1_n and s2_n
+ * for the values of s1 and s2 after iteration n, where s1_0 and s2_0 come
+ * from the initial adler value (s1_0 = 1, s2_0 = 0 unless a different
+ * seed is passed in). Each iteration adds one input byte c[n]:
+ *
+ * s1_n = s1_(n-1) + c[n]
+ * s2_n = s2_(n-1) + s1_n
+ *
+ * Unrolling these recurrences over N iterations gives the closed forms:
+ *
+ * s1_N = s1_0 + sum(i=1 to N) c[i]
+ * s2_N = s2_0 + N*s1_0 + sum(i=1 to N) (N-i+1)*c[i]
+ *
+ * where s1_N, s2_N are the values of s1, s2 after N iterations. So if we
+ * can read N bytes at a time, we can compute both sums at once.
+ *
+ * VSX vector registers are 16 bytes wide, so we can process 16 bytes at a
+ * time using N = 16:
+ *
+ * s1 = s1_16 = s1_0 + sum(i=1 to 16) c[i]
+ * s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16) (16-i+1)*c[i]
+ *
+ * Hence each iteration of the vector loop advances the adler32 checksum
+ * by 16 bytes at once.
+ *
+ * For more background about adler32 please check the RFC:
+ * https://www.ietf.org/rfc/rfc1950.txt
+ *
+ * Copyright (C) 2019 Rogerio Alves <rcard...@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ */
+
+#include "../../zutil.h"
+#include <altivec.h>
+
+/* Largest prime smaller than 65536.  */
+#define BASE 65521U
+#define NMAX 5552
+/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1.  */
+
+#define DO1(s1,s2,buf,i)  {(s1) += buf[(i)]; (s2) += (s1);}
+#define DO2(s1,s2,buf,i)  {DO1(s1,s2,buf,i); DO1(s1,s2,buf,i+1);}
+#define DO4(s1,s2,buf,i)  {DO2(s1,s2,buf,i); DO2(s1,s2,buf,i+2);}
+#define DO8(s1,s2,buf,i)  {DO4(s1,s2,buf,i); DO4(s1,s2,buf,i+4);}
+#define DO16(s1,s2,buf)   {DO8(s1,s2,buf,0); DO8(s1,s2,buf,8);}
+
+/* Vector across sum unsigned int (saturate).  */
+inline vector unsigned int vec_sumsu (vector unsigned int __a,
+        vector unsigned int __b)
+{
+  __b = vec_sld(__a, __a, 8);
+  __b = vec_add(__b, __a);
+  __a = vec_sld(__b, __b, 4);
+  __a = vec_add(__a, __b);
+
+  return __a;
+}
+
+uLong ZLIB_INTERNAL _adler32_power8 (uLong adler, const Bytef* buf, uInt len)
+{
+  /* If the buffer is empty or len=0 we need to return the initial adler value.  */
+  if (buf == NULL)
+      return 1;
+
+  unsigned int s1 = adler & 0xffff;
+  unsigned int s2 = (adler >> 16) & 0xffff;
+
+  /* in case user likes doing a byte at a time, keep it fast */
+  if (len == 1) {
+      s1 += buf[0];
+      if (s1 >= BASE)
+          s1 -= BASE;
+      s2 += s1;
+      if (s2 >= BASE)
+          s2 -= BASE;
+      return (s2 << 16) | s1;
+  }
+
+  /* Keep it fast for short length buffers. */
+  if (len < 16) {
+      while (len--) {
+          s1 += *buf++;
+          s2 += s1;
+      }
+      if (s1 >= BASE)
+          s1 -= BASE;
+      s2 %= BASE;
+      return (s2 << 16) | s1;
+  }
+
+  /* This is faster than VSX code for len < 64.  */
+  if (len < 64) {
+      while (len >= 16) {
+          len -= 16;
+          DO16(s1,s2,buf);
+          buf += 16;
+      }
+  } else {
+      /* Use POWER VSX instructions for len >= 64. */
+      const vector unsigned int v_zeros = { 0 };
+      const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
+           6, 5, 4, 3, 2, 1};
+      const vector unsigned char vsh = vec_splat_u8(4);
+      const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
+      vector unsigned int vs1 = vec_xl(0, &s1);
+      vector unsigned int vs2 = vec_xl(0, &s2);
+      vector unsigned int vs1_save = { 0 };
+      vector unsigned int vsum1, vsum2;
+      vector unsigned char vbuf;
+      int n;
+
+      /* Zeros the undefined values of vectors vs1, vs2.  */
+      vs1 = vec_and(vs1, vmask);
+      vs2 = vec_and(vs2, vmask);
+
+      /* Process lengths bigger than NMAX in blocks of NMAX size.  */
+      while (len >= NMAX) {
+          len -= NMAX;
+          n = NMAX / 16;
+          do {
+             vbuf = vec_xl(0, (unsigned char *) buf);
+             vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i].  */
+             /* sum(i=1 to 16) buf[i]*(16-i+1).  */
+             vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+             /* Save vs1.  */
+             vs1_save = vec_add(vs1_save, vs1);
+             /* Accumulate the sums.  */
+             vs1 = vec_add(vsum1, vs1);
+             vs2 = vec_add(vsum2, vs2);
+
+             buf += 16;
+          } while (--n);
+          /* Once per block of NMAX size.  */
+          vs1 = vec_sumsu(vs1, vsum1);
+          vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save.  */
+          vs2 = vec_add(vs1_save, vs2);
+          vs2 = vec_sumsu(vs2, vsum2);
+
+          /* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521.  */
+          vs1[0] = vs1[0] % BASE;
+          /* vs2[0] = s2_i + 16*s1_save +
+             sum(i=1 to 16)(16-i+1)*buf[i] mod 65521.  */
+          vs2[0] = vs2[0] % BASE;
+
+          vs1 = vec_and(vs1, vmask);
+          vs2 = vec_and(vs2, vmask);
+          vs1_save = v_zeros;
+      }
+
+      /* len is less than NMAX, so only one modulo is needed.  */
+      if (len >= 16) {
+          while (len >= 16) {
+              len -= 16;
+
+              vbuf = vec_xl(0, (unsigned char *) buf);
+
+              vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i].  */
+              /* sum(i=1 to 16) buf[i]*(16-i+1).  */
+              vsum2 = vec_msum(vbuf, v_mul, v_zeros);
+              /* Save vs1.  */
+              vs1_save = vec_add(vs1_save, vs1);
+              /* Accumulate the sums.  */
+              vs1 = vec_add(vsum1, vs1);
+              vs2 = vec_add(vsum2, vs2);
+
+              buf += 16;
+          }
+          /* Since the size will always be less than NMAX we do this once.  */
+          vs1 = vec_sumsu(vs1, vsum1);
+          vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save.  */
+          vs2 = vec_add(vs1_save, vs2);
+          vs2 = vec_sumsu(vs2, vsum2);
+      }
+      /* Copy result back to s1, s2 (mod 65521).  */
+      s1 = vs1[0] % BASE;
+      s2 = vs2[0] % BASE;
+   }
+
+  /* Process tail (len < 16).  */
+  while (len--) {
+      s1 += *buf++;
+      s2 += s1;
+  }
+  s1 %= BASE;
+  s2 %= BASE;
+
+  return (s2 << 16) | s1;
+}
diff --git a/contrib/power/adler32_resolver.c b/contrib/power/adler32_resolver.c
new file mode 100644
index 000000000..07a1a2cb2
--- /dev/null
+++ b/contrib/power/adler32_resolver.c
@@ -0,0 +1,15 @@
+/* Copyright (C) 2019 Rogerio Alves <rcard...@linux.ibm.com>, IBM
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../gcc/zifunc.h"
+#include "power.h"
+
+Z_IFUNC(adler32) {
+#ifdef Z_POWER8
+    if (__builtin_cpu_supports("arch_2_07"))
+        return _adler32_power8;
+#endif
+
+    return adler32_default;
+}
diff --git a/contrib/power/power.h b/contrib/power/power.h
index 79123aa90..f57c76167 100644
--- a/contrib/power/power.h
+++ b/contrib/power/power.h
@@ -2,7 +2,9 @@
  *               2019 Rogerio Alves    <rogerio.al...@ibm.com>, IBM
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
-
 #include "../../zconf.h"
+#include "../../zutil.h"
+
+uLong _adler32_power8(uLong adler, const Bytef* buf, uInt len);
 
 unsigned long _crc32_z_power8(unsigned long, const Bytef *, z_size_t);

++++++ zlib-1.2.12-fix-invalid-memory-access-on-ppc-and-ppc64.patch ++++++
From 11b722e4ae91b611f605221587ec8e0829c27949 Mon Sep 17 00:00:00 2001
From: Matheus Castanho <m...@linux.ibm.com>
Date: Tue, 23 Jun 2020 10:26:19 -0300
Subject: [PATCH] Fix invalid memory access on ppc and ppc64

---
 contrib/power/adler32_power8.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/contrib/power/adler32_power8.c b/contrib/power/adler32_power8.c
index 473c39457..fdd086453 100644
--- a/contrib/power/adler32_power8.c
+++ b/contrib/power/adler32_power8.c
@@ -110,16 +110,15 @@ uLong ZLIB_INTERNAL _adler32_power8 (uLong adler, const Bytef* buf, uInt len)
            6, 5, 4, 3, 2, 1};
       const vector unsigned char vsh = vec_splat_u8(4);
       const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};
-      vector unsigned int vs1 = vec_xl(0, &s1);
-      vector unsigned int vs2 = vec_xl(0, &s2);
+      vector unsigned int vs1 = { 0 };
+      vector unsigned int vs2 = { 0 };
       vector unsigned int vs1_save = { 0 };
       vector unsigned int vsum1, vsum2;
       vector unsigned char vbuf;
       int n;
 
-      /* Zeros the undefined values of vectors vs1, vs2.  */
-      vs1 = vec_and(vs1, vmask);
-      vs2 = vec_and(vs2, vmask);
+      vs1[0] = s1;
+      vs2[0] = s2;
 
       /* Process lengths bigger than NMAX in blocks of NMAX size.  */
       while (len >= NMAX) {

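Judging from the diff, the bug was that vec_xl(0, &s1) issues a full
16-byte vector load from the address of a 4-byte scalar, reading 12 bytes
past it; that is undefined behavior and can fault when the scalar sits at
the end of a page. Zero-initializing the vector and writing a single lane
touches only the scalar. A sketch of the safe pattern (illustrative, not
patch code):

    #include <altivec.h>    /* build with -mcpu=power8 (or -maltivec) */

    static vector unsigned int load_scalar_lane(unsigned int s)
    {
        /* Avoids the out-of-bounds read done by vec_xl(0, &s). */
        vector unsigned int v = { 0 };
        v[0] = s;           /* only the 4 bytes of s are accessed */
        return v;
    }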