Hello community,

Here is the log from the commit of package zlib for openSUSE:Factory
checked in at Sun Apr 3 12:14:01 CEST 2011.



--------
--- zlib/zlib.changes   2011-01-09 14:33:39.000000000 +0100
+++ /mounts/work_src_done/STABLE/zlib/zlib.changes      2011-03-30 21:48:31.000000000 +0200
@@ -1,0 +2,11 @@
+Wed Mar 30 19:47:30 UTC 2011 - crrodrig...@opensuse.org
+
+- Update SSE2/MMX patches to version 2. 
+
+-------------------------------------------------------------------
+Tue Mar 15 22:38:32 UTC 2011 - crrodrig...@opensuse.org
+
+- Add highly experimental patches to use SSE2/SSSE3/MMX in zlib;
+  this makes the library up to 6 times faster.
+
+-------------------------------------------------------------------

calling whatdependson for head-i586


New:
----
  01-prepare.patch
  02-ppc_altivec.patch
  03-arm.patch
  04-x86.patch

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ zlib.spec ++++++
--- /var/tmp/diff_new_pack.Tg1qVv/_old  2011-04-03 12:13:22.000000000 +0200
+++ /var/tmp/diff_new_pack.Tg1qVv/_new  2011-04-03 12:13:22.000000000 +0200
@@ -1,5 +1,5 @@
 #
-# spec file for package zlib (Version 1.2.5)
+# spec file for package zlib
 #
 # Copyright (c) 2011 SUSE LINUX Products GmbH, Nuernberg, Germany.
 #
@@ -28,7 +28,7 @@
 %endif
 #
 Version:        1.2.5
-Release:        5
+Release:        11
 Summary:        Data Compression Library
 Url:            http://www.zlib.net/
 Source:         zlib-%{version}.tar.bz2
@@ -40,6 +40,10 @@
 Patch1:         zlib-lfs.patch
 # PATCH-FIX-JENGELH-PARALLEL-MAKE zlib-parallel.patch meiss...@novell.com -- shared library links with libz.a
 Patch2:         zlib-parallel.patch
+Patch3:         01-prepare.patch  
+Patch4:         02-ppc_altivec.patch  
+Patch5:         03-arm.patch  
+Patch6:         04-x86.patch
 BuildRoot:      %{_tmppath}/%{name}-%{version}-build
 BuildRequires:  pkgconfig
 
@@ -84,6 +88,10 @@
 %patch0
 %patch1
 %patch2 -p1
+%patch3
+%patch4
+%patch5
+%patch6
 
 %build
 # Marcus: breaks example64 in 32bit builds.

++++++ 01-prepare.patch ++++++
=== modified file 'Makefile.in'
--- Makefile.in 2011-03-14 01:01:37 +0000
+++ Makefile.in 2011-03-14 02:19:21 +0000
@@ -236,7 +236,8 @@
 
 # DO NOT DELETE THIS LINE -- make depend depends on it.
 
-adler32.o zutil.o: zutil.h zlib.h zconf.h
+adler32.o: adler32.c zutil.h zlib.h zconf.h
+zutil.o: zutil.h zlib.h zconf.h
 gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
 compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
 crc32.o: zutil.h zlib.h zconf.h crc32.h
@@ -246,7 +247,8 @@
 inftrees.o: zutil.h zlib.h zconf.h inftrees.h
 trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
 
-adler32.lo zutil.lo: zutil.h zlib.h zconf.h
+adler32.lo: adler32.c zutil.h zlib.h zconf.h
+zutil.lo: zutil.h zlib.h zconf.h
 gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
 compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h
 crc32.lo: zutil.h zlib.h zconf.h crc32.h

=== modified file 'adler32.c'
--- adler32.c   2011-03-14 01:01:37 +0000
+++ adler32.c   2011-03-30 13:38:42 +0000
@@ -9,6 +9,35 @@
 
 #define local static
 
+#define GCC_VERSION_GE(x) ((__GNUC__-0) * 100 + __GNUC_MINOR__-0 >= x)
+
+#if GCC_VERSION_GE(301)
+/* sometimes leaks out of old kernel headers */
+#  undef noinline
+#  define noinline __attribute__((__noinline__))
+#else
+#  ifndef noinline
+#    define noinline
+#  endif
+#endif
+
+#if GCC_VERSION_GE(301)
+# define GCC_ATTR_UNUSED_PARAM __attribute__((__unused__))
+#else
+# define GCC_ATTR_UNUSED_PARAM
+#endif
+
+#if GCC_VERSION_GE(296)
+#  define likely(x)   __builtin_expect(!!(x), 1)
+#  define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+#  define likely(x)   (x)
+#  define unlikely(x) (x)
+#endif
+
+#define ROUND_TO(x , n) ((x) & ~((n) - 1L))
+#define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))
+
 local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2);
 
 #define BASE 65521UL    /* largest prime smaller than 65536 */
@@ -21,9 +50,20 @@
 #define DO8(buf,i)  DO4(buf,i); DO4(buf,i+4);
 #define DO16(buf)   DO8(buf,0); DO8(buf,8);
 
+#if defined(__alpha__)
+/* even if gcc can generate a mul by inverse, the code is really
+ * ugly (find global const pool pointer, load constant, a mul, lots
+ * of shifts/add/sub), up to 14 instructions. The replacement code
+ * only needs >= 5 instructions
+ */
+#  define NO_DIVIDE
+#endif
+
 /* use NO_DIVIDE if your processor does not do division in hardware */
 #ifdef NO_DIVIDE
-#  define MOD(a) \
+/* use NO_SHIFT if your processor implements shifts > 1 as a loop */
+#  ifdef NO_SHIFT
+#    define reduce_full(a) \
     do { \
         if (a >= (BASE << 16)) a -= (BASE << 16); \
         if (a >= (BASE << 15)) a -= (BASE << 15); \
@@ -43,21 +83,237 @@
         if (a >= (BASE << 1)) a -= (BASE << 1); \
         if (a >= BASE) a -= BASE; \
     } while (0)
-#  define MOD4(a) \
+#    define reduce_x(a) \
     do { \
+        if (MIN_WORK >= (1 << 6) && a >= (BASE << 6)) a -= (BASE << 6); \
+        if (MIN_WORK >= (1 << 5) && a >= (BASE << 5)) a -= (BASE << 5); \
         if (a >= (BASE << 4)) a -= (BASE << 4); \
         if (a >= (BASE << 3)) a -= (BASE << 3); \
         if (a >= (BASE << 2)) a -= (BASE << 2); \
         if (a >= (BASE << 1)) a -= (BASE << 1); \
         if (a >= BASE) a -= BASE; \
     } while (0)
+#    define reduce(a) reduce_full(a)
+#  else
+#    define reduce_full(a) \
+    do { \
+        unsigned long b = a & 0x0000ffff; \
+        a >>= 16; \
+        b -= a; \
+        a <<= 4; \
+        a += b; \
+    } while(a >= BASE)
+#    define reduce_x(a) \
+    do { \
+        unsigned long b = a & 0x0000ffff; \
+        a >>= 16; \
+        b -= a; \
+        a <<= 4; \
+        a += b; \
+        a = a >= BASE ? a - BASE : a; \
+    } while(0)
+#    define reduce(a) \
+    do { \
+        unsigned long b = a & 0x0000ffff; \
+        a >>= 16; \
+        b -= a; \
+        a <<= 4; \
+        a += b; \
+    } while(0)
+#  endif
 #else
-#  define MOD(a) a %= BASE
-#  define MOD4(a) a %= BASE
-#endif
-
-/* ========================================================================= */
-uLong ZEXPORT adler32(adler, buf, len)
+#  define reduce_full(a) a %= BASE
+#  define reduce_x(a) a %= BASE
+#  define reduce(a) a %= BASE
+#endif
+
+local int host_is_bigendian()
+{
+    local const union {
+        uInt d;
+        unsigned char endian[sizeof(uInt)];
+    } x = {1};
+    return x.endian[0] == 0;
+}
+
+#ifndef MIN_WORK
+#  define MIN_WORK 16
+#endif
+
+/* ========================================================================= */
+local noinline uLong adler32_1(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len GCC_ATTR_UNUSED_PARAM;
+{
+    unsigned long sum2;
+
+    /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    adler += buf[0];
+    if (adler >= BASE)
+        adler -= BASE;
+    sum2 += adler;
+    if (sum2 >= BASE)
+        sum2 -= BASE;
+    return adler | (sum2 << 16);
+}
+
+/* ========================================================================= */
+local noinline uLong adler32_common(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned long sum2;
+
+    /* split Adler-32 into component sums */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    while (len--) {
+        adler += *buf++;
+        sum2 += adler;
+    }
+    if (adler >= BASE)
+        adler -= BASE;
+    reduce_x(sum2);             /* only added so many BASE's */
+    return adler | (sum2 << 16);
+}
+
+#ifndef HAVE_ADLER32_VEC
+#  if (defined(__LP64__) || ((SIZE_MAX-0) >> 31) >= 2) && !defined(NO_ADLER32_VEC)
+
+/* On 64 Bit archs, we can do pseudo SIMD with a nice win.
+ * This is esp. important for old Alphas, which do not have byte
+ * access.
+ * This needs some registers, but x86_64 is fine (>= 9 for the mainloop
+ * req.). If your 64 Bit arch is more limited, throw it away...
+ */
+#    ifndef UINT64_C
+#      if defined(_MSC_VER) || defined(__BORLANDC__)
+#        define UINT64_C(c)    (c ## ui64)
+#      else
+#        define UINT64_C(c)    (c ## ULL)
+#      endif
+#    endif
+
+#    undef VNMAX
+#    define VNMAX (2*NMAX+((9*NMAX)/10))
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned int s1, s2;
+    unsigned int k;
+
+    /* split Adler-32 into component sums */
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+    /* align input data */
+    k    = ALIGN_DIFF(buf, sizeof(size_t));
+    len -= k;
+    if (k) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while(--k);
+
+    k = len > VNMAX ? VNMAX : len;
+    len -= k;
+    if (likely(k >= 2 * sizeof(size_t))) do
+    {
+        unsigned int vs1, vs2;
+        unsigned int vs1s;
+
+        /* add s1 to s2 for rounds to come */
+        s2 += s1 * ROUND_TO(k, sizeof(size_t));
+        vs1s = vs1 = vs2 = 0;
+        do {
+            size_t vs1l = 0, vs1h = 0, vs1l_s = 0, vs1h_s = 0;
+            unsigned int a, b, c, d, e, f, g, h;
+            unsigned int j;
+
+            j = k > 23 * sizeof(size_t) ? 23 : k/sizeof(size_t);
+            k -= j * sizeof(size_t);
+            /* add s1 to s1 round sum for rounds to come */
+            vs1s += j * vs1;
+            do {
+                size_t in8 = *(const size_t *)buf;
+                buf += sizeof(size_t);
+                /* add this s1 to s1 round sum */
+                vs1l_s += vs1l;
+                vs1h_s += vs1h;
+                /* add up input data to s1 */
+                vs1l +=  in8 & UINT64_C(0x00ff00ff00ff00ff);
+                vs1h += (in8 & UINT64_C(0xff00ff00ff00ff00)) >> 8;
+            } while(--j);
+
+            /* split s1 */
+            if(host_is_bigendian()) {
+                a = (vs1h >> 48) & 0x0000ffff;
+                b = (vs1l >> 48) & 0x0000ffff;
+                c = (vs1h >> 32) & 0x0000ffff;
+                d = (vs1l >> 32) & 0x0000ffff;
+                e = (vs1h >> 16) & 0x0000ffff;
+                f = (vs1l >> 16) & 0x0000ffff;
+                g = (vs1h      ) & 0x0000ffff;
+                h = (vs1l      ) & 0x0000ffff;
+            } else {
+                a = (vs1l      ) & 0x0000ffff;
+                b = (vs1h      ) & 0x0000ffff;
+                c = (vs1l >> 16) & 0x0000ffff;
+                d = (vs1h >> 16) & 0x0000ffff;
+                e = (vs1l >> 32) & 0x0000ffff;
+                f = (vs1h >> 32) & 0x0000ffff;
+                g = (vs1l >> 48) & 0x0000ffff;
+                h = (vs1h >> 48) & 0x0000ffff;
+            }
+
+            /* add s1 & s2 horiz. */
+            vs2 += 8*a + 7*b + 6*c + 5*d + 4*e + 3*f + 2*g + 1*h;
+            vs1 += a + b + c + d + e + f + g + h;
+
+            /* split and add up s1 round sum */
+            vs1l_s = ((vs1l_s      ) & UINT64_C(0x0000ffff0000ffff)) +
+                     ((vs1l_s >> 16) & UINT64_C(0x0000ffff0000ffff));
+            vs1h_s = ((vs1h_s      ) & UINT64_C(0x0000ffff0000ffff)) +
+                     ((vs1h_s >> 16) & UINT64_C(0x0000ffff0000ffff));
+            vs1l_s += vs1h_s;
+            vs1s += ((vs1l_s      ) & UINT64_C(0x00000000ffffffff)) +
+                    ((vs1l_s >> 32) & UINT64_C(0x00000000ffffffff));
+        } while (k >= sizeof(size_t));
+        reduce(vs1s);
+        s2 += vs1s * 8 + vs2;
+        reduce(s2);
+        s1 += vs1;
+        reduce(s1);
+        len += k;
+        k = len > VNMAX ? VNMAX : len;
+        len -= k;
+    } while (k >= sizeof(size_t));
+
+    /* handle trailer */
+    if (k) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--k);
+    reduce(s1);
+    reduce(s2);
+
+    /* return recombined sums */
+    return (s2 << 16) | s1;
+}
+
+#  else
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
     uLong adler;
     const Bytef *buf;
     uInt len;
@@ -69,33 +325,6 @@
     sum2 = (adler >> 16) & 0xffff;
     adler &= 0xffff;
 
-    /* in case user likes doing a byte at a time, keep it fast */
-    if (len == 1) {
-        adler += buf[0];
-        if (adler >= BASE)
-            adler -= BASE;
-        sum2 += adler;
-        if (sum2 >= BASE)
-            sum2 -= BASE;
-        return adler | (sum2 << 16);
-    }
-
-    /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (buf == Z_NULL)
-        return 1L;
-
-    /* in case short lengths are provided, keep it somewhat fast */
-    if (len < 16) {
-        while (len--) {
-            adler += *buf++;
-            sum2 += adler;
-        }
-        if (adler >= BASE)
-            adler -= BASE;
-        MOD4(sum2);             /* only added so many BASE's */
-        return adler | (sum2 << 16);
-    }
-
     /* do length NMAX blocks -- requires just one modulo operation */
     while (len >= NMAX) {
         len -= NMAX;
@@ -104,8 +333,8 @@
             DO16(buf);          /* 16 sums unrolled */
             buf += 16;
         } while (--n);
-        MOD(adler);
-        MOD(sum2);
+        reduce_full(adler);
+        reduce_full(sum2);
     }
 
     /* do remaining bytes (less than NMAX, still just one modulo) */
@@ -119,13 +348,36 @@
             adler += *buf++;
             sum2 += adler;
         }
-        MOD(adler);
-        MOD(sum2);
+        reduce_full(adler);
+        reduce_full(sum2);
     }
 
     /* return recombined sums */
     return adler | (sum2 << 16);
 }
+#  endif
+#endif
+
+/* ========================================================================= */
+uLong ZEXPORT adler32(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (len == 1)
+        return adler32_1(adler, buf, len); /* should create a fast tailcall */
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (buf == Z_NULL)
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (len < MIN_WORK)
+        return adler32_common(adler, buf, len);
+
+    return adler32_vec(adler, buf, len);
+}
 
 /* ========================================================================= */
 local uLong adler32_combine_(adler1, adler2, len2)
@@ -141,7 +393,7 @@
     rem = (unsigned)(len2 % BASE);
     sum1 = adler1 & 0xffff;
     sum2 = rem * sum1;
-    MOD(sum2);
+    reduce_full(sum2);
     sum1 += (adler2 & 0xffff) + BASE - 1;
     sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;
     if (sum1 >= BASE) sum1 -= BASE;
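
The shift-based reduce()/reduce_full() macros introduced above replace "a %= BASE" with a fold of the high 16 bits: since 65536 mod 65521 is 15, a value a = (a >> 16) * 65536 + (a & 0xffff) is congruent to (a >> 16) * 15 + (a & 0xffff), which is what the b -= a; a <<= 4; a += b sequence computes. Below is a standalone sketch of that identity (not part of the patch), with a self-test against the plain modulo:

    #include <stdio.h>

    #define BASE 65521UL    /* largest prime smaller than 65536 */

    /* fold the high 16 bits back in until the value fits in 16 bits,
     * then at most one subtraction remains; 65536 == 15 (mod BASE) */
    static unsigned long reduce_sketch(unsigned long a)
    {
        while (a >= 65536UL)
            a = (a & 0xffff) + (a >> 16) * 15;
        return a >= BASE ? a - BASE : a;
    }

    int main(void)
    {
        unsigned long a;

        for (a = 0; a < (BASE << 8); a += 977)   /* arbitrary stride */
            if (reduce_sketch(a) != a % BASE) {
                printf("mismatch at %lu\n", a);
                return 1;
            }
        printf("fold-based reduction matches %% BASE\n");
        return 0;
    }
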
++++++ 02-ppc_altivec.patch ++++++
=== modified file 'Makefile.in'
--- Makefile.in 2011-03-14 02:19:21 +0000
+++ Makefile.in 2011-03-14 03:06:03 +0000
@@ -236,7 +236,7 @@
 
 # DO NOT DELETE THIS LINE -- make depend depends on it.
 
-adler32.o: adler32.c zutil.h zlib.h zconf.h
+adler32.o: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
 zutil.o: zutil.h zlib.h zconf.h
 gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
 compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
@@ -247,7 +247,7 @@
 inftrees.o: zutil.h zlib.h zconf.h inftrees.h
 trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
 
-adler32.lo: adler32.c zutil.h zlib.h zconf.h
+adler32.lo: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
 zutil.lo: zutil.h zlib.h zconf.h
 gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
 compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h

=== modified file 'adler32.c'
--- adler32.c   2011-03-30 13:38:42 +0000
+++ adler32.c   2011-03-30 13:38:46 +0000
@@ -36,7 +36,10 @@
 #endif
 
 #define ROUND_TO(x , n) ((x) & ~((n) - 1L))
+#define DIV_ROUNDUP(a, b) (((a) + (b) - 1) / (b))
+#define ALIGN_DIFF(x, n) (((intptr_t)((x)+(n) - 1L) & ~((intptr_t)(n) - 1L)) - (intptr_t)(x))
+#define ALIGN_DOWN(x, n) (((intptr_t)(x)) & ~((intptr_t)(n) - 1L))
+#define ALIGN_DOWN_DIFF(x, n) (((intptr_t)(x)) & ((intptr_t)(n) - 1L))
 
 local uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2);
 
@@ -136,6 +139,12 @@
     return x.endian[0] == 0;
 }
 
+#ifndef NO_ADLER32_VEC
+#  if defined(__powerpc__) || defined(__powerpc64__)
+#    include "adler32_ppc.c"
+#  endif
+#endif
+
 #ifndef MIN_WORK
 #  define MIN_WORK 16
 #endif

=== added file 'adler32_ppc.c'
--- adler32_ppc.c       1970-01-01 00:00:00 +0000
+++ adler32_ppc.c       2011-03-30 11:12:04 +0000
@@ -0,0 +1,253 @@
+/*
+ * adler32.c -- compute the Adler-32 checksum of a data stream
+ *   ppc implementation
+ * Copyright (C) 1995-2007 Mark Adler
+ * Copyright (C) 2009-2011 Jan Seiffert
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+/*
+ * We use the Altivec PIM vector stuff, but still, this is only
+ * tested with GCC, and prop. uses some GCC specifics (like GCC
+ * understands vector types and you can simply write a += b)
+ */
+#if defined(__ALTIVEC__) && defined(__GNUC__)
+# define HAVE_ADLER32_VEC
+/* it needs some bytes till the vec version gets up to speed... */
+# define MIN_WORK 64
+# include <altivec.h>
+
+/*
+ * Depending on length, this can be slower (short length < 64 bytes),
+ * much faster (our beloved 128kb 22.2s generic to 3.4s vec, but cache
+ * is important...), to a little faster (very long length, 1.6MB, 47.6s
+ * to 36s), which is probably only capped by memory bandwidth.
+ * (The orig. 128k case was slower in AltiVec, because AltiVec loads
+ * are always uncached and trigger no HW prefetching, because that is
+ * what you often need with mass data manipulation (not poisen your
+ * cache, movntq), instead you have to do it for yourself (data stream
+ * touch). With 128k it could be cleanly seen: no prefetch, half as slow
+ * as generic, but comment out the memory load -> 3s. With proper prefetch
+ * we are at 3.4s. So AltiVec can execute these "expensive" FMA quite
+ * fast (even without fancy unrolling), only the data does not arrive
+ * fast enough. In cases where the working set does not fit into cache
+ * it simply cannot be delivered fast enough over the FSB/Mem).
+ * Still we have to prefetch, or we are slow as hell.
+ */
+
+# define SOVUC (sizeof(vector unsigned char))
+
+/* can probably be more, since we do not have the x86 psadbw 64 bit sum */
+# define VNMAX (6*NMAX)
+
+/* ========================================================================= */
+local inline vector unsigned char vec_identl(level)
+    unsigned int level;
+{
+    return vec_lvsl(level, (const unsigned char *)0);
+}
+
+/* ========================================================================= */
+local inline vector unsigned char vec_ident_rev(void)
+{
+    return vec_xor(vec_identl(0), vec_splat_u8(15));
+}
+
+/* ========================================================================= */
+/* multiply two 32 bit ints, return the low 32 bit */
+local inline vector unsigned int vec_mullw(vector unsigned int a, vector unsigned int b)
+{
+    vector unsigned int v16   = vec_splat_u32(-16);
+    vector unsigned int v0_32 = vec_splat_u32(0);
+    vector unsigned int swap, low, high;
+
+    swap = vec_rl(b, v16);
+    low  = vec_mulo((vector unsigned short)a, (vector unsigned short)b);
+    high = vec_msum((vector unsigned short)a, (vector unsigned short)swap, v0_32);
+    high = vec_sl(high, v16);
+    return vec_add(low, high);
+}
+
+/* ========================================================================= */
+local inline vector unsigned int vector_reduce(vector unsigned int x)
+{
+    vector unsigned int y;
+    vector unsigned int vsh;
+
+    vsh = vec_splat_u32(1);
+    vsh = vec_sl(vsh, vec_splat_u32(4));
+
+    y = vec_sl(x, vsh);
+    y = vec_sr(y, vsh);
+    x = vec_sr(x, vsh);
+    y = vec_sub(y, x);
+    x = vec_sl(x, vec_splat_u32(4));
+    x = vec_add(x, y);
+    return x;
+}
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned int s1, s2;
+
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+    if (likely(len >= 2*SOVUC)) {
+        vector unsigned int v0_32 = vec_splat_u32(0);
+        vector unsigned int   vsh = vec_splat_u32(4);
+        vector unsigned char   v1 = vec_splat_u8(1);
+        vector unsigned char vord;
+        vector unsigned char   v0 = vec_splat_u8(0);
+        vector unsigned int vs1, vs2;
+        vector unsigned char in16, vord_a, v1_a, vperm;
+        unsigned int f, n;
+        unsigned int k, block_num;
+
+        /*
+         * if i understand the Altivec PEM right, little
+         * endian impl. should have the data reversed on
+         * load, so the big endian vorder works.
+         */
+        vord = vec_ident_rev() + v1;
+        block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */
+        f  = 512;
+        f |= block_num >= 256 ? 0 : block_num << 16;
+        vec_dst(buf, f, 2);
+        /*
+         * Add stuff to achieve alignment
+         */
+        /* swizzle masks in place */
+        vperm  = vec_lvsl(0, buf);
+        vord_a = vec_perm(vord, v0, vperm);
+        v1_a   = vec_perm(v1, v0, vperm);
+        vperm  = vec_lvsr(0, buf);
+        vord_a = vec_perm(v0, vord_a, vperm);
+        v1_a   = vec_perm(v0, v1_a, vperm);
+
+        /* align hard down */
+        f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUC);
+        n = SOVUC - f;
+        buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUC);
+
+        /* add n times s1 to s2 for start round */
+        s2 += s1 * n;
+
+        /* set sums 0 */
+        vs1 = v0_32;
+        vs2 = v0_32;
+
+        k = len < VNMAX ? (unsigned)len : VNMAX;
+        len -= k;
+
+        /* insert scalar start somewhere */
+        vs1 = vec_lde(0, &s1);
+        vs2 = vec_lde(0, &s2);
+
+        /* get input data */
+        in16 = vec_ldl(0, buf);
+
+        /* mask out excess data, add 4 byte horizontal and add to old dword */
+        vs1 = vec_msum(in16, v1_a, vs1);
+
+        /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */
+        vs2 = vec_msum(in16, vord_a, vs2);
+
+        buf += SOVUC;
+        k -= n;
+
+        if (likely(k >= SOVUC)) do {
+            vector unsigned int vs1_r = v0_32;
+            f  = 512;
+            f |= block_num >= 256 ? 0 : block_num << 16;
+            vec_dst(buf, f, 2);
+
+            do {
+                /* get input data */
+                in16 = vec_ldl(0, buf);
+
+                /* add vs1 for this round */
+                vs1_r += vs1;
+
+                /* add 4 byte horizontal and add to old dword */
+                vs1 = vec_sum4s(in16, vs1);
+                /* apply order, add 4 byte horizontal and add to old dword */
+                vs2 = vec_msum(in16, vord, vs2);
+
+                buf += SOVUC;
+                k -= SOVUC;
+            } while (k >= SOVUC);
+            /* reduce vs1 round sum before multiplying by 16 */
+            vs1_r = vector_reduce(vs1_r);
+            /* add all vs1 for 16 times */
+            vs2 += vec_sl(vs1_r, vsh);
+            /* reduce the vectors to something in the range of BASE */
+            vs2 = vector_reduce(vs2);
+            vs1 = vector_reduce(vs1);
+            len += k;
+            k = len < VNMAX ? (unsigned)len : VNMAX;
+            block_num = DIV_ROUNDUP(len, 512); /* 32 block size * 16 bytes */
+            len -= k;
+        } while (likely(k >= SOVUC));
+
+        if (likely(k)) {
+            vector unsigned int vk;
+            /*
+             * handle trailer
+             */
+            f = SOVUC - k;
+            /* swizzle masks in place */
+            vperm  = vec_identl(f);
+            vord_a = vec_perm(vord, v0, vperm);
+            v1_a   = vec_perm(v1, v0, vperm);
+
+            /* add k times vs1 for this trailer */
+            vk = (vector unsigned int)vec_lvsl(0, (unsigned *)(intptr_t)k);
+            vk = (vector unsigned)vec_mergeh(v0, (vector unsigned char)vk);
+            vk = (vector unsigned)vec_mergeh((vector unsigned short)v0, (vector unsigned short)vk);
+            vk = vec_splat(vk, 0);
+            vs2 += vec_mullw(vs1, vk);
+
+            /* get input data */
+            in16 = vec_ldl(0, buf);
+
+            /* mask out excess data, add 4 byte horizontal and add to old dword */
+            vs1 = vec_msum(in16, v1_a, vs1);
+            /* apply order, masking out excess data, add 4 byte horizontal and add to old dword */
+            vs2 = vec_msum(in16, vord_a, vs2);
+
+            buf += k;
+            k -= k;
+        }
+
+        vec_dss(2);
+
+        /* add horizontal */
+        /* stuff should be reduced, so no problem with signed saturation */
+        vs1 = (vector unsigned)vec_sums((vector int)vs1, (vector int)v0_32);
+        vs2 = (vector unsigned)vec_sums((vector int)vs2, (vector int)v0_32);
+        /* shake and roll */
+        vs1 = vec_splat(vs1, 3);
+        vs2 = vec_splat(vs2, 3);
+        vec_ste(vs1, 0, &s1);
+        vec_ste(vs2, 0, &s2);
+        /* after horizontal add, reduce again in scalar code */
+    }
+
+    if (unlikely(len)) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--len);
+    reduce(s1);
+    reduce(s2);
+
+    return (s2 << 16) | s1;
+}
+
+#endif
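
The arithmetic behind the AltiVec kernel above (and the ARM/x86 ones that follow) is easier to see in scalar form. Here is a minimal model of the blocked update, for illustration only and not part of the patch: over a block of W bytes, s2 advances by W times the s1 value at the start of the block plus a weighted byte sum with weights W down to 1 (on big endian this is exactly what the vord vector {16,...,1} feeds into vec_msum), while s1 advances by the plain byte sum. The vector code keeps the "s1 at start of block" contributions in a separate accumulator (vs1_r) and multiplies by W only once per outer round, which is why it can defer the expensive reductions.

    #define BASE 65521UL

    /* one W-byte block step, scalar model; gives the same result as the
     * byte-at-a-time loop "s1 += *buf++; s2 += s1;" repeated w times */
    static void adler_block(unsigned long *s1, unsigned long *s2,
                            const unsigned char *buf, unsigned w)
    {
        unsigned long byte_sum = 0, weighted = 0;
        unsigned i;

        for (i = 0; i < w; i++) {
            byte_sum += buf[i];                            /* -> s1 */
            weighted += (unsigned long)(w - i) * buf[i];   /* weights w..1 -> s2 */
        }
        *s2 = (*s2 + (unsigned long)w * *s1 + weighted) % BASE;
        *s1 = (*s1 + byte_sum) % BASE;
    }
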
++++++ 03-arm.patch ++++++
=== modified file 'Makefile.in'
--- Makefile.in 2011-03-14 03:06:03 +0000
+++ Makefile.in 2011-03-14 14:39:24 +0000
@@ -236,7 +236,7 @@
 
 # DO NOT DELETE THIS LINE -- make depend depends on it.
 
-adler32.o: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
+adler32.o: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h
 zutil.o: zutil.h zlib.h zconf.h
 gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
 compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
@@ -247,7 +247,7 @@
 inftrees.o: zutil.h zlib.h zconf.h inftrees.h
 trees.o: deflate.h zutil.h zlib.h zconf.h trees.h
 
-adler32.lo: adler32.c adler32_ppc.c zutil.h zlib.h zconf.h
+adler32.lo: adler32.c adler32_ppc.c adler32_arm.c zutil.h zlib.h zconf.h
 zutil.lo: zutil.h zlib.h zconf.h
 gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
 compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h

=== modified file 'adler32.c'
--- adler32.c   2011-03-30 13:38:46 +0000
+++ adler32.c   2011-03-30 13:38:46 +0000
@@ -140,7 +140,9 @@
 }
 
 #ifndef NO_ADLER32_VEC
-#  if defined(__powerpc__) || defined(__powerpc64__)
+#  if defined(__arm__)
+#    include "adler32_arm.c"
+#  elif defined(__powerpc__) || defined(__powerpc64__)
 #    include "adler32_ppc.c"
 #  endif
 #endif

=== added file 'adler32_arm.c'
--- adler32_arm.c       1970-01-01 00:00:00 +0000
+++ adler32_arm.c       2011-03-30 11:18:49 +0000
@@ -0,0 +1,359 @@
+/*
+ * adler32.c -- compute the Adler-32 checksum of a data stream
+ *   arm implementation
+ * Copyright (C) 1995-2007 Mark Adler
+ * Copyright (C) 2009-2011 Jan Seiffert
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#if defined(__ARM_NEON__)
+// TODO: need byte order define
+/*
+ * Big endian NEON qwords are kind of broken.
+ * They are big endian within the dwords, but WRONG
+ * (really??) way round between lo and hi.
+ * Creating some kind of PDP11 middle endian.
+ *
+ * This is madness and unsupportable. For this reason
+ * GCC wants to disable qword endian specific patterns.
+ * We would need a preprocessor define telling us which endianness
+ * we have in order to disable this code.
+ */
+#  include <arm_neon.h>
+
+#  define SOVUCQ sizeof(uint8x16_t)
+#  define SOVUC sizeof(uint8x8_t)
+/* since we do not have the 64bit psadbw sum, we could probably do a little more */
+#  define VNMAX (6*NMAX)
+#  define HAVE_ADLER32_VEC
+#  define MIN_WORK 32
+
+/* ========================================================================= */
+local inline uint8x16_t neon_simple_alignq(uint8x16_t a, uint8x16_t b, unsigned amount)
+{
+    switch(amount % SOVUCQ)
+    {
+    case  0: return a;
+    case  1: return vextq_u8(a, b,  1);
+    case  2: return vextq_u8(a, b,  2);
+    case  3: return vextq_u8(a, b,  3);
+    case  4: return vextq_u8(a, b,  4);
+    case  5: return vextq_u8(a, b,  5);
+    case  6: return vextq_u8(a, b,  6);
+    case  7: return vextq_u8(a, b,  7);
+    case  8: return vextq_u8(a, b,  8);
+    case  9: return vextq_u8(a, b,  9);
+    case 10: return vextq_u8(a, b, 10);
+    case 11: return vextq_u8(a, b, 11);
+    case 12: return vextq_u8(a, b, 12);
+    case 13: return vextq_u8(a, b, 13);
+    case 14: return vextq_u8(a, b, 14);
+    case 15: return vextq_u8(a, b, 15);
+    }
+    return b;
+}
+
+/* ========================================================================= */
+local inline uint32x4_t vector_reduce(uint32x4_t x)
+{
+    uint32x4_t y;
+
+    y = vshlq_n_u32(x, 16);
+    x = vshrq_n_u32(x, 16);
+    y = vshrq_n_u32(y, 16);
+    y = vsubq_u32(y, x);
+    x = vaddq_u32(y, vshlq_n_u32(x, 4));
+    return x;
+}
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    uint32x4_t v0_32 = (uint32x4_t){0,0,0,0};
+    uint8x16_t    v0 = (uint8x16_t)v0_32;
+    uint8x16_t vord, vord_a;
+    uint32x4_t vs1, vs2;
+    uint32x2_t v_tsum;
+    uint8x16_t in16;
+    uint32_t s1, s2;
+    unsigned k;
+
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+// TODO: big endian mask is probably wrong
+    if (host_is_bigendian())
+        vord = (uint8x16_t){16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1};
+    else
+        vord = (uint8x16_t){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
+
+    if (likely(len >= 2*SOVUCQ)) {
+        unsigned f, n;
+
+        /*
+         * Add stuff to achieve alignment
+         */
+        /* align hard down */
+        f = (unsigned) ALIGN_DOWN_DIFF(buf, SOVUCQ);
+        n = SOVUCQ - f;
+        buf = (const unsigned char *)ALIGN_DOWN(buf, SOVUCQ);
+
+        /* add n times s1 to s2 for start round */
+        s2 += s1 * n;
+
+        /* set sums 0 */
+        vs1 = v0_32;
+        vs2 = v0_32;
+        /*
+         * the accumulation of s1 for every round grows very fast
+         * (quadratic?), even if we accumulate in 4 dwords, more
+         * rounds means nonlinear growth.
+         * We already split it out of s2; normally it would be in
+         * s2 times 16... and even grow faster.
+         * Thanks to this split and vector reduction, we can stay
+         * longer in the loops. But we have to prepare for the worst
+         * (all 0xff), only do 6 times the work.
+         * (we could probably stay a little longer since we have 4 sums,
+         * not 2 like on x86).
+         */
+        k = len < VNMAX ? (unsigned)len : VNMAX;
+        len -= k;
+        /* insert scalar start somewhere */
+        vs1 = vsetq_lane_u32(s1, vs1, 0);
+        vs2 = vsetq_lane_u32(s2, vs2, 0);
+
+        /* get input data */
+        in16 = *(const uint8x16_t *)buf;
+        /* mask out excess data */
+        if(host_is_bigendian()) {
+            in16 = neon_simple_alignq(v0, in16, n);
+            vord_a = neon_simple_alignq(v0, vord, n);
+        } else {
+            in16 = neon_simple_alignq(in16, v0, f);
+            vord_a = neon_simple_alignq(vord, v0, f);
+        }
+
+        /* pairwise add bytes and long, pairwise add word long acc */
+        vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
+        /* apply order, add words, pairwise add word long acc */
+        vs2 = vpadalq_u16(vs2,
+                vmlal_u8(
+                    vmull_u8(vget_low_u8(in16), vget_low_u8(vord_a)),
+                    vget_high_u8(in16), vget_high_u8(vord_a)
+                    )
+                );
+
+        buf += SOVUCQ;
+        k -= n;
+
+        if (likely(k >= SOVUCQ)) do {
+            uint32x4_t vs1_r = v0_32;
+            do {
+                /* add vs1 for this round */
+                vs1_r = vaddq_u32(vs1_r, vs1);
+
+                /* get input data */
+                in16 = *(const uint8x16_t *)buf;
+
+// TODO: make work in inner loop more tight
+                /*
+                 * decompose partial sums, so we do fewer instructions and
+                 * build loops around it to do acc and so on only from time
+                 * to time.
+                 * This is hard with NEON, because the instructions are nice:
+                 * we have the stuff in widening and with acc (practically
+                 * for free...)
+                 */
+                /* pairwise add bytes and long, pairwise add word long acc */
+                vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
+                /* apply order, add words, pairwise add word long acc */
+                vs2 = vpadalq_u16(vs2,
+                        vmlal_u8(
+                            vmull_u8(vget_low_u8(in16), vget_low_u8(vord)),
+                            vget_high_u8(in16), vget_high_u8(vord)
+                            )
+                        );
+
+                buf += SOVUCQ;
+                k -= SOVUCQ;
+            } while (k >= SOVUCQ);
+            /* reduce vs1 round sum before multiplying by 16 */
+            vs1_r = vector_reduce(vs1_r);
+            /* add vs1 for this round (16 times) */
+            /* they have shift right and accumulate, where is shift left and acc?? */
+            vs2 = vaddq_u32(vs2, vshlq_n_u32(vs1_r, 4));
+            /* reduce both vectors to something within 16 bit */
+            vs2 = vector_reduce(vs2);
+            vs1 = vector_reduce(vs1);
+            len += k;
+            k = len < VNMAX ? (unsigned) len : VNMAX;
+            len -= k;
+        } while (likely(k >= SOVUC));
+
+        if (likely(k)) {
+            /*
+             * handle trailer
+             */
+            f = SOVUCQ - k;
+            /* add k times vs1 for this trailer */
+            vs2 = vmlaq_u32(vs2, vs1, vdupq_n_u32(k));
+
+            /* get input data */
+            in16 = *(const uint8x16_t *)buf;
+            /* masks out bad data */
+            if(host_is_bigendian())
+                in16 = neon_simple_alignq(in16, v0, f);
+            else
+                in16 = neon_simple_alignq(v0, in16, k);
+
+            /* pairwise add bytes and long, pairwise add word long acc */
+            vs1 = vpadalq_u16(vs1, vpaddlq_u8(in16));
+            /* apply order, add words, pairwise add word long acc */
+            vs2 = vpadalq_u16(vs2,
+                    vmlal_u8(
+                        vmull_u8(vget_low_u8(in16), vget_low_u8(vord)),
+                        vget_high_u8(in16), vget_high_u8(vord)
+                        )
+                    );
+
+            buf += k;
+            k -= k;
+        }
+
+        /* add horizontal */
+        v_tsum = vpadd_u32(vget_high_u32(vs1), vget_low_u32(vs1));
+        v_tsum = vpadd_u32(v_tsum, v_tsum);
+        s1 = vget_lane_u32(v_tsum, 0);
+        v_tsum = vpadd_u32(vget_high_u32(vs2), vget_low_u32(vs2));
+        v_tsum = vpadd_u32(v_tsum, v_tsum);
+        s2 = vget_lane_u32(v_tsum, 0);
+    }
+
+    if (unlikely(len)) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--len);
+    reduce_x(s1);
+    reduce_x(s2);
+
+    return (s2 << 16) | s1;
+}
+
+/* inline asm, so only on GCC (or compatible) && ARM v6 or better */
+#elif defined(__GNUC__) && ( \
+        defined(__ARM_ARCH_6__)  || defined(__ARM_ARCH_6J__)  || \
+        defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \
+        defined(__ARM_ARCH_7A__) \
+      )
+#  define SOU32 (sizeof(unsigned int))
+#  define HAVE_ADLER32_VEC
+#  define MIN_WORK 16
+
+/* ========================================================================= */
+local noinline uLong adler32_vec(adler, buf, len)
+    uLong adler;
+    const Bytef *buf;
+    uInt len;
+{
+    unsigned int s1, s2;
+    unsigned int k;
+
+    s1 = adler & 0xffff;
+    s2 = (adler >> 16) & 0xffff;
+
+    k    = ALIGN_DIFF(buf, SOU32);
+    len -= k;
+    if (k) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--k);
+
+    if (likely(len >= 4 * SOU32)) {
+        unsigned int vs1 = s1, vs2 = s2;
+        unsigned int order_lo, order_hi;
+
+// TODO: byte order?
+        if(host_is_bigendian()) {
+            order_lo = 0x00030001;
+            order_hi = 0x00040002;
+        } else {
+            order_lo = 0x00020004;
+            order_hi = 0x00010003;
+        }
+// TODO: we could go over NMAX, since we have split the vs2 sum
+        /* something around (NMAX+(NMAX/3)+302) */
+        k = len < NMAX ? len : NMAX;
+        len -= k;
+
+        do {
+            unsigned int vs1_r = 0;
+            do {
+                unsigned int t21, t22, in;
+
+                /* get input data */
+                in = *(const unsigned int *)buf;
+
+                /* add vs1 for this round */
+                vs1_r += vs1;
+
+                /* add horizontal and acc */
+                asm ("usada8 %0, %1, %2, %3" : "=r" (vs1) : "r" (in), "r" (0), 
"r" (vs1));
+                /* widen bytes to words, apply order, add and acc */
+                asm ("uxtb16 %0, %1" : "=r" (t21) : "r" (in));
+                asm ("uxtb16 %0, %1, ror #8" : "=r" (t22) : "r" (in));
+// TODO: instruction result latency
+                /*
+                 * The same problem as with the classic serial sum:
+                 * Chip makers sell us 1-cycle instructions, but that is not the
+                 * whole story. Nearly all 1-cycle chips are pipelined, so
+                 * you can get one result per cycle, but only if _they_ (plural)
+                 * are independent.
+                 * If you are depending on the result of a preceding instruction,
+                 * in the worst case you hit the instruction latency, which is worst
+                 * case >= pipeline length. On the other hand there are result-fast-paths.
+                 * This could all be a wash with the classic sum (4 * 2 instructions,
+                 * + dependence), since smlad is:
+                 * - 2 cycle issue
+                 * - needs the acc in pipeline step E1, instead of E2
+                 * But the Cortex has a fastpath for acc.
+                 * I don't know.
+                 * We cannot even unroll, we would need 4 order vars, return ENOREGISTER.
+                 */
+                asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t21), "r" (order_lo), "r" (vs2));
+                asm ("smlad %0, %1, %2, %3" : "=r" (vs2) : "r" (t22), "r" (order_hi), "r" (vs2));
+
+                buf += SOU32;
+                k -= SOU32;
+            } while (k >= SOU32);
+            /* reduce vs1 round sum before multiplying by 4 */
+            reduce(vs1_r);
+            /* add vs1 for this round (4 times) */
+            vs2 += vs1_r * 4;
+            /* reduce both sums to something within 16 bit */
+            reduce(vs2);
+            reduce(vs1);
+            len += k;
+            k = len < NMAX ? len : NMAX;
+            len -= k;
+        } while (likely(k >= 4 * SOU32));
+        len += k;
+        s1 = vs1;
+        s2 = vs2;
+    }
+
+    if (unlikely(len)) do {
+        s1 += *buf++;
+        s2 += s1;
+    } while (--len);
+    /* at this point s1 & s2 should not be too big */
+    reduce_x(s1);
+    reduce_x(s2);
+
+    return (s2 << 16) | s1;
+}
+#endif
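
Since these kernels are explicitly labelled highly experimental, a cross-check of the patched library against a byte-at-a-time reference is cheap insurance. Below is a minimal harness for illustration (not shipped with the package; the file name is made up), exercising adler32() over varying lengths and starting alignments, because the vector paths special-case both; build with something like "cc check_adler32.c -lz":

    #include <stdio.h>
    #include <stdlib.h>
    #include <zlib.h>

    #define BASE 65521UL

    /* straightforward byte-at-a-time reference */
    static uLong adler32_ref(uLong adler, const unsigned char *buf, size_t len)
    {
        unsigned long s1 = adler & 0xffff;
        unsigned long s2 = (adler >> 16) & 0xffff;
        size_t i;

        for (i = 0; i < len; i++) {
            s1 = (s1 + buf[i]) % BASE;
            s2 = (s2 + s1) % BASE;
        }
        return (s2 << 16) | s1;
    }

    int main(void)
    {
        unsigned char buf[4096 + 16];
        size_t len, off, i;

        srand(1);
        for (i = 0; i < sizeof(buf); i++)
            buf[i] = (unsigned char)rand();

        for (len = 0; len <= 4096; len = len ? len * 2 + 1 : 1)
            for (off = 0; off < 16; off++) {
                uLong a = adler32(1L, buf + off, (uInt)len);
                uLong b = adler32_ref(1L, buf + off, len);
                if (a != b) {
                    printf("mismatch at len=%zu off=%zu\n", len, off);
                    return 1;
                }
            }
        puts("adler32 matches the reference implementation");
        return 0;
    }
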
++++++ 04-x86.patch ++++++
++++ 1165 lines (skipped)


++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++



Remember to have fun...
