Hello community,

here is the log from the commit of package dd_rescue for openSUSE:Factory 
checked in at 2013-08-04 07:28:46
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/dd_rescue (Old)
 and      /work/SRC/openSUSE:Factory/.dd_rescue.new (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "dd_rescue"

Changes:
--------
--- /work/SRC/openSUSE:Factory/dd_rescue/dd_rescue.changes      2013-07-25 
20:27:32.000000000 +0200
+++ /work/SRC/openSUSE:Factory/.dd_rescue.new/dd_rescue.changes 2013-08-04 
07:28:48.000000000 +0200
@@ -1,0 +2,18 @@
+Fri Aug  2 22:05:52 CEST 2013 - [email protected]
+
+- Update to dd_rescue-1.38:
+  * Further optimized SSE2 sparse detection. (Also added AVX2
+    version, not enabled by default though.)
+  * --force allows ignoring non-seekable output with a non-zero
+    output position.
+  * make check does some testing ...
+  * improved cur.rate and ETA calculation.
+
+-------------------------------------------------------------------
+Thu Aug  1 22:02:16 CEST 2013 - [email protected]
+
+- Update to dd_rescue-1.37:
+  * Important bugfix for SSE2 sparse detection.
+  * Fix exact zero-length on big endian machines (irrelevant).
+
+-------------------------------------------------------------------

Old:
----
  dd_rescue-1.36.tar.gz

New:
----
  dd_rescue-1.38.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ dd_rescue.spec ++++++
--- /var/tmp/diff_new_pack.LTvZx7/_old  2013-08-04 07:28:49.000000000 +0200
+++ /var/tmp/diff_new_pack.LTvZx7/_new  2013-08-04 07:28:49.000000000 +0200
@@ -17,7 +17,7 @@
 
 
 Name:           dd_rescue
-Version:        1.36
+Version:        1.38
 Release:        0
 Summary:        Data Copying in the Presence of I/O Errors
 License:        GPL-2.0 or GPL-3.0
@@ -63,6 +63,9 @@
 ln -sf %{_bindir}/dd_rescue %{buildroot}/bin
 #EndUsrMerge
 
+%check
+make check
+
 %files
 %defattr(-,root,root,-)
 %doc COPYING README.dd_rescue

++++++ dd_rescue-1.36.tar.gz -> dd_rescue-1.38.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dd_rescue/Makefile new/dd_rescue/Makefile
--- old/dd_rescue/Makefile      2013-07-24 00:38:20.000000000 +0200
+++ new/dd_rescue/Makefile      2013-08-02 13:33:21.000000000 +0200
@@ -1,8 +1,8 @@
 # Makefile for dd_rescue
 # (c) [email protected], 99/10/09, GNU GPL
-# $Id: Makefile,v 1.62 2013/07/23 22:38:20 garloff Exp $
+# $Id: Makefile,v 1.70 2013/08/02 10:23:12 garloff Exp $
 
-VERSION = 1.36
+VERSION = 1.38
 
 DESTDIR = 
 
@@ -53,6 +53,9 @@
 find_nonzero.o: find_nonzero.c find_nonzero.h
        $(CC) $(CFLAGS_OPT) -c $< $(SSE)
 
+find_nonzero_avx.o: find_nonzero_avx.c find_nonzero.h
+       $(CC) $(CFLAGS_OPT) -mavx2 -c $<
+
 libfalloc: dd_rescue.c $(HEADERS) $(OBJECTS)
        $(CC) $(CFLAGS) -DHAVE_LIBFALLOCATE=1 $(DEFINES) $< $(OUT) $(OBJECTS) 
-lfallocate
 
@@ -75,14 +78,19 @@
        strip -S $<
 
 clean:
-       rm -f $(TARGETS) $(OBJECTS) dd_rescue.o core test log find_nonzero 
fmt_no
+       rm -f $(TARGETS) $(OBJECTS) dd_rescue.o core test log find_nonzero 
fmt_no file_zblock find_nonzero_avx.o find_nonzero_avx
 
 find_nonzero: find_nonzero.c find_nonzero.h
        $(CC) $(CFLAGS_OPT) -o $@ $< -DTEST $(SSE)
 
+find_nonzero_avx: find_nonzero.c find_nonzero.h find_nonzero_avx.o
+       $(CC) $(CFLAGS_OPT) -o $@ $< -DHAVE_AVX2 -DTEST $(SSE) 
find_nonzero_avx.o
+
 fmt_no: fmt_no.c fmt_no.h
        $(CC) $(CFLAGS) -o $@ $< -DTEST
 
+file_zblock: file_zblock.c find_nonzero.h find_nonzero.c find_nonzero.o
+       $(CC) $(CFLAGS) -o $@ $< find_nonzero.o
 
 distclean: clean
        rm -f *~
@@ -99,3 +107,9 @@
        $(INSTALL) $(INSTASROOT) -m 644 dd_rescue.1 $(MANDIR)/man1/
        gzip -9 $(MANDIR)/man1/dd_rescue.1
 
+check: $(TARGETS) find_nonzero
+       ./dd_rescue -apP dd_rescue dd_rescue.copy
+       cmp dd_rescue dd_rescue.copy 
+       ./find_nonzero 2
+       rm dd_rescue.copy
+
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dd_rescue/dd_rescue.1 new/dd_rescue/dd_rescue.1
--- old/dd_rescue/dd_rescue.1   2013-07-24 16:47:30.000000000 +0200
+++ new/dd_rescue/dd_rescue.1   2013-07-26 00:00:15.000000000 +0200
@@ -1,4 +1,4 @@
-.\" $Id: dd_rescue.1,v 1.12 2013/07/16 09:53:50 garloff Exp $
+.\" $Id: dd_rescue.1,v 1.13 2013/07/25 11:57:34 garloff Exp $
 .
 .TH dd_rescue 1 "2013-02-24" "Kurt Garloff" "Rescue copy tool"
 .
@@ -359,7 +359,7 @@
 .TP 8
 .BR \-d ", " \-\-odir_in
 instructs 
-.b dd_rescue
+.B dd_rescue
 to open
 .IR infie
 with O_DIRECT, bypassing the kernel buffers. While this option has a negative
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dd_rescue/dd_rescue.c new/dd_rescue/dd_rescue.c
--- old/dd_rescue/dd_rescue.c   2013-07-24 16:47:30.000000000 +0200
+++ new/dd_rescue/dd_rescue.c   2013-08-02 00:16:39.000000000 +0200
@@ -45,7 +45,7 @@
 # define __COMPILER__ "(unknown compiler)"
 #endif
 
-#define ID "$Id: dd_rescue.c,v 1.214 2013/07/23 22:28:55 garloff Exp $"
+#define ID "$Id: dd_rescue.c,v 1.218 2013/08/01 22:16:39 garloff Exp $"
 
 #ifndef BUF_SOFTBLOCKSIZE
 # define BUF_SOFTBLOCKSIZE 65536
@@ -146,7 +146,8 @@
 char identical, preserve, falloc, dosplice;
 char i_chr, o_chr;
 char i_repeat, i_rep_init;
-int i_rep_zero, prng_seed;
+size_t i_rep_zero;
+int  prng_seed;
 char noextend, avoidwrite;
 char prng_libc, prng_frnd;
 char bsim715, bsim715_4, bsim715_2, bsim715_2ndpass;
@@ -562,12 +563,21 @@
 }
 #endif
 
+float floatrate4  = 0.0;
+float floatrate32 = 0.0;
 void doprint(FILE* const file, const unsigned int bs, const clock_t cl, 
             const float t1, const float t2, const int sync)
 {
        float avgrate = (float)xfer/t1;
        float currate = (float)(xfer-lxfer)/t2;
        const char *bold = BOLD, *norm = NORM;
+       if (!floatrate4) {
+               floatrate4  = currate;
+               floatrate32 = currate;
+       } else {
+               floatrate4  = (floatrate4 * 3 + currate)/ 4;
+               floatrate32 = (floatrate32*31 + currate)/32;
+       }
        if (nocol || (file != stderr && file != stdout)) {
                bold = ""; norm = "";
        }
@@ -581,7 +591,7 @@
                fmt_int(10, 1, 1024, sxfer, bold, norm, 1));
        if (sync || (file != stdin && file != stdout) )
                fprintf(file, "             +curr.rate:%skB/s, avg.rate:%skB/s, 
avg.load:%s%%\n",
-                       fmt_int(9, 0, 1024, currate, bold, norm, 1),
+                       fmt_int(9, 0, 1024, floatrate4, bold, norm, 1),
                        fmt_int(9, 0, 1024, avgrate, bold, norm, 1),
                        fmt_int(3, 1, 10, 
(cl-startclock)/(t1*(CLOCKS_PER_SEC/1000)), bold, norm, 1));
        else
@@ -594,7 +604,7 @@
                if (in_report)
                        sec = 0.5 + t1;
                else
-                       sec = 0.5 + (estxfer-xfer)/avgrate;
+                       sec = 0.5 + 2*(estxfer-xfer)/(avgrate+floatrate32);
                int hour = sec / 3600;
                int min = (sec % 3600) / 60;
                sec = sec % 60;
@@ -844,7 +854,7 @@
 }
 
 /** is the block zero ? */
-static int blockiszero(const unsigned char* blk, const size_t ln)
+static ssize_t blockiszero(const unsigned char* blk, const size_t ln)
 {
        if (i_repeat && i_rep_zero)
                return i_rep_zero;
@@ -2054,8 +2064,12 @@
        }
 
        if (o_chr && opos != 0) {
-               fplog(stderr, FATAL, "outfile not seekable, but opos !=0 
requested!\n");
-               cleanup(); exit(19);
+               if (force)
+                       fplog(stderr, WARN, "ignore non-seekable output with 
opos != 0 due to --force\n");
+               else {
+                       fplog(stderr, FATAL, "outfile not seekable, but opos 
!=0 requested!\n");
+                       cleanup(); exit(19);
+               }
        }
        if (i_chr && ipos != 0) {
                fplog(stderr, FATAL, "infile not seekable, but ipos !=0 
requested!\n");
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dd_rescue/file_zblock.c new/dd_rescue/file_zblock.c
--- old/dd_rescue/file_zblock.c 1970-01-01 01:00:00.000000000 +0100
+++ new/dd_rescue/file_zblock.c 2013-08-01 23:22:05.000000000 +0200
@@ -0,0 +1,56 @@
+#define _GNU_SOURCE 1
+#include <stdio.h>
+#include <sys/file.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include "find_nonzero.h"
+
+#define BUFSZ (64*1024)
+unsigned char buf[BUFSZ];
+
+void usage()
+{
+       fprintf(stderr, "Usage: file_zblock FILE1 [FILE2 [FILE3 [...]]]\n");
+       fprintf(stderr, "file_zblock reports files with ()at least) chunk-sized 
blocks of zeros inside.\n");
+       exit(0);
+}
+
+int main(int argc, char *argv[])
+{
+       int zf = 0;
+       int chunksz = 4096;
+       int i = 1, off;
+       if (argc < 2)
+               usage();
+       if (!memcmp(argv[1], "-c", 2)) {
+               if (strlen(argv[1]) > 2) {
+                       chunksz = atoi(argv[1]+2);
+                       ++i;
+               } else {
+                       chunksz = atoi(argv[2]);
+                       i += 2;
+               }
+       }
+       for (; i < argc; ++i) {
+               int fd = open(argv[i], O_RDONLY);
+               if (fd<0) {
+                       fprintf(stderr, "ERROR opening file %s: %s\n", argv[i], 
strerror(errno));
+                       continue;
+               }
+               int rd, found = 0;
+               while ((rd = read(fd, buf, BUFSZ)) > 0 && !found) {
+                       for (off = 0; off < rd; off += chunksz) {
+                               unsigned int tocheck = rd-off > chunksz? 
chunksz: rd-off;
+                               if (find_nonzero(buf+off, tocheck) == tocheck) {
+                                       ++found; ++zf;
+                                       printf("%s,%i\n", argv[i], off);
+                                       break;
+                               }
+                       }
+               }
+               close(fd);
+       }
+       return zf;
+}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dd_rescue/find_nonzero.c new/dd_rescue/find_nonzero.c
--- old/dd_rescue/find_nonzero.c        2013-07-24 16:47:30.000000000 +0200
+++ new/dd_rescue/find_nonzero.c        2013-08-02 13:33:21.000000000 +0200
@@ -7,7 +7,8 @@
 #define IN_FINDZERO
 #include "find_nonzero.h"
 
-#if defined(__i386__) || defined(__x86_64__)
+#if defined(TEST) && (defined(__i386__) || defined(__x86_64__))
+/** Just for testing the speed of the good old x86 string instructions */
 size_t find_nonzero_rep(const unsigned char* blk, const size_t ln)
 {
        unsigned long register res;
@@ -30,15 +31,58 @@
 #ifdef __SSE2__
 #include <emmintrin.h>
 
-size_t find_nonzero_simd(const unsigned char* blk, const size_t ln)
+#ifdef TEST
+/** SSE2 version for measuring the initial zero bytes of aligned blk */
+size_t find_nonzero_sse2o(const unsigned char* blk, const size_t ln)
 {
-       __m128i xmm, zero = _mm_setzero_si128();
-       unsigned /*long*/ register eax;
+       __m128i register xmm;
+       const __m128i register zero = _mm_setzero_si128();
+#ifdef SIMD_XOR
+       const __m128i register mask = _mm_set_epi16(-1, -1, -1, -1, -1, -1, -1, 
-1);
+#endif
+       unsigned register eax;
        size_t i = 0;
+       //asm(".align 32");
        for (; i < ln; i+= 16) {
                xmm = _mm_load_si128((__m128i*)(blk+i));
+#ifdef BUGGY_136
                _mm_cmpeq_epi8(xmm, zero);
                eax = _mm_movemask_epi8(xmm);
+#else
+               xmm = _mm_cmpeq_epi8(xmm, zero);
+#ifdef SIMD_XOR
+               xmm = _mm_xor_si128(xmm, mask);
+#endif
+               eax = _mm_movemask_epi8(xmm);
+#endif /* BUGGY **/
+#if defined(SIMD_XOR) || defined(BUGGY_136)
+               if (eax) 
+                       return i + myffs(eax)-1;
+#else
+               if (eax != 0xffff)
+                       return i + myffs(eax^0xffff)-1;
+#endif
+       }
+       return ln;
+}
+#endif
+
+/** SSE2 version for measuring the initial zero bytes of 16b aligned blk */
+size_t find_nonzero_sse2(const unsigned char* blk, const size_t ln)
+{
+       register __m128i xmm0, xmm1;
+       register const __m128i zero = _mm_setzero_si128();
+       register unsigned int eax, ebx;
+       size_t i = 0;
+       //asm(".p2align 5");
+       for (; i < ln; i+= 32) {
+               //xmm0 = _mm_load_si128((__m128i*)(blk+i));
+               //xmm1 = _mm_load_si128((__m128i*)(blk+i+16));
+               xmm0 = _mm_cmpeq_epi8(*(__m128i*)(blk+i), zero);
+               xmm1 = _mm_cmpeq_epi8(*(__m128i*)(blk+i+16), zero);
+               eax = _mm_movemask_epi8(xmm0);
+               ebx = _mm_movemask_epi8(xmm1);
+               eax = ~(eax | (ebx << 16));
                if (eax) 
                        return i + myffs(eax)-1;
        }
@@ -46,11 +90,12 @@
 }
 
 #ifdef NEED_SIMD_RUNTIME_DETECTION
+/** Issue an SSE2 insn for runtime detection of SSE2 capability (x86) */
+volatile __m128d _probe_xmm;
 void probe_simd()
 {
-       volatile __m128d xmm;
        double val = 3.14159265358979323844;
-       xmm = _mm_set_sd(val);
+       _probe_xmm = _mm_set_sd(val);
 }
 #endif /* NEED_SIMD_RUNTIME_DETECTION */
 
@@ -58,13 +103,15 @@
 
 #if defined(__arm__)
 
-/* Inspired by Linaro's strlen() implementation; 
-   we don't even need NEON here, ldmia does the 3x speedup on A-9 */
-size_t find_nonzero_simd(const unsigned char *blk, const size_t ln)
+/** ASM optimized version for ARM.
+ * Inspired by Linaro's strlen() implementation; 
+ * we don't even need NEON here, ldmia does the 3x speedup on Cortexes */
+size_t find_nonzero_arm6(const unsigned char *blk, const size_t ln)
 {
        register unsigned char* res;
        const register unsigned char* end = blk+ln;
        asm volatile(
+       //".align 4                     \n"
        "1:                             \n"
        "       ldmia %0!,{r2,r3}       \n"
        "       cmp r2, #0              \n"
@@ -80,7 +127,8 @@
        "       mov r3, r2              \n"
        "3:                             \n"
        "       sub %0, #4              \n"
-#ifndef __ARMEB__                              /* Little endian bitmasks */
+//#ifndef __ARMEB__                            /* Little endian bitmasks */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
        "       tst r3, #0xff           \n"
        "       bne 10f                 \n"
        "       add %0, #1              \n"
@@ -105,10 +153,10 @@
        : "r2", "r3");
        return res-blk;
 }
+#define find_nonzero_simd find_nonzero_arm6
 #endif
 
 
-
 #ifdef TEST
 #include <string.h>
 #include <time.h>
@@ -129,13 +177,41 @@
        gettimeofday(&t2, NULL);        \
        tdiff = t2.tv_sec-t1.tv_sec + 0.000001*(t2.tv_usec-t1.tv_usec); \
        printf("%7i x %20s (%8i): %8i (%6.3fs => %5.0fMB/s)\n", \
-               rep, #routine, sz, ln, tdiff, 
(double)(rep)*(double)(sz)/(1024*1024*tdiff))
+               rep, #routine, sz, ln, tdiff, 
(double)(rep)*(double)(sz+1)/(1024*1024*tdiff));  \
+       if (ln != (tsz<sz? tsz: sz))    \
+               abort()
+
+
+#define TEST2C(sz,routine,rep,tsz)     \
+       memset(buf, 0, tsz);            \
+       buf[sz] = 0x4c;                 \
+       gettimeofday(&t1, NULL);        \
+       for (i = 0; i < rep; ++i) {     \
+               mem_clobber;            \
+               ln = routine(buf, tsz); \
+       }                               \
+       gettimeofday(&t2, NULL);        \
+       tdiff = t2.tv_sec-t1.tv_sec + 0.000001*(t2.tv_usec-t1.tv_usec); \
+       printf("%7i x %20s (%8i): %8i (%6.3fs => %5.0fMB/s)\n", \
+               rep, #routine, sz, ln, tdiff, 
(double)(rep)*(double)(sz+1)/(1024*1024*tdiff));  \
+       if (ln != (tsz<sz? tsz: sz))    \
+               abort()
 
 
 #if defined(HAVE_SIMD)
 #define TEST_SIMD(a,b,c,d) TESTC(a,b,c,d)
+#define TEST2_SIMD(a,b,c,d) TEST2C(a,b,c,d)
 #else
 #define TEST_SIMD(a,b,c,d) do {} while (0)
+#define TEST2_SIMD(a,b,c,d) do {} while (0)
+#endif
+
+#ifdef __SSE2__
+#define TEST_SIMD2(a,b,c,d) TESTC(a,b,c,d)
+#define TEST2_SIMD2(a,b,c,d) TEST2C(a,b,c,d)
+#else
+#define TEST_SIMD2(a,b,c,d) do {} while (0)
+#define TEST2_SIMD2(a,b,c,d) do {} while (0)
 #endif
 
 #if defined(HAVE_NONZERO_REP)
@@ -144,10 +220,17 @@
 #define TEST_REP(a,b,c,d) do {} while (0)
 #endif
 
+#define TESTFFS(val) printf("%08x: last %i first %i\n", val, myffsl(val), 
myflsl(val));
+#if __WORDSIZE == 64
+#define TESTFFS64(val) printf("%016Lx: last %i first %i\n", val, myffsl(val), 
myflsl(val));
+#else
+#define TESTFFS64(val) do {} while (0)
+#endif
+
 int main(int argc, char* argv[])
 {
-       unsigned char* obuf = (unsigned char*)malloc(SIZE+15);
-       unsigned char* buf = obuf+15;
+       unsigned char* obuf = (unsigned char*)malloc(SIZE+31);
+       unsigned char* buf = (obuf+31)-((unsigned long)(obuf+31)%32);
        struct timeval t1, t2;
        int i, ln = 0;
        double tdiff;
@@ -155,15 +238,22 @@
 #ifdef NEED_SIMD_RUNTIME_DETECTION
        detect_simd();
 #endif
+       TESTFFS(0x05000100);
+       TESTFFS(0x00900002);
+       TESTFFS(0x00000100);
+       TESTFFS(0x80400000);
+       TESTFFS64(0x0030000000000100ULL);
+       TESTFFS64(0x1000000000000000ULL);
+       TESTFFS64(0x0000000000001000ULL);
+
        if (argc > 1)
                scale = atoi(argv[1]);
-       buf -= (unsigned long)buf%16;
        memset(buf, 0xa5, SIZE);
        
-       TESTC(0, find_nonzero_c, 1024*256*scale/16, SIZE);
-       TEST_SIMD(0, find_nonzero_simd, 1024*256*scale/16, SIZE);
-       TESTC(0, find_nonzero, 1024*256*scale/16, SIZE);
-       TEST_REP(0, find_nonzero_rep, 1024*256*scale/16, SIZE);
+       TESTC(0, find_nonzero_c, 1024*1024*scale/16, SIZE);
+       TEST_SIMD(0, find_nonzero_simd, 1024*1024*scale/16, SIZE);
+       TESTC(0, find_nonzero, 1024*1024*scale/16, SIZE);
+       TEST_REP(0, find_nonzero_rep, 1024*1024*scale/16, SIZE);
        
        TESTC(8*1024-15, find_nonzero_c, 1024*256*scale/16, SIZE);
        TEST_SIMD(8*1024-15, find_nonzero_simd, 1024*256*scale/16, SIZE);
@@ -175,6 +265,7 @@
        buf--;
        TESTC(32*1024-9, find_nonzero_c, 1024*64*scale/16, SIZE);
        TEST_SIMD(32*1024-9, find_nonzero_simd, 1024*64*scale/16, SIZE);
+       TEST_SIMD2(32*1024-9, find_nonzero_sse2o, 1024*64*scale/16, SIZE);
        TESTC(32*1024-9, find_nonzero, 1024*64*scale/16, SIZE);
        TEST_REP(32*1024-9, find_nonzero_rep, 1024*64*scale/16, SIZE);
        TESTC(128*1024-8, find_nonzero_c, 1024*16*scale/16, SIZE);
@@ -190,15 +281,19 @@
        TESTC(64*1024*1024, find_nonzero_c, 32*scale/16, SIZE);
        TEST_SIMD(64*1024*1024, find_nonzero_simd, 32*scale/16, SIZE);
        
-       TESTC(64*1024*1024, find_nonzero_c, 32*scale/16, SIZE-16);
-       TEST_SIMD(64*1024*1024, find_nonzero_simd, 32*scale/16, SIZE-16);
-       TESTC(64*1024*1024, find_nonzero, 32*scale/16, SIZE-16);
-       TEST_REP(64*1024*1024, find_nonzero_rep, 32*scale/16, SIZE-16);
-
-       TESTC(64*1024*1024, find_nonzero_c, 32*scale/16, SIZE-5);
-       TEST_SIMD(64*1024*1024, find_nonzero_simd, 32*scale/16, SIZE-5);
-       TESTC(64*1024*1024, find_nonzero, 32*scale/16, SIZE-5);
-       TEST_REP(64*1024*1024, find_nonzero_rep, 32*scale/16, SIZE-5);
+       TESTC(64*1024*1024, find_nonzero_c, 1+scale/16, SIZE-16);
+       TEST_SIMD(64*1024*1024, find_nonzero_simd, 1+scale/16, SIZE-16);
+       TESTC(64*1024*1024, find_nonzero, 1+scale/16, SIZE-16);
+       TEST_REP(64*1024*1024, find_nonzero_rep, 1+scale/16, SIZE-16);
+
+       TESTC(64*1024*1024, find_nonzero_c, 1+scale/16, SIZE-5);
+       TEST_SIMD(64*1024*1024, find_nonzero_simd, 1+scale/16, SIZE-5);
+       TESTC(64*1024*1024, find_nonzero, 1+scale/16, SIZE-5);
+       TEST_REP(64*1024*1024, find_nonzero_rep, 1+scale/16, SIZE-5);
+
+       TEST2C(12*1024*1024, find_nonzero_c, 160*scale/16, SIZE);
+       TEST2_SIMD(12*1024*1024, find_nonzero_simd, 160*scale/16, SIZE);
+       TEST2_SIMD2(12*1024*1024, find_nonzero_sse2o, 160*scale/16, SIZE);
 
        free(obuf);
        return 0;
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dd_rescue/find_nonzero.h new/dd_rescue/find_nonzero.h
--- old/dd_rescue/find_nonzero.h        2013-07-24 16:47:30.000000000 +0200
+++ new/dd_rescue/find_nonzero.h        2013-08-02 13:33:21.000000000 +0200
@@ -23,17 +23,65 @@
 # else
 #  define myffsl(x) _mm_popcnt_u32(x^(~(-x)))
 # endif
-#else
+#else /* NOFFS */
 # define myffsl(x) myffs(x)
+/** Find first (lowest) bit set in word val, returns a val b/w 1 and 
__WORDSIZE, 0 if no bit is set */
 static inline int myffsl(unsigned long val)
 {
-       int i;
-       for (i = 1; i <= sizeof(val)*8; ++i) {
-               if (val & 0x01)
-                       return i;
-               val >>= 1;
+       int res = 1;
+       if (!val)
+               return 0;
+#if __WORDSIZE == 64
+       unsigned int vlo = val;
+       unsigned int vhi = val >> 32;
+       if (!vlo) {
+               res += 32;
+               vlo = vhi;
+       }
+#else
+       unsigned int vlo = val;
+#endif
+       unsigned int mask = 0x0000ffff;
+       unsigned int shift = 16;
+       while (shift > 0) {
+               if (!(vlo & mask)) {
+                       res += shift;
+                       vlo >>= shift;
+               }
+               shift >>= 1;
+               mask >>= shift;
        }
-       return 0;
+       return res;
+}
+#endif
+#if __BYTE_ORDER == __BIG_ENDIAN || defined(TEST)
+/** Find last (highest) bit set in word val, returns a val b/w __WORDSIZE and 
1, 0 if no bit is set */
+static inline int myflsl(unsigned long val)
+{
+       int res = __WORDSIZE;
+       if (!val)
+               return 0;
+#if __WORDSIZE == 64
+       unsigned int vlo = val;
+       unsigned int vhi = val >> 32;
+       if (!vhi) {
+               res -= 32;
+               vhi = vlo;
+       }
+#else
+       unsigned int vhi = val;
+#endif
+       unsigned int mask = 0xffff0000;
+       unsigned int shift = 16;
+       while (shift > 0) {
+               if (!(vhi & mask)) {
+                       res -= shift;
+                       vhi <<= shift;
+               }
+               shift >>= 1;
+               mask <<= shift;
+       }
+       return res;
 }
 #endif
 
@@ -78,6 +126,14 @@
 #if defined(HAVE_SSE2) || defined(__arm__)
 #define HAVE_SIMD
 
+#ifdef HAVE_AVX2
+#define find_nonzero_simd find_nonzero_avx2
+#elif defined(HAVE_SSE2)
+#define find_nonzero_simd find_nonzero_sse2
+#elif defined(__arm__)
+#define find_nonzero_simd find_nonzero_arm6
+#endif
+
 /* FIXME: Is there no library function to find the first non-null byte?
  * Something like ffs() for a long byte array?
  * Here is an optimized version using SSE2 intrinsics, but there should be
@@ -90,29 +146,39 @@
 /* No need for runtime detection here */
 const static char have_simd = 0;
 #endif
-/** return length of zero bytes */
+/** return number of bytes at beginning of blk that are all zero, assumes 
__WORDSIZE bit alignment */
 static size_t find_nonzero_c(const unsigned char* blk, const size_t ln)
 {
        const unsigned long* ptr = (const unsigned long*)blk;
        const unsigned long* const bptr = ptr;
        for (; (size_t)(ptr-bptr) < ln/sizeof(*ptr); ++ptr)
                if (*ptr)
+#if __BYTE_ORDER == __BIG_ENDIAN
+                       return sizeof(unsigned long)*(ptr-bptr) + 
sizeof(long)-((myflsl(*ptr)+7)>>3);
+#else
                        return sizeof(unsigned long)*(ptr-bptr) + 
((myffsl(*ptr)-1)>>3);
+#endif
        return ln;
 }
 
-/* Generic version, does not require an aligned buffer blk */
+/** return number of bytes at beginning of blk that are all zero 
+  * Generic version, does not require an aligned buffer blk or even ln ... */
 inline static size_t find_nonzero(const unsigned char* blk, const size_t ln)
 {
-       const int off = ((unsigned long)blk) % 16;
-       if (off) {
-               int i;
-               for (i = 0; i < 16-off; ++i)
-                       if (blk[i])
-                               return i;
-               return i+find_nonzero_opt(blk+i, ln-i);
-       } else
-               return find_nonzero_opt(blk, ln);
+       const int off = (-(unsigned char)(unsigned long)blk) & 0x1f;
+       size_t remain = ln - off;
+       size_t i;
+       for (i = 0; i < off; ++i)
+               if (blk[i])
+                       return i;
+       int r2 = remain % 0x1f;
+       size_t res = find_nonzero_opt(blk+off, remain-r2);
+       if (!r2 || res != remain-r2)
+               return off+res;
+       for (i = off+remain; i < ln; ++i)
+               if (blk[i])
+                       return i;
+       return ln;
 }
 
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dd_rescue/find_nonzero_avx.c 
new/dd_rescue/find_nonzero_avx.c
--- old/dd_rescue/find_nonzero_avx.c    1970-01-01 01:00:00.000000000 +0100
+++ new/dd_rescue/find_nonzero_avx.c    2013-08-02 13:37:31.000000000 +0200
@@ -0,0 +1,41 @@
+/** find_nonzero_avx.c
+  * AVX2 optimized search for non-zero bytes
+  * taken straight from SSE2 and adapted to use AVX registers
+  * Needs recent (2.23+) binutils to compile ...
+  * (c) Kurt Garloff <[email protected]>, 2013
+  * License: GNU GPL v2 or v3
+  */
+
+#define _GNU_SOURCE 1
+#include "find_nonzero.h"
+size_t find_nonzero_sse2(const unsigned char* blk, const size_t ln);
+
+#ifdef __AVX2__
+#if defined(__GNUC__) || defined(__llvm__)
+# warning AVX2 version untested and runtime detection only with gcc 4.8+
+#endif
+#include <immintrin.h>
+/** AVX2 version for measuring the initial zero bytes of 32b aligned blk */
+size_t find_nonzero_avx2(const unsigned char* blk, const size_t ln)
+{
+#if defined( __GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 
8))
+       if (!(__builtin_cpu_supports("avx2")))
+               return find_nonzero_sse2(blk, ln);
+#endif
+       __m256i register ymm;
+       const __m256i register zero = _mm256_setzero_si256();
+       unsigned register eax;
+       size_t i = 0;
+       //asm(".p2align 5");
+       for (; i < ln; i+= 32) {
+               //ymm = _mm256_load_si256((__m256i*)(blk+i));
+               ymm = _mm256_cmpeq_epi8(*(__m256i*)(blk+i), zero);
+               eax = ~(_mm256_movemask_epi8(ymm));
+               if (eax) 
+                       return i + myffs(eax)-1;
+       }
+       return ln;
+}
+#endif
+
+

-- 
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to