Hello community, here is the log from the commit of package dd_rescue for openSUSE:Factory checked in at 2013-08-04 07:28:46 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/dd_rescue (Old) and /work/SRC/openSUSE:Factory/.dd_rescue.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "dd_rescue" Changes: -------- --- /work/SRC/openSUSE:Factory/dd_rescue/dd_rescue.changes 2013-07-25 20:27:32.000000000 +0200 +++ /work/SRC/openSUSE:Factory/.dd_rescue.new/dd_rescue.changes 2013-08-04 07:28:48.000000000 +0200 @@ -1,0 +2,18 @@ +Fri Aug 2 22:05:52 CEST 2013 - [email protected] + +- Update to dd_rescue-1.38: + * Further optimized SSE2 sparse detection. (Also added AVX2 + version, not enabled by default though.) + * --force allows to ignore non-seekable output with non zero + output position. + * make check does some testing ... + * improved cur.rate and ETA calculation. + +------------------------------------------------------------------- +Thu Aug 1 22:02:16 CEST 2013 - [email protected] + +- Update to dd_rescue-1.37: + * Important bugfix for SSE2 sparse detection. + * Fix exact zero-length on big endian machines (irrelevant). + +------------------------------------------------------------------- Old: ---- dd_rescue-1.36.tar.gz New: ---- dd_rescue-1.38.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ dd_rescue.spec ++++++ --- /var/tmp/diff_new_pack.LTvZx7/_old 2013-08-04 07:28:49.000000000 +0200 +++ /var/tmp/diff_new_pack.LTvZx7/_new 2013-08-04 07:28:49.000000000 +0200 @@ -17,7 +17,7 @@ Name: dd_rescue -Version: 1.36 +Version: 1.38 Release: 0 Summary: Data Copying in the Presence of I/O Errors License: GPL-2.0 or GPL-3.0 @@ -63,6 +63,9 @@ ln -sf %{_bindir}/dd_rescue %{buildroot}/bin #EndUsrMerge +%check +make check + %files %defattr(-,root,root,-) %doc COPYING README.dd_rescue ++++++ dd_rescue-1.36.tar.gz -> dd_rescue-1.38.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dd_rescue/Makefile new/dd_rescue/Makefile --- old/dd_rescue/Makefile 2013-07-24 00:38:20.000000000 +0200 +++ new/dd_rescue/Makefile 2013-08-02 13:33:21.000000000 +0200 @@ -1,8 +1,8 @@ # Makefile for dd_rescue # (c) [email protected], 99/10/09, GNU GPL -# $Id: Makefile,v 1.62 2013/07/23 22:38:20 garloff Exp $ +# $Id: Makefile,v 1.70 2013/08/02 10:23:12 garloff Exp $ -VERSION = 1.36 +VERSION = 1.38 DESTDIR = @@ -53,6 +53,9 @@ find_nonzero.o: find_nonzero.c find_nonzero.h $(CC) $(CFLAGS_OPT) -c $< $(SSE) +find_nonzero_avx.o: find_nonzero_avx.c find_nonzero.h + $(CC) $(CFLAGS_OPT) -mavx2 -c $< + libfalloc: dd_rescue.c $(HEADERS) $(OBJECTS) $(CC) $(CFLAGS) -DHAVE_LIBFALLOCATE=1 $(DEFINES) $< $(OUT) $(OBJECTS) -lfallocate @@ -75,14 +78,19 @@ strip -S $< clean: - rm -f $(TARGETS) $(OBJECTS) dd_rescue.o core test log find_nonzero fmt_no + rm -f $(TARGETS) $(OBJECTS) dd_rescue.o core test log find_nonzero fmt_no file_zblock find_nonzero_avx.o find_nonzero_avx find_nonzero: find_nonzero.c find_nonzero.h $(CC) $(CFLAGS_OPT) -o $@ $< -DTEST $(SSE) +find_nonzero_avx: find_nonzero.c find_nonzero.h find_nonzero_avx.o + $(CC) $(CFLAGS_OPT) -o $@ $< -DHAVE_AVX2 -DTEST $(SSE) find_nonzero_avx.o + fmt_no: fmt_no.c fmt_no.h $(CC) $(CFLAGS) -o $@ $< -DTEST +file_zblock: file_zblock.c find_nonzero.h find_nonzero.c find_nonzero.o + $(CC) $(CFLAGS) -o $@ $< find_nonzero.o distclean: clean rm -f *~ @@ -99,3 +107,9 @@ $(INSTALL) $(INSTASROOT) -m 644 dd_rescue.1 $(MANDIR)/man1/ gzip -9 $(MANDIR)/man1/dd_rescue.1 +check: $(TARGETS) find_nonzero + ./dd_rescue -apP dd_rescue dd_rescue.copy + cmp dd_rescue dd_rescue.copy + ./find_nonzero 2 + rm dd_rescue.copy + diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dd_rescue/dd_rescue.1 new/dd_rescue/dd_rescue.1 --- old/dd_rescue/dd_rescue.1 2013-07-24 16:47:30.000000000 +0200 +++ new/dd_rescue/dd_rescue.1 2013-07-26 00:00:15.000000000 +0200 @@ -1,4 +1,4 @@ -.\" $Id: dd_rescue.1,v 1.12 2013/07/16 09:53:50 garloff Exp $ +.\" $Id: dd_rescue.1,v 1.13 2013/07/25 11:57:34 garloff Exp $ . .TH dd_rescue 1 "2013-02-24" "Kurt Garloff" "Rescue copy tool" . @@ -359,7 +359,7 @@ .TP 8 .BR \-d ", " \-\-odir_in instructs -.b dd_rescue +.B dd_rescue to open .IR infie with O_DIRECT, bypassing the kernel buffers. While this option has a negative diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dd_rescue/dd_rescue.c new/dd_rescue/dd_rescue.c --- old/dd_rescue/dd_rescue.c 2013-07-24 16:47:30.000000000 +0200 +++ new/dd_rescue/dd_rescue.c 2013-08-02 00:16:39.000000000 +0200 @@ -45,7 +45,7 @@ # define __COMPILER__ "(unknown compiler)" #endif -#define ID "$Id: dd_rescue.c,v 1.214 2013/07/23 22:28:55 garloff Exp $" +#define ID "$Id: dd_rescue.c,v 1.218 2013/08/01 22:16:39 garloff Exp $" #ifndef BUF_SOFTBLOCKSIZE # define BUF_SOFTBLOCKSIZE 65536 @@ -146,7 +146,8 @@ char identical, preserve, falloc, dosplice; char i_chr, o_chr; char i_repeat, i_rep_init; -int i_rep_zero, prng_seed; +size_t i_rep_zero; +int prng_seed; char noextend, avoidwrite; char prng_libc, prng_frnd; char bsim715, bsim715_4, bsim715_2, bsim715_2ndpass; @@ -562,12 +563,21 @@ } #endif +float floatrate4 = 0.0; +float floatrate32 = 0.0; void doprint(FILE* const file, const unsigned int bs, const clock_t cl, const float t1, const float t2, const int sync) { float avgrate = (float)xfer/t1; float currate = (float)(xfer-lxfer)/t2; const char *bold = BOLD, *norm = NORM; + if (!floatrate4) { + floatrate4 = currate; + floatrate32 = currate; + } else { + floatrate4 = (floatrate4 * 3 + currate)/ 4; + floatrate32 = (floatrate32*31 + currate)/32; + } if (nocol || (file != stderr && file != stdout)) { bold = ""; norm = ""; } @@ -581,7 +591,7 @@ fmt_int(10, 1, 1024, sxfer, bold, norm, 1)); if (sync || (file != stdin && file != stdout) ) fprintf(file, " +curr.rate:%skB/s, avg.rate:%skB/s, avg.load:%s%%\n", - fmt_int(9, 0, 1024, currate, bold, norm, 1), + fmt_int(9, 0, 1024, floatrate4, bold, norm, 1), fmt_int(9, 0, 1024, avgrate, bold, norm, 1), fmt_int(3, 1, 10, (cl-startclock)/(t1*(CLOCKS_PER_SEC/1000)), bold, norm, 1)); else @@ -594,7 +604,7 @@ if (in_report) sec = 0.5 + t1; else - sec = 0.5 + (estxfer-xfer)/avgrate; + sec = 0.5 + 2*(estxfer-xfer)/(avgrate+floatrate32); int hour = sec / 3600; int min = (sec % 3600) / 60; sec = sec % 60; @@ -844,7 +854,7 @@ } /** is the block zero ? */ -static int blockiszero(const unsigned char* blk, const size_t ln) +static ssize_t blockiszero(const unsigned char* blk, const size_t ln) { if (i_repeat && i_rep_zero) return i_rep_zero; @@ -2054,8 +2064,12 @@ } if (o_chr && opos != 0) { - fplog(stderr, FATAL, "outfile not seekable, but opos !=0 requested!\n"); - cleanup(); exit(19); + if (force) + fplog(stderr, WARN, "ignore non-seekable output with opos != 0 due to --force\n"); + else { + fplog(stderr, FATAL, "outfile not seekable, but opos !=0 requested!\n"); + cleanup(); exit(19); + } } if (i_chr && ipos != 0) { fplog(stderr, FATAL, "infile not seekable, but ipos !=0 requested!\n"); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dd_rescue/file_zblock.c new/dd_rescue/file_zblock.c --- old/dd_rescue/file_zblock.c 1970-01-01 01:00:00.000000000 +0100 +++ new/dd_rescue/file_zblock.c 2013-08-01 23:22:05.000000000 +0200 @@ -0,0 +1,56 @@ +#define _GNU_SOURCE 1 +#include <stdio.h> +#include <sys/file.h> +#include <unistd.h> +#include <stdlib.h> +#include <errno.h> + +#include "find_nonzero.h" + +#define BUFSZ (64*1024) +unsigned char buf[BUFSZ]; + +void usage() +{ + fprintf(stderr, "Usage: file_zblock FILE1 [FILE2 [FILE3 [...]]]\n"); + fprintf(stderr, "file_zblock reports files with ()at least) chunk-sized blocks of zeros inside.\n"); + exit(0); +} + +int main(int argc, char *argv[]) +{ + int zf = 0; + int chunksz = 4096; + int i = 1, off; + if (argc < 2) + usage(); + if (!memcmp(argv[1], "-c", 2)) { + if (strlen(argv[1]) > 2) { + chunksz = atoi(argv[1]+2); + ++i; + } else { + chunksz = atoi(argv[2]); + i += 2; + } + } + for (; i < argc; ++i) { + int fd = open(argv[i], O_RDONLY); + if (fd<0) { + fprintf(stderr, "ERROR opening file %s: %s\n", argv[i], strerror(errno)); + continue; + } + int rd, found = 0; + while ((rd = read(fd, buf, BUFSZ)) > 0 && !found) { + for (off = 0; off < rd; off += chunksz) { + unsigned int tocheck = rd-off > chunksz? chunksz: rd-off; + if (find_nonzero(buf+off, tocheck) == tocheck) { + ++found; ++zf; + printf("%s,%i\n", argv[i], off); + break; + } + } + } + close(fd); + } + return zf; +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dd_rescue/find_nonzero.c new/dd_rescue/find_nonzero.c --- old/dd_rescue/find_nonzero.c 2013-07-24 16:47:30.000000000 +0200 +++ new/dd_rescue/find_nonzero.c 2013-08-02 13:33:21.000000000 +0200 @@ -7,7 +7,8 @@ #define IN_FINDZERO #include "find_nonzero.h" -#if defined(__i386__) || defined(__x86_64__) +#if defined(TEST) && (defined(__i386__) || defined(__x86_64__)) +/** Just for testing the speed of the good old x86 string instructions */ size_t find_nonzero_rep(const unsigned char* blk, const size_t ln) { unsigned long register res; @@ -30,15 +31,58 @@ #ifdef __SSE2__ #include <emmintrin.h> -size_t find_nonzero_simd(const unsigned char* blk, const size_t ln) +#ifdef TEST +/** SSE2 version for measuring the initial zero bytes of aligned blk */ +size_t find_nonzero_sse2o(const unsigned char* blk, const size_t ln) { - __m128i xmm, zero = _mm_setzero_si128(); - unsigned /*long*/ register eax; + __m128i register xmm; + const __m128i register zero = _mm_setzero_si128(); +#ifdef SIMD_XOR + const __m128i register mask = _mm_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1); +#endif + unsigned register eax; size_t i = 0; + //asm(".align 32"); for (; i < ln; i+= 16) { xmm = _mm_load_si128((__m128i*)(blk+i)); +#ifdef BUGGY_136 _mm_cmpeq_epi8(xmm, zero); eax = _mm_movemask_epi8(xmm); +#else + xmm = _mm_cmpeq_epi8(xmm, zero); +#ifdef SIMD_XOR + xmm = _mm_xor_si128(xmm, mask); +#endif + eax = _mm_movemask_epi8(xmm); +#endif /* BUGGY **/ +#if defined(SIMD_XOR) || defined(BUGGY_136) + if (eax) + return i + myffs(eax)-1; +#else + if (eax != 0xffff) + return i + myffs(eax^0xffff)-1; +#endif + } + return ln; +} +#endif + +/** SSE2 version for measuring the initial zero bytes of 16b aligned blk */ +size_t find_nonzero_sse2(const unsigned char* blk, const size_t ln) +{ + register __m128i xmm0, xmm1; + register const __m128i zero = _mm_setzero_si128(); + register unsigned int eax, ebx; + size_t i = 0; + //asm(".p2align 5"); + for (; i < ln; i+= 32) { + //xmm0 = _mm_load_si128((__m128i*)(blk+i)); + //xmm1 = _mm_load_si128((__m128i*)(blk+i+16)); + xmm0 = _mm_cmpeq_epi8(*(__m128i*)(blk+i), zero); + xmm1 = _mm_cmpeq_epi8(*(__m128i*)(blk+i+16), zero); + eax = _mm_movemask_epi8(xmm0); + ebx = _mm_movemask_epi8(xmm1); + eax = ~(eax | (ebx << 16)); if (eax) return i + myffs(eax)-1; } @@ -46,11 +90,12 @@ } #ifdef NEED_SIMD_RUNTIME_DETECTION +/** Issue an SSE2 insn for runtime detection of SSE2 capability (x86) */ +volatile __m128d _probe_xmm; void probe_simd() { - volatile __m128d xmm; double val = 3.14159265358979323844; - xmm = _mm_set_sd(val); + _probe_xmm = _mm_set_sd(val); } #endif /* NEED_SIMD_RUNTIME_DETECTION */ @@ -58,13 +103,15 @@ #if defined(__arm__) -/* Inspired by Linaro's strlen() implementation; - we don't even need NEON here, ldmia does the 3x speedup on A-9 */ -size_t find_nonzero_simd(const unsigned char *blk, const size_t ln) +/** ASM optimized version for ARM. + * Inspired by Linaro's strlen() implementation; + * we don't even need NEON here, ldmia does the 3x speedup on Cortexes */ +size_t find_nonzero_arm6(const unsigned char *blk, const size_t ln) { register unsigned char* res; const register unsigned char* end = blk+ln; asm volatile( + //".align 4 \n" "1: \n" " ldmia %0!,{r2,r3} \n" " cmp r2, #0 \n" @@ -80,7 +127,8 @@ " mov r3, r2 \n" "3: \n" " sub %0, #4 \n" -#ifndef __ARMEB__ /* Little endian bitmasks */ +//#ifndef __ARMEB__ /* Little endian bitmasks */ +#if __BYTE_ORDER == __LITTLE_ENDIAN " tst r3, #0xff \n" " bne 10f \n" " add %0, #1 \n" @@ -105,10 +153,10 @@ : "r2", "r3"); return res-blk; } +#define find_nonzero_simd find_nonzero_arm6 #endif - #ifdef TEST #include <string.h> #include <time.h> @@ -129,13 +177,41 @@ gettimeofday(&t2, NULL); \ tdiff = t2.tv_sec-t1.tv_sec + 0.000001*(t2.tv_usec-t1.tv_usec); \ printf("%7i x %20s (%8i): %8i (%6.3fs => %5.0fMB/s)\n", \ - rep, #routine, sz, ln, tdiff, (double)(rep)*(double)(sz)/(1024*1024*tdiff)) + rep, #routine, sz, ln, tdiff, (double)(rep)*(double)(sz+1)/(1024*1024*tdiff)); \ + if (ln != (tsz<sz? tsz: sz)) \ + abort() + + +#define TEST2C(sz,routine,rep,tsz) \ + memset(buf, 0, tsz); \ + buf[sz] = 0x4c; \ + gettimeofday(&t1, NULL); \ + for (i = 0; i < rep; ++i) { \ + mem_clobber; \ + ln = routine(buf, tsz); \ + } \ + gettimeofday(&t2, NULL); \ + tdiff = t2.tv_sec-t1.tv_sec + 0.000001*(t2.tv_usec-t1.tv_usec); \ + printf("%7i x %20s (%8i): %8i (%6.3fs => %5.0fMB/s)\n", \ + rep, #routine, sz, ln, tdiff, (double)(rep)*(double)(sz+1)/(1024*1024*tdiff)); \ + if (ln != (tsz<sz? tsz: sz)) \ + abort() #if defined(HAVE_SIMD) #define TEST_SIMD(a,b,c,d) TESTC(a,b,c,d) +#define TEST2_SIMD(a,b,c,d) TEST2C(a,b,c,d) #else #define TEST_SIMD(a,b,c,d) do {} while (0) +#define TEST2_SIMD(a,b,c,d) do {} while (0) +#endif + +#ifdef __SSE2__ +#define TEST_SIMD2(a,b,c,d) TESTC(a,b,c,d) +#define TEST2_SIMD2(a,b,c,d) TEST2C(a,b,c,d) +#else +#define TEST_SIMD2(a,b,c,d) do {} while (0) +#define TEST2_SIMD2(a,b,c,d) do {} while (0) #endif #if defined(HAVE_NONZERO_REP) @@ -144,10 +220,17 @@ #define TEST_REP(a,b,c,d) do {} while (0) #endif +#define TESTFFS(val) printf("%08x: last %i first %i\n", val, myffsl(val), myflsl(val)); +#if __WORDSIZE == 64 +#define TESTFFS64(val) printf("%016Lx: last %i first %i\n", val, myffsl(val), myflsl(val)); +#else +#define TESTFFS64(val) do {} while (0) +#endif + int main(int argc, char* argv[]) { - unsigned char* obuf = (unsigned char*)malloc(SIZE+15); - unsigned char* buf = obuf+15; + unsigned char* obuf = (unsigned char*)malloc(SIZE+31); + unsigned char* buf = (obuf+31)-((unsigned long)(obuf+31)%32); struct timeval t1, t2; int i, ln = 0; double tdiff; @@ -155,15 +238,22 @@ #ifdef NEED_SIMD_RUNTIME_DETECTION detect_simd(); #endif + TESTFFS(0x05000100); + TESTFFS(0x00900002); + TESTFFS(0x00000100); + TESTFFS(0x80400000); + TESTFFS64(0x0030000000000100ULL); + TESTFFS64(0x1000000000000000ULL); + TESTFFS64(0x0000000000001000ULL); + if (argc > 1) scale = atoi(argv[1]); - buf -= (unsigned long)buf%16; memset(buf, 0xa5, SIZE); - TESTC(0, find_nonzero_c, 1024*256*scale/16, SIZE); - TEST_SIMD(0, find_nonzero_simd, 1024*256*scale/16, SIZE); - TESTC(0, find_nonzero, 1024*256*scale/16, SIZE); - TEST_REP(0, find_nonzero_rep, 1024*256*scale/16, SIZE); + TESTC(0, find_nonzero_c, 1024*1024*scale/16, SIZE); + TEST_SIMD(0, find_nonzero_simd, 1024*1024*scale/16, SIZE); + TESTC(0, find_nonzero, 1024*1024*scale/16, SIZE); + TEST_REP(0, find_nonzero_rep, 1024*1024*scale/16, SIZE); TESTC(8*1024-15, find_nonzero_c, 1024*256*scale/16, SIZE); TEST_SIMD(8*1024-15, find_nonzero_simd, 1024*256*scale/16, SIZE); @@ -175,6 +265,7 @@ buf--; TESTC(32*1024-9, find_nonzero_c, 1024*64*scale/16, SIZE); TEST_SIMD(32*1024-9, find_nonzero_simd, 1024*64*scale/16, SIZE); + TEST_SIMD2(32*1024-9, find_nonzero_sse2o, 1024*64*scale/16, SIZE); TESTC(32*1024-9, find_nonzero, 1024*64*scale/16, SIZE); TEST_REP(32*1024-9, find_nonzero_rep, 1024*64*scale/16, SIZE); TESTC(128*1024-8, find_nonzero_c, 1024*16*scale/16, SIZE); @@ -190,15 +281,19 @@ TESTC(64*1024*1024, find_nonzero_c, 32*scale/16, SIZE); TEST_SIMD(64*1024*1024, find_nonzero_simd, 32*scale/16, SIZE); - TESTC(64*1024*1024, find_nonzero_c, 32*scale/16, SIZE-16); - TEST_SIMD(64*1024*1024, find_nonzero_simd, 32*scale/16, SIZE-16); - TESTC(64*1024*1024, find_nonzero, 32*scale/16, SIZE-16); - TEST_REP(64*1024*1024, find_nonzero_rep, 32*scale/16, SIZE-16); - - TESTC(64*1024*1024, find_nonzero_c, 32*scale/16, SIZE-5); - TEST_SIMD(64*1024*1024, find_nonzero_simd, 32*scale/16, SIZE-5); - TESTC(64*1024*1024, find_nonzero, 32*scale/16, SIZE-5); - TEST_REP(64*1024*1024, find_nonzero_rep, 32*scale/16, SIZE-5); + TESTC(64*1024*1024, find_nonzero_c, 1+scale/16, SIZE-16); + TEST_SIMD(64*1024*1024, find_nonzero_simd, 1+scale/16, SIZE-16); + TESTC(64*1024*1024, find_nonzero, 1+scale/16, SIZE-16); + TEST_REP(64*1024*1024, find_nonzero_rep, 1+scale/16, SIZE-16); + + TESTC(64*1024*1024, find_nonzero_c, 1+scale/16, SIZE-5); + TEST_SIMD(64*1024*1024, find_nonzero_simd, 1+scale/16, SIZE-5); + TESTC(64*1024*1024, find_nonzero, 1+scale/16, SIZE-5); + TEST_REP(64*1024*1024, find_nonzero_rep, 1+scale/16, SIZE-5); + + TEST2C(12*1024*1024, find_nonzero_c, 160*scale/16, SIZE); + TEST2_SIMD(12*1024*1024, find_nonzero_simd, 160*scale/16, SIZE); + TEST2_SIMD2(12*1024*1024, find_nonzero_sse2o, 160*scale/16, SIZE); free(obuf); return 0; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dd_rescue/find_nonzero.h new/dd_rescue/find_nonzero.h --- old/dd_rescue/find_nonzero.h 2013-07-24 16:47:30.000000000 +0200 +++ new/dd_rescue/find_nonzero.h 2013-08-02 13:33:21.000000000 +0200 @@ -23,17 +23,65 @@ # else # define myffsl(x) _mm_popcnt_u32(x^(~(-x))) # endif -#else +#else /* NOFFS */ # define myffsl(x) myffs(x) +/** Find first (lowest) bit set in word val, returns a val b/w 1 and __WORDSIZE, 0 if no bit is set */ static inline int myffsl(unsigned long val) { - int i; - for (i = 1; i <= sizeof(val)*8; ++i) { - if (val & 0x01) - return i; - val >>= 1; + int res = 1; + if (!val) + return 0; +#if __WORDSIZE == 64 + unsigned int vlo = val; + unsigned int vhi = val >> 32; + if (!vlo) { + res += 32; + vlo = vhi; + } +#else + unsigned int vlo = val; +#endif + unsigned int mask = 0x0000ffff; + unsigned int shift = 16; + while (shift > 0) { + if (!(vlo & mask)) { + res += shift; + vlo >>= shift; + } + shift >>= 1; + mask >>= shift; } - return 0; + return res; +} +#endif +#if __BYTE_ORDER == __BIG_ENDIAN || defined(TEST) +/** Find last (highest) bit set in word val, returns a val b/w __WORDSIZE and 1, 0 if no bit is set */ +static inline int myflsl(unsigned long val) +{ + int res = __WORDSIZE; + if (!val) + return 0; +#if __WORDSIZE == 64 + unsigned int vlo = val; + unsigned int vhi = val >> 32; + if (!vhi) { + res -= 32; + vhi = vlo; + } +#else + unsigned int vhi = val; +#endif + unsigned int mask = 0xffff0000; + unsigned int shift = 16; + while (shift > 0) { + if (!(vhi & mask)) { + res -= shift; + vhi <<= shift; + } + shift >>= 1; + mask <<= shift; + } + return res; } #endif @@ -78,6 +126,14 @@ #if defined(HAVE_SSE2) || defined(__arm__) #define HAVE_SIMD +#ifdef HAVE_AVX2 +#define find_nonzero_simd find_nonzero_avx2 +#elif defined(HAVE_SSE2) +#define find_nonzero_simd find_nonzero_sse2 +#elif defined(__arm__) +#define find_nonzero_simd find_nonzero_arm6 +#endif + /* FIXME: Is there no library function to find the first non-null byte? * Something like ffs() for a long byte array? * Here is an optimized version using SSE2 intrinsics, but there should be @@ -90,29 +146,39 @@ /* No need for runtime detection here */ const static char have_simd = 0; #endif -/** return length of zero bytes */ +/** return number of bytes at beginning of blk that are all zero, assumes __WORDSIZE bit alignment */ static size_t find_nonzero_c(const unsigned char* blk, const size_t ln) { const unsigned long* ptr = (const unsigned long*)blk; const unsigned long* const bptr = ptr; for (; (size_t)(ptr-bptr) < ln/sizeof(*ptr); ++ptr) if (*ptr) +#if __BYTE_ORDER == __BIG_ENDIAN + return sizeof(unsigned long)*(ptr-bptr) + sizeof(long)-((myflsl(*ptr)+7)>>3); +#else return sizeof(unsigned long)*(ptr-bptr) + ((myffsl(*ptr)-1)>>3); +#endif return ln; } -/* Generic version, does not require an aligned buffer blk */ +/** return number of bytes at beginning of blk that are all zero + * Generic version, does not require an aligned buffer blk or even ln ... */ inline static size_t find_nonzero(const unsigned char* blk, const size_t ln) { - const int off = ((unsigned long)blk) % 16; - if (off) { - int i; - for (i = 0; i < 16-off; ++i) - if (blk[i]) - return i; - return i+find_nonzero_opt(blk+i, ln-i); - } else - return find_nonzero_opt(blk, ln); + const int off = (-(unsigned char)(unsigned long)blk) & 0x1f; + size_t remain = ln - off; + size_t i; + for (i = 0; i < off; ++i) + if (blk[i]) + return i; + int r2 = remain % 0x1f; + size_t res = find_nonzero_opt(blk+off, remain-r2); + if (!r2 || res != remain-r2) + return off+res; + for (i = off+remain; i < ln; ++i) + if (blk[i]) + return i; + return ln; } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dd_rescue/find_nonzero_avx.c new/dd_rescue/find_nonzero_avx.c --- old/dd_rescue/find_nonzero_avx.c 1970-01-01 01:00:00.000000000 +0100 +++ new/dd_rescue/find_nonzero_avx.c 2013-08-02 13:37:31.000000000 +0200 @@ -0,0 +1,41 @@ +/** find_nonzero_avx.c + * AVX2 optimized search for non-zero bytes + * taken straight from SSE2 and adapted to use AVX registers + * Needs recent (2.23+) binutils to compile ... + * (c) Kurt Garloff <[email protected]>, 2013 + * License: GNU GPL v2 or v3 + */ + +#define _GNU_SOURCE 1 +#include "find_nonzero.h" +size_t find_nonzero_sse2(const unsigned char* blk, const size_t ln); + +#ifdef __AVX2__ +#if defined(__GNUC__) || defined(__llvm__) +# warning AVX2 version untested and runtime detection only with gcc 4.8+ +#endif +#include <immintrin.h> +/** AVX2 version for measuring the initial zero bytes of 32b aligned blk */ +size_t find_nonzero_avx2(const unsigned char* blk, const size_t ln) +{ +#if defined( __GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) + if (!(__builtin_cpu_supports("avx2"))) + return find_nonzero_sse2(blk, ln); +#endif + __m256i register ymm; + const __m256i register zero = _mm256_setzero_si256(); + unsigned register eax; + size_t i = 0; + //asm(".p2align 5"); + for (; i < ln; i+= 32) { + //ymm = _mm256_load_si256((__m256i*)(blk+i)); + ymm = _mm256_cmpeq_epi8(*(__m256i*)(blk+i), zero); + eax = ~(_mm256_movemask_epi8(ymm)); + if (eax) + return i + myffs(eax)-1; + } + return ln; +} +#endif + + -- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
