Hello community,

here is the log from the commit of package duperemove for openSUSE:Factory 
checked in at 2014-04-11 13:28:10
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/duperemove (Old)
 and      /work/SRC/openSUSE:Factory/.duperemove.new (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "duperemove"

Changes:
--------
--- /work/SRC/openSUSE:Factory/duperemove/duperemove.changes    2014-03-15 
17:36:22.000000000 +0100
+++ /work/SRC/openSUSE:Factory/.duperemove.new/duperemove.changes       
2014-04-11 13:28:12.000000000 +0200
@@ -1,0 +2,10 @@
+Fri Apr 11 00:30:41 UTC 2014 - [email protected]
+
+- update to duperemove v0.06. This adds several fixes and features:
+        - fixes bnc#871804 (duperemove not looping on entire range)
+
+- also includes important usability fixes
+
+- updates hashing library to libgcrypt to reflect upstream
+
+-------------------------------------------------------------------

Old:
----
  duperemove-v0.04.tar.gz

New:
----
  duperemove-v0.06.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ duperemove.spec ++++++
--- /var/tmp/diff_new_pack.7Bs4gt/_old  2014-04-11 13:28:12.000000000 +0200
+++ /var/tmp/diff_new_pack.7Bs4gt/_new  2014-04-11 13:28:12.000000000 +0200
@@ -15,19 +15,19 @@
 # Please submit bugfixes or comments via http://bugs.opensuse.org/
 #
 
+
 %define modname duperemove
 
 Name:           duperemove
 BuildRequires:  gcc-c++
-BuildRequires:  mhash-devel
-Version:        0.04
+BuildRequires:  libgcrypt-devel
+Version:        0.06
 Release:        0
 Summary:        Software to find duplicate extents in files and remove them
 License:        GPL-2.0
 Group:          System/Filesystems
 Url:            https://github.com/markfasheh/duperemove
 Source:         %{modname}-v%{version}.tar.gz
-Requires:       mhash
 BuildRoot:      %{_tmppath}/%{name}-%{version}-build
 
 %description
@@ -38,8 +38,8 @@
 %define         samename btrfs-extent-same
 %package -n btrfs-extent-same
 Summary:        Debug/Test tool to exercise the btrfs out-of-band 
deduplication ioctl
-License:        GPL-2.0
 Group:          System/Filesystems
+
 %description -n btrfs-extent-same
 Debug/Test tool to exercise a btrfs ioctl for deduplicating file regions.
 

++++++ duperemove-v0.04.tar.gz -> duperemove-v0.06.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/duperemove-v0.04/Makefile 
new/duperemove-v0.06/Makefile
--- old/duperemove-v0.04/Makefile       2014-03-12 07:19:56.000000000 +0100
+++ new/duperemove-v0.06/Makefile       2014-04-11 02:11:50.000000000 +0200
@@ -1,16 +1,27 @@
 CC=gcc
-RELEASE=v0.04
+RELEASE=v0.06
 CFLAGS=-Wall -ggdb -D_FILE_OFFSET_BITS=64 -DVERSTRING=\"$(RELEASE)\"
-LIBRARY_FLAGS=-lmhash
 
 MANPAGES=duperemove.8 btrfs-extent-same.8
 
-DIST_SOURCES=csum.c csum.h duperemove.c hash-tree.c hash-tree.h results-tree.c 
results-tree.h kernel.h LICENSE list.h Makefile rbtree.c rbtree.h rbtree.txt 
README TODO dedupe.c dedupe.h btrfs-ioctl.h filerec.c filerec.h $(MANPAGES) 
btrfs-extent-same.c
+DIST_SOURCES=csum-gcrypt.c csum-mhash.c csum.h duperemove.c hash-tree.c 
hash-tree.h results-tree.c results-tree.h kernel.h LICENSE list.h Makefile 
rbtree.c rbtree.h rbtree.txt README TODO dedupe.c dedupe.h btrfs-ioctl.h 
filerec.c filerec.h $(MANPAGES) btrfs-extent-same.c
 DIST=duperemove-$(RELEASE)
 DIST_TARBALL=$(DIST).tar.gz
 TEMP_INSTALL_DIR:=$(shell mktemp -du -p .)
 
-objects = duperemove.o rbtree.o csum.o hash-tree.o results-tree.o dedupe.o 
filerec.o
+hash_obj=csum-gcrypt.o
+crypt_CFLAGS=$(shell libgcrypt-config --cflags)
+crypt_LIBS=$(shell libgcrypt-config --libs)
+ifdef USE_MHASH
+       hash_obj=csum-mhash.o
+       crypt_CFLAGS=
+       crypt_LIBS=-lmhash
+endif
+
+CFLAGS += $(crypt_CFLAGS)
+LIBRARY_FLAGS += $(crypt_LIBS)
+
+objects = duperemove.o rbtree.o hash-tree.o results-tree.o dedupe.o filerec.o 
$(hash_obj)
 progs = duperemove
 
 all: $(progs) kernel.h list.h btrfs-ioctl.h
@@ -27,5 +38,8 @@
 btrfs-extent-same: btrfs-extent-same.c
        $(CC) -Wall -o btrfs-extent-same btrfs-extent-same.c
 
+csum-test: $(hash_obj) csum-test.c
+       $(CC) -Wall $(hash_obj) $(CFLAGS) $(LIBRARY_FLAGS) -o csum-test 
csum-test.c
+
 clean:
-       rm -fr $(objects) $(progs) $(DIST_TARBALL) btrfs-extent-same *~
+       rm -fr $(objects) $(progs) $(DIST_TARBALL) btrfs-extent-same csum-*.o *~
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/duperemove-v0.04/README new/duperemove-v0.06/README
--- old/duperemove-v0.04/README 2014-03-12 07:19:56.000000000 +0100
+++ new/duperemove-v0.06/README 2014-04-11 02:11:50.000000000 +0200
@@ -1,16 +1,75 @@
-duperemove v0.04
-Find duplicate extents and print them to stdout
+Duperemove
 
-Usage: ./duperemove [-r] [-D] [-A] [-b blocksize-in-K] [-v] [-d] OBJECTS
-Where "OBJECTS" is a list of files (or directories) which
-we want to find duplicate extents in. If a directory is 
-specified, all regular files inside of it will be scanned.
-
-       <switches>
-       -r              Enable recursive dir traversal.
-       -D              De-dupe the results - only works on btrfs.
-       -A              Opens files readonly when deduping. Primarily for use 
by privileged users on readonly snapshots
-       -b bsize        Use bsize blocks - specify in kilobytes. Default is 128.
-       -v              Be verbose.
-       -d              Print debug messages, forces -v if selected.
-       -h              Prints this help text.
+Duperemove is a simple tool for finding duplicated extents and
+submitting them for deduplication. When given a list of files it will
+hash their contents on a block by block basis and compare those hashes
+to each other, finding and categorizing extents that match each
+other. When given the optional -D option, duperemove will submit those
+extents for deduplication using the btrfs-extent-same ioctl.
+
+Duperemove has two major modes of operation, one of which is a subset
+of the other.
+
+
+Readonly / Non-deduplicating Mode
+
+When run without -D (the default) duperemove will print out one or
+more tables of matching extents it has determined would be ideal
+candidates for deduplication. As a result, readonly mode is useful for
+seeing what duperemove might do when run with '-D'. The output could
+also be used by some other software to submit the extents for
+deduplication at a later time.
+
+It is important to note that this mode will not print out *all*
+instances of matching extents, just those it would consider for
+deduplication.
+
+Another important note is that duperemove does not concern itself with
+the underlying representation of the extents. Some of them could be
+compressed, undergoing I/O, or even have already been deduplicated. In
+dedupe mode, the kernel handles those details and therefore we try not
+to replicate that work. Think of duperemove as trying for 'bulk'
+deduplication.
+
+
+Deduping Mode
+
+This functions similarly to readonly mode with the exception that the
+duplicated extents found in our "read hash and compare" step will
+actually be submitted for deduplication. At the end, a total count of
+bytes that were processed by the kernel will be printed.
+
+Keep in mind that the bytecount we report here (received from the
+kernel) is NOT the total amount deduplicated but rather a count of the
+amount of data it also found to be identical.
+
+See the duperemove man page for further details about running duperemove.
+
+
+FAQ
+
+* Is there an upper limit to the amount of data duperemove can process?
+
+Right now duperemove has been tested on small numbers of VMs or ISO
+files (5-10). I don't believe there should be a major problem scaling
+that up to 50 or so.
+
+
+* Why does it not print out all duplicate extents?
+
+Internally duperemove is classifying extents based on various criteria
+like length, number of identical extents, etc. The printout we give is
+based on the results of that classification.
+
+
+* How can I find out my space savings after a dedupe?
+
+The easiest way to do this would be a df before the dedupe operation,
+then a df about 60 seconds after the operation. It is common for btrfs
+space reporting to be 'behind' while delayed updates get processed, so
+an immediate df after deduping might not show any savings.
+
+
+USAGE EXAMPLES
+
+TODO
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/duperemove-v0.04/TODO new/duperemove-v0.06/TODO
--- old/duperemove-v0.04/TODO   2014-03-12 07:19:56.000000000 +0100
+++ new/duperemove-v0.06/TODO   2014-04-11 02:11:50.000000000 +0200
@@ -1,15 +1,22 @@
-HIGH LEVEL
+- Allow checking for similar files by some criteria (approximate size,
+  file magic type, file extension, etc)
 
-- Test on actual VM images
+- Limit the window for duplicate extent calculation to limit overall time
+  spent looking for duplicates
 
-- Add code to take a directory as a command line option
-
-LOW LEVEL
-
-- Replace ugly pointer comparison in walk_dupe_block()
+- Store results of our search to speed up subsequent runs.
+  - In particular, store file system transaction ids so we can use the
+    equivalent of btrfs's find-new
 
 - Wrap bytes <-> blocks conversions
 
 - Possibly use mmap (with madvise) for the checksumming phase
 
 - Do checksumming in seperate threads
+
+- Add an optional mode to do duplicate checking with resolution of extent
+  owners (expensive).
+
+- Allow duperemove to take as input its previous output so a user
+  could run in readonly mode, record dupes and then pass them later to
+  duperemove for potential deduplication.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/duperemove-v0.04/csum-gcrypt.c 
new/duperemove-v0.06/csum-gcrypt.c
--- old/duperemove-v0.04/csum-gcrypt.c  1970-01-01 01:00:00.000000000 +0100
+++ new/duperemove-v0.06/csum-gcrypt.c  2014-04-11 02:11:50.000000000 +0200
@@ -0,0 +1,106 @@
+/*
+ * csum-gcrypt.c
+ *
+ * Copyright (C) 2014 SUSE.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <gcrypt.h>
+
+#include "csum.h"
+
+#define        HASH_FUNC       GCRY_MD_SHA256
+
+unsigned int digest_len = 0;
+
+void checksum_block(char *buf, int len, unsigned char *digest)
+{
+       gcry_md_hash_buffer(HASH_FUNC, digest, buf, len);
+}
+
+int init_hash(void)
+{
+       /*
+        * Version check should be the very first call because it makes sure
+        * that important subsystems are initialized.
+        */
+       if (!gcry_check_version(GCRYPT_VERSION))
+               return 1;
+
+       /* Disable secure memory.  */
+       gcry_control(GCRYCTL_DISABLE_SECMEM, 0);
+
+       /* Tell Libgcrypt that initialization has completed. */
+       gcry_control(GCRYCTL_INITIALIZATION_FINISHED, 0);
+
+       if (gcry_md_test_algo(HASH_FUNC))
+               return 1;
+
+       digest_len = gcry_md_get_algo_dlen(HASH_FUNC);
+       if (!digest_len)
+               return 1;
+
+       if (digest_len == 0 || digest_len > DIGEST_LEN_MAX)
+               abort();
+
+       return 0;
+}
+
+void debug_print_digest(FILE *stream, unsigned char *digest)
+{
+       int i;
+
+       for (i = 0; i < digest_len; i++)
+               fprintf(stream, "%.2x", digest[i]);
+}
+
+struct running_checksum {
+       gcry_md_hd_t    hd;
+       unsigned char   digest[DIGEST_LEN_MAX];
+};
+
+struct running_checksum *start_running_checksum(void)
+{
+       struct running_checksum *c = calloc(1, sizeof(struct running_checksum));
+
+       if (c) {        
+               if (gcry_md_open(&c->hd, HASH_FUNC, 0) != GPG_ERR_NO_ERROR) {
+                       free(c);
+                       c = NULL;
+               }
+       }
+
+       return c;
+}
+
+void add_to_running_checksum(struct running_checksum *c,
+                            unsigned int len, unsigned char *buf)
+{
+       gcry_md_write(c->hd, buf, len);
+}
+
+void finish_running_checksum(struct running_checksum *c, unsigned char *digest)
+{
+       unsigned char *gcry_digest;
+
+       /* gcry_md_read() does this implicitly */
+       gcry_md_final(c->hd);
+       gcry_digest = gcry_md_read(c->hd, 0);
+       memcpy(digest, gcry_digest, digest_len);
+
+       gcry_md_close(c->hd);
+
+       free(c);
+}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/duperemove-v0.04/csum-mhash.c 
new/duperemove-v0.06/csum-mhash.c
--- old/duperemove-v0.04/csum-mhash.c   1970-01-01 01:00:00.000000000 +0100
+++ new/duperemove-v0.06/csum-mhash.c   2014-04-11 02:11:50.000000000 +0200
@@ -0,0 +1,85 @@
+/*
+ * csum-mhash.c
+ *
+ * Copyright (C) 2013 SUSE.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <mhash.h>
+
+#include "csum.h"
+
+static MHASH td;
+
+#define        HASH_FUNC       MHASH_SHA256
+
+unsigned int digest_len = 0;
+
+void checksum_block(char *buf, int len, unsigned char *digest)
+{
+       td = mhash_init(HASH_FUNC);
+       if (td == MHASH_FAILED)
+               abort();
+
+       mhash(td, buf, len);
+       mhash_deinit(td, digest);
+}
+
+int init_hash(void)
+{
+       digest_len = mhash_get_block_size(HASH_FUNC);
+       if (!digest_len)
+               return 1;
+
+       if (digest_len == 0 || digest_len > DIGEST_LEN_MAX)
+               abort();
+
+       return 0;
+}
+
+void debug_print_digest(FILE *stream, unsigned char *digest)
+{
+       int i;
+
+       for (i = 0; i < digest_len; i++)
+               fprintf(stream, "%.2x", digest[i]);
+}
+
+struct running_checksum {
+       MHASH   td;
+       unsigned char   digest[DIGEST_LEN_MAX];
+};
+
+struct running_checksum *start_running_checksum(void)
+{
+       struct running_checksum *c = calloc(1, sizeof(struct running_checksum));
+
+       if (c)
+               c->td = mhash_init(HASH_FUNC);
+
+       return c;
+}
+
+void add_to_running_checksum(struct running_checksum *c,
+                            unsigned int len, unsigned char *buf)
+{
+       mhash(c->td, buf, len);
+}
+
+void finish_running_checksum(struct running_checksum *c, unsigned char *digest)
+{
+       mhash_deinit(c->td, digest);
+       free(c);
+}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/duperemove-v0.04/csum.c new/duperemove-v0.06/csum.c
--- old/duperemove-v0.04/csum.c 2014-03-12 07:19:56.000000000 +0100
+++ new/duperemove-v0.06/csum.c 1970-01-01 01:00:00.000000000 +0100
@@ -1,84 +0,0 @@
-/*
- * csum.c
- *
- * Copyright (C) 2013 SUSE.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <mhash.h>
-
-#include "csum.h"
-
-static MHASH td;
-
-#define        HASH_FUNC       MHASH_SHA256
-
-unsigned int digest_len = 0;
-
-void checksum_block(char *buf, int len, unsigned char *digest)
-{
-       td = mhash_init(HASH_FUNC);
-       if (td == MHASH_FAILED)
-               abort();
-
-       mhash(td, buf, len);
-       mhash_deinit(td, digest);
-}
-
-int init_hash(void)
-{
-       digest_len = mhash_get_block_size(HASH_FUNC);
-       if (!digest_len)
-               return 1;
-
-       if (digest_len == 0 || digest_len > DIGEST_LEN_MAX)
-               abort();
-
-       return 0;
-}
-
-void debug_print_digest(FILE *stream, unsigned char *digest)
-{
-       int i;
-
-       for (i = 0; i < digest_len; i++)
-               fprintf(stream, "%.2x", digest[i]);
-}
-
-struct running_checksum {
-       MHASH   td;
-       unsigned char   digest[DIGEST_LEN_MAX];
-};
-
-struct running_checksum *start_running_checksum(void)
-{
-       struct running_checksum *c = calloc(1, sizeof(struct running_checksum));
-
-       if (c)
-               c->td = mhash_init(HASH_FUNC);
-
-       return c;
-}
-
-void add_to_running_checksum(struct running_checksum *c,
-                            unsigned int len, unsigned char *buf)
-{
-       mhash(c->td, buf, len);
-}
-
-void finish_running_checksum(struct running_checksum *c, unsigned char *digest)
-{
-       mhash_deinit(c->td, digest);
-}
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/duperemove-v0.04/dedupe.c 
new/duperemove-v0.06/dedupe.c
--- old/duperemove-v0.04/dedupe.c       2014-03-12 07:19:56.000000000 +0100
+++ new/duperemove-v0.06/dedupe.c       2014-04-11 02:11:50.000000000 +0200
@@ -13,16 +13,89 @@
  * General Public License for more details.
  */
 
+#include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
 
+#include "kernel.h"
+#include "list.h"
 #include "filerec.h"
 #include "dedupe.h"
 
+#ifdef DEBUG_DEDUPE
+static struct filerec *
+same_idx_to_filerec(struct dedupe_ctxt *ctxt, int idx)
+{
+       int i;
+       struct filerec *file;
+       struct list_head *lists[3] = { &ctxt->queued,
+                                     &ctxt->in_progress,
+                                     &ctxt->completed, };
+
+       for (i = 0; i < 3; i++) {
+               list_for_each_entry(file, lists[i], dedupe_list) {
+                       if (file->dedupe_idx == idx)
+                               return file;
+               }
+       }
+
+       return NULL;
+}
+
+#define _PRE   "(dedupe) "
+static void print_btrfs_same_info(struct dedupe_ctxt *ctxt)
+{
+       int i;
+       struct filerec *file = ctxt->ioctl_file;
+       struct btrfs_ioctl_same_args *same = ctxt->same;
+       struct btrfs_ioctl_same_extent_info *info;
+
+       printf(_PRE"btrfs same info: ioctl_file: \"%s\"\n",
+              file ? file->filename : "(null)");
+       printf(_PRE"logical_offset: %llu, length: %llu, dest_count: %u\n",
+              (unsigned long long)same->logical_offset,
+              (unsigned long long)same->length, same->dest_count);
+
+       for (i = 0; i < same->dest_count; i++) {
+               info = &same->info[i];
+               file = same_idx_to_filerec(ctxt, i);
+               printf(_PRE"info[%d]: name: \"%s\", fd: %llu, logical_offset: "
+                      "%llu, bytes_deduped: %llu, status: %d\n",
+                      i, file ? file->filename : "(null)", (long long)info->fd,
+                      (unsigned long long)info->logical_offset,
+                      (unsigned long long)info->bytes_deduped, info->status);
+       }
+}
+#endif
+
+static void clear_file_dedupe_info(struct filerec *file)
+{
+       file->dedupe_total = 0;
+       file->dedupe_status = 0;
+       file->dedupe_loff = 0;
+}
+
+static void clear_lists(struct dedupe_ctxt *ctxt)
+{
+       int i;
+       struct list_head *lists[3] = { &ctxt->queued,
+                                     &ctxt->in_progress,
+                                     &ctxt->completed, };
+       struct filerec *file, *tmp;
+
+       for (i = 0; i < 3; i++) {
+               list_for_each_entry_safe(file, tmp, lists[i], dedupe_list) {
+                       clear_file_dedupe_info(file);
+                       list_del_init(&file->dedupe_list);
+               }
+       }
+}
+
 void free_dedupe_ctxt(struct dedupe_ctxt *ctxt)
 {
        if (ctxt) {
-               if (ctxt->filerec_array)
-                       free(ctxt->filerec_array);
+               clear_lists(ctxt);
                if (ctxt->same)
                        free(ctxt->same);
                free(ctxt);
@@ -35,32 +108,30 @@
        struct dedupe_ctxt *ctxt = calloc(1, sizeof(*ctxt));
        struct btrfs_ioctl_same_args *same;
        unsigned int same_size;
+       unsigned int max_dest_files = max_extents - 1;
 
        if (ctxt == NULL)
                return NULL;
 
-       ctxt->filerec_array = calloc(max_extents - 1,
-                                    sizeof(*ctxt->filerec_array));
-       if (ctxt->filerec_array == NULL) {
-               free(ctxt);
-               return NULL;
-       }
-
        same_size = sizeof(*same) +
-               (max_extents - 1) * sizeof(struct btrfs_ioctl_same_extent_info);
+               max_dest_files * sizeof(struct btrfs_ioctl_same_extent_info);
        same = calloc(1, same_size);
        if (same == NULL) {
-               free(ctxt->filerec_array);
+               free(same);
                free(ctxt);
                return NULL;
        }
 
        ctxt->same = same;
+       ctxt->same_size = same_size;
 
-       ctxt->max_extents = max_extents;
-       ctxt->len = ctxt->same->length = elen;
+       ctxt->max_queable = max_dest_files;
+       ctxt->len = ctxt->orig_len = elen;
        ctxt->ioctl_file = ioctl_file;
-       ctxt->ioctl_file_off = same->logical_offset = loff;
+       ctxt->ioctl_file_off = ctxt->orig_file_off = loff;
+       INIT_LIST_HEAD(&ctxt->queued);
+       INIT_LIST_HEAD(&ctxt->in_progress);
+       INIT_LIST_HEAD(&ctxt->completed);
 
        return ctxt;
 }
@@ -68,31 +139,141 @@
 void add_extent_to_dedupe(struct dedupe_ctxt *ctxt, uint64_t loff, uint64_t 
len,
                          struct filerec *file)
 {
-       int i = ctxt->same->dest_count;
-       struct btrfs_ioctl_same_args *same = ctxt->same;
+       clear_file_dedupe_info(file);
+       file->dedupe_loff = loff;
+       list_add_tail(&file->dedupe_list, &ctxt->queued);
+
+       if (++ctxt->num_queued > ctxt->max_queable)
+               abort();
+}
+
+static int add_dedupe_request(struct dedupe_ctxt *ctxt,
+                              struct btrfs_ioctl_same_args *same,
+                              struct filerec *file)
+{
+       int same_idx = same->dest_count;
+       struct btrfs_ioctl_same_extent_info *info;
 
-       if (ctxt->same->dest_count >= ctxt->max_extents)
+       if (same->dest_count > ctxt->max_queable)
                abort();
 
-       same->info[i].logical_offset = loff;
-       same->info[i].fd = file->fd;
-       ctxt->filerec_array[i] = file;
+       info = &same->info[same_idx];
+       info->fd = file->fd;
+       info->logical_offset = file->dedupe_loff;
+       info->bytes_deduped = 0;
        same->dest_count++;
+
+#ifdef DEBUG_DEDUPE
+       printf("add request %s, off: %llu, dest: %d\n", file->filename,
+              (unsigned long long)file->dedupe_loff, same->dest_count);
+#endif
+       return same_idx;
+}
+
+static void populate_dedupe_request(struct dedupe_ctxt *ctxt,
+                                   struct btrfs_ioctl_same_args *same)
+{
+       struct filerec *file, *tmp;
+
+       memset(same, 0, ctxt->same_size);
+
+       same->length = ctxt->len;
+       same->logical_offset = ctxt->ioctl_file_off;
+
+       list_for_each_entry_safe(file, tmp, &ctxt->queued, dedupe_list) {
+               file->dedupe_idx = add_dedupe_request(ctxt, same, file);
+
+               list_move_tail(&file->dedupe_list, &ctxt->in_progress);
+       }
+}
+
+/* Moves each in-progress file back to queued, or on to completed. */
+static void process_dedupes(struct dedupe_ctxt *ctxt,
+                           struct btrfs_ioctl_same_args *same)
+{
+       int same_idx;
+       uint64_t max_deduped = 0;
+       struct btrfs_ioctl_same_extent_info *info;
+       struct filerec *file, *tmp;
+
+       list_for_each_entry_safe(file, tmp, &ctxt->in_progress, dedupe_list) {
+               same_idx = file->dedupe_idx;
+               info = &same->info[same_idx];
+
+               if (info->bytes_deduped > max_deduped)
+                       max_deduped = info->bytes_deduped;
+
+               file->dedupe_loff += info->bytes_deduped;
+               file->dedupe_total += info->bytes_deduped;
+
+               if (info->status || file->dedupe_total >= ctxt->orig_len)
+                       goto completed;
+
+               /* put us back on the queued list for another go around */
+               list_move_tail(&file->dedupe_list, &ctxt->queued);
+               continue;
+completed:
+               /* Only bother taking the final status (the rest will be 0) */
+               file->dedupe_status = info->status;
+               list_move_tail(&file->dedupe_list, &ctxt->completed);
+       }
+
+       /* Increment our ioctl file pointers */
+       ctxt->len -= max_deduped;
+       ctxt->ioctl_file_off += max_deduped;
 }
 
 int dedupe_extents(struct dedupe_ctxt *ctxt)
 {
-       return btrfs_extent_same(ctxt->ioctl_file->fd, ctxt->same);
+       int ret;
+
+       while (!list_empty(&ctxt->queued)) {
+               /* Convert the queued list into an actual request */
+               populate_dedupe_request(ctxt, ctxt->same);
+
+               ret = btrfs_extent_same(ctxt->ioctl_file->fd, ctxt->same);
+               if (ret)
+                       break;
+
+#ifdef DEBUG_DEDUPE
+               print_btrfs_same_info(ctxt);
+#endif
+
+               process_dedupes(ctxt, ctxt->same);
+       }
+
+       return ret;
 }
 
-void get_dedupe_result(struct dedupe_ctxt *ctxt, int idx, int *status,
-                      uint64_t *off, uint64_t *bytes_deduped,
-                      struct filerec **file)
+/*
+ * Returns 1 when we have no more items.
+ */
+int pop_one_dedupe_result(struct dedupe_ctxt *ctxt, int *status,
+                         uint64_t *off, uint64_t *bytes_deduped,
+                         struct filerec **file)
 {
-       struct btrfs_ioctl_same_extent_info *info = &ctxt->same->info[idx];
+       struct filerec *f;
 
-       *status = info->status;
-       *off = info->logical_offset;
-       *bytes_deduped = info->bytes_deduped;
-       *file = ctxt->filerec_array[idx];
+       if (list_empty(&ctxt->completed))
+               goto out;
+
+       f = list_entry(ctxt->completed.next, struct filerec, dedupe_list);
+       list_del_init(&f->dedupe_list);
+
+       *status = f->dedupe_status;
+       *off = f->dedupe_loff - f->dedupe_total;
+       *bytes_deduped = f->dedupe_total;
+       *file = f;
+
+out:
+       return !!list_empty(&ctxt->completed);
+}
+
+void get_target_dedupe_info(struct dedupe_ctxt *ctxt, uint64_t *orig_loff,
+                           uint64_t *orig_len,
+                           struct filerec **file)
+{
+       *orig_loff = ctxt->orig_file_off;
+       *orig_len = ctxt->orig_len;
+       *file = ctxt->ioctl_file;
 }
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/duperemove-v0.04/dedupe.h 
new/duperemove-v0.06/dedupe.h
--- old/duperemove-v0.04/dedupe.h       2014-03-12 07:19:56.000000000 +0100
+++ new/duperemove-v0.06/dedupe.h       2014-04-11 02:11:50.000000000 +0200
@@ -1,16 +1,36 @@
 #ifndef        __DEDUPE_H__
 #define        __DEDUPE_H__
 
+#include "list.h"
 #include "btrfs-ioctl.h"
 
 struct dedupe_ctxt {
-       unsigned int    max_extents;    /* used for sanity checking */
+
+       /*
+        * Starting len/file off saved for the callers convenience -
+        * the ones below can change during dedupe operations.
+        */
+       uint64_t        orig_len;
+       uint64_t        orig_file_off;
 
        uint64_t        len;
        struct filerec  *ioctl_file;
        uint64_t        ioctl_file_off;
 
-       struct filerec  **filerec_array;
+       /* Next two are used for sanity checking */
+       unsigned int            max_queable;
+       unsigned int            num_queued;
+
+       unsigned int            same_size;
+       /*
+        * filerecs that are being used to dedupe against the ioctl file.
+        *      queued: filerec is awaiting dedupe
+        *      in_progress: currently undergoing dedupe operations
+        *      completed: results of dedupe for this file are available
+        */
+       struct list_head        queued;
+       struct list_head        in_progress;
+       struct list_head        completed;
 
        struct btrfs_ioctl_same_args    *same;
 };
@@ -21,12 +41,10 @@
 void add_extent_to_dedupe(struct dedupe_ctxt *ctxt, uint64_t loff, uint64_t 
len,
                          struct filerec *file);
 int dedupe_extents(struct dedupe_ctxt *ctxt);
-void get_dedupe_result(struct dedupe_ctxt *ctxt, int idx, int *status,
-                      uint64_t *off, uint64_t *bytes_deduped,
-                      struct filerec **file);
-
-static inline int num_dedupe_requests(struct dedupe_ctxt *ctxt)
-{
-       return ctxt->same->dest_count;
-}
+int pop_one_dedupe_result(struct dedupe_ctxt *ctxt, int *status,
+                         uint64_t *off, uint64_t *bytes_deduped,
+                         struct filerec **file);
+void get_target_dedupe_info(struct dedupe_ctxt *ctxt, uint64_t *orig_loff,
+                           uint64_t *orig_len, struct filerec **file);
+
 #endif /* __BTRFS_IOCTL_H__ */
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/duperemove-v0.04/duperemove.c 
new/duperemove-v0.06/duperemove.c
--- old/duperemove-v0.04/duperemove.c   2014-03-12 07:19:56.000000000 +0100
+++ new/duperemove-v0.06/duperemove.c   2014-04-11 02:11:50.000000000 +0200
@@ -25,6 +25,7 @@
 #include <errno.h>
 #include <string.h>
 #include <linux/limits.h>
+#include <ctype.h>
 
 #include "rbtree.h"
 #include "list.h"
@@ -103,7 +104,7 @@
        }
 }
 
-static void print_results(struct results_tree *res)
+static void print_dupes_table(struct results_tree *res)
 {
        struct rb_root *root = &res->root;
        struct rb_node *node = rb_first(root);
@@ -111,7 +112,12 @@
        struct extent *extent;
        uint64_t calc_bytes = 0;
 
-       printf("Found %u instances of duplicated extents\n", res->num_dupes);
+       printf("Simple read and compare of file data found %u instances of "
+              "extents that might benefit from deduplication.\n",
+              res->num_dupes);
+
+       if (res->num_dupes == 0)
+               return;
 
        while (1) {
                uint64_t len, len_blocks;
@@ -125,9 +131,9 @@
                len_blocks = len / blocksize;
                calc_bytes += dext->de_score;
 
-               vprintf("(dext: 0x%p) %u extents had length %llu (%llu) for a"
-                       " score of %llu.\n", (void *)dext,
-                       dext->de_num_dupes, (unsigned long long)len_blocks,
+               vprintf("%u extents had a length %llu Blocks (%llu) for a"
+                       " score of %llu.\n", dext->de_num_dupes,
+                       (unsigned long long)len_blocks,
                        (unsigned long long)len,
                        (unsigned long long)dext->de_score);
                if (debug) {
@@ -136,29 +142,32 @@
                        printf("\n");
                }
 
-               if (verbose) {
-                       list_for_each_entry(extent, &dext->de_extents, e_list) {
-                               printf("%s\tstart block: %llu (%llu)\n",
-                                      extent->e_file->filename,
-                                      (unsigned long long)extent->e_loff / blocksize,
-                                      (unsigned long long)extent->e_loff);
-                       }
+               printf("Start\t\tLength\t\tFilename\n");
+               list_for_each_entry(extent, &dext->de_extents, e_list) {
+                       printf("%llu\t%llu\t\"%s\"\n",
+                              (unsigned long long)extent->e_loff,
+                              (unsigned long long)len,
+                              extent->e_file->filename);
                }
 
                node = rb_next(node);
        }
-
-       printf("Calculated %llu bytes of duplicated data.\n",
-              (unsigned long long)calc_bytes);
 }
 
 static int run_dedupe_and_close_files(struct dedupe_ctxt **ret_ctxt,
                                      uint64_t *bytes_deduped)
 {
-       int ret, i;
+       int ret, done = 0;
        struct dedupe_ctxt *ctxt = *ret_ctxt;
+       struct filerec *ioctl_file;
+       uint64_t orig_file_off, orig_len;
+
+       /* For our target status loop */
+       int target_status;
+       uint64_t target_loff, target_bytes;
+       struct filerec *f;
 
-       printf("Running dedupe.\n");
+       printf("Requesting dedupe pass from kernel.\n");
 
        ret = dedupe_extents(ctxt);
        if (ret) {
@@ -169,23 +178,20 @@
                goto cleanup;
        }
 
-       printf("Dedupe from: \"%s\"\toffset: %llu\tlen: %llu\n",
-              ctxt->ioctl_file->filename,
-              (unsigned long long)ctxt->ioctl_file_off,
-              (unsigned long long)ctxt->len);
-
-       for (i = 0; i < num_dedupe_requests(ctxt); i++) {
-               uint64_t target_loff, target_bytes;
-               int status;
-               struct filerec *f;
-
-               get_dedupe_result(ctxt, i, &status, &target_loff,
-                                 &target_bytes, &f);
-
-               printf("\"%s\":\toffset: %llu\tdeduped bytes: %llu"
-                      "\tstatus: %d\n", f->filename,
-                      (unsigned long long)target_loff,
-                      (unsigned long long)target_bytes, status);
+       get_target_dedupe_info(ctxt, &orig_file_off, &orig_len, &ioctl_file);
+
+       vprintf("Ask for dedupe from: \"%s\"\toffset: %llu\tlen: %llu\n",
+               ioctl_file->filename,
+               (unsigned long long)orig_file_off,
+               (unsigned long long)orig_len);
+
+       while (!done) {
+               done = pop_one_dedupe_result(ctxt, &target_status, &target_loff,
+                                            &target_bytes, &f);
+               vprintf("\"%s\":\toffset: %llu\tmaybe deduped bytes: %llu"
+                       "\tstatus: %d\n", f->filename,
+                       (unsigned long long)target_loff,
+                       (unsigned long long)target_bytes, target_status);
 
                filerec_close(f);
                *bytes_deduped += target_bytes;
@@ -208,9 +214,12 @@
        struct dedupe_ctxt *ctxt = NULL;
        uint64_t actual_bytes = 0;
 
-       print_results(res);
+       print_dupes_table(res);
 
-       printf("Deduping data...\n");
+       if (RB_EMPTY_ROOT(root)) {
+               printf("Nothing to dedupe.\n");
+               return;
+       }
 
        while (1) {
                uint64_t len, len_blocks;
@@ -294,7 +303,9 @@
                node = rb_next(node);
        }
 
-       printf("Deduped %llu bytes of data\n", (unsigned long long)actual_bytes);
+       printf("Kernel reports %llu bytes of data processed. Actual disk "
+              "savings will differ depending on how much of the data was "
+              "previously deduplicated.\n", (unsigned long long)actual_bytes);
 }
 
 static int csum_whole_file(struct hash_tree *tree, struct filerec *file)
@@ -303,7 +314,7 @@
        ssize_t bytes;
        uint64_t off;
 
-       vprintf("csum: %s\n", file->filename);
+       printf("csum: %s\n", file->filename);
 
        ret = filerec_open(file, 0);
        if (ret)
@@ -374,7 +385,7 @@
 {
        printf("duperemove %s\n", VERSTRING);
        printf("Find duplicate extents and print them to stdout\n\n");
-       printf("Usage: %s [-r] [-D] [-A] [-b blocksize-in-K] [-v] [-d]"
+       printf("Usage: %s [-r] [-D] [-A] [-b blocksize] [-v] [-d]"
               " OBJECTS\n", prog);
        printf("Where \"OBJECTS\" is a list of files (or directories) which\n");
        printf("we want to find duplicate extents in. If a directory is \n");
@@ -383,7 +394,8 @@
        printf("\t-r\t\tEnable recursive dir traversal.\n");
        printf("\t-D\t\tDe-dupe the results - only works on btrfs.\n");
        printf("\t-A\t\tOpens files readonly when deduping. Primarily for use by privileged users on readonly snapshots\n");
-       printf("\t-b bsize\tUse bsize blocks - specify in kilobytes. Default is %d.\n", DEFAULT_BLOCKSIZE / 1024);
+       printf("\t-b bsize\tUse bsize blocks. Default is %dk.\n",
+              DEFAULT_BLOCKSIZE / 1024);
        printf("\t-v\t\tBe verbose.\n");
        printf("\t-d\t\tPrint debug messages, forces -v if selected.\n");
        printf("\t-h\t\tPrints this help text.\n");
@@ -478,13 +490,64 @@
                exit(ENOMEM);
        }
 
-       dprintf("added file: %s\n", path);
-
 out:
        pathp = pathtmp;
 }
 
 /*
+ * parse_size() taken from btrfs-progs/util.c
+ */
+uint64_t parse_size(char *s)
+{
+       int i;
+       char c;
+       uint64_t mult = 1;
+
+       for (i = 0; s && s[i] && isdigit(s[i]); i++) ;
+       if (!i) {
+               fprintf(stderr, "ERROR: size value is empty\n");
+               exit(50);
+       }
+
+       if (s[i]) {
+               c = tolower(s[i]);
+               switch (c) {
+               case 'e':
+                       mult *= 1024;
+                       /* fallthrough */
+               case 'p':
+                       mult *= 1024;
+                       /* fallthrough */
+               case 't':
+                       mult *= 1024;
+                       /* fallthrough */
+               case 'g':
+                       mult *= 1024;
+                       /* fallthrough */
+               case 'm':
+                       mult *= 1024;
+                       /* fallthrough */
+               case 'k':
+                       mult *= 1024;
+                       /* fallthrough */
+               case 'b':
+                       break;
+               default:
+                       fprintf(stderr, "ERROR: Unknown size descriptor "
+                               "'%c'\n", c);
+                       exit(1);
+               }
+       }
+       if (s[i] && s[i+1]) {
+               fprintf(stderr, "ERROR: Illegal suffix contains "
+                       "character '%c' in wrong position\n",
+                       s[i+1]);
+               exit(51);
+       }
+       return strtoull(s, NULL, 10) * mult;
+}
+
+/*
  * Ok this is doing more than just parsing options.
  */
 static int parse_options(int argc, char **argv)
@@ -500,8 +563,7 @@
                        target_rw = 0;
                        break;
                case 'b':
-                       blocksize = atoi(optarg);
-                       blocksize *= 1024;
+                       blocksize = parse_size(optarg);
                        if (blocksize < MIN_BLOCKSIZE ||
                            blocksize > MAX_BLOCKSIZE)
                                return EINVAL;
@@ -682,7 +744,7 @@
                return EINVAL;
        }
 
-       vprintf("Using %uK blocks\n", blocksize/1024);
+       printf("Using %uK blocks\n", blocksize/1024);
 
        buf = malloc(blocksize);
        if (!buf)
@@ -704,7 +766,7 @@
        }
 
        if (debug) {
-               print_results(&res);
+               print_dupes_table(&res);
                printf("\n\nRemoving overlapping extents\n\n");
        }
 
@@ -715,7 +777,7 @@
        if (run_dedupe)
                dedupe_results(&res);
        else
-               print_results(&res);
+               print_dupes_table(&res);
 
 out:
        return ret;
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.04/filerec.c new/duperemove-v0.06/filerec.c
--- old/duperemove-v0.04/filerec.c      2014-03-12 07:19:56.000000000 +0100
+++ new/duperemove-v0.06/filerec.c      2014-04-11 02:11:50.000000000 +0200
@@ -27,6 +27,7 @@
                file->fd = -1;
                INIT_LIST_HEAD(&file->block_list);
                INIT_LIST_HEAD(&file->extent_list);
+               INIT_LIST_HEAD(&file->dedupe_list);
 
                list_add_tail(&file->rec_list, &filerec_list);
        }
@@ -43,6 +44,7 @@
                list_del(&file->block_list);
                list_del(&file->extent_list);
                list_del(&file->rec_list);
+               list_del(&file->dedupe_list);
 
                free(file);
        }
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.04/filerec.h new/duperemove-v0.06/filerec.h
--- old/duperemove-v0.04/filerec.h      2014-03-12 07:19:56.000000000 +0100
+++ new/duperemove-v0.06/filerec.h      2014-04-11 02:11:50.000000000 +0200
@@ -1,6 +1,7 @@
 #ifndef __FILEREC__
 #define __FILEREC__
 
+#include <stdint.h>
 #include "list.h"
 
 extern struct list_head filerec_list;
@@ -14,6 +15,16 @@
        struct list_head        extent_list;    /* head for results node list */
 
        struct list_head        rec_list;       /* all filerecs */
+
+       /*
+        * Used by dedupe code to track state of this file during a
+        * dedupe request.
+        */
+       uint64_t                dedupe_loff;
+       uint64_t                dedupe_total;
+       int                     dedupe_status;
+       int                     dedupe_idx;
+       struct list_head        dedupe_list;    /* see comment in dededupe.h */
 };
 
 static inline void init_filerec(void)

-- 
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to