Hello community, here is the log from the commit of package duperemove for openSUSE:Factory checked in at 2014-04-11 13:28:10 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/duperemove (Old) and /work/SRC/openSUSE:Factory/.duperemove.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "duperemove" Changes: -------- --- /work/SRC/openSUSE:Factory/duperemove/duperemove.changes 2014-03-15 17:36:22.000000000 +0100 +++ /work/SRC/openSUSE:Factory/.duperemove.new/duperemove.changes 2014-04-11 13:28:12.000000000 +0200 @@ -1,0 +2,10 @@ +Fri Apr 11 00:30:41 UTC 2014 - [email protected] + +- update to duperemove v0.06. This adds several fixes and features: + - fixes bnc#871804 (duperemove not looping on entire range) + +- also includes important usability fixes + +- updates hashing library to libgcrypt to reflect upstream + +------------------------------------------------------------------- Old: ---- duperemove-v0.04.tar.gz New: ---- duperemove-v0.06.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ duperemove.spec ++++++ --- /var/tmp/diff_new_pack.7Bs4gt/_old 2014-04-11 13:28:12.000000000 +0200 +++ /var/tmp/diff_new_pack.7Bs4gt/_new 2014-04-11 13:28:12.000000000 +0200 @@ -15,19 +15,19 @@ # Please submit bugfixes or comments via http://bugs.opensuse.org/ # + %define modname duperemove Name: duperemove BuildRequires: gcc-c++ -BuildRequires: mhash-devel -Version: 0.04 +BuildRequires: libgcrypt-devel +Version: 0.06 Release: 0 Summary: Software to find duplicate extents in files and remove them License: GPL-2.0 Group: System/Filesystems Url: https://github.com/markfasheh/duperemove Source: %{modname}-v%{version}.tar.gz -Requires: mhash BuildRoot: %{_tmppath}/%{name}-%{version}-build %description @@ -38,8 +38,8 @@ %define samename btrfs-extent-same %package -n btrfs-extent-same Summary: Debug/Test tool to exercise the btrfs out-of-band deduplication ioctl -License: GPL-2.0 Group: System/Filesystems + %description -n btrfs-extent-same Debug/Test tool to exercise a btrfs ioctl for deduplicating file regions. 
++++++ duperemove-v0.04.tar.gz -> duperemove-v0.06.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.04/Makefile new/duperemove-v0.06/Makefile --- old/duperemove-v0.04/Makefile 2014-03-12 07:19:56.000000000 +0100 +++ new/duperemove-v0.06/Makefile 2014-04-11 02:11:50.000000000 +0200 @@ -1,16 +1,27 @@ CC=gcc -RELEASE=v0.04 +RELEASE=v0.06 CFLAGS=-Wall -ggdb -D_FILE_OFFSET_BITS=64 -DVERSTRING=\"$(RELEASE)\" -LIBRARY_FLAGS=-lmhash MANPAGES=duperemove.8 btrfs-extent-same.8 -DIST_SOURCES=csum.c csum.h duperemove.c hash-tree.c hash-tree.h results-tree.c results-tree.h kernel.h LICENSE list.h Makefile rbtree.c rbtree.h rbtree.txt README TODO dedupe.c dedupe.h btrfs-ioctl.h filerec.c filerec.h $(MANPAGES) btrfs-extent-same.c +DIST_SOURCES=csum-gcrypt.c csum-mhash.c csum.h duperemove.c hash-tree.c hash-tree.h results-tree.c results-tree.h kernel.h LICENSE list.h Makefile rbtree.c rbtree.h rbtree.txt README TODO dedupe.c dedupe.h btrfs-ioctl.h filerec.c filerec.h $(MANPAGES) btrfs-extent-same.c DIST=duperemove-$(RELEASE) DIST_TARBALL=$(DIST).tar.gz TEMP_INSTALL_DIR:=$(shell mktemp -du -p .) 
-objects = duperemove.o rbtree.o csum.o hash-tree.o results-tree.o dedupe.o filerec.o +hash_obj=csum-gcrypt.o +crypt_CFLAGS=$(shell libgcrypt-config --cflags) +crypt_LIBS=$(shell libgcrypt-config --libs) +ifdef USE_MHASH + hash_obj=csum-mhash.o + crypt_CFLAGS= + crypt_LIBS=-lmhash +endif + +CFLAGS += $(crypt_CFLAGS) +LIBRARY_FLAGS += $(crypt_LIBS) + +objects = duperemove.o rbtree.o hash-tree.o results-tree.o dedupe.o filerec.o $(hash_obj) progs = duperemove all: $(progs) kernel.h list.h btrfs-ioctl.h @@ -27,5 +38,8 @@ btrfs-extent-same: btrfs-extent-same.c $(CC) -Wall -o btrfs-extent-same btrfs-extent-same.c +csum-test: $(hash_obj) csum-test.c + $(CC) -Wall $(hash_obj) $(CFLAGS) $(LIBRARY_FLAGS) -o csum-test csum-test.c + clean: - rm -fr $(objects) $(progs) $(DIST_TARBALL) btrfs-extent-same *~ + rm -fr $(objects) $(progs) $(DIST_TARBALL) btrfs-extent-same csum-*.o *~ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.04/README new/duperemove-v0.06/README --- old/duperemove-v0.04/README 2014-03-12 07:19:56.000000000 +0100 +++ new/duperemove-v0.06/README 2014-04-11 02:11:50.000000000 +0200 @@ -1,16 +1,75 @@ -duperemove v0.04 -Find duplicate extents and print them to stdout +Duperemove -Usage: ./duperemove [-r] [-D] [-A] [-b blocksize-in-K] [-v] [-d] OBJECTS -Where "OBJECTS" is a list of files (or directories) which -we want to find duplicate extents in. If a directory is -specified, all regular files inside of it will be scanned. - - <switches> - -r Enable recursive dir traversal. - -D De-dupe the results - only works on btrfs. - -A Opens files readonly when deduping. Primarily for use by privileged users on readonly snapshots - -b bsize Use bsize blocks - specify in kilobytes. Default is 128. - -v Be verbose. - -d Print debug messages, forces -v if selected. - -h Prints this help text. +Duperemove is a simple tool for finding duplicated extents and +submitting them for deduplication. 
When given a list of files it will +hash their contents on a block by block basis and compare those hashes +to each other, finding and categorizing extents that match each +other. When given the optional -D option, duperemove will submit those +extents for deduplication using the btrfs-extent-same ioctl. + +Duperemove has two major modes of operation, one of which is a subset +of the other. + + +Readonly / Non-deduplicating Mode + +When run without -D (the default) duperemove will print out one or +more tables of matching extents it has determined would be ideal +candidates for deduplication. As a result, readonly mode is useful for +seeing what duperemove might do when run with '-D'. The output could +also be used by some other software to submit the extents for +deduplication at a later time. + +It is important to note that this mode will not print out *all* +instances of matching extents, just those it would consider for +deduplication. + +Another important note is that duperemove does not concern itself with +the underlying representation of the extents. Some of them could be +compressed, undergoing I/O, or even have already been deduplicated. In +dedupe mode, the kernel handles those details and therefore we try not +to replicate that work. Think of duperemove as trying for 'bulk' +deduplication. + + +Deduping Mode + +This functions similarly to readonly mode with the exception that the +duplicated extents found in our "read hash and compare" step will +actually be submitted for deduplication. At the end, a total count of +bytes that were processed by the kernel will be printed. + +Keep in mind that the bytecount we report here (received from the +kernel) is NOT the total amount deduplicated but rather a count of the +amount of data it also found to be identical. + +See the duperemove man page for further details about running duperemove. + + +FAQ + +* Is there an upper limit to the amount of data duperemove can process? 
+ +Right now duperemove has been tested on small numbers of VMS or iso +files (5-10). I don't believe there should be a major problem scaling +that up to 50 or so. + + +* Why does it not print out all duplicate extents? + +Internally duperemove is classifying extents based on various criteria +like length, number of identical extents, etc. The printout we give is +based on the results of that classification. + + +* How can I find out my space savings after a dedupe? + +The easiest way to do this would be a df before the dedupe operation, +then a df about 60 seconds after the operation. It is common for btrfs +space reporting to be 'behind' while delayed updates get processed, so +an immediate df after deduping might not show any savings. + + +USAGE EXAMPLES + +TODO diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.04/TODO new/duperemove-v0.06/TODO --- old/duperemove-v0.04/TODO 2014-03-12 07:19:56.000000000 +0100 +++ new/duperemove-v0.06/TODO 2014-04-11 02:11:50.000000000 +0200 @@ -1,15 +1,22 @@ -HIGH LEVEL +- Allow checking for similar files by some criteria (approximate size, + file magic type, file extension, etc) -- Test on actual VM images +- Limit the window for duplicate extent calculation to limit overall time + spent looking for duplicates -- Add code to take a directory as a command line option - -LOW LEVEL - -- Replace ugly pointer comparison in walk_dupe_block() +- Store results of our search to speed up subsequent runs. + - In particular, store file system transaction ids so we can use the + equivalent of btrfs's find-new - Wrap bytes <-> blocks conversions - Possibly use mmap (with madvise) for the checksumming phase - Do checksumming in seperate threads + +- Add an optional mode to do duplicate checking with resolution of extent + owners (expensive). 
+ +- Allow duperemove to take as input it's previous output so a user + could run in readonly mode, record dupes and then pass them later to + duperemove for potential deduplication. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.04/csum-gcrypt.c new/duperemove-v0.06/csum-gcrypt.c --- old/duperemove-v0.04/csum-gcrypt.c 1970-01-01 01:00:00.000000000 +0100 +++ new/duperemove-v0.06/csum-gcrypt.c 2014-04-11 02:11:50.000000000 +0200 @@ -0,0 +1,106 @@ +/* + * csum.c + * + * Copyright (C) 2014 SUSE. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <gcrypt.h> + +#include "csum.h" + +#define HASH_FUNC GCRY_MD_SHA256 + +unsigned int digest_len = 0; + +void checksum_block(char *buf, int len, unsigned char *digest) +{ + gcry_md_hash_buffer(HASH_FUNC, digest, buf, len); +} + +int init_hash(void) +{ + /* + * Version check should be the very first call because it makes sure + * that important subsystems are intialized. + */ + if (!gcry_check_version(GCRYPT_VERSION)) + return 1; + + /* Disable secure memory. */ + gcry_control(GCRYCTL_DISABLE_SECMEM, 0); + + /* Tell Libgcrypt that initialization has completed. 
*/ + gcry_control(GCRYCTL_INITIALIZATION_FINISHED, 0); + + if (gcry_md_test_algo(HASH_FUNC)) + return 1; + + digest_len = gcry_md_get_algo_dlen(HASH_FUNC); + if (!digest_len) + return 1; + + if (digest_len == 0 || digest_len > DIGEST_LEN_MAX) + abort(); + + return 0; +} + +void debug_print_digest(FILE *stream, unsigned char *digest) +{ + int i; + + for (i = 0; i < digest_len; i++) + fprintf(stream, "%.2x", digest[i]); +} + +struct running_checksum { + gcry_md_hd_t hd; + unsigned char digest[DIGEST_LEN_MAX]; +}; + +struct running_checksum *start_running_checksum(void) +{ + struct running_checksum *c = calloc(1, sizeof(struct running_checksum)); + + if (c) { + if (gcry_md_open(&c->hd, HASH_FUNC, 0) != GPG_ERR_NO_ERROR) { + free(c); + c = NULL; + } + } + + return c; +} + +void add_to_running_checksum(struct running_checksum *c, + unsigned int len, unsigned char *buf) +{ + gcry_md_write(c->hd, buf, len); +} + +void finish_running_checksum(struct running_checksum *c, unsigned char *digest) +{ + unsigned char *gcry_digest; + + /* gcry_md_read() does this implicitly */ + gcry_md_final(c->hd); + gcry_digest = gcry_md_read(c->hd, 0); + memcpy(digest, gcry_digest, digest_len); + + gcry_md_close(c->hd); + + free(c); +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.04/csum-mhash.c new/duperemove-v0.06/csum-mhash.c --- old/duperemove-v0.04/csum-mhash.c 1970-01-01 01:00:00.000000000 +0100 +++ new/duperemove-v0.06/csum-mhash.c 2014-04-11 02:11:50.000000000 +0200 @@ -0,0 +1,85 @@ +/* + * csum.c + * + * Copyright (C) 2013 SUSE. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <mhash.h> + +#include "csum.h" + +static MHASH td; + +#define HASH_FUNC MHASH_SHA256 + +unsigned int digest_len = 0; + +void checksum_block(char *buf, int len, unsigned char *digest) +{ + td = mhash_init(HASH_FUNC); + if (td == MHASH_FAILED) + abort(); + + mhash(td, buf, len); + mhash_deinit(td, digest); +} + +int init_hash(void) +{ + digest_len = mhash_get_block_size(HASH_FUNC); + if (!digest_len) + return 1; + + if (digest_len == 0 || digest_len > DIGEST_LEN_MAX) + abort(); + + return 0; +} + +void debug_print_digest(FILE *stream, unsigned char *digest) +{ + int i; + + for (i = 0; i < digest_len; i++) + fprintf(stream, "%.2x", digest[i]); +} + +struct running_checksum { + MHASH td; + unsigned char digest[DIGEST_LEN_MAX]; +}; + +struct running_checksum *start_running_checksum(void) +{ + struct running_checksum *c = calloc(1, sizeof(struct running_checksum)); + + if (c) + c->td = mhash_init(HASH_FUNC); + + return c; +} + +void add_to_running_checksum(struct running_checksum *c, + unsigned int len, unsigned char *buf) +{ + mhash(c->td, buf, len); +} + +void finish_running_checksum(struct running_checksum *c, unsigned char *digest) +{ + mhash_deinit(c->td, digest); + free(c); +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.04/csum.c new/duperemove-v0.06/csum.c --- old/duperemove-v0.04/csum.c 2014-03-12 07:19:56.000000000 +0100 +++ new/duperemove-v0.06/csum.c 1970-01-01 01:00:00.000000000 +0100 @@ -1,84 +0,0 @@ -/* - * csum.c - * - * Copyright (C) 2013 SUSE. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - - -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <mhash.h> - -#include "csum.h" - -static MHASH td; - -#define HASH_FUNC MHASH_SHA256 - -unsigned int digest_len = 0; - -void checksum_block(char *buf, int len, unsigned char *digest) -{ - td = mhash_init(HASH_FUNC); - if (td == MHASH_FAILED) - abort(); - - mhash(td, buf, len); - mhash_deinit(td, digest); -} - -int init_hash(void) -{ - digest_len = mhash_get_block_size(HASH_FUNC); - if (!digest_len) - return 1; - - if (digest_len == 0 || digest_len > DIGEST_LEN_MAX) - abort(); - - return 0; -} - -void debug_print_digest(FILE *stream, unsigned char *digest) -{ - int i; - - for (i = 0; i < digest_len; i++) - fprintf(stream, "%.2x", digest[i]); -} - -struct running_checksum { - MHASH td; - unsigned char digest[DIGEST_LEN_MAX]; -}; - -struct running_checksum *start_running_checksum(void) -{ - struct running_checksum *c = calloc(1, sizeof(struct running_checksum)); - - if (c) - c->td = mhash_init(HASH_FUNC); - - return c; -} - -void add_to_running_checksum(struct running_checksum *c, - unsigned int len, unsigned char *buf) -{ - mhash(c->td, buf, len); -} - -void finish_running_checksum(struct running_checksum *c, unsigned char *digest) -{ - mhash_deinit(c->td, digest); -} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.04/dedupe.c new/duperemove-v0.06/dedupe.c --- old/duperemove-v0.04/dedupe.c 2014-03-12 07:19:56.000000000 +0100 +++ new/duperemove-v0.06/dedupe.c 2014-04-11 02:11:50.000000000 
+0200 @@ -13,16 +13,89 @@ * General Public License for more details. */ +#include <stdio.h> #include <stdlib.h> +#include <string.h> +#include <stdint.h> +#include "kernel.h" +#include "list.h" #include "filerec.h" #include "dedupe.h" +#ifdef DEBUG_DEDUPE +static struct filerec * +same_idx_to_filerec(struct dedupe_ctxt *ctxt, int idx) +{ + int i; + struct filerec *file; + struct list_head *lists[3] = { &ctxt->queued, + &ctxt->in_progress, + &ctxt->completed, }; + + for (i = 0; i < 3; i++) { + list_for_each_entry(file, lists[i], dedupe_list) { + if (file->dedupe_idx == idx) + return file; + } + } + + return NULL; +} + +#define _PRE "(dedupe) " +static void print_btrfs_same_info(struct dedupe_ctxt *ctxt) +{ + int i; + struct filerec *file = ctxt->ioctl_file; + struct btrfs_ioctl_same_args *same = ctxt->same; + struct btrfs_ioctl_same_extent_info *info; + + printf(_PRE"btrfs same info: ioctl_file: \"%s\"\n", + file ? file->filename : "(null)"); + printf(_PRE"logical_offset: %llu, length: %llu, dest_count: %u\n", + (unsigned long long)same->logical_offset, + (unsigned long long)same->length, same->dest_count); + + for (i = 0; i < same->dest_count; i++) { + info = &same->info[i]; + file = same_idx_to_filerec(ctxt, i); + printf(_PRE"info[%d]: name: \"%s\", fd: %llu, logical_offset: " + "%llu, bytes_deduped: %llu, status: %d\n", + i, file ? 
file->filename : "(null)", (long long)info->fd, + (unsigned long long)info->logical_offset, + (unsigned long long)info->bytes_deduped, info->status); + } +} +#endif + +static void clear_file_dedupe_info(struct filerec *file) +{ + file->dedupe_total = 0; + file->dedupe_status = 0; + file->dedupe_loff = 0; +} + +static void clear_lists(struct dedupe_ctxt *ctxt) +{ + int i; + struct list_head *lists[3] = { &ctxt->queued, + &ctxt->in_progress, + &ctxt->completed, }; + struct filerec *file, *tmp; + + for (i = 0; i < 3; i++) { + list_for_each_entry_safe(file, tmp, lists[i], dedupe_list) { + clear_file_dedupe_info(file); + list_del_init(&file->dedupe_list); + } + } +} + void free_dedupe_ctxt(struct dedupe_ctxt *ctxt) { if (ctxt) { - if (ctxt->filerec_array) - free(ctxt->filerec_array); + clear_lists(ctxt); if (ctxt->same) free(ctxt->same); free(ctxt); @@ -35,32 +108,30 @@ struct dedupe_ctxt *ctxt = calloc(1, sizeof(*ctxt)); struct btrfs_ioctl_same_args *same; unsigned int same_size; + unsigned int max_dest_files = max_extents - 1; if (ctxt == NULL) return NULL; - ctxt->filerec_array = calloc(max_extents - 1, - sizeof(*ctxt->filerec_array)); - if (ctxt->filerec_array == NULL) { - free(ctxt); - return NULL; - } - same_size = sizeof(*same) + - (max_extents - 1) * sizeof(struct btrfs_ioctl_same_extent_info); + max_dest_files * sizeof(struct btrfs_ioctl_same_extent_info); same = calloc(1, same_size); if (same == NULL) { - free(ctxt->filerec_array); + free(same); free(ctxt); return NULL; } ctxt->same = same; + ctxt->same_size = same_size; - ctxt->max_extents = max_extents; - ctxt->len = ctxt->same->length = elen; + ctxt->max_queable = max_dest_files; + ctxt->len = ctxt->orig_len = elen; ctxt->ioctl_file = ioctl_file; - ctxt->ioctl_file_off = same->logical_offset = loff; + ctxt->ioctl_file_off = ctxt->orig_file_off = loff; + INIT_LIST_HEAD(&ctxt->queued); + INIT_LIST_HEAD(&ctxt->in_progress); + INIT_LIST_HEAD(&ctxt->completed); return ctxt; } @@ -68,31 +139,141 @@ void 
add_extent_to_dedupe(struct dedupe_ctxt *ctxt, uint64_t loff, uint64_t len, struct filerec *file) { - int i = ctxt->same->dest_count; - struct btrfs_ioctl_same_args *same = ctxt->same; + clear_file_dedupe_info(file); + file->dedupe_loff = loff; + list_add_tail(&file->dedupe_list, &ctxt->queued); + + if (++ctxt->num_queued > ctxt->max_queable) + abort(); +} + +static int add_dedupe_request(struct dedupe_ctxt *ctxt, + struct btrfs_ioctl_same_args *same, + struct filerec *file) +{ + int same_idx = same->dest_count; + struct btrfs_ioctl_same_extent_info *info; - if (ctxt->same->dest_count >= ctxt->max_extents) + if (same->dest_count > ctxt->max_queable) abort(); - same->info[i].logical_offset = loff; - same->info[i].fd = file->fd; - ctxt->filerec_array[i] = file; + info = &same->info[same_idx]; + info->fd = file->fd; + info->logical_offset = file->dedupe_loff; + info->bytes_deduped = 0; same->dest_count++; + +#ifdef DEBUG_DEDUPE + printf("add request %s, off: %llu, dest: %d\n", file->filename, + (unsigned long long)file->dedupe_loff, same->dest_count); +#endif + return same_idx; +} + +static void populate_dedupe_request(struct dedupe_ctxt *ctxt, + struct btrfs_ioctl_same_args *same) +{ + struct filerec *file, *tmp; + + memset(same, 0, ctxt->same_size); + + same->length = ctxt->len; + same->logical_offset = ctxt->ioctl_file_off; + + list_for_each_entry_safe(file, tmp, &ctxt->queued, dedupe_list) { + file->dedupe_idx = add_dedupe_request(ctxt, same, file); + + list_move_tail(&file->dedupe_list, &ctxt->in_progress); + } +} + +/* Returns 1 when there are no more dedupes to process. 
*/ +static void process_dedupes(struct dedupe_ctxt *ctxt, + struct btrfs_ioctl_same_args *same) +{ + int same_idx; + uint64_t max_deduped = 0; + struct btrfs_ioctl_same_extent_info *info; + struct filerec *file, *tmp; + + list_for_each_entry_safe(file, tmp, &ctxt->in_progress, dedupe_list) { + same_idx = file->dedupe_idx; + info = &same->info[same_idx]; + + if (info->bytes_deduped > max_deduped) + max_deduped = info->bytes_deduped; + + file->dedupe_loff += info->bytes_deduped; + file->dedupe_total += info->bytes_deduped; + + if (info->status || file->dedupe_total >= ctxt->orig_len) + goto completed; + + /* put us back on the queued list for another go around */ + list_move_tail(&file->dedupe_list, &ctxt->queued); + continue; +completed: + /* Only bother taking the final status (the rest will be 0) */ + file->dedupe_status = info->status; + list_move_tail(&file->dedupe_list, &ctxt->completed); + } + + /* Increment our ioctl file pointers */ + ctxt->len -= max_deduped; + ctxt->ioctl_file_off += max_deduped; } int dedupe_extents(struct dedupe_ctxt *ctxt) { - return btrfs_extent_same(ctxt->ioctl_file->fd, ctxt->same); + int ret; + + while (!list_empty(&ctxt->queued)) { + /* Convert the queued list into an actual request */ + populate_dedupe_request(ctxt, ctxt->same); + + ret = btrfs_extent_same(ctxt->ioctl_file->fd, ctxt->same); + if (ret) + break; + +#ifdef DEBUG_DEDUPE + print_btrfs_same_info(ctxt); +#endif + + process_dedupes(ctxt, ctxt->same); + } + + return ret; } -void get_dedupe_result(struct dedupe_ctxt *ctxt, int idx, int *status, - uint64_t *off, uint64_t *bytes_deduped, - struct filerec **file) +/* + * Returns 1 when we have no more items. 
+ */ +int pop_one_dedupe_result(struct dedupe_ctxt *ctxt, int *status, + uint64_t *off, uint64_t *bytes_deduped, + struct filerec **file) { - struct btrfs_ioctl_same_extent_info *info = &ctxt->same->info[idx]; + struct filerec *f; - *status = info->status; - *off = info->logical_offset; - *bytes_deduped = info->bytes_deduped; - *file = ctxt->filerec_array[idx]; + if (list_empty(&ctxt->completed)) + goto out; + + f = list_entry(ctxt->completed.next, struct filerec, dedupe_list); + list_del_init(&f->dedupe_list); + + *status = f->dedupe_status; + *off = f->dedupe_loff - f->dedupe_total; + *bytes_deduped = f->dedupe_total; + *file = f; + +out: + return !!list_empty(&ctxt->completed); +} + +void get_target_dedupe_info(struct dedupe_ctxt *ctxt, uint64_t *orig_loff, + uint64_t *orig_len, + struct filerec **file) +{ + *orig_loff = ctxt->orig_file_off; + *orig_len = ctxt->orig_len; + *file = ctxt->ioctl_file; } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.04/dedupe.h new/duperemove-v0.06/dedupe.h --- old/duperemove-v0.04/dedupe.h 2014-03-12 07:19:56.000000000 +0100 +++ new/duperemove-v0.06/dedupe.h 2014-04-11 02:11:50.000000000 +0200 @@ -1,16 +1,36 @@ #ifndef __DEDUPE_H__ #define __DEDUPE_H__ +#include "list.h" #include "btrfs-ioctl.h" struct dedupe_ctxt { - unsigned int max_extents; /* used for sanity checking */ + + /* + * Starting len/file off saved for the callers convenience - + * the ones below can change during dedupe operations. + */ + uint64_t orig_len; + uint64_t orig_file_off; uint64_t len; struct filerec *ioctl_file; uint64_t ioctl_file_off; - struct filerec **filerec_array; + /* Next two are used for sanity checking */ + unsigned int max_queable; + unsigned int num_queued; + + unsigned int same_size; + /* + * filerecs that are being used to dedupe against the ioctl file. 
+ * queued: filerec is awaiting dedupe + * in_progress: currently undergoing dedupe operations + * completed: results of dedupe for this file are available + */ + struct list_head queued; + struct list_head in_progress; + struct list_head completed; struct btrfs_ioctl_same_args *same; }; @@ -21,12 +41,10 @@ void add_extent_to_dedupe(struct dedupe_ctxt *ctxt, uint64_t loff, uint64_t len, struct filerec *file); int dedupe_extents(struct dedupe_ctxt *ctxt); -void get_dedupe_result(struct dedupe_ctxt *ctxt, int idx, int *status, - uint64_t *off, uint64_t *bytes_deduped, - struct filerec **file); - -static inline int num_dedupe_requests(struct dedupe_ctxt *ctxt) -{ - return ctxt->same->dest_count; -} +int pop_one_dedupe_result(struct dedupe_ctxt *ctxt, int *status, + uint64_t *off, uint64_t *bytes_deduped, + struct filerec **file); +void get_target_dedupe_info(struct dedupe_ctxt *ctxt, uint64_t *orig_loff, + uint64_t *orig_len, struct filerec **file); + #endif /* __BTRFS_IOCTL_H__ */ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.04/duperemove.c new/duperemove-v0.06/duperemove.c --- old/duperemove-v0.04/duperemove.c 2014-03-12 07:19:56.000000000 +0100 +++ new/duperemove-v0.06/duperemove.c 2014-04-11 02:11:50.000000000 +0200 @@ -25,6 +25,7 @@ #include <errno.h> #include <string.h> #include <linux/limits.h> +#include <ctype.h> #include "rbtree.h" #include "list.h" @@ -103,7 +104,7 @@ } } -static void print_results(struct results_tree *res) +static void print_dupes_table(struct results_tree *res) { struct rb_root *root = &res->root; struct rb_node *node = rb_first(root); @@ -111,7 +112,12 @@ struct extent *extent; uint64_t calc_bytes = 0; - printf("Found %u instances of duplicated extents\n", res->num_dupes); + printf("Simple read and compare of file data found %u instances of " + "extents that might benefit from deduplication.\n", + res->num_dupes); + + if (res->num_dupes == 0) + return; while (1) { uint64_t len, 
len_blocks; @@ -125,9 +131,9 @@ len_blocks = len / blocksize; calc_bytes += dext->de_score; - vprintf("(dext: 0x%p) %u extents had length %llu (%llu) for a" - " score of %llu.\n", (void *)dext, - dext->de_num_dupes, (unsigned long long)len_blocks, + vprintf("%u extents had a length %llu Blocks (%llu) for a" + " score of %llu.\n", dext->de_num_dupes, + (unsigned long long)len_blocks, (unsigned long long)len, (unsigned long long)dext->de_score); if (debug) { @@ -136,29 +142,32 @@ printf("\n"); } - if (verbose) { - list_for_each_entry(extent, &dext->de_extents, e_list) { - printf("%s\tstart block: %llu (%llu)\n", - extent->e_file->filename, - (unsigned long long)extent->e_loff / blocksize, - (unsigned long long)extent->e_loff); - } + printf("Start\t\tLength\t\tFilename\n"); + list_for_each_entry(extent, &dext->de_extents, e_list) { + printf("%llu\t%llu\t\"%s\"\n", + (unsigned long long)extent->e_loff, + (unsigned long long)len, + extent->e_file->filename); } node = rb_next(node); } - - printf("Calculated %llu bytes of duplicated data.\n", - (unsigned long long)calc_bytes); } static int run_dedupe_and_close_files(struct dedupe_ctxt **ret_ctxt, uint64_t *bytes_deduped) { - int ret, i; + int ret, done = 0; struct dedupe_ctxt *ctxt = *ret_ctxt; + struct filerec *ioctl_file; + uint64_t orig_file_off, orig_len; + + /* For our target status loop */ + int target_status; + uint64_t target_loff, target_bytes; + struct filerec *f; - printf("Running dedupe.\n"); + printf("Requesting dedupe pass from kernel.\n"); ret = dedupe_extents(ctxt); if (ret) { @@ -169,23 +178,20 @@ goto cleanup; } - printf("Dedupe from: \"%s\"\toffset: %llu\tlen: %llu\n", - ctxt->ioctl_file->filename, - (unsigned long long)ctxt->ioctl_file_off, - (unsigned long long)ctxt->len); - - for (i = 0; i < num_dedupe_requests(ctxt); i++) { - uint64_t target_loff, target_bytes; - int status; - struct filerec *f; - - get_dedupe_result(ctxt, i, &status, &target_loff, - &target_bytes, &f); - - printf("\"%s\":\toffset: 
%llu\tdeduped bytes: %llu" - "\tstatus: %d\n", f->filename, - (unsigned long long)target_loff, - (unsigned long long)target_bytes, status); + get_target_dedupe_info(ctxt, &orig_file_off, &orig_len, &ioctl_file); + + vprintf("Ask for dedupe from: \"%s\"\toffset: %llu\tlen: %llu\n", + ioctl_file->filename, + (unsigned long long)orig_file_off, + (unsigned long long)orig_len); + + while (!done) { + done = pop_one_dedupe_result(ctxt, &target_status, &target_loff, + &target_bytes, &f); + vprintf("\"%s\":\toffset: %llu\tmaybe deduped bytes: %llu" + "\tstatus: %d\n", f->filename, + (unsigned long long)target_loff, + (unsigned long long)target_bytes, target_status); filerec_close(f); *bytes_deduped += target_bytes; @@ -208,9 +214,12 @@ struct dedupe_ctxt *ctxt = NULL; uint64_t actual_bytes = 0; - print_results(res); + print_dupes_table(res); - printf("Deduping data...\n"); + if (RB_EMPTY_ROOT(root)) { + printf("Nothing to dedupe.\n"); + return; + } while (1) { uint64_t len, len_blocks; @@ -294,7 +303,9 @@ node = rb_next(node); } - printf("Deduped %llu bytes of data\n", (unsigned long long)actual_bytes); + printf("Kernel reports %llu bytes of data processed. Actual disk " + "savings will differ depending on how much of the data was " + "previously deduplicated.\n", (unsigned long long)actual_bytes); } static int csum_whole_file(struct hash_tree *tree, struct filerec *file) @@ -303,7 +314,7 @@ ssize_t bytes; uint64_t off; - vprintf("csum: %s\n", file->filename); + printf("csum: %s\n", file->filename); ret = filerec_open(file, 0); if (ret) @@ -374,7 +385,7 @@ { printf("duperemove %s\n", VERSTRING); printf("Find duplicate extents and print them to stdout\n\n"); - printf("Usage: %s [-r] [-D] [-A] [-b blocksize-in-K] [-v] [-d]" + printf("Usage: %s [-r] [-D] [-A] [-b blocksize] [-v] [-d]" " OBJECTS\n", prog); printf("Where \"OBJECTS\" is a list of files (or directories) which\n"); printf("we want to find duplicate extents in. 
If a directory is \n"); @@ -383,7 +394,8 @@ printf("\t-r\t\tEnable recursive dir traversal.\n"); printf("\t-D\t\tDe-dupe the results - only works on btrfs.\n"); printf("\t-A\t\tOpens files readonly when deduping. Primarily for use by privileged users on readonly snapshots\n"); - printf("\t-b bsize\tUse bsize blocks - specify in kilobytes. Default is %d.\n", DEFAULT_BLOCKSIZE / 1024); + printf("\t-b bsize\tUse bsize blocks. Default is %dk.\n", + DEFAULT_BLOCKSIZE / 1024); printf("\t-v\t\tBe verbose.\n"); printf("\t-d\t\tPrint debug messages, forces -v if selected.\n"); printf("\t-h\t\tPrints this help text.\n"); @@ -478,13 +490,64 @@ exit(ENOMEM); } - dprintf("added file: %s\n", path); - out: pathp = pathtmp; } /* + * parse_size() taken from btrfs-progs/util.c + */ +uint64_t parse_size(char *s) +{ + int i; + char c; + uint64_t mult = 1; + + for (i = 0; s && s[i] && isdigit(s[i]); i++) ; + if (!i) { + fprintf(stderr, "ERROR: size value is empty\n"); + exit(50); + } + + if (s[i]) { + c = tolower(s[i]); + switch (c) { + case 'e': + mult *= 1024; + /* fallthrough */ + case 'p': + mult *= 1024; + /* fallthrough */ + case 't': + mult *= 1024; + /* fallthrough */ + case 'g': + mult *= 1024; + /* fallthrough */ + case 'm': + mult *= 1024; + /* fallthrough */ + case 'k': + mult *= 1024; + /* fallthrough */ + case 'b': + break; + default: + fprintf(stderr, "ERROR: Unknown size descriptor " + "'%c'\n", c); + exit(1); + } + } + if (s[i] && s[i+1]) { + fprintf(stderr, "ERROR: Illegal suffix contains " + "character '%c' in wrong position\n", + s[i+1]); + exit(51); + } + return strtoull(s, NULL, 10) * mult; +} + +/* * Ok this is doing more than just parsing options. 
*/ static int parse_options(int argc, char **argv) @@ -500,8 +563,7 @@ target_rw = 0; break; case 'b': - blocksize = atoi(optarg); - blocksize *= 1024; + blocksize = parse_size(optarg); if (blocksize < MIN_BLOCKSIZE || blocksize > MAX_BLOCKSIZE) return EINVAL; @@ -682,7 +744,7 @@ return EINVAL; } - vprintf("Using %uK blocks\n", blocksize/1024); + printf("Using %uK blocks\n", blocksize/1024); buf = malloc(blocksize); if (!buf) @@ -704,7 +766,7 @@ } if (debug) { - print_results(&res); + print_dupes_table(&res); printf("\n\nRemoving overlapping extents\n\n"); } @@ -715,7 +777,7 @@ if (run_dedupe) dedupe_results(&res); else - print_results(&res); + print_dupes_table(&res); out: return ret; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.04/filerec.c new/duperemove-v0.06/filerec.c --- old/duperemove-v0.04/filerec.c 2014-03-12 07:19:56.000000000 +0100 +++ new/duperemove-v0.06/filerec.c 2014-04-11 02:11:50.000000000 +0200 @@ -27,6 +27,7 @@ file->fd = -1; INIT_LIST_HEAD(&file->block_list); INIT_LIST_HEAD(&file->extent_list); + INIT_LIST_HEAD(&file->dedupe_list); list_add_tail(&file->rec_list, &filerec_list); } @@ -43,6 +44,7 @@ list_del(&file->block_list); list_del(&file->extent_list); list_del(&file->rec_list); + list_del(&file->dedupe_list); free(file); } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/duperemove-v0.04/filerec.h new/duperemove-v0.06/filerec.h --- old/duperemove-v0.04/filerec.h 2014-03-12 07:19:56.000000000 +0100 +++ new/duperemove-v0.06/filerec.h 2014-04-11 02:11:50.000000000 +0200 @@ -1,6 +1,7 @@ #ifndef __FILEREC__ #define __FILEREC__ +#include <stdint.h> #include "list.h" extern struct list_head filerec_list; @@ -14,6 +15,16 @@ struct list_head extent_list; /* head for results node list */ struct list_head rec_list; /* all filerecs */ + + /* + * Used by dedupe code to track state of this file during a + * dedupe request. 
+ */ + uint64_t dedupe_loff; + uint64_t dedupe_total; + int dedupe_status; + int dedupe_idx; + struct list_head dedupe_list; /* see comment in dedupe.h */ }; static inline void init_filerec(void) -- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
