Re: systemd-journal, nodatacow, was: Is anyone using btrfs send/receive for backups instead of rsync?

2013-12-30 Thread Duncan
Chris Murphy posted on Sun, 29 Dec 2013 17:38:23 -0700 as excerpted:

 And I'm predicting that since btrfs is the assumed successor to the
 ext*
 series as the Linux default filesystem, and systemd is targeting Linux
 default initsystem status as well, it's only logical that at some point
 systemd will detect what filesystem it's logging to, and will
 automatically set NOCOW on the journal file when that filesystem is
 btrfs.
 
 Is this something that should be brought up on the systemd-devel@ list?
 Or maybe file it as an RFE against systemd at freedesktop.org?

I don't know.

While I don't (yet?) run systemd personally, I'd have almost thought it'd 
be done by now (tho obviously it's not, at least in distro-current 
versions), but perhaps they've been waiting on word that btrfs or some API 
they plan to use for it is stabilizing before doing it.

-- 
Duncan - List replies preferred.   No HTML msgs.
Every nonfree program has a lord, a master --
and if you use the program, he is your master.  Richard Stallman

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 05/14] Btrfs: introduce dedup tree and relatives

2013-12-30 Thread Liu Bo
This is a preparation step for online/inband dedup tree.
It introduces dedup tree and its relatives, including hash driver and
some structures.

Signed-off-by: Liu Bo bo.li@oracle.com
---
 fs/btrfs/ctree.h | 73 
 fs/btrfs/disk-io.c   | 36 ++
 fs/btrfs/extent-tree.c   |  2 ++
 include/trace/events/btrfs.h |  3 +-
 4 files changed, 113 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 54ab861..0e5718a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -33,6 +33,7 @@
 #include asm/kmap_types.h
 #include linux/pagemap.h
 #include linux/btrfs.h
+#include crypto/hash.h
 #include extent_io.h
 #include extent_map.h
 #include async-thread.h
@@ -101,6 +102,9 @@ struct btrfs_ordered_sum;
 /* for storing items that use the BTRFS_UUID_KEY* types */
 #define BTRFS_UUID_TREE_OBJECTID 9ULL
 
+/* dedup tree(experimental) */
+#define BTRFS_DEDUP_TREE_OBJECTID 10ULL
+
 /* for storing balance parameters in the root tree */
 #define BTRFS_BALANCE_OBJECTID -4ULL
 
@@ -521,6 +525,7 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF   (1ULL  6)
 #define BTRFS_FEATURE_INCOMPAT_RAID56  (1ULL  7)
 #define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL  8)
+#define BTRFS_FEATURE_INCOMPAT_DEDUP   (1ULL  9)
 
 #define BTRFS_FEATURE_COMPAT_SUPP  0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP   0ULL
@@ -532,6 +537,7 @@ struct btrfs_super_block {
 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |  \
 BTRFS_FEATURE_INCOMPAT_RAID56 |\
 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+BTRFS_FEATURE_INCOMPAT_DEDUP | \
 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
 
 /*
@@ -903,6 +909,51 @@ struct btrfs_csum_item {
u8 csum;
 } __attribute__ ((__packed__));
 
+/* dedup */
+enum btrfs_dedup_type {
+   BTRFS_DEDUP_SHA256 = 0,
+   BTRFS_DEDUP_LAST = 1,
+};
+
+static int btrfs_dedup_lens[] = { 4, 0 };
+static int btrfs_dedup_sizes[] = { 32, 0 };/* 256bit, 32bytes */
+
+struct btrfs_dedup_item {
+   /* disk length of dedup range */
+   __le64 len;
+
+   u8 type;
+   u8 compression;
+   u8 encryption;
+
+   /* spare for later use */
+   __le16 other_encoding;
+
+   /* hash/fingerprints go here */
+} __attribute__ ((__packed__));
+
+struct btrfs_dedup_hash {
+   u64 bytenr;
+   u64 num_bytes;
+
+   /* hash algorithm */
+   int type;
+
+   int compression;
+
+   /* last field is a variable length array of dedup hash */
+   u64 hash[];
+};
+
+static inline int btrfs_dedup_hash_size(int type)
+{
+   WARN_ON((btrfs_dedup_lens[type] * sizeof(u64)) !=
+btrfs_dedup_sizes[type]);
+
+   return sizeof(struct btrfs_dedup_hash) + btrfs_dedup_sizes[type];
+}
+
+
 struct btrfs_dev_stats_item {
/*
 * grow this item struct at the end for future enhancements and keep
@@ -1304,6 +1355,7 @@ struct btrfs_fs_info {
struct btrfs_root *dev_root;
struct btrfs_root *fs_root;
struct btrfs_root *csum_root;
+   struct btrfs_root *dedup_root;
struct btrfs_root *quota_root;
struct btrfs_root *uuid_root;
 
@@ -1655,6 +1707,14 @@ struct btrfs_fs_info {
 
struct semaphore uuid_tree_rescan_sem;
unsigned int update_uuid_tree_gen:1;
+
+   /* reference to deduplication algorithm driver via cryptoapi */
+   struct crypto_shash *dedup_driver;
+
+   /* dedup blocksize */
+   u64 dedup_bs;
+
+   int dedup_type;
 };
 
 /*
@@ -1968,6 +2028,8 @@ struct btrfs_ioctl_defrag_range_args {
  */
 #define BTRFS_STRING_ITEM_KEY  253
 
+#define BTRFS_DEDUP_ITEM_KEY   254
+
 /*
  * Flags for mount options.
  *
@@ -2980,6 +3042,14 @@ static inline u32 
btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
return btrfs_item_size(eb, e) - offset;
 }
 
+/* btrfs_dedup_item */
+BTRFS_SETGET_FUNCS(dedup_len, struct btrfs_dedup_item, len, 64);
+BTRFS_SETGET_FUNCS(dedup_compression, struct btrfs_dedup_item, compression, 8);
+BTRFS_SETGET_FUNCS(dedup_encryption, struct btrfs_dedup_item, encryption, 8);
+BTRFS_SETGET_FUNCS(dedup_other_encoding, struct btrfs_dedup_item,
+  other_encoding, 16);
+BTRFS_SETGET_FUNCS(dedup_type, struct btrfs_dedup_item, type, 8);
+
 /* btrfs_dev_stats_item */
 static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
struct btrfs_dev_stats_item *ptr,
@@ -3443,6 +3513,8 @@ static inline int btrfs_need_cleaner_sleep(struct 
btrfs_root *root)
 
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
+   if (fs_info-dedup_driver)
+   crypto_free_shash(fs_info-dedup_driver);
kfree(fs_info-balance_ctl);
kfree(fs_info-delayed_root);
kfree(fs_info-extent_root);
@@ -3615,6 +3687,7 @@ int btrfs_csum_one_bio(struct btrfs_root 

[RFC PATCH v8 00/14] Online(inband) data deduplication

2013-12-30 Thread Liu Bo
Hello,

Here is the New Year patch bomb :-)

Data deduplication is a specialized data compression technique for eliminating
duplicate copies of repeating data.[1]

This patch set is also related to Content based storage in project ideas[2],
it introduces inband data deduplication for btrfs ('dedup'/'dedupe' for short).

PATCH 1 is a hang fix with deduplication on, but it's also useful without
dedup in practical use.

PATCH 2 and 3 are targeting delayed refs' scalability problems, which are
uncovered by the dedup feature.

PATCH 4 is a speed-up improvement, which is about dedup and quota.

PATCH 5-8 are the preparation work for dedup implementation.

PATCH 9 shows how we implement dedup feature.

PATCH 10 fixes a backref walking bug with dedup.

PATCH 11 fixes a free space bug of dedup extents on error handling.

PATCH 12 adds the ioctl to control dedup feature.

PATCH 13 fixes the metadata ENOSPC problem with dedup which has been there
WAY TOO LONG.

PATCH 14 fixes a race bug on dedup writes.

And there is also a btrfs-progs patch(PATCH 15) which offers all details about
how to control the dedup feature.

I've tested this with xfstests by adding an inline dedup 'enable & on' in
xfstests' mount and scratch_mount.

TODO:
* a bit-to-bit comparison callback.

All comments are welcome!


[1]: http://en.wikipedia.org/wiki/Data_deduplication
[2]: https://btrfs.wiki.kernel.org/index.php/Project_ideas#Content_based_storage

v8:
- fix the race crash of dedup ref again.
- fix the metadata ENOSPC problem with dedup.

v7:
- rebase onto the latest btrfs
- break a big patch into smaller ones to make reviewers happy.
- kill mount options of dedup and use ioctl method instead.
- fix two crashes due to the special dedup ref

For former patch sets:
v6: http://thread.gmane.org/gmane.comp.file-systems.btrfs/27512
v5: http://thread.gmane.org/gmane.comp.file-systems.btrfs/27257
v4: http://thread.gmane.org/gmane.comp.file-systems.btrfs/25751
v3: http://comments.gmane.org/gmane.comp.file-systems.btrfs/25433
v2: http://comments.gmane.org/gmane.comp.file-systems.btrfs/24959

Liu Bo (14):
  Btrfs: skip merge part for delayed data refs
  Btrfs: improve the delayed refs process in rm case
  Btrfs: introduce a head ref rbtree
  Btrfs: disable qgroups accounting when quata_enable is 0
  Btrfs: introduce dedup tree and relatives
  Btrfs: introduce dedup tree operations
  Btrfs: introduce dedup state
  Btrfs: make ordered extent aware of dedup
  Btrfs: online(inband) data dedup
  Btrfs: skip dedup reference during backref walking
  Btrfs: don't return space for dedup extent
  Btrfs: add ioctl of dedup control
  Btrfs: fix dedupe 'ENOSPC' problem
  Btrfs: fix a crash of dedup ref

 fs/btrfs/backref.c   |   9 +
 fs/btrfs/ctree.c |   2 +-
 fs/btrfs/ctree.h |  86 ++
 fs/btrfs/delayed-ref.c   | 161 +++
 fs/btrfs/delayed-ref.h   |   8 +
 fs/btrfs/disk-io.c   |  40 +++
 fs/btrfs/extent-tree.c   | 208 --
 fs/btrfs/extent_io.c |  22 +-
 fs/btrfs/extent_io.h |  16 ++
 fs/btrfs/file-item.c | 244 +
 fs/btrfs/inode.c | 635 ++-
 fs/btrfs/ioctl.c | 167 
 fs/btrfs/ordered-data.c  |  38 ++-
 fs/btrfs/ordered-data.h  |  13 +-
 fs/btrfs/qgroup.c|   3 +
 fs/btrfs/relocation.c|   3 +
 fs/btrfs/transaction.c   |   4 +-
 include/trace/events/btrfs.h |   3 +-
 include/uapi/linux/btrfs.h   |  11 +
 19 files changed, 1501 insertions(+), 172 deletions(-)

-- 
1.8.2.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 02/14] Btrfs: improve the delayed refs process in rm case

2013-12-30 Thread Liu Bo
While removing a file with dedup extents, we could have a great number of
delayed refs pending to process, and these refs refer to dropping
a ref of the extent, which is of BTRFS_DROP_DELAYED_REF type.

But in order to prevent an extent's ref count from going down to zero when
there still are pending delayed refs, we first select the refs that add a
reference, which are of BTRFS_ADD_DELAYED_REF type.

So in the removal case, all of our delayed refs are of BTRFS_DROP_DELAYED_REF type,
but we have to walk all the refs issued to the extent to find any
BTRFS_ADD_DELAYED_REF types, only to find that there are none, and then start
over again to find BTRFS_DROP_DELAYED_REF.

This is really unnecessary, we can improve this by tracking how many
BTRFS_ADD_DELAYED_REF refs we have and search by the right type.

Signed-off-by: Liu Bo bo.li@oracle.com
---
 fs/btrfs/delayed-ref.c | 10 ++
 fs/btrfs/delayed-ref.h |  3 +++
 fs/btrfs/extent-tree.c | 17 -
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index b0d5d79..9596649 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -543,6 +543,10 @@ update_existing_head_ref(struct btrfs_delayed_ref_node 
*existing,
 * update the reference mod on the head to reflect this new operation
 */
existing-ref_mod += update-ref_mod;
+
+   WARN_ON_ONCE(update-ref_mod  1);
+   if (update-ref_mod == 1)
+   existing_ref-add_cnt++;
 }
 
 /*
@@ -604,6 +608,12 @@ static noinline void add_delayed_ref_head(struct 
btrfs_fs_info *fs_info,
head_ref-must_insert_reserved = must_insert_reserved;
head_ref-is_data = is_data;
 
+   /* track added ref, more comments in select_delayed_ref() */
+   if (count_mod == 1)
+   head_ref-add_cnt = 1;
+   else
+   head_ref-add_cnt = 0;
+
INIT_LIST_HEAD(head_ref-cluster);
mutex_init(head_ref-mutex);
 
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 70b962c..9377b27 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -84,6 +84,9 @@ struct btrfs_delayed_ref_head {
struct list_head cluster;
 
struct btrfs_delayed_extent_op *extent_op;
+
+   int add_cnt;
+
/*
 * when a new extent is allocated, it is just reserved in memory
 * The actual extent isn't inserted into the extent allocation tree
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 009980c..a6fb5fa 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2287,6 +2287,16 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
struct rb_node *node;
struct btrfs_delayed_ref_node *ref;
int action = BTRFS_ADD_DELAYED_REF;
+
+   /*
+* track the count of BTRFS_ADD_DELAYED_REF,
+* in the case that there's no BTRFS_ADD_DELAYED_REF while there're a
+* a great number of BTRFS_DROP_DELAYED_REF,
+* it'll waste time on searching BTRFS_ADD_DELAYED_REF, usually this
+* happens with dedup enabled.
+*/
+   if (head-add_cnt == 0)
+   action = BTRFS_DROP_DELAYED_REF;
 again:
/*
 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
@@ -2301,8 +2311,11 @@ again:
rb_node);
if (ref-bytenr != head-node.bytenr)
break;
-   if (ref-action == action)
+   if (ref-action == action) {
+   if (action == BTRFS_ADD_DELAYED_REF)
+   head-add_cnt--;
return ref;
+   }
node = rb_prev(node);
}
if (action == BTRFS_ADD_DELAYED_REF) {
@@ -2378,6 +2391,8 @@ static noinline int run_clustered_refs(struct 
btrfs_trans_handle *trans,
 * there are still refs with lower seq numbers in the
 * process of being added. Don't run this ref yet.
 */
+   if (ref-action == BTRFS_ADD_DELAYED_REF)
+   locked_ref-add_cnt++;
list_del_init(locked_ref-cluster);
btrfs_delayed_ref_unlock(locked_ref);
locked_ref = NULL;
-- 
1.8.2.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 04/14] Btrfs: disable qgroups accounting when quata_enable is 0

2013-12-30 Thread Liu Bo
It's unnecessary to do qgroups accounting without enabling quota.

Signed-off-by: Liu Bo bo.li@oracle.com
---
 fs/btrfs/ctree.c   |  2 +-
 fs/btrfs/delayed-ref.c | 18 ++
 fs/btrfs/qgroup.c  |  3 +++
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 316136b..160fa3e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -407,7 +407,7 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 
tree_mod_log_write_lock(fs_info);
spin_lock(fs_info-tree_mod_seq_lock);
-   if (!elem-seq) {
+   if (elem  !elem-seq) {
elem-seq = btrfs_inc_tree_mod_seq_major(fs_info);
list_add_tail(elem-list, fs_info-tree_mod_seq_list);
}
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 9e1a1c9..3ec3d08 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -691,8 +691,13 @@ static noinline void add_delayed_tree_ref(struct 
btrfs_fs_info *fs_info,
ref-is_head = 0;
ref-in_tree = 1;
 
-   if (need_ref_seq(for_cow, ref_root))
-   seq = btrfs_get_tree_mod_seq(fs_info, trans-delayed_ref_elem);
+   if (need_ref_seq(for_cow, ref_root)) {
+   struct seq_list *elem = NULL;
+
+   if (fs_info-quota_enabled)
+   elem = trans-delayed_ref_elem;
+   seq = btrfs_get_tree_mod_seq(fs_info, elem);
+   }
ref-seq = seq;
 
full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -750,8 +755,13 @@ static noinline void add_delayed_data_ref(struct 
btrfs_fs_info *fs_info,
ref-is_head = 0;
ref-in_tree = 1;
 
-   if (need_ref_seq(for_cow, ref_root))
-   seq = btrfs_get_tree_mod_seq(fs_info, trans-delayed_ref_elem);
+   if (need_ref_seq(for_cow, ref_root)) {
+   struct seq_list *elem = NULL;
+
+   if (fs_info-quota_enabled)
+   elem = trans-delayed_ref_elem;
+   seq = btrfs_get_tree_mod_seq(fs_info, elem);
+   }
ref-seq = seq;
 
full_ref = btrfs_delayed_node_to_data_ref(ref);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4e6ef49..1cb58f9 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1188,6 +1188,9 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle 
*trans,
 {
struct qgroup_update *u;
 
+   if (!trans-root-fs_info-quota_enabled)
+   return 0;
+
BUG_ON(!trans-delayed_ref_elem.seq);
u = kmalloc(sizeof(*u), GFP_NOFS);
if (!u)
-- 
1.8.2.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 01/14] Btrfs: skip merge part for delayed data refs

2013-12-30 Thread Liu Bo
When we have data deduplication on, we'll hang on the merge part
because it needs to verify every queued delayed data ref related to
this disk offset but we may have millions of refs.

And in the case of delayed data refs, we don't usually have too many
data refs to merge.

So it's safe to shut it down for data refs.

Signed-off-by: Liu Bo bo.li@oracle.com
---
 fs/btrfs/delayed-ref.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e4d467b..b0d5d79 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -320,6 +320,13 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle 
*trans,
struct rb_node *node;
u64 seq = 0;
 
+   /*
+* We don't have too much refs to merge in the case of delayed data
+* refs.
+*/
+   if (head-is_data)
+   return;
+
spin_lock(fs_info-tree_mod_seq_lock);
if (!list_empty(fs_info-tree_mod_seq_list)) {
struct seq_list *elem;
-- 
1.8.2.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 12/14] Btrfs: add ioctl of dedup control

2013-12-30 Thread Liu Bo
So far we have 4 commands to control dedup behaviour,
- btrfs dedup enable
Create the dedup tree, and it's the very first step when you're going to use
the dedup feature.

- btrfs dedup disable
Delete the dedup tree, after this we're not able to use dedup any more unless
you enable it again.

- btrfs dedup on [-b]
Switch on the dedup feature temporarily, and it's the second step of applying
dedup with writes.  Option '-b' is used to set dedup blocksize.
The default blocksize is 8192(no special reason, you may argue), and the current
limit is [4096, 128 * 1024], because 4K is the generic page size and 128K is the
upper limit of btrfs's compression.

- btrfs dedup off
Switch off the dedup feature temporarily, but the dedup tree remains.

Signed-off-by: Liu Bo bo.li@oracle.com
---
 fs/btrfs/ctree.h   |   3 +
 fs/btrfs/disk-io.c |   1 +
 fs/btrfs/ioctl.c   | 167 +
 include/uapi/linux/btrfs.h |  11 +++
 4 files changed, 182 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 52b2843..1b89d6c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1715,6 +1715,9 @@ struct btrfs_fs_info {
u64 dedup_bs;
 
int dedup_type;
+
+   /* protect user change for dedup operations */
+   struct mutex dedup_ioctl_mutex;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2c83de7..9c9667d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2314,6 +2314,7 @@ int open_ctree(struct super_block *sb,
mutex_init(fs_info-dev_replace.lock_finishing_cancel_unmount);
mutex_init(fs_info-dev_replace.lock_management_lock);
mutex_init(fs_info-dev_replace.lock);
+   mutex_init(fs_info-dedup_ioctl_mutex);
 
spin_lock_init(fs_info-qgroup_lock);
mutex_init(fs_info-qgroup_ioctl_lock);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 347bf61..75fb3de 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4481,6 +4481,171 @@ out_unlock:
return ret;
 }
 
+static long btrfs_enable_dedup(struct btrfs_root *root)
+{
+   struct btrfs_fs_info *fs_info = root-fs_info;
+   struct btrfs_trans_handle *trans = NULL;
+   struct btrfs_root *dedup_root;
+   int ret = 0;
+
+   mutex_lock(fs_info-dedup_ioctl_mutex);
+   if (fs_info-dedup_root) {
+   pr_info(btrfs: dedup has already been enabled\n);
+   mutex_unlock(fs_info-dedup_ioctl_mutex);
+   return 0;
+   }
+
+   trans = btrfs_start_transaction(root, 2);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   mutex_unlock(fs_info-dedup_ioctl_mutex);
+   return ret;
+   }
+
+   dedup_root = btrfs_create_tree(trans, fs_info,
+  BTRFS_DEDUP_TREE_OBJECTID);
+   if (IS_ERR(dedup_root))
+   ret = PTR_ERR(dedup_root);
+
+   if (ret)
+   btrfs_end_transaction(trans, root);
+   else
+   ret = btrfs_commit_transaction(trans, root);
+
+   if (!ret) {
+   pr_info(btrfs: dedup enabled\n);
+   fs_info-dedup_root = dedup_root;
+   fs_info-dedup_root-block_rsv = fs_info-global_block_rsv;
+   btrfs_set_fs_incompat(fs_info, DEDUP);
+   }
+
+   mutex_unlock(fs_info-dedup_ioctl_mutex);
+   return ret;
+}
+
+static long btrfs_disable_dedup(struct btrfs_root *root)
+{
+   struct btrfs_fs_info *fs_info = root-fs_info;
+   struct btrfs_root *dedup_root;
+   int ret;
+
+   mutex_lock(fs_info-dedup_ioctl_mutex);
+   if (!fs_info-dedup_root) {
+   pr_info(btrfs: dedup has been disabled\n);
+   mutex_unlock(fs_info-dedup_ioctl_mutex);
+   return 0;
+   }
+
+   if (fs_info-dedup_bs != 0) {
+   pr_info(btrfs: cannot disable dedup until switching off 
dedup!\n);
+   mutex_unlock(fs_info-dedup_ioctl_mutex);
+   return -EBUSY;
+   }
+
+   dedup_root = fs_info-dedup_root;
+
+   ret = btrfs_drop_snapshot(dedup_root, NULL, 1, 0);
+
+   if (!ret) {
+   fs_info-dedup_root = NULL;
+   pr_info(btrfs: dedup disabled\n);
+   }
+
+   mutex_unlock(fs_info-dedup_ioctl_mutex);
+   WARN_ON(ret  0  ret != -EAGAIN  ret != -EROFS);
+   return ret;
+}
+
+static long btrfs_set_dedup_bs(struct btrfs_root *root, u64 bs)
+{
+   struct btrfs_fs_info *info = root-fs_info;
+   int ret = 0;
+
+   mutex_lock(info-dedup_ioctl_mutex);
+   if (!info-dedup_root) {
+   pr_info(btrfs: dedup is disabled, we cannot switch on/off 
dedup\n);
+   ret = -EINVAL;
+   goto out;
+   }
+
+   bs = ALIGN(bs, root-sectorsize);
+   bs = min_t(u64, bs, (128 * 1024ULL));
+
+   if (bs == info-dedup_bs) {
+   if (info-dedup_bs == 0)
+   pr_info(btrfs: switch OFF dedup(it's already 

[PATCH v8 06/14] Btrfs: introduce dedup tree operations

2013-12-30 Thread Liu Bo
The operations consist of finding matched items, adding new items and
removing items.

Signed-off-by: Liu Bo bo.li@oracle.com
---
 fs/btrfs/ctree.h |   9 +++
 fs/btrfs/file-item.c | 210 +++
 2 files changed, 219 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0e5718a..52b2843 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3688,6 +3688,15 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct 
inode *inode,
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 struct list_head *list, int search_commit);
 
+int noinline_for_stack
+btrfs_find_dedup_extent(struct btrfs_root *root, struct btrfs_dedup_hash 
*hash);
+int noinline_for_stack
+btrfs_insert_dedup_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_dedup_hash *hash);
+int noinline_for_stack
+btrfs_free_dedup_extent(struct btrfs_trans_handle *trans,
+   struct btrfs_root *root, u64 hash, u64 bytenr);
 /* inode.c */
 struct btrfs_delalloc_work {
struct inode *inode;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 6f38488..fd95692 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -884,3 +884,213 @@ out:
 fail_unlock:
goto out;
 }
+
+/* 1 means we find one, 0 means we dont. */
+int noinline_for_stack
+btrfs_find_dedup_extent(struct btrfs_root *root, struct btrfs_dedup_hash *hash)
+{
+   struct btrfs_key key;
+   struct btrfs_path *path;
+   struct extent_buffer *leaf;
+   struct btrfs_root *dedup_root;
+   struct btrfs_dedup_item *item;
+   u64 hash_value;
+   u64 length;
+   u64 dedup_size;
+   int compression;
+   int found = 0;
+   int index;
+   int ret;
+
+   if (!hash) {
+   WARN_ON(1);
+   return 0;
+   }
+   if (!root-fs_info-dedup_root) {
+   WARN(1, KERN_INFO dedup not enabled\n);
+   return 0;
+   }
+   dedup_root = root-fs_info-dedup_root;
+
+   path = btrfs_alloc_path();
+   if (!path)
+   return 0;
+
+   /*
+* For SHA256 dedup algorithm, we store the last 64bit as the
+* key.objectid, and the rest in the tree item.
+*/
+   index = btrfs_dedup_lens[hash-type] - 1;
+   dedup_size = btrfs_dedup_sizes[hash-type] - sizeof(u64);
+
+   hash_value = hash-hash[index];
+
+   key.objectid = hash_value;
+   key.offset = (u64)-1;
+   btrfs_set_key_type(key, BTRFS_DEDUP_ITEM_KEY);
+
+   ret = btrfs_search_slot(NULL, dedup_root, key, path, 0, 0);
+   if (ret  0)
+   goto out;
+   if (ret == 0) {
+   WARN_ON(1);
+   goto out;
+   }
+
+prev_slot:
+   /* this will do match checks. */
+   ret = btrfs_previous_item(dedup_root, path, hash_value,
+ BTRFS_DEDUP_ITEM_KEY);
+   if (ret)
+   goto out;
+
+   leaf = path-nodes[0];
+   btrfs_item_key_to_cpu(leaf, key, path-slots[0]);
+   if (key.objectid != hash_value)
+   goto out;
+
+   item = btrfs_item_ptr(leaf, path-slots[0], struct btrfs_dedup_item);
+   /* disk length of dedup range */
+   length = btrfs_dedup_len(leaf, item);
+
+   compression = btrfs_dedup_compression(leaf, item);
+   if (compression  BTRFS_COMPRESS_TYPES) {
+   WARN_ON(1);
+   goto out;
+   }
+
+   if (btrfs_dedup_type(leaf, item) != hash-type)
+   goto prev_slot;
+
+   if (memcmp_extent_buffer(leaf, hash-hash, (unsigned long)(item + 1),
+dedup_size)) {
+   pr_info(goto prev\n);
+   goto prev_slot;
+   }
+
+   hash-bytenr = key.offset;
+   hash-num_bytes = length;
+   hash-compression = compression;
+   found = 1;
+out:
+   btrfs_free_path(path);
+   return found;
+}
+
+int noinline_for_stack
+btrfs_insert_dedup_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_dedup_hash *hash)
+{
+   struct btrfs_key key;
+   struct btrfs_path *path;
+   struct extent_buffer *leaf;
+   struct btrfs_root *dedup_root;
+   struct btrfs_dedup_item *dedup_item;
+   u64 ins_size;
+   u64 dedup_size;
+   int index;
+   int ret;
+
+   if (!hash) {
+   WARN_ON(1);
+   return 0;
+   }
+
+   WARN_ON(hash-num_bytes  root-fs_info-dedup_bs);
+
+   if (!root-fs_info-dedup_root) {
+   WARN(1, KERN_INFO dedup not enabled\n);
+   return 0;
+   }
+   dedup_root = root-fs_info-dedup_root;
+
+   path = btrfs_alloc_path();
+   if (!path)
+   return -ENOMEM;
+
+   /*
+* For SHA256 dedup algorithm, we store the last 64bit 

[PATCH v8 03/14] Btrfs: introduce a head ref rbtree

2013-12-30 Thread Liu Bo
The way we process delayed refs is
1) get a bunch of head refs,
2) pick up one head ref,
3) go one node back for any delayed ref updates.

The head ref is also linked in the same rbtree as the delayed ref is,
so in 1) stage, we have to walk one by one including not only head refs, but
delayed refs.

When we have a great number of delayed refs pending to process,
this costs a lot of time.

Here we introduce a head ref specific rbtree. It only contains head refs, so
stage 1) no longer has to walk over the delayed refs.

Signed-off-by: Liu Bo bo.li@oracle.com
---
 fs/btrfs/delayed-ref.c | 126 +
 fs/btrfs/delayed-ref.h |   5 ++
 fs/btrfs/disk-io.c |   3 ++
 fs/btrfs/extent-tree.c |  21 ++---
 fs/btrfs/transaction.c |   4 +-
 5 files changed, 99 insertions(+), 60 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 9596649..9e1a1c9 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -161,35 +161,61 @@ static struct btrfs_delayed_ref_node *tree_insert(struct 
rb_root *root,
return NULL;
 }
 
+/* insert a new ref to head ref rbtree */
+static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
+  struct rb_node *node)
+{
+   struct rb_node **p = root-rb_node;
+   struct rb_node *parent_node = NULL;
+   struct btrfs_delayed_ref_head *entry;
+   struct btrfs_delayed_ref_head *ins;
+   u64 bytenr;
+
+   ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
+   bytenr = ins-node.bytenr;
+   while (*p) {
+   parent_node = *p;
+   entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
+href_node);
+
+   if (bytenr  entry-node.bytenr)
+   p = (*p)-rb_left;
+   else if (bytenr  entry-node.bytenr)
+   p = (*p)-rb_right;
+   else
+   return entry;
+   }
+
+   rb_link_node(node, parent_node, p);
+   rb_insert_color(node, root);
+   return NULL;
+}
+
 /*
  * find an head entry based on bytenr. This returns the delayed ref
  * head if it was able to find one, or NULL if nothing was in that spot.
  * If return_bigger is given, the next bigger entry is returned if no exact
  * match is found.
  */
-static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
- u64 bytenr,
- struct btrfs_delayed_ref_node **last,
- int return_bigger)
+static struct btrfs_delayed_ref_head *
+find_ref_head(struct rb_root *root, u64 bytenr,
+ struct btrfs_delayed_ref_head **last, int return_bigger)
 {
struct rb_node *n;
-   struct btrfs_delayed_ref_node *entry;
+   struct btrfs_delayed_ref_head *entry;
int cmp = 0;
 
 again:
n = root-rb_node;
entry = NULL;
while (n) {
-   entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
-   WARN_ON(!entry-in_tree);
+   entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
if (last)
*last = entry;
 
-   if (bytenr  entry-bytenr)
+   if (bytenr  entry-node.bytenr)
cmp = -1;
-   else if (bytenr  entry-bytenr)
-   cmp = 1;
-   else if (!btrfs_delayed_ref_is_head(entry))
+   else if (bytenr  entry-node.bytenr)
cmp = 1;
else
cmp = 0;
@@ -203,12 +229,12 @@ again:
}
if (entry  return_bigger) {
if (cmp  0) {
-   n = rb_next(entry-rb_node);
+   n = rb_next(entry-href_node);
if (!n)
n = rb_first(root);
-   entry = rb_entry(n, struct btrfs_delayed_ref_node,
-rb_node);
-   bytenr = entry-bytenr;
+   entry = rb_entry(n, struct btrfs_delayed_ref_head,
+href_node);
+   bytenr = entry-node.bytenr;
return_bigger = 0;
goto again;
}
@@ -246,6 +272,12 @@ static inline void drop_delayed_ref(struct 
btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref)
 {
rb_erase(ref-rb_node, delayed_refs-root);
+   if (btrfs_delayed_ref_is_head(ref)) {
+   struct btrfs_delayed_ref_head *head;
+
+   head = btrfs_delayed_node_to_head(ref);
+   rb_erase(head-href_node, delayed_refs-href_root);
+   }
ref-in_tree = 0;
btrfs_put_delayed_ref(ref);
delayed_refs-num_entries--;
@@ -386,42 +418,35 @@ int 

[PATCH v8 11/14] Btrfs: don't return space for dedup extent

2013-12-30 Thread Liu Bo
If the ordered extent had an IOERR or something else went wrong we need to
return the space for this ordered extent back to the allocator, but if the
extent is marked as a dedup one, we don't free the space because we just
use the existing space instead of allocating new space.

Signed-off-by: Liu Bo bo.li@oracle.com
---
 fs/btrfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3a221bb..4363e1e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3190,6 +3190,7 @@ out:
 * truncated case if we didn't write out the extent at all.
 */
if ((ret || !logical_len) 
+   !ordered_extent-dedup 
!test_bit(BTRFS_ORDERED_NOCOW, ordered_extent-flags) 
!test_bit(BTRFS_ORDERED_PREALLOC, ordered_extent-flags))
btrfs_free_reserved_extent(root, ordered_extent-start,
-- 
1.8.2.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs-progs: add dedup subcommand

2013-12-30 Thread Liu Bo
This adds deduplication subcommands, 'btrfs dedup <command> <path>',
including enable/disable/on/off.

- btrfs dedup enable
Create the dedup tree, and it's the very first step when you're going to use
the dedup feature.

- btrfs dedup disable
Delete the dedup tree, after this we're not able to use dedup any more unless
you enable it again.

- btrfs dedup on [-b]
Switch on the dedup feature temporarily, and it's the second step of applying
dedup with writes.  Option '-b' is used to set dedup blocksize.
The default blocksize is 8192(no special reason, you may argue), and the current
limit is [4096, 128 * 1024], because 4K is the generic page size and 128K is the
upper limit of btrfs's compression.

- btrfs dedup off
Switch off the dedup feature temporarily, but the dedup tree remains.

-
Usage:
Step 1: btrfs dedup enable /btrfs
Step 2: btrfs dedup on /btrfs or btrfs dedup on -b 4K /btrfs
Step 3: now we have dedup, run your test.
Step 4: btrfs dedup off /btrfs
Step 5: btrfs dedup disable /btrfs
-

v3: add commands 'btrfs dedup on/off'
v2: add manpage

Signed-off-by: Liu Bo bo.li@oracle.com
---
 Makefile   |   3 +-
 btrfs.c|   1 +
 cmds-dedup.c   | 178 +
 commands.h |   2 +
 ctree.h|   2 +
 ioctl.h|  12 
 man/btrfs.8.in |  31 +-
 7 files changed, 225 insertions(+), 4 deletions(-)
 create mode 100644 cmds-dedup.c

diff --git a/Makefile b/Makefile
index 0874a41..092f2db 100644
--- a/Makefile
+++ b/Makefile
@@ -13,7 +13,8 @@ objects = ctree.o disk-io.o radix-tree.o extent-tree.o 
print-tree.o \
 cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \
   cmds-inspect.o cmds-balance.o cmds-send.o cmds-receive.o \
   cmds-quota.o cmds-qgroup.o cmds-replace.o cmds-check.o \
-  cmds-restore.o cmds-rescue.o chunk-recover.o super-recover.o
+  cmds-restore.o cmds-rescue.o chunk-recover.o super-recover.o \
+  cmds-dedup.o
 libbtrfs_objects = send-stream.o send-utils.o rbtree.o btrfs-list.o crc32c.o \
   uuid-tree.o
 libbtrfs_headers = send-stream.h send-utils.h send.h rbtree.h btrfs-list.h \
diff --git a/btrfs.c b/btrfs.c
index d5fc738..dfae35f 100644
--- a/btrfs.c
+++ b/btrfs.c
@@ -255,6 +255,7 @@ static const struct cmd_group btrfs_cmd_group = {
{ quota, cmd_quota, NULL, quota_cmd_group, 0 },
{ qgroup, cmd_qgroup, NULL, qgroup_cmd_group, 0 },
{ replace, cmd_replace, NULL, replace_cmd_group, 0 },
+   { dedup, cmd_dedup, NULL, dedup_cmd_group, 0 },
{ help, cmd_help, cmd_help_usage, NULL, 0 },
{ version, cmd_version, cmd_version_usage, NULL, 0 },
NULL_CMD_STRUCT
diff --git a/cmds-dedup.c b/cmds-dedup.c
new file mode 100644
index 000..b959349
--- /dev/null
+++ b/cmds-dedup.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright (C) 2013 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include sys/ioctl.h
+#include unistd.h
+#include getopt.h
+
+#include ctree.h
+#include ioctl.h
+
+#include commands.h
+#include utils.h
+
+static const char * const dedup_cmd_group_usage[] = {
+   btrfs dedup command [options] path,
+   NULL
+};
+
+int dedup_ctl(char *path, struct btrfs_ioctl_dedup_args *args)
+{
+   int ret = 0;
+   int fd;
+   int e;
+   DIR *dirstream = NULL;
+
+   fd = open_file_or_dir(path, dirstream);
+   if (fd  0) {
+   fprintf(stderr, ERROR: can't access '%s'\n, path);
+   return -EACCES;
+   }
+
+   ret = ioctl(fd, BTRFS_IOC_DEDUP_CTL, args);
+   e = errno;
+   close_file_or_dir(fd, dirstream);
+   if (ret  0) {
+   fprintf(stderr, ERROR: dedup command failed: %s\n,
+   strerror(e));
+   if (args-cmd == BTRFS_DEDUP_CTL_DISABLE ||
+   args-cmd == BTRFS_DEDUP_CTL_SET_BS)
+   fprintf(stderr, please refer to 'dmesg | tail' for 
more info\n);
+   return -EINVAL;
+   }
+   return 0;
+}
+
+static const char * const cmd_dedup_enable_usage[] = {
+   btrfs dedup enable path,
+   Enable data deduplication 

[PATCH v8 07/14] Btrfs: introduce dedup state

2013-12-30 Thread Liu Bo
This introduces the dedup state and related operations to mark and unmark
the dedup data range; they will be used in later patches.

Signed-off-by: Liu Bo bo.li@oracle.com
---
 fs/btrfs/extent_io.c | 14 ++
 fs/btrfs/extent_io.h |  5 +
 2 files changed, 19 insertions(+)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8e457fc..54cef32 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1251,6 +1251,20 @@ int clear_extent_uptodate(struct extent_io_tree *tree, 
u64 start, u64 end,
cached_state, mask);
 }
 
+int set_extent_dedup(struct extent_io_tree *tree, u64 start, u64 end,
+struct extent_state **cached_state, gfp_t mask)
+{
+   return set_extent_bit(tree, start, end, EXTENT_DEDUP, 0,
+ cached_state, mask);
+}
+
+int clear_extent_dedup(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask)
+{
+   return clear_extent_bit(tree, start, end, EXTENT_DEDUP, 0, 0,
+   cached_state, mask);
+}
+
 /*
  * either insert or lock state struct between start and end use mask to tell
  * us if waiting is desired.
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 19620c5..5c6a78d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,6 +20,7 @@
 #define EXTENT_NEED_WAIT (1  13)
 #define EXTENT_DAMAGED (1  14)
 #define EXTENT_NORESERVE (1  15)
+#define EXTENT_DEDUP (1  16)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
@@ -227,6 +228,10 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 
start, u64 end,
struct extent_state **cached_state, gfp_t mask);
 int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
  struct extent_state **cached_state, gfp_t mask);
+int set_extent_dedup(struct extent_io_tree *tree, u64 start, u64 end,
+struct extent_state **cached_state, gfp_t mask);
+int clear_extent_dedup(struct extent_io_tree *tree, u64 start, u64 end,
+  struct extent_state **cached_state, gfp_t mask);
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
   gfp_t mask);
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
-- 
1.8.2.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 08/14] Btrfs: make ordered extent aware of dedup

2013-12-30 Thread Liu Bo
This adds a dedup flag and dedup hash into ordered extent so that
we can insert dedup extents to dedup tree at endio time.

The benefit is simplicity: we don't need to fall back to cleaning up dedup
structures if the write is cancelled for some reason.

Signed-off-by: Liu Bo bo.li@oracle.com
---
 fs/btrfs/ordered-data.c | 38 --
 fs/btrfs/ordered-data.h | 13 -
 2 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 69582d5..a61c327 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -183,7 +183,8 @@ static inline struct rb_node *tree_search(struct 
btrfs_ordered_inode_tree *tree,
  */
 static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
  u64 start, u64 len, u64 disk_len,
- int type, int dio, int compress_type)
+ int type, int dio, int compress_type,
+ int dedup, struct btrfs_dedup_hash *hash)
 {
struct btrfs_root *root = BTRFS_I(inode)-root;
struct btrfs_ordered_inode_tree *tree;
@@ -199,10 +200,23 @@ static int __btrfs_add_ordered_extent(struct inode 
*inode, u64 file_offset,
entry-start = start;
entry-len = len;
if (!(BTRFS_I(inode)-flags  BTRFS_INODE_NODATASUM) 
-   !(type == BTRFS_ORDERED_NOCOW))
+   !(type == BTRFS_ORDERED_NOCOW)  !dedup)
entry-csum_bytes_left = disk_len;
entry-disk_len = disk_len;
entry-bytes_left = len;
+   entry-dedup = dedup;
+   entry-hash = NULL;
+
+   if (!dedup  hash) {
+   entry-hash = kzalloc(btrfs_dedup_hash_size(hash-type),
+ GFP_NOFS);
+   if (!entry-hash) {
+   kmem_cache_free(btrfs_ordered_extent_cache, entry);
+   return -ENOMEM;
+   }
+   memcpy(entry-hash, hash, btrfs_dedup_hash_size(hash-type));
+   }
+
entry-inode = igrab(inode);
entry-compress_type = compress_type;
entry-truncated_len = (u64)-1;
@@ -251,7 +265,17 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 
file_offset,
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 0,
- BTRFS_COMPRESS_NONE);
+ BTRFS_COMPRESS_NONE, 0, NULL);
+}
+
+int btrfs_add_ordered_extent_dedup(struct inode *inode, u64 file_offset,
+  u64 start, u64 len, u64 disk_len, int type,
+  int dedup, struct btrfs_dedup_hash *hash,
+  int compress_type)
+{
+   return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 0,
+ compress_type, dedup, hash);
 }
 
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
@@ -259,16 +283,17 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 
file_offset,
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 1,
- BTRFS_COMPRESS_NONE);
+ BTRFS_COMPRESS_NONE, 0, NULL);
 }
 
 int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
  u64 start, u64 len, u64 disk_len,
- int type, int compress_type)
+ int type, int compress_type,
+ struct btrfs_dedup_hash *hash)
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 0,
- compress_type);
+ compress_type, 0, hash);
 }
 
 /*
@@ -501,6 +526,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent 
*entry)
list_del(sum-list);
kfree(sum);
}
+   kfree(entry-hash);
kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
 }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 9b0450f..75f3ec2 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -109,6 +109,9 @@ struct btrfs_ordered_extent {
/* compression algorithm */
int compress_type;
 
+   /* whether this ordered extent is marked for dedup or not */
+   int dedup;
+
/* reference count */
atomic_t refs;
 
@@ -135,6 +138,9 @@ struct btrfs_ordered_extent {
struct completion completion;
struct btrfs_work flush_work;

[PATCH v8 10/14] Btrfs: skip dedup reference during backref walking

2013-12-30 Thread Liu Bo
The dedup ref is quite a special one, it is just used to store the hash value
of the extent and cannot be used to find data, so we skip it during backref
walking.

Signed-off-by: Liu Bo bo.li@oracle.com
---
 fs/btrfs/backref.c| 9 +
 fs/btrfs/relocation.c | 3 +++
 2 files changed, 12 insertions(+)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 3775947..1ec0046 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -590,6 +590,9 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head 
*head, u64 seq,
key.objectid = ref-objectid;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = ref-offset;
+   if (ref-root == BTRFS_DEDUP_TREE_OBJECTID)
+   break;
+
ret = __add_prelim_ref(prefs, ref-root, key, 0, 0,
   node-bytenr,
   node-ref_mod * sgn, GFP_ATOMIC);
@@ -708,6 +711,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
root = btrfs_extent_data_ref_root(leaf, dref);
+   if (root == BTRFS_DEDUP_TREE_OBJECTID)
+   break;
+
ret = __add_prelim_ref(prefs, root, key, 0, 0,
   bytenr, count, GFP_NOFS);
break;
@@ -791,6 +797,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
root = btrfs_extent_data_ref_root(leaf, dref);
+   if (root == BTRFS_DEDUP_TREE_OBJECTID)
+   break;
+
ret = __add_prelim_ref(prefs, root, key, 0, 0,
   bytenr, count, GFP_NOFS);
break;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 429c73c..a06e448 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3509,6 +3509,9 @@ static int find_data_references(struct reloc_control *rc,
ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
ref_count = btrfs_extent_data_ref_count(leaf, ref);
 
+   if (ref_root == BTRFS_DEDUP_TREE_OBJECTID)
+   return 0;
+
/*
 * This is an extent belonging to the free space cache, lets just delete
 * it and redo the search.
-- 
1.8.2.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 09/14] Btrfs: online(inband) data dedup

2013-12-30 Thread Liu Bo
The main part of data dedup.

This introduces a FORMAT CHANGE.

Btrfs provides online(inband/synchronous) and block-level dedup.

It maps naturally to btrfs's block back-reference, which enables us
to store multiple copies of data as single copy with references
on that copy.

The workflow is
(1) write some data,
(2) get the hash of these data based on btrfs's dedup blocksize.
(3) find matched extents by hash and decide whether to mark it
as a duplicate one or not.  If no, write the data onto disk,
otherwise, add a reference to the matched extent.

Btrfs's built-in dedup supports normal writes and compressed writes.

Signed-off-by: Liu Bo bo.li@oracle.com
---
 fs/btrfs/extent-tree.c | 150 ++--
 fs/btrfs/extent_io.c   |   8 +-
 fs/btrfs/extent_io.h   |  11 +
 fs/btrfs/inode.c   | 640 +++--
 4 files changed, 712 insertions(+), 97 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index aa40a5e..f14db92 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1119,8 +1119,16 @@ static noinline int lookup_extent_data_ref(struct 
btrfs_trans_handle *trans,
key.offset = parent;
} else {
key.type = BTRFS_EXTENT_DATA_REF_KEY;
-   key.offset = hash_extent_data_ref(root_objectid,
- owner, offset);
+
+   /*
+* we've not got the right offset and owner, so search by -1
+* here.
+*/
+   if (root_objectid == BTRFS_DEDUP_TREE_OBJECTID)
+   key.offset = (u64)-1;
+   else
+   key.offset = hash_extent_data_ref(root_objectid,
+ owner, offset);
}
 again:
recow = 0;
@@ -1147,6 +1155,10 @@ again:
goto fail;
}
 
+   if (ret  0  root_objectid == BTRFS_DEDUP_TREE_OBJECTID 
+   path-slots[0]  0)
+   path-slots[0]--;
+
leaf = path-nodes[0];
nritems = btrfs_header_nritems(leaf);
while (1) {
@@ -1170,14 +1182,22 @@ again:
ref = btrfs_item_ptr(leaf, path-slots[0],
 struct btrfs_extent_data_ref);
 
-   if (match_extent_data_ref(leaf, ref, root_objectid,
- owner, offset)) {
-   if (recow) {
-   btrfs_release_path(path);
-   goto again;
+   if (root_objectid == BTRFS_DEDUP_TREE_OBJECTID) {
+   if (btrfs_extent_data_ref_root(leaf, ref) ==
+   root_objectid) {
+   err = 0;
+   break;
+   }
+   } else {
+   if (match_extent_data_ref(leaf, ref, root_objectid,
+ owner, offset)) {
+   if (recow) {
+   btrfs_release_path(path);
+   goto again;
+   }
+   err = 0;
+   break;
}
-   err = 0;
-   break;
}
path-slots[0]++;
}
@@ -1321,6 +1341,32 @@ static noinline int remove_extent_data_ref(struct 
btrfs_trans_handle *trans,
return ret;
 }
 
+static noinline u64 extent_data_ref_offset(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_extent_inline_ref *iref)
+{
+   struct btrfs_key key;
+   struct extent_buffer *leaf;
+   struct btrfs_extent_data_ref *ref1;
+   u64 offset = 0;
+
+   leaf = path-nodes[0];
+   btrfs_item_key_to_cpu(leaf, key, path-slots[0]);
+   if (iref) {
+   WARN_ON(btrfs_extent_inline_ref_type(leaf, iref) !=
+   BTRFS_EXTENT_DATA_REF_KEY);
+   ref1 = (struct btrfs_extent_data_ref *)(iref-offset);
+   offset = btrfs_extent_data_ref_offset(leaf, ref1);
+   } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+   ref1 = btrfs_item_ptr(leaf, path-slots[0],
+ struct btrfs_extent_data_ref);
+   offset = btrfs_extent_data_ref_offset(leaf, ref1);
+   } else {
+   WARN_ON(1);
+   }
+   return offset;
+}
+
 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
  struct btrfs_path *path,
  struct btrfs_extent_inline_ref *iref)
@@ -1587,7 +1633,8 @@ again:
err = -ENOENT;
while (1) {
if (ptr = end) {
-   

[PATCH v8 14/14] Btrfs: fix a crash of dedup ref

2013-12-30 Thread Liu Bo
The dedup reference is a special kind of delayed refs, and the delayed refs
are batched to be processed later.

If we find a matched dedup extent, then we queue an ADD delayed ref on it within
endio work, but there is already a DROP delayed ref queued,

   t1 t2  t3
-writepage commit transaction
  -run_delalloc_dedup
 find_dedup
--
   process_delayed refs
(it deletes the dedup 
extent)
 add ordered extent|
 submit pages  |
  finish ordered io|
insert file extents|
queue delayed refs |
queue dedup ref|
 process delayed refs 
continues
 (insert a ref on an extent
  deleted by the above)

This scenario ends up with a crash because we're going to insert a ref on
a deleted extent.

To avoid the race, we need to check whether there is an ADD delayed ref when
deleting the extent, and protect this job with a lock.

Signed-off-by: Liu Bo bo.li@oracle.com
---
 fs/btrfs/ctree.h   |  3 ++-
 fs/btrfs/extent-tree.c | 35 +++
 fs/btrfs/file-item.c   | 36 +++-
 fs/btrfs/inode.c   | 10 ++
 4 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1b89d6c..8a35cdf 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3692,7 +3692,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 
start, u64 end,
 struct list_head *list, int search_commit);
 
 int noinline_for_stack
-btrfs_find_dedup_extent(struct btrfs_root *root, struct btrfs_dedup_hash 
*hash);
+btrfs_find_dedup_extent(struct btrfs_root *root, struct btrfs_dedup_hash *hash,
+   struct inode *inode, u64 file_pos);
 int noinline_for_stack
 btrfs_insert_dedup_extent(struct btrfs_trans_handle *trans,
  struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index df3a645..a140ea9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5996,9 +5996,23 @@ again:
goto again;
}
} else {
-   if (!dedup_hash  is_data 
-   root_objectid == BTRFS_DEDUP_TREE_OBJECTID)
-   dedup_hash = extent_data_ref_offset(root, path, iref);
+   if (is_data  root_objectid == BTRFS_DEDUP_TREE_OBJECTID) {
+   if (!dedup_hash)
+   dedup_hash = extent_data_ref_offset(root,
+   path, iref);
+
+   ret = btrfs_free_dedup_extent(trans, root,
+ dedup_hash, bytenr);
+   if (ret) {
+   if (ret == -EAGAIN)
+   ret = 0;
+   else
+   btrfs_abort_transaction(trans,
+   extent_root,
+   ret);
+   goto out;
+   }
+   }
 
if (found_extent) {
BUG_ON(is_data  refs_to_drop !=
@@ -6023,21 +6037,10 @@ again:
if (is_data) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
if (ret) {
-   btrfs_abort_transaction(trans, extent_root, 
ret);
+   btrfs_abort_transaction(trans,
+   extent_root, ret);
goto out;
}
-
-   if (root_objectid == BTRFS_DEDUP_TREE_OBJECTID) {
-   ret = btrfs_free_dedup_extent(trans, root,
- dedup_hash,
- bytenr);
-   if (ret) {
-   btrfs_abort_transaction(trans,
-   extent_root,
-   ret);
-   goto out;
- 

[PATCH v8 13/14] Btrfs: fix dedupe 'ENOSPC' problem

2013-12-30 Thread Liu Bo
With dedupe, writes are likely to produce metadata but no data, which means
producing more delayed_refs, and this can end up aborting a transaction
because it needs to allocate enough free space from global_rsv to turn
delayed_refs into real extent tree's nodes/leaves and global_rsv is used up.

So we must take the worst case for throttling delayed refs in this dedup case,
i.e., allocating one leaf on each delayed ref head update.

Signed-off-by: Liu Bo bo.li@oracle.com
---
 fs/btrfs/extent-tree.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f14db92..df3a645 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2661,6 +2661,19 @@ static inline u64 heads_to_leaves(struct btrfs_root 
*root, u64 heads)
 {
u64 num_bytes;
 
+   /*
+* For deduplication it's a special case where we must be very careful
+* on throttling the number of delayed_refs, just because with dedupe,
+* we're likely to produce _only_ metadata but no data, which results in
+* quite a lot of delayed_refs, and consequently global_rsv may be used
+* up and we get a RO btrfs.
+*
+* So we have to take the worst case -- one leaf for updating a ref head
+* and its related refs.
+*/
+   if (root-fs_info-dedup_bs != 0)
+   return heads;
+
num_bytes = heads * (sizeof(struct btrfs_extent_item) +
 sizeof(struct btrfs_extent_inline_ref));
if (!btrfs_fs_incompat(root-fs_info, SKINNY_METADATA))
-- 
1.8.2.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Migrate to bcache: A few questions

2013-12-30 Thread Marc MERLIN
On Mon, Dec 30, 2013 at 02:22:55AM +0100, Kai Krakow wrote:
 These thought are actually quite interesting. So you are saying that data 
 may not be fully written to SSD although the kernel thinks so? This is 

That, and worse.

Incidentally, I have just posted on my G+ about this:
https://plus.google.com/106981743284611658289/posts/Us8yjK9SPs6

which is mostly links to
http://lkcl.net/reports/ssd_analysis.html
https://www.usenix.org/conference/fast13/understanding-robustness-ssds-under-power-fault

After you read those, you'll never think twice about SSDs and data loss
anymore :-/
(I kind of found that out myself over time too, but these have much more
data than I got myself empirically on a couple of SSDs)

Marc
-- 
A mouse is a device used to point at the xterm you want to type in - A.S.R.
Microsoft is to operating systems 
   what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/ | PGP 1024R/763BE901
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs-progs: add dedup subcommand

2013-12-30 Thread Martin Steigerwald
Am Montag, 30. Dezember 2013, 16:12:55 schrieben Sie:
 This adds deduplication subcommands, 'btrfs dedup command path',
 including enable/disable/on/off.

Nice. Looking forward to testing it.
 
 - btrfs dedup enable
 Create the dedup tree, and it's the very first step when you're going to use
 the dedup feature.
 
 - btrfs dedup disable
 Delete the dedup tree, after this we're not able to use dedup any more
 unless you enable it again.

So if deduplication has been switched on for a while, btrfs dedup disable will 
cause BTRFS to undo the deduplication (and thus require more space for the 
same amount of data)?

Thanks and happy new year,
-- 
Martin 'Helios' Steigerwald - http://www.Lichtvoll.de
GPG: 03B0 0D6C 0040 0710 4AFA  B82F 991B EAAC A599 84C7
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Migrate to bcache: A few questions

2013-12-30 Thread Austin S Hemmelgarn
On 12/29/2013 04:11 PM, Kai Krakow wrote:
 Hello list!
 
 I'm planning to buy a small SSD (around 60GB) and use it for bcache in front 
 of my 3x 1TB HDD btrfs setup (mraid1+draid0) using write-back caching. Btrfs 
 is my root device, thus the system must be able to boot from bcache using 
 init ramdisk. My /boot is a separate filesystem outside of btrfs and will be 
 outside of bcache. I am using Gentoo as my system.
 
 I have a few questions:
 
 * How stable is it? I've read about some csum errors lately...
 
 * I want to migrate my current storage to bcache without replaying a backup.
   Is it possible?
 
 * Did others already use it? What is the perceived performance for desktop
   workloads in comparision to not using bcache?
 
 * How well does bcache handle power outages? Btrfs does handle them very
   well since many months.
 
 * How well does it play with dracut as initrd? Is it as simple as telling it
   the new device nodes or is there something complicate to configure?
 
 * How does bcache handle a failing SSD when it starts to wear out in a few
   years?
 
 * Is it worth waiting for hot-relocation support in btrfs to natively use
   a SSD as cache?
 
 * Would you recommend going with a bigger/smaller SSD? I'm planning to use
   only 75% of it for bcache so wear-leveling can work better, maybe use
   another part of it for hibernation (suspend to disk).
I've actually tried a similar configuration myself a couple of times
(also using Gentoo, in fact), and I can tell you from experience that
unless things have changed greatly since kernel 3.12.1, it really isn't
worth the headaches.  Setting it up on an already installed system is a
serious pain because the backing device has to be reformatted with a
bcache super-block.  In addition, every kernel that I have tried that
had bcache compiled in or loaded as a module had issues, I would see a
kernel OOPS on average once a day from the bcache code, usually followed
shortly by a panic from some other unrelated subsystem.  I didn't get
any actual data corruption, but I wasn't using btrfs at the time for any
of my filesystems.

As an alternative to using bcache, you might try something similar to
the following:
64G SSD with /boot, /, and /usr
Other HDD with /var, /usr/portage, /usr/src, and /home
tmpfs or ramdisk for /tmp and /var/tmp
This is essentially what I use now, and I have found that it
significantly improves system performance.
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: systemd-journal, nodatacow, was: Is anyone using btrfs send/receive for backups instead of rsync?

2013-12-30 Thread Chris Mason
On Sun, 2013-12-29 at 17:38 -0700, Chris Murphy wrote:
 On Dec 29, 2013, at 5:39 AM, Duncan 1i5t5.dun...@cox.net wrote:
 
  Yes, it does turn off checksumming as well as COW, but given the write-
  into scenario, that's actually best anyway, because otherwise btrfs has 
  to keep updating the checksums
 
 On second thought, I'm less concerned with bitrot and checksumming being lost 
 with nodatacow, than I am with significantly increasing the chance the 
 journal is irreparably lost due to corruption during an unclean shutdown.

So first, send/receive + NOCOW aren't a great combination.  NOCOW won't
update the generation numbers send/receive needs to find changes.  The
best send/receive can do in that case is send over the entire file.

 
  But in all these cases, it's also quite common for the application doing 
  the writing to have its own checksumming/error-detection and possible 
  correction -- it pretty much comes with the territory -- in which case 
  btrfs attempting to do the same is simply superfluous even if it weren't 
  a race-condition trigger.
 
 I don't know what kind of checksumming systemd performs on the journal, but 
 whenever Btrfs has found corruption with the journal file(s), 
 systemd-journald has also found corruption and starts a new log. So it makes 
 sense to rely on its own mechanisms, than Btrfs's.
 

The autodefrag mode was really made for the small databases like
systemd.  I'd prefer that we use that for systemd instead of suggesting
NOCOW.  I'm finally dusting off my work to improve db performance, so
hopefully we can do much better in the near future.

-chris

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Is anyone using btrfs send/receive for backups instead of rsync?

2013-12-30 Thread Chris Mason
On Sat, 2013-12-28 at 10:20 -0800, Marc MERLIN wrote:
 On Sat, Dec 28, 2013 at 10:07:58AM -0800, Marc MERLIN wrote:
  For instance, if I use an existing rsync destination to start syncing
  btrfs snapshots to after that, and one file operation can't be applied
  because let's say the destination file it's supposed to be applied to,
  isn't there?
 
 I should have written more: I'm guessing what happens is that the btrfs
 receive fails/aborts, I get an error, I then run a manual rsync to reset
 everything to a good known state, and then continue the btrfs
 send/receive after that?

Btrfs send/receive works by matching state between snapshots on the
sending and receiving end.  If you update the files manually on the
receiving end (say with rsync), it can't merge the states anymore.

-chris

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Is anyone using btrfs send/receive for backups instead of rsync?

2013-12-30 Thread Marc MERLIN
On Mon, Dec 30, 2013 at 04:05:03PM +, Chris Mason wrote:
 On Sat, 2013-12-28 at 10:20 -0800, Marc MERLIN wrote:
  On Sat, Dec 28, 2013 at 10:07:58AM -0800, Marc MERLIN wrote:
   For instance, if I use an existing rsync destination to start syncing
   btrfs snapshots to after that, and one file operation can't be applied
   because let's say the destination file it's supposed to be applied to,
   isn't there?
  
  I should have written more: I'm guessing what happens is that the btrfs
  receive fails/aborts, I get an error, I then run a manual rsync to reset
  everything to a good known state, and then continue the btrfs
  send/receive after that?
 
 Btrfs send/receive works by matching state between snapshots on the
 sending and receiving end.  If you update the files manually on the
 receiving end (say with rsync), it can't merge the states anymore.

I got that, but it wasn't quite my question :)

I understand that btrfs receive cannot apply file changes if the
destination filesystem isn't in a file state that's identical to the source
one.

I'm just not too sure how the destination FS needs to be configured so
that btrfs receive can work with it.

1) Does it need to be an exact byte for byte copy of the block device the
source was on?

2) Or can the destination be seeded with a full rsync or cp -a and can btrfs 
receive
take over from there?

3) Then, if I hit a bug where something doesn't get synced right, and I run
rsync to fix or verify that the two FS are indeed identical file-wise
like they're supposed to, if rsync fixes something, are you saying that
it'll stop btrfs receive from working after that?

Thanks,
Marc
-- 
A mouse is a device used to point at the xterm you want to type in - A.S.R.
Microsoft is to operating systems 
   what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/ | PGP 1024R/763BE901
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Is anyone using btrfs send/receive for backups instead of rsync?

2013-12-30 Thread Chris Mason
On Mon, 2013-12-30 at 08:17 -0800, Marc MERLIN wrote:
 On Mon, Dec 30, 2013 at 04:05:03PM +, Chris Mason wrote:
  On Sat, 2013-12-28 at 10:20 -0800, Marc MERLIN wrote:
   On Sat, Dec 28, 2013 at 10:07:58AM -0800, Marc MERLIN wrote:
For instance, if I use an existing rsync destination to start syncing
btrfs snapshots to after that, and one file operation can't be applied
because let's say the destination file it's supposed to be applied to,
isn't there?
   
   I should have written more: I'm guessing what happens is that the btrfs
   receive fails/aborts, I get an error, I then run a manual rsync to reset
   everything to a good known state, and then continue the btrfs
   send/receive after that?
  
  Btrfs send/receive works by matching state between snapshots on the
  sending and receiving end.  If you update the files manually on the
  receiving end (say with rsync), it can't merge the states anymore.
 
 I got that, but it wasn't quite my question :)
 
 I understand that btrfs receive cannot apply file changes if the
 destination filesystem isn't in a file state that's identical to the source
 one.
 
 I'm just not too sure how the destination FS needs to be configured so
 that btrfs receive can work with it.
 
 1) Does it need to be an exact byte for byte copy of the block device the
 source was on?
 
No, in fact this doesn't help.

 2) Or can the destination be seeded with a full rsync or cp -a and can btrfs 
 receive
 take over from there?
 

No, it has to be created by btrfs receive.

 3) Then, if I hit a bug where something doesn't get synced right, and I run
 rsync to fix or verify that the two FS are indeed identical file-wise
 like they're supposed to, if rsync fixes something, are you saying that
 it'll stop btrfs receive from working after that?
 

Yes, today anyway it won't work.  Send converts the changed items into
an intermediate format (we don't send btree blocks directly over the
wire) and then receive modifies the destination from userland.

At the end of the stream we update the destination root to say you're
now version xxyyzz of uuid aabbcc.

We definitely could add a way to manually set this, but once a user does
it, it'll be very hard to debug any problems they might have had if
their copy wasn't actually up to date.

-chris

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Is anyone using btrfs send/receive for backups instead of rsync?

2013-12-30 Thread Marc MERLIN
On Mon, Dec 30, 2013 at 04:26:42PM +, Chris Mason wrote:
  1) Does it need to be an exact byte for byte copy of the block device the
  source was on?
  
 No, in fact this doesn't help.
 
  2) Or can the destination be seeded with a full rsync or cp -a and can 
  btrfs receive
  take over from there?
 
 No, it has to be created by btrfs receive.

Aaah, I wasn't clear on that, thanks for clarifying.
So I need to make sure the target block device is at least as big as the
source one, and if necessary a few blocks bigger if the drives do not
allocate partitions of exactly the same size.

Mmmh, this makes it less desirable for me to use this then since I use over
allocation on the backup servers and if I had to have as much space blocked
off for the full size of each filesystem backed up, I'm going to be short.

Bummer.
 
  3) Then, if I hit a bug where something doesn't get synced right, and I run
  rsync to fix or verify that the two FS are indeed identical file-wise
  like they're supposed to, if rsync fixes something, are you saying that
  it'll stop btrfs receive from working after that?
 
 Yes, today anyway it won't work.  Send converts the changed items into
 an intermediate format (we don't send btree blocks directly over the
 wire) and then receive modifies the destination from userland.
 
 At the end of the stream we update the destination root to say you're
 now version xxyyzz of uuid aabbcc.
 
 We definitely could add a way to manually set this, but once a user does
 it, it'll be very hard to debug any problems they might have had if
 their copy wasn't actually up to date.

Understood. I dreamt that it was computing file differences and could just
apply them on top of any other btrfs filesystem, even if it were smaller and
had been created via rsync.

If one day, it could at least work on a subvolume level (only sync a
subvolume), then it would be more useful to me. Maybe later...

Thanks for clearing that up.

Marc
-- 
A mouse is a device used to point at the xterm you want to type in - A.S.R.
Microsoft is to operating systems 
   what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/  
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: missing /sbin/fsck.btrfs

2013-12-30 Thread Tom Gundersen
On Mon, Dec 2, 2013 at 12:01 AM, Dave Chinner da...@fromorbit.com wrote:
 I just explained that things can go wrong if
 you don't detect certain types of errors in fsck.foo when it is
 called from fstab processing.

 What I am implying here is that we cannot prevent users from setting
 passno to 1 or 2 in /etc/fstab. We have no control over that and so
 asserting that we don't need a fsck.btrfs because we can set passno
 to 0 is invalid. IOWs, fsck.btrfs needs to be present and it needs
 to behave correctly in these cases

I actually think what btrfs is doing here is the more sensible thing
(i.e., to not ship an fsck.btrfs), as it is a bit confusing to have a
fsck.* that does not in fact do any filesystem checking.

The way this stuff works under systemd is:

 * fsck is only ever called on a filesystem once the backing device
has appeared (so under systemd, fsck.xfs is indeed a noop).
 * fsck is skipped for filesystems where the relevant helper does not
exist, so fs_passno=1 has the same effect for xfs and btrfs
filesystems (either way, nothing happens).

That still leaves non-systemd systems and calling fsck -A manually.
Maybe a good solution would be to patch fsck to adopt systemd's
behavior, which would avoid every filesystem having to ship these
fake fsck helpers? What do you think Karel?

Cheers,

Tom
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Is anyone using btrfs send/receive for backups instead of rsync?

2013-12-30 Thread Chris Murphy

On Dec 30, 2013, at 10:10 AM, Marc MERLIN m...@merlins.org wrote:
 
 If one day, it could at least work on a subvolume level (only sync a
 subvolume), then it would be more useful to me. Maybe later…

Maybe I'm missing something, but btrfs send/receive only work on a subvolume 
level.


Chris Murphy--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Is anyone using btrfs send/receive for backups instead of rsync?

2013-12-30 Thread Marc MERLIN
On Mon, Dec 30, 2013 at 10:48:10AM -0700, Chris Murphy wrote:
 
 On Dec 30, 2013, at 10:10 AM, Marc MERLIN m...@merlins.org wrote:
  
  If one day, it could at least work on a subvolume level (only sync a
  subvolume), then it would be more useful to me. Maybe later…
 
 Maybe I'm missing something, but btrfs send/receive only work on a subvolume 
 level.

Never mind, I seem to be the one being dense. I mis-read that you needed
to create the filesystem with btrfs receive.
Indeed, it's on a subvolume level, so it's actually fine since it does
allow over-provisioning after all.

My bad, sorry :)

Marc
-- 
A mouse is a device used to point at the xterm you want to type in - A.S.R.
Microsoft is to operating systems 
   what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/ | PGP 1024R/763BE901
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Is anyone using btrfs send/receive for backups instead of rsync?

2013-12-30 Thread Chris Murphy

On Dec 30, 2013, at 10:57 AM, Marc MERLIN m...@merlins.org wrote:

 On Mon, Dec 30, 2013 at 10:48:10AM -0700, Chris Murphy wrote:
 
 On Dec 30, 2013, at 10:10 AM, Marc MERLIN m...@merlins.org wrote:
 
 If one day, it could at least work on a subvolume level (only sync a
 subvolume), then it would be more useful to me. Maybe later…
 
 Maybe I'm missing something, but btrfs send/receive only work on a subvolume 
 level.
 
 Never mind, I seem to be the one being dense. I mis-read that you needed
 to create the filesystem with btrfs receive.
 Indeed, it's on a subvolume level, so it's actually fine since it does
 allow over-provisioning after all.

Depending on resources and disaster recovery requirements, you might also 
consider using send -f without receive at all, to the backup destination. The 
first send file (which will be big) can then be put anywhere, even to tape, and 
use the backup storage just for the incremental send -f files.


Chris Murphy--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Migrate to bcache: A few questions

2013-12-30 Thread Kai Krakow
Marc MERLIN m...@merlins.org schrieb:

 On Mon, Dec 30, 2013 at 02:22:55AM +0100, Kai Krakow wrote:
 These thoughts are actually quite interesting. So you are saying that data
 may not be fully written to SSD although the kernel thinks so? This is
 
 That, and worse.
 
 Incidentally, I have just posted on my G+ about this:
 https://plus.google.com/106981743284611658289/posts/Us8yjK9SPs6
 
 which is mostly links to
 http://lkcl.net/reports/ssd_analysis.html
 https://www.usenix.org/conference/fast13/understanding-robustness-ssds-under-power-fault
 
 After you read those, you'll never think twice about SSDs and data loss
 anymore :-/
 (I kind of found that out myself over time too, but these have much more
 data than I got myself empirically on a couple of SSDs)

The bad thing here is: Even battery-backed RAID controllers won't help you 
here. I start to understand why I still don't trust this new technology 
entirely.

Thanks,
Kai

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


question regarding caching

2013-12-30 Thread Aastha Mehta
Hello,

I have some questions regarding caching in BTRFS. When a file system
is unmounted and mounted again, would all the previously cached
content be removed from the cache after flushing to disk? After
remounting, would the initial requests always be fetched from the
disk?

Rather than a local disk, I have a remote device to which my IO
requests are sent and from which the data is fetched. I need certain
data to be fetched from the remote device after a remount. But somehow
I do not see any request appearing at the device. I even tried to do
drop_caches after remounting the file system, but that does not seem
to help.

I guess my problem is not related to BTRFS, but since I am working
with BTRFS, I wanted to ask here for help. Could any one tell me how I
can ensure that requests are fetched from the (remote) device,
especially after file system remount, without having to use
drop_caches?

Please let me know if I described the problem too vaguely and should
give some more details.

Wishing everyone a happy new year.

Thanks and regards,
Aastha.

-- 
Aastha Mehta
MPI-SWS, Germany
E-mail: aasth...@mpi-sws.org
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


A question about btrfs_ioctl_file_extent_same

2013-12-30 Thread Michael Welsh Duggan
A simple question about btrfs_ioctl_file_extent_same...

Is there any check before btrfs_cmp_data is called that short-circuits
things if the extents being compared are already the same extent?  I was
looking and could not find such a check, but I don't really know the
btrfs source code that well.  If there is no such check, would it not
make sense to add one?

-- 
Michael Welsh Duggan
(m...@md5i.com)

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs-progs: add dedup subcommand

2013-12-30 Thread Liu Bo
On Mon, Dec 30, 2013 at 12:34:42PM +0100, Martin Steigerwald wrote:
 Am Montag, 30. Dezember 2013, 16:12:55 schrieben Sie:
  This adds deduplication subcommands, 'btrfs dedup command path',
  including enable/disable/on/off.
 
 Nice. Looking forward to test it.

Well, I just got a report from another user, Marcel, who still got ENOSPC
errors with this round of the patch set, so it seems that I didn't really fix
that bug. I guess I have to work harder on this :-(

  
  - btrfs dedup enable
  Create the dedup tree, and it's the very first step when you're going to use
  the dedup feature.
  
  - btrfs dedup disable
  Delete the dedup tree, after this we're not able to use dedup any more
  unless you enable it again.
 
 So if deduplication has been switched on for a while, btrfs dedup disable 
 will 
 cause BTRFS to undo the deduplication (and thus require more space for the 
 same amount of data)?

No, it remains unchanged, and the data is independent of dedupe, so you can read
them without any problems.

Happy new year.

Thanks,
-liubo
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Migrate to bcache: A few questions

2013-12-30 Thread Kai Krakow
Duncan 1i5t5.dun...@cox.net schrieb:

[ spoiler: tldr ;-) ]

 * How stable is it? I've read about some csum errors lately...
 
 FWIW, both bcache and btrfs are new and still developing technology.
 While I'm using btrfs here, I have tested usable (which for root means
 either directly bootable or that you have tested booting to a
 recovery image and restoring from there, I do the former, here) backups,
 as STRONGLY recommended for btrfs in its current state, but haven't had
 to use them.
 
 And I considered bcache previously and might otherwise be using it, but
 at least personally, I'm not willing to try BOTH of them at once, since
 neither one is mature yet and if there are problems as there very well
 might be, I'd have the additional issue of figuring out which one was the
 problem, and I'm personally not prepared to deal with that.

I mostly trust btrfs by now. Don't get me wrong: I still have my 
nightly backup job syncing the complete system to an external drive - 
nothing defeats a good backup. But btrfs has survived reliably multiple 
power-losses, kernel panics/freezes, unreliable USB connections, ... It 
looks very stable from that view. Yes, it may have bugs that may introduce 
errors fatal to the filesystem structure. But generally, under usual 
workloads it has proven stable for me. At least for desktop workloads.
 
 Instead, at this point I'd recommend choosing /either/ bcache /or/ btrfs,
 and using bcache with a more mature filesystem like ext4 or (what I used
 for years previous and still use for spinning rust) reiserfs.

I've used reiserfs for several years a long time ago. But it absolutely does 
not scale well for parallel/threaded workloads which is a show stopper for 
server workloads. But it always survived even the worst failure scenarios 
(like SCSI bus going offline for some RAID members) and the tools 
distributed with it were able to recover all data even if the FS was damaged 
beyond any usual things you would normally try when it does no longer mount. 
I've been with Ext3 before, and it was not only one time that a simple 
power-loss during high server-workload destroyed the filesystem beyond 
repair with fsck only making it worse.

Since reiserfs did not scale well and ext* FS has annoyed me more than once, 
we've decided to go with XFS. While it tends to wipe some data after power-
loss and leaves you with zero-filled files, it has proven extremely reliable 
even under those situations mentioned above like dying SCSI bus. Not to the 
extent reiserfs did but still very satisfying. The big plus: it scales 
extremely well with parallel workloads and can be optimized for the stripe 
configuration of the underlying RAID layer. So I made it my default 
filesystem for desktop, too. With the above mentioned annoying feature of 
zero'ing out recently touched files when the system crashed. But well, we 
all got proven backups, right? Yep, I also learned that lesson... *sigh*

But btrfs, when first announced and while I already was jealously looking at 
ZFS, seemed to be the FS of my choice giving me flexible RAID setups, 
snapshots... I'm quite happy with it although it feels slow sometimes. I 
simply threw more RAM at it - now it is okay.


 And as I said, keep your backups as current as you're willing to deal
 with losing what's not backed up, and tested usable and (for root) either
 bootable or restorable from alternate boot, because while at least btrfs
 is /reasonably/ stable for /ordinary/ daily use, there remain corner-
 cases and you never know when your case is going to BE a corner-case!

I've got a small rescue system I can boot which has btrfs-tools and a recent 
kernel to flexibly repair, restore, or whatever I want to do with my backup. 
My backup itself is not bootable (although it probably could, if I change 
some configurations files).

 * I want to migrate my current storage to bcache without replaying a
 backup.  Is it possible?
 
 Since I've not actually used bcache, I won't try to answer some of these,
 but will answer based on what I've seen on the list where I can...  I
 don't know on this one.

I remember someone created some Python scripts to make it possible - wrt 
btrfs especially. Can't remember the link. Maybe I'm able to dig it up. But 
at least I read it as: There's no improvement on that migration path 
directly from bcache. I hoped otherwise...

 * Did others already use it? What is the perceived performance for
 desktop workloads in comparison to not using bcache?
 
 Others are indeed already using it.  I've seen some btrfs/bcache problems
 reported on this list, but as mentioned above, when both are in use that
 means figuring out which is the problem, and at least from the btrfs side
 I've not seen a lot of resolution in that regard.  From here it /looks/
 like that's simply being punted at this time, as there's still more
 easily traceable problems without the additional bcache variable to work
 on first.  But it's quite possible 

Re: question regarding caching

2013-12-30 Thread Kai Krakow
Aastha Mehta aasth...@gmail.com schrieb:

 Rather than a local disk, I have a remote device to which my IO
 requests are sent and from which the data is fetched. I need certain
 data to be fetched from the remote device after a remount. But somehow
 I do not see any request appearing at the device. I even tried to do
 drop_caches after remounting the file system, but that does not seem
 to help.

Maybe you or your distribution deployed cachefilesd and uses it for the 
remote fs?

HTH
Kai

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs-progs: add dedup subcommand

2013-12-30 Thread Kai Krakow
Martin Steigerwald mar...@lichtvoll.de schrieb:

 - btrfs dedup disable
 Delete the dedup tree, after this we're not able to use dedup any more
 unless you enable it again.
 
 So if deduplication has been switched on for a while, btrfs dedup disable
 will cause BTRFS to undo the deduplication (and thus require more space
 for the same amount of data)?

From my intuition I would guess it just loses track of what the content is 
in content-based storage - so when re-enabling it will have to learn 
from the beginning. It should not unshare data, as sharing extents is a feature 
of btrfs distinct from the function of online dedup itself.

At least that would sound reasonable to me.

Regards,
Kai

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html