Re: systemd-journal, nodatacow, was: Is anyone using btrfs send/receive for backups instead of rsync?
Chris Murphy posted on Sun, 29 Dec 2013 17:38:23 -0700 as excerpted: And I'm predicting that since btrfs is the assumed successor to the ext* series as the Linux default filesystem, and systemd is targeting Linux default initsystem status as well, it's only logical that at some point systemd will detect what filesystem it's logging to, and will automatically set NOCOW on the journal file when that filesystem is btrfs. Is this something that should be brought up on the systemd-devel@ list? Or maybe file it as an RFE against systemd at freedesktop.org? I don't know. While I don't (yet?) run systemd personally, I'd have almost thought it'd be done by now (tho obviously it's not, at least in distro-current versions), but perhaps they've been waiting on word that btrfs or some API they plan to use for it is stabilizing before doing it. -- Duncan - List replies preferred. No HTML msgs. Every nonfree program has a lord, a master -- and if you use the program, he is your master. Richard Stallman -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v8 05/14] Btrfs: introduce dedup tree and relatives
This is a preparation step for online/inband dedup tree. It introduces dedup tree and its relatives, including hash driver and some structures. Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/ctree.h | 73 fs/btrfs/disk-io.c | 36 ++ fs/btrfs/extent-tree.c | 2 ++ include/trace/events/btrfs.h | 3 +- 4 files changed, 113 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 54ab861..0e5718a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -33,6 +33,7 @@ #include asm/kmap_types.h #include linux/pagemap.h #include linux/btrfs.h +#include crypto/hash.h #include extent_io.h #include extent_map.h #include async-thread.h @@ -101,6 +102,9 @@ struct btrfs_ordered_sum; /* for storing items that use the BTRFS_UUID_KEY* types */ #define BTRFS_UUID_TREE_OBJECTID 9ULL +/* dedup tree(experimental) */ +#define BTRFS_DEDUP_TREE_OBJECTID 10ULL + /* for storing balance parameters in the root tree */ #define BTRFS_BALANCE_OBJECTID -4ULL @@ -521,6 +525,7 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL 6) #define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL 7) #define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL 8) +#define BTRFS_FEATURE_INCOMPAT_DEDUP (1ULL 9) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL @@ -532,6 +537,7 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ BTRFS_FEATURE_INCOMPAT_RAID56 |\ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ +BTRFS_FEATURE_INCOMPAT_DEDUP | \ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) /* @@ -903,6 +909,51 @@ struct btrfs_csum_item { u8 csum; } __attribute__ ((__packed__)); +/* dedup */ +enum btrfs_dedup_type { + BTRFS_DEDUP_SHA256 = 0, + BTRFS_DEDUP_LAST = 1, +}; + +static int btrfs_dedup_lens[] = { 4, 0 }; +static int btrfs_dedup_sizes[] = { 32, 0 };/* 256bit, 32bytes */ + +struct btrfs_dedup_item { + /* disk length of dedup range */ + __le64 len; + + u8 type; + u8 compression; + u8 encryption; + + /* spare for later use */ + __le16 
other_encoding; + + /* hash/fingerprints go here */ +} __attribute__ ((__packed__)); + +struct btrfs_dedup_hash { + u64 bytenr; + u64 num_bytes; + + /* hash algorithm */ + int type; + + int compression; + + /* last field is a variable length array of dedup hash */ + u64 hash[]; +}; + +static inline int btrfs_dedup_hash_size(int type) +{ + WARN_ON((btrfs_dedup_lens[type] * sizeof(u64)) != +btrfs_dedup_sizes[type]); + + return sizeof(struct btrfs_dedup_hash) + btrfs_dedup_sizes[type]; +} + + struct btrfs_dev_stats_item { /* * grow this item struct at the end for future enhancements and keep @@ -1304,6 +1355,7 @@ struct btrfs_fs_info { struct btrfs_root *dev_root; struct btrfs_root *fs_root; struct btrfs_root *csum_root; + struct btrfs_root *dedup_root; struct btrfs_root *quota_root; struct btrfs_root *uuid_root; @@ -1655,6 +1707,14 @@ struct btrfs_fs_info { struct semaphore uuid_tree_rescan_sem; unsigned int update_uuid_tree_gen:1; + + /* reference to deduplication algorithm driver via cryptoapi */ + struct crypto_shash *dedup_driver; + + /* dedup blocksize */ + u64 dedup_bs; + + int dedup_type; }; /* @@ -1968,6 +2028,8 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_STRING_ITEM_KEY 253 +#define BTRFS_DEDUP_ITEM_KEY 254 + /* * Flags for mount options. 
* @@ -2980,6 +3042,14 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } +/* btrfs_dedup_item */ +BTRFS_SETGET_FUNCS(dedup_len, struct btrfs_dedup_item, len, 64); +BTRFS_SETGET_FUNCS(dedup_compression, struct btrfs_dedup_item, compression, 8); +BTRFS_SETGET_FUNCS(dedup_encryption, struct btrfs_dedup_item, encryption, 8); +BTRFS_SETGET_FUNCS(dedup_other_encoding, struct btrfs_dedup_item, + other_encoding, 16); +BTRFS_SETGET_FUNCS(dedup_type, struct btrfs_dedup_item, type, 8); + /* btrfs_dev_stats_item */ static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, struct btrfs_dev_stats_item *ptr, @@ -3443,6 +3513,8 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root) static inline void free_fs_info(struct btrfs_fs_info *fs_info) { + if (fs_info-dedup_driver) + crypto_free_shash(fs_info-dedup_driver); kfree(fs_info-balance_ctl); kfree(fs_info-delayed_root); kfree(fs_info-extent_root); @@ -3615,6 +3687,7 @@ int btrfs_csum_one_bio(struct btrfs_root
[RFC PATCH v8 00/14] Online(inband) data deduplication
Hello, Here is the New Year patch bomb :-) Data deduplication is a specialized data compression technique for eliminating duplicate copies of repeating data.[1] This patch set is also related to Content based storage in project ideas[2], it introduces inband data deduplication for btrfs — dedup/dedupe for short. PATCH 1 is a hang fix with deduplication on, but it's also useful without dedup in practical use. PATCH 2 and 3 are targeting delayed refs' scalability problems, which are uncovered by the dedup feature. PATCH 4 is a speed-up improvement, which is about dedup and quota. PATCH 5-8 are the preparation work for dedup implementation. PATCH 9 shows how we implement dedup feature. PATCH 10 fixes a backref walking bug with dedup. PATCH 11 fixes a free space bug of dedup extents on error handling. PATCH 12 adds the ioctl to control dedup feature. PATCH 13 fixes the metadata ENOSPC problem with dedup which has been there WAY TOO LONG. PATCH 14 fixes a race bug on dedup writes. And there is also a btrfs-progs patch(PATCH 15) which offers all details about how to control the dedup feature. I've tested this with xfstests by adding an inline dedup 'enable on' in xfstests' mount and scratch_mount. TODO: * a bit-to-bit comparison callback. All comments are welcome! [1]: http://en.wikipedia.org/wiki/Data_deduplication [2]: https://btrfs.wiki.kernel.org/index.php/Project_ideas#Content_based_storage v8: - fix the race crash of dedup ref again. - fix the metadata ENOSPC problem with dedup. v7: - rebase onto the latest btrfs - break a big patch into smaller ones to make reviewers happy. - kill mount options of dedup and use ioctl method instead. 
- fix two crash due to the special dedup ref For former patch sets: v6: http://thread.gmane.org/gmane.comp.file-systems.btrfs/27512 v5: http://thread.gmane.org/gmane.comp.file-systems.btrfs/27257 v4: http://thread.gmane.org/gmane.comp.file-systems.btrfs/25751 v3: http://comments.gmane.org/gmane.comp.file-systems.btrfs/25433 v2: http://comments.gmane.org/gmane.comp.file-systems.btrfs/24959 Liu Bo (14): Btrfs: skip merge part for delayed data refs Btrfs: improve the delayed refs process in rm case Btrfs: introduce a head ref rbtree Btrfs: disable qgroups accounting when quata_enable is 0 Btrfs: introduce dedup tree and relatives Btrfs: introduce dedup tree operations Btrfs: introduce dedup state Btrfs: make ordered extent aware of dedup Btrfs: online(inband) data dedup Btrfs: skip dedup reference during backref walking Btrfs: don't return space for dedup extent Btrfs: add ioctl of dedup control Btrfs: fix dedupe 'ENOSPC' problem Btrfs: fix a crash of dedup ref fs/btrfs/backref.c | 9 + fs/btrfs/ctree.c | 2 +- fs/btrfs/ctree.h | 86 ++ fs/btrfs/delayed-ref.c | 161 +++ fs/btrfs/delayed-ref.h | 8 + fs/btrfs/disk-io.c | 40 +++ fs/btrfs/extent-tree.c | 208 -- fs/btrfs/extent_io.c | 22 +- fs/btrfs/extent_io.h | 16 ++ fs/btrfs/file-item.c | 244 + fs/btrfs/inode.c | 635 ++- fs/btrfs/ioctl.c | 167 fs/btrfs/ordered-data.c | 38 ++- fs/btrfs/ordered-data.h | 13 +- fs/btrfs/qgroup.c| 3 + fs/btrfs/relocation.c| 3 + fs/btrfs/transaction.c | 4 +- include/trace/events/btrfs.h | 3 +- include/uapi/linux/btrfs.h | 11 + 19 files changed, 1501 insertions(+), 172 deletions(-) -- 1.8.2.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v8 02/14] Btrfs: improve the delayed refs process in rm case
While removing a file with dedup extents, we could have a great number of delayed refs pending to process, and these refs refer to droping a ref of the extent, which is of BTRFS_DROP_DELAYED_REF type. But in order to prevent an extent's ref count from going down to zero when there still are pending delayed refs, we first select those adding a ref ones, which is of BTRFS_ADD_DELAYED_REF type. So in removing case, all of our delayed refs are of BTRFS_DROP_DELAYED_REF type, but we have to walk all the refs issued to the extent to find any BTRFS_ADD_DELAYED_REF types and end up there is no such thing, and then start over again to find BTRFS_DROP_DELAYED_REF. This is really unnecessary, we can improve this by tracking how many BTRFS_ADD_DELAYED_REF refs we have and search by the right type. Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/delayed-ref.c | 10 ++ fs/btrfs/delayed-ref.h | 3 +++ fs/btrfs/extent-tree.c | 17 - 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index b0d5d79..9596649 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -543,6 +543,10 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * update the reference mod on the head to reflect this new operation */ existing-ref_mod += update-ref_mod; + + WARN_ON_ONCE(update-ref_mod 1); + if (update-ref_mod == 1) + existing_ref-add_cnt++; } /* @@ -604,6 +608,12 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref-must_insert_reserved = must_insert_reserved; head_ref-is_data = is_data; + /* track added ref, more comments in select_delayed_ref() */ + if (count_mod == 1) + head_ref-add_cnt = 1; + else + head_ref-add_cnt = 0; + INIT_LIST_HEAD(head_ref-cluster); mutex_init(head_ref-mutex); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 70b962c..9377b27 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -84,6 +84,9 @@ struct btrfs_delayed_ref_head { 
struct list_head cluster; struct btrfs_delayed_extent_op *extent_op; + + int add_cnt; + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 009980c..a6fb5fa 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2287,6 +2287,16 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) struct rb_node *node; struct btrfs_delayed_ref_node *ref; int action = BTRFS_ADD_DELAYED_REF; + + /* +* track the count of BTRFS_ADD_DELAYED_REF, +* in the case that there's no BTRFS_ADD_DELAYED_REF while there're a +* a great number of BTRFS_DROP_DELAYED_REF, +* it'll waste time on searching BTRFS_ADD_DELAYED_REF, usually this +* happens with dedup enabled. +*/ + if (head-add_cnt == 0) + action = BTRFS_DROP_DELAYED_REF; again: /* * select delayed ref of type BTRFS_ADD_DELAYED_REF first. @@ -2301,8 +2311,11 @@ again: rb_node); if (ref-bytenr != head-node.bytenr) break; - if (ref-action == action) + if (ref-action == action) { + if (action == BTRFS_ADD_DELAYED_REF) + head-add_cnt--; return ref; + } node = rb_prev(node); } if (action == BTRFS_ADD_DELAYED_REF) { @@ -2378,6 +2391,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * there are still refs with lower seq numbers in the * process of being added. Don't run this ref yet. */ + if (ref-action == BTRFS_ADD_DELAYED_REF) + locked_ref-add_cnt++; list_del_init(locked_ref-cluster); btrfs_delayed_ref_unlock(locked_ref); locked_ref = NULL; -- 1.8.2.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v8 04/14] Btrfs: disable qgroups accounting when quota_enabled is 0
It's unnecessary to do qgroups accounting without enabling quota. Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/ctree.c | 2 +- fs/btrfs/delayed-ref.c | 18 ++ fs/btrfs/qgroup.c | 3 +++ 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 316136b..160fa3e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -407,7 +407,7 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, tree_mod_log_write_lock(fs_info); spin_lock(fs_info-tree_mod_seq_lock); - if (!elem-seq) { + if (elem !elem-seq) { elem-seq = btrfs_inc_tree_mod_seq_major(fs_info); list_add_tail(elem-list, fs_info-tree_mod_seq_list); } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9e1a1c9..3ec3d08 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -691,8 +691,13 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref-is_head = 0; ref-in_tree = 1; - if (need_ref_seq(for_cow, ref_root)) - seq = btrfs_get_tree_mod_seq(fs_info, trans-delayed_ref_elem); + if (need_ref_seq(for_cow, ref_root)) { + struct seq_list *elem = NULL; + + if (fs_info-quota_enabled) + elem = trans-delayed_ref_elem; + seq = btrfs_get_tree_mod_seq(fs_info, elem); + } ref-seq = seq; full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -750,8 +755,13 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref-is_head = 0; ref-in_tree = 1; - if (need_ref_seq(for_cow, ref_root)) - seq = btrfs_get_tree_mod_seq(fs_info, trans-delayed_ref_elem); + if (need_ref_seq(for_cow, ref_root)) { + struct seq_list *elem = NULL; + + if (fs_info-quota_enabled) + elem = trans-delayed_ref_elem; + seq = btrfs_get_tree_mod_seq(fs_info, elem); + } ref-seq = seq; full_ref = btrfs_delayed_node_to_data_ref(ref); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4e6ef49..1cb58f9 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1188,6 +1188,9 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, { struct 
qgroup_update *u; + if (!trans-root-fs_info-quota_enabled) + return 0; + BUG_ON(!trans-delayed_ref_elem.seq); u = kmalloc(sizeof(*u), GFP_NOFS); if (!u) -- 1.8.2.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v8 01/14] Btrfs: skip merge part for delayed data refs
When we have data deduplication on, we'll hang on the merge part because it needs to verify every queued delayed data ref related to this disk offset, but we may have millions of refs. And in the case of delayed data refs, we don't usually have too many data refs to merge. So it's safe to shut it down for data refs. Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/delayed-ref.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e4d467b..b0d5d79 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -320,6 +320,13 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct rb_node *node; u64 seq = 0; + /* +* We don't have too much refs to merge in the case of delayed data +* refs. +*/ + if (head-is_data) + return; + spin_lock(fs_info-tree_mod_seq_lock); if (!list_empty(fs_info-tree_mod_seq_list)) { struct seq_list *elem; -- 1.8.2.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v8 12/14] Btrfs: add ioctl of dedup control
So far we have 4 commands to control dedup behaviour, - btrfs dedup enable Create the dedup tree, and it's the very first step when you're going to use the dedup feature. - btrfs dedup disable Delete the dedup tree, after this we're not able to use dedup any more unless you enable it again. - btrfs dedup on [-b] Switch on the dedup feature temporarily, and it's the second step of applying dedup with writes. Option '-b' is used to set dedup blocksize. The default blocksize is 8192(no special reason, you may argue), and the current limit is [4096, 128 * 1024], because 4K is the generic page size and 128K is the upper limit of btrfs's compression. - btrfs dedup off Switch off the dedup feature temporarily, but the dedup tree remains. Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/ctree.h | 3 + fs/btrfs/disk-io.c | 1 + fs/btrfs/ioctl.c | 167 + include/uapi/linux/btrfs.h | 11 +++ 4 files changed, 182 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 52b2843..1b89d6c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1715,6 +1715,9 @@ struct btrfs_fs_info { u64 dedup_bs; int dedup_type; + + /* protect user change for dedup operations */ + struct mutex dedup_ioctl_mutex; }; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2c83de7..9c9667d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2314,6 +2314,7 @@ int open_ctree(struct super_block *sb, mutex_init(fs_info-dev_replace.lock_finishing_cancel_unmount); mutex_init(fs_info-dev_replace.lock_management_lock); mutex_init(fs_info-dev_replace.lock); + mutex_init(fs_info-dedup_ioctl_mutex); spin_lock_init(fs_info-qgroup_lock); mutex_init(fs_info-qgroup_ioctl_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 347bf61..75fb3de 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4481,6 +4481,171 @@ out_unlock: return ret; } +static long btrfs_enable_dedup(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root-fs_info; + struct btrfs_trans_handle 
*trans = NULL; + struct btrfs_root *dedup_root; + int ret = 0; + + mutex_lock(fs_info-dedup_ioctl_mutex); + if (fs_info-dedup_root) { + pr_info(btrfs: dedup has already been enabled\n); + mutex_unlock(fs_info-dedup_ioctl_mutex); + return 0; + } + + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + mutex_unlock(fs_info-dedup_ioctl_mutex); + return ret; + } + + dedup_root = btrfs_create_tree(trans, fs_info, + BTRFS_DEDUP_TREE_OBJECTID); + if (IS_ERR(dedup_root)) + ret = PTR_ERR(dedup_root); + + if (ret) + btrfs_end_transaction(trans, root); + else + ret = btrfs_commit_transaction(trans, root); + + if (!ret) { + pr_info(btrfs: dedup enabled\n); + fs_info-dedup_root = dedup_root; + fs_info-dedup_root-block_rsv = fs_info-global_block_rsv; + btrfs_set_fs_incompat(fs_info, DEDUP); + } + + mutex_unlock(fs_info-dedup_ioctl_mutex); + return ret; +} + +static long btrfs_disable_dedup(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root-fs_info; + struct btrfs_root *dedup_root; + int ret; + + mutex_lock(fs_info-dedup_ioctl_mutex); + if (!fs_info-dedup_root) { + pr_info(btrfs: dedup has been disabled\n); + mutex_unlock(fs_info-dedup_ioctl_mutex); + return 0; + } + + if (fs_info-dedup_bs != 0) { + pr_info(btrfs: cannot disable dedup until switching off dedup!\n); + mutex_unlock(fs_info-dedup_ioctl_mutex); + return -EBUSY; + } + + dedup_root = fs_info-dedup_root; + + ret = btrfs_drop_snapshot(dedup_root, NULL, 1, 0); + + if (!ret) { + fs_info-dedup_root = NULL; + pr_info(btrfs: dedup disabled\n); + } + + mutex_unlock(fs_info-dedup_ioctl_mutex); + WARN_ON(ret 0 ret != -EAGAIN ret != -EROFS); + return ret; +} + +static long btrfs_set_dedup_bs(struct btrfs_root *root, u64 bs) +{ + struct btrfs_fs_info *info = root-fs_info; + int ret = 0; + + mutex_lock(info-dedup_ioctl_mutex); + if (!info-dedup_root) { + pr_info(btrfs: dedup is disabled, we cannot switch on/off dedup\n); + ret = -EINVAL; + goto out; + } + + bs = ALIGN(bs, 
root-sectorsize); + bs = min_t(u64, bs, (128 * 1024ULL)); + + if (bs == info-dedup_bs) { + if (info-dedup_bs == 0) + pr_info(btrfs: switch OFF dedup(it's already
[PATCH v8 06/14] Btrfs: introduce dedup tree operations
The operations consist of finding matched items, adding new items and removing items. Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/ctree.h | 9 +++ fs/btrfs/file-item.c | 210 +++ 2 files changed, 219 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0e5718a..52b2843 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3688,6 +3688,15 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); +int noinline_for_stack +btrfs_find_dedup_extent(struct btrfs_root *root, struct btrfs_dedup_hash *hash); +int noinline_for_stack +btrfs_insert_dedup_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_dedup_hash *hash); +int noinline_for_stack +btrfs_free_dedup_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 hash, u64 bytenr); /* inode.c */ struct btrfs_delalloc_work { struct inode *inode; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6f38488..fd95692 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -884,3 +884,213 @@ out: fail_unlock: goto out; } + +/* 1 means we find one, 0 means we dont. */ +int noinline_for_stack +btrfs_find_dedup_extent(struct btrfs_root *root, struct btrfs_dedup_hash *hash) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_root *dedup_root; + struct btrfs_dedup_item *item; + u64 hash_value; + u64 length; + u64 dedup_size; + int compression; + int found = 0; + int index; + int ret; + + if (!hash) { + WARN_ON(1); + return 0; + } + if (!root-fs_info-dedup_root) { + WARN(1, KERN_INFO dedup not enabled\n); + return 0; + } + dedup_root = root-fs_info-dedup_root; + + path = btrfs_alloc_path(); + if (!path) + return 0; + + /* +* For SHA256 dedup algorithm, we store the last 64bit as the +* key.objectid, and the rest in the tree item. 
+*/ + index = btrfs_dedup_lens[hash-type] - 1; + dedup_size = btrfs_dedup_sizes[hash-type] - sizeof(u64); + + hash_value = hash-hash[index]; + + key.objectid = hash_value; + key.offset = (u64)-1; + btrfs_set_key_type(key, BTRFS_DEDUP_ITEM_KEY); + + ret = btrfs_search_slot(NULL, dedup_root, key, path, 0, 0); + if (ret 0) + goto out; + if (ret == 0) { + WARN_ON(1); + goto out; + } + +prev_slot: + /* this will do match checks. */ + ret = btrfs_previous_item(dedup_root, path, hash_value, + BTRFS_DEDUP_ITEM_KEY); + if (ret) + goto out; + + leaf = path-nodes[0]; + btrfs_item_key_to_cpu(leaf, key, path-slots[0]); + if (key.objectid != hash_value) + goto out; + + item = btrfs_item_ptr(leaf, path-slots[0], struct btrfs_dedup_item); + /* disk length of dedup range */ + length = btrfs_dedup_len(leaf, item); + + compression = btrfs_dedup_compression(leaf, item); + if (compression BTRFS_COMPRESS_TYPES) { + WARN_ON(1); + goto out; + } + + if (btrfs_dedup_type(leaf, item) != hash-type) + goto prev_slot; + + if (memcmp_extent_buffer(leaf, hash-hash, (unsigned long)(item + 1), +dedup_size)) { + pr_info(goto prev\n); + goto prev_slot; + } + + hash-bytenr = key.offset; + hash-num_bytes = length; + hash-compression = compression; + found = 1; +out: + btrfs_free_path(path); + return found; +} + +int noinline_for_stack +btrfs_insert_dedup_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_dedup_hash *hash) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_root *dedup_root; + struct btrfs_dedup_item *dedup_item; + u64 ins_size; + u64 dedup_size; + int index; + int ret; + + if (!hash) { + WARN_ON(1); + return 0; + } + + WARN_ON(hash-num_bytes root-fs_info-dedup_bs); + + if (!root-fs_info-dedup_root) { + WARN(1, KERN_INFO dedup not enabled\n); + return 0; + } + dedup_root = root-fs_info-dedup_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* +* For SHA256 dedup algorithm, we store the 
last 64bit
[PATCH v8 03/14] Btrfs: introduce a head ref rbtree
The way how we process delayed refs is 1) get a bunch of head refs, 2) pick up one head ref, 3) go one node back for any delayed ref updates. The head ref is also linked in the same rbtree as the delayed ref is, so in 1) stage, we have to walk one by one including not only head refs, but delayed refs. When we have a great number of delayed refs pending to process, this'll cost time a lot. Here we introduce a head ref specific rbtree, it only has head refs, so troubles go away. Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/delayed-ref.c | 126 + fs/btrfs/delayed-ref.h | 5 ++ fs/btrfs/disk-io.c | 3 ++ fs/btrfs/extent-tree.c | 21 ++--- fs/btrfs/transaction.c | 4 +- 5 files changed, 99 insertions(+), 60 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9596649..9e1a1c9 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -161,35 +161,61 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, return NULL; } +/* insert a new ref to head ref rbtree */ +static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, + struct rb_node *node) +{ + struct rb_node **p = root-rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_head *entry; + struct btrfs_delayed_ref_head *ins; + u64 bytenr; + + ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); + bytenr = ins-node.bytenr; + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, +href_node); + + if (bytenr entry-node.bytenr) + p = (*p)-rb_left; + else if (bytenr entry-node.bytenr) + p = (*p)-rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + /* * find an head entry based on bytenr. This returns the delayed ref * head if it was able to find one, or NULL if nothing was in that spot. * If return_bigger is given, the next bigger entry is returned if no exact * match is found. 
*/ -static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, - u64 bytenr, - struct btrfs_delayed_ref_node **last, - int return_bigger) +static struct btrfs_delayed_ref_head * +find_ref_head(struct rb_root *root, u64 bytenr, + struct btrfs_delayed_ref_head **last, int return_bigger) { struct rb_node *n; - struct btrfs_delayed_ref_node *entry; + struct btrfs_delayed_ref_head *entry; int cmp = 0; again: n = root-rb_node; entry = NULL; while (n) { - entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); - WARN_ON(!entry-in_tree); + entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); if (last) *last = entry; - if (bytenr entry-bytenr) + if (bytenr entry-node.bytenr) cmp = -1; - else if (bytenr entry-bytenr) - cmp = 1; - else if (!btrfs_delayed_ref_is_head(entry)) + else if (bytenr entry-node.bytenr) cmp = 1; else cmp = 0; @@ -203,12 +229,12 @@ again: } if (entry return_bigger) { if (cmp 0) { - n = rb_next(entry-rb_node); + n = rb_next(entry-href_node); if (!n) n = rb_first(root); - entry = rb_entry(n, struct btrfs_delayed_ref_node, -rb_node); - bytenr = entry-bytenr; + entry = rb_entry(n, struct btrfs_delayed_ref_head, +href_node); + bytenr = entry-node.bytenr; return_bigger = 0; goto again; } @@ -246,6 +272,12 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref) { rb_erase(ref-rb_node, delayed_refs-root); + if (btrfs_delayed_ref_is_head(ref)) { + struct btrfs_delayed_ref_head *head; + + head = btrfs_delayed_node_to_head(ref); + rb_erase(head-href_node, delayed_refs-href_root); + } ref-in_tree = 0; btrfs_put_delayed_ref(ref); delayed_refs-num_entries--; @@ -386,42 +418,35 @@ int
[PATCH v8 11/14] Btrfs: don't return space for dedup extent
If the ordered extent had an IOERR or something else went wrong we need to return the space for this ordered extent back to the allocator, but if the extent is marked as a dedup one, we don't free the space because we just use the existing space instead of allocating new space. Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3a221bb..4363e1e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3190,6 +3190,7 @@ out: * truncated case if we didn't write out the extent at all. */ if ((ret || !logical_len) + !ordered_extent-dedup !test_bit(BTRFS_ORDERED_NOCOW, ordered_extent-flags) !test_bit(BTRFS_ORDERED_PREALLOC, ordered_extent-flags)) btrfs_free_reserved_extent(root, ordered_extent-start, -- 1.8.2.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs-progs: add dedup subcommand
This adds deduplication subcommands, 'btrfs dedup command path', including enable/disable/on/off. - btrfs dedup enable Create the dedup tree, and it's the very first step when you're going to use the dedup feature. - btrfs dedup disable Delete the dedup tree, after this we're not able to use dedup any more unless you enable it again. - btrfs dedup on [-b] Switch on the dedup feature temporarily, and it's the second step of applying dedup with writes. Option '-b' is used to set dedup blocksize. The default blocksize is 8192(no special reason, you may argue), and the current limit is [4096, 128 * 1024], because 4K is the generic page size and 128K is the upper limit of btrfs's compression. - btrfs dedup off Switch off the dedup feature temporarily, but the dedup tree remains. - Usage: Step 1: btrfs dedup enable /btrfs Step 2: btrfs dedup on /btrfs or btrfs dedup on -b 4K /btrfs Step 3: now we have dedup, run your test. Step 4: btrfs dedup off /btrfs Step 5: btrfs dedup disable /btrfs - v3: add commands 'btrfs dedup on/off' v2: add manpage Signed-off-by: Liu Bo bo.li@oracle.com --- Makefile | 3 +- btrfs.c| 1 + cmds-dedup.c | 178 + commands.h | 2 + ctree.h| 2 + ioctl.h| 12 man/btrfs.8.in | 31 +- 7 files changed, 225 insertions(+), 4 deletions(-) create mode 100644 cmds-dedup.c diff --git a/Makefile b/Makefile index 0874a41..092f2db 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,8 @@ objects = ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \ cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \ cmds-inspect.o cmds-balance.o cmds-send.o cmds-receive.o \ cmds-quota.o cmds-qgroup.o cmds-replace.o cmds-check.o \ - cmds-restore.o cmds-rescue.o chunk-recover.o super-recover.o + cmds-restore.o cmds-rescue.o chunk-recover.o super-recover.o \ + cmds-dedup.o libbtrfs_objects = send-stream.o send-utils.o rbtree.o btrfs-list.o crc32c.o \ uuid-tree.o libbtrfs_headers = send-stream.h send-utils.h send.h rbtree.h btrfs-list.h \ diff --git 
a/btrfs.c b/btrfs.c index d5fc738..dfae35f 100644 --- a/btrfs.c +++ b/btrfs.c @@ -255,6 +255,7 @@ static const struct cmd_group btrfs_cmd_group = { { quota, cmd_quota, NULL, quota_cmd_group, 0 }, { qgroup, cmd_qgroup, NULL, qgroup_cmd_group, 0 }, { replace, cmd_replace, NULL, replace_cmd_group, 0 }, + { dedup, cmd_dedup, NULL, dedup_cmd_group, 0 }, { help, cmd_help, cmd_help_usage, NULL, 0 }, { version, cmd_version, cmd_version_usage, NULL, 0 }, NULL_CMD_STRUCT diff --git a/cmds-dedup.c b/cmds-dedup.c new file mode 100644 index 000..b959349 --- /dev/null +++ b/cmds-dedup.c @@ -0,0 +1,178 @@ +/* + * Copyright (C) 2013 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include sys/ioctl.h +#include unistd.h +#include getopt.h + +#include ctree.h +#include ioctl.h + +#include commands.h +#include utils.h + +static const char * const dedup_cmd_group_usage[] = { + btrfs dedup command [options] path, + NULL +}; + +int dedup_ctl(char *path, struct btrfs_ioctl_dedup_args *args) +{ + int ret = 0; + int fd; + int e; + DIR *dirstream = NULL; + + fd = open_file_or_dir(path, dirstream); + if (fd 0) { + fprintf(stderr, ERROR: can't access '%s'\n, path); + return -EACCES; + } + + ret = ioctl(fd, BTRFS_IOC_DEDUP_CTL, args); + e = errno; + close_file_or_dir(fd, dirstream); + if (ret 0) { + fprintf(stderr, ERROR: dedup command failed: %s\n, + strerror(e)); + if (args-cmd == BTRFS_DEDUP_CTL_DISABLE || + args-cmd == BTRFS_DEDUP_CTL_SET_BS) + fprintf(stderr, please refer to 'dmesg | tail' for more info\n); + return -EINVAL; + } + return 0; +} + +static const char * const cmd_dedup_enable_usage[] = { + btrfs dedup enable path, + Enable data deduplication
[PATCH v8 07/14] Btrfs: introduce dedup state
This introduces dedup state and relative operations to mark and unmark the dedup data range, it'll be used in later patches. Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/extent_io.c | 14 ++ fs/btrfs/extent_io.h | 5 + 2 files changed, 19 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8e457fc..54cef32 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1251,6 +1251,20 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, cached_state, mask); } +int set_extent_dedup(struct extent_io_tree *tree, u64 start, u64 end, +struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DEDUP, 0, + cached_state, mask); +} + +int clear_extent_dedup(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_DEDUP, 0, 0, + cached_state, mask); +} + /* * either insert or lock state struct between start and end use mask to tell * us if waiting is desired. 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 19620c5..5c6a78d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -20,6 +20,7 @@ #define EXTENT_NEED_WAIT (1 13) #define EXTENT_DAMAGED (1 14) #define EXTENT_NORESERVE (1 15) +#define EXTENT_DEDUP (1 16) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) @@ -227,6 +228,10 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); +int set_extent_dedup(struct extent_io_tree *tree, u64 start, u64 end, +struct extent_state **cached_state, gfp_t mask); +int clear_extent_dedup(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, -- 1.8.2.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v8 08/14] Btrfs: make ordered extent aware of dedup
This adds a dedup flag and dedup hash into ordered extent so that we can insert dedup extents to dedup tree at endio time. The benefit is simplicity, we don't need to fall back to cleanup dedup structures if the write is cancelled for some reasons. Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/ordered-data.c | 38 -- fs/btrfs/ordered-data.h | 13 - 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 69582d5..a61c327 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -183,7 +183,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, */ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, - int type, int dio, int compress_type) + int type, int dio, int compress_type, + int dedup, struct btrfs_dedup_hash *hash) { struct btrfs_root *root = BTRFS_I(inode)-root; struct btrfs_ordered_inode_tree *tree; @@ -199,10 +200,23 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry-start = start; entry-len = len; if (!(BTRFS_I(inode)-flags BTRFS_INODE_NODATASUM) - !(type == BTRFS_ORDERED_NOCOW)) + !(type == BTRFS_ORDERED_NOCOW) !dedup) entry-csum_bytes_left = disk_len; entry-disk_len = disk_len; entry-bytes_left = len; + entry-dedup = dedup; + entry-hash = NULL; + + if (!dedup hash) { + entry-hash = kzalloc(btrfs_dedup_hash_size(hash-type), + GFP_NOFS); + if (!entry-hash) { + kmem_cache_free(btrfs_ordered_extent_cache, entry); + return -ENOMEM; + } + memcpy(entry-hash, hash, btrfs_dedup_hash_size(hash-type)); + } + entry-inode = igrab(inode); entry-compress_type = compress_type; entry-truncated_len = (u64)-1; @@ -251,7 +265,17 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, { return __btrfs_add_ordered_extent(inode, file_offset, start, len, disk_len, type, 0, - BTRFS_COMPRESS_NONE); + BTRFS_COMPRESS_NONE, 0, NULL); +} + +int 
btrfs_add_ordered_extent_dedup(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type, + int dedup, struct btrfs_dedup_hash *hash, + int compress_type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + compress_type, dedup, hash); } int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, @@ -259,16 +283,17 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, { return __btrfs_add_ordered_extent(inode, file_offset, start, len, disk_len, type, 1, - BTRFS_COMPRESS_NONE); + BTRFS_COMPRESS_NONE, 0, NULL); } int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, - int type, int compress_type) + int type, int compress_type, + struct btrfs_dedup_hash *hash) { return __btrfs_add_ordered_extent(inode, file_offset, start, len, disk_len, type, 0, - compress_type); + compress_type, 0, hash); } /* @@ -501,6 +526,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) list_del(sum-list); kfree(sum); } + kfree(entry-hash); kmem_cache_free(btrfs_ordered_extent_cache, entry); } } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 9b0450f..75f3ec2 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -109,6 +109,9 @@ struct btrfs_ordered_extent { /* compression algorithm */ int compress_type; + /* whether this ordered extent is marked for dedup or not */ + int dedup; + /* reference count */ atomic_t refs; @@ -135,6 +138,9 @@ struct btrfs_ordered_extent { struct completion completion; struct btrfs_work flush_work;
[PATCH v8 10/14] Btrfs: skip dedup reference during backref walking
The dedup ref is quite a special one, it is just used to store the hash value of the extent and cannot be used to find data, so we skip it during backref walking. Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/backref.c| 9 + fs/btrfs/relocation.c | 3 +++ 2 files changed, 12 insertions(+) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 3775947..1ec0046 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -590,6 +590,9 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, key.objectid = ref-objectid; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = ref-offset; + if (ref-root == BTRFS_DEDUP_TREE_OBJECTID) + break; + ret = __add_prelim_ref(prefs, ref-root, key, 0, 0, node-bytenr, node-ref_mod * sgn, GFP_ATOMIC); @@ -708,6 +711,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); root = btrfs_extent_data_ref_root(leaf, dref); + if (root == BTRFS_DEDUP_TREE_OBJECTID) + break; + ret = __add_prelim_ref(prefs, root, key, 0, 0, bytenr, count, GFP_NOFS); break; @@ -791,6 +797,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); root = btrfs_extent_data_ref_root(leaf, dref); + if (root == BTRFS_DEDUP_TREE_OBJECTID) + break; + ret = __add_prelim_ref(prefs, root, key, 0, 0, bytenr, count, GFP_NOFS); break; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 429c73c..a06e448 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3509,6 +3509,9 @@ static int find_data_references(struct reloc_control *rc, ref_offset = btrfs_extent_data_ref_offset(leaf, ref); ref_count = btrfs_extent_data_ref_count(leaf, ref); + if (ref_root == BTRFS_DEDUP_TREE_OBJECTID) + return 0; + /* * This is an extent belonging to the free space cache, lets just delete * it and redo the search. 
-- 1.8.2.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v8 09/14] Btrfs: online(inband) data dedup
The main part of data dedup. This introduces a FORMAT CHANGE. Btrfs provides online(inband/synchronous) and block-level dedup. It maps naturally to btrfs's block back-reference, which enables us to store multiple copies of data as single copy with references on that copy. The workflow is (1) write some data, (2) get the hash of these data based on btrfs's dedup blocksize. (3) find matched extents by hash and decide whether to mark it as a duplicate one or not. If no, write the data onto disk, otherwise, add a reference to the matched extent. Btrfs's built-in dedup supports normal writes and compressed writes. Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/extent-tree.c | 150 ++-- fs/btrfs/extent_io.c | 8 +- fs/btrfs/extent_io.h | 11 + fs/btrfs/inode.c | 640 +++-- 4 files changed, 712 insertions(+), 97 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index aa40a5e..f14db92 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1119,8 +1119,16 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, key.offset = parent; } else { key.type = BTRFS_EXTENT_DATA_REF_KEY; - key.offset = hash_extent_data_ref(root_objectid, - owner, offset); + + /* +* we've not got the right offset and owner, so search by -1 +* here. 
+*/ + if (root_objectid == BTRFS_DEDUP_TREE_OBJECTID) + key.offset = (u64)-1; + else + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); } again: recow = 0; @@ -1147,6 +1155,10 @@ again: goto fail; } + if (ret 0 root_objectid == BTRFS_DEDUP_TREE_OBJECTID + path-slots[0] 0) + path-slots[0]--; + leaf = path-nodes[0]; nritems = btrfs_header_nritems(leaf); while (1) { @@ -1170,14 +1182,22 @@ again: ref = btrfs_item_ptr(leaf, path-slots[0], struct btrfs_extent_data_ref); - if (match_extent_data_ref(leaf, ref, root_objectid, - owner, offset)) { - if (recow) { - btrfs_release_path(path); - goto again; + if (root_objectid == BTRFS_DEDUP_TREE_OBJECTID) { + if (btrfs_extent_data_ref_root(leaf, ref) == + root_objectid) { + err = 0; + break; + } + } else { + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) { + if (recow) { + btrfs_release_path(path); + goto again; + } + err = 0; + break; } - err = 0; - break; } path-slots[0]++; } @@ -1321,6 +1341,32 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, return ret; } +static noinline u64 extent_data_ref_offset(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref1; + u64 offset = 0; + + leaf = path-nodes[0]; + btrfs_item_key_to_cpu(leaf, key, path-slots[0]); + if (iref) { + WARN_ON(btrfs_extent_inline_ref_type(leaf, iref) != + BTRFS_EXTENT_DATA_REF_KEY); + ref1 = (struct btrfs_extent_data_ref *)(iref-offset); + offset = btrfs_extent_data_ref_offset(leaf, ref1); + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path-slots[0], + struct btrfs_extent_data_ref); + offset = btrfs_extent_data_ref_offset(leaf, ref1); + } else { + WARN_ON(1); + } + return offset; +} + static noinline u32 extent_data_ref_count(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref) @@ -1587,7 
+1633,8 @@ again: err = -ENOENT; while (1) { if (ptr = end) { -
[PATCH v8 14/14] Btrfs: fix a crash of dedup ref
The dedup reference is a special kind of delayed refs, and the delayed refs are batched to be processed later. If we find a matched dedup extent, then we queue an ADD delayed ref on it within endio work, but there is already a DROP delayed ref queued, t1 t2 t3 -writepage commit transaction -run_delalloc_dedup find_dedup -- process_delayed refs (it deletes the dedup extent) add ordered extent| submit pages | finish ordered io| insert file extents| queue delayed refs | queue dedup ref| process delayed refs continues (insert a ref on an extent deleted by the above) This scenario ends up with a crash because we're going to insert a ref on a deleted extent. To avoid the race, we need to check if there is an ADD delayed ref on deleting the extent and protect this job with a lock. Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/ctree.h | 3 ++- fs/btrfs/extent-tree.c | 35 +++ fs/btrfs/file-item.c | 36 +++- fs/btrfs/inode.c | 10 ++ 4 files changed, 58 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1b89d6c..8a35cdf 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3692,7 +3692,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); int noinline_for_stack -btrfs_find_dedup_extent(struct btrfs_root *root, struct btrfs_dedup_hash *hash); +btrfs_find_dedup_extent(struct btrfs_root *root, struct btrfs_dedup_hash *hash, + struct inode *inode, u64 file_pos); int noinline_for_stack btrfs_insert_dedup_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index df3a645..a140ea9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5996,9 +5996,23 @@ again: goto again; } } else { - if (!dedup_hash is_data - root_objectid == BTRFS_DEDUP_TREE_OBJECTID) - dedup_hash = extent_data_ref_offset(root, path, iref); + if (is_data root_objectid == BTRFS_DEDUP_TREE_OBJECTID) { + if (!dedup_hash) + 
dedup_hash = extent_data_ref_offset(root, + path, iref); + + ret = btrfs_free_dedup_extent(trans, root, + dedup_hash, bytenr); + if (ret) { + if (ret == -EAGAIN) + ret = 0; + else + btrfs_abort_transaction(trans, + extent_root, + ret); + goto out; + } + } if (found_extent) { BUG_ON(is_data refs_to_drop != @@ -6023,21 +6037,10 @@ again: if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, + extent_root, ret); goto out; } - - if (root_objectid == BTRFS_DEDUP_TREE_OBJECTID) { - ret = btrfs_free_dedup_extent(trans, root, - dedup_hash, - bytenr); - if (ret) { - btrfs_abort_transaction(trans, - extent_root, - ret); - goto out; -
[PATCH v8 13/14] Btrfs: fix dedupe 'ENOSPC' problem
With dedupe, writes are likely to produce metadata but no data, which means producing more delayed_refs, and this can ends up aborting a transaction because it needs to allocate enough free space from global_rsv to turn delayed_refs into real extent tree's nodes/leaves and global_rsv is used up. So we must take the worst case for throttling delayed refs in this dedup case, ie, allocating one leaf on each delayed ref head update. Signed-off-by: Liu Bo bo.li@oracle.com --- fs/btrfs/extent-tree.c | 13 + 1 file changed, 13 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f14db92..df3a645 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2661,6 +2661,19 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) { u64 num_bytes; + /* +* For deduplication it's a special case where we must be very careful +* on throttling the number of delayed_refs, just because with dedupe, +* we're likely to produce _only_ metadata but no data, which results in +* quite a lot of delayed_refs, and consequently global_rsv may be used +* up and we get a RO btrfs. +* +* So we have to take the worst case -- one leaf for updating a ref head +* and its related refs. +*/ + if (root-fs_info-dedup_bs != 0) + return heads; + num_bytes = heads * (sizeof(struct btrfs_extent_item) + sizeof(struct btrfs_extent_inline_ref)); if (!btrfs_fs_incompat(root-fs_info, SKINNY_METADATA)) -- 1.8.2.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Migrate to bcache: A few questions
On Mon, Dec 30, 2013 at 02:22:55AM +0100, Kai Krakow wrote: These thoughts are actually quite interesting. So you are saying that data may not be fully written to SSD although the kernel thinks so? This is That, and worse. Incidentally, I have just posted on my G+ about this: https://plus.google.com/106981743284611658289/posts/Us8yjK9SPs6 which is mostly links to http://lkcl.net/reports/ssd_analysis.html https://www.usenix.org/conference/fast13/understanding-robustness-ssds-under-power-fault After you read those, you'll never think twice about SSDs and data loss anymore :-/ (I kind of found that out myself over time too, but these have much more data than I got myself empirically on a couple of SSDs) Marc -- A mouse is a device used to point at the xterm you want to type in - A.S.R. Microsoft is to operating systems what McDonalds is to gourmet cooking Home page: http://marc.merlins.org/ | PGP 1024R/763BE901 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs-progs: add dedup subcommand
Am Montag, 30. Dezember 2013, 16:12:55 schrieben Sie: This adds deduplication subcommands, 'btrfs dedup command path', including enable/disable/on/off. Nice. Looking forward to test it. - btrfs dedup enable Create the dedup tree, and it's the very first step when you're going to use the dedup feature. - btrfs dedup disable Delete the dedup tree, after this we're not able to use dedup any more unless you enable it again. So if deduplication has been switched on for a while, btrfs dedup disable will cause BTRFS to undo the deduplication (and thus require more space for the same amount of data)? Thanks and happy new year, -- Martin 'Helios' Steigerwald - http://www.Lichtvoll.de GPG: 03B0 0D6C 0040 0710 4AFA B82F 991B EAAC A599 84C7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Migrate to bcache: A few questions
On 12/29/2013 04:11 PM, Kai Krakow wrote: Hello list! I'm planning to buy a small SSD (around 60GB) and use it for bcache in front of my 3x 1TB HDD btrfs setup (mraid1+draid0) using write-back caching. Btrfs is my root device, thus the system must be able to boot from bcache using init ramdisk. My /boot is a separate filesystem outside of btrfs and will be outside of bcache. I am using Gentoo as my system. I have a few questions: * How stable is it? I've read about some csum errors lately... * I want to migrate my current storage to bcache without replaying a backup. Is it possible? * Did others already use it? What is the perceived performance for desktop workloads in comparison to not using bcache? * How well does bcache handle power outages? Btrfs does handle them very well since many months. * How well does it play with dracut as initrd? Is it as simple as telling it the new device nodes or is there something complicated to configure? * How does bcache handle a failing SSD when it starts to wear out in a few years? * Is it worth waiting for hot-relocation support in btrfs to natively use a SSD as cache? * Would you recommend going with a bigger/smaller SSD? I'm planning to use only 75% of it for bcache so wear-leveling can work better, maybe use another part of it for hibernation (suspend to disk). I've actually tried a similar configuration myself a couple of times (also using Gentoo in fact), and I can tell you from experience that unless things have changed greatly since kernel 3.12.1, it really isn't worth the headaches. Setting it up on an already installed system is a serious pain because the backing device has to be reformatted with a bcache super-block. In addition, every kernel that I have tried that had bcache compiled in or loaded as a module had issues, I would see a kernel OOPS on average once a day from the bcache code, usually followed shortly by a panic from some other unrelated subsystem. 
I didn't get any actual data corruption, but I wasn't using btrfs at the time for any of my filesystems. As an alternative to using bcache, you might try something similar to the following: 64G SSD with /boot, /, and /usr Other HDD with /var, /usr/portage, /usr/src, and /home tmpfs or ramdisk for /tmp and /var/tmp This is essentially what I use now, and I have found that it significantly improves system performance. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: systemd-journal, nodatacow, was: Is anyone using btrfs send/receive for backups instead of rsync?
On Sun, 2013-12-29 at 17:38 -0700, Chris Murphy wrote: On Dec 29, 2013, at 5:39 AM, Duncan 1i5t5.dun...@cox.net wrote: Yes, it does turn off checksumming as well as COW, but given the write- into scenario, that's actually best anyway, because otherwise btrfs has to keep updating the checksums On second thought, I'm less concerned with bitrot and checksumming being lost with nodatacow, than I am with significantly increasing the chance the journal is irreparably lost due to corruption during an unclean shutdown. So first, send/receive + nowcow aren't a great combination. NOCOW won't update the generation numbers send/receive needs to find changes. The best send/receive can do in that case is send over the entire file. But in all these cases, it's also quite common for the application doing the writing to have its own checksumming/error-detection and possible correction -- it pretty much comes with the territory -- in which case btrfs attempting to do the same is simply superfluous even if it weren't a race-condition trigger. I don't know what kind of checksumming systemd performs on the journal, but whenever Btrfs has found corruption with the journal file(s), systemd-journald has also found corruption and starts a new log. So it makes sense to rely on its own mechanisms, than Btrfs's. The autodefrag mode was really made for the small databases like systemd. I'd prefer that we use that for systemd instead of suggesting NOCOW. I'm finally dusting off my work to improve db performance, so hopefully we can do much better in the near future. -chris -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Is anyone using btrfs send/receive for backups instead of rsync?
On Sat, 2013-12-28 at 10:20 -0800, Marc MERLIN wrote: On Sat, Dec 28, 2013 at 10:07:58AM -0800, Marc MERLIN wrote: For instance, if I use an existing rsync destination to start syncing btrfs snapshots to after that, and one file operation can't be applied because let's say the destination file it's supposed to be applied to, isn't there? I should have written more: I'm guessing what happens is that the btrfs receive fails/aborts, I get an error, I then run a manual rsync to reset everything to a good known state, and then continue the btrfs send/receive after that? Btrfs send/receive works by matching state between snapshots on the sending and receiving end. If you update the files manually on the receiving end (say with rsync), it can't merge the states anymore. -chris -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Is anyone using btrfs send/receive for backups instead of rsync?
On Mon, Dec 30, 2013 at 04:05:03PM +, Chris Mason wrote: On Sat, 2013-12-28 at 10:20 -0800, Marc MERLIN wrote: On Sat, Dec 28, 2013 at 10:07:58AM -0800, Marc MERLIN wrote: For instance, if I use an existing rsync destination to start syncing btrfs snapshots to after that, and one file operation can't be applied because let's say the destination file it's supposed to be applied to, isn't there? I should have written more: I'm guessing what happens is that the btrfs receive fails/aborts, I get an error, I then run a manual rsync to reset everything to a good known state, and then continue the btrfs send/receive after that? Btrfs send/receive works by matching state between snapshots on the sending and receiving end. If you update the files manually on the receiving end (say with rsync), it can't merge the states anymore. I got that, but it wasn't quite my question :) I understand that btrfs receive cannot apply file changes if the destination filesystem isn't in a file state that's identical to the source one. I'm just not too sure how the destination FS needs to be configured so that btrfs receive can work with it. 1) Does it need to be an exact byte for byte copy of the block device the source was on? 2) Or can the destination be seeded with a full rsync or cp -a and can btrfs receive take over from there? 3) Then, if I hit a bug where something doesn't get synced right, and I run rsync to fix or verify that the two FS are indeed identical file-wise like they're supposed to, if rsync fixes something, are you saying that it'll stop btrfs receive from working after that? Thanks, Marc -- A mouse is a device used to point at the xterm you want to type in - A.S.R. 
Microsoft is to operating systems what McDonalds is to gourmet cooking Home page: http://marc.merlins.org/ | PGP 1024R/763BE901 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Is anyone using btrfs send/receive for backups instead of rsync?
On Mon, 2013-12-30 at 08:17 -0800, Marc MERLIN wrote: On Mon, Dec 30, 2013 at 04:05:03PM +, Chris Mason wrote: On Sat, 2013-12-28 at 10:20 -0800, Marc MERLIN wrote: On Sat, Dec 28, 2013 at 10:07:58AM -0800, Marc MERLIN wrote: For instance, if I use an existing rsync destination to start syncing btrfs snapshots to after that, and one file operation can't be applied because let's say the destination file it's supposed to be applied to, isn't there? I should have written more: I'm guessing what happens is that the btrfs receive fails/aborts, I get an error, I then run a manual rsync to reset everything to a good known state, and then continue the btrfs send/receive after that? Btrfs send/receive works by matching state between snapshots on the sending and receiving end. If you update the files manually on the receiving end (say with rsync), it can't merge the states anymore. I got that, but it wasn't quite my question :) I understand that btrfs receive cannot apply file changes if the destination filesystem isn't in a file state that's identical to the source one. I'm just not too sure how the destination FS needs to be configured so that btrfs receive can work with it. 1) Does it need to be an exact byte for byte copy of the block device the source was on? No, in fact this doesn't help. 2) Or can the destination be seeded with a full rsync or cp -a and can btrfs receive take over from there? No, it has to be created by btrfs receive. 3) Then, if I hit a bug where something doesn't get synced right, and I run rsync to fix or verify that the two FS are indeed identical file-wise like they're supposed to, if rsync fixes something, are you saying that it'll stop btrfs receive from working after that? Yes, today anyway it won't work. Send converts the changed items into an intermediate format (we don't send btree blocks directly over the wire) and then receive modifies the destination from userland. 
At the end of the stream we update the destination root to say you're now version xxyyzz of uuid aabbcc. We definitely could add a way to manually set this, but once a user does it, it'll be very hard to debug any problems they might have had if their copy wasn't actually up to date. -chris -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Is anyone using btrfs send/receive for backups instead of rsync?
On Mon, Dec 30, 2013 at 04:26:42PM +, Chris Mason wrote: 1) Does it need to be an exact byte for byte copy of the block device the source was on? No, in fact this doesn't help. 2) Or can the destination be seeded with a full rsync or cp -a and can btrfs receive take over from there? No, it has to be created by btrfs receive. Aaah, I wasn't clear on that, thanks for clarifying. So I need to make sure the target block device is at least as big as the source one, and if necessary a few blocks bigger if the drives do not allocate partitions of the exactly the same size. Mmmh, this makes it less desirable for me to use this then since I use over allocation on the backup servers and if I had to have as much space blocked off for the full size of each filesystem backed up, I'm going to be short. Bummer. 3) Then, if I hit a bug where something doesn't get synced right, and I run rsync to fix or verify that the two FS are indeed identical file-wise like they're supposed to, if rsync fixes something, are you saying that it'll stop btrfs receive from working after that? Yes, today anyway it won't work. Send converts the changed items into an intermediate format (we don't send btree blocks directly over the wire) and then receive modifies the destination from userland. At the end of the stream we update the destination root to say you're now version xxyyzz of uuid aabbcc. We definitely could add a way to manually set this, but once a user does it, it'll be very hard to debug any problems they might have had if their copy wasn't actually up to date. Understood. I dreamt that it was computing file differences and could just apply them on top of any other btrfs filesystem, even if it were smaller and had been created via rsync. If one day, it could at least work on a subvolume level (only sync a subvolume), then it would be more useful to me. Maybe later... Thanks for clearing that up. Marc -- A mouse is a device used to point at the xterm you want to type in - A.S.R. 
Microsoft is to operating systems what McDonalds is to gourmet cooking Home page: http://marc.merlins.org/ -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: missing /sbin/fsck.btrfs
On Mon, Dec 2, 2013 at 12:01 AM, Dave Chinner da...@fromorbit.com wrote: I just explained that things can go wrong if you don't detect certain types of errors in fsck.foo when it is called from fstab processing. What I am implying here is that we cannot prevent users from setting passno to 1 or 2 in /etc/fstab. We have no control over that and so asserting that we don't need a fsck.btrfs because we can set passno to 0 is invalid. IOWs, fsck.btrfs needs to be present and it needs to behave correctly in these cases I actually think what btrfs is doing here is the more sensible thing (i.e., to not ship an fsck.btrfs), as it is a bit confusing to have a fsck.* that does not in fact do any filesystem checking. The way this stuff works under systemd is: * fsck is only ever called on a filesystem once the backing device has appeared (so under systemd, fsck.xfs is indeed a noop). * fsck is skipped for filesystems where the relevant helper does not exist, so fs_passno=1 has the same effect for xfs and btrfs filesystems (either way, nothing happens). That still leaves non-systemd systems and calling fsck -A manually. Maybe a good solution would be to patch fsck to adopt systemd's behavior, which would avoid every filesystem having to ship these fake fsck helpers? What do you think Karel? Cheers, Tom -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Is anyone using btrfs send/receive for backups instead of rsync?
On Dec 30, 2013, at 10:10 AM, Marc MERLIN m...@merlins.org wrote: If one day, it could at least work on a subvolume level (only sync a subvolume), then it would be more useful to me. Maybe later… Maybe I'm missing something, but btrfs send/receive only work on a subvolume level. Chris Murphy-- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Is anyone using btrfs send/receive for backups instead of rsync?
On Mon, Dec 30, 2013 at 10:48:10AM -0700, Chris Murphy wrote: On Dec 30, 2013, at 10:10 AM, Marc MERLIN m...@merlins.org wrote: If one day, it could at least work on a subvolume level (only sync a subvolume), then it would be more useful to me. Maybe later… Maybe I'm missing something, but btrfs send/receive only work on a subvolume level. Never mind, I seem to be the one being dense. I mis-read that you needed to create the filesystem with btrfs receive. Indeed, it's on a subvolume level, so it's actually fine since it does allow over-provisioning after all. My bad, sorry :) Marc -- A mouse is a device used to point at the xterm you want to type in - A.S.R. Microsoft is to operating systems what McDonalds is to gourmet cooking Home page: http://marc.merlins.org/ | PGP 1024R/763BE901 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Is anyone using btrfs send/receive for backups instead of rsync?
On Dec 30, 2013, at 10:57 AM, Marc MERLIN m...@merlins.org wrote: On Mon, Dec 30, 2013 at 10:48:10AM -0700, Chris Murphy wrote: On Dec 30, 2013, at 10:10 AM, Marc MERLIN m...@merlins.org wrote: If one day, it could at least work on a subvolume level (only sync a subvolume), then it would be more useful to me. Maybe later… Maybe I'm missing something, but btrfs send/receive only work on a subvolume level. Never mind, I seem to be the one being dense. I mis-read that you needed to create the filesystem with btrfs receive. Indeed, it's on a subvolume level, so it's actually fine since it does allow over provisionning afterall. Depending on resources and disaster recovery requirements, you might also consider using send -f without receive at all, to the backup destination. The first send file (which will be big) can then be put anywhere, even to tape, and use the backup storage just for the incremental send -f files. Chris Murphy-- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Migrate to bcache: A few questions
Marc MERLIN m...@merlins.org schrieb: On Mon, Dec 30, 2013 at 02:22:55AM +0100, Kai Krakow wrote: These thought are actually quite interesting. So you are saying that data may not be fully written to SSD although the kernel thinks so? This is That, and worse. Incidently, I have just posted on my G+ about this: https://plus.google.com/106981743284611658289/posts/Us8yjK9SPs6 which is mostly links to http://lkcl.net/reports/ssd_analysis.html https://www.usenix.org/conference/fast13/understanding-robustness-ssds-under-power-fault After you read those, you'll never think twice about SSDs and data loss anymore :-/ (I kind of found that out myself over time too, but these have much more data than I got myself empirically on a couple of SSDs) The bad thing here is: Even battery-backed RAID controllers won't help you here. I start to understand why I still don't trust this new technology entirely. Thanks, Kai -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
question regarding caching
Hello, I have some questions regarding caching in BTRFS. When a file system is unmounted and mounted again, would all the previously cached content be removed from the cache after flushing to disk? After remounting, would the initial requests always be fetched from the disk? Rather than a local disk, I have a remote device to which my IO requests are sent and from which the data is fetched. I need certain data to be fetched from the remote device after a remount. But somehow I do not see any request appearing at the device. I even tried to do drop_caches after remounting the file system, but that does not seem to help. I guess my problem is not related to BTRFS, but since I am working with BTRFS, I wanted to ask here for help. Could any one tell me how I can ensure that requests are fetched from the (remote) device, especially after file system remount, without having to use drop_caches? Please let me know if I described the problem too vaguely and should give some more details. Wishing everyone a happy new year. Thanks and regards, Aastha. -- Aastha Mehta MPI-SWS, Germany E-mail: aasth...@mpi-sws.org -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
A question about btrfs_ioctl_file_extent_same
A simple question about btrfs_ioctl_file_extent_same... Is there any check before btrfs_cmp_data is called that short-circuits things if the extents being compared are already the same extent? I was looking and could not find such a check, but I don't really know the btrfs source code that well. If there is no such check, would it not make sense to add one? -- Michael Welsh Duggan (m...@md5i.com) -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs-progs: add dedup subcommand
On Mon, Dec 30, 2013 at 12:34:42PM +0100, Martin Steigerwald wrote: Am Montag, 30. Dezember 2013, 16:12:55 schrieben Sie: This adds deduplication subcommands, 'btrfs dedup command path', including enable/disable/on/off. Nice. Looking forward to testing it. Well, I just got a report from another user, Marcel, who still got ENOSPC errors with this round of the patch set, so it seems that I don't really fix that bug, I guess I have to work harder on this :-( - btrfs dedup enable Create the dedup tree, and it's the very first step when you're going to use the dedup feature. - btrfs dedup disable Delete the dedup tree, after this we're not able to use dedup any more unless you enable it again. So if deduplication has been switched on for a while, btrfs dedup disable will cause BTRFS to undo the deduplication (and thus require more space for the same amount of data)? No, it remains unchanged, and the data is independent of dedupe, so you can read them without any problems. Happy new year. Thanks, -liubo -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Migrate to bcache: A few questions
Duncan 1i5t5.dun...@cox.net schrieb: [ spoiler: tldr ;-) ] * How stable is it? I've read about some csum errors lately... FWIW, both bcache and btrfs are new and still developing technology. While I'm using btrfs here, I have tested usable (which for root either means directly bootable or that you have tested booting to a recovery image and restoring from there, I do the former, here) backups, as STRONGLY recommended for btrfs in its current state, but haven't had to use them. And I considered bcache previously and might otherwise be using it, but at least personally, I'm not willing to try BOTH of them at once, since neither one is mature yet and if there are problems as there very well might be, I'd have the additional issue of figuring out which one was the problem, and I'm personally not prepared to deal with that. I mostly trust btrfs by now. Don't get me wrong: I still have my nightly backup job syncing the complete system to an external drive - nothing defeats a good backup. But btrfs has survived reliably multiple power-losses, kernel panics/freezes, unreliable USB connections, ... It looks very stable from that view. Yes, it may have bugs that may introduce errors fatal to the filesystem structure. But generally, under usual workloads it has proven stable for me. At least for desktop workloads. Instead, at this point I'd recommend choosing /either/ bcache /or/ btrfs, and using bcache with a more mature filesystem like ext4 or (what I used for years previous and still use for spinning rust) reiserfs. I've used reiserfs for several years a long time ago. But it absolutely does not scale well for parallel/threaded workloads which is a show stopper for server workloads. But it always survived even the worst failure scenarios (like SCSI bus going offline for some RAID members) and the tools distributed with it were able to recover all data even if the FS was damaged beyond any usual things you would normally try when it does no longer mount. 
I've been with Ext3 before, and it was not only one time that a simple power-loss during high server-workload destroyed the filesystem beyond repair with fsck only making it worse. Since reiserfs did not scale well and ext* FS has annoyed me more than once, we've decided to go with XFS. While it tends to wipe some data after power- loss and leaves you with zero-filled files, it has proven extremely reliable even under those situations mentioned above like dying SCSI bus. Not to the extent reiserfs did but still very satisfying. The big plus: it scales extremely well with parallel workloads and can be optimized for the stripe configuration of the underlying RAID layer. So I made it my default filesystem for desktop, too. With the above mentioned annoying feature of zero'ing out recently touched files when the system crashed. But well, we all got proven backups, right? Yep, I also learned that lesson... *sigh But btrfs, when first announced and while I already was jealously looking at ZFS, seemed to be the FS of my choice giving me flexible RAID setups, snapshots... I'm quite happy with it although it feels slow sometimes. I simply threw more RAM at it - now it is okay. And as I said, keep your backups as current as you're willing to deal with losing what's not backed up, and tested usable and (for root) either bootable or restorable from alternate boot, because while at least btrfs is /reasonably/ stable for /ordinary/ daily use, there remain corner- cases and you never know when your case is going to BE a corner-case! I've got a small rescue system I can boot which has btrfs-tools and a recent kernel to flexible repair, restore, or whatever I want to do with my backup. My backup itself is not bootable (although it probably could, if I change some configurations files). * I want to migrate my current storage to bcache without replaying a backup. Is it possible? 
Since I've not actually used bcache, I won't try to answer some of these, but will answer based on what I've seen on the list where I can... I don't know on this one. I remember someone created some python scripts to make it possible - wrt btrfs especially. Can't remember the link. Maybe I'm able to dig it up. But at least I read it as: There's no improvement on that migration path directly from bcache. I hoped otherwise... * Did others already use it? What is the perceived performance for desktop workloads in comparison to not using bcache? Others are indeed already using it. I've seen some btrfs/bcache problems reported on this list, but as mentioned above, when both are in use that means figuring out which is the problem, and at least from the btrfs side I've not seen a lot of resolution in that regard. From here it /looks/ like that's simply being punted at this time, as there's still more easily traceable problems without the additional bcache variable to work on first. But it's quite possible
Re: question regarding caching
Aastha Mehta aasth...@gmail.com schrieb: Rather than a local disk, I have a remote device to which my IO requests are sent and from which the data is fetched. I need certain data to be fetched from the remote device after a remount. But somehow I do not see any request appearing at the device. I even tried to do drop_caches after remounting the file system, but that does not seem to help. Maybe you or your distribution deployed cachefilesd and uses it for the remote fs? HTH Kai -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs-progs: add dedup subcommand
Martin Steigerwald mar...@lichtvoll.de schrieb: - btrfs dedup disable Delete the dedup tree, after this we're not able to use dedup any more unless you enable it again. So if deduplication has been switched on for a while, btrfs dedup disable will cause BTRFS to undo the deduplication (and thus require more space for the same amount of data)? From my understanding I would guess it just loses track of what the content is in content based storage - so when re-enabling it will have to learn from the beginning. It should not unshare data as sharing extents is a feature of btrfs distinct from the function of online dedup itself. At least that would sound reasonable to me. Regards, Kai -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html