On Mon, Jun 15, 2015 at 2:41 PM, <[email protected]> wrote:
> From: Jeff Mahoney <[email protected]>
>
> Since we now clean up block groups automatically as they become
> empty, iterating over block groups is no longer sufficient to discard
> unused space.
>
> This patch iterates over the unused chunk space and discards any regions
> that are unallocated, regardless of whether they were ever used. This is
> a change for btrfs but is consistent with other file systems.
>
> We do this in a transactionless manner since the discard process can take
> a substantial amount of time and a transaction would need to be started
> before the acquisition of the device list lock. That would mean a
> transaction would be held open across /all/ of the discards collectively.
> In order to prevent other threads from allocating or freeing chunks, we
> hold the chunks lock across the search and discard calls. We release it
> between searches to allow the file system to perform more-or-less
> normally. Since the running transaction can commit and disappear while
> we're using the transaction pointer, we take a reference to it and
> release it after the search. This is safe since it would happen normally
> at the end of the transaction commit after any locks are released anyway.
> We also take the commit_root_sem to protect against a transaction starting
> and committing while we're running.
>
> Signed-off-by: Jeff Mahoney <[email protected]>
Reviewed-by: Filipe Manana <[email protected]>
Tested-by: Filipe Manana <[email protected]>
Side note, this still doesn't apply cleanly on latest integration
branch (integration-4.2), results in warnings about casting pointer
from different type (btrfs_trans_handle to btrfs_transaction) at
btrfs_shrink_device().
> ---
> fs/btrfs/extent-tree.c | 101
> +++++++++++++++++++++++++++++++++++++++++++++++++
> fs/btrfs/volumes.c | 60 ++++++++++++++++++-----------
> fs/btrfs/volumes.h | 3 ++
> 3 files changed, 141 insertions(+), 23 deletions(-)
>
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 1e44b93..24b48df 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -10143,10 +10143,99 @@ int btrfs_error_unpin_extent_range(struct
> btrfs_root *root, u64 start, u64 end)
> return unpin_extent_range(root, start, end, false);
> }
>
> +/*
> + * It used to be that old block groups would be left around forever.
> + * Iterating over them would be enough to trim unused space. Since we
> + * now automatically remove them, we also need to iterate over unallocated
> + * space.
> + *
> + * We don't want a transaction for this since the discard may take a
> + * substantial amount of time. We don't require that a transaction be
> + * running, but we do need to take a running transaction into account
> + * to ensure that we're not discarding chunks that were released in
> + * the current transaction.
> + *
> + * Holding the chunks lock will prevent other threads from allocating
> + * or releasing chunks, but it won't prevent a running transaction
> + * from committing and releasing the memory that the pending chunks
> + * list head uses. For that, we need to take a reference to the
> + * transaction.
> + */
> +static int btrfs_trim_free_extents(struct btrfs_device *device,
> + u64 minlen, u64 *trimmed)
> +{
> + u64 start = 0, len = 0;
> + int ret;
> +
> + *trimmed = 0;
> +
> + /* Not writeable = nothing to do. */
> + if (!device->writeable)
> + return 0;
> +
> + /* No free space = nothing to do. */
> + if (device->total_bytes <= device->bytes_used)
> + return 0;
> +
> + ret = 0;
> +
> + while (1) {
> + struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
> + struct btrfs_transaction *trans;
> + u64 bytes;
> +
> + ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
> + if (ret)
> + return ret;
> +
> + down_read(&fs_info->commit_root_sem);
> +
> + spin_lock(&fs_info->trans_lock);
> + trans = fs_info->running_transaction;
> + if (trans)
> + atomic_inc(&trans->use_count);
> + spin_unlock(&fs_info->trans_lock);
> +
> + ret = find_free_dev_extent_start(trans, device, minlen, start,
> + &start, &len);
> + if (trans)
> + btrfs_put_transaction(trans);
> +
> + if (ret) {
> + up_read(&fs_info->commit_root_sem);
> + mutex_unlock(&fs_info->chunk_mutex);
> + if (ret == -ENOSPC)
> + ret = 0;
> + break;
> + }
> +
> + ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
> + up_read(&fs_info->commit_root_sem);
> + mutex_unlock(&fs_info->chunk_mutex);
> +
> + if (ret)
> + break;
> +
> + start += len;
> + *trimmed += bytes;
> +
> + if (fatal_signal_pending(current)) {
> + ret = -ERESTARTSYS;
> + break;
> + }
> +
> + cond_resched();
> + }
> +
> + return ret;
> +}
> +
> int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
> {
> struct btrfs_fs_info *fs_info = root->fs_info;
> struct btrfs_block_group_cache *cache = NULL;
> + struct btrfs_device *device;
> + struct list_head *devices;
> u64 group_trimmed;
> u64 start;
> u64 end;
> @@ -10201,6 +10290,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct
> fstrim_range *range)
> cache = next_block_group(fs_info->tree_root, cache);
> }
>
> + mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
> + devices = &root->fs_info->fs_devices->alloc_list;
> + list_for_each_entry(device, devices, dev_alloc_list) {
> + ret = btrfs_trim_free_extents(device, range->minlen,
> + &group_trimmed);
> + if (ret)
> + break;
> +
> + trimmed += group_trimmed;
> + }
> + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
> +
> range->len = trimmed;
> return ret;
> }
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 174f5e1..7fdde31 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1051,15 +1051,18 @@ out:
> return ret;
> }
>
> -static int contains_pending_extent(struct btrfs_trans_handle *trans,
> +static int contains_pending_extent(struct btrfs_transaction *transaction,
> struct btrfs_device *device,
> u64 *start, u64 len)
> {
> + struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
> struct extent_map *em;
> - struct list_head *search_list = &trans->transaction->pending_chunks;
> + struct list_head *search_list = &fs_info->pinned_chunks;
> int ret = 0;
> u64 physical_start = *start;
>
> + if (transaction)
> + search_list = &transaction->pending_chunks;
> again:
> list_for_each_entry(em, search_list, list) {
> struct map_lookup *map;
> @@ -1078,8 +1081,8 @@ again:
> ret = 1;
> }
> }
> - if (search_list == &trans->transaction->pending_chunks) {
> - search_list = &trans->root->fs_info->pinned_chunks;
> + if (search_list != &fs_info->pinned_chunks) {
> + search_list = &fs_info->pinned_chunks;
> goto again;
> }
>
> @@ -1088,12 +1091,13 @@ again:
>
>
> /*
> - * find_free_dev_extent - find free space in the specified device
> - * @device: the device which we search the free space in
> - * @num_bytes: the size of the free space that we need
> - * @start: store the start of the free space.
> - * @len: the size of the free space. that we find, or the size of the
> max
> - * free space if we don't find suitable free space
> + * find_free_dev_extent_start - find free space in the specified device
> + * @device: the device which we search the free space in
> + * @num_bytes: the size of the free space that we need
> + * @search_start: the position from which to begin the search
> + * @start: store the start of the free space.
> + * @len: the size of the free space. that we find, or the size
> + * of the max free space if we don't find suitable free space
> *
> * this uses a pretty simple search, the expectation is that it is
> * called very infrequently and that a given device has a small number
> @@ -1107,9 +1111,9 @@ again:
> * But if we don't find suitable free space, it is used to store the size of
> * the max free space.
> */
> -int find_free_dev_extent(struct btrfs_trans_handle *trans,
> - struct btrfs_device *device, u64 num_bytes,
> - u64 *start, u64 *len)
> +int find_free_dev_extent_start(struct btrfs_transaction *transaction,
> + struct btrfs_device *device, u64 num_bytes,
> + u64 search_start, u64 *start, u64 *len)
> {
> struct btrfs_key key;
> struct btrfs_root *root = device->dev_root;
> @@ -1119,19 +1123,11 @@ int find_free_dev_extent(struct btrfs_trans_handle
> *trans,
> u64 max_hole_start;
> u64 max_hole_size;
> u64 extent_end;
> - u64 search_start;
> u64 search_end = device->total_bytes;
> int ret;
> int slot;
> struct extent_buffer *l;
>
> - /* FIXME use last free of some kind */
> -
> - /* we don't want to overwrite the superblock on the drive,
> - * so we make sure to start at an offset of at least 1MB
> - */
> - search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
> -
> path = btrfs_alloc_path();
> if (!path)
> return -ENOMEM;
> @@ -1192,7 +1188,7 @@ again:
> * Have to check before we set max_hole_start,
> otherwise
> * we could end up sending back this offset anyway.
> */
> - if (contains_pending_extent(trans, device,
> + if (contains_pending_extent(transaction, device,
> &search_start,
> hole_size)) {
> if (key.offset >= search_start) {
> @@ -1241,7 +1237,7 @@ next:
> if (search_end > search_start) {
> hole_size = search_end - search_start;
>
> - if (contains_pending_extent(trans, device, &search_start,
> + if (contains_pending_extent(transaction, device,
> &search_start,
> hole_size)) {
> btrfs_release_path(path);
> goto again;
> @@ -1267,6 +1263,24 @@ out:
> return ret;
> }
>
> +int find_free_dev_extent(struct btrfs_trans_handle *trans,
> + struct btrfs_device *device, u64 num_bytes,
> + u64 *start, u64 *len)
> +{
> + struct btrfs_root *root = device->dev_root;
> + u64 search_start;
> +
> + /* FIXME use last free of some kind */
> +
> + /*
> + * we don't want to overwrite the superblock on the drive,
> + * so we make sure to start at an offset of at least 1MB
> + */
> + search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
> + return find_free_dev_extent_start(trans->transaction, device,
> + num_bytes, search_start, start,
> len);
> +}
> +
> static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
> struct btrfs_device *device,
> u64 start, u64 *dev_extent_len)
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index ebc3133..30918a8 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -449,6 +449,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
> int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
> int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
> int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
> +int find_free_dev_extent_start(struct btrfs_transaction *transaction,
> + struct btrfs_device *device, u64 num_bytes,
> + u64 search_start, u64 *start, u64 *max_avail);
> int find_free_dev_extent(struct btrfs_trans_handle *trans,
> struct btrfs_device *device, u64 num_bytes,
> u64 *start, u64 *max_avail);
> --
> 2.4.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Filipe David Manana,
"Reasonable men adapt themselves to the world.
Unreasonable men adapt the world to themselves.
That's why all progress depends on unreasonable men."
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html