On Tue, Jun 07, 2011 at 04:44:09PM -0400, Josef Bacik wrote: > --- a/fs/btrfs/extent-tree.c > +++ b/fs/btrfs/extent-tree.c > @@ -2932,6 +2932,8 @@ static int update_space_info(struct btrfs_fs_info > *info, u64 flags, > found->full = 0; > found->force_alloc = CHUNK_ALLOC_NO_FORCE; > found->chunk_alloc = 0; > + found->flush = 0; > + init_waitqueue_head(&found->wait); > *space_info = found; > list_add_rcu(&found->list, &info->space_info); > atomic_set(&found->caching_threads, 0); > @@ -3314,9 +3316,13 @@ static int shrink_delalloc(struct btrfs_trans_handle > *trans, > if (reserved == 0) > return 0; > > - /* nothing to shrink - nothing to reclaim */ > - if (root->fs_info->delalloc_bytes == 0) > + smp_mb();
can you please explain why do you use the barrier here? (and add that explanation as a comment) it's for the delalloc_bytes test, right? this is being modified in btrfs_set_bit_hook/btrfs_clear_bit_hook, protected by a spinlock. the counter is sum of all delalloc_bytes for each delalloc inode. if the counter is nonzero, then fs_info->delalloc_inodes is expected to be nonempty, but the list is not touched in this function, but indirectly from writeback_inodes_sb_nr_if_idle and the respective .writepages callback, ending up in __extent_writepage which starts messing with delalloc. it think it's possible to reach state, where counter is nonzero, but the delalloc_inodes list is empty. then writeback will just skip the delalloc work in this round and will process it later. even with a barrier in place: btrfs_set_bit_hook: # counter increased, a barrier will assure len is obtained from # delalloc_bytes in shrink_delalloc 1349 root->fs_info->delalloc_bytes += len; # but fs_info->delalloc_list may be empty 1350 if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1351 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1352 &root->fs_info->delalloc_inodes); 1353 } although this does not seem to crash or cause corruption, I suggest to use the spinlock as the synchronization mechanism to protect reading fs_info->delalloc_bytes david > + if (root->fs_info->delalloc_bytes == 0) { > + if (trans) > + return 0; > + btrfs_wait_ordered_extents(root, 0, 0); > return 0; > + } > > max_reclaim = min(reserved, to_reclaim); > > @@ -3360,6 +3366,8 @@ static int shrink_delalloc(struct btrfs_trans_handle > *trans, > } > > } > + if (reclaimed >= to_reclaim && !trans) > + btrfs_wait_ordered_extents(root, 0, 0); > return reclaimed >= to_reclaim; > } > > @@ -3384,15 +3392,36 @@ static int reserve_metadata_bytes(struct > btrfs_trans_handle *trans, > u64 num_bytes = orig_bytes; > int retries = 0; > int ret = 0; > - bool reserved = false; > bool committed = false; > + bool flushing = false; > > again: > - ret = -ENOSPC; > - if (reserved) > - num_bytes = 0; > - > + ret = 0; > spin_lock(&space_info->lock); > + /* > + * We only want to wait if somebody other than us is flushing and we are > + * actually alloed to flush. > + */ > + while (flush && !flushing && space_info->flush) { > + spin_unlock(&space_info->lock); > + /* > + * If we have a trans handle we can't wait because the flusher > + * may have to commit the transaction, which would mean we would > + * deadlock since we are waiting for the flusher to finish, but > + * hold the current transaction open. > + */ > + if (trans) > + return -EAGAIN; > + ret = wait_event_interruptible(space_info->wait, > + !space_info->flush); > + /* Must have been interrupted, return */ > + if (ret) > + return -EINTR; > + > + spin_lock(&space_info->lock); > + } > + > + ret = -ENOSPC; > unused = space_info->bytes_used + space_info->bytes_reserved + > space_info->bytes_pinned + space_info->bytes_readonly + > space_info->bytes_may_use; > @@ -3407,8 +3436,7 @@ again: > if (unused <= space_info->total_bytes) { > unused = space_info->total_bytes - unused; > if (unused >= num_bytes) { > - if (!reserved) > - space_info->bytes_may_use += orig_bytes; > + space_info->bytes_may_use += orig_bytes; > ret = 0; > } else { > /* > @@ -3433,17 +3461,14 @@ again: > * to reclaim space we can actually use it instead of somebody else > * stealing it from us. > */ > - if (ret && !reserved) { > - space_info->bytes_may_use += orig_bytes; > - reserved = true; > + if (ret && flush) { > + flushing = true; > + space_info->flush = 1; > } > > spin_unlock(&space_info->lock); > > - if (!ret) > - return 0; > - > - if (!flush) > + if (!ret || !flush) > goto out; > > /* > @@ -3451,9 +3476,7 @@ again: > * metadata until after the IO is completed. > */ > ret = shrink_delalloc(trans, root, num_bytes, 1); > - if (ret > 0) > - return 0; > - else if (ret < 0) > + if (ret < 0) > goto out; > > /* > @@ -3466,11 +3489,11 @@ again: > goto again; > } > > - spin_lock(&space_info->lock); > /* > * Not enough space to be reclaimed, don't bother committing the > * transaction. > */ > + spin_lock(&space_info->lock); > if (space_info->bytes_pinned < orig_bytes) > ret = -ENOSPC; > spin_unlock(&space_info->lock); > @@ -3493,12 +3516,12 @@ again: > } > > out: > - if (reserved) { > + if (flushing) { > spin_lock(&space_info->lock); > - space_info->bytes_may_use -= orig_bytes; > + space_info->flush = 0; > + wake_up_all(&space_info->wait); > spin_unlock(&space_info->lock); > } > - > return ret; > } > > -- > 1.7.2.3 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html