Re: [PATCH 07/10] writeback: Implement reliable switching to default writeback structure
On Fri 10-02-17 13:19:44, NeilBrown wrote: > On Thu, Feb 09 2017, Jan Kara wrote: > > > Currently switching of inode between different writeback structures is > > asynchronous and not guaranteed to succeed. Add a variant of switching > > that is synchronous and reliable so that it can reliably move inode to > > the default writeback structure (bdi->wb) when writeback on bdi is going > > to be shutdown. > > > > Signed-off-by: Jan Kara> > --- > > fs/fs-writeback.c | 60 > > --- > > include/linux/fs.h| 3 ++- > > include/linux/writeback.h | 6 + > > 3 files changed, 60 insertions(+), 9 deletions(-) > > > > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c > > index 23dc97cf2a50..52992a1036b1 100644 > > --- a/fs/fs-writeback.c > > +++ b/fs/fs-writeback.c > > @@ -332,14 +332,11 @@ struct inode_switch_wbs_context { > > struct work_struct work; > > }; > > > > -static void inode_switch_wbs_work_fn(struct work_struct *work) > > +static void do_inode_switch_wbs(struct inode *inode, > > + struct bdi_writeback *new_wb) > > { > > - struct inode_switch_wbs_context *isw = > > - container_of(work, struct inode_switch_wbs_context, work); > > - struct inode *inode = isw->inode; > > struct address_space *mapping = inode->i_mapping; > > struct bdi_writeback *old_wb = inode->i_wb; > > - struct bdi_writeback *new_wb = isw->new_wb; > > struct radix_tree_iter iter; > > bool switched = false; > > void **slot; > > @@ -436,15 +433,29 @@ static void inode_switch_wbs_work_fn(struct > > work_struct *work) > > spin_unlock(&new_wb->list_lock); > > spin_unlock(&old_wb->list_lock); > > > > + /* > > +* Make sure waitqueue_active() check in wake_up_bit() cannot happen > > +* before I_WB_SWITCH is cleared. Pairs with the barrier in > > +* set_task_state() after wait_on_bit() added waiter to the wait queue. > > I think you mean "set_current_state()" ?? Yes, I'll fix that. > It's rather a trap for the unwary, this need for a smp_mb(). 
> Grepping for wake_up_bit(), I find quite a few places with barriers - > sometimes clear_bit_unlock() or spin_unlock() - but > > fs/block_dev.c- whole->bd_claiming = NULL; > fs/block_dev.c: wake_up_bit(&whole->bd_claiming, 0); > > fs/cifs/connect.c- clear_bit(TCON_LINK_PENDING, &tlink->tl_flags); > fs/cifs/connect.c: wake_up_bit(&tlink->tl_flags, TCON_LINK_PENDING); > > fs/cifs/misc.c- clear_bit(CIFS_INODE_PENDING_WRITERS, > &cinode->flags); > fs/cifs/misc.c: wake_up_bit(&cinode->flags, > CIFS_INODE_PENDING_WRITERS); > > (several more in cifs) > > net/sunrpc/xprt.c- clear_bit(XPRT_CLOSE_WAIT, &xprt->state); > net/sunrpc/xprt.c- xprt->ops->close(xprt); > net/sunrpc/xprt.c- xprt_release_write(xprt, NULL); > net/sunrpc/xprt.c: wake_up_bit(&xprt->state, XPRT_LOCKED); > (there might be a barrier in ->close or xprt_release_write() I guess) > > security/keys/gc.c- clear_bit(KEY_GC_REAPING_KEYTYPE, > &key_gc_flags); > security/keys/gc.c: wake_up_bit(&key_gc_flags, > KEY_GC_REAPING_KEYTYPE); Yup, the above look like bugs. > I wonder if there is a good way to make this less error-prone. > I would suggest that wake_up_bit() should always have a barrier, and > __wake_up_bit() is needed to avoid it, but there is already a > __wake_up_bit() with a slightly different interface. Yeah, it is error-prone as all waitqueue_active() optimizations... > In this case, you have a spin_unlock() just before the wake_up_bit(). > It is my understanding that it would provide enough of a barrier (all > writes before are globally visible after), so do you really need > the barrier here? I believe I do. spin_unlock() is a semi-permeable barrier - i.e., any read or write from "outside" can be moved inside. So CPU is free to prefetch values for waitqueue active checks before the spinlock is unlocked or even before clearing I_WB_SWITCH bit. 
> > +*/ > > + smp_mb(); > > + wake_up_bit(&inode->i_state, __I_WB_SWITCH); > > + > > if (switched) { > > wb_wakeup(new_wb); > > wb_put(old_wb); > > } > > - wb_put(new_wb); > > +} > > > > - iput(inode); > > - kfree(isw); > > +static void inode_switch_wbs_work_fn(struct work_struct *work) > > +{ > > + struct inode_switch_wbs_context *isw = > > + container_of(work, struct inode_switch_wbs_context, work); > > > > + do_inode_switch_wbs(isw->inode, isw->new_wb); > > + wb_put(isw->new_wb); > > + iput(isw->inode); > > + kfree(isw); > > atomic_dec(&isw_nr_in_flight); > > } > > > > @@ -521,6 +532,39 @@ static void inode_switch_wbs(struct inode *inode, int > > new_wb_id) > > } > > > > /** > > + * inode_switch_to_default_wb_sync - change the wb association of an inode > > to > > + * the default writeback structure synchronously > > + * @inode: target inode > > + * > > + * Switch @inode's wb association to the default
Re: [PATCH 07/10] writeback: Implement reliable switching to default writeback structure
On Thu, Feb 09 2017, Jan Kara wrote: > Currently switching of inode between different writeback structures is > asynchronous and not guaranteed to succeed. Add a variant of switching > that is synchronous and reliable so that it can reliably move inode to > the default writeback structure (bdi->wb) when writeback on bdi is going > to be shutdown. > > Signed-off-by: Jan Kara> --- > fs/fs-writeback.c | 60 > --- > include/linux/fs.h| 3 ++- > include/linux/writeback.h | 6 + > 3 files changed, 60 insertions(+), 9 deletions(-) > > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c > index 23dc97cf2a50..52992a1036b1 100644 > --- a/fs/fs-writeback.c > +++ b/fs/fs-writeback.c > @@ -332,14 +332,11 @@ struct inode_switch_wbs_context { > struct work_struct work; > }; > > -static void inode_switch_wbs_work_fn(struct work_struct *work) > +static void do_inode_switch_wbs(struct inode *inode, > + struct bdi_writeback *new_wb) > { > - struct inode_switch_wbs_context *isw = > - container_of(work, struct inode_switch_wbs_context, work); > - struct inode *inode = isw->inode; > struct address_space *mapping = inode->i_mapping; > struct bdi_writeback *old_wb = inode->i_wb; > - struct bdi_writeback *new_wb = isw->new_wb; > struct radix_tree_iter iter; > bool switched = false; > void **slot; > @@ -436,15 +433,29 @@ static void inode_switch_wbs_work_fn(struct work_struct > *work) > spin_unlock(&new_wb->list_lock); > spin_unlock(&old_wb->list_lock); > > + /* > + * Make sure waitqueue_active() check in wake_up_bit() cannot happen > + * before I_WB_SWITCH is cleared. Pairs with the barrier in > + * set_task_state() after wait_on_bit() added waiter to the wait queue. I think you mean "set_current_state()" ?? It's rather a trap for the unwary, this need for a smp_mb(). 
Grepping for wake_up_bit(), I find quite a few places with barriers - sometimes clear_bit_unlock() or spin_unlock() - but fs/block_dev.c- whole->bd_claiming = NULL; fs/block_dev.c: wake_up_bit(&whole->bd_claiming, 0); fs/cifs/connect.c- clear_bit(TCON_LINK_PENDING, &tlink->tl_flags); fs/cifs/connect.c: wake_up_bit(&tlink->tl_flags, TCON_LINK_PENDING); fs/cifs/misc.c- clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); fs/cifs/misc.c: wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); (several more in cifs) net/sunrpc/xprt.c- clear_bit(XPRT_CLOSE_WAIT, &xprt->state); net/sunrpc/xprt.c- xprt->ops->close(xprt); net/sunrpc/xprt.c- xprt_release_write(xprt, NULL); net/sunrpc/xprt.c: wake_up_bit(&xprt->state, XPRT_LOCKED); (there might be a barrier in ->close or xprt_release_write() I guess) security/keys/gc.c- clear_bit(KEY_GC_REAPING_KEYTYPE, &key_gc_flags); security/keys/gc.c: wake_up_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE); I wonder if there is a good way to make this less error-prone. I would suggest that wake_up_bit() should always have a barrier, and __wake_up_bit() is needed to avoid it, but there is already a __wake_up_bit() with a slightly different interface. In this case, you have a spin_unlock() just before the wake_up_bit(). It is my understanding that it would provide enough of a barrier (all writes before are globally visible after), so do you really need the barrier here? 
> + */ > + smp_mb(); > + wake_up_bit(&inode->i_state, __I_WB_SWITCH); > + > if (switched) { > wb_wakeup(new_wb); > wb_put(old_wb); > } > - wb_put(new_wb); > +} > > - iput(inode); > - kfree(isw); > +static void inode_switch_wbs_work_fn(struct work_struct *work) > +{ > + struct inode_switch_wbs_context *isw = > + container_of(work, struct inode_switch_wbs_context, work); > > + do_inode_switch_wbs(isw->inode, isw->new_wb); > + wb_put(isw->new_wb); > + iput(isw->inode); > + kfree(isw); > atomic_dec(&isw_nr_in_flight); > } > > @@ -521,6 +532,39 @@ static void inode_switch_wbs(struct inode *inode, int > new_wb_id) > } > > /** > + * inode_switch_to_default_wb_sync - change the wb association of an inode to > + * the default writeback structure synchronously > + * @inode: target inode > + * > + * Switch @inode's wb association to the default writeback structure > (bdi->wb). > + * Unlike inode_switch_wbs() the switching is performed synchronously and we > + * guarantee the inode is switched to the default writeback structure when > this > + * function returns. Nothing prevents from someone else switching inode to > + * another writeback structure just when we are done though. Preventing that > is > + * up to the caller if needed. > + */ > +void inode_switch_to_default_wb_sync(struct inode *inode) > +{ > + struct backing_dev_info *bdi = inode_to_bdi(inode); > + > + /* while holding I_WB_SWITCH, no one else can