[Cluster-devel] [GFS2 PATCH] GFS2: Wake up io waiters whenever a flush is done
Hi, Before this patch, if a process called function gfs2_log_reserve to reserve some journal blocks, but the journal not enough blocks were free, it would call io_schedule. However, in the log flush daemon, it woke up the waiters only if an gfs2_ail_flush was no longer required. This resulted in situations where processes would wait forever because the number of blocks required was so high that it pushed the journal into a perpetual state of flush being required. This patch changes the logd daemon so that it wakes up io waiters every time the log is actually flushed. Signed-off-by: Bob Peterson --- diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 4df349c..5028a9d 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -918,12 +918,15 @@ int gfs2_logd(void *data) struct gfs2_sbd *sdp = data; unsigned long t = 1; DEFINE_WAIT(wait); + bool did_flush; while (!kthread_should_stop()) { + did_flush = false; if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { gfs2_ail1_empty(sdp); gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + did_flush = true; } if (gfs2_ail_flush_reqd(sdp)) { @@ -931,9 +934,10 @@ int gfs2_logd(void *data) gfs2_ail1_wait(sdp); gfs2_ail1_empty(sdp); gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + did_flush = true; } - if (!gfs2_ail_flush_reqd(sdp)) + if (!gfs2_ail_flush_reqd(sdp) || did_flush) wake_up(&sdp->sd_log_waitq); t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
Re: [Cluster-devel] [GFS2 PATCH] GFS2: Made logd daemon take into account log demand
Hi, Both patches look good to me, Steve. On 05/01/17 21:11, Bob Peterson wrote: Hi, Before this patch, the logd daemon only tried to flush things when the log blocks pinned exceeded a certain threshold. But when we're deleting very large files, it may require a huge number of journal blocks, and that, in turn, may exceed the threshold. This patch factors that into account. Signed-off-by: Bob Peterson --- diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a6a3389..00d8dc2 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -794,6 +794,7 @@ struct gfs2_sbd { atomic_t sd_log_thresh1; atomic_t sd_log_thresh2; atomic_t sd_log_blks_free; + atomic_t sd_log_blks_needed; wait_queue_head_t sd_log_waitq; wait_queue_head_t sd_logd_waitq; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index e58ccef0..4df349c 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -349,6 +349,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) if (gfs2_assert_warn(sdp, blks) || gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) return -EINVAL; + atomic_add(blks, &sdp->sd_log_blks_needed); retry: free_blocks = atomic_read(&sdp->sd_log_blks_free); if (unlikely(free_blocks <= wanted)) { @@ -370,6 +371,7 @@ retry: wake_up(&sdp->sd_reserving_log_wait); goto retry; } + atomic_sub(blks, &sdp->sd_log_blks_needed); trace_gfs2_log_blocks(sdp, -blks); /* @@ -891,13 +893,16 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp) static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) { - return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1)); + return (atomic_read(&sdp->sd_log_pinned) + + atomic_read(&sdp->sd_log_blks_needed) >= + atomic_read(&sdp->sd_log_thresh1)); } static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) { unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); - return used_blocks >= atomic_read(&sdp->sd_log_thresh2); + return used_blocks + atomic_read(&sdp->sd_log_blks_needed) >= + atomic_read(&sdp->sd_log_thresh2); } /** diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ff72ac6..86281a9 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -683,6 +683,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) goto fail_jindex; } + atomic_set(&sdp->sd_log_blks_needed, 0); if (sdp->sd_args.ar_spectator) { sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
[Cluster-devel] [DEBUG PATCH 1/2] mm, debug: report when GFP_NO{FS, IO} is used explicitly from memalloc_no{fs, io}_{save, restore} context
From: Michal Hocko THIS PATCH IS FOR TESTING ONLY AND NOT MEANT TO HIT LINUS TREE It is desirable to reduce the direct GFP_NO{FS,IO} usage at minimum and prefer scope usage defined by memalloc_no{fs,io}_{save,restore} API. Let's help this process and add a debugging tool to catch when an explicit allocation request for GFP_NO{FS,IO} is done from the scope context. The printed stacktrace should help to identify the caller and evaluate whether it can be changed to use a wider context or whether it is called from another potentially dangerous context which needs a scope protection as well. The checks have to be enabled explicitly by debug_scope_gfp kernel command line parameter. Signed-off-by: Michal Hocko --- include/linux/sched.h | 14 +++-- include/linux/slab.h | 3 +++ mm/page_alloc.c | 58 +++ 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2032fc642a26..59428926e989 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1988,6 +1988,8 @@ struct task_struct { /* A live task holds one reference. */ atomic_t stack_refcount; #endif + unsigned long nofs_caller; + unsigned long noio_caller; /* CPU-specific state of this task */ struct thread_struct thread; /* @@ -2345,6 +2347,8 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) +extern void debug_scope_gfp_context(gfp_t gfp_mask); + /* * Applies per-task gfp context to the given allocation flags. * PF_MEMALLOC_NOIO implies GFP_NOIO @@ -2363,25 +2367,31 @@ static inline gfp_t current_gfp_context(gfp_t flags) return flags; } -static inline unsigned int memalloc_noio_save(void) +static inline unsigned int __memalloc_noio_save(unsigned long caller) { unsigned int flags = current->flags & PF_MEMALLOC_NOIO; current->flags |= PF_MEMALLOC_NOIO; + current->noio_caller = caller; return flags; } +#define memalloc_noio_save() __memalloc_noio_save(_RET_IP_) + static inline void memalloc_noio_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; } -static inline unsigned int memalloc_nofs_save(void) +static inline unsigned int __memalloc_nofs_save(unsigned long caller) { unsigned int flags = current->flags & PF_MEMALLOC_NOFS; current->flags |= PF_MEMALLOC_NOFS; + current->nofs_caller = caller; return flags; } +#define memalloc_nofs_save() __memalloc_nofs_save(_RET_IP_) + static inline void memalloc_nofs_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; diff --git a/include/linux/slab.h b/include/linux/slab.h index 084b12bad198..6559668e29db 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -477,6 +477,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) */ static __always_inline void *kmalloc(size_t size, gfp_t flags) { + debug_scope_gfp_context(flags); if (__builtin_constant_p(size)) { if (size > KMALLOC_MAX_CACHE_SIZE) return kmalloc_large(size, flags); @@ -517,6 +518,7 @@ static __always_inline int kmalloc_size(int n) static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { + debug_scope_gfp_context(flags); #ifndef CONFIG_SLOB if (__builtin_constant_p(size) && size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) { @@ -575,6 +577,7 @@ int memcg_update_all_caches(int num_memcgs); */ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) { + debug_scope_gfp_context(flags); if (size != 0 && n > SIZE_MAX / size) return NULL; if (__builtin_constant_p(n) && __builtin_constant_p(size)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5138b46a4295..87a2bb5262b2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3738,6 +3738,63 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, return page; } +static bool debug_scope_gfp; + +static int __init enable_debug_scope_gfp(char *unused) +{ + debug_scope_gfp = true; + return 0; +} + +/* + * spit the stack trace if the given gfp_mask clears flags which are context + * wide cleared. Such a caller can remove special flags clearing and rely on + * the context wide mask. + */ +void debug_scope_gfp_context(gfp_t gfp_mask) +{ + gfp_t restrict_mask; + + if (likely(!debug_scope_gfp)) + return; + + /* both NOFS, NOIO are irrelevant when direct reclaim is disabled */ + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) + return; + + if (current->flags & PF_MEMALLOC_NOIO) + restrict_mask = __GFP_IO; + else if ((current->flags & P
[Cluster-devel] [DEBUG PATCH 2/2] silent warnings which we cannot do anything about
From: Michal Hocko THIS PATCH IS FOR TESTING ONLY AND NOT MEANT TO HIT LINUS TREE There are some code paths used by all the filesystems which we cannot change to drop the GFP_NOFS, yet they generate a lot of warnings. Provide {disable,enable}_scope_gfp_check to silence those. alloc_page_buffers and grow_dev_page are silenced right away. Signed-off-by: Michal Hocko --- fs/buffer.c | 4 include/linux/sched.h | 11 +++ mm/page_alloc.c | 3 +++ 3 files changed, 18 insertions(+) diff --git a/fs/buffer.c b/fs/buffer.c index 28484b3ebc98..dbe529e7881b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -873,7 +873,9 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, head = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { + disable_scope_gfp_check(); bh = alloc_buffer_head(GFP_NOFS); + enable_scope_gfp_check(); if (!bh) goto no_grow; @@ -1003,7 +1005,9 @@ grow_dev_page(struct block_device *bdev, sector_t block, */ gfp_mask |= __GFP_NOFAIL; + disable_scope_gfp_check(); page = find_or_create_page(inode->i_mapping, index, gfp_mask); + enable_scope_gfp_check(); if (!page) return ret; diff --git a/include/linux/sched.h b/include/linux/sched.h index 59428926e989..f60294732ed5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1988,6 +1988,7 @@ struct task_struct { /* A live task holds one reference. */ atomic_t stack_refcount; #endif + bool disable_scope_gfp_warn; unsigned long nofs_caller; unsigned long noio_caller; /* CPU-specific state of this task */ @@ -2390,6 +2391,16 @@ static inline unsigned int __memalloc_nofs_save(unsigned long caller) return flags; } +static inline void disable_scope_gfp_check(void) +{ + current->disable_scope_gfp_warn = true; +} + +static inline void enable_scope_gfp_check(void) +{ + current->disable_scope_gfp_warn = false; +} + #define memalloc_nofs_save() __memalloc_nofs_save(_RET_IP_) static inline void memalloc_nofs_restore(unsigned int flags) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 87a2bb5262b2..5405278bd733 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3762,6 +3762,9 @@ void debug_scope_gfp_context(gfp_t gfp_mask) if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return; + if (current->disable_scope_gfp_warn) + return; + if (current->flags & PF_MEMALLOC_NOIO) restrict_mask = __GFP_IO; else if ((current->flags & PF_MEMALLOC_NOFS) && (gfp_mask & __GFP_IO)) -- 2.11.0
[Cluster-devel] [DEBUG PATCH 0/2] debug explicit GFP_NO{FS, IO} usage from the scope context
These two patches should help to identify explicit GFP_NO{FS,IO} usage from withing a scope context and reduce such a usage as a result. Such a usage can be changed to the full GFP_KERNEL because all the calls from within the NO{FS,IO} scope will drop the __GFP_FS resp. __GFP_IO automatically and if the function is called outside of the scope then we do not need to restrict it to NOFS/NOIO as long as all the reclaim recursion unsafe contexts are marked properly. This means that each such a reported allocation site has to be checked before converted. The debugging has to be enabled explicitly by a kernel command line parameter and then it reports the stack trace of the allocation and also the function which has started the current scope. These two patches are _not_ intended to be merged and they are only aimed at debugging.
[Cluster-devel] [PATCH 5/8] jbd2: mark the transaction context with the scope GFP_NOFS context
From: Michal Hocko now that we have memalloc_nofs_{save,restore} api we can mark the whole transaction context as implicitly GFP_NOFS. All allocations will automatically inherit GFP_NOFS this way. This means that we do not have to mark any of those requests with GFP_NOFS and moreover all the ext4_kv[mz]alloc(GFP_NOFS) are also safe now because even the hardcoded GFP_KERNEL allocations deep inside the vmalloc will be NOFS now. Signed-off-by: Michal Hocko Reviewed-by: Jan Kara --- fs/jbd2/transaction.c | 11 +++ include/linux/jbd2.h | 2 ++ 2 files changed, 13 insertions(+) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e1652665bd93..35a5d3d76182 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -388,6 +388,11 @@ static int start_this_handle(journal_t *journal, handle_t *handle, rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); jbd2_journal_free_transaction(new_transaction); + /* +* Make sure that no allocations done while the transaction is +* open is going to recurse back to the fs layer. +*/ + handle->saved_alloc_context = memalloc_nofs_save(); return 0; } @@ -466,6 +471,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, handle->h_transaction->t_tid, type, line_no, nblocks); + return handle; } EXPORT_SYMBOL(jbd2__journal_start); @@ -1760,6 +1766,11 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_rsv_handle) jbd2_journal_free_reserved(handle->h_rsv_handle); free_and_exit: + /* +* scope of th GFP_NOFS context is over here and so we can +* restore the original alloc context. +*/ + memalloc_nofs_restore(handle->saved_alloc_context); jbd2_free_handle(handle); return err; } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index dfaa1f4dcb0c..606b6bce3a5b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -491,6 +491,8 @@ struct jbd2_journal_handle unsigned long h_start_jiffies; unsigned inth_requested_credits; + + unsigned intsaved_alloc_context; }; -- 2.11.0
[Cluster-devel] [PATCH 2/8] xfs: abstract PF_FSTRANS to PF_MEMALLOC_NOFS
From: Michal Hocko xfs has defined PF_FSTRANS to declare a scope GFP_NOFS semantic quite some time ago. We would like to make this concept more generic and use it for other filesystems as well. Let's start by giving the flag a more generic name PF_MEMALLOC_NOFS which is in line with an exiting PF_MEMALLOC_NOIO already used for the same purpose for GFP_NOIO contexts. Replace all PF_FSTRANS usage from the xfs code in the first step before we introduce a full API for it as xfs uses the flag directly anyway. This patch doesn't introduce any functional change. Signed-off-by: Michal Hocko Reviewed-by: Brian Foster --- fs/xfs/kmem.c | 4 ++-- fs/xfs/kmem.h | 2 +- fs/xfs/libxfs/xfs_btree.c | 2 +- fs/xfs/xfs_aops.c | 6 +++--- fs/xfs/xfs_trans.c| 12 ++-- include/linux/sched.h | 2 ++ 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 339c696bbc01..a76a05dae96b 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -80,13 +80,13 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering * the filesystem here and potentially deadlocking. */ - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) noio_flag = memalloc_noio_save(); lflags = kmem_flags_convert(flags); ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) memalloc_noio_restore(noio_flag); return ptr; diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 689f746224e7..d973dbfc2bfa 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags) lflags = GFP_ATOMIC | __GFP_NOWARN; } else { lflags = GFP_KERNEL | __GFP_NOWARN; - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) lflags &= ~__GFP_FS; } diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 21e6a6ab6b9a..a2672ba4dc33 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2866,7 +2866,7 @@ xfs_btree_split_worker( struct xfs_btree_split_args *args = container_of(work, struct xfs_btree_split_args, work); unsigned long pflags; - unsigned long new_pflags = PF_FSTRANS; + unsigned long new_pflags = PF_MEMALLOC_NOFS; /* * we are in a transaction context here, but may also be doing work diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index ef382bfb402b..d4094bb55033 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc( * We hand off the transaction to the completion thread now, so * clear the flag here. */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return 0; } @@ -252,7 +252,7 @@ xfs_setfilesize_ioend( * thus we need to mark ourselves as being in a transaction manually. * Similarly for freeze protection. */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); /* we abort the update if there was an IO error */ @@ -1015,7 +1015,7 @@ xfs_do_writepage( * Given that we do not allow direct reclaim to call us, we should * never be called while in a filesystem transaction. */ - if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) goto redirty; /* diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 70f42ea86dfb..f5969c8274fc 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -134,7 +134,7 @@ xfs_trans_reserve( boolrsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); /* * Attempt to reserve the needed disk blocks by decrementing @@ -144,7 +144,7 @@ xfs_trans_reserve( if (blocks > 0) { error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); if (error != 0) { - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMAL
[Cluster-devel] [PATCH 7/8] Revert "ext4: avoid deadlocks in the writeback path by using sb_getblk_gfp"
From: Michal Hocko This reverts commit c45653c341f5c8a0ce19c8f0ad4678640849cb86 because sb_getblk_gfp is not really needed as sb_getblk __getblk_gfp __getblk_slow grow_buffers grow_dev_page gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp so __GFP_FS is cleared unconditionally and therefore the above commit didn't have any real effect in fact. This patch should not introduce any functional change. The main point of this change is to reduce explicit GFP_NOFS usage inside ext4 code to make the review of the remaining usage easier. Signed-off-by: Michal Hocko Reviewed-by: Jan Kara --- fs/ext4/extents.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3e295d3350a9..9867b9e5ad8f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -518,7 +518,7 @@ __read_extent_tree_block(const char *function, unsigned int line, struct buffer_head *bh; int err; - bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS); + bh = sb_getblk(inode->i_sb, pblk); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); @@ -1096,7 +1096,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, err = -EFSCORRUPTED; goto cleanup; } - bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); + bh = sb_getblk(inode->i_sb, newblock); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; @@ -1290,7 +1290,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, if (newblock == 0) return err; - bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); + bh = sb_getblk(inode->i_sb, newblock); if (unlikely(!bh)) return -ENOMEM; lock_buffer(bh); -- 2.11.0
[Cluster-devel] [PATCH 0/8 v3] scope GFP_NOFS api
Hi, I have posted the previous version here [1]. Since then I've added some reviewed bys and fixed some minor issues. I've dropped patch 2 [2] based on Dave's request [3]. I agree that this can be done later and doing all at once. I still think that __GFP_NOLOCKDEP should be added by this series to make the further development easier. There didn't seem to be any real objections and so I think we should go and merge this and build further on top. I would like to get rid of all explicit GFP_NOFS usage in ext4 code. I have something half baked already and will send it later on. I also hope we can get further with the xfs as well. I haven't heard anything from btrfs or other filesystems guys which is a bit unfortunate but I do not want to wait for them to much longer, they can join the effort later on. The patchset is based on next-20170106 Diffstat says fs/ext4/acl.c | 6 +++--- fs/ext4/extents.c | 8 fs/ext4/resize.c | 4 ++-- fs/ext4/xattr.c | 4 ++-- fs/jbd2/journal.c | 7 +++ fs/jbd2/transaction.c | 11 +++ fs/xfs/kmem.c | 10 +- fs/xfs/kmem.h | 2 +- fs/xfs/libxfs/xfs_btree.c | 2 +- fs/xfs/xfs_aops.c | 6 +++--- fs/xfs/xfs_buf.c | 8 fs/xfs/xfs_trans.c| 12 ++-- include/linux/gfp.h | 18 +- include/linux/jbd2.h | 2 ++ include/linux/sched.h | 32 ++-- kernel/locking/lockdep.c | 6 +- lib/radix-tree.c | 2 ++ mm/page_alloc.c | 8 +--- mm/vmscan.c | 6 +++--- 19 files changed, 109 insertions(+), 45 deletions(-) Shortlog: Michal Hocko (8): lockdep: allow to disable reclaim lockup detection xfs: abstract PF_FSTRANS to PF_MEMALLOC_NOFS mm: introduce memalloc_nofs_{save,restore} API xfs: use memalloc_nofs_{save,restore} instead of memalloc_noio* jbd2: mark the transaction context with the scope GFP_NOFS context jbd2: make the whole kjournald2 kthread NOFS safe Revert "ext4: avoid deadlocks in the writeback path by using sb_getblk_gfp" Revert "ext4: fix wrong gfp type under transaction" [1] http://lkml.kernel.org/r/20161215140715.12732-1-mho...@kernel.org [2] http://lkml.kernel.org/r/20161215140715.12732-3-mho...@kernel.org [3] http://lkml.kernel.org/r/20161219212413.GN4326@dastard
[Cluster-devel] [PATCH 4/8] xfs: use memalloc_nofs_{save, restore} instead of memalloc_noio*
From: Michal Hocko kmem_zalloc_large and _xfs_buf_map_pages use memalloc_noio_{save,restore} API to prevent from reclaim recursion into the fs because vmalloc can invoke unconditional GFP_KERNEL allocations and these functions might be called from the NOFS contexts. The memalloc_noio_save will enforce GFP_NOIO context which is even weaker than GFP_NOFS and that seems to be unnecessary. Let's use memalloc_nofs_{save,restore} instead as it should provide exactly what we need here - implicit GFP_NOFS context. Changes since v1 - s@memalloc_noio_restore@memalloc_nofs_restore@ in _xfs_buf_map_pages as per Brian Foster Signed-off-by: Michal Hocko --- fs/xfs/kmem.c| 10 +- fs/xfs/xfs_buf.c | 8 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index a76a05dae96b..d69ed5e76621 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -65,7 +65,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) void * kmem_zalloc_large(size_t size, xfs_km_flags_t flags) { - unsigned noio_flag = 0; + unsigned nofs_flag = 0; void*ptr; gfp_t lflags; @@ -80,14 +80,14 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering * the filesystem here and potentially deadlocking. */ - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) - noio_flag = memalloc_noio_save(); + if (flags & KM_NOFS) + nofs_flag = memalloc_nofs_save(); lflags = kmem_flags_convert(flags); ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) - memalloc_noio_restore(noio_flag); + if (flags & KM_NOFS) + memalloc_nofs_restore(nofs_flag); return ptr; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 7f0a01f7b592..8cb8dd4cdfd8 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -441,17 +441,17 @@ _xfs_buf_map_pages( bp->b_addr = NULL; } else { int retried = 0; - unsigned noio_flag; + unsigned nofs_flag; /* * vm_map_ram() will allocate auxillary structures (e.g. * pagetables) with GFP_KERNEL, yet we are likely to be under * GFP_NOFS context here. Hence we need to tell memory reclaim -* that we are in such a context via PF_MEMALLOC_NOIO to prevent +* that we are in such a context via PF_MEMALLOC_NOFS to prevent * memory reclaim re-entering the filesystem here and * potentially deadlocking. */ - noio_flag = memalloc_noio_save(); + nofs_flag = memalloc_nofs_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, -1, PAGE_KERNEL); @@ -459,7 +459,7 @@ _xfs_buf_map_pages( break; vm_unmap_aliases(); } while (retried++ <= 1); - memalloc_noio_restore(noio_flag); + memalloc_nofs_restore(nofs_flag); if (!bp->b_addr) return -ENOMEM; -- 2.11.0
[Cluster-devel] [PATCH 8/8] Revert "ext4: fix wrong gfp type under transaction"
From: Michal Hocko This reverts commit 216553c4b7f3e3e2beb4981cddca9b2027523928. Now that the transaction context uses memalloc_nofs_save and all allocations within the this context inherit GFP_NOFS automatically, there is no reason to mark specific allocations explicitly. This patch should not introduce any functional change. The main point of this change is to reduce explicit GFP_NOFS usage inside ext4 code to make the review of the remaining usage easier. Signed-off-by: Michal Hocko Reviewed-by: Jan Kara --- fs/ext4/acl.c | 6 +++--- fs/ext4/extents.c | 2 +- fs/ext4/resize.c | 4 ++-- fs/ext4/xattr.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index fd389935ecd1..9e98092c2a4b 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -32,7 +32,7 @@ ext4_acl_from_disk(const void *value, size_t size) return ERR_PTR(-EINVAL); if (count == 0) return NULL; - acl = posix_acl_alloc(count, GFP_NOFS); + acl = posix_acl_alloc(count, GFP_KERNEL); if (!acl) return ERR_PTR(-ENOMEM); for (n = 0; n < count; n++) { @@ -94,7 +94,7 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) *size = ext4_acl_size(acl->a_count); ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * - sizeof(ext4_acl_entry), GFP_NOFS); + sizeof(ext4_acl_entry), GFP_KERNEL); if (!ext_acl) return ERR_PTR(-ENOMEM); ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); @@ -159,7 +159,7 @@ ext4_get_acl(struct inode *inode, int type) } retval = ext4_xattr_get(inode, name_index, "", NULL, 0); if (retval > 0) { - value = kmalloc(retval, GFP_NOFS); + value = kmalloc(retval, GFP_KERNEL); if (!value) return ERR_PTR(-ENOMEM); retval = ext4_xattr_get(inode, name_index, "", value, retval); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9867b9e5ad8f..0371e7aa7bea 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2933,7 +2933,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, le16_to_cpu(path[k].p_hdr->eh_entries)+1; } else { path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), - GFP_NOFS); + GFP_KERNEL); if (path == NULL) { ext4_journal_stop(handle); return -ENOMEM; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index cf681004b196..e121f4e048b8 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -816,7 +816,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, n_group_desc = ext4_kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), -GFP_NOFS); +GFP_KERNEL); if (!n_group_desc) { err = -ENOMEM; ext4_warning(sb, "not enough memory for %lu groups", @@ -943,7 +943,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, int res, i; int err; - primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); + primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL); if (!primary) return -ENOMEM; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 5a94fa52b74f..172317462238 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -875,7 +875,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, unlock_buffer(bs->bh); ea_bdebug(bs->bh, "cloning"); - s->base = kmalloc(bs->bh->b_size, GFP_NOFS); + s->base = kmalloc(bs->bh->b_size, GFP_KERNEL); error = -ENOMEM; if (s->base == NULL) goto cleanup; @@ -887,7 +887,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, } } else { /* Allocate a buffer where we construct the new block. */ - s->base = kzalloc(sb->s_blocksize, GFP_NOFS); + s->base = kzalloc(sb->s_blocksize, GFP_KERNEL); /* assert(header == s->base) */ error = -ENOMEM; if (s->base == NULL) -- 2.11.0
[Cluster-devel] [PATCH 6/8] jbd2: make the whole kjournald2 kthread NOFS safe
From: Michal Hocko kjournald2 is central to the transaction commit processing. As such any potential allocation from this kernel thread has to be GFP_NOFS. Make sure to mark the whole kernel thread GFP_NOFS by the memalloc_nofs_save. Suggested-by: Jan Kara Signed-off-by: Michal Hocko Reviewed-by: Jan Kara --- fs/jbd2/journal.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a097048ed1a3..3a449150f834 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -206,6 +206,13 @@ static int kjournald2(void *arg) wake_up(&journal->j_wait_done_commit); /* +* Make sure that no allocations from this kernel thread will ever recurse +* to the fs layer because we are responsible for the transaction commit +* and any fs involvement might get stuck waiting for the trasn. commit. +*/ + memalloc_nofs_save(); + + /* * And now, wait forever for commit wakeup events. */ write_lock(&journal->j_state_lock); -- 2.11.0
[Cluster-devel] [PATCH 1/8] lockdep: allow to disable reclaim lockup detection
From: Michal Hocko The current implementation of the reclaim lockup detection can lead to false positives and those even happen and usually lead to tweak the code to silence the lockdep by using GFP_NOFS even though the context can use __GFP_FS just fine. See http://lkml.kernel.org/r/20160512080321.GA18496@dastard as an example. = [ INFO: inconsistent lock state ] 4.5.0-rc2+ #4 Tainted: G O - inconsistent {RECLAIM_FS-ON-R} -> {IN-RECLAIM_FS-W} usage. kswapd0/543 [HC0[0]:SC0[0]:HE1:SE1] takes: (&xfs_nondir_ilock_class){-+}, at: [] xfs_ilock+0x177/0x200 [xfs] {RECLAIM_FS-ON-R} state was registered at: [] mark_held_locks+0x79/0xa0 [] lockdep_trace_alloc+0xb3/0x100 [] kmem_cache_alloc+0x33/0x230 [] kmem_zone_alloc+0x81/0x120 [xfs] [] xfs_refcountbt_init_cursor+0x3e/0xa0 [xfs] [] __xfs_refcount_find_shared+0x75/0x580 [xfs] [] xfs_refcount_find_shared+0x84/0xb0 [xfs] [] xfs_getbmap+0x608/0x8c0 [xfs] [] xfs_vn_fiemap+0xab/0xc0 [xfs] [] do_vfs_ioctl+0x498/0x670 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x12/0x6f CPU0 lock(&xfs_nondir_ilock_class); lock(&xfs_nondir_ilock_class); *** DEADLOCK *** 3 locks held by kswapd0/543: stack backtrace: CPU: 0 PID: 543 Comm: kswapd0 Tainted: G O4.5.0-rc2+ #4 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 82a34f10 88003aa078d0 813a14f9 88003d8551c0 88003aa07920 8110ec65 0001 8801 000b 0008 88003d855aa0 Call Trace: [] dump_stack+0x4b/0x72 [] print_usage_bug+0x215/0x240 [] mark_lock+0x1f5/0x660 [] ? print_shortest_lock_dependencies+0x1a0/0x1a0 [] __lock_acquire+0xa80/0x1e50 [] ? kmem_cache_alloc+0x15e/0x230 [] ? kmem_zone_alloc+0x81/0x120 [xfs] [] lock_acquire+0xd8/0x1e0 [] ? xfs_ilock+0x177/0x200 [xfs] [] ? xfs_reflink_cancel_cow_range+0x150/0x300 [xfs] [] down_write_nested+0x5e/0xc0 [] ? xfs_ilock+0x177/0x200 [xfs] [] xfs_ilock+0x177/0x200 [xfs] [] xfs_reflink_cancel_cow_range+0x150/0x300 [xfs] [] xfs_fs_evict_inode+0xdc/0x1e0 [xfs] [] evict+0xc5/0x190 [] dispose_list+0x39/0x60 [] prune_icache_sb+0x4b/0x60 [] super_cache_scan+0x14f/0x1a0 [] shrink_slab.part.63.constprop.79+0x1e9/0x4e0 [] shrink_zone+0x15e/0x170 [] kswapd+0x4f1/0xa80 [] ? zone_reclaim+0x230/0x230 [] kthread+0xf2/0x110 [] ? kthread_create_on_node+0x220/0x220 [] ret_from_fork+0x3f/0x70 [] ? kthread_create_on_node+0x220/0x220 To quote Dave: " Ignoring whether reflink should be doing anything or not, that's a "xfs_refcountbt_init_cursor() gets called both outside and inside transactions" lockdep false positive case. The problem here is lockdep has seen this allocation from within a transaction, hence a GFP_NOFS allocation, and now it's seeing it in a GFP_KERNEL context. Also note that we have an active reference to this inode. So, because the reclaim annotations overload the interrupt level detections and it's seen the inode ilock been taken in reclaim ("interrupt") context, this triggers a reclaim context warning where it thinks it is unsafe to do this allocation in GFP_KERNEL context holding the inode ilock... " This sounds like a fundamental problem of the reclaim lock detection. It is really impossible to annotate such a special usecase IMHO unless the reclaim lockup detection is reworked completely. Until then it is much better to provide a way to add "I know what I am doing flag" and mark problematic places. This would prevent from abusing GFP_NOFS flag which has a runtime effect even on configurations which have lockdep disabled. Introduce __GFP_NOLOCKDEP flag which tells the lockdep gfp tracking to skip the current allocation request. While we are at it also make sure that the radix tree doesn't accidentaly override tags stored in the upper part of the gfp_mask. Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Signed-off-by: Michal Hocko --- include/linux/gfp.h | 10 +- kernel/locking/lockdep.c | 4 lib/radix-tree.c | 2 ++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4175dca4ac39..1a934383cc20 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -41,6 +41,11 @@ struct vm_area_struct; #define ___GFP_OTHER_NODE 0x80u #define ___GFP_WRITE 0x100u #define ___GFP_KSWAPD_RECLAIM 0x200u +#ifdef CONFIG_LOCKDEP +#define ___GFP_NOLOCKDEP 0x400u +#else +#define ___GFP_NOLOCKDEP 0 +#endif /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -186,8 +191,11 @@ struct vm_area_struct; #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) +/* Disable lockdep for GFP context tracking */ +#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
[Cluster-devel] [PATCH 3/8] mm: introduce memalloc_nofs_{save, restore} API
From: Michal Hocko GFP_NOFS context is used for the following 5 reasons currently - to prevent from deadlocks when the lock held by the allocation context would be needed during the memory reclaim - to prevent from stack overflows during the reclaim because the allocation is performed from a deep context already - to prevent lockups when the allocation context depends on other reclaimers to make a forward progress indirectly - just in case because this would be safe from the fs POV - silence lockdep false positives Unfortunately overuse of this allocation context brings some problems to the MM. Memory reclaim is much weaker (especially during heavy FS metadata workloads), OOM killer cannot be invoked because the MM layer doesn't have enough information about how much memory is freeable by the FS layer. In many cases it is far from clear why the weaker context is even used and so it might be used unnecessarily. We would like to get rid of those as much as possible. One way to do that is to use the flag in scopes rather than isolated cases. Such a scope is declared when really necessary, tracked per task and all the allocation requests from within the context will simply inherit the GFP_NOFS semantic. Not only this is easier to understand and maintain because there are much less problematic contexts than specific allocation requests, this also helps code paths where FS layer interacts with other layers (e.g. crypto, security modules, MM etc...) and there is no easy way to convey the allocation context between the layers. Introduce memalloc_nofs_{save,restore} API to control the scope of GFP_NOFS allocation context. This is basically copying memalloc_noio_{save,restore} API we have for other restricted allocation context GFP_NOIO. The PF_MEMALLOC_NOFS flag already exists and it is just an alias for PF_FSTRANS which has been xfs specific until recently. There are no more PF_FSTRANS users anymore so let's just drop it. PF_MEMALLOC_NOFS is now checked in the MM layer and drops __GFP_FS implicitly same as PF_MEMALLOC_NOIO drops __GFP_IO. memalloc_noio_flags is renamed to current_gfp_context because it now cares about both PF_MEMALLOC_NOFS and PF_MEMALLOC_NOIO contexts. Xfs code paths preserve their semantic. kmem_flags_convert() doesn't need to evaluate the flag anymore. This patch shouldn't introduce any functional changes. Let's hope that filesystems will drop direct GFP_NOFS (resp. ~__GFP_FS) usage as much as possible and only use a properly documented memalloc_nofs_{save,restore} checkpoints where they are appropriate. Signed-off-by: Michal Hocko --- fs/xfs/kmem.h| 2 +- include/linux/gfp.h | 8 include/linux/sched.h| 34 ++ kernel/locking/lockdep.c | 2 +- mm/page_alloc.c | 8 +--- mm/vmscan.c | 6 +++--- 6 files changed, 44 insertions(+), 16 deletions(-) diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index d973dbfc2bfa..ae08cfd9552a 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags) lflags = GFP_ATOMIC | __GFP_NOWARN; } else { lflags = GFP_KERNEL | __GFP_NOWARN; - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) + if (flags & KM_NOFS) lflags &= ~__GFP_FS; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 1a934383cc20..bfe53d95c25b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -217,8 +217,16 @@ struct vm_area_struct; * * GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. + * Please try to avoid using this flag directly and instead use + * memalloc_noio_{save,restore} to mark the whole scope which cannot + * perform any IO with a short explanation why. All allocation requests + * will inherit GFP_NOIO implicitly. * * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces. + * Please try to avoid using this flag directly and instead use + * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't + * recurse into the FS layer with a short explanation why. All allocation + * requests will inherit GFP_NOFS implicitly. * * GFP_USER is for userspace allocations that also need to be directly * accessibly by the kernel or hardware. It is typically used by hardware diff --git a/include/linux/sched.h b/include/linux/sched.h index abeb84604d32..2032fc642a26 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2307,9 +2307,9 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_USED_ASYNC 0x4000 /* used async_schedule*(), used by module init */ #define PF_NOFREEZE0x8000 /* this thread should not be frozen */ #define PF_FROZEN