[Patch v2 1/2] libiscsi: Fix use-after-free race during iscsi_session_teardown
Session attributes exposed through sysfs were freed before the device was destroyed, resulting in a potential use-after-free. Free these attributes after removing the device. Signed-off-by: Khazhismel Kumykov <kha...@google.com> --- drivers/scsi/libiscsi.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 42381adf0769..8696a51a5a0c 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -2851,9 +2851,6 @@ EXPORT_SYMBOL_GPL(iscsi_session_setup); /** * iscsi_session_teardown - destroy session, host, and cls_session * @cls_session: iscsi session - * - * The driver must have called iscsi_remove_session before - * calling this. */ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) { @@ -2863,6 +2860,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) iscsi_pool_free(>cmdpool); + iscsi_remove_session(cls_session); + kfree(session->password); kfree(session->password_in); kfree(session->username); @@ -2877,7 +2876,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) kfree(session->portal_type); kfree(session->discovery_parent_type); - iscsi_destroy_session(cls_session); + iscsi_free_session(cls_session); + iscsi_host_dec_session_cnt(shost); module_put(owner); } -- 2.13.2.932.g7449e964c-goog smime.p7s Description: S/MIME Cryptographic Signature
[Patch v2 2/2] libiscsi: Remove iscsi_destroy_session
iscsi_session_teardown was the only user of this function. Function currently is just short for iscsi_remove_session + iscsi_free_session. Signed-off-by: Khazhismel Kumykov <kha...@google.com> --- drivers/scsi/scsi_transport_iscsi.c | 16 include/scsi/scsi_transport_iscsi.h | 1 - 2 files changed, 17 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index a424eaeafeb0..924ac408d8a9 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2210,22 +2210,6 @@ void iscsi_free_session(struct iscsi_cls_session *session) } EXPORT_SYMBOL_GPL(iscsi_free_session); -/** - * iscsi_destroy_session - destroy iscsi session - * @session: iscsi_session - * - * Can be called by a LLD or iscsi_transport. There must not be - * any running connections. - */ -int iscsi_destroy_session(struct iscsi_cls_session *session) -{ - iscsi_remove_session(session); - ISCSI_DBG_TRANS_SESSION(session, "Completing session destruction\n"); - iscsi_free_session(session); - return 0; -} -EXPORT_SYMBOL_GPL(iscsi_destroy_session); - /** * iscsi_create_conn - create iscsi class connection * @session: iscsi cls session diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 6183d20a01fb..b266d2a3bcb1 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -434,7 +434,6 @@ extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost, unsigned int target_id); extern void iscsi_remove_session(struct iscsi_cls_session *session); extern void iscsi_free_session(struct iscsi_cls_session *session); -extern int iscsi_destroy_session(struct iscsi_cls_session *session); extern struct iscsi_cls_conn *iscsi_create_conn(struct iscsi_cls_session *sess, int dd_size, uint32_t cid); extern int iscsi_destroy_conn(struct iscsi_cls_conn *conn); -- 2.13.2.932.g7449e964c-goog smime.p7s Description: S/MIME Cryptographic Signature
[PATCH] libiscsi: Remove iscsi_destroy_session
iscsi_session_teardown was the only user of this function. Function currently is just short for iscsi_remove_session + iscsi_free_session. Signed-off-by: Khazhismel Kumykov <kha...@google.com> --- with "libiscsi: Fix use after free race during iscsi_session_teardown" removing the last user. drivers/scsi/scsi_transport_iscsi.c | 16 include/scsi/scsi_transport_iscsi.h | 1 - 2 files changed, 17 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index a424eaeafeb0..924ac408d8a9 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2210,22 +2210,6 @@ void iscsi_free_session(struct iscsi_cls_session *session) } EXPORT_SYMBOL_GPL(iscsi_free_session); -/** - * iscsi_destroy_session - destroy iscsi session - * @session: iscsi_session - * - * Can be called by a LLD or iscsi_transport. There must not be - * any running connections. - */ -int iscsi_destroy_session(struct iscsi_cls_session *session) -{ - iscsi_remove_session(session); - ISCSI_DBG_TRANS_SESSION(session, "Completing session destruction\n"); - iscsi_free_session(session); - return 0; -} -EXPORT_SYMBOL_GPL(iscsi_destroy_session); - /** * iscsi_create_conn - create iscsi class connection * @session: iscsi cls session diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 6183d20a01fb..b266d2a3bcb1 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -434,7 +434,6 @@ extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost, unsigned int target_id); extern void iscsi_remove_session(struct iscsi_cls_session *session); extern void iscsi_free_session(struct iscsi_cls_session *session); -extern int iscsi_destroy_session(struct iscsi_cls_session *session); extern struct iscsi_cls_conn *iscsi_create_conn(struct iscsi_cls_session *sess, int dd_size, uint32_t cid); extern int iscsi_destroy_conn(struct iscsi_cls_conn *conn); -- 
2.13.2.932.g7449e964c-goog smime.p7s Description: S/MIME Cryptographic Signature
[PATCH] libiscsi: Fix use-after-free race during iscsi_session_teardown
Session attributes exposed through sysfs were freed before the device was destroyed, resulting in a potential use-after-free. Free these attributes after removing the device. Signed-off-by: Khazhismel Kumykov <kha...@google.com> --- drivers/scsi/libiscsi.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 42381adf0769..f9199bebaec7 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -2851,9 +2851,6 @@ EXPORT_SYMBOL_GPL(iscsi_session_setup); /** * iscsi_session_teardown - destroy session, host, and cls_session * @cls_session: iscsi session - * - * The driver must have called iscsi_remove_session before - * calling this. */ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) { @@ -2863,6 +2860,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) iscsi_pool_free(>cmdpool); + iscsi_remove_session(session); + kfree(session->password); kfree(session->password_in); kfree(session->username); @@ -2877,7 +2876,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) kfree(session->portal_type); kfree(session->discovery_parent_type); - iscsi_destroy_session(cls_session); + iscsi_free_session(cls_session); + iscsi_host_dec_session_cnt(shost); module_put(owner); } -- 2.13.2.932.g7449e964c-goog smime.p7s Description: S/MIME Cryptographic Signature
Re: Hang/soft lockup in d_invalidate with simultaneous calls
On Mon, May 15, 2017 at 5:05 PM, Khazhismel Kumykov <kha...@google.com> wrote: > Hi, > > I'm seeing behavior in d_invalidate, if multiple threads call d_invalidate on > the same tree at the same, behavior time blows up and all the calls hang with > large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k > entries in d_subdir, and 5 or so threads calling d_invalidate was able to hang > my test VMs) > > This seems to be due to thrashing on the dentry locks in multiple threads > tightly looping calling d_walk waiting for a shrink_dentry_list call (also > thrashing the locks) to complete. (d_invalidate loops calling d_walk so long > as > any dentry in the tree is in a dcache shrink list). > > There was a similar report recently "soft lockup in d_invalidate()" that > proposed in the d_invalidate d_walk to ignore dentries marked as in a shrink > list already, which does prevent this hang/lockup in this case as well, > although > I'm not sure it doesn't violate the contract for d_invalidate. (Although the > entries in a shrink list should be dropped eventually, not necessarily by the > time d_invalidate returns). > > If someone more familiar with the dcache could provide input I would > appreciate. > > A reliable repro on mainline is: > - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. fuse passthrough > - create a directory and populate with ~100k files with content to > populate dcache > - create some background processes opening/reading files in this folder (5 > while true; cat $file was enough to get an indefinite hang for me) > - cause the directory to need to be invalidated (e.g., make_bad_inode on the > directory) > > This results in the background processes all entering d_invalidate and > hanging, > while with just one process in d_invalidate (e.g., stat'ing a file in the dir) > things go pretty quickly as expected. 
> > > (The proposed patch from other thread) > > diff --git a/fs/dcache.c b/fs/dcache.c > index 7b8feb6..3a3b0f37 100644 > --- a/fs/dcache.c > +++ b/fs/dcache.c > @@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data, > struct dentry *dentry) > goto out; > > if (dentry->d_flags & DCACHE_SHRINK_LIST) { > - data->found++; > + goto out; > } else { > if (dentry->d_flags & DCACHE_LRU_LIST) > d_lru_del(dentry); > > > khazhy Would this change actually violate any guarantees? select_collect looks like it used to ignore unused dentries that were part of a shrink list before fe91522a7ba82ca1a51b07e19954b3825e4aaa22 (found would not be incremented). Once the dentry is on a shrink list would it be unreachable anyways, so returning from d_invalidate before all the shrink lists finish processing would be ok? khazhy smime.p7s Description: S/MIME Cryptographic Signature
Hang/soft lockup in d_invalidate with simultaneous calls
Hi, I'm seeing behavior in d_invalidate, if multiple threads call d_invalidate on the same tree at the same time, behavior blows up and all the calls hang with large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k entries in d_subdir, and 5 or so threads calling d_invalidate was able to hang my test VMs) This seems to be due to thrashing on the dentry locks in multiple threads tightly looping calling d_walk waiting for a shrink_dentry_list call (also thrashing the locks) to complete. (d_invalidate loops calling d_walk so long as any dentry in the tree is in a dcache shrink list). There was a similar report recently "soft lockup in d_invalidate()" that proposed in the d_invalidate d_walk to ignore dentries marked as in a shrink list already, which does prevent this hang/lockup in this case as well, although I'm not sure it doesn't violate the contract for d_invalidate. (Although the entries in a shrink list should be dropped eventually, not necessarily by the time d_invalidate returns). If someone more familiar with the dcache could provide input I would appreciate it. A reliable repro on mainline is: - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. fuse passthrough - create a directory and populate with ~100k files with content to populate dcache - create some background processes opening/reading files in this folder (5 while true; cat $file was enough to get an indefinite hang for me) - cause the directory to need to be invalidated (e.g., make_bad_inode on the directory) This results in the background processes all entering d_invalidate and hanging, while with just one process in d_invalidate (e.g., stat'ing a file in the dir) things go pretty quickly as expected. 
(The proposed patch from other thread) diff --git a/fs/dcache.c b/fs/dcache.c index 7b8feb6..3a3b0f37 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) goto out; if (dentry->d_flags & DCACHE_SHRINK_LIST) { - data->found++; + goto out; } else { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); khazhy smime.p7s Description: S/MIME Cryptographic Signature
Re: [PATCH] ext4: Return EIO on read error in ext4_find_entry
On Fri, Jun 23, 2017 at 2:36 PM, Andreas Dilger wrote: > On Jun 23, 2017, at 6:26 AM, Theodore Ts'o wrote: >> >> The problem is that if we continue, successive reads may all take >> seconds or minutes to fail, thus tieing up the process for a long >> time. > > Sorry, I don't understand where the seconds or minutes of delay come from? > Is that because of long SCSI retries in the block layer, or in the disk > itself, or something caused specifically because of this code? For a networked block device we may be retrying for a while before giving up, although this also applies to the initial failed read. > >> By returning EIO right away, we can "fast fail". > > But it seems like you don't necessarily need to fail at all? Something like > the > following would return an error if the entry is not found, but still search > the > rest of the leaf blocks (if any) before giving up: > Giving up early or checking future blocks both work, critical thing here is not returning NULL after seeing a read error. Previously to this the behavior was to continue to check future blocks after a read error, and it seemed OK. Khazhy smime.p7s Description: S/MIME Cryptographic Signature
[PATCH] ext4: Return EIO on read error in ext4_find_entry
Previously, a read error would be ignored and we would eventually return NULL from ext4_find_entry, which signals "no such file or directory". We should be returning EIO. Signed-off-by: Khazhismel Kumykov <kha...@google.com> --- fs/ext4/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 404256caf9cf..6fa17e9f7b6d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1428,11 +1428,11 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - /* read error, skip block & hope for the best */ EXT4_ERROR_INODE(dir, "reading directory lblock %lu", (unsigned long) block); brelse(bh); - goto next; + ret = ERR_PTR(-EIO); + goto cleanup_and_exit; } if (!buffer_verified(bh) && !is_dx_internal_node(dir, block, -- 2.13.1.611.g7e3b11ae1-goog smime.p7s Description: S/MIME Cryptographic Signature
Re: Hang/soft lockup in d_invalidate with simultaneous calls
On Wed, May 17, 2017 at 2:58 PM Khazhismel Kumykov <kha...@google.com> wrote: > > On Mon, May 15, 2017 at 5:05 PM, Khazhismel Kumykov <kha...@google.com> wrote: > > Hi, > > > > I'm seeing behavior in d_invalidate, if multiple threads call d_invalidate > > on > > the same tree at the same, behavior time blows up and all the calls hang > > with > > large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k > > entries in d_subdir, and 5 or so threads calling d_invalidate was able to > > hang > > my test VMs) > > > > This seems to be due to thrashing on the dentry locks in multiple threads > > tightly looping calling d_walk waiting for a shrink_dentry_list call (also > > thrashing the locks) to complete. (d_invalidate loops calling d_walk so > > long as > > any dentry in the tree is in a dcache shrink list). > > > > There was a similar report recently "soft lockup in d_invalidate()" that > > proposed in the d_invalidate d_walk to ignore dentries marked as in a shrink > > list already, which does prevent this hang/lockup in this case as well, > > although > > I'm not sure it doesn't violate the contract for d_invalidate. (Although the > > entries in a shrink list should be dropped eventually, not necessarily by > > the > > time d_invalidate returns). > > > > If someone more familiar with the dcache could provide input I would > > appreciate. > > > > A reliable repro on mainline is: > > - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. 
fuse passthrough > > - create a directory and populate with ~100k files with content to > > populate dcache > > - create some background processes opening/reading files in this folder (5 > > while true; cat $file was enough to get an indefinite hang for me) > > - cause the directory to need to be invalidated (e.g., make_bad_inode on > > the > > directory) > > > > This results in the background processes all entering d_invalidate and > > hanging, > > while with just one process in d_invalidate (e.g., stat'ing a file in the > > dir) > > things go pretty quickly as expected. > > > > > > (The proposed patch from other thread) > > > > diff --git a/fs/dcache.c b/fs/dcache.c > > index 7b8feb6..3a3b0f37 100644 > > --- a/fs/dcache.c > > +++ b/fs/dcache.c > > @@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data, > > struct dentry *dentry) > > goto out; > > > > if (dentry->d_flags & DCACHE_SHRINK_LIST) { > > - data->found++; > > + goto out; > > } else { > > if (dentry->d_flags & DCACHE_LRU_LIST) > > d_lru_del(dentry); > > > > > > khazhy > > Would this change actually violate any guarantees? select_collect > looks like it used to ignore unused dentries that were part of a > shrink list before fe91522a7ba82ca1a51b07e19954b3825e4aaa22 (found > would not be incremented). Once the dentry is on a shrink list would > it be unreachable anyways, so returning from d_invalidate before all > the shrink lists finish processing would be ok? > > khazhy Pinging in case this got missed, would appreciate thoughts from someone more familiar with the dcache. My previous email wasn't completely correct, while before fe91522a7ba8 ("don't remove from shrink list in select_collect()") d_invalidate would not busy wait for other workers calling shrink_list to compete, it would return -EBUSY, rather than success, so a change to return success without waiting would not be equivalent behavior before. 
Currently, we will loop calling d_walk repeatedly causing the extreme slowdown observed. I still want to understand better, in d_invalidate if we can return without pruning in-use dentries safely, would returning before unused dentries are pruned, so long as we know they will be pruned by another task (are on a shrink list), be safe as well? If not, would it make more sense to have a mutual exclusion on calling d_invalidate on the same dentries simultaneously? khazhy
Re: Hang/soft lockup in d_invalidate with simultaneous calls
On Mon, May 22, 2017 at 11:18 AM, Khazhismel Kumykov <kha...@google.com> wrote: > On Wed, May 17, 2017 at 2:58 PM Khazhismel Kumykov <kha...@google.com> wrote: >> >> On Mon, May 15, 2017 at 5:05 PM, Khazhismel Kumykov <kha...@google.com> >> wrote: >> > Hi, >> > >> > I'm seeing behavior in d_invalidate, if multiple threads call d_invalidate >> > on >> > the same tree at the same, behavior time blows up and all the calls hang >> > with >> > large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k >> > entries in d_subdir, and 5 or so threads calling d_invalidate was able to >> > hang >> > my test VMs) >> > >> > This seems to be due to thrashing on the dentry locks in multiple threads >> > tightly looping calling d_walk waiting for a shrink_dentry_list call (also >> > thrashing the locks) to complete. (d_invalidate loops calling d_walk so >> > long as >> > any dentry in the tree is in a dcache shrink list). >> > >> > There was a similar report recently "soft lockup in d_invalidate()" that >> > proposed in the d_invalidate d_walk to ignore dentries marked as in a >> > shrink >> > list already, which does prevent this hang/lockup in this case as well, >> > although >> > I'm not sure it doesn't violate the contract for d_invalidate. (Although >> > the >> > entries in a shrink list should be dropped eventually, not necessarily by >> > the >> > time d_invalidate returns). >> > >> > If someone more familiar with the dcache could provide input I would >> > appreciate. >> > >> > A reliable repro on mainline is: >> > - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. 
fuse passthrough >> > - create a directory and populate with ~100k files with content to >> > populate dcache >> > - create some background processes opening/reading files in this folder (5 >> > while true; cat $file was enough to get an indefinite hang for me) >> > - cause the directory to need to be invalidated (e.g., make_bad_inode on >> > the >> > directory) >> > >> > This results in the background processes all entering d_invalidate and >> > hanging, >> > while with just one process in d_invalidate (e.g., stat'ing a file in the >> > dir) >> > things go pretty quickly as expected. >> > >> > >> > (The proposed patch from other thread) >> > >> > diff --git a/fs/dcache.c b/fs/dcache.c >> > index 7b8feb6..3a3b0f37 100644 >> > --- a/fs/dcache.c >> > +++ b/fs/dcache.c >> > @@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data, >> > struct dentry *dentry) >> > goto out; >> > >> > if (dentry->d_flags & DCACHE_SHRINK_LIST) { >> > - data->found++; >> > + goto out; >> > } else { >> > if (dentry->d_flags & DCACHE_LRU_LIST) >> > d_lru_del(dentry); >> > >> > >> > khazhy >> >> Would this change actually violate any guarantees? select_collect >> looks like it used to ignore unused dentries that were part of a >> shrink list before fe91522a7ba82ca1a51b07e19954b3825e4aaa22 (found >> would not be incremented). Once the dentry is on a shrink list would >> it be unreachable anyways, so returning from d_invalidate before all >> the shrink lists finish processing would be ok? >> >> khazhy > > Pinging in case this got missed, would appreciate thoughts from someone more > familiar with the dcache. > > My previous email wasn't completely correct, while before fe91522a7ba8 > ("don't remove from shrink list in select_collect()") d_invalidate > would not busy > wait for other workers calling shrink_list to compete, it would return -EBUSY, > rather than success, so a change to return success without waiting would not > be equivalent behavior before. 
Currently, we will loop calling d_walk > repeatedly > causing the extreme slowdown observed. > > I still want to understand better, in d_invalidate if we can return > without pruning > in-use dentries safely, would returning before unused dentries are > pruned, so long > as we know they will be pruned by another task (are on a shrink list), > be safe as well? > If not, would it make more sense to have a mutual exclusion on calling > d_invalidate on the same dentries simultaneously?
Re: Hang/soft lockup in d_invalidate with simultaneous calls
On Fri, Jun 2, 2017 at 11:20 PM, Al Viro wrote: > The thing is, unlike shrink_dcache_parent() we *can* bugger off as > soon as we'd found no victims, nothing mounted and dentry itself > is unhashed. We can't do anything in select_collect() (we would've > broken shrink_dcache_parent() that way), but we can do unhashing > in check_and_drop() in "really nothing to do" case and we can return > from d_invalidate() after that. So how about this: That does the trick. smime.p7s Description: S/MIME Cryptographic Signature
Re: Hang/soft lockup in d_invalidate with simultaneous calls
On Fri, Jun 2, 2017 at 6:12 PM, Al Viro wrote: > Part of that could be relieved if we turned check_and_drop() into > static void check_and_drop(void *_data) > { > struct detach_data *data = _data; > > if (!data->mountpoint && list_empty(&data->select.dispose)) > __d_drop(data->select.start); > } So with this change, d_invalidate will drop the starting dentry before all its children are dropped? Would it make sense to just drop it right off the bat, and let one task handle shrinking all its children? > > It doesn't solve the entire problem, though - we still could get too > many threads into that thing before any of them gets to that __d_drop(). Yes, the change isn't sufficient for my repro, as many threads get to the loop before the drop, although new tasks don't get stuck in the same loop after the dentry is dropped. > I wonder if we should try and collect more victims; that could lead > to contention of its own, but... From my understanding, the contention is the worst when one task is shrinking everything, and several other tasks are busily looping walking the dentry until everything is done. In this case, the tasks busily looping d_walk hold the d_lock for a dentry while walking over all its children, then soon after it finishes the d_walk, it queues again to walk again, while shrink_dentry_list releases and re-grabs for each entry. If we limit the number of items we pass to shrink_dentry_list at one time things actually look quite a bit better. e.g., in testing arbitrarily limiting select.dispose to 1000 elements does fix my repro. e.g. 
diff --git a/fs/dcache.c b/fs/dcache.c index 22af360ceca3..3892e0eb7ec2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1367,6 +1367,7 @@ struct select_data { struct dentry *start; struct list_head dispose; int found; + int actually_found; }; static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) @@ -1388,6 +1389,7 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (!dentry->d_lockref.count) { d_shrink_add(dentry, >dispose); data->found++; + data->actually_found++; } } /* @@ -1397,6 +1399,9 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) */ if (!list_empty(>dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; + + if (data->actually_found > 1000) + ret = D_WALK_QUIT; out: return ret; } @@ -1415,6 +1420,7 @@ void shrink_dcache_parent(struct dentry *parent) INIT_LIST_HEAD(); data.start = parent; data.found = 0; + data.actually_found = 0; d_walk(parent, , select_collect, NULL); if (!data.found) @@ -1536,6 +1542,7 @@ void d_invalidate(struct dentry *dentry) INIT_LIST_HEAD(); data.select.start = dentry; data.select.found = 0; + data.select.actually_found = 0; d_walk(dentry, , detach_and_collect, check_and_drop); smime.p7s Description: S/MIME Cryptographic Signature
Re: Hang/soft lockup in d_invalidate with simultaneous calls
On Fri, Jun 2, 2017 at 11:47 PM, Khazhismel Kumykov <kha...@google.com> wrote: > On Fri, Jun 2, 2017 at 11:20 PM, Al Viro <v...@zeniv.linux.org.uk> wrote: >> The thing is, unlike shrink_dcache_parent() we *can* bugger off as >> soon as we'd found no victims, nothing mounted and dentry itself >> is unhashed. We can't do anything in select_collect() (we would've >> broken shrink_dcache_parent() that way), but we can do unhashing >> in check_and_drop() in "really nothing to do" case and we can return >> from d_invalidate() after that. So how about this: > That does the trick. I'm not entirely familiar with the process here, is the above change committed somewhere, should I propose a patch? smime.p7s Description: S/MIME Cryptographic Signature
[PATCH] generic: Add nocheck shutdown stress test
Most shutdown tests only run on filesystems with metadata journaling, so we lose coverage. Add a shutdown stress test that doesn't check for consistency, so does not require journaling. Signed-off-by: Khazhismel Kumykov <kha...@google.com> --- tests/generic/999 | 84 +++ tests/generic/999.out | 2 ++ tests/generic/group | 1 + 3 files changed, 87 insertions(+) create mode 100755 tests/generic/999 create mode 100644 tests/generic/999.out diff --git a/tests/generic/999 b/tests/generic/999 new file mode 100755 index ..71b9aa4c --- /dev/null +++ b/tests/generic/999 @@ -0,0 +1,84 @@ +#!/bin/bash +# FS QA Test No. 999 +# +# Shutdown stress test - exercise shutdown codepath with fsstress, +# make sure we don't BUG/WARN. Coverage for all fs with shutdown. +# +#--- +# Copyright (c) 2013 Red Hat, Inc. All Rights Reserved. +# Copyright (c) 2017 Google, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo "QA output created by $seq" + +here=`pwd` +tmp=/tmp/$$ +status=1 # failure is the default! + +# get standard environment, filters and checks +. ./common/rc +. 
./common/filter + +_cleanup() +{ + cd / + _scratch_unmount 2>/dev/null + rm -f $tmp.* +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# real QA test starts here +_supported_fs generic +_supported_os Linux + +_require_scratch_nocheck +_require_scratch_shutdown +_require_command "$KILLALL_PROG" killall + +rm -f $seqres.full + +_scratch_mkfs > $seqres.full 2>&1 +_scratch_mount + +SLEEP_TIME=$((10 * $TIME_FACTOR)) +PROCS=$((4 * LOAD_FACTOR)) + +load_dir=$SCRATCH_MNT/test + +$XFS_FSR_PROG -v $load_dir >> $seqres.full 2>&1 +$FSSTRESS_PROG $FSSTRESS_AVOID -n1000 -p $PROCS -d $load_dir >> $seqres.full 2>&1 & +sleep $SLEEP_TIME +sync + +# now shutdown and unmount +sleep 5 +$here/src/godown $load_dir +$KILLALL_PROG -q $FSSTRESS_PROG +wait + +# for some reason fsstress processes manage to live on beyond the wait? +sleep 5 + +_scratch_unmount + +echo "No output is good. Failures are loud." + +status=0 +exit diff --git a/tests/generic/999.out b/tests/generic/999.out new file mode 100644 index ..68a51fe7 --- /dev/null +++ b/tests/generic/999.out @@ -0,0 +1,2 @@ +QA output created by 999 +No output is good. Failures are loud. diff --git a/tests/generic/group b/tests/generic/group index f922b496..891386ac 100644 --- a/tests/generic/group +++ b/tests/generic/group @@ -463,3 +463,4 @@ 458 auto quick clone 459 auto dangerous 460 auto quick rw +999 auto shutdown stress -- 2.14.1.821.g8fa685d3b7-goog smime.p7s Description: S/MIME Cryptographic Signature
[PATCH v2] generic: Add nocheck shutdown stress test
Most shutdown tests only run on filesystems with metadata journaling, so we lose coverage. Add a shutdown stress test that doesn't check for consistency, so does not require journaling. This is a modified version of generic/051, with some extras trimmed and fs checking removed. Signed-off-by: Khazhismel Kumykov <kha...@google.com> --- v2: drop defrag, use "silence is golden" tests/generic/999 | 83 +++ tests/generic/999.out | 2 ++ tests/generic/group | 1 + 3 files changed, 86 insertions(+) create mode 100755 tests/generic/999 create mode 100644 tests/generic/999.out diff --git a/tests/generic/999 b/tests/generic/999 new file mode 100755 index ..7b612686 --- /dev/null +++ b/tests/generic/999 @@ -0,0 +1,83 @@ +#!/bin/bash +# FS QA Test No. 999 +# +# Shutdown stress test - exercise shutdown codepath with fsstress, +# make sure we don't BUG/WARN. Coverage for all fs with shutdown. +# +#--- +# Copyright (c) 2013 Red Hat, Inc. All Rights Reserved. +# Copyright (c) 2017 Google, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo "QA output created by $seq" + +here=`pwd` +tmp=/tmp/$$ +status=1 # failure is the default! + +# get standard environment, filters and checks +. ./common/rc +. 
./common/filter + +_cleanup() +{ + cd / + _scratch_unmount 2>/dev/null + rm -f $tmp.* +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# real QA test starts here +_supported_fs generic +_supported_os Linux + +_require_scratch_nocheck +_require_scratch_shutdown +_require_command "$KILLALL_PROG" killall + +rm -f $seqres.full + +_scratch_mkfs > $seqres.full 2>&1 +_scratch_mount + +SLEEP_TIME=$((10 * $TIME_FACTOR)) +PROCS=$((4 * LOAD_FACTOR)) + +load_dir=$SCRATCH_MNT/test + +$FSSTRESS_PROG $FSSTRESS_AVOID -n1000 -p $PROCS -d $load_dir >> $seqres.full 2>&1 & +sleep $SLEEP_TIME +sync + +# now shutdown and unmount +sleep 5 +$here/src/godown $load_dir +$KILLALL_PROG -q $FSSTRESS_PROG +wait + +# for some reason fsstress processes manage to live on beyond the wait? +sleep 5 + +_scratch_unmount + +echo "Silence is golden" + +status=0 +exit diff --git a/tests/generic/999.out b/tests/generic/999.out new file mode 100644 index ..3b276ca8 --- /dev/null +++ b/tests/generic/999.out @@ -0,0 +1,2 @@ +QA output created by 999 +Silence is golden diff --git a/tests/generic/group b/tests/generic/group index f922b496..891386ac 100644 --- a/tests/generic/group +++ b/tests/generic/group @@ -463,3 +463,4 @@ 458 auto quick clone 459 auto dangerous 460 auto quick rw +999 auto shutdown stress -- 2.14.1.821.g8fa685d3b7-goog smime.p7s Description: S/MIME Cryptographic Signature
Re: [Patch v2 1/2] libiscsi: Fix use-after-free race during iscsi_session_teardown
Noticed these don't seem to be in 4.14/scsi-queue On Tue, Aug 29, 2017 at 6:45 PM, Martin K. Petersen wrote: > > Chris, > >> Looks good to me, fixes up the code given that the comment there about >> calling iscsi_remove_session wasn't being followed. > > Applied these two to 4.14/scsi-queue. > > -- > Martin K. Petersen Oracle Linux Engineering smime.p7s Description: S/MIME Cryptographic Signature
Re: [Patch v2 1/2] libiscsi: Fix use-after-free race during iscsi_session_teardown
On Thu, Jul 13, 2017 at 9:11 AM, Khazhismel Kumykov <kha...@google.com> wrote: Ping in case this was missed smime.p7s Description: S/MIME Cryptographic Signature
[RFC PATCH] blk-throttle: add burst allowance.
Allows configuration additional bytes or ios before a throttle is triggered. This allows implementation of a bucket style rate-limit/throttle on a block device. Previously, bursting to a device was limited to allowance granted in a single throtl_slice (similar to a bucket with limit N and refill rate N/slice). Additional parameters bytes/io_burst_conf defined for tg, which define a number of bytes/ios that must be depleted before throttling happens. A tg that does not deplete this allowance functions as though it has no configured limits. tgs earn additional allowance at rate defined by bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling kicks in. If a tg is idle for a while, it will again have some burst allowance before it gets throttled again. slice_end for a tg is extended until io_disp/byte_disp would fall to 0, when all "used" burst allowance would be earned back. trim_slice still does progress slice_start as before and decrements *_disp as before, and tgs continue to get bytes/ios in throtl_slice intervals. Signed-off-by: Khazhismel Kumykov <kha...@google.com> --- block/Kconfig| 11 +++ block/blk-throttle.c | 192 +++ 2 files changed, 189 insertions(+), 14 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index 28ec55752b68..fbd05b419f93 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -128,6 +128,17 @@ config BLK_DEV_THROTTLING_LOW Note, this is an experimental interface and could be changed someday. +config BLK_DEV_THROTTLING_BURST +bool "Block throttling .burst allowance interface" +depends on BLK_DEV_THROTTLING +default n +---help--- +Add .burst allowance for block throttling. Burst allowance allows for +additional unthrottled usage, while still limiting speed for sustained +usage. + +If in doubt, say N. 
+ config BLK_CMDLINE_PARSER bool "Block device command line partition parser" default n diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 96ad32623427..27c084312772 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -157,6 +157,11 @@ struct throtl_grp { /* Number of bio's dispatched in current slice */ unsigned int io_disp[2]; +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST + uint64_t bytes_burst_conf[2]; + unsigned int io_burst_conf[2]; +#endif + unsigned long last_low_overflow_time[2]; uint64_t last_bytes_disp[2]; @@ -507,6 +512,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST + tg->bytes_burst_conf[READ] = 0; + tg->bytes_burst_conf[WRITE] = 0; + tg->io_burst_conf[READ] = 0; + tg->io_burst_conf[WRITE] = 0; +#endif /* LIMIT_LOW will have default value 0 */ tg->latency_target = DFL_LATENCY_TARGET; @@ -800,6 +811,26 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) tg->slice_end[rw], jiffies); } +/* + * When current slice should end. + * + * With CONFIG_BLK_DEV_THROTTLING_BURST, we will wait longer than min_wait + * for slice to recover used burst allowance. (*_disp -> 0). Setting slice_end + * before this would result in tg receiving additional burst allowance. 
+ */ +static inline unsigned long throtl_slice_wait(struct throtl_grp *tg, bool rw, + unsigned long min_wait) +{ + unsigned long bytes_wait = 0, io_wait = 0; +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST + if (tg->bytes_burst_conf[rw]) + bytes_wait = (tg->bytes_disp[rw] * HZ) / tg_bps_limit(tg, rw); + if (tg->io_burst_conf[rw]) + io_wait = (tg->io_disp[rw] * HZ) / tg_iops_limit(tg, rw); +#endif + return max(min_wait, max(bytes_wait, io_wait)); +} + static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { @@ -849,7 +880,8 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * is bad because it does not allow new slice to start. */ - throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); + throtl_set_slice_end(tg, rw, + jiffies + throtl_slice_wait(tg, rw, tg->td->throtl_slice)); time_elapsed = jiffies - tg->slice_start[rw]; @@ -889,7 +921,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, unsigned long *wait) { bool rw = bio_data_dir(bio); - unsigned int io_allowed; + unsigned int io_allowed, io_disp; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; u64 tmp;
Re: [RFC PATCH] blk-throttle: add burst allowance.
(Botched the to line, sorry) smime.p7s Description: S/MIME Cryptographic Signature
Re: [RFC PATCH] blk-throttle: add burst allowance.
On Mon, Dec 18, 2017 at 1:01 PM, Vivek Goyal <vgo...@redhat.com> wrote: > On Mon, Dec 18, 2017 at 12:39:50PM -0800, Khazhismel Kumykov wrote: >> On Mon, Dec 18, 2017 at 10:29 AM, Vivek Goyal <vgo...@redhat.com> wrote: >> > On Mon, Dec 18, 2017 at 10:16:02AM -0800, Khazhismel Kumykov wrote: >> >> On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov <kha...@google.com> >> >> wrote: >> >> > On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li <s...@kernel.org> wrote: >> >> >> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote: >> >> >>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li <s...@kernel.org> wrote: >> >> >>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote: >> >> >>> >> Allows configuration additional bytes or ios before a throttle is >> >> >>> >> triggered. >> >> >>> >> >> >> >>> >> This allows implementation of a bucket style rate-limit/throttle >> >> >>> >> on a >> >> >>> >> block device. Previously, bursting to a device was limited to >> >> >>> >> allowance >> >> >>> >> granted in a single throtl_slice (similar to a bucket with limit N >> >> >>> >> and >> >> >>> >> refill rate N/slice). >> >> >>> >> >> >> >>> >> Additional parameters bytes/io_burst_conf defined for tg, which >> >> >>> >> define a >> >> >>> >> number of bytes/ios that must be depleted before throttling >> >> >>> >> happens. A >> >> >>> >> tg that does not deplete this allowance functions as though it has >> >> >>> >> no >> >> >>> >> configured limits. tgs earn additional allowance at rate defined by >> >> >>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, >> >> >>> >> throttling >> >> >>> >> kicks in. If a tg is idle for a while, it will again have some >> >> >>> >> burst >> >> >>> >> allowance before it gets throttled again. >> >> >>> >> >> >> >>> >> slice_end for a tg is extended until io_disp/byte_disp would fall >> >> >>> >> to 0, >> >> >>> >> when all "used" burst allowance would be earned back. 
trim_slice >> >> >>> >> still >> >> >>> >> does progress slice_start as before and decrements *_disp as >> >> >>> >> before, and >> >> >>> >> tgs continue to get bytes/ios in throtl_slice intervals. >> >> >>> > >> >> >>> > Can you describe why we need this? It would be great if you can >> >> >>> > describe the >> >> >>> > usage model and an example. Does this work for io.low/io.max or >> >> >>> > both? >> >> >>> > >> >> >>> > Thanks, >> >> >>> > Shaohua >> >> >>> > >> >> >>> >> >> >>> Use case that brought this up was configuring limits for a remote >> >> >>> shared device. Bursting beyond io.max is desired but only for so much >> >> >>> before the limit kicks in, afterwards with sustained usage throughput >> >> >>> is capped. (This proactively avoids remote-side limits). In that case >> >> >>> one would configure in a root container io.max + io.burst, and >> >> >>> configure low/other limits on descendants sharing the resource on the >> >> >>> same node. >> >> >>> >> >> >>> With this patch, so long as tg has not dispatched more than the burst, >> >> >>> no limit is applied at all by that tg, including limit imposed by >> >> >>> io.low in tg_iops_limit, etc. >> >> >> >> >> >> I'd appreciate if you can give more details about the 'why'. >> >> >> 'configuring >> >> >> limits for a remote shared device' doesn't justify the change. >> >> > >> >> > This is to configure a bursty workload (and associated device) with >> >> > known/a
Re: [RFC PATCH] blk-throttle: add burst allowance.
On Mon, Dec 18, 2017 at 10:29 AM, Vivek Goyal <vgo...@redhat.com> wrote: > On Mon, Dec 18, 2017 at 10:16:02AM -0800, Khazhismel Kumykov wrote: >> On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov <kha...@google.com> >> wrote: >> > On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li <s...@kernel.org> wrote: >> >> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote: >> >>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li <s...@kernel.org> wrote: >> >>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote: >> >>> >> Allows configuration additional bytes or ios before a throttle is >> >>> >> triggered. >> >>> >> >> >>> >> This allows implementation of a bucket style rate-limit/throttle on a >> >>> >> block device. Previously, bursting to a device was limited to >> >>> >> allowance >> >>> >> granted in a single throtl_slice (similar to a bucket with limit N and >> >>> >> refill rate N/slice). >> >>> >> >> >>> >> Additional parameters bytes/io_burst_conf defined for tg, which >> >>> >> define a >> >>> >> number of bytes/ios that must be depleted before throttling happens. A >> >>> >> tg that does not deplete this allowance functions as though it has no >> >>> >> configured limits. tgs earn additional allowance at rate defined by >> >>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling >> >>> >> kicks in. If a tg is idle for a while, it will again have some burst >> >>> >> allowance before it gets throttled again. >> >>> >> >> >>> >> slice_end for a tg is extended until io_disp/byte_disp would fall to >> >>> >> 0, >> >>> >> when all "used" burst allowance would be earned back. trim_slice still >> >>> >> does progress slice_start as before and decrements *_disp as before, >> >>> >> and >> >>> >> tgs continue to get bytes/ios in throtl_slice intervals. >> >>> > >> >>> > Can you describe why we need this? It would be great if you can >> >>> > describe the >> >>> > usage model and an example. Does this work for io.low/io.max or both? 
>> >>> > >> >>> > Thanks, >> >>> > Shaohua >> >>> > >> >>> >> >>> Use case that brought this up was configuring limits for a remote >> >>> shared device. Bursting beyond io.max is desired but only for so much >> >>> before the limit kicks in, afterwards with sustained usage throughput >> >>> is capped. (This proactively avoids remote-side limits). In that case >> >>> one would configure in a root container io.max + io.burst, and >> >>> configure low/other limits on descendants sharing the resource on the >> >>> same node. >> >>> >> >>> With this patch, so long as tg has not dispatched more than the burst, >> >>> no limit is applied at all by that tg, including limit imposed by >> >>> io.low in tg_iops_limit, etc. >> >> >> >> I'd appreciate if you can give more details about the 'why'. 'configuring >> >> limits for a remote shared device' doesn't justify the change. >> > >> > This is to configure a bursty workload (and associated device) with >> > known/allowed expected burst size, but to not allow full utilization >> > of the device for extended periods of time for QoS. During idle or low >> > use periods the burst allowance accrues, and then tasks can burst well >> > beyond the configured throttle up to the limit, afterwards is >> > throttled. A constant throttle speed isn't sufficient for this as you >> > can only burst 1 slice worth, but a limit of sorts is desirable for >> > preventing over utilization of the shared device. This type of limit >> > is also slightly different than what i understand io.low does in local >> > cases in that tg is only high priority/unthrottled if it is bursty, >> > and is limited with constant usage >> > >> > Khazhy >> >> Hi Shaohua, >> >> Does this clarify the reason for this patch? Is this (or something >> similar) a good fit for inclusion in blk-throttle? >> > > So does this brus
Re: [RFC PATCH] blk-throttle: add burst allowance.
On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov <kha...@google.com> wrote: > On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li <s...@kernel.org> wrote: >> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote: >>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li <s...@kernel.org> wrote: >>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote: >>> >> Allows configuration additional bytes or ios before a throttle is >>> >> triggered. >>> >> >>> >> This allows implementation of a bucket style rate-limit/throttle on a >>> >> block device. Previously, bursting to a device was limited to allowance >>> >> granted in a single throtl_slice (similar to a bucket with limit N and >>> >> refill rate N/slice). >>> >> >>> >> Additional parameters bytes/io_burst_conf defined for tg, which define a >>> >> number of bytes/ios that must be depleted before throttling happens. A >>> >> tg that does not deplete this allowance functions as though it has no >>> >> configured limits. tgs earn additional allowance at rate defined by >>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling >>> >> kicks in. If a tg is idle for a while, it will again have some burst >>> >> allowance before it gets throttled again. >>> >> >>> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 0, >>> >> when all "used" burst allowance would be earned back. trim_slice still >>> >> does progress slice_start as before and decrements *_disp as before, and >>> >> tgs continue to get bytes/ios in throtl_slice intervals. >>> > >>> > Can you describe why we need this? It would be great if you can describe >>> > the >>> > usage model and an example. Does this work for io.low/io.max or both? >>> > >>> > Thanks, >>> > Shaohua >>> > >>> >>> Use case that brought this up was configuring limits for a remote >>> shared device. 
Bursting beyond io.max is desired but only for so much >>> before the limit kicks in, afterwards with sustained usage throughput >>> is capped. (This proactively avoids remote-side limits). In that case >>> one would configure in a root container io.max + io.burst, and >>> configure low/other limits on descendants sharing the resource on the >>> same node. >>> >>> With this patch, so long as tg has not dispatched more than the burst, >>> no limit is applied at all by that tg, including limit imposed by >>> io.low in tg_iops_limit, etc. >> >> I'd appreciate if you can give more details about the 'why'. 'configuring >> limits for a remote shared device' doesn't justify the change. > > This is to configure a bursty workload (and associated device) with > known/allowed expected burst size, but to not allow full utilization > of the device for extended periods of time for QoS. During idle or low > use periods the burst allowance accrues, and then tasks can burst well > beyond the configured throttle up to the limit, afterwards is > throttled. A constant throttle speed isn't sufficient for this as you > can only burst 1 slice worth, but a limit of sorts is desirable for > preventing over utilization of the shared device. This type of limit > is also slightly different than what i understand io.low does in local > cases in that tg is only high priority/unthrottled if it is bursty, > and is limited with constant usage > > Khazhy Hi Shaohua, Does this clarify the reason for this patch? Is this (or something similar) a good fit for inclusion in blk-throttle? Thanks, Khazhy smime.p7s Description: S/MIME Cryptographic Signature
Re: [RFC PATCH] blk-throttle: add burst allowance.
On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li <s...@kernel.org> wrote: > On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote: >> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li <s...@kernel.org> wrote: >> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote: >> >> Allows configuration additional bytes or ios before a throttle is >> >> triggered. >> >> >> >> This allows implementation of a bucket style rate-limit/throttle on a >> >> block device. Previously, bursting to a device was limited to allowance >> >> granted in a single throtl_slice (similar to a bucket with limit N and >> >> refill rate N/slice). >> >> >> >> Additional parameters bytes/io_burst_conf defined for tg, which define a >> >> number of bytes/ios that must be depleted before throttling happens. A >> >> tg that does not deplete this allowance functions as though it has no >> >> configured limits. tgs earn additional allowance at rate defined by >> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling >> >> kicks in. If a tg is idle for a while, it will again have some burst >> >> allowance before it gets throttled again. >> >> >> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 0, >> >> when all "used" burst allowance would be earned back. trim_slice still >> >> does progress slice_start as before and decrements *_disp as before, and >> >> tgs continue to get bytes/ios in throtl_slice intervals. >> > >> > Can you describe why we need this? It would be great if you can describe >> > the >> > usage model and an example. Does this work for io.low/io.max or both? >> > >> > Thanks, >> > Shaohua >> > >> >> Use case that brought this up was configuring limits for a remote >> shared device. Bursting beyond io.max is desired but only for so much >> before the limit kicks in, afterwards with sustained usage throughput >> is capped. (This proactively avoids remote-side limits). 
In that case >> one would configure in a root container io.max + io.burst, and >> configure low/other limits on descendants sharing the resource on the >> same node. >> >> With this patch, so long as tg has not dispatched more than the burst, >> no limit is applied at all by that tg, including limit imposed by >> io.low in tg_iops_limit, etc. > > I'd appreciate if you can give more details about the 'why'. 'configuring > limits for a remote shared device' doesn't justify the change. This is to configure a bursty workload (and associated device) with known/allowed expected burst size, but to not allow full utilization of the device for extended periods of time for QoS. During idle or low use periods the burst allowance accrues, and then tasks can burst well beyond the configured throttle up to the limit, afterwards is throttled. A constant throttle speed isn't sufficient for this as you can only burst 1 slice worth, but a limit of sorts is desirable for preventing over utilization of the shared device. This type of limit is also slightly different than what i understand io.low does in local cases in that tg is only high priority/unthrottled if it is bursty, and is limited with constant usage Khazhy smime.p7s Description: S/MIME Cryptographic Signature
Re: [RFC PATCH] blk-throttle: add burst allowance.
On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li <s...@kernel.org> wrote: > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote: >> Allows configuration additional bytes or ios before a throttle is >> triggered. >> >> This allows implementation of a bucket style rate-limit/throttle on a >> block device. Previously, bursting to a device was limited to allowance >> granted in a single throtl_slice (similar to a bucket with limit N and >> refill rate N/slice). >> >> Additional parameters bytes/io_burst_conf defined for tg, which define a >> number of bytes/ios that must be depleted before throttling happens. A >> tg that does not deplete this allowance functions as though it has no >> configured limits. tgs earn additional allowance at rate defined by >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling >> kicks in. If a tg is idle for a while, it will again have some burst >> allowance before it gets throttled again. >> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 0, >> when all "used" burst allowance would be earned back. trim_slice still >> does progress slice_start as before and decrements *_disp as before, and >> tgs continue to get bytes/ios in throtl_slice intervals. > > Can you describe why we need this? It would be great if you can describe the > usage model and an example. Does this work for io.low/io.max or both? > > Thanks, > Shaohua > Use case that brought this up was configuring limits for a remote shared device. Bursting beyond io.max is desired but only for so much before the limit kicks in, afterwards with sustained usage throughput is capped. (This proactively avoids remote-side limits). In that case one would configure in a root container io.max + io.burst, and configure low/other limits on descendants sharing the resource on the same node. 
With this patch, so long as tg has not dispatched more than the burst, no limit is applied at all by that tg, including limit imposed by io.low in tg_iops_limit, etc. Khazhy >> Signed-off-by: Khazhismel Kumykov <kha...@google.com> >> --- >> block/Kconfig| 11 +++ >> block/blk-throttle.c | 192 >> +++ >> 2 files changed, 189 insertions(+), 14 deletions(-) >> >> diff --git a/block/Kconfig b/block/Kconfig >> index 28ec55752b68..fbd05b419f93 100644 >> --- a/block/Kconfig >> +++ b/block/Kconfig >> @@ -128,6 +128,17 @@ config BLK_DEV_THROTTLING_LOW >> >> Note, this is an experimental interface and could be changed someday. >> >> +config BLK_DEV_THROTTLING_BURST >> +bool "Block throttling .burst allowance interface" >> +depends on BLK_DEV_THROTTLING >> +default n >> +---help--- >> +Add .burst allowance for block throttling. Burst allowance allows for >> +additional unthrottled usage, while still limiting speed for sustained >> +usage. >> + >> +If in doubt, say N. >> + >> config BLK_CMDLINE_PARSER >> bool "Block device command line partition parser" >> default n >> diff --git a/block/blk-throttle.c b/block/blk-throttle.c >> index 96ad32623427..27c084312772 100644 >> --- a/block/blk-throttle.c >> +++ b/block/blk-throttle.c >> @@ -157,6 +157,11 @@ struct throtl_grp { >> /* Number of bio's dispatched in current slice */ >> unsigned int io_disp[2]; >> >> +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST >> + uint64_t bytes_burst_conf[2]; >> + unsigned int io_burst_conf[2]; >> +#endif >> + >> unsigned long last_low_overflow_time[2]; >> >> uint64_t last_bytes_disp[2]; >> @@ -507,6 +512,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t >> gfp, int node) >> tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; >> tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; >> tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; >> +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST >> + tg->bytes_burst_conf[READ] = 0; >> + tg->bytes_burst_conf[WRITE] = 0; >> + tg->io_burst_conf[READ] = 0; >> + tg->io_burst_conf[WRITE] = 0; >> 
+#endif >> /* LIMIT_LOW will have default value 0 */ >> >> tg->latency_target = DFL_LATENCY_TARGET; >> @@ -800,6 +811,26 @@ static inline void throtl_start_new_slice(struct >> throtl_grp *tg, bool rw) >> tg->slice_end[rw], jiffies); >> } >> >> +/* >> + * When current slice should end. >> + * >> + * With CONFIG_B
Re: [RFC PATCH] blk-throttle: add burst allowance.
On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov <kha...@google.com> wrote: > On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li <s...@kernel.org> wrote: >> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote: >>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li <s...@kernel.org> wrote: >>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote: >>> >> Allows configuration additional bytes or ios before a throttle is >>> >> triggered. >>> >> >>> >> This allows implementation of a bucket style rate-limit/throttle on a >>> >> block device. Previously, bursting to a device was limited to allowance >>> >> granted in a single throtl_slice (similar to a bucket with limit N and >>> >> refill rate N/slice). >>> >> >>> >> Additional parameters bytes/io_burst_conf defined for tg, which define a >>> >> number of bytes/ios that must be depleted before throttling happens. A >>> >> tg that does not deplete this allowance functions as though it has no >>> >> configured limits. tgs earn additional allowance at rate defined by >>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling >>> >> kicks in. If a tg is idle for a while, it will again have some burst >>> >> allowance before it gets throttled again. >>> >> >>> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 0, >>> >> when all "used" burst allowance would be earned back. trim_slice still >>> >> does progress slice_start as before and decrements *_disp as before, and >>> >> tgs continue to get bytes/ios in throtl_slice intervals. >>> > >>> > Can you describe why we need this? It would be great if you can describe >>> > the >>> > usage model and an example. Does this work for io.low/io.max or both? >>> > >>> > Thanks, >>> > Shaohua >>> > >>> >>> Use case that brought this up was configuring limits for a remote >>> shared device. 
Bursting beyond io.max is desired but only for so much >>> before the limit kicks in, afterwards with sustained usage throughput >>> is capped. (This proactively avoids remote-side limits). In that case >>> one would configure in a root container io.max + io.burst, and >>> configure low/other limits on descendants sharing the resource on the >>> same node. >>> >>> With this patch, so long as tg has not dispatched more than the burst, >>> no limit is applied at all by that tg, including limit imposed by >>> io.low in tg_iops_limit, etc. >> >> I'd appreciate if you can give more details about the 'why'. 'configuring >> limits for a remote shared device' doesn't justify the change. > > This is to configure a bursty workload (and associated device) with > known/allowed expected burst size, but to not allow full utilization > of the device for extended periods of time for QoS. During idle or low > use periods the burst allowance accrues, and then tasks can burst well > beyond the configured throttle up to the limit, afterwards is > throttled. A constant throttle speed isn't sufficient for this as you > can only burst 1 slice worth, but a limit of sorts is desirable for > preventing over utilization of the shared device. This type of limit > is also slightly different than what i understand io.low does in local > cases in that tg is only high priority/unthrottled if it is bursty, > and is limited with constant usage > > Khazhy Ping this time without html (oops) smime.p7s Description: S/MIME Cryptographic Signature
[RFC PATCH] blk-throttle: add burst allowance
Allows configuration additional bytes or ios before a throttle is triggered. Slice end is extended to cover expended allowance recovery time. Usage would be e.g. per device to allow users to take up to X bytes/ios at full speed, but be limited to Y bps/iops with sustained usage. Signed-off-by: Khazhismel Kumykov <kha...@google.com> --- block/Kconfig| 11 +++ block/blk-throttle.c | 185 --- 2 files changed, 186 insertions(+), 10 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index 3ab42bbb06d5..16545caa7fc9 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -127,6 +127,17 @@ config BLK_DEV_THROTTLING_LOW Note, this is an experimental interface and could be changed someday. +config BLK_DEV_THROTTLING_BURST +bool "Block throttling .burst allowance interface" +depends on BLK_DEV_THROTTLING +default n +---help--- +Add .burst allowance for block throttling. Burst allowance allows for +additional unthrottled usage, while still limiting speed for sustained +usage. + +If in doubt, say N. + config BLK_CMDLINE_PARSER bool "Block device command line partition parser" default n diff --git a/block/blk-throttle.c b/block/blk-throttle.c index d80c3f0144c5..e09ec11e9c5f 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -156,6 +156,11 @@ struct throtl_grp { /* Number of bio's dispatched in current slice */ unsigned int io_disp[2]; +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST + uint64_t bytes_burst_conf[2]; + unsigned int io_burst_conf[2]; +#endif + unsigned long last_low_overflow_time[2]; uint64_t last_bytes_disp[2]; @@ -506,6 +511,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST + tg->bytes_burst_conf[READ] = 0; + tg->bytes_burst_conf[WRITE] = 0; + tg->io_burst_conf[READ] = 0; + tg->io_burst_conf[WRITE] = 0; +#endif /* LIMIT_LOW will have default value 0 */ tg->latency_target 
= DFL_LATENCY_TARGET; @@ -799,6 +810,26 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) tg->slice_end[rw], jiffies); } +/* + * When current slice should end. + * + * With CONFIG_BLK_DEV_THROTTLING_BURST, we will wait longer than min_wait + * for slice to recover used burst allowance. (*_disp -> 0). Setting slice_end + * before this would result in tg receiving additional burst allowance. + */ +static inline unsigned long throtl_slice_wait(struct throtl_grp *tg, bool rw, + unsigned long min_wait) +{ + unsigned long bytes_wait = 0, io_wait = 0; +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST + if (tg->bytes_burst_conf[rw]) + bytes_wait = (tg->bytes_disp[rw] * HZ) / tg_bps_limit(tg, rw); + if (tg->io_burst_conf[rw]) + io_wait = (tg->io_disp[rw] * HZ) / tg_iops_limit(tg, rw); +#endif + return jiffies + max(min_wait, max(bytes_wait, io_wait)); +} + static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { @@ -848,7 +879,8 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * is bad because it does not allow new slice to start. */ - throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); + throtl_set_slice_end(tg, rw, +throtl_slice_wait(tg, rw, tg->td->throtl_slice)); time_elapsed = jiffies - tg->slice_start[rw]; @@ -888,7 +920,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, unsigned long *wait) { bool rw = bio_data_dir(bio); - unsigned int io_allowed; + unsigned int io_allowed, io_disp; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; u64 tmp; @@ -907,6 +939,17 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, * have been trimmed. 
*/ + io_disp = tg->io_disp[rw]; + +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST + if (tg->io_disp[rw] < tg->io_burst_conf[rw]) { + if (wait) + *wait = 0; + return true; + } + io_disp -= tg->io_burst_conf[rw]; +#endif + tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd; do_div(tmp, HZ); @@ -915,14 +958,14 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, else io_allowed = tmp; - if (tg->io_disp[rw] + 1 <= io_allowed) { + if (io_disp + 1 &
[PATCH] dm mpath selector: more evenly distribute ties
Move the last used path to the end of the list (least preferred) so that ties are more evenly distributed. For example, in case with three paths with one that is slower than others, the remaining two would be unevenly used if they tie. This is due to the rotation not being a truly fair distribution. Illustrated: paths a, b, c, 'c' has 1 outstanding IO, a and b are 'tied' Three possible rotations: (a, b, c) -> best path 'a' (b, c, a) -> best path 'b' (c, a, b) -> best path 'a' (a, b, c) -> best path 'a' (b, c, a) -> best path 'b' (c, a, b) -> best path 'a' ... So 'a' is used 2x more than 'b', although they should be used evenly. With this change, the most recently used path is always the least preferred, removing this bias resulting in even distribution. (a, b, c) -> best path 'a' (b, c, a) -> best path 'b' (c, a, b) -> best path 'a' (c, b, a) -> best path 'b' ... Signed-off-by: Khazhismel Kumykov <kha...@google.com> --- drivers/md/dm-queue-length.c | 6 +++--- drivers/md/dm-service-time.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index 23f178641794..969c4f1a3633 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c @@ -195,9 +195,6 @@ static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes) if (list_empty(&s->valid_paths)) goto out; - /* Change preferred (first in list) path to evenly balance. */ - list_move_tail(s->valid_paths.next, &s->valid_paths); - list_for_each_entry(pi, &s->valid_paths, list) { if (!best || (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) @@ -210,6 +207,9 @@ static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes) if (!best) goto out; + /* Move most recently used to least preferred to evenly balance. 
*/ + list_move_tail(>list, >valid_paths); + ret = best->path; out: spin_unlock_irqrestore(>lock, flags); diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index 7b8642045c55..f006a9005593 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c @@ -282,9 +282,6 @@ static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) if (list_empty(>valid_paths)) goto out; - /* Change preferred (first in list) path to evenly balance. */ - list_move_tail(s->valid_paths.next, >valid_paths); - list_for_each_entry(pi, >valid_paths, list) if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) best = pi; @@ -292,6 +289,9 @@ static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) if (!best) goto out; + /* Move most recently used to least preferred to evenly balance. */ + list_move_tail(>list, >valid_paths); + ret = best->path; out: spin_unlock_irqrestore(>lock, flags); -- 2.16.0.rc1.238.g530d649a79-goog smime.p7s Description: S/MIME Cryptographic Signature
Re: [dm-devel] [PATCH] dm mpath selector: more evenly distribute ties
On Wed, Jan 24, 2018 at 11:09 AM, Martin Wilck <mwi...@suse.com> wrote: > On Wed, 2018-01-24 at 10:44 -0800, Khazhismel Kumykov wrote: >> On Wed, Jan 24, 2018 at 2:57 AM, Martin Wilck <mwi...@suse.com> >> wrote: >> > On Fri, 2018-01-19 at 15:07 -0800, Khazhismel Kumykov wrote: >> > > Move the last used path to the end of the list (least preferred) >> > > so >> > > that >> > > ties are more evenly distributed. >> > > >> > > For example, in case with three paths with one that is slower >> > > than >> > > others, the remaining two would be unevenly used if they tie. >> > > This is >> > > due to the rotation not being a truely fair distribution. >> > > >> > > Illustrated: paths a, b, c, 'c' has 1 outstanding IO, a and b are >> > > 'tied' >> > > Three possible rotations: >> > > (a, b, c) -> best path 'a' >> > > (b, c, a) -> best path 'b' >> > > (c, a, b) -> best path 'a' >> > > (a, b, c) -> best path 'a' >> > > (b, c, a) -> best path 'b' >> > > (c, a, b) -> best path 'a' >> > > ... >> > >> > >> > This happens only if a and b actually have the same weight (e.g. >> > queue >> > length for the queue-length selector). If 'a' really receives more >> > IO, >> > its queue grows, and the selector will start preferring 'b', so the >> > effect should level out automatically with the current code as soon >> > as >> > you have real IO going on. But maybe I haven't grasped what you're >> > referring to as "tied". >> >> Yes, for "tied" I'm referring to paths with the same weight. As queue >> length grows it does tend to level out, but in the case where queue >> length doesn't grow (in this example I'm imagining 2 concurrent >> requests to the device) the bias does show and the selectors really >> do >> send 'a' 2x more requests than 'b' (when 'c' is much slower and 'a' >> and 'b' are ~same speed). > > Have you actually observed this? I find the idea pretty academical that > two paths would be walking "tied" this way. 
In practice, under IO load, > I'd expect queue lengths (and service-times, for that matter) to > fluctuate enough to prevent this effect to be measurable. But of > course, I may be wrong. If you really did observe this, the next > question would be: does hurt performance to an extent that can be > noticed/measured? I reckon that if 'a' got saturated under the load, > hurting performance, its queue length would grow quickly and 'b' would > automatically get preferred. This is fairly simple to observe - start two sync reader threads against a device with 3 backing paths, with one path taking longer on average to complete requests than the other two. One of the 'faster' paths will be used ~2x more than the other. Perhaps not that common a situation, but is a real one. The bias seemed simple to remove, so that the two (or N) paths would be used equally. I don't see a downside to more evenly distributing in this case, although I can't say I've directly observed performance downsides for a biased distribution (aside from the distribution being biased itself being a downside). The bias of note is inherent to the order paths are added to the selector (and which path is 'always bad'), so if 'a' is saturated due to this, then slows, once it recovers it will continue to be preferred, versus in an even distribution. Khazhy smime.p7s Description: S/MIME Cryptographic Signature
Re: [dm-devel] [PATCH] dm mpath selector: more evenly distribute ties
On Wed, Jan 24, 2018 at 2:57 AM, Martin Wilck <mwi...@suse.com> wrote: > On Fri, 2018-01-19 at 15:07 -0800, Khazhismel Kumykov wrote: >> Move the last used path to the end of the list (least preferred) so >> that >> ties are more evenly distributed. >> >> For example, in case with three paths with one that is slower than >> others, the remaining two would be unevenly used if they tie. This is >> due to the rotation not being a truely fair distribution. >> >> Illustrated: paths a, b, c, 'c' has 1 outstanding IO, a and b are >> 'tied' >> Three possible rotations: >> (a, b, c) -> best path 'a' >> (b, c, a) -> best path 'b' >> (c, a, b) -> best path 'a' >> (a, b, c) -> best path 'a' >> (b, c, a) -> best path 'b' >> (c, a, b) -> best path 'a' >> ... > > > This happens only if a and b actually have the same weight (e.g. queue > length for the queue-length selector). If 'a' really receives more IO, > its queue grows, and the selector will start preferring 'b', so the > effect should level out automatically with the current code as soon as > you have real IO going on. But maybe I haven't grasped what you're > referring to as "tied". Yes, for "tied" I'm referring to paths with the same weight. As queue length grows it does tend to level out, but in the case where queue length doesn't grow (in this example I'm imagining 2 concurrent requests to the device) the bias does show and the selectors really do send 'a' 2x more requests than 'b' (when 'c' is much slower and 'a' and 'b' are ~same speed). > > OTOH, if the "best" path has much lower queue length than the other > paths for whatever reason, your pushing it to the tail will require a > full list walk with every new call of the selector. I see tjat as a > small disadvantage of your approach. I see, with best at the tail, we would not see as much benefit from the check if a path has no IOs on it in queue-length. In service-time no such short circuit exists, so I don't believe anything changes there. 
Am I understanding this correctly? > > Regards > Martin > Thanks for your comments, Khazhy >> >> So 'a' is used 2x more than 'b', although they should be used evenly. >> >> With this change, the most recently used path is always the least >> preferred, removing this bias resulting in even distribution. >> (a, b, c) -> best path 'a' >> (b, c, a) -> best path 'b' >> (c, a, b) -> best path 'a' >> (c, b, a) -> best path 'b' >> ... >> >> Signed-off-by: Khazhismel Kumykov <kha...@google.com> >> --- >> drivers/md/dm-queue-length.c | 6 +++--- >> drivers/md/dm-service-time.c | 6 +++--- >> 2 files changed, 6 insertions(+), 6 deletions(-) >> >> diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue- >> length.c >> index 23f178641794..969c4f1a3633 100644 >> --- a/drivers/md/dm-queue-length.c >> +++ b/drivers/md/dm-queue-length.c >> @@ -195,9 +195,6 @@ static struct dm_path *ql_select_path(struct >> path_selector *ps, size_t nr_bytes) >> if (list_empty(>valid_paths)) >> goto out; >> >> - /* Change preferred (first in list) path to evenly balance. >> */ >> - list_move_tail(s->valid_paths.next, >valid_paths); >> - >> list_for_each_entry(pi, >valid_paths, list) { >> if (!best || >> (atomic_read(>qlen) < atomic_read( >> >qlen))) >> @@ -210,6 +207,9 @@ static struct dm_path *ql_select_path(struct >> path_selector *ps, size_t nr_bytes) >> if (!best) >> goto out; >> >> + /* Move most recently used to least preferred to evenly >> balance. */ >> + list_move_tail(>list, >valid_paths); >> + >> ret = best->path; >> out: >> spin_unlock_irqrestore(>lock, flags); >> diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service- >> time.c >> index 7b8642045c55..f006a9005593 100644 >> --- a/drivers/md/dm-service-time.c >> +++ b/drivers/md/dm-service-time.c >> @@ -282,9 +282,6 @@ static struct dm_path *st_select_path(struct >> path_selector *ps, size_t nr_bytes) >> if (list_empty(>valid_paths)) >> goto out; >> >> - /* Change preferred (first in list) path to evenly balance. 
>> */ >> - list_move_tail(s->valid_paths.next, >valid_paths); >> - >> list_for_each_entry(pi, >valid_paths, list) >> if (!best || (st_compare_load(pi, best, nr_bytes) < >> 0)) >>
[PATCH] block/compat_ioctl: fix range check in BLKGETSIZE
kernel ulong and compat_ulong_t may not be same width. Use type directly to eliminate mismatches. This would result in truncation rather than EFBIG for 32bit mode for large disks. Signed-off-by: Khazhismel Kumykov <kha...@google.com> --- block/compat_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 6ca015f92766..3a2c77f07da8 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -388,7 +388,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) return 0; case BLKGETSIZE: size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) + if ((size >> 9) > ~((compat_ulong_t)0UL)) return -EFBIG; return compat_put_ulong(arg, size >> 9); -- 2.17.0.484.g0c8726318c-goog smime.p7s Description: S/MIME Cryptographic Signature
[PATCH] fs/dcache.c: re-add cond_resched() in shrink_dcache_parent()
shrink_dcache_parent may spin waiting for a parallel shrink_dentry_list. In this case we may have 0 dentries to dispose, so we will never schedule out while waiting for the parallel shrink_dentry_list to complete. Tested that this fixes syzbot reports of stalls in shrink_dcache_parent() Fixes: 32785c0539b7 ("fs/dcache.c: add cond_resched() in shrink_dentry_list()") Reported-by: syzbot+ae80b790eb412884c...@syzkaller.appspotmail.com Cc: Nikolay Borisov <nbori...@suse.com> Cc: Andrew Morton <a...@linux-foundation.org> Cc: David Rientjes <rient...@google.com> Cc: Alexander Viro <v...@zeniv.linux.org.uk> Cc: Goldwyn Rodrigues <rgold...@suse.de> Cc: Jeff Mahoney <je...@suse.com> Cc: Davidlohr Bueso <d...@stgolabs.net> Cc: Linus Torvalds <torva...@linux-foundation.org> Signed-off-by: Khazhismel Kumykov <kha...@google.com> --- fs/dcache.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/dcache.c b/fs/dcache.c index 591b34500e41..3507badeb60a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1489,6 +1489,7 @@ void shrink_dcache_parent(struct dentry *parent) break; shrink_dentry_list(); + cond_resched(); } } EXPORT_SYMBOL(shrink_dcache_parent); -- 2.17.0.484.g0c8726318c-goog smime.p7s Description: S/MIME Cryptographic Signature
Re: [PATCH] fs/dcache.c: re-add cond_resched() in shrink_dcache_parent()
On Sun, Apr 15, 2018 at 3:34 PM, Al Viro wrote: > On Sun, Apr 15, 2018 at 10:54:55PM +0100, Al Viro wrote: >> On Sun, Apr 15, 2018 at 09:40:54PM +0100, Al Viro wrote: >> >> > BTW, the current placement of cond_resched() looks bogus; suppose we >> > have collected a lot of victims and ran into need_resched(). We leave >> > d_walk() and call shrink_dentry_list(). At that point there's a lot >> > of stuff on our shrink list and anybody else running into them will >> > have to keep scanning. Giving up the timeslice before we take care >> > of any of those looks like a bad idea, to put it mildly, and that's >> > precisely what will happen. >> > >> > What about doing that in the end of __dentry_kill() instead? And to >> > hell with both existing call sites - dput() one (before going to >> > the parent) is obviously covered by that (dentry_kill() only returns >> > non-NULL after having called __dentry_kill()) and in shrink_dentry_list() >> > we'll get to it as soon as we go through all dentries that can be >> > immediately kicked off the shrink list. Which, AFAICS, improves the >> > situation, now that shrink_lock_dentry() contains no trylock loops... >> > >> > Comments? >> >> What I mean is something like this (cumulative diff, it'll obviously need >> to be carved up into 3--4 commits): > > ... and carved-up version is in vfs.git#work.dcache. Could syzbot folks > hit it with their reproducers? To me it seems like shrink_dcache_parent could still spin without scheduling similar to before - found > 0 since *someone* is shrinking, but we have 0 entries to shrink - we never call __dentry_kill or schedule, and just spin. And indeed, the syzbot reproducer @vfs#work.dcache hits a softlockup in shrink_dcache_parent. I'd think re-adding cond_resched to shrink_dcache_parent does make the softlockup go away. It seems to fix the reproducer. Although did we want to terminate the loop in shrink_dcache_parent earlier? smime.p7s Description: S/MIME Cryptographic Signature
[PATCH] fs/fat: add cond_resched to fat_count_free_clusters
On non-preempt kernels this loop can take a long time (more than 50 ticks) processing through entries. Signed-off-by: Khazhismel Kumykov --- fs/fat/fatent.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index defc2168de91..f58c0cacc531 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -682,6 +682,7 @@ int fat_count_free_clusters(struct super_block *sb) if (ops->ent_get() == FAT_ENT_FREE) free++; } while (fat_ent_next(sbi, )); + cond_resched(); } sbi->free_clusters = free; sbi->free_clus_valid = 1; -- 2.19.0.605.g01d371f741-goog
Re: [Patch v2 1/2] libiscsi: Fix use-after-free race during iscsi_session_teardown
Noticed these don't seem to be in 4.14/scsi-queue On Tue, Aug 29, 2017 at 6:45 PM, Martin K. Petersen wrote: > > Chris, > >> Looks good to me, fixes up the code given that the comment there about >> calling iscsi_remove_session wasn't being followed. > > Applied these two to 4.14/scsi-queue. > > -- > Martin K. Petersen Oracle Linux Engineering smime.p7s Description: S/MIME Cryptographic Signature
Re: [RFC PATCH] blk-throttle: add burst allowance.
On Mon, Dec 18, 2017 at 1:01 PM, Vivek Goyal wrote: > On Mon, Dec 18, 2017 at 12:39:50PM -0800, Khazhismel Kumykov wrote: >> On Mon, Dec 18, 2017 at 10:29 AM, Vivek Goyal wrote: >> > On Mon, Dec 18, 2017 at 10:16:02AM -0800, Khazhismel Kumykov wrote: >> >> On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov >> >> wrote: >> >> > On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li wrote: >> >> >> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote: >> >> >>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li wrote: >> >> >>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote: >> >> >>> >> Allows configuration additional bytes or ios before a throttle is >> >> >>> >> triggered. >> >> >>> >> >> >> >>> >> This allows implementation of a bucket style rate-limit/throttle >> >> >>> >> on a >> >> >>> >> block device. Previously, bursting to a device was limited to >> >> >>> >> allowance >> >> >>> >> granted in a single throtl_slice (similar to a bucket with limit N >> >> >>> >> and >> >> >>> >> refill rate N/slice). >> >> >>> >> >> >> >>> >> Additional parameters bytes/io_burst_conf defined for tg, which >> >> >>> >> define a >> >> >>> >> number of bytes/ios that must be depleted before throttling >> >> >>> >> happens. A >> >> >>> >> tg that does not deplete this allowance functions as though it has >> >> >>> >> no >> >> >>> >> configured limits. tgs earn additional allowance at rate defined by >> >> >>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, >> >> >>> >> throttling >> >> >>> >> kicks in. If a tg is idle for a while, it will again have some >> >> >>> >> burst >> >> >>> >> allowance before it gets throttled again. >> >> >>> >> >> >> >>> >> slice_end for a tg is extended until io_disp/byte_disp would fall >> >> >>> >> to 0, >> >> >>> >> when all "used" burst allowance would be earned back. 
trim_slice >> >> >>> >> still >> >> >>> >> does progress slice_start as before and decrements *_disp as >> >> >>> >> before, and >> >> >>> >> tgs continue to get bytes/ios in throtl_slice intervals. >> >> >>> > >> >> >>> > Can you describe why we need this? It would be great if you can >> >> >>> > describe the >> >> >>> > usage model and an example. Does this work for io.low/io.max or >> >> >>> > both? >> >> >>> > >> >> >>> > Thanks, >> >> >>> > Shaohua >> >> >>> > >> >> >>> >> >> >>> Use case that brought this up was configuring limits for a remote >> >> >>> shared device. Bursting beyond io.max is desired but only for so much >> >> >>> before the limit kicks in, afterwards with sustained usage throughput >> >> >>> is capped. (This proactively avoids remote-side limits). In that case >> >> >>> one would configure in a root container io.max + io.burst, and >> >> >>> configure low/other limits on descendants sharing the resource on the >> >> >>> same node. >> >> >>> >> >> >>> With this patch, so long as tg has not dispatched more than the burst, >> >> >>> no limit is applied at all by that tg, including limit imposed by >> >> >>> io.low in tg_iops_limit, etc. >> >> >> >> >> >> I'd appreciate if you can give more details about the 'why'. >> >> >> 'configuring >> >> >> limits for a remote shared device' doesn't justify the change. >> >> > >> >> > This is to configure a bursty workload (and associated device) with >> >> > known/allowed expected burst size, but to not allow full utilization >> >> > of the device for extended peri
Re: [RFC PATCH] blk-throttle: add burst allowance.
On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov wrote: > On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li wrote: >> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote: >>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li wrote: >>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote: >>> >> Allows configuration additional bytes or ios before a throttle is >>> >> triggered. >>> >> >>> >> This allows implementation of a bucket style rate-limit/throttle on a >>> >> block device. Previously, bursting to a device was limited to allowance >>> >> granted in a single throtl_slice (similar to a bucket with limit N and >>> >> refill rate N/slice). >>> >> >>> >> Additional parameters bytes/io_burst_conf defined for tg, which define a >>> >> number of bytes/ios that must be depleted before throttling happens. A >>> >> tg that does not deplete this allowance functions as though it has no >>> >> configured limits. tgs earn additional allowance at rate defined by >>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling >>> >> kicks in. If a tg is idle for a while, it will again have some burst >>> >> allowance before it gets throttled again. >>> >> >>> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 0, >>> >> when all "used" burst allowance would be earned back. trim_slice still >>> >> does progress slice_start as before and decrements *_disp as before, and >>> >> tgs continue to get bytes/ios in throtl_slice intervals. >>> > >>> > Can you describe why we need this? It would be great if you can describe >>> > the >>> > usage model and an example. Does this work for io.low/io.max or both? >>> > >>> > Thanks, >>> > Shaohua >>> > >>> >>> Use case that brought this up was configuring limits for a remote >>> shared device. Bursting beyond io.max is desired but only for so much >>> before the limit kicks in, afterwards with sustained usage throughput >>> is capped. (This proactively avoids remote-side limits). 
In that case >>> one would configure in a root container io.max + io.burst, and >>> configure low/other limits on descendants sharing the resource on the >>> same node. >>> >>> With this patch, so long as tg has not dispatched more than the burst, >>> no limit is applied at all by that tg, including limit imposed by >>> io.low in tg_iops_limit, etc. >> >> I'd appreciate if you can give more details about the 'why'. 'configuring >> limits for a remote shared device' doesn't justify the change. > > This is to configure a bursty workload (and associated device) with > known/allowed expected burst size, but to not allow full utilization > of the device for extended periods of time for QoS. During idle or low > use periods the burst allowance accrues, and then tasks can burst well > beyond the configured throttle up to the limit, afterwards is > throttled. A constant throttle speed isn't sufficient for this as you > can only burst 1 slice worth, but a limit of sorts is desirable for > preventing over utilization of the shared device. This type of limit > is also slightly different than what i understand io.low does in local > cases in that tg is only high priority/unthrottled if it is bursty, > and is limited with constant usage > > Khazhy Hi Shaohua, Does this clarify the reason for this patch? Is this (or something similar) a good fit for inclusion in blk-throttle? Thanks, Khazhy smime.p7s Description: S/MIME Cryptographic Signature
Re: [RFC PATCH] blk-throttle: add burst allowance.
On Mon, Dec 18, 2017 at 10:29 AM, Vivek Goyal wrote: > On Mon, Dec 18, 2017 at 10:16:02AM -0800, Khazhismel Kumykov wrote: >> On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov >> wrote: >> > On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li wrote: >> >> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote: >> >>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li wrote: >> >>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote: >> >>> >> Allows configuration additional bytes or ios before a throttle is >> >>> >> triggered. >> >>> >> >> >>> >> This allows implementation of a bucket style rate-limit/throttle on a >> >>> >> block device. Previously, bursting to a device was limited to >> >>> >> allowance >> >>> >> granted in a single throtl_slice (similar to a bucket with limit N and >> >>> >> refill rate N/slice). >> >>> >> >> >>> >> Additional parameters bytes/io_burst_conf defined for tg, which >> >>> >> define a >> >>> >> number of bytes/ios that must be depleted before throttling happens. A >> >>> >> tg that does not deplete this allowance functions as though it has no >> >>> >> configured limits. tgs earn additional allowance at rate defined by >> >>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling >> >>> >> kicks in. If a tg is idle for a while, it will again have some burst >> >>> >> allowance before it gets throttled again. >> >>> >> >> >>> >> slice_end for a tg is extended until io_disp/byte_disp would fall to >> >>> >> 0, >> >>> >> when all "used" burst allowance would be earned back. trim_slice still >> >>> >> does progress slice_start as before and decrements *_disp as before, >> >>> >> and >> >>> >> tgs continue to get bytes/ios in throtl_slice intervals. >> >>> > >> >>> > Can you describe why we need this? It would be great if you can >> >>> > describe the >> >>> > usage model and an example. Does this work for io.low/io.max or both? 
>> >>> > >> >>> > Thanks, >> >>> > Shaohua >> >>> > >> >>> >> >>> Use case that brought this up was configuring limits for a remote >> >>> shared device. Bursting beyond io.max is desired but only for so much >> >>> before the limit kicks in, afterwards with sustained usage throughput >> >>> is capped. (This proactively avoids remote-side limits). In that case >> >>> one would configure in a root container io.max + io.burst, and >> >>> configure low/other limits on descendants sharing the resource on the >> >>> same node. >> >>> >> >>> With this patch, so long as tg has not dispatched more than the burst, >> >>> no limit is applied at all by that tg, including limit imposed by >> >>> io.low in tg_iops_limit, etc. >> >> >> >> I'd appreciate if you can give more details about the 'why'. 'configuring >> >> limits for a remote shared device' doesn't justify the change. >> > >> > This is to configure a bursty workload (and associated device) with >> > known/allowed expected burst size, but to not allow full utilization >> > of the device for extended periods of time for QoS. During idle or low >> > use periods the burst allowance accrues, and then tasks can burst well >> > beyond the configured throttle up to the limit, afterwards is >> > throttled. A constant throttle speed isn't sufficient for this as you >> > can only burst 1 slice worth, but a limit of sorts is desirable for >> > preventing over utilization of the shared device. This type of limit >> > is also slightly different than what i understand io.low does in local >> > cases in that tg is only high priority/unthrottled if it is bursty, >> > and is limited with constant usage >> > >> > Khazhy >> >> Hi Shaohua, >> >> Does this clarify the reason for this patch? Is this (or something >> similar) a good fit for inclusion in blk-throttle? >> > > So does this brust have to be per cgroup. I mean if thortl_slice was > configurable, that will allow to control t
[RFC PATCH] blk-throttle: add burst allowance
Allows configuration additional bytes or ios before a throttle is triggered. Slice end is extended to cover expended allowance recovery time. Usage would be e.g. per device to allow users to take up to X bytes/ios at full speed, but be limited to Y bps/iops with sustained usage. Signed-off-by: Khazhismel Kumykov --- block/Kconfig| 11 +++ block/blk-throttle.c | 185 --- 2 files changed, 186 insertions(+), 10 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index 3ab42bbb06d5..16545caa7fc9 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -127,6 +127,17 @@ config BLK_DEV_THROTTLING_LOW Note, this is an experimental interface and could be changed someday. +config BLK_DEV_THROTTLING_BURST +bool "Block throttling .burst allowance interface" +depends on BLK_DEV_THROTTLING +default n +---help--- +Add .burst allowance for block throttling. Burst allowance allows for +additional unthrottled usage, while still limiting speed for sustained +usage. + +If in doubt, say N. + config BLK_CMDLINE_PARSER bool "Block device command line partition parser" default n diff --git a/block/blk-throttle.c b/block/blk-throttle.c index d80c3f0144c5..e09ec11e9c5f 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -156,6 +156,11 @@ struct throtl_grp { /* Number of bio's dispatched in current slice */ unsigned int io_disp[2]; +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST + uint64_t bytes_burst_conf[2]; + unsigned int io_burst_conf[2]; +#endif + unsigned long last_low_overflow_time[2]; uint64_t last_bytes_disp[2]; @@ -506,6 +511,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST + tg->bytes_burst_conf[READ] = 0; + tg->bytes_burst_conf[WRITE] = 0; + tg->io_burst_conf[READ] = 0; + tg->io_burst_conf[WRITE] = 0; +#endif /* LIMIT_LOW will have default value 0 */ tg->latency_target = 
DFL_LATENCY_TARGET; @@ -799,6 +810,26 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) tg->slice_end[rw], jiffies); } +/* + * When current slice should end. + * + * With CONFIG_BLK_DEV_THROTTLING_BURST, we will wait longer than min_wait + * for slice to recover used burst allowance. (*_disp -> 0). Setting slice_end + * before this would result in tg receiving additional burst allowance. + */ +static inline unsigned long throtl_slice_wait(struct throtl_grp *tg, bool rw, + unsigned long min_wait) +{ + unsigned long bytes_wait = 0, io_wait = 0; +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST + if (tg->bytes_burst_conf[rw]) + bytes_wait = (tg->bytes_disp[rw] * HZ) / tg_bps_limit(tg, rw); + if (tg->io_burst_conf[rw]) + io_wait = (tg->io_disp[rw] * HZ) / tg_iops_limit(tg, rw); +#endif + return jiffies + max(min_wait, max(bytes_wait, io_wait)); +} + static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { @@ -848,7 +879,8 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * is bad because it does not allow new slice to start. */ - throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); + throtl_set_slice_end(tg, rw, +throtl_slice_wait(tg, rw, tg->td->throtl_slice)); time_elapsed = jiffies - tg->slice_start[rw]; @@ -888,7 +920,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, unsigned long *wait) { bool rw = bio_data_dir(bio); - unsigned int io_allowed; + unsigned int io_allowed, io_disp; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; u64 tmp; @@ -907,6 +939,17 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, * have been trimmed. 
*/ + io_disp = tg->io_disp[rw]; + +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST + if (tg->io_disp[rw] < tg->io_burst_conf[rw]) { + if (wait) + *wait = 0; + return true; + } + io_disp -= tg->io_burst_conf[rw]; +#endif + tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd; do_div(tmp, HZ); @@ -915,14 +958,14 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, else io_allowed = tmp; - if (tg->io_disp[rw] + 1 <= io_allowed) { + if (io_disp + 1 <= io_allowed) { if (wai
[RFC PATCH] blk-throttle: add burst allowance.
Allows configuration additional bytes or ios before a throttle is triggered. This allows implementation of a bucket style rate-limit/throttle on a block device. Previously, bursting to a device was limited to allowance granted in a single throtl_slice (similar to a bucket with limit N and refill rate N/slice). Additional parameters bytes/io_burst_conf defined for tg, which define a number of bytes/ios that must be depleted before throttling happens. A tg that does not deplete this allowance functions as though it has no configured limits. tgs earn additional allowance at rate defined by bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling kicks in. If a tg is idle for a while, it will again have some burst allowance before it gets throttled again. slice_end for a tg is extended until io_disp/byte_disp would fall to 0, when all "used" burst allowance would be earned back. trim_slice still does progress slice_start as before and decrements *_disp as before, and tgs continue to get bytes/ios in throtl_slice intervals. Signed-off-by: Khazhismel Kumykov --- block/Kconfig| 11 +++ block/blk-throttle.c | 192 +++ 2 files changed, 189 insertions(+), 14 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index 28ec55752b68..fbd05b419f93 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -128,6 +128,17 @@ config BLK_DEV_THROTTLING_LOW Note, this is an experimental interface and could be changed someday. +config BLK_DEV_THROTTLING_BURST +bool "Block throttling .burst allowance interface" +depends on BLK_DEV_THROTTLING +default n +---help--- +Add .burst allowance for block throttling. Burst allowance allows for +additional unthrottled usage, while still limiting speed for sustained +usage. + +If in doubt, say N. 
+ config BLK_CMDLINE_PARSER bool "Block device command line partition parser" default n diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 96ad32623427..27c084312772 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -157,6 +157,11 @@ struct throtl_grp { /* Number of bio's dispatched in current slice */ unsigned int io_disp[2]; +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST + uint64_t bytes_burst_conf[2]; + unsigned int io_burst_conf[2]; +#endif + unsigned long last_low_overflow_time[2]; uint64_t last_bytes_disp[2]; @@ -507,6 +512,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST + tg->bytes_burst_conf[READ] = 0; + tg->bytes_burst_conf[WRITE] = 0; + tg->io_burst_conf[READ] = 0; + tg->io_burst_conf[WRITE] = 0; +#endif /* LIMIT_LOW will have default value 0 */ tg->latency_target = DFL_LATENCY_TARGET; @@ -800,6 +811,26 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) tg->slice_end[rw], jiffies); } +/* + * When current slice should end. + * + * With CONFIG_BLK_DEV_THROTTLING_BURST, we will wait longer than min_wait + * for slice to recover used burst allowance. (*_disp -> 0). Setting slice_end + * before this would result in tg receiving additional burst allowance. 
+ */ +static inline unsigned long throtl_slice_wait(struct throtl_grp *tg, bool rw, + unsigned long min_wait) +{ + unsigned long bytes_wait = 0, io_wait = 0; +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST + if (tg->bytes_burst_conf[rw]) + bytes_wait = (tg->bytes_disp[rw] * HZ) / tg_bps_limit(tg, rw); + if (tg->io_burst_conf[rw]) + io_wait = (tg->io_disp[rw] * HZ) / tg_iops_limit(tg, rw); +#endif + return max(min_wait, max(bytes_wait, io_wait)); +} + static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { @@ -849,7 +880,8 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * is bad because it does not allow new slice to start. */ - throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); + throtl_set_slice_end(tg, rw, + jiffies + throtl_slice_wait(tg, rw, tg->td->throtl_slice)); time_elapsed = jiffies - tg->slice_start[rw]; @@ -889,7 +921,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, unsigned long *wait) { bool rw = bio_data_dir(bio); - unsigned int io_allowed; + unsigned int io_allowed, io_disp; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; u64 tmp; @@ -908,6 +940,17 @@ stati
Re: [RFC PATCH] blk-throttle: add burst allowance.
(Botched the To: line, sorry) smime.p7s Description: S/MIME Cryptographic Signature
[PATCH] generic: Add nocheck shutdown stress test
Most shutdown tests only run on filesystems with metadata journaling, so we lose coverage. Add a shutdown stress test that doesn't check for consistency, so does not require journaling. Signed-off-by: Khazhismel Kumykov --- tests/generic/999 | 84 +++ tests/generic/999.out | 2 ++ tests/generic/group | 1 + 3 files changed, 87 insertions(+) create mode 100755 tests/generic/999 create mode 100644 tests/generic/999.out diff --git a/tests/generic/999 b/tests/generic/999 new file mode 100755 index ..71b9aa4c --- /dev/null +++ b/tests/generic/999 @@ -0,0 +1,84 @@ +#!/bin/bash +# FS QA Test No. 999 +# +# Shutdown stress test - exercise shutdown codepath with fsstress, +# make sure we don't BUG/WARN. Coverage for all fs with shutdown. +# +#--- +# Copyright (c) 2013 Red Hat, Inc. All Rights Reserved. +# Copyright (c) 2017 Google, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo "QA output created by $seq" + +here=`pwd` +tmp=/tmp/$$ +status=1 # failure is the default! + +# get standard environment, filters and checks +. ./common/rc +. 
./common/filter + +_cleanup() +{ + cd / + _scratch_unmount 2>/dev/null + rm -f $tmp.* +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# real QA test starts here +_supported_fs generic +_supported_os Linux + +_require_scratch_nocheck +_require_scratch_shutdown +_require_command "$KILLALL_PROG" killall + +rm -f $seqres.full + +_scratch_mkfs > $seqres.full 2>&1 +_scratch_mount + +SLEEP_TIME=$((10 * $TIME_FACTOR)) +PROCS=$((4 * LOAD_FACTOR)) + +load_dir=$SCRATCH_MNT/test + +$XFS_FSR_PROG -v $load_dir >> $seqres.full 2>&1 +$FSSTRESS_PROG $FSSTRESS_AVOID -n1000 -p $PROCS -d $load_dir >> $seqres.full 2>&1 & +sleep $SLEEP_TIME +sync + +# now shutdown and unmount +sleep 5 +$here/src/godown $load_dir +$KILLALL_PROG -q $FSSTRESS_PROG +wait + +# for some reason fsstress processes manage to live on beyond the wait? +sleep 5 + +_scratch_unmount + +echo "No output is good. Failures are loud." + +status=0 +exit diff --git a/tests/generic/999.out b/tests/generic/999.out new file mode 100644 index ..68a51fe7 --- /dev/null +++ b/tests/generic/999.out @@ -0,0 +1,2 @@ +QA output created by 999 +No output is good. Failures are loud. diff --git a/tests/generic/group b/tests/generic/group index f922b496..891386ac 100644 --- a/tests/generic/group +++ b/tests/generic/group @@ -463,3 +463,4 @@ 458 auto quick clone 459 auto dangerous 460 auto quick rw +999 auto shutdown stress -- 2.14.1.821.g8fa685d3b7-goog smime.p7s Description: S/MIME Cryptographic Signature
[PATCH v2] generic: Add nocheck shutdown stress test
Most shutdown tests only run on filesystems with metadata journaling, so we lose coverage. Add a shutdown stress test that doesn't check for consistency, so does not require journaling. This is a modified version of generic/051, with some extras trimmed and fs checking removed. Signed-off-by: Khazhismel Kumykov --- v2: drop defrag, use "silence is golden" tests/generic/999 | 83 +++ tests/generic/999.out | 2 ++ tests/generic/group | 1 + 3 files changed, 86 insertions(+) create mode 100755 tests/generic/999 create mode 100644 tests/generic/999.out diff --git a/tests/generic/999 b/tests/generic/999 new file mode 100755 index ..7b612686 --- /dev/null +++ b/tests/generic/999 @@ -0,0 +1,83 @@ +#!/bin/bash +# FS QA Test No. 999 +# +# Shutdown stress test - exercise shutdown codepath with fsstress, +# make sure we don't BUG/WARN. Coverage for all fs with shutdown. +# +#--- +# Copyright (c) 2013 Red Hat, Inc. All Rights Reserved. +# Copyright (c) 2017 Google, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo "QA output created by $seq" + +here=`pwd` +tmp=/tmp/$$ +status=1 # failure is the default! + +# get standard environment, filters and checks +. ./common/rc +. 
./common/filter + +_cleanup() +{ + cd / + _scratch_unmount 2>/dev/null + rm -f $tmp.* +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# real QA test starts here +_supported_fs generic +_supported_os Linux + +_require_scratch_nocheck +_require_scratch_shutdown +_require_command "$KILLALL_PROG" killall + +rm -f $seqres.full + +_scratch_mkfs > $seqres.full 2>&1 +_scratch_mount + +SLEEP_TIME=$((10 * $TIME_FACTOR)) +PROCS=$((4 * LOAD_FACTOR)) + +load_dir=$SCRATCH_MNT/test + +$FSSTRESS_PROG $FSSTRESS_AVOID -n1000 -p $PROCS -d $load_dir >> $seqres.full 2>&1 & +sleep $SLEEP_TIME +sync + +# now shutdown and unmount +sleep 5 +$here/src/godown $load_dir +$KILLALL_PROG -q $FSSTRESS_PROG +wait + +# for some reason fsstress processes manage to live on beyond the wait? +sleep 5 + +_scratch_unmount + +echo "Silence is golden" + +status=0 +exit diff --git a/tests/generic/999.out b/tests/generic/999.out new file mode 100644 index ..3b276ca8 --- /dev/null +++ b/tests/generic/999.out @@ -0,0 +1,2 @@ +QA output created by 999 +Silence is golden diff --git a/tests/generic/group b/tests/generic/group index f922b496..891386ac 100644 --- a/tests/generic/group +++ b/tests/generic/group @@ -463,3 +463,4 @@ 458 auto quick clone 459 auto dangerous 460 auto quick rw +999 auto shutdown stress -- 2.14.1.821.g8fa685d3b7-goog smime.p7s Description: S/MIME Cryptographic Signature
Re: [RFC PATCH] blk-throttle: add burst allowance.
On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li wrote: > On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote: >> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li wrote: >> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote: >> >> Allows configuration additional bytes or ios before a throttle is >> >> triggered. >> >> >> >> This allows implementation of a bucket style rate-limit/throttle on a >> >> block device. Previously, bursting to a device was limited to allowance >> >> granted in a single throtl_slice (similar to a bucket with limit N and >> >> refill rate N/slice). >> >> >> >> Additional parameters bytes/io_burst_conf defined for tg, which define a >> >> number of bytes/ios that must be depleted before throttling happens. A >> >> tg that does not deplete this allowance functions as though it has no >> >> configured limits. tgs earn additional allowance at rate defined by >> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling >> >> kicks in. If a tg is idle for a while, it will again have some burst >> >> allowance before it gets throttled again. >> >> >> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 0, >> >> when all "used" burst allowance would be earned back. trim_slice still >> >> does progress slice_start as before and decrements *_disp as before, and >> >> tgs continue to get bytes/ios in throtl_slice intervals. >> > >> > Can you describe why we need this? It would be great if you can describe >> > the >> > usage model and an example. Does this work for io.low/io.max or both? >> > >> > Thanks, >> > Shaohua >> > >> >> Use case that brought this up was configuring limits for a remote >> shared device. Bursting beyond io.max is desired but only for so much >> before the limit kicks in, afterwards with sustained usage throughput >> is capped. (This proactively avoids remote-side limits). 
In that case >> one would configure in a root container io.max + io.burst, and >> configure low/other limits on descendants sharing the resource on the >> same node. >> >> With this patch, so long as tg has not dispatched more than the burst, >> no limit is applied at all by that tg, including limit imposed by >> io.low in tg_iops_limit, etc. > > I'd appreciate if you can give more details about the 'why'. 'configuring > limits for a remote shared device' doesn't justify the change. This is to configure a bursty workload (and associated device) with known/allowed expected burst size, but to not allow full utilization of the device for extended periods of time for QoS. During idle or low use periods the burst allowance accrues, and then tasks can burst well beyond the configured throttle up to the limit, afterwards is throttled. A constant throttle speed isn't sufficient for this as you can only burst 1 slice worth, but a limit of sorts is desirable for preventing over utilization of the shared device. This type of limit is also slightly different than what i understand io.low does in local cases in that tg is only high priority/unthrottled if it is bursty, and is limited with constant usage Khazhy smime.p7s Description: S/MIME Cryptographic Signature
Re: [Patch v2 1/2] libiscsi: Fix use-after-free race during iscsi_session_teardown
On Thu, Jul 13, 2017 at 9:11 AM, Khazhismel Kumykov wrote: Ping in case this was missed smime.p7s Description: S/MIME Cryptographic Signature
[PATCH] fs/dcache.c: re-add cond_resched() in shrink_dcache_parent()
shrink_dcache_parent may spin waiting for a parallel shrink_dentry_list. In this case we may have 0 dentries to dispose, so we will never schedule out while waiting for the parallel shrink_dentry_list to complete. Tested that this fixes syzbot reports of stalls in shrink_dcache_parent() Fixes: 32785c0539b7 ("fs/dcache.c: add cond_resched() in shrink_dentry_list()") Reported-by: syzbot+ae80b790eb412884c...@syzkaller.appspotmail.com Cc: Nikolay Borisov Cc: Andrew Morton Cc: David Rientjes Cc: Alexander Viro Cc: Goldwyn Rodrigues Cc: Jeff Mahoney Cc: Davidlohr Bueso Cc: Linus Torvalds Signed-off-by: Khazhismel Kumykov --- fs/dcache.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/dcache.c b/fs/dcache.c index 591b34500e41..3507badeb60a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1489,6 +1489,7 @@ void shrink_dcache_parent(struct dentry *parent) break; shrink_dentry_list(&data.dispose); + cond_resched(); } } EXPORT_SYMBOL(shrink_dcache_parent); -- 2.17.0.484.g0c8726318c-goog smime.p7s Description: S/MIME Cryptographic Signature
Re: [PATCH] fs/dcache.c: re-add cond_resched() in shrink_dcache_parent()
On Sun, Apr 15, 2018 at 3:34 PM, Al Viro wrote: > On Sun, Apr 15, 2018 at 10:54:55PM +0100, Al Viro wrote: >> On Sun, Apr 15, 2018 at 09:40:54PM +0100, Al Viro wrote: >> >> > BTW, the current placement of cond_resched() looks bogus; suppose we >> > have collected a lot of victims and ran into need_resched(). We leave >> > d_walk() and call shrink_dentry_list(). At that point there's a lot >> > of stuff on our shrink list and anybody else running into them will >> > have to keep scanning. Giving up the timeslice before we take care >> > of any of those looks like a bad idea, to put it mildly, and that's >> > precisely what will happen. >> > >> > What about doing that in the end of __dentry_kill() instead? And to >> > hell with both existing call sites - dput() one (before going to >> > the parent) is obviously covered by that (dentry_kill() only returns >> > non-NULL after having called __dentry_kill()) and in shrink_dentry_list() >> > we'll get to it as soon as we go through all dentries that can be >> > immediately kicked off the shrink list. Which, AFAICS, improves the >> > situation, now that shrink_lock_dentry() contains no trylock loops... >> > >> > Comments? >> >> What I mean is something like this (cumulative diff, it'll obviously need >> to be carved up into 3--4 commits): > > ... and carved-up version is in vfs.git#work.dcache. Could syzbot folks > hit it with their reproducers? To me it seems like shrink_dcache_parent could still spin without scheduling similar to before - found > 0 since *someone* is shrinking, but we have 0 entries to shrink - we never call __dentry_kill or schedule, and just spin. And indeed, the syzbot reproducer @vfs#work.dcache hits a softlockup in shrink_dcache_parent. I'd think re-adding cond_resched to shrink_dcache_parent does make the softlockup go way. It seems to fix the reproducer. Although did we want to terminate the loop in shrink_dcache_parent earlier? smime.p7s Description: S/MIME Cryptographic Signature
[PATCH] block/compat_ioctl: fix range check in BLKGETSIZE
kernel ulong and compat_ulong_t may not be same width. Use type directly to eliminate mismatches. This would result in truncation rather than EFBIG for 32bit mode for large disks. Signed-off-by: Khazhismel Kumykov --- block/compat_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 6ca015f92766..3a2c77f07da8 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -388,7 +388,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) return 0; case BLKGETSIZE: size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) + if ((size >> 9) > ~((compat_ulong_t)0UL)) return -EFBIG; return compat_put_ulong(arg, size >> 9); -- 2.17.0.484.g0c8726318c-goog smime.p7s Description: S/MIME Cryptographic Signature
[PATCH] libiscsi: Fix use-after-free race during iscsi_session_teardown
Session attributes exposed through sysfs were freed before the device was destroyed, resulting in a potential use-after-free. Free these attributes after removing the device. Signed-off-by: Khazhismel Kumykov --- drivers/scsi/libiscsi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 42381adf0769..f9199bebaec7 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -2851,9 +2851,6 @@ EXPORT_SYMBOL_GPL(iscsi_session_setup); /** * iscsi_session_teardown - destroy session, host, and cls_session * @cls_session: iscsi session - * - * The driver must have called iscsi_remove_session before - * calling this. */ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) { @@ -2863,6 +2860,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) iscsi_pool_free(&session->cmdpool); + iscsi_remove_session(session); + kfree(session->password); kfree(session->password_in); kfree(session->username); @@ -2877,7 +2876,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) kfree(session->portal_type); kfree(session->discovery_parent_type); - iscsi_destroy_session(cls_session); + iscsi_free_session(cls_session); + iscsi_host_dec_session_cnt(shost); module_put(owner); } -- 2.13.2.932.g7449e964c-goog smime.p7s Description: S/MIME Cryptographic Signature
[PATCH] libiscsi: Remove iscsi_destroy_session
iscsi_session_teardown was the only user of this function. Function currently is just short for iscsi_remove_session + iscsi_free_session. Signed-off-by: Khazhismel Kumykov --- with "libiscsi: Fix use after free race during iscsi_session_teardown" removing the last user. drivers/scsi/scsi_transport_iscsi.c | 16 include/scsi/scsi_transport_iscsi.h | 1 - 2 files changed, 17 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index a424eaeafeb0..924ac408d8a9 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2210,22 +2210,6 @@ void iscsi_free_session(struct iscsi_cls_session *session) } EXPORT_SYMBOL_GPL(iscsi_free_session); -/** - * iscsi_destroy_session - destroy iscsi session - * @session: iscsi_session - * - * Can be called by a LLD or iscsi_transport. There must not be - * any running connections. - */ -int iscsi_destroy_session(struct iscsi_cls_session *session) -{ - iscsi_remove_session(session); - ISCSI_DBG_TRANS_SESSION(session, "Completing session destruction\n"); - iscsi_free_session(session); - return 0; -} -EXPORT_SYMBOL_GPL(iscsi_destroy_session); - /** * iscsi_create_conn - create iscsi class connection * @session: iscsi cls session diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 6183d20a01fb..b266d2a3bcb1 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -434,7 +434,6 @@ extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost, unsigned int target_id); extern void iscsi_remove_session(struct iscsi_cls_session *session); extern void iscsi_free_session(struct iscsi_cls_session *session); -extern int iscsi_destroy_session(struct iscsi_cls_session *session); extern struct iscsi_cls_conn *iscsi_create_conn(struct iscsi_cls_session *sess, int dd_size, uint32_t cid); extern int iscsi_destroy_conn(struct iscsi_cls_conn *conn); -- 2.13.2.932.g7449e964c-goog 
smime.p7s Description: S/MIME Cryptographic Signature
[Patch v2 2/2] libiscsi: Remove iscsi_destroy_session
iscsi_session_teardown was the only user of this function. Function currently is just short for iscsi_remove_session + iscsi_free_session. Signed-off-by: Khazhismel Kumykov --- drivers/scsi/scsi_transport_iscsi.c | 16 include/scsi/scsi_transport_iscsi.h | 1 - 2 files changed, 17 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index a424eaeafeb0..924ac408d8a9 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2210,22 +2210,6 @@ void iscsi_free_session(struct iscsi_cls_session *session) } EXPORT_SYMBOL_GPL(iscsi_free_session); -/** - * iscsi_destroy_session - destroy iscsi session - * @session: iscsi_session - * - * Can be called by a LLD or iscsi_transport. There must not be - * any running connections. - */ -int iscsi_destroy_session(struct iscsi_cls_session *session) -{ - iscsi_remove_session(session); - ISCSI_DBG_TRANS_SESSION(session, "Completing session destruction\n"); - iscsi_free_session(session); - return 0; -} -EXPORT_SYMBOL_GPL(iscsi_destroy_session); - /** * iscsi_create_conn - create iscsi class connection * @session: iscsi cls session diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 6183d20a01fb..b266d2a3bcb1 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -434,7 +434,6 @@ extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost, unsigned int target_id); extern void iscsi_remove_session(struct iscsi_cls_session *session); extern void iscsi_free_session(struct iscsi_cls_session *session); -extern int iscsi_destroy_session(struct iscsi_cls_session *session); extern struct iscsi_cls_conn *iscsi_create_conn(struct iscsi_cls_session *sess, int dd_size, uint32_t cid); extern int iscsi_destroy_conn(struct iscsi_cls_conn *conn); -- 2.13.2.932.g7449e964c-goog smime.p7s Description: S/MIME Cryptographic Signature
[Patch v2 1/2] libiscsi: Fix use-after-free race during iscsi_session_teardown
Session attributes exposed through sysfs were freed before the device was destroyed, resulting in a potential use-after-free. Free these attributes after removing the device. Signed-off-by: Khazhismel Kumykov --- drivers/scsi/libiscsi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 42381adf0769..8696a51a5a0c 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -2851,9 +2851,6 @@ EXPORT_SYMBOL_GPL(iscsi_session_setup); /** * iscsi_session_teardown - destroy session, host, and cls_session * @cls_session: iscsi session - * - * The driver must have called iscsi_remove_session before - * calling this. */ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) { @@ -2863,6 +2860,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) iscsi_pool_free(&session->cmdpool); + iscsi_remove_session(cls_session); + kfree(session->password); kfree(session->password_in); kfree(session->username); @@ -2877,7 +2876,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) kfree(session->portal_type); kfree(session->discovery_parent_type); - iscsi_destroy_session(cls_session); + iscsi_free_session(cls_session); + iscsi_host_dec_session_cnt(shost); module_put(owner); } -- 2.13.2.932.g7449e964c-goog smime.p7s Description: S/MIME Cryptographic Signature
Re: Hang/soft lockup in d_invalidate with simultaneous calls
On Fri, Jun 2, 2017 at 6:12 PM, Al Viro wrote: > Part of that could be relieved if we turned check_and_drop() into > static void check_and_drop(void *_data) > { > struct detach_data *data = _data; > > if (!data->mountpoint && list_empty(>select.dispose)) > __d_drop(data->select.start); > } So with this change, d_invalidate will drop the starting dentry before all it's children are dropped? Would it make sense to just drop it right off the bat, and let one task handle shrinking all it's children? > > It doesn't solve the entire problem, though - we still could get too > many threads into that thing before any of them gets to that __d_drop(). Yes, the change isn't sufficient for my repro, as many threads get to the loop before the drop, although new tasks don't get stuck in the same loop after the dentry is dropped. > I wonder if we should try and collect more victims; that could lead > to contention of its own, but... >From my understanding, the contention is the worst when one task is shrinking everything, and several other tasks are busily looping walking the dentry until everything is done. In this case, the tasks busily looping d_walk hold the d_lock for a dentry while walking over all it's children, then soon after it finishes the d_walk, it queues again to walk again, while shrink_dentry_list releases and re-grabs for each entry. If we limit the number of items we pass to shrink_dentry_list at one time things actually look quite a bit better. e.g., in testing arbitrarily limiting select.dispose to 1000 elements does fix my repro. e.g. 
diff --git a/fs/dcache.c b/fs/dcache.c index 22af360ceca3..3892e0eb7ec2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1367,6 +1367,7 @@ struct select_data { struct dentry *start; struct list_head dispose; int found; + int actually_found; }; static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) @@ -1388,6 +1389,7 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (!dentry->d_lockref.count) { d_shrink_add(dentry, >dispose); data->found++; + data->actually_found++; } } /* @@ -1397,6 +1399,9 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) */ if (!list_empty(>dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; + + if (data->actually_found > 1000) + ret = D_WALK_QUIT; out: return ret; } @@ -1415,6 +1420,7 @@ void shrink_dcache_parent(struct dentry *parent) INIT_LIST_HEAD(); data.start = parent; data.found = 0; + data.actually_found = 0; d_walk(parent, , select_collect, NULL); if (!data.found) @@ -1536,6 +1542,7 @@ void d_invalidate(struct dentry *dentry) INIT_LIST_HEAD(); data.select.start = dentry; data.select.found = 0; + data.select.actually_found = 0; d_walk(dentry, , detach_and_collect, check_and_drop); smime.p7s Description: S/MIME Cryptographic Signature
Re: Hang/soft lockup in d_invalidate with simultaneous calls
On Fri, Jun 2, 2017 at 11:20 PM, Al Viro wrote: > The thing is, unlike shrink_dcache_parent() we *can* bugger off as > soon as we'd found no victims, nothing mounted and dentry itself > is unhashed. We can't do anything in select_collect() (we would've > broken shrink_dcache_parent() that way), but we can do unhashing > in check_and_drop() in "really nothing to do" case and we can return > from d_invalidate() after that. So how about this: That does the trick. smime.p7s Description: S/MIME Cryptographic Signature
Re: [RFC PATCH] blk-throttle: add burst allowance.
On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov wrote: > On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li wrote: >> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote: >>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li wrote: >>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote: >>> >> Allows configuration additional bytes or ios before a throttle is >>> >> triggered. >>> >> >>> >> This allows implementation of a bucket style rate-limit/throttle on a >>> >> block device. Previously, bursting to a device was limited to allowance >>> >> granted in a single throtl_slice (similar to a bucket with limit N and >>> >> refill rate N/slice). >>> >> >>> >> Additional parameters bytes/io_burst_conf defined for tg, which define a >>> >> number of bytes/ios that must be depleted before throttling happens. A >>> >> tg that does not deplete this allowance functions as though it has no >>> >> configured limits. tgs earn additional allowance at rate defined by >>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling >>> >> kicks in. If a tg is idle for a while, it will again have some burst >>> >> allowance before it gets throttled again. >>> >> >>> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 0, >>> >> when all "used" burst allowance would be earned back. trim_slice still >>> >> does progress slice_start as before and decrements *_disp as before, and >>> >> tgs continue to get bytes/ios in throtl_slice intervals. >>> > >>> > Can you describe why we need this? It would be great if you can describe >>> > the >>> > usage model and an example. Does this work for io.low/io.max or both? >>> > >>> > Thanks, >>> > Shaohua >>> > >>> >>> Use case that brought this up was configuring limits for a remote >>> shared device. Bursting beyond io.max is desired but only for so much >>> before the limit kicks in, afterwards with sustained usage throughput >>> is capped. (This proactively avoids remote-side limits). 
In that case >>> one would configure in a root container io.max + io.burst, and >>> configure low/other limits on descendants sharing the resource on the >>> same node. >>> >>> With this patch, so long as tg has not dispatched more than the burst, >>> no limit is applied at all by that tg, including limit imposed by >>> io.low in tg_iops_limit, etc. >> >> I'd appreciate if you can give more details about the 'why'. 'configuring >> limits for a remote shared device' doesn't justify the change. > > This is to configure a bursty workload (and associated device) with > known/allowed expected burst size, but to not allow full utilization > of the device for extended periods of time for QoS. During idle or low > use periods the burst allowance accrues, and then tasks can burst well > beyond the configured throttle up to the limit, afterwards is > throttled. A constant throttle speed isn't sufficient for this as you > can only burst 1 slice worth, but a limit of sorts is desirable for > preventing over utilization of the shared device. This type of limit > is also slightly different than what i understand io.low does in local > cases in that tg is only high priority/unthrottled if it is bursty, > and is limited with constant usage > > Khazhy Ping this time without html (oops) smime.p7s Description: S/MIME Cryptographic Signature
Re: [PATCH v2] block: grant IOPRIO_CLASS_RT to CAP_SYS_NICE
On Sat, Aug 29, 2020 at 6:00 PM Bart Van Assche wrote: > > From https://www.kernel.org/doc/man-pages/linux-api-ml.html: > "all Linux kernel patches that change userspace interfaces should be CCed > to linux-...@vger.kernel.org" > > So I have added the linux-api mailing list to the Cc-list. Anyway: Thanks, sorry for missing that! > > Reviewed-by: Bart Van Assche Jens, does this change look good? khazhy smime.p7s Description: S/MIME Cryptographic Signature
Re: [PATCH v2 1/3] fuse: on 64-bit store time in d_fsdata directly
On Thu, Aug 22, 2019 at 1:00 PM Khazhismel Kumykov wrote: > > Implements the optimization noted in f75fdf22b0a8 ("fuse: don't use > ->d_time"), as the additional memory can be significant. (In particular, > on SLAB configurations this 8-byte alloc becomes 32 bytes). Per-dentry, > this can consume significant memory. > > Reviewed-by: Shakeel Butt > Signed-off-by: Khazhismel Kumykov > --- > fs/fuse/dir.c | 19 +++ > 1 file changed, 19 insertions(+) > ping? smime.p7s Description: S/MIME Cryptographic Signature
[PATCH] block: grant IOPRIO_CLASS_RT to CAP_SYS_NICE
CAP_SYS_ADMIN is too broad, and ionice fits into CAP_SYS_NICE's grouping. Retain CAP_SYS_ADMIN permission for backwards compatibility. Signed-off-by: Khazhismel Kumykov --- block/ioprio.c | 2 +- include/uapi/linux/capability.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/block/ioprio.c b/block/ioprio.c index 77bcab11dce5..4572456430f9 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -69,7 +69,7 @@ int ioprio_check_cap(int ioprio) switch (class) { case IOPRIO_CLASS_RT: - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_NICE)) return -EPERM; /* fall through */ /* rt has prio field too */ diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 395dd0df8d08..c6ca33034147 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -288,6 +288,8 @@ struct vfs_ns_cap_data { processes and setting the scheduling algorithm used by another process. */ /* Allow setting cpu affinity on other processes */ +/* Allow setting realtime ioprio class */ +/* Allow setting ioprio class on other processes */ #define CAP_SYS_NICE 23 -- 2.28.0.297.g1956fa8f8d-goog
Re: IOPRIO_CLASS_RT without CAP_SYS_ADMIN?
On Sat, Aug 22, 2020 at 7:14 PM Jens Axboe wrote: > > On 8/22/20 7:58 PM, Bart Van Assche wrote: > > On 2020-08-20 17:35, Khazhismel Kumykov wrote: > >> It'd be nice to allow a process to send RT requests without granting > >> it the wide capabilities of CAP_SYS_ADMIN, and we already have a > >> capability which seems to almost fit this priority idea - > >> CAP_SYS_NICE? Would this fit there? > >> > >> Being capable of setting IO priorities on per request or per thread > >> basis (be it async submission or w/ thread ioprio_set) is useful > >> especially when the userspace has its own prioritization/scheduling > >> before hitting the kernel, allowing us to signal to the kernel how to > >> order certain IOs, and it'd be nice to separate this from ADMIN for > >> non-root processes, in a way that's less error prone than e.g. having > >> a trusted launcher ionice the process and then drop priorities for > >> everything but prio requests. > > > > Hi Khazhy, > > > > In include/uapi/linux/capability.h I found the following: > > > > /* Allow raising priority and setting priority on other (different > >UID) processes */ > > /* Allow use of FIFO and round-robin (realtime) scheduling on own > >processes and setting the scheduling algorithm used by another > >process. */ > > /* Allow setting cpu affinity on other processes */ > > #define CAP_SYS_NICE 23 > > > > If it is acceptable that every process that has permission to submit > > IOPRIO_CLASS_RT I/O also has permission to modify the priority of > > other processes then extending CAP_SYS_NICE is an option. Another > > possibility is to extend the block cgroup controller such that the > > capability to submit IOPRIO_CLASS_RT I/O can be enabled through the > > cgroup interface. There may be other approaches. I'm not sure what > > the best approach is. I think it fits well with CAP_SYS_NICE, especially since that capability already grants the ability to demote other processes to IOPRIO_CLASS_IDLE, etc. 
> > I think CAP_SYS_NICE fits pretty nicely, and I was actually planning on > using that for the io_uring SQPOLL side as well. So there is/will be > some precedent for tying it into IO related things, too. For this use > case, I think it's perfect. > > -- > Jens Axboe > smime.p7s Description: S/MIME Cryptographic Signature
Re: [PATCH] block: grant IOPRIO_CLASS_RT to CAP_SYS_NICE
On Mon, Aug 24, 2020 at 1:45 PM Khazhismel Kumykov wrote: > > CAP_SYS_ADMIN is too broad, and ionice fits into CAP_SYS_NICE's grouping. > > Retain CAP_SYS_ADMIN permission for backwards compatibility. > > Signed-off-by: Khazhismel Kumykov > --- > block/ioprio.c | 2 +- > include/uapi/linux/capability.h | 2 ++ > 2 files changed, 3 insertions(+), 1 deletion(-) > > diff --git a/block/ioprio.c b/block/ioprio.c > index 77bcab11dce5..4572456430f9 100644 > --- a/block/ioprio.c > +++ b/block/ioprio.c > @@ -69,7 +69,7 @@ int ioprio_check_cap(int ioprio) > > switch (class) { > case IOPRIO_CLASS_RT: > - if (!capable(CAP_SYS_ADMIN)) > + if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_NICE)) yikes, sorry for the spam > return -EPERM; > /* fall through */ > /* rt has prio field too */ > diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h > index 395dd0df8d08..c6ca33034147 100644 > --- a/include/uapi/linux/capability.h > +++ b/include/uapi/linux/capability.h > @@ -288,6 +288,8 @@ struct vfs_ns_cap_data { > processes and setting the scheduling algorithm used by another > process. */ > /* Allow setting cpu affinity on other processes */ > +/* Allow setting realtime ioprio class */ > +/* Allow setting ioprio class on other processes */ > > #define CAP_SYS_NICE 23 > > -- > 2.28.0.297.g1956fa8f8d-goog > smime.p7s Description: S/MIME Cryptographic Signature
[PATCH v2] block: grant IOPRIO_CLASS_RT to CAP_SYS_NICE
CAP_SYS_ADMIN is too broad, and ionice fits into CAP_SYS_NICE's grouping. Retain CAP_SYS_ADMIN permission for backwards compatibility. Signed-off-by: Khazhismel Kumykov --- block/ioprio.c | 2 +- include/uapi/linux/capability.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) v2: fix embarrassing logic mistake diff --git a/block/ioprio.c b/block/ioprio.c index 77bcab11dce5..276496246fe9 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -69,7 +69,7 @@ int ioprio_check_cap(int ioprio) switch (class) { case IOPRIO_CLASS_RT: - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_NICE) && !capable(CAP_SYS_ADMIN)) return -EPERM; /* fall through */ /* rt has prio field too */ diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 395dd0df8d08..c6ca33034147 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -288,6 +288,8 @@ struct vfs_ns_cap_data { processes and setting the scheduling algorithm used by another process. */ /* Allow setting cpu affinity on other processes */ +/* Allow setting realtime ioprio class */ +/* Allow setting ioprio class on other processes */ #define CAP_SYS_NICE 23 -- 2.28.0.297.g1956fa8f8d-goog
IOPRIO_CLASS_RT without CAP_SYS_ADMIN?
It'd be nice to allow a process to send RT requests without granting it the wide capabilities of CAP_SYS_ADMIN, and we already have a capability which seems to almost fit this priority idea - CAP_SYS_NICE? Would this fit there? Being capable of setting IO priorities on per request or per thread basis (be it async submission or w/ thread ioprio_set) is useful especially when the userspace has its own prioritization/scheduling before hitting the kernel, allowing us to signal to the kernel how to order certain IOs, and it'd be nice to separate this from ADMIN for non-root processes, in a way that's less error prone than e.g. having a trusted launcher ionice the process and then drop priorities for everything but prio requests. khazhy smime.p7s Description: S/MIME Cryptographic Signature
[PATCH v3 2/2] fuse: kmemcg account fs data
account per-file, dentry, and inode data blockdev/superblock and temporary per-request data was left alone, as this usually isn't accounted Signed-off-by: Khazhismel Kumykov Reviewed-by: Shakeel Butt --- fs/fuse/dir.c | 3 ++- fs/fuse/file.c | 5 +++-- fs/fuse/inode.c | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 58557d4817e9..d572c900bb0f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -279,7 +279,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) #if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { - dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); + dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); return dentry->d_fsdata ? 0 : -ENOMEM; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a2ea347c4d2c..862aff3665b5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -63,12 +63,13 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; - ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT); if (unlikely(!ff)) return NULL; ff->fc = fc; - ff->release_args = kzalloc(sizeof(*ff->release_args), GFP_KERNEL); + ff->release_args = kzalloc(sizeof(*ff->release_args), + GFP_KERNEL_ACCOUNT); if (!ff->release_args) { kfree(ff); return NULL; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3d598a5bb5b5..6cb445bed89d 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -66,7 +66,8 @@ static struct file_system_type fuseblk_fs_type; struct fuse_forget_link *fuse_alloc_forget(void) { - return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); + return kzalloc(sizeof(struct fuse_forget_link), + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); } static struct inode *fuse_alloc_inode(struct super_block *sb) -- 2.23.0.237.gc6a4ce50a0-goog
[PATCH v3 1/2] fuse: on 64-bit store time in d_fsdata directly
Implements the optimization noted in f75fdf22b0a8 ("fuse: don't use ->d_time"), as the additional memory can be significant. (In particular, on SLAB configurations this 8-byte alloc becomes 32 bytes). Per-dentry, this can consume significant memory. Reviewed-by: Shakeel Butt Signed-off-by: Khazhismel Kumykov --- v3: reapplied on fuse/for-next, droping the fuse_request_alloc refactor it was already done :) (and account new per-file alloc) fs/fuse/dir.c | 36 ++-- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ba0a175d7578..58557d4817e9 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -24,11 +24,34 @@ static void fuse_advise_use_readdirplus(struct inode *dir) set_bit(FUSE_I_ADVISE_RDPLUS, >state); } +#if BITS_PER_LONG >= 64 +static inline void __fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_fsdata = (void *) time; +} + +static inline u64 fuse_dentry_time(const struct dentry *entry) +{ + return (u64)entry->d_fsdata; +} + +#else union fuse_dentry { u64 time; struct rcu_head rcu; }; +static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time) +{ + ((union fuse_dentry *) dentry->d_fsdata)->time = time; +} + +static inline u64 fuse_dentry_time(const struct dentry *entry) +{ + return ((union fuse_dentry *) entry->d_fsdata)->time; +} +#endif + static void fuse_dentry_settime(struct dentry *dentry, u64 time) { struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb); @@ -47,12 +70,7 @@ static void fuse_dentry_settime(struct dentry *dentry, u64 time) spin_unlock(>d_lock); } - ((union fuse_dentry *) dentry->d_fsdata)->time = time; -} - -static inline u64 fuse_dentry_time(const struct dentry *entry) -{ - return ((union fuse_dentry *) entry->d_fsdata)->time; + __fuse_dentry_settime(dentry, time); } /* @@ -258,6 +276,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) goto out; } +#if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { 
dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); @@ -270,6 +289,7 @@ static void fuse_dentry_release(struct dentry *dentry) kfree_rcu(fd, rcu); } +#endif static int fuse_dentry_delete(const struct dentry *dentry) { @@ -279,13 +299,17 @@ static int fuse_dentry_delete(const struct dentry *dentry) const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, .d_delete = fuse_dentry_delete, +#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, +#endif }; const struct dentry_operations fuse_root_dentry_operations = { +#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, +#endif }; int fuse_valid_type(int m) -- 2.23.0.237.gc6a4ce50a0-goog
Re: [PATCH v3 2/2] fuse: kmemcg account fs data
On Tue, Sep 17, 2019 at 12:52 AM Miklos Szeredi wrote: > > On Tue, Sep 17, 2019 at 1:56 AM Khazhismel Kumykov wrote: > > struct fuse_forget_link *fuse_alloc_forget(void) > > { > > - return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); > > + return kzalloc(sizeof(struct fuse_forget_link), > > + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); > > What does __GFP_RECLAIMABLE signify in slab allocs? > Marking these allocations reclaimable hints to mm how much we can reclaim overall by shrinking, so if it is reclaimable we should mark it as such. For d_fsdata, the lifetime is linked to the dentry, which is reclaimable, so it makes sense here. > You understand that the forget_link is not reclaimable in the sense, > that it requires action (reading requests from the fuse device) from > the userspace filesystem daemon? > Ah, I see, whenever we evict the fuse_inode, we queue a forget command, which usually waits for userspace. So it's not actually linked to the inode itself, and yeah, if we need userspace to intervene we shouldn't treat forget_link as reclaimable. I had figured since fuse_inode is reclaimable, this should be too, but missed that disconnect, thanks. smime.p7s Description: S/MIME Cryptographic Signature
[PATCH v4] fuse: kmemcg account fs data
account per-file, dentry, and inode data blockdev/superblock and temporary per-request data was left alone, as this usually isn't accounted Reviewed-by: Shakeel Butt Signed-off-by: Khazhismel Kumykov --- fs/fuse/dir.c | 3 ++- fs/fuse/file.c | 5 +++-- fs/fuse/inode.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 58557d4817e9..d572c900bb0f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -279,7 +279,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) #if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { - dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); + dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); return dentry->d_fsdata ? 0 : -ENOMEM; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8c7578b95d2c..0f0225686aee 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -63,12 +63,13 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; - ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT); if (unlikely(!ff)) return NULL; ff->fc = fc; - ff->release_args = kzalloc(sizeof(*ff->release_args), GFP_KERNEL); + ff->release_args = kzalloc(sizeof(*ff->release_args), + GFP_KERNEL_ACCOUNT); if (!ff->release_args) { kfree(ff); return NULL; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3d598a5bb5b5..e040e2a2b621 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -66,7 +66,7 @@ static struct file_system_type fuseblk_fs_type; struct fuse_forget_link *fuse_alloc_forget(void) { - return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT); } static struct inode *fuse_alloc_inode(struct super_block *sb) -- 2.23.0.237.gc6a4ce50a0-goog
Re: [dm-devel] [PATCH] dm mpath selector: more evenly distribute ties
On Wed, Jan 24, 2018 at 2:57 AM, Martin Wilck wrote: > On Fri, 2018-01-19 at 15:07 -0800, Khazhismel Kumykov wrote: >> Move the last used path to the end of the list (least preferred) so >> that >> ties are more evenly distributed. >> >> For example, in case with three paths with one that is slower than >> others, the remaining two would be unevenly used if they tie. This is >> due to the rotation not being a truly fair distribution. >> >> Illustrated: paths a, b, c, 'c' has 1 outstanding IO, a and b are >> 'tied' >> Three possible rotations: >> (a, b, c) -> best path 'a' >> (b, c, a) -> best path 'b' >> (c, a, b) -> best path 'a' >> (a, b, c) -> best path 'a' >> (b, c, a) -> best path 'b' >> (c, a, b) -> best path 'a' >> ... > > > This happens only if a and b actually have the same weight (e.g. queue > length for the queue-length selector). If 'a' really receives more IO, > its queue grows, and the selector will start preferring 'b', so the > effect should level out automatically with the current code as soon as > you have real IO going on. But maybe I haven't grasped what you're > referring to as "tied". Yes, for "tied" I'm referring to paths with the same weight. As queue length grows it does tend to level out, but in the case where queue length doesn't grow (in this example I'm imagining 2 concurrent requests to the device) the bias does show and the selectors really do send 'a' 2x more requests than 'b' (when 'c' is much slower and 'a' and 'b' are ~same speed). > > OTOH, if the "best" path has much lower queue length than the other > paths for whatever reason, your pushing it to the tail will require a > full list walk with every new call of the selector. I see that as a > small disadvantage of your approach. I see, with best at the tail, we would not see as much benefit from the check if a path has no IOs on it in queue-length. In service-time no such short circuit exists, so I don't believe anything changes there. Am I understanding this correctly? 
> > Regards > Martin > Thanks for your comments, Khazhy >> >> So 'a' is used 2x more than 'b', although they should be used evenly. >> >> With this change, the most recently used path is always the least >> preferred, removing this bias resulting in even distribution. >> (a, b, c) -> best path 'a' >> (b, c, a) -> best path 'b' >> (c, a, b) -> best path 'a' >> (c, b, a) -> best path 'b' >> ... >> >> Signed-off-by: Khazhismel Kumykov >> --- >> drivers/md/dm-queue-length.c | 6 +++--- >> drivers/md/dm-service-time.c | 6 +++--- >> 2 files changed, 6 insertions(+), 6 deletions(-) >> >> diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue- >> length.c >> index 23f178641794..969c4f1a3633 100644 >> --- a/drivers/md/dm-queue-length.c >> +++ b/drivers/md/dm-queue-length.c >> @@ -195,9 +195,6 @@ static struct dm_path *ql_select_path(struct >> path_selector *ps, size_t nr_bytes) >> if (list_empty(>valid_paths)) >> goto out; >> >> - /* Change preferred (first in list) path to evenly balance. >> */ >> - list_move_tail(s->valid_paths.next, >valid_paths); >> - >> list_for_each_entry(pi, >valid_paths, list) { >> if (!best || >> (atomic_read(>qlen) < atomic_read( >> >qlen))) >> @@ -210,6 +207,9 @@ static struct dm_path *ql_select_path(struct >> path_selector *ps, size_t nr_bytes) >> if (!best) >> goto out; >> >> + /* Move most recently used to least preferred to evenly >> balance. */ >> + list_move_tail(>list, >valid_paths); >> + >> ret = best->path; >> out: >> spin_unlock_irqrestore(>lock, flags); >> diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service- >> time.c >> index 7b8642045c55..f006a9005593 100644 >> --- a/drivers/md/dm-service-time.c >> +++ b/drivers/md/dm-service-time.c >> @@ -282,9 +282,6 @@ static struct dm_path *st_select_path(struct >> path_selector *ps, size_t nr_bytes) >> if (list_empty(>valid_paths)) >> goto out; >> >> - /* Change preferred (first in list) path to evenly balance. 
>> */ >> - list_move_tail(s->valid_paths.next, >valid_paths); >> - >> list_for_each_entry(pi, >valid_paths, list) >> if (!best || (st_compare_load(pi, best, nr_bytes) < >> 0)) >> best = pi; >> @@ -292,6 +289,9 @@ static struct dm_path *st_select_path(struct >> path_selector *ps, size_t nr_bytes) >> if (!best) >> goto out; >> >> + /* Move most recently used to least preferred to evenly >> balance. */ >> + list_move_tail(>list, >valid_paths); >> + >> ret = best->path; >> out: >> spin_unlock_irqrestore(>lock, flags); >> -- >> dm-devel mailing list >> dm-de...@redhat.com >> https://www.redhat.com/mailman/listinfo/dm-devel > > -- > Dr. Martin Wilck , Tel. +49 (0)911 74053 2107 > SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton > HRB 21284 (AG Nürnberg) > smime.p7s Description: S/MIME Cryptographic Signature
Re: [dm-devel] [PATCH] dm mpath selector: more evenly distribute ties
On Wed, Jan 24, 2018 at 11:09 AM, Martin Wilck wrote: > On Wed, 2018-01-24 at 10:44 -0800, Khazhismel Kumykov wrote: >> On Wed, Jan 24, 2018 at 2:57 AM, Martin Wilck >> wrote: >> > On Fri, 2018-01-19 at 15:07 -0800, Khazhismel Kumykov wrote: >> > > Move the last used path to the end of the list (least preferred) >> > > so >> > > that >> > > ties are more evenly distributed. >> > > >> > > For example, in case with three paths with one that is slower >> > > than >> > > others, the remaining two would be unevenly used if they tie. >> > > This is >> > > due to the rotation not being a truely fair distribution. >> > > >> > > Illustrated: paths a, b, c, 'c' has 1 outstanding IO, a and b are >> > > 'tied' >> > > Three possible rotations: >> > > (a, b, c) -> best path 'a' >> > > (b, c, a) -> best path 'b' >> > > (c, a, b) -> best path 'a' >> > > (a, b, c) -> best path 'a' >> > > (b, c, a) -> best path 'b' >> > > (c, a, b) -> best path 'a' >> > > ... >> > >> > >> > This happens only if a and b actually have the same weight (e.g. >> > queue >> > length for the queue-length selector). If 'a' really receives more >> > IO, >> > its queue grows, and the selector will start preferring 'b', so the >> > effect should level out automatically with the current code as soon >> > as >> > you have real IO going on. But maybe I haven't grasped what you're >> > referring to as "tied". >> >> Yes, for "tied" I'm referring to paths with the same weight. As queue >> length grows it does tend to level out, but in the case where queue >> length doesn't grow (in this example I'm imagining 2 concurrent >> requests to the device) the bias does show and the selectors really >> do >> send 'a' 2x more requests than 'b' (when 'c' is much slower and 'a' >> and 'b' are ~same speed). > > Have you actually observed this? I find the idea pretty academical that > two paths would be walking "tied" this way. 
In practice, under IO load, > I'd expect queue lengths (and service-times, for that matter) to > fluctuate enough to prevent this effect to be measurable. But of > course, I may be wrong. If you really did observe this, the next > question would be: does hurt performance to an extent that can be > noticed/measured? I reckon that if 'a' got saturated under the load, > hurting performance, its queue length would grow quickly and 'b' would > automatically get preferred. This is fairly simple to observe - start two sync reader threads against a device with 3 backing paths, with one path taking longer on average to complete requests than the other two. One of the 'faster' paths will be used ~2x more than the other. Perhaps not that common a situation, but is a real one. The bias seemed simple to remove, so that the two (or N) paths would be used equally. I don't see a downside to more evenly distributing in this case, although I can't say I've directly observed performance downsides for a biased distribution (aside from the distribution being biased itself being a downside). The bias of note is inherent to the order paths are added to the selector (and which path is 'always bad'), so if 'a' is saturated due to this, then slows, once it recovers it will continue to be preferred, versus in an even distribution. Khazhy smime.p7s Description: S/MIME Cryptographic Signature
Re: [RFC PATCH] blk-throttle: add burst allowance.
On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li wrote: > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote: >> Allows configuration additional bytes or ios before a throttle is >> triggered. >> >> This allows implementation of a bucket style rate-limit/throttle on a >> block device. Previously, bursting to a device was limited to allowance >> granted in a single throtl_slice (similar to a bucket with limit N and >> refill rate N/slice). >> >> Additional parameters bytes/io_burst_conf defined for tg, which define a >> number of bytes/ios that must be depleted before throttling happens. A >> tg that does not deplete this allowance functions as though it has no >> configured limits. tgs earn additional allowance at rate defined by >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling >> kicks in. If a tg is idle for a while, it will again have some burst >> allowance before it gets throttled again. >> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 0, >> when all "used" burst allowance would be earned back. trim_slice still >> does progress slice_start as before and decrements *_disp as before, and >> tgs continue to get bytes/ios in throtl_slice intervals. > > Can you describe why we need this? It would be great if you can describe the > usage model and an example. Does this work for io.low/io.max or both? > > Thanks, > Shaohua > Use case that brought this up was configuring limits for a remote shared device. Bursting beyond io.max is desired but only for so much before the limit kicks in, afterwards with sustained usage throughput is capped. (This proactively avoids remote-side limits). In that case one would configure in a root container io.max + io.burst, and configure low/other limits on descendants sharing the resource on the same node. With this patch, so long as tg has not dispatched more than the burst, no limit is applied at all by that tg, including limit imposed by io.low in tg_iops_limit, etc. 
Khazhy >> Signed-off-by: Khazhismel Kumykov >> --- >> block/Kconfig| 11 +++ >> block/blk-throttle.c | 192 >> +++ >> 2 files changed, 189 insertions(+), 14 deletions(-) >> >> diff --git a/block/Kconfig b/block/Kconfig >> index 28ec55752b68..fbd05b419f93 100644 >> --- a/block/Kconfig >> +++ b/block/Kconfig >> @@ -128,6 +128,17 @@ config BLK_DEV_THROTTLING_LOW >> >> Note, this is an experimental interface and could be changed someday. >> >> +config BLK_DEV_THROTTLING_BURST >> +bool "Block throttling .burst allowance interface" >> +depends on BLK_DEV_THROTTLING >> +default n >> +---help--- >> +Add .burst allowance for block throttling. Burst allowance allows for >> +additional unthrottled usage, while still limiting speed for sustained >> +usage. >> + >> +If in doubt, say N. >> + >> config BLK_CMDLINE_PARSER >> bool "Block device command line partition parser" >> default n >> diff --git a/block/blk-throttle.c b/block/blk-throttle.c >> index 96ad32623427..27c084312772 100644 >> --- a/block/blk-throttle.c >> +++ b/block/blk-throttle.c >> @@ -157,6 +157,11 @@ struct throtl_grp { >> /* Number of bio's dispatched in current slice */ >> unsigned int io_disp[2]; >> >> +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST >> + uint64_t bytes_burst_conf[2]; >> + unsigned int io_burst_conf[2]; >> +#endif >> + >> unsigned long last_low_overflow_time[2]; >> >> uint64_t last_bytes_disp[2]; >> @@ -507,6 +512,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t >> gfp, int node) >> tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; >> tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; >> tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; >> +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST >> + tg->bytes_burst_conf[READ] = 0; >> + tg->bytes_burst_conf[WRITE] = 0; >> + tg->io_burst_conf[READ] = 0; >> + tg->io_burst_conf[WRITE] = 0; >> +#endif >> /* LIMIT_LOW will have default value 0 */ >> >> tg->latency_target = DFL_LATENCY_TARGET; >> @@ -800,6 +811,26 @@ static inline void throtl_start_new_slice(struct >> throtl_grp *tg, 
bool rw) >> tg->slice_end[rw], jiffies); >> } >> >> +/* >> + * When current slice should end. >> + * >> + * With CONFIG_BLK_DEV_THROTTLING_BURST, we will wait longer than min_wai
[PATCH] dm mpath selector: more evenly distribute ties
Move the last used path to the end of the list (least preferred) so that ties are more evenly distributed. For example, in case with three paths with one that is slower than others, the remaining two would be unevenly used if they tie. This is due to the rotation not being a truly fair distribution. Illustrated: paths a, b, c, 'c' has 1 outstanding IO, a and b are 'tied' Three possible rotations: (a, b, c) -> best path 'a' (b, c, a) -> best path 'b' (c, a, b) -> best path 'a' (a, b, c) -> best path 'a' (b, c, a) -> best path 'b' (c, a, b) -> best path 'a' ... So 'a' is used 2x more than 'b', although they should be used evenly. With this change, the most recently used path is always the least preferred, removing this bias resulting in even distribution. (a, b, c) -> best path 'a' (b, c, a) -> best path 'b' (c, a, b) -> best path 'a' (c, b, a) -> best path 'b' ... Signed-off-by: Khazhismel Kumykov --- drivers/md/dm-queue-length.c | 6 +++--- drivers/md/dm-service-time.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index 23f178641794..969c4f1a3633 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c @@ -195,9 +195,6 @@ static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes) if (list_empty(&s->valid_paths)) goto out; - /* Change preferred (first in list) path to evenly balance. */ - list_move_tail(s->valid_paths.next, &s->valid_paths); - list_for_each_entry(pi, &s->valid_paths, list) { if (!best || (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) @@ -210,6 +207,9 @@ static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes) if (!best) goto out; + /* Move most recently used to least preferred to evenly balance. 
*/ + list_move_tail(&best->list, &s->valid_paths); + ret = best->path; out: spin_unlock_irqrestore(&s->lock, flags); diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index 7b8642045c55..f006a9005593 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c @@ -282,9 +282,6 @@ static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) if (list_empty(&s->valid_paths)) goto out; - /* Change preferred (first in list) path to evenly balance. */ - list_move_tail(s->valid_paths.next, &s->valid_paths); - list_for_each_entry(pi, &s->valid_paths, list) if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) best = pi; @@ -292,6 +289,9 @@ static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) if (!best) goto out; + /* Move most recently used to least preferred to evenly balance. */ + list_move_tail(&best->list, &s->valid_paths); + ret = best->path; out: spin_unlock_irqrestore(&s->lock, flags); -- 2.16.0.rc1.238.g530d649a79-goog smime.p7s Description: S/MIME Cryptographic Signature
[PATCH 1/3] fuse: on 64-bit store time in d_fsdata directly
Implements the optimization noted in f75fdf22b0a8 ("fuse: don't use ->d_time"), as the additional memory can be significant. (In particular, on SLAB configurations this 8-byte alloc becomes 32 bytes). Per-dentry, this can consume significant memory. Signed-off-by: Khazhismel Kumykov --- fs/fuse/dir.c | 19 +++ 1 file changed, 19 insertions(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index dd0f64f7bc06..f9c59a296568 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -24,6 +24,18 @@ static void fuse_advise_use_readdirplus(struct inode *dir) set_bit(FUSE_I_ADVISE_RDPLUS, >state); } +#if BITS_PER_LONG >= 64 +static inline void fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_fsdata = (void *) time; +} + +static inline u64 fuse_dentry_time(struct dentry *entry) +{ + return (u64)entry->d_fsdata; +} + +#else union fuse_dentry { u64 time; struct rcu_head rcu; @@ -38,6 +50,7 @@ static inline u64 fuse_dentry_time(struct dentry *entry) { return ((union fuse_dentry *) entry->d_fsdata)->time; } +#endif /* * FUSE caches dentries and attributes with separate timeout. The @@ -242,6 +255,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) goto out; } +#if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); @@ -254,16 +268,21 @@ static void fuse_dentry_release(struct dentry *dentry) kfree_rcu(fd, rcu); } +#endif const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, +#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, +#endif }; const struct dentry_operations fuse_root_dentry_operations = { +#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, +#endif }; int fuse_valid_type(int m) -- 2.23.0.187.g17f5b7556c-goog
[PATCH 3/3] fuse: pass gfp flags to fuse_request_alloc
Instead of having a helper per flag Signed-off-by: Khazhismel Kumykov --- fs/fuse/dev.c| 22 +++--- fs/fuse/file.c | 6 +++--- fs/fuse/fuse_i.h | 6 +- fs/fuse/inode.c | 4 ++-- 4 files changed, 9 insertions(+), 29 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a0d166a6596f..c957620ce7ba 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -66,7 +66,7 @@ static struct page **fuse_req_pages_alloc(unsigned int npages, gfp_t flags, return pages; } -static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) +struct fuse_req *fuse_request_alloc(unsigned int npages, gfp_t flags) { struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); if (req) { @@ -90,24 +90,8 @@ static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) } return req; } - -struct fuse_req *fuse_request_alloc(unsigned npages) -{ - return __fuse_request_alloc(npages, GFP_KERNEL); -} EXPORT_SYMBOL_GPL(fuse_request_alloc); -struct fuse_req *fuse_request_alloc_account(unsigned int npages) -{ - return __fuse_request_alloc(npages, GFP_KERNEL_ACCOUNT); -} -EXPORT_SYMBOL_GPL(fuse_request_alloc_account); - -struct fuse_req *fuse_request_alloc_nofs(unsigned npages) -{ - return __fuse_request_alloc(npages, GFP_NOFS); -} - static void fuse_req_pages_free(struct fuse_req *req) { if (req->pages != req->inline_pages) @@ -207,7 +191,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, if (fc->conn_error) goto out; - req = fuse_request_alloc(npages); + req = fuse_request_alloc(npages, GFP_KERNEL); err = -ENOMEM; if (!req) { if (for_background) @@ -316,7 +300,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, wait_event(fc->blocked_waitq, fc->initialized); /* Matches smp_wmb() in fuse_set_initialized() */ smp_rmb(); - req = fuse_request_alloc(0); + req = fuse_request_alloc(0, GFP_KERNEL); if (!req) req = get_reserved_req(fc, file); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c584ad7478b3..ae8c8016bb8e 100644 --- 
a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -50,7 +50,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) return NULL; ff->fc = fc; - ff->reserved_req = fuse_request_alloc_account(0); + ff->reserved_req = fuse_request_alloc(0, GFP_KERNEL_ACCOUNT); if (unlikely(!ff->reserved_req)) { kfree(ff); return NULL; @@ -1703,7 +1703,7 @@ static int fuse_writepage_locked(struct page *page) set_page_writeback(page); - req = fuse_request_alloc_nofs(1); + req = fuse_request_alloc(1, GFP_NOFS); if (!req) goto err; @@ -1923,7 +1923,7 @@ static int fuse_writepages_fill(struct page *page, struct fuse_inode *fi = get_fuse_inode(inode); err = -ENOMEM; - req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES); + req = fuse_request_alloc(FUSE_REQ_INLINE_PAGES, GFP_NOFS); if (!req) { __free_page(tmp_page); goto out_unlock; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 08161b2d9b08..8080a51096e9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -902,11 +902,7 @@ void __exit fuse_ctl_cleanup(void); /** * Allocate a request */ -struct fuse_req *fuse_request_alloc(unsigned npages); - -struct fuse_req *fuse_request_alloc_account(unsigned int npages); - -struct fuse_req *fuse_request_alloc_nofs(unsigned npages); +struct fuse_req *fuse_request_alloc(unsigned int npages, gfp_t flags); bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, gfp_t flags); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index eab44ddc68b9..ad92e93eaddd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1178,13 +1178,13 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) /* Root dentry doesn't have .d_revalidate */ sb->s_d_op = _dentry_operations; - init_req = fuse_request_alloc(0); + init_req = fuse_request_alloc(0, GFP_KERNEL); if (!init_req) goto err_put_root; __set_bit(FR_BACKGROUND, _req->flags); if (is_bdev) { - fc->destroy_req = fuse_request_alloc(0); + fc->destroy_req = fuse_request_alloc(0, GFP_KERNEL); if (!fc->destroy_req) goto 
err_free_init_req; } -- 2.23.0.187.g17f5b7556c-goog
[PATCH 2/3] fuse: kmemcg account fs data
account per-file, dentry, and inode data accounts the per-file reserved request, adding new fuse_request_alloc_account() blockdev/superblock and temporary per-request data was left alone, as this usually isn't accounted Signed-off-by: Khazhismel Kumykov --- fs/fuse/dev.c| 6 ++ fs/fuse/dir.c| 3 ++- fs/fuse/file.c | 4 ++-- fs/fuse/fuse_i.h | 2 ++ fs/fuse/inode.c | 3 ++- 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ea8237513dfa..a0d166a6596f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -97,6 +97,12 @@ struct fuse_req *fuse_request_alloc(unsigned npages) } EXPORT_SYMBOL_GPL(fuse_request_alloc); +struct fuse_req *fuse_request_alloc_account(unsigned int npages) +{ + return __fuse_request_alloc(npages, GFP_KERNEL_ACCOUNT); +} +EXPORT_SYMBOL_GPL(fuse_request_alloc_account); + struct fuse_req *fuse_request_alloc_nofs(unsigned npages) { return __fuse_request_alloc(npages, GFP_NOFS); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index f9c59a296568..2013e1222de7 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -258,7 +258,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) #if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { - dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); + dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); return dentry->d_fsdata ? 
0 : -ENOMEM; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5ae2828beb00..c584ad7478b3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -45,12 +45,12 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; - ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT); if (unlikely(!ff)) return NULL; ff->fc = fc; - ff->reserved_req = fuse_request_alloc(0); + ff->reserved_req = fuse_request_alloc_account(0); if (unlikely(!ff->reserved_req)) { kfree(ff); return NULL; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 24dbca75..08161b2d9b08 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -904,6 +904,8 @@ void __exit fuse_ctl_cleanup(void); */ struct fuse_req *fuse_request_alloc(unsigned npages); +struct fuse_req *fuse_request_alloc_account(unsigned int npages); + struct fuse_req *fuse_request_alloc_nofs(unsigned npages); bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4bb885b0f032..eab44ddc68b9 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -76,7 +76,8 @@ struct fuse_mount_data { struct fuse_forget_link *fuse_alloc_forget(void) { - return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); + return kzalloc(sizeof(struct fuse_forget_link), + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); } static struct inode *fuse_alloc_inode(struct super_block *sb) -- 2.23.0.187.g17f5b7556c-goog
Re: [PATCH 3/3] fuse: pass gfp flags to fuse_request_alloc
On Wed, Aug 21, 2019 at 5:18 PM Shakeel Butt wrote: > > On Wed, Aug 21, 2019 at 5:10 PM Khazhismel Kumykov wrote: > > > > Instead of having a helper per flag > > > > Signed-off-by: Khazhismel Kumykov > > I think it would be better to re-order the patch 2 and 3 of this > series. There will be less code churn. That makes sense to me, I'll follow up with a v2 smime.p7s Description: S/MIME Cryptographic Signature
[PATCH v2 1/3] fuse: on 64-bit store time in d_fsdata directly
Implements the optimization noted in f75fdf22b0a8 ("fuse: don't use ->d_time"), as the additional memory can be significant. (In particular, on SLAB configurations this 8-byte alloc becomes 32 bytes). Per-dentry, this can consume significant memory. Reviewed-by: Shakeel Butt Signed-off-by: Khazhismel Kumykov --- fs/fuse/dir.c | 19 +++ 1 file changed, 19 insertions(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index dd0f64f7bc06..f9c59a296568 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -24,6 +24,18 @@ static void fuse_advise_use_readdirplus(struct inode *dir) set_bit(FUSE_I_ADVISE_RDPLUS, >state); } +#if BITS_PER_LONG >= 64 +static inline void fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_fsdata = (void *) time; +} + +static inline u64 fuse_dentry_time(struct dentry *entry) +{ + return (u64)entry->d_fsdata; +} + +#else union fuse_dentry { u64 time; struct rcu_head rcu; @@ -38,6 +50,7 @@ static inline u64 fuse_dentry_time(struct dentry *entry) { return ((union fuse_dentry *) entry->d_fsdata)->time; } +#endif /* * FUSE caches dentries and attributes with separate timeout. The @@ -242,6 +255,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) goto out; } +#if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); @@ -254,16 +268,21 @@ static void fuse_dentry_release(struct dentry *dentry) kfree_rcu(fd, rcu); } +#endif const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, +#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, +#endif }; const struct dentry_operations fuse_root_dentry_operations = { +#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, +#endif }; int fuse_valid_type(int m) -- 2.23.0.187.g17f5b7556c-goog
[PATCH v2 3/3] fuse: kmemcg account fs data
account per-file, dentry, and inode data accounts the per-file reserved request, adding new fuse_request_alloc_account() blockdev/superblock and temporary per-request data was left alone, as this usually isn't accounted Signed-off-by: Khazhismel Kumykov --- fs/fuse/dir.c | 3 ++- fs/fuse/file.c | 4 ++-- fs/fuse/inode.c | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index f9c59a296568..2013e1222de7 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -258,7 +258,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) #if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { - dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); + dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); return dentry->d_fsdata ? 0 : -ENOMEM; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 572d8347ebcb..ae8c8016bb8e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -45,12 +45,12 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; - ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT); if (unlikely(!ff)) return NULL; ff->fc = fc; - ff->reserved_req = fuse_request_alloc(0, GFP_KERNEL); + ff->reserved_req = fuse_request_alloc(0, GFP_KERNEL_ACCOUNT); if (unlikely(!ff->reserved_req)) { kfree(ff); return NULL; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 5afd1872b8b1..ad92e93eaddd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -76,7 +76,8 @@ struct fuse_mount_data { struct fuse_forget_link *fuse_alloc_forget(void) { - return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); + return kzalloc(sizeof(struct fuse_forget_link), + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); } static struct inode *fuse_alloc_inode(struct super_block *sb) -- 2.23.0.187.g17f5b7556c-goog
[PATCH v2 2/3] fuse: pass gfp flags to fuse_request_alloc
Instead of having a helper per flag Signed-off-by: Khazhismel Kumykov --- fs/fuse/dev.c| 16 +++- fs/fuse/file.c | 6 +++--- fs/fuse/fuse_i.h | 4 +--- fs/fuse/inode.c | 4 ++-- 4 files changed, 9 insertions(+), 21 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ea8237513dfa..c957620ce7ba 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -66,7 +66,7 @@ static struct page **fuse_req_pages_alloc(unsigned int npages, gfp_t flags, return pages; } -static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) +struct fuse_req *fuse_request_alloc(unsigned int npages, gfp_t flags) { struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); if (req) { @@ -90,18 +90,8 @@ static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) } return req; } - -struct fuse_req *fuse_request_alloc(unsigned npages) -{ - return __fuse_request_alloc(npages, GFP_KERNEL); -} EXPORT_SYMBOL_GPL(fuse_request_alloc); -struct fuse_req *fuse_request_alloc_nofs(unsigned npages) -{ - return __fuse_request_alloc(npages, GFP_NOFS); -} - static void fuse_req_pages_free(struct fuse_req *req) { if (req->pages != req->inline_pages) @@ -201,7 +191,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, if (fc->conn_error) goto out; - req = fuse_request_alloc(npages); + req = fuse_request_alloc(npages, GFP_KERNEL); err = -ENOMEM; if (!req) { if (for_background) @@ -310,7 +300,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, wait_event(fc->blocked_waitq, fc->initialized); /* Matches smp_wmb() in fuse_set_initialized() */ smp_rmb(); - req = fuse_request_alloc(0); + req = fuse_request_alloc(0, GFP_KERNEL); if (!req) req = get_reserved_req(fc, file); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5ae2828beb00..572d8347ebcb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -50,7 +50,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) return NULL; ff->fc = fc; - ff->reserved_req = fuse_request_alloc(0); + 
ff->reserved_req = fuse_request_alloc(0, GFP_KERNEL); if (unlikely(!ff->reserved_req)) { kfree(ff); return NULL; @@ -1703,7 +1703,7 @@ static int fuse_writepage_locked(struct page *page) set_page_writeback(page); - req = fuse_request_alloc_nofs(1); + req = fuse_request_alloc(1, GFP_NOFS); if (!req) goto err; @@ -1923,7 +1923,7 @@ static int fuse_writepages_fill(struct page *page, struct fuse_inode *fi = get_fuse_inode(inode); err = -ENOMEM; - req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES); + req = fuse_request_alloc(FUSE_REQ_INLINE_PAGES, GFP_NOFS); if (!req) { __free_page(tmp_page); goto out_unlock; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 24dbca75..8080a51096e9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -902,9 +902,7 @@ void __exit fuse_ctl_cleanup(void); /** * Allocate a request */ -struct fuse_req *fuse_request_alloc(unsigned npages); - -struct fuse_req *fuse_request_alloc_nofs(unsigned npages); +struct fuse_req *fuse_request_alloc(unsigned int npages, gfp_t flags); bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, gfp_t flags); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4bb885b0f032..5afd1872b8b1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1177,13 +1177,13 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) /* Root dentry doesn't have .d_revalidate */ sb->s_d_op = _dentry_operations; - init_req = fuse_request_alloc(0); + init_req = fuse_request_alloc(0, GFP_KERNEL); if (!init_req) goto err_put_root; __set_bit(FR_BACKGROUND, _req->flags); if (is_bdev) { - fc->destroy_req = fuse_request_alloc(0); + fc->destroy_req = fuse_request_alloc(0, GFP_KERNEL); if (!fc->destroy_req) goto err_free_init_req; } -- 2.23.0.187.g17f5b7556c-goog
Re: [PATCH 1/1] epoll: call final ep_events_available() check under the lock
On Tue, May 5, 2020 at 1:04 PM Andrew Morton wrote: > > On Tue, 05 May 2020 10:42:05 +0200 Roman Penyaev wrote: > > > May I ask you to remove "epoll: ensure ep_poll() doesn't miss wakeup > > events" from your -mm queue? Jason lately found out that the patch > > does not fully solve the problem and this one patch is a second > > attempt to do things correctly in a different way (namely to do > > the final check under the lock). Previous changes are not needed. > > Where do we stand with Khazhismel's "eventpoll: fix missing wakeup for > ovflist in ep_poll_callback"? > > http://lkml.kernel.org/r/20200424190039.192373-1-kha...@google.com > My understanding is - we need the ep_poll_callback fix on a logical level (ovfllist was never triggering wakeup), and the two follow-ups close races - in both how we add/remove from the wait queue, and how we observe the ready list, which are needed if we only wake when we add events, where before we were also waking when we were splicing ovflist events when reading the ready list. As well, the first two together are needed for epoll60 to pass in my testing. Khazhy smime.p7s Description: S/MIME Cryptographic Signature
Re: WARNING in notify_change
I was able to reproduce this by setting security.capability xattr on a blockdev file, then writing to it - when writing to the blockdev we never lock the inode, so when we clear the capability we hit this lockdep warning. Is the issue here that we can set this xattr in the first place so we have to clear it at all? Or should we really be locking the inode for blockdevs after all? I'm not too familiar, but my gut says the former. This reproducer is able to immediately crash a machine running linux-next-20190415: #include <errno.h> #include <fcntl.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> #include <sys/xattr.h> #include <unistd.h> char *disk = "/dev/loop0"; int main(void) { int fd = open(disk, 0); if (fd < 0) printf("open: %d\n", errno); system("dd if=/dev/zero of=a_file count=51200"); system("losetup /dev/loop0 a_file"); uint32_t value[5] = { 0x200, 7, 0x20d0, 6, 4 }; int res = fsetxattr(fd, "security.capability", &value, sizeof(value), XATTR_CREATE); if (res < 0) printf ("xattr: %d\n", errno); int fd2 = open(disk, O_RDWR); write(fd2, "hello", 5); return 0; } smime.p7s Description: S/MIME Cryptographic Signature
[PATCH] ext4: add cond_resched() to ext4_mb_init_backend()
On non-preempt kernels, for filesystems with a large number of groups, we may take a long time (>50 ticks) initializing all the groups. Signed-off-by: Khazhismel Kumykov --- fs/ext4/mballoc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8ef5f12bbee2..c89f497ccf50 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2490,6 +2490,7 @@ static int ext4_mb_init_backend(struct super_block *sb) sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { + cond_resched(); desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); -- 2.21.0.392.gf8f6787159e-goog
Re: [PATCH] ext4: add cond_resched() to ext4_mb_init_backend()
On Thu, Apr 18, 2019 at 4:53 AM Jan Kara wrote: > > On Mon 15-04-19 19:59:34, Khazhismel Kumykov wrote: > > on non-preempt kernels for filesystems with large number of groups we > > may take a long time (>50 ticks) initializing all the groups. > > > > Signed-off-by: Khazhismel Kumykov > > Thanks for the patch. I'm not opposed to this but I'm just wondering: Has > this caused any real issues to you? Or how have you come across this > particular loop? Because I'd think we have more similar loops in ext4 > codebase... > > Honza > We have some instrumentation to warn for these longer periods without scheduling, which admittedly does trigger relatively frequently from various places. This particular loop just so happens to be during system startup where it caught my eye. smime.p7s Description: S/MIME Cryptographic Signature
[PATCH v2] ext4: cond_resched in work-heavy group loops
These functions may take a long time looping over many groups, which may cause issues for non-preempt kernels. ext4_mb_init_backend() ext4_setup_system_zone() ext4_mb_release() Signed-off-by: Khazhismel Kumykov --- v2: - a few other places that in testing showed to be slow fs/ext4/block_validity.c | 1 + fs/ext4/mballoc.c| 2 ++ 2 files changed, 3 insertions(+) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 913061c0de1b..16134469ea3c 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -155,6 +155,7 @@ int ext4_setup_system_zone(struct super_block *sb) return 0; for (i=0; i < ngroups; i++) { + cond_resched(); if (ext4_bg_has_super(sb, i) && ((i < 5) || ((i % flex_size) == 0))) add_system_zone(sbi, ext4_group_first_block_no(sb, i), diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8ef5f12bbee2..99ba720dbb7a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2490,6 +2490,7 @@ static int ext4_mb_init_backend(struct super_block *sb) sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { + cond_resched(); desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); @@ -2705,6 +2706,7 @@ int ext4_mb_release(struct super_block *sb) if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { + cond_resched(); grinfo = ext4_get_group_info(sb, i); #ifdef DOUBLE_CHECK kfree(grinfo->bb_bitmap); -- 2.21.0.593.g511ec345e18-goog
[PATCH] fs/fat: add cond_resched to fat_count_free_clusters
On non-preempt kernels this loop can take a long time (more than 50 ticks) processing through entries. Signed-off-by: Khazhismel Kumykov --- fs/fat/fatent.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index defc2168de91..f58c0cacc531 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -682,6 +682,7 @@ int fat_count_free_clusters(struct super_block *sb) if (ops->ent_get() == FAT_ENT_FREE) free++; } while (fat_ent_next(sbi, )); + cond_resched(); } sbi->free_clusters = free; sbi->free_clus_valid = 1; -- 2.19.0.605.g01d371f741-goog
Re: Hang/soft lockup in d_invalidate with simultaneous calls
On Mon, May 15, 2017 at 5:05 PM, Khazhismel Kumykov wrote: > Hi, > > I'm seeing behavior in d_invalidate, if multiple threads call d_invalidate on > the same tree at the same, behavior time blows up and all the calls hang with > large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k > entries in d_subdir, and 5 or so threads calling d_invalidate was able to hang > my test VMs) > > This seems to be due to thrashing on the dentry locks in multiple threads > tightly looping calling d_walk waiting for a shrink_dentry_list call (also > thrashing the locks) to complete. (d_invalidate loops calling d_walk so long > as > any dentry in the tree is in a dcache shrink list). > > There was a similar report recently "soft lockup in d_invalidate()" that > proposed in the d_invalidate d_walk to ignore dentries marked as in a shrink > list already, which does prevent this hang/lockup in this case as well, > although > I'm not sure it doesn't violate the contract for d_invalidate. (Although the > entries in a shrink list should be dropped eventually, not necessarily by the > time d_invalidate returns). > > If someone more familiar with the dcache could provide input I would > appreciate. > > A reliable repro on mainline is: > - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. fuse passthrough > - create a directory and populate with ~100k files with content to > populate dcache > - create some background processes opening/reading files in this folder (5 > while true; cat $file was enough to get an indefinite hang for me) > - cause the directory to need to be invalidated (e.g., make_bad_inode on the > directory) > > This results in the background processes all entering d_invalidate and > hanging, > while with just one process in d_invalidate (e.g., stat'ing a file in the dir) > things go pretty quickly as expected. 
> > > (The proposed patch from other thread) > > diff --git a/fs/dcache.c b/fs/dcache.c > index 7b8feb6..3a3b0f37 100644 > --- a/fs/dcache.c > +++ b/fs/dcache.c > @@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data, > struct dentry *dentry) > goto out; > > if (dentry->d_flags & DCACHE_SHRINK_LIST) { > - data->found++; > + goto out; > } else { > if (dentry->d_flags & DCACHE_LRU_LIST) > d_lru_del(dentry); > > > khazhy Would this change actually violate any guarantees? select_collect looks like it used to ignore unused dentries that were part of a shrink list before fe91522a7ba82ca1a51b07e19954b3825e4aaa22 (found would not be incremented). Once the dentry is on a shrink list would it be unreachable anyways, so returning from d_invalidate before all the shrink lists finish processing would be ok? khazhy smime.p7s Description: S/MIME Cryptographic Signature
Re: Hang/soft lockup in d_invalidate with simultaneous calls
On Mon, May 22, 2017 at 11:18 AM, Khazhismel Kumykov wrote: > On Wed, May 17, 2017 at 2:58 PM Khazhismel Kumykov wrote: >> >> On Mon, May 15, 2017 at 5:05 PM, Khazhismel Kumykov >> wrote: >> > Hi, >> > >> > I'm seeing behavior in d_invalidate, if multiple threads call d_invalidate >> > on >> > the same tree at the same, behavior time blows up and all the calls hang >> > with >> > large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k >> > entries in d_subdir, and 5 or so threads calling d_invalidate was able to >> > hang >> > my test VMs) >> > >> > This seems to be due to thrashing on the dentry locks in multiple threads >> > tightly looping calling d_walk waiting for a shrink_dentry_list call (also >> > thrashing the locks) to complete. (d_invalidate loops calling d_walk so >> > long as >> > any dentry in the tree is in a dcache shrink list). >> > >> > There was a similar report recently "soft lockup in d_invalidate()" that >> > proposed in the d_invalidate d_walk to ignore dentries marked as in a >> > shrink >> > list already, which does prevent this hang/lockup in this case as well, >> > although >> > I'm not sure it doesn't violate the contract for d_invalidate. (Although >> > the >> > entries in a shrink list should be dropped eventually, not necessarily by >> > the >> > time d_invalidate returns). >> > >> > If someone more familiar with the dcache could provide input I would >> > appreciate. >> > >> > A reliable repro on mainline is: >> > - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. 
fuse passthrough >> > - create a directory and populate with ~100k files with content to >> > populate dcache >> > - create some background processes opening/reading files in this folder (5 >> > while true; cat $file was enough to get an indefinite hang for me) >> > - cause the directory to need to be invalidated (e.g., make_bad_inode on >> > the >> > directory) >> > >> > This results in the background processes all entering d_invalidate and >> > hanging, >> > while with just one process in d_invalidate (e.g., stat'ing a file in the >> > dir) >> > things go pretty quickly as expected. >> > >> > >> > (The proposed patch from other thread) >> > >> > diff --git a/fs/dcache.c b/fs/dcache.c >> > index 7b8feb6..3a3b0f37 100644 >> > --- a/fs/dcache.c >> > +++ b/fs/dcache.c >> > @@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data, >> > struct dentry *dentry) >> > goto out; >> > >> > if (dentry->d_flags & DCACHE_SHRINK_LIST) { >> > - data->found++; >> > + goto out; >> > } else { >> > if (dentry->d_flags & DCACHE_LRU_LIST) >> > d_lru_del(dentry); >> > >> > >> > khazhy >> >> Would this change actually violate any guarantees? select_collect >> looks like it used to ignore unused dentries that were part of a >> shrink list before fe91522a7ba82ca1a51b07e19954b3825e4aaa22 (found >> would not be incremented). Once the dentry is on a shrink list would >> it be unreachable anyways, so returning from d_invalidate before all >> the shrink lists finish processing would be ok? >> >> khazhy > > Pinging in case this got missed, would appreciate thoughts from someone more > familiar with the dcache. > > My previous email wasn't completely correct, while before fe91522a7ba8 > ("don't remove from shrink list in select_collect()") d_invalidate > would not busy > wait for other workers calling shrink_list to compete, it would return -EBUSY, > rather than success, so a change to return success without waiting would not > be equivalent behavior before. 
Currently, we will loop calling d_walk > repeatedly > causing the extreme slowdown observed. > > I still want to understand better, in d_invalidate if we can return > without pruning > in-use dentries safely, would returning before unused dentries are > pruned, so long > as we know they will be pruned by another task (are on a shrink list), > be safe as well? > If not, would it make more sense to have have a mutual exclusion on calling > d_invalidate on the same dentries simultaneously? > > khazhy ping Again to summarize: With a directory with larg
Re: Hang/soft lockup in d_invalidate with simultaneous calls
On Wed, May 17, 2017 at 2:58 PM Khazhismel Kumykov wrote: > > On Mon, May 15, 2017 at 5:05 PM, Khazhismel Kumykov wrote: > > Hi, > > > > I'm seeing behavior in d_invalidate, if multiple threads call d_invalidate > > on > > the same tree at the same, behavior time blows up and all the calls hang > > with > > large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k > > entries in d_subdir, and 5 or so threads calling d_invalidate was able to > > hang > > my test VMs) > > > > This seems to be due to thrashing on the dentry locks in multiple threads > > tightly looping calling d_walk waiting for a shrink_dentry_list call (also > > thrashing the locks) to complete. (d_invalidate loops calling d_walk so > > long as > > any dentry in the tree is in a dcache shrink list). > > > > There was a similar report recently "soft lockup in d_invalidate()" that > > proposed in the d_invalidate d_walk to ignore dentries marked as in a shrink > > list already, which does prevent this hang/lockup in this case as well, > > although > > I'm not sure it doesn't violate the contract for d_invalidate. (Although the > > entries in a shrink list should be dropped eventually, not necessarily by > > the > > time d_invalidate returns). > > > > If someone more familiar with the dcache could provide input I would > > appreciate. > > > > A reliable repro on mainline is: > > - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. 
fuse passthrough > > - create a directory and populate with ~100k files with content to > > populate dcache > > - create some background processes opening/reading files in this folder (5 > > while true; cat $file was enough to get an indefinite hang for me) > > - cause the directory to need to be invalidated (e.g., make_bad_inode on > > the > > directory) > > > > This results in the background processes all entering d_invalidate and > > hanging, > > while with just one process in d_invalidate (e.g., stat'ing a file in the > > dir) > > things go pretty quickly as expected. > > > > > > (The proposed patch from other thread) > > > > diff --git a/fs/dcache.c b/fs/dcache.c > > index 7b8feb6..3a3b0f37 100644 > > --- a/fs/dcache.c > > +++ b/fs/dcache.c > > @@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data, > > struct dentry *dentry) > > goto out; > > > > if (dentry->d_flags & DCACHE_SHRINK_LIST) { > > - data->found++; > > + goto out; > > } else { > > if (dentry->d_flags & DCACHE_LRU_LIST) > > d_lru_del(dentry); > > > > > > khazhy > > Would this change actually violate any guarantees? select_collect > looks like it used to ignore unused dentries that were part of a > shrink list before fe91522a7ba82ca1a51b07e19954b3825e4aaa22 (found > would not be incremented). Once the dentry is on a shrink list would > it be unreachable anyways, so returning from d_invalidate before all > the shrink lists finish processing would be ok? > > khazhy Pinging in case this got missed, would appreciate thoughts from someone more familiar with the dcache. My previous email wasn't completely correct, while before fe91522a7ba8 ("don't remove from shrink list in select_collect()") d_invalidate would not busy wait for other workers calling shrink_list to compete, it would return -EBUSY, rather than success, so a change to return success without waiting would not be equivalent behavior before. 
Currently, we will loop calling d_walk repeatedly causing the extreme slowdown observed. I still want to understand better, in d_invalidate if we can return without pruning in-use dentries safely, would returning before unused dentries are pruned, so long as we know they will be pruned by another task (are on a shrink list), be safe as well? If not, would it make more sense to have have a mutual exclusion on calling d_invalidate on the same dentries simultaneously? khazhy
Hang/soft lockup in d_invalidate with simultaneous calls
Hi, I'm seeing behavior in d_invalidate, if multiple threads call d_invalidate on the same tree at the same time, behavior blows up and all the calls hang with large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k entries in d_subdir, and 5 or so threads calling d_invalidate was able to hang my test VMs) This seems to be due to thrashing on the dentry locks in multiple threads tightly looping calling d_walk waiting for a shrink_dentry_list call (also thrashing the locks) to complete. (d_invalidate loops calling d_walk so long as any dentry in the tree is in a dcache shrink list). There was a similar report recently "soft lockup in d_invalidate()" that proposed in the d_invalidate d_walk to ignore dentries marked as in a shrink list already, which does prevent this hang/lockup in this case as well, although I'm not sure it doesn't violate the contract for d_invalidate. (Although the entries in a shrink list should be dropped eventually, not necessarily by the time d_invalidate returns). If someone more familiar with the dcache could provide input I would appreciate it. A reliable repro on mainline is: - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. fuse passthrough - create a directory and populate with ~100k files with content to populate dcache - create some background processes opening/reading files in this folder (5 while true; cat $file was enough to get an indefinite hang for me) - cause the directory to need to be invalidated (e.g., make_bad_inode on the directory) This results in the background processes all entering d_invalidate and hanging, while with just one process in d_invalidate (e.g., stat'ing a file in the dir) things go pretty quickly as expected. 
(The proposed patch from other thread) diff --git a/fs/dcache.c b/fs/dcache.c index 7b8feb6..3a3b0f37 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) goto out; if (dentry->d_flags & DCACHE_SHRINK_LIST) { - data->found++; + goto out; } else { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); khazhy smime.p7s Description: S/MIME Cryptographic Signature
[PATCH] ext4: Return EIO on read error in ext4_find_entry
Previously, a read error would be ignored and we would eventually return NULL from ext4_find_entry, which signals "no such file or directory". We should be returning EIO. Signed-off-by: Khazhismel Kumykov --- fs/ext4/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 404256caf9cf..6fa17e9f7b6d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1428,11 +1428,11 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - /* read error, skip block & hope for the best */ EXT4_ERROR_INODE(dir, "reading directory lblock %lu", (unsigned long) block); brelse(bh); - goto next; + ret = ERR_PTR(-EIO); + goto cleanup_and_exit; } if (!buffer_verified(bh) && !is_dx_internal_node(dir, block, -- 2.13.1.611.g7e3b11ae1-goog smime.p7s Description: S/MIME Cryptographic Signature
Re: [PATCH] ext4: Return EIO on read error in ext4_find_entry
On Fri, Jun 23, 2017 at 2:36 PM, Andreas Dilger wrote: > On Jun 23, 2017, at 6:26 AM, Theodore Ts'o wrote: >> >> The problem is that if we continue, successive reads may all take >> seconds or minutes to fail, thus tieing up the process for a long >> time. > > Sorry, I don't understand where the seconds or minutes of delay come from? > Is that because of long SCSI retries in the block layer, or in the disk > itself, or something caused specifically because of this code? For a networked block device we may be retrying for a while before giving up, although this also applies to the initial failed read. > >> By returning EIO right away, we can "fast fail". > > But it seems like you don't necessarily need to fail at all? Something like > the > following would return an error if the entry is not found, but still search > the > rest of the leaf blocks (if any) before giving up: > Giving up early or checking future blocks both work, critical thing here is not returning NULL after seeing a read error. Previously to this the behavior was to continue to check future blocks after a read error, and it seemed OK. Khazhy smime.p7s Description: S/MIME Cryptographic Signature
Re: Hang/soft lockup in d_invalidate with simultaneous calls
On Fri, Jun 2, 2017 at 11:47 PM, Khazhismel Kumykov wrote: > On Fri, Jun 2, 2017 at 11:20 PM, Al Viro wrote: >> The thing is, unlike shrink_dcache_parent() we *can* bugger off as >> soon as we'd found no victims, nothing mounted and dentry itself >> is unhashed. We can't do anything in select_collect() (we would've >> broken shrink_dcache_parent() that way), but we can do unhashing >> in check_and_drop() in "really nothing to do" case and we can return >> from d_invalidate() after that. So how about this: > That does the trick. I'm not entirely familiar with the process here; is the above change committed somewhere, or should I propose a patch? smime.p7s Description: S/MIME Cryptographic Signature
[PATCH] bfq: silence lockdep for bfqd/ioc lock inversion
lockdep warns of circular locking due to inversion between bfq_insert_requests and bfq_exit_icq. If we end up freeing a request when merging, we *may* grab an ioc->lock if that request is the last refcount to that ioc. bfq_bio_merge also potentially could have this ordering. bfq_exit_icq, conversely, grabs bfqd but is always called with ioc->lock held. bfq_exit_icq may either be called from put_io_context_active with ioc refcount raised, ioc_release_fn after the last refcount was already dropped, or ioc_clear_queue, which is only called while queue is quiesced or exiting, so the inverted orderings should never conflict. Fixes: aee69d78dec0 ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler") Signed-off-by: Khazhismel Kumykov --- block/bfq-iosched.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) Noticed this lockdep warning running xfstests (generic/464) on top of a bfq block device. I was also able to tease it out w/ a binary trying to issue requests that would end up merging while rapidly swapping the active scheduler. As far as I could see, the deadlock would not actually occur, so this patch opts to change the lock class for the inverted case. bfqd -> ioc : [ 2995.524557] __lock_acquire+0x18f5/0x2660 [ 2995.524562] lock_acquire+0xb4/0x3a0 [ 2995.524565] _raw_spin_lock_irqsave+0x3f/0x60 [ 2995.524569] put_io_context+0x33/0x90. -> ioc->lock grabbed [ 2995.524573] blk_mq_free_request+0x51/0x140 [ 2995.524577] blk_put_request+0xe/0x10 [ 2995.524580] blk_attempt_req_merge+0x1d/0x30 [ 2995.524585] elv_attempt_insert_merge+0x56/0xa0 [ 2995.524590] blk_mq_sched_try_insert_merge+0x4b/0x60 [ 2995.524595] bfq_insert_requests+0x9e/0x18c0. 
-> bfqd->lock grabbed [ 2995.524598] blk_mq_sched_insert_requests+0xd6/0x2b0 [ 2995.524602] blk_mq_flush_plug_list+0x154/0x280 [ 2995.524606] blk_finish_plug+0x40/0x60 [ 2995.524609] ext4_writepages+0x696/0x1320 [ 2995.524614] do_writepages+0x1c/0x80 [ 2995.524621] __filemap_fdatawrite_range+0xd7/0x120 [ 2995.524625] sync_file_range+0xac/0xf0 [ 2995.524642] __x64_sys_sync_file_range+0x44/0x70 [ 2995.524646] do_syscall_64+0x31/0x40 [ 2995.524649] entry_SYSCALL_64_after_hwframe+0x44/0xae ioc -> bfqd [ 2995.524490] _raw_spin_lock_irqsave+0x3f/0x60 [ 2995.524498] bfq_exit_icq+0xa3/0xe0 -> bfqd->lock grabbed [ 2995.524512] put_io_context_active+0x78/0xb0 -> ioc->lock grabbed [ 2995.524516] exit_io_context+0x48/0x50 [ 2995.524519] do_exit+0x7e9/0xdd0 [ 2995.524526] do_group_exit+0x54/0xc0 [ 2995.524530] __x64_sys_exit_group+0x18/0x20 [ 2995.524534] do_syscall_64+0x31/0x40 [ 2995.524537] entry_SYSCALL_64_after_hwframe+0x44/0xae Another trace where we grab ioc -> bfqd through bfq_exit_icq is when changing elevator -> #1 (&(>lock)->rlock){-.-.}: [ 646.890820] lock_acquire+0x9b/0x140 [ 646.894868] _raw_spin_lock_irqsave+0x3b/0x50 [ 646.899707] bfq_exit_icq_bfqq+0x47/0x1f0 [ 646.904196] bfq_exit_icq+0x21/0x30 [ 646.908160] ioc_destroy_icq+0xf3/0x130 [ 646.912466] ioc_clear_queue+0xb8/0x140 [ 646.916771] elevator_switch_mq+0xa4/0x3c0 [ 646.921333] elevator_switch+0x5f/0x340 diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 95586137194e..cb50ac0ffe80 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5027,7 +5027,14 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) if (bfqq && bfqd) { unsigned long flags; - spin_lock_irqsave(>lock, flags); + /* bfq_exit_icq is usually called with ioc->lock held, which is +* inverse order from elsewhere, which may grab ioc->lock +* under bfqd->lock if we merge requests and drop the last ioc +* refcount. 
Since exit_icq is either called with a refcount, +* or with queue quiesced, use a different lock class to +* silence lockdep +*/ + spin_lock_irqsave_nested(>lock, flags, 1); bfqq->bic = NULL; bfq_exit_bfqq(bfqd, bfqq); bic_set_bfqq(bic, NULL, is_sync); -- 2.31.0.rc2.261.g7f71774620-goog