[Patch v2 1/2] libiscsi: Fix use-after-free race during iscsi_session_teardown

2017-07-13 Thread Khazhismel Kumykov
Session attributes exposed through sysfs were freed before the device
was destroyed, resulting in a potential use-after-free. Free these
attributes after removing the device.

Signed-off-by: Khazhismel Kumykov <kha...@google.com>
---
 drivers/scsi/libiscsi.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 42381adf0769..8696a51a5a0c 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -2851,9 +2851,6 @@ EXPORT_SYMBOL_GPL(iscsi_session_setup);
 /**
  * iscsi_session_teardown - destroy session, host, and cls_session
  * @cls_session: iscsi session
- *
- * The driver must have called iscsi_remove_session before
- * calling this.
  */
 void iscsi_session_teardown(struct iscsi_cls_session *cls_session)
 {
@@ -2863,6 +2860,8 @@ void iscsi_session_teardown(struct iscsi_cls_session 
*cls_session)
 
iscsi_pool_free(&session->cmdpool);
 
+   iscsi_remove_session(cls_session);
+
kfree(session->password);
kfree(session->password_in);
kfree(session->username);
@@ -2877,7 +2876,8 @@ void iscsi_session_teardown(struct iscsi_cls_session 
*cls_session)
kfree(session->portal_type);
kfree(session->discovery_parent_type);
 
-   iscsi_destroy_session(cls_session);
+   iscsi_free_session(cls_session);
+
iscsi_host_dec_session_cnt(shost);
module_put(owner);
 }
-- 
2.13.2.932.g7449e964c-goog





[Patch v2 2/2] libiscsi: Remove iscsi_destroy_session

2017-07-13 Thread Khazhismel Kumykov
iscsi_session_teardown was the only user of this function, which is
currently just shorthand for iscsi_remove_session + iscsi_free_session.

Signed-off-by: Khazhismel Kumykov <kha...@google.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 16 ----------------
 include/scsi/scsi_transport_iscsi.h |  1 -
 2 files changed, 17 deletions(-)

diff --git a/drivers/scsi/scsi_transport_iscsi.c 
b/drivers/scsi/scsi_transport_iscsi.c
index a424eaeafeb0..924ac408d8a9 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2210,22 +2210,6 @@ void iscsi_free_session(struct iscsi_cls_session 
*session)
 }
 EXPORT_SYMBOL_GPL(iscsi_free_session);
 
-/**
- * iscsi_destroy_session - destroy iscsi session
- * @session: iscsi_session
- *
- * Can be called by a LLD or iscsi_transport. There must not be
- * any running connections.
- */
-int iscsi_destroy_session(struct iscsi_cls_session *session)
-{
-   iscsi_remove_session(session);
-   ISCSI_DBG_TRANS_SESSION(session, "Completing session destruction\n");
-   iscsi_free_session(session);
-   return 0;
-}
-EXPORT_SYMBOL_GPL(iscsi_destroy_session);
-
 /**
  * iscsi_create_conn - create iscsi class connection
  * @session: iscsi cls session
diff --git a/include/scsi/scsi_transport_iscsi.h 
b/include/scsi/scsi_transport_iscsi.h
index 6183d20a01fb..b266d2a3bcb1 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -434,7 +434,6 @@ extern struct iscsi_cls_session 
*iscsi_create_session(struct Scsi_Host *shost,
unsigned int target_id);
 extern void iscsi_remove_session(struct iscsi_cls_session *session);
 extern void iscsi_free_session(struct iscsi_cls_session *session);
-extern int iscsi_destroy_session(struct iscsi_cls_session *session);
 extern struct iscsi_cls_conn *iscsi_create_conn(struct iscsi_cls_session *sess,
int dd_size, uint32_t cid);
 extern int iscsi_destroy_conn(struct iscsi_cls_conn *conn);
-- 
2.13.2.932.g7449e964c-goog





[PATCH] libiscsi: Remove iscsi_destroy_session

2017-07-12 Thread Khazhismel Kumykov
iscsi_session_teardown was the only user of this function, which is
currently just shorthand for iscsi_remove_session + iscsi_free_session.

Signed-off-by: Khazhismel Kumykov <kha...@google.com>
---
 with "libiscsi: Fix use after free race during iscsi_session_teardown" 
removing the
 last user.

 drivers/scsi/scsi_transport_iscsi.c | 16 ----------------
 include/scsi/scsi_transport_iscsi.h |  1 -
 2 files changed, 17 deletions(-)

diff --git a/drivers/scsi/scsi_transport_iscsi.c 
b/drivers/scsi/scsi_transport_iscsi.c
index a424eaeafeb0..924ac408d8a9 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2210,22 +2210,6 @@ void iscsi_free_session(struct iscsi_cls_session 
*session)
 }
 EXPORT_SYMBOL_GPL(iscsi_free_session);
 
-/**
- * iscsi_destroy_session - destroy iscsi session
- * @session: iscsi_session
- *
- * Can be called by a LLD or iscsi_transport. There must not be
- * any running connections.
- */
-int iscsi_destroy_session(struct iscsi_cls_session *session)
-{
-   iscsi_remove_session(session);
-   ISCSI_DBG_TRANS_SESSION(session, "Completing session destruction\n");
-   iscsi_free_session(session);
-   return 0;
-}
-EXPORT_SYMBOL_GPL(iscsi_destroy_session);
-
 /**
  * iscsi_create_conn - create iscsi class connection
  * @session: iscsi cls session
diff --git a/include/scsi/scsi_transport_iscsi.h 
b/include/scsi/scsi_transport_iscsi.h
index 6183d20a01fb..b266d2a3bcb1 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -434,7 +434,6 @@ extern struct iscsi_cls_session 
*iscsi_create_session(struct Scsi_Host *shost,
unsigned int target_id);
 extern void iscsi_remove_session(struct iscsi_cls_session *session);
 extern void iscsi_free_session(struct iscsi_cls_session *session);
-extern int iscsi_destroy_session(struct iscsi_cls_session *session);
 extern struct iscsi_cls_conn *iscsi_create_conn(struct iscsi_cls_session *sess,
int dd_size, uint32_t cid);
 extern int iscsi_destroy_conn(struct iscsi_cls_conn *conn);
-- 
2.13.2.932.g7449e964c-goog





[PATCH] libiscsi: Fix use-after-free race during iscsi_session_teardown

2017-07-12 Thread Khazhismel Kumykov
Session attributes exposed through sysfs were freed before the device
was destroyed, resulting in a potential use-after-free. Free these
attributes after removing the device.

Signed-off-by: Khazhismel Kumykov <kha...@google.com>
---
 drivers/scsi/libiscsi.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 42381adf0769..f9199bebaec7 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -2851,9 +2851,6 @@ EXPORT_SYMBOL_GPL(iscsi_session_setup);
 /**
  * iscsi_session_teardown - destroy session, host, and cls_session
  * @cls_session: iscsi session
- *
- * The driver must have called iscsi_remove_session before
- * calling this.
  */
 void iscsi_session_teardown(struct iscsi_cls_session *cls_session)
 {
@@ -2863,6 +2860,8 @@ void iscsi_session_teardown(struct iscsi_cls_session 
*cls_session)
 
iscsi_pool_free(&session->cmdpool);
 
+   iscsi_remove_session(session);
+
kfree(session->password);
kfree(session->password_in);
kfree(session->username);
@@ -2877,7 +2876,8 @@ void iscsi_session_teardown(struct iscsi_cls_session 
*cls_session)
kfree(session->portal_type);
kfree(session->discovery_parent_type);
 
-   iscsi_destroy_session(cls_session);
+   iscsi_free_session(cls_session);
+
iscsi_host_dec_session_cnt(shost);
module_put(owner);
 }
-- 
2.13.2.932.g7449e964c-goog





Re: Hang/soft lockup in d_invalidate with simultaneous calls

2017-05-17 Thread Khazhismel Kumykov
On Mon, May 15, 2017 at 5:05 PM, Khazhismel Kumykov <kha...@google.com> wrote:
> Hi,
>
> I'm seeing behavior in d_invalidate: if multiple threads call d_invalidate on
> the same tree at the same time, behavior blows up and all the calls hang with
> large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k
> entries in d_subdir, and 5 or so threads calling d_invalidate was able to hang
> my test VMs)
>
> This seems to be due to thrashing on the dentry locks in multiple threads
> tightly looping calling d_walk waiting for a shrink_dentry_list call (also
> thrashing the locks) to complete. (d_invalidate loops calling d_walk so long 
> as
> any dentry in the tree is in a dcache shrink list).
>
> There was a similar report recently "soft lockup in d_invalidate()" that
> proposed in the d_invalidate d_walk to ignore dentries marked as in a shrink
> list already, which does prevent this hang/lockup in this case as well, 
> although
> I'm not sure it doesn't violate the contract for d_invalidate. (Although the
> entries in a shrink list should be dropped eventually, not necessarily by the
> time d_invalidate returns).
>
> If someone more familiar with the dcache could provide input I would 
> appreciate.
>
> A reliable repro on mainline is:
>  - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. fuse passthrough
>  - create a directory and populate with ~100k files with content to
> populate dcache
>  - create some background processes opening/reading files in this folder (5
>   while true; cat $file was enough to get an indefinite hang for me)
>  - cause the directory to need to be invalidated (e.g., make_bad_inode on the
> directory)
>
> This results in the background processes all entering d_invalidate and 
> hanging,
> while with just one process in d_invalidate (e.g., stat'ing a file in the dir)
> things go pretty quickly as expected.
>
>
> (The proposed patch from other thread)
>
> diff --git a/fs/dcache.c b/fs/dcache.c
> index 7b8feb6..3a3b0f37 100644
> --- a/fs/dcache.c
> +++ b/fs/dcache.c
> @@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data,
> struct dentry *dentry)
>  goto out;
>
>  if (dentry->d_flags & DCACHE_SHRINK_LIST) {
> -   data->found++;
> +   goto out;
>  } else {
>  if (dentry->d_flags & DCACHE_LRU_LIST)
>  d_lru_del(dentry);
>
>
> khazhy

Would this change actually violate any guarantees? select_collect
looks like it used to ignore unused dentries that were already on a
shrink list before fe91522a7ba82ca1a51b07e19954b3825e4aaa22 (found
would not be incremented). Once a dentry is on a shrink list it is
unreachable anyway, so would returning from d_invalidate before all
the shrink lists finish processing be OK?

khazhy




Hang/soft lockup in d_invalidate with simultaneous calls

2017-05-15 Thread Khazhismel Kumykov
Hi,

I'm seeing behavior in d_invalidate: if multiple threads call d_invalidate on
the same tree at the same time, behavior blows up and all the calls hang with
large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k
entries in d_subdir, and 5 or so threads calling d_invalidate was able to hang
my test VMs)

This seems to be due to thrashing on the dentry locks in multiple threads
tightly looping calling d_walk waiting for a shrink_dentry_list call (also
thrashing the locks) to complete. (d_invalidate loops calling d_walk so long as
any dentry in the tree is in a dcache shrink list).
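
To make that retry pattern concrete, here is a toy userspace model (plain C
with pthreads; the names are made up and this is not dcache code): one
"shrinker" thread drains a shared count one element per lock round-trip, the
way shrink_dentry_list drops one dentry at a time, while several "walker"
threads keep re-scanning everything under the same lock until nothing is
left, the way d_invalidate keeps re-running d_walk while anything is still on
a shrink list.

/* toy_dwalk.c: toy model of the retry pattern, NOT kernel code.
 * Build with: cc -O2 -pthread toy_dwalk.c -o toy_dwalk */
#include <pthread.h>
#include <stdio.h>

#define ENTRIES 100000
#define WALKERS 5

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static long remaining = ENTRIES;

static void *shrinker(void *arg)
{
	(void)arg;
	for (;;) {
		pthread_mutex_lock(&lock);
		if (remaining == 0) {
			pthread_mutex_unlock(&lock);
			return NULL;
		}
		remaining--;            /* drop one entry per lock acquisition */
		pthread_mutex_unlock(&lock);
	}
}

static void *walker(void *arg)
{
	volatile long visited = 0;
	long walks = 0, busy;

	(void)arg;
	do {
		pthread_mutex_lock(&lock);
		for (long i = 0; i < remaining; i++)
			visited++;      /* model d_walk visiting every child */
		busy = remaining;       /* "something is still on a shrink list" */
		pthread_mutex_unlock(&lock);
		walks++;
	} while (busy);
	printf("walker finished after %ld full re-walks\n", walks);
	return NULL;
}

int main(void)
{
	pthread_t s, w[WALKERS];

	pthread_create(&s, NULL, shrinker, NULL);
	for (int i = 0; i < WALKERS; i++)
		pthread_create(&w[i], NULL, walker, NULL);
	pthread_join(s, NULL);
	for (int i = 0; i < WALKERS; i++)
		pthread_join(w[i], NULL);
	return 0;
}

With enough entries and walkers, most of the time goes to the walkers
re-scanning under the lock rather than to the shrinker making progress, which
is the shape of the hang described above.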

There was a similar report recently "soft lockup in d_invalidate()" that
proposed in the d_invalidate d_walk to ignore dentries marked as in a shrink
list already, which does prevent this hang/lockup in this case as well, although
I'm not sure it doesn't violate the contract for d_invalidate. (Although the
entries in a shrink list should be dropped eventually, not necessarily by the
time d_invalidate returns).

If someone more familiar with the dcache could provide input I would appreciate.

A reliable repro on mainline is:
 - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. fuse passthrough
 - create a directory and populate with ~100k files with content to
populate dcache
 - create some background processes opening/reading files in this folder (5
  while true; cat $file was enough to get an indefinite hang for me)
 - cause the directory to need to be invalidated (e.g., make_bad_inode on the
directory)

This results in the background processes all entering d_invalidate and hanging,
while with just one process in d_invalidate (e.g., stat'ing a file in the dir)
things go pretty quickly as expected.


(The proposed patch from other thread)

diff --git a/fs/dcache.c b/fs/dcache.c
index 7b8feb6..3a3b0f37 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data,
struct dentry *dentry)
 goto out;

 if (dentry->d_flags & DCACHE_SHRINK_LIST) {
-   data->found++;
+   goto out;
 } else {
 if (dentry->d_flags & DCACHE_LRU_LIST)
 d_lru_del(dentry);


khazhy




Re: [PATCH] ext4: Return EIO on read error in ext4_find_entry

2017-06-23 Thread Khazhismel Kumykov
On Fri, Jun 23, 2017 at 2:36 PM, Andreas Dilger  wrote:
> On Jun 23, 2017, at 6:26 AM, Theodore Ts'o  wrote:
>>
>> The problem is that if we continue, successive reads may all take
>> seconds or minutes to fail, thus tieing up the process for a long
>> time.
>
> Sorry, I don't understand where the seconds or minutes of delay come from?
> Is that because of long SCSI retries in the block layer, or in the disk
> itself, or something caused specifically because of this code?

For a networked block device we may be retrying for a while before
giving up, although this also applies to the initial failed read.

>
>> By returning EIO right away, we can "fast fail".
>
> But it seems like you don't necessarily need to fail at all?  Something like 
> the
> following would return an error if the entry is not found, but still search 
> the
> rest of the leaf blocks (if any) before giving up:
>

Giving up early or checking future blocks both work; the critical thing
here is not returning NULL after seeing a read error.
Prior to this patch the behavior was to continue checking future blocks
after a read error, and that seemed OK.
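
For illustration, here is a standalone sketch of the "remember the error but
keep scanning" approach being discussed. This is a userspace toy with made-up
names and fake blocks, not ext4_find_entry and not the code elided from the
quote above; it only shows the error-precedence logic.

/* toy_find.c: prefer a hit, otherwise surface a read error over "not found" */
#include <errno.h>
#include <stdio.h>
#include <string.h>

#define NBLOCKS 4

/* Fake directory "blocks"; a NULL slot simulates a failed read. */
static const char *blocks[NBLOCKS] = { "alpha", NULL, "beta", "gamma" };

static int read_block(int i, const char **out)
{
	if (!blocks[i])
		return -EIO;            /* simulated read failure */
	*out = blocks[i];
	return 0;
}

/* 0 if found, -ENOENT if definitely absent, -EIO if a block couldn't be read */
static int find_entry(const char *name)
{
	int saved_err = 0;

	for (int i = 0; i < NBLOCKS; i++) {
		const char *entry;
		int ret = read_block(i, &entry);

		if (ret) {
			saved_err = ret;        /* remember, keep scanning */
			continue;
		}
		if (strcmp(entry, name) == 0)
			return 0;               /* found despite earlier error */
	}
	/* Nothing matched: report the read error rather than "no such entry". */
	return saved_err ? saved_err : -ENOENT;
}

int main(void)
{
	printf("beta:  %d\n", find_entry("beta"));   /* 0: found */
	printf("delta: %d\n", find_entry("delta"));  /* -5 (-EIO), not -ENOENT */
	return 0;
}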

Khazhy




[PATCH] ext4: Return EIO on read error in ext4_find_entry

2017-06-22 Thread Khazhismel Kumykov
Previously, a read error would be ignored and we would eventually return
NULL from ext4_find_entry, which signals "no such file or directory". We
should be returning EIO.

Signed-off-by: Khazhismel Kumykov <kha...@google.com>
---
 fs/ext4/namei.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 404256caf9cf..6fa17e9f7b6d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1428,11 +1428,11 @@ static struct buffer_head * ext4_find_entry (struct 
inode *dir,
goto next;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
-   /* read error, skip block & hope for the best */
EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
 (unsigned long) block);
brelse(bh);
-   goto next;
+   ret = ERR_PTR(-EIO);
+   goto cleanup_and_exit;
}
if (!buffer_verified(bh) &&
!is_dx_internal_node(dir, block,
-- 
2.13.1.611.g7e3b11ae1-goog





Re: Hang/soft lockup in d_invalidate with simultaneous calls

2017-05-22 Thread Khazhismel Kumykov
On Wed, May 17, 2017 at 2:58 PM Khazhismel Kumykov <kha...@google.com> wrote:
>
> On Mon, May 15, 2017 at 5:05 PM, Khazhismel Kumykov <kha...@google.com> wrote:
> > Hi,
> >
> > I'm seeing behavior in d_invalidate, if multiple threads call d_invalidate 
> > on
> > the same tree at the same, behavior time blows up and all the calls hang 
> > with
> > large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k
> > entries in d_subdir, and 5 or so threads calling d_invalidate was able to 
> > hang
> > my test VMs)
> >
> > This seems to be due to thrashing on the dentry locks in multiple threads
> > tightly looping calling d_walk waiting for a shrink_dentry_list call (also
> > thrashing the locks) to complete. (d_invalidate loops calling d_walk so 
> > long as
> > any dentry in the tree is in a dcache shrink list).
> >
> > There was a similar report recently "soft lockup in d_invalidate()" that
> > proposed in the d_invalidate d_walk to ignore dentries marked as in a shrink
> > list already, which does prevent this hang/lockup in this case as well, 
> > although
> > I'm not sure it doesn't violate the contract for d_invalidate. (Although the
> > entries in a shrink list should be dropped eventually, not necessarily by 
> > the
> > time d_invalidate returns).
> >
> > If someone more familiar with the dcache could provide input I would 
> > appreciate.
> >
> > A reliable repro on mainline is:
> >  - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. fuse passthrough
> >  - create a directory and populate with ~100k files with content to
> > populate dcache
> >  - create some background processes opening/reading files in this folder (5
> >   while true; cat $file was enough to get an indefinite hang for me)
> >  - cause the directory to need to be invalidated (e.g., make_bad_inode on 
> > the
> > directory)
> >
> > This results in the background processes all entering d_invalidate and 
> > hanging,
> > while with just one process in d_invalidate (e.g., stat'ing a file in the 
> > dir)
> > things go pretty quickly as expected.
> >
> >
> > (The proposed patch from other thread)
> >
> > diff --git a/fs/dcache.c b/fs/dcache.c
> > index 7b8feb6..3a3b0f37 100644
> > --- a/fs/dcache.c
> > +++ b/fs/dcache.c
> > @@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data,
> > struct dentry *dentry)
> >  goto out;
> >
> >  if (dentry->d_flags & DCACHE_SHRINK_LIST) {
> > -   data->found++;
> > +   goto out;
> >  } else {
> >  if (dentry->d_flags & DCACHE_LRU_LIST)
> >  d_lru_del(dentry);
> >
> >
> > khazhy
>
> Would this change actually violate any guarantees? select_collect
> looks like it used to ignore unused dentries that were part of a
> shrink list before fe91522a7ba82ca1a51b07e19954b3825e4aaa22 (found
> would not be incremented). Once the dentry is on a shrink list would
> it be unreachable anyways, so returning from d_invalidate before all
> the shrink lists finish processing would be ok?
>
> khazhy

Pinging in case this got missed; I would appreciate thoughts from someone more
familiar with the dcache.

My previous email wasn't completely correct: while before fe91522a7ba8
("don't remove from shrink list in select_collect()") d_invalidate would not
busy-wait for other workers calling shrink_list to complete, it would return
-EBUSY rather than success, so a change to return success without waiting
would not be equivalent to the old behavior either. Currently, we loop calling
d_walk repeatedly, causing the extreme slowdown observed.

I still want to understand better: in d_invalidate, if we can safely return
without pruning in-use dentries, would it also be safe to return before unused
dentries are pruned, so long as we know they will be pruned by another task
(i.e. they are on a shrink list)?
If not, would it make more sense to have mutual exclusion on calling
d_invalidate on the same dentries simultaneously?

khazhy


Re: Hang/soft lockup in d_invalidate with simultaneous calls

2017-05-25 Thread Khazhismel Kumykov
On Mon, May 22, 2017 at 11:18 AM, Khazhismel Kumykov <kha...@google.com> wrote:
> On Wed, May 17, 2017 at 2:58 PM Khazhismel Kumykov <kha...@google.com> wrote:
>>
>> On Mon, May 15, 2017 at 5:05 PM, Khazhismel Kumykov <kha...@google.com> 
>> wrote:
>> > Hi,
>> >
>> > I'm seeing behavior in d_invalidate, if multiple threads call d_invalidate 
>> > on
>> > the same tree at the same, behavior time blows up and all the calls hang 
>> > with
>> > large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k
>> > entries in d_subdir, and 5 or so threads calling d_invalidate was able to 
>> > hang
>> > my test VMs)
>> >
>> > This seems to be due to thrashing on the dentry locks in multiple threads
>> > tightly looping calling d_walk waiting for a shrink_dentry_list call (also
>> > thrashing the locks) to complete. (d_invalidate loops calling d_walk so 
>> > long as
>> > any dentry in the tree is in a dcache shrink list).
>> >
>> > There was a similar report recently "soft lockup in d_invalidate()" that
>> > proposed in the d_invalidate d_walk to ignore dentries marked as in a 
>> > shrink
>> > list already, which does prevent this hang/lockup in this case as well, 
>> > although
>> > I'm not sure it doesn't violate the contract for d_invalidate. (Although 
>> > the
>> > entries in a shrink list should be dropped eventually, not necessarily by 
>> > the
>> > time d_invalidate returns).
>> >
>> > If someone more familiar with the dcache could provide input I would 
>> > appreciate.
>> >
>> > A reliable repro on mainline is:
>> >  - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. fuse passthrough
>> >  - create a directory and populate with ~100k files with content to
>> > populate dcache
>> >  - create some background processes opening/reading files in this folder (5
>> >   while true; cat $file was enough to get an indefinite hang for me)
>> >  - cause the directory to need to be invalidated (e.g., make_bad_inode on 
>> > the
>> > directory)
>> >
>> > This results in the background processes all entering d_invalidate and 
>> > hanging,
>> > while with just one process in d_invalidate (e.g., stat'ing a file in the 
>> > dir)
>> > things go pretty quickly as expected.
>> >
>> >
>> > (The proposed patch from other thread)
>> >
>> > diff --git a/fs/dcache.c b/fs/dcache.c
>> > index 7b8feb6..3a3b0f37 100644
>> > --- a/fs/dcache.c
>> > +++ b/fs/dcache.c
>> > @@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data,
>> > struct dentry *dentry)
>> >  goto out;
>> >
>> >  if (dentry->d_flags & DCACHE_SHRINK_LIST) {
>> > -   data->found++;
>> > +   goto out;
>> >  } else {
>> >  if (dentry->d_flags & DCACHE_LRU_LIST)
>> >  d_lru_del(dentry);
>> >
>> >
>> > khazhy
>>
>> Would this change actually violate any guarantees? select_collect
>> looks like it used to ignore unused dentries that were part of a
>> shrink list before fe91522a7ba82ca1a51b07e19954b3825e4aaa22 (found
>> would not be incremented). Once the dentry is on a shrink list would
>> it be unreachable anyways, so returning from d_invalidate before all
>> the shrink lists finish processing would be ok?
>>
>> khazhy
>
> Pinging in case this got missed, would appreciate thoughts from someone more
> familiar with the dcache.
>
> My previous email wasn't completely correct, while before fe91522a7ba8
> ("don't remove from shrink list in select_collect()") d_invalidate
> would not busy
> wait for other workers calling shrink_list to compete, it would return -EBUSY,
> rather than success, so a change to return success without waiting would not
> be equivalent behavior before. Currently, we will loop calling d_walk 
> repeatedly
> causing the extreme slowdown observed.
>
> I still want to understand better, in d_invalidate if we can return
> without pruning
> in-use dentries safely, would returning before unused dentries are
> pruned, so long
> as we know they will be pruned by another task (are on a shrink list),
> be safe as well?
> If not, would it make more sense to have have a mutual exclusion on calling
> d_invalidate on the same dentries simultaneously?

Re: Hang/soft lockup in d_invalidate with simultaneous calls

2017-06-03 Thread Khazhismel Kumykov
On Fri, Jun 2, 2017 at 11:20 PM, Al Viro  wrote:
> The thing is, unlike shrink_dcache_parent() we *can* bugger off as
> soon as we'd found no victims, nothing mounted and dentry itself
> is unhashed.  We can't do anything in select_collect() (we would've
> broken shrink_dcache_parent() that way), but we can do unhashing
> in check_and_drop() in "really nothing to do" case and we can return
> from d_invalidate() after that.  So how about this:
That does the trick.




Re: Hang/soft lockup in d_invalidate with simultaneous calls

2017-06-02 Thread Khazhismel Kumykov
On Fri, Jun 2, 2017 at 6:12 PM, Al Viro  wrote:
> Part of that could be relieved if we turned check_and_drop() into
> static void check_and_drop(void *_data)
> {
> struct detach_data *data = _data;
>
> if (!data->mountpoint && list_empty(&data->select.dispose))
> __d_drop(data->select.start);
> }

So with this change, d_invalidate will drop the starting dentry before
all its children are dropped? Would it make sense to just drop it
right off the bat, and let one task handle shrinking all its
children?
>
> It doesn't solve the entire problem, though - we still could get too
> many threads into that thing before any of them gets to that __d_drop().

Yes, the change isn't sufficient for my repro, as many threads get to
the loop before the drop, although new tasks don't get stuck in the
same loop after the dentry is dropped.

> I wonder if we should try and collect more victims; that could lead
> to contention of its own, but...

From my understanding, the contention is worst when one task is
shrinking everything, and several other tasks are busily looping
walking the dentry until everything is done. In this case, each task
busily looping d_walk holds the d_lock for a dentry while walking over
all its children, then soon after finishing the d_walk it queues up to
walk again, while shrink_dentry_list releases and re-grabs the lock
for each entry.

If we limit the number of items we pass to shrink_dentry_list at one
time, things actually look quite a bit better. For example, in testing,
arbitrarily limiting select.dispose to 1000 elements does fix my
repro.

e.g.
diff --git a/fs/dcache.c b/fs/dcache.c
index 22af360ceca3..3892e0eb7ec2 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1367,6 +1367,7 @@ struct select_data {
struct dentry *start;
struct list_head dispose;
int found;
+   int actually_found;
 };

 static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
@@ -1388,6 +1389,7 @@ static enum d_walk_ret select_collect(void
*_data, struct dentry *dentry)
if (!dentry->d_lockref.count) {
d_shrink_add(dentry, &data->dispose);
data->found++;
+   data->actually_found++;
}
}
/*
@@ -1397,6 +1399,9 @@ static enum d_walk_ret select_collect(void
*_data, struct dentry *dentry)
 */
if (!list_empty(&data->dispose))
ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
+
+   if (data->actually_found > 1000)
+   ret = D_WALK_QUIT;
 out:
return ret;
 }
@@ -1415,6 +1420,7 @@ void shrink_dcache_parent(struct dentry *parent)
INIT_LIST_HEAD(&data.dispose);
data.start = parent;
data.found = 0;
+   data.actually_found = 0;

d_walk(parent, &data, select_collect, NULL);
if (!data.found)
@@ -1536,6 +1542,7 @@ void d_invalidate(struct dentry *dentry)
INIT_LIST_HEAD(&data.select.dispose);
data.select.start = dentry;
data.select.found = 0;
+   data.select.actually_found = 0;

d_walk(dentry, &data, detach_and_collect, check_and_drop);




Re: Hang/soft lockup in d_invalidate with simultaneous calls

2017-06-12 Thread Khazhismel Kumykov
On Fri, Jun 2, 2017 at 11:47 PM, Khazhismel Kumykov <kha...@google.com> wrote:
> On Fri, Jun 2, 2017 at 11:20 PM, Al Viro <v...@zeniv.linux.org.uk> wrote:
>> The thing is, unlike shrink_dcache_parent() we *can* bugger off as
>> soon as we'd found no victims, nothing mounted and dentry itself
>> is unhashed.  We can't do anything in select_collect() (we would've
>> broken shrink_dcache_parent() that way), but we can do unhashing
>> in check_and_drop() in "really nothing to do" case and we can return
>> from d_invalidate() after that.  So how about this:
> That does the trick.

I'm not entirely familiar with the process here; is the above change
committed somewhere, or should I propose a patch?




[PATCH] generic: Add nocheck shutdown stress test

2017-09-21 Thread Khazhismel Kumykov
Most shutdown tests only run on filesystems with metadata journaling, so
we lose coverage. Add a shutdown stress test that doesn't check for
consistency, so it does not require journaling.

Signed-off-by: Khazhismel Kumykov <kha...@google.com>
---
 tests/generic/999 | 84 +++
 tests/generic/999.out |  2 ++
 tests/generic/group   |  1 +
 3 files changed, 87 insertions(+)
 create mode 100755 tests/generic/999
 create mode 100644 tests/generic/999.out

diff --git a/tests/generic/999 b/tests/generic/999
new file mode 100755
index ..71b9aa4c
--- /dev/null
+++ b/tests/generic/999
@@ -0,0 +1,84 @@
+#!/bin/bash
+# FS QA Test No. 999
+#
+# Shutdown stress test - exercise shutdown codepath with fsstress,
+# make sure we don't BUG/WARN. Coverage for all fs with shutdown.
+#
+#---
+# Copyright (c) 2013 Red Hat, Inc.  All Rights Reserved.
+# Copyright (c) 2017 Google, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+_cleanup()
+{
+   cd /
+   _scratch_unmount 2>/dev/null
+   rm -f $tmp.*
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+
+_require_scratch_nocheck
+_require_scratch_shutdown
+_require_command "$KILLALL_PROG" killall
+
+rm -f $seqres.full
+
+_scratch_mkfs > $seqres.full 2>&1
+_scratch_mount
+
+SLEEP_TIME=$((10 * $TIME_FACTOR))
+PROCS=$((4 * LOAD_FACTOR))
+
+load_dir=$SCRATCH_MNT/test
+
+$XFS_FSR_PROG -v $load_dir >> $seqres.full 2>&1
+$FSSTRESS_PROG $FSSTRESS_AVOID -n1000 -p $PROCS -d $load_dir >> 
$seqres.full 2>&1 &
+sleep $SLEEP_TIME
+sync
+
+# now shutdown and unmount
+sleep 5
+$here/src/godown $load_dir
+$KILLALL_PROG -q $FSSTRESS_PROG
+wait
+
+# for some reason fsstress processes manage to live on beyond the wait?
+sleep 5
+
+_scratch_unmount
+
+echo "No output is good. Failures are loud."
+
+status=0
+exit
diff --git a/tests/generic/999.out b/tests/generic/999.out
new file mode 100644
index ..68a51fe7
--- /dev/null
+++ b/tests/generic/999.out
@@ -0,0 +1,2 @@
+QA output created by 999
+No output is good. Failures are loud.
diff --git a/tests/generic/group b/tests/generic/group
index f922b496..891386ac 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -463,3 +463,4 @@
 458 auto quick clone
 459 auto dangerous
 460 auto quick rw
+999 auto shutdown stress
-- 
2.14.1.821.g8fa685d3b7-goog





[PATCH v2] generic: Add nocheck shutdown stress test

2017-09-22 Thread Khazhismel Kumykov
Most shutdown tests only run on filesystems with metadata journaling, so
we lose coverage. Add a shutdown stress test that doesn't check for
consistency, so it does not require journaling. This is a modified version
of generic/051, with some extras trimmed and fs checking removed.

Signed-off-by: Khazhismel Kumykov <kha...@google.com>
---
v2: drop defrag, use "silence is golden"

 tests/generic/999 | 83 +++
 tests/generic/999.out |  2 ++
 tests/generic/group   |  1 +
 3 files changed, 86 insertions(+)
 create mode 100755 tests/generic/999
 create mode 100644 tests/generic/999.out

diff --git a/tests/generic/999 b/tests/generic/999
new file mode 100755
index ..7b612686
--- /dev/null
+++ b/tests/generic/999
@@ -0,0 +1,83 @@
+#!/bin/bash
+# FS QA Test No. 999
+#
+# Shutdown stress test - exercise shutdown codepath with fsstress,
+# make sure we don't BUG/WARN. Coverage for all fs with shutdown.
+#
+#---
+# Copyright (c) 2013 Red Hat, Inc.  All Rights Reserved.
+# Copyright (c) 2017 Google, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+_cleanup()
+{
+   cd /
+   _scratch_unmount 2>/dev/null
+   rm -f $tmp.*
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+
+_require_scratch_nocheck
+_require_scratch_shutdown
+_require_command "$KILLALL_PROG" killall
+
+rm -f $seqres.full
+
+_scratch_mkfs > $seqres.full 2>&1
+_scratch_mount
+
+SLEEP_TIME=$((10 * $TIME_FACTOR))
+PROCS=$((4 * LOAD_FACTOR))
+
+load_dir=$SCRATCH_MNT/test
+
+$FSSTRESS_PROG $FSSTRESS_AVOID -n1000 -p $PROCS -d $load_dir >> 
$seqres.full 2>&1 &
+sleep $SLEEP_TIME
+sync
+
+# now shutdown and unmount
+sleep 5
+$here/src/godown $load_dir
+$KILLALL_PROG -q $FSSTRESS_PROG
+wait
+
+# for some reason fsstress processes manage to live on beyond the wait?
+sleep 5
+
+_scratch_unmount
+
+echo "Silence is golden"
+
+status=0
+exit
diff --git a/tests/generic/999.out b/tests/generic/999.out
new file mode 100644
index ..3b276ca8
--- /dev/null
+++ b/tests/generic/999.out
@@ -0,0 +1,2 @@
+QA output created by 999
+Silence is golden
diff --git a/tests/generic/group b/tests/generic/group
index f922b496..891386ac 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -463,3 +463,4 @@
 458 auto quick clone
 459 auto dangerous
 460 auto quick rw
+999 auto shutdown stress
-- 
2.14.1.821.g8fa685d3b7-goog





Re: [Patch v2 1/2] libiscsi: Fix use-after-free race during iscsi_session_teardown

2017-09-29 Thread Khazhismel Kumykov
Noticed these don't seem to be in 4.14/scsi-queue

On Tue, Aug 29, 2017 at 6:45 PM, Martin K. Petersen
 wrote:
>
> Chris,
>
>> Looks good to me, fixes up the code given that the comment there about
>> calling iscsi_remove_session wasn't being followed.
>
> Applied these two to 4.14/scsi-queue.
>
> --
> Martin K. Petersen  Oracle Linux Engineering




Re: [Patch v2 1/2] libiscsi: Fix use-after-free race during iscsi_session_teardown

2017-08-24 Thread Khazhismel Kumykov
On Thu, Jul 13, 2017 at 9:11 AM, Khazhismel Kumykov <kha...@google.com> wrote:
Ping in case this was missed




[RFC PATCH] blk-throttle: add burst allowance.

2017-11-14 Thread Khazhismel Kumykov
Allow configuring additional bytes or ios before a throttle is
triggered.

This allows implementation of a bucket style rate-limit/throttle on a
block device. Previously, bursting to a device was limited to allowance
granted in a single throtl_slice (similar to a bucket with limit N and
refill rate N/slice).

Additional parameters bytes/io_burst_conf defined for tg, which define a
number of bytes/ios that must be depleted before throttling happens. A
tg that does not deplete this allowance functions as though it has no
configured limits. tgs earn additional allowance at rate defined by
bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling
kicks in. If a tg is idle for a while, it will again have some burst
allowance before it gets throttled again.

slice_end for a tg is extended until io_disp/byte_disp would fall to 0,
when all "used" burst allowance would be earned back. trim_slice still
progresses slice_start and decrements *_disp as before, and
tgs continue to get bytes/ios in throtl_slice intervals.
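
For reference, the semantics described above amount to a token bucket with
capacity equal to the burst and refill rate equal to bps/iops. Below is a
standalone userspace model of just that accounting (made-up names, floating
point for brevity); it is not the blk-throttle code, but the wait it computes
once the pool is exhausted plays the same role as extending slice_end by
*_disp * HZ / limit in the patch below.

/* toy_bucket.c: token-bucket model of the burst allowance semantics */
#include <stdio.h>

struct bucket {
	double tokens;  /* bytes that may be dispatched right now (<= burst)  */
	double burst;   /* pool size: bytes allowed on top of the steady rate */
	double bps;     /* sustained rate: bytes earned back per second       */
	double last;    /* time of last update, in seconds                    */
};

static void refill(struct bucket *b, double now)
{
	b->tokens += (now - b->last) * b->bps;  /* idle time earns allowance */
	if (b->tokens > b->burst)
		b->tokens = b->burst;           /* never beyond the burst pool */
	b->last = now;
}

/* Account a dispatch of `bytes` at time `now`; return seconds to throttle. */
static double throttle_wait(struct bucket *b, double now, double bytes)
{
	refill(b, now);
	b->tokens -= bytes;
	if (b->tokens >= 0)
		return 0.0;                     /* still within burst: no limit */
	return -b->tokens / b->bps;             /* time to earn the deficit back */
}

int main(void)
{
	/* 1 MiB/s sustained with a 4 MiB burst allowance. */
	struct bucket b = { .tokens = 4 << 20, .burst = 4 << 20,
			    .bps = 1 << 20, .last = 0.0 };

	printf("%.1f\n", throttle_wait(&b, 0.0, 4 << 20));  /* 0.0: all burst   */
	printf("%.1f\n", throttle_wait(&b, 0.0, 1 << 20));  /* 1.0: throttled   */
	printf("%.1f\n", throttle_wait(&b, 10.0, 1 << 20)); /* 0.0: earned back */
	return 0;
}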

Signed-off-by: Khazhismel Kumykov <kha...@google.com>
---
 block/Kconfig|  11 +++
 block/blk-throttle.c | 192 +++
 2 files changed, 189 insertions(+), 14 deletions(-)

diff --git a/block/Kconfig b/block/Kconfig
index 28ec55752b68..fbd05b419f93 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -128,6 +128,17 @@ config BLK_DEV_THROTTLING_LOW
 
Note, this is an experimental interface and could be changed someday.
 
+config BLK_DEV_THROTTLING_BURST
+bool "Block throttling .burst allowance interface"
+depends on BLK_DEV_THROTTLING
+default n
+---help---
+Add .burst allowance for block throttling. Burst allowance allows for
+additional unthrottled usage, while still limiting speed for sustained
+usage.
+
+If in doubt, say N.
+
 config BLK_CMDLINE_PARSER
bool "Block device command line partition parser"
default n
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 96ad32623427..27c084312772 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -157,6 +157,11 @@ struct throtl_grp {
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
 
+#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
+   uint64_t bytes_burst_conf[2];
+   unsigned int io_burst_conf[2];
+#endif
+
unsigned long last_low_overflow_time[2];
 
uint64_t last_bytes_disp[2];
@@ -507,6 +512,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, 
int node)
tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
+#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
+   tg->bytes_burst_conf[READ] = 0;
+   tg->bytes_burst_conf[WRITE] = 0;
+   tg->io_burst_conf[READ] = 0;
+   tg->io_burst_conf[WRITE] = 0;
+#endif
/* LIMIT_LOW will have default value 0 */
 
tg->latency_target = DFL_LATENCY_TARGET;
@@ -800,6 +811,26 @@ static inline void throtl_start_new_slice(struct 
throtl_grp *tg, bool rw)
   tg->slice_end[rw], jiffies);
 }
 
+/*
+ * When current slice should end.
+ *
+ * With CONFIG_BLK_DEV_THROTTLING_BURST, we will wait longer than min_wait
+ * for slice to recover used burst allowance. (*_disp -> 0). Setting slice_end
+ * before this would result in tg receiving additional burst allowance.
+ */
+static inline unsigned long throtl_slice_wait(struct throtl_grp *tg, bool rw,
+   unsigned long min_wait)
+{
+   unsigned long bytes_wait = 0, io_wait = 0;
+#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
+   if (tg->bytes_burst_conf[rw])
+   bytes_wait = (tg->bytes_disp[rw] * HZ) / tg_bps_limit(tg, rw);
+   if (tg->io_burst_conf[rw])
+   io_wait = (tg->io_disp[rw] * HZ) / tg_iops_limit(tg, rw);
+#endif
+   return max(min_wait, max(bytes_wait, io_wait));
+}
+
 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
 {
@@ -849,7 +880,8 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, 
bool rw)
 * is bad because it does not allow new slice to start.
 */
 
-   throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
+   throtl_set_slice_end(tg, rw,
+   jiffies + throtl_slice_wait(tg, rw, tg->td->throtl_slice));
 
time_elapsed = jiffies - tg->slice_start[rw];
 
@@ -889,7 +921,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, 
struct bio *bio,
  unsigned long *wait)
 {
bool rw = bio_data_dir(bio);
-   unsigned int io_allowed;
+   unsigned int io_allowed, io_disp;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
u64 tmp;

Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-11-14 Thread Khazhismel Kumykov
(Botched the To: line, sorry)




Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-12-18 Thread Khazhismel Kumykov
On Mon, Dec 18, 2017 at 1:01 PM, Vivek Goyal <vgo...@redhat.com> wrote:
> On Mon, Dec 18, 2017 at 12:39:50PM -0800, Khazhismel Kumykov wrote:
>> On Mon, Dec 18, 2017 at 10:29 AM, Vivek Goyal <vgo...@redhat.com> wrote:
>> > On Mon, Dec 18, 2017 at 10:16:02AM -0800, Khazhismel Kumykov wrote:
>> >> On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov <kha...@google.com> 
>> >> wrote:
>> >> > On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li <s...@kernel.org> wrote:
>> >> >> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote:
>> >> >>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li <s...@kernel.org> wrote:
>> >> >>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote:
>> >> >>> >> Allows configuration additional bytes or ios before a throttle is
>> >> >>> >> triggered.
>> >> >>> >>
>> >> >>> >> This allows implementation of a bucket style rate-limit/throttle 
>> >> >>> >> on a
>> >> >>> >> block device. Previously, bursting to a device was limited to 
>> >> >>> >> allowance
>> >> >>> >> granted in a single throtl_slice (similar to a bucket with limit N 
>> >> >>> >> and
>> >> >>> >> refill rate N/slice).
>> >> >>> >>
>> >> >>> >> Additional parameters bytes/io_burst_conf defined for tg, which 
>> >> >>> >> define a
>> >> >>> >> number of bytes/ios that must be depleted before throttling 
>> >> >>> >> happens. A
>> >> >>> >> tg that does not deplete this allowance functions as though it has 
>> >> >>> >> no
>> >> >>> >> configured limits. tgs earn additional allowance at rate defined by
>> >> >>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, 
>> >> >>> >> throttling
>> >> >>> >> kicks in. If a tg is idle for a while, it will again have some 
>> >> >>> >> burst
>> >> >>> >> allowance before it gets throttled again.
>> >> >>> >>
>> >> >>> >> slice_end for a tg is extended until io_disp/byte_disp would fall 
>> >> >>> >> to 0,
>> >> >>> >> when all "used" burst allowance would be earned back. trim_slice 
>> >> >>> >> still
>> >> >>> >> does progress slice_start as before and decrements *_disp as 
>> >> >>> >> before, and
>> >> >>> >> tgs continue to get bytes/ios in throtl_slice intervals.
>> >> >>> >
>> >> >>> > Can you describe why we need this? It would be great if you can 
>> >> >>> > describe the
>> >> >>> > usage model and an example. Does this work for io.low/io.max or 
>> >> >>> > both?
>> >> >>> >
>> >> >>> > Thanks,
>> >> >>> > Shaohua
>> >> >>> >
>> >> >>>
>> >> >>> Use case that brought this up was configuring limits for a remote
>> >> >>> shared device. Bursting beyond io.max is desired but only for so much
>> >> >>> before the limit kicks in, afterwards with sustained usage throughput
>> >> >>> is capped. (This proactively avoids remote-side limits). In that case
>> >> >>> one would configure in a root container io.max + io.burst, and
>> >> >>> configure low/other limits on descendants sharing the resource on the
>> >> >>> same node.
>> >> >>>
>> >> >>> With this patch, so long as tg has not dispatched more than the burst,
>> >> >>> no limit is applied at all by that tg, including limit imposed by
>> >> >>> io.low in tg_iops_limit, etc.
>> >> >>
>> >> >> I'd appreciate if you can give more details about the 'why'. 
>> >> >> 'configuring
>> >> >> limits for a remote shared device' doesn't justify the change.
>> >> >
>> >> > This is to configure a bursty workload (and associated device) with
>> >> > known/a

Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-12-18 Thread Khazhismel Kumykov
On Mon, Dec 18, 2017 at 10:29 AM, Vivek Goyal <vgo...@redhat.com> wrote:
> On Mon, Dec 18, 2017 at 10:16:02AM -0800, Khazhismel Kumykov wrote:
>> On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov <kha...@google.com> 
>> wrote:
>> > On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li <s...@kernel.org> wrote:
>> >> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote:
>> >>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li <s...@kernel.org> wrote:
>> >>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote:
>> >>> >> Allows configuration additional bytes or ios before a throttle is
>> >>> >> triggered.
>> >>> >>
>> >>> >> This allows implementation of a bucket style rate-limit/throttle on a
>> >>> >> block device. Previously, bursting to a device was limited to 
>> >>> >> allowance
>> >>> >> granted in a single throtl_slice (similar to a bucket with limit N and
>> >>> >> refill rate N/slice).
>> >>> >>
>> >>> >> Additional parameters bytes/io_burst_conf defined for tg, which 
>> >>> >> define a
>> >>> >> number of bytes/ios that must be depleted before throttling happens. A
>> >>> >> tg that does not deplete this allowance functions as though it has no
>> >>> >> configured limits. tgs earn additional allowance at rate defined by
>> >>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling
>> >>> >> kicks in. If a tg is idle for a while, it will again have some burst
>> >>> >> allowance before it gets throttled again.
>> >>> >>
>> >>> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 
>> >>> >> 0,
>> >>> >> when all "used" burst allowance would be earned back. trim_slice still
>> >>> >> does progress slice_start as before and decrements *_disp as before, 
>> >>> >> and
>> >>> >> tgs continue to get bytes/ios in throtl_slice intervals.
>> >>> >
>> >>> > Can you describe why we need this? It would be great if you can 
>> >>> > describe the
>> >>> > usage model and an example. Does this work for io.low/io.max or both?
>> >>> >
>> >>> > Thanks,
>> >>> > Shaohua
>> >>> >
>> >>>
>> >>> Use case that brought this up was configuring limits for a remote
>> >>> shared device. Bursting beyond io.max is desired but only for so much
>> >>> before the limit kicks in, afterwards with sustained usage throughput
>> >>> is capped. (This proactively avoids remote-side limits). In that case
>> >>> one would configure in a root container io.max + io.burst, and
>> >>> configure low/other limits on descendants sharing the resource on the
>> >>> same node.
>> >>>
>> >>> With this patch, so long as tg has not dispatched more than the burst,
>> >>> no limit is applied at all by that tg, including limit imposed by
>> >>> io.low in tg_iops_limit, etc.
>> >>
>> >> I'd appreciate if you can give more details about the 'why'. 'configuring
>> >> limits for a remote shared device' doesn't justify the change.
>> >
>> > This is to configure a bursty workload (and associated device) with
>> > known/allowed expected burst size, but to not allow full utilization
>> > of the device for extended periods of time for QoS. During idle or low
>> > use periods the burst allowance accrues, and then tasks can burst well
>> > beyond the configured throttle up to the limit, afterwards is
>> > throttled. A constant throttle speed isn't sufficient for this as you
>> > can only burst 1 slice worth, but a limit of sorts is desirable for
>> > preventing over utilization of the shared device. This type of limit
>> > is also slightly different than what i understand io.low does in local
>> > cases in that tg is only high priority/unthrottled if it is bursty,
>> > and is limited with constant usage
>> >
>> > Khazhy
>>
>> Hi Shaohua,
>>
>> Does this clarify the reason for this patch? Is this (or something
>> similar) a good fit for inclusion in blk-throttle?
>>
>
> So does this brus

Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-12-18 Thread Khazhismel Kumykov
On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov <kha...@google.com> wrote:
> On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li <s...@kernel.org> wrote:
>> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote:
>>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li <s...@kernel.org> wrote:
>>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote:
>>> >> Allows configuration additional bytes or ios before a throttle is
>>> >> triggered.
>>> >>
>>> >> This allows implementation of a bucket style rate-limit/throttle on a
>>> >> block device. Previously, bursting to a device was limited to allowance
>>> >> granted in a single throtl_slice (similar to a bucket with limit N and
>>> >> refill rate N/slice).
>>> >>
>>> >> Additional parameters bytes/io_burst_conf defined for tg, which define a
>>> >> number of bytes/ios that must be depleted before throttling happens. A
>>> >> tg that does not deplete this allowance functions as though it has no
>>> >> configured limits. tgs earn additional allowance at rate defined by
>>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling
>>> >> kicks in. If a tg is idle for a while, it will again have some burst
>>> >> allowance before it gets throttled again.
>>> >>
>>> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 0,
>>> >> when all "used" burst allowance would be earned back. trim_slice still
>>> >> does progress slice_start as before and decrements *_disp as before, and
>>> >> tgs continue to get bytes/ios in throtl_slice intervals.
>>> >
>>> > Can you describe why we need this? It would be great if you can describe 
>>> > the
>>> > usage model and an example. Does this work for io.low/io.max or both?
>>> >
>>> > Thanks,
>>> > Shaohua
>>> >
>>>
>>> Use case that brought this up was configuring limits for a remote
>>> shared device. Bursting beyond io.max is desired but only for so much
>>> before the limit kicks in, afterwards with sustained usage throughput
>>> is capped. (This proactively avoids remote-side limits). In that case
>>> one would configure in a root container io.max + io.burst, and
>>> configure low/other limits on descendants sharing the resource on the
>>> same node.
>>>
>>> With this patch, so long as tg has not dispatched more than the burst,
>>> no limit is applied at all by that tg, including limit imposed by
>>> io.low in tg_iops_limit, etc.
>>
>> I'd appreciate if you can give more details about the 'why'. 'configuring
>> limits for a remote shared device' doesn't justify the change.
>
> This is to configure a bursty workload (and associated device) with
> known/allowed expected burst size, but to not allow full utilization
> of the device for extended periods of time for QoS. During idle or low
> use periods the burst allowance accrues, and then tasks can burst well
> beyond the configured throttle up to the limit, afterwards is
> throttled. A constant throttle speed isn't sufficient for this as you
> can only burst 1 slice worth, but a limit of sorts is desirable for
> preventing over utilization of the shared device. This type of limit
> is also slightly different than what i understand io.low does in local
> cases in that tg is only high priority/unthrottled if it is bursty,
> and is limited with constant usage
>
> Khazhy

Hi Shaohua,

Does this clarify the reason for this patch? Is this (or something
similar) a good fit for inclusion in blk-throttle?

Thanks,
Khazhy




Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-11-20 Thread Khazhismel Kumykov
On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li <s...@kernel.org> wrote:
> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote:
>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li <s...@kernel.org> wrote:
>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote:
>> >> Allows configuration additional bytes or ios before a throttle is
>> >> triggered.
>> >>
>> >> This allows implementation of a bucket style rate-limit/throttle on a
>> >> block device. Previously, bursting to a device was limited to allowance
>> >> granted in a single throtl_slice (similar to a bucket with limit N and
>> >> refill rate N/slice).
>> >>
>> >> Additional parameters bytes/io_burst_conf defined for tg, which define a
>> >> number of bytes/ios that must be depleted before throttling happens. A
>> >> tg that does not deplete this allowance functions as though it has no
>> >> configured limits. tgs earn additional allowance at rate defined by
>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling
>> >> kicks in. If a tg is idle for a while, it will again have some burst
>> >> allowance before it gets throttled again.
>> >>
>> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 0,
>> >> when all "used" burst allowance would be earned back. trim_slice still
>> >> does progress slice_start as before and decrements *_disp as before, and
>> >> tgs continue to get bytes/ios in throtl_slice intervals.
>> >
>> > Can you describe why we need this? It would be great if you can describe 
>> > the
>> > usage model and an example. Does this work for io.low/io.max or both?
>> >
>> > Thanks,
>> > Shaohua
>> >
>>
>> Use case that brought this up was configuring limits for a remote
>> shared device. Bursting beyond io.max is desired but only for so much
>> before the limit kicks in, afterwards with sustained usage throughput
>> is capped. (This proactively avoids remote-side limits). In that case
>> one would configure in a root container io.max + io.burst, and
>> configure low/other limits on descendants sharing the resource on the
>> same node.
>>
>> With this patch, so long as tg has not dispatched more than the burst,
>> no limit is applied at all by that tg, including limit imposed by
>> io.low in tg_iops_limit, etc.
>
> I'd appreciate if you can give more details about the 'why'. 'configuring
> limits for a remote shared device' doesn't justify the change.

This is to configure a bursty workload (and associated device) with a
known/allowed expected burst size, but to not allow full utilization
of the device for extended periods of time, for QoS. During idle or low
use periods the burst allowance accrues, and then tasks can burst well
beyond the configured throttle up to the limit, after which they are
throttled. A constant throttle speed isn't sufficient for this, as you
can only burst one slice's worth, but a limit of sorts is desirable for
preventing over-utilization of the shared device. This type of limit
is also slightly different from what I understand io.low does in local
cases, in that the tg is only high priority/unthrottled if it is bursty,
and is limited under constant usage.

Khazhy




Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-11-16 Thread Khazhismel Kumykov
On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li <s...@kernel.org> wrote:
> On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote:
>> Allows configuration additional bytes or ios before a throttle is
>> triggered.
>>
>> This allows implementation of a bucket style rate-limit/throttle on a
>> block device. Previously, bursting to a device was limited to allowance
>> granted in a single throtl_slice (similar to a bucket with limit N and
>> refill rate N/slice).
>>
>> Additional parameters bytes/io_burst_conf defined for tg, which define a
>> number of bytes/ios that must be depleted before throttling happens. A
>> tg that does not deplete this allowance functions as though it has no
>> configured limits. tgs earn additional allowance at rate defined by
>> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling
>> kicks in. If a tg is idle for a while, it will again have some burst
>> allowance before it gets throttled again.
>>
>> slice_end for a tg is extended until io_disp/byte_disp would fall to 0,
>> when all "used" burst allowance would be earned back. trim_slice still
>> does progress slice_start as before and decrements *_disp as before, and
>> tgs continue to get bytes/ios in throtl_slice intervals.
>
> Can you describe why we need this? It would be great if you can describe the
> usage model and an example. Does this work for io.low/io.max or both?
>
> Thanks,
> Shaohua
>

The use case that brought this up was configuring limits for a remote
shared device. Bursting beyond io.max is desired, but only for so much
before the limit kicks in; afterwards, with sustained usage, throughput
is capped. (This proactively avoids remote-side limits.) In that case
one would configure io.max + io.burst in a root container, and
configure low/other limits on descendants sharing the resource on the
same node.

With this patch, so long as a tg has not dispatched more than the burst,
no limit is applied at all by that tg, including the limit imposed by
io.low in tg_iops_limit, etc.

Khazhy

>> Signed-off-by: Khazhismel Kumykov <kha...@google.com>
>> ---
>>  block/Kconfig|  11 +++
>>  block/blk-throttle.c | 192 
>> +++
>>  2 files changed, 189 insertions(+), 14 deletions(-)
>>
>> diff --git a/block/Kconfig b/block/Kconfig
>> index 28ec55752b68..fbd05b419f93 100644
>> --- a/block/Kconfig
>> +++ b/block/Kconfig
>> @@ -128,6 +128,17 @@ config BLK_DEV_THROTTLING_LOW
>>
>>   Note, this is an experimental interface and could be changed someday.
>>
>> +config BLK_DEV_THROTTLING_BURST
>> +bool "Block throttling .burst allowance interface"
>> +depends on BLK_DEV_THROTTLING
>> +default n
>> +---help---
>> +Add .burst allowance for block throttling. Burst allowance allows for
>> +additional unthrottled usage, while still limiting speed for sustained
>> +usage.
>> +
>> +If in doubt, say N.
>> +
>>  config BLK_CMDLINE_PARSER
>>   bool "Block device command line partition parser"
>>   default n
>> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
>> index 96ad32623427..27c084312772 100644
>> --- a/block/blk-throttle.c
>> +++ b/block/blk-throttle.c
>> @@ -157,6 +157,11 @@ struct throtl_grp {
>>   /* Number of bio's dispatched in current slice */
>>   unsigned int io_disp[2];
>>
>> +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
>> + uint64_t bytes_burst_conf[2];
>> + unsigned int io_burst_conf[2];
>> +#endif
>> +
>>   unsigned long last_low_overflow_time[2];
>>
>>   uint64_t last_bytes_disp[2];
>> @@ -507,6 +512,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t 
>> gfp, int node)
>>   tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
>>   tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
>>   tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
>> +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
>> + tg->bytes_burst_conf[READ] = 0;
>> + tg->bytes_burst_conf[WRITE] = 0;
>> + tg->io_burst_conf[READ] = 0;
>> + tg->io_burst_conf[WRITE] = 0;
>> +#endif
>>   /* LIMIT_LOW will have default value 0 */
>>
>>   tg->latency_target = DFL_LATENCY_TARGET;
>> @@ -800,6 +811,26 @@ static inline void throtl_start_new_slice(struct 
>> throtl_grp *tg, bool rw)
>>  tg->slice_end[rw], jiffies);
>>  }
>>
>> +/*
>> + * When current slice should end.
>> + *
>> + * With CONFIG_B

Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-12-06 Thread Khazhismel Kumykov
On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov <kha...@google.com> wrote:
> On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li <s...@kernel.org> wrote:
>> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote:
>>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li <s...@kernel.org> wrote:
>>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote:
>>> >> Allows configuration additional bytes or ios before a throttle is
>>> >> triggered.
>>> >>
>>> >> This allows implementation of a bucket style rate-limit/throttle on a
>>> >> block device. Previously, bursting to a device was limited to allowance
>>> >> granted in a single throtl_slice (similar to a bucket with limit N and
>>> >> refill rate N/slice).
>>> >>
>>> >> Additional parameters bytes/io_burst_conf defined for tg, which define a
>>> >> number of bytes/ios that must be depleted before throttling happens. A
>>> >> tg that does not deplete this allowance functions as though it has no
>>> >> configured limits. tgs earn additional allowance at rate defined by
>>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling
>>> >> kicks in. If a tg is idle for a while, it will again have some burst
>>> >> allowance before it gets throttled again.
>>> >>
>>> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 0,
>>> >> when all "used" burst allowance would be earned back. trim_slice still
>>> >> does progress slice_start as before and decrements *_disp as before, and
>>> >> tgs continue to get bytes/ios in throtl_slice intervals.
>>> >
>>> > Can you describe why we need this? It would be great if you can describe 
>>> > the
>>> > usage model and an example. Does this work for io.low/io.max or both?
>>> >
>>> > Thanks,
>>> > Shaohua
>>> >
>>>
>>> Use case that brought this up was configuring limits for a remote
>>> shared device. Bursting beyond io.max is desired but only for so much
>>> before the limit kicks in, afterwards with sustained usage throughput
>>> is capped. (This proactively avoids remote-side limits). In that case
>>> one would configure in a root container io.max + io.burst, and
>>> configure low/other limits on descendants sharing the resource on the
>>> same node.
>>>
>>> With this patch, so long as tg has not dispatched more than the burst,
>>> no limit is applied at all by that tg, including limit imposed by
>>> io.low in tg_iops_limit, etc.
>>
>> I'd appreciate if you can give more details about the 'why'. 'configuring
>> limits for a remote shared device' doesn't justify the change.
>
> This is to configure a bursty workload (and associated device) with
> known/allowed expected burst size, but to not allow full utilization
> of the device for extended periods of time for QoS. During idle or low
> use periods the burst allowance accrues, and then tasks can burst well
> beyond the configured throttle up to the limit, afterwards is
> throttled. A constant throttle speed isn't sufficient for this as you
> can only burst 1 slice worth, but a limit of sorts is desirable for
> preventing over utilization of the shared device. This type of limit
> is also slightly different than what i understand io.low does in local
> cases in that tg is only high priority/unthrottled if it is bursty,
> and is limited with constant usage
>
> Khazhy
Ping this time without html (oops)


smime.p7s
Description: S/MIME Cryptographic Signature


[RFC PATCH] blk-throttle: add burst allowance

2017-10-26 Thread Khazhismel Kumykov
Allows configuring additional bytes or ios before a throttle is
triggered. Slice end is extended to cover the expended allowance
recovery time.

Usage would be e.g. per device to allow users to take up to X bytes/ios
at full speed, but be limited to Y bps/iops with sustained usage.

Signed-off-by: Khazhismel Kumykov <kha...@google.com>
---
 block/Kconfig|  11 +++
 block/blk-throttle.c | 185 ---
 2 files changed, 186 insertions(+), 10 deletions(-)

diff --git a/block/Kconfig b/block/Kconfig
index 3ab42bbb06d5..16545caa7fc9 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -127,6 +127,17 @@ config BLK_DEV_THROTTLING_LOW
 
Note, this is an experimental interface and could be changed someday.
 
+config BLK_DEV_THROTTLING_BURST
+bool "Block throttling .burst allowance interface"
+depends on BLK_DEV_THROTTLING
+default n
+---help---
+Add .burst allowance for block throttling. Burst allowance allows for
+additional unthrottled usage, while still limiting speed for sustained
+usage.
+
+If in doubt, say N.
+
 config BLK_CMDLINE_PARSER
bool "Block device command line partition parser"
default n
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index d80c3f0144c5..e09ec11e9c5f 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -156,6 +156,11 @@ struct throtl_grp {
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
 
+#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
+   uint64_t bytes_burst_conf[2];
+   unsigned int io_burst_conf[2];
+#endif
+
unsigned long last_low_overflow_time[2];
 
uint64_t last_bytes_disp[2];
@@ -506,6 +511,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, 
int node)
tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
+#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
+   tg->bytes_burst_conf[READ] = 0;
+   tg->bytes_burst_conf[WRITE] = 0;
+   tg->io_burst_conf[READ] = 0;
+   tg->io_burst_conf[WRITE] = 0;
+#endif
/* LIMIT_LOW will have default value 0 */
 
tg->latency_target = DFL_LATENCY_TARGET;
@@ -799,6 +810,26 @@ static inline void throtl_start_new_slice(struct 
throtl_grp *tg, bool rw)
   tg->slice_end[rw], jiffies);
 }
 
+/*
+ * When current slice should end.
+ *
+ * With CONFIG_BLK_DEV_THROTTLING_BURST, we will wait longer than min_wait
+ * for slice to recover used burst allowance. (*_disp -> 0). Setting slice_end
+ * before this would result in tg receiving additional burst allowance.
+ */
+static inline unsigned long throtl_slice_wait(struct throtl_grp *tg, bool rw,
+   unsigned long min_wait)
+{
+   unsigned long bytes_wait = 0, io_wait = 0;
+#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
+   if (tg->bytes_burst_conf[rw])
+   bytes_wait = (tg->bytes_disp[rw] * HZ) / tg_bps_limit(tg, rw);
+   if (tg->io_burst_conf[rw])
+   io_wait = (tg->io_disp[rw] * HZ) / tg_iops_limit(tg, rw);
+#endif
+   return jiffies + max(min_wait, max(bytes_wait, io_wait));
+}
+
 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
 {
@@ -848,7 +879,8 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, 
bool rw)
 * is bad because it does not allow new slice to start.
 */
 
-   throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
+   throtl_set_slice_end(tg, rw,
+throtl_slice_wait(tg, rw, tg->td->throtl_slice));
 
time_elapsed = jiffies - tg->slice_start[rw];
 
@@ -888,7 +920,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, 
struct bio *bio,
  unsigned long *wait)
 {
bool rw = bio_data_dir(bio);
-   unsigned int io_allowed;
+   unsigned int io_allowed, io_disp;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
u64 tmp;
 
@@ -907,6 +939,17 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, 
struct bio *bio,
 * have been trimmed.
 */
 
+   io_disp = tg->io_disp[rw];
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
+   if (tg->io_disp[rw] < tg->io_burst_conf[rw]) {
+   if (wait)
+   *wait = 0;
+   return true;
+   }
+   io_disp -= tg->io_burst_conf[rw];
+#endif
+
tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
do_div(tmp, HZ);
 
@@ -915,14 +958,14 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, 
struct bio *bio,
else
io_allowed = tmp;
 
-   if (tg->io_disp[rw] + 1 <= io_allowed) {
+   if (io_disp + 1 &

[PATCH] dm mpath selector: more evenly distribute ties

2018-01-19 Thread Khazhismel Kumykov
Move the last used path to the end of the list (least preferred) so that
ties are more evenly distributed.

For example, in a case with three paths where one is slower than the
others, the remaining two would be used unevenly if they tie. This is
due to the rotation not being a truly fair distribution.

Illustrated: paths a, b, c, 'c' has 1 outstanding IO, a and b are 'tied'
Three possible rotations:
(a, b, c) -> best path 'a'
(b, c, a) -> best path 'b'
(c, a, b) -> best path 'a'
(a, b, c) -> best path 'a'
(b, c, a) -> best path 'b'
(c, a, b) -> best path 'a'
...

So 'a' is used 2x more than 'b', although they should be used evenly.

With this change, the most recently used path is always the least
preferred, removing this bias and resulting in an even distribution.
(a, b, c) -> best path 'a'
(b, c, a) -> best path 'b'
(c, a, b) -> best path 'a'
(c, b, a) -> best path 'b'
...

Signed-off-by: Khazhismel Kumykov <kha...@google.com>
---
 drivers/md/dm-queue-length.c | 6 +++---
 drivers/md/dm-service-time.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 23f178641794..969c4f1a3633 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -195,9 +195,6 @@ static struct dm_path *ql_select_path(struct path_selector 
*ps, size_t nr_bytes)
if (list_empty(&s->valid_paths))
goto out;
 
-   /* Change preferred (first in list) path to evenly balance. */
-   list_move_tail(s->valid_paths.next, &s->valid_paths);
-
list_for_each_entry(pi, &s->valid_paths, list) {
if (!best ||
(atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
@@ -210,6 +207,9 @@ static struct dm_path *ql_select_path(struct path_selector 
*ps, size_t nr_bytes)
if (!best)
goto out;
 
+   /* Move most recently used to least preferred to evenly balance. */
+   list_move_tail(&best->list, &s->valid_paths);
+
ret = best->path;
 out:
spin_unlock_irqrestore(&s->lock, flags);
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index 7b8642045c55..f006a9005593 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -282,9 +282,6 @@ static struct dm_path *st_select_path(struct path_selector 
*ps, size_t nr_bytes)
if (list_empty(&s->valid_paths))
goto out;
 
-   /* Change preferred (first in list) path to evenly balance. */
-   list_move_tail(s->valid_paths.next, &s->valid_paths);
-
list_for_each_entry(pi, &s->valid_paths, list)
if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
best = pi;
@@ -292,6 +289,9 @@ static struct dm_path *st_select_path(struct path_selector 
*ps, size_t nr_bytes)
if (!best)
goto out;
 
+   /* Move most recently used to least preferred to evenly balance. */
+   list_move_tail(&best->list, &s->valid_paths);
+
ret = best->path;
 out:
spin_unlock_irqrestore(&s->lock, flags);
-- 
2.16.0.rc1.238.g530d649a79-goog



smime.p7s
Description: S/MIME Cryptographic Signature


Re: [dm-devel] [PATCH] dm mpath selector: more evenly distribute ties

2018-01-24 Thread Khazhismel Kumykov
On Wed, Jan 24, 2018 at 11:09 AM, Martin Wilck <mwi...@suse.com> wrote:
> On Wed, 2018-01-24 at 10:44 -0800, Khazhismel Kumykov wrote:
>> On Wed, Jan 24, 2018 at 2:57 AM, Martin Wilck <mwi...@suse.com>
>> wrote:
>> > On Fri, 2018-01-19 at 15:07 -0800, Khazhismel Kumykov wrote:
>> > > Move the last used path to the end of the list (least preferred)
>> > > so
>> > > that
>> > > ties are more evenly distributed.
>> > >
>> > > For example, in case with three paths with one that is slower
>> > > than
>> > > others, the remaining two would be unevenly used if they tie.
>> > > This is
>> > > due to the rotation not being a truely fair distribution.
>> > >
>> > > Illustrated: paths a, b, c, 'c' has 1 outstanding IO, a and b are
>> > > 'tied'
>> > > Three possible rotations:
>> > > (a, b, c) -> best path 'a'
>> > > (b, c, a) -> best path 'b'
>> > > (c, a, b) -> best path 'a'
>> > > (a, b, c) -> best path 'a'
>> > > (b, c, a) -> best path 'b'
>> > > (c, a, b) -> best path 'a'
>> > > ...
>> >
>> >
>> > This happens only if a and b actually have the same weight (e.g.
>> > queue
>> > length for the queue-length selector). If 'a' really receives more
>> > IO,
>> > its queue grows, and the selector will start preferring 'b', so the
>> > effect should level out automatically with the current code as soon
>> > as
>> > you have real IO going on. But maybe I haven't grasped what you're
>> > referring to as "tied".
>>
>> Yes, for "tied" I'm referring to paths with the same weight. As queue
>> length grows it does tend to level out, but in the case where queue
>> length doesn't grow (in this example I'm imagining 2 concurrent
>> requests to the device) the bias does show and the selectors really
>> do
>> send 'a' 2x more requests than 'b' (when 'c' is much slower and 'a'
>> and 'b' are ~same speed).
>
> Have you actually observed this? I find the idea pretty academical that
> two paths would be walking "tied" this way. In practice, under IO load,
> I'd expect queue lengths (and service-times, for that matter) to
> fluctuate enough to prevent this effect to be measurable. But of
> course, I may be wrong. If you really did observe this, the next
> question would be: does hurt performance to an extent that can be
> noticed/measured? I reckon that if 'a' got saturated under the load,
> hurting performance, its queue length would grow quickly and 'b' would
> automatically get preferred.
This is fairly simple to observe - start two sync reader threads
against a device with 3 backing paths, where one path takes longer on
average to complete requests than the other two. One of the 'faster'
paths will be used ~2x more than the other. Perhaps not that common a
situation, but it is a real one. The bias seemed simple to remove, so
that the two (or N) paths would be used equally.

I don't see a downside to distributing more evenly in this case,
although I can't say I've directly observed performance downsides from
a biased distribution (aside from the distribution being biased itself
being a downside).
The bias of note is inherent to the order in which paths are added to
the selector (and to which path is 'always bad'), so if 'a' is
saturated due to this and slows, once it recovers it will continue to
be preferred, unlike with an even distribution.
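
(Not the original test setup, but one rough way to reproduce this, assuming
a dm-multipath device at /dev/mapper/mpatha using the queue-length or
service-time selector - the device name and numbers are placeholders:)

  # two synchronous readers against the multipath device
  fio --name=readers --filename=/dev/mapper/mpatha --ioengine=psync \
      --rw=randread --direct=1 --numjobs=2 --time_based --runtime=60
  # then compare per-path utilization, e.g. with iostat -x 1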

Khazhy


smime.p7s
Description: S/MIME Cryptographic Signature


Re: [dm-devel] [PATCH] dm mpath selector: more evenly distribute ties

2018-01-24 Thread Khazhismel Kumykov
On Wed, Jan 24, 2018 at 2:57 AM, Martin Wilck <mwi...@suse.com> wrote:
> On Fri, 2018-01-19 at 15:07 -0800, Khazhismel Kumykov wrote:
>> Move the last used path to the end of the list (least preferred) so
>> that
>> ties are more evenly distributed.
>>
>> For example, in case with three paths with one that is slower than
>> others, the remaining two would be unevenly used if they tie. This is
>> due to the rotation not being a truely fair distribution.
>>
>> Illustrated: paths a, b, c, 'c' has 1 outstanding IO, a and b are
>> 'tied'
>> Three possible rotations:
>> (a, b, c) -> best path 'a'
>> (b, c, a) -> best path 'b'
>> (c, a, b) -> best path 'a'
>> (a, b, c) -> best path 'a'
>> (b, c, a) -> best path 'b'
>> (c, a, b) -> best path 'a'
>> ...
>
>
> This happens only if a and b actually have the same weight (e.g. queue
> length for the queue-length selector). If 'a' really receives more IO,
> its queue grows, and the selector will start preferring 'b', so the
> effect should level out automatically with the current code as soon as
> you have real IO going on. But maybe I haven't grasped what you're
> referring to as "tied".
Yes, for "tied" I'm referring to paths with the same weight. As queue
length grows it does tend to level out, but in the case where queue
length doesn't grow (in this example I'm imagining 2 concurrent
requests to the device) the bias does show and the selectors really do
send 'a' 2x more requests than 'b' (when 'c' is much slower and 'a'
and 'b' are ~same speed).

>
> OTOH, if the "best" path has much lower queue length than the other
> paths for whatever reason, your pushing it to the tail will require a
> full list walk with every new call of the selector. I see that as a
> small disadvantage of your approach.
I see - with the best path at the tail, we would not see as much benefit
from queue-length's check for whether a path has no IOs on it. In
service-time no such short circuit exists, so I don't believe anything
changes there. Am I understanding this correctly?

>
> Regards
> Martin
>

Thanks for your comments,
Khazhy

>>
>> So 'a' is used 2x more than 'b', although they should be used evenly.
>>
>> With this change, the most recently used path is always the least
>> preferred, removing this bias resulting in even distribution.
>> (a, b, c) -> best path 'a'
>> (b, c, a) -> best path 'b'
>> (c, a, b) -> best path 'a'
>> (c, b, a) -> best path 'b'
>> ...
>>
>> Signed-off-by: Khazhismel Kumykov <kha...@google.com>
>> ---
>>  drivers/md/dm-queue-length.c | 6 +++---
>>  drivers/md/dm-service-time.c | 6 +++---
>>  2 files changed, 6 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-
>> length.c
>> index 23f178641794..969c4f1a3633 100644
>> --- a/drivers/md/dm-queue-length.c
>> +++ b/drivers/md/dm-queue-length.c
>> @@ -195,9 +195,6 @@ static struct dm_path *ql_select_path(struct
>> path_selector *ps, size_t nr_bytes)
>>   if (list_empty(&s->valid_paths))
>>   goto out;
>>
>> - /* Change preferred (first in list) path to evenly balance.
>> */
>> - list_move_tail(s->valid_paths.next, &s->valid_paths);
>> -
>>   list_for_each_entry(pi, &s->valid_paths, list) {
>>   if (!best ||
>>   (atomic_read(&pi->qlen) < atomic_read(
>> &best->qlen)))
>> @@ -210,6 +207,9 @@ static struct dm_path *ql_select_path(struct
>> path_selector *ps, size_t nr_bytes)
>>   if (!best)
>>   goto out;
>>
>> + /* Move most recently used to least preferred to evenly
>> balance. */
>> + list_move_tail(&best->list, &s->valid_paths);
>> +
>>   ret = best->path;
>>  out:
>>   spin_unlock_irqrestore(&s->lock, flags);
>> diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-
>> time.c
>> index 7b8642045c55..f006a9005593 100644
>> --- a/drivers/md/dm-service-time.c
>> +++ b/drivers/md/dm-service-time.c
>> @@ -282,9 +282,6 @@ static struct dm_path *st_select_path(struct
>> path_selector *ps, size_t nr_bytes)
>>   if (list_empty(&s->valid_paths))
>>   goto out;
>>
>> - /* Change preferred (first in list) path to evenly balance.
>> */
>> - list_move_tail(s->valid_paths.next, &s->valid_paths);
>> -
>>   list_for_each_entry(pi, &s->valid_paths, list)
>>   if (!best || (st_compare_load(pi, best, nr_bytes) <
>> 0))
>> 

[PATCH] block/compat_ioctl: fix range check in BLKGETSIZE

2018-04-06 Thread Khazhismel Kumykov
The kernel's unsigned long and compat_ulong_t may not be the same width.
Use the compat type directly to eliminate the mismatch.

Previously, 32-bit mode would see silent truncation rather than EFBIG
for large disks.
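
As a concrete illustration, on a 64-bit kernel servicing a 32-bit process
(the disk size is illustrative):

  sizeof(unsigned long)  = 8 -> ~0UL               = 2^64 - 1
  sizeof(compat_ulong_t) = 4 -> ~(compat_ulong_t)0 = 2^32 - 1
  4 TiB disk: size >> 9 = 2^33 sectors
    old check: 2^33 > 2^64 - 1 is false -> compat_put_ulong() truncates to 0
    new check: 2^33 > 2^32 - 1 is true  -> -EFBIG is returned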

Signed-off-by: Khazhismel Kumykov <kha...@google.com>
---
 block/compat_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 6ca015f92766..3a2c77f07da8 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -388,7 +388,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, 
unsigned long arg)
return 0;
case BLKGETSIZE:
size = i_size_read(bdev->bd_inode);
-   if ((size >> 9) > ~0UL)
+   if ((size >> 9) > ~((compat_ulong_t)0UL))
return -EFBIG;
return compat_put_ulong(arg, size >> 9);
 
-- 
2.17.0.484.g0c8726318c-goog



smime.p7s
Description: S/MIME Cryptographic Signature


[PATCH] fs/dcache.c: re-add cond_resched() in shrink_dcache_parent()

2018-04-13 Thread Khazhismel Kumykov
shrink_dcache_parent may spin waiting for a parallel shrink_dentry_list.
In this case we may have 0 dentries to dispose, so we will never
schedule out while waiting for the parallel shrink_dentry_list to
complete.

Tested that this fixes syzbot reports of stalls in shrink_dcache_parent()

Fixes: 32785c0539b7 ("fs/dcache.c: add cond_resched() in shrink_dentry_list()")
Reported-by: syzbot+ae80b790eb412884c...@syzkaller.appspotmail.com

Cc: Nikolay Borisov <nbori...@suse.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: David Rientjes <rient...@google.com>
Cc: Alexander Viro <v...@zeniv.linux.org.uk>
Cc: Goldwyn Rodrigues <rgold...@suse.de>
Cc: Jeff Mahoney <je...@suse.com>
Cc: Davidlohr Bueso <d...@stgolabs.net>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Signed-off-by: Khazhismel Kumykov <kha...@google.com>
---
 fs/dcache.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index 591b34500e41..3507badeb60a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1489,6 +1489,7 @@ void shrink_dcache_parent(struct dentry *parent)
break;
 
shrink_dentry_list(&data.dispose);
+   cond_resched();
}
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
-- 
2.17.0.484.g0c8726318c-goog



smime.p7s
Description: S/MIME Cryptographic Signature


Re: [PATCH] fs/dcache.c: re-add cond_resched() in shrink_dcache_parent()

2018-04-16 Thread Khazhismel Kumykov
On Sun, Apr 15, 2018 at 3:34 PM, Al Viro  wrote:
> On Sun, Apr 15, 2018 at 10:54:55PM +0100, Al Viro wrote:
>> On Sun, Apr 15, 2018 at 09:40:54PM +0100, Al Viro wrote:
>>
>> > BTW, the current placement of cond_resched() looks bogus; suppose we
>> > have collected a lot of victims and ran into need_resched().  We leave
>> > d_walk() and call shrink_dentry_list().  At that point there's a lot
>> > of stuff on our shrink list and anybody else running into them will
>> > have to keep scanning.  Giving up the timeslice before we take care
>> > of any of those looks like a bad idea, to put it mildly, and that's
>> > precisely what will happen.
>> >
>> > What about doing that in the end of __dentry_kill() instead?  And to
>> > hell with both existing call sites - dput() one (before going to
>> > the parent) is obviously covered by that (dentry_kill() only returns
>> > non-NULL after having called __dentry_kill()) and in shrink_dentry_list()
>> > we'll get to it as soon as we go through all dentries that can be
>> > immediately kicked off the shrink list.  Which, AFAICS, improves the
>> > situation, now that shrink_lock_dentry() contains no trylock loops...
>> >
>> > Comments?
>>
>> What I mean is something like this (cumulative diff, it'll obviously need
>> to be carved up into 3--4 commits):
>
> ... and carved-up version is in vfs.git#work.dcache.  Could syzbot folks
> hit it with their reproducers?

To me it seems like shrink_dcache_parent could still spin without
scheduling, similarly to before - found > 0 since *someone* is shrinking,
but we have 0 entries to shrink - so we never call __dentry_kill or
schedule, and just spin.

And indeed, the syzbot reproducer against vfs.git#work.dcache hits a
softlockup in shrink_dcache_parent.

I'd think re-adding cond_resched to shrink_dcache_parent does make the
softlockup go away. It seems to fix the reproducer.
Although, did we want to terminate the loop in shrink_dcache_parent earlier?


smime.p7s
Description: S/MIME Cryptographic Signature


[PATCH] fs/fat: add cond_resched to fat_count_free_clusters

2018-10-10 Thread Khazhismel Kumykov
On non-preempt kernels this loop can take a long time (more than 50
ticks) to process all the entries.
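
For a rough sense of scale (illustrative numbers, not from a specific
report):

  2 TiB FAT32 volume with 32 KiB clusters -> ~2^26 (~64 million) FAT entries
  fat_count_free_clusters() walks every entry in one pass (e.g. on the first
  statfs() after mount, while the free count is not yet cached), so on a
  non-preempt kernel nothing else runs on that CPU until the walk finishes.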

Signed-off-by: Khazhismel Kumykov 
---
 fs/fat/fatent.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index defc2168de91..f58c0cacc531 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -682,6 +682,7 @@ int fat_count_free_clusters(struct super_block *sb)
if (ops->ent_get(&fatent) == FAT_ENT_FREE)
free++;
} while (fat_ent_next(sbi, &fatent));
+   cond_resched();
}
sbi->free_clusters = free;
sbi->free_clus_valid = 1;
-- 
2.19.0.605.g01d371f741-goog



Re: [Patch v2 1/2] libiscsi: Fix use-after-free race during iscsi_session_teardown

2017-09-29 Thread Khazhismel Kumykov
Noticed these don't seem to be in 4.14/scsi-queue

On Tue, Aug 29, 2017 at 6:45 PM, Martin K. Petersen
 wrote:
>
> Chris,
>
>> Looks good to me, fixes up the code given that the comment there about
>> calling iscsi_remove_session wasn't being followed.
>
> Applied these two to 4.14/scsi-queue.
>
> --
> Martin K. Petersen  Oracle Linux Engineering


smime.p7s
Description: S/MIME Cryptographic Signature


Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-12-18 Thread Khazhismel Kumykov
On Mon, Dec 18, 2017 at 1:01 PM, Vivek Goyal  wrote:
> On Mon, Dec 18, 2017 at 12:39:50PM -0800, Khazhismel Kumykov wrote:
>> On Mon, Dec 18, 2017 at 10:29 AM, Vivek Goyal  wrote:
>> > On Mon, Dec 18, 2017 at 10:16:02AM -0800, Khazhismel Kumykov wrote:
>> >> On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov  
>> >> wrote:
>> >> > On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li  wrote:
>> >> >> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote:
>> >> >>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li  wrote:
>> >> >>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote:
>> >> >>> >> Allows configuration additional bytes or ios before a throttle is
>> >> >>> >> triggered.
>> >> >>> >>
>> >> >>> >> This allows implementation of a bucket style rate-limit/throttle 
>> >> >>> >> on a
>> >> >>> >> block device. Previously, bursting to a device was limited to 
>> >> >>> >> allowance
>> >> >>> >> granted in a single throtl_slice (similar to a bucket with limit N 
>> >> >>> >> and
>> >> >>> >> refill rate N/slice).
>> >> >>> >>
>> >> >>> >> Additional parameters bytes/io_burst_conf defined for tg, which 
>> >> >>> >> define a
>> >> >>> >> number of bytes/ios that must be depleted before throttling 
>> >> >>> >> happens. A
>> >> >>> >> tg that does not deplete this allowance functions as though it has 
>> >> >>> >> no
>> >> >>> >> configured limits. tgs earn additional allowance at rate defined by
>> >> >>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, 
>> >> >>> >> throttling
>> >> >>> >> kicks in. If a tg is idle for a while, it will again have some 
>> >> >>> >> burst
>> >> >>> >> allowance before it gets throttled again.
>> >> >>> >>
>> >> >>> >> slice_end for a tg is extended until io_disp/byte_disp would fall 
>> >> >>> >> to 0,
>> >> >>> >> when all "used" burst allowance would be earned back. trim_slice 
>> >> >>> >> still
>> >> >>> >> does progress slice_start as before and decrements *_disp as 
>> >> >>> >> before, and
>> >> >>> >> tgs continue to get bytes/ios in throtl_slice intervals.
>> >> >>> >
>> >> >>> > Can you describe why we need this? It would be great if you can 
>> >> >>> > describe the
>> >> >>> > usage model and an example. Does this work for io.low/io.max or 
>> >> >>> > both?
>> >> >>> >
>> >> >>> > Thanks,
>> >> >>> > Shaohua
>> >> >>> >
>> >> >>>
>> >> >>> Use case that brought this up was configuring limits for a remote
>> >> >>> shared device. Bursting beyond io.max is desired but only for so much
>> >> >>> before the limit kicks in, afterwards with sustained usage throughput
>> >> >>> is capped. (This proactively avoids remote-side limits). In that case
>> >> >>> one would configure in a root container io.max + io.burst, and
>> >> >>> configure low/other limits on descendants sharing the resource on the
>> >> >>> same node.
>> >> >>>
>> >> >>> With this patch, so long as tg has not dispatched more than the burst,
>> >> >>> no limit is applied at all by that tg, including limit imposed by
>> >> >>> io.low in tg_iops_limit, etc.
>> >> >>
>> >> >> I'd appreciate if you can give more details about the 'why'. 
>> >> >> 'configuring
>> >> >> limits for a remote shared device' doesn't justify the change.
>> >> >
>> >> > This is to configure a bursty workload (and associated device) with
>> >> > known/allowed expected burst size, but to not allow full utilization
>> >> > of the device for extended peri

Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-12-18 Thread Khazhismel Kumykov
On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov  wrote:
> On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li  wrote:
>> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote:
>>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li  wrote:
>>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote:
>>> >> Allows configuration additional bytes or ios before a throttle is
>>> >> triggered.
>>> >>
>>> >> This allows implementation of a bucket style rate-limit/throttle on a
>>> >> block device. Previously, bursting to a device was limited to allowance
>>> >> granted in a single throtl_slice (similar to a bucket with limit N and
>>> >> refill rate N/slice).
>>> >>
>>> >> Additional parameters bytes/io_burst_conf defined for tg, which define a
>>> >> number of bytes/ios that must be depleted before throttling happens. A
>>> >> tg that does not deplete this allowance functions as though it has no
>>> >> configured limits. tgs earn additional allowance at rate defined by
>>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling
>>> >> kicks in. If a tg is idle for a while, it will again have some burst
>>> >> allowance before it gets throttled again.
>>> >>
>>> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 0,
>>> >> when all "used" burst allowance would be earned back. trim_slice still
>>> >> does progress slice_start as before and decrements *_disp as before, and
>>> >> tgs continue to get bytes/ios in throtl_slice intervals.
>>> >
>>> > Can you describe why we need this? It would be great if you can describe 
>>> > the
>>> > usage model and an example. Does this work for io.low/io.max or both?
>>> >
>>> > Thanks,
>>> > Shaohua
>>> >
>>>
>>> Use case that brought this up was configuring limits for a remote
>>> shared device. Bursting beyond io.max is desired but only for so much
>>> before the limit kicks in, afterwards with sustained usage throughput
>>> is capped. (This proactively avoids remote-side limits). In that case
>>> one would configure in a root container io.max + io.burst, and
>>> configure low/other limits on descendants sharing the resource on the
>>> same node.
>>>
>>> With this patch, so long as tg has not dispatched more than the burst,
>>> no limit is applied at all by that tg, including limit imposed by
>>> io.low in tg_iops_limit, etc.
>>
>> I'd appreciate if you can give more details about the 'why'. 'configuring
>> limits for a remote shared device' doesn't justify the change.
>
> This is to configure a bursty workload (and associated device) with
> known/allowed expected burst size, but to not allow full utilization
> of the device for extended periods of time for QoS. During idle or low
> use periods the burst allowance accrues, and then tasks can burst well
> beyond the configured throttle up to the limit, afterwards is
> throttled. A constant throttle speed isn't sufficient for this as you
> can only burst 1 slice worth, but a limit of sorts is desirable for
> preventing over utilization of the shared device. This type of limit
> is also slightly different than what i understand io.low does in local
> cases in that tg is only high priority/unthrottled if it is bursty,
> and is limited with constant usage
>
> Khazhy

Hi Shaohua,

Does this clarify the reason for this patch? Is this (or something
similar) a good fit for inclusion in blk-throttle?

Thanks,
Khazhy


smime.p7s
Description: S/MIME Cryptographic Signature


Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-12-18 Thread Khazhismel Kumykov
On Mon, Dec 18, 2017 at 10:29 AM, Vivek Goyal  wrote:
> On Mon, Dec 18, 2017 at 10:16:02AM -0800, Khazhismel Kumykov wrote:
>> On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov  
>> wrote:
>> > On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li  wrote:
>> >> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote:
>> >>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li  wrote:
>> >>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote:
>> >>> >> Allows configuration additional bytes or ios before a throttle is
>> >>> >> triggered.
>> >>> >>
>> >>> >> This allows implementation of a bucket style rate-limit/throttle on a
>> >>> >> block device. Previously, bursting to a device was limited to 
>> >>> >> allowance
>> >>> >> granted in a single throtl_slice (similar to a bucket with limit N and
>> >>> >> refill rate N/slice).
>> >>> >>
>> >>> >> Additional parameters bytes/io_burst_conf defined for tg, which 
>> >>> >> define a
>> >>> >> number of bytes/ios that must be depleted before throttling happens. A
>> >>> >> tg that does not deplete this allowance functions as though it has no
>> >>> >> configured limits. tgs earn additional allowance at rate defined by
>> >>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling
>> >>> >> kicks in. If a tg is idle for a while, it will again have some burst
>> >>> >> allowance before it gets throttled again.
>> >>> >>
>> >>> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 
>> >>> >> 0,
>> >>> >> when all "used" burst allowance would be earned back. trim_slice still
>> >>> >> does progress slice_start as before and decrements *_disp as before, 
>> >>> >> and
>> >>> >> tgs continue to get bytes/ios in throtl_slice intervals.
>> >>> >
>> >>> > Can you describe why we need this? It would be great if you can 
>> >>> > describe the
>> >>> > usage model and an example. Does this work for io.low/io.max or both?
>> >>> >
>> >>> > Thanks,
>> >>> > Shaohua
>> >>> >
>> >>>
>> >>> Use case that brought this up was configuring limits for a remote
>> >>> shared device. Bursting beyond io.max is desired but only for so much
>> >>> before the limit kicks in, afterwards with sustained usage throughput
>> >>> is capped. (This proactively avoids remote-side limits). In that case
>> >>> one would configure in a root container io.max + io.burst, and
>> >>> configure low/other limits on descendants sharing the resource on the
>> >>> same node.
>> >>>
>> >>> With this patch, so long as tg has not dispatched more than the burst,
>> >>> no limit is applied at all by that tg, including limit imposed by
>> >>> io.low in tg_iops_limit, etc.
>> >>
>> >> I'd appreciate if you can give more details about the 'why'. 'configuring
>> >> limits for a remote shared device' doesn't justify the change.
>> >
>> > This is to configure a bursty workload (and associated device) with
>> > known/allowed expected burst size, but to not allow full utilization
>> > of the device for extended periods of time for QoS. During idle or low
>> > use periods the burst allowance accrues, and then tasks can burst well
>> > beyond the configured throttle up to the limit, afterwards is
>> > throttled. A constant throttle speed isn't sufficient for this as you
>> > can only burst 1 slice worth, but a limit of sorts is desirable for
>> > preventing over utilization of the shared device. This type of limit
>> > is also slightly different than what i understand io.low does in local
>> > cases in that tg is only high priority/unthrottled if it is bursty,
>> > and is limited with constant usage
>> >
>> > Khazhy
>>
>> Hi Shaohua,
>>
>> Does this clarify the reason for this patch? Is this (or something
>> similar) a good fit for inclusion in blk-throttle?
>>
>
> So does this brust have to be per cgroup. I mean if thortl_slice was
> configurable, that will allow to control t

[RFC PATCH] blk-throttle: add burst allowance

2017-10-26 Thread Khazhismel Kumykov
Allows configuring additional bytes or ios before a throttle is
triggered. Slice end is extended to cover the expended allowance
recovery time.

Usage would be e.g. per device to allow users to take up to X bytes/ios
at full speed, but be limited to Y bps/iops with sustained usage.

Signed-off-by: Khazhismel Kumykov 
---
 block/Kconfig|  11 +++
 block/blk-throttle.c | 185 ---
 2 files changed, 186 insertions(+), 10 deletions(-)

diff --git a/block/Kconfig b/block/Kconfig
index 3ab42bbb06d5..16545caa7fc9 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -127,6 +127,17 @@ config BLK_DEV_THROTTLING_LOW
 
Note, this is an experimental interface and could be changed someday.
 
+config BLK_DEV_THROTTLING_BURST
+bool "Block throttling .burst allowance interface"
+depends on BLK_DEV_THROTTLING
+default n
+---help---
+Add .burst allowance for block throttling. Burst allowance allows for
+additional unthrottled usage, while still limiting speed for sustained
+usage.
+
+If in doubt, say N.
+
 config BLK_CMDLINE_PARSER
bool "Block device command line partition parser"
default n
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index d80c3f0144c5..e09ec11e9c5f 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -156,6 +156,11 @@ struct throtl_grp {
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
 
+#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
+   uint64_t bytes_burst_conf[2];
+   unsigned int io_burst_conf[2];
+#endif
+
unsigned long last_low_overflow_time[2];
 
uint64_t last_bytes_disp[2];
@@ -506,6 +511,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, 
int node)
tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
+#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
+   tg->bytes_burst_conf[READ] = 0;
+   tg->bytes_burst_conf[WRITE] = 0;
+   tg->io_burst_conf[READ] = 0;
+   tg->io_burst_conf[WRITE] = 0;
+#endif
/* LIMIT_LOW will have default value 0 */
 
tg->latency_target = DFL_LATENCY_TARGET;
@@ -799,6 +810,26 @@ static inline void throtl_start_new_slice(struct 
throtl_grp *tg, bool rw)
   tg->slice_end[rw], jiffies);
 }
 
+/*
+ * When current slice should end.
+ *
+ * With CONFIG_BLK_DEV_THROTTLING_BURST, we will wait longer than min_wait
+ * for slice to recover used burst allowance. (*_disp -> 0). Setting slice_end
+ * before this would result in tg receiving additional burst allowance.
+ */
+static inline unsigned long throtl_slice_wait(struct throtl_grp *tg, bool rw,
+   unsigned long min_wait)
+{
+   unsigned long bytes_wait = 0, io_wait = 0;
+#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
+   if (tg->bytes_burst_conf[rw])
+   bytes_wait = (tg->bytes_disp[rw] * HZ) / tg_bps_limit(tg, rw);
+   if (tg->io_burst_conf[rw])
+   io_wait = (tg->io_disp[rw] * HZ) / tg_iops_limit(tg, rw);
+#endif
+   return jiffies + max(min_wait, max(bytes_wait, io_wait));
+}
+
 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
 {
@@ -848,7 +879,8 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, 
bool rw)
 * is bad because it does not allow new slice to start.
 */
 
-   throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
+   throtl_set_slice_end(tg, rw,
+throtl_slice_wait(tg, rw, tg->td->throtl_slice));
 
time_elapsed = jiffies - tg->slice_start[rw];
 
@@ -888,7 +920,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, 
struct bio *bio,
  unsigned long *wait)
 {
bool rw = bio_data_dir(bio);
-   unsigned int io_allowed;
+   unsigned int io_allowed, io_disp;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
u64 tmp;
 
@@ -907,6 +939,17 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, 
struct bio *bio,
 * have been trimmed.
 */
 
+   io_disp = tg->io_disp[rw];
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
+   if (tg->io_disp[rw] < tg->io_burst_conf[rw]) {
+   if (wait)
+   *wait = 0;
+   return true;
+   }
+   io_disp -= tg->io_burst_conf[rw];
+#endif
+
tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
do_div(tmp, HZ);
 
@@ -915,14 +958,14 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, 
struct bio *bio,
else
io_allowed = tmp;
 
-   if (tg->io_disp[rw] + 1 <= io_allowed) {
+   if (io_disp + 1 <= io_allowed) {
if (wai

[RFC PATCH] blk-throttle: add burst allowance.

2017-11-14 Thread Khazhismel Kumykov
Allows configuring additional bytes or ios before a throttle is
triggered.

This allows implementation of a bucket style rate-limit/throttle on a
block device. Previously, bursting to a device was limited to allowance
granted in a single throtl_slice (similar to a bucket with limit N and
refill rate N/slice).

Additional parameters bytes/io_burst_conf defined for tg, which define a
number of bytes/ios that must be depleted before throttling happens. A
tg that does not deplete this allowance functions as though it has no
configured limits. tgs earn additional allowance at rate defined by
bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling
kicks in. If a tg is idle for a while, it will again have some burst
allowance before it gets throttled again.

slice_end for a tg is extended until io_disp/byte_disp would fall to 0,
when all "used" burst allowance would be earned back. trim_slice still
does progress slice_start as before and decrements *_disp as before, and
tgs continue to get bytes/ios in throtl_slice intervals.

Signed-off-by: Khazhismel Kumykov 
---
 block/Kconfig|  11 +++
 block/blk-throttle.c | 192 +++
 2 files changed, 189 insertions(+), 14 deletions(-)

diff --git a/block/Kconfig b/block/Kconfig
index 28ec55752b68..fbd05b419f93 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -128,6 +128,17 @@ config BLK_DEV_THROTTLING_LOW
 
Note, this is an experimental interface and could be changed someday.
 
+config BLK_DEV_THROTTLING_BURST
+bool "Block throttling .burst allowance interface"
+depends on BLK_DEV_THROTTLING
+default n
+---help---
+Add .burst allowance for block throttling. Burst allowance allows for
+additional unthrottled usage, while still limiting speed for sustained
+usage.
+
+If in doubt, say N.
+
 config BLK_CMDLINE_PARSER
bool "Block device command line partition parser"
default n
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 96ad32623427..27c084312772 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -157,6 +157,11 @@ struct throtl_grp {
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
 
+#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
+   uint64_t bytes_burst_conf[2];
+   unsigned int io_burst_conf[2];
+#endif
+
unsigned long last_low_overflow_time[2];
 
uint64_t last_bytes_disp[2];
@@ -507,6 +512,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, 
int node)
tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
+#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
+   tg->bytes_burst_conf[READ] = 0;
+   tg->bytes_burst_conf[WRITE] = 0;
+   tg->io_burst_conf[READ] = 0;
+   tg->io_burst_conf[WRITE] = 0;
+#endif
/* LIMIT_LOW will have default value 0 */
 
tg->latency_target = DFL_LATENCY_TARGET;
@@ -800,6 +811,26 @@ static inline void throtl_start_new_slice(struct 
throtl_grp *tg, bool rw)
   tg->slice_end[rw], jiffies);
 }
 
+/*
+ * When current slice should end.
+ *
+ * With CONFIG_BLK_DEV_THROTTLING_BURST, we will wait longer than min_wait
+ * for slice to recover used burst allowance. (*_disp -> 0). Setting slice_end
+ * before this would result in tg receiving additional burst allowance.
+ */
+static inline unsigned long throtl_slice_wait(struct throtl_grp *tg, bool rw,
+   unsigned long min_wait)
+{
+   unsigned long bytes_wait = 0, io_wait = 0;
+#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
+   if (tg->bytes_burst_conf[rw])
+   bytes_wait = (tg->bytes_disp[rw] * HZ) / tg_bps_limit(tg, rw);
+   if (tg->io_burst_conf[rw])
+   io_wait = (tg->io_disp[rw] * HZ) / tg_iops_limit(tg, rw);
+#endif
+   return max(min_wait, max(bytes_wait, io_wait));
+}
+
 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
 {
@@ -849,7 +880,8 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, 
bool rw)
 * is bad because it does not allow new slice to start.
 */
 
-   throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
+   throtl_set_slice_end(tg, rw,
+   jiffies + throtl_slice_wait(tg, rw, tg->td->throtl_slice));
 
time_elapsed = jiffies - tg->slice_start[rw];
 
@@ -889,7 +921,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, 
struct bio *bio,
  unsigned long *wait)
 {
bool rw = bio_data_dir(bio);
-   unsigned int io_allowed;
+   unsigned int io_allowed, io_disp;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
u64 tmp;
 
@@ -908,6 +940,17 @@ stati

Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-11-14 Thread Khazhismel Kumykov
(Botched the to line, sorry)


smime.p7s
Description: S/MIME Cryptographic Signature


[PATCH] generic: Add nocheck shutdown stress test

2017-09-21 Thread Khazhismel Kumykov
Most shutdown tests only run on filesystems with metadata journaling, so
we lose coverage. Add a shutdown stress test that doesn't check for
consistency, and so does not require journaling.
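
Not part of the patch itself, but for reference: with the usual xfstests
setup (SCRATCH_DEV and SCRATCH_MNT configured in local.config), the test
can be run on its own with:

  ./check generic/999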

Signed-off-by: Khazhismel Kumykov 
---
 tests/generic/999 | 84 +++
 tests/generic/999.out |  2 ++
 tests/generic/group   |  1 +
 3 files changed, 87 insertions(+)
 create mode 100755 tests/generic/999
 create mode 100644 tests/generic/999.out

diff --git a/tests/generic/999 b/tests/generic/999
new file mode 100755
index ..71b9aa4c
--- /dev/null
+++ b/tests/generic/999
@@ -0,0 +1,84 @@
+#!/bin/bash
+# FS QA Test No. 999
+#
+# Shutdown stress test - exercise shutdown codepath with fsstress,
+# make sure we don't BUG/WARN. Coverage for all fs with shutdown.
+#
+#---
+# Copyright (c) 2013 Red Hat, Inc.  All Rights Reserved.
+# Copyright (c) 2017 Google, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+_cleanup()
+{
+   cd /
+   _scratch_unmount 2>/dev/null
+   rm -f $tmp.*
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+
+_require_scratch_nocheck
+_require_scratch_shutdown
+_require_command "$KILLALL_PROG" killall
+
+rm -f $seqres.full
+
+_scratch_mkfs > $seqres.full 2>&1
+_scratch_mount
+
+SLEEP_TIME=$((10 * $TIME_FACTOR))
+PROCS=$((4 * LOAD_FACTOR))
+
+load_dir=$SCRATCH_MNT/test
+
+$XFS_FSR_PROG -v $load_dir >> $seqres.full 2>&1
+$FSSTRESS_PROG $FSSTRESS_AVOID -n1000 -p $PROCS -d $load_dir >> 
$seqres.full 2>&1 &
+sleep $SLEEP_TIME
+sync
+
+# now shutdown and unmount
+sleep 5
+$here/src/godown $load_dir
+$KILLALL_PROG -q $FSSTRESS_PROG
+wait
+
+# for some reason fsstress processes manage to live on beyond the wait?
+sleep 5
+
+_scratch_unmount
+
+echo "No output is good. Failures are loud."
+
+status=0
+exit
diff --git a/tests/generic/999.out b/tests/generic/999.out
new file mode 100644
index ..68a51fe7
--- /dev/null
+++ b/tests/generic/999.out
@@ -0,0 +1,2 @@
+QA output created by 999
+No output is good. Failures are loud.
diff --git a/tests/generic/group b/tests/generic/group
index f922b496..891386ac 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -463,3 +463,4 @@
 458 auto quick clone
 459 auto dangerous
 460 auto quick rw
+999 auto shutdown stress
-- 
2.14.1.821.g8fa685d3b7-goog



smime.p7s
Description: S/MIME Cryptographic Signature


[PATCH v2] generic: Add nocheck shutdown stress test

2017-09-22 Thread Khazhismel Kumykov
Most shutdown tests only run on filesystems with metadata journaling, so
we lose coverage. Add a shutdown stress test that doesn't check for
consistency, and so does not require journaling. This is a modified
version of generic/051, with some extras trimmed and the fs checking
removed.

Signed-off-by: Khazhismel Kumykov 
---
v2: drop defrag, use "silence is golden"

 tests/generic/999 | 83 +++
 tests/generic/999.out |  2 ++
 tests/generic/group   |  1 +
 3 files changed, 86 insertions(+)
 create mode 100755 tests/generic/999
 create mode 100644 tests/generic/999.out

diff --git a/tests/generic/999 b/tests/generic/999
new file mode 100755
index ..7b612686
--- /dev/null
+++ b/tests/generic/999
@@ -0,0 +1,83 @@
+#!/bin/bash
+# FS QA Test No. 999
+#
+# Shutdown stress test - exercise shutdown codepath with fsstress,
+# make sure we don't BUG/WARN. Coverage for all fs with shutdown.
+#
+#---
+# Copyright (c) 2013 Red Hat, Inc.  All Rights Reserved.
+# Copyright (c) 2017 Google, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+_cleanup()
+{
+   cd /
+   _scratch_unmount 2>/dev/null
+   rm -f $tmp.*
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+
+_require_scratch_nocheck
+_require_scratch_shutdown
+_require_command "$KILLALL_PROG" killall
+
+rm -f $seqres.full
+
+_scratch_mkfs > $seqres.full 2>&1
+_scratch_mount
+
+SLEEP_TIME=$((10 * $TIME_FACTOR))
+PROCS=$((4 * LOAD_FACTOR))
+
+load_dir=$SCRATCH_MNT/test
+
+$FSSTRESS_PROG $FSSTRESS_AVOID -n1000 -p $PROCS -d $load_dir >> 
$seqres.full 2>&1 &
+sleep $SLEEP_TIME
+sync
+
+# now shutdown and unmount
+sleep 5
+$here/src/godown $load_dir
+$KILLALL_PROG -q $FSSTRESS_PROG
+wait
+
+# for some reason fsstress processes manage to live on beyond the wait?
+sleep 5
+
+_scratch_unmount
+
+echo "Silence is golden"
+
+status=0
+exit
diff --git a/tests/generic/999.out b/tests/generic/999.out
new file mode 100644
index ..3b276ca8
--- /dev/null
+++ b/tests/generic/999.out
@@ -0,0 +1,2 @@
+QA output created by 999
+Silence is golden
diff --git a/tests/generic/group b/tests/generic/group
index f922b496..891386ac 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -463,3 +463,4 @@
 458 auto quick clone
 459 auto dangerous
 460 auto quick rw
+999 auto shutdown stress
-- 
2.14.1.821.g8fa685d3b7-goog



smime.p7s
Description: S/MIME Cryptographic Signature


Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-11-20 Thread Khazhismel Kumykov
On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li  wrote:
> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote:
>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li  wrote:
>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote:
>> >> Allows configuration additional bytes or ios before a throttle is
>> >> triggered.
>> >>
>> >> This allows implementation of a bucket style rate-limit/throttle on a
>> >> block device. Previously, bursting to a device was limited to allowance
>> >> granted in a single throtl_slice (similar to a bucket with limit N and
>> >> refill rate N/slice).
>> >>
>> >> Additional parameters bytes/io_burst_conf defined for tg, which define a
>> >> number of bytes/ios that must be depleted before throttling happens. A
>> >> tg that does not deplete this allowance functions as though it has no
>> >> configured limits. tgs earn additional allowance at rate defined by
>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling
>> >> kicks in. If a tg is idle for a while, it will again have some burst
>> >> allowance before it gets throttled again.
>> >>
>> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 0,
>> >> when all "used" burst allowance would be earned back. trim_slice still
>> >> does progress slice_start as before and decrements *_disp as before, and
>> >> tgs continue to get bytes/ios in throtl_slice intervals.
>> >
>> > Can you describe why we need this? It would be great if you can describe 
>> > the
>> > usage model and an example. Does this work for io.low/io.max or both?
>> >
>> > Thanks,
>> > Shaohua
>> >
>>
>> Use case that brought this up was configuring limits for a remote
>> shared device. Bursting beyond io.max is desired but only for so much
>> before the limit kicks in, afterwards with sustained usage throughput
>> is capped. (This proactively avoids remote-side limits). In that case
>> one would configure in a root container io.max + io.burst, and
>> configure low/other limits on descendants sharing the resource on the
>> same node.
>>
>> With this patch, so long as tg has not dispatched more than the burst,
>> no limit is applied at all by that tg, including limit imposed by
>> io.low in tg_iops_limit, etc.
>
> I'd appreciate if you can give more details about the 'why'. 'configuring
> limits for a remote shared device' doesn't justify the change.

This is to configure a bursty workload (and associated device) with
a known/allowed expected burst size, but to not allow full utilization
of the device for extended periods of time, for QoS. During idle or low
use periods the burst allowance accrues, and then tasks can burst well
beyond the configured throttle up to the limit; afterwards they are
throttled. A constant throttle speed isn't sufficient for this, as you
can only burst one slice's worth, but a limit of sorts is desirable for
preventing over-utilization of the shared device. This type of limit
is also slightly different from what I understand io.low does in local
cases, in that the tg is only high priority/unthrottled if it is bursty,
and is limited with constant usage.

Khazhy


smime.p7s
Description: S/MIME Cryptographic Signature


Re: [Patch v2 1/2] libiscsi: Fix use-after-free race during iscsi_session_teardown

2017-08-24 Thread Khazhismel Kumykov
On Thu, Jul 13, 2017 at 9:11 AM, Khazhismel Kumykov  wrote:
Ping in case this was missed


smime.p7s
Description: S/MIME Cryptographic Signature


[PATCH] fs/dcache.c: re-add cond_resched() in shrink_dcache_parent()

2018-04-13 Thread Khazhismel Kumykov
shrink_dcache_parent may spin waiting for a parallel shrink_dentry_list.
In this case we may have 0 dentries to dispose, so we will never
schedule out while waiting for the parallel shrink_dentry_list to
complete.

Tested that this fixes syzbot reports of stalls in shrink_dcache_parent()

Fixes: 32785c0539b7 ("fs/dcache.c: add cond_resched() in shrink_dentry_list()")
Reported-by: syzbot+ae80b790eb412884c...@syzkaller.appspotmail.com

Cc: Nikolay Borisov 
Cc: Andrew Morton 
Cc: David Rientjes 
Cc: Alexander Viro 
Cc: Goldwyn Rodrigues 
Cc: Jeff Mahoney 
Cc: Davidlohr Bueso 
Cc: Linus Torvalds 
Signed-off-by: Khazhismel Kumykov 
---
 fs/dcache.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index 591b34500e41..3507badeb60a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1489,6 +1489,7 @@ void shrink_dcache_parent(struct dentry *parent)
break;
 
shrink_dentry_list(&data.dispose);
+   cond_resched();
}
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
-- 
2.17.0.484.g0c8726318c-goog





Re: [PATCH] fs/dcache.c: re-add cond_resched() in shrink_dcache_parent()

2018-04-16 Thread Khazhismel Kumykov
On Sun, Apr 15, 2018 at 3:34 PM, Al Viro  wrote:
> On Sun, Apr 15, 2018 at 10:54:55PM +0100, Al Viro wrote:
>> On Sun, Apr 15, 2018 at 09:40:54PM +0100, Al Viro wrote:
>>
>> > BTW, the current placement of cond_resched() looks bogus; suppose we
>> > have collected a lot of victims and ran into need_resched().  We leave
>> > d_walk() and call shrink_dentry_list().  At that point there's a lot
>> > of stuff on our shrink list and anybody else running into them will
>> > have to keep scanning.  Giving up the timeslice before we take care
>> > of any of those looks like a bad idea, to put it mildly, and that's
>> > precisely what will happen.
>> >
>> > What about doing that in the end of __dentry_kill() instead?  And to
>> > hell with both existing call sites - dput() one (before going to
>> > the parent) is obviously covered by that (dentry_kill() only returns
>> > non-NULL after having called __dentry_kill()) and in shrink_dentry_list()
>> > we'll get to it as soon as we go through all dentries that can be
>> > immediately kicked off the shrink list.  Which, AFAICS, improves the
>> > situation, now that shrink_lock_dentry() contains no trylock loops...
>> >
>> > Comments?
>>
>> What I mean is something like this (cumulative diff, it'll obviously need
>> to be carved up into 3--4 commits):
>
> ... and carved-up version is in vfs.git#work.dcache.  Could syzbot folks
> hit it with their reproducers?

To me it seems like shrink_dcache_parent() could still spin without
scheduling, much like before: found > 0 since *someone* is shrinking,
but we have 0 entries to dispose ourselves, so we never call
__dentry_kill() or schedule, and just spin.

And indeed, the syzbot reproducer @vfs#work.dcache hits a softlockup
in shrink_dcache_parent.

I'd think re-adding cond_resched() to shrink_dcache_parent() would make
the softlockup go away; it does seem to fix the reproducer.
Although, did we want to terminate the loop in shrink_dcache_parent() earlier?
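
Roughly, the loop in question has this shape (paraphrasing the
shrink_dcache_parent() hunk from the patch at the top of this thread;
not the actual work.dcache code):

	for (;;) {
		struct select_data data;

		INIT_LIST_HEAD(&data.dispose);
		data.start = parent;
		data.found = 0;

		d_walk(parent, &data, select_collect, NULL);
		if (!data.found)
			break;

		/*
		 * If another task already owns all the victims, found > 0
		 * but dispose stays empty: nothing gets killed and, without
		 * the re-added cond_resched(), nothing ever schedules.
		 */
		shrink_dentry_list(&data.dispose);
		cond_resched();
	}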




[PATCH] block/compat_ioctl: fix range check in BLKGETSIZE

2018-04-06 Thread Khazhismel Kumykov
The kernel's unsigned long and compat_ulong_t may not be the same
width. Use the compat type directly in the check to eliminate the
mismatch.

Previously, for large disks in 32-bit (compat) mode, this resulted in
silent truncation rather than -EFBIG.
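
To illustrate the width mismatch, a standalone userspace sketch,
assuming a 64-bit build (mirroring a 64-bit kernel servicing the 32-bit
ioctl); the compat_ulong_t typedef here is just a stand-in for the
32-bit caller's unsigned long:

#include <stdint.h>
#include <stdio.h>

typedef uint32_t compat_ulong_t;    /* 32-bit userspace unsigned long */

int main(void)
{
	unsigned long long size = 4ULL << 40;     /* 4 TiB disk */
	unsigned long long sectors = size >> 9;   /* 0x200000000 */

	/* old check on a 64-bit build: ~0UL is 64 bits wide, never trips */
	printf("old check fires: %d\n", sectors > ~0UL);               /* 0 */
	/* new check: compare against what the compat caller can hold */
	printf("new check fires: %d\n", sectors > ~(compat_ulong_t)0); /* 1 */
	/* what the 32-bit caller would have received before the fix */
	printf("truncated: %#x\n", (compat_ulong_t)sectors);           /* 0 */
	return 0;
}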

Signed-off-by: Khazhismel Kumykov 
---
 block/compat_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 6ca015f92766..3a2c77f07da8 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -388,7 +388,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, 
unsigned long arg)
return 0;
case BLKGETSIZE:
size = i_size_read(bdev->bd_inode);
-   if ((size >> 9) > ~0UL)
+   if ((size >> 9) > ~((compat_ulong_t)0UL))
return -EFBIG;
return compat_put_ulong(arg, size >> 9);
 
-- 
2.17.0.484.g0c8726318c-goog





[PATCH] libiscsi: Fix use-after-free race during iscsi_session_teardown

2017-07-12 Thread Khazhismel Kumykov
Session attributes exposed through sysfs were freed before the device
was destroyed, resulting in a potential use-after-free. Free these
attributes after removing the device.

Signed-off-by: Khazhismel Kumykov 
---
 drivers/scsi/libiscsi.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 42381adf0769..f9199bebaec7 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -2851,9 +2851,6 @@ EXPORT_SYMBOL_GPL(iscsi_session_setup);
 /**
  * iscsi_session_teardown - destroy session, host, and cls_session
  * @cls_session: iscsi session
- *
- * The driver must have called iscsi_remove_session before
- * calling this.
  */
 void iscsi_session_teardown(struct iscsi_cls_session *cls_session)
 {
@@ -2863,6 +2860,8 @@ void iscsi_session_teardown(struct iscsi_cls_session 
*cls_session)
 
	iscsi_pool_free(&session->cmdpool);
 
+   iscsi_remove_session(session);
+
kfree(session->password);
kfree(session->password_in);
kfree(session->username);
@@ -2877,7 +2876,8 @@ void iscsi_session_teardown(struct iscsi_cls_session 
*cls_session)
kfree(session->portal_type);
kfree(session->discovery_parent_type);
 
-   iscsi_destroy_session(cls_session);
+   iscsi_free_session(cls_session);
+
iscsi_host_dec_session_cnt(shost);
module_put(owner);
 }
-- 
2.13.2.932.g7449e964c-goog





[PATCH] libiscsi: Remove iscsi_destroy_session

2017-07-12 Thread Khazhismel Kumykov
iscsi_session_teardown was the only user of this function. Function
currently is just short for iscsi_remove_session + iscsi_free_session.

Signed-off-by: Khazhismel Kumykov 
---
 with "libiscsi: Fix use after free race during iscsi_session_teardown" 
removing the
 last user.

 drivers/scsi/scsi_transport_iscsi.c | 16 
 include/scsi/scsi_transport_iscsi.h |  1 -
 2 files changed, 17 deletions(-)

diff --git a/drivers/scsi/scsi_transport_iscsi.c 
b/drivers/scsi/scsi_transport_iscsi.c
index a424eaeafeb0..924ac408d8a9 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2210,22 +2210,6 @@ void iscsi_free_session(struct iscsi_cls_session 
*session)
 }
 EXPORT_SYMBOL_GPL(iscsi_free_session);
 
-/**
- * iscsi_destroy_session - destroy iscsi session
- * @session: iscsi_session
- *
- * Can be called by a LLD or iscsi_transport. There must not be
- * any running connections.
- */
-int iscsi_destroy_session(struct iscsi_cls_session *session)
-{
-   iscsi_remove_session(session);
-   ISCSI_DBG_TRANS_SESSION(session, "Completing session destruction\n");
-   iscsi_free_session(session);
-   return 0;
-}
-EXPORT_SYMBOL_GPL(iscsi_destroy_session);
-
 /**
  * iscsi_create_conn - create iscsi class connection
  * @session: iscsi cls session
diff --git a/include/scsi/scsi_transport_iscsi.h 
b/include/scsi/scsi_transport_iscsi.h
index 6183d20a01fb..b266d2a3bcb1 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -434,7 +434,6 @@ extern struct iscsi_cls_session 
*iscsi_create_session(struct Scsi_Host *shost,
unsigned int target_id);
 extern void iscsi_remove_session(struct iscsi_cls_session *session);
 extern void iscsi_free_session(struct iscsi_cls_session *session);
-extern int iscsi_destroy_session(struct iscsi_cls_session *session);
 extern struct iscsi_cls_conn *iscsi_create_conn(struct iscsi_cls_session *sess,
int dd_size, uint32_t cid);
 extern int iscsi_destroy_conn(struct iscsi_cls_conn *conn);
-- 
2.13.2.932.g7449e964c-goog







Re: Hang/soft lockup in d_invalidate with simultaneous calls

2017-06-02 Thread Khazhismel Kumykov
On Fri, Jun 2, 2017 at 6:12 PM, Al Viro  wrote:
> Part of that could be relieved if we turned check_and_drop() into
> static void check_and_drop(void *_data)
> {
> struct detach_data *data = _data;
>
> if (!data->mountpoint && list_empty(&data->select.dispose))
> __d_drop(data->select.start);
> }

So with this change, d_invalidate will drop the starting dentry before
all its children are dropped? Would it make sense to just drop it
right off the bat, and let one task handle shrinking all of its
children?
>
> It doesn't solve the entire problem, though - we still could get too
> many threads into that thing before any of them gets to that __d_drop().

Yes, the change isn't sufficient for my repro, as many threads get to
the loop before the drop, although new tasks don't get stuck in the
same loop after the dentry is dropped.

> I wonder if we should try and collect more victims; that could lead
> to contention of its own, but...

From my understanding, the contention is worst when one task is
shrinking everything while several other tasks busily loop walking the
dentry until everything is done. In that case, each task busily looping
in d_walk holds the d_lock for a dentry while walking over all its
children, and soon after finishing the d_walk it queues up to walk
again, while shrink_dentry_list releases and re-grabs the lock for each
entry.

If we limit the number of items we pass to shrink_dentry_list at one
time, things actually look quite a bit better. E.g., in testing,
arbitrarily limiting select.dispose to 1000 elements does fix my
repro.

e.g.
diff --git a/fs/dcache.c b/fs/dcache.c
index 22af360ceca3..3892e0eb7ec2 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1367,6 +1367,7 @@ struct select_data {
struct dentry *start;
struct list_head dispose;
int found;
+   int actually_found;
 };

 static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
@@ -1388,6 +1389,7 @@ static enum d_walk_ret select_collect(void
*_data, struct dentry *dentry)
if (!dentry->d_lockref.count) {
			d_shrink_add(dentry, &data->dispose);
data->found++;
+   data->actually_found++;
}
}
/*
@@ -1397,6 +1399,9 @@ static enum d_walk_ret select_collect(void
*_data, struct dentry *dentry)
 */
	if (!list_empty(&data->dispose))
ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
+
+   if (data->actually_found > 1000)
+   ret = D_WALK_QUIT;
 out:
return ret;
 }
@@ -1415,6 +1420,7 @@ void shrink_dcache_parent(struct dentry *parent)
	INIT_LIST_HEAD(&data.dispose);
data.start = parent;
data.found = 0;
+   data.actually_found = 0;

	d_walk(parent, &data, select_collect, NULL);
if (!data.found)
@@ -1536,6 +1542,7 @@ void d_invalidate(struct dentry *dentry)
	INIT_LIST_HEAD(&data.select.dispose);
data.select.start = dentry;
data.select.found = 0;
+   data.select.actually_found = 0;

	d_walk(dentry, &data, detach_and_collect, check_and_drop);




Re: Hang/soft lockup in d_invalidate with simultaneous calls

2017-06-03 Thread Khazhismel Kumykov
On Fri, Jun 2, 2017 at 11:20 PM, Al Viro  wrote:
> The thing is, unlike shrink_dcache_parent() we *can* bugger off as
> soon as we'd found no victims, nothing mounted and dentry itself
> is unhashed.  We can't do anything in select_collect() (we would've
> broken shrink_dcache_parent() that way), but we can do unhashing
> in check_and_drop() in "really nothing to do" case and we can return
> from d_invalidate() after that.  So how about this:
That does the trick.




Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-12-06 Thread Khazhismel Kumykov
On Mon, Nov 20, 2017 at 8:36 PM, Khazhismel Kumykov  wrote:
> On Fri, Nov 17, 2017 at 11:26 AM, Shaohua Li  wrote:
>> On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote:
>>> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li  wrote:
>>> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote:
>>> >> Allows configuration additional bytes or ios before a throttle is
>>> >> triggered.
>>> >>
>>> >> This allows implementation of a bucket style rate-limit/throttle on a
>>> >> block device. Previously, bursting to a device was limited to allowance
>>> >> granted in a single throtl_slice (similar to a bucket with limit N and
>>> >> refill rate N/slice).
>>> >>
>>> >> Additional parameters bytes/io_burst_conf defined for tg, which define a
>>> >> number of bytes/ios that must be depleted before throttling happens. A
>>> >> tg that does not deplete this allowance functions as though it has no
>>> >> configured limits. tgs earn additional allowance at rate defined by
>>> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling
>>> >> kicks in. If a tg is idle for a while, it will again have some burst
>>> >> allowance before it gets throttled again.
>>> >>
>>> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 0,
>>> >> when all "used" burst allowance would be earned back. trim_slice still
>>> >> does progress slice_start as before and decrements *_disp as before, and
>>> >> tgs continue to get bytes/ios in throtl_slice intervals.
>>> >
>>> > Can you describe why we need this? It would be great if you can describe 
>>> > the
>>> > usage model and an example. Does this work for io.low/io.max or both?
>>> >
>>> > Thanks,
>>> > Shaohua
>>> >
>>>
>>> Use case that brought this up was configuring limits for a remote
>>> shared device. Bursting beyond io.max is desired but only for so much
>>> before the limit kicks in, afterwards with sustained usage throughput
>>> is capped. (This proactively avoids remote-side limits). In that case
>>> one would configure in a root container io.max + io.burst, and
>>> configure low/other limits on descendants sharing the resource on the
>>> same node.
>>>
>>> With this patch, so long as tg has not dispatched more than the burst,
>>> no limit is applied at all by that tg, including limit imposed by
>>> io.low in tg_iops_limit, etc.
>>
>> I'd appreciate if you can give more details about the 'why'. 'configuring
>> limits for a remote shared device' doesn't justify the change.
>
> This is to configure a bursty workload (and associated device) with
> known/allowed expected burst size, but to not allow full utilization
> of the device for extended periods of time for QoS. During idle or low
> use periods the burst allowance accrues, and then tasks can burst well
> beyond the configured throttle up to the limit, afterwards is
> throttled. A constant throttle speed isn't sufficient for this as you
> can only burst 1 slice worth, but a limit of sorts is desirable for
> preventing over utilization of the shared device. This type of limit
> is also slightly different than what i understand io.low does in local
> cases in that tg is only high priority/unthrottled if it is bursty,
> and is limited with constant usage
>
> Khazhy
Ping this time without html (oops)




Re: [PATCH v2] block: grant IOPRIO_CLASS_RT to CAP_SYS_NICE

2020-09-01 Thread Khazhismel Kumykov
On Sat, Aug 29, 2020 at 6:00 PM Bart Van Assche  wrote:
>
> From https://www.kernel.org/doc/man-pages/linux-api-ml.html:
> "all Linux kernel patches that change userspace interfaces should be CCed
> to linux-...@vger.kernel.org"
>
> So I have added the linux-api mailing list to the Cc-list. Anyway:
Thanks, sorry for missing that!
>
> Reviewed-by: Bart Van Assche 

Jens, does this change look good?

khazhy




Re: [PATCH v2 1/3] fuse: on 64-bit store time in d_fsdata directly

2019-09-03 Thread Khazhismel Kumykov
On Thu, Aug 22, 2019 at 1:00 PM Khazhismel Kumykov  wrote:
>
> Implements the optimization noted in f75fdf22b0a8 ("fuse: don't use
> ->d_time"), as the additional memory can be significant. (In particular,
> on SLAB configurations this 8-byte alloc becomes 32 bytes). Per-dentry,
> this can consume significant memory.
>
> Reviewed-by: Shakeel Butt 
> Signed-off-by: Khazhismel Kumykov 
> ---
>  fs/fuse/dir.c | 19 +++
>  1 file changed, 19 insertions(+)
>

ping?




[PATCH] block: grant IOPRIO_CLASS_RT to CAP_SYS_NICE

2020-08-24 Thread Khazhismel Kumykov
CAP_SYS_ADMIN is too broad, and ionice fits into CAP_SYS_NICE's grouping.

Retain CAP_SYS_ADMIN permission for backwards compatibility.

Signed-off-by: Khazhismel Kumykov 
---
 block/ioprio.c  | 2 +-
 include/uapi/linux/capability.h | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/ioprio.c b/block/ioprio.c
index 77bcab11dce5..4572456430f9 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -69,7 +69,7 @@ int ioprio_check_cap(int ioprio)
 
switch (class) {
case IOPRIO_CLASS_RT:
-   if (!capable(CAP_SYS_ADMIN))
+   if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_NICE))
return -EPERM;
/* fall through */
/* rt has prio field too */
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 395dd0df8d08..c6ca33034147 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -288,6 +288,8 @@ struct vfs_ns_cap_data {
processes and setting the scheduling algorithm used by another
process. */
 /* Allow setting cpu affinity on other processes */
+/* Allow setting realtime ioprio class */
+/* Allow setting ioprio class on other processes */
 
 #define CAP_SYS_NICE 23
 
-- 
2.28.0.297.g1956fa8f8d-goog



Re: IOPRIO_CLASS_RT without CAP_SYS_ADMIN?

2020-08-24 Thread Khazhismel Kumykov
On Sat, Aug 22, 2020 at 7:14 PM Jens Axboe  wrote:
>
> On 8/22/20 7:58 PM, Bart Van Assche wrote:
> > On 2020-08-20 17:35, Khazhismel Kumykov wrote:
> >> It'd be nice to allow a process to send RT requests without granting
> >> it the wide capabilities of CAP_SYS_ADMIN, and we already have a
> >> capability which seems to almost fit this priority idea -
> >> CAP_SYS_NICE? Would this fit there?
> >>
> >> Being capable of setting IO priorities on per request or per thread
> >> basis (be it async submission or w/ thread ioprio_set) is useful
> >> especially when the userspace has its own prioritization/scheduling
> >> before hitting the kernel, allowing us to signal to the kernel how to
> >> order certain IOs, and it'd be nice to separate this from ADMIN for
> >> non-root processes, in a way that's less error prone than e.g. having
> >> a trusted launcher ionice the process and then drop priorities for
> >> everything but prio requests.
> >
> > Hi Khazhy,
> >
> > In include/uapi/linux/capability.h I found the following:
> >
> > /* Allow raising priority and setting priority on other (different
> >UID) processes */
> > /* Allow use of FIFO and round-robin (realtime) scheduling on own
> >processes and setting the scheduling algorithm used by another
> >process. */
> > /* Allow setting cpu affinity on other processes */
> > #define CAP_SYS_NICE 23
> >
> > If it is acceptable that every process that has permission to submit
> > IOPRIO_CLASS_RT I/O also has permission to modify the priority of
> > other processes then extending CAP_SYS_NICE is an option. Another
> > possibility is to extend the block cgroup controller such that the
> > capability to submit IOPRIO_CLASS_RT I/O can be enabled through the
> > cgroup interface. There may be other approaches. I'm not sure what
> > the best approach is.

I think it fits well with CAP_SYS_NICE, especially since that
capability already grants the ability to demote other processes to
IOPRIO_CLASS_IDLE, etc.

>
> I think CAP_SYS_NICE fits pretty nicely, and I was actually planning on
> using that for the io_uring SQPOLL side as well. So there is/will be
> some precedent for tying it into IO related things, too. For this use
> case, I think it's perfect.
>
> --
> Jens Axboe
>




Re: [PATCH] block: grant IOPRIO_CLASS_RT to CAP_SYS_NICE

2020-08-24 Thread Khazhismel Kumykov
On Mon, Aug 24, 2020 at 1:45 PM Khazhismel Kumykov  wrote:
>
> CAP_SYS_ADMIN is too broad, and ionice fits into CAP_SYS_NICE's grouping.
>
> Retain CAP_SYS_ADMIN permission for backwards compatibility.
>
> Signed-off-by: Khazhismel Kumykov 
> ---
>  block/ioprio.c  | 2 +-
>  include/uapi/linux/capability.h | 2 ++
>  2 files changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/block/ioprio.c b/block/ioprio.c
> index 77bcab11dce5..4572456430f9 100644
> --- a/block/ioprio.c
> +++ b/block/ioprio.c
> @@ -69,7 +69,7 @@ int ioprio_check_cap(int ioprio)
>
> switch (class) {
> case IOPRIO_CLASS_RT:
> -   if (!capable(CAP_SYS_ADMIN))
> +   if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_NICE))
yikes, sorry for the spam
> return -EPERM;
> /* fall through */
> /* rt has prio field too */
> diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
> index 395dd0df8d08..c6ca33034147 100644
> --- a/include/uapi/linux/capability.h
> +++ b/include/uapi/linux/capability.h
> @@ -288,6 +288,8 @@ struct vfs_ns_cap_data {
> processes and setting the scheduling algorithm used by another
> process. */
>  /* Allow setting cpu affinity on other processes */
> +/* Allow setting realtime ioprio class */
> +/* Allow setting ioprio class on other processes */
>
>  #define CAP_SYS_NICE 23
>
> --
> 2.28.0.297.g1956fa8f8d-goog
>




[PATCH v2] block: grant IOPRIO_CLASS_RT to CAP_SYS_NICE

2020-08-24 Thread Khazhismel Kumykov
CAP_SYS_ADMIN is too broad, and ionice fits into CAP_SYS_NICE's grouping.

Retain CAP_SYS_ADMIN permission for backwards compatibility.

Signed-off-by: Khazhismel Kumykov 
---
 block/ioprio.c  | 2 +-
 include/uapi/linux/capability.h | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

v2: fix embarrassing logic mistake
diff --git a/block/ioprio.c b/block/ioprio.c
index 77bcab11dce5..276496246fe9 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -69,7 +69,7 @@ int ioprio_check_cap(int ioprio)
 
switch (class) {
case IOPRIO_CLASS_RT:
-   if (!capable(CAP_SYS_ADMIN))
+   if (!capable(CAP_SYS_NICE) && !capable(CAP_SYS_ADMIN))
return -EPERM;
/* fall through */
/* rt has prio field too */
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 395dd0df8d08..c6ca33034147 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -288,6 +288,8 @@ struct vfs_ns_cap_data {
processes and setting the scheduling algorithm used by another
process. */
 /* Allow setting cpu affinity on other processes */
+/* Allow setting realtime ioprio class */
+/* Allow setting ioprio class on other processes */
 
 #define CAP_SYS_NICE 23
 
-- 
2.28.0.297.g1956fa8f8d-goog



IOPRIO_CLASS_RT without CAP_SYS_ADMIN?

2020-08-20 Thread Khazhismel Kumykov
It'd be nice to allow a process to send RT requests without granting
it the wide capabilities of CAP_SYS_ADMIN. We already have a capability
which seems to almost fit this priority idea - CAP_SYS_NICE. Would this
fit there?

Being able to set IO priorities on a per-request or per-thread basis
(be it async submission or ioprio_set on a thread) is useful,
especially when userspace does its own prioritization/scheduling before
hitting the kernel, since it lets us signal to the kernel how to order
certain IOs. It'd be nice to separate this from CAP_SYS_ADMIN for
non-root processes, in a way that's less error-prone than, e.g., having
a trusted launcher ionice the process and then drop priorities for
everything but the prio requests.
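
For reference, a minimal caller of the interface in question; the
constants follow the ioprio uapi (there is no glibc wrapper), and today
this fails with EPERM unless the task has CAP_SYS_ADMIN:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define IOPRIO_WHO_PROCESS	1
#define IOPRIO_CLASS_RT		1
#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_VALUE(cl, data)	(((cl) << IOPRIO_CLASS_SHIFT) | (data))

int main(void)
{
	/* prio data 0..7 within the class; 0 is highest */
	int ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0);

	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0 /* self */, ioprio)) {
		perror("ioprio_set");
		return 1;
	}
	return 0;
}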

khazhy




[PATCH v3 2/2] fuse: kmemcg account fs data

2019-09-16 Thread Khazhismel Kumykov
account per-file, dentry, and inode data

blockdev/superblock and temporary per-request data was left alone, as
this usually isn't accounted

Signed-off-by: Khazhismel Kumykov 
Reviewed-by: Shakeel Butt 
---
 fs/fuse/dir.c   | 3 ++-
 fs/fuse/file.c  | 5 +++--
 fs/fuse/inode.c | 3 ++-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 58557d4817e9..d572c900bb0f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -279,7 +279,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, 
unsigned int flags)
 #if BITS_PER_LONG < 64
 static int fuse_dentry_init(struct dentry *dentry)
 {
-   dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL);
+   dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry),
+  GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
 
return dentry->d_fsdata ? 0 : -ENOMEM;
 }
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a2ea347c4d2c..862aff3665b5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -63,12 +63,13 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 {
struct fuse_file *ff;
 
-   ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL);
+   ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
if (unlikely(!ff))
return NULL;
 
ff->fc = fc;
-   ff->release_args = kzalloc(sizeof(*ff->release_args), GFP_KERNEL);
+   ff->release_args = kzalloc(sizeof(*ff->release_args),
+  GFP_KERNEL_ACCOUNT);
if (!ff->release_args) {
kfree(ff);
return NULL;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3d598a5bb5b5..6cb445bed89d 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -66,7 +66,8 @@ static struct file_system_type fuseblk_fs_type;
 
 struct fuse_forget_link *fuse_alloc_forget(void)
 {
-   return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
+   return kzalloc(sizeof(struct fuse_forget_link),
+  GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
 }
 
 static struct inode *fuse_alloc_inode(struct super_block *sb)
-- 
2.23.0.237.gc6a4ce50a0-goog



[PATCH v3 1/2] fuse: on 64-bit store time in d_fsdata directly

2019-09-16 Thread Khazhismel Kumykov
Implements the optimization noted in f75fdf22b0a8 ("fuse: don't use
->d_time"), as the additional memory can be significant. (In particular,
on SLAB configurations this 8-byte alloc becomes 32 bytes). Per-dentry,
this can consume significant memory.

Reviewed-by: Shakeel Butt 
Signed-off-by: Khazhismel Kumykov 
---
v3:
 reapplied on fuse/for-next, droping the fuse_request_alloc refactor
 it was already done :) (and account new per-file alloc)

 fs/fuse/dir.c | 36 ++--
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index ba0a175d7578..58557d4817e9 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -24,11 +24,34 @@ static void fuse_advise_use_readdirplus(struct inode *dir)
	set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
 }
 
+#if BITS_PER_LONG >= 64
+static inline void __fuse_dentry_settime(struct dentry *entry, u64 time)
+{
+   entry->d_fsdata = (void *) time;
+}
+
+static inline u64 fuse_dentry_time(const struct dentry *entry)
+{
+   return (u64)entry->d_fsdata;
+}
+
+#else
 union fuse_dentry {
u64 time;
struct rcu_head rcu;
 };
 
+static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time)
+{
+   ((union fuse_dentry *) dentry->d_fsdata)->time = time;
+}
+
+static inline u64 fuse_dentry_time(const struct dentry *entry)
+{
+   return ((union fuse_dentry *) entry->d_fsdata)->time;
+}
+#endif
+
 static void fuse_dentry_settime(struct dentry *dentry, u64 time)
 {
struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb);
@@ -47,12 +70,7 @@ static void fuse_dentry_settime(struct dentry *dentry, u64 
time)
		spin_unlock(&dentry->d_lock);
}
 
-   ((union fuse_dentry *) dentry->d_fsdata)->time = time;
-}
-
-static inline u64 fuse_dentry_time(const struct dentry *entry)
-{
-   return ((union fuse_dentry *) entry->d_fsdata)->time;
+   __fuse_dentry_settime(dentry, time);
 }
 
 /*
@@ -258,6 +276,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, 
unsigned int flags)
goto out;
 }
 
+#if BITS_PER_LONG < 64
 static int fuse_dentry_init(struct dentry *dentry)
 {
dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL);
@@ -270,6 +289,7 @@ static void fuse_dentry_release(struct dentry *dentry)
 
kfree_rcu(fd, rcu);
 }
+#endif
 
 static int fuse_dentry_delete(const struct dentry *dentry)
 {
@@ -279,13 +299,17 @@ static int fuse_dentry_delete(const struct dentry *dentry)
 const struct dentry_operations fuse_dentry_operations = {
.d_revalidate   = fuse_dentry_revalidate,
.d_delete   = fuse_dentry_delete,
+#if BITS_PER_LONG < 64
.d_init = fuse_dentry_init,
.d_release  = fuse_dentry_release,
+#endif
 };
 
 const struct dentry_operations fuse_root_dentry_operations = {
+#if BITS_PER_LONG < 64
.d_init = fuse_dentry_init,
.d_release  = fuse_dentry_release,
+#endif
 };
 
 int fuse_valid_type(int m)
-- 
2.23.0.237.gc6a4ce50a0-goog



Re: [PATCH v3 2/2] fuse: kmemcg account fs data

2019-09-17 Thread Khazhismel Kumykov
On Tue, Sep 17, 2019 at 12:52 AM Miklos Szeredi  wrote:
>
> On Tue, Sep 17, 2019 at 1:56 AM Khazhismel Kumykov  wrote:
> >  struct fuse_forget_link *fuse_alloc_forget(void)
> >  {
> > -   return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
> > +   return kzalloc(sizeof(struct fuse_forget_link),
> > +  GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
>
> What does __GFP_RECLAIMBALE signify in slab allocs?
>

Marking these allocations reclaimable hints to mm how much we can
reclaim overall by shrinking, so if it is reclaimable we should mark
it as such.

For d_fsdata, the lifetime is linked to the dentry, which is
reclaimable, so it makes sense here.

> You understand that the forget_link is not reclaimable in the sense,
> that it requires action (reading requests from the fuse device) from
> the userspace filesystem daemon?
>

Ah, I see, whenever we evict the fuse_inode, we queue a forget
command, which usually waits for userspace. So it's not actually
linked to the inode itself, and yeah, if we need userspace to
intervene we shouldn't treat forget_link as reclaimable. I had figured
since fuse_inode is reclaimable, this should be too, but missed that
disconnect, thanks.




[PATCH v4] fuse: kmemcg account fs data

2019-09-17 Thread Khazhismel Kumykov
account per-file, dentry, and inode data

blockdev/superblock and temporary per-request data was left alone, as
this usually isn't accounted

Reviewed-by: Shakeel Butt 
Signed-off-by: Khazhismel Kumykov 
---
 fs/fuse/dir.c   | 3 ++-
 fs/fuse/file.c  | 5 +++--
 fs/fuse/inode.c | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 58557d4817e9..d572c900bb0f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -279,7 +279,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, 
unsigned int flags)
 #if BITS_PER_LONG < 64
 static int fuse_dentry_init(struct dentry *dentry)
 {
-   dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL);
+   dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry),
+  GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
 
return dentry->d_fsdata ? 0 : -ENOMEM;
 }
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8c7578b95d2c..0f0225686aee 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -63,12 +63,13 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 {
struct fuse_file *ff;
 
-   ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL);
+   ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
if (unlikely(!ff))
return NULL;
 
ff->fc = fc;
-   ff->release_args = kzalloc(sizeof(*ff->release_args), GFP_KERNEL);
+   ff->release_args = kzalloc(sizeof(*ff->release_args),
+  GFP_KERNEL_ACCOUNT);
if (!ff->release_args) {
kfree(ff);
return NULL;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3d598a5bb5b5..e040e2a2b621 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -66,7 +66,7 @@ static struct file_system_type fuseblk_fs_type;
 
 struct fuse_forget_link *fuse_alloc_forget(void)
 {
-   return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
+   return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT);
 }
 
 static struct inode *fuse_alloc_inode(struct super_block *sb)
-- 
2.23.0.237.gc6a4ce50a0-goog



Re: [dm-devel] [PATCH] dm mpath selector: more evenly distribute ties

2018-01-24 Thread Khazhismel Kumykov
On Wed, Jan 24, 2018 at 2:57 AM, Martin Wilck  wrote:
> On Fri, 2018-01-19 at 15:07 -0800, Khazhismel Kumykov wrote:
>> Move the last used path to the end of the list (least preferred) so
>> that
>> ties are more evenly distributed.
>>
>> For example, in case with three paths with one that is slower than
>> others, the remaining two would be unevenly used if they tie. This is
>> due to the rotation not being a truly fair distribution.
>>
>> Illustrated: paths a, b, c, 'c' has 1 outstanding IO, a and b are
>> 'tied'
>> Three possible rotations:
>> (a, b, c) -> best path 'a'
>> (b, c, a) -> best path 'b'
>> (c, a, b) -> best path 'a'
>> (a, b, c) -> best path 'a'
>> (b, c, a) -> best path 'b'
>> (c, a, b) -> best path 'a'
>> ...
>
>
> This happens only if a and b actually have the same weight (e.g. queue
> length for the queue-length selector). If 'a' really receives more IO,
> its queue grows, and the selector will start preferring 'b', so the
> effect should level out automatically with the current code as soon as
> you have real IO going on. But maybe I haven't grasped what you're
> referring to as "tied".
Yes, for "tied" I'm referring to paths with the same weight. As queue
length grows it does tend to level out, but in the case where queue
length doesn't grow (in this example I'm imagining 2 concurrent
requests to the device) the bias does show and the selectors really do
send 'a' 2x more requests than 'b' (when 'c' is much slower and 'a'
and 'b' are ~same speed).

>
> OTOH, if the "best" path has much lower queue length than the other
> paths for whatever reason, your pushing it to the tail will require a
> full list walk with every new call of the selector. I see that as a
> small disadvantage of your approach.
I see - with the best path moved to the tail, we would not benefit as
much from queue-length's short-circuit check for a path with no IOs
outstanding. In service-time no such short circuit exists, so I don't
believe anything changes there. Am I understanding this correctly?

>
> Regards
> Martin
>

Thanks for your comments,
Khazhy

>>
>> So 'a' is used 2x more than 'b', although they should be used evenly.
>>
>> With this change, the most recently used path is always the least
>> preferred, removing this bias resulting in even distribution.
>> (a, b, c) -> best path 'a'
>> (b, c, a) -> best path 'b'
>> (c, a, b) -> best path 'a'
>> (c, b, a) -> best path 'b'
>> ...
>>
>> Signed-off-by: Khazhismel Kumykov 
>> ---
>>  drivers/md/dm-queue-length.c | 6 +++---
>>  drivers/md/dm-service-time.c | 6 +++---
>>  2 files changed, 6 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-
>> length.c
>> index 23f178641794..969c4f1a3633 100644
>> --- a/drivers/md/dm-queue-length.c
>> +++ b/drivers/md/dm-queue-length.c
>> @@ -195,9 +195,6 @@ static struct dm_path *ql_select_path(struct
>> path_selector *ps, size_t nr_bytes)
>>   if (list_empty(&s->valid_paths))
>>   goto out;
>>
>> - /* Change preferred (first in list) path to evenly balance.
>> */
>> - list_move_tail(s->valid_paths.next, &s->valid_paths);
>> -
>>   list_for_each_entry(pi, &s->valid_paths, list) {
>>   if (!best ||
>>   (atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
>> @@ -210,6 +207,9 @@ static struct dm_path *ql_select_path(struct
>> path_selector *ps, size_t nr_bytes)
>>   if (!best)
>>   goto out;
>>
>> + /* Move most recently used to least preferred to evenly
>> balance. */
>> + list_move_tail(&best->list, &s->valid_paths);
>> +
>>   ret = best->path;
>>  out:
>>   spin_unlock_irqrestore(&s->lock, flags);
>> diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-
>> time.c
>> index 7b8642045c55..f006a9005593 100644
>> --- a/drivers/md/dm-service-time.c
>> +++ b/drivers/md/dm-service-time.c
>> @@ -282,9 +282,6 @@ static struct dm_path *st_select_path(struct
>> path_selector *ps, size_t nr_bytes)
>>   if (list_empty(&s->valid_paths))
>>   goto out;
>>
>> - /* Change preferred (first in list) path to evenly balance.
>> */
>> - list_move_tail(s->valid_paths.next, &s->valid_paths);
>> -
>>   list_for_each_entry(pi, &s->valid_paths, list)
>>   if (!best || (st_compare_load(pi, best, nr_bytes) <
>> 0))
>>   best = pi;
>> @@ -292,6 +289,9 @@ static struct dm_path *st_select_path(struct
>> path_selector *ps, size_t nr_bytes)
>>   if (!best)
>>   goto out;
>>
>> + /* Move most recently used to least preferred to evenly
>> balance. */
>> + list_move_tail(&best->list, &s->valid_paths);
>> +
>>   ret = best->path;
>>  out:
>>   spin_unlock_irqrestore(&s->lock, flags);
>> --
>> dm-devel mailing list
>> dm-de...@redhat.com
>> https://www.redhat.com/mailman/listinfo/dm-devel
>
> --
> Dr. Martin Wilck , Tel. +49 (0)911 74053 2107
> SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton
> HRB 21284 (AG Nürnberg)
>




Re: [dm-devel] [PATCH] dm mpath selector: more evenly distribute ties

2018-01-24 Thread Khazhismel Kumykov
On Wed, Jan 24, 2018 at 11:09 AM, Martin Wilck  wrote:
> On Wed, 2018-01-24 at 10:44 -0800, Khazhismel Kumykov wrote:
>> On Wed, Jan 24, 2018 at 2:57 AM, Martin Wilck 
>> wrote:
>> > On Fri, 2018-01-19 at 15:07 -0800, Khazhismel Kumykov wrote:
>> > > Move the last used path to the end of the list (least preferred)
>> > > so
>> > > that
>> > > ties are more evenly distributed.
>> > >
>> > > For example, in case with three paths with one that is slower
>> > > than
>> > > others, the remaining two would be unevenly used if they tie.
>> > > This is
>> > > due to the rotation not being a truly fair distribution.
>> > >
>> > > Illustrated: paths a, b, c, 'c' has 1 outstanding IO, a and b are
>> > > 'tied'
>> > > Three possible rotations:
>> > > (a, b, c) -> best path 'a'
>> > > (b, c, a) -> best path 'b'
>> > > (c, a, b) -> best path 'a'
>> > > (a, b, c) -> best path 'a'
>> > > (b, c, a) -> best path 'b'
>> > > (c, a, b) -> best path 'a'
>> > > ...
>> >
>> >
>> > This happens only if a and b actually have the same weight (e.g.
>> > queue
>> > length for the queue-length selector). If 'a' really receives more
>> > IO,
>> > its queue grows, and the selector will start preferring 'b', so the
>> > effect should level out automatically with the current code as soon
>> > as
>> > you have real IO going on. But maybe I haven't grasped what you're
>> > referring to as "tied".
>>
>> Yes, for "tied" I'm referring to paths with the same weight. As queue
>> length grows it does tend to level out, but in the case where queue
>> length doesn't grow (in this example I'm imagining 2 concurrent
>> requests to the device) the bias does show and the selectors really
>> do
>> send 'a' 2x more requests than 'b' (when 'c' is much slower and 'a'
>> and 'b' are ~same speed).
>
> Have you actually observed this? I find the idea pretty academical that
> two paths would be walking "tied" this way. In practice, under IO load,
> I'd expect queue lengths (and service-times, for that matter) to
> fluctuate enough to prevent this effect to be measurable. But of
> course, I may be wrong. If you really did observe this, the next
> question would be: does hurt performance to an extent that can be
> noticed/measured? I reckon that if 'a' got saturated under the load,
> hurting performance, its queue length would grow quickly and 'b' would
> automatically get preferred.
This is fairly simple to observe - start two sync reader threads
against a device with 3 backing paths, with one path taking longer on
average to complete requests than the other two. One of the 'faster'
paths will be used ~2x more than the other. Perhaps not that common a
situation, but it is a real one. The bias seemed simple to remove, so
that the two (or N) tied paths would be used equally.

I don't see a downside to distributing more evenly in this case,
although I can't say I've directly observed performance downsides from
a biased distribution (aside from the bias itself being a downside).
The bias in question is inherent to the order paths are added to the
selector (and to which path is 'always bad'), so if 'a' is saturated
because of it and slows down, it will continue to be preferred once it
recovers, whereas with an even distribution it would not.
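
A rough sketch of the repro described above (assumes a queue-length or
service-time multipath device at /dev/mapper/mpatha with three paths,
one of them artificially slowed, e.g. via dm-delay; per-path counters
in /sys/block/<path>/stat then show the ~2:1 split between the two
'fast' paths):

#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

static const char *devpath = "/dev/mapper/mpatha";  /* adjust as needed */

static void *reader(void *unused)
{
	void *buf;
	int fd = open(devpath, O_RDONLY | O_DIRECT);

	(void)unused;
	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return NULL;
	for (int i = 0; i < 100000; i++)        /* back-to-back sync reads */
		pread(fd, buf, 4096, (off_t)(i % 1024) * 4096);
	free(buf);
	close(fd);
	return NULL;
}

int main(void)
{
	pthread_t t[2];

	for (int i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, reader, NULL);
	for (int i = 0; i < 2; i++)
		pthread_join(t[i], NULL);
	return 0;
}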

Khazhy




Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-11-16 Thread Khazhismel Kumykov
On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li  wrote:
> On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote:
>> Allows configuration additional bytes or ios before a throttle is
>> triggered.
>>
>> This allows implementation of a bucket style rate-limit/throttle on a
>> block device. Previously, bursting to a device was limited to allowance
>> granted in a single throtl_slice (similar to a bucket with limit N and
>> refill rate N/slice).
>>
>> Additional parameters bytes/io_burst_conf defined for tg, which define a
>> number of bytes/ios that must be depleted before throttling happens. A
>> tg that does not deplete this allowance functions as though it has no
>> configured limits. tgs earn additional allowance at rate defined by
>> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling
>> kicks in. If a tg is idle for a while, it will again have some burst
>> allowance before it gets throttled again.
>>
>> slice_end for a tg is extended until io_disp/byte_disp would fall to 0,
>> when all "used" burst allowance would be earned back. trim_slice still
>> does progress slice_start as before and decrements *_disp as before, and
>> tgs continue to get bytes/ios in throtl_slice intervals.
>
> Can you describe why we need this? It would be great if you can describe the
> usage model and an example. Does this work for io.low/io.max or both?
>
> Thanks,
> Shaohua
>

Use case that brought this up was configuring limits for a remote
shared device. Bursting beyond io.max is desired but only for so much
before the limit kicks in, afterwards with sustained usage throughput
is capped. (This proactively avoids remote-side limits). In that case
one would configure in a root container io.max + io.burst, and
configure low/other limits on descendants sharing the resource on the
same node.

With this patch, so long as tg has not dispatched more than the burst,
no limit is applied at all by that tg, including limit imposed by
io.low in tg_iops_limit, etc.

Khazhy

>> Signed-off-by: Khazhismel Kumykov 
>> ---
>>  block/Kconfig|  11 +++
>>  block/blk-throttle.c | 192 
>> +++
>>  2 files changed, 189 insertions(+), 14 deletions(-)
>>
>> diff --git a/block/Kconfig b/block/Kconfig
>> index 28ec55752b68..fbd05b419f93 100644
>> --- a/block/Kconfig
>> +++ b/block/Kconfig
>> @@ -128,6 +128,17 @@ config BLK_DEV_THROTTLING_LOW
>>
>>   Note, this is an experimental interface and could be changed someday.
>>
>> +config BLK_DEV_THROTTLING_BURST
>> +bool "Block throttling .burst allowance interface"
>> +depends on BLK_DEV_THROTTLING
>> +default n
>> +---help---
>> +Add .burst allowance for block throttling. Burst allowance allows for
>> +additional unthrottled usage, while still limiting speed for sustained
>> +usage.
>> +
>> +If in doubt, say N.
>> +
>>  config BLK_CMDLINE_PARSER
>>   bool "Block device command line partition parser"
>>   default n
>> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
>> index 96ad32623427..27c084312772 100644
>> --- a/block/blk-throttle.c
>> +++ b/block/blk-throttle.c
>> @@ -157,6 +157,11 @@ struct throtl_grp {
>>   /* Number of bio's dispatched in current slice */
>>   unsigned int io_disp[2];
>>
>> +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
>> + uint64_t bytes_burst_conf[2];
>> + unsigned int io_burst_conf[2];
>> +#endif
>> +
>>   unsigned long last_low_overflow_time[2];
>>
>>   uint64_t last_bytes_disp[2];
>> @@ -507,6 +512,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t 
>> gfp, int node)
>>   tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
>>   tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
>>   tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
>> +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
>> + tg->bytes_burst_conf[READ] = 0;
>> + tg->bytes_burst_conf[WRITE] = 0;
>> + tg->io_burst_conf[READ] = 0;
>> + tg->io_burst_conf[WRITE] = 0;
>> +#endif
>>   /* LIMIT_LOW will have default value 0 */
>>
>>   tg->latency_target = DFL_LATENCY_TARGET;
>> @@ -800,6 +811,26 @@ static inline void throtl_start_new_slice(struct 
>> throtl_grp *tg, bool rw)
>>  tg->slice_end[rw], jiffies);
>>  }
>>
>> +/*
>> + * When current slice should end.
>> + *
>> + * With CONFIG_BLK_DEV_THROTTLING_BURST, we will wait longer than min_wai

[PATCH] dm mpath selector: more evenly distribute ties

2018-01-19 Thread Khazhismel Kumykov
Move the last used path to the end of the list (least preferred) so that
ties are more evenly distributed.

For example, in a case with three paths where one is slower than the
others, the remaining two would be unevenly used if they tie. This is
due to the rotation not being a truly fair distribution.

Illustrated: paths a, b, c, 'c' has 1 outstanding IO, a and b are 'tied'
Three possible rotations:
(a, b, c) -> best path 'a'
(b, c, a) -> best path 'b'
(c, a, b) -> best path 'a'
(a, b, c) -> best path 'a'
(b, c, a) -> best path 'b'
(c, a, b) -> best path 'a'
...

So 'a' is used 2x more than 'b', although they should be used evenly.

With this change, the most recently used path is always the least
preferred, removing this bias resulting in even distribution.
(a, b, c) -> best path 'a'
(b, c, a) -> best path 'b'
(c, a, b) -> best path 'a'
(c, b, a) -> best path 'b'
...

Signed-off-by: Khazhismel Kumykov 
---
 drivers/md/dm-queue-length.c | 6 +++---
 drivers/md/dm-service-time.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 23f178641794..969c4f1a3633 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -195,9 +195,6 @@ static struct dm_path *ql_select_path(struct path_selector 
*ps, size_t nr_bytes)
	if (list_empty(&s->valid_paths))
goto out;
 
-   /* Change preferred (first in list) path to evenly balance. */
-	list_move_tail(s->valid_paths.next, &s->valid_paths);
-
	list_for_each_entry(pi, &s->valid_paths, list) {
if (!best ||
	    (atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
@@ -210,6 +207,9 @@ static struct dm_path *ql_select_path(struct path_selector 
*ps, size_t nr_bytes)
if (!best)
goto out;
 
+   /* Move most recently used to least preferred to evenly balance. */
+	list_move_tail(&best->list, &s->valid_paths);
+
ret = best->path;
 out:
	spin_unlock_irqrestore(&s->lock, flags);
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index 7b8642045c55..f006a9005593 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -282,9 +282,6 @@ static struct dm_path *st_select_path(struct path_selector 
*ps, size_t nr_bytes)
	if (list_empty(&s->valid_paths))
goto out;
 
-   /* Change preferred (first in list) path to evenly balance. */
-	list_move_tail(s->valid_paths.next, &s->valid_paths);
-
	list_for_each_entry(pi, &s->valid_paths, list)
if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
best = pi;
@@ -292,6 +289,9 @@ static struct dm_path *st_select_path(struct path_selector 
*ps, size_t nr_bytes)
if (!best)
goto out;
 
+   /* Move most recently used to least preferred to evenly balance. */
+	list_move_tail(&best->list, &s->valid_paths);
+
ret = best->path;
 out:
	spin_unlock_irqrestore(&s->lock, flags);
-- 
2.16.0.rc1.238.g530d649a79-goog





[PATCH 1/3] fuse: on 64-bit store time in d_fsdata directly

2019-08-21 Thread Khazhismel Kumykov
Implements the optimization noted in f75fdf22b0a8 ("fuse: don't use
->d_time"), as the additional memory can be significant. (In particular,
on SLAB configurations this 8-byte alloc becomes 32 bytes). Per-dentry,
this can consume significant memory.

Signed-off-by: Khazhismel Kumykov 
---
 fs/fuse/dir.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index dd0f64f7bc06..f9c59a296568 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -24,6 +24,18 @@ static void fuse_advise_use_readdirplus(struct inode *dir)
	set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
 }
 
+#if BITS_PER_LONG >= 64
+static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
+{
+   entry->d_fsdata = (void *) time;
+}
+
+static inline u64 fuse_dentry_time(struct dentry *entry)
+{
+   return (u64)entry->d_fsdata;
+}
+
+#else
 union fuse_dentry {
u64 time;
struct rcu_head rcu;
@@ -38,6 +50,7 @@ static inline u64 fuse_dentry_time(struct dentry *entry)
 {
return ((union fuse_dentry *) entry->d_fsdata)->time;
 }
+#endif
 
 /*
  * FUSE caches dentries and attributes with separate timeout.  The
@@ -242,6 +255,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, 
unsigned int flags)
goto out;
 }
 
+#if BITS_PER_LONG < 64
 static int fuse_dentry_init(struct dentry *dentry)
 {
dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL);
@@ -254,16 +268,21 @@ static void fuse_dentry_release(struct dentry *dentry)
 
kfree_rcu(fd, rcu);
 }
+#endif
 
 const struct dentry_operations fuse_dentry_operations = {
.d_revalidate   = fuse_dentry_revalidate,
+#if BITS_PER_LONG < 64
.d_init = fuse_dentry_init,
.d_release  = fuse_dentry_release,
+#endif
 };
 
 const struct dentry_operations fuse_root_dentry_operations = {
+#if BITS_PER_LONG < 64
.d_init = fuse_dentry_init,
.d_release  = fuse_dentry_release,
+#endif
 };
 
 int fuse_valid_type(int m)
-- 
2.23.0.187.g17f5b7556c-goog



[PATCH 3/3] fuse: pass gfp flags to fuse_request_alloc

2019-08-21 Thread Khazhismel Kumykov
Instead of having a helper per flag

Signed-off-by: Khazhismel Kumykov 
---
 fs/fuse/dev.c| 22 +++---
 fs/fuse/file.c   |  6 +++---
 fs/fuse/fuse_i.h |  6 +-
 fs/fuse/inode.c  |  4 ++--
 4 files changed, 9 insertions(+), 29 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index a0d166a6596f..c957620ce7ba 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -66,7 +66,7 @@ static struct page **fuse_req_pages_alloc(unsigned int 
npages, gfp_t flags,
return pages;
 }
 
-static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
+struct fuse_req *fuse_request_alloc(unsigned int npages, gfp_t flags)
 {
struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags);
if (req) {
@@ -90,24 +90,8 @@ static struct fuse_req *__fuse_request_alloc(unsigned 
npages, gfp_t flags)
}
return req;
 }
-
-struct fuse_req *fuse_request_alloc(unsigned npages)
-{
-   return __fuse_request_alloc(npages, GFP_KERNEL);
-}
 EXPORT_SYMBOL_GPL(fuse_request_alloc);
 
-struct fuse_req *fuse_request_alloc_account(unsigned int npages)
-{
-   return __fuse_request_alloc(npages, GFP_KERNEL_ACCOUNT);
-}
-EXPORT_SYMBOL_GPL(fuse_request_alloc_account);
-
-struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
-{
-   return __fuse_request_alloc(npages, GFP_NOFS);
-}
-
 static void fuse_req_pages_free(struct fuse_req *req)
 {
if (req->pages != req->inline_pages)
@@ -207,7 +191,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn 
*fc, unsigned npages,
if (fc->conn_error)
goto out;
 
-   req = fuse_request_alloc(npages);
+   req = fuse_request_alloc(npages, GFP_KERNEL);
err = -ENOMEM;
if (!req) {
if (for_background)
@@ -316,7 +300,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct 
fuse_conn *fc,
wait_event(fc->blocked_waitq, fc->initialized);
/* Matches smp_wmb() in fuse_set_initialized() */
smp_rmb();
-   req = fuse_request_alloc(0);
+   req = fuse_request_alloc(0, GFP_KERNEL);
if (!req)
req = get_reserved_req(fc, file);
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c584ad7478b3..ae8c8016bb8e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -50,7 +50,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
return NULL;
 
ff->fc = fc;
-   ff->reserved_req = fuse_request_alloc_account(0);
+   ff->reserved_req = fuse_request_alloc(0, GFP_KERNEL_ACCOUNT);
if (unlikely(!ff->reserved_req)) {
kfree(ff);
return NULL;
@@ -1703,7 +1703,7 @@ static int fuse_writepage_locked(struct page *page)
 
set_page_writeback(page);
 
-   req = fuse_request_alloc_nofs(1);
+   req = fuse_request_alloc(1, GFP_NOFS);
if (!req)
goto err;
 
@@ -1923,7 +1923,7 @@ static int fuse_writepages_fill(struct page *page,
struct fuse_inode *fi = get_fuse_inode(inode);
 
err = -ENOMEM;
-   req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES);
+   req = fuse_request_alloc(FUSE_REQ_INLINE_PAGES, GFP_NOFS);
if (!req) {
__free_page(tmp_page);
goto out_unlock;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 08161b2d9b08..8080a51096e9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -902,11 +902,7 @@ void __exit fuse_ctl_cleanup(void);
 /**
  * Allocate a request
  */
-struct fuse_req *fuse_request_alloc(unsigned npages);
-
-struct fuse_req *fuse_request_alloc_account(unsigned int npages);
-
-struct fuse_req *fuse_request_alloc_nofs(unsigned npages);
+struct fuse_req *fuse_request_alloc(unsigned int npages, gfp_t flags);
 
 bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req,
gfp_t flags);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index eab44ddc68b9..ad92e93eaddd 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1178,13 +1178,13 @@ static int fuse_fill_super(struct super_block *sb, void 
*data, int silent)
/* Root dentry doesn't have .d_revalidate */
	sb->s_d_op = &fuse_root_dentry_operations;
 
-   init_req = fuse_request_alloc(0);
+   init_req = fuse_request_alloc(0, GFP_KERNEL);
if (!init_req)
goto err_put_root;
	__set_bit(FR_BACKGROUND, &init_req->flags);
 
if (is_bdev) {
-   fc->destroy_req = fuse_request_alloc(0);
+   fc->destroy_req = fuse_request_alloc(0, GFP_KERNEL);
if (!fc->destroy_req)
goto err_free_init_req;
}
-- 
2.23.0.187.g17f5b7556c-goog



[PATCH 2/3] fuse: kmemcg account fs data

2019-08-21 Thread Khazhismel Kumykov
account per-file, dentry, and inode data

accounts the per-file reserved request, adding new
fuse_request_alloc_account()

blockdev/superblock and temporary per-request data was left alone, as
this usually isn't accounted

Signed-off-by: Khazhismel Kumykov 
---
 fs/fuse/dev.c| 6 ++
 fs/fuse/dir.c| 3 ++-
 fs/fuse/file.c   | 4 ++--
 fs/fuse/fuse_i.h | 2 ++
 fs/fuse/inode.c  | 3 ++-
 5 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ea8237513dfa..a0d166a6596f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -97,6 +97,12 @@ struct fuse_req *fuse_request_alloc(unsigned npages)
 }
 EXPORT_SYMBOL_GPL(fuse_request_alloc);
 
+struct fuse_req *fuse_request_alloc_account(unsigned int npages)
+{
+   return __fuse_request_alloc(npages, GFP_KERNEL_ACCOUNT);
+}
+EXPORT_SYMBOL_GPL(fuse_request_alloc_account);
+
 struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
 {
return __fuse_request_alloc(npages, GFP_NOFS);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index f9c59a296568..2013e1222de7 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -258,7 +258,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, 
unsigned int flags)
 #if BITS_PER_LONG < 64
 static int fuse_dentry_init(struct dentry *dentry)
 {
-   dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL);
+   dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry),
+  GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
 
return dentry->d_fsdata ? 0 : -ENOMEM;
 }
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5ae2828beb00..c584ad7478b3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -45,12 +45,12 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 {
struct fuse_file *ff;
 
-   ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL);
+   ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
if (unlikely(!ff))
return NULL;
 
ff->fc = fc;
-   ff->reserved_req = fuse_request_alloc(0);
+   ff->reserved_req = fuse_request_alloc_account(0);
if (unlikely(!ff->reserved_req)) {
kfree(ff);
return NULL;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 24dbca75..08161b2d9b08 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -904,6 +904,8 @@ void __exit fuse_ctl_cleanup(void);
  */
 struct fuse_req *fuse_request_alloc(unsigned npages);
 
+struct fuse_req *fuse_request_alloc_account(unsigned int npages);
+
 struct fuse_req *fuse_request_alloc_nofs(unsigned npages);
 
 bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4bb885b0f032..eab44ddc68b9 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -76,7 +76,8 @@ struct fuse_mount_data {
 
 struct fuse_forget_link *fuse_alloc_forget(void)
 {
-   return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
+   return kzalloc(sizeof(struct fuse_forget_link),
+  GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
 }
 
 static struct inode *fuse_alloc_inode(struct super_block *sb)
-- 
2.23.0.187.g17f5b7556c-goog



Re: [PATCH 3/3] fuse: pass gfp flags to fuse_request_alloc

2019-08-22 Thread Khazhismel Kumykov
On Wed, Aug 21, 2019 at 5:18 PM Shakeel Butt  wrote:
>
> On Wed, Aug 21, 2019 at 5:10 PM Khazhismel Kumykov  wrote:
> >
> > Instead of having a helper per flag
> >
> > Signed-off-by: Khazhismel Kumykov 
>
> I think it would be better to re-order the patch 2 and 3 of this
> series. There will be less code churn.

That makes sense to me, I'll follow up with a v2




[PATCH v2 1/3] fuse: on 64-bit store time in d_fsdata directly

2019-08-22 Thread Khazhismel Kumykov
Implements the optimization noted in f75fdf22b0a8 ("fuse: don't use
->d_time"), as the additional per-dentry memory can be significant.
(In particular, on SLAB configurations this 8-byte allocation becomes
32 bytes.)

Reviewed-by: Shakeel Butt 
Signed-off-by: Khazhismel Kumykov 
---
 fs/fuse/dir.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index dd0f64f7bc06..f9c59a296568 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -24,6 +24,18 @@ static void fuse_advise_use_readdirplus(struct inode *dir)
set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
 }
 
+#if BITS_PER_LONG >= 64
+static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
+{
+   entry->d_fsdata = (void *) time;
+}
+
+static inline u64 fuse_dentry_time(struct dentry *entry)
+{
+   return (u64)entry->d_fsdata;
+}
+
+#else
 union fuse_dentry {
u64 time;
struct rcu_head rcu;
@@ -38,6 +50,7 @@ static inline u64 fuse_dentry_time(struct dentry *entry)
 {
return ((union fuse_dentry *) entry->d_fsdata)->time;
 }
+#endif
 
 /*
  * FUSE caches dentries and attributes with separate timeout.  The
@@ -242,6 +255,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, 
unsigned int flags)
goto out;
 }
 
+#if BITS_PER_LONG < 64
 static int fuse_dentry_init(struct dentry *dentry)
 {
dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL);
@@ -254,16 +268,21 @@ static void fuse_dentry_release(struct dentry *dentry)
 
kfree_rcu(fd, rcu);
 }
+#endif
 
 const struct dentry_operations fuse_dentry_operations = {
.d_revalidate   = fuse_dentry_revalidate,
+#if BITS_PER_LONG < 64
.d_init = fuse_dentry_init,
.d_release  = fuse_dentry_release,
+#endif
 };
 
 const struct dentry_operations fuse_root_dentry_operations = {
+#if BITS_PER_LONG < 64
.d_init = fuse_dentry_init,
.d_release  = fuse_dentry_release,
+#endif
 };
 
 int fuse_valid_type(int m)
-- 
2.23.0.187.g17f5b7556c-goog



[PATCH v2 3/3] fuse: kmemcg account fs data

2019-08-22 Thread Khazhismel Kumykov
account per-file, dentry, and inode data

accounts the per-file reserved request by allocating it with
GFP_KERNEL_ACCOUNT

blockdev/superblock and temporary per-request data was left alone, as
this usually isn't accounted

Signed-off-by: Khazhismel Kumykov 
---
 fs/fuse/dir.c   | 3 ++-
 fs/fuse/file.c  | 4 ++--
 fs/fuse/inode.c | 3 ++-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index f9c59a296568..2013e1222de7 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -258,7 +258,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, 
unsigned int flags)
 #if BITS_PER_LONG < 64
 static int fuse_dentry_init(struct dentry *dentry)
 {
-   dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL);
+   dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry),
+  GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
 
return dentry->d_fsdata ? 0 : -ENOMEM;
 }
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 572d8347ebcb..ae8c8016bb8e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -45,12 +45,12 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 {
struct fuse_file *ff;
 
-   ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL);
+   ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
if (unlikely(!ff))
return NULL;
 
ff->fc = fc;
-   ff->reserved_req = fuse_request_alloc(0, GFP_KERNEL);
+   ff->reserved_req = fuse_request_alloc(0, GFP_KERNEL_ACCOUNT);
if (unlikely(!ff->reserved_req)) {
kfree(ff);
return NULL;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 5afd1872b8b1..ad92e93eaddd 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -76,7 +76,8 @@ struct fuse_mount_data {
 
 struct fuse_forget_link *fuse_alloc_forget(void)
 {
-   return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
+   return kzalloc(sizeof(struct fuse_forget_link),
+  GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
 }
 
 static struct inode *fuse_alloc_inode(struct super_block *sb)
-- 
2.23.0.187.g17f5b7556c-goog



[PATCH v2 2/3] fuse: pass gfp flags to fuse_request_alloc

2019-08-22 Thread Khazhismel Kumykov
Instead of having a helper per flag

Signed-off-by: Khazhismel Kumykov 
---
 fs/fuse/dev.c| 16 +++-
 fs/fuse/file.c   |  6 +++---
 fs/fuse/fuse_i.h |  4 +---
 fs/fuse/inode.c  |  4 ++--
 4 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ea8237513dfa..c957620ce7ba 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -66,7 +66,7 @@ static struct page **fuse_req_pages_alloc(unsigned int 
npages, gfp_t flags,
return pages;
 }
 
-static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
+struct fuse_req *fuse_request_alloc(unsigned int npages, gfp_t flags)
 {
struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags);
if (req) {
@@ -90,18 +90,8 @@ static struct fuse_req *__fuse_request_alloc(unsigned 
npages, gfp_t flags)
}
return req;
 }
-
-struct fuse_req *fuse_request_alloc(unsigned npages)
-{
-   return __fuse_request_alloc(npages, GFP_KERNEL);
-}
 EXPORT_SYMBOL_GPL(fuse_request_alloc);
 
-struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
-{
-   return __fuse_request_alloc(npages, GFP_NOFS);
-}
-
 static void fuse_req_pages_free(struct fuse_req *req)
 {
if (req->pages != req->inline_pages)
@@ -201,7 +191,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn 
*fc, unsigned npages,
if (fc->conn_error)
goto out;
 
-   req = fuse_request_alloc(npages);
+   req = fuse_request_alloc(npages, GFP_KERNEL);
err = -ENOMEM;
if (!req) {
if (for_background)
@@ -310,7 +300,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct 
fuse_conn *fc,
wait_event(fc->blocked_waitq, fc->initialized);
/* Matches smp_wmb() in fuse_set_initialized() */
smp_rmb();
-   req = fuse_request_alloc(0);
+   req = fuse_request_alloc(0, GFP_KERNEL);
if (!req)
req = get_reserved_req(fc, file);
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5ae2828beb00..572d8347ebcb 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -50,7 +50,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
return NULL;
 
ff->fc = fc;
-   ff->reserved_req = fuse_request_alloc(0);
+   ff->reserved_req = fuse_request_alloc(0, GFP_KERNEL);
if (unlikely(!ff->reserved_req)) {
kfree(ff);
return NULL;
@@ -1703,7 +1703,7 @@ static int fuse_writepage_locked(struct page *page)
 
set_page_writeback(page);
 
-   req = fuse_request_alloc_nofs(1);
+   req = fuse_request_alloc(1, GFP_NOFS);
if (!req)
goto err;
 
@@ -1923,7 +1923,7 @@ static int fuse_writepages_fill(struct page *page,
struct fuse_inode *fi = get_fuse_inode(inode);
 
err = -ENOMEM;
-   req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES);
+   req = fuse_request_alloc(FUSE_REQ_INLINE_PAGES, GFP_NOFS);
if (!req) {
__free_page(tmp_page);
goto out_unlock;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 24dbca75..8080a51096e9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -902,9 +902,7 @@ void __exit fuse_ctl_cleanup(void);
 /**
  * Allocate a request
  */
-struct fuse_req *fuse_request_alloc(unsigned npages);
-
-struct fuse_req *fuse_request_alloc_nofs(unsigned npages);
+struct fuse_req *fuse_request_alloc(unsigned int npages, gfp_t flags);
 
 bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req,
gfp_t flags);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4bb885b0f032..5afd1872b8b1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1177,13 +1177,13 @@ static int fuse_fill_super(struct super_block *sb, void 
*data, int silent)
/* Root dentry doesn't have .d_revalidate */
sb->s_d_op = &fuse_dentry_operations;
 
-   init_req = fuse_request_alloc(0);
+   init_req = fuse_request_alloc(0, GFP_KERNEL);
if (!init_req)
goto err_put_root;
__set_bit(FR_BACKGROUND, &init_req->flags);
 
if (is_bdev) {
-   fc->destroy_req = fuse_request_alloc(0);
+   fc->destroy_req = fuse_request_alloc(0, GFP_KERNEL);
if (!fc->destroy_req)
goto err_free_init_req;
}
-- 
2.23.0.187.g17f5b7556c-goog



Re: [PATCH 1/1] epoll: call final ep_events_available() check under the lock

2020-05-05 Thread Khazhismel Kumykov
On Tue, May 5, 2020 at 1:04 PM Andrew Morton  wrote:
>
> On Tue, 05 May 2020 10:42:05 +0200 Roman Penyaev  wrote:
>
> > May I ask you to remove "epoll: ensure ep_poll() doesn't miss wakeup
> > events" from your -mm queue? Jason lately found out that the patch
> > does not fully solve the problem and this one patch is a second
> > attempt to do things correctly in a different way (namely to do
> > the final check under the lock). Previous changes are not needed.
>
> Where do we stand with Khazhismel's "eventpoll: fix missing wakeup for
> ovflist in ep_poll_callback"?
>
> http://lkml.kernel.org/r/20200424190039.192373-1-kha...@google.com
>

My understanding is: we need the ep_poll_callback fix at a logical
level (ovflist was never triggering a wakeup), and the two follow-ups
close races in both how we add/remove from the wait queue and how we
observe the ready list. Those are needed if we only wake when we add
events, whereas before we were also waking when splicing ovflist
events while reading the ready list. As well, the first two together
are needed for epoll60 to pass in my testing.
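
To make the lost-wakeup window concrete, here is a minimal userspace
analogy (plain pthreads, not the epoll code itself; names are
illustrative): if the waiter checked the ready condition only outside
the lock before sleeping, an event published in between could be
missed; doing the final check under the lock closes that window.

/* Userspace analogy of "do the final check under the lock". */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool ready;

static void *producer(void *arg)
{
	pthread_mutex_lock(&lock);
	ready = true;			/* like queueing an event */
	pthread_cond_signal(&cond);	/* like the wakeup in the callback */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, producer, NULL);

	pthread_mutex_lock(&lock);
	while (!ready)			/* final check made under the lock */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	puts("event observed");
	return 0;
}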

Khazhy




Re: WARNING in notify_change

2019-04-15 Thread Khazhismel Kumykov
I was able to reproduce this by setting the security.capability xattr
on a blockdev file and then writing to it: when writing to the blockdev
we never lock the inode, so when we clear the capability we hit this
lockdep warning.

Is the issue here that we can set this xattr in the first place, so
that we have to clear it at all? Or should we really be locking the
inode for blockdevs after all? I'm not too familiar, but my gut says
the former.

This reproducer is able to immediately crash a machine running
linux-next-20190415:

#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/xattr.h>
#include <unistd.h>

char *disk = "/dev/loop0";

int main(void) {
	int fd = open(disk, 0);
	if (fd < 0)
		printf("open: %d\n", errno);

	system("dd if=/dev/zero of=a_file count=51200");
	system("losetup /dev/loop0 a_file");

	uint32_t value[5] = {
		0x200,
		7,
		0x20d0,
		6,
		4
	};

	int res = fsetxattr(fd, "security.capability", &value,
			    sizeof(value), XATTR_CREATE);
	if (res < 0)
		printf("xattr: %d\n", errno);

	int fd2 = open(disk, O_RDWR);
	write(fd2, "hello", 5);
	return 0;
}




[PATCH] ext4: add cond_resched() to ext4_mb_init_backend()

2019-04-15 Thread Khazhismel Kumykov
On non-preempt kernels, for filesystems with a large number of groups,
we may take a long time (>50 ticks) initializing all the groups.

Signed-off-by: Khazhismel Kumykov 
---
 fs/ext4/mballoc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8ef5f12bbee2..c89f497ccf50 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2490,6 +2490,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
for (i = 0; i < ngroups; i++) {
+   cond_resched();
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
-- 
2.21.0.392.gf8f6787159e-goog



Re: [PATCH] ext4: add cond_resched() to ext4_mb_init_backend()

2019-04-18 Thread Khazhismel Kumykov
On Thu, Apr 18, 2019 at 4:53 AM Jan Kara  wrote:
>
> On Mon 15-04-19 19:59:34, Khazhismel Kumykov wrote:
> > on non-preempt kernels for filesystems with large number of groups we
> > may take a long time (>50 ticks) initializing all the groups.
> >
> > Signed-off-by: Khazhismel Kumykov 
>
> Thanks for the patch. I'm not opposed to this but I'm just wondering: Has
> this caused any real issues to you? Or how have you come across this
> particular loop? Because I'd think we have more similar loops in ext4
> codebase...
>
> Honza
>

We have some instrumentation that warns about these longer periods
without scheduling, which admittedly does trigger relatively frequently
from various places. This particular loop just happens to run during
system startup, which is where it caught my eye.
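
As a rough illustration only (a hypothetical userspace sketch, not the
instrumentation referred to above), such a check amounts to timing the
loop and warning once a budget is exceeded without yielding:

/* Hypothetical sketch: warn when a loop runs ~50ms without yielding. */
#include <stdio.h>
#include <time.h>

static double now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000.0 + ts.tv_nsec / 1e6;
}

int main(void)
{
	const double budget_ms = 50.0;	/* roughly "50 ticks" at HZ=1000 */
	double start = now_ms();
	long i;

	for (i = 0; i < 10000000L; i++) {
		/* placeholder for per-group work */
		if (now_ms() - start > budget_ms) {
			fprintf(stderr, "ran %.0f ms without yielding\n",
				now_ms() - start);
			start = now_ms();	/* a kernel loop would cond_resched() here */
		}
	}
	return 0;
}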




[PATCH v2] ext4: cond_resched in work-heavy group loops

2019-04-23 Thread Khazhismel Kumykov
These functions may take a long time looping over many groups, which
may cause issues for non-preempt kernels.
ext4_mb_init_backend()
ext4_setup_system_zone()
ext4_mb_release()

Signed-off-by: Khazhismel Kumykov 
---
v2: 
 - a few other places that showed up as slow in testing

 fs/ext4/block_validity.c | 1 +
 fs/ext4/mballoc.c| 2 ++
 2 files changed, 3 insertions(+)

diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 913061c0de1b..16134469ea3c 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -155,6 +155,7 @@ int ext4_setup_system_zone(struct super_block *sb)
return 0;
 
for (i=0; i < ngroups; i++) {
+   cond_resched();
if (ext4_bg_has_super(sb, i) &&
((i < 5) || ((i % flex_size) == 0)))
add_system_zone(sbi, ext4_group_first_block_no(sb, i),
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8ef5f12bbee2..99ba720dbb7a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2490,6 +2490,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
for (i = 0; i < ngroups; i++) {
+   cond_resched();
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
@@ -2705,6 +2706,7 @@ int ext4_mb_release(struct super_block *sb)
 
if (sbi->s_group_info) {
for (i = 0; i < ngroups; i++) {
+   cond_resched();
grinfo = ext4_get_group_info(sb, i);
 #ifdef DOUBLE_CHECK
kfree(grinfo->bb_bitmap);
-- 
2.21.0.593.g511ec345e18-goog



[PATCH] fs/fat: add cond_resched to fat_count_free_clusters

2018-10-10 Thread Khazhismel Kumykov
On non-preempt kernels this loop can take a long time (more than 50
ticks) processing entries.

Signed-off-by: Khazhismel Kumykov 
---
 fs/fat/fatent.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index defc2168de91..f58c0cacc531 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -682,6 +682,7 @@ int fat_count_free_clusters(struct super_block *sb)
if (ops->ent_get(&fatent) == FAT_ENT_FREE)
free++;
} while (fat_ent_next(sbi, &fatent));
+   cond_resched();
}
sbi->free_clusters = free;
sbi->free_clus_valid = 1;
-- 
2.19.0.605.g01d371f741-goog



Re: Hang/soft lockup in d_invalidate with simultaneous calls

2017-05-17 Thread Khazhismel Kumykov
On Mon, May 15, 2017 at 5:05 PM, Khazhismel Kumykov  wrote:
> Hi,
>
> I'm seeing behavior in d_invalidate, if multiple threads call d_invalidate on
> the same tree at the same time, behavior blows up and all the calls hang with
> large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k
> entries in d_subdir, and 5 or so threads calling d_invalidate was able to hang
> my test VMs)
>
> This seems to be due to thrashing on the dentry locks in multiple threads
> tightly looping calling d_walk waiting for a shrink_dentry_list call (also
> thrashing the locks) to complete. (d_invalidate loops calling d_walk so long 
> as
> any dentry in the tree is in a dcache shrink list).
>
> There was a similar report recently "soft lockup in d_invalidate()" that
> proposed in the d_invalidate d_walk to ignore dentries marked as in a shrink
> list already, which does prevent this hang/lockup in this case as well, 
> although
> I'm not sure it doesn't violate the contract for d_invalidate. (Although the
> entries in a shrink list should be dropped eventually, not necessarily by the
> time d_invalidate returns).
>
> If someone more familiar with the dcache could provide input I would 
> appreciate.
>
> A reliable repro on mainline is:
>  - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. fuse passthrough
>  - create a directory and populate with ~100k files with content to
> populate dcache
>  - create some background processes opening/reading files in this folder (5
>   while true; cat $file was enough to get an indefinite hang for me)
>  - cause the directory to need to be invalidated (e.g., make_bad_inode on the
> directory)
>
> This results in the background processes all entering d_invalidate and 
> hanging,
> while with just one process in d_invalidate (e.g., stat'ing a file in the dir)
> things go pretty quickly as expected.
>
>
> (The proposed patch from other thread)
>
> diff --git a/fs/dcache.c b/fs/dcache.c
> index 7b8feb6..3a3b0f37 100644
> --- a/fs/dcache.c
> +++ b/fs/dcache.c
> @@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data,
> struct dentry *dentry)
>  goto out;
>
>  if (dentry->d_flags & DCACHE_SHRINK_LIST) {
> -   data->found++;
> +   goto out;
>  } else {
>  if (dentry->d_flags & DCACHE_LRU_LIST)
>  d_lru_del(dentry);
>
>
> khazhy

Would this change actually violate any guarantees? select_collect
looks like it used to ignore unused dentries that were part of a
shrink list before fe91522a7ba82ca1a51b07e19954b3825e4aaa22 (found
would not be incremented). Once a dentry is on a shrink list it is
unreachable anyway, so would returning from d_invalidate before all
the shrink lists finish processing be OK?

khazhy




Re: Hang/soft lockup in d_invalidate with simultaneous calls

2017-05-25 Thread Khazhismel Kumykov
On Mon, May 22, 2017 at 11:18 AM, Khazhismel Kumykov  wrote:
> On Wed, May 17, 2017 at 2:58 PM Khazhismel Kumykov  wrote:
>>
>> On Mon, May 15, 2017 at 5:05 PM, Khazhismel Kumykov  
>> wrote:
>> > Hi,
>> >
>> > I'm seeing behavior in d_invalidate, if multiple threads call d_invalidate 
>> > on
>> > the same tree at the same time, behavior blows up and all the calls hang 
>> > with
>> > large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k
>> > entries in d_subdir, and 5 or so threads calling d_invalidate was able to 
>> > hang
>> > my test VMs)
>> >
>> > This seems to be due to thrashing on the dentry locks in multiple threads
>> > tightly looping calling d_walk waiting for a shrink_dentry_list call (also
>> > thrashing the locks) to complete. (d_invalidate loops calling d_walk so 
>> > long as
>> > any dentry in the tree is in a dcache shrink list).
>> >
>> > There was a similar report recently "soft lockup in d_invalidate()" that
>> > proposed in the d_invalidate d_walk to ignore dentries marked as in a 
>> > shrink
>> > list already, which does prevent this hang/lockup in this case as well, 
>> > although
>> > I'm not sure it doesn't violate the contract for d_invalidate. (Although 
>> > the
>> > entries in a shrink list should be dropped eventually, not necessarily by 
>> > the
>> > time d_invalidate returns).
>> >
>> > If someone more familiar with the dcache could provide input I would 
>> > appreciate.
>> >
>> > A reliable repro on mainline is:
>> >  - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. fuse passthrough
>> >  - create a directory and populate with ~100k files with content to
>> > populate dcache
>> >  - create some background processes opening/reading files in this folder (5
>> >   while true; cat $file was enough to get an indefinite hang for me)
>> >  - cause the directory to need to be invalidated (e.g., make_bad_inode on 
>> > the
>> > directory)
>> >
>> > This results in the background processes all entering d_invalidate and 
>> > hanging,
>> > while with just one process in d_invalidate (e.g., stat'ing a file in the 
>> > dir)
>> > things go pretty quickly as expected.
>> >
>> >
>> > (The proposed patch from other thread)
>> >
>> > diff --git a/fs/dcache.c b/fs/dcache.c
>> > index 7b8feb6..3a3b0f37 100644
>> > --- a/fs/dcache.c
>> > +++ b/fs/dcache.c
>> > @@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data,
>> > struct dentry *dentry)
>> >  goto out;
>> >
>> >  if (dentry->d_flags & DCACHE_SHRINK_LIST) {
>> > -   data->found++;
>> > +   goto out;
>> >  } else {
>> >  if (dentry->d_flags & DCACHE_LRU_LIST)
>> >  d_lru_del(dentry);
>> >
>> >
>> > khazhy
>>
>> Would this change actually violate any guarantees? select_collect
>> looks like it used to ignore unused dentries that were part of a
>> shrink list before fe91522a7ba82ca1a51b07e19954b3825e4aaa22 (found
>> would not be incremented). Once the dentry is on a shrink list would
>> it be unreachable anyways, so returning from d_invalidate before all
>> the shrink lists finish processing would be ok?
>>
>> khazhy
>
> Pinging in case this got missed, would appreciate thoughts from someone more
> familiar with the dcache.
>
> My previous email wasn't completely correct, while before fe91522a7ba8
> ("don't remove from shrink list in select_collect()") d_invalidate
> would not busy
> wait for other workers calling shrink_list to complete, it would return -EBUSY,
> rather than success, so a change to return success without waiting would not
> be equivalent behavior before. Currently, we will loop calling d_walk 
> repeatedly
> causing the extreme slowdown observed.
>
> I still want to understand better, in d_invalidate if we can return
> without pruning
> in-use dentries safely, would returning before unused dentries are
> pruned, so long
> as we know they will be pruned by another task (are on a shrink list),
> be safe as well?
> If not, would it make more sense to have mutual exclusion on calling
> d_invalidate on the same dentries simultaneously?
>
> khazhy

ping

Again to summarize:
With a directory with larg

Re: Hang/soft lockup in d_invalidate with simultaneous calls

2017-05-22 Thread Khazhismel Kumykov
On Wed, May 17, 2017 at 2:58 PM Khazhismel Kumykov  wrote:
>
> On Mon, May 15, 2017 at 5:05 PM, Khazhismel Kumykov  wrote:
> > Hi,
> >
> > I'm seeing behavior in d_invalidate, if multiple threads call d_invalidate 
> > on
> > the same tree at the same time, behavior blows up and all the calls hang 
> > with
> > large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k
> > entries in d_subdir, and 5 or so threads calling d_invalidate was able to 
> > hang
> > my test VMs)
> >
> > This seems to be due to thrashing on the dentry locks in multiple threads
> > tightly looping calling d_walk waiting for a shrink_dentry_list call (also
> > thrashing the locks) to complete. (d_invalidate loops calling d_walk so 
> > long as
> > any dentry in the tree is in a dcache shrink list).
> >
> > There was a similar report recently "soft lockup in d_invalidate()" that
> > proposed in the d_invalidate d_walk to ignore dentries marked as in a shrink
> > list already, which does prevent this hang/lockup in this case as well, 
> > although
> > I'm not sure it doesn't violate the contract for d_invalidate. (Although the
> > entries in a shrink list should be dropped eventually, not necessarily by 
> > the
> > time d_invalidate returns).
> >
> > If someone more familiar with the dcache could provide input I would 
> > appreciate.
> >
> > A reliable repro on mainline is:
> >  - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. fuse passthrough
> >  - create a directory and populate with ~100k files with content to
> > populate dcache
> >  - create some background processes opening/reading files in this folder (5
> >   while true; cat $file was enough to get an indefinite hang for me)
> >  - cause the directory to need to be invalidated (e.g., make_bad_inode on 
> > the
> > directory)
> >
> > This results in the background processes all entering d_invalidate and 
> > hanging,
> > while with just one process in d_invalidate (e.g., stat'ing a file in the 
> > dir)
> > things go pretty quickly as expected.
> >
> >
> > (The proposed patch from other thread)
> >
> > diff --git a/fs/dcache.c b/fs/dcache.c
> > index 7b8feb6..3a3b0f37 100644
> > --- a/fs/dcache.c
> > +++ b/fs/dcache.c
> > @@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data,
> > struct dentry *dentry)
> >  goto out;
> >
> >  if (dentry->d_flags & DCACHE_SHRINK_LIST) {
> > -   data->found++;
> > +   goto out;
> >  } else {
> >  if (dentry->d_flags & DCACHE_LRU_LIST)
> >  d_lru_del(dentry);
> >
> >
> > khazhy
>
> Would this change actually violate any guarantees? select_collect
> looks like it used to ignore unused dentries that were part of a
> shrink list before fe91522a7ba82ca1a51b07e19954b3825e4aaa22 (found
> would not be incremented). Once the dentry is on a shrink list would
> it be unreachable anyways, so returning from d_invalidate before all
> the shrink lists finish processing would be ok?
>
> khazhy

Pinging in case this got missed, would appreciate thoughts from someone more
familiar with the dcache.

My previous email wasn't completely correct: while before fe91522a7ba8
("don't remove from shrink list in select_collect()") d_invalidate
would not busy-wait for other workers calling shrink_list to complete,
it would return -EBUSY rather than success, so a change to return
success without waiting would not match the old behavior. Currently,
we will loop calling d_walk repeatedly, causing the extreme slowdown
observed.

I still want to understand better: in d_invalidate, if we can safely
return without pruning in-use dentries, would returning before unused
dentries are pruned, so long as we know they will be pruned by another
task (they are on a shrink list), be safe as well? If not, would it
make more sense to have mutual exclusion on calling d_invalidate on
the same dentries simultaneously?

khazhy


Hang/soft lockup in d_invalidate with simultaneous calls

2017-05-15 Thread Khazhismel Kumykov
Hi,

I'm seeing behavior in d_invalidate where, if multiple threads call
d_invalidate on the same tree at the same time, behavior blows up and
all the calls hang with
large enough trees/enough simultaneous callers. (e.g. a directory w/ 100k
entries in d_subdir, and 5 or so threads calling d_invalidate was able to hang
my test VMs)

This seems to be due to thrashing on the dentry locks in multiple threads
tightly looping calling d_walk waiting for a shrink_dentry_list call (also
thrashing the locks) to complete. (d_invalidate loops calling d_walk so long as
any dentry in the tree is in a dcache shrink list).

There was a similar report recently "soft lockup in d_invalidate()" that
proposed in the d_invalidate d_walk to ignore dentries marked as in a shrink
list already, which does prevent this hang/lockup in this case as well, although
I'm not sure it doesn't violate the contract for d_invalidate. (Although the
entries in a shrink list should be dropped eventually, not necessarily by the
time d_invalidate returns).

If someone more familiar with the dcache could provide input I would appreciate.

A reliable repro on mainline is:
 - create a mountpoint with DCACHE_OP_REVALIDATE, e.g. fuse passthrough
 - create a directory and populate with ~100k files with content to
populate dcache
 - create some background processes opening/reading files in this folder (5
  while true; cat $file was enough to get an indefinite hang for me)
 - cause the directory to need to be invalidated (e.g., make_bad_inode on the
directory)

This results in the background processes all entering d_invalidate and hanging,
while with just one process in d_invalidate (e.g., stat'ing a file in the dir)
things go pretty quickly as expected.
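
For the "background readers" step above, an equivalent C helper looks
roughly like this (a hedged sketch; the thread count and usage are
illustrative, not part of the original repro):

/* Spawn N threads that repeatedly open and read one file each,
 * keeping its dentries busy while d_invalidate runs elsewhere. */
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define NREADERS 5

static void *reader(void *arg)
{
	const char *path = arg;
	char buf[4096];

	for (;;) {
		int fd = open(path, O_RDONLY);

		if (fd < 0)
			continue;
		while (read(fd, buf, sizeof(buf)) > 0)
			;
		close(fd);
	}
	return NULL;
}

int main(int argc, char **argv)
{
	pthread_t tid[NREADERS];
	int i;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file-in-test-dir>\n", argv[0]);
		return 1;
	}
	for (i = 0; i < NREADERS; i++)
		pthread_create(&tid[i], NULL, reader, argv[1]);
	pause();
	return 0;
}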


(The proposed patch from other thread)

diff --git a/fs/dcache.c b/fs/dcache.c
index 7b8feb6..3a3b0f37 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1364,7 +1364,7 @@ static enum d_walk_ret select_collect(void *_data,
struct dentry *dentry)
 goto out;

 if (dentry->d_flags & DCACHE_SHRINK_LIST) {
-   data->found++;
+   goto out;
 } else {
 if (dentry->d_flags & DCACHE_LRU_LIST)
 d_lru_del(dentry);


khazhy




[PATCH] ext4: Return EIO on read error in ext4_find_entry

2017-06-22 Thread Khazhismel Kumykov
Previously, a read error would be ignored and we would eventually return
NULL from ext4_find_entry, which signals "no such file or directory". We
should be returning EIO.

Signed-off-by: Khazhismel Kumykov 
---
 fs/ext4/namei.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 404256caf9cf..6fa17e9f7b6d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1428,11 +1428,11 @@ static struct buffer_head * ext4_find_entry (struct 
inode *dir,
goto next;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
-   /* read error, skip block & hope for the best */
EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
 (unsigned long) block);
brelse(bh);
-   goto next;
+   ret = ERR_PTR(-EIO);
+   goto cleanup_and_exit;
}
if (!buffer_verified(bh) &&
!is_dx_internal_node(dir, block,
-- 
2.13.1.611.g7e3b11ae1-goog





Re: [PATCH] ext4: Return EIO on read error in ext4_find_entry

2017-06-23 Thread Khazhismel Kumykov
On Fri, Jun 23, 2017 at 2:36 PM, Andreas Dilger  wrote:
> On Jun 23, 2017, at 6:26 AM, Theodore Ts'o  wrote:
>>
>> The problem is that if we continue, successive reads may all take
>> seconds or minutes to fail, thus tieing up the process for a long
>> time.
>
> Sorry, I don't understand where the seconds or minutes of delay come from?
> Is that because of long SCSI retries in the block layer, or in the disk
> itself, or something caused specifically because of this code?

For a networked block device we may be retrying for a while before
giving up, although this also applies to the initial failed read.

>
>> By returning EIO right away, we can "fast fail".
>
> But it seems like you don't necessarily need to fail at all?  Something like 
> the
> following would return an error if the entry is not found, but still search 
> the
> rest of the leaf blocks (if any) before giving up:
>

Giving up early or checking future blocks both work; the critical thing
here is not returning NULL after seeing a read error.
Prior to this, the behavior was to continue checking future blocks
after a read error, and that seemed OK.
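
A minimal sketch of that "keep scanning" variant (hypothetical, not the
committed fix): remember the read error in ret and keep going, so -EIO
is only returned if the entry is never found in the remaining blocks.

		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
					 (unsigned long) block);
			brelse(bh);
			/* remember the error, but keep checking later blocks */
			ret = ERR_PTR(-EIO);
			goto next;
		}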

Khazhy




Re: Hang/soft lockup in d_invalidate with simultaneous calls

2017-06-12 Thread Khazhismel Kumykov
On Fri, Jun 2, 2017 at 11:47 PM, Khazhismel Kumykov  wrote:
> On Fri, Jun 2, 2017 at 11:20 PM, Al Viro  wrote:
>> The thing is, unlike shrink_dcache_parent() we *can* bugger off as
>> soon as we'd found no victims, nothing mounted and dentry itself
>> is unhashed.  We can't do anything in select_collect() (we would've
>> broken shrink_dcache_parent() that way), but we can do unhashing
>> in check_and_drop() in "really nothing to do" case and we can return
>> from d_invalidate() after that.  So how about this:
> That does the trick.

I'm not entirely familiar with the process here: is the above change
committed somewhere, or should I propose a patch?




[PATCH] bfq: silence lockdep for bfqd/ioc lock inversion

2021-03-19 Thread Khazhismel Kumykov
lockdep warns of circular locking due to an inversion between
bfq_insert_requests and bfq_exit_icq. If we end up freeing a request
when merging, we *may* grab an ioc->lock if that request holds the last
refcount on that ioc. bfq_bio_merge could potentially have this
ordering as well. bfq_exit_icq, conversely, grabs bfqd->lock but is
always called with ioc->lock held.

bfq_exit_icq may be called from put_io_context_active with the ioc
refcount raised, from ioc_release_fn after the last refcount was
already dropped, or from ioc_clear_queue, which is only called while
the queue is quiesced or exiting, so the inverted orderings should
never conflict.

Fixes: aee69d78dec0 ("block, bfq: introduce the BFQ-v0 I/O scheduler as
an extra scheduler")

Signed-off-by: Khazhismel Kumykov 
---
 block/bfq-iosched.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

Noticed this lockdep warning while running xfstests (generic/464) on
top of a bfq block device. I was also able to tease it out with a
binary that issues requests which end up merging while rapidly swapping
the active scheduler. As far as I could see, the deadlock would not
actually occur, so this patch opts to change the lock class for the
inverted case.

bfqd -> ioc :
[ 2995.524557] __lock_acquire+0x18f5/0x2660
[ 2995.524562] lock_acquire+0xb4/0x3a0
[ 2995.524565] _raw_spin_lock_irqsave+0x3f/0x60
[ 2995.524569] put_io_context+0x33/0x90.  -> ioc->lock grabbed
[ 2995.524573] blk_mq_free_request+0x51/0x140
[ 2995.524577] blk_put_request+0xe/0x10
[ 2995.524580] blk_attempt_req_merge+0x1d/0x30
[ 2995.524585] elv_attempt_insert_merge+0x56/0xa0
[ 2995.524590] blk_mq_sched_try_insert_merge+0x4b/0x60
[ 2995.524595] bfq_insert_requests+0x9e/0x18c0.    -> bfqd->lock grabbed
[ 2995.524598] blk_mq_sched_insert_requests+0xd6/0x2b0
[ 2995.524602] blk_mq_flush_plug_list+0x154/0x280
[ 2995.524606] blk_finish_plug+0x40/0x60
[ 2995.524609] ext4_writepages+0x696/0x1320
[ 2995.524614] do_writepages+0x1c/0x80
[ 2995.524621] __filemap_fdatawrite_range+0xd7/0x120
[ 2995.524625] sync_file_range+0xac/0xf0
[ 2995.524642] __x64_sys_sync_file_range+0x44/0x70
[ 2995.524646] do_syscall_64+0x31/0x40
[ 2995.524649] entry_SYSCALL_64_after_hwframe+0x44/0xae

ioc -> bfqd
[ 2995.524490] _raw_spin_lock_irqsave+0x3f/0x60
[ 2995.524498] bfq_exit_icq+0xa3/0xe0 -> bfqd->lock grabbed
[ 2995.524512] put_io_context_active+0x78/0xb0 -> ioc->lock grabbed
[ 2995.524516] exit_io_context+0x48/0x50
[ 2995.524519] do_exit+0x7e9/0xdd0
[ 2995.524526] do_group_exit+0x54/0xc0
[ 2995.524530] __x64_sys_exit_group+0x18/0x20
[ 2995.524534] do_syscall_64+0x31/0x40
[ 2995.524537] entry_SYSCALL_64_after_hwframe+0x44/0xae

Another trace where we grab ioc -> bfqd through bfq_exit_icq is when
changing elevator
               -> #1 (&(&bfqd->lock)->rlock){-.-.}:
[  646.890820]        lock_acquire+0x9b/0x140
[  646.894868]        _raw_spin_lock_irqsave+0x3b/0x50
[  646.899707]        bfq_exit_icq_bfqq+0x47/0x1f0
[  646.904196]        bfq_exit_icq+0x21/0x30
[  646.908160]        ioc_destroy_icq+0xf3/0x130
[  646.912466]        ioc_clear_queue+0xb8/0x140
[  646.916771]        elevator_switch_mq+0xa4/0x3c0
[  646.921333]        elevator_switch+0x5f/0x340

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 95586137194e..cb50ac0ffe80 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5027,7 +5027,14 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, 
bool is_sync)
if (bfqq && bfqd) {
unsigned long flags;
 
-   spin_lock_irqsave(&bfqd->lock, flags);
+   /* bfq_exit_icq is usually called with ioc->lock held, which is
+* inverse order from elsewhere, which may grab ioc->lock
+* under bfqd->lock if we merge requests and drop the last ioc
+* refcount. Since exit_icq is either called with a refcount,
+* or with queue quiesced, use a different lock class to
+* silence lockdep
+*/
+   spin_lock_irqsave_nested(&bfqd->lock, flags, 1);
bfqq->bic = NULL;
bfq_exit_bfqq(bfqd, bfqq);
bic_set_bfqq(bic, NULL, is_sync);
-- 
2.31.0.rc2.261.g7f71774620-goog