Re: [PATCH 1/2] block: get rid of BLK_MAX_TIMEOUT

2018-12-06 Thread Weiping Zhang
Bart Van Assche  于2018年12月5日周三 下午11:59写道:
>
> On Wed, 2018-12-05 at 23:37 +0800, Weiping Zhang wrote:
> > @@ -130,7 +119,7 @@ void blk_add_timer(struct request *req)
> >* than an existing one, modify the timer. Round up to next nearest
> >* second.
> >*/
> > - expiry = blk_rq_timeout(round_jiffies_up(expiry));
> > + expiry = round_jiffies_up(expiry);
>
> If you would have read the comment above this code, you would have known
> that this patch does not do what you think it does and additionally that
> it introduces a regression.
>
Let's paste full comments here:
/*
 * If the timer isn't already pending or this timeout is earlier
 * than an existing one, modify the timer. Round up to next nearest
 * second.
 */
Before this patch, even we set io_timeout to 30*HZ(default), but
blk_rq_timeout always return jiffies +5*HZ,
  [1]. if there no pending request in timeout list, the timer callback
blk_rq_timed_out_timer will be called after 5*HZ, and then
blk_mq_check_expired will check is there exist some request
was delayed by compare jiffies and request->deadline, obvious
request is not timeout because we set request's timeouts is 30*HZ.
So for this case timer callback should be called at jiffies + 30 instead
of jiffies + 5*HZ.

  [2]. if there are pending request in timeout list, we compare request's
expiry and queue's expiry. If time_after(request->expire, queue->expire) modify
queue->timeout.expire to request->expire, otherwise do nothing.

So I think this patch just solve problem in [1], no other regression, or what's
I missing here ?

Thanks
Weiping
> Bart.


[PATCH 1/2] block: get rid of BLK_MAX_TIMEOUT

2018-12-05 Thread Weiping Zhang
Since io_timeout was added to sysfs, the user can tune timeouts
by that attribute, so kill this limitation.

Signed-off-by: Weiping Zhang 
---
 block/blk-timeout.c | 13 +
 block/blk.h |  4 
 2 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 124c26128bf6..6b2b0f7e5929 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -89,17 +89,6 @@ void blk_abort_request(struct request *req)
 }
 EXPORT_SYMBOL_GPL(blk_abort_request);
 
-unsigned long blk_rq_timeout(unsigned long timeout)
-{
-   unsigned long maxt;
-
-   maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
-   if (time_after(timeout, maxt))
-   timeout = maxt;
-
-   return timeout;
-}
-
 /**
  * blk_add_timer - Start timeout timer for a single request
  * @req:   request that is about to start running.
@@ -130,7 +119,7 @@ void blk_add_timer(struct request *req)
 * than an existing one, modify the timer. Round up to next nearest
 * second.
 */
-   expiry = blk_rq_timeout(round_jiffies_up(expiry));
+   expiry = round_jiffies_up(expiry);
 
if (!timer_pending(>timeout) ||
time_before(expiry, q->timeout.expires)) {
diff --git a/block/blk.h b/block/blk.h
index 848278c52030..f0d0a18fb276 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -7,9 +7,6 @@
 #include 
 #include "blk-mq.h"
 
-/* Max future timer expiry for timeouts */
-#define BLK_MAX_TIMEOUT(5 * HZ)
-
 #ifdef CONFIG_DEBUG_FS
 extern struct dentry *blk_debugfs_root;
 #endif
@@ -151,7 +148,6 @@ static inline bool bio_integrity_endio(struct bio *bio)
 }
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
-unsigned long blk_rq_timeout(unsigned long timeout);
 void blk_add_timer(struct request *req);
 
 bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
-- 
2.14.1



[PATCH 2/2] block: add BLK_DEF_TIMEOUT

2018-12-05 Thread Weiping Zhang
Add an obvious definition for default request timeout.

Signed-off-by: Weiping Zhang 
---
 block/blk-mq.c | 2 +-
 block/blk.h| 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 900550594651..53337d5c37af 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2845,7 +2845,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct 
blk_mq_tag_set *set,
goto err_hctxs;
 
INIT_WORK(>timeout_work, blk_mq_timeout_work);
-   blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
+   blk_queue_rq_timeout(q, set->timeout ? set->timeout : BLK_DEF_TIMEOUT);
 
q->tag_set = set;
 
diff --git a/block/blk.h b/block/blk.h
index f0d0a18fb276..7905b952b7b3 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -7,6 +7,9 @@
 #include 
 #include "blk-mq.h"
 
+/* the default request timeout */
+#define BLK_DEF_TIMEOUT (30 * HZ)
+
 #ifdef CONFIG_DEBUG_FS
 extern struct dentry *blk_debugfs_root;
 #endif
-- 
2.14.1



[PATCH 0/2] get rid of BLK_MAX_TIMEOUT

2018-12-05 Thread Weiping Zhang
Get rid of BLK_MAX_TIMEOUT, since we want use io_timeout to control
the maximun timeouts of a request, so remove this limitation.

Weiping Zhang (2):
  block: get rid of BLK_MAX_TIMEOUT
  block: add BLK_DEF_TIMEOUT

 block/blk-mq.c  |  2 +-
 block/blk-timeout.c | 13 +
 block/blk.h |  5 ++---
 3 files changed, 4 insertions(+), 16 deletions(-)

-- 
2.14.1



Re: [PATCH v3] block: add documentation for io_timeout

2018-12-05 Thread Weiping Zhang
Weiping Zhang  于2018年12月5日周三 下午10:49写道:
>
> Christoph Hellwig  于2018年12月5日周三 下午10:40写道:
> >
> > Can you please also send a patch to not show this attribute for
> > drivers without a timeout handler?  Thanks!
> >
Is there a simple way do that ?

Shall we return -ENOTSUPP when user read/write this attribute when
driver has no timeout handler ?
> Do you means checking q->mq_ops->timeout is NULL or not ?
>
> static void blk_mq_rq_timed_out(struct request *req, bool reserved)
> {
> req->rq_flags |= RQF_TIMED_OUT;
> If it's NULL, only re-start a timer for this request in blk_add_timer.
> Is there some defect ? if show this attribute and the driver doesn't
> support timeout handler ?
>
> if (req->q->mq_ops->timeout) {
> enum blk_eh_timer_return ret;
>
> ret = req->q->mq_ops->timeout(req, reserved);
> if (ret == BLK_EH_DONE)
> return;
> WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
>     }
>
> blk_add_timer(req);
> }
>
>
> > On Wed, Dec 05, 2018 at 10:17:06PM +0800, Weiping Zhang wrote:
> > > Add documentation for /sys/block//queue/io_timeout.
> > >
> > > Signed-off-by: Weiping Zhang 
> > > ---
> > >  Documentation/ABI/testing/sysfs-block | 10 ++
> > >  Documentation/block/queue-sysfs.txt   |  7 +++
> > >  2 files changed, 17 insertions(+)
> > >
> > > diff --git a/Documentation/ABI/testing/sysfs-block 
> > > b/Documentation/ABI/testing/sysfs-block
> > > index dea212db9df3..f254a374710a 100644
> > > --- a/Documentation/ABI/testing/sysfs-block
> > > +++ b/Documentation/ABI/testing/sysfs-block
> > > @@ -271,3 +271,13 @@ Description:
> > >   size of 512B sectors of the zones of the device, with
> > >   the eventual exception of the last zone of the device
> > >   which may be smaller.
> > > +
> > > +What:/sys/block//queue/io_timeout
> > > +Date:November 2018
> > > +Contact: Weiping Zhang 
> > > +Description:
> > > + io_timeout is a request’s timeouts at block layer in
> > > + milliseconds. When the underlying driver starts processing
> > > + a request, the generic block layer will start a timer, if
> > > + this request cannot be completed in io_timeout milliseconds,
> > > + a timeout event will occur.
> > > diff --git a/Documentation/block/queue-sysfs.txt 
> > > b/Documentation/block/queue-sysfs.txt
> > > index 2c1e67058fd3..f0c9bbce73fd 100644
> > > --- a/Documentation/block/queue-sysfs.txt
> > > +++ b/Documentation/block/queue-sysfs.txt
> > > @@ -67,6 +67,13 @@ If set to a value larger than 0, the kernel will put 
> > > the process issuing
> > >  IO to sleep for this amont of microseconds before entering classic
> > >  polling.
> > >
> > > +io_timeout (RW)
> > > +---
> > > +This is a request’s timeouts at block layer in milliseconds. When the
> > > +underlying driver starts processing a request, the generic block layer
> > > +will start a timer, if this request cannot be completed in io_timeout
> > > +milliseconds, a timeout event will occur.
> > > +
> > >  iostats (RW)
> > >  -
> > >  This file is used to control (on/off) the iostats accounting of the
> > > --
> > > 2.14.1
> > >
> > ---end quoted text---


Re: [PATCH v3] block: add documentation for io_timeout

2018-12-05 Thread Weiping Zhang
Christoph Hellwig  于2018年12月5日周三 下午10:40写道:
>
> Can you please also send a patch to not show this attribute for
> drivers without a timeout handler?  Thanks!
>
Do you means checking q->mq_ops->timeout is NULL or not ?

static void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
req->rq_flags |= RQF_TIMED_OUT;
If it's NULL, only re-start a timer for this request in blk_add_timer.
Is there some defect ? if show this attribute and the driver doesn't
support timeout handler ?

if (req->q->mq_ops->timeout) {
enum blk_eh_timer_return ret;

ret = req->q->mq_ops->timeout(req, reserved);
if (ret == BLK_EH_DONE)
return;
WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
}

blk_add_timer(req);
}


> On Wed, Dec 05, 2018 at 10:17:06PM +0800, Weiping Zhang wrote:
> > Add documentation for /sys/block//queue/io_timeout.
> >
> > Signed-off-by: Weiping Zhang 
> > ---
> >  Documentation/ABI/testing/sysfs-block | 10 ++
> >  Documentation/block/queue-sysfs.txt   |  7 +++
> >  2 files changed, 17 insertions(+)
> >
> > diff --git a/Documentation/ABI/testing/sysfs-block 
> > b/Documentation/ABI/testing/sysfs-block
> > index dea212db9df3..f254a374710a 100644
> > --- a/Documentation/ABI/testing/sysfs-block
> > +++ b/Documentation/ABI/testing/sysfs-block
> > @@ -271,3 +271,13 @@ Description:
> >   size of 512B sectors of the zones of the device, with
> >   the eventual exception of the last zone of the device
> >   which may be smaller.
> > +
> > +What:/sys/block//queue/io_timeout
> > +Date:November 2018
> > +Contact: Weiping Zhang 
> > +Description:
> > + io_timeout is a request’s timeouts at block layer in
> > + milliseconds. When the underlying driver starts processing
> > + a request, the generic block layer will start a timer, if
> > + this request cannot be completed in io_timeout milliseconds,
> > + a timeout event will occur.
> > diff --git a/Documentation/block/queue-sysfs.txt 
> > b/Documentation/block/queue-sysfs.txt
> > index 2c1e67058fd3..f0c9bbce73fd 100644
> > --- a/Documentation/block/queue-sysfs.txt
> > +++ b/Documentation/block/queue-sysfs.txt
> > @@ -67,6 +67,13 @@ If set to a value larger than 0, the kernel will put the 
> > process issuing
> >  IO to sleep for this amont of microseconds before entering classic
> >  polling.
> >
> > +io_timeout (RW)
> > +---
> > +This is a request’s timeouts at block layer in milliseconds. When the
> > +underlying driver starts processing a request, the generic block layer
> > +will start a timer, if this request cannot be completed in io_timeout
> > +milliseconds, a timeout event will occur.
> > +
> >  iostats (RW)
> >  -
> >  This file is used to control (on/off) the iostats accounting of the
> > --
> > 2.14.1
> >
> ---end quoted text---


[PATCH v3] block: add documentation for io_timeout

2018-12-05 Thread Weiping Zhang
Add documentation for /sys/block//queue/io_timeout.

Signed-off-by: Weiping Zhang 
---
 Documentation/ABI/testing/sysfs-block | 10 ++
 Documentation/block/queue-sysfs.txt   |  7 +++
 2 files changed, 17 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-block 
b/Documentation/ABI/testing/sysfs-block
index dea212db9df3..f254a374710a 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -271,3 +271,13 @@ Description:
size of 512B sectors of the zones of the device, with
the eventual exception of the last zone of the device
which may be smaller.
+
+What:  /sys/block//queue/io_timeout
+Date:  November 2018
+Contact:   Weiping Zhang 
+Description:
+   io_timeout is a request’s timeouts at block layer in
+   milliseconds. When the underlying driver starts processing
+   a request, the generic block layer will start a timer, if
+   this request cannot be completed in io_timeout milliseconds,
+   a timeout event will occur.
diff --git a/Documentation/block/queue-sysfs.txt 
b/Documentation/block/queue-sysfs.txt
index 2c1e67058fd3..f0c9bbce73fd 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -67,6 +67,13 @@ If set to a value larger than 0, the kernel will put the 
process issuing
 IO to sleep for this amont of microseconds before entering classic
 polling.
 
+io_timeout (RW)
+---
+This is a request’s timeouts at block layer in milliseconds. When the
+underlying driver starts processing a request, the generic block layer
+will start a timer, if this request cannot be completed in io_timeout
+milliseconds, a timeout event will occur.
+
 iostats (RW)
 -
 This file is used to control (on/off) the iostats accounting of the
-- 
2.14.1



[PATCH] block: add documentation for io_timeout

2018-11-28 Thread Weiping Zhang
add documentation for /sys/block//queue/io_timeout

Signed-off-by: Weiping Zhang 
---
 Documentation/ABI/testing/sysfs-block | 9 +
 Documentation/block/queue-sysfs.txt   | 6 ++
 2 files changed, 15 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-block 
b/Documentation/ABI/testing/sysfs-block
index dea212db9df3..f37cca16 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -271,3 +271,12 @@ Description:
size of 512B sectors of the zones of the device, with
the eventual exception of the last zone of the device
which may be smaller.
+
+What:  /sys/block//queue/io_timeout
+Date:  November 2018
+Contact:   Weiping Zhang 
+Description:
+   io_timeout is the timeout in millisecods of a request in
+   block layer. The block layer will start a timer when low
+   level device driver start the request, and cancle timer
+   when request completes.
diff --git a/Documentation/block/queue-sysfs.txt 
b/Documentation/block/queue-sysfs.txt
index 2c1e67058fd3..cbd44fe056fa 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -67,6 +67,12 @@ If set to a value larger than 0, the kernel will put the 
process issuing
 IO to sleep for this amont of microseconds before entering classic
 polling.
 
+io_timeout (RW)
+---
+This is the timeout in millisecods of a request in block layer.
+The block layer will start a timer when low level device driver start
+the request, and cancle timer when request completes.
+
 iostats (RW)
 -
 This file is used to control (on/off) the iostats accounting of the
-- 
2.14.1



[PATCH v3] block: add io timeout to sysfs

2018-11-28 Thread Weiping Zhang
Give a interface to adjust io timeout(ms) by device.

Signed-off-by: Weiping Zhang 
---

Changes since v2:
* use msecs_to_jiffies and jiffies_to_msecs

Changes since v1:
* make sure timeout > 0

 block/blk-sysfs.c | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 80eef48fddc8..9f0cb370b39b 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -417,6 +417,26 @@ static ssize_t queue_poll_store(struct request_queue *q, 
const char *page,
return ret;
 }
 
+static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
+{
+   return sprintf(page, "%u\n", jiffies_to_msecs(q->rq_timeout));
+}
+
+static ssize_t queue_io_timeout_store(struct request_queue *q, const char 
*page,
+ size_t count)
+{
+   unsigned int val;
+   int err;
+
+   err = kstrtou32(page, 10, );
+   if (err || val == 0)
+   return -EINVAL;
+
+   blk_queue_rq_timeout(q, msecs_to_jiffies(val));
+
+   return count;
+}
+
 static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
 {
if (!wbt_rq_qos(q))
@@ -685,6 +705,12 @@ static struct queue_sysfs_entry queue_dax_entry = {
.show = queue_dax_show,
 };
 
+static struct queue_sysfs_entry queue_io_timeout_entry = {
+   .attr = {.name = "io_timeout", .mode = 0644 },
+   .show = queue_io_timeout_show,
+   .store = queue_io_timeout_store,
+};
+
 static struct queue_sysfs_entry queue_wb_lat_entry = {
.attr = {.name = "wbt_lat_usec", .mode = 0644 },
.show = queue_wb_lat_show,
@@ -734,6 +760,7 @@ static struct attribute *default_attrs[] = {
_dax_entry.attr,
_wb_lat_entry.attr,
_poll_delay_entry.attr,
+   _io_timeout_entry.attr,
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
_sample_time_entry.attr,
 #endif
-- 
2.14.1



Re: [RFC PATCH v2] block: add io timeout to sysfs

2018-11-28 Thread Weiping Zhang
Hi Jens,

It's useful if user want a short timeout when a disk doesn't work normally.
Would you give some comments for this patch,

Thanks a lot
Weiping Zhang  于2018年11月19日周一 下午10:30写道:
>
> Give a interface to adjust io timeout by device.
>
> Signed-off-by: Weiping Zhang 
> ---
>
> Changes since v1:
> * make sure timeout > 0
>
>  block/blk-sysfs.c | 27 +++
>  1 file changed, 27 insertions(+)
>
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 80eef48fddc8..90a927514d30 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -417,6 +417,26 @@ static ssize_t queue_poll_store(struct request_queue *q, 
> const char *page,
> return ret;
>  }
>
> +static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
> +{
> +   return sprintf(page, "%u\n", q->rq_timeout);
> +}
> +
> +static ssize_t queue_io_timeout_store(struct request_queue *q, const char 
> *page,
> + size_t count)
> +{
> +   unsigned int val;
> +   int err;
> +
> +   err = kstrtou32(page, 10, );
> +   if (err || val == 0)
> +   return -EINVAL;
> +
> +   blk_queue_rq_timeout(q, val);
> +
> +   return count;
> +}
> +
>  static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
>  {
> if (!wbt_rq_qos(q))
> @@ -685,6 +705,12 @@ static struct queue_sysfs_entry queue_dax_entry = {
> .show = queue_dax_show,
>  };
>
> +static struct queue_sysfs_entry queue_io_timeout_entry = {
> +   .attr = {.name = "io_timeout", .mode = 0644 },
> +   .show = queue_io_timeout_show,
> +   .store = queue_io_timeout_store,
> +};
> +
>  static struct queue_sysfs_entry queue_wb_lat_entry = {
> .attr = {.name = "wbt_lat_usec", .mode = 0644 },
> .show = queue_wb_lat_show,
> @@ -734,6 +760,7 @@ static struct attribute *default_attrs[] = {
> _dax_entry.attr,
> _wb_lat_entry.attr,
> _poll_delay_entry.attr,
> +   _io_timeout_entry.attr,
>  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
> _sample_time_entry.attr,
>  #endif
> --
> 2.14.1
>


[RFC PATCH v2] block: add io timeout to sysfs

2018-11-19 Thread Weiping Zhang
Give a interface to adjust io timeout by device.

Signed-off-by: Weiping Zhang 
---

Changes since v1:
* make sure timeout > 0

 block/blk-sysfs.c | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 80eef48fddc8..90a927514d30 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -417,6 +417,26 @@ static ssize_t queue_poll_store(struct request_queue *q, 
const char *page,
return ret;
 }
 
+static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
+{
+   return sprintf(page, "%u\n", q->rq_timeout);
+}
+
+static ssize_t queue_io_timeout_store(struct request_queue *q, const char 
*page,
+ size_t count)
+{
+   unsigned int val;
+   int err;
+
+   err = kstrtou32(page, 10, );
+   if (err || val == 0)
+   return -EINVAL;
+
+   blk_queue_rq_timeout(q, val);
+
+   return count;
+}
+
 static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
 {
if (!wbt_rq_qos(q))
@@ -685,6 +705,12 @@ static struct queue_sysfs_entry queue_dax_entry = {
.show = queue_dax_show,
 };
 
+static struct queue_sysfs_entry queue_io_timeout_entry = {
+   .attr = {.name = "io_timeout", .mode = 0644 },
+   .show = queue_io_timeout_show,
+   .store = queue_io_timeout_store,
+};
+
 static struct queue_sysfs_entry queue_wb_lat_entry = {
.attr = {.name = "wbt_lat_usec", .mode = 0644 },
.show = queue_wb_lat_show,
@@ -734,6 +760,7 @@ static struct attribute *default_attrs[] = {
_dax_entry.attr,
_wb_lat_entry.attr,
_poll_delay_entry.attr,
+   _io_timeout_entry.attr,
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
_sample_time_entry.attr,
 #endif
-- 
2.14.1



[RFC] block: add io timeout to sysfs

2018-11-17 Thread Weiping Zhang
Give a interface to adjust io timeout by device.

Signed-off-by: Weiping Zhang 
---
 block/blk-sysfs.c | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 80eef48fddc8..0cabfb935e71 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -417,6 +417,26 @@ static ssize_t queue_poll_store(struct request_queue *q, 
const char *page,
return ret;
 }
 
+static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
+{
+   return sprintf(page, "%u\n", q->rq_timeout);
+}
+
+static ssize_t queue_io_timeout_store(struct request_queue *q, const char 
*page,
+ size_t count)
+{
+   unsigned int val;
+   int err;
+
+   err = kstrtou32(page, 10, );
+   if (err)
+   return -EINVAL;
+
+   blk_queue_rq_timeout(q, val);
+
+   return count;
+}
+
 static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
 {
if (!wbt_rq_qos(q))
@@ -685,6 +705,12 @@ static struct queue_sysfs_entry queue_dax_entry = {
.show = queue_dax_show,
 };
 
+static struct queue_sysfs_entry queue_io_timeout_entry = {
+   .attr = {.name = "io_timeout", .mode = 0644 },
+   .show = queue_io_timeout_show,
+   .store = queue_io_timeout_store,
+};
+
 static struct queue_sysfs_entry queue_wb_lat_entry = {
.attr = {.name = "wbt_lat_usec", .mode = 0644 },
.show = queue_wb_lat_show,
@@ -734,6 +760,7 @@ static struct attribute *default_attrs[] = {
_dax_entry.attr,
_wb_lat_entry.attr,
_poll_delay_entry.attr,
+   _io_timeout_entry.attr,
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
_sample_time_entry.attr,
 #endif
-- 
2.14.1



Re: [RFC PATCH 0/2] use larger max_request_size for virtio_blk

2018-04-07 Thread Weiping Zhang
2018-04-05 22:29 GMT+08:00 Jens Axboe <ax...@kernel.dk>:
> On 4/5/18 4:09 AM, Weiping Zhang wrote:
>> Hi,
>>
>> For virtio block device, actually there is no a hard limit for max request
>> size, and virtio_blk driver set -1 to blk_queue_max_hw_sectors(q, -1U);.
>> But it doesn't work, because there is a default upper limitation
>> BLK_DEF_MAX_SECTORS (1280 sectors). So this series want to add a new helper
>> blk_queue_max_hw_sectors_no_limit to set a proper max reqeust size.
>>
>> Weiping Zhang (2):
>>   blk-setting: add new helper blk_queue_max_hw_sectors_no_limit
>>   virtio_blk: add new module parameter to set max request size
>>
>>  block/blk-settings.c   | 20 
>>  drivers/block/virtio_blk.c | 32 ++--
>>  include/linux/blkdev.h |  2 ++
>>  3 files changed, 52 insertions(+), 2 deletions(-)
>
> The driver should just use blk_queue_max_hw_sectors() to set the limit,
> and then the soft limit can be modified by a udev rule. Technically the
> driver doesn't own the software limit, it's imposed to ensure that we
> don't introduce too much latency per request.
>
> Your situation is no different from many other setups, where the
> hw limit is much higher than the default 1280k.
>
Hi Martin, Jens,

It seems more reasonable to change software limitation by udev rule,
thanks you.

>
> ___
> Virtualization mailing list
> virtualizat...@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[RFC PATCH 1/2] blk-setting: add new helper blk_queue_max_hw_sectors_no_limit

2018-04-05 Thread Weiping Zhang
There is a default upper limitation BLK_DEF_MAX_SECTORS, but for
some virtual block device driver there is no such limitation. So
add a new help to set max request size.

Signed-off-by: Weiping Zhang <zhangweip...@didichuxing.com>
---
 block/blk-settings.c   | 20 
 include/linux/blkdev.h |  2 ++
 2 files changed, 22 insertions(+)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 48ebe6b..685c30c 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -253,6 +253,26 @@ void blk_queue_max_hw_sectors(struct request_queue *q, 
unsigned int max_hw_secto
 }
 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
 
+/* same as blk_queue_max_hw_sectors but without default upper limitation */
+void blk_queue_max_hw_sectors_no_limit(struct request_queue *q,
+   unsigned int max_hw_sectors)
+{
+   struct queue_limits *limits = >limits;
+   unsigned int max_sectors;
+
+   if ((max_hw_sectors << 9) < PAGE_SIZE) {
+   max_hw_sectors = 1 << (PAGE_SHIFT - 9);
+   printk(KERN_INFO "%s: set to minimum %d\n",
+  __func__, max_hw_sectors);
+   }
+
+   limits->max_hw_sectors = max_hw_sectors;
+   max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
+   limits->max_sectors = max_sectors;
+   q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9);
+}
+EXPORT_SYMBOL(blk_queue_max_hw_sectors_no_limit);
+
 /**
  * blk_queue_chunk_sectors - set size of the chunk for this queue
  * @q:  the request queue for the device
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ed63f3b..2250709 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1243,6 +1243,8 @@ extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
+extern void blk_queue_max_hw_sectors_no_limit(struct request_queue *,
+   unsigned int);
 extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_discard_segments(struct request_queue *,
-- 
2.9.4



[RFC PATCH 0/2] use larger max_request_size for virtio_blk

2018-04-05 Thread Weiping Zhang
Hi,

For virtio block device, actually there is no a hard limit for max request
size, and virtio_blk driver set -1 to blk_queue_max_hw_sectors(q, -1U);.
But it doesn't work, because there is a default upper limitation
BLK_DEF_MAX_SECTORS (1280 sectors). So this series want to add a new helper
blk_queue_max_hw_sectors_no_limit to set a proper max reqeust size.

Weiping Zhang (2):
  blk-setting: add new helper blk_queue_max_hw_sectors_no_limit
  virtio_blk: add new module parameter to set max request size

 block/blk-settings.c   | 20 
 drivers/block/virtio_blk.c | 32 ++--
 include/linux/blkdev.h |  2 ++
 3 files changed, 52 insertions(+), 2 deletions(-)

-- 
2.9.4



[RFC PATCH 2/2] virtio_blk: add new module parameter to set max request size

2018-04-05 Thread Weiping Zhang
Actually there is no upper limitation, so add new module parameter
to provide a way to set a proper max request size for virtio block.
Using a larger request size can improve sequence performance in theory,
and reduce the interaction between guest and hypervisor.

Signed-off-by: Weiping Zhang <zhangweip...@didichuxing.com>
---
 drivers/block/virtio_blk.c | 32 ++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 4a07593c..5ac6d59 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -64,6 +64,34 @@ struct virtblk_req {
struct scatterlist sg[];
 };
 
+
+static int max_request_size_set(const char *val, const struct kernel_param 
*kp);
+
+static const struct kernel_param_ops max_request_size_ops = {
+   .set = max_request_size_set,
+   .get = param_get_uint,
+};
+
+static unsigned long max_request_size = 4096; /* in unit of KiB */
+module_param_cb(max_request_size, _request_size_ops, _request_size,
+   0444);
+MODULE_PARM_DESC(max_request_size, "set max request size, in unit of KiB");
+
+static int max_request_size_set(const char *val, const struct kernel_param *kp)
+{
+   int ret;
+   unsigned int size_kb, page_kb = 1 << (PAGE_SHIFT - 10);
+
+   ret = kstrtouint(val, 10, _kb);
+   if (ret != 0)
+   return -EINVAL;
+
+   if (size_kb < page_kb)
+   return -EINVAL;
+
+   return param_set_uint(val, kp);
+}
+
 static inline blk_status_t virtblk_result(struct virtblk_req *vbr)
 {
switch (vbr->status) {
@@ -730,8 +758,8 @@ static int virtblk_probe(struct virtio_device *vdev)
/* We can handle whatever the host told us to handle. */
blk_queue_max_segments(q, vblk->sg_elems-2);
 
-   /* No real sector limit. */
-   blk_queue_max_hw_sectors(q, -1U);
+   /* No real sector limit, use 512b (max_request_size << 10) >> 9 */
+   blk_queue_max_hw_sectors_no_limit(q, max_request_size << 1);
 
/* Host can optionally specify maximum segment size and number of
 * segments. */
-- 
2.9.4



Re: [PATCH v2] bdi: show error log when fail to create bdi debugfs entry

2018-01-19 Thread weiping zhang
2018-01-20 3:54 GMT+08:00 Jens Axboe <ax...@kernel.dk>:
> On 1/19/18 10:36 AM, weiping zhang wrote:
>> 2018-01-11 0:36 GMT+08:00 weiping zhang <zhangweip...@didichuxing.com>:
>>> bdi debugfs dir/file may create fail, add error log here.
>>>
>>> Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
>>> ---
>>> V1->V2:
>>> fix indentation and make log message more clear
>>>
>>>  mm/backing-dev.c | 4 +++-
>>>  1 file changed, 3 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
>>> index b5f940c..0a49665 100644
>>> --- a/mm/backing-dev.c
>>> +++ b/mm/backing-dev.c
>>> @@ -885,7 +885,9 @@ int bdi_register_va(struct backing_dev_info *bdi, const 
>>> char *fmt, va_list args)
>>> cgwb_bdi_register(bdi);
>>> bdi->dev = dev;
>>>
>>> -   bdi_debug_register(bdi, dev_name(dev));
>>> +   if (bdi_debug_register(bdi, dev_name(dev)))
>>> +   pr_warn("blkdev %s: creation of bdi debugfs entries 
>>> failed.\n",
>>> +   dev_name(dev));
>>> set_bit(WB_registered, >wb.state);
>>>
>>> spin_lock_bh(_lock);
>>> --
>>
>> Hi Jens,
>>
>> madam has no permission to create debuts entry if SELINUX is enable at
>> Fedora and Centos,
>> and we have revert 6d0e4827b72 Revert "bdi: add error handle for
>> bdi_debug_register", that is to say
>> bdi debugfs is not the key component of block device, this patch just
>> add warning log.
>
> Have we fixed the case where we know it will trigger?

The reason is that mdadm has no permission to create dir/file under
/sys/kernel/debug/
, I think we can solve it in two possible ways.

1.Add proper SELINUX policy to allow mdadm create
/sys/kernel/debug/bdi/xxx, but not
every user add this allowance, so kernel show a warning for this case.

2.Split mdadm into 2 part, Firstly, user proccess mdadm trigger a
kwork and wait a event
done, secondly kwork will create gendisk)and then wake up event. But
it is more likely a
hack, and it may break SELINUX mechanism, so I give up this way.
https://marc.info/?l=linux-mm=151456540928231=2

> --
> Jens Axboe
>


Re: [PATCH] blk-throttle: use queue_is_rq_based

2018-01-19 Thread weiping zhang
2018-01-20 3:58 GMT+08:00 Jens Axboe <ax...@kernel.dk>:
> On 1/19/18 10:40 AM, weiping zhang wrote:
>> use queue_is_rq_based instead of open code.
>>
>> Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
>> ---
>>  block/blk-throttle.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
>> index 96ad326..457e985 100644
>> --- a/block/blk-throttle.c
>> +++ b/block/blk-throttle.c
>> @@ -2456,7 +2456,7 @@ void blk_throtl_register_queue(struct request_queue *q)
>>   td->throtl_slice = DFL_THROTL_SLICE_HD;
>>  #endif
>>
>> - td->track_bio_latency = !q->mq_ops && !q->request_fn;
>> + td->track_bio_latency = !(queue_is_rq_based(q));
>
> Kill the extra parenthesis here.

Oh, kill it at V2.
> --
> Jens Axboe
>


[PATCH v2] blk-throttle: use queue_is_rq_based

2018-01-19 Thread weiping zhang
use queue_is_rq_based instead of open code.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-throttle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e136f5e..c475f0f 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2489,7 +2489,7 @@ void blk_throtl_register_queue(struct request_queue *q)
td->throtl_slice = DFL_THROTL_SLICE_HD;
 #endif
 
-   td->track_bio_latency = !q->mq_ops && !q->request_fn;
+   td->track_bio_latency = !queue_is_rq_based(q);
if (!td->track_bio_latency)
blk_stat_enable_accounting(q);
 }
-- 
2.9.4



[PATCH] blk-throttle: use queue_is_rq_based

2018-01-19 Thread weiping zhang
use queue_is_rq_based instead of open code.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-throttle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 96ad326..457e985 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2456,7 +2456,7 @@ void blk_throtl_register_queue(struct request_queue *q)
td->throtl_slice = DFL_THROTL_SLICE_HD;
 #endif
 
-   td->track_bio_latency = !q->mq_ops && !q->request_fn;
+   td->track_bio_latency = !(queue_is_rq_based(q));
if (!td->track_bio_latency)
blk_stat_enable_accounting(q);
 }
-- 
2.9.4



Re: [PATCH v2] bdi: show error log when fail to create bdi debugfs entry

2018-01-19 Thread weiping zhang
2018-01-11 0:36 GMT+08:00 weiping zhang <zhangweip...@didichuxing.com>:
> bdi debugfs dir/file may create fail, add error log here.
>
> Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> ---
> V1->V2:
> fix indentation and make log message more clear
>
>  mm/backing-dev.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> index b5f940c..0a49665 100644
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -885,7 +885,9 @@ int bdi_register_va(struct backing_dev_info *bdi, const 
> char *fmt, va_list args)
> cgwb_bdi_register(bdi);
> bdi->dev = dev;
>
> -   bdi_debug_register(bdi, dev_name(dev));
> +   if (bdi_debug_register(bdi, dev_name(dev)))
> +   pr_warn("blkdev %s: creation of bdi debugfs entries 
> failed.\n",
> +   dev_name(dev));
> set_bit(WB_registered, >wb.state);
>
> spin_lock_bh(_lock);
> --

Hi Jens,

madam has no permission to create debuts entry if SELINUX is enable at
Fedora and Centos,
and we have revert 6d0e4827b72 Revert "bdi: add error handle for
bdi_debug_register", that is to say
bdi debugfs is not the key component of block device, this patch just
add warning log.

Thanks


Re: [RFC] blk-throttle: export io_serviced_recursive, io_service_bytes_recursive

2018-01-18 Thread weiping zhang
2017-12-11 23:17 GMT+08:00 Tejun Heo <t...@kernel.org>:
> On Mon, Dec 11, 2017 at 10:56:25PM +0800, weiping zhang wrote:
>> export these two interface for cgroup-v1.
>>
>> Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
>
> Acked-by: Tejun Heo <t...@kernel.org>
>

Hi Jens,

Have you got time to check this patch ?

Thanks

> Thanks.
>
> --
> tejun


[RFC PATCH] blktrace: fail earlier if blk_trace in use

2018-01-15 Thread weiping zhang
add a check before allocate resource for blk_trace, if it's in use.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 kernel/trace/blktrace.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 987d9a9a..16c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -491,6 +491,9 @@ static int do_blk_trace_setup(struct request_queue *q, char 
*name, dev_t dev,
struct dentry *dir = NULL;
int ret;
 
+   if (unlikely(q->blk_trace))
+   return -EBUSY;
+
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;
 
-- 
2.9.4



[PATCH v2] bdi: show error log when fail to create bdi debugfs entry

2018-01-10 Thread weiping zhang
bdi debugfs dir/file may create fail, add error log here.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
V1->V2:
fix indentation and make log message more clear

 mm/backing-dev.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index b5f940c..0a49665 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -885,7 +885,9 @@ int bdi_register_va(struct backing_dev_info *bdi, const 
char *fmt, va_list args)
cgwb_bdi_register(bdi);
bdi->dev = dev;
 
-   bdi_debug_register(bdi, dev_name(dev));
+   if (bdi_debug_register(bdi, dev_name(dev)))
+   pr_warn("blkdev %s: creation of bdi debugfs entries failed.\n",
+   dev_name(dev));
set_bit(WB_registered, >wb.state);
 
spin_lock_bh(_lock);
-- 
2.9.4



Re: [PATCH] bdi: show error log when fail to create bdi debugfs entry

2018-01-10 Thread weiping zhang
2018-01-11 0:10 GMT+08:00 Bart Van Assche <bart.vanass...@wdc.com>:
> On Wed, 2018-01-10 at 23:18 +0800, weiping zhang wrote:
>> bdi debugfs dir/file may create fail, add error log here.
>>
>> Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
>> ---
>>  mm/backing-dev.c | 3 ++-
>>  1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
>> index b5f940c..9117c21 100644
>> --- a/mm/backing-dev.c
>> +++ b/mm/backing-dev.c
>> @@ -885,7 +885,8 @@ int bdi_register_va(struct backing_dev_info *bdi, const 
>> char *fmt, va_list args)
>>   cgwb_bdi_register(bdi);
>>   bdi->dev = dev;
>>
>> - bdi_debug_register(bdi, dev_name(dev));
>> +  if (bdi_debug_register(bdi, dev_name(dev)))
>> +  pr_warn("blkdev %s create bdi debugfs failed\n", 
>> dev_name(dev));
>>   set_bit(WB_registered, >wb.state);
>>
>>   spin_lock_bh(_lock);
>
> The indentation of the if-statement is inconsistent. Have you verified this
> patch with checkpatch before submitting it?
>
> Additionally, the error message is hard to understand. Please make it more 
> clear,
> e.g. as follows: "%s: creation of bdi debugfs entries failed.\n".
>
OK, fix them at V2, thanks a lot
> Thanks,
>
> Bart.


[PATCH] bdi: show error log when fail to create bdi debugfs entry

2018-01-10 Thread weiping zhang
bdi debugfs dir/file may create fail, add error log here.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 mm/backing-dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index b5f940c..9117c21 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -885,7 +885,8 @@ int bdi_register_va(struct backing_dev_info *bdi, const 
char *fmt, va_list args)
cgwb_bdi_register(bdi);
bdi->dev = dev;
 
-   bdi_debug_register(bdi, dev_name(dev));
+if (bdi_debug_register(bdi, dev_name(dev)))
+pr_warn("blkdev %s create bdi debugfs failed\n", 
dev_name(dev));
set_bit(WB_registered, >wb.state);
 
spin_lock_bh(_lock);
-- 
2.9.4



Re: [RFC] blk-throttle: export io_serviced_recursive, io_service_bytes_recursive

2017-12-29 Thread weiping zhang
2017-12-11 23:17 GMT+08:00 Tejun Heo <t...@kernel.org>:
> On Mon, Dec 11, 2017 at 10:56:25PM +0800, weiping zhang wrote:
>> export these two interface for cgroup-v1.
>>
>> Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
>
> Acked-by: Tejun Heo <t...@kernel.org>
>
> Thanks.
>

Hi Jens,

these two interface are useful to collect blkcg info, would you pick it up ?

Thanks


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-29 Thread weiping zhang
On Fri, Dec 22, 2017 at 08:04:23AM -0600, Bruno Wolff III wrote:
> On Fri, Dec 22, 2017 at 21:20:10 +0800,
>  weiping zhang <zwp10...@gmail.com> wrote:
> >2017-12-22 12:53 GMT+08:00 Bruno Wolff III <br...@wolff.to>:
> >>On Thu, Dec 21, 2017 at 17:16:03 -0600,
> >> Bruno Wolff III <br...@wolff.to> wrote:
> >>>
> >>>
> >>>Enforcing mode alone isn't enough as I tested that one one machine at home
> >>>and it didn't trigger the problem. I'll try another machine late tonight.
> >>
> >>
> >>I got the problem to occur on my i686 machine when booting in enforcing
> >>mode. This machine uses raid 1 vua mdraid which may or may not be a factor
> >>in this problem. The boot log has a trace at the end and might be helpful,
> >>so I'm attaching it here.
> >Hi Bruno,
> >I can reproduce this issue in my QEMU test VM easily, just add an soft
> >RAID1, always trigger
> >that warning, I'll debug it later.
> 
> Great. When you have a fix, I can test it.
This issue can trigger easily in Centos7.3, if meet two factors:
1. SELINUX in enforceing mode
2. mdadm try to create new gendisk.

if disable SELINUX or let it in permissive mode, issue disappear.
As Jens has revert that commit, it seems boot normally, actually
this is no diretor created under /sys/kernel/debug/bdi/, though
has no effect on disk workflow.

As james said before, "debugfs files should be treated as optional",
so kernel give warning here is enough.

So there are 2 ways to fix this issue:
1. Add proper SELINUX policy allow mdadm create dir at debugfs
2. mdadm don't create gendisk directly, first mdadm trigger a kwork and
wait it done, let kwork create gendisk.
A possible change for MD like following:

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4e4dee0..86ead5a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -90,6 +90,7 @@
 EXPORT_SYMBOL(md_cluster_mod);
 
 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+static struct workqueue_struct *md_probe_wq;
 static struct workqueue_struct *md_wq;
 static struct workqueue_struct *md_misc_wq;
 
@@ -5367,10 +5368,27 @@ static int md_alloc(dev_t dev, char *name)
return error;
 }
 
+static void md_probe_work_fn(struct work_struct *ws)
+{
+   struct md_probe_work *mpw = container_of(ws, struct md_probe_work,
+   work);
+   md_alloc(mpw->dev, NULL);
+   mpw->done = 1;
+   wake_up(>wait);
+}
+
 static struct kobject *md_probe(dev_t dev, int *part, void *data)
 {
-   if (create_on_open)
-   md_alloc(dev, NULL);
+   struct md_probe_work mpw;
+
+   if (create_on_open) {
+   init_waitqueue_head();
+   mpw.dev = dev;
+   mpw.done = 0;
+   INIT_WORK(, md_probe_work_fn);
+   queue_work(md_probe_wq, );
+   wait_event(mpw.wait, mpw.done);
+   }
return NULL;
 }
 
@@ -9023,9 +9041,13 @@ static int __init md_init(void)
 {
int ret = -ENOMEM;
 
+   md_probe_wq = alloc_workqueue("md_probe", 0, 0);
+   if (!md_probe_wq)
+   goto err_wq;
+
md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
if (!md_wq)
-   goto err_wq;
+   goto err_probe_wq;
 
md_misc_wq = alloc_workqueue("md_misc", 0, 0);
if (!md_misc_wq)
@@ -9055,6 +9077,8 @@ static int __init md_init(void)
destroy_workqueue(md_misc_wq);
 err_misc_wq:
destroy_workqueue(md_wq);
+err_probe_wq:
+   destroy_workqueue(md_probe_wq);
 err_wq:
return ret;
 }
@@ -9311,6 +9335,7 @@ static __exit void md_exit(void)
}
destroy_workqueue(md_misc_wq);
destroy_workqueue(md_wq);
+   destroy_workqueue(md_probe_wq);
 }
 
 subsys_initcall(md_init);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7d6bcf0..3953896 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -487,6 +487,13 @@ enum recovery_flags {
MD_RECOVERY_ERROR,  /* sync-action interrupted because io-error */
 };
 
+struct md_probe_work {
+   struct work_struct work;
+   wait_queue_head_t wait;
+   dev_t dev;
+   int done;
+};
+
 static inline int __must_check mddev_lock(struct mddev *mddev)
 {
return mutex_lock_interruptible(>reconfig_mutex);


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-29 Thread weiping zhang
On Fri, Dec 22, 2017 at 08:04:23AM -0600, Bruno Wolff III wrote:
> On Fri, Dec 22, 2017 at 21:20:10 +0800,
>  weiping zhang <zwp10...@gmail.com> wrote:
> >2017-12-22 12:53 GMT+08:00 Bruno Wolff III <br...@wolff.to>:
> >>On Thu, Dec 21, 2017 at 17:16:03 -0600,
> >> Bruno Wolff III <br...@wolff.to> wrote:
> >>>
> >>>
> >>>Enforcing mode alone isn't enough as I tested that one one machine at home
> >>>and it didn't trigger the problem. I'll try another machine late tonight.
> >>
> >>
> >>I got the problem to occur on my i686 machine when booting in enforcing
> >>mode. This machine uses raid 1 vua mdraid which may or may not be a factor
> >>in this problem. The boot log has a trace at the end and might be helpful,
> >>so I'm attaching it here.
> >Hi Bruno,
> >I can reproduce this issue in my QEMU test VM easily, just add an soft
> >RAID1, always trigger
> >that warning, I'll debug it later.
> 
> Great. When you have a fix, I can test it.
This issue can trigger easily in Centos7.3 + kernel-4.15-rc3, if meet two 
factors:
1. SELINUX in enforceing mode
2. mdadm try to create new gendisk.

if disable SELINUX or let it in permissive mode, issue disappear.
As Jens has revert that commit, it seems boot normally, actually
there is no diretory created under /sys/kernel/debug/bdi/, though
has no effect on disk workflow.

As James said before, "debugfs files should be treated as optional",
so kernel give warning here is enough.

So, we may solve this issue in two ways:
1. Add proper SELINUX policy that give permission to mdadm for debugfs.
2. Split mdadm into 2 part, Firstly, user proccess mdadm trigger a kwork,
secondly kwork will create gendisk)and mdadm wait it done, Like
following: 

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4e4dee0..86ead5a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -90,6 +90,7 @@
 EXPORT_SYMBOL(md_cluster_mod);
 
 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+static struct workqueue_struct *md_probe_wq;
 static struct workqueue_struct *md_wq;
 static struct workqueue_struct *md_misc_wq;
 
@@ -5367,10 +5368,27 @@ static int md_alloc(dev_t dev, char *name)
return error;
 }
 
+static void md_probe_work_fn(struct work_struct *ws)
+{
+   struct md_probe_work *mpw = container_of(ws, struct md_probe_work,
+   work);
+   md_alloc(mpw->dev, NULL);
+   mpw->done = 1;
+   wake_up(>wait);
+}
+
 static struct kobject *md_probe(dev_t dev, int *part, void *data)
 {
-   if (create_on_open)
-   md_alloc(dev, NULL);
+   struct md_probe_work mpw;
+
+   if (create_on_open) {
+   init_waitqueue_head();
+   mpw.dev = dev;
+   mpw.done = 0;
+   INIT_WORK(, md_probe_work_fn);
+   queue_work(md_probe_wq, );
+   wait_event(mpw.wait, mpw.done);
+   }
return NULL;
 }
 
@@ -9023,9 +9041,13 @@ static int __init md_init(void)
 {
int ret = -ENOMEM;
 
+   md_probe_wq = alloc_workqueue("md_probe", 0, 0);
+   if (!md_probe_wq)
+   goto err_wq;
+
md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
if (!md_wq)
-   goto err_wq;
+   goto err_probe_wq;
 
md_misc_wq = alloc_workqueue("md_misc", 0, 0);
if (!md_misc_wq)
@@ -9055,6 +9077,8 @@ static int __init md_init(void)
destroy_workqueue(md_misc_wq);
 err_misc_wq:
destroy_workqueue(md_wq);
+err_probe_wq:
+   destroy_workqueue(md_probe_wq);
 err_wq:
return ret;
 }
@@ -9311,6 +9335,7 @@ static __exit void md_exit(void)
}
destroy_workqueue(md_misc_wq);
destroy_workqueue(md_wq);
+   destroy_workqueue(md_probe_wq);
 }
 
 subsys_initcall(md_init);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7d6bcf0..3953896 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -487,6 +487,13 @@ enum recovery_flags {
MD_RECOVERY_ERROR,  /* sync-action interrupted because io-error */
 };
 
+struct md_probe_work {
+   struct work_struct work;
+   wait_queue_head_t wait;
+   dev_t dev;
+   int done;
+};
+
 static inline int __must_check mddev_lock(struct mddev *mddev)
 {
return mutex_lock_interruptible(>reconfig_mutex);


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-22 Thread weiping zhang
2017-12-22 12:53 GMT+08:00 Bruno Wolff III :
> On Thu, Dec 21, 2017 at 17:16:03 -0600,
>  Bruno Wolff III  wrote:
>>
>>
>> Enforcing mode alone isn't enough as I tested that one one machine at home
>> and it didn't trigger the problem. I'll try another machine late tonight.
>
>
> I got the problem to occur on my i686 machine when booting in enforcing
> mode. This machine uses raid 1 vua mdraid which may or may not be a factor
> in this problem. The boot log has a trace at the end and might be helpful,
> so I'm attaching it here.
Hi Bruno,
I can reproduce this issue in my QEMU test VM easily, just add an soft
RAID1, always trigger
that warning, I'll debug it later.


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-21 Thread weiping zhang
2017-12-21 23:36 GMT+08:00 Bruno Wolff III <br...@wolff.to>:
> On Thu, Dec 21, 2017 at 23:31:40 +0800,
>  weiping zhang <zwp10...@gmail.com> wrote:
>>
>> does every time boot fail can trigger WANRING in device_add_disk ?
>
>
> Not that I see. But the message could scroll off the screen. The boot gets
> far enough that systemd copies over dmesg output to permanent storage that I
> can see on my next successful boot. That's where I looked for the warning
> output you want. I never saw it for any kernels I compiled myself. Only when
> I test kernels built by Fedora do I see it.
see it every boot ?

> I just tried booting to single user and the boot still hangs.
>
> When I build the kernels, the compiler options are probably a bit different
> than when Fedora does. That might affect what happens during boot.


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-21 Thread weiping zhang
2017-12-21 21:00 GMT+08:00 Bruno Wolff III :
> After today, I won't have physical access to the problem machine until
> January 2nd. So if you guys have any testing suggestions I need them soon if
> they are to get done before my vacation.
> I do plan to try booting to level 1 to see if I can get a login prompt that
> might facilitate testing. The lockups do happen fairly late in the boot
> process. I never get to X, but maybe it will get far enough for a console
> login.
>
Hi,
how do you do bisect ?build all kernel commit one by one ?
as you did before:
https://bugzilla.redhat.com/show_bug.cgi?id=1520982

what kernel source code do you use that occur warning at device_add_disk?
from fedora or any official release ? if so ,could you provide web link?

if you use same kernel source code and same .config, why your own build
Cann't trigger that warning ?


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-17 Thread weiping zhang
2017-12-17 0:32 GMT+08:00 Bruno Wolff III :
> On Fri, Dec 15, 2017 at 13:51:22 -0600,
>  Bruno Wolff III  wrote:
>>
>>
>> I do not know what is different. Do you have any ideas? Most likely I
>> won't be able to test any more kernels until Monday (unless I can use most
>> of my most recent build over again very soon).
>
>
> The .config looks like it should be OK. I'll test setting loglevel on boot
> in case the default is different than what the config file says. I can't do
> that until Monday morning.
>
> I think it is more likely the the WARN_ON macro code isn't being compiled in
> for some reason. I haven't confirmed that, nor have I found anything that
> would leave that code out when I do a make, but include it during Fedora
> builds.
Hi, thanks for testing, I think you first reproduce this issue(got WARNING
at device_add_disk) by your own build, then add my debug patch.


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-15 Thread weiping zhang
2017-12-15 19:10 GMT+08:00 Bruno Wolff III <br...@wolff.to>:
> On Fri, Dec 15, 2017 at 10:04:32 +0800,
>  weiping zhang <zwp10...@gmail.com> wrote:
>>
>> I just want to know WARN_ON WHAT in device_add_disk,
>> if bdi_register_owner return error code, it may fail at any step of
>> following:
>
>
> Was that output in the original boot log? I didn't see anything there that
> had the string WARN_ON. The first log was from a Fedora kernel. The second
Sorry to let you confuse, WARN_ON means we catch log as following:
WARNING: CPU: 3 PID: 3486 at block/genhd.c:680 device_add_disk+0x3d9/0x460

> from a kernel I built. I used a Fedora config though. The config was
> probably from one of their nodebug kernels, I could build another one using
> a config from a debug kernel. Would that likely provide what you are looking
> for?

Yes, please help reproduce this issue include my debug patch. Reproduce means
we can see WARN_ON in device_add_disk caused by failure of bdi_register_owner.


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-14 Thread weiping zhang
2017-12-15 9:44 GMT+08:00 Bruno Wolff III <br...@wolff.to>:
> On Fri, Dec 15, 2017 at 09:22:21 +0800,
>  weiping zhang <zwp10...@gmail.com> wrote:
>>
>>
>> Thanks your testing, but I cann't find WARN_ON in device_add_disk from
>> this boot1.log, could you help reproduce that issue? And does this issue
>> can be
>> triggered at every bootup ?
>
>
> I don't know what you need for the first question. When I am physically at
> the machine I can do test reboots. If you have something specific you want
> me to try I should be able to.
>
> Every time I boot with the problem commit, the boot never completes. However
> it does seem to get pretty far. I get multiple register dumps every time.
> After a while (a few minutes) I reboot to a wrking kernel.
>
> The output I included is from: journalctl -k -b -1
> If you think it would be better to see more than dmesg output let me know.
I just want to know WARN_ON WHAT in device_add_disk,
if bdi_register_owner return error code, it may fail at any step of following:

bdi_debug_root is NULL
bdi->debug_dir is NULL
bdi->debug_stats is NULL

so I want see the WARN_ON as you paste before, also my DEBUG log will help
to find which step fail.


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-14 Thread weiping zhang
2017-12-14 23:41 GMT+08:00 Bruno Wolff III <br...@wolff.to>:
> On Thu, Dec 14, 2017 at 18:09:27 +0800,
>  weiping zhang <zhangweip...@didichuxing.com> wrote:
>>
>>
>> It seems something wrong with bdi debugfs register, could you help
>> test the forllowing debug patch, I add some debug log, no function
>> change, thanks.
>
>
> I applied your patch to d39a01eff9af1045f6e30ff9db40310517c4b45f and there
> were some new debug messages in the dmesg output. Hopefully this helps. I
> also added the patch and output to the Fedora bug for people following
> there.

Hi Bruno,

Thanks your testing, but I cann't find WARN_ON in device_add_disk from
this boot1.log, could you help reproduce that issue? And does this issue can be
triggered at every bootup ?

--
Thanks
weiping


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-14 Thread weiping zhang
On Thu, Dec 14, 2017 at 02:24:52AM -0600, Bruno Wolff III wrote:
> On Wed, Dec 13, 2017 at 16:54:17 -0800,
>  Laura Abbott <labb...@redhat.com> wrote:
> >Hi,
> >
> >Fedora got a bug report https://bugzilla.redhat.com/show_bug.cgi?id=1520982
> >of a boot failure/bug on Linus' master (full bootlog at the bugzilla)
> 
> I'm available for testing. The problem happens on my x86_64 Dell
> Workstation, but not an old i386 server or an x86_64 mac hardware
> based laptop.

Hi,

It seems something wrong with bdi debugfs register, could you help
test the forllowing debug patch, I add some debug log, no function
change, thanks.


>From d2728c07589e8b83115a51e0c629451bff7308db Mon Sep 17 00:00:00 2001
From: weiping zhang <zhangweip...@didichuxing.com>
Date: Thu, 14 Dec 2017 17:56:22 +0800
Subject: [PATCH] bdi debugfs

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 mm/backing-dev.c | 28 +---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 84b2dc7..fbbb9a6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -39,6 +39,10 @@ static struct dentry *bdi_debug_root;
 static void bdi_debug_init(void)
 {
bdi_debug_root = debugfs_create_dir("bdi", NULL);
+   if (!bdi_debug_root)
+   pr_err("DEBUG:bdi_debug_root fail\n");
+   else
+   pr_err("DEBUG:bdi_debug_root success\n");
 }
 
 static int bdi_debug_stats_show(struct seq_file *m, void *v)
@@ -115,18 +119,29 @@ static const struct file_operations bdi_debug_stats_fops 
= {
 
 static int bdi_debug_register(struct backing_dev_info *bdi, const char *name)
 {
-   if (!bdi_debug_root)
+   if (!bdi_debug_root) {
+   pr_err("DEBUG:dev:%s, bdi_debug_root fail\n", name);
return -ENOMEM;
+   } else {
+   pr_err("DEBUG:dev:%s, bdi_debug_root success\n", name);
+   }
 
bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
-   if (!bdi->debug_dir)
+   if (!bdi->debug_dir) {
+   pr_err("DEBUG:dev:%s, debug_dir fail\n", name);
return -ENOMEM;
+   } else {
+   pr_err("DEBUG:dev:%s, debug_dir success\n", name);
+   }
 
bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
   bdi, _debug_stats_fops);
if (!bdi->debug_stats) {
debugfs_remove(bdi->debug_dir);
+   pr_err("DEBUG:dev:%s, debug_stats fail\n", name);
return -ENOMEM;
+   } else {
+   pr_err("DEBUG:dev:%s, debug_stats success\n", name);
}
 
return 0;
@@ -879,13 +894,20 @@ int bdi_register_va(struct backing_dev_info *bdi, const 
char *fmt, va_list args)
return 0;
 
dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args);
-   if (IS_ERR(dev))
+   if (IS_ERR(dev)) {
+   pr_err("DEBUG: bdi device_create_vargs fail\n");
return PTR_ERR(dev);
+   }
+   pr_err("DEBUG: bdi(0x%p) device_create_vargs sucess\n", bdi);
 
if (bdi_debug_register(bdi, dev_name(dev))) {
+   pr_err("DEBUG: dev:%s, bdi(0x%p) bdi_debug_register fail\n",
+   dev_name(dev), bdi);
device_destroy(bdi_class, dev->devt);
return -ENOMEM;
}
+   pr_err("DEBUG: dev:%s, bdi(0x%p) bdi_debug_register success\n",
+   dev_name(dev), bdi);
cgwb_bdi_register(bdi);
bdi->dev = dev;
 
-- 
2.9.4



[RFC] blk-throttle: export io_serviced_recursive, io_service_bytes_recursive

2017-12-11 Thread weiping zhang
export these two interface for cgroup-v1.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-throttle.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 96ad326..1d7637f 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1512,10 +1512,20 @@ static struct cftype throtl_legacy_files[] = {
.seq_show = blkg_print_stat_bytes,
},
{
+   .name = "throttle.io_service_bytes_recursive",
+   .private = (unsigned long)_policy_throtl,
+   .seq_show = blkg_print_stat_bytes_recursive,
+   },
+   {
.name = "throttle.io_serviced",
.private = (unsigned long)_policy_throtl,
.seq_show = blkg_print_stat_ios,
},
+   {
+   .name = "throttle.io_serviced_recursive",
+   .private = (unsigned long)_policy_throtl,
+   .seq_show = blkg_print_stat_ios_recursive,
+   },
{ } /* terminate */
 };
 
-- 
2.9.4



[RFC PATCH] blk-throttle: export io_serviced_recursive, io_service_bytes_recursive

2017-11-27 Thread weiping zhang
export these two interface for cgroup-v1.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-throttle.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 96ad326..1d7637f 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1512,10 +1512,20 @@ static struct cftype throtl_legacy_files[] = {
.seq_show = blkg_print_stat_bytes,
},
{
+   .name = "throttle.io_service_bytes_recursive",
+   .private = (unsigned long)_policy_throtl,
+   .seq_show = blkg_print_stat_bytes_recursive,
+   },
+   {
.name = "throttle.io_serviced",
.private = (unsigned long)_policy_throtl,
.seq_show = blkg_print_stat_ios,
},
+   {
+   .name = "throttle.io_serviced_recursive",
+   .private = (unsigned long)_policy_throtl,
+   .seq_show = blkg_print_stat_ios_recursive,
+   },
{ } /* terminate */
 };
 
-- 
2.9.4



[PATCH 4/5] blk-wbt: move wbt_clear_stat to common place in wbt_done

2017-11-23 Thread weiping zhang
wbt_done call wbt_clear_stat no matter current stat was tracked
or not, move it to common place.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-wbt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 0fb65f0..cd9a20a 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -178,12 +178,11 @@ void wbt_done(struct rq_wb *rwb, struct blk_issue_stat 
*stat)
 
if (wbt_is_read(stat))
wb_timestamp(rwb, >last_comp);
-   wbt_clear_state(stat);
} else {
WARN_ON_ONCE(stat == rwb->sync_cookie);
__wbt_done(rwb, wbt_stat_to_mask(stat));
-   wbt_clear_state(stat);
}
+   wbt_clear_state(stat);
 }
 
 /*
-- 
2.9.4



[PATCH 5/5] blk-wbt: fix comments typo

2017-11-23 Thread weiping zhang
Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-wbt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index cd9a20a..9f4ef9c 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -481,7 +481,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, 
unsigned long rw)
 
/*
 * At this point we know it's a buffered write. If this is
-* kswapd trying to free memory, or REQ_SYNC is set, set, then
+* kswapd trying to free memory, or REQ_SYNC is set, then
 * it's WB_SYNC_ALL writeback, and we'll use the max limit for
 * that. If the write is marked as a background write, then use
 * the idle limit, or go to normal if we haven't had competing
-- 
2.9.4



[PATCH 3/5] blk-sysfs: remove NULL pointer checking in queue_wb_lat_store

2017-11-23 Thread weiping zhang
wbt_init doesn't set q->rq_wb to NULL, if wbt_init return 0,
so check return value is enough, remove NULL checking.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-sysfs.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b8362c0..9b1093a 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -449,12 +449,9 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, 
const char *page,
ret = wbt_init(q);
if (ret)
return ret;
-
-   rwb = q->rq_wb;
-   if (!rwb)
-   return -EINVAL;
}
 
+   rwb = q->rq_wb;
if (val == -1)
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
else if (val >= 0)
-- 
2.9.4



[PATCH 2/5] blk-wbt: cleanup comments to one line

2017-11-23 Thread weiping zhang
Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-wbt.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index edb09e93..0fb65f0 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -729,9 +729,7 @@ int wbt_init(struct request_queue *q)
rwb->enable_state = WBT_STATE_ON_DEFAULT;
wbt_update_limits(rwb);
 
-   /*
-* Assign rwb and add the stats callback.
-*/
+   /* Assign rwb and add the stats callback. */
q->rq_wb = rwb;
blk_stat_add_callback(q, rwb->cb);
 
-- 
2.9.4



[PATCH 1/5] blk-wbt: remove duplicated setting in wbt_init

2017-11-23 Thread weiping zhang
rwb->wc and rwb->queue_depth were overwritten by wbt_set_write_cache and
wbt_set_queue_depth, remove the default setting.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-wbt.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index e59d59c..edb09e93 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -723,8 +723,6 @@ int wbt_init(struct request_queue *q)
init_waitqueue_head(>rq_wait[i].wait);
}
 
-   rwb->wc = 1;
-   rwb->queue_depth = RWB_DEF_DEPTH;
rwb->last_comp = rwb->last_issue = jiffies;
rwb->queue = q;
rwb->win_nsec = RWB_WINDOW_NSEC;
-- 
2.9.4



[PATCH 0/5] cleanup for blk-wbt

2017-11-23 Thread weiping zhang
Hi Jens,

several cleanup for blk-wbt, no function change, thanks

weiping zhang (5):
  blk-wbt: remove duplicated setting in wbt_init
  blk-wbt: cleanup comments to one line
  blk-sysfs: remove NULL pointer checking in queue_wb_lat_store
  blk-wbt: move wbt_clear_stat to common place in wbt_done
  blk-wbt: fix comments typo

 block/blk-sysfs.c |  5 +
 block/blk-wbt.c   | 11 +++
 2 files changed, 4 insertions(+), 12 deletions(-)

-- 
2.9.4



Re: [PATCH v2 2/3] bdi: add error handle for bdi_debug_register

2017-11-17 Thread weiping zhang
On Wed, Nov 01, 2017 at 02:47:22PM +0100, Jan Kara wrote:
> On Tue 31-10-17 18:38:24, weiping zhang wrote:
> > In order to make error handle more cleaner we call bdi_debug_register
> > before set state to WB_registered, that we can avoid call bdi_unregister
> > in release_bdi().
> > 
> > Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> 
> Looks good to me. You can add:
> 
> Reviewed-by: Jan Kara <j...@suse.cz>
> 
>   Honza
> 
> > ---
> >  mm/backing-dev.c | 5 -
> >  1 file changed, 4 insertions(+), 1 deletion(-)
> > 
> > diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> > index b5f940ce0143..84b2dc76f140 100644
> > --- a/mm/backing-dev.c
> > +++ b/mm/backing-dev.c
> > @@ -882,10 +882,13 @@ int bdi_register_va(struct backing_dev_info *bdi, 
> > const char *fmt, va_list args)
> > if (IS_ERR(dev))
> > return PTR_ERR(dev);
> >  
> > +   if (bdi_debug_register(bdi, dev_name(dev))) {
> > +   device_destroy(bdi_class, dev->devt);
> > +   return -ENOMEM;
> > +   }
> > cgwb_bdi_register(bdi);
> > bdi->dev = dev;
> >  
> > -   bdi_debug_register(bdi, dev_name(dev));
> > set_bit(WB_registered, >wb.state);
> >  
> > spin_lock_bh(_lock);
> > -- 

Hello Jens,

Could you please give some comments for this series cleanup.

--
Thanks
weiping



[PATCH] blkcg: add wrappers for struct blkcg_policy operations

2017-11-09 Thread weiping zhang
Some blkcg policies may not implement all operations in struct blkcg_policy,
there are lots of "if (pol->xxx)", add wrappers for these pol->xxx_fn.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-cgroup.c | 55 +--
 include/linux/blk-cgroup.h | 72 ++
 2 files changed, 98 insertions(+), 29 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e7ec676..e34ecb2 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -71,7 +71,7 @@ static void blkg_free(struct blkcg_gq *blkg)
 
for (i = 0; i < BLKCG_MAX_POLS; i++)
if (blkg->pd[i])
-   blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
+   blkg_pol_free_pd(blkcg_policy[i], blkg->pd[i]);
 
if (blkg->blkcg != _root)
blk_exit_rl(blkg->q, >rl);
@@ -124,7 +124,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, 
struct request_queue *q,
continue;
 
/* alloc per-policy data and attach it to blkg */
-   pd = pol->pd_alloc_fn(gfp_mask, q->node);
+   pd = blkg_pol_alloc_pd(pol, gfp_mask, q->node);
if (!pd)
goto err_free;
 
@@ -218,8 +218,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
 
-   if (blkg->pd[i] && pol->pd_init_fn)
-   pol->pd_init_fn(blkg->pd[i]);
+   if (blkg->pd[i])
+   blkg_pol_init_pd(pol, blkg->pd[i]);
}
 
/* insert */
@@ -232,8 +232,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
 
-   if (blkg->pd[i] && pol->pd_online_fn)
-   pol->pd_online_fn(blkg->pd[i]);
+   if (blkg->pd[i])
+   blkg_pol_online_pd(pol, blkg->pd[i]);
}
}
blkg->online = true;
@@ -323,8 +323,8 @@ static void blkg_destroy(struct blkcg_gq *blkg)
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
 
-   if (blkg->pd[i] && pol->pd_offline_fn)
-   pol->pd_offline_fn(blkg->pd[i]);
+   if (blkg->pd[i])
+   blkg_pol_offline_pd(pol, blkg->pd[i]);
}
 
if (parent) {
@@ -457,8 +457,8 @@ static int blkcg_reset_stats(struct cgroup_subsys_state 
*css,
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
 
-   if (blkg->pd[i] && pol->pd_reset_stats_fn)
-   pol->pd_reset_stats_fn(blkg->pd[i]);
+   if (blkg->pd[i])
+   blkg_pol_reset_pd_stats(pol, blkg->pd[i]);
}
}
 
@@ -1045,7 +1045,7 @@ static void blkcg_css_free(struct cgroup_subsys_state 
*css)
 
for (i = 0; i < BLKCG_MAX_POLS; i++)
if (blkcg->cpd[i])
-   blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
+   blkcg_pol_free_cpd(blkcg_policy[i], blkcg->cpd[i]);
 
mutex_unlock(_pol_mutex);
 
@@ -1084,7 +1084,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
if (!pol || !pol->cpd_alloc_fn)
continue;
 
-   cpd = pol->cpd_alloc_fn(GFP_KERNEL);
+   cpd = blkcg_pol_alloc_cpd(pol, GFP_KERNEL);
if (!cpd) {
ret = ERR_PTR(-ENOMEM);
goto free_pd_blkcg;
@@ -1092,8 +1092,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
blkcg->cpd[i] = cpd;
cpd->blkcg = blkcg;
cpd->plid = i;
-   if (pol->cpd_init_fn)
-   pol->cpd_init_fn(cpd);
+   blkcg_pol_init_cpd(pol, cpd);
}
 
spin_lock_init(>lock);
@@ -1108,9 +1107,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
return >css;
 
 free_pd_blkcg:
-   for (i--; i >= 0; i--)
+   while (i--)
if (blkcg->cpd[i])
-   blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
+   blkcg_pol_free_cpd(blkcg_policy[i], 
blkcg->cpd[pol->plid]);
 
if (blkcg != _root)
kfree(blkcg);
@@ -1246,7 +1245,7 @@ static void blkcg_bind(struct cgroup_subsys_state 
*root_css)
 
list_for_each_entry

Re: [PATCH v4] blk-mq: fix nr_requests wrong value when modify it from sysfs

2017-11-03 Thread weiping zhang
On Fri, Sep 22, 2017 at 11:36:28PM +0800, weiping zhang wrote:
> if blk-mq use "none" io scheduler, nr_request get a wrong value when
> input a number > tag_set->queue_depth. blk_mq_tag_update_depth will get
> the smaller one min(nr, set->queue_depth), and then q->nr_request get a
> wrong value.
> 
> Reproduce:
> 
> echo none > /sys/block/nvme0n1/queue/scheduler
> echo 100 > /sys/block/nvme0n1/queue/nr_requests
> cat /sys/block/nvme0n1/queue/nr_requests
> 100
> 
> Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> ---
> 
> Changes since v4:
>  * fix typo in commit message(queue/ioscheduler => queue/scheduler)
> 
> Changes since v3:
>  * remove compare nr with tags->qdepth, pass nr to blk_mq_tag_update_depth
> directly
> 
>  * remove return EINVAL when user modify nr_request less than BLKDEV_MIN_RQ
> 
> Changes since v2:
>  * add return EINVAL when user modify nr_request less than BLKDEV_MIN_RQ
>  * remove pr_warn, and return EINVAL, if input number is too large
> 
>  block/blk-mq.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 98a1860..491e336 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2642,8 +2642,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, 
> unsigned int nr)
>* queue depth. This is similar to what the old code would do.
>*/
>   if (!hctx->sched_tags) {
> - ret = blk_mq_tag_update_depth(hctx, >tags,
> - min(nr, 
> set->queue_depth),
> + ret = blk_mq_tag_update_depth(hctx, >tags, nr,
>   false);
>   } else {
>   ret = blk_mq_tag_update_depth(hctx, >sched_tags,
> -- 
> 2.9.4
> 
Hello Jens,

ping


[PATCH v2 3/3] block: add WARN_ON if bdi register fail

2017-10-31 Thread weiping zhang
device_add_disk need do more safety error handle, so this patch just
add WARN_ON.

Reviewed-by: Jan Kara <j...@suse.cz>
Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/genhd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/genhd.c b/block/genhd.c
index dd305c65ffb0..52834433878c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -660,7 +660,7 @@ void device_add_disk(struct device *parent, struct gendisk 
*disk)
 
/* Register BDI before referencing it from bdev */
bdi = disk->queue->backing_dev_info;
-   bdi_register_owner(bdi, disk_to_dev(disk));
+   WARN_ON(bdi_register_owner(bdi, disk_to_dev(disk)));
 
blk_register_region(disk_devt(disk), disk->minors, NULL,
exact_match, exact_lock, disk);
-- 
2.14.2



[PATCH v2 1/3] bdi: convert bdi_debug_register to int

2017-10-31 Thread weiping zhang
Convert bdi_debug_register to int and then do error handle for it.

Reviewed-by: Jan Kara <j...@suse.cz>
Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 mm/backing-dev.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 74b52dfd5852..b5f940ce0143 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -113,11 +113,23 @@ static const struct file_operations bdi_debug_stats_fops 
= {
.release= single_release,
 };
 
-static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
+static int bdi_debug_register(struct backing_dev_info *bdi, const char *name)
 {
+   if (!bdi_debug_root)
+   return -ENOMEM;
+
bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
+   if (!bdi->debug_dir)
+   return -ENOMEM;
+
bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
   bdi, _debug_stats_fops);
+   if (!bdi->debug_stats) {
+   debugfs_remove(bdi->debug_dir);
+   return -ENOMEM;
+   }
+
+   return 0;
 }
 
 static void bdi_debug_unregister(struct backing_dev_info *bdi)
@@ -129,9 +141,10 @@ static void bdi_debug_unregister(struct backing_dev_info 
*bdi)
 static inline void bdi_debug_init(void)
 {
 }
-static inline void bdi_debug_register(struct backing_dev_info *bdi,
+static inline int bdi_debug_register(struct backing_dev_info *bdi,
  const char *name)
 {
+   return 0;
 }
 static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
 {
-- 
2.14.2



[PATCH v2 2/3] bdi: add error handle for bdi_debug_register

2017-10-31 Thread weiping zhang
In order to make error handle more cleaner we call bdi_debug_register
before set state to WB_registered, that we can avoid call bdi_unregister
in release_bdi().

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 mm/backing-dev.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index b5f940ce0143..84b2dc76f140 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -882,10 +882,13 @@ int bdi_register_va(struct backing_dev_info *bdi, const 
char *fmt, va_list args)
if (IS_ERR(dev))
return PTR_ERR(dev);
 
+   if (bdi_debug_register(bdi, dev_name(dev))) {
+   device_destroy(bdi_class, dev->devt);
+   return -ENOMEM;
+   }
cgwb_bdi_register(bdi);
bdi->dev = dev;
 
-   bdi_debug_register(bdi, dev_name(dev));
set_bit(WB_registered, >wb.state);
 
spin_lock_bh(_lock);
-- 
2.14.2



[PATCH v2 0/3] add error handle for bdi debugfs register

2017-10-31 Thread weiping zhang
Hello,

Change since V1:
 * remove the patch for bdi_debug_init(), because patch1 add a check
   for bdi_debug_root
 * remove bdi_put in bdi_register, this functions has two callers:
caller1: mtd_bdi_init->bdi_register, bdi_put if register fail
caller2: device_add_disk->bdi_register_owner->bdi_register, this call
stack need more safety cleanup, so patch3 add an WARN_ON.

weiping zhang (3):
  bdi: convert bdi_debug_register to int
  bdi: add error handle for bdi_debug_register
  block: add WARN_ON if bdi register fail

 block/genhd.c|  2 +-
 mm/backing-dev.c | 22 +++---
 2 files changed, 20 insertions(+), 4 deletions(-)

-- 
2.14.2



Re: [PATCH 4/4] block: add WARN_ON if bdi register fail

2017-10-31 Thread weiping zhang
On Mon, Oct 30, 2017 at 02:14:30PM +0100, Jan Kara wrote:
> On Fri 27-10-17 01:36:42, weiping zhang wrote:
> > device_add_disk need do more safety error handle, so this patch just
> > add WARN_ON.
> > 
> > Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> > ---
> >  block/genhd.c | 4 +++-
> >  1 file changed, 3 insertions(+), 1 deletion(-)
> > 
> > diff --git a/block/genhd.c b/block/genhd.c
> > index dd305c65ffb0..cb55eea821eb 100644
> > --- a/block/genhd.c
> > +++ b/block/genhd.c
> > @@ -660,7 +660,9 @@ void device_add_disk(struct device *parent, struct 
> > gendisk *disk)
> >  
> > /* Register BDI before referencing it from bdev */
> > bdi = disk->queue->backing_dev_info;
> > -   bdi_register_owner(bdi, disk_to_dev(disk));
> > +   retval = bdi_register_owner(bdi, disk_to_dev(disk));
> > +   if (retval)
> > +   WARN_ON(1);
> 
> Just a nit: You can do
> 
>   WARN_ON(retval);
> 
> Otherwise you can add:
> 
> Reviewed-by: Jan Kara <j...@suse.cz>
> 
more claner, I'll apply at V2, Thanks

--
weiping


Re: [PATCH 3/4] bdi: add error handle for bdi_debug_register

2017-10-31 Thread weiping zhang
On Mon, Oct 30, 2017 at 02:10:16PM +0100, Jan Kara wrote:
> On Fri 27-10-17 01:36:14, weiping zhang wrote:
> > In order to make error handle more cleaner we call bdi_debug_register
> > before set state to WB_registered, that we can avoid call bdi_unregister
> > in release_bdi().
> > 
> > Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> > ---
> >  mm/backing-dev.c | 7 ++-
> >  1 file changed, 6 insertions(+), 1 deletion(-)
> > 
> > diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> > index e9d6a1ede12b..54396d53f471 100644
> > --- a/mm/backing-dev.c
> > +++ b/mm/backing-dev.c
> > @@ -893,10 +893,13 @@ int bdi_register_va(struct backing_dev_info *bdi, 
> > const char *fmt, va_list args)
> > if (IS_ERR(dev))
> > return PTR_ERR(dev);
> >  
> > +   if (bdi_debug_register(bdi, dev_name(dev))) {
> > +   device_destroy(bdi_class, dev->devt);
> > +   return -ENOMEM;
> > +   }
> > cgwb_bdi_register(bdi);
> > bdi->dev = dev;
> >  
> > -   bdi_debug_register(bdi, dev_name(dev));
> > set_bit(WB_registered, >wb.state);
> >  
> > spin_lock_bh(_lock);
> > @@ -916,6 +919,8 @@ int bdi_register(struct backing_dev_info *bdi, const 
> > char *fmt, ...)
> > va_start(args, fmt);
> > ret = bdi_register_va(bdi, fmt, args);
> > va_end(args);
> > +   if (ret)
> > +   bdi_put(bdi);
> 
> Why do you drop bdi reference here in case of error? We didn't do it
> previously if bdi_register_va() failed for other reasons...
> 
At first I want add cleanup, because
device_add_disk->bdi_register_owner->bdi_register doen't do clanup. But
I notice that mtd_bdi_init also call bdi_register and do cleanup, so
this bdi_put() is wrong. I'll remove it at V2. Thanks a lot.

--
weiping


Re: [PATCH 1/4] bdi: add check for bdi_debug_root

2017-10-31 Thread weiping zhang
On Mon, Oct 30, 2017 at 02:00:28PM +0100, Jan Kara wrote:
> On Fri 27-10-17 01:35:36, weiping zhang wrote:
> > this patch add a check for bdi_debug_root and do error handle for it.
> > we should make sure it was created success, otherwise when add new
> > block device's bdi folder(eg, 8:0) will be create a debugfs root directory.
> > 
> > Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> > ---
> >  mm/backing-dev.c | 17 ++---
> >  1 file changed, 14 insertions(+), 3 deletions(-)
> 
> These functions get called only on system boot - ENOMEM in those cases is
> generally considered fatal and oopsing is acceptable result. So I don't
> think this patch is needed.
> 
OK, I drop this patch.

Thanks a ton.


[PATCH 4/4] block: add WARN_ON if bdi register fail

2017-10-26 Thread weiping zhang
device_add_disk need do more safety error handle, so this patch just
add WARN_ON.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/genhd.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/genhd.c b/block/genhd.c
index dd305c65ffb0..cb55eea821eb 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -660,7 +660,9 @@ void device_add_disk(struct device *parent, struct gendisk 
*disk)
 
/* Register BDI before referencing it from bdev */
bdi = disk->queue->backing_dev_info;
-   bdi_register_owner(bdi, disk_to_dev(disk));
+   retval = bdi_register_owner(bdi, disk_to_dev(disk));
+   if (retval)
+   WARN_ON(1);
 
blk_register_region(disk_devt(disk), disk->minors, NULL,
exact_match, exact_lock, disk);
-- 
2.14.2



[PATCH 3/4] bdi: add error handle for bdi_debug_register

2017-10-26 Thread weiping zhang
In order to make error handle more cleaner we call bdi_debug_register
before set state to WB_registered, that we can avoid call bdi_unregister
in release_bdi().

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 mm/backing-dev.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e9d6a1ede12b..54396d53f471 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -893,10 +893,13 @@ int bdi_register_va(struct backing_dev_info *bdi, const 
char *fmt, va_list args)
if (IS_ERR(dev))
return PTR_ERR(dev);
 
+   if (bdi_debug_register(bdi, dev_name(dev))) {
+   device_destroy(bdi_class, dev->devt);
+   return -ENOMEM;
+   }
cgwb_bdi_register(bdi);
bdi->dev = dev;
 
-   bdi_debug_register(bdi, dev_name(dev));
set_bit(WB_registered, >wb.state);
 
spin_lock_bh(_lock);
@@ -916,6 +919,8 @@ int bdi_register(struct backing_dev_info *bdi, const char 
*fmt, ...)
va_start(args, fmt);
ret = bdi_register_va(bdi, fmt, args);
va_end(args);
+   if (ret)
+   bdi_put(bdi);
return ret;
 }
 EXPORT_SYMBOL(bdi_register);
-- 
2.14.2



[PATCH 1/4] bdi: add check for bdi_debug_root

2017-10-26 Thread weiping zhang
this patch add a check for bdi_debug_root and do error handle for it.
we should make sure it was created success, otherwise when add new
block device's bdi folder(eg, 8:0) will be create a debugfs root directory.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 mm/backing-dev.c | 17 ++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 74b52dfd5852..5072be19d9b2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -36,9 +36,12 @@ struct workqueue_struct *bdi_wq;
 
 static struct dentry *bdi_debug_root;
 
-static void bdi_debug_init(void)
+static int bdi_debug_init(void)
 {
bdi_debug_root = debugfs_create_dir("bdi", NULL);
+   if (!bdi_debug_root)
+   return -ENOMEM;
+   return 0;
 }
 
 static int bdi_debug_stats_show(struct seq_file *m, void *v)
@@ -126,8 +129,9 @@ static void bdi_debug_unregister(struct backing_dev_info 
*bdi)
debugfs_remove(bdi->debug_dir);
 }
 #else
-static inline void bdi_debug_init(void)
+static inline int bdi_debug_init(void)
 {
+   return 0;
 }
 static inline void bdi_debug_register(struct backing_dev_info *bdi,
  const char *name)
@@ -229,12 +233,19 @@ ATTRIBUTE_GROUPS(bdi_dev);
 
 static __init int bdi_class_init(void)
 {
+   int ret;
+
bdi_class = class_create(THIS_MODULE, "bdi");
if (IS_ERR(bdi_class))
return PTR_ERR(bdi_class);
 
bdi_class->dev_groups = bdi_dev_groups;
-   bdi_debug_init();
+   ret = bdi_debug_init();
+   if (ret) {
+   class_destroy(bdi_class);
+   bdi_class = NULL;
+   return ret;
+   }
 
return 0;
 }
-- 
2.14.2



[PATCH 2/4] bdi: convert bdi_debug_register to int

2017-10-26 Thread weiping zhang
Convert bdi_debug_register to int and then do error handle for it.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 mm/backing-dev.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 5072be19d9b2..e9d6a1ede12b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -116,11 +116,23 @@ static const struct file_operations bdi_debug_stats_fops 
= {
.release= single_release,
 };
 
-static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
+static int bdi_debug_register(struct backing_dev_info *bdi, const char *name)
 {
+   if (!bdi_debug_root)
+   return -ENOMEM;
+
bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
+   if (!bdi->debug_dir)
+   return -ENOMEM;
+
bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
   bdi, _debug_stats_fops);
+   if (!bdi->debug_stats) {
+   debugfs_remove(bdi->debug_dir);
+   return -ENOMEM;
+   }
+
+   return 0;
 }
 
 static void bdi_debug_unregister(struct backing_dev_info *bdi)
@@ -133,9 +145,10 @@ static inline int bdi_debug_init(void)
 {
return 0;
 }
-static inline void bdi_debug_register(struct backing_dev_info *bdi,
+static inline int bdi_debug_register(struct backing_dev_info *bdi,
  const char *name)
 {
+   return 0;
 }
 static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
 {
-- 
2.14.2



[PATCH 0/4] add error handle for bdi debugfs register

2017-10-26 Thread weiping zhang

this series add error handle for bdi debugfs register flow, the first
three patches try to convert void function to int and do some cleanup
if create dir or file fail.

the fourth patch only add a WARN_ON in device_add_disk, no function change.

weiping zhang (4):
  bdi: add check for bdi_debug_root
  bdi: convert bdi_debug_register to int
  bdi: add error handle for bdi_debug_register
  block: add WARN_ON if bdi register fail

 block/genhd.c|  4 +++-
 mm/backing-dev.c | 41 +++--
 2 files changed, 38 insertions(+), 7 deletions(-)

-- 
2.14.2



[PATCH 2/2] null_blk: add usage hints for no_sched

2017-10-13 Thread weiping zhang
This parameter provide an option to disable io scheduler when nullb*
in multi-queue mode.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 Documentation/block/null_blk.txt | 4 
 1 file changed, 4 insertions(+)

diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt
index 1f8d92c70458..b7161c8e70f2 100644
--- a/Documentation/block/null_blk.txt
+++ b/Documentation/block/null_blk.txt
@@ -73,3 +73,7 @@ use_per_node_hctx=[0/1]: Default: 0
 
 use_lightnvm=[0/1]: Default: 0
   Register device with LightNVM. Requires blk-mq and CONFIG_NVM to be enabled.
+
+no_sched=[0/1]: Default: 0
+  0: nullb* use default blk-mq io scheduler.
+  1: nullb* doesn't use io scheduler.
-- 
2.14.2



[PATCH 1/2] null_blk: update usage hints for submit_queues

2017-10-13 Thread weiping zhang
update the range of submits_queues, and correct usage hints.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 Documentation/block/null_blk.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt
index 3140dbd860d8..1f8d92c70458 100644
--- a/Documentation/block/null_blk.txt
+++ b/Documentation/block/null_blk.txt
@@ -55,10 +55,10 @@ irqmode=[0-2]: Default: 1-Soft-irq
 completion_nsec=[ns]: Default: 10.000ns
   Combined with irqmode=2 (timer). The time each completion event must wait.
 
-submit_queues=[0..nr_cpus]:
+submit_queues=[1..nr_cpus]:
   The number of submission queues attached to the device driver. If unset, it
-  defaults to 1 on single-queue and bio-based instances. For multi-queue,
-  it is ignored when use_per_node_hctx module parameter is 1.
+  defaults to 1. For multi-queue, it is ignored when use_per_node_hctx module
+  parameter is 1.
 
 hw_queue_depth=[0..qdepth]: Default: 64
   The hardware queue depth of the device.
-- 
2.14.2



[PATCH 0/2] Update the usage hints of submit_queues and

2017-10-13 Thread weiping zhang
no_sched
Message-ID: <cover.1507911414.git.zhangweip...@didichuxing.com>
Reply-To: 

Hi Jens,

These two misc patch update the usage hints of null_blk module
parameter.

Because submit_queues was changed to 1 by default, so update it's
description.

The second patch add usage hints for no_sched.


weiping zhang (2):
  null_blk: update usage hints for submit_queues
  null_blk: add usage hints for no_sched

 Documentation/block/null_blk.txt | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

-- 
2.14.2



[PATCH v3] blkcg: add sanity check for blkcg policy operations

2017-10-12 Thread weiping zhang
blkcg policy should keep cpd/pd's alloc_fn and free_fn in pairs,
otherwise policy would register fail.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-cgroup.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e7ec676..6c5f5f3 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1419,6 +1419,13 @@ int blkcg_policy_register(struct blkcg_policy *pol)
if (i >= BLKCG_MAX_POLS)
goto err_unlock;
 
+   /* Make sure cpd_alloc_fn and cpd_free_fn in pairs */
+   if (!pol->cpd_alloc_fn ^ !pol->cpd_free_fn)
+   goto err_unlock;
+
+   if (!pol->pd_alloc_fn ^ !pol->pd_free_fn)
+   goto err_unlock;
+
/* register @pol */
pol->plid = i;
blkcg_policy[pol->plid] = pol;
-- 
2.9.4



Re: [PATCH v2] blkcg: add sanity check for blkcg policy operations

2017-10-12 Thread weiping zhang
On Wed, Oct 11, 2017 at 10:51:32AM -0600, Jens Axboe wrote:
> On 10/11/2017 03:46 AM, weiping zhang wrote:
> > blkcg policy should keep cpd/pd's alloc_fn and free_fn in pairs,
> > otherwise policy would register fail.
> > 
> > Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> > ---
> >  block/blk-cgroup.c | 10 ++
> >  1 file changed, 10 insertions(+)
> > 
> > diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> > index e7ec676..67b01c5 100644
> > --- a/block/blk-cgroup.c
> > +++ b/block/blk-cgroup.c
> > @@ -1419,6 +1419,16 @@ int blkcg_policy_register(struct blkcg_policy *pol)
> > if (i >= BLKCG_MAX_POLS)
> > goto err_unlock;
> >  
> > +   /* Make sure cpd_alloc_fn and cpd_free_fn in pairs */
> > +   if ((pol->cpd_alloc_fn && !pol->cpd_free_fn) ||
> > +   (!pol->cpd_alloc_fn && pol->cpd_free_fn))
> > +   goto err_unlock;
> 
> This might be cleaner (and more readable) as:
> 
> if (!pol->cpd_alloc_fn ^ !pol->cpd_free_fn)
> goto err_unlock;
> 
> Ditto for the pd part.
> 
Really nice, I'll send v3.

Thanks


Re: [PATCH V2 3/3] blockcg: export latency info for each cgroup

2017-10-11 Thread weiping zhang
2017-10-11 2:23 GMT+08:00 Shaohua Li <s...@kernel.org>:
> On Wed, Oct 11, 2017 at 01:35:51AM +0800, weiping zhang wrote:
>> On Fri, Oct 06, 2017 at 05:56:01PM -0700, Shaohua Li wrote:
>> > From: Shaohua Li <s...@fb.com>
>> >
>> > Export the latency info to user. The latency is a good sign to indicate
>> > if IO is congested or not. User can use the info to make decisions like
>> > adjust cgroup settings.
>> Hi Shaohua,
>> How to check IO congested or not by latency ? Different SSD has
>> different latency especially when mixed sequence and random IO
>> operatons.
>
> There is no magic here, you should know the SSD characteristics first. The 
> idea
> is observing the latency when the system isn't overcommited, for example,
> running the workload in a one-cgroup setup, then using the observed latency to
> guide configuration for multi-cgroup settings or check if IO is healthy in
> cgroups.
>
Could you please give more detail about how to get the SSD characteristics ?
I cann't find a good way to get the reasonable latency at mixed
sequence , random,
read, and write. Is there a value or a range represent the reasonable latency at
mixed circurmastance ?

Thanks
Weiping


[PATCH v2] blkcg: add sanity check for blkcg policy operations

2017-10-11 Thread weiping zhang
blkcg policy should keep cpd/pd's alloc_fn and free_fn in pairs,
otherwise policy would register fail.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-cgroup.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e7ec676..67b01c5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1419,6 +1419,16 @@ int blkcg_policy_register(struct blkcg_policy *pol)
if (i >= BLKCG_MAX_POLS)
goto err_unlock;
 
+   /* Make sure cpd_alloc_fn and cpd_free_fn in pairs */
+   if ((pol->cpd_alloc_fn && !pol->cpd_free_fn) ||
+   (!pol->cpd_alloc_fn && pol->cpd_free_fn))
+   goto err_unlock;
+
+   /* Make sure pd_alloc_fn and pd_free_fn in pairs */
+   if ((pol->pd_alloc_fn && !pol->pd_free_fn) ||
+   (!pol->pd_alloc_fn && pol->pd_free_fn))
+   goto err_unlock;
+
/* register @pol */
pol->plid = i;
blkcg_policy[pol->plid] = pol;
-- 
2.9.4



Re: [PATCH] blkcg: add sanity check for blkcg policy operations

2017-10-11 Thread weiping zhang
On Wed, Oct 11, 2017 at 09:58:33AM +0200, Johannes Thumshirn wrote:
> On Wed, Oct 11, 2017 at 12:00:55AM +0800, weiping zhang wrote:
> > + * blkcg_policy_check_ops - check policy's operations
> > + * @pol: blkcg policy to check
> > + *
> > + * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs.
> > + * Return true on success and false on failure.
> > + */
> > +static bool blkcg_policy_check_ops(struct blkcg_policy *pol)
> > +{
> > +   if ((pol->cpd_alloc_fn && !pol->cpd_free_fn) ||
> > +   (!pol->cpd_alloc_fn && pol->cpd_free_fn))
> > +   return false;
> > +
> > +   if ((pol->pd_alloc_fn && !pol->pd_free_fn) ||
> > +   (!pol->pd_alloc_fn && pol->pd_free_fn))
> > +   return false;
> > +
> > +   return true;
> > +}
> > +
> > +/**
> >   * blkcg_policy_register - register a blkcg policy
> >   * @pol: blkcg policy to register
> >   *
> > @@ -1419,6 +1439,9 @@ int blkcg_policy_register(struct blkcg_policy *pol)
> > if (i >= BLKCG_MAX_POLS)
> > goto err_unlock;
> >  
> > +   if (!blkcg_policy_check_ops(pol))
> > +   goto err_unlock;
> > +
> 
> Can you merge it into the caller?
> 
> + if ((pol->cpd_alloc_fn && !pol->cpd_free_fn) ||
> + (!pol->cpd_alloc_fn && pol->cpd_free_fn))
> + goto err_unlock;
> +
> + if ((pol->pd_alloc_fn && !pol->pd_free_fn) ||
> + (!pol->pd_alloc_fn && pol->pd_free_fn))
> + goto err_unlock;
> 
> 
It's ok for me, I send V2 later.

Thanks
Weiping


Re: [PATCH V2 3/3] blockcg: export latency info for each cgroup

2017-10-10 Thread weiping zhang
On Fri, Oct 06, 2017 at 05:56:01PM -0700, Shaohua Li wrote:
> From: Shaohua Li 
> 
> Export the latency info to user. The latency is a good sign to indicate
> if IO is congested or not. User can use the info to make decisions like
> adjust cgroup settings.
Hi Shaohua,
How to check IO congested or not by latency ? Different SSD has
different latency especially when mixed sequence and random IO
operatons.

> 
> Existing io.stat shows accumulated IO bytes and requests, but
> accumulated value for latency doesn't make much sense. This patch
How to understand "accumulated value for latency doesn't make much
sense" could you give an example? I think iostat's r_await and w_await
are nearly equal to rlat_mean and wlat_mean if there is no too much
request starved.
> exports the latency info in a 100ms interval.
> 
> To reduce overhead, latency info of children is propagated to parents
> every 10ms. This means the parents's latency could lost 10ms info of its
> children in 100ms. This should be ok, as we don't need precise latency
> info.
> 
> A micro benchmark running fio test against null_blk in a sixth level
> cgroup doesn't show obvious regression. perf shows a little bit overhead
> in blk_stat_add (~1%) and blkg_lookup (~1%), which is unavoidable right
> now.
> 
> With this patch, the io.stat will show:
> 8:0 rbytes=7282688 wbytes=0 rios=83 wios=0 rlat_mean=2720 rlat_min=183 
> rlat_max=14880 wlat_mean=0 wlat_min=0 wlat_max=0
> The new fields will display read/write average/minimum/maximum latency
> within 100ms. The latency is us.
> 

Thanks
Weiping


[PATCH] blkcg: add sanity check for blkcg policy operations

2017-10-10 Thread weiping zhang
blkcg policy should keep cpd/pd's alloc_fn and free_fn in pairs,
otherwise policy would register fail.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-cgroup.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e7ec676..557c122 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1397,6 +1397,26 @@ void blkcg_deactivate_policy(struct request_queue *q,
 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
 
 /**
+ * blkcg_policy_check_ops - check policy's operations
+ * @pol: blkcg policy to check
+ *
+ * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs.
+ * Return true on success and false on failure.
+ */
+static bool blkcg_policy_check_ops(struct blkcg_policy *pol)
+{
+   if ((pol->cpd_alloc_fn && !pol->cpd_free_fn) ||
+   (!pol->cpd_alloc_fn && pol->cpd_free_fn))
+   return false;
+
+   if ((pol->pd_alloc_fn && !pol->pd_free_fn) ||
+   (!pol->pd_alloc_fn && pol->pd_free_fn))
+   return false;
+
+   return true;
+}
+
+/**
  * blkcg_policy_register - register a blkcg policy
  * @pol: blkcg policy to register
  *
@@ -1419,6 +1439,9 @@ int blkcg_policy_register(struct blkcg_policy *pol)
if (i >= BLKCG_MAX_POLS)
goto err_unlock;
 
+   if (!blkcg_policy_check_ops(pol))
+   goto err_unlock;
+
/* register @pol */
pol->plid = i;
blkcg_policy[pol->plid] = pol;
-- 
2.9.4



Re: [PATCH] blkcg: check pol->cpd_free_fn before free cpd

2017-10-10 Thread weiping zhang
On Tue, Oct 10, 2017 at 09:04:39AM -0600, Jens Axboe wrote:
> On 10/10/2017 08:53 AM, weiping zhang wrote:
> > check pol->cpd_free_fn() instead of pol->cpd_alloc_fn() when free cpd.
> 
> In practice this shouldn't make a difference, since if you have an
> alloc_fn, you better also have a free_fn. I'd argue a better
> patch would be ensuring that's the case, since the current situation
> would at least oops and show you there's an issue, but with the patch
> we'll just leak the memory.
> 
> I'll apply it, but would be nice if this was handled a bit more
> proactively.
> 
Thanks your comments, I'll add a sanity check in blkcg_policy_register
function.


[PATCH] blkcg: check pol->cpd_free_fn before free cpd

2017-10-10 Thread weiping zhang
check pol->cpd_free_fn() instead of pol->cpd_alloc_fn() when free cpd.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d3f56ba..e7ec676 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1452,7 +1452,7 @@ int blkcg_policy_register(struct blkcg_policy *pol)
return 0;
 
 err_free_cpds:
-   if (pol->cpd_alloc_fn) {
+   if (pol->cpd_free_fn) {
list_for_each_entry(blkcg, _blkcgs, all_blkcgs_node) {
if (blkcg->cpd[pol->plid]) {
pol->cpd_free_fn(blkcg->cpd[pol->plid]);
@@ -1492,7 +1492,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
/* remove cpds and unregister */
mutex_lock(_pol_mutex);
 
-   if (pol->cpd_alloc_fn) {
+   if (pol->cpd_free_fn) {
list_for_each_entry(blkcg, _blkcgs, all_blkcgs_node) {
if (blkcg->cpd[pol->plid]) {
pol->cpd_free_fn(blkcg->cpd[pol->plid]);
-- 
2.9.4



Re: [PATCH v4] blk-mq: fix nr_requests wrong value when modify it from sysfs

2017-10-07 Thread weiping zhang
On Sat, Sep 30, 2017 at 10:57:31PM +0800, weiping zhang wrote:
> On Fri, Sep 22, 2017 at 11:36:28PM +0800, weiping zhang wrote:
> > if blk-mq use "none" io scheduler, nr_request get a wrong value when
> > input a number > tag_set->queue_depth. blk_mq_tag_update_depth will get
> > the smaller one min(nr, set->queue_depth), and then q->nr_request get a
> > wrong value.
> > 
> > Reproduce:
> > 
> > echo none > /sys/block/nvme0n1/queue/scheduler
> > echo 100 > /sys/block/nvme0n1/queue/nr_requests
> > cat /sys/block/nvme0n1/queue/nr_requests
> > 100
> > 
> > Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> > ---
> > 
> > Changes since v4:
> >  * fix typo in commit message(queue/ioscheduler => queue/scheduler)
> > 
> > Changes since v3:
> >  * remove compare nr with tags->qdepth, pass nr to blk_mq_tag_update_depth
> > directly
> > 
> >  * remove return EINVAL when user modify nr_request less than BLKDEV_MIN_RQ
> > 
> > Changes since v2:
> >  * add return EINVAL when user modify nr_request less than BLKDEV_MIN_RQ
> >  * remove pr_warn, and return EINVAL, if input number is too large
> > 
> >  block/blk-mq.c | 3 +--
> >  1 file changed, 1 insertion(+), 2 deletions(-)
> > 
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 98a1860..491e336 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -2642,8 +2642,7 @@ int blk_mq_update_nr_requests(struct request_queue 
> > *q, unsigned int nr)
> >  * queue depth. This is similar to what the old code would do.
> >  */
> > if (!hctx->sched_tags) {
> > -   ret = blk_mq_tag_update_depth(hctx, >tags,
> > -   min(nr, 
> > set->queue_depth),
> > +   ret = blk_mq_tag_update_depth(hctx, >tags, nr,
> > false);
> > } else {
> > ret = blk_mq_tag_update_depth(hctx, >sched_tags,
> > -- 
> > 2.9.4
> > 
> 
> Hi Jens,
> 
> As you say before:
> > blk_mq_tag_update_depth() should already return
> > -EINVAL for the case where we can't grow the tags. Looks like this patch
> > should simply remove the min(nr, set->queue_depth) and just pass in 'nr'.
> I also find that hctx->tags->nr_tags equal to tag_set->queue_depth no matter 
> use
> hctx->sched_tags or not. So I think it's safe to verify 'nr' by
> hctx->tags->nr_tags.
> 
Hello Jens,

ping


Re: [PATCH v4] blk-mq: fix nr_requests wrong value when modify it from sysfs

2017-09-30 Thread weiping zhang
On Fri, Sep 22, 2017 at 11:36:28PM +0800, weiping zhang wrote:
> if blk-mq use "none" io scheduler, nr_request get a wrong value when
> input a number > tag_set->queue_depth. blk_mq_tag_update_depth will get
> the smaller one min(nr, set->queue_depth), and then q->nr_request get a
> wrong value.
> 
> Reproduce:
> 
> echo none > /sys/block/nvme0n1/queue/scheduler
> echo 100 > /sys/block/nvme0n1/queue/nr_requests
> cat /sys/block/nvme0n1/queue/nr_requests
> 100
> 
> Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> ---
> 
> Changes since v4:
>  * fix typo in commit message(queue/ioscheduler => queue/scheduler)
> 
> Changes since v3:
>  * remove compare nr with tags->qdepth, pass nr to blk_mq_tag_update_depth
> directly
> 
>  * remove return EINVAL when user modify nr_request less than BLKDEV_MIN_RQ
> 
> Changes since v2:
>  * add return EINVAL when user modify nr_request less than BLKDEV_MIN_RQ
>  * remove pr_warn, and return EINVAL, if input number is too large
> 
>  block/blk-mq.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 98a1860..491e336 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2642,8 +2642,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, 
> unsigned int nr)
>* queue depth. This is similar to what the old code would do.
>*/
>   if (!hctx->sched_tags) {
> - ret = blk_mq_tag_update_depth(hctx, >tags,
> - min(nr, 
> set->queue_depth),
> + ret = blk_mq_tag_update_depth(hctx, >tags, nr,
>   false);
>   } else {
>   ret = blk_mq_tag_update_depth(hctx, >sched_tags,
> -- 
> 2.9.4
> 

Hi Jens,

As you say before:
> blk_mq_tag_update_depth() should already return
> -EINVAL for the case where we can't grow the tags. Looks like this patch
> should simply remove the min(nr, set->queue_depth) and just pass in 'nr'.
I also find that hctx->tags->nr_tags equal to tag_set->queue_depth no matter use
hctx->sched_tags or not. So I think it's safe to verify 'nr' by
hctx->tags->nr_tags.

Thanks
weiping


[PATCH] blk-mq: remove unused function hctx_allow_merges

2017-09-30 Thread weiping zhang
since 9bddeb2a5b981 "blk-mq: make per-sw-queue bio merge as default .bio_merge"
there is no caller for this function.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-mq.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f84d145..520d257 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1484,12 +1484,6 @@ static void blk_mq_bio_to_request(struct request *rq, 
struct bio *bio)
blk_account_io_start(rq, true);
 }
 
-static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
-{
-   return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
-   !blk_queue_nomerges(hctx->queue);
-}
-
 static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx,
   struct blk_mq_ctx *ctx,
   struct request *rq)
-- 
2.9.4



[PATCH v2] null_blk: add "no_sched" module parameter

2017-09-29 Thread weiping zhang
add an option that disable io scheduler for null block device.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 drivers/block/null_blk.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index bd92286..38f4a8c 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -154,6 +154,10 @@ enum {
NULL_Q_MQ   = 2,
 };
 
+static int g_no_sched;
+module_param_named(no_sched, g_no_sched, int, S_IRUGO);
+MODULE_PARM_DESC(no_sched, "No io scheduler");
+
 static int g_submit_queues = 1;
 module_param_named(submit_queues, g_submit_queues, int, S_IRUGO);
 MODULE_PARM_DESC(submit_queues, "Number of submission queues");
@@ -1754,6 +1758,8 @@ static int null_init_tag_set(struct nullb *nullb, struct 
blk_mq_tag_set *set)
set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
set->cmd_size   = sizeof(struct nullb_cmd);
set->flags = BLK_MQ_F_SHOULD_MERGE;
+   if (g_no_sched)
+   set->flags |= BLK_MQ_F_NO_SCHED;
set->driver_data = NULL;
 
if ((nullb && nullb->dev->blocking) || g_blocking)
-- 
2.9.4



Re: [PATCH] null_blk: add "no_sched" module parameter

2017-09-29 Thread weiping zhang
On Fri, Sep 29, 2017 at 11:39:03PM +0200, Jens Axboe wrote:
> On 09/29/2017 07:09 PM, weiping zhang wrote:
> > add an option that disable io scheduler for null block device.
> > 
> > Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> > ---
> >  drivers/block/null_blk.c | 6 +-
> >  1 file changed, 5 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
> > index bd92286..3c63863 100644
> > --- a/drivers/block/null_blk.c
> > +++ b/drivers/block/null_blk.c
> > @@ -154,6 +154,10 @@ enum {
> > NULL_Q_MQ   = 2,
> >  };
> >  
> > +static int g_no_sched;
> > +module_param_named(no_sched, g_no_sched, int, S_IRUGO);
> > +MODULE_PARM_DESC(no_sched, "No io scheduler");
> > +
> >  static int g_submit_queues = 1;
> >  module_param_named(submit_queues, g_submit_queues, int, S_IRUGO);
> >  MODULE_PARM_DESC(submit_queues, "Number of submission queues");
> > @@ -1753,7 +1757,7 @@ static int null_init_tag_set(struct nullb *nullb, 
> > struct blk_mq_tag_set *set)
> > g_hw_queue_depth;
> > set->numa_node = nullb ? nullb->dev->home_node : g_home_node;
> > set->cmd_size   = sizeof(struct nullb_cmd);
> > -   set->flags = BLK_MQ_F_SHOULD_MERGE;
> > +   set->flags = g_no_sched ? BLK_MQ_F_NO_SCHED : BLK_MQ_F_SHOULD_MERGE;
> 
> This should be:
> 
>   set->flags = BLK_MQ_F_SHOULD_MERGE;
>   if (g_no_sched)
>   set->flags |= BLK_MQ_F_NO_SCHED;
> 
That's right, I go through these two flags, if no io scheduler,
BLK_MQ_F_SHOULD_MERGE can make sw ctx merge happen. I will send V2.

Thanks
weiping


[PATCH v4] blk-mq: fix nr_requests wrong value when modify it from sysfs

2017-09-22 Thread weiping zhang
if blk-mq use "none" io scheduler, nr_request get a wrong value when
input a number > tag_set->queue_depth. blk_mq_tag_update_depth will get
the smaller one min(nr, set->queue_depth), and then q->nr_request get a
wrong value.

Reproduce:

echo none > /sys/block/nvme0n1/queue/scheduler
echo 100 > /sys/block/nvme0n1/queue/nr_requests
cat /sys/block/nvme0n1/queue/nr_requests
1000000

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---

Changes since v4:
 * fix typo in commit message(queue/ioscheduler => queue/scheduler)

Changes since v3:
 * remove compare nr with tags->qdepth, pass nr to blk_mq_tag_update_depth
directly

 * remove return EINVAL when user modify nr_request less than BLKDEV_MIN_RQ

Changes since v2:
 * add return EINVAL when user modify nr_request less than BLKDEV_MIN_RQ
 * remove pr_warn, and return EINVAL, if input number is too large

 block/blk-mq.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 98a1860..491e336 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2642,8 +2642,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, 
unsigned int nr)
 * queue depth. This is similar to what the old code would do.
 */
if (!hctx->sched_tags) {
-   ret = blk_mq_tag_update_depth(hctx, >tags,
-   min(nr, 
set->queue_depth),
+   ret = blk_mq_tag_update_depth(hctx, >tags, nr,
false);
} else {
ret = blk_mq_tag_update_depth(hctx, >sched_tags,
-- 
2.9.4



Re: [PATCH v2 1/2] blk-mq: fix nr_requests wrong value when modify it from sysfs

2017-09-21 Thread weiping zhang
On Thu, Sep 21, 2017 at 10:37:48AM -0600, Jens Axboe wrote:
> On 09/21/2017 09:17 AM, weiping zhang wrote:
> > if blk-mq use "none" io scheduler, nr_request get a wrong value when
> > input a number > tag_set->queue_depth. blk_mq_tag_update_depth will get
> > the smaller one min(nr, set->queue_depth), and then q->nr_request get a
> > wrong value.
> > 
> > Reproduce:
> > 
> > echo none > /sys/block/nvme0n1/queue/ioscheduler
> > echo 100 > /sys/block/nvme0n1/queue/nr_requests
> > cat /sys/block/nvme0n1/queue/nr_requests
> > 100
> > 
> > Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> > ---
> >  block/blk-mq.c | 8 ++--
> >  1 file changed, 6 insertions(+), 2 deletions(-)
> > 
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 98a1860..479c35a 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -2642,8 +2642,12 @@ int blk_mq_update_nr_requests(struct request_queue 
> > *q, unsigned int nr)
> >  * queue depth. This is similar to what the old code would do.
> >  */
> > if (!hctx->sched_tags) {
> > -   ret = blk_mq_tag_update_depth(hctx, >tags,
> > -   min(nr, 
> > set->queue_depth),
> > +   if (nr > set->queue_depth) {
> > +   ret = -EINVAL;
> > +   break;
> > +   }
> > +
> > +   ret = blk_mq_tag_update_depth(hctx, >tags, nr,
> > false);
> 
> What am I missing here? blk_mq_tag_update_depth() should already return
> -EINVAL for the case where we can't grow the tags. Looks like this patch
> should simply remove the min(nr, set->queue_depth) and just pass in 'nr'.
> Should not need the duplicated check for depth.
> 
Ya, you are right, I will send V3 later.

Thanks


[PATCH v3] blk-mq: fix nr_requests wrong value when modify it from sysfs

2017-09-21 Thread weiping zhang
if blk-mq use "none" io scheduler, nr_request get a wrong value when
input a number > tag_set->queue_depth. blk_mq_tag_update_depth will get
the smaller one min(nr, set->queue_depth), and then q->nr_request get a
wrong value.

Reproduce:

echo none > /sys/block/nvme0n1/queue/ioscheduler
echo 100 > /sys/block/nvme0n1/queue/nr_requests
cat /sys/block/nvme0n1/queue/nr_requests
1000000

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-mq.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 98a1860..491e336 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2642,8 +2642,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, 
unsigned int nr)
 * queue depth. This is similar to what the old code would do.
 */
if (!hctx->sched_tags) {
-   ret = blk_mq_tag_update_depth(hctx, >tags,
-   min(nr, 
set->queue_depth),
+   ret = blk_mq_tag_update_depth(hctx, >tags, nr,
false);
} else {
ret = blk_mq_tag_update_depth(hctx, >sched_tags,
-- 
2.9.4



Re: [PATCH v2 2/2] blk-sysfs: return EINVAL when user modify nr_request less than BLKDEV_MIN_RQ

2017-09-21 Thread weiping zhang
On Thu, Sep 21, 2017 at 10:38:22AM -0600, Jens Axboe wrote:
> On 09/21/2017 09:17 AM, weiping zhang wrote:
> > Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> > ---
> >  block/blk-sysfs.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> > index b8362c0..03a6e19 100644
> > --- a/block/blk-sysfs.c
> > +++ b/block/blk-sysfs.c
> > @@ -75,7 +75,7 @@ queue_requests_store(struct request_queue *q, const char 
> > *page, size_t count)
> > return ret;
> >  
> > if (nr < BLKDEV_MIN_RQ)
> > -   nr = BLKDEV_MIN_RQ;
> > +   return -EINVAL;
> 
> This is potentially breaking existing scripts.
> 
I just want this keep same behavior with the too larger case, If this
may make other side effect, I drop this patch.

Thanks
weiping


[PATCH v2 2/2] blk-sysfs: return EINVAL when user modify nr_request less than BLKDEV_MIN_RQ

2017-09-21 Thread weiping zhang
Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-sysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b8362c0..03a6e19 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -75,7 +75,7 @@ queue_requests_store(struct request_queue *q, const char 
*page, size_t count)
return ret;
 
if (nr < BLKDEV_MIN_RQ)
-   nr = BLKDEV_MIN_RQ;
+   return -EINVAL;
 
if (q->request_fn)
err = blk_update_nr_requests(q, nr);
-- 
2.9.4



[PATCH v2 1/2] blk-mq: fix nr_requests wrong value when modify it from sysfs

2017-09-21 Thread weiping zhang
if blk-mq use "none" io scheduler, nr_request get a wrong value when
input a number > tag_set->queue_depth. blk_mq_tag_update_depth will get
the smaller one min(nr, set->queue_depth), and then q->nr_request get a
wrong value.

Reproduce:

echo none > /sys/block/nvme0n1/queue/ioscheduler
echo 100 > /sys/block/nvme0n1/queue/nr_requests
cat /sys/block/nvme0n1/queue/nr_requests
1000000

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-mq.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 98a1860..479c35a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2642,8 +2642,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, 
unsigned int nr)
 * queue depth. This is similar to what the old code would do.
 */
if (!hctx->sched_tags) {
-   ret = blk_mq_tag_update_depth(hctx, >tags,
-   min(nr, 
set->queue_depth),
+   if (nr > set->queue_depth) {
+   ret = -EINVAL;
+   break;
+   }
+
+   ret = blk_mq_tag_update_depth(hctx, >tags, nr,
false);
} else {
ret = blk_mq_tag_update_depth(hctx, >sched_tags,
-- 
2.9.4



[PATCH v2 0/2] fix wrong value when user modify nr_request by sysfs

2017-09-21 Thread weiping zhang
Hi Jens,

This is v2 of fixing nr_request.

v1 -> v2:

blk-mq: fix nr_requests wrong value when modify it from sysfs
this patch return -EINVAL when user write a value that's too large.

blk-sysfs: return EINVAL when user modify nr_request less than
  BLKDEV_MIN_RQ
In order to keep same behavior with former patch, also return EINVAL 
when user write a value less than BLKDEV_MIN_RQ

weiping zhang (2):
  blk-mq: fix nr_requests wrong value when modify it from sysfs
  blk-sysfs: return EINVAL when user modify nr_request less than
BLKDEV_MIN_RQ

 block/blk-mq.c| 8 ++--
 block/blk-sysfs.c | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

-- 
2.9.4



Re: [PATCH] blk-mq: fix nr_requests wrong value when modify it from sysfs

2017-09-21 Thread weiping zhang
On Thu, Sep 21, 2017 at 08:09:47AM -0600, Jens Axboe wrote:
> On 09/21/2017 07:03 AM, weiping zhang wrote:
> > On Tue, Sep 12, 2017 at 09:57:32PM +0800, weiping zhang wrote:
> >> On Wed, Sep 06, 2017 at 01:00:44PM +, Bart Van Assche wrote:
> >>> On Wed, 2017-09-06 at 15:34 +0800, weiping zhang wrote:
> >>>> On Tue, Sep 05, 2017 at 03:42:45PM +, Bart Van Assche wrote:
> >>>>> On Sun, 2017-09-03 at 21:46 +0800, weiping zhang wrote:
> >>>>>> if blk-mq use "none" io scheduler, nr_request get a wrong value when
> >>>>>> input a number > tag_set->queue_depth. blk_mq_tag_update_depth will get
> >>>>>> the smaller one min(nr, set->queue_depth), and then q->nr_request get a
> >>>>>> wrong value.
> >>>>>>
> >>>>>> Reproduce:
> >>>>>>
> >>>>>> echo none > /sys/block/nvme0n1/queue/ioscheduler
> >>>>>> echo 100 > /sys/block/nvme0n1/queue/nr_requests
> >>>>>> cat /sys/block/nvme0n1/queue/nr_requests
> >>>>>> 100
> >>>>>>
> >>>>>> Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> >>>>>> ---
> >>>>>>  block/blk-mq.c | 7 +--
> >>>>>>  1 file changed, 5 insertions(+), 2 deletions(-)
> >>>>>>
> >>>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
> >>>>>> index f84d145..8303e5e 100644
> >>>>>> --- a/block/blk-mq.c
> >>>>>> +++ b/block/blk-mq.c
> >>>>>> @@ -2622,8 +2622,11 @@ int blk_mq_update_nr_requests(struct 
> >>>>>> request_queue *q, unsigned int nr)
> >>>>>> * queue depth. This is similar to what the old code 
> >>>>>> would do.
> >>>>>> */
> >>>>>>if (!hctx->sched_tags) {
> >>>>>> -  ret = blk_mq_tag_update_depth(hctx, >tags,
> >>>>>> -  min(nr, 
> >>>>>> set->queue_depth),
> >>>>>> +  if (nr > set->queue_depth) {
> >>>>>> +  nr = set->queue_depth;
> >>>>>> +  pr_warn("reduce nr_request to %u\n", 
> >>>>>> nr);
> >>>>>> +  }
> >>>>>> +  ret = blk_mq_tag_update_depth(hctx, 
> >>>>>> >tags, nr,
> >>>>>>false);
> >>>>>>} else {
> >>>>>>ret = blk_mq_tag_update_depth(hctx, 
> >>>>>> >sched_tags,
> >>>>>
> >>>>> Shouldn't this code return -EINVAL or -ERANGE if 'nr' is too large? 
> >>>>> That will help to
> >>>>> keep user space code simple that updates the queue depth.
> >>>>
> >>>> Hi Bart,
> >>>>
> >>>> The reason why not return -EINVAL is keeping alin with minimum checking 
> >>>> in queue_requests_store,
> >>>> if you insist return -EINVAL/-ERANGE, minimum checking should also keep
> >>>> same behavior. Both return error meesage and quietly changing are okey
> >>>> for me. Which way do you prefer ?
> >>>>
> >>>> static ssize_t
> >>>> queue_requests_store(struct request_queue *q, const char *page, size_t 
> >>>> count)
> >>>> {
> >>>>  unsigned long nr;
> >>>>  int ret, err;
> >>>>
> >>>>  if (!q->request_fn && !q->mq_ops)
> >>>>  return -EINVAL;
> >>>>
> >>>>  ret = queue_var_store(, page, count);
> >>>>  if (ret < 0)
> >>>>  return ret;
> >>>>
> >>>>  if (nr < BLKDEV_MIN_RQ)
> >>>>  nr = BLKDEV_MIN_RQ;
> >>>
> >>> Hello Jens,
> >>>
> >>> Do you perhaps have a preference for one of the approaches that have been 
> >>> discussed
> >>> in this e-mail thread?
> >>>
> >>> Thanks,
> >>>
> >>> Bart.
> >>
> > Hello Jens,
> > 
> > Would you please give some comments about this patch,
> 
> If someone writes a value that's too large, return -EINVAL and
> don't set it. Don't add weird debug printks.
> 
> 
OK, I send patch V2 soon.


Re: [PATCH] blk-mq: fix nr_requests wrong value when modify it from sysfs

2017-09-21 Thread weiping zhang
On Tue, Sep 12, 2017 at 09:57:32PM +0800, weiping zhang wrote:
> On Wed, Sep 06, 2017 at 01:00:44PM +, Bart Van Assche wrote:
> > On Wed, 2017-09-06 at 15:34 +0800, weiping zhang wrote:
> > > On Tue, Sep 05, 2017 at 03:42:45PM +, Bart Van Assche wrote:
> > > > On Sun, 2017-09-03 at 21:46 +0800, weiping zhang wrote:
> > > > > if blk-mq use "none" io scheduler, nr_request get a wrong value when
> > > > > input a number > tag_set->queue_depth. blk_mq_tag_update_depth will 
> > > > > get
> > > > > the smaller one min(nr, set->queue_depth), and then q->nr_request get 
> > > > > a
> > > > > wrong value.
> > > > > 
> > > > > Reproduce:
> > > > > 
> > > > > echo none > /sys/block/nvme0n1/queue/ioscheduler
> > > > > echo 100 > /sys/block/nvme0n1/queue/nr_requests
> > > > > cat /sys/block/nvme0n1/queue/nr_requests
> > > > > 100
> > > > > 
> > > > > Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> > > > > ---
> > > > >  block/blk-mq.c | 7 +--
> > > > >  1 file changed, 5 insertions(+), 2 deletions(-)
> > > > > 
> > > > > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > > > > index f84d145..8303e5e 100644
> > > > > --- a/block/blk-mq.c
> > > > > +++ b/block/blk-mq.c
> > > > > @@ -2622,8 +2622,11 @@ int blk_mq_update_nr_requests(struct 
> > > > > request_queue *q, unsigned int nr)
> > > > >* queue depth. This is similar to what the old code 
> > > > > would do.
> > > > >*/
> > > > >   if (!hctx->sched_tags) {
> > > > > - ret = blk_mq_tag_update_depth(hctx, >tags,
> > > > > - min(nr, 
> > > > > set->queue_depth),
> > > > > + if (nr > set->queue_depth) {
> > > > > + nr = set->queue_depth;
> > > > > + pr_warn("reduce nr_request to %u\n", 
> > > > > nr);
> > > > > + }
> > > > > + ret = blk_mq_tag_update_depth(hctx, 
> > > > > >tags, nr,
> > > > >   false);
> > > > >   } else {
> > > > >   ret = blk_mq_tag_update_depth(hctx, 
> > > > > >sched_tags,
> > > > 
> > > > Shouldn't this code return -EINVAL or -ERANGE if 'nr' is too large? 
> > > > That will help to
> > > > keep user space code simple that updates the queue depth.
> > > 
> > > Hi Bart,
> > > 
> > > The reason why not return -EINVAL is keeping alin with minimum checking 
> > > in queue_requests_store,
> > > if you insist return -EINVAL/-ERANGE, minimum checking should also keep
> > > same behavior. Both return error meesage and quietly changing are okey
> > > for me. Which way do you prefer ?
> > > 
> > > static ssize_t
> > > queue_requests_store(struct request_queue *q, const char *page, size_t 
> > > count)
> > > {
> > >   unsigned long nr;
> > >   int ret, err;
> > > 
> > >   if (!q->request_fn && !q->mq_ops)
> > >   return -EINVAL;
> > > 
> > >   ret = queue_var_store(, page, count);
> > >   if (ret < 0)
> > >   return ret;
> > > 
> > >   if (nr < BLKDEV_MIN_RQ)
> > >   nr = BLKDEV_MIN_RQ;
> > 
> > Hello Jens,
> > 
> > Do you perhaps have a preference for one of the approaches that have been 
> > discussed
> > in this e-mail thread?
> > 
> > Thanks,
> > 
> > Bart.
> 
Hello Jens,

Would you please give some comments about this patch,

Thanks
Weiping.


Re: [PATCH v2 0/2] Add wrapper for blkcg policy operatins

2017-09-15 Thread weiping zhang
On Fri, Sep 01, 2017 at 10:16:45PM +0800, weiping zhang wrote:
> The first patch is the V2 of [PATCH] blkcg: check pol->cpd_free_fn
> before free cpd, it fixs a checking before free cpd.
> 
> The second patch add some wrappers for struct blkcg_policy->xxx_fn, because 
> not
> every block cgroup policy implement all operations.
> 
> weiping zhang (2):
>   blkcg: check pol->cpd_free_fn before free cpd
>   blkcg: add wrappers for struct blkcg_policy operations
> 
>  block/blk-cgroup.c | 57 +---
>  include/linux/blk-cgroup.h | 72 
> ++
>  2 files changed, 99 insertions(+), 30 deletions(-)
> 
Hello Jens,

ping



Re: [PATCH] blk-mq: fix nr_requests wrong value when modify it from sysfs

2017-09-12 Thread weiping zhang
On Wed, Sep 06, 2017 at 01:00:44PM +, Bart Van Assche wrote:
> On Wed, 2017-09-06 at 15:34 +0800, weiping zhang wrote:
> > On Tue, Sep 05, 2017 at 03:42:45PM +, Bart Van Assche wrote:
> > > On Sun, 2017-09-03 at 21:46 +0800, weiping zhang wrote:
> > > > if blk-mq use "none" io scheduler, nr_request get a wrong value when
> > > > input a number > tag_set->queue_depth. blk_mq_tag_update_depth will get
> > > > the smaller one min(nr, set->queue_depth), and then q->nr_request get a
> > > > wrong value.
> > > > 
> > > > Reproduce:
> > > > 
> > > > echo none > /sys/block/nvme0n1/queue/ioscheduler
> > > > echo 100 > /sys/block/nvme0n1/queue/nr_requests
> > > > cat /sys/block/nvme0n1/queue/nr_requests
> > > > 100
> > > > 
> > > > Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> > > > ---
> > > >  block/blk-mq.c | 7 +--
> > > >  1 file changed, 5 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > > > index f84d145..8303e5e 100644
> > > > --- a/block/blk-mq.c
> > > > +++ b/block/blk-mq.c
> > > > @@ -2622,8 +2622,11 @@ int blk_mq_update_nr_requests(struct 
> > > > request_queue *q, unsigned int nr)
> > > >  * queue depth. This is similar to what the old code 
> > > > would do.
> > > >  */
> > > > if (!hctx->sched_tags) {
> > > > -   ret = blk_mq_tag_update_depth(hctx, >tags,
> > > > -   min(nr, 
> > > > set->queue_depth),
> > > > +   if (nr > set->queue_depth) {
> > > > +   nr = set->queue_depth;
> > > > +   pr_warn("reduce nr_request to %u\n", 
> > > > nr);
> > > > +   }
> > > > +   ret = blk_mq_tag_update_depth(hctx, 
> > > > >tags, nr,
> > > > false);
> > > > } else {
> > > > ret = blk_mq_tag_update_depth(hctx, 
> > > > >sched_tags,
> > > 
> > > Shouldn't this code return -EINVAL or -ERANGE if 'nr' is too large? That 
> > > will help to
> > > keep user space code simple that updates the queue depth.
> > 
> > Hi Bart,
> > 
> > The reason why not return -EINVAL is keeping alin with minimum checking in 
> > queue_requests_store,
> > if you insist return -EINVAL/-ERANGE, minimum checking should also keep
> > same behavior. Both return error meesage and quietly changing are okey
> > for me. Which way do you prefer ?
> > 
> > static ssize_t
> > queue_requests_store(struct request_queue *q, const char *page, size_t 
> > count)
> > {
> > unsigned long nr;
> > int ret, err;
> > 
> > if (!q->request_fn && !q->mq_ops)
> > return -EINVAL;
> > 
> > ret = queue_var_store(, page, count);
> > if (ret < 0)
> > return ret;
> > 
> > if (nr < BLKDEV_MIN_RQ)
> > nr = BLKDEV_MIN_RQ;
> 
> Hello Jens,
> 
> Do you perhaps have a preference for one of the approaches that have been 
> discussed
> in this e-mail thread?
> 
> Thanks,
> 
> Bart.

Hello Jens,

Would you please give some comments about this patch,

Thanks
Weiping.


Re: [PATCH] blk-mq: fix nr_requests wrong value when modify it from sysfs

2017-09-06 Thread weiping zhang
On Tue, Sep 05, 2017 at 03:42:45PM +, Bart Van Assche wrote:
> On Sun, 2017-09-03 at 21:46 +0800, weiping zhang wrote:
> > if blk-mq use "none" io scheduler, nr_request get a wrong value when
> > input a number > tag_set->queue_depth. blk_mq_tag_update_depth will get
> > the smaller one min(nr, set->queue_depth), and then q->nr_request get a
> > wrong value.
> > 
> > Reproduce:
> > 
> > echo none > /sys/block/nvme0n1/queue/ioscheduler
> > echo 100 > /sys/block/nvme0n1/queue/nr_requests
> > cat /sys/block/nvme0n1/queue/nr_requests
> > 100
> > 
> > Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> > ---
> >  block/blk-mq.c | 7 +--
> >  1 file changed, 5 insertions(+), 2 deletions(-)
> > 
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index f84d145..8303e5e 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -2622,8 +2622,11 @@ int blk_mq_update_nr_requests(struct request_queue 
> > *q, unsigned int nr)
> >  * queue depth. This is similar to what the old code would do.
> >  */
> > if (!hctx->sched_tags) {
> > -   ret = blk_mq_tag_update_depth(hctx, >tags,
> > -   min(nr, 
> > set->queue_depth),
> > +   if (nr > set->queue_depth) {
> > +   nr = set->queue_depth;
> > +   pr_warn("reduce nr_request to %u\n", nr);
> > +   }
> > +   ret = blk_mq_tag_update_depth(hctx, >tags, nr,
> > false);
> > } else {
> > ret = blk_mq_tag_update_depth(hctx, >sched_tags,
> 
> Shouldn't this code return -EINVAL or -ERANGE if 'nr' is too large? That will 
> help to
> keep user space code simple that updates the queue depth.

Hi Bart,

The reason why not return -EINVAL is keeping alin with minimum checking in 
queue_requests_store,
if you insist return -EINVAL/-ERANGE, minimum checking should also keep
same behavior. Both return error meesage and quietly changing are okey
for me. Which way do you prefer ?

static ssize_t
queue_requests_store(struct request_queue *q, const char *page, size_t count)
{
unsigned long nr;
int ret, err;

if (!q->request_fn && !q->mq_ops)
return -EINVAL;

ret = queue_var_store(, page, count);
if (ret < 0)
return ret;

if (nr < BLKDEV_MIN_RQ)
nr = BLKDEV_MIN_RQ;

Thanks


[PATCH] blk-mq: fix nr_requests wrong value when modify it from sysfs

2017-09-03 Thread weiping zhang
if blk-mq use "none" io scheduler, nr_request get a wrong value when
input a number > tag_set->queue_depth. blk_mq_tag_update_depth will get
the smaller one min(nr, set->queue_depth), and then q->nr_request get a
wrong value.

Reproduce:

echo none > /sys/block/nvme0n1/queue/ioscheduler
echo 100 > /sys/block/nvme0n1/queue/nr_requests
cat /sys/block/nvme0n1/queue/nr_requests
1000000

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-mq.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f84d145..8303e5e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2622,8 +2622,11 @@ int blk_mq_update_nr_requests(struct request_queue *q, 
unsigned int nr)
 * queue depth. This is similar to what the old code would do.
 */
if (!hctx->sched_tags) {
-   ret = blk_mq_tag_update_depth(hctx, >tags,
-   min(nr, 
set->queue_depth),
+   if (nr > set->queue_depth) {
+   nr = set->queue_depth;
+   pr_warn("reduce nr_request to %u\n", nr);
+   }
+   ret = blk_mq_tag_update_depth(hctx, >tags, nr,
false);
} else {
ret = blk_mq_tag_update_depth(hctx, >sched_tags,
-- 
2.9.4



Re: [PATCH 3/5] bfq: Check kstrtoul() return value

2017-09-01 Thread weiping zhang
> Sorry but I do not like this proposal because:
> * If invalid input is provided writing into a sysfs attribute should fail
>   instead of ignoring the invalid input silently.
> * simple_strtoul() is considered obsolete and must not be used in new code.
>   From include/linux/kernel.h:
>
> /* Obsolete, do not use.  Use kstrto instead */
>
> extern unsigned long simple_strtoul(const char *,char **,unsigned int);
> extern long simple_strtol(const char *,char **,unsigned int);
> extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
> extern long long simple_strtoll(const char *,char **,unsigned int);
>

Hello Bart,

Agree with you, it seems more reasonable to give error message to user.

Thanks


Re: [PATCH 3/5] bfq: Check kstrtoul() return value

2017-09-01 Thread weiping zhang
2017-09-02 1:14 GMT+08:00 Paolo Valente :
>
>> Il giorno 30 ago 2017, alle ore 20:42, Bart Van Assche 
>>  ha scritto:
>>
>> Make sysfs writes fail for invalid numbers instead of storing
>> uninitialized data copied from the stack. This patch removes
>> all uninitialized_var() occurrences from the BFQ source code.
>>
>> Signed-off-by: Bart Van Assche 
>> Cc: Paolo Valente 
>
> Acked-by: Paolo Valente 
>

Hi Bart,

how about using simple_strtoul  which was used in cfq/mq-iosched.c
*var = simple_strtoul(p, , 10);

if invalid string came from sysfs, this function just return 0,
and there are validations after every calling bfq_var_store.

Thanks


[PATCH v2 2/2] blkcg: add wrappers for struct blkcg_policy operations

2017-09-01 Thread weiping zhang
Some blkcg policies may not implement all operations in struct blkcg_policy,
add wrappers for these pol->xxx_fn.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-cgroup.c | 53 --
 include/linux/blk-cgroup.h | 72 ++
 2 files changed, 97 insertions(+), 28 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0c45870..8588739 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -71,7 +71,7 @@ static void blkg_free(struct blkcg_gq *blkg)
 
for (i = 0; i < BLKCG_MAX_POLS; i++)
if (blkg->pd[i])
-   blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
+   blkg_pol_free_pd(blkcg_policy[i], blkg->pd[i]);
 
if (blkg->blkcg != _root)
blk_exit_rl(blkg->q, >rl);
@@ -124,7 +124,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, 
struct request_queue *q,
continue;
 
/* alloc per-policy data and attach it to blkg */
-   pd = pol->pd_alloc_fn(gfp_mask, q->node);
+   pd = blkg_pol_alloc_pd(pol, gfp_mask, q->node);
if (!pd)
goto err_free;
 
@@ -218,8 +218,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
 
-   if (blkg->pd[i] && pol->pd_init_fn)
-   pol->pd_init_fn(blkg->pd[i]);
+   if (blkg->pd[i])
+   blkg_pol_init_pd(pol, blkg->pd[i]);
}
 
/* insert */
@@ -232,8 +232,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
 
-   if (blkg->pd[i] && pol->pd_online_fn)
-   pol->pd_online_fn(blkg->pd[i]);
+   if (blkg->pd[i])
+   blkg_pol_online_pd(pol, blkg->pd[i]);
}
}
blkg->online = true;
@@ -323,8 +323,8 @@ static void blkg_destroy(struct blkcg_gq *blkg)
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
 
-   if (blkg->pd[i] && pol->pd_offline_fn)
-   pol->pd_offline_fn(blkg->pd[i]);
+   if (blkg->pd[i])
+   blkg_pol_offline_pd(pol, blkg->pd[i]);
}
 
if (parent) {
@@ -457,8 +457,8 @@ static int blkcg_reset_stats(struct cgroup_subsys_state 
*css,
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
 
-   if (blkg->pd[i] && pol->pd_reset_stats_fn)
-   pol->pd_reset_stats_fn(blkg->pd[i]);
+   if (blkg->pd[i])
+   blkg_pol_reset_pd_stats(pol, blkg->pd[i]);
}
}
 
@@ -1045,7 +1045,7 @@ static void blkcg_css_free(struct cgroup_subsys_state 
*css)
 
for (i = 0; i < BLKCG_MAX_POLS; i++)
if (blkcg->cpd[i])
-   blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
+   blkcg_pol_free_cpd(blkcg_policy[i], blkcg->cpd[i]);
 
mutex_unlock(_pol_mutex);
 
@@ -1084,7 +1084,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
if (!pol || !pol->cpd_alloc_fn)
continue;
 
-   cpd = pol->cpd_alloc_fn(GFP_KERNEL);
+   cpd = blkcg_pol_alloc_cpd(pol, GFP_KERNEL);
if (!cpd) {
ret = ERR_PTR(-ENOMEM);
goto free_pd_blkcg;
@@ -1092,8 +1092,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
blkcg->cpd[i] = cpd;
cpd->blkcg = blkcg;
cpd->plid = i;
-   if (pol->cpd_init_fn)
-   pol->cpd_init_fn(cpd);
+   blkcg_pol_init_cpd(pol, cpd);
}
 
spin_lock_init(>lock);
@@ -1110,7 +1109,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 free_pd_blkcg:
for (i--; i >= 0; i--)
if (blkcg->cpd[i])
-   blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
+   blkcg_pol_free_cpd(blkcg_policy[i], blkcg->cpd[i]);
 free_blkcg:
kfree(blkcg);
mutex_unlock(_pol_mutex);
@@ -1244,7 +1243,7 @@ static void blkcg_bind(struct cgroup_subsys_state 
*root_css)
 
list_for_each_entry(blkcg, _blkcgs, all_blkcgs_node)
if (blkcg->cpd[pol->plid])
-   

[PATCH v2 1/2] blkcg: check pol->cpd_free_fn before free cpd

2017-09-01 Thread weiping zhang
check pol->cpd_free_fn() instead of pol->cpd_alloc_fn() when free cpd.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0480892..0c45870 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1450,7 +1450,7 @@ int blkcg_policy_register(struct blkcg_policy *pol)
return 0;
 
 err_free_cpds:
-   if (pol->cpd_alloc_fn) {
+   if (pol->cpd_free_fn) {
list_for_each_entry(blkcg, _blkcgs, all_blkcgs_node) {
if (blkcg->cpd[pol->plid]) {
pol->cpd_free_fn(blkcg->cpd[pol->plid]);
@@ -1490,7 +1490,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
/* remove cpds and unregister */
mutex_lock(_pol_mutex);
 
-   if (pol->cpd_alloc_fn) {
+   if (pol->cpd_free_fn) {
list_for_each_entry(blkcg, _blkcgs, all_blkcgs_node) {
if (blkcg->cpd[pol->plid]) {
pol->cpd_free_fn(blkcg->cpd[pol->plid]);
-- 
2.9.4



[PATCH v2 0/2] Add wrapper for blkcg policy operatins

2017-09-01 Thread weiping zhang
The first patch is the V2 of [PATCH] blkcg: check pol->cpd_free_fn
before free cpd, it fixs a checking before free cpd.

The second patch add some wrappers for struct blkcg_policy->xxx_fn, because not
every block cgroup policy implement all operations.

weiping zhang (2):
  blkcg: check pol->cpd_free_fn before free cpd
  blkcg: add wrappers for struct blkcg_policy operations

 block/blk-cgroup.c | 57 +---
 include/linux/blk-cgroup.h | 72 ++
 2 files changed, 99 insertions(+), 30 deletions(-)

-- 
2.9.4



[PATCH] blkcg: check pol->cpd_free_fn before free cpd

2017-08-29 Thread weiping zhang
check pol->cpd_free_fn() instead of pol->cpd_alloc_fn() when free cpd.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-cgroup.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0480892..adcbc3e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1044,7 +1044,7 @@ static void blkcg_css_free(struct cgroup_subsys_state 
*css)
list_del(>all_blkcgs_node);
 
for (i = 0; i < BLKCG_MAX_POLS; i++)
-   if (blkcg->cpd[i])
+   if (blkcg->cpd[i] && blkcg_policy[i]->cpd_free_fn)
blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
 
mutex_unlock(_pol_mutex);
@@ -1109,7 +1109,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 
 free_pd_blkcg:
for (i--; i >= 0; i--)
-   if (blkcg->cpd[i])
+   if (blkcg->cpd[i] && blkcg_policy[i]->cpd_free_fn)
blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
 free_blkcg:
kfree(blkcg);
@@ -1450,7 +1450,7 @@ int blkcg_policy_register(struct blkcg_policy *pol)
return 0;
 
 err_free_cpds:
-   if (pol->cpd_alloc_fn) {
+   if (pol->cpd_free_fn) {
list_for_each_entry(blkcg, _blkcgs, all_blkcgs_node) {
if (blkcg->cpd[pol->plid]) {
pol->cpd_free_fn(blkcg->cpd[pol->plid]);
@@ -1490,7 +1490,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
/* remove cpds and unregister */
mutex_lock(_pol_mutex);
 
-   if (pol->cpd_alloc_fn) {
+   if (pol->cpd_free_fn) {
list_for_each_entry(blkcg, _blkcgs, all_blkcgs_node) {
if (blkcg->cpd[pol->plid]) {
pol->cpd_free_fn(blkcg->cpd[pol->plid]);
-- 
2.9.4



Re: [PATCH] block, scheduler: convert xxx_var_store to void

2017-08-28 Thread weiping zhang
On Mon, Aug 28, 2017 at 10:00:46AM -0600, Jens Axboe wrote:
> On 08/28/2017 06:22 AM, weiping zhang wrote:
> > On Fri, Aug 25, 2017 at 01:11:33AM +0800, weiping zhang wrote:
> >> The last parameter "count" never be used in xxx_var_store,
> >> convert these functions to void.
> >>
> >
> > Would you please look this misc patch at your convenience ?
> 
> Looks fine. But please don't send reminders so shortly after sending a
> patch, especially when it's just a cleanup. This was received Thursday.
> 
> -- 
> Jens Axboe
> 

Got it, Thanks! ^_^


Re: [PATCH] block, scheduler: convert xxx_var_store to void

2017-08-28 Thread weiping zhang
On Fri, Aug 25, 2017 at 01:11:33AM +0800, weiping zhang wrote:
> The last parameter "count" never be used in xxx_var_store,
> convert these functions to void.
> 
> Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
> ---
>  block/bfq-iosched.c  | 33 +
>  block/cfq-iosched.c  | 13 ++---
>  block/deadline-iosched.c |  9 -
>  block/mq-deadline.c  |  9 -
>  4 files changed, 31 insertions(+), 33 deletions(-)
> 
> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
> index 436b6ca..7a4085d 100644
> --- a/block/bfq-iosched.c
> +++ b/block/bfq-iosched.c
> @@ -4787,16 +4787,13 @@ static ssize_t bfq_var_show(unsigned int var, char 
> *page)
>   return sprintf(page, "%u\n", var);
>  }
>  
> -static ssize_t bfq_var_store(unsigned long *var, const char *page,
> -  size_t count)
> +static void bfq_var_store(unsigned long *var, const char *page)
>  {
>   unsigned long new_val;
>   int ret = kstrtoul(page, 10, _val);
>  
>   if (ret == 0)
>   *var = new_val;
> -
> - return count;
>  }
>  
>  #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
> @@ -4838,7 +4835,7 @@ __FUNC(struct elevator_queue *e, const char *page, 
> size_t count)\
>  {\
>   struct bfq_data *bfqd = e->elevator_data;   \
>   unsigned long uninitialized_var(__data);\
> - int ret = bfq_var_store(&__data, (page), count);\
> + bfq_var_store(&__data, (page)); \
>   if (__data < (MIN)) \
>   __data = (MIN); \
>   else if (__data > (MAX))\
> @@ -4849,7 +4846,7 @@ __FUNC(struct elevator_queue *e, const char *page, 
> size_t count)\
>   *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
>   else\
>   *(__PTR) = __data;  \
> - return ret; \
> + return count;   \
>  }


Hi Jens,

Would you please look this misc patch at your convenience ?

Thanks



[PATCH] blkcg: avoid free blkcg_root when failed to alloc blkcg policy

2017-08-25 Thread weiping zhang
this patch fix two errors, firstly avoid kfree blk_root, secondly not
free(blkcg) ,if blkcg alloc fail(blkcg == NULL), just unlock that mutex;

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/blk-cgroup.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0480892..d3f56ba 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1067,7 +1067,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
if (!blkcg) {
ret = ERR_PTR(-ENOMEM);
-   goto free_blkcg;
+   goto unlock;
}
}
 
@@ -,8 +,10 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
for (i--; i >= 0; i--)
if (blkcg->cpd[i])
blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
-free_blkcg:
-   kfree(blkcg);
+
+   if (blkcg != _root)
+   kfree(blkcg);
+unlock:
mutex_unlock(_pol_mutex);
return ret;
 }
-- 
2.9.4



[PATCH] block, scheduler: convert xxx_var_store to void

2017-08-24 Thread weiping zhang
The last parameter "count" never be used in xxx_var_store,
convert these functions to void.

Signed-off-by: weiping zhang <zhangweip...@didichuxing.com>
---
 block/bfq-iosched.c  | 33 +
 block/cfq-iosched.c  | 13 ++---
 block/deadline-iosched.c |  9 -
 block/mq-deadline.c  |  9 -
 4 files changed, 31 insertions(+), 33 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 436b6ca..7a4085d 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -4787,16 +4787,13 @@ static ssize_t bfq_var_show(unsigned int var, char 
*page)
return sprintf(page, "%u\n", var);
 }
 
-static ssize_t bfq_var_store(unsigned long *var, const char *page,
-size_t count)
+static void bfq_var_store(unsigned long *var, const char *page)
 {
unsigned long new_val;
int ret = kstrtoul(page, 10, _val);
 
if (ret == 0)
*var = new_val;
-
-   return count;
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)   \
@@ -4838,7 +4835,7 @@ __FUNC(struct elevator_queue *e, const char *page, size_t 
count)  \
 {  \
struct bfq_data *bfqd = e->elevator_data;   \
unsigned long uninitialized_var(__data);\
-   int ret = bfq_var_store(&__data, (page), count);\
+   bfq_var_store(&__data, (page)); \
if (__data < (MIN)) \
__data = (MIN); \
else if (__data > (MAX))\
@@ -4849,7 +4846,7 @@ __FUNC(struct elevator_queue *e, const char *page, size_t 
count)  \
*(__PTR) = (u64)__data * NSEC_PER_MSEC; \
else\
*(__PTR) = __data;  \
-   return ret; \
+   return count;   \
 }
 STORE_FUNCTION(bfq_fifo_expire_sync_store, >bfq_fifo_expire[1], 1,
INT_MAX, 2);
@@ -4866,13 +4863,13 @@ static ssize_t __FUNC(struct elevator_queue *e, const 
char *page, size_t count)\
 {  \
struct bfq_data *bfqd = e->elevator_data;   \
unsigned long uninitialized_var(__data);\
-   int ret = bfq_var_store(&__data, (page), count);\
+   bfq_var_store(&__data, (page)); \
if (__data < (MIN)) \
__data = (MIN); \
else if (__data > (MAX))\
__data = (MAX); \
*(__PTR) = (u64)__data * NSEC_PER_USEC; \
-   return ret; \
+   return count;   \
 }
 USEC_STORE_FUNCTION(bfq_slice_idle_us_store, >bfq_slice_idle, 0,
UINT_MAX);
@@ -4883,7 +4880,8 @@ static ssize_t bfq_max_budget_store(struct elevator_queue 
*e,
 {
struct bfq_data *bfqd = e->elevator_data;
unsigned long uninitialized_var(__data);
-   int ret = bfq_var_store(&__data, (page), count);
+
+   bfq_var_store(&__data, (page));
 
if (__data == 0)
bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
@@ -4895,7 +4893,7 @@ static ssize_t bfq_max_budget_store(struct elevator_queue 
*e,
 
bfqd->bfq_user_max_budget = __data;
 
-   return ret;
+   return count;
 }
 
 /*
@@ -4907,7 +4905,8 @@ static ssize_t bfq_timeout_sync_store(struct 
elevator_queue *e,
 {
struct bfq_data *bfqd = e->elevator_data;
unsigned long uninitialized_var(__data);
-   int ret = bfq_var_store(&__data, (page), count);
+
+   bfq_var_store(&__data, (page));
 
if (__data < 1)
__data = 1;
@@ -4918,7 +4917,7 @@ static ssize_t bfq_timeout_sync_store(struct 
elevator_queue *e,
if (bfqd->bfq_user_max_budget == 0)
bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
 
-   return ret;
+   return count;
 }
 
 static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
@@ -4926,7 +4925,8 @@ static ssize_t bfq_strict_guarantees_store(struct 
elevator_queue *e,
 {
struct bfq_data *bfqd = e->elevator_data;
unsigned long uninitialized_var(__data);
-   int ret = bfq_var_store(&__data, 

  1   2   >