[PATCH] blktests: Add '--outdir' to store results in a different directory

2018-07-17 Thread Hannes Reinecke
Adding an option '--outdir' to store results in a different
director so as not to clutter the git repository itself.

Signed-off-by: Hannes Reinecke 
---
 check | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/check b/check
index a635531..42d07f8 100755
--- a/check
+++ b/check
@@ -334,7 +334,7 @@ _call_test() {
fi
 
trap _cleanup EXIT
-   if ! TMPDIR="$(mktemp --tmpdir -p "$PWD/results" -d 
"tmpdir.${TEST_NAME//\//.}.XXX")"; then
+   if ! TMPDIR="$(mktemp --tmpdir -p "$RESULTS_DIR" -d 
"tmpdir.${TEST_NAME//\//.}.XXX")"; then
return
fi
 
@@ -415,7 +415,7 @@ _run_test() {
return 0
fi
 
-   RESULTS_DIR="results/nodev"
+   RESULTS_DIR="${OUT_DIR}/results/nodev"
_call_test test
else
if [[ ${#TEST_DEVS[@]} -eq 0 ]]; then
@@ -434,7 +434,7 @@ _run_test() {
_output_notrun "$TEST_NAME => $(basename 
"$TEST_DEV")"
continue
fi
-   RESULTS_DIR="results/$(basename "$TEST_DEV")"
+   RESULTS_DIR="${OUT_DIR}/results/$(basename "$TEST_DEV")"
if ! _call_test test_device; then
ret=1
fi
@@ -567,6 +567,7 @@ Test runs:
 tests to run
 
 Miscellaneous:
+  -o, --outdir=OUTDIRwrite results into the specified directory
   -h, --help display this help message and exit"
 
case "$1" in
@@ -581,12 +582,13 @@ Miscellaneous:
esac
 }
 
-if ! TEMP=$(getopt -o 'dq::x:h' --long 'quick::,exclude:,help' -n "$0" -- 
"$@"); then
+if ! TEMP=$(getopt -o 'dq::o:x:h' --long 'quick::,exclude:,outdir:,help' -n 
"$0" -- "$@"); then
exit 1
 fi
 
 eval set -- "$TEMP"
 unset TEMP
+OUT_DIR="."
 
 if [[ -r config ]]; then
# shellcheck disable=SC1091
@@ -629,6 +631,10 @@ while true; do
EXCLUDE+=("$2")
shift 2
;;
+   '-o'|'--outdir')
+   OUT_DIR="${2:-${OUT_DIR:-.}}"
+   shift 2
+   ;;
'-h'|'--help')
usage out
;;
-- 
2.12.3



[PATCH 1/2] blktests: enable ANA support

2018-07-17 Thread Hannes Reinecke
Update nvme functions to support ANA.

Signed-off-by: Hannes Reinecke 
---
 tests/nvme/rc | 49 ++---
 1 file changed, 46 insertions(+), 3 deletions(-)

diff --git a/tests/nvme/rc b/tests/nvme/rc
index fb5dbdf..116661d 100644
--- a/tests/nvme/rc
+++ b/tests/nvme/rc
@@ -27,7 +27,7 @@ group_device_requires() {
_test_dev_is_nvme
 }
 
-NVMET_CFS="/sys/kernel/config/nvmet/"
+NVMET_CFS="/sys/kernel/config/nvmet"
 
 _test_dev_is_nvme() {
if ! readlink -f "$TEST_DEV_SYSFS/device" | grep -q nvme; then
@@ -49,6 +49,7 @@ _create_nvmet_port() {
 
mkdir "${NVMET_CFS}/ports/${port}"
echo "${trtype}" > "${NVMET_CFS}/ports/${port}/addr_trtype"
+   echo "${port}" > "${NVMET_CFS}/ports/${port}/addr_traddr"
 
echo "${port}"
 }
@@ -58,6 +59,39 @@ _remove_nvmet_port() {
rmdir "${NVMET_CFS}/ports/${port}"
 }
 
+_create_nvmet_anagroup() {
+   local port="$1"
+   local port_cfs="${NVMET_CFS}/ports/${port}"
+   local grpid
+
+   for ((grpid = 1; ; grpid++)); do
+   if [[ ! -e "${port_cfs}/ana_groups/${grpid}" ]]; then
+   break
+   fi
+   done
+
+   mkdir "${port_cfs}/ana_groups/${grpid}"
+
+   echo "${grpid}"
+}
+
+_remove_nvmet_anagroup() {
+   local port="$1"
+   local grpid="$2"
+   local ana_cfs="${NVMET_CFS}/ports/${port}/ana_groups/${grpid}"
+
+   rmdir "${ana_cfs}"
+}
+
+_set_nvmet_anagroup_state() {
+   local port="$1"
+   local grpid="$2"
+   local state="$3"
+   local ana_cfs="${NVMET_CFS}/ports/${port}/ana_groups/${grpid}"
+
+   echo "${state}" > "${ana_cfs}/ana_state"
+}
+
 _create_nvmet_ns() {
local nvmet_subsystem="$1"
local nsid="$2"
@@ -65,14 +99,22 @@ _create_nvmet_ns() {
local uuid="----"
local subsys_path="${NVMET_CFS}/subsystems/${nvmet_subsystem}"
local ns_path="${subsys_path}/namespaces/${nsid}"
+   local ana_grpid
 
-   if [[ $# -eq 4 ]]; then
+   if [[ $# -ge 4 ]]; then
uuid="$4"
fi
 
+   if [[ $# -eq 5 ]]; then
+   ana_grpid="$5"
+   fi
+
mkdir "${ns_path}"
printf "%s" "${blkdev}" > "${ns_path}/device_path"
printf "%s" "${uuid}" > "${ns_path}/device_uuid"
+   if [ -n "${ana_grpid}" ] ; then
+   printf "%s" "${ana_grpid}" > "${ns_path}/ana_grpid"
+   fi
printf 1 > "${ns_path}/enable"
 }
 
@@ -80,11 +122,12 @@ _create_nvmet_subsystem() {
local nvmet_subsystem="$1"
local blkdev="$2"
local uuid=$3
+   local ana_grpid=$4
local cfs_path="${NVMET_CFS}/subsystems/${nvmet_subsystem}"
 
mkdir -p "${cfs_path}"
echo 1 > "${cfs_path}/attr_allow_any_host"
-   _create_nvmet_ns "${nvmet_subsystem}" "1" "${blkdev}" "${uuid}"
+   _create_nvmet_ns "${nvmet_subsystem}" "1" "${blkdev}" "${uuid}" 
"${ana_grpid}"
 }
 
 _remove_nvmet_ns() {
-- 
2.12.3



[PATCH 0/2] blktests: test ANA base support

2018-07-17 Thread Hannes Reinecke
Hi all,

here's a small patchset for testing ANA base support.
The test itself requires the ANA patchset from Christoph, plus
the fixes I've sent to the mailing list earlier.

Hannes Reinecke (2):
  blktests: enable ANA support
  blktests: add test for ANA state transition

 tests/nvme/014 | 158 +
 tests/nvme/014.out |  17 ++
 tests/nvme/rc  |  49 -
 3 files changed, 221 insertions(+), 3 deletions(-)
 create mode 100755 tests/nvme/014
 create mode 100644 tests/nvme/014.out

-- 
2.12.3



[PATCH 2/2] blktests: add test for ANA state transition

2018-07-17 Thread Hannes Reinecke
Signed-off-by: Hannes Reinecke 
---
 tests/nvme/014 | 158 +
 tests/nvme/014.out |  17 ++
 2 files changed, 175 insertions(+)
 create mode 100755 tests/nvme/014
 create mode 100644 tests/nvme/014.out

diff --git a/tests/nvme/014 b/tests/nvme/014
new file mode 100755
index 000..4b57229
--- /dev/null
+++ b/tests/nvme/014
@@ -0,0 +1,158 @@
+#!/bin/bash
+#
+# Regression test for ANA base support
+#
+# Copyright (C) 2018 Hannes Reinecke
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see .
+
+. tests/nvme/rc
+
+DESCRIPTION="test ANA optimized/transitioning/inaccessible support"
+QUICK=1
+
+switch_nvmet_anagroup() {
+   local port1="$1"
+   local port2="$2"
+   local mode="$3"
+
+   echo "ANA state ${mode}"
+
+   if [ "${mode}" = "change" ] ; then
+   _set_nvmet_anagroup_state "${port1}" "1" "change"
+   _set_nvmet_anagroup_state "${port1}" "2" "change"
+   _set_nvmet_anagroup_state "${port2}" "1" "change"
+   _set_nvmet_anagroup_state "${port2}" "2" "change"
+   elif [ "${mode}" = "failover" ] ; then
+   _set_nvmet_anagroup_state "${port1}" "1" "inaccessible"
+   _set_nvmet_anagroup_state "${port1}" "2" "optimized"
+   _set_nvmet_anagroup_state "${port2}" "1" "optimized"
+   _set_nvmet_anagroup_state "${port2}" "2" "inaccessible"
+   else
+   _set_nvmet_anagroup_state "${port1}" "1" "optimized"
+   _set_nvmet_anagroup_state "${port1}" "2" "inaccessible"
+   _set_nvmet_anagroup_state "${port2}" "1" "inaccessible"
+   _set_nvmet_anagroup_state "${port2}" "2" "optimized"
+   fi
+}
+
+_display_ana_state() {
+   local grpid state
+   for nvme in /sys/class/nvme/* ; do
+   for c in ${nvme}/nvme* ; do
+   if [ ! -d ${c} ] ; then
+   echo "${nvme##*/}: ANA disabled"
+   continue
+   fi
+   grpid="$(cat "${c}/ana_grpid")"
+   state="$(cat "${c}/ana_state")"
+   echo "${c##*/}: grpid ${grpid} state ${state}"
+   done
+   done
+}
+
+_switch_ana_states() {
+   local port1=$1
+   local port2=$2
+
+}
+
+requires() {
+   _have_program nvme && _have_module nvme-loop && _have_module loop && \
+   _have_configfs && _have_fio
+}
+
+test() {
+   echo "Running ${TEST_NAME}"
+
+   modprobe nvmet
+   modprobe nvme-loop
+
+   local port1
+   port1="$(_create_nvmet_port "loop")"
+   ag1="$(_create_nvmet_anagroup "${port1}")"
+
+   local port2
+   port2="$(_create_nvmet_port "loop")"
+   ag2="$(_create_nvmet_anagroup "${port2}")"
+
+   truncate -s 1G "$TMPDIR/img1"
+
+   local loop_dev1
+   loop_dev1="$(losetup -f --show "$TMPDIR/img1")"
+
+   _create_nvmet_subsystem "blktests-subsystem-1" "${loop_dev1}" \
+   "91fdba0d-f87b-4c25-b80f-db7be1418b9e" "1"
+
+   truncate -s 1G "$TMPDIR/img2"
+
+   local loop_dev2
+   loop_dev2="$(losetup -f --show "$TMPDIR/img2")"
+
+   _create_nvmet_ns "blktests-subsystem-1" "2" "${loop_dev2}" \
+   "9aed0138-bfd9-46f5-92ac-24c70377fd49" "2"
+
+   _add_nvmet_subsys_to_port "${port1}" "blktests-subsystem-1"
+   _add_nvmet_subsys_to_port "${port2}" "blktests-subsystem-1"
+
+   switch_nvmet_anagroup "${port1}" "${port2}" failback
+
+   nvme connect -t loop -a "${port1}" -n blktests-subsystem-1
+   nvme connect -t loop -a "${port2}" -n blktests-subsystem-1
+
+   _display_ana_state
+
+   _run_fio_rand_io --size=256m --filename="/dev/nvme0n1" &
+   trap "kill $!" EXIT
+
+   sleep 10
+
+   switch_nvmet_anagroup "${port1}" "${port2}" "change"
+
+   # Insert a delay to allow the AEN to be processed
+   sleep 1
+
+   _display_ana_state
+
+   sleep 6
+
+   switch_nvmet_anagroup "${port1}" "${port2}" "failover"
+
+   # Insert a delay to allow the AEN to be processed
+   sleep 1
+
+   _display_ana_state
+
+   wait
+   trap - EXIT
+
+   nvme disconnect -n blktests-subsystem-1
+
+   _remove_nvmet_subsystem_from_port "${port1}" "blktests-subsystem-1"
+   _remove_nvmet_subsystem_from_port "${port2}" 

Re: [PATCH 2/2] blktests: add test for ANA state transition

2018-07-17 Thread Johannes Thumshirn
On Tue, Jul 17, 2018 at 03:31:18PM +0200, Hannes Reinecke wrote:
> +requires() {
> + _have_program nvme && _have_module nvme-loop && _have_module loop && \
> + _have_configfs && _have_fio
> +}

this needs '_have_module_param nvme-core multipath' as well.


-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


RE: [PATCH] blk-mq: issue directly if hw queue isn't busy in case of 'none'

2018-07-17 Thread Kashyap Desai
> -Original Message-
> From: Ming Lei [mailto:ming@redhat.com]
> Sent: Tuesday, July 10, 2018 6:34 AM
> To: Jens Axboe
> Cc: linux-block@vger.kernel.org; Ming Lei; Kashyap Desai; Laurence
Oberman;
> Omar Sandoval; Christoph Hellwig; Bart Van Assche; Hannes Reinecke
> Subject: [PATCH] blk-mq: issue directly if hw queue isn't busy in case
of 'none'
>
> In case of 'none' io scheduler, when hw queue isn't busy, it isn't
> necessary to enqueue request to sw queue and dequeue it from
> sw queue because request may be submitted to hw queue asap without
> extra cost, meantime there shouldn't be much request in sw queue,
> and we don't need to worry about effect on IO merge.
>
> There are still some single hw queue SCSI HBAs(HPSA, megaraid_sas, ...)
> which may connect high performance devices, so 'none' is often required
> for obtaining good performance.

This Patch is tested on my setup and seeing very good performance
improvement.  Just use one R0 VD from configuration -

Without upstream fix and RHEL7.5 kernel -
IOPS goes 840K and CPU utilization goes upto 11%.

After applying another block layer fix -
IOPS goes 1066K and CPU utilization goes up to 6%.

Overall performance improvement is 30% performance and 80% less cpu
utilization.

Any review comments ?

Kashyap

>
> This patch improves IOPS and decreases CPU unilization on megaraid_sas,
> per Kashyap's test.
>
> Cc: Kashyap Desai 
> Cc: Laurence Oberman 
> Cc: Omar Sandoval 
> Cc: Christoph Hellwig 
> Cc: Bart Van Assche 
> Cc: Hannes Reinecke 
> Reported-by: Kashyap Desai 
> Tested-by: Kashyap Desai 
> Signed-off-by: Ming Lei 


Re: [PATCH] blk-mq: issue directly if hw queue isn't busy in case of 'none'

2018-07-17 Thread Jens Axboe
On 7/9/18 7:03 PM, Ming Lei wrote:
> In case of 'none' io scheduler, when hw queue isn't busy, it isn't
> necessary to enqueue request to sw queue and dequeue it from
> sw queue because request may be submitted to hw queue asap without
> extra cost, meantime there shouldn't be much request in sw queue,
> and we don't need to worry about effect on IO merge.
> 
> There are still some single hw queue SCSI HBAs(HPSA, megaraid_sas, ...)
> which may connect high performance devices, so 'none' is often required
> for obtaining good performance.
> 
> This patch improves IOPS and decreases CPU unilization on megaraid_sas,
> per Kashyap's test.

Looks reasonable to me, applied for 4.19.

-- 
Jens Axboe



Re: Silent data corruption in blkdev_direct_IO()

2018-07-17 Thread Martin Wilck
On Mon, 2018-07-16 at 19:45 +0800, Ming Lei wrote:
> On Sat, Jul 14, 2018 at 6:29 AM, Martin Wilck 
> wrote:
> > Hi Ming & Jens,
> > 
> > On Fri, 2018-07-13 at 12:54 -0600, Jens Axboe wrote:
> > > On 7/12/18 5:29 PM, Ming Lei wrote:
> > > > 
> > > > Maybe you can try the following patch from Christoph to see if
> > > > it
> > > > makes a
> > > > difference:
> > > > 
> > > > https://marc.info/?l=linux-kernel=153013977816825=2
> > > 
> > > That's not a bad idea.
> > 
> > Are you saying that the previous "nasty hack"  in
> > bio_iov_iter_get_pages() was broken, and the new one is not?
> > I've scratched my head over that (old) code lately, but I couldn't
> > spot
> > an actual error in it.
> 
> Yeah, I think the new patch in above link is better, and looks its
> correctness
> is easy to prove.
> 
>https://marc.info/?t=15281208141=1=2
> 
> So please test the patch if possible.

I haven't tested yet, but upon further inspection, I can tell that the
current code (without Christoph's patch) is actually broken if the
function is called with bio->bi_vcnt > 0. The following patch explains
the problem. AFAICS, Christoph's new code is correct.

>From b75adc856119346e02126cf8975755300f2d9b7f Mon Sep 17 00:00:00 2001
From: Martin Wilck 
Date: Wed, 18 Jul 2018 01:56:37 +0200
Subject: [PATCH] block: bio_iov_iter_get_pages: fix size of last iovec

If the last page of the bio is not "full", the length of the last
vector bin needs to be corrected. This bin has the index
(bio->bi_vcnt - 1), but in bio->bi_io_vec, not in the "bv" helper array which
is shifted by the value of bio->bi_vcnt at function invocation.

Signed-off-by: Martin Wilck 
---
 block/bio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/bio.c b/block/bio.c
index 53e0f0a..c22e76f 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -913,7 +913,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter 
*iter)
bv[0].bv_offset += offset;
bv[0].bv_len -= offset;
if (diff)
-   bv[bio->bi_vcnt - 1].bv_len -= diff;
+   bio->bi_io_vec[bio->bi_vcnt - 1].bv_len -= diff;
 
iov_iter_advance(iter, size);
return 0;
-- 
2.17.1



Re: Silent data corruption in blkdev_direct_IO()

2018-07-17 Thread Ming Lei
On Wed, Jul 18, 2018 at 02:07:28AM +0200, Martin Wilck wrote:
> On Mon, 2018-07-16 at 19:45 +0800, Ming Lei wrote:
> > On Sat, Jul 14, 2018 at 6:29 AM, Martin Wilck 
> > wrote:
> > > Hi Ming & Jens,
> > > 
> > > On Fri, 2018-07-13 at 12:54 -0600, Jens Axboe wrote:
> > > > On 7/12/18 5:29 PM, Ming Lei wrote:
> > > > > 
> > > > > Maybe you can try the following patch from Christoph to see if
> > > > > it
> > > > > makes a
> > > > > difference:
> > > > > 
> > > > > https://marc.info/?l=linux-kernel=153013977816825=2
> > > > 
> > > > That's not a bad idea.
> > > 
> > > Are you saying that the previous "nasty hack"  in
> > > bio_iov_iter_get_pages() was broken, and the new one is not?
> > > I've scratched my head over that (old) code lately, but I couldn't
> > > spot
> > > an actual error in it.
> > 
> > Yeah, I think the new patch in above link is better, and looks its
> > correctness
> > is easy to prove.
> > 
> >https://marc.info/?t=15281208141=1=2
> > 
> > So please test the patch if possible.
> 
> I haven't tested yet, but upon further inspection, I can tell that the
> current code (without Christoph's patch) is actually broken if the
> function is called with bio->bi_vcnt > 0. The following patch explains
> the problem. AFAICS, Christoph's new code is correct.
> 
> From b75adc856119346e02126cf8975755300f2d9b7f Mon Sep 17 00:00:00 2001
> From: Martin Wilck 
> Date: Wed, 18 Jul 2018 01:56:37 +0200
> Subject: [PATCH] block: bio_iov_iter_get_pages: fix size of last iovec
> 
> If the last page of the bio is not "full", the length of the last
> vector bin needs to be corrected. This bin has the index
> (bio->bi_vcnt - 1), but in bio->bi_io_vec, not in the "bv" helper array which
> is shifted by the value of bio->bi_vcnt at function invocation.
> 
> Signed-off-by: Martin Wilck 
> ---
>  block/bio.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 53e0f0a..c22e76f 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -913,7 +913,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct 
> iov_iter *iter)
>   bv[0].bv_offset += offset;
>   bv[0].bv_len -= offset;
>   if (diff)
> - bv[bio->bi_vcnt - 1].bv_len -= diff;
> + bio->bi_io_vec[bio->bi_vcnt - 1].bv_len -= diff;
>  
>   iov_iter_advance(iter, size);
>   return 0;

Right, that is the issue, we need this fix for -stable, but maybe the
following fix is more readable:

diff --git a/block/bio.c b/block/bio.c
index f3536bfc8298..6e37b803755b 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -914,16 +914,16 @@ EXPORT_SYMBOL(bio_add_page);
  */
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
-   unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
+   unsigned short idx, nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
-   size_t offset, diff;
+   size_t offset;
ssize_t size;
 
size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, );
if (unlikely(size <= 0))
return size ? size : -EFAULT;
-   nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
+   idx = nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
 
/*
 * Deep magic below:  We need to walk the pinned pages backwards
@@ -936,17 +936,15 @@ int bio_iov_iter_get_pages(struct bio *bio, struct 
iov_iter *iter)
bio->bi_iter.bi_size += size;
bio->bi_vcnt += nr_pages;
 
-   diff = (nr_pages * PAGE_SIZE - offset) - size;
-   while (nr_pages--) {
-   bv[nr_pages].bv_page = pages[nr_pages];
-   bv[nr_pages].bv_len = PAGE_SIZE;
-   bv[nr_pages].bv_offset = 0;
+   while (idx--) {
+   bv[idx].bv_page = pages[idx];
+   bv[idx].bv_len = PAGE_SIZE;
+   bv[idx].bv_offset = 0;
}
 
bv[0].bv_offset += offset;
bv[0].bv_len -= offset;
-   if (diff)
-   bv[bio->bi_vcnt - 1].bv_len -= diff;
+   bv[nr_pages - 1].bv_len -= (nr_pages * PAGE_SIZE - offset) - size;
 
iov_iter_advance(iter, size);
return 0;

And for mainline, I suggest to make Christoph's new code in, that is
easy to prove its correctness, and seems simpler.

Thanks,
Ming


[PATCH 3/3] blk-mq: Enable support for runtime power management

2018-07-17 Thread Bart Van Assche
Now that the blk-mq core processes power management requests
(marked with RQF_PREEMPT) in other states than RPM_ACTIVE, enable
runtime power management for blk-mq.

Signed-off-by: Bart Van Assche 
Cc: Christoph Hellwig 
Cc: Ming Lei 
Cc: Alan Stern 
Cc: Johannes Thumshirn 
---
 block/blk-core.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 65d7f27c8c22..5aade9fa96be 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3739,10 +3739,6 @@ EXPORT_SYMBOL(blk_finish_plug);
  */
 void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
 {
-   /* not support for RQF_PM and ->rpm_status in blk-mq yet */
-   if (q->mq_ops)
-   return;
-
q->dev = dev;
q->rpm_status = RPM_ACTIVE;
pm_runtime_set_autosuspend_delay(q->dev, -1);
-- 
2.18.0



[PATCH 0/3] blk-mq: Enable runtime power management

2018-07-17 Thread Bart Van Assche
Hello Jens,

This patch series not only enables runtime power management for blk-mq but also
simplifies power management for the legacy block layer. Please consider this
patch series for kernel v4.19 (I am aware that you are on vacation).

Thanks,

Bart.

Bart Van Assche (3):
  block: Fix a comment in a header file
  block, scsi: Rework runtime power management
  blk-mq: Enable support for runtime power management

 block/blk-core.c  | 60 ---
 block/blk-mq-debugfs.c|  1 -
 block/blk-mq.c| 12 
 block/elevator.c  | 11 +--
 drivers/scsi/sd.c |  4 +--
 drivers/scsi/ufs/ufshcd.c | 10 +++
 include/linux/blk-mq.h|  1 +
 include/linux/blkdev.h|  9 ++
 8 files changed, 42 insertions(+), 66 deletions(-)

-- 
2.18.0



[PATCH 2/3] block, scsi: Rework runtime power management

2018-07-17 Thread Bart Van Assche
Instead of allowing requests that are not power management requests
to enter the queue in runtime suspended status (RPM_SUSPENDED), make
the blk_get_request() caller block. This change fixes a starvation
issue: it is now guaranteed that power management requests will be
executed no matter how many blk_get_request() callers are waiting.
Instead of maintaining the q->nr_pending counter, rely on
q->q_usage_counter. Call pm_runtime_mark_last_busy() every time a
request finishes instead of only if the queue depth drops to zero.
Use RQF_PREEMPT to mark power management requests instead of RQF_PM.
This is safe because the power management core serializes system-wide
suspend/resume and runtime power management state changes.

Signed-off-by: Bart Van Assche 
Cc: Christoph Hellwig 
Cc: Ming Lei 
Cc: Alan Stern 
Cc: Johannes Thumshirn 
---
 block/blk-core.c  | 56 +--
 block/blk-mq-debugfs.c|  1 -
 block/blk-mq.c| 12 -
 block/elevator.c  | 11 +---
 drivers/scsi/sd.c |  4 +--
 drivers/scsi/ufs/ufshcd.c | 10 +++
 include/linux/blk-mq.h|  1 +
 include/linux/blkdev.h|  7 ++---
 8 files changed, 41 insertions(+), 61 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index f84a9b7b6f5a..65d7f27c8c22 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1719,7 +1719,7 @@ EXPORT_SYMBOL_GPL(part_round_stats);
 #ifdef CONFIG_PM
 static void blk_pm_put_request(struct request *rq)
 {
-   if (rq->q->dev && !(rq->rq_flags & RQF_PM) && !--rq->q->nr_pending)
+   if (rq->q->dev && !(rq->rq_flags & RQF_PREEMPT))
pm_runtime_mark_last_busy(rq->q->dev);
 }
 #else
@@ -2737,30 +2737,6 @@ void blk_account_io_done(struct request *req, u64 now)
}
 }
 
-#ifdef CONFIG_PM
-/*
- * Don't process normal requests when queue is suspended
- * or in the process of suspending/resuming
- */
-static bool blk_pm_allow_request(struct request *rq)
-{
-   switch (rq->q->rpm_status) {
-   case RPM_RESUMING:
-   case RPM_SUSPENDING:
-   return rq->rq_flags & RQF_PM;
-   case RPM_SUSPENDED:
-   return false;
-   }
-
-   return true;
-}
-#else
-static bool blk_pm_allow_request(struct request *rq)
-{
-   return true;
-}
-#endif
-
 void blk_account_io_start(struct request *rq, bool new_io)
 {
struct hd_struct *part;
@@ -2806,11 +2782,12 @@ static struct request *elv_next_request(struct 
request_queue *q)
 
while (1) {
list_for_each_entry(rq, >queue_head, queuelist) {
-   if (blk_pm_allow_request(rq))
-   return rq;
-
-   if (rq->rq_flags & RQF_SOFTBARRIER)
-   break;
+   /*
+* If a request gets queued in state RPM_SUSPENDED
+* then that's a kernel bug.
+*/
+   WARN_ON_ONCE(q->rpm_status == RPM_SUSPENDED);
+   return rq;
}
 
/*
@@ -3801,8 +3778,11 @@ int blk_pre_runtime_suspend(struct request_queue *q)
if (!q->dev)
return ret;
 
+   blk_set_preempt_only(q);
+   blk_freeze_queue_start(q);
+
spin_lock_irq(q->queue_lock);
-   if (q->nr_pending) {
+   if (!percpu_ref_is_zero(>q_usage_counter)) {
ret = -EBUSY;
pm_runtime_mark_last_busy(q->dev);
} else {
@@ -3837,8 +3817,15 @@ void blk_post_runtime_suspend(struct request_queue *q, 
int err)
} else {
q->rpm_status = RPM_ACTIVE;
pm_runtime_mark_last_busy(q->dev);
+   queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
}
spin_unlock_irq(q->queue_lock);
+
+   if (err) {
+   blk_unfreeze_queue(q);
+   /* Because QUEUE_FLAG_PREEMPT_ONLY has been cleared. */
+   wake_up_all(>mq_freeze_wq);
+   }
 }
 EXPORT_SYMBOL(blk_post_runtime_suspend);
 
@@ -3888,11 +3875,18 @@ void blk_post_runtime_resume(struct request_queue *q, 
int err)
q->rpm_status = RPM_ACTIVE;
__blk_run_queue(q);
pm_runtime_mark_last_busy(q->dev);
+   queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
pm_request_autosuspend(q->dev);
} else {
q->rpm_status = RPM_SUSPENDED;
}
spin_unlock_irq(q->queue_lock);
+
+   if (!err) {
+   blk_unfreeze_queue(q);
+   /* Because QUEUE_FLAG_PREEMPT_ONLY has been cleared. */
+   wake_up_all(>mq_freeze_wq);
+   }
 }
 EXPORT_SYMBOL(blk_post_runtime_resume);
 
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 1c4532e92938..74c9b17811e2 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -339,7 +339,6 @@ static const char *const rqf_name[] = {
RQF_NAME(ELVPRIV),