Re: [block] 044f1daaaa kernel BUG at block/blk-mq.c:415!

2017-06-26 Thread Jens Axboe
On 06/26/2017 09:53 PM, Fengguang Wu wrote:
> Greetings,
> 
> Sorry the report is a bit late, this bug looks still active:
> 
 
First I've heard about it!

> [   36.037364]   sector 2, nr/cnr 0/2
> [   36.037367]   bio ffff88001af84e80, biotail ffff88001af84e80, len 0
> [   36.037424] ------------[ cut here ]------------
> [   36.037428] kernel BUG at block/blk-mq.c:415!
> [   36.037433] invalid opcode: 0000 [#1] PREEMPT SMP
> [   36.037434] Modules linked in:

Do you have the full dmesg of this? I looked at the one attached to the
bug report, but that's a completely different bug (NULL deref in drm).
Are you sure it's still the above blk-mq BUG() we're hitting?

-- 
Jens Axboe



[PATCH blktests v2] loop/002: Regression testing for loop device flush

2017-06-26 Thread James Wang
Add a regression test for the loop device: when an unbound loop
device is closed, the kernel consumes several orders of magnitude
more wall time than it does for a mounted device.

Signed-off-by: James Wang 
---
 tests/loop/002 | 63 ++
 tests/loop/002.out |  2 ++
 2 files changed, 65 insertions(+)
 create mode 100755 tests/loop/002
 create mode 100644 tests/loop/002.out

diff --git a/tests/loop/002 b/tests/loop/002
new file mode 100755
index 0000000..ef69729
--- /dev/null
+++ b/tests/loop/002
@@ -0,0 +1,63 @@
+#!/bin/bash
+#
+# Test if close()ing an unbound loop device is too slow
+# Copyright (C) 2017 James Wang 
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+DESCRIPTION="Test if close()ing a unbound loop device is too slow"
+
+QUICK=1
+
+run_test() {
+   TIMEFORMAT='%5R'
+   time {
+   for ((i=0;i<200;i++)); do dd if=/dev/loop0 of=/dev/null bs=512 count=1 >/dev/null 2>&1; done
+   }
+}
+clean_up() {
+   if lsmod | grep loop >/dev/null 2>&1; then
+   if ! rmmod loop;then
+   return 2;
+   fi
+   fi
+}
+
+prepare() {
+   modprobe loop max_loop=1
+}
+
+
+test() {
+   echo "Running ${TEST_NAME}"
+
+   clean_up
+   prepare
+   SECONDS=0
+   run_test >/dev/null 2>&1
+   DURATION=${SECONDS}
+
+   if ! clean_up; then
+   echo "Test complete"
+   return 2
+   fi
+   echo "Test complete"
+   if [[ "${DURATION}" -gt 1 ]]; then
+   echo "test took too long ($URATION seconds)"
+   return 1
+   else
+   return 0
+   fi
+}
diff --git a/tests/loop/002.out b/tests/loop/002.out
new file mode 100644
index 0000000..5c34a37
--- /dev/null
+++ b/tests/loop/002.out
@@ -0,0 +1,2 @@
+Running loop/002
+Test complete
-- 
2.12.3



Re: [PATCH blktests] loop/002: Regression testing for loop device flush

2017-06-26 Thread James Wang


On 06/27/2017 02:58 AM, Omar Sandoval wrote:
> Hi, James, thanks for sending this in. Sorry for the delay, I've been
> out of the office for a couple of weeks. A few comments below.
>
> On Thu, Jun 08, 2017 at 08:28:12PM +0800, James Wang wrote:
>> Add a regression test for the loop device: when an unbound loop
>> device is closed, the kernel consumes several orders of magnitude
>> more wall time than it does for a mounted device.
>>
>> Signed-off-by: James Wang 
>> ---
>>  tests/loop/002 | 77 ++
>>  tests/loop/002.out |  2 ++
>>  2 files changed, 79 insertions(+)
>>
>> diff --git a/tests/loop/002 b/tests/loop/002
>> new file mode 100755
>> index 0000000..fd607d1
>> --- /dev/null
>> +++ b/tests/loop/002
>> @@ -0,0 +1,77 @@
>> +#!/bin/bash
>> +#
>> +# Test if close()ing an unbound loop device is too slow
>> +# Copyright (C) 2017 James Wang
>> +#
>> +# This program is free software: you can redistribute it and/or modify
>> +# it under the terms of the GNU General Public License as published by
>> +# the Free Software Foundation, either version 3 of the License, or
>> +# (at your option) any later version.
>> +#
>> +# This program is distributed in the hope that it will be useful,
>> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> +# GNU General Public License for more details.
>> +#
>> +# You should have received a copy of the GNU General Public License
>> +# along with this program.  If not, see <http://www.gnu.org/licenses/>.
>> +
>> +DESCRIPTION="Test if close()ing a unbound loop device is too slow"
>> +
>> +QUICK=1
>> +
>> +function run_test() {
> For consistency with everything else in blktests, please don't use
> "function" when defining a function.
I will fix it.
>> +TIMEFORMAT='%5R'
>> +time {
>> +for f in `ls /dev/loop[0-9]*|sort`; do dd if=$f of=/dev/null  
>> bs=512 count=1 >/dev/null 2>&1; done
>> +}
>> +}
>> +function clean_up() {
>> +if lsmod | grep loop >/dev/null 2>&1; then
>> +umount /dev/loop* >/dev/null 2>&1
>> +losetup -D
>> +sleep 5
>> +
>> +if ! rmmod loop;then
>> +return 2;
>> +fi
>> +fi
>> +}
>> +
>> +function prepare() {
>> +modprobe loop max_loop=64
> If loop is already loaded, this won't work, right?
Actually, I could call clean_up() first, but my testing machine has a
bug that makes clean_up() very slow. I'll call clean_up() before
prepare() instead, does that make sense?
>
>> +dd if=/dev/zero of=${TMPDIR}/disk bs=512 count=200K >/dev/null 2>&1
>> +for((i=0;i<4;i++))
>> +do
>> +losetup -f ${TMPDIR}/disk;
>> +done
>> +mkfs.ext4 -F /dev/loop0 >/dev/null 2>&1
> Hm, so if I happened to have something I care about on /dev/loop0,
> running blktests will destroy it? This is a no-go.
Yes, but since we insert the loop module ourselves and bind a fake
disk to loop0, formatting loop0 shouldn't matter.

>> +for((i=0;i<4;i++))
>> +do
>> +mkdir -p t$i;
>> +mount /dev/loop$i t$i;
>> +done
>> +
>> +}
>> +
>> +
>> +test() {
>> +echo "Running ${TEST_NAME}"
>> +
>> +prepare
>> +SECONDS=0
>> +run_test >/dev/null 2>&1
>> +DURATION=${SECONDS}
> Nifty, I didn't know about $SECONDS.
SECONDS is a built-in bash variable; it increases automatically.
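For illustration, a minimal sketch of how $SECONDS behaves (a
hypothetical snippet, not part of the patch):

    SECONDS=0          # reset bash's elapsed-time counter
    sleep 2            # the work being timed
    echo "$SECONDS"    # prints roughly 2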
>
>> +
>> +clean_up
>> +if ! clean_up; then
>> +echo "Test complete"
>> +return 2
>> +fi
>> +echo "Test complete"
>> +if [[ "${DURATION}" -gt 1 ]]; then
>> +return 1
>> +else
>> +return 0
>> +fi
> I'd really like a meaningful output if this test fails, so something
> like this instead of the if/else
>
> if [[ "${DURATION}" -gt 1 ]]; then
>   echo "test took too long ($DURATION seconds)"
> fi
I will fix this.
>> +}
>> diff --git a/tests/loop/002.out b/tests/loop/002.out
>> new file mode 100644
>> index 0000000..5c34a37
>> --- /dev/null
>> +++ b/tests/loop/002.out
>> @@ -0,0 +1,2 @@
>> +Running loop/002
>> +Test complete
>> -- 
>> 2.12.3
>>
> Overall, is there an easier way to test this than setting up 64 loop
> devices at modprobe time? E.g., can you losetup -f and run it on a
> single loop device many times to measure the same issue?
I used many loop devices to get a duration long enough to compare
against 1 second; if we only create one loop device, I'm afraid the
difference can't be measured. In this scenario, I can get the duration
that both unbound and bound loop devices take.
OK, I'll try your suggestion.

I will send a patch later.

James
>
> Thanks again!
>
>

-- 
SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 
(AG Nürnberg)



Re: [PATCH 00/20] LightNVM: pblk patches for 4.13

2017-06-26 Thread Jens Axboe
On Mon, Jun 26 2017, Javier González wrote:
> Hi Matias,
> 
> Here you have the pblk patchset for this window.
> 
> Apart from small fixes for LightNVM core and pblk, there are three
> relevant changes:
> 
>  - Metadata for each line is no longer issued on a separate workqueue,
>but instead, all I/Os are scheduled on the write thread. This allows
>to have more control over LUN collisions. The result is that we can
>use the full bandwidth of the device for user data. In the 4.12 pblk,
>we use ~25% of the bandwidth for metadata.
> 
>  - The rate-limiter and GC have been tuned to keep the GC writer busy.
>Also, capacity per line is guaranteed for GC I/O as we reach
>capacity.
> 
>  - We now have an FTL state machine. This allows us to fail gracefully to
>user space in case of irrecoverable errors. This state machine will
>be the base for the pblk's FTL log.
> 
> The patches apply into your for-4.13/core. You can also find them in:

Nope, they do not. Patch #7 fails because it's not on top of the
bio_set changes that went in a week or two ago. I hand applied that one,
didn't find issues with the rest.

But please remember to ensure that it applies on top of the current
branch.

-- 
Jens Axboe



Re: [PATCH blktests 2/2] block/011: Perform PCI reset while doing IO

2017-06-26 Thread Jens Axboe
On 06/26/2017 03:29 PM, Omar Sandoval wrote:
> On Mon, Jun 26, 2017 at 08:25:36AM -0600, Jens Axboe wrote:
>> On 06/26/2017 08:06 AM, Johannes Thumshirn wrote:
>>> On Fri, Jun 23, 2017 at 09:36:14AM -0600, Jens Axboe wrote:
 On 06/23/2017 08:29 AM, Johannes Thumshirn wrote:
> From: Omar Sandoval 
>
> This test-case performs I/O with fio while doing PCI disable/enable
> cycles.
>
> In the results we don't care about I/O errors, only about hiccups in dmesg.

 Let's get this in, that would be a very useful test. A few comments -
 not necessarily on this patch in particular, but for future cleanups
 and improvements.

> + if _test_dev_is_rotational; then
> + size="32m"
> + else
> + size="1g"
> + fi

 I introduced this idea in one of my previous patches. I wonder if we
 should turn that into a helper. Pass in the dev, get returned a
 suitable fio size, instead of hard coding this in each job that
 needs it.
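Such a helper might look like the sketch below; the name
_fio_dev_size is an assumption, not an existing blktests helper:

    # Hypothetical helper: pick a suitable fio size for TEST_DEV.
    _fio_dev_size() {
            if _test_dev_is_rotational; then
                    echo "32m"
            else
                    echo "1g"
            fi
    }

    size="$(_fio_dev_size)"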
> 
> What I wanted to have here eventually is a helper that you can run when
> you just want arbitrary I/O. Haven't gotten around to it.

That would be handy. Until that happens, I would not worry about it,
it's not like it's a lot of work to just copy/paste the "do random
reads QD=x with Y jobs" between jobs. It'd be fine as a separate
cleanup at some point.


-- 
Jens Axboe



Re: [PATCH blktests 1/2] rc: add helpers to handle PCI test devices

2017-06-26 Thread Omar Sandoval
On Fri, Jun 23, 2017 at 04:29:50PM +0200, Johannes Thumshirn wrote:
> Add two helpers to check whether a device is attached via PCI and to get the
> PCI device from a TEST_DEV
> 
> Signed-off-by: Johannes Thumshirn 
> ---
>  common/rc | 15 +++
>  1 file changed, 15 insertions(+)
> 
> diff --git a/common/rc b/common/rc
> index b01f936b878b..497cf81ec475 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -120,3 +120,18 @@ _test_dev_queue_set() {
>   fi
>   echo "$2" >"${TEST_DEV_SYSFS}/queue/$1"
>  }
> +
> +_test_dev_is_pci() {
> + if ! readlink -f "$TEST_DEV_SYSFS/device" | grep -q pci; then
> + SKIP_REASON="$TEST_DEV is not a PCI device"
> + return 1
> + fi
> + return 0
> +}
> +
> +_get_pci_dev_from_blkdev() {
> + pdev="$(readlink -f "$TEST_DEV_SYSFS/device" | \
> + grep -Eo '[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9a-f]')"
> +
> + echo "$pdev"
> +}

No need to do pdev=$(blah) and then echo $pdev, you can just do

_get_pci_dev_from_blkdev() {
readlink -f "$TEST_DEV_SYSFS/device" | grep -Eo 
'[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9a-f]'
}

I'll fix that up when I commit it. Looks good otherwise.


Re: [PATCH blktests 2/2] block/011: Perform PCI reset while doing IO

2017-06-26 Thread Omar Sandoval
On Mon, Jun 26, 2017 at 08:25:36AM -0600, Jens Axboe wrote:
> On 06/26/2017 08:06 AM, Johannes Thumshirn wrote:
> > On Fri, Jun 23, 2017 at 09:36:14AM -0600, Jens Axboe wrote:
> >> On 06/23/2017 08:29 AM, Johannes Thumshirn wrote:
> >>> From: Omar Sandoval 
> >>>
> >>> This test-case performs I/O with fio while doing PCI disable/enable
> >>> cycles.
> >>>
> >>> In the results we don't care about I/O errors, only about hiccups in dmesg.
> >>
> >> Let's get this in, that would be a very useful test. A few comments -
> >> not necessarily on this patch in particular, but for future cleanups
> >> and improvements.
> >>
> >>> + if _test_dev_is_rotational; then
> >>> + size="32m"
> >>> + else
> >>> + size="1g"
> >>> + fi
> >>
> >> I introduced this idea in one of my previous patches. I wonder if we
> >> should turn that into a helper. Pass in the dev, get returned a
> >> suitable fio size, instead of hard coding this in each job that
> >> needs it.

What I wanted to have here eventually is a helper that you can run when
you just want arbitrary I/O. Haven't gotten around to it.
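Sketched out, such a helper could simply wrap the fio invocation this
test already uses (size and TEST_DEV come from the test itself); the
name _run_fio_rand is an assumption:

    # Hypothetical wrapper for "just give me arbitrary I/O on TEST_DEV".
    _run_fio_rand() {
            _run_fio --bs=4k --rw=randread --norandommap --name=reads \
                    --filename="$TEST_DEV" --size="$size" \
                    --numjobs=8 --direct=1 "$@"
    }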

> > 
> > Sure.
> > 
> >>
> >>> + # start fio job
> >>> + _run_fio --bs=4k --rw=randread --norandommap \
> >>> + --name=reads --filename="$TEST_DEV" --size="$size" \
> >>> + --numjobs=8 --direct=1 2>/dev/null &
> >>
> >> I don't believe we check for fio errors right now, but we probably
> >> should in the future. So I think you'd want to add something ala:
> >>
> >> --ignore_error=EIO,ENXIO,ENODEV
> >>
> >> to your options to make it explicit that you don't care about IO
> >> errors for this test.

Yup, we redirect fio errors to /dev/null everywhere, we should fix that.

> > Oh nice, didn't know about the option. Btw, as we currently all have
> > arbitrary values for the numjobs parameter, how about a wrapper over getconf
> > _NPROCESSORS_ONLN?
> 
> Yes that's a good idea, then we can at least size the jobs based on
> how many cores we have.

We can just use nproc for this.
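A minimal sketch of such a wrapper (the helper name is an assumption):

    # Hypothetical CPU-count helper.
    _nr_cpus() {
            getconf _NPROCESSORS_ONLN 2>/dev/null || nproc
    }

    # e.g.: _run_fio ... --numjobs="$(_nr_cpus)" ...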


Re: [PATCH blktests 2/2] block/011: Perform PCI reset while doing IO

2017-06-26 Thread Omar Sandoval
On Fri, Jun 23, 2017 at 04:29:51PM +0200, Johannes Thumshirn wrote:
> From: Omar Sandoval 
> 
> This test-case performs I/O with fio while doing PCI disable/enable
> cycles.
> 
> In the results we don't care about I/O errors, only about hiccups in dmesg.
> 
> Signed-off-by: Johannes Thumshirn 
> ---
>  tests/block/011 | 54 +
>  tests/block/011.out |  2 ++
>  2 files changed, 56 insertions(+)
>  create mode 100755 tests/block/011
>  create mode 100644 tests/block/011.out
> 
> diff --git a/tests/block/011 b/tests/block/011
> new file mode 100755
> index 000000000000..b0de35816d48
> --- /dev/null
> +++ b/tests/block/011
> @@ -0,0 +1,54 @@
> +#!/bin/bash
> +#
> +# Disable the PCI device while doing I/O to it
> +#
> +# Copyright (C) 2017 Johannes Thumshirn 
> +#
> +# This program is free software: you can redistribute it and/or modify
> +# it under the terms of the GNU General Public License as published by
> +# the Free Software Foundation, either version 3 of the License, or
> +# (at your option) any later version.
> +#
> +# This program is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program.  If not, see .
> +
> +DESCRIPTION="disable PCI device while doing I/O"
> +TIMED=1
> +
> +requires() {
> + _have_fio
> +}
> +
> +device_requires() {
> + _test_dev_is_pci
> +}
> +
> +test_device() {
> + echo "Running ${TEST_NAME}"
> +
> + pdev=$(_get_pci_dev_from_blkdev)
> +
> + if _test_dev_is_rotational; then
> + size="32m"
> + else
> + size="1g"
> + fi
> +
> + # start fio job
> + _run_fio --bs=4k --rw=randread --norandommap \
> + --name=reads --filename="$TEST_DEV" --size="$size" \
> + --numjobs=8 --direct=1 2>/dev/null &
> +
> + while kill -0 $! 2>/dev/null; do
> + echo 0 > "/sys/bus/pci/devices/${pdev}/enable"
> + sleep .2
> + echo 1 > "/sys/bus/pci/devices/${pdev}/enable"

Test looks good, but one question: do you want another sleep .2 here?
Like this, you immediately disable it after enabling it, but maybe
that's what you want :)

> + done
> +
> + echo "Test complete"
> +}
> diff --git a/tests/block/011.out b/tests/block/011.out
> new file mode 100644
> index 000000000000..8e067df63097
> --- /dev/null
> +++ b/tests/block/011.out
> @@ -0,0 +1,2 @@
> +Running block/011
> +Test complete
> -- 
> 2.12.3
> 
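With the extra sleep Omar suggests, the toggle loop would read as
below (a sketch, assuming symmetric on/off periods are wanted):

    while kill -0 $! 2>/dev/null; do
            echo 0 > "/sys/bus/pci/devices/${pdev}/enable"
            sleep .2
            echo 1 > "/sys/bus/pci/devices/${pdev}/enable"
            sleep .2    # keep the device enabled for a while, too
    done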


Re: [GIT PULL] New test and a fixup for blktests

2017-06-26 Thread Jens Axboe
On 06/26/2017 03:04 PM, Omar Sandoval wrote:
> On Mon, Jun 26, 2017 at 03:02:55PM -0600, Jens Axboe wrote:
>> On 06/26/2017 02:45 PM, Omar Sandoval wrote:
>>> On Wed, Jun 21, 2017 at 09:22:47AM -0600, Jens Axboe wrote:
 Hi Omar,

 Two changes here:

 - block/006 wants to use the iops results, but it isn't using _fio_perf.
 - Add block/010 to test for shared vs non-shared tags performance.


   https://github.com/axboe/blktests.git axboe


 
 Jens Axboe (2):
   block/006: use _fio_perf to get perf results
   block/010: add test case for shared/unshared tags

  tests/block/006 |  4 +--
 tests/block/010 | 82 +
  tests/block/010.out |  2 ++
  3 files changed, 86 insertions(+), 2 deletions(-)
  create mode 100755 tests/block/010
  create mode 100644 tests/block/010.out

 -- 
 Jens Axboe

>>>
>>> Thanks, Jens, applied the first one, but
>>>
>>> [ 1876.142470] null_blk: unknown parameter 'shared_tags' ignored
>>>
>>> Am I missing a patch?
>>
>> Yes, shared tags support is a new feature; it's coming in 4.13. But I checked
>> and it doesn't abort the module load. So the test ends up just being the
>> same thing run twice, if shared_tags isn't available. I figured that
>> was good enough.
> 
> Sounds good, I'll apply it as-is and add an extra patch on top that
> checks if the module parameter exists.

Awesome, extra cookie for you :-)

-- 
Jens Axboe



Re: [GIT PULL] New test and a fixup for blktests

2017-06-26 Thread Omar Sandoval
On Mon, Jun 26, 2017 at 03:02:55PM -0600, Jens Axboe wrote:
> On 06/26/2017 02:45 PM, Omar Sandoval wrote:
> > On Wed, Jun 21, 2017 at 09:22:47AM -0600, Jens Axboe wrote:
> >> Hi Omar,
> >>
> >> Two changes here:
> >>
> >> - block/006 wants to use the iops results, but it isn't using _fio_perf.
> >> - Add block/010 to test for shared vs non-shared tags performance.
> >>
> >>
> >>   https://github.com/axboe/blktests.git axboe
> >>
> >>
> >> 
> >> Jens Axboe (2):
> >>   block/006: use _fio_perf to get perf results
> >>   block/010: add test case for shared/unshared tags
> >>
> >>  tests/block/006 |  4 +--
> >>  tests/block/010 | 82 +
> >>  tests/block/010.out |  2 ++
> >>  3 files changed, 86 insertions(+), 2 deletions(-)
> >>  create mode 100755 tests/block/010
> >>  create mode 100644 tests/block/010.out
> >>
> >> -- 
> >> Jens Axboe
> >>
> > 
> > Thanks, Jens, applied the first one, but
> > 
> > [ 1876.142470] null_blk: unknown parameter 'shared_tags' ignored
> > 
> > Am I missing a patch?
> 
> Yes, shared tags support is a new feature; it's coming in 4.13. But I checked
> and it doesn't abort the module load. So the test ends up just being the
> same thing run twice, if shared_tags isn't available. I figured that
> was good enough.

Sounds good, I'll apply it as-is and add an extra patch on top that
checks if the module parameter exists.
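One way to do that check, sketched with a hypothetical helper name:

    # Hypothetical helper: does module $1 declare parameter $2?
    _have_module_param() {
            modinfo -F parm "$1" | grep -q "^$2:"
    }

    requires() {
            _have_module_param null_blk shared_tags
    }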


Re: [GIT PULL] New test and a fixup for blktests

2017-06-26 Thread Jens Axboe
On 06/26/2017 02:45 PM, Omar Sandoval wrote:
> On Wed, Jun 21, 2017 at 09:22:47AM -0600, Jens Axboe wrote:
>> Hi Omar,
>>
>> Two changes here:
>>
>> - block/006 wants to use the iops results, but it isn't using _fio_perf.
>> - Add block/010 to test for shared vs non-shared tags performance.
>>
>>
>>   https://github.com/axboe/blktests.git axboe
>>
>>
>> 
>> Jens Axboe (2):
>>   block/006: use _fio_perf to get perf results
>>   block/010: add test case for shared/unshared tags
>>
>>  tests/block/006 |  4 +--
>>  tests/block/010 | 82 +
>>  tests/block/010.out |  2 ++
>>  3 files changed, 86 insertions(+), 2 deletions(-)
>>  create mode 100755 tests/block/010
>>  create mode 100644 tests/block/010.out
>>
>> -- 
>> Jens Axboe
>>
> 
> Thanks, Jens, applied the first one, but
> 
> [ 1876.142470] null_blk: unknown parameter 'shared_tags' ignored
> 
> Am I missing a patch?

Yes, shared tags support is a new feature; it's coming in 4.13. But I checked
and it doesn't abort the module load. So the test ends up just being the
same thing run twice, if shared_tags isn't available. I figured that
was good enough.

-- 
Jens Axboe



Re: [PATCH] sd: add support for TCG OPAL self encrypting disks

2017-06-26 Thread Tejun Heo
On Mon, Jun 26, 2017 at 12:43:27PM -0400, Martin K. Petersen wrote:
> 
> Christoph,
> 
> > ping?
> 
> Looks good to me. I'll queue it up for 4.13 as soon as Linus has pulled
> in the ata bits.

I can route it through libata tree w/ your ack if that's more convenient.

Thanks.

-- 
tejun


Re: [GIT PULL] New test and a fixup for blktests

2017-06-26 Thread Omar Sandoval
On Wed, Jun 21, 2017 at 09:22:47AM -0600, Jens Axboe wrote:
> Hi Omar,
> 
> Two changes here:
> 
> - block/006 wants to use the iops results, but it isn't using _fio_perf.
> - Add block/010 to test for shared vs non-shared tags performance.
> 
> 
>   https://github.com/axboe/blktests.git axboe
> 
> 
> 
> Jens Axboe (2):
>   block/006: use _fio_perf to get perf results
>   block/010: add test case for shared/unshared tags
> 
>  tests/block/006 |  4 +--
>  tests/block/010 | 82 +
>  tests/block/010.out |  2 ++
>  3 files changed, 86 insertions(+), 2 deletions(-)
>  create mode 100755 tests/block/010
>  create mode 100644 tests/block/010.out
> 
> -- 
> Jens Axboe
> 

Thanks, Jens, applied the first one, but

[ 1876.142470] null_blk: unknown parameter 'shared_tags' ignored

Am I missing a patch?


Re: [PATCH blktests] sg/001: don't require scsi_debug

2017-06-26 Thread Omar Sandoval
On Fri, Jun 23, 2017 at 10:14:48AM +0200, Johannes Thumshirn wrote:
> Don't require scsi_debug but check if TEST_DEV is a SCSI device and use it
> instead.
> 
> Signed-off-by: Johannes Thumshirn 
> ---
>  tests/sg/001 | 8 +++-
>  1 file changed, 3 insertions(+), 5 deletions(-)
> 
> diff --git a/tests/sg/001 b/tests/sg/001
> index a4cfabe25ba0..fc883ebd1c8a 100755
> --- a/tests/sg/001
> +++ b/tests/sg/001
> @@ -24,13 +24,13 @@ DESCRIPTION="try triggering a kernel GPF with 0 byte SG 
> reads"
>  QUICK=1
>  
>  requires() {
> - _have_src_program sg/syzkaller1 && _have_scsi_debug
> + _have_src_program sg/syzkaller1
>  }
>  
> -test() {
> +test_device() {
>   echo "Running ${TEST_NAME}"
>  
> - if ! _get_scsi_debug_dev; then
> + if ! _test_dev_is_scsi; then
>   return 1
>   fi

Since we already check this in tests/sg/group, we don't need this bit.
Got rid of it and applied, thanks, Johannes!

> @@ -38,7 +38,5 @@ test() {
>   cd "$TMPDIR" || return 1
>   timeout -s INT 10s "$SRCDIR/sg/syzkaller1" "$SG_DEV"
>  
> - _put_scsi_debug_dev
> -
>   echo "Test complete"
>  }
> -- 
> 2.12.3
> 


Re: [PATCH 9/9] nvme: add support for streams and directives

2017-06-26 Thread Jens Axboe
On 06/26/2017 01:36 PM, Andreas Dilger wrote:
> On Jun 26, 2017, at 7:56 AM, Jens Axboe  wrote:
>>
>> On 06/26/2017 03:59 AM, Christoph Hellwig wrote:
>>> Looks mostly good,
>>>
>>> but two nit-picks:
>>>
>>> - can we keep a module option to disable streams, or in fact for
>>>   now maybe to explicitly enable it?  I expect this to be interesting
>>>   at least for the first devices that implement it.  Also given that
>>>   it needs to be explicitly enabled I would expect some overhead of
>>>   just enabling it when never used
>>
>> Fine with me, I can add the 'streams' parameter back, but just default
>> it to false.
> 
> Better would be a parameter to set the default streams count, 0 by default.

The user should not need to know. If streams is enabled (bool), then it'll
ask for as many as we need on the block side right now, and scale down if
we have to. So I'd rather keep it as a "use streams or not" bool on the
nvme side.


-- 
Jens Axboe



Re: [PATCH 9/9] nvme: add support for streams and directives

2017-06-26 Thread Andreas Dilger
On Jun 26, 2017, at 7:56 AM, Jens Axboe  wrote:
> 
> On 06/26/2017 03:59 AM, Christoph Hellwig wrote:
>> Looks mostly good,
>> 
>> but two nit-picks:
>> 
>> - can we keep a module option to disable streams, or in fact for
>>   now maybe to explicitly enable it?  I expect this to be interesting
>>   at least for the first devices that implement it.  Also given that
>>   it needs to be explicitly enabled I would expect some overhead of
>>   just enabling it when never used
> 
> Fine with me, I can add the 'streams' parameter back, but just default
> it to false.

Better would be a parameter to set the default streams count, 0 by default.

>> - do we even need the < 4 streams fallback now that they are global
>>   instead of per-ns, instead of just disabling the feature for now?
> 
> Maybe the device only supports 2? or 3?
> 
> --
> Jens Axboe
> 


Cheers, Andreas









Re: [PATCH blktests] loop/002: Regression testing for loop device flush

2017-06-26 Thread Omar Sandoval
Hi, James, thanks for sending this in. Sorry for the delay, I've been
out of the office for a couple of weeks. A few comments below.

On Thu, Jun 08, 2017 at 08:28:12PM +0800, James Wang wrote:
> Add a regression test for the loop device: when an unbound loop
> device is closed, the kernel consumes several orders of magnitude
> more wall time than it does for a mounted device.
> 
> Signed-off-by: James Wang 
> ---
>  tests/loop/002 | 77 ++
>  tests/loop/002.out |  2 ++
>  2 files changed, 79 insertions(+)
> 
> diff --git a/tests/loop/002 b/tests/loop/002
> new file mode 100755
> index 0000000..fd607d1
> --- /dev/null
> +++ b/tests/loop/002
> @@ -0,0 +1,77 @@
> +#!/bin/bash
> +#
> +# Test if close()ing an unbound loop device is too slow
> +# Copyright (C) 2017 James Wang
> +#
> +# This program is free software: you can redistribute it and/or modify
> +# it under the terms of the GNU General Public License as published by
> +# the Free Software Foundation, either version 3 of the License, or
> +# (at your option) any later version.
> +#
> +# This program is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program.  If not, see <http://www.gnu.org/licenses/>.
> +
> +DESCRIPTION="Test if close()ing a unbound loop device is too slow"
> +
> +QUICK=1
> +
> +function run_test() {

For consistency with everything else in blktests, please don't use
"function" when defining a function.

> + TIMEFORMAT='%5R'
> + time {
> + for f in `ls /dev/loop[0-9]*|sort`; do dd if=$f of=/dev/null bs=512 count=1 >/dev/null 2>&1; done
> + }
> +}
> +function clean_up() {
> + if lsmod | grep loop >/dev/null 2>&1; then
> + umount /dev/loop* >/dev/null 2>&1
> + losetup -D
> + sleep 5
> + 
> + if ! rmmod loop;then
> + return 2;
> + fi
> + fi
> +}
> +
> +function prepare() {
> + modprobe loop max_loop=64

If loop is already loaded, this won't work, right?

> + dd if=/dev/zero of=${TMPDIR}/disk bs=512 count=200K >/dev/null 2>&1
> + for((i=0;i<4;i++))
> + do
> + losetup -f ${TMPDIR}/disk;
> + done
> + mkfs.ext4 -F /dev/loop0 >/dev/null 2>&1

Hm, so if I happened to have something I care about on /dev/loop0,
running blktests will destroy it? This is a no-go.

> + for((i=0;i<4;i++))
> + do
> + mkdir -p t$i;
> + mount /dev/loop$i t$i;
> + done
> +
> +}
> +
> +
> +test() {
> + echo "Running ${TEST_NAME}"
> +
> + prepare
> + SECONDS=0
> + run_test >/dev/null 2>&1
> + DURATION=${SECONDS}

Nifty, I didn't know about $SECONDS.

> +
> + clean_up
> + if ! clean_up; then
> + echo "Test complete"
> + return 2
> + fi
> + echo "Test complete"
> + if [[ "${DURATION}" -gt 1 ]]; then
> + return 1
> + else
> + return 0
> + fi

I'd really like a meaningful output if this test fails, so something
like this instead of the if/else

if [[ "${DURATION}" -gt 1 ]]; then
echo "test took too long ($DURATION seconds)"
fi

> +}
> diff --git a/tests/loop/002.out b/tests/loop/002.out
> new file mode 100644
> index 0000000..5c34a37
> --- /dev/null
> +++ b/tests/loop/002.out
> @@ -0,0 +1,2 @@
> +Running loop/002
> +Test complete
> -- 
> 2.12.3
> 

Overall, is there an easier way to test this than setting up 64 loop
devices at modprobe time? E.g., can you losetup -f and run it on a
single loop device many times to measure the same issue?
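A sketch of that approach, reusing the dd timing from the patch
(assumes the loop module is loaded with a single, unbound /dev/loop0):

    modprobe loop max_loop=1
    SECONDS=0
    for ((i = 0; i < 200; i++)); do
            dd if=/dev/loop0 of=/dev/null bs=512 count=1 >/dev/null 2>&1
    done
    echo "200 opens/closes of an unbound loop device took ${SECONDS}s"

This is essentially what the v2 of the patch ends up doing.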

Thanks again!


Re: [PATCH v7 21/22] xfs: minimal conversion to errseq_t writeback error reporting

2017-06-26 Thread Darrick J. Wong
On Mon, Jun 26, 2017 at 01:58:32PM -0400, jlay...@redhat.com wrote:
> On Mon, 2017-06-26 at 08:22 -0700, Darrick J. Wong wrote:
> > On Fri, Jun 16, 2017 at 03:34:26PM -0400, Jeff Layton wrote:
> > > Just check and advance the data errseq_t in struct file before
> > > returning from fsync on normal files. Internal filemap_*
> > > callers are left as-is.
> > > 
> > > Signed-off-by: Jeff Layton 
> > > ---
> > >  fs/xfs/xfs_file.c | 15 +++
> > >  1 file changed, 11 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > > index 5fb5a0958a14..bc3b1575e8db 100644
> > > --- a/fs/xfs/xfs_file.c
> > > +++ b/fs/xfs/xfs_file.c
> > > @@ -134,7 +134,7 @@ xfs_file_fsync(
> > >   struct inode*inode = file->f_mapping-
> > > >host;
> > >   struct xfs_inode*ip = XFS_I(inode);
> > >   struct xfs_mount*mp = ip->i_mount;
> > > - int error = 0;
> > > + int error = 0, err2;
> > >   int log_flushed = 0;
> > >   xfs_lsn_t   lsn = 0;
> > >  
> > > @@ -142,10 +142,12 @@ xfs_file_fsync(
> > >  
> > >   error = filemap_write_and_wait_range(inode->i_mapping,
> > > start, end);
> > >   if (error)
> > > - return error;
> > > + goto out;
> > >  
> > > - if (XFS_FORCED_SHUTDOWN(mp))
> > > - return -EIO;
> > > + if (XFS_FORCED_SHUTDOWN(mp)) {
> > > + error = -EIO;
> > > + goto out;
> > > + }
> > >  
> > >   xfs_iflags_clear(ip, XFS_ITRUNCATED);
> > >  
> > > @@ -197,6 +199,11 @@ xfs_file_fsync(
> > >   mp->m_logdev_targp == mp->m_ddev_targp)
> > >   xfs_blkdev_issue_flush(mp->m_ddev_targp);
> > >  
> > > +out:
> > > + err2 = filemap_report_wb_err(file);
> > 
> > Could we have a comment here to remind anyone reading the code a year
> > from now that filemap_report_wb_err has side effects?  Pre-coffee me
> > was
> > wondering why we'd bother calling filemap_report_wb_err in the
> > XFS_FORCED_SHUTDOWN case, then remembered that it touches data
> > structures.
> > 
> > The first sentence of the commit message (really, the word 'advance')
> > added as a comment was adequate to remind me of the side effects.
> > 
> > Once that's added,
> > Reviewed-by: Darrick J. Wong 
> > 
> > --D
> > 
> 
> Yeah, definitely. I'm working on a respin of the series now to
> incorporate HCH's suggestion too. I'll add that in as well.
> 
> Maybe I should rename that function to file_check_and_advance_wb_err()
> ? It would be good to make it clear that it does advance the errseq_t
> cursor.

Seems like a good idea.

--D

> 
> > > + if (!error)
> > > + error = err2;
> > > +
> > >   return error;
> > >  }
> > >  
> > > -- 
> > > 2.13.0
> > > 


Re: [PATCH v2 14/51] btrfs: avoid to access bvec table directly for a cloned bio

2017-06-26 Thread Liu Bo
On Mon, Jun 26, 2017 at 08:09:57PM +0800, Ming Lei wrote:
> Commit 17347cec15f919901c90 ("Btrfs: change how we iterate bios in
> endio") mentioned that for a dio the submitted bio may be fast-cloned;
> we can't access the bvec table directly for a cloned bio, so use
> bio_get_first_bvec() to retrieve the first bvec.
>

Looks good to me.

Reviewed-by: Liu Bo 

-liubo
> Cc: Chris Mason 
> Cc: Josef Bacik 
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Liu Bo 
> Signed-off-by: Ming Lei 
> ---
>  fs/btrfs/inode.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 06dea7c89bbd..4ab02b34f029 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -7993,6 +7993,7 @@ static int dio_read_error(struct inode *inode, struct 
> bio *failed_bio,
>   int read_mode = 0;
>   int segs;
>   int ret;
> + struct bio_vec bvec;
>  
>   BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
>  
> @@ -8008,8 +8009,9 @@ static int dio_read_error(struct inode *inode, struct 
> bio *failed_bio,
>   }
>  
>   segs = bio_segments(failed_bio);
> + bio_get_first_bvec(failed_bio, &bvec);
>   if (segs > 1 ||
> - (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode)))
> + (bvec.bv_len > btrfs_inode_sectorsize(inode)))
>   read_mode |= REQ_FAILFAST_DEV;
>  
>   isector = start - btrfs_io_bio(failed_bio)->logical;
> -- 
> 2.9.4
> 


Re: [PATCH 9/9] nvme: add support for streams and directives

2017-06-26 Thread Jens Axboe
On 06/26/2017 11:52 AM, Martin K. Petersen wrote:
> 
> Christoph,
> 
>>  - can we keep a module option to disable streams, or in fact for
>>now maybe to explicitly enable it?  I expect this to be interesting
>>at least for the first devices that implement it.  Also given that
>>it needs to be explicitly enabled I would expect some overhead of
>>just enabling it when never used
> 
> Yeah, based on my experiments we'll need to drive this as an opt-in
> feature for now. Short term the module option is OK. Once more devices
> start materializing we probably need a white/blacklist/quirk scheme.

Completely agree. Might even need quirks for stream allocations too,
for instance. But let's hope we can keep it clean.

-- 
Jens Axboe



Re: [PATCH v7 21/22] xfs: minimal conversion to errseq_t writeback error reporting

2017-06-26 Thread jlayton
On Mon, 2017-06-26 at 08:22 -0700, Darrick J. Wong wrote:
> On Fri, Jun 16, 2017 at 03:34:26PM -0400, Jeff Layton wrote:
> > Just check and advance the data errseq_t in struct file before
> > returning from fsync on normal files. Internal filemap_*
> > callers are left as-is.
> > 
> > Signed-off-by: Jeff Layton 
> > ---
> >  fs/xfs/xfs_file.c | 15 +++
> >  1 file changed, 11 insertions(+), 4 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > index 5fb5a0958a14..bc3b1575e8db 100644
> > --- a/fs/xfs/xfs_file.c
> > +++ b/fs/xfs/xfs_file.c
> > @@ -134,7 +134,7 @@ xfs_file_fsync(
> > struct inode*inode = file->f_mapping-
> > >host;
> > struct xfs_inode*ip = XFS_I(inode);
> > struct xfs_mount*mp = ip->i_mount;
> > -   int error = 0;
> > +   int error = 0, err2;
> > int log_flushed = 0;
> > xfs_lsn_t   lsn = 0;
> >  
> > @@ -142,10 +142,12 @@ xfs_file_fsync(
> >  
> > error = filemap_write_and_wait_range(inode->i_mapping,
> > start, end);
> > if (error)
> > -   return error;
> > +   goto out;
> >  
> > -   if (XFS_FORCED_SHUTDOWN(mp))
> > -   return -EIO;
> > +   if (XFS_FORCED_SHUTDOWN(mp)) {
> > +   error = -EIO;
> > +   goto out;
> > +   }
> >  
> > xfs_iflags_clear(ip, XFS_ITRUNCATED);
> >  
> > @@ -197,6 +199,11 @@ xfs_file_fsync(
> > mp->m_logdev_targp == mp->m_ddev_targp)
> > xfs_blkdev_issue_flush(mp->m_ddev_targp);
> >  
> > +out:
> > +   err2 = filemap_report_wb_err(file);
> 
> Could we have a comment here to remind anyone reading the code a year
> from now that filemap_report_wb_err has side effects?  Pre-coffee me
> was
> wondering why we'd bother calling filemap_report_wb_err in the
> XFS_FORCED_SHUTDOWN case, then remembered that it touches data
> structures.
> 
> The first sentence of the commit message (really, the word 'advance')
> added as a comment was adequate to remind me of the side effects.
> 
> Once that's added,
> Reviewed-by: Darrick J. Wong 
> 
> --D
> 

Yeah, definitely. I'm working on a respin of the series now to
incorporate HCH's suggestion too. I'll add that in as well.

Maybe I should rename that function to file_check_and_advance_wb_err()
? It would be good to make it clear that it does advance the errseq_t
cursor.

> > +   if (!error)
> > +   error = err2;
> > +
> > return error;
> >  }
> >  
> > -- 
> > 2.13.0
> > 


Re: [PATCH 9/9] nvme: add support for streams and directives

2017-06-26 Thread Martin K. Petersen

Christoph,

>  - can we keep a module option to disable streams, or in fact for
>now maybe to explicitly enable it?  I expect this to be interesting
>at least for the first devices that implement it.  Also given that
>it needs to be explicitly enabled I would expect some overhead of
>just enabling it when never used

Yeah, based on my experiments we'll need to drive this as an opt-in
feature for now. Short term the module option is OK. Once more devices
start materializing we probably need a white/blacklist/quirk scheme.

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH v2 00/51] block: support multipage bvec

2017-06-26 Thread Jens Axboe
On 06/26/2017 06:09 AM, Ming Lei wrote:
> Hi,
> 
> This patchset brings multipage bvec into block layer:
> 
> 1) what is multipage bvec?
> 
> Multipage bvecs mean that one 'struct bio_vec' can hold
> multiple physically contiguous pages, instead of the single
> page the Linux kernel has used for a long time.
> 
> 2) why is multipage bvec introduced?
> 
> Kent proposed the idea[1] first. 
> 
> As systems' RAM becomes much bigger than before, and
> huge pages, transparent huge pages and memory compaction
> are widely used at the same time, it is now quite common
> to see physically contiguous pages from the fs in I/O.
> On the other hand, from the block layer's view, it isn't
> necessary to store the intermediate pages in a bvec; it
> is enough to just store the physically contiguous
> 'segment' in each io vector.
> 
> Also, huge pages are being brought to filesystems and swap
> [2][6], and we can do IO on a hugepage each time[3], which
> requires that one bio can transfer at least one huge page
> at a time. It turns out it isn't flexible to simply change
> BIO_MAX_PAGES[3][5]. Multipage bvecs fit this case very well.
> 
> With multipage bvec:
> 
> - segment handling in the block layer can be much improved
> in the future, since it should be quite easy to convert a
> multipage bvec into segments. For example, we might
> just store the segment in each bvec directly in future.
> 
> - bio size can be increased and it should improve some
> high-bandwidth IO case in theory[4].
> 
> - Inside block layer, both bio splitting and sg map can
> become more efficient than before by just traversing the
> physically contiguous 'segment' instead of each page.
> 
> - there is opportunity in future to improve memory footprint
> of bvecs. 
> 
> 3) how is multipage bvec implemented in this patchset?
> 
> The first 18 patches comment on and deal with special cases of
> direct access to the bvec table.
> 
> The 2nd part(19~29) implements multipage bvec in block layer:
> 
>   - put all the tricks into the bvec/bio/rq iterators; as long as
>   drivers and filesystems use these standard iterators, they are
>   happy with multipage bvecs
> 
>   - use multipage bvec to split bio and map sg
> 
>   - bio_for_each_segment_all() changes
>   this helper pass pointer of each bvec directly to user, and
>   it has to be changed. Two new helpers(bio_for_each_segment_all_sp()
>   and bio_for_each_segment_all_mp()) are introduced. 
> 
> The 3rd part(30~49) converts current users of bio_for_each_segment_all()
> to bio_for_each_segment_all_sp()/bio_for_each_segment_all_mp().
> 
> The last part(50~51) enables multipage bvec.
> 
> These patches can be found in the following git tree:
> 
>   https://github.com/ming1/linux/commits/mp-bvec-1.4-v4.12-rc
> 
> Thanks Christoph for looking at the early version and providing
> very good suggestions, such as: introduce bio_init_with_vec_table(),
> remove another unnecessary helpers for cleanup and so on.
> 
> Any comments are welcome!

I'll take some time to review this over the next week or so. In any
case, it's a little late to stuff into 4.13 and get a decent amount
of exposure and testing on it. A 4.14 target for this would be the
way to go, imho.


-- 
Jens Axboe



Re: [PATCH v2 00/51] block: support multipage bvec

2017-06-26 Thread David Sterba
On Mon, Jun 26, 2017 at 08:09:43PM +0800, Ming Lei wrote:
>   btrfs: avoid access to .bi_vcnt directly
>   btrfs: avoid to access bvec table directly for a cloned bio
>   btrfs: comment on direct access bvec table
>   btrfs: use bvec_get_last_page to get bio's last page
>   fs/btrfs: convert to bio_for_each_segment_all_sp()

Acked-by: David Sterba 

for all the btrfs patches.


Re: [PATCH] sd: add support for TCG OPAL self encrypting disks

2017-06-26 Thread Martin K. Petersen

Christoph,

> ping?

Looks good to me. I'll queue it up for 4.13 as soon as Linus has pulled
in the ata bits.

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-26 Thread Jens Axboe
On 06/26/2017 10:09 AM, Darrick J. Wong wrote:
> On Mon, Jun 26, 2017 at 07:55:27AM -0600, Jens Axboe wrote:
>> On 06/26/2017 03:51 AM, Christoph Hellwig wrote:
>>> Please document the userspace API (added linux-api and linux-man
>>> to CC for suggestions), especially including the odd effects of the
>>> per-inode settings.
>>
>> Of course, I'll send in a diff for the fcntl(2) man page.
>>
>>> Also I would highly recommend to use different fcntl commands
>>> for the file vs inode hints to avoid any strange behavior.
>>
>> OK, used to have that too... I can add specific _FILE versions.
> 
> While you're at it, can you also send in an xfstest or two to check the
> basic functionality of the fcntl so that we know the code reflects the
> userspace API ("I set this hint and now I can query it back" and "file
> hint overrides inode hint") that we want?

I definitely can. I already wrote the below to verify that it behaves
the way it should.


/*
 * test-writehints.c: test file/inode write hint setting/getting
 */
#include 
#include 
#include 
#include 
#include 
#include 
#include 

#ifndef F_GET_RW_HINT
#define F_LINUX_SPECIFIC_BASE   1024
#define F_GET_RW_HINT   (F_LINUX_SPECIFIC_BASE + 11)
#define F_SET_RW_HINT   (F_LINUX_SPECIFIC_BASE + 12)
#define F_GET_FILE_RW_HINT  (F_LINUX_SPECIFIC_BASE + 13)
#define F_SET_FILE_RW_HINT  (F_LINUX_SPECIFIC_BASE + 14)

#define RWF_WRITE_LIFE_NOT_SET  0
#define RWH_WRITE_LIFE_NONE 1
#define RWH_WRITE_LIFE_SHORT2
#define RWH_WRITE_LIFE_MEDIUM   3
#define RWH_WRITE_LIFE_LONG 4
#define RWH_WRITE_LIFE_EXTREME  5

#endif

static int __get_write_hint(int fd, int cmd)
{
uint64_t hint;
int ret;

ret = fcntl(fd, cmd, &hint);
if (ret < 0) {
perror("fcntl: F_GET_RW_FILE_HINT");
return -1;
}

return hint;
}

static int get_file_write_hint(int fd)
{
return __get_write_hint(fd, F_GET_FILE_RW_HINT);
}

static int get_inode_write_hint(int fd)
{
return __get_write_hint(fd, F_GET_RW_HINT);
}

static void set_file_write_hint(int fd, uint64_t hint)
{
uint64_t set_hint = hint;
int ret;

ret = fcntl(fd, F_SET_FILE_RW_HINT, &set_hint);
if (ret < 0) {
perror("fcntl: F_RW_SET_HINT");
return;
}
}

static void set_inode_write_hint(int fd, uint64_t hint)
{
uint64_t set_hint = hint;
int ret;

ret = fcntl(fd, F_SET_RW_HINT, &set_hint);
if (ret < 0) {
perror("fcntl: F_RW_SET_HINT");
return;
}
}

int main(int argc, char *argv[])
{
char filename[] = "/tmp/writehintsXX";
int ihint, fhint, fd;

fd = open(filename, O_RDWR | O_CREAT, 0644);
if (fd < 0) {
perror("open");
return 2;
}

/*
 * Default hints for both file and inode should be NOT_SET
 */
fhint = get_file_write_hint(fd);
if (fhint < 0)
return 0;
ihint = get_inode_write_hint(fd);
assert(fhint == ihint);
assert(fhint == RWF_WRITE_LIFE_NOT_SET);

/*
 * Set inode hint, check file hint returns the right hint
 */
set_inode_write_hint(fd, RWH_WRITE_LIFE_SHORT);
fhint = get_file_write_hint(fd);
ihint = get_inode_write_hint(fd);
assert(fhint == ihint);
assert(fhint == RWH_WRITE_LIFE_SHORT);

/*
 * Now set file hint, ensure that this is now the hint we get
 */
set_file_write_hint(fd, RWH_WRITE_LIFE_LONG);
fhint = get_file_write_hint(fd);
ihint = get_inode_write_hint(fd);
assert(fhint == RWH_WRITE_LIFE_LONG);
assert(ihint == RWH_WRITE_LIFE_SHORT);

/*
 * Clear inode write hint, ensure that file still returns the set hint
 */
set_inode_write_hint(fd, RWF_WRITE_LIFE_NOT_SET);
fhint = get_file_write_hint(fd);
ihint = get_inode_write_hint(fd);
assert(fhint == RWH_WRITE_LIFE_LONG);
assert(ihint == RWF_WRITE_LIFE_NOT_SET);

/*
 * Clear file write hint, ensure that now returns cleared
 */
set_file_write_hint(fd, RWF_WRITE_LIFE_NOT_SET);
fhint = get_file_write_hint(fd);
assert(fhint == RWF_WRITE_LIFE_NOT_SET);

close(fd);
unlink(filename);
return 0;
}


-- 
Jens Axboe



Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-26 Thread Darrick J. Wong
On Mon, Jun 26, 2017 at 07:55:27AM -0600, Jens Axboe wrote:
> On 06/26/2017 03:51 AM, Christoph Hellwig wrote:
> > Please document the userspace API (added linux-api and linux-man
> > to CC for suggestions), especially including the odd effects of the
> > per-inode settings.
> 
> Of course, I'll send in a diff for the fcntl(2) man page.
> 
> > Also I would highly recommend to use different fcntl commands
> > for the file vs inode hints to avoid any strange behavior.
> 
> OK, used to have that too... I can add specific _FILE versions.

While you're at it, can you also send in an xfstest or two to check the
basic functionality of the fcntl so that we know the code reflects the
userspace API ("I set this hint and now I can query it back" and "file
hint overrides inode hint") that we want?

--D

> 
> -- 
> Jens Axboe
> 


[PATCH 7/9] xfs: add support for passing in write hints for buffered writes

2017-06-26 Thread Jens Axboe
Reviewed-by: Andreas Dilger 
Reviewed-by: Martin K. Petersen 
Reviewed-by: Darrick J. Wong 
Signed-off-by: Jens Axboe 
---
 fs/xfs/xfs_aops.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 76b6f988e2fa..ceb124bd8f80 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -506,6 +506,7 @@ xfs_submit_ioend(
return status;
}
 
+   ioend->io_bio->bi_opf |= inode_hint_to_opf(ioend->io_inode);
submit_bio(ioend->io_bio);
return 0;
 }
@@ -565,6 +566,7 @@ xfs_chain_bio(
bio_chain(ioend->io_bio, new);
bio_get(ioend->io_bio); /* for xfs_destroy_ioend */
ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+   ioend->io_bio->bi_opf |= inode_hint_to_opf(ioend->io_inode);
submit_bio(ioend->io_bio);
ioend->io_bio = new;
 }
-- 
2.7.4



[PATCH 2/9] block: add support for write hints in a bio

2017-06-26 Thread Jens Axboe
No functional changes in this patch, we just set aside 3 bits
in the bio/request flags, which can be used to hold a WRITE_LIFE_*
life time hint.

Ensure that we don't merge requests that have different life time
hints assigned to them.

Reviewed-by: Martin K. Petersen 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Jens Axboe 
---
 block/blk-merge.c | 16 
 fs/inode.c|  9 +
 include/linux/blk_types.h | 31 +++
 include/linux/fs.h|  2 ++
 4 files changed, 58 insertions(+)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 5df13041b851..be1e955db75e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -673,6 +673,14 @@ static struct request *attempt_merge(struct request_queue 
*q,
return NULL;
 
/*
+* Don't allow merge of different write hints, or for a hint with
+* non-hint IO.
+*/
+   if ((req->cmd_flags & REQ_WRITE_LIFE_MASK) !=
+   (next->cmd_flags & REQ_WRITE_LIFE_MASK))
+   return NULL;
+
+   /*
 * If we are allowed to merge, then append bio list
 * from next to rq and release next. merge_requests_fn
 * will have updated segment counts, update sector
@@ -791,6 +799,14 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
!blk_write_same_mergeable(rq->bio, bio))
return false;
 
+   /*
+* Don't allow merge of different write hints, or for a hint with
+* non-hint IO.
+*/
+   if ((rq->cmd_flags & REQ_WRITE_LIFE_MASK) !=
+   (bio->bi_opf & REQ_WRITE_LIFE_MASK))
+   return false;
+
return true;
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index defb015a2c6d..66cc431c9a96 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2131,3 +2131,12 @@ void inode_set_write_hint(struct inode *inode, enum 
rw_hint hint)
inode_unlock(inode);
}
 }
+
+/*
+ * Returns block write hint mask for the inode
+ */
+unsigned int inode_hint_to_opf(struct inode *inode)
+{
+   return write_hint_to_opf(inode_write_hint(inode));
+}
+EXPORT_SYMBOL(inode_hint_to_opf);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index e210da6d14b8..0d44dce19d9f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -7,6 +7,7 @@
 
 #include 
 #include 
+#include 
 
 struct bio_set;
 struct bio;
@@ -223,6 +224,10 @@ enum req_flag_bits {
__REQ_RAHEAD,   /* read ahead, can fail anytime */
__REQ_BACKGROUND,   /* background IO */
 
+   __REQ_WRITE_HINT_SHIFT, /* 3 bits for life time hint */
+   __REQ_WRITE_HINT_PAD1,
+   __REQ_WRITE_HINT_PAD2,
+
/* command specific flags for REQ_OP_WRITE_ZEROES: */
__REQ_NOUNMAP,  /* do not free blocks when zeroing */
 
@@ -244,6 +249,13 @@ enum req_flag_bits {
 #define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
 
+#define REQ_WRITE_SHORT(WRITE_LIFE_SHORT << 
__REQ_WRITE_HINT_SHIFT)
+#define REQ_WRITE_MEDIUM   (WRITE_LIFE_MEDIUM << __REQ_WRITE_HINT_SHIFT)
+#define REQ_WRITE_LONG (WRITE_LIFE_LONG << __REQ_WRITE_HINT_SHIFT)
+#define REQ_WRITE_EXTREME  (WRITE_LIFE_EXTREME << __REQ_WRITE_HINT_SHIFT)
+
+#define REQ_WRITE_LIFE_MASK(0x7 << __REQ_WRITE_HINT_SHIFT)
+
 #define REQ_NOUNMAP(1ULL << __REQ_NOUNMAP)
 #define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
 
@@ -335,4 +347,23 @@ struct blk_rq_stat {
u64 batch;
 };
 
+static inline unsigned int write_hint_to_opf(enum rw_hint hint)
+{
+   return hint << __REQ_WRITE_HINT_SHIFT;
+}
+
+/*
+ * Don't let drivers see WRITE_LIFE_NOT_SET, return NONE for that
+ */
+static inline enum rw_hint opf_to_write_hint(unsigned int opf)
+{
+   enum rw_hint ret;
+
+   ret = (opf & REQ_WRITE_LIFE_MASK) >> __REQ_WRITE_HINT_SHIFT;
+   if (ret == WRITE_LIFE_NOT_SET)
+   ret = WRITE_LIFE_NONE;
+
+   return ret;
+}
+
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0ef5d110d2bc..86888a6ccad1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1912,6 +1912,8 @@ static inline bool HAS_UNMAPPED_ID(struct inode *inode)
return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid);
 }
 
+extern unsigned int inode_hint_to_opf(struct inode *inode);
+
 static inline unsigned int write_hint_to_mask(enum rw_hint hint,
  unsigned int shift)
 {
-- 
2.7.4



[PATCH 9/9] nvme: add support for streams and directives

2017-06-26 Thread Jens Axboe
This adds support for Directives in NVMe, in particular for the Streams
directive. Support for Directives is a new feature in NVMe 1.3. It
allows a user to pass in information about where to store the data, so
that the device can do so most efficiently. If an application is
managing and writing data with different life times, mixing data with
different retention characteristics onto the same flash locations can
cause write amplification to grow. This, in turn, will reduce
performance and the life time of the device.

Reviewed-by: Martin K. Petersen 
Signed-off-by: Jens Axboe 
---
 drivers/nvme/host/core.c | 148 +--
 drivers/nvme/host/nvme.h |   4 ++
 include/linux/nvme.h |  48 +++
 3 files changed, 196 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index aee37b73231d..2d9835617953 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -65,6 +65,10 @@ static bool force_apst;
 module_param(force_apst, bool, 0644);
 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if 
quirked off");
 
+static bool streams;
+module_param(streams, bool, 0644);
+MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
+
 struct workqueue_struct *nvme_wq;
 EXPORT_SYMBOL_GPL(nvme_wq);
 
@@ -297,6 +301,102 @@ struct request *nvme_alloc_request(struct request_queue 
*q,
 }
 EXPORT_SYMBOL_GPL(nvme_alloc_request);
 
+static int nvme_enable_streams(struct nvme_ctrl *ctrl)
+{
+   struct nvme_command c;
+
+   memset(&c, 0, sizeof(c));
+
+   c.directive.opcode = nvme_admin_directive_send;
+   c.directive.nsid = cpu_to_le32(0xffffffff);
+   c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
+   c.directive.dtype = NVME_DIR_IDENTIFY;
+   c.directive.tdtype = NVME_DIR_STREAMS;
+   c.directive.endir = NVME_DIR_ENDIR;
+
+   return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
+}
+
+static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
+ struct streams_directive_params *s, u32 nsid)
+{
+   struct nvme_command c;
+
+   memset(&c, 0, sizeof(c));
+   memset(s, 0, sizeof(*s));
+
+   c.directive.opcode = nvme_admin_directive_recv;
+   c.directive.nsid = cpu_to_le32(nsid);
+   c.directive.numd = sizeof(*s);
+   c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
+   c.directive.dtype = NVME_DIR_STREAMS;
+
+   return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
+}
+
+static int nvme_configure_directives(struct nvme_ctrl *ctrl)
+{
+   struct streams_directive_params s;
+   int ret;
+
+   if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
+   return 0;
+   if (!streams)
+   return 0;
+
+   ret = nvme_enable_streams(ctrl);
+   if (ret)
+   return ret;
+
+   ret = nvme_get_stream_params(ctrl, &s, 0xffffffff);
+   if (ret)
+   return ret;
+
+   ctrl->nssa = le16_to_cpu(s.nssa);
+   ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
+   return 0;
+}
+
+/*
+ * Write hint number to stream mappings
+ */
+static const unsigned int 
stream_mappings[BLK_MAX_WRITE_HINTS][BLK_MAX_WRITE_HINTS] = {
+   /* 0 or 1 stream, we don't use streams */
+   { 0, },
+   { 0, },
+   /* collapse short+medium to short, and long+extreme to medium */
+   { WRITE_LIFE_NONE, WRITE_LIFE_SHORT, WRITE_LIFE_SHORT,
+   WRITE_LIFE_MEDIUM, WRITE_LIFE_MEDIUM },
+   /* collapse long+extreme to long */
+   { WRITE_LIFE_NONE, WRITE_LIFE_SHORT, WRITE_LIFE_MEDIUM,
+   WRITE_LIFE_LONG, WRITE_LIFE_LONG },
+   /* 4 streams, no collapsing needed */
+   { WRITE_LIFE_NONE, WRITE_LIFE_SHORT, WRITE_LIFE_MEDIUM,
+   WRITE_LIFE_LONG, WRITE_LIFE_EXTREME },
+};
+
+/*
+ * Check if 'req' has a write hint associated with it. If it does, assign
+ * a valid namespace stream to the write. If we haven't setup streams yet,
+ * kick off configuration and ignore the hints until that has completed.
+ */
+static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
+struct request *req, u16 *control,
+u32 *dsmgmt)
+{
+   enum rw_hint streamid;
+
+   streamid = opf_to_write_hint(req->cmd_flags);
+   if (streamid != WRITE_LIFE_NONE) {
+   streamid = stream_mappings[ctrl->nr_streams][streamid - 1];
+   *control |= NVME_RW_DTYPE_STREAMS;
+   *dsmgmt |= streamid << 16;
+   }
+
+   if (streamid < ARRAY_SIZE(req->q->write_hints))
+   req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
+}
+
 static inline void nvme_setup_flush(struct nvme_ns *ns,
struct nvme_command *cmnd)
 {
@@ -348,6 +448,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, 
struct request *req,
 static inline 

[PATCH 8/9] btrfs: add support for passing in write hints for buffered writes

2017-06-26 Thread Jens Axboe
Reviewed-by: Andreas Dilger 
Signed-off-by: Chris Mason 
Reviewed-by: Martin K. Petersen 
Signed-off-by: Jens Axboe 
---
 fs/btrfs/extent_io.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 19eedf2e630b..fde09c6005fc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2830,6 +2830,7 @@ static int submit_extent_page(int op, int op_flags, 
struct extent_io_tree *tree,
bio_add_page(bio, page, page_size, offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
+   op_flags |= inode_hint_to_opf(page->mapping->host);
bio_set_op_attrs(bio, op, op_flags);
if (wbc) {
wbc_init_bio(wbc, bio);
-- 
2.7.4



[PATCH 5/9] fs: add support for buffered writeback to pass down write hints

2017-06-26 Thread Jens Axboe
Reviewed-by: Andreas Dilger 
Reviewed-by: Martin K. Petersen 
Signed-off-by: Jens Axboe 
---
 fs/buffer.c | 14 +-
 fs/mpage.c  |  1 +
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 306b720f7383..307b508c9d60 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -49,7 +49,7 @@
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
-struct writeback_control *wbc);
+unsigned int hint, struct writeback_control *wbc);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
@@ -1829,7 +1829,8 @@ int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
-   submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
+   submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
+   inode_write_hint(inode), wbc);
nr_underway++;
}
bh = next;
@@ -1883,7 +1884,8 @@ int __block_write_full_page(struct inode *inode, struct page *page,
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
-   submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
+   submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
+   inode_write_hint(inode), wbc);
nr_underway++;
}
bh = next;
@@ -3091,7 +3093,7 @@ void guard_bio_eod(int op, struct bio *bio)
 }
 
 static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
-struct writeback_control *wbc)
+unsigned int write_hint, struct writeback_control *wbc)
 {
struct bio *bio;
 
@@ -3134,6 +3136,8 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
op_flags |= REQ_META;
if (buffer_prio(bh))
op_flags |= REQ_PRIO;
+
+   op_flags |= write_hint_to_opf(write_hint);
bio_set_op_attrs(bio, op, op_flags);
 
submit_bio(bio);
@@ -3142,7 +3146,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
 
 int submit_bh(int op, int op_flags, struct buffer_head *bh)
 {
-   return submit_bh_wbc(op, op_flags, bh, NULL);
+   return submit_bh_wbc(op, op_flags, bh, 0, NULL);
 }
 EXPORT_SYMBOL(submit_bh);
 
diff --git a/fs/mpage.c b/fs/mpage.c
index 9524fdde00c2..07587fd6debf 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -615,6 +615,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
goto confused;
 
wbc_init_bio(wbc, bio);
+   bio->bi_opf |= inode_hint_to_opf(inode);
}
 
/*
-- 
2.7.4



[PATCH 4/9] fs: add O_DIRECT support for sending down write life time hints

2017-06-26 Thread Jens Axboe
Reviewed-by: Andreas Dilger 
Reviewed-by: Martin K. Petersen 
Signed-off-by: Jens Axboe 
---
 fs/block_dev.c | 2 ++
 fs/direct-io.c | 2 ++
 fs/iomap.c | 5 -
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index dd91c99e9ba0..30e1fb65c2fa 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -183,6 +183,8 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
/* avoid the need for a I/O completion work item */
if (iocb->ki_flags & IOCB_DSYNC)
op |= REQ_FUA;
+
+   op |= write_hint_to_opf(iocb_write_hint(iocb));
return op;
 }
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index c87077d1dc33..5fea570551e5 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -385,6 +385,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
else
bio->bi_end_io = dio_bio_end_io;
 
+   bio->bi_opf |= write_hint_to_opf(iocb_write_hint(dio->iocb));
+
sdio->bio = bio;
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
 }
diff --git a/fs/iomap.c b/fs/iomap.c
index c71a64b97fba..42d4ecf3ba54 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -803,7 +803,10 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
}
 
if (dio->flags & IOMAP_DIO_WRITE) {
-   bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+   bio_set_op_attrs(bio, REQ_OP_WRITE,
+   REQ_SYNC | REQ_IDLE);
+   bio->bi_opf |=
+   write_hint_to_opf(iocb_write_hint(dio->iocb));
task_io_account_write(bio->bi_iter.bi_size);
} else {
bio_set_op_attrs(bio, REQ_OP_READ, 0);
-- 
2.7.4



[PATCHSET v10] Add support for write life time hints

2017-06-26 Thread Jens Axboe
A new iteration of this patchset, previously known as write streams.
As before, this patchset aims at enabling applications to split up
writes into separate streams, based on the perceived life time
of the data written. This is useful for a variety of reasons:

- For NVMe, this feature is ratified and released with the NVMe 1.3
  spec. Devices implementing Directives can expose multiple streams.
  Separating data written into streams based on life time can
  drastically reduce the write amplification. This helps device
  endurance, and increases performance. Testing just performed
  internally at Facebook with these patches showed up to a 25% reduction
  in NAND writes in a RocksDB setup.

- Software caching solutions can make more intelligent decisions
  on how and where to place data.

Contrary to previous patches, we're not exposing numeric stream values anymore.
I've previously advocated for just doing a set of hints that makes sense
instead. See the coverage from the LSFMM summit this year:

https://lwn.net/Articles/717755/

This patchset attempts to do that. We add an fcntl(2) interface to
get/set these types of hints. We define 4 hints that pertain to
data write life times:

RWH_WRITE_LIFE_SHORT    Data written with this flag is expected to have
                        a high overwrite rate, i.e. a short life time.

RWH_WRITE_LIFE_MEDIUM   Longer life time than SHORT

RWH_WRITE_LIFE_LONG Longer life time than MEDIUM

RWH_WRITE_LIFE_EXTREME  Longer life time than LONG

The idea is that these are relative values, so an application can
use them as they see fit. The underlying device can then place
data appropriately, or be free to ignore the hint. It's just a hint.

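As a minimal usage sketch (assuming the fcntl(2) commands and RWH_*
values from this posting; the numeric fallbacks are only for building
against older headers), an application could tag an append-only log
like so:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

#ifndef F_SET_RW_HINT
#define F_LINUX_SPECIFIC_BASE	1024
#define F_SET_RW_HINT		(F_LINUX_SPECIFIC_BASE + 12)
#endif
#ifndef RWH_WRITE_LIFE_SHORT
#define RWH_WRITE_LIFE_SHORT	2
#endif

int main(void)
{
	uint64_t hint = RWH_WRITE_LIFE_SHORT;
	int fd = open("journal.log", O_WRONLY | O_CREAT | O_APPEND, 0644);

	/* tag everything written through this inode as short-lived */
	if (fd < 0 || fcntl(fd, F_SET_RW_HINT, &hint) < 0)
		perror("write hint");
	close(fd);
	return 0;
}

A device with streams support can use the tag for placement; anything
else simply ignores it.
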
A branch based on current master can be pulled
from here:

git://git.kernel.dk/linux-block write-stream.10

Changes since v9:

- Address Christoph's concerns:
- Add NVMe 'streams' parameter, default to off.
- Add file get/set fcntl() commands.
- Add helper for getting block opf mask from inode write
  hint.
- Fix up a few lines to keep them under 80 characters.

Changes since v8:

- Add file write hints as well. File hints override inode hints,
  if both are valid and available.
- Distinguish between "hint not set" or "hint none".
- NVMe: remove global stream allocation and stream parameter
- Rebase on top of new for-4.13/block, to fixup conflicts with
  the NOWAIT patchset.

Changes since v7:

- NVMe: change 'streams' parameter to be a bool enable/disable. We
  hardwire the number of streams anyway and use the appropriate amount,
  so no point in exposing this value.
- NVMe: collapse stream values appropriately, instead of just doing
  a basic MOD.
- Get rid of pwritev2(2) flags. Just use the fcntl(2) interface.
- Collapse some patches
- Change fcntl(2) interface to get/set values from a user supplied
  64-bit pointer.
- Move inode-to-iocb mask setting to iocb_flags().

Changes since v6:

- Rewrite NVMe write stream assignment
- Change NVMe stream assignment to be per-controller, not per-ns. Then
  we can use the same IDs across name spaces, and we don't have to do
  lazy setup of streams.
- If streams are enabled on nvme, set io min/opt and discard
  granularity based on the stream params reported.
- Fixup F_SET_RW_HINT definition, it was 20, should have been 12.

Changes since v5:

- Change enum write_hint to enum rw_hint.
- Change fcntl() interface to be read/write generic
- Bring enum rw_hint all the way to bio/request
- Change references to streams in changelogs and debugfs interface
- Rebase to master to resolve blkdev.h conflict
- Reshuffle patches so the WRITE_LIFE_* hints and type come first. Allowed
  me to merge two block patches as well.

Changes since v4:

- Add enum write_hint and the WRITE_HINT_* values. This is what we
  use internally (until transformed to req/bio flags), and what is
  exposed to user space with the fcntl() interface. Maps directly
  to the RWF_WRITE_LIFE_* values.
- Add fcntl() interface for getting/setting hint values.
- Get rid of inode ->i_write_hint, encode the 3 bits of hint info
  in the inode flags instead.
- Allow a write with no hint to clear the old hint. Previously we
  only changed the hint if a new valid hint was given, not if no
  hint was passed in.
- Shrink flag space grabbed from 4 to 3 bits for RWF_* and the inode
  flags.

Changes since v3:

- Change any naming of stream ID to write hint.
- Various little API changes, suggested by Christoph
- Cleanup the NVMe bits, dump the debug info.
- Change NVMe to lazily allocate the streams.
- Various NVMe error handling improvements and command checking.

Changes since v2:

- Get rid of bio->bi_stream and replace with four request/bio flags.
  These map directly to the RWF_WRITE_* flags that the user passes in.
- Cleanup the NVMe stream setting.
- Drivers now responsible for updating the queue stream write counter,
  as they determine what stream to map a given flag to.

Changes since v1:

- Guard queue stream stats to ensure we don't mess up 

[PATCH 6/9] ext4: add support for passing in write hints for buffered writes

2017-06-26 Thread Jens Axboe
Reviewed-by: Andreas Dilger 
Reviewed-by: Martin K. Petersen 
Signed-off-by: Jens Axboe 
---
 fs/ext4/page-io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 930ca0fc9a0f..02e5a7b8d60b 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -350,6 +350,7 @@ void ext4_io_submit(struct ext4_io_submit *io)
if (bio) {
int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
  REQ_SYNC : 0;
+   io_op_flags |= inode_hint_to_opf(io->io_end->inode);
bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
submit_bio(io->io_bio);
}
@@ -397,6 +398,7 @@ static int io_submit_add_bh(struct ext4_io_submit *io,
ret = io_submit_init_bio(io, bh);
if (ret)
return ret;
+   io->io_bio->bi_opf |= inode_hint_to_opf(inode);
}
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
-- 
2.7.4



[PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-26 Thread Jens Axboe
Define a set of write life time hints:

RWH_WRITE_LIFE_NOT_SET  No hint information set
RWH_WRITE_LIFE_NONE No hints about write life time
RWH_WRITE_LIFE_SHORTData written has a short life time
RWH_WRITE_LIFE_MEDIUM   Data written has a medium life time
RWH_WRITE_LIFE_LONG Data written has a long life time
RWH_WRITE_LIFE_EXTREME  Data written has an extremely long life time

The intent is for these values to be relative to each other; no
absolute meaning should be attached to these flag names.

Add an fcntl interface for querying these flags, and also for
setting them as well:

F_GET_RW_HINT   Returns the read/write hint set on the
underlying inode.

F_SET_RW_HINT   Set one of the above write hints on the
underlying inode.

F_GET_FILE_RW_HINT  Returns the read/write hint set on the
file descriptor.

F_SET_FILE_RW_HINT  Set one of the above write hints on the
file descriptor.

The user passes in a 64-bit pointer to get/set these values, and
the interface returns 0/-1 on success/error.

Sample program testing/implementing basic setting/getting of write
hints is below.

Add support for storing the write life time hint in the inode flags
and in struct file as well, and pass them to the kiocb flags. If
both a file and its corresponding inode has a write hint, then we
use the one in the file, if available. The file hint can be used
for sync/direct IO; for buffered writeback only the inode hint
is available.

This is in preparation for utilizing these hints in the block layer,
to guide on-media data placement.

/*
 * writehint.c: get or set an inode write hint
 */
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <fcntl.h>
 #include <unistd.h>

 #ifndef F_GET_RW_HINT
 #define F_LINUX_SPECIFIC_BASE  1024
 #define F_GET_RW_HINT  (F_LINUX_SPECIFIC_BASE + 11)
 #define F_SET_RW_HINT  (F_LINUX_SPECIFIC_BASE + 12)
 #endif

static char *str[] = { "RWH_WRITE_LIFE_NOT_SET", "RWH_WRITE_LIFE_NONE",
"RWH_WRITE_LIFE_SHORT", "RWH_WRITE_LIFE_MEDIUM",
"RWH_WRITE_LIFE_LONG", "RWH_WRITE_LIFE_EXTREME" };

int main(int argc, char *argv[])
{
uint64_t hint;
int fd, ret;

if (argc < 2) {
fprintf(stderr, "%s: file <hint>\n", argv[0]);
return 1;
}

fd = open(argv[1], O_RDONLY);
if (fd < 0) {
perror("open");
return 2;
}

if (argc > 2) {
hint = atoi(argv[2]);
ret = fcntl(fd, F_SET_RW_HINT, &hint);
if (ret < 0) {
perror("fcntl: F_SET_RW_HINT");
return 4;
}
}

ret = fcntl(fd, F_GET_RW_HINT, &hint);
if (ret < 0) {
perror("fcntl: F_GET_RW_HINT");
return 3;
}

printf("%s: hint %s\n", argv[1], str[hint]);
close(fd);
return 0;
}

Reviewed-by: Martin K. Petersen 
Signed-off-by: Jens Axboe 
---
 fs/fcntl.c | 66 +
 fs/inode.c | 11 +++
 fs/open.c  |  1 +
 include/linux/fs.h | 74 --
 include/uapi/linux/fcntl.h | 21 +
 5 files changed, 171 insertions(+), 2 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index f4e7267d117f..e166807646bf 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -243,6 +243,66 @@ static int f_getowner_uids(struct file *filp, unsigned long arg)
 }
 #endif
 
+static long fcntl_rw_hint(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+   struct inode *inode = file_inode(file);
+   bool on_file = false;
+   enum rw_hint hint;
+   long ret = 0;
+
+   switch (cmd) {
+   case F_GET_FILE_RW_HINT:
+   on_file = true;
+   case F_GET_RW_HINT:
+   /*
+* If we ask for the file descriptor hint and it isn't set,
+* return the underlying inode write hint. This is what
+* writeback does as well.
+*/
+   hint = RWF_WRITE_LIFE_NOT_SET;
+   if (on_file)
+   hint = file->f_write_hint;
+
+   if (!on_file || hint == RWF_WRITE_LIFE_NOT_SET)
+   hint = mask_to_write_hint(inode->i_flags,
+   S_WRITE_LIFE_SHIFT);
+   if (put_user(hint, (u64 __user *) arg))
+   ret = -EFAULT;
+   break;
+   case F_SET_FILE_RW_HINT:
+   on_file = true;
+   case F_SET_RW_HINT:
+   if (get_user(hint, (u64 __user *) arg)) {
+   ret = -EFAULT;
+   break;
+   }
+   switch (hint) {
+

[PATCH 3/9] blk-mq: expose write hints through debugfs

2017-06-26 Thread Jens Axboe
Useful to verify that things are working the way they should.
Reading the file will return the number of kb written with each
write hint. Writing the file will reset the statistics. No care
is taken to ensure that we don't race on updates.

Drivers will write to q->write_hints[] if they handle a given
write hint.

Reviewed-by: Andreas Dilger 
Reviewed-by: Martin K. Petersen 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Jens Axboe 
---
 block/blk-mq-debugfs.c | 24 
 include/linux/blkdev.h |  3 +++
 2 files changed, 27 insertions(+)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 9edebbdce0bd..9ebc2945f991 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -135,6 +135,29 @@ static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
}
 }
 
+static int queue_write_hint_show(void *data, struct seq_file *m)
+{
+   struct request_queue *q = data;
+   int i;
+
+   for (i = 0; i < BLK_MAX_WRITE_HINTS; i++)
+   seq_printf(m, "hint%d: %llu\n", i, q->write_hints[i]);
+
+   return 0;
+}
+
+static ssize_t queue_write_hint_store(void *data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+   struct request_queue *q = data;
+   int i;
+
+   for (i = 0; i < BLK_MAX_WRITE_HINTS; i++)
+   q->write_hints[i] = 0;
+
+   return count;
+}
+
 static int queue_poll_stat_show(void *data, struct seq_file *m)
 {
struct request_queue *q = data;
@@ -730,6 +753,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{"poll_stat", 0400, queue_poll_stat_show},
{"requeue_list", 0400, .seq_ops = _requeue_list_seq_ops},
{"state", 0600, queue_state_show, queue_state_write},
+   {"write_hints", 0600, queue_write_hint_show, queue_write_hint_store},
{},
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bf2157141d53..596de77b9a0a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -594,6 +594,9 @@ struct request_queue {
void*rq_alloc_data;
 
struct work_struct  release_work;
+
+#define BLK_MAX_WRITE_HINTS5
+   u64 write_hints[BLK_MAX_WRITE_HINTS];
 };
 
 #define QUEUE_FLAG_QUEUED  1   /* uses generic tag queueing */
-- 
2.7.4


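To eyeball the counters by hand, a reader/resetter can be as small as
the sketch below; the debugfs location and the example disk name are
assumptions here, not something this patch spells out:

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/block/nvme0n1/write_hints";
	char line[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))	/* "hintN: <kb written>" */
		fputs(line, stdout);
	fclose(f);

	f = fopen(path, "w");			/* any write resets the stats */
	if (f) {
		fputs("0\n", f);
		fclose(f);
	}
	return 0;
}
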

Re: [PATCH v7 21/22] xfs: minimal conversion to errseq_t writeback error reporting

2017-06-26 Thread Darrick J. Wong
On Fri, Jun 16, 2017 at 03:34:26PM -0400, Jeff Layton wrote:
> Just check and advance the data errseq_t in struct file before
> returning from fsync on normal files. Internal filemap_*
> callers are left as-is.
> 
> Signed-off-by: Jeff Layton 
> ---
>  fs/xfs/xfs_file.c | 15 +++
>  1 file changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 5fb5a0958a14..bc3b1575e8db 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -134,7 +134,7 @@ xfs_file_fsync(
>   struct inode*inode = file->f_mapping->host;
>   struct xfs_inode*ip = XFS_I(inode);
>   struct xfs_mount*mp = ip->i_mount;
> - int error = 0;
> + int error = 0, err2;
>   int log_flushed = 0;
>   xfs_lsn_t   lsn = 0;
>  
> @@ -142,10 +142,12 @@ xfs_file_fsync(
>  
>   error = filemap_write_and_wait_range(inode->i_mapping, start, end);
>   if (error)
> - return error;
> + goto out;
>  
> - if (XFS_FORCED_SHUTDOWN(mp))
> - return -EIO;
> + if (XFS_FORCED_SHUTDOWN(mp)) {
> + error = -EIO;
> + goto out;
> + }
>  
>   xfs_iflags_clear(ip, XFS_ITRUNCATED);
>  
> @@ -197,6 +199,11 @@ xfs_file_fsync(
>   mp->m_logdev_targp == mp->m_ddev_targp)
>   xfs_blkdev_issue_flush(mp->m_ddev_targp);
>  
> +out:
> + err2 = filemap_report_wb_err(file);

Could we have a comment here to remind anyone reading the code a year
from now that filemap_report_wb_err has side effects?  Pre-coffee me was
wondering why we'd bother calling filemap_report_wb_err in the
XFS_FORCED_SHUTDOWN case, then remembered that it touches data
structures.

The first sentence of the commit message (really, the word 'advance')
added as a comment was adequate to remind me of the side effects.

Once that's added,
Reviewed-by: Darrick J. Wong 

--D

> + if (!error)
> + error = err2;
> +
>   return error;
>  }
>  
> -- 
> 2.13.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 16/22] block: convert to errseq_t based writeback error tracking

2017-06-26 Thread Jeff Layton
On Sat, 2017-06-24 at 09:16 -0400, Jeff Layton wrote:
> On Sat, 2017-06-24 at 04:59 -0700, Christoph Hellwig wrote:
> > On Tue, Jun 20, 2017 at 01:44:44PM -0400, Jeff Layton wrote:
> > > In order to query for errors with errseq_t, you need a previously-
> > > sampled point from which to check. When you call
> > > filemap_write_and_wait_range though you don't have a struct file and so
> > > no previously-sampled value.
> > 
> > So can we simply introduce variants of them that take a struct file?
> > That would be:
> > 
> >  a) less churn
> >  b) less code
> >  c) less chance to get data integrity wrong
> 
> Yeah, I had that thought after I sent the reply to you earlier.
> 
> The main reason I didn't do that before was that I had myself convinced
> that we needed to do the check_and_advance as late as possible in the
> fsync process, after the metadata had been written.
> 
> Now that I think about it more, I think you're probably correct. As long
> as we do the check and advance at some point after doing the
> write_and_wait, we're fine here and shouldn't violate exactly once
> semantics on the fsync return.

So I have a file_write_and_wait_range now that should DTRT for this
patch.

The bigger question is -- what about more complex filesystems like
ext4?  There are a couple of cases where we can return -EIO or -EROFS on
fsync before filemap_write_and_wait_range is ever called. Like this one
for instance:

if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb
return -EIO;

...and the EXT4_MF_FS_ABORTED case.

Are those conditions ever recoverable, such that a later fsync could
succeed? IOW, could I do a remount or something such that the existing
fds are left open and become usable again? 

If so, then we really ought to advance the errseq_t in the file when we
catch those cases as well. If we have to do that, then it probably makes
sense to leave the ext4 patch as-is.
-- 
Jeff Layton 

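For readers following the thread, the sample-then-check-and-advance
pattern under discussion looks roughly like this; a sketch against the
errseq_t helpers from this series, with a hypothetical stand-in for the
real struct file / address_space fields:

#include <linux/errseq.h>

struct wb_state {
	errseq_t wb_err;	/* where writeback errors are recorded */
};

struct opener {
	errseq_t err_cursor;	/* sampled when the file was opened */
};

static int fsync_tail(struct wb_state *ws, struct opener *o)
{
	/*
	 * Report a writeback error at most once per opener, then
	 * advance this opener's cursor past it; the advance is the
	 * side effect Darrick asks to be documented above.
	 */
	return errseq_check_and_advance(&ws->wb_err, &o->err_cursor);
}
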

Re: [PATCH blktests 2/2] block/011: Perform PCI reset while doing IO

2017-06-26 Thread Jens Axboe
On 06/26/2017 08:06 AM, Johannes Thumshirn wrote:
> On Fri, Jun 23, 2017 at 09:36:14AM -0600, Jens Axboe wrote:
>> On 06/23/2017 08:29 AM, Johannes Thumshirn wrote:
>>> From: Omar Sandoval 
>>>
>>> This test-case performs I/O with fio while doing PCI disable/enable
>>> cycles.
>>>
>>> In the results we don't care for I/O errors but for hiccups in dmesg only.
>>
>> Let's get this in, that would be a very useful test. A few comments -
>> not necessarily on this patch in particular, but for future cleanups
>> and improvements.
>>
>>> +   if _test_dev_is_rotational; then
>>> +   size="32m"
>>> +   else
>>> +   size="1g"
>>> +   fi
>>
>> I introduced this idea in one of my previous patches. I wonder if we
>> should turn that into a helper. Pass in the dev, get returned a
>> suitable fio size, instead of hard coding this in each job that
>> needs it.
> 
> Sure.
> 
>>
>>> +   # start fio job
>>> +   _run_fio --bs=4k --rw=randread --norandommap \
>>> +   --name=reads --filename="$TEST_DEV" --size="$size" \
>>> +   --numjobs=8 --direct=1 2>/dev/null &
>>
>> I don't believe we check for fio errors right now, but we probably
>> should in the future. So I think you'd want to add something ala:
>>
>> --ignore_error=EIO,ENXIO,ENODEV
>>
>> to your options to make it explicit that you don't care about IO
>> errors for this test.
> 
> Oh nice, didn't know about the option. Btw, as we currently all have
> arbitrary values for the numjobs parameter, how about a wrapper over getconf
> _NPROCESSORS_ONLN?

Yes that's a good idea, then we can at least size the jobs based on
how many cores we have.

-- 
Jens Axboe



Re: [PATCH 7/9] xfs: add support for passing in write hints for buffered writes

2017-06-26 Thread Jens Axboe
On 06/26/2017 03:56 AM, Christoph Hellwig wrote:
>> +ioend->io_bio->bi_opf |= write_hint_to_opf(inode_write_hint(ioend->io_inode));
> 
> Too long line again.
> 
> I think simply adding an inode_hint_to_opf() flag would make the various
> opencoded versions of the above call a little easier to read.

Sure, I'll do that.

-- 
Jens Axboe



Re: [PATCH 4/9] fs: add O_DIRECT support for sending down write life time hints

2017-06-26 Thread Jens Axboe
On 06/26/2017 03:55 AM, Christoph Hellwig wrote:
>> @@ -385,6 +385,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
>>  else
>>  bio->bi_end_io = dio_bio_end_io;
>>  
>> +bio->bi_opf |= write_hint_to_opf(iocb_write_hint(dio->iocb));
>> +
>>  sdio->bio = bio;
>>  sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
>>  }
>> diff --git a/fs/iomap.c b/fs/iomap.c
>> index c71a64b97fba..9c9f8406018b 100644
>> --- a/fs/iomap.c
>> +++ b/fs/iomap.c
>> @@ -804,6 +804,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
>>  
>>  if (dio->flags & IOMAP_DIO_WRITE) {
>>  bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
>> +bio->bi_opf |= write_hint_to_opf(inode_write_hint(inode));
> 
> This should be iocb based as well, otherwise you'll miss out on
> the per-file hints.

Good catch, fixed up.

> Also please don't add a > 80 char line.  (And yes, I should fix the one
> just above, too)

I fixed up them both...

-- 
Jens Axboe



Re: [PATCH blktests 2/2] block/011: Perform PCI reset while doing IO

2017-06-26 Thread Johannes Thumshirn
On Fri, Jun 23, 2017 at 09:36:14AM -0600, Jens Axboe wrote:
> On 06/23/2017 08:29 AM, Johannes Thumshirn wrote:
> > From: Omar Sandoval 
> > 
> > This test-case performs I/O with fio while doing PCI disable/enable
> > cycles.
> > 
> > In the results we don't care for I/O errors but for hiccups in dmesg only.
> 
> Let's get this in, that would be a very useful test. A few comments -
> not necessarily on this patch in particular, but for future cleanups
> and improvements.
> 
> > +   if _test_dev_is_rotational; then
> > +   size="32m"
> > +   else
> > +   size="1g"
> > +   fi
> 
> I introduced this idea in one of my previous patches. I wonder if we
> should turn that into a helper. Pass in the dev, get returned a
> suitable fio size, instead of hard coding this in each job that
> needs it.

Sure.

> 
> > +   # start fio job
> > +   _run_fio --bs=4k --rw=randread --norandommap \
> > +   --name=reads --filename="$TEST_DEV" --size="$size" \
> > +   --numjobs=8 --direct=1 2>/dev/null &
> 
> I don't believe we check for fio errors right now, but we probably
> should in the future. So I think you'd want to add something ala:
> 
> --ignore_error=EIO,ENXIO,ENODEV
> 
> to your options to make it explicit that you don't care about IO
> errors for this test.

Oh nice, didn't know about the option. Btw, as we currently all have
arbitrary values for the numjobs parameter, how about a wrapper over getconf
_NPROCESSORS_ONLN?

-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 9/9] nvme: add support for streams and directives

2017-06-26 Thread Jens Axboe
On 06/26/2017 03:59 AM, Christoph Hellwig wrote:
> Looks mostly good,
> 
> but two nit-picks:
> 
>  - can we keep a module option to disable streams, or in fact for
>now maybe to explicitly enable it?  I expect this to be interesting
>at least for the first devices that implement it.  Also given that
>it needs to be explicitly enabled I would expect some overhead of
>just enabling it when never used

Fine with me, I can add the 'streams' parameter back, but just default
it to false.

>  - do we even need the < 4 streams fallback now that they are global
>    instead of per-ns, instead of just disabling the feature for now?

Maybe the device only supports 2? or 3?

-- 
Jens Axboe



Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints

2017-06-26 Thread Jens Axboe
On 06/26/2017 03:51 AM, Christoph Hellwig wrote:
> Please document the userspace API (added linux-api and linux-man
> to CC for sugestions), especially including the odd effects of the
> per-inode settings.

Of course, I'll send in a diff for the fcntl(2) man page.

> Also I would highly recommend to use different fcntl commands
> for the file vs inode hints to avoid any strange behavior.

OK, used to have that too... I can add specific _FILE versions.

-- 
Jens Axboe



Re: [PATCH v7 21/22] xfs: minimal conversion to errseq_t writeback error reporting

2017-06-26 Thread Carlos Maiolino
On Fri, Jun 16, 2017 at 03:34:26PM -0400, Jeff Layton wrote:
> Just check and advance the data errseq_t in struct file before
> returning from fsync on normal files. Internal filemap_*
> callers are left as-is.
> 

Looks good.

Reviewed-by: Carlos Maiolino 

> Signed-off-by: Jeff Layton 
> ---
>  fs/xfs/xfs_file.c | 15 +++
>  1 file changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 5fb5a0958a14..bc3b1575e8db 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -134,7 +134,7 @@ xfs_file_fsync(
>   struct inode*inode = file->f_mapping->host;
>   struct xfs_inode*ip = XFS_I(inode);
>   struct xfs_mount*mp = ip->i_mount;
> - int error = 0;
> + int error = 0, err2;
>   int log_flushed = 0;
>   xfs_lsn_t   lsn = 0;
>  
> @@ -142,10 +142,12 @@ xfs_file_fsync(
>  
>   error = filemap_write_and_wait_range(inode->i_mapping, start, end);
>   if (error)
> - return error;
> + goto out;
>  
> - if (XFS_FORCED_SHUTDOWN(mp))
> - return -EIO;
> + if (XFS_FORCED_SHUTDOWN(mp)) {
> + error = -EIO;
> + goto out;
> + }
>  
>   xfs_iflags_clear(ip, XFS_ITRUNCATED);
>  
> @@ -197,6 +199,11 @@ xfs_file_fsync(
>   mp->m_logdev_targp == mp->m_ddev_targp)
>   xfs_blkdev_issue_flush(mp->m_ddev_targp);
>  
> +out:
> + err2 = filemap_report_wb_err(file);
> + if (!error)
> + error = err2;
> +
>   return error;
>  }
>  
> -- 
> 2.13.0
> 

-- 
Carlos


[PATCH v2 17/51] bvec_iter: introduce BVEC_ITER_ALL_INIT

2017-06-26 Thread Ming Lei
Introduce BVEC_ITER_ALL_INIT for iterating one bio
from start to end.

Signed-off-by: Ming Lei 
---
 include/linux/bvec.h | 9 +
 1 file changed, 9 insertions(+)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 89b65b82d98f..162ca7caf510 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -94,4 +94,13 @@ static inline void bvec_iter_advance(const struct bio_vec *bv,
((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \
 bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len))
 
+/* for iterating one bio from start to end */
+#define BVEC_ITER_ALL_INIT (struct bvec_iter)  \
+{  \
+   .bi_sector  = 0,\
+   .bi_size= UINT_MAX, \
+   .bi_idx = 0,\
+   .bi_bvec_done   = 0,\
+}
+
 #endif /* __LINUX_BVEC_ITER_H */
-- 
2.9.4


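A usage sketch (hypothetical caller; for_each_bvec() is the existing
iterator in bvec.h):

/* walk a freshly built bio's bvec table from its start */
static void walk_bio_from_start(struct bio *bio)
{
	struct bvec_iter start = BVEC_ITER_ALL_INIT;
	struct bvec_iter iter;
	struct bio_vec bv;

	/* bound the walk by the payload size instead of UINT_MAX */
	start.bi_size = bio->bi_iter.bi_size;

	for_each_bvec(bv, bio->bi_io_vec, iter, start)
		pr_debug("page %p off %u len %u\n",
			 bv.bv_page, bv.bv_offset, bv.bv_len);
}
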

[PATCH v2 11/51] md: raid1: initialize bvec table via bio_add_page()

2017-06-26 Thread Ming Lei
We will support multipage bvecs soon, so initialize the bvec
table using the standard way (bio_add_page()) instead of writing
the table directly. Otherwise it won't work any more once
multipage bvecs are enabled.

Cc: Shaohua Li 
Cc: linux-r...@vger.kernel.org
Signed-off-by: Ming Lei 
---
 drivers/md/raid1.c | 27 ++-
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 3febfc8391fb..835c42396861 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2086,10 +2086,8 @@ static void process_checks(struct r1bio *r1_bio)
/* Fix variable parts of all bios */
vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9);
for (i = 0; i < conf->raid_disks * 2; i++) {
-   int j;
int size;
blk_status_t status;
-   struct bio_vec *bi;
struct bio *b = r1_bio->bios[i];
struct resync_pages *rp = get_resync_pages(b);
if (b->bi_end_io != end_sync_read)
@@ -2098,8 +2096,6 @@ static void process_checks(struct r1bio *r1_bio)
status = b->bi_status;
bio_reset(b);
b->bi_status = status;
-   b->bi_vcnt = vcnt;
-   b->bi_iter.bi_size = r1_bio->sectors << 9;
b->bi_iter.bi_sector = r1_bio->sector +
conf->mirrors[i].rdev->data_offset;
b->bi_bdev = conf->mirrors[i].rdev->bdev;
@@ -2107,15 +2103,20 @@ static void process_checks(struct r1bio *r1_bio)
rp->raid_bio = r1_bio;
b->bi_private = rp;
 
-   size = b->bi_iter.bi_size;
-   bio_for_each_segment_all(bi, b, j) {
-   bi->bv_offset = 0;
-   if (size > PAGE_SIZE)
-   bi->bv_len = PAGE_SIZE;
-   else
-   bi->bv_len = size;
-   size -= PAGE_SIZE;
-   }
+   /* initialize bvec table again */
+   rp->idx = 0;
+   size = r1_bio->sectors << 9;
+   do {
+   struct page *page = resync_fetch_page(rp, rp->idx++);
+   int len = min_t(int, size, PAGE_SIZE);
+
+   /*
+* won't fail because the vec table is big
+* enough to hold all these pages
+*/
+   bio_add_page(b, page, len, 0);
+   size -= len;
+   } while (rp->idx < RESYNC_PAGES && size > 0);
}
for (primary = 0; primary < conf->raid_disks * 2; primary++)
if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
-- 
2.9.4



[PATCH v2 15/51] btrfs: comment on direct access bvec table

2017-06-26 Thread Ming Lei
Cc: Chris Mason 
Cc: Josef Bacik 
Cc: David Sterba 
Cc: linux-bt...@vger.kernel.org
Signed-off-by: Ming Lei 
---
 fs/btrfs/compression.c |  4 
 fs/btrfs/inode.c   | 12 
 2 files changed, 16 insertions(+)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 2c0b7b57fcd5..5972f74354ca 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -541,6 +541,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
/* we need the actual starting offset of this extent in the file */
read_lock(&em_tree->lock);
+   /*
+* It is still safe to retrieve the 1st page of the bio
+* in this way after supporting multipage bvec.
+*/
em = lookup_extent_mapping(em_tree,
   page_offset(bio->bi_io_vec->bv_page),
   PAGE_SIZE);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4ab02b34f029..7e725d84917b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8055,6 +8055,12 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
if (bio->bi_status)
goto end;
 
+   /*
+* WARNING:
+*
+* With multipage bvec, the following way of direct access to
+* bvec table is only safe if the bio includes single page.
+*/
ASSERT(bio->bi_vcnt == 1);
io_tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
@@ -8146,6 +8152,12 @@ static void btrfs_retry_endio(struct bio *bio)
 
uptodate = 1;
 
+   /*
+* WARNING:
+*
+* With multipage bvec, the following way of direct access to
+* bvec table is only safe if the bio includes single page.
+*/
ASSERT(bio->bi_vcnt == 1);
ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
 
-- 
2.9.4



[PATCH v2 12/51] md: raid10: avoid to access bvec table directly

2017-06-26 Thread Ming Lei
Inside sync_request_write(), .bi_vcnt is written after this bio
is reset, which won't work any more once multipage bvecs are
enabled.

So reset_bvec_table() is introduced to re-add these pages to the
bio, so that .bi_vcnt no longer needs to be touched.

Cc: Shaohua Li 
Cc: linux-r...@vger.kernel.org
Signed-off-by: Ming Lei 
---
 drivers/md/raid10.c | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5026e7ad51d3..2fca1fe67092 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1995,6 +1995,24 @@ static void end_sync_write(struct bio *bio)
end_sync_request(r10_bio);
 }
 
+/* called after bio_reset() */
+static void reset_bvec_table(struct bio *bio, struct resync_pages *rp, int size)
+{
+   /* initialize bvec table again */
+   rp->idx = 0;
+   do {
+   struct page *page = resync_fetch_page(rp, rp->idx++);
+   int len = min_t(int, size, PAGE_SIZE);
+
+   /*
+* won't fail because the vec table is big
+* enough to hold all these pages
+*/
+   bio_add_page(bio, page, len, 0);
+   size -= len;
+   } while (rp->idx < RESYNC_PAGES && size > 0);
+}
+
 /*
  * Note: sync and recover and handled very differently for raid10
  * This code is for resync.
@@ -2087,8 +2105,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
rp = get_resync_pages(tbio);
bio_reset(tbio);
 
-   tbio->bi_vcnt = vcnt;
-   tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
+   reset_bvec_table(tbio, rp, fbio->bi_iter.bi_size);
+
rp->raid_bio = r10_bio;
tbio->bi_private = rp;
tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
-- 
2.9.4



[PATCH v2 14/51] btrfs: avoid to access bvec table directly for a cloned bio

2017-06-26 Thread Ming Lei
Commit 17347cec15f919901c90 ("Btrfs: change how we iterate bios in endio")
mentioned that for dio the submitted bio may be fast cloned; we
can't access the bvec table directly for a cloned bio, so use
bio_get_first_bvec() to retrieve the 1st bvec.

Cc: Chris Mason 
Cc: Josef Bacik 
Cc: David Sterba 
Cc: linux-bt...@vger.kernel.org
Cc: Liu Bo 
Signed-off-by: Ming Lei 
---
 fs/btrfs/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 06dea7c89bbd..4ab02b34f029 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7993,6 +7993,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
int read_mode = 0;
int segs;
int ret;
+   struct bio_vec bvec;
 
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
 
@@ -8008,8 +8009,9 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
}
 
segs = bio_segments(failed_bio);
+   bio_get_first_bvec(failed_bio, &bvec);
if (segs > 1 ||
-   (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode)))
+   (bvec.bv_len > btrfs_inode_sectorsize(inode)))
read_mode |= REQ_FAILFAST_DEV;
 
isector = start - btrfs_io_bio(failed_bio)->logical;
-- 
2.9.4



[PATCH v2 19/51] block: comments on bio_for_each_segment[_all]

2017-06-26 Thread Ming Lei
This patch clarifies the fact that even though both
bio_for_each_segment() and bio_for_each_segment_all()
are named _segment/_segment_all, they still return
one page per vector, instead of a real segment (multipage bvec).

With the coming multipage bvecs, both helpers
are capable of returning a real segment (multipage bvec),
but the callers (users) of the two helpers may not be
capable of handling multipage bvecs or real
segments, so we keep the interfaces of the helpers
unchanged. New helpers for returning multipage bvecs (real segments)
will be introduced later.

Signed-off-by: Ming Lei 
---
 include/linux/bio.h | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4907bea03908..d425be4d1ced 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -155,7 +155,10 @@ static inline void *bio_data(struct bio *bio)
 
 /*
  * drivers should _never_ use the all version - the bio may have been split
- * before it got to the driver and the driver won't own all of it
+ * before it got to the driver and the driver won't own all of it.
+ *
+ * Even though the helper is named as _segment_all, it still returns
+ * pages one by one instead of real segments.
  */
 #define bio_for_each_segment_all(bvl, bio, i)  \
for (i = 0, bvl = (bio)->bi_io_vec; i < (bio)->bi_vcnt; i++, bvl++)
@@ -177,6 +180,10 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
((bvl = bio_iter_iovec((bio), (iter))), 1); \
 bio_advance_iter((bio), &(iter), (bvl).bv_len))
 
+/*
+ * Even though the helper is named as _segment, it still returns
+ * pages one by one instead of real segments.
+ */
 #define bio_for_each_segment(bvl, bio, iter)   \
__bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter)
 
-- 
2.9.4



[PATCH v2 20/51] block: introduce multipage/single page bvec helpers

2017-06-26 Thread Ming Lei
This patch introduces helpers which are suffixed with _mp
and _sp for the multipage bvec/segment support.

The helpers with the _mp suffix are the interfaces for treating
one bvec/segment as a real multipage one; for example, .bv_len
is the total length of the multipage segment.

The helpers with the _sp suffix are interfaces for supporting
the current bvec iterator, which is treated as singlepage only
by drivers, fs, dm and so on. These _sp helpers are introduced
to build singlepage bvecs in flight, so users of the bio/bvec
iterator keep working and need no changes even though
we store multipage bvecs in the table.

Signed-off-by: Ming Lei 
---
 include/linux/bvec.h | 56 +---
 1 file changed, 53 insertions(+), 3 deletions(-)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 162ca7caf510..f52587e283d4 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -24,6 +24,42 @@
 #include 
 
 /*
+ * What is multipage bvecs(segment)?
+ *
+ * - bvec stored in bio->bi_io_vec is always multipage style vector
+ *
+ * - bvec(struct bio_vec) represents one physically contiguous I/O
+ *   buffer; now the buffer may include more than one page since
+ *   multipage(mp) bvecs are supported, and all the pages represented
+ *   by one bvec are physically contiguous. Before mp support, at
+ *   most one page could be included in one bvec; we call that a
+ *   singlepage(sp) bvec.
+ *
+ * - .bv_page of the bvec represents the 1st page in the mp segment
+ *
+ * - .bv_offset of the bvec represents offset of the buffer in the bvec
+ *
+ * The effect on the current drivers/filesystem/dm/bcache/...:
+ *
+ * - almost everyone supposes that one bvec only includes one single
+ *   page, so we keep the sp interface not changed, for example,
+ *   bio_for_each_segment() still returns bvec with single page
+ *
+ * - bio_for_each_segment_all() will be changed to return singlepage
+ *   bvec too
+ *
+ * - during iterating, iterator variable(struct bvec_iter) is always
+ *   updated in multipage bvec style and that means bvec_iter_advance()
+ *   is kept not changed
+ *
+ * - returned(copied) singlepage bvec is generated in flight by bvec
+ *   helpers from the stored mp bvec
+ *
+ * - In case that some components(such as iov_iter) need to support mp
+ *   segment, we introduce new helpers(suffixed with _mp) for them.
+ */
+
+/*
  * was unsigned short, but we might as well be ready for > 64kB I/O pages
  */
 struct bio_vec {
@@ -49,16 +85,30 @@ struct bvec_iter {
  */
 #define __bvec_iter_bvec(bvec, iter)   (&(bvec)[(iter).bi_idx])
 
-#define bvec_iter_page(bvec, iter) \
+#define bvec_iter_page_mp(bvec, iter)  \
(__bvec_iter_bvec((bvec), (iter))->bv_page)
 
-#define bvec_iter_len(bvec, iter)  \
+#define bvec_iter_len_mp(bvec, iter)   \
min((iter).bi_size, \
__bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)
 
-#define bvec_iter_offset(bvec, iter)   \
+#define bvec_iter_offset_mp(bvec, iter)\
(__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)
 
+/*
+ * <page, offset, len> of a singlepage(sp) segment.
+ *
+ * These helpers will be implemented for building sp bvecs in flight.
+ */
+#define bvec_iter_offset_sp(bvec, iter)bvec_iter_offset_mp((bvec), 
(iter))
+#define bvec_iter_len_sp(bvec, iter)   bvec_iter_len_mp((bvec), (iter))
+#define bvec_iter_page_sp(bvec, iter)  bvec_iter_page_mp((bvec), (iter))
+
+/* current interfaces support sp style at default */
+#define bvec_iter_page(bvec, iter) bvec_iter_page_sp((bvec), (iter))
+#define bvec_iter_len(bvec, iter)  bvec_iter_len_sp((bvec), (iter))
+#define bvec_iter_offset(bvec, iter)   bvec_iter_offset_sp((bvec), (iter))
+
 #define bvec_iter_bvec(bvec, iter) \
 ((struct bio_vec) {\
.bv_page= bvec_iter_page((bvec), (iter)),   \
-- 
2.9.4



[PATCH v2 23/51] blk-merge: compute bio->bi_seg_front_size efficiently

2017-06-26 Thread Ming Lei
It is enough to check and compute bio->bi_seg_front_size just
after the 1st segment is found, but the current code checks that
for each bvec, which is inefficient.

This patch follows the approach in __blk_recalc_rq_segments()
for computing bio->bi_seg_front_size; it is more efficient,
and the code becomes more readable too.

Signed-off-by: Ming Lei 
---
 block/blk-merge.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 5df13041b851..821b9c206308 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -145,22 +145,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
bvprvp = &bv;
sectors += bv.bv_len >> 9;
 
-   if (nsegs == 1 && seg_size > front_seg_size)
-   front_seg_size = seg_size;
continue;
}
 new_segment:
if (nsegs == queue_max_segments(q))
goto split;
 
+   if (nsegs == 1 && seg_size > front_seg_size)
+   front_seg_size = seg_size;
+
nsegs++;
bvprv = bv;
bvprvp = &bv;
seg_size = bv.bv_len;
sectors += bv.bv_len >> 9;
 
-   if (nsegs == 1 && seg_size > front_seg_size)
-   front_seg_size = seg_size;
}
 
do_split = false;
@@ -173,6 +172,8 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
bio = new;
}
 
+   if (nsegs == 1 && seg_size > front_seg_size)
+   front_seg_size = seg_size;
bio->bi_seg_front_size = front_seg_size;
if (seg_size > bio->bi_seg_back_size)
bio->bi_seg_back_size = seg_size;
-- 
2.9.4



[PATCH v2 21/51] block: implement sp version of bvec iterator helpers

2017-06-26 Thread Ming Lei
This patch implements the singlepage version of the following
3 helpers:
- bvec_iter_offset_sp()
- bvec_iter_len_sp()
- bvec_iter_page_sp()

so that one multipage bvec can be split into singlepage
bvecs, keeping users of the current bvec iterator happy.

Signed-off-by: Ming Lei 
---
 include/linux/bvec.h | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index f52587e283d4..61632e9db3b8 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -22,6 +22,7 @@
 
 #include 
 #include 
+#include 
 
 /*
  * What is multipage bvecs(segment)?
@@ -95,14 +96,25 @@ struct bvec_iter {
 #define bvec_iter_offset_mp(bvec, iter)\
(__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)
 
+#define bvec_iter_page_idx_mp(bvec, iter)  \
+   (bvec_iter_offset_mp((bvec), (iter)) / PAGE_SIZE)
+
+
 /*
  * <page, offset, len> of a singlepage(sp) segment.
  *
  * These helpers will be implemented for building sp bvecs in flight.
  */
-#define bvec_iter_offset_sp(bvec, iter)	bvec_iter_offset_mp((bvec), (iter))
-#define bvec_iter_len_sp(bvec, iter)   bvec_iter_len_mp((bvec), (iter))
-#define bvec_iter_page_sp(bvec, iter)  bvec_iter_page_mp((bvec), (iter))
+#define bvec_iter_offset_sp(bvec, iter)				\
+   (bvec_iter_offset_mp((bvec), (iter)) % PAGE_SIZE)
+
+#define bvec_iter_len_sp(bvec, iter)   \
+   min_t(unsigned, bvec_iter_len_mp((bvec), (iter)),   \
+   (PAGE_SIZE - (bvec_iter_offset_sp((bvec), (iter)))))
+
+#define bvec_iter_page_sp(bvec, iter)  \
+   nth_page(bvec_iter_page_mp((bvec), (iter)), \
+bvec_iter_page_idx_mp((bvec), (iter)))
 
 /* current interfaces support sp style at default */
 #define bvec_iter_page(bvec, iter) bvec_iter_page_sp((bvec), (iter))
-- 
2.9.4


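To make the arithmetic above concrete, here is a standalone toy that
slices one multipage bvec into the singlepage views the _sp helpers
produce (PAGE_SIZE hardcoded to 4096 just for the example):

#include <stdio.h>

#define PAGE_SIZE 4096u
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	/* one mp bvec: starts 1024 bytes into its 1st page, 10240 long */
	unsigned bv_offset = 1024, bv_len = 10240;
	unsigned done = 0;	/* models iter.bi_bvec_done */

	while (done < bv_len) {
		unsigned off_mp = bv_offset + done;
		unsigned page = off_mp / PAGE_SIZE;	/* page_idx_mp */
		unsigned off_sp = off_mp % PAGE_SIZE;	/* offset_sp */
		unsigned len_sp = MIN(bv_len - done, PAGE_SIZE - off_sp);

		printf("page +%u, offset %u, len %u\n", page, off_sp, len_sp);
		done += len_sp;
	}
	return 0;
}

It prints three sp bvecs, (+0, 1024, 3072), (+1, 0, 4096) and
(+2, 0, 3072), which is exactly what the % PAGE_SIZE, min_t() and
nth_page() combination yields.
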

[PATCH v2 24/51] block: blk-merge: try to make front segments in full size

2017-06-26 Thread Ming Lei
When merging one bvec into a segment, if the bvec is too big
to merge, the current policy is to move the whole bvec into a
new segment.

This patch changes the policy to try to maximize the size of
front segments: in the situation above, part of the bvec
is merged into the current segment, and the remainder is put
into the next segment.

This patch prepares for supporting multipage bvecs, because
this case can become quite common, and we should try
to make front segments full size.

Signed-off-by: Ming Lei 
---
 block/blk-merge.c | 54 +-
 1 file changed, 49 insertions(+), 5 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 821b9c206308..bf7a0fa0199f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -108,6 +108,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
bool do_split = true;
struct bio *new = NULL;
const unsigned max_sectors = get_max_io_size(q, bio);
+   unsigned advance = 0;
 
bio_for_each_segment(bv, bio, iter) {
/*
@@ -133,12 +134,32 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
}
 
if (bvprvp && blk_queue_cluster(q)) {
-   if (seg_size + bv.bv_len > queue_max_segment_size(q))
-   goto new_segment;
if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv))
goto new_segment;
if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv))
goto new_segment;
+   if (seg_size + bv.bv_len > queue_max_segment_size(q)) {
+   /*
+* One assumption is that the initial value of
+* @seg_size (equal to bv.bv_len) won't be
+* bigger than the max segment size, but this
+* becomes false once multipage bvecs arrive.
+*/
+   advance = queue_max_segment_size(q) - seg_size;
+
+   if (advance > 0) {
+   seg_size += advance;
+   sectors += advance >> 9;
+   bv.bv_len -= advance;
+   bv.bv_offset += advance;
+   }
+
+   /*
+* Still need to put remainder of current
+* bvec into a new segment.
+*/
+   goto new_segment;
+   }
 
seg_size += bv.bv_len;
bvprv = bv;
@@ -160,6 +181,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
seg_size = bv.bv_len;
sectors += bv.bv_len >> 9;
 
+   /* restore the bvec for iterator */
+   if (advance) {
+   bv.bv_len += advance;
+   bv.bv_offset -= advance;
+   advance = 0;
+   }
}
 
do_split = false;
@@ -360,16 +387,29 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
 {
 
int nbytes = bvec->bv_len;
+   unsigned advance = 0;
 
if (*sg && *cluster) {
-   if ((*sg)->length + nbytes > queue_max_segment_size(q))
-   goto new_segment;
-
if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
goto new_segment;
if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
goto new_segment;
 
+   /*
+* try our best to merge part of the bvec into the
+* previous segment, following the same policy as
+* blk_bio_segment_split()
+*/
+   if ((*sg)->length + nbytes > queue_max_segment_size(q)) {
+   advance = queue_max_segment_size(q) - (*sg)->length;
+   if (advance) {
+   (*sg)->length += advance;
+   bvec->bv_offset += advance;
+   bvec->bv_len -= advance;
+   }
+   goto new_segment;
+   }
+
(*sg)->length += nbytes;
} else {
 new_segment:
@@ -392,6 +432,10 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
 
sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
(*nsegs)++;
+
+   /* for making iterator happy */
+   bvec->bv_offset -= advance;
+   bvec->bv_len += advance;
}
*bvprv = *bvec;
 }
-- 
2.9.4


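A quick worked example of the new policy (toy arithmetic only; a max
segment size of 4096 is assumed):

#include <stdio.h>

int main(void)
{
	unsigned max_seg = 4096, len = 10240, seg = 0;

	/*
	 * Old policy: a 10240-byte bvec that doesn't fit is moved
	 * whole into a new segment, leaving the front one short.
	 * New policy: fill the front segments up to max_seg first,
	 * then carry the remainder into the next segment.
	 */
	while (len) {
		unsigned chunk = len < max_seg ? len : max_seg;

		printf("segment %u: %u bytes\n", seg++, chunk);
		len -= chunk;
	}
	return 0;
}

This yields 4096 + 4096 + 2048 instead of a short front segment.
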

[PATCH v2 26/51] block: use bio_for_each_segment_mp() to compute segments count

2017-06-26 Thread Ming Lei
Firstly, it is more efficient to use bio_for_each_segment_mp()
in both blk_bio_segment_split() and __blk_recalc_rq_segments()
to compute how many segments there are in the bio.

Secondly, once bio_for_each_segment_mp() is used, a bvec
may need to be split because its length can be very long
and exceed the max segment size, so we have to support splitting
one bvec into several segments.

Thirdly, while splitting an mp bvec into segments, the max segment
number may be reached; the bio then needs to be split when
this happens.

Signed-off-by: Ming Lei 
---
 block/blk-merge.c | 97 ---
 1 file changed, 79 insertions(+), 18 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index c6fcc49b9aea..8d2c2d763456 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -96,6 +96,62 @@ static inline unsigned get_max_io_size(struct request_queue *q,
return sectors;
 }
 
+/*
+ * Split the bvec @bv into segments, and update all kinds of
+ * variables.
+ */
+static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
+   unsigned *nsegs, unsigned *last_seg_size,
+   unsigned *front_seg_size, unsigned *sectors)
+{
+   bool need_split = false;
+   unsigned len = bv->bv_len;
+   unsigned total_len = 0;
+   unsigned new_nsegs = 0, seg_size = 0;
+
+   if ((*nsegs >= queue_max_segments(q)) || !len)
+   return need_split;
+
+   /*
+* Multipage bvec may be too big to hold in one segment,
+* so the current bvec has to be splitted as multiple
+* segments.
+*/
+   while (new_nsegs + *nsegs < queue_max_segments(q)) {
+   seg_size = min(queue_max_segment_size(q), len);
+
+   new_nsegs++;
+   total_len += seg_size;
+   len -= seg_size;
+
+   if ((queue_virt_boundary(q) && ((bv->bv_offset +
+   total_len) & queue_virt_boundary(q))) || !len)
+   break;
+   }
+
+   /* split in the middle of the bvec */
+   if (len)
+   need_split = true;
+
+   /* update front segment size */
+   if (!*nsegs) {
+   unsigned first_seg_size = seg_size;
+
+   if (new_nsegs > 1)
+   first_seg_size = queue_max_segment_size(q);
+   if (*front_seg_size < first_seg_size)
+   *front_seg_size = first_seg_size;
+   }
+
+   /* update other variables */
+   *last_seg_size = seg_size;
+   *nsegs += new_nsegs;
+   if (sectors)
+   *sectors += total_len >> 9;
+
+   return need_split;
+}
+
 static struct bio *blk_bio_segment_split(struct request_queue *q,
 struct bio *bio,
 struct bio_set *bs,
@@ -110,7 +166,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
const unsigned max_sectors = get_max_io_size(q, bio);
unsigned advance = 0;
 
-   bio_for_each_segment(bv, bio, iter) {
+   bio_for_each_segment_mp(bv, bio, iter) {
/*
 * If the queue doesn't support SG gaps and adding this
 * offset would create a gap, disallow it.
@@ -125,8 +181,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 */
if (nsegs < queue_max_segments(q) &&
sectors < max_sectors) {
-   nsegs++;
-   sectors = max_sectors;
+   /* split in the middle of bvec */
+   bv.bv_len = (max_sectors - sectors) << 9;
+   bvec_split_segs(q, &bv, &nsegs,
+   &seg_size,
+   &front_seg_size,
+   &sectors);
}
goto split;
}
@@ -138,10 +198,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
goto new_segment;
if (seg_size + bv.bv_len > queue_max_segment_size(q)) {
/*
-* One assumption is that the initial value of
-* @seg_size (equal to bv.bv_len) won't be
-* bigger than the max segment size, but this
-* becomes false once multipage bvecs arrive.
+* The initial value of @seg_size won't be
+* bigger than max segment size, because we
+* split the bvec via bvec_split_segs().
 */
advance = queue_max_segment_size(q) - seg_size;
 
@@ -173,11 +232,12 @@ 

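A toy distillation of the two bounds bvec_split_segs() enforces (segment
size and segment count; the limits below are assumptions for the
example):

#include <stdio.h>

int main(void)
{
	unsigned max_seg_size = 4096, max_segments = 4;
	unsigned nsegs = 2;	/* segments already used by the bio */
	unsigned len = 20480;	/* one huge mp bvec */

	/* mirrors the loop in bvec_split_segs(): stop on either bound */
	while (len && nsegs < max_segments) {
		unsigned chunk = len < max_seg_size ? len : max_seg_size;

		printf("segment %u: %u bytes\n", nsegs++, chunk);
		len -= chunk;
	}
	if (len)
		printf("%u bytes left -> split the bio here\n", len);
	return 0;
}
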
[PATCH v2 32/51] btrfs: use bvec_get_last_page to get bio's last page

2017-06-26 Thread Ming Lei
Preparing for supporting multipage bvec.

Cc: Chris Mason 
Cc: Josef Bacik 
Cc: David Sterba 
Cc: linux-bt...@vger.kernel.org
Signed-off-by: Ming Lei 
---
 fs/btrfs/compression.c | 5 -
 fs/btrfs/extent_io.c   | 8 ++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 5972f74354ca..fdab5b821aa8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -391,8 +391,11 @@ blk_status_t btrfs_submit_compressed_write(struct inode 
*inode, u64 start,
 static u64 bio_end_offset(struct bio *bio)
 {
struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1];
+   struct bio_vec bv;
 
-   return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
+   bvec_get_last_page(last, &bv);
+
+   return page_offset(bv.bv_page) + bv.bv_len + bv.bv_offset;
 }
 
 static noinline int add_ra_bio_pages(struct inode *inode,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5b453cada1ea..7cc6c8a52e49 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2741,11 +2741,15 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 {
blk_status_t ret = 0;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-   struct page *page = bvec->bv_page;
struct extent_io_tree *tree = bio->bi_private;
+   struct bio_vec bv;
+   struct page *page;
u64 start;
 
-   start = page_offset(page) + bvec->bv_offset;
+   bvec_get_last_page(bvec, &bv);
+   page = bv.bv_page;
+
+   start = page_offset(page) + bv.bv_offset;
 
bio->bi_private = NULL;
bio_get(bio);
-- 
2.9.4



[PATCH v2 33/51] block: deal with dirtying pages for multipage bvec

2017-06-26 Thread Ming Lei
In bio_check_pages_dirty(), bvec->bv_page is used as a flag
for marking whether the page has been dirtied & released, and if
not, it will be dirtied in a deferred workqueue.

With multipage bvecs, we can't do that any more, so change
the logic into checking all pages in one mp bvec, and only
release all these pages if all are dirtied; otherwise dirty
them all in the deferred workqueue.

Signed-off-by: Ming Lei 
---
 block/bio.c | 45 +
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index bf7f25889f6e..22e5deec7ec7 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1641,8 +1641,9 @@ void bio_set_pages_dirty(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
struct page *page = bvec->bv_page;
 
if (page && !PageCompound(page))
@@ -1650,16 +1651,26 @@ void bio_set_pages_dirty(struct bio *bio)
}
 }
 
+static inline void release_mp_bvec_pages(struct bio_vec *bvec)
+{
+   struct bio_vec bv;
+   struct bvec_iter iter;
+
+   bvec_for_each_sp_bvec(bv, bvec, iter)
+   put_page(bv.bv_page);
+}
+
 static void bio_release_pages(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   /* iterate each mp bvec */
+   bio_for_each_segment_all_mp(bvec, bio, i) {
struct page *page = bvec->bv_page;
 
if (page)
-   put_page(page);
+   release_mp_bvec_pages(bvec);
}
 }
 
@@ -1703,20 +1714,38 @@ static void bio_dirty_fn(struct work_struct *work)
}
 }
 
+static inline void check_mp_bvec_pages(struct bio_vec *bvec,
+   int *nr_dirty, int *nr_pages)
+{
+   struct bio_vec bv;
+   struct bvec_iter iter;
+
+   bvec_for_each_sp_bvec(bv, bvec, iter) {
+   struct page *page = bv.bv_page;
+
+   if (PageDirty(page) || PageCompound(page))
+   (*nr_dirty)++;
+   (*nr_pages)++;
+   }
+}
+
 void bio_check_pages_dirty(struct bio *bio)
 {
struct bio_vec *bvec;
int nr_clean_pages = 0;
int i;
 
-   bio_for_each_segment_all(bvec, bio, i) {
-   struct page *page = bvec->bv_page;
+   bio_for_each_segment_all_mp(bvec, bio, i) {
+   int nr_dirty = 0, nr_pages = 0;
+
+   check_mp_bvec_pages(bvec, &nr_dirty, &nr_pages);
 
-   if (PageDirty(page) || PageCompound(page)) {
-   put_page(page);
+   /* release all pages in the mp bvec if all are dirtied */
+   if (nr_dirty == nr_pages) {
+   release_mp_bvec_pages(bvec);
bvec->bv_page = NULL;
} else {
-   nr_clean_pages++;
+   nr_clean_pages += nr_pages;
}
}
 
-- 
2.9.4


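The policy above is all-or-nothing per mp bvec; a toy distillation:

#include <stdio.h>
#include <stdbool.h>

/* decide per mp bvec whether its pages can be released right away
 * (all dirty) or must all go through the deferred workqueue */
static bool can_release(const bool *dirty, int nr_pages)
{
	int i, nr_dirty = 0;

	for (i = 0; i < nr_pages; i++)
		nr_dirty += dirty[i];
	return nr_dirty == nr_pages;	/* mirrors check_mp_bvec_pages() */
}

int main(void)
{
	bool all[3] = { true, true, true };
	bool mixed[3] = { true, false, true };

	printf("all dirty: %s\n", can_release(all, 3) ? "release" : "defer");
	printf("mixed:     %s\n", can_release(mixed, 3) ? "release" : "defer");
	return 0;
}
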

[PATCH v2 36/51] md: raid1: convert to bio_for_each_segment_all_sp()

2017-06-26 Thread Ming Lei
Cc: Shaohua Li 
Cc: linux-r...@vger.kernel.org
Signed-off-by: Ming Lei 
---
 drivers/md/raid1.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 835c42396861..ca4b9ff8d39b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2135,13 +2135,14 @@ static void process_checks(struct r1bio *r1_bio)
struct page **spages = get_resync_pages(sbio)->pages;
struct bio_vec *bi;
int page_len[RESYNC_PAGES] = { 0 };
+   struct bvec_iter_all bia;
 
if (sbio->bi_end_io != end_sync_read)
continue;
/* Now we can 'fixup' the error value */
sbio->bi_status = 0;
 
-   bio_for_each_segment_all(bi, sbio, j)
+   bio_for_each_segment_all_sp(bi, sbio, j, bia)
page_len[j] = bi->bv_len;
 
if (!status) {
-- 
2.9.4



[PATCH v2 37/51] dm-crypt: don't clear bvec->bv_page in crypt_free_buffer_pages()

2017-06-26 Thread Ming Lei
The bio is always freed after running crypt_free_buffer_pages(),
so it isn't necessary to clear bv->bv_page.

Cc: Mike Snitzer 
Cc: dm-de...@redhat.com
Signed-off-by: Ming Lei 
---
 drivers/md/dm-crypt.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index cdf6b1e12460..664ba3504f48 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1450,7 +1450,6 @@ static void crypt_free_buffer_pages(struct crypt_config 
*cc, struct bio *clone)
bio_for_each_segment_all(bv, clone, i) {
BUG_ON(!bv->bv_page);
mempool_free(bv->bv_page, cc->page_pool);
-   bv->bv_page = NULL;
}
 }
 
-- 
2.9.4



[PATCH v2 44/51] gfs2: convert to bio_for_each_segment_all_sp()

2017-06-26 Thread Ming Lei
Cc: Steven Whitehouse 
Cc: Bob Peterson 
Cc: cluster-de...@redhat.com
Signed-off-by: Ming Lei 
---
 fs/gfs2/lops.c| 3 ++-
 fs/gfs2/meta_io.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index d62939f00d53..294f1926d9be 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -206,11 +206,12 @@ static void gfs2_end_log_write(struct bio *bio)
struct bio_vec *bvec;
struct page *page;
int i;
+   struct bvec_iter_all bia;
 
if (bio->bi_status)
fs_err(sdp, "Error %d writing to log\n", bio->bi_status);
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
page = bvec->bv_page;
if (page_has_buffers(page))
gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index fabe1614f879..6879b0103539 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -190,8 +190,9 @@ static void gfs2_meta_read_endio(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
struct page *page = bvec->bv_page;
struct buffer_head *bh = page_buffers(page);
unsigned int len = bvec->bv_len;
-- 
2.9.4



[PATCH v2 46/51] exofs: convert to bio_for_each_segment_all_sp()

2017-06-26 Thread Ming Lei
Cc: Boaz Harrosh 
Signed-off-by: Ming Lei 
---
 fs/exofs/ore.c  | 3 ++-
 fs/exofs/ore_raid.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 8bb72807e70d..38a7d8bfdd4c 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -406,8 +406,9 @@ static void _clear_bio(struct bio *bio)
 {
struct bio_vec *bv;
unsigned i;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bv, bio, i) {
+   bio_for_each_segment_all_sp(bv, bio, i, bia) {
unsigned this_count = bv->bv_len;
 
if (likely(PAGE_SIZE == this_count))
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 27cbdb697649..37c0a9aa2ec2 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -429,6 +429,7 @@ static void _mark_read4write_pages_uptodate(struct 
ore_io_state *ios, int ret)
 {
struct bio_vec *bv;
unsigned i, d;
+   struct bvec_iter_all bia;
 
/* loop on all devices all pages */
for (d = 0; d < ios->numdevs; d++) {
@@ -437,7 +438,7 @@ static void _mark_read4write_pages_uptodate(struct 
ore_io_state *ios, int ret)
if (!bio)
continue;
 
-   bio_for_each_segment_all(bv, bio, i) {
+   bio_for_each_segment_all_sp(bv, bio, i, bia) {
struct page *page = bv->bv_page;
 
SetPageUptodate(page);
-- 
2.9.4



[PATCH v2 41/51] fs/iomap: convert to bio_for_each_segment_all_sp()

2017-06-26 Thread Ming Lei
Signed-off-by: Ming Lei 
---
 fs/iomap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index c71a64b97fba..4319284c1fbd 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -696,8 +696,9 @@ static void iomap_dio_bio_end_io(struct bio *bio)
} else {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bvec, bio, i)
+   bio_for_each_segment_all_sp(bvec, bio, i, bia)
put_page(bvec->bv_page);
bio_put(bio);
}
-- 
2.9.4



[PATCH v2 45/51] f2fs: convert to bio_for_each_segment_all_sp()

2017-06-26 Thread Ming Lei
Cc: Jaegeuk Kim 
Cc: Chao Yu 
Cc: linux-f2fs-de...@lists.sourceforge.net
Signed-off-by: Ming Lei 
---
 fs/f2fs/data.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 622c44a1be78..57d5a2760bf1 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -54,6 +54,7 @@ static void f2fs_read_end_io(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all bia;
 
 #ifdef CONFIG_F2FS_FAULT_INJECTION
/*
@@ -75,7 +76,7 @@ static void f2fs_read_end_io(struct bio *bio)
}
}
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
struct page *page = bvec->bv_page;
 
if (!bio->bi_status) {
@@ -95,8 +96,9 @@ static void f2fs_write_end_io(struct bio *bio)
struct f2fs_sb_info *sbi = bio->bi_private;
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
struct page *page = bvec->bv_page;
enum count_type type = WB_DATA_TYPE(page);
 
@@ -256,6 +258,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io,
struct bio_vec *bvec;
struct page *target;
int i;
+   struct bvec_iter_all bia;
 
if (!io->bio)
return false;
@@ -263,7 +266,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io,
if (!inode && !ino)
return true;
 
-   bio_for_each_segment_all(bvec, io->bio, i) {
+   bio_for_each_segment_all_sp(bvec, io->bio, i, bia) {
 
if (bvec->bv_page->mapping)
target = bvec->bv_page;
-- 
2.9.4



[PATCH v2 35/51] bcache: convert to bio_for_each_segment_all_sp()

2017-06-26 Thread Ming Lei
Cc: linux-bca...@vger.kernel.org
Signed-off-by: Ming Lei 
---
 drivers/md/bcache/btree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 3da595ae565b..74cbb7387dc5 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -422,8 +422,9 @@ static void do_btree_node_write(struct btree *b)
int j;
struct bio_vec *bv;
void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bv, b->bio, j)
+   bio_for_each_segment_all_sp(bv, b->bio, j, bia)
memcpy(page_address(bv->bv_page),
   base + j * PAGE_SIZE, PAGE_SIZE);
 
-- 
2.9.4



[PATCH v2 47/51] fs: crypto: convert to bio_for_each_segment_all_sp()

2017-06-26 Thread Ming Lei
Signed-off-by: Ming Lei 
---
 fs/crypto/bio.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 6181e9526860..d5516ed19166 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -36,8 +36,9 @@ static void completion_pages(struct work_struct *work)
struct bio *bio = ctx->r.bio;
struct bio_vec *bv;
int i;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bv, bio, i) {
+   bio_for_each_segment_all_sp(bv, bio, i, bia) {
struct page *page = bv->bv_page;
int ret = fscrypt_decrypt_page(page->mapping->host, page,
PAGE_SIZE, 0, page->index);
-- 
2.9.4



[PATCH v2 49/51] fs/direct-io: convert to bio_for_each_segment_all_sp()

2017-06-26 Thread Ming Lei
Signed-off-by: Ming Lei 
---
 fs/direct-io.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index c87077d1dc33..a139b3bbad8e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -489,7 +489,9 @@ static blk_status_t dio_bio_complete(struct dio *dio, 
struct bio *bio)
if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
bio_check_pages_dirty(bio); /* transfers ownership */
} else {
-   bio_for_each_segment_all(bvec, bio, i) {
+   struct bvec_iter_all bia;
+
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
struct page *page = bvec->bv_page;
 
if (dio->op == REQ_OP_READ && !PageCompound(page) &&
-- 
2.9.4



[PATCH v2 50/51] block: enable multipage bvecs

2017-06-26 Thread Ming Lei
This patch pulls the trigger for multipage bvecs.

Now any request queue which supports segment clustering
(blk_queue_cluster()) will see multipage bvecs.
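
The merge condition added below can be modelled in userspace like
this (page_to_phys() and friends are mocked, pfn values are made up;
this is a sketch, not the kernel implementation):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

struct page { unsigned long pfn; };     /* toy stand-in for struct page */
struct bio_vec { struct page *bv_page; unsigned bv_len, bv_offset; };

static unsigned long page_to_phys(const struct page *p)
{
        return p->pfn * PAGE_SIZE;
}

static unsigned long bvec_to_phys(const struct bio_vec *bv)
{
        return page_to_phys(bv->bv_page) + bv->bv_offset;
}

/* the new merge test: does the page start where the bvec ends? */
static bool can_merge(const struct bio_vec *bv, const struct page *page,
                      unsigned offset)
{
        return bvec_to_phys(bv) + bv->bv_len == page_to_phys(page) + offset;
}

int main(void)
{
        struct page p0 = { 100 }, p1 = { 101 }, p2 = { 200 };
        struct bio_vec bv = { &p0, PAGE_SIZE, 0 };

        printf("adjacent pfn merges: %d\n", can_merge(&bv, &p1, 0)); /* 1 */
        printf("distant pfn merges:  %d\n", can_merge(&bv, &p2, 0)); /* 0 */
        return 0;
}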

Signed-off-by: Ming Lei 
---
 block/bio.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/block/bio.c b/block/bio.c
index c460888f14b5..436305cde045 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -839,6 +839,11 @@ int bio_add_page(struct bio *bio, struct page *page,
 * a consecutive offset.  Optimize this special case.
 */
if (bio->bi_vcnt > 0) {
+   struct request_queue *q = NULL;
+
+   if (bio->bi_bdev)
+   q = bdev_get_queue(bio->bi_bdev);
+
bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
 
if (page == bv->bv_page &&
@@ -846,6 +851,14 @@ int bio_add_page(struct bio *bio, struct page *page,
bv->bv_len += len;
goto done;
}
+
+   /* disable multipage bvec too if cluster isn't enabled */
+   if (q && blk_queue_cluster(q) &&
+   (bvec_to_phys(bv) + bv->bv_len ==
+page_to_phys(page) + offset)) {
+   bv->bv_len += len;
+   goto done;
+   }
}
 
if (bio->bi_vcnt >= bio->bi_max_vecs)
-- 
2.9.4



[PATCH v2 48/51] fs/btrfs: convert to bio_for_each_segment_all_sp()

2017-06-26 Thread Ming Lei
Cc: Chris Mason 
Cc: Josef Bacik 
Cc: David Sterba 
Cc: linux-bt...@vger.kernel.org
Signed-off-by: Ming Lei 
---
 fs/btrfs/compression.c |  3 ++-
 fs/btrfs/disk-io.c |  3 ++-
 fs/btrfs/extent_io.c   | 12 
 fs/btrfs/inode.c   |  6 --
 fs/btrfs/raid56.c  |  6 --
 5 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index fdab5b821aa8..9d1693ecf468 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -147,12 +147,13 @@ static void end_compressed_bio_read(struct bio *bio)
} else {
int i;
struct bio_vec *bvec;
+   struct bvec_iter_all bia;
 
/*
 * we have verified the checksum already, set page
 * checked so the end_io handlers know about it
 */
-   bio_for_each_segment_all(bvec, cb->orig_bio, i)
+   bio_for_each_segment_all_sp(bvec, cb->orig_bio, i, bia)
SetPageChecked(bvec->bv_page);
 
bio_endio(cb->orig_bio);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f4f54d13db6d..e7efbaa3566c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -963,8 +963,9 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
struct bio_vec *bvec;
struct btrfs_root *root;
int i, ret = 0;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
if (ret)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7cc6c8a52e49..8e51452894ba 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2359,8 +2359,9 @@ static unsigned int get_bio_pages(struct bio *bio)
 {
unsigned i;
struct bio_vec *bv;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bv, bio, i)
+   bio_for_each_segment_all_sp(bv, bio, i, bia)
;
 
return i;
@@ -2468,8 +2469,9 @@ static void end_bio_extent_writepage(struct bio *bio)
u64 start;
u64 end;
int i;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2538,8 +2540,9 @@ static void end_bio_extent_readpage(struct bio *bio)
int mirror;
int ret;
int i;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3695,8 +3698,9 @@ static void end_bio_extent_buffer_writepage(struct bio 
*bio)
struct bio_vec *bvec;
struct extent_buffer *eb;
int i, done;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
struct page *page = bvec->bv_page;
 
eb = (struct extent_buffer *)page->private;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7e725d84917b..61cc6d899ae5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8051,6 +8051,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
struct bio_vec *bvec;
struct extent_io_tree *io_tree, *failure_tree;
int i;
+   struct bvec_iter_all bia;
 
if (bio->bi_status)
goto end;
@@ -8067,7 +8068,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode));
 
done->uptodate = 1;
-   bio_for_each_segment_all(bvec, bio, i)
+   bio_for_each_segment_all_sp(bvec, bio, i, bia)
clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
 io_tree, done->start, bvec->bv_page,
 btrfs_ino(BTRFS_I(inode)), 0);
@@ -8146,6 +8147,7 @@ static void btrfs_retry_endio(struct bio *bio)
int uptodate;
int ret;
int i;
+   struct bvec_iter_all bia;
 
if (bio->bi_status)
goto end;
@@ -8164,7 +8166,7 @@ static void btrfs_retry_endio(struct bio *bio)
io_tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
ret = __readpage_endio_check(inode, io_bio, i, 

[PATCH v2 51/51] block: bio: pass segments to bio if bio_add_page() is bypassed

2017-06-26 Thread Ming Lei
Under some situations, such as block direct I/O, we can't use
bio_add_page() for merging pages into a multipage bvec, so
implement a new function that converts a page array into one
segment array; then these cases can benefit from multipage
bvecs too.
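
A userspace sketch of the pfn-adjacency folding that convert_to_segs()
performs (pfn values are invented; page_cnt[i] counts the extra pages
in segment i beyond the first, as in the patch):

#include <stdio.h>

static unsigned fold_segments(const unsigned long *pfn, unsigned nr_pages,
                              unsigned char *page_cnt)
{
        unsigned nr_seg = 0;

        page_cnt[0] = 0;
        for (unsigned i = 1; i < nr_pages; i++) {
                /* adjacent pfns extend the current segment */
                if (pfn[i] == pfn[i - 1] + 1)
                        page_cnt[nr_seg]++;
                else
                        page_cnt[++nr_seg] = 0;
        }
        return nr_seg + 1;
}

int main(void)
{
        unsigned long pfn[] = { 8, 9, 10, 42, 43 };
        unsigned char cnt[5];

        /* five pages but only two segments: {8,9,10} and {42,43} */
        printf("%u segments\n", fold_segments(pfn, 5, cnt));
        return 0;
}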

Signed-off-by: Ming Lei 
---
 block/bio.c | 54 --
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 436305cde045..e2bcbb842982 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -876,6 +876,41 @@ int bio_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_add_page);
 
+static unsigned convert_to_segs(struct bio* bio, struct page **pages,
+   unsigned char *page_cnt,
+   unsigned nr_pages)
+{
+
+   unsigned idx;
+   unsigned nr_seg = 0;
+   struct request_queue *q = NULL;
+
+   if (bio->bi_bdev)
+   q = bdev_get_queue(bio->bi_bdev);
+
+   if (!q || !blk_queue_cluster(q)) {
+   memset(page_cnt, 0, nr_pages);
+   return nr_pages;
+   }
+
+   page_cnt[nr_seg] = 0;
+   for (idx = 1; idx < nr_pages; idx++) {
+   struct page *pg_s = pages[nr_seg];
+   struct page *pg = pages[idx];
+
+   if (page_to_pfn(pg_s) + page_cnt[nr_seg] + 1 ==
+   page_to_pfn(pg)) {
+   page_cnt[nr_seg]++;
+   } else {
+   page_cnt[++nr_seg] = 0;
+   if (nr_seg < idx)
+   pages[nr_seg] = pg;
+   }
+   }
+
+   return nr_seg + 1;
+}
+
 /**
  * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
  * @bio: bio to add pages to
@@ -895,6 +930,8 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter 
*iter)
struct page **pages = (struct page **)bv;
size_t offset, diff;
ssize_t size;
+   unsigned short nr_segs;
+   unsigned char page_cnt[nr_pages];   /* at most 256 pages */
 
size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
if (unlikely(size <= 0))
@@ -910,13 +947,18 @@ int bio_iov_iter_get_pages(struct bio *bio, struct 
iov_iter *iter)
 * need to be reflected here as well.
 */
bio->bi_iter.bi_size += size;
-   bio->bi_vcnt += nr_pages;
-
diff = (nr_pages * PAGE_SIZE - offset) - size;
-   while (nr_pages--) {
-   bv[nr_pages].bv_page = pages[nr_pages];
-   bv[nr_pages].bv_len = PAGE_SIZE;
-   bv[nr_pages].bv_offset = 0;
+
+   /* convert into segments */
+   nr_segs = convert_to_segs(bio, pages, page_cnt, nr_pages);
+   bio->bi_vcnt += nr_segs;
+
+   while (nr_segs--) {
+   unsigned cnt = (unsigned)page_cnt[nr_segs] + 1;
+
+   bv[nr_segs].bv_page = pages[nr_segs];
+   bv[nr_segs].bv_len = PAGE_SIZE * cnt;
+   bv[nr_segs].bv_offset = 0;
}
 
bv[0].bv_offset += offset;
-- 
2.9.4



[PATCH v2 42/51] ext4: convert to bio_for_each_segment_all_sp()

2017-06-26 Thread Ming Lei
Cc: "Theodore Ts'o" 
Cc: Andreas Dilger 
Cc: linux-e...@vger.kernel.org
Signed-off-by: Ming Lei 
---
 fs/ext4/page-io.c  | 3 ++-
 fs/ext4/readpage.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 930ca0fc9a0f..0e59404fc530 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -62,8 +62,9 @@ static void ext4_finish_bio(struct bio *bio)
 {
int i;
struct bio_vec *bvec;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
struct page *page = bvec->bv_page;
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
struct page *data_page = NULL;
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 40a5497b0f60..6bd33c4c1f7f 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -71,6 +71,7 @@ static void mpage_end_io(struct bio *bio)
 {
struct bio_vec *bv;
int i;
+   struct bvec_iter_all bia;
 
if (ext4_bio_encrypted(bio)) {
if (bio->bi_status) {
@@ -80,7 +81,7 @@ static void mpage_end_io(struct bio *bio)
return;
}
}
-   bio_for_each_segment_all(bv, bio, i) {
+   bio_for_each_segment_all_sp(bv, bio, i, bia) {
struct page *page = bv->bv_page;
 
if (!bio->bi_status) {
-- 
2.9.4



[PATCH v2 43/51] xfs: convert to bio_for_each_segment_all_sp()

2017-06-26 Thread Ming Lei
Cc: "Darrick J. Wong" 
Cc: linux-...@vger.kernel.org
Signed-off-by: Ming Lei 
---
 fs/xfs/xfs_aops.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 11ef989b8629..621efe71c70a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -139,6 +139,7 @@ xfs_destroy_ioend(
for (bio = >io_inline_bio; bio; bio = next) {
struct bio_vec  *bvec;
int i;
+   struct bvec_iter_all bia;
 
/*
 * For the last bio, bi_private points to the ioend, so we
@@ -150,7 +151,7 @@ xfs_destroy_ioend(
next = bio->bi_private;
 
/* walk each page on bio, ending page IO on them */
-   bio_for_each_segment_all(bvec, bio, i)
+   bio_for_each_segment_all_sp(bvec, bio, i, bia)
xfs_finish_page_writeback(inode, bvec, error);
 
bio_put(bio);
-- 
2.9.4



[PATCH v2 39/51] fs/mpage: convert to bio_for_each_segment_all_sp()

2017-06-26 Thread Ming Lei
Signed-off-by: Ming Lei 
---
 fs/mpage.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/mpage.c b/fs/mpage.c
index 0da38f401564..bdb4692ae30c 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -46,9 +46,10 @@
 static void mpage_end_io(struct bio *bio)
 {
struct bio_vec *bv;
+   struct bvec_iter_all bia;
int i;
 
-   bio_for_each_segment_all(bv, bio, i) {
+   bio_for_each_segment_all_sp(bv, bio, i, bia) {
struct page *page = bv->bv_page;
page_endio(page, op_is_write(bio_op(bio)),
blk_status_to_errno(bio->bi_status));
-- 
2.9.4



[PATCH v2 40/51] fs/block: convert to bio_for_each_segment_all_sp()

2017-06-26 Thread Ming Lei
Signed-off-by: Ming Lei 
---
 fs/block_dev.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index a57c26bcb970..d82e43bd8e82 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -209,6 +209,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct 
iov_iter *iter,
ssize_t ret;
blk_qc_t qc;
int i;
+   struct bvec_iter_all bia;
 
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
@@ -253,7 +254,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct 
iov_iter *iter,
}
__set_current_state(TASK_RUNNING);
 
-   bio_for_each_segment_all(bvec, &bio, i) {
+   bio_for_each_segment_all_sp(bvec, &bio, i, bia) {
if (should_dirty && !PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
put_page(bvec->bv_page);
@@ -317,8 +318,9 @@ static void blkdev_bio_end_io(struct bio *bio)
} else {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bvec, bio, i)
+   bio_for_each_segment_all_sp(bvec, bio, i, bia)
put_page(bvec->bv_page);
bio_put(bio);
}
-- 
2.9.4



[PATCH v2 38/51] dm-crypt: convert to bio_for_each_segment_all_sp()

2017-06-26 Thread Ming Lei
Cc: Mike Snitzer 
Cc: dm-de...@redhat.com
Signed-off-by: Ming Lei 
---
 drivers/md/dm-crypt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 664ba3504f48..0f2f44a73a32 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1446,8 +1446,9 @@ static void crypt_free_buffer_pages(struct crypt_config 
*cc, struct bio *clone)
 {
unsigned int i;
struct bio_vec *bv;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bv, clone, i) {
+   bio_for_each_segment_all_sp(bv, clone, i, bia) {
BUG_ON(!bv->bv_page);
mempool_free(bv->bv_page, cc->page_pool);
}
-- 
2.9.4



[PATCH v2 34/51] block: convert to singe/multi page version of bio_for_each_segment_all()

2017-06-26 Thread Ming Lei
Signed-off-by: Ming Lei 
---
 block/bio.c   | 17 +++--
 block/blk-zoned.c |  5 +++--
 block/bounce.c|  6 --
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 22e5deec7ec7..c460888f14b5 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -988,7 +988,7 @@ int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
int i;
struct bio_vec *bv;
 
-   bio_for_each_segment_all(bv, bio, i) {
+   bio_for_each_segment_all_mp(bv, bio, i) {
bv->bv_page = alloc_page(gfp_mask);
if (!bv->bv_page) {
while (--bv >= bio->bi_io_vec)
@@ -1089,8 +1089,9 @@ static int bio_copy_from_iter(struct bio *bio, struct 
iov_iter iter)
 {
int i;
struct bio_vec *bvec;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
ssize_t ret;
 
ret = copy_page_from_iter(bvec->bv_page,
@@ -1120,8 +1121,9 @@ static int bio_copy_to_iter(struct bio *bio, struct 
iov_iter iter)
 {
int i;
struct bio_vec *bvec;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
ssize_t ret;
 
ret = copy_page_to_iter(bvec->bv_page,
@@ -1143,8 +1145,9 @@ void bio_free_pages(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bvec, bio, i)
+   bio_for_each_segment_all_sp(bvec, bio, i, bia)
__free_page(bvec->bv_page);
 }
 EXPORT_SYMBOL(bio_free_pages);
@@ -1435,11 +1438,12 @@ static void __bio_unmap_user(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all bia;
 
/*
 * make sure we dirty pages we wrote to
 */
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
if (bio_data_dir(bio) == READ)
set_page_dirty_lock(bvec->bv_page);
 
@@ -1531,8 +1535,9 @@ static void bio_copy_kern_endio_read(struct bio *bio)
char *p = bio->bi_private;
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all bia;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
p += bvec->bv_len;
}
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 3bd15d8095b1..558b84ae2d86 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -81,6 +81,7 @@ int blkdev_report_zones(struct block_device *bdev,
unsigned int ofst;
void *addr;
int ret;
+   struct bvec_iter_all bia;
 
if (!q)
return -ENXIO;
@@ -148,7 +149,7 @@ int blkdev_report_zones(struct block_device *bdev,
n = 0;
nz = 0;
nr_rep = 0;
-   bio_for_each_segment_all(bv, bio, i) {
+   bio_for_each_segment_all_sp(bv, bio, i, bia) {
 
if (!bv->bv_page)
break;
@@ -181,7 +182,7 @@ int blkdev_report_zones(struct block_device *bdev,
 
*nr_zones = nz;
 out:
-   bio_for_each_segment_all(bv, bio, i)
+   bio_for_each_segment_all_sp(bv, bio, i, bia)
__free_page(bv->bv_page);
bio_put(bio);
 
diff --git a/block/bounce.c b/block/bounce.c
index 590dcdb1de76..1f46ba9535c1 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -144,11 +144,12 @@ static void bounce_end_io(struct bio *bio, mempool_t 
*pool)
struct bio_vec *bvec, orig_vec;
int i;
struct bvec_iter orig_iter = bio_orig->bi_iter;
+   struct bvec_iter_all bia;
 
/*
 * free up bounce indirect pages used
 */
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all_sp(bvec, bio, i, bia) {
orig_vec = bio_iter_iovec(bio_orig, orig_iter);
if (bvec->bv_page == orig_vec.bv_page)
goto next;
@@ -205,6 +206,7 @@ static void __blk_queue_bounce(struct request_queue *q, 
struct bio **bio_orig,
unsigned i = 0;
bool bounce = false;
int sectors = 0;
+   struct bvec_iter_all bia;
 
bio_for_each_segment(from, *bio_orig, iter) {
if (i++ < BIO_MAX_PAGES)
@@ -223,7 +225,7 @@ static void __blk_queue_bounce(struct request_queue *q, 
struct bio **bio_orig,
}
bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
 
-   bio_for_each_segment_all(to, bio, i) {
+   bio_for_each_segment_all_sp(to, bio, i, bia) {
struct page *page = to->bv_page;
 
if (page_to_pfn(page) <= queue_bounce_pfn(q))
-- 
2.9.4



[PATCH v2 31/51] fs/buffer.c: use bvec iterator to truncate the bio

2017-06-26 Thread Ming Lei
Once multipage bvec is enabled, the last bvec may include
more than one page; this patch uses bvec_get_last_page()
to truncate the bio.

Signed-off-by: Ming Lei 
---
 fs/buffer.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 1910f539770b..53b8a29f4525 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3055,8 +3055,7 @@ void guard_bio_eod(int op, struct bio *bio)
unsigned truncated_bytes;
/*
 * It is safe to truncate the last bvec in the following way
-* even though multipage bvec is supported, but we need to
-* fix the parameters passed to zero_user().
+* even though multipage bvec is supported.
 */
struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
 
@@ -3085,7 +3084,10 @@ void guard_bio_eod(int op, struct bio *bio)
 
/* ..and clear the end of the buffer for reads */
if (op == REQ_OP_READ) {
-   zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+   struct bio_vec bv;
+
+   bvec_get_last_page(bvec, );
+   zero_user(bv.bv_page, bv.bv_offset + bv.bv_len,
truncated_bytes);
}
 }
-- 
2.9.4



[PATCH v2 27/51] block: use bio_for_each_segment_mp() to map sg

2017-06-26 Thread Ming Lei
It is more efficient to use bio_for_each_segment_mp()
for mapping sg; meanwhile we have to consider splitting
a multipage bvec, as done in blk_bio_segment_split().
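
The splitting arithmetic of the new blk_bvec_map_sg() can be
modelled in userspace like this (sizes are invented and
queue_max_segment_size() is a plain constant here):

#include <stdio.h>

#define PAGE_SIZE 4096U

int main(void)
{
        unsigned nbytes = 9 * PAGE_SIZE + 512;  /* bvec length */
        unsigned bv_offset = 1024;              /* bvec start offset */
        unsigned max_seg = 4 * PAGE_SIZE;       /* queue_max_segment_size() */
        unsigned total = 0, nsegs = 0;

        while (nbytes) {
                unsigned seg = nbytes < max_seg ? nbytes : max_seg;
                unsigned off = (total + bv_offset) % PAGE_SIZE;
                unsigned idx = (total + bv_offset) / PAGE_SIZE;

                /* one sg entry per chunk, capped at max_seg */
                printf("sg[%u]: page idx %u, offset %u, len %u\n",
                       nsegs, idx, off, seg);
                total += seg;
                nbytes -= seg;
                nsegs++;
        }
        return 0;
}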

Signed-off-by: Ming Lei 
---
 block/blk-merge.c | 72 +++
 1 file changed, 52 insertions(+), 20 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 8d2c2d763456..894dcd017b56 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -439,6 +439,56 @@ static int blk_phys_contig_segment(struct request_queue 
*q, struct bio *bio,
return 0;
 }
 
+static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
+   struct scatterlist *sglist)
+{
+   if (!*sg)
+   return sglist;
+   else {
+   /*
+* If the driver previously mapped a shorter
+* list, we could see a termination bit
+* prematurely unless it fully inits the sg
+* table on each mapping. We KNOW that there
+* must be more entries here or the driver
+* would be buggy, so force clear the
+* termination bit to avoid doing a full
+* sg_init_table() in drivers for each command.
+*/
+   sg_unmark_end(*sg);
+   return sg_next(*sg);
+   }
+}
+
+static inline unsigned
+blk_bvec_map_sg(struct request_queue *q, struct bio_vec *bvec,
+   struct scatterlist *sglist, struct scatterlist **sg)
+{
+   unsigned nbytes = bvec->bv_len;
+   unsigned nsegs = 0, total = 0;
+
+   while (nbytes > 0) {
+   unsigned seg_size;
+   struct page *pg;
+   unsigned offset, idx;
+
+   *sg = blk_next_sg(sg, sglist);
+
+   seg_size = min(nbytes, queue_max_segment_size(q));
+   offset = (total + bvec->bv_offset) % PAGE_SIZE;
+   idx = (total + bvec->bv_offset) / PAGE_SIZE;
+   pg = nth_page(bvec->bv_page, idx);
+
+   sg_set_page(*sg, pg, seg_size, offset);
+
+   total += seg_size;
+   nbytes -= seg_size;
+   nsegs++;
+   }
+
+   return nsegs;
+}
+
 static inline void
 __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
 struct scatterlist *sglist, struct bio_vec *bvprv,
@@ -472,25 +522,7 @@ __blk_segment_map_sg(struct request_queue *q, struct 
bio_vec *bvec,
(*sg)->length += nbytes;
} else {
 new_segment:
-   if (!*sg)
-   *sg = sglist;
-   else {
-   /*
-* If the driver previously mapped a shorter
-* list, we could see a termination bit
-* prematurely unless it fully inits the sg
-* table on each mapping. We KNOW that there
-* must be more entries here or the driver
-* would be buggy, so force clear the
-* termination bit to avoid doing a full
-* sg_init_table() in drivers for each command.
-*/
-   sg_unmark_end(*sg);
-   *sg = sg_next(*sg);
-   }
-
-   sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
-   (*nsegs)++;
+   (*nsegs) += blk_bvec_map_sg(q, bvec, sglist, sg);
 
/* for making iterator happy */
bvec->bv_offset -= advance;
@@ -516,7 +548,7 @@ static int __blk_bios_map_sg(struct request_queue *q, 
struct bio *bio,
int cluster = blk_queue_cluster(q), nsegs = 0;
 
for_each_bio(bio)
-   bio_for_each_segment(bvec, bio, iter)
+   bio_for_each_segment_mp(bvec, bio, iter)
__blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
&nsegs, &cluster);
 
-- 
2.9.4



[PATCH v2 30/51] block: introduce bvec_get_last_page()

2017-06-26 Thread Ming Lei
BTRFS and guard_bio_eod() need to get the last page, so introduce
this helper to make them happy.
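
The result of the helper boils down to simple page arithmetic; a
standalone sketch, assuming 4K pages and invented offset/length
values:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        unsigned long bv_offset = 512, bv_len = 3 * PAGE_SIZE;
        unsigned long last = (bv_offset + bv_len - 1) / PAGE_SIZE;
        unsigned long last_off = last ? 0 : bv_offset;
        unsigned long last_len = bv_offset + bv_len - last * PAGE_SIZE
                                 - last_off;

        /* 512 + 12288 bytes spans pages 0..3; last page holds 512 bytes */
        printf("last page idx %lu, offset %lu, len %lu\n",
               last, last_off, last_len);
        return 0;
}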

Signed-off-by: Ming Lei 
---
 include/linux/bvec.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 7addceea9828..6673e3c0b7eb 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -206,4 +206,18 @@ static inline void bvec_iter_advance_mp(const struct 
bio_vec *bv,
 #define bvec_for_each_sp_bvec(sp_bvl, mp_bvec, iter)   \
__bvec_for_each_sp_bvec(sp_bvl, mp_bvec, iter, BVEC_ITER_ALL_INIT)
 
+/*
+ * get the last page from the multipage bvec and store it
+ * in @sp_bv
+ */
+static inline void bvec_get_last_page(struct bio_vec *mp_bv,
+ struct bio_vec *sp_bv)
+{
+   struct bvec_iter iter;
+
+   *sp_bv = *mp_bv;
+   bvec_for_each_sp_bvec(*sp_bv, mp_bv, iter)
+   ;
+}
+
 #endif /* __LINUX_BVEC_ITER_H */
-- 
2.9.4



[PATCH v2 29/51] block: bio: introduce single/multi page version of bio_for_each_segment_all()

2017-06-26 Thread Ming Lei
This patch introduces bio_for_each_segment_all_sp() and
bio_for_each_segment_all_mp().

bio_for_each_segment_all_sp() replaces bio_for_each_segment_all()
in cases where the returned bvec has to be a singlepage bvec.

bio_for_each_segment_all_mp() replaces bio_for_each_segment_all()
in cases where the user wants to update the returned bvec via its
pointer.
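
The semantic difference can be shown with a small userspace model
(plain C, no kernel types): the mp flavour hands out a pointer into
the table, while the sp flavour hands out an in-flight copy, so
writes through the sp bvec are lost.

#include <stdio.h>

struct bio_vec { unsigned bv_len; };

int main(void)
{
        struct bio_vec table[2] = { { 4096 }, { 8192 } };

        /* mp style: pointer into the table, the update sticks */
        for (int i = 0; i < 2; i++) {
                struct bio_vec *bv = &table[i];
                bv->bv_len = 0;                 /* visible in table */
        }

        /* sp style: in-flight copy, the update is discarded */
        for (int i = 0; i < 2; i++) {
                struct bio_vec bv = table[i];
                bv.bv_len = 123;                /* modifies the copy only */
        }

        printf("table[0].bv_len = %u\n", table[0].bv_len);      /* 0 */
        return 0;
}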

Signed-off-by: Ming Lei 
---
 include/linux/bio.h   | 24 
 include/linux/blk_types.h |  6 ++
 2 files changed, 30 insertions(+)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index bdbc9480229d..a4bb694b4da5 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -216,6 +216,30 @@ static inline void bio_advance_iter_mp(struct bio *bio, 
struct bvec_iter *iter,
 #define bio_for_each_segment_mp(bvl, bio, iter)
\
__bio_for_each_segment_mp(bvl, bio, iter, (bio)->bi_iter)
 
+/*
+ * This helper returns each bvec stored in bvec table directly,
+ * so the returned bvec points to one multipage bvec in the table
+ * and caller can update the bvec via the returned pointer.
+ */
+#define bio_for_each_segment_all_mp(bvl, bio, i)   \
+   bio_for_each_segment_all((bvl), (bio), (i))
+
+/*
+ * This helper returns singlepage bvec to caller, and the sp bvec
+ * is generated in-flight from multipage bvec stored in bvec table.
+ * So we can _not_ change the bvec stored in bio->bi_io_vec[] via
+ * this helper.
+ *
+ * If someone need to update bvec in the table, please use
+ * bio_for_each_segment_all_mp() and make sure it is correctly used
+ * since the bvec points to one multipage bvec.
+ */
+#define bio_for_each_segment_all_sp(bvl, bio, i, bi)   \
+   for ((bi).iter = BVEC_ITER_ALL_INIT, i = 0, bvl = &(bi).bv; \
+(bi).iter.bi_idx < (bio)->bi_vcnt &&   \
+   (((bi).bv = bio_iter_iovec((bio), (bi).iter)), 1);  \
+bio_advance_iter((bio), &(bi).iter, (bi).bv.bv_len), i++)
+
 #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
 
 static inline unsigned bio_segments(struct bio *bio)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index e210da6d14b8..3650932f2080 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -118,6 +118,12 @@ struct bio {
 
 #define BIO_RESET_BYTESoffsetof(struct bio, bi_max_vecs)
 
+/* this iter is only for implementing bio_for_each_segment_all_sp() */
+struct bvec_iter_all {
+   struct bvec_iteriter;
+   struct bio_vec  bv;  /* in-flight singlepage bvec */
+};
+
 /*
  * bio flags
  */
-- 
2.9.4



[PATCH v2 28/51] block: introduce bvec_for_each_sp_bvec()

2017-06-26 Thread Ming Lei
This helper can be used to iterate over each singlepage bvec
contained in one multipage bvec.
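
The generation of singlepage bvecs from one multipage bvec follows
this arithmetic; a standalone sketch, assuming 4K pages and an
invented offset/length:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        unsigned long off = 1024, len = 2 * PAGE_SIZE;  /* spans 3 pages */

        while (len) {
                /* each sp bvec stops at the next page boundary */
                unsigned long sp_len = PAGE_SIZE - off % PAGE_SIZE;

                if (sp_len > len)
                        sp_len = len;
                printf("page %lu off %lu len %lu\n",
                       off / PAGE_SIZE, off % PAGE_SIZE, sp_len);
                off += sp_len;
                len -= sp_len;
        }
        return 0;
}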

Signed-off-by: Ming Lei 
---
 include/linux/bvec.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 5c51c58fe202..7addceea9828 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -192,4 +192,18 @@ static inline void bvec_iter_advance_mp(const struct 
bio_vec *bv,
.bi_bvec_done   = 0,\
 }
 
+/*
+ * This helper iterates over the multipage bvec of @mp_bvec and
+ * returns each singlepage bvec via @sp_bvl.
+ */
+#define __bvec_for_each_sp_bvec(sp_bvl, mp_bvec, iter, start)  \
+   for (iter = start,  \
+(iter).bi_size = (mp_bvec)->bv_len  - (iter).bi_bvec_done; \
+(iter).bi_size &&  \
+   ((sp_bvl = bvec_iter_bvec((mp_bvec), (iter))), 1);  \
+bvec_iter_advance((mp_bvec), &(iter), (sp_bvl).bv_len))
+
+#define bvec_for_each_sp_bvec(sp_bvl, mp_bvec, iter)   \
+   __bvec_for_each_sp_bvec(sp_bvl, mp_bvec, iter, BVEC_ITER_ALL_INIT)
+
 #endif /* __LINUX_BVEC_ITER_H */
-- 
2.9.4



[PATCH v2 25/51] block: blk-merge: remove unnecessary check

2017-06-26 Thread Ming Lei
In this case 'sectors' has just been set to max_sectors and so
can't be zero at all; remove the check and let the bio be split.

Signed-off-by: Ming Lei 
---
 block/blk-merge.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index bf7a0fa0199f..c6fcc49b9aea 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -128,9 +128,7 @@ static struct bio *blk_bio_segment_split(struct 
request_queue *q,
nsegs++;
sectors = max_sectors;
}
-   if (sectors)
-   goto split;
-   /* Make this single bvec as the 1st segment */
+   goto split;
}
 
if (bvprvp && blk_queue_cluster(q)) {
-- 
2.9.4



[PATCH v2 16/51] block: bounce: avoid direct access to bvec table

2017-06-26 Thread Ming Lei
We will support multipage bvecs in the future, so switch to
the iterator way of getting the bv_page of each bvec from the
original bio.

Signed-off-by: Ming Lei 
---
 block/bounce.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/block/bounce.c b/block/bounce.c
index 916ee9a9a216..4eea1b2d8618 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -135,21 +135,22 @@ static void copy_to_high_bio_irq(struct bio *to, struct 
bio *from)
 static void bounce_end_io(struct bio *bio, mempool_t *pool)
 {
struct bio *bio_orig = bio->bi_private;
-   struct bio_vec *bvec, *org_vec;
+   struct bio_vec *bvec, orig_vec;
int i;
-   int start = bio_orig->bi_iter.bi_idx;
+   struct bvec_iter orig_iter = bio_orig->bi_iter;
 
/*
 * free up bounce indirect pages used
 */
bio_for_each_segment_all(bvec, bio, i) {
-   org_vec = bio_orig->bi_io_vec + i + start;
-
-   if (bvec->bv_page == org_vec->bv_page)
-   continue;
+   orig_vec = bio_iter_iovec(bio_orig, orig_iter);
+   if (bvec->bv_page == orig_vec.bv_page)
+   goto next;
 
dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
mempool_free(bvec->bv_page, pool);
+ next:
+   bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
}
 
bio_orig->bi_status = bio->bi_status;
-- 
2.9.4



[PATCH v2 10/51] dm: limit the max bio size as BIO_MAX_PAGES * PAGE_SIZE

2017-06-26 Thread Ming Lei
For BIO based DM, some targets, such as the crypt target,
aren't ready to deal with an incoming bio bigger than 1 Mbyte.
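
A quick userspace check of the cap this applies, assuming 4K pages
and the current BIO_MAX_PAGES of 256:

#include <stdio.h>
#include <stdint.h>

#define BIO_MAX_PAGES 256UL
#define PAGE_SIZE 4096UL

int main(void)
{
        uint64_t len = 4UL << 20;       /* target asks for 4 MiB */
        uint32_t max_io_len = len < BIO_MAX_PAGES * PAGE_SIZE ?
                        len : BIO_MAX_PAGES * PAGE_SIZE;

        printf("max_io_len = %u\n", max_io_len);        /* 1048576 */
        return 0;
}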

Cc: Mike Snitzer 
Cc: dm-de...@redhat.com
Signed-off-by: Ming Lei 
---
 drivers/md/dm.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 96bd13e581cd..49583c623cdd 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -921,7 +921,16 @@ int dm_set_target_max_io_len(struct dm_target *ti, 
sector_t len)
return -EINVAL;
}
 
-   ti->max_io_len = (uint32_t) len;
+   /*
+* BIO based queue uses its own splitting. When multipage bvecs
+* is switched on, size of the incoming bio may be too big to
+* be handled in some targets, such as crypt.
+*
+* When these targets are ready for the big bio, we can remove
+* the limit.
+*/
+   ti->max_io_len = min_t(uint32_t, len,
+  (BIO_MAX_PAGES * PAGE_SIZE));
 
return 0;
 }
-- 
2.9.4



[PATCH v2 09/51] block: comment on bio_iov_iter_get_pages()

2017-06-26 Thread Ming Lei
bio_iov_iter_get_pages() uses the unused bvec table space to
store the page pointer array temporarily; this patch comments
on that usage wrt. multipage bvec support.

Signed-off-by: Ming Lei 
---
 block/bio.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/block/bio.c b/block/bio.c
index a5db117e8dfa..bf7f25889f6e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -870,6 +870,10 @@ EXPORT_SYMBOL(bio_add_page);
  *
  * Pins as many pages from *iter and appends them to @bio's bvec array. The
  * pages will have to be released using put_page() when done.
+ *
+ * The hacking way of using bvec table as page pointer array is safe
+ * even after multipage bvec is introduced because that space can be
+ * thought as unused by bio_add_page().
  */
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
-- 
2.9.4



[PATCH v2 08/51] block: comment on bio_alloc_pages()

2017-06-26 Thread Ming Lei
This patch adds a comment on the usage of bio_alloc_pages().

Signed-off-by: Ming Lei 
---
 block/bio.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/bio.c b/block/bio.c
index 89a51bd49ab7..a5db117e8dfa 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -972,7 +972,9 @@ EXPORT_SYMBOL(bio_advance);
  * @bio: bio to allocate pages for
  * @gfp_mask: flags for allocation
  *
- * Allocates pages up to @bio->bi_vcnt.
+ * Allocates pages up to @bio->bi_vcnt, and this function should only
+ * be called on a new initialized bio, which means all pages aren't added
+ * to the bio via bio_add_page() yet.
  *
  * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages 
are
  * freed.
-- 
2.9.4



[PATCH v2 07/51] bcache: comment on direct access to bvec table

2017-06-26 Thread Ming Lei
All of these direct accesses look safe after multipage bvec is
supported.

Cc: linux-bca...@vger.kernel.org
Signed-off-by: Ming Lei 
---
 drivers/md/bcache/btree.c | 1 +
 drivers/md/bcache/super.c | 6 ++
 drivers/md/bcache/util.c  | 7 +++
 3 files changed, 14 insertions(+)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 866dcf78ff8e..3da595ae565b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -431,6 +431,7 @@ static void do_btree_node_write(struct btree *b)
 
continue_at(cl, btree_node_write_done, NULL);
} else {
+   /* No harm for multipage bvec since the new is just allocated */
b->bio->bi_vcnt = 0;
bch_bio_map(b->bio, i);
 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 8352fad765f6..6808f548cd13 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -208,6 +208,7 @@ static void write_bdev_super_endio(struct bio *bio)
 
 static void __write_super(struct cache_sb *sb, struct bio *bio)
 {
+   /* single page bio, safe for multipage bvec */
struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
unsigned i;
 
@@ -1154,6 +1155,8 @@ static void register_bdev(struct cache_sb *sb, struct 
page *sb_page,
dc->bdev->bd_holder = dc;
 
bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
+
+   /* single page bio, safe for multipage bvec */
dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
get_page(sb_page);
 
@@ -1799,6 +1802,7 @@ void bch_cache_release(struct kobject *kobj)
for (i = 0; i < RESERVE_NR; i++)
free_fifo(>free[i]);
 
+   /* single page bio, safe for multipage bvec */
if (ca->sb_bio.bi_inline_vecs[0].bv_page)
put_page(ca->sb_bio.bi_io_vec[0].bv_page);
 
@@ -1854,6 +1858,8 @@ static int register_cache(struct cache_sb *sb, struct 
page *sb_page,
ca->bdev->bd_holder = ca;
 
bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
+
+   /* single page bio, safe for multipage bvec */
ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
get_page(sb_page);
 
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 8c3a938f4bf0..11b4230ea6ad 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -223,6 +223,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t 
done)
: 0;
 }
 
+/*
+ * Generally it isn't good to access .bi_io_vec and .bi_vcnt
+ * directly, the preferred way is bio_add_page, but in
+ * this case, bch_bio_map() supposes that the bvec table
+ * is empty, so it is safe to access .bi_vcnt & .bi_io_vec
+ * in this way even after multipage bvec is supported.
+ */
 void bch_bio_map(struct bio *bio, void *base)
 {
size_t size = bio->bi_iter.bi_size;
-- 
2.9.4



[PATCH v2 06/51] f2fs: f2fs_read_end_io: comment on direct access to bvec table

2017-06-26 Thread Ming Lei
Cc: Jaegeuk Kim 
Cc: Chao Yu 
Cc: linux-f2fs-de...@lists.sourceforge.net
Signed-off-by: Ming Lei 
---
 fs/f2fs/data.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7697d03e8a98..622c44a1be78 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -56,6 +56,10 @@ static void f2fs_read_end_io(struct bio *bio)
int i;
 
 #ifdef CONFIG_F2FS_FAULT_INJECTION
+   /*
+* It is still safe to retrieve the 1st page of the bio
+* in this way after supporting multipage bvec.
+*/
if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) {
f2fs_show_injection_info(FAULT_IO);
bio->bi_status = BLK_STS_IOERR;
-- 
2.9.4



[PATCH v2 04/51] mm: page_io.c: comment on direct access to bvec table

2017-06-26 Thread Ming Lei
Cc: Andrew Morton 
Cc: linux...@kvack.org
Signed-off-by: Ming Lei 
---
 mm/page_io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/page_io.c b/mm/page_io.c
index b6c4ac388209..11c6f4a9a25b 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -43,6 +43,7 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
 
 void end_swap_bio_write(struct bio *bio)
 {
+   /* single page bio, safe for multipage bvec */
struct page *page = bio->bi_io_vec[0].bv_page;
 
if (bio->bi_status) {
@@ -116,6 +117,7 @@ static void swap_slot_free_notify(struct page *page)
 
 static void end_swap_bio_read(struct bio *bio)
 {
+   /* single page bio, safe for multipage bvec */
struct page *page = bio->bi_io_vec[0].bv_page;
struct task_struct *waiter = bio->bi_private;
 
-- 
2.9.4



[PATCH v2 05/51] fs/buffer: comment on direct access to bvec table

2017-06-26 Thread Ming Lei
Signed-off-by: Ming Lei 
---
 fs/buffer.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 4d5d03b42e11..1910f539770b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3052,8 +3052,13 @@ static void end_bio_bh_io_sync(struct bio *bio)
 void guard_bio_eod(int op, struct bio *bio)
 {
sector_t maxsector;
-   struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
unsigned truncated_bytes;
+   /*
+* It is safe to truncate the last bvec in the following way
+* even though multipage bvec is supported, but we need to
+* fix the parameters passed to zero_user().
+*/
+   struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
 
maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
if (!maxsector)
-- 
2.9.4



[PATCH v2 03/51] kernel/power/swap.c: comment on direct access to bvec table

2017-06-26 Thread Ming Lei
Cc: "Rafael J. Wysocki" 
Cc: linux...@vger.kernel.org
Signed-off-by: Ming Lei 
---
 kernel/power/swap.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 57d22571f306..aa52ccc03fcc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -238,6 +238,8 @@ static void hib_init_batch(struct hib_bio_batch *hb)
 static void hib_end_io(struct bio *bio)
 {
struct hib_bio_batch *hb = bio->bi_private;
+
+   /* single page bio, safe for multipage bvec */
struct page *page = bio->bi_io_vec[0].bv_page;
 
if (bio->bi_status) {
-- 
2.9.4



[PATCH v2 02/51] block: loop: comment on direct access to bvec table

2017-06-26 Thread Ming Lei
Signed-off-by: Ming Lei 
---
 drivers/block/loop.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 0de11444e317..88063ab17e9a 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -487,6 +487,11 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
/* nomerge for loop request queue */
WARN_ON(cmd->rq->bio != cmd->rq->biotail);
 
+   /*
+* For multipage bvec support, it is safe to pass the bvec
+* table to iov iterator, because iov iter still uses bvec
+* iter helpers to traverse bvec.
+*/
bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
  bio_segments(bio), blk_rq_bytes(cmd->rq));
-- 
2.9.4



[PATCH v2 00/51] block: support multipage bvec

2017-06-26 Thread Ming Lei
Hi,

This patchset brings multipage bvec into block layer:

1) what is multipage bvec?

Multipage bvecs mean that one 'struct bio_vec' can hold
multiple physically contiguous pages, instead of the single
page the Linux kernel has used for a long time.
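
For example, eight physically contiguous 4K pages take eight
single-page bvecs today but collapse into one multipage bvec; a toy
userspace illustration (the struct here is simplified and the pfn
values are invented):

#include <stdio.h>

#define PAGE_SIZE 4096UL

struct bio_vec { unsigned long pfn; unsigned bv_len, bv_offset; };

int main(void)
{
        /* single-page representation: one bvec per page */
        struct bio_vec sp[8];
        for (int i = 0; i < 8; i++)
                sp[i] = (struct bio_vec){ .pfn = 100 + i,
                                          .bv_len = PAGE_SIZE };

        /* multipage representation: one bvec covering all 32K */
        struct bio_vec mp = { .pfn = 100, .bv_len = 8 * PAGE_SIZE };

        printf("sp bvecs: 8 x %u bytes, mp bvec: 1 x %u bytes\n",
               sp[0].bv_len, mp.bv_len);
        return 0;
}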

2) why is multipage bvec introduced?

Kent proposed the idea[1] first. 

As systems' RAM becomes much bigger than before, and at the
same time huge pages, transparent huge pages and memory
compaction are widely used, it is now quite common to see
physically contiguous pages coming from the fs in I/O.
On the other hand, from the block layer's view, it isn't
necessary to store the intermediate pages in the bvec; it
is enough to just store the physically contiguous 'segment'
in each io vector.

Also huge pages are being brought to filesystems and swap
[2][6], and we can do IO on a hugepage each time[3], which
requires that one bio can transfer at least one huge page
at a time. It turns out that simply changing BIO_MAX_PAGES
isn't flexible enough[3][5]. Multipage bvec fits this case
very well.

With multipage bvec:

- segment handling in the block layer can be much improved
in the future, since it should be quite easy to convert a
multipage bvec into a segment. For example, we might
just store the segment in each bvec directly in the future.

- bio size can be increased, which should improve some
high-bandwidth IO cases in theory[4].

- inside the block layer, both bio splitting and sg mapping
can become more efficient than before by traversing the
physically contiguous 'segment' instead of each page.

- there is an opportunity in the future to improve the
memory footprint of bvecs.

3) how is multipage bvec implemented in this patchset?

The first 18 patches comment on some special cases of direct
access to the bvec table and deal with them.

The 2nd part(19~29) implements multipage bvec in block layer:

- put all tricks into bvec/bio/rq iterators, and as long as
drivers and fs use these standard iterators, they are happy
with multipage bvec

- use multipage bvec to split bio and map sg

- bio_for_each_segment_all() changes:
this helper passes a pointer to each bvec directly to the
user, so it has to be changed. Two new helpers
(bio_for_each_segment_all_sp() and bio_for_each_segment_all_mp())
are introduced.

The 3rd part(30~49) converts current users of bio_for_each_segment_all()
to bio_for_each_segment_all_sp()/bio_for_each_segment_all_mp().

The last part(50~51) enables multipage bvec.

These patches can be found in the following git tree:

https://github.com/ming1/linux/commits/mp-bvec-1.4-v4.12-rc

Thanks Christoph for looking at the early version and providing
very good suggestions, such as: introduce bio_init_with_vec_table(),
remove another unnecessary helpers for cleanup and so on.

Any comments are welcome!

V2:
- bvec table direct access in raid has been cleaned, so NO_MP
flag is dropped
- rebase on recent Neil Brown's change on bio and bounce code
- reorganize the patchset

V1:
- against v4.10-rc1 and some cleanup in V0 are in -linus already
- handle queue_virt_boundary() in mp bvec change and make NVMe happy
- further BTRFS cleanup
- remove QUEUE_FLAG_SPLIT_MP
- rename for two new helpers of bio_for_each_segment_all()
- fix bounce convertion
- address comments in V0

[1], http://marc.info/?l=linux-kernel&m=141680246629547&w=2
[2], https://patchwork.kernel.org/patch/9451523/
[3], http://marc.info/?t=14773544711&r=1&w=2
[4], http://marc.info/?l=linux-mm&m=147745525801433&w=2
[5], http://marc.info/?t=14956948457&r=1&w=2
[6], http://marc.info/?t=14982021534&r=1&w=2

Ming Lei (51):
  block: drbd: comment on direct access bvec table
  block: loop: comment on direct access to bvec table
  kernel/power/swap.c: comment on direct access to bvec table
  mm: page_io.c: comment on direct access to bvec table
  fs/buffer: comment on direct access to bvec table
  f2fs: f2fs_read_end_io: comment on direct access to bvec table
  bcache: comment on direct access to bvec table
  block: comment on bio_alloc_pages()
  block: comment on bio_iov_iter_get_pages()
  dm: limit the max bio size as BIO_MAX_PAGES * PAGE_SIZE
  md: raid1: initialize bvec table via bio_add_page()
  md: raid10: avoid to access bvec table directly
  btrfs: avoid access to .bi_vcnt directly
  btrfs: avoid to access bvec table directly for a cloned bio
  btrfs: comment on direct access bvec table
  block: bounce: avoid direct access to bvec table
  bvec_iter: introduce BVEC_ITER_ALL_INIT
  block: bounce: don't access bio->bi_io_vec in copy_to_high_bio_irq
  block: comments on bio_for_each_segment[_all]
  block: introduce multipage/single page bvec helpers
  block: implement sp version of bvec iterator helpers
  block: introduce bio_for_each_segment_mp()
  blk-merge: compute bio->bi_seg_front_size efficiently
  block: blk-merge: try to make front segments in full size
  block: blk-merge: remove 

[PATCH v2 01/51] block: drbd: comment on direct access bvec table

2017-06-26 Thread Ming Lei
Signed-off-by: Ming Lei 
---
 drivers/block/drbd/drbd_bitmap.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 809fd245c3dc..70890d950dc9 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -953,6 +953,7 @@ static void drbd_bm_endio(struct bio *bio)
struct drbd_bm_aio_ctx *ctx = bio->bi_private;
struct drbd_device *device = ctx->device;
struct drbd_bitmap *b = device->bitmap;
+   /* single page bio, safe for multipage bvec */
unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
 
if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
-- 
2.9.4



Re: [PATCH] sd: add support for TCG OPAL self encrypting disks

2017-06-26 Thread Christoph Hellwig
ping?

On Mon, Jun 19, 2017 at 02:26:46PM +0200, Christoph Hellwig wrote:
> Just wire up the generic TCG OPAL infrastructure to the SCSI disk driver
> and the Security In/Out commands.
> 
> Note that I don't know of any actual SCSI disks that do support TCG OPAL,
> but this is required to support ATA disks through libata.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  drivers/ata/libata-scsi.c  |  3 +++
>  drivers/scsi/sd.c  | 53 
> +-
>  drivers/scsi/sd.h  |  2 ++
>  include/scsi/scsi_device.h |  1 +
>  4 files changed, 58 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
> index 0f788ad6f2f6..3e5ca2e894a4 100644
> --- a/drivers/ata/libata-scsi.c
> +++ b/drivers/ata/libata-scsi.c
> @@ -1321,6 +1321,9 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
>  
>   blk_queue_flush_queueable(q, false);
>  
> + if (dev->flags & ATA_DFLAG_TRUSTED)
> + sdev->security_supported = 1;
> +
>   dev->sdev = sdev;
>   return 0;
>  }
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index f9d1432d7cc5..5d32fd7d3a3e 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -50,6 +50,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -643,6 +644,26 @@ static void scsi_disk_put(struct scsi_disk *sdkp)
>   mutex_unlock(&sd_ref_mutex);
>  }
>  
> +#ifdef CONFIG_BLK_SED_OPAL
> +static int sd_sec_submit(void *data, u16 spsp, u8 secp, void *buffer,
> + size_t len, bool send)
> +{
> + struct scsi_device *sdev = data;
> + u8 cdb[12] = { 0, };
> + int ret;
> +
> + cdb[0] = send ? SECURITY_PROTOCOL_OUT : SECURITY_PROTOCOL_IN;
> + cdb[1] = secp;
> + put_unaligned_be16(spsp, &cdb[2]);
> + put_unaligned_be32(len, &cdb[6]);
> +
> + ret = scsi_execute_req(sdev, cdb,
> + send ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
> + buffer, len, NULL, SD_TIMEOUT, SD_MAX_RETRIES, NULL);
> + return ret <= 0 ? ret : -EIO;
> +}
> +#endif /* CONFIG_BLK_SED_OPAL */
> +
>  static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd,
>  unsigned int dix, unsigned int dif)
>  {
> @@ -1439,6 +1460,9 @@ static int sd_ioctl(struct block_device *bdev, fmode_t 
> mode,
>   if (error)
>   goto out;
>  
> + if (is_sed_ioctl(cmd))
> + return sed_ioctl(sdkp->opal_dev, cmd, p);
> +
>   /*
>* Send SCSI addressing ioctls directly to mid level, send other
>* ioctls to block level and then onto mid level if they can't be
> @@ -2994,6 +3018,20 @@ static void sd_read_write_same(struct scsi_disk *sdkp, 
> unsigned char *buffer)
>   sdkp->ws10 = 1;
>  }
>  
> +static void sd_read_security(struct scsi_disk *sdkp, unsigned char *buffer)
> +{
> + struct scsi_device *sdev = sdkp->device;
> +
> + if (!sdev->security_supported)
> + return;
> +
> + if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE,
> + SECURITY_PROTOCOL_IN) == 1 &&
> + scsi_report_opcode(sdev, buffer, SD_BUF_SIZE,
> + SECURITY_PROTOCOL_OUT) == 1)
> + sdkp->security = 1;
> +}
> +
>  /**
>   *   sd_revalidate_disk - called the first time a new disk is seen,
>   *   performs disk spin up, read_capacity, etc.
> @@ -3047,6 +3085,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
>   sd_read_cache_type(sdkp, buffer);
>   sd_read_app_tag_own(sdkp, buffer);
>   sd_read_write_same(sdkp, buffer);
> + sd_read_security(sdkp, buffer);
>   }
>  
>   sdkp->first_scan = 0;
> @@ -3207,6 +3246,12 @@ static void sd_probe_async(void *data, async_cookie_t 
> cookie)
>  
>   sd_revalidate_disk(gd);
>  
> + if (sdkp->security) {
> + sdkp->opal_dev = init_opal_dev(sdp, sd_sec_submit);
> + if (sdkp->opal_dev)
> + sd_printk(KERN_NOTICE, sdkp, "supports TCG Opal\n");
> + }
> +
>   sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n",
> sdp->removable ? "removable " : "");
>   scsi_autopm_put_device(sdp);
> @@ -3356,6 +3401,8 @@ static int sd_remove(struct device *dev)
>  
>   sd_zbc_remove(sdkp);
>  
> + free_opal_dev(sdkp->opal_dev);
> +
>   blk_register_region(devt, SD_MINORS, NULL,
>   sd_default_probe, NULL, NULL);
>  
> @@ -3497,6 +3544,7 @@ static int sd_suspend_runtime(struct device *dev)
>  static int sd_resume(struct device *dev)
>  {
>   struct scsi_disk *sdkp = dev_get_drvdata(dev);
> + int ret;
>  
>   if (!sdkp)  /* E.g.: runtime resume at the start of sd_probe() */
>   return 0;
> @@ -3505,7 +3553,10 @@ static int sd_resume(struct device *dev)
>   return 0;
>  
>   sd_printk(KERN_NOTICE, sdkp, "Starting disk\n");
> - return 

[PATCH 1/3] blk-mq: include all present CPUs in the default queue mapping

2017-06-26 Thread Christoph Hellwig
This way we get a nice distribution independent of the current cpu
online / offline state.

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq-cpumap.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 8e61e8640e17..5eaecd40f701 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -35,7 +35,6 @@ int blk_mq_map_queues(struct blk_mq_tag_set *set)
 {
unsigned int *map = set->mq_map;
unsigned int nr_queues = set->nr_hw_queues;
-   const struct cpumask *online_mask = cpu_online_mask;
unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
cpumask_var_t cpus;
 
@@ -44,7 +43,7 @@ int blk_mq_map_queues(struct blk_mq_tag_set *set)
 
cpumask_clear(cpus);
nr_cpus = nr_uniq_cpus = 0;
-   for_each_cpu(i, online_mask) {
+   for_each_present_cpu(i) {
nr_cpus++;
first_sibling = get_first_sibling(i);
if (!cpumask_test_cpu(first_sibling, cpus))
@@ -54,7 +53,7 @@ int blk_mq_map_queues(struct blk_mq_tag_set *set)
 
queue = 0;
for_each_possible_cpu(i) {
-   if (!cpumask_test_cpu(i, online_mask)) {
+   if (!cpumask_test_cpu(i, cpu_present_mask)) {
map[i] = 0;
continue;
}
-- 
2.11.0



block: spread MSI(-X) vectors to all possible CPUs

2017-06-26 Thread Christoph Hellwig
Hi all,

this series contains the left-over block bits to spread the MSI-X
vectors over all CPUs.  Thomas already rewrote and then merged the
irq bits into the tip irq/core branch, and this is the remainder.

As there are no dependencies on other block changes, adding them
to the tip tree might be easiest, if Jens could ACK them.


[PATCH 19/20] lightnvm: pblk: set mempool and workqueue params.

2017-06-26 Thread Javier González
Introduce constants to define the sizes of the internal mempools and
workqueues. In the process, adjust the values to be more meaningful
given the internal constraints of the FTL. To do this for the
workqueues, separate the current auxiliary workqueue into two
dedicated workqueues, one to manage lines being closed and one to
manage bad blocks.

Signed-off-by: Javier González 
Signed-off-by: Matias Bjørling 
---
 drivers/lightnvm/pblk-core.c  |  7 ---
 drivers/lightnvm/pblk-init.c  | 39 ++-
 drivers/lightnvm/pblk-write.c |  5 +++--
 drivers/lightnvm/pblk.h   | 13 +++--
 4 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index ba3b88f0e1f7..823e53f95a80 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -33,7 +33,7 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line 
*line,
pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n",
line->id, pos);
 
-   pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb);
+   pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, pblk->bb_wq);
 }
 
 static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
@@ -1528,7 +1528,8 @@ void pblk_line_mark_bb(struct work_struct *work)
 }
 
 void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
- void (*work)(struct work_struct *))
+ void (*work)(struct work_struct *),
+ struct workqueue_struct *wq)
 {
struct pblk_line_ws *line_ws;
 
@@ -1541,7 +1542,7 @@ void pblk_line_run_ws(struct pblk *pblk, struct pblk_line 
*line, void *priv,
line_ws->priv = priv;
 
INIT_WORK(&line_ws->ws, work);
-   queue_work(pblk->kw_wq, &line_ws->ws);
+   queue_work(wq, &line_ws->ws);
 }
 
 void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 41db12deaa49..78a9bebac1fe 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -250,7 +250,7 @@ static int pblk_core_init(struct pblk *pblk)
if (!pblk->page_pool)
return -ENOMEM;
 
-   pblk->line_ws_pool = mempool_create_slab_pool(geo->nr_luns,
+   pblk->line_ws_pool = mempool_create_slab_pool(PBLK_WS_POOL_SIZE,
pblk_blk_ws_cache);
if (!pblk->line_ws_pool)
goto free_page_pool;
@@ -259,35 +259,45 @@ static int pblk_core_init(struct pblk *pblk)
if (!pblk->rec_pool)
goto free_blk_ws_pool;
 
-   pblk->g_rq_pool = mempool_create_slab_pool(64, pblk_g_rq_cache);
+   pblk->g_rq_pool = mempool_create_slab_pool(PBLK_READ_REQ_POOL_SIZE,
+   pblk_g_rq_cache);
if (!pblk->g_rq_pool)
goto free_rec_pool;
 
-   pblk->w_rq_pool = mempool_create_slab_pool(64, pblk_w_rq_cache);
+   pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns * 2,
+   pblk_w_rq_cache);
if (!pblk->w_rq_pool)
goto free_g_rq_pool;
 
pblk->line_meta_pool =
-   mempool_create_slab_pool(16, pblk_line_meta_cache);
+   mempool_create_slab_pool(PBLK_META_POOL_SIZE,
+   pblk_line_meta_cache);
if (!pblk->line_meta_pool)
goto free_w_rq_pool;
 
-   pblk->kw_wq = alloc_workqueue("pblk-aux-wq",
-   WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
-   if (!pblk->kw_wq)
+   pblk->close_wq = alloc_workqueue("pblk-close-wq",
+   WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS);
+   if (!pblk->close_wq)
goto free_line_meta_pool;
 
+   pblk->bb_wq = alloc_workqueue("pblk-bb-wq",
+   WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+   if (!pblk->bb_wq)
+   goto free_close_wq;
+
if (pblk_set_ppaf(pblk))
-   goto free_kw_wq;
+   goto free_bb_wq;
 
if (pblk_rwb_init(pblk))
-   goto free_kw_wq;
+   goto free_bb_wq;
 
INIT_LIST_HEAD(&pblk->compl_list);
return 0;
 
-free_kw_wq:
-   destroy_workqueue(pblk->kw_wq);
+free_bb_wq:
+   destroy_workqueue(pblk->bb_wq);
+free_close_wq:
+   destroy_workqueue(pblk->close_wq);
 free_line_meta_pool:
mempool_destroy(pblk->line_meta_pool);
 free_w_rq_pool:
@@ -305,8 +315,11 @@ static int pblk_core_init(struct pblk *pblk)
 
 static void pblk_core_free(struct pblk *pblk)
 {
-   if (pblk->kw_wq)
-   destroy_workqueue(pblk->kw_wq);
+   if (pblk->close_wq)
+   destroy_workqueue(pblk->close_wq);
+
+   if (pblk->bb_wq)
+   destroy_workqueue(pblk->bb_wq);

[PATCH 20/20] lightnvm: pblk: fail gracefully on irrec. error

2017-06-26 Thread Javier González
Because user writes are decoupled from media writes by an intermediate
write buffer, irrecoverable media write errors lead to pblk stalling:
user writes fill up the buffer and end up in an infinite retry loop.

In order to let user writes fail gracefully, it is necessary for pblk
to keep track of its own internal state and prevent further writes
from being placed into the write buffer.

This patch implements a state machine to keep track of internal errors
and, in case of failure, fail further user writes in a standard way.
Depending on the type of error, pblk will do its best to persist
buffered writes (which are already acknowledged) and close down in a
graceful manner. This way, data might be recovered by re-instantiating
pblk. Such a state machine also paves the way for a state-based FTL
log.
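
A hypothetical user-space sketch of the gating idea, reduced to a
single stop flag (the actual patch tracks several states inside struct
pblk, e.g. recovering and stopping; every name below is local to the
sketch): the error path flips the gate once, and the write entry point
then refuses new data instead of retrying forever:

#include <stdatomic.h>
#include <stdio.h>

enum pipeline_state { PIPE_RUNNING, PIPE_STOPPED };

static atomic_int pipeline = PIPE_RUNNING;

/* Called from the write-error path: flips the gate exactly once. */
static void pipeline_stop(void)
{
	int expected = PIPE_RUNNING;

	if (atomic_compare_exchange_strong(&pipeline, &expected, PIPE_STOPPED))
		fprintf(stderr, "pipeline stopped: failing new writes\n");
}

/* Called at the top of the user write path. */
static int write_to_cache(const void *buf, unsigned int nr_entries)
{
	if (atomic_load(&pipeline) != PIPE_RUNNING)
		return -1;	/* fail gracefully instead of looping */
	(void)buf;
	/* ... copy nr_entries into the ring buffer ... */
	return (int)nr_entries;
}

int main(void)
{
	char data[512] = { 0 };

	printf("first write: %d\n", write_to_cache(data, 1));
	pipeline_stop();	/* simulate an irrecoverable media error */
	printf("second write: %d\n", write_to_cache(data, 1));
	return 0;
}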

Signed-off-by: Javier González 
Signed-off-by: Matias Bjørling 
---
 drivers/lightnvm/pblk-cache.c|   8 +-
 drivers/lightnvm/pblk-core.c | 280 ---
 drivers/lightnvm/pblk-init.c |   6 +-
 drivers/lightnvm/pblk-map.c  |  23 +++-
 drivers/lightnvm/pblk-rb.c   |  25 +++-
 drivers/lightnvm/pblk-read.c |   3 +
 drivers/lightnvm/pblk-recovery.c |  31 +++--
 drivers/lightnvm/pblk-rl.c   |  30 -
 drivers/lightnvm/pblk-sysfs.c|   8 +-
 drivers/lightnvm/pblk-write.c|   5 +-
 drivers/lightnvm/pblk.h  |  27 +++-
 11 files changed, 332 insertions(+), 114 deletions(-)

diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
index 59bcea88db84..024a8fc93069 100644
--- a/drivers/lightnvm/pblk-cache.c
+++ b/drivers/lightnvm/pblk-cache.c
@@ -31,9 +31,13 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, 
unsigned long flags)
 */
 retry:
ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries, &bpos);
-   if (ret == NVM_IO_REQUEUE) {
+   switch (ret) {
+   case NVM_IO_REQUEUE:
io_schedule();
goto retry;
+   case NVM_IO_ERR:
+   pblk_pipeline_stop(pblk);
+   goto out;
}
 
if (unlikely(!bio_has_data(bio)))
@@ -58,6 +62,8 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, 
unsigned long flags)
atomic_long_add(nr_entries, &pblk->req_writes);
 #endif
 
+   pblk_rl_inserted(&pblk->rl, nr_entries);
+
 out:
pblk_write_should_kick(pblk);
return ret;
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 823e53f95a80..7648186bd1b1 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -53,6 +53,8 @@ static void __pblk_end_io_erase(struct pblk *pblk, struct 
nvm_rq *rqd)
*ppa = rqd->ppa_addr;
pblk_mark_bb(pblk, line, ppa);
}
+
+   atomic_dec(&pblk->inflight_io);
 }
 
 /* Erase completion assumes that only one block is erased at the time */
@@ -257,35 +259,25 @@ void pblk_end_io_sync(struct nvm_rq *rqd)
complete(waiting);
 }
 
-void pblk_flush_writer(struct pblk *pblk)
+void pblk_wait_for_meta(struct pblk *pblk)
 {
-   struct bio *bio;
-   int ret;
-   DECLARE_COMPLETION_ONSTACK(wait);
+   do {
+   if (!atomic_read(&pblk->inflight_io))
+   break;
 
-   bio = bio_alloc(GFP_KERNEL, 1);
-   if (!bio)
-   return;
+   schedule();
+   } while (1);
+}
 
-   bio->bi_iter.bi_sector = 0; /* internal bio */
-   bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_OP_FLUSH);
-   bio->bi_private = &wait;
-   bio->bi_end_io = pblk_end_bio_sync;
-
-   ret = pblk_write_to_cache(pblk, bio, 0);
-   if (ret == NVM_IO_OK) {
-   if (!wait_for_completion_io_timeout(&wait,
-   msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
-   pr_err("pblk: flush cache timed out\n");
-   }
-   } else if (ret != NVM_IO_DONE) {
-   pr_err("pblk: tear down bio failed\n");
-   }
-
-   if (bio->bi_status)
-   pr_err("pblk: flush sync write failed (%u)\n", bio->bi_status);
+static void pblk_flush_writer(struct pblk *pblk)
+{
+   pblk_rb_flush(&pblk->rwb);
+   do {
+   if (!pblk_rb_read_count(&pblk->rwb))
+   break;
 
-   bio_put(bio);
+   schedule();
+   } while (1);
 }
 
 struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
@@ -425,6 +417,9 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
}
}
 #endif
+
+   atomic_inc(&pblk->inflight_io);
+
return nvm_submit_io(dev, rqd);
 }
 
@@ -676,6 +671,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, 
struct pblk_line *line,
msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
pr_err("pblk: emeta I/O timed out\n");
}
+   atomic_dec(&pblk->inflight_io);
reinit_completion(&wait);
 
if 
