Re: [PATCH] scsi: fc: check for rport presence in fc_block_scsi_eh

2017-09-25 Thread Bart Van Assche
On Mon, 2017-09-25 at 14:00 +0200, Johannes Thumshirn wrote:
> Coverity-scan recently found a possible NULL pointer dereference in
> fc_block_scsi_eh() as starget_to_rport() either returns the rport for
> the starget or NULL.
> 
> While it is rather unlikely to have fc_block_scsi_eh() called without
> an rport associated, it's a good idea to catch potential misuses of the
> API gracefully.
> 
> Signed-off-by: Johannes Thumshirn 
> ---
>  drivers/scsi/scsi_transport_fc.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
> index ba9d70f8a6a1..830ce53f30fb 100644
> --- a/drivers/scsi/scsi_transport_fc.c
> +++ b/drivers/scsi/scsi_transport_fc.c
> @@ -3328,6 +3328,9 @@ int fc_block_scsi_eh(struct scsi_cmnd *cmnd)
>  {
>   struct fc_rport *rport = starget_to_rport(scsi_target(cmnd->device));
>  
> + if (WARN_ON(!rport))
> + return 0;
> +
>   return fc_block_rport(rport);
>  }
>  EXPORT_SYMBOL(fc_block_scsi_eh);

Did you perhaps intend to use WARN_ON_ONCE() instead of WARN_ON()? Anyway:

Reviewed-by: Bart Van Assche 
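
For reference, a minimal sketch of the WARN_ON_ONCE() variant Bart suggests
(illustrative only, not the applied patch):

int fc_block_scsi_eh(struct scsi_cmnd *cmnd)
{
	struct fc_rport *rport = starget_to_rport(scsi_target(cmnd->device));

	/* WARN_ON_ONCE() logs only the first misuse, avoiding log spam if
	 * the EH path is entered repeatedly without an rport. */
	if (WARN_ON_ONCE(!rport))
		return 0;

	return fc_block_rport(rport);
}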


Re: [PATCH] mpt3sas: remove redundant copy_from_user in _ctl_getiocinfo

2017-09-25 Thread Martin K. Petersen

Meng,

> Since right after the user copy, we are going to
> memset(&karg, 0, sizeof(karg)), the copy_from_user is redundant

Applied to 4.15/scsi-queue. Thank you!

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [RFC] apparently broken error recovery in vhost_scsi_iov_to_sgl()

2017-09-25 Thread Michael S. Tsirkin
On Sun, Sep 24, 2017 at 11:36:33PM +0100, Al Viro wrote:
> Suppose vhost_scsi_iov_to_sgl() got a two-iovec array, mapped
> e.g. 20 pages from the first one just fine and failed on the
> second.
> 
> static int
> vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write,
>   struct iov_iter *iter,
>   struct scatterlist *sg, int sg_count)
> {
> size_t off = iter->iov_offset;
> int i, ret;
> 
> for (i = 0; i < iter->nr_segs; i++) {
> void __user *base = iter->iov[i].iov_base + off;
> size_t len = iter->iov[i].iov_len - off;
> 
> ret = vhost_scsi_map_to_sgl(cmd, base, len, sg, write);
> if (ret < 0) {
> for (i = 0; i < sg_count; i++) {
> struct page *page = sg_page(&sg[i]);
> if (page)
> put_page(page);
> }
> return ret;
> }
> sg += ret;
> off = 0;
> }
> return 0;
> }
> 
> What are we trying to drop in the if (ret < 0) in there?  In the case
> above we step into it on the second pass through the loop.  The first
> 20 entries of sg had been filled... and sg had been increased by 20,
> so whatever we find and feed to put_page(), it won't be those 20 pages.
> Moreover, the caller will reset cmd->tvc_{prot_,}sgl_count to zero,
> so vhost_scsi_release_cmd() won't find them either.
>
> Am I missing something subtle here, or should that thing be doing
> something like

Looks right to me. I think Nicholas wrote this, CC him.

> 
> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
> index 046f6d280af5..e47c5bc3ddca 100644
> --- a/drivers/vhost/scsi.c
> +++ b/drivers/vhost/scsi.c
> @@ -688,6 +688,7 @@ vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write,
> struct scatterlist *sg, int sg_count)
>  {
>   size_t off = iter->iov_offset;
> + struct scatterlist *p = sg;
>   int i, ret;
>  
>   for (i = 0; i < iter->nr_segs; i++) {
> @@ -696,8 +697,8 @@ vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write,
>  
>   ret = vhost_scsi_map_to_sgl(cmd, base, len, sg, write);
>   if (ret < 0) {
> - for (i = 0; i < sg_count; i++) {
> - struct page *page = sg_page(&sg[i]);
> + while (p < sg) {
> + struct page *page = sg_page(p++);
>   if (page)
>   put_page(page);
>   }


Re: [PATCH V5 13/14] block: mq-deadline: Limit write request dispatch for zoned block devices

2017-09-25 Thread Bart Van Assche
On Mon, 2017-09-25 at 15:14 +0900, Damien Le Moal wrote:
> - return rq_entry_fifo(dd->fifo_list[data_dir].next);
> + if (!dd->zones_wlock || data_dir == READ)
> + return rq_entry_fifo(dd->fifo_list[data_dir].next);
> +
> + spin_lock_irqsave(&dd->zone_lock, flags);
> +
> + list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
> + if (deadline_can_dispatch_request(dd, rq))
> + goto out;
> + }
> + rq = NULL;
> +
> +out:
> + spin_unlock_irqrestore(&dd->zone_lock, flags);

Is it documented somewhere what dd->zone_lock protects and when that lock
should be acquired?
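
A sketch of the documentation being asked for, assuming the lock's role is
what the quoted code implies (guarding the zones_wlock bitmap and the write
FIFO scan):

struct deadline_data {
	/*
	 * Assumed semantics: zone_lock serializes updates to the
	 * zones_wlock bitmap and the scan of fifo_list[WRITE] for a
	 * dispatchable write. It is taken IRQ-safe because requests
	 * may complete, and unlock their zone, from interrupt context.
	 */
	spinlock_t zone_lock;
	unsigned long *zones_wlock;
	/* other fields omitted */
};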

>   /*
>* This may be a requeue of a request that has locked its
> -  * target zone. If this is the case, release the request zone lock.
> +  * target zone. If this is the case, release the zone lock.
>*/
>   if (deadline_request_has_zone_wlock(rq))
>   deadline_wunlock_zone(dd, rq);

Can this change be folded into the patch that introduced that comment?

> @@ -570,6 +621,9 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>  
>   blk_mq_sched_request_inserted(rq);
>  
> + if (at_head && deadline_request_needs_zone_wlock(dd, rq))
> + pr_info(" Write at head !\n");
> +
>   if (at_head || blk_rq_is_passthrough(rq)) {
>   if (at_head)
> list_add(&rq->queuelist, &dd->dispatch);

Will it be easy for users who analyze a kernel log to figure out why that
message has been generated? Should that message perhaps include the block
device name, zone number and request sector number?
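
A hedged sketch of the more informative message being asked for; the field
names follow the 4.14-era struct request and the exact format is illustrative:

	if (at_head && deadline_request_needs_zone_wlock(dd, rq))
		pr_info("%s: write at head of dispatch list: zone %u, sector %llu\n",
			rq->rq_disk ? rq->rq_disk->disk_name : "?",
			blk_rq_zone_no(rq),
			(unsigned long long)blk_rq_pos(rq));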

Thanks,

Bart.

Re: [PATCH] scsi_transport_fc: set scsi_target_id upon rescan

2017-09-25 Thread Martin K. Petersen

Hannes,

> When an rport is found in the bindings array there is no guarantee
> that it had been a target port, so we need to call
> fc_remote_port_rolechg() here to ensure the scsi_target_id is set
> correctly.  Otherwise the port will never be scanned.

Applied to 4.14/scsi-fixes. Thank you!

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH v4 00/14] mpt3sas driver NVMe support:

2017-09-25 Thread Martin K. Petersen

Hi Suganath,

> Also, making all PRP buffers may or may not need FW changes (assuming
> it is possible); we may end up with multiple FW version checks.

I don't understand how submitting an I/O that is guaranteed to honor the
constraints of the target NVMe drive could possibly cause problems for
the controller firmware. Quite the contrary, it's the best case
scenario.

> Since this is the main I/O path and the current driver follows the H/W
> limitation, we should avoid any changes in this area unless the change
> is universally acceptable in FW (for all types of workloads).

This is why you need to involve the Linux community early in the design
process and not when your implementation is complete.

We could have told you right away what the correct approach would be for
your Linux driver. And that said approach works for products from other
vendors so we see no compelling reason to deviate from it.

As evidenced by Broadcom disowning the legacy mpt and megaraid drivers,
I will be stuck maintaining this mpt3sas code for a decade or more. Long
after Broadcom has ended official support and moved on to different
ASICs and programming interfaces. Consequently, I am very heavily biased
towards solutions that leverage the shared interfaces provided by the
kernel and that don't have special cases and workarounds inside the
driver.

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH V5 06/14] block: Add zoned block device information to request queue

2017-09-25 Thread Bart Van Assche
On Mon, 2017-09-25 at 15:14 +0900, Damien Le Moal wrote:
> Components relying only on the request_queue structure for accessing
> block devices (e.g. I/O schedulers) have a limited knowledge of the
> device characteristics. In particular, the device capacity cannot be
> easily discovered, which for a zoned block device also results in the
> inability to easily know the number of zones of the device (the zone
> size is indicated by the chunk_sectors field of the queue limits).
> 
> Introduce the nr_zones field to the request_queue structure to simplify
> access to this information. Also, add the bitmap seq_zone_bitmap which
> indicates which zones of the device are sequential zones (write
> preferred or write required). These two fields are initialized by the
> low level block device driver (sd.c for ZBC/ZAC disks). They are not
> initialized by stacking drivers (device mappers) handling zoned block
> devices (e.g. dm-linear).

Reviewed-by: Bart Van Assche 



Re: [PATCH 3/6] scsi: lpfc: Cocci spatch "pool_zalloc-simple"

2017-09-25 Thread Martin K. Petersen

Thomas,

> Use *_pool_zalloc rather than *_pool_alloc followed by memset with 0.
> Found by coccinelle spatch "api/alloc/pool_zalloc-simple.cocci"

Applied to 4.15/scsi-queue. Thank you!

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH] scsi: aacraid: Add a small delay after IOP reset

2017-09-25 Thread Guilherme G. Piccoli
On 09/21/2017 01:19 PM, Dave Carroll wrote:
>> [...]
>> ---
> Acked-by: Dave Carroll 
> 

Thanks Dave!

James/Martin, am I expected to send a v2 with some change? Perhaps with
Dave's ack?
Sorry to annoy, thanks in advance for any advice!

Cheers,


Guilherme



Re: [PATCH V5 12/14] block: mq-deadline: Introduce zone locking support

2017-09-25 Thread Bart Van Assche
On Mon, 2017-09-25 at 15:14 +0900, Damien Le Moal wrote:
> +static inline bool deadline_request_needs_zone_wlock(struct deadline_data *dd,
> +  struct request *rq)
> +{
> +
> + if (!dd->zones_wlock)
> + return false;
> +
> + if (blk_rq_is_passthrough(rq))
> + return false;
> +
> + switch (req_op(rq)) {
> + case REQ_OP_WRITE_ZEROES:
> + case REQ_OP_WRITE_SAME:
> + case REQ_OP_WRITE:
> + return blk_rq_zone_is_seq(rq);
> + default:
> + return false;
> + }

If anyone ever adds a new write request type it will be easy to overlook this
function. Should the 'default' case be left out and should all request types
be mentioned in the switch/case statement such that the compiler will issue a
warning if a new request operation type is added to enum req_opf?
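
A sketch of the exhaustive-switch form suggested above; the operation list
reflects the 4.14-era enum req_opf and may be incomplete:

static inline bool deadline_request_needs_zone_wlock(struct deadline_data *dd,
						     struct request *rq)
{
	if (!dd->zones_wlock || blk_rq_is_passthrough(rq))
		return false;

	/* No default label: the compiler's -Wswitch warning then flags any
	 * enum req_opf value that is not handled here. */
	switch (req_op(rq)) {
	case REQ_OP_WRITE_ZEROES:
	case REQ_OP_WRITE_SAME:
	case REQ_OP_WRITE:
		return blk_rq_zone_is_seq(rq);
	case REQ_OP_READ:
	case REQ_OP_FLUSH:
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_ZONE_REPORT:
	case REQ_OP_ZONE_RESET:
	case REQ_OP_SCSI_IN:
	case REQ_OP_SCSI_OUT:
	case REQ_OP_DRV_IN:
	case REQ_OP_DRV_OUT:
		return false;
	}
	return false;
}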

> +/*
> + * Abuse the elv.priv[0] pointer to indicate if a request has write
> + * locked its target zone. Only write request to a zoned block device
> + * can own a zone write lock.
> + */
> +#define RQ_ZONE_WLOCKED  ((void *)1UL)
> +static inline void deadline_set_request_zone_wlock(struct request *rq)
> +{
> + rq->elv.priv[0] = RQ_ZONE_WLOCKED;
> +}
> +
> +#define RQ_ZONE_NO_WLOCK ((void *)0UL)
> +static inline void deadline_clear_request_zone_wlock(struct request *rq)
> +{
> + rq->elv.priv[0] = RQ_ZONE_NO_WLOCK;
> +}

Should an enumeration type be introduced for RQ_ZONE_WLOCKED and 
RQ_ZONE_NO_WLOCK?

> +/*
> + * Write lock the target zone of a write request.
> + */
> +static void deadline_wlock_zone(struct deadline_data *dd,
> + struct request *rq)
> +{
> + unsigned int zno = blk_rq_zone_no(rq);
> +
> + WARN_ON_ONCE(deadline_request_has_zone_wlock(rq));
> + WARN_ON_ONCE(test_and_set_bit(zno, dd->zones_wlock));
> + deadline_set_request_zone_wlock(rq);
> +}
> +
> +/*
> + * Write unlock the target zone of a write request.
> + */
> +static void deadline_wunlock_zone(struct deadline_data *dd,
> +   struct request *rq)
> +{
> + unsigned int zno = blk_rq_zone_no(rq);
> + unsigned long flags;
> +
> + spin_lock_irqsave(&dd->zone_lock, flags);
> +
> + WARN_ON_ONCE(!test_and_clear_bit(zno, dd->zones_wlock));
> + deadline_clear_request_zone_wlock(rq);
> +
> + spin_unlock_irqrestore(&dd->zone_lock, flags);
> +}

Why does deadline_wunlock_zone() protect modifications with dd->zone_lock but
deadline_wlock_zone() not? If this code is correct, please add a
lockdep_assert_held() statement in the first function.
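
If the lock is indeed meant to be held by the caller, the annotation asked
for above would look roughly like this (a sketch under that assumption):

static void deadline_wlock_zone(struct deadline_data *dd,
				struct request *rq)
{
	/* Document and runtime-check the locking assumption. */
	lockdep_assert_held(&dd->zone_lock);

	WARN_ON_ONCE(deadline_request_has_zone_wlock(rq));
	WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), dd->zones_wlock));
	deadline_set_request_zone_wlock(rq);
}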

> +/*
> + * Test the write lock state of the target zone of a write request.
> + */
> +static inline bool deadline_zone_is_wlocked(struct deadline_data *dd,
> + struct request *rq)
> +{
> + unsigned int zno = blk_rq_zone_no(rq);
> +
> + return test_bit(zno, dd->zones_wlock);
> +}

Do we really need the local variable 'zno'?

> +/*
> + * For zoned block devices, write unlock the target zone of
> + * completed write requests.
> + */
> +static void dd_completed_request(struct request *rq)
> +{
> +

Please leave out the blank line at the start of this function.

Thanks,

Bart.

Re: [PATCH 5/6] scsi: qla2xxx: Cocci spatch "pool_zalloc-simple"

2017-09-25 Thread Martin K. Petersen

Thomas,

> Use *_pool_zalloc rather than *_pool_alloc followed by memset with 0.
> Found by coccinelle spatch "api/alloc/pool_zalloc-simple.cocci"

Applied to 4.15/scsi-queue. Thanks!

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH V5 03/14] scsi: sd_zbc: Rearrange code

2017-09-25 Thread Bart Van Assche
On Mon, 2017-09-25 at 15:14 +0900, Damien Le Moal wrote:
> Rearrange sd_zbc_setup() to include use_16_for_rw and use_10_for_rw
> assignments and move the calculation of sdkp->zone_shift together
> with the assignment of the verified zone_blocks value in
> sd_zbc_check_zone_size().
> 
> No functional change is introduced by this patch.
> 
> Signed-off-by: Damien Le Moal 
> Reviewed-by: Christoph Hellwig 

Reviewed-by: Bart Van Assche 



Re: [PATCH V5 07/14] scsi: sd_zbc: Initialize device request queue zoned data

2017-09-25 Thread Bart Van Assche
On Mon, 2017-09-25 at 15:14 +0900, Damien Le Moal wrote:
> + return kzalloc_node(BITS_TO_LONGS(sdkp->nr_zones)
> + * sizeof(unsigned long),

Does this perhaps fit on one line?

> +/**
> + * sd_zbc_get_seq_zones - Parse report zones reply to identify sequential zones
> + * @sdkp: disk used
> + * @buf: report reply buffer
> + * @seq_zone_bitmap: bitmap of sequential zones to set
> + * @zno: Zone number of the first zone in the report

'zno' is an input and output parameter but the above line only describes what
happens with the value passed to sd_zbc_get_seq_zones(). I think we also need a
description for the value of 'zno' upon return.

> + * Parse reported zone descriptors to find sequiential zones.
  ^^^
  sequential?

Otherwise this patch looks fine to me.

Bart.

Re: [PATCH V5 11/14] block: mq-deadline: Introduce dispatch helpers

2017-09-25 Thread Bart Van Assche
On Mon, 2017-09-25 at 15:14 +0900, Damien Le Moal wrote:
> Avoid directly referencing the next_rq and fifo_list arrays using the
> helper functions deadline_next_request() and deadline_fifo_request() to
> facilitate changes in the dispatch request selection in
> __dd_dispatch_request().

Reviewed-by: Bart Van Assche 


Re: [PATCH 0/2] scsi_dh: suppress errors from unsupported devices

2017-09-25 Thread Martin K. Petersen

Hannes,

> here's a small patchset to suppress errors from scsi_dh_attach() for
> unsupported devices.

Applied to 4.15/scsi-queue. Thank you!

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH] scsi_transport_fc: Also check for NOTPRESENT in fc_remote_port_add()

2017-09-25 Thread Martin K. Petersen

Hannes,

> During failover there is a small race window between
> fc_remote_port_add() and fc_timeout_deleted_rport(); the latter drops
> the lock after setting the port to NOTPRESENT, so if
> fc_remote_port_add() is called right at that time it will fail to
> detect the existing rport and happily add a new structure, causing
> rports to get registered twice.

Applied to 4.14/scsi-fixes. Thanks!

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH V7 1/2] dma-mapping: Rework dma_get_cache_alignment()

2017-09-25 Thread 陈华才
Hi, Robin,

Can ARM/ARM64 use the same implementation as MIPS? Or is it OK if I just do
the MIPS part?
 
Huacai
 
-- Original --
From: Robin Murphy
Date: Mon, Sep 25, 2017 08:57 PM
Subject: Re: [PATCH V7 1/2] dma-mapping: Rework dma_get_cache_alignment()

Re: [PATCH] sd: Limit WRITE SAME / WRITE SAME(16) w/UNMAP length for certain devices

2017-09-25 Thread Martin K. Petersen

Ewan,

> Some devices do not support a WRITE SAME / WRITE SAME(16) with the
> UNMAP bit set up to the length specified in the MAXIMUM WRITE SAME
> LENGTH field in the block limits VPD page (or, the field is zero,
> indicating there is no limit).  Limit the length by the MAXIMUM UNMAP
> LBA COUNT value.  Otherwise the command might be rejected.

From SBC4:

  "A MAXIMUM UNMAP LBA COUNT field set to a non-zero value indicates the
  maximum number of LBAs that may be unmapped by an UNMAP command"

Note that it explicitly states "UNMAP command" and not "unmap
operation".

  "A MAXIMUM WRITE SAME LENGTH field set to a non-zero value indicates
  the maximum number of contiguous logical blocks that the device server
  allows to be unmapped or written in a single WRITE SAME command."

It says "unmapped or written" and "WRITE SAME command".

The spec is crystal clear. The device needs to be fixed. We can
blacklist older firmware revs.

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH V7 1/2] dma-mapping: Rework dma_get_cache_alignment()

2017-09-25 Thread 陈华才
Hi, Christoph,

Can I put the declaration in asm/dma-coherence.h?

And, last time you said it is OK to pass a NULL to dma_get_cache_alignment()
and cc all driver maintainers. I have done so.

Huacai

-- Original --
From: Christoph Hellwig
Date: Mon, Sep 25, 2017 08:51 PM
Subject: Re: [PATCH V7 1/2] dma-mapping: Rework dma_get_cache_alignment()

Re: [PATCH V7 1/2] dma-mapping: Rework dma_get_cache_alignment()

2017-09-25 Thread Christoph Hellwig
> index aba7138..e2c5d9e 100644
> --- a/arch/mips/include/asm/dma-mapping.h
> +++ b/arch/mips/include/asm/dma-mapping.h
> @@ -39,4 +39,6 @@ static inline void arch_setup_dma_ops(struct device *dev, 
> u64 dma_base,
>  #endif
>  }
>  
> +int mips_get_cache_alignment(struct device *dev);

All the other mips generic dma helpers are prefixed mips_dma_*
so it might make sense to follow that.

Also please don't add arch-local helpers to asm/dma-mapping.h - this
is a header used by linux/dma-mapping.h and should not contain
implementation details if avoidable.

> +dma_get_cache_alignment(NULL)) / mdev->limits.mtt_seg_size;

As said before - please don't pass NULL to this function but the proper
device, which would be &mdev->pdev->dev in this case for example.
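
In other words, after the rework a driver with a device at hand would do
something like the following (a hedged sketch; pdev stands for the driver's
PCI device and len for the buffer length):

	/* Per-device alignment instead of the global worst case. */
	int align = dma_get_cache_alignment(&pdev->dev);
	void *buf = kmalloc(ALIGN(len, align), GFP_KERNEL);

	/* Callers with no device keep passing NULL and get the old
	 * ARCH_DMA_MINALIGN behaviour. */
	int worst_case = dma_get_cache_alignment(NULL);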


Re: [PATCH V7 1/2] dma-mapping: Rework dma_get_cache_alignment()

2017-09-25 Thread Robin Murphy
On 25/09/17 10:46, Huacai Chen wrote:
> Make dma_get_cache_alignment() to accept a 'dev' argument. As a result,
> it can return different alignments due to different devices' I/O cache
> coherency.
> 
> Currently, MIPS is the only architecture which support coherent & non-
> coherent devices co-exist. This may be changed in the future, so add a
> new get_cache_alignment() function pointer in 'struct dma_map_ops' as a
> generic solution.

FWIW, ARM and arm64 have also supported per-device coherency for quite
some time.

> For compatibility (always return ARCH_DMA_MINALIGN), make all existing
> callers pass a NULL dev argument to dma_get_cache_alignment().
> 
> Cc: sta...@vger.kernel.org
> Signed-off-by: Huacai Chen 
> ---
>  arch/mips/cavium-octeon/dma-octeon.c   |  3 ++-
>  arch/mips/include/asm/dma-mapping.h|  2 ++
>  arch/mips/loongson64/common/dma-swiotlb.c  |  1 +
>  arch/mips/mm/dma-default.c | 11 ++-
>  arch/mips/netlogic/common/nlm-dma.c|  3 ++-
>  drivers/infiniband/hw/mthca/mthca_main.c   |  2 +-
>  drivers/media/v4l2-core/videobuf2-dma-contig.c |  2 +-
>  drivers/net/ethernet/broadcom/b44.c|  2 +-
>  drivers/net/ethernet/ibm/emac/core.h   |  2 +-
>  drivers/net/ethernet/mellanox/mlx4/main.c  |  2 +-
>  drivers/spi/spi-qup.c  |  4 ++--
>  drivers/tty/serial/mpsc.c  | 16 
>  drivers/tty/serial/samsung.c   | 14 +++---
>  include/linux/dma-mapping.h| 17 -
>  14 files changed, 51 insertions(+), 30 deletions(-)

I think it might be neater to split this into two patches - one making
the treewide prototype change, then introducing the .get_cache_alignemnt
callback separately - but that's only my personal preference.

Otherwise (and modulo Christoph's comments), I'd say we're nearly there.

Thanks,
Robin.

> diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c
> index c64bd87..7978237 100644
> --- a/arch/mips/cavium-octeon/dma-octeon.c
> +++ b/arch/mips/cavium-octeon/dma-octeon.c
> @@ -324,7 +324,8 @@ static struct octeon_dma_map_ops _octeon_pci_dma_map_ops = {
>   .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
>   .sync_sg_for_device = octeon_dma_sync_sg_for_device,
>   .mapping_error = swiotlb_dma_mapping_error,
> - .dma_supported = swiotlb_dma_supported
> + .dma_supported = swiotlb_dma_supported,
> + .get_cache_alignment = mips_get_cache_alignment
>   },
>  };
>  
> diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
> index aba7138..e2c5d9e 100644
> --- a/arch/mips/include/asm/dma-mapping.h
> +++ b/arch/mips/include/asm/dma-mapping.h
> @@ -39,4 +39,6 @@ static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base,
>  #endif
>  }
>  
> +int mips_get_cache_alignment(struct device *dev);
> +
>  #endif /* _ASM_DMA_MAPPING_H */
> diff --git a/arch/mips/loongson64/common/dma-swiotlb.c b/arch/mips/loongson64/common/dma-swiotlb.c
> index 34486c1..09cb8a4 100644
> --- a/arch/mips/loongson64/common/dma-swiotlb.c
> +++ b/arch/mips/loongson64/common/dma-swiotlb.c
> @@ -119,6 +119,7 @@ static const struct dma_map_ops loongson_dma_map_ops = {
>   .sync_sg_for_device = loongson_dma_sync_sg_for_device,
>   .mapping_error = swiotlb_dma_mapping_error,
>   .dma_supported = loongson_dma_supported,
> + .get_cache_alignment = mips_get_cache_alignment
>  };
>  
>  void __init plat_swiotlb_setup(void)
> diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
> index c01bd20..c49987e 100644
> --- a/arch/mips/mm/dma-default.c
> +++ b/arch/mips/mm/dma-default.c
> @@ -394,6 +394,14 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
>  
>  EXPORT_SYMBOL(dma_cache_sync);
>  
> +int mips_get_cache_alignment(struct device *dev)
> +{
> + if (plat_device_is_coherent(dev))
> + return 1;
> + else
> + return ARCH_DMA_MINALIGN;
> +}
> +
>  static const struct dma_map_ops mips_default_dma_map_ops = {
>   .alloc = mips_dma_alloc_coherent,
>   .free = mips_dma_free_coherent,
> @@ -407,7 +415,8 @@ static const struct dma_map_ops mips_default_dma_map_ops = {
>   .sync_sg_for_cpu = mips_dma_sync_sg_for_cpu,
>   .sync_sg_for_device = mips_dma_sync_sg_for_device,
>   .mapping_error = mips_dma_mapping_error,
> - .dma_supported = mips_dma_supported
> + .dma_supported = mips_dma_supported,
> + .get_cache_alignment = mips_get_cache_alignment
>  };
>  
>  const struct dma_map_ops *mips_dma_map_ops = &mips_default_dma_map_ops;
> diff --git a/arch/mips/netlogic/common/nlm-dma.c b/arch/mips/netlogic/common/nlm-dma.c
> index 0ec9d9d..1e107ac 100644
> --- a/arch/mips/netlogic/common/nlm-dma.c
> +++ b/arch/mips/netlogic/common/nlm-dma.c
> @@ -79,7 +79,8 @@ const struct dma_map_ops nlm_swiotlb_dma_ops = {

[PATCH V7 2/2] scsi: Align block queue to dma_get_cache_alignment()

2017-09-25 Thread Huacai Chen
In non-coherent DMA mode, kernel uses cache flushing operations to
maintain I/O coherency, so scsi's block queue should be aligned to
ARCH_DMA_MINALIGN. Otherwise, it will cause data corruption, at least
on MIPS:

Step 1, dma_map_single
Step 2, cache_invalidate (no writeback)
Step 3, dma_from_device
Step 4, dma_unmap_single

If a DMA buffer and a kernel structure share a same cache line, and if
the kernel structure has dirty data, cache_invalidate (no writeback)
will cause data lost.

Cc: sta...@vger.kernel.org
Signed-off-by: Huacai Chen 
---
 drivers/scsi/scsi_lib.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9cf6a80..19abc2e 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2132,11 +2132,11 @@ void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
q->limits.cluster = 0;
 
/*
-* set a reasonable default alignment on word boundaries: the
-* host and device may alter it using
+* set a reasonable default alignment on word/cacheline boundaries:
+* the host and device may alter it using
 * blk_queue_update_dma_alignment() later.
 */
-   blk_queue_dma_alignment(q, 0x03);
+   blk_queue_dma_alignment(q, max(4, dma_get_cache_alignment(dev)) - 1);
 }
 EXPORT_SYMBOL_GPL(__scsi_init_queue);
 
-- 
2.7.0
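
An illustration of the hazard described above (not kernel code; the 64-byte
cache line size and the layouts are assumed): if a CPU-written field and a
DMA buffer share one cache line, the invalidate in step 2 discards the dirty
CPU write.

struct bad_layout {
	char cpu_flag;		/* dirtied by the CPU */
	char dma_buf[63];	/* device DMA target in the same line */
};

struct good_layout {
	char cpu_flag;
	/* Aligning the DMA buffer to the cache-line size keeps CPU state
	 * and DMA data in separate lines, so invalidation is safe. */
	char dma_buf[64] __attribute__((aligned(64)));
};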





Re: [PATCH 07/19] lpfc: Move CQ processing to a soft IRQ

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 08/19] lpfc: Fix FCP hba_wqidx assignment

2017-09-25 Thread Johannes Thumshirn
Again, if this is fixing an oops it should be tagged for inclusion
into stable.

Byte,
Johannes
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 09/19] lpfc: Reduce log spew on controller reconnects

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 10/19] lpfc: Set missing abort context

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 12/19] lpfc: Fix oops if nvmet_fc_register_targetport fails

2017-09-25 Thread Johannes Thumshirn
CC: stable?

Anyways,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 11/19] lpfc: Revise NVME module parameter descriptions for better clarity

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 13/19] lpfc: Disable NPIV support if NVME is enabled

2017-09-25 Thread Johannes Thumshirn
On Thu, Sep 21, 2017 at 11:17:35PM -0700, James Smart wrote:
> Support for NPIV with NVME will be added in the near future.

Sounds like the future is bright,
Reviewed-by: Johannes Thumshirn 

-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 14/19] lpfc: Fix crash in lpfc_nvme_fcp_io_submit during LIP

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 16/19] lpfc: Ensure io aborts interlocked with the target.

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 17/19] lpfc: Extend RDP support

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 15/19] lpfc: Fix secure firmware updates

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 18/19] lpfc: Fix oops of nvme host during driver unload.

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 

But again, this should be considered for inclusion into stable
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 19/19] lpfc: change version to 11.4.0.4

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: scsi: fix the issue that iscsi_if_rx doesn't parse nlmsg properly

2017-09-25 Thread Vladis Dronov
hello,

additional research shows that the very latest kernels do not show
the crash with the reproducer. git bisect showed that:

commit 7f564528a480084e2318cd48caba7aef4a54a77f is the first commit (between
v4.11 and v4.12-rc1) with which the crash is not reproduced:

commit 7f564528a480084e2318cd48caba7aef4a54a77f
Author: Steffen Klassert 
Date:   Sat Apr 8 20:36:24 2017 +0200
skbuff: Extend gso_type to unsigned int.

i.e. this is the commit which fixed the crash. Checking the code, it looks like
struct skb_shared_info's fields were reordered, so the field whose overwrite
was causing the panic has been moved. Nevertheless, the buffer overwrite is still
there, so the suggested patch 9923803 (or its later version) is still needed.

for a proof compare a flaw description:

> ev = nlmsg_data(nlh) will actually get skb_shinfo(SKB) instead and set a
> new value to skb_shinfo(SKB)->nr_frags by ev->type.

and the commit message:

>The remaining two byte hole is moved to the
>beginning of the structure, this protects us
>from immediate overwrites on out of bound writes
>to the sk_buff head.
> 
>Structure layout on x86-64 before the change:
> 
>struct skb_shared_info {
>unsigned char  nr_frags;
>__u8   tx_flags;
> 
>Structure layout on x86-64 after the change:
> 
>struct skb_shared_info {
>short unsigned int _unused;
>unsigned char  nr_frags;
>__u8   tx_flags;

Best regards,
Vladis Dronov | Red Hat, Inc. | Product Security Engineer

- Original Message -
From: Xin Long 
To: linux-scsi@vger.kernel.org
Sent: Sun, 27 Aug 2017 20:25:26 +0800
Subject: scsi: fix the issue that iscsi_if_rx doesn't parse nlmsg properly

> ChunYu found a kernel crash by syzkaller:
> 
> [  651.617875] kasan: CONFIG_KASAN_INLINE enabled
> [  651.618217] kasan: GPF could be caused by NULL-ptr deref or user memory access
> [  651.618731] general protection fault:  [#1] SMP KASAN
> [  651.621543] CPU: 1 PID: 9539 Comm: scsi Not tainted 4.11.0.cov #32
> [  651.621938] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> [  651.622309] task: 88011778 task.stack: 8800a3188000
> [  651.622762] RIP: 0010:skb_release_data+0x26c/0x590
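
For context, a hedged sketch of the kind of length validation the suggested
patch adds to the receive loop in iscsi_if_rx(); names follow the quoted
discussion:

	struct nlmsghdr *nlh = nlmsg_hdr(skb);

	/* Reject messages too short to carry a full iscsi_uevent, so that
	 * ev = nlmsg_data(nlh) can never point past the real payload into
	 * skb_shared_info. */
	if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct iscsi_uevent)) ||
	    skb->len < nlh->nlmsg_len)
		break;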



Re: [PATCH 1/1] bsg-lib: fix use-after-free under memory-pressure

2017-09-25 Thread Christoph Hellwig
>   if (!q)
>   return ERR_PTR(-ENOMEM);
>   q->cmd_size = sizeof(struct bsg_job) + dd_job_size;
> - q->init_rq_fn = bsg_init_rq;
> - q->exit_rq_fn = bsg_exit_rq;
> + q->init_rq_fn = bsg_init_job;
> + q->exit_rq_fn = bsg_exit_job;
> + q->initialize_rq_fn = bsg_init_rq;

Please use function names that match the method names, that is keep
the existing names and name the new helper bsg_initialize_rq;

Except for that the patch looks fine to me:

Reviewed-by: Christoph Hellwig 
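
Concretely, the naming asked for keeps the assignments aligned with the
method names, roughly:

	q->init_rq_fn = bsg_init_rq;			/* .init_rq_fn */
	q->exit_rq_fn = bsg_exit_rq;			/* .exit_rq_fn */
	q->initialize_rq_fn = bsg_initialize_rq;	/* .initialize_rq_fn */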


Re: [PATCH 1/1] bsg-lib: fix use-after-free under memory-pressure

2017-09-25 Thread Benjamin Block
On Mon, Sep 25, 2017 at 08:53:07AM -0700, Christoph Hellwig wrote:
> > if (!q)
> > return ERR_PTR(-ENOMEM);
> > q->cmd_size = sizeof(struct bsg_job) + dd_job_size;
> > -   q->init_rq_fn = bsg_init_rq;
> > -   q->exit_rq_fn = bsg_exit_rq;
> > +   q->init_rq_fn = bsg_init_job;
> > +   q->exit_rq_fn = bsg_exit_job;
> > +   q->initialize_rq_fn = bsg_init_rq;
> 
> Please use function names that match the method names, that is keep
> the existing names and name the new helper bsg_initialize_rq;
>

OK, I can change that.


Beste Grüße / Best regards,
  - Benjamin Block

> 
> Except for that the patch looks fine to me:
> 
> Reviewed-by: Christoph Hellwig 
> 

-- 
Linux on z Systems Development / IBM Systems & Technology Group
  IBM Deutschland Research & Development GmbH 
Vorsitz. AufsR.: Martina Koederitz /Geschäftsführung: Dirk Wittkopp
Sitz der Gesellschaft: Böblingen / Registergericht: AmtsG Stuttgart, HRB 243294



Re: [PATCH] scsi: fix the issue that iscsi_if_rx doesn't parse nlmsg properly

2017-09-25 Thread Martin K. Petersen

Xin,

> ChunYu found a kernel crash by syzkaller:

[...]

> It's caused by skb_shared_info at the end of sk_buff was overwritten by
> ISCSI_KEVENT_IF_ERROR when parsing nlmsg info from skb in iscsi_if_rx.
>
> During the loop if skb->len == nlh->nlmsg_len and both are sizeof(*nlh),
> ev = nlmsg_data(nlh) will actually get skb_shinfo(SKB) instead and set a
> new value to skb_shinfo(SKB)->nr_frags by ev->type.
>
> This patch is to fix it by checking nlh->nlmsg_len properly there to
> avoid over accessing sk_buff.

Applied to 4.14/scsi-fixes. Thank you!

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH] scsi: ufs: Make use of UFS_BIT macro wherever possible

2017-09-25 Thread Martin K. Petersen

Alim,

> Should I drop this patch and send another one which removes UFS_BIT()
> macro?

I fail to see the point of UFS_BIT(). So yes.

Please make sure to CC: Subhash on ufs changes.

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH] scsi: csiostor: enable PCIe relax ordering if supported

2017-09-25 Thread Martin K. Petersen

Varun,

> Set PCIe relax ordering bits in FW_IQ_CMD if relax ordering is enabled
> in the PCIe device.

Applied to 4.15/scsi-queue. Thank you!

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH V3 1/2] scsi: sd: Fix sd_config_write_same()

2017-09-25 Thread Martin K. Petersen

Damien,

> Reporting a maximum number of blocks that is not aligned on the device
> physical size would cause a large write same request to be split into
> physically unaligned chunks by __blkdev_issue_write_zeroes() and
> __blkdev_issue_write_same(), even if the caller of these functions
> took care to align its request to physical sectors.

Applied to 4.15/scsi-queue. Thanks!

-- 
Martin K. Petersen  Oracle Linux Engineering
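
As a worked illustration of the alignment issue Damien describes (numbers
are hypothetical): with 4096-byte physical sectors (8 logical blocks each)
and a reported write-same maximum of 65535 blocks, a 65536-block request is
split into chunks of 65535 and 1 blocks, both physically misaligned; rounding
the reported maximum down to 65528 (= 8 * 8191) keeps every split aligned.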


Re: [PATCH V5 10/14] block: mq-deadline: Add zoned block device data

2017-09-25 Thread Bart Van Assche
On Mon, 2017-09-25 at 15:14 +0900, Damien Le Moal wrote:
> Modify mq-dealine init_queue and exit_queue elevator methods to handle
 ^^
 mq-deadline ?

> +static int deadline_init_zones_wlock(struct request_queue *q,
> +  struct deadline_data *dd)
> +{
> + /*
> +  * For regular drives or non-conforming zoned block device,
> +  * do not use zone write locking.
> +  */
> + if (!blk_queue_nr_zones(q))
> + return 0;
> +
> + /*
> +  * Treat host aware drives as regular disks.
> +  */
> + if (blk_queue_zoned_model(q) != BLK_ZONED_HM)
> + return 0;
> +
> + dd->zones_wlock = kzalloc_node(BITS_TO_LONGS(blk_queue_nr_zones(q))
> +* sizeof(unsigned long),
> +GFP_KERNEL, q->node);

A request queue is created before disk validation occurs and before the
number of zones is initialized (sd_probe_async()). If a scheduler is
assigned to a ZBC drive through a udev rule, can it happen that
deadline_init_zones_wlock() is called before the number of zones has been
initialized?

Bart.

Re: [PATCH] scsi: aacraid: Add a small delay after IOP reset

2017-09-25 Thread Martin K. Petersen

Guilherme,

> James/Martin, am I expected to send a v2 with some change? Perhaps
> with Dave's ack?  Sorry to annoy, thanks in advance for any advice!

I was just about to mail Dave and ask for confirmation that your
interpretation of the controller behavior is correct.

Dave?

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH V2 0/9] pm80xx updates

2017-09-25 Thread Martin K. Petersen

Viswas,

> Do we need to send a V3 patch set with the corrected date?

Yes, please. And address Jack's/kbuild robot's comments.

Thanks!

-- 
Martin K. Petersen  Oracle Linux Engineering


[PATCH V7 1/2] dma-mapping: Rework dma_get_cache_alignment()

2017-09-25 Thread Huacai Chen
Make dma_get_cache_alignment() to accept a 'dev' argument. As a result,
it can return different alignments due to different devices' I/O cache
coherency.

Currently, MIPS is the only architecture which support coherent & non-
coherent devices co-exist. This may be changed in the future, so add a
new get_cache_alignment() function pointer in 'struct dma_map_ops' as a
generic solution.

For compatibility (always return ARCH_DMA_MINALIGN), make all existing
callers pass a NULL dev argument to dma_get_cache_alignment().

Cc: sta...@vger.kernel.org
Signed-off-by: Huacai Chen 
---
 arch/mips/cavium-octeon/dma-octeon.c   |  3 ++-
 arch/mips/include/asm/dma-mapping.h|  2 ++
 arch/mips/loongson64/common/dma-swiotlb.c  |  1 +
 arch/mips/mm/dma-default.c | 11 ++-
 arch/mips/netlogic/common/nlm-dma.c|  3 ++-
 drivers/infiniband/hw/mthca/mthca_main.c   |  2 +-
 drivers/media/v4l2-core/videobuf2-dma-contig.c |  2 +-
 drivers/net/ethernet/broadcom/b44.c|  2 +-
 drivers/net/ethernet/ibm/emac/core.h   |  2 +-
 drivers/net/ethernet/mellanox/mlx4/main.c  |  2 +-
 drivers/spi/spi-qup.c  |  4 ++--
 drivers/tty/serial/mpsc.c  | 16 
 drivers/tty/serial/samsung.c   | 14 +++---
 include/linux/dma-mapping.h| 17 -
 14 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c
index c64bd87..7978237 100644
--- a/arch/mips/cavium-octeon/dma-octeon.c
+++ b/arch/mips/cavium-octeon/dma-octeon.c
@@ -324,7 +324,8 @@ static struct octeon_dma_map_ops _octeon_pci_dma_map_ops = {
.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
.sync_sg_for_device = octeon_dma_sync_sg_for_device,
.mapping_error = swiotlb_dma_mapping_error,
-   .dma_supported = swiotlb_dma_supported
+   .dma_supported = swiotlb_dma_supported,
+   .get_cache_alignment = mips_get_cache_alignment
},
 };
 
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index aba7138..e2c5d9e 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -39,4 +39,6 @@ static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base,
 #endif
 }
 
+int mips_get_cache_alignment(struct device *dev);
+
 #endif /* _ASM_DMA_MAPPING_H */
diff --git a/arch/mips/loongson64/common/dma-swiotlb.c b/arch/mips/loongson64/common/dma-swiotlb.c
index 34486c1..09cb8a4 100644
--- a/arch/mips/loongson64/common/dma-swiotlb.c
+++ b/arch/mips/loongson64/common/dma-swiotlb.c
@@ -119,6 +119,7 @@ static const struct dma_map_ops loongson_dma_map_ops = {
.sync_sg_for_device = loongson_dma_sync_sg_for_device,
.mapping_error = swiotlb_dma_mapping_error,
.dma_supported = loongson_dma_supported,
+   .get_cache_alignment = mips_get_cache_alignment
 };
 
 void __init plat_swiotlb_setup(void)
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index c01bd20..c49987e 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -394,6 +394,14 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 
 EXPORT_SYMBOL(dma_cache_sync);
 
+int mips_get_cache_alignment(struct device *dev)
+{
+   if (plat_device_is_coherent(dev))
+   return 1;
+   else
+   return ARCH_DMA_MINALIGN;
+}
+
 static const struct dma_map_ops mips_default_dma_map_ops = {
.alloc = mips_dma_alloc_coherent,
.free = mips_dma_free_coherent,
@@ -407,7 +415,8 @@ static const struct dma_map_ops mips_default_dma_map_ops = {
.sync_sg_for_cpu = mips_dma_sync_sg_for_cpu,
.sync_sg_for_device = mips_dma_sync_sg_for_device,
.mapping_error = mips_dma_mapping_error,
-   .dma_supported = mips_dma_supported
+   .dma_supported = mips_dma_supported,
+   .get_cache_alignment = mips_get_cache_alignment
 };
 
 const struct dma_map_ops *mips_dma_map_ops = &mips_default_dma_map_ops;
diff --git a/arch/mips/netlogic/common/nlm-dma.c b/arch/mips/netlogic/common/nlm-dma.c
index 0ec9d9d..1e107ac 100644
--- a/arch/mips/netlogic/common/nlm-dma.c
+++ b/arch/mips/netlogic/common/nlm-dma.c
@@ -79,7 +79,8 @@ const struct dma_map_ops nlm_swiotlb_dma_ops = {
.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
.sync_sg_for_device = swiotlb_sync_sg_for_device,
.mapping_error = swiotlb_dma_mapping_error,
-   .dma_supported = swiotlb_dma_supported
+   .dma_supported = swiotlb_dma_supported,
+   .get_cache_alignment = mips_get_cache_alignment
 };
 
 void __init plat_swiotlb_setup(void)
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
index e36a9bc..cac5fac 100644
--- a/drivers/infiniband/hw/mthca/mthca_main.c
+++ 

Re: [PATCH V5 06/14] block: Add zoned block device information to request queue

2017-09-25 Thread Ming Lei
On Mon, Sep 25, 2017 at 03:14:46PM +0900, Damien Le Moal wrote:
> Components relying only on the request_queue structure for accessing
> block devices (e.g. I/O schedulers) have a limited knowledge of the
> device characteristics. In particular, the device capacity cannot be
> easily discovered, which for a zoned block device also results in the
> inability to easily know the number of zones of the device (the zone
> size is indicated by the chunk_sectors field of the queue limits).
> 
> Introduce the nr_zones field to the request_queue structure to simplify
> access to this information. Also, add the bitmap seq_zone_bitmap which
> indicates which zones of the device are sequential zones (write
> preferred or write required). These two fields are initialized by the
> low level block device driver (sd.c for ZBC/ZAC disks). They are not
> initialized by stacking drivers (device mappers) handling zoned block
> devices (e.g. dm-linear).
> 
> Signed-off-by: Damien Le Moal 
> Reviewed-by: Christoph Hellwig 

Reviewed-by: Ming Lei 

-- 
Ming


Re: [PATCH 01/19] lpfc: Fix crash in pci hot plug situations

2017-09-25 Thread Johannes Thumshirn
This probably should be two patches and both CCed to stable.

Byte,
Johannes
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 02/19] lpfc: Fix crash receiving ELS while detaching driver

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 

But should probably go into stable as well.
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 04/19] lpfc: Fix warning messages when NVME_TARGET_FC not defined

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 03/19] lpfc: Fix lpfc nvme host rejecting IO with Not Ready message

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 05/19] lpfc: PLOGI failures during NPIV testing

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 

But maybe worth adding a:
Fixes: 4042629e426d ("[SCSI] lpfc 8.3.20: Updates to FC discovery commands")

Byte,
Johannes
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


RE: [PATCH V2] megaraid: kmemleak: Track page allocation for fusion

2017-09-25 Thread Shivasharan Srikanteshwara
> -Original Message-
> From: Christoph Hellwig [mailto:h...@infradead.org]
> Sent: Monday, September 18, 2017 9:52 PM
> To: Shivasharan Srikanteshwara
> Cc: Christoph Hellwig; shuw...@redhat.com; Kashyap Desai; Sumit Saxena;
> j...@linux.vnet.ibm.com; martin.peter...@oracle.com;
> PDL,MEGARAIDLINUX; linux-scsi@vger.kernel.org; linux-
> ker...@vger.kernel.org; ch...@redhat.com; yiz...@redhat.com;
> catalin.mari...@arm.com
> Subject: Re: [PATCH V2] megaraid: kmemleak: Track page allocation for fusion
>
> Oh, I missed log_to_span.  Well, in that case log_to_span is _the_ candidate
> for moving into a separate allocation.
>
> And in fact you're probably better off by using a sensible data structure
> for it, e.g. a radix tree.

Thanks Christoph.
We will make the suggested changes in a phased approach.
First we will fix the kmemleak false positives by moving the log_to_span
allocation out of fusion_context.
The data structure change would be a major change which affects the I/O
path as well.
Also, the driver expects log_to_span and other data structures to be
available at load time itself. Considering this, we need to understand
whether a radix tree would be a good choice for the change.
Based on internal discussions, we see other similar arrays in the driver
code that we can change similarly, e.g. load_balance_info.
This is definitely something to add to our to-do list.
These changes need to go through our internal regression test cycle
before being submitted upstream.

Best regards,
Shivasharan
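
A hedged sketch of the first step described above, splitting log_to_span out
of fusion_context; the identifiers follow the megaraid_sas driver, but the
error handling and shapes are illustrative:

	fusion = kzalloc(sizeof(*fusion), GFP_KERNEL);
	if (!fusion)
		return -ENOMEM;

	/* A separate allocation gives kmemleak a precisely sized object
	 * to scan instead of one huge multi-page block. */
	fusion->log_to_span = kcalloc(MAX_LOGICAL_DRIVES_EXT,
				      sizeof(*fusion->log_to_span),
				      GFP_KERNEL);
	if (!fusion->log_to_span) {
		kfree(fusion);
		return -ENOMEM;
	}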


Re: [PATCH V4 02/16] scsi: sd_zbc: Fix comments and indentation

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH V4 03/16] scsi: sd_zbc: Rearrange code

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


[PATCH] scsi_transport_fc: Also check for NOTPRESENT in fc_remote_port_add()

2017-09-25 Thread Hannes Reinecke
During failover there is a small race window between fc_remote_port_add()
and fc_timeout_deleted_rport(); the latter drops the lock after setting
the port to NOTPRESENT, so if fc_remote_port_add() is called right at
that time it will fail to detect the existing rport and happily add
a new structure, causing rports to get registered twice.

Signed-off-by: Hannes Reinecke 
---
 drivers/scsi/scsi_transport_fc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 3c6bc00..9d2d559 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -2739,7 +2739,8 @@ struct fc_rport *
 
list_for_each_entry(rport, &fc_host->rports, peers) {
 
-   if ((rport->port_state == FC_PORTSTATE_BLOCKED) &&
+   if ((rport->port_state == FC_PORTSTATE_BLOCKED ||
+rport->port_state == FC_PORTSTATE_NOTPRESENT) &&
(rport->channel == channel)) {
 
switch (fc_host->tgtid_bind_type) {
-- 
1.8.5.6



Re: [PATCH] scsi_transport_fc: Also check for NOTPRESENT in fc_remote_port_add()

2017-09-25 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


[PATCH] scsi: fc: check for rport presence in fc_block_scsi_eh

2017-09-25 Thread Johannes Thumshirn
Coverity-scan recently found a possible NULL pointer dereference in
fc_block_scsi_eh() as starget_to_rport() either returns the rport for
the starget or NULL.

While it is rather unlikely to have fc_block_scsi_eh() called without
an rport associated, it's a good idea to catch potential misuses of the
API gracefully.

Signed-off-by: Johannes Thumshirn 
---
 drivers/scsi/scsi_transport_fc.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index ba9d70f8a6a1..830ce53f30fb 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -3328,6 +3328,9 @@ int fc_block_scsi_eh(struct scsi_cmnd *cmnd)
 {
struct fc_rport *rport = starget_to_rport(scsi_target(cmnd->device));
 
+   if (WARN_ON(!rport))
+   return 0;
+
return fc_block_rport(rport);
 }
 EXPORT_SYMBOL(fc_block_scsi_eh);
-- 
2.13.5



[PATCH V5 04/14] scsi: sd_zbc: Use well defined macros

2017-09-25 Thread Damien Le Moal
Instead of open coding, use the min() macro to calculate a report zones
reply buffer length in sd_zbc_check_zone_size() and the round_up()
macro for calculating the number of zones in sd_zbc_setup().

No functional change is introduced by this patch.

Signed-off-by: Damien Le Moal 
Reviewed-by: Johannes Thumshirn 
Reviewed-by: Bart Van Assche 
Reviewed-by: Christoph Hellwig 
---
 drivers/scsi/sd_zbc.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 7dbaf920679e..bbad851c1789 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -475,7 +475,7 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, unsigned char *buf)
return 0;
 }
 
-#define SD_ZBC_BUF_SIZE 131072
+#define SD_ZBC_BUF_SIZE 131072U
 
 /**
  * sd_zbc_check_zone_size - Check the device zone sizes
@@ -526,10 +526,7 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
/* Parse REPORT ZONES header */
list_length = get_unaligned_be32(&buf[0]) + 64;
rec = buf + 64;
-   if (list_length < SD_ZBC_BUF_SIZE)
-   buf_len = list_length;
-   else
-   buf_len = SD_ZBC_BUF_SIZE;
+   buf_len = min(list_length, SD_ZBC_BUF_SIZE);
 
/* Parse zone descriptors */
while (rec < buf + buf_len) {
@@ -599,9 +596,8 @@ static int sd_zbc_setup(struct scsi_disk *sdkp)
/* chunk_sectors indicates the zone size */
blk_queue_chunk_sectors(sdkp->disk->queue,
logical_to_sectors(sdkp->device, sdkp->zone_blocks));
-   sdkp->nr_zones = sdkp->capacity >> sdkp->zone_shift;
-   if (sdkp->capacity & (sdkp->zone_blocks - 1))
-   sdkp->nr_zones++;
+   sdkp->nr_zones =
+   round_up(sdkp->capacity, sdkp->zone_blocks) >> sdkp->zone_shift;
 
if (!sdkp->zones_wlock) {
sdkp->zones_wlock = kcalloc(BITS_TO_LONGS(sdkp->nr_zones),
-- 
2.13.5



[PATCH V5 11/14] block: mq-deadline: Introduce dispatch helpers

2017-09-25 Thread Damien Le Moal
Avoid directly referencing the next_rq and fifo_list arrays using the
helper functions deadline_next_request() and deadline_fifo_request() to
facilitate changes in the dispatch request selection in
__dd_dispatch_request().

Signed-off-by: Damien Le Moal 
---
 block/mq-deadline.c | 45 +
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index af2eb9b3936e..296880e2471f 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -195,13 +195,42 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
 }
 
 /*
+ * For the specified data direction, return the next request to
+ * dispatch using arrival ordered lists.
+ */
+static struct request *
+deadline_fifo_request(struct deadline_data *dd, int data_dir)
+{
+   if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+   return NULL;
+
+   if (list_empty(&dd->fifo_list[data_dir]))
+   return NULL;
+
+   return rq_entry_fifo(dd->fifo_list[data_dir].next);
+}
+
+/*
+ * For the specified data direction, return the next request to
+ * dispatch using sector position sorted lists.
+ */
+static struct request *
+deadline_next_request(struct deadline_data *dd, int data_dir)
+{
+   if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+   return NULL;
+
+   return dd->next_rq[data_dir];
+}
+
+/*
  * deadline_dispatch_requests selects the best request according to
  * read/write expire, fifo_batch, etc
  */
 static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
-   struct request *rq;
+   struct request *rq, *next_rq;
bool reads, writes;
int data_dir;
 
@@ -217,10 +246,9 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
/*
 * batches are currently reads XOR writes
 */
-   if (dd->next_rq[WRITE])
-   rq = dd->next_rq[WRITE];
-   else
-   rq = dd->next_rq[READ];
+   rq = deadline_next_request(dd, WRITE);
+   if (!rq)
+   rq = deadline_next_request(dd, READ);
 
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request are still entitled to batch */
@@ -263,19 +291,20 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
/*
 * we are not running a batch, find best request for selected data_dir
 */
-   if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+   next_rq = deadline_next_request(dd, data_dir);
+   if (deadline_check_fifo(dd, data_dir) || !next_rq) {
/*
 * A deadline has expired, the last request was in the other
 * direction, or we have run out of higher-sectored requests.
 * Start again from the request with the earliest expiry time.
 */
-   rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+   rq = deadline_fifo_request(dd, data_dir);
} else {
/*
 * The last req was the same dir and we have a next request in
 * sort order. No expired requests so continue on from here.
 */
-   rq = dd->next_rq[data_dir];
+   rq = next_rq;
}
 
dd->batching = 0;
-- 
2.13.5



[PATCH V5 13/14] block: mq-deadline: Limit write request dispatch for zoned block devices

2017-09-25 Thread Damien Le Moal
When dispatching writes to a zoned block device, only allow the request
to be dispatched if its target zone is not locked. If it is, leave the
request in the scheduler queue and look for another suitable write
request. If no write can be dispatched, allow reads to be dispatched
even if the write batch is not done.
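
In rough terms, the dispatch decision implemented by this patch is (a
simplified sketch, not the exact code):

	/*
	 * Per dispatch attempt (simplified):
	 *  1. Pick the next write candidate, in FIFO or sector order.
	 *  2. If its target zone is write-locked, try the next candidate.
	 *  3. If no write can be dispatched, return NULL so that reads
	 *     get a chance even though the write batch is not done.
	 */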

Signed-off-by: Damien Le Moal 
---
 block/mq-deadline.c | 62 +
 1 file changed, 58 insertions(+), 4 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 186c32099845..fc3e50a0a495 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -282,19 +282,47 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
 }
 
 /*
+ * Test if a request can be dispatched.
+ */
+static inline bool deadline_can_dispatch_request(struct deadline_data *dd,
+struct request *rq)
+{
+   if (!deadline_request_needs_zone_wlock(dd, rq))
+   return true;
+   return !deadline_zone_is_wlocked(dd, rq);
+}
+
+/*
  * For the specified data direction, return the next request to
  * dispatch using arrival ordered lists.
  */
 static struct request *
 deadline_fifo_request(struct deadline_data *dd, int data_dir)
 {
+   struct request *rq;
+   unsigned long flags;
+
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
 
if (list_empty(>fifo_list[data_dir]))
return NULL;
 
-   return rq_entry_fifo(dd->fifo_list[data_dir].next);
+   if (!dd->zones_wlock || data_dir == READ)
+   return rq_entry_fifo(dd->fifo_list[data_dir].next);
+
+   spin_lock_irqsave(&dd->zone_lock, flags);
+
+   list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
+   if (deadline_can_dispatch_request(dd, rq))
+   goto out;
+   }
+   rq = NULL;
+
+out:
+   spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+   return rq;
 }
 
 /*
@@ -304,10 +332,25 @@ deadline_next_request(struct deadline_data *dd, int data_dir)
 static struct request *
 deadline_next_request(struct deadline_data *dd, int data_dir)
 {
+   struct request *rq;
+   unsigned long flags;
+
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
 
-   return dd->next_rq[data_dir];
+   rq = dd->next_rq[data_dir];
+   if (!dd->zones_wlock || data_dir == READ)
+   return rq;
+
+   spin_lock_irqsave(&dd->zone_lock, flags);
+   while (rq) {
+   if (deadline_can_dispatch_request(dd, rq))
+   break;
+   rq = deadline_latter_request(rq);
+   }
+   spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+   return rq;
 }
 
 /*
@@ -349,7 +392,8 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
if (reads) {
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
 
-   if (writes && (dd->starved++ >= dd->writes_starved))
+   if (deadline_fifo_request(dd, WRITE) &&
+   (dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
 
data_dir = READ;
@@ -394,6 +438,13 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
rq = next_rq;
}
 
+   /*
+* If we only have writes queued and none of them can be dispatched,
+* rq will be NULL.
+*/
+   if (!rq)
+   return NULL;
+
dd->batching = 0;
 
 dispatch_request:
@@ -560,7 +611,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 
/*
 * This may be a requeue of a request that has locked its
-* target zone. If this is the case, release the request zone lock.
+* target zone. If this is the case, release the zone lock.
 */
if (deadline_request_has_zone_wlock(rq))
deadline_wunlock_zone(dd, rq);
@@ -570,6 +621,9 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 
blk_mq_sched_request_inserted(rq);
 
+   if (at_head && deadline_request_needs_zone_wlock(dd, rq))
+   pr_info(" Write at head !\n");
+
if (at_head || blk_rq_is_passthrough(rq)) {
if (at_head)
list_add(&rq->queuelist, &dd->dispatch);
-- 
2.13.5



[PATCH V5 05/14] scsi: sd_zbc: Fix sd_zbc_read_zoned_characteristics()

2017-09-25 Thread Damien Le Moal
The three values starting at byte 8 of the Zoned Block Device
Characteristics VPD page B6h are 32-bit values, not 64-bit. So use
get_unaligned_be32() to retrieve them and not get_unaligned_be64().
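
For reference, the fields at the offsets used here are, per ZBC, each a
32-bit big-endian value:

	buf[8..11]   optimal number of open sequential write preferred zones
	buf[12..15]  optimal number of non-sequentially written sequential
	             write preferred zones
	buf[16..19]  maximum number of open sequential write required zones

A 64-bit read at offset 8 would thus merge two adjacent 32-bit fields.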

Fixes: 89d947561077 ("sd: Implement support for ZBC devices")
Cc: 

Signed-off-by: Damien Le Moal 
Reviewed-by: Bart Van Assche 
Reviewed-by: Johannes Thumshirn 
Reviewed-by: Christoph Hellwig 
---
 drivers/scsi/sd_zbc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index bbad851c1789..27793b9f54c0 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -423,15 +423,15 @@ static int sd_zbc_read_zoned_characteristics(struct scsi_disk *sdkp,
if (sdkp->device->type != TYPE_ZBC) {
/* Host-aware */
sdkp->urswrz = 1;
-   sdkp->zones_optimal_open = get_unaligned_be64(&buf[8]);
-   sdkp->zones_optimal_nonseq = get_unaligned_be64(&buf[12]);
+   sdkp->zones_optimal_open = get_unaligned_be32(&buf[8]);
+   sdkp->zones_optimal_nonseq = get_unaligned_be32(&buf[12]);
sdkp->zones_max_open = 0;
} else {
/* Host-managed */
sdkp->urswrz = buf[4] & 1;
sdkp->zones_optimal_open = 0;
sdkp->zones_optimal_nonseq = 0;
-   sdkp->zones_max_open = get_unaligned_be64(&buf[16]);
+   sdkp->zones_max_open = get_unaligned_be32(&buf[16]);
}
 
return 0;
-- 
2.13.5



[PATCH V5 14/14] block: do not set mq default scheduler

2017-09-25 Thread Damien Le Moal
For blk-mq disks with a single hardware queue, setting the disk
scheduler to mq-deadline by default early during queue initialization
prevents properly setting up zone write locking for host-managed zoned
block devices, as the disk type is not yet known at that point.

Fix this by simply not setting the default scheduler to mq-deadline for
single hardware queue disks. A udev rule can be used to easily do the
same later in the system initialization sequence, when the device
characteristics are known.
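
As an illustration only (the sysfs attribute names are assumed here, to
be checked against the running kernel), such a rule could look like:

	# /etc/udev/rules.d/99-zoned-scheduler.rules (example)
	ACTION=="add|change", KERNEL=="sd*", \
		ATTR{queue/zoned}=="host-managed", \
		ATTR{queue/scheduler}="mq-deadline"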

Signed-off-by: Damien Le Moal 
---
 block/elevator.c | 17 ++---
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 153926a90901..8b65a757f726 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -222,19 +222,14 @@ int elevator_init(struct request_queue *q, char *name)
 
if (!e) {
/*
-* For blk-mq devices, we default to using mq-deadline,
-* if available, for single queue devices. If deadline
-* isn't available OR we have multiple queues, default
-* to "none".
+* For blk-mq devices, default to "none". udev can later set
+* an appropriate default scheduler based on the disk
+* characteristics which we do not yet have here.
 */
-   if (q->mq_ops) {
-   if (q->nr_hw_queues == 1)
-   e = elevator_get("mq-deadline", false);
-   if (!e)
-   return 0;
-   } else
-   e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
+   if (q->mq_ops)
+   return 0;
 
+   e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
if (!e) {
printk(KERN_ERR
"Default I/O scheduler not found. " \
-- 
2.13.5



[PATCH V5 00/14] scsi-mq support for ZBC disks

2017-09-25 Thread Damien Le Moal
This series implements support for ZBC disks used through the scsi-mq I/O path.

The current scsi level support of ZBC disks guarantees write request ordering
using a per-zone write lock which prevents simultaneously issuing multiple
write commands to a zone; doing so avoids reordering of sequential writes to
sequential zones. This method is however ineffective when scsi-mq is used with
zoned block devices. This is due to the different execution model of blk-mq
which passes a request to the scsi layer for dispatching after the request has
been removed from the I/O scheduler queue. That is, when the scsi layer tries
to lock the target zone of the request, the request may already be out of
order and zone write locking fails to prevent that.

Various approaches have been tried to solve this problem. All of them had the
serious disadvantage of cluttering blk-mq code with zoned block device specific
conditions and processing. As such extensive changes can only turn into a
maintenance nightmare, a radically different solution is proposed here.

This series proposes implementing scsi-mq support for zoned block devices at
the I/O scheduler level with simple modifications of the mq-deadline scheduler.
The modifications are the addition of a per-zone write locking mechanism
similar to that implemented in sd_zbc.c for the legacy scsi path. The zone
write locking mechanism is used for the exact same purpose, that is, to limit
writes per zone to at most one request to avoid reordering. The locking context
however changes from that of scsi-sq and is moved to the dispatch_request
method of the scheduler. Within this context, under a spin lock guaranteeing
atomicity against other dispatch contexts, target zones of write requests can
be locked before write request removal from the scheduler. In effect, this
results in the same behavior as the legacy scsi path. Sequential write ordering
is preserved.
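
In rough pseudo-C, the idea is the following (a simplified sketch; the
helpers select_best_request() and rq_needs_zone_wlock() are invented here
for illustration):

	/* dispatch_request(), executed under the scheduler lock */
	rq = select_best_request(dd);	/* skips writes to locked zones */
	if (rq && rq_needs_zone_wlock(dd, rq))
		set_bit(blk_rq_zone_no(rq), dd->zones_wlock);
	return rq;	/* bit cleared again on completion or requeue */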

The changes to mq-deadline do not affect regular disks: the same scheduling
behavior is maintained for these. The modifications are also optimized to not
lock conventional zones. To do so, additional data is introduced in the
request queue structure so that the low level scsi code can pass upward
information such as the total number of zones and zone types of the device.
The availability of this new data avoids difficulties in accessing this
information from the I/O scheduler initialization method (init_queue() method)
context.

Of note is that the last patch of this series removes setting mq-deadline as the
default scheduler for block devices with a single hardware queue. The reason for
this is that setting the default scheduler is done very early in the device
initialization sequence, when the disk characteristics are not yet known. This
results in mq-deadline not correctly setting the default zone write locking
behavior based on the device zoning model. Setting of a default I/O scheduler
can be done easily with udev rules later in the system initialization process,
leading to correct default settings for zoned block devices.

Comments are as always very much appreciated.

Changes from v4:
* Various fixes and improvements (From Christoph's comments)
* Dropped zones_wlock scheduler tunable attribute

Changes from v3:
* Integrated support directly into mq-deadline instead of creating a new I/O
  scheduler.
* Disable setting of default mq scheduler for single queue devices

Changes from v2:
* Introduced blk_zoned structure
* Moved I/O scheduler from drivers/scsi to block 

Changes from v1:
* Addressed Bart's comments for the blk-mq patches (declarations files)
* Split (former) patch 4 into multiple patches to facilitate review
* Fixed scsi disk lookup from io scheduler by introducing
  scsi_disk_from_queue()

Damien Le Moal (14):
  scsi: sd_zbc: Move ZBC declarations to scsi_proto.h
  scsi: sd_zbc: Fix comments and indentation
  scsi: sd_zbc: Rearrange code
  scsi: sd_zbc: Use well defined macros
  scsi: sd_zbc: Fix sd_zbc_read_zoned_characteristics()
  block: Add zoned block device information to request queue
  scsi: sd_zbc: Initialize device request queue zoned data
  scsi: sd_zbc: Limit zone write locking to sequential zones
  scsi: sd_zbc: Disable zone write locking with scsi-mq
  block: mq-deadline: Add zoned block device data
  block: mq-deadline: Introduce dispatch helpers
  block: mq-deadline: Introduce zone locking support
  block: mq-deadline: Limit write request dispatch for zoned block
devices
  block: do not set mq default scheduler

 block/elevator.c  |  17 +--
 block/mq-deadline.c   | 265 ++--
 drivers/scsi/scsi_lib.c   |   5 +-
 drivers/scsi/sd_zbc.c | 340 +++---
 include/linux/blkdev.h|  53 
 include/scsi/scsi_proto.h |  45 --
 6 files changed, 612 insertions(+), 113 deletions(-)

-- 
2.13.5



[PATCH V5 02/14] scsi: sd_zbc: Fix comments and indentation

2017-09-25 Thread Damien Le Moal
Fix comment style (use kernel-doc style) and content to clarify some
functions. Also fix some function signature indentation and remove a
useless blank line in sd_zbc_read_zones().

No functional change is introduced by this patch.

Signed-off-by: Damien Le Moal 
Reviewed-by: Christoph Hellwig 
---
 drivers/scsi/scsi_lib.c |   5 ++-
 drivers/scsi/sd_zbc.c   | 117 +---
 2 files changed, 104 insertions(+), 18 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9cf6a80fe297..c72b97a74906 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1752,7 +1752,10 @@ static void scsi_done(struct scsi_cmnd *cmd)
  *
  * Returns: Nothing
  *
- * Lock status: IO request lock assumed to be held when called.
+ * Lock status: request queue lock assumed to be held when called.
+ *
+ * Note: See sd_zbc.c sd_zbc_write_lock_zone() for write order
+ * protection for ZBC disks.
  */
 static void scsi_request_fn(struct request_queue *q)
__releases(q->queue_lock)
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 692c8cbc7ed8..023f705ae235 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -32,10 +32,14 @@
 #include "sd.h"
 
 /**
- * Convert a zone descriptor to a zone struct.
+ * sd_zbc_parse_report - Convert a zone descriptor to a struct blk_zone,
+ * @sdkp: The disk the report originated from
+ * @buf: Address of the report zone descriptor
+ * @zone: the destination zone structure
+ *
+ * All LBA sized values are converted to 512B sectors unit.
  */
-static void sd_zbc_parse_report(struct scsi_disk *sdkp,
-   u8 *buf,
+static void sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf,
struct blk_zone *zone)
 {
struct scsi_device *sdp = sdkp->device;
@@ -58,7 +62,13 @@ static void sd_zbc_parse_report(struct scsi_disk *sdkp,
 }
 
 /**
- * Issue a REPORT ZONES scsi command.
+ * sd_zbc_report_zones - Issue a REPORT ZONES scsi command.
+ * @sdkp: The target disk
+ * @buf: Buffer to use for the reply
+ * @buflen: the buffer size
+ * @lba: Start LBA of the report
+ *
+ * For internal use during device validation.
  */
 static int sd_zbc_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
   unsigned int buflen, sector_t lba)
@@ -99,6 +109,12 @@ static int sd_zbc_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
return 0;
 }
 
+/**
+ * sd_zbc_setup_report_cmnd - Prepare a REPORT ZONES scsi command
+ * @cmd: The command to setup
+ *
+ * Call in sd_init_command() for a REQ_OP_ZONE_REPORT request.
+ */
 int sd_zbc_setup_report_cmnd(struct scsi_cmnd *cmd)
 {
struct request *rq = cmd->request;
@@ -141,6 +157,14 @@ int sd_zbc_setup_report_cmnd(struct scsi_cmnd *cmd)
return BLKPREP_OK;
 }
 
+/**
+ * sd_zbc_report_zones_complete - Process a REPORT ZONES scsi command reply.
+ * @scmd: The completed report zones command
+ * @good_bytes: reply size in bytes
+ *
+ * Convert all reported zone descriptors to struct blk_zone. The conversion
+ * is done in-place, directly in the request specified sg buffer.
+ */
 static void sd_zbc_report_zones_complete(struct scsi_cmnd *scmd,
 unsigned int good_bytes)
 {
@@ -196,17 +220,32 @@ static void sd_zbc_report_zones_complete(struct scsi_cmnd *scmd,
local_irq_restore(flags);
 }
 
+/**
+ * sd_zbc_zone_sectors - Get the device zone size in number of 512B sectors.
+ * @sdkp: The target disk
+ */
 static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp)
 {
return logical_to_sectors(sdkp->device, sdkp->zone_blocks);
 }
 
+/**
+ * sd_zbc_zone_no - Get the number of the zone containing a sector.
+ * @sdkp: The target disk
+ * @sector: 512B sector address contained in the zone
+ */
 static inline unsigned int sd_zbc_zone_no(struct scsi_disk *sdkp,
  sector_t sector)
 {
return sectors_to_logical(sdkp->device, sector) >> sdkp->zone_shift;
 }
 
+/**
+ * sd_zbc_setup_reset_cmnd - Prepare a RESET WRITE POINTER scsi command.
+ * @cmd: the command to setup
+ *
+ * Called from sd_init_command() for a REQ_OP_ZONE_RESET request.
+ */
 int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
 {
struct request *rq = cmd->request;
@@ -239,6 +278,23 @@ int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
return BLKPREP_OK;
 }
 
+/**
+ * sd_zbc_write_lock_zone - Write lock a sequential zone.
+ * @cmd: write command
+ *
+ * Called from sd_init_cmd() for write requests (standard write, write same or
+ * write zeroes operations). If the request target zone is not already locked,
+ * the zone is locked and BLKPREP_OK returned, allowing the request to proceed
+ * through dispatch in scsi_request_fn(). Otherwise, BLKPREP_DEFER is returned,
+ * forcing the request to wait for the zone to be unlocked, that is, for the
+ * 

[PATCH V5 07/14] scsi: sd_zbc: Initialize device request queue zoned data

2017-09-25 Thread Damien Le Moal
Initialize the seq_zone_bitmap and nr_zones fields of the disk request
queue on disk revalidate. As the seq_zone_bitmap allocation is
identical to the allocation of the zone write lock bitmap, introduce
the helper sd_zbc_alloc_zone_bitmap(). Using this helper, wait for the
disk capacity and number of zones to stabilize on the second
revalidation pass to allocate and initialize the bitmaps.
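
The memory cost of these bitmaps is small: for a hypothetical 8 TiB disk
with 256 MiB zones, that is 32768 zones, each bitmap takes
BITS_TO_LONGS(32768) * sizeof(unsigned long) = 512 * 8 = 4096 bytes.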

Signed-off-by: Damien Le Moal 
---
 drivers/scsi/sd_zbc.c | 144 --
 1 file changed, 139 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 27793b9f54c0..cc64fada9cd9 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -586,8 +586,127 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
return 0;
 }
 
+/**
+ * sd_zbc_alloc_zone_bitmap - Allocate a zone bitmap (one bit per zone).
+ * @sdkp: The disk of the bitmap
+ */
+static inline unsigned long *sd_zbc_alloc_zone_bitmap(struct scsi_disk *sdkp)
+{
+   struct request_queue *q = sdkp->disk->queue;
+
+   return kzalloc_node(BITS_TO_LONGS(sdkp->nr_zones)
+   * sizeof(unsigned long),
+   GFP_KERNEL, q->node);
+}
+
+/**
+ * sd_zbc_get_seq_zones - Parse report zones reply to identify sequential zones
+ * @sdkp: disk used
+ * @buf: report reply buffer
+ * @seq_zone_bitmap: bitmap of sequential zones to set
+ * @zno: Zone number of the first zone in the report
+ *
+ * Parse reported zone descriptors to find sequential zones.
+ * Since read-only and offline zones cannot be written, do not
+ * mark them as sequential in the bitmap.
+ * Return the LBA after the last zone reported.
+ */
+static sector_t sd_zbc_get_seq_zones(struct scsi_disk *sdkp, unsigned char *buf,
+unsigned int buflen,
+unsigned long *seq_zone_bitmap,
+unsigned int *zno)
+{
+   sector_t last_lba = sdkp->capacity;
+   unsigned int buf_len, list_length;
+   unsigned int n = *zno;
+   unsigned char *rec;
+   u8 type, cond;
+
+   list_length = get_unaligned_be32(&buf[0]) + 64;
+   buf_len = min(list_length, buflen);
+   rec = buf + 64;
+
+   while (rec < buf + buf_len) {
+   type = rec[0] & 0x0f;
+   cond = (rec[1] >> 4) & 0xf;
+   if (type != ZBC_ZONE_TYPE_CONV &&
+   cond != ZBC_ZONE_COND_READONLY &&
+   cond != ZBC_ZONE_COND_OFFLINE)
+   set_bit(n, seq_zone_bitmap);
+   last_lba = get_unaligned_be64(&rec[8]) +
+   get_unaligned_be64(&rec[16]);
+   rec += 64;
+   n++;
+   }
+
+   *zno = n;
+
+   return last_lba;
+}
+
+/**
+ * sd_zbc_setup_seq_zone_bitmap - Initialize the disk seq zone bitmap.
+ * @sdkp: target disk
+ *
+ * Allocate a zone bitmap and initialize it by identifying sequential zones.
+ */
+static int sd_zbc_setup_seq_zone_bitmap(struct scsi_disk *sdkp)
+{
+   struct request_queue *q = sdkp->disk->queue;
+   unsigned long *seq_zone_bitmap;
+   sector_t lba = 0;
+   unsigned char *buf;
+   unsigned int n = 0;
+   int ret = -ENOMEM;
+
+   seq_zone_bitmap = sd_zbc_alloc_zone_bitmap(sdkp);
+   if (!seq_zone_bitmap)
+   return -ENOMEM;
+
+   buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
+   if (!buf)
+   goto out;
+
+   while (lba < sdkp->capacity) {
+   ret = sd_zbc_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, lba);
+   if (ret)
+   goto out;
+   lba = sd_zbc_get_seq_zones(sdkp, buf, SD_ZBC_BUF_SIZE,
+  seq_zone_bitmap, &n);
+   }
+
+   if (n != sdkp->nr_zones) {
+   /* Something went wrong */
+   ret = -EIO;
+   }
+
+out:
+   kfree(buf);
+   if (ret) {
+   kfree(seq_zone_bitmap);
+   return ret;
+   }
+
+   q->seq_zone_bitmap = seq_zone_bitmap;
+
+   return 0;
+}
+
+static void sd_zbc_cleanup(struct scsi_disk *sdkp)
+{
+   struct request_queue *q = sdkp->disk->queue;
+
+   kfree(q->seq_zone_bitmap);
+   q->seq_zone_bitmap = NULL;
+
+   kfree(sdkp->zones_wlock);
+   sdkp->zones_wlock = NULL;
+}
+
 static int sd_zbc_setup(struct scsi_disk *sdkp)
 {
+   struct request_queue *q = sdkp->disk->queue;
+   int ret;
 
/* READ16/WRITE16 is mandatory for ZBC disks */
sdkp->device->use_16_for_rw = 1;
@@ -599,14 +718,29 @@ static int sd_zbc_setup(struct scsi_disk *sdkp)
sdkp->nr_zones =
round_up(sdkp->capacity, sdkp->zone_blocks) >> sdkp->zone_shift;
 
+   /*
+* Wait for the disk capacity to stabilize before
+* initializing zone related information.
+*/
+   if (sdkp->first_scan)
+   return 0;
+
  

[PATCH V5 06/14] block: Add zoned block device information to request queue

2017-09-25 Thread Damien Le Moal
Components relying only on the request_queue structure for accessing
block devices (e.g. I/O schedulers) have a limited knowledge of the
device characteristics. In particular, the device capacity cannot be
easily discovered, which for a zoned block device also results in the
inability to easily know the number of zones of the device (the zone
size is indicated by the chunk_sectors field of the queue limits).

Introduce the nr_zones field to the request_queue structure to simplify
access to this information. Also, add the bitmap seq_zone_bitmap which
indicates which zones of the device are sequential zones (write
preferred or write required). These two fields are initialized by the
low level block device driver (sd.c for ZBC/ZAC disks). They are not
initialized by stacking drivers (device mappers) handling zoned block
devices (e.g. dm-linear).
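
As an illustration (a hypothetical caller, not part of the patch), a
component holding only a request can combine the new helpers like this:

	/* Only writes to sequential zones need ordering guarantees */
	static bool rq_needs_write_ordering(struct request *rq)
	{
		return blk_queue_is_zoned(rq->q) && blk_rq_zone_is_seq(rq);
	}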

Signed-off-by: Damien Le Moal 
Reviewed-by: Christoph Hellwig 
---
 include/linux/blkdev.h | 53 ++
 1 file changed, 53 insertions(+)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 460294bb0fa5..90285f39030d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -544,6 +544,18 @@ struct request_queue {
struct queue_limits limits;
 
/*
+* Zoned block device information for mq I/O schedulers.
+* nr_zones is the total number of zones of the device. This is always
+* 0 for regular block devices. seq_zone_bitmap is a bitmap of nr_zones
+* bits which indicates if a zone is conventional (bit clear) or
+* sequential (bit set). Both nr_zones and seq_zone_bitmap are set
+* by the low level device driver. Stacking drivers (device mappers)
+* may or may not initialize these fields.
+*/
+   unsigned intnr_zones;
+   unsigned long   *seq_zone_bitmap;
+
+   /*
 * sg stuff
 */
unsigned intsg_timeout;
@@ -785,6 +797,27 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
 }
 
+static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
+{
+   return q->nr_zones;
+}
+
+static inline unsigned int blk_queue_zone_no(struct request_queue *q,
+sector_t sector)
+{
+   if (!blk_queue_is_zoned(q))
+   return 0;
+   return sector >> ilog2(q->limits.chunk_sectors);
+}
+
+static inline bool blk_queue_zone_is_seq(struct request_queue *q,
+sector_t sector)
+{
+   if (!blk_queue_is_zoned(q) || !q->seq_zone_bitmap)
+   return false;
+   return test_bit(blk_queue_zone_no(q, sector), q->seq_zone_bitmap);
+}
+
 static inline bool rq_is_sync(struct request *rq)
 {
return op_is_sync(rq->cmd_flags);
@@ -1031,6 +1064,16 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
return blk_rq_cur_bytes(rq) >> 9;
 }
 
+static inline unsigned int blk_rq_zone_no(struct request *rq)
+{
+   return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
+}
+
+static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
+{
+   return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
+}
+
 /*
  * Some commands like WRITE SAME have a payload or data transfer size which
  * is different from the size of the request.  Any driver that supports such
@@ -1582,6 +1625,16 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev)
return 0;
 }
 
+static inline unsigned int bdev_nr_zones(struct block_device *bdev)
+{
+   struct request_queue *q = bdev_get_queue(bdev);
+
+   if (q)
+   return blk_queue_nr_zones(q);
+
+   return 0;
+}
+
 static inline int queue_dma_alignment(struct request_queue *q)
 {
return q ? q->dma_alignment : 511;
-- 
2.13.5



[PATCH V5 01/14] scsi: sd_zbc: Move ZBC declarations to scsi_proto.h

2017-09-25 Thread Damien Le Moal
Move standard macro definitions for the zone types and zone conditions
to scsi_proto.h together with the definitions related to the
REPORT ZONES command. While at it, explicitly define all values in the
enums for clarity.

Also remove unnecessary includes in sd_zbc.c.

No functional change is introduced by this patch.

Signed-off-by: Damien Le Moal 
Reviewed-by: Bart Van Assche 
Reviewed-by: Johannes Thumshirn 
Reviewed-by: Christoph Hellwig 
---
 drivers/scsi/sd_zbc.c | 24 
 include/scsi/scsi_proto.h | 45 ++---
 2 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 8aa54779aac1..692c8cbc7ed8 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -28,32 +28,8 @@
 
 #include 
 #include 
-#include 
-#include 
-#include 
-#include 
-#include 
 
 #include "sd.h"
-#include "scsi_priv.h"
-
-enum zbc_zone_type {
-   ZBC_ZONE_TYPE_CONV = 0x1,
-   ZBC_ZONE_TYPE_SEQWRITE_REQ,
-   ZBC_ZONE_TYPE_SEQWRITE_PREF,
-   ZBC_ZONE_TYPE_RESERVED,
-};
-
-enum zbc_zone_cond {
-   ZBC_ZONE_COND_NO_WP,
-   ZBC_ZONE_COND_EMPTY,
-   ZBC_ZONE_COND_IMP_OPEN,
-   ZBC_ZONE_COND_EXP_OPEN,
-   ZBC_ZONE_COND_CLOSED,
-   ZBC_ZONE_COND_READONLY = 0xd,
-   ZBC_ZONE_COND_FULL,
-   ZBC_ZONE_COND_OFFLINE,
-};
 
 /**
  * Convert a zone descriptor to a zone struct.
diff --git a/include/scsi/scsi_proto.h b/include/scsi/scsi_proto.h
index 8c285d9a06d8..39130a9c05bf 100644
--- a/include/scsi/scsi_proto.h
+++ b/include/scsi/scsi_proto.h
@@ -301,19 +301,42 @@ struct scsi_lun {
 
 /* Reporting options for REPORT ZONES */
 enum zbc_zone_reporting_options {
-   ZBC_ZONE_REPORTING_OPTION_ALL = 0,
-   ZBC_ZONE_REPORTING_OPTION_EMPTY,
-   ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN,
-   ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN,
-   ZBC_ZONE_REPORTING_OPTION_CLOSED,
-   ZBC_ZONE_REPORTING_OPTION_FULL,
-   ZBC_ZONE_REPORTING_OPTION_READONLY,
-   ZBC_ZONE_REPORTING_OPTION_OFFLINE,
-   ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP = 0x10,
-   ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE,
-   ZBC_ZONE_REPORTING_OPTION_NON_WP = 0x3f,
+   ZBC_ZONE_REPORTING_OPTION_ALL   = 0x00,
+   ZBC_ZONE_REPORTING_OPTION_EMPTY = 0x01,
+   ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN = 0x02,
+   ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN = 0x03,
+   ZBC_ZONE_REPORTING_OPTION_CLOSED= 0x04,
+   ZBC_ZONE_REPORTING_OPTION_FULL  = 0x05,
+   ZBC_ZONE_REPORTING_OPTION_READONLY  = 0x06,
+   ZBC_ZONE_REPORTING_OPTION_OFFLINE   = 0x07,
+   /* 0x08 to 0x0f are reserved */
+   ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP = 0x10,
+   ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE  = 0x11,
+   /* 0x12 to 0x3e are reserved */
+   ZBC_ZONE_REPORTING_OPTION_NON_WP= 0x3f,
 };
 
 #define ZBC_REPORT_ZONE_PARTIAL 0x80
 
+/* Zone types of REPORT ZONES zone descriptors */
+enum zbc_zone_type {
+   ZBC_ZONE_TYPE_CONV  = 0x1,
+   ZBC_ZONE_TYPE_SEQWRITE_REQ  = 0x2,
+   ZBC_ZONE_TYPE_SEQWRITE_PREF = 0x3,
+   /* 0x4 to 0xf are reserved */
+};
+
+/* Zone conditions of REPORT ZONES zone descriptors */
+enum zbc_zone_cond {
+   ZBC_ZONE_COND_NO_WP = 0x0,
+   ZBC_ZONE_COND_EMPTY = 0x1,
+   ZBC_ZONE_COND_IMP_OPEN  = 0x2,
+   ZBC_ZONE_COND_EXP_OPEN  = 0x3,
+   ZBC_ZONE_COND_CLOSED= 0x4,
+   /* 0x5 to 0xc are reserved */
+   ZBC_ZONE_COND_READONLY  = 0xd,
+   ZBC_ZONE_COND_FULL  = 0xe,
+   ZBC_ZONE_COND_OFFLINE   = 0xf,
+};
+
 #endif /* _SCSI_PROTO_H_ */
-- 
2.13.5



[PATCH V5 09/14] scsi: sd_zbc: Disable zone write locking with scsi-mq

2017-09-25 Thread Damien Le Moal
In the case of a ZBC disk used with scsi-mq, zone write locking does
not prevent write reordering in sequential zones. Unlike the legacy
case, zone locking is done after the command request is removed from
the scheduler dispatch queue. That is, at the time of zone locking,
the write command may already be out of order, making locking
ineffective. Write order guarantees can only be provided by an
adapted I/O scheduler.
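
A sketch of the problem, with two sequential writes to the same zone:

	/*
	 * scheduler queue:  W1 -> W2   (sequential LBAs in one zone)
	 * blk-mq dispatch:  W1 and W2 both leave the scheduler queue
	 * sd_zbc zone lock: only taken now, per command, so W2 may
	 *                   still reach the device before W1
	 */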

Disable zone write locking in sd_zbc_write_lock_zone() if the disk is
used with scsi-mq. As the disk zones_wlock bitmap is not necessary,
do not allocate it.

Signed-off-by: Damien Le Moal 
Reviewed-by: Bart Van Assche 
Reviewed-by: Johannes Thumshirn 
Reviewed-by: Christoph Hellwig 
---
 drivers/scsi/sd_zbc.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 12d663614099..201b2983d8b8 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -287,6 +287,7 @@ int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
 int sd_zbc_write_lock_zone(struct scsi_cmnd *cmd)
 {
struct request *rq = cmd->request;
+   struct request_queue *q = rq->q;
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
sector_t sector = blk_rq_pos(rq);
sector_t zone_sectors = sd_zbc_zone_sectors(sdkp);
@@ -297,10 +298,14 @@ int sd_zbc_write_lock_zone(struct scsi_cmnd *cmd)
 */
 
/* Do not allow zone boundaries crossing on host-managed drives */
-   if (blk_queue_zoned_model(rq->q) == BLK_ZONED_HM &&
+   if (blk_queue_zoned_model(q) == BLK_ZONED_HM &&
(sector & (zone_sectors - 1)) + blk_rq_sectors(rq) > zone_sectors)
return BLKPREP_KILL;
 
+   /* No write locking with scsi-mq */
+   if (q->mq_ops)
+   return BLKPREP_OK;
+
/*
 * There is no write constraints on conventional zones. So any write
 * command can be sent. But do not issue more than one write command
@@ -717,7 +722,7 @@ static int sd_zbc_setup(struct scsi_disk *sdkp)
if (sdkp->first_scan)
return 0;
 
-   if (!sdkp->zones_wlock) {
+   if (!q->mq_ops && !sdkp->zones_wlock) {
sdkp->zones_wlock = sd_zbc_alloc_zone_bitmap(sdkp);
if (!sdkp->zones_wlock)
return -ENOMEM;
-- 
2.13.5



[PATCH V5 03/14] scsi: sd_zbc: Rearrange code

2017-09-25 Thread Damien Le Moal
Rearrange sd_zbc_setup() to include use_16_for_rw and use_10_for_rw
assignments and move the calculation of sdkp->zone_shift together
with the assignment of the verified zone_blocks value in
sd_zbc_check_zone_size().

No functional change is introduced by this patch.

Signed-off-by: Damien Le Moal 
Reviewed-by: Christoph Hellwig 
---
 drivers/scsi/sd_zbc.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 023f705ae235..7dbaf920679e 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -584,6 +584,7 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
}
 
sdkp->zone_blocks = zone_blocks;
+   sdkp->zone_shift = ilog2(zone_blocks);
 
return 0;
 }
@@ -591,10 +592,13 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 static int sd_zbc_setup(struct scsi_disk *sdkp)
 {
 
+   /* READ16/WRITE16 is mandatory for ZBC disks */
+   sdkp->device->use_16_for_rw = 1;
+   sdkp->device->use_10_for_rw = 0;
+
/* chunk_sectors indicates the zone size */
blk_queue_chunk_sectors(sdkp->disk->queue,
logical_to_sectors(sdkp->device, sdkp->zone_blocks));
-   sdkp->zone_shift = ilog2(sdkp->zone_blocks);
sdkp->nr_zones = sdkp->capacity >> sdkp->zone_shift;
if (sdkp->capacity & (sdkp->zone_blocks - 1))
sdkp->nr_zones++;
@@ -657,10 +661,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
if (ret)
goto err;
 
-   /* READ16/WRITE16 is mandatory for ZBC disks */
-   sdkp->device->use_16_for_rw = 1;
-   sdkp->device->use_10_for_rw = 0;
-
return 0;
 
 err:
-- 
2.13.5



[PATCH V5 12/14] block: mq-deadline: Introduce zone locking support

2017-09-25 Thread Damien Le Moal
For a write request to a zoned block device, lock the request target
zone upon request dispatch. The zone is unlocked either when the
request completes or when the request is requeued (inserted).

To indicate that a request has locked its target zone, use the first
pointer of the request elevator private data to store the value
RQ_ZONE_WLOCKED. Testing for this value allows quick decision in
dd_insert_request() and dd_completed_request() regarding the need for
unlocking the target zone of a request.
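
The resulting lock lifecycle for a single request is, in summary:

	/*
	 * __dd_dispatch_request() -> deadline_wlock_zone():
	 *	set the zone bit in dd->zones_wlock and
	 *	elv.priv[0] = RQ_ZONE_WLOCKED
	 * dd_insert_request() on requeue, or dd_completed_request():
	 *	-> deadline_wunlock_zone(): clear the zone bit and
	 *	elv.priv[0] = RQ_ZONE_NO_WLOCK
	 */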

Signed-off-by: Damien Le Moal 
---
 block/mq-deadline.c | 114 
 1 file changed, 114 insertions(+)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 296880e2471f..186c32099845 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -178,6 +178,93 @@ deadline_move_request(struct deadline_data *dd, struct request *rq)
 }
 
 /*
+ * Return true if a request is a write request that needs zone
+ * write locking.
+ */
+static inline bool deadline_request_needs_zone_wlock(struct deadline_data *dd,
+struct request *rq)
+{
+
+   if (!dd->zones_wlock)
+   return false;
+
+   if (blk_rq_is_passthrough(rq))
+   return false;
+
+   switch (req_op(rq)) {
+   case REQ_OP_WRITE_ZEROES:
+   case REQ_OP_WRITE_SAME:
+   case REQ_OP_WRITE:
+   return blk_rq_zone_is_seq(rq);
+   default:
+   return false;
+   }
+}
+
+/*
+ * Abuse the elv.priv[0] pointer to indicate if a request has write
+ * locked its target zone. Only write request to a zoned block device
+ * can own a zone write lock.
+ */
+#define RQ_ZONE_WLOCKED((void *)1UL)
+static inline void deadline_set_request_zone_wlock(struct request *rq)
+{
+   rq->elv.priv[0] = RQ_ZONE_WLOCKED;
+}
+
+#define RQ_ZONE_NO_WLOCK   ((void *)0UL)
+static inline void deadline_clear_request_zone_wlock(struct request *rq)
+{
+   rq->elv.priv[0] = RQ_ZONE_NO_WLOCK;
+}
+
+static inline bool deadline_request_has_zone_wlock(struct request *rq)
+{
+   return rq->elv.priv[0] == RQ_ZONE_WLOCKED;
+}
+
+/*
+ * Write lock the target zone of a write request.
+ */
+static void deadline_wlock_zone(struct deadline_data *dd,
+   struct request *rq)
+{
+   unsigned int zno = blk_rq_zone_no(rq);
+
+   WARN_ON_ONCE(deadline_request_has_zone_wlock(rq));
+   WARN_ON_ONCE(test_and_set_bit(zno, dd->zones_wlock));
+   deadline_set_request_zone_wlock(rq);
+}
+
+/*
+ * Write unlock the target zone of a write request.
+ */
+static void deadline_wunlock_zone(struct deadline_data *dd,
+ struct request *rq)
+{
+   unsigned int zno = blk_rq_zone_no(rq);
+   unsigned long flags;
+
+   spin_lock_irqsave(&dd->zone_lock, flags);
+
+   WARN_ON_ONCE(!test_and_clear_bit(zno, dd->zones_wlock));
+   deadline_clear_request_zone_wlock(rq);
+
+   spin_unlock_irqrestore(&dd->zone_lock, flags);
+}
+
+/*
+ * Test the write lock state of the target zone of a write request.
+ */
+static inline bool deadline_zone_is_wlocked(struct deadline_data *dd,
+   struct request *rq)
+{
+   unsigned int zno = blk_rq_zone_no(rq);
+
+   return test_bit(zno, dd->zones_wlock);
+}
+
+/*
  * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
  * 1 otherwise. Requires !list_empty(>fifo_list[data_dir])
  */
@@ -316,6 +403,11 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
dd->batching++;
deadline_move_request(dd, rq);
 done:
+   /*
+* If the request needs its target zone locked, do it.
+*/
+   if (deadline_request_needs_zone_wlock(dd, rq))
+   deadline_wlock_zone(dd, rq);
rq->rq_flags |= RQF_STARTED;
return rq;
 }
@@ -466,6 +558,13 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq);
 
+   /*
+* This may be a requeue of a request that has locked its
+* target zone. If this is the case, release the request zone lock.
+*/
+   if (deadline_request_has_zone_wlock(rq))
+   deadline_wunlock_zone(dd, rq);
+
if (blk_mq_sched_try_insert_merge(q, rq))
return;
 
@@ -510,6 +609,20 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
spin_unlock(&dd->lock);
 }
 
+/*
+ * For zoned block devices, write unlock the target zone of
+ * completed write requests.
+ */
+static void dd_completed_request(struct request *rq)
+{
+
+   if (deadline_request_has_zone_wlock(rq)) {
+   struct deadline_data *dd = rq->q->elevator->elevator_data;
+
+   deadline_wunlock_zone(dd, rq);
+   }
+}
+
 static bool dd_has_work(struct 

[PATCH V5 08/14] scsi: sd_zbc: Limit zone write locking to sequential zones

2017-09-25 Thread Damien Le Moal
Conventional zones of zoned block devices have no write constraints.
Write locking of conventional zones is thus not necessary and can even
hurt performance by unnecessarily operating the disk under low queue
depth. To avoid this, use the disk request queue seq_zone_bitmap to
allow any write to be issued to conventional zones, locking only
sequential zones.

While at it, remove the helper sd_zbc_zone_no() and use
blk_rq_zone_no() instead.

Signed-off-by: Damien Le Moal 
---
 drivers/scsi/sd_zbc.c | 32 
 1 file changed, 12 insertions(+), 20 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index cc64fada9cd9..12d663614099 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -230,17 +230,6 @@ static inline sector_t sd_zbc_zone_sectors(struct 
scsi_disk *sdkp)
 }
 
 /**
- * sd_zbc_zone_no - Get the number of the zone containing a sector.
- * @sdkp: The target disk
- * @sector: 512B sector address contained in the zone
- */
-static inline unsigned int sd_zbc_zone_no(struct scsi_disk *sdkp,
- sector_t sector)
-{
-   return sectors_to_logical(sdkp->device, sector) >> sdkp->zone_shift;
-}
-
-/**
  * sd_zbc_setup_reset_cmnd - Prepare a RESET WRITE POINTER scsi command.
  * @cmd: the command to setup
  *
@@ -301,7 +290,6 @@ int sd_zbc_write_lock_zone(struct scsi_cmnd *cmd)
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
sector_t sector = blk_rq_pos(rq);
sector_t zone_sectors = sd_zbc_zone_sectors(sdkp);
-   unsigned int zno = sd_zbc_zone_no(sdkp, sector);
 
/*
 * Note: Checks of the alignment of the write command on
@@ -309,18 +297,21 @@ int sd_zbc_write_lock_zone(struct scsi_cmnd *cmd)
 */
 
/* Do not allow zone boundaries crossing on host-managed drives */
-   if (blk_queue_zoned_model(sdkp->disk->queue) == BLK_ZONED_HM &&
+   if (blk_queue_zoned_model(rq->q) == BLK_ZONED_HM &&
(sector & (zone_sectors - 1)) + blk_rq_sectors(rq) > zone_sectors)
return BLKPREP_KILL;
 
/*
-* Do not issue more than one write at a time per
-* zone. This solves write ordering problems due to
-* the unlocking of the request queue in the dispatch
-* path in the non scsi-mq case.
+* There is no write constraints on conventional zones. So any write
+* command can be sent. But do not issue more than one write command
+* at a time per sequential zone. This avoids write ordering problems
+* due to the unlocking of the request queue in the dispatch path of
+* legacy scsi path, as well as at the HBA level (e.g. AHCI).
 */
+   if (!blk_rq_zone_is_seq(rq))
+   return BLKPREP_OK;
if (sdkp->zones_wlock &&
-   test_and_set_bit(zno, sdkp->zones_wlock))
+   test_and_set_bit(blk_rq_zone_no(rq), sdkp->zones_wlock))
return BLKPREP_DEFER;
 
WARN_ON_ONCE(cmd->flags & SCMD_ZONE_WRITE_LOCK);
@@ -341,8 +332,9 @@ void sd_zbc_write_unlock_zone(struct scsi_cmnd *cmd)
struct request *rq = cmd->request;
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
 
-   if (sdkp->zones_wlock && cmd->flags & SCMD_ZONE_WRITE_LOCK) {
-   unsigned int zno = sd_zbc_zone_no(sdkp, blk_rq_pos(rq));
+   if (cmd->flags & SCMD_ZONE_WRITE_LOCK) {
+   unsigned int zno = blk_rq_zone_no(rq);
+
WARN_ON_ONCE(!test_bit(zno, sdkp->zones_wlock));
cmd->flags &= ~SCMD_ZONE_WRITE_LOCK;
clear_bit_unlock(zno, sdkp->zones_wlock);
-- 
2.13.5



[PATCH V5 10/14] block: mq-deadline: Add zoned block device data

2017-09-25 Thread Damien Le Moal
Introduce new fields to mq-deadline private data to support zoned block
devices. The fields are a zone bitmap used to implement zone write
locking and a spinlock to atomically handle zone write locking with
other processing.

Modify the mq-deadline init_queue and exit_queue elevator methods to handle
initialization and cleanup of the zone write lock bitmap.

Signed-off-by: Damien Le Moal 
---
 block/mq-deadline.c | 50 ++
 1 file changed, 46 insertions(+), 4 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index a1cad4331edd..af2eb9b3936e 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -60,6 +60,9 @@ struct deadline_data {
 
spinlock_t lock;
struct list_head dispatch;
+
+   spinlock_t zone_lock;
+   unsigned long *zones_wlock;
 };
 
 static inline struct rb_root *
@@ -300,6 +303,34 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
return rq;
 }
 
+static int deadline_init_zones_wlock(struct request_queue *q,
+struct deadline_data *dd)
+{
+   /*
+* For regular drives or non-conforming zoned block device,
+* do not use zone write locking.
+*/
+   if (!blk_queue_nr_zones(q))
+   return 0;
+
+   /*
+* Treat host aware drives as regular disks.
+*/
+   if (blk_queue_zoned_model(q) != BLK_ZONED_HM)
+   return 0;
+
+   dd->zones_wlock = kzalloc_node(BITS_TO_LONGS(blk_queue_nr_zones(q))
+  * sizeof(unsigned long),
+  GFP_KERNEL, q->node);
+   if (!dd->zones_wlock)
+   return -ENOMEM;
+
+   pr_info("mq-deadline: %s: zones write locking enabled\n",
+   dev_name(q->backing_dev_info->dev));
+
+   return 0;
+}
+
 static void dd_exit_queue(struct elevator_queue *e)
 {
struct deadline_data *dd = e->elevator_data;
@@ -307,6 +338,7 @@ static void dd_exit_queue(struct elevator_queue *e)
BUG_ON(!list_empty(&dd->fifo_list[READ]));
BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
 
+   kfree(dd->zones_wlock);
kfree(dd);
 }
 
@@ -317,16 +349,15 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
 {
struct deadline_data *dd;
struct elevator_queue *eq;
+   int ret = -ENOMEM;
 
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
 
dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
-   if (!dd) {
-   kobject_put(&eq->kobj);
-   return -ENOMEM;
-   }
+   if (!dd)
+   goto out_put_elv;
eq->elevator_data = dd;
 
INIT_LIST_HEAD(&dd->fifo_list[READ]);
@@ -340,9 +371,20 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
dd->fifo_batch = fifo_batch;
spin_lock_init(&dd->lock);
INIT_LIST_HEAD(&dd->dispatch);
+   spin_lock_init(&dd->zone_lock);
+
+   ret = deadline_init_zones_wlock(q, dd);
+   if (ret)
+   goto out_free_dd;
 
q->elevator = eq;
return 0;
+
+out_free_dd:
+   kfree(dd);
+out_put_elv:
+   kobject_put(&eq->kobj);
+   return ret;
 }
 
 static int dd_request_merge(struct request_queue *q, struct request **rq,
-- 
2.13.5