[PATCH v2 10/11] block: add a report_zones method

2018-10-11 Thread Damien Le Moal
From: Christoph Hellwig 

Dispatching a report zones command through the request queue is a major
pain due to the command reply payload rewriting necessary. Given that
blkdev_report_zones() is executing everything synchronously, implement
report zones as a block device file operation instead, allowing major
simplification of the code in many places.

Since sd, null-blk, dm-linear and dm-flakey are the only block device
drivers that support exposing zoned block devices, these drivers are
modified to provide the device side implementation of the
report_zones() block device file operation.

For device mappers, a new report_zones() target type operation is
defined so that calls to blkdev_report_zones() from the upper block
layer can be propagated down to the underlying devices of the dm targets.
Implementation for this new operation is added to the dm-linear and
dm-flakey targets.

Signed-off-by: Christoph Hellwig 
[Damien]
* Changed method block_device argument to gendisk
* Various bug fixes and improvements
* Added support for null_blk, dm-linear and dm-flakey.
Signed-off-by: Damien Le Moal 
---
 block/blk-core.c   |   1 -
 block/blk-mq-debugfs.c |   1 -
 block/blk-zoned.c  | 164 ++--
 drivers/block/null_blk.h   |  11 ++-
 drivers/block/null_blk_main.c  |  23 +
 drivers/block/null_blk_zoned.c |  57 +++
 drivers/md/dm-flakey.c |  30 --
 drivers/md/dm-linear.c |  35 ---
 drivers/md/dm.c| 169 -
 drivers/scsi/sd.c  |  13 +--
 drivers/scsi/sd.h  |  11 +--
 drivers/scsi/sd_zbc.c  | 153 +
 include/linux/blk_types.h  |   2 -
 include/linux/blkdev.h |   8 +-
 include/linux/device-mapper.h  |  12 ++-
 include/trace/events/f2fs.h|   1 -
 16 files changed, 266 insertions(+), 425 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index cff0a60ee200..18e7050eb5a4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2307,7 +2307,6 @@ generic_make_request_checks(struct bio *bio)
if (!q->limits.max_write_same_sectors)
goto not_supported;
break;
-   case REQ_OP_ZONE_REPORT:
case REQ_OP_ZONE_RESET:
if (!blk_queue_is_zoned(q))
goto not_supported;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index cb1e6cf7ac48..a14ed04c1ff7 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -275,7 +275,6 @@ static const char *const op_name[] = {
REQ_OP_NAME(WRITE),
REQ_OP_NAME(FLUSH),
REQ_OP_NAME(DISCARD),
-   REQ_OP_NAME(ZONE_REPORT),
REQ_OP_NAME(SECURE_ERASE),
REQ_OP_NAME(ZONE_RESET),
REQ_OP_NAME(WRITE_SAME),
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 5d967fd39fbd..90cf503091d5 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -93,13 +93,10 @@ unsigned int blkdev_nr_zones(struct block_device *bdev)
 EXPORT_SYMBOL_GPL(blkdev_nr_zones);
 
 /*
- * Check that a zone report belongs to the partition.
- * If yes, fix its start sector and write pointer, copy it in the
- * zone information array and return true. Return false otherwise.
+ * Check that a zone report belongs to this partition, and if yes, fix its 
start
+ * sector and write pointer and return true. Return false otherwise.
  */
-static bool blkdev_report_zone(struct block_device *bdev,
-  struct blk_zone *rep,
-  struct blk_zone *zone)
+static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep)
 {
sector_t offset = get_start_sect(bdev);
 
@@ -114,11 +111,36 @@ static bool blkdev_report_zone(struct block_device *bdev,
rep->wp = rep->start + rep->len;
else
rep->wp -= offset;
-   memcpy(zone, rep, sizeof(struct blk_zone));
-
return true;
 }
 
+static int blk_report_zones(struct gendisk *disk, sector_t sector,
+   struct blk_zone *zones, unsigned int *nr_zones,
+   gfp_t gfp_mask)
+{
+   struct request_queue *q = disk->queue;
+   unsigned int z = 0, n, nrz = *nr_zones;
+   sector_t capacity = get_capacity(disk);
+   int ret;
+
+   while (z < nrz && sector < capacity) {
+   n = nrz - z;
+   ret = disk->fops->report_zones(disk, sector, [z], ,
+  gfp_mask);
+   if (ret)
+   return ret;
+   if (!n)
+   break;
+   sector += blk_queue_zone_sectors(q) * n;
+   z += n;
+   }
+
+   WARN_ON(z > *nr_zones);
+   *nr_zones = z;
+
+   return 0;
+}
+
 /**
  * blkdev_report_zones - Get zones information
  * @bdev:  Target block device
@@ -133,130 +155,46

[PATCH v2 09/11] block: Expose queue nr_zones in sysfs

2018-10-11 Thread Damien Le Moal
Expose through sysfs the nr_zones field of a zoned block device request
queue. This represents the total number of zones of the device
calculated using the known disk capacity and zone size.

Exposing this value helps in debugging disk issues as well as
facilitating scripts based use of the disk (e.g. blktests).

Signed-off-by: Damien Le Moal 
---
 block/blk-sysfs.c  | 11 +++
 include/linux/blkdev.h |  2 +-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3772671cf2bc..f7060a938bf9 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -300,6 +300,11 @@ static ssize_t queue_zoned_show(struct request_queue *q, 
char *page)
}
 }
 
+static ssize_t queue_nr_zones_show(struct request_queue *q, char *page)
+{
+   return queue_var_show(q->nr_zones, page);
+}
+
 static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
 {
return queue_var_show((blk_queue_nomerges(q) << 1) |
@@ -637,6 +642,11 @@ static struct queue_sysfs_entry queue_zoned_entry = {
.show = queue_zoned_show,
 };
 
+static struct queue_sysfs_entry queue_nr_zones_entry = {
+   .attr = {.name = "nr_zones", .mode = 0444 },
+   .show = queue_nr_zones_show,
+};
+
 static struct queue_sysfs_entry queue_nomerges_entry = {
.attr = {.name = "nomerges", .mode = 0644 },
.show = queue_nomerges_show,
@@ -727,6 +737,7 @@ static struct attribute *default_attrs[] = {
_write_zeroes_max_entry.attr,
_nonrot_entry.attr,
_zoned_entry.attr,
+   _nr_zones_entry.attr,
_nomerges_entry.attr,
_rq_affinity_entry.attr,
_iostats_entry.attr,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c24969b1741b..23ab53d2d4ca 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -593,7 +593,6 @@ struct request_queue {
 
struct queue_limits limits;
 
-#ifdef CONFIG_BLK_DEV_ZONED
/*
 * Zoned block device information for request dispatch control.
 * nr_zones is the total number of zones of the device. This is always
@@ -612,6 +611,7 @@ struct request_queue {
 * blk_mq_unfreeze_queue().
 */
unsigned intnr_zones;
+#ifdef CONFIG_BLK_DEV_ZONED
unsigned long   *seq_zones_bitmap;
unsigned long   *seq_zones_wlock;
 #endif /* CONFIG_BLK_DEV_ZONED */
-- 
2.17.1



[PATCH v2 11/11] block: Introduce blk_revalidate_disk_zones()

2018-10-11 Thread Damien Le Moal
Drivers exposing zoned block devices have to initialize and maintain
correctness (i.e. revalidate) of the device zone bitmaps attached to
the device request queue (seq_zones_bitmap and seq_zones_wlock).

To simplify coding this, introduce a generic helper function
blk_revalidate_disk_zones() suitable for most (and likely all) cases.
This new function always updates the seq_zones_bitmap and seq_zones_wlock
bitmaps as well as the queue nr_zones field when called for a disk
using a request based queue. For a disk using a BIO based queue, only
the number of zones is updated since these queues do not have
schedulers and so do not need the zone bitmaps.

With this change, the zone bitmap initialization code in sd_zbc.c can be
replaced with a call to this function in sd_zbc_read_zones(), which is
called from the disk revalidate block operation method.

A call to blk_revalidate_disk_zones() is also added to the null_blk
driver for devices created with the zoned mode enabled.

Finally, to ensure that zoned devices created with dm-linear or
dm-flakey expose the correct number of zones through sysfs, a call to
blk_revalidate_disk_zones() is added to dm_table_set_restrictions().

The zone bitmaps allocated and initialized with
blk_revalidate_disk_zones() are freed automatically from
__blk_release_queue() using the block internal function
blk_queue_free_zone_bitmaps().

Signed-off-by: Damien Le Moal 
---
 block/blk-sysfs.c |   2 +
 block/blk-zoned.c | 136 +
 block/blk.h   |   6 +
 drivers/block/null_blk_main.c |   7 ++
 drivers/md/dm-table.c |  10 ++
 drivers/scsi/sd.c |   2 -
 drivers/scsi/sd.h |   4 -
 drivers/scsi/sd_zbc.c | 218 --
 include/linux/blkdev.h|   7 ++
 9 files changed, 194 insertions(+), 198 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f7060a938bf9..8bc1a9a9d6f7 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -852,6 +852,8 @@ static void __blk_release_queue(struct work_struct *work)
if (q->queue_tags)
__blk_queue_free_tags(q);
 
+   blk_queue_free_zone_bitmaps(q);
+
if (!q->mq_ops) {
if (q->exit_rq_fn)
q->exit_rq_fn(q, q->fq->flush_rq);
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 90cf503091d5..13ba2011a306 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "blk.h"
 
@@ -359,3 +360,138 @@ int blkdev_reset_zones_ioctl(struct block_device *bdev, 
fmode_t mode,
return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors,
  GFP_KERNEL);
 }
+
+static inline unsigned long *blk_alloc_zone_bitmap(int node,
+  unsigned int nr_zones)
+{
+   return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
+   GFP_NOIO, node);
+}
+
+/*
+ * Allocate an array of struct blk_zone to get nr_zones zone information.
+ * The allocated array may be smaller than nr_zones.
+ */
+static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones)
+{
+   size_t size = *nr_zones * sizeof(struct blk_zone);
+   struct page *page;
+   int order;
+
+   for (order = get_order(size); order > 0; order--) {
+   page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
+   if (page) {
+   *nr_zones = min_t(unsigned int, *nr_zones,
+   (PAGE_SIZE << order) / sizeof(struct blk_zone));
+   return page_address(page);
+   }
+   }
+
+   return NULL;
+}
+
+void blk_queue_free_zone_bitmaps(struct request_queue *q)
+{
+   kfree(q->seq_zones_bitmap);
+   q->seq_zones_bitmap = NULL;
+   kfree(q->seq_zones_wlock);
+   q->seq_zones_wlock = NULL;
+}
+
+/**
+ * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
+ * @disk:  Target disk
+ *
+ * Helper function for low-level device drivers to (re) allocate and initialize
+ * a disk request queue zone bitmaps. This functions should normally be called
+ * within the disk ->revalidate method. For BIO based queues, no zone bitmap
+ * is allocated.
+ */
+int blk_revalidate_disk_zones(struct gendisk *disk)
+{
+   struct request_queue *q = disk->queue;
+   unsigned int nr_zones = __blkdev_nr_zones(q, get_capacity(disk));
+   unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
+   unsigned int i, rep_nr_zones = 0, z = 0, nrz;
+   struct blk_zone *zones = NULL;
+   sector_t sector = 0;
+   int ret = 0;
+
+   /*
+* BIO based queues do not use a scheduler so only q->nr_zones
+* needs to be updated so that the sysfs exposed value is correct.
+*/
+   if (!queu

[PATCH v2 08/11] block: Improve zone reset execution

2018-10-11 Thread Damien Le Moal
There is no need to synchronously execute all REQ_OP_ZONE_RESET BIOs
necessary to reset a range of zones. Similarly to what is done for
discard BIOs in blk-lib.c, all zone reset BIOs can be chained and
executed asynchronously and a synchronous call done only for the last
BIO of the chain.

Modify blkdev_reset_zones() to operate similarly to
blkdev_issue_discard() using the next_bio() helper for chaining BIOs. To
avoid code duplication of that function in blk_zoned.c, rename
next_bio() into blk_next_bio() and declare it as a block internal
function in blk.h.

Signed-off-by: Damien Le Moal 
Reviewed-by: Christoph Hellwig 
---
 block/blk-lib.c   | 13 ++---
 block/blk-zoned.c | 29 -
 block/blk.h   |  2 ++
 3 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index d1b9dd03da25..d7bedee5f9ba 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -10,8 +10,7 @@
 
 #include "blk.h"
 
-static struct bio *next_bio(struct bio *bio, unsigned int nr_pages,
-   gfp_t gfp)
+struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp)
 {
struct bio *new = bio_alloc(gfp, nr_pages);
 
@@ -87,7 +86,7 @@ int __blkdev_issue_discard(struct block_device *bdev, 
sector_t sector,
req_sects = end_sect - sector;
}
 
-   bio = next_bio(bio, 0, gfp_mask);
+   bio = blk_next_bio(bio, 0, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, bdev);
bio_set_op_attrs(bio, op, 0);
@@ -189,7 +188,7 @@ static int __blkdev_issue_write_same(struct block_device 
*bdev, sector_t sector,
max_write_same_sectors = UINT_MAX >> 9;
 
while (nr_sects) {
-   bio = next_bio(bio, 1, gfp_mask);
+   bio = blk_next_bio(bio, 1, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, bdev);
bio->bi_vcnt = 1;
@@ -265,7 +264,7 @@ static int __blkdev_issue_write_zeroes(struct block_device 
*bdev,
return -EOPNOTSUPP;
 
while (nr_sects) {
-   bio = next_bio(bio, 0, gfp_mask);
+   bio = blk_next_bio(bio, 0, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, bdev);
bio->bi_opf = REQ_OP_WRITE_ZEROES;
@@ -316,8 +315,8 @@ static int __blkdev_issue_zero_pages(struct block_device 
*bdev,
return -EPERM;
 
while (nr_sects != 0) {
-   bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
-  gfp_mask);
+   bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
+  gfp_mask);
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, bdev);
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index bb4ed69f917f..5d967fd39fbd 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -13,6 +13,8 @@
 #include 
 #include 
 
+#include "blk.h"
+
 static inline sector_t blk_zone_start(struct request_queue *q,
  sector_t sector)
 {
@@ -277,16 +279,17 @@ int blkdev_reset_zones(struct block_device *bdev,
struct request_queue *q = bdev_get_queue(bdev);
sector_t zone_sectors;
sector_t end_sector = sector + nr_sectors;
-   struct bio *bio;
+   struct bio *bio = NULL;
+   struct blk_plug plug;
int ret;
 
-   if (!q)
-   return -ENXIO;
-
if (!blk_queue_is_zoned(q))
return -EOPNOTSUPP;
 
-   if (end_sector > bdev->bd_part->nr_sects)
+   if (bdev_read_only(bdev))
+   return -EPERM;
+
+   if (!nr_sectors || end_sector > bdev->bd_part->nr_sects)
/* Out of range */
return -EINVAL;
 
@@ -299,19 +302,14 @@ int blkdev_reset_zones(struct block_device *bdev,
end_sector != bdev->bd_part->nr_sects)
return -EINVAL;
 
+   blk_start_plug();
while (sector < end_sector) {
 
-   bio = bio_alloc(gfp_mask, 0);
+   bio = blk_next_bio(bio, 0, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, bdev);
bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0);
 
-   ret = submit_bio_wait(bio);
-   bio_put(bio);
-
-   if (ret)
-   return ret;
-
sector += zone_sectors;
 
/* This may take a while, so be nice to others */
@@ -319,7 +317,12 @@ int blkdev_reset_zones(struct block_device *bdev,
 
}
 
-   return 0;
+   ret = submit_bio_wait(bio);
+   bio_put(bio);
+
+   blk_finish_plug();
+
+   return ret;
 }
 EXPORT_SYMB

[PATCH v2 02/11] scsi: sd_zbc: Reduce boot device scan and revalidate time

2018-10-11 Thread Damien Le Moal
Handling checks of ZBC device capacity using the max_lba field of the
REPORT ZONES command reply for disks with rc_basis == 0 can be done
using the same report zones command reply used to check the "same"
field.

Avoid executing a report zones command solely to check the disk capacity
by merging sd_zbc_check_capacity() into sd_zbc_check_zone_size() and
renaming that function to sd_zbc_check_zones(). This removes a costly
execution of a full report zones command and so reduces device scan
duration at boot time as well as the duration of disk revalidate calls.

Furthermore, setting the partial report bit in the REPORT ZONES command
cdb can significantly reduce this command execution time as the device
does not have to count and report the total number of zones that could
be reported assuming a large enough reply buffer. A non-partial zone
report is necessary only for the first execution of report zones used to
check the same field value (to ensure that this value applies to all
zones of the disk). All other calls to sd_zbc_report_zones() can use a
partial report to reduce execution time.

Using a 14 TB ZBC disk, these simple changes reduce device scan time at
boot from about 3.5s down to about 900ms. Disk revalidate times are also
reduced from about 450ms down to 230ms.

Signed-off-by: Damien Le Moal 
Reviewed-by: Christoph Hellwig 
---
 drivers/scsi/sd_zbc.c | 94 ++-
 1 file changed, 40 insertions(+), 54 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 0b7d8787f785..ca73c46931c0 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -67,11 +67,17 @@ static void sd_zbc_parse_report(struct scsi_disk *sdkp, u8 
*buf,
  * @buf: Buffer to use for the reply
  * @buflen: the buffer size
  * @lba: Start LBA of the report
+ * @partial: Do partial report
  *
  * For internal use during device validation.
+ * Using partial=true can significantly speed up execution of a report zones
+ * command because the disk does not have to count all possible report matching
+ * zones and will only report the count of zones fitting in the command reply
+ * buffer.
  */
 static int sd_zbc_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
-  unsigned int buflen, sector_t lba)
+  unsigned int buflen, sector_t lba,
+  bool partial)
 {
struct scsi_device *sdp = sdkp->device;
const int timeout = sdp->request_queue->rq_timeout;
@@ -85,6 +91,8 @@ static int sd_zbc_report_zones(struct scsi_disk *sdkp, 
unsigned char *buf,
cmd[1] = ZI_REPORT_ZONES;
put_unaligned_be64(lba, [2]);
put_unaligned_be32(buflen, [10]);
+   if (partial)
+   cmd[14] = ZBC_REPORT_ZONE_PARTIAL;
memset(buf, 0, buflen);
 
result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
@@ -350,60 +358,25 @@ static int sd_zbc_check_zoned_characteristics(struct 
scsi_disk *sdkp,
return 0;
 }
 
-/**
- * sd_zbc_check_capacity - Check reported capacity.
- * @sdkp: Target disk
- * @buf: Buffer to use for commands
- *
- * ZBC drive may report only the capacity of the first conventional zones at
- * LBA 0. This is indicated by the RC_BASIS field of the read capacity reply.
- * Check this here. If the disk reported only its conventional zones capacity,
- * get the total capacity by doing a report zones.
- */
-static int sd_zbc_check_capacity(struct scsi_disk *sdkp, unsigned char *buf)
-{
-   sector_t lba;
-   int ret;
-
-   if (sdkp->rc_basis != 0)
-   return 0;
-
-   /* Do a report zone to get the maximum LBA to check capacity */
-   ret = sd_zbc_report_zones(sdkp, buf, SD_BUF_SIZE, 0);
-   if (ret)
-   return ret;
-
-   /* The max_lba field is the capacity of this device */
-   lba = get_unaligned_be64([8]);
-   if (lba + 1 == sdkp->capacity)
-   return 0;
-
-   if (sdkp->first_scan)
-   sd_printk(KERN_WARNING, sdkp,
- "Changing capacity from %llu to max LBA+1 %llu\n",
- (unsigned long long)sdkp->capacity,
- (unsigned long long)lba + 1);
-   sdkp->capacity = lba + 1;
-
-   return 0;
-}
-
 #define SD_ZBC_BUF_SIZE 131072U
 
 /**
- * sd_zbc_check_zone_size - Check the device zone sizes
+ * sd_zbc_check_zones - Check the device capacity and zone sizes
  * @sdkp: Target disk
  *
- * Check that all zones of the device are equal. The last zone can however
- * be smaller. The zone size must also be a power of two number of LBAs.
+ * Check that the device capacity as reported by READ CAPACITY matches the
+ * max_lba value (plus one)of the report zones command reply. Also check that
+ * all zones of the device have an equal size, only allowing the last zone of
+ * the disk to have a smaller size (runt zone). The zone size must also be a
+ *

[PATCH v2 06/11] block: Introduce BLKGETZONESZ ioctl

2018-10-11 Thread Damien Le Moal
Get a zoned block device zone size in number of 512 B sectors.
The zone size is always 0 for regular block devices.

Signed-off-by: Damien Le Moal 
---
 block/ioctl.c | 2 ++
 include/uapi/linux/blkzoned.h | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/block/ioctl.c b/block/ioctl.c
index 3884d810efd2..f6d2c6f1f050 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -532,6 +532,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, 
unsigned cmd,
return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
case BLKRESETZONE:
return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg);
+   case BLKGETZONESZ:
+   return put_uint(arg, bdev_zone_sectors(bdev));
case HDIO_GETGEO:
return blkdev_getgeo(bdev, argp);
case BLKRAGET:
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index ff5a5db8906a..281ac605f752 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -137,8 +137,10 @@ struct blk_zone_range {
  * sector specified in the report request structure.
  * @BLKRESETZONE: Reset the write pointer of the zones in the specified
  *sector range. The sector range must be zone aligned.
+ * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors.
  */
 #define BLKREPORTZONE  _IOWR(0x12, 130, struct blk_zone_report)
 #define BLKRESETZONE   _IOW(0x12, 131, struct blk_zone_range)
+#define BLKGETZONESZ   _IOW(0x12, 132, __u32)
 
 #endif /* _UAPI_BLKZONED_H */
-- 
2.17.1



[PATCH v2 03/11] scsi: sd_zbc: Fix sd_zbc_check_zones() error checks

2018-10-11 Thread Damien Le Moal
The 32 bits overflow check for the zone size value is already done
within sd_zbc_check_zones() with the test:

} else if (logical_to_sectors(sdkp->device, zone_blocks) > UINT_MAX) {

so there is no need to check again for an out of range value in
sd_zbc_read_zones(). Simplify the code and fix sd_zbc_check_zones()
error return to -EFBIG instead of -ENODEV if the zone size is too large.

Signed-off-by: Damien Le Moal 
Reviewed-by: Christoph Hellwig 
---
 drivers/scsi/sd_zbc.c | 15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index ca73c46931c0..44b64b4a922a 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -373,7 +373,7 @@ static int sd_zbc_check_zoned_characteristics(struct 
scsi_disk *sdkp,
  * Returns the zone size in number of blocks upon success or an error code
  * upon failure.
  */
-static s64 sd_zbc_check_zones(struct scsi_disk *sdkp)
+static s32 sd_zbc_check_zones(struct scsi_disk *sdkp)
 {
u64 zone_blocks = 0;
sector_t max_lba, block = 0;
@@ -472,7 +472,7 @@ static s64 sd_zbc_check_zones(struct scsi_disk *sdkp)
if (sdkp->first_scan)
sd_printk(KERN_NOTICE, sdkp,
  "Zone size too large\n");
-   ret = -ENODEV;
+   ret = -EFBIG;
} else {
ret = zone_blocks;
}
@@ -668,8 +668,7 @@ static int sd_zbc_setup(struct scsi_disk *sdkp, u32 
zone_blocks)
 
 int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
 {
-   int64_t zone_blocks;
-   int ret;
+   int ret, zone_blocks;
 
if (!sd_is_zoned(sdkp))
/*
@@ -688,12 +687,10 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned 
char *buf)
 * an eventual last runt zone) that is a power of 2 are supported.
 */
zone_blocks = sd_zbc_check_zones(sdkp);
-   ret = -EFBIG;
-   if (zone_blocks != (u32)zone_blocks)
-   goto err;
-   ret = zone_blocks;
-   if (ret < 0)
+   if (zone_blocks < 0) {
+   ret = zone_blocks;
goto err;
+   }
 
/* The drive satisfies the kernel restrictions: set it up */
ret = sd_zbc_setup(sdkp, zone_blocks);
-- 
2.17.1



[PATCH v2 00/11] Zoned block device support improvements

2018-10-11 Thread Damien Le Moal
This series improves zoned block device support (reduce overhead) and
introduces many simplifications to the code (overall, there are more deletions
than insertions).

In more details:
* Patches 1 to 3 are SCSI side (sd driver) cleanups and improvements reducing
  the overhead of report zones command execution during disk scan and
  revalidation.
* Patches 4 to 9 improve the usability and user API of zoned block devices.
* Patch 10 is the main part of this series. This patch replaces the
  REQ_OP_ZONE_REPORT BIO/request operation for executing report zones commands
  with a block device file operation, removing the need for the command reply
  payload in-place rewriting in the BIO buffer. This leads to major
  simplification of the code in many places.
* Patch 11 further simplifies the code of low level drivers by providing a
  generic implementation of zoned block device request queue zone bitmaps
  initialization and revalidation.

Please consider the addition of these patches in 4.20.
Comments are as always welcome.

Changes from v1:
* Addressed Christoph's and Bart's comments
* Fixed several compilation errors with zoned block device support disabled
* Rebased on latest rc including the most recent dm patches

Christoph Hellwig (1):
  block: add a report_zones method

Damien Le Moal (10):
  scsi: sd_zbc: Rearrange code
  scsi: sd_zbc: Reduce boot device scan and revalidate time
  scsi: sd_zbc: Fix sd_zbc_check_zones() error checks
  block: Introduce blkdev_nr_zones() helper
  block: Limit allocation of zone descriptors for report zones
  block: Introduce BLKGETZONESZ ioctl
  block: Introduce BLKGETNRZONES ioctl
  block: Improve zone reset execution
  block: Expose queue nr_zones in sysfs
  block: Introduce blk_revalidate_disk_zones()

 block/blk-core.c   |   1 -
 block/blk-lib.c|  13 +-
 block/blk-mq-debugfs.c |   1 -
 block/blk-sysfs.c  |  13 +
 block/blk-zoned.c  | 359 +++-
 block/blk.h|   8 +
 block/ioctl.c  |   4 +
 drivers/block/null_blk.h   |  11 +-
 drivers/block/null_blk_main.c  |  30 +-
 drivers/block/null_blk_zoned.c |  57 +---
 drivers/md/dm-flakey.c |  30 +-
 drivers/md/dm-linear.c |  35 ++-
 drivers/md/dm-table.c  |  10 +
 drivers/md/dm-zoned-target.c   |   3 +-
 drivers/md/dm.c| 169 ++-
 drivers/scsi/sd.c  |  15 +-
 drivers/scsi/sd.h  |  15 +-
 drivers/scsi/sd_zbc.c  | 497 +
 include/linux/blk_types.h  |   2 -
 include/linux/blkdev.h |  22 +-
 include/linux/device-mapper.h  |  12 +-
 include/trace/events/f2fs.h|   1 -
 include/uapi/linux/blkzoned.h  |   3 +
 23 files changed, 591 insertions(+), 720 deletions(-)

-- 
2.17.1



[PATCH v2 05/11] block: Limit allocation of zone descriptors for report zones

2018-10-11 Thread Damien Le Moal
There is no point in allocating more zone descriptors than the number of
zones a block device has for doing a zone report. Avoid doing that in
blkdev_report_zones_ioctl() by limiting the number of zone descriptors
allocated internally to process the user request.

Signed-off-by: Damien Le Moal 
Reviewed-by: Christoph Hellwig 
---
 block/blk-zoned.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 32e377f755d8..bb4ed69f917f 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -355,8 +355,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, 
fmode_t mode,
if (!rep.nr_zones)
return -EINVAL;
 
-   if (rep.nr_zones > INT_MAX / sizeof(struct blk_zone))
-   return -ERANGE;
+   rep.nr_zones = min(blkdev_nr_zones(bdev), rep.nr_zones);
 
zones = kvmalloc_array(rep.nr_zones, sizeof(struct blk_zone),
   GFP_KERNEL | __GFP_ZERO);
-- 
2.17.1



[PATCH v2 01/11] scsi: sd_zbc: Rearrange code

2018-10-11 Thread Damien Le Moal
Move the urswrz check out of sd_zbc_read_zones() and into
sd_zbc_read_zoned_characteristics() where that value is obtained (read
from the disk zoned characteristics VPD page). Since this function now
does more than simply reading the VPD page, rename it to
sd_zbc_check_zoned_characteristics().
Also fix the error message displayed when reading that VPD page fails.

Signed-off-by: Damien Le Moal 
Reviewed-by: Christoph Hellwig 
---
 drivers/scsi/sd_zbc.c | 39 +++
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 412c1787dcd9..0b7d8787f785 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -305,19 +305,19 @@ void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int 
good_bytes,
 }
 
 /**
- * sd_zbc_read_zoned_characteristics - Read zoned block device characteristics
+ * sd_zbc_check_zoned_characteristics - Check zoned block device 
characteristics
  * @sdkp: Target disk
  * @buf: Buffer where to store the VPD page data
  *
- * Read VPD page B6.
+ * Read VPD page B6, get information and check that reads are unconstrained.
  */
-static int sd_zbc_read_zoned_characteristics(struct scsi_disk *sdkp,
-unsigned char *buf)
+static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp,
+ unsigned char *buf)
 {
 
if (scsi_get_vpd_page(sdkp->device, 0xb6, buf, 64)) {
sd_printk(KERN_NOTICE, sdkp,
- "Unconstrained-read check failed\n");
+ "Read zoned characteristics VPD page failed\n");
return -ENODEV;
}
 
@@ -335,6 +335,18 @@ static int sd_zbc_read_zoned_characteristics(struct 
scsi_disk *sdkp,
sdkp->zones_max_open = get_unaligned_be32([16]);
}
 
+   /*
+* Check for unconstrained reads: host-managed devices with
+* constrained reads (drives failing read after write pointer)
+* are not supported.
+*/
+   if (!sdkp->urswrz) {
+   if (sdkp->first_scan)
+   sd_printk(KERN_NOTICE, sdkp,
+ "constrained reads devices are not supported\n");
+   return -ENODEV;
+   }
+
return 0;
 }
 
@@ -675,24 +687,11 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned 
char *buf)
 */
return 0;
 
-   /* Get zoned block device characteristics */
-   ret = sd_zbc_read_zoned_characteristics(sdkp, buf);
+   /* Check zoned block device characteristics (unconstrained reads) */
+   ret = sd_zbc_check_zoned_characteristics(sdkp, buf);
if (ret)
goto err;
 
-   /*
-* Check for unconstrained reads: host-managed devices with
-* constrained reads (drives failing read after write pointer)
-* are not supported.
-*/
-   if (!sdkp->urswrz) {
-   if (sdkp->first_scan)
-   sd_printk(KERN_NOTICE, sdkp,
- "constrained reads devices are not supported\n");
-   ret = -ENODEV;
-   goto err;
-   }
-
/* Check capacity */
ret = sd_zbc_check_capacity(sdkp, buf);
if (ret)
-- 
2.17.1



[PATCH v2 07/11] block: Introduce BLKGETNRZONES ioctl

2018-10-11 Thread Damien Le Moal
Get a zoned block device total number of zones. The device can be a
partition of the whole device. The number of zones is always 0 for
regular block devices.

Signed-off-by: Damien Le Moal 
---
 block/ioctl.c | 2 ++
 include/uapi/linux/blkzoned.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/block/ioctl.c b/block/ioctl.c
index f6d2c6f1f050..4825c78a6baa 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -534,6 +534,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, 
unsigned cmd,
return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg);
case BLKGETZONESZ:
return put_uint(arg, bdev_zone_sectors(bdev));
+   case BLKGETNRZONES:
+   return put_uint(arg, blkdev_nr_zones(bdev));
case HDIO_GETGEO:
return blkdev_getgeo(bdev, argp);
case BLKRAGET:
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index 281ac605f752..8f08ff9bdea0 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -142,5 +142,6 @@ struct blk_zone_range {
 #define BLKREPORTZONE  _IOWR(0x12, 130, struct blk_zone_report)
 #define BLKRESETZONE   _IOW(0x12, 131, struct blk_zone_range)
 #define BLKGETZONESZ   _IOW(0x12, 132, __u32)
+#define BLKGETNRZONES  _IOW(0x12, 133, __u32)
 
 #endif /* _UAPI_BLKZONED_H */
-- 
2.17.1



[PATCH v2 04/11] block: Introduce blkdev_nr_zones() helper

2018-10-11 Thread Damien Le Moal
Introduce the blkdev_nr_zones() helper function to get the total
number of zones of a zoned block device. This number is always 0 for a
regular block device (q->limits.zoned == BLK_ZONED_NONE case).

Replace hard-coded number of zones calculation in dmz_get_zoned_device()
with a call to this helper.

Signed-off-by: Damien Le Moal 
Reviewed-by: Christoph Hellwig 
---
 block/blk-zoned.c| 27 +++
 drivers/md/dm-zoned-target.c |  3 +--
 include/linux/blkdev.h   |  5 +
 3 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index c461cf63f1f4..32e377f755d8 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -63,6 +63,33 @@ void __blk_req_zone_write_unlock(struct request *rq)
 }
 EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
 
+static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
+sector_t nr_sectors)
+{
+   unsigned long zone_sectors = blk_queue_zone_sectors(q);
+
+   return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
+}
+
+/**
+ * blkdev_nr_zones - Get number of zones
+ * @bdev:  Target block device
+ *
+ * Description:
+ *Return the total number of zones of a zoned block device.
+ *For a regular block device, the number of zones is always 0.
+ */
+unsigned int blkdev_nr_zones(struct block_device *bdev)
+{
+   struct request_queue *q = bdev_get_queue(bdev);
+
+   if (!blk_queue_is_zoned(q))
+   return 0;
+
+   return __blkdev_nr_zones(q, bdev->bd_part->nr_sects);
+}
+EXPORT_SYMBOL_GPL(blkdev_nr_zones);
+
 /*
  * Check that a zone report belongs to the partition.
  * If yes, fix its start sector and write pointer, copy it in the
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index a44183ff4be0..12d96a263623 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -702,8 +702,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char 
*path)
dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks);
 
-   dev->nr_zones = (dev->capacity + dev->zone_nr_sectors - 1)
-   >> dev->zone_nr_sectors_shift;
+   dev->nr_zones = blkdev_nr_zones(dev->bdev);
 
dmz->dev = dev;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6980014357d4..c24969b1741b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -401,6 +401,7 @@ struct blk_zone_report_hdr {
u8  padding[60];
 };
 
+extern unsigned int blkdev_nr_zones(struct block_device *bdev);
 extern int blkdev_report_zones(struct block_device *bdev,
   sector_t sector, struct blk_zone *zones,
   unsigned int *nr_zones, gfp_t gfp_mask);
@@ -414,6 +415,10 @@ extern int blkdev_reset_zones_ioctl(struct block_device 
*bdev, fmode_t mode,
 
 #else /* CONFIG_BLK_DEV_ZONED */
 
+static inline unsigned int blkdev_nr_zones(struct block_device *bdev)
+{
+   return 0;
+}
 static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
fmode_t mode, unsigned int cmd,
unsigned long arg)
-- 
2.17.1



Re: [RFC PATCH 2/5] break sd_done sense processing out to own function

2018-08-05 Thread Damien Le Moal
nt->result = 0;
> - memset(SCpnt->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
> - break;
> - case ABORTED_COMMAND:
> - if (sshdr.asc == 0x10)  /* DIF: Target detected corruption */
> - good_bytes = sd_completed_bytes(SCpnt);
> - break;
> - case ILLEGAL_REQUEST:
> - switch (sshdr.asc) {
> - case 0x10:  /* DIX: Host detected corruption */
> - good_bytes = sd_completed_bytes(SCpnt);
> - break;
> - case 0x20:  /* INVALID COMMAND OPCODE */
> - case 0x24:  /* INVALID FIELD IN CDB */
> - switch (SCpnt->cmnd[0]) {
> - case UNMAP:
> - sd_config_discard(sdkp, SD_LBP_DISABLE);
> - break;
> - case WRITE_SAME_16:
> - case WRITE_SAME:
> - if (SCpnt->cmnd[1] & 8) { /* UNMAP */
> - sd_config_discard(sdkp, SD_LBP_DISABLE);
> - } else {
> - sdkp->device->no_write_same = 1;
> -         sd_config_write_same(sdkp);
> - req->__data_len = blk_rq_bytes(req);
> - req->rq_flags |= RQF_QUIET;
> - }
> - break;
> - }
> - }
> - break;
> - default:
> - break;
> - }
> -
> - out:
>   if (sd_is_zoned(sdkp))
>   sd_zbc_complete(SCpnt, good_bytes, );
>  
> 


-- 
Damien Le Moal
Western Digital Research


Re: [RFC PATCH 1/5] add tweakable bounds_check flag, now off by default

2018-08-05 Thread Damien Le Moal
md)
>   goto out;
>   }
>  
> - if (unlikely(blk_rq_pos(rq) + blk_rq_sectors(rq)
> -  > logical_to_sectors(sdp, sdkp->capacity))) {
> - scmd_printk(KERN_ERR, cmd, "access beyond end of device\n");
> - ret = BLKPREP_KILL;
> - goto out;
> - }
> + if (sdkp->bounds_check) {
> + unsigned int mask = logical_to_sectors(sdp, 1) - 1;
>  
> - if (unlikely((blk_rq_pos(rq) & mask) || (blk_rq_sectors(rq) & mask))) {
> - scmd_printk(KERN_ERR, cmd, "request not aligned to the logical 
> block size\n");
> - ret = BLKPREP_KILL;
> - goto out;
> + if (unlikely(blk_rq_pos(rq) + blk_rq_sectors(rq)
> +  > logical_to_sectors(sdp, sdkp->capacity))) {
> + scmd_printk(KERN_ERR, cmd,
> + "access beyond end of device\n");
> + ret = BLKPREP_KILL;
> + goto out;
> + }
> +
> + if (unlikely((blk_rq_pos(rq) & mask) ||
> +  (blk_rq_sectors(rq) & mask))) {
> + scmd_printk(KERN_ERR, cmd,
> + "request not aligned to logical block size\n");
> + ret = BLKPREP_KILL;
> + goto out;
> +     }
>   }
>  
>   /*
> diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
> index 392c7d078ae3..6f58d130fb75 100644
> --- a/drivers/scsi/sd.h
> +++ b/drivers/scsi/sd.h
> @@ -117,6 +117,7 @@ struct scsi_disk {
>   unsignedurswrz : 1;
>   unsignedsecurity : 1;
>   unsignedignore_medium_access_errors : 1;
> + unsignedbounds_check : 1;
>  };
>  #define to_scsi_disk(obj) container_of(obj,struct scsi_disk,dev)
>  
> 


-- 
Damien Le Moal
Western Digital Research


Re: [RFC PATCH 3/5] streamline REQ_OP_READ-WRITE access

2018-08-05 Thread Damien Le Moal
expected REQ_OP=%u\n",
> +   (unsigned int)req_op(req));
> + WARN_ON_ONCE(true);
> + break;
> + }
>   }
>  
>   if (result) {
> @@ -3597,6 +3614,17 @@ static int __init init_sd(void)
>   int majors = 0, i, err;
>  
>   SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n"));
> + /*
> +  * The sd_init_command() and sd_done() assume REQ_OP_READ and
> +  * REQ_OP_WRITE are 0 and 1 and will fail if they are not. If they
> +  * are not, would prefer a compile failure but the preprocessor can't
> +  * use enum constants. Place check here because only need to check
> +  * early and once.
> +  */
> + if (REQ_OP_READ + REQ_OP_WRITE > 1)
> + pr_err("%s: REQ_OP_READ=%d REQ_OP_WRITE=%d %s.\n", __func__,
> +REQ_OP_READ, REQ_OP_WRITE,
> +"expected 0 and 1. Logic ERROR");
>  
>   for (i = 0; i < SD_MAJORS; i++) {
>   if (register_blkdev(sd_major(i), "sd") != 0)
> 


-- 
Damien Le Moal
Western Digital Research


Re: [RFC PATCH 4/5] streamline some logical operations

2018-08-05 Thread Damien Le Moal
On 2018/08/06 13:51, Douglas Gilbert wrote:
> Re-arrange some logic to lessen the number of checks. With logical
> ANDs put the least likely first, with logical ORs put the most
> likely first. Also add conditional hints on the assumed fastpath.
> 
> Signed-off-by: Douglas Gilbert 
> ---
>  drivers/scsi/sd.c | 43 ---
>  1 file changed, 24 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index 9f047fd3c92d..05014054e357 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -1171,9 +1171,9 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd 
> *cmd)
>  
>   fua = (rq->cmd_flags & REQ_FUA) ? 0x8 : 0;
>   dix = scsi_prot_sg_count(cmd);
> - dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type);
> + dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type);
>  
> - if (write && dix)
> + if (dix && write)
>   sd_dif_prepare(cmd);
>  
>   if (dif || dix)
> @@ -1181,19 +1181,27 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd 
> *cmd)
>   else
>   protect = 0;
>  
> - if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) {
> + if (unlikely(protect &&
> +  sdkp->protection_type == T10_PI_TYPE2_PROTECTION))
>   ret = sd_setup_read_write_32_cmnd(cmd, write, lba, nr_blocks,
> protect | fua);
> - } else if (sdp->use_16_for_rw || (nr_blocks > 0x)) {
> + else if (sdp->use_16_for_rw)
>   ret = sd_setup_read_write_16_cmnd(cmd, write, lba, nr_blocks,
> protect | fua);

So here, without use_16_for_rw being forced on (which is the case for most disks
I think, except ZBC disks which mandate it) or most disks, all read/write to low
LBAs will have to go through a longer chain of if/else if... Is this change
really such a gain in average ? It looks like this will be a loss for the first
small partition at the beginning of the disk.

> - } else if ((nr_blocks > 0xff) || (lba > 0x1f) || sdp->use_10_for_rw
> -|| protect) {
> - ret = sd_setup_read_write_10_cmnd(cmd, write, lba, nr_blocks,
> -   protect | fua);
> - } else {
> - ret = sd_setup_read_write_6_cmnd(cmd, write, lba, nr_blocks,
> -  protect | fua);
> + else if (likely(nr_blocks < 0x100)) {
> + if (sdp->use_10_for_rw || (lba > 0x1f) || protect)
> + ret = sd_setup_read_write_10_cmnd(cmd, write, lba,
> +  nr_blocks, protect | fua);
> + else
> + ret = sd_setup_read_write_6_cmnd(cmd, write, lba,
> +  nr_blocks, protect | fua);
> + } else {/* not already done and nr_blocks > 0xff */
> + if (unlikely(nr_blocks > 0x))
> + ret = sd_setup_read_write_16_cmnd(cmd, write, lba,
> +  nr_blocks, protect | fua);
> + else
> + ret = sd_setup_read_write_10_cmnd(cmd, write, lba,
> +  nr_blocks, protect | fua);
>   }
>  
>   if (ret != BLKPREP_OK)
> @@ -1976,7 +1984,6 @@ static int sd_done(struct scsi_cmnd *SCpnt)
>   struct scsi_disk *sdkp = scsi_disk(SCpnt->request->rq_disk);
>   struct request *req = SCpnt->request;
>   int sense_valid = 0;
> - int sense_deferred = 0;
>  
>   /*
>* Assumption that REQ_OP_READ and REQ_OP_WRITE are 0 and 1 is
> @@ -2030,16 +2037,14 @@ static int sd_done(struct scsi_cmnd *SCpnt)
>   }
>   }
>  
> - if (result) {
> + if (unlikely(result)) {
>   sense_valid = scsi_command_normalize_sense(SCpnt, );
> - if (sense_valid)
> - sense_deferred = scsi_sense_is_deferred();
> + if (driver_byte(result) == DRIVER_SENSE ||
> + (sense_valid && (!scsi_sense_is_deferred(
> + good_bytes = sd_done_sense(SCpnt, good_bytes, );
>   }
> - sdkp->medium_access_timed_out = 0;
>  
> - if (unlikely(driver_byte(result) == DRIVER_SENSE ||
> -  (sense_valid && !sense_deferred)))
> - good_bytes = sd_done_sense(SCpnt, good_bytes, );
> + sdkp->medium_access_timed_out = 0;
>  
>   if (sd_is_zoned(sdkp))
>   sd_zbc_complete(SCpnt, good_bytes, );
> 


-- 
Damien Le Moal
Western Digital Research


[PATCH] scsi: sd_zbc: Fix variable type and bogus comment

2018-07-03 Thread Damien Le Moal
Fix the description of sd_zbc_check_zone_size() to correctly explain
that the returned value is a number of device blocks, not bytes.
Additionally, the 32 bits "ret" variable used in this function may
truncate the 64 bits zone_blocks variable value upon return. To fix
this, change "ret" type to s64.

Fixes: ccce20fc79 ("sd_zbc: Avoid that resetting a zone fails sporadically")
Signed-off-by: Damien Le Moal 
Cc: Bart Van Assche 
Cc: sta...@kernel.org
---
 drivers/scsi/sd_zbc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index a14fef11776e..2bf3bf73886e 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -391,7 +391,8 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, 
unsigned char *buf)
  * Check that all zones of the device are equal. The last zone can however
  * be smaller. The zone size must also be a power of two number of LBAs.
  *
- * Returns the zone size in bytes upon success or an error code upon failure.
+ * Returns the zone size in number of blocks upon success or an error code
+ * upon failure.
  */
 static s64 sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 {
@@ -401,7 +402,7 @@ static s64 sd_zbc_check_zone_size(struct scsi_disk *sdkp)
unsigned char *rec;
unsigned int buf_len;
unsigned int list_length;
-   int ret;
+   s64 ret;
u8 same;
 
/* Get a buffer */
-- 
2.17.1



[PATCH V2 2/2] ata: Fix ZBC_OUT all bit handling

2018-06-26 Thread Damien Le Moal
If the ALL bit is set in the ZBC_OUT command, the command zone ID field
(block) should be ignored.

Reported-by: David Butterfield 
Signed-off-by: Damien Le Moal 
Cc: sta...@vger.kernel.org
---
 drivers/ata/libata-scsi.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index a5543751f446..aad1b01447de 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -3805,7 +3805,14 @@ static unsigned int ata_scsi_zbc_out_xlat(struct 
ata_queued_cmd *qc)
 */
goto invalid_param_len;
}
-   if (block >= dev->n_sectors) {
+
+   all = cdb[14] & 0x1;
+   if (all) {
+   /*
+* Ignore the block address (zone ID) as defined by ZBC.
+*/
+   block = 0;
+   } else if (block >= dev->n_sectors) {
/*
 * Block must be a valid zone ID (a zone start LBA).
 */
@@ -3813,8 +3820,6 @@ static unsigned int ata_scsi_zbc_out_xlat(struct 
ata_queued_cmd *qc)
goto invalid_fld;
}
 
-   all = cdb[14] & 0x1;
-
if (ata_ncq_enabled(qc->dev) &&
ata_fpdma_zac_mgmt_out_supported(qc->dev)) {
tf->protocol = ATA_PROT_NCQ_NODATA;
-- 
2.17.1



[PATCH V2 1/2] ata: Fix ZBC_OUT command block check

2018-06-26 Thread Damien Le Moal
The block (LBA) specified must not exceed the last addressable LBA,
which is dev->n_sectors - 1. So the correct check is
"if (block >= dev->n_sectors)" and not "if (block > dev->n_sectors)".

Additionally, the asc/ascq to return for an LBA that is not a zone start
LBA should be ILLEGAL REQUEST, regardless if the bad LBA is out of
range.

Reported-by: David Butterfield 
Signed-off-by: Damien Le Moal 
Cc: sta...@vger.kernel.org
---
 drivers/ata/libata-scsi.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 6a91d04351d9..a5543751f446 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -3805,8 +3805,13 @@ static unsigned int ata_scsi_zbc_out_xlat(struct 
ata_queued_cmd *qc)
 */
goto invalid_param_len;
}
-   if (block > dev->n_sectors)
-   goto out_of_range;
+   if (block >= dev->n_sectors) {
+   /*
+* Block must be a valid zone ID (a zone start LBA).
+*/
+   fp = 2;
+   goto invalid_fld;
+   }
 
all = cdb[14] & 0x1;
 
@@ -3837,10 +3842,6 @@ static unsigned int ata_scsi_zbc_out_xlat(struct 
ata_queued_cmd *qc)
  invalid_fld:
ata_scsi_set_invalid_field(qc->dev, scmd, fp, 0xff);
return 1;
- out_of_range:
-   /* "Logical Block Address out of range" */
-   ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x21, 0x00);
-   return 1;
 invalid_param_len:
/* "Parameter list length error" */
ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x1a, 0x0);
-- 
2.17.1



[PATCH V2 0/2] ZBC_OUT command translation fixes

2018-06-26 Thread Damien Le Moal
Tejun,

These two patches fix problems with the checks of the ZBC_OUT command fields
prior to its translation to ZAC MANAGEMENT OUT.

The first patch fixes an incorrect out-of-range check and changes the returned
asc/ascq to the ZBC defined INVALID FIELD IN CDB instead of (the more natural
but incorrect) LBA OUT OF RANGE.

The second patch disables the ZBC_OUT command block address check if the ALL
bit is set, as defined by the ZBC specifications.

Thank you for considering these patches for inclusion in 4.18 fixes (and CC
stable).

Damien Le Moal (2):
  ata: Fix ZBC_OUT command block check
  ata: Fix ZBC_OUT all bit handling

 drivers/ata/libata-scsi.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

Changes from V1: Added "Cc: stable"

-- 
2.17.1



[PATCH V2 0/2] ZBC_OUT command translation fixes

2018-06-26 Thread Damien Le Moal
Tejun,

These two patches fix problems with the checks of the ZBC_OUT command fields
prior to its translation to ZAC MANAGEMENT OUT.

The first patch fixes an incorrect out-of-range check and changes the returned
asc/ascq to the ZBC defined INVALID FIELD IN CDB instead of (the more natural
but incorrect) LBA OUT OF RANGE.

The second patch disables the ZBC_OUT command block address check if the ALL
bit is set, as defined by the ZBC specifications.

Thank you for considering these patches for inclusion in 4.18 fixes (and CC
stable).

Damien Le Moal (2):
  ata: Fix ZBC_OUT command block check
  ata: Fix ZBC_OUT all bit handling

 drivers/ata/libata-scsi.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

Changes from V1: Added "Cc: stable"

-- 
2.17.1



[PATCH 1/2] ata: Fix ZBC_OUT command block check

2018-06-26 Thread Damien Le Moal
The block (LBA) specified must not exceed the last addressable LBA,
which is dev->n_sectors - 1. So the correct check is
"if (block >= dev->n_sectors)" and not "if (block > dev->n_sectors)".

Additionally, the asc/ascq to return for an LBA that is not a zone start
LBA should be ILLEGAL REQUEST, regardless if the bad LBA is out of
range.

Reported-by: David Butterfield 
Signed-off-by: Damien Le Moal 
---
 drivers/ata/libata-scsi.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 6a91d04351d9..a5543751f446 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -3805,8 +3805,13 @@ static unsigned int ata_scsi_zbc_out_xlat(struct 
ata_queued_cmd *qc)
 */
goto invalid_param_len;
}
-   if (block > dev->n_sectors)
-   goto out_of_range;
+   if (block >= dev->n_sectors) {
+   /*
+* Block must be a valid zone ID (a zone start LBA).
+*/
+   fp = 2;
+   goto invalid_fld;
+   }
 
all = cdb[14] & 0x1;
 
@@ -3837,10 +3842,6 @@ static unsigned int ata_scsi_zbc_out_xlat(struct 
ata_queued_cmd *qc)
  invalid_fld:
ata_scsi_set_invalid_field(qc->dev, scmd, fp, 0xff);
return 1;
- out_of_range:
-   /* "Logical Block Address out of range" */
-   ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x21, 0x00);
-   return 1;
 invalid_param_len:
/* "Parameter list length error" */
ata_scsi_set_sense(qc->dev, scmd, ILLEGAL_REQUEST, 0x1a, 0x0);
-- 
2.17.1



[PATCH 2/2] ata: Fix ZBC_OUT all bit handling

2018-06-26 Thread Damien Le Moal
If the ALL bit is set in the ZBC_OUT command, the command zone ID field
(block) should be ignored.

Reported-by: David Butterfield 
Signed-off-by: Damien Le Moal 
---
 drivers/ata/libata-scsi.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index a5543751f446..aad1b01447de 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -3805,7 +3805,14 @@ static unsigned int ata_scsi_zbc_out_xlat(struct 
ata_queued_cmd *qc)
 */
goto invalid_param_len;
}
-   if (block >= dev->n_sectors) {
+
+   all = cdb[14] & 0x1;
+   if (all) {
+   /*
+* Ignore the block address (zone ID) as defined by ZBC.
+*/
+   block = 0;
+   } else if (block >= dev->n_sectors) {
/*
 * Block must be a valid zone ID (a zone start LBA).
 */
@@ -3813,8 +3820,6 @@ static unsigned int ata_scsi_zbc_out_xlat(struct 
ata_queued_cmd *qc)
goto invalid_fld;
}
 
-   all = cdb[14] & 0x1;
-
if (ata_ncq_enabled(qc->dev) &&
ata_fpdma_zac_mgmt_out_supported(qc->dev)) {
tf->protocol = ATA_PROT_NCQ_NODATA;
-- 
2.17.1



[PATCH 0/2] ZBC_OUT command translation fixes

2018-06-26 Thread Damien Le Moal
Tejun,

These two patches fix problems with the checks of the ZBC_OUT command fields
prior to its translation to ZAC MANAGEMENT OUT.

The first patch fixes an incorrect out-of-range check and changes the returned
asc/ascq to the ZBC defined INVALID FIELD IN CDB instead of (the more natural
but incorrect) LBA OUT OF RANGE.

The second patch disables the ZBC_OUT command block address check if the ALL
bit is set, as defined by the ZBC specifications.

Thank you for considering these patches for inclusion in 4.18 fixes (and CC
stable).

Damien Le Moal (2):
  ata: Fix ZBC_OUT command block check
  ata: Fix ZBC_OUT all bit handling

 drivers/ata/libata-scsi.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

-- 
2.17.1



[PATCH v2 0/2] 4.14 long term stable ZBC fixes

2018-06-04 Thread Damien Le Moal
Patch 4b433924b275 ("scsi: sd_zbc: Fix potential memory leak") was added in
4.16 and 4.15 stable but did not make it to long term stable 4.14 (as far as I
can tell).

Patch ccce20fc7968 ("scsi: sd_zbc: Avoid that resetting a zone fails
sporadically") is included in 4.16 but does not apply to 4.15 stable nor to
4.14 long term stable and requires extensive modifications.

This small series provides a backport of both patches against 4.14. Please
consider these patches for inclusion in this long term stable kernel.

Bart Van Assche (1):
  scsi: sd_zbc: Avoid that resetting a zone fails sporadically

Damien Le Moal (1):
  scsi: sd_zbc: Fix potential memory leak

 drivers/scsi/sd_zbc.c | 128 +-
 1 file changed, 76 insertions(+), 52 deletions(-)

Changes from v1:
* Fixed upstream commit reference in the first patch commit message

-- 
2.17.0



[PATCH v2 1/2] scsi: sd_zbc: Fix potential memory leak

2018-06-04 Thread Damien Le Moal
[Backport of upstream commit 4b433924b2755a94f99258c178684a0e05c344de]

Rework sd_zbc_check_zone_size() to avoid a memory leak due to an early
return if sd_zbc_report_zones() fails.

Signed-off-by: Damien Le Moal 
Cc: sta...@vger.kernel.org # 4.14
---
 drivers/scsi/sd_zbc.c | 34 +++---
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 2eb61d54bbb4..bc3cb81a9c7d 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -425,7 +425,7 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp,
 
 static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 {
-   u64 zone_blocks;
+   u64 zone_blocks = 0;
sector_t block = 0;
unsigned char *buf;
unsigned char *rec;
@@ -443,10 +443,8 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 
/* Do a report zone to get the same field */
ret = sd_zbc_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0);
-   if (ret) {
-   zone_blocks = 0;
-   goto out;
-   }
+   if (ret)
+   goto out_free;
 
same = buf[4] & 0x0f;
if (same > 0) {
@@ -489,7 +487,7 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
ret = sd_zbc_report_zones(sdkp, buf,
  SD_ZBC_BUF_SIZE, block);
if (ret)
-   return ret;
+   goto out_free;
}
 
} while (block < sdkp->capacity);
@@ -497,34 +495,32 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
zone_blocks = sdkp->zone_blocks;
 
 out:
-   kfree(buf);
-
if (!zone_blocks) {
if (sdkp->first_scan)
sd_printk(KERN_NOTICE, sdkp,
  "Devices with non constant zone "
  "size are not supported\n");
-   return -ENODEV;
-   }
-
-   if (!is_power_of_2(zone_blocks)) {
+   ret = -ENODEV;
+   } else if (!is_power_of_2(zone_blocks)) {
if (sdkp->first_scan)
sd_printk(KERN_NOTICE, sdkp,
  "Devices with non power of 2 zone "
  "size are not supported\n");
-   return -ENODEV;
-   }
-
-   if (logical_to_sectors(sdkp->device, zone_blocks) > UINT_MAX) {
+   ret = -ENODEV;
+   } else if (logical_to_sectors(sdkp->device, zone_blocks) > UINT_MAX) {
if (sdkp->first_scan)
sd_printk(KERN_NOTICE, sdkp,
  "Zone size too large\n");
-   return -ENODEV;
+   ret = -ENODEV;
+   } else {
+   sdkp->zone_blocks = zone_blocks;
+   sdkp->zone_shift = ilog2(zone_blocks);
}
 
-   sdkp->zone_blocks = zone_blocks;
+out_free:
+   kfree(buf);
 
-   return 0;
+   return ret;
 }
 
 static int sd_zbc_setup(struct scsi_disk *sdkp)
-- 
2.17.0



[PATCH v2 2/2] scsi: sd_zbc: Avoid that resetting a zone fails sporadically

2018-06-04 Thread Damien Le Moal
From: Bart Van Assche 

[Backport of upstream commit ccce20fc7968d546fb1e8e147bf5cdc8afc4278a]

Since SCSI scanning occurs asynchronously, since sd_revalidate_disk() is
called from sd_probe_async() and since sd_revalidate_disk() calls
sd_zbc_read_zones() it can happen that sd_zbc_read_zones() is called
concurrently with blkdev_report_zones() and/or blkdev_reset_zones().
That can cause these functions to fail with -EIO because
sd_zbc_read_zones() sets sdkp->nr_zones to zero before restoring it to
the actual value, even if no drive characteristics have changed.
Avoid that this can happen by making the following changes:

- Protect the code that updates zone information with blk_mq_freeze()
  and blk_mq_unfreeze().
- Modify sd_zbc_setup() such that these functions do not modify
  struct scsi_disk before all zone information has been obtained.
- Reallocate the zone write lock bitmap if the number of zones changed.

Note: since commit 055f6e18e08f ("block: Make q_usage_counter also track
legacy requests"; kernel v4.15) the request queue freezing mechanism also
affects legacy request queues.

Fixes: 89d947561077 ("sd: Implement support for ZBC devices")
Signed-off-by: Bart Van Assche 
[Damien]
* Backport for 4.14-stable
* Updated this commit message
Signed-off-by: Damien Le Moal 
Cc: sta...@vger.kernel.org # 4.14
---
 drivers/scsi/sd_zbc.c | 98 +++
 1 file changed, 63 insertions(+), 35 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index bc3cb81a9c7d..ea9e1e0ed5b8 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -423,7 +423,16 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp,
 
 #define SD_ZBC_BUF_SIZE 131072
 
-static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
+/**
+ * sd_zbc_check_zone_size - Check the device zone sizes
+ * @sdkp: Target disk
+ *
+ * Check that all zones of the device are equal. The last zone can however
+ * be smaller. The zone size must also be a power of two number of LBAs.
+ *
+ * Returns the zone size in bytes upon success or an error code upon failure.
+ */
+static s64 sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 {
u64 zone_blocks = 0;
sector_t block = 0;
@@ -434,8 +443,6 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
int ret;
u8 same;
 
-   sdkp->zone_blocks = 0;
-
/* Get a buffer */
buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
if (!buf)
@@ -470,16 +477,17 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 
/* Parse zone descriptors */
while (rec < buf + buf_len) {
-   zone_blocks = get_unaligned_be64([8]);
-   if (sdkp->zone_blocks == 0) {
-   sdkp->zone_blocks = zone_blocks;
-   } else if (zone_blocks != sdkp->zone_blocks &&
-  (block + zone_blocks < sdkp->capacity
-   || zone_blocks > sdkp->zone_blocks)) {
+   u64 this_zone_blocks = get_unaligned_be64([8]);
+
+   if (zone_blocks == 0) {
+   zone_blocks = this_zone_blocks;
+   } else if (this_zone_blocks != zone_blocks &&
+  (block + this_zone_blocks < sdkp->capacity
+   || this_zone_blocks > zone_blocks)) {
zone_blocks = 0;
goto out;
}
-   block += zone_blocks;
+   block += this_zone_blocks;
rec += 64;
}
 
@@ -492,8 +500,6 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 
} while (block < sdkp->capacity);
 
-   zone_blocks = sdkp->zone_blocks;
-
 out:
if (!zone_blocks) {
if (sdkp->first_scan)
@@ -513,8 +519,7 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
  "Zone size too large\n");
ret = -ENODEV;
} else {
-   sdkp->zone_blocks = zone_blocks;
-   sdkp->zone_shift = ilog2(zone_blocks);
+   ret = zone_blocks;
}
 
 out_free:
@@ -523,23 +528,44 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
return ret;
 }
 
-static int sd_zbc_setup(struct scsi_disk *sdkp)
+static int sd_zbc_setup(struct scsi_disk *sdkp, u32 zone_blocks)
 {
+   struct request_queue *q = sdkp->disk->queue;
+   u32 zone_shift = ilog2(zone_blocks);
+   u32 nr_zones;
 
/* chunk_sectors indicates the zone size */
-   blk_queue_chunk_sectors(sdkp->disk->queue,
-   logical_to_sectors(sdkp->device, sdkp->zone_blocks));
-   sdkp->zone_shift

[PATCH 0/2] 4.14 long term stable ZBC fixes

2018-05-31 Thread Damien Le Moal
Patch 0aa3fdb8b3a6 ("scsi: sd_zbc: Fix potential memory leak") was added in
4.16 and 4.15 stable but did not make it to long term stable 4.14 (as far as I
can tell).

Patch ccce20fc7968 ("scsi: sd_zbc: Avoid that resetting a zone fails
sporadically") is included in 4.16 but does not apply to 4.15 stable nor to
4.14 long term stable and requires extensive modifications.

This small series provides a backport of both patches against 4.14. Please
consider these patches for inclusion in this long term stable kernel.

Bart Van Assche (1):
  scsi: sd_zbc: Avoid that resetting a zone fails sporadically

Damien Le Moal (1):
  scsi: sd_zbc: Fix potential memory leak

 drivers/scsi/sd_zbc.c | 128 +-
 1 file changed, 76 insertions(+), 52 deletions(-)

-- 
2.17.0



[PATCH 1/2] scsi: sd_zbc: Fix potential memory leak

2018-05-31 Thread Damien Le Moal
[Backport of upstream commit 0aa3fdb8b3a6df3c2e3b61dbfe079db9d30e03cd]

Rework sd_zbc_check_zone_size() to avoid a memory leak due to an early
return if sd_zbc_report_zones() fails.

Signed-off-by: Damien Le Moal 
Cc: sta...@vger.kernel.org # 4.14
---
 drivers/scsi/sd_zbc.c | 34 +++---
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 2eb61d54bbb4..bc3cb81a9c7d 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -425,7 +425,7 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp,
 
 static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 {
-   u64 zone_blocks;
+   u64 zone_blocks = 0;
sector_t block = 0;
unsigned char *buf;
unsigned char *rec;
@@ -443,10 +443,8 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 
/* Do a report zone to get the same field */
ret = sd_zbc_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0);
-   if (ret) {
-   zone_blocks = 0;
-   goto out;
-   }
+   if (ret)
+   goto out_free;
 
same = buf[4] & 0x0f;
if (same > 0) {
@@ -489,7 +487,7 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
ret = sd_zbc_report_zones(sdkp, buf,
  SD_ZBC_BUF_SIZE, block);
if (ret)
-   return ret;
+   goto out_free;
}
 
} while (block < sdkp->capacity);
@@ -497,34 +495,32 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
zone_blocks = sdkp->zone_blocks;
 
 out:
-   kfree(buf);
-
if (!zone_blocks) {
if (sdkp->first_scan)
sd_printk(KERN_NOTICE, sdkp,
  "Devices with non constant zone "
  "size are not supported\n");
-   return -ENODEV;
-   }
-
-   if (!is_power_of_2(zone_blocks)) {
+   ret = -ENODEV;
+   } else if (!is_power_of_2(zone_blocks)) {
if (sdkp->first_scan)
sd_printk(KERN_NOTICE, sdkp,
  "Devices with non power of 2 zone "
  "size are not supported\n");
-   return -ENODEV;
-   }
-
-   if (logical_to_sectors(sdkp->device, zone_blocks) > UINT_MAX) {
+   ret = -ENODEV;
+   } else if (logical_to_sectors(sdkp->device, zone_blocks) > UINT_MAX) {
if (sdkp->first_scan)
sd_printk(KERN_NOTICE, sdkp,
  "Zone size too large\n");
-   return -ENODEV;
+   ret = -ENODEV;
+   } else {
+   sdkp->zone_blocks = zone_blocks;
+   sdkp->zone_shift = ilog2(zone_blocks);
}
 
-   sdkp->zone_blocks = zone_blocks;
+out_free:
+   kfree(buf);
 
-   return 0;
+   return ret;
 }
 
 static int sd_zbc_setup(struct scsi_disk *sdkp)
-- 
2.17.0



[PATCH 2/2] scsi: sd_zbc: Avoid that resetting a zone fails sporadically

2018-05-31 Thread Damien Le Moal
From: Bart Van Assche 

[Backport of upstream commit ccce20fc7968d546fb1e8e147bf5cdc8afc4278a]

Since SCSI scanning occurs asynchronously, since sd_revalidate_disk() is
called from sd_probe_async() and since sd_revalidate_disk() calls
sd_zbc_read_zones() it can happen that sd_zbc_read_zones() is called
concurrently with blkdev_report_zones() and/or blkdev_reset_zones().
That can cause these functions to fail with -EIO because
sd_zbc_read_zones() sets sdkp->nr_zones to zero before restoring it to
the actual value, even if no drive characteristics have changed.
Avoid that this can happen by making the following changes:

- Protect the code that updates zone information with blk_mq_freeze()
  and blk_mq_unfreeze().
- Modify sd_zbc_setup() such that these functions do not modify
  struct scsi_disk before all zone information has been obtained.
- Reallocate the zone write lock bitmap if the number of zones changed.

Note: since commit 055f6e18e08f ("block: Make q_usage_counter also track
legacy requests"; kernel v4.15) the request queue freezing mechanism also
affects legacy request queues.

Fixes: 89d947561077 ("sd: Implement support for ZBC devices")
Signed-off-by: Bart Van Assche 
[Damien]
* Backport for 4.14-stable
* Updated this commit message
Signed-off-by: Damien Le Moal 
Cc: sta...@vger.kernel.org # 4.14
---
 drivers/scsi/sd_zbc.c | 98 +++
 1 file changed, 63 insertions(+), 35 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index bc3cb81a9c7d..ea9e1e0ed5b8 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -423,7 +423,16 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp,
 
 #define SD_ZBC_BUF_SIZE 131072
 
-static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
+/**
+ * sd_zbc_check_zone_size - Check the device zone sizes
+ * @sdkp: Target disk
+ *
+ * Check that all zones of the device are equal. The last zone can however
+ * be smaller. The zone size must also be a power of two number of LBAs.
+ *
+ * Returns the zone size in bytes upon success or an error code upon failure.
+ */
+static s64 sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 {
u64 zone_blocks = 0;
sector_t block = 0;
@@ -434,8 +443,6 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
int ret;
u8 same;
 
-   sdkp->zone_blocks = 0;
-
/* Get a buffer */
buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
if (!buf)
@@ -470,16 +477,17 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 
/* Parse zone descriptors */
while (rec < buf + buf_len) {
-   zone_blocks = get_unaligned_be64(&rec[8]);
-   if (sdkp->zone_blocks == 0) {
-   sdkp->zone_blocks = zone_blocks;
-   } else if (zone_blocks != sdkp->zone_blocks &&
-  (block + zone_blocks < sdkp->capacity
-   || zone_blocks > sdkp->zone_blocks)) {
+   u64 this_zone_blocks = get_unaligned_be64(&rec[8]);
+
+   if (zone_blocks == 0) {
+   zone_blocks = this_zone_blocks;
+   } else if (this_zone_blocks != zone_blocks &&
+  (block + this_zone_blocks < sdkp->capacity
+   || this_zone_blocks > zone_blocks)) {
zone_blocks = 0;
goto out;
}
-   block += zone_blocks;
+   block += this_zone_blocks;
rec += 64;
}
 
@@ -492,8 +500,6 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 
} while (block < sdkp->capacity);
 
-   zone_blocks = sdkp->zone_blocks;
-
 out:
if (!zone_blocks) {
if (sdkp->first_scan)
@@ -513,8 +519,7 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
  "Zone size too large\n");
ret = -ENODEV;
} else {
-   sdkp->zone_blocks = zone_blocks;
-   sdkp->zone_shift = ilog2(zone_blocks);
+   ret = zone_blocks;
}
 
 out_free:
@@ -523,23 +528,44 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
return ret;
 }
 
-static int sd_zbc_setup(struct scsi_disk *sdkp)
+static int sd_zbc_setup(struct scsi_disk *sdkp, u32 zone_blocks)
 {
+   struct request_queue *q = sdkp->disk->queue;
+   u32 zone_shift = ilog2(zone_blocks);
+   u32 nr_zones;
 
/* chunk_sectors indicates the zone size */
-   blk_queue_chunk_sectors(sdkp->disk->queue,
-   logical_to_sectors(sdkp->device, sdkp->zone_blocks));
-   sdkp->zone_shift

[PATCH] sd_zbc: Fix sd_zbc_check_zone_size() error path

2018-05-31 Thread Damien Le Moal
If a drive with variable zone sizes or an invalid last zone size is
detected, the local variable this_zone_blocks is set to 0 and early
return from the function is triggered, but this does not result in an
error return. The local variable zone_blocks must be set to 0 for an
error to be returned.

Fixes: ccce20fc7968 ("scsi: sd_zbc: Avoid that resetting a zone fails 
sporadically")
Signed-off-by: Damien Le Moal 
Cc: Bart Van Assche 
---
 drivers/scsi/sd_zbc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 323e3dc4bc59..850c803a6b3d 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -442,7 +442,7 @@ static s64 sd_zbc_check_zone_size(struct scsi_disk *sdkp)
} else if (this_zone_blocks != zone_blocks &&
   (block + this_zone_blocks < sdkp->capacity
|| this_zone_blocks > zone_blocks)) {
-   this_zone_blocks = 0;
+   zone_blocks = 0;
goto out;
}
block += this_zone_blocks;
-- 
2.17.0



Re: [PATCH 3/3] sd_zbc: Avoid that resetting a zone fails sporadically

2018-04-17 Thread Damien Le Moal
On 2018/04/16 18:04, Bart Van Assche wrote:
> Since SCSI scanning occurs asynchronously, since sd_revalidate_disk()
> is called from sd_probe_async() and since sd_revalidate_disk() calls
> sd_zbc_read_zones() it can happen that sd_zbc_read_zones() is called
> concurrently with blkdev_report_zones() and/or blkdev_reset_zones().
> That can cause these functions to fail with -EIO because
> sd_zbc_read_zones() e.g. sets q->nr_zones to zero before restoring it
> to the actual value, even if no drive characteristics have changed.
> Avoid that this can happen by making the following changes:
> - Protect the code that updates zone information with blk_queue_enter()
>   and blk_queue_exit().
> - Modify sd_zbc_setup_seq_zones_bitmap() and sd_zbc_setup() such that
>   these functions do not modify struct scsi_disk before all zone
>   information has been obtained.
> 
> Note: since commit 055f6e18e08f ("block: Make q_usage_counter also
> track legacy requests"; kernel v4.15) the request queue freezing
> mechanism also affects legacy request queues.
> 
> Fixes: 89d947561077 ("sd: Implement support for ZBC devices")
> Signed-off-by: Bart Van Assche <bart.vanass...@wdc.com>
> Cc: Jens Axboe <ax...@kernel.dk>
> Cc: Damien Le Moal <damien.lem...@wdc.com>
> Cc: Christoph Hellwig <h...@lst.de>
> Cc: Hannes Reinecke <h...@suse.com>
> Cc: sta...@vger.kernel.org # v4.10
> ---
>  drivers/scsi/sd_zbc.c  | 140 
> +
>  include/linux/blkdev.h |   5 ++
>  2 files changed, 87 insertions(+), 58 deletions(-)
> 
> diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
> index 2d0c06f7db3e..323e3dc4bc59 100644
> --- a/drivers/scsi/sd_zbc.c
> +++ b/drivers/scsi/sd_zbc.c
> @@ -390,8 +390,10 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, 
> unsigned char *buf)
>   *
>   * Check that all zones of the device are equal. The last zone can however
>   * be smaller. The zone size must also be a power of two number of LBAs.
> + *
> + * Returns the zone size in bytes upon success or an error code upon failure.
>   */
> -static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
> +static s64 sd_zbc_check_zone_size(struct scsi_disk *sdkp)
>  {
>   u64 zone_blocks = 0;
>   sector_t block = 0;
> @@ -402,8 +404,6 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
>   int ret;
>   u8 same;
>  
> - sdkp->zone_blocks = 0;
> -
>   /* Get a buffer */
>   buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
>   if (!buf)
> @@ -435,16 +435,17 @@ static int sd_zbc_check_zone_size(struct scsi_disk 
> *sdkp)
>  
>   /* Parse zone descriptors */
>   while (rec < buf + buf_len) {
> - zone_blocks = get_unaligned_be64(&rec[8]);
> - if (sdkp->zone_blocks == 0) {
> - sdkp->zone_blocks = zone_blocks;
> - } else if (zone_blocks != sdkp->zone_blocks &&
> -(block + zone_blocks < sdkp->capacity
> - || zone_blocks > sdkp->zone_blocks)) {
> - zone_blocks = 0;
> + u64 this_zone_blocks = get_unaligned_be64(&rec[8]);
> +
> + if (zone_blocks == 0) {
> + zone_blocks = this_zone_blocks;
> + } else if (this_zone_blocks != zone_blocks &&
> +(block + this_zone_blocks < sdkp->capacity
> + || this_zone_blocks > zone_blocks)) {
> + this_zone_blocks = 0;
>   goto out;
>   }
> - block += zone_blocks;
> + block += this_zone_blocks;
>   rec += 64;
>   }
>  
> @@ -457,8 +458,6 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
>  
>   } while (block < sdkp->capacity);
>  
> - zone_blocks = sdkp->zone_blocks;
> -
>  out:
>   if (!zone_blocks) {
>   if (sdkp->first_scan)
> @@ -478,8 +477,7 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
> "Zone size too large\n");
>   ret = -ENODEV;
>   } else {
> - sdkp->zone_blocks = zone_blocks;
> - sdkp->zone_shift = ilog2(zone_blocks);
> + ret = zone_blocks;
>   }
>  
>  out_free:
> @@ -490,15 +488,14 @@ static int sd_zbc_check_zone_size(struct scsi_disk 
> *sdkp)
>  
>  /**
> 

Re: [PATCH 2/3] sd_zbc: Let the SCSI core handle ILLEGAL REQUEST / ASC 0x21

2018-04-17 Thread Damien Le Moal
On 2018/04/16 18:04, Bart Van Assche wrote:
> scsi_io_completion() translates the sense key ILLEGAL REQUEST / ASC
> 0x21 into ACTION_FAIL. That means that setting cmd->allowed to zero
> in sd_zbc_complete() for this sense code / ASC combination is not
> necessary. Hence remove the code that resets cmd->allowed from
> sd_zbc_complete().
> 
> Signed-off-by: Bart Van Assche <bart.vanass...@wdc.com>
> Cc: Damien Le Moal <damien.lem...@wdc.com>
> Cc: Christoph Hellwig <h...@lst.de>
> Cc: Hannes Reinecke <h...@suse.com>
> ---
>  drivers/scsi/sd_zbc.c | 10 --
>  1 file changed, 10 deletions(-)
> 
> diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
> index 41df75eea57b..2d0c06f7db3e 100644
> --- a/drivers/scsi/sd_zbc.c
> +++ b/drivers/scsi/sd_zbc.c
> @@ -299,16 +299,6 @@ void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int 
> good_bytes,
>   case REQ_OP_WRITE:
>   case REQ_OP_WRITE_ZEROES:
>   case REQ_OP_WRITE_SAME:
> -
> - if (result &&
> - sshdr->sense_key == ILLEGAL_REQUEST &&
> - sshdr->asc == 0x21)
> - /*
> -  * INVALID ADDRESS FOR WRITE error: It is unlikely that
> -  * retrying write requests failed with any kind of
> -  * alignement error will result in success. So don't.
> -      */
> -         cmd->allowed = 0;
>   break;
>  
>   case REQ_OP_ZONE_REPORT:
> 

Reviewed-by: Damien Le Moal <damien.lem...@wdc.com>

-- 
Damien Le Moal
Western Digital Research

Re: [PATCH 1/3] sd_zbc: Change the type of the ZBC fields into u32

2018-04-17 Thread Damien Le Moal
On 2018/04/16 18:04, Bart Van Assche wrote:
> This patch does not change any functionality but makes it clear
> that it is on purpose that these fields are 32 bits wide.
> 
> Signed-off-by: Bart Van Assche <bart.vanass...@wdc.com>
> Cc: Damien Le Moal <damien.lem...@wdc.com>
> Cc: Christoph Hellwig <h...@lst.de>
> Cc: Hannes Reinecke <h...@suse.com>
> ---
>  drivers/scsi/sd.h | 12 ++--
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
> index 0d663b5e45bb..392c7d078ae3 100644
> --- a/drivers/scsi/sd.h
> +++ b/drivers/scsi/sd.h
> @@ -74,12 +74,12 @@ struct scsi_disk {
>   struct gendisk  *disk;
>   struct opal_dev *opal_dev;
>  #ifdef CONFIG_BLK_DEV_ZONED
> - unsigned intnr_zones;
> - unsigned intzone_blocks;
> - unsigned intzone_shift;
> - unsigned intzones_optimal_open;
> - unsigned intzones_optimal_nonseq;
> - unsigned intzones_max_open;
> + u32 nr_zones;
> + u32 zone_blocks;
> + u32 zone_shift;
> + u32 zones_optimal_open;
> + u32 zones_optimal_nonseq;
> + u32 zones_max_open;
>  #endif
>       atomic_t openers;
>   sector_t    capacity;   /* size in logical blocks */
> 

Reviewed-by: Damien Le Moal <damien.lem...@wdc.com>

-- 
Damien Le Moal
Western Digital Research

Re: [PATCH v2] Fix DID_OK handling in __scsi_error_from_host_byte()

2018-04-04 Thread Damien Le Moal
On Wed, 2018-04-04 at 10:53 -0700, Bart Van Assche wrote:
> Commit e39a97353e53 modified __scsi_error_from_host_byte() such
> that that function translates DID_OK into BLK_STS_OK. However,
> the description of that commit is wrong: it mentions that commit
> 2a842acab109 introduced a bug in __scsi_error_from_host_byte()
> although that commit did not change the behavior of that function.
> Additionally, commit e39a97353e53 introduced a severe bug: it causes
> commands that fail with hostbyte=DID_OK and driverbyte=DRIVER_SENSE
> to be completed with BLK_STS_OK. Fix __scsi_error_from_host_byte()
> by only translating good status values into BLK_STS_OK.
> 
> Fixes: e39a97353e53 ("scsi: core: return BLK_STS_OK for DID_OK in
> __scsi_error_from_host_byte()")
> Reported-by: Damien Le Moal <damien.lem...@wdc.com>
> Signed-off-by: Bart Van Assche <bart.vanass...@wdc.com>
> Cc: Hannes Reinecke <h...@suse.com>
> Cc: Douglas Gilbert <dgilb...@interlog.com>
> Cc: Damien Le Moal <damien.lem...@wdc.com>
> Cc: Christoph Hellwig <h...@lst.de>
> Cc: sta...@vger.kernel.org
> ---
> 
> Changes compared to v1:
> - Modified __scsi_error_from_host_byte() such that it again returns
>   BLK_STS_OK for CONDITION MET and other result codes that represent
>   success.
> 
>  drivers/scsi/scsi_lib.c | 8 +++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 74a39db57d49..1496b34af409 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -736,7 +736,13 @@ static blk_status_t __scsi_error_from_host_byte(struct
> scsi_cmnd *cmd,
>  {
>   switch (host_byte(result)) {
>   case DID_OK:
> - return BLK_STS_OK;
> + /*
> +  * Also check the other bytes than the status byte in
> result
> +  * to handle the case when a SCSI LLD sets result to
> +  * DRIVER_SENSE << 24 without setting
> SAM_STAT_CHECK_CONDITION.
> +  */
> + return scsi_status_is_good(result) && (result & ~0xff) == 0
> ?
> + BLK_STS_OK : BLK_STS_IOERR;

This fixes the problem on my system.

Tested-by: Damien Le Moal <damien.lem...@wdc.com>

-- 
Damien Le Moal
Western Digital

Re: [PATCH 2/2] sd_zbc: Avoid errors due to sd_zbc_setup() execution

2018-04-04 Thread Damien Le Moal
Bart,

On 4/5/18 00:22, Bart Van Assche wrote:
> On Wed, 2018-04-04 at 17:54 +0900, Damien Le Moal wrote:
>> Since SCSI scanning occurs asynchronously, since sd_revalidate_disk()
>> is called from sd_probe_async() and since sd_revalidate_disk() calls
>> sd_zbc_read_zones() it can happen that sd_zbc_read_zones() is called
>> concurrently with operations referencing a drive zone bitmaps and number
>
> 
> Should "a" be changed into "the"?

Yes.


>> [Damien] Updated commit message and changed nr_zones/bitmap swap order.
> 
> Updating the number of zones after having updated the bitmap pointers is not
> sufficient to avoid trouble if the number of zones as reported by the drive
> changes while I/O is in progress. With the current implementation if the
> number of zones changes the seq_zones_bitmap is cleared. Can this cause
> trouble for the mq-deadline scheduler? Additionally, CPUs other than x86 can
> reorder store operations. Even worse, a CPU could cache the zone bitmap
> pointers which means that at least RCU protection + kfree_rcu() is needed to
> avoid trouble. I think we either should handle this case properly or issue a
> kernel warning.

OK. Let's work on that.


-- 
Damien Le Moal,
Western Digital

Re: [PATCH 1/2] sd_zbc: Avoid errors due to sd_zbc_check_zone_size() execution

2018-04-04 Thread Damien Le Moal
Bart,

On 4/5/18 00:08, Bart Van Assche wrote:
> On 04/04/18 01:54, Damien Le Moal wrote:
>>   static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
>>   {
>> +u64 sdkp_zone_blocks = sdkp->zone_blocks;
> 
> Shouldn't this variable be initialized to zero such that zone size 
> changes are accepted even if the SAME field in the REPORT ZONES response 
> is zero?

sdkp_zone_blocks will be 0 when sd_zbc_check_zone_size() is called for
the first scan of a disk and will hold the current disk value if
sd_zbc_check_zone_size() is called on a revalidate after first scan. If
the initial value is 0, there is no check and the variable is first
initialized. Otherwise, the value is compared to the zone size reported.
In both cases, the zone size change will be caught.

But granted, setting the value initially to 0 is easier to understand.
I will also change:

} else {
sdkp->zone_blocks = zone_blocks;

sdkp->zone_shift = ilog2(zone_blocks);

}

to

} else if (sdkp->zone_blocks != zone_blocks) {

sdkp->zone_blocks = zone_blocks;

sdkp->zone_shift = ilog2(zone_blocks);

}

to make things really clear.

Similarly to a capacity change, It may also be good to add a warning
message. After all, on a drive swap, we can have the case where the
capacity does not change, but the zone size does.

Sending a v2.

Best regards.

-- 
Damien Le Moal,
Western Digital

[PATCH 2/2] sd_zbc: Avoid errors due to sd_zbc_setup() execution

2018-04-04 Thread Damien Le Moal
From: Bart Van Assche <bart.vanass...@wdc.com>

Since SCSI scanning occurs asynchronously, since sd_revalidate_disk()
is called from sd_probe_async() and since sd_revalidate_disk() calls
sd_zbc_read_zones() it can happen that sd_zbc_read_zones() is called
concurrently with operations referencing the drive zone bitmaps and number
of zones. Make sure that this race does not cause failures when
revalidate does not detect any change by making the following changes to
sd_zbc_setup():
- Ensure that sd_zbc_setup_seq_zones_bitmap() does not change any
  ZBC metadata in the request queue.
- Only modify the ZBC information in the request queue that has
  changed. If the number of zones has changed, update q->nr_zones,
  q->seq_zones_wlock and q->seq_zones_bitmap. If the type of some
  zones has changed but not the number of zones, only update the
  zone type information.

Signed-off-by: Bart Van Assche <bart.vanass...@wdc.com>
[Damien] Updated commit message and changed nr_zones/bitmap swap order.
Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Cc: Christoph Hellwig <h...@lst.de>
Cc: Hannes Reinecke <h...@suse.com>
Cc: sta...@vger.kernel.org
---
 drivers/scsi/sd_zbc.c | 45 +++--
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index b59454ed5087..39ddbe92769c 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -551,14 +551,13 @@ static sector_t sd_zbc_get_seq_zones(struct scsi_disk 
*sdkp, unsigned char *buf,
 }
 
 /**
- * sd_zbc_setup_seq_zones_bitmap - Initialize the disk seq zone bitmap.
+ * sd_zbc_setup_seq_zones_bitmap - Initialize a seq zone bitmap.
  * @sdkp: target disk
  *
  * Allocate a zone bitmap and initialize it by identifying sequential zones.
  */
-static int sd_zbc_setup_seq_zones_bitmap(struct scsi_disk *sdkp)
+static unsigned long *sd_zbc_setup_seq_zones_bitmap(struct scsi_disk *sdkp)
 {
-   struct request_queue *q = sdkp->disk->queue;
unsigned long *seq_zones_bitmap;
sector_t lba = 0;
unsigned char *buf;
@@ -566,7 +565,7 @@ static int sd_zbc_setup_seq_zones_bitmap(struct scsi_disk 
*sdkp)
 
seq_zones_bitmap = sd_zbc_alloc_zone_bitmap(sdkp);
if (!seq_zones_bitmap)
-   return -ENOMEM;
+   return ERR_PTR(-ENOMEM);
 
buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
if (!buf)
@@ -589,12 +588,9 @@ static int sd_zbc_setup_seq_zones_bitmap(struct scsi_disk 
*sdkp)
kfree(buf);
if (ret) {
kfree(seq_zones_bitmap);
-   return ret;
+   return ERR_PTR(ret);
}
-
-   q->seq_zones_bitmap = seq_zones_bitmap;
-
-   return 0;
+   return seq_zones_bitmap;
 }
 
 static void sd_zbc_cleanup(struct scsi_disk *sdkp)
@@ -630,24 +626,37 @@ static int sd_zbc_setup(struct scsi_disk *sdkp)
 * of zones changed.
 */
if (sdkp->nr_zones != q->nr_zones) {
+   struct request_queue *q = sdkp->disk->queue;
+   unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
+   size_t zone_bitmap_size;
 
-   sd_zbc_cleanup(sdkp);
-
-   q->nr_zones = sdkp->nr_zones;
if (sdkp->nr_zones) {
-   q->seq_zones_wlock = sd_zbc_alloc_zone_bitmap(sdkp);
-   if (!q->seq_zones_wlock) {
+   seq_zones_wlock = sd_zbc_alloc_zone_bitmap(sdkp);
+   if (!seq_zones_wlock) {
ret = -ENOMEM;
goto err;
}
 
-   ret = sd_zbc_setup_seq_zones_bitmap(sdkp);
-   if (ret) {
-   sd_zbc_cleanup(sdkp);
+   seq_zones_bitmap = sd_zbc_setup_seq_zones_bitmap(sdkp);
+   if (IS_ERR(seq_zones_bitmap)) {
+   ret = PTR_ERR(seq_zones_bitmap);
+   kfree(seq_zones_wlock);
goto err;
}
}
-
+   zone_bitmap_size = BITS_TO_LONGS(sdkp->nr_zones) *
+   sizeof(unsigned long);
+   if (q->nr_zones != sdkp->nr_zones) {
+   swap(q->seq_zones_wlock, seq_zones_wlock);
+   swap(q->seq_zones_bitmap, seq_zones_bitmap);
+   q->nr_zones = sdkp->nr_zones;
+   } else if (memcmp(q->seq_zones_bitmap, seq_zones_bitmap,
+ zone_bitmap_size) != 0) {
+   memcpy(q->seq_zones_bitmap, seq_zones_bitmap,
+  zone_bitmap_size);
+   }
+   kfree(seq_zones_wlock);
+   kfree(seq_zones_bitmap);
}
 
return 0;
-- 
2.14.3



[PATCH 0/2] Fix errors due to revalidation of ZBC disks

2018-04-04 Thread Damien Le Moal
The concurrent submission of commands such as a zone reset with the execution of
sd_zbc_read_zones() from sd_revalidate() context can cause the command
submissions to fail due to possible references to temporarily invalid values
such as the number of zones or the disk zone size.

This two-patch series fixes these problems by avoiding any change to
the disk information unless a change is detected by revalidate.

Bart Van Assche (1):
  sd_zbc: Avoid errors due to sd_zbc_setup() execution

Damien Le Moal (1):
  sd_zbc: Avoid errors due to sd_zbc_check_zone_size() execution

 drivers/scsi/sd_zbc.c | 58 +--
 1 file changed, 33 insertions(+), 25 deletions(-)

-- 
2.14.3



[PATCH 1/2] sd_zbc: Avoid errors due to sd_zbc_check_zone_size() execution

2018-04-04 Thread Damien Le Moal
When sd_revalidate() is executed for a ZBC disk (zoned block device),
sd_zbc_read_zones() is called to revalidate the disk zone configuration.
This executes sd_zbc_check_zone_size() to check that the disk zone sizes
are in line with the defined constraints (all zones must be the same
size and a power of 2 number of LBAs). As part of its execution,
sd_zbc_check_zone_size() was temporarily setting sdkp->zone_blocks to 0.
If during the execution of sd_zbc_check_zone_size() within
sd_revalidate() context, another context issues a command which
references sdkp->zone_blocks to check zone alignment of the command
(e.g. a zone reset is issued and sd_zbc_setup_reset_cmnd() is called),
an invalid value for the disk zone size is used and the alignment check
fails.

Simply fix this by using an on-stack variable inside
sd_zbc_check_zone_size() instead of directly using sdkp->zone_blocks.
This change is valid for both revalidate as well as for the first scan
of the device.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Cc: <sta...@vger.kernel.org>
---
 drivers/scsi/sd_zbc.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 89cf4498f535..b59454ed5087 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -403,6 +403,7 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, 
unsigned char *buf)
  */
 static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 {
+   u64 sdkp_zone_blocks = sdkp->zone_blocks;
u64 zone_blocks = 0;
sector_t block = 0;
unsigned char *buf;
@@ -412,8 +413,6 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
int ret;
u8 same;
 
-   sdkp->zone_blocks = 0;
-
/* Get a buffer */
buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
if (!buf)
@@ -446,11 +445,11 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
/* Parse zone descriptors */
while (rec < buf + buf_len) {
zone_blocks = get_unaligned_be64(&rec[8]);
-   if (sdkp->zone_blocks == 0) {
-   sdkp->zone_blocks = zone_blocks;
-   } else if (zone_blocks != sdkp->zone_blocks &&
+   if (sdkp_zone_blocks == 0) {
+   sdkp_zone_blocks = zone_blocks;
+   } else if (zone_blocks != sdkp_zone_blocks &&
   (block + zone_blocks < sdkp->capacity
-   || zone_blocks > sdkp->zone_blocks)) {
+   || zone_blocks > sdkp_zone_blocks)) {
zone_blocks = 0;
goto out;
}
@@ -467,7 +466,7 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 
} while (block < sdkp->capacity);
 
-   zone_blocks = sdkp->zone_blocks;
+   zone_blocks = sdkp_zone_blocks;
 
 out:
if (!zone_blocks) {
-- 
2.14.3



Re: [PATCH] scsi: Fix failed request error code

2018-04-04 Thread Damien Le Moal


On 4/4/18 16:39, Damien Le Moal wrote:
> Hannes,
> 
> On 4/4/18 16:35, Hannes Reinecke wrote:
>> On Wed, 4 Apr 2018 07:06:58 +0000
>> Damien Le Moal <damien.lem...@wdc.com> wrote:
>>
>>> Hannes,
>>>
>>> On 4/4/18 15:57, Hannes Reinecke wrote:
>>>> On Wed,  4 Apr 2018 15:51:38 +0900
>>>> Damien Le Moal <damien.lem...@wdc.com> wrote:
>>>>   
>>>>> With the introduction of commit e39a97353e53 ("scsi: core: return
>>>>> BLK_STS_OK for DID_OK in __scsi_error_from_host_byte()"), a command
>>>>> that failed with hostbyte=DID_OK and driverbyte=DRIVER_SENSE but
>>>>> lacking additional sense information will have a return code set to
>>>>> BLK_STS_OK. This results in the request issuer to see successful
>>>>> request execution despite the failure. An example of such case is
>>>>> an unaligned write on a host managed ZAC disk connected to a SAS
>>>>> HBA with a malfunctioning SAT. The unaligned write command gets
>>>>> aborted but has no additional sense information.
>>>>>
>>>>> sd 10:0:0:0: [sde] tag#3905 FAILED Result: hostbyte=DID_OK
>>>>> driverbyte=DRIVER_SENSE sd 10:0:0:0: [sde] tag#3905 Sense Key :
>>>>> Aborted Command [current] sd 10:0:0:0: [sde] tag#3905 Add. Sense:
>>>>> No additional sense information sd 10:0:0:0: [sde] tag#3905 CDB:
>>>>> Write(16) 8a 00 00 00 00 00 02 0c 00 01 00 00 00 01 00 00
>>>>> print_req_error: I/O error, dev sde, sector 274726920
>>>>>
>>>>> In scsi_io_completion(), sense key handling to not change the
>>>>> request error code and success being reported to the issuer.
>>>>>
>>>>> Fix this by making sure that the error code always indicates an
>>>>> error if scsi_io_completion() decide that the action to be taken
>>>>> for a failed command is to not retry it and terminate it
>>>>> immediately (ACTION_FAIL) .
>>>>>
>>>>> Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
>>>>> Fixes: e39a97353e53 ("scsi: core: return BLK_STS_OK for DID_OK in
>>>>> __scsi_error_from_host_byte()") Cc: Hannes Reinecke <h...@suse.com>
>>>>> Cc: <sta...@vger.kernel.org>
>>>>> ---
>>>>>  drivers/scsi/scsi_lib.c | 9 +
>>>>>  1 file changed, 9 insertions(+)
>>>>>
>>>>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
>>>>> index c84f931388f2..87579bfcc186 100644
>>>>> --- a/drivers/scsi/scsi_lib.c
>>>>> +++ b/drivers/scsi/scsi_lib.c
>>>>> @@ -1002,6 +1002,15 @@ void scsi_io_completion(struct scsi_cmnd
>>>>> *cmd, unsigned int good_bytes) scsi_print_command(cmd);
>>>>>   }
>>>>>   }
>>>>> + /*
>>>>> +  * The command failed and should not be retried.
>>>>> If the host
>>>>> +  * byte is DID_OK, then
>>>>> __scsi_error_from_host_byte() returned
>>>>> +  * BLK_STS_OK and error indicates a success. Make
>>>>> sure to not
>>>>> +  * use that as the completion code and always
>>>>> return an
>>>>> +  * I/O error.
>>>>> +  */
>>>>> + if (error == BLK_STS_OK)
>>>>> + error = BLK_STS_IOERR;
>>>>>   if (!scsi_end_request(req, error,
>>>>> blk_rq_err_bytes(req), 0)) return;
>>>>>   /*FALLTHRU*/  
>>>>
>>>> That looks wrong.
>>>> Shouldn't __scsi_error_from_host_byte() return the correct status
>>>> here?  
>>>
>>> My drive said:
>>>
>>> sd 10:0:0:0: [sde] tag#3905 FAILED Result: hostbyte=DID_OK
>>> driverbyte=DRIVER_SENSE
>>> sd 10:0:0:0: [sde] tag#3905 Sense Key : Aborted Command [current]
>>> sd 10:0:0:0: [sde] tag#3905 Add. Sense: No additional sense
>>> information sd 10:0:0:0: [sde] tag#3905 CDB: Write(16) 8a 00 00 00 00
>>> 00 02 0c 00 01 00 00 00 01 00 00
>>>
>>> Since hostbyte is DID_OK, __scsi_error_from_host_byte() returns
>>> BLK_STS_OK. The HBA fails to give sense data, so the ABORTED_COMMAND
>>> case in scsi_io_completion() "switch (sshdr.sense_key)" does nothing
>>> and error stays equal to succes

Re: [PATCH] scsi: Fix failed request error code

2018-04-04 Thread Damien Le Moal
Hannes,

On 4/4/18 16:35, Hannes Reinecke wrote:
> On Wed, 4 Apr 2018 07:06:58 +
> Damien Le Moal <damien.lem...@wdc.com> wrote:
> 
>> Hannes,
>>
>> On 4/4/18 15:57, Hannes Reinecke wrote:
>>> On Wed,  4 Apr 2018 15:51:38 +0900
>>> Damien Le Moal <damien.lem...@wdc.com> wrote:
>>>   
>>>> With the introduction of commit e39a97353e53 ("scsi: core: return
>>>> BLK_STS_OK for DID_OK in __scsi_error_from_host_byte()"), a command
>>>> that failed with hostbyte=DID_OK and driverbyte=DRIVER_SENSE but
>>>> lacking additional sense information will have a return code set to
>>>> BLK_STS_OK. This results in the request issuer to see successful
>>>> request execution despite the failure. An example of such case is
>>>> an unaligned write on a host managed ZAC disk connected to a SAS
>>>> HBA with a malfunctioning SAT. The unaligned write command gets
>>>> aborted but has no additional sense information.
>>>>
>>>> sd 10:0:0:0: [sde] tag#3905 FAILED Result: hostbyte=DID_OK
>>>> driverbyte=DRIVER_SENSE sd 10:0:0:0: [sde] tag#3905 Sense Key :
>>>> Aborted Command [current] sd 10:0:0:0: [sde] tag#3905 Add. Sense:
>>>> No additional sense information sd 10:0:0:0: [sde] tag#3905 CDB:
>>>> Write(16) 8a 00 00 00 00 00 02 0c 00 01 00 00 00 01 00 00
>>>> print_req_error: I/O error, dev sde, sector 274726920
>>>>
>>>> In scsi_io_completion(), sense key handling to not change the
>>>> request error code and success being reported to the issuer.
>>>>
>>>> Fix this by making sure that the error code always indicates an
>>>> error if scsi_io_completion() decide that the action to be taken
>>>> for a failed command is to not retry it and terminate it
>>>> immediately (ACTION_FAIL) .
>>>>
>>>> Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
>>>> Fixes: e39a97353e53 ("scsi: core: return BLK_STS_OK for DID_OK in
>>>> __scsi_error_from_host_byte()") Cc: Hannes Reinecke <h...@suse.com>
>>>> Cc: <sta...@vger.kernel.org>
>>>> ---
>>>>  drivers/scsi/scsi_lib.c | 9 +
>>>>  1 file changed, 9 insertions(+)
>>>>
>>>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
>>>> index c84f931388f2..87579bfcc186 100644
>>>> --- a/drivers/scsi/scsi_lib.c
>>>> +++ b/drivers/scsi/scsi_lib.c
>>>> @@ -1002,6 +1002,15 @@ void scsi_io_completion(struct scsi_cmnd
>>>> *cmd, unsigned int good_bytes) scsi_print_command(cmd);
>>>>}
>>>>}
>>>> +  /*
>>>> +   * The command failed and should not be retried.
>>>> If the host
>>>> +   * byte is DID_OK, then
>>>> __scsi_error_from_host_byte() returned
>>>> +   * BLK_STS_OK and error indicates a success. Make
>>>> sure to not
>>>> +   * use that as the completion code and always
>>>> return an
>>>> +   * I/O error.
>>>> +   */
>>>> +  if (error == BLK_STS_OK)
>>>> +  error = BLK_STS_IOERR;
>>>>if (!scsi_end_request(req, error,
>>>> blk_rq_err_bytes(req), 0)) return;
>>>>/*FALLTHRU*/  
>>>
>>> That looks wrong.
>>> Shouldn't __scsi_error_from_host_byte() return the correct status
>>> here?  
>>
>> My drive said:
>>
>> sd 10:0:0:0: [sde] tag#3905 FAILED Result: hostbyte=DID_OK
>> driverbyte=DRIVER_SENSE
>> sd 10:0:0:0: [sde] tag#3905 Sense Key : Aborted Command [current]
>> sd 10:0:0:0: [sde] tag#3905 Add. Sense: No additional sense
>> information sd 10:0:0:0: [sde] tag#3905 CDB: Write(16) 8a 00 00 00 00
>> 00 02 0c 00 01 00 00 00 01 00 00
>>
>> Since hostbyte is DID_OK, __scsi_error_from_host_byte() returns
>> BLK_STS_OK. The HBA fails to give sense data, so the ABORTED_COMMAND
>> case in scsi_io_completion() "switch (sshdr.sense_key)" does nothing
>> and error stays equal to success. scsi_end_request() gets called with
>> that and dd sees a success...
>>
>> There are also plenty of other sense keys cases where error is not
>> changed despite the fact that error can be BLK_STS_SUCCESS (in fact, I
>> think this is likely the most common case since an command failure
>> with hostbyte=DID_OK and driverbyte=DRIVER_SENSE is

Re: [PATCH] scsi: Fix failed request error code

2018-04-04 Thread Damien Le Moal
Hannes,

On 4/4/18 15:57, Hannes Reinecke wrote:
> On Wed,  4 Apr 2018 15:51:38 +0900
> Damien Le Moal <damien.lem...@wdc.com> wrote:
> 
>> With the introduction of commit e39a97353e53 ("scsi: core: return
>> BLK_STS_OK for DID_OK in __scsi_error_from_host_byte()"), a command
>> that failed with hostbyte=DID_OK and driverbyte=DRIVER_SENSE but
>> lacking additional sense information will have a return code set to
>> BLK_STS_OK. This results in the request issuer to see successful
>> request execution despite the failure. An example of such case is an
>> unaligned write on a host managed ZAC disk connected to a SAS HBA
>> with a malfunctioning SAT. The unaligned write command gets aborted
>> but has no additional sense information.
>>
>> sd 10:0:0:0: [sde] tag#3905 FAILED Result: hostbyte=DID_OK
>> driverbyte=DRIVER_SENSE sd 10:0:0:0: [sde] tag#3905 Sense Key :
>> Aborted Command [current] sd 10:0:0:0: [sde] tag#3905 Add. Sense: No
>> additional sense information sd 10:0:0:0: [sde] tag#3905 CDB:
>> Write(16) 8a 00 00 00 00 00 02 0c 00 01 00 00 00 01 00 00
>> print_req_error: I/O error, dev sde, sector 274726920
>>
>> In scsi_io_completion(), sense key handling to not change the request
>> error code and success being reported to the issuer.
>>
>> Fix this by making sure that the error code always indicates an error
>> if scsi_io_completion() decide that the action to be taken for a
>> failed command is to not retry it and terminate it immediately
>> (ACTION_FAIL) .
>>
>> Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
>> Fixes: e39a97353e53 ("scsi: core: return BLK_STS_OK for DID_OK in
>> __scsi_error_from_host_byte()") Cc: Hannes Reinecke <h...@suse.com>
>> Cc: <sta...@vger.kernel.org>
>> ---
>>  drivers/scsi/scsi_lib.c | 9 +
>>  1 file changed, 9 insertions(+)
>>
>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
>> index c84f931388f2..87579bfcc186 100644
>> --- a/drivers/scsi/scsi_lib.c
>> +++ b/drivers/scsi/scsi_lib.c
>> @@ -1002,6 +1002,15 @@ void scsi_io_completion(struct scsi_cmnd *cmd,
>> unsigned int good_bytes) scsi_print_command(cmd);
>>  }
>>  }
>> +/*
>> + * The command failed and should not be retried. If
>> the host
>> + * byte is DID_OK, then
>> __scsi_error_from_host_byte() returned
>> + * BLK_STS_OK and error indicates a success. Make
>> sure to not
>> + * use that as the completion code and always return
>> an
>> + * I/O error.
>> + */
>> +if (error == BLK_STS_OK)
>> +error = BLK_STS_IOERR;
>>  if (!scsi_end_request(req, error,
>> blk_rq_err_bytes(req), 0)) return;
>>  /*FALLTHRU*/
> 
> That looks wrong.
> Shouldn't __scsi_error_from_host_byte() return the correct status here?

My drive said:

sd 10:0:0:0: [sde] tag#3905 FAILED Result: hostbyte=DID_OK
driverbyte=DRIVER_SENSE
sd 10:0:0:0: [sde] tag#3905 Sense Key : Aborted Command [current]
sd 10:0:0:0: [sde] tag#3905 Add. Sense: No additional sense information
sd 10:0:0:0: [sde] tag#3905 CDB: Write(16) 8a 00 00 00 00 00 02 0c 00 01
00 00 00 01 00 00

Since hostbyte is DID_OK, __scsi_error_from_host_byte() returns
BLK_STS_OK. The HBA fails to give sense data, so the ABORTED_COMMAND
case in scsi_io_completion() "switch (sshdr.sense_key)" does nothing and
error stays equal to success. scsi_end_request() gets called with that
and dd sees a success...

There are also plenty of other sense keys cases where error is not
changed despite the fact that error can be BLK_STS_SUCCESS (in fact, I
think this is likely the most common case since a command failure with
hostbyte=DID_OK and driverbyte=DRIVER_SENSE is probably the most common one).

My patch is a bit of a hammer and makes sure that an ACTION_FAIL request
is completed as a failure... Am I getting all this wrong ?

Best.

-- 
Damien Le Moal,
Western Digital

[PATCH] scsi: Fix failed request error code

2018-04-04 Thread Damien Le Moal
With the introduction of commit e39a97353e53 ("scsi: core: return
BLK_STS_OK for DID_OK in __scsi_error_from_host_byte()"), a command that
failed with hostbyte=DID_OK and driverbyte=DRIVER_SENSE but lacking
additional sense information will have a return code set to BLK_STS_OK.
This results in the request issuer seeing successful request execution
despite the failure. An example of such case is an unaligned write on a
host managed ZAC disk connected to a SAS HBA with a malfunctioning SAT.
The unaligned write command gets aborted but has no additional sense
information.

sd 10:0:0:0: [sde] tag#3905 FAILED Result: hostbyte=DID_OK 
driverbyte=DRIVER_SENSE
sd 10:0:0:0: [sde] tag#3905 Sense Key : Aborted Command [current]
sd 10:0:0:0: [sde] tag#3905 Add. Sense: No additional sense information
sd 10:0:0:0: [sde] tag#3905 CDB: Write(16) 8a 00 00 00 00 00 02 0c 00 01 00 00 
00 01 00 00
print_req_error: I/O error, dev sde, sector 274726920

In scsi_io_completion(), sense key handling may leave the request
error code unchanged, resulting in success being reported to the issuer.

Fix this by making sure that the error code always indicates an error
if scsi_io_completion() decides that the action to be taken for a failed
command is to not retry it and terminate it immediately (ACTION_FAIL).

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Fixes: e39a97353e53 ("scsi: core: return BLK_STS_OK for DID_OK in 
__scsi_error_from_host_byte()")
Cc: Hannes Reinecke <h...@suse.com>
Cc: <sta...@vger.kernel.org>
---
 drivers/scsi/scsi_lib.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index c84f931388f2..87579bfcc186 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1002,6 +1002,15 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned 
int good_bytes)
scsi_print_command(cmd);
}
}
+   /*
+* The command failed and should not be retried. If the host
+* byte is DID_OK, then __scsi_error_from_host_byte() returned
+* BLK_STS_OK and error indicates a success. Make sure to not
+* use that as the completion code and always return an
+* I/O error.
+*/
+   if (error == BLK_STS_OK)
+   error = BLK_STS_IOERR;
if (!scsi_end_request(req, error, blk_rq_err_bytes(req), 0))
return;
/*FALLTHRU*/
-- 
2.14.3



Re: [PATCH] Improve ZBC/ZAC error handling

2018-03-06 Thread Damien Le Moal
On 2018/03/07 3:49, Martin K. Petersen wrote:
> 
> Tejun,
> 
>> Except for the nit on the last patch, ata part looks good to me.
>> Martin, how do you wanna route the SCSI part?
> 
> I want to route it to /dev/null on the grounds of being a BLATANT
> LAYERING VIOLATION (cue dramatic sound effect).

Got it... Will add some more rework to v2.

> scsi_error.c is SPC territory, we really shouldn't wedge any ZBC/SBC
> stuff in there. Nor should we call into this file from libata. If
> there's a ZAC/ZBC SAT retry deficiency, let's address that instead of
> working around it.

Understood. sd_zbc already handles the retry checks for scsi side, and almost
exactly the same code is necessary from libata (since retry tests are based on
sense asc/ascq and not on ATA status bits). So is it OK to export a function
from sd_zbc.c to call from libata ? Replicating the code is of course trivial
but rather dirty.

Best regards.

-- 
Damien Le Moal
Western Digital Research

Re: [PATCH] Improve ZBC/ZAC error handling

2018-03-04 Thread Damien Le Moal
Tejun,

On 2018/03/05 5:33, Tejun Heo wrote:
> On Fri, Mar 02, 2018 at 04:40:18AM +0900, Damien Le Moal wrote:
>> This series introduces changes to scsi and libata error handling for ZBC and 
>> ZAC
>> devices.
>>
>> The first patch moves ZBC specific error handling in sd_zbc_complete() to a
>> generic scsi error function that can be used also in libata (second patch). 
>> The
>> goal of this change is to limit retries for commands that are identify as not
>> worth retrying (know failure condition) for both scsi/ZBC and libata/ZAC.
>> Without these two patches, only ZBC behaves nicely (commands that are known 
>> to
>> fail are retried in the ZAC case).
>>
>> The following 2 snmall patches are simple fixes.
>>
>> The last 2 pacthes are improvements in libata error handling verbosity.
>>
>> Damien Le Moal (6):
>>   scsi: Introduce scsi_zbc_noretry_cmd()
>>   libata: Use scsi_zbc_noretry_cmd() for ZAC devices
>>   libata: Fix comment typo in ata_eh_analyze_tf()
>>   libata: Fix ata_err_string()
>>   libata: Honor RQF_QUIET flag
>>   libata: Be quiet when asked to
> 
> Except for the nit on the last patch, ata part looks good to me.
> Martin, how do you wanna route the SCSI part?
> 
> Thanks.

Thanks for the review. I will send a V2 to correct the last patch.
There is also a typo in the commit message of the first patch to correct.

-- 
Damien Le Moal
Western Digital Research

[PATCH] sd_zbc: Fix potential memory leak

2018-03-01 Thread Damien Le Moal
Rework sd_zbc_check_zone_size() to avoid a memory leak due to an early
return if sd_zbc_report_zones() fails.

Reported-by: David.butterfield <david.butterfi...@wdc.com>
Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Cc: sta...@vger.kernel.org
---
 drivers/scsi/sd_zbc.c | 35 +++
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 14174f26af98..ec55a255d1a2 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -366,7 +366,7 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, 
unsigned char *buf)
  */
 static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 {
-   u64 zone_blocks;
+   u64 zone_blocks = 0;
sector_t block = 0;
unsigned char *buf;
unsigned char *rec;
@@ -384,10 +384,8 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 
/* Do a report zone to get the same field */
ret = sd_zbc_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0);
-   if (ret) {
-   zone_blocks = 0;
-   goto out;
-   }
+   if (ret)
+   goto out_free;
 
same = buf[4] & 0x0f;
if (same > 0) {
@@ -427,7 +425,7 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
ret = sd_zbc_report_zones(sdkp, buf,
  SD_ZBC_BUF_SIZE, block);
if (ret)
-   return ret;
+   goto out_free;
}
 
} while (block < sdkp->capacity);
@@ -435,35 +433,32 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
zone_blocks = sdkp->zone_blocks;
 
 out:
-   kfree(buf);
-
if (!zone_blocks) {
if (sdkp->first_scan)
sd_printk(KERN_NOTICE, sdkp,
  "Devices with non constant zone "
  "size are not supported\n");
-   return -ENODEV;
-   }
-
-   if (!is_power_of_2(zone_blocks)) {
+   ret = -ENODEV;
+   } else if (!is_power_of_2(zone_blocks)) {
if (sdkp->first_scan)
sd_printk(KERN_NOTICE, sdkp,
  "Devices with non power of 2 zone "
  "size are not supported\n");
-   return -ENODEV;
-   }
-
-   if (logical_to_sectors(sdkp->device, zone_blocks) > UINT_MAX) {
+   ret = -ENODEV;
+   } else if (logical_to_sectors(sdkp->device, zone_blocks) > UINT_MAX) {
if (sdkp->first_scan)
sd_printk(KERN_NOTICE, sdkp,
  "Zone size too large\n");
-   return -ENODEV;
+   ret = -ENODEV;
+   } else {
+   sdkp->zone_blocks = zone_blocks;
+   sdkp->zone_shift = ilog2(zone_blocks);
}
 
-   sdkp->zone_blocks = zone_blocks;
-   sdkp->zone_shift = ilog2(zone_blocks);
+out_free:
+   kfree(buf);
 
-   return 0;
+   return ret;
 }
 
 /**
-- 
2.14.3



[PATCH 2/6] libata: Use scsi_zbc_noretry_cmd() for ZAC devices

2018-03-01 Thread Damien Le Moal
Improve decisions regarding command retry worthiness by calling
the function scsi_zbc_noretry_cmd() in ata_eh_worth_retry() if the
command target is a ZAC device.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 drivers/ata/libata-eh.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 11c3137d7b0a..504272b18e75 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -2139,6 +2139,10 @@ static unsigned int ata_eh_speed_down(struct ata_device 
*dev,
  */
 static inline int ata_eh_worth_retry(struct ata_queued_cmd *qc)
 {
+   if (qc->dev->flags & ATA_DFLAG_ZAC &&
+   qc->flags & ATA_QCFLAG_SENSE_VALID &&
+   scsi_zbc_noretry_cmd(qc->scsicmd))
+   return 0;   /* retrying will fail again */
if (qc->err_mask & AC_ERR_MEDIA)
return 0;   /* don't retry media errors */
if (qc->flags & ATA_QCFLAG_IO)
-- 
2.14.3



[PATCH 1/6] scsi: Introduce scsi_zbc_noretry_cmd()

2018-03-01 Thread Damien Le Moal
For ZBC/ZAC devices, retrying a command with a condition known to lead
to a failure is useless. One example is an unaligned write past the
write pointer of a sequential zone. Retrying the same command will
result in an error again.

Currently, these known error condition cases are handled in sd_zbc.c
using the sd_zbc_complete() function which is called from sd_done() when
a command completes. However, these known error conditions are not
handled in libata, nor is scsi_noretry_cmd() considering them.

Fix this by introducing the function scsi_zbc_noretry_cmd() and use this
function in scsi_noretry_cmd(). This allows simplifying
sd_zbc_complete() which now only has to deal with report zones command
reply.

scsi_zbc_noretry_cmd() is also exported so that it can be used from
libata.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 drivers/scsi/scsi_error.c | 66 +++
 drivers/scsi/sd.c |  2 +-
 drivers/scsi/sd.h |  8 +++---
 drivers/scsi/sd_zbc.c | 47 -
 include/scsi/scsi_eh.h|  1 +
 5 files changed, 77 insertions(+), 47 deletions(-)

diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index ca53a5f785ee..abb33d250176 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -1671,6 +1671,66 @@ static void scsi_eh_offline_sdevs(struct list_head 
*work_q,
return;
 }
 
+/**
+ * scsi_zbc_noretry_cmd - Determine if ZBC device command can be retried
+ * @scmd:   Failed cmd to check
+ *
+ * Test the error condition of a failed ZBC device command to determine cases
+ * that are known to be not worth retrying.
+ * If the specified command is not intended for a ZBC device, do nothing.
+ */
+bool scsi_zbc_noretry_cmd(struct scsi_cmnd *scmd)
+{
+   struct request *rq = scmd->request;
+   struct scsi_sense_hdr sshdr;
+
+   /*
+* The request queue zone model may not be set when this is called
+* during device probe/revalidation. In that case, just fall back to
+* default behavior and let the caller decide what to do with failures.
+*/
+   if (!blk_queue_is_zoned(rq->q))
+   return false;
+
+   if (!scsi_command_normalize_sense(scmd, ))
+   /* no valid sense data, don't know, so maybe retry */
+   return false;
+
+   if (sshdr.sense_key != ILLEGAL_REQUEST)
+   return false;
+
+   switch (req_op(rq)) {
+   case REQ_OP_ZONE_RESET:
+
+   if (sshdr.asc == 0x24) {
+   /*
+* INVALID FIELD IN CDB error: reset of a conventional
+* zone was attempted. Nothing to worry about, so be
+* quiet about the error.
+*/
+   if (!blk_rq_is_passthrough(rq))
+   rq->rq_flags |= RQF_QUIET;
+   return true;
+   }
+   return false;
+
+   case REQ_OP_WRITE:
+   case REQ_OP_WRITE_ZEROES:
+   case REQ_OP_WRITE_SAME:
+
+   /*
+* INVALID ADDRESS FOR WRITE error: It is unlikely that
+* retrying write requests failed with any kind of
+* alignement error will result in success. So don't.
+*/
+   return sshdr.asc == 0x21;
+
+   default:
+   return false;
+   }
+}
+EXPORT_SYMBOL_GPL(scsi_zbc_noretry_cmd);
+
 /**
  * scsi_noretry_cmd - determine if command should be failed fast
  * @scmd:  SCSI cmd to examine.
@@ -1699,6 +1759,12 @@ int scsi_noretry_cmd(struct scsi_cmnd *scmd)
return 0;
 
 check_type:
+   /*
+* For ZBC, do not retry conditions that will only fail again.
+*/
+   if (scmd->device->type == TYPE_ZBC &&
+   scsi_zbc_noretry_cmd(scmd))
+   return 1;
/*
 * assume caller has checked sense and determined
 * the check condition was retryable.
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index bff21e636ddd..93c6baa7d677 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -2041,7 +2041,7 @@ static int sd_done(struct scsi_cmnd *SCpnt)
 
  out:
if (sd_is_zoned(sdkp))
-   sd_zbc_complete(SCpnt, good_bytes, );
+   sd_zbc_complete(SCpnt, good_bytes);
 
SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt,
   "sd_done: completed %d of %d 
bytes\n",
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 0d663b5e45bb..b777ffecf386 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -284,8 +284,7 @@ extern void sd_zbc_remove(struct scsi_disk *sdkp);
 extern void sd_zbc_print_zones(struct scsi_disk *sdkp);
 extern int sd_zbc_setup_report_cmnd(struct scsi_cmnd *cmd);
 extern int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd);
-extern void sd_zbc_c

[PATCH 4/6] libata: Fix ata_err_string()

2018-03-01 Thread Damien Le Moal
Add proper error string output for ATA_ERR_NCQ and ATA_ERR_NODEV_HINT
instead of returning "unknown error".

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 drivers/ata/libata-eh.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index d6264235c3d7..006ea1507dcf 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -1482,6 +1482,10 @@ static const char *ata_err_string(unsigned int err_mask)
return "invalid argument";
if (err_mask & AC_ERR_DEV)
return "device error";
+   if (err_mask & AC_ERR_NCQ)
+   return "NCQ error";
+   if (err_mask & AC_ERR_NODEV_HINT)
+   return "Polling detection error";
return "unknown error";
 }
 
-- 
2.14.3



[PATCH 3/6] libata: Fix comment typo in ata_eh_analyze_tf()

2018-03-01 Thread Damien Le Moal
Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 drivers/ata/libata-eh.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 504272b18e75..d6264235c3d7 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -1865,10 +1865,10 @@ static unsigned int ata_eh_analyze_tf(struct 
ata_queued_cmd *qc,
if (qc->flags & ATA_QCFLAG_SENSE_VALID) {
int ret = scsi_check_sense(qc->scsicmd);
/*
-* SUCCESS here means that the sense code could
+* SUCCESS here means that the sense code could be
 * evaluated and should be passed to the upper layers
 * for correct evaluation.
-* FAILED means the sense code could not interpreted
+* FAILED means the sense code could not be interpreted
 * and the device would need to be reset.
 * NEEDS_RETRY and ADD_TO_MLQUEUE means that the
 * command would need to be retried.
-- 
2.14.3



[PATCH 6/6] libata: Be quiet when asked to

2018-03-01 Thread Damien Le Moal
For a successful setting of the device transfer speed mode in
ata_dev_set_mode(), do not print the message
"ataX.XX: configured for xxx" if the EH context has the quiet flag set.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 drivers/ata/libata-core.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 3c09122bf038..258afc2e8efd 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -3573,9 +3573,10 @@ static int ata_dev_set_mode(struct ata_device *dev)
DPRINTK("xfer_shift=%u, xfer_mode=0x%x\n",
dev->xfer_shift, (int)dev->xfer_mode);
 
-   ata_dev_info(dev, "configured for %s%s\n",
-ata_mode_string(ata_xfer_mode2mask(dev->xfer_mode)),
-dev_err_whine);
+   if (!(ehc->i.flags & ATA_EHI_QUIET))
+   ata_dev_info(dev, "configured for %s%s\n",
+   ata_mode_string(ata_xfer_mode2mask(dev->xfer_mode)),
+   dev_err_whine);
 
return 0;
 
-- 
2.14.3



[PATCH 5/6] libata: Honor RQF_QUIET flag

2018-03-01 Thread Damien Le Moal
Currently, libata ignores requests RQF_QUIET flag and print error
messages for failed commands, regardless if this flag is set in the
command request. Fix this by introducing the ata_eh_quiet() function and
using this function in ata_eh_link_autopsy() to determine if the EH
context should be quiet. This works by counting the number of failed
commands and the number of commands with the quiet flag set. If both
numbers are equal, the EH context can be set to quiet and all error
messages suppressed. Otherwise, only the error messages for the failed
commands are suppressed and the link Emask and irq_stat messages printed.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 drivers/ata/libata-eh.c   | 26 +-
 drivers/ata/libata-scsi.c |  3 +++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 006ea1507dcf..c9f0c8660a7b 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -2156,6 +2156,21 @@ static inline int ata_eh_worth_retry(struct 
ata_queued_cmd *qc)
return qc->err_mask != AC_ERR_DEV;  /* retry if not dev error */
 }
 
+/**
+ *  ata_eh_quiet - check if we need to be quiet about a command error
+ *  @qc: qc to check
+ *
+ *  Look at the qc flags anbd its scsi command request flags to determine
+ *  if we need to be quiet about the command failure.
+ */
+static inline bool ata_eh_quiet(struct ata_queued_cmd *qc)
+{
+   if (qc->scsicmd &&
+   qc->scsicmd->request->rq_flags & RQF_QUIET)
+   qc->flags |= ATA_QCFLAG_QUIET;
+   return qc->flags & ATA_QCFLAG_QUIET;
+}
+
 /**
  * ata_eh_link_autopsy - analyze error and determine recovery action
  * @link: host link to perform autopsy on
@@ -2173,7 +2188,7 @@ static void ata_eh_link_autopsy(struct ata_link *link)
struct ata_eh_context *ehc = >eh_context;
struct ata_device *dev;
unsigned int all_err_mask = 0, eflags = 0;
-   int tag;
+   int tag, nr_failed = 0, nr_quiet = 0;
u32 serror;
int rc;
 
@@ -2239,8 +2254,17 @@ static void ata_eh_link_autopsy(struct ata_link *link)
if (qc->flags & ATA_QCFLAG_IO)
eflags |= ATA_EFLAG_IS_IO;
trace_ata_eh_link_autopsy_qc(qc);
+
+   /* Count quiet errors */
+   if (ata_eh_quiet(qc))
+   nr_quiet++;
+   nr_failed++;
}
 
+   /* If all failed commands requested silence, then be quiet */
+   if (nr_quiet == nr_failed)
+   ehc->i.flags |= ATA_EHI_QUIET;
+
/* enforce default EH actions */
if (ap->pflags & ATA_PFLAG_FROZEN ||
all_err_mask & (AC_ERR_HSM | AC_ERR_TIMEOUT))
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 66be961c93a4..84b6fc8906a2 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -872,6 +872,9 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct 
ata_device *dev,
 
qc->sg = scsi_sglist(cmd);
qc->n_elem = scsi_sg_count(cmd);
+
+   if (cmd->request->rq_flags & RQF_QUIET)
+   qc->flags |= ATA_QCFLAG_QUIET;
} else {
cmd->result = (DID_OK << 16) | (QUEUE_FULL << 1);
cmd->scsi_done(cmd);
-- 
2.14.3



[PATCH] Improve ZBC/ZAC error handling

2018-03-01 Thread Damien Le Moal
This series introduces changes to scsi and libata error handling for ZBC and ZAC
devices.

The first patch moves ZBC specific error handling in sd_zbc_complete() to a
generic scsi error function that can be used also in libata (second patch). The
goal of this change is to limit retries for commands that are identified as not
worth retrying (known failure conditions) for both scsi/ZBC and libata/ZAC.
Without these two patches, only ZBC behaves nicely (commands that are known to
fail are retried in the ZAC case).

The following 2 small patches are simple fixes.

The last 2 patches are improvements in libata error handling verbosity.

Damien Le Moal (6):
  scsi: Introduce scsi_zbc_noretry_cmd()
  libata: Use scsi_zbc_noretry_cmd() for ZAC devices
  libata: Fix comment typo in ata_eh_analyze_tf()
  libata: Fix ata_err_string()
  libata: Honor RQF_QUIET flag
  libata: Be quiet when asked to

 drivers/ata/libata-core.c |  7 ++---
 drivers/ata/libata-eh.c   | 38 ---
 drivers/ata/libata-scsi.c |  3 +++
 drivers/scsi/scsi_error.c | 66 +++
 drivers/scsi/sd.c |  2 +-
 drivers/scsi/sd.h |  8 +++---
 drivers/scsi/sd_zbc.c | 47 -
 include/scsi/scsi_eh.h|  1 +
 8 files changed, 119 insertions(+), 53 deletions(-)

-- 
2.14.3



Re: [PATCH v2] Avoid that ATA error handling can trigger a kernel hang or oops

2018-02-27 Thread Damien Le Moal
On 2018/02/27 10:53, Bart Van Assche wrote:
> On Thu, 2018-02-22 at 11:30 -0800, Bart Van Assche wrote:
>> Avoid that the recently introduced call_rcu() call in the SCSI core
>> triggers a double call_rcu() call.
>> [ ... ]
> 
> Can anyone review this patch? Multiple users have confirmed independently
> that this patch fixes the double call_rcu() issue for them.
> 
> Thanks,
> 
> Bart.

Please feel free to add:

Reviewed-by: Damien Le Moal <damien.lem...@wdc.com>
Tested-by: Damien Le Moal <damien.lem...@wdc.com>


-- 
Damien Le Moal
Western Digital Research

Re: [PATCH] Avoid that ATA error handling hangs

2018-02-21 Thread Damien Le Moal


On 2/22/18 14:08, Damien Le Moal wrote:
> Bart,
> 
> On 2/22/18 12:53, Bart Van Assche wrote:
>> On Thu, 2018-02-22 at 02:23 +0000, Damien Le Moal wrote:
>>> On Wed, 2018-02-21 at 09:23 -0800, Bart Van Assche wrote:
>>>> [ ... ]
>>> This does not compile.
>>
>> This patch depends on another patch that is not yet in Martin's tree. See 
>> also
>> https://marc.info/?l=linux-scsi=151675130615597. I should have mentioned 
>> this
>> in the patch description.
> 
> OK. Got it.
> 
>>> Testing this, the rcu hang is now gone.
>>
>> Thanks for the testing :-)
>>
>>> However, the behavior of the error recovery  is still different from what I
>>> see in 4.15 and 4.14. For my test case, an unaligned write to a sequential
>>> zone on a ZAC drive connected to an AHCI port, the report zone issued during
>>> the disk revalidation after the write error fails with a timeout, which 
>>> causes
>>> capacity change to 0, port reset and recovery again. Eventually, everything
>>> comes back up OK, but it takes some time.
>>>
>>> I am investigating to make sure I am not hitting a device FW bug to confirm 
>>> if
>>> this is a kernel problem.
>>
>> This patch was tested with the SRP protocol. I'm not an ATA expert but I hope
>> that someone who is more familiar with ATA than I can chime in.
> 
> Well, I would expect the retry of the unaligned write to fail
> immediately while the report zones is still on-going. These are both NCQ
> commands, so that would result in the queue to be aborted and eh to see
> the failed write retry and the aborted report zones, which should be
> restarted right away. But report zones timeout after 30sec...
> 
> Example with CONFIG_DEBUG_OBJECTS_RCU_HEAD turned on (check time stamps):
> 
> [   43.978457] ata6.00: exception Emask 0x0 SAct 0x20 SErr 0x0 action 0x0
> [   43.985239] ata6.00: irq_stat 0x4008
> [   43.989272] ata6.00: failed command: WRITE FPDMA QUEUED
> [   43.994657] ata6.00: cmd 61/08:28:08:00:60/00:00:10:00:00/40 tag 5
> ncq dma 4096 out
> [   43.994657]  res 43/04:08:08:00:60/00:00:10:00:00/00 Emask
> 0x400 (unknown error) 
> [   44.011157] ata6.00: status: { DRDY SENSE ERR }
> [   44.015825] ata6.00: error: { ABRT }
> [   44.268565] ata6.00: configured for UDMA/133
> [   44.273208] ata6: EH complete
> [   44.303440] ata6.00: exception Emask 0x0 SAct 0x10020 SErr 0x0 action 0x0
> [   44.310743] ata6.00: irq_stat 0x4008
> [   44.314989] ata6.00: failed command: WRITE FPDMA QUEUED
> [   44.320634] ata6.00: cmd 61/08:28:08:00:60/00:00:10:00:00/40 tag 5
> ncq dma 4096 out
> [   44.320634]  res 43/04:08:08:00:60/00:00:10:00:00/00 Emask
> 0x400 (unknown error) 
> [   44.337787] ata6.00: status: { DRDY SENSE ERR }
> [   44.342576] ata6.00: error: { ABRT }
> [   44.374416] ata6.00: configured for UDMA/133
> [   44.379094] ata6: EH complete
> [   74.638316] ata6.00: exception Emask 0x1 SAct 0x110 SErr 0x0 action
> 0x6 frozen
> [   74.645999] ata6.00: irq_stat 0x4008
> [   74.650001] ata6.00: failed command: RECEIVE FPDMA QUEUED
> [   74.655544] ata6.00: cmd 65/00:20:00:00:00/01:02:00:00:00/40 tag 4
> ncq dma 131072 in
> [   74.655544]  res 40/00:00:00:00:38/00:00:3f:06:00/40 Emask
> 0x5 (timeout)
> [   74.671001] ata6.00: status: { DRDY }
> [   74.674768] ata6.00: failed command: WRITE FPDMA QUEUED
> [   74.680140] ata6.00: cmd 61/08:40:08:00:60/00:00:10:00:00/40 tag 8
> ncq dma 4096 out
> [   74.680140]  res 40/00:00:08:00:60/00:00:10:00:00/40 Emask
> 0x1 (device error)
> [   74.695966] ata6.00: status: { DRDY }
> [   74.699730] ata6: hard resetting link
> [   75.013606] ata6: SATA link up 6.0 Gbps (SStatus 133 SControl 300)
> [   75.067939] ata6.00: configured for UDMA/133
> [   75.072465] ata6.00: device reported invalid CHS sector 0
> [   75.078167] ata6: EH complete
> [   75.078188] sd 5:0:0:0: [sdd] REPORT ZONES lba 0 failed with 0/8
> [   75.087601] sd 5:0:0:0: [sdd] 0 512-byte logical blocks: (0 B/0 B)
> [   75.093903] sd 5:0:0:0: [sdd] 4096-byte physical blocks
> [   75.099276] sdd: detected capacity change from 14000519643136 to 0
> [  106.189951] ata6.00: exception Emask 0x1 SAct 0x400010 SErr 0x0
> action 0x6 frozen
> [  106.197683] ata6.00: irq_stat 0x4008
> [  106.201730] ata6.00: failed command: WRITE FPDMA QUEUED
> [  106.207096] ata6.00: cmd 61/08:20:08:00:60/00:00:10:00:00/40 tag 4
> ncq dma 4096 out
> [  106.207096]  res 40/00:00:08:00:60/00:00:10:00:00/40 Emask
> 0x1 (device error)
> [  106.222930] ata6.00: status: { DRDY }
> [  106.226690] ata6.00: failed command: RECEIVE FPDMA QUEUED
> [  10

Re: [PATCH] Avoid that ATA error handling hangs

2018-02-21 Thread Damien Le Moal
Bart,

On 2/22/18 12:53, Bart Van Assche wrote:
> On Thu, 2018-02-22 at 02:23 +0000, Damien Le Moal wrote:
>> On Wed, 2018-02-21 at 09:23 -0800, Bart Van Assche wrote:
>>> [ ... ]
>> This does not compile.
> 
> This patch depends on another patch that is not yet in Martin's tree. See also
> https://marc.info/?l=linux-scsi=151675130615597. I should have mentioned 
> this
> in the patch description.

OK. Got it.

>> Testing this, the rcu hang is now gone.
> 
> Thanks for the testing :-)
> 
>> However, the behavior of the error recovery  is still different from what I
>> see in 4.15 and 4.14. For my test case, an unaligned write to a sequential
>> zone on a ZAC drive connected to an AHCI port, the report zone issued during
>> the disk revalidation after the write error fails with a timeout, which 
>> causes
>> capacity change to 0, port reset and recovery again. Eventually, everything
>> comes back up OK, but it takes some time.
>>
>> I am investigating to make sure I am not hitting a device FW bug to confirm 
>> if
>> this is a kernel problem.
> 
> This patch was tested with the SRP protocol. I'm not an ATA expert but I hope
> that someone who is more familiar with ATA than I can chime in.

Well, I would expect the retry of the unaligned write to fail
immediately while the report zones is still on-going. These are both NCQ
commands, so that would result in the queue to be aborted and eh to see
the failed write retry and the aborted report zones, which should be
restarted right away. But report zones timeout after 30sec...

Example with CONFIG_DEBUG_OBJECTS_RCU_HEAD turned on (check time stamps):

[   43.978457] ata6.00: exception Emask 0x0 SAct 0x20 SErr 0x0 action 0x0
[   43.985239] ata6.00: irq_stat 0x4008
[   43.989272] ata6.00: failed command: WRITE FPDMA QUEUED
[   43.994657] ata6.00: cmd 61/08:28:08:00:60/00:00:10:00:00/40 tag 5
ncq dma 4096 out
[   43.994657]  res 43/04:08:08:00:60/00:00:10:00:00/00 Emask
0x400 (unknown error) 
[   44.011157] ata6.00: status: { DRDY SENSE ERR }
[   44.015825] ata6.00: error: { ABRT }
[   44.268565] ata6.00: configured for UDMA/133
[   44.273208] ata6: EH complete
[   44.303440] ata6.00: exception Emask 0x0 SAct 0x10020 SErr 0x0 action 0x0
[   44.310743] ata6.00: irq_stat 0x4008
[   44.314989] ata6.00: failed command: WRITE FPDMA QUEUED
[   44.320634] ata6.00: cmd 61/08:28:08:00:60/00:00:10:00:00/40 tag 5
ncq dma 4096 out
[   44.320634]  res 43/04:08:08:00:60/00:00:10:00:00/00 Emask
0x400 (unknown error) 
[   44.337787] ata6.00: status: { DRDY SENSE ERR }
[   44.342576] ata6.00: error: { ABRT }
[   44.374416] ata6.00: configured for UDMA/133
[   44.379094] ata6: EH complete
[   74.638316] ata6.00: exception Emask 0x1 SAct 0x110 SErr 0x0 action
0x6 frozen
[   74.645999] ata6.00: irq_stat 0x4008
[   74.650001] ata6.00: failed command: RECEIVE FPDMA QUEUED
[   74.655544] ata6.00: cmd 65/00:20:00:00:00/01:02:00:00:00/40 tag 4
ncq dma 131072 in
[   74.655544]  res 40/00:00:00:00:38/00:00:3f:06:00/40 Emask
0x5 (timeout)
[   74.671001] ata6.00: status: { DRDY }
[   74.674768] ata6.00: failed command: WRITE FPDMA QUEUED
[   74.680140] ata6.00: cmd 61/08:40:08:00:60/00:00:10:00:00/40 tag 8
ncq dma 4096 out
[   74.680140]  res 40/00:00:08:00:60/00:00:10:00:00/40 Emask
0x1 (device error)
[   74.695966] ata6.00: status: { DRDY }
[   74.699730] ata6: hard resetting link
[   75.013606] ata6: SATA link up 6.0 Gbps (SStatus 133 SControl 300)
[   75.067939] ata6.00: configured for UDMA/133
[   75.072465] ata6.00: device reported invalid CHS sector 0
[   75.078167] ata6: EH complete
[   75.078188] sd 5:0:0:0: [sdd] REPORT ZONES lba 0 failed with 0/8
[   75.087601] sd 5:0:0:0: [sdd] 0 512-byte logical blocks: (0 B/0 B)
[   75.093903] sd 5:0:0:0: [sdd] 4096-byte physical blocks
[   75.099276] sdd: detected capacity change from 14000519643136 to 0
[  106.189951] ata6.00: exception Emask 0x1 SAct 0x400010 SErr 0x0
action 0x6 frozen
[  106.197683] ata6.00: irq_stat 0x4008
[  106.201730] ata6.00: failed command: WRITE FPDMA QUEUED
[  106.207096] ata6.00: cmd 61/08:20:08:00:60/00:00:10:00:00/40 tag 4
ncq dma 4096 out
[  106.207096]  res 40/00:00:08:00:60/00:00:10:00:00/40 Emask
0x1 (device error)
[  106.222930] ata6.00: status: { DRDY }
[  106.226690] ata6.00: failed command: RECEIVE FPDMA QUEUED
[  106.232235] ata6.00: cmd 65/00:b0:00:00:00/01:02:00:00:00/40 tag 22
ncq dma 131072 in
[  106.232235]  res 40/00:01:00:4f:c2/00:00:00:00:00/00 Emask
0x5 (timeout)
[  106.247834] ata6.00: status: { DRDY }
[  106.251641] ata6: hard resetting link
[  106.565274] ata6: SATA link up 6.0 Gbps (SStatus 133 SControl 300)
[  106.600020] ata6.00: configured for UDMA/133
[  106.604501] ata6: EH complete
[  106.604526] sd 5:0:0:0: [sdd] REPORT ZONES lba 0 failed with 0/8
[  106.607669] print_req_error: I/O error

Re: [PATCH] Avoid that ATA error handling hangs

2018-02-21 Thread Damien Le Moal
Bart,

On 2/22/18 13:39, Bart Van Assche wrote:
> On Thu, 2018-02-22 at 04:39 +, Bart Van Assche wrote:
>> On Thu, 2018-02-22 at 04:19 +0000, Damien Le Moal wrote:
>>> It looks OK to me, at least if CONFIG_DEBUG_OBJECTS_RCU_HEAD is turned
>>> off since the init_rcu_head() and destroy_rcu_head() functions only care
>>> about that.
>>>
>>> With rcu head debug turned on, I am not so sure. The object debug code
>>> will have references to unused rcu heads left behind for unused scsi
>>> cmds, which are indeed dynamically allocated for a device together with
>>> requests when the device is initialized, but they are never freed until
>>> the device is removed. So "dynamically allocated object", yes, but that
>>> does not match the use of the object done in scsi (i.e. alloc before use
>>> + free after use).
>>
>> Hello Damien,
>>
>> Please have a look at the following part of
>> Documentation/RCU/Design/Requirements/Requirements.html:
>>
>>  Similarly, statically allocated non-stack rcu_head
>>  structures must be initialized with init_rcu_head()
>>  and cleaned up with destroy_rcu_head().
> 
> And from <linux/rcupdate.h>:
> 
>  * rcu_head structures
>  * allocated dynamically in the heap or defined statically don't need any
>  * initialization.

Yes, I understood that. But my guess is this comment implies that the
objects are freed after use, which clears any reference to it from the
memory object debug hash automatically. That is not the case with scsi
command structs: there are allocated dynamically with the device, but
they are not freed after use. And here by use, I mean the normal use
cycle of a request+cmd: get unused request -> issue command -> command
completed -> return request in free state.
That is not an alloc+free cycle, so the memory object debug code will
never be involved and the scsi command rcu head never destroyed.

Considering that, I am not sure if it is really safe to remove the
init/destroy rcu head functions. At the very least, that will make the
memory object debug table grow larger with the first use of any scsi
command.

Cheers.

-- 
Damien Le Moal,
Western Digital

Re: [PATCH] Avoid that ATA error handling hangs

2018-02-21 Thread Damien Le Moal
Martin,

On 2/22/18 13:06, Martin K. Petersen wrote:
> 
> Bart,
> 
>> This patch depends on another patch that is not yet in Martin's
>> tree.
> 
> Nobody reviewed it. Same goes for your queuecommand tweak :/
> 
> I'm pretty picky about getting at least one other person than me to look
> over core changes.
> 
> Reviewers: Fame and fortune awaits!

It looks OK to me, at least if CONFIG_DEBUG_OBJECTS_RCU_HEAD is turned
off since the init_rcu_head() and destroy_rcu_head() functions only care
about that.

With rcu head debug turned on, I am not so sure. The object debug code
will have references to unused rcu heads left behind for unused scsi
cmds, which are indeed dynamically allocated for a device together with
requests when the device is initialized, but they are never freed until
the device is removed. So "dynamically allocated object", yes, but that
does not match the use of the object done in scsi (i.e. alloc before use
+ free after use).

Because of this doubt, No reviewed-by from me. I will miss fame and
fortune this time :)

Best regards.

-- 
Damien Le Moal,
Western Digital

Re: [PATCH] Avoid that ATA error handling hangs

2018-02-21 Thread Damien Le Moal
Bart,

On Wed, 2018-02-21 at 09:23 -0800, Bart Van Assche wrote:
> Avoid that the recently introduced call_rcu() call in the SCSI core
> causes the RCU core to complain about double call_rcu() calls.
> 
> Reported-by: Natanael Copa <nc...@alpinelinux.org>
> Reported-by: Damien Le Moal <damien.lem...@wdc.com>
> References: https://bugzilla.kernel.org/show_bug.cgi?id=198861
> Fixes: 3bd6f43f5cb3 ("scsi: core: Ensure that the SCSI error handler gets
> woken up")
> Signed-off-by: Bart Van Assche <bart.vanass...@wdc.com>
> Cc: Natanael Copa <nc...@alpinelinux.org>
> Cc: Damien Le Moal <damien.lem...@wdc.com>
> Cc: Pavel Tikhomirov <ptikhomi...@virtuozzo.com>
> Cc: Hannes Reinecke <h...@suse.com>
> Cc: Johannes Thumshirn <jthumsh...@suse.de>
> Cc: <sta...@vger.kernel.org>
> ---
>  drivers/scsi/scsi_error.c | 5 +++--
>  include/scsi/scsi_cmnd.h  | 3 +++
>  include/scsi/scsi_host.h  | 2 --
>  3 files changed, 6 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
> index ae325985eac1..ac9ce099530e 100644
> --- a/drivers/scsi/scsi_error.c
> +++ b/drivers/scsi/scsi_error.c
> @@ -229,7 +229,8 @@ static void scsi_eh_reset(struct scsi_cmnd *scmd)
>  
>  static void scsi_eh_inc_host_failed(struct rcu_head *head)
>  {
> - struct Scsi_Host *shost = container_of(head, typeof(*shost), rcu);
> + struct scsi_cmnd *scmd = container_of(head, typeof(*scmd), rcu);
> + struct Scsi_Host *shost = scmd->device->host;
>   unsigned long flags;
>  
>   spin_lock_irqsave(shost->host_lock, flags);
> @@ -265,7 +266,7 @@ void scsi_eh_scmd_add(struct scsi_cmnd *scmd)
>* Ensure that all tasks observe the host state change before the
>* host_failed change.
>*/
> - call_rcu(&shost->rcu, scsi_eh_inc_host_failed);
> + call_rcu(&scmd->rcu, scsi_eh_inc_host_failed);
>  }
>  
>  /**
> diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
> index d8d4a902a88d..2280b2351739 100644
> --- a/include/scsi/scsi_cmnd.h
> +++ b/include/scsi/scsi_cmnd.h
> @@ -68,6 +68,9 @@ struct scsi_cmnd {
>   struct list_head list;  /* scsi_cmnd participates in queue lists */
>   struct list_head eh_entry; /* entry for the host eh_cmd_q */
>   struct delayed_work abort_work;
> +
> + struct rcu_head rcu;
> +
>   int eh_eflags;  /* Used by error handlr */
>  
>   /*
> diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
> index 1a1df0d21ee3..a8b7bf879ced 100644
> --- a/include/scsi/scsi_host.h
> +++ b/include/scsi/scsi_host.h
> @@ -571,8 +571,6 @@ struct Scsi_Host {
>   struct blk_mq_tag_set   tag_set;
>   };
>  
> - struct rcu_head rcu;
> -
>   atomic_t host_busy;/* commands actually active
> on low-level */
>   atomic_t host_blocked;

This does not compile. You missed the init_rcu_head() and destroy_rcu_head()
changes. Adding this:

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 57bf43e34863..dd9464920456 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -328,8 +328,6 @@ static void scsi_host_dev_release(struct device *dev)
if (shost->work_q)
destroy_workqueue(shost->work_q);
 
-   destroy_rcu_head(&shost->rcu);
-
if (shost->shost_state == SHOST_CREATED) {
/*
 * Free the shost_dev device name here if scsi_host_alloc()
@@ -404,7 +402,6 @@ struct Scsi_Host *scsi_host_alloc(struct
scsi_host_template *sht, int privsize)
INIT_LIST_HEAD(&shost->starved_list);
init_waitqueue_head(&shost->host_wait);
mutex_init(&shost->scan_mutex);
-   init_rcu_head(&shost->rcu);
 
index = ida_simple_get(&host_index_ida, 0, 0, GFP_KERNEL);
if (index < 0)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index a86df9ca7d1c..488e5c9acedf 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -590,6 +590,8 @@ static void scsi_uninit_cmd(struct scsi_cmnd *cmd)
if (drv->uninit_command)
drv->uninit_command(cmd);
}
+
+   destroy_rcu_head(&cmd->rcu);
 }
 
 static void scsi_mq_free_sgtables(struct scsi_cmnd *cmd)
@@ -1153,6 +1155,7 @@ static void scsi_initialize_rq(struct request *rq)
scsi_req_init(&cmd->req);
cmd->jiffies_at_alloc = jiffies;
cmd->retries = 0;
+   init_rcu_head(&cmd->rcu);
 }
 
 /* Add a command to the list used by the aacraid and dpt_i2o drivers */

And it compiles.

Testing this, the rcu hang is now gone.

However, the behavior of the error recovery  is still different from what I
see in 4.15 and 4.14. For my test case, an unaligned write to a sequential
zone on a 

Re: [PATCH v2] scsi: remove extra white space at the end of the line

2017-12-21 Thread Damien Le Moal
Jason,

On 2017/12/21 20:35, Jason Yan wrote:
> My editor always try to remove the extra white space at the end of the
> line when I make some changes. I'm tired of adjusting them manually.
> Can we remove them in mainline?
> 
> Signed-off-by: Jason Yan <yanai...@huawei.com>
> ---
>  drivers/scsi/3w-9xxx.c|  48 ++---
>  drivers/scsi/3w-sas.c |  42 ++--
>  drivers/scsi/3w-.c| 140 ++--
>  drivers/scsi/3w-.h|  66 +++---
>  drivers/scsi/53c700.c | 104 -
>  drivers/scsi/53c700.h |   8 +-
>  drivers/scsi/FlashPoint.c |  10 +-
>  drivers/scsi/NCR5380.h|  22 +-
>  drivers/scsi/NCR53c406a.c |  26 +--
>  drivers/scsi/NCR_D700.c   |   8 +-
>  drivers/scsi/NCR_Q720.c   |  10 +-
>  drivers/scsi/a100u2w.c|   2 +-
>  drivers/scsi/advansys.c   |   2 +-
>  drivers/scsi/aha1542.c|  14 +-
>  drivers/scsi/aha1740.c|  36 ++--
>  drivers/scsi/atp870u.c| 180 
>  drivers/scsi/atp870u.h|   2 +-
>  drivers/scsi/dc395x.c |  84 
>  drivers/scsi/dmx3191d.c   |   2 +-
>  drivers/scsi/dpt_i2o.c| 242 ++---
>  drivers/scsi/dpti.h   |  14 +-
>  drivers/scsi/eata.c   |   4 +-
>  drivers/scsi/eata_generic.h   |  58 ++---
>  drivers/scsi/eata_pio.c   |  18 +-
>  drivers/scsi/eata_pio.h   |   2 +-
>  drivers/scsi/fdomain.c| 106 -
>  drivers/scsi/gdth.c   | 422 ++--
>  drivers/scsi/gdth.h   |  40 ++--
>  drivers/scsi/gdth_ioctl.h |   8 +-
>  drivers/scsi/gdth_proc.c  |  62 +++---
>  drivers/scsi/gdth_proc.h  |   2 +-
>  drivers/scsi/hosts.c  |   6 +-
>  drivers/scsi/imm.c|  18 +-
>  drivers/scsi/imm.h|   8 +-
>  drivers/scsi/initio.c |   2 +-
>  drivers/scsi/lasi700.c|   4 +-
>  drivers/scsi/mac53c94.c   |   6 +-
>  drivers/scsi/megaraid.c   |  42 ++--
>  drivers/scsi/mesh.c   |  22 +-
>  drivers/scsi/ncr53c8xx.c  | 440 
> +++---
>  drivers/scsi/ncr53c8xx.h  |  78 +++
>  drivers/scsi/nsp32.c  | 116 +-
>  drivers/scsi/nsp32_debug.c|  22 +-
>  drivers/scsi/osst.c   | 168 +++
>  drivers/scsi/osst.h   |   8 +-
>  drivers/scsi/osst_options.h   |   6 +-
>  drivers/scsi/ppa.c|  38 ++--
>  drivers/scsi/ppa.h|  10 +-
>  drivers/scsi/qla1280.c|  22 +-
>  drivers/scsi/qla1280.h|  10 +-
>  drivers/scsi/qlogicfas.c  |  10 +-
>  drivers/scsi/qlogicfas408.c   |  52 ++---
>  drivers/scsi/qlogicpti.c  |   8 +-
>  drivers/scsi/qlogicpti.h  |   2 +-
>  drivers/scsi/raid_class.c |   6 +-
>  drivers/scsi/scsi.c   |   2 +-
>  drivers/scsi/scsi.h   |   2 +-
>  drivers/scsi/scsi_debug.c |   2 +-
>  drivers/scsi/scsi_ioctl.c |  22 +-
>  drivers/scsi/scsi_lib.c   |  20 +-
>  drivers/scsi/scsi_priv.h  |   4 +-
>  drivers/scsi/scsi_proc.c  |  12 +-
>  drivers/scsi/scsi_scan.c  |  10 +-
>  drivers/scsi/scsi_sysfs.c |   4 +-
>  drivers/scsi/scsi_transport_spi.c |  48 ++---
>  drivers/scsi/scsicam.c|   6 +-
>  drivers/scsi/sd.c |  60 +++---
>  drivers/scsi/sg.c |  14 +-
>  drivers/scsi/sgiwd93.c|   2 +-
>  drivers/scsi/sim710.c |   4 +-
>  drivers/scsi/sr.c |   6 +-
>  drivers/scsi/sr.h |   6 +-
>  drivers/scsi/sr_ioctl.c   |   8 +-
>  drivers/scsi/sr_vendor.c  |   8 +-
>  drivers/scsi/st.c |   4 +-
>  drivers/scsi/sun3_scsi.c  |  34 +--
>  drivers/scsi/sym53c416.c  |  20 +-
>  drivers/scsi/sym53c416.h  |   2 +-
>  drivers/scsi/zalon.c  |   6 +-
>  79 files changed, 1597 insertions(+), 1597 deletions(-)

scripts/checkpatch.pl reports 220 instances of white spaces at the *beginning*
of lines, and that is only for what your patches touched and the lines around 
them.

If I run in the drivers/scsi directory the command:

grep -r '^ [[ ]]*' *.[ch] | wc -l

I get 8795 lines. I guess you could make your patch even bigger:)

If we exclude leading spaces, applying the patch and doing a "diff -rb" of the
patched scsi files against the original unpatched files leads to no differences
at all. So it looks good.

Best regards.

-- 
Damien Le Moal
Western Digital Research

[PATCH V9 3/7] mq-deadline: Introduce zone locking support

2017-12-20 Thread Damien Le Moal
Introduce zone write locking to avoid write request reordering with
zoned block devices. This is achieved using a finer selection of the
next request to dispatch:
1) Any non-write request is always allowed to proceed.
2) Any write to a conventional zone is always allowed to proceed.
3) For a write to a sequential zone, the zone lock is first checked.
   a) If the zone is not locked, the write is allowed to proceed after
  its target zone is locked.
   b) If the zone is locked, the write request is skipped and the next
  request in the dispatch queue tested (back to step 1).

For a write request that has locked its target zone, the zone is
unlocked either when the request completes with a call to the method
deadline_request_completed() or when the request is requeued using
dd_insert_request().

Requests targeting a locked zone are always left in the scheduler queue
to preserve the lba ordering for write requests. If no write request
can be dispatched, allow reads to be dispatched even if the write batch
is not done.

If the device used is not a zoned block device, or if zoned block device
support is disabled, this patch does not modify mq-deadline behavior.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
---
 block/mq-deadline.c | 89 +++--
 1 file changed, 86 insertions(+), 3 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 8bd6db9e69c7..d56972e8ebda 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -59,6 +59,7 @@ struct deadline_data {
int front_merges;
 
spinlock_t lock;
+   spinlock_t zone_lock;
struct list_head dispatch;
 };
 
@@ -198,13 +199,33 @@ static inline int deadline_check_fifo(struct 
deadline_data *dd, int ddir)
 static struct request *
 deadline_fifo_request(struct deadline_data *dd, int data_dir)
 {
+   struct request *rq;
+   unsigned long flags;
+
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
 
if (list_empty(&dd->fifo_list[data_dir]))
return NULL;
 
-   return rq_entry_fifo(dd->fifo_list[data_dir].next);
+   rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+   if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+   return rq;
+
+   /*
+* Look for a write request that can be dispatched, that is one with
+* an unlocked target zone.
+*/
+   spin_lock_irqsave(&dd->zone_lock, flags);
+   list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
+   if (blk_req_can_dispatch_to_zone(rq))
+   goto out;
+   }
+   rq = NULL;
+out:
+   spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+   return rq;
 }
 
 /*
@@ -214,10 +235,32 @@ deadline_fifo_request(struct deadline_data *dd, int 
data_dir)
 static struct request *
 deadline_next_request(struct deadline_data *dd, int data_dir)
 {
+   struct request *rq;
+   unsigned long flags;
+
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
 
-   return dd->next_rq[data_dir];
+   rq = dd->next_rq[data_dir];
+   if (!rq)
+   return NULL;
+
+   if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+   return rq;
+
+   /*
+* Look for a write request that can be dispatched, that is one with
+* an unlocked target zone.
+*/
+   spin_lock_irqsave(&dd->zone_lock, flags);
+   while (rq) {
+   if (blk_req_can_dispatch_to_zone(rq))
+   break;
+   rq = deadline_latter_request(rq);
+   }
+   spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+   return rq;
 }
 
 /*
@@ -259,7 +302,8 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
if (reads) {
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
 
-   if (writes && (dd->starved++ >= dd->writes_starved))
+   if (deadline_fifo_request(dd, WRITE) &&
+   (dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
 
data_dir = READ;
@@ -304,6 +348,13 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
rq = next_rq;
}
 
+   /*
+* For a zoned block device, if we only have writes queued and none of
+* them can be dispatched, rq will be NULL.
+*/
+   if (!rq)
+   return NULL;
+
dd->batching = 0;
 
 dispatch_request:
@@ -313,6 +364,10 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
dd->batching++;
deadline_move_request(dd, rq);
 done:
+   /*
+* If the request needs it

[PATCH V9 7/7] sd: Remove zone write locking

2017-12-20 Thread Damien Le Moal
The block layer now handles zone write locking.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
---
 drivers/scsi/sd.c| 41 +++-
 drivers/scsi/sd.h| 11 ---
 drivers/scsi/sd_zbc.c| 83 
 include/scsi/scsi_cmnd.h |  3 +-
 4 files changed, 6 insertions(+), 132 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index a028ab3322a9..f1157b4fe32e 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -851,16 +851,13 @@ static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd 
*cmd)
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
-   int ret;
 
if (!(rq->cmd_flags & REQ_NOUNMAP)) {
switch (sdkp->zeroing_mode) {
case SD_ZERO_WS16_UNMAP:
-   ret = sd_setup_write_same16_cmnd(cmd, true);
-   goto out;
+   return sd_setup_write_same16_cmnd(cmd, true);
case SD_ZERO_WS10_UNMAP:
-   ret = sd_setup_write_same10_cmnd(cmd, true);
-   goto out;
+   return sd_setup_write_same10_cmnd(cmd, true);
}
}
 
@@ -868,15 +865,9 @@ static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd 
*cmd)
return BLKPREP_INVALID;
 
if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff)
-   ret = sd_setup_write_same16_cmnd(cmd, false);
-   else
-   ret = sd_setup_write_same10_cmnd(cmd, false);
-
-out:
-   if (sd_is_zoned(sdkp) && ret == BLKPREP_OK)
-   return sd_zbc_write_lock_zone(cmd);
+   return sd_setup_write_same16_cmnd(cmd, false);
 
-   return ret;
+   return sd_setup_write_same10_cmnd(cmd, false);
 }
 
 static void sd_config_write_same(struct scsi_disk *sdkp)
@@ -964,12 +955,6 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
 
BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
 
-   if (sd_is_zoned(sdkp)) {
-   ret = sd_zbc_write_lock_zone(cmd);
-   if (ret != BLKPREP_OK)
-   return ret;
-   }
-
sector >>= ilog2(sdp->sector_size) - 9;
nr_sectors >>= ilog2(sdp->sector_size) - 9;
 
@@ -1004,9 +989,6 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
ret = scsi_init_io(cmd);
rq->__data_len = nr_bytes;
 
-   if (sd_is_zoned(sdkp) && ret != BLKPREP_OK)
-   sd_zbc_write_unlock_zone(cmd);
-
return ret;
 }
 
@@ -1036,19 +1018,12 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd 
*SCpnt)
sector_t threshold;
unsigned int this_count = blk_rq_sectors(rq);
unsigned int dif, dix;
-   bool zoned_write = sd_is_zoned(sdkp) && rq_data_dir(rq) == WRITE;
int ret;
unsigned char protect;
 
-   if (zoned_write) {
-   ret = sd_zbc_write_lock_zone(SCpnt);
-   if (ret != BLKPREP_OK)
-   return ret;
-   }
-
ret = scsi_init_io(SCpnt);
if (ret != BLKPREP_OK)
-   goto out;
+   return ret;
WARN_ON_ONCE(SCpnt != rq->special);
 
/* from here on until we're complete, any goto out
@@ -1267,9 +1242,6 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd 
*SCpnt)
 */
ret = BLKPREP_OK;
  out:
-   if (zoned_write && ret != BLKPREP_OK)
-   sd_zbc_write_unlock_zone(SCpnt);
-
return ret;
 }
 
@@ -1314,9 +1286,6 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt)
struct request *rq = SCpnt->request;
u8 *cmnd;
 
-   if (SCpnt->flags & SCMD_ZONE_WRITE_LOCK)
-   sd_zbc_write_unlock_zone(SCpnt);
-
if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
__free_page(rq->special_vec.bv_page);
 
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 320de758323e..0d663b5e45bb 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -77,7 +77,6 @@ struct scsi_disk {
unsigned intnr_zones;
unsigned intzone_blocks;
unsigned intzone_shift;
-   unsigned long   *zones_wlock;
unsigned intzones_optimal_open;
unsigned intzones_optimal_nonseq;
unsigned intzones_max_open;
@@ -283,8 +282,6 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp)
 extern int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer);
 extern void sd_zbc_remove(struct scsi_disk *sdkp);
 extern void sd_zbc_print_zones(struct scsi_disk *sd

[PATCH V9 5/7] deadline-iosched: Introduce zone locking support

2017-12-20 Thread Damien Le Moal
Introduce zone write locking to avoid write request reordering with
zoned block devices. This is achieved using a finer selection of the
next request to dispatch:
1) Any non-write request is always allowed to proceed.
2) Any write to a conventional zone is always allowed to proceed.
3) For a write to a sequential zone, the zone lock is first checked.
   a) If the zone is not locked, the write is allowed to proceed after
  its target zone is locked.
   b) If the zone is locked, the write request is skipped and the next
  request in the dispatch queue tested (back to step 1).

For a write request that has locked its target zone, the zone is
unlocked either when the request completes and the method
deadline_request_completed() is called, or when the request is requeued
using the method deadline_add_request().

Requests targeting a locked zone are always left in the scheduler queue
to preserve the initial write order. If no write request can be
dispatched, allow reads to be dispatched even if the write batch is not
done.

If the device used is not a zoned block device, or if zoned block device
support is disabled, this patch does not modify deadline behavior.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
---
 block/deadline-iosched.c | 71 ++--
 1 file changed, 68 insertions(+), 3 deletions(-)

diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 81e3f0897457..9de9f156e203 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -98,6 +98,12 @@ deadline_add_request(struct request_queue *q, struct request 
*rq)
struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq);
 
+   /*
+* This may be a requeue of a write request that has locked its
+* target zone. If it is the case, this releases the zone lock.
+*/
+   blk_req_zone_write_unlock(rq);
+
deadline_add_rq_rb(dd, rq);
 
/*
@@ -188,6 +194,12 @@ deadline_move_to_dispatch(struct deadline_data *dd, struct 
request *rq)
 {
struct request_queue *q = rq->q;
 
+   /*
+* For a zoned block device, write requests must write lock their
+* target zone.
+*/
+   blk_req_zone_write_lock(rq);
+
deadline_remove_request(q, rq);
elv_dispatch_add_tail(q, rq);
 }
@@ -235,13 +247,28 @@ static inline int deadline_check_fifo(struct 
deadline_data *dd, int ddir)
 static struct request *
 deadline_fifo_request(struct deadline_data *dd, int data_dir)
 {
+   struct request *rq;
+
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
 
if (list_empty(&dd->fifo_list[data_dir]))
return NULL;
 
-   return rq_entry_fifo(dd->fifo_list[data_dir].next);
+   rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+   if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+   return rq;
+
+   /*
+* Look for a write request that can be dispatched, that is one with
+* an unlocked target zone.
+*/
+   list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
+   if (blk_req_can_dispatch_to_zone(rq))
+   return rq;
+   }
+
+   return NULL;
 }
 
 /*
@@ -251,10 +278,29 @@ deadline_fifo_request(struct deadline_data *dd, int 
data_dir)
 static struct request *
 deadline_next_request(struct deadline_data *dd, int data_dir)
 {
+   struct request *rq;
+
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
 
-   return dd->next_rq[data_dir];
+   rq = dd->next_rq[data_dir];
+   if (!rq)
+   return NULL;
+
+   if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+   return rq;
+
+   /*
+* Look for a write request that can be dispatched, that is one with
+* an unlocked target zone.
+*/
+   while (rq) {
+   if (blk_req_can_dispatch_to_zone(rq))
+   return rq;
+   rq = deadline_latter_request(rq);
+   }
+
+   return NULL;
 }
 
 /*
@@ -288,7 +334,8 @@ static int deadline_dispatch_requests(struct request_queue 
*q, int force)
if (reads) {
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
 
-   if (writes && (dd->starved++ >= dd->writes_starved))
+   if (deadline_fifo_request(dd, WRITE) &&
+   (dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
 
data_dir = READ;
@@ -333,6 +380,13 @@ static int deadline_dispatch_requests(struct request_queue 
*q, int force)
rq = next_rq;
}
 
+   /*
+* For a zoned bl

[PATCH V9 6/7] sd_zbc: Initialize device request queue zoned data

2017-12-20 Thread Damien Le Moal
Initialize the seq_zones_bitmap, seq_zones_wlock and nr_zones fields of
the disk request queue on disk revalidate. As the seq_zones_bitmap
and seq_zones_wlock allocations are identical, introduce the helper
sd_zbc_alloc_zone_bitmap(). Using this helper, reallocate the bitmaps
whenever the disk capacity (number of zones) changes.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 drivers/scsi/sd_zbc.c | 152 +++---
 1 file changed, 144 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 27793b9f54c0..c715b8363ce0 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -586,8 +586,123 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
return 0;
 }
 
+/**
+ * sd_zbc_alloc_zone_bitmap - Allocate a zone bitmap (one bit per zone).
+ * @sdkp: The disk of the bitmap
+ */
+static inline unsigned long *sd_zbc_alloc_zone_bitmap(struct scsi_disk *sdkp)
+{
+   struct request_queue *q = sdkp->disk->queue;
+
+   return kzalloc_node(BITS_TO_LONGS(sdkp->nr_zones)
+   * sizeof(unsigned long),
+   GFP_KERNEL, q->node);
+}
+
+/**
+ * sd_zbc_get_seq_zones - Parse report zones reply to identify sequential zones
+ * @sdkp: disk used
+ * @buf: report reply buffer
+ * @seq_zone_bitamp: bitmap of sequential zones to set
+ *
+ * Parse reported zone descriptors in @buf to identify sequential zones and
+ * set the reported zone bit in @seq_zones_bitmap accordingly.
+ * Since read-only and offline zones cannot be written, do not
+ * mark them as sequential in the bitmap.
+ * Return the LBA after the last zone reported.
+ */
+static sector_t sd_zbc_get_seq_zones(struct scsi_disk *sdkp, unsigned char 
*buf,
+unsigned int buflen,
+unsigned long *seq_zones_bitmap)
+{
+   sector_t lba, next_lba = sdkp->capacity;
+   unsigned int buf_len, list_length;
+   unsigned char *rec;
+   u8 type, cond;
+
+   list_length = get_unaligned_be32(&buf[0]) + 64;
+   buf_len = min(list_length, buflen);
+   rec = buf + 64;
+
+   while (rec < buf + buf_len) {
+   type = rec[0] & 0x0f;
+   cond = (rec[1] >> 4) & 0xf;
+   lba = get_unaligned_be64(&rec[16]);
+   if (type != ZBC_ZONE_TYPE_CONV &&
+   cond != ZBC_ZONE_COND_READONLY &&
+   cond != ZBC_ZONE_COND_OFFLINE)
+   set_bit(lba >> sdkp->zone_shift, seq_zones_bitmap);
+   next_lba = lba + get_unaligned_be64(&rec[8]);
+   rec += 64;
+   }
+
+   return next_lba;
+}
+
+/**
+ * sd_zbc_setup_seq_zones_bitmap - Initialize the disk seq zone bitmap.
+ * @sdkp: target disk
+ *
+ * Allocate a zone bitmap and initialize it by identifying sequential zones.
+ */
+static int sd_zbc_setup_seq_zones_bitmap(struct scsi_disk *sdkp)
+{
+   struct request_queue *q = sdkp->disk->queue;
+   unsigned long *seq_zones_bitmap;
+   sector_t lba = 0;
+   unsigned char *buf;
+   int ret = -ENOMEM;
+
+   seq_zones_bitmap = sd_zbc_alloc_zone_bitmap(sdkp);
+   if (!seq_zones_bitmap)
+   return -ENOMEM;
+
+   buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
+   if (!buf)
+   goto out;
+
+   while (lba < sdkp->capacity) {
+   ret = sd_zbc_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, lba);
+   if (ret)
+   goto out;
+   lba = sd_zbc_get_seq_zones(sdkp, buf, SD_ZBC_BUF_SIZE,
+  seq_zones_bitmap);
+   }
+
+   if (lba != sdkp->capacity) {
+   /* Something went wrong */
+   ret = -EIO;
+   }
+
+out:
+   kfree(buf);
+   if (ret) {
+   kfree(seq_zones_bitmap);
+   return ret;
+   }
+
+   q->seq_zones_bitmap = seq_zones_bitmap;
+
+   return 0;
+}
+
+static void sd_zbc_cleanup(struct scsi_disk *sdkp)
+{
+   struct request_queue *q = sdkp->disk->queue;
+
+   kfree(q->seq_zones_bitmap);
+   q->seq_zones_bitmap = NULL;
+
+   kfree(q->seq_zones_wlock);
+   q->seq_zones_wlock = NULL;
+
+   q->nr_zones = 0;
+}
+
 static int sd_zbc_setup(struct scsi_disk *sdkp)
 {
+   struct request_queue *q = sdkp->disk->queue;
+   int ret;
 
/* READ16/WRITE16 is mandatory for ZBC disks */
sdkp->device->use_16_for_rw = 1;
@@ -599,15 +714,36 @@ static int sd_zbc_setup(struct scsi_disk *sdkp)
sdkp->nr_zones =
round_up(sdkp->capacity, sdkp->zone_blocks) >> sdkp->zone_shift;
 
-   if (!sdkp->zones_wlock) {
-   sdkp->zones_wlock = kcalloc(BITS_TO_LONGS(sdkp->nr_zones),
-   sizeof(unsigned l

[PATCH V9 2/7] mq-deadline: Introduce dispatch helpers

2017-12-20 Thread Damien Le Moal
Avoid directly referencing the next_rq and fifo_list arrays using the
helper functions deadline_next_request() and deadline_fifo_request() to
facilitate changes in the dispatch request selection in
__dd_dispatch_request() for zoned block devices.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Bart Van Assche <bart.vanass...@wdc.com>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
---
 block/mq-deadline.c | 45 +
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 0179e484ec98..8bd6db9e69c7 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -191,6 +191,35 @@ static inline int deadline_check_fifo(struct deadline_data 
*dd, int ddir)
return 0;
 }
 
+/*
+ * For the specified data direction, return the next request to
+ * dispatch using arrival ordered lists.
+ */
+static struct request *
+deadline_fifo_request(struct deadline_data *dd, int data_dir)
+{
+   if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+   return NULL;
+
+   if (list_empty(&dd->fifo_list[data_dir]))
+   return NULL;
+
+   return rq_entry_fifo(dd->fifo_list[data_dir].next);
+}
+
+/*
+ * For the specified data direction, return the next request to
+ * dispatch using sector position sorted lists.
+ */
+static struct request *
+deadline_next_request(struct deadline_data *dd, int data_dir)
+{
+   if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+   return NULL;
+
+   return dd->next_rq[data_dir];
+}
+
 /*
  * deadline_dispatch_requests selects the best request according to
  * read/write expire, fifo_batch, etc
@@ -198,7 +227,7 @@ static inline int deadline_check_fifo(struct deadline_data 
*dd, int ddir)
 static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
-   struct request *rq;
+   struct request *rq, *next_rq;
bool reads, writes;
int data_dir;
 
@@ -214,10 +243,9 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
/*
 * batches are currently reads XOR writes
 */
-   if (dd->next_rq[WRITE])
-   rq = dd->next_rq[WRITE];
-   else
-   rq = dd->next_rq[READ];
+   rq = deadline_next_request(dd, WRITE);
+   if (!rq)
+   rq = deadline_next_request(dd, READ);
 
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request are still entitled to batch */
@@ -260,19 +288,20 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
/*
 * we are not running a batch, find best request for selected data_dir
 */
-   if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+   next_rq = deadline_next_request(dd, data_dir);
+   if (deadline_check_fifo(dd, data_dir) || !next_rq) {
/*
 * A deadline has expired, the last request was in the other
 * direction, or we have run out of higher-sectored requests.
 * Start again from the request with the earliest expiry time.
 */
-   rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+   rq = deadline_fifo_request(dd, data_dir);
} else {
/*
 * The last req was the same dir and we have a next request in
 * sort order. No expired requests so continue on from here.
 */
-   rq = dd->next_rq[data_dir];
+   rq = next_rq;
}
 
dd->batching = 0;
-- 
2.14.3



[PATCH V9 4/7] deadline-iosched: Introduce dispatch helpers

2017-12-20 Thread Damien Le Moal
Avoid directly referencing the next_rq and fifo_list arrays using the
helper functions deadline_next_request() and deadline_fifo_request() to
facilitate changes in the dispatch request selection in
deadline_dispatch_requests() for zoned block devices.

While at it, also remove the unnecessary forward declaration of the
function deadline_move_request().

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
---
 block/deadline-iosched.c | 47 +--
 1 file changed, 37 insertions(+), 10 deletions(-)

diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index b83f77460d28..81e3f0897457 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -50,8 +50,6 @@ struct deadline_data {
int front_merges;
 };
 
-static void deadline_move_request(struct deadline_data *, struct request *);
-
 static inline struct rb_root *
 deadline_rb_root(struct deadline_data *dd, struct request *rq)
 {
@@ -230,6 +228,35 @@ static inline int deadline_check_fifo(struct deadline_data 
*dd, int ddir)
return 0;
 }
 
+/*
+ * For the specified data direction, return the next request to dispatch using
+ * arrival ordered lists.
+ */
+static struct request *
+deadline_fifo_request(struct deadline_data *dd, int data_dir)
+{
+   if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+   return NULL;
+
+   if (list_empty(&dd->fifo_list[data_dir]))
+   return NULL;
+
+   return rq_entry_fifo(dd->fifo_list[data_dir].next);
+}
+
+/*
+ * For the specified data direction, return the next request to dispatch using
+ * sector position sorted lists.
+ */
+static struct request *
+deadline_next_request(struct deadline_data *dd, int data_dir)
+{
+   if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+   return NULL;
+
+   return dd->next_rq[data_dir];
+}
+
 /*
  * deadline_dispatch_requests selects the best request according to
  * read/write expire, fifo_batch, etc
@@ -239,16 +266,15 @@ static int deadline_dispatch_requests(struct 
request_queue *q, int force)
struct deadline_data *dd = q->elevator->elevator_data;
const int reads = !list_empty(&dd->fifo_list[READ]);
const int writes = !list_empty(&dd->fifo_list[WRITE]);
-   struct request *rq;
+   struct request *rq, *next_rq;
int data_dir;
 
/*
 * batches are currently reads XOR writes
 */
-   if (dd->next_rq[WRITE])
-   rq = dd->next_rq[WRITE];
-   else
-   rq = dd->next_rq[READ];
+   rq = deadline_next_request(dd, WRITE);
+   if (!rq)
+   rq = deadline_next_request(dd, READ);
 
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request are still entitled to batch */
@@ -291,19 +317,20 @@ static int deadline_dispatch_requests(struct 
request_queue *q, int force)
/*
 * we are not running a batch, find best request for selected data_dir
 */
-   if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+   next_rq = deadline_next_request(dd, data_dir);
+   if (deadline_check_fifo(dd, data_dir) || !next_rq) {
/*
 * A deadline has expired, the last request was in the other
 * direction, or we have run out of higher-sectored requests.
 * Start again from the request with the earliest expiry time.
 */
-   rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+   rq = deadline_fifo_request(dd, data_dir);
} else {
/*
 * The last req was the same dir and we have a next request in
 * sort order. No expired requests so continue on from here.
 */
-   rq = dd->next_rq[data_dir];
+   rq = next_rq;
}
 
dd->batching = 0;
-- 
2.14.3



[PATCH V9 0/7] blk-mq support for ZBC disks

2017-12-20 Thread Damien Le Moal
This series, formerly titled "scsi-mq support for ZBC disks", implements
support for ZBC disks for system using the scsi-mq I/O path.

The current scsi level support of ZBC disks guarantees write request ordering
using a per-zone write lock which prevents issuing simultaneously multiple
write commands to a zone, doing so avoids reordering of sequential writes to
sequential zones. This method is however ineffective when scsi-mq is used with
zoned block devices. This is due to the different execution model of blk-mq
which passes a request to the scsi layer for dispatching after the request has
been removed from the I/O scheduler queue. That is, when the scsi layer tries
to lock the target zone of the request, the request may already be out of
order and zone write locking fails to prevent that.

Various approaches have been tried to solve this problem directly from the core
code of blk-mq. All of them had the serious disadvantage of cluttering blk-mq
code with zoned block device specific conditions and processing, making
maintenance and testing difficult.

This series adds blk-mq support for zoned block devices at the I/O scheduler
level with simple modifications of the mq-deadline scheduler. Implementation
is done with reusable helpers defined in the zoned block device support file
(blk-zoned.c). These helpers provide per zone write locking control functions
similar to what was implemented directly in the SCSI layer in sd_zbc.c.
The zone write locking mechanism is used by mq-deadline for the exact same
purpose, that is, to limit writes per zone to at most one request to avoid
reordering.

The changes to mq-deadline do not affect its operation with regular disks. The
same scheduling behavior is maintained for these devices. Compared to the SCSI
layer zone locking implementation, this series avoids locking
conventional zones which results in a use of these zones that is comparable to a
regular disk.

This series also implements changes to the legacy deadline-iosched. Doing so,
the zone locking code at the SCSI layer in sd.c and sd_zbc.c can be removed.
This results in a significant simplification of the sd driver command handling.

Patch 1 to 5 introduce the zone locking helpers in the block layer and modify
the deadline and mq-deadline schedulers.
Patch 6 and 7 remove the SCSI layer zone locking and initialize the device
request queue zone information.

All patches apply without conflicts to the scsi tree branch 4.16/scsi-queue, to
the block tree branch for-linus as well as to the current 4.15-rc4 tree.

Of note is that this series imposes the use of the deadline and mq-deadline
schedulers with zoned block devices. A system can trivially enforce this using
a udev rule such as:

ACTION=="add|change", KERNEL=="sd[a-z]", ATTRS{queue/zoned}=="host-managed", \
ATTR{queue/scheduler}="deadline"

This rule applies equally for the legacy SCSI path as well as the scsi-mq path
thanks to "mq-deadline" being aliased to "deadline".

Comments are as always very much appreciated.

Changes from v8:
* Rebased on 4.15-rc4

Changes from v7:
* Merged patch 8 into patch 6
* Fixed various typos and commit messages

Changes from v6:
* Implement zone write locking helpers in the block layer
* Also modify legacy path deadline scheduler to remove all zone write locking
  code from the scsi layer

Changes from v5:
* Refactor patches to introduce the zone_lock spinlock only when needed
* Addressed Bart's comments (in particular explanations of the zone_lock
  spinlock use)

Changes from v4:
* Various fixes and improvements (From Christoph's comments)
* Dropped zones_wlock scheduler tunable attribute

Changes from v3:
* Integrated support directly into mq-deadline instead of creating a new I/O
  scheduler.
* Disable setting of default mq scheduler for single queue devices

Changes from v2:
* Introduced blk_zoned structure
* Moved I/O scheduler from drivers/scsi to block

Changes from v1:
* Addressed Bart's comments for the blk-mq patches (declarations files)
* Split (former) patch 4 into multiple patches to facilitate review
* Fixed scsi disk lookup from io scheduler by introducing
  scsi_disk_from_queue()

Christoph Hellwig (1):
  block: introduce zoned block devices zone write locking

Damien Le Moal (6):
  mq-deadline: Introduce dispatch helpers
  mq-deadline: Introduce zone locking support
  deadline-iosched: Introduce dispatch helpers
  deadline-iosched: Introduce zone locking support
  sd_zbc: Initialize device request queue zoned data
  sd: Remove zone write locking

 block/blk-core.c |   1 +
 block/blk-zoned.c|  42 +
 block/deadline-iosched.c | 114 ---
 block/mq-deadline.c  | 130 --
 drivers/scsi/sd.c|  41 +
 drivers/scsi/sd.h|  11 ---
 drivers/scsi/sd_zbc.c| 235 +--
 include/linux/blkdev.h   | 111 +

[PATCH V9 1/7] block: introduce zoned block devices zone write locking

2017-12-20 Thread Damien Le Moal
From: Christoph Hellwig <h...@lst.de>

Components relying only on the request_queue structure for accessing
block devices (e.g. I/O schedulers) have a limited knowledge of the
device characteristics. In particular, the device capacity cannot be
easily discovered, which for a zoned block device also results in the
inability to easily know the number of zones of the device (the zone
size is indicated by the chunk_sectors field of the queue limits).

Introduce the nr_zones field to the request_queue structure to simplify
access to this information. Also, add the bitmap seq_zones_bitmap which
indicates which zones of the device are sequential zones (write
preferred or write required) and the bitmap seq_zones_wlock which
indicates if a zone is write locked, that is, if a write request
targeting a zone was dispatched to the device. These fields are
initialized by the low level block device driver (sd.c for ZBC/ZAC
disks). They are not initialized by stacking drivers (device mappers)
handling zoned block devices (e.g. dm-linear).

Using this, I/O schedulers can introduce zone write locking to control
request dispatching to a zoned block device and avoid write request
reordering by limiting to at most a single write request per zone
outside of the scheduler at any time.

Based on previous patches from Damien Le Moal.

Signed-off-by: Christoph Hellwig <h...@lst.de>
[Damien]
* Fixed comments and identation in blkdev.h
* Changed helper functions
* Fixed this commit message
Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
---
 block/blk-core.c   |   1 +
 block/blk-zoned.c  |  42 +++
 include/linux/blkdev.h | 111 +
 3 files changed, 154 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index b8881750a3ac..e6e5bbc4c366 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1641,6 +1641,7 @@ void __blk_put_request(struct request_queue *q, struct 
request *req)
 
lockdep_assert_held(q->queue_lock);
 
+   blk_req_zone_write_unlock(req);
blk_pm_put_request(req);
 
elv_completed_request(q, req);
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ff57fb51b338..acb7252c7e81 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -21,6 +21,48 @@ static inline sector_t blk_zone_start(struct request_queue 
*q,
return sector & ~zone_mask;
 }
 
+/*
+ * Return true if a request is a write requests that needs zone write locking.
+ */
+bool blk_req_needs_zone_write_lock(struct request *rq)
+{
+   if (!rq->q->seq_zones_wlock)
+   return false;
+
+   if (blk_rq_is_passthrough(rq))
+   return false;
+
+   switch (req_op(rq)) {
+   case REQ_OP_WRITE_ZEROES:
+   case REQ_OP_WRITE_SAME:
+   case REQ_OP_WRITE:
+   return blk_rq_zone_is_seq(rq);
+   default:
+   return false;
+   }
+}
+EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
+
+void __blk_req_zone_write_lock(struct request *rq)
+{
+   if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
+ rq->q->seq_zones_wlock)))
+   return;
+
+   WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
+   rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
+}
+EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
+
+void __blk_req_zone_write_unlock(struct request *rq)
+{
+   rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
+   if (rq->q->seq_zones_wlock)
+   WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
+rq->q->seq_zones_wlock));
+}
+EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
+
 /*
  * Check that a zone report belongs to the partition.
  * If yes, fix its start sector and write pointer, copy it in the
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8089ca17db9a..46e606f5b44b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -121,6 +121,8 @@ typedef __u32 __bitwise req_flags_t;
 /* Look at ->special_vec for the actual data payload instead of the
bio chain. */
 #define RQF_SPECIAL_PAYLOAD((__force req_flags_t)(1 << 18))
+/* The per-zone write lock is held for this request */
+#define RQF_ZONE_WRITE_LOCKED  ((__force req_flags_t)(1 << 19))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
@@ -546,6 +548,22 @@ struct request_queue {
 
struct queue_limits limits;
 
+   /*
+* Zoned block device information for request dispatch control.
+* nr_zones is the total number of zones of the device. This is always
+* 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones
+* bits which indicates if a zone is conventional (bit clear) or
+* sequential (bit set). seq_zones_wlock is a bitmap of 

Re: [PATCH] scsi: remove extra white space at the end of the line

2017-12-20 Thread Damien Le Moal
Jason,

On 12/21/17 11:40, Jason Yan wrote:
> My editor always try to remove the extra white space at the end of the
> line when I make some changes. I'm tired of adjusting them manually.
> Can we remove them in mainline?
> 
> Signed-off-by: Jason Yan <yanai...@huawei.com>
> ---
>  drivers/scsi/scsi.c   |  2 +-
>  drivers/scsi/scsi_lib.c   | 20 ++--
>  drivers/scsi/scsi_scan.c  | 10 +-
>  drivers/scsi/scsi_sysfs.c |  4 ++--
>  4 files changed, 18 insertions(+), 18 deletions(-)

There are plenty in drivers/scsi/sd.c too :)
I would also love to get rid of these white spaces so that my emacs
stops looking like a Christmas tree all year long.
Last time I tried I got nacked...

Martin,

What about a (may be big) whitespace-only patch to clean up all the scsi
code and finally get checkpatch to stop screaming at us if we touch a
line near one with whitespaces (which can happen a lot...) ?

Best regards.

-- 
Damien Le Moal,
Western Digital

Re: [PATCH V8 0/7] blk-mq support for ZBC disks

2017-12-11 Thread Damien Le Moal
Jens,

On Fri, 2017-11-24 at 16:54 -0700, Jens Axboe wrote:
> On 11/24/2017 04:48 PM, Damien Le Moal wrote:
> > [Full quote deleted]
> > 
> > Hi Jens,
> > 
> > Any comment regarding this series ?
> > I understand that this would be for the 4.16 merge window, so no hurry,
> > but I would like to know if I need to go back to the drawing board for
> > ZBC blk-mq/scsi-mq support or if this is an acceptable solution.
> 
> I'll give it a thorough look-over on Monday.

Would you have any comment on the series ?
I understand you are busy so please feel free to let me know if there is
anything I can do to facilitate your review.

Thank you.

-- 
Damien Le Moal
Western Digital

Re: [PATCH V8 0/7] blk-mq support for ZBC disks

2017-11-24 Thread Damien Le Moal
[Full quote deleted]

Hi Jens,

Any comment regarding this series ?
I understand that this would be for the 4.16 merge window, so no hurry,
but I would like to know if I need to go back to the drawing board for
ZBC blk-mq/scsi-mq support or if this is an acceptable solution.

Best regards.

-- 
Damien Le Moal
Western Digital Research


[PATCH V8 2/7] mq-deadline: Introduce dispatch helpers

2017-11-08 Thread Damien Le Moal
Avoid directly referencing the next_rq and fifo_list arrays using the
helper functions deadline_next_request() and deadline_fifo_request() to
facilitate changes in the dispatch request selection in
__dd_dispatch_request() for zoned block devices.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Bart Van Assche <bart.vanass...@wdc.com>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
---
 block/mq-deadline.c | 45 +
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 0179e484ec98..8bd6db9e69c7 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -192,13 +192,42 @@ static inline int deadline_check_fifo(struct 
deadline_data *dd, int ddir)
 }
 
 /*
+ * For the specified data direction, return the next request to
+ * dispatch using arrival ordered lists.
+ */
+static struct request *
+deadline_fifo_request(struct deadline_data *dd, int data_dir)
+{
+   if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+   return NULL;
+
+   if (list_empty(&dd->fifo_list[data_dir]))
+   return NULL;
+
+   return rq_entry_fifo(dd->fifo_list[data_dir].next);
+}
+
+/*
+ * For the specified data direction, return the next request to
+ * dispatch using sector position sorted lists.
+ */
+static struct request *
+deadline_next_request(struct deadline_data *dd, int data_dir)
+{
+   if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+   return NULL;
+
+   return dd->next_rq[data_dir];
+}
+
+/*
  * deadline_dispatch_requests selects the best request according to
  * read/write expire, fifo_batch, etc
  */
 static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
-   struct request *rq;
+   struct request *rq, *next_rq;
bool reads, writes;
int data_dir;
 
@@ -214,10 +243,9 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
/*
 * batches are currently reads XOR writes
 */
-   if (dd->next_rq[WRITE])
-   rq = dd->next_rq[WRITE];
-   else
-   rq = dd->next_rq[READ];
+   rq = deadline_next_request(dd, WRITE);
+   if (!rq)
+   rq = deadline_next_request(dd, READ);
 
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request are still entitled to batch */
@@ -260,19 +288,20 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
/*
 * we are not running a batch, find best request for selected data_dir
 */
-   if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+   next_rq = deadline_next_request(dd, data_dir);
+   if (deadline_check_fifo(dd, data_dir) || !next_rq) {
/*
 * A deadline has expired, the last request was in the other
 * direction, or we have run out of higher-sectored requests.
 * Start again from the request with the earliest expiry time.
 */
-   rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+   rq = deadline_fifo_request(dd, data_dir);
} else {
/*
 * The last req was the same dir and we have a next request in
 * sort order. No expired requests so continue on from here.
 */
-   rq = dd->next_rq[data_dir];
+   rq = next_rq;
}
 
dd->batching = 0;
-- 
2.13.6



[PATCH V8 4/7] deadline-iosched: Introduce dispatch helpers

2017-11-08 Thread Damien Le Moal
Avoid directly referencing the next_rq and fifo_list arrays using the
helper functions deadline_next_request() and deadline_fifo_request() to
facilitate changes in the dispatch request selection in
deadline_dispatch_requests() for zoned block devices.

While at it, also remove the unnecessary forward declaration of the
function deadline_move_request().

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
---
 block/deadline-iosched.c | 47 +--
 1 file changed, 37 insertions(+), 10 deletions(-)

diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index b83f77460d28..81e3f0897457 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -50,8 +50,6 @@ struct deadline_data {
int front_merges;
 };
 
-static void deadline_move_request(struct deadline_data *, struct request *);
-
 static inline struct rb_root *
 deadline_rb_root(struct deadline_data *dd, struct request *rq)
 {
@@ -231,6 +229,35 @@ static inline int deadline_check_fifo(struct deadline_data 
*dd, int ddir)
 }
 
 /*
+ * For the specified data direction, return the next request to dispatch using
+ * arrival ordered lists.
+ */
+static struct request *
+deadline_fifo_request(struct deadline_data *dd, int data_dir)
+{
+   if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+   return NULL;
+
+   if (list_empty(&dd->fifo_list[data_dir]))
+   return NULL;
+
+   return rq_entry_fifo(dd->fifo_list[data_dir].next);
+}
+
+/*
+ * For the specified data direction, return the next request to dispatch using
+ * sector position sorted lists.
+ */
+static struct request *
+deadline_next_request(struct deadline_data *dd, int data_dir)
+{
+   if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+   return NULL;
+
+   return dd->next_rq[data_dir];
+}
+
+/*
  * deadline_dispatch_requests selects the best request according to
  * read/write expire, fifo_batch, etc
  */
@@ -239,16 +266,15 @@ static int deadline_dispatch_requests(struct 
request_queue *q, int force)
struct deadline_data *dd = q->elevator->elevator_data;
const int reads = !list_empty(&dd->fifo_list[READ]);
const int writes = !list_empty(&dd->fifo_list[WRITE]);
-   struct request *rq;
+   struct request *rq, *next_rq;
int data_dir;
 
/*
 * batches are currently reads XOR writes
 */
-   if (dd->next_rq[WRITE])
-   rq = dd->next_rq[WRITE];
-   else
-   rq = dd->next_rq[READ];
+   rq = deadline_next_request(dd, WRITE);
+   if (!rq)
+   rq = deadline_next_request(dd, READ);
 
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request are still entitled to batch */
@@ -291,19 +317,20 @@ static int deadline_dispatch_requests(struct 
request_queue *q, int force)
/*
 * we are not running a batch, find best request for selected data_dir
 */
-   if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+   next_rq = deadline_next_request(dd, data_dir);
+   if (deadline_check_fifo(dd, data_dir) || !next_rq) {
/*
 * A deadline has expired, the last request was in the other
 * direction, or we have run out of higher-sectored requests.
 * Start again from the request with the earliest expiry time.
 */
-   rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+   rq = deadline_fifo_request(dd, data_dir);
} else {
/*
 * The last req was the same dir and we have a next request in
 * sort order. No expired requests so continue on from here.
 */
-   rq = dd->next_rq[data_dir];
+   rq = next_rq;
}
 
dd->batching = 0;
-- 
2.13.6



[PATCH V8 6/7] sd_zbc: Initialize device request queue zoned data

2017-11-08 Thread Damien Le Moal
Initialize the seq_zones_bitmap, seq_zones_wlock and nr_zones fields of
the disk request queue on disk revalidate. As the seq_zones_bitmap
and seq_zones_wlock allocations are identical, introduce the helper
sd_zbc_alloc_zone_bitmap(). Using this helper, reallocate the bitmaps
whenever the disk capacity (number of zones) changes.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 drivers/scsi/sd_zbc.c | 152 +++---
 1 file changed, 144 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 27793b9f54c0..c715b8363ce0 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -586,8 +586,123 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
return 0;
 }
 
+/**
+ * sd_zbc_alloc_zone_bitmap - Allocate a zone bitmap (one bit per zone).
+ * @sdkp: The disk of the bitmap
+ */
+static inline unsigned long *sd_zbc_alloc_zone_bitmap(struct scsi_disk *sdkp)
+{
+   struct request_queue *q = sdkp->disk->queue;
+
+   return kzalloc_node(BITS_TO_LONGS(sdkp->nr_zones)
+   * sizeof(unsigned long),
+   GFP_KERNEL, q->node);
+}
+
+/**
+ * sd_zbc_get_seq_zones - Parse report zones reply to identify sequential zones
+ * @sdkp: disk used
+ * @buf: report reply buffer
+ * @seq_zones_bitmap: bitmap of sequential zones to set
+ *
+ * Parse reported zone descriptors in @buf to identify sequential zones and
+ * set the reported zone bit in @seq_zones_bitmap accordingly.
+ * Since read-only and offline zones cannot be written, do not
+ * mark them as sequential in the bitmap.
+ * Return the LBA after the last zone reported.
+ */
+static sector_t sd_zbc_get_seq_zones(struct scsi_disk *sdkp, unsigned char 
*buf,
+unsigned int buflen,
+unsigned long *seq_zones_bitmap)
+{
+   sector_t lba, next_lba = sdkp->capacity;
+   unsigned int buf_len, list_length;
+   unsigned char *rec;
+   u8 type, cond;
+
+   list_length = get_unaligned_be32(&buf[0]) + 64;
+   buf_len = min(list_length, buflen);
+   rec = buf + 64;
+
+   while (rec < buf + buf_len) {
+   type = rec[0] & 0x0f;
+   cond = (rec[1] >> 4) & 0xf;
+   lba = get_unaligned_be64(&rec[16]);
+   if (type != ZBC_ZONE_TYPE_CONV &&
+   cond != ZBC_ZONE_COND_READONLY &&
+   cond != ZBC_ZONE_COND_OFFLINE)
+   set_bit(lba >> sdkp->zone_shift, seq_zones_bitmap);
+   next_lba = lba + get_unaligned_be64(&rec[8]);
+   rec += 64;
+   }
+
+   return next_lba;
+}
+
+/**
+ * sd_zbc_setup_seq_zones_bitmap - Initialize the disk seq zone bitmap.
+ * @sdkp: target disk
+ *
+ * Allocate a zone bitmap and initialize it by identifying sequential zones.
+ */
+static int sd_zbc_setup_seq_zones_bitmap(struct scsi_disk *sdkp)
+{
+   struct request_queue *q = sdkp->disk->queue;
+   unsigned long *seq_zones_bitmap;
+   sector_t lba = 0;
+   unsigned char *buf;
+   int ret = -ENOMEM;
+
+   seq_zones_bitmap = sd_zbc_alloc_zone_bitmap(sdkp);
+   if (!seq_zones_bitmap)
+   return -ENOMEM;
+
+   buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
+   if (!buf)
+   goto out;
+
+   while (lba < sdkp->capacity) {
+   ret = sd_zbc_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, lba);
+   if (ret)
+   goto out;
+   lba = sd_zbc_get_seq_zones(sdkp, buf, SD_ZBC_BUF_SIZE,
+  seq_zones_bitmap);
+   }
+
+   if (lba != sdkp->capacity) {
+   /* Something went wrong */
+   ret = -EIO;
+   }
+
+out:
+   kfree(buf);
+   if (ret) {
+   kfree(seq_zones_bitmap);
+   return ret;
+   }
+
+   q->seq_zones_bitmap = seq_zones_bitmap;
+
+   return 0;
+}
+
+static void sd_zbc_cleanup(struct scsi_disk *sdkp)
+{
+   struct request_queue *q = sdkp->disk->queue;
+
+   kfree(q->seq_zones_bitmap);
+   q->seq_zones_bitmap = NULL;
+
+   kfree(q->seq_zones_wlock);
+   q->seq_zones_wlock = NULL;
+
+   q->nr_zones = 0;
+}
+
 static int sd_zbc_setup(struct scsi_disk *sdkp)
 {
+   struct request_queue *q = sdkp->disk->queue;
+   int ret;
 
/* READ16/WRITE16 is mandatory for ZBC disks */
sdkp->device->use_16_for_rw = 1;
@@ -599,15 +714,36 @@ static int sd_zbc_setup(struct scsi_disk *sdkp)
sdkp->nr_zones =
round_up(sdkp->capacity, sdkp->zone_blocks) >> sdkp->zone_shift;
 
-   if (!sdkp->zones_wlock) {
-   sdkp->zones_wlock = kcalloc(BITS_TO_LONGS(sdkp->nr_zones),
-   sizeof(unsigned l

[PATCH V8 0/7] blk-mq support for ZBC disks

2017-11-08 Thread Damien Le Moal
This series, formerly titled "scsi-mq support for ZBC disks", implements
support for ZBC disks for system using the scsi-mq I/O path.

The current scsi level support of ZBC disks guarantees write request ordering
using a per-zone write lock which prevents issuing simultaneously multiple
write commands to a zone, doing so avoids reordering of sequential writes to
sequential zones. This method is however ineffective when scsi-mq is used with
zoned block devices. This is due to the different execution model of blk-mq
which passes a request to the scsi layer for dispatching after the request has
been removed from the I/O scheduler queue. That is, when the scsi layer tries
to lock the target zone of the request, the request may already be out of
order and zone write locking fails to prevent that.

Various approaches have been tried to solve this problem directly from the core
code of blk-mq. All of them had the serious disadvantage of cluttering blk-mq
code with zoned block device specific conditions and processing, making
maintenance and testing difficult.

This series adds blk-mq support for zoned block devices at the I/O scheduler
level with simple modifications of the mq-deadline scheduler. Implementation
is done with reusable helpers defined in the zoned block device support file
(blk-zoned.c). These helpers provide per zone write locking control functions
similar to what was implemented directly in the SCSI layer in sd_zbc.c.
The zone write locking mechanism is used by mq-deadline for the exact same
purpose, that is, to limit writes per zone to at most one request to avoid
reordering.

The changes to mq-deadline do not affect its operation with regular disks. The
same scheduling behavior is maintained for these devices. Compared to the SCSI
layer zone locking implementation, this series avoids locking
conventional zones which results in a use of these zones that is comparable to a
regular disk.

This series also implements changes to the legacy deadline-iosched. Doing so,
the zone locking code at the SCSI layer in sd.c and sd_zbc.c can be removed.
This results in a significant simplification of the sd driver command handling.

Patch 1 to 5 introduce the zone locking helpers in the block layer and modify
the deadline and mq-deadline schedulers. They equally apply on top of the block
tree branch for-4.15/block and on top of the scsi tree branch 4.15/scsi-queue.
Patch 6 to 8 remove the SCSI layer zone locking and initialize the device
request queue zone information. They apply to the scsi tree branch
4.15/scsi-queue. To cleanly apply these last 3 patches to the block tree branch
for-4.15/block, the following patches from the scsi tree must first be applied:

aa8a845662 "scsi: sd_zbc: Move ZBC declarations to scsi_proto.h"
e98f42bcad "scsi: sd_zbc: Fix comments and indentation"
5eed92d173 "scsi: sd_zbc: Rearrange code"
e8c77ec483 "scsi: sd_zbc: Use well defined macros"
4a109032e3 "scsi: sd_zbc: Fix sd_zbc_read_zoned_characteristics()"

Of note is that this series imposes the use of the deadline and mq-deadline
schedulers with zoned block devices. A system can trivially enforce this using
a udev rule such as:

ACTION=="add|change", KERNEL=="sd[a-z]", ATTRS{queue/zoned}=="host-managed", \
ATTR{queue/scheduler}="deadline"

This rule applies equally for the legacy SCSI path as well as the scsi-mq path
thanks to "mq-deadline" being aliased to "deadline".

Comments are as always very much appreciated.

Changes from v7:
* Merged patch 8 into patch 6
* Fixed various typos and commit messages

Changes from v6:
* Implement zone write locking helpers in the block layer
* Also modify legacy path deadline scheduler to remove all zone write locking
  code from the scsi layer

Changes from v5:
* Refactor patches to introduce the zone_lock spinlock only when needed
* Addressed Bart's comments (in particular explanations of the zone_lock
  spinlock use)

Changes from v4:
* Various fixes and improvements (From Christoph's comments)
* Dropped zones_wlock scheduler tunable attribute

Changes from v3:
* Integrated support directly into mq-deadline instead of creating a new I/O
  scheduler.
* Disable setting of default mq scheduler for single queue devices

Changes from v2:
* Introduced blk_zoned structure
* Moved I/O scheduler from drivers/scsi to block

Changes from v1:
* Addressed Bart's comments for the blk-mq patches (declarations files)
* Split (former) patch 4 into multiple patches to facilitate review
* Fixed scsi disk lookup from io scheduler by introducing
  scsi_disk_from_queue()

Christoph Hellwig (1):
  block: introduce zoned block devices zone write locking

Damien Le Moal (6):
  mq-deadline: Introduce dispatch helpers
  mq-deadline: Introduce zone locking support
  deadline-iosched: Introduce dispatch helpers
  deadline-iosched: Introduce zone locking support
  sd_zbc: Initialize device req

[PATCH V8 5/7] deadline-iosched: Introduce zone locking support

2017-11-08 Thread Damien Le Moal
Introduce zone write locking to avoid write request reordering with
zoned block devices. This is achieved using a finer selection of the
next request to dispatch:
1) Any non-write request is always allowed to proceed.
2) Any write to a conventional zone is always allowed to proceed.
3) For a write to a sequential zone, the zone lock is first checked.
   a) If the zone is not locked, the write is allowed to proceed after
  its target zone is locked.
   b) If the zone is locked, the write request is skipped and the next
  request in the dispatch queue tested (back to step 1).

For a write request that has locked its target zone, the zone is
unlocked either when the request completes and the method
deadline_request_completed() is called, or when the request is requeued
using the method deadline_add_request().

Requests targeting a locked zone are always left in the scheduler queue
to preserve the initial write order. If no write request can be
dispatched, allow reads to be dispatched even if the write batch is not
done.

If the device used is not a zoned block device, or if zoned block device
support is disabled, this patch does not modify deadline behavior.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
---
 block/deadline-iosched.c | 71 ++--
 1 file changed, 68 insertions(+), 3 deletions(-)

diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 81e3f0897457..9de9f156e203 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -98,6 +98,12 @@ deadline_add_request(struct request_queue *q, struct request 
*rq)
struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq);
 
+   /*
+* This may be a requeue of a write request that has locked its
+* target zone. If it is the case, this releases the zone lock.
+*/
+   blk_req_zone_write_unlock(rq);
+
deadline_add_rq_rb(dd, rq);
 
/*
@@ -188,6 +194,12 @@ deadline_move_to_dispatch(struct deadline_data *dd, struct 
request *rq)
 {
struct request_queue *q = rq->q;
 
+   /*
+* For a zoned block device, write requests must write lock their
+* target zone.
+*/
+   blk_req_zone_write_lock(rq);
+
deadline_remove_request(q, rq);
elv_dispatch_add_tail(q, rq);
 }
@@ -235,13 +247,28 @@ static inline int deadline_check_fifo(struct 
deadline_data *dd, int ddir)
 static struct request *
 deadline_fifo_request(struct deadline_data *dd, int data_dir)
 {
+   struct request *rq;
+
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
 
	if (list_empty(&dd->fifo_list[data_dir]))
return NULL;
 
-   return rq_entry_fifo(dd->fifo_list[data_dir].next);
+   rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+   if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+   return rq;
+
+   /*
+* Look for a write request that can be dispatched, that is one with
+* an unlocked target zone.
+*/
+   list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
+   if (blk_req_can_dispatch_to_zone(rq))
+   return rq;
+   }
+
+   return NULL;
 }
 
 /*
@@ -251,10 +278,29 @@ deadline_fifo_request(struct deadline_data *dd, int 
data_dir)
 static struct request *
 deadline_next_request(struct deadline_data *dd, int data_dir)
 {
+   struct request *rq;
+
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
 
-   return dd->next_rq[data_dir];
+   rq = dd->next_rq[data_dir];
+   if (!rq)
+   return NULL;
+
+   if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+   return rq;
+
+   /*
+* Look for a write request that can be dispatched, that is one with
+* an unlocked target zone.
+*/
+   while (rq) {
+   if (blk_req_can_dispatch_to_zone(rq))
+   return rq;
+   rq = deadline_latter_request(rq);
+   }
+
+   return NULL;
 }
 
 /*
@@ -288,7 +334,8 @@ static int deadline_dispatch_requests(struct request_queue 
*q, int force)
if (reads) {
	BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
 
-   if (writes && (dd->starved++ >= dd->writes_starved))
+   if (deadline_fifo_request(dd, WRITE) &&
+   (dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
 
data_dir = READ;
@@ -333,6 +380,13 @@ static int deadline_dispatch_requests(struct request_queue 
*q, int force)
rq = next_rq;
}
 
+   /*
+* For a zoned bl

[PATCH V8 1/7] block: introduce zoned block devices zone write locking

2017-11-08 Thread Damien Le Moal
From: Christoph Hellwig <h...@lst.de>

Components relying only on the request_queue structure for accessing
block devices (e.g. I/O schedulers) have limited knowledge of the
device characteristics. In particular, the device capacity cannot be
easily discovered, which for a zoned block device also results in the
inability to easily know the number of zones of the device (the zone
size is indicated by the chunk_sectors field of the queue limits).

Introduce the nr_zones field to the request_queue structure to simplify
access to this information. Also, add the bitmap seq_zone_bitmap which
indicates which zones of the device are sequential zones (write
preferred or write required) and the bitmap seq_zones_wlock which
indicates if a zone is write locked, that is, if a write request
targeting a zone was dispatched to the device. These fields are
initialized by the low level block device driver (sd.c for ZBC/ZAC
disks). They are not initialized by stacking drivers (device mappers)
handling zoned block devices (e.g. dm-linear).

Using this, I/O schedulers can introduce zone write locking to control
request dispatching to a zoned block device and avoid write request
reordering by limiting to at most a single write request per zone
outside of the scheduler at any time.

Based on previous patches from Damien Le Moal.

Signed-off-by: Christoph Hellwig <h...@lst.de>
[Damien]
* Fixed comments and indentation in blkdev.h
* Changed helper functions
* Fixed this commit message
Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
---
 block/blk-core.c   |   1 +
 block/blk-zoned.c  |  42 +++
 include/linux/blkdev.h | 111 +
 3 files changed, 154 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index b8d1aa2d1008..e887c0b45d0b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1555,6 +1555,7 @@ void __blk_put_request(struct request_queue *q, struct 
request *req)
 
lockdep_assert_held(q->queue_lock);
 
+   blk_req_zone_write_unlock(req);
blk_pm_put_request(req);
 
elv_completed_request(q, req);
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ff57fb51b338..acb7252c7e81 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -22,6 +22,48 @@ static inline sector_t blk_zone_start(struct request_queue 
*q,
 }
 
 /*
+ * Return true if a request is a write request that needs zone write locking.
+ */
+bool blk_req_needs_zone_write_lock(struct request *rq)
+{
+   if (!rq->q->seq_zones_wlock)
+   return false;
+
+   if (blk_rq_is_passthrough(rq))
+   return false;
+
+   switch (req_op(rq)) {
+   case REQ_OP_WRITE_ZEROES:
+   case REQ_OP_WRITE_SAME:
+   case REQ_OP_WRITE:
+   return blk_rq_zone_is_seq(rq);
+   default:
+   return false;
+   }
+}
+EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
+
+void __blk_req_zone_write_lock(struct request *rq)
+{
+   if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
+ rq->q->seq_zones_wlock)))
+   return;
+
+   WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
+   rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
+}
+EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
+
+void __blk_req_zone_write_unlock(struct request *rq)
+{
+   rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
+   if (rq->q->seq_zones_wlock)
+   WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
+rq->q->seq_zones_wlock));
+}
+EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
+
+/*
  * Check that a zone report belongs to the partition.
  * If yes, fix its start sector and write pointer, copy it in the
  * zone information array and return true. Return false otherwise.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 225617dd0a3f..1c715dae31e4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -120,6 +120,8 @@ typedef __u32 __bitwise req_flags_t;
 /* Look at ->special_vec for the actual data payload instead of the
bio chain. */
 #define RQF_SPECIAL_PAYLOAD((__force req_flags_t)(1 << 18))
+/* The per-zone write lock is held for this request */
+#define RQF_ZONE_WRITE_LOCKED  ((__force req_flags_t)(1 << 19))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
@@ -546,6 +548,22 @@ struct request_queue {
struct queue_limits limits;
 
/*
+* Zoned block device information for request dispatch control.
+* nr_zones is the total number of zones of the device. This is always
+* 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones
+* bits which indicates if a zone is conventional (bit clear) or
+* sequential (bit set). seq_zones_wloc

[PATCH V8 7/7] sd: Remove zone write locking

2017-11-08 Thread Damien Le Moal
The block layer now handles zone write locking.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
---
 drivers/scsi/sd.c| 41 +++-
 drivers/scsi/sd.h| 11 ---
 drivers/scsi/sd_zbc.c| 83 
 include/scsi/scsi_cmnd.h |  3 +-
 4 files changed, 6 insertions(+), 132 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index fb9f8b5f4673..2d5b0f84ff14 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -835,16 +835,13 @@ static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd 
*cmd)
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
-   int ret;
 
if (!(rq->cmd_flags & REQ_NOUNMAP)) {
switch (sdkp->zeroing_mode) {
case SD_ZERO_WS16_UNMAP:
-   ret = sd_setup_write_same16_cmnd(cmd, true);
-   goto out;
+   return sd_setup_write_same16_cmnd(cmd, true);
case SD_ZERO_WS10_UNMAP:
-   ret = sd_setup_write_same10_cmnd(cmd, true);
-   goto out;
+   return sd_setup_write_same10_cmnd(cmd, true);
}
}
 
@@ -852,15 +849,9 @@ static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd 
*cmd)
return BLKPREP_INVALID;
 
	if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff)
-   ret = sd_setup_write_same16_cmnd(cmd, false);
-   else
-   ret = sd_setup_write_same10_cmnd(cmd, false);
-
-out:
-   if (sd_is_zoned(sdkp) && ret == BLKPREP_OK)
-   return sd_zbc_write_lock_zone(cmd);
+   return sd_setup_write_same16_cmnd(cmd, false);
 
-   return ret;
+   return sd_setup_write_same10_cmnd(cmd, false);
 }
 
 static void sd_config_write_same(struct scsi_disk *sdkp)
@@ -928,12 +919,6 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
 
BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
 
-   if (sd_is_zoned(sdkp)) {
-   ret = sd_zbc_write_lock_zone(cmd);
-   if (ret != BLKPREP_OK)
-   return ret;
-   }
-
sector >>= ilog2(sdp->sector_size) - 9;
nr_sectors >>= ilog2(sdp->sector_size) - 9;
 
@@ -968,9 +953,6 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
ret = scsi_init_io(cmd);
rq->__data_len = nr_bytes;
 
-   if (sd_is_zoned(sdkp) && ret != BLKPREP_OK)
-   sd_zbc_write_unlock_zone(cmd);
-
return ret;
 }
 
@@ -1000,19 +982,12 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd 
*SCpnt)
sector_t threshold;
unsigned int this_count = blk_rq_sectors(rq);
unsigned int dif, dix;
-   bool zoned_write = sd_is_zoned(sdkp) && rq_data_dir(rq) == WRITE;
int ret;
unsigned char protect;
 
-   if (zoned_write) {
-   ret = sd_zbc_write_lock_zone(SCpnt);
-   if (ret != BLKPREP_OK)
-   return ret;
-   }
-
ret = scsi_init_io(SCpnt);
if (ret != BLKPREP_OK)
-   goto out;
+   return ret;
WARN_ON_ONCE(SCpnt != rq->special);
 
/* from here on until we're complete, any goto out
@@ -1231,9 +1206,6 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd 
*SCpnt)
 */
ret = BLKPREP_OK;
  out:
-   if (zoned_write && ret != BLKPREP_OK)
-   sd_zbc_write_unlock_zone(SCpnt);
-
return ret;
 }
 
@@ -1277,9 +1249,6 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt)
 {
struct request *rq = SCpnt->request;
 
-   if (SCpnt->flags & SCMD_ZONE_WRITE_LOCK)
-   sd_zbc_write_unlock_zone(SCpnt);
-
if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
__free_page(rq->special_vec.bv_page);
 
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 99c4dde9b6bf..112627c1cc85 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -76,7 +76,6 @@ struct scsi_disk {
unsigned intnr_zones;
unsigned intzone_blocks;
unsigned intzone_shift;
-   unsigned long   *zones_wlock;
unsigned intzones_optimal_open;
unsigned intzones_optimal_nonseq;
unsigned intzones_max_open;
@@ -282,8 +281,6 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp)
 extern int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer);
 extern void sd_zbc_remove(struct scsi_disk *sdkp);
 extern void sd_zbc_print_zones(struct scsi_disk *sdkp);
-extern int sd_zbc_w

[PATCH V8 3/7] mq-deadline: Introduce zone locking support

2017-11-08 Thread Damien Le Moal
Introduce zone write locking to avoid write request reordering with
zoned block devices. This is achieved using a finer selection of the
next request to dispatch:
1) Any non-write request is always allowed to proceed.
2) Any write to a conventional zone is always allowed to proceed.
3) For a write to a sequential zone, the zone lock is first checked.
   a) If the zone is not locked, the write is allowed to proceed after
  its target zone is locked.
   b) If the zone is locked, the write request is skipped and the next
  request in the dispatch queue tested (back to step 1).

For a write request that has locked its target zone, the zone is
unlocked either when the request completes with a call to the method
deadline_request_completed() or when the request is requeued using
dd_insert_request().

Requests targeting a locked zone are always left in the scheduler queue
to preserve the lba ordering for write requests. If no write request
can be dispatched, allow reads to be dispatched even if the write batch
is not done.

If the device used is not a zoned block device, or if zoned block device
support is disabled, this patch does not modify mq-deadline behavior.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
---
 block/mq-deadline.c | 89 +++--
 1 file changed, 86 insertions(+), 3 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 8bd6db9e69c7..d56972e8ebda 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -59,6 +59,7 @@ struct deadline_data {
int front_merges;
 
spinlock_t lock;
+   spinlock_t zone_lock;
struct list_head dispatch;
 };
 
@@ -198,13 +199,33 @@ static inline int deadline_check_fifo(struct 
deadline_data *dd, int ddir)
 static struct request *
 deadline_fifo_request(struct deadline_data *dd, int data_dir)
 {
+   struct request *rq;
+   unsigned long flags;
+
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
 
	if (list_empty(&dd->fifo_list[data_dir]))
return NULL;
 
-   return rq_entry_fifo(dd->fifo_list[data_dir].next);
+   rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+   if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+   return rq;
+
+   /*
+* Look for a write request that can be dispatched, that is one with
+* an unlocked target zone.
+*/
+   spin_lock_irqsave(&dd->zone_lock, flags);
+   list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
+   if (blk_req_can_dispatch_to_zone(rq))
+   goto out;
+   }
+   rq = NULL;
+out:
+   spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+   return rq;
 }
 
 /*
@@ -214,10 +235,32 @@ deadline_fifo_request(struct deadline_data *dd, int 
data_dir)
 static struct request *
 deadline_next_request(struct deadline_data *dd, int data_dir)
 {
+   struct request *rq;
+   unsigned long flags;
+
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
 
-   return dd->next_rq[data_dir];
+   rq = dd->next_rq[data_dir];
+   if (!rq)
+   return NULL;
+
+   if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+   return rq;
+
+   /*
+* Look for a write request that can be dispatched, that is one with
+* an unlocked target zone.
+*/
+   spin_lock_irqsave(&dd->zone_lock, flags);
+   while (rq) {
+   if (blk_req_can_dispatch_to_zone(rq))
+   break;
+   rq = deadline_latter_request(rq);
+   }
+   spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+   return rq;
 }
 
 /*
@@ -259,7 +302,8 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
if (reads) {
	BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
 
-   if (writes && (dd->starved++ >= dd->writes_starved))
+   if (deadline_fifo_request(dd, WRITE) &&
+   (dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
 
data_dir = READ;
@@ -304,6 +348,13 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
rq = next_rq;
}
 
+   /*
+* For a zoned block device, if we only have writes queued and none of
+* them can be dispatched, rq will be NULL.
+*/
+   if (!rq)
+   return NULL;
+
dd->batching = 0;
 
 dispatch_request:
@@ -313,6 +364,10 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
dd->batching++;
deadline_move_request(dd, rq);
 done:
+   /*
+* If the request needs it

Re: [PATCH V7 6/8] scsi: sd_zbc: Initialize device request queue zoned data

2017-11-08 Thread Damien Le Moal
Martin,

On 11/9/17 08:41, Martin K. Petersen wrote:
> 
> Damien,
> 
>> wait for the disk capacity and number of zones to stabilize on the
>> second revalidation pass to allocate and initialize the bitmaps.
> 
> Stabilize how?

If RC_BASIS is 0, the capacity changes after the first report zones...
In any case, as you suggested, patch 8 should be merged with this one.
I will fix that and post a v8.
Thank you for the review.

-- 
Damien Le Moal,
Western Digital


[PATCH V7 3/8] block: mq-deadline: Introduce zone locking support

2017-11-08 Thread Damien Le Moal
Introduce zone write locking to avoid write request reordering with
zoned block devices. This is achieved using a finer selection of the
next request to dispatch:
1) Any non-write request is always allowed to proceed.
2) Any write to a conventional zone is always allowed to proceed.
3) For a write to a sequential zone, the zone lock is first checked.
   a) If the zone is not locked, the write is allowed to proceed after
  its target zone is locked.
   b) If the zone is locked, the write request is skipped and the next
  request in the dispatch queue tested (back to step 1).

For a write request that has locked its target zone, the zone is
unlocked either when the request completes with a call to the method
deadline_request_completed() or when the request is requeued using
dd_insert_request().

Requests targeting a locked zone are always left in the scheduler queue
to preserve the lba ordering for write requests. If no write request
can be dispatched, allow reads to be dispatched even if the write batch
is not done.

If the device used is not a zoned block device, or if zoned block device
support is disabled, this patch does not modify mq-deadline behavior.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 block/mq-deadline.c | 89 +++--
 1 file changed, 86 insertions(+), 3 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 8bd6db9e69c7..d56972e8ebda 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -59,6 +59,7 @@ struct deadline_data {
int front_merges;
 
spinlock_t lock;
+   spinlock_t zone_lock;
struct list_head dispatch;
 };
 
@@ -198,13 +199,33 @@ static inline int deadline_check_fifo(struct 
deadline_data *dd, int ddir)
 static struct request *
 deadline_fifo_request(struct deadline_data *dd, int data_dir)
 {
+   struct request *rq;
+   unsigned long flags;
+
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
 
	if (list_empty(&dd->fifo_list[data_dir]))
return NULL;
 
-   return rq_entry_fifo(dd->fifo_list[data_dir].next);
+   rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+   if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+   return rq;
+
+   /*
+* Look for a write request that can be dispatched, that is one with
+* an unlocked target zone.
+*/
+   spin_lock_irqsave(&dd->zone_lock, flags);
+   list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
+   if (blk_req_can_dispatch_to_zone(rq))
+   goto out;
+   }
+   rq = NULL;
+out:
+   spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+   return rq;
 }
 
 /*
@@ -214,10 +235,32 @@ deadline_fifo_request(struct deadline_data *dd, int 
data_dir)
 static struct request *
 deadline_next_request(struct deadline_data *dd, int data_dir)
 {
+   struct request *rq;
+   unsigned long flags;
+
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
 
-   return dd->next_rq[data_dir];
+   rq = dd->next_rq[data_dir];
+   if (!rq)
+   return NULL;
+
+   if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+   return rq;
+
+   /*
+* Look for a write request that can be dispatched, that is one with
+* an unlocked target zone.
+*/
+   spin_lock_irqsave(&dd->zone_lock, flags);
+   while (rq) {
+   if (blk_req_can_dispatch_to_zone(rq))
+   break;
+   rq = deadline_latter_request(rq);
+   }
+   spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+   return rq;
 }
 
 /*
@@ -259,7 +302,8 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
if (reads) {
	BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
 
-   if (writes && (dd->starved++ >= dd->writes_starved))
+   if (deadline_fifo_request(dd, WRITE) &&
+   (dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
 
data_dir = READ;
@@ -304,6 +348,13 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
rq = next_rq;
}
 
+   /*
+* For a zoned block device, if we only have writes queued and none of
+* them can be dispatched, rq will be NULL.
+*/
+   if (!rq)
+   return NULL;
+
dd->batching = 0;
 
 dispatch_request:
@@ -313,6 +364,10 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
dd->batching++;
deadline_move_request(dd, rq);
 done:
+   /*
+* If the request needs its target zone locked, do it.
+*/
+   blk_req_zone_write_lock(rq);
rq->rq_flags |= RQF_STARTED;
r

[PATCH V7 8/8] scsi: sd_zbc: Fix zone information initialization

2017-11-08 Thread Damien Le Moal
Make sure that the device request queue zone information (number of
zones and zone bitmaps) are reinitialized if the number of zones
changes (e.g. on a drive capacity change on revalidate).

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 drivers/scsi/sd_zbc.c | 36 ++--
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index d2121e7738d7..6c348a211ebb 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -632,29 +632,29 @@ static int sd_zbc_setup(struct scsi_disk *sdkp)
round_up(sdkp->capacity, sdkp->zone_blocks) >> sdkp->zone_shift;
 
/*
-* Wait for the disk capacity to stabilize before
-* initializing zone related information.
+* Initialize the device request queue information if the number
+* of zones changed.
 */
-   if (sdkp->first_scan)
-   return 0;
+   if (sdkp->nr_zones != q->nr_zones) {
 
-   if (!q->seq_zones_wlock) {
-   q->seq_zones_wlock = sd_zbc_alloc_zone_bitmap(sdkp);
-   if (!q->seq_zones_wlock) {
-   ret = -ENOMEM;
-   goto err;
-   }
-   }
+   sd_zbc_cleanup(sdkp);
 
-   if (!q->seq_zones_bitmap) {
-   ret = sd_zbc_setup_seq_zones_bitmap(sdkp);
-   if (ret) {
-   sd_zbc_cleanup(sdkp);
-   goto err;
+   q->nr_zones = sdkp->nr_zones;
+   if (sdkp->nr_zones) {
+   q->seq_zones_wlock = sd_zbc_alloc_zone_bitmap(sdkp);
+   if (!q->seq_zones_wlock) {
+   ret = -ENOMEM;
+   goto err;
+   }
+
+   ret = sd_zbc_setup_seq_zones_bitmap(sdkp);
+   if (ret) {
+   sd_zbc_cleanup(sdkp);
+   goto err;
+   }
}
-   }
 
-   q->nr_zones = sdkp->nr_zones;
+   }
 
return 0;
 
-- 
2.13.6



[PATCH V7 1/8] block: introduce zoned block devices zone write locking

2017-11-08 Thread Damien Le Moal
From: Christoph Hellwig <h...@lst.de>

Components relying only on the request_queue structure for accessing
block devices (e.g. I/O schedulers) have limited knowledge of the
device characteristics. In particular, the device capacity cannot be
easily discovered, which for a zoned block device also results in the
inability to easily know the number of zones of the device (the zone
size is indicated by the chunk_sectors field of the queue limits).

Introduce the nr_zones field to the request_queue structure to simplify
access to this information. Also, add the bitmap seq_zone_bitmap which
indicates which zones of the device are sequential zones (write
preferred or write required) and the bitmap seq_zones_wlock which
indicates if a zone is write locked, that is, if a write request
targeting a zone was dispatched to the device. These fields are
initialized by the low level block device driver (sd.c for ZBC/ZAC
disks). They are not initialized by stacking drivers (device mappers)
handling zoned block devices (e.g. dm-linear).

Using this, I/O schedulers can introduce zone write locking to control
request dispatching to a zoned block device and avoid write request
reordering by limiting to at most a single write request per zone
outside of the scheduler at any time.

Based on previous patches from Damien Le Moal.

Signed-off-by: Christoph Hellwig <h...@lst.de>
[Damien]
* Fixed comments and indentation in blkdev.h
* Changed helper functions
* Fixed this commit message
Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 block/blk-core.c   |   1 +
 block/blk-zoned.c  |  41 ++
 include/linux/blkdev.h | 110 +
 3 files changed, 152 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index b8d1aa2d1008..e887c0b45d0b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1555,6 +1555,7 @@ void __blk_put_request(struct request_queue *q, struct 
request *req)
 
lockdep_assert_held(q->queue_lock);
 
+   blk_req_zone_write_unlock(req);
blk_pm_put_request(req);
 
elv_completed_request(q, req);
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ff57fb51b338..4ecf7ef8fe72 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -22,6 +22,47 @@ static inline sector_t blk_zone_start(struct request_queue 
*q,
 }
 
 /*
+ * Return true if a request is a write request that needs zone write locking.
+ */
+bool blk_req_needs_zone_write_lock(struct request *rq)
+{
+   if (!rq->q->seq_zones_wlock)
+   return false;
+
+   if (blk_rq_is_passthrough(rq))
+   return false;
+
+   switch (req_op(rq)) {
+   case REQ_OP_WRITE_ZEROES:
+   case REQ_OP_WRITE_SAME:
+   case REQ_OP_WRITE:
+   return blk_rq_zone_is_seq(rq);
+   default:
+   return false;
+   }
+}
+EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
+
+void __blk_req_zone_write_lock(struct request *rq)
+{
+   if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
+ rq->q->seq_zones_wlock)))
+   return;
+
+   WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
+   rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
+}
+EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
+
+void __blk_req_zone_write_unlock(struct request *rq)
+{
+   rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
+   WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
+   rq->q->seq_zones_wlock));
+}
+EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
+
+/*
  * Check that a zone report belongs to the partition.
  * If yes, fix its start sector and write pointer, copy it in the
  * zone information array and return true. Return false otherwise.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 225617dd0a3f..f9a57fe63ffc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -120,6 +120,8 @@ typedef __u32 __bitwise req_flags_t;
 /* Look at ->special_vec for the actual data payload instead of the
bio chain. */
 #define RQF_SPECIAL_PAYLOAD((__force req_flags_t)(1 << 18))
+/* The per-zone write lock is held for this request */
+#define RQF_ZONE_WRITE_LOCKED  ((__force req_flags_t)(1 << 19))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
@@ -546,6 +548,21 @@ struct request_queue {
struct queue_limits limits;
 
/*
+* Zoned block device information for request dispatch control.
+* nr_zones is the total number of zones of the device. This is always
+* 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones
+* bits which indicates if a zone is conventional (bit clear) or
+* sequential (bit set). seq_zones_wlock is a bitmap of nr_zones
+* bits which indicates if a zone is write locked, that is, if a write
+* request targeting the zone

[PATCH V7 7/8] scsi: sd: Remove zone write locking

2017-11-08 Thread Damien Le Moal
The block layer now handles zone write locking.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 drivers/scsi/sd.c|  41 +++---
 drivers/scsi/sd.h|  11 -
 drivers/scsi/sd_zbc.c| 105 +++
 include/scsi/scsi_cmnd.h |   3 +-
 4 files changed, 21 insertions(+), 139 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index fb9f8b5f4673..2d5b0f84ff14 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -835,16 +835,13 @@ static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd 
*cmd)
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
-   int ret;
 
if (!(rq->cmd_flags & REQ_NOUNMAP)) {
switch (sdkp->zeroing_mode) {
case SD_ZERO_WS16_UNMAP:
-   ret = sd_setup_write_same16_cmnd(cmd, true);
-   goto out;
+   return sd_setup_write_same16_cmnd(cmd, true);
case SD_ZERO_WS10_UNMAP:
-   ret = sd_setup_write_same10_cmnd(cmd, true);
-   goto out;
+   return sd_setup_write_same10_cmnd(cmd, true);
}
}
 
@@ -852,15 +849,9 @@ static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd 
*cmd)
return BLKPREP_INVALID;
 
	if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff)
-   ret = sd_setup_write_same16_cmnd(cmd, false);
-   else
-   ret = sd_setup_write_same10_cmnd(cmd, false);
-
-out:
-   if (sd_is_zoned(sdkp) && ret == BLKPREP_OK)
-   return sd_zbc_write_lock_zone(cmd);
+   return sd_setup_write_same16_cmnd(cmd, false);
 
-   return ret;
+   return sd_setup_write_same10_cmnd(cmd, false);
 }
 
 static void sd_config_write_same(struct scsi_disk *sdkp)
@@ -928,12 +919,6 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
 
BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
 
-   if (sd_is_zoned(sdkp)) {
-   ret = sd_zbc_write_lock_zone(cmd);
-   if (ret != BLKPREP_OK)
-   return ret;
-   }
-
sector >>= ilog2(sdp->sector_size) - 9;
nr_sectors >>= ilog2(sdp->sector_size) - 9;
 
@@ -968,9 +953,6 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
ret = scsi_init_io(cmd);
rq->__data_len = nr_bytes;
 
-   if (sd_is_zoned(sdkp) && ret != BLKPREP_OK)
-   sd_zbc_write_unlock_zone(cmd);
-
return ret;
 }
 
@@ -1000,19 +982,12 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd 
*SCpnt)
sector_t threshold;
unsigned int this_count = blk_rq_sectors(rq);
unsigned int dif, dix;
-   bool zoned_write = sd_is_zoned(sdkp) && rq_data_dir(rq) == WRITE;
int ret;
unsigned char protect;
 
-   if (zoned_write) {
-   ret = sd_zbc_write_lock_zone(SCpnt);
-   if (ret != BLKPREP_OK)
-   return ret;
-   }
-
ret = scsi_init_io(SCpnt);
if (ret != BLKPREP_OK)
-   goto out;
+   return ret;
WARN_ON_ONCE(SCpnt != rq->special);
 
/* from here on until we're complete, any goto out
@@ -1231,9 +1206,6 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd 
*SCpnt)
 */
ret = BLKPREP_OK;
  out:
-   if (zoned_write && ret != BLKPREP_OK)
-   sd_zbc_write_unlock_zone(SCpnt);
-
return ret;
 }
 
@@ -1277,9 +1249,6 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt)
 {
struct request *rq = SCpnt->request;
 
-   if (SCpnt->flags & SCMD_ZONE_WRITE_LOCK)
-   sd_zbc_write_unlock_zone(SCpnt);
-
if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
__free_page(rq->special_vec.bv_page);
 
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 99c4dde9b6bf..112627c1cc85 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -76,7 +76,6 @@ struct scsi_disk {
unsigned intnr_zones;
unsigned intzone_blocks;
unsigned intzone_shift;
-   unsigned long   *zones_wlock;
unsigned intzones_optimal_open;
unsigned intzones_optimal_nonseq;
unsigned intzones_max_open;
@@ -282,8 +281,6 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp)
 extern int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer);
 extern void sd_zbc_remove(struct scsi_disk *sdkp);
 extern void sd_zbc_print_zones(struct scsi_disk *sdkp);
-extern int sd_zbc_write_lock_zone(struct scsi_cmnd *cmd);
-extern void sd_zbc_write_unlock_zone(struct scsi_cmnd *cmd);
 extern int sd_zbc_setup

[PATCH V7 4/8] block: deadline-iosched: Introduce dispatch helpers

2017-11-08 Thread Damien Le Moal
Avoid directly referencing the next_rq and fifo_list arrays using the
helper functions deadline_next_request() and deadline_fifo_request() to
facilitate changes in the dispatch request selection in
deadline_dispatch_requests() for zoned block devices.

While at it, also remove the unnecessary forward declaration of the
function deadline_move_request().

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 block/deadline-iosched.c | 47 +--
 1 file changed, 37 insertions(+), 10 deletions(-)

diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index b83f77460d28..81e3f0897457 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -50,8 +50,6 @@ struct deadline_data {
int front_merges;
 };
 
-static void deadline_move_request(struct deadline_data *, struct request *);
-
 static inline struct rb_root *
 deadline_rb_root(struct deadline_data *dd, struct request *rq)
 {
@@ -231,6 +229,35 @@ static inline int deadline_check_fifo(struct deadline_data 
*dd, int ddir)
 }
 
 /*
+ * For the specified data direction, return the next request to dispatch using
+ * arrival ordered lists.
+ */
+static struct request *
+deadline_fifo_request(struct deadline_data *dd, int data_dir)
+{
+   if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+   return NULL;
+
+   if (list_empty(>fifo_list[data_dir]))
+   return NULL;
+
+   return rq_entry_fifo(dd->fifo_list[data_dir].next);
+}
+
+/*
+ * For the specified data direction, return the next request to dispatch using
+ * sector position sorted lists.
+ */
+static struct request *
+deadline_next_request(struct deadline_data *dd, int data_dir)
+{
+   if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+   return NULL;
+
+   return dd->next_rq[data_dir];
+}
+
+/*
  * deadline_dispatch_requests selects the best request according to
  * read/write expire, fifo_batch, etc
  */
@@ -239,16 +266,15 @@ static int deadline_dispatch_requests(struct 
request_queue *q, int force)
struct deadline_data *dd = q->elevator->elevator_data;
const int reads = !list_empty(>fifo_list[READ]);
const int writes = !list_empty(>fifo_list[WRITE]);
-   struct request *rq;
+   struct request *rq, *next_rq;
int data_dir;
 
/*
 * batches are currently reads XOR writes
 */
-   if (dd->next_rq[WRITE])
-   rq = dd->next_rq[WRITE];
-   else
-   rq = dd->next_rq[READ];
+   rq = deadline_next_request(dd, WRITE);
+   if (!rq)
+   rq = deadline_next_request(dd, READ);
 
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request are still entitled to batch */
@@ -291,19 +317,20 @@ static int deadline_dispatch_requests(struct 
request_queue *q, int force)
/*
 * we are not running a batch, find best request for selected data_dir
 */
-   if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+   next_rq = deadline_next_request(dd, data_dir);
+   if (deadline_check_fifo(dd, data_dir) || !next_rq) {
/*
 * A deadline has expired, the last request was in the other
 * direction, or we have run out of higher-sectored requests.
 * Start again from the request with the earliest expiry time.
 */
-   rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+   rq = deadline_fifo_request(dd, data_dir);
} else {
/*
 * The last req was the same dir and we have a next request in
 * sort order. No expired requests so continue on from here.
 */
-   rq = dd->next_rq[data_dir];
+   rq = next_rq;
}
 
dd->batching = 0;
-- 
2.13.6



[PATCH V7 6/8] scsi: sd_zbc: Initialize device request queue zoned data

2017-11-08 Thread Damien Le Moal
Initialize the seq_zone_bitmap and nr_zones fields of the disk request
queue on disk revalidate. As the seq_zone_bitmap allocation is
identical to the allocation of the zone write lock bitmap, introduce
the helper sd_zbc_alloc_zone_bitmap(). Using this helper, wait for the
disk capacity and number of zones to stabilize on the second
revalidation pass to allocate and initialize the bitmaps.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 drivers/scsi/sd_zbc.c | 138 --
 1 file changed, 133 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 27793b9f54c0..a1bbb5b397e1 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -586,8 +586,121 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
return 0;
 }
 
+/**
+ * sd_zbc_alloc_zone_bitmap - Allocate a zone bitmap (one bit per zone).
+ * @sdkp: The disk of the bitmap
+ */
+static inline unsigned long *sd_zbc_alloc_zone_bitmap(struct scsi_disk *sdkp)
+{
+   struct request_queue *q = sdkp->disk->queue;
+
+   return kzalloc_node(BITS_TO_LONGS(sdkp->nr_zones)
+   * sizeof(unsigned long),
+   GFP_KERNEL, q->node);
+}
+
+/**
+ * sd_zbc_get_seq_zones - Parse report zones reply to identify sequential zones
+ * @sdkp: disk used
+ * @buf: report reply buffer
+ * @seq_zones_bitmap: bitmap of sequential zones to set
+ *
+ * Parse reported zone descriptors in @buf to identify sequential zones and
+ * set the reported zone bit in @seq_zones_bitmap accordingly.
+ * Since read-only and offline zones cannot be written, do not
+ * mark them as sequential in the bitmap.
+ * Return the LBA after the last zone reported.
+ */
+static sector_t sd_zbc_get_seq_zones(struct scsi_disk *sdkp, unsigned char 
*buf,
+unsigned int buflen,
+unsigned long *seq_zones_bitmap)
+{
+   sector_t lba, next_lba = sdkp->capacity;
+   unsigned int buf_len, list_length;
+   unsigned char *rec;
+   u8 type, cond;
+
+   list_length = get_unaligned_be32([0]) + 64;
+   buf_len = min(list_length, buflen);
+   rec = buf + 64;
+
+   while (rec < buf + buf_len) {
+   type = rec[0] & 0x0f;
+   cond = (rec[1] >> 4) & 0xf;
+   lba = get_unaligned_be64([16]);
+   if (type != ZBC_ZONE_TYPE_CONV &&
+   cond != ZBC_ZONE_COND_READONLY &&
+   cond != ZBC_ZONE_COND_OFFLINE)
+   set_bit(lba >> sdkp->zone_shift, seq_zones_bitmap);
+   next_lba = lba + get_unaligned_be64([8]);
+   rec += 64;
+   }
+
+   return next_lba;
+}
+
+/**
+ * sd_zbc_setup_seq_zones_bitmap - Initialize the disk seq zone bitmap.
+ * @sdkp: target disk
+ *
+ * Allocate a zone bitmap and initialize it by identifying sequential zones.
+ */
+static int sd_zbc_setup_seq_zones_bitmap(struct scsi_disk *sdkp)
+{
+   struct request_queue *q = sdkp->disk->queue;
+   unsigned long *seq_zones_bitmap;
+   sector_t lba = 0;
+   unsigned char *buf;
+   int ret = -ENOMEM;
+
+   seq_zones_bitmap = sd_zbc_alloc_zone_bitmap(sdkp);
+   if (!seq_zones_bitmap)
+   return -ENOMEM;
+
+   buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
+   if (!buf)
+   goto out;
+
+   while (lba < sdkp->capacity) {
+   ret = sd_zbc_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, lba);
+   if (ret)
+   goto out;
+   lba = sd_zbc_get_seq_zones(sdkp, buf, SD_ZBC_BUF_SIZE,
+  seq_zones_bitmap);
+   }
+
+   if (lba != sdkp->capacity) {
+   /* Something went wrong */
+   ret = -EIO;
+   }
+
+out:
+   kfree(buf);
+   if (ret) {
+   kfree(seq_zones_bitmap);
+   return ret;
+   }
+
+   q->seq_zones_bitmap = seq_zones_bitmap;
+
+   return 0;
+}
+
+static void sd_zbc_cleanup(struct scsi_disk *sdkp)
+{
+   struct request_queue *q = sdkp->disk->queue;
+
+   kfree(q->seq_zones_bitmap);
+   q->seq_zones_bitmap = NULL;
+
+   kfree(sdkp->zones_wlock);
+   sdkp->zones_wlock = NULL;
+}
+
 static int sd_zbc_setup(struct scsi_disk *sdkp)
 {
+   struct request_queue *q = sdkp->disk->queue;
+   int ret;
 
/* READ16/WRITE16 is mandatory for ZBC disks */
sdkp->device->use_16_for_rw = 1;
@@ -599,14 +712,29 @@ static int sd_zbc_setup(struct scsi_disk *sdkp)
sdkp->nr_zones =
round_up(sdkp->capacity, sdkp->zone_blocks) >> sdkp->zone_shift;
 
+   /*
+* Wait for the disk capacity to stabilize before
+* initializing zone related information.
+*/
+ 

[PATCH V7 2/8] block: mq-deadline: Introduce dispatch helpers

2017-11-08 Thread Damien Le Moal
Avoid directly referencing the next_rq and fifo_list arrays using the
helper functions deadline_next_request() and deadline_fifo_request() to
facilitate changes in the dispatch request selection in
__dd_dispatch_request() for zoned block devices.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Bart Van Assche <bart.vanass...@wdc.com>
---
 block/mq-deadline.c | 45 +
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 0179e484ec98..8bd6db9e69c7 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -192,13 +192,42 @@ static inline int deadline_check_fifo(struct 
deadline_data *dd, int ddir)
 }
 
 /*
+ * For the specified data direction, return the next request to
+ * dispatch using arrival ordered lists.
+ */
+static struct request *
+deadline_fifo_request(struct deadline_data *dd, int data_dir)
+{
+   if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+   return NULL;
+
+   if (list_empty(>fifo_list[data_dir]))
+   return NULL;
+
+   return rq_entry_fifo(dd->fifo_list[data_dir].next);
+}
+
+/*
+ * For the specified data direction, return the next request to
+ * dispatch using sector position sorted lists.
+ */
+static struct request *
+deadline_next_request(struct deadline_data *dd, int data_dir)
+{
+   if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+   return NULL;
+
+   return dd->next_rq[data_dir];
+}
+
+/*
  * deadline_dispatch_requests selects the best request according to
  * read/write expire, fifo_batch, etc
  */
 static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
-   struct request *rq;
+   struct request *rq, *next_rq;
bool reads, writes;
int data_dir;
 
@@ -214,10 +243,9 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
/*
 * batches are currently reads XOR writes
 */
-   if (dd->next_rq[WRITE])
-   rq = dd->next_rq[WRITE];
-   else
-   rq = dd->next_rq[READ];
+   rq = deadline_next_request(dd, WRITE);
+   if (!rq)
+   rq = deadline_next_request(dd, READ);
 
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request are still entitled to batch */
@@ -260,19 +288,20 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
/*
 * we are not running a batch, find best request for selected data_dir
 */
-   if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+   next_rq = deadline_next_request(dd, data_dir);
+   if (deadline_check_fifo(dd, data_dir) || !next_rq) {
/*
 * A deadline has expired, the last request was in the other
 * direction, or we have run out of higher-sectored requests.
 * Start again from the request with the earliest expiry time.
 */
-   rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+   rq = deadline_fifo_request(dd, data_dir);
} else {
/*
 * The last req was the same dir and we have a next request in
 * sort order. No expired requests so continue on from here.
 */
-   rq = dd->next_rq[data_dir];
+   rq = next_rq;
}
 
dd->batching = 0;
-- 
2.13.6



[PATCH V7 0/8] blk-mq support for ZBC disks

2017-11-08 Thread Damien Le Moal
This series, formerly titled "scsi-mq support for ZBC disks", implements
support for ZBC disks for system using the scsi-mq I/O path.

The current scsi level support of ZBC disks guarantees write request ordering
using a per-zone write lock which prevents issuing simultaneously multiple
write commands to a zone, doing so avoids reordering of sequential writes to
sequential zones. This method is however ineffective when scsi-mq is used with
zoned block devices. This is due to the different execution model of blk-mq
which passes a request to the scsi layer for dispatching after the request has
been removed from the I/O scheduler queue. That is, when the scsi layer tries
to lock the target zone of the request, the request may already be out of
order and zone write locking fails to prevent that.

Various approaches have been tried to solve this problem directly from the core
code of blk-mq. All of them had the serious disadvantage of cluttering blk-mq
code with zoned block device specific conditions and processing, making
maintenance and testing difficult.

This series adds blk-mq support for zoned block devices at the I/O scheduler
level with simple modifications of the mq-deadline scheduler. Implementation
is done with reusable helpers defined in the zoned block device support file
(blk-zoned.c). These helpers provide per zone write locking control functions
similar to what was implemented directly in the SCSI layer in sd_zbc.c.
The zone write locking mechanism is used by mq-deadline for the exact same
purpose, that is, to limit writes per zone to at most one request to avoid
reordering.

The changes to mq-deadline do not affect its operation with regular disks. The
same scheduling behavior is maintained for these devices. Compared to the SCSI
layer zone locking implementation, this series is optimized to avoid locking
conventional zones, which results in a use of these zones that is comparable to a
regular disk.

This series also implements changes to the legacy deadline-iosched. Doing so,
the zone locking code at the SCSI layer in sd.c and sd_zbc.c can be removed.
This results in a significant simplification of the sd driver command handling.

Patch 1 to 5 introduce the zone locking helpers in the block layer and modify
the deadline and mq-deadline schedulers. They equally apply on top of the block
tree branch for-4.15/block and on top of the scsi tree branch 4.15/scsi-queue.
Patch 6 to 8 remove the SCSI layer zone locking and initialize the device
request queue zone information. They apply to the scsi tree branch
4.15/scsi-queue. To cleanly apply these last 3 patches to the block tree branch
for-4.15/block, the following patches from the scsi tree must first be applied:

aa8a845662 "scsi: sd_zbc: Move ZBC declarations to scsi_proto.h"
e98f42bcad "scsi: sd_zbc: Fix comments and indentation"
5eed92d173 "scsi: sd_zbc: Rearrange code"
e8c77ec483 "scsi: sd_zbc: Use well defined macros"
4a109032e3 "scsi: sd_zbc: Fix sd_zbc_read_zoned_characteristics()"

Of note is that this series imposes the use of the deadline and mq-deadline
schedulers with zoned block devices. A system can trivially enforce this using
a udev rule such as:

ACTION=="add|change", KERNEL=="sd[a-z]", ATTRS{queue/zoned}=="host-managed", \
ATTR{queue/scheduler}="deadline"

This rules applies equally for the legacy SCSI path as well as the scsi-mq path
thanks to "mq-deadline" being aliased to "deadline".

Comments are as always very much appreciated.

Changes from v6:
* Implement zone write locking helpers in the block layer
* Also modify legacy path deadline scheduler to remove all zone write locking
  code from the scsi layer

Changes from v5:
* Refactor patches to introduce the zone_lock spinlock only when needed
* Addressed Bart's comments (in particular explanations of the zone_lock
  spinlock use)

Changes from v4:
* Various fixes and improvements (From Christoph's comments)
* Dropped zones_wlock scheduler tunable attribute

Changes from v3:
* Integrated support directly into mq-deadline instead of creating a new I/O
  scheduler.
* Disable setting of default mq scheduler for single queue devices

Changes from v2:
* Introduced blk_zoned structure
* Moved I/O scheduler from drivers/scsi to block

Changes from v1:
* Addressed Bart's comments for the blk-mq patches (declarations files)
* Split (former) patch 4 into multiple patches to facilitate review
* Fixed scsi disk lookup from io scheduler by introducing
  scsi_disk_from_queue()

Christoph Hellwig (1):
  block: introduce zoned block devices zone write locking

Damien Le Moal (7):
  block: mq-deadline: Introduce dispatch helpers
  block: mq-deadline: Introduce zone locking support
  block: deadline-iosched: Introduce dispatch helpers
  block: deadline-iosched: Introduce zone locking support
  scsi: sd_zbc: Initialize device request queue zoned data
  scsi: sd: Remove zone

[PATCH V7 5/8] block: deadline-iosched: Introduce zone locking support

2017-11-08 Thread Damien Le Moal
Introduce zone write locking to avoid write request reordering with
zoned block devices. This is achieved using a finer selection of the
next request to dispatch:
1) Any non-write request is always allowed to proceed.
2) Any write to a conventional zone is always allowed to proceed.
3) For a write to a sequential zone, the zone lock is first checked.
   a) If the zone is not locked, the write is allowed to proceed after
  its target zone is locked.
   b) If the zone is locked, the write request is skipped and the next
  request in the dispatch queue tested (back to step 1).

For a write request that has locked its target zone, the zone is
unlocked either when the request completes and the method
deadline_request_completed() is called, or when the request is requeued
using the method deadline_add_request().

Requests targeting a locked zone are always left in the scheduler queue
to preserve the initial write order. If no write request can be
dispatched, allow reads to be dispatched even if the write batch is not
done.

If the device used is not a zoned block device, or if zoned block device
support is disabled, this patch does not modify deadline behavior.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 block/deadline-iosched.c | 71 ++--
 1 file changed, 68 insertions(+), 3 deletions(-)

diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 81e3f0897457..9de9f156e203 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -98,6 +98,12 @@ deadline_add_request(struct request_queue *q, struct request 
*rq)
struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq);
 
+   /*
+* This may be a requeue of a write request that has locked its
+* target zone. If it is the case, this releases the zone lock.
+*/
+   blk_req_zone_write_unlock(rq);
+
deadline_add_rq_rb(dd, rq);
 
/*
@@ -188,6 +194,12 @@ deadline_move_to_dispatch(struct deadline_data *dd, struct 
request *rq)
 {
struct request_queue *q = rq->q;
 
+   /*
+* For a zoned block device, write requests must write lock their
+* target zone.
+*/
+   blk_req_zone_write_lock(rq);
+
deadline_remove_request(q, rq);
elv_dispatch_add_tail(q, rq);
 }
@@ -235,13 +247,28 @@ static inline int deadline_check_fifo(struct 
deadline_data *dd, int ddir)
 static struct request *
 deadline_fifo_request(struct deadline_data *dd, int data_dir)
 {
+   struct request *rq;
+
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
 
if (list_empty(>fifo_list[data_dir]))
return NULL;
 
-   return rq_entry_fifo(dd->fifo_list[data_dir].next);
+   rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+   if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+   return rq;
+
+   /*
+* Look for a write request that can be dispatched, that is one with
+* an unlocked target zone.
+*/
+   list_for_each_entry(rq, >fifo_list[WRITE], queuelist) {
+   if (blk_req_can_dispatch_to_zone(rq))
+   return rq;
+   }
+
+   return NULL;
 }
 
 /*
@@ -251,10 +278,29 @@ deadline_fifo_request(struct deadline_data *dd, int 
data_dir)
 static struct request *
 deadline_next_request(struct deadline_data *dd, int data_dir)
 {
+   struct request *rq;
+
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
return NULL;
 
-   return dd->next_rq[data_dir];
+   rq = dd->next_rq[data_dir];
+   if (!rq)
+   return NULL;
+
+   if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+   return rq;
+
+   /*
+* Look for a write request that can be dispatched, that is one with
+* an unlocked target zone.
+*/
+   while (rq) {
+   if (blk_req_can_dispatch_to_zone(rq))
+   return rq;
+   rq = deadline_latter_request(rq);
+   }
+
+   return NULL;
 }
 
 /*
@@ -288,7 +334,8 @@ static int deadline_dispatch_requests(struct request_queue 
*q, int force)
if (reads) {
BUG_ON(RB_EMPTY_ROOT(>sort_list[READ]));
 
-   if (writes && (dd->starved++ >= dd->writes_starved))
+   if (deadline_fifo_request(dd, WRITE) &&
+   (dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
 
data_dir = READ;
@@ -333,6 +380,13 @@ static int deadline_dispatch_requests(struct request_queue 
*q, int force)
rq = next_rq;
}
 
+   /*
+* For a zoned block device, if we only have writes queued and none of
+* them can be dispatched, rq will be NULL.
+*/
+   if (!rq)

Re: [PATCH] Suppress a kernel warning in case the prep function returns BLKPREP_DEFER

2017-10-22 Thread Damien Le Moal
On 10/20/17 20:46, Bart Van Assche wrote:
> The legacy block layer handles requests as follows:
> - If the prep function returns BLKPREP_OK, let blk_peek_request()
>   return the pointer to that request.
> - If the prep function returns BLKPREP_DEFER, keep the RQF_STARTED
>   flag and retry calling the prep function later.
> - If the prep function returns BLKPREP_KILL or BLKPREP_INVALID, end
>   the request.
> 
> In none of these cases it is correct to clear the SCMD_INITIALIZED
> flag from inside scsi_prep_fn(). Since scsi_prep_fn() already
> guarantees that scsi_init_command() will be called once even if
> scsi_prep_fn() is called multiple times, remove the code that clears
> SCMD_INITIALIZED from scsi_prep_fn().
> 
> The scsi-mq code handles requests as follows:
> - If scsi_mq_prep_fn() returns BLKPREP_OK, set the RQF_DONTPREP flag
>   and submit the request to the SCSI LLD.
> - If scsi_mq_prep_fn() returns BLKPREP_DEFER, call
>   blk_mq_delay_run_hw_queue() and return BLK_STS_RESOURCE.
> - If the prep function returns BLKPREP_KILL or BLKPREP_INVALID, call
>   scsi_mq_uninit_cmd() and let the blk-mq core end the request.
> 
> In none of these cases scsi_mq_prep_fn() should clear the
> SCMD_INITIALIZED flag. Hence remove the code from scsi_mq_prep_fn()
> function that clears that flag.
> 
> This patch avoids that the following warning is triggered when using
> the legacy block layer:
> 
> [ cut here ]
> WARNING: CPU: 1 PID: 4198 at drivers/scsi/scsi_lib.c:654 
> scsi_end_request+0x1de/0x220
> CPU: 1 PID: 4198 Comm: mkfs.f2fs Not tainted 4.14.0-rc5+ #1
> task: 91c147a4b800 task.stack: b282c37b8000
> RIP: 0010:scsi_end_request+0x1de/0x220
> Call Trace:
> 
> scsi_io_completion+0x204/0x5e0
> scsi_finish_command+0xce/0xe0
> scsi_softirq_done+0x126/0x130
> blk_done_softirq+0x6e/0x80
> __do_softirq+0xcf/0x2a8
> irq_exit+0xab/0xb0
> do_IRQ+0x7b/0xc0
> common_interrupt+0x90/0x90
> 
> RIP: 0010:_raw_spin_unlock_irqrestore+0x9/0x10
> __test_set_page_writeback+0xc7/0x2c0
> __block_write_full_page+0x158/0x3b0
> block_write_full_page+0xc4/0xd0
> blkdev_writepage+0x13/0x20
> __writepage+0x12/0x40
> write_cache_pages+0x204/0x500
> generic_writepages+0x48/0x70
> blkdev_writepages+0x9/0x10
> do_writepages+0x34/0xc0
> __filemap_fdatawrite_range+0x6c/0x90
> file_write_and_wait_range+0x31/0x90
> blkdev_fsync+0x16/0x40
> vfs_fsync_range+0x44/0xa0
> do_fsync+0x38/0x60
> SyS_fsync+0xb/0x10
> entry_SYSCALL_64_fastpath+0x13/0x94
> ---[ end trace 86e8ef85a4a6c1d1 ]---
> 
> Fixes: commit 64104f703212 ("scsi: Call scsi_initialize_rq() for filesystem 
> requests")
> Signed-off-by: Bart Van Assche <bart.vanass...@wdc.com>
> Cc: Damien Le Moal <damien.lem...@wdc.com>
> Cc: Christoph Hellwig <h...@lst.de>
> Cc: Hannes Reinecke <h...@suse.com>
> Cc: Johannes Thumshirn <jthumsh...@suse.de>
> ---
>  drivers/scsi/scsi_lib.c | 8 +---
>  1 file changed, 1 insertion(+), 7 deletions(-)
> 
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 1779c8e91d09..5745af3e81bd 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1378,8 +1378,6 @@ static int scsi_prep_fn(struct request_queue *q, struct 
> request *req)
>  
>   ret = scsi_setup_cmnd(sdev, req);
>  out:
> - if (ret != BLKPREP_OK)
> - cmd->flags &= ~SCMD_INITIALIZED;
>   return scsi_prep_return(q, req, ret);
>  }
>  
> @@ -1899,7 +1897,6 @@ static int scsi_mq_prep_fn(struct request *req)
>   struct scsi_device *sdev = req->q->queuedata;
>   struct Scsi_Host *shost = sdev->host;
>   struct scatterlist *sg;
> - int ret;
>  
>   scsi_init_command(sdev, cmd);
>  
> @@ -1933,10 +1930,7 @@ static int scsi_mq_prep_fn(struct request *req)
>  
>   blk_mq_start_request(req);
>  
> - ret = scsi_setup_cmnd(sdev, req);
> - if (ret != BLK_STS_OK)
> - cmd->flags &= ~SCMD_INITIALIZED;
> - return ret;
> + return scsi_setup_cmnd(sdev, req);
>  }
>  
>  static void scsi_mq_done(struct scsi_cmnd *cmd)
> 
Reviewed-by: Damien Le Moal <damien.lem...@wdc.com>

-- 
Damien Le Moal
Western Digital Research


[PATCH 5/5] scsi: sd_zbc: Fix sd_zbc_read_zoned_characteristics()

2017-10-10 Thread Damien Le Moal
The three values starting at byte 8 of the Zoned Block Device
Characteristics VPD page B6h are 32 bits values, not 64bits. So use
get_unaligned_be32() to retrieve the values and not get_unaligned_be64()

Fixes: 89d947561077 ("sd: Implement support for ZBC devices")
Cc: <sta...@vger.kernel.org>

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Bart Van Assche <bart.vanass...@wdc.com>
Reviewed-by: Johannes Thumshirn <jthumsh...@suse.de>
Reviewed-by: Christoph Hellwig <h...@lst.de>
---
 drivers/scsi/sd_zbc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index bbad851c1789..27793b9f54c0 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -423,15 +423,15 @@ static int sd_zbc_read_zoned_characteristics(struct 
scsi_disk *sdkp,
if (sdkp->device->type != TYPE_ZBC) {
/* Host-aware */
sdkp->urswrz = 1;
-   sdkp->zones_optimal_open = get_unaligned_be64([8]);
-   sdkp->zones_optimal_nonseq = get_unaligned_be64([12]);
+   sdkp->zones_optimal_open = get_unaligned_be32([8]);
+   sdkp->zones_optimal_nonseq = get_unaligned_be32([12]);
sdkp->zones_max_open = 0;
} else {
/* Host-managed */
sdkp->urswrz = buf[4] & 1;
sdkp->zones_optimal_open = 0;
sdkp->zones_optimal_nonseq = 0;
-   sdkp->zones_max_open = get_unaligned_be64([16]);
+   sdkp->zones_max_open = get_unaligned_be32([16]);
}
 
return 0;
-- 
2.13.6



[PATCH 4/5] scsi: sd_zbc: Use well defined macros

2017-10-10 Thread Damien Le Moal
Instead of open coding, use the min() macro to calculate a report zones
reply buffer length in sd_zbc_check_zone_size() and the round_up()
macro for calculating the number of zones in sd_zbc_setup().

No functional change is introduced by this patch.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Johannes Thumshirn <jthumsh...@suse.de>
Reviewed-by: Bart Van Assche <bart.vanass...@wdc.com>
Reviewed-by: Christoph Hellwig <h...@lst.de>
---
 drivers/scsi/sd_zbc.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 7dbaf920679e..bbad851c1789 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -475,7 +475,7 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, 
unsigned char *buf)
return 0;
 }
 
-#define SD_ZBC_BUF_SIZE 131072
+#define SD_ZBC_BUF_SIZE 131072U
 
 /**
  * sd_zbc_check_zone_size - Check the device zone sizes
@@ -526,10 +526,7 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
/* Parse REPORT ZONES header */
list_length = get_unaligned_be32([0]) + 64;
rec = buf + 64;
-   if (list_length < SD_ZBC_BUF_SIZE)
-   buf_len = list_length;
-   else
-   buf_len = SD_ZBC_BUF_SIZE;
+   buf_len = min(list_length, SD_ZBC_BUF_SIZE);
 
/* Parse zone descriptors */
while (rec < buf + buf_len) {
@@ -599,9 +596,8 @@ static int sd_zbc_setup(struct scsi_disk *sdkp)
/* chunk_sectors indicates the zone size */
blk_queue_chunk_sectors(sdkp->disk->queue,
logical_to_sectors(sdkp->device, sdkp->zone_blocks));
-   sdkp->nr_zones = sdkp->capacity >> sdkp->zone_shift;
-   if (sdkp->capacity & (sdkp->zone_blocks - 1))
-   sdkp->nr_zones++;
+   sdkp->nr_zones =
+   round_up(sdkp->capacity, sdkp->zone_blocks) >> sdkp->zone_shift;
 
if (!sdkp->zones_wlock) {
sdkp->zones_wlock = kcalloc(BITS_TO_LONGS(sdkp->nr_zones),
-- 
2.13.6



[PATCH 3/5] scsi: sd_zbc: Rearrange code

2017-10-10 Thread Damien Le Moal
Rearrange sd_zbc_setup() to include use_16_for_rw and use_10_for_rw
assignments and move the calculation of sdkp->zone_shift together
with the assignment of the verified zone_blocks value in
sd_zbc_check_zone_size().

No functional change is introduced by this patch.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Christoph Hellwig <h...@lst.de>
Reviewed-by: Bart Van Assche <bart.vanass...@wdc.com>
---
 drivers/scsi/sd_zbc.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 023f705ae235..7dbaf920679e 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -584,6 +584,7 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
}
 
sdkp->zone_blocks = zone_blocks;
+   sdkp->zone_shift = ilog2(zone_blocks);
 
return 0;
 }
@@ -591,10 +592,13 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 static int sd_zbc_setup(struct scsi_disk *sdkp)
 {
 
+   /* READ16/WRITE16 is mandatory for ZBC disks */
+   sdkp->device->use_16_for_rw = 1;
+   sdkp->device->use_10_for_rw = 0;
+
/* chunk_sectors indicates the zone size */
blk_queue_chunk_sectors(sdkp->disk->queue,
logical_to_sectors(sdkp->device, sdkp->zone_blocks));
-   sdkp->zone_shift = ilog2(sdkp->zone_blocks);
sdkp->nr_zones = sdkp->capacity >> sdkp->zone_shift;
if (sdkp->capacity & (sdkp->zone_blocks - 1))
sdkp->nr_zones++;
@@ -657,10 +661,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned 
char *buf)
if (ret)
goto err;
 
-   /* READ16/WRITE16 is mandatory for ZBC disks */
-   sdkp->device->use_16_for_rw = 1;
-   sdkp->device->use_10_for_rw = 0;
-
return 0;
 
 err:
-- 
2.13.6



[PATCH 2/5] scsi: sd_zbc: Fix comments and indentation

2017-10-10 Thread Damien Le Moal
Fix comment style (use kernel-doc style) and content to clarify some
functions. Also fix some functions signature indentation and remove a
useless blank line in sd_zbc_read_zones().

No functional change is introduced by this patch.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Christoph Hellwig <h...@lst.de>
---
 drivers/scsi/scsi_lib.c |   5 ++-
 drivers/scsi/sd_zbc.c   | 117 +---
 2 files changed, 104 insertions(+), 18 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9cf6a80fe297..c72b97a74906 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1752,7 +1752,10 @@ static void scsi_done(struct scsi_cmnd *cmd)
  *
  * Returns: Nothing
  *
- * Lock status: IO request lock assumed to be held when called.
+ * Lock status: request queue lock assumed to be held when called.
+ *
+ * Note: See sd_zbc.c sd_zbc_write_lock_zone() for write order
+ * protection for ZBC disks.
  */
 static void scsi_request_fn(struct request_queue *q)
__releases(q->queue_lock)
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 692c8cbc7ed8..023f705ae235 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -32,10 +32,14 @@
 #include "sd.h"
 
 /**
- * Convert a zone descriptor to a zone struct.
+ * sd_zbc_parse_report - Convert a zone descriptor to a struct blk_zone,
+ * @sdkp: The disk the report originated from
+ * @buf: Address of the report zone descriptor
+ * @zone: the destination zone structure
+ *
+ * All LBA sized values are converted to 512B sectors unit.
  */
-static void sd_zbc_parse_report(struct scsi_disk *sdkp,
-   u8 *buf,
+static void sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf,
struct blk_zone *zone)
 {
struct scsi_device *sdp = sdkp->device;
@@ -58,7 +62,13 @@ static void sd_zbc_parse_report(struct scsi_disk *sdkp,
 }
 
 /**
- * Issue a REPORT ZONES scsi command.
+ * sd_zbc_report_zones - Issue a REPORT ZONES scsi command.
+ * @sdkp: The target disk
+ * @buf: Buffer to use for the reply
+ * @buflen: the buffer size
+ * @lba: Start LBA of the report
+ *
+ * For internal use during device validation.
  */
 static int sd_zbc_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
   unsigned int buflen, sector_t lba)
@@ -99,6 +109,12 @@ static int sd_zbc_report_zones(struct scsi_disk *sdkp, 
unsigned char *buf,
return 0;
 }
 
+/**
+ * sd_zbc_setup_report_cmnd - Prepare a REPORT ZONES scsi command
+ * @cmd: The command to setup
+ *
+ * Call in sd_init_command() for a REQ_OP_ZONE_REPORT request.
+ */
 int sd_zbc_setup_report_cmnd(struct scsi_cmnd *cmd)
 {
struct request *rq = cmd->request;
@@ -141,6 +157,14 @@ int sd_zbc_setup_report_cmnd(struct scsi_cmnd *cmd)
return BLKPREP_OK;
 }
 
+/**
+ * sd_zbc_report_zones_complete - Process a REPORT ZONES scsi command reply.
+ * @scmd: The completed report zones command
+ * @good_bytes: reply size in bytes
+ *
+ * Convert all reported zone descriptors to struct blk_zone. The conversion
+ * is done in-place, directly in the request specified sg buffer.
+ */
 static void sd_zbc_report_zones_complete(struct scsi_cmnd *scmd,
 unsigned int good_bytes)
 {
@@ -196,17 +220,32 @@ static void sd_zbc_report_zones_complete(struct scsi_cmnd 
*scmd,
local_irq_restore(flags);
 }
 
+/**
+ * sd_zbc_zone_sectors - Get the device zone size in number of 512B sectors.
+ * @sdkp: The target disk
+ */
 static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp)
 {
return logical_to_sectors(sdkp->device, sdkp->zone_blocks);
 }
 
+/**
+ * sd_zbc_zone_no - Get the number of the zone containing a sector.
+ * @sdkp: The target disk
+ * @sector: 512B sector address contained in the zone
+ */
 static inline unsigned int sd_zbc_zone_no(struct scsi_disk *sdkp,
  sector_t sector)
 {
return sectors_to_logical(sdkp->device, sector) >> sdkp->zone_shift;
 }
 
+/**
+ * sd_zbc_setup_reset_cmnd - Prepare a RESET WRITE POINTER scsi command.
+ * @cmd: the command to setup
+ *
+ * Called from sd_init_command() for a REQ_OP_ZONE_RESET request.
+ */
 int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
 {
struct request *rq = cmd->request;
@@ -239,6 +278,23 @@ int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
return BLKPREP_OK;
 }
 
+/**
+ * sd_zbc_write_lock_zone - Write lock a sequential zone.
+ * @cmd: write command
+ *
+ * Called from sd_init_cmd() for write requests (standard write, write same or
+ * write zeroes operations). If the request target zone is not already locked,
+ * the zone is locked and BLKPREP_OK returned, allowing the request to proceed
+ * through dispatch in scsi_request_fn(). Otherwise, BLKPREP_DEFER is returned,
+ * for

[PATCH 0/5] ZBC support cleanup and fixes

2017-10-10 Thread Damien Le Moal
Martin,

First part of the "scsi-mq support for ZBC disks" series that is only scsi
specific. These patches are just cleanups with only one bug fix (last patch)
and do not functionally change anything.

The remaining rework of the zone locking including scsi-mq support is still
ongoing and will be sent later.

Please consider these for addition to 4.15.

Damien Le Moal (5):
  scsi: sd_zbc: Move ZBC declarations to scsi_proto.h
  scsi: sd_zbc: Fix comments and indentation
  scsi: sd_zbc: Rearrange code
  scsi: sd_zbc: Use well defined macros
  scsi: sd_zbc: Fix sd_zbc_read_zoned_characteristics()

 drivers/scsi/scsi_lib.c   |   5 +-
 drivers/scsi/sd_zbc.c | 169 ++
 include/scsi/scsi_proto.h |  45 +---
 3 files changed, 150 insertions(+), 69 deletions(-)

-- 
2.13.6



[PATCH 1/5] scsi: sd_zbc: Move ZBC declarations to scsi_proto.h

2017-10-10 Thread Damien Le Moal
Move standard macro definitions for the zone types and zone conditions
to scsi_proto.h together with the definitions related to the
REPORT ZONES command. While at it, define all values in the enums to
be clear.

Also remove unnecessary includes in sd_zbc.c.

No functional change is introduced by this patch.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
Reviewed-by: Bart Van Assche <bart.vanass...@wdc.com>
Reviewed-by: Johannes Thumshirn <jthumsh...@suse.de>
Reviewed-by: Christoph Hellwig <h...@lst.de>
---
 drivers/scsi/sd_zbc.c | 24 
 include/scsi/scsi_proto.h | 45 ++---
 2 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 8aa54779aac1..692c8cbc7ed8 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -28,32 +28,8 @@
 
 #include 
 #include 
-#include 
-#include 
-#include 
-#include 
-#include 
 
 #include "sd.h"
-#include "scsi_priv.h"
-
-enum zbc_zone_type {
-   ZBC_ZONE_TYPE_CONV = 0x1,
-   ZBC_ZONE_TYPE_SEQWRITE_REQ,
-   ZBC_ZONE_TYPE_SEQWRITE_PREF,
-   ZBC_ZONE_TYPE_RESERVED,
-};
-
-enum zbc_zone_cond {
-   ZBC_ZONE_COND_NO_WP,
-   ZBC_ZONE_COND_EMPTY,
-   ZBC_ZONE_COND_IMP_OPEN,
-   ZBC_ZONE_COND_EXP_OPEN,
-   ZBC_ZONE_COND_CLOSED,
-   ZBC_ZONE_COND_READONLY = 0xd,
-   ZBC_ZONE_COND_FULL,
-   ZBC_ZONE_COND_OFFLINE,
-};
 
 /**
  * Convert a zone descriptor to a zone struct.
diff --git a/include/scsi/scsi_proto.h b/include/scsi/scsi_proto.h
index 8c285d9a06d8..39130a9c05bf 100644
--- a/include/scsi/scsi_proto.h
+++ b/include/scsi/scsi_proto.h
@@ -301,19 +301,42 @@ struct scsi_lun {
 
 /* Reporting options for REPORT ZONES */
 enum zbc_zone_reporting_options {
-   ZBC_ZONE_REPORTING_OPTION_ALL = 0,
-   ZBC_ZONE_REPORTING_OPTION_EMPTY,
-   ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN,
-   ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN,
-   ZBC_ZONE_REPORTING_OPTION_CLOSED,
-   ZBC_ZONE_REPORTING_OPTION_FULL,
-   ZBC_ZONE_REPORTING_OPTION_READONLY,
-   ZBC_ZONE_REPORTING_OPTION_OFFLINE,
-   ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP = 0x10,
-   ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE,
-   ZBC_ZONE_REPORTING_OPTION_NON_WP = 0x3f,
+   ZBC_ZONE_REPORTING_OPTION_ALL   = 0x00,
+   ZBC_ZONE_REPORTING_OPTION_EMPTY = 0x01,
+   ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN = 0x02,
+   ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN = 0x03,
+   ZBC_ZONE_REPORTING_OPTION_CLOSED= 0x04,
+   ZBC_ZONE_REPORTING_OPTION_FULL  = 0x05,
+   ZBC_ZONE_REPORTING_OPTION_READONLY  = 0x06,
+   ZBC_ZONE_REPORTING_OPTION_OFFLINE   = 0x07,
+   /* 0x08 to 0x0f are reserved */
+   ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP = 0x10,
+   ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE  = 0x11,
+   /* 0x12 to 0x3e are reserved */
+   ZBC_ZONE_REPORTING_OPTION_NON_WP= 0x3f,
 };
 
 #define ZBC_REPORT_ZONE_PARTIAL 0x80
 
+/* Zone types of REPORT ZONES zone descriptors */
+enum zbc_zone_type {
+   ZBC_ZONE_TYPE_CONV  = 0x1,
+   ZBC_ZONE_TYPE_SEQWRITE_REQ  = 0x2,
+   ZBC_ZONE_TYPE_SEQWRITE_PREF = 0x3,
+   /* 0x4 to 0xf are reserved */
+};
+
+/* Zone conditions of REPORT ZONES zone descriptors */
+enum zbc_zone_cond {
+   ZBC_ZONE_COND_NO_WP = 0x0,
+   ZBC_ZONE_COND_EMPTY = 0x1,
+   ZBC_ZONE_COND_IMP_OPEN  = 0x2,
+   ZBC_ZONE_COND_EXP_OPEN  = 0x3,
+   ZBC_ZONE_COND_CLOSED= 0x4,
+   /* 0x5 to 0xc are reserved */
+   ZBC_ZONE_COND_READONLY  = 0xd,
+   ZBC_ZONE_COND_FULL  = 0xe,
+   ZBC_ZONE_COND_OFFLINE   = 0xf,
+};
+
 #endif /* _SCSI_PROTO_H_ */
-- 
2.13.6



Re: [PATCH V6 13/14] block: mq-deadline: Limit write request dispatch for zoned block devices

2017-10-03 Thread Damien Le Moal
Bart,

On 10/4/17 05:56, Bart Van Assche wrote:
> On Tue, 2017-10-03 at 09:19 +0900, Damien Le Moal wrote:
>> On 10/3/17 08:44, Bart Van Assche wrote:
>>> On Mon, 2017-10-02 at 16:15 +0900, Damien Le Moal wrote:
>>>>  static void deadline_wunlock_zone(struct deadline_data *dd,
>>>>  struct request *rq)
>>>>  {
>>>> +  unsigned long flags;
>>>> +
>>>> +  spin_lock_irqsave(>zone_lock, flags);
>>>> +
>>>>WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), dd->zones_wlock));
>>>>deadline_clear_request_zone_wlock(rq);
>>>> +
>>>> +  spin_unlock_irqrestore(>zone_lock, flags);
>>>>  }
>>>
>>> Is a request removed from the sort and fifo lists before being dispatched? 
>>> If so,
>>> does that mean that obtaining zone_lock in the above function is not 
>>> necessary?
>>
>> Yes, a request is removed from the sort tree and fifo list before
>> dispatching. But the dd->zone_lock spinlock is not there to protect
>> that, dd->lock protects the sort tree and fifo list. dd->zone_lock was
>> added to prevent the completed_request() method from changing a zone
>> lock state while deadline_fifo_requests() or deadline_next_request() are
>> running. Ex:
>>
>> Imagine this case: write request A for a zone Z is being executed (it
>> was dispatched) so Z is locked. Dispatch runs and inspects the next
>> requests in sort order. Let say we have the sequential writes B, C, D, E
>> queued for the same zone Z. First B is inspected and cannot be
>> dispatched (zone locked). Inspection moves on to C, but before that,
>> A completes and Z is unlocked. Then C will be OK to go as the zone is
>> now unlocked. But it is the wrong choice as it will result in out of
>> order write. B must be the next request dispatched after A.
>>
>> dd->zone_lock prevents this from happening. Without this spinlock, the
>> bad example case above happens very easily.
> 
> Hello Damien,
> 
> Thanks for the detailed and really clear reply. I hope you do not mind that I
> have a few further questions about this patch?
> - Does the zone_lock spinlock have to be acquired by both 
> deadline_wunlock_zone()
>   callers or only by the call from the request completion path?

Not really. Only the completion path is strongly needed as the insert
path unlock is under dd->lock, which prevents concurrent execution of
the sort or fifo request search. So the zone_lock lock/unlock could be
moved out of deadline_wunlock_zone() and done directly in
dd_completed_request().

> - Why do both the mq-deadline and the sd driver each have their own instance 
> of
>   the zones_wlock bitmap? Has it been considered to convert both bitmaps into 
> a
>   single bitmap that is shared by both kernel components and that exists e.g. 
> at
>   the request queue level?

The sd driver level zone locking handles only the legacy path. Hence the
zone lock bitmap attached to the scsi disk struct. For scsi-mq/blk-mq,
mq-deadline does the zone locking. Both bitmaps do not exist at the same
time.

Indeed we could move the zone lock bitmap in the request queue so that
it is common between legacy and mq cases. Christoph has a series doing
that, and going further by doing the zone locking directly within the
block layer dispatch code for both legacy and mq path. So the scsi level
locking and mq-deadline locking become unnecessary. Working on that new
series right now.

Best regards.

-- 
Damien Le Moal,
Western Digital


Re: [PATCH V6 13/14] block: mq-deadline: Limit write request dispatch for zoned block devices

2017-10-02 Thread Damien Le Moal
Bart,

On 10/3/17 08:44, Bart Van Assche wrote:
> On Mon, 2017-10-02 at 16:15 +0900, Damien Le Moal wrote:
>> When dispatching write requests to a zoned block device, only allow
>> requests targeting an unlocked zone. Requests targeting a locked zone
>> are left in the scheduler queue to preserve the initial write order.
>> If no write request can be dispatched, allow reads to be dispatched
>> even if the write batch is not done.
>>
>> To ensure that the search for an appropriate write request is atomic
>> in deadline_fifo_request() and deadline_next_request() with reagrd to
>   ^^
>   regard?

Will fix.

>> write requests zone lock state, introduce the spinlock zone_lock.
>> Holding this lock while doing the search in these functions as well as
>> when unlocking the target zone of a completed write request in
>> dd_completed_request() ensure that the search does not pickup a write
>> request in the middle of a zone queued write sequence.
> 
> Since there is already a spinlock in the mq-deadline scheduler that serializes
> most operations, do we really need a new spinlock?

The dd->lock spinlock is used without IRQ disabling. So it does not
protect against request completion method calls. That spinlock being a
"big lock", I did not want to use it with IRQ disabled.

>>  /*
>>   * Write unlock the target zone of a write request.
>> + * Clearing the target zone write lock bit is done with the scheduler 
>> zone_lock
>> + * spinlock held so that deadline_next_request() and deadline_fifo_request()
>> + * cannot see the lock state of a zone change due to a request completion 
>> during
>> + * their eventual search for an appropriate write request. Otherwise, for a 
>> zone
>> + * with multiple write requests queued, a non sequential write request
>> + * can be chosen.
>>   */
>>  static void deadline_wunlock_zone(struct deadline_data *dd,
>>struct request *rq)
>>  {
>> +unsigned long flags;
>> +
>> +spin_lock_irqsave(>zone_lock, flags);
>> +
>>  WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), dd->zones_wlock));
>>  deadline_clear_request_zone_wlock(rq);
>> +
>> +spin_unlock_irqrestore(>zone_lock, flags);
>>  }
> 
> Is a request removed from the sort and fifo lists before being dispatched? If 
> so,
> does that mean that obtaining zone_lock in the above function is not 
> necessary?

Yes, a request is removed from the sort tree and fifo list before
dispatching. But the dd->zone_lock spinlock is not there to protect
that, dd->lock protects the sort tree and fifo list. dd->zone_lock was
added to prevent the completed_request() method from changing a zone
lock state while deadline_fifo_requests() or deadline_next_request() are
running. Ex:

Imagine this case: write request A for a zone Z is being executed (it
was dispatched) so Z is locked. Dispatch runs and inspects the next
requests in sort order. Let say we have the sequential writes B, C, D, E
queued for the same zone Z. First B is inspected and cannot be
dispatched (zone locked). Inspection moves on to C, but before that,
A completes and Z is unlocked. Then C will be OK to go as the zone is
now unlocked. But it is the wrong choice as it will result in out of
order write. B must be the next request dispatched after A.

dd->zone_lock prevents this from happening. Without this spinlock, the
bad example case above happens very easily.

>>  static struct request *
>>  deadline_fifo_request(struct deadline_data *dd, int data_dir)
>>  {
>> +struct request *rq;
>> +unsigned long flags;
>> +
>>  if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
>>  return NULL;
>>  
>>  if (list_empty(>fifo_list[data_dir]))
>>  return NULL;
>>  
>> -return rq_entry_fifo(dd->fifo_list[data_dir].next);
>> +if (!dd->zones_wlock || data_dir == READ)
>> +return rq_entry_fifo(dd->fifo_list[data_dir].next);
>> +
>> +spin_lock_irqsave(>zone_lock, flags);
>> +
>> +list_for_each_entry(rq, >fifo_list[WRITE], queuelist) {
>> +if (deadline_can_dispatch_request(dd, rq))
>> +goto out;
>> +}
>> +rq = NULL;
>> +
>> +out:
>> +spin_unlock_irqrestore(>zone_lock, flags);
>> +
>> +return rq;
>>  }
> 
> Same question here: is it really necessary to obtain zone_lock?

See above. Same comment.

Best regards.

-- 
Damien Le Moal,
Western Digital


[PATCH V6 12/14] block: mq-deadline: Introduce zone locking support

2017-10-02 Thread Damien Le Moal
For a write request to a zoned block device, lock the request target
zone upon request displatch. The zone is unlocked either when the
request completes or when the request is requeued (inserted).

To indicate that a request has locked its target zone, use the first
pointer of the request elevator private data to store the value
RQ_ZONE_WLOCKED. Testing for this value allows quick decision in
dd_insert_request() and dd_completed_request() regarding the need for
unlocking the target zone of a request.

Signed-off-by: Damien Le Moal <damien.lem...@wdc.com>
---
 block/mq-deadline.c | 111 
 1 file changed, 111 insertions(+)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 6b7b84ee8f82..93a1aede5dd0 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -177,6 +177,91 @@ deadline_move_request(struct deadline_data *dd, struct 
request *rq)
 }
 
 /*
+ * Return true if a request is a write requests that needs zone
+ * write locking.
+ */
+static inline bool deadline_request_needs_zone_wlock(struct deadline_data *dd,
+struct request *rq)
+{
+
+   if (!dd->zones_wlock)
+   return false;
+
+   if (blk_rq_is_passthrough(rq))
+   return false;
+
+   /*
+* REQ_OP_SCSI_* and REQ_OP_DRV_* are already handled with
+* the previous check. Add them again here so that all request
+* operations defined by enum req_opf are handled (so that a compiler
+* warning shows up if/when request operation definitions change).
+*/
+   switch (req_op(rq)) {
+   case REQ_OP_WRITE_ZEROES:
+   case REQ_OP_WRITE_SAME:
+   case REQ_OP_WRITE:
+   return blk_rq_zone_is_seq(rq);
+   default:
+   return false;
+   }
+}
+
+/*
+ * Abuse the elv.priv[0] pointer to indicate if a request has write
+ * locked its target zone. Only write request to a zoned block device
+ * can own a zone write lock.
+ */
+enum rq_zone_lock {
+   RQ_ZONE_NO_WLOCK = 0UL,
+   RQ_ZONE_WLOCKED  = 1UL,
+};
+
+static inline void deadline_set_request_zone_wlock(struct request *rq)
+{
+   rq->elv.priv[0] = (void *)RQ_ZONE_WLOCKED;
+}
+
+static inline void deadline_clear_request_zone_wlock(struct request *rq)
+{
+   rq->elv.priv[0] = (void *)RQ_ZONE_NO_WLOCK;
+}
+
+static inline bool deadline_request_has_zone_wlock(struct request *rq)
+{
+   return rq->elv.priv[0] == (void *)RQ_ZONE_WLOCKED;
+}
+
+/*
+ * Write lock the target zone of a write request.
+ */
+static void deadline_wlock_zone(struct deadline_data *dd,
+   struct request *rq)
+{
+   WARN_ON_ONCE(deadline_request_has_zone_wlock(rq));
+   WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), dd->zones_wlock));
+   deadline_set_request_zone_wlock(rq);
+}
+
+/*
+ * Write unlock the target zone of a write request.
+ */
+static void deadline_wunlock_zone(struct deadline_data *dd,
+ struct request *rq)
+{
+   WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), dd->zones_wlock));
+   deadline_clear_request_zone_wlock(rq);
+}
+
+/*
+ * Test the write lock state of the target zone of a write request.
+ */
+static inline bool deadline_zone_is_wlocked(struct deadline_data *dd,
+   struct request *rq)
+{
+   return test_bit(blk_rq_zone_no(rq), dd->zones_wlock);
+}
+
+/*
  * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
  * 1 otherwise. Requires !list_empty(>fifo_list[data_dir])
  */
@@ -315,6 +400,11 @@ static struct request *__dd_dispatch_request(struct 
blk_mq_hw_ctx *hctx)
dd->batching++;
deadline_move_request(dd, rq);
 done:
+   /*
+* If the request needs its target zone locked, do it.
+*/
+   if (deadline_request_needs_zone_wlock(dd, rq))
+   deadline_wlock_zone(dd, rq);
rq->rq_flags |= RQF_STARTED;
return rq;
 }
@@ -464,6 +554,13 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, 
struct request *rq,
struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq);
 
+   /*
+* This may be a requeue of a write request that has locked its
+* target zone. If this is the case, release the zone lock.
+*/
+   if (deadline_request_has_zone_wlock(rq))
+   deadline_wunlock_zone(dd, rq);
+
if (blk_mq_sched_try_insert_merge(q, rq))
return;
 
@@ -508,6 +605,19 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
spin_unlock(>lock);
 }
 
+/*
+ * For zoned block devices, write unlock the target zone of
+ * completed write requests.
+ */
+static void dd_completed_request(struct request *rq)
+{
+   if (deadline_request_has_zone_wlock(rq)) {
+   struct deadl

  1   2   3   4   >