Re: [PATCH 2/3] block: move dif_prepare/dif_complete functions to block layer

2018-07-24 Thread Max Gurtovoy




On 7/24/2018 11:33 PM, Keith Busch wrote:

On Tue, Jul 24, 2018 at 04:33:41PM +0300, Max Gurtovoy wrote:

+void t10_pi_prepare(struct request *rq, u8 protection_type)
+{
+   const int tuple_sz = rq->q->integrity.tuple_size;
+   u32 ref_tag = t10_pi_ref_tag(rq);
+   struct bio *bio;
+
+   if (protection_type == T10_PI_TYPE3_PROTECTION)
+   return;
+
+   __rq_for_each_bio(bio, rq) {
+   struct bio_integrity_payload *bip = bio_integrity(bio);
+   u32 virt = bip_get_seed(bip) & 0xffffffff;
+   struct bio_vec iv;
+   struct bvec_iter iter;
+
+   /* Already remapped? */
+   if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
+   break;
+
+   bip_for_each_vec(iv, bip, iter) {
+   struct t10_pi_tuple *pi = kmap_atomic(iv.bv_page) +
+   iv.bv_offset;
+   unsigned int j;
+
+   for (j = 0; j < iv.bv_len; j += tuple_sz) {
+   if (be32_to_cpu(pi->ref_tag) == virt)
+   pi->ref_tag = cpu_to_be32(ref_tag);
+   virt++;
+   ref_tag++;
+   pi += tuple_sz;
+   }
+
+   kunmap_atomic(pi);
+   }


Since you're incrementing 'pi', you end up unmapping an address that
you didn't map. It does appear harmless in the current kunmap_atomic()
implementation, though.


ohh, good catch. This code is taken from the scsi remap, but I guess
it's time to fix some stuff while we're doing this patchset.

I'll use another variable pi_map that will be unmapped.

in the nvme code:

pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;

...
but

kunmap_atomic(pmap);

should we unmap the same bv_page we mapped (without the offset)?



You are also incrementing 'pi' by too many bytes since it is of type
struct t10_pi_tuple. The nvme driver used void* to make the pointer
arithmetic easier.



I'll use void*.
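
For reference, a minimal sketch of what the fixed loop could look like
(my illustration, not the final patch, reusing the patch's locals), with
the struct t10_pi_tuple layout from include/linux/t10-pi.h, a void * for
the byte-wise pointer arithmetic, and a separately saved pointer so that
kunmap_atomic() gets exactly what kmap_atomic() returned:

	struct t10_pi_tuple {		/* from include/linux/t10-pi.h */
		__be16 guard_tag;	/* checksum of the data interval */
		__be16 app_tag;		/* opaque to the block layer */
		__be32 ref_tag;		/* low 32 bits of the target sector */
	};

	bip_for_each_vec(iv, bip, iter) {
		void *map = kmap_atomic(iv.bv_page);	/* save mapped address */
		void *p = map + iv.bv_offset;
		unsigned int j;

		for (j = 0; j < iv.bv_len; j += tuple_sz) {
			struct t10_pi_tuple *pi = p;

			if (be32_to_cpu(pi->ref_tag) == virt)
				pi->ref_tag = cpu_to_be32(ref_tag);
			virt++;
			ref_tag++;
			p += tuple_sz;	/* void *: advances by bytes */
		}

		kunmap_atomic(map);	/* unmap what was actually mapped */
	}

(kunmap_atomic() only cares about which page the address falls in, so
passing the offset address happens to work, but passing back the
kmap_atomic() return value is the unambiguous choice.)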
Thanks



[PATCH for v4.18] blk-mq: Rename BLK_EH_DONE into BLK_EH_DONT_RESET_TIMER

2018-07-24 Thread Bart Van Assche
If a block driver timeout handler returns BLK_EH_DONE that means
either that the request has been completed or that the block driver
still owns the request. Since the name BLK_EH_DONE is misleading,
change it into BLK_EH_DONT_RESET_TIMER.

Fixes: 88b0cfad2888 ("block: document the blk_eh_timer_return values")
Fixes: 6600593cbd93 ("block: rename BLK_EH_NOT_HANDLED to BLK_EH_DONE")
Signed-off-by: Bart Van Assche 
Cc: Christoph Hellwig 
Cc: Ming Lei 
Cc: Hannes Reinecke 
Cc: Johannes Thumshirn 
---
 Documentation/scsi/scsi_eh.txt|  4 ++--
 block/blk-mq.c|  2 +-
 block/blk-timeout.c   |  2 +-
 drivers/block/mtip32xx/mtip32xx.c |  2 +-
 drivers/block/nbd.c   |  4 ++--
 drivers/block/null_blk_main.c |  4 ++--
 drivers/message/fusion/mptsas.c   |  2 +-
 drivers/mmc/core/queue.c  |  2 +-
 drivers/nvme/host/pci.c   | 10 +-
 drivers/nvme/host/rdma.c  |  2 +-
 drivers/nvme/target/loop.c|  2 +-
 drivers/s390/block/dasd.c |  6 +++---
 drivers/scsi/gdth.c   |  2 +-
 drivers/scsi/libiscsi.c   |  6 +++---
 drivers/scsi/megaraid/megaraid_sas_base.c |  2 +-
 drivers/scsi/mvumi.c  |  2 +-
 drivers/scsi/qla4xxx/ql4_os.c |  2 +-
 drivers/scsi/scsi_error.c |  4 ++--
 drivers/scsi/scsi_transport_fc.c  |  4 ++--
 drivers/scsi/scsi_transport_srp.c |  4 ++--
 drivers/scsi/ufs/ufshcd.c |  6 +++---
 include/linux/blkdev.h|  2 +-
 22 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/Documentation/scsi/scsi_eh.txt b/Documentation/scsi/scsi_eh.txt
index 1b7436932a2b..59e085b28b31 100644
--- a/Documentation/scsi/scsi_eh.txt
+++ b/Documentation/scsi/scsi_eh.txt
@@ -86,9 +86,9 @@ function
This indicates that more time is required to finish the
command.  Timer is restarted.  This action is counted as a
retry and only allowed scmd->allowed + 1(!) times.  Once the
-   limit is reached, action for BLK_EH_DONE is taken instead.
+   limit is reached, action for BLK_EH_DONT_RESET_TIMER is taken instead.
 
-- BLK_EH_DONE
+- BLK_EH_DONT_RESET_TIMER
 eh_timed_out() callback did not handle the command.
Step #2 is taken.
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c92ce06fd565..96e1a7f25875 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -774,7 +774,7 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved)
enum blk_eh_timer_return ret;
 
ret = req->q->mq_ops->timeout(req, reserved);
-   if (ret == BLK_EH_DONE)
+   if (ret == BLK_EH_DONT_RESET_TIMER)
return;
WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
}
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index f2cfd56e1606..37df7f8f8516 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -90,7 +90,7 @@ static void blk_rq_timed_out(struct request *req)
blk_add_timer(req);
blk_clear_rq_complete(req);
break;
-   case BLK_EH_DONE:
+   case BLK_EH_DONT_RESET_TIMER:
/*
 * LLD handles this for now but in the future
 * we can send a request msg to abort the command
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index c73626decb46..e28be7821d0c 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3712,7 +3712,7 @@ static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req,
 
cmd->status = BLK_STS_TIMEOUT;
blk_mq_complete_request(req);
-   return BLK_EH_DONE;
+   return BLK_EH_DONT_RESET_TIMER;
}
 
if (test_bit(req->tag, dd->port->cmds_to_issue))
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 3fb95c8d9fd8..67e17965e462 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -382,7 +382,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
	mutex_unlock(&cmd->lock);
nbd_requeue_cmd(cmd);
nbd_config_put(nbd);
-   return BLK_EH_DONE;
+   return BLK_EH_DONT_RESET_TIMER;
}
} else {
dev_err_ratelimited(nbd_to_dev(nbd),
@@ -395,7 +395,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
nbd_config_put(nbd);
 done:
blk_mq_complete_request(req);
-   return BLK_EH_DONE;
+   return BLK_EH_DONT_RESET_TIMER;
 }
 
 /*
diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 86cafa6d3b41..3617c9ac251a 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ 

Re: two small bio cleanups

2018-07-24 Thread Jens Axboe
On 7/24/18 5:04 AM, Christoph Hellwig wrote:
> To remove invariants that are odd and in the way of multipage biovecs.

Applied for 4.19, thanks.

-- 
Jens Axboe



Re: [PATCH 2/3] block: move dif_prepare/dif_complete functions to block layer

2018-07-24 Thread Keith Busch
On Tue, Jul 24, 2018 at 04:33:41PM +0300, Max Gurtovoy wrote:
> +void t10_pi_prepare(struct request *rq, u8 protection_type)
> +{
> + const int tuple_sz = rq->q->integrity.tuple_size;
> + u32 ref_tag = t10_pi_ref_tag(rq);
> + struct bio *bio;
> +
> + if (protection_type == T10_PI_TYPE3_PROTECTION)
> + return;
> +
> + __rq_for_each_bio(bio, rq) {
> + struct bio_integrity_payload *bip = bio_integrity(bio);
> + u32 virt = bip_get_seed(bip) & 0xffffffff;
> + struct bio_vec iv;
> + struct bvec_iter iter;
> +
> + /* Already remapped? */
> + if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
> + break;
> +
> + bip_for_each_vec(iv, bip, iter) {
> + struct t10_pi_tuple *pi = kmap_atomic(iv.bv_page) +
> + iv.bv_offset;
> + unsigned int j;
> +
> + for (j = 0; j < iv.bv_len; j += tuple_sz) {
> + if (be32_to_cpu(pi->ref_tag) == virt)
> + pi->ref_tag = cpu_to_be32(ref_tag);
> + virt++;
> + ref_tag++;
> + pi += tuple_sz;
> + }
> +
> + kunmap_atomic(pi);
> + }

Since you're incrementing 'pi', you end up unmapping an address that
you didn't map. It does appear harmless in the current kunmap_atomic()
implementation, though.

You are also incrementing 'pi' by too many bytes since it is of type
struct t10_pi_tuple. The nvme driver used void* to make the pointer
arithmetic easier.
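
(To make the scaling problem concrete, assuming tuple_sz ==
sizeof(struct t10_pi_tuple) == 8, as for T10 PI today:

	struct t10_pi_tuple *pi = ...;
	pi += tuple_sz;		/* advances 8 * sizeof(*pi) = 64 bytes */

	void *p = ...;
	p += tuple_sz;		/* advances exactly 8 bytes */

which is why the nvme code kept a void * for this walk.)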


Re: [PATCH 2/3] block: move dif_prepare/dif_complete functions to block layer

2018-07-24 Thread Christoph Hellwig
> /**
>  * t10_pi_prepare() - prepare PI prior to submitting request to device

no need for the braces here.

>
>  * @rq:   request with PI that should be prepared
>  * @protection_type:  PI type (Type 1/Type 2/Type 3)
>  *
>  * Description:

No need for the description tag either (although I don't think it
is actively harmful)


Re: [PATCH v2] bcache: set max writeback rate when I/O request is idle

2018-07-24 Thread Coly Li
On 2018/7/24 4:33 PM, Stefan Priebe - Profihost AG wrote:
> On 24.07.2018 at 10:28, Coly Li wrote:
>> On 2018/7/24 3:16 PM, Stefan Priebe - Profihost AG wrote:
>>> On 24.07.2018 at 06:03, Coly Li wrote:
 Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
 allows the writeback rate to be faster if there is no I/O request on a
 bcache device. It works well if there is only one bcache device attached
 to the cache set. If there are many bcache devices attached to a cache
 set, it may introduce performance regression because multiple faster
 writeback threads of the idle bcache devices will compete for the btree level
 locks with the bcache device which has I/O requests coming.

 This patch fixes the above issue by only permitting fast writeback when
 all bcache devices attached on the cache set are idle. And if one of the
 bcache devices has a new I/O request coming, minimize all writeback
 throughput immediately and let the PI controller __update_writeback_rate()
 decide the upcoming writeback rate for each bcache device.

 Also when all bcache devices are idle, limiting the writeback rate to a small
 number is a waste of throughput, especially when backing devices are slower
 non-rotation devices (e.g. SATA SSD). This patch sets a max writeback
 rate for each backing device if the whole cache set is idle. A faster
 writeback rate in idle time means new I/Os may have more available space
 for dirty data, and people may observe a better write performance then.

 Please note bcache may change its cache mode in run time, and this patch
 still works if the cache mode is switched from writeback mode and there
 is still dirty data on cache.

 Fixes: Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
 Cc: sta...@vger.kernel.org #4.16+
 Signed-off-by: Coly Li 
 Tested-by: Kai Krakow 
 Cc: Michael Lyle 
 Cc: Stefan Priebe 
>>>
>>> Hi Coly,
>>>
>>> i'm still experiencing the same bug as yesterday: while rebooting, every
>>> two times i get a deadlock in cache_set_free.
>>>
>>> Sadly it's so late in the shutdown process that netconsole is already
>>> unloaded, or at least i get no messages while it happens. The traces
>>> look the same as yesterday.
>>
>> Hi Stefan,
>>
>> Do you use the v2 patch on latest SLE15 kernel source?
> 
> Yes - i use the kernel code from here:
> https://github.com/openSUSE/kernel-source/commits/SLE15
> 
>> Let me try to
>> reproduce it on my machine. I assume when you reboot, the cache is still
>> dirty and not cleaned up, am I right?
> 
> Yes with around 15GB of dirty data - ceph is running on top of it and
> generated many random writes.
> 
>> And which file system do you mount
>> on top of the bcache device ?
> XFS

Hi Stefan,

Thanks for the information. I managed to reproduce a similar deadlock on
my small box. Let me see how to fix it and post a new version ASAP :-)

Coly Li
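
For readers following the thread: the "PI controller" mentioned in the
commit message is a classic proportional-integral feedback loop. A minimal
standalone sketch of the idea behind __update_writeback_rate() (names and
constants here are illustrative, not bcache's actual fields):

	#include <stdint.h>

	struct wb_state {
		int64_t dirty;     /* sectors of dirty data on the cache */
		int64_t target;    /* desired steady-state dirty amount */
		int64_t integral;  /* accumulated error for the I term */
		int64_t rate;      /* writeback rate, sectors per second */
	};

	#define P_INV 40	/* damping for the P term (made-up value) */
	#define I_INV 10000	/* damping for the I term (made-up value) */

	static void update_writeback_rate(struct wb_state *w)
	{
		int64_t error = w->dirty - w->target;

		/* P chases the current error; I slowly removes the
		 * steady-state offset that P alone would leave. */
		w->integral += error;
		w->rate = error / P_INV + w->integral / I_INV;
		if (w->rate < 1)
			w->rate = 1;	/* writeback never fully stops */
	}

The patch under discussion bypasses this controller while the whole cache
set is idle and falls back to it as soon as any device sees new I/O.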


Re: [PATCH] block: Rename the null_blk_mod kernel module back into null_blk

2018-07-24 Thread Jens Axboe
On 7/23/18 3:18 PM, Bart Van Assche wrote:
> Commit ca4b2a011948 ("null_blk: add zone support") breaks several
> blktests scripts because it renamed the null_blk kernel module into
> null_blk_mod. Hence rename null_blk_mod back into null_blk.

Thanks Bart, applied.

-- 
Jens Axboe



Re: [PATCH 2/3] block: move dif_prepare/dif_complete functions to block layer

2018-07-24 Thread Max Gurtovoy




On 7/24/2018 4:55 PM, Christoph Hellwig wrote:

+/*
+ * The virtual start sector is the one that was originally submitted
+ * by the block layer. Due to partitioning, MD/DM cloning, etc. the
+ * actual physical start sector is likely to be different.  Remap
+ * protection information to match the physical LBA.
+ *
+ * From a SCSI protocol perspective there's a slight difference between
+ * Type 1 and 2.  The latter uses 32-byte commands exclusively, and the
+ * reference tag is seeded in the command.  This gives us the potential to
+ * avoid virt->phys remapping during write.  However, at read time we
+ * don't know whether the virt sector is the same as when we wrote it
+ * (we could be reading from real disk as opposed to MD/DM device).  So
+ * we always remap Type 2 making it identical to Type 1.
+ *
+ * Type 3 does not have a reference tag so no remapping is required.
+ */


Please convert the comments for  t10_pi_prepare/t10_pi_complete to
kerneldoc format, and remove the mention of 32-byte CDBs, which
are SCSI specific.



something like this:

/**
 * t10_pi_prepare() - prepare PI prior to submitting request to device
 * @rq:   request with PI that should be prepared
 * @protection_type:  PI type (Type 1/Type 2/Type 3)
 *
 * Description:
 *For Type 1/Type 2, the virtual start sector is the one that was
 *originally submitted by the block layer for the ref_tag usage. Due to
 *partitioning, MD/DM cloning, etc. the actual physical start sector is
 *likely to be different. Remap protection information to match the
 *physical LBA.
 *
 *Type 3 does not have a reference tag so no remapping is required.
 */


Otherwise this looks good to me.

Note that after this patch only sd_dif_config_host is left in sd_dif.c,
at which point it might be worth merging into sd.c as a separate patch.



Re: [PATCH 3/3] nvme: use blk API to remap ref tags for IOs with metadata

2018-07-24 Thread Christoph Hellwig
Looks good,

Reviewed-by: Christoph Hellwig 


Re: [PATCH 2/3] block: move dif_prepare/dif_complete functions to block layer

2018-07-24 Thread Christoph Hellwig
> +/*
> + * The virtual start sector is the one that was originally submitted
> + * by the block layer.   Due to partitioning, MD/DM cloning, etc. the
> + * actual physical start sector is likely to be different.  Remap
> + * protection information to match the physical LBA.
> + *
> + * From a SCSI protocol perspective there's a slight difference between
> + * Type 1 and 2.  The latter uses 32-byte commands exclusively, and the
> + * reference tag is seeded in the command.  This gives us the potential to
> + * avoid virt->phys remapping during write.  However, at read time we
> + * don't know whether the virt sector is the same as when we wrote it
> + * (we could be reading from real disk as opposed to MD/DM device).  So
> + * we always remap Type 2 making it identical to Type 1.
> + *
> + * Type 3 does not have a reference tag so no remapping is required.
> + */

Please convert the comments for  t10_pi_prepare/t10_pi_complete to
kerneldoc format, and remove the mention of 32-byte CDBs, which
are SCSI specific.

Otherwise this looks good to me.

Note that after this patch only sd_dif_config_host is left in sd_dif.c,
at which point it might be worth merging into sd.c as a separate patch.


Re: [PATCH 1/3] block: move ref_tag calculation func to the block layer

2018-07-24 Thread Christoph Hellwig
>   case NVME_NS_DPS_PI_TYPE2:
>   control |= NVME_RW_PRINFO_PRCHK_GUARD |
>   NVME_RW_PRINFO_PRCHK_REF;
> - cmnd->rw.reftag = cpu_to_le32(
> - nvme_block_nr(ns, blk_rq_pos(req)));
> + cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));

Note that this changes behavior for non-512 byte block sizes.  But
I think these changes do the right thing.
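
(A worked example to make the new formula concrete, assuming a namespace
whose protection interval equals a 4096-byte block size, i.e.
integrity.interval_exp == 12, while blk_rq_pos() counts 512-byte sectors:

	u64 pos = 2048;					/* sector 2048 = 1 MiB offset */
	u32 ref = (pos >> (12 - 9)) & 0xffffffff;	/* 2048 / 8 = 256 */

so the seeded reference tag is the LBA in interval-sized units, 256 here.)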



[PATCH 2/3] block: move dif_prepare/dif_complete functions to block layer

2018-07-24 Thread Max Gurtovoy
Currently these functions are implemented in the scsi layer, but their
actual place should be the block layer since T10-PI is a general data
integrity feature that is used in the nvme protocol as well. Also, use
the tuple size from the integrity profile since it may vary between
integrity types.

Suggested-by: Christoph Hellwig 
Cc: Jens Axboe 
Cc: Martin K. Petersen 
Signed-off-by: Max Gurtovoy 
---

changes from v1 (Christoph, Martin and Keith comments):
 - moved the functions to t10-pi.c
 - updated tuple size
 - changed local variables scope
 - remove/add new lines

---
 block/t10-pi.c | 100 +++
 drivers/scsi/sd.c  |   8 ++--
 drivers/scsi/sd.h  |   9 
 drivers/scsi/sd_dif.c  | 113 -
 include/linux/t10-pi.h |   3 ++
 5 files changed, 108 insertions(+), 125 deletions(-)

diff --git a/block/t10-pi.c b/block/t10-pi.c
index a98db38..5f2a177 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -184,3 +184,103 @@ static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
.verify_fn  = t10_pi_type3_verify_ip,
 };
 EXPORT_SYMBOL(t10_pi_type3_ip);
+
+/*
+ * The virtual start sector is the one that was originally submitted
+ * by the block layer. Due to partitioning, MD/DM cloning, etc. the
+ * actual physical start sector is likely to be different.  Remap
+ * protection information to match the physical LBA.
+ *
+ * From a SCSI protocol perspective there's a slight difference between
+ * Type 1 and 2.  The latter uses 32-byte commands exclusively, and the
+ * reference tag is seeded in the command.  This gives us the potential to
+ * avoid virt->phys remapping during write.  However, at read time we
+ * don't know whether the virt sector is the same as when we wrote it
+ * (we could be reading from real disk as opposed to MD/DM device).  So
+ * we always remap Type 2 making it identical to Type 1.
+ *
+ * Type 3 does not have a reference tag so no remapping is required.
+ */
+void t10_pi_prepare(struct request *rq, u8 protection_type)
+{
+   const int tuple_sz = rq->q->integrity.tuple_size;
+   u32 ref_tag = t10_pi_ref_tag(rq);
+   struct bio *bio;
+
+   if (protection_type == T10_PI_TYPE3_PROTECTION)
+   return;
+
+   __rq_for_each_bio(bio, rq) {
+   struct bio_integrity_payload *bip = bio_integrity(bio);
+   u32 virt = bip_get_seed(bip) & 0xffffffff;
+   struct bio_vec iv;
+   struct bvec_iter iter;
+
+   /* Already remapped? */
+   if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
+   break;
+
+   bip_for_each_vec(iv, bip, iter) {
+   struct t10_pi_tuple *pi = kmap_atomic(iv.bv_page) +
+   iv.bv_offset;
+   unsigned int j;
+
+   for (j = 0; j < iv.bv_len; j += tuple_sz) {
+   if (be32_to_cpu(pi->ref_tag) == virt)
+   pi->ref_tag = cpu_to_be32(ref_tag);
+   virt++;
+   ref_tag++;
+   pi += tuple_sz;
+   }
+
+   kunmap_atomic(pi);
+   }
+
+   bip->bip_flags |= BIP_MAPPED_INTEGRITY;
+   }
+}
+EXPORT_SYMBOL(t10_pi_prepare);
+
+/*
+ * Remap physical sector values in the reference tag to the virtual
+ * values expected by the block layer.
+ */
+void t10_pi_complete(struct request *rq, u8 protection_type,
+unsigned int intervals)
+{
+   const int tuple_sz = rq->q->integrity.tuple_size;
+   u32 ref_tag = t10_pi_ref_tag(rq);
+   struct bio *bio;
+
+   if (protection_type == T10_PI_TYPE3_PROTECTION)
+   return;
+
+   __rq_for_each_bio(bio, rq) {
+   struct bio_integrity_payload *bip = bio_integrity(bio);
+   u32 virt = bip_get_seed(bip) & 0xffffffff;
+   struct bio_vec iv;
+   struct bvec_iter iter;
+
+   bip_for_each_vec(iv, bip, iter) {
+   struct t10_pi_tuple *pi = kmap_atomic(iv.bv_page) +
+   iv.bv_offset;
+   unsigned int j;
+
+   for (j = 0; j < iv.bv_len; j += tuple_sz) {
+   if (intervals == 0) {
+   kunmap_atomic(pi);
+   return;
+   }
+   if (be32_to_cpu(pi->ref_tag) == ref_tag)
+   pi->ref_tag = cpu_to_be32(virt);
+   virt++;
+   ref_tag++;
+   intervals--;
+   pi += tuple_sz;
+   }
+
+   

[PATCH 1/3] block: move ref_tag calculation func to the block layer

2018-07-24 Thread Max Gurtovoy
Currently this function is implemented in the scsi layer, but its
actual place should be the block layer since T10-PI is a general
data integrity feature that is used in the nvme protocol as well.

Suggested-by: Christoph Hellwig 
Cc: Jens Axboe 
Cc: Martin K. Petersen 
Signed-off-by: Max Gurtovoy 
---
 drivers/infiniband/ulp/iser/iser_memory.c | 2 +-
 drivers/nvme/host/core.c  | 3 +--
 drivers/scsi/mpt3sas/mpt3sas_scsih.c  | 2 +-
 drivers/scsi/sd_dif.c | 4 ++--
 include/linux/t10-pi.h| 6 ++
 include/scsi/scsi_cmnd.h  | 7 +--
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index ca844a9..130bf16 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -311,7 +311,7 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
 {
domain->sig_type = IB_SIG_TYPE_T10_DIF;
domain->sig.dif.pi_interval = scsi_prot_interval(sc);
-   domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc);
+   domain->sig.dif.ref_tag = t10_pi_ref_tag(sc->request);
/*
 * At the moment we hard code those, but in the future
 * we will take them from sc.
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index bf65501..191177b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -627,8 +627,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
case NVME_NS_DPS_PI_TYPE2:
control |= NVME_RW_PRINFO_PRCHK_GUARD |
NVME_RW_PRINFO_PRCHK_REF;
-   cmnd->rw.reftag = cpu_to_le32(
-   nvme_block_nr(ns, blk_rq_pos(req)));
+   cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
break;
}
}
diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index b8d131a..dd738ae 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -4568,7 +4568,7 @@ static int _scsih_set_satl_pending(struct scsi_cmnd *scmd, bool pending)
MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG |
MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD;
mpi_request->CDB.EEDP32.PrimaryReferenceTag =
-   cpu_to_be32(scsi_prot_ref_tag(scmd));
+   cpu_to_be32(t10_pi_ref_tag(scmd->request));
break;
 
case SCSI_PROT_DIF_TYPE3:
diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
index 9035380..d8de43d 100644
--- a/drivers/scsi/sd_dif.c
+++ b/drivers/scsi/sd_dif.c
@@ -124,7 +124,7 @@ void sd_dif_prepare(struct scsi_cmnd *scmd)
if (sdkp->protection_type == T10_PI_TYPE3_PROTECTION)
return;
 
-   phys = scsi_prot_ref_tag(scmd);
+   phys = t10_pi_ref_tag(scmd->request);
 
__rq_for_each_bio(bio, scmd->request) {
struct bio_integrity_payload *bip = bio_integrity(bio);
@@ -176,7 +176,7 @@ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes)
return;
 
intervals = good_bytes / scsi_prot_interval(scmd);
-   phys = scsi_prot_ref_tag(scmd);
+   phys = t10_pi_ref_tag(scmd->request);
 
__rq_for_each_bio(bio, scmd->request) {
struct bio_integrity_payload *bip = bio_integrity(bio);
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index c6aa8a3..635855e 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -37,6 +37,12 @@ struct t10_pi_tuple {
 #define T10_PI_APP_ESCAPE cpu_to_be16(0xffff)
 #define T10_PI_REF_ESCAPE cpu_to_be32(0xffffffff)
 
+static inline u32 t10_pi_ref_tag(struct request *rq)
+{
+   return blk_rq_pos(rq) >>
+   (rq->q->integrity.interval_exp - 9) & 0xffffffff;
+}
+
 extern const struct blk_integrity_profile t10_pi_type1_crc;
 extern const struct blk_integrity_profile t10_pi_type1_ip;
 extern const struct blk_integrity_profile t10_pi_type3_crc;
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index aaf1e97..cae229b 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -4,6 +4,7 @@
 
 #include 
 #include 
+#include <linux/t10-pi.h>
 #include 
 #include 
 #include 
@@ -313,12 +314,6 @@ static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd)
return scmd->device->sector_size;
 }
 
-static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd)
-{
-   return blk_rq_pos(scmd->request) >>
-   (ilog2(scsi_prot_interval(scmd)) - 9) & 0xffffffff;
-}
-
 static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd)
 {
return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0;
-- 
1.8.3.1



[PATCH 3/3] nvme: use blk API to remap ref tags for IOs with metadata

2018-07-24 Thread Max Gurtovoy
Also moved the logic of the remapping to the nvme core driver instead
of implementing it in the nvme pci driver. This way all the other nvme
transport drivers will benefit from it (in case they'll implement metadata
support).

Suggested-by: Christoph Hellwig 
Cc: Jens Axboe 
Cc: Martin K. Petersen 
Signed-off-by: Max Gurtovoy 
---

changes from v1:
 - check status of nvme_req instead of converting to BLK_STS

---
 drivers/nvme/host/core.c | 18 
 drivers/nvme/host/nvme.h |  9 +-
 drivers/nvme/host/pci.c  | 75 +---
 3 files changed, 20 insertions(+), 82 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 191177b..b57abe5 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -617,6 +617,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
return BLK_STS_NOTSUPP;
control |= NVME_RW_PRINFO_PRACT;
+   } else if (req_op(req) == REQ_OP_WRITE) {
+   t10_pi_prepare(req, ns->pi_type);
}
 
switch (ns->pi_type) {
@@ -637,6 +639,22 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
return 0;
 }
 
+void nvme_cleanup_cmd(struct request *req)
+{
+   if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
+   nvme_req(req)->status == 0) {
+   struct nvme_ns *ns = req->rq_disk->private_data;
+
+   t10_pi_complete(req, ns->pi_type,
+   blk_rq_bytes(req) >> ns->lba_shift);
+   }
+   if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+   kfree(page_address(req->special_vec.bv_page) +
+ req->special_vec.bv_offset);
+   }
+}
+EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
+
 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmd)
 {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 0c4a33d..dfc01ff 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -356,14 +356,6 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
return (sector >> (ns->lba_shift - 9));
 }
 
-static inline void nvme_cleanup_cmd(struct request *req)
-{
-   if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
-   kfree(page_address(req->special_vec.bv_page) +
- req->special_vec.bv_offset);
-   }
-}
-
 static inline void nvme_end_request(struct request *req, __le16 status,
union nvme_result result)
 {
@@ -420,6 +412,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 #define NVME_QID_ANY -1
 struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid);
+void nvme_cleanup_cmd(struct request *req);
 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmd);
 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ddd441b..03beac5 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -535,73 +535,6 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
mempool_free(iod->sg, dev->iod_mempool);
 }
 
-#ifdef CONFIG_BLK_DEV_INTEGRITY
-static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
-{
-   if (be32_to_cpu(pi->ref_tag) == v)
-   pi->ref_tag = cpu_to_be32(p);
-}
-
-static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
-{
-   if (be32_to_cpu(pi->ref_tag) == p)
-   pi->ref_tag = cpu_to_be32(v);
-}
-
-/**
- * nvme_dif_remap - remaps ref tags to bip seed and physical lba
- *
- * The virtual start sector is the one that was originally submitted by the
- * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical
- * start sector may be different. Remap protection information to match the
- * physical LBA on writes, and back to the original seed on reads.
- *
- * Type 0 and 3 do not have a ref tag, so no remapping required.
- */
-static void nvme_dif_remap(struct request *req,
-   void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
-{
-   struct nvme_ns *ns = req->rq_disk->private_data;
-   struct bio_integrity_payload *bip;
-   struct t10_pi_tuple *pi;
-   void *p, *pmap;
-   u32 i, nlb, ts, phys, virt;
-
-   if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
-   return;
-
-   bip = bio_integrity(req->bio);
-   if (!bip)
-   return;
-
-   pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;
-
-   p = pmap;
-   virt = bip_get_seed(bip);
-   phys = nvme_block_nr(ns, blk_rq_pos(req));
-   nlb = 

two small bio cleanups

2018-07-24 Thread Christoph Hellwig
To remove invariants that are odd and in the way of multipage biovecs.




[PATCH 2/2] block: bio_set_pages_dirty can't see NULL bv_page in a valid bio_vec

2018-07-24 Thread Christoph Hellwig
So don't bother handling it.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Ming Lei 
---
 block/bio.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 504b42278099..07d3ffd95989 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1636,10 +1636,8 @@ void bio_set_pages_dirty(struct bio *bio)
int i;
 
bio_for_each_segment_all(bvec, bio, i) {
-   struct page *page = bvec->bv_page;
-
-   if (page && !PageCompound(page))
-   set_page_dirty_lock(page);
+   if (!PageCompound(bvec->bv_page))
+   set_page_dirty_lock(bvec->bv_page);
}
 }
 EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
-- 
2.18.0



[PATCH 1/2] block: simplify bio_check_pages_dirty

2018-07-24 Thread Christoph Hellwig
bio_check_pages_dirty currently violates the invariant that bv_page of
a bio_vec inside bi_vcnt shouldn't be zero, and that is going to become
really annoying with multipage biovecs.  Fortunately there isn't any
good reason for it - once we decide to defer freeing the bio to a
workqueue, holding onto a few additional pages isn't really an issue
anymore.  So just check if there is a clean page that needs dirtying
in the first pass, and do a second pass to free them if there was
none, while the cache is still hot.

Also use the chance to micro-optimize bio_dirty_fn a bit by not saving
irq state - we know we are called from a workqueue.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Ming Lei 
---
 block/bio.c | 56 -
 1 file changed, 21 insertions(+), 35 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 8ecc95615941..504b42278099 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1649,19 +1649,15 @@ static void bio_release_pages(struct bio *bio)
struct bio_vec *bvec;
int i;
 
-   bio_for_each_segment_all(bvec, bio, i) {
-   struct page *page = bvec->bv_page;
-
-   if (page)
-   put_page(page);
-   }
+   bio_for_each_segment_all(bvec, bio, i)
+   put_page(bvec->bv_page);
 }
 
 /*
  * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
  * If they are, then fine.  If, however, some pages are clean then they must
  * have been written out during the direct-IO read.  So we take another ref on
- * the BIO and the offending pages and re-dirty the pages in process context.
+ * the BIO and re-dirty the pages in process context.
  *
  * It is expected that bio_check_pages_dirty() will wholly own the BIO from
  * here on.  It will run one put_page() against each page and will run one
@@ -1679,52 +1675,42 @@ static struct bio *bio_dirty_list;
  */
 static void bio_dirty_fn(struct work_struct *work)
 {
-   unsigned long flags;
-   struct bio *bio;
+   struct bio *bio, *next;
 
-   spin_lock_irqsave(&bio_dirty_lock, flags);
-   bio = bio_dirty_list;
+   spin_lock_irq(&bio_dirty_lock);
+   next = bio_dirty_list;
	bio_dirty_list = NULL;
-   spin_unlock_irqrestore(&bio_dirty_lock, flags);
+   spin_unlock_irq(&bio_dirty_lock);
 
-   while (bio) {
-   struct bio *next = bio->bi_private;
+   while ((bio = next) != NULL) {
+   next = bio->bi_private;
 
bio_set_pages_dirty(bio);
bio_release_pages(bio);
bio_put(bio);
-   bio = next;
}
 }
 
 void bio_check_pages_dirty(struct bio *bio)
 {
struct bio_vec *bvec;
-   int nr_clean_pages = 0;
+   unsigned long flags;
int i;
 
bio_for_each_segment_all(bvec, bio, i) {
-   struct page *page = bvec->bv_page;
-
-   if (PageDirty(page) || PageCompound(page)) {
-   put_page(page);
-   bvec->bv_page = NULL;
-   } else {
-   nr_clean_pages++;
-   }
+   if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
+   goto defer;
}
 
-   if (nr_clean_pages) {
-   unsigned long flags;
-
-   spin_lock_irqsave(&bio_dirty_lock, flags);
-   bio->bi_private = bio_dirty_list;
-   bio_dirty_list = bio;
-   spin_unlock_irqrestore(&bio_dirty_lock, flags);
-   schedule_work(&bio_dirty_work);
-   } else {
-   bio_put(bio);
-   }
+   bio_release_pages(bio);
+   bio_put(bio);
+   return;
+defer:
+   spin_lock_irqsave(&bio_dirty_lock, flags);
+   bio->bi_private = bio_dirty_list;
+   bio_dirty_list = bio;
+   spin_unlock_irqrestore(&bio_dirty_lock, flags);
+   schedule_work(&bio_dirty_work);
 }
 EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
 
-- 
2.18.0



Re: [PATCH 1/2] block: move dif_prepare/dif_complete functions to block layer

2018-07-24 Thread Max Gurtovoy




On 7/24/2018 4:54 AM, Martin K. Petersen wrote:


Christoph,


+void blk_integrity_dif_prepare(struct request *rq, u8 protection_type,
+  u32 ref_tag)
+{


Maybe call this blk_t10_pi_prepare?


The rest of these functions have a blk_integrity_ prefix. So either
stick with that or put the functions in t10-pi.c and use a t10_pi_
prefix.

I'm a bit torn on placement since the integrity metadata could contain
other stuff than T10 PI. But the remapping is very specific to T10 PI.


diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 9421d9877730..4186bf027c59 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1119,7 +1119,9 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
		SCpnt->cmnd[0] = WRITE_6;

		if (blk_integrity_rq(rq))
-			sd_dif_prepare(SCpnt);
+			blk_integrity_dif_prepare(SCpnt->request,
+						  sdkp->protection_type,
+						  scsi_prot_ref_tag(SCpnt));


scsi_prot_ref_tag could be moved to the block layer as it only uses
the sector in the request and the sector size, which we can get
from the gendisk as well.  We then don't need to pass it to the function.


For Type 2, the PI can be at intervals different from the logical block
size (although we don't support that yet). We should use the
blk_integrity profile interval instead of assuming sector size.

And wrt. Keith's comment: The tuple_size should be the one from the
integrity profile as well, not sizeof(struct t10_pi_tuple).



Ok so I'll use rq->q->integrity.tuple_size as the tuple_sz and increment
j and pi according to it, right?
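
For context, those values come from the blk_integrity profile the driver
registers; roughly (simplified from what sd_dif_config_host() does, not
the exact code):

	struct blk_integrity bi = {
		.profile      = &t10_pi_type1_crc,	/* or type3, ip... */
		.tuple_size   = sizeof(struct t10_pi_tuple),	/* 8 today */
		.interval_exp = ilog2(sdkp->device->sector_size),
	};
	blk_integrity_register(disk, &bi);

so a loop that steps by rq->q->integrity.tuple_size automatically follows
whatever tuple size the registered profile declared.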


Re: [PATCH v2] bcache: set max writeback rate when I/O request is idle

2018-07-24 Thread Stefan Priebe - Profihost AG
On 24.07.2018 at 10:28, Coly Li wrote:
> On 2018/7/24 3:16 PM, Stefan Priebe - Profihost AG wrote:
>> On 24.07.2018 at 06:03, Coly Li wrote:
>>> Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
>>> allows the writeback rate to be faster if there is no I/O request on a
>>> bcache device. It works well if there is only one bcache device attached
>>> to the cache set. If there are many bcache devices attached to a cache
>>> set, it may introduce performance regression because multiple faster
>>> writeback threads of the idle bcache devices will compete for the btree level
>>> locks with the bcache device which has I/O requests coming.
>>>
>>> This patch fixes the above issue by only permitting fast writeback when
>>> all bcache devices attached on the cache set are idle. And if one of the
>>> bcache devices has a new I/O request coming, minimize all writeback
>>> throughput immediately and let the PI controller __update_writeback_rate()
>>> decide the upcoming writeback rate for each bcache device.
>>>
>>> Also when all bcache devices are idle, limiting the writeback rate to a small
>>> number is a waste of throughput, especially when backing devices are slower
>>> non-rotation devices (e.g. SATA SSD). This patch sets a max writeback
>>> rate for each backing device if the whole cache set is idle. A faster
>>> writeback rate in idle time means new I/Os may have more available space
>>> for dirty data, and people may observe a better write performance then.
>>>
>>> Please note bcache may change its cache mode in run time, and this patch
>>> still works if the cache mode is switched from writeback mode and there
>>> is still dirty data on cache.
>>>
>>> Fixes: Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
>>> Cc: sta...@vger.kernel.org #4.16+
>>> Signed-off-by: Coly Li 
>>> Tested-by: Kai Krakow 
>>> Cc: Michael Lyle 
>>> Cc: Stefan Priebe 
>>
>> Hi Coly,
>>
>> i'm still experiencing the same bug as yesterday: while rebooting, every
>> two times i get a deadlock in cache_set_free.
>>
>> Sadly it's so late in the shutdown process that netconsole is already
>> unloaded, or at least i get no messages while it happens. The traces
>> look the same as yesterday.
> 
> Hi Stefan,
> 
> Do you use the v2 patch on latest SLE15 kernel source?

Yes - i use the kernel code from here:
https://github.com/openSUSE/kernel-source/commits/SLE15

> Let me try to
> reproduce it on my machine. I assume when you reboot, the cache is still
> dirty and not cleaned up, am I right?

Yes with around 15GB of dirty data - ceph is running on top of it and
generated many random writes.

> And which file system do you mount
> on top of the bcache device ?
XFS

> 
> Thanks.
> 
> Coly Li

Greets,
Stefan


Re: [PATCH v2] bcache: set max writeback rate when I/O request is idle

2018-07-24 Thread Coly Li
On 2018/7/24 3:16 PM, Stefan Priebe - Profihost AG wrote:
> On 24.07.2018 at 06:03, Coly Li wrote:
>> Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
>> allows the writeback rate to be faster if there is no I/O request on a
>> bcache device. It works well if there is only one bcache device attached
>> to the cache set. If there are many bcache devices attached to a cache
>> set, it may introduce performance regression because multiple faster
>> writeback threads of the idle bcache devices will compete for the btree level
>> locks with the bcache device which has I/O requests coming.
>>
>> This patch fixes the above issue by only permitting fast writeback when
>> all bcache devices attached on the cache set are idle. And if one of the
>> bcache devices has a new I/O request coming, minimize all writeback
>> throughput immediately and let the PI controller __update_writeback_rate()
>> decide the upcoming writeback rate for each bcache device.
>>
>> Also when all bcache devices are idle, limiting the writeback rate to a small
>> number is a waste of throughput, especially when backing devices are slower
>> non-rotation devices (e.g. SATA SSD). This patch sets a max writeback
>> rate for each backing device if the whole cache set is idle. A faster
>> writeback rate in idle time means new I/Os may have more available space
>> for dirty data, and people may observe a better write performance then.
>>
>> Please note bcache may change its cache mode in run time, and this patch
>> still works if the cache mode is switched from writeback mode and there
>> is still dirty data on cache.
>>
>> Fixes: Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
>> Cc: sta...@vger.kernel.org #4.16+
>> Signed-off-by: Coly Li 
>> Tested-by: Kai Krakow 
>> Cc: Michael Lyle 
>> Cc: Stefan Priebe 
> 
> Hi Coly,
> 
> i'm still experiencing the same bug as yesterday: while rebooting, every
> two times i get a deadlock in cache_set_free.
> 
> Sadly it's so late in the shutdown process that netconsole is already
> unloaded, or at least i get no messages while it happens. The traces
> look the same as yesterday.

Hi Stefan,

Do you use the v2 patch on latest SLE15 kernel source? Let me try to
reproduce it on my machine. I assume when you reboot, the cache is still
dirty and not cleaned up, am I right? And which file system do you mount
on top of the bcache device ?

Thanks.

Coly Li



Re: [PATCH v2] UBI: Add volume read and write statistics

2018-07-24 Thread Richard Weinberger
[Adding linux-block]

On Monday, 23 July 2018 at 17:39:02 CEST, Per Forlin wrote:
> Simple read and write statistics.
> * Number of reads
> * Number of writes
> * Bytes read
> * Bytes written
> 
> This is useful to find out how the storage is being utilized.
> For block devices this already exists here:
> /sys/class/block//stat
> 
> For UBI it now exists here:
> /sys/class/ubi//stat

Please document it also in Documentation/ABI/stable/sysfs-class-ubi.

> Example:
> /sys/class/ubi/ubi0_21/stat
> 864 0 3106756 0 1057 0 2144256 0 0 0 0
> 
> The output format is same as for block devices
> except that not all metrics are supported yet.
> Unsupported values are set to 0.
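
For reference, the 11 fields of the block-device stat file are documented
in Documentation/block/stat.txt: read I/Os, read merges, read sectors,
read ticks, write I/Os, write merges, write sectors, write ticks,
in_flight, io_ticks, time_in_queue. So the example line above reads as
864 reads, 3106756 bytes read (in the "read sectors" slot, see the v2
question below about bytes vs. sectors), 1057 writes and 2144256 bytes
written, with everything else reported as 0.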

Let's ask the block folks what they think about it.
Per, did you check which tools use the stats file in /sys?
I checked iostat, it seems to consume only /proc/diskstats.

To give more context, UBI and MTD devices offer access to raw flash
storage. So, we have filesystems on top of it.
Since these devices are not block devices, almost no tools know
about them.
With this patch we'd like to offer an interface to userspace such that
tools will also discover UBI devices.
Maybe MTD devices later too.

> ---
> Changelog:
> 
> v2
> * Question: Translate bytes to sectors? The iostats format expects sector units.

Not sure. I'd wait for input from userspace and block folks.

> * Align with iostats format
> * Only count successful reads and writes
> 
>  drivers/mtd/ubi/eba.c | 11 ++-
>  drivers/mtd/ubi/ubi.h | 19 +++
>  drivers/mtd/ubi/vmt.c |  8 
>  3 files changed, 37 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c
> index b98481b..c9a88b2 100644
> --- a/drivers/mtd/ubi/eba.c
> +++ b/drivers/mtd/ubi/eba.c
> @@ -731,6 +731,11 @@ int ubi_eba_read_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum,
>   }
>   }
>  
> + if (!err) {
> + vol->stat.rcount++;
> + vol->stat.rbytes += len;

Can we please have a helper function for that?

> + }
> +
>   if (scrub)
>   err = ubi_wl_scrub_peb(ubi, pnum);
>  
> @@ -1091,8 +1096,12 @@ int ubi_eba_write_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum,
>   ubi_free_vid_buf(vidb);
>  
>  out:
> - if (err)
> + if (err) {
>   ubi_ro_mode(ubi);
> + } else {
> + vol->stat.wcount++;
> + vol->stat.wbytes += len;

Same.

> + }
>  
>   leb_write_unlock(ubi, vol_id, lnum);
>  
> diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h
> index f5ba97c..0cb00f0 100644
> --- a/drivers/mtd/ubi/ubi.h
> +++ b/drivers/mtd/ubi/ubi.h
> @@ -293,6 +293,23 @@ struct ubi_eba_leb_desc {
>  };
>  
>  /**
> + * struct ubi_volume_stat - Volume statistics
> + * @rbytes: the number of bytes read
> + * @wbytes: the number of bytes written
> + * @rcount: the number of read requests
> + * @wcount: the number of write requests
> + *
> + * This structure contains read and write statistics.
> + *
> + */
> +struct ubi_volume_stat {
> + u64 rbytes;
> + u64 wbytes;
> + u32 rcount;
> + u32 wcount;

Does the width of these integers also match the block devices?

Thanks,
//richard
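
One possible shape for the helper requested above, just to illustrate
(name and placement are made up):

	/* hypothetical helper, e.g. in drivers/mtd/ubi/ubi.h */
	static inline void ubi_stat_account(struct ubi_volume *vol,
					    bool is_write, size_t len)
	{
		if (is_write) {
			vol->stat.wcount++;
			vol->stat.wbytes += len;
		} else {
			vol->stat.rcount++;
			vol->stat.rbytes += len;
		}
	}

so the two call sites collapse to ubi_stat_account(vol, false, len) and
ubi_stat_account(vol, true, len).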


Re: [PATCH] blk-mq: Avoid that a request queue stalls when restarting a shared hctx

2018-07-24 Thread jianchao.wang



On 07/23/2018 11:50 PM, Bart Van Assche wrote:
> The patch below fixes queue stalling when a shared hctx is marked for restart
> (BLK_MQ_S_SCHED_RESTART bit) but q->shared_hctx_restart stays zero.  The
> root cause is that hctxs are shared between queues, but 'shared_hctx_restart'

The blk_mq_hw_ctx structure is also per request_queue.
Please refer to blk_mq_init_allocated_queue -> blk_mq_realloc_hw_ctxs.

> belongs to the particular queue, which in fact may not need to be restarted,
> thus we return from blk_mq_sched_restart() and leave shared hctx of another
> queue never restarted.
> 
> The fix is to make the shared_hctx_restart counter belong not to the queue, but
> to tags, so that the counter reflects the real number of shared hctxs that
> need to be restarted.
> 
> During tests 1 hctx (set->nr_hw_queues) was used and all stalled requests
> were noticed in dd->fifo_list of mq-deadline scheduler.
> 
> Seeming possible sequence of events:
> 
> 1. Request A of queue A is inserted into dd->fifo_list of the scheduler.
> 
> 2. Request B of queue A bypasses scheduler and goes directly to
>hctx->dispatch.
> 
> 3. Request C of queue B is inserted.
> 
> 4. blk_mq_sched_dispatch_requests() is invoked, since hctx->dispatch is not
>empty (request B is in the list) hctx is only marked for next restart
>and request A is left in a list (see comment "So it's best to leave them
>there for as long as we can. Mark the hw queue as needing a restart in
>that case." in blk-mq-sched.c)
> 
> 5. Eventually request B is completed/freed and blk_mq_sched_restart() is
>called, but by chance hctx from queue B is chosen for restart and request C
>gets a chance to be dispatched.

Request C is just inserted into queue B. If the restart mark is not set there,
it will not be chosen.
blk_mq_sched_restart_hctx

if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
return false;

If blk_mq_sched_restart chooses queue B, one of its hctxs must have the
SCHED_RESTART flag set, and q->shared_hctx_restart must not be zero.

> 
> 6. Eventually request C is completed/freed and blk_mq_sched_restart() is
>called, but shared_hctx_restart for queue B is zero and we return without
>attempting to restart hctx from queue A, thus request A is stuck forever.
> 
> But a stalled queue is not the only problem with blk_mq_sched_restart().
> My tests show that those loops through all queues and hctxs can be very costly,
> even with the shared_hctx_restart counter, which aims to fix the performance issue.

Currently, SCHED_RESTART is always set when there are requests on the
hctx->dispatch list in blk_mq_sched_dispatch_requests. And running out of
driver tags is the main reason for hctx->dispatch being non-empty when an
io scheduler is in use.
Therefore, most of the time, blk_mq_sched_restart will be invoked for the
no-driver-tag case.

For the non-shared-tag case, it will wake up the hctx.
But for the shared-tag case, it is unnecessary, because the sbitmap_queue
wakeup hook will do the work and hctx_may_queue will avoid starvation of
the other queues.

Therefore, the costly loop through the queues and hctxs is unnecessary
most of the time.
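
For reference, the fair-share check mentioned above looks roughly like
this (condensed from the 4.18-era hctx_may_queue(); illustrative, check
the source for the exact version):

	static bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
				   struct sbitmap_queue *bt)
	{
		unsigned int depth, users;

		if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
			return true;
		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
			return true;

		users = atomic_read(&hctx->tags->active_queues);
		if (!users)
			return true;

		/* give each active queue a fair share of the tag space */
		depth = max((bt->sb.depth + users - 1) / users, 4U);
		return atomic_read(&hctx->nr_active) < depth;
	}

which is what bounds each shared-tag queue and prevents the starvation
mentioned above.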

Thanks
Jianchao







Re: [PATCH 1/2] block: move dif_prepare/dif_complete functions to block layer

2018-07-24 Thread Christoph Hellwig
On Mon, Jul 23, 2018 at 09:54:38PM -0400, Martin K. Petersen wrote:
> 
> Christoph,
> 
> >> +void blk_integrity_dif_prepare(struct request *rq, u8 protection_type,
> >> + u32 ref_tag)
> >> +{
> >
> > Maybe call this blk_t10_pi_prepare?
> 
> The rest of these functions have a blk_integrity_ prefix. So either
> stick with that or put the functions in t10-pi.c and use a t10_pi_
> prefix.

Yes, I suggested moving it somewhere in my reply.


Re: [PATCH v2] bcache: set max writeback rate when I/O request is idle

2018-07-24 Thread Stefan Priebe - Profihost AG
On 24.07.2018 at 06:03, Coly Li wrote:
> Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
> allows the writeback rate to be faster if there is no I/O request on a
> bcache device. It works well if there is only one bcache device attached
> to the cache set. If there are many bcache devices attached to a cache
> set, it may introduce performance regression because multiple faster
> writeback threads of the idle bcache devices will compete for the btree level
> locks with the bcache device which has I/O requests coming.
> 
> This patch fixes the above issue by only permitting fast writeback when
> all bcache devices attached on the cache set are idle. And if one of the
> bcache devices has a new I/O request coming, minimize all writeback
> throughput immediately and let the PI controller __update_writeback_rate()
> decide the upcoming writeback rate for each bcache device.
> 
> Also when all bcache devices are idle, limiting the writeback rate to a small
> number is a waste of throughput, especially when backing devices are slower
> non-rotation devices (e.g. SATA SSD). This patch sets a max writeback
> rate for each backing device if the whole cache set is idle. A faster
> writeback rate in idle time means new I/Os may have more available space
> for dirty data, and people may observe a better write performance then.
> 
> Please note bcache may change its cache mode in run time, and this patch
> still works if the cache mode is switched from writeback mode and there
> is still dirty data on cache.
> 
> Fixes: Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
> Cc: sta...@vger.kernel.org #4.16+
> Signed-off-by: Coly Li 
> Tested-by: Kai Krakow 
> Cc: Michael Lyle 
> Cc: Stefan Priebe 

Hi Coly,

i'm still experiencing the same bug as yesterday: while rebooting, every
two times i get a deadlock in cache_set_free.

Sadly it's so late in the shutdown process that netconsole is already
unloaded, or at least i get no messages while it happens. The traces
look the same as yesterday.

Greets,
Stefan

> ---
> Changelog:
> v2, Fix a deadlock reported by Stefan Priebe.
> v1, Initial version.
> 
>  drivers/md/bcache/bcache.h|  11 ++--
>  drivers/md/bcache/request.c   |  51 ++-
>  drivers/md/bcache/super.c |   1 +
>  drivers/md/bcache/sysfs.c |  14 +++--
>  drivers/md/bcache/util.c  |   2 +-
>  drivers/md/bcache/util.h  |   2 +-
>  drivers/md/bcache/writeback.c | 115 ++
>  7 files changed, 155 insertions(+), 41 deletions(-)
> 
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index d6bf294f3907..469ab1a955e0 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -328,13 +328,6 @@ struct cached_dev {
>*/
>   atomic_t has_dirty;
>  
> - /*
> -  * Set to zero by things that touch the backing volume-- except
> -  * writeback.  Incremented by writeback.  Used to determine when to
> -  * accelerate idle writeback.
> -  */
> - atomic_t backing_idle;
> -
>   struct bch_ratelimit writeback_rate;
>   struct delayed_work writeback_rate_update;
>  
> @@ -514,6 +507,8 @@ struct cache_set {
>   struct cache_accounting accounting;
>  
>   unsigned long   flags;
> + atomic_t idle_counter;
> + atomic_t at_max_writeback_rate;
>  
>   struct cache_sb sb;
>  
> @@ -523,6 +518,8 @@ struct cache_set {
>  
>   struct bcache_device **devices;
>   unsigned devices_max_used;
> + /* See set_at_max_writeback_rate() for how it is used */
> + unsigned previous_dirty_dc_nr;
>   struct list_head cached_devs;
>   uint64_t cached_dev_sectors;
>   struct closure  caching;
> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> index ae67f5fa8047..1af3d96abfa5 100644
> --- a/drivers/md/bcache/request.c
> +++ b/drivers/md/bcache/request.c
> @@ -1104,6 +1104,43 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
>  
>  /* Cached devices - read & write stuff */
>  
> +static void quit_max_writeback_rate(struct cache_set *c,
> + struct cached_dev *this_dc)
> +{
> + int i;
> + struct bcache_device *d;
> + struct cached_dev *dc;
> +
> + /*
> +  * If bch_register_lock is acquired by other attach/detach operations,
> +  * waiting here will increase I/O request latency for seconds or more.
> +  * To avoid such situation, only writeback rate of current cached device
> +  * is set to 1, and __update_writeback_rate() will decide writeback rate
> +  * of other cached devices (remember c->idle_counter is 0 now).
> +  */
> + if (mutex_trylock(&bch_register_lock)) {
> + for (i = 0; i < c->devices_max_used; i++) {
> + if (!c->devices[i])
> +