Hi,

I've been running into problems when stability testing my 4.10 branch,
and I finally got an easy reproducer today (on the laptop, no less) and
was able to bisect it. Boils down to this:

2253efc850c4cf690516bbc07854eeb1077202ba is the first bad commit
commit 2253efc850c4cf690516bbc07854eeb1077202ba
Author: Bart Van Assche <[email protected]>
Date:   Fri Oct 28 17:20:02 2016 -0700

    blk-mq: Move more code into blk_mq_direct_issue_request()

The symptoms are one of two things:

1) We get command timeouts:

nvme nvme0: I/O 567 QID 14 timeout, aborting
nvme nvme0: Abort status: 0x0
nvme nvme0: I/O 567 QID 14 timeout, reset controller
nvme nvme0: completing aborted command with status:

blk_update_request: I/O error, dev nvme0n1, sector

EXT4-fs warning (device nvme0n1): ext4_end_bio:314: I/O error [...] writing to inode 20185097 (offset 0 size 8388608 starting block [...])

Buffer I/O error on device nvme0n1, logical block 247040
Buffer I/O error on device nvme0n1, logical block 247041
Buffer I/O error on device nvme0n1, logical block 247042
Buffer I/O error on device nvme0n1, logical block 247043
Buffer I/O error on device nvme0n1, logical block 247044
Buffer I/O error on device nvme0n1, logical block 247045
Buffer I/O error on device nvme0n1, logical block 247046
Buffer I/O error on device nvme0n1, logical block 247047
Buffer I/O error on device nvme0n1, logical block 247048
Buffer I/O error on device nvme0n1, logical block 247049

No corruption though, the data has been written.

2) We oops in __blk_mq_complete_request(), because __nvme_process_cq()
  -> blk_mq_complete_request() -> __blk_mq_complete_request() gets a
  request that has NULL ->q, ->bio, ->biotail, etc.

I did a manual revert of the patch, see below, and it seems to work fine
with this applied. I'll take a look at why this is, since it isn't
immediately obvious to me. Sending this to get it out there while I take
a deeper look.

diff --git a/block/blk-mq.c b/block/blk-mq.c
index d180c989a0e5..365ae17c3f2b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1291,11 +1291,11 @@ static struct request *blk_mq_map_request(struct request_queue *q,
        return rq;
 }

-static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
-                                     struct request *rq, blk_qc_t *cookie)
+static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
 {
        int ret;
        struct request_queue *q = rq->q;
+       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
        struct blk_mq_queue_data bd = {
                .rq = rq,
                .list = NULL,
@@ -1303,9 +1303,6 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
        };
        blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);

-       if (blk_mq_hctx_stopped(hctx))
-               goto insert;
-
        /*
         * For OK queue, we are done. For error, kill it. Any other
         * error (busy), just add it to our list as we previously
@@ -1314,7 +1311,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
        ret = q->mq_ops->queue_rq(hctx, &bd);
        if (ret == BLK_MQ_RQ_QUEUE_OK) {
                *cookie = new_cookie;
-               return;
+               return 0;
        }

        __blk_mq_requeue_request(rq);
@@ -1323,11 +1320,10 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                *cookie = BLK_QC_T_NONE;
                rq->errors = -EIO;
                blk_mq_end_request(rq, rq->errors);
-               return;
+               return 0;
        }

-insert:
-       blk_mq_insert_request(rq, false, true, true);
+       return -1;
 }

 /*
@@ -1414,11 +1410,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)

                if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
                        rcu_read_lock();
-                       blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
+                       if (blk_mq_hctx_stopped(data.hctx) ||
+                           blk_mq_direct_issue_request(old_rq, &cookie) != 0)
+                               blk_mq_insert_request(old_rq, false, true, true);
                        rcu_read_unlock();
                } else {
                        srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
-                       blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
+                       if (blk_mq_hctx_stopped(data.hctx) ||
+                           blk_mq_direct_issue_request(old_rq, &cookie) != 0)
+                               blk_mq_insert_request(old_rq, false, true, true);
                        srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
                }
                goto done;

--
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to