On Mon, Jul 31, 2017 at 03:24:40PM -0700, Dave Jiang wrote:
> Adding blk-mq support to the pmem driver in addition to the direct bio
> support. This allows for hardware offloading via DMA engines. By default
> the bio method will be enabled. The blk-mq support can be turned on via
> module parameter queue_mode=1.
>
> Signed-off-by: Dave Jiang <[email protected]>
> ---
> drivers/nvdimm/pmem.c | 131
> +++++++++++++++++++++++++++++++++++++++++++++----
> drivers/nvdimm/pmem.h | 3 +
> 2 files changed, 122 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index c544d46..347835b 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -31,10 +31,24 @@
> #include <linux/pmem.h>
> #include <linux/dax.h>
> #include <linux/nd.h>
> +#include <linux/blk-mq.h>
> #include "pmem.h"
> #include "pfn.h"
> #include "nd.h"
>
> +enum {
> + PMEM_Q_BIO = 0,
> + PMEM_Q_MQ = 1,
> +};
> +
> +static int queue_mode = PMEM_Q_BIO;
> +module_param(queue_mode, int, 0444);
> +MODULE_PARM_DESC(queue_mode, "Pmem Queue Mode");
Maybe the parameter description could give the user a hint about what the valid
settings are? i.e. "PMEM queue mode (0=BIO, 1=MQ+DMA)" or something like that.
> +
> +struct pmem_cmd {
> + struct request *rq;
> +};
> +
> static struct device *to_dev(struct pmem_device *pmem)
> {
> /*
> @@ -239,9 +253,12 @@ static const struct dax_operations pmem_dax_ops = {
> .direct_access = pmem_dax_direct_access,
> };
>
> -static void pmem_release_queue(void *q)
> +static void pmem_release_queue(void *pmem)
> {
> - blk_cleanup_queue(q);
> +
Unneeded blank line at the top of the function body. Also, a nit: could we add
a local variable of type "struct pmem_device *" so we don't have to cast the
void pointer twice?
> + blk_cleanup_queue(((struct pmem_device *)pmem)->q);
> + if (queue_mode == PMEM_Q_MQ)
> + blk_mq_free_tag_set(&((struct pmem_device *)pmem)->tag_set);
> }
>
> static void pmem_freeze_queue(void *q)
> @@ -259,6 +276,65 @@ static void pmem_release_disk(void *__pmem)
> put_disk(pmem->disk);
> }
>
> +static int pmem_handle_cmd(struct pmem_cmd *cmd)
> +{
> + struct request *req = cmd->rq;
> + struct request_queue *q = req->q;
> + struct pmem_device *pmem = q->queuedata;
> + struct nd_region *nd_region = to_region(pmem);
> + struct bio_vec bvec;
> + struct req_iterator iter;
> + int rc = 0;
> +
> + if (req->cmd_flags & REQ_FLUSH)
> + nvdimm_flush(nd_region);
> +
> + rq_for_each_segment(bvec, req, iter) {
> + rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
> + bvec.bv_offset, op_is_write(req_op(req)),
> + iter.iter.bi_sector);
> + if (rc < 0)
> + break;
> + }
> +
> + if (req->cmd_flags & REQ_FUA)
> + nvdimm_flush(nd_region);
> +
> + blk_mq_end_request(cmd->rq, rc);
> +
> + return 0;
Should this be "return rc;"? As written the function always returns 0, even
when pmem_do_bvec() fails, so the rc < 0 check in pmem_queue_rq() below can
never trigger and errors are silently dropped.
> +}
> +
> +static int pmem_queue_rq(struct blk_mq_hw_ctx *hctx,
> + const struct blk_mq_queue_data *bd)
> +{
> + struct pmem_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
> + int rc;
> +
> + cmd->rq = bd->rq;
> +
> + blk_mq_start_request(bd->rq);
> +
> + rc = pmem_handle_cmd(cmd);
> + if (rc < 0)
> + rc = BLK_MQ_RQ_QUEUE_ERROR;
> + else
> + rc = BLK_MQ_RQ_QUEUE_OK;
> +
> + return rc;
> +}
You can get rid of 'rc' altogether and simplify this to:
static int pmem_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct pmem_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
cmd->rq = bd->rq;
blk_mq_start_request(bd->rq);
if (pmem_handle_cmd(cmd) < 0)
return BLK_MQ_RQ_QUEUE_ERROR;
else
return BLK_MQ_RQ_QUEUE_OK;
}
> +static int pmem_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
> + unsigned int index)
> +{
> + return 0;
> +}
I'm pretty sure you don't need a dummy implementation of .init_hctx. All
callers check to make sure it's defined before calling, so if you just leave
the OP NULL it'll do the same thing. scsi_mq_ops for example does this.
> @@ -303,17 +380,47 @@ static int pmem_attach_disk(struct device *dev,
> return -EBUSY;
> }
>
> - q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
> - if (!q)
> - return -ENOMEM;
> + if (queue_mode == PMEM_Q_MQ) {
> + pmem->tag_set.ops = &pmem_mq_ops;
> + pmem->tag_set.nr_hw_queues = nr_online_nodes;
> + pmem->tag_set.queue_depth = 64;
> + pmem->tag_set.numa_node = dev_to_node(dev);
> + pmem->tag_set.cmd_size = sizeof(struct pmem_cmd);
> + pmem->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
> + pmem->tag_set.driver_data = pmem;
> +
> + rc = blk_mq_alloc_tag_set(&pmem->tag_set);
> + if (rc < 0)
> + return rc;
> +
> + pmem->q = blk_mq_init_queue(&pmem->tag_set);
> + if (IS_ERR(pmem->q)) {
> + blk_mq_free_tag_set(&pmem->tag_set);
> + return -ENOMEM;
> + }
>
> - if (devm_add_action_or_reset(dev, pmem_release_queue, q))
> - return -ENOMEM;
> + if (devm_add_action_or_reset(dev, pmem_release_queue, pmem)) {
> + blk_mq_free_tag_set(&pmem->tag_set);
> + return -ENOMEM;
> + }
> + } else if (queue_mode == PMEM_Q_BIO) {
> + pmem->q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
> + if (!pmem->q)
> + return -ENOMEM;
> +
> + if (devm_add_action_or_reset(dev, pmem_release_queue, pmem))
> + return -ENOMEM;
> +
> + blk_queue_make_request(pmem->q, pmem_make_request);
Later on in this function there is a big block that is still manipulating 'q'
instead of pmem->q:
blk_queue_write_cache(q, true, true);
blk_queue_make_request(q, pmem_make_request);
blk_queue_physical_block_size(q, PAGE_SIZE);
blk_queue_max_hw_sectors(q, UINT_MAX);
blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
q->queuedata = pmem;
I'm pretty sure this all needs to be looking at pmem->q instead, and we may
just end up killing the local 'q'.
Also, this block duplicates the blk_queue_make_request() call, which we
manually make happen in the PMEM_Q_BIO case above. Not sure if this needs to
be common for both that and the PMEM_Q_MQ case, but we probably don't want it
in both places.
_______________________________________________
Linux-nvdimm mailing list
[email protected]
https://lists.01.org/mailman/listinfo/linux-nvdimm