Re: [PATCH 3/3] NVMe: Convert to blk-mq
On Fri, 18 Oct 2013, Matias Bjorling wrote: The nvme driver implements itself as a bio-based driver. This primarily because of high lock congestion for high-performance nvm devices. To remove the congestion within the traditional block layer, a multi-queue block layer is being implemented. This patch converts the current bio-based approach to work with the request-based approach found in the multi-queue block layer. This means that bio responsibility is moved from the driver, into the block layer. In return the block layer packs request structures and submit them to the nvme according to the features/limits of nvme hardware. The patch consists of: * Initialization of multi-queue data structures * Conversion of bio function call into request function calls. * Separate cmdid patchs for admin and normal queues. * Bio splits according to NOT_VIRT_MERGEABLE are assumed to be handled by blk-mq. * Uses the timeout framework blk-mq where possible. Signed-off-by: Matias Bjorling m...@bjorling.me --- drivers/block/nvme-core.c | 765 +++--- drivers/block/nvme-scsi.c | 39 +-- include/linux/nvme.h | 7 +- 3 files changed, 385 insertions(+), 426 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e99a30a..36bf45c 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c [snip] -static void nvme_start_io_acct(struct bio *bio) +static void nvme_start_io_acct(struct request *rq) { - struct gendisk *disk = bio-bi_bdev-bd_disk; - const int rw = bio_data_dir(bio); + struct gendisk *disk = rq-rq_disk; + const int rw = rq_data_dir(rq); int cpu = part_stat_lock(); part_round_stats(cpu, disk-part0); part_stat_inc(cpu, disk-part0, ios[rw]); - part_stat_add(cpu, disk-part0, sectors[rw], bio_sectors(bio)); + part_stat_add(cpu, disk-part0, sectors[rw], blk_rq_sectors(rq)); part_inc_in_flight(disk-part0, rw); part_stat_unlock(); } -static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) +static void nvme_end_io_acct(struct request 
*rq, unsigned long start_time) { - struct gendisk *disk = bio-bi_bdev-bd_disk; - const int rw = bio_data_dir(bio); + struct gendisk *disk = rq-rq_disk; + const int rw = rq_data_dir(rq); unsigned long duration = jiffies - start_time; int cpu = part_stat_lock(); part_stat_add(cpu, disk-part0, ticks[rw], duration); @@ -342,23 +370,26 @@ static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) part_stat_unlock(); } I think you can remove the io accounting, right? These were added here because the diskstats are not updated in the block layer for bio-based block drivers. @@ -715,32 +606,47 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, dma_dir = DMA_FROM_DEVICE; } - result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs); - if (result = 0) + if (nvme_map_rq(nvmeq, iod, rq, dma_dir)) goto free_cmdid; - length = result; - cmnd-rw.command_id = cmdid; + length = blk_rq_bytes(rq); + + cmnd-rw.command_id = rq-tag; The command ids have to be unique on a submission queue. Since each namespace's blk-mq has its own 'tags' used as command ids here but share submission queues, what's stopping the tags for commands sent to namespace 1 from clashing with tags for namespace 2? I think this would work better if one blk-mq was created per device rather than namespace. It would fix the tag problem above and save a lot of memory potentially wasted on millions of requests allocated that can't be used. Do you know how/if this is planned to work with scsi? Will there be one blk-mq per LUN or per host controller? -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] NVMe: Refactor doorbell
On Fri, 11 Oct 2013, Matias Bjorling wrote: The doorbell code is repeated various places. Refactor it into its own function for clarity. Signed-off-by: Matias Bjorling m...@bjorling.me Looks good to me. Reviewed-by: Keith Busch keith.bu...@intel.com --- drivers/block/nvme-core.c | 29 + 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index da52092..40998d5 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -242,6 +242,13 @@ void put_nvmeq(struct nvme_queue *nvmeq) put_cpu(); } +static inline void nvme_ring_doorbell(struct nvme_queue *nvmeq) +{ + if (++nvmeq-sq_tail == nvmeq-q_depth) + nvmeq-sq_tail = 0; + writel(nvmeq-sq_tail, nvmeq-q_db); +} + /** * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use @@ -252,14 +259,10 @@ void put_nvmeq(struct nvme_queue *nvmeq) static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) { unsigned long flags; - u16 tail; + spin_lock_irqsave(nvmeq-q_lock, flags); - tail = nvmeq-sq_tail; - memcpy(nvmeq-sq_cmds[tail], cmd, sizeof(*cmd)); - if (++tail == nvmeq-q_depth) - tail = 0; - writel(tail, nvmeq-q_db); - nvmeq-sq_tail = tail; + memcpy(nvmeq-sq_cmds[nvmeq-sq_tail], cmd, sizeof(*cmd)); + nvme_ring_doorbell(nvmeq); spin_unlock_irqrestore(nvmeq-q_lock, flags); return 0; @@ -619,9 +622,7 @@ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd-dsm.nr = 0; cmnd-dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - if (++nvmeq-sq_tail == nvmeq-q_depth) - nvmeq-sq_tail = 0; - writel(nvmeq-sq_tail, nvmeq-q_db); + nvme_ring_doorbell(nvmeq); return 0; } @@ -636,9 +637,7 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd-common.command_id = cmdid; cmnd-common.nsid = cpu_to_le32(ns-ns_id); - if (++nvmeq-sq_tail == nvmeq-q_depth) - nvmeq-sq_tail = 0; - writel(nvmeq-sq_tail, nvmeq-q_db); + nvme_ring_doorbell(nvmeq); return 0; } @@ -729,9 
+728,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd-rw.dsmgmt = cpu_to_le32(dsmgmt); nvme_start_io_acct(bio); - if (++nvmeq-sq_tail == nvmeq-q_depth) - nvmeq-sq_tail = 0; - writel(nvmeq-sq_tail, nvmeq-q_db); + nvme_ring_doorbell(nvmeq); return 0; -- 1.8.1.2 ___ Linux-nvme mailing list linux-n...@lists.infradead.org http://merlin.infradead.org/mailman/listinfo/linux-nvme -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/9] block: Make generic_make_request handle arbitrary sized bios
On Fri, 28 Feb 2014, Kent Overstreet wrote: On Thu, Feb 27, 2014 at 12:22:54PM -0500, Matthew Wilcox wrote: On Wed, Feb 26, 2014 at 03:39:49PM -0800, Kent Overstreet wrote: We do this by adding calls to blk_queue_split() to the various make_request functions that need it - a few can already handle arbitrary size bios. Note that we add the call _after_ any call to blk_queue_bounce(); this means that blk_queue_split() and blk_recalc_rq_segments() don't need to be concerned with bouncing affecting segment merging. diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 51824d1f23..e4376b9613 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -737,6 +737,8 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio) struct nvme_queue *nvmeq = get_nvmeq(ns-dev); int result = -EBUSY; + blk_queue_split(q, bio, q-bio_split); + if (!nvmeq) { put_nvmeq(NULL); bio_endio(bio, -EIO); I'd suggest that we do: - struct nvme_queue *nvmeq = get_nvmeq(ns-dev); + struct nvme_queue *nvmeq; int result = -EBUSY; + blk_queue_split(q, bio, q-bio_split); + + nvmeq = get_nvmeq(ns-dev); if (!nvmeq) { so that we're running the blk_queue_split() code outside the get_cpu() call. Now, the NVMe driver has its own rules about when BIOs have to be split. Right now, that's way down inside the nvme_map_bio() call when we're walking the bio to compose the scatterlist. Should we instead have an nvme_bio_split() routine that is called instead of blk_queue_split(), and we can simplify nvme_map_bio() since it'll know that it's working with bios that don't have to be split. In fact, I think it would have little NVMe-specific in it at that point, so we could name __blk_bios_map_sg() better, export it to drivers and call it from nvme_map_bio(), which I think would make everybody happier. Actually, reading nvme_map_bio() (it's different since last I looked at it) it looks like nvme should already be able to handle arbitrary size bios? 
I do intend to rework the blk_bio_map_sg() (or add a new one?) to incrementally map as much of a bio as will fit in the provided scatterlist, but it looks like nvme has some odd restrictions where it's using BIOVEC_PHYS_MERGABLE()/BIOVEC_NOT_VIRT_MERGABLE() so I dunno if it's worth bothering to try and have it use generic code. Is nvme the only driver that has these kinds of restrictions on segment address offsets? If so, I guess there's no reason to make it generic. However we don't need an explicit split here: if the sg fills up (i.e. the places nvme_split_and_submit() is called), we can just mark the bio as partially completed (set bio->bi_iter = iter, i.e. use the iterator you passed to bio_for_each_segment), then increment bi_remaining (which just counts completions, i.e. bio_endio() calls before the bio is really completed) and resubmit the original bio. No need to allocate a split bio, or loop over the bio again in bio_split(). We used to manipulate the original bio to track partial completions, but I changed that for reasons that haven't quite yet materialized. If we move the bio's bi_iter, it will make it difficult to retry the original request on intermittent failures, and it will break the integrity verify if the device format supports protection information. It's also more performant to submit all parts at once rather than wait for the previous part to complete before sending the next. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2] NVMe: silence GCC warning on 32 bit
Looks good to me. This won't apply in linux-nvme yet and it may be a little while before it does, so this might be considered to go upstream through a different tree if you want this in sooner. On Tue, 4 Mar 2014, Paul Bolle wrote: Building nvme-core.o on 32 bit x86 triggers a rather impressive set of GCC warnings: In file included from drivers/block/nvme-core.c:20:0: drivers/block/nvme-core.c: In function 'nvme_submit_bio_queue': include/linux/bio.h:154:55: warning: 'bvprv.bv_offset' may be used uninitialized in this function [-Wmaybe-uninitialized] #define bvec_to_phys(bv) (page_to_phys((bv)-bv_page) + (unsigned long) (bv)-bv_offset) ^ drivers/block/nvme-core.c:498:23: note: 'bvprv.bv_offset' was declared here struct bio_vec bvec, bvprv; ^ In file included from drivers/block/nvme-core.c:20:0: include/linux/bio.h:154:55: warning: 'bvprv.bv_len' may be used uninitialized in this function [-Wmaybe-uninitialized] #define bvec_to_phys(bv) (page_to_phys((bv)-bv_page) + (unsigned long) (bv)-bv_offset) ^ drivers/block/nvme-core.c:498:23: note: 'bvprv.bv_len' was declared here struct bio_vec bvec, bvprv; ^ In file included from [...]/arch/x86/include/asm/page.h:70:0, from [...]/arch/x86/include/asm/processor.h:17, from [...]/arch/x86/include/asm/atomic.h:6, from include/linux/atomic.h:4, from include/linux/mutex.h:18, from include/linux/kernfs.h:13, from include/linux/sysfs.h:15, from include/linux/kobject.h:21, from include/linux/pci.h:28, from include/linux/nvme.h:23, from drivers/block/nvme-core.c:19: include/asm-generic/memory_model.h:31:53: warning: 'bvprv.bv_page' may be used uninitialized in this function [-Wmaybe-uninitialized] #define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \ ^ drivers/block/nvme-core.c:498:23: note: 'bvprv.bv_page' was declared here struct bio_vec bvec, bvprv; ^ These are false positives. Apparently GCC can't determine that bvprv will only be used if first is false and has, therefore, been initialized. 
It turned out hard to reorganize the code to help GCC understand the flow of this code. So take the easy way out and initialize bvprv to { NULL }. And, since we're touching this code, make first a bool. Signed-off-by: Paul Bolle pebo...@tiscali.nl --- v2: redone, as required by Keith's review. Note that initializing bvprv to { NULL } is already done twice in block/blk-merge.c, which inspired me to take the easy way out here. Also note that it's actually not clear to me why these warnings only trigger on 32 bit. I guess there's some int/long conversion lurking somewhere. I haven't found it. bvprv? Would that mean bio_vec private? But it looks like some temporary variable, so something like tmp may make more sense. Anyhow, still compile tested only (on 32 and 64 bit x86). bvprv == bio_vec previous drivers/block/nvme-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 51824d1..60f98be 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -495,11 +495,11 @@ static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, struct bio *bio, enum dma_data_direction dma_dir, int psegs) { - struct bio_vec bvec, bvprv; + struct bio_vec bvec, bvprv = { NULL }; struct bvec_iter iter; struct scatterlist *sg = NULL; int length = 0, nsegs = 0, split_len = bio-bi_iter.bi_size; - int first = 1; + bool first = true; if (nvmeq-dev-stripe_size) split_len = nvmeq-dev-stripe_size - @@ -525,7 +525,7 @@ static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, return nvme_split_and_submit(bio, nvmeq, split_len); length += bvec.bv_len; bvprv = bvec; - first = 0; + first = false; } iod-nents = nsegs; sg_mark_end(sg); -- 1.8.5.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v3 6/9] nvme: Use pci_enable_msi_range() and pci_enable_msix_range()
On Mon, 20 Jan 2014, Alexander Gordeev wrote: As a result of the deprecation of the MSI-X/MSI enablement functions pci_enable_msix() and pci_enable_msi_block(), all drivers using these two interfaces need to be updated to use the new pci_enable_msi_range() and pci_enable_msix_range() interfaces. Signed-off-by: Alexander Gordeev agord...@redhat.com Reviewed-by: Keith Busch keith.bu...@intel.com -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] NVMe: silence GCC warning on 32 bit
On Thu, 20 Feb 2014, Paul Bolle wrote: On Tue, 2014-02-18 at 10:02 +0100, Geert Uytterhoeven wrote: And these popped up in v3.14-rc1 on 32 bit x86. This patch makes these warnings go away. Compile tested only (on 32 and 64 bit x86). Review is appreciated, because the code I'm touching here is far from obvious to me. 8 From: Paul Bolle pebo...@tiscali.nl These are false positives. A bit of staring at the code reveals that struct bio_vec bvprv and int first operate in lockstep: if first is 1 bvprv isn't yet initialized and if first is 0 bvprv will be initialized. But if we convert bvprv to a pointer and initialize it to NULL we can do away with first. And it turns out the warning is gone if we do that. So that appears to be enough to help GCC understand the flow of this code. That's pretty much how it was done before the bio_vec iterators were merged, but I think there's a problem with this approach for this patch (see below). Signed-off-by: Paul Bolle pebo...@tiscali.nl --- drivers/block/nvme-core.c | 10 -- 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 51824d1..f9fb28b 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -495,11 +495,10 @@ static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, struct bio *bio, enum dma_data_direction dma_dir, int psegs) { - struct bio_vec bvec, bvprv; + struct bio_vec bvec, *bvprv = NULL; struct bvec_iter iter; struct scatterlist *sg = NULL; int length = 0, nsegs = 0, split_len = bio-bi_iter.bi_size; - int first = 1; if (nvmeq-dev-stripe_size) split_len = nvmeq-dev-stripe_size - @@ -508,10 +507,10 @@ static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, sg_init_table(iod-sg, psegs); bio_for_each_segment(bvec, bio, iter) { - if (!first BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { + if (bvprv BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { 
sg-length += bvec.bv_len; } else { - if (!first BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) + if (bvprv BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) return nvme_split_and_submit(bio, nvmeq, length); @@ -524,8 +523,7 @@ static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, if (split_len - length bvec.bv_len) return nvme_split_and_submit(bio, nvmeq, split_len); length += bvec.bv_len; - bvprv = bvec; - first = 0; + bvprv = bvec; The address of bvec doesn't change, so bvprv is still going to point to bvec on the next iteration instead of the previous bio_vec like we want. When the next iteration gets to this comparison: + if (bvprv BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { both bio_vec's have the same address. } iod-nents = nsegs; sg_mark_end(sg); -- 1.8.5.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 3/3] NVMe: Convert to blk-mq
On Fri, 18 Oct 2013, Matias Bjørling wrote: On 10/18/2013 05:13 PM, Keith Busch wrote: On Fri, 18 Oct 2013, Matias Bjorling wrote: The nvme driver implements itself as a bio-based driver. This primarily because of high lock congestion for high-performance nvm devices. To remove the congestion within the traditional block layer, a multi-queue block layer is being implemented. -result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs); -if (result = 0) +if (nvme_map_rq(nvmeq, iod, rq, dma_dir)) goto free_cmdid; -length = result; -cmnd-rw.command_id = cmdid; +length = blk_rq_bytes(rq); + +cmnd-rw.command_id = rq-tag; The command ids have to be unique on a submission queue. Since each namespace's blk-mq has its own 'tags' used as command ids here but share submission queues, what's stopping the tags for commands sent to namespace 1 from clashing with tags for namespace 2? I think this would work better if one blk-mq was created per device rather than namespace. It would fix the tag problem above and save a lot of memory potentially wasted on millions of requests allocated that can't be used. You're right. I didn't see the connection. In v3 I'll push struct request_queue to nvme_dev and map the queues appropriately. It will also fix the command id issues. Just anticipating a possible issue with the suggestion. Will this separate the logical block size from the request_queue? Each namespace can have a different format, so the block size and request_queue can't be tied together like it currently is for this to work.
Re: [PATCH 3/3] NVMe: Convert to blk-mq
On Tue, 22 Oct 2013, Matias Bjorling wrote: Den 22-10-2013 18:55, Keith Busch skrev: On Fri, 18 Oct 2013, Matias Bjørling wrote: On 10/18/2013 05:13 PM, Keith Busch wrote: On Fri, 18 Oct 2013, Matias Bjorling wrote: The nvme driver implements itself as a bio-based driver. This primarily because of high lock congestion for high-performance nvm devices. To remove the congestion within the traditional block layer, a multi-queue block layer is being implemented. -result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs); -if (result = 0) +if (nvme_map_rq(nvmeq, iod, rq, dma_dir)) goto free_cmdid; -length = result; -cmnd-rw.command_id = cmdid; +length = blk_rq_bytes(rq); + +cmnd-rw.command_id = rq-tag; The command ids have to be unique on a submission queue. Since each namespace's blk-mq has its own 'tags' used as command ids here but share submission queues, what's stopping the tags for commands sent to namespace 1 from clashing with tags for namespace 2? I think this would work better if one blk-mq was created per device rather than namespace. It would fix the tag problem above and save a lot of memory potentially wasted on millions of requests allocated that can't be used. You're right. I didn't see the connection. In v3 I'll push struct request_queue to nvme_dev and map the queues appropriately. It will also fix the command id issues. Just anticipating a possible issue with the suggestion. Will this separate the logical block size from the request_queue? Each namespace can have a different format, so the block size and request_queue can't be tied together like it currently is for this to work. If only a couple of different logical sizes are to be expected (1-4), we can keep a list of already initialized request queues, and use the one that match an already initialized? The spec allows a namespace to have up to 16 different block formats and they need not be the same 16 as another namespace on the same device. 
From a practical standpoint, I don't think devices will support more than a few formats, but even if you kept it to that many request queues, you just get back to conflicting command id tags and some wasted memory. Axboe, do you know of a better solution?
Re: [PATCH RFC 2/2] NVMe: rfc blk-mq support
On Tue, 8 Oct 2013, Matias Bjørling wrote: Convert the driver to blk mq. The patch consists of: * Initializion of mq data structures. * Convert function calls from bio to request data structures. * IO queues are split into an admin queue and io queues. * bio splits are removed as it should be handled by block layer. Signed-off-by: Matias Bjørling m...@bjorling.me I have no opinion right now if this is a good idea or not. I'll just comment on a couple issues on this implementation. Otherwise I think it's pretty neat and gave me a reason to explore multiqueues! First a couple minor suggestions: You might want to use REQ_END from the rq-cmd_flags to know if you should write the queue doorbell or not. Aggregating these would help most devices but we didn't have a good way of knowing before if this was the last request or not. Maybe change nvme_submit_bio_queue to return a BLK_MQ_RQ_QUEUE_ status so you don't need that switch statement after calling it. Must do something about requests that don't align to PRPs. I think you mentioned this in the outstanding work in [0/2]. struct nvme_queue *get_nvmeq(struct nvme_dev *dev) { - return dev-queues[get_cpu() + 1]; + get_cpu(); + return dev-admin_queue; } get_nvmeq now returns only the admin queue when it used to return only IO queues. This breaks NVME_IO and SG_IO ioctl handling. +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int i) +{ + struct nvme_ns *ns = data; + struct nvme_dev *dev = ns-dev; + struct nvme_queue *nq; + + nq = nvme_create_queue(dev, i + 1, hctx-queue_depth, i); + if (IS_ERR(nq)) + return PTR_ERR(nq); + + hctx-driver_data = nq; + + return 0; +} This right here is the biggest problem with the implemenation. It is going to fail for every namespace but the first one since each namespace registers a multiqueue and each mulitqueue requires a hw context to work. 
The number of queues is for the device, not namespace, so only the first namespace is going to successfully return from nvme_init_hctx; the rest will be unable to create an NVMe IO queue for trying to create one with already allocated QID. You should instead create the IO queues on the device like how it was done before then just set the hctx->driver_data to dev->queues[i + 1] or something like. +static enum blk_eh_timer_return nvme_timeout(struct request *rq) +{ + /* Currently the driver handle timeouts by itself */ + return BLK_EH_NOT_HANDLED; +} Need to do something with the command timeouts here or somewhere. You've changed the driver to poll only on the admin queue for timed out commands, and left the multiqueue timeout a no-op.
Re: [PATCH RFC 0/2] Convert from bio-based to blk-mq
On Tue, 8 Oct 2013, Jens Axboe wrote: On Tue, Oct 08 2013, Matthew Wilcox wrote: On Tue, Oct 08, 2013 at 11:34:20AM +0200, Matias Bjørling wrote: The nvme driver implements itself as a bio-based driver. This primarily because of high lock congestion for high-performance nvm devices. To remove the congestion, a multi-queue block layer is being implemented. Um, no. You'll crater performance by adding another memory allocation (of the struct request). multi-queue is not the solution. That's a rather jump to conclusions statement to make. As Matias mentioned, there are no extra fast path allocations. Once the tagging is converted as well, I'd be surprised if it performs worse than before. And that on top of a net reduction in code. blk-mq might not be perfect as it stands, but it's a helluva lot better than a bunch of flash based drivers with lots of duplicated code and mechanisms. We need to move away from that. -- Jens Axboe But this wastes copious amounts of memory on an NVMe device with more than 1 namespace. The hardware's queues are shared among all namespaces, so you can't possibly have all the struct requests in use. What would be better is if I can create one blk-mq for each device/host and attach multiple gendisks to that.
[PATCH 1/2] driver-core: allow asynchronous device shutdown
A patch to allow .shutdown to execute asynchronously. Some devices may take a long time to complete a shutdown, so this patch lets a driver safely shutdown multiple devices asynchronously. This uses an exclusive asynchronous domain so other unrelated async tasks can't cause shutdown to hold up indefinitely. Signed-off-by: Keith Busch keith.bu...@intel.com --- drivers/base/core.c|4 include/linux/device.h |1 + 2 files changed, 5 insertions(+) diff --git a/drivers/base/core.c b/drivers/base/core.c index 20da3ad..71b83bb 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -10,6 +10,7 @@ * */ +#include linux/async.h #include linux/device.h #include linux/err.h #include linux/init.h @@ -1926,6 +1927,8 @@ out: } EXPORT_SYMBOL_GPL(device_move); +ASYNC_DOMAIN_EXCLUSIVE(shutdown_domain); +EXPORT_SYMBOL(shutdown_domain); /** * device_shutdown - call -shutdown() on each device to shutdown. */ @@ -1986,6 +1989,7 @@ void device_shutdown(void) spin_lock(devices_kset-list_lock); } spin_unlock(devices_kset-list_lock); + async_synchronize_full_domain(shutdown_domain); } /* diff --git a/include/linux/device.h b/include/linux/device.h index d1d1c05..95ed9f1 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -997,6 +997,7 @@ static inline int devtmpfs_mount(const char *mountpoint) { return 0; } /* drivers/base/power/shutdown.c */ extern void device_shutdown(void); +extern struct async_domain shutdown_domain; /* debugging and troubleshooting/diagnostic helpers. */ extern const char *dev_driver_string(const struct device *dev); -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] NVMe: Complete shutdown asynchronously
Some devices may take a long time to complete a shutdown. The driver can perform the device shutdown asynchronously so multiple devices may be done in parallel and speed up the overall system shutdown. PCI disables MSI/MSI-x after the driver's .shutdown returns, so the driver enables the INTx irq if performing the shutdown asynchronously. Signed-off-by: Keith Busch keith.bu...@intel.com --- drivers/block/nvme-core.c | 28 ++-- include/linux/nvme.h |1 + 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index cd8a8bc7..a9982b8 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -13,6 +13,7 @@ */ #include linux/nvme.h +#include linux/async.h #include linux/bio.h #include linux/bitops.h #include linux/blkdev.h @@ -1401,7 +1402,7 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev) cc = (readl(dev-bar-cc) ~NVME_CC_SHN_MASK) | NVME_CC_SHN_NORMAL; writel(cc, dev-bar-cc); - timeout = 2 * HZ + jiffies; + timeout = 20 * HZ + jiffies; while ((readl(dev-bar-csts) NVME_CSTS_SHST_MASK) != NVME_CSTS_SHST_CMPLT) { msleep(100); @@ -2739,6 +2740,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev-reset_workfn = nvme_reset_failed_dev; INIT_WORK(dev-reset_work, nvme_reset_workfn); dev-pci_dev = pdev; + dev-intx_irq = pdev-irq; pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); if (result) @@ -2791,10 +2793,32 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) return result; } +static void nvme_async_shutdown(void *data, async_cookie_t cookie) +{ + struct nvme_dev *dev = data; + nvme_dev_shutdown(dev); + free_irq(dev-intx_irq, raw_nvmeq(dev, 0)); +} + static void nvme_shutdown(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_dev_shutdown(dev); + struct nvme_queue *adminq = raw_nvmeq(dev, 0); + bool use_async = true; + + /* MSI/MSI-x are disabled after returning from this function */ + if 
(dev-pci_dev-msi_enabled || dev-pci_dev-msix_enabled) { + if (request_irq(dev-intx_irq, nvme_irq, IRQF_SHARED, + adminq-irqname, adminq)) + use_async = false; + else + pci_intx(dev-pci_dev, 1); + } + if (!use_async) + nvme_dev_shutdown(dev); + else + async_schedule_domain(nvme_async_shutdown, dev, + shutdown_domain); } static void nvme_remove(struct pci_dev *pdev) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1813cfd..3df7c48 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -80,6 +80,7 @@ struct nvme_dev { unsigned queue_count; unsigned online_queues; unsigned max_qid; + unsigned intx_irq; int q_depth; u32 db_stride; u32 ctrl_config; -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 0/2] Asynchronous device shutdown
Device shutdown synchronization was recently reverted due to complications from unrelated 'async_schedule's holding up shutdown forever and the fact that no device driver was making use of the capability anyway. I would like to make use of this capability though; I'm told by some vendors that devices I develop drivers for may take a while to safely shutdown and there might be a lot of these devices in a machine, so we would benefit from letting this happen in parallel. This patch set adds a slightly different shutdown synchronization using a new domain, and I've added an implementation to the nvm-express driver here so there's at least one user, assuming this is acceptable. Keith Busch (2): driver-core: allow asynchronous device shutdown NVMe: Complete shutdown asynchronously drivers/base/core.c |4 drivers/block/nvme-core.c | 28 ++-- include/linux/device.h|1 + include/linux/nvme.h |1 + 4 files changed, 32 insertions(+), 2 deletions(-) -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v7] NVMe: conversion to blk-mq
On Tue, 10 Jun 2014, Matias Bjørling wrote: This converts the current NVMe driver to utilize the blk-mq layer. I'd like to run xfstests on this, but it is failing mkfs.xfs. I honestly don't know much about this area, but I think this may be from the recent chunk sectors patch causing a __bio_add_page to reject adding a new page. [ 762.968002] [ cut here ] [ 762.973238] kernel BUG at fs/direct-io.c:753! [ 762.978189] invalid opcode: [#1] SMP [ 762.983003] Modules linked in: nvme parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp llc jfs joydev hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd microcode pcspkr ehci_pci ehci_hcd usbcore lpc_ich ioatdma mfd_core usb_common acpi_cpufreq i2c_i801 evdev wmi tpm_tis ipmi_si tpm ipmi_msghandler processor thermal_sys button ext4 crc16 jbd2 mbcache dm_mod nbd sg sr_mod cdrom sd_mod crc_t10dif crct10dif_common isci libsas ahci igb libahci scsi_transport_sas ptp pps_core libata i2c_algo_bit i2c_core scsi_mod dca [ 763.066172] CPU: 0 PID: 12870 Comm: mkfs.xfs Not tainted 3.15.0-rc8+ #13 [ 763.073735] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013 [ 763.085290] task: 88042809c510 ti: 880423c58000 task.ti: 880423c58000 [ 763.093728] RIP: 0010:[81142ddf] [81142ddf] dio_send_cur_page+0xa1/0xa8 [ 763.103325] RSP: 0018:880423c5ba68 EFLAGS: 00010202 [ 763.109333] RAX: 0001 RBX: 880423c5bbf8 RCX: 1000 [ 763.117410] RDX: 0001 RSI: 88042e4c8f00 RDI: 8804274e7008 [ 763.125487] RBP: 88042834b0c0 R08: R09: 0006 [ 763.133569] R10: 0006 R11: 880423c5b8d0 R12: 880423c5bb90 [ 763.141645] R13: 1000 R14: R15: 2e939002 [ 763.149720] FS: 7f6052596740() GS:88043f60() knlGS: [ 763.158891] CS: 0010 DS: ES: CR0: 80050033 [ 763.165411] CR2: 01a23000 CR3: 
000420638000 CR4: 000407f0 [ 763.173495] Stack: [ 763.175848] 880423c5bbf8 88042834b0c0 ea000e530cc0 81142e8e [ 763.184543] 88042834b0c0 8804 0006 88042834b0c0 [ 763.193245] 0008 ea000e530cc0 [ 763.201949] Call Trace: [ 763.204793] [81142e8e] ? submit_page_section+0xa8/0x112 [ 763.211809] [8114388c] ? do_blockdev_direct_IO+0x7da/0xad8 [ 763.219124] [810e74b8] ? zone_statistics+0x46/0x79 [ 763.225659] [810d7766] ? get_page_from_freelist+0x625/0x727 [ 763.233064] [811409b1] ? I_BDEV+0x8/0x8 [ 763.238516] [81140cc5] ? blkdev_direct_IO+0x52/0x57 [ 763.245134] [811409b1] ? I_BDEV+0x8/0x8 [ 763.250601] [810d1749] ? generic_file_direct_write+0xe2/0x145 [ 763.258193] [810d18ea] ? __generic_file_aio_write+0x13e/0x225 [ 763.265782] [81140c00] ? blkdev_aio_write+0x42/0xa6 [ 763.272417] [8111801a] ? do_sync_write+0x50/0x73 [ 763.278752] [81118b36] ? vfs_write+0x9f/0xfc [ 763.284706] [81118f48] ? SyS_pwrite64+0x66/0x8c [ 763.290952] [8139fe12] ? system_call_fastpath+0x16/0x1b [ 763.297958] Code: 89 ef e8 a0 fd ff ff 48 8b 53 78 49 8d 4c 24 30 48 89 de 48 89 ef e8 87 fc ff ff 85 c0 75 0e 48 89 df e8 17 ff ff ff 85 c0 74 cd 0f 0b 5b 5d 41 5c c3 41 57 4d 89 cf 41 56 41 89 ce 41 55 45 89 [ 763.324319] RIP [81142ddf] dio_send_cur_page+0xa1/0xa8 [ 763.331311] RSP 880423c5ba68 [ 763.335359] ---[ end trace d57f8af6b5f01282 ]---
Re: [PATCH v7] NVMe: conversion to blk-mq
On Tue, 10 Jun 2014, Jens Axboe wrote: On Jun 10, 2014, at 9:52 AM, Keith Busch keith.bu...@intel.com wrote: On Tue, 10 Jun 2014, Matias Bjørling wrote: This converts the current NVMe driver to utilize the blk-mq layer. I'd like to run xfstests on this, but it is failing mkfs.xfs. I honestly don't know much about this area, but I think this may be from the recent chunk sectors patch causing a __bio_add_page to reject adding a new page. Gah, yes that's a bug in the chunk patch. It must always allow a single page at any offset. I'll test and send out a fix. I have two devices, one formatted 4k, the other 512. The 4k is used as the TEST_DEV and 512 is used as SCRATCH_DEV. I'm always hitting a BUG when unmounting the scratch dev in xfstests generic/068. The bug looks like nvme was trying to use an SGL that doesn't map correctly to a PRP. Also, it doesn't look like this driver can recover from an unresponsive device, leaving tasks in uniterruptible sleep state forever. Still looking into that one though; as far as I can tell the device is perfectly fine, but lots of Cancelling I/O messages are getting logged. [ 478.580863] [ cut here ] [ 478.586111] kernel BUG at drivers/block/nvme-core.c:486! 
[ 478.592130] invalid opcode: [#1] SMP [ 478.596963] Modules linked in: xfs nvme parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp llc jfs joydev hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd microcode ehci_pci ehci_hcd pcspkr usbcore acpi_cpufreq lpc_ich ioatdma mfd_core usb_common i2c_i801 evdev wmi tpm_tis ipmi_si tpm ipmi_msghandler processor thermal_sys button ext4 crc16 jbd2 mbcache dm_mod nbd sg sd_mod sr_mod crc_t10dif cdrom crct10dif_common crc32c_intel isci libsas ahci libahci scsi_transport_sas igb libata ptp pps_core i2c_algo_bit scsi_mod i2c_core dca [last unloaded: nvme] [ 478.682913] CPU: 5 PID: 17969 Comm: fsstress Not tainted 3.15.0-rc8+ #19 [ 478.690510] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013 [ 478.702126] task: 88042bc18cf0 ti: 88042d3f task.ti: 88042d3f [ 478.710624] RIP: 0010:[a04bc8e6] [a04bc8e6] nvme_setup_prps+0x1b8/0x1eb [nvme] [ 478.720971] RSP: 0018:88042d3f38c8 EFLAGS: 00010286 [ 478.727013] RAX: 0014 RBX: 88042b96e400 RCX: 000803d0e000 [ 478.735096] RDX: 0015 RSI: 0246 RDI: 88042b96e6b0 [ 478.743177] RBP: 00015e00 R08: R09: 0e00 [ 478.751264] R10: 0e00 R11: 88042d3f3900 R12: 88042b96e6d0 [ 478.759349] R13: 880823f40e00 R14: 88042b96e710 R15: fc00 [ 478.767435] FS: 7f92eb29c700() GS:88043f6a() knlGS: [ 478.776614] CS: 0010 DS: ES: CR0: 80050033 [ 478.783143] CR2: 7f92e401ff18 CR3: 00042b5d5000 CR4: 000407e0 [ 478.791218] Stack: [ 478.793558] 8808229367c0 000805205000 88040014 0a0029abf540 [ 478.802302] 88082bcd8140 03100020 0017 000823f40e00 [ 478.811045] 0e00 880827de3300 88042b96e400 88082ad60c40 [ 478.819789] Call Trace: [ 478.822630] [a04bca5e] ? nvme_queue_rq+0x145/0x33b [nvme] [ 478.829859] [811c854f] ? 
blk_mq_make_request+0xd7/0x140 [ 478.836891] [811bf583] ? generic_make_request+0x98/0xd5 [ 478.843906] [811c0240] ? submit_bio+0x100/0x109 [ 478.850161] [81142bc2] ? dio_bio_submit+0x67/0x86 [ 478.856596] [81143a08] ? do_blockdev_direct_IO+0x956/0xad8 [ 478.863924] [a0592a2e] ? __xfs_get_blocks+0x410/0x410 [xfs] [ 478.871338] [a0591c12] ? xfs_vm_direct_IO+0xda/0x146 [xfs] [ 478.878652] [a0592a2e] ? __xfs_get_blocks+0x410/0x410 [xfs] [ 478.886066] [a0592b00] ? xfs_finish_ioend_sync+0x1a/0x1a [xfs] [ 478.893775] [810d1749] ? generic_file_direct_write+0xe2/0x145 [ 478.901385] [a05e81c0] ? xfs_file_dio_aio_write+0x1ba/0x208 [xfs] [ 478.909391] [a059c43d] ? xfs_file_aio_write+0xc4/0x157 [xfs] [ 478.916892] [8111801a] ? do_sync_write+0x50/0x73 [ 478.923227] [81118b36] ? vfs_write+0x9f/0xfc [ 478.929173] [81118e22] ? SyS_write+0x56/0x8a [ 478.935122] [8139fe52] ? system_call_fastpath+0x16/0x1b [ 478.942137] Code: 48 63 c2 41 81 ef 00 10 00 00 ff c2 83 7c 24 1c 00 49 89 4c c5 00 7e 35 48 81 c1 00 10 00 00 41 83 ff 00 0f 8f 6f ff ff ff 74 02 0f 0b 4c 89 e7 89 54 24 10 e8 03 e3 d2 e0 8b 54 24 10 49 89 c4 [ 478.968952] RIP [a04bc8e6] nvme_setup_prps+0x1b8/0x1eb [nvme] [ 478.976638] RSP 88042d3f38c8 [ 478.980699] ---[ end
Re: [PATCH v7] NVMe: conversion to blk-mq
On Tue, 10 Jun 2014, Jens Axboe wrote: On 06/10/2014 01:29 PM, Keith Busch wrote: I have two devices, one formatted 4k, the other 512. The 4k is used as the TEST_DEV and 512 is used as SCRATCH_DEV. I'm always hitting a BUG when unmounting the scratch dev in xfstests generic/068. The bug looks like nvme was trying to use an SGL that doesn't map correctly to a PRP. I'm guessing it's some of the coalescing settings, since the driver is now using the generic block rq mapping. Ok, sounds right. I mentioned in a way earlier review it doesn't look like a request that doesn't conform to a PRP list would get split anymore, and this test seems to confirm that. Can we create something that will allow a driver to add DMA constraints to a request queue with the rules of a PRP list? -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v7] NVMe: conversion to blk-mq
On Tue, 10 Jun 2014, Jens Axboe wrote: On 06/10/2014 03:10 PM, Keith Busch wrote: On Tue, 10 Jun 2014, Jens Axboe wrote: On 06/10/2014 01:29 PM, Keith Busch wrote: I have two devices, one formatted 4k, the other 512. The 4k is used as the TEST_DEV and 512 is used as SCRATCH_DEV. I'm always hitting a BUG when unmounting the scratch dev in xfstests generic/068. The bug looks like nvme was trying to use an SGL that doesn't map correctly to a PRP. I'm guessing it's some of the coalescing settings, since the driver is now using the generic block rq mapping. Ok, sounds right. I mentioned in a way earlier review it doesn't look like a request that doesn't conform to a PRP list would get split anymore, and this test seems to confirm that. Can we create something that will allow a driver to add DMA constraints to a request queue with the rules of a PRP list? I haven't even looked at the rules - can you briefly outline them? From a quick look, seems it does prp chaining for every 512 entries. But nvme_setup_prps() looks like voodoo to an uninitiated, it could have used a comment or two :-) Yeah, nvme_setup_prps is probably the least readable code in this driver. Maybe some comments are in order here... There are two rules for an SGL to be mappable to a PRP: 1. Every element must have zero page offset, except the first. 2. Every element must end on a page boundary, except the last. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v7] NVMe: conversion to blk-mq
On Wed, 11 Jun 2014, Matias Bjørling wrote: I've rebased nvmemq_review and added two patches from Jens that add support for requests with single range virtual addresses. Keith, will you take it for a spin and see if it fixes 068 for you? There might still be a problem with some flushes, I'm looking into this. So far so good: it passed the test that was previously failing. I'll let the remaining xfstests run and see what happens.
Re: [PATCH v7] NVMe: conversion to blk-mq
On Thu, 12 Jun 2014, Matias Bjørling wrote: On 06/12/2014 12:51 AM, Keith Busch wrote: So far so good: it passed the test that was previously failing. I'll let the remaining xfstests run and see what happens. Great. The flushes were a fluke. I haven't been able to reproduce. Cool, most of the tests are passing, except there is some really weird stuff with the timeout handling. You've got two different places with the same two prints, so I was a little confused where they were coming from. I've got some more things to try to debug this, but this is what I have so far: It looks like the abort_complete callback is broken. First, the dev_warn there makes no sense because you're pointing to the admin queue's abort request, not the IO queue's request you're aborting. Then you call cancel_cmd_info on the same command you're completing but it looks like you're expecting to be doing this on the IO request you meant to abort, but that could cause double completions.
Re: [PATCH v7] NVMe: conversion to blk-mq
On Thu, 12 Jun 2014, Keith Busch wrote: On Thu, 12 Jun 2014, Matias Bjørling wrote: On 06/12/2014 12:51 AM, Keith Busch wrote: So far so good: it passed the test that was previously failing. I'll let the remaining xfstests run and see what happens. Great. The flushes were a fluke. I haven't been able to reproduce. Cool, most of the tests are passing, except there is some really weird stuff with the timeout handling. You've got two different places with the same two prints, so I was a little confused where they were coming from. I've got some more things to try to debug this, but this is what I have so far: It looks like the abort_complete callback is broken. First, the dev_warn there makes no sense because you're pointing to the admin queue's abort request, not the IO queue's request you're aborting. Then you call cancel_cmd_info on the same command you're completing but it looks like you're expecting to be doing this on the IO request you meant to abort, but that could cause double completions. I'll attach the diff I wrote to make this work. Lots of things had to change: Returning BLK_EH_HANDLED from the timeout handler isn't the right thing to do since the request isn't completed by the driver in line with this call and returning this from the driver caused the block layer to return success though no completion occurred yet, so it was lying. The abort_completion handler shouldn't be trying to do things for the command it tried to abort. It could have completed before, after, or still be owned by the controller at this point, and we don't want it to be making decisions. When forcefully cancelling all IO, you don't want to check if the device is initialized before doing that. We're failing/removing the device after we've shut her down, there won't be another opportunity to return status for the outstanding reqs after this. When cancelling IOs, we have to check if the hwctx has valid tags for some reason.
I have 32 cores in my system and as many queues, but blk-mq is only using half of those queues and freed the tags for the rest after they'd been initialized without telling the driver. Why is blk-mq not making utilizing all my queues? The diff below provides a way to synthesize a failed controller by using the sysfs pci/devices/bdf/reset. I just have the device removed from the polling list so we don't accidently trigger a failure when we can't read the device registers. diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 1419bbf..4f9e4d8 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -170,8 +170,9 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { struct nvme_dev *dev = data; - struct nvme_queue *nvmeq = dev-queues[(hctx_idx % dev-queue_count) - + 1]; + struct nvme_queue *nvmeq = dev-queues[ + (hctx_idx % dev-queue_count) + 1]; + /* nvmeq queues are shared between namespaces. We assume here that * blk-mq map the tags so they match up with the nvme queue tags */ if (!nvmeq-hctx) @@ -245,26 +246,13 @@ static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn) static void abort_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { - struct request *req; - struct nvme_cmd_info *aborted = ctx; - struct nvme_queue *a_nvmeq = aborted-nvmeq; - struct blk_mq_hw_ctx *hctx = nvmeq-hctx; - void *a_ctx; - nvme_completion_fn a_fn; - static struct nvme_completion a_cqe = { - .status = cpu_to_le16(NVME_SC_ABORT_REQ 1), - }; + struct request *req = ctx; - req = blk_mq_tag_to_rq(hctx-tags, cqe-command_id); + u16 status = le16_to_cpup(cqe-status) 1; + u32 result = le32_to_cpup(cqe-result); blk_put_request(req); - if (!cqe-status) - dev_warn(nvmeq-q_dmadev, Could not abort I/O %d QID %d, - req-tag, nvmeq-qid); - - a_ctx = cancel_cmd_info(aborted, a_fn); - a_fn(a_nvmeq, a_ctx, a_cqe); - + dev_warn(nvmeq-q_dmadev, Abort status:%x result:%x, status, 
result); ++nvmeq-dev-abort_limit; } @@ -391,6 +379,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, { struct nvme_iod *iod = ctx; struct request *req = iod-private; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); u16 status = le16_to_cpup(cqe-status) 1; @@ -404,10 +393,14 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, } else req-errors = 0; - if (iod-nents) { + if (cmd_rq-aborted) + dev_warn(nvmeq-dev-pci_dev-dev, + completing aborted command with status
Re: [PATCH v7] NVMe: conversion to blk-mq
On Fri, 13 Jun 2014, Jens Axboe wrote: On 06/12/2014 06:06 PM, Keith Busch wrote: When cancelling IOs, we have to check if the hwctx has valid tags for some reason. I have 32 cores in my system and as many queues, but It's because unused queues are torn down, to save memory. blk-mq is only using half of those queues and freed the tags for the rest after they'd been initialized without telling the driver. Why is blk-mq not utilizing all my queues? You have 31 + 1 queues, so only 31 mappable queues. blk-mq symmetrically distributes these, so you should have a core + thread sibling on 16 queues. And yes, that leaves 15 idle hardware queues for this specific case. I like the symmetry, it makes it more predictable if things are spread out evenly. You'll see performance differences on some workloads that depend on which cores your process runs and which one services an interrupt. We can play games with cores and see what happens on my 32 cpu system. I usually run 'irqbalance --hint=exact' for best performance, but that doesn't do anything with blk-mq since the affinity hint is gone. I ran the following script several times on each version of the driver. This will pin a sequential read test to cores 0, 8, and 16. The device is local to NUMA node on cores 0-7 and 16-23; the second test runs on the remote node and the third on the thread sibling of 0. Results were averaged, but very consistent anyway. The system was otherwise idle. # for i in $(seq 0 8 16); do let cpu=1$i cpu=`echo $cpu | awk '{printf %#x\n, $1}'` taskset ${cpu} dd if=/dev/nvme0n1 of=/dev/null bs=4k count=100 iflag=direct done Here are the performance drops observed with blk-mq with the existing driver as baseline: CPU : Drop :. 0 : -6% 8 : -36% 16 : -12% -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v7] NVMe: conversion to blk-mq
On Fri, 13 Jun 2014, Jens Axboe wrote: On 06/13/2014 09:05 AM, Keith Busch wrote: Here are the performance drops observed with blk-mq with the existing driver as baseline: CPU : Drop :. 0 : -6% 8 : -36% 16 : -12% We need the hints back for sure, I'll run some of the same tests and verify to be sure. Out of curiosity, what is the topology like on your box? Are 0/1 siblings, and 0..7 one node? 0-7 are different cores on node 0, with 16-23 being their thread siblings. Similar setup with 8-15 and 24-32 on node 1. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v7] NVMe: conversion to blk-mq
On Fri, 13 Jun 2014, Jens Axboe wrote: OK, same setup as mine. The affinity hint is really screwing us over, no question about it. We just need a: irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, hctx->cpumask); in the ->init_hctx() methods to fix that up. That brings us to roughly the same performance, except for the cases where the dd is run on the thread sibling of the core handling the interrupt. And granted, with the 16 queues used, that'll happen on blk-mq. But since you have 32 threads and just 31 IO queues, the non blk-mq driver must end up sharing for some cases, too. So what do we care most about here? Consistency, or using all queues at all costs? I think we want to use all h/w queues regardless of mismatched sharing. A 24 thread server shouldn't use more of the hardware than a 32. You're right, the current driver shares the queues on anything with 32 or more cpus with this NVMe controller, but we wrote an algorithm that allocates the most and tries to group them with their nearest neighbors. One performance oddity we observe is that servicing the interrupt on the thread sibling of the core that submitted the I/O is the worst performing cpu you can choose; it's actually better to use a different core on the same node. At least that's true as long as you're not utilizing the cpus for other work, so YMMV. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH V3] NVMe: basic conversion to blk-mq
On Wed, 28 May 2014, Matias Bjørling wrote: This converts the current NVMe driver to utilize the blk-mq layer. I am concerned about device hot removal since the h/w queues can be freed at any time. I *think* blk-mq helps with this in that the driver will not see a new request after calling blk_cleanup_queue. If you can confirm that's true and that blk-mq waits for all requests active in the driver to return to the block layer, then we're probably okay in this path. That wasn't true as a bio based driver which is why we are cautious with all the locking and barriers. But what about the IOCTL paths? It also doesn't look like we're handling the case where the SGL can't map to a PRP. +static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct nvme_iod *iod = ctx; - struct bio *bio = iod-private; + struct request *req = iod-private; + u16 status = le16_to_cpup(cqe-status) 1; - if (unlikely(status)) { - if (!(status NVME_SC_DNR || - bio-bi_rw REQ_FAILFAST_MASK) - (jiffies - iod-start_time) IOD_TIMEOUT) { - if (!waitqueue_active(nvmeq-sq_full)) - add_wait_queue(nvmeq-sq_full, - nvmeq-sq_cong_wait); - list_add_tail(iod-node, nvmeq-iod_bio); - wake_up(nvmeq-sq_full); - return; - } - } if (iod-nents) { - dma_unmap_sg(nvmeq-q_dmadev, iod-sg, iod-nents, - bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - nvme_end_io_acct(bio, iod-start_time); + dma_unmap_sg(nvmeq-dev-pci_dev-dev, iod-sg, iod-nents, + rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); } nvme_free_iod(nvmeq-dev, iod); - if (status) - bio_endio(bio, -EIO); + + if (unlikely(status)) + req-errors = -EIO; else - bio_endio(bio, 0); + req-errors = 0; + + blk_mq_complete_request(req); } Is blk-mq going to retry intermittently failed commands for me? It doesn't look like it will. 
+static int nvme_submit_flush_sync(struct nvme_queue *nvmeq, struct nvme_ns *ns) +{ + struct request *req; + struct nvme_command cmnd; + + req = blk_mq_alloc_request(ns-queue, WRITE, GFP_KERNEL, false); + if (!req) + return -ENOMEM; + + nvme_setup_flush(cmnd, ns, req-tag); + nvme_submit_sync_cmd(req, cmnd, NULL, NVME_IO_TIMEOUT); return 0; } It looks like this function above is being called from an interrupt context where we are already holding a spinlock. The sync command will try to take that same lock. /* - * Called with local interrupts disabled and the q_lock held. May not sleep. + * Called with preemption disabled, may not sleep. */ -static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio) +static int nvme_submit_req_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct request *req) { + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_iod *iod; - int psegs = bio_phys_segments(ns-queue, bio); - int result; + enum dma_data_direction dma_dir; + int psegs = req-nr_phys_segments; - if ((bio-bi_rw REQ_FLUSH) psegs) - return nvme_split_flush_data(nvmeq, bio); - - iod = nvme_alloc_iod(psegs, bio-bi_iter.bi_size, GFP_ATOMIC); + iod = nvme_alloc_iod(psegs, blk_rq_bytes(req), GFP_ATOMIC); if (!iod) - return -ENOMEM; + return BLK_MQ_RQ_QUEUE_BUSY; + + iod-private = req; + + nvme_set_info(cmd, iod, req_completion); + + if ((req-cmd_flags REQ_FLUSH) psegs) { + struct flush_cmd_info *flush_cmd = kmalloc( + sizeof(struct flush_cmd_info), GFP_KERNEL); The comment above says may not sleep, but using GFP_KERNEL here. I actually think it is safe to sleep, though since you're not taking a lock until later, so maybe you can change all the allocs to GFP_KERNEL? +static struct blk_mq_ops nvme_mq_admin_ops = { + .queue_rq = nvme_queue_request, I think you would get some unpredictable behavior if a request really did go through nvme_queue_request on the admin queue. I don't see how that would happen; just sayin. 
static void nvme_create_io_queues(struct nvme_dev *dev) { - unsigned i, max; + unsigned i; - max = min(dev-max_qid, num_online_cpus()); - for (i = dev-queue_count; i = max; i++) + for (i = dev-queue_count; i = dev-max_qid; i++)
Re: [PATCH V3] NVMe: basic conversion to blk-mq
On Thu, 29 May 2014, Jens Axboe wrote: On 2014-05-28 21:07, Keith Busch wrote: Barring any bugs in the code, then yes, this should work. On the scsi-mq side, extensive error injection and pulling has been done, and it seems to hold up fine now. The ioctl path would need to be audited. It's a little different than scsi. This would be like pulling the drive and the HBA. In any case, it still looks like it works as expected. +static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct nvme_iod *iod = ctx; -struct bio *bio = iod-private; +struct request *req = iod-private; + u16 status = le16_to_cpup(cqe-status) 1; -if (unlikely(status)) { -if (!(status NVME_SC_DNR || -bio-bi_rw REQ_FAILFAST_MASK) -(jiffies - iod-start_time) IOD_TIMEOUT) { -if (!waitqueue_active(nvmeq-sq_full)) -add_wait_queue(nvmeq-sq_full, -nvmeq-sq_cong_wait); -list_add_tail(iod-node, nvmeq-iod_bio); -wake_up(nvmeq-sq_full); -return; -} -} Is blk-mq going to retry intermittently failed commands for me? It doesn't look like it will. Not sure what kind of behavior you are looking for here. If you can expand on the above a bit, I'll gladly help sort it out. Only the driver really knows if a particular request should be failed hard or retried. So you'd probably have to track retry counts in the request and reinsert/end as appropriate. Some vendor's drives return a failure status for a command but fully expect a retry to be successul. It'd be addressing this bug: bugzilla.kernel.org/show_bug.cgi?id=61061 The code being removed at the top of this function in the latest patch was taking care of the requeuing. I wasn't sure how many retries would be necessary, so I capped it at a total time instead of total tries. I'm told from 3rd parties that what we're doing is successful in their tests. 
-- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v4] NVMe: basic conversion to blk-mq
On Thu, 29 May 2014, Matias Bjørling wrote: This converts the current NVMe driver to utilize the blk-mq layer. I'm pretty darn sure this new nvme_remove can cause a process with an open reference to use queues after they're freed in the nvme_submit_sync_command path, maybe even the admin tags too. @@ -2802,7 +2574,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_remove(dev); nvme_dev_shutdown(dev); nvme_free_queues(dev, 0); - rcu_barrier(); + nvme_free_admin_tags(dev); nvme_release_instance(dev); nvme_release_prp_pools(dev); kref_put(dev-kref, nvme_free_dev);
Re: [PATCH v8] NVMe: convert to blk-mq
On Fri, 13 Jun 2014, Matias Bjørling wrote: This converts the current NVMe driver to utilize the blk-mq layer. static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) { - struct nvme_dev *dev = pci_get_drvdata(pdev); + struct nvme_dev *dev = pci_get_drvdata(pdev); - if (prepare) - nvme_dev_shutdown(dev); - else - nvme_dev_resume(dev); + spin_lock(dev_list_lock); + if (prepare) + list_del_init(dev-node); + else + list_add(dev-node, dev_list); + spin_unlock(dev_list_lock); } + if (nvme_create_queue(dev-queues[i], i)) break; } The above change was just error injection test code so you can cause a device to become unresponsive and trigger the timeout handling. This latest is otherwise stable on my dev machine.
Re: [PATCH v5] conversion to blk-mq
On Mon, 2 Jun 2014, Matias Bjørling wrote: Hi Matthew and Keith, Here is an updated patch with the feedback from the previous days. It's against Jens' for-3.16/core tree. You may use the nvmemq_wip_review branch at: I'm testing this on my normal hardware now. As I feared, hot removal doesn't work while the device is actively being managed. Here's the oops: [ 1267.018283] BUG: unable to handle kernel NULL pointer dereference at 0004 [ 1267.018288] IP: [811c51b8] blk_mq_map_queue+0xf/0x1e [ 1267.018292] PGD b5ed5067 PUD b57e2067 PMD 0 [ 1267.018294] Oops: [#1] SMP [ 1267.018343] Modules linked in: nvme parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp llc joydev jfs hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd microcode ehci_pci ehci_hcd usbcore pcspkr lpc_ich acpi_cpufreq ioatdma mfd_core usb_common i2c_i801 evdev wmi ipmi_si tpm_tis ipmi_msghandler tpm processor thermal_sys button ext4 crc16 jbd2 mbcache dm_mod nbd sg sr_mod cdrom sd_mod crc_t10dif crct10dif_common isci libsas ahci scsi_transport_sas libahci igb libata ptp pps_core i2c_algo_bit scsi_mod i2c_core dca [last unloaded: nvme] [ 1267.018346] CPU: 1 PID: 6618 Comm: nvme_id_ctrl Tainted: GW 3.15.0-rc1+ #2 [ 1267.018347] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013 [ 1267.018349] task: 88042879eef0 ti: 8800b5f14000 task.ti: 8800b5f14000 [ 1267.018354] RIP: 0010:[811c51b8] [811c51b8] blk_mq_map_queue+0xf/0x1e [ 1267.018356] RSP: 0018:8800b5f15db0 EFLAGS: 00010206 [ 1267.018357] RAX: RBX: e8fbffa21e80 RCX: 88042f3952e0 [ 1267.018359] RDX: 1c10 RSI: 0001 RDI: 8800b5e9f008 [ 1267.018360] RBP: 88042d993480 R08: 8800b5e18cc0 R09: [ 1267.018362] R10: 0ca0 R11: 0ca0 R12: 8800b5f15e94 [ 1267.018363] R13: 
3a98 R14: 7fffc7b44c40 R15: [ 1267.018365] FS: 7feb4cb05700() GS:88043f62() knlGS: [ 1267.018367] CS: 0010 DS: ES: CR0: 80050033 [ 1267.018369] CR2: 0004 CR3: b50f2000 CR4: 000407e0 [ 1267.018369] Stack: [ 1267.018372] 811c6334 fffc 88042d993480 fffc [ 1267.018375] a0532d73 0ca0 fff4 88042562b440 [ 1267.018378] 88042d853000 1000 a05348f3 c9001518b2a8 [ 1267.018378] Call Trace: [ 1267.018383] [811c6334] ? blk_mq_free_request+0x37/0x48 [ 1267.018388] [a0532d73] ? __nvme_submit_admin_cmd+0x7f/0x8a [nvme] [ 1267.018392] [a05348f3] ? nvme_user_admin_cmd+0x144/0x1b1 [nvme] [ 1267.018397] [a053497d] ? nvme_dev_ioctl+0x1d/0x2b [nvme] [ 1267.018399] [81125916] ? do_vfs_ioctl+0x3f2/0x43b [ 1267.018402] [8105ca6b] ? finish_task_switch+0x84/0xc4 [ 1267.018406] [81394a8c] ? __schedule+0x45c/0x4f0 [ 1267.018408] [811259ad] ? SyS_ioctl+0x4e/0x7d [ 1267.018411] [8139c6d2] ? system_call_fastpath+0x16/0x1b [ 1267.018435] Code: 8b 4a 38 48 39 4e 38 72 12 74 06 b8 01 00 00 00 c3 48 8b 4a 60 48 39 4e 60 73 f0 c3 66 66 66 66 90 48 8b 87 e0 00 00 00 48 63 f6 8b 14 b0 48 8b 87 f8 00 00 00 48 8b 04 d0 c3 89 ff f0 48 0f ab [ 1267.018438] RIP [811c51b8] blk_mq_map_queue+0xf/0x1e [ 1267.018439] RSP 8800b5f15db0 [ 1267.018440] CR2: 0004 [ 1267.018443] ---[ end trace 1e244ae5f3ceb23b ]---
Re: [PATCH v5] conversion to blk-mq
Depending on the timing, it might fail in alloc instead of free: Jun 2 16:45:40 kbgrz1 kernel: [ 265.421243] NULL pointer dereference at (null) Jun 2 16:45:40 kbgrz1 kernel: [ 265.434284] PGD 429acf067 PUD 42ce28067 PMD 0 Jun 2 16:45:40 kbgrz1 kernel: [ 265.439565] Oops: [#1] SMP Jun 2 16:45:40 kbgrz1 kernel: [ 265.443413] Modules linked in: nvme parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp llc jfs joydev hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd ehci_pci ehci_hcd microcode usbcore pcspkr ioatdma lpc_ich usb_common mfd_core i2c_i801 evdev wmi acpi_cpufreq tpm_tis ipmi_si tpm ipmi_msghandler processor thermal_sys button ext4 crc16 jbd2 mbcache dm_mod nbd sg sd_mod sr_mod crc_t10dif cdrom crct10dif_common isci libsas igb ahci scsi_transport_sas libahci ptp libata pps_core i2c_algo_bit i2c_core scsi_mod dca Jun 2 16:45:40 kbgrz1 kernel: [ 265.526398] CPU: 4 PID: 5454 Comm: nvme_id_ctrl Not tainted 3.15.0-rc1+ #2 Jun 2 16:45:40 kbgrz1 kernel: [ 265.534181] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013 Jun 2 16:45:40 kbgrz1 kernel: [ 265.545789] task: 88042e418390 ti: 8804283e6000 task.ti: 8804283e6000 Jun 2 16:45:40 kbgrz1 kernel: [ 265.554270] RIP: 0010:[811c51b8] [811c51b8] blk_mq_map_queue+0xf/0x1e Jun 2 16:45:40 kbgrz1 kernel: [ 265.563720] RSP: 0018:8804283e7d80 EFLAGS: 00010286 Jun 2 16:45:40 kbgrz1 kernel: [ 265.569755] RAX: RBX: 880035a06008 RCX: 000f4240 Jun 2 16:45:40 kbgrz1 kernel: [ 265.577830] RDX: 0028a360 RSI: RDI: 880035a06008 Jun 2 16:45:40 kbgrz1 kernel: [ 265.585904] RBP: 88043f68 R08: R09: 1000 Jun 2 16:45:40 kbgrz1 kernel: [ 265.593979] R10: 1000 R11: 0410 R12: 00d0 Jun 2 16:45:40 kbgrz1 kernel: [ 265.602053] R13: 0001 R14: R15: Jun 
2 16:45:40 kbgrz1 kernel: [ 265.610134] FS: 7f74b8bc9700() GS:88043f68() knlGS: Jun 2 16:45:40 kbgrz1 kernel: [ 265.619303] CS: 0010 DS: ES: CR0: 80050033 Jun 2 16:45:40 kbgrz1 kernel: [ 265.625824] CR2: CR3: 0004292c4000 CR4: 000407e0 Jun 2 16:45:40 kbgrz1 kernel: [ 265.633889] Stack: Jun 2 16:45:40 kbgrz1 kernel: [ 265.636236] 811c689f fff4 8804283e7e10 Jun 2 16:45:40 kbgrz1 kernel: [ 265.644949] 8804283e7e94 007d 7fff4f8a73b0 Jun 2 16:45:40 kbgrz1 kernel: [ 265.653653] a055acc7 0246 2d0aaec0 88042d0aaec0 Jun 2 16:45:40 kbgrz1 kernel: [ 265.662358] Call Trace: Jun 2 16:45:40 kbgrz1 kernel: [ 265.665194] [811c689f] ? blk_mq_alloc_request+0x54/0xd5 Jun 2 16:45:40 kbgrz1 kernel: [ 265.672217] [a055acc7] ? __nvme_submit_admin_cmd+0x2d/0x68 [nvme] Jun 2 16:45:40 kbgrz1 kernel: [ 265.680196] [a055c879] ? nvme_user_admin_cmd+0x144/0x1b1 [nvme] Jun 2 16:45:40 kbgrz1 kernel: [ 265.687987] [a055c903] ? nvme_dev_ioctl+0x1d/0x2b [nvme] Jun 2 16:45:40 kbgrz1 kernel: [ 265.695107] [81125916] ? do_vfs_ioctl+0x3f2/0x43b Jun 2 16:45:40 kbgrz1 kernel: [ 265.701547] [8105ca6b] ? finish_task_switch+0x84/0xc4 Jun 2 16:45:40 kbgrz1 kernel: [ 265.708382] [81394a8c] ? __schedule+0x45c/0x4f0 Jun 2 16:45:40 kbgrz1 kernel: [ 265.714603] [811259ad] ? SyS_ioctl+0x4e/0x7d Jun 2 16:45:40 kbgrz1 kernel: [ 265.720555] [8139c6d2] ? system_call_fastpath+0x16/0x1b Jun 2 16:45:40 kbgrz1 kernel: [ 265.727560] Code: 8b 4a 38 48 39 4e 38 72 12 74 06 b8 01 00 00 00 c3 48 8b 4a 60 48 39 4e 60 73 f0 c3 66 66 66 66 90 48 8b 87 e0 00 00 00 48 63 f6 8b 14 b0 48 8b 87 f8 00 00 00 48 8b 04 d0 c3 89 ff f0 48 0f ab Jun 2 16:45:40 kbgrz1 kernel: [ 265.760706] RSP 8804283e7d80 Jun 2 16:45:40 kbgrz1 kernel: [ 265.764705] CR2: Jun 2 16:45:40 kbgrz1 kernel: [ 265.768531] ---[ end trace 785048a51785f51e ]--- On Mon, 2 Jun 2014, Keith Busch wrote: On Mon, 2 Jun 2014, Matias Bjørling wrote: Hi Matthew and Keith, Here is an updated patch with the feedback from the previous days. It's against Jens' for-3.16/core tree. 
You may use the nvmemq_wip_review branch at: I'm testing this on my normal hardware now. As I feared, hot removal doesn't work while the device is actively being managed. Here's the oops: [ 1267.018283] BUG: unable to handle kernel NULL pointer dereference at 0004 [ 1267.018288] IP: [811c51b8] blk_mq_map_queue+0xf/0x1e [ 1267.018292
Re: [PATCH v5] conversion to blk-mq
On Tue, 3 Jun 2014, Matias Bjorling wrote: Keith, will you take the nvmemq_wip_v6 branch for a spin? Thanks! Still fails as before: [ 88.933881] BUG: unable to handle kernel NULL pointer dereference at 0014 [ 88.942900] IP: [811c51b8] blk_mq_map_queue+0xf/0x1e [ 88.949605] PGD 427be0067 PUD 425495067 PMD 0 [ 88.954915] Oops: [#1] SMP [ 88.958787] Modules linked in: nvme parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp llc joydev jfs hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd microcode ehci_pci ehci_hcd pcspkr usbcore lpc_ich ioatdma usb_common mfd_core evdev i2c_i801 wmi acpi_cpufreq tpm_tis tpm ipmi_si ipmi_msghandler processor thermal_sys button ext4 crc16 jbd2 mbcache dm_mod nbd sg sd_mod sr_mod crc_t10dif cdrom crct10dif_common isci ahci libsas igb libahci scsi_transport_sas ptp pps_core i2c_algo_bit libata i2c_core scsi_mod dca [ 89.042529] CPU: 5 PID: 4544 Comm: nvme_id_ctrl Not tainted 3.15.0-rc1+ #3 [ 89.050295] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013 [ 89.061856] task: 88042bbdb0d0 ti: 88042c24c000 task.ti: 88042c24c000 [ 89.070305] RIP: 0010:[811c51b8] [811c51b8] blk_mq_map_queue+0xf/0x1e [ 89.079747] RSP: 0018:88042c24dda0 EFLAGS: 00010202 [ 89.085795] RAX: RBX: e8fbffaa1b00 RCX: 88042e8ec4b0 [ 89.093868] RDX: 8be6 RSI: 0005 RDI: 88042abdf048 [ 89.101950] RBP: 88042c2b81c0 R08: 88042c24c000 R09: 880035c58410 [ 89.110033] R10: 88043f6b2dc0 R11: 88043f6b2dc0 R12: 88042c24de94 [ 89.118119] R13: 007d R14: 7fff0cd892b0 R15: [ 89.126210] FS: 7f39866c5700() GS:88043f6a() knlGS: [ 89.135387] CS: 0010 DS: ES: CR0: 80050033 [ 89.141916] CR2: 0014 CR3: 00042b387000 CR4: 000407e0 [ 89.149997] Stack: [ 89.152353] 811c6334 fffc 88042c2b81c0 
88042c24de10 [ 89.161096] a054dcbb 0246 fffc 8800b5e05cc0 [ 89.169839] fff4 8800b5e05cc0 88042bbc3000 1000 [ 89.178583] Call Trace: [ 89.181429] [811c6334] ? blk_mq_free_request+0x37/0x48 [ 89.188360] [a054dcbb] ? __nvme_submit_admin_cmd+0x52/0x68 [nvme] [ 89.196349] [a054f761] ? nvme_user_admin_cmd+0x144/0x1b1 [nvme] [ 89.204150] [a054f7eb] ? nvme_dev_ioctl+0x1d/0x2b [nvme] [ 89.211278] [81125916] ? do_vfs_ioctl+0x3f2/0x43b [ 89.217710] [81117e35] ? vfs_write+0xde/0xfc [ 89.223657] [811259ad] ? SyS_ioctl+0x4e/0x7d [ 89.229622] [8139c6d2] ? system_call_fastpath+0x16/0x1b [ 89.236636] Code: 8b 4a 38 48 39 4e 38 72 12 74 06 b8 01 00 00 00 c3 48 8b 4a 60 48 39 4e 60 73 f0 c3 66 66 66 66 90 48 8b 87 e0 00 00 00 48 63 f6 8b 14 b0 48 8b 87 f8 00 00 00 48 8b 04 d0 c3 89 ff f0 48 0f ab [ 89.263435] RIP [811c51b8] blk_mq_map_queue+0xf/0x1e [ 89.270237] RSP 88042c24dda0 [ 89.274237] CR2: 0014 [ 89.278095] ---[ end trace 54c0e8cbb1fe2ec3 ]--- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v5] conversion to blk-mq
On Tue, 3 Jun 2014, Matias Bjorling wrote: Keith, will you take the nvmemq_wip_v6 branch for a spin? Thanks! BTW, if you want to test this out yourself, it's pretty simple to recreate. I just run a simple user admin program sending nvme passthrough commands in a tight loop, then run: # echo 1 /sys/bus/pci/devices/bdf/remove Still fails as before: [ 88.933881] BUG: unable to handle kernel NULL pointer dereference at 0014 [ 88.942900] IP: [811c51b8] blk_mq_map_queue+0xf/0x1e [ 88.949605] PGD 427be0067 PUD 425495067 PMD 0 [ 88.954915] Oops: [#1] SMP [ 88.958787] Modules linked in: nvme parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp llc joydev jfs hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd microcode ehci_pci ehci_hcd pcspkr usbcore lpc_ich ioatdma usb_common mfd_core evdev i2c_i801 wmi acpi_cpufreq tpm_tis tpm ipmi_si ipmi_msghandler processor thermal_sys button ext4 crc16 jbd2 mbcache dm_mod nbd sg sd_mod sr_mod crc_t10dif cdrom crct10dif_common isci ahci libsas igb libahci scsi_transport_sas ptp pps_core i2c_algo_bit libata i2c_core scsi_mod dca [ 89.042529] CPU: 5 PID: 4544 Comm: nvme_id_ctrl Not tainted 3.15.0-rc1+ #3 [ 89.050295] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013 [ 89.061856] task: 88042bbdb0d0 ti: 88042c24c000 task.ti: 88042c24c000 [ 89.070305] RIP: 0010:[811c51b8] [811c51b8] blk_mq_map_queue+0xf/0x1e [ 89.079747] RSP: 0018:88042c24dda0 EFLAGS: 00010202 [ 89.085795] RAX: RBX: e8fbffaa1b00 RCX: 88042e8ec4b0 [ 89.093868] RDX: 8be6 RSI: 0005 RDI: 88042abdf048 [ 89.101950] RBP: 88042c2b81c0 R08: 88042c24c000 R09: 880035c58410 [ 89.110033] R10: 88043f6b2dc0 R11: 88043f6b2dc0 R12: 88042c24de94 [ 89.118119] R13: 007d R14: 7fff0cd892b0 R15: [ 
89.126210] FS: 7f39866c5700() GS:88043f6a() knlGS: [ 89.135387] CS: 0010 DS: ES: CR0: 80050033 [ 89.141916] CR2: 0014 CR3: 00042b387000 CR4: 000407e0 [ 89.149997] Stack: [ 89.152353] 811c6334 fffc 88042c2b81c0 88042c24de10 [ 89.161096] a054dcbb 0246 fffc 8800b5e05cc0 [ 89.169839] fff4 8800b5e05cc0 88042bbc3000 1000 [ 89.178583] Call Trace: [ 89.181429] [811c6334] ? blk_mq_free_request+0x37/0x48 [ 89.188360] [a054dcbb] ? __nvme_submit_admin_cmd+0x52/0x68 [nvme] [ 89.196349] [a054f761] ? nvme_user_admin_cmd+0x144/0x1b1 [nvme] [ 89.204150] [a054f7eb] ? nvme_dev_ioctl+0x1d/0x2b [nvme] [ 89.211278] [81125916] ? do_vfs_ioctl+0x3f2/0x43b [ 89.217710] [81117e35] ? vfs_write+0xde/0xfc [ 89.223657] [811259ad] ? SyS_ioctl+0x4e/0x7d [ 89.229622] [8139c6d2] ? system_call_fastpath+0x16/0x1b [ 89.236636] Code: 8b 4a 38 48 39 4e 38 72 12 74 06 b8 01 00 00 00 c3 48 8b 4a 60 48 39 4e 60 73 f0 c3 66 66 66 66 90 48 8b 87 e0 00 00 00 48 63 f6 8b 14 b0 48 8b 87 f8 00 00 00 48 8b 04 d0 c3 89 ff f0 48 0f ab [ 89.263435] RIP [811c51b8] blk_mq_map_queue+0xf/0x1e [ 89.270237] RSP 88042c24dda0 [ 89.274237] CR2: 0014 [ 89.278095] ---[ end trace 54c0e8cbb1fe2ec3 ]--- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v5] conversion to blk-mq
On Wed, 4 Jun 2014, Matias Bjørling wrote: On 06/04/2014 12:27 AM, Keith Busch wrote: On Tue, 3 Jun 2014, Matias Bjorling wrote: Keith, will you take the nvmemq_wip_v6 branch for a spin? Thanks! BTW, if you want to test this out yourself, it's pretty simple to recreate. I just run a simple user admin program sending nvme passthrough commands in a tight loop, then run: # echo 1 /sys/bus/pci/devices/bdf/remove I can't recreate- I use the nvme_get_feature program to continuously hit the ioctl path, testing using your nvme qemu branch. Okay, I'll try to fix it. I think there are multiple problems, but the first is that since there is no gendisk associated with the admin_q, the QUEUE_FLAG_INIT_DONE flag is never set, and blk_mq_queue_enter returns successful whenever this flag is not set even though this queue is dying, so we enter with all its invalid pointers. Here's a couple diff's. The first fixes the kernel oops by not entering a dying queue. The second is just a few unrelated clean-ups in nvme-core.c. I still can't complete my current hot-removal test, though; something appears hung, but haven't nailed that down yet. Please let me know what you think! Thanks. 
diff --git a/block/blk-mq.c b/block/blk-mq.c index d10013b..5a9ae8a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -105,6 +105,10 @@ static int blk_mq_queue_enter(struct request_queue *q) __percpu_counter_add(q-mq_usage_counter, 1, 100); smp_wmb(); /* we have problems to freeze the queue if it's initializing */ + if (blk_queue_dying(q)) { + __percpu_counter_add(q-mq_usage_counter, -1, 100); + ret = -ENODEV; + } if (!blk_queue_bypass(q) || !blk_queue_init_done(q)) return 0; diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 243a5e6..22e9c82 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -98,7 +98,6 @@ struct nvme_queue { u8 cq_phase; u8 cqe_seen; u8 q_suspended; - cpumask_var_t cpu_mask; struct async_cmd_info cmdinfo; struct blk_mq_hw_ctx *hctx; }; @@ -1055,8 +1054,6 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) (void *)nvmeq-cqes, nvmeq-cq_dma_addr); dma_free_coherent(nvmeq-q_dmadev, SQ_SIZE(nvmeq-q_depth), nvmeq-sq_cmds, nvmeq-sq_dma_addr); - if (nvmeq-qid) - free_cpumask_var(nvmeq-cpu_mask); kfree(nvmeq); } @@ -1066,9 +1063,9 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) for (i = dev-queue_count - 1; i = lowest; i--) { struct nvme_queue *nvmeq = dev-queues[i]; - nvme_free_queue(nvmeq); dev-queue_count--; dev-queues[i] = NULL; + nvme_free_queue(nvmeq); } } @@ -1142,9 +1139,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, if (!nvmeq-sq_cmds) goto free_cqdma; - if (qid !zalloc_cpumask_var(nvmeq-cpu_mask, GFP_KERNEL)) - goto free_sqdma; - nvmeq-q_dmadev = dmadev; nvmeq-dev = dev; snprintf(nvmeq-irqname, sizeof(nvmeq-irqname), nvme%dq%d, @@ -1162,9 +1156,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, return nvmeq; - free_sqdma: - dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq-sq_cmds, - nvmeq-sq_dma_addr); free_cqdma: dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq-cqes, nvmeq-cq_dma_addr);
Re: [PATCH v5] conversion to blk-mq
On Wed, 4 Jun 2014, Jens Axboe wrote: On 06/04/2014 12:28 PM, Keith Busch wrote: Are you testing against 3.13? You really need the current tree for this, otherwise I'm sure you'll run into issues (as you appear to be :-) I'm using Matias' current tree: git://github.com/MatiasBjorling/linux-collab nvmeq_wip_6 Is this the right one to use? Looks like a 3.15rc1+. Also ... obviously that's the wrong diff from before. I didn't save my file before creating the diff. Should have looked like this: diff --git a/block/blk-mq.c b/block/blk-mq.c index d10013b..d1b986c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -105,7 +105,8 @@ static int blk_mq_queue_enter(struct request_queue *q) __percpu_counter_add(q-mq_usage_counter, 1, 100); smp_wmb(); /* we have problems to freeze the queue if it's initializing */ - if (!blk_queue_bypass(q) || !blk_queue_init_done(q)) + if (!blk_queue_dying(q) (!blk_queue_bypass(q) || + !blk_queue_init_done(q))) return 0; __percpu_counter_add(q-mq_usage_counter, -1, 100); -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] nvme: Cleanup nvme_dev_start()
On Mon, 20 Jan 2014, Alexander Gordeev wrote: This update fixes an oddity when a device is first added and then removed from dev_list in case of initialization failure, instead of just being added in case of success. Signed-off-by: Alexander Gordeev agord...@redhat.com --- drivers/block/nvme-core.c | 19 --- 1 files changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e1e4ad4..e4e12be 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2105,29 +2105,26 @@ static int nvme_dev_start(struct nvme_dev *dev) if (result) goto unmap; - spin_lock(dev_list_lock); - list_add(dev-node, dev_list); - spin_unlock(dev_list_lock); - result = set_queue_count(dev, num_online_cpus()); if (result == -EBUSY) For whatever reason, some of these devices unfortunately don't support legacy interrupts. We expect an interrupt when the completion is posted for setting the queue count, but failing that, we rely on the polling thread to invoke the completion, so the device needs to be in the dev_list before calling set_queue_count. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] nvme: Cleanup nvme_dev_start() and fix IRQ leak
On Mon, 20 Jan 2014, Alexander Gordeev wrote: This is an attempt to make handling of admin queue in a single scope. This update also fixes a IRQ leak in case nvme_setup_io_queues() failed to allocate enough iomem and bailed out with -ENOMEM errno. This definitely seems to improve the code flow, but this leak was already fixed in the latest linux-nvme tree with this commit: http://git.infradead.org/users/willy/linux-nvme.git/commit/c5dc9192d52a4a3a479f701e97386347d454af84 Signed-off-by: Alexander Gordeev agord...@redhat.com --- drivers/block/nvme-core.c | 44 +++- 1 files changed, 23 insertions(+), 21 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 3e1ae55..e1e4ad4 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1287,6 +1287,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) if (result) return result; + dev-entry[0].vector = pdev-pci_dev-irq; result = queue_request_irq(dev, nvmeq, nvme admin); if (result) return result; @@ -1297,6 +1298,11 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } +static int nvme_teardown_admin_queue(struct nvme_dev *dev) +{ + free_irq(dev-entry[0].vector, dev-queues[0]); +} + struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, unsigned long addr, unsigned length) { @@ -1744,17 +1750,10 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) return 4096 + ((nr_io_queues + 1) (dev-db_stride + 3)); } -static int nvme_setup_io_queues(struct nvme_dev *dev) +static int nvme_setup_io_queues(struct nvme_dev *dev, int nr_io_queues) { struct pci_dev *pdev = dev-pci_dev; - int result, cpu, i, vecs, nr_io_queues, size, q_depth; - - nr_io_queues = num_online_cpus(); - result = set_queue_count(dev, nr_io_queues); - if (result 0) - return result; - if (result nr_io_queues) - nr_io_queues = result; + int result, cpu, i, vecs, size, q_depth; size = db_bar_size(dev, nr_io_queues); if (size 8192) { @@ -1771,20 
+1770,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev-queues[0]-q_db = dev-dbs; } - /* Deregister the admin queue's interrupt */ - free_irq(dev-entry[0].vector, dev-queues[0]); - for (i = 0; i nr_io_queues; i++) dev-entry[i].entry = i; vecs = pci_enable_msix_range(pdev, dev-entry, 1, nr_io_queues); if (vecs 0) { vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32)); - if (vecs 0) { + if (vecs 0) vecs = 1; - } else { - for (i = 0; i vecs; i++) - dev-entry[i].vector = i + pdev-irq; - } + for (i = 0; i vecs; i++) + dev-entry[i].vector = i + pdev-irq; } /* @@ -1928,7 +1922,6 @@ static int nvme_dev_map(struct nvme_dev *dev) if (pci_enable_device_mem(pdev)) return result; - dev-entry[0].vector = pdev-irq; pci_set_master(pdev); bars = pci_select_bars(pdev, IORESOURCE_MEM); if (pci_request_selected_regions(pdev, bars, nvme)) @@ -2116,11 +2109,20 @@ static int nvme_dev_start(struct nvme_dev *dev) list_add(dev-node, dev_list); spin_unlock(dev_list_lock); - result = nvme_setup_io_queues(dev); - if (result result != -EBUSY) + result = set_queue_count(dev, num_online_cpus()); + if (result == -EBUSY) + return -EBUSY; + + nvme_teardown_admin_queue(dev); + + if (result) goto disable; - return result; + result = nvme_setup_io_queues(dev, result); + if (result) + goto disable; + + return 0; disable: spin_lock(dev_list_lock); -- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] nvme: Cleanup nvme_dev_start() and fix IRQ leak
On Tue, 21 Jan 2014, Alexander Gordeev wrote: This is an attempt to make handling of admin queue in a single scope. This update also fixes an IRQ leak in case nvme_setup_io_queues() failed to allocate enough iomem and bailed out with -ENOMEM errno. Signed-off-by: Alexander Gordeev agord...@redhat.com --- +static void nvme_teardown_admin_queue(struct nvme_dev *dev) +{ + nvme_disable_queue(dev, 0); + nvme_free_queue(dev-queues[0]); +} @@ -2402,11 +2398,20 @@ static int nvme_dev_start(struct nvme_dev *dev) list_add(dev-node, dev_list); spin_unlock(dev_list_lock); - result = nvme_setup_io_queues(dev); - if (result result != -EBUSY) + result = set_queue_count(dev, num_online_cpus()); + if (result == -EBUSY) + return -EBUSY; + + nvme_teardown_admin_queue(dev); Oh no! Your new teardown function is freeing the admin queue, but it would be used immediately after that in nvme_setup_io_queues ... + + if (result) goto disable; ... but you'll never actually get to setup io queues because the 'result' here is non-zero if we were successful, and is the number of queues the controller can allocate. I think you meant to do this instead: + if (result 0) - return result; + result = nvme_setup_io_queues(dev, result); + if (result) + goto disable; + + return 0; -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 5/9] nvme: Fix invalid call to irq_set_affinity_hint()
On Fri, 17 Jan 2014, Bjorn Helgaas wrote: On Fri, Jan 17, 2014 at 9:02 AM, Alexander Gordeev agord...@redhat.com wrote: In case MSI-X and MSI initialization failed the function irq_set_affinity_hint() is called with uninitialized value in dev-entry[0].vector. This update fixes the issue. dev-entry[0].vector is initialized in nvme_dev_map(), and it's used for free_irq() above the area of your patch, so I don't think this is actually a bug, though it might be somewhat confusing. It is confusing, but there's a reason. :) We send a single command using legacy irq to discover how many msix vectors we want. The legacy entry needs to be set some time before calling request_irq in nvme_configure_admin_queue, but also within nvme_dev_start (for power-management). I don't think there's a place to set it that won't look odd when looking at nvme_setup_io_queues. I settled on 'nvme_dev_map' was because 'nvme_dev_unmap' invalidates the entries, so this seemed to provide some amount of symmetry. Signed-off-by: Alexander Gordeev agord...@redhat.com --- drivers/block/nvme-core.c | 12 ++-- 1 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 26d03fa..e292450 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1790,15 +1790,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) vecs = 32; for (;;) { result = pci_enable_msi_block(pdev, vecs); - if (result == 0) { - for (i = 0; i vecs; i++) - dev-entry[i].vector = i + pdev-irq; - break; + if (result 0) { + vecs = result; + continue; } else if (result 0) { vecs = 1; - break; } - vecs = result; + for (i = 0; i vecs; i++) + dev-entry[i].vector = i + pdev-irq; + break; } } -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH] Let device drivers disable msi on shutdown
On Thu, 10 Jul 2014, Bjorn Helgaas wrote: [+cc LKML, Greg KH for driver core async shutdown question] On Tue, Jun 24, 2014 at 10:48:57AM -0600, Keith Busch wrote: To provide context why I want to do this asynchronously, NVM-Express has one PCI device per controller, of which there could be dozens in a system, and each one may take many seconds (I've heard over ten in some cases) to safely shut down. I don't see anything in device_shutdown() that would wait for this sort of asynchronous shutdown to complete. So how do we know it's finished before we turn off the power, reset, kexec, etc.? If we need to do asynchronous shutdown, it seems like we need some sort of driver core infrastructure to manage that. Yes, good point! To address that, I did submit this patch: http://lists.infradead.org/pipermail/linux-nvme/2014-May/000827.html I need to fix the EXPORT_SYMBOL_GPL usage for a v2, but before that, I want to know the reason the driver can't use MSIx in an async shutdown, and came to the patch mentioned above. I'd originally had the async shutdown use legacy interrupts, but I know some NVMe devices do not support legacy, so can't use my original proposal. If I can't rely on MSI/MSI-x being enabled in an async shutdown, then I have to add polling, which I suppose we can live with. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v8] NVMe: convert to blk-mq
On Tue, 24 Jun 2014, Matias Bjorling wrote: Den 16-06-2014 17:57, Keith Busch skrev: This latest is otherwise stable on my dev machine. May I add an Acked-by from you? Totally up to Willy, but my feeling is not just yet. I see the value this driver provides, but I would need to give this to a customer and hear their results before adding an ack. It maybe a couple more weeks before they'll be able to try it. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v8] NVMe: convert to blk-mq
On Tue, 24 Jun 2014, Matias Bjørling wrote: On Tue, Jun 24, 2014 at 10:33 PM, Keith Busch keith.bu...@intel.com wrote: On Tue, 24 Jun 2014, Matias Bjorling wrote: Den 16-06-2014 17:57, Keith Busch skrev: This latest is otherwise stable on my dev machine. May I add an Acked-by from you? Totally up to Willy, but my feeling is not just yet. I see the value this driver provides, but I would need to give this to a customer and hear their results before adding an ack. It maybe a couple more weeks before they'll be able to try it. I was hoping to get an OK before that, to make sure it is picked up for the 3.17 release. Should I pack it up in a specific way to help it along? I think what you're doing is the right way to go. I trust you removed the test code from your next iteration, and this does pass my tests on my machine... I was just hoping to hear what some other people with better macro level benchmarking abilities would say. So okay, you can put my Acked-by on this, but I don't have all the performance numbers yet.
[PATCH] x86: ioapic: Fix irq_free_descs count
Signed-off-by: Keith Busch keith.bu...@intel.com Cc: Thomas Gleixner t...@linutronix.de Cc: x...@kernel.org --- kernel/irq/irqdesc.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 7339e42..1487a12 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -455,9 +455,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_hwirqs); */ void irq_free_hwirqs(unsigned int from, int cnt) { - int i; + int i, j; - for (i = from; cnt 0; i++, cnt--) { + for (i = from, j = cnt; j 0; i++, j--) { irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); arch_teardown_hwirq(i); } -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] x86: ioapic: Fix irq_free_descs count
On Mon, 30 Jun 2014, David Rientjes wrote: On Mon, 30 Jun 2014, Keith Busch wrote: Signed-off-by: Keith Busch keith.bu...@intel.com Cc: Thomas Gleixner t...@linutronix.de Cc: x...@kernel.org Acked-by: David Rientjes rient...@google.com This is definitely a fix for genirq: Provide generic hwirq allocation facility, but the changelog doesn't describe what the problem is and the title that this somehow fixes irq_free_descs() doesn't make any sense. My mistake, I used the component from the commit I bisected the bug down to. It's the equivalent of just doing - irq_free_descs(from, cnt); + irq_free_descs(from, i - from); I'd suggest the patch title be changed to genirq: Fix memory leak when calling irq_free_hwirqs() and the changelog state irq_free_hwirqs() always calls irq_free_descs() with a cnt == 0 which makes it a no-op since the interrupt count to free is decremented in itself. Yes, I like your suggestion. Should I submit a v2, or will maintainer replace the changelog? -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] genirq: Fix memory leak when calling irq_free_hwirqs()
irq_free_hwirqs() always calls irq_free_descs() with a cnt == 0 which makes it a no-op since the interrupt count to free is decremented in itself. Fixes: 7b6ef1262549f6afc5c881aaef80beb8fd15f908 Signed-off-by: Keith Busch keith.bu...@intel.com Cc: Thomas Gleixner t...@linutronix.de Acked-by: David Rientjes rient...@google.com --- kernel/irq/irqdesc.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 7339e42..1487a12 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -455,9 +455,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_hwirqs); */ void irq_free_hwirqs(unsigned int from, int cnt) { - int i; + int i, j; - for (i = from; cnt 0; i++, cnt--) { + for (i = from, j = cnt; j 0; i++, j--) { irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); arch_teardown_hwirq(i); } -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] block: Fix dev_t lifetime allocations
Releases the dev_t minor when all references are closed to prevent another device from acquiring the same major/minor. Since the partition's release may be invoked from call_rcu's soft-irq context, the ext_dev_idr's mutex had to be replaced with a spinlock so as not so sleep. Signed-off-by: Keith Busch keith.bu...@intel.com --- This was briefly discussed here: http://lists.infradead.org/pipermail/linux-nvme/2014-August/001120.html This patch goes one step further and fixes the same problem for partitions and disks. block/genhd.c | 18 +- block/partition-generic.c |2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 791f419..561134c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -28,10 +28,10 @@ struct kobject *block_depr; /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT(1 MINORBITS) -/* For extended devt allocation. ext_devt_mutex prevents look up +/* For extended devt allocation. ext_devt_lock prevents look up * results from going away underneath its user. */ -static DEFINE_MUTEX(ext_devt_mutex); +static DEFINE_SPINLOCK(ext_devt_lock); static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; @@ -420,9 +420,9 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) } /* allocate ext devt */ - mutex_lock(ext_devt_mutex); + spin_lock(ext_devt_lock); idx = idr_alloc(ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_KERNEL); - mutex_unlock(ext_devt_mutex); + spin_unlock(ext_devt_lock); if (idx 0) return idx == -ENOSPC ? 
-EBUSY : idx; @@ -447,9 +447,9 @@ void blk_free_devt(dev_t devt) return; if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - mutex_lock(ext_devt_mutex); + spin_lock(ext_devt_lock); idr_remove(ext_devt_idr, blk_mangle_minor(MINOR(devt))); - mutex_unlock(ext_devt_mutex); + spin_unlock(ext_devt_lock); } } @@ -665,7 +665,6 @@ void del_gendisk(struct gendisk *disk) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); - blk_free_devt(disk_to_dev(disk)-devt); } EXPORT_SYMBOL(del_gendisk); @@ -690,13 +689,13 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) } else { struct hd_struct *part; - mutex_lock(ext_devt_mutex); + spin_lock(ext_devt_lock); part = idr_find(ext_devt_idr, blk_mangle_minor(MINOR(devt))); if (part get_disk(part_to_disk(part))) { *partno = part-partno; disk = part_to_disk(part); } - mutex_unlock(ext_devt_mutex); + spin_unlock(ext_devt_lock); } return disk; @@ -1098,6 +1097,7 @@ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + blk_free_devt(dev-devt); disk_release_events(disk); kfree(disk-random); disk_replace_part_tbl(disk, NULL); diff --git a/block/partition-generic.c b/block/partition-generic.c index 789cdea..0d9e5f9 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -211,6 +211,7 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { struct hd_struct *p = dev_to_part(dev); + blk_free_devt(dev-devt); free_part_stats(p); free_part_info(p); kfree(p); @@ -253,7 +254,6 @@ void delete_partition(struct gendisk *disk, int partno) rcu_assign_pointer(ptbl-last_lookup, NULL); kobject_put(part-holder_dir); device_del(part_to_dev(part)); - blk_free_devt(part_devt(part)); hd_struct_put(part); } -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv2] block: Fix dev_t minor allocation lifetime
Releases the dev_t minor when all references are closed to prevent another device from acquiring the same major/minor. Since the partition's release may be invoked from call_rcu's soft-irq context, the ext_dev_idr's mutex had to be replaced with a spinlock so as not so sleep. Signed-off-by: Keith Busch keith.bu...@intel.com --- v1-v2: Applied comments from Willy: fixed gfp mask in idr_alloc to not wait, and preload. block/genhd.c | 24 ++-- block/partition-generic.c |2 +- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 791f419..09da5e4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -28,10 +28,10 @@ struct kobject *block_depr; /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT(1 MINORBITS) -/* For extended devt allocation. ext_devt_mutex prevents look up +/* For extended devt allocation. ext_devt_lock prevents look up * results from going away underneath its user. */ -static DEFINE_MUTEX(ext_devt_mutex); +static DEFINE_SPINLOCK(ext_devt_lock); static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; @@ -420,9 +420,13 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) } /* allocate ext devt */ - mutex_lock(ext_devt_mutex); - idx = idr_alloc(ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_KERNEL); - mutex_unlock(ext_devt_mutex); + idr_preload(GFP_KERNEL); + + spin_lock(ext_devt_lock); + idx = idr_alloc(ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT); + spin_unlock(ext_devt_lock); + + idr_preload_end(); if (idx 0) return idx == -ENOSPC ? 
-EBUSY : idx; @@ -447,9 +451,9 @@ void blk_free_devt(dev_t devt) return; if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - mutex_lock(ext_devt_mutex); + spin_lock(ext_devt_lock); idr_remove(ext_devt_idr, blk_mangle_minor(MINOR(devt))); - mutex_unlock(ext_devt_mutex); + spin_unlock(ext_devt_lock); } } @@ -665,7 +669,6 @@ void del_gendisk(struct gendisk *disk) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); - blk_free_devt(disk_to_dev(disk)-devt); } EXPORT_SYMBOL(del_gendisk); @@ -690,13 +693,13 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) } else { struct hd_struct *part; - mutex_lock(ext_devt_mutex); + spin_lock(ext_devt_lock); part = idr_find(ext_devt_idr, blk_mangle_minor(MINOR(devt))); if (part get_disk(part_to_disk(part))) { *partno = part-partno; disk = part_to_disk(part); } - mutex_unlock(ext_devt_mutex); + spin_unlock(ext_devt_lock); } return disk; @@ -1098,6 +1101,7 @@ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + blk_free_devt(dev-devt); disk_release_events(disk); kfree(disk-random); disk_replace_part_tbl(disk, NULL); diff --git a/block/partition-generic.c b/block/partition-generic.c index 789cdea..0d9e5f9 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -211,6 +211,7 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { struct hd_struct *p = dev_to_part(dev); + blk_free_devt(dev-devt); free_part_stats(p); free_part_info(p); kfree(p); @@ -253,7 +254,6 @@ void delete_partition(struct gendisk *disk, int partno) rcu_assign_pointer(ptbl-last_lookup, NULL); kobject_put(part-holder_dir); device_del(part_to_dev(part)); - blk_free_devt(part_devt(part)); hd_struct_put(part); } -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v12] NVMe: Convert to blk-mq
On Thu, 21 Aug 2014, Matias Bjørling wrote: On 08/19/2014 12:49 AM, Keith Busch wrote: I see the driver's queue suspend logic is removed, but I didn't mean to imply it was safe to do so without replacing it with something else. I thought maybe we could use the blk_stop/start_queue() functions if I'm correctly understanding what they're for. They're usually only used for the previous request model. Please correct me if I'm wrong. The flow of suspend is as following (roughly): 1. Freeze user threads 2. Perform sys_sync 3. Freeze freezable kernel threads 4. Freeze devices 5. ... On nvme suspend, we process all outstanding request and cancels any outstanding IOs, before going suspending. From what I found, is it still possible for IOs to be submitted and lost in the process? For suspend/resume, I think we're okay. There are three other ways the drive can be reset where we'd want to quiesce IO: I/O timeout Controller Failure Status (CSTS.CFS) set User initiated reset via sysfs * After a reset, we are not guaranteed that we even have the same number of h/w queues. The driver frees ones beyond the device's capabilities, so blk-mq may have references to freed memory. The driver may also allocate more queues if it is capable, but blk-mq won't be able to take advantage of that. Ok. Out of curiosity, why can the number of exposed nvme queues change from the hw perspective on suspend/resume? The only time you might expect something like that is if a f/w upgrade occured prior to the device reset and it supports different queues. The number of queues supported could be more or less than previous. I wouldn't normally expect different f/w to support different queue count, but it's certainly allowed. Otherwise the spec allows the controller to return errors even though the queue count feature was succesful. This could be for a variety of reasons from resource limits or other internal device errors.
[PATCH] fs/block_dev.c: Use hd_part to find block inodes
When using the GENHD_FL_EXT_DEVT disk flags, a newly added device may be assigned the same major/minor as one that was previously removed but opened, and the pesky userspace refuses to close it! The inode for the old block_device is still open, and so bdget() finds the stale device instead of allocating a new one. When the newly inserted drive is added, you'll see a message like: nvme0n1: detected capacity change from XXX to 0 and the partitions on the disk will not be usable after that. This patch uses the underlying disk's partition when trying to find the block device's opened inode so that two different disks that have a major/minor collision can coexist. Signed-off-by: Keith Busch keith.bu...@intel.com --- Maybe this is terrible idea!? This came from proposals to the nvme driver that remove the dynamic partitioning that was recently added, and I wanted to know why exactly it was failing. I don't know if there is a good reason to avoid having two devices opened with the same major/minor, but I think this is safe and tests out okay when I force that condition. In all cases, it appears getting the block device will eventually fail if either the disk or part do not exist, so should be okay to bail and assign these even earlier. 
fs/block_dev.c | 20 +--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 6d72746..9ba2bc8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -546,12 +546,15 @@ static inline unsigned long hash(dev_t dev) static int bdev_test(struct inode *inode, void *data) { - return BDEV_I(inode)-bdev.bd_dev == *(dev_t *)data; + return BDEV_I(inode)-bdev.bd_part == (struct hd_struct *)data; } static int bdev_set(struct inode *inode, void *data) { - BDEV_I(inode)-bdev.bd_dev = *(dev_t *)data; + struct hd_struct *part = (struct hd_struct *)data;; + + BDEV_I(inode)-bdev.bd_part = part; + BDEV_I(inode)-bdev.bd_dev = part_devt(part); return 0; } @@ -561,9 +564,20 @@ struct block_device *bdget(dev_t dev) { struct block_device *bdev; struct inode *inode; + struct gendisk *disk; + struct hd_struct *part; + int partno; + + disk = get_gendisk(dev, partno); + if (!disk) + return NULL; + + part = disk_get_part(disk, partno); + if (!part) + return NULL; inode = iget5_locked(blockdev_superblock, hash(dev), - bdev_test, bdev_set, dev); + bdev_test, bdev_set, part); if (!inode) return NULL; -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs/block_dev.c: Use hd_part to find block inodes
On Fri, 22 Aug 2014, Christoph Hellwig wrote: On Fri, Aug 22, 2014 at 10:28:16AM -0600, Keith Busch wrote: When using the GENHD_FL_EXT_DEVT disk flags, a newly added device may be assigned the same major/minor as one that was previously removed but opened, and the pesky userspace refuses to close it! Which means life time rules for those dev_t allocations are broken. Please fix it to not release the dev_t until the device isn't referenced at all. Okay, thanks. So a proper fix would not let extended devt minors get reused while still referenced, so we can't release it unconditionally from del_gendisk(). I think the following does that, but I had to add a reference counter to gendisk. --- diff --git a/block/genhd.c b/block/genhd.c index 791f419..a41478a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -453,6 +453,12 @@ void blk_free_devt(dev_t devt) } } +static void free_ext_dev(struct kref *kref) +{ + struct gendisk *disk = container_of(kref, struct gendisk, kref); + blk_free_devt(disk_to_dev(disk)-devt); +} + static char *bdevt_str(dev_t devt, char *buf) { if (MAJOR(devt) = 0xff MINOR(devt) = 0xff) { @@ -665,7 +671,6 @@ void del_gendisk(struct gendisk *disk) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); - blk_free_devt(disk_to_dev(disk)-devt); } EXPORT_SYMBOL(del_gendisk); @@ -1283,6 +1288,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id) disk_to_dev(disk)-class = block_class; disk_to_dev(disk)-type = disk_type; device_initialize(disk_to_dev(disk)); + kref_init(disk-kref); } return disk; } @@ -1303,6 +1309,7 @@ struct kobject *get_disk(struct gendisk *disk) module_put(owner); return NULL; } + kref_get(disk-kref); return kobj; } @@ -1311,8 +1318,10 @@ EXPORT_SYMBOL(get_disk); void put_disk(struct gendisk *disk) { - if (disk) + if (disk) { + kref_put(disk-kref, free_ext_dev); kobject_put(disk_to_dev(disk)-kobj); + } } EXPORT_SYMBOL(put_disk); diff --git 
a/include/linux/genhd.h b/include/linux/genhd.h index ec274e0..d51b099 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -200,6 +200,7 @@ struct gendisk { struct blk_integrity *integrity; #endif int node_id; + struct kref kref; }; static inline struct gendisk *part_to_disk(struct hd_struct *part) -- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs/block_dev.c: Use hd_part to find block inodes
On Fri, 22 Aug 2014, Keith Busch wrote: On Fri, 22 Aug 2014, Christoph Hellwig wrote: On Fri, Aug 22, 2014 at 10:28:16AM -0600, Keith Busch wrote: When using the GENHD_FL_EXT_DEVT disk flags, a newly added device may be assigned the same major/minor as one that was previously removed but opened, and the pesky userspace refuses to close it! Which means life time rules for those dev_t allocations are broken. Please fix it to not release the dev_t until the device isn't referenced at all. Okay, thanks. So a proper fix would not let extended devt minors get reused while still referenced, so we can't release it unconditionally from del_gendisk(). I think the following does that, but I had to add a reference counter to gendisk. Sorry for the rapid churn; I hadn't followed the function pointers through to discover the correct place to release the devt. This one's much simpler: --- diff --git a/block/genhd.c b/block/genhd.c index 791f419..321f1fd 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -665,7 +665,6 @@ void del_gendisk(struct gendisk *disk) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); - blk_free_devt(disk_to_dev(disk)-devt); } EXPORT_SYMBOL(del_gendisk); @@ -1098,6 +1097,7 @@ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + blk_free_devt(dev-devt); disk_release_events(disk); kfree(disk-random); disk_replace_part_tbl(disk, NULL); -- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v11] NVMe: Convert to blk-mq
On Sun, 10 Aug 2014, Matias Bjørling wrote: On Sat, Jul 26, 2014 at 11:07 AM, Matias Bjørling m...@bjorling.me wrote: This converts the NVMe driver to a blk-mq request-based driver. Willy, do you need me to make any changes to the conversion? Can you pick it up for 3.17? Hi Matias, I'm starting to get a little more spare time to look at this again. I think there are still some bugs here, or perhaps something better we can do. I'll just start with one snippet of the code: @@ -765,33 +619,49 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, submit_iod: spin_lock_irq(nvmeq-q_lock); if (nvmeq-q_suspended) { spin_unlock_irq(nvmeq-q_lock); goto finish_cmd; } snip finish_cmd: nvme_finish_cmd(nvmeq, req-tag, NULL); nvme_free_iod(nvmeq-dev, iod); return result; } If the nvme queue is marked suspended, this code just goto's the finish without setting result, so I don't think that's right. But do we even need the q_suspended flag anymore? It was there because we couldn't prevent incoming requests as a bio based driver and we needed some way to mark that the h/w's IO queue was temporarily inactive, but blk-mq has ways to start/stop a queue at a higher level, right? If so, I think that's probably a better way than using this driver specific way. 
I haven't event tried debugging this next one: doing an insmod+rmmod caused this warning followed by a panic: Aug 13 15:41:41 kbgrz1 kernel: [ 89.207525] [ cut here ] Aug 13 15:41:41 kbgrz1 kernel: [ 89.207538] WARNING: CPU: 8 PID: 5768 at mm/slab_common.c:491 kmalloc_slab+0x33/0x8b() Aug 13 15:41:41 kbgrz1 kernel: [ 89.207541] Modules linked in: nvme(-) parport_pc ppdev lp parport dlm sctp libcrc32c configfs nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc md4 hmac cifs bridge stp llc jfs joydev hid_generic usbhid hid loop md_mod x86_pkg_temp_thermal coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support microcode pcspkr ehci_pci ehci_hcd usbcore acpi_cpufreq lpc_ich usb_common ioatdma mfd_core i2c_i801 evdev wmi tpm_tis ipmi_si tpm ipmi_msghandler processor thermal_sys button ext4 crc16 jbd2 mbcache sg sr_mod cdrom sd_mod crct10dif_generic crc_t10dif crct10dif_common nbd dm_mod crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd isci libsas igb ahci libahci scsi_transport_sas ptp pps_core libata i2c_algo_bit i2c_core scsi_mod dca Aug 13 15:41:41 kbgrz1 kernel: [ 89.207653] CPU: 8 PID: 5768 Comm: nvme1 Not tainted 3.16.0-rc6+ #24 Aug 13 15:41:41 kbgrz1 kernel: [ 89.207656] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013 Aug 13 15:41:41 kbgrz1 kernel: [ 89.207659] 0009 8139f9ba Aug 13 15:41:41 kbgrz1 kernel: [ 89.207664] 8103db86 e8601d80 810f0d59 0246 Aug 13 15:41:41 kbgrz1 kernel: [ 89.207669] 880827bf28c0 8020 88082b8d9d00 Aug 13 15:41:41 kbgrz1 kernel: [ 89.207674] Call Trace: Aug 13 15:41:41 kbgrz1 kernel: [ 89.207685] [8139f9ba] ? dump_stack+0x41/0x51 Aug 13 15:41:41 kbgrz1 kernel: [ 89.207694] [8103db86] ? warn_slowpath_common+0x7d/0x95 Aug 13 15:41:41 kbgrz1 kernel: [ 89.207699] [810f0d59] ? kmalloc_slab+0x33/0x8b Aug 13 15:41:41 kbgrz1 kernel: [ 89.207704] [810f0d59] ? 
kmalloc_slab+0x33/0x8b Aug 13 15:41:41 kbgrz1 kernel: [ 89.207710] [81115329] ? __kmalloc+0x28/0xf1 Aug 13 15:41:41 kbgrz1 kernel: [ 89.207719] [811d0daf] ? blk_mq_tag_busy_iter+0x30/0x7c Aug 13 15:41:41 kbgrz1 kernel: [ 89.207728] [a052c426] ? nvme_init_hctx+0x49/0x49 [nvme] Aug 13 15:41:41 kbgrz1 kernel: [ 89.207733] [811d0daf] ? blk_mq_tag_busy_iter+0x30/0x7c Aug 13 15:41:41 kbgrz1 kernel: [ 89.207738] [a052c98b] ? nvme_clear_queue+0x72/0x7d [nvme] Aug 13 15:41:41 kbgrz1 kernel: [ 89.207744] [a052c9a8] ? nvme_del_queue_end+0x12/0x26 [nvme] Aug 13 15:41:41 kbgrz1 kernel: [ 89.207750] [810576e3] ? kthread_worker_fn+0xb1/0x111 Aug 13 15:41:41 kbgrz1 kernel: [ 89.207754] [81057632] ? kthread_create_on_node+0x171/0x171 Aug 13 15:41:41 kbgrz1 kernel: [ 89.207758] [81057632] ? kthread_create_on_node+0x171/0x171 Aug 13 15:41:41 kbgrz1 kernel: [ 89.207762] [810574b9] ? kthread+0x9e/0xa6 Aug 13 15:41:41 kbgrz1 kernel: [ 89.207766] [8105741b] ? __kthread_parkme+0x5c/0x5c Aug 13 15:41:41 kbgrz1 kernel: [ 89.207773] [813a3a2c] ? ret_from_fork+0x7c/0xb0 Aug 13 15:41:41 kbgrz1 kernel: [ 89.20] [8105741b] ? __kthread_parkme+0x5c/0x5c Aug 13 15:41:41 kbgrz1 kernel: [ 89.207780] ---[ end trace 8dc4a4c97c467d4c ]--- Aug 13 15:41:41 kbgrz1 kernel: [ 89.223627] PGD 0 Aug 13 15:41:41
Re: [PATCH v11] NVMe: Convert to blk-mq
On Thu, 14 Aug 2014, Jens Axboe wrote: On 08/14/2014 02:25 AM, Matias Bjørling wrote: The result is set to BLK_MQ_RQ_QUEUE_ERROR, or am I mistaken? Looks OK to me, looking at the code, 'result' is initialized to BLK_MQ_RQ_QUEUE_BUSY though. Which looks correct, we don't want to error on a suspended queue. My mistake missing how the result was initialized. nr_tags must be uninitialized or screwed up somehow, otherwise I don't see how that kmalloc() could warn on being too large. Keith, are you running with slab debugging? Matias, might be worth trying. I'm not running with slab debugging. If it's any clue at all, blk-mq is using 16 of the 31 allocated h/w queues (which is okay as we discussed earlier), and the oops happens when clearing the first unused queue. I'll have time to mess with this more today, so I can either help find the problem or apply a patch if one becomes available.
Re: [PATCH v11] NVMe: Convert to blk-mq
On Thu, 14 Aug 2014, Matias Bjorling wrote: nr_tags must be uninitialized or screwed up somehow, otherwise I don't see how that kmalloc() could warn on being too large. Keith, are you running with slab debugging? Matias, might be worth trying. The queue's tags were freed in 'blk_mq_map_swqueue' because some queues weren't mapped to a s/w queue, but the driver has a pointer to that freed memory, so it's a use-after-free error. This part in the driver looks different than it used to be in v8 when I last tested. The nvme_queue used to have a pointer to the 'hctx', but now it points directly to the 'tags', but it doesn't appear to be very safe. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v11] NVMe: Convert to blk-mq
On Thu, 14 Aug 2014, Jens Axboe wrote: nr_tags must be uninitialized or screwed up somehow, otherwise I don't see how that kmalloc() could warn on being too large. Keith, are you running with slab debugging? Matias, might be worth trying. The allocation and freeing of blk-mq parts seems a bit asymmetrical to me. The 'tags' belong to the tagset, but any request_queue using that tagset may free the tags. I looked to separate the tag allocation concerns, but that's more time than I have, so this is my quick-fix driver patch, forcing tag access through the hw_ctx. --- diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 384dc91..91432d2 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -109,7 +109,7 @@ struct nvme_queue { u8 cqe_seen; u8 q_suspended; struct async_cmd_info cmdinfo; - struct blk_mq_tags *tags; + struct blk_mq_hw_ctx *hctx; }; /* @@ -148,6 +148,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, struct nvme_queue *nvmeq = dev-queues[0]; hctx-driver_data = nvmeq; + nvmeq-hctx = hctx; return 0; } @@ -174,6 +175,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, irq_set_affinity_hint(dev-entry[nvmeq-cq_vector].vector, hctx-cpumask); hctx-driver_data = nvmeq; + nvmeq-hctx = hctx; return 0; } @@ -280,8 +282,7 @@ static void async_completion(struct nvme_queue *nvmeq, void *ctx, static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, unsigned int tag) { - struct request *req = blk_mq_tag_to_rq(nvmeq-tags, tag); - + struct request *req = blk_mq_tag_to_rq(nvmeq-hctx-tags, tag); return blk_mq_rq_to_pdu(req); } @@ -654,8 +655,6 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) nvme_submit_flush(nvmeq, ns, req-tag); else nvme_submit_iod(nvmeq, iod, ns); - - queued: nvme_process_cq(nvmeq); spin_unlock_irq(nvmeq-q_lock); return BLK_MQ_RQ_QUEUE_OK; @@ -1051,9 +1050,8 @@ static void nvme_cancel_queue_ios(void *data, unsigned long 
*tag_map) if (tag = qdepth) break; - req = blk_mq_tag_to_rq(nvmeq-tags, tag++); + req = blk_mq_tag_to_rq(nvmeq-hctx-tags, tag++); cmd = blk_mq_rq_to_pdu(req); if (cmd-ctx == CMD_CTX_CANCELLED) continue; @@ -1132,8 +1130,8 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq) { spin_lock_irq(nvmeq-q_lock); nvme_process_cq(nvmeq); - if (nvmeq-tags) - blk_mq_tag_busy_iter(nvmeq-tags, nvme_cancel_queue_ios, nvmeq); + if (nvmeq-hctx-tags) + blk_mq_tag_busy_iter(nvmeq-hctx-tags, nvme_cancel_queue_ios, nvmeq); spin_unlock_irq(nvmeq-q_lock); } @@ -1353,8 +1351,6 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(dev-admin_tagset)) return -ENOMEM; - dev-queues[0]-tags = dev-admin_tagset.tags[0]; - dev-admin_q = blk_mq_init_queue(dev-admin_tagset); if (!dev-admin_q) { blk_mq_free_tag_set(dev-admin_tagset); @@ -2055,9 +2051,6 @@ static int nvme_dev_add(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(dev-tagset)) goto out; - for (i = 1; i dev-online_queues; i++) - dev-queues[i]-tags = dev-tagset.tags[i - 1]; - id_ns = mem; for (i = 1; i = nn; i++) { res = nvme_identify(dev, i, 0, dma_addr); -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v12] NVMe: Convert to blk-mq
On Fri, 15 Aug 2014, Matias Bjørling wrote: * NVMe queues are merged with the tags structure of blk-mq. I see the driver's queue suspend logic is removed, but I didn't mean to imply it was safe to do so without replacing it with something else. I thought maybe we could use the blk_stop/start_queue() functions if I'm correctly understanding what they're for. With what's in version 12, we could free an irq multiple times that doesn't even belong to the nvme queue anymore in certain error conditions. A couple other things I just noticed: * We lose the irq affinity hint after a suspend/resume or device reset because the driver's init_hctx() isn't called in these scenarios. * After a reset, we are not guaranteed that we even have the same number of h/w queues. The driver frees ones beyond the device's capabilities, so blk-mq may have references to freed memory. The driver may also allocate more queues if it is capable, but blk-mq won't be able to take advantage of that.
Re: [PATCH 4/5] lightnvm: NVMe integration
On Wed, 8 Oct 2014, Matias Bjørling wrote: NVMe devices are identified by the vendor specific bits: Bit 3 in OACS (device-wide). Currently made per device, as the nvme namespace is missing in the completion path. The NVM-Express 1.2 actually defined this bit for Namespace Management, so I don't think we can use bits the spec marked as reserved. Maybe you can trigger off some vendor specific Command Set Supported instead.
Re: [PATCH v13] NVMe: Convert to blk-mq
On Tue, 30 Sep 2014, Matias Bjørling wrote: @@ -1967,27 +1801,30 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, { ... - ns-queue-queue_flags = QUEUE_FLAG_DEFAULT; + queue_flag_set_unlocked(QUEUE_FLAG_DEFAULT, ns-queue); Instead of the above, you want + ns-queue-queue_flags |= QUEUE_FLAG_DEFAULT; I only caught it because of the fun dm-mpath attempt with nvme and blk-mq. Or maybe we don't want to include QUEUE_FLAG_STACKABLE right now (part of the default flags) because it will kernel crash if you're using dm-multipath with a blk-mq driver today. Otherwise, looks great!
[PATCH] misc: Increase available dynamic minors
This increases the number of available miscellaneous character device dynamic minors from 63 to the max minor, 1M. Dynamic minor previously started at 63 and went down to zero. That's not enough in some situations, and also eventually creates a collision with 'psaux' misc device. This patch starts minors at the last defined misc minor (255) and works up to the max possible. Signed-off-by: Keith Busch keith.bu...@intel.com --- drivers/char/misc.c | 23 +-- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/char/misc.c b/drivers/char/misc.c index ffa97d2..229dba5 100644 --- a/drivers/char/misc.c +++ b/drivers/char/misc.c @@ -59,8 +59,7 @@ static DEFINE_MUTEX(misc_mtx); /* * Assigned numbers, used for dynamic minors */ -#define DYNAMIC_MINORS 64 /* like dynamic majors */ -static DECLARE_BITMAP(misc_minors, DYNAMIC_MINORS); +static DEFINE_IDA(misc_minors_ida); #ifdef CONFIG_PROC_FS static void *misc_seq_start(struct seq_file *seq, loff_t *pos) @@ -183,15 +182,14 @@ int misc_register(struct miscdevice * misc) INIT_LIST_HEAD(misc-list); mutex_lock(misc_mtx); - if (misc-minor == MISC_DYNAMIC_MINOR) { - int i = find_first_zero_bit(misc_minors, DYNAMIC_MINORS); - if (i = DYNAMIC_MINORS) { + int i = ida_simple_get(misc_minors_ida, MISC_DYNAMIC_MINOR, + MINORMASK, GFP_KERNEL); + if (i 0) { err = -EBUSY; goto out; } - misc-minor = DYNAMIC_MINORS - i - 1; - set_bit(i, misc_minors); + misc-minor = i; } else { struct miscdevice *c; @@ -208,9 +206,8 @@ int misc_register(struct miscdevice * misc) misc-this_device = device_create(misc_class, misc-parent, dev, misc, %s, misc-name); if (IS_ERR(misc-this_device)) { - int i = DYNAMIC_MINORS - misc-minor - 1; - if (i DYNAMIC_MINORS i = 0) - clear_bit(i, misc_minors); + if (misc-minor = MISC_DYNAMIC_MINOR) + ida_simple_remove(misc_minors_ida, misc-minor); err = PTR_ERR(misc-this_device); goto out; } @@ -237,16 +234,14 @@ int misc_register(struct miscdevice * misc) int misc_deregister(struct miscdevice *misc) { - int i = 
DYNAMIC_MINORS - misc-minor - 1; - if (WARN_ON(list_empty(misc-list))) return -EINVAL; mutex_lock(misc_mtx); list_del(misc-list); device_destroy(misc_class, MKDEV(MISC_MAJOR, misc-minor)); - if (i DYNAMIC_MINORS i = 0) - clear_bit(i, misc_minors); + if (misc-minor = MISC_DYNAMIC_MINOR) + ida_simple_remove(misc_minors_ida, misc-minor); mutex_unlock(misc_mtx); return 0; } -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] misc: Increase available dynamic minors
On Tue, 9 Dec 2014, Arnd Bergmann wrote: On Monday 08 December 2014 16:01:50 Keith Busch wrote: This increases the number of available miscellaneous character device dynamic minors from 63 to the max minor, 1M. Dynamic minor previously started at 63 and went down to zero. That's not enough in some situations, and also eventually creates a collision with 'psaux' misc device. This patch starts minors at the last defined misc minor (255) and works up to the max possible. Signed-off-by: Keith Busch keith.bu...@intel.com I guess this will break support for ancient user space tools, and possibly also old file systems that do not support more than 8-bit minor numbers. I would assume that it's ok, but you definitely have to mention in the changelog that things might break and how you have concluded that this is safe enough. Sure, I can call this out in the change log. I hadn't considered file systems on a character device. Tooling as well. If a character device is so tightly coupled to such legacy tools, maybe they should register a static minor, and this patch makes 63 additional ones available. Does that sound reasonable? If you cannot come up with a good reasoning, it might be better to combine both and use up the traditional dynamic minor numbers before using the 255 range. A problem with the existing method is that anyone can use a number in the range it's dynamically allocating from. Once you encounter a collision, all following dynamic misc_device registration will fail. Most of the defined minors in that range are marked unused, but PSMOUSE_MINOR is still used. If we continue using the traditional dynamic allocator, the component registering as dynamic races against the one registering with its static, so that doesn't sound right. 
-- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/1] NVMe: Do not take nsid while a passthrough IO command is being issued via a block device file descriptor
On Wed, 21 Jan 2015, Yan Liu wrote: For IO passthrough command, it uses an IO queue associated with the device. Actually, this patch does not modify that part. This patch is not really focused on io queues; instead, it is more about namespace protection from other namespace's user ios. The patch here doesn't prevent a user submitting completely arbitrary commands on IO queues. One still can do it through a char dev file descriptor. However, when a user explicitly chooses a namespace's file descriptor, it is unlikely that she/he tries to issue an io command to a different namespace. Maybe in that case someone should to use a new admin command. Oh, I just realized your patch is not for the blk-mq version of this driver. The mainline kernel uses blk-mq and is a little different in this path. My mistake, you're right the command works the same as before using the character device here. In that case, this seems very reasonable, but doesn't merge upstream.
Re: [PATCH 1/1] NVMe: Do not take nsid while a passthrough IO command is being issued via a block device file descriptor
On Fri, 23 Jan 2015, Christoph Hellwig wrote: On Fri, Jan 23, 2015 at 04:22:02PM +, Keith Busch wrote: The namespace id should be enforced on block devices, but is there a problem allowing arbitrary commands through the management char device? I have a need for a pure passthrough, but the proposed patch requires a matching namespace id all the time. I wrote and tested the one below to override nsid on block devices, but doesn't require a visible namespace through the management device. Allowing requests to differetn namespaces through the admin interface doesn't sound too horrible in general, but I still don't like your patch below. Instead of allocating another queue that allows arbitrary nsids we should simply look up the namespace when sent through the admin device, and still reject it if the namespace isn't valid. If a namespaces is marked hidden we should still create a device for it in Linux, as that whole concept of hiding a namespace is silly. No argument against removing the hidden attribute handling, but there are unadvertised NSID's that have special meaning. Like NSID 0x means to apply a command to all namespaces. Vendor specific commands may have other special NSID meanings as well. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/1] NVMe: Do not take nsid while a passthrough IO command is being issued via a block device file descriptor
On Sun, 25 Jan 2015, Christoph Hellwig wrote: On Fri, Jan 23, 2015 at 03:57:06PM -0800, Yan Liu wrote: When a passthrough IO command is issued with a specific block device file descriptor. It should be applied at the namespace which is associated with that block device file descriptor. This patch makes such passthrough command ignore nsid in nvme_passthru_cmd structure. Instead it takes the namespace ID associated with the block device descriptor. This looks good to me. If Keith can point to a use case for magic or hidden nsids we'll have to find a bypass for them, but this certainly is the right short term fix: 'Flush' is the only command from the spec where specifying all namespaces might be desirable. Otherwise, I can't disclose vendor specific behavior to gain consensus on operating within spec, silly as this desire may seem. Why empower the driver to make a feature unreachable? -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/1] NVMe: Do not take nsid while a passthrough IO command is being issued via a block device file descriptor
On Wed, 21 Jan 2015, Yan Liu wrote: When a passthrough IO command is issued with a specific block device file descriptor. It should be applied at the namespace which is associated with that block device file descriptor. This patch makes such passthrough command ignore nsid in nvme_passthru_cmd structure. Instead it takes the namespace ID associated with the block device descriptor. Signed-off-by: Yan Liu y...@purestorage.com Oh it doesn't look like you tested this through the character handle. You've got it set to use the admin queue's request_queue for IO passthrough commands, so that can't be right. The IOCTL's purpose was to let someone submit completely arbitrary commands on IO queues. This technically shouldn't even need a namespace handle, but we don't have a request_queue associated to IO queues without one like the admin queue has. In fact, we ought to fix that so we can issue IO commands without namespaces. Anyway, namespaces may be hidden or some vendor special NSID trickery, who knows. Point is we don't want to tie the passthrough command's NSID down to the namespace providing the request_queue. If it is your intention to use that NSID, you can get the NSID using the NVME_IOCTL_ID prior to setting up the passthrough command. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/1] NVMe: Do not take nsid while a passthrough IO command is being issued via a block device file descriptor
On Thu, 22 Jan 2015, Christoph Hellwig wrote: On Thu, Jan 22, 2015 at 12:47:24AM +, Keith Busch wrote: The IOCTL's purpose was to let someone submit completely arbitrary commands on IO queues. This technically shouldn't even need a namespace handle, but we don't have a request_queue associated to IO queues without one like the admin queue has. In fact, we ought to fix that so we can issue IO commands without namespaces. Honestly, this sounds like a horrible idea. As namespaces aren't really any different from SCSI LUNs they should only be accessible through the device associated with the namespaces, and admin commands should only be allowed through the character device (if at all). The case I considered was the hidden attribute in the NVMe LBA Range Type feature. It only indicates the storage should be hidden from the OS for general use, but the host may still use it for special purposes. In truth, the driver doesn't handle the hidden attribute very well and it doesn't seem like a well thought out feature in the spec anyway. But if you really need to restrict namespace access, shouldn't that be enforced on the target side with reservations or similar mechanism? I agree on your last point. Admin commands through namespaces carried over from before the management device existed, but removing it now will break some customer tooling. There's probably a responsible way to migrate. For these security and usability reasons we did get rid of the SG_FLAG_LUN_INHIBIT flag in the SCSI passthrough interface, which allowed for similar horrible things in the distant past. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/1] NVMe: Do not take nsid while a passthrough IO command is being issued via a block device file descriptor
On Thu, 22 Jan 2015, Christoph Hellwig wrote: On Thu, Jan 22, 2015 at 03:21:28PM +, Keith Busch wrote: But if you really need to restrict namespace access, shouldn't that be enforced on the target side with reservations or similar mechanism? Think for example about containers where we give each container access to a single nvme namespace, including container root access. Here you don't really want container A to be able to submit I/O for another container. A similar case exists for virtualization where we had problems with SCSI passthrough from guests. Okay, that's a great point. Yan, we should apply this if you can submit a patch for the linux-block tree. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: blk-mq crash with dm-multipath in for-3.20/core
On Mon, 9 Feb 2015, Mike Snitzer wrote: On Mon, Feb 09 2015 at 11:38am -0500, Dongsu Park dongsu.p...@profitbricks.com wrote: So that commit 6d6285c45f5a should be either reverted, or moved to linux-dm tree, doesn't it? Cheers, Dongsu [1] https://www.redhat.com/archives/dm-devel/2015-January/msg00171.html [2] https://git.kernel.org/cgit/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=dm-for-3.20id=102e38b1030e883efc022dfdc7b7e7a3de70d1c5 Right, we're aware of this typo in 6d6285c45f5a. Sorry about that, but as you noted, once both the linux-block and linux-dm branches for 3.20 are merged all is back to working. So we're planning to just leave the block commit as broken and let the dm commit you noted fix it up. In the end 3.20-rc1 will have a working dm-multipath. Oh, we're not going to rebase the series with the correction? I'm concerned someone bisecting a completely unrelated problem might step on this commit. Up to you guys. It's my fault, so I'll deal with the consequences. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] NVMe: Fix error handling of class_create(nvme)
On Fri, 6 Mar 2015, Alexey Khoroshilov wrote: class_create() returns ERR_PTR on failure, so IS_ERR() should be used instead of check for NULL. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov khoroshi...@ispras.ru Thanks for the fix. Acked-by: Keith Busch keith.bu...@intel.com -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 4.0.0-rc4 NVMe NULL pointer dereference and hang
On Sun, 22 Mar 2015, Steven Noonan wrote: This happens on boot, and then eventually results in an RCU stall. [8.047533] nvme :05:00.0: Device not ready; aborting initialisation Note that the above is expected with this hardware (long story). Although 3.19.x prints the above and then continues gracefully, 4.0-rc breaks immediately after the above message: Thanks for the notice. I CC'ed the linux-nvme mailing list. Since your device failed to become ready (this is expected, you say? ok, I won't ask. :)), it triggered recovery action that assumed its list head was initialized once before. It's a one-line fix: --- diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 3b43897..ab7c847 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -3007,6 +3007,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) } get_device(dev-device); + INIT_LIST_HEAD(dev-node); INIT_WORK(dev-probe_work, nvme_async_probe); schedule_work(dev-probe_work); return 0; -- I'll coerce someone to merge this for rc6 today. 
[8.054306] BUG: unable to handle kernel NULL pointer dereference at 0008 [8.062155] IP: [a025e614] nvme_dev_list_remove+0x24/0xa0 [nvme] [8.069043] PGD 0 [8.071067] Oops: 0002 [#1] SMP [8.074332] Modules linked in: ahci libahci libata ehci_pci ehci_hcd scsi_mod usbcore usb_common nvme i915 intel_gtt i2c_algo_bit video drm_kms_helper drm i2c_core e1000e ptp pps_core ipmi_poweroff ipmi_msghandler button [8.094244] CPU: 4 PID: 632 Comm: kworker/u288:1 Not tainted 4.0.0-rc4-00347-gb87444a2 #5 [8.109878] Workqueue: nvme nvme_reset_workfn [nvme] [8.114852] task: 881f98271d70 ti: 881f982b8000 task.ti: 881f982b8000 [8.122321] RIP: 0010:[a025e614] [a025e614] nvme_dev_list_remove+0x24/0xa0 [nvme] [8.131624] RSP: :881f982bbd18 EFLAGS: 00010246 [8.136930] RAX: RBX: 883f63f84800 RCX: 88bf66e6a418 [8.144052] RDX: RSI: 0120 RDI: a0269848 [8.151171] RBP: 881f982bbd28 R08: 881f982b8000 R09: 0001 [8.158288] R10: 0086 R11: 0020 R12: 883f63f84800 [8.165411] R13: 88bf66e6a400 R14: 88df627ff900 R15: 1000 [8.172530] FS: () GS:883f7f88() knlGS: [8.180600] CS: 0010 DS: ES: CR0: 80050033 [8.186337] CR2: 0008 CR3: 01007ea0c000 CR4: 001406e0 [8.193458] DR0: DR1: DR2: [8.200574] DR3: DR6: fffe0ff0 DR7: 0400 [8.207693] Stack: [8.209705] 881f982bbd28 883f63f84978 881f982bbdc8 a026005e [8.217150] 883f7f894300 8de0 881f982bbd98 810a65e1 [8.224600] 881f982bbdd8 810a9943 881f982bbd98 881f982bbdd0 [8.232049] Call Trace: [8.234500] [a026005e] nvme_dev_shutdown+0x1e/0x430 [nvme] [8.240943] [810a65e1] ? put_prev_entity+0x31/0x350 [8.246772] [810a9943] ? pick_next_task_fair+0x103/0x4e0 [8.253046] [81012605] ? __switch_to+0x175/0x5c0 [8.258607] [a0262a8e] nvme_reset_failed_dev+0x1e/0x100 [nvme] [8.265378] [a025e1cf] nvme_reset_workfn+0xf/0x20 [nvme] [8.271649] [810872fe] process_one_work+0x14e/0x400 [8.277472] [8108822b] worker_thread+0x5b/0x530 [8.282943] [810881d0] ? rescuer_thread+0x3a0/0x3a0 [8.288778] [8108d359] kthread+0xc9/0xe0 [8.293649] [8108d290] ? 
kthread_stop+0x100/0x100 [8.299322] [81541158] ret_from_fork+0x58/0x90 [8.304711] [8108d290] ? kthread_stop+0x100/0x100 [8.310357] Code: 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 53 48 89 fb 48 c7 c7 48 98 26 a0 48 83 ec 08 e8 c3 23 2e e1 48 8b 13 48 8b 43 08 48 89 42 08 48 89 10 48 89 1b 48 81 3d 77 ae 00 00 a0 94 26 a0 [8.330295] RIP [a025e614] nvme_dev_list_remove+0x24/0xa0 [nvme] [8.337258] RSP 881f982bbd18 [8.340739] CR2: 0008 [8.344056] ---[ end trace 70831a936042aa41 ]--- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] NVMe: add explicit BLK_DEV_INTEGRITY dependency
On Mon, 23 Feb 2015, Arnd Bergmann wrote: A patch that was added to 4.0-rc1 in the last minute caused a build break in the NVMe driver unless integrity support is also enabled: drivers/block/nvme-core.c: In function 'nvme_dif_remap': drivers/block/nvme-core.c:523:24: error: dereferencing pointer to incomplete type pmap = kmap_atomic(bip-bip_vec-bv_page) + bip-bip_vec-bv_offset; ^ drivers/block/nvme-core.c:523:49: error: dereferencing pointer to incomplete type pmap = kmap_atomic(bip-bip_vec-bv_page) + bip-bip_vec-bv_offset; ^ This changes the Kconfig entry for NVMe to depend on that support, to avoid the build error. Using 'select' would work as well, but might have unintended side-effects when users do not actually want the integrity support. Thanks for the catch. We do want users to be able to use this with and without blk-integrity, so I'll send a patch using conditional functions between appropriate ifdef's like how scsi and block handle this. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/1] NVMe: Do not take nsid while a passthrough IO command is being issued via a block device file descriptor
On Thu, 22 Jan 2015, Christoph Hellwig wrote: On Thu, Jan 22, 2015 at 04:02:08PM -0800, Yan Liu wrote: When a passthrough IO command is issued with a specific block device file descriptor. It should be applied at the namespace which is associated with that block device file descriptor. This patch makes such passthrough command ignore nsid in nvme_passthru_cmd structure. Instead it takes the namespace ID associated with the block device descriptor. Signed-off-by: Yan Liu y...@purestorage.com Please move the code to find the ns into the caller, or even better a separate helper used by the caller, instead of adding another argument to nvme_user_cmd. The namespace id should be enforced on block devices, but is there a problem allowing arbitrary commands through the management char device? I have a need for a pure passthrough, but the proposed patch requires a matching namespace id all the time. I wrote and tested the one below to override nsid on block devices, but doesn't require a visible namespace through the management device. 
--- diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index cb529e9..bdec1d7 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1682,7 +1682,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } -static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, +static int nvme_user_cmd(struct nvme_dev *dev, struct request_queue *q, struct nvme_passthru_cmd __user *ucmd) { struct nvme_passthru_cmd cmd; @@ -1690,6 +1690,8 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, int status, length; struct nvme_iod *uninitialized_var(iod); unsigned timeout; + struct request *req; + struct nvme_ns *ns = q-queuedata; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -1699,7 +1701,7 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, memset(c, 0, sizeof(c)); c.common.opcode = cmd.opcode; c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.nsid = ns ? 
cpu_to_le32(ns-ns_id) : cpu_to_le32(cmd.nsid); c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); @@ -1725,21 +1727,15 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, if (length != cmd.data_len) status = -ENOMEM; - else if (ns) { - struct request *req; - - req = blk_mq_alloc_request(ns-queue, WRITE, - (GFP_KERNEL|__GFP_WAIT), false); - if (IS_ERR(req)) - status = PTR_ERR(req); - else { - status = nvme_submit_sync_cmd(req, c, cmd.result, - timeout); - blk_mq_free_request(req); - } - } else - status = __nvme_submit_admin_cmd(dev, c, cmd.result, timeout); + req = blk_mq_alloc_request(q, WRITE, (GFP_KERNEL|__GFP_WAIT), false); + if (IS_ERR(req)) { + status = PTR_ERR(req); + goto out; + } + status = nvme_submit_sync_cmd(req, c, cmd.result, timeout); + blk_mq_free_request(req); + out: if (cmd.data_len) { nvme_unmap_user_pages(dev, cmd.opcode 1, iod); nvme_free_iod(dev, iod); @@ -1762,9 +1758,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, force_successful_syscall_return(); return ns-ns_id; case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ns-dev, NULL, (void __user *)arg); + return nvme_user_cmd(ns-dev, ns-dev-admin_q, (void __user *)arg); case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns-dev, ns, (void __user *)arg); + return nvme_user_cmd(ns-dev, ns-queue, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); case SG_GET_VERSION_NUM: @@ -2155,6 +2151,17 @@ static int nvme_dev_add(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(dev-tagset)) goto out; + dev-io_q = blk_mq_init_queue(dev-tagset); + if (IS_ERR(dev-io_q)) { + blk_mq_free_tag_set(dev-tagset); + goto out; + } + if (!blk_get_queue(dev-io_q)) { + blk_cleanup_queue(dev-io_q); + blk_mq_free_tag_set(dev-tagset); + goto out; + } + id_ns = mem; for (i = 1; i = nn; i++) { res = nvme_identify(dev, i, 0, dma_addr); @@ -2565,6 +2572,7 @@ static 
void nvme_free_dev(struct kref *kref) nvme_release_instance(dev);
Re: [PATCH 01/10] block: make generic_make_request handle arbitrarily sized bios
On Tue, 28 Apr 2015, Christoph Hellwig wrote: This seems to lack support for QUEUE_FLAG_SG_GAPS to work around the retarded PRP format in the NVMe driver. Mighty strong words, sir! I'm missing the context here, but I'll say PRP is much more efficient for h/w to process over SGL, and the requirement comes from that h/w rather than the driver. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] block:Change the function, nvme_alloc_queue to use -ENOMEM for when failing memory allocations
On Tue, 12 May 2015, Nicholas Krause wrote: This changes the function,nvme_alloc_queue to use the kernel code, -ENOMEM for when failing to allocate the memory required for the nvme_queue structure pointer,nvme in order to correctly return to the caller the correct reason for this function's failing. Nak! Look at what the callers are checking for upon return. Signed-off-by: Nicholas Krause xerofo...@gmail.com --- drivers/block/nvme-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 85b8036..e2eac1e 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1387,7 +1387,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, struct device *dmadev = dev-pci_dev-dev; struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); if (!nvmeq) - return NULL; + return -ENOMEM; nvmeq-cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth), nvmeq-cq_dma_addr, GFP_KERNEL); -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] block:Remove including of the header file, linux/mm.h for the file,nvme-core.c
On Wed, 13 May 2015, Matthew Wilcox wrote: On Wed, May 13, 2015 at 12:21:18PM -0400, Nicholas Krause wrote: This removes the include statement for including the header file, linux/mm.h in the file, nvme-core.c due to this driver file never calling any functions from the header file, linux/mm.h and only adding bloat to this driver by including this unnecessary header file. Nick, I'm not going to apply this patch, simply because the effort of verifying that you haven't made a mistake is greater than the gain we might get from applying it. The driver does use functions defined in linux/mm.h: get_user_pages_fast is one of them. It compiles without including the header because another header we include includes mm.h, but we don't want to depend on that! Don't worry, I didn't waste time checking; I just happened to know that one off the top. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 5/5 v2] nvme: LightNVM support
On Wed, 15 Apr 2015, Matias Bjørling wrote: @@ -2316,7 +2686,9 @@ static int nvme_dev_add(struct nvme_dev *dev) struct nvme_id_ctrl *ctrl; void *mem; dma_addr_t dma_addr; - int shift = NVME_CAP_MPSMIN(readq(dev-bar-cap)) + 12; + u64 cap = readq(dev-bar-cap); + int shift = NVME_CAP_MPSMIN(cap) + 12; + int nvm_cmdset = NVME_CAP_NVM(cap); The controller capabilities' command sets supported used here is the right way to key off on support for this new command set, IMHO, but I do not see in this patch the command set being selected when the controller is enabled Also if we're going this route, I think we need to define this reserved bit in the spec, but I'm not sure how to help with that. @@ -2332,6 +2704,7 @@ static int nvme_dev_add(struct nvme_dev *dev) ctrl = mem; nn = le32_to_cpup(ctrl-nn); dev-oncs = le16_to_cpup(ctrl-oncs); + dev-oacs = le16_to_cpup(ctrl-oacs); I don't find OACS used anywhere in the rest of the patch. I think this must be left over from v1. Otherwise it looks pretty good to me, but I think it would be cleaner if the lightnvm stuff is not mixed in the same file with the standard nvme command set. We might end up splitting nvme-core in the future anyway for command sets and transports.
RE: [PATCH 5/5 v2] nvme: LightNVM support
On Thu, 16 Apr 2015, James R. Bergsten wrote: My two cents worth is that it's (always) better to put ALL the commands into one place so that the entire set can be viewed at once and thus avoid inadvertent overloading of an opcode. Otherwise you don't know what you don't know. Yes, but these are two different command sets. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 5/5 v2] nvme: LightNVM support
On Thu, 16 Apr 2015, Javier González wrote: On 16 Apr 2015, at 16:55, Keith Busch keith.bu...@intel.com wrote: Otherwise it looks pretty good to me, but I think it would be cleaner if the lightnvm stuff is not mixed in the same file with the standard nvme command set. We might end up splitting nvme-core in the future anyway for command sets and transports. Would you be ok with having nvme-lightnvm for LightNVM specific commands? Sounds good to me, but I don't really have a dog in this fight. :)
Re: [PATCH 1/1] NVMe : Corrected memory freeing.
On Wed, 17 Jun 2015, Dheepthi K wrote: Memory freeing order has been corrected incase of allocation failure. This isn't necessary. The nvme_dev is zero'ed on allocation, and kfree(NULL or (void *)0) is okay to do. Signed-off-by: Dheepthi K dheepth...@gracelabs.com --- drivers/block/nvme-core.c |7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 683dff2..9bac53b 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2947,11 +2947,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev-entry = kzalloc_node(num_possible_cpus() * sizeof(*dev-entry), GFP_KERNEL, node); if (!dev-entry) - goto free; + goto free_dev; dev-queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), GFP_KERNEL, node); if (!dev-queues) - goto free; + goto free_entry; INIT_LIST_HEAD(dev-namespaces); dev-reset_workfn = nvme_reset_failed_dev; @@ -2987,9 +2987,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_release_instance(dev); put_pci: pci_dev_put(dev-pci_dev); - free: kfree(dev-queues); + free_entry: kfree(dev-entry); + free_dev: kfree(dev); return result; } -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] NVMe: fix type warning on 32-bit
On Tue, 19 May 2015, Arnd Bergmann wrote: A recent change to the ioctl handling caused a new harmless warning in the NVMe driver on all 32-bit machines: drivers/block/nvme-core.c: In function 'nvme_submit_io': drivers/block/nvme-core.c:1794:29: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] In order to shup up that warning, this introduces a new temporary variable that uses a double cast to extract the pointer from an __u64 structure member. Thanks for the fix. Acked-by: Keith Busch keith.bu...@intel.com Signed-off-by: Arnd Bergmann a...@arndb.de Fixes: a67a95134ff (NVMe: Meta data handling through submit io ioctl) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 85b8036deaa3..683dff272562 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1750,6 +1750,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) struct nvme_iod *iod; dma_addr_t meta_dma = 0; void *meta = NULL; + void __user *metadata; if (copy_from_user(io, uio, sizeof(io))) return -EFAULT; @@ -1763,6 +1764,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) meta_len = 0; } + metadata = (void __user *)(unsigned long)io.metadata; + write = io.opcode 1; switch (io.opcode) { @@ -1786,13 +1789,13 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) if (meta_len) { meta = dma_alloc_coherent(dev-pci_dev-dev, meta_len, meta_dma, GFP_KERNEL); + if (!meta) { status = -ENOMEM; goto unmap; } if (write) { - if (copy_from_user(meta, (void __user *)io.metadata, - meta_len)) { + if (copy_from_user(meta, metadata, meta_len)) { status = -EFAULT; goto unmap; } @@ -1819,8 +1822,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) nvme_free_iod(dev, iod); if (meta) { if (status == NVME_SC_SUCCESS !write) { - if (copy_to_user((void __user *)io.metadata, meta, - meta_len)) + if (copy_to_user(metadata, meta, meta_len)) status = 
-EFAULT; } dma_free_coherent(dev-pci_dev-dev, meta_len, meta, meta_dma); -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] NVMe: Avoid interrupt disable during queue init.
On Thu, 21 May 2015, Parav Pandit wrote: Avoid disabling interrupt and holding q_lock for the queue which is just getting initialized. With this change, online_queues is also incremented without lock during queue setup stage. If power management nvme_suspend() kicks in during queue setup time, per nvmeq based q_lock spinlock cannot protect device wide online_queues variable anyway. The q_lock is held to protect polling from reading inconsistent data. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] NVMe: Avoid interrupt disable during queue init.
On Fri, 22 May 2015, Parav Pandit wrote: On Fri, May 22, 2015 at 8:18 PM, Keith Busch keith.bu...@intel.com wrote: The rcu protection on nvme queues was removed with the blk-mq conversion as we rely on that layer for h/w access. o.k. But above is at level where data I/Os are not even active. It's between nvme_kthread and nvme_resume() from power management subsystem. I must be missing something. On resume, everything is already reaped from the queues, so there should be no harm letting the kthread poll an inactive queue. The proposal to remove the q_lock during queue init makes it possible for the thread to see the wrong cq phase bit and mess up the completion queue's head from reaping non-existent entries. But beyond nvme_resume, it appears a race condition is possible on any scenario when a device is reinitialized if it cannot create the same number of IO queues as it had originally. Part of the problem is there doesn't seem to be a way to change a tagset's nr_hw_queues after it was created. The conditions that lead to this scenario should be uncommon, so I haven't given it much thought; I need to untangle dynamic namespaces first. :) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] NVMe: Avoid interrupt disable during queue init.
On Thu, 21 May 2015, Parav Pandit wrote: On Fri, May 22, 2015 at 1:04 AM, Keith Busch keith.bu...@intel.com wrote: The q_lock is held to protect polling from reading inconsistent data. ah, yes. I can see the nvme_kthread can poll the CQ while its getting created through the nvme_resume(). I think this opens up other issue. nvme_kthread() should, Instead of, struct nvme_queue *nvmeq = dev-queues[i]; it should do, struct nvme_queue *nvmeq = rcu_dereference(dev-queues[i]); The rcu protection on nvme queues was removed with the blk-mq conversion as we rely on that layer for h/w access. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] NVMe: Avoid interrupt disable during queue init.
On Fri, 22 May 2015, Parav Pandit wrote: During normal positive path probe, (a) device is added to dev_list in nvme_dev_start() (b) nvme_kthread got created, which will eventually refers to dev-queues[qid] to check for NULL. (c) dev_start() worker thread has started probing device and creating the queue using nvme_alloc_queue This is is assigning the dev-queue[qid] new pointer. If this is done out of order, nvme_kthread will pickup uninitialized q_lock, cq_phase, q_db. A memory barrier before incrementing the dev-queue_count (and assigning the pointer in the array before that) should address this concern. Other thoughts to not create nvme_kthread until all the queues are active. No good, we want to poll during queue creation to detect controller errors and broken interrupts. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] NVMe: Avoid interrupt disable during queue init.
On Fri, 22 May 2015, Parav Pandit wrote: On Fri, May 22, 2015 at 9:53 PM, Keith Busch keith.bu...@intel.com wrote: A memory barrier before incrementing the dev-queue_count (and assigning the pointer in the array before that) should address this concern. Sure. mb() will solve the publisher side problem. RCU is wrapper around mb(). However mb() doesn't solve the issue of q_lock variable getting fetched before if (!nvmeq) condition being executed, by value compilation optimizations in nvme_kthread(). Eh? The value of dev-queue_count prevents the thread's for-loop from iterating that nvmeq before the q_lock is initialized. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] NVMe: Avoid interrupt disable during queue init.
On Fri, 22 May 2015, Parav Pandit wrote: I agree to it that nvmeq won't be null after mb(); That alone is not sufficient. What I have proposed in previous email is, Converting, struct nvme_queue *nvmeq = dev-queues[i]; if (!nvmeq) continue; spin_lock_irq(nvmeq-q_lock); to replace with, struct nvme_queue *nvmeq = rcu_dereference(dev-queues[i]); if (!nvmeq) continue; spin_lock_irq(nvmeq-q_lock); This will prevent fetching content of q_lock before checking for NULL condition. Classic usage of RCU. What the heck are you talking about? The value of dev-queue_count won't even let the thread iterate an nvmeq before q_lock is initialized. We used to rcu protect queue access, but that was to make nvme's make_request_fn safe to surprise removal, not for the polling thread. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] pci: Default MPS tuning, match upstream port
On Mon, 17 Aug 2015, Bjorn Helgaas wrote: On Wed, Jul 29, 2015 at 04:18:53PM -0600, Keith Busch wrote: The new pcie tuning will check the device's MPS against the parent bridge when it is initially added to the pci subsystem, prior to attaching to a driver. If MPS is mismatched, the downstream port is set to the parent bridge's if capable. This is a little confusing (at least, *I'm* confused). You're talking about setting the MPS configuration of the downstream port, but I think you are talking about either a Switch Upstream Port or an Endpoint. Such a port is indeed *downstream* from the parent bridge, but referring to it as a downstream port risks confusing it with the parent bridge, which is either a Switch Downstream Port or a Root Port. Yes, the wording is confusing, yet you've managed to describe my intention, though. :) { int retval; + if (dev-bus-self + pcie_get_mps(dev) != pcie_get_mps(dev-bus-self)) + return; This part looks like it could be in its own separate patch that basically enforces the if we can't safely configure this device's MPS setting, prevent drivers from using the device idea. What happens in this case? Does the device still appear in sysfs and lspci, even if we can't configure its MPS safely? This seems like good place for a dev_warn(). It will remain in the pci topology and all the associated sysfs artifacts and lspic capabilities will be there. You could retune the whole topology from user space with setpci and do a rescan if you were so inclined, but that could cause problems if transactions are in-flight while re-tuning. A dev_warn will be produced when the device is initially scanned as you noted below, but another at this point might be useful to make it clear a driver will not be bound. The above is broken, though, for PCI device's that are not PCI-e, so need to fix that at the very least if we will enforce this. 
- if (mps != p_mps) - dev_warn(dev-dev, Max Payload Size %d, but upstream %s set to %d; if necessary, use \pci=pcie_bus_safe\ and report a bug\n, -mps, pci_name(bridge), p_mps); + if (mps != p_mps) { + if (128 dev-pcie_mpss p_mps) { + dev_warn(dev-dev, + Max Payload Size Supported %d, but upstream %s set to %d. If problems are experienced, try running with pci=pcie_bus_safe\n, + 128 dev-pcie_mpss, pci_name(bridge), p_mps); + return; Isn't this the same case where the above change to pci_bus_add_device() will now decline to add the device? So I think there are really two policy changes: 1) If an device can be configured to match the upstream bridge's MPS setting, do it, and 2) Don't add a device when its MPS setting doesn't match the upstream bridge I'd like those to be separate patches. Ok. + } + pcie_write_mps(dev, p_mps); + dev_info(dev-dev, Max Payload Size set to %4d/%4d (was %4d)\n, + pcie_get_mps(dev), 128 dev-pcie_mpss, mps); + } I assume this hunk is related to the policy change (from do nothing to update the downstream port). Can you split that policy change into its own separate minimal patch? Yes, I'm looking for minimal and specific bisect targets :) I think this will be clearer if you write it as: if (mps == p_mps) return; if (128 dev-pcie_mpss p_mps) { dev_warn(...); return; } pcie_write_mps(...); dev_info(...); Now that this actively updates the MPS setting, it's starting to grow beyond the original pcie_bus_detect_mps() function name. Agreed that's a cleaner flow. Thanks for all the feedback. Will work on a revision this week. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Persistent Reservation API V2
On Tue, 11 Aug 2015, Christoph Hellwig wrote: This series adds support for a simplified Persistent Reservation API to the block layer. The intent is that both in-kernel and userspace consumers can use the API instead of having to hand craft SCSI or NVMe command through the various pass through interfaces. It also adds DM support as getting reservations through dm-multipath is a major pain with the current scheme. NVMe support currently isn't included as I don't have a multihost NVMe setup to test on, but Keith offered to test it and I'll have a patch for it shortly. Hi Christoph, I wrote an nvme implementation and it seems to work as expected with your pr-tests (minor modification to open an nvme target). While API appears to work as designed, there are a few features of NVMe that are unreachable with it. For example, NVMe can ignore existing keys when acquiring a reservation in addition to registering a new key. NVMe can also specify if whether or not a reservation should persist through power-loss. Anyway, I don't think SCSI has these same options. Should this new IOCTL API accommodate the common subset to maintain its simplicity, or make it more expressive for all features? The more important question might be if there are any users requiring these features, and I honestly don't know that right now. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/3] QIB: Removing usage of pcie_set_mps()
From: Dave Jiang dave.ji...@intel.com This is in perperation of un-exporting the pcie_set_mps() function symbol. A driver should not be changing the MPS as that is the responsibility of the PCI subsystem. Signed-off-by: Dave Jiang dave.ji...@intel.com --- drivers/infiniband/hw/qib/qib_pcie.c | 27 +-- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c index 4758a38..b8a2dcd 100644 --- a/drivers/infiniband/hw/qib/qib_pcie.c +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -557,12 +557,11 @@ static void qib_tune_pcie_coalesce(struct qib_devdata *dd) */ static int qib_pcie_caps; module_param_named(pcie_caps, qib_pcie_caps, int, S_IRUGO); -MODULE_PARM_DESC(pcie_caps, Max PCIe tuning: Payload (0..3), ReadReq (4..7)); +MODULE_PARM_DESC(pcie_caps, Max PCIe tuning: ReadReq (4..7)); static void qib_tune_pcie_caps(struct qib_devdata *dd) { struct pci_dev *parent; - u16 rc_mpss, rc_mps, ep_mpss, ep_mps; u16 rc_mrrs, ep_mrrs, max_mrrs; /* Find out supported and configured values for parent (root) */ @@ -575,30 +574,6 @@ static void qib_tune_pcie_caps(struct qib_devdata *dd) if (!pci_is_pcie(parent) || !pci_is_pcie(dd-pcidev)) return; - rc_mpss = parent-pcie_mpss; - rc_mps = ffs(pcie_get_mps(parent)) - 8; - /* Find out supported and configured values for endpoint (us) */ - ep_mpss = dd-pcidev-pcie_mpss; - ep_mps = ffs(pcie_get_mps(dd-pcidev)) - 8; - - /* Find max payload supported by root, endpoint */ - if (rc_mpss ep_mpss) - rc_mpss = ep_mpss; - - /* If Supported greater than limit in module param, limit it */ - if (rc_mpss (qib_pcie_caps 7)) - rc_mpss = qib_pcie_caps 7; - /* If less than (allowed, supported), bump root payload */ - if (rc_mpss rc_mps) { - rc_mps = rc_mpss; - pcie_set_mps(parent, 128 rc_mps); - } - /* If less than (allowed, supported), bump endpoint payload */ - if (rc_mpss ep_mps) { - ep_mps = rc_mpss; - pcie_set_mps(dd-pcidev, 128 ep_mps); - } - /* * Now the Read Request size. 
* No field for max supported, but PCIe spec limits it to 4096, -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] pci: Default MPS tuning, match upstream port
A hot plugged PCI-e device max payload size (MPS) defaults to 0 for 128bytes. The device is not usable if the upstream port is configured to a higher setting. Bus configuration was previously done by arch specific and hot plug code after the root port or bridge was scanned, and default behavior logged a warning without changing the setting if there was a problem. This patch unifies tuning with a default policy that affects only misconfigured downstream devices, and preserves existing end result if a non-default bus tuning is used (ex: pci=pcie_bus_safe). The new pcie tuning will check the device's MPS against the parent bridge when it is initially added to the pci subsystem, prior to attaching to a driver. If MPS is mismatched, the downstream port is set to the parent bridge's if capable. This is safe to change here since the device being configured is not bound to a driver and does not affect previously configured devices in the domain hierarchy. A device incapable of matching the upstream bridge will log a warning message and not bind to a driver. This is the safe option since proper MPS configuration must consider the entire hierarchy between communicating end points, and we can't safely modify a root port's subtree MPS settings while it is in use. Since the bus is configured everytime a bridge is scanned, this potentially creates unnecessary re-walking of an already configured sub-tree, but the code only executes during root port initialization, hot adding a switch, or explicit request to rescan. 
Signed-off-by: Keith Busch keith.bu...@intel.com Cc: Dave Jiang dave.ji...@intel.com Cc: Austin Bolen austin_bo...@dell.com Cc: Myron Stowe mst...@redhat.com Cc: Jon Mason jdma...@kudzu.us Cc: Bjorn Helgaas bhelg...@google.com --- arch/arm/kernel/bios32.c | 12 arch/powerpc/kernel/pci-common.c |7 --- arch/tile/kernel/pci_gx.c |4 arch/x86/pci/acpi.c|9 - drivers/pci/bus.c |4 drivers/pci/hotplug/acpiphp_glue.c |1 - drivers/pci/hotplug/pciehp_pci.c |1 - drivers/pci/hotplug/shpchp_pci.c |1 - drivers/pci/probe.c| 27 ++- include/linux/pci.h|2 -- 10 files changed, 26 insertions(+), 42 deletions(-) diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c index fc1..4fff58e 100644 --- a/arch/arm/kernel/bios32.c +++ b/arch/arm/kernel/bios32.c @@ -537,18 +537,6 @@ void pci_common_init_dev(struct device *parent, struct hw_pci *hw) */ pci_bus_add_devices(bus); } - - list_for_each_entry(sys, head, node) { - struct pci_bus *bus = sys-bus; - - /* Configure PCI Express settings */ - if (bus !pci_has_flag(PCI_PROBE_ONLY)) { - struct pci_bus *child; - - list_for_each_entry(child, bus-children, node) - pcie_bus_configure_settings(child); - } - } } #ifndef CONFIG_PCI_HOST_ITE8152 diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index b9de34d..7f27ffe 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1682,13 +1682,6 @@ void pcibios_scan_phb(struct pci_controller *hose) */ if (ppc_md.pcibios_fixup_phb) ppc_md.pcibios_fixup_phb(hose); - - /* Configure PCI Express settings */ - if (bus !pci_has_flag(PCI_PROBE_ONLY)) { - struct pci_bus *child; - list_for_each_entry(child, bus-children, node) - pcie_bus_configure_settings(child); - } } EXPORT_SYMBOL_GPL(pcibios_scan_phb); diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c index b1df847..67492fb 100644 --- a/arch/tile/kernel/pci_gx.c +++ b/arch/tile/kernel/pci_gx.c @@ -599,10 +599,6 @@ static void fixup_read_and_payload_sizes(struct pci_controller 
*controller) __gxio_mmio_write32(trio_context-mmio_base_mac + reg_offset, rc_dev_cap.word); - /* Configure PCI Express MPS setting. */ - list_for_each_entry(child, root_bus-children, node) - pcie_bus_configure_settings(child); - /* * Set the mac_config register in trio based on the MPS/MRS of the link. */ diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index ff99117..ab5977a 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -478,15 +478,6 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) } } - /* After the PCI-E bus has been walked and all devices discovered, -* configure any settings of the fabric that might be necessary. -*/ - if (bus) { - struct pci_bus *child; - list_for_each_entry(child, bus-children, node
[PATCH 0/3] PCI-e Max Payload Size configuration
This patch series removes arch, driver, and hot-plug specific calls to configure a PCI-e device's MPS setting. The device is instead configured in the generic pci-core layer when the device is initially added, and before it is bound to a driver. The default policy is also changed from do nothing to update the down stream port to match the upstream port if it is capable. Dave Jiang (2): QIB: Removing usage of pcie_set_mps() PCIE: Remove symbol export for pcie_set_mps() Keith Busch (1): pci: Default MPS tuning to match upstream port arch/arm/kernel/bios32.c | 12 arch/powerpc/kernel/pci-common.c |7 --- arch/tile/kernel/pci_gx.c|4 arch/x86/pci/acpi.c |9 - drivers/infiniband/hw/qib/qib_pcie.c | 27 +-- drivers/pci/bus.c|4 drivers/pci/hotplug/acpiphp_glue.c |1 - drivers/pci/hotplug/pciehp_pci.c |1 - drivers/pci/hotplug/shpchp_pci.c |1 - drivers/pci/pci.c|1 - drivers/pci/probe.c | 22 -- include/linux/pci.h |2 -- 12 files changed, 21 insertions(+), 70 deletions(-) -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/3] PCIE: Remove symbol export for pcie_set_mps()
From: Dave Jiang dave.ji...@intel.com The setting of PCIe MPS should be left to the PCI subsystem and not the driver. An ill-configured MPS by the driver could cause the device to not function or destabilize the entire system. Removing the exported symbol. Signed-off-by: Dave Jiang dave.ji...@intel.com --- drivers/pci/pci.c |1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 0008c95..92349ee 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4121,7 +4121,6 @@ int pcie_set_mps(struct pci_dev *dev, int mps) return pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_PAYLOAD, v); } -EXPORT_SYMBOL(pcie_set_mps); /** * pcie_get_minimum_link - determine minimum link settings of a PCI device -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Persistent Reservation API
On Tue, 4 Aug 2015, Christoph Hellwig wrote: NVMe support currently isn't included as I don't have a multihost NVMe setup to test on, but if I can find a volunteer to test it I'm happy to write the code for it. Looks pretty good so far. I'd be happy to give try it out with NVMe subsystems. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Ksummit-discuss] [TECH TOPIC] IRQ affinity
On Wed, 15 Jul 2015, Bart Van Assche wrote: * With blk-mq and scsi-mq optimal performance can only be achieved if the relationship between MSI-X vector and NUMA node does not change over time. This is necessary to allow a blk-mq/scsi-mq driver to ensure that interrupts are processed on the same NUMA node as the node on which the data structures for a communication channel have been allocated. However, today there is no API that allows blk-mq/scsi-mq drivers and irqbalanced to exchange information about the relationship between MSI-X vector ranges and NUMA nodes. We could have low-level drivers provide blk-mq the controller's irq associated with a particular h/w context, and the block layer can provide the context's cpumask to irqbalance with the smp affinity hint. The nvme driver already uses the hwctx cpumask to set hints, but this doesn't seems like it should be a driver responsibility. It currently doesn't work correctly anyway with hot-cpu since blk-mq could rebalance the h/w contexts without syncing with the low-level driver. If we can add this to blk-mq, one additional case to consider is if the same interrupt vector is used with multiple h/w contexts. Blk-mq's cpu assignment needs to be aware of this to prevent sharing a vector across NUMA nodes. The only approach I know of that works today to define IRQ affinity for blk-mq/scsi-mq drivers is to disable irqbalanced and to run a custom script that defines IRQ affinity (see e.g. the spread-mlx4-ib-interrupts attachment of http://thread.gmane.org/gmane.linux.kernel.device-mapper.devel/21312/focus=98409). -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCHv3 3/4] x86/pci: Initial commit for new VMD device driver
The Intel Volume Management Device (VMD) is an integrated endpoint on the platform's PCIe root complex that acts as a host bridge to a secondary PCIe domain. BIOS can reassign one or more root ports to appear within a VMD domain instead of the primary domain. The immediate benefit is that additional PCI-e domains allow more than 256 buses in a system by letting bus number reuse across different domains. VMD domains do not define ACPI _SEG, so to avoid domain clashing with host bridges defining this segment, VMD domains start at 0x1 which is greater than the highest possible 16-bit ACPI defined _SEG. This driver enumerates and enables the domain using the root bus configuration interface provided by the PCI subsystem. The driver provides configuration space accessor functions (pci_ops), bus and memory resources, a chained MSI irq domain, irq_chip implementation, and dma operations necessary to support the domain through the VMD endpoint's interface. VMD routes I/O as follows: 1) Configuration Space: BAR 0 ("CFGBAR") of VMD provides the base address and size for configuration space register access to VMD-owned root ports. It works similarly to MMCONFIG for extended configuration space. Bus numbering is independent and does not conflict with the primary domain. 2) MMIO Space: BARs 2 and 4 ("MEMBAR1" and "MEMBAR2") of VMD provide the base address, size, and type for MMIO register access. These addresses are not translated by VMD hardware; they are simply reservations to be distributed to root ports' memory base/limit registers and subdivided among devices downstream. 3) DMA: To interact appropriately with IOMMU, the source ID DMA read and write requests are translated to the bus-device-function of the VMD endpoint. Otherwise, DMA operates normally without VMD-specific address translation. 4) Interrupts: Part of VMD's BAR 4 is reserved for VMD's MSI-X Table and PBA. 
MSIs from VMD domain devices and ports are remapped to appear if they were issued using one of VMD's MSI-X table entries. Each MSI and MSI-X addresses of VMD-owned devices and ports have a special format where the address refers specific entries in VMD's MSI-X table. As with DMA, the interrupt source id is translated to VMD's bus-device-function. The driver provides its own MSI and MSI-X configuration functions specific to how MSI messages are used within the VMD domain, and it provides an irq_chip for independent IRQ allocation and to relay interrupts from VMD's interrupt handler to the appropriate device driver's handler. 5) Errors: PCIe error message are intercepted by the root ports normally (e.g. AER), except with VMD, system errors (i.e. firmware first) are disabled by default. AER and hotplug interrupts are translated in the same way as endpoint interrupts. 6) VMD does not support INTx interrupts or IO ports. Devices or drivers requiring these features should either not be placed below VMD-owned root ports, or VMD should be disabled by BIOS for such endpoints. Signed-off-by: Keith Busch <keith.bu...@intel.com> --- arch/x86/Kconfig | 17 ++ arch/x86/include/asm/vmd.h | 10 + arch/x86/kernel/apic/msi.c | 38 +++ arch/x86/pci/Makefile |2 + arch/x86/pci/vmd.c | 619 kernel/irq/chip.c |1 + kernel/irq/irqdomain.c |3 + 7 files changed, 690 insertions(+) create mode 100644 arch/x86/include/asm/vmd.h create mode 100644 arch/x86/pci/vmd.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 96d058a..53c53f7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2632,6 +2632,23 @@ config PMC_ATOM def_bool y depends on PCI +config HAVE_VMDDEV + bool + +config VMDDEV + depends on PCI && PCI_DOMAINS && PCI_MSI && GENERIC_MSI_IRQ_DOMAIN && IRQ_DOMAIN_HIERARCHY + tristate "Volume Management Device Driver" + default N + select HAVE_VMDDEV + ---help--- + Adds support for the Intel Volume Manage Device (VMD). 
VMD is + a secondary PCI host bridge that allows PCI Express root ports, + and devices attached to them, to be removed from the default PCI + domain and placed within the VMD domain. This provides additional + bus resources to allow more devices than are otherwise possible + with a single domain. If your system provides one of these and + has devices attached to it, say "Y". + source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/include/asm/vmd.h b/arch/x86/include/asm/vmd.h new file mode 100644 index 000..35429f0 --- /dev/null +++ b/arch/x86/include/asm/vmd.h @@ -0,0 +1,10 @@ +#ifndef _ASM_X86_VMD_H +#define _ASM_X86_VMD_H + +#ifdef CONFIG_HAVE_VMDDEV +struct irq_domain_ops; +struct irq_domain *vmd_create_irq_d
[RFC PATCHv3 4/4] pciutils: Allow 32-bit domains
PCI-e segments will continue to use the lower 16 bits as required by ACPI. Special domains may use the full 32-bits. Signed-off-by: Keith Busch <keith.bu...@intel.com> --- lib/filter.c |2 +- lib/pci.h|2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/filter.c b/lib/filter.c index d4254a0..075dc2f 100644 --- a/lib/filter.c +++ b/lib/filter.c @@ -45,7 +45,7 @@ pci_filter_parse_slot_v33(struct pci_filter *f, char *str) if (str[0] && strcmp(str, "*")) { long int x = strtol(str, , 16); - if ((e && *e) || (x < 0 || x > 0x)) + if ((e && *e) || (x < 0)) return "Invalid domain number"; f->domain = x; } diff --git a/lib/pci.h b/lib/pci.h index 10ba831..7e42765 100644 --- a/lib/pci.h +++ b/lib/pci.h @@ -119,7 +119,7 @@ struct pci_param *pci_walk_params(struct pci_access *acc, struct pci_param *prev struct pci_dev { struct pci_dev *next;/* Next device in the chain */ - u16 domain; /* PCI domain (host bridge) */ + int32_t domain; /* PCI domain (host bridge) */ u8 bus, dev, func; /* Bus inside domain, device and function */ /* These fields are set by pci_fill_info() */ -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCHv3 0/4] Driver for new VMD device
Here's version 3 for the VMD device driver, and overview of what changed from v2: >From review discussions, we discovered potential clashes in domain numbering (thanks, Bjorn). This new version avoids that clash by using domain numbers outside the ACPI defined _SEG range. Domains are purely a software concept, so there is no need to constrain domains to the possible segment range. Since this lets domain numbers exceed 16 bit domains, PATCH 4/4 updates pciutils to understand this so 'lspci' and 'setpci' will work as expected with these types of domains. This is not a kernel patch, but wasn't sure if it needs to be sent separately or to a different list, so it's included here. After testing configurations with constrained resources, we discovered pci probe made the previously and perfectly reasonable assumption that a pci domain provides 256 buses, but that's not always the case with these domains. PATCH 1/4 addresses the reduced resource conflicts that may occur during the initial scan. PATCH 2/4 provides a generic interface to allow pci domains to define DMA operations specific to their domain. This seemed better than tying the feature to this new VMD device. An alternative suggestion for future consideration was to use host_bridge specific operations when those are provided when that option is implemented. The main VMD patch in 3/4 is updated to address reviewer comments and a bug fixes discovered during testing. Specific fixes and updates include: Lockdep inversion detection with irq flow handler and msi activation; replaced irq flow handler's locking with rcu list. Out of range configuration access from constrained CFGBAR. Improved description in changelog and Kconfig describing the additional bus resource benefit this device provides. Proper use of 'pci_add_resource()'. Removed unnecessary NULL checking. Simplified domain enumeration and removal by calling pci-core API's for scan and root bus removal. 
Use "module_pci_driver()" rather than "module_init()" Subscribe to the "new" domain specific operations rather than defining this as a PCI FIXUP. Fixed memory leak if irq_domain creation failed. Keith Busch (4): pci: skip child bus with conflicting resources x86/pci: allow pci domain specific dma ops x86/pci: Initial commit for new VMD device driver pciutils: Allow 32-bit domains arch/x86/Kconfig | 17 ++ arch/x86/include/asm/device.h | 10 + arch/x86/include/asm/vmd.h| 10 + arch/x86/kernel/apic/msi.c| 38 +++ arch/x86/pci/Makefile |2 + arch/x86/pci/common.c | 38 +++ arch/x86/pci/vmd.c| 619 + drivers/pci/probe.c | 10 +- kernel/irq/chip.c |1 + kernel/irq/irqdomain.c|3 + 10 files changed, 746 insertions(+), 2 deletions(-) create mode 100644 arch/x86/include/asm/vmd.h create mode 100644 arch/x86/pci/vmd.c -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCHv3 2/4] x86-pci: allow pci domain specific dma ops
New x86 pci h/w will require dma operations specific to that domain. This patch allows those domains to register their operations, and sets devices as they are discovered in that domain to use them. Signed-off-by: Keith Busch <keith.bu...@intel.com> --- arch/x86/include/asm/device.h | 10 ++ arch/x86/pci/common.c | 38 ++ 2 files changed, 48 insertions(+) diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 03dd729..3b23897 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -10,6 +10,16 @@ struct dev_archdata { #endif }; +#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS) +struct dma_domain { + struct list_head node; + struct dma_map_ops *dma_ops; + int domain_nr; +}; +extern void add_dma_domain(struct dma_domain *domain); +extern void del_dma_domain(struct dma_domain *domain); +#endif + struct pdev_archdata { }; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index dc78a4a..524183d 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -641,6 +641,43 @@ unsigned int pcibios_assign_all_busses(void) return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ?
1 : 0; } +#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS) +LIST_HEAD(dma_domain_list); +DEFINE_SPINLOCK(dma_domain_list_lock); + +void add_dma_domain(struct dma_domain *domain) +{ + spin_lock(_domain_list_lock); + list_add(>node, _domain_list); + spin_unlock(_domain_list_lock); +} +EXPORT_SYMBOL_GPL(add_dma_domain); + +void del_dma_domain(struct dma_domain *domain) +{ + spin_lock(_domain_list_lock); + list_del(>node); + spin_unlock(_domain_list_lock); +} +EXPORT_SYMBOL_GPL(del_dma_domain); + +static void set_dma_domain_ops(struct pci_dev *pdev) +{ + struct dma_domain *domain; + + spin_lock(_domain_list_lock); + list_for_each_entry(domain, _domain_list, node) { + if (pci_domain_nr(pdev->bus) == domain->domain_nr) { + pdev->dev.archdata.dma_ops = domain->dma_ops; + break; + } + } + spin_unlock(_domain_list_lock); +} +#else +static void set_dma_domain_ops(struct pci_dev *pdev) {} +#endif + int pcibios_add_device(struct pci_dev *dev) { struct setup_data *data; @@ -670,6 +707,7 @@ int pcibios_add_device(struct pci_dev *dev) pa_data = data->next; iounmap(data); } + set_dma_domain_ops(dev); return 0; } -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCHv3 1/4] pci: skip child bus with conflicting resources
And use the max bus resource from the parent rather than assume 255. Signed-off-by: Keith Busch <keith.bu...@intel.com> --- drivers/pci/probe.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8361d27..1cb3be7 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -856,7 +856,10 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass) if (!child) goto out; child->primary = primary; - pci_bus_insert_busn_res(child, secondary, subordinate); + if (!pci_bus_insert_busn_res(child, secondary, subordinate)) { + pci_remove_bus(child); + goto out; + } child->bridge_ctl = bctl; } @@ -896,7 +899,10 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass) child = pci_add_new_bus(bus, dev, max+1); if (!child) goto out; - pci_bus_insert_busn_res(child, max+1, 0xff); + if (!pci_bus_insert_busn_res(child, max+1, bus->busn_res.end)) { + pci_remove_bus(child); + goto out; + } } max++; buses = (buses & 0xff00) -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv2] blk-integrity: empty implementation when disabled
This patch moves the blk_integrity_payload definition outside the CONFIG_BLK_DEV_INTERITY dependency and provides empty function implementations when the kernel configuration disables integrity extensions. This simplifies drivers that make use of these to map user data so they don't need to repeat the same configuration checks. Signed-off-by: Keith Busch <keith.bu...@intel.com> --- v1 -> v2: Fixed corrupted patch and subject line spelling error include/linux/bio.h | 32 ++-- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index b9b6e04..f0c46d0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -318,16 +318,6 @@ enum bip_flags { BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ }; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) -{ - if (bio->bi_rw & REQ_INTEGRITY) - return bio->bi_integrity; - - return NULL; -} - /* * bio integrity payload */ @@ -349,6 +339,16 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ }; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) +{ + if (bio->bi_rw & REQ_INTEGRITY) + return bio->bi_integrity; + + return NULL; +} + static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { struct bio_integrity_payload *bip = bio_integrity(bio); @@ -795,6 +795,18 @@ static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) return false; } +static inline void *bio_integrity_alloc(struct bio * bio, gfp_t gfp, + unsigned int nr) +{ + return NULL; +} + +static inline int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + return 0; +} + #endif /* CONFIG_BLK_DEV_INTEGRITY */ #endif /* CONFIG_BLOCK */ -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to 
majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] blk-integrity: empty implementation when disabled
This patch moves the blk_integrity_payload definition outside the CONFIG_BLK_DEV_INTEGRITY dependency and provides empty function implementations when the kernel configuration disables integrity extensions. This simplifies drivers that make use of these to map user data so they don't need to repeat the same configuration checks. Signed-off-by: Keith Busch <keith.bu...@intel.com> --- include/linux/bio.h | 32 ++-- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index b9b6e04..f0c46d0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -318,16 +318,6 @@ enum bip_flags { BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ }; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) -{ - if (bio->bi_rw & REQ_INTEGRITY) - return bio->bi_integrity; - - return NULL; -} - /* * bio integrity payload */ @@ -349,6 +339,16 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ }; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) +{ + if (bio->bi_rw & REQ_INTEGRITY) + return bio->bi_integrity; + - return NULL; +} + static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { struct bio_integrity_payload *bip = bio_integrity(bio); @@ -795,6 +795,18 @@ static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) return false; } +static inline void *bio_integrity_alloc(struct bio * bio, gfp_t gfp, + unsigned int nr) +{ + return NULL; +} + +static inline int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + return 0; +} + #endif /* CONFIG_BLK_DEV_INTEGRITY */ #endif /* CONFIG_BLOCK */ -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv5 1/6] msi: Relax msi_domain_alloc() to support parentless MSI irqdomains
From: Liu Jiang <jiang@linux.intel.com> Previously msi_domain_alloc() assumes MSI irqdomains always have parent irqdomains, but that's not true for the new Intel VMD devices. So relax msi_domain_alloc() to support parentless MSI irqdomains. Signed-off-by: Jiang Liu <jiang@linux.intel.com> Tested-by: Keith Busch <keith.bu...@intel.com> --- kernel/irq/msi.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6b0c0b7..5e15cb4 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -109,9 +109,11 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, if (irq_find_mapping(domain, hwirq) > 0) return -EEXIST; - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); - if (ret < 0) - return ret; + if (domain->parent) { + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret < 0) + return ret; + } for (i = 0; i < nr_irqs; i++) { ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg); -- 2.6.2.307.g37023ba -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv5 2/6] pci: skip child bus with conflicting resources
And use the max bus resource from the parent rather than assume 255. Signed-off-by: Keith Busch <keith.bu...@intel.com> --- drivers/pci/probe.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index f14a970..ae5a4b3 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -856,7 +856,10 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass) if (!child) goto out; child->primary = primary; - pci_bus_insert_busn_res(child, secondary, subordinate); + if (!pci_bus_insert_busn_res(child, secondary, subordinate)) { + pci_remove_bus(child); + goto out; + } child->bridge_ctl = bctl; } @@ -896,7 +899,10 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass) child = pci_add_new_bus(bus, dev, max+1); if (!child) goto out; - pci_bus_insert_busn_res(child, max+1, 0xff); + if (!pci_bus_insert_busn_res(child, max+1, bus->busn_res.end)) { + pci_remove_bus(child); + goto out; + } } max++; buses = (buses & 0xff00) -- 2.6.2.307.g37023ba -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/