Re: [PATCH] NVMe: Fixed race between nvme_thread probe path.
On Thu, Jun 18, 2015 at 04:13:50PM +0530, Parav Pandit wrote: Kernel thread nvme_thread and driver load process can be executing in parallel on different CPU. This leads to race condition whenever nvme_alloc_queue() instructions are executed out of order that can reflects incorrect value for nvme_thread. Memory barrier in nvme_alloc_queue() ensures that it maintains the order and and data dependency read barrier in reader thread ensures that cpu cache is synced. Signed-off-by: Parav Pandit parav.pan...@avagotech.com --- drivers/block/nvme-core.c | 12 ++-- 1 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 5961ed7..90fb0ce 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1403,8 +1403,10 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq-q_db = dev-dbs[qid * 2 * dev-db_stride]; nvmeq-q_depth = depth; nvmeq-qid = qid; - dev-queue_count++; dev-queues[qid] = nvmeq; + /* update queues first before updating queue_count */ + smp_wmb(); + dev-queue_count++; return nvmeq; This has been applied already as an explicit mb() @@ -2073,7 +2075,13 @@ static int nvme_kthread(void *data) continue; } for (i = 0; i dev-queue_count; i++) { - struct nvme_queue *nvmeq = dev-queues[i]; + struct nvme_queue *nvmeq; + + /* make sure to read queue_count before + * traversing queues. + */ + smp_read_barrier_depends(); + nvmeq = dev-queues[i]; if (!nvmeq) continue; spin_lock_irq(nvmeq-q_lock); I don't think this is necessary. If queue_count is incremented while in this loop, it will be picked up the next time the kthread runs -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] NVMe: Fixed race between nvme_thread probe path.
+ smp_wmb(); This has been applied already as an explicit mb() Since these structure is only accessible by software, won't smp_wmb() sufficient enough? Seems reasonable to me -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] VMD: Attach vmd resources to parent domain's resource tree
On Tue, Feb 23, 2016 at 11:16:11PM +, Keith Busch wrote: > On Tue, Feb 23, 2016 at 02:50:13PM -0700, Jon Derrick wrote: > > This patch attaches the new VMD domain's resources to the VMD device's > > resources. This allows /proc/iomem to display a more complete picture. > > > > Before: > > c000-c1ff : :5d:05.5 > > c200-c3ff : :5d:05.5 > > c201-c2013fff : nvme > > c400-c40f : :5d:05.5 > > > > After: > > c000-c1ff : :5d:05.5 > > c000-001f : VMD CFGBAR > > c200-c3ff : :5d:05.5 > > c200-c3ff : VMD MEMBAR1 > > c200-c22f : PCI Bus 1:01 > > c200-c200 : 1:01:00.0 > > c201-c2013fff : 1:01:00.0 > > c201-c2013fff : nvme > > c230-c24f : PCI Bus 1:01 > > c400-c40f : :5d:05.5 > > c4002000-c40f : VMD MEMBAR2 > > I think we should drop the CFGBAR from here since that's a bus resource > rather than IO memory. Otherwise, this looks good and useful. Yes that entry doesn't make much sense for iomem does it. I'll drop that part. > > I think in the near future we should come up with a better name than > "VMD MEMBAR" to help distinguish multiple VMD's in a system. Seconded. I'm fine with naming it by its PCI designator, eg 1:01:00.0, and maybe a bar designator after that (or not).
[PATCH] VMD: Attach vmd resources to parent domain's resource tree
This patch attaches the new VMD domain's resources to the VMD device's resources. This allows /proc/iomem to display a more complete picture. Before: c000-c1ff : :5d:05.5 c200-c3ff : :5d:05.5 c201-c2013fff : nvme c400-c40f : :5d:05.5 After: c000-c1ff : :5d:05.5 c000-001f : VMD CFGBAR c200-c3ff : :5d:05.5 c200-c3ff : VMD MEMBAR1 c200-c22f : PCI Bus 1:01 c200-c200 : 1:01:00.0 c201-c2013fff : 1:01:00.0 c201-c2013fff : nvme c230-c24f : PCI Bus 1:01 c400-c40f : :5d:05.5 c4002000-c40f : VMD MEMBAR2 Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- arch/x86/pci/vmd.c | 19 +++ 1 file changed, 19 insertions(+) diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c index d57e480..06490da 100644 --- a/arch/x86/pci/vmd.c +++ b/arch/x86/pci/vmd.c @@ -503,6 +503,20 @@ static struct pci_ops vmd_ops = { .write = vmd_pci_write, }; +static void vmd_attach_resources(struct vmd_dev *vmd) +{ + vmd->dev->resource[VMD_CFGBAR].child = >resources[0]; + vmd->dev->resource[VMD_MEMBAR1].child = >resources[1]; + vmd->dev->resource[VMD_MEMBAR2].child = >resources[2]; +} + +static void vmd_detach_resources(struct vmd_dev *vmd) +{ + vmd->dev->resource[VMD_CFGBAR].child = NULL; + vmd->dev->resource[VMD_MEMBAR1].child = NULL; + vmd->dev->resource[VMD_MEMBAR2].child = NULL; +} + /* * VMD domains start at 0x1000 to not clash with ACPI _SEG domains. */ @@ -530,6 +544,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd) .start = res->start, .end = (resource_size(res) >> 20) - 1, .flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED, + .parent = res, }; res = >dev->resource[VMD_MEMBAR1]; @@ -542,6 +557,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd) .start = res->start, .end = res->end, .flags = flags, + .parent = res, }; res = >dev->resource[VMD_MEMBAR2]; @@ -554,6 +570,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd) .start = res->start + 0x2000, .end = res->end, .flags = flags, + .parent = res, }; sd->domain = vmd_find_free_domain(); @@ -578,6 +595,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd) return -ENODEV; } + vmd_attach_resources(vmd); vmd_setup_dma_ops(vmd); dev_set_msi_domain(>bus->dev, vmd->irq_domain); pci_rescan_bus(vmd->bus); @@ -674,6 +692,7 @@ static void vmd_remove(struct pci_dev *dev) { struct vmd_dev *vmd = pci_get_drvdata(dev); + vmd_detach_resources(vmd); pci_set_drvdata(dev, NULL); sysfs_remove_link(>dev->dev.kobj, "domain"); pci_stop_root_bus(vmd->bus); -- 1.8.3.1
Re: [PATCHv3 4/4] MAINTAINERS: Remove powerpc's opal match
Thanks everyone. Sorry about the mess :) On 02/15/2017 10:23 PM, Michael Ellerman wrote: > Jon Derrick <jonathan.derr...@intel.com> writes: > >> PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks >> like the 'arch/powerpc' file pattern should be enough to match powerpc >> opal code by itself. Remove the opal regex pattern from powerpc. > > We thought of it first. > > Can't you just rename your driver, Opal Storage Specification, so "oss", > that should be pretty unique? > > ... :) > > I don't like this version, but I'll merge the one from Stewart which > drops the pattern and adds the paths for the existing drivers. > > cheers > >> diff --git a/MAINTAINERS b/MAINTAINERS >> index b983b25..430dd02 100644 >> --- a/MAINTAINERS >> +++ b/MAINTAINERS >> @@ -7404,7 +7404,6 @@ F: drivers/pci/hotplug/pnv_php.c >> F: drivers/pci/hotplug/rpa* >> F: drivers/scsi/ibmvscsi/ >> F: tools/testing/selftests/powerpc >> -N: opal >> N: /pmac >> N: powermac >> N: powernv >> -- >> 1.8.3.1
[PATCH 2/4] block/sed: Add helper to qualify response tokens
Add helper which verifies the response token is valid and matches the expected value. Merges token_type and response_get_token. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- block/sed-opal.c | 61 +++- 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 77623ad..d6dd604 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -591,48 +591,25 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) return 0; } -static enum opal_response_token token_type(const struct parsed_resp *resp, - int n) +static const struct opal_resp_tok *response_get_token( + const struct parsed_resp *resp, + int n) { const struct opal_resp_tok *tok; if (n >= resp->num) { pr_err("Token number doesn't exist: %d, resp: %d\n", n, resp->num); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } tok = >toks[n]; if (tok->len == 0) { pr_err("Token length must be non-zero\n"); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } - return tok->type; -} - -/* - * This function returns 0 in case of invalid token. One should call - * token_type() first to find out if the token is valid or not. - */ -static enum opal_token response_get_token(const struct parsed_resp *resp, - int n) -{ - const struct opal_resp_tok *tok; - - if (n >= resp->num) { - pr_err("Token number doesn't exist: %d, resp: %d\n", - n, resp->num); - return 0; - } - - tok = >toks[n]; - if (tok->len == 0) { - pr_err("Token length must be non-zero\n"); - return 0; - } - - return tok->pos[0]; + return tok; } static ssize_t response_parse_tiny(struct opal_resp_tok *tok, @@ -851,20 +828,32 @@ static u64 response_get_u64(const struct parsed_resp *resp, int n) return resp->toks[n].stored.u; } +static bool response_token_matches(const struct opal_resp_tok *token, u8 match) +{ + if (IS_ERR_OR_NULL(token) || + token->type != OPAL_DTA_TOKENID_TOKEN || + token->pos[0] != match) + return false; + return true; +} + static u8 response_status(const struct parsed_resp *resp) { - if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN && - response_get_token(resp, 0) == OPAL_ENDOFSESSION) { + const struct opal_resp_tok *tok; + + tok = response_get_token(resp, 0); + if (response_token_matches(tok, OPAL_ENDOFSESSION)) return 0; - } if (resp->num < 5) return DTAERROR_NO_METHOD_STATUS; - if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN || - token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN || - response_get_token(resp, resp->num - 1) != OPAL_ENDLIST || - response_get_token(resp, resp->num - 5) != OPAL_STARTLIST) + tok = response_get_token(resp, resp->num - 5); + if (!response_token_matches(tok, OPAL_STARTLIST)) + return DTAERROR_NO_METHOD_STATUS; + + tok = response_get_token(resp, resp->num - 1); + if (!response_token_matches(tok, OPAL_ENDLIST)) return DTAERROR_NO_METHOD_STATUS; return response_get_u64(resp, resp->num - 4); -- 1.8.3.1
[PATCH 3/4] block/sed: Check received header lengths
Add a buffer size check against discovery and response header lengths before we loop over their buffers. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- block/sed-opal.c | 35 +-- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index d6dd604..addf89e 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev) const struct d0_header *hdr = (struct d0_header *)dev->resp; const u8 *epos = dev->resp, *cpos = dev->resp; u16 comid = 0; + u32 hlen = be32_to_cpu(hdr->length); - print_buffer(dev->resp, be32_to_cpu(hdr->length)); + print_buffer(dev->resp, hlen); - epos += be32_to_cpu(hdr->length); /* end of buffer */ + if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n", + sizeof(*hdr), hlen, IO_BUFFER_LENGTH); + return -EFAULT; + } + + epos += hlen; /* end of buffer */ cpos += sizeof(*hdr); /* current position on buffer */ while (cpos < epos && supported) { @@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length, int total; ssize_t token_length; const u8 *pos; + u32 clen, plen, slen; if (!buf) return -EFAULT; @@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length, pos = buf; pos += sizeof(*hdr); - pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", -be32_to_cpu(hdr->cp.length), -be32_to_cpu(hdr->pkt.length), -be32_to_cpu(hdr->subpkt.length)); - - if (hdr->cp.length == 0 || hdr->pkt.length == 0 || - hdr->subpkt.length == 0) { - pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n", - be32_to_cpu(hdr->cp.length), - be32_to_cpu(hdr->pkt.length), - be32_to_cpu(hdr->subpkt.length)); + clen = be32_to_cpu(hdr->cp.length); + plen = be32_to_cpu(hdr->pkt.length); + slen = be32_to_cpu(hdr->subpkt.length); + pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n", +clen, plen, slen); + + if (clen == 0 || plen == 0 || slen == 0 || + (u64)clen + plen + slen > IO_BUFFER_LENGTH) { + pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); print_buffer(pos, sizeof(*hdr)); return -EINVAL; } @@ -743,7 +750,7 @@ static int response_parse(const u8 *buf, size_t length, return -EFAULT; iter = resp->toks; - total = be32_to_cpu(hdr->subpkt.length); + total = slen; print_buffer(pos, total); while (total > 0) { if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */ -- 1.8.3.1
[PATCH 4/4] MAINTAINERS: Remove powerpc's opal match
PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks like the 'arch/powerpc' file pattern should be enough to match powerpc opal code by itself. Remove the opal regex pattern from powerpc. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b983b25..430dd02 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7404,7 +7404,6 @@ F:drivers/pci/hotplug/pnv_php.c F: drivers/pci/hotplug/rpa* F: drivers/scsi/ibmvscsi/ F: tools/testing/selftests/powerpc -N: opal N: /pmac N: powermac N: powernv -- 1.8.3.1
[PATCH 1/4] block/sed: Use ssize_t on atom parsers to return errors
The short atom parser can return an errno from decoding but does not currently return the error as a signed value. Convert all of the parsers to ssize_t. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- block/sed-opal.c | 28 ++-- include/linux/sed-opal.h | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index e95b8a5..77623ad 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -635,8 +635,8 @@ static enum opal_token response_get_token(const struct parsed_resp *resp, return tok->pos[0]; } -static size_t response_parse_tiny(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_tiny(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -652,8 +652,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_short(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_short(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1; @@ -665,7 +665,7 @@ static size_t response_parse_short(struct opal_resp_tok *tok, tok->type = OPAL_DTA_TOKENID_SINT; } else { u64 u_integer = 0; - int i, b = 0; + ssize_t i, b = 0; tok->type = OPAL_DTA_TOKENID_UINT; if (tok->len > 9) { @@ -682,8 +682,8 @@ static size_t response_parse_short(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_medium(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_medium(struct opal_resp_tok *tok, +const u8 *pos) { tok->pos = pos; tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; @@ -699,8 +699,8 @@ static size_t response_parse_medium(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_long(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_long(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; @@ -716,8 +716,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_token(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_token(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -734,7 +734,7 @@ static int response_parse(const u8 *buf, size_t length, struct opal_resp_tok *iter; int num_entries = 0; int total; - size_t token_length; + ssize_t token_length; const u8 *pos; if (!buf) @@ -780,8 +780,8 @@ static int response_parse(const u8 *buf, size_t length, else /* TOKEN */ token_length = response_parse_token(iter, pos); - if (token_length == -EINVAL) - return -EINVAL; + if (token_length < 0) + return token_length; pos += token_length; total -= token_length; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 205d520..4fc72d4 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -69,7 +69,7 @@ enum opal_response_token { */ struct opal_resp_tok { const u8 *pos; - size_t len; + ssize_t len; enum opal_response_token type; enum opal_atom_width width; union { -- 1.8.3.1
[PATCHv3 4/4] MAINTAINERS: Remove powerpc's opal match
PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks like the 'arch/powerpc' file pattern should be enough to match powerpc opal code by itself. Remove the opal regex pattern from powerpc. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b983b25..430dd02 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7404,7 +7404,6 @@ F:drivers/pci/hotplug/pnv_php.c F: drivers/pci/hotplug/rpa* F: drivers/scsi/ibmvscsi/ F: tools/testing/selftests/powerpc -N: opal N: /pmac N: powermac N: powernv -- 1.8.3.1
[PATCHv3 1/4] block/sed: Use ssize_t on atom parsers to return errors
The short atom parser can return an errno from decoding but does not currently return the error as a signed value. Convert all of the parsers to ssize_t. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- block/sed-opal.c | 28 ++-- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index e95b8a5..77623ad 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -635,8 +635,8 @@ static enum opal_token response_get_token(const struct parsed_resp *resp, return tok->pos[0]; } -static size_t response_parse_tiny(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_tiny(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -652,8 +652,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_short(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_short(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1; @@ -665,7 +665,7 @@ static size_t response_parse_short(struct opal_resp_tok *tok, tok->type = OPAL_DTA_TOKENID_SINT; } else { u64 u_integer = 0; - int i, b = 0; + ssize_t i, b = 0; tok->type = OPAL_DTA_TOKENID_UINT; if (tok->len > 9) { @@ -682,8 +682,8 @@ static size_t response_parse_short(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_medium(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_medium(struct opal_resp_tok *tok, +const u8 *pos) { tok->pos = pos; tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; @@ -699,8 +699,8 @@ static size_t response_parse_medium(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_long(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_long(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; @@ -716,8 +716,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_token(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_token(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -734,7 +734,7 @@ static int response_parse(const u8 *buf, size_t length, struct opal_resp_tok *iter; int num_entries = 0; int total; - size_t token_length; + ssize_t token_length; const u8 *pos; if (!buf) @@ -780,8 +780,8 @@ static int response_parse(const u8 *buf, size_t length, else /* TOKEN */ token_length = response_parse_token(iter, pos); - if (token_length == -EINVAL) - return -EINVAL; + if (token_length < 0) + return token_length; pos += token_length; total -= token_length; -- 1.8.3.1
Re: [PATCH 3/4] block/sed: Check received header lengths
On 02/15/2017 11:15 AM, Scott Bauer wrote: > On Wed, Feb 15, 2017 at 12:38:53PM -0700, Jon Derrick wrote: >> Add a buffer size check against discovery and response header lengths >> before we loop over their buffers. >> >> Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> >> --- >> block/sed-opal.c | 35 +-- >> 1 file changed, 21 insertions(+), 14 deletions(-) >> >> diff --git a/block/sed-opal.c b/block/sed-opal.c >> index d6dd604..addf89e 100644 >> --- a/block/sed-opal.c >> +++ b/block/sed-opal.c >> @@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev) >> const struct d0_header *hdr = (struct d0_header *)dev->resp; >> const u8 *epos = dev->resp, *cpos = dev->resp; >> u16 comid = 0; >> +u32 hlen = be32_to_cpu(hdr->length); >> >> -print_buffer(dev->resp, be32_to_cpu(hdr->length)); >> +print_buffer(dev->resp, hlen); >> >> -epos += be32_to_cpu(hdr->length); /* end of buffer */ >> +if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { >> +pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n", >> +sizeof(*hdr), hlen, IO_BUFFER_LENGTH); >> +return -EFAULT; >> +} >> + >> +epos += hlen; /* end of buffer */ >> cpos += sizeof(*hdr); /* current position on buffer */ >> >> while (cpos < epos && supported) { >> @@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length, >> int total; >> ssize_t token_length; >> const u8 *pos; >> +u32 clen, plen, slen; >> >> if (!buf) >> return -EFAULT; >> @@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length, >> pos = buf; >> pos += sizeof(*hdr); >> >> -pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", >> - be32_to_cpu(hdr->cp.length), >> - be32_to_cpu(hdr->pkt.length), >> - be32_to_cpu(hdr->subpkt.length)); >> - >> -if (hdr->cp.length == 0 || hdr->pkt.length == 0 || >> -hdr->subpkt.length == 0) { >> -pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n", >> - be32_to_cpu(hdr->cp.length), >> - be32_to_cpu(hdr->pkt.length), >> - be32_to_cpu(hdr->subpkt.length)); >> +clen = be32_to_cpu(hdr->cp.length); >> +plen = be32_to_cpu(hdr->pkt.length); >> +slen = be32_to_cpu(hdr->subpkt.length); >> +pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n", >> + clen, plen, slen); >> + >> +if (clen == 0 || plen == 0 || slen == 0 || >> +(u64)clen + plen + slen > IO_BUFFER_LENGTH) { > > I think we're over calculating here. From my understanding of the spec: > TCG_Storage_Architecture_Core_Spec_v2.01_r1.00.pdf > 3.2.3 > ComPackets, Packets & Subpackets > > The Comp packet size should the size of the packet, and the subpackets. > The "packet" should be the size of all the subpackets. > And lastly the subpacket should be the size its payload. Yep I forgot that. Good catch! > > if ((u64) slen + sizeof(*hdr) > IO_BUFFER_LENGTH) > > Since we never need the com packet length or pkt length (we never use them > anywhere) I think the above check is okay. Let me know if i'm wrong though, > or you have a different understanding of the lengths. Agreed > > > > > > > >
[PATCHv2 3/4] block/sed: Check received header lengths
Add a buffer size check against discovery and response header lengths before we loop over their buffers. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- block/sed-opal.c | 35 +-- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index d6dd604..addf89e 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev) const struct d0_header *hdr = (struct d0_header *)dev->resp; const u8 *epos = dev->resp, *cpos = dev->resp; u16 comid = 0; + u32 hlen = be32_to_cpu(hdr->length); - print_buffer(dev->resp, be32_to_cpu(hdr->length)); + print_buffer(dev->resp, hlen); - epos += be32_to_cpu(hdr->length); /* end of buffer */ + if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n", + sizeof(*hdr), hlen, IO_BUFFER_LENGTH); + return -EFAULT; + } + + epos += hlen; /* end of buffer */ cpos += sizeof(*hdr); /* current position on buffer */ while (cpos < epos && supported) { @@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length, int total; ssize_t token_length; const u8 *pos; + u32 clen, plen, slen; if (!buf) return -EFAULT; @@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length, pos = buf; pos += sizeof(*hdr); - pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", -be32_to_cpu(hdr->cp.length), -be32_to_cpu(hdr->pkt.length), -be32_to_cpu(hdr->subpkt.length)); - - if (hdr->cp.length == 0 || hdr->pkt.length == 0 || - hdr->subpkt.length == 0) { - pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n", - be32_to_cpu(hdr->cp.length), - be32_to_cpu(hdr->pkt.length), - be32_to_cpu(hdr->subpkt.length)); + clen = be32_to_cpu(hdr->cp.length); + plen = be32_to_cpu(hdr->pkt.length); + slen = be32_to_cpu(hdr->subpkt.length); + pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n", +clen, plen, slen); + + if (clen == 0 || plen == 0 || slen == 0 || + (u64)clen + plen + slen > IO_BUFFER_LENGTH) { + pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); print_buffer(pos, sizeof(*hdr)); return -EINVAL; } @@ -743,7 +750,7 @@ static int response_parse(const u8 *buf, size_t length, return -EFAULT; iter = resp->toks; - total = be32_to_cpu(hdr->subpkt.length); + total = slen; print_buffer(pos, total); while (total > 0) { if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */ -- 1.8.3.1
[PATCHv2 4/4] MAINTAINERS: Remove powerpc's opal match
PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks like the 'arch/powerpc' file pattern should be enough to match powerpc opal code by itself. Remove the opal regex pattern from powerpc. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b983b25..430dd02 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7404,7 +7404,6 @@ F:drivers/pci/hotplug/pnv_php.c F: drivers/pci/hotplug/rpa* F: drivers/scsi/ibmvscsi/ F: tools/testing/selftests/powerpc -N: opal N: /pmac N: powermac N: powernv -- 1.8.3.1
[PATCHv2 1/4] block/sed: Use ssize_t on atom parsers to return errors
The short atom parser can return an errno from decoding but does not currently return the error as a signed value. Convert all of the parsers to ssize_t. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- block/sed-opal.c | 28 ++-- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index e95b8a5..77623ad 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -635,8 +635,8 @@ static enum opal_token response_get_token(const struct parsed_resp *resp, return tok->pos[0]; } -static size_t response_parse_tiny(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_tiny(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -652,8 +652,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_short(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_short(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1; @@ -665,7 +665,7 @@ static size_t response_parse_short(struct opal_resp_tok *tok, tok->type = OPAL_DTA_TOKENID_SINT; } else { u64 u_integer = 0; - int i, b = 0; + ssize_t i, b = 0; tok->type = OPAL_DTA_TOKENID_UINT; if (tok->len > 9) { @@ -682,8 +682,8 @@ static size_t response_parse_short(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_medium(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_medium(struct opal_resp_tok *tok, +const u8 *pos) { tok->pos = pos; tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; @@ -699,8 +699,8 @@ static size_t response_parse_medium(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_long(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_long(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; @@ -716,8 +716,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_token(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_token(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -734,7 +734,7 @@ static int response_parse(const u8 *buf, size_t length, struct opal_resp_tok *iter; int num_entries = 0; int total; - size_t token_length; + ssize_t token_length; const u8 *pos; if (!buf) @@ -780,8 +780,8 @@ static int response_parse(const u8 *buf, size_t length, else /* TOKEN */ token_length = response_parse_token(iter, pos); - if (token_length == -EINVAL) - return -EINVAL; + if (token_length < 0) + return token_length; pos += token_length; total -= token_length; -- 1.8.3.1
[PATCHv2 2/4] block/sed: Add helper to qualify response tokens
Add helper which verifies the response token is valid and matches the expected value. Merges token_type and response_get_token. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- block/sed-opal.c | 61 +++- 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 77623ad..d6dd604 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -591,48 +591,25 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) return 0; } -static enum opal_response_token token_type(const struct parsed_resp *resp, - int n) +static const struct opal_resp_tok *response_get_token( + const struct parsed_resp *resp, + int n) { const struct opal_resp_tok *tok; if (n >= resp->num) { pr_err("Token number doesn't exist: %d, resp: %d\n", n, resp->num); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } tok = >toks[n]; if (tok->len == 0) { pr_err("Token length must be non-zero\n"); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } - return tok->type; -} - -/* - * This function returns 0 in case of invalid token. One should call - * token_type() first to find out if the token is valid or not. - */ -static enum opal_token response_get_token(const struct parsed_resp *resp, - int n) -{ - const struct opal_resp_tok *tok; - - if (n >= resp->num) { - pr_err("Token number doesn't exist: %d, resp: %d\n", - n, resp->num); - return 0; - } - - tok = >toks[n]; - if (tok->len == 0) { - pr_err("Token length must be non-zero\n"); - return 0; - } - - return tok->pos[0]; + return tok; } static ssize_t response_parse_tiny(struct opal_resp_tok *tok, @@ -851,20 +828,32 @@ static u64 response_get_u64(const struct parsed_resp *resp, int n) return resp->toks[n].stored.u; } +static bool response_token_matches(const struct opal_resp_tok *token, u8 match) +{ + if (IS_ERR_OR_NULL(token) || + token->type != OPAL_DTA_TOKENID_TOKEN || + token->pos[0] != match) + return false; + return true; +} + static u8 response_status(const struct parsed_resp *resp) { - if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN && - response_get_token(resp, 0) == OPAL_ENDOFSESSION) { + const struct opal_resp_tok *tok; + + tok = response_get_token(resp, 0); + if (response_token_matches(tok, OPAL_ENDOFSESSION)) return 0; - } if (resp->num < 5) return DTAERROR_NO_METHOD_STATUS; - if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN || - token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN || - response_get_token(resp, resp->num - 1) != OPAL_ENDLIST || - response_get_token(resp, resp->num - 5) != OPAL_STARTLIST) + tok = response_get_token(resp, resp->num - 5); + if (!response_token_matches(tok, OPAL_STARTLIST)) + return DTAERROR_NO_METHOD_STATUS; + + tok = response_get_token(resp, resp->num - 1); + if (!response_token_matches(tok, OPAL_ENDLIST)) return DTAERROR_NO_METHOD_STATUS; return response_get_u64(resp, resp->num - 4); -- 1.8.3.1
[PATCHv2 0/4] OPAL patches
Just a couple of fixes for sed-opal to prevent faulty firmware from allowing us to go off in the weeds, and a helper to remove some duplicate code. v1->v2: left tok->len as a size_t got everyone important on the same email thread Jon Derrick (4): block/sed: Use ssize_t on atom parsers to return errors block/sed: Add helper to qualify response tokens block/sed: Check received header lengths MAINTAINERS: Remove powerpc's opal match MAINTAINERS | 1 - block/sed-opal.c | 124 +++ 2 files changed, 60 insertions(+), 65 deletions(-) -- 1.8.3.1
[PATCHv4 3/4] block/sed: Check received header lengths
Add a buffer size check against discovery and response header lengths before we loop over their buffers. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> Reviewed-by: Scott Bauer <scott.ba...@intel.com> --- block/sed-opal.c | 35 +-- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 00673cf..383d5d6 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev) const struct d0_header *hdr = (struct d0_header *)dev->resp; const u8 *epos = dev->resp, *cpos = dev->resp; u16 comid = 0; + u32 hlen = be32_to_cpu(hdr->length); - print_buffer(dev->resp, be32_to_cpu(hdr->length)); + print_buffer(dev->resp, hlen); - epos += be32_to_cpu(hdr->length); /* end of buffer */ + if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n", + sizeof(*hdr), hlen, IO_BUFFER_LENGTH); + return -EFAULT; + } + + epos += hlen; /* end of buffer */ cpos += sizeof(*hdr); /* current position on buffer */ while (cpos < epos && supported) { @@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length, int total; ssize_t token_length; const u8 *pos; + u32 clen, plen, slen; if (!buf) return -EFAULT; @@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length, pos = buf; pos += sizeof(*hdr); - pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", -be32_to_cpu(hdr->cp.length), -be32_to_cpu(hdr->pkt.length), -be32_to_cpu(hdr->subpkt.length)); - - if (hdr->cp.length == 0 || hdr->pkt.length == 0 || - hdr->subpkt.length == 0) { - pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n", - be32_to_cpu(hdr->cp.length), - be32_to_cpu(hdr->pkt.length), - be32_to_cpu(hdr->subpkt.length)); + clen = be32_to_cpu(hdr->cp.length); + plen = be32_to_cpu(hdr->pkt.length); + slen = be32_to_cpu(hdr->subpkt.length); + pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n", +clen, plen, slen); + + if (clen == 0 || plen == 0 || slen == 0 || + slen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); print_buffer(pos, sizeof(*hdr)); return -EINVAL; } @@ -743,7 +750,7 @@ static int response_parse(const u8 *buf, size_t length, return -EFAULT; iter = resp->toks; - total = be32_to_cpu(hdr->subpkt.length); + total = slen; print_buffer(pos, total); while (total > 0) { if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */ -- 1.8.3.1
[PATCHv4 1/4] block/sed: Use ssize_t on atom parsers to return errors
The short atom parser can return an errno from decoding but does not currently return the error as a signed value. Convert all of the parsers to ssize_t. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> Reviewed-by: Scott Bauer <scott.ba...@intel.com> --- block/sed-opal.c | 28 ++-- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index e95b8a5..77623ad 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -635,8 +635,8 @@ static enum opal_token response_get_token(const struct parsed_resp *resp, return tok->pos[0]; } -static size_t response_parse_tiny(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_tiny(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -652,8 +652,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_short(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_short(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1; @@ -665,7 +665,7 @@ static size_t response_parse_short(struct opal_resp_tok *tok, tok->type = OPAL_DTA_TOKENID_SINT; } else { u64 u_integer = 0; - int i, b = 0; + ssize_t i, b = 0; tok->type = OPAL_DTA_TOKENID_UINT; if (tok->len > 9) { @@ -682,8 +682,8 @@ static size_t response_parse_short(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_medium(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_medium(struct opal_resp_tok *tok, +const u8 *pos) { tok->pos = pos; tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; @@ -699,8 +699,8 @@ static size_t response_parse_medium(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_long(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_long(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; @@ -716,8 +716,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_token(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_token(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -734,7 +734,7 @@ static int response_parse(const u8 *buf, size_t length, struct opal_resp_tok *iter; int num_entries = 0; int total; - size_t token_length; + ssize_t token_length; const u8 *pos; if (!buf) @@ -780,8 +780,8 @@ static int response_parse(const u8 *buf, size_t length, else /* TOKEN */ token_length = response_parse_token(iter, pos); - if (token_length == -EINVAL) - return -EINVAL; + if (token_length < 0) + return token_length; pos += token_length; total -= token_length; -- 1.8.3.1
[PATCHv4 2/4] block/sed: Add helper to qualify response tokens
Add helper which verifies the response token is valid and matches the expected value. Merges token_type and response_get_token. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> Reviewed-by: Scott Bauer <scott.ba...@intel.com> --- block/sed-opal.c | 61 +++- 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 77623ad..00673cf 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -591,48 +591,25 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) return 0; } -static enum opal_response_token token_type(const struct parsed_resp *resp, - int n) +static const struct opal_resp_tok *response_get_token( + const struct parsed_resp *resp, + int n) { const struct opal_resp_tok *tok; if (n >= resp->num) { pr_err("Token number doesn't exist: %d, resp: %d\n", n, resp->num); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } tok = >toks[n]; if (tok->len == 0) { pr_err("Token length must be non-zero\n"); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } - return tok->type; -} - -/* - * This function returns 0 in case of invalid token. One should call - * token_type() first to find out if the token is valid or not. - */ -static enum opal_token response_get_token(const struct parsed_resp *resp, - int n) -{ - const struct opal_resp_tok *tok; - - if (n >= resp->num) { - pr_err("Token number doesn't exist: %d, resp: %d\n", - n, resp->num); - return 0; - } - - tok = >toks[n]; - if (tok->len == 0) { - pr_err("Token length must be non-zero\n"); - return 0; - } - - return tok->pos[0]; + return tok; } static ssize_t response_parse_tiny(struct opal_resp_tok *tok, @@ -851,20 +828,32 @@ static u64 response_get_u64(const struct parsed_resp *resp, int n) return resp->toks[n].stored.u; } +static bool response_token_matches(const struct opal_resp_tok *token, u8 match) +{ + if (IS_ERR(token) || + token->type != OPAL_DTA_TOKENID_TOKEN || + token->pos[0] != match) + return false; + return true; +} + static u8 response_status(const struct parsed_resp *resp) { - if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN && - response_get_token(resp, 0) == OPAL_ENDOFSESSION) { + const struct opal_resp_tok *tok; + + tok = response_get_token(resp, 0); + if (response_token_matches(tok, OPAL_ENDOFSESSION)) return 0; - } if (resp->num < 5) return DTAERROR_NO_METHOD_STATUS; - if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN || - token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN || - response_get_token(resp, resp->num - 1) != OPAL_ENDLIST || - response_get_token(resp, resp->num - 5) != OPAL_STARTLIST) + tok = response_get_token(resp, resp->num - 5); + if (!response_token_matches(tok, OPAL_STARTLIST)) + return DTAERROR_NO_METHOD_STATUS; + + tok = response_get_token(resp, resp->num - 1); + if (!response_token_matches(tok, OPAL_ENDLIST)) return DTAERROR_NO_METHOD_STATUS; return response_get_u64(resp, resp->num - 4); -- 1.8.3.1
[PATCHv4 0/4] OPAL patches
Just a couple of fixes for sed-opal to prevent faulty firmware from allowing us to go off in the weeds, and a helper to remove some duplicate code. v3->v4: uses IS_ERR since tok is embedded in the response buffer and cannot be NULL v2->v3: corrected the bad calculation on the response parser check and changed it to only check the subpacket length v1->v2: left tok->len as a size_t got everyone important on the same email thread Jon Derrick (4): block/sed: Use ssize_t on atom parsers to return errors block/sed: Add helper to qualify response tokens block/sed: Check received header lengths MAINTAINERS: Remove powerpc's opal match MAINTAINERS | 1 - block/sed-opal.c | 124 +++ 2 files changed, 60 insertions(+), 65 deletions(-) -- 1.8.3.1
[PATCHv4 4/4] MAINTAINERS: Remove powerpc's opal match
PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks like the 'arch/powerpc' file pattern should be enough to match powerpc opal code by itself. Remove the opal regex pattern from powerpc. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b983b25..430dd02 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7404,7 +7404,6 @@ F:drivers/pci/hotplug/pnv_php.c F: drivers/pci/hotplug/rpa* F: drivers/scsi/ibmvscsi/ F: tools/testing/selftests/powerpc -N: opal N: /pmac N: powermac N: powernv -- 1.8.3.1
[PATCHv3 3/4] block/sed: Check received header lengths
Add a buffer size check against discovery and response header lengths before we loop over their buffers. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- block/sed-opal.c | 35 +-- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index d6dd604..feba34b 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev) const struct d0_header *hdr = (struct d0_header *)dev->resp; const u8 *epos = dev->resp, *cpos = dev->resp; u16 comid = 0; + u32 hlen = be32_to_cpu(hdr->length); - print_buffer(dev->resp, be32_to_cpu(hdr->length)); + print_buffer(dev->resp, hlen); - epos += be32_to_cpu(hdr->length); /* end of buffer */ + if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n", + sizeof(*hdr), hlen, IO_BUFFER_LENGTH); + return -EFAULT; + } + + epos += hlen; /* end of buffer */ cpos += sizeof(*hdr); /* current position on buffer */ while (cpos < epos && supported) { @@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length, int total; ssize_t token_length; const u8 *pos; + u32 clen, plen, slen; if (!buf) return -EFAULT; @@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length, pos = buf; pos += sizeof(*hdr); - pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", -be32_to_cpu(hdr->cp.length), -be32_to_cpu(hdr->pkt.length), -be32_to_cpu(hdr->subpkt.length)); - - if (hdr->cp.length == 0 || hdr->pkt.length == 0 || - hdr->subpkt.length == 0) { - pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n", - be32_to_cpu(hdr->cp.length), - be32_to_cpu(hdr->pkt.length), - be32_to_cpu(hdr->subpkt.length)); + clen = be32_to_cpu(hdr->cp.length); + plen = be32_to_cpu(hdr->pkt.length); + slen = be32_to_cpu(hdr->subpkt.length); + pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n", +clen, plen, slen); + + if (clen == 0 || plen == 0 || slen == 0 || + slen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); print_buffer(pos, sizeof(*hdr)); return -EINVAL; } @@ -743,7 +750,7 @@ static int response_parse(const u8 *buf, size_t length, return -EFAULT; iter = resp->toks; - total = be32_to_cpu(hdr->subpkt.length); + total = slen; print_buffer(pos, total); while (total > 0) { if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */ -- 1.8.3.1
[PATCHv3 0/4] OPAL patches
Just a couple of fixes for sed-opal to prevent faulty firmware from allowing us to go off in the weeds, and a helper to remove some duplicate code. v2->v3: corrected the bad calculation on the response parser check and changed it to only check the subpacket length v1->v2: left tok->len as a size_t got everyone important on the same email thread Jon Derrick (4): block/sed: Use ssize_t on atom parsers to return errors block/sed: Add helper to qualify response tokens block/sed: Check received header lengths MAINTAINERS: Remove powerpc's opal match MAINTAINERS | 1 - block/sed-opal.c | 124 +++ 2 files changed, 60 insertions(+), 65 deletions(-) -- 1.8.3.1
[PATCHv3 2/4] block/sed: Add helper to qualify response tokens
Add helper which verifies the response token is valid and matches the expected value. Merges token_type and response_get_token. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- block/sed-opal.c | 61 +++- 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 77623ad..d6dd604 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -591,48 +591,25 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) return 0; } -static enum opal_response_token token_type(const struct parsed_resp *resp, - int n) +static const struct opal_resp_tok *response_get_token( + const struct parsed_resp *resp, + int n) { const struct opal_resp_tok *tok; if (n >= resp->num) { pr_err("Token number doesn't exist: %d, resp: %d\n", n, resp->num); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } tok = >toks[n]; if (tok->len == 0) { pr_err("Token length must be non-zero\n"); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } - return tok->type; -} - -/* - * This function returns 0 in case of invalid token. One should call - * token_type() first to find out if the token is valid or not. - */ -static enum opal_token response_get_token(const struct parsed_resp *resp, - int n) -{ - const struct opal_resp_tok *tok; - - if (n >= resp->num) { - pr_err("Token number doesn't exist: %d, resp: %d\n", - n, resp->num); - return 0; - } - - tok = >toks[n]; - if (tok->len == 0) { - pr_err("Token length must be non-zero\n"); - return 0; - } - - return tok->pos[0]; + return tok; } static ssize_t response_parse_tiny(struct opal_resp_tok *tok, @@ -851,20 +828,32 @@ static u64 response_get_u64(const struct parsed_resp *resp, int n) return resp->toks[n].stored.u; } +static bool response_token_matches(const struct opal_resp_tok *token, u8 match) +{ + if (IS_ERR_OR_NULL(token) || + token->type != OPAL_DTA_TOKENID_TOKEN || + token->pos[0] != match) + return false; + return true; +} + static u8 response_status(const struct parsed_resp *resp) { - if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN && - response_get_token(resp, 0) == OPAL_ENDOFSESSION) { + const struct opal_resp_tok *tok; + + tok = response_get_token(resp, 0); + if (response_token_matches(tok, OPAL_ENDOFSESSION)) return 0; - } if (resp->num < 5) return DTAERROR_NO_METHOD_STATUS; - if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN || - token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN || - response_get_token(resp, resp->num - 1) != OPAL_ENDLIST || - response_get_token(resp, resp->num - 5) != OPAL_STARTLIST) + tok = response_get_token(resp, resp->num - 5); + if (!response_token_matches(tok, OPAL_STARTLIST)) + return DTAERROR_NO_METHOD_STATUS; + + tok = response_get_token(resp, resp->num - 1); + if (!response_token_matches(tok, OPAL_ENDLIST)) return DTAERROR_NO_METHOD_STATUS; return response_get_u64(resp, resp->num - 4); -- 1.8.3.1
Re: [PATCH] x86/ioapic: Ignore root bridges without a companion ACPI device
On Sat, Sep 10, 2016 at 11:40:45PM +0800, Rui Wang wrote: > Some PCI root bridges don't have a corresponding ACPI device. > This can be the case on some old platforms. Don't call acpi_ioapic_add() > on these bridges because they can't support ioapic hotplug. Not just old platforms. VMD is a new device which initializes a root port in software without an acpi handle. What branch is this based on? I can't find the relevant code/commits
Re: [PATCH] x86/ioapic: Ignore root bridges without a companion ACPI device
On Mon, Sep 12, 2016 at 03:43:23PM -0600, Luck, Tony wrote: > > What branch is this based on? I can't find the relevant code/commits > > tip tree. branch x86/apic > > -Tony Cheers!
Re: [PATCH 1/2] nvme : Use correct scnprintf in cmb show
Looks good. Thanks Stephen. Reviewed-by Jon Derrick: <jonathan.derr...@intel.com> On Fri, Dec 16, 2016 at 11:54:50AM -0700, Stephen Bates wrote: > Make sure we are using the correct scnprintf in the sysfs show > function for the CMB. > > Signed-off-by: Stephen Bates <sba...@raithlin.com> > --- > drivers/nvme/host/pci.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c > index 5e52034..be10860 100644 > --- a/drivers/nvme/host/pci.c > +++ b/drivers/nvme/host/pci.c > @@ -50,7 +50,7 @@ > #define NVME_AQ_DEPTH256 > #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) > #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) > - > + > /* > * We handle AEN commands ourselves and don't even let the > * block layer know about them. > @@ -1332,7 +1332,7 @@ static ssize_t nvme_cmb_show(struct device *dev, > { > struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); > > - return snprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", > + return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", > ndev->cmbloc, ndev->cmbsz); > } > static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); > -- > 2.1.4 >
Re: [PATCH 2/2] nvme: improve cmb sysfs reporting
Minor nit below On Fri, Dec 16, 2016 at 11:54:51AM -0700, Stephen Bates wrote: > Add more information to the NVMe CMB sysfs entry. This includes > information about the CMB size, location and capabilities. > > Signed-off-by: Stephen Bates> --- > drivers/nvme/host/pci.c | 31 +-- > include/linux/nvme.h| 8 > 2 files changed, 37 insertions(+), 2 deletions(-) > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c > index be10860..1f70630 100644 > --- a/drivers/nvme/host/pci.c > +++ b/drivers/nvme/host/pci.c > @@ -98,6 +98,7 @@ struct nvme_dev { > void __iomem *cmb; > dma_addr_t cmb_dma_addr; > u64 cmb_size; > + u64 cmb_offset; > u32 cmbsz; > u32 cmbloc; > struct nvme_ctrl ctrl; > @@ -1326,14 +1327,39 @@ static int nvme_create_io_queues(struct nvme_dev *dev) > return ret >= 0 ? 0 : ret; > } > > +static const char * const cmb_caps[] = { > + [NVME_CMB_CAP_SQS] = "SQS", > + [NVME_CMB_CAP_CQS] = "CQS", > + [NVME_CMB_CAP_LISTS] = "LISTS", > + [NVME_CMB_CAP_RDS] = "RDS", > + [NVME_CMB_CAP_WDS] = "WDS", > +}; > + > static ssize_t nvme_cmb_show(struct device *dev, >struct device_attribute *attr, >char *buf) > { > struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); > + unsigned int i, len = 0; > + > + len += scnprintf(buf+len, PAGE_SIZE-len, > + "cmbloc : 0x%08x\ncmbsz : 0x%08x\n\n", > + ndev->cmbloc, ndev->cmbsz); > + > + len += scnprintf(buf+len, PAGE_SIZE-len, > + "OFFSET : 0x%016llx\nSIZE : %llu Bytes\n" \ > + "DMA ADDR : %pad\n\n", > + ndev->cmb_offset, ndev->cmb_size, > + >cmb_dma_addr); > + > + for (i = NVME_CMB_CAP_SQS; i <= NVME_CMB_CAP_WDS; i++) I'd prefer seeing (i = 0; i < ARRAY_SIZE(..); i++) because it provides automatic bounds checking against future code. > + len += scnprintf(buf+len, PAGE_SIZE-len, "%-7s: %s\n", > + cmb_caps[i], > + ((ndev->cmbsz) & (1< + "NOT SUPPORTED"); > + > + return len; > > - return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", > -ndev->cmbloc, ndev->cmbsz); > } > static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); > > @@ -1376,6 +1402,7 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) > > dev->cmb_dma_addr = dma_addr; > dev->cmb_size = size; > + dev->cmb_offset = offset; > return cmb; > } > > diff --git a/include/linux/nvme.h b/include/linux/nvme.h > index fc3c242..cd0d324 100644 > --- a/include/linux/nvme.h > +++ b/include/linux/nvme.h > @@ -121,6 +121,14 @@ enum { > #define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) > #define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) > > +enum { > + NVME_CMB_CAP_SQS = 0, > + NVME_CMB_CAP_CQS, > + NVME_CMB_CAP_LISTS, > + NVME_CMB_CAP_RDS, > + NVME_CMB_CAP_WDS, > +}; > + > /* > * Submission and Completion Queue Entry Sizes for the NVM command set. > * (In bytes and specified as a power of two (2^n)). > -- > 2.1.4 >
Re: [PATCH v3 2/5] lib: Add Sed-opal library
On Mon, Dec 19, 2016 at 11:28:47PM -0800, Christoph Hellwig wrote: [snip] > > + while (cpos < total) { > > + if (!(pos[0] & 0x80)) /* tiny atom */ > > + token_length = response_parse_tiny(iter, pos); > > + else if (!(pos[0] & 0x40)) /* short atom */ > > + token_length = response_parse_short(iter, pos); > > + else if (!(pos[0] & 0x20)) /* medium atom */ > > + token_length = response_parse_medium(iter, pos); > > + else if (!(pos[0] & 0x10)) /* long atom */ > > + token_length = response_parse_long(iter, pos); > > Please add symbolic names for these constants to the protocol header. > I get tripped up by this logic every time I look at it. I'd almost rather see something like the following, which more closely follows the TCG core spec and the optimizing compiler should turn it into something similar to above anyways: if (pos[0] <= 0x7F) token_length = response_parse_tiny.. else if (pos[0] <= 0xBF) token_length = response_parse_short.. else if (pos[0] <= 0xDF) token_length = response_parse_medium.. else if (pos[0] <= 0xE3) token_length = response_parse_long.. Then just add those 0x7F, 0xBF, 0xDF, and 0xE3 constants to proto header. We could even add in a TCG reserved error for 0xE4-0xEF Also instead of tracking cpos, we could subtract from total until negative (if you make it signed). It's similar to how we do length in nvme_setup_prps. [snip] > > --- /dev/null > > +++ b/lib/sed-opal_internal.h > > This pretty much seem to contain the OPAL protocol defintions, so why > not opal_proto.h? Since there might eventually be a whole class of opal-like sed protocols, why does it make more sense to have opal_proto.h instead of sed-opal.h or some variation? This is similar to how leds-*.h look to me. Although I agree that sed-ATA.h would be dishonest since ATA security doesn't imply a self-encrypting-disk. [snip] > > +int sed_save(struct sed_context *sed_ctx, struct sed_key *key) > > +{ > > + switch (key->sed_type) { > > + case OPAL_LOCK_UNLOCK: > > + return opal_save(sed_ctx, key); > > + } > > + > > + return -EOPNOTSUPP; > > It seems to me that we should skip this whole sed_type indirections > and just specify the ioctls directly for OPAL (which would include > opalite and pyrite as subsets). The only other protocols of interest > for Linux would be the ATA "security" plain text passwords, which can > be handled differently, or enterprise SSC which we can hopefully avoid > to implement entirely. I'm on board with this if you think we won't have enough different, but similar, SED protocols to justify the indirection. In that case you can ignore the above comment as well. > > > + > > +int fdev_sed_ioctl(struct file *filep, unsigned int cmd, > > + unsigned long arg) > > +{ > > + struct sed_key key; > > + struct sed_context *sed_ctx; > > + > > + if (!capable(CAP_SYS_ADMIN)) > > + return -EACCES; > > + > > + if (!filep->f_sedctx || !filep->f_sedctx->ops || !filep->f_sedctx->dev) > > + return -ENODEV; > > In the previous version this was called from the block driver which > could pass in the context (and ops). Why was this changed? >
Re: block/sed-opal.c: 2 * bad if tests ?
On 03/06/2017 05:00 AM, David Binderman wrote: > Hello there, > > 1. > > block/sed-opal.c:2136:40: warning: logical ‘and’ of mutually exclusive tests > is always false [-Wlogical-op] > > Source code is > > if (lk_unlk->session.who < OPAL_USER1 && > lk_unlk->session.who > OPAL_USER9) { > > 2. > > block/sed-opal.c:2319:37: warning: logical ‘and’ of mutually exclusive tests > is always false [-Wlogical-op] > > if (opal_session->who < OPAL_USER1 && > opal_session->who > OPAL_USER9) { > > Duplicate. > > Also in the same file: > > [block/sed-opal.c:1034]: (style) Variable 'method' is assigned a value that > is never used. > > Regards > > David Binderman > Thanks for the catch(es). Will provide patch shortly
Re: [PATCH 2/3] pci: Generalize is_vmd behavior
On 08/11/2017 11:03 AM, Bjorn Helgaas wrote: > On Mon, Aug 07, 2017 at 01:57:12PM -0600, Jon Derrick wrote: >> Generalize is_vmd behavior to remove dependency on domain number >> checking in pci quirks. >> >> Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> >> --- >> arch/x86/include/asm/pci.h | 8 +++- >> arch/x86/pci/common.c | 2 +- >> drivers/pci/quirks.c | 2 +- >> include/linux/pci.h| 4 >> 4 files changed, 9 insertions(+), 7 deletions(-) >> >> diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h >> index 473a729..5c5d54a 100644 >> --- a/arch/x86/include/asm/pci.h >> +++ b/arch/x86/include/asm/pci.h >> @@ -60,16 +60,14 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus >> *bus) >> #define pci_root_bus_fwnode _pci_root_bus_fwnode >> #endif >> >> -static inline bool is_vmd(struct pci_bus *bus) >> -{ >> #if IS_ENABLED(CONFIG_VMD) >> +static inline bool pci_bus_is_vmd(struct pci_bus *bus) >> +{ >> struct pci_sysdata *sd = bus->sysdata; >> >> return sd->vmd_domain; >> -#else >> -return false; >> -#endif >> } >> +#endif >> >> /* Can be used to override the logic in pci_scan_bus for skipping >> already-configured bus numbers - to be used for buggy BIOSes >> diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c >> index dbe2132..18b2277 100644 >> --- a/arch/x86/pci/common.c >> +++ b/arch/x86/pci/common.c >> @@ -662,7 +662,7 @@ static void set_dma_domain_ops(struct pci_dev *pdev) {} >> >> static void set_dev_domain_options(struct pci_dev *pdev) >> { >> -if (is_vmd(pdev->bus)) >> +if (pci_bus_is_vmd(pdev->bus)) >> pdev->hotplug_user_indicators = 1; >> } >> >> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c >> index 6967c6b..ba47995 100644 >> --- a/drivers/pci/quirks.c >> +++ b/drivers/pci/quirks.c >> @@ -4666,7 +4666,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, >> quirk_intel_qat_vf_cap); >> static void quirk_no_aersid(struct pci_dev *pdev) >> { >> /* VMD Domain */ >> -if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) >> +if (pci_bus_is_vmd(pdev->bus)) > > I like this part ... > >> pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; >> } >> DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); >> diff --git a/include/linux/pci.h b/include/linux/pci.h >> index 4869e66..0299d8b 100644 >> --- a/include/linux/pci.h >> +++ b/include/linux/pci.h >> @@ -1471,6 +1471,10 @@ static inline int pci_proc_domain(struct pci_bus >> *bus) { return 0; } >> static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } >> #endif /* CONFIG_PCI_DOMAINS */ >> >> +#if !IS_ENABLED(CONFIG_VMD) >> +static inline bool pci_bus_is_vmd(struct pci_bus *bus) { return false; } >> +#endif > > But not so much this part. VMD is (AFAIK) still fundamentally an > x86-only thing, so I'd like it better if this could all be within > arch/x86. Could this be done by moving quirk_no_aersid() to > arch/x86/pci/fixup.c? > Thanks for pointing this out. I'll think of something different and localize it to the x86 code domain. > BTW, CONFIG_VMD in drivers/pci/host/Kconfig is currently "depends on > SRCU". I'm not a Kconfig expert, but that doesn't seem like an > intuitive connection. And it's the only such dependency on SRCU in > the tree -- most other places use "select SRCU", which makes more > sense to me. I agree - most places use select SRCU. I'll add that to v2. > >> /* >> * Generic implementation for PCI domain support. If your >> * architecture does not need custom management of PCI >> -- >> 2.9.4 >>
Re: [PATCH 3/3] iommu: prevent VMD child devices from being remapping targets
Hi Robin, thanks for the reply. On 08/11/2017 12:25 PM, Robin Murphy wrote: > On 07/08/17 20:57, Jon Derrick wrote: >> VMD child devices must use the VMD endpoint's ID as the DMA source. >> Because of this, there needs to be a way to link the parent VMD >> endpoint's DMAR domain to the VMD child devices' DMAR domain such that >> attaching and detaching child devices modify the endpoint's DMAR mapping >> and prevents early detaching. > > That sounds like either pci_device_group() needs modifying, or perhaps > that intel-iommu needs its own extended iommu_ops::device_group > implementation, to ensure that VMD child devices get put in the same > group as their parent - if they share requester IDs they can't feasibly > be attached to different domains anyway. > > Robin. Yes it seems like that to me too. I have a high-level understanding of the changes required but not too much in the nitty-gritty details. It's a bit more complicated anyways because VMD emerges a set of root ports, and AFAICT, PCI ACS end at the root ports and iommu groups rely on ACS to group devices. Either way it's not within the scope of VMD.
[PATCH 1/3] MAINTAINERS: Add Jonathan Derrick as VMD maintainer
Add myself as VMD maintainer Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f66488d..3ec39df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10090,6 +10090,7 @@ F: drivers/pci/dwc/*imx6* PCI DRIVER FOR INTEL VOLUME MANAGEMENT DEVICE (VMD) M: Keith Busch <keith.bu...@intel.com> +M: Jonathan Derrick <jonathan.derr...@intel.com> L: linux-...@vger.kernel.org S: Supported F: drivers/pci/host/vmd.c -- 2.9.4
[PATCH 2/3] pci: Generalize is_vmd behavior
Generalize is_vmd behavior to remove dependency on domain number checking in pci quirks. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- arch/x86/include/asm/pci.h | 8 +++- arch/x86/pci/common.c | 2 +- drivers/pci/quirks.c | 2 +- include/linux/pci.h| 4 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 473a729..5c5d54a 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -60,16 +60,14 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus *bus) #define pci_root_bus_fwnode_pci_root_bus_fwnode #endif -static inline bool is_vmd(struct pci_bus *bus) -{ #if IS_ENABLED(CONFIG_VMD) +static inline bool pci_bus_is_vmd(struct pci_bus *bus) +{ struct pci_sysdata *sd = bus->sysdata; return sd->vmd_domain; -#else - return false; -#endif } +#endif /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index dbe2132..18b2277 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -662,7 +662,7 @@ static void set_dma_domain_ops(struct pci_dev *pdev) {} static void set_dev_domain_options(struct pci_dev *pdev) { - if (is_vmd(pdev->bus)) + if (pci_bus_is_vmd(pdev->bus)) pdev->hotplug_user_indicators = 1; } diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6967c6b..ba47995 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4666,7 +4666,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, quirk_intel_qat_vf_cap); static void quirk_no_aersid(struct pci_dev *pdev) { /* VMD Domain */ - if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) + if (pci_bus_is_vmd(pdev->bus)) pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); diff --git a/include/linux/pci.h b/include/linux/pci.h index 4869e66..0299d8b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1471,6 +1471,10 @@ static inline int pci_proc_domain(struct pci_bus *bus) { return 0; } static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } #endif /* CONFIG_PCI_DOMAINS */ +#if !IS_ENABLED(CONFIG_VMD) +static inline bool pci_bus_is_vmd(struct pci_bus *bus) { return false; } +#endif + /* * Generic implementation for PCI domain support. If your * architecture does not need custom management of PCI -- 2.9.4
[PATCH 3/3] iommu: prevent VMD child devices from being remapping targets
VMD child devices must use the VMD endpoint's ID as the DMA source. Because of this, there needs to be a way to link the parent VMD endpoint's DMAR domain to the VMD child devices' DMAR domain such that attaching and detaching child devices modify the endpoint's DMAR mapping and prevents early detaching. This is outside the scope of VMD, so disable binding child devices to prevent unforeseen issues. This functionality may be implemented in the future. This patch prevents VMD child devices from returning an IOMMU, which prevents it from exposing iommu_group sysfs directories and subsequent binding by userspace-access drivers such as VFIO. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- drivers/iommu/intel-iommu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 687f18f..651a6cd 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -905,6 +905,11 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf * the PF instead to find the IOMMU. */ pf_pdev = pci_physfn(pdev); dev = _pdev->dev; + + /* VMD child devices currently cannot be handled individually */ + if (pci_bus_is_vmd(pdev->bus)) + return NULL; + segment = pci_domain_nr(pdev->bus); } else if (has_acpi_companion(dev)) dev = _COMPANION(dev)->dev; -- 2.9.4
Re: [RFC 2/3] module: Ignore delete_id parameter
On 09/28/2017 12:03 AM, Dan Williams wrote: > On Wed, Sep 27, 2017 at 1:40 PM, Jon Derrick <jonathan.derr...@intel.com> > wrote: >> The PCI driver delete_id parameter is handled in each individual driver >> registration callback. >> >> Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> >> --- >> kernel/module.c | 7 +++ >> 1 file changed, 7 insertions(+) >> >> diff --git a/kernel/module.c b/kernel/module.c >> index de66ec8..2b2dccf 100644 >> --- a/kernel/module.c >> +++ b/kernel/module.c >> @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char >> *val, const char *modname, >> return 0; >> } >> >> + /* >> +* Ignore driver delete list arguments. They are handled by driver >> +* registration callbacks >> +*/ >> + if (strcmp(param, "delete_id") == 0) >> + return 0; >> + > > Does this preclude something like: > > modprobe ahci delete_id=1234:5678? > It does seem like it would. I can look into calling into the pci callback for this, but val is a struct module here and I haven't figured out the plumbing to get the [correct] driver from that. Maybe if I enforce the format of 'modprobe ahci ahci.delete_id=' to ensure the driver is specified (and would be required in cases with multi-driver modules).
Re: [RFC 1/3] PCI: pci-driver: Introduce pci device delete list
Hi Greg, On 09/28/2017 03:09 AM, Greg Kroah-Hartman wrote: > On Wed, Sep 27, 2017 at 04:40:20PM -0400, Jon Derrick wrote: >> This patch introduces a new kernel command line parameter to mask pci >> device ids from pci driver id tables. This prevents masked devices from >> automatically binding to both built-in and module drivers. >> >> Devices can be later attached through the driver's sysfs new_id >> inteface. >> >> The use cases for this are primarily for debugging, eg, being able to >> prevent attachment before probes are set up. It can also be used to mask >> off faulty built-in hardware or faulty simulated hardware. >> >> Another use case is to prevent attachment of devices which will be >> passed to VMs, shortcutting the detachment effort. > > Is the "shortcut" really that big of a deal? unbind actually causes > problems? Is this an attempt to deal with devices being handled by more > than one driver and then you want to manually bind it later on? > >> The format is similar to the sysfs new_id format. Device ids are >> specified with: >> >> :[:SVVV:SDDD][:][:] >> >> Where: >> = Vendor ID >> = Device ID >> SVVV = Subvendor ID >> SDDD = Subdevice ID >> = Class >> = Class Mask >> >> IDs can be chained with commas. >> >> Examples: >> .delete_id=1234:5678 >> .delete_id=: >> .delete_id=::::010802 >> .delete_id=1234:5678,abcd:ef01,2345: > > What about drivers that handle more than one bus type (i.e. USB and > PCI?) This format is specific to PCI, yet you are defining it as a > "global" for all drivers :( > I was hoping to extend it to other bus types as well, but just wanted some early feedback on the idea. Pci was the easier implementation for me. Could prepending pci:, usb:, etc on the format be an option? > This feels hacky, what is the real reason for this? It feels like we > have so many different ways to blacklist and unbind and bind devices to > drivers already, why add yet-another way? > I ran into an issue a while back where I needed to disable a device from a built-in driver to perform a regression test. I worked around that issue by doing an initcall_blacklist on the pci-driver declaration, but that also preventing later binding because the driver was never registered. I've also had issues with remote systems where pluggable devices were broken or otherwise non-responsive, and it would have been nice to have been able to keep them from binding on module loading as a temporary workaround until someone could have removed those devices (though impossible on built-in hardware). I'm not sure about hacky; I think the implementation in this patch (1/3) is pretty clean :). I'm not familiar with all the different ways of blacklisting. Is there another way to work around the issues I listed above? I understand the concern about having it exist in the format . and only supporting one or a few bus types. I have another set that extends the pci= parameter instead. > thanks, > > greg k-h > Best, Jon
Re: [RFC 2/3] module: Ignore delete_id parameter
On 09/28/2017 03:02 AM, Greg Kroah-Hartman wrote: > On Wed, Sep 27, 2017 at 04:40:21PM -0400, Jon Derrick wrote: >> The PCI driver delete_id parameter is handled in each individual driver >> registration callback. >> >> Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> >> --- >> kernel/module.c | 7 +++ >> 1 file changed, 7 insertions(+) >> >> diff --git a/kernel/module.c b/kernel/module.c >> index de66ec8..2b2dccf 100644 >> --- a/kernel/module.c >> +++ b/kernel/module.c >> @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char >> *val, const char *modname, >> return 0; >> } >> >> +/* >> + * Ignore driver delete list arguments. They are handled by driver >> + * registration callbacks >> + */ >> +if (strcmp(param, "delete_id") == 0) >> +return 0; > > Why? This is only for the PCI core as you have defined it in this > patchset, but you just broke this module id for all other kernel modules > in the system :( > > If you want to do this, you need to provide this feature for _all_ > kernel drivers... > > thanks, > > greg k-h > Yes I'm not particularly happy about this one either. I will make this more robust if the blacklisting idea is sound.
[RFC 0/3] Introduce PCI device blacklisting
This set introduces a method to keep devices from matching driver id tables. Please see 0001 for more details of the motivation and implementation. This set currently applies to Bjorn's next (v4.14-rc1) branch. Jon Derrick (3): PCI: pci-driver: Introduce pci device delete list module: Ignore delete_id parameter Documentation: Add pci device delete_id interface Documentation/admin-guide/kernel-parameters.txt | 13 ++ drivers/pci/pci-driver.c| 253 +++- include/linux/pci.h | 1 + kernel/module.c | 7 + 4 files changed, 272 insertions(+), 2 deletions(-) -- 1.8.3.1
[RFC 1/3] PCI: pci-driver: Introduce pci device delete list
This patch introduces a new kernel command line parameter to mask pci device ids from pci driver id tables. This prevents masked devices from automatically binding to both built-in and module drivers. Devices can be later attached through the driver's sysfs new_id inteface. The use cases for this are primarily for debugging, eg, being able to prevent attachment before probes are set up. It can also be used to mask off faulty built-in hardware or faulty simulated hardware. Another use case is to prevent attachment of devices which will be passed to VMs, shortcutting the detachment effort. The format is similar to the sysfs new_id format. Device ids are specified with: :[:SVVV:SDDD][:][:] Where: = Vendor ID = Device ID SVVV = Subvendor ID SDDD = Subdevice ID = Class = Class Mask IDs can be chained with commas. Examples: .delete_id=1234:5678 .delete_id=: .delete_id=::::010802 .delete_id=1234:5678,abcd:ef01,2345: Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- drivers/pci/pci-driver.c | 253 ++- include/linux/pci.h | 1 + 2 files changed, 252 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 11bd267..7acdf13 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "pci.h" struct pci_dynid { @@ -27,6 +28,250 @@ struct pci_dynid { struct pci_device_id id; }; +static struct pci_device_id *pci_match_deleted_ids(struct pci_driver *drv, + struct pci_dev *dev, + bool inverse) +{ + struct pci_dynid *deleteid; + struct pci_device_id *found_id = NULL; + + spin_lock(>deleteids.lock); + list_for_each_entry(deleteid, >deleteids.list, node) { + if (!inverse && pci_match_one_device(>id, dev)) { + found_id = >id; + break; + } + if (inverse && !pci_match_one_device(>id, dev)) { + found_id = >id; + break; + } + } + spin_unlock(>deleteids.lock); + + return found_id; +} + +/** + * pci_match_non_deleted_ids - match dev against not-deleted ids + * @ids: array of PCI device id structures to search in + * @dev: the PCI device structure to match against. + * + * Finds devices in driver id table not matching the delete list and tries to + * match them individually to dev. Returns the matching pci_dev_id structure, + * %NULL if there is no match, of -errno if there was a failure to allocate + * memory for the temporary pci_dev + */ +static struct pci_device_id *pci_match_non_deleted_ids(struct pci_driver *drv, + struct pci_dev *dev) +{ + const struct pci_device_id *ids = drv->id_table; + struct pci_device_id *match = NULL; + struct pci_dev *tmpdev; + + tmpdev = kzalloc(sizeof(*tmpdev), GFP_KERNEL); + if (!tmpdev) + return ERR_PTR(-ENOMEM); + + while (ids->vendor || ids->subvendor || ids->class_mask) { + struct pci_device_id *found_id = NULL; + + tmpdev->vendor = ids->vendor; + tmpdev->device = ids->device, + tmpdev->subsystem_vendor = ids->subvendor, + tmpdev->subsystem_device = ids->subdevice, + tmpdev->class = ids->class; + + found_id = pci_match_deleted_ids(drv, tmpdev, true); + if (found_id) { + const struct pci_device_id id[2] = { *found_id, {0,} }; + if (pci_match_id(id, dev)) { + match = found_id; + break; + } + } + ids++; + } + + kfree(tmpdev); + + return match; +} + +/** + * pci_add_delete_id - add a new PCI device ID to a built-in driver's blacklist + * @drv: target pci driver + * @vendor: PCI vendor ID + * @device: PCI device ID + * @subvendor: PCI subvendor ID + * @subdevice: PCI subdevice ID + * @class: PCI class + * @class_mask: PCI class mask + * + * Adds a new dynamic pci device ID to this driver's delete list, preventing + * the device from attaching to built-in drivers + * + * CONTEXT: + * Does GFP_KERNEL allocation. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int pci_add_delete_id(struct pci_driver *drv, + unsigned int vendor, unsigned int device, + unsigned int subvendor, unsigned int subdevice, + unsigned int class, unsigned int class_mask) +{
[RFC 3/3] Documentation: Add pci device delete_id interface
Document the .delete_id= cmdline interface for masking built-in and module device id tables. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- Documentation/admin-guide/kernel-parameters.txt | 13 + 1 file changed, 13 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0549662..862c8b5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -789,6 +789,19 @@ Defaults to the default architecture's huge page size if not specified. + driver.delete_id= + [PCI] + Specified pci device format(s) will mask built-in + driver id tables, preventing devices from + automicatically attaching to built-in or module + drivers. Later attachment should be done by adding the + deleted id back through the "new_id" sysfs interface. + Format: :[::]\ + [:class][:class_mask][, ...] + PCI_ANY_ID masks should use the full 32-bit format. + Examples: virtio-pci.delete_id=: + nvme.delete_id=1234:5678,: + dhash_entries= [KNL] Set number of hash buckets for dentry cache. -- 1.8.3.1
[RFC 2/3] module: Ignore delete_id parameter
The PCI driver delete_id parameter is handled in each individual driver registration callback. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- kernel/module.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/kernel/module.c b/kernel/module.c index de66ec8..2b2dccf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname, return 0; } + /* +* Ignore driver delete list arguments. They are handled by driver +* registration callbacks +*/ + if (strcmp(param, "delete_id") == 0) + return 0; + /* Check for magic 'dyndbg' arg */ ret = ddebug_dyndbg_module_param_cb(param, val, modname); if (ret != 0) -- 1.8.3.1
[PATCH v2 1/4] MAINTAINERS: Add Jonathan Derrick as VMD maintainer
Add myself as VMD maintainer Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> Acked-by: Keith Busch <keith.bu...@intel.com> --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f66488d..3ec39df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10090,6 +10090,7 @@ F: drivers/pci/dwc/*imx6* PCI DRIVER FOR INTEL VOLUME MANAGEMENT DEVICE (VMD) M: Keith Busch <keith.bu...@intel.com> +M: Jonathan Derrick <jonathan.derr...@intel.com> L: linux-...@vger.kernel.org S: Supported F: drivers/pci/host/vmd.c -- 2.9.4
[PATCH v2 0/4] VMD fixups
Mostly just cleanup in this revision, eg, trying to limit scope of vmd code to x86 Previous: https://patchwork.kernel.org/patch/9886095/ https://patchwork.kernel.org/patch/9886097/ https://patchwork.kernel.org/patch/9886101/ Jon Derrick (4): MAINTAINERS: Add Jonathan Derrick as VMD maintainer pci/x86: Move VMD quirks to x86 fixups x86/PCI: Use is_vmd rather than relying on the domain number iommu: Prevent VMD child devices from being remapping targets MAINTAINERS | 1 + arch/x86/pci/fixup.c| 18 ++ drivers/iommu/intel-iommu.c | 5 + drivers/pci/quirks.c| 17 - 4 files changed, 24 insertions(+), 17 deletions(-) -- 2.9.4
[PATCH v2 3/4] x86/PCI: Use is_vmd rather than relying on the domain number
Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- arch/x86/pci/fixup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index ca4b02e5..1ed2fbf 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -628,7 +628,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff); static void quirk_no_aersid(struct pci_dev *pdev) { /* VMD Domain */ - if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) + if (is_vmd(pdev->bus)) pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); -- 2.9.4
[PATCH v2 4/4] iommu: Prevent VMD child devices from being remapping targets
VMD child devices must use the VMD endpoint's ID as the requester. Because of this, there needs to be a way to link the parent VMD endpoint's iommu group and associated mappings to the VMD child devices such that attaching and detaching child devices modify the endpoint's mappings, while preventing early detaching on a singular device removal or unbinding. The reassignment of individual VMD child devices devices to VMs is outside the scope of VMD, but may be implemented in the future. For now it is best to prevent any such attempts. This patch prevents VMD child devices from returning an IOMMU, which prevents it from exposing an iommu_group sysfs directories and allowing subsequent binding by userspace-access drivers such as VFIO. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- drivers/iommu/intel-iommu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 687f18f..94353a6e 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -901,6 +901,11 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf struct pci_dev *pf_pdev; pdev = to_pci_dev(dev); + + /* VMD child devices currently cannot be handled individually */ + if (is_vmd(pdev->bus)) + return NULL; + /* VFs aren't listed in scope tables; we need to look up * the PF instead to find the IOMMU. */ pf_pdev = pci_physfn(pdev); -- 2.9.4
[PATCH v2 2/4] pci/x86: Move VMD quirks to x86 fixups
VMD currently only exists for Intel x86 products Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- arch/x86/pci/fixup.c | 18 ++ drivers/pci/quirks.c | 17 - 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 11e4074..ca4b02e5 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -618,3 +618,21 @@ static void quirk_apple_mbp_poweroff(struct pci_dev *pdev) dev_info(dev, "can't work around MacBook Pro poweroff issue\n"); } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff); + +/* + * VMD-enabled root ports will change the source ID for all messages + * to the VMD device. Rather than doing device matching with the source + * ID, the AER driver should traverse the child device tree, reading + * AER registers to find the faulting device. + */ +static void quirk_no_aersid(struct pci_dev *pdev) +{ + /* VMD Domain */ + if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) + pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); + diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index f1c9852..073baba 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4671,23 +4671,6 @@ static void quirk_intel_qat_vf_cap(struct pci_dev *pdev) } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, quirk_intel_qat_vf_cap); -/* - * VMD-enabled root ports will change the source ID for all messages - * to the VMD device. Rather than doing device matching with the source - * ID, the AER driver should traverse the child device tree, reading - * AER registers to find the faulting device. - */ -static void quirk_no_aersid(struct pci_dev *pdev) -{ - /* VMD Domain */ - if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) - pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; -} -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); - /* FLR may cause some 82579 devices to hang. */ static void quirk_intel_no_flr(struct pci_dev *dev) { -- 2.9.4
[PATCH] ext4: Check superblock mapped prior to committing
This patch attempts to close a hole leading to a BUG seen with hot removals during writes [1]. A block device (NVME namespace in this test case) is formatted to EXT4 without partitions. It's mounted and write I/O is run to a file, then the device is hot removed from the slot. The superblock attempts to be written to the drive which is no longer present. The typical chain of events leading to the BUG: ext4_commit_super() __sync_dirty_buffer() submit_bh() submit_bh_wbc() BUG_ON(!buffer_mapped(bh)); This fix checks for the superblock's buffer head being mapped prior to syncing. [1] https://www.spinics.net/lists/linux-ext4/msg56527.html Signed-off-by: Jon Derrick --- fs/ext4/super.c | 8 1 file changed, 8 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0c4c220..ee33233 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4736,6 +4736,14 @@ static int ext4_commit_super(struct super_block *sb, int sync) if (!sbh || block_device_ejected(sb)) return error; + + /* +* The superblock bh should be mapped, but it might not be if the +* device was hot-removed. Not much we can do but fail the I/O. +*/ + if (!buffer_mapped(sbh)) + return error; + /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock -- 2.7.4
[PATCH] PCI: Equalize hotplug memory for non/occupied slots
Currently, a hotplug bridge will be given hpmemsize additional memory if available, in order to satisfy any future hotplug allocation requirements. These calculations don't consider the current memory size of the hotplug bridge/slot, so hotplug bridges/slots which have downstream devices will get their current allocation in addition to the hpmemsize value. This makes for possibly undesirable results with a mix of unoccupied and occupied slots (ex, with hpmemsize=2M): 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d64f [size=3M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d650-d66f [size=2M] This change considers the current allocation size when using the hpmemsize parameter to make the reservations predictable for the mix of unoccupied and occupied slots: 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d63f [size=2M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d640-d65f [size=2M] Signed-off-by: Jon Derrick --- Original RFC here: https://patchwork.ozlabs.org/patch/945374/ I split this bit out from the RFC while awaiting the pci string handling enhancements to handle per-device settings Changed from RFC is a simpler algo drivers/pci/setup-bus.c | 13 ++--- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 79b1824..5ae39e6 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -831,7 +831,8 @@ static resource_size_t calculate_iosize(resource_size_t size, static resource_size_t calculate_memsize(resource_size_t size, resource_size_t min_size, - resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -841,7 +842,7 @@ static resource_size_t calculate_memsize(resource_size_t size, old_size = 0; if (size < old_size) size = old_size; - size = ALIGN(size + size1, align); + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } @@ -1079,12 +1080,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, min_align = calculate_mem_align(aligns, max_order); min_align = max(min_align, window_alignment(bus, b_res->flags)); - size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align); + size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), min_align); add_align = max(min_align, add_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_memsize(size, min_size, add_size, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_memsize(size, min_size, add_size, children_add_size, resource_size(b_res), add_align); if (!size0 && !size1) { if (b_res->start || b_res->end) -- 1.8.3.1
[PATCH v2] PCI hotplug Eq v2
Hi Bjorn, Sorry for the delay on this one and pushing it after RC1. Feel free to queue it up for 4.20 if it looks fine. I've added comments to the git log and source explaining why calculate_iosize was left unchanged. Basically I could not synthesize a condition where it would have affected the topology. v1->v2: Comments Jon Derrick (1): PCI: Equalize hotplug memory for non/occupied slots drivers/pci/setup-bus.c | 21 ++--- 1 file changed, 14 insertions(+), 7 deletions(-) -- 1.8.3.1
[PATCH v2] PCI: Equalize hotplug memory for non/occupied slots
Currently, a hotplug bridge will be given hpmemsize additional memory if available, in order to satisfy any future hotplug allocation requirements. These calculations don't consider the current memory size of the hotplug bridge/slot, so hotplug bridges/slots which have downstream devices will get their current allocation in addition to the hpmemsize value. This makes for possibly undesirable results with a mix of unoccupied and occupied slots (ex, with hpmemsize=2M): 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d64f [size=3M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d650-d66f [size=2M] This change considers the current allocation size when using the hpmemsize parameter to make the reservations predictable for the mix of unoccupied and occupied slots: 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d63f [size=2M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d640-d65f [size=2M] The calculation for IO (hpiosize) should be similar, but platform firmwares I've encountered (including QEMU) provide strict allocations for IO and would not provide free IO resources for hotplug buses in order to prove this calculation. Signed-off-by: Jon Derrick --- drivers/pci/setup-bus.c | 21 ++--- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 79b1824..70d0aba 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -831,7 +831,8 @@ static resource_size_t calculate_iosize(resource_size_t size, static resource_size_t calculate_memsize(resource_size_t size, resource_size_t min_size, - resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -841,7 +842,15 @@ static resource_size_t calculate_memsize(resource_size_t size, old_size = 0; if (size < old_size) size = old_size; - size = ALIGN(size + size1, align); + + /* +* Consider the current allocation size when adding size for extra +* hotplug memory. This ensures that occupied slots don't receive +* unneccessary memory allocations in addition to their current size. +* The calculation should be similar for calculate_iosize, but was +* unable to be tested. +*/ + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } @@ -1079,12 +1088,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, min_align = calculate_mem_align(aligns, max_order); min_align = max(min_align, window_alignment(bus, b_res->flags)); - size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align); + size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), min_align); add_align = max(min_align, add_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_memsize(size, min_size, add_size, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_memsize(size, min_size, add_size, children_add_size, resource_size(b_res), add_align); if (!size0 && !size1) { if (b_res->start || b_res->end) -- 1.8.3.1
[PATCH] PCI/AER: Fix an AER enabling/disabling race
There is a sequence with non-ACPI root ports where the AER driver can enable error reporting on the tree before port drivers have bound to ports on the tree. The port driver assumes the AER driver will set up error reporting afterwards, so instead add a check if error reporting was set up first. Example: [ 343.790573] pcieport 1:00:00.0: pci_disable_pcie_error_reporting [ 343.809812] pcieport 1:00:00.0: pci_enable_pcie_error_reporting [ 343.819506] pci 1:01:00.0: pci_enable_pcie_error_reporting [ 343.828814] pci 1:02:00.0: pci_enable_pcie_error_reporting [ 343.838089] pci 1:02:01.0: pci_enable_pcie_error_reporting [ 343.847478] pci 1:02:02.0: pci_enable_pcie_error_reporting [ 343.856659] pci 1:02:03.0: pci_enable_pcie_error_reporting [ 343.865794] pci 1:02:04.0: pci_enable_pcie_error_reporting [ 343.874875] pci 1:02:05.0: pci_enable_pcie_error_reporting [ 343.883918] pci 1:02:06.0: pci_enable_pcie_error_reporting [ 343.892922] pci 1:02:07.0: pci_enable_pcie_error_reporting [ 343.918900] pcieport 1:01:00.0: pci_disable_pcie_error_reporting [ 343.968426] pcieport 1:02:00.0: pci_disable_pcie_error_reporting [ 344.028179] pcieport 1:02:01.0: pci_disable_pcie_error_reporting [ 344.091269] pcieport 1:02:02.0: pci_disable_pcie_error_reporting [ 344.156473] pcieport 1:02:03.0: pci_disable_pcie_error_reporting [ 344.238042] pcieport 1:02:04.0: pci_disable_pcie_error_reporting [ 344.321864] pcieport 1:02:05.0: pci_disable_pcie_error_reporting [ 344.411601] pcieport 1:02:06.0: pci_disable_pcie_error_reporting [ 344.505332] pcieport 1:02:07.0: pci_disable_pcie_error_reporting [ 344.621824] nvme 1:06:00.0: pci_enable_pcie_error_reporting Signed-off-by: Jon Derrick --- drivers/pci/pcie/aer.c | 1 + drivers/pci/pcie/portdrv_core.c | 5 - include/linux/pci.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 83180ed..a4e36b6 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1333,6 +1333,7 @@ static int set_device_error_reporting(struct pci_dev *dev, void *data) if (enable) pcie_set_ecrc_checking(dev); + dev->aer_configured = 1; return 0; } diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 7c37d81..f5de554 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -224,8 +224,11 @@ static int get_port_device_capability(struct pci_dev *dev) /* * Disable AER on this port in case it's been enabled by the * BIOS (the AER service driver will enable it when necessary). +* Don't disable it if the AER service driver has already +* enabled it from the root port bus walking */ - pci_disable_pcie_error_reporting(dev); + if (!dev->aer_configured) + pci_disable_pcie_error_reporting(dev); } #endif diff --git a/include/linux/pci.h b/include/linux/pci.h index e72ca8d..c071622 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -402,6 +402,7 @@ struct pci_dev { unsigned inthas_secondary_link:1; unsigned intnon_compliant_bars:1; /* Broken BARs; ignore them */ unsigned intis_probed:1;/* Device probing in progress */ + unsigned intaer_configured:1; /* AER configured for device */ pci_dev_flags_t dev_flags; atomic_tenable_cnt; /* pci_enable_device has been called */ -- 1.8.3.1
[PATCH] PCI/portdrv: Enable error reporting on managed ports
During probe, the port driver will disable error reporting and assumes it will be enabled later by the AER driver's pci_walk_bus() sequence. This may not be the case for host-bridge enabled root ports, who will enable first error reporting on the bus during the root port probe, and then disable error reporting on downstream devices during subsequent probing of the bus. A hotplugged port device may also fail to enable error reporting as the AER driver has already run on the root bus. Check for these conditions and enable error reporting during portdrv probing. Example case: [ 343.790573] pcieport 1:00:00.0: pci_disable_pcie_error_reporting [ 343.809812] pcieport 1:00:00.0: pci_enable_pcie_error_reporting [ 343.819506] pci 1:01:00.0: pci_enable_pcie_error_reporting [ 343.828814] pci 1:02:00.0: pci_enable_pcie_error_reporting [ 343.838089] pci 1:02:01.0: pci_enable_pcie_error_reporting [ 343.847478] pci 1:02:02.0: pci_enable_pcie_error_reporting [ 343.856659] pci 1:02:03.0: pci_enable_pcie_error_reporting [ 343.865794] pci 1:02:04.0: pci_enable_pcie_error_reporting [ 343.874875] pci 1:02:05.0: pci_enable_pcie_error_reporting [ 343.883918] pci 1:02:06.0: pci_enable_pcie_error_reporting [ 343.892922] pci 1:02:07.0: pci_enable_pcie_error_reporting [ 343.918900] pcieport 1:01:00.0: pci_disable_pcie_error_reporting [ 343.968426] pcieport 1:02:00.0: pci_disable_pcie_error_reporting [ 344.028179] pcieport 1:02:01.0: pci_disable_pcie_error_reporting [ 344.091269] pcieport 1:02:02.0: pci_disable_pcie_error_reporting [ 344.156473] pcieport 1:02:03.0: pci_disable_pcie_error_reporting [ 344.238042] pcieport 1:02:04.0: pci_disable_pcie_error_reporting [ 344.321864] pcieport 1:02:05.0: pci_disable_pcie_error_reporting [ 344.411601] pcieport 1:02:06.0: pci_disable_pcie_error_reporting [ 344.505332] pcieport 1:02:07.0: pci_disable_pcie_error_reporting [ 344.621824] nvme 1:06:00.0: pci_enable_pcie_error_reporting Signed-off-by: Jon Derrick --- drivers/pci/pcie/portdrv_core.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 7c37d81..fdd953a 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -343,6 +343,16 @@ int pcie_port_device_register(struct pci_dev *dev) if (!nr_service) goto error_cleanup_irqs; +#ifdef CONFIG_PCIEAER + /* +* Enable error reporting for this port in case AER probing has already +* run on the root bus or this port device is hot-inserted +*/ + if (dev->aer_cap && pci_aer_available() && + (pcie_ports_native || pci_find_host_bridge(dev->bus)->native_aer)) + pci_enable_pcie_error_reporting(dev); +#endif + return 0; error_cleanup_irqs: -- 1.8.3.1
[PATCH v3] PCI: Equalize hotplug memory and io for non/occupied slots
Currently, a hotplug bridge will be given hpmemsize additional memory and hpiosize additional io if available, in order to satisfy any future hotplug allocation requirements. These calculations don't consider the current memory/io size of the hotplug bridge/slot, so hotplug bridges/slots which have downstream devices will be allocated their current allocation in addition to the hpmemsize value. This makes for possibly undesirable results with a mix of unoccupied and occupied slots (ex, with hpmemsize=2M): 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d64f [size=3M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d650-d66f [size=2M] This change considers the current allocation size when using the hpmemsize/hpiosize parameters to make the reservations predictable for the mix of unoccupied and occupied slots: 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d63f [size=2M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d640-d65f [size=2M] Signed-off-by: Jon Derrick --- v2->v3: Made the IO and mem size calculations nearly equivalent drivers/pci/setup-bus.c | 28 +++- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 79b1824..ed96043 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -811,6 +811,8 @@ static struct resource *find_free_bus_resource(struct pci_bus *bus, static resource_size_t calculate_iosize(resource_size_t size, resource_size_t min_size, resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -823,15 +825,18 @@ static resource_size_t calculate_iosize(resource_size_t size, #if defined(CONFIG_ISA) || defined(CONFIG_EISA) size = (size & 0xff) + ((size & ~0xffUL) << 2); #endif - size = ALIGN(size + size1, align); + size = size + size1; if (size < old_size) size = old_size; + + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } static resource_size_t calculate_memsize(resource_size_t size, resource_size_t min_size, - resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -841,7 +846,8 @@ static resource_size_t calculate_memsize(resource_size_t size, old_size = 0; if (size < old_size) size = old_size; - size = ALIGN(size + size1, align); + + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } @@ -930,12 +936,10 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, } } - size0 = calculate_iosize(size, min_size, size1, + size0 = calculate_iosize(size, min_size, size1, 0, 0, resource_size(b_res), min_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_iosize(size, min_size, add_size + size1, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_iosize(size, min_size, size1, add_size, children_add_size, resource_size(b_res), min_align); if (!size0 && !size1) { if (b_res->start || b_res->end) @@ -1079,12 +1083,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, min_align = calculate_mem_align(aligns, max_order); min_align = max(min_align, window_alignment(bus, b_res->flags)); - size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align); + size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), min_align); add_align = max(min_align, add_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_memsize(size, min_size, add_size, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_memsize(size, min_size, add_size, children_add_size, resource_size(b_res), add_align); if (!size0 && !size1) { if (b_res->start || b_res->end) -- 1.8.3.1
[PATCH v2 2/4] pci/x86: Move VMD quirks to x86 fixups
VMD currently only exists for Intel x86 products Signed-off-by: Jon Derrick --- arch/x86/pci/fixup.c | 18 ++ drivers/pci/quirks.c | 17 - 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 11e4074..ca4b02e5 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -618,3 +618,21 @@ static void quirk_apple_mbp_poweroff(struct pci_dev *pdev) dev_info(dev, "can't work around MacBook Pro poweroff issue\n"); } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff); + +/* + * VMD-enabled root ports will change the source ID for all messages + * to the VMD device. Rather than doing device matching with the source + * ID, the AER driver should traverse the child device tree, reading + * AER registers to find the faulting device. + */ +static void quirk_no_aersid(struct pci_dev *pdev) +{ + /* VMD Domain */ + if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) + pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); + diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index f1c9852..073baba 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4671,23 +4671,6 @@ static void quirk_intel_qat_vf_cap(struct pci_dev *pdev) } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, quirk_intel_qat_vf_cap); -/* - * VMD-enabled root ports will change the source ID for all messages - * to the VMD device. Rather than doing device matching with the source - * ID, the AER driver should traverse the child device tree, reading - * AER registers to find the faulting device. - */ -static void quirk_no_aersid(struct pci_dev *pdev) -{ - /* VMD Domain */ - if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) - pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; -} -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); - /* FLR may cause some 82579 devices to hang. */ static void quirk_intel_no_flr(struct pci_dev *dev) { -- 2.9.4
[PATCH v2 4/4] iommu: Prevent VMD child devices from being remapping targets
VMD child devices must use the VMD endpoint's ID as the requester. Because of this, there needs to be a way to link the parent VMD endpoint's iommu group and associated mappings to the VMD child devices such that attaching and detaching child devices modify the endpoint's mappings, while preventing early detaching on a singular device removal or unbinding. The reassignment of individual VMD child devices devices to VMs is outside the scope of VMD, but may be implemented in the future. For now it is best to prevent any such attempts. This patch prevents VMD child devices from returning an IOMMU, which prevents it from exposing an iommu_group sysfs directories and allowing subsequent binding by userspace-access drivers such as VFIO. Signed-off-by: Jon Derrick --- drivers/iommu/intel-iommu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 687f18f..94353a6e 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -901,6 +901,11 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf struct pci_dev *pf_pdev; pdev = to_pci_dev(dev); + + /* VMD child devices currently cannot be handled individually */ + if (is_vmd(pdev->bus)) + return NULL; + /* VFs aren't listed in scope tables; we need to look up * the PF instead to find the IOMMU. */ pf_pdev = pci_physfn(pdev); -- 2.9.4
[PATCH v2 0/4] VMD fixups
Mostly just cleanup in this revision, eg, trying to limit scope of vmd code to x86 Previous: https://patchwork.kernel.org/patch/9886095/ https://patchwork.kernel.org/patch/9886097/ https://patchwork.kernel.org/patch/9886101/ Jon Derrick (4): MAINTAINERS: Add Jonathan Derrick as VMD maintainer pci/x86: Move VMD quirks to x86 fixups x86/PCI: Use is_vmd rather than relying on the domain number iommu: Prevent VMD child devices from being remapping targets MAINTAINERS | 1 + arch/x86/pci/fixup.c| 18 ++ drivers/iommu/intel-iommu.c | 5 + drivers/pci/quirks.c| 17 - 4 files changed, 24 insertions(+), 17 deletions(-) -- 2.9.4
[PATCH v2 3/4] x86/PCI: Use is_vmd rather than relying on the domain number
Signed-off-by: Jon Derrick --- arch/x86/pci/fixup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index ca4b02e5..1ed2fbf 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -628,7 +628,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff); static void quirk_no_aersid(struct pci_dev *pdev) { /* VMD Domain */ - if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) + if (is_vmd(pdev->bus)) pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); -- 2.9.4
[PATCH v2 1/4] MAINTAINERS: Add Jonathan Derrick as VMD maintainer
Add myself as VMD maintainer Signed-off-by: Jon Derrick Acked-by: Keith Busch --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f66488d..3ec39df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10090,6 +10090,7 @@ F: drivers/pci/dwc/*imx6* PCI DRIVER FOR INTEL VOLUME MANAGEMENT DEVICE (VMD) M: Keith Busch +M: Jonathan Derrick L: linux-...@vger.kernel.org S: Supported F: drivers/pci/host/vmd.c -- 2.9.4
Re: [RFC 1/3] PCI: pci-driver: Introduce pci device delete list
Hi Greg, On 09/28/2017 03:09 AM, Greg Kroah-Hartman wrote: > On Wed, Sep 27, 2017 at 04:40:20PM -0400, Jon Derrick wrote: >> This patch introduces a new kernel command line parameter to mask pci >> device ids from pci driver id tables. This prevents masked devices from >> automatically binding to both built-in and module drivers. >> >> Devices can be later attached through the driver's sysfs new_id >> inteface. >> >> The use cases for this are primarily for debugging, eg, being able to >> prevent attachment before probes are set up. It can also be used to mask >> off faulty built-in hardware or faulty simulated hardware. >> >> Another use case is to prevent attachment of devices which will be >> passed to VMs, shortcutting the detachment effort. > > Is the "shortcut" really that big of a deal? unbind actually causes > problems? Is this an attempt to deal with devices being handled by more > than one driver and then you want to manually bind it later on? > >> The format is similar to the sysfs new_id format. Device ids are >> specified with: >> >> :[:SVVV:SDDD][:][:] >> >> Where: >> = Vendor ID >> = Device ID >> SVVV = Subvendor ID >> SDDD = Subdevice ID >> = Class >> = Class Mask >> >> IDs can be chained with commas. >> >> Examples: >> .delete_id=1234:5678 >> .delete_id=: >> .delete_id=::::010802 >> .delete_id=1234:5678,abcd:ef01,2345: > > What about drivers that handle more than one bus type (i.e. USB and > PCI?) This format is specific to PCI, yet you are defining it as a > "global" for all drivers :( > I was hoping to extend it to other bus types as well, but just wanted some early feedback on the idea. Pci was the easier implementation for me. Could prepending pci:, usb:, etc on the format be an option? > This feels hacky, what is the real reason for this? It feels like we > have so many different ways to blacklist and unbind and bind devices to > drivers already, why add yet-another way? > I ran into an issue a while back where I needed to disable a device from a built-in driver to perform a regression test. I worked around that issue by doing an initcall_blacklist on the pci-driver declaration, but that also preventing later binding because the driver was never registered. I've also had issues with remote systems where pluggable devices were broken or otherwise non-responsive, and it would have been nice to have been able to keep them from binding on module loading as a temporary workaround until someone could have removed those devices (though impossible on built-in hardware). I'm not sure about hacky; I think the implementation in this patch (1/3) is pretty clean :). I'm not familiar with all the different ways of blacklisting. Is there another way to work around the issues I listed above? I understand the concern about having it exist in the format . and only supporting one or a few bus types. I have another set that extends the pci= parameter instead. > thanks, > > greg k-h > Best, Jon
Re: [RFC 2/3] module: Ignore delete_id parameter
On 09/28/2017 03:02 AM, Greg Kroah-Hartman wrote: > On Wed, Sep 27, 2017 at 04:40:21PM -0400, Jon Derrick wrote: >> The PCI driver delete_id parameter is handled in each individual driver >> registration callback. >> >> Signed-off-by: Jon Derrick >> --- >> kernel/module.c | 7 +++ >> 1 file changed, 7 insertions(+) >> >> diff --git a/kernel/module.c b/kernel/module.c >> index de66ec8..2b2dccf 100644 >> --- a/kernel/module.c >> +++ b/kernel/module.c >> @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char >> *val, const char *modname, >> return 0; >> } >> >> +/* >> + * Ignore driver delete list arguments. They are handled by driver >> + * registration callbacks >> + */ >> +if (strcmp(param, "delete_id") == 0) >> +return 0; > > Why? This is only for the PCI core as you have defined it in this > patchset, but you just broke this module id for all other kernel modules > in the system :( > > If you want to do this, you need to provide this feature for _all_ > kernel drivers... > > thanks, > > greg k-h > Yes I'm not particularly happy about this one either. I will make this more robust if the blacklisting idea is sound.
Re: [RFC 2/3] module: Ignore delete_id parameter
On 09/28/2017 12:03 AM, Dan Williams wrote: > On Wed, Sep 27, 2017 at 1:40 PM, Jon Derrick > wrote: >> The PCI driver delete_id parameter is handled in each individual driver >> registration callback. >> >> Signed-off-by: Jon Derrick >> --- >> kernel/module.c | 7 +++ >> 1 file changed, 7 insertions(+) >> >> diff --git a/kernel/module.c b/kernel/module.c >> index de66ec8..2b2dccf 100644 >> --- a/kernel/module.c >> +++ b/kernel/module.c >> @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char >> *val, const char *modname, >> return 0; >> } >> >> + /* >> +* Ignore driver delete list arguments. They are handled by driver >> +* registration callbacks >> +*/ >> + if (strcmp(param, "delete_id") == 0) >> + return 0; >> + > > Does this preclude something like: > > modprobe ahci delete_id=1234:5678? > It does seem like it would. I can look into calling into the pci callback for this, but val is a struct module here and I haven't figured out the plumbing to get the [correct] driver from that. Maybe if I enforce the format of 'modprobe ahci ahci.delete_id=' to ensure the driver is specified (and would be required in cases with multi-driver modules).
Re: [PATCH 2/3] pci: Generalize is_vmd behavior
On 08/11/2017 11:03 AM, Bjorn Helgaas wrote: > On Mon, Aug 07, 2017 at 01:57:12PM -0600, Jon Derrick wrote: >> Generalize is_vmd behavior to remove dependency on domain number >> checking in pci quirks. >> >> Signed-off-by: Jon Derrick >> --- >> arch/x86/include/asm/pci.h | 8 +++- >> arch/x86/pci/common.c | 2 +- >> drivers/pci/quirks.c | 2 +- >> include/linux/pci.h| 4 >> 4 files changed, 9 insertions(+), 7 deletions(-) >> >> diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h >> index 473a729..5c5d54a 100644 >> --- a/arch/x86/include/asm/pci.h >> +++ b/arch/x86/include/asm/pci.h >> @@ -60,16 +60,14 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus >> *bus) >> #define pci_root_bus_fwnode _pci_root_bus_fwnode >> #endif >> >> -static inline bool is_vmd(struct pci_bus *bus) >> -{ >> #if IS_ENABLED(CONFIG_VMD) >> +static inline bool pci_bus_is_vmd(struct pci_bus *bus) >> +{ >> struct pci_sysdata *sd = bus->sysdata; >> >> return sd->vmd_domain; >> -#else >> -return false; >> -#endif >> } >> +#endif >> >> /* Can be used to override the logic in pci_scan_bus for skipping >> already-configured bus numbers - to be used for buggy BIOSes >> diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c >> index dbe2132..18b2277 100644 >> --- a/arch/x86/pci/common.c >> +++ b/arch/x86/pci/common.c >> @@ -662,7 +662,7 @@ static void set_dma_domain_ops(struct pci_dev *pdev) {} >> >> static void set_dev_domain_options(struct pci_dev *pdev) >> { >> -if (is_vmd(pdev->bus)) >> +if (pci_bus_is_vmd(pdev->bus)) >> pdev->hotplug_user_indicators = 1; >> } >> >> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c >> index 6967c6b..ba47995 100644 >> --- a/drivers/pci/quirks.c >> +++ b/drivers/pci/quirks.c >> @@ -4666,7 +4666,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, >> quirk_intel_qat_vf_cap); >> static void quirk_no_aersid(struct pci_dev *pdev) >> { >> /* VMD Domain */ >> -if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) >> +if (pci_bus_is_vmd(pdev->bus)) > > I like this part ... > >> pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; >> } >> DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); >> diff --git a/include/linux/pci.h b/include/linux/pci.h >> index 4869e66..0299d8b 100644 >> --- a/include/linux/pci.h >> +++ b/include/linux/pci.h >> @@ -1471,6 +1471,10 @@ static inline int pci_proc_domain(struct pci_bus >> *bus) { return 0; } >> static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } >> #endif /* CONFIG_PCI_DOMAINS */ >> >> +#if !IS_ENABLED(CONFIG_VMD) >> +static inline bool pci_bus_is_vmd(struct pci_bus *bus) { return false; } >> +#endif > > But not so much this part. VMD is (AFAIK) still fundamentally an > x86-only thing, so I'd like it better if this could all be within > arch/x86. Could this be done by moving quirk_no_aersid() to > arch/x86/pci/fixup.c? > Thanks for pointing this out. I'll think of something different and localize it to the x86 code domain. > BTW, CONFIG_VMD in drivers/pci/host/Kconfig is currently "depends on > SRCU". I'm not a Kconfig expert, but that doesn't seem like an > intuitive connection. And it's the only such dependency on SRCU in > the tree -- most other places use "select SRCU", which makes more > sense to me. I agree - most places use select SRCU. I'll add that to v2. > >> /* >> * Generic implementation for PCI domain support. If your >> * architecture does not need custom management of PCI >> -- >> 2.9.4 >>
Re: [PATCH 3/3] iommu: prevent VMD child devices from being remapping targets
Hi Robin, thanks for the reply. On 08/11/2017 12:25 PM, Robin Murphy wrote: > On 07/08/17 20:57, Jon Derrick wrote: >> VMD child devices must use the VMD endpoint's ID as the DMA source. >> Because of this, there needs to be a way to link the parent VMD >> endpoint's DMAR domain to the VMD child devices' DMAR domain such that >> attaching and detaching child devices modify the endpoint's DMAR mapping >> and prevents early detaching. > > That sounds like either pci_device_group() needs modifying, or perhaps > that intel-iommu needs its own extended iommu_ops::device_group > implementation, to ensure that VMD child devices get put in the same > group as their parent - if they share requester IDs they can't feasibly > be attached to different domains anyway. > > Robin. Yes it seems like that to me too. I have a high-level understanding of the changes required but not too much in the nitty-gritty details. It's a bit more complicated anyways because VMD emerges a set of root ports, and AFAICT, PCI ACS end at the root ports and iommu groups rely on ACS to group devices. Either way it's not within the scope of VMD.
[PATCH 1/3] MAINTAINERS: Add Jonathan Derrick as VMD maintainer
Add myself as VMD maintainer Signed-off-by: Jon Derrick --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f66488d..3ec39df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10090,6 +10090,7 @@ F: drivers/pci/dwc/*imx6* PCI DRIVER FOR INTEL VOLUME MANAGEMENT DEVICE (VMD) M: Keith Busch +M: Jonathan Derrick L: linux-...@vger.kernel.org S: Supported F: drivers/pci/host/vmd.c -- 2.9.4
[PATCH 2/3] pci: Generalize is_vmd behavior
Generalize is_vmd behavior to remove dependency on domain number checking in pci quirks. Signed-off-by: Jon Derrick --- arch/x86/include/asm/pci.h | 8 +++- arch/x86/pci/common.c | 2 +- drivers/pci/quirks.c | 2 +- include/linux/pci.h| 4 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 473a729..5c5d54a 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -60,16 +60,14 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus *bus) #define pci_root_bus_fwnode_pci_root_bus_fwnode #endif -static inline bool is_vmd(struct pci_bus *bus) -{ #if IS_ENABLED(CONFIG_VMD) +static inline bool pci_bus_is_vmd(struct pci_bus *bus) +{ struct pci_sysdata *sd = bus->sysdata; return sd->vmd_domain; -#else - return false; -#endif } +#endif /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index dbe2132..18b2277 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -662,7 +662,7 @@ static void set_dma_domain_ops(struct pci_dev *pdev) {} static void set_dev_domain_options(struct pci_dev *pdev) { - if (is_vmd(pdev->bus)) + if (pci_bus_is_vmd(pdev->bus)) pdev->hotplug_user_indicators = 1; } diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6967c6b..ba47995 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4666,7 +4666,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, quirk_intel_qat_vf_cap); static void quirk_no_aersid(struct pci_dev *pdev) { /* VMD Domain */ - if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) + if (pci_bus_is_vmd(pdev->bus)) pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); diff --git a/include/linux/pci.h b/include/linux/pci.h index 4869e66..0299d8b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1471,6 +1471,10 @@ static inline int pci_proc_domain(struct pci_bus *bus) { return 0; } static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } #endif /* CONFIG_PCI_DOMAINS */ +#if !IS_ENABLED(CONFIG_VMD) +static inline bool pci_bus_is_vmd(struct pci_bus *bus) { return false; } +#endif + /* * Generic implementation for PCI domain support. If your * architecture does not need custom management of PCI -- 2.9.4
[PATCH 3/3] iommu: prevent VMD child devices from being remapping targets
VMD child devices must use the VMD endpoint's ID as the DMA source. Because of this, there needs to be a way to link the parent VMD endpoint's DMAR domain to the VMD child devices' DMAR domain such that attaching and detaching child devices modify the endpoint's DMAR mapping and prevents early detaching. This is outside the scope of VMD, so disable binding child devices to prevent unforeseen issues. This functionality may be implemented in the future. This patch prevents VMD child devices from returning an IOMMU, which prevents it from exposing iommu_group sysfs directories and subsequent binding by userspace-access drivers such as VFIO. Signed-off-by: Jon Derrick --- drivers/iommu/intel-iommu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 687f18f..651a6cd 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -905,6 +905,11 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf * the PF instead to find the IOMMU. */ pf_pdev = pci_physfn(pdev); dev = _pdev->dev; + + /* VMD child devices currently cannot be handled individually */ + if (pci_bus_is_vmd(pdev->bus)) + return NULL; + segment = pci_domain_nr(pdev->bus); } else if (has_acpi_companion(dev)) dev = _COMPANION(dev)->dev; -- 2.9.4
[PATCH] genirq/affinity: report extra vectors on uneven nodes
The current irq spreading algorithm spreads vectors amongst cpus evenly per node. If a node has more cpus than another node, the extra vectors being spread may not be reported back to the caller. This is most apparent with the NVMe driver and nr_cpus < vectors, where the underreporting results in the caller's WARN being triggered: irq_build_affinity_masks() ... if (nr_present < numvecs) WARN_ON(nr_present + nr_others < numvecs); Signed-off-by: Jon Derrick --- kernel/irq/affinity.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 4352b08ae48d..9beafb8c7e92 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -127,7 +127,8 @@ static int __irq_build_affinity_masks(unsigned int startvec, } for_each_node_mask(n, nodemsk) { - unsigned int ncpus, v, vecs_to_assign, vecs_per_node; + unsigned int ncpus, v, vecs_to_assign, total_vecs_to_assign, + vecs_per_node; /* Spread the vectors per node */ vecs_per_node = (numvecs - (curvec - firstvec)) / nodes; @@ -141,14 +142,16 @@ static int __irq_build_affinity_masks(unsigned int startvec, /* Account for rounding errors */ extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign); + total_vecs_to_assign = vecs_to_assign + extra_vecs; - for (v = 0; curvec < last_affv && v < vecs_to_assign; + for (v = 0; curvec < last_affv && v < total_vecs_to_assign; curvec++, v++) { cpus_per_vec = ncpus / vecs_to_assign; /* Account for extra vectors to compensate rounding errors */ if (extra_vecs) { cpus_per_vec++; + v++; --extra_vecs; } irq_spread_init_one([curvec].mask, nmsk, -- 2.20.1
[PATCH 2/2] PCI: vmd: Fix shadow offsets to reflect spec changes
The shadow offset scratchpad was moved to 0x2000-0x2010. Update the location to get the correct shadow offset. Cc: sta...@vger.kernel.org # v5.2+ Fixes: 6788958e ("PCI: vmd: Assign membar addresses from shadow registers") Signed-off-by: Jon Derrick --- drivers/pci/controller/vmd.c | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 2e4da3f51d6b..a35d3f3996d7 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -31,6 +31,9 @@ #define PCI_REG_VMLOCK 0x70 #define MB2_SHADOW_EN(vmlock) (vmlock & 0x2) +#define MB2_SHADOW_OFFSET 0x2000 +#define MB2_SHADOW_SIZE16 + enum vmd_features { /* * Device may contain registers which hint the physical location of the @@ -578,7 +581,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) u32 vmlock; int ret; - membar2_offset = 0x2018; + membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE; ret = pci_read_config_dword(vmd->dev, PCI_REG_VMLOCK, ); if (ret || vmlock == ~0) return -ENODEV; @@ -590,9 +593,9 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) if (!membar2) return -ENOMEM; offset[0] = vmd->dev->resource[VMD_MEMBAR1].start - - readq(membar2 + 0x2008); + readq(membar2 + MB2_SHADOW_OFFSET); offset[1] = vmd->dev->resource[VMD_MEMBAR2].start - - readq(membar2 + 0x2010); + readq(membar2 + MB2_SHADOW_OFFSET + 8); pci_iounmap(vmd->dev, membar2); } } -- 2.20.1
[PATCH 0/2] VMD fixes for v5.4
Hi Lorenzo, Bjorn, Keith, Please consider the following patches for 5.4 inclusion. These will apply to 5.2 stable. 4.19 has a few feature deps so I will instead follow-up with a backport. Jon Derrick (2): PCI: vmd: Fix config addressing when using bus offsets PCI: vmd: Fix shadow offsets to reflect spec changes drivers/pci/controller/vmd.c | 25 +++-- 1 file changed, 15 insertions(+), 10 deletions(-) -- 2.20.1
[PATCH 1/2] PCI: vmd: Fix config addressing when using bus offsets
VMD maps child device config spaces to the VMD Config BAR linearly regardless of the starting bus offset. Because of this, the config address decode must ignore starting bus offsets when mapping the BDF to the config space address. Cc: sta...@vger.kernel.org # v5.2+ Fixes: 2a5a9c9a ("PCI: vmd: Add offset to bus numbers if necessary") Signed-off-by: Jon Derrick --- drivers/pci/controller/vmd.c | 16 +--- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 4575e0c6dc4b..2e4da3f51d6b 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -94,6 +94,7 @@ struct vmd_dev { struct resource resources[3]; struct irq_domain *irq_domain; struct pci_bus *bus; + u8 busn_start; struct dma_map_ops dma_ops; struct dma_domain dma_domain; @@ -440,7 +441,8 @@ static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus, unsigned int devfn, int reg, int len) { char __iomem *addr = vmd->cfgbar + -(bus->number << 20) + (devfn << 12) + reg; +((bus->number - vmd->busn_start) << 20) + +(devfn << 12) + reg; if ((addr - vmd->cfgbar) + len >= resource_size(>dev->resource[VMD_CFGBAR])) @@ -563,7 +565,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) unsigned long flags; LIST_HEAD(resources); resource_size_t offset[2] = {0}; - resource_size_t membar2_offset = 0x2000, busn_start = 0; + resource_size_t membar2_offset = 0x2000; struct pci_bus *child; /* @@ -606,14 +608,14 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) pci_read_config_dword(vmd->dev, PCI_REG_VMCONFIG, ); if (BUS_RESTRICT_CAP(vmcap) && (BUS_RESTRICT_CFG(vmconfig) == 0x1)) - busn_start = 128; + vmd->busn_start = 128; } res = >dev->resource[VMD_CFGBAR]; vmd->resources[0] = (struct resource) { .name = "VMD CFGBAR", - .start = busn_start, - .end = busn_start + (resource_size(res) >> 20) - 1, + .start = vmd->busn_start, + .end = vmd->busn_start + (resource_size(res) >> 20) - 1, .flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED, }; @@ -681,8 +683,8 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) pci_add_resource_offset(, >resources[1], offset[0]); pci_add_resource_offset(, >resources[2], offset[1]); - vmd->bus = pci_create_root_bus(>dev->dev, busn_start, _ops, - sd, ); + vmd->bus = pci_create_root_bus(>dev->dev, vmd->busn_start, + _ops, sd, ); if (!vmd->bus) { pci_free_resource_list(); irq_domain_remove(vmd->irq_domain); -- 2.20.1
[PATCH] PCI: vmd: Expose oob management window to users
Some VMD devices provide a management window within MEMBAR2 that is used to communicate out-of-band with child devices. This patch creates a binary file for interacting with the interface. OOB Reads/Writes are bounds-checked by sysfs_fs_bin_{read,write} Signed-off-by: Jon Derrick --- Depends on https://lore.kernel.org/linux-pci/20190916135435.5017-1-jonathan.derr...@intel.com/T/#t drivers/pci/controller/vmd.c | 128 --- 1 file changed, 117 insertions(+), 11 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index a35d3f3996d7..b13954cf9c96 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -33,6 +33,8 @@ #define MB2_SHADOW_OFFSET 0x2000 #define MB2_SHADOW_SIZE16 +#define MB2_OOB_WINDOW_OFFSET 0x2010 +#define MB2_OOB_WINDOW_SIZE128 enum vmd_features { /* @@ -47,6 +49,12 @@ enum vmd_features { * bus numbering */ VMD_FEAT_HAS_BUS_RESTRICTIONS = (1 << 1), + + /* +* Device may provide an out-of-band management interface through a +* read/write window +*/ + VMD_FEAT_HAS_OOB_WINDOW = (1 << 2), }; /* @@ -101,6 +109,10 @@ struct vmd_dev { struct dma_map_ops dma_ops; struct dma_domain dma_domain; + + spinlock_t oob_lock; + char __iomem*oob_addr; + struct bin_attribute*oob_attr; }; static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus) @@ -543,6 +555,68 @@ static void vmd_detach_resources(struct vmd_dev *vmd) vmd->dev->resource[VMD_MEMBAR2].child = NULL; } +static ssize_t vmd_oob_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + struct vmd_dev *vmd = attr->private; + unsigned long flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + spin_lock_irqsave(>oob_lock, flags); + memcpy_fromio([off], >oob_addr[off], count); + spin_unlock_irqrestore(>oob_lock, flags); + + return count; +} + +static ssize_t vmd_oob_write(struct file *filp, struct kobject *kobj, +struct bin_attribute *attr, char *buf, +loff_t off, size_t count) +{ + struct vmd_dev *vmd = attr->private; + unsigned long flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + spin_lock_irqsave(>oob_lock, flags); + memcpy_toio(>oob_addr[off], [off], count); + spin_unlock_irqrestore(>oob_lock, flags); + + return count; +} + +static int vmd_create_oob_file(struct vmd_dev *vmd) +{ + struct pci_dev *dev = vmd->dev; + struct bin_attribute *oob_attr; + + oob_attr = devm_kzalloc(>dev->dev, sizeof(*oob_attr), GFP_ATOMIC); + if (!oob_attr) + return -ENOMEM; + + spin_lock_init(>oob_lock); + sysfs_bin_attr_init(oob_attr); + vmd->oob_attr = oob_attr; + oob_attr->attr.name = "oob"; + oob_attr->attr.mode = S_IRUSR | S_IWUSR; + oob_attr->size = MB2_OOB_WINDOW_SIZE; + oob_attr->read = vmd_oob_read; + oob_attr->write = vmd_oob_write; + oob_attr->private = (void *)vmd; + + return sysfs_create_bin_file(>dev.kobj, oob_attr); +} + +static void vmd_destroy_oob_file(struct vmd_dev *vmd) +{ + if (vmd->oob_attr) + sysfs_remove_bin_file(>dev->dev.kobj, vmd->oob_attr); +} + /* * VMD domains start at 0x1 to not clash with ACPI _SEG domains. * Per ACPI r6.0, sec 6.5.6, _SEG returns an integer, of which the lower @@ -570,6 +644,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) resource_size_t offset[2] = {0}; resource_size_t membar2_offset = 0x2000; struct pci_bus *child; + int ret; /* * Shadow registers may exist in certain VMD device ids which allow @@ -579,7 +654,6 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) */ if (features & VMD_FEAT_HAS_MEMBAR_SHADOW) { u32 vmlock; - int ret; membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE; ret = pci_read_config_dword(vmd->dev, PCI_REG_VMLOCK, ); @@ -614,6 +688,24 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) vmd->busn_start = 128; } + /* +* Certain VMD devices provide a window for communicating with child +* devices through a management interface +*/ + if (features & VMD_FEAT_HAS_OOB_WINDOW) { + membar2_offset = MB2_OOB_WINDOW_OFFSET + MB2_OOB_WINDOW_SIZE; + vmd
[RFC 3/3] Documentation: Add pci device delete_id interface
Document the .delete_id= cmdline interface for masking built-in and module device id tables. Signed-off-by: Jon Derrick --- Documentation/admin-guide/kernel-parameters.txt | 13 + 1 file changed, 13 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0549662..862c8b5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -789,6 +789,19 @@ Defaults to the default architecture's huge page size if not specified. + driver.delete_id= + [PCI] + Specified pci device format(s) will mask built-in + driver id tables, preventing devices from + automicatically attaching to built-in or module + drivers. Later attachment should be done by adding the + deleted id back through the "new_id" sysfs interface. + Format: :[::]\ + [:class][:class_mask][, ...] + PCI_ANY_ID masks should use the full 32-bit format. + Examples: virtio-pci.delete_id=: + nvme.delete_id=1234:5678,: + dhash_entries= [KNL] Set number of hash buckets for dentry cache. -- 1.8.3.1
[RFC 2/3] module: Ignore delete_id parameter
The PCI driver delete_id parameter is handled in each individual driver registration callback. Signed-off-by: Jon Derrick --- kernel/module.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/kernel/module.c b/kernel/module.c index de66ec8..2b2dccf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname, return 0; } + /* +* Ignore driver delete list arguments. They are handled by driver +* registration callbacks +*/ + if (strcmp(param, "delete_id") == 0) + return 0; + /* Check for magic 'dyndbg' arg */ ret = ddebug_dyndbg_module_param_cb(param, val, modname); if (ret != 0) -- 1.8.3.1
[RFC 0/3] Introduce PCI device blacklisting
This set introduces a method to keep devices from matching driver id tables. Please see 0001 for more details of the motivation and implementation. This set currently applies to Bjorn's next (v4.14-rc1) branch. Jon Derrick (3): PCI: pci-driver: Introduce pci device delete list module: Ignore delete_id parameter Documentation: Add pci device delete_id interface Documentation/admin-guide/kernel-parameters.txt | 13 ++ drivers/pci/pci-driver.c| 253 +++- include/linux/pci.h | 1 + kernel/module.c | 7 + 4 files changed, 272 insertions(+), 2 deletions(-) -- 1.8.3.1
[RFC 1/3] PCI: pci-driver: Introduce pci device delete list
This patch introduces a new kernel command line parameter to mask pci device ids from pci driver id tables. This prevents masked devices from automatically binding to both built-in and module drivers. Devices can be later attached through the driver's sysfs new_id inteface. The use cases for this are primarily for debugging, eg, being able to prevent attachment before probes are set up. It can also be used to mask off faulty built-in hardware or faulty simulated hardware. Another use case is to prevent attachment of devices which will be passed to VMs, shortcutting the detachment effort. The format is similar to the sysfs new_id format. Device ids are specified with: :[:SVVV:SDDD][:][:] Where: = Vendor ID = Device ID SVVV = Subvendor ID SDDD = Subdevice ID = Class = Class Mask IDs can be chained with commas. Examples: .delete_id=1234:5678 .delete_id=: .delete_id=::::010802 .delete_id=1234:5678,abcd:ef01,2345: Signed-off-by: Jon Derrick --- drivers/pci/pci-driver.c | 253 ++- include/linux/pci.h | 1 + 2 files changed, 252 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 11bd267..7acdf13 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "pci.h" struct pci_dynid { @@ -27,6 +28,250 @@ struct pci_dynid { struct pci_device_id id; }; +static struct pci_device_id *pci_match_deleted_ids(struct pci_driver *drv, + struct pci_dev *dev, + bool inverse) +{ + struct pci_dynid *deleteid; + struct pci_device_id *found_id = NULL; + + spin_lock(>deleteids.lock); + list_for_each_entry(deleteid, >deleteids.list, node) { + if (!inverse && pci_match_one_device(>id, dev)) { + found_id = >id; + break; + } + if (inverse && !pci_match_one_device(>id, dev)) { + found_id = >id; + break; + } + } + spin_unlock(>deleteids.lock); + + return found_id; +} + +/** + * pci_match_non_deleted_ids - match dev against not-deleted ids + * @ids: array of PCI device id structures to search in + * @dev: the PCI device structure to match against. + * + * Finds devices in driver id table not matching the delete list and tries to + * match them individually to dev. Returns the matching pci_dev_id structure, + * %NULL if there is no match, of -errno if there was a failure to allocate + * memory for the temporary pci_dev + */ +static struct pci_device_id *pci_match_non_deleted_ids(struct pci_driver *drv, + struct pci_dev *dev) +{ + const struct pci_device_id *ids = drv->id_table; + struct pci_device_id *match = NULL; + struct pci_dev *tmpdev; + + tmpdev = kzalloc(sizeof(*tmpdev), GFP_KERNEL); + if (!tmpdev) + return ERR_PTR(-ENOMEM); + + while (ids->vendor || ids->subvendor || ids->class_mask) { + struct pci_device_id *found_id = NULL; + + tmpdev->vendor = ids->vendor; + tmpdev->device = ids->device, + tmpdev->subsystem_vendor = ids->subvendor, + tmpdev->subsystem_device = ids->subdevice, + tmpdev->class = ids->class; + + found_id = pci_match_deleted_ids(drv, tmpdev, true); + if (found_id) { + const struct pci_device_id id[2] = { *found_id, {0,} }; + if (pci_match_id(id, dev)) { + match = found_id; + break; + } + } + ids++; + } + + kfree(tmpdev); + + return match; +} + +/** + * pci_add_delete_id - add a new PCI device ID to a built-in driver's blacklist + * @drv: target pci driver + * @vendor: PCI vendor ID + * @device: PCI device ID + * @subvendor: PCI subvendor ID + * @subdevice: PCI subdevice ID + * @class: PCI class + * @class_mask: PCI class mask + * + * Adds a new dynamic pci device ID to this driver's delete list, preventing + * the device from attaching to built-in drivers + * + * CONTEXT: + * Does GFP_KERNEL allocation. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int pci_add_delete_id(struct pci_driver *drv, + unsigned int vendor, unsigned int device, + unsigned int subvendor, unsigned int subdevice, + unsigned int class, unsigned int class_mask) +{ + struct pci_dynid *del
[PATCH v3 0/2] PCI/ERR: Allow Native AER/DPC using _OSC
Hi Bjorn & Kuppuswamy, I see a problem in the DPC ECN [1] to _OSC in that it doesn't give us a way to determine if firmware supports _OSC DPC negotation, and therefore how to handle DPC. Here is the wording of the ECN that implies that Firmware without _OSC DPC negotiation support should have the OSPM rely on _OSC AER negotiation when determining DPC control: PCIe Base Specification suggests that Downstream Port Containment may be controlled either by the Firmware or the Operating System. It also suggests that the Firmware retain ownership of Downstream Port Containment if it also owns AER. When the Firmware owns Downstream Port Containment, it is expected to use the new "Error Disconnect Recover" notification to alert OSPM of a Downstream Port Containment event. In legacy platforms, as bits in _OSC are reserved prior to implementation, ACPI Root Bus enumeration will mark these Host Bridges as without Native DPC support, even though the specification implies it's expected that AER _OSC negotiation determines DPC control for these platforms. There seems to be a need for a way to determine if the DPC control bit in _OSC is supported and fallback on AER otherwise. Currently portdrv assumes DPC control if the port has Native AER services: static int get_port_device_capability(struct pci_dev *dev) ... if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DPC) && pci_aer_available() && (pcie_ports_dpc_native || (services & PCIE_PORT_SERVICE_AER))) services |= PCIE_PORT_SERVICE_DPC; Newer firmware may not grant OSPM DPC control, if for instance, it expects to use Error Disconnect Recovery. However it looks like ACPI will use DPC services via the EDR driver, without binding the full DPC port service driver. If we change portdrv to probe based on host->native_dpc and not AER, then we break instances with legacy firmware where OSPM will clear host->native_dpc solely due to _OSC bits being reserved: struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root, ... if (!(root->osc_control_set & OSC_PCI_EXPRESS_DPC_CONTROL)) host_bridge->native_dpc = 0; So my assumption instead is that host->native_dpc can be 0 and expect Native DPC services if AER is used. In other words, if and only if DPC probe is invoked from portdrv, then it needs to rely on the AER dependency. Otherwise it should be assumed that ACPI set up DPC via EDR. This covers legacy firmware. However it seems like that could be trouble with newer firmware that might give OSPM control of AER but not DPC, and would result in both Native DPC and EDR being in effect. Anyways here are two patches that give control of AER and DPC on the results of _OSC. They don't mess with the HEST parser as I expect those to be removed at some point. I need these for VMD support which doesn't even rely on _OSC, but I suspect this won't be the last effort as we detangle Firmware First. [1] https://members.pcisig.com/wg/PCI-SIG/document/12888 Jon Derrick (2): PCI/AER: Use _OSC to determine Firmware First before HEST PCI/DPC: Use _OSC to determine DPC support drivers/pci/pcie/aer.c | 3 +++ drivers/pci/pcie/dpc.c | 3 --- drivers/pci/pcie/portdrv_core.c | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) -- 1.8.3.1
[PATCH v3 2/2] PCI/DPC: Use _OSC to determine DPC support
After a5bf8719af: "PCI/AER: Use only _OSC to determine AER ownership", _OSC is the primary determiner of ownership of Firmware First error handling rather than HEST. With the addition of DPC to _OSC [1], OSPM is able to negotiate DPC services from Firmware. ACPI Root Bus enumeration sets the Host Bridge's Native DPC flag on negotiation of those service. This patch changes DPC probing to check DPC control as determined by _OSC, by checking the Host Bridge's Native DPC member. As most older platforms won't have DPC negotiable by _OSC, this patch doesn't attempt to change behavior that assumes if OSPM has negotiated AER by _OSC, OSPM will also want DPC control. [1] https://members.pcisig.com/wg/PCI-SIG/document/12888 Signed-off-by: Jon Derrick --- drivers/pci/pcie/dpc.c | 3 --- drivers/pci/pcie/portdrv_core.c | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 7621704..9104929 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -284,9 +284,6 @@ static int dpc_probe(struct pcie_device *dev) int status; u16 ctl, cap; - if (pcie_aer_get_firmware_first(pdev) && !pcie_ports_dpc_native) - return -ENOTSUPP; - status = devm_request_threaded_irq(device, dev->irq, dpc_irq, dpc_handler, IRQF_SHARED, "pcie-dpc", pdev); diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 50a9522..f2139a1 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -256,7 +256,8 @@ static int get_port_device_capability(struct pci_dev *dev) */ if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DPC) && pci_aer_available() && - (pcie_ports_dpc_native || (services & PCIE_PORT_SERVICE_AER))) + (pcie_ports_dpc_native || host->native_dpc || +(services & PCIE_PORT_SERVICE_AER))) services |= PCIE_PORT_SERVICE_DPC; if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM || -- 1.8.3.1
[PATCH v3 1/2] PCI/AER: Use _OSC to determine Firmware First before HEST
After a5bf8719af: "PCI/AER: Use only _OSC to determine AER ownership", _OSC is the primary determiner of ownership of Firmware First error handling rather than HEST. ACPI Root Bus enumeration has been modified to flag Host Bridge devices as using Native AER when _OSC has been negotiated for AER services. This patch ensures the PCI layers first uses the _OSC negotiated state by checking the Host Bridge's Native AER flag prior to HEST parsing. Signed-off-by: Jon Derrick --- drivers/pci/pcie/aer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index efc2677..f3d02f4 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -314,6 +314,9 @@ int pcie_aer_get_firmware_first(struct pci_dev *dev) if (pcie_ports_native) return 0; + if (pci_find_host_bridge(dev->bus)->native_aer) + return 0; + if (!dev->__aer_firmware_first_valid) aer_set_firmware_first(dev); return dev->__aer_firmware_first; -- 1.8.3.1
[PATCH] irqdomain/treewide: Free firmware node after domain removal
Change 711419e504eb ("irqdomain: Add the missing assignment of domain->fwnode for named fwnode") unintentionally caused a dangling pointer page fault issue on firmware nodes that were freed after IRQ domain allocation. Change e3beca48a45b fixed that dangling pointer issue by only freeing the firmware node after an IRQ domain allocation failure. That fix no longer frees the firmware node immediately, but leaves the firmware node allocated after the domain is removed. We need to keep the firmware node through irq_domain_remove, but should free it afterwards. This patch saves the handle and adds the freeing of firmware node after domain removal where appropriate. Fixes: e3beca48a45b ("irqdomain/treewide: Keep firmware node unconditionally allocated") Reviewed-by: Andy Shevchenko Signed-off-by: Jon Derrick Cc: sta...@vger.kernel.org --- arch/mips/pci/pci-xtalk-bridge.c| 3 +++ arch/x86/kernel/apic/io_apic.c | 5 + drivers/iommu/intel/irq_remapping.c | 8 drivers/mfd/ioc3.c | 6 ++ drivers/pci/controller/vmd.c| 3 +++ 5 files changed, 25 insertions(+) diff --git a/arch/mips/pci/pci-xtalk-bridge.c b/arch/mips/pci/pci-xtalk-bridge.c index 5958217..9b3cc77 100644 --- a/arch/mips/pci/pci-xtalk-bridge.c +++ b/arch/mips/pci/pci-xtalk-bridge.c @@ -728,6 +728,7 @@ static int bridge_probe(struct platform_device *pdev) pci_free_resource_list(>windows); err_remove_domain: irq_domain_remove(domain); + irq_domain_free_fwnode(fn); return err; } @@ -735,8 +736,10 @@ static int bridge_remove(struct platform_device *pdev) { struct pci_bus *bus = platform_get_drvdata(pdev); struct bridge_controller *bc = BRIDGE_CONTROLLER(bus); + struct fwnode_handle *fn = bc->domain->fwnode; irq_domain_remove(bc->domain); + irq_domain_free_fwnode(fn); pci_lock_rescan_remove(); pci_stop_root_bus(bus); pci_remove_root_bus(bus); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 81ffcfb..21325a4a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2335,8 +2335,13 @@ static int mp_irqdomain_create(int ioapic) static void ioapic_destroy_irqdomain(int idx) { + struct ioapic_domain_cfg *cfg = [idx].irqdomain_cfg; + struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode; + if (ioapics[idx].irqdomain) { irq_domain_remove(ioapics[idx].irqdomain); + if (!cfg->dev) + irq_domain_free_fwnode(fn); ioapics[idx].irqdomain = NULL; } } diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 9564d23..aa096b3 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -628,13 +628,21 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) static void intel_teardown_irq_remapping(struct intel_iommu *iommu) { + struct fwnode_handle *fn; + if (iommu && iommu->ir_table) { if (iommu->ir_msi_domain) { + fn = iommu->ir_msi_domain->fwnode; + irq_domain_remove(iommu->ir_msi_domain); + irq_domain_free_fwnode(fn); iommu->ir_msi_domain = NULL; } if (iommu->ir_domain) { + fn = iommu->ir_domain->fwnode; + irq_domain_remove(iommu->ir_domain); + irq_domain_free_fwnode(fn); iommu->ir_domain = NULL; } free_pages((unsigned long)iommu->ir_table->base, diff --git a/drivers/mfd/ioc3.c b/drivers/mfd/ioc3.c index 74cee7c..d939ccc 100644 --- a/drivers/mfd/ioc3.c +++ b/drivers/mfd/ioc3.c @@ -616,7 +616,10 @@ static int ioc3_mfd_probe(struct pci_dev *pdev, /* Remove all already added MFD devices */ mfd_remove_devices(>pdev->dev); if (ipd->domain) { + struct fwnode_handle *fn = ipd->domain->fwnode; + irq_domain_remove(ipd->domain); + irq_domain_free_fwnode(fn); free_irq(ipd->domain_irq, (void *)ipd); } pci_iounmap(pdev, regs); @@ -643,7 +646,10 @@ static void ioc3_mfd_remove(struct pci_dev *pdev) /* Release resources */ mfd_remove_devices(>pdev->dev); if (ipd->domain) { + struct fwnode_handle *fn = ipd->domain->fwnode; + irq_domain_remove(ipd->domain); + irq_domain_free_fwnode(fn); free_irq(ipd->domain_irq, (void *)ipd); } pci_iounmap(pdev, ipd->regs); diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller
[PATCH 1/6] PCI: vmd: Create physical offset helper
Moves the guest-passthrough physical offset discovery code to a new helper. No functional changes. Reviewed-by: Andy Shevchenko Signed-off-by: Jon Derrick --- drivers/pci/controller/vmd.c | 105 +-- 1 file changed, 62 insertions(+), 43 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index f69ef8c89f72..44b2db789eff 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -417,6 +417,60 @@ static int vmd_find_free_domain(void) return domain + 1; } +static int vmd_get_phys_offsets(struct vmd_dev *vmd, bool native_hint, + resource_size_t *offset1, + resource_size_t *offset2) +{ + struct pci_dev *dev = vmd->dev; + u64 phys1, phys2; + + if (native_hint) { + u32 vmlock; + int ret; + + ret = pci_read_config_dword(dev, PCI_REG_VMLOCK, ); + if (ret || vmlock == ~0) + return -ENODEV; + + if (MB2_SHADOW_EN(vmlock)) { + void __iomem *membar2; + + membar2 = pci_iomap(dev, VMD_MEMBAR2, 0); + if (!membar2) + return -ENOMEM; + phys1 = readq(membar2 + MB2_SHADOW_OFFSET); + phys2 = readq(membar2 + MB2_SHADOW_OFFSET + 8); + pci_iounmap(dev, membar2); + } else + return 0; + } else { + /* Hypervisor-Emulated Vendor-Specific Capability */ + int pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); + u32 reg, regu; + + pci_read_config_dword(dev, pos + 4, ); + + /* "SHDW" */ + if (pos && reg == 0x53484457) { + pci_read_config_dword(dev, pos + 8, ); + pci_read_config_dword(dev, pos + 12, ); + phys1 = (u64) regu << 32 | reg; + + pci_read_config_dword(dev, pos + 16, ); + pci_read_config_dword(dev, pos + 20, ); + phys2 = (u64) regu << 32 | reg; + } else + return 0; + } + + *offset1 = dev->resource[VMD_MEMBAR1].start - + (phys1 & PCI_BASE_ADDRESS_MEM_MASK); + *offset2 = dev->resource[VMD_MEMBAR2].start - + (phys2 & PCI_BASE_ADDRESS_MEM_MASK); + + return 0; +} + static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) { struct pci_sysdata *sd = >sysdata; @@ -428,6 +482,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) resource_size_t offset[2] = {0}; resource_size_t membar2_offset = 0x2000; struct pci_bus *child; + int ret; /* * Shadow registers may exist in certain VMD device ids which allow @@ -436,50 +491,14 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) * or 0, depending on an enable bit in the VMD device. */ if (features & VMD_FEAT_HAS_MEMBAR_SHADOW) { - u32 vmlock; - int ret; - membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE; - ret = pci_read_config_dword(vmd->dev, PCI_REG_VMLOCK, ); - if (ret || vmlock == ~0) - return -ENODEV; - - if (MB2_SHADOW_EN(vmlock)) { - void __iomem *membar2; - - membar2 = pci_iomap(vmd->dev, VMD_MEMBAR2, 0); - if (!membar2) - return -ENOMEM; - offset[0] = vmd->dev->resource[VMD_MEMBAR1].start - - (readq(membar2 + MB2_SHADOW_OFFSET) & -PCI_BASE_ADDRESS_MEM_MASK); - offset[1] = vmd->dev->resource[VMD_MEMBAR2].start - - (readq(membar2 + MB2_SHADOW_OFFSET + 8) & -PCI_BASE_ADDRESS_MEM_MASK); - pci_iounmap(vmd->dev, membar2); - } - } - - if (features & VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP) { - int pos = pci_find_capability(vmd->dev, PCI_CAP_ID_VNDR); - u32 reg, regu; - - pci_read_config_dword(vmd->dev, pos + 4, ); - - /* "SHDW" */ - if (pos && reg == 0x53484457) { - pci_read_config_dword(vmd->dev, pos + 8, ); - pci_read_config_dword(vmd->dev, pos + 12, ); - offset[0] = vmd->dev->resource[VMD_MEMBAR1].start - -
[PATCH 5/6] x86/apic/msi: Use Real PCI DMA device when configuring IRTE
VMD retransmits child device MSI/X with the VMD endpoint's requester-id. In order to support direct interrupt remapping of VMD child devices, ensure that the IRTE is programmed with the VMD endpoint's requester-id using pci_real_dma_dev(). Reviewed-by: Andy Shevchenko Signed-off-by: Jon Derrick --- arch/x86/kernel/apic/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index c2b2911feeef..7ca271b8d891 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -189,7 +189,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) init_irq_alloc_info(, NULL); info.type = X86_IRQ_ALLOC_TYPE_MSI; - info.msi_dev = dev; + info.msi_dev = pci_real_dma_dev(dev); domain = irq_remapping_get_irq_domain(); if (domain == NULL) -- 2.27.0
[PATCH 0/6] VMD MSI Remapping Bypass
The Intel Volume Management Device acts similar to a PCI-to-PCI bridge in that it changes downstream devices' requester-ids to its own. As VMD supports PCIe devices, it has its own MSI/X table and transmits child device MSI/X by remapping child device MSI/X and handling like a demultiplexer. Some newer VMD devices (Icelake Server and client) have an option to bypass the VMD MSI/X remapping table. This allows for better performance scaling as the child device MSI/X won't be limited by VMD's MSI/X count and IRQ handler. It's expected that most users don't want MSI/X remapping when they can get better performance without this limitation. This set includes some long overdue cleanup of overgrown VMD code and introduces the MSI/X remapping disable. Applies on top of e3beca48a45b ("irqdomain/treewide: Keep firmware node unconditionally allocated") and ec0160891e38 ("irqdomain/treewide: Free firmware node after domain removal") in tip/urgent Jon Derrick (6): PCI: vmd: Create physical offset helper PCI: vmd: Create bus offset configuration helper PCI: vmd: Create IRQ Domain configuration helper PCI: vmd: Create IRQ allocation helper x86/apic/msi: Use Real PCI DMA device when configuring IRTE PCI: vmd: Disable MSI/X remapping when possible arch/x86/kernel/apic/msi.c | 2 +- drivers/pci/controller/vmd.c | 344 +++ 2 files changed, 224 insertions(+), 122 deletions(-) base-commit: ec0160891e387f4771f953b888b1fe951398e5d9 -- 2.27.0
[PATCH 6/6] PCI: vmd: Disable MSI/X remapping when possible
VMD will retransmit child device MSI/X using its own MSI/X table and requester-id. This limits the number of MSI/X available to the whole child device domain to the number of VMD MSI/X interrupts. Some VMD devices have a mode where this remapping can be disabled, allowing child device interrupts to bypass processing with the VMD MSI/X domain interrupt handler and going straight the child device interrupt handler, allowing for better performance and scaling. The requester-id still gets changed to the VMD endpoint's requester-id, and the interrupt remapping handlers have been updated to properly set IRTE for child device interrupts to the VMD endpoint's context. Some VMD platforms have existing production BIOS which rely on MSI/X remapping and won't explicitly program the MSI/X remapping bit. This re-enables MSI/X remapping on unload. Disabling MSI/X remapping is only available for Icelake Server and client VMD products. Reviewed-by: Andy Shevchenko Signed-off-by: Jon Derrick --- drivers/pci/controller/vmd.c | 58 +++- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 3214d785fa5d..e8cde2c390b9 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -53,6 +53,12 @@ enum vmd_features { * vendor-specific capability space */ VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP= (1 << 2), + + /* +* Device remaps MSI/X transactions into its MSI/X table and requires +* VMD MSI domain for child device interrupt handling +*/ + VMD_FEAT_REMAPS_MSI = (1 << 3), }; /* @@ -298,6 +304,15 @@ static struct msi_domain_info vmd_msi_domain_info = { .chip = _msi_controller, }; +static void vmd_enable_msi_remapping(struct vmd_dev *vmd, bool enable) +{ + u16 reg; + + pci_read_config_word(vmd->dev, PCI_REG_VMCONFIG, ); + reg = enable ? (reg & ~0x2) : (reg | 0x2); + pci_write_config_word(vmd->dev, PCI_REG_VMCONFIG, reg); +} + static int vmd_create_irq_domain(struct vmd_dev *vmd) { struct fwnode_handle *fn; @@ -318,6 +333,13 @@ static int vmd_create_irq_domain(struct vmd_dev *vmd) static void vmd_remove_irq_domain(struct vmd_dev *vmd) { + /* +* Some production BIOS won't enable remapping between soft reboots. +* Ensure remapping is restored before unloading the driver +*/ + if (!vmd->msix_count) + vmd_enable_msi_remapping(vmd, true); + if (vmd->irq_domain) { struct fwnode_handle *fn = vmd->irq_domain->fwnode; @@ -606,6 +628,27 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) return ret; } + /* +* Currently MSI remapping must be enabled in guest passthrough mode +* due to some missing interrupt remapping plumbing. This is probably +* acceptable because the guest is usually CPU-limited and MSI +* remapping doesn't become a performance bottleneck. +*/ + if (features & VMD_FEAT_REMAPS_MSI || offset[0] || offset[1]) { + ret = vmd_alloc_irqs(vmd); + if (ret) + return ret; + } + + /* +* Disable remapping for performance if possible based on if VMD IRQs +* had been allocated. +*/ + if (vmd->msix_count) + vmd_enable_msi_remapping(vmd, true); + else + vmd_enable_msi_remapping(vmd, false); + /* * Certain VMD devices may have a root port configuration option which * limits the bus range to between 0-127, 128-255, or 224-255 @@ -674,9 +717,11 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) sd->node = pcibus_to_node(vmd->dev->bus); - ret = vmd_create_irq_domain(vmd); - if (ret) - return ret; + if (vmd->msix_count) { + ret = vmd_create_irq_domain(vmd); + if (ret) + return ret; + } pci_add_resource(, >resources[0]); pci_add_resource_offset(, >resources[1], offset[0]); @@ -738,10 +783,6 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) dma_set_mask_and_coherent(>dev, DMA_BIT_MASK(32))) return -ENODEV; - err = vmd_alloc_irqs(vmd); - if (err) - return err; - spin_lock_init(>cfg_lock); pci_set_drvdata(dev, vmd); err = vmd_enable_domain(vmd, (unsigned long) id->driver_data); @@ -809,7 +850,8 @@ static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops, vmd_suspend, vmd_resume); static const struct pci_device_id vmd_ids[] = { {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_201D), - .driver_dat
[PATCH 2/6] PCI: vmd: Create bus offset configuration helper
Moves the bus offset configuration discovery code to a new helper. Modifies the bus offset 2-bit decode switch to have a 0 case and a default error case, just in case the field is expanded in future hardware. Reviewed-by: Andy Shevchenko Signed-off-by: Jon Derrick --- drivers/pci/controller/vmd.c | 53 ++-- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 44b2db789eff..a462719af12a 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -471,6 +471,35 @@ static int vmd_get_phys_offsets(struct vmd_dev *vmd, bool native_hint, return 0; } +static int vmd_get_bus_number_start(struct vmd_dev *vmd) +{ + struct pci_dev *dev = vmd->dev; + u16 reg; + + pci_read_config_word(dev, PCI_REG_VMCAP, ); + if (BUS_RESTRICT_CAP(reg)) { + pci_read_config_word(dev, PCI_REG_VMCONFIG, ); + + switch (BUS_RESTRICT_CFG(reg)) { + case 0: + vmd->busn_start = 0; + break; + case 1: + vmd->busn_start = 128; + break; + case 2: + vmd->busn_start = 224; + break; + default: + pci_err(dev, "Unknown Bus Offset Setting (%d)\n", + BUS_RESTRICT_CFG(reg)); + return -ENODEV; + } + } + + return 0; +} + static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) { struct pci_sysdata *sd = >sysdata; @@ -506,27 +535,9 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) * limits the bus range to between 0-127, 128-255, or 224-255 */ if (features & VMD_FEAT_HAS_BUS_RESTRICTIONS) { - u16 reg16; - - pci_read_config_word(vmd->dev, PCI_REG_VMCAP, ); - if (BUS_RESTRICT_CAP(reg16)) { - pci_read_config_word(vmd->dev, PCI_REG_VMCONFIG, -); - - switch (BUS_RESTRICT_CFG(reg16)) { - case 1: - vmd->busn_start = 128; - break; - case 2: - vmd->busn_start = 224; - break; - case 3: - pci_err(vmd->dev, "Unknown Bus Offset Setting\n"); - return -ENODEV; - default: - break; - } - } + ret = vmd_get_bus_number_start(vmd); + if (ret) + return ret; } res = >dev->resource[VMD_CFGBAR]; -- 2.27.0
[PATCH 4/6] PCI: vmd: Create IRQ allocation helper
Moves the IRQ allocation and SRCU initialization code to a new helper. No functional changes. Reviewed-by: Andy Shevchenko Signed-off-by: Jon Derrick --- drivers/pci/controller/vmd.c | 94 1 file changed, 53 insertions(+), 41 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 703c48171993..3214d785fa5d 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -528,6 +528,55 @@ static int vmd_get_bus_number_start(struct vmd_dev *vmd) return 0; } +static irqreturn_t vmd_irq(int irq, void *data) +{ + struct vmd_irq_list *irqs = data; + struct vmd_irq *vmdirq; + int idx; + + idx = srcu_read_lock(>srcu); + list_for_each_entry_rcu(vmdirq, >irq_list, node) + generic_handle_irq(vmdirq->virq); + srcu_read_unlock(>srcu, idx); + + return IRQ_HANDLED; +} + +static int vmd_alloc_irqs(struct vmd_dev *vmd) +{ + struct pci_dev *dev = vmd->dev; + int i, err; + + vmd->msix_count = pci_msix_vec_count(dev); + if (vmd->msix_count < 0) + return -ENODEV; + + vmd->msix_count = pci_alloc_irq_vectors(dev, 1, vmd->msix_count, + PCI_IRQ_MSIX); + if (vmd->msix_count < 0) + return vmd->msix_count; + + vmd->irqs = devm_kcalloc(>dev, vmd->msix_count, sizeof(*vmd->irqs), +GFP_KERNEL); + if (!vmd->irqs) + return -ENOMEM; + + for (i = 0; i < vmd->msix_count; i++) { + err = init_srcu_struct(>irqs[i].srcu); + if (err) + return err; + + INIT_LIST_HEAD(>irqs[i].irq_list); + err = devm_request_irq(>dev, pci_irq_vector(dev, i), + vmd_irq, IRQF_NO_THREAD, + "vmd", >irqs[i]); + if (err) + return err; + } + + return 0; +} + static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) { struct pci_sysdata *sd = >sysdata; @@ -663,24 +712,10 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) return 0; } -static irqreturn_t vmd_irq(int irq, void *data) -{ - struct vmd_irq_list *irqs = data; - struct vmd_irq *vmdirq; - int idx; - - idx = srcu_read_lock(>srcu); - list_for_each_entry_rcu(vmdirq, >irq_list, node) - generic_handle_irq(vmdirq->virq); - srcu_read_unlock(>srcu, idx); - - return IRQ_HANDLED; -} - static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) { struct vmd_dev *vmd; - int i, err; + int err; if (resource_size(>resource[VMD_CFGBAR]) < (1 << 20)) return -ENOMEM; @@ -703,32 +738,9 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) dma_set_mask_and_coherent(>dev, DMA_BIT_MASK(32))) return -ENODEV; - vmd->msix_count = pci_msix_vec_count(dev); - if (vmd->msix_count < 0) - return -ENODEV; - - vmd->msix_count = pci_alloc_irq_vectors(dev, 1, vmd->msix_count, - PCI_IRQ_MSIX); - if (vmd->msix_count < 0) - return vmd->msix_count; - - vmd->irqs = devm_kcalloc(>dev, vmd->msix_count, sizeof(*vmd->irqs), -GFP_KERNEL); - if (!vmd->irqs) - return -ENOMEM; - - for (i = 0; i < vmd->msix_count; i++) { - err = init_srcu_struct(>irqs[i].srcu); - if (err) - return err; - - INIT_LIST_HEAD(>irqs[i].irq_list); - err = devm_request_irq(>dev, pci_irq_vector(dev, i), - vmd_irq, IRQF_NO_THREAD, - "vmd", >irqs[i]); - if (err) - return err; - } + err = vmd_alloc_irqs(vmd); + if (err) + return err; spin_lock_init(>cfg_lock); pci_set_drvdata(dev, vmd); -- 2.27.0
[PATCH 3/6] PCI: vmd: Create IRQ Domain configuration helper
Moves the IRQ and MSI Domain configuration code to new helpers. No functional changes. Reviewed-by: Andy Shevchenko Signed-off-by: Jon Derrick --- drivers/pci/controller/vmd.c | 52 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index a462719af12a..703c48171993 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -298,6 +298,34 @@ static struct msi_domain_info vmd_msi_domain_info = { .chip = _msi_controller, }; +static int vmd_create_irq_domain(struct vmd_dev *vmd) +{ + struct fwnode_handle *fn; + + fn = irq_domain_alloc_named_id_fwnode("VMD-MSI", vmd->sysdata.domain); + if (!fn) + return -ENODEV; + + vmd->irq_domain = pci_msi_create_irq_domain(fn, _msi_domain_info, + x86_vector_domain); + if (!vmd->irq_domain) { + irq_domain_free_fwnode(fn); + return -ENODEV; + } + + return 0; +} + +static void vmd_remove_irq_domain(struct vmd_dev *vmd) +{ + if (vmd->irq_domain) { + struct fwnode_handle *fn = vmd->irq_domain->fwnode; + + irq_domain_remove(vmd->irq_domain); + irq_domain_free_fwnode(fn); + } +} + static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus, unsigned int devfn, int reg, int len) { @@ -503,7 +531,6 @@ static int vmd_get_bus_number_start(struct vmd_dev *vmd) static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) { struct pci_sysdata *sd = >sysdata; - struct fwnode_handle *fn; struct resource *res; u32 upper_bits; unsigned long flags; @@ -598,16 +625,9 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) sd->node = pcibus_to_node(vmd->dev->bus); - fn = irq_domain_alloc_named_id_fwnode("VMD-MSI", vmd->sysdata.domain); - if (!fn) - return -ENODEV; - - vmd->irq_domain = pci_msi_create_irq_domain(fn, _msi_domain_info, - x86_vector_domain); - if (!vmd->irq_domain) { - irq_domain_free_fwnode(fn); - return -ENODEV; - } + ret = vmd_create_irq_domain(vmd); + if (ret) + return ret; pci_add_resource(, >resources[0]); pci_add_resource_offset(, >resources[1], offset[0]); @@ -617,13 +637,13 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) _ops, sd, ); if (!vmd->bus) { pci_free_resource_list(); - irq_domain_remove(vmd->irq_domain); - irq_domain_free_fwnode(fn); + vmd_remove_irq_domain(vmd); return -ENODEV; } vmd_attach_resources(vmd); - dev_set_msi_domain(>bus->dev, vmd->irq_domain); + if (vmd->irq_domain) + dev_set_msi_domain(>bus->dev, vmd->irq_domain); pci_scan_child_bus(vmd->bus); pci_assign_unassigned_bus_resources(vmd->bus); @@ -732,15 +752,13 @@ static void vmd_cleanup_srcu(struct vmd_dev *vmd) static void vmd_remove(struct pci_dev *dev) { struct vmd_dev *vmd = pci_get_drvdata(dev); - struct fwnode_handle *fn = vmd->irq_domain->fwnode; sysfs_remove_link(>dev->dev.kobj, "domain"); pci_stop_root_bus(vmd->bus); pci_remove_root_bus(vmd->bus); vmd_cleanup_srcu(vmd); vmd_detach_resources(vmd); - irq_domain_remove(vmd->irq_domain); - irq_domain_free_fwnode(fn); + vmd_remove_irq_domain(vmd); } #ifdef CONFIG_PM_SLEEP -- 2.27.0
[PATCH] ext4: Check superblock mapped prior to committing
This patch attempts to close a hole leading to a BUG seen with hot removals during writes [1]. A block device (NVME namespace in this test case) is formatted to EXT4 without partitions. It's mounted and write I/O is run to a file, then the device is hot removed from the slot. The superblock attempts to be written to the drive which is no longer present. The typical chain of events leading to the BUG: ext4_commit_super() __sync_dirty_buffer() submit_bh() submit_bh_wbc() BUG_ON(!buffer_mapped(bh)); This fix checks for the superblock's buffer head being mapped prior to syncing. [1] https://www.spinics.net/lists/linux-ext4/msg56527.html Signed-off-by: Jon Derrick --- fs/ext4/super.c | 8 1 file changed, 8 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0c4c220..ee33233 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4736,6 +4736,14 @@ static int ext4_commit_super(struct super_block *sb, int sync) if (!sbh || block_device_ejected(sb)) return error; + + /* +* The superblock bh should be mapped, but it might not be if the +* device was hot-removed. Not much we can do but fail the I/O. +*/ + if (!buffer_mapped(sbh)) + return error; + /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock -- 2.7.4
[PATCH v2] PCI: Equalize hotplug memory for non/occupied slots
Currently, a hotplug bridge will be given hpmemsize additional memory if available, in order to satisfy any future hotplug allocation requirements. These calculations don't consider the current memory size of the hotplug bridge/slot, so hotplug bridges/slots which have downstream devices will get their current allocation in addition to the hpmemsize value. This makes for possibly undesirable results with a mix of unoccupied and occupied slots (ex, with hpmemsize=2M): 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d64f [size=3M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d650-d66f [size=2M] This change considers the current allocation size when using the hpmemsize parameter to make the reservations predictable for the mix of unoccupied and occupied slots: 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d63f [size=2M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d640-d65f [size=2M] The calculation for IO (hpiosize) should be similar, but platform firmwares I've encountered (including QEMU) provide strict allocations for IO and would not provide free IO resources for hotplug buses in order to prove this calculation. Signed-off-by: Jon Derrick --- drivers/pci/setup-bus.c | 21 ++--- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 79b1824..70d0aba 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -831,7 +831,8 @@ static resource_size_t calculate_iosize(resource_size_t size, static resource_size_t calculate_memsize(resource_size_t size, resource_size_t min_size, - resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -841,7 +842,15 @@ static resource_size_t calculate_memsize(resource_size_t size, old_size = 0; if (size < old_size) size = old_size; - size = ALIGN(size + size1, align); + + /* +* Consider the current allocation size when adding size for extra +* hotplug memory. This ensures that occupied slots don't receive +* unneccessary memory allocations in addition to their current size. +* The calculation should be similar for calculate_iosize, but was +* unable to be tested. +*/ + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } @@ -1079,12 +1088,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, min_align = calculate_mem_align(aligns, max_order); min_align = max(min_align, window_alignment(bus, b_res->flags)); - size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align); + size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), min_align); add_align = max(min_align, add_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_memsize(size, min_size, add_size, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_memsize(size, min_size, add_size, children_add_size, resource_size(b_res), add_align); if (!size0 && !size1) { if (b_res->start || b_res->end) -- 1.8.3.1
[PATCH v2] PCI hotplug Eq v2
Hi Bjorn, Sorry for the delay on this one and pushing it after RC1. Feel free to queue it up for 4.20 if it looks fine. I've added comments to the git log and source explaining why calculate_iosize was left unchanged. Basically I could not synthesize a condition where it would have affected the topology. v1->v2: Comments Jon Derrick (1): PCI: Equalize hotplug memory for non/occupied slots drivers/pci/setup-bus.c | 21 ++--- 1 file changed, 14 insertions(+), 7 deletions(-) -- 1.8.3.1
[PATCH] PCI/AER: Fix an AER enabling/disabling race
There is a sequence with non-ACPI root ports where the AER driver can enable error reporting on the tree before port drivers have bound to ports on the tree. The port driver assumes the AER driver will set up error reporting afterwards, so instead add a check if error reporting was set up first. Example: [ 343.790573] pcieport 1:00:00.0: pci_disable_pcie_error_reporting [ 343.809812] pcieport 1:00:00.0: pci_enable_pcie_error_reporting [ 343.819506] pci 1:01:00.0: pci_enable_pcie_error_reporting [ 343.828814] pci 1:02:00.0: pci_enable_pcie_error_reporting [ 343.838089] pci 1:02:01.0: pci_enable_pcie_error_reporting [ 343.847478] pci 1:02:02.0: pci_enable_pcie_error_reporting [ 343.856659] pci 1:02:03.0: pci_enable_pcie_error_reporting [ 343.865794] pci 1:02:04.0: pci_enable_pcie_error_reporting [ 343.874875] pci 1:02:05.0: pci_enable_pcie_error_reporting [ 343.883918] pci 1:02:06.0: pci_enable_pcie_error_reporting [ 343.892922] pci 1:02:07.0: pci_enable_pcie_error_reporting [ 343.918900] pcieport 1:01:00.0: pci_disable_pcie_error_reporting [ 343.968426] pcieport 1:02:00.0: pci_disable_pcie_error_reporting [ 344.028179] pcieport 1:02:01.0: pci_disable_pcie_error_reporting [ 344.091269] pcieport 1:02:02.0: pci_disable_pcie_error_reporting [ 344.156473] pcieport 1:02:03.0: pci_disable_pcie_error_reporting [ 344.238042] pcieport 1:02:04.0: pci_disable_pcie_error_reporting [ 344.321864] pcieport 1:02:05.0: pci_disable_pcie_error_reporting [ 344.411601] pcieport 1:02:06.0: pci_disable_pcie_error_reporting [ 344.505332] pcieport 1:02:07.0: pci_disable_pcie_error_reporting [ 344.621824] nvme 1:06:00.0: pci_enable_pcie_error_reporting Signed-off-by: Jon Derrick --- drivers/pci/pcie/aer.c | 1 + drivers/pci/pcie/portdrv_core.c | 5 - include/linux/pci.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 83180ed..a4e36b6 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1333,6 +1333,7 @@ static int set_device_error_reporting(struct pci_dev *dev, void *data) if (enable) pcie_set_ecrc_checking(dev); + dev->aer_configured = 1; return 0; } diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 7c37d81..f5de554 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -224,8 +224,11 @@ static int get_port_device_capability(struct pci_dev *dev) /* * Disable AER on this port in case it's been enabled by the * BIOS (the AER service driver will enable it when necessary). +* Don't disable it if the AER service driver has already +* enabled it from the root port bus walking */ - pci_disable_pcie_error_reporting(dev); + if (!dev->aer_configured) + pci_disable_pcie_error_reporting(dev); } #endif diff --git a/include/linux/pci.h b/include/linux/pci.h index e72ca8d..c071622 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -402,6 +402,7 @@ struct pci_dev { unsigned inthas_secondary_link:1; unsigned intnon_compliant_bars:1; /* Broken BARs; ignore them */ unsigned intis_probed:1;/* Device probing in progress */ + unsigned intaer_configured:1; /* AER configured for device */ pci_dev_flags_t dev_flags; atomic_tenable_cnt; /* pci_enable_device has been called */ -- 1.8.3.1
[PATCH] nvme-pci: Prevent mmio reads if pci channel offline
Some platforms don't seem to easily tolerate non-posted mmio reads on lost (hot removed) devices. This has been noted in previous modifications to other layers where an mmio read to a lost device could cause an undesired firmware intervention [1][2]. This patch reworks the nvme-pci reads to quickly check connectivity prior to reading the BAR. The intent is to prevent a non-posted mmio read which would definitely result in an error condition of some sort. There is, of course, a chance the link becomes disconnected between the check and the read. Like other similar checks, it is only intended to reduce the likelihood of encountering these issues and not fully close the gap. mmio writes are posted and in the fast path and have been left as-is. [1] https://lkml.org/lkml/2018/7/30/858 [2] https://lkml.org/lkml/2018/9/18/1546 Signed-off-by: Jon Derrick --- drivers/nvme/host/pci.c | 75 ++--- 1 file changed, 56 insertions(+), 19 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f54718b63637..e555ac2afacd 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1264,6 +1264,33 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) csts, result); } +static int nvme_reg_readl(struct nvme_dev *dev, u32 off, u32 *val) +{ + if (pci_channel_offline(to_pci_dev(dev->dev))) + return -ENODEV; + + *val = readl(dev->bar + off); + return 0; +} + +static int nvme_reg_readq(struct nvme_dev *dev, u32 off, u64 *val) +{ + if (pci_channel_offline(to_pci_dev(dev->dev))) + return -ENODEV; + + *val = readq(dev->bar + off); + return 0; +} + +static int nvme_reg_lo_hi_readq(struct nvme_dev *dev, u32 off, u64 *val) +{ + if (pci_channel_offline(to_pci_dev(dev->dev))) + return -ENODEV; + + *val = lo_hi_readq(dev->bar + off); + return 0; +} + static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -1271,13 +1298,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; struct nvme_command cmd; - u32 csts = readl(dev->bar + NVME_REG_CSTS); + u32 csts; /* If PCI error recovery process is happening, we cannot reset or * the recovery mechanism will surely fail. */ mb(); - if (pci_channel_offline(to_pci_dev(dev->dev))) + if (nvme_reg_readl(dev, NVME_REG_CSTS, )) return BLK_EH_RESET_TIMER; /* @@ -1692,18 +1719,24 @@ static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size) static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) { int result; - u32 aqa; + u32 csts, vs, aqa; struct nvme_queue *nvmeq; result = nvme_remap_bar(dev, db_bar_size(dev, 0)); if (result < 0) return result; - dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ? - NVME_CAP_NSSRC(dev->ctrl.cap) : 0; + result = nvme_reg_readl(dev, NVME_REG_CSTS, ); + if (result) + return result; + + result = nvme_reg_readl(dev, NVME_REG_VS, ); + if (result) + return result; - if (dev->subsystem && - (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) + dev->subsystem = (vs >= NVME_VS(1, 1, 0)) ? + NVME_CAP_NSSRC(dev->ctrl.cap) : 0; + if (dev->subsystem && (csts & NVME_CSTS_NSSRO)) writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); result = nvme_disable_ctrl(>ctrl, dev->ctrl.cap); @@ -1808,10 +1841,11 @@ static void nvme_map_cmb(struct nvme_dev *dev) if (dev->cmb_size) return; - dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); - if (!dev->cmbsz) + if (nvme_reg_readl(dev, NVME_REG_CMBSZ, >cmbsz) || !dev->cmbsz) + return; + + if (nvme_reg_readl(dev, NVME_REG_CMBLOC, >cmbloc)) return; - dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); @@ -2357,6 +2391,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) { int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); + u32 csts; if (pci_enable_device_mem(pdev)) return result; @@ -2367,7 +2402,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) goto disable; - if (readl(dev-
[PATCH] PCI: Equalize hotplug memory for non/occupied slots
Currently, a hotplug bridge will be given hpmemsize additional memory if available, in order to satisfy any future hotplug allocation requirements. These calculations don't consider the current memory size of the hotplug bridge/slot, so hotplug bridges/slots which have downstream devices will get their current allocation in addition to the hpmemsize value. This makes for possibly undesirable results with a mix of unoccupied and occupied slots (ex, with hpmemsize=2M): 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d64f [size=3M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d650-d66f [size=2M] This change considers the current allocation size when using the hpmemsize parameter to make the reservations predictable for the mix of unoccupied and occupied slots: 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d63f [size=2M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d640-d65f [size=2M] Signed-off-by: Jon Derrick --- Original RFC here: https://patchwork.ozlabs.org/patch/945374/ I split this bit out from the RFC while awaiting the pci string handling enhancements to handle per-device settings Changed from RFC is a simpler algo drivers/pci/setup-bus.c | 13 ++--- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 79b1824..5ae39e6 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -831,7 +831,8 @@ static resource_size_t calculate_iosize(resource_size_t size, static resource_size_t calculate_memsize(resource_size_t size, resource_size_t min_size, - resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -841,7 +842,7 @@ static resource_size_t calculate_memsize(resource_size_t size, old_size = 0; if (size < old_size) size = old_size; - size = ALIGN(size + size1, align); + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } @@ -1079,12 +1080,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, min_align = calculate_mem_align(aligns, max_order); min_align = max(min_align, window_alignment(bus, b_res->flags)); - size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align); + size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), min_align); add_align = max(min_align, add_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_memsize(size, min_size, add_size, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_memsize(size, min_size, add_size, children_add_size, resource_size(b_res), add_align); if (!size0 && !size1) { if (b_res->start || b_res->end) -- 1.8.3.1
[PATCH] PCI/portdrv: Enable error reporting on managed ports
During probe, the port driver will disable error reporting and assumes it will be enabled later by the AER driver's pci_walk_bus() sequence. This may not be the case for host-bridge enabled root ports, who will enable first error reporting on the bus during the root port probe, and then disable error reporting on downstream devices during subsequent probing of the bus. A hotplugged port device may also fail to enable error reporting as the AER driver has already run on the root bus. Check for these conditions and enable error reporting during portdrv probing. Example case: [ 343.790573] pcieport 1:00:00.0: pci_disable_pcie_error_reporting [ 343.809812] pcieport 1:00:00.0: pci_enable_pcie_error_reporting [ 343.819506] pci 1:01:00.0: pci_enable_pcie_error_reporting [ 343.828814] pci 1:02:00.0: pci_enable_pcie_error_reporting [ 343.838089] pci 1:02:01.0: pci_enable_pcie_error_reporting [ 343.847478] pci 1:02:02.0: pci_enable_pcie_error_reporting [ 343.856659] pci 1:02:03.0: pci_enable_pcie_error_reporting [ 343.865794] pci 1:02:04.0: pci_enable_pcie_error_reporting [ 343.874875] pci 1:02:05.0: pci_enable_pcie_error_reporting [ 343.883918] pci 1:02:06.0: pci_enable_pcie_error_reporting [ 343.892922] pci 1:02:07.0: pci_enable_pcie_error_reporting [ 343.918900] pcieport 1:01:00.0: pci_disable_pcie_error_reporting [ 343.968426] pcieport 1:02:00.0: pci_disable_pcie_error_reporting [ 344.028179] pcieport 1:02:01.0: pci_disable_pcie_error_reporting [ 344.091269] pcieport 1:02:02.0: pci_disable_pcie_error_reporting [ 344.156473] pcieport 1:02:03.0: pci_disable_pcie_error_reporting [ 344.238042] pcieport 1:02:04.0: pci_disable_pcie_error_reporting [ 344.321864] pcieport 1:02:05.0: pci_disable_pcie_error_reporting [ 344.411601] pcieport 1:02:06.0: pci_disable_pcie_error_reporting [ 344.505332] pcieport 1:02:07.0: pci_disable_pcie_error_reporting [ 344.621824] nvme 1:06:00.0: pci_enable_pcie_error_reporting Signed-off-by: Jon Derrick --- drivers/pci/pcie/portdrv_core.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 7c37d81..fdd953a 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -343,6 +343,16 @@ int pcie_port_device_register(struct pci_dev *dev) if (!nr_service) goto error_cleanup_irqs; +#ifdef CONFIG_PCIEAER + /* +* Enable error reporting for this port in case AER probing has already +* run on the root bus or this port device is hot-inserted +*/ + if (dev->aer_cap && pci_aer_available() && + (pcie_ports_native || pci_find_host_bridge(dev->bus)->native_aer)) + pci_enable_pcie_error_reporting(dev); +#endif + return 0; error_cleanup_irqs: -- 1.8.3.1
[PATCH v3] PCI: Equalize hotplug memory and io for non/occupied slots
Currently, a hotplug bridge will be given hpmemsize additional memory and hpiosize additional io if available, in order to satisfy any future hotplug allocation requirements. These calculations don't consider the current memory/io size of the hotplug bridge/slot, so hotplug bridges/slots which have downstream devices will be allocated their current allocation in addition to the hpmemsize value. This makes for possibly undesirable results with a mix of unoccupied and occupied slots (ex, with hpmemsize=2M): 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d64f [size=3M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d650-d66f [size=2M] This change considers the current allocation size when using the hpmemsize/hpiosize parameters to make the reservations predictable for the mix of unoccupied and occupied slots: 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d63f [size=2M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d640-d65f [size=2M] Signed-off-by: Jon Derrick --- v2->v3: Made the IO and mem size calculations nearly equivalent drivers/pci/setup-bus.c | 28 +++- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 79b1824..ed96043 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -811,6 +811,8 @@ static struct resource *find_free_bus_resource(struct pci_bus *bus, static resource_size_t calculate_iosize(resource_size_t size, resource_size_t min_size, resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -823,15 +825,18 @@ static resource_size_t calculate_iosize(resource_size_t size, #if defined(CONFIG_ISA) || defined(CONFIG_EISA) size = (size & 0xff) + ((size & ~0xffUL) << 2); #endif - size = ALIGN(size + size1, align); + size = size + size1; if (size < old_size) size = old_size; + + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } static resource_size_t calculate_memsize(resource_size_t size, resource_size_t min_size, - resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -841,7 +846,8 @@ static resource_size_t calculate_memsize(resource_size_t size, old_size = 0; if (size < old_size) size = old_size; - size = ALIGN(size + size1, align); + + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } @@ -930,12 +936,10 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, } } - size0 = calculate_iosize(size, min_size, size1, + size0 = calculate_iosize(size, min_size, size1, 0, 0, resource_size(b_res), min_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_iosize(size, min_size, add_size + size1, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_iosize(size, min_size, size1, add_size, children_add_size, resource_size(b_res), min_align); if (!size0 && !size1) { if (b_res->start || b_res->end) @@ -1079,12 +1083,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, min_align = calculate_mem_align(aligns, max_order); min_align = max(min_align, window_alignment(bus, b_res->flags)); - size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align); + size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), min_align); add_align = max(min_align, add_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_memsize(size, min_size, add_size, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_memsize(size, min_size, add_size, children_add_size, resource_size(b_res), add_align); if (!size0 && !size1) { if (b_res->start || b_res->end) -- 1.8.3.1
Re: [PATCH] NVMe: Fixed race between nvme_thread & probe path.
> >> + smp_wmb(); > > > > This has been applied already as an explicit mb() > > Since these structure is only accessible by software, won't smp_wmb() > sufficient enough? > Seems reasonable to me -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] NVMe: Fixed race between nvme_thread & probe path.
On Thu, Jun 18, 2015 at 04:13:50PM +0530, Parav Pandit wrote: > Kernel thread nvme_thread and driver load process can be executing > in parallel on different CPU. This leads to race condition whenever > nvme_alloc_queue() instructions are executed out of order that can > reflects incorrect value for nvme_thread. > Memory barrier in nvme_alloc_queue() ensures that it maintains the > order and and data dependency read barrier in reader thread ensures > that cpu cache is synced. > > Signed-off-by: Parav Pandit > --- > drivers/block/nvme-core.c | 12 ++-- > 1 files changed, 10 insertions(+), 2 deletions(-) > > diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c > index 5961ed7..90fb0ce 100644 > --- a/drivers/block/nvme-core.c > +++ b/drivers/block/nvme-core.c > @@ -1403,8 +1403,10 @@ static struct nvme_queue *nvme_alloc_queue(struct > nvme_dev *dev, int qid, > nvmeq->q_db = >dbs[qid * 2 * dev->db_stride]; > nvmeq->q_depth = depth; > nvmeq->qid = qid; > - dev->queue_count++; > dev->queues[qid] = nvmeq; > + /* update queues first before updating queue_count */ > + smp_wmb(); > + dev->queue_count++; > > return nvmeq; > This has been applied already as an explicit mb() > @@ -2073,7 +2075,13 @@ static int nvme_kthread(void *data) > continue; > } > for (i = 0; i < dev->queue_count; i++) { > - struct nvme_queue *nvmeq = dev->queues[i]; > + struct nvme_queue *nvmeq; > + > + /* make sure to read queue_count before > + * traversing queues. > + */ > + smp_read_barrier_depends(); > + nvmeq = dev->queues[i]; > if (!nvmeq) > continue; > spin_lock_irq(>q_lock); I don't think this is necessary. If queue_count is incremented while in this loop, it will be picked up the next time the kthread runs -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] nvme : Use correct scnprintf in cmb show
Looks good. Thanks Stephen. Reviewed-by Jon Derrick: On Fri, Dec 16, 2016 at 11:54:50AM -0700, Stephen Bates wrote: > Make sure we are using the correct scnprintf in the sysfs show > function for the CMB. > > Signed-off-by: Stephen Bates > --- > drivers/nvme/host/pci.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c > index 5e52034..be10860 100644 > --- a/drivers/nvme/host/pci.c > +++ b/drivers/nvme/host/pci.c > @@ -50,7 +50,7 @@ > #define NVME_AQ_DEPTH256 > #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) > #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) > - > + > /* > * We handle AEN commands ourselves and don't even let the > * block layer know about them. > @@ -1332,7 +1332,7 @@ static ssize_t nvme_cmb_show(struct device *dev, > { > struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); > > - return snprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", > + return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", > ndev->cmbloc, ndev->cmbsz); > } > static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); > -- > 2.1.4 >
Re: [PATCH 2/2] nvme: improve cmb sysfs reporting
Minor nit below On Fri, Dec 16, 2016 at 11:54:51AM -0700, Stephen Bates wrote: > Add more information to the NVMe CMB sysfs entry. This includes > information about the CMB size, location and capabilities. > > Signed-off-by: Stephen Bates > --- > drivers/nvme/host/pci.c | 31 +-- > include/linux/nvme.h| 8 > 2 files changed, 37 insertions(+), 2 deletions(-) > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c > index be10860..1f70630 100644 > --- a/drivers/nvme/host/pci.c > +++ b/drivers/nvme/host/pci.c > @@ -98,6 +98,7 @@ struct nvme_dev { > void __iomem *cmb; > dma_addr_t cmb_dma_addr; > u64 cmb_size; > + u64 cmb_offset; > u32 cmbsz; > u32 cmbloc; > struct nvme_ctrl ctrl; > @@ -1326,14 +1327,39 @@ static int nvme_create_io_queues(struct nvme_dev *dev) > return ret >= 0 ? 0 : ret; > } > > +static const char * const cmb_caps[] = { > + [NVME_CMB_CAP_SQS] = "SQS", > + [NVME_CMB_CAP_CQS] = "CQS", > + [NVME_CMB_CAP_LISTS] = "LISTS", > + [NVME_CMB_CAP_RDS] = "RDS", > + [NVME_CMB_CAP_WDS] = "WDS", > +}; > + > static ssize_t nvme_cmb_show(struct device *dev, >struct device_attribute *attr, >char *buf) > { > struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); > + unsigned int i, len = 0; > + > + len += scnprintf(buf+len, PAGE_SIZE-len, > + "cmbloc : 0x%08x\ncmbsz : 0x%08x\n\n", > + ndev->cmbloc, ndev->cmbsz); > + > + len += scnprintf(buf+len, PAGE_SIZE-len, > + "OFFSET : 0x%016llx\nSIZE : %llu Bytes\n" \ > + "DMA ADDR : %pad\n\n", > + ndev->cmb_offset, ndev->cmb_size, > + >cmb_dma_addr); > + > + for (i = NVME_CMB_CAP_SQS; i <= NVME_CMB_CAP_WDS; i++) I'd prefer seeing (i = 0; i < ARRAY_SIZE(..); i++) because it provides automatic bounds checking against future code. > + len += scnprintf(buf+len, PAGE_SIZE-len, "%-7s: %s\n", > + cmb_caps[i], > + ((ndev->cmbsz) & (1< + "NOT SUPPORTED"); > + > + return len; > > - return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", > -ndev->cmbloc, ndev->cmbsz); > } > static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); > > @@ -1376,6 +1402,7 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) > > dev->cmb_dma_addr = dma_addr; > dev->cmb_size = size; > + dev->cmb_offset = offset; > return cmb; > } > > diff --git a/include/linux/nvme.h b/include/linux/nvme.h > index fc3c242..cd0d324 100644 > --- a/include/linux/nvme.h > +++ b/include/linux/nvme.h > @@ -121,6 +121,14 @@ enum { > #define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) > #define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) > > +enum { > + NVME_CMB_CAP_SQS = 0, > + NVME_CMB_CAP_CQS, > + NVME_CMB_CAP_LISTS, > + NVME_CMB_CAP_RDS, > + NVME_CMB_CAP_WDS, > +}; > + > /* > * Submission and Completion Queue Entry Sizes for the NVM command set. > * (In bytes and specified as a power of two (2^n)). > -- > 2.1.4 >