from:"Jon Derrick"

Re: [PATCH] NVMe: Fixed race between nvme_thread probe path.

2015-06-18 Thread Jon Derrick

On Thu, Jun 18, 2015 at 04:13:50PM +0530, Parav Pandit wrote:
 Kernel thread nvme_thread and driver load process can be executing
 in parallel on different CPU. This leads to race condition whenever
 nvme_alloc_queue() instructions are executed out of order that can
 reflects incorrect value for nvme_thread.
 Memory barrier in nvme_alloc_queue() ensures that it maintains the
 order and and data dependency read barrier in reader thread ensures
 that cpu cache is synced.
 
 Signed-off-by: Parav Pandit parav.pan...@avagotech.com
 ---
  drivers/block/nvme-core.c |   12 ++--
  1 files changed, 10 insertions(+), 2 deletions(-)
 
 diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
 index 5961ed7..90fb0ce 100644
 --- a/drivers/block/nvme-core.c
 +++ b/drivers/block/nvme-core.c
 @@ -1403,8 +1403,10 @@ static struct nvme_queue *nvme_alloc_queue(struct 
 nvme_dev *dev, int qid,
   nvmeq-q_db = dev-dbs[qid * 2 * dev-db_stride];
   nvmeq-q_depth = depth;
   nvmeq-qid = qid;
 - dev-queue_count++;
   dev-queues[qid] = nvmeq;
 + /* update queues first before updating queue_count */
 + smp_wmb();
 + dev-queue_count++;
  
   return nvmeq;
  

This has been applied already as an explicit mb()

 @@ -2073,7 +2075,13 @@ static int nvme_kthread(void *data)
   continue;
   }
   for (i = 0; i  dev-queue_count; i++) {
 - struct nvme_queue *nvmeq = dev-queues[i];
 + struct nvme_queue *nvmeq;
 +
 + /* make sure to read queue_count before
 +  * traversing queues.
 +  */
 + smp_read_barrier_depends();
 + nvmeq = dev-queues[i];
   if (!nvmeq)
   continue;
   spin_lock_irq(nvmeq-q_lock);

I don't think this is necessary. If queue_count is incremented while in this 
loop, it will be picked up the next time the kthread runs
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] NVMe: Fixed race between nvme_thread probe path.

2015-06-26 Thread Jon Derrick

  + smp_wmb();
 
  This has been applied already as an explicit mb()
 
 Since these structure is only accessible by software, won't smp_wmb()
 sufficient enough?
 

Seems reasonable to me

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] VMD: Attach vmd resources to parent domain's resource tree

2016-02-23 Thread Jon Derrick

On Tue, Feb 23, 2016 at 11:16:11PM +, Keith Busch wrote:
> On Tue, Feb 23, 2016 at 02:50:13PM -0700, Jon Derrick wrote:
> > This patch attaches the new VMD domain's resources to the VMD device's
> > resources. This allows /proc/iomem to display a more complete picture.
> > 
> > Before:
> >   c000-c1ff : :5d:05.5
> >   c200-c3ff : :5d:05.5
> > c201-c2013fff : nvme
> >   c400-c40f : :5d:05.5
> > 
> > After:
> >   c000-c1ff : :5d:05.5
> > c000-001f : VMD CFGBAR
> >   c200-c3ff : :5d:05.5
> > c200-c3ff : VMD MEMBAR1
> >   c200-c22f : PCI Bus 1:01
> > c200-c200 : 1:01:00.0
> > c201-c2013fff : 1:01:00.0
> >   c201-c2013fff : nvme
> >   c230-c24f : PCI Bus 1:01
> >   c400-c40f : :5d:05.5
> > c4002000-c40f : VMD MEMBAR2
> 
> I think we should drop the CFGBAR from here since that's a bus resource
> rather than IO memory. Otherwise, this looks good and useful.
Yes that entry doesn't make much sense for iomem does it. I'll drop that part.

> 
> I think in the near future we should come up with a better name than
> "VMD MEMBAR" to help distinguish multiple VMD's in a system.
Seconded. I'm fine with naming it by its PCI designator, eg 1:01:00.0, and 
maybe a bar designator after that (or not).

[PATCH] VMD: Attach vmd resources to parent domain's resource tree

2016-02-23 Thread Jon Derrick

This patch attaches the new VMD domain's resources to the VMD device's
resources. This allows /proc/iomem to display a more complete picture.

Before:
  c000-c1ff : :5d:05.5
  c200-c3ff : :5d:05.5
c201-c2013fff : nvme
  c400-c40f : :5d:05.5

After:
  c000-c1ff : :5d:05.5
c000-001f : VMD CFGBAR
  c200-c3ff : :5d:05.5
c200-c3ff : VMD MEMBAR1
  c200-c22f : PCI Bus 1:01
c200-c200 : 1:01:00.0
c201-c2013fff : 1:01:00.0
  c201-c2013fff : nvme
  c230-c24f : PCI Bus 1:01
  c400-c40f : :5d:05.5
c4002000-c40f : VMD MEMBAR2

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 arch/x86/pci/vmd.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c
index d57e480..06490da 100644
--- a/arch/x86/pci/vmd.c
+++ b/arch/x86/pci/vmd.c
@@ -503,6 +503,20 @@ static struct pci_ops vmd_ops = {
.write  = vmd_pci_write,
 };
 
+static void vmd_attach_resources(struct vmd_dev *vmd)
+{
+   vmd->dev->resource[VMD_CFGBAR].child = >resources[0];
+   vmd->dev->resource[VMD_MEMBAR1].child = >resources[1];
+   vmd->dev->resource[VMD_MEMBAR2].child = >resources[2];
+}
+
+static void vmd_detach_resources(struct vmd_dev *vmd)
+{
+   vmd->dev->resource[VMD_CFGBAR].child = NULL;
+   vmd->dev->resource[VMD_MEMBAR1].child = NULL;
+   vmd->dev->resource[VMD_MEMBAR2].child = NULL;
+}
+
 /*
  * VMD domains start at 0x1000 to not clash with ACPI _SEG domains.
  */
@@ -530,6 +544,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd)
.start = res->start,
.end   = (resource_size(res) >> 20) - 1,
.flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED,
+   .parent = res,
};
 
res = >dev->resource[VMD_MEMBAR1];
@@ -542,6 +557,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd)
.start = res->start,
.end   = res->end,
.flags = flags,
+   .parent = res,
};
 
res = >dev->resource[VMD_MEMBAR2];
@@ -554,6 +570,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd)
.start = res->start + 0x2000,
.end   = res->end,
.flags = flags,
+   .parent = res,
};
 
sd->domain = vmd_find_free_domain();
@@ -578,6 +595,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd)
return -ENODEV;
}
 
+   vmd_attach_resources(vmd);
vmd_setup_dma_ops(vmd);
dev_set_msi_domain(>bus->dev, vmd->irq_domain);
pci_rescan_bus(vmd->bus);
@@ -674,6 +692,7 @@ static void vmd_remove(struct pci_dev *dev)
 {
struct vmd_dev *vmd = pci_get_drvdata(dev);
 
+   vmd_detach_resources(vmd);
pci_set_drvdata(dev, NULL);
sysfs_remove_link(>dev->dev.kobj, "domain");
pci_stop_root_bus(vmd->bus);
-- 
1.8.3.1

Re: [PATCHv3 4/4] MAINTAINERS: Remove powerpc's opal match

2017-02-16 Thread Jon Derrick

Thanks everyone. Sorry about the mess :)

On 02/15/2017 10:23 PM, Michael Ellerman wrote:
> Jon Derrick <jonathan.derr...@intel.com> writes:
> 
>> PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks
>> like the 'arch/powerpc' file pattern should be enough to match powerpc
>> opal code by itself. Remove the opal regex pattern from powerpc.
> 
> We thought of it first.
> 
> Can't you just rename your driver, Opal Storage Specification, so "oss",
> that should be pretty unique?
> 
> ... :)
> 
> I don't like this version, but I'll merge the one from Stewart which
> drops the pattern and adds the paths for the existing drivers.
> 
> cheers
> 
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index b983b25..430dd02 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -7404,7 +7404,6 @@ F: drivers/pci/hotplug/pnv_php.c
>>  F:  drivers/pci/hotplug/rpa*
>>  F:  drivers/scsi/ibmvscsi/
>>  F:  tools/testing/selftests/powerpc
>> -N:  opal
>>  N:  /pmac
>>  N:  powermac
>>  N:  powernv
>> -- 
>> 1.8.3.1

[PATCH 2/4] block/sed: Add helper to qualify response tokens

2017-02-15 Thread Jon Derrick

Add helper which verifies the response token is valid and matches the
expected value. Merges token_type and response_get_token.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 block/sed-opal.c | 61 +++-
 1 file changed, 25 insertions(+), 36 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 77623ad..d6dd604 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -591,48 +591,25 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, 
u32 tsn)
return 0;
 }
 
-static enum opal_response_token token_type(const struct parsed_resp *resp,
-  int n)
+static const struct opal_resp_tok *response_get_token(
+   const struct parsed_resp *resp,
+   int n)
 {
const struct opal_resp_tok *tok;
 
if (n >= resp->num) {
pr_err("Token number doesn't exist: %d, resp: %d\n",
   n, resp->num);
-   return OPAL_DTA_TOKENID_INVALID;
+   return ERR_PTR(-EINVAL);
}
 
tok = >toks[n];
if (tok->len == 0) {
pr_err("Token length must be non-zero\n");
-   return OPAL_DTA_TOKENID_INVALID;
+   return ERR_PTR(-EINVAL);
}
 
-   return tok->type;
-}
-
-/*
- * This function returns 0 in case of invalid token. One should call
- * token_type() first to find out if the token is valid or not.
- */
-static enum opal_token response_get_token(const struct parsed_resp *resp,
- int n)
-{
-   const struct opal_resp_tok *tok;
-
-   if (n >= resp->num) {
-   pr_err("Token number doesn't exist: %d, resp: %d\n",
-  n, resp->num);
-   return 0;
-   }
-
-   tok = >toks[n];
-   if (tok->len == 0) {
-   pr_err("Token length must be non-zero\n");
-   return 0;
-   }
-
-   return tok->pos[0];
+   return tok;
 }
 
 static ssize_t response_parse_tiny(struct opal_resp_tok *tok,
@@ -851,20 +828,32 @@ static u64 response_get_u64(const struct parsed_resp 
*resp, int n)
return resp->toks[n].stored.u;
 }
 
+static bool response_token_matches(const struct opal_resp_tok *token, u8 match)
+{
+   if (IS_ERR_OR_NULL(token) ||
+   token->type != OPAL_DTA_TOKENID_TOKEN ||
+   token->pos[0] != match)
+   return false;
+   return true;
+}
+
 static u8 response_status(const struct parsed_resp *resp)
 {
-   if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN &&
-   response_get_token(resp, 0) == OPAL_ENDOFSESSION) {
+   const struct opal_resp_tok *tok;
+
+   tok = response_get_token(resp, 0);
+   if (response_token_matches(tok, OPAL_ENDOFSESSION))
return 0;
-   }
 
if (resp->num < 5)
return DTAERROR_NO_METHOD_STATUS;
 
-   if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN ||
-   token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN ||
-   response_get_token(resp, resp->num - 1) != OPAL_ENDLIST ||
-   response_get_token(resp, resp->num - 5) != OPAL_STARTLIST)
+   tok = response_get_token(resp, resp->num - 5);
+   if (!response_token_matches(tok, OPAL_STARTLIST))
+   return DTAERROR_NO_METHOD_STATUS;
+
+   tok = response_get_token(resp, resp->num - 1);
+   if (!response_token_matches(tok, OPAL_ENDLIST))
return DTAERROR_NO_METHOD_STATUS;
 
return response_get_u64(resp, resp->num - 4);
-- 
1.8.3.1

[PATCH 3/4] block/sed: Check received header lengths

2017-02-15 Thread Jon Derrick

Add a buffer size check against discovery and response header lengths
before we loop over their buffers.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 block/sed-opal.c | 35 +--
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index d6dd604..addf89e 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev)
const struct d0_header *hdr = (struct d0_header *)dev->resp;
const u8 *epos = dev->resp, *cpos = dev->resp;
u16 comid = 0;
+   u32 hlen = be32_to_cpu(hdr->length);
 
-   print_buffer(dev->resp, be32_to_cpu(hdr->length));
+   print_buffer(dev->resp, hlen);
 
-   epos += be32_to_cpu(hdr->length); /* end of buffer */
+   if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
+   pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n",
+   sizeof(*hdr), hlen, IO_BUFFER_LENGTH);
+   return -EFAULT;
+   }
+
+   epos += hlen; /* end of buffer */
cpos += sizeof(*hdr); /* current position on buffer */
 
while (cpos < epos && supported) {
@@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length,
int total;
ssize_t token_length;
const u8 *pos;
+   u32 clen, plen, slen;
 
if (!buf)
return -EFAULT;
@@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length,
pos = buf;
pos += sizeof(*hdr);
 
-   pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n",
-be32_to_cpu(hdr->cp.length),
-be32_to_cpu(hdr->pkt.length),
-be32_to_cpu(hdr->subpkt.length));
-
-   if (hdr->cp.length == 0 || hdr->pkt.length == 0 ||
-   hdr->subpkt.length == 0) {
-   pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n",
-  be32_to_cpu(hdr->cp.length),
-  be32_to_cpu(hdr->pkt.length),
-  be32_to_cpu(hdr->subpkt.length));
+   clen = be32_to_cpu(hdr->cp.length);
+   plen = be32_to_cpu(hdr->pkt.length);
+   slen = be32_to_cpu(hdr->subpkt.length);
+   pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n",
+clen, plen, slen);
+
+   if (clen == 0 || plen == 0 || slen == 0 ||
+   (u64)clen + plen + slen > IO_BUFFER_LENGTH) {
+   pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n",
+  clen, plen, slen);
print_buffer(pos, sizeof(*hdr));
return -EINVAL;
}
@@ -743,7 +750,7 @@ static int response_parse(const u8 *buf, size_t length,
return -EFAULT;
 
iter = resp->toks;
-   total = be32_to_cpu(hdr->subpkt.length);
+   total = slen;
print_buffer(pos, total);
while (total > 0) {
if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */
-- 
1.8.3.1

[PATCH 4/4] MAINTAINERS: Remove powerpc's opal match

2017-02-15 Thread Jon Derrick

PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks
like the 'arch/powerpc' file pattern should be enough to match powerpc
opal code by itself. Remove the opal regex pattern from powerpc.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 MAINTAINERS | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index b983b25..430dd02 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7404,7 +7404,6 @@ F:drivers/pci/hotplug/pnv_php.c
 F: drivers/pci/hotplug/rpa*
 F: drivers/scsi/ibmvscsi/
 F: tools/testing/selftests/powerpc
-N: opal
 N: /pmac
 N: powermac
 N: powernv
-- 
1.8.3.1

[PATCH 1/4] block/sed: Use ssize_t on atom parsers to return errors

2017-02-15 Thread Jon Derrick

The short atom parser can return an errno from decoding but does not
currently return the error as a signed value. Convert all of the parsers
to ssize_t.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 block/sed-opal.c | 28 ++--
 include/linux/sed-opal.h |  2 +-
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index e95b8a5..77623ad 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -635,8 +635,8 @@ static enum opal_token response_get_token(const struct 
parsed_resp *resp,
return tok->pos[0];
 }
 
-static size_t response_parse_tiny(struct opal_resp_tok *tok,
- const u8 *pos)
+static ssize_t response_parse_tiny(struct opal_resp_tok *tok,
+  const u8 *pos)
 {
tok->pos = pos;
tok->len = 1;
@@ -652,8 +652,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok,
return tok->len;
 }
 
-static size_t response_parse_short(struct opal_resp_tok *tok,
-  const u8 *pos)
+static ssize_t response_parse_short(struct opal_resp_tok *tok,
+   const u8 *pos)
 {
tok->pos = pos;
tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1;
@@ -665,7 +665,7 @@ static size_t response_parse_short(struct opal_resp_tok 
*tok,
tok->type = OPAL_DTA_TOKENID_SINT;
} else {
u64 u_integer = 0;
-   int i, b = 0;
+   ssize_t i, b = 0;
 
tok->type = OPAL_DTA_TOKENID_UINT;
if (tok->len > 9) {
@@ -682,8 +682,8 @@ static size_t response_parse_short(struct opal_resp_tok 
*tok,
return tok->len;
 }
 
-static size_t response_parse_medium(struct opal_resp_tok *tok,
-   const u8 *pos)
+static ssize_t response_parse_medium(struct opal_resp_tok *tok,
+const u8 *pos)
 {
tok->pos = pos;
tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2;
@@ -699,8 +699,8 @@ static size_t response_parse_medium(struct opal_resp_tok 
*tok,
return tok->len;
 }
 
-static size_t response_parse_long(struct opal_resp_tok *tok,
- const u8 *pos)
+static ssize_t response_parse_long(struct opal_resp_tok *tok,
+  const u8 *pos)
 {
tok->pos = pos;
tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4;
@@ -716,8 +716,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok,
return tok->len;
 }
 
-static size_t response_parse_token(struct opal_resp_tok *tok,
-  const u8 *pos)
+static ssize_t response_parse_token(struct opal_resp_tok *tok,
+   const u8 *pos)
 {
tok->pos = pos;
tok->len = 1;
@@ -734,7 +734,7 @@ static int response_parse(const u8 *buf, size_t length,
struct opal_resp_tok *iter;
int num_entries = 0;
int total;
-   size_t token_length;
+   ssize_t token_length;
const u8 *pos;
 
if (!buf)
@@ -780,8 +780,8 @@ static int response_parse(const u8 *buf, size_t length,
else /* TOKEN */
token_length = response_parse_token(iter, pos);
 
-   if (token_length == -EINVAL)
-   return -EINVAL;
+   if (token_length < 0)
+   return token_length;
 
pos += token_length;
total -= token_length;
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index 205d520..4fc72d4 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -69,7 +69,7 @@ enum opal_response_token {
  */
 struct opal_resp_tok {
const u8 *pos;
-   size_t len;
+   ssize_t len;
enum opal_response_token type;
enum opal_atom_width width;
union {
-- 
1.8.3.1

[PATCHv3 4/4] MAINTAINERS: Remove powerpc's opal match

2017-02-15 Thread Jon Derrick

PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks
like the 'arch/powerpc' file pattern should be enough to match powerpc
opal code by itself. Remove the opal regex pattern from powerpc.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 MAINTAINERS | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index b983b25..430dd02 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7404,7 +7404,6 @@ F:drivers/pci/hotplug/pnv_php.c
 F: drivers/pci/hotplug/rpa*
 F: drivers/scsi/ibmvscsi/
 F: tools/testing/selftests/powerpc
-N: opal
 N: /pmac
 N: powermac
 N: powernv
-- 
1.8.3.1

[PATCHv3 1/4] block/sed: Use ssize_t on atom parsers to return errors

2017-02-15 Thread Jon Derrick

The short atom parser can return an errno from decoding but does not
currently return the error as a signed value. Convert all of the parsers
to ssize_t.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 block/sed-opal.c | 28 ++--
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index e95b8a5..77623ad 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -635,8 +635,8 @@ static enum opal_token response_get_token(const struct 
parsed_resp *resp,
return tok->pos[0];
 }
 
-static size_t response_parse_tiny(struct opal_resp_tok *tok,
- const u8 *pos)
+static ssize_t response_parse_tiny(struct opal_resp_tok *tok,
+  const u8 *pos)
 {
tok->pos = pos;
tok->len = 1;
@@ -652,8 +652,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok,
return tok->len;
 }
 
-static size_t response_parse_short(struct opal_resp_tok *tok,
-  const u8 *pos)
+static ssize_t response_parse_short(struct opal_resp_tok *tok,
+   const u8 *pos)
 {
tok->pos = pos;
tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1;
@@ -665,7 +665,7 @@ static size_t response_parse_short(struct opal_resp_tok 
*tok,
tok->type = OPAL_DTA_TOKENID_SINT;
} else {
u64 u_integer = 0;
-   int i, b = 0;
+   ssize_t i, b = 0;
 
tok->type = OPAL_DTA_TOKENID_UINT;
if (tok->len > 9) {
@@ -682,8 +682,8 @@ static size_t response_parse_short(struct opal_resp_tok 
*tok,
return tok->len;
 }
 
-static size_t response_parse_medium(struct opal_resp_tok *tok,
-   const u8 *pos)
+static ssize_t response_parse_medium(struct opal_resp_tok *tok,
+const u8 *pos)
 {
tok->pos = pos;
tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2;
@@ -699,8 +699,8 @@ static size_t response_parse_medium(struct opal_resp_tok 
*tok,
return tok->len;
 }
 
-static size_t response_parse_long(struct opal_resp_tok *tok,
- const u8 *pos)
+static ssize_t response_parse_long(struct opal_resp_tok *tok,
+  const u8 *pos)
 {
tok->pos = pos;
tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4;
@@ -716,8 +716,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok,
return tok->len;
 }
 
-static size_t response_parse_token(struct opal_resp_tok *tok,
-  const u8 *pos)
+static ssize_t response_parse_token(struct opal_resp_tok *tok,
+   const u8 *pos)
 {
tok->pos = pos;
tok->len = 1;
@@ -734,7 +734,7 @@ static int response_parse(const u8 *buf, size_t length,
struct opal_resp_tok *iter;
int num_entries = 0;
int total;
-   size_t token_length;
+   ssize_t token_length;
const u8 *pos;
 
if (!buf)
@@ -780,8 +780,8 @@ static int response_parse(const u8 *buf, size_t length,
else /* TOKEN */
token_length = response_parse_token(iter, pos);
 
-   if (token_length == -EINVAL)
-   return -EINVAL;
+   if (token_length < 0)
+   return token_length;
 
pos += token_length;
total -= token_length;
-- 
1.8.3.1

Re: [PATCH 3/4] block/sed: Check received header lengths

2017-02-15 Thread Jon Derrick

On 02/15/2017 11:15 AM, Scott Bauer wrote:
> On Wed, Feb 15, 2017 at 12:38:53PM -0700, Jon Derrick wrote:
>> Add a buffer size check against discovery and response header lengths
>> before we loop over their buffers.
>>
>> Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
>> ---
>>  block/sed-opal.c | 35 +--
>>  1 file changed, 21 insertions(+), 14 deletions(-)
>>
>> diff --git a/block/sed-opal.c b/block/sed-opal.c
>> index d6dd604..addf89e 100644
>> --- a/block/sed-opal.c
>> +++ b/block/sed-opal.c
>> @@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev)
>>  const struct d0_header *hdr = (struct d0_header *)dev->resp;
>>  const u8 *epos = dev->resp, *cpos = dev->resp;
>>  u16 comid = 0;
>> +u32 hlen = be32_to_cpu(hdr->length);
>>  
>> -print_buffer(dev->resp, be32_to_cpu(hdr->length));
>> +print_buffer(dev->resp, hlen);
>>  
>> -epos += be32_to_cpu(hdr->length); /* end of buffer */
>> +if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
>> +pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n",
>> +sizeof(*hdr), hlen, IO_BUFFER_LENGTH);
>> +return -EFAULT;
>> +}
>> +
>> +epos += hlen; /* end of buffer */
>>  cpos += sizeof(*hdr); /* current position on buffer */
>>  
>>  while (cpos < epos && supported) {
>> @@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length,
>>  int total;
>>  ssize_t token_length;
>>  const u8 *pos;
>> +u32 clen, plen, slen;
>>  
>>  if (!buf)
>>  return -EFAULT;
>> @@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length,
>>  pos = buf;
>>  pos += sizeof(*hdr);
>>  
>> -pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n",
>> - be32_to_cpu(hdr->cp.length),
>> - be32_to_cpu(hdr->pkt.length),
>> - be32_to_cpu(hdr->subpkt.length));
>> -
>> -if (hdr->cp.length == 0 || hdr->pkt.length == 0 ||
>> -hdr->subpkt.length == 0) {
>> -pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n",
>> -   be32_to_cpu(hdr->cp.length),
>> -   be32_to_cpu(hdr->pkt.length),
>> -   be32_to_cpu(hdr->subpkt.length));
>> +clen = be32_to_cpu(hdr->cp.length);
>> +plen = be32_to_cpu(hdr->pkt.length);
>> +slen = be32_to_cpu(hdr->subpkt.length);
>> +pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n",
>> + clen, plen, slen);
>> +
>> +if (clen == 0 || plen == 0 || slen == 0 ||
>> +(u64)clen + plen + slen > IO_BUFFER_LENGTH) {
> 
> I think we're over calculating here. From my understanding of the spec:
> TCG_Storage_Architecture_Core_Spec_v2.01_r1.00.pdf
> 3.2.3
>   ComPackets, Packets & Subpackets
> 
> The Comp packet size should the size of the packet, and the subpackets.
> The "packet" should be the size of all the subpackets.
> And lastly the subpacket should be the size its payload.
Yep I forgot that. Good catch!


> 
> if ((u64) slen + sizeof(*hdr) > IO_BUFFER_LENGTH)
> 
> Since we never need the com packet length or pkt length (we never use them
> anywhere) I think the above check is okay. Let me know if i'm wrong though,
> or you have a different understanding of the lengths.
Agreed

> 
> 
> 
> 
> 
> 
> 
>

[PATCHv2 3/4] block/sed: Check received header lengths

2017-02-15 Thread Jon Derrick

Add a buffer size check against discovery and response header lengths
before we loop over their buffers.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 block/sed-opal.c | 35 +--
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index d6dd604..addf89e 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev)
const struct d0_header *hdr = (struct d0_header *)dev->resp;
const u8 *epos = dev->resp, *cpos = dev->resp;
u16 comid = 0;
+   u32 hlen = be32_to_cpu(hdr->length);
 
-   print_buffer(dev->resp, be32_to_cpu(hdr->length));
+   print_buffer(dev->resp, hlen);
 
-   epos += be32_to_cpu(hdr->length); /* end of buffer */
+   if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
+   pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n",
+   sizeof(*hdr), hlen, IO_BUFFER_LENGTH);
+   return -EFAULT;
+   }
+
+   epos += hlen; /* end of buffer */
cpos += sizeof(*hdr); /* current position on buffer */
 
while (cpos < epos && supported) {
@@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length,
int total;
ssize_t token_length;
const u8 *pos;
+   u32 clen, plen, slen;
 
if (!buf)
return -EFAULT;
@@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length,
pos = buf;
pos += sizeof(*hdr);
 
-   pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n",
-be32_to_cpu(hdr->cp.length),
-be32_to_cpu(hdr->pkt.length),
-be32_to_cpu(hdr->subpkt.length));
-
-   if (hdr->cp.length == 0 || hdr->pkt.length == 0 ||
-   hdr->subpkt.length == 0) {
-   pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n",
-  be32_to_cpu(hdr->cp.length),
-  be32_to_cpu(hdr->pkt.length),
-  be32_to_cpu(hdr->subpkt.length));
+   clen = be32_to_cpu(hdr->cp.length);
+   plen = be32_to_cpu(hdr->pkt.length);
+   slen = be32_to_cpu(hdr->subpkt.length);
+   pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n",
+clen, plen, slen);
+
+   if (clen == 0 || plen == 0 || slen == 0 ||
+   (u64)clen + plen + slen > IO_BUFFER_LENGTH) {
+   pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n",
+  clen, plen, slen);
print_buffer(pos, sizeof(*hdr));
return -EINVAL;
}
@@ -743,7 +750,7 @@ static int response_parse(const u8 *buf, size_t length,
return -EFAULT;
 
iter = resp->toks;
-   total = be32_to_cpu(hdr->subpkt.length);
+   total = slen;
print_buffer(pos, total);
while (total > 0) {
if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */
-- 
1.8.3.1

[PATCHv2 4/4] MAINTAINERS: Remove powerpc's opal match

2017-02-15 Thread Jon Derrick

PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks
like the 'arch/powerpc' file pattern should be enough to match powerpc
opal code by itself. Remove the opal regex pattern from powerpc.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 MAINTAINERS | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index b983b25..430dd02 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7404,7 +7404,6 @@ F:drivers/pci/hotplug/pnv_php.c
 F: drivers/pci/hotplug/rpa*
 F: drivers/scsi/ibmvscsi/
 F: tools/testing/selftests/powerpc
-N: opal
 N: /pmac
 N: powermac
 N: powernv
-- 
1.8.3.1

[PATCHv2 1/4] block/sed: Use ssize_t on atom parsers to return errors

2017-02-15 Thread Jon Derrick

The short atom parser can return an errno from decoding but does not
currently return the error as a signed value. Convert all of the parsers
to ssize_t.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 block/sed-opal.c | 28 ++--
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index e95b8a5..77623ad 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -635,8 +635,8 @@ static enum opal_token response_get_token(const struct 
parsed_resp *resp,
return tok->pos[0];
 }
 
-static size_t response_parse_tiny(struct opal_resp_tok *tok,
- const u8 *pos)
+static ssize_t response_parse_tiny(struct opal_resp_tok *tok,
+  const u8 *pos)
 {
tok->pos = pos;
tok->len = 1;
@@ -652,8 +652,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok,
return tok->len;
 }
 
-static size_t response_parse_short(struct opal_resp_tok *tok,
-  const u8 *pos)
+static ssize_t response_parse_short(struct opal_resp_tok *tok,
+   const u8 *pos)
 {
tok->pos = pos;
tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1;
@@ -665,7 +665,7 @@ static size_t response_parse_short(struct opal_resp_tok 
*tok,
tok->type = OPAL_DTA_TOKENID_SINT;
} else {
u64 u_integer = 0;
-   int i, b = 0;
+   ssize_t i, b = 0;
 
tok->type = OPAL_DTA_TOKENID_UINT;
if (tok->len > 9) {
@@ -682,8 +682,8 @@ static size_t response_parse_short(struct opal_resp_tok 
*tok,
return tok->len;
 }
 
-static size_t response_parse_medium(struct opal_resp_tok *tok,
-   const u8 *pos)
+static ssize_t response_parse_medium(struct opal_resp_tok *tok,
+const u8 *pos)
 {
tok->pos = pos;
tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2;
@@ -699,8 +699,8 @@ static size_t response_parse_medium(struct opal_resp_tok 
*tok,
return tok->len;
 }
 
-static size_t response_parse_long(struct opal_resp_tok *tok,
- const u8 *pos)
+static ssize_t response_parse_long(struct opal_resp_tok *tok,
+  const u8 *pos)
 {
tok->pos = pos;
tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4;
@@ -716,8 +716,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok,
return tok->len;
 }
 
-static size_t response_parse_token(struct opal_resp_tok *tok,
-  const u8 *pos)
+static ssize_t response_parse_token(struct opal_resp_tok *tok,
+   const u8 *pos)
 {
tok->pos = pos;
tok->len = 1;
@@ -734,7 +734,7 @@ static int response_parse(const u8 *buf, size_t length,
struct opal_resp_tok *iter;
int num_entries = 0;
int total;
-   size_t token_length;
+   ssize_t token_length;
const u8 *pos;
 
if (!buf)
@@ -780,8 +780,8 @@ static int response_parse(const u8 *buf, size_t length,
else /* TOKEN */
token_length = response_parse_token(iter, pos);
 
-   if (token_length == -EINVAL)
-   return -EINVAL;
+   if (token_length < 0)
+   return token_length;
 
pos += token_length;
total -= token_length;
-- 
1.8.3.1

[PATCHv2 2/4] block/sed: Add helper to qualify response tokens

2017-02-15 Thread Jon Derrick

Add helper which verifies the response token is valid and matches the
expected value. Merges token_type and response_get_token.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 block/sed-opal.c | 61 +++-
 1 file changed, 25 insertions(+), 36 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 77623ad..d6dd604 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -591,48 +591,25 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, 
u32 tsn)
return 0;
 }
 
-static enum opal_response_token token_type(const struct parsed_resp *resp,
-  int n)
+static const struct opal_resp_tok *response_get_token(
+   const struct parsed_resp *resp,
+   int n)
 {
const struct opal_resp_tok *tok;
 
if (n >= resp->num) {
pr_err("Token number doesn't exist: %d, resp: %d\n",
   n, resp->num);
-   return OPAL_DTA_TOKENID_INVALID;
+   return ERR_PTR(-EINVAL);
}
 
tok = >toks[n];
if (tok->len == 0) {
pr_err("Token length must be non-zero\n");
-   return OPAL_DTA_TOKENID_INVALID;
+   return ERR_PTR(-EINVAL);
}
 
-   return tok->type;
-}
-
-/*
- * This function returns 0 in case of invalid token. One should call
- * token_type() first to find out if the token is valid or not.
- */
-static enum opal_token response_get_token(const struct parsed_resp *resp,
- int n)
-{
-   const struct opal_resp_tok *tok;
-
-   if (n >= resp->num) {
-   pr_err("Token number doesn't exist: %d, resp: %d\n",
-  n, resp->num);
-   return 0;
-   }
-
-   tok = >toks[n];
-   if (tok->len == 0) {
-   pr_err("Token length must be non-zero\n");
-   return 0;
-   }
-
-   return tok->pos[0];
+   return tok;
 }
 
 static ssize_t response_parse_tiny(struct opal_resp_tok *tok,
@@ -851,20 +828,32 @@ static u64 response_get_u64(const struct parsed_resp 
*resp, int n)
return resp->toks[n].stored.u;
 }
 
+static bool response_token_matches(const struct opal_resp_tok *token, u8 match)
+{
+   if (IS_ERR_OR_NULL(token) ||
+   token->type != OPAL_DTA_TOKENID_TOKEN ||
+   token->pos[0] != match)
+   return false;
+   return true;
+}
+
 static u8 response_status(const struct parsed_resp *resp)
 {
-   if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN &&
-   response_get_token(resp, 0) == OPAL_ENDOFSESSION) {
+   const struct opal_resp_tok *tok;
+
+   tok = response_get_token(resp, 0);
+   if (response_token_matches(tok, OPAL_ENDOFSESSION))
return 0;
-   }
 
if (resp->num < 5)
return DTAERROR_NO_METHOD_STATUS;
 
-   if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN ||
-   token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN ||
-   response_get_token(resp, resp->num - 1) != OPAL_ENDLIST ||
-   response_get_token(resp, resp->num - 5) != OPAL_STARTLIST)
+   tok = response_get_token(resp, resp->num - 5);
+   if (!response_token_matches(tok, OPAL_STARTLIST))
+   return DTAERROR_NO_METHOD_STATUS;
+
+   tok = response_get_token(resp, resp->num - 1);
+   if (!response_token_matches(tok, OPAL_ENDLIST))
return DTAERROR_NO_METHOD_STATUS;
 
return response_get_u64(resp, resp->num - 4);
-- 
1.8.3.1

[PATCHv2 0/4] OPAL patches

2017-02-15 Thread Jon Derrick

Just a couple of fixes for sed-opal to prevent faulty firmware from
allowing us to go off in the weeds, and a helper to remove some
duplicate code.

v1->v2:
left tok->len as a size_t
got everyone important on the same email thread

Jon Derrick (4):
  block/sed: Use ssize_t on atom parsers to return errors
  block/sed: Add helper to qualify response tokens
  block/sed: Check received header lengths
  MAINTAINERS: Remove powerpc's opal match

 MAINTAINERS  |   1 -
 block/sed-opal.c | 124 +++
 2 files changed, 60 insertions(+), 65 deletions(-)

-- 
1.8.3.1

[PATCHv4 3/4] block/sed: Check received header lengths

2017-02-15 Thread Jon Derrick

Add a buffer size check against discovery and response header lengths
before we loop over their buffers.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
Reviewed-by: Scott Bauer <scott.ba...@intel.com>
---
 block/sed-opal.c | 35 +--
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 00673cf..383d5d6 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev)
const struct d0_header *hdr = (struct d0_header *)dev->resp;
const u8 *epos = dev->resp, *cpos = dev->resp;
u16 comid = 0;
+   u32 hlen = be32_to_cpu(hdr->length);
 
-   print_buffer(dev->resp, be32_to_cpu(hdr->length));
+   print_buffer(dev->resp, hlen);
 
-   epos += be32_to_cpu(hdr->length); /* end of buffer */
+   if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
+   pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n",
+   sizeof(*hdr), hlen, IO_BUFFER_LENGTH);
+   return -EFAULT;
+   }
+
+   epos += hlen; /* end of buffer */
cpos += sizeof(*hdr); /* current position on buffer */
 
while (cpos < epos && supported) {
@@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length,
int total;
ssize_t token_length;
const u8 *pos;
+   u32 clen, plen, slen;
 
if (!buf)
return -EFAULT;
@@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length,
pos = buf;
pos += sizeof(*hdr);
 
-   pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n",
-be32_to_cpu(hdr->cp.length),
-be32_to_cpu(hdr->pkt.length),
-be32_to_cpu(hdr->subpkt.length));
-
-   if (hdr->cp.length == 0 || hdr->pkt.length == 0 ||
-   hdr->subpkt.length == 0) {
-   pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n",
-  be32_to_cpu(hdr->cp.length),
-  be32_to_cpu(hdr->pkt.length),
-  be32_to_cpu(hdr->subpkt.length));
+   clen = be32_to_cpu(hdr->cp.length);
+   plen = be32_to_cpu(hdr->pkt.length);
+   slen = be32_to_cpu(hdr->subpkt.length);
+   pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n",
+clen, plen, slen);
+
+   if (clen == 0 || plen == 0 || slen == 0 ||
+   slen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
+   pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n",
+  clen, plen, slen);
print_buffer(pos, sizeof(*hdr));
return -EINVAL;
}
@@ -743,7 +750,7 @@ static int response_parse(const u8 *buf, size_t length,
return -EFAULT;
 
iter = resp->toks;
-   total = be32_to_cpu(hdr->subpkt.length);
+   total = slen;
print_buffer(pos, total);
while (total > 0) {
if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */
-- 
1.8.3.1

[PATCHv4 1/4] block/sed: Use ssize_t on atom parsers to return errors

2017-02-15 Thread Jon Derrick

The short atom parser can return an errno from decoding but does not
currently return the error as a signed value. Convert all of the parsers
to ssize_t.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
Reviewed-by: Scott Bauer <scott.ba...@intel.com>
---
 block/sed-opal.c | 28 ++--
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index e95b8a5..77623ad 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -635,8 +635,8 @@ static enum opal_token response_get_token(const struct 
parsed_resp *resp,
return tok->pos[0];
 }
 
-static size_t response_parse_tiny(struct opal_resp_tok *tok,
- const u8 *pos)
+static ssize_t response_parse_tiny(struct opal_resp_tok *tok,
+  const u8 *pos)
 {
tok->pos = pos;
tok->len = 1;
@@ -652,8 +652,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok,
return tok->len;
 }
 
-static size_t response_parse_short(struct opal_resp_tok *tok,
-  const u8 *pos)
+static ssize_t response_parse_short(struct opal_resp_tok *tok,
+   const u8 *pos)
 {
tok->pos = pos;
tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1;
@@ -665,7 +665,7 @@ static size_t response_parse_short(struct opal_resp_tok 
*tok,
tok->type = OPAL_DTA_TOKENID_SINT;
} else {
u64 u_integer = 0;
-   int i, b = 0;
+   ssize_t i, b = 0;
 
tok->type = OPAL_DTA_TOKENID_UINT;
if (tok->len > 9) {
@@ -682,8 +682,8 @@ static size_t response_parse_short(struct opal_resp_tok 
*tok,
return tok->len;
 }
 
-static size_t response_parse_medium(struct opal_resp_tok *tok,
-   const u8 *pos)
+static ssize_t response_parse_medium(struct opal_resp_tok *tok,
+const u8 *pos)
 {
tok->pos = pos;
tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2;
@@ -699,8 +699,8 @@ static size_t response_parse_medium(struct opal_resp_tok 
*tok,
return tok->len;
 }
 
-static size_t response_parse_long(struct opal_resp_tok *tok,
- const u8 *pos)
+static ssize_t response_parse_long(struct opal_resp_tok *tok,
+  const u8 *pos)
 {
tok->pos = pos;
tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4;
@@ -716,8 +716,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok,
return tok->len;
 }
 
-static size_t response_parse_token(struct opal_resp_tok *tok,
-  const u8 *pos)
+static ssize_t response_parse_token(struct opal_resp_tok *tok,
+   const u8 *pos)
 {
tok->pos = pos;
tok->len = 1;
@@ -734,7 +734,7 @@ static int response_parse(const u8 *buf, size_t length,
struct opal_resp_tok *iter;
int num_entries = 0;
int total;
-   size_t token_length;
+   ssize_t token_length;
const u8 *pos;
 
if (!buf)
@@ -780,8 +780,8 @@ static int response_parse(const u8 *buf, size_t length,
else /* TOKEN */
token_length = response_parse_token(iter, pos);
 
-   if (token_length == -EINVAL)
-   return -EINVAL;
+   if (token_length < 0)
+   return token_length;
 
pos += token_length;
total -= token_length;
-- 
1.8.3.1

[PATCHv4 2/4] block/sed: Add helper to qualify response tokens

2017-02-15 Thread Jon Derrick

Add helper which verifies the response token is valid and matches the
expected value. Merges token_type and response_get_token.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
Reviewed-by: Scott Bauer <scott.ba...@intel.com>
---
 block/sed-opal.c | 61 +++-
 1 file changed, 25 insertions(+), 36 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 77623ad..00673cf 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -591,48 +591,25 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, 
u32 tsn)
return 0;
 }
 
-static enum opal_response_token token_type(const struct parsed_resp *resp,
-  int n)
+static const struct opal_resp_tok *response_get_token(
+   const struct parsed_resp *resp,
+   int n)
 {
const struct opal_resp_tok *tok;
 
if (n >= resp->num) {
pr_err("Token number doesn't exist: %d, resp: %d\n",
   n, resp->num);
-   return OPAL_DTA_TOKENID_INVALID;
+   return ERR_PTR(-EINVAL);
}
 
tok = >toks[n];
if (tok->len == 0) {
pr_err("Token length must be non-zero\n");
-   return OPAL_DTA_TOKENID_INVALID;
+   return ERR_PTR(-EINVAL);
}
 
-   return tok->type;
-}
-
-/*
- * This function returns 0 in case of invalid token. One should call
- * token_type() first to find out if the token is valid or not.
- */
-static enum opal_token response_get_token(const struct parsed_resp *resp,
- int n)
-{
-   const struct opal_resp_tok *tok;
-
-   if (n >= resp->num) {
-   pr_err("Token number doesn't exist: %d, resp: %d\n",
-  n, resp->num);
-   return 0;
-   }
-
-   tok = >toks[n];
-   if (tok->len == 0) {
-   pr_err("Token length must be non-zero\n");
-   return 0;
-   }
-
-   return tok->pos[0];
+   return tok;
 }
 
 static ssize_t response_parse_tiny(struct opal_resp_tok *tok,
@@ -851,20 +828,32 @@ static u64 response_get_u64(const struct parsed_resp 
*resp, int n)
return resp->toks[n].stored.u;
 }
 
+static bool response_token_matches(const struct opal_resp_tok *token, u8 match)
+{
+   if (IS_ERR(token) ||
+   token->type != OPAL_DTA_TOKENID_TOKEN ||
+   token->pos[0] != match)
+   return false;
+   return true;
+}
+
 static u8 response_status(const struct parsed_resp *resp)
 {
-   if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN &&
-   response_get_token(resp, 0) == OPAL_ENDOFSESSION) {
+   const struct opal_resp_tok *tok;
+
+   tok = response_get_token(resp, 0);
+   if (response_token_matches(tok, OPAL_ENDOFSESSION))
return 0;
-   }
 
if (resp->num < 5)
return DTAERROR_NO_METHOD_STATUS;
 
-   if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN ||
-   token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN ||
-   response_get_token(resp, resp->num - 1) != OPAL_ENDLIST ||
-   response_get_token(resp, resp->num - 5) != OPAL_STARTLIST)
+   tok = response_get_token(resp, resp->num - 5);
+   if (!response_token_matches(tok, OPAL_STARTLIST))
+   return DTAERROR_NO_METHOD_STATUS;
+
+   tok = response_get_token(resp, resp->num - 1);
+   if (!response_token_matches(tok, OPAL_ENDLIST))
return DTAERROR_NO_METHOD_STATUS;
 
return response_get_u64(resp, resp->num - 4);
-- 
1.8.3.1

[PATCHv4 0/4] OPAL patches

2017-02-15 Thread Jon Derrick

Just a couple of fixes for sed-opal to prevent faulty firmware from
allowing us to go off in the weeds, and a helper to remove some
duplicate code.

v3->v4:
uses IS_ERR since tok is embedded in the response buffer and cannot be NULL

v2->v3:
corrected the bad calculation on the response parser check and changed
it to only check the subpacket length

v1->v2:
left tok->len as a size_t
got everyone important on the same email thread

Jon Derrick (4):
  block/sed: Use ssize_t on atom parsers to return errors
  block/sed: Add helper to qualify response tokens
  block/sed: Check received header lengths
  MAINTAINERS: Remove powerpc's opal match

 MAINTAINERS  |   1 -
 block/sed-opal.c | 124 +++
 2 files changed, 60 insertions(+), 65 deletions(-)

-- 
1.8.3.1

[PATCHv4 4/4] MAINTAINERS: Remove powerpc's opal match

2017-02-15 Thread Jon Derrick

PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks
like the 'arch/powerpc' file pattern should be enough to match powerpc
opal code by itself. Remove the opal regex pattern from powerpc.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 MAINTAINERS | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index b983b25..430dd02 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7404,7 +7404,6 @@ F:drivers/pci/hotplug/pnv_php.c
 F: drivers/pci/hotplug/rpa*
 F: drivers/scsi/ibmvscsi/
 F: tools/testing/selftests/powerpc
-N: opal
 N: /pmac
 N: powermac
 N: powernv
-- 
1.8.3.1

[PATCHv3 3/4] block/sed: Check received header lengths

2017-02-15 Thread Jon Derrick

Add a buffer size check against discovery and response header lengths
before we loop over their buffers.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 block/sed-opal.c | 35 +--
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index d6dd604..feba34b 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev)
const struct d0_header *hdr = (struct d0_header *)dev->resp;
const u8 *epos = dev->resp, *cpos = dev->resp;
u16 comid = 0;
+   u32 hlen = be32_to_cpu(hdr->length);
 
-   print_buffer(dev->resp, be32_to_cpu(hdr->length));
+   print_buffer(dev->resp, hlen);
 
-   epos += be32_to_cpu(hdr->length); /* end of buffer */
+   if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
+   pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n",
+   sizeof(*hdr), hlen, IO_BUFFER_LENGTH);
+   return -EFAULT;
+   }
+
+   epos += hlen; /* end of buffer */
cpos += sizeof(*hdr); /* current position on buffer */
 
while (cpos < epos && supported) {
@@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length,
int total;
ssize_t token_length;
const u8 *pos;
+   u32 clen, plen, slen;
 
if (!buf)
return -EFAULT;
@@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length,
pos = buf;
pos += sizeof(*hdr);
 
-   pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n",
-be32_to_cpu(hdr->cp.length),
-be32_to_cpu(hdr->pkt.length),
-be32_to_cpu(hdr->subpkt.length));
-
-   if (hdr->cp.length == 0 || hdr->pkt.length == 0 ||
-   hdr->subpkt.length == 0) {
-   pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n",
-  be32_to_cpu(hdr->cp.length),
-  be32_to_cpu(hdr->pkt.length),
-  be32_to_cpu(hdr->subpkt.length));
+   clen = be32_to_cpu(hdr->cp.length);
+   plen = be32_to_cpu(hdr->pkt.length);
+   slen = be32_to_cpu(hdr->subpkt.length);
+   pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n",
+clen, plen, slen);
+
+   if (clen == 0 || plen == 0 || slen == 0 ||
+   slen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
+   pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n",
+  clen, plen, slen);
print_buffer(pos, sizeof(*hdr));
return -EINVAL;
}
@@ -743,7 +750,7 @@ static int response_parse(const u8 *buf, size_t length,
return -EFAULT;
 
iter = resp->toks;
-   total = be32_to_cpu(hdr->subpkt.length);
+   total = slen;
print_buffer(pos, total);
while (total > 0) {
if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */
-- 
1.8.3.1

[PATCHv3 0/4] OPAL patches

2017-02-15 Thread Jon Derrick

Just a couple of fixes for sed-opal to prevent faulty firmware from
allowing us to go off in the weeds, and a helper to remove some
duplicate code.

v2->v3:
corrected the bad calculation on the response parser check and changed
it to only check the subpacket length

v1->v2:
left tok->len as a size_t
got everyone important on the same email thread


Jon Derrick (4):
  block/sed: Use ssize_t on atom parsers to return errors
  block/sed: Add helper to qualify response tokens
  block/sed: Check received header lengths
  MAINTAINERS: Remove powerpc's opal match

 MAINTAINERS  |   1 -
 block/sed-opal.c | 124 +++
 2 files changed, 60 insertions(+), 65 deletions(-)

-- 
1.8.3.1

[PATCHv3 2/4] block/sed: Add helper to qualify response tokens

2017-02-15 Thread Jon Derrick

Add helper which verifies the response token is valid and matches the
expected value. Merges token_type and response_get_token.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 block/sed-opal.c | 61 +++-
 1 file changed, 25 insertions(+), 36 deletions(-)

diff --git a/block/sed-opal.c b/block/sed-opal.c
index 77623ad..d6dd604 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -591,48 +591,25 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, 
u32 tsn)
return 0;
 }
 
-static enum opal_response_token token_type(const struct parsed_resp *resp,
-  int n)
+static const struct opal_resp_tok *response_get_token(
+   const struct parsed_resp *resp,
+   int n)
 {
const struct opal_resp_tok *tok;
 
if (n >= resp->num) {
pr_err("Token number doesn't exist: %d, resp: %d\n",
   n, resp->num);
-   return OPAL_DTA_TOKENID_INVALID;
+   return ERR_PTR(-EINVAL);
}
 
tok = >toks[n];
if (tok->len == 0) {
pr_err("Token length must be non-zero\n");
-   return OPAL_DTA_TOKENID_INVALID;
+   return ERR_PTR(-EINVAL);
}
 
-   return tok->type;
-}
-
-/*
- * This function returns 0 in case of invalid token. One should call
- * token_type() first to find out if the token is valid or not.
- */
-static enum opal_token response_get_token(const struct parsed_resp *resp,
- int n)
-{
-   const struct opal_resp_tok *tok;
-
-   if (n >= resp->num) {
-   pr_err("Token number doesn't exist: %d, resp: %d\n",
-  n, resp->num);
-   return 0;
-   }
-
-   tok = >toks[n];
-   if (tok->len == 0) {
-   pr_err("Token length must be non-zero\n");
-   return 0;
-   }
-
-   return tok->pos[0];
+   return tok;
 }
 
 static ssize_t response_parse_tiny(struct opal_resp_tok *tok,
@@ -851,20 +828,32 @@ static u64 response_get_u64(const struct parsed_resp 
*resp, int n)
return resp->toks[n].stored.u;
 }
 
+static bool response_token_matches(const struct opal_resp_tok *token, u8 match)
+{
+   if (IS_ERR_OR_NULL(token) ||
+   token->type != OPAL_DTA_TOKENID_TOKEN ||
+   token->pos[0] != match)
+   return false;
+   return true;
+}
+
 static u8 response_status(const struct parsed_resp *resp)
 {
-   if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN &&
-   response_get_token(resp, 0) == OPAL_ENDOFSESSION) {
+   const struct opal_resp_tok *tok;
+
+   tok = response_get_token(resp, 0);
+   if (response_token_matches(tok, OPAL_ENDOFSESSION))
return 0;
-   }
 
if (resp->num < 5)
return DTAERROR_NO_METHOD_STATUS;
 
-   if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN ||
-   token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN ||
-   response_get_token(resp, resp->num - 1) != OPAL_ENDLIST ||
-   response_get_token(resp, resp->num - 5) != OPAL_STARTLIST)
+   tok = response_get_token(resp, resp->num - 5);
+   if (!response_token_matches(tok, OPAL_STARTLIST))
+   return DTAERROR_NO_METHOD_STATUS;
+
+   tok = response_get_token(resp, resp->num - 1);
+   if (!response_token_matches(tok, OPAL_ENDLIST))
return DTAERROR_NO_METHOD_STATUS;
 
return response_get_u64(resp, resp->num - 4);
-- 
1.8.3.1

Re: [PATCH] x86/ioapic: Ignore root bridges without a companion ACPI device

2016-09-12 Thread Jon Derrick

On Sat, Sep 10, 2016 at 11:40:45PM +0800, Rui Wang wrote:
> Some PCI root bridges don't have a corresponding ACPI device.
> This can be the case on some old platforms. Don't call acpi_ioapic_add()
> on these bridges because they can't support ioapic hotplug.
Not just old platforms. VMD is a new device which initializes a root port in 
software without an acpi handle.

What branch is this based on? I can't find the relevant code/commits

Re: [PATCH] x86/ioapic: Ignore root bridges without a companion ACPI device

2016-09-12 Thread Jon Derrick

On Mon, Sep 12, 2016 at 03:43:23PM -0600, Luck, Tony wrote:
> > What branch is this based on? I can't find the relevant code/commits
> 
> tip tree. branch x86/apic
> 
> -Tony

Cheers!

Re: [PATCH 1/2] nvme : Use correct scnprintf in cmb show

2016-12-19 Thread Jon Derrick

Looks good. Thanks Stephen.

Reviewed-by Jon Derrick: <jonathan.derr...@intel.com>

On Fri, Dec 16, 2016 at 11:54:50AM -0700, Stephen Bates wrote:
> Make sure we are using the correct scnprintf in the sysfs show
> function for the CMB.
> 
> Signed-off-by: Stephen Bates <sba...@raithlin.com>
> ---
>  drivers/nvme/host/pci.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 5e52034..be10860 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -50,7 +50,7 @@
>  #define NVME_AQ_DEPTH256
>  #define SQ_SIZE(depth)   (depth * sizeof(struct nvme_command))
>  #define CQ_SIZE(depth)   (depth * sizeof(struct nvme_completion))
> - 
> +
>  /*
>   * We handle AEN commands ourselves and don't even let the
>   * block layer know about them.
> @@ -1332,7 +1332,7 @@ static ssize_t nvme_cmb_show(struct device *dev,
>  {
>   struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
>  
> - return snprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz  : x%08x\n",
> + return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz  : x%08x\n",
>  ndev->cmbloc, ndev->cmbsz);
>  }
>  static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
> -- 
> 2.1.4
>

Re: [PATCH 2/2] nvme: improve cmb sysfs reporting

2016-12-19 Thread Jon Derrick

Minor nit below

On Fri, Dec 16, 2016 at 11:54:51AM -0700, Stephen Bates wrote:
> Add more information to the NVMe CMB sysfs entry. This includes
> information about the CMB size, location and capabilities.
> 
> Signed-off-by: Stephen Bates 
> ---
>  drivers/nvme/host/pci.c | 31 +--
>  include/linux/nvme.h|  8 
>  2 files changed, 37 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index be10860..1f70630 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -98,6 +98,7 @@ struct nvme_dev {
>   void __iomem *cmb;
>   dma_addr_t cmb_dma_addr;
>   u64 cmb_size;
> + u64 cmb_offset;
>   u32 cmbsz;
>   u32 cmbloc;
>   struct nvme_ctrl ctrl;
> @@ -1326,14 +1327,39 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
>   return ret >= 0 ? 0 : ret;
>  }
>  
> +static const char * const cmb_caps[] = {
> + [NVME_CMB_CAP_SQS] = "SQS",
> + [NVME_CMB_CAP_CQS] = "CQS",
> + [NVME_CMB_CAP_LISTS] = "LISTS",
> + [NVME_CMB_CAP_RDS] = "RDS",
> + [NVME_CMB_CAP_WDS] = "WDS",
> +};
> +
>  static ssize_t nvme_cmb_show(struct device *dev,
>struct device_attribute *attr,
>char *buf)
>  {
>   struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
> + unsigned int i, len = 0;
> +
> + len += scnprintf(buf+len, PAGE_SIZE-len,
> + "cmbloc  : 0x%08x\ncmbsz   : 0x%08x\n\n",
> + ndev->cmbloc, ndev->cmbsz);
> +
> + len += scnprintf(buf+len, PAGE_SIZE-len,
> + "OFFSET   : 0x%016llx\nSIZE : %llu Bytes\n" \
> + "DMA ADDR : %pad\n\n",
> + ndev->cmb_offset, ndev->cmb_size,
> + >cmb_dma_addr);
> +
> + for (i = NVME_CMB_CAP_SQS; i <= NVME_CMB_CAP_WDS; i++)
I'd prefer seeing (i = 0; i < ARRAY_SIZE(..); i++) because it provides
automatic bounds checking against future code.

> + len += scnprintf(buf+len, PAGE_SIZE-len, "%-7s: %s\n",
> +  cmb_caps[i],
> +  ((ndev->cmbsz) & (1< +  "NOT SUPPORTED");
> +
> + return len;
>  
> - return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz  : x%08x\n",
> -ndev->cmbloc, ndev->cmbsz);
>  }
>  static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
>  
> @@ -1376,6 +1402,7 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
>  
>   dev->cmb_dma_addr = dma_addr;
>   dev->cmb_size = size;
> + dev->cmb_offset = offset;
>   return cmb;
>  }
>  
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index fc3c242..cd0d324 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -121,6 +121,14 @@ enum {
>  #define NVME_CMB_CQS(cmbsz)  ((cmbsz) & 0x2)
>  #define NVME_CMB_SQS(cmbsz)  ((cmbsz) & 0x1)
>  
> +enum {
> + NVME_CMB_CAP_SQS = 0,
> + NVME_CMB_CAP_CQS,
> + NVME_CMB_CAP_LISTS,
> + NVME_CMB_CAP_RDS,
> + NVME_CMB_CAP_WDS,
> +};
> +
>  /*
>   * Submission and Completion Queue Entry Sizes for the NVM command set.
>   * (In bytes and specified as a power of two (2^n)).
> -- 
> 2.1.4
>

Re: [PATCH v3 2/5] lib: Add Sed-opal library

2016-12-20 Thread Jon Derrick

On Mon, Dec 19, 2016 at 11:28:47PM -0800, Christoph Hellwig wrote:
[snip]
> > +   while (cpos < total) {
> > +   if (!(pos[0] & 0x80)) /* tiny atom */
> > +   token_length = response_parse_tiny(iter, pos);
> > +   else if (!(pos[0] & 0x40)) /* short atom */
> > +   token_length = response_parse_short(iter, pos);
> > +   else if (!(pos[0] & 0x20)) /* medium atom */
> > +   token_length = response_parse_medium(iter, pos);
> > +   else if (!(pos[0] & 0x10)) /* long atom */
> > +   token_length = response_parse_long(iter, pos);
> 
> Please add symbolic names for these constants to the protocol header.
> 
I get tripped up by this logic every time I look at it. I'd almost
rather see something like the following, which more closely follows the
TCG core spec and the optimizing compiler should turn it into something
similar to above anyways:

if (pos[0] <= 0x7F)
token_length = response_parse_tiny..
else if (pos[0] <= 0xBF)
token_length = response_parse_short..
else if (pos[0] <= 0xDF)
token_length = response_parse_medium..
else if (pos[0] <= 0xE3)
token_length = response_parse_long..

Then just add those 0x7F, 0xBF, 0xDF, and 0xE3 constants to proto
header. We could even add in a TCG reserved error for 0xE4-0xEF

Also instead of tracking cpos, we could subtract from total until
negative (if you make it signed). It's similar to how we do length in
nvme_setup_prps.

[snip]
> > --- /dev/null
> > +++ b/lib/sed-opal_internal.h
> 
> This pretty much seem to contain the OPAL protocol defintions, so why
> not opal_proto.h?
Since there might eventually be a whole class of opal-like sed
protocols, why does it make more sense to have opal_proto.h instead of
sed-opal.h or some variation? This is similar to how leds-*.h look to
me. Although I agree that sed-ATA.h would be dishonest since ATA
security doesn't imply a self-encrypting-disk.

[snip]
> > +int sed_save(struct sed_context *sed_ctx, struct sed_key *key)
> > +{
> > +   switch (key->sed_type) {
> > +   case OPAL_LOCK_UNLOCK:
> > +   return opal_save(sed_ctx, key);
> > +   }
> > +
> > +   return -EOPNOTSUPP;
> 
> It seems to me that we should skip this whole sed_type indirections
> and just specify the ioctls directly for OPAL (which would include
> opalite and pyrite as subsets).  The only other protocols of interest
> for Linux would be the ATA "security" plain text passwords, which can
> be handled differently, or enterprise SSC which we can hopefully avoid
> to implement entirely.
I'm on board with this if you think we won't have enough different, but
similar, SED protocols to justify the indirection. In that case you can
ignore the above comment as well.

> 
> > +
> > +int fdev_sed_ioctl(struct file *filep, unsigned int cmd,
> > +  unsigned long arg)
> > +{
> > +   struct sed_key key;
> > +   struct sed_context *sed_ctx;
> > +
> > +   if (!capable(CAP_SYS_ADMIN))
> > +   return -EACCES;
> > +
> > +   if (!filep->f_sedctx || !filep->f_sedctx->ops || !filep->f_sedctx->dev)
> > +   return -ENODEV;
> 
> In the previous version this was called from the block driver which
> could pass in the context (and ops).  Why was this changed?
>

Re: block/sed-opal.c: 2 * bad if tests ?

2017-03-06 Thread Jon Derrick

On 03/06/2017 05:00 AM, David Binderman wrote:
> Hello there,
> 
> 1.
> 
> block/sed-opal.c:2136:40: warning: logical ‘and’ of mutually exclusive tests 
> is always false [-Wlogical-op]
> 
> Source code is
> 
> if (lk_unlk->session.who < OPAL_USER1 &&
> lk_unlk->session.who > OPAL_USER9) {
> 
> 2.
> 
> block/sed-opal.c:2319:37: warning: logical ‘and’ of mutually exclusive tests 
> is always false [-Wlogical-op]
> 
> if (opal_session->who < OPAL_USER1 &&
> opal_session->who > OPAL_USER9) {
> 
> Duplicate.
> 
> Also in the same file:
> 
> [block/sed-opal.c:1034]: (style) Variable 'method' is assigned a value that 
> is never used.
> 
> Regards
> 
> David Binderman
> 
Thanks for the catch(es). Will provide patch shortly

Re: [PATCH 2/3] pci: Generalize is_vmd behavior

2017-08-11 Thread Jon Derrick

On 08/11/2017 11:03 AM, Bjorn Helgaas wrote:
> On Mon, Aug 07, 2017 at 01:57:12PM -0600, Jon Derrick wrote:
>> Generalize is_vmd behavior to remove dependency on domain number
>> checking in pci quirks.
>>
>> Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
>> ---
>>  arch/x86/include/asm/pci.h | 8 +++-
>>  arch/x86/pci/common.c  | 2 +-
>>  drivers/pci/quirks.c   | 2 +-
>>  include/linux/pci.h| 4 
>>  4 files changed, 9 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
>> index 473a729..5c5d54a 100644
>> --- a/arch/x86/include/asm/pci.h
>> +++ b/arch/x86/include/asm/pci.h
>> @@ -60,16 +60,14 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus 
>> *bus)
>>  #define pci_root_bus_fwnode _pci_root_bus_fwnode
>>  #endif
>>  
>> -static inline bool is_vmd(struct pci_bus *bus)
>> -{
>>  #if IS_ENABLED(CONFIG_VMD)
>> +static inline bool pci_bus_is_vmd(struct pci_bus *bus)
>> +{
>>  struct pci_sysdata *sd = bus->sysdata;
>>  
>>  return sd->vmd_domain;
>> -#else
>> -return false;
>> -#endif
>>  }
>> +#endif
>>  
>>  /* Can be used to override the logic in pci_scan_bus for skipping
>> already-configured bus numbers - to be used for buggy BIOSes
>> diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
>> index dbe2132..18b2277 100644
>> --- a/arch/x86/pci/common.c
>> +++ b/arch/x86/pci/common.c
>> @@ -662,7 +662,7 @@ static void set_dma_domain_ops(struct pci_dev *pdev) {}
>>  
>>  static void set_dev_domain_options(struct pci_dev *pdev)
>>  {
>> -if (is_vmd(pdev->bus))
>> +if (pci_bus_is_vmd(pdev->bus))
>>  pdev->hotplug_user_indicators = 1;
>>  }
>>  
>> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
>> index 6967c6b..ba47995 100644
>> --- a/drivers/pci/quirks.c
>> +++ b/drivers/pci/quirks.c
>> @@ -4666,7 +4666,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, 
>> quirk_intel_qat_vf_cap);
>>  static void quirk_no_aersid(struct pci_dev *pdev)
>>  {
>>  /* VMD Domain */
>> -if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1)
>> +if (pci_bus_is_vmd(pdev->bus))
> 
> I like this part ...
> 
>>  pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID;
>>  }
>>  DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid);
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index 4869e66..0299d8b 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -1471,6 +1471,10 @@ static inline int pci_proc_domain(struct pci_bus 
>> *bus) { return 0; }
>>  static inline int pci_get_new_domain_nr(void) { return -ENOSYS; }
>>  #endif /* CONFIG_PCI_DOMAINS */
>>  
>> +#if !IS_ENABLED(CONFIG_VMD)
>> +static inline bool pci_bus_is_vmd(struct pci_bus *bus) { return false; }
>> +#endif
> 
> But not so much this part.  VMD is (AFAIK) still fundamentally an
> x86-only thing, so I'd like it better if this could all be within
> arch/x86.  Could this be done by moving quirk_no_aersid() to
> arch/x86/pci/fixup.c?
> 
Thanks for pointing this out. I'll think of something different and
localize it to the x86 code domain.

> BTW, CONFIG_VMD in drivers/pci/host/Kconfig is currently "depends on
> SRCU".  I'm not a Kconfig expert, but that doesn't seem like an
> intuitive connection.  And it's the only such dependency on SRCU in
> the tree -- most other places use "select SRCU", which makes more
> sense to me.
I agree - most places use select SRCU. I'll add that to v2.

> 
>>  /*
>>   * Generic implementation for PCI domain support. If your
>>   * architecture does not need custom management of PCI
>> -- 
>> 2.9.4
>>

Re: [PATCH 3/3] iommu: prevent VMD child devices from being remapping targets

2017-08-11 Thread Jon Derrick

Hi Robin, thanks for the reply.

On 08/11/2017 12:25 PM, Robin Murphy wrote:
> On 07/08/17 20:57, Jon Derrick wrote:
>> VMD child devices must use the VMD endpoint's ID as the DMA source.
>> Because of this, there needs to be a way to link the parent VMD
>> endpoint's DMAR domain to the VMD child devices' DMAR domain such that
>> attaching and detaching child devices modify the endpoint's DMAR mapping
>> and prevents early detaching.
> > That sounds like either pci_device_group() needs modifying, or perhaps
> that intel-iommu needs its own extended iommu_ops::device_group
> implementation, to ensure that VMD child devices get put in the same
> group as their parent - if they share requester IDs they can't feasibly
> be attached to different domains anyway.
> 
> Robin.
Yes it seems like that to me too. I have a high-level understanding of
the changes required but not too much in the nitty-gritty details. It's
a bit more complicated anyways because VMD emerges a set of root ports,
and AFAICT, PCI ACS end at the root ports and iommu groups rely on ACS
to group devices. Either way it's not within the scope of VMD.

[PATCH 1/3] MAINTAINERS: Add Jonathan Derrick as VMD maintainer

2017-08-07 Thread Jon Derrick

Add myself as VMD maintainer

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index f66488d..3ec39df 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10090,6 +10090,7 @@ F:  drivers/pci/dwc/*imx6*
 
 PCI DRIVER FOR INTEL VOLUME MANAGEMENT DEVICE (VMD)
 M: Keith Busch <keith.bu...@intel.com>
+M: Jonathan Derrick <jonathan.derr...@intel.com>
 L: linux-...@vger.kernel.org
 S: Supported
 F: drivers/pci/host/vmd.c
-- 
2.9.4

[PATCH 2/3] pci: Generalize is_vmd behavior

2017-08-07 Thread Jon Derrick

Generalize is_vmd behavior to remove dependency on domain number
checking in pci quirks.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 arch/x86/include/asm/pci.h | 8 +++-
 arch/x86/pci/common.c  | 2 +-
 drivers/pci/quirks.c   | 2 +-
 include/linux/pci.h| 4 
 4 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 473a729..5c5d54a 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -60,16 +60,14 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus 
*bus)
 #define pci_root_bus_fwnode_pci_root_bus_fwnode
 #endif
 
-static inline bool is_vmd(struct pci_bus *bus)
-{
 #if IS_ENABLED(CONFIG_VMD)
+static inline bool pci_bus_is_vmd(struct pci_bus *bus)
+{
struct pci_sysdata *sd = bus->sysdata;
 
return sd->vmd_domain;
-#else
-   return false;
-#endif
 }
+#endif
 
 /* Can be used to override the logic in pci_scan_bus for skipping
already-configured bus numbers - to be used for buggy BIOSes
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index dbe2132..18b2277 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -662,7 +662,7 @@ static void set_dma_domain_ops(struct pci_dev *pdev) {}
 
 static void set_dev_domain_options(struct pci_dev *pdev)
 {
-   if (is_vmd(pdev->bus))
+   if (pci_bus_is_vmd(pdev->bus))
pdev->hotplug_user_indicators = 1;
 }
 
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 6967c6b..ba47995 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -4666,7 +4666,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, 
quirk_intel_qat_vf_cap);
 static void quirk_no_aersid(struct pci_dev *pdev)
 {
/* VMD Domain */
-   if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1)
+   if (pci_bus_is_vmd(pdev->bus))
pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID;
 }
 DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4869e66..0299d8b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1471,6 +1471,10 @@ static inline int pci_proc_domain(struct pci_bus *bus) { 
return 0; }
 static inline int pci_get_new_domain_nr(void) { return -ENOSYS; }
 #endif /* CONFIG_PCI_DOMAINS */
 
+#if !IS_ENABLED(CONFIG_VMD)
+static inline bool pci_bus_is_vmd(struct pci_bus *bus) { return false; }
+#endif
+
 /*
  * Generic implementation for PCI domain support. If your
  * architecture does not need custom management of PCI
-- 
2.9.4

[PATCH 3/3] iommu: prevent VMD child devices from being remapping targets

2017-08-07 Thread Jon Derrick

VMD child devices must use the VMD endpoint's ID as the DMA source.
Because of this, there needs to be a way to link the parent VMD
endpoint's DMAR domain to the VMD child devices' DMAR domain such that
attaching and detaching child devices modify the endpoint's DMAR mapping
and prevents early detaching.

This is outside the scope of VMD, so disable binding child devices to
prevent unforeseen issues. This functionality may be implemented in the
future.

This patch prevents VMD child devices from returning an IOMMU, which
prevents it from exposing iommu_group sysfs directories and subsequent
binding by userspace-access drivers such as VFIO.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 drivers/iommu/intel-iommu.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 687f18f..651a6cd 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -905,6 +905,11 @@ static struct intel_iommu *device_to_iommu(struct device 
*dev, u8 *bus, u8 *devf
 * the PF instead to find the IOMMU. */
pf_pdev = pci_physfn(pdev);
dev = _pdev->dev;
+
+   /* VMD child devices currently cannot be handled individually */
+   if (pci_bus_is_vmd(pdev->bus))
+   return NULL;
+
segment = pci_domain_nr(pdev->bus);
} else if (has_acpi_companion(dev))
dev = _COMPANION(dev)->dev;
-- 
2.9.4

Re: [RFC 2/3] module: Ignore delete_id parameter

2017-09-28 Thread Jon Derrick

On 09/28/2017 12:03 AM, Dan Williams wrote:
> On Wed, Sep 27, 2017 at 1:40 PM, Jon Derrick <jonathan.derr...@intel.com> 
> wrote:
>> The PCI driver delete_id parameter is handled in each individual driver
>> registration callback.
>>
>> Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
>> ---
>>  kernel/module.c | 7 +++
>>  1 file changed, 7 insertions(+)
>>
>> diff --git a/kernel/module.c b/kernel/module.c
>> index de66ec8..2b2dccf 100644
>> --- a/kernel/module.c
>> +++ b/kernel/module.c
>> @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char 
>> *val, const char *modname,
>> return 0;
>> }
>>
>> +   /*
>> +* Ignore driver delete list arguments. They are handled by driver
>> +* registration callbacks
>> +*/
>> +   if (strcmp(param, "delete_id") == 0)
>> +   return 0;
>> +
> 
> Does this preclude something like:
> 
> modprobe ahci delete_id=1234:5678?
> 

It does seem like it would. I can look into calling into the pci
callback for this, but val is a struct module here and I haven't figured
out the plumbing to get the [correct] driver from that.

Maybe if I enforce the format of 'modprobe ahci ahci.delete_id=' to
ensure the driver is specified (and would be required in cases with
multi-driver modules).

Re: [RFC 1/3] PCI: pci-driver: Introduce pci device delete list

2017-09-28 Thread Jon Derrick

Hi Greg,

On 09/28/2017 03:09 AM, Greg Kroah-Hartman wrote:
> On Wed, Sep 27, 2017 at 04:40:20PM -0400, Jon Derrick wrote:
>> This patch introduces a new kernel command line parameter to mask pci
>> device ids from pci driver id tables. This prevents masked devices from
>> automatically binding to both built-in and module drivers.
>>
>> Devices can be later attached through the driver's sysfs new_id
>> inteface.
>>
>> The use cases for this are primarily for debugging, eg, being able to
>> prevent attachment before probes are set up. It can also be used to mask
>> off faulty built-in hardware or faulty simulated hardware.
>>
>> Another use case is to prevent attachment of devices which will be
>> passed to VMs, shortcutting the detachment effort.
> 
> Is the "shortcut" really that big of a deal?  unbind actually causes
> problems?  Is this an attempt to deal with devices being handled by more
> than one driver and then you want to manually bind it later on?
> 
>> The format is similar to the sysfs new_id format. Device ids are
>> specified with:
>>
>> :[:SVVV:SDDD][:][:]
>>
>> Where:
>>  = Vendor ID
>>  = Device ID
>> SVVV = Subvendor ID
>> SDDD = Subdevice ID
>>  = Class
>>  = Class Mask
>>
>> IDs can be chained with commas.
>>
>> Examples:
>>  .delete_id=1234:5678
>>  .delete_id=:
>>  .delete_id=::::010802
>>  .delete_id=1234:5678,abcd:ef01,2345:
> 
> What about drivers that handle more than one bus type (i.e. USB and
> PCI?)  This format is specific to PCI, yet you are defining it as a
> "global" for all drivers :(
> 
I was hoping to extend it to other bus types as well, but just wanted
some early feedback on the idea. Pci was the easier implementation for
me. Could prepending pci:, usb:, etc on the format be an option?

> This feels hacky, what is the real reason for this?  It feels like we
> have so many different ways to blacklist and unbind and bind devices to
> drivers already, why add yet-another way?
> 
I ran into an issue a while back where I needed to disable a device from
a built-in driver to perform a regression test. I worked around that
issue by doing an initcall_blacklist on the pci-driver declaration, but
that also preventing later binding because the driver was never registered.

I've also had issues with remote systems where pluggable devices were
broken or otherwise non-responsive, and it would have been nice to have
been able to keep them from binding on module loading as a temporary
workaround until someone could have removed those devices (though
impossible on built-in hardware).

I'm not sure about hacky; I think the implementation in this patch (1/3)
is pretty clean :). I'm not familiar with all the different ways of
blacklisting. Is there another way to work around the issues I listed above?

I understand the concern about having it exist in the format .
and only supporting one or a few bus types. I have another set that
extends the pci= parameter instead.

> thanks,
> 
> greg k-h
> 

Best,
Jon

Re: [RFC 2/3] module: Ignore delete_id parameter

2017-09-28 Thread Jon Derrick

On 09/28/2017 03:02 AM, Greg Kroah-Hartman wrote:
> On Wed, Sep 27, 2017 at 04:40:21PM -0400, Jon Derrick wrote:
>> The PCI driver delete_id parameter is handled in each individual driver
>> registration callback.
>>
>> Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
>> ---
>>  kernel/module.c | 7 +++
>>  1 file changed, 7 insertions(+)
>>
>> diff --git a/kernel/module.c b/kernel/module.c
>> index de66ec8..2b2dccf 100644
>> --- a/kernel/module.c
>> +++ b/kernel/module.c
>> @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char 
>> *val, const char *modname,
>>  return 0;
>>  }
>>  
>> +/*
>> + * Ignore driver delete list arguments. They are handled by driver
>> + * registration callbacks
>> + */
>> +if (strcmp(param, "delete_id") == 0)
>> +return 0;
> 
> Why?  This is only for the PCI core as you have defined it in this
> patchset, but you just broke this module id for all other kernel modules
> in the system :(
> 
> If you want to do this, you need to provide this feature for _all_
> kernel drivers...
> 
> thanks,
> 
> greg k-h
> 
Yes I'm not particularly happy about this one either. I will make this
more robust if the blacklisting idea is sound.

[RFC 0/3] Introduce PCI device blacklisting

2017-09-27 Thread Jon Derrick

This set introduces a method to keep devices from matching driver id
tables.

Please see 0001 for more details of the motivation and implementation.

This set currently applies to Bjorn's next (v4.14-rc1) branch.


Jon Derrick (3):
  PCI: pci-driver: Introduce pci device delete list
  module: Ignore delete_id parameter
  Documentation: Add pci device delete_id interface

 Documentation/admin-guide/kernel-parameters.txt |  13 ++
 drivers/pci/pci-driver.c| 253 +++-
 include/linux/pci.h |   1 +
 kernel/module.c |   7 +
 4 files changed, 272 insertions(+), 2 deletions(-)

-- 
1.8.3.1

[RFC 1/3] PCI: pci-driver: Introduce pci device delete list

2017-09-27 Thread Jon Derrick

This patch introduces a new kernel command line parameter to mask pci
device ids from pci driver id tables. This prevents masked devices from
automatically binding to both built-in and module drivers.

Devices can be later attached through the driver's sysfs new_id
inteface.

The use cases for this are primarily for debugging, eg, being able to
prevent attachment before probes are set up. It can also be used to mask
off faulty built-in hardware or faulty simulated hardware.

Another use case is to prevent attachment of devices which will be
passed to VMs, shortcutting the detachment effort.

The format is similar to the sysfs new_id format. Device ids are
specified with:

:[:SVVV:SDDD][:][:]

Where:
 = Vendor ID
 = Device ID
SVVV = Subvendor ID
SDDD = Subdevice ID
 = Class
 = Class Mask

IDs can be chained with commas.

Examples:
.delete_id=1234:5678
.delete_id=:
.delete_id=::::010802
.delete_id=1234:5678,abcd:ef01,2345:

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 drivers/pci/pci-driver.c | 253 ++-
 include/linux/pci.h  |   1 +
 2 files changed, 252 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 11bd267..7acdf13 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "pci.h"
 
 struct pci_dynid {
@@ -27,6 +28,250 @@ struct pci_dynid {
struct pci_device_id id;
 };
 
+static struct pci_device_id *pci_match_deleted_ids(struct pci_driver *drv,
+  struct pci_dev *dev,
+  bool inverse)
+{
+   struct pci_dynid *deleteid;
+   struct pci_device_id *found_id = NULL;
+
+   spin_lock(>deleteids.lock);
+   list_for_each_entry(deleteid, >deleteids.list, node) {
+   if (!inverse && pci_match_one_device(>id, dev)) {
+   found_id = >id;
+   break;
+   }
+   if (inverse && !pci_match_one_device(>id, dev)) {
+   found_id = >id;
+   break;
+   }
+   }
+   spin_unlock(>deleteids.lock);
+
+   return found_id;
+}
+
+/**
+ * pci_match_non_deleted_ids - match dev against not-deleted ids
+ * @ids: array of PCI device id structures to search in
+ * @dev: the PCI device structure to match against.
+ *
+ * Finds devices in driver id table not matching the delete list and tries to
+ * match them individually to dev. Returns the matching pci_dev_id structure,
+ * %NULL if there is no match, of -errno if there was a failure to allocate
+ * memory for the temporary pci_dev
+ */
+static struct pci_device_id *pci_match_non_deleted_ids(struct pci_driver *drv,
+  struct pci_dev *dev)
+{
+   const struct pci_device_id *ids = drv->id_table;
+   struct pci_device_id *match = NULL;
+   struct pci_dev *tmpdev;
+
+   tmpdev = kzalloc(sizeof(*tmpdev), GFP_KERNEL);
+   if (!tmpdev)
+   return ERR_PTR(-ENOMEM);
+
+   while (ids->vendor || ids->subvendor || ids->class_mask) {
+   struct pci_device_id *found_id = NULL;
+
+   tmpdev->vendor = ids->vendor;
+   tmpdev->device = ids->device,
+   tmpdev->subsystem_vendor = ids->subvendor,
+   tmpdev->subsystem_device = ids->subdevice,
+   tmpdev->class = ids->class;
+
+   found_id = pci_match_deleted_ids(drv, tmpdev, true);
+   if (found_id) {
+   const struct pci_device_id id[2] = { *found_id, {0,} };
+   if (pci_match_id(id, dev)) {
+   match = found_id;
+   break;
+   }
+   }
+   ids++;
+   }
+
+   kfree(tmpdev);
+
+   return match;
+}
+
+/**
+ * pci_add_delete_id - add a new PCI device ID to a built-in driver's blacklist
+ * @drv: target pci driver
+ * @vendor: PCI vendor ID
+ * @device: PCI device ID
+ * @subvendor: PCI subvendor ID
+ * @subdevice: PCI subdevice ID
+ * @class: PCI class
+ * @class_mask: PCI class mask
+ *
+ * Adds a new dynamic pci device ID to this driver's delete list, preventing
+ * the device from attaching to built-in drivers
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pci_add_delete_id(struct pci_driver *drv,
+ unsigned int vendor, unsigned int device,
+ unsigned int subvendor, unsigned int subdevice,
+ unsigned int class, unsigned int class_mask)
+{

[RFC 3/3] Documentation: Add pci device delete_id interface

2017-09-27 Thread Jon Derrick

Document the .delete_id= cmdline interface for masking built-in
and module device id tables.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 13 +
 1 file changed, 13 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 0549662..862c8b5 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -789,6 +789,19 @@
Defaults to the default architecture's huge page size
if not specified.
 
+   driver.delete_id=
+   [PCI]
+   Specified pci device format(s) will mask built-in
+   driver id tables, preventing devices from
+   automicatically attaching to built-in or module
+   drivers.  Later attachment should be done by adding the
+   deleted id back through the "new_id" sysfs interface.
+   Format: :[::]\
+   [:class][:class_mask][, ...]
+   PCI_ANY_ID masks should use the full 32-bit format.
+   Examples: virtio-pci.delete_id=:
+ nvme.delete_id=1234:5678,:
+
dhash_entries=  [KNL]
Set number of hash buckets for dentry cache.
 
-- 
1.8.3.1

[RFC 2/3] module: Ignore delete_id parameter

2017-09-27 Thread Jon Derrick

The PCI driver delete_id parameter is handled in each individual driver
registration callback.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 kernel/module.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/kernel/module.c b/kernel/module.c
index de66ec8..2b2dccf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char 
*val, const char *modname,
return 0;
}
 
+   /*
+* Ignore driver delete list arguments. They are handled by driver
+* registration callbacks
+*/
+   if (strcmp(param, "delete_id") == 0)
+   return 0;
+
/* Check for magic 'dyndbg' arg */
ret = ddebug_dyndbg_module_param_cb(param, val, modname);
if (ret != 0)
-- 
1.8.3.1

[PATCH v2 1/4] MAINTAINERS: Add Jonathan Derrick as VMD maintainer

2017-08-17 Thread Jon Derrick

Add myself as VMD maintainer

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
Acked-by: Keith Busch <keith.bu...@intel.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index f66488d..3ec39df 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10090,6 +10090,7 @@ F:  drivers/pci/dwc/*imx6*
 
 PCI DRIVER FOR INTEL VOLUME MANAGEMENT DEVICE (VMD)
 M: Keith Busch <keith.bu...@intel.com>
+M: Jonathan Derrick <jonathan.derr...@intel.com>
 L: linux-...@vger.kernel.org
 S: Supported
 F: drivers/pci/host/vmd.c
-- 
2.9.4

[PATCH v2 0/4] VMD fixups

2017-08-17 Thread Jon Derrick

Mostly just cleanup in this revision, eg, trying to limit scope of vmd code to
x86

Previous:
https://patchwork.kernel.org/patch/9886095/
https://patchwork.kernel.org/patch/9886097/
https://patchwork.kernel.org/patch/9886101/


Jon Derrick (4):
  MAINTAINERS: Add Jonathan Derrick as VMD maintainer
  pci/x86: Move VMD quirks to x86 fixups
  x86/PCI: Use is_vmd rather than relying on the domain number
  iommu: Prevent VMD child devices from being remapping targets

 MAINTAINERS |  1 +
 arch/x86/pci/fixup.c| 18 ++
 drivers/iommu/intel-iommu.c |  5 +
 drivers/pci/quirks.c| 17 -
 4 files changed, 24 insertions(+), 17 deletions(-)

-- 
2.9.4

[PATCH v2 3/4] x86/PCI: Use is_vmd rather than relying on the domain number

2017-08-17 Thread Jon Derrick

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 arch/x86/pci/fixup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index ca4b02e5..1ed2fbf 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -628,7 +628,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, 
quirk_apple_mbp_poweroff);
 static void quirk_no_aersid(struct pci_dev *pdev)
 {
/* VMD Domain */
-   if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1)
+   if (is_vmd(pdev->bus))
pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID;
 }
 DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid);
-- 
2.9.4

[PATCH v2 4/4] iommu: Prevent VMD child devices from being remapping targets

2017-08-17 Thread Jon Derrick

VMD child devices must use the VMD endpoint's ID as the requester.
Because of this, there needs to be a way to link the parent VMD
endpoint's iommu group and associated mappings to the VMD child devices
such that attaching and detaching child devices modify the endpoint's
mappings, while preventing early detaching on a singular device removal
or unbinding.

The reassignment of individual VMD child devices devices to VMs is
outside the scope of VMD, but may be implemented in the future. For now
it is best to prevent any such attempts.

This patch prevents VMD child devices from returning an IOMMU, which
prevents it from exposing an iommu_group sysfs directories and allowing
subsequent binding by userspace-access drivers such as VFIO.

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 drivers/iommu/intel-iommu.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 687f18f..94353a6e 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -901,6 +901,11 @@ static struct intel_iommu *device_to_iommu(struct device 
*dev, u8 *bus, u8 *devf
struct pci_dev *pf_pdev;
 
pdev = to_pci_dev(dev);
+
+   /* VMD child devices currently cannot be handled individually */
+   if (is_vmd(pdev->bus))
+   return NULL;
+
/* VFs aren't listed in scope tables; we need to look up
 * the PF instead to find the IOMMU. */
pf_pdev = pci_physfn(pdev);
-- 
2.9.4

[PATCH v2 2/4] pci/x86: Move VMD quirks to x86 fixups

2017-08-17 Thread Jon Derrick

VMD currently only exists for Intel x86 products

Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
---
 arch/x86/pci/fixup.c | 18 ++
 drivers/pci/quirks.c | 17 -
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 11e4074..ca4b02e5 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -618,3 +618,21 @@ static void quirk_apple_mbp_poweroff(struct pci_dev *pdev)
dev_info(dev, "can't work around MacBook Pro poweroff issue\n");
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, 
quirk_apple_mbp_poweroff);
+
+/*
+ * VMD-enabled root ports will change the source ID for all messages
+ * to the VMD device. Rather than doing device matching with the source
+ * ID, the AER driver should traverse the child device tree, reading
+ * AER registers to find the faulting device.
+ */
+static void quirk_no_aersid(struct pci_dev *pdev)
+{
+   /* VMD Domain */
+   if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1)
+   pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid);
+
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index f1c9852..073baba 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -4671,23 +4671,6 @@ static void quirk_intel_qat_vf_cap(struct pci_dev *pdev)
 }
 DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, quirk_intel_qat_vf_cap);
 
-/*
- * VMD-enabled root ports will change the source ID for all messages
- * to the VMD device. Rather than doing device matching with the source
- * ID, the AER driver should traverse the child device tree, reading
- * AER registers to find the faulting device.
- */
-static void quirk_no_aersid(struct pci_dev *pdev)
-{
-   /* VMD Domain */
-   if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1)
-   pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID;
-}
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid);
-
 /* FLR may cause some 82579 devices to hang. */
 static void quirk_intel_no_flr(struct pci_dev *dev)
 {
-- 
2.9.4

[PATCH] ext4: Check superblock mapped prior to committing

2018-06-29 Thread Jon Derrick

This patch attempts to close a hole leading to a BUG seen with hot
removals during writes [1].

A block device (NVME namespace in this test case) is formatted to EXT4
without partitions. It's mounted and write I/O is run to a file, then
the device is hot removed from the slot. The superblock attempts to be
written to the drive which is no longer present.

The typical chain of events leading to the BUG:
ext4_commit_super()
  __sync_dirty_buffer()
submit_bh()
  submit_bh_wbc()
BUG_ON(!buffer_mapped(bh));

This fix checks for the superblock's buffer head being mapped prior to
syncing.

[1] https://www.spinics.net/lists/linux-ext4/msg56527.html

Signed-off-by: Jon Derrick 
---
 fs/ext4/super.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0c4c220..ee33233 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4736,6 +4736,14 @@ static int ext4_commit_super(struct super_block *sb, int 
sync)
 
if (!sbh || block_device_ejected(sb))
return error;
+
+   /*
+* The superblock bh should be mapped, but it might not be if the
+* device was hot-removed. Not much we can do but fail the I/O.
+*/
+   if (!buffer_mapped(sbh))
+   return error;
+
/*
 * If the file system is mounted read-only, don't update the
 * superblock write time.  This avoids updating the superblock
-- 
2.7.4

[PATCH] PCI: Equalize hotplug memory for non/occupied slots

2018-07-25 Thread Jon Derrick

Currently, a hotplug bridge will be given hpmemsize additional memory if
available, in order to satisfy any future hotplug allocation
requirements.

These calculations don't consider the current memory size of the hotplug
bridge/slot, so hotplug bridges/slots which have downstream devices will
get their current allocation in addition to the hpmemsize value.

This makes for possibly undesirable results with a mix of unoccupied and
occupied slots (ex, with hpmemsize=2M):

02:03.0 PCI bridge: <-- Occupied
Memory behind bridge: d620-d64f [size=3M]
02:04.0 PCI bridge: <-- Unoccupied
Memory behind bridge: d650-d66f [size=2M]

This change considers the current allocation size when using the
hpmemsize parameter to make the reservations predictable for the mix of
unoccupied and occupied slots:

02:03.0 PCI bridge: <-- Occupied
Memory behind bridge: d620-d63f [size=2M]
02:04.0 PCI bridge: <-- Unoccupied
Memory behind bridge: d640-d65f [size=2M]

Signed-off-by: Jon Derrick 
---
Original RFC here:
https://patchwork.ozlabs.org/patch/945374/

I split this bit out from the RFC while awaiting the pci string handling
enhancements to handle per-device settings

Changed from RFC is a simpler algo

 drivers/pci/setup-bus.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 79b1824..5ae39e6 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -831,7 +831,8 @@ static resource_size_t calculate_iosize(resource_size_t 
size,
 
 static resource_size_t calculate_memsize(resource_size_t size,
resource_size_t min_size,
-   resource_size_t size1,
+   resource_size_t add_size,
+   resource_size_t children_add_size,
resource_size_t old_size,
resource_size_t align)
 {
@@ -841,7 +842,7 @@ static resource_size_t calculate_memsize(resource_size_t 
size,
old_size = 0;
if (size < old_size)
size = old_size;
-   size = ALIGN(size + size1, align);
+   size = ALIGN(max(size, add_size) + children_add_size, align);
return size;
 }
 
@@ -1079,12 +1080,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
 
min_align = calculate_mem_align(aligns, max_order);
min_align = max(min_align, window_alignment(bus, b_res->flags));
-   size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), 
min_align);
+   size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), 
min_align);
add_align = max(min_align, add_align);
-   if (children_add_size > add_size)
-   add_size = children_add_size;
-   size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
-   calculate_memsize(size, min_size, add_size,
+   size1 = (!realloc_head || (realloc_head && !add_size && 
!children_add_size)) ? size0 :
+   calculate_memsize(size, min_size, add_size, children_add_size,
resource_size(b_res), add_align);
if (!size0 && !size1) {
if (b_res->start || b_res->end)
-- 
1.8.3.1

[PATCH v2] PCI hotplug Eq v2

2018-08-30 Thread Jon Derrick

Hi Bjorn,

Sorry for the delay on this one and pushing it after RC1.
Feel free to queue it up for 4.20 if it looks fine.

I've added comments to the git log and source explaining why calculate_iosize
was left unchanged. Basically I could not synthesize a condition where it would
have affected the topology.

v1->v2: Comments

Jon Derrick (1):
  PCI: Equalize hotplug memory for non/occupied slots

 drivers/pci/setup-bus.c | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

-- 
1.8.3.1

[PATCH v2] PCI: Equalize hotplug memory for non/occupied slots

2018-08-30 Thread Jon Derrick

Currently, a hotplug bridge will be given hpmemsize additional memory if
available, in order to satisfy any future hotplug allocation
requirements.

These calculations don't consider the current memory size of the hotplug
bridge/slot, so hotplug bridges/slots which have downstream devices will
get their current allocation in addition to the hpmemsize value.

This makes for possibly undesirable results with a mix of unoccupied and
occupied slots (ex, with hpmemsize=2M):

02:03.0 PCI bridge: <-- Occupied
Memory behind bridge: d620-d64f [size=3M]
02:04.0 PCI bridge: <-- Unoccupied
Memory behind bridge: d650-d66f [size=2M]

This change considers the current allocation size when using the
hpmemsize parameter to make the reservations predictable for the mix of
unoccupied and occupied slots:

02:03.0 PCI bridge: <-- Occupied
Memory behind bridge: d620-d63f [size=2M]
02:04.0 PCI bridge: <-- Unoccupied
Memory behind bridge: d640-d65f [size=2M]

The calculation for IO (hpiosize) should be similar, but platform
firmwares I've encountered (including QEMU) provide strict allocations
for IO and would not provide free IO resources for hotplug buses
in order to prove this calculation.

Signed-off-by: Jon Derrick 
---
 drivers/pci/setup-bus.c | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 79b1824..70d0aba 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -831,7 +831,8 @@ static resource_size_t calculate_iosize(resource_size_t 
size,
 
 static resource_size_t calculate_memsize(resource_size_t size,
resource_size_t min_size,
-   resource_size_t size1,
+   resource_size_t add_size,
+   resource_size_t children_add_size,
resource_size_t old_size,
resource_size_t align)
 {
@@ -841,7 +842,15 @@ static resource_size_t calculate_memsize(resource_size_t 
size,
old_size = 0;
if (size < old_size)
size = old_size;
-   size = ALIGN(size + size1, align);
+
+   /*
+* Consider the current allocation size when adding size for extra
+* hotplug memory. This ensures that occupied slots don't receive
+* unneccessary memory allocations in addition to their current size.
+* The calculation should be similar for calculate_iosize, but was
+* unable to be tested.
+*/
+   size = ALIGN(max(size, add_size) + children_add_size, align);
return size;
 }
 
@@ -1079,12 +1088,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
 
min_align = calculate_mem_align(aligns, max_order);
min_align = max(min_align, window_alignment(bus, b_res->flags));
-   size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), 
min_align);
+   size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), 
min_align);
add_align = max(min_align, add_align);
-   if (children_add_size > add_size)
-   add_size = children_add_size;
-   size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
-   calculate_memsize(size, min_size, add_size,
+   size1 = (!realloc_head || (realloc_head && !add_size && 
!children_add_size)) ? size0 :
+   calculate_memsize(size, min_size, add_size, children_add_size,
resource_size(b_res), add_align);
if (!size0 && !size1) {
if (b_res->start || b_res->end)
-- 
1.8.3.1

[PATCH] PCI/AER: Fix an AER enabling/disabling race

2018-09-01 Thread Jon Derrick

There is a sequence with non-ACPI root ports where the AER driver can
enable error reporting on the tree before port drivers have bound to
ports on the tree. The port driver assumes the AER driver will set up
error reporting afterwards, so instead add a check if error reporting
was set up first.

Example:
[  343.790573] pcieport 1:00:00.0: pci_disable_pcie_error_reporting
[  343.809812] pcieport 1:00:00.0: pci_enable_pcie_error_reporting
[  343.819506] pci 1:01:00.0: pci_enable_pcie_error_reporting
[  343.828814] pci 1:02:00.0: pci_enable_pcie_error_reporting
[  343.838089] pci 1:02:01.0: pci_enable_pcie_error_reporting
[  343.847478] pci 1:02:02.0: pci_enable_pcie_error_reporting
[  343.856659] pci 1:02:03.0: pci_enable_pcie_error_reporting
[  343.865794] pci 1:02:04.0: pci_enable_pcie_error_reporting
[  343.874875] pci 1:02:05.0: pci_enable_pcie_error_reporting
[  343.883918] pci 1:02:06.0: pci_enable_pcie_error_reporting
[  343.892922] pci 1:02:07.0: pci_enable_pcie_error_reporting
[  343.918900] pcieport 1:01:00.0: pci_disable_pcie_error_reporting
[  343.968426] pcieport 1:02:00.0: pci_disable_pcie_error_reporting
[  344.028179] pcieport 1:02:01.0: pci_disable_pcie_error_reporting
[  344.091269] pcieport 1:02:02.0: pci_disable_pcie_error_reporting
[  344.156473] pcieport 1:02:03.0: pci_disable_pcie_error_reporting
[  344.238042] pcieport 1:02:04.0: pci_disable_pcie_error_reporting
[  344.321864] pcieport 1:02:05.0: pci_disable_pcie_error_reporting
[  344.411601] pcieport 1:02:06.0: pci_disable_pcie_error_reporting
[  344.505332] pcieport 1:02:07.0: pci_disable_pcie_error_reporting
[  344.621824] nvme 1:06:00.0: pci_enable_pcie_error_reporting

Signed-off-by: Jon Derrick 
---
 drivers/pci/pcie/aer.c  | 1 +
 drivers/pci/pcie/portdrv_core.c | 5 -
 include/linux/pci.h | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 83180ed..a4e36b6 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1333,6 +1333,7 @@ static int set_device_error_reporting(struct pci_dev 
*dev, void *data)
if (enable)
pcie_set_ecrc_checking(dev);
 
+   dev->aer_configured = 1;
return 0;
 }
 
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 7c37d81..f5de554 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -224,8 +224,11 @@ static int get_port_device_capability(struct pci_dev *dev)
/*
 * Disable AER on this port in case it's been enabled by the
 * BIOS (the AER service driver will enable it when necessary).
+* Don't disable it if the AER service driver has already
+* enabled it from the root port bus walking
 */
-   pci_disable_pcie_error_reporting(dev);
+   if (!dev->aer_configured)
+   pci_disable_pcie_error_reporting(dev);
}
 #endif
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e72ca8d..c071622 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -402,6 +402,7 @@ struct pci_dev {
unsigned inthas_secondary_link:1;
unsigned intnon_compliant_bars:1;   /* Broken BARs; ignore them */
unsigned intis_probed:1;/* Device probing in progress */
+   unsigned intaer_configured:1;   /* AER configured for device */
pci_dev_flags_t dev_flags;
atomic_tenable_cnt; /* pci_enable_device has been called */
 
-- 
1.8.3.1

[PATCH] PCI/portdrv: Enable error reporting on managed ports

2018-09-04 Thread Jon Derrick

During probe, the port driver will disable error reporting and assumes
it will be enabled later by the AER driver's pci_walk_bus() sequence.
This may not be the case for host-bridge enabled root ports, who will
enable first error reporting on the bus during the root port probe, and
then disable error reporting on downstream devices during subsequent
probing of the bus.

A hotplugged port device may also fail to enable error reporting as the
AER driver has already run on the root bus.

Check for these conditions and enable error reporting during portdrv
probing.

Example case:
[  343.790573] pcieport 1:00:00.0: pci_disable_pcie_error_reporting
[  343.809812] pcieport 1:00:00.0: pci_enable_pcie_error_reporting
[  343.819506] pci 1:01:00.0: pci_enable_pcie_error_reporting
[  343.828814] pci 1:02:00.0: pci_enable_pcie_error_reporting
[  343.838089] pci 1:02:01.0: pci_enable_pcie_error_reporting
[  343.847478] pci 1:02:02.0: pci_enable_pcie_error_reporting
[  343.856659] pci 1:02:03.0: pci_enable_pcie_error_reporting
[  343.865794] pci 1:02:04.0: pci_enable_pcie_error_reporting
[  343.874875] pci 1:02:05.0: pci_enable_pcie_error_reporting
[  343.883918] pci 1:02:06.0: pci_enable_pcie_error_reporting
[  343.892922] pci 1:02:07.0: pci_enable_pcie_error_reporting
[  343.918900] pcieport 1:01:00.0: pci_disable_pcie_error_reporting
[  343.968426] pcieport 1:02:00.0: pci_disable_pcie_error_reporting
[  344.028179] pcieport 1:02:01.0: pci_disable_pcie_error_reporting
[  344.091269] pcieport 1:02:02.0: pci_disable_pcie_error_reporting
[  344.156473] pcieport 1:02:03.0: pci_disable_pcie_error_reporting
[  344.238042] pcieport 1:02:04.0: pci_disable_pcie_error_reporting
[  344.321864] pcieport 1:02:05.0: pci_disable_pcie_error_reporting
[  344.411601] pcieport 1:02:06.0: pci_disable_pcie_error_reporting
[  344.505332] pcieport 1:02:07.0: pci_disable_pcie_error_reporting
[  344.621824] nvme 1:06:00.0: pci_enable_pcie_error_reporting

Signed-off-by: Jon Derrick 
---
 drivers/pci/pcie/portdrv_core.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 7c37d81..fdd953a 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -343,6 +343,16 @@ int pcie_port_device_register(struct pci_dev *dev)
if (!nr_service)
goto error_cleanup_irqs;
 
+#ifdef CONFIG_PCIEAER
+   /*
+* Enable error reporting for this port in case AER probing has already
+* run on the root bus or this port device is hot-inserted
+*/
+   if (dev->aer_cap && pci_aer_available() &&
+   (pcie_ports_native || pci_find_host_bridge(dev->bus)->native_aer))
+   pci_enable_pcie_error_reporting(dev);
+#endif
+
return 0;
 
 error_cleanup_irqs:
-- 
1.8.3.1

[PATCH v3] PCI: Equalize hotplug memory and io for non/occupied slots

2018-09-25 Thread Jon Derrick

Currently, a hotplug bridge will be given hpmemsize additional memory
and hpiosize additional io if available, in order to satisfy any future
hotplug allocation requirements.

These calculations don't consider the current memory/io size of the
hotplug bridge/slot, so hotplug bridges/slots which have downstream
devices will be allocated their current allocation in addition to the
hpmemsize value.

This makes for possibly undesirable results with a mix of unoccupied and
occupied slots (ex, with hpmemsize=2M):

02:03.0 PCI bridge: <-- Occupied
Memory behind bridge: d620-d64f [size=3M]
02:04.0 PCI bridge: <-- Unoccupied
Memory behind bridge: d650-d66f [size=2M]

This change considers the current allocation size when using the
hpmemsize/hpiosize parameters to make the reservations predictable for
the mix of unoccupied and occupied slots:

02:03.0 PCI bridge: <-- Occupied
Memory behind bridge: d620-d63f [size=2M]
02:04.0 PCI bridge: <-- Unoccupied
Memory behind bridge: d640-d65f [size=2M]

Signed-off-by: Jon Derrick 
---
v2->v3: Made the IO and mem size calculations nearly equivalent

 drivers/pci/setup-bus.c | 28 +++-
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 79b1824..ed96043 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -811,6 +811,8 @@ static struct resource *find_free_bus_resource(struct 
pci_bus *bus,
 static resource_size_t calculate_iosize(resource_size_t size,
resource_size_t min_size,
resource_size_t size1,
+   resource_size_t add_size,
+   resource_size_t children_add_size,
resource_size_t old_size,
resource_size_t align)
 {
@@ -823,15 +825,18 @@ static resource_size_t calculate_iosize(resource_size_t 
size,
 #if defined(CONFIG_ISA) || defined(CONFIG_EISA)
size = (size & 0xff) + ((size & ~0xffUL) << 2);
 #endif
-   size = ALIGN(size + size1, align);
+   size = size + size1;
if (size < old_size)
size = old_size;
+
+   size = ALIGN(max(size, add_size) + children_add_size, align);
return size;
 }
 
 static resource_size_t calculate_memsize(resource_size_t size,
resource_size_t min_size,
-   resource_size_t size1,
+   resource_size_t add_size,
+   resource_size_t children_add_size,
resource_size_t old_size,
resource_size_t align)
 {
@@ -841,7 +846,8 @@ static resource_size_t calculate_memsize(resource_size_t 
size,
old_size = 0;
if (size < old_size)
size = old_size;
-   size = ALIGN(size + size1, align);
+
+   size = ALIGN(max(size, add_size) + children_add_size, align);
return size;
 }
 
@@ -930,12 +936,10 @@ static void pbus_size_io(struct pci_bus *bus, 
resource_size_t min_size,
}
}
 
-   size0 = calculate_iosize(size, min_size, size1,
+   size0 = calculate_iosize(size, min_size, size1, 0, 0,
resource_size(b_res), min_align);
-   if (children_add_size > add_size)
-   add_size = children_add_size;
-   size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
-   calculate_iosize(size, min_size, add_size + size1,
+   size1 = (!realloc_head || (realloc_head && !add_size && 
!children_add_size)) ? size0 :
+   calculate_iosize(size, min_size, size1, add_size, 
children_add_size,
resource_size(b_res), min_align);
if (!size0 && !size1) {
if (b_res->start || b_res->end)
@@ -1079,12 +1083,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
 
min_align = calculate_mem_align(aligns, max_order);
min_align = max(min_align, window_alignment(bus, b_res->flags));
-   size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), 
min_align);
+   size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), 
min_align);
add_align = max(min_align, add_align);
-   if (children_add_size > add_size)
-   add_size = children_add_size;
-   size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
-   calculate_memsize(size, min_size, add_size,
+   size1 = (!realloc_head || (realloc_head && !add_size && 
!children_add_size)) ? size0 :
+   calculate_memsize(size, min_size, add_size, children_add_size,
resource_size(b_res), add_align);
if (!size0 && !size1) {
if (b_res->start || b_res->end)
-- 
1.8.3.1

[PATCH v2 2/4] pci/x86: Move VMD quirks to x86 fixups

2017-08-17 Thread Jon Derrick

VMD currently only exists for Intel x86 products

Signed-off-by: Jon Derrick 
---
 arch/x86/pci/fixup.c | 18 ++
 drivers/pci/quirks.c | 17 -
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 11e4074..ca4b02e5 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -618,3 +618,21 @@ static void quirk_apple_mbp_poweroff(struct pci_dev *pdev)
dev_info(dev, "can't work around MacBook Pro poweroff issue\n");
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, 
quirk_apple_mbp_poweroff);
+
+/*
+ * VMD-enabled root ports will change the source ID for all messages
+ * to the VMD device. Rather than doing device matching with the source
+ * ID, the AER driver should traverse the child device tree, reading
+ * AER registers to find the faulting device.
+ */
+static void quirk_no_aersid(struct pci_dev *pdev)
+{
+   /* VMD Domain */
+   if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1)
+   pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid);
+
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index f1c9852..073baba 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -4671,23 +4671,6 @@ static void quirk_intel_qat_vf_cap(struct pci_dev *pdev)
 }
 DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, quirk_intel_qat_vf_cap);
 
-/*
- * VMD-enabled root ports will change the source ID for all messages
- * to the VMD device. Rather than doing device matching with the source
- * ID, the AER driver should traverse the child device tree, reading
- * AER registers to find the faulting device.
- */
-static void quirk_no_aersid(struct pci_dev *pdev)
-{
-   /* VMD Domain */
-   if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1)
-   pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID;
-}
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid);
-
 /* FLR may cause some 82579 devices to hang. */
 static void quirk_intel_no_flr(struct pci_dev *dev)
 {
-- 
2.9.4

[PATCH v2 4/4] iommu: Prevent VMD child devices from being remapping targets

2017-08-17 Thread Jon Derrick

VMD child devices must use the VMD endpoint's ID as the requester.
Because of this, there needs to be a way to link the parent VMD
endpoint's iommu group and associated mappings to the VMD child devices
such that attaching and detaching child devices modify the endpoint's
mappings, while preventing early detaching on a singular device removal
or unbinding.

The reassignment of individual VMD child devices devices to VMs is
outside the scope of VMD, but may be implemented in the future. For now
it is best to prevent any such attempts.

This patch prevents VMD child devices from returning an IOMMU, which
prevents it from exposing an iommu_group sysfs directories and allowing
subsequent binding by userspace-access drivers such as VFIO.

Signed-off-by: Jon Derrick 
---
 drivers/iommu/intel-iommu.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 687f18f..94353a6e 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -901,6 +901,11 @@ static struct intel_iommu *device_to_iommu(struct device 
*dev, u8 *bus, u8 *devf
struct pci_dev *pf_pdev;
 
pdev = to_pci_dev(dev);
+
+   /* VMD child devices currently cannot be handled individually */
+   if (is_vmd(pdev->bus))
+   return NULL;
+
/* VFs aren't listed in scope tables; we need to look up
 * the PF instead to find the IOMMU. */
pf_pdev = pci_physfn(pdev);
-- 
2.9.4

[PATCH v2 0/4] VMD fixups

2017-08-17 Thread Jon Derrick

Mostly just cleanup in this revision, eg, trying to limit scope of vmd code to
x86

Previous:
https://patchwork.kernel.org/patch/9886095/
https://patchwork.kernel.org/patch/9886097/
https://patchwork.kernel.org/patch/9886101/


Jon Derrick (4):
  MAINTAINERS: Add Jonathan Derrick as VMD maintainer
  pci/x86: Move VMD quirks to x86 fixups
  x86/PCI: Use is_vmd rather than relying on the domain number
  iommu: Prevent VMD child devices from being remapping targets

 MAINTAINERS |  1 +
 arch/x86/pci/fixup.c| 18 ++
 drivers/iommu/intel-iommu.c |  5 +
 drivers/pci/quirks.c| 17 -
 4 files changed, 24 insertions(+), 17 deletions(-)

-- 
2.9.4

[PATCH v2 3/4] x86/PCI: Use is_vmd rather than relying on the domain number

2017-08-17 Thread Jon Derrick

Signed-off-by: Jon Derrick 
---
 arch/x86/pci/fixup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index ca4b02e5..1ed2fbf 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -628,7 +628,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, 
quirk_apple_mbp_poweroff);
 static void quirk_no_aersid(struct pci_dev *pdev)
 {
/* VMD Domain */
-   if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1)
+   if (is_vmd(pdev->bus))
pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID;
 }
 DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid);
-- 
2.9.4

[PATCH v2 1/4] MAINTAINERS: Add Jonathan Derrick as VMD maintainer

2017-08-17 Thread Jon Derrick

Add myself as VMD maintainer

Signed-off-by: Jon Derrick 
Acked-by: Keith Busch 
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index f66488d..3ec39df 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10090,6 +10090,7 @@ F:  drivers/pci/dwc/*imx6*
 
 PCI DRIVER FOR INTEL VOLUME MANAGEMENT DEVICE (VMD)
 M: Keith Busch 
+M: Jonathan Derrick 
 L: linux-...@vger.kernel.org
 S: Supported
 F: drivers/pci/host/vmd.c
-- 
2.9.4

Re: [RFC 1/3] PCI: pci-driver: Introduce pci device delete list

2017-09-28 Thread Jon Derrick

Hi Greg,

On 09/28/2017 03:09 AM, Greg Kroah-Hartman wrote:
> On Wed, Sep 27, 2017 at 04:40:20PM -0400, Jon Derrick wrote:
>> This patch introduces a new kernel command line parameter to mask pci
>> device ids from pci driver id tables. This prevents masked devices from
>> automatically binding to both built-in and module drivers.
>>
>> Devices can be later attached through the driver's sysfs new_id
>> inteface.
>>
>> The use cases for this are primarily for debugging, eg, being able to
>> prevent attachment before probes are set up. It can also be used to mask
>> off faulty built-in hardware or faulty simulated hardware.
>>
>> Another use case is to prevent attachment of devices which will be
>> passed to VMs, shortcutting the detachment effort.
> 
> Is the "shortcut" really that big of a deal?  unbind actually causes
> problems?  Is this an attempt to deal with devices being handled by more
> than one driver and then you want to manually bind it later on?
> 
>> The format is similar to the sysfs new_id format. Device ids are
>> specified with:
>>
>> :[:SVVV:SDDD][:][:]
>>
>> Where:
>>  = Vendor ID
>>  = Device ID
>> SVVV = Subvendor ID
>> SDDD = Subdevice ID
>>  = Class
>>  = Class Mask
>>
>> IDs can be chained with commas.
>>
>> Examples:
>>  .delete_id=1234:5678
>>  .delete_id=:
>>  .delete_id=::::010802
>>  .delete_id=1234:5678,abcd:ef01,2345:
> 
> What about drivers that handle more than one bus type (i.e. USB and
> PCI?)  This format is specific to PCI, yet you are defining it as a
> "global" for all drivers :(
> 
I was hoping to extend it to other bus types as well, but just wanted
some early feedback on the idea. Pci was the easier implementation for
me. Could prepending pci:, usb:, etc on the format be an option?

> This feels hacky, what is the real reason for this?  It feels like we
> have so many different ways to blacklist and unbind and bind devices to
> drivers already, why add yet-another way?
> 
I ran into an issue a while back where I needed to disable a device from
a built-in driver to perform a regression test. I worked around that
issue by doing an initcall_blacklist on the pci-driver declaration, but
that also preventing later binding because the driver was never registered.

I've also had issues with remote systems where pluggable devices were
broken or otherwise non-responsive, and it would have been nice to have
been able to keep them from binding on module loading as a temporary
workaround until someone could have removed those devices (though
impossible on built-in hardware).

I'm not sure about hacky; I think the implementation in this patch (1/3)
is pretty clean :). I'm not familiar with all the different ways of
blacklisting. Is there another way to work around the issues I listed above?

I understand the concern about having it exist in the format .
and only supporting one or a few bus types. I have another set that
extends the pci= parameter instead.

> thanks,
> 
> greg k-h
> 

Best,
Jon

Re: [RFC 2/3] module: Ignore delete_id parameter

2017-09-28 Thread Jon Derrick

On 09/28/2017 03:02 AM, Greg Kroah-Hartman wrote:
> On Wed, Sep 27, 2017 at 04:40:21PM -0400, Jon Derrick wrote:
>> The PCI driver delete_id parameter is handled in each individual driver
>> registration callback.
>>
>> Signed-off-by: Jon Derrick 
>> ---
>>  kernel/module.c | 7 +++
>>  1 file changed, 7 insertions(+)
>>
>> diff --git a/kernel/module.c b/kernel/module.c
>> index de66ec8..2b2dccf 100644
>> --- a/kernel/module.c
>> +++ b/kernel/module.c
>> @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char 
>> *val, const char *modname,
>>  return 0;
>>  }
>>  
>> +/*
>> + * Ignore driver delete list arguments. They are handled by driver
>> + * registration callbacks
>> + */
>> +if (strcmp(param, "delete_id") == 0)
>> +return 0;
> 
> Why?  This is only for the PCI core as you have defined it in this
> patchset, but you just broke this module id for all other kernel modules
> in the system :(
> 
> If you want to do this, you need to provide this feature for _all_
> kernel drivers...
> 
> thanks,
> 
> greg k-h
> 
Yes I'm not particularly happy about this one either. I will make this
more robust if the blacklisting idea is sound.

Re: [RFC 2/3] module: Ignore delete_id parameter

2017-09-28 Thread Jon Derrick

On 09/28/2017 12:03 AM, Dan Williams wrote:
> On Wed, Sep 27, 2017 at 1:40 PM, Jon Derrick  
> wrote:
>> The PCI driver delete_id parameter is handled in each individual driver
>> registration callback.
>>
>> Signed-off-by: Jon Derrick 
>> ---
>>  kernel/module.c | 7 +++
>>  1 file changed, 7 insertions(+)
>>
>> diff --git a/kernel/module.c b/kernel/module.c
>> index de66ec8..2b2dccf 100644
>> --- a/kernel/module.c
>> +++ b/kernel/module.c
>> @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char 
>> *val, const char *modname,
>> return 0;
>> }
>>
>> +   /*
>> +* Ignore driver delete list arguments. They are handled by driver
>> +* registration callbacks
>> +*/
>> +   if (strcmp(param, "delete_id") == 0)
>> +   return 0;
>> +
> 
> Does this preclude something like:
> 
> modprobe ahci delete_id=1234:5678?
> 

It does seem like it would. I can look into calling into the pci
callback for this, but val is a struct module here and I haven't figured
out the plumbing to get the [correct] driver from that.

Maybe if I enforce the format of 'modprobe ahci ahci.delete_id=' to
ensure the driver is specified (and would be required in cases with
multi-driver modules).

Re: [PATCH 2/3] pci: Generalize is_vmd behavior

2017-08-11 Thread Jon Derrick

On 08/11/2017 11:03 AM, Bjorn Helgaas wrote:
> On Mon, Aug 07, 2017 at 01:57:12PM -0600, Jon Derrick wrote:
>> Generalize is_vmd behavior to remove dependency on domain number
>> checking in pci quirks.
>>
>> Signed-off-by: Jon Derrick 
>> ---
>>  arch/x86/include/asm/pci.h | 8 +++-
>>  arch/x86/pci/common.c  | 2 +-
>>  drivers/pci/quirks.c   | 2 +-
>>  include/linux/pci.h| 4 
>>  4 files changed, 9 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
>> index 473a729..5c5d54a 100644
>> --- a/arch/x86/include/asm/pci.h
>> +++ b/arch/x86/include/asm/pci.h
>> @@ -60,16 +60,14 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus 
>> *bus)
>>  #define pci_root_bus_fwnode _pci_root_bus_fwnode
>>  #endif
>>  
>> -static inline bool is_vmd(struct pci_bus *bus)
>> -{
>>  #if IS_ENABLED(CONFIG_VMD)
>> +static inline bool pci_bus_is_vmd(struct pci_bus *bus)
>> +{
>>  struct pci_sysdata *sd = bus->sysdata;
>>  
>>  return sd->vmd_domain;
>> -#else
>> -return false;
>> -#endif
>>  }
>> +#endif
>>  
>>  /* Can be used to override the logic in pci_scan_bus for skipping
>> already-configured bus numbers - to be used for buggy BIOSes
>> diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
>> index dbe2132..18b2277 100644
>> --- a/arch/x86/pci/common.c
>> +++ b/arch/x86/pci/common.c
>> @@ -662,7 +662,7 @@ static void set_dma_domain_ops(struct pci_dev *pdev) {}
>>  
>>  static void set_dev_domain_options(struct pci_dev *pdev)
>>  {
>> -if (is_vmd(pdev->bus))
>> +if (pci_bus_is_vmd(pdev->bus))
>>  pdev->hotplug_user_indicators = 1;
>>  }
>>  
>> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
>> index 6967c6b..ba47995 100644
>> --- a/drivers/pci/quirks.c
>> +++ b/drivers/pci/quirks.c
>> @@ -4666,7 +4666,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, 
>> quirk_intel_qat_vf_cap);
>>  static void quirk_no_aersid(struct pci_dev *pdev)
>>  {
>>  /* VMD Domain */
>> -if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1)
>> +if (pci_bus_is_vmd(pdev->bus))
> 
> I like this part ...
> 
>>  pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID;
>>  }
>>  DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid);
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index 4869e66..0299d8b 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -1471,6 +1471,10 @@ static inline int pci_proc_domain(struct pci_bus 
>> *bus) { return 0; }
>>  static inline int pci_get_new_domain_nr(void) { return -ENOSYS; }
>>  #endif /* CONFIG_PCI_DOMAINS */
>>  
>> +#if !IS_ENABLED(CONFIG_VMD)
>> +static inline bool pci_bus_is_vmd(struct pci_bus *bus) { return false; }
>> +#endif
> 
> But not so much this part.  VMD is (AFAIK) still fundamentally an
> x86-only thing, so I'd like it better if this could all be within
> arch/x86.  Could this be done by moving quirk_no_aersid() to
> arch/x86/pci/fixup.c?
> 
Thanks for pointing this out. I'll think of something different and
localize it to the x86 code domain.

> BTW, CONFIG_VMD in drivers/pci/host/Kconfig is currently "depends on
> SRCU".  I'm not a Kconfig expert, but that doesn't seem like an
> intuitive connection.  And it's the only such dependency on SRCU in
> the tree -- most other places use "select SRCU", which makes more
> sense to me.
I agree - most places use select SRCU. I'll add that to v2.

> 
>>  /*
>>   * Generic implementation for PCI domain support. If your
>>   * architecture does not need custom management of PCI
>> -- 
>> 2.9.4
>>

Re: [PATCH 3/3] iommu: prevent VMD child devices from being remapping targets

2017-08-11 Thread Jon Derrick

Hi Robin, thanks for the reply.

On 08/11/2017 12:25 PM, Robin Murphy wrote:
> On 07/08/17 20:57, Jon Derrick wrote:
>> VMD child devices must use the VMD endpoint's ID as the DMA source.
>> Because of this, there needs to be a way to link the parent VMD
>> endpoint's DMAR domain to the VMD child devices' DMAR domain such that
>> attaching and detaching child devices modify the endpoint's DMAR mapping
>> and prevents early detaching.
> > That sounds like either pci_device_group() needs modifying, or perhaps
> that intel-iommu needs its own extended iommu_ops::device_group
> implementation, to ensure that VMD child devices get put in the same
> group as their parent - if they share requester IDs they can't feasibly
> be attached to different domains anyway.
> 
> Robin.
Yes it seems like that to me too. I have a high-level understanding of
the changes required but not too much in the nitty-gritty details. It's
a bit more complicated anyways because VMD emerges a set of root ports,
and AFAICT, PCI ACS end at the root ports and iommu groups rely on ACS
to group devices. Either way it's not within the scope of VMD.

[PATCH 1/3] MAINTAINERS: Add Jonathan Derrick as VMD maintainer

2017-08-07 Thread Jon Derrick

Add myself as VMD maintainer

Signed-off-by: Jon Derrick 
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index f66488d..3ec39df 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10090,6 +10090,7 @@ F:  drivers/pci/dwc/*imx6*
 
 PCI DRIVER FOR INTEL VOLUME MANAGEMENT DEVICE (VMD)
 M: Keith Busch 
+M: Jonathan Derrick 
 L: linux-...@vger.kernel.org
 S: Supported
 F: drivers/pci/host/vmd.c
-- 
2.9.4

[PATCH 2/3] pci: Generalize is_vmd behavior

2017-08-07 Thread Jon Derrick

Generalize is_vmd behavior to remove dependency on domain number
checking in pci quirks.

Signed-off-by: Jon Derrick 
---
 arch/x86/include/asm/pci.h | 8 +++-
 arch/x86/pci/common.c  | 2 +-
 drivers/pci/quirks.c   | 2 +-
 include/linux/pci.h| 4 
 4 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 473a729..5c5d54a 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -60,16 +60,14 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus 
*bus)
 #define pci_root_bus_fwnode_pci_root_bus_fwnode
 #endif
 
-static inline bool is_vmd(struct pci_bus *bus)
-{
 #if IS_ENABLED(CONFIG_VMD)
+static inline bool pci_bus_is_vmd(struct pci_bus *bus)
+{
struct pci_sysdata *sd = bus->sysdata;
 
return sd->vmd_domain;
-#else
-   return false;
-#endif
 }
+#endif
 
 /* Can be used to override the logic in pci_scan_bus for skipping
already-configured bus numbers - to be used for buggy BIOSes
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index dbe2132..18b2277 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -662,7 +662,7 @@ static void set_dma_domain_ops(struct pci_dev *pdev) {}
 
 static void set_dev_domain_options(struct pci_dev *pdev)
 {
-   if (is_vmd(pdev->bus))
+   if (pci_bus_is_vmd(pdev->bus))
pdev->hotplug_user_indicators = 1;
 }
 
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 6967c6b..ba47995 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -4666,7 +4666,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, 
quirk_intel_qat_vf_cap);
 static void quirk_no_aersid(struct pci_dev *pdev)
 {
/* VMD Domain */
-   if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1)
+   if (pci_bus_is_vmd(pdev->bus))
pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID;
 }
 DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4869e66..0299d8b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1471,6 +1471,10 @@ static inline int pci_proc_domain(struct pci_bus *bus) { 
return 0; }
 static inline int pci_get_new_domain_nr(void) { return -ENOSYS; }
 #endif /* CONFIG_PCI_DOMAINS */
 
+#if !IS_ENABLED(CONFIG_VMD)
+static inline bool pci_bus_is_vmd(struct pci_bus *bus) { return false; }
+#endif
+
 /*
  * Generic implementation for PCI domain support. If your
  * architecture does not need custom management of PCI
-- 
2.9.4

[PATCH 3/3] iommu: prevent VMD child devices from being remapping targets

2017-08-07 Thread Jon Derrick

VMD child devices must use the VMD endpoint's ID as the DMA source.
Because of this, there needs to be a way to link the parent VMD
endpoint's DMAR domain to the VMD child devices' DMAR domain such that
attaching and detaching child devices modify the endpoint's DMAR mapping
and prevents early detaching.

This is outside the scope of VMD, so disable binding child devices to
prevent unforeseen issues. This functionality may be implemented in the
future.

This patch prevents VMD child devices from returning an IOMMU, which
prevents it from exposing iommu_group sysfs directories and subsequent
binding by userspace-access drivers such as VFIO.

Signed-off-by: Jon Derrick 
---
 drivers/iommu/intel-iommu.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 687f18f..651a6cd 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -905,6 +905,11 @@ static struct intel_iommu *device_to_iommu(struct device 
*dev, u8 *bus, u8 *devf
 * the PF instead to find the IOMMU. */
pf_pdev = pci_physfn(pdev);
dev = _pdev->dev;
+
+   /* VMD child devices currently cannot be handled individually */
+   if (pci_bus_is_vmd(pdev->bus))
+   return NULL;
+
segment = pci_domain_nr(pdev->bus);
} else if (has_acpi_companion(dev))
dev = _COMPANION(dev)->dev;
-- 
2.9.4

[PATCH] genirq/affinity: report extra vectors on uneven nodes

2019-08-07 Thread Jon Derrick

The current irq spreading algorithm spreads vectors amongst cpus evenly
per node. If a node has more cpus than another node, the extra vectors
being spread may not be reported back to the caller.

This is most apparent with the NVMe driver and nr_cpus < vectors, where
the underreporting results in the caller's WARN being triggered:

irq_build_affinity_masks()
...
if (nr_present < numvecs)
WARN_ON(nr_present + nr_others < numvecs);

Signed-off-by: Jon Derrick 
---
 kernel/irq/affinity.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 4352b08ae48d..9beafb8c7e92 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -127,7 +127,8 @@ static int __irq_build_affinity_masks(unsigned int startvec,
}
 
for_each_node_mask(n, nodemsk) {
-   unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
+   unsigned int ncpus, v, vecs_to_assign, total_vecs_to_assign,
+   vecs_per_node;
 
/* Spread the vectors per node */
vecs_per_node = (numvecs - (curvec - firstvec)) / nodes;
@@ -141,14 +142,16 @@ static int __irq_build_affinity_masks(unsigned int 
startvec,
 
/* Account for rounding errors */
extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign);
+   total_vecs_to_assign = vecs_to_assign + extra_vecs;
 
-   for (v = 0; curvec < last_affv && v < vecs_to_assign;
+   for (v = 0; curvec < last_affv && v < total_vecs_to_assign;
 curvec++, v++) {
cpus_per_vec = ncpus / vecs_to_assign;
 
/* Account for extra vectors to compensate rounding 
errors */
if (extra_vecs) {
cpus_per_vec++;
+   v++;
--extra_vecs;
}
irq_spread_init_one([curvec].mask, nmsk,
-- 
2.20.1

[PATCH 2/2] PCI: vmd: Fix shadow offsets to reflect spec changes

2019-09-16 Thread Jon Derrick

The shadow offset scratchpad was moved to 0x2000-0x2010. Update the
location to get the correct shadow offset.

Cc: sta...@vger.kernel.org # v5.2+
Fixes: 6788958e ("PCI: vmd: Assign membar addresses from shadow registers")
Signed-off-by: Jon Derrick 
---
 drivers/pci/controller/vmd.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index 2e4da3f51d6b..a35d3f3996d7 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -31,6 +31,9 @@
 #define PCI_REG_VMLOCK 0x70
 #define MB2_SHADOW_EN(vmlock)  (vmlock & 0x2)
 
+#define MB2_SHADOW_OFFSET  0x2000
+#define MB2_SHADOW_SIZE16
+
 enum vmd_features {
/*
 * Device may contain registers which hint the physical location of the
@@ -578,7 +581,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned 
long features)
u32 vmlock;
int ret;
 
-   membar2_offset = 0x2018;
+   membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE;
ret = pci_read_config_dword(vmd->dev, PCI_REG_VMLOCK, );
if (ret || vmlock == ~0)
return -ENODEV;
@@ -590,9 +593,9 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned 
long features)
if (!membar2)
return -ENOMEM;
offset[0] = vmd->dev->resource[VMD_MEMBAR1].start -
-   readq(membar2 + 0x2008);
+   readq(membar2 + MB2_SHADOW_OFFSET);
offset[1] = vmd->dev->resource[VMD_MEMBAR2].start -
-   readq(membar2 + 0x2010);
+   readq(membar2 + MB2_SHADOW_OFFSET + 8);
pci_iounmap(vmd->dev, membar2);
}
}
-- 
2.20.1

[PATCH 0/2] VMD fixes for v5.4

2019-09-16 Thread Jon Derrick

Hi Lorenzo, Bjorn, Keith,

Please consider the following patches for 5.4 inclusion.

These will apply to 5.2 stable. 4.19 has a few feature deps so I will instead
follow-up with a backport.

Jon Derrick (2):
  PCI: vmd: Fix config addressing when using bus offsets
  PCI: vmd: Fix shadow offsets to reflect spec changes

 drivers/pci/controller/vmd.c | 25 +++--
 1 file changed, 15 insertions(+), 10 deletions(-)

-- 
2.20.1

[PATCH 1/2] PCI: vmd: Fix config addressing when using bus offsets

2019-09-16 Thread Jon Derrick

VMD maps child device config spaces to the VMD Config BAR linearly
regardless of the starting bus offset. Because of this, the config
address decode must ignore starting bus offsets when mapping the BDF to
the config space address.

Cc: sta...@vger.kernel.org # v5.2+
Fixes: 2a5a9c9a ("PCI: vmd: Add offset to bus numbers if necessary")
Signed-off-by: Jon Derrick 
---
 drivers/pci/controller/vmd.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index 4575e0c6dc4b..2e4da3f51d6b 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -94,6 +94,7 @@ struct vmd_dev {
struct resource resources[3];
struct irq_domain   *irq_domain;
struct pci_bus  *bus;
+   u8  busn_start;
 
struct dma_map_ops  dma_ops;
struct dma_domain   dma_domain;
@@ -440,7 +441,8 @@ static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, 
struct pci_bus *bus,
  unsigned int devfn, int reg, int len)
 {
char __iomem *addr = vmd->cfgbar +
-(bus->number << 20) + (devfn << 12) + reg;
+((bus->number - vmd->busn_start) << 20) +
+(devfn << 12) + reg;
 
if ((addr - vmd->cfgbar) + len >=
resource_size(>dev->resource[VMD_CFGBAR]))
@@ -563,7 +565,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned 
long features)
unsigned long flags;
LIST_HEAD(resources);
resource_size_t offset[2] = {0};
-   resource_size_t membar2_offset = 0x2000, busn_start = 0;
+   resource_size_t membar2_offset = 0x2000;
struct pci_bus *child;
 
/*
@@ -606,14 +608,14 @@ static int vmd_enable_domain(struct vmd_dev *vmd, 
unsigned long features)
pci_read_config_dword(vmd->dev, PCI_REG_VMCONFIG, );
if (BUS_RESTRICT_CAP(vmcap) &&
(BUS_RESTRICT_CFG(vmconfig) == 0x1))
-   busn_start = 128;
+   vmd->busn_start = 128;
}
 
res = >dev->resource[VMD_CFGBAR];
vmd->resources[0] = (struct resource) {
.name  = "VMD CFGBAR",
-   .start = busn_start,
-   .end   = busn_start + (resource_size(res) >> 20) - 1,
+   .start = vmd->busn_start,
+   .end   = vmd->busn_start + (resource_size(res) >> 20) - 1,
.flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED,
};
 
@@ -681,8 +683,8 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned 
long features)
pci_add_resource_offset(, >resources[1], offset[0]);
pci_add_resource_offset(, >resources[2], offset[1]);
 
-   vmd->bus = pci_create_root_bus(>dev->dev, busn_start, _ops,
-  sd, );
+   vmd->bus = pci_create_root_bus(>dev->dev, vmd->busn_start,
+  _ops, sd, );
if (!vmd->bus) {
pci_free_resource_list();
irq_domain_remove(vmd->irq_domain);
-- 
2.20.1

[PATCH] PCI: vmd: Expose oob management window to users

2019-09-16 Thread Jon Derrick

Some VMD devices provide a management window within MEMBAR2 that is used
to communicate out-of-band with child devices. This patch creates a
binary file for interacting with the interface.

OOB Reads/Writes are bounds-checked by sysfs_fs_bin_{read,write}

Signed-off-by: Jon Derrick 
---
Depends on 
https://lore.kernel.org/linux-pci/20190916135435.5017-1-jonathan.derr...@intel.com/T/#t

 drivers/pci/controller/vmd.c | 128 ---
 1 file changed, 117 insertions(+), 11 deletions(-)

diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index a35d3f3996d7..b13954cf9c96 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -33,6 +33,8 @@
 
 #define MB2_SHADOW_OFFSET  0x2000
 #define MB2_SHADOW_SIZE16
+#define MB2_OOB_WINDOW_OFFSET  0x2010
+#define MB2_OOB_WINDOW_SIZE128
 
 enum vmd_features {
/*
@@ -47,6 +49,12 @@ enum vmd_features {
 * bus numbering
 */
VMD_FEAT_HAS_BUS_RESTRICTIONS   = (1 << 1),
+
+   /*
+* Device may provide an out-of-band management interface through a
+* read/write window
+*/
+   VMD_FEAT_HAS_OOB_WINDOW = (1 << 2),
 };
 
 /*
@@ -101,6 +109,10 @@ struct vmd_dev {
 
struct dma_map_ops  dma_ops;
struct dma_domain   dma_domain;
+
+   spinlock_t  oob_lock;
+   char __iomem*oob_addr;
+   struct bin_attribute*oob_attr;
 };
 
 static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus)
@@ -543,6 +555,68 @@ static void vmd_detach_resources(struct vmd_dev *vmd)
vmd->dev->resource[VMD_MEMBAR2].child = NULL;
 }
 
+static ssize_t vmd_oob_read(struct file *filp, struct kobject *kobj,
+   struct bin_attribute *attr, char *buf,
+   loff_t off, size_t count)
+{
+   struct vmd_dev *vmd = attr->private;
+   unsigned long flags;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EACCES;
+
+   spin_lock_irqsave(>oob_lock, flags);
+   memcpy_fromio([off], >oob_addr[off], count);
+   spin_unlock_irqrestore(>oob_lock, flags);
+
+   return count;
+}
+
+static ssize_t vmd_oob_write(struct file *filp, struct kobject *kobj,
+struct bin_attribute *attr, char *buf,
+loff_t off, size_t count)
+{
+   struct vmd_dev *vmd = attr->private;
+   unsigned long flags;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EACCES;
+
+   spin_lock_irqsave(>oob_lock, flags);
+   memcpy_toio(>oob_addr[off], [off], count);
+   spin_unlock_irqrestore(>oob_lock, flags);
+
+   return count;
+}
+
+static int vmd_create_oob_file(struct vmd_dev *vmd)
+{
+   struct pci_dev *dev = vmd->dev;
+   struct bin_attribute *oob_attr;
+
+   oob_attr = devm_kzalloc(>dev->dev, sizeof(*oob_attr), GFP_ATOMIC);
+   if (!oob_attr)
+   return -ENOMEM;
+
+   spin_lock_init(>oob_lock);
+   sysfs_bin_attr_init(oob_attr);
+   vmd->oob_attr = oob_attr;
+   oob_attr->attr.name = "oob";
+   oob_attr->attr.mode = S_IRUSR | S_IWUSR;
+   oob_attr->size = MB2_OOB_WINDOW_SIZE;
+   oob_attr->read = vmd_oob_read;
+   oob_attr->write = vmd_oob_write;
+   oob_attr->private = (void *)vmd;
+
+   return sysfs_create_bin_file(>dev.kobj, oob_attr);
+}
+
+static void vmd_destroy_oob_file(struct vmd_dev *vmd)
+{
+   if (vmd->oob_attr)
+   sysfs_remove_bin_file(>dev->dev.kobj, vmd->oob_attr);
+}
+
 /*
  * VMD domains start at 0x1 to not clash with ACPI _SEG domains.
  * Per ACPI r6.0, sec 6.5.6,  _SEG returns an integer, of which the lower
@@ -570,6 +644,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned 
long features)
resource_size_t offset[2] = {0};
resource_size_t membar2_offset = 0x2000;
struct pci_bus *child;
+   int ret;
 
/*
 * Shadow registers may exist in certain VMD device ids which allow
@@ -579,7 +654,6 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned 
long features)
 */
if (features & VMD_FEAT_HAS_MEMBAR_SHADOW) {
u32 vmlock;
-   int ret;
 
membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE;
ret = pci_read_config_dword(vmd->dev, PCI_REG_VMLOCK, );
@@ -614,6 +688,24 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned 
long features)
vmd->busn_start = 128;
}
 
+   /*
+* Certain VMD devices provide a window for communicating with child
+* devices through a management interface
+*/
+   if (features & VMD_FEAT_HAS_OOB_WINDOW) {
+   membar2_offset = MB2_OOB_WINDOW_OFFSET + MB2_OOB_WINDOW_SIZE;
+   vmd

[RFC 3/3] Documentation: Add pci device delete_id interface

2017-09-27 Thread Jon Derrick

Document the .delete_id= cmdline interface for masking built-in
and module device id tables.

Signed-off-by: Jon Derrick 
---
 Documentation/admin-guide/kernel-parameters.txt | 13 +
 1 file changed, 13 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 0549662..862c8b5 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -789,6 +789,19 @@
Defaults to the default architecture's huge page size
if not specified.
 
+   driver.delete_id=
+   [PCI]
+   Specified pci device format(s) will mask built-in
+   driver id tables, preventing devices from
+   automicatically attaching to built-in or module
+   drivers.  Later attachment should be done by adding the
+   deleted id back through the "new_id" sysfs interface.
+   Format: :[::]\
+   [:class][:class_mask][, ...]
+   PCI_ANY_ID masks should use the full 32-bit format.
+   Examples: virtio-pci.delete_id=:
+ nvme.delete_id=1234:5678,:
+
dhash_entries=  [KNL]
Set number of hash buckets for dentry cache.
 
-- 
1.8.3.1

[RFC 2/3] module: Ignore delete_id parameter

2017-09-27 Thread Jon Derrick

The PCI driver delete_id parameter is handled in each individual driver
registration callback.

Signed-off-by: Jon Derrick 
---
 kernel/module.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/kernel/module.c b/kernel/module.c
index de66ec8..2b2dccf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char 
*val, const char *modname,
return 0;
}
 
+   /*
+* Ignore driver delete list arguments. They are handled by driver
+* registration callbacks
+*/
+   if (strcmp(param, "delete_id") == 0)
+   return 0;
+
/* Check for magic 'dyndbg' arg */
ret = ddebug_dyndbg_module_param_cb(param, val, modname);
if (ret != 0)
-- 
1.8.3.1

[RFC 0/3] Introduce PCI device blacklisting

2017-09-27 Thread Jon Derrick

This set introduces a method to keep devices from matching driver id
tables.

Please see 0001 for more details of the motivation and implementation.

This set currently applies to Bjorn's next (v4.14-rc1) branch.


Jon Derrick (3):
  PCI: pci-driver: Introduce pci device delete list
  module: Ignore delete_id parameter
  Documentation: Add pci device delete_id interface

 Documentation/admin-guide/kernel-parameters.txt |  13 ++
 drivers/pci/pci-driver.c| 253 +++-
 include/linux/pci.h |   1 +
 kernel/module.c |   7 +
 4 files changed, 272 insertions(+), 2 deletions(-)

-- 
1.8.3.1

[RFC 1/3] PCI: pci-driver: Introduce pci device delete list

2017-09-27 Thread Jon Derrick

This patch introduces a new kernel command line parameter to mask pci
device ids from pci driver id tables. This prevents masked devices from
automatically binding to both built-in and module drivers.

Devices can be later attached through the driver's sysfs new_id
inteface.

The use cases for this are primarily for debugging, eg, being able to
prevent attachment before probes are set up. It can also be used to mask
off faulty built-in hardware or faulty simulated hardware.

Another use case is to prevent attachment of devices which will be
passed to VMs, shortcutting the detachment effort.

The format is similar to the sysfs new_id format. Device ids are
specified with:

:[:SVVV:SDDD][:][:]

Where:
 = Vendor ID
 = Device ID
SVVV = Subvendor ID
SDDD = Subdevice ID
 = Class
 = Class Mask

IDs can be chained with commas.

Examples:
.delete_id=1234:5678
.delete_id=:
.delete_id=::::010802
.delete_id=1234:5678,abcd:ef01,2345:

Signed-off-by: Jon Derrick 
---
 drivers/pci/pci-driver.c | 253 ++-
 include/linux/pci.h  |   1 +
 2 files changed, 252 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 11bd267..7acdf13 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "pci.h"
 
 struct pci_dynid {
@@ -27,6 +28,250 @@ struct pci_dynid {
struct pci_device_id id;
 };
 
+static struct pci_device_id *pci_match_deleted_ids(struct pci_driver *drv,
+  struct pci_dev *dev,
+  bool inverse)
+{
+   struct pci_dynid *deleteid;
+   struct pci_device_id *found_id = NULL;
+
+   spin_lock(>deleteids.lock);
+   list_for_each_entry(deleteid, >deleteids.list, node) {
+   if (!inverse && pci_match_one_device(>id, dev)) {
+   found_id = >id;
+   break;
+   }
+   if (inverse && !pci_match_one_device(>id, dev)) {
+   found_id = >id;
+   break;
+   }
+   }
+   spin_unlock(>deleteids.lock);
+
+   return found_id;
+}
+
+/**
+ * pci_match_non_deleted_ids - match dev against not-deleted ids
+ * @ids: array of PCI device id structures to search in
+ * @dev: the PCI device structure to match against.
+ *
+ * Finds devices in driver id table not matching the delete list and tries to
+ * match them individually to dev. Returns the matching pci_dev_id structure,
+ * %NULL if there is no match, of -errno if there was a failure to allocate
+ * memory for the temporary pci_dev
+ */
+static struct pci_device_id *pci_match_non_deleted_ids(struct pci_driver *drv,
+  struct pci_dev *dev)
+{
+   const struct pci_device_id *ids = drv->id_table;
+   struct pci_device_id *match = NULL;
+   struct pci_dev *tmpdev;
+
+   tmpdev = kzalloc(sizeof(*tmpdev), GFP_KERNEL);
+   if (!tmpdev)
+   return ERR_PTR(-ENOMEM);
+
+   while (ids->vendor || ids->subvendor || ids->class_mask) {
+   struct pci_device_id *found_id = NULL;
+
+   tmpdev->vendor = ids->vendor;
+   tmpdev->device = ids->device,
+   tmpdev->subsystem_vendor = ids->subvendor,
+   tmpdev->subsystem_device = ids->subdevice,
+   tmpdev->class = ids->class;
+
+   found_id = pci_match_deleted_ids(drv, tmpdev, true);
+   if (found_id) {
+   const struct pci_device_id id[2] = { *found_id, {0,} };
+   if (pci_match_id(id, dev)) {
+   match = found_id;
+   break;
+   }
+   }
+   ids++;
+   }
+
+   kfree(tmpdev);
+
+   return match;
+}
+
+/**
+ * pci_add_delete_id - add a new PCI device ID to a built-in driver's blacklist
+ * @drv: target pci driver
+ * @vendor: PCI vendor ID
+ * @device: PCI device ID
+ * @subvendor: PCI subvendor ID
+ * @subdevice: PCI subdevice ID
+ * @class: PCI class
+ * @class_mask: PCI class mask
+ *
+ * Adds a new dynamic pci device ID to this driver's delete list, preventing
+ * the device from attaching to built-in drivers
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pci_add_delete_id(struct pci_driver *drv,
+ unsigned int vendor, unsigned int device,
+ unsigned int subvendor, unsigned int subdevice,
+ unsigned int class, unsigned int class_mask)
+{
+   struct pci_dynid *del

[PATCH v3 0/2] PCI/ERR: Allow Native AER/DPC using _OSC

2020-04-30 Thread Jon Derrick

Hi Bjorn & Kuppuswamy,

I see a problem in the DPC ECN [1] to _OSC in that it doesn't give us a way to
determine if firmware supports _OSC DPC negotation, and therefore how to handle
DPC.

Here is the wording of the ECN that implies that Firmware without _OSC DPC
negotiation support should have the OSPM rely on _OSC AER negotiation when
determining DPC control:

  PCIe Base Specification suggests that Downstream Port Containment may be
  controlled either by the Firmware or the Operating System. It also suggests
  that the Firmware retain ownership of Downstream Port Containment if it also
  owns AER. When the Firmware owns Downstream Port Containment, it is expected
  to use the new "Error Disconnect Recover" notification to alert OSPM of a
  Downstream Port Containment event.

In legacy platforms, as bits in _OSC are reserved prior to implementation, ACPI
Root Bus enumeration will mark these Host Bridges as without Native DPC
support, even though the specification implies it's expected that AER _OSC
negotiation determines DPC control for these platforms. There seems to be a
need for a way to determine if the DPC control bit in _OSC is supported and
fallback on AER otherwise.


Currently portdrv assumes DPC control if the port has Native AER services:

static int get_port_device_capability(struct pci_dev *dev)
...
if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DPC) &&
pci_aer_available() &&
(pcie_ports_dpc_native || (services & PCIE_PORT_SERVICE_AER)))
services |= PCIE_PORT_SERVICE_DPC;

Newer firmware may not grant OSPM DPC control, if for instance, it expects to
use Error Disconnect Recovery. However it looks like ACPI will use DPC services
via the EDR driver, without binding the full DPC port service driver.


If we change portdrv to probe based on host->native_dpc and not AER, then we
break instances with legacy firmware where OSPM will clear host->native_dpc
solely due to _OSC bits being reserved:

struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root,
...
if (!(root->osc_control_set & OSC_PCI_EXPRESS_DPC_CONTROL))
host_bridge->native_dpc = 0;



So my assumption instead is that host->native_dpc can be 0 and expect Native
DPC services if AER is used. In other words, if and only if DPC probe is
invoked from portdrv, then it needs to rely on the AER dependency. Otherwise it
should be assumed that ACPI set up DPC via EDR. This covers legacy firmware.

However it seems like that could be trouble with newer firmware that might give
OSPM control of AER but not DPC, and would result in both Native DPC and EDR
being in effect.


Anyways here are two patches that give control of AER and DPC on the results of
_OSC. They don't mess with the HEST parser as I expect those to be removed at
some point. I need these for VMD support which doesn't even rely on _OSC, but I
suspect this won't be the last effort as we detangle Firmware First.

[1] https://members.pcisig.com/wg/PCI-SIG/document/12888


Jon Derrick (2):
  PCI/AER: Use _OSC to determine Firmware First before HEST
  PCI/DPC: Use _OSC to determine DPC support

 drivers/pci/pcie/aer.c  | 3 +++
 drivers/pci/pcie/dpc.c  | 3 ---
 drivers/pci/pcie/portdrv_core.c | 3 ++-
 3 files changed, 5 insertions(+), 4 deletions(-)

-- 
1.8.3.1

[PATCH v3 2/2] PCI/DPC: Use _OSC to determine DPC support

2020-04-30 Thread Jon Derrick

After a5bf8719af: "PCI/AER: Use only _OSC to determine AER ownership",
_OSC is the primary determiner of ownership of Firmware First error
handling rather than HEST.

With the addition of DPC to _OSC [1], OSPM is able to negotiate DPC
services from Firmware. ACPI Root Bus enumeration sets the Host Bridge's
Native DPC flag on negotiation of those service. This patch changes DPC
probing to check DPC control as determined by _OSC, by checking the Host
Bridge's Native DPC member.

As most older platforms won't have DPC negotiable by _OSC, this patch
doesn't attempt to change behavior that assumes if OSPM has negotiated
AER by _OSC, OSPM will also want DPC control.

[1] https://members.pcisig.com/wg/PCI-SIG/document/12888

Signed-off-by: Jon Derrick 
---
 drivers/pci/pcie/dpc.c  | 3 ---
 drivers/pci/pcie/portdrv_core.c | 3 ++-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 7621704..9104929 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -284,9 +284,6 @@ static int dpc_probe(struct pcie_device *dev)
int status;
u16 ctl, cap;
 
-   if (pcie_aer_get_firmware_first(pdev) && !pcie_ports_dpc_native)
-   return -ENOTSUPP;
-
status = devm_request_threaded_irq(device, dev->irq, dpc_irq,
   dpc_handler, IRQF_SHARED,
   "pcie-dpc", pdev);
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 50a9522..f2139a1 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -256,7 +256,8 @@ static int get_port_device_capability(struct pci_dev *dev)
 */
if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DPC) &&
pci_aer_available() &&
-   (pcie_ports_dpc_native || (services & PCIE_PORT_SERVICE_AER)))
+   (pcie_ports_dpc_native || host->native_dpc ||
+(services & PCIE_PORT_SERVICE_AER)))
services |= PCIE_PORT_SERVICE_DPC;
 
if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM ||
-- 
1.8.3.1

[PATCH v3 1/2] PCI/AER: Use _OSC to determine Firmware First before HEST

2020-04-30 Thread Jon Derrick

After a5bf8719af: "PCI/AER: Use only _OSC to determine AER ownership",
_OSC is the primary determiner of ownership of Firmware First error
handling rather than HEST.

ACPI Root Bus enumeration has been modified to flag Host Bridge devices
as using Native AER when _OSC has been negotiated for AER services.

This patch ensures the PCI layers first uses the _OSC negotiated state
by checking the Host Bridge's Native AER flag prior to HEST parsing.

Signed-off-by: Jon Derrick 
---
 drivers/pci/pcie/aer.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index efc2677..f3d02f4 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -314,6 +314,9 @@ int pcie_aer_get_firmware_first(struct pci_dev *dev)
if (pcie_ports_native)
return 0;
 
+   if (pci_find_host_bridge(dev->bus)->native_aer)
+   return 0;
+
if (!dev->__aer_firmware_first_valid)
aer_set_firmware_first(dev);
return dev->__aer_firmware_first;
-- 
1.8.3.1

[PATCH] irqdomain/treewide: Free firmware node after domain removal

2020-07-21 Thread Jon Derrick

Change 711419e504eb ("irqdomain: Add the missing assignment of
domain->fwnode for named fwnode") unintentionally caused a dangling
pointer page fault issue on firmware nodes that were freed after IRQ
domain allocation. Change e3beca48a45b fixed that dangling pointer issue
by only freeing the firmware node after an IRQ domain allocation
failure. That fix no longer frees the firmware node immediately, but
leaves the firmware node allocated after the domain is removed.

We need to keep the firmware node through irq_domain_remove, but should
free it afterwards. This patch saves the handle and adds the freeing of
firmware node after domain removal where appropriate.

Fixes: e3beca48a45b ("irqdomain/treewide: Keep firmware node unconditionally 
allocated")
Reviewed-by: Andy Shevchenko 
Signed-off-by: Jon Derrick 
Cc: sta...@vger.kernel.org
---
 arch/mips/pci/pci-xtalk-bridge.c| 3 +++
 arch/x86/kernel/apic/io_apic.c  | 5 +
 drivers/iommu/intel/irq_remapping.c | 8 
 drivers/mfd/ioc3.c  | 6 ++
 drivers/pci/controller/vmd.c| 3 +++
 5 files changed, 25 insertions(+)

diff --git a/arch/mips/pci/pci-xtalk-bridge.c b/arch/mips/pci/pci-xtalk-bridge.c
index 5958217..9b3cc77 100644
--- a/arch/mips/pci/pci-xtalk-bridge.c
+++ b/arch/mips/pci/pci-xtalk-bridge.c
@@ -728,6 +728,7 @@ static int bridge_probe(struct platform_device *pdev)
pci_free_resource_list(>windows);
 err_remove_domain:
irq_domain_remove(domain);
+   irq_domain_free_fwnode(fn);
return err;
 }
 
@@ -735,8 +736,10 @@ static int bridge_remove(struct platform_device *pdev)
 {
struct pci_bus *bus = platform_get_drvdata(pdev);
struct bridge_controller *bc = BRIDGE_CONTROLLER(bus);
+   struct fwnode_handle *fn = bc->domain->fwnode;
 
irq_domain_remove(bc->domain);
+   irq_domain_free_fwnode(fn);
pci_lock_rescan_remove();
pci_stop_root_bus(bus);
pci_remove_root_bus(bus);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 81ffcfb..21325a4a 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2335,8 +2335,13 @@ static int mp_irqdomain_create(int ioapic)
 
 static void ioapic_destroy_irqdomain(int idx)
 {
+   struct ioapic_domain_cfg *cfg = [idx].irqdomain_cfg;
+   struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode;
+
if (ioapics[idx].irqdomain) {
irq_domain_remove(ioapics[idx].irqdomain);
+   if (!cfg->dev)
+   irq_domain_free_fwnode(fn);
ioapics[idx].irqdomain = NULL;
}
 }
diff --git a/drivers/iommu/intel/irq_remapping.c 
b/drivers/iommu/intel/irq_remapping.c
index 9564d23..aa096b3 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -628,13 +628,21 @@ static int intel_setup_irq_remapping(struct intel_iommu 
*iommu)
 
 static void intel_teardown_irq_remapping(struct intel_iommu *iommu)
 {
+   struct fwnode_handle *fn;
+
if (iommu && iommu->ir_table) {
if (iommu->ir_msi_domain) {
+   fn = iommu->ir_msi_domain->fwnode;
+
irq_domain_remove(iommu->ir_msi_domain);
+   irq_domain_free_fwnode(fn);
iommu->ir_msi_domain = NULL;
}
if (iommu->ir_domain) {
+   fn = iommu->ir_domain->fwnode;
+
irq_domain_remove(iommu->ir_domain);
+   irq_domain_free_fwnode(fn);
iommu->ir_domain = NULL;
}
free_pages((unsigned long)iommu->ir_table->base,
diff --git a/drivers/mfd/ioc3.c b/drivers/mfd/ioc3.c
index 74cee7c..d939ccc 100644
--- a/drivers/mfd/ioc3.c
+++ b/drivers/mfd/ioc3.c
@@ -616,7 +616,10 @@ static int ioc3_mfd_probe(struct pci_dev *pdev,
/* Remove all already added MFD devices */
mfd_remove_devices(>pdev->dev);
if (ipd->domain) {
+   struct fwnode_handle *fn = ipd->domain->fwnode;
+
irq_domain_remove(ipd->domain);
+   irq_domain_free_fwnode(fn);
free_irq(ipd->domain_irq, (void *)ipd);
}
pci_iounmap(pdev, regs);
@@ -643,7 +646,10 @@ static void ioc3_mfd_remove(struct pci_dev *pdev)
/* Release resources */
mfd_remove_devices(>pdev->dev);
if (ipd->domain) {
+   struct fwnode_handle *fn = ipd->domain->fwnode;
+
irq_domain_remove(ipd->domain);
+   irq_domain_free_fwnode(fn);
free_irq(ipd->domain_irq, (void *)ipd);
}
pci_iounmap(pdev, ipd->regs);
diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller

[PATCH 1/6] PCI: vmd: Create physical offset helper

2020-07-28 Thread Jon Derrick

Moves the guest-passthrough physical offset discovery code to a new
helper. No functional changes.

Reviewed-by: Andy Shevchenko 
Signed-off-by: Jon Derrick 
---
 drivers/pci/controller/vmd.c | 105 +--
 1 file changed, 62 insertions(+), 43 deletions(-)

diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index f69ef8c89f72..44b2db789eff 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -417,6 +417,60 @@ static int vmd_find_free_domain(void)
return domain + 1;
 }
 
+static int vmd_get_phys_offsets(struct vmd_dev *vmd, bool native_hint,
+   resource_size_t *offset1,
+   resource_size_t *offset2)
+{
+   struct pci_dev *dev = vmd->dev;
+   u64 phys1, phys2;
+
+   if (native_hint) {
+   u32 vmlock;
+   int ret;
+
+   ret = pci_read_config_dword(dev, PCI_REG_VMLOCK, );
+   if (ret || vmlock == ~0)
+   return -ENODEV;
+
+   if (MB2_SHADOW_EN(vmlock)) {
+   void __iomem *membar2;
+
+   membar2 = pci_iomap(dev, VMD_MEMBAR2, 0);
+   if (!membar2)
+   return -ENOMEM;
+   phys1 = readq(membar2 + MB2_SHADOW_OFFSET);
+   phys2 = readq(membar2 + MB2_SHADOW_OFFSET + 8);
+   pci_iounmap(dev, membar2);
+   } else
+   return 0;
+   } else {
+   /* Hypervisor-Emulated Vendor-Specific Capability */
+   int pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
+   u32 reg, regu;
+
+   pci_read_config_dword(dev, pos + 4, );
+
+   /* "SHDW" */
+   if (pos && reg == 0x53484457) {
+   pci_read_config_dword(dev, pos + 8, );
+   pci_read_config_dword(dev, pos + 12, );
+   phys1 = (u64) regu << 32 | reg;
+
+   pci_read_config_dword(dev, pos + 16, );
+   pci_read_config_dword(dev, pos + 20, );
+   phys2 = (u64) regu << 32 | reg;
+   } else
+   return 0;
+   }
+
+   *offset1 = dev->resource[VMD_MEMBAR1].start -
+   (phys1 & PCI_BASE_ADDRESS_MEM_MASK);
+   *offset2 = dev->resource[VMD_MEMBAR2].start -
+   (phys2 & PCI_BASE_ADDRESS_MEM_MASK);
+
+   return 0;
+}
+
 static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
 {
struct pci_sysdata *sd = >sysdata;
@@ -428,6 +482,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned 
long features)
resource_size_t offset[2] = {0};
resource_size_t membar2_offset = 0x2000;
struct pci_bus *child;
+   int ret;
 
/*
 * Shadow registers may exist in certain VMD device ids which allow
@@ -436,50 +491,14 @@ static int vmd_enable_domain(struct vmd_dev *vmd, 
unsigned long features)
 * or 0, depending on an enable bit in the VMD device.
 */
if (features & VMD_FEAT_HAS_MEMBAR_SHADOW) {
-   u32 vmlock;
-   int ret;
-
membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE;
-   ret = pci_read_config_dword(vmd->dev, PCI_REG_VMLOCK, );
-   if (ret || vmlock == ~0)
-   return -ENODEV;
-
-   if (MB2_SHADOW_EN(vmlock)) {
-   void __iomem *membar2;
-
-   membar2 = pci_iomap(vmd->dev, VMD_MEMBAR2, 0);
-   if (!membar2)
-   return -ENOMEM;
-   offset[0] = vmd->dev->resource[VMD_MEMBAR1].start -
-   (readq(membar2 + MB2_SHADOW_OFFSET) &
-PCI_BASE_ADDRESS_MEM_MASK);
-   offset[1] = vmd->dev->resource[VMD_MEMBAR2].start -
-   (readq(membar2 + MB2_SHADOW_OFFSET + 8) 
&
-PCI_BASE_ADDRESS_MEM_MASK);
-   pci_iounmap(vmd->dev, membar2);
-   }
-   }
-
-   if (features & VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP) {
-   int pos = pci_find_capability(vmd->dev, PCI_CAP_ID_VNDR);
-   u32 reg, regu;
-
-   pci_read_config_dword(vmd->dev, pos + 4, );
-
-   /* "SHDW" */
-   if (pos && reg == 0x53484457) {
-   pci_read_config_dword(vmd->dev, pos + 8, );
-   pci_read_config_dword(vmd->dev, pos + 12, );
-   offset[0] = vmd->dev->resource[VMD_MEMBAR1].start -
-

[PATCH 5/6] x86/apic/msi: Use Real PCI DMA device when configuring IRTE

2020-07-28 Thread Jon Derrick

VMD retransmits child device MSI/X with the VMD endpoint's requester-id.
In order to support direct interrupt remapping of VMD child devices,
ensure that the IRTE is programmed with the VMD endpoint's requester-id
using pci_real_dma_dev().

Reviewed-by: Andy Shevchenko 
Signed-off-by: Jon Derrick 
---
 arch/x86/kernel/apic/msi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index c2b2911feeef..7ca271b8d891 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -189,7 +189,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, 
int type)
 
init_irq_alloc_info(, NULL);
info.type = X86_IRQ_ALLOC_TYPE_MSI;
-   info.msi_dev = dev;
+   info.msi_dev = pci_real_dma_dev(dev);
 
domain = irq_remapping_get_irq_domain();
if (domain == NULL)
-- 
2.27.0

[PATCH 0/6] VMD MSI Remapping Bypass

2020-07-28 Thread Jon Derrick

The Intel Volume Management Device acts similar to a PCI-to-PCI bridge in that
it changes downstream devices' requester-ids to its own. As VMD supports PCIe
devices, it has its own MSI/X table and transmits child device MSI/X by
remapping child device MSI/X and handling like a demultiplexer.

Some newer VMD devices (Icelake Server and client) have an option to bypass the
VMD MSI/X remapping table. This allows for better performance scaling as the
child device MSI/X won't be limited by VMD's MSI/X count and IRQ handler.

It's expected that most users don't want MSI/X remapping when they can get
better performance without this limitation. This set includes some long overdue
cleanup of overgrown VMD code and introduces the MSI/X remapping disable.

Applies on top of e3beca48a45b ("irqdomain/treewide: Keep firmware node
unconditionally allocated") and ec0160891e38 ("irqdomain/treewide: Free
firmware node after domain removal") in tip/urgent


Jon Derrick (6):
  PCI: vmd: Create physical offset helper
  PCI: vmd: Create bus offset configuration helper
  PCI: vmd: Create IRQ Domain configuration helper
  PCI: vmd: Create IRQ allocation helper
  x86/apic/msi: Use Real PCI DMA device when configuring IRTE
  PCI: vmd: Disable MSI/X remapping when possible

 arch/x86/kernel/apic/msi.c   |   2 +-
 drivers/pci/controller/vmd.c | 344 +++
 2 files changed, 224 insertions(+), 122 deletions(-)


base-commit: ec0160891e387f4771f953b888b1fe951398e5d9
-- 
2.27.0

[PATCH 6/6] PCI: vmd: Disable MSI/X remapping when possible

2020-07-28 Thread Jon Derrick

VMD will retransmit child device MSI/X using its own MSI/X table and
requester-id. This limits the number of MSI/X available to the whole
child device domain to the number of VMD MSI/X interrupts. Some VMD
devices have a mode where this remapping can be disabled, allowing child
device interrupts to bypass processing with the VMD MSI/X domain
interrupt handler and going straight the child device interrupt handler,
allowing for better performance and scaling. The requester-id still gets
changed to the VMD endpoint's requester-id, and the interrupt remapping
handlers have been updated to properly set IRTE for child device
interrupts to the VMD endpoint's context.

Some VMD platforms have existing production BIOS which rely on MSI/X
remapping and won't explicitly program the MSI/X remapping bit. This
re-enables MSI/X remapping on unload.

Disabling MSI/X remapping is only available for Icelake Server and
client VMD products.

Reviewed-by: Andy Shevchenko 
Signed-off-by: Jon Derrick 
---
 drivers/pci/controller/vmd.c | 58 +++-
 1 file changed, 50 insertions(+), 8 deletions(-)

diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index 3214d785fa5d..e8cde2c390b9 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -53,6 +53,12 @@ enum vmd_features {
 * vendor-specific capability space
 */
VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP= (1 << 2),
+
+   /*
+* Device remaps MSI/X transactions into its MSI/X table and requires
+* VMD MSI domain for child device interrupt handling
+*/
+   VMD_FEAT_REMAPS_MSI = (1 << 3),
 };
 
 /*
@@ -298,6 +304,15 @@ static struct msi_domain_info vmd_msi_domain_info = {
.chip   = _msi_controller,
 };
 
+static void vmd_enable_msi_remapping(struct vmd_dev *vmd, bool enable)
+{
+   u16 reg;
+
+   pci_read_config_word(vmd->dev, PCI_REG_VMCONFIG, );
+   reg = enable ? (reg & ~0x2) : (reg | 0x2);
+   pci_write_config_word(vmd->dev, PCI_REG_VMCONFIG, reg);
+}
+
 static int vmd_create_irq_domain(struct vmd_dev *vmd)
 {
struct fwnode_handle *fn;
@@ -318,6 +333,13 @@ static int vmd_create_irq_domain(struct vmd_dev *vmd)
 
 static void vmd_remove_irq_domain(struct vmd_dev *vmd)
 {
+   /*
+* Some production BIOS won't enable remapping between soft reboots.
+* Ensure remapping is restored before unloading the driver
+*/
+   if (!vmd->msix_count)
+   vmd_enable_msi_remapping(vmd, true);
+
if (vmd->irq_domain) {
struct fwnode_handle *fn = vmd->irq_domain->fwnode;
 
@@ -606,6 +628,27 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned 
long features)
return ret;
}
 
+   /*
+* Currently MSI remapping must be enabled in guest passthrough mode
+* due to some missing interrupt remapping plumbing. This is probably
+* acceptable because the guest is usually CPU-limited and MSI
+* remapping doesn't become a performance bottleneck.
+*/
+   if (features & VMD_FEAT_REMAPS_MSI || offset[0] || offset[1]) {
+   ret = vmd_alloc_irqs(vmd);
+   if (ret)
+   return ret;
+   }
+
+   /*
+* Disable remapping for performance if possible based on if VMD IRQs
+* had been allocated.
+*/
+   if (vmd->msix_count)
+   vmd_enable_msi_remapping(vmd, true);
+   else
+   vmd_enable_msi_remapping(vmd, false);
+
/*
 * Certain VMD devices may have a root port configuration option which
 * limits the bus range to between 0-127, 128-255, or 224-255
@@ -674,9 +717,11 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned 
long features)
 
sd->node = pcibus_to_node(vmd->dev->bus);
 
-   ret = vmd_create_irq_domain(vmd);
-   if (ret)
-   return ret;
+   if (vmd->msix_count) {
+   ret = vmd_create_irq_domain(vmd);
+   if (ret)
+   return ret;
+   }
 
pci_add_resource(, >resources[0]);
pci_add_resource_offset(, >resources[1], offset[0]);
@@ -738,10 +783,6 @@ static int vmd_probe(struct pci_dev *dev, const struct 
pci_device_id *id)
dma_set_mask_and_coherent(>dev, DMA_BIT_MASK(32)))
return -ENODEV;
 
-   err = vmd_alloc_irqs(vmd);
-   if (err)
-   return err;
-
spin_lock_init(>cfg_lock);
pci_set_drvdata(dev, vmd);
err = vmd_enable_domain(vmd, (unsigned long) id->driver_data);
@@ -809,7 +850,8 @@ static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops, vmd_suspend, 
vmd_resume);
 
 static const struct pci_device_id vmd_ids[] = {
{PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_201D),
-   .driver_dat

[PATCH 2/6] PCI: vmd: Create bus offset configuration helper

2020-07-28 Thread Jon Derrick

Moves the bus offset configuration discovery code to a new helper.
Modifies the bus offset 2-bit decode switch to have a 0 case and a
default error case, just in case the field is expanded in future
hardware.

Reviewed-by: Andy Shevchenko 
Signed-off-by: Jon Derrick 
---
 drivers/pci/controller/vmd.c | 53 ++--
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index 44b2db789eff..a462719af12a 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -471,6 +471,35 @@ static int vmd_get_phys_offsets(struct vmd_dev *vmd, bool 
native_hint,
return 0;
 }
 
+static int vmd_get_bus_number_start(struct vmd_dev *vmd)
+{
+   struct pci_dev *dev = vmd->dev;
+   u16 reg;
+
+   pci_read_config_word(dev, PCI_REG_VMCAP, );
+   if (BUS_RESTRICT_CAP(reg)) {
+   pci_read_config_word(dev, PCI_REG_VMCONFIG, );
+
+   switch (BUS_RESTRICT_CFG(reg)) {
+   case 0:
+   vmd->busn_start = 0;
+   break;
+   case 1:
+   vmd->busn_start = 128;
+   break;
+   case 2:
+   vmd->busn_start = 224;
+   break;
+   default:
+   pci_err(dev, "Unknown Bus Offset Setting (%d)\n",
+   BUS_RESTRICT_CFG(reg));
+   return -ENODEV;
+   }
+   }
+
+   return 0;
+}
+
 static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
 {
struct pci_sysdata *sd = >sysdata;
@@ -506,27 +535,9 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned 
long features)
 * limits the bus range to between 0-127, 128-255, or 224-255
 */
if (features & VMD_FEAT_HAS_BUS_RESTRICTIONS) {
-   u16 reg16;
-
-   pci_read_config_word(vmd->dev, PCI_REG_VMCAP, );
-   if (BUS_RESTRICT_CAP(reg16)) {
-   pci_read_config_word(vmd->dev, PCI_REG_VMCONFIG,
-);
-
-   switch (BUS_RESTRICT_CFG(reg16)) {
-   case 1:
-   vmd->busn_start = 128;
-   break;
-   case 2:
-   vmd->busn_start = 224;
-   break;
-   case 3:
-   pci_err(vmd->dev, "Unknown Bus Offset 
Setting\n");
-   return -ENODEV;
-   default:
-   break;
-   }
-   }
+   ret = vmd_get_bus_number_start(vmd);
+   if (ret)
+   return ret;
}
 
res = >dev->resource[VMD_CFGBAR];
-- 
2.27.0

[PATCH 4/6] PCI: vmd: Create IRQ allocation helper

2020-07-28 Thread Jon Derrick

Moves the IRQ allocation and SRCU initialization code to a new helper.
No functional changes.

Reviewed-by: Andy Shevchenko 
Signed-off-by: Jon Derrick 
---
 drivers/pci/controller/vmd.c | 94 
 1 file changed, 53 insertions(+), 41 deletions(-)

diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index 703c48171993..3214d785fa5d 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -528,6 +528,55 @@ static int vmd_get_bus_number_start(struct vmd_dev *vmd)
return 0;
 }
 
+static irqreturn_t vmd_irq(int irq, void *data)
+{
+   struct vmd_irq_list *irqs = data;
+   struct vmd_irq *vmdirq;
+   int idx;
+
+   idx = srcu_read_lock(>srcu);
+   list_for_each_entry_rcu(vmdirq, >irq_list, node)
+   generic_handle_irq(vmdirq->virq);
+   srcu_read_unlock(>srcu, idx);
+
+   return IRQ_HANDLED;
+}
+
+static int vmd_alloc_irqs(struct vmd_dev *vmd)
+{
+   struct pci_dev *dev = vmd->dev;
+   int i, err;
+
+   vmd->msix_count = pci_msix_vec_count(dev);
+   if (vmd->msix_count < 0)
+   return -ENODEV;
+
+   vmd->msix_count = pci_alloc_irq_vectors(dev, 1, vmd->msix_count,
+   PCI_IRQ_MSIX);
+   if (vmd->msix_count < 0)
+   return vmd->msix_count;
+
+   vmd->irqs = devm_kcalloc(>dev, vmd->msix_count, sizeof(*vmd->irqs),
+GFP_KERNEL);
+   if (!vmd->irqs)
+   return -ENOMEM;
+
+   for (i = 0; i < vmd->msix_count; i++) {
+   err = init_srcu_struct(>irqs[i].srcu);
+   if (err)
+   return err;
+
+   INIT_LIST_HEAD(>irqs[i].irq_list);
+   err = devm_request_irq(>dev, pci_irq_vector(dev, i),
+  vmd_irq, IRQF_NO_THREAD,
+  "vmd", >irqs[i]);
+   if (err)
+   return err;
+   }
+
+   return 0;
+}
+
 static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
 {
struct pci_sysdata *sd = >sysdata;
@@ -663,24 +712,10 @@ static int vmd_enable_domain(struct vmd_dev *vmd, 
unsigned long features)
return 0;
 }
 
-static irqreturn_t vmd_irq(int irq, void *data)
-{
-   struct vmd_irq_list *irqs = data;
-   struct vmd_irq *vmdirq;
-   int idx;
-
-   idx = srcu_read_lock(>srcu);
-   list_for_each_entry_rcu(vmdirq, >irq_list, node)
-   generic_handle_irq(vmdirq->virq);
-   srcu_read_unlock(>srcu, idx);
-
-   return IRQ_HANDLED;
-}
-
 static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
 {
struct vmd_dev *vmd;
-   int i, err;
+   int err;
 
if (resource_size(>resource[VMD_CFGBAR]) < (1 << 20))
return -ENOMEM;
@@ -703,32 +738,9 @@ static int vmd_probe(struct pci_dev *dev, const struct 
pci_device_id *id)
dma_set_mask_and_coherent(>dev, DMA_BIT_MASK(32)))
return -ENODEV;
 
-   vmd->msix_count = pci_msix_vec_count(dev);
-   if (vmd->msix_count < 0)
-   return -ENODEV;
-
-   vmd->msix_count = pci_alloc_irq_vectors(dev, 1, vmd->msix_count,
-   PCI_IRQ_MSIX);
-   if (vmd->msix_count < 0)
-   return vmd->msix_count;
-
-   vmd->irqs = devm_kcalloc(>dev, vmd->msix_count, sizeof(*vmd->irqs),
-GFP_KERNEL);
-   if (!vmd->irqs)
-   return -ENOMEM;
-
-   for (i = 0; i < vmd->msix_count; i++) {
-   err = init_srcu_struct(>irqs[i].srcu);
-   if (err)
-   return err;
-
-   INIT_LIST_HEAD(>irqs[i].irq_list);
-   err = devm_request_irq(>dev, pci_irq_vector(dev, i),
-  vmd_irq, IRQF_NO_THREAD,
-  "vmd", >irqs[i]);
-   if (err)
-   return err;
-   }
+   err = vmd_alloc_irqs(vmd);
+   if (err)
+   return err;
 
spin_lock_init(>cfg_lock);
pci_set_drvdata(dev, vmd);
-- 
2.27.0

[PATCH 3/6] PCI: vmd: Create IRQ Domain configuration helper

2020-07-28 Thread Jon Derrick

Moves the IRQ and MSI Domain configuration code to new helpers. No
functional changes.

Reviewed-by: Andy Shevchenko 
Signed-off-by: Jon Derrick 
---
 drivers/pci/controller/vmd.c | 52 
 1 file changed, 35 insertions(+), 17 deletions(-)

diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index a462719af12a..703c48171993 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -298,6 +298,34 @@ static struct msi_domain_info vmd_msi_domain_info = {
.chip   = _msi_controller,
 };
 
+static int vmd_create_irq_domain(struct vmd_dev *vmd)
+{
+   struct fwnode_handle *fn;
+
+   fn = irq_domain_alloc_named_id_fwnode("VMD-MSI", vmd->sysdata.domain);
+   if (!fn)
+   return -ENODEV;
+
+   vmd->irq_domain = pci_msi_create_irq_domain(fn, _msi_domain_info,
+   x86_vector_domain);
+   if (!vmd->irq_domain) {
+   irq_domain_free_fwnode(fn);
+   return -ENODEV;
+   }
+
+   return 0;
+}
+
+static void vmd_remove_irq_domain(struct vmd_dev *vmd)
+{
+   if (vmd->irq_domain) {
+   struct fwnode_handle *fn = vmd->irq_domain->fwnode;
+
+   irq_domain_remove(vmd->irq_domain);
+   irq_domain_free_fwnode(fn);
+   }
+}
+
 static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus,
  unsigned int devfn, int reg, int len)
 {
@@ -503,7 +531,6 @@ static int vmd_get_bus_number_start(struct vmd_dev *vmd)
 static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
 {
struct pci_sysdata *sd = >sysdata;
-   struct fwnode_handle *fn;
struct resource *res;
u32 upper_bits;
unsigned long flags;
@@ -598,16 +625,9 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned 
long features)
 
sd->node = pcibus_to_node(vmd->dev->bus);
 
-   fn = irq_domain_alloc_named_id_fwnode("VMD-MSI", vmd->sysdata.domain);
-   if (!fn)
-   return -ENODEV;
-
-   vmd->irq_domain = pci_msi_create_irq_domain(fn, _msi_domain_info,
-   x86_vector_domain);
-   if (!vmd->irq_domain) {
-   irq_domain_free_fwnode(fn);
-   return -ENODEV;
-   }
+   ret = vmd_create_irq_domain(vmd);
+   if (ret)
+   return ret;
 
pci_add_resource(, >resources[0]);
pci_add_resource_offset(, >resources[1], offset[0]);
@@ -617,13 +637,13 @@ static int vmd_enable_domain(struct vmd_dev *vmd, 
unsigned long features)
   _ops, sd, );
if (!vmd->bus) {
pci_free_resource_list();
-   irq_domain_remove(vmd->irq_domain);
-   irq_domain_free_fwnode(fn);
+   vmd_remove_irq_domain(vmd);
return -ENODEV;
}
 
vmd_attach_resources(vmd);
-   dev_set_msi_domain(>bus->dev, vmd->irq_domain);
+   if (vmd->irq_domain)
+   dev_set_msi_domain(>bus->dev, vmd->irq_domain);
 
pci_scan_child_bus(vmd->bus);
pci_assign_unassigned_bus_resources(vmd->bus);
@@ -732,15 +752,13 @@ static void vmd_cleanup_srcu(struct vmd_dev *vmd)
 static void vmd_remove(struct pci_dev *dev)
 {
struct vmd_dev *vmd = pci_get_drvdata(dev);
-   struct fwnode_handle *fn = vmd->irq_domain->fwnode;
 
sysfs_remove_link(>dev->dev.kobj, "domain");
pci_stop_root_bus(vmd->bus);
pci_remove_root_bus(vmd->bus);
vmd_cleanup_srcu(vmd);
vmd_detach_resources(vmd);
-   irq_domain_remove(vmd->irq_domain);
-   irq_domain_free_fwnode(fn);
+   vmd_remove_irq_domain(vmd);
 }
 
 #ifdef CONFIG_PM_SLEEP
-- 
2.27.0

[PATCH] ext4: Check superblock mapped prior to committing

2018-06-29 Thread Jon Derrick

This patch attempts to close a hole leading to a BUG seen with hot
removals during writes [1].

A block device (NVME namespace in this test case) is formatted to EXT4
without partitions. It's mounted and write I/O is run to a file, then
the device is hot removed from the slot. The superblock attempts to be
written to the drive which is no longer present.

The typical chain of events leading to the BUG:
ext4_commit_super()
  __sync_dirty_buffer()
submit_bh()
  submit_bh_wbc()
BUG_ON(!buffer_mapped(bh));

This fix checks for the superblock's buffer head being mapped prior to
syncing.

[1] https://www.spinics.net/lists/linux-ext4/msg56527.html

Signed-off-by: Jon Derrick 
---
 fs/ext4/super.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0c4c220..ee33233 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4736,6 +4736,14 @@ static int ext4_commit_super(struct super_block *sb, int 
sync)
 
if (!sbh || block_device_ejected(sb))
return error;
+
+   /*
+* The superblock bh should be mapped, but it might not be if the
+* device was hot-removed. Not much we can do but fail the I/O.
+*/
+   if (!buffer_mapped(sbh))
+   return error;
+
/*
 * If the file system is mounted read-only, don't update the
 * superblock write time.  This avoids updating the superblock
-- 
2.7.4

[PATCH v2] PCI: Equalize hotplug memory for non/occupied slots

2018-08-30 Thread Jon Derrick

Currently, a hotplug bridge will be given hpmemsize additional memory if
available, in order to satisfy any future hotplug allocation
requirements.

These calculations don't consider the current memory size of the hotplug
bridge/slot, so hotplug bridges/slots which have downstream devices will
get their current allocation in addition to the hpmemsize value.

This makes for possibly undesirable results with a mix of unoccupied and
occupied slots (ex, with hpmemsize=2M):

02:03.0 PCI bridge: <-- Occupied
Memory behind bridge: d620-d64f [size=3M]
02:04.0 PCI bridge: <-- Unoccupied
Memory behind bridge: d650-d66f [size=2M]

This change considers the current allocation size when using the
hpmemsize parameter to make the reservations predictable for the mix of
unoccupied and occupied slots:

02:03.0 PCI bridge: <-- Occupied
Memory behind bridge: d620-d63f [size=2M]
02:04.0 PCI bridge: <-- Unoccupied
Memory behind bridge: d640-d65f [size=2M]

The calculation for IO (hpiosize) should be similar, but platform
firmwares I've encountered (including QEMU) provide strict allocations
for IO and would not provide free IO resources for hotplug buses
in order to prove this calculation.

Signed-off-by: Jon Derrick 
---
 drivers/pci/setup-bus.c | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 79b1824..70d0aba 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -831,7 +831,8 @@ static resource_size_t calculate_iosize(resource_size_t 
size,
 
 static resource_size_t calculate_memsize(resource_size_t size,
resource_size_t min_size,
-   resource_size_t size1,
+   resource_size_t add_size,
+   resource_size_t children_add_size,
resource_size_t old_size,
resource_size_t align)
 {
@@ -841,7 +842,15 @@ static resource_size_t calculate_memsize(resource_size_t 
size,
old_size = 0;
if (size < old_size)
size = old_size;
-   size = ALIGN(size + size1, align);
+
+   /*
+* Consider the current allocation size when adding size for extra
+* hotplug memory. This ensures that occupied slots don't receive
+* unneccessary memory allocations in addition to their current size.
+* The calculation should be similar for calculate_iosize, but was
+* unable to be tested.
+*/
+   size = ALIGN(max(size, add_size) + children_add_size, align);
return size;
 }
 
@@ -1079,12 +1088,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
 
min_align = calculate_mem_align(aligns, max_order);
min_align = max(min_align, window_alignment(bus, b_res->flags));
-   size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), 
min_align);
+   size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), 
min_align);
add_align = max(min_align, add_align);
-   if (children_add_size > add_size)
-   add_size = children_add_size;
-   size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
-   calculate_memsize(size, min_size, add_size,
+   size1 = (!realloc_head || (realloc_head && !add_size && 
!children_add_size)) ? size0 :
+   calculate_memsize(size, min_size, add_size, children_add_size,
resource_size(b_res), add_align);
if (!size0 && !size1) {
if (b_res->start || b_res->end)
-- 
1.8.3.1

[PATCH v2] PCI hotplug Eq v2

2018-08-30 Thread Jon Derrick

Hi Bjorn,

Sorry for the delay on this one and pushing it after RC1.
Feel free to queue it up for 4.20 if it looks fine.

I've added comments to the git log and source explaining why calculate_iosize
was left unchanged. Basically I could not synthesize a condition where it would
have affected the topology.

v1->v2: Comments

Jon Derrick (1):
  PCI: Equalize hotplug memory for non/occupied slots

 drivers/pci/setup-bus.c | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

-- 
1.8.3.1

[PATCH] PCI/AER: Fix an AER enabling/disabling race

2018-09-01 Thread Jon Derrick

There is a sequence with non-ACPI root ports where the AER driver can
enable error reporting on the tree before port drivers have bound to
ports on the tree. The port driver assumes the AER driver will set up
error reporting afterwards, so instead add a check if error reporting
was set up first.

Example:
[  343.790573] pcieport 1:00:00.0: pci_disable_pcie_error_reporting
[  343.809812] pcieport 1:00:00.0: pci_enable_pcie_error_reporting
[  343.819506] pci 1:01:00.0: pci_enable_pcie_error_reporting
[  343.828814] pci 1:02:00.0: pci_enable_pcie_error_reporting
[  343.838089] pci 1:02:01.0: pci_enable_pcie_error_reporting
[  343.847478] pci 1:02:02.0: pci_enable_pcie_error_reporting
[  343.856659] pci 1:02:03.0: pci_enable_pcie_error_reporting
[  343.865794] pci 1:02:04.0: pci_enable_pcie_error_reporting
[  343.874875] pci 1:02:05.0: pci_enable_pcie_error_reporting
[  343.883918] pci 1:02:06.0: pci_enable_pcie_error_reporting
[  343.892922] pci 1:02:07.0: pci_enable_pcie_error_reporting
[  343.918900] pcieport 1:01:00.0: pci_disable_pcie_error_reporting
[  343.968426] pcieport 1:02:00.0: pci_disable_pcie_error_reporting
[  344.028179] pcieport 1:02:01.0: pci_disable_pcie_error_reporting
[  344.091269] pcieport 1:02:02.0: pci_disable_pcie_error_reporting
[  344.156473] pcieport 1:02:03.0: pci_disable_pcie_error_reporting
[  344.238042] pcieport 1:02:04.0: pci_disable_pcie_error_reporting
[  344.321864] pcieport 1:02:05.0: pci_disable_pcie_error_reporting
[  344.411601] pcieport 1:02:06.0: pci_disable_pcie_error_reporting
[  344.505332] pcieport 1:02:07.0: pci_disable_pcie_error_reporting
[  344.621824] nvme 1:06:00.0: pci_enable_pcie_error_reporting

Signed-off-by: Jon Derrick 
---
 drivers/pci/pcie/aer.c  | 1 +
 drivers/pci/pcie/portdrv_core.c | 5 -
 include/linux/pci.h | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 83180ed..a4e36b6 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1333,6 +1333,7 @@ static int set_device_error_reporting(struct pci_dev 
*dev, void *data)
if (enable)
pcie_set_ecrc_checking(dev);
 
+   dev->aer_configured = 1;
return 0;
 }
 
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 7c37d81..f5de554 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -224,8 +224,11 @@ static int get_port_device_capability(struct pci_dev *dev)
/*
 * Disable AER on this port in case it's been enabled by the
 * BIOS (the AER service driver will enable it when necessary).
+* Don't disable it if the AER service driver has already
+* enabled it from the root port bus walking
 */
-   pci_disable_pcie_error_reporting(dev);
+   if (!dev->aer_configured)
+   pci_disable_pcie_error_reporting(dev);
}
 #endif
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e72ca8d..c071622 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -402,6 +402,7 @@ struct pci_dev {
unsigned inthas_secondary_link:1;
unsigned intnon_compliant_bars:1;   /* Broken BARs; ignore them */
unsigned intis_probed:1;/* Device probing in progress */
+   unsigned intaer_configured:1;   /* AER configured for device */
pci_dev_flags_t dev_flags;
atomic_tenable_cnt; /* pci_enable_device has been called */
 
-- 
1.8.3.1

[PATCH] nvme-pci: Prevent mmio reads if pci channel offline

2019-02-21 Thread Jon Derrick

Some platforms don't seem to easily tolerate non-posted mmio reads on
lost (hot removed) devices. This has been noted in previous
modifications to other layers where an mmio read to a lost device could
cause an undesired firmware intervention [1][2].

This patch reworks the nvme-pci reads to quickly check connectivity
prior to reading the BAR. The intent is to prevent a non-posted mmio
read which would definitely result in an error condition of some sort.
There is, of course, a chance the link becomes disconnected between the
check and the read. Like other similar checks, it is only intended to
reduce the likelihood of encountering these issues and not fully close
the gap.

mmio writes are posted and in the fast path and have been left as-is.

[1] https://lkml.org/lkml/2018/7/30/858
[2] https://lkml.org/lkml/2018/9/18/1546

Signed-off-by: Jon Derrick 
---
 drivers/nvme/host/pci.c | 75 ++---
 1 file changed, 56 insertions(+), 19 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f54718b63637..e555ac2afacd 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1264,6 +1264,33 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 
csts)
 csts, result);
 }
 
+static int nvme_reg_readl(struct nvme_dev *dev, u32 off, u32 *val)
+{
+   if (pci_channel_offline(to_pci_dev(dev->dev)))
+   return -ENODEV;
+
+   *val = readl(dev->bar + off);
+   return 0;
+}
+
+static int nvme_reg_readq(struct nvme_dev *dev, u32 off, u64 *val)
+{
+   if (pci_channel_offline(to_pci_dev(dev->dev)))
+   return -ENODEV;
+
+   *val = readq(dev->bar + off);
+   return 0;
+}
+
+static int nvme_reg_lo_hi_readq(struct nvme_dev *dev, u32 off, u64 *val)
+{
+   if (pci_channel_offline(to_pci_dev(dev->dev)))
+   return -ENODEV;
+
+   *val = lo_hi_readq(dev->bar + off);
+   return 0;
+}
+
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool 
reserved)
 {
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1271,13 +1298,13 @@ static enum blk_eh_timer_return nvme_timeout(struct 
request *req, bool reserved)
struct nvme_dev *dev = nvmeq->dev;
struct request *abort_req;
struct nvme_command cmd;
-   u32 csts = readl(dev->bar + NVME_REG_CSTS);
+   u32 csts;
 
/* If PCI error recovery process is happening, we cannot reset or
 * the recovery mechanism will surely fail.
 */
mb();
-   if (pci_channel_offline(to_pci_dev(dev->dev)))
+   if (nvme_reg_readl(dev, NVME_REG_CSTS, ))
return BLK_EH_RESET_TIMER;
 
/*
@@ -1692,18 +1719,24 @@ static int nvme_remap_bar(struct nvme_dev *dev, 
unsigned long size)
 static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
 {
int result;
-   u32 aqa;
+   u32 csts, vs, aqa;
struct nvme_queue *nvmeq;
 
result = nvme_remap_bar(dev, db_bar_size(dev, 0));
if (result < 0)
return result;
 
-   dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
-   NVME_CAP_NSSRC(dev->ctrl.cap) : 0;
+   result = nvme_reg_readl(dev, NVME_REG_CSTS, );
+   if (result)
+   return result;
+
+   result = nvme_reg_readl(dev, NVME_REG_VS, );
+   if (result)
+   return result;
 
-   if (dev->subsystem &&
-   (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
+   dev->subsystem = (vs >= NVME_VS(1, 1, 0)) ?
+   NVME_CAP_NSSRC(dev->ctrl.cap) : 0;
+   if (dev->subsystem && (csts & NVME_CSTS_NSSRO))
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
 
result = nvme_disable_ctrl(>ctrl, dev->ctrl.cap);
@@ -1808,10 +1841,11 @@ static void nvme_map_cmb(struct nvme_dev *dev)
if (dev->cmb_size)
return;
 
-   dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
-   if (!dev->cmbsz)
+   if (nvme_reg_readl(dev, NVME_REG_CMBSZ, >cmbsz) || !dev->cmbsz)
+   return;
+
+   if (nvme_reg_readl(dev, NVME_REG_CMBLOC, >cmbloc))
return;
-   dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
 
size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
@@ -2357,6 +2391,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
 {
int result = -ENOMEM;
struct pci_dev *pdev = to_pci_dev(dev->dev);
+   u32 csts;
 
if (pci_enable_device_mem(pdev))
return result;
@@ -2367,7 +2402,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
goto disable;
 
-   if (readl(dev-

[PATCH] PCI: Equalize hotplug memory for non/occupied slots

2018-07-25 Thread Jon Derrick

Currently, a hotplug bridge will be given hpmemsize additional memory if
available, in order to satisfy any future hotplug allocation
requirements.

These calculations don't consider the current memory size of the hotplug
bridge/slot, so hotplug bridges/slots which have downstream devices will
get their current allocation in addition to the hpmemsize value.

This makes for possibly undesirable results with a mix of unoccupied and
occupied slots (ex, with hpmemsize=2M):

02:03.0 PCI bridge: <-- Occupied
Memory behind bridge: d620-d64f [size=3M]
02:04.0 PCI bridge: <-- Unoccupied
Memory behind bridge: d650-d66f [size=2M]

This change considers the current allocation size when using the
hpmemsize parameter to make the reservations predictable for the mix of
unoccupied and occupied slots:

02:03.0 PCI bridge: <-- Occupied
Memory behind bridge: d620-d63f [size=2M]
02:04.0 PCI bridge: <-- Unoccupied
Memory behind bridge: d640-d65f [size=2M]

Signed-off-by: Jon Derrick 
---
Original RFC here:
https://patchwork.ozlabs.org/patch/945374/

I split this bit out from the RFC while awaiting the pci string handling
enhancements to handle per-device settings

Changed from RFC is a simpler algo

 drivers/pci/setup-bus.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 79b1824..5ae39e6 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -831,7 +831,8 @@ static resource_size_t calculate_iosize(resource_size_t 
size,
 
 static resource_size_t calculate_memsize(resource_size_t size,
resource_size_t min_size,
-   resource_size_t size1,
+   resource_size_t add_size,
+   resource_size_t children_add_size,
resource_size_t old_size,
resource_size_t align)
 {
@@ -841,7 +842,7 @@ static resource_size_t calculate_memsize(resource_size_t 
size,
old_size = 0;
if (size < old_size)
size = old_size;
-   size = ALIGN(size + size1, align);
+   size = ALIGN(max(size, add_size) + children_add_size, align);
return size;
 }
 
@@ -1079,12 +1080,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
 
min_align = calculate_mem_align(aligns, max_order);
min_align = max(min_align, window_alignment(bus, b_res->flags));
-   size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), 
min_align);
+   size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), 
min_align);
add_align = max(min_align, add_align);
-   if (children_add_size > add_size)
-   add_size = children_add_size;
-   size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
-   calculate_memsize(size, min_size, add_size,
+   size1 = (!realloc_head || (realloc_head && !add_size && 
!children_add_size)) ? size0 :
+   calculate_memsize(size, min_size, add_size, children_add_size,
resource_size(b_res), add_align);
if (!size0 && !size1) {
if (b_res->start || b_res->end)
-- 
1.8.3.1

[PATCH] PCI/portdrv: Enable error reporting on managed ports

2018-09-04 Thread Jon Derrick

During probe, the port driver will disable error reporting and assumes
it will be enabled later by the AER driver's pci_walk_bus() sequence.
This may not be the case for host-bridge enabled root ports, who will
enable first error reporting on the bus during the root port probe, and
then disable error reporting on downstream devices during subsequent
probing of the bus.

A hotplugged port device may also fail to enable error reporting as the
AER driver has already run on the root bus.

Check for these conditions and enable error reporting during portdrv
probing.

Example case:
[  343.790573] pcieport 1:00:00.0: pci_disable_pcie_error_reporting
[  343.809812] pcieport 1:00:00.0: pci_enable_pcie_error_reporting
[  343.819506] pci 1:01:00.0: pci_enable_pcie_error_reporting
[  343.828814] pci 1:02:00.0: pci_enable_pcie_error_reporting
[  343.838089] pci 1:02:01.0: pci_enable_pcie_error_reporting
[  343.847478] pci 1:02:02.0: pci_enable_pcie_error_reporting
[  343.856659] pci 1:02:03.0: pci_enable_pcie_error_reporting
[  343.865794] pci 1:02:04.0: pci_enable_pcie_error_reporting
[  343.874875] pci 1:02:05.0: pci_enable_pcie_error_reporting
[  343.883918] pci 1:02:06.0: pci_enable_pcie_error_reporting
[  343.892922] pci 1:02:07.0: pci_enable_pcie_error_reporting
[  343.918900] pcieport 1:01:00.0: pci_disable_pcie_error_reporting
[  343.968426] pcieport 1:02:00.0: pci_disable_pcie_error_reporting
[  344.028179] pcieport 1:02:01.0: pci_disable_pcie_error_reporting
[  344.091269] pcieport 1:02:02.0: pci_disable_pcie_error_reporting
[  344.156473] pcieport 1:02:03.0: pci_disable_pcie_error_reporting
[  344.238042] pcieport 1:02:04.0: pci_disable_pcie_error_reporting
[  344.321864] pcieport 1:02:05.0: pci_disable_pcie_error_reporting
[  344.411601] pcieport 1:02:06.0: pci_disable_pcie_error_reporting
[  344.505332] pcieport 1:02:07.0: pci_disable_pcie_error_reporting
[  344.621824] nvme 1:06:00.0: pci_enable_pcie_error_reporting

Signed-off-by: Jon Derrick 
---
 drivers/pci/pcie/portdrv_core.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 7c37d81..fdd953a 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -343,6 +343,16 @@ int pcie_port_device_register(struct pci_dev *dev)
if (!nr_service)
goto error_cleanup_irqs;
 
+#ifdef CONFIG_PCIEAER
+   /*
+* Enable error reporting for this port in case AER probing has already
+* run on the root bus or this port device is hot-inserted
+*/
+   if (dev->aer_cap && pci_aer_available() &&
+   (pcie_ports_native || pci_find_host_bridge(dev->bus)->native_aer))
+   pci_enable_pcie_error_reporting(dev);
+#endif
+
return 0;
 
 error_cleanup_irqs:
-- 
1.8.3.1

[PATCH v3] PCI: Equalize hotplug memory and io for non/occupied slots

2018-09-25 Thread Jon Derrick

Currently, a hotplug bridge will be given hpmemsize additional memory
and hpiosize additional io if available, in order to satisfy any future
hotplug allocation requirements.

These calculations don't consider the current memory/io size of the
hotplug bridge/slot, so hotplug bridges/slots which have downstream
devices will be allocated their current allocation in addition to the
hpmemsize value.

This makes for possibly undesirable results with a mix of unoccupied and
occupied slots (ex, with hpmemsize=2M):

02:03.0 PCI bridge: <-- Occupied
Memory behind bridge: d620-d64f [size=3M]
02:04.0 PCI bridge: <-- Unoccupied
Memory behind bridge: d650-d66f [size=2M]

This change considers the current allocation size when using the
hpmemsize/hpiosize parameters to make the reservations predictable for
the mix of unoccupied and occupied slots:

02:03.0 PCI bridge: <-- Occupied
Memory behind bridge: d620-d63f [size=2M]
02:04.0 PCI bridge: <-- Unoccupied
Memory behind bridge: d640-d65f [size=2M]

Signed-off-by: Jon Derrick 
---
v2->v3: Made the IO and mem size calculations nearly equivalent

 drivers/pci/setup-bus.c | 28 +++-
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 79b1824..ed96043 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -811,6 +811,8 @@ static struct resource *find_free_bus_resource(struct 
pci_bus *bus,
 static resource_size_t calculate_iosize(resource_size_t size,
resource_size_t min_size,
resource_size_t size1,
+   resource_size_t add_size,
+   resource_size_t children_add_size,
resource_size_t old_size,
resource_size_t align)
 {
@@ -823,15 +825,18 @@ static resource_size_t calculate_iosize(resource_size_t 
size,
 #if defined(CONFIG_ISA) || defined(CONFIG_EISA)
size = (size & 0xff) + ((size & ~0xffUL) << 2);
 #endif
-   size = ALIGN(size + size1, align);
+   size = size + size1;
if (size < old_size)
size = old_size;
+
+   size = ALIGN(max(size, add_size) + children_add_size, align);
return size;
 }
 
 static resource_size_t calculate_memsize(resource_size_t size,
resource_size_t min_size,
-   resource_size_t size1,
+   resource_size_t add_size,
+   resource_size_t children_add_size,
resource_size_t old_size,
resource_size_t align)
 {
@@ -841,7 +846,8 @@ static resource_size_t calculate_memsize(resource_size_t 
size,
old_size = 0;
if (size < old_size)
size = old_size;
-   size = ALIGN(size + size1, align);
+
+   size = ALIGN(max(size, add_size) + children_add_size, align);
return size;
 }
 
@@ -930,12 +936,10 @@ static void pbus_size_io(struct pci_bus *bus, 
resource_size_t min_size,
}
}
 
-   size0 = calculate_iosize(size, min_size, size1,
+   size0 = calculate_iosize(size, min_size, size1, 0, 0,
resource_size(b_res), min_align);
-   if (children_add_size > add_size)
-   add_size = children_add_size;
-   size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
-   calculate_iosize(size, min_size, add_size + size1,
+   size1 = (!realloc_head || (realloc_head && !add_size && 
!children_add_size)) ? size0 :
+   calculate_iosize(size, min_size, size1, add_size, 
children_add_size,
resource_size(b_res), min_align);
if (!size0 && !size1) {
if (b_res->start || b_res->end)
@@ -1079,12 +1083,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned 
long mask,
 
min_align = calculate_mem_align(aligns, max_order);
min_align = max(min_align, window_alignment(bus, b_res->flags));
-   size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), 
min_align);
+   size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), 
min_align);
add_align = max(min_align, add_align);
-   if (children_add_size > add_size)
-   add_size = children_add_size;
-   size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
-   calculate_memsize(size, min_size, add_size,
+   size1 = (!realloc_head || (realloc_head && !add_size && 
!children_add_size)) ? size0 :
+   calculate_memsize(size, min_size, add_size, children_add_size,
resource_size(b_res), add_align);
if (!size0 && !size1) {
if (b_res->start || b_res->end)
-- 
1.8.3.1

Re: [PATCH] NVMe: Fixed race between nvme_thread & probe path.

2015-06-26 Thread Jon Derrick

> >> + smp_wmb();
> >
> > This has been applied already as an explicit mb()
> 
> Since these structure is only accessible by software, won't smp_wmb()
> sufficient enough?
> 

Seems reasonable to me

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] NVMe: Fixed race between nvme_thread & probe path.

2015-06-18 Thread Jon Derrick

On Thu, Jun 18, 2015 at 04:13:50PM +0530, Parav Pandit wrote:
> Kernel thread nvme_thread and driver load process can be executing
> in parallel on different CPU. This leads to race condition whenever
> nvme_alloc_queue() instructions are executed out of order that can
> reflects incorrect value for nvme_thread.
> Memory barrier in nvme_alloc_queue() ensures that it maintains the
> order and and data dependency read barrier in reader thread ensures
> that cpu cache is synced.
> 
> Signed-off-by: Parav Pandit 
> ---
>  drivers/block/nvme-core.c |   12 ++--
>  1 files changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
> index 5961ed7..90fb0ce 100644
> --- a/drivers/block/nvme-core.c
> +++ b/drivers/block/nvme-core.c
> @@ -1403,8 +1403,10 @@ static struct nvme_queue *nvme_alloc_queue(struct 
> nvme_dev *dev, int qid,
>   nvmeq->q_db = >dbs[qid * 2 * dev->db_stride];
>   nvmeq->q_depth = depth;
>   nvmeq->qid = qid;
> - dev->queue_count++;
>   dev->queues[qid] = nvmeq;
> + /* update queues first before updating queue_count */
> + smp_wmb();
> + dev->queue_count++;
>  
>   return nvmeq;
>  

This has been applied already as an explicit mb()

> @@ -2073,7 +2075,13 @@ static int nvme_kthread(void *data)
>   continue;
>   }
>   for (i = 0; i < dev->queue_count; i++) {
> - struct nvme_queue *nvmeq = dev->queues[i];
> + struct nvme_queue *nvmeq;
> +
> + /* make sure to read queue_count before
> +  * traversing queues.
> +  */
> + smp_read_barrier_depends();
> + nvmeq = dev->queues[i];
>   if (!nvmeq)
>   continue;
>   spin_lock_irq(>q_lock);

I don't think this is necessary. If queue_count is incremented while in this 
loop, it will be picked up the next time the kthread runs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/2] nvme : Use correct scnprintf in cmb show

2016-12-19 Thread Jon Derrick

Looks good. Thanks Stephen.

Reviewed-by Jon Derrick: 

On Fri, Dec 16, 2016 at 11:54:50AM -0700, Stephen Bates wrote:
> Make sure we are using the correct scnprintf in the sysfs show
> function for the CMB.
> 
> Signed-off-by: Stephen Bates 
> ---
>  drivers/nvme/host/pci.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 5e52034..be10860 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -50,7 +50,7 @@
>  #define NVME_AQ_DEPTH256
>  #define SQ_SIZE(depth)   (depth * sizeof(struct nvme_command))
>  #define CQ_SIZE(depth)   (depth * sizeof(struct nvme_completion))
> - 
> +
>  /*
>   * We handle AEN commands ourselves and don't even let the
>   * block layer know about them.
> @@ -1332,7 +1332,7 @@ static ssize_t nvme_cmb_show(struct device *dev,
>  {
>   struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
>  
> - return snprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz  : x%08x\n",
> + return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz  : x%08x\n",
>  ndev->cmbloc, ndev->cmbsz);
>  }
>  static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
> -- 
> 2.1.4
>

Re: [PATCH 2/2] nvme: improve cmb sysfs reporting

2016-12-19 Thread Jon Derrick

Minor nit below

On Fri, Dec 16, 2016 at 11:54:51AM -0700, Stephen Bates wrote:
> Add more information to the NVMe CMB sysfs entry. This includes
> information about the CMB size, location and capabilities.
> 
> Signed-off-by: Stephen Bates 
> ---
>  drivers/nvme/host/pci.c | 31 +--
>  include/linux/nvme.h|  8 
>  2 files changed, 37 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index be10860..1f70630 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -98,6 +98,7 @@ struct nvme_dev {
>   void __iomem *cmb;
>   dma_addr_t cmb_dma_addr;
>   u64 cmb_size;
> + u64 cmb_offset;
>   u32 cmbsz;
>   u32 cmbloc;
>   struct nvme_ctrl ctrl;
> @@ -1326,14 +1327,39 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
>   return ret >= 0 ? 0 : ret;
>  }
>  
> +static const char * const cmb_caps[] = {
> + [NVME_CMB_CAP_SQS] = "SQS",
> + [NVME_CMB_CAP_CQS] = "CQS",
> + [NVME_CMB_CAP_LISTS] = "LISTS",
> + [NVME_CMB_CAP_RDS] = "RDS",
> + [NVME_CMB_CAP_WDS] = "WDS",
> +};
> +
>  static ssize_t nvme_cmb_show(struct device *dev,
>struct device_attribute *attr,
>char *buf)
>  {
>   struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
> + unsigned int i, len = 0;
> +
> + len += scnprintf(buf+len, PAGE_SIZE-len,
> + "cmbloc  : 0x%08x\ncmbsz   : 0x%08x\n\n",
> + ndev->cmbloc, ndev->cmbsz);
> +
> + len += scnprintf(buf+len, PAGE_SIZE-len,
> + "OFFSET   : 0x%016llx\nSIZE : %llu Bytes\n" \
> + "DMA ADDR : %pad\n\n",
> + ndev->cmb_offset, ndev->cmb_size,
> + >cmb_dma_addr);
> +
> + for (i = NVME_CMB_CAP_SQS; i <= NVME_CMB_CAP_WDS; i++)
I'd prefer seeing (i = 0; i < ARRAY_SIZE(..); i++) because it provides
automatic bounds checking against future code.

> + len += scnprintf(buf+len, PAGE_SIZE-len, "%-7s: %s\n",
> +  cmb_caps[i],
> +  ((ndev->cmbsz) & (1< +  "NOT SUPPORTED");
> +
> + return len;
>  
> - return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz  : x%08x\n",
> -ndev->cmbloc, ndev->cmbsz);
>  }
>  static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
>  
> @@ -1376,6 +1402,7 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
>  
>   dev->cmb_dma_addr = dma_addr;
>   dev->cmb_size = size;
> + dev->cmb_offset = offset;
>   return cmb;
>  }
>  
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index fc3c242..cd0d324 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -121,6 +121,14 @@ enum {
>  #define NVME_CMB_CQS(cmbsz)  ((cmbsz) & 0x2)
>  #define NVME_CMB_SQS(cmbsz)  ((cmbsz) & 0x1)
>  
> +enum {
> + NVME_CMB_CAP_SQS = 0,
> + NVME_CMB_CAP_CQS,
> + NVME_CMB_CAP_LISTS,
> + NVME_CMB_CAP_RDS,
> + NVME_CMB_CAP_WDS,
> +};
> +
>  /*
>   * Submission and Completion Queue Entry Sizes for the NVM command set.
>   * (In bytes and specified as a power of two (2^n)).
> -- 
> 2.1.4
>

1 2 >

1 - 100 of 128 matches

Mail list logo