[PATCH 4/6] PCI: vmd: Create IRQ allocation helper
Moves the IRQ allocation and SRCU initialization code to a new helper. No functional changes. Reviewed-by: Andy Shevchenko Signed-off-by: Jon Derrick --- drivers/pci/controller/vmd.c | 94 1 file changed, 53 insertions(+), 41 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 703c48171993..3214d785fa5d 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -528,6 +528,55 @@ static int vmd_get_bus_number_start(struct vmd_dev *vmd) return 0; } +static irqreturn_t vmd_irq(int irq, void *data) +{ + struct vmd_irq_list *irqs = data; + struct vmd_irq *vmdirq; + int idx; + + idx = srcu_read_lock(>srcu); + list_for_each_entry_rcu(vmdirq, >irq_list, node) + generic_handle_irq(vmdirq->virq); + srcu_read_unlock(>srcu, idx); + + return IRQ_HANDLED; +} + +static int vmd_alloc_irqs(struct vmd_dev *vmd) +{ + struct pci_dev *dev = vmd->dev; + int i, err; + + vmd->msix_count = pci_msix_vec_count(dev); + if (vmd->msix_count < 0) + return -ENODEV; + + vmd->msix_count = pci_alloc_irq_vectors(dev, 1, vmd->msix_count, + PCI_IRQ_MSIX); + if (vmd->msix_count < 0) + return vmd->msix_count; + + vmd->irqs = devm_kcalloc(>dev, vmd->msix_count, sizeof(*vmd->irqs), +GFP_KERNEL); + if (!vmd->irqs) + return -ENOMEM; + + for (i = 0; i < vmd->msix_count; i++) { + err = init_srcu_struct(>irqs[i].srcu); + if (err) + return err; + + INIT_LIST_HEAD(>irqs[i].irq_list); + err = devm_request_irq(>dev, pci_irq_vector(dev, i), + vmd_irq, IRQF_NO_THREAD, + "vmd", >irqs[i]); + if (err) + return err; + } + + return 0; +} + static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) { struct pci_sysdata *sd = >sysdata; @@ -663,24 +712,10 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) return 0; } -static irqreturn_t vmd_irq(int irq, void *data) -{ - struct vmd_irq_list *irqs = data; - struct vmd_irq *vmdirq; - int idx; - - idx = srcu_read_lock(>srcu); - list_for_each_entry_rcu(vmdirq, >irq_list, node) - generic_handle_irq(vmdirq->virq); - srcu_read_unlock(>srcu, idx); - - return IRQ_HANDLED; -} - static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) { struct vmd_dev *vmd; - int i, err; + int err; if (resource_size(>resource[VMD_CFGBAR]) < (1 << 20)) return -ENOMEM; @@ -703,32 +738,9 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) dma_set_mask_and_coherent(>dev, DMA_BIT_MASK(32))) return -ENODEV; - vmd->msix_count = pci_msix_vec_count(dev); - if (vmd->msix_count < 0) - return -ENODEV; - - vmd->msix_count = pci_alloc_irq_vectors(dev, 1, vmd->msix_count, - PCI_IRQ_MSIX); - if (vmd->msix_count < 0) - return vmd->msix_count; - - vmd->irqs = devm_kcalloc(>dev, vmd->msix_count, sizeof(*vmd->irqs), -GFP_KERNEL); - if (!vmd->irqs) - return -ENOMEM; - - for (i = 0; i < vmd->msix_count; i++) { - err = init_srcu_struct(>irqs[i].srcu); - if (err) - return err; - - INIT_LIST_HEAD(>irqs[i].irq_list); - err = devm_request_irq(>dev, pci_irq_vector(dev, i), - vmd_irq, IRQF_NO_THREAD, - "vmd", >irqs[i]); - if (err) - return err; - } + err = vmd_alloc_irqs(vmd); + if (err) + return err; spin_lock_init(>cfg_lock); pci_set_drvdata(dev, vmd); -- 2.27.0
[PATCH 3/6] PCI: vmd: Create IRQ Domain configuration helper
Moves the IRQ and MSI Domain configuration code to new helpers. No functional changes. Reviewed-by: Andy Shevchenko Signed-off-by: Jon Derrick --- drivers/pci/controller/vmd.c | 52 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index a462719af12a..703c48171993 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -298,6 +298,34 @@ static struct msi_domain_info vmd_msi_domain_info = { .chip = _msi_controller, }; +static int vmd_create_irq_domain(struct vmd_dev *vmd) +{ + struct fwnode_handle *fn; + + fn = irq_domain_alloc_named_id_fwnode("VMD-MSI", vmd->sysdata.domain); + if (!fn) + return -ENODEV; + + vmd->irq_domain = pci_msi_create_irq_domain(fn, _msi_domain_info, + x86_vector_domain); + if (!vmd->irq_domain) { + irq_domain_free_fwnode(fn); + return -ENODEV; + } + + return 0; +} + +static void vmd_remove_irq_domain(struct vmd_dev *vmd) +{ + if (vmd->irq_domain) { + struct fwnode_handle *fn = vmd->irq_domain->fwnode; + + irq_domain_remove(vmd->irq_domain); + irq_domain_free_fwnode(fn); + } +} + static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus, unsigned int devfn, int reg, int len) { @@ -503,7 +531,6 @@ static int vmd_get_bus_number_start(struct vmd_dev *vmd) static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) { struct pci_sysdata *sd = >sysdata; - struct fwnode_handle *fn; struct resource *res; u32 upper_bits; unsigned long flags; @@ -598,16 +625,9 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) sd->node = pcibus_to_node(vmd->dev->bus); - fn = irq_domain_alloc_named_id_fwnode("VMD-MSI", vmd->sysdata.domain); - if (!fn) - return -ENODEV; - - vmd->irq_domain = pci_msi_create_irq_domain(fn, _msi_domain_info, - x86_vector_domain); - if (!vmd->irq_domain) { - irq_domain_free_fwnode(fn); - return -ENODEV; - } + ret = vmd_create_irq_domain(vmd); + if (ret) + return ret; pci_add_resource(, >resources[0]); pci_add_resource_offset(, >resources[1], offset[0]); @@ -617,13 +637,13 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) _ops, sd, ); if (!vmd->bus) { pci_free_resource_list(); - irq_domain_remove(vmd->irq_domain); - irq_domain_free_fwnode(fn); + vmd_remove_irq_domain(vmd); return -ENODEV; } vmd_attach_resources(vmd); - dev_set_msi_domain(>bus->dev, vmd->irq_domain); + if (vmd->irq_domain) + dev_set_msi_domain(>bus->dev, vmd->irq_domain); pci_scan_child_bus(vmd->bus); pci_assign_unassigned_bus_resources(vmd->bus); @@ -732,15 +752,13 @@ static void vmd_cleanup_srcu(struct vmd_dev *vmd) static void vmd_remove(struct pci_dev *dev) { struct vmd_dev *vmd = pci_get_drvdata(dev); - struct fwnode_handle *fn = vmd->irq_domain->fwnode; sysfs_remove_link(>dev->dev.kobj, "domain"); pci_stop_root_bus(vmd->bus); pci_remove_root_bus(vmd->bus); vmd_cleanup_srcu(vmd); vmd_detach_resources(vmd); - irq_domain_remove(vmd->irq_domain); - irq_domain_free_fwnode(fn); + vmd_remove_irq_domain(vmd); } #ifdef CONFIG_PM_SLEEP -- 2.27.0
[PATCH 6/6] PCI: vmd: Disable MSI/X remapping when possible
VMD will retransmit child device MSI/X using its own MSI/X table and requester-id. This limits the number of MSI/X available to the whole child device domain to the number of VMD MSI/X interrupts. Some VMD devices have a mode where this remapping can be disabled, allowing child device interrupts to bypass processing with the VMD MSI/X domain interrupt handler and going straight the child device interrupt handler, allowing for better performance and scaling. The requester-id still gets changed to the VMD endpoint's requester-id, and the interrupt remapping handlers have been updated to properly set IRTE for child device interrupts to the VMD endpoint's context. Some VMD platforms have existing production BIOS which rely on MSI/X remapping and won't explicitly program the MSI/X remapping bit. This re-enables MSI/X remapping on unload. Disabling MSI/X remapping is only available for Icelake Server and client VMD products. Reviewed-by: Andy Shevchenko Signed-off-by: Jon Derrick --- drivers/pci/controller/vmd.c | 58 +++- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 3214d785fa5d..e8cde2c390b9 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -53,6 +53,12 @@ enum vmd_features { * vendor-specific capability space */ VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP= (1 << 2), + + /* +* Device remaps MSI/X transactions into its MSI/X table and requires +* VMD MSI domain for child device interrupt handling +*/ + VMD_FEAT_REMAPS_MSI = (1 << 3), }; /* @@ -298,6 +304,15 @@ static struct msi_domain_info vmd_msi_domain_info = { .chip = _msi_controller, }; +static void vmd_enable_msi_remapping(struct vmd_dev *vmd, bool enable) +{ + u16 reg; + + pci_read_config_word(vmd->dev, PCI_REG_VMCONFIG, ); + reg = enable ? (reg & ~0x2) : (reg | 0x2); + pci_write_config_word(vmd->dev, PCI_REG_VMCONFIG, reg); +} + static int vmd_create_irq_domain(struct vmd_dev *vmd) { struct fwnode_handle *fn; @@ -318,6 +333,13 @@ static int vmd_create_irq_domain(struct vmd_dev *vmd) static void vmd_remove_irq_domain(struct vmd_dev *vmd) { + /* +* Some production BIOS won't enable remapping between soft reboots. +* Ensure remapping is restored before unloading the driver +*/ + if (!vmd->msix_count) + vmd_enable_msi_remapping(vmd, true); + if (vmd->irq_domain) { struct fwnode_handle *fn = vmd->irq_domain->fwnode; @@ -606,6 +628,27 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) return ret; } + /* +* Currently MSI remapping must be enabled in guest passthrough mode +* due to some missing interrupt remapping plumbing. This is probably +* acceptable because the guest is usually CPU-limited and MSI +* remapping doesn't become a performance bottleneck. +*/ + if (features & VMD_FEAT_REMAPS_MSI || offset[0] || offset[1]) { + ret = vmd_alloc_irqs(vmd); + if (ret) + return ret; + } + + /* +* Disable remapping for performance if possible based on if VMD IRQs +* had been allocated. +*/ + if (vmd->msix_count) + vmd_enable_msi_remapping(vmd, true); + else + vmd_enable_msi_remapping(vmd, false); + /* * Certain VMD devices may have a root port configuration option which * limits the bus range to between 0-127, 128-255, or 224-255 @@ -674,9 +717,11 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) sd->node = pcibus_to_node(vmd->dev->bus); - ret = vmd_create_irq_domain(vmd); - if (ret) - return ret; + if (vmd->msix_count) { + ret = vmd_create_irq_domain(vmd); + if (ret) + return ret; + } pci_add_resource(, >resources[0]); pci_add_resource_offset(, >resources[1], offset[0]); @@ -738,10 +783,6 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) dma_set_mask_and_coherent(>dev, DMA_BIT_MASK(32))) return -ENODEV; - err = vmd_alloc_irqs(vmd); - if (err) - return err; - spin_lock_init(>cfg_lock); pci_set_drvdata(dev, vmd); err = vmd_enable_domain(vmd, (unsigned long) id->driver_data); @@ -809,7 +850,8 @@ static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops, vmd_suspend, vmd_resume); static const struct pci_device_id vmd_ids[] = { {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_201D), - .driver_dat
[PATCH 2/6] PCI: vmd: Create bus offset configuration helper
Moves the bus offset configuration discovery code to a new helper. Modifies the bus offset 2-bit decode switch to have a 0 case and a default error case, just in case the field is expanded in future hardware. Reviewed-by: Andy Shevchenko Signed-off-by: Jon Derrick --- drivers/pci/controller/vmd.c | 53 ++-- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 44b2db789eff..a462719af12a 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -471,6 +471,35 @@ static int vmd_get_phys_offsets(struct vmd_dev *vmd, bool native_hint, return 0; } +static int vmd_get_bus_number_start(struct vmd_dev *vmd) +{ + struct pci_dev *dev = vmd->dev; + u16 reg; + + pci_read_config_word(dev, PCI_REG_VMCAP, ); + if (BUS_RESTRICT_CAP(reg)) { + pci_read_config_word(dev, PCI_REG_VMCONFIG, ); + + switch (BUS_RESTRICT_CFG(reg)) { + case 0: + vmd->busn_start = 0; + break; + case 1: + vmd->busn_start = 128; + break; + case 2: + vmd->busn_start = 224; + break; + default: + pci_err(dev, "Unknown Bus Offset Setting (%d)\n", + BUS_RESTRICT_CFG(reg)); + return -ENODEV; + } + } + + return 0; +} + static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) { struct pci_sysdata *sd = >sysdata; @@ -506,27 +535,9 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) * limits the bus range to between 0-127, 128-255, or 224-255 */ if (features & VMD_FEAT_HAS_BUS_RESTRICTIONS) { - u16 reg16; - - pci_read_config_word(vmd->dev, PCI_REG_VMCAP, ); - if (BUS_RESTRICT_CAP(reg16)) { - pci_read_config_word(vmd->dev, PCI_REG_VMCONFIG, -); - - switch (BUS_RESTRICT_CFG(reg16)) { - case 1: - vmd->busn_start = 128; - break; - case 2: - vmd->busn_start = 224; - break; - case 3: - pci_err(vmd->dev, "Unknown Bus Offset Setting\n"); - return -ENODEV; - default: - break; - } - } + ret = vmd_get_bus_number_start(vmd); + if (ret) + return ret; } res = >dev->resource[VMD_CFGBAR]; -- 2.27.0
[PATCH 0/6] VMD MSI Remapping Bypass
The Intel Volume Management Device acts similar to a PCI-to-PCI bridge in that it changes downstream devices' requester-ids to its own. As VMD supports PCIe devices, it has its own MSI/X table and transmits child device MSI/X by remapping child device MSI/X and handling like a demultiplexer. Some newer VMD devices (Icelake Server and client) have an option to bypass the VMD MSI/X remapping table. This allows for better performance scaling as the child device MSI/X won't be limited by VMD's MSI/X count and IRQ handler. It's expected that most users don't want MSI/X remapping when they can get better performance without this limitation. This set includes some long overdue cleanup of overgrown VMD code and introduces the MSI/X remapping disable. Applies on top of e3beca48a45b ("irqdomain/treewide: Keep firmware node unconditionally allocated") and ec0160891e38 ("irqdomain/treewide: Free firmware node after domain removal") in tip/urgent Jon Derrick (6): PCI: vmd: Create physical offset helper PCI: vmd: Create bus offset configuration helper PCI: vmd: Create IRQ Domain configuration helper PCI: vmd: Create IRQ allocation helper x86/apic/msi: Use Real PCI DMA device when configuring IRTE PCI: vmd: Disable MSI/X remapping when possible arch/x86/kernel/apic/msi.c | 2 +- drivers/pci/controller/vmd.c | 344 +++ 2 files changed, 224 insertions(+), 122 deletions(-) base-commit: ec0160891e387f4771f953b888b1fe951398e5d9 -- 2.27.0
[PATCH 5/6] x86/apic/msi: Use Real PCI DMA device when configuring IRTE
VMD retransmits child device MSI/X with the VMD endpoint's requester-id. In order to support direct interrupt remapping of VMD child devices, ensure that the IRTE is programmed with the VMD endpoint's requester-id using pci_real_dma_dev(). Reviewed-by: Andy Shevchenko Signed-off-by: Jon Derrick --- arch/x86/kernel/apic/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index c2b2911feeef..7ca271b8d891 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -189,7 +189,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) init_irq_alloc_info(, NULL); info.type = X86_IRQ_ALLOC_TYPE_MSI; - info.msi_dev = dev; + info.msi_dev = pci_real_dma_dev(dev); domain = irq_remapping_get_irq_domain(); if (domain == NULL) -- 2.27.0
[PATCH 1/6] PCI: vmd: Create physical offset helper
Moves the guest-passthrough physical offset discovery code to a new helper. No functional changes. Reviewed-by: Andy Shevchenko Signed-off-by: Jon Derrick --- drivers/pci/controller/vmd.c | 105 +-- 1 file changed, 62 insertions(+), 43 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index f69ef8c89f72..44b2db789eff 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -417,6 +417,60 @@ static int vmd_find_free_domain(void) return domain + 1; } +static int vmd_get_phys_offsets(struct vmd_dev *vmd, bool native_hint, + resource_size_t *offset1, + resource_size_t *offset2) +{ + struct pci_dev *dev = vmd->dev; + u64 phys1, phys2; + + if (native_hint) { + u32 vmlock; + int ret; + + ret = pci_read_config_dword(dev, PCI_REG_VMLOCK, ); + if (ret || vmlock == ~0) + return -ENODEV; + + if (MB2_SHADOW_EN(vmlock)) { + void __iomem *membar2; + + membar2 = pci_iomap(dev, VMD_MEMBAR2, 0); + if (!membar2) + return -ENOMEM; + phys1 = readq(membar2 + MB2_SHADOW_OFFSET); + phys2 = readq(membar2 + MB2_SHADOW_OFFSET + 8); + pci_iounmap(dev, membar2); + } else + return 0; + } else { + /* Hypervisor-Emulated Vendor-Specific Capability */ + int pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); + u32 reg, regu; + + pci_read_config_dword(dev, pos + 4, ); + + /* "SHDW" */ + if (pos && reg == 0x53484457) { + pci_read_config_dword(dev, pos + 8, ); + pci_read_config_dword(dev, pos + 12, ); + phys1 = (u64) regu << 32 | reg; + + pci_read_config_dword(dev, pos + 16, ); + pci_read_config_dword(dev, pos + 20, ); + phys2 = (u64) regu << 32 | reg; + } else + return 0; + } + + *offset1 = dev->resource[VMD_MEMBAR1].start - + (phys1 & PCI_BASE_ADDRESS_MEM_MASK); + *offset2 = dev->resource[VMD_MEMBAR2].start - + (phys2 & PCI_BASE_ADDRESS_MEM_MASK); + + return 0; +} + static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) { struct pci_sysdata *sd = >sysdata; @@ -428,6 +482,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) resource_size_t offset[2] = {0}; resource_size_t membar2_offset = 0x2000; struct pci_bus *child; + int ret; /* * Shadow registers may exist in certain VMD device ids which allow @@ -436,50 +491,14 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) * or 0, depending on an enable bit in the VMD device. */ if (features & VMD_FEAT_HAS_MEMBAR_SHADOW) { - u32 vmlock; - int ret; - membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE; - ret = pci_read_config_dword(vmd->dev, PCI_REG_VMLOCK, ); - if (ret || vmlock == ~0) - return -ENODEV; - - if (MB2_SHADOW_EN(vmlock)) { - void __iomem *membar2; - - membar2 = pci_iomap(vmd->dev, VMD_MEMBAR2, 0); - if (!membar2) - return -ENOMEM; - offset[0] = vmd->dev->resource[VMD_MEMBAR1].start - - (readq(membar2 + MB2_SHADOW_OFFSET) & -PCI_BASE_ADDRESS_MEM_MASK); - offset[1] = vmd->dev->resource[VMD_MEMBAR2].start - - (readq(membar2 + MB2_SHADOW_OFFSET + 8) & -PCI_BASE_ADDRESS_MEM_MASK); - pci_iounmap(vmd->dev, membar2); - } - } - - if (features & VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP) { - int pos = pci_find_capability(vmd->dev, PCI_CAP_ID_VNDR); - u32 reg, regu; - - pci_read_config_dword(vmd->dev, pos + 4, ); - - /* "SHDW" */ - if (pos && reg == 0x53484457) { - pci_read_config_dword(vmd->dev, pos + 8, ); - pci_read_config_dword(vmd->dev, pos + 12, ); - offset[0] = vmd->dev->resource[VMD_MEMBAR1].start - -
[tip: irq/urgent] irqdomain/treewide: Free firmware node after domain removal
The following commit has been merged into the irq/urgent branch of tip: Commit-ID: ec0160891e387f4771f953b888b1fe951398e5d9 Gitweb: https://git.kernel.org/tip/ec0160891e387f4771f953b888b1fe951398e5d9 Author:Jon Derrick AuthorDate:Tue, 21 Jul 2020 14:26:09 -06:00 Committer: Thomas Gleixner CommitterDate: Thu, 23 Jul 2020 00:08:52 +02:00 irqdomain/treewide: Free firmware node after domain removal Commit 711419e504eb ("irqdomain: Add the missing assignment of domain->fwnode for named fwnode") unintentionally caused a dangling pointer page fault issue on firmware nodes that were freed after IRQ domain allocation. Commit e3beca48a45b fixed that dangling pointer issue by only freeing the firmware node after an IRQ domain allocation failure. That fix no longer frees the firmware node immediately, but leaves the firmware node allocated after the domain is removed. The firmware node must be kept around through irq_domain_remove, but should be freed it afterwards. Add the missing free operations after domain removal where where appropriate. Fixes: e3beca48a45b ("irqdomain/treewide: Keep firmware node unconditionally allocated") Signed-off-by: Jon Derrick Signed-off-by: Thomas Gleixner Reviewed-by: Andy Shevchenko Acked-by: Bjorn Helgaas# drivers/pci Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/1595363169-7157-1-git-send-email-jonathan.derr...@intel.com --- arch/mips/pci/pci-xtalk-bridge.c| 3 +++ arch/x86/kernel/apic/io_apic.c | 5 + drivers/iommu/intel/irq_remapping.c | 8 drivers/mfd/ioc3.c | 6 ++ drivers/pci/controller/vmd.c| 3 +++ 5 files changed, 25 insertions(+) diff --git a/arch/mips/pci/pci-xtalk-bridge.c b/arch/mips/pci/pci-xtalk-bridge.c index 5958217..9b3cc77 100644 --- a/arch/mips/pci/pci-xtalk-bridge.c +++ b/arch/mips/pci/pci-xtalk-bridge.c @@ -728,6 +728,7 @@ err_free_resource: pci_free_resource_list(>windows); err_remove_domain: irq_domain_remove(domain); + irq_domain_free_fwnode(fn); return err; } @@ -735,8 +736,10 @@ static int bridge_remove(struct platform_device *pdev) { struct pci_bus *bus = platform_get_drvdata(pdev); struct bridge_controller *bc = BRIDGE_CONTROLLER(bus); + struct fwnode_handle *fn = bc->domain->fwnode; irq_domain_remove(bc->domain); + irq_domain_free_fwnode(fn); pci_lock_rescan_remove(); pci_stop_root_bus(bus); pci_remove_root_bus(bus); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 81ffcfb..21325a4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2335,8 +2335,13 @@ static int mp_irqdomain_create(int ioapic) static void ioapic_destroy_irqdomain(int idx) { + struct ioapic_domain_cfg *cfg = [idx].irqdomain_cfg; + struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode; + if (ioapics[idx].irqdomain) { irq_domain_remove(ioapics[idx].irqdomain); + if (!cfg->dev) + irq_domain_free_fwnode(fn); ioapics[idx].irqdomain = NULL; } } diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 9564d23..aa096b3 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -628,13 +628,21 @@ out_free_table: static void intel_teardown_irq_remapping(struct intel_iommu *iommu) { + struct fwnode_handle *fn; + if (iommu && iommu->ir_table) { if (iommu->ir_msi_domain) { + fn = iommu->ir_msi_domain->fwnode; + irq_domain_remove(iommu->ir_msi_domain); + irq_domain_free_fwnode(fn); iommu->ir_msi_domain = NULL; } if (iommu->ir_domain) { + fn = iommu->ir_domain->fwnode; + irq_domain_remove(iommu->ir_domain); + irq_domain_free_fwnode(fn); iommu->ir_domain = NULL; } free_pages((unsigned long)iommu->ir_table->base, diff --git a/drivers/mfd/ioc3.c b/drivers/mfd/ioc3.c index 74cee7c..d939ccc 100644 --- a/drivers/mfd/ioc3.c +++ b/drivers/mfd/ioc3.c @@ -616,7 +616,10 @@ static int ioc3_mfd_probe(struct pci_dev *pdev, /* Remove all already added MFD devices */ mfd_remove_devices(>pdev->dev); if (ipd->domain) { + struct fwnode_handle *fn = ipd->domain->fwnode; + irq_domain_remove(ipd->domain); + irq_domain_free_fwnode(fn); free_irq(ipd->domain_irq, (void *)ipd); } pci_iounmap(pdev, regs); @@ -643,7 +646,10 @@ stati
[PATCH] irqdomain/treewide: Free firmware node after domain removal
Change 711419e504eb ("irqdomain: Add the missing assignment of domain->fwnode for named fwnode") unintentionally caused a dangling pointer page fault issue on firmware nodes that were freed after IRQ domain allocation. Change e3beca48a45b fixed that dangling pointer issue by only freeing the firmware node after an IRQ domain allocation failure. That fix no longer frees the firmware node immediately, but leaves the firmware node allocated after the domain is removed. We need to keep the firmware node through irq_domain_remove, but should free it afterwards. This patch saves the handle and adds the freeing of firmware node after domain removal where appropriate. Fixes: e3beca48a45b ("irqdomain/treewide: Keep firmware node unconditionally allocated") Reviewed-by: Andy Shevchenko Signed-off-by: Jon Derrick Cc: sta...@vger.kernel.org --- arch/mips/pci/pci-xtalk-bridge.c| 3 +++ arch/x86/kernel/apic/io_apic.c | 5 + drivers/iommu/intel/irq_remapping.c | 8 drivers/mfd/ioc3.c | 6 ++ drivers/pci/controller/vmd.c| 3 +++ 5 files changed, 25 insertions(+) diff --git a/arch/mips/pci/pci-xtalk-bridge.c b/arch/mips/pci/pci-xtalk-bridge.c index 5958217..9b3cc77 100644 --- a/arch/mips/pci/pci-xtalk-bridge.c +++ b/arch/mips/pci/pci-xtalk-bridge.c @@ -728,6 +728,7 @@ static int bridge_probe(struct platform_device *pdev) pci_free_resource_list(>windows); err_remove_domain: irq_domain_remove(domain); + irq_domain_free_fwnode(fn); return err; } @@ -735,8 +736,10 @@ static int bridge_remove(struct platform_device *pdev) { struct pci_bus *bus = platform_get_drvdata(pdev); struct bridge_controller *bc = BRIDGE_CONTROLLER(bus); + struct fwnode_handle *fn = bc->domain->fwnode; irq_domain_remove(bc->domain); + irq_domain_free_fwnode(fn); pci_lock_rescan_remove(); pci_stop_root_bus(bus); pci_remove_root_bus(bus); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 81ffcfb..21325a4a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2335,8 +2335,13 @@ static int mp_irqdomain_create(int ioapic) static void ioapic_destroy_irqdomain(int idx) { + struct ioapic_domain_cfg *cfg = [idx].irqdomain_cfg; + struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode; + if (ioapics[idx].irqdomain) { irq_domain_remove(ioapics[idx].irqdomain); + if (!cfg->dev) + irq_domain_free_fwnode(fn); ioapics[idx].irqdomain = NULL; } } diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 9564d23..aa096b3 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -628,13 +628,21 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) static void intel_teardown_irq_remapping(struct intel_iommu *iommu) { + struct fwnode_handle *fn; + if (iommu && iommu->ir_table) { if (iommu->ir_msi_domain) { + fn = iommu->ir_msi_domain->fwnode; + irq_domain_remove(iommu->ir_msi_domain); + irq_domain_free_fwnode(fn); iommu->ir_msi_domain = NULL; } if (iommu->ir_domain) { + fn = iommu->ir_domain->fwnode; + irq_domain_remove(iommu->ir_domain); + irq_domain_free_fwnode(fn); iommu->ir_domain = NULL; } free_pages((unsigned long)iommu->ir_table->base, diff --git a/drivers/mfd/ioc3.c b/drivers/mfd/ioc3.c index 74cee7c..d939ccc 100644 --- a/drivers/mfd/ioc3.c +++ b/drivers/mfd/ioc3.c @@ -616,7 +616,10 @@ static int ioc3_mfd_probe(struct pci_dev *pdev, /* Remove all already added MFD devices */ mfd_remove_devices(>pdev->dev); if (ipd->domain) { + struct fwnode_handle *fn = ipd->domain->fwnode; + irq_domain_remove(ipd->domain); + irq_domain_free_fwnode(fn); free_irq(ipd->domain_irq, (void *)ipd); } pci_iounmap(pdev, regs); @@ -643,7 +646,10 @@ static void ioc3_mfd_remove(struct pci_dev *pdev) /* Release resources */ mfd_remove_devices(>pdev->dev); if (ipd->domain) { + struct fwnode_handle *fn = ipd->domain->fwnode; + irq_domain_remove(ipd->domain); + irq_domain_free_fwnode(fn); free_irq(ipd->domain_irq, (void *)ipd); } pci_iounmap(pdev, ipd->regs); diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller
[PATCH v3 1/2] PCI/AER: Use _OSC to determine Firmware First before HEST
After a5bf8719af: "PCI/AER: Use only _OSC to determine AER ownership", _OSC is the primary determiner of ownership of Firmware First error handling rather than HEST. ACPI Root Bus enumeration has been modified to flag Host Bridge devices as using Native AER when _OSC has been negotiated for AER services. This patch ensures the PCI layers first uses the _OSC negotiated state by checking the Host Bridge's Native AER flag prior to HEST parsing. Signed-off-by: Jon Derrick --- drivers/pci/pcie/aer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index efc2677..f3d02f4 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -314,6 +314,9 @@ int pcie_aer_get_firmware_first(struct pci_dev *dev) if (pcie_ports_native) return 0; + if (pci_find_host_bridge(dev->bus)->native_aer) + return 0; + if (!dev->__aer_firmware_first_valid) aer_set_firmware_first(dev); return dev->__aer_firmware_first; -- 1.8.3.1
[PATCH v3 0/2] PCI/ERR: Allow Native AER/DPC using _OSC
Hi Bjorn & Kuppuswamy, I see a problem in the DPC ECN [1] to _OSC in that it doesn't give us a way to determine if firmware supports _OSC DPC negotation, and therefore how to handle DPC. Here is the wording of the ECN that implies that Firmware without _OSC DPC negotiation support should have the OSPM rely on _OSC AER negotiation when determining DPC control: PCIe Base Specification suggests that Downstream Port Containment may be controlled either by the Firmware or the Operating System. It also suggests that the Firmware retain ownership of Downstream Port Containment if it also owns AER. When the Firmware owns Downstream Port Containment, it is expected to use the new "Error Disconnect Recover" notification to alert OSPM of a Downstream Port Containment event. In legacy platforms, as bits in _OSC are reserved prior to implementation, ACPI Root Bus enumeration will mark these Host Bridges as without Native DPC support, even though the specification implies it's expected that AER _OSC negotiation determines DPC control for these platforms. There seems to be a need for a way to determine if the DPC control bit in _OSC is supported and fallback on AER otherwise. Currently portdrv assumes DPC control if the port has Native AER services: static int get_port_device_capability(struct pci_dev *dev) ... if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DPC) && pci_aer_available() && (pcie_ports_dpc_native || (services & PCIE_PORT_SERVICE_AER))) services |= PCIE_PORT_SERVICE_DPC; Newer firmware may not grant OSPM DPC control, if for instance, it expects to use Error Disconnect Recovery. However it looks like ACPI will use DPC services via the EDR driver, without binding the full DPC port service driver. If we change portdrv to probe based on host->native_dpc and not AER, then we break instances with legacy firmware where OSPM will clear host->native_dpc solely due to _OSC bits being reserved: struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root, ... if (!(root->osc_control_set & OSC_PCI_EXPRESS_DPC_CONTROL)) host_bridge->native_dpc = 0; So my assumption instead is that host->native_dpc can be 0 and expect Native DPC services if AER is used. In other words, if and only if DPC probe is invoked from portdrv, then it needs to rely on the AER dependency. Otherwise it should be assumed that ACPI set up DPC via EDR. This covers legacy firmware. However it seems like that could be trouble with newer firmware that might give OSPM control of AER but not DPC, and would result in both Native DPC and EDR being in effect. Anyways here are two patches that give control of AER and DPC on the results of _OSC. They don't mess with the HEST parser as I expect those to be removed at some point. I need these for VMD support which doesn't even rely on _OSC, but I suspect this won't be the last effort as we detangle Firmware First. [1] https://members.pcisig.com/wg/PCI-SIG/document/12888 Jon Derrick (2): PCI/AER: Use _OSC to determine Firmware First before HEST PCI/DPC: Use _OSC to determine DPC support drivers/pci/pcie/aer.c | 3 +++ drivers/pci/pcie/dpc.c | 3 --- drivers/pci/pcie/portdrv_core.c | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) -- 1.8.3.1
[PATCH v3 2/2] PCI/DPC: Use _OSC to determine DPC support
After a5bf8719af: "PCI/AER: Use only _OSC to determine AER ownership", _OSC is the primary determiner of ownership of Firmware First error handling rather than HEST. With the addition of DPC to _OSC [1], OSPM is able to negotiate DPC services from Firmware. ACPI Root Bus enumeration sets the Host Bridge's Native DPC flag on negotiation of those service. This patch changes DPC probing to check DPC control as determined by _OSC, by checking the Host Bridge's Native DPC member. As most older platforms won't have DPC negotiable by _OSC, this patch doesn't attempt to change behavior that assumes if OSPM has negotiated AER by _OSC, OSPM will also want DPC control. [1] https://members.pcisig.com/wg/PCI-SIG/document/12888 Signed-off-by: Jon Derrick --- drivers/pci/pcie/dpc.c | 3 --- drivers/pci/pcie/portdrv_core.c | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index 7621704..9104929 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -284,9 +284,6 @@ static int dpc_probe(struct pcie_device *dev) int status; u16 ctl, cap; - if (pcie_aer_get_firmware_first(pdev) && !pcie_ports_dpc_native) - return -ENOTSUPP; - status = devm_request_threaded_irq(device, dev->irq, dpc_irq, dpc_handler, IRQF_SHARED, "pcie-dpc", pdev); diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 50a9522..f2139a1 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -256,7 +256,8 @@ static int get_port_device_capability(struct pci_dev *dev) */ if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DPC) && pci_aer_available() && - (pcie_ports_dpc_native || (services & PCIE_PORT_SERVICE_AER))) + (pcie_ports_dpc_native || host->native_dpc || +(services & PCIE_PORT_SERVICE_AER))) services |= PCIE_PORT_SERVICE_DPC; if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM || -- 1.8.3.1
[PATCH] PCI: vmd: Expose oob management window to users
Some VMD devices provide a management window within MEMBAR2 that is used to communicate out-of-band with child devices. This patch creates a binary file for interacting with the interface. OOB Reads/Writes are bounds-checked by sysfs_fs_bin_{read,write} Signed-off-by: Jon Derrick --- Depends on https://lore.kernel.org/linux-pci/20190916135435.5017-1-jonathan.derr...@intel.com/T/#t drivers/pci/controller/vmd.c | 128 --- 1 file changed, 117 insertions(+), 11 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index a35d3f3996d7..b13954cf9c96 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -33,6 +33,8 @@ #define MB2_SHADOW_OFFSET 0x2000 #define MB2_SHADOW_SIZE16 +#define MB2_OOB_WINDOW_OFFSET 0x2010 +#define MB2_OOB_WINDOW_SIZE128 enum vmd_features { /* @@ -47,6 +49,12 @@ enum vmd_features { * bus numbering */ VMD_FEAT_HAS_BUS_RESTRICTIONS = (1 << 1), + + /* +* Device may provide an out-of-band management interface through a +* read/write window +*/ + VMD_FEAT_HAS_OOB_WINDOW = (1 << 2), }; /* @@ -101,6 +109,10 @@ struct vmd_dev { struct dma_map_ops dma_ops; struct dma_domain dma_domain; + + spinlock_t oob_lock; + char __iomem*oob_addr; + struct bin_attribute*oob_attr; }; static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus) @@ -543,6 +555,68 @@ static void vmd_detach_resources(struct vmd_dev *vmd) vmd->dev->resource[VMD_MEMBAR2].child = NULL; } +static ssize_t vmd_oob_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + struct vmd_dev *vmd = attr->private; + unsigned long flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + spin_lock_irqsave(>oob_lock, flags); + memcpy_fromio([off], >oob_addr[off], count); + spin_unlock_irqrestore(>oob_lock, flags); + + return count; +} + +static ssize_t vmd_oob_write(struct file *filp, struct kobject *kobj, +struct bin_attribute *attr, char *buf, +loff_t off, size_t count) +{ + struct vmd_dev *vmd = attr->private; + unsigned long flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + spin_lock_irqsave(>oob_lock, flags); + memcpy_toio(>oob_addr[off], [off], count); + spin_unlock_irqrestore(>oob_lock, flags); + + return count; +} + +static int vmd_create_oob_file(struct vmd_dev *vmd) +{ + struct pci_dev *dev = vmd->dev; + struct bin_attribute *oob_attr; + + oob_attr = devm_kzalloc(>dev->dev, sizeof(*oob_attr), GFP_ATOMIC); + if (!oob_attr) + return -ENOMEM; + + spin_lock_init(>oob_lock); + sysfs_bin_attr_init(oob_attr); + vmd->oob_attr = oob_attr; + oob_attr->attr.name = "oob"; + oob_attr->attr.mode = S_IRUSR | S_IWUSR; + oob_attr->size = MB2_OOB_WINDOW_SIZE; + oob_attr->read = vmd_oob_read; + oob_attr->write = vmd_oob_write; + oob_attr->private = (void *)vmd; + + return sysfs_create_bin_file(>dev.kobj, oob_attr); +} + +static void vmd_destroy_oob_file(struct vmd_dev *vmd) +{ + if (vmd->oob_attr) + sysfs_remove_bin_file(>dev->dev.kobj, vmd->oob_attr); +} + /* * VMD domains start at 0x1 to not clash with ACPI _SEG domains. * Per ACPI r6.0, sec 6.5.6, _SEG returns an integer, of which the lower @@ -570,6 +644,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) resource_size_t offset[2] = {0}; resource_size_t membar2_offset = 0x2000; struct pci_bus *child; + int ret; /* * Shadow registers may exist in certain VMD device ids which allow @@ -579,7 +654,6 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) */ if (features & VMD_FEAT_HAS_MEMBAR_SHADOW) { u32 vmlock; - int ret; membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE; ret = pci_read_config_dword(vmd->dev, PCI_REG_VMLOCK, ); @@ -614,6 +688,24 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) vmd->busn_start = 128; } + /* +* Certain VMD devices provide a window for communicating with child +* devices through a management interface +*/ + if (features & VMD_FEAT_HAS_OOB_WINDOW) { + membar2_offset = MB2_OOB_WINDOW_OFFSET + MB2_OOB_WINDOW_SIZE; + vmd
[PATCH 1/2] PCI: vmd: Fix config addressing when using bus offsets
VMD maps child device config spaces to the VMD Config BAR linearly regardless of the starting bus offset. Because of this, the config address decode must ignore starting bus offsets when mapping the BDF to the config space address. Cc: sta...@vger.kernel.org # v5.2+ Fixes: 2a5a9c9a ("PCI: vmd: Add offset to bus numbers if necessary") Signed-off-by: Jon Derrick --- drivers/pci/controller/vmd.c | 16 +--- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 4575e0c6dc4b..2e4da3f51d6b 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -94,6 +94,7 @@ struct vmd_dev { struct resource resources[3]; struct irq_domain *irq_domain; struct pci_bus *bus; + u8 busn_start; struct dma_map_ops dma_ops; struct dma_domain dma_domain; @@ -440,7 +441,8 @@ static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus, unsigned int devfn, int reg, int len) { char __iomem *addr = vmd->cfgbar + -(bus->number << 20) + (devfn << 12) + reg; +((bus->number - vmd->busn_start) << 20) + +(devfn << 12) + reg; if ((addr - vmd->cfgbar) + len >= resource_size(>dev->resource[VMD_CFGBAR])) @@ -563,7 +565,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) unsigned long flags; LIST_HEAD(resources); resource_size_t offset[2] = {0}; - resource_size_t membar2_offset = 0x2000, busn_start = 0; + resource_size_t membar2_offset = 0x2000; struct pci_bus *child; /* @@ -606,14 +608,14 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) pci_read_config_dword(vmd->dev, PCI_REG_VMCONFIG, ); if (BUS_RESTRICT_CAP(vmcap) && (BUS_RESTRICT_CFG(vmconfig) == 0x1)) - busn_start = 128; + vmd->busn_start = 128; } res = >dev->resource[VMD_CFGBAR]; vmd->resources[0] = (struct resource) { .name = "VMD CFGBAR", - .start = busn_start, - .end = busn_start + (resource_size(res) >> 20) - 1, + .start = vmd->busn_start, + .end = vmd->busn_start + (resource_size(res) >> 20) - 1, .flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED, }; @@ -681,8 +683,8 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) pci_add_resource_offset(, >resources[1], offset[0]); pci_add_resource_offset(, >resources[2], offset[1]); - vmd->bus = pci_create_root_bus(>dev->dev, busn_start, _ops, - sd, ); + vmd->bus = pci_create_root_bus(>dev->dev, vmd->busn_start, + _ops, sd, ); if (!vmd->bus) { pci_free_resource_list(); irq_domain_remove(vmd->irq_domain); -- 2.20.1
[PATCH 2/2] PCI: vmd: Fix shadow offsets to reflect spec changes
The shadow offset scratchpad was moved to 0x2000-0x2010. Update the location to get the correct shadow offset. Cc: sta...@vger.kernel.org # v5.2+ Fixes: 6788958e ("PCI: vmd: Assign membar addresses from shadow registers") Signed-off-by: Jon Derrick --- drivers/pci/controller/vmd.c | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 2e4da3f51d6b..a35d3f3996d7 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -31,6 +31,9 @@ #define PCI_REG_VMLOCK 0x70 #define MB2_SHADOW_EN(vmlock) (vmlock & 0x2) +#define MB2_SHADOW_OFFSET 0x2000 +#define MB2_SHADOW_SIZE16 + enum vmd_features { /* * Device may contain registers which hint the physical location of the @@ -578,7 +581,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) u32 vmlock; int ret; - membar2_offset = 0x2018; + membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE; ret = pci_read_config_dword(vmd->dev, PCI_REG_VMLOCK, ); if (ret || vmlock == ~0) return -ENODEV; @@ -590,9 +593,9 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) if (!membar2) return -ENOMEM; offset[0] = vmd->dev->resource[VMD_MEMBAR1].start - - readq(membar2 + 0x2008); + readq(membar2 + MB2_SHADOW_OFFSET); offset[1] = vmd->dev->resource[VMD_MEMBAR2].start - - readq(membar2 + 0x2010); + readq(membar2 + MB2_SHADOW_OFFSET + 8); pci_iounmap(vmd->dev, membar2); } } -- 2.20.1
[PATCH 0/2] VMD fixes for v5.4
Hi Lorenzo, Bjorn, Keith, Please consider the following patches for 5.4 inclusion. These will apply to 5.2 stable. 4.19 has a few feature deps so I will instead follow-up with a backport. Jon Derrick (2): PCI: vmd: Fix config addressing when using bus offsets PCI: vmd: Fix shadow offsets to reflect spec changes drivers/pci/controller/vmd.c | 25 +++-- 1 file changed, 15 insertions(+), 10 deletions(-) -- 2.20.1
[PATCH] genirq/affinity: report extra vectors on uneven nodes
The current irq spreading algorithm spreads vectors amongst cpus evenly per node. If a node has more cpus than another node, the extra vectors being spread may not be reported back to the caller. This is most apparent with the NVMe driver and nr_cpus < vectors, where the underreporting results in the caller's WARN being triggered: irq_build_affinity_masks() ... if (nr_present < numvecs) WARN_ON(nr_present + nr_others < numvecs); Signed-off-by: Jon Derrick --- kernel/irq/affinity.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 4352b08ae48d..9beafb8c7e92 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -127,7 +127,8 @@ static int __irq_build_affinity_masks(unsigned int startvec, } for_each_node_mask(n, nodemsk) { - unsigned int ncpus, v, vecs_to_assign, vecs_per_node; + unsigned int ncpus, v, vecs_to_assign, total_vecs_to_assign, + vecs_per_node; /* Spread the vectors per node */ vecs_per_node = (numvecs - (curvec - firstvec)) / nodes; @@ -141,14 +142,16 @@ static int __irq_build_affinity_masks(unsigned int startvec, /* Account for rounding errors */ extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign); + total_vecs_to_assign = vecs_to_assign + extra_vecs; - for (v = 0; curvec < last_affv && v < vecs_to_assign; + for (v = 0; curvec < last_affv && v < total_vecs_to_assign; curvec++, v++) { cpus_per_vec = ncpus / vecs_to_assign; /* Account for extra vectors to compensate rounding errors */ if (extra_vecs) { cpus_per_vec++; + v++; --extra_vecs; } irq_spread_init_one([curvec].mask, nmsk, -- 2.20.1
[PATCH] nvme-pci: Prevent mmio reads if pci channel offline
Some platforms don't seem to easily tolerate non-posted mmio reads on lost (hot removed) devices. This has been noted in previous modifications to other layers where an mmio read to a lost device could cause an undesired firmware intervention [1][2]. This patch reworks the nvme-pci reads to quickly check connectivity prior to reading the BAR. The intent is to prevent a non-posted mmio read which would definitely result in an error condition of some sort. There is, of course, a chance the link becomes disconnected between the check and the read. Like other similar checks, it is only intended to reduce the likelihood of encountering these issues and not fully close the gap. mmio writes are posted and in the fast path and have been left as-is. [1] https://lkml.org/lkml/2018/7/30/858 [2] https://lkml.org/lkml/2018/9/18/1546 Signed-off-by: Jon Derrick --- drivers/nvme/host/pci.c | 75 ++--- 1 file changed, 56 insertions(+), 19 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f54718b63637..e555ac2afacd 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1264,6 +1264,33 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) csts, result); } +static int nvme_reg_readl(struct nvme_dev *dev, u32 off, u32 *val) +{ + if (pci_channel_offline(to_pci_dev(dev->dev))) + return -ENODEV; + + *val = readl(dev->bar + off); + return 0; +} + +static int nvme_reg_readq(struct nvme_dev *dev, u32 off, u64 *val) +{ + if (pci_channel_offline(to_pci_dev(dev->dev))) + return -ENODEV; + + *val = readq(dev->bar + off); + return 0; +} + +static int nvme_reg_lo_hi_readq(struct nvme_dev *dev, u32 off, u64 *val) +{ + if (pci_channel_offline(to_pci_dev(dev->dev))) + return -ENODEV; + + *val = lo_hi_readq(dev->bar + off); + return 0; +} + static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -1271,13 +1298,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; struct nvme_command cmd; - u32 csts = readl(dev->bar + NVME_REG_CSTS); + u32 csts; /* If PCI error recovery process is happening, we cannot reset or * the recovery mechanism will surely fail. */ mb(); - if (pci_channel_offline(to_pci_dev(dev->dev))) + if (nvme_reg_readl(dev, NVME_REG_CSTS, )) return BLK_EH_RESET_TIMER; /* @@ -1692,18 +1719,24 @@ static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size) static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) { int result; - u32 aqa; + u32 csts, vs, aqa; struct nvme_queue *nvmeq; result = nvme_remap_bar(dev, db_bar_size(dev, 0)); if (result < 0) return result; - dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ? - NVME_CAP_NSSRC(dev->ctrl.cap) : 0; + result = nvme_reg_readl(dev, NVME_REG_CSTS, ); + if (result) + return result; + + result = nvme_reg_readl(dev, NVME_REG_VS, ); + if (result) + return result; - if (dev->subsystem && - (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) + dev->subsystem = (vs >= NVME_VS(1, 1, 0)) ? + NVME_CAP_NSSRC(dev->ctrl.cap) : 0; + if (dev->subsystem && (csts & NVME_CSTS_NSSRO)) writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); result = nvme_disable_ctrl(>ctrl, dev->ctrl.cap); @@ -1808,10 +1841,11 @@ static void nvme_map_cmb(struct nvme_dev *dev) if (dev->cmb_size) return; - dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); - if (!dev->cmbsz) + if (nvme_reg_readl(dev, NVME_REG_CMBSZ, >cmbsz) || !dev->cmbsz) + return; + + if (nvme_reg_readl(dev, NVME_REG_CMBLOC, >cmbloc)) return; - dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); @@ -2357,6 +2391,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) { int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); + u32 csts; if (pci_enable_device_mem(pdev)) return result; @@ -2367,7 +2402,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) goto disable; - if (readl(dev-
[PATCH v3] PCI: Equalize hotplug memory and io for non/occupied slots
Currently, a hotplug bridge will be given hpmemsize additional memory and hpiosize additional io if available, in order to satisfy any future hotplug allocation requirements. These calculations don't consider the current memory/io size of the hotplug bridge/slot, so hotplug bridges/slots which have downstream devices will be allocated their current allocation in addition to the hpmemsize value. This makes for possibly undesirable results with a mix of unoccupied and occupied slots (ex, with hpmemsize=2M): 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d64f [size=3M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d650-d66f [size=2M] This change considers the current allocation size when using the hpmemsize/hpiosize parameters to make the reservations predictable for the mix of unoccupied and occupied slots: 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d63f [size=2M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d640-d65f [size=2M] Signed-off-by: Jon Derrick --- v2->v3: Made the IO and mem size calculations nearly equivalent drivers/pci/setup-bus.c | 28 +++- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 79b1824..ed96043 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -811,6 +811,8 @@ static struct resource *find_free_bus_resource(struct pci_bus *bus, static resource_size_t calculate_iosize(resource_size_t size, resource_size_t min_size, resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -823,15 +825,18 @@ static resource_size_t calculate_iosize(resource_size_t size, #if defined(CONFIG_ISA) || defined(CONFIG_EISA) size = (size & 0xff) + ((size & ~0xffUL) << 2); #endif - size = ALIGN(size + size1, align); + size = size + size1; if (size < old_size) size = old_size; + + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } static resource_size_t calculate_memsize(resource_size_t size, resource_size_t min_size, - resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -841,7 +846,8 @@ static resource_size_t calculate_memsize(resource_size_t size, old_size = 0; if (size < old_size) size = old_size; - size = ALIGN(size + size1, align); + + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } @@ -930,12 +936,10 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, } } - size0 = calculate_iosize(size, min_size, size1, + size0 = calculate_iosize(size, min_size, size1, 0, 0, resource_size(b_res), min_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_iosize(size, min_size, add_size + size1, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_iosize(size, min_size, size1, add_size, children_add_size, resource_size(b_res), min_align); if (!size0 && !size1) { if (b_res->start || b_res->end) @@ -1079,12 +1083,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, min_align = calculate_mem_align(aligns, max_order); min_align = max(min_align, window_alignment(bus, b_res->flags)); - size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align); + size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), min_align); add_align = max(min_align, add_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_memsize(size, min_size, add_size, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_memsize(size, min_size, add_size, children_add_size, resource_size(b_res), add_align); if (!size0 && !size1) { if (b_res->start || b_res->end) -- 1.8.3.1
[PATCH v3] PCI: Equalize hotplug memory and io for non/occupied slots
Currently, a hotplug bridge will be given hpmemsize additional memory and hpiosize additional io if available, in order to satisfy any future hotplug allocation requirements. These calculations don't consider the current memory/io size of the hotplug bridge/slot, so hotplug bridges/slots which have downstream devices will be allocated their current allocation in addition to the hpmemsize value. This makes for possibly undesirable results with a mix of unoccupied and occupied slots (ex, with hpmemsize=2M): 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d64f [size=3M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d650-d66f [size=2M] This change considers the current allocation size when using the hpmemsize/hpiosize parameters to make the reservations predictable for the mix of unoccupied and occupied slots: 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d63f [size=2M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d640-d65f [size=2M] Signed-off-by: Jon Derrick --- v2->v3: Made the IO and mem size calculations nearly equivalent drivers/pci/setup-bus.c | 28 +++- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 79b1824..ed96043 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -811,6 +811,8 @@ static struct resource *find_free_bus_resource(struct pci_bus *bus, static resource_size_t calculate_iosize(resource_size_t size, resource_size_t min_size, resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -823,15 +825,18 @@ static resource_size_t calculate_iosize(resource_size_t size, #if defined(CONFIG_ISA) || defined(CONFIG_EISA) size = (size & 0xff) + ((size & ~0xffUL) << 2); #endif - size = ALIGN(size + size1, align); + size = size + size1; if (size < old_size) size = old_size; + + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } static resource_size_t calculate_memsize(resource_size_t size, resource_size_t min_size, - resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -841,7 +846,8 @@ static resource_size_t calculate_memsize(resource_size_t size, old_size = 0; if (size < old_size) size = old_size; - size = ALIGN(size + size1, align); + + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } @@ -930,12 +936,10 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size, } } - size0 = calculate_iosize(size, min_size, size1, + size0 = calculate_iosize(size, min_size, size1, 0, 0, resource_size(b_res), min_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_iosize(size, min_size, add_size + size1, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_iosize(size, min_size, size1, add_size, children_add_size, resource_size(b_res), min_align); if (!size0 && !size1) { if (b_res->start || b_res->end) @@ -1079,12 +1083,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, min_align = calculate_mem_align(aligns, max_order); min_align = max(min_align, window_alignment(bus, b_res->flags)); - size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align); + size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), min_align); add_align = max(min_align, add_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_memsize(size, min_size, add_size, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_memsize(size, min_size, add_size, children_add_size, resource_size(b_res), add_align); if (!size0 && !size1) { if (b_res->start || b_res->end) -- 1.8.3.1
[PATCH] PCI/portdrv: Enable error reporting on managed ports
During probe, the port driver will disable error reporting and assumes it will be enabled later by the AER driver's pci_walk_bus() sequence. This may not be the case for host-bridge enabled root ports, who will enable first error reporting on the bus during the root port probe, and then disable error reporting on downstream devices during subsequent probing of the bus. A hotplugged port device may also fail to enable error reporting as the AER driver has already run on the root bus. Check for these conditions and enable error reporting during portdrv probing. Example case: [ 343.790573] pcieport 1:00:00.0: pci_disable_pcie_error_reporting [ 343.809812] pcieport 1:00:00.0: pci_enable_pcie_error_reporting [ 343.819506] pci 1:01:00.0: pci_enable_pcie_error_reporting [ 343.828814] pci 1:02:00.0: pci_enable_pcie_error_reporting [ 343.838089] pci 1:02:01.0: pci_enable_pcie_error_reporting [ 343.847478] pci 1:02:02.0: pci_enable_pcie_error_reporting [ 343.856659] pci 1:02:03.0: pci_enable_pcie_error_reporting [ 343.865794] pci 1:02:04.0: pci_enable_pcie_error_reporting [ 343.874875] pci 1:02:05.0: pci_enable_pcie_error_reporting [ 343.883918] pci 1:02:06.0: pci_enable_pcie_error_reporting [ 343.892922] pci 1:02:07.0: pci_enable_pcie_error_reporting [ 343.918900] pcieport 1:01:00.0: pci_disable_pcie_error_reporting [ 343.968426] pcieport 1:02:00.0: pci_disable_pcie_error_reporting [ 344.028179] pcieport 1:02:01.0: pci_disable_pcie_error_reporting [ 344.091269] pcieport 1:02:02.0: pci_disable_pcie_error_reporting [ 344.156473] pcieport 1:02:03.0: pci_disable_pcie_error_reporting [ 344.238042] pcieport 1:02:04.0: pci_disable_pcie_error_reporting [ 344.321864] pcieport 1:02:05.0: pci_disable_pcie_error_reporting [ 344.411601] pcieport 1:02:06.0: pci_disable_pcie_error_reporting [ 344.505332] pcieport 1:02:07.0: pci_disable_pcie_error_reporting [ 344.621824] nvme 1:06:00.0: pci_enable_pcie_error_reporting Signed-off-by: Jon Derrick --- drivers/pci/pcie/portdrv_core.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 7c37d81..fdd953a 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -343,6 +343,16 @@ int pcie_port_device_register(struct pci_dev *dev) if (!nr_service) goto error_cleanup_irqs; +#ifdef CONFIG_PCIEAER + /* +* Enable error reporting for this port in case AER probing has already +* run on the root bus or this port device is hot-inserted +*/ + if (dev->aer_cap && pci_aer_available() && + (pcie_ports_native || pci_find_host_bridge(dev->bus)->native_aer)) + pci_enable_pcie_error_reporting(dev); +#endif + return 0; error_cleanup_irqs: -- 1.8.3.1
[PATCH] PCI/portdrv: Enable error reporting on managed ports
During probe, the port driver will disable error reporting and assumes it will be enabled later by the AER driver's pci_walk_bus() sequence. This may not be the case for host-bridge enabled root ports, who will enable first error reporting on the bus during the root port probe, and then disable error reporting on downstream devices during subsequent probing of the bus. A hotplugged port device may also fail to enable error reporting as the AER driver has already run on the root bus. Check for these conditions and enable error reporting during portdrv probing. Example case: [ 343.790573] pcieport 1:00:00.0: pci_disable_pcie_error_reporting [ 343.809812] pcieport 1:00:00.0: pci_enable_pcie_error_reporting [ 343.819506] pci 1:01:00.0: pci_enable_pcie_error_reporting [ 343.828814] pci 1:02:00.0: pci_enable_pcie_error_reporting [ 343.838089] pci 1:02:01.0: pci_enable_pcie_error_reporting [ 343.847478] pci 1:02:02.0: pci_enable_pcie_error_reporting [ 343.856659] pci 1:02:03.0: pci_enable_pcie_error_reporting [ 343.865794] pci 1:02:04.0: pci_enable_pcie_error_reporting [ 343.874875] pci 1:02:05.0: pci_enable_pcie_error_reporting [ 343.883918] pci 1:02:06.0: pci_enable_pcie_error_reporting [ 343.892922] pci 1:02:07.0: pci_enable_pcie_error_reporting [ 343.918900] pcieport 1:01:00.0: pci_disable_pcie_error_reporting [ 343.968426] pcieport 1:02:00.0: pci_disable_pcie_error_reporting [ 344.028179] pcieport 1:02:01.0: pci_disable_pcie_error_reporting [ 344.091269] pcieport 1:02:02.0: pci_disable_pcie_error_reporting [ 344.156473] pcieport 1:02:03.0: pci_disable_pcie_error_reporting [ 344.238042] pcieport 1:02:04.0: pci_disable_pcie_error_reporting [ 344.321864] pcieport 1:02:05.0: pci_disable_pcie_error_reporting [ 344.411601] pcieport 1:02:06.0: pci_disable_pcie_error_reporting [ 344.505332] pcieport 1:02:07.0: pci_disable_pcie_error_reporting [ 344.621824] nvme 1:06:00.0: pci_enable_pcie_error_reporting Signed-off-by: Jon Derrick --- drivers/pci/pcie/portdrv_core.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 7c37d81..fdd953a 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -343,6 +343,16 @@ int pcie_port_device_register(struct pci_dev *dev) if (!nr_service) goto error_cleanup_irqs; +#ifdef CONFIG_PCIEAER + /* +* Enable error reporting for this port in case AER probing has already +* run on the root bus or this port device is hot-inserted +*/ + if (dev->aer_cap && pci_aer_available() && + (pcie_ports_native || pci_find_host_bridge(dev->bus)->native_aer)) + pci_enable_pcie_error_reporting(dev); +#endif + return 0; error_cleanup_irqs: -- 1.8.3.1
[PATCH] PCI/AER: Fix an AER enabling/disabling race
There is a sequence with non-ACPI root ports where the AER driver can enable error reporting on the tree before port drivers have bound to ports on the tree. The port driver assumes the AER driver will set up error reporting afterwards, so instead add a check if error reporting was set up first. Example: [ 343.790573] pcieport 1:00:00.0: pci_disable_pcie_error_reporting [ 343.809812] pcieport 1:00:00.0: pci_enable_pcie_error_reporting [ 343.819506] pci 1:01:00.0: pci_enable_pcie_error_reporting [ 343.828814] pci 1:02:00.0: pci_enable_pcie_error_reporting [ 343.838089] pci 1:02:01.0: pci_enable_pcie_error_reporting [ 343.847478] pci 1:02:02.0: pci_enable_pcie_error_reporting [ 343.856659] pci 1:02:03.0: pci_enable_pcie_error_reporting [ 343.865794] pci 1:02:04.0: pci_enable_pcie_error_reporting [ 343.874875] pci 1:02:05.0: pci_enable_pcie_error_reporting [ 343.883918] pci 1:02:06.0: pci_enable_pcie_error_reporting [ 343.892922] pci 1:02:07.0: pci_enable_pcie_error_reporting [ 343.918900] pcieport 1:01:00.0: pci_disable_pcie_error_reporting [ 343.968426] pcieport 1:02:00.0: pci_disable_pcie_error_reporting [ 344.028179] pcieport 1:02:01.0: pci_disable_pcie_error_reporting [ 344.091269] pcieport 1:02:02.0: pci_disable_pcie_error_reporting [ 344.156473] pcieport 1:02:03.0: pci_disable_pcie_error_reporting [ 344.238042] pcieport 1:02:04.0: pci_disable_pcie_error_reporting [ 344.321864] pcieport 1:02:05.0: pci_disable_pcie_error_reporting [ 344.411601] pcieport 1:02:06.0: pci_disable_pcie_error_reporting [ 344.505332] pcieport 1:02:07.0: pci_disable_pcie_error_reporting [ 344.621824] nvme 1:06:00.0: pci_enable_pcie_error_reporting Signed-off-by: Jon Derrick --- drivers/pci/pcie/aer.c | 1 + drivers/pci/pcie/portdrv_core.c | 5 - include/linux/pci.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 83180ed..a4e36b6 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1333,6 +1333,7 @@ static int set_device_error_reporting(struct pci_dev *dev, void *data) if (enable) pcie_set_ecrc_checking(dev); + dev->aer_configured = 1; return 0; } diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 7c37d81..f5de554 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -224,8 +224,11 @@ static int get_port_device_capability(struct pci_dev *dev) /* * Disable AER on this port in case it's been enabled by the * BIOS (the AER service driver will enable it when necessary). +* Don't disable it if the AER service driver has already +* enabled it from the root port bus walking */ - pci_disable_pcie_error_reporting(dev); + if (!dev->aer_configured) + pci_disable_pcie_error_reporting(dev); } #endif diff --git a/include/linux/pci.h b/include/linux/pci.h index e72ca8d..c071622 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -402,6 +402,7 @@ struct pci_dev { unsigned inthas_secondary_link:1; unsigned intnon_compliant_bars:1; /* Broken BARs; ignore them */ unsigned intis_probed:1;/* Device probing in progress */ + unsigned intaer_configured:1; /* AER configured for device */ pci_dev_flags_t dev_flags; atomic_tenable_cnt; /* pci_enable_device has been called */ -- 1.8.3.1
[PATCH] PCI/AER: Fix an AER enabling/disabling race
There is a sequence with non-ACPI root ports where the AER driver can enable error reporting on the tree before port drivers have bound to ports on the tree. The port driver assumes the AER driver will set up error reporting afterwards, so instead add a check if error reporting was set up first. Example: [ 343.790573] pcieport 1:00:00.0: pci_disable_pcie_error_reporting [ 343.809812] pcieport 1:00:00.0: pci_enable_pcie_error_reporting [ 343.819506] pci 1:01:00.0: pci_enable_pcie_error_reporting [ 343.828814] pci 1:02:00.0: pci_enable_pcie_error_reporting [ 343.838089] pci 1:02:01.0: pci_enable_pcie_error_reporting [ 343.847478] pci 1:02:02.0: pci_enable_pcie_error_reporting [ 343.856659] pci 1:02:03.0: pci_enable_pcie_error_reporting [ 343.865794] pci 1:02:04.0: pci_enable_pcie_error_reporting [ 343.874875] pci 1:02:05.0: pci_enable_pcie_error_reporting [ 343.883918] pci 1:02:06.0: pci_enable_pcie_error_reporting [ 343.892922] pci 1:02:07.0: pci_enable_pcie_error_reporting [ 343.918900] pcieport 1:01:00.0: pci_disable_pcie_error_reporting [ 343.968426] pcieport 1:02:00.0: pci_disable_pcie_error_reporting [ 344.028179] pcieport 1:02:01.0: pci_disable_pcie_error_reporting [ 344.091269] pcieport 1:02:02.0: pci_disable_pcie_error_reporting [ 344.156473] pcieport 1:02:03.0: pci_disable_pcie_error_reporting [ 344.238042] pcieport 1:02:04.0: pci_disable_pcie_error_reporting [ 344.321864] pcieport 1:02:05.0: pci_disable_pcie_error_reporting [ 344.411601] pcieport 1:02:06.0: pci_disable_pcie_error_reporting [ 344.505332] pcieport 1:02:07.0: pci_disable_pcie_error_reporting [ 344.621824] nvme 1:06:00.0: pci_enable_pcie_error_reporting Signed-off-by: Jon Derrick --- drivers/pci/pcie/aer.c | 1 + drivers/pci/pcie/portdrv_core.c | 5 - include/linux/pci.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 83180ed..a4e36b6 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1333,6 +1333,7 @@ static int set_device_error_reporting(struct pci_dev *dev, void *data) if (enable) pcie_set_ecrc_checking(dev); + dev->aer_configured = 1; return 0; } diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c index 7c37d81..f5de554 100644 --- a/drivers/pci/pcie/portdrv_core.c +++ b/drivers/pci/pcie/portdrv_core.c @@ -224,8 +224,11 @@ static int get_port_device_capability(struct pci_dev *dev) /* * Disable AER on this port in case it's been enabled by the * BIOS (the AER service driver will enable it when necessary). +* Don't disable it if the AER service driver has already +* enabled it from the root port bus walking */ - pci_disable_pcie_error_reporting(dev); + if (!dev->aer_configured) + pci_disable_pcie_error_reporting(dev); } #endif diff --git a/include/linux/pci.h b/include/linux/pci.h index e72ca8d..c071622 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -402,6 +402,7 @@ struct pci_dev { unsigned inthas_secondary_link:1; unsigned intnon_compliant_bars:1; /* Broken BARs; ignore them */ unsigned intis_probed:1;/* Device probing in progress */ + unsigned intaer_configured:1; /* AER configured for device */ pci_dev_flags_t dev_flags; atomic_tenable_cnt; /* pci_enable_device has been called */ -- 1.8.3.1
[PATCH v2] PCI hotplug Eq v2
Hi Bjorn, Sorry for the delay on this one and pushing it after RC1. Feel free to queue it up for 4.20 if it looks fine. I've added comments to the git log and source explaining why calculate_iosize was left unchanged. Basically I could not synthesize a condition where it would have affected the topology. v1->v2: Comments Jon Derrick (1): PCI: Equalize hotplug memory for non/occupied slots drivers/pci/setup-bus.c | 21 ++--- 1 file changed, 14 insertions(+), 7 deletions(-) -- 1.8.3.1
[PATCH v2] PCI hotplug Eq v2
Hi Bjorn, Sorry for the delay on this one and pushing it after RC1. Feel free to queue it up for 4.20 if it looks fine. I've added comments to the git log and source explaining why calculate_iosize was left unchanged. Basically I could not synthesize a condition where it would have affected the topology. v1->v2: Comments Jon Derrick (1): PCI: Equalize hotplug memory for non/occupied slots drivers/pci/setup-bus.c | 21 ++--- 1 file changed, 14 insertions(+), 7 deletions(-) -- 1.8.3.1
[PATCH v2] PCI: Equalize hotplug memory for non/occupied slots
Currently, a hotplug bridge will be given hpmemsize additional memory if available, in order to satisfy any future hotplug allocation requirements. These calculations don't consider the current memory size of the hotplug bridge/slot, so hotplug bridges/slots which have downstream devices will get their current allocation in addition to the hpmemsize value. This makes for possibly undesirable results with a mix of unoccupied and occupied slots (ex, with hpmemsize=2M): 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d64f [size=3M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d650-d66f [size=2M] This change considers the current allocation size when using the hpmemsize parameter to make the reservations predictable for the mix of unoccupied and occupied slots: 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d63f [size=2M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d640-d65f [size=2M] The calculation for IO (hpiosize) should be similar, but platform firmwares I've encountered (including QEMU) provide strict allocations for IO and would not provide free IO resources for hotplug buses in order to prove this calculation. Signed-off-by: Jon Derrick --- drivers/pci/setup-bus.c | 21 ++--- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 79b1824..70d0aba 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -831,7 +831,8 @@ static resource_size_t calculate_iosize(resource_size_t size, static resource_size_t calculate_memsize(resource_size_t size, resource_size_t min_size, - resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -841,7 +842,15 @@ static resource_size_t calculate_memsize(resource_size_t size, old_size = 0; if (size < old_size) size = old_size; - size = ALIGN(size + size1, align); + + /* +* Consider the current allocation size when adding size for extra +* hotplug memory. This ensures that occupied slots don't receive +* unneccessary memory allocations in addition to their current size. +* The calculation should be similar for calculate_iosize, but was +* unable to be tested. +*/ + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } @@ -1079,12 +1088,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, min_align = calculate_mem_align(aligns, max_order); min_align = max(min_align, window_alignment(bus, b_res->flags)); - size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align); + size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), min_align); add_align = max(min_align, add_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_memsize(size, min_size, add_size, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_memsize(size, min_size, add_size, children_add_size, resource_size(b_res), add_align); if (!size0 && !size1) { if (b_res->start || b_res->end) -- 1.8.3.1
[PATCH v2] PCI: Equalize hotplug memory for non/occupied slots
Currently, a hotplug bridge will be given hpmemsize additional memory if available, in order to satisfy any future hotplug allocation requirements. These calculations don't consider the current memory size of the hotplug bridge/slot, so hotplug bridges/slots which have downstream devices will get their current allocation in addition to the hpmemsize value. This makes for possibly undesirable results with a mix of unoccupied and occupied slots (ex, with hpmemsize=2M): 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d64f [size=3M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d650-d66f [size=2M] This change considers the current allocation size when using the hpmemsize parameter to make the reservations predictable for the mix of unoccupied and occupied slots: 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d63f [size=2M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d640-d65f [size=2M] The calculation for IO (hpiosize) should be similar, but platform firmwares I've encountered (including QEMU) provide strict allocations for IO and would not provide free IO resources for hotplug buses in order to prove this calculation. Signed-off-by: Jon Derrick --- drivers/pci/setup-bus.c | 21 ++--- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 79b1824..70d0aba 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -831,7 +831,8 @@ static resource_size_t calculate_iosize(resource_size_t size, static resource_size_t calculate_memsize(resource_size_t size, resource_size_t min_size, - resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -841,7 +842,15 @@ static resource_size_t calculate_memsize(resource_size_t size, old_size = 0; if (size < old_size) size = old_size; - size = ALIGN(size + size1, align); + + /* +* Consider the current allocation size when adding size for extra +* hotplug memory. This ensures that occupied slots don't receive +* unneccessary memory allocations in addition to their current size. +* The calculation should be similar for calculate_iosize, but was +* unable to be tested. +*/ + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } @@ -1079,12 +1088,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, min_align = calculate_mem_align(aligns, max_order); min_align = max(min_align, window_alignment(bus, b_res->flags)); - size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align); + size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), min_align); add_align = max(min_align, add_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_memsize(size, min_size, add_size, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_memsize(size, min_size, add_size, children_add_size, resource_size(b_res), add_align); if (!size0 && !size1) { if (b_res->start || b_res->end) -- 1.8.3.1
[PATCH] PCI: Equalize hotplug memory for non/occupied slots
Currently, a hotplug bridge will be given hpmemsize additional memory if available, in order to satisfy any future hotplug allocation requirements. These calculations don't consider the current memory size of the hotplug bridge/slot, so hotplug bridges/slots which have downstream devices will get their current allocation in addition to the hpmemsize value. This makes for possibly undesirable results with a mix of unoccupied and occupied slots (ex, with hpmemsize=2M): 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d64f [size=3M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d650-d66f [size=2M] This change considers the current allocation size when using the hpmemsize parameter to make the reservations predictable for the mix of unoccupied and occupied slots: 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d63f [size=2M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d640-d65f [size=2M] Signed-off-by: Jon Derrick --- Original RFC here: https://patchwork.ozlabs.org/patch/945374/ I split this bit out from the RFC while awaiting the pci string handling enhancements to handle per-device settings Changed from RFC is a simpler algo drivers/pci/setup-bus.c | 13 ++--- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 79b1824..5ae39e6 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -831,7 +831,8 @@ static resource_size_t calculate_iosize(resource_size_t size, static resource_size_t calculate_memsize(resource_size_t size, resource_size_t min_size, - resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -841,7 +842,7 @@ static resource_size_t calculate_memsize(resource_size_t size, old_size = 0; if (size < old_size) size = old_size; - size = ALIGN(size + size1, align); + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } @@ -1079,12 +1080,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, min_align = calculate_mem_align(aligns, max_order); min_align = max(min_align, window_alignment(bus, b_res->flags)); - size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align); + size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), min_align); add_align = max(min_align, add_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_memsize(size, min_size, add_size, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_memsize(size, min_size, add_size, children_add_size, resource_size(b_res), add_align); if (!size0 && !size1) { if (b_res->start || b_res->end) -- 1.8.3.1
[PATCH] PCI: Equalize hotplug memory for non/occupied slots
Currently, a hotplug bridge will be given hpmemsize additional memory if available, in order to satisfy any future hotplug allocation requirements. These calculations don't consider the current memory size of the hotplug bridge/slot, so hotplug bridges/slots which have downstream devices will get their current allocation in addition to the hpmemsize value. This makes for possibly undesirable results with a mix of unoccupied and occupied slots (ex, with hpmemsize=2M): 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d64f [size=3M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d650-d66f [size=2M] This change considers the current allocation size when using the hpmemsize parameter to make the reservations predictable for the mix of unoccupied and occupied slots: 02:03.0 PCI bridge: <-- Occupied Memory behind bridge: d620-d63f [size=2M] 02:04.0 PCI bridge: <-- Unoccupied Memory behind bridge: d640-d65f [size=2M] Signed-off-by: Jon Derrick --- Original RFC here: https://patchwork.ozlabs.org/patch/945374/ I split this bit out from the RFC while awaiting the pci string handling enhancements to handle per-device settings Changed from RFC is a simpler algo drivers/pci/setup-bus.c | 13 ++--- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 79b1824..5ae39e6 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -831,7 +831,8 @@ static resource_size_t calculate_iosize(resource_size_t size, static resource_size_t calculate_memsize(resource_size_t size, resource_size_t min_size, - resource_size_t size1, + resource_size_t add_size, + resource_size_t children_add_size, resource_size_t old_size, resource_size_t align) { @@ -841,7 +842,7 @@ static resource_size_t calculate_memsize(resource_size_t size, old_size = 0; if (size < old_size) size = old_size; - size = ALIGN(size + size1, align); + size = ALIGN(max(size, add_size) + children_add_size, align); return size; } @@ -1079,12 +1080,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, min_align = calculate_mem_align(aligns, max_order); min_align = max(min_align, window_alignment(bus, b_res->flags)); - size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align); + size0 = calculate_memsize(size, min_size, 0, 0, resource_size(b_res), min_align); add_align = max(min_align, add_align); - if (children_add_size > add_size) - add_size = children_add_size; - size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : - calculate_memsize(size, min_size, add_size, + size1 = (!realloc_head || (realloc_head && !add_size && !children_add_size)) ? size0 : + calculate_memsize(size, min_size, add_size, children_add_size, resource_size(b_res), add_align); if (!size0 && !size1) { if (b_res->start || b_res->end) -- 1.8.3.1
[PATCH] ext4: Check superblock mapped prior to committing
This patch attempts to close a hole leading to a BUG seen with hot removals during writes [1]. A block device (NVME namespace in this test case) is formatted to EXT4 without partitions. It's mounted and write I/O is run to a file, then the device is hot removed from the slot. The superblock attempts to be written to the drive which is no longer present. The typical chain of events leading to the BUG: ext4_commit_super() __sync_dirty_buffer() submit_bh() submit_bh_wbc() BUG_ON(!buffer_mapped(bh)); This fix checks for the superblock's buffer head being mapped prior to syncing. [1] https://www.spinics.net/lists/linux-ext4/msg56527.html Signed-off-by: Jon Derrick --- fs/ext4/super.c | 8 1 file changed, 8 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0c4c220..ee33233 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4736,6 +4736,14 @@ static int ext4_commit_super(struct super_block *sb, int sync) if (!sbh || block_device_ejected(sb)) return error; + + /* +* The superblock bh should be mapped, but it might not be if the +* device was hot-removed. Not much we can do but fail the I/O. +*/ + if (!buffer_mapped(sbh)) + return error; + /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock -- 2.7.4
[PATCH] ext4: Check superblock mapped prior to committing
This patch attempts to close a hole leading to a BUG seen with hot removals during writes [1]. A block device (NVME namespace in this test case) is formatted to EXT4 without partitions. It's mounted and write I/O is run to a file, then the device is hot removed from the slot. The superblock attempts to be written to the drive which is no longer present. The typical chain of events leading to the BUG: ext4_commit_super() __sync_dirty_buffer() submit_bh() submit_bh_wbc() BUG_ON(!buffer_mapped(bh)); This fix checks for the superblock's buffer head being mapped prior to syncing. [1] https://www.spinics.net/lists/linux-ext4/msg56527.html Signed-off-by: Jon Derrick --- fs/ext4/super.c | 8 1 file changed, 8 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0c4c220..ee33233 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4736,6 +4736,14 @@ static int ext4_commit_super(struct super_block *sb, int sync) if (!sbh || block_device_ejected(sb)) return error; + + /* +* The superblock bh should be mapped, but it might not be if the +* device was hot-removed. Not much we can do but fail the I/O. +*/ + if (!buffer_mapped(sbh)) + return error; + /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock -- 2.7.4
Re: [RFC 2/3] module: Ignore delete_id parameter
On 09/28/2017 12:03 AM, Dan Williams wrote: > On Wed, Sep 27, 2017 at 1:40 PM, Jon Derrick <jonathan.derr...@intel.com> > wrote: >> The PCI driver delete_id parameter is handled in each individual driver >> registration callback. >> >> Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> >> --- >> kernel/module.c | 7 +++ >> 1 file changed, 7 insertions(+) >> >> diff --git a/kernel/module.c b/kernel/module.c >> index de66ec8..2b2dccf 100644 >> --- a/kernel/module.c >> +++ b/kernel/module.c >> @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char >> *val, const char *modname, >> return 0; >> } >> >> + /* >> +* Ignore driver delete list arguments. They are handled by driver >> +* registration callbacks >> +*/ >> + if (strcmp(param, "delete_id") == 0) >> + return 0; >> + > > Does this preclude something like: > > modprobe ahci delete_id=1234:5678? > It does seem like it would. I can look into calling into the pci callback for this, but val is a struct module here and I haven't figured out the plumbing to get the [correct] driver from that. Maybe if I enforce the format of 'modprobe ahci ahci.delete_id=' to ensure the driver is specified (and would be required in cases with multi-driver modules).
Re: [RFC 2/3] module: Ignore delete_id parameter
On 09/28/2017 12:03 AM, Dan Williams wrote: > On Wed, Sep 27, 2017 at 1:40 PM, Jon Derrick > wrote: >> The PCI driver delete_id parameter is handled in each individual driver >> registration callback. >> >> Signed-off-by: Jon Derrick >> --- >> kernel/module.c | 7 +++ >> 1 file changed, 7 insertions(+) >> >> diff --git a/kernel/module.c b/kernel/module.c >> index de66ec8..2b2dccf 100644 >> --- a/kernel/module.c >> +++ b/kernel/module.c >> @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char >> *val, const char *modname, >> return 0; >> } >> >> + /* >> +* Ignore driver delete list arguments. They are handled by driver >> +* registration callbacks >> +*/ >> + if (strcmp(param, "delete_id") == 0) >> + return 0; >> + > > Does this preclude something like: > > modprobe ahci delete_id=1234:5678? > It does seem like it would. I can look into calling into the pci callback for this, but val is a struct module here and I haven't figured out the plumbing to get the [correct] driver from that. Maybe if I enforce the format of 'modprobe ahci ahci.delete_id=' to ensure the driver is specified (and would be required in cases with multi-driver modules).
Re: [RFC 2/3] module: Ignore delete_id parameter
On 09/28/2017 03:02 AM, Greg Kroah-Hartman wrote: > On Wed, Sep 27, 2017 at 04:40:21PM -0400, Jon Derrick wrote: >> The PCI driver delete_id parameter is handled in each individual driver >> registration callback. >> >> Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> >> --- >> kernel/module.c | 7 +++ >> 1 file changed, 7 insertions(+) >> >> diff --git a/kernel/module.c b/kernel/module.c >> index de66ec8..2b2dccf 100644 >> --- a/kernel/module.c >> +++ b/kernel/module.c >> @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char >> *val, const char *modname, >> return 0; >> } >> >> +/* >> + * Ignore driver delete list arguments. They are handled by driver >> + * registration callbacks >> + */ >> +if (strcmp(param, "delete_id") == 0) >> +return 0; > > Why? This is only for the PCI core as you have defined it in this > patchset, but you just broke this module id for all other kernel modules > in the system :( > > If you want to do this, you need to provide this feature for _all_ > kernel drivers... > > thanks, > > greg k-h > Yes I'm not particularly happy about this one either. I will make this more robust if the blacklisting idea is sound.
Re: [RFC 2/3] module: Ignore delete_id parameter
On 09/28/2017 03:02 AM, Greg Kroah-Hartman wrote: > On Wed, Sep 27, 2017 at 04:40:21PM -0400, Jon Derrick wrote: >> The PCI driver delete_id parameter is handled in each individual driver >> registration callback. >> >> Signed-off-by: Jon Derrick >> --- >> kernel/module.c | 7 +++ >> 1 file changed, 7 insertions(+) >> >> diff --git a/kernel/module.c b/kernel/module.c >> index de66ec8..2b2dccf 100644 >> --- a/kernel/module.c >> +++ b/kernel/module.c >> @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char >> *val, const char *modname, >> return 0; >> } >> >> +/* >> + * Ignore driver delete list arguments. They are handled by driver >> + * registration callbacks >> + */ >> +if (strcmp(param, "delete_id") == 0) >> +return 0; > > Why? This is only for the PCI core as you have defined it in this > patchset, but you just broke this module id for all other kernel modules > in the system :( > > If you want to do this, you need to provide this feature for _all_ > kernel drivers... > > thanks, > > greg k-h > Yes I'm not particularly happy about this one either. I will make this more robust if the blacklisting idea is sound.
Re: [RFC 1/3] PCI: pci-driver: Introduce pci device delete list
Hi Greg, On 09/28/2017 03:09 AM, Greg Kroah-Hartman wrote: > On Wed, Sep 27, 2017 at 04:40:20PM -0400, Jon Derrick wrote: >> This patch introduces a new kernel command line parameter to mask pci >> device ids from pci driver id tables. This prevents masked devices from >> automatically binding to both built-in and module drivers. >> >> Devices can be later attached through the driver's sysfs new_id >> inteface. >> >> The use cases for this are primarily for debugging, eg, being able to >> prevent attachment before probes are set up. It can also be used to mask >> off faulty built-in hardware or faulty simulated hardware. >> >> Another use case is to prevent attachment of devices which will be >> passed to VMs, shortcutting the detachment effort. > > Is the "shortcut" really that big of a deal? unbind actually causes > problems? Is this an attempt to deal with devices being handled by more > than one driver and then you want to manually bind it later on? > >> The format is similar to the sysfs new_id format. Device ids are >> specified with: >> >> :[:SVVV:SDDD][:][:] >> >> Where: >> = Vendor ID >> = Device ID >> SVVV = Subvendor ID >> SDDD = Subdevice ID >> = Class >> = Class Mask >> >> IDs can be chained with commas. >> >> Examples: >> .delete_id=1234:5678 >> .delete_id=: >> .delete_id=::::010802 >> .delete_id=1234:5678,abcd:ef01,2345: > > What about drivers that handle more than one bus type (i.e. USB and > PCI?) This format is specific to PCI, yet you are defining it as a > "global" for all drivers :( > I was hoping to extend it to other bus types as well, but just wanted some early feedback on the idea. Pci was the easier implementation for me. Could prepending pci:, usb:, etc on the format be an option? > This feels hacky, what is the real reason for this? It feels like we > have so many different ways to blacklist and unbind and bind devices to > drivers already, why add yet-another way? > I ran into an issue a while back where I needed to disable a device from a built-in driver to perform a regression test. I worked around that issue by doing an initcall_blacklist on the pci-driver declaration, but that also preventing later binding because the driver was never registered. I've also had issues with remote systems where pluggable devices were broken or otherwise non-responsive, and it would have been nice to have been able to keep them from binding on module loading as a temporary workaround until someone could have removed those devices (though impossible on built-in hardware). I'm not sure about hacky; I think the implementation in this patch (1/3) is pretty clean :). I'm not familiar with all the different ways of blacklisting. Is there another way to work around the issues I listed above? I understand the concern about having it exist in the format . and only supporting one or a few bus types. I have another set that extends the pci= parameter instead. > thanks, > > greg k-h > Best, Jon
Re: [RFC 1/3] PCI: pci-driver: Introduce pci device delete list
Hi Greg, On 09/28/2017 03:09 AM, Greg Kroah-Hartman wrote: > On Wed, Sep 27, 2017 at 04:40:20PM -0400, Jon Derrick wrote: >> This patch introduces a new kernel command line parameter to mask pci >> device ids from pci driver id tables. This prevents masked devices from >> automatically binding to both built-in and module drivers. >> >> Devices can be later attached through the driver's sysfs new_id >> inteface. >> >> The use cases for this are primarily for debugging, eg, being able to >> prevent attachment before probes are set up. It can also be used to mask >> off faulty built-in hardware or faulty simulated hardware. >> >> Another use case is to prevent attachment of devices which will be >> passed to VMs, shortcutting the detachment effort. > > Is the "shortcut" really that big of a deal? unbind actually causes > problems? Is this an attempt to deal with devices being handled by more > than one driver and then you want to manually bind it later on? > >> The format is similar to the sysfs new_id format. Device ids are >> specified with: >> >> :[:SVVV:SDDD][:][:] >> >> Where: >> = Vendor ID >> = Device ID >> SVVV = Subvendor ID >> SDDD = Subdevice ID >> = Class >> = Class Mask >> >> IDs can be chained with commas. >> >> Examples: >> .delete_id=1234:5678 >> .delete_id=: >> .delete_id=::::010802 >> .delete_id=1234:5678,abcd:ef01,2345: > > What about drivers that handle more than one bus type (i.e. USB and > PCI?) This format is specific to PCI, yet you are defining it as a > "global" for all drivers :( > I was hoping to extend it to other bus types as well, but just wanted some early feedback on the idea. Pci was the easier implementation for me. Could prepending pci:, usb:, etc on the format be an option? > This feels hacky, what is the real reason for this? It feels like we > have so many different ways to blacklist and unbind and bind devices to > drivers already, why add yet-another way? > I ran into an issue a while back where I needed to disable a device from a built-in driver to perform a regression test. I worked around that issue by doing an initcall_blacklist on the pci-driver declaration, but that also preventing later binding because the driver was never registered. I've also had issues with remote systems where pluggable devices were broken or otherwise non-responsive, and it would have been nice to have been able to keep them from binding on module loading as a temporary workaround until someone could have removed those devices (though impossible on built-in hardware). I'm not sure about hacky; I think the implementation in this patch (1/3) is pretty clean :). I'm not familiar with all the different ways of blacklisting. Is there another way to work around the issues I listed above? I understand the concern about having it exist in the format . and only supporting one or a few bus types. I have another set that extends the pci= parameter instead. > thanks, > > greg k-h > Best, Jon
[RFC 0/3] Introduce PCI device blacklisting
This set introduces a method to keep devices from matching driver id tables. Please see 0001 for more details of the motivation and implementation. This set currently applies to Bjorn's next (v4.14-rc1) branch. Jon Derrick (3): PCI: pci-driver: Introduce pci device delete list module: Ignore delete_id parameter Documentation: Add pci device delete_id interface Documentation/admin-guide/kernel-parameters.txt | 13 ++ drivers/pci/pci-driver.c| 253 +++- include/linux/pci.h | 1 + kernel/module.c | 7 + 4 files changed, 272 insertions(+), 2 deletions(-) -- 1.8.3.1
[RFC 0/3] Introduce PCI device blacklisting
This set introduces a method to keep devices from matching driver id tables. Please see 0001 for more details of the motivation and implementation. This set currently applies to Bjorn's next (v4.14-rc1) branch. Jon Derrick (3): PCI: pci-driver: Introduce pci device delete list module: Ignore delete_id parameter Documentation: Add pci device delete_id interface Documentation/admin-guide/kernel-parameters.txt | 13 ++ drivers/pci/pci-driver.c| 253 +++- include/linux/pci.h | 1 + kernel/module.c | 7 + 4 files changed, 272 insertions(+), 2 deletions(-) -- 1.8.3.1
[RFC 1/3] PCI: pci-driver: Introduce pci device delete list
This patch introduces a new kernel command line parameter to mask pci device ids from pci driver id tables. This prevents masked devices from automatically binding to both built-in and module drivers. Devices can be later attached through the driver's sysfs new_id inteface. The use cases for this are primarily for debugging, eg, being able to prevent attachment before probes are set up. It can also be used to mask off faulty built-in hardware or faulty simulated hardware. Another use case is to prevent attachment of devices which will be passed to VMs, shortcutting the detachment effort. The format is similar to the sysfs new_id format. Device ids are specified with: :[:SVVV:SDDD][:][:] Where: = Vendor ID = Device ID SVVV = Subvendor ID SDDD = Subdevice ID = Class = Class Mask IDs can be chained with commas. Examples: .delete_id=1234:5678 .delete_id=: .delete_id=::::010802 .delete_id=1234:5678,abcd:ef01,2345: Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- drivers/pci/pci-driver.c | 253 ++- include/linux/pci.h | 1 + 2 files changed, 252 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 11bd267..7acdf13 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "pci.h" struct pci_dynid { @@ -27,6 +28,250 @@ struct pci_dynid { struct pci_device_id id; }; +static struct pci_device_id *pci_match_deleted_ids(struct pci_driver *drv, + struct pci_dev *dev, + bool inverse) +{ + struct pci_dynid *deleteid; + struct pci_device_id *found_id = NULL; + + spin_lock(>deleteids.lock); + list_for_each_entry(deleteid, >deleteids.list, node) { + if (!inverse && pci_match_one_device(>id, dev)) { + found_id = >id; + break; + } + if (inverse && !pci_match_one_device(>id, dev)) { + found_id = >id; + break; + } + } + spin_unlock(>deleteids.lock); + + return found_id; +} + +/** + * pci_match_non_deleted_ids - match dev against not-deleted ids + * @ids: array of PCI device id structures to search in + * @dev: the PCI device structure to match against. + * + * Finds devices in driver id table not matching the delete list and tries to + * match them individually to dev. Returns the matching pci_dev_id structure, + * %NULL if there is no match, of -errno if there was a failure to allocate + * memory for the temporary pci_dev + */ +static struct pci_device_id *pci_match_non_deleted_ids(struct pci_driver *drv, + struct pci_dev *dev) +{ + const struct pci_device_id *ids = drv->id_table; + struct pci_device_id *match = NULL; + struct pci_dev *tmpdev; + + tmpdev = kzalloc(sizeof(*tmpdev), GFP_KERNEL); + if (!tmpdev) + return ERR_PTR(-ENOMEM); + + while (ids->vendor || ids->subvendor || ids->class_mask) { + struct pci_device_id *found_id = NULL; + + tmpdev->vendor = ids->vendor; + tmpdev->device = ids->device, + tmpdev->subsystem_vendor = ids->subvendor, + tmpdev->subsystem_device = ids->subdevice, + tmpdev->class = ids->class; + + found_id = pci_match_deleted_ids(drv, tmpdev, true); + if (found_id) { + const struct pci_device_id id[2] = { *found_id, {0,} }; + if (pci_match_id(id, dev)) { + match = found_id; + break; + } + } + ids++; + } + + kfree(tmpdev); + + return match; +} + +/** + * pci_add_delete_id - add a new PCI device ID to a built-in driver's blacklist + * @drv: target pci driver + * @vendor: PCI vendor ID + * @device: PCI device ID + * @subvendor: PCI subvendor ID + * @subdevice: PCI subdevice ID + * @class: PCI class + * @class_mask: PCI class mask + * + * Adds a new dynamic pci device ID to this driver's delete list, preventing + * the device from attaching to built-in drivers + * + * CONTEXT: + * Does GFP_KERNEL allocation. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int pci_add_delete_id(struct pci_driver *drv, + unsigned int vendor, unsigned int device, + unsigned int subvendor, unsigned int subdevice, + unsigned int class, unsigned int class_mask) +{
[RFC 1/3] PCI: pci-driver: Introduce pci device delete list
This patch introduces a new kernel command line parameter to mask pci device ids from pci driver id tables. This prevents masked devices from automatically binding to both built-in and module drivers. Devices can be later attached through the driver's sysfs new_id inteface. The use cases for this are primarily for debugging, eg, being able to prevent attachment before probes are set up. It can also be used to mask off faulty built-in hardware or faulty simulated hardware. Another use case is to prevent attachment of devices which will be passed to VMs, shortcutting the detachment effort. The format is similar to the sysfs new_id format. Device ids are specified with: :[:SVVV:SDDD][:][:] Where: = Vendor ID = Device ID SVVV = Subvendor ID SDDD = Subdevice ID = Class = Class Mask IDs can be chained with commas. Examples: .delete_id=1234:5678 .delete_id=: .delete_id=::::010802 .delete_id=1234:5678,abcd:ef01,2345: Signed-off-by: Jon Derrick --- drivers/pci/pci-driver.c | 253 ++- include/linux/pci.h | 1 + 2 files changed, 252 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 11bd267..7acdf13 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "pci.h" struct pci_dynid { @@ -27,6 +28,250 @@ struct pci_dynid { struct pci_device_id id; }; +static struct pci_device_id *pci_match_deleted_ids(struct pci_driver *drv, + struct pci_dev *dev, + bool inverse) +{ + struct pci_dynid *deleteid; + struct pci_device_id *found_id = NULL; + + spin_lock(>deleteids.lock); + list_for_each_entry(deleteid, >deleteids.list, node) { + if (!inverse && pci_match_one_device(>id, dev)) { + found_id = >id; + break; + } + if (inverse && !pci_match_one_device(>id, dev)) { + found_id = >id; + break; + } + } + spin_unlock(>deleteids.lock); + + return found_id; +} + +/** + * pci_match_non_deleted_ids - match dev against not-deleted ids + * @ids: array of PCI device id structures to search in + * @dev: the PCI device structure to match against. + * + * Finds devices in driver id table not matching the delete list and tries to + * match them individually to dev. Returns the matching pci_dev_id structure, + * %NULL if there is no match, of -errno if there was a failure to allocate + * memory for the temporary pci_dev + */ +static struct pci_device_id *pci_match_non_deleted_ids(struct pci_driver *drv, + struct pci_dev *dev) +{ + const struct pci_device_id *ids = drv->id_table; + struct pci_device_id *match = NULL; + struct pci_dev *tmpdev; + + tmpdev = kzalloc(sizeof(*tmpdev), GFP_KERNEL); + if (!tmpdev) + return ERR_PTR(-ENOMEM); + + while (ids->vendor || ids->subvendor || ids->class_mask) { + struct pci_device_id *found_id = NULL; + + tmpdev->vendor = ids->vendor; + tmpdev->device = ids->device, + tmpdev->subsystem_vendor = ids->subvendor, + tmpdev->subsystem_device = ids->subdevice, + tmpdev->class = ids->class; + + found_id = pci_match_deleted_ids(drv, tmpdev, true); + if (found_id) { + const struct pci_device_id id[2] = { *found_id, {0,} }; + if (pci_match_id(id, dev)) { + match = found_id; + break; + } + } + ids++; + } + + kfree(tmpdev); + + return match; +} + +/** + * pci_add_delete_id - add a new PCI device ID to a built-in driver's blacklist + * @drv: target pci driver + * @vendor: PCI vendor ID + * @device: PCI device ID + * @subvendor: PCI subvendor ID + * @subdevice: PCI subdevice ID + * @class: PCI class + * @class_mask: PCI class mask + * + * Adds a new dynamic pci device ID to this driver's delete list, preventing + * the device from attaching to built-in drivers + * + * CONTEXT: + * Does GFP_KERNEL allocation. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int pci_add_delete_id(struct pci_driver *drv, + unsigned int vendor, unsigned int device, + unsigned int subvendor, unsigned int subdevice, + unsigned int class, unsigned int class_mask) +{ + struct pci_dynid *del
[RFC 2/3] module: Ignore delete_id parameter
The PCI driver delete_id parameter is handled in each individual driver registration callback. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- kernel/module.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/kernel/module.c b/kernel/module.c index de66ec8..2b2dccf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname, return 0; } + /* +* Ignore driver delete list arguments. They are handled by driver +* registration callbacks +*/ + if (strcmp(param, "delete_id") == 0) + return 0; + /* Check for magic 'dyndbg' arg */ ret = ddebug_dyndbg_module_param_cb(param, val, modname); if (ret != 0) -- 1.8.3.1
[RFC 2/3] module: Ignore delete_id parameter
The PCI driver delete_id parameter is handled in each individual driver registration callback. Signed-off-by: Jon Derrick --- kernel/module.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/kernel/module.c b/kernel/module.c index de66ec8..2b2dccf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3620,6 +3620,13 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname, return 0; } + /* +* Ignore driver delete list arguments. They are handled by driver +* registration callbacks +*/ + if (strcmp(param, "delete_id") == 0) + return 0; + /* Check for magic 'dyndbg' arg */ ret = ddebug_dyndbg_module_param_cb(param, val, modname); if (ret != 0) -- 1.8.3.1
[RFC 3/3] Documentation: Add pci device delete_id interface
Document the .delete_id= cmdline interface for masking built-in and module device id tables. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- Documentation/admin-guide/kernel-parameters.txt | 13 + 1 file changed, 13 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0549662..862c8b5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -789,6 +789,19 @@ Defaults to the default architecture's huge page size if not specified. + driver.delete_id= + [PCI] + Specified pci device format(s) will mask built-in + driver id tables, preventing devices from + automicatically attaching to built-in or module + drivers. Later attachment should be done by adding the + deleted id back through the "new_id" sysfs interface. + Format: :[::]\ + [:class][:class_mask][, ...] + PCI_ANY_ID masks should use the full 32-bit format. + Examples: virtio-pci.delete_id=: + nvme.delete_id=1234:5678,: + dhash_entries= [KNL] Set number of hash buckets for dentry cache. -- 1.8.3.1
[RFC 3/3] Documentation: Add pci device delete_id interface
Document the .delete_id= cmdline interface for masking built-in and module device id tables. Signed-off-by: Jon Derrick --- Documentation/admin-guide/kernel-parameters.txt | 13 + 1 file changed, 13 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0549662..862c8b5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -789,6 +789,19 @@ Defaults to the default architecture's huge page size if not specified. + driver.delete_id= + [PCI] + Specified pci device format(s) will mask built-in + driver id tables, preventing devices from + automicatically attaching to built-in or module + drivers. Later attachment should be done by adding the + deleted id back through the "new_id" sysfs interface. + Format: :[::]\ + [:class][:class_mask][, ...] + PCI_ANY_ID masks should use the full 32-bit format. + Examples: virtio-pci.delete_id=: + nvme.delete_id=1234:5678,: + dhash_entries= [KNL] Set number of hash buckets for dentry cache. -- 1.8.3.1
[PATCH v2 1/4] MAINTAINERS: Add Jonathan Derrick as VMD maintainer
Add myself as VMD maintainer Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> Acked-by: Keith Busch <keith.bu...@intel.com> --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f66488d..3ec39df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10090,6 +10090,7 @@ F: drivers/pci/dwc/*imx6* PCI DRIVER FOR INTEL VOLUME MANAGEMENT DEVICE (VMD) M: Keith Busch <keith.bu...@intel.com> +M: Jonathan Derrick <jonathan.derr...@intel.com> L: linux-...@vger.kernel.org S: Supported F: drivers/pci/host/vmd.c -- 2.9.4
[PATCH v2 1/4] MAINTAINERS: Add Jonathan Derrick as VMD maintainer
Add myself as VMD maintainer Signed-off-by: Jon Derrick Acked-by: Keith Busch --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f66488d..3ec39df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10090,6 +10090,7 @@ F: drivers/pci/dwc/*imx6* PCI DRIVER FOR INTEL VOLUME MANAGEMENT DEVICE (VMD) M: Keith Busch +M: Jonathan Derrick L: linux-...@vger.kernel.org S: Supported F: drivers/pci/host/vmd.c -- 2.9.4
[PATCH v2 3/4] x86/PCI: Use is_vmd rather than relying on the domain number
Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- arch/x86/pci/fixup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index ca4b02e5..1ed2fbf 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -628,7 +628,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff); static void quirk_no_aersid(struct pci_dev *pdev) { /* VMD Domain */ - if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) + if (is_vmd(pdev->bus)) pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); -- 2.9.4
[PATCH v2 3/4] x86/PCI: Use is_vmd rather than relying on the domain number
Signed-off-by: Jon Derrick --- arch/x86/pci/fixup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index ca4b02e5..1ed2fbf 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -628,7 +628,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff); static void quirk_no_aersid(struct pci_dev *pdev) { /* VMD Domain */ - if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) + if (is_vmd(pdev->bus)) pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); -- 2.9.4
[PATCH v2 0/4] VMD fixups
Mostly just cleanup in this revision, eg, trying to limit scope of vmd code to x86 Previous: https://patchwork.kernel.org/patch/9886095/ https://patchwork.kernel.org/patch/9886097/ https://patchwork.kernel.org/patch/9886101/ Jon Derrick (4): MAINTAINERS: Add Jonathan Derrick as VMD maintainer pci/x86: Move VMD quirks to x86 fixups x86/PCI: Use is_vmd rather than relying on the domain number iommu: Prevent VMD child devices from being remapping targets MAINTAINERS | 1 + arch/x86/pci/fixup.c| 18 ++ drivers/iommu/intel-iommu.c | 5 + drivers/pci/quirks.c| 17 - 4 files changed, 24 insertions(+), 17 deletions(-) -- 2.9.4
[PATCH v2 0/4] VMD fixups
Mostly just cleanup in this revision, eg, trying to limit scope of vmd code to x86 Previous: https://patchwork.kernel.org/patch/9886095/ https://patchwork.kernel.org/patch/9886097/ https://patchwork.kernel.org/patch/9886101/ Jon Derrick (4): MAINTAINERS: Add Jonathan Derrick as VMD maintainer pci/x86: Move VMD quirks to x86 fixups x86/PCI: Use is_vmd rather than relying on the domain number iommu: Prevent VMD child devices from being remapping targets MAINTAINERS | 1 + arch/x86/pci/fixup.c| 18 ++ drivers/iommu/intel-iommu.c | 5 + drivers/pci/quirks.c| 17 - 4 files changed, 24 insertions(+), 17 deletions(-) -- 2.9.4
[PATCH v2 4/4] iommu: Prevent VMD child devices from being remapping targets
VMD child devices must use the VMD endpoint's ID as the requester. Because of this, there needs to be a way to link the parent VMD endpoint's iommu group and associated mappings to the VMD child devices such that attaching and detaching child devices modify the endpoint's mappings, while preventing early detaching on a singular device removal or unbinding. The reassignment of individual VMD child devices devices to VMs is outside the scope of VMD, but may be implemented in the future. For now it is best to prevent any such attempts. This patch prevents VMD child devices from returning an IOMMU, which prevents it from exposing an iommu_group sysfs directories and allowing subsequent binding by userspace-access drivers such as VFIO. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- drivers/iommu/intel-iommu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 687f18f..94353a6e 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -901,6 +901,11 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf struct pci_dev *pf_pdev; pdev = to_pci_dev(dev); + + /* VMD child devices currently cannot be handled individually */ + if (is_vmd(pdev->bus)) + return NULL; + /* VFs aren't listed in scope tables; we need to look up * the PF instead to find the IOMMU. */ pf_pdev = pci_physfn(pdev); -- 2.9.4
[PATCH v2 4/4] iommu: Prevent VMD child devices from being remapping targets
VMD child devices must use the VMD endpoint's ID as the requester. Because of this, there needs to be a way to link the parent VMD endpoint's iommu group and associated mappings to the VMD child devices such that attaching and detaching child devices modify the endpoint's mappings, while preventing early detaching on a singular device removal or unbinding. The reassignment of individual VMD child devices devices to VMs is outside the scope of VMD, but may be implemented in the future. For now it is best to prevent any such attempts. This patch prevents VMD child devices from returning an IOMMU, which prevents it from exposing an iommu_group sysfs directories and allowing subsequent binding by userspace-access drivers such as VFIO. Signed-off-by: Jon Derrick --- drivers/iommu/intel-iommu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 687f18f..94353a6e 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -901,6 +901,11 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf struct pci_dev *pf_pdev; pdev = to_pci_dev(dev); + + /* VMD child devices currently cannot be handled individually */ + if (is_vmd(pdev->bus)) + return NULL; + /* VFs aren't listed in scope tables; we need to look up * the PF instead to find the IOMMU. */ pf_pdev = pci_physfn(pdev); -- 2.9.4
[PATCH v2 2/4] pci/x86: Move VMD quirks to x86 fixups
VMD currently only exists for Intel x86 products Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- arch/x86/pci/fixup.c | 18 ++ drivers/pci/quirks.c | 17 - 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 11e4074..ca4b02e5 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -618,3 +618,21 @@ static void quirk_apple_mbp_poweroff(struct pci_dev *pdev) dev_info(dev, "can't work around MacBook Pro poweroff issue\n"); } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff); + +/* + * VMD-enabled root ports will change the source ID for all messages + * to the VMD device. Rather than doing device matching with the source + * ID, the AER driver should traverse the child device tree, reading + * AER registers to find the faulting device. + */ +static void quirk_no_aersid(struct pci_dev *pdev) +{ + /* VMD Domain */ + if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) + pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); + diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index f1c9852..073baba 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4671,23 +4671,6 @@ static void quirk_intel_qat_vf_cap(struct pci_dev *pdev) } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, quirk_intel_qat_vf_cap); -/* - * VMD-enabled root ports will change the source ID for all messages - * to the VMD device. Rather than doing device matching with the source - * ID, the AER driver should traverse the child device tree, reading - * AER registers to find the faulting device. - */ -static void quirk_no_aersid(struct pci_dev *pdev) -{ - /* VMD Domain */ - if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) - pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; -} -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); - /* FLR may cause some 82579 devices to hang. */ static void quirk_intel_no_flr(struct pci_dev *dev) { -- 2.9.4
[PATCH v2 2/4] pci/x86: Move VMD quirks to x86 fixups
VMD currently only exists for Intel x86 products Signed-off-by: Jon Derrick --- arch/x86/pci/fixup.c | 18 ++ drivers/pci/quirks.c | 17 - 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 11e4074..ca4b02e5 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -618,3 +618,21 @@ static void quirk_apple_mbp_poweroff(struct pci_dev *pdev) dev_info(dev, "can't work around MacBook Pro poweroff issue\n"); } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff); + +/* + * VMD-enabled root ports will change the source ID for all messages + * to the VMD device. Rather than doing device matching with the source + * ID, the AER driver should traverse the child device tree, reading + * AER registers to find the faulting device. + */ +static void quirk_no_aersid(struct pci_dev *pdev) +{ + /* VMD Domain */ + if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) + pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); + diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index f1c9852..073baba 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4671,23 +4671,6 @@ static void quirk_intel_qat_vf_cap(struct pci_dev *pdev) } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, quirk_intel_qat_vf_cap); -/* - * VMD-enabled root ports will change the source ID for all messages - * to the VMD device. Rather than doing device matching with the source - * ID, the AER driver should traverse the child device tree, reading - * AER registers to find the faulting device. - */ -static void quirk_no_aersid(struct pci_dev *pdev) -{ - /* VMD Domain */ - if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) - pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; -} -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); - /* FLR may cause some 82579 devices to hang. */ static void quirk_intel_no_flr(struct pci_dev *dev) { -- 2.9.4
Re: [PATCH 3/3] iommu: prevent VMD child devices from being remapping targets
Hi Robin, thanks for the reply. On 08/11/2017 12:25 PM, Robin Murphy wrote: > On 07/08/17 20:57, Jon Derrick wrote: >> VMD child devices must use the VMD endpoint's ID as the DMA source. >> Because of this, there needs to be a way to link the parent VMD >> endpoint's DMAR domain to the VMD child devices' DMAR domain such that >> attaching and detaching child devices modify the endpoint's DMAR mapping >> and prevents early detaching. > > That sounds like either pci_device_group() needs modifying, or perhaps > that intel-iommu needs its own extended iommu_ops::device_group > implementation, to ensure that VMD child devices get put in the same > group as their parent - if they share requester IDs they can't feasibly > be attached to different domains anyway. > > Robin. Yes it seems like that to me too. I have a high-level understanding of the changes required but not too much in the nitty-gritty details. It's a bit more complicated anyways because VMD emerges a set of root ports, and AFAICT, PCI ACS end at the root ports and iommu groups rely on ACS to group devices. Either way it's not within the scope of VMD.
Re: [PATCH 3/3] iommu: prevent VMD child devices from being remapping targets
Hi Robin, thanks for the reply. On 08/11/2017 12:25 PM, Robin Murphy wrote: > On 07/08/17 20:57, Jon Derrick wrote: >> VMD child devices must use the VMD endpoint's ID as the DMA source. >> Because of this, there needs to be a way to link the parent VMD >> endpoint's DMAR domain to the VMD child devices' DMAR domain such that >> attaching and detaching child devices modify the endpoint's DMAR mapping >> and prevents early detaching. > > That sounds like either pci_device_group() needs modifying, or perhaps > that intel-iommu needs its own extended iommu_ops::device_group > implementation, to ensure that VMD child devices get put in the same > group as their parent - if they share requester IDs they can't feasibly > be attached to different domains anyway. > > Robin. Yes it seems like that to me too. I have a high-level understanding of the changes required but not too much in the nitty-gritty details. It's a bit more complicated anyways because VMD emerges a set of root ports, and AFAICT, PCI ACS end at the root ports and iommu groups rely on ACS to group devices. Either way it's not within the scope of VMD.
Re: [PATCH 2/3] pci: Generalize is_vmd behavior
On 08/11/2017 11:03 AM, Bjorn Helgaas wrote: > On Mon, Aug 07, 2017 at 01:57:12PM -0600, Jon Derrick wrote: >> Generalize is_vmd behavior to remove dependency on domain number >> checking in pci quirks. >> >> Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> >> --- >> arch/x86/include/asm/pci.h | 8 +++- >> arch/x86/pci/common.c | 2 +- >> drivers/pci/quirks.c | 2 +- >> include/linux/pci.h| 4 >> 4 files changed, 9 insertions(+), 7 deletions(-) >> >> diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h >> index 473a729..5c5d54a 100644 >> --- a/arch/x86/include/asm/pci.h >> +++ b/arch/x86/include/asm/pci.h >> @@ -60,16 +60,14 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus >> *bus) >> #define pci_root_bus_fwnode _pci_root_bus_fwnode >> #endif >> >> -static inline bool is_vmd(struct pci_bus *bus) >> -{ >> #if IS_ENABLED(CONFIG_VMD) >> +static inline bool pci_bus_is_vmd(struct pci_bus *bus) >> +{ >> struct pci_sysdata *sd = bus->sysdata; >> >> return sd->vmd_domain; >> -#else >> -return false; >> -#endif >> } >> +#endif >> >> /* Can be used to override the logic in pci_scan_bus for skipping >> already-configured bus numbers - to be used for buggy BIOSes >> diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c >> index dbe2132..18b2277 100644 >> --- a/arch/x86/pci/common.c >> +++ b/arch/x86/pci/common.c >> @@ -662,7 +662,7 @@ static void set_dma_domain_ops(struct pci_dev *pdev) {} >> >> static void set_dev_domain_options(struct pci_dev *pdev) >> { >> -if (is_vmd(pdev->bus)) >> +if (pci_bus_is_vmd(pdev->bus)) >> pdev->hotplug_user_indicators = 1; >> } >> >> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c >> index 6967c6b..ba47995 100644 >> --- a/drivers/pci/quirks.c >> +++ b/drivers/pci/quirks.c >> @@ -4666,7 +4666,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, >> quirk_intel_qat_vf_cap); >> static void quirk_no_aersid(struct pci_dev *pdev) >> { >> /* VMD Domain */ >> -if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) >> +if (pci_bus_is_vmd(pdev->bus)) > > I like this part ... > >> pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; >> } >> DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); >> diff --git a/include/linux/pci.h b/include/linux/pci.h >> index 4869e66..0299d8b 100644 >> --- a/include/linux/pci.h >> +++ b/include/linux/pci.h >> @@ -1471,6 +1471,10 @@ static inline int pci_proc_domain(struct pci_bus >> *bus) { return 0; } >> static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } >> #endif /* CONFIG_PCI_DOMAINS */ >> >> +#if !IS_ENABLED(CONFIG_VMD) >> +static inline bool pci_bus_is_vmd(struct pci_bus *bus) { return false; } >> +#endif > > But not so much this part. VMD is (AFAIK) still fundamentally an > x86-only thing, so I'd like it better if this could all be within > arch/x86. Could this be done by moving quirk_no_aersid() to > arch/x86/pci/fixup.c? > Thanks for pointing this out. I'll think of something different and localize it to the x86 code domain. > BTW, CONFIG_VMD in drivers/pci/host/Kconfig is currently "depends on > SRCU". I'm not a Kconfig expert, but that doesn't seem like an > intuitive connection. And it's the only such dependency on SRCU in > the tree -- most other places use "select SRCU", which makes more > sense to me. I agree - most places use select SRCU. I'll add that to v2. > >> /* >> * Generic implementation for PCI domain support. If your >> * architecture does not need custom management of PCI >> -- >> 2.9.4 >>
Re: [PATCH 2/3] pci: Generalize is_vmd behavior
On 08/11/2017 11:03 AM, Bjorn Helgaas wrote: > On Mon, Aug 07, 2017 at 01:57:12PM -0600, Jon Derrick wrote: >> Generalize is_vmd behavior to remove dependency on domain number >> checking in pci quirks. >> >> Signed-off-by: Jon Derrick >> --- >> arch/x86/include/asm/pci.h | 8 +++- >> arch/x86/pci/common.c | 2 +- >> drivers/pci/quirks.c | 2 +- >> include/linux/pci.h| 4 >> 4 files changed, 9 insertions(+), 7 deletions(-) >> >> diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h >> index 473a729..5c5d54a 100644 >> --- a/arch/x86/include/asm/pci.h >> +++ b/arch/x86/include/asm/pci.h >> @@ -60,16 +60,14 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus >> *bus) >> #define pci_root_bus_fwnode _pci_root_bus_fwnode >> #endif >> >> -static inline bool is_vmd(struct pci_bus *bus) >> -{ >> #if IS_ENABLED(CONFIG_VMD) >> +static inline bool pci_bus_is_vmd(struct pci_bus *bus) >> +{ >> struct pci_sysdata *sd = bus->sysdata; >> >> return sd->vmd_domain; >> -#else >> -return false; >> -#endif >> } >> +#endif >> >> /* Can be used to override the logic in pci_scan_bus for skipping >> already-configured bus numbers - to be used for buggy BIOSes >> diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c >> index dbe2132..18b2277 100644 >> --- a/arch/x86/pci/common.c >> +++ b/arch/x86/pci/common.c >> @@ -662,7 +662,7 @@ static void set_dma_domain_ops(struct pci_dev *pdev) {} >> >> static void set_dev_domain_options(struct pci_dev *pdev) >> { >> -if (is_vmd(pdev->bus)) >> +if (pci_bus_is_vmd(pdev->bus)) >> pdev->hotplug_user_indicators = 1; >> } >> >> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c >> index 6967c6b..ba47995 100644 >> --- a/drivers/pci/quirks.c >> +++ b/drivers/pci/quirks.c >> @@ -4666,7 +4666,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, >> quirk_intel_qat_vf_cap); >> static void quirk_no_aersid(struct pci_dev *pdev) >> { >> /* VMD Domain */ >> -if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) >> +if (pci_bus_is_vmd(pdev->bus)) > > I like this part ... > >> pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; >> } >> DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); >> diff --git a/include/linux/pci.h b/include/linux/pci.h >> index 4869e66..0299d8b 100644 >> --- a/include/linux/pci.h >> +++ b/include/linux/pci.h >> @@ -1471,6 +1471,10 @@ static inline int pci_proc_domain(struct pci_bus >> *bus) { return 0; } >> static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } >> #endif /* CONFIG_PCI_DOMAINS */ >> >> +#if !IS_ENABLED(CONFIG_VMD) >> +static inline bool pci_bus_is_vmd(struct pci_bus *bus) { return false; } >> +#endif > > But not so much this part. VMD is (AFAIK) still fundamentally an > x86-only thing, so I'd like it better if this could all be within > arch/x86. Could this be done by moving quirk_no_aersid() to > arch/x86/pci/fixup.c? > Thanks for pointing this out. I'll think of something different and localize it to the x86 code domain. > BTW, CONFIG_VMD in drivers/pci/host/Kconfig is currently "depends on > SRCU". I'm not a Kconfig expert, but that doesn't seem like an > intuitive connection. And it's the only such dependency on SRCU in > the tree -- most other places use "select SRCU", which makes more > sense to me. I agree - most places use select SRCU. I'll add that to v2. > >> /* >> * Generic implementation for PCI domain support. If your >> * architecture does not need custom management of PCI >> -- >> 2.9.4 >>
[PATCH 3/3] iommu: prevent VMD child devices from being remapping targets
VMD child devices must use the VMD endpoint's ID as the DMA source. Because of this, there needs to be a way to link the parent VMD endpoint's DMAR domain to the VMD child devices' DMAR domain such that attaching and detaching child devices modify the endpoint's DMAR mapping and prevents early detaching. This is outside the scope of VMD, so disable binding child devices to prevent unforeseen issues. This functionality may be implemented in the future. This patch prevents VMD child devices from returning an IOMMU, which prevents it from exposing iommu_group sysfs directories and subsequent binding by userspace-access drivers such as VFIO. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- drivers/iommu/intel-iommu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 687f18f..651a6cd 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -905,6 +905,11 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf * the PF instead to find the IOMMU. */ pf_pdev = pci_physfn(pdev); dev = _pdev->dev; + + /* VMD child devices currently cannot be handled individually */ + if (pci_bus_is_vmd(pdev->bus)) + return NULL; + segment = pci_domain_nr(pdev->bus); } else if (has_acpi_companion(dev)) dev = _COMPANION(dev)->dev; -- 2.9.4
[PATCH 3/3] iommu: prevent VMD child devices from being remapping targets
VMD child devices must use the VMD endpoint's ID as the DMA source. Because of this, there needs to be a way to link the parent VMD endpoint's DMAR domain to the VMD child devices' DMAR domain such that attaching and detaching child devices modify the endpoint's DMAR mapping and prevents early detaching. This is outside the scope of VMD, so disable binding child devices to prevent unforeseen issues. This functionality may be implemented in the future. This patch prevents VMD child devices from returning an IOMMU, which prevents it from exposing iommu_group sysfs directories and subsequent binding by userspace-access drivers such as VFIO. Signed-off-by: Jon Derrick --- drivers/iommu/intel-iommu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 687f18f..651a6cd 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -905,6 +905,11 @@ static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devf * the PF instead to find the IOMMU. */ pf_pdev = pci_physfn(pdev); dev = _pdev->dev; + + /* VMD child devices currently cannot be handled individually */ + if (pci_bus_is_vmd(pdev->bus)) + return NULL; + segment = pci_domain_nr(pdev->bus); } else if (has_acpi_companion(dev)) dev = _COMPANION(dev)->dev; -- 2.9.4
[PATCH 2/3] pci: Generalize is_vmd behavior
Generalize is_vmd behavior to remove dependency on domain number checking in pci quirks. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- arch/x86/include/asm/pci.h | 8 +++- arch/x86/pci/common.c | 2 +- drivers/pci/quirks.c | 2 +- include/linux/pci.h| 4 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 473a729..5c5d54a 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -60,16 +60,14 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus *bus) #define pci_root_bus_fwnode_pci_root_bus_fwnode #endif -static inline bool is_vmd(struct pci_bus *bus) -{ #if IS_ENABLED(CONFIG_VMD) +static inline bool pci_bus_is_vmd(struct pci_bus *bus) +{ struct pci_sysdata *sd = bus->sysdata; return sd->vmd_domain; -#else - return false; -#endif } +#endif /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index dbe2132..18b2277 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -662,7 +662,7 @@ static void set_dma_domain_ops(struct pci_dev *pdev) {} static void set_dev_domain_options(struct pci_dev *pdev) { - if (is_vmd(pdev->bus)) + if (pci_bus_is_vmd(pdev->bus)) pdev->hotplug_user_indicators = 1; } diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6967c6b..ba47995 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4666,7 +4666,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, quirk_intel_qat_vf_cap); static void quirk_no_aersid(struct pci_dev *pdev) { /* VMD Domain */ - if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) + if (pci_bus_is_vmd(pdev->bus)) pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); diff --git a/include/linux/pci.h b/include/linux/pci.h index 4869e66..0299d8b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1471,6 +1471,10 @@ static inline int pci_proc_domain(struct pci_bus *bus) { return 0; } static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } #endif /* CONFIG_PCI_DOMAINS */ +#if !IS_ENABLED(CONFIG_VMD) +static inline bool pci_bus_is_vmd(struct pci_bus *bus) { return false; } +#endif + /* * Generic implementation for PCI domain support. If your * architecture does not need custom management of PCI -- 2.9.4
[PATCH 2/3] pci: Generalize is_vmd behavior
Generalize is_vmd behavior to remove dependency on domain number checking in pci quirks. Signed-off-by: Jon Derrick --- arch/x86/include/asm/pci.h | 8 +++- arch/x86/pci/common.c | 2 +- drivers/pci/quirks.c | 2 +- include/linux/pci.h| 4 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 473a729..5c5d54a 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -60,16 +60,14 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus *bus) #define pci_root_bus_fwnode_pci_root_bus_fwnode #endif -static inline bool is_vmd(struct pci_bus *bus) -{ #if IS_ENABLED(CONFIG_VMD) +static inline bool pci_bus_is_vmd(struct pci_bus *bus) +{ struct pci_sysdata *sd = bus->sysdata; return sd->vmd_domain; -#else - return false; -#endif } +#endif /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index dbe2132..18b2277 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -662,7 +662,7 @@ static void set_dma_domain_ops(struct pci_dev *pdev) {} static void set_dev_domain_options(struct pci_dev *pdev) { - if (is_vmd(pdev->bus)) + if (pci_bus_is_vmd(pdev->bus)) pdev->hotplug_user_indicators = 1; } diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6967c6b..ba47995 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4666,7 +4666,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, quirk_intel_qat_vf_cap); static void quirk_no_aersid(struct pci_dev *pdev) { /* VMD Domain */ - if (pdev->bus->sysdata && pci_domain_nr(pdev->bus) >= 0x1) + if (pci_bus_is_vmd(pdev->bus)) pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); diff --git a/include/linux/pci.h b/include/linux/pci.h index 4869e66..0299d8b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1471,6 +1471,10 @@ static inline int pci_proc_domain(struct pci_bus *bus) { return 0; } static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } #endif /* CONFIG_PCI_DOMAINS */ +#if !IS_ENABLED(CONFIG_VMD) +static inline bool pci_bus_is_vmd(struct pci_bus *bus) { return false; } +#endif + /* * Generic implementation for PCI domain support. If your * architecture does not need custom management of PCI -- 2.9.4
[PATCH 1/3] MAINTAINERS: Add Jonathan Derrick as VMD maintainer
Add myself as VMD maintainer Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f66488d..3ec39df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10090,6 +10090,7 @@ F: drivers/pci/dwc/*imx6* PCI DRIVER FOR INTEL VOLUME MANAGEMENT DEVICE (VMD) M: Keith Busch <keith.bu...@intel.com> +M: Jonathan Derrick <jonathan.derr...@intel.com> L: linux-...@vger.kernel.org S: Supported F: drivers/pci/host/vmd.c -- 2.9.4
[PATCH 1/3] MAINTAINERS: Add Jonathan Derrick as VMD maintainer
Add myself as VMD maintainer Signed-off-by: Jon Derrick --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f66488d..3ec39df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10090,6 +10090,7 @@ F: drivers/pci/dwc/*imx6* PCI DRIVER FOR INTEL VOLUME MANAGEMENT DEVICE (VMD) M: Keith Busch +M: Jonathan Derrick L: linux-...@vger.kernel.org S: Supported F: drivers/pci/host/vmd.c -- 2.9.4
Re: block/sed-opal.c: 2 * bad if tests ?
On 03/06/2017 05:00 AM, David Binderman wrote: > Hello there, > > 1. > > block/sed-opal.c:2136:40: warning: logical ‘and’ of mutually exclusive tests > is always false [-Wlogical-op] > > Source code is > > if (lk_unlk->session.who < OPAL_USER1 && > lk_unlk->session.who > OPAL_USER9) { > > 2. > > block/sed-opal.c:2319:37: warning: logical ‘and’ of mutually exclusive tests > is always false [-Wlogical-op] > > if (opal_session->who < OPAL_USER1 && > opal_session->who > OPAL_USER9) { > > Duplicate. > > Also in the same file: > > [block/sed-opal.c:1034]: (style) Variable 'method' is assigned a value that > is never used. > > Regards > > David Binderman > Thanks for the catch(es). Will provide patch shortly
Re: block/sed-opal.c: 2 * bad if tests ?
On 03/06/2017 05:00 AM, David Binderman wrote: > Hello there, > > 1. > > block/sed-opal.c:2136:40: warning: logical ‘and’ of mutually exclusive tests > is always false [-Wlogical-op] > > Source code is > > if (lk_unlk->session.who < OPAL_USER1 && > lk_unlk->session.who > OPAL_USER9) { > > 2. > > block/sed-opal.c:2319:37: warning: logical ‘and’ of mutually exclusive tests > is always false [-Wlogical-op] > > if (opal_session->who < OPAL_USER1 && > opal_session->who > OPAL_USER9) { > > Duplicate. > > Also in the same file: > > [block/sed-opal.c:1034]: (style) Variable 'method' is assigned a value that > is never used. > > Regards > > David Binderman > Thanks for the catch(es). Will provide patch shortly
Re: [PATCHv3 4/4] MAINTAINERS: Remove powerpc's opal match
Thanks everyone. Sorry about the mess :) On 02/15/2017 10:23 PM, Michael Ellerman wrote: > Jon Derrick <jonathan.derr...@intel.com> writes: > >> PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks >> like the 'arch/powerpc' file pattern should be enough to match powerpc >> opal code by itself. Remove the opal regex pattern from powerpc. > > We thought of it first. > > Can't you just rename your driver, Opal Storage Specification, so "oss", > that should be pretty unique? > > ... :) > > I don't like this version, but I'll merge the one from Stewart which > drops the pattern and adds the paths for the existing drivers. > > cheers > >> diff --git a/MAINTAINERS b/MAINTAINERS >> index b983b25..430dd02 100644 >> --- a/MAINTAINERS >> +++ b/MAINTAINERS >> @@ -7404,7 +7404,6 @@ F: drivers/pci/hotplug/pnv_php.c >> F: drivers/pci/hotplug/rpa* >> F: drivers/scsi/ibmvscsi/ >> F: tools/testing/selftests/powerpc >> -N: opal >> N: /pmac >> N: powermac >> N: powernv >> -- >> 1.8.3.1
Re: [PATCHv3 4/4] MAINTAINERS: Remove powerpc's opal match
Thanks everyone. Sorry about the mess :) On 02/15/2017 10:23 PM, Michael Ellerman wrote: > Jon Derrick writes: > >> PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks >> like the 'arch/powerpc' file pattern should be enough to match powerpc >> opal code by itself. Remove the opal regex pattern from powerpc. > > We thought of it first. > > Can't you just rename your driver, Opal Storage Specification, so "oss", > that should be pretty unique? > > ... :) > > I don't like this version, but I'll merge the one from Stewart which > drops the pattern and adds the paths for the existing drivers. > > cheers > >> diff --git a/MAINTAINERS b/MAINTAINERS >> index b983b25..430dd02 100644 >> --- a/MAINTAINERS >> +++ b/MAINTAINERS >> @@ -7404,7 +7404,6 @@ F: drivers/pci/hotplug/pnv_php.c >> F: drivers/pci/hotplug/rpa* >> F: drivers/scsi/ibmvscsi/ >> F: tools/testing/selftests/powerpc >> -N: opal >> N: /pmac >> N: powermac >> N: powernv >> -- >> 1.8.3.1
[PATCHv4 2/4] block/sed: Add helper to qualify response tokens
Add helper which verifies the response token is valid and matches the expected value. Merges token_type and response_get_token. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> Reviewed-by: Scott Bauer <scott.ba...@intel.com> --- block/sed-opal.c | 61 +++- 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 77623ad..00673cf 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -591,48 +591,25 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) return 0; } -static enum opal_response_token token_type(const struct parsed_resp *resp, - int n) +static const struct opal_resp_tok *response_get_token( + const struct parsed_resp *resp, + int n) { const struct opal_resp_tok *tok; if (n >= resp->num) { pr_err("Token number doesn't exist: %d, resp: %d\n", n, resp->num); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } tok = >toks[n]; if (tok->len == 0) { pr_err("Token length must be non-zero\n"); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } - return tok->type; -} - -/* - * This function returns 0 in case of invalid token. One should call - * token_type() first to find out if the token is valid or not. - */ -static enum opal_token response_get_token(const struct parsed_resp *resp, - int n) -{ - const struct opal_resp_tok *tok; - - if (n >= resp->num) { - pr_err("Token number doesn't exist: %d, resp: %d\n", - n, resp->num); - return 0; - } - - tok = >toks[n]; - if (tok->len == 0) { - pr_err("Token length must be non-zero\n"); - return 0; - } - - return tok->pos[0]; + return tok; } static ssize_t response_parse_tiny(struct opal_resp_tok *tok, @@ -851,20 +828,32 @@ static u64 response_get_u64(const struct parsed_resp *resp, int n) return resp->toks[n].stored.u; } +static bool response_token_matches(const struct opal_resp_tok *token, u8 match) +{ + if (IS_ERR(token) || + token->type != OPAL_DTA_TOKENID_TOKEN || + token->pos[0] != match) + return false; + return true; +} + static u8 response_status(const struct parsed_resp *resp) { - if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN && - response_get_token(resp, 0) == OPAL_ENDOFSESSION) { + const struct opal_resp_tok *tok; + + tok = response_get_token(resp, 0); + if (response_token_matches(tok, OPAL_ENDOFSESSION)) return 0; - } if (resp->num < 5) return DTAERROR_NO_METHOD_STATUS; - if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN || - token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN || - response_get_token(resp, resp->num - 1) != OPAL_ENDLIST || - response_get_token(resp, resp->num - 5) != OPAL_STARTLIST) + tok = response_get_token(resp, resp->num - 5); + if (!response_token_matches(tok, OPAL_STARTLIST)) + return DTAERROR_NO_METHOD_STATUS; + + tok = response_get_token(resp, resp->num - 1); + if (!response_token_matches(tok, OPAL_ENDLIST)) return DTAERROR_NO_METHOD_STATUS; return response_get_u64(resp, resp->num - 4); -- 1.8.3.1
[PATCHv4 2/4] block/sed: Add helper to qualify response tokens
Add helper which verifies the response token is valid and matches the expected value. Merges token_type and response_get_token. Signed-off-by: Jon Derrick Reviewed-by: Scott Bauer --- block/sed-opal.c | 61 +++- 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 77623ad..00673cf 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -591,48 +591,25 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) return 0; } -static enum opal_response_token token_type(const struct parsed_resp *resp, - int n) +static const struct opal_resp_tok *response_get_token( + const struct parsed_resp *resp, + int n) { const struct opal_resp_tok *tok; if (n >= resp->num) { pr_err("Token number doesn't exist: %d, resp: %d\n", n, resp->num); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } tok = >toks[n]; if (tok->len == 0) { pr_err("Token length must be non-zero\n"); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } - return tok->type; -} - -/* - * This function returns 0 in case of invalid token. One should call - * token_type() first to find out if the token is valid or not. - */ -static enum opal_token response_get_token(const struct parsed_resp *resp, - int n) -{ - const struct opal_resp_tok *tok; - - if (n >= resp->num) { - pr_err("Token number doesn't exist: %d, resp: %d\n", - n, resp->num); - return 0; - } - - tok = >toks[n]; - if (tok->len == 0) { - pr_err("Token length must be non-zero\n"); - return 0; - } - - return tok->pos[0]; + return tok; } static ssize_t response_parse_tiny(struct opal_resp_tok *tok, @@ -851,20 +828,32 @@ static u64 response_get_u64(const struct parsed_resp *resp, int n) return resp->toks[n].stored.u; } +static bool response_token_matches(const struct opal_resp_tok *token, u8 match) +{ + if (IS_ERR(token) || + token->type != OPAL_DTA_TOKENID_TOKEN || + token->pos[0] != match) + return false; + return true; +} + static u8 response_status(const struct parsed_resp *resp) { - if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN && - response_get_token(resp, 0) == OPAL_ENDOFSESSION) { + const struct opal_resp_tok *tok; + + tok = response_get_token(resp, 0); + if (response_token_matches(tok, OPAL_ENDOFSESSION)) return 0; - } if (resp->num < 5) return DTAERROR_NO_METHOD_STATUS; - if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN || - token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN || - response_get_token(resp, resp->num - 1) != OPAL_ENDLIST || - response_get_token(resp, resp->num - 5) != OPAL_STARTLIST) + tok = response_get_token(resp, resp->num - 5); + if (!response_token_matches(tok, OPAL_STARTLIST)) + return DTAERROR_NO_METHOD_STATUS; + + tok = response_get_token(resp, resp->num - 1); + if (!response_token_matches(tok, OPAL_ENDLIST)) return DTAERROR_NO_METHOD_STATUS; return response_get_u64(resp, resp->num - 4); -- 1.8.3.1
[PATCHv4 3/4] block/sed: Check received header lengths
Add a buffer size check against discovery and response header lengths before we loop over their buffers. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> Reviewed-by: Scott Bauer <scott.ba...@intel.com> --- block/sed-opal.c | 35 +-- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 00673cf..383d5d6 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev) const struct d0_header *hdr = (struct d0_header *)dev->resp; const u8 *epos = dev->resp, *cpos = dev->resp; u16 comid = 0; + u32 hlen = be32_to_cpu(hdr->length); - print_buffer(dev->resp, be32_to_cpu(hdr->length)); + print_buffer(dev->resp, hlen); - epos += be32_to_cpu(hdr->length); /* end of buffer */ + if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n", + sizeof(*hdr), hlen, IO_BUFFER_LENGTH); + return -EFAULT; + } + + epos += hlen; /* end of buffer */ cpos += sizeof(*hdr); /* current position on buffer */ while (cpos < epos && supported) { @@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length, int total; ssize_t token_length; const u8 *pos; + u32 clen, plen, slen; if (!buf) return -EFAULT; @@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length, pos = buf; pos += sizeof(*hdr); - pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", -be32_to_cpu(hdr->cp.length), -be32_to_cpu(hdr->pkt.length), -be32_to_cpu(hdr->subpkt.length)); - - if (hdr->cp.length == 0 || hdr->pkt.length == 0 || - hdr->subpkt.length == 0) { - pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n", - be32_to_cpu(hdr->cp.length), - be32_to_cpu(hdr->pkt.length), - be32_to_cpu(hdr->subpkt.length)); + clen = be32_to_cpu(hdr->cp.length); + plen = be32_to_cpu(hdr->pkt.length); + slen = be32_to_cpu(hdr->subpkt.length); + pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n", +clen, plen, slen); + + if (clen == 0 || plen == 0 || slen == 0 || + slen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); print_buffer(pos, sizeof(*hdr)); return -EINVAL; } @@ -743,7 +750,7 @@ static int response_parse(const u8 *buf, size_t length, return -EFAULT; iter = resp->toks; - total = be32_to_cpu(hdr->subpkt.length); + total = slen; print_buffer(pos, total); while (total > 0) { if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */ -- 1.8.3.1
[PATCHv4 1/4] block/sed: Use ssize_t on atom parsers to return errors
The short atom parser can return an errno from decoding but does not currently return the error as a signed value. Convert all of the parsers to ssize_t. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> Reviewed-by: Scott Bauer <scott.ba...@intel.com> --- block/sed-opal.c | 28 ++-- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index e95b8a5..77623ad 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -635,8 +635,8 @@ static enum opal_token response_get_token(const struct parsed_resp *resp, return tok->pos[0]; } -static size_t response_parse_tiny(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_tiny(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -652,8 +652,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_short(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_short(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1; @@ -665,7 +665,7 @@ static size_t response_parse_short(struct opal_resp_tok *tok, tok->type = OPAL_DTA_TOKENID_SINT; } else { u64 u_integer = 0; - int i, b = 0; + ssize_t i, b = 0; tok->type = OPAL_DTA_TOKENID_UINT; if (tok->len > 9) { @@ -682,8 +682,8 @@ static size_t response_parse_short(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_medium(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_medium(struct opal_resp_tok *tok, +const u8 *pos) { tok->pos = pos; tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; @@ -699,8 +699,8 @@ static size_t response_parse_medium(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_long(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_long(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; @@ -716,8 +716,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_token(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_token(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -734,7 +734,7 @@ static int response_parse(const u8 *buf, size_t length, struct opal_resp_tok *iter; int num_entries = 0; int total; - size_t token_length; + ssize_t token_length; const u8 *pos; if (!buf) @@ -780,8 +780,8 @@ static int response_parse(const u8 *buf, size_t length, else /* TOKEN */ token_length = response_parse_token(iter, pos); - if (token_length == -EINVAL) - return -EINVAL; + if (token_length < 0) + return token_length; pos += token_length; total -= token_length; -- 1.8.3.1
[PATCHv4 3/4] block/sed: Check received header lengths
Add a buffer size check against discovery and response header lengths before we loop over their buffers. Signed-off-by: Jon Derrick Reviewed-by: Scott Bauer --- block/sed-opal.c | 35 +-- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 00673cf..383d5d6 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev) const struct d0_header *hdr = (struct d0_header *)dev->resp; const u8 *epos = dev->resp, *cpos = dev->resp; u16 comid = 0; + u32 hlen = be32_to_cpu(hdr->length); - print_buffer(dev->resp, be32_to_cpu(hdr->length)); + print_buffer(dev->resp, hlen); - epos += be32_to_cpu(hdr->length); /* end of buffer */ + if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n", + sizeof(*hdr), hlen, IO_BUFFER_LENGTH); + return -EFAULT; + } + + epos += hlen; /* end of buffer */ cpos += sizeof(*hdr); /* current position on buffer */ while (cpos < epos && supported) { @@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length, int total; ssize_t token_length; const u8 *pos; + u32 clen, plen, slen; if (!buf) return -EFAULT; @@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length, pos = buf; pos += sizeof(*hdr); - pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", -be32_to_cpu(hdr->cp.length), -be32_to_cpu(hdr->pkt.length), -be32_to_cpu(hdr->subpkt.length)); - - if (hdr->cp.length == 0 || hdr->pkt.length == 0 || - hdr->subpkt.length == 0) { - pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n", - be32_to_cpu(hdr->cp.length), - be32_to_cpu(hdr->pkt.length), - be32_to_cpu(hdr->subpkt.length)); + clen = be32_to_cpu(hdr->cp.length); + plen = be32_to_cpu(hdr->pkt.length); + slen = be32_to_cpu(hdr->subpkt.length); + pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n", +clen, plen, slen); + + if (clen == 0 || plen == 0 || slen == 0 || + slen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); print_buffer(pos, sizeof(*hdr)); return -EINVAL; } @@ -743,7 +750,7 @@ static int response_parse(const u8 *buf, size_t length, return -EFAULT; iter = resp->toks; - total = be32_to_cpu(hdr->subpkt.length); + total = slen; print_buffer(pos, total); while (total > 0) { if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */ -- 1.8.3.1
[PATCHv4 1/4] block/sed: Use ssize_t on atom parsers to return errors
The short atom parser can return an errno from decoding but does not currently return the error as a signed value. Convert all of the parsers to ssize_t. Signed-off-by: Jon Derrick Reviewed-by: Scott Bauer --- block/sed-opal.c | 28 ++-- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index e95b8a5..77623ad 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -635,8 +635,8 @@ static enum opal_token response_get_token(const struct parsed_resp *resp, return tok->pos[0]; } -static size_t response_parse_tiny(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_tiny(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -652,8 +652,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_short(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_short(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1; @@ -665,7 +665,7 @@ static size_t response_parse_short(struct opal_resp_tok *tok, tok->type = OPAL_DTA_TOKENID_SINT; } else { u64 u_integer = 0; - int i, b = 0; + ssize_t i, b = 0; tok->type = OPAL_DTA_TOKENID_UINT; if (tok->len > 9) { @@ -682,8 +682,8 @@ static size_t response_parse_short(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_medium(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_medium(struct opal_resp_tok *tok, +const u8 *pos) { tok->pos = pos; tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; @@ -699,8 +699,8 @@ static size_t response_parse_medium(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_long(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_long(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; @@ -716,8 +716,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_token(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_token(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -734,7 +734,7 @@ static int response_parse(const u8 *buf, size_t length, struct opal_resp_tok *iter; int num_entries = 0; int total; - size_t token_length; + ssize_t token_length; const u8 *pos; if (!buf) @@ -780,8 +780,8 @@ static int response_parse(const u8 *buf, size_t length, else /* TOKEN */ token_length = response_parse_token(iter, pos); - if (token_length == -EINVAL) - return -EINVAL; + if (token_length < 0) + return token_length; pos += token_length; total -= token_length; -- 1.8.3.1
[PATCHv4 4/4] MAINTAINERS: Remove powerpc's opal match
PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks like the 'arch/powerpc' file pattern should be enough to match powerpc opal code by itself. Remove the opal regex pattern from powerpc. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b983b25..430dd02 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7404,7 +7404,6 @@ F:drivers/pci/hotplug/pnv_php.c F: drivers/pci/hotplug/rpa* F: drivers/scsi/ibmvscsi/ F: tools/testing/selftests/powerpc -N: opal N: /pmac N: powermac N: powernv -- 1.8.3.1
[PATCHv4 4/4] MAINTAINERS: Remove powerpc's opal match
PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks like the 'arch/powerpc' file pattern should be enough to match powerpc opal code by itself. Remove the opal regex pattern from powerpc. Signed-off-by: Jon Derrick --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b983b25..430dd02 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7404,7 +7404,6 @@ F:drivers/pci/hotplug/pnv_php.c F: drivers/pci/hotplug/rpa* F: drivers/scsi/ibmvscsi/ F: tools/testing/selftests/powerpc -N: opal N: /pmac N: powermac N: powernv -- 1.8.3.1
[PATCHv4 0/4] OPAL patches
Just a couple of fixes for sed-opal to prevent faulty firmware from allowing us to go off in the weeds, and a helper to remove some duplicate code. v3->v4: uses IS_ERR since tok is embedded in the response buffer and cannot be NULL v2->v3: corrected the bad calculation on the response parser check and changed it to only check the subpacket length v1->v2: left tok->len as a size_t got everyone important on the same email thread Jon Derrick (4): block/sed: Use ssize_t on atom parsers to return errors block/sed: Add helper to qualify response tokens block/sed: Check received header lengths MAINTAINERS: Remove powerpc's opal match MAINTAINERS | 1 - block/sed-opal.c | 124 +++ 2 files changed, 60 insertions(+), 65 deletions(-) -- 1.8.3.1
[PATCHv4 0/4] OPAL patches
Just a couple of fixes for sed-opal to prevent faulty firmware from allowing us to go off in the weeds, and a helper to remove some duplicate code. v3->v4: uses IS_ERR since tok is embedded in the response buffer and cannot be NULL v2->v3: corrected the bad calculation on the response parser check and changed it to only check the subpacket length v1->v2: left tok->len as a size_t got everyone important on the same email thread Jon Derrick (4): block/sed: Use ssize_t on atom parsers to return errors block/sed: Add helper to qualify response tokens block/sed: Check received header lengths MAINTAINERS: Remove powerpc's opal match MAINTAINERS | 1 - block/sed-opal.c | 124 +++ 2 files changed, 60 insertions(+), 65 deletions(-) -- 1.8.3.1
[PATCHv3 0/4] OPAL patches
Just a couple of fixes for sed-opal to prevent faulty firmware from allowing us to go off in the weeds, and a helper to remove some duplicate code. v2->v3: corrected the bad calculation on the response parser check and changed it to only check the subpacket length v1->v2: left tok->len as a size_t got everyone important on the same email thread Jon Derrick (4): block/sed: Use ssize_t on atom parsers to return errors block/sed: Add helper to qualify response tokens block/sed: Check received header lengths MAINTAINERS: Remove powerpc's opal match MAINTAINERS | 1 - block/sed-opal.c | 124 +++ 2 files changed, 60 insertions(+), 65 deletions(-) -- 1.8.3.1
[PATCHv3 0/4] OPAL patches
Just a couple of fixes for sed-opal to prevent faulty firmware from allowing us to go off in the weeds, and a helper to remove some duplicate code. v2->v3: corrected the bad calculation on the response parser check and changed it to only check the subpacket length v1->v2: left tok->len as a size_t got everyone important on the same email thread Jon Derrick (4): block/sed: Use ssize_t on atom parsers to return errors block/sed: Add helper to qualify response tokens block/sed: Check received header lengths MAINTAINERS: Remove powerpc's opal match MAINTAINERS | 1 - block/sed-opal.c | 124 +++ 2 files changed, 60 insertions(+), 65 deletions(-) -- 1.8.3.1
[PATCHv3 3/4] block/sed: Check received header lengths
Add a buffer size check against discovery and response header lengths before we loop over their buffers. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- block/sed-opal.c | 35 +-- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index d6dd604..feba34b 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev) const struct d0_header *hdr = (struct d0_header *)dev->resp; const u8 *epos = dev->resp, *cpos = dev->resp; u16 comid = 0; + u32 hlen = be32_to_cpu(hdr->length); - print_buffer(dev->resp, be32_to_cpu(hdr->length)); + print_buffer(dev->resp, hlen); - epos += be32_to_cpu(hdr->length); /* end of buffer */ + if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n", + sizeof(*hdr), hlen, IO_BUFFER_LENGTH); + return -EFAULT; + } + + epos += hlen; /* end of buffer */ cpos += sizeof(*hdr); /* current position on buffer */ while (cpos < epos && supported) { @@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length, int total; ssize_t token_length; const u8 *pos; + u32 clen, plen, slen; if (!buf) return -EFAULT; @@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length, pos = buf; pos += sizeof(*hdr); - pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", -be32_to_cpu(hdr->cp.length), -be32_to_cpu(hdr->pkt.length), -be32_to_cpu(hdr->subpkt.length)); - - if (hdr->cp.length == 0 || hdr->pkt.length == 0 || - hdr->subpkt.length == 0) { - pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n", - be32_to_cpu(hdr->cp.length), - be32_to_cpu(hdr->pkt.length), - be32_to_cpu(hdr->subpkt.length)); + clen = be32_to_cpu(hdr->cp.length); + plen = be32_to_cpu(hdr->pkt.length); + slen = be32_to_cpu(hdr->subpkt.length); + pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n", +clen, plen, slen); + + if (clen == 0 || plen == 0 || slen == 0 || + slen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); print_buffer(pos, sizeof(*hdr)); return -EINVAL; } @@ -743,7 +750,7 @@ static int response_parse(const u8 *buf, size_t length, return -EFAULT; iter = resp->toks; - total = be32_to_cpu(hdr->subpkt.length); + total = slen; print_buffer(pos, total); while (total > 0) { if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */ -- 1.8.3.1
[PATCHv3 3/4] block/sed: Check received header lengths
Add a buffer size check against discovery and response header lengths before we loop over their buffers. Signed-off-by: Jon Derrick --- block/sed-opal.c | 35 +-- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index d6dd604..feba34b 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev) const struct d0_header *hdr = (struct d0_header *)dev->resp; const u8 *epos = dev->resp, *cpos = dev->resp; u16 comid = 0; + u32 hlen = be32_to_cpu(hdr->length); - print_buffer(dev->resp, be32_to_cpu(hdr->length)); + print_buffer(dev->resp, hlen); - epos += be32_to_cpu(hdr->length); /* end of buffer */ + if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n", + sizeof(*hdr), hlen, IO_BUFFER_LENGTH); + return -EFAULT; + } + + epos += hlen; /* end of buffer */ cpos += sizeof(*hdr); /* current position on buffer */ while (cpos < epos && supported) { @@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length, int total; ssize_t token_length; const u8 *pos; + u32 clen, plen, slen; if (!buf) return -EFAULT; @@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length, pos = buf; pos += sizeof(*hdr); - pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", -be32_to_cpu(hdr->cp.length), -be32_to_cpu(hdr->pkt.length), -be32_to_cpu(hdr->subpkt.length)); - - if (hdr->cp.length == 0 || hdr->pkt.length == 0 || - hdr->subpkt.length == 0) { - pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n", - be32_to_cpu(hdr->cp.length), - be32_to_cpu(hdr->pkt.length), - be32_to_cpu(hdr->subpkt.length)); + clen = be32_to_cpu(hdr->cp.length); + plen = be32_to_cpu(hdr->pkt.length); + slen = be32_to_cpu(hdr->subpkt.length); + pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n", +clen, plen, slen); + + if (clen == 0 || plen == 0 || slen == 0 || + slen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); print_buffer(pos, sizeof(*hdr)); return -EINVAL; } @@ -743,7 +750,7 @@ static int response_parse(const u8 *buf, size_t length, return -EFAULT; iter = resp->toks; - total = be32_to_cpu(hdr->subpkt.length); + total = slen; print_buffer(pos, total); while (total > 0) { if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */ -- 1.8.3.1
[PATCHv3 2/4] block/sed: Add helper to qualify response tokens
Add helper which verifies the response token is valid and matches the expected value. Merges token_type and response_get_token. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- block/sed-opal.c | 61 +++- 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 77623ad..d6dd604 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -591,48 +591,25 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) return 0; } -static enum opal_response_token token_type(const struct parsed_resp *resp, - int n) +static const struct opal_resp_tok *response_get_token( + const struct parsed_resp *resp, + int n) { const struct opal_resp_tok *tok; if (n >= resp->num) { pr_err("Token number doesn't exist: %d, resp: %d\n", n, resp->num); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } tok = >toks[n]; if (tok->len == 0) { pr_err("Token length must be non-zero\n"); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } - return tok->type; -} - -/* - * This function returns 0 in case of invalid token. One should call - * token_type() first to find out if the token is valid or not. - */ -static enum opal_token response_get_token(const struct parsed_resp *resp, - int n) -{ - const struct opal_resp_tok *tok; - - if (n >= resp->num) { - pr_err("Token number doesn't exist: %d, resp: %d\n", - n, resp->num); - return 0; - } - - tok = >toks[n]; - if (tok->len == 0) { - pr_err("Token length must be non-zero\n"); - return 0; - } - - return tok->pos[0]; + return tok; } static ssize_t response_parse_tiny(struct opal_resp_tok *tok, @@ -851,20 +828,32 @@ static u64 response_get_u64(const struct parsed_resp *resp, int n) return resp->toks[n].stored.u; } +static bool response_token_matches(const struct opal_resp_tok *token, u8 match) +{ + if (IS_ERR_OR_NULL(token) || + token->type != OPAL_DTA_TOKENID_TOKEN || + token->pos[0] != match) + return false; + return true; +} + static u8 response_status(const struct parsed_resp *resp) { - if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN && - response_get_token(resp, 0) == OPAL_ENDOFSESSION) { + const struct opal_resp_tok *tok; + + tok = response_get_token(resp, 0); + if (response_token_matches(tok, OPAL_ENDOFSESSION)) return 0; - } if (resp->num < 5) return DTAERROR_NO_METHOD_STATUS; - if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN || - token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN || - response_get_token(resp, resp->num - 1) != OPAL_ENDLIST || - response_get_token(resp, resp->num - 5) != OPAL_STARTLIST) + tok = response_get_token(resp, resp->num - 5); + if (!response_token_matches(tok, OPAL_STARTLIST)) + return DTAERROR_NO_METHOD_STATUS; + + tok = response_get_token(resp, resp->num - 1); + if (!response_token_matches(tok, OPAL_ENDLIST)) return DTAERROR_NO_METHOD_STATUS; return response_get_u64(resp, resp->num - 4); -- 1.8.3.1
[PATCHv3 2/4] block/sed: Add helper to qualify response tokens
Add helper which verifies the response token is valid and matches the expected value. Merges token_type and response_get_token. Signed-off-by: Jon Derrick --- block/sed-opal.c | 61 +++- 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 77623ad..d6dd604 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -591,48 +591,25 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) return 0; } -static enum opal_response_token token_type(const struct parsed_resp *resp, - int n) +static const struct opal_resp_tok *response_get_token( + const struct parsed_resp *resp, + int n) { const struct opal_resp_tok *tok; if (n >= resp->num) { pr_err("Token number doesn't exist: %d, resp: %d\n", n, resp->num); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } tok = >toks[n]; if (tok->len == 0) { pr_err("Token length must be non-zero\n"); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } - return tok->type; -} - -/* - * This function returns 0 in case of invalid token. One should call - * token_type() first to find out if the token is valid or not. - */ -static enum opal_token response_get_token(const struct parsed_resp *resp, - int n) -{ - const struct opal_resp_tok *tok; - - if (n >= resp->num) { - pr_err("Token number doesn't exist: %d, resp: %d\n", - n, resp->num); - return 0; - } - - tok = >toks[n]; - if (tok->len == 0) { - pr_err("Token length must be non-zero\n"); - return 0; - } - - return tok->pos[0]; + return tok; } static ssize_t response_parse_tiny(struct opal_resp_tok *tok, @@ -851,20 +828,32 @@ static u64 response_get_u64(const struct parsed_resp *resp, int n) return resp->toks[n].stored.u; } +static bool response_token_matches(const struct opal_resp_tok *token, u8 match) +{ + if (IS_ERR_OR_NULL(token) || + token->type != OPAL_DTA_TOKENID_TOKEN || + token->pos[0] != match) + return false; + return true; +} + static u8 response_status(const struct parsed_resp *resp) { - if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN && - response_get_token(resp, 0) == OPAL_ENDOFSESSION) { + const struct opal_resp_tok *tok; + + tok = response_get_token(resp, 0); + if (response_token_matches(tok, OPAL_ENDOFSESSION)) return 0; - } if (resp->num < 5) return DTAERROR_NO_METHOD_STATUS; - if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN || - token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN || - response_get_token(resp, resp->num - 1) != OPAL_ENDLIST || - response_get_token(resp, resp->num - 5) != OPAL_STARTLIST) + tok = response_get_token(resp, resp->num - 5); + if (!response_token_matches(tok, OPAL_STARTLIST)) + return DTAERROR_NO_METHOD_STATUS; + + tok = response_get_token(resp, resp->num - 1); + if (!response_token_matches(tok, OPAL_ENDLIST)) return DTAERROR_NO_METHOD_STATUS; return response_get_u64(resp, resp->num - 4); -- 1.8.3.1
[PATCHv3 4/4] MAINTAINERS: Remove powerpc's opal match
PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks like the 'arch/powerpc' file pattern should be enough to match powerpc opal code by itself. Remove the opal regex pattern from powerpc. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b983b25..430dd02 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7404,7 +7404,6 @@ F:drivers/pci/hotplug/pnv_php.c F: drivers/pci/hotplug/rpa* F: drivers/scsi/ibmvscsi/ F: tools/testing/selftests/powerpc -N: opal N: /pmac N: powermac N: powernv -- 1.8.3.1
[PATCHv3 4/4] MAINTAINERS: Remove powerpc's opal match
PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks like the 'arch/powerpc' file pattern should be enough to match powerpc opal code by itself. Remove the opal regex pattern from powerpc. Signed-off-by: Jon Derrick --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b983b25..430dd02 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7404,7 +7404,6 @@ F:drivers/pci/hotplug/pnv_php.c F: drivers/pci/hotplug/rpa* F: drivers/scsi/ibmvscsi/ F: tools/testing/selftests/powerpc -N: opal N: /pmac N: powermac N: powernv -- 1.8.3.1
[PATCHv3 1/4] block/sed: Use ssize_t on atom parsers to return errors
The short atom parser can return an errno from decoding but does not currently return the error as a signed value. Convert all of the parsers to ssize_t. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- block/sed-opal.c | 28 ++-- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index e95b8a5..77623ad 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -635,8 +635,8 @@ static enum opal_token response_get_token(const struct parsed_resp *resp, return tok->pos[0]; } -static size_t response_parse_tiny(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_tiny(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -652,8 +652,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_short(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_short(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1; @@ -665,7 +665,7 @@ static size_t response_parse_short(struct opal_resp_tok *tok, tok->type = OPAL_DTA_TOKENID_SINT; } else { u64 u_integer = 0; - int i, b = 0; + ssize_t i, b = 0; tok->type = OPAL_DTA_TOKENID_UINT; if (tok->len > 9) { @@ -682,8 +682,8 @@ static size_t response_parse_short(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_medium(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_medium(struct opal_resp_tok *tok, +const u8 *pos) { tok->pos = pos; tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; @@ -699,8 +699,8 @@ static size_t response_parse_medium(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_long(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_long(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; @@ -716,8 +716,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_token(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_token(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -734,7 +734,7 @@ static int response_parse(const u8 *buf, size_t length, struct opal_resp_tok *iter; int num_entries = 0; int total; - size_t token_length; + ssize_t token_length; const u8 *pos; if (!buf) @@ -780,8 +780,8 @@ static int response_parse(const u8 *buf, size_t length, else /* TOKEN */ token_length = response_parse_token(iter, pos); - if (token_length == -EINVAL) - return -EINVAL; + if (token_length < 0) + return token_length; pos += token_length; total -= token_length; -- 1.8.3.1
[PATCHv3 1/4] block/sed: Use ssize_t on atom parsers to return errors
The short atom parser can return an errno from decoding but does not currently return the error as a signed value. Convert all of the parsers to ssize_t. Signed-off-by: Jon Derrick --- block/sed-opal.c | 28 ++-- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index e95b8a5..77623ad 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -635,8 +635,8 @@ static enum opal_token response_get_token(const struct parsed_resp *resp, return tok->pos[0]; } -static size_t response_parse_tiny(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_tiny(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -652,8 +652,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_short(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_short(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1; @@ -665,7 +665,7 @@ static size_t response_parse_short(struct opal_resp_tok *tok, tok->type = OPAL_DTA_TOKENID_SINT; } else { u64 u_integer = 0; - int i, b = 0; + ssize_t i, b = 0; tok->type = OPAL_DTA_TOKENID_UINT; if (tok->len > 9) { @@ -682,8 +682,8 @@ static size_t response_parse_short(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_medium(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_medium(struct opal_resp_tok *tok, +const u8 *pos) { tok->pos = pos; tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; @@ -699,8 +699,8 @@ static size_t response_parse_medium(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_long(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_long(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; @@ -716,8 +716,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_token(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_token(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -734,7 +734,7 @@ static int response_parse(const u8 *buf, size_t length, struct opal_resp_tok *iter; int num_entries = 0; int total; - size_t token_length; + ssize_t token_length; const u8 *pos; if (!buf) @@ -780,8 +780,8 @@ static int response_parse(const u8 *buf, size_t length, else /* TOKEN */ token_length = response_parse_token(iter, pos); - if (token_length == -EINVAL) - return -EINVAL; + if (token_length < 0) + return token_length; pos += token_length; total -= token_length; -- 1.8.3.1
Re: [PATCH 3/4] block/sed: Check received header lengths
On 02/15/2017 11:15 AM, Scott Bauer wrote: > On Wed, Feb 15, 2017 at 12:38:53PM -0700, Jon Derrick wrote: >> Add a buffer size check against discovery and response header lengths >> before we loop over their buffers. >> >> Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> >> --- >> block/sed-opal.c | 35 +-- >> 1 file changed, 21 insertions(+), 14 deletions(-) >> >> diff --git a/block/sed-opal.c b/block/sed-opal.c >> index d6dd604..addf89e 100644 >> --- a/block/sed-opal.c >> +++ b/block/sed-opal.c >> @@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev) >> const struct d0_header *hdr = (struct d0_header *)dev->resp; >> const u8 *epos = dev->resp, *cpos = dev->resp; >> u16 comid = 0; >> +u32 hlen = be32_to_cpu(hdr->length); >> >> -print_buffer(dev->resp, be32_to_cpu(hdr->length)); >> +print_buffer(dev->resp, hlen); >> >> -epos += be32_to_cpu(hdr->length); /* end of buffer */ >> +if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { >> +pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n", >> +sizeof(*hdr), hlen, IO_BUFFER_LENGTH); >> +return -EFAULT; >> +} >> + >> +epos += hlen; /* end of buffer */ >> cpos += sizeof(*hdr); /* current position on buffer */ >> >> while (cpos < epos && supported) { >> @@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length, >> int total; >> ssize_t token_length; >> const u8 *pos; >> +u32 clen, plen, slen; >> >> if (!buf) >> return -EFAULT; >> @@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length, >> pos = buf; >> pos += sizeof(*hdr); >> >> -pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", >> - be32_to_cpu(hdr->cp.length), >> - be32_to_cpu(hdr->pkt.length), >> - be32_to_cpu(hdr->subpkt.length)); >> - >> -if (hdr->cp.length == 0 || hdr->pkt.length == 0 || >> -hdr->subpkt.length == 0) { >> -pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n", >> - be32_to_cpu(hdr->cp.length), >> - be32_to_cpu(hdr->pkt.length), >> - be32_to_cpu(hdr->subpkt.length)); >> +clen = be32_to_cpu(hdr->cp.length); >> +plen = be32_to_cpu(hdr->pkt.length); >> +slen = be32_to_cpu(hdr->subpkt.length); >> +pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n", >> + clen, plen, slen); >> + >> +if (clen == 0 || plen == 0 || slen == 0 || >> +(u64)clen + plen + slen > IO_BUFFER_LENGTH) { > > I think we're over calculating here. From my understanding of the spec: > TCG_Storage_Architecture_Core_Spec_v2.01_r1.00.pdf > 3.2.3 > ComPackets, Packets & Subpackets > > The Comp packet size should the size of the packet, and the subpackets. > The "packet" should be the size of all the subpackets. > And lastly the subpacket should be the size its payload. Yep I forgot that. Good catch! > > if ((u64) slen + sizeof(*hdr) > IO_BUFFER_LENGTH) > > Since we never need the com packet length or pkt length (we never use them > anywhere) I think the above check is okay. Let me know if i'm wrong though, > or you have a different understanding of the lengths. Agreed > > > > > > > >
Re: [PATCH 3/4] block/sed: Check received header lengths
On 02/15/2017 11:15 AM, Scott Bauer wrote: > On Wed, Feb 15, 2017 at 12:38:53PM -0700, Jon Derrick wrote: >> Add a buffer size check against discovery and response header lengths >> before we loop over their buffers. >> >> Signed-off-by: Jon Derrick >> --- >> block/sed-opal.c | 35 +-- >> 1 file changed, 21 insertions(+), 14 deletions(-) >> >> diff --git a/block/sed-opal.c b/block/sed-opal.c >> index d6dd604..addf89e 100644 >> --- a/block/sed-opal.c >> +++ b/block/sed-opal.c >> @@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev) >> const struct d0_header *hdr = (struct d0_header *)dev->resp; >> const u8 *epos = dev->resp, *cpos = dev->resp; >> u16 comid = 0; >> +u32 hlen = be32_to_cpu(hdr->length); >> >> -print_buffer(dev->resp, be32_to_cpu(hdr->length)); >> +print_buffer(dev->resp, hlen); >> >> -epos += be32_to_cpu(hdr->length); /* end of buffer */ >> +if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { >> +pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n", >> +sizeof(*hdr), hlen, IO_BUFFER_LENGTH); >> +return -EFAULT; >> +} >> + >> +epos += hlen; /* end of buffer */ >> cpos += sizeof(*hdr); /* current position on buffer */ >> >> while (cpos < epos && supported) { >> @@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length, >> int total; >> ssize_t token_length; >> const u8 *pos; >> +u32 clen, plen, slen; >> >> if (!buf) >> return -EFAULT; >> @@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length, >> pos = buf; >> pos += sizeof(*hdr); >> >> -pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", >> - be32_to_cpu(hdr->cp.length), >> - be32_to_cpu(hdr->pkt.length), >> - be32_to_cpu(hdr->subpkt.length)); >> - >> -if (hdr->cp.length == 0 || hdr->pkt.length == 0 || >> -hdr->subpkt.length == 0) { >> -pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n", >> - be32_to_cpu(hdr->cp.length), >> - be32_to_cpu(hdr->pkt.length), >> - be32_to_cpu(hdr->subpkt.length)); >> +clen = be32_to_cpu(hdr->cp.length); >> +plen = be32_to_cpu(hdr->pkt.length); >> +slen = be32_to_cpu(hdr->subpkt.length); >> +pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n", >> + clen, plen, slen); >> + >> +if (clen == 0 || plen == 0 || slen == 0 || >> +(u64)clen + plen + slen > IO_BUFFER_LENGTH) { > > I think we're over calculating here. From my understanding of the spec: > TCG_Storage_Architecture_Core_Spec_v2.01_r1.00.pdf > 3.2.3 > ComPackets, Packets & Subpackets > > The Comp packet size should the size of the packet, and the subpackets. > The "packet" should be the size of all the subpackets. > And lastly the subpacket should be the size its payload. Yep I forgot that. Good catch! > > if ((u64) slen + sizeof(*hdr) > IO_BUFFER_LENGTH) > > Since we never need the com packet length or pkt length (we never use them > anywhere) I think the above check is okay. Let me know if i'm wrong though, > or you have a different understanding of the lengths. Agreed > > > > > > > >
[PATCHv2 2/4] block/sed: Add helper to qualify response tokens
Add helper which verifies the response token is valid and matches the expected value. Merges token_type and response_get_token. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- block/sed-opal.c | 61 +++- 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 77623ad..d6dd604 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -591,48 +591,25 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) return 0; } -static enum opal_response_token token_type(const struct parsed_resp *resp, - int n) +static const struct opal_resp_tok *response_get_token( + const struct parsed_resp *resp, + int n) { const struct opal_resp_tok *tok; if (n >= resp->num) { pr_err("Token number doesn't exist: %d, resp: %d\n", n, resp->num); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } tok = >toks[n]; if (tok->len == 0) { pr_err("Token length must be non-zero\n"); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } - return tok->type; -} - -/* - * This function returns 0 in case of invalid token. One should call - * token_type() first to find out if the token is valid or not. - */ -static enum opal_token response_get_token(const struct parsed_resp *resp, - int n) -{ - const struct opal_resp_tok *tok; - - if (n >= resp->num) { - pr_err("Token number doesn't exist: %d, resp: %d\n", - n, resp->num); - return 0; - } - - tok = >toks[n]; - if (tok->len == 0) { - pr_err("Token length must be non-zero\n"); - return 0; - } - - return tok->pos[0]; + return tok; } static ssize_t response_parse_tiny(struct opal_resp_tok *tok, @@ -851,20 +828,32 @@ static u64 response_get_u64(const struct parsed_resp *resp, int n) return resp->toks[n].stored.u; } +static bool response_token_matches(const struct opal_resp_tok *token, u8 match) +{ + if (IS_ERR_OR_NULL(token) || + token->type != OPAL_DTA_TOKENID_TOKEN || + token->pos[0] != match) + return false; + return true; +} + static u8 response_status(const struct parsed_resp *resp) { - if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN && - response_get_token(resp, 0) == OPAL_ENDOFSESSION) { + const struct opal_resp_tok *tok; + + tok = response_get_token(resp, 0); + if (response_token_matches(tok, OPAL_ENDOFSESSION)) return 0; - } if (resp->num < 5) return DTAERROR_NO_METHOD_STATUS; - if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN || - token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN || - response_get_token(resp, resp->num - 1) != OPAL_ENDLIST || - response_get_token(resp, resp->num - 5) != OPAL_STARTLIST) + tok = response_get_token(resp, resp->num - 5); + if (!response_token_matches(tok, OPAL_STARTLIST)) + return DTAERROR_NO_METHOD_STATUS; + + tok = response_get_token(resp, resp->num - 1); + if (!response_token_matches(tok, OPAL_ENDLIST)) return DTAERROR_NO_METHOD_STATUS; return response_get_u64(resp, resp->num - 4); -- 1.8.3.1
[PATCHv2 2/4] block/sed: Add helper to qualify response tokens
Add helper which verifies the response token is valid and matches the expected value. Merges token_type and response_get_token. Signed-off-by: Jon Derrick --- block/sed-opal.c | 61 +++- 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 77623ad..d6dd604 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -591,48 +591,25 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) return 0; } -static enum opal_response_token token_type(const struct parsed_resp *resp, - int n) +static const struct opal_resp_tok *response_get_token( + const struct parsed_resp *resp, + int n) { const struct opal_resp_tok *tok; if (n >= resp->num) { pr_err("Token number doesn't exist: %d, resp: %d\n", n, resp->num); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } tok = >toks[n]; if (tok->len == 0) { pr_err("Token length must be non-zero\n"); - return OPAL_DTA_TOKENID_INVALID; + return ERR_PTR(-EINVAL); } - return tok->type; -} - -/* - * This function returns 0 in case of invalid token. One should call - * token_type() first to find out if the token is valid or not. - */ -static enum opal_token response_get_token(const struct parsed_resp *resp, - int n) -{ - const struct opal_resp_tok *tok; - - if (n >= resp->num) { - pr_err("Token number doesn't exist: %d, resp: %d\n", - n, resp->num); - return 0; - } - - tok = >toks[n]; - if (tok->len == 0) { - pr_err("Token length must be non-zero\n"); - return 0; - } - - return tok->pos[0]; + return tok; } static ssize_t response_parse_tiny(struct opal_resp_tok *tok, @@ -851,20 +828,32 @@ static u64 response_get_u64(const struct parsed_resp *resp, int n) return resp->toks[n].stored.u; } +static bool response_token_matches(const struct opal_resp_tok *token, u8 match) +{ + if (IS_ERR_OR_NULL(token) || + token->type != OPAL_DTA_TOKENID_TOKEN || + token->pos[0] != match) + return false; + return true; +} + static u8 response_status(const struct parsed_resp *resp) { - if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN && - response_get_token(resp, 0) == OPAL_ENDOFSESSION) { + const struct opal_resp_tok *tok; + + tok = response_get_token(resp, 0); + if (response_token_matches(tok, OPAL_ENDOFSESSION)) return 0; - } if (resp->num < 5) return DTAERROR_NO_METHOD_STATUS; - if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN || - token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN || - response_get_token(resp, resp->num - 1) != OPAL_ENDLIST || - response_get_token(resp, resp->num - 5) != OPAL_STARTLIST) + tok = response_get_token(resp, resp->num - 5); + if (!response_token_matches(tok, OPAL_STARTLIST)) + return DTAERROR_NO_METHOD_STATUS; + + tok = response_get_token(resp, resp->num - 1); + if (!response_token_matches(tok, OPAL_ENDLIST)) return DTAERROR_NO_METHOD_STATUS; return response_get_u64(resp, resp->num - 4); -- 1.8.3.1
[PATCHv2 1/4] block/sed: Use ssize_t on atom parsers to return errors
The short atom parser can return an errno from decoding but does not currently return the error as a signed value. Convert all of the parsers to ssize_t. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- block/sed-opal.c | 28 ++-- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index e95b8a5..77623ad 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -635,8 +635,8 @@ static enum opal_token response_get_token(const struct parsed_resp *resp, return tok->pos[0]; } -static size_t response_parse_tiny(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_tiny(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -652,8 +652,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_short(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_short(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1; @@ -665,7 +665,7 @@ static size_t response_parse_short(struct opal_resp_tok *tok, tok->type = OPAL_DTA_TOKENID_SINT; } else { u64 u_integer = 0; - int i, b = 0; + ssize_t i, b = 0; tok->type = OPAL_DTA_TOKENID_UINT; if (tok->len > 9) { @@ -682,8 +682,8 @@ static size_t response_parse_short(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_medium(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_medium(struct opal_resp_tok *tok, +const u8 *pos) { tok->pos = pos; tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; @@ -699,8 +699,8 @@ static size_t response_parse_medium(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_long(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_long(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; @@ -716,8 +716,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_token(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_token(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -734,7 +734,7 @@ static int response_parse(const u8 *buf, size_t length, struct opal_resp_tok *iter; int num_entries = 0; int total; - size_t token_length; + ssize_t token_length; const u8 *pos; if (!buf) @@ -780,8 +780,8 @@ static int response_parse(const u8 *buf, size_t length, else /* TOKEN */ token_length = response_parse_token(iter, pos); - if (token_length == -EINVAL) - return -EINVAL; + if (token_length < 0) + return token_length; pos += token_length; total -= token_length; -- 1.8.3.1
[PATCHv2 1/4] block/sed: Use ssize_t on atom parsers to return errors
The short atom parser can return an errno from decoding but does not currently return the error as a signed value. Convert all of the parsers to ssize_t. Signed-off-by: Jon Derrick --- block/sed-opal.c | 28 ++-- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index e95b8a5..77623ad 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -635,8 +635,8 @@ static enum opal_token response_get_token(const struct parsed_resp *resp, return tok->pos[0]; } -static size_t response_parse_tiny(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_tiny(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -652,8 +652,8 @@ static size_t response_parse_tiny(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_short(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_short(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1; @@ -665,7 +665,7 @@ static size_t response_parse_short(struct opal_resp_tok *tok, tok->type = OPAL_DTA_TOKENID_SINT; } else { u64 u_integer = 0; - int i, b = 0; + ssize_t i, b = 0; tok->type = OPAL_DTA_TOKENID_UINT; if (tok->len > 9) { @@ -682,8 +682,8 @@ static size_t response_parse_short(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_medium(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_medium(struct opal_resp_tok *tok, +const u8 *pos) { tok->pos = pos; tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; @@ -699,8 +699,8 @@ static size_t response_parse_medium(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_long(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_long(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; @@ -716,8 +716,8 @@ static size_t response_parse_long(struct opal_resp_tok *tok, return tok->len; } -static size_t response_parse_token(struct opal_resp_tok *tok, - const u8 *pos) +static ssize_t response_parse_token(struct opal_resp_tok *tok, + const u8 *pos) { tok->pos = pos; tok->len = 1; @@ -734,7 +734,7 @@ static int response_parse(const u8 *buf, size_t length, struct opal_resp_tok *iter; int num_entries = 0; int total; - size_t token_length; + ssize_t token_length; const u8 *pos; if (!buf) @@ -780,8 +780,8 @@ static int response_parse(const u8 *buf, size_t length, else /* TOKEN */ token_length = response_parse_token(iter, pos); - if (token_length == -EINVAL) - return -EINVAL; + if (token_length < 0) + return token_length; pos += token_length; total -= token_length; -- 1.8.3.1
[PATCHv2 0/4] OPAL patches
Just a couple of fixes for sed-opal to prevent faulty firmware from allowing us to go off in the weeds, and a helper to remove some duplicate code. v1->v2: left tok->len as a size_t got everyone important on the same email thread Jon Derrick (4): block/sed: Use ssize_t on atom parsers to return errors block/sed: Add helper to qualify response tokens block/sed: Check received header lengths MAINTAINERS: Remove powerpc's opal match MAINTAINERS | 1 - block/sed-opal.c | 124 +++ 2 files changed, 60 insertions(+), 65 deletions(-) -- 1.8.3.1
[PATCHv2 0/4] OPAL patches
Just a couple of fixes for sed-opal to prevent faulty firmware from allowing us to go off in the weeds, and a helper to remove some duplicate code. v1->v2: left tok->len as a size_t got everyone important on the same email thread Jon Derrick (4): block/sed: Use ssize_t on atom parsers to return errors block/sed: Add helper to qualify response tokens block/sed: Check received header lengths MAINTAINERS: Remove powerpc's opal match MAINTAINERS | 1 - block/sed-opal.c | 124 +++ 2 files changed, 60 insertions(+), 65 deletions(-) -- 1.8.3.1
[PATCHv2 3/4] block/sed: Check received header lengths
Add a buffer size check against discovery and response header lengths before we loop over their buffers. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- block/sed-opal.c | 35 +-- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index d6dd604..addf89e 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -340,10 +340,17 @@ static int opal_discovery0_end(struct opal_dev *dev) const struct d0_header *hdr = (struct d0_header *)dev->resp; const u8 *epos = dev->resp, *cpos = dev->resp; u16 comid = 0; + u32 hlen = be32_to_cpu(hdr->length); - print_buffer(dev->resp, be32_to_cpu(hdr->length)); + print_buffer(dev->resp, hlen); - epos += be32_to_cpu(hdr->length); /* end of buffer */ + if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n", + sizeof(*hdr), hlen, IO_BUFFER_LENGTH); + return -EFAULT; + } + + epos += hlen; /* end of buffer */ cpos += sizeof(*hdr); /* current position on buffer */ while (cpos < epos && supported) { @@ -713,6 +720,7 @@ static int response_parse(const u8 *buf, size_t length, int total; ssize_t token_length; const u8 *pos; + u32 clen, plen, slen; if (!buf) return -EFAULT; @@ -724,17 +732,16 @@ static int response_parse(const u8 *buf, size_t length, pos = buf; pos += sizeof(*hdr); - pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n", -be32_to_cpu(hdr->cp.length), -be32_to_cpu(hdr->pkt.length), -be32_to_cpu(hdr->subpkt.length)); - - if (hdr->cp.length == 0 || hdr->pkt.length == 0 || - hdr->subpkt.length == 0) { - pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n", - be32_to_cpu(hdr->cp.length), - be32_to_cpu(hdr->pkt.length), - be32_to_cpu(hdr->subpkt.length)); + clen = be32_to_cpu(hdr->cp.length); + plen = be32_to_cpu(hdr->pkt.length); + slen = be32_to_cpu(hdr->subpkt.length); + pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n", +clen, plen, slen); + + if (clen == 0 || plen == 0 || slen == 0 || + (u64)clen + plen + slen > IO_BUFFER_LENGTH) { + pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); print_buffer(pos, sizeof(*hdr)); return -EINVAL; } @@ -743,7 +750,7 @@ static int response_parse(const u8 *buf, size_t length, return -EFAULT; iter = resp->toks; - total = be32_to_cpu(hdr->subpkt.length); + total = slen; print_buffer(pos, total); while (total > 0) { if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */ -- 1.8.3.1
[PATCHv2 4/4] MAINTAINERS: Remove powerpc's opal match
PPC's 'opal' match pattern also matches block/sed-opal.c, where it looks like the 'arch/powerpc' file pattern should be enough to match powerpc opal code by itself. Remove the opal regex pattern from powerpc. Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b983b25..430dd02 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7404,7 +7404,6 @@ F:drivers/pci/hotplug/pnv_php.c F: drivers/pci/hotplug/rpa* F: drivers/scsi/ibmvscsi/ F: tools/testing/selftests/powerpc -N: opal N: /pmac N: powermac N: powernv -- 1.8.3.1