Nilay reported that since commit daaa574aba6f ("powerpc/pseries/msi: Switch
to msi_create_parent_irq_domain()"), the NVMe driver cannot enable MSI-X
when the device's MSI-X table size is larger than the firmware's MSI quota
for the device.
This is because the commit changes how rtas_prepare_msi_irqs() is called:
- Before, it is called when interrupts are allocated at the global
interrupt domain with nvec_in being the number of allocated interrupts.
rtas_prepare_msi_irqs() can return a positive number and the allocation
will be retried.
- Now, it is called at the creation of per-device interrupt domain with
nvec_in being the number of interrupts that the device supports. If
rtas_prepare_msi_irqs() returns positive, domain creation just fails.
For Nilay's NVMe driver case, rtas_prepare_msi_irqs() returns a positive
number (the quota). This causes per-device interrupt domain creation to
fail and thus the NVMe driver cannot enable MSI-X.
Rework to make this scenario works again:
- pseries_msi_ops_prepare() only prepares as many interrupts as the quota
permit.
- pseries_irq_domain_alloc() fails if the device's quota is exceeded.
Now, if the quota is exceeded, pseries_msi_ops_prepare() will only prepare
as allowed by the quota. If device drivers attempt to allocate more
interrupts than the quota permits, pseries_irq_domain_alloc() will return
an error code and msi_handle_pci_fail() will allow device drivers a retry.
Reported-by: Nilay Shroff <[email protected]>
Closes:
https://lore.kernel.org/linuxppc-dev/[email protected]/
Fixes: daaa574aba6f ("powerpc/pseries/msi: Switch to
msi_create_parent_irq_domain()")
Signed-off-by: Nam Cao <[email protected]>
Acked-by: Nilay Shroff <[email protected]>
Cc: [email protected]
---
arch/powerpc/platforms/pseries/msi.c | 42 ++++++++++++++++++++++++++--
1 file changed, 39 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/msi.c
b/arch/powerpc/platforms/pseries/msi.c
index a82aaa786e9e..8898a968a59b 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -19,6 +19,11 @@
#include "pseries.h"
+struct pseries_msi_device {
+ unsigned int msi_quota;
+ unsigned int msi_used;
+};
+
static int query_token, change_token;
#define RTAS_QUERY_FN 0
@@ -433,8 +438,26 @@ static int pseries_msi_ops_prepare(struct irq_domain
*domain, struct device *dev
struct msi_domain_info *info = domain->host_data;
struct pci_dev *pdev = to_pci_dev(dev);
int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX :
PCI_CAP_ID_MSI;
+ int ret;
+
+ struct pseries_msi_device *pseries_dev __free(kfree)
+ = kmalloc(sizeof(*pseries_dev), GFP_KERNEL);
+ if (!pseries_dev)
+ return -ENOMEM;
+
+ ret = rtas_prepare_msi_irqs(pdev, nvec, type, arg);
+ if (ret > 0) {
+ nvec = ret;
+ ret = rtas_prepare_msi_irqs(pdev, nvec, type, arg);
+ }
+ if (ret < 0)
+ return ret;
- return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
+ pseries_dev->msi_quota = nvec;
+ pseries_dev->msi_used = 0;
+
+ arg->scratchpad[0].ptr = no_free_ptr(pseries_dev);
+ return 0;
}
/*
@@ -443,9 +466,13 @@ static int pseries_msi_ops_prepare(struct irq_domain
*domain, struct device *dev
*/
static void pseries_msi_ops_teardown(struct irq_domain *domain,
msi_alloc_info_t *arg)
{
+ struct pseries_msi_device *pseries_dev = arg->scratchpad[0].ptr;
struct pci_dev *pdev = to_pci_dev(domain->dev);
rtas_disable_msi(pdev);
+
+ WARN_ON(pseries_dev->msi_used);
+ kfree(pseries_dev);
}
static void pseries_msi_shutdown(struct irq_data *d)
@@ -546,12 +573,18 @@ static int pseries_irq_domain_alloc(struct irq_domain
*domain, unsigned int virq
unsigned int nr_irqs, void *arg)
{
struct pci_controller *phb = domain->host_data;
+ struct pseries_msi_device *pseries_dev;
msi_alloc_info_t *info = arg;
struct msi_desc *desc = info->desc;
struct pci_dev *pdev = msi_desc_to_pci_dev(desc);
int hwirq;
int i, ret;
+ pseries_dev = info->scratchpad[0].ptr;
+
+ if (pseries_dev->msi_used + nr_irqs > pseries_dev->msi_quota)
+ return -ENOSPC;
+
hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_index);
if (hwirq < 0) {
dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq);
@@ -567,9 +600,10 @@ static int pseries_irq_domain_alloc(struct irq_domain
*domain, unsigned int virq
goto out;
irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
- &pseries_msi_irq_chip,
domain->host_data);
+ &pseries_msi_irq_chip,
pseries_dev);
}
+ pseries_dev->msi_used++;
return 0;
out:
@@ -582,9 +616,11 @@ static void pseries_irq_domain_free(struct irq_domain
*domain, unsigned int virq
unsigned int nr_irqs)
{
struct irq_data *d = irq_domain_get_irq_data(domain, virq);
- struct pci_controller *phb = irq_data_get_irq_chip_data(d);
+ struct pseries_msi_device *pseries_dev = irq_data_get_irq_chip_data(d);
+ struct pci_controller *phb = domain->host_data;
pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
+ pseries_dev->msi_used -= nr_irqs;
irq_domain_free_irqs_parent(domain, virq, nr_irqs);
}
--
2.47.3