On 1/7/26 3:32 PM, Nam Cao wrote:
> Nilay reported that since commit daaa574aba6f ("powerpc/pseries/msi: Switch
> to msi_create_parent_irq_domain()"), the NVMe driver cannot enable MSI-X
> when the device's MSI-X table size is larger than the firmware's MSI quota
> for the device.
> 
> This is because the commit changes how rtas_prepare_msi_irqs() is called:
> 
>   - Before, it is called when interrupts are allocated at the global
>     interrupt domain with nvec_in being the number of allocated interrupts.
>     rtas_prepare_msi_irqs() can return a positive number and the allocation
>     will be retried.
> 
>   - Now, it is called at the creation of per-device interrupt domain with
>     nvec_in being the number of interrupts that the device supports. If
>     rtas_prepare_msi_irqs() returns positive, domain creation just fails.
> 
> For Nilay's NVMe driver case, rtas_prepare_msi_irqs() returns a positive
> number (the quota). This causes per-device interrupt domain creation to
> fail and thus the NVMe driver cannot enable MSI-X.
> 
> Rework to make this scenario works again:
> 
>   - pseries_msi_ops_prepare() only prepares as many interrupts as the quota
>     permit.
> 
>   - pseries_irq_domain_alloc() fails if the device's quota is exceeded.
> 
> Now, if the quota is exceeded, pseries_msi_ops_prepare() will only prepare
> as allowed by the quota. If device drivers attempt to allocate more
> interrupts than the quota permits, pseries_irq_domain_alloc() will return
> an error code and msi_handle_pci_fail() will allow device drivers a retry.
> 
> Reported-by: Nilay Shroff <[email protected]>
> Closes: 
> https://lore.kernel.org/linuxppc-dev/[email protected]/
> Fixes: daaa574aba6f ("powerpc/pseries/msi: Switch to 
> msi_create_parent_irq_domain()")
> Signed-off-by: Nam Cao <[email protected]>
> Acked-by: Nilay Shroff <[email protected]>
> Cc: [email protected]
> ---
> v2:
>   - change pseries_msi_ops_prepare()'s allocation logic to match the
>     original logic in __pci_enable_msix_range()
> 
>   - fix up Nilay's email address
> ---
>  arch/powerpc/platforms/pseries/msi.c | 44 ++++++++++++++++++++++++++--
>  1 file changed, 41 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/msi.c 
> b/arch/powerpc/platforms/pseries/msi.c
> index a82aaa786e9e..edc30cda5dbc 100644
> --- a/arch/powerpc/platforms/pseries/msi.c
> +++ b/arch/powerpc/platforms/pseries/msi.c
> @@ -19,6 +19,11 @@
>  
>  #include "pseries.h"
>  
> +struct pseries_msi_device {
> +     unsigned int msi_quota;
> +     unsigned int msi_used;
> +};
> +
>  static int query_token, change_token;
>  
>  #define RTAS_QUERY_FN                0
> @@ -433,8 +438,28 @@ static int pseries_msi_ops_prepare(struct irq_domain 
> *domain, struct device *dev
>       struct msi_domain_info *info = domain->host_data;
>       struct pci_dev *pdev = to_pci_dev(dev);
>       int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX : 
> PCI_CAP_ID_MSI;
> +     int ret;
> +
> +     struct pseries_msi_device *pseries_dev __free(kfree)
> +             = kmalloc(sizeof(*pseries_dev), GFP_KERNEL);
> +     if (!pseries_dev)
> +             return -ENOMEM;
> +
> +     while (1) {
> +             ret = rtas_prepare_msi_irqs(pdev, nvec, type, arg);
> +             if (!ret)
> +                     break;
> +             else if (ret > 0)
> +                     nvec = ret;
> +             else
> +                     return ret;
> +     }
>  
> -     return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
> +     pseries_dev->msi_quota = nvec;
> +     pseries_dev->msi_used = 0;
> +
> +     arg->scratchpad[0].ptr = no_free_ptr(pseries_dev);
> +     return 0;
>  }
>  
>  /*
> @@ -443,9 +468,13 @@ static int pseries_msi_ops_prepare(struct irq_domain 
> *domain, struct device *dev
>   */
>  static void pseries_msi_ops_teardown(struct irq_domain *domain, 
> msi_alloc_info_t *arg)
>  {
> +     struct pseries_msi_device *pseries_dev = arg->scratchpad[0].ptr;
>       struct pci_dev *pdev = to_pci_dev(domain->dev);
>  
>       rtas_disable_msi(pdev);
> +
> +     WARN_ON(pseries_dev->msi_used);
> +     kfree(pseries_dev);
>  }
>  
>  static void pseries_msi_shutdown(struct irq_data *d)
> @@ -546,12 +575,18 @@ static int pseries_irq_domain_alloc(struct irq_domain 
> *domain, unsigned int virq
>                                   unsigned int nr_irqs, void *arg)
>  {
>       struct pci_controller *phb = domain->host_data;
> +     struct pseries_msi_device *pseries_dev;
>       msi_alloc_info_t *info = arg;
>       struct msi_desc *desc = info->desc;
>       struct pci_dev *pdev = msi_desc_to_pci_dev(desc);
>       int hwirq;
>       int i, ret;
>  
> +     pseries_dev = info->scratchpad[0].ptr;
> +
> +     if (pseries_dev->msi_used + nr_irqs > pseries_dev->msi_quota)
> +             return -ENOSPC;
> +
>       hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_index);
>       if (hwirq < 0) {
>               dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq);
> @@ -567,9 +602,10 @@ static int pseries_irq_domain_alloc(struct irq_domain 
> *domain, unsigned int virq
>                       goto out;
>  
>               irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
> -                                           &pseries_msi_irq_chip, 
> domain->host_data);
> +                                           &pseries_msi_irq_chip, 
> pseries_dev);
>       }
>  
> +     pseries_dev->msi_used++;
>       return 0;
>  
>  out:
> @@ -582,9 +618,11 @@ static void pseries_irq_domain_free(struct irq_domain 
> *domain, unsigned int virq
>                                   unsigned int nr_irqs)
>  {
>       struct irq_data *d = irq_domain_get_irq_data(domain, virq);
> -     struct pci_controller *phb = irq_data_get_irq_chip_data(d);
> +     struct pseries_msi_device *pseries_dev = irq_data_get_irq_chip_data(d);
> +     struct pci_controller *phb = domain->host_data;
>  
>       pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
> +     pseries_dev->msi_used -= nr_irqs;
>       irq_domain_free_irqs_parent(domain, virq, nr_irqs);
>  }
>  

I just tested this change on my system using the latest mainline kernel and it 
works
well for me. So with that, please fell free to add,

Tested-by: Nilay Shroff <[email protected]>


Reply via email to