[PATCH 0/2] KVM: PPC: Book3S HV: XIVE: Improve guest entries and exits

2021-07-20 Thread Cédric Le Goater
The XIVE interrupt controller on P10 can automatically save and
restore the state of the interrupt registers under the internal NVP
structure representing the VCPU. This saves a costly store/load in
guest entries and exits.

Thanks,

C. 


Cédric Le Goater (2):
  KVM: PPC: Book3S HV: XIVE: Add a 'flags' field
  KVM: PPC: Book3S HV: XIVE: Add support for automatic save-restore

 arch/powerpc/include/asm/xive-regs.h  |  3 ++
 arch/powerpc/include/asm/xive.h   |  1 +
 arch/powerpc/kvm/book3s_xive.h| 11 +-
 arch/powerpc/kvm/book3s_xive.c| 53 +--
 arch/powerpc/kvm/book3s_xive_native.c | 21 ---
 arch/powerpc/sysdev/xive/native.c | 10 +
 6 files changed, 82 insertions(+), 17 deletions(-)

-- 
2.31.1



[PATCH 1/2] KVM: PPC: Book3S HV: XIVE: Add a 'flags' field

2021-07-20 Thread Cédric Le Goater
Use it to hold platform specific features. P9 DD2 introduced
single-escalation support. P10 will add others.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/kvm/book3s_xive.h|  9 -
 arch/powerpc/kvm/book3s_xive.c| 19 ++-
 arch/powerpc/kvm/book3s_xive_native.c | 12 +++-
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
index afe9eeac6d56..73c3cd25093c 100644
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -97,6 +97,8 @@ struct kvmppc_xive_ops {
int (*reset_mapped)(struct kvm *kvm, unsigned long guest_irq);
 };
 
+#define KVMPPC_XIVE_FLAG_SINGLE_ESCALATION 0x1
+
 struct kvmppc_xive {
struct kvm *kvm;
struct kvm_device *dev;
@@ -133,7 +135,7 @@ struct kvmppc_xive {
u32 q_page_order;
 
/* Flags */
-   u8  single_escalation;
+   u8  flags;
 
/* Number of entries in the VP block */
u32 nr_servers;
@@ -308,5 +310,10 @@ void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
 int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp);
 int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr);
 
+static inline bool kvmppc_xive_has_single_escalation(struct kvmppc_xive *xive)
+{
+   return xive->flags & KVMPPC_XIVE_FLAG_SINGLE_ESCALATION;
+}
+
 #endif /* CONFIG_KVM_XICS */
 #endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 8cfab3547494..12f101d74b48 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -363,9 +363,9 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio)
if (!vcpu->arch.xive_vcpu)
continue;
rc = xive_provision_queue(vcpu, prio);
-   if (rc == 0 && !xive->single_escalation)
+   if (rc == 0 && !kvmppc_xive_has_single_escalation(xive))
kvmppc_xive_attach_escalation(vcpu, prio,
- xive->single_escalation);
+ 
kvmppc_xive_has_single_escalation(xive));
if (rc)
return rc;
}
@@ -1199,7 +1199,7 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
/* Free escalations */
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
if (xc->esc_virq[i]) {
-   if (xc->xive->single_escalation)
+   if (kvmppc_xive_has_single_escalation(xc->xive))
xive_cleanup_single_escalation(vcpu, xc,
xc->esc_virq[i]);
free_irq(xc->esc_virq[i], vcpu);
@@ -1340,7 +1340,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
 * Enable the VP first as the single escalation mode will
 * affect escalation interrupts numbering
 */
-   r = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
+   r = xive_native_enable_vp(xc->vp_id, 
kvmppc_xive_has_single_escalation(xive));
if (r) {
pr_err("Failed to enable VP in OPAL, err %d\n", r);
goto bail;
@@ -1357,15 +1357,15 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
struct xive_q *q = &xc->queues[i];
 
/* Single escalation, no queue 7 */
-   if (i == 7 && xive->single_escalation)
+   if (i == 7 && kvmppc_xive_has_single_escalation(xive))
break;
 
/* Is queue already enabled ? Provision it */
if (xive->qmap & (1 << i)) {
r = xive_provision_queue(vcpu, i);
-   if (r == 0 && !xive->single_escalation)
+   if (r == 0 && !kvmppc_xive_has_single_escalation(xive))
kvmppc_xive_attach_escalation(
-   vcpu, i, xive->single_escalation);
+   vcpu, i, 
kvmppc_xive_has_single_escalation(xive));
if (r)
goto bail;
} else {
@@ -1380,7 +1380,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
}
 
/* If not done above, attach priority 0 escalation */
-   r = kvmppc_xive_attach_escalation(vcpu, 0, xive->single_escalation);
+   r = kvmppc_xive_attach_escalation(vcpu, 0, 
kvmppc_xive_has_single_escalation(xive));
if (r)
goto bail;
 
@@ -2135,7 +2135,8 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 
type)
 */
xive->nr_servers = KVM_MAX_VCPUS;
 
-   xive

[PATCH 2/2] KVM: PPC: Book3S HV: XIVE: Add support for automatic save-restore

2021-07-20 Thread Cédric Le Goater
On P10, the feature doing an automatic "save & restore" of a VCPU
interrupt context is set by default in OPAL. When a VP context is
pulled out, the state of the interrupt registers is saved by the XIVE
interrupt controller under the internal NVP structure representing the
VP. This saves a costly store/load in guest entries and exits.

If OPAL advertises the "save & restore" feature in the device tree,
it should also have set the 'H' bit in the CAM line. Check that when
vCPUs are connected to their ICP in KVM before going any further.

Cc: Nicholas Piggin 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/xive-regs.h  |  3 +++
 arch/powerpc/include/asm/xive.h   |  1 +
 arch/powerpc/kvm/book3s_xive.h|  2 ++
 arch/powerpc/kvm/book3s_xive.c| 34 +--
 arch/powerpc/kvm/book3s_xive_native.c |  9 +++
 arch/powerpc/sysdev/xive/native.c | 10 
 6 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/xive-regs.h 
b/arch/powerpc/include/asm/xive-regs.h
index 8b211faa0e42..cf8bb6ac4463 100644
--- a/arch/powerpc/include/asm/xive-regs.h
+++ b/arch/powerpc/include/asm/xive-regs.h
@@ -80,10 +80,13 @@
 #define   TM_QW0W2_VU  PPC_BIT32(0)
 #define   TM_QW0W2_LOGIC_SERV  PPC_BITMASK32(1,31) // XX 2,31 ?
 #define   TM_QW1W2_VO  PPC_BIT32(0)
+#define   TM_QW1W2_HO   PPC_BIT32(1) /* P10 XIVE2 */
 #define   TM_QW1W2_OS_CAM  PPC_BITMASK32(8,31)
 #define   TM_QW2W2_VP  PPC_BIT32(0)
+#define   TM_QW2W2_HP   PPC_BIT32(1) /* P10 XIVE2 */
 #define   TM_QW2W2_POOL_CAMPPC_BITMASK32(8,31)
 #define   TM_QW3W2_VT  PPC_BIT32(0)
+#define   TM_QW3W2_HT   PPC_BIT32(1) /* P10 XIVE2 */
 #define   TM_QW3W2_LP  PPC_BIT32(6)
 #define   TM_QW3W2_LE  PPC_BIT32(7)
 #define   TM_QW3W2_T   PPC_BIT32(31)
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index aa094a8655b0..efb0f5effcc6 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -125,6 +125,7 @@ int xive_native_enable_vp(u32 vp_id, bool 
single_escalation);
 int xive_native_disable_vp(u32 vp_id);
 int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id);
 bool xive_native_has_single_escalation(void);
+bool xive_native_has_save_restore(void);
 
 int xive_native_get_queue_info(u32 vp_id, uint32_t prio,
   u64 *out_qpage,
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
index 73c3cd25093c..e6a9651c6f1e 100644
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -98,6 +98,7 @@ struct kvmppc_xive_ops {
 };
 
 #define KVMPPC_XIVE_FLAG_SINGLE_ESCALATION 0x1
+#define KVMPPC_XIVE_FLAG_SAVE_RESTORE 0x2
 
 struct kvmppc_xive {
struct kvm *kvm;
@@ -309,6 +310,7 @@ void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
struct kvmppc_xive_vcpu *xc, int irq);
 int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp);
 int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr);
+bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu);
 
 static inline bool kvmppc_xive_has_single_escalation(struct kvmppc_xive *xive)
 {
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 12f101d74b48..cc5bee49bd63 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -59,6 +59,25 @@
  */
 #define XIVE_Q_GAP 2
 
+static bool kvmppc_xive_vcpu_has_save_restore(struct kvm_vcpu *vcpu)
+{
+   struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+
+   /* Check enablement at VP level */
+   return xc->vp_cam & TM_QW1W2_HO;
+}
+
+bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu)
+{
+   struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+   struct kvmppc_xive *xive = xc->xive;
+
+   if (xive->flags & KVMPPC_XIVE_FLAG_SAVE_RESTORE)
+   return kvmppc_xive_vcpu_has_save_restore(vcpu);
+
+   return true;
+}
+
 /*
  * Push a vcpu's context to the XIVE on guest entry.
  * This assumes we are in virtual mode (MMU on)
@@ -77,7 +96,8 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
return;
 
eieio();
-   __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
+   if (!kvmppc_xive_vcpu_has_save_restore(vcpu))
+   __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
__raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
vcpu->arch.xive_pushed = 1;
eieio();
@@ -149,7 +169,8 @@ void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
/* First load to pull the context, we ignore the value */
__raw_readl(tima + TM_SPC_PULL_OS_CTX);
/* Second load to recover the context state (Words 0 and 1) */
-  

[PATCH] powerpc: use IRQF_NO_DEBUG for IPIs

2021-07-19 Thread Cédric Le Goater
There is no need to use the lockup detector ("noirqdebug") for IPIs.
The ipistorm benchmark measures a ~10% improvement on high-end systems
when this flag is set.

Cc: Thomas Gleixner 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xics/xics-common.c | 2 +-
 arch/powerpc/sysdev/xive/common.c  | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/xics/xics-common.c 
b/arch/powerpc/sysdev/xics/xics-common.c
index b14c502e56a8..18174ccefbc0 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -133,7 +133,7 @@ static void xics_request_ipi(void)
 * IPIs are marked IRQF_PERCPU. The handler was set in map.
 */
BUG_ON(request_irq(ipi, icp_ops->ipi_action,
-  IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
+  IRQF_NO_DEBUG | IRQF_PERCPU | IRQF_NO_THREAD, "IPI", 
NULL));
 }
 
 void __init xics_smp_probe(void)
diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index dbdbbc2f1dc5..9ab44d069704 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1161,7 +1161,8 @@ static int __init xive_request_ipi(void)
snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
 
ret = request_irq(xid->irq, xive_muxed_ipi_action,
- IRQF_PERCPU | IRQF_NO_THREAD, xid->name, 
NULL);
+ IRQF_NO_DEBUG | IRQF_PERCPU | IRQF_NO_THREAD,
+ xid->name, NULL);
 
WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret);
}
-- 
2.31.1



[PATCH] powerpc/xive: Fix error handling when allocating an IPI

2021-07-01 Thread Cédric Le Goater
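This is a smatch warning:

  arch/powerpc/sysdev/xive/common.c:1161 xive_request_ipi() warn: unsigned 'xid->irq' is never less than zero.

Fix it by storing the result of irq_domain_alloc_irqs() in the signed
'ret' variable and checking that for an error before assigning it to
'xid->irq'.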

Fixes: fd6db2892eba ("powerpc/xive: Modernize XIVE-IPI domain with an 'alloc' 
handler")
Cc: stable@vger.kernel.org # v5.13
Reported-by: kernel test robot 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index f8ff558bc305..7bbb9bc83057 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1148,11 +1148,10 @@ static int __init xive_request_ipi(void)
 * Since the HW interrupt number doesn't have any meaning,
 * simply use the node number.
 */
-   xid->irq = irq_domain_alloc_irqs(ipi_domain, 1, node, &info);
-   if (xid->irq < 0) {
-   ret = xid->irq;
+   ret = irq_domain_alloc_irqs(ipi_domain, 1, node, &info);
+   if (ret < 0)
goto out_free_xive_ipis;
-   }
+   xid->irq = ret;
 
snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
 
-- 
2.31.1



[PATCH v2 22/32] powerpc/pci: Drop XIVE restriction on MSI domains

2021-07-01 Thread Cédric Le Goater
The PowerNV and pSeries platforms now have support for both the XICS
and XIVE IRQ domains.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 4 +---
 arch/powerpc/platforms/pseries/msi.c  | 4 
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index b498876a976f..e2454439e574 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2332,9 +2332,7 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
count, phb->msi_base);
 
-   /* Only supported by the XIVE driver */
-   if (xive_enabled())
-   pnv_msi_allocate_domains(phb->hose, count);
+   pnv_msi_allocate_domains(phb->hose, count);
 }
 
 static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index e2127a3f7ebd..e196cc1b8540 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -720,10 +720,6 @@ int pseries_msi_allocate_domains(struct pci_controller 
*phb)
 {
int count;
 
-   /* Only supported by the XIVE driver */
-   if (!xive_enabled())
-   return -ENODEV;
-
if (!__find_pe_total_msi(phb->dn, &count)) {
pr_err("PCI: failed to find MSIs for bridge %pOF (domain %d)\n",
   phb->dn, phb->global_number);
-- 
2.31.1



[PATCH v2 30/32] KVM: PPC: Book3S HV: XICS: Fix mapping of passthrough interrupts

2021-07-01 Thread Cédric Le Goater
PCI MSIs now live in an MSI domain but the underlying calls, which
will EOI the interrupt in real mode, need an HW IRQ number mapped in
the XICS IRQ domain. Grab it there.

Cc: Alexey Kardashevskiy 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/kvm/book3s_hv.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 965178aeff13..1afbe91c6ca1 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5233,6 +5233,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
struct kvmppc_passthru_irqmap *pimap;
struct irq_chip *chip;
int i, rc = 0;
+   struct irq_data *host_data;
 
if (!kvm_irq_bypass)
return 1;
@@ -5297,7 +5298,14 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
 * the KVM real mode handler.
 */
smp_wmb();
-   irq_map->r_hwirq = desc->irq_data.hwirq;
+
+   /*
+* The 'host_irq' number is mapped in the PCI-MSI domain but
+* the underlying calls, which will EOI the interrupt in real
+* mode, need an HW IRQ number mapped in the XICS IRQ domain.
+*/
+   host_data = irq_domain_get_irq_data(irq_get_default_host(), host_irq);
+   irq_map->r_hwirq = (unsigned int)irqd_to_hwirq(host_data);
 
if (i == pimap->n_mapped)
pimap->n_mapped++;
@@ -5305,7 +5313,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
if (xics_on_xive())
rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq);
else
-   kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
+   kvmppc_xics_set_mapped(kvm, guest_gsi, irq_map->r_hwirq);
if (rc)
irq_map->r_hwirq = 0;
 
-- 
2.31.1



[PATCH v2 31/32] powerpc/xive: Use XIVE domain under xmon and debugfs

2021-07-01 Thread Cédric Le Goater
The default domain of the PCI/MSIs is not the XIVE domain anymore. To
list the IRQ mappings under XMON and debugfs, query the IRQ data from
the low level XIVE domain.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index f0012d6b4fe9..f8ff558bc305 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -322,11 +322,10 @@ void xmon_xive_get_irq_all(void)
struct irq_desc *desc;
 
for_each_irq_desc(i, desc) {
-   struct irq_data *d = irq_desc_get_irq_data(desc);
-   unsigned int hwirq = (unsigned int)irqd_to_hwirq(d);
+   struct irq_data *d = irq_domain_get_irq_data(xive_irq_domain, 
i);
 
-   if (d->domain == xive_irq_domain)
-   xmon_xive_get_irq_config(hwirq, d);
+   if (d)
+   xmon_xive_get_irq_config(irqd_to_hwirq(d), d);
}
 }
 
@@ -1766,9 +1765,9 @@ static int xive_core_debug_show(struct seq_file *m, void 
*private)
xive_debug_show_cpu(m, cpu);
 
for_each_irq_desc(i, desc) {
-   struct irq_data *d = irq_desc_get_irq_data(desc);
+   struct irq_data *d = irq_domain_get_irq_data(xive_irq_domain, 
i);
 
-   if (d->domain == xive_irq_domain)
+   if (d)
xive_debug_show_irq(m, d);
}
return 0;
-- 
2.31.1



[PATCH v2 28/32] powerpc/powernv/pci: Set the IRQ chip data for P8/CXL devices

2021-07-01 Thread Cédric Le Goater
Before MSI domains, the default IRQ chip of PHB3 MSIs was patched by
pnv_set_msi_irq_chip() with the custom EOI handler pnv_ioda2_msi_eoi()
and the owning PHB was deduced from the 'ioda.irq_chip' field. This
path has been deprecated by the MSI domains but it is still in use by
the P8 CAPI 'cxl' driver.

Rewriting this driver to support MSI would be a waste of time.
Nevertheless, we can still remove the IRQ chip patch and set the IRQ
chip data instead. This is cleaner.

Cc: Frederic Barrat 
Cc: Christophe Lombard 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 6c4b37598bcc..aa97245eedbf 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1971,19 +1971,23 @@ int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, 
unsigned int hw_irq)
return opal_pci_msi_eoi(phb->opal_id, hw_irq);
 }
 
+/*
+ * The IRQ data is mapped in the XICS domain, with OPAL HW IRQ numbers
+ */
 static void pnv_ioda2_msi_eoi(struct irq_data *d)
 {
int64_t rc;
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
-   struct irq_chip *chip = irq_data_get_irq_chip(d);
+   struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+   struct pnv_phb *phb = hose->private_data;
 
-   rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
+   rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
WARN_ON_ONCE(rc);
 
icp_native_eoi(d);
 }
 
-
+/* P8/CXL only */
 void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
 {
struct irq_data *idata;
@@ -2005,6 +2009,7 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned 
int virq)
phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
}
irq_set_chip(virq, &phb->ioda.irq_chip);
+   irq_set_chip_data(virq, phb->hose);
 }
 
 static struct irq_chip pnv_pci_msi_irq_chip;
-- 
2.31.1



[PATCH v2 32/32] genirq: Improve "hwirq" output in /proc and /sys/

2021-07-01 Thread Cédric Le Goater
The HW IRQ numbers generated by the PCI MSI layer can be quite large
on a pSeries machine when running under the IBM Hypervisor and they
appear as negative. Use '%lu' instead to show them correctly.

Cc: Thomas Gleixner 
Signed-off-by: Cédric Le Goater 
---
 kernel/irq/irqdesc.c | 2 +-
 kernel/irq/proc.c| 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4a617d7312a4..1d8b7fb6b366 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -188,7 +188,7 @@ static ssize_t hwirq_show(struct kobject *kobj,
 
raw_spin_lock_irq(&desc->lock);
if (desc->irq_data.domain)
-   ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq);
+   ret = sprintf(buf, "%lu\n", desc->irq_data.hwirq);
raw_spin_unlock_irq(&desc->lock);
 
return ret;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 7c5cd42df3b9..ee595ec09778 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -513,7 +513,7 @@ int show_interrupts(struct seq_file *p, void *v)
seq_printf(p, " %8s", "None");
}
if (desc->irq_data.domain)
-   seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
+   seq_printf(p, " %*lu", prec, desc->irq_data.hwirq);
else
seq_printf(p, " %*s", prec, "");
 #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
-- 
2.31.1



[PATCH v2 15/32] KVM: PPC: Book3S HV: XIVE: Fix mapping of passthrough interrupts

2021-07-01 Thread Cédric Le Goater
PCI MSI interrupt numbers are now mapped in a PCI-MSI domain but the
underlying calls handling the passthrough of the interrupt in the
guest need a number in the XIVE IRQ domain.

Use the IRQ data mapped in the XIVE IRQ domain and not the one in the
PCI-MSI domain.

Cc: Thomas Gleixner 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/kvm/book3s_xive.c | 3 ++-
 kernel/irq/irqdomain.c | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 434da541a20b..d30eb35cc7f0 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -926,7 +926,8 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long 
guest_irq,
struct kvmppc_xive *xive = kvm->arch.xive;
struct kvmppc_xive_src_block *sb;
struct kvmppc_xive_irq_state *state;
-   struct irq_data *host_data = irq_get_irq_data(host_irq);
+   struct irq_data *host_data =
+   irq_domain_get_irq_data(irq_get_default_host(), host_irq);
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data);
u16 idx;
u8 prio;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 6284443b87ec..c8c06318dcbf 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -481,6 +481,7 @@ struct irq_domain *irq_get_default_host(void)
 {
return irq_default_domain;
 }
+EXPORT_SYMBOL_GPL(irq_get_default_host);
 
 static void irq_domain_clear_mapping(struct irq_domain *domain,
 irq_hw_number_t hwirq)
-- 
2.31.1



[PATCH v2 17/32] powerpc/xics: Rename the map handler into a check handler

2021-07-01 Thread Cédric Le Goater
This moves the IRQ initialization done under the different ICS backends
into the common part of XICS. The 'map' handler becomes a simple 'check'
on the HW IRQ at the FW level.

As we don't need an ICS anymore in xics_migrate_irqs_away(), the XICS
domain does not set a chip data for the IRQ.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/xics.h|  3 ++-
 arch/powerpc/sysdev/xics/ics-native.c  | 13 +---
 arch/powerpc/sysdev/xics/ics-opal.c| 27 +
 arch/powerpc/sysdev/xics/ics-rtas.c| 28 +-
 arch/powerpc/sysdev/xics/xics-common.c | 15 --
 5 files changed, 36 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h
index 584dcf903590..e76d835dc03f 100644
--- a/arch/powerpc/include/asm/xics.h
+++ b/arch/powerpc/include/asm/xics.h
@@ -89,10 +89,11 @@ static inline int ics_opal_init(void) { return -ENODEV; }
 /* ICS instance, hooked up to chip_data of an irq */
 struct ics {
struct list_head link;
-   int (*map)(struct ics *ics, unsigned int virq);
+   int (*check)(struct ics *ics, unsigned int hwirq);
void (*mask_unknown)(struct ics *ics, unsigned long vec);
long (*get_server)(struct ics *ics, unsigned long vec);
int (*host_match)(struct ics *ics, struct device_node *node);
+   struct irq_chip *chip;
char data[];
 };
 
diff --git a/arch/powerpc/sysdev/xics/ics-native.c 
b/arch/powerpc/sysdev/xics/ics-native.c
index d450502f4053..dec7d93a8ba1 100644
--- a/arch/powerpc/sysdev/xics/ics-native.c
+++ b/arch/powerpc/sysdev/xics/ics-native.c
@@ -131,19 +131,15 @@ static struct irq_chip ics_native_irq_chip = {
.irq_retrigger  = xics_retrigger,
 };
 
-static int ics_native_map(struct ics *ics, unsigned int virq)
+static int ics_native_check(struct ics *ics, unsigned int hw_irq)
 {
-   unsigned int vec = (unsigned int)virq_to_hw(virq);
struct ics_native *in = to_ics_native(ics);
 
-   pr_devel("%s: vec=0x%x\n", __func__, vec);
+   pr_devel("%s: hw_irq=0x%x\n", __func__, hw_irq);
 
-   if (vec < in->ibase || vec >= (in->ibase + in->icount))
+   if (hw_irq < in->ibase || hw_irq >= (in->ibase + in->icount))
return -EINVAL;
 
-   irq_set_chip_and_handler(virq, &ics_native_irq_chip, 
handle_fasteoi_irq);
-   irq_set_chip_data(virq, ics);
-
return 0;
 }
 
@@ -177,10 +173,11 @@ static int ics_native_host_match(struct ics *ics, struct 
device_node *node)
 }
 
 static struct ics ics_native_template = {
-   .map= ics_native_map,
+   .check  = ics_native_check,
.mask_unknown   = ics_native_mask_unknown,
.get_server = ics_native_get_server,
.host_match = ics_native_host_match,
+   .chip = &ics_native_irq_chip,
 };
 
 static int __init ics_native_add_one(struct device_node *np)
diff --git a/arch/powerpc/sysdev/xics/ics-opal.c 
b/arch/powerpc/sysdev/xics/ics-opal.c
index 823f6c9664cd..8c7ddcc718b6 100644
--- a/arch/powerpc/sysdev/xics/ics-opal.c
+++ b/arch/powerpc/sysdev/xics/ics-opal.c
@@ -157,26 +157,13 @@ static struct irq_chip ics_opal_irq_chip = {
.irq_retrigger = xics_retrigger,
 };
 
-static int ics_opal_map(struct ics *ics, unsigned int virq);
-static void ics_opal_mask_unknown(struct ics *ics, unsigned long vec);
-static long ics_opal_get_server(struct ics *ics, unsigned long vec);
-
 static int ics_opal_host_match(struct ics *ics, struct device_node *node)
 {
return 1;
 }
 
-/* Only one global & state struct ics */
-static struct ics ics_hal = {
-   .map= ics_opal_map,
-   .mask_unknown   = ics_opal_mask_unknown,
-   .get_server = ics_opal_get_server,
-   .host_match = ics_opal_host_match,
-};
-
-static int ics_opal_map(struct ics *ics, unsigned int virq)
+static int ics_opal_check(struct ics *ics, unsigned int hw_irq)
 {
-   unsigned int hw_irq = (unsigned int)virq_to_hw(virq);
int64_t rc;
__be16 server;
int8_t priority;
@@ -189,9 +176,6 @@ static int ics_opal_map(struct ics *ics, unsigned int virq)
if (rc != OPAL_SUCCESS)
return -ENXIO;
 
-   irq_set_chip_and_handler(virq, &ics_opal_irq_chip, handle_fasteoi_irq);
-   irq_set_chip_data(virq, &ics_hal);
-
return 0;
 }
 
@@ -222,6 +206,15 @@ static long ics_opal_get_server(struct ics *ics, unsigned 
long vec)
return ics_opal_unmangle_server(be16_to_cpu(server));
 }
 
+/* Only one global & state struct ics */
+static struct ics ics_hal = {
+   .check  = ics_opal_check,
+   .mask_unknown   = ics_opal_mask_unknown,
+   .get_server = ics_opal_get_server,
+   .host_match = ics_opal_host_match,
+   .chip   = &ics_opal_irq_chip,
+};
+
 int __init ics_opal_init(void)
 {
if (!firmware_has_feature(FW_FEATURE_OPAL))
diff --git a/arch/

[PATCH v2 20/32] powerpc/xics: Add support for IRQ domain hierarchy

2021-07-01 Thread Cédric Le Goater
XICS doesn't have any state associated with the IRQ. The support is
straightforward and simpler than for XIVE.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xics/xics-common.c | 41 ++
 1 file changed, 41 insertions(+)

diff --git a/arch/powerpc/sysdev/xics/xics-common.c 
b/arch/powerpc/sysdev/xics/xics-common.c
index 419d91bffec3..e82d0d4ddec0 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -406,7 +406,48 @@ int xics_retrigger(struct irq_data *data)
return 0;
 }
 
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+static int xics_host_domain_translate(struct irq_domain *d, struct irq_fwspec 
*fwspec,
+ unsigned long *hwirq, unsigned int *type)
+{
+   return xics_host_xlate(d, to_of_node(fwspec->fwnode), fwspec->param,
+  fwspec->param_count, hwirq, type);
+}
+
+static int xics_host_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+   struct irq_fwspec *fwspec = arg;
+   irq_hw_number_t hwirq;
+   unsigned int type = IRQ_TYPE_NONE;
+   int i, rc;
+
+   rc = xics_host_domain_translate(domain, fwspec, &hwirq, &type);
+   if (rc)
+   return rc;
+
+   pr_debug("%s %d/%lx #%d\n", __func__, virq, hwirq, nr_irqs);
+
+   for (i = 0; i < nr_irqs; i++)
+   irq_domain_set_info(domain, virq + i, hwirq + i, xics_ics->chip,
+   xics_ics, handle_fasteoi_irq, NULL, NULL);
+
+   return 0;
+}
+
+static void xics_host_domain_free(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+   pr_debug("%s %d #%d\n", __func__, virq, nr_irqs);
+}
+#endif
+
 static const struct irq_domain_ops xics_host_ops = {
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+   .alloc  = xics_host_domain_alloc,
+   .free   = xics_host_domain_free,
+   .translate = xics_host_domain_translate,
+#endif
.match = xics_host_match,
.map = xics_host_map,
.xlate = xics_host_xlate,
-- 
2.31.1



[PATCH v2 19/32] powerpc/xics: Add debug logging to the set_irq_affinity handlers

2021-07-01 Thread Cédric Le Goater
It really helps to know how the HW is configured when tweaking the IRQ
subsystem.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xics/ics-opal.c | 2 +-
 arch/powerpc/sysdev/xics/ics-rtas.c | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/sysdev/xics/ics-opal.c 
b/arch/powerpc/sysdev/xics/ics-opal.c
index 8c7ddcc718b6..bf26cae1b982 100644
--- a/arch/powerpc/sysdev/xics/ics-opal.c
+++ b/arch/powerpc/sysdev/xics/ics-opal.c
@@ -133,7 +133,7 @@ static int ics_opal_set_affinity(struct irq_data *d,
}
server = ics_opal_mangle_server(wanted_server);
 
-   pr_devel("ics-hal: set-affinity irq %d [hw 0x%x] server: 0x%x/0x%x\n",
+   pr_debug("ics-hal: set-affinity irq %d [hw 0x%x] server: 0x%x/0x%x\n",
 d->irq, hw_irq, wanted_server, server);
 
rc = opal_set_xive(hw_irq, server, priority);
diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c 
b/arch/powerpc/sysdev/xics/ics-rtas.c
index 6d19d711ed35..b50c6341682e 100644
--- a/arch/powerpc/sysdev/xics/ics-rtas.c
+++ b/arch/powerpc/sysdev/xics/ics-rtas.c
@@ -133,6 +133,9 @@ static int ics_rtas_set_affinity(struct irq_data *d,
return -1;
}
 
+   pr_debug("%s: irq %d [hw 0x%x] server: 0x%x\n", __func__, d->irq,
+hw_irq, irq_server);
+
status = rtas_call_reentrant(ibm_set_xive, 3, 1, NULL,
 hw_irq, irq_server, xics_status[1]);
 
-- 
2.31.1



[PATCH v2 29/32] powerpc/powernv/pci: Rework pnv_opal_pci_msi_eoi()

2021-07-01 Thread Cédric Le Goater
pnv_opal_pci_msi_eoi() is called from KVM to EOI passthrough interrupts
when in real mode. Adding MSI domains broke the hack using the
'ioda.irq_chip' field to deduce the owning PHB. Fix that by using the
IRQ chip data in the MSI domain.

The 'ioda.irq_chip' field is now unused and could be removed from the
pnv_phb struct.

Cc: Alexey Kardashevskiy 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/pnv-pci.h|  2 +-
 arch/powerpc/kvm/book3s_hv_rm_xics.c  |  8 
 arch/powerpc/platforms/powernv/pci-ioda.c | 17 +
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/pnv-pci.h 
b/arch/powerpc/include/asm/pnv-pci.h
index d0ee0ede5767..b3f480799352 100644
--- a/arch/powerpc/include/asm/pnv-pci.h
+++ b/arch/powerpc/include/asm/pnv-pci.h
@@ -33,7 +33,7 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num);
 void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num);
 int pnv_cxl_get_irq_count(struct pci_dev *dev);
 struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev);
-int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq);
+int64_t pnv_opal_pci_msi_eoi(struct irq_data *d);
 bool is_pnv_opal_msi(struct irq_chip *chip);
 
 #ifdef CONFIG_CXL_BASE
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c 
b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 0a11ec88a0ae..587c33fc4564 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -706,6 +706,7 @@ static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq)
icp->rm_eoied_irq = irq;
}
 
+   /* Handle passthrough interrupts */
if (state->host_irq) {
++vcpu->stat.pthru_all;
if (state->intr_cpu != -1) {
@@ -759,12 +760,12 @@ int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long 
xirr)
 
 static unsigned long eoi_rc;
 
-static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
+static void icp_eoi(struct irq_data *d, u32 hwirq, __be32 xirr, bool *again)
 {
void __iomem *xics_phys;
int64_t rc;
 
-   rc = pnv_opal_pci_msi_eoi(c, hwirq);
+   rc = pnv_opal_pci_msi_eoi(d);
 
if (rc)
eoi_rc = rc;
@@ -872,8 +873,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
icp_rm_deliver_irq(xics, icp, irq, false);
 
/* EOI the interrupt */
-   icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr,
-   again);
+   icp_eoi(irq_desc_get_irq_data(irq_map->desc), irq_map->r_hwirq, xirr, 
again);
 
if (check_too_hard(xics, icp) == H_TOO_HARD)
return 2;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index aa97245eedbf..2389cd79c3c8 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1963,12 +1963,21 @@ void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
pe->dma_setup_done = true;
 }
 
-int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
+/*
+ * Called from KVM in real mode to EOI passthru interrupts. The ICP
+ * EOI is handled directly in KVM in kvmppc_deliver_irq_passthru().
+ *
+ * The IRQ data is mapped in the PCI-MSI domain and the EOI OPAL call
+ * needs an HW IRQ number mapped in the XICS IRQ domain. The HW IRQ
+ * numbers of the in-the-middle MSI domain are vector numbers and it's
+ * good enough for OPAL. Use that.
+ */
+int64_t pnv_opal_pci_msi_eoi(struct irq_data *d)
 {
-   struct pnv_phb *phb = container_of(chip, struct pnv_phb,
-  ioda.irq_chip);
+   struct pci_controller *hose = 
irq_data_get_irq_chip_data(d->parent_data);
+   struct pnv_phb *phb = hose->private_data;
 
-   return opal_pci_msi_eoi(phb->opal_id, hw_irq);
+   return opal_pci_msi_eoi(phb->opal_id, d->parent_data->hwirq);
 }
 
 /*
-- 
2.31.1



[PATCH v2 25/32] powerpc/powernv/pci: Drop unused MSI code

2021-07-01 Thread Cédric Le Goater
MSIs should be fully managed by the PCI and IRQ subsystems now.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/powernv/pci.h  |  6 --
 arch/powerpc/platforms/powernv/pci-ioda.c | 27 -
 arch/powerpc/platforms/powernv/pci.c  | 67 ---
 3 files changed, 100 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index c8d4f222a86f..966a9eb64339 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -123,11 +123,7 @@ struct pnv_phb {
 #endif
 
unsigned intmsi_base;
-   unsigned intmsi32_support;
struct msi_bitmap   msi_bmp;
-   int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev,
-unsigned int hwirq, unsigned int virq,
-unsigned int is_64, struct msi_msg *msg);
int (*init_m64)(struct pnv_phb *phb);
int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
@@ -289,8 +285,6 @@ extern void pnv_pci_init_npu2_opencapi_phb(struct 
device_node *np);
 extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
 extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
 
-extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
-extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
 extern struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn);
 extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
 extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index e2454439e574..eb38ce1fd434 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2080,29 +2080,6 @@ static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, 
struct pci_dev *dev,
return 0;
 }
 
-static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
- unsigned int hwirq, unsigned int virq,
- unsigned int is_64, struct msi_msg *msg)
-{
-   struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
-   unsigned int xive_num = hwirq - phb->msi_base;
-   int rc;
-
-   rc = __pnv_pci_ioda_msi_setup(phb, dev, xive_num, is_64, msg);
-   if (rc)
-   return rc;
-
-   /* P8 only */
-   pnv_set_msi_irq_chip(phb, virq);
-
-   pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
-" address=%x_%08x data=%x PE# %x\n",
-pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
-msg->address_hi, msg->address_lo, msg->data, pe->pe_number);
-
-   return 0;
-}
-
 /*
  * The msi_free() op is called before irq_domain_free_irqs_top() when
  * the handler data is still available. Use that to clear the XIVE
@@ -2327,8 +2304,6 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
return;
}
 
-   phb->msi_setup = pnv_pci_ioda_msi_setup;
-   phb->msi32_support = 1;
pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
count, phb->msi_base);
 
@@ -2936,8 +2911,6 @@ static const struct pci_controller_ops 
pnv_pci_ioda_controller_ops = {
.dma_dev_setup  = pnv_pci_ioda_dma_dev_setup,
.dma_bus_setup  = pnv_pci_ioda_dma_bus_setup,
.iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
-   .setup_msi_irqs = pnv_setup_msi_irqs,
-   .teardown_msi_irqs  = pnv_teardown_msi_irqs,
.enable_device_hook = pnv_pci_enable_device_hook,
.release_device = pnv_pci_release_device,
.window_alignment   = pnv_pci_window_alignment,
diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index b18468dc31ff..e9dee50ea881 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -160,73 +160,6 @@ int pnv_pci_set_power_state(uint64_t id, uint8_t state, 
struct opal_msg *msg)
 }
 EXPORT_SYMBOL_GPL(pnv_pci_set_power_state);
 
-int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
-{
-   struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
-   struct msi_desc *entry;
-   struct msi_msg msg;
-   int hwirq;
-   unsigned int virq;
-   int rc;
-
-   if (WARN_ON(!phb) || !phb->msi_bmp.bitmap)
-   return -ENODEV;
-
-   if (pdev->no_64bit_msi && !phb->msi32_support)
-   return -ENODEV;
-
-   for_each_pci_msi_entry(entry, pdev) {
-   if (!entry->msi_attrib.is_64 && !phb->msi32_support) {
-   pr_warn("%s: Supports only 64-bit MSIs\n",
-   pci

[PATCH v2 09/32] powerpc/pseries/pci: Add a msi_free() handler to clear XIVE data

2021-07-01 Thread Cédric Le Goater
The MSI domain clears the IRQ with msi_domain_free(), which calls
irq_domain_free_irqs_top(), which clears the handler data. This is a
problem for the XIVE controller since we need to unmap MMIO pages and
free a specific XIVE structure.

The 'msi_free()' handler is called before irq_domain_free_irqs_top()
when the handler data is still available. Use that to clear the XIVE
controller data.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/xive.h  |  1 +
 arch/powerpc/platforms/pseries/msi.c | 16 +++-
 arch/powerpc/sysdev/xive/common.c|  5 -
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index aa094a8655b0..20ae50ab083c 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -111,6 +111,7 @@ void xive_native_free_vp_block(u32 vp_base);
 int xive_native_populate_irq_data(u32 hw_irq,
  struct xive_irq_data *data);
 void xive_cleanup_irq_data(struct xive_irq_data *xd);
+void xive_irq_free_data(unsigned int virq);
 void xive_native_free_irq(u32 irq);
 int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
 
diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index 591cee9cbc9e..f9635b01b2bf 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -529,6 +529,19 @@ static int pseries_msi_ops_prepare(struct irq_domain 
*domain, struct device *dev
return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
 }
 
+/*
+ * ->msi_free() is called before irq_domain_free_irqs_top() when the
+ * handler data is still available. Use that to clear the XIVE
+ * controller data.
+ */
+static void pseries_msi_ops_msi_free(struct irq_domain *domain,
+struct msi_domain_info *info,
+unsigned int irq)
+{
+   if (xive_enabled())
+   xive_irq_free_data(irq);
+}
+
 /*
  * RTAS can not disable one MSI at a time. It's all or nothing. Do it
  * at the end after all IRQs have been freed.
@@ -546,6 +559,7 @@ static void pseries_msi_domain_free_irqs(struct irq_domain 
*domain,
 
 static struct msi_domain_ops pseries_pci_msi_domain_ops = {
.msi_prepare= pseries_msi_ops_prepare,
+   .msi_free   = pseries_msi_ops_msi_free,
.domain_free_irqs = pseries_msi_domain_free_irqs,
 };
 
@@ -660,7 +674,7 @@ static void pseries_irq_domain_free(struct irq_domain 
*domain, unsigned int virq
 
pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
 
-   irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+   /* XIVE domain data is cleared through ->msi_free() */
 }
 
 static const struct irq_domain_ops pseries_irq_domain_ops = {
diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 38183c9b21c0..f0012d6b4fe9 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -986,6 +986,8 @@ EXPORT_SYMBOL_GPL(is_xive_irq);
 
 void xive_cleanup_irq_data(struct xive_irq_data *xd)
 {
+   pr_debug("%s for HW %x\n", __func__, xd->hw_irq);
+
if (xd->eoi_mmio) {
iounmap(xd->eoi_mmio);
if (xd->eoi_mmio == xd->trig_mmio)
@@ -1027,7 +1029,7 @@ static int xive_irq_alloc_data(unsigned int virq, 
irq_hw_number_t hw)
return 0;
 }
 
-static void xive_irq_free_data(unsigned int virq)
+void xive_irq_free_data(unsigned int virq)
 {
struct xive_irq_data *xd = irq_get_handler_data(virq);
 
@@ -1037,6 +1039,7 @@ static void xive_irq_free_data(unsigned int virq)
xive_cleanup_irq_data(xd);
kfree(xd);
 }
+EXPORT_SYMBOL_GPL(xive_irq_free_data);
 
 #ifdef CONFIG_SMP
 
-- 
2.31.1



[PATCH v2 27/32] powerpc/xics: Fix IRQ migration

2021-07-01 Thread Cédric Le Goater
desc->irq_data points to the top level IRQ data descriptor which is
not necessarily in the XICS IRQ domain. MSIs are in another domain for
instance. Fix that by looking for a mapping on the low level XICS IRQ
domain.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xics/xics-common.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/xics/xics-common.c 
b/arch/powerpc/sysdev/xics/xics-common.c
index e82d0d4ddec0..0b8b49446992 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -183,6 +183,8 @@ void xics_migrate_irqs_away(void)
unsigned int irq, virq;
struct irq_desc *desc;
 
+   pr_debug("%s: CPU %u\n", __func__, cpu);
+
/* If we used to be the default server, move to the new "boot_cpuid" */
if (hw_cpu == xics_default_server)
xics_update_irq_servers();
@@ -197,6 +199,7 @@ void xics_migrate_irqs_away(void)
struct irq_chip *chip;
long server;
unsigned long flags;
+   struct irq_data *irqd;
 
/* We can't set affinity on ISA interrupts */
if (virq < NUM_ISA_INTERRUPTS)
@@ -204,9 +207,11 @@ void xics_migrate_irqs_away(void)
/* We only need to migrate enabled IRQS */
if (!desc->action)
continue;
-   if (desc->irq_data.domain != xics_host)
+   /* We need a mapping in the XICS IRQ domain */
+   irqd = irq_domain_get_irq_data(xics_host, virq);
+   if (!irqd)
continue;
-   irq = desc->irq_data.hwirq;
+   irq = irqd_to_hwirq(irqd);
/* We need to get IPIs still. */
if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
continue;
-- 
2.31.1



[PATCH v2 14/32] KVM: PPC: Book3S HV: XIVE: Change interface of passthrough interrupt routines

2021-07-01 Thread Cédric Le Goater
The routine kvmppc_set_passthru_irq() calls kvmppc_xive_set_mapped()
and kvmppc_xive_clr_mapped() with an IRQ descriptor. Use the host IRQ
number directly to remove a useless conversion.

Add some debug output.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/kvm_ppc.h |  4 ++--
 arch/powerpc/kvm/book3s_hv.c   |  4 ++--
 arch/powerpc/kvm/book3s_xive.c | 17 -
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 2d88944f9f34..671fbd1a765e 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -664,9 +664,9 @@ extern int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu);
 extern void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu);
 extern int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
- struct irq_desc *host_desc);
+ unsigned long host_irq);
 extern int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
- struct irq_desc *host_desc);
+ unsigned long host_irq);
 extern u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 048b4ca55cfe..965178aeff13 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5303,7 +5303,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
pimap->n_mapped++;
 
if (xics_on_xive())
-   rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc);
+   rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq);
else
kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
if (rc)
@@ -5344,7 +5344,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
}
 
if (xics_on_xive())
-   rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, 
pimap->mapped[i].desc);
+   rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, host_irq);
else
kvmppc_xics_clr_mapped(kvm, guest_gsi, 
pimap->mapped[i].r_hwirq);
 
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 9268d386b128..434da541a20b 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -921,13 +921,12 @@ int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
 }
 
 int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
-  struct irq_desc *host_desc)
+  unsigned long host_irq)
 {
struct kvmppc_xive *xive = kvm->arch.xive;
struct kvmppc_xive_src_block *sb;
struct kvmppc_xive_irq_state *state;
-   struct irq_data *host_data = irq_desc_get_irq_data(host_desc);
-   unsigned int host_irq = irq_desc_get_irq(host_desc);
+   struct irq_data *host_data = irq_get_irq_data(host_irq);
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data);
u16 idx;
u8 prio;
@@ -936,7 +935,8 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long 
guest_irq,
if (!xive)
return -ENODEV;
 
-   pr_devel("set_mapped girq 0x%lx host HW irq 0x%x...\n",guest_irq, 
hw_irq);
+   pr_debug("%s: GIRQ 0x%lx host IRQ %ld XIVE HW IRQ 0x%x\n",
+__func__, guest_irq, host_irq, hw_irq);
 
sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
if (!sb)
@@ -958,7 +958,7 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long 
guest_irq,
 */
rc = irq_set_vcpu_affinity(host_irq, state);
if (rc) {
-   pr_err("Failed to set VCPU affinity for irq %d\n", host_irq);
+   pr_err("Failed to set VCPU affinity for host IRQ %ld\n", 
host_irq);
return rc;
}
 
@@ -1018,12 +1018,11 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned 
long guest_irq,
 EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped);
 
 int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
-  struct irq_desc *host_desc)
+  unsigned long host_irq)
 {
struct kvmppc_xive *xive = kvm->arch.xive;
struct kvmppc_xive_src_block *sb;
struct kvmppc_xive_irq_state *state;
-   unsigned int host_irq = irq_desc_get_irq(host_desc);
u16 idx;
u8 prio;
int rc;
@@ -1031,7 +1030,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long 
guest_irq,
if (!xive)
return -ENODEV;
 
-   pr_devel("clr_mapped girq 0x%lx...\n", guest_irq);
+   pr_debug("%s: GIRQ 0x%lx host IRQ %ld\n", __func__,

[PATCH v2 26/32] powerpc/powernv/pci: Adapt is_pnv_opal_msi() to detect passthrough interrupt

2021-07-01 Thread Cédric Le Goater
The pnv_ioda2_msi_eoi() chip handler is not used anymore for MSIs.
Simply use the check on the PCI-MSI chip.

Cc: Alexey Kardashevskiy 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index eb38ce1fd434..6c4b37598bcc 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2015,7 +2015,7 @@ static struct irq_chip pnv_pci_msi_irq_chip;
  */
 bool is_pnv_opal_msi(struct irq_chip *chip)
 {
-   return chip->irq_eoi == pnv_ioda2_msi_eoi || chip == 
&pnv_pci_msi_irq_chip;
+   return chip == &pnv_pci_msi_irq_chip;
 }
 EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
 
-- 
2.31.1



[PATCH v2 23/32] powerpc/xics: Drop unmask of MSIs at startup

2021-07-01 Thread Cédric Le Goater
That was a workaround in the XICS domain because of the lack of an MSI
domain. This is now handled.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xics/ics-opal.c | 11 ---
 arch/powerpc/sysdev/xics/ics-rtas.c |  9 -
 2 files changed, 20 deletions(-)

diff --git a/arch/powerpc/sysdev/xics/ics-opal.c 
b/arch/powerpc/sysdev/xics/ics-opal.c
index bf26cae1b982..c4d95d8beb6f 100644
--- a/arch/powerpc/sysdev/xics/ics-opal.c
+++ b/arch/powerpc/sysdev/xics/ics-opal.c
@@ -62,17 +62,6 @@ static void ics_opal_unmask_irq(struct irq_data *d)
 
 static unsigned int ics_opal_startup(struct irq_data *d)
 {
-#ifdef CONFIG_PCI_MSI
-   /*
-* The generic MSI code returns with the interrupt disabled on the
-* card, using the MSI mask bits. Firmware doesn't appear to unmask
-* at that level, so we do it here by hand.
-*/
-   if (irq_data_get_msi_desc(d))
-   pci_msi_unmask_irq(d);
-#endif
-
-   /* unmask it */
ics_opal_unmask_irq(d);
return 0;
 }
diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c 
b/arch/powerpc/sysdev/xics/ics-rtas.c
index b50c6341682e..b9da317b7a2d 100644
--- a/arch/powerpc/sysdev/xics/ics-rtas.c
+++ b/arch/powerpc/sysdev/xics/ics-rtas.c
@@ -57,15 +57,6 @@ static void ics_rtas_unmask_irq(struct irq_data *d)
 
 static unsigned int ics_rtas_startup(struct irq_data *d)
 {
-#ifdef CONFIG_PCI_MSI
-   /*
-* The generic MSI code returns with the interrupt disabled on the
-* card, using the MSI mask bits. Firmware doesn't appear to unmask
-* at that level, so we do it here by hand.
-*/
-   if (irq_data_get_msi_desc(d))
-   pci_msi_unmask_irq(d);
-#endif
/* unmask it */
ics_rtas_unmask_irq(d);
return 0;
-- 
2.31.1



[PATCH v2 03/32] powerpc/xive: Add support for IRQ domain hierarchy

2021-07-01 Thread Cédric Le Goater
This adds handlers to allocate/free IRQs in a domain hierarchy. We
could try to use xive_irq_domain_map() in xive_irq_domain_alloc() but
we rely on xive_irq_alloc_data() to set the IRQ handler data and
duplicating the code is simpler.

xive_irq_free_data() needs to be called when IRQs are freed to clear
the MMIO mappings and free the XIVE handler data (the xive_irq_data
structure). This is going to be a problem with MSI domains, which we
will address later.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 64 +++
 1 file changed, 64 insertions(+)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index f985ed331a8c..834f1a378fc2 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1375,7 +1375,71 @@ static void xive_irq_domain_debug_show(struct seq_file 
*m, struct irq_domain *d,
 }
 #endif
 
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+static int xive_irq_domain_translate(struct irq_domain *d,
+struct irq_fwspec *fwspec,
+unsigned long *hwirq,
+unsigned int *type)
+{
+   return xive_irq_domain_xlate(d, to_of_node(fwspec->fwnode),
+fwspec->param, fwspec->param_count,
+hwirq, type);
+}
+
+static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+unsigned int nr_irqs, void *arg)
+{
+   struct irq_fwspec *fwspec = arg;
+   irq_hw_number_t hwirq;
+   unsigned int type = IRQ_TYPE_NONE;
+   int i, rc;
+
+   rc = xive_irq_domain_translate(domain, fwspec, &hwirq, &type);
+   if (rc)
+   return rc;
+
+   pr_debug("%s %d/%lx #%d\n", __func__, virq, hwirq, nr_irqs);
+
+   for (i = 0; i < nr_irqs; i++) {
+   /* TODO: call xive_irq_domain_map() */
+
+   /*
+* Mark interrupts as edge sensitive by default so that resend
+* actually works. Will fix that up below if needed.
+*/
+   irq_clear_status_flags(virq, IRQ_LEVEL);
+
+   /* allocates and sets handler data */
+   rc = xive_irq_alloc_data(virq + i, hwirq + i);
+   if (rc)
+   return rc;
+
+   irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
+ &xive_irq_chip, 
domain->host_data);
+   irq_set_handler(virq + i, handle_fasteoi_irq);
+   }
+
+   return 0;
+}
+
+static void xive_irq_domain_free(struct irq_domain *domain,
+unsigned int virq, unsigned int nr_irqs)
+{
+   int i;
+
+   pr_debug("%s %d #%d\n", __func__, virq, nr_irqs);
+
+   for (i = 0; i < nr_irqs; i++)
+   xive_irq_free_data(virq + i);
+}
+#endif
+
 static const struct irq_domain_ops xive_irq_domain_ops = {
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+   .alloc  = xive_irq_domain_alloc,
+   .free   = xive_irq_domain_free,
+   .translate = xive_irq_domain_translate,
+#endif
.match = xive_irq_domain_match,
.map = xive_irq_domain_map,
.unmap = xive_irq_domain_unmap,
-- 
2.31.1



[PATCH v2 24/32] powerpc/pseries/pci: Drop unused MSI code

2021-07-01 Thread Cédric Le Goater
MSIs should be fully managed by the PCI and IRQ subsystems now.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/pseries/msi.c | 87 
 1 file changed, 87 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index e196cc1b8540..1b305e411862 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -111,21 +111,6 @@ static int rtas_query_irq_number(struct pci_dn *pdn, int 
offset)
return rtas_ret[0];
 }
 
-static void rtas_teardown_msi_irqs(struct pci_dev *pdev)
-{
-   struct msi_desc *entry;
-
-   for_each_pci_msi_entry(entry, pdev) {
-   if (!entry->irq)
-   continue;
-
-   irq_set_msi_desc(entry->irq, NULL);
-   irq_dispose_mapping(entry->irq);
-   }
-
-   rtas_disable_msi(pdev);
-}
-
 static int check_req(struct pci_dev *pdev, int nvec, char *prop_name)
 {
struct device_node *dn;
@@ -459,66 +444,6 @@ static int rtas_prepare_msi_irqs(struct pci_dev *pdev, int 
nvec_in, int type,
return 0;
 }
 
-static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
-{
-   struct pci_dn *pdn;
-   int hwirq, virq, i;
-   int rc;
-   struct msi_desc *entry;
-   struct msi_msg msg;
-
-   rc = rtas_prepare_msi_irqs(pdev, nvec_in, type, NULL);
-   if (rc)
-   return rc;
-
-   pdn = pci_get_pdn(pdev);
-   i = 0;
-   for_each_pci_msi_entry(entry, pdev) {
-   hwirq = rtas_query_irq_number(pdn, i++);
-   if (hwirq < 0) {
-   pr_debug("rtas_msi: error (%d) getting hwirq\n", rc);
-   return hwirq;
-   }
-
-   /*
-* Depending on the number of online CPUs in the original
-* kernel, it is likely for CPU #0 to be offline in a kdump
-* kernel. The associated IRQs in the affinity mappings
-* provided by irq_create_affinity_masks() are thus not
-* started by irq_startup(), as per-design for managed IRQs.
-* This can be a problem with multi-queue block devices driven
-* by blk-mq : such a non-started IRQ is very likely paired
-* with the single queue enforced by blk-mq during kdump (see
-* blk_mq_alloc_tag_set()). This causes the device to remain
-* silent and likely hangs the guest at some point.
-*
-* We don't really care for fine-grained affinity when doing
-* kdump actually : simply ignore the pre-computed affinity
-* masks in this case and let the default mask with all CPUs
-* be used when creating the IRQ mappings.
-*/
-   if (is_kdump_kernel())
-   virq = irq_create_mapping(NULL, hwirq);
-   else
-   virq = irq_create_mapping_affinity(NULL, hwirq,
-  entry->affinity);
-
-   if (!virq) {
-   pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq);
-   return -ENOSPC;
-   }
-
-   dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq);
-   irq_set_msi_desc(virq, entry);
-
-   /* Read config space back so we can restore after reset */
-   __pci_read_msi_msg(entry, &msg);
-   entry->msg = msg;
-   }
-
-   return 0;
-}
-
 static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device 
*dev,
   int nvec, msi_alloc_info_t *arg)
 {
@@ -759,8 +684,6 @@ static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
 
 static int rtas_msi_init(void)
 {
-   struct pci_controller *phb;
-
query_token  = rtas_token("ibm,query-interrupt-source-number");
change_token = rtas_token("ibm,change-msi");
 
@@ -772,16 +695,6 @@ static int rtas_msi_init(void)
 
pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n");
 
-   WARN_ON(pseries_pci_controller_ops.setup_msi_irqs);
-   pseries_pci_controller_ops.setup_msi_irqs = rtas_setup_msi_irqs;
-   pseries_pci_controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs;
-
-   list_for_each_entry(phb, &hose_list, list_node) {
-   WARN_ON(phb->controller_ops.setup_msi_irqs);
-   phb->controller_ops.setup_msi_irqs = rtas_setup_msi_irqs;
-   phb->controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs;
-   }
-
WARN_ON(ppc_md.pci_irq_fixup);
ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup;
 
-- 
2.31.1



[PATCH v2 10/32] powerpc/pseries/pci: Add support of MSI domains to PHB hotplug

2021-07-01 Thread Cédric Le Goater
Simply allocate or release the MSI domains when a PHB is inserted in
or removed from the machine.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/pseries/pseries.h   |  1 +
 arch/powerpc/platforms/pseries/msi.c   | 10 ++
 arch/powerpc/platforms/pseries/pci_dlpar.c |  4 
 3 files changed, 15 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/pseries.h 
b/arch/powerpc/platforms/pseries/pseries.h
index d9280262588b..3544778e06d0 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -86,6 +86,7 @@ int pseries_root_bridge_prepare(struct pci_host_bridge 
*bridge);
 
 extern struct pci_controller_ops pseries_pci_controller_ops;
 int pseries_msi_allocate_domains(struct pci_controller *phb);
+void pseries_msi_free_domains(struct pci_controller *phb);
 
 unsigned long pseries_memory_block_size(void);
 
diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index f9635b01b2bf..e2127a3f7ebd 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -733,6 +733,16 @@ int pseries_msi_allocate_domains(struct pci_controller 
*phb)
return __pseries_msi_allocate_domains(phb, count);
 }
 
+void pseries_msi_free_domains(struct pci_controller *phb)
+{
+   if (phb->msi_domain)
+   irq_domain_remove(phb->msi_domain);
+   if (phb->dev_domain)
+   irq_domain_remove(phb->dev_domain);
+   if (phb->fwnode)
+   irq_domain_free_fwnode(phb->fwnode);
+}
+
 static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
 {
/* No LSI -> leave MSIs (if any) configured */
diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c 
b/arch/powerpc/platforms/pseries/pci_dlpar.c
index a8f9140a24fa..90c9d3531694 100644
--- a/arch/powerpc/platforms/pseries/pci_dlpar.c
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -33,6 +33,8 @@ struct pci_controller *init_phb_dynamic(struct device_node 
*dn)
 
pci_devs_phb_init_dynamic(phb);
 
+   pseries_msi_allocate_domains(phb);
+
/* Create EEH devices for the PHB */
eeh_phb_pe_create(phb);
 
@@ -74,6 +76,8 @@ int remove_phb_dynamic(struct pci_controller *phb)
}
}
 
+   pseries_msi_free_domains(phb);
+
/* Remove the PCI bus and unregister the bridge device from sysfs */
phb->bus = NULL;
pci_remove_bus(b);
-- 
2.31.1



[PATCH v2 06/32] powerpc/xive: Drop unmask of MSIs at startup

2021-07-01 Thread Cédric Le Goater
That was a workaround in the XIVE domain because of the lack of an MSI
domain. This is now handled.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 10 --
 1 file changed, 10 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 2c907a4a2b05..a03057bfccfd 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -626,16 +626,6 @@ static unsigned int xive_irq_startup(struct irq_data *d)
pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n",
 d->irq, hw_irq, d);
 
-#ifdef CONFIG_PCI_MSI
-   /*
-* The generic MSI code returns with the interrupt disabled on the
-* card, using the MSI mask bits. Firmware doesn't appear to unmask
-* at that level, so we do it here by hand.
-*/
-   if (irq_data_get_msi_desc(d))
-   pci_msi_unmask_irq(d);
-#endif
-
/* Pick a target */
target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d));
if (target == XIVE_INVALID_TARGET) {
-- 
2.31.1



[PATCH v2 21/32] powerpc/powernv/pci: Customize the MSI EOI handler to support PHB3

2021-07-01 Thread Cédric Le Goater
PHB3s need an extra OPAL call to EOI the interrupt. The call takes an
OPAL HW IRQ number but it is translated into a vector number in OPAL.
Here, we directly use the vector number of the in-the-middle "PNV-MSI"
domain instead of grabbing the OPAL HW IRQ number in the XICS parent
domain.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 23 ++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index e77caa4dbbdf..b498876a976f 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2169,12 +2169,33 @@ static void pnv_msi_compose_msg(struct irq_data *d, 
struct msi_msg *msg)
entry->msi_attrib.is_64 ? "64" : "32", d->hwirq, rc);
 }
 
+/*
+ * The IRQ data is mapped in the MSI domain in which HW IRQ numbers
+ * correspond to vector numbers.
+ */
+static void pnv_msi_eoi(struct irq_data *d)
+{
+   struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+   struct pnv_phb *phb = hose->private_data;
+
+   if (phb->model == PNV_PHB_MODEL_PHB3) {
+   /*
+* The EOI OPAL call takes an OPAL HW IRQ number but
+* since it is translated into a vector number in
+* OPAL, use that directly.
+*/
+   WARN_ON_ONCE(opal_pci_msi_eoi(phb->opal_id, d->hwirq));
+   }
+
+   irq_chip_eoi_parent(d);
+}
+
 static struct irq_chip pnv_msi_irq_chip = {
.name   = "PNV-MSI",
.irq_shutdown   = pnv_msi_shutdown,
.irq_mask   = irq_chip_mask_parent,
.irq_unmask = irq_chip_unmask_parent,
-   .irq_eoi= irq_chip_eoi_parent,
+   .irq_eoi= pnv_msi_eoi,
.irq_set_affinity   = irq_chip_set_affinity_parent,
.irq_compose_msi_msg= pnv_msi_compose_msg,
 };
-- 
2.31.1



[PATCH v2 18/32] powerpc/xics: Give a name to the default XICS IRQ domain

2021-07-01 Thread Cédric Le Goater
Also clean up the error path.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xics/xics-common.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/sysdev/xics/xics-common.c 
b/arch/powerpc/sysdev/xics/xics-common.c
index 399dd5becf65..419d91bffec3 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -412,11 +412,22 @@ static const struct irq_domain_ops xics_host_ops = {
.xlate = xics_host_xlate,
 };
 
-static void __init xics_init_host(void)
+static int __init xics_allocate_domain(void)
 {
-   xics_host = irq_domain_add_tree(NULL, &xics_host_ops, NULL);
-   BUG_ON(xics_host == NULL);
+   struct fwnode_handle *fn;
+
+   fn = irq_domain_alloc_named_fwnode("XICS");
+   if (!fn)
+   return -ENOMEM;
+
+   xics_host = irq_domain_create_tree(fn, &xics_host_ops, NULL);
+   if (!xics_host) {
+   irq_domain_free_fwnode(fn);
+   return -ENOMEM;
+   }
+
irq_set_default_host(xics_host);
+   return 0;
 }
 
 void __init xics_register_ics(struct ics *ics)
@@ -480,6 +491,8 @@ void __init xics_init(void)
/* Initialize common bits */
xics_get_server_size();
xics_update_irq_servers();
-   xics_init_host();
+   rc = xics_allocate_domain();
+   if (rc < 0)
+   pr_err("XICS: Failed to create IRQ domain");
xics_setup_cpu();
 }
-- 
2.31.1



[PATCH v2 16/32] powerpc/xics: Remove ICS list

2021-07-01 Thread Cédric Le Goater
We always had only one ICS per machine. Simplify the XICS driver by
removing the ICS list.

The ICS stored in the chip data of the XICS domain becomes useless and
we don't need it anymore to migrate away IRQs from a CPU. This will be
removed in a subsequent patch.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xics/xics-common.c | 45 +++---
 1 file changed, 19 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/sysdev/xics/xics-common.c 
b/arch/powerpc/sysdev/xics/xics-common.c
index 7c561a612366..05e5e7d84ca7 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -38,7 +38,7 @@ DEFINE_PER_CPU(struct xics_cppr, xics_cppr);
 
 struct irq_domain *xics_host;
 
-static LIST_HEAD(ics_list);
+static struct ics *xics_ics;
 
 void xics_update_irq_servers(void)
 {
@@ -111,12 +111,11 @@ void xics_setup_cpu(void)
 
 void xics_mask_unknown_vec(unsigned int vec)
 {
-   struct ics *ics;
-
pr_err("Interrupt 0x%x (real) is invalid, disabling it.\n", vec);
 
-   list_for_each_entry(ics, &ics_list, link)
-   ics->mask_unknown(ics, vec);
+   if (WARN_ON(!xics_ics))
+   return;
+   xics_ics->mask_unknown(xics_ics, vec);
 }
 
 
@@ -198,7 +197,6 @@ void xics_migrate_irqs_away(void)
struct irq_chip *chip;
long server;
unsigned long flags;
-   struct ics *ics;
 
/* We can't set affinity on ISA interrupts */
if (virq < NUM_ISA_INTERRUPTS)
@@ -219,13 +217,10 @@ void xics_migrate_irqs_away(void)
raw_spin_lock_irqsave(&desc->lock, flags);
 
/* Locate interrupt server */
-   server = -1;
-   ics = irq_desc_get_chip_data(desc);
-   if (ics)
-   server = ics->get_server(ics, irq);
+   server = xics_ics->get_server(xics_ics, irq);
if (server < 0) {
-   printk(KERN_ERR "%s: Can't find server for irq %d\n",
-  __func__, irq);
+   pr_err("%s: Can't find server for irq %d/%x\n",
+  __func__, virq, irq);
goto unlock;
}
 
@@ -307,13 +302,9 @@ int xics_get_irq_server(unsigned int virq, const struct 
cpumask *cpumask,
 static int xics_host_match(struct irq_domain *h, struct device_node *node,
   enum irq_domain_bus_token bus_token)
 {
-   struct ics *ics;
-
-   list_for_each_entry(ics, &ics_list, link)
-   if (ics->host_match(ics, node))
-   return 1;
-
-   return 0;
+   if (WARN_ON(!xics_ics))
+   return 0;
+   return xics_ics->host_match(xics_ics, node) ? 1 : 0;
 }
 
 /* Dummies */
@@ -330,8 +321,6 @@ static struct irq_chip xics_ipi_chip = {
 static int xics_host_map(struct irq_domain *h, unsigned int virq,
 irq_hw_number_t hw)
 {
-   struct ics *ics;
-
pr_devel("xics: map virq %d, hwirq 0x%lx\n", virq, hw);
 
/*
@@ -348,12 +337,14 @@ static int xics_host_map(struct irq_domain *h, unsigned 
int virq,
return 0;
}
 
+   if (WARN_ON(!xics_ics))
+   return -EINVAL;
+
/* Let the ICS setup the chip data */
-   list_for_each_entry(ics, &ics_list, link)
-   if (ics->map(ics, virq) == 0)
-   return 0;
+   if (xics_ics->map(xics_ics, virq))
+   return -EINVAL;
 
-   return -EINVAL;
+   return 0;
 }
 
 static int xics_host_xlate(struct irq_domain *h, struct device_node *ct,
@@ -427,7 +418,9 @@ static void __init xics_init_host(void)
 
 void __init xics_register_ics(struct ics *ics)
 {
-   list_add(&ics->link, &ics_list);
+   if (WARN_ONCE(xics_ics, "XICS: Source Controller is already defined !"))
+   return;
+   xics_ics = ics;
 }
 
 static void __init xics_get_server_size(void)
-- 
2.31.1



[PATCH v2 02/32] powerpc/pseries/pci: Introduce rtas_prepare_msi_irqs()

2021-07-01 Thread Cédric Le Goater
This splits the routine setting the MSIs in two parts: allocation of
MSIs for the PCI device at the FW level (RTAS) and the actual mapping
and activation of the IRQs.

rtas_prepare_msi_irqs() will serve as a handler for the PCI MSI domain.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/pseries/msi.c | 23 +++
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index d2d090e04745..4bf14f27e1aa 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -373,12 +373,11 @@ static void rtas_hack_32bit_msi_gen2(struct pci_dev *pdev)
pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 0);
 }
 
-static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
+static int rtas_prepare_msi_irqs(struct pci_dev *pdev, int nvec_in, int type,
+msi_alloc_info_t *arg)
 {
struct pci_dn *pdn;
-   int hwirq, virq, i, quota, rc;
-   struct msi_desc *entry;
-   struct msi_msg msg;
+   int quota, rc;
int nvec = nvec_in;
int use_32bit_msi_hack = 0;
 
@@ -456,6 +455,22 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int 
nvec_in, int type)
return rc;
}
 
+   return 0;
+}
+
+static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
+{
+   struct pci_dn *pdn;
+   int hwirq, virq, i;
+   int rc;
+   struct msi_desc *entry;
+   struct msi_msg msg;
+
+   rc = rtas_prepare_msi_irqs(pdev, nvec_in, type, NULL);
+   if (rc)
+   return rc;
+
+   pdn = pci_get_pdn(pdev);
i = 0;
for_each_pci_msi_entry(entry, pdev) {
hwirq = rtas_query_irq_number(pdn, i++);
-- 
2.31.1



[PATCH v2 13/32] KVM: PPC: Book3S HV: Use the new IRQ chip to detect passthrough interrupts

2021-07-01 Thread Cédric Le Goater
Passthrough PCI MSI interrupts are detected in KVM with a check on a
specific EOI handler (P8) or on XIVE (P9). We can now check the
PCI-MSI IRQ chip which is cleaner.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/kvm/book3s_hv.c  | 2 +-
 arch/powerpc/platforms/powernv/pci-ioda.c | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index dd39b5373075..048b4ca55cfe 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5260,7 +5260,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
 * what our real-mode EOI code does, or a XIVE interrupt
 */
chip = irq_data_get_irq_chip(>irq_data);
-   if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) {
+   if (!chip || !is_pnv_opal_msi(chip)) {
pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map 
for (%d,%d)\n",
host_irq, guest_gsi);
mutex_unlock(>lock);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index d2a17fcb6002..e77caa4dbbdf 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2007,13 +2007,15 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned 
int virq)
irq_set_chip(virq, &phb->ioda.irq_chip);
 }
 
+static struct irq_chip pnv_pci_msi_irq_chip;
+
 /*
  * Returns true iff chip is something that we could call
  * pnv_opal_pci_msi_eoi for.
  */
 bool is_pnv_opal_msi(struct irq_chip *chip)
 {
-   return chip->irq_eoi == pnv_ioda2_msi_eoi;
+   return chip->irq_eoi == pnv_ioda2_msi_eoi || chip == 
&pnv_pci_msi_irq_chip;
 }
 EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
 
-- 
2.31.1



[PATCH v2 12/32] powerpc/powernv/pci: Add MSI domains

2021-07-01 Thread Cédric Le Goater
This is very similar to the MSI domains of the pSeries platform. The
MSI allocator is directly handled under the Linux PHB in the
in-the-middle "PNV-MSI" domain.

Only the XIVE (P9/P10) parent domain is supported for now. Support for
XICS will come later.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 188 ++
 1 file changed, 188 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 2922674cc934..d2a17fcb6002 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -2100,6 +2101,189 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, 
struct pci_dev *dev,
return 0;
 }
 
+/*
+ * The msi_free() op is called before irq_domain_free_irqs_top() when
+ * the handler data is still available. Use that to clear the XIVE
+ * controller.
+ */
+static void pnv_msi_ops_msi_free(struct irq_domain *domain,
+struct msi_domain_info *info,
+unsigned int irq)
+{
+   if (xive_enabled())
+   xive_irq_free_data(irq);
+}
+
+static struct msi_domain_ops pnv_pci_msi_domain_ops = {
+   .msi_free   = pnv_msi_ops_msi_free,
+};
+
+static void pnv_msi_shutdown(struct irq_data *d)
+{
+   d = d->parent_data;
+   if (d->chip->irq_shutdown)
+   d->chip->irq_shutdown(d);
+}
+
+static void pnv_msi_mask(struct irq_data *d)
+{
+   pci_msi_mask_irq(d);
+   irq_chip_mask_parent(d);
+}
+
+static void pnv_msi_unmask(struct irq_data *d)
+{
+   pci_msi_unmask_irq(d);
+   irq_chip_unmask_parent(d);
+}
+
+static struct irq_chip pnv_pci_msi_irq_chip = {
+   .name   = "PNV-PCI-MSI",
+   .irq_shutdown   = pnv_msi_shutdown,
+   .irq_mask   = pnv_msi_mask,
+   .irq_unmask = pnv_msi_unmask,
+   .irq_eoi= irq_chip_eoi_parent,
+};
+
+static struct msi_domain_info pnv_msi_domain_info = {
+   .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_MULTI_PCI_MSI  | MSI_FLAG_PCI_MSIX),
+   .ops   = &pnv_pci_msi_domain_ops,
+   .chip  = &pnv_pci_msi_irq_chip,
+};
+
+static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg)
+{
+   struct msi_desc *entry = irq_data_get_msi_desc(d);
+   struct pci_dev *pdev = msi_desc_to_pci_dev(entry);
+   struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+   struct pnv_phb *phb = hose->private_data;
+   int rc;
+
+   rc = __pnv_pci_ioda_msi_setup(phb, pdev, d->hwirq,
+ entry->msi_attrib.is_64, msg);
+   if (rc)
+   dev_err(&pdev->dev, "Failed to setup %s-bit MSI #%ld : %d\n",
+   entry->msi_attrib.is_64 ? "64" : "32", d->hwirq, rc);
+}
+
+static struct irq_chip pnv_msi_irq_chip = {
+   .name   = "PNV-MSI",
+   .irq_shutdown   = pnv_msi_shutdown,
+   .irq_mask   = irq_chip_mask_parent,
+   .irq_unmask = irq_chip_unmask_parent,
+   .irq_eoi= irq_chip_eoi_parent,
+   .irq_set_affinity   = irq_chip_set_affinity_parent,
+   .irq_compose_msi_msg= pnv_msi_compose_msg,
+};
+
+static int pnv_irq_parent_domain_alloc(struct irq_domain *domain,
+  unsigned int virq, int hwirq)
+{
+   struct irq_fwspec parent_fwspec;
+   int ret;
+
+   parent_fwspec.fwnode = domain->parent->fwnode;
+   parent_fwspec.param_count = 2;
+   parent_fwspec.param[0] = hwirq;
+   parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING;
+
+   ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec);
+   if (ret)
+   return ret;
+
+   return 0;
+}
+
+static int pnv_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+   unsigned int nr_irqs, void *arg)
+{
+   struct pci_controller *hose = domain->host_data;
+   struct pnv_phb *phb = hose->private_data;
+   msi_alloc_info_t *info = arg;
+   struct pci_dev *pdev = msi_desc_to_pci_dev(info->desc);
+   int hwirq;
+   int i, ret;
+
+   hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, nr_irqs);
+   if (hwirq < 0) {
+   dev_warn(&pdev->dev, "failed to find a free MSI\n");
+   return -ENOSPC;
+   }
+
+   dev_dbg(&pdev->dev, "%s bridge %pOF %d/%x #%d\n", __func__,
+   hose->dn, virq, hwirq, nr_irqs);
+
+   for (i = 0; i < nr_irqs; i++) {
+   ret = pnv_irq_parent_domain_alloc(domain, virq + i,
+ phb->msi_base + hwirq + i);
+   if (ret)
+   got

[PATCH v2 11/32] powerpc/powernv/pci: Introduce __pnv_pci_ioda_msi_setup()

2021-07-01 Thread Cédric Le Goater
It will be used as a 'compose_msg' handler of the MSI domain introduced
later.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 28 +++
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 7de464679292..2922674cc934 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2016,15 +2016,17 @@ bool is_pnv_opal_msi(struct irq_chip *chip)
 }
 EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
 
-static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
- unsigned int hwirq, unsigned int virq,
- unsigned int is_64, struct msi_msg *msg)
+static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+   unsigned int xive_num,
+   unsigned int is_64, struct msi_msg *msg)
 {
struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
-   unsigned int xive_num = hwirq - phb->msi_base;
__be32 data;
int rc;
 
+   dev_dbg(&dev->dev, "%s: setup %s-bit MSI for vector #%d\n", __func__,
+   is_64 ? "64" : "32", xive_num);
+
/* No PE assigned ? bail out ... no MSI for you ! */
if (pe == NULL)
return -ENXIO;
@@ -2072,12 +2074,28 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, 
struct pci_dev *dev,
}
msg->data = be32_to_cpu(data);
 
+   return 0;
+}
+
+static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+ unsigned int hwirq, unsigned int virq,
+ unsigned int is_64, struct msi_msg *msg)
+{
+   struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
+   unsigned int xive_num = hwirq - phb->msi_base;
+   int rc;
+
+   rc = __pnv_pci_ioda_msi_setup(phb, dev, xive_num, is_64, msg);
+   if (rc)
+   return rc;
+
+   /* P8 only */
pnv_set_msi_irq_chip(phb, virq);
 
pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
 " address=%x_%08x data=%x PE# %x\n",
 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
-msg->address_hi, msg->address_lo, data, pe->pe_number);
+msg->address_hi, msg->address_lo, msg->data, pe->pe_number);
 
return 0;
 }
-- 
2.31.1



[PATCH v2 01/32] powerpc/pseries/pci: Introduce __find_pe_total_msi()

2021-07-01 Thread Cédric Le Goater
It will help to size the PCI MSI domain.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/pseries/msi.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index 637300330507..d2d090e04745 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -164,12 +164,12 @@ static int check_req_msix(struct pci_dev *pdev, int nvec)
 
 /* Quota calculation */
 
-static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
+static struct device_node *__find_pe_total_msi(struct device_node *node, int 
*total)
 {
struct device_node *dn;
const __be32 *p;
 
-   dn = of_node_get(pci_device_to_OF_node(dev));
+   dn = of_node_get(node);
while (dn) {
p = of_get_property(dn, "ibm,pe-total-#msi", NULL);
if (p) {
@@ -185,6 +185,11 @@ static struct device_node *find_pe_total_msi(struct 
pci_dev *dev, int *total)
return NULL;
 }
 
+static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
+{
+   return __find_pe_total_msi(pci_device_to_OF_node(dev), total);
+}
+
 static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
 {
struct device_node *dn;
-- 
2.31.1



[PATCH v2 08/32] powerpc/pseries/pci: Add a domain_free_irqs() handler

2021-07-01 Thread Cédric Le Goater
The RTAS firmware can not disable one MSI at a time. It's all or
nothing. We need a custom free IRQ handler for that.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/pseries/msi.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index 86c6809ebac2..591cee9cbc9e 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -529,8 +529,24 @@ static int pseries_msi_ops_prepare(struct irq_domain 
*domain, struct device *dev
return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
 }
 
+/*
+ * RTAS can not disable one MSI at a time. It's all or nothing. Do it
+ * at the end after all IRQs have been freed.
+ */
+static void pseries_msi_domain_free_irqs(struct irq_domain *domain,
+struct device *dev)
+{
+   if (WARN_ON_ONCE(!dev_is_pci(dev)))
+   return;
+
+   __msi_domain_free_irqs(domain, dev);
+
+   rtas_disable_msi(to_pci_dev(dev));
+}
+
 static struct msi_domain_ops pseries_pci_msi_domain_ops = {
.msi_prepare= pseries_msi_ops_prepare,
+   .domain_free_irqs = pseries_msi_domain_free_irqs,
 };
 
 static void pseries_msi_shutdown(struct irq_data *d)
-- 
2.31.1



[PATCH v2 00/32] powerpc: Add MSI IRQ domains to PCI drivers

2021-07-01 Thread Cédric Le Goater
Hello,

This series adds support for MSI IRQ domains on top of the XICS (P8)
and XIVE (P9/P10) IRQ domains for the PowerNV (baremetal) and pSeries
(VM) platforms. It should simplify and improve IRQ affinity of PCI
MSIs under these PowerPC platforms, especially for drivers distributing
multiple RX/TX queues on the different CPUs of the system.

Data locality can still be improved with an interrupt controller node
per chip but this requires FW changes. It could be done under OPAL.

The patchset has a large impact but it is well contained under the MSI
support. Initial tests were done on the P8, P9 and P10 PowerNV and
pSeries platforms, under the KVM and PowerVM hypervisor. PCI passthrough
was tested on P8/KVM, P9/KVM and P9/pVM with both interrupt modes.

P8 passthrough has some optimization to EOI MSIs when under real mode :

 e3c13e56a471 ("KVM: PPC: Book3S HV: Handle passthrough interrupts in guest")
 5d375199ea96 ("KVM: PPC: Book3S HV: Set server for passed-through interrupts")

They give us a ~10% bandwidth improvement on some 100G adapters
(Thanks Alexey), so it's good to keep but they require access to the
low level IRQ domain of the machine. It should be possible to rework
the code and use the MSI IRQ domains instead but for now, it's simpler
to keep the bypass. That can come later.

The P8/CAPI driver is also impacted. Tests were done on a Firestone
system with a memory AFU.

Thanks,

C.

Changes since v1 :

 - Included some CONFIG_IRQ_DOMAIN_HIERARCHY ifdefs
 - Microwatt fixes for ICS native
 - Removed irqd_is_started() check when setting the affinity

Cédric Le Goater (32):
  powerpc/pseries/pci: Introduce __find_pe_total_msi()
  powerpc/pseries/pci: Introduce rtas_prepare_msi_irqs()
  powerpc/xive: Add support for IRQ domain hierarchy
  powerpc/xive: Ease debugging of xive_irq_set_affinity()
  powerpc/pseries/pci: Add MSI domains
  powerpc/xive: Drop unmask of MSIs at startup
  powerpc/xive: Remove irqd_is_started() check when setting the affinity
  powerpc/pseries/pci: Add a domain_free_irqs() handler
  powerpc/pseries/pci: Add a msi_free() handler to clear XIVE data
  powerpc/pseries/pci: Add support of MSI domains to PHB hotplug
  powerpc/powernv/pci: Introduce __pnv_pci_ioda_msi_setup()
  powerpc/powernv/pci: Add MSI domains
  KVM: PPC: Book3S HV: Use the new IRQ chip to detect passthrough
interrupts
  KVM: PPC: Book3S HV: XIVE: Change interface of passthrough interrupt
routines
  KVM: PPC: Book3S HV: XIVE: Fix mapping of passthrough interrupts
  powerpc/xics: Remove ICS list
  powerpc/xics: Rename the map handler in a check handler
  powerpc/xics: Give a name to the default XICS IRQ domain
  powerpc/xics: Add debug logging to the set_irq_affinity handlers
  powerpc/xics: Add support for IRQ domain hierarchy
  powerpc/powernv/pci: Customize the MSI EOI handler to support PHB3
  powerpc/pci: Drop XIVE restriction on MSI domains
  powerpc/xics: Drop unmask of MSIs at startup
  powerpc/pseries/pci: Drop unused MSI code
  powerpc/powernv/pci: Drop unused MSI code
  powerpc/powernv/pci: Adapt is_pnv_opal_msi() to detect passthrough
interrupt
  powerpc/xics: Fix IRQ migration
  powerpc/powernv/pci: Set the IRQ chip data for P8/CXL devices
  powerpc/powernv/pci: Rework pnv_opal_pci_msi_eoi()
  KVM: PPC: Book3S HV: XICS: Fix mapping of passthrough interrupts
  powerpc/xive: Use XIVE domain under xmon and debugfs
  genirq: Improve "hwirq" output in /proc and /sys/

 arch/powerpc/include/asm/kvm_ppc.h |   4 +-
 arch/powerpc/include/asm/pci-bridge.h  |   5 +
 arch/powerpc/include/asm/pnv-pci.h |   2 +-
 arch/powerpc/include/asm/xics.h|   3 +-
 arch/powerpc/include/asm/xive.h|   1 +
 arch/powerpc/platforms/powernv/pci.h   |   6 -
 arch/powerpc/platforms/pseries/pseries.h   |   2 +
 arch/powerpc/kernel/pci-common.c   |   6 +
 arch/powerpc/kvm/book3s_hv.c   |  18 +-
 arch/powerpc/kvm/book3s_hv_rm_xics.c   |   8 +-
 arch/powerpc/kvm/book3s_xive.c |  18 +-
 arch/powerpc/platforms/powernv/pci-ioda.c  | 256 --
 arch/powerpc/platforms/powernv/pci.c   |  67 -
 arch/powerpc/platforms/pseries/msi.c   | 296 -
 arch/powerpc/platforms/pseries/pci_dlpar.c |   4 +
 arch/powerpc/platforms/pseries/setup.c |   2 +
 arch/powerpc/sysdev/xics/ics-native.c  |  13 +-
 arch/powerpc/sysdev/xics/ics-opal.c|  40 +--
 arch/powerpc/sysdev/xics/ics-rtas.c|  40 +--
 arch/powerpc/sysdev/xics/xics-common.c | 129 ++---
 arch/powerpc/sysdev/xive/common.c  |  98 +--
 kernel/irq/irqdesc.c   |   2 +-
 kernel/irq/irqdomain.c |   1 +
 kernel/irq/proc.c  |   2 +-
 24 files changed, 710 insertions(+), 313 deletions(-)

-- 
2.31.1



[PATCH v2 05/32] powerpc/pseries/pci: Add MSI domains

2021-07-01 Thread Cédric Le Goater
Two IRQ domains are added on top of the default machine IRQ domain.

First, the top level "pSeries-PCI-MSI" domain deals with the MSI
specificities. In this domain, the HW IRQ numbers are generated by the
PCI MSI layer, they compose a unique ID for an MSI source with the PCI
device identifier and the MSI vector number.

These numbers can be quite large on a pSeries machine running under
the IBM Hypervisor and /sys/kernel/irq/ and /proc/interrupts will
require small fixes to show them correctly.

The second domain is the in-the-middle "pSeries-MSI" domain which acts as
a proxy between the PCI MSI subsystem and the machine IRQ subsystem.
It usually allocates the MSI vector numbers but, on pSeries machines,
this is done by the RTAS FW and RTAS returns IRQ numbers in the IRQ
number space of the machine. This is why the in-the-middle "pSeries-MSI"
domain has the same HW IRQ numbers as its parent domain.

Only the XIVE (P9/P10) parent domain is supported for now. We still
need to add support for IRQ domain hierarchy under XICS.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/pci-bridge.h|   5 +
 arch/powerpc/platforms/pseries/pseries.h |   1 +
 arch/powerpc/kernel/pci-common.c |   6 +
 arch/powerpc/platforms/pseries/msi.c | 185 +++
 arch/powerpc/platforms/pseries/setup.c   |   2 +
 5 files changed, 199 insertions(+)

diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 74424c14515c..90f488fa4c17 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -126,6 +126,11 @@ struct pci_controller {
 #endif /* CONFIG_PPC64 */
 
void *private_data;
+
+   /* IRQ domain hierarchy */
+   struct irq_domain   *dev_domain;
+   struct irq_domain   *msi_domain;
+   struct fwnode_handle*fwnode;
 };
 
 /* These are used for config access before all the PCI probing
diff --git a/arch/powerpc/platforms/pseries/pseries.h 
b/arch/powerpc/platforms/pseries/pseries.h
index 1f051a786fb3..d9280262588b 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -85,6 +85,7 @@ struct pci_host_bridge;
 int pseries_root_bridge_prepare(struct pci_host_bridge *bridge);
 
 extern struct pci_controller_ops pseries_pci_controller_ops;
+int pseries_msi_allocate_domains(struct pci_controller *phb);
 
 unsigned long pseries_memory_block_size(void);
 
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 001e90cd8948..c3573430919d 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1060,11 +1061,16 @@ void pcibios_bus_add_device(struct pci_dev *dev)
 
 int pcibios_add_device(struct pci_dev *dev)
 {
+   struct irq_domain *d;
+
 #ifdef CONFIG_PCI_IOV
if (ppc_md.pcibios_fixup_sriov)
ppc_md.pcibios_fixup_sriov(dev);
 #endif /* CONFIG_PCI_IOV */
 
+   d = dev_get_msi_domain(&dev->bus->dev);
+   if (d)
+   dev_set_msi_domain(&dev->dev, d);
return 0;
 }
 
diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index 4bf14f27e1aa..86c6809ebac2 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "pseries.h"
 
@@ -518,6 +519,190 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int 
nvec_in, int type)
return 0;
 }
 
+static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device 
*dev,
+  int nvec, msi_alloc_info_t *arg)
+{
+   struct pci_dev *pdev = to_pci_dev(dev);
+   struct msi_desc *desc = first_pci_msi_entry(pdev);
+   int type = desc->msi_attrib.is_msix ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
+
+   return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
+}
+
+static struct msi_domain_ops pseries_pci_msi_domain_ops = {
+   .msi_prepare= pseries_msi_ops_prepare,
+};
+
+static void pseries_msi_shutdown(struct irq_data *d)
+{
+   d = d->parent_data;
+   if (d->chip->irq_shutdown)
+   d->chip->irq_shutdown(d);
+}
+
+static void pseries_msi_mask(struct irq_data *d)
+{
+   pci_msi_mask_irq(d);
+   irq_chip_mask_parent(d);
+}
+
+static void pseries_msi_unmask(struct irq_data *d)
+{
+   pci_msi_unmask_irq(d);
+   irq_chip_unmask_parent(d);
+}
+
+static struct irq_chip pseries_pci_msi_irq_chip = {
+   .name   = "pSeries-PCI-MSI",
+   .irq_shutdown   = pseries_msi_shutdown,
+   .irq_mask   = pseries_msi_mask,
+   .irq_unmask = pseries_msi_unmask,
+   .irq_eoi= irq_chip_eoi_parent,
+};
+
+static struct msi_domain_info pseries_msi_domain_info = {
+   .flags = (MSI_FLAG_USE_DEF_D

[PATCH v2 04/32] powerpc/xive: Ease debugging of xive_irq_set_affinity()

2021-07-01 Thread Cédric Le Goater
pr_debug() is easier to activate and it helps to know how the kernel
configures the HW when tweaking the IRQ subsystem.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 834f1a378fc2..2c907a4a2b05 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -723,7 +723,7 @@ static int xive_irq_set_affinity(struct irq_data *d,
u32 target, old_target;
int rc = 0;
 
-   pr_devel("xive_irq_set_affinity: irq %d\n", d->irq);
+   pr_debug("%s: irq %d/%x\n", __func__, d->irq, hw_irq);
 
/* Is this valid ? */
if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids)
@@ -768,7 +768,7 @@ static int xive_irq_set_affinity(struct irq_data *d,
return rc;
}
 
-   pr_devel("  target: 0x%x\n", target);
+   pr_debug("  target: 0x%x\n", target);
xd->target = target;
 
/* Give up previous target */
-- 
2.31.1



[PATCH v2 07/32] powerpc/xive: Remove irqd_is_started() check when setting the affinity

2021-07-01 Thread Cédric Le Goater
In the early days of XIVE support, commit cffb717ceb8e ("powerpc/xive:
Ensure active irqd when setting affinity") tried to fix an issue
related to interrupt migration. If the root cause was related to CPU
unplug, it should have been fixed and there is no reason to keep the
irqd_is_started() check. This test is also breaking affinity setting
of MSIs which can be set before starting the associated IRQ.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index a03057bfccfd..38183c9b21c0 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -719,10 +719,6 @@ static int xive_irq_set_affinity(struct irq_data *d,
if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids)
return -EINVAL;
 
-   /* Don't do anything if the interrupt isn't started */
-   if (!irqd_is_started(d))
-   return IRQ_SET_MASK_OK;
-
/*
 * If existing target is already in the new mask, and is
 * online then do nothing.
-- 
2.31.1



[PATCH] powerpc/xive: Do not skip CPU-less nodes when creating the IPIs

2021-06-29 Thread Cédric Le Goater
On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at
runtime. Today, the IPI is not created for such nodes, and hot-plugged
CPUs use a bogus IPI, which leads to soft lockups.

We could create the node IPI on demand but it is a bit complex because
this code would be called under bringup_cpu() and some IRQ locking is
being done. The simplest solution is to create the IPIs for all nodes
at startup.

Fixes: 7dcc37b3eff9 ("powerpc/xive: Map one IPI interrupt per node")
Cc: sta...@vger.kernel.org # v5.13
Reported-by: Geetika Moolchandani 
Cc: Srikar Dronamraju 
Signed-off-by: Cédric Le Goater 
---

This patch breaks old versions of irqbalance (<= v1.4). Possible nodes
are collected from /sys/devices/system/node/ but CPU-less nodes are
not listed there. When interrupts are scanned, the link representing
the node structure is NULL and a segfault occurs.

Version 1.7 seems immune. 

---
 arch/powerpc/sysdev/xive/common.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index f3b16ed48b05..5d2c58dba57e 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1143,10 +1143,6 @@ static int __init xive_request_ipi(void)
struct xive_ipi_desc *xid = _ipis[node];
struct xive_ipi_alloc_info info = { node };
 
-   /* Skip nodes without CPUs */
-   if (cpumask_empty(cpumask_of_node(node)))
-   continue;
-
/*
 * Map one IPI interrupt per node for all cpus of that node.
 * Since the HW interrupt number doesn't have any meaning,
-- 
2.31.1



Re: [PATCH 07/31] powerpc/xive: Fix xive_irq_set_affinity for MSI

2021-05-20 Thread Cédric Le Goater
On 5/14/21 10:48 PM, Thomas Gleixner wrote:
> On Fri, Apr 30 2021 at 10:03, Cédric Le Goater wrote:
>> The MSI affinity is automanaged and it can be set before starting the
>> associated IRQ.
>>
>> ( Should we simply remove the irqd_is_started() test ? )
> 
> If the hardware can handle it properly.
> 
> But see:
> 
>   cffb717ceb8e ("powerpc/xive: Ensure active irqd when setting affinity")

Thanks for digging. That's a patch from the early days of XIVE support. 

> which introduced that condition. It mutters something about migration of
> shutdown interrupts:
> 
>[  123.053037264,3] XIVE[ IC 00  ] ISN 2 lead to invalid IVE !

The XIVE driver in OPAL is complaining.

Linux is trying to configure the target of HW IRQ number 2 but OPAL refuses
because it's invalid. The first 16 are reserved (like on Linux).

So it's another problem. 2 could be a value from an "interrupts" property,
giving the INTx number assigned to a PCI device or an OPAL event IRQ 
number leaked into the XIVE domain. Given the low Linux IRQ number that 
might be the latter. 

>[   77.885859] xive: Error -6 reconfiguring irq 17
>[   77.885862] IRQ17: set affinity failed(-6).
> 
> Not that I can decode that :)

A device name would help but you have guessed most of it ;)

> 
> Non-managed interrupts have the sequence:
> 
>   startup()
>   set_affinity()
> 
> which is historical and an earlier attempt to flip it caused havoc in
> some places.
> 
> With managed we needed to make sure that the affinity is set correctly
> right at start. So it needs to be done the other way round and it turned
> out that for MSI this works.
> 
> I have no idea, whether that might make the above issue reappear or
> not. If so, then we need some extra state to make it work.
> 
> The root cause which triggered the problem got fixed, so there should be
> no issue _if_ this was specifically related to that CPU unplug case.

I would vote for this option. I will simply remove the irqd_is_started() 
test which looks bogus and do some extra tests on all platforms.

Thanks,

C.


 
>> diff --git a/arch/powerpc/sysdev/xive/common.c 
>> b/arch/powerpc/sysdev/xive/common.c
>> index 96737938e8e3..3485baf9ec8c 100644
>> --- a/arch/powerpc/sysdev/xive/common.c
>> +++ b/arch/powerpc/sysdev/xive/common.c
>> @@ -710,7 +710,7 @@ static int xive_irq_set_affinity(struct irq_data *d,
>>  return -EINVAL;
>>  
>>  /* Don't do anything if the interrupt isn't started */
>> -if (!irqd_is_started(d))
>> +if (!irqd_is_started(d) && !irqd_affinity_is_managed(d))
>>  return IRQ_SET_MASK_OK;
>>  
>>  /*
> 
> Thanks,
> 
> tglx
> 



Re: [PATCH 31/31] genirq: Improve "hwirq" output in /proc and /sys/

2021-05-20 Thread Cédric Le Goater
On 5/14/21 10:49 PM, Thomas Gleixner wrote:
> On Fri, Apr 30 2021 at 10:04, Cédric Le Goater wrote:
>> The HW IRQ numbers generated by the PCI MSI layer can be quite large
>> on a pSeries machine when running under the IBM Hypervisor and they
>> appear as negative. Use '%u' to show them correctly.
>>
>> Cc: Thomas Gleixner 
>> Signed-off-by: Cédric Le Goater 
>> ---
>>  kernel/irq/irqdesc.c | 2 +-
>>  kernel/irq/proc.c| 2 +-
>>  2 files changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
>> index cc1a09406c6e..85054eb2ae51 100644
>> --- a/kernel/irq/irqdesc.c
>> +++ b/kernel/irq/irqdesc.c
>> @@ -188,7 +188,7 @@ static ssize_t hwirq_show(struct kobject *kobj,
>>  
>>  raw_spin_lock_irq(&desc->lock);
>>  if (desc->irq_data.domain)
>> -ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq);
>> +ret = sprintf(buf, "%u\n", (int)desc->irq_data.hwirq);
> 
> Which makes the (int) cast pointless, right?

Well, hwirq is a long. Would you prefer a "%lu" for both ?

Thanks,

C.

> 
>>  raw_spin_unlock_irq(&desc->lock);
>>  
>>  return ret;
>> diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
>> index 98138788cb04..e2392f05da04 100644
>> --- a/kernel/irq/proc.c
>> +++ b/kernel/irq/proc.c
>> @@ -513,7 +513,7 @@ int show_interrupts(struct seq_file *p, void *v)
>>  seq_printf(p, " %8s", "None");
>>  }
>>  if (desc->irq_data.domain)
>> -seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
>> +seq_printf(p, " %*u", prec, (int)desc->irq_data.hwirq);
> 
> ditto.
> 
> Thanks,
> 
> tglx
> 



Re: [PATCH 09/31] powerpc/pseries/pci: Add a msi_free() handler to clear XIVE data

2021-05-20 Thread Cédric Le Goater
Adding Marc.

On 4/30/21 10:03 AM, Cédric Le Goater wrote:
> The MSI domain clears the IRQ with msi_domain_free(), which calls
> irq_domain_free_irqs_top(), which clears the handler data. This is a
> problem for the XIVE controller since we need to unmap MMIO pages and
> free a specific XIVE structure.
> 
> The 'msi_free()' handler is called before irq_domain_free_irqs_top()
> when the handler data is still available. Use that to clear the XIVE
> controller data.
This feels like a clumsy way of doing so. 

irq_domain_free_irqs_parent() would be my preferred way to clear the 
lowlevel handler data but we can't today. Could there be a smarter way ?

Thanks,

C.


> Cc: Thomas Gleixner 
> Signed-off-by: Cédric Le Goater 
>
> ---
>  arch/powerpc/include/asm/xive.h  |  1 +
>  arch/powerpc/platforms/pseries/msi.c | 16 +++-
>  arch/powerpc/sysdev/xive/common.c|  5 -
>  3 files changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
> index aa094a8655b0..20ae50ab083c 100644
> --- a/arch/powerpc/include/asm/xive.h
> +++ b/arch/powerpc/include/asm/xive.h
> @@ -111,6 +111,7 @@ void xive_native_free_vp_block(u32 vp_base);
>  int xive_native_populate_irq_data(u32 hw_irq,
> struct xive_irq_data *data);
>  void xive_cleanup_irq_data(struct xive_irq_data *xd);
> +void xive_irq_free_data(unsigned int virq);
>  void xive_native_free_irq(u32 irq);
>  int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
>  
> diff --git a/arch/powerpc/platforms/pseries/msi.c 
> b/arch/powerpc/platforms/pseries/msi.c
> index a41c448520d4..da9d63a088bb 100644
> --- a/arch/powerpc/platforms/pseries/msi.c
> +++ b/arch/powerpc/platforms/pseries/msi.c
> @@ -529,6 +529,19 @@ static int pseries_msi_ops_prepare(struct irq_domain 
> *domain, struct device *dev
>   return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
>  }
>  
> +/*
> + * ->msi_free() is called before irq_domain_free_irqs_top() when the
> + * handler data is still available. Use that to clear the XIVE
> + * controller data.
> + */
> +static void pseries_msi_ops_msi_free(struct irq_domain *domain,
> +  struct msi_domain_info *info,
> +  unsigned int irq)
> +{
> + if (xive_enabled())
> + xive_irq_free_data(irq);
> +}
> +
>  /*
>   * RTAS can not disable one MSI at a time. It's all or nothing. Do it
>   * at the end after all IRQs have been freed.
> @@ -546,6 +559,7 @@ static void pseries_msi_domain_free_irqs(struct 
> irq_domain *domain,
>  
>  static struct msi_domain_ops pseries_pci_msi_domain_ops = {
>   .msi_prepare= pseries_msi_ops_prepare,
> + .msi_free   = pseries_msi_ops_msi_free,
>   .domain_free_irqs = pseries_msi_domain_free_irqs,
>  };
>  
> @@ -660,7 +674,7 @@ static void pseries_irq_domain_free(struct irq_domain 
> *domain, unsigned int virq
>  
>   pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
>  
> - irq_domain_free_irqs_parent(domain, virq, nr_irqs);
> + /* XIVE domain data is cleared through ->msi_free() */
>  }
>  
>  static const struct irq_domain_ops pseries_irq_domain_ops = {
> diff --git a/arch/powerpc/sysdev/xive/common.c 
> b/arch/powerpc/sysdev/xive/common.c
> index 3485baf9ec8c..191cd80ec534 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -980,6 +980,8 @@ EXPORT_SYMBOL_GPL(is_xive_irq);
>  
>  void xive_cleanup_irq_data(struct xive_irq_data *xd)
>  {
> + pr_debug("%s for HW %x\n", __func__, xd->hw_irq);
> +
>   if (xd->eoi_mmio) {
>   unmap_kernel_range((unsigned long)xd->eoi_mmio,
>  1u << xd->esb_shift);
> @@ -1025,7 +1027,7 @@ static int xive_irq_alloc_data(unsigned int virq, 
> irq_hw_number_t hw)
>   return 0;
>  }
>  
> -static void xive_irq_free_data(unsigned int virq)
> +void xive_irq_free_data(unsigned int virq)
>  {
>   struct xive_irq_data *xd = irq_get_handler_data(virq);
>  
> @@ -1035,6 +1037,7 @@ static void xive_irq_free_data(unsigned int virq)
>   xive_cleanup_irq_data(xd);
>   kfree(xd);
>  }
> +EXPORT_SYMBOL_GPL(xive_irq_free_data);
>  
>  #ifdef CONFIG_SMP
>  
> 



Re: [PATCH 15/31] KVM: PPC: Book3S HV: XIVE: Fix mapping of passthrough interrupts

2021-05-20 Thread Cédric Le Goater
On 5/15/21 12:40 PM, Marc Zyngier wrote:
> On Fri, 14 May 2021 21:51:51 +0100,
> Thomas Gleixner  wrote:
>>
>> On Fri, Apr 30 2021 at 10:03, Cédric Le Goater wrote:
>>
>> CC: +Marc
> 
> Thanks Thomas.
> 
>>
>>> PCI MSI interrupt numbers are now mapped in a PCI-MSI domain but the
>>> underlying calls handling the passthrough of the interrupt in the
>>> guest need a number in the XIVE IRQ domain.
>>>
>>> Use the IRQ data mapped in the XIVE IRQ domain and not the one in the
>>> PCI-MSI domain.
>>>
>>> Exporting irq_get_default_host() might not be the best solution.
>>>
>>> Cc: Thomas Gleixner 
>>> Cc: Paul Mackerras 
>>> Signed-off-by: Cédric Le Goater 
>>> ---
>>>  arch/powerpc/kvm/book3s_xive.c | 3 ++-
>>>  kernel/irq/irqdomain.c | 1 +
>>>  2 files changed, 3 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
>>> index 3a7da42bed57..81b9f4fc3978 100644
>>> --- a/arch/powerpc/kvm/book3s_xive.c
>>> +++ b/arch/powerpc/kvm/book3s_xive.c
>>> @@ -861,7 +861,8 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned 
>>> long guest_irq,
>>> struct kvmppc_xive *xive = kvm->arch.xive;
>>> struct kvmppc_xive_src_block *sb;
>>> struct kvmppc_xive_irq_state *state;
>>> -   struct irq_data *host_data = irq_get_irq_data(host_irq);
>>> +   struct irq_data *host_data =
>>> +   irq_domain_get_irq_data(irq_get_default_host(), host_irq);
>>> unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data);
>>> u16 idx;
>>> u8 prio;
>>> diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
>>> index d10ab1d689d5..8a073d1ce611 100644
>>> --- a/kernel/irq/irqdomain.c
>>> +++ b/kernel/irq/irqdomain.c
>>> @@ -481,6 +481,7 @@ struct irq_domain *irq_get_default_host(void)
>>>  {
>>> return irq_default_domain;
>>>  }
>>> +EXPORT_SYMBOL_GPL(irq_get_default_host);
>>>  
>>>  static void irq_domain_clear_mapping(struct irq_domain *domain,
>>>  irq_hw_number_t hwirq)
>>
> 
> Is there any reason why we should add more users of the "default host"
> fallback? I would really hope that new code would actually track their
> irqdomain in a more fine-grained way, specially when using the
> hierarchical MSI setup, which seems to be the goal of this series.
> 
> Don't you have enough topology information that you can make use of to
> correctly assign a domain identifier (of_node or otherwise)?


PHBs have a node ID and this is taken into account by the MSI domains.
However, one thing PPC (pSeries and PowerNV) lacks is an interrupt
controller node per chip which makes the IRQ domain hierarchy a bit
incomplete.

It will be difficult to change the pseries platform (VM) since the
PAPR architecture only specifies a single interrupt domain for the
whole machine. The PowerNV platform is designed in a similar way
(because the pseries platform preexisted) and the OPAL firmware hides
the interrupt controllers of each chip behind a single node. The
underlying topology is encoded in HW interrupt numbers. This is a bit
unfortunate since some PowerNV Linux drivers need that information.
Rewriting a new interrupt controller driver in OPAL would be a lot of
work and it won't happen any time soon. But it's feasible.

All that to say that we have a default IRQ domain on these platforms
and not one IRQ domain per node/chip.

Also, there are two types of interrupt models to consider: the older
XICS (for P8/P7 processors) and the newer XIVE (for P9/P10).

Regarding MSI passthrough, the XIVE side is simpler (I can't believe I
am saying that, XIVE is anything but simple) and I think we can rework
kvmppc_xive_set_mapped() and xive_irq_set_vcpu_affinity() to remove
the IRQ domain bypass. 

XICS adds optimizations for passthrough done in real mode:

 e3c13e56a471 ("KVM: PPC: Book3S HV: Handle passthrough interrupts in guest")
 5d375199ea96 ("KVM: PPC: Book3S HV: Set server for passed-through interrupts")

That's a ~10% bandwidth improvement on CX5 adapters. It is good to
have, but these optimizations are much more complex to rework. I took
some time to look for a solution, because the use of irq_to_desc() and
of the host IRQ in the XICS domain is ugly, but nothing comes to mind
yet.

For the time being, I think these changes bypassing the IRQ domains
are fine. I need some more time to mature an alternative.

Thanks,

C. 



[PATCH 19/31] powerpc/xics: Add debug logging to the set_irq_affinity handlers

2021-04-30 Thread Cédric Le Goater
It really helps to know how the HW is configured when tweaking the IRQ
subsystem.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xics/ics-opal.c | 2 +-
 arch/powerpc/sysdev/xics/ics-rtas.c | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/sysdev/xics/ics-opal.c 
b/arch/powerpc/sysdev/xics/ics-opal.c
index 8c7ddcc718b6..bf26cae1b982 100644
--- a/arch/powerpc/sysdev/xics/ics-opal.c
+++ b/arch/powerpc/sysdev/xics/ics-opal.c
@@ -133,7 +133,7 @@ static int ics_opal_set_affinity(struct irq_data *d,
}
server = ics_opal_mangle_server(wanted_server);
 
-   pr_devel("ics-hal: set-affinity irq %d [hw 0x%x] server: 0x%x/0x%x\n",
+   pr_debug("ics-hal: set-affinity irq %d [hw 0x%x] server: 0x%x/0x%x\n",
 d->irq, hw_irq, wanted_server, server);
 
rc = opal_set_xive(hw_irq, server, priority);
diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c 
b/arch/powerpc/sysdev/xics/ics-rtas.c
index 6d19d711ed35..b50c6341682e 100644
--- a/arch/powerpc/sysdev/xics/ics-rtas.c
+++ b/arch/powerpc/sysdev/xics/ics-rtas.c
@@ -133,6 +133,9 @@ static int ics_rtas_set_affinity(struct irq_data *d,
return -1;
}
 
+   pr_debug("%s: irq %d [hw 0x%x] server: 0x%x\n", __func__, d->irq,
+hw_irq, irq_server);
+
status = rtas_call_reentrant(ibm_set_xive, 3, 1, NULL,
 hw_irq, irq_server, xics_status[1]);
 
-- 
2.26.3



[PATCH 28/31] powerpc/powernv/pci: Set the IRQ chip data for P8/CXL devices

2021-04-30 Thread Cédric Le Goater
Before MSI domains, the default IRQ chip of PHB3 MSIs was patched by
pnv_set_msi_irq_chip() with the custom EOI handler pnv_ioda2_msi_eoi()
and the owning PHB was deduced from the 'ioda.irq_chip' field. This
path has been deprecated by the MSI domains but it is still in use by
the P8 CAPI 'cxl' driver.

Rewriting this driver to support MSI would be a waste of time.
Nevertheless, we can still remove the IRQ chip patch and set the IRQ
chip data instead. This is cleaner.

Cc: Frederic Barrat 
Cc: Christophe Lombard 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index c1598ab730c3..d496d5b1b45a 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2115,19 +2115,23 @@ int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, 
unsigned int hw_irq)
return opal_pci_msi_eoi(phb->opal_id, hw_irq);
 }
 
+/*
+ * The IRQ data is mapped in the XICS domain, with OPAL HW IRQ numbers
+ */
 static void pnv_ioda2_msi_eoi(struct irq_data *d)
 {
int64_t rc;
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
-   struct irq_chip *chip = irq_data_get_irq_chip(d);
+   struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+   struct pnv_phb *phb = hose->private_data;
 
-   rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
+   rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
WARN_ON_ONCE(rc);
 
icp_native_eoi(d);
 }
 
-
+/* P8/CXL only */
 void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
 {
struct irq_data *idata;
@@ -2149,6 +2153,7 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned 
int virq)
phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
}
irq_set_chip(virq, &phb->ioda.irq_chip);
+   irq_set_chip_data(virq, phb->hose);
 }
 
 static struct irq_chip pnv_pci_msi_irq_chip;
-- 
2.26.3



[PATCH 13/31] KVM: PPC: Book3S HV: Use the new IRQ chip to detect passthrough interrupts

2021-04-30 Thread Cédric Le Goater
Passthrough PCI MSI interrupts are detected in KVM with a check on a
specific EOI handler (P8) or on XIVE (P9). We can now check the
PCI-MSI IRQ chip which is cleaner.

Cc: Paul Mackerras 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/kvm/book3s_hv.c  | 2 +-
 arch/powerpc/platforms/powernv/pci-ioda.c | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index deb450e4289e..86a0f8b0e6da 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5153,7 +5153,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
 * what our real-mode EOI code does, or a XIVE interrupt
 */
chip = irq_data_get_irq_chip(&desc->irq_data);
-   if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) {
+   if (!chip || !is_pnv_opal_msi(chip)) {
pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map 
for (%d,%d)\n",
host_irq, guest_gsi);
mutex_unlock(&kvm->lock);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3886ca6e2ed3..7b75af17dc59 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2151,13 +2151,15 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned 
int virq)
irq_set_chip(virq, &phb->ioda.irq_chip);
 }
 
+static struct irq_chip pnv_pci_msi_irq_chip;
+
 /*
  * Returns true iff chip is something that we could call
  * pnv_opal_pci_msi_eoi for.
  */
 bool is_pnv_opal_msi(struct irq_chip *chip)
 {
-   return chip->irq_eoi == pnv_ioda2_msi_eoi;
+   return chip->irq_eoi == pnv_ioda2_msi_eoi || chip == 
&pnv_pci_msi_irq_chip;
 }
 EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
 
-- 
2.26.3



[PATCH 07/31] powerpc/xive: Fix xive_irq_set_affinity for MSI

2021-04-30 Thread Cédric Le Goater
The MSI affinity is automanaged and it can be set before starting the
associated IRQ.

( Should we simply remove the irqd_is_started() test ? )

Cc: Thomas Gleixner 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 96737938e8e3..3485baf9ec8c 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -710,7 +710,7 @@ static int xive_irq_set_affinity(struct irq_data *d,
return -EINVAL;
 
/* Don't do anything if the interrupt isn't started */
-   if (!irqd_is_started(d))
+   if (!irqd_is_started(d) && !irqd_affinity_is_managed(d))
return IRQ_SET_MASK_OK;
 
/*
-- 
2.26.3



[PATCH 29/31] powerpc/powernv/pci: Rework pnv_opal_pci_msi_eoi()

2021-04-30 Thread Cédric Le Goater
pnv_opal_pci_msi_eoi() is called from KVM to EOI passthrough interrupts
when in real mode. Adding MSI domains broke the hack using the
'ioda.irq_chip' field to deduce the owning PHB. Fix that by using the
IRQ chip data in the MSI domain.

The 'ioda.irq_chip' field is now unused and could be removed from the
pnv_phb struct.

Cc: Paul Mackerras 
Cc: Alexey Kardashevskiy 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/pnv-pci.h|  2 +-
 arch/powerpc/kvm/book3s_hv_rm_xics.c  |  8 
 arch/powerpc/platforms/powernv/pci-ioda.c | 17 +
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/pnv-pci.h 
b/arch/powerpc/include/asm/pnv-pci.h
index d0ee0ede5767..b3f480799352 100644
--- a/arch/powerpc/include/asm/pnv-pci.h
+++ b/arch/powerpc/include/asm/pnv-pci.h
@@ -33,7 +33,7 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num);
 void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num);
 int pnv_cxl_get_irq_count(struct pci_dev *dev);
 struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev);
-int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq);
+int64_t pnv_opal_pci_msi_eoi(struct irq_data *d);
 bool is_pnv_opal_msi(struct irq_chip *chip);
 
 #ifdef CONFIG_CXL_BASE
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c 
b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index c2c9c733f359..1772d53526e2 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -713,6 +713,7 @@ static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq)
icp->rm_eoied_irq = irq;
}
 
+   /* Handle passthrough interrupts */
if (state->host_irq) {
++vcpu->stat.pthru_all;
if (state->intr_cpu != -1) {
@@ -766,7 +767,7 @@ int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 
 static unsigned long eoi_rc;
 
-static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
+static void icp_eoi(struct irq_data *d, u32 hwirq, __be32 xirr, bool *again)
 {
void __iomem *xics_phys;
int64_t rc;
@@ -779,7 +780,7 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 
xirr, bool *again)
return;
}
 
-   rc = pnv_opal_pci_msi_eoi(c, hwirq);
+   rc = pnv_opal_pci_msi_eoi(d);
 
if (rc)
eoi_rc = rc;
@@ -887,8 +888,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
icp_rm_deliver_irq(xics, icp, irq, false);
 
/* EOI the interrupt */
-   icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr,
-   again);
+   icp_eoi(irq_desc_get_irq_data(irq_map->desc), irq_map->r_hwirq, xirr, 
again);
 
if (check_too_hard(xics, icp) == H_TOO_HARD)
return 2;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index d496d5b1b45a..8406b94cbfca 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2107,12 +2107,21 @@ void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
pe->dma_setup_done = true;
 }
 
-int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
+/*
+ * Called from KVM in real mode to EOI passthru interrupts. The ICP
+ * EOI is handled directly in KVM in kvmppc_deliver_irq_passthru().
+ *
+ * The IRQ data is mapped in the PCI-MSI domain and the EOI OPAL call
+ * needs an HW IRQ number mapped in the XICS IRQ domain. The HW IRQ
+ * numbers of the in-the-middle MSI domain are vector numbers and it's
+ * good enough for OPAL. Use that.
+ */
+int64_t pnv_opal_pci_msi_eoi(struct irq_data *d)
 {
-   struct pnv_phb *phb = container_of(chip, struct pnv_phb,
-  ioda.irq_chip);
+   struct pci_controller *hose = 
irq_data_get_irq_chip_data(d->parent_data);
+   struct pnv_phb *phb = hose->private_data;
 
-   return opal_pci_msi_eoi(phb->opal_id, hw_irq);
+   return opal_pci_msi_eoi(phb->opal_id, d->parent_data->hwirq);
 }
 
 /*
-- 
2.26.3



[PATCH 22/31] powerpc/pci: Drop XIVE restriction on MSI domains

2021-04-30 Thread Cédric Le Goater
The PowerNV and pSeries platforms now have support for both the XICS
and XIVE IRQ domains.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 4 +---
 arch/powerpc/platforms/pseries/msi.c  | 4 
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 7035be271c34..13b56de92d85 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2476,9 +2476,7 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
count, phb->msi_base);
 
-   /* Only supported by the XIVE driver */
-   if (xive_enabled())
-   pnv_msi_allocate_domains(phb->hose, count);
+   pnv_msi_allocate_domains(phb->hose, count);
 }
 
 static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index d1470941cadf..1886cb5ca4df 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -720,10 +720,6 @@ int pseries_msi_allocate_domains(struct pci_controller 
*phb)
 {
int count;
 
-   /* Only supported by the XIVE driver */
-   if (!xive_enabled())
-   return -ENODEV;
-
if (!__find_pe_total_msi(phb->dn, &count)) {
pr_err("PCI: failed to find MSIs for bridge %pOF (domain %d)\n",
   phb->dn, phb->global_number);
-- 
2.26.3



[PATCH 20/31] powerpc/xics: Add support for IRQ domain hierarchy

2021-04-30 Thread Cédric Le Goater
XICS doesn't have any state associated with the IRQ. The support is
straightforward and simpler than for XIVE.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xics/xics-common.c | 37 ++
 1 file changed, 37 insertions(+)

diff --git a/arch/powerpc/sysdev/xics/xics-common.c 
b/arch/powerpc/sysdev/xics/xics-common.c
index 981587c7..05d21005dc79 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -406,7 +406,44 @@ int xics_retrigger(struct irq_data *data)
return 0;
 }
 
+static int xics_host_domain_translate(struct irq_domain *d, struct irq_fwspec 
*fwspec,
+ unsigned long *hwirq, unsigned int *type)
+{
+   return xics_host_xlate(d, to_of_node(fwspec->fwnode), fwspec->param,
+  fwspec->param_count, hwirq, type);
+}
+
+static int xics_host_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+   struct irq_fwspec *fwspec = arg;
+   irq_hw_number_t hwirq;
+   unsigned int type = IRQ_TYPE_NONE;
+   int i, rc;
+
+   rc = xics_host_domain_translate(domain, fwspec, &hwirq, &type);
+   if (rc)
+   return rc;
+
+   pr_debug("%s %d/%lx #%d\n", __func__, virq, hwirq, nr_irqs);
+
+   for (i = 0; i < nr_irqs; i++)
+   irq_domain_set_info(domain, virq + i, hwirq + i, xics_ics->chip,
+   xics_ics, handle_fasteoi_irq, NULL, NULL);
+
+   return 0;
+}
+
+static void xics_host_domain_free(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+   pr_debug("%s %d #%d\n", __func__, virq, nr_irqs);
+}
+
 static const struct irq_domain_ops xics_host_ops = {
+   .alloc  = xics_host_domain_alloc,
+   .free   = xics_host_domain_free,
+   .translate = xics_host_domain_translate,
.match = xics_host_match,
.map = xics_host_map,
.xlate = xics_host_xlate,
-- 
2.26.3



[PATCH 23/31] powerpc/xics: Drop unmask of MSIs at startup

2021-04-30 Thread Cédric Le Goater
That was a workaround in the XICS domain because of the lack of MSI
domain. This is now handled.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xics/ics-opal.c | 11 ---
 arch/powerpc/sysdev/xics/ics-rtas.c |  9 -
 2 files changed, 20 deletions(-)

diff --git a/arch/powerpc/sysdev/xics/ics-opal.c 
b/arch/powerpc/sysdev/xics/ics-opal.c
index bf26cae1b982..c4d95d8beb6f 100644
--- a/arch/powerpc/sysdev/xics/ics-opal.c
+++ b/arch/powerpc/sysdev/xics/ics-opal.c
@@ -62,17 +62,6 @@ static void ics_opal_unmask_irq(struct irq_data *d)
 
 static unsigned int ics_opal_startup(struct irq_data *d)
 {
-#ifdef CONFIG_PCI_MSI
-   /*
-* The generic MSI code returns with the interrupt disabled on the
-* card, using the MSI mask bits. Firmware doesn't appear to unmask
-* at that level, so we do it here by hand.
-*/
-   if (irq_data_get_msi_desc(d))
-   pci_msi_unmask_irq(d);
-#endif
-
-   /* unmask it */
ics_opal_unmask_irq(d);
return 0;
 }
diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c 
b/arch/powerpc/sysdev/xics/ics-rtas.c
index b50c6341682e..b9da317b7a2d 100644
--- a/arch/powerpc/sysdev/xics/ics-rtas.c
+++ b/arch/powerpc/sysdev/xics/ics-rtas.c
@@ -57,15 +57,6 @@ static void ics_rtas_unmask_irq(struct irq_data *d)
 
 static unsigned int ics_rtas_startup(struct irq_data *d)
 {
-#ifdef CONFIG_PCI_MSI
-   /*
-* The generic MSI code returns with the interrupt disabled on the
-* card, using the MSI mask bits. Firmware doesn't appear to unmask
-* at that level, so we do it here by hand.
-*/
-   if (irq_data_get_msi_desc(d))
-   pci_msi_unmask_irq(d);
-#endif
/* unmask it */
ics_rtas_unmask_irq(d);
return 0;
-- 
2.26.3



[PATCH 25/31] powerpc/powernv/pci: Drop unused MSI code

2021-04-30 Thread Cédric Le Goater
MSIs should be fully managed by the PCI and IRQ subsystems now.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/powernv/pci.h  |  6 --
 arch/powerpc/platforms/powernv/pci-ioda.c | 29 --
 arch/powerpc/platforms/powernv/pci.c  | 67 ---
 3 files changed, 102 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index 36d22920f5a3..a075012788df 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -127,11 +127,7 @@ struct pnv_phb {
 #endif
 
unsigned intmsi_base;
-   unsigned intmsi32_support;
struct msi_bitmap   msi_bmp;
-   int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev,
-unsigned int hwirq, unsigned int virq,
-unsigned int is_64, struct msi_msg *msg);
int (*init_m64)(struct pnv_phb *phb);
int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
@@ -295,8 +291,6 @@ extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, 
unsigned long msr);
 extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
 extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
 
-extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
-extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
 extern struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn);
 extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
 extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 13b56de92d85..c5acd85a9144 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2224,29 +2224,6 @@ static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, 
struct pci_dev *dev,
return 0;
 }
 
-static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
- unsigned int hwirq, unsigned int virq,
- unsigned int is_64, struct msi_msg *msg)
-{
-   struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
-   unsigned int xive_num = hwirq - phb->msi_base;
-   int rc;
-
-   rc = __pnv_pci_ioda_msi_setup(phb, dev, xive_num, is_64, msg);
-   if (rc)
-   return rc;
-
-   /* P8 only */
-   pnv_set_msi_irq_chip(phb, virq);
-
-   pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
-" address=%x_%08x data=%x PE# %x\n",
-pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
-msg->address_hi, msg->address_lo, msg->data, pe->pe_number);
-
-   return 0;
-}
-
 /*
  * The msi_free() op is called before irq_domain_free_irqs_top() when
  * the handler data is still available. Use that to clear the XIVE
@@ -2471,8 +2448,6 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
return;
}
 
-   phb->msi_setup = pnv_pci_ioda_msi_setup;
-   phb->msi32_support = 1;
pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
count, phb->msi_base);
 
@@ -3090,8 +3065,6 @@ static const struct pci_controller_ops 
pnv_pci_ioda_controller_ops = {
.dma_dev_setup  = pnv_pci_ioda_dma_dev_setup,
.dma_bus_setup  = pnv_pci_ioda_dma_bus_setup,
.iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
-   .setup_msi_irqs = pnv_setup_msi_irqs,
-   .teardown_msi_irqs  = pnv_teardown_msi_irqs,
.enable_device_hook = pnv_pci_enable_device_hook,
.release_device = pnv_pci_release_device,
.window_alignment   = pnv_pci_window_alignment,
@@ -3101,8 +3074,6 @@ static const struct pci_controller_ops 
pnv_pci_ioda_controller_ops = {
 };
 
 static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
-   .setup_msi_irqs = pnv_setup_msi_irqs,
-   .teardown_msi_irqs  = pnv_teardown_msi_irqs,
.enable_device_hook = pnv_pci_enable_device_hook,
.window_alignment   = pnv_pci_window_alignment,
.reset_secondary_bus= pnv_pci_reset_secondary_bus,
diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index 9b9bca169275..397b3d7eb150 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -160,73 +160,6 @@ int pnv_pci_set_power_state(uint64_t id, uint8_t state, 
struct opal_msg *msg)
 }
 EXPORT_SYMBOL_GPL(pnv_pci_set_power_state);
 
-int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
-{
-   struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
-   struct msi_desc *entry;
-   struct msi_msg msg;
-   int hwirq;
-

[PATCH 31/31] genirq: Improve "hwirq" output in /proc and /sys/

2021-04-30 Thread Cédric Le Goater
The HW IRQ numbers generated by the PCI MSI layer can be quite large
on a pSeries machine when running under the IBM Hypervisor and they
appear as negative. Use '%u' to show them correctly.

Cc: Thomas Gleixner 
Signed-off-by: Cédric Le Goater 
---
 kernel/irq/irqdesc.c | 2 +-
 kernel/irq/proc.c| 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index cc1a09406c6e..85054eb2ae51 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -188,7 +188,7 @@ static ssize_t hwirq_show(struct kobject *kobj,
 
raw_spin_lock_irq(>lock);
if (desc->irq_data.domain)
-   ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq);
+   ret = sprintf(buf, "%u\n", (int)desc->irq_data.hwirq);
raw_spin_unlock_irq(>lock);
 
return ret;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 98138788cb04..e2392f05da04 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -513,7 +513,7 @@ int show_interrupts(struct seq_file *p, void *v)
seq_printf(p, " %8s", "None");
}
if (desc->irq_data.domain)
-   seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
+   seq_printf(p, " %*u", prec, (int)desc->irq_data.hwirq);
else
seq_printf(p, " %*s", prec, "");
 #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
-- 
2.26.3



[PATCH 08/31] powerpc/pseries/pci: Add a domain_free_irqs handler

2021-04-30 Thread Cédric Le Goater
The RTAS firmware cannot disable a single MSI at a time: it is all or
nothing. We need a custom free IRQ handler for that.

Cc: Thomas Gleixner 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/pseries/msi.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index a9bd1e991df5..a41c448520d4 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -529,8 +529,24 @@ static int pseries_msi_ops_prepare(struct irq_domain 
*domain, struct device *dev
return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
 }
 
+/*
+ * RTAS can not disable one MSI at a time. It's all or nothing. Do it
+ * at the end after all IRQs have been freed.
+ */
+static void pseries_msi_domain_free_irqs(struct irq_domain *domain,
+struct device *dev)
+{
+   if (WARN_ON_ONCE(!dev_is_pci(dev)))
+   return;
+
+   __msi_domain_free_irqs(domain, dev);
+
+   rtas_disable_msi(to_pci_dev(dev));
+}
+
 static struct msi_domain_ops pseries_pci_msi_domain_ops = {
.msi_prepare= pseries_msi_ops_prepare,
+   .domain_free_irqs = pseries_msi_domain_free_irqs,
 };
 
 static void pseries_msi_shutdown(struct irq_data *d)
-- 
2.26.3



[PATCH 09/31] powerpc/pseries/pci: Add a msi_free() handler to clear XIVE data

2021-04-30 Thread Cédric Le Goater
The MSI domain clears the IRQ with msi_domain_free(), which calls
irq_domain_free_irqs_top(), which clears the handler data. This is a
problem for the XIVE controller since we need to unmap MMIO pages and
free a specific XIVE structure.

The 'msi_free()' handler is called before irq_domain_free_irqs_top()
when the handler data is still available. Use that to clear the XIVE
controller data.

Cc: Thomas Gleixner 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/xive.h  |  1 +
 arch/powerpc/platforms/pseries/msi.c | 16 +++-
 arch/powerpc/sysdev/xive/common.c|  5 -
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index aa094a8655b0..20ae50ab083c 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -111,6 +111,7 @@ void xive_native_free_vp_block(u32 vp_base);
 int xive_native_populate_irq_data(u32 hw_irq,
  struct xive_irq_data *data);
 void xive_cleanup_irq_data(struct xive_irq_data *xd);
+void xive_irq_free_data(unsigned int virq);
 void xive_native_free_irq(u32 irq);
 int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
 
diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index a41c448520d4..da9d63a088bb 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -529,6 +529,19 @@ static int pseries_msi_ops_prepare(struct irq_domain 
*domain, struct device *dev
return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
 }
 
+/*
+ * ->msi_free() is called before irq_domain_free_irqs_top() when the
+ * handler data is still available. Use that to clear the XIVE
+ * controller data.
+ */
+static void pseries_msi_ops_msi_free(struct irq_domain *domain,
+struct msi_domain_info *info,
+unsigned int irq)
+{
+   if (xive_enabled())
+   xive_irq_free_data(irq);
+}
+
 /*
  * RTAS can not disable one MSI at a time. It's all or nothing. Do it
  * at the end after all IRQs have been freed.
@@ -546,6 +559,7 @@ static void pseries_msi_domain_free_irqs(struct irq_domain 
*domain,
 
 static struct msi_domain_ops pseries_pci_msi_domain_ops = {
.msi_prepare= pseries_msi_ops_prepare,
+   .msi_free   = pseries_msi_ops_msi_free,
.domain_free_irqs = pseries_msi_domain_free_irqs,
 };
 
@@ -660,7 +674,7 @@ static void pseries_irq_domain_free(struct irq_domain 
*domain, unsigned int virq
 
pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
 
-   irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+   /* XIVE domain data is cleared through ->msi_free() */
 }
 
 static const struct irq_domain_ops pseries_irq_domain_ops = {
diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 3485baf9ec8c..191cd80ec534 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -980,6 +980,8 @@ EXPORT_SYMBOL_GPL(is_xive_irq);
 
 void xive_cleanup_irq_data(struct xive_irq_data *xd)
 {
+   pr_debug("%s for HW %x\n", __func__, xd->hw_irq);
+
if (xd->eoi_mmio) {
unmap_kernel_range((unsigned long)xd->eoi_mmio,
   1u << xd->esb_shift);
@@ -1025,7 +1027,7 @@ static int xive_irq_alloc_data(unsigned int virq, 
irq_hw_number_t hw)
return 0;
 }
 
-static void xive_irq_free_data(unsigned int virq)
+void xive_irq_free_data(unsigned int virq)
 {
struct xive_irq_data *xd = irq_get_handler_data(virq);
 
@@ -1035,6 +1037,7 @@ static void xive_irq_free_data(unsigned int virq)
xive_cleanup_irq_data(xd);
kfree(xd);
 }
+EXPORT_SYMBOL_GPL(xive_irq_free_data);
 
 #ifdef CONFIG_SMP
 
-- 
2.26.3



[PATCH 15/31] KVM: PPC: Book3S HV: XIVE: Fix mapping of passthrough interrupts

2021-04-30 Thread Cédric Le Goater
PCI MSI interrupt numbers are now mapped in a PCI-MSI domain but the
underlying calls handling the passthrough of the interrupt in the
guest need a number in the XIVE IRQ domain.

Use the IRQ data mapped in the XIVE IRQ domain and not the one in the
PCI-MSI domain.

Exporting irq_get_default_host() might not be the best solution.

Cc: Thomas Gleixner 
Cc: Paul Mackerras 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/kvm/book3s_xive.c | 3 ++-
 kernel/irq/irqdomain.c | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 3a7da42bed57..81b9f4fc3978 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -861,7 +861,8 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long 
guest_irq,
struct kvmppc_xive *xive = kvm->arch.xive;
struct kvmppc_xive_src_block *sb;
struct kvmppc_xive_irq_state *state;
-   struct irq_data *host_data = irq_get_irq_data(host_irq);
+   struct irq_data *host_data =
+   irq_domain_get_irq_data(irq_get_default_host(), host_irq);
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data);
u16 idx;
u8 prio;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d10ab1d689d5..8a073d1ce611 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -481,6 +481,7 @@ struct irq_domain *irq_get_default_host(void)
 {
return irq_default_domain;
 }
+EXPORT_SYMBOL_GPL(irq_get_default_host);
 
 static void irq_domain_clear_mapping(struct irq_domain *domain,
 irq_hw_number_t hwirq)
-- 
2.26.3



[PATCH 05/31] powerpc/pseries/pci: Add MSI domains

2021-04-30 Thread Cédric Le Goater
Two IRQ domains are added on top of the default machine IRQ domain.

First, the top level "PCI-MSI" domain deals with the MSI specificities.
In this domain, the HW IRQ numbers are generated by the PCI MSI layer,
they compose a unique ID for an MSI source with the PCI device
identifier and the MSI vector number.

These numbers can be quite large on a pSeries machine running under
the IBM Hypervisor and /sys/kernel/irq/ and /proc/interrupts will
require small fixes to show them correctly.

Then, the in-the-middle "MSI" domain acts as a proxy between the PCI
MSI subsystem and the machine IRQ subsystem. It usually handles the
MSI allocator but on pSeries machines, this is done by the RTAS
FW. RTAS returns IRQ numbers in the IRQ number space of the machine.
This is why this in-the-middle "Pseries-MSI" domain has the same HW
IRQ numbers as its parent domain.

Only the XIVE (P9/P10) parent domain is supported for now. We still
need to add support for IRQ domain hierarchy under XICS.

Cc: Thomas Gleixner 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/pci-bridge.h|   5 +
 arch/powerpc/platforms/pseries/pseries.h |   1 +
 arch/powerpc/kernel/pci-common.c |   6 +
 arch/powerpc/platforms/pseries/msi.c | 185 +++
 arch/powerpc/platforms/pseries/setup.c   |   2 +
 5 files changed, 199 insertions(+)

diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index d2a2a14e56f9..fb35d340a739 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -127,6 +127,11 @@ struct pci_controller {
 
void *private_data;
struct npu *npu;
+
+   /* IRQ domain hierarchy */
+   struct irq_domain   *dev_domain;
+   struct irq_domain   *msi_domain;
+   struct fwnode_handle*fwnode;
 };
 
 /* These are used for config access before all the PCI probing
diff --git a/arch/powerpc/platforms/pseries/pseries.h 
b/arch/powerpc/platforms/pseries/pseries.h
index 4fe48c04c6c2..91cf2afcf423 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -87,6 +87,7 @@ struct pci_host_bridge;
 int pseries_root_bridge_prepare(struct pci_host_bridge *bridge);
 
 extern struct pci_controller_ops pseries_pci_controller_ops;
+int pseries_msi_allocate_domains(struct pci_controller *phb);
 
 unsigned long pseries_memory_block_size(void);
 
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 001e90cd8948..c3573430919d 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1060,11 +1061,16 @@ void pcibios_bus_add_device(struct pci_dev *dev)
 
 int pcibios_add_device(struct pci_dev *dev)
 {
+   struct irq_domain *d;
+
 #ifdef CONFIG_PCI_IOV
if (ppc_md.pcibios_fixup_sriov)
ppc_md.pcibios_fixup_sriov(dev);
 #endif /* CONFIG_PCI_IOV */
 
+   d = dev_get_msi_domain(>bus->dev);
+   if (d)
+   dev_set_msi_domain(>dev, d);
return 0;
 }
 
diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index 4bf14f27e1aa..a9bd1e991df5 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "pseries.h"
 
@@ -518,6 +519,190 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int 
nvec_in, int type)
return 0;
 }
 
+static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device 
*dev,
+  int nvec, msi_alloc_info_t *arg)
+{
+   struct pci_dev *pdev = to_pci_dev(dev);
+   struct msi_desc *desc = first_pci_msi_entry(pdev);
+   int type = desc->msi_attrib.is_msix ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
+
+   return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
+}
+
+static struct msi_domain_ops pseries_pci_msi_domain_ops = {
+   .msi_prepare= pseries_msi_ops_prepare,
+};
+
+static void pseries_msi_shutdown(struct irq_data *d)
+{
+   d = d->parent_data;
+   if (d->chip->irq_shutdown)
+   d->chip->irq_shutdown(d);
+}
+
+static void pseries_msi_mask(struct irq_data *d)
+{
+   pci_msi_mask_irq(d);
+   irq_chip_mask_parent(d);
+}
+
+static void pseries_msi_unmask(struct irq_data *d)
+{
+   pci_msi_unmask_irq(d);
+   irq_chip_unmask_parent(d);
+}
+
+static struct irq_chip pseries_pci_msi_irq_chip = {
+   .name   = "Pseries-PCI-MSI",
+   .irq_shutdown   = pseries_msi_shutdown,
+   .irq_mask   = pseries_msi_mask,
+   .irq_unmask = pseries_msi_unmask,
+   .irq_eoi= irq_chip_eoi_parent,
+};
+
+static struct msi_domain_info pseries_msi_domain_info = {
+   .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE

[PATCH 27/31] powerpc/xics: Fix IRQ migration

2021-04-30 Thread Cédric Le Goater
desc->irq_data points to the top level IRQ data descriptor which is
not necessarily in the XICS IRQ domain. MSIs are in another domain for
instance. Fix that by looking for a mapping on the low level XICS IRQ
domain.

TODO: Why not use irq_migrate_all_off_this_cpu() instead?

Cc: Thomas Gleixner 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xics/xics-common.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/xics/xics-common.c 
b/arch/powerpc/sysdev/xics/xics-common.c
index 05d21005dc79..2a3ad7f5c331 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -183,6 +183,8 @@ void xics_migrate_irqs_away(void)
unsigned int irq, virq;
struct irq_desc *desc;
 
+   pr_debug("%s: CPU %u\n", __func__, cpu);
+
/* If we used to be the default server, move to the new "boot_cpuid" */
if (hw_cpu == xics_default_server)
xics_update_irq_servers();
@@ -197,6 +199,7 @@ void xics_migrate_irqs_away(void)
struct irq_chip *chip;
long server;
unsigned long flags;
+   struct irq_data *irqd;
 
/* We can't set affinity on ISA interrupts */
if (virq < NUM_ISA_INTERRUPTS)
@@ -204,9 +207,11 @@ void xics_migrate_irqs_away(void)
/* We only need to migrate enabled IRQS */
if (!desc->action)
continue;
-   if (desc->irq_data.domain != xics_host)
+   /* We need a mapping in the XICS IRQ domain */
+   irqd = irq_domain_get_irq_data(xics_host, virq);
+   if (!irqd)
continue;
-   irq = desc->irq_data.hwirq;
+   irq = irqd_to_hwirq(irqd);
/* We need to get IPIs still. */
if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
continue;
-- 
2.26.3



[PATCH 21/31] powerpc/powernv/pci: Customize the MSI EOI handler to support PHB3

2021-04-30 Thread Cédric Le Goater
PHB3s need an extra OPAL call to EOI the interrupt. The call takes an
OPAL HW IRQ number but it is translated into a vector number in OPAL.
Here, we directly use the vector number of the in-the-middle "MSI"
domain instead of grabbing the OPAL HW IRQ number in the XICS parent
domain.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 23 ++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 7b75af17dc59..7035be271c34 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2313,12 +2313,33 @@ static void pnv_msi_compose_msg(struct irq_data *d, 
struct msi_msg *msg)
entry->msi_attrib.is_64 ? "64" : "32", d->hwirq, rc);
 }
 
+/*
+ * The IRQ data is mapped in the MSI domain in which HW IRQ numbers
+ * correspond to vector numbers.
+ */
+static void pnv_msi_eoi(struct irq_data *d)
+{
+   struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+   struct pnv_phb *phb = hose->private_data;
+
+   if (phb->model == PNV_PHB_MODEL_PHB3) {
+   /*
+* The EOI OPAL call takes an OPAL HW IRQ number but
+* since it is translated into a vector number in
+* OPAL, use that directly.
+*/
+   WARN_ON_ONCE(opal_pci_msi_eoi(phb->opal_id, d->hwirq));
+   }
+
+   irq_chip_eoi_parent(d);
+}
+
 static struct irq_chip pnv_msi_irq_chip = {
.name   = "PNV-MSI",
.irq_shutdown   = pnv_msi_shutdown,
.irq_mask   = irq_chip_mask_parent,
.irq_unmask = irq_chip_unmask_parent,
-   .irq_eoi= irq_chip_eoi_parent,
+   .irq_eoi= pnv_msi_eoi,
.irq_set_affinity   = irq_chip_set_affinity_parent,
.irq_compose_msi_msg= pnv_msi_compose_msg,
 };
-- 
2.26.3



[PATCH 30/31] KVM: PPC: Book3S HV: XICS: Fix mapping of passthrough interrupts

2021-04-30 Thread Cédric Le Goater
PCI MSIs now live in an MSI domain but the underlying calls, which
will EOI the interrupt in real mode, need an HW IRQ number mapped in
the XICS IRQ domain. Grab it there.

Cc: Paul Mackerras 
Cc: Alexey Kardashevskiy 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/kvm/book3s_hv.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 9f4eb74a11cc..6058bcc5b61e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5126,6 +5126,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
struct kvmppc_passthru_irqmap *pimap;
struct irq_chip *chip;
int i, rc = 0;
+   struct irq_data *host_data;
 
if (!kvm_irq_bypass)
return 1;
@@ -5190,7 +5191,14 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
 * the KVM real mode handler.
 */
smp_wmb();
-   irq_map->r_hwirq = desc->irq_data.hwirq;
+
+   /*
+* The 'host_irq' number is mapped in the PCI-MSI domain but
+* the underlying calls, which will EOI the interrupt in real
+* mode, need an HW IRQ number mapped in the XICS IRQ domain.
+*/
+   host_data = irq_domain_get_irq_data(irq_get_default_host(), host_irq);
+   irq_map->r_hwirq = (unsigned int)irqd_to_hwirq(host_data);
 
if (i == pimap->n_mapped)
pimap->n_mapped++;
@@ -5198,7 +5206,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
if (xics_on_xive())
rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq);
else
-   kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
+   kvmppc_xics_set_mapped(kvm, guest_gsi, irq_map->r_hwirq);
if (rc)
irq_map->r_hwirq = 0;
 
-- 
2.26.3



[PATCH 17/31] powerpc/xics: Rename the map handler in a check handler

2021-04-30 Thread Cédric Le Goater
This moves the IRQ initialization done under the OPAL and RTAS backends
into the common part of XICS. The 'map' handler becomes a simple 'check'
on the HW IRQ at the FW level.

As we don't need an ICS anymore in xics_migrate_irqs_away(), the XICS
domain does not set chip data for the IRQ.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/xics.h|  3 ++-
 arch/powerpc/sysdev/xics/ics-opal.c| 27 +
 arch/powerpc/sysdev/xics/ics-rtas.c| 28 +-
 arch/powerpc/sysdev/xics/xics-common.c | 15 --
 4 files changed, 31 insertions(+), 42 deletions(-)

diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h
index 8e903b3f9c24..01b51a926f56 100644
--- a/arch/powerpc/include/asm/xics.h
+++ b/arch/powerpc/include/asm/xics.h
@@ -85,10 +85,11 @@ static inline int ics_opal_init(void) { return -ENODEV; }
 /* ICS instance, hooked up to chip_data of an irq */
 struct ics {
struct list_head link;
-   int (*map)(struct ics *ics, unsigned int virq);
+   int (*check)(struct ics *ics, unsigned int hwirq);
void (*mask_unknown)(struct ics *ics, unsigned long vec);
long (*get_server)(struct ics *ics, unsigned long vec);
int (*host_match)(struct ics *ics, struct device_node *node);
+   struct irq_chip *chip;
char data[];
 };
 
diff --git a/arch/powerpc/sysdev/xics/ics-opal.c 
b/arch/powerpc/sysdev/xics/ics-opal.c
index 823f6c9664cd..8c7ddcc718b6 100644
--- a/arch/powerpc/sysdev/xics/ics-opal.c
+++ b/arch/powerpc/sysdev/xics/ics-opal.c
@@ -157,26 +157,13 @@ static struct irq_chip ics_opal_irq_chip = {
.irq_retrigger = xics_retrigger,
 };
 
-static int ics_opal_map(struct ics *ics, unsigned int virq);
-static void ics_opal_mask_unknown(struct ics *ics, unsigned long vec);
-static long ics_opal_get_server(struct ics *ics, unsigned long vec);
-
 static int ics_opal_host_match(struct ics *ics, struct device_node *node)
 {
return 1;
 }
 
-/* Only one global & state struct ics */
-static struct ics ics_hal = {
-   .map= ics_opal_map,
-   .mask_unknown   = ics_opal_mask_unknown,
-   .get_server = ics_opal_get_server,
-   .host_match = ics_opal_host_match,
-};
-
-static int ics_opal_map(struct ics *ics, unsigned int virq)
+static int ics_opal_check(struct ics *ics, unsigned int hw_irq)
 {
-   unsigned int hw_irq = (unsigned int)virq_to_hw(virq);
int64_t rc;
__be16 server;
int8_t priority;
@@ -189,9 +176,6 @@ static int ics_opal_map(struct ics *ics, unsigned int virq)
if (rc != OPAL_SUCCESS)
return -ENXIO;
 
-   irq_set_chip_and_handler(virq, _opal_irq_chip, handle_fasteoi_irq);
-   irq_set_chip_data(virq, _hal);
-
return 0;
 }
 
@@ -222,6 +206,15 @@ static long ics_opal_get_server(struct ics *ics, unsigned 
long vec)
return ics_opal_unmangle_server(be16_to_cpu(server));
 }
 
+/* Only one global & state struct ics */
+static struct ics ics_hal = {
+   .check  = ics_opal_check,
+   .mask_unknown   = ics_opal_mask_unknown,
+   .get_server = ics_opal_get_server,
+   .host_match = ics_opal_host_match,
+   .chip   = _opal_irq_chip,
+};
+
 int __init ics_opal_init(void)
 {
if (!firmware_has_feature(FW_FEATURE_OPAL))
diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c 
b/arch/powerpc/sysdev/xics/ics-rtas.c
index 4cf18000f07c..6d19d711ed35 100644
--- a/arch/powerpc/sysdev/xics/ics-rtas.c
+++ b/arch/powerpc/sysdev/xics/ics-rtas.c
@@ -24,19 +24,6 @@ static int ibm_set_xive;
 static int ibm_int_on;
 static int ibm_int_off;
 
-static int ics_rtas_map(struct ics *ics, unsigned int virq);
-static void ics_rtas_mask_unknown(struct ics *ics, unsigned long vec);
-static long ics_rtas_get_server(struct ics *ics, unsigned long vec);
-static int ics_rtas_host_match(struct ics *ics, struct device_node *node);
-
-/* Only one global & state struct ics */
-static struct ics ics_rtas = {
-   .map= ics_rtas_map,
-   .mask_unknown   = ics_rtas_mask_unknown,
-   .get_server = ics_rtas_get_server,
-   .host_match = ics_rtas_host_match,
-};
-
 static void ics_rtas_unmask_irq(struct irq_data *d)
 {
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
@@ -169,9 +156,8 @@ static struct irq_chip ics_rtas_irq_chip = {
.irq_retrigger = xics_retrigger,
 };
 
-static int ics_rtas_map(struct ics *ics, unsigned int virq)
+static int ics_rtas_check(struct ics *ics, unsigned int hw_irq)
 {
-   unsigned int hw_irq = (unsigned int)virq_to_hw(virq);
int status[2];
int rc;
 
@@ -183,9 +169,6 @@ static int ics_rtas_map(struct ics *ics, unsigned int virq)
if (rc)
return -ENXIO;
 
-   irq_set_chip_and_handler(virq, _rtas_irq_chip, handle_fasteoi_irq);
-   irq_set_chip_data(virq, _rtas);
-
return 0;
 }
 
@@ -213,6 +196,15 @

[PATCH 26/31] powerpc/powernv/pci: Adapt is_pnv_opal_msi() to detect passthrough interrupt

2021-04-30 Thread Cédric Le Goater
The pnv_ioda2_msi_eoi chip handler is not used anymore for MSIs.
Simply use the check on the PCI-MSI chip.

Cc: Alexey Kardashevskiy 
Cc: Paul Mackerras 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index c5acd85a9144..c1598ab730c3 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2159,7 +2159,7 @@ static struct irq_chip pnv_pci_msi_irq_chip;
  */
 bool is_pnv_opal_msi(struct irq_chip *chip)
 {
-   return chip->irq_eoi == pnv_ioda2_msi_eoi || chip == 
_pci_msi_irq_chip;
+   return chip == _pci_msi_irq_chip;
 }
 EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
 
-- 
2.26.3



[PATCH 24/31] powerpc/pseries/pci: Drop unused MSI code

2021-04-30 Thread Cédric Le Goater
MSIs should be fully managed by the PCI and IRQ subsystems now.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/pseries/msi.c | 87 
 1 file changed, 87 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index 1886cb5ca4df..7ddce65edb88 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -111,21 +111,6 @@ static int rtas_query_irq_number(struct pci_dn *pdn, int 
offset)
return rtas_ret[0];
 }
 
-static void rtas_teardown_msi_irqs(struct pci_dev *pdev)
-{
-   struct msi_desc *entry;
-
-   for_each_pci_msi_entry(entry, pdev) {
-   if (!entry->irq)
-   continue;
-
-   irq_set_msi_desc(entry->irq, NULL);
-   irq_dispose_mapping(entry->irq);
-   }
-
-   rtas_disable_msi(pdev);
-}
-
 static int check_req(struct pci_dev *pdev, int nvec, char *prop_name)
 {
struct device_node *dn;
@@ -459,66 +444,6 @@ static int rtas_prepare_msi_irqs(struct pci_dev *pdev, int 
nvec_in, int type,
return 0;
 }
 
-static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
-{
-   struct pci_dn *pdn;
-   int hwirq, virq, i;
-   int rc;
-   struct msi_desc *entry;
-   struct msi_msg msg;
-
-   rc = rtas_prepare_msi_irqs(pdev, nvec_in, type, NULL);
-   if (rc)
-   return rc;
-
-   pdn = pci_get_pdn(pdev);
-   i = 0;
-   for_each_pci_msi_entry(entry, pdev) {
-   hwirq = rtas_query_irq_number(pdn, i++);
-   if (hwirq < 0) {
-   pr_debug("rtas_msi: error (%d) getting hwirq\n", rc);
-   return hwirq;
-   }
-
-   /*
-* Depending on the number of online CPUs in the original
-* kernel, it is likely for CPU #0 to be offline in a kdump
-* kernel. The associated IRQs in the affinity mappings
-* provided by irq_create_affinity_masks() are thus not
-* started by irq_startup(), as per-design for managed IRQs.
-* This can be a problem with multi-queue block devices driven
-* by blk-mq : such a non-started IRQ is very likely paired
-* with the single queue enforced by blk-mq during kdump (see
-* blk_mq_alloc_tag_set()). This causes the device to remain
-* silent and likely hangs the guest at some point.
-*
-* We don't really care for fine-grained affinity when doing
-* kdump actually : simply ignore the pre-computed affinity
-* masks in this case and let the default mask with all CPUs
-* be used when creating the IRQ mappings.
-*/
-   if (is_kdump_kernel())
-   virq = irq_create_mapping(NULL, hwirq);
-   else
-   virq = irq_create_mapping_affinity(NULL, hwirq,
-  entry->affinity);
-
-   if (!virq) {
-   pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq);
-   return -ENOSPC;
-   }
-
-   dev_dbg(>dev, "rtas_msi: allocated virq %d\n", virq);
-   irq_set_msi_desc(virq, entry);
-
-   /* Read config space back so we can restore after reset */
-   __pci_read_msi_msg(entry, );
-   entry->msg = msg;
-   }
-
-   return 0;
-}
-
 static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device 
*dev,
   int nvec, msi_alloc_info_t *arg)
 {
@@ -759,8 +684,6 @@ static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
 
 static int rtas_msi_init(void)
 {
-   struct pci_controller *phb;
-
query_token  = rtas_token("ibm,query-interrupt-source-number");
change_token = rtas_token("ibm,change-msi");
 
@@ -772,16 +695,6 @@ static int rtas_msi_init(void)
 
pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n");
 
-   WARN_ON(pseries_pci_controller_ops.setup_msi_irqs);
-   pseries_pci_controller_ops.setup_msi_irqs = rtas_setup_msi_irqs;
-   pseries_pci_controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs;
-
-   list_for_each_entry(phb, _list, list_node) {
-   WARN_ON(phb->controller_ops.setup_msi_irqs);
-   phb->controller_ops.setup_msi_irqs = rtas_setup_msi_irqs;
-   phb->controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs;
-   }
-
WARN_ON(ppc_md.pci_irq_fixup);
ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup;
 
-- 
2.26.3



[PATCH 10/31] powerpc/pseries/pci: Add support of MSI domains to PHB hotplug

2021-04-30 Thread Cédric Le Goater
Simply allocate or release the MSI domains when a PHB is inserted in
or removed from the machine.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/pseries/pseries.h   |  1 +
 arch/powerpc/platforms/pseries/msi.c   | 10 ++
 arch/powerpc/platforms/pseries/pci_dlpar.c |  4 
 3 files changed, 15 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/pseries.h 
b/arch/powerpc/platforms/pseries/pseries.h
index 91cf2afcf423..57bf4c2091e1 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -88,6 +88,7 @@ int pseries_root_bridge_prepare(struct pci_host_bridge 
*bridge);
 
 extern struct pci_controller_ops pseries_pci_controller_ops;
 int pseries_msi_allocate_domains(struct pci_controller *phb);
+void pseries_msi_free_domains(struct pci_controller *phb);
 
 unsigned long pseries_memory_block_size(void);
 
diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index da9d63a088bb..d1470941cadf 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -733,6 +733,16 @@ int pseries_msi_allocate_domains(struct pci_controller 
*phb)
return __pseries_msi_allocate_domains(phb, count);
 }
 
+void pseries_msi_free_domains(struct pci_controller *phb)
+{
+   if (phb->msi_domain)
+   irq_domain_remove(phb->msi_domain);
+   if (phb->dev_domain)
+   irq_domain_remove(phb->dev_domain);
+   if (phb->fwnode)
+   irq_domain_free_fwnode(phb->fwnode);
+}
+
 static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
 {
/* No LSI -> leave MSIs (if any) configured */
diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c 
b/arch/powerpc/platforms/pseries/pci_dlpar.c
index f9ae17e8a0f4..cf8a2e7a0f2c 100644
--- a/arch/powerpc/platforms/pseries/pci_dlpar.c
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -33,6 +33,8 @@ struct pci_controller *init_phb_dynamic(struct device_node 
*dn)
 
pci_devs_phb_init_dynamic(phb);
 
+   pseries_msi_allocate_domains(phb);
+
/* Create EEH devices for the PHB */
eeh_phb_pe_create(phb);
 
@@ -73,6 +75,8 @@ int remove_phb_dynamic(struct pci_controller *phb)
}
}
 
+   pseries_msi_free_domains(phb);
+
/* Remove the PCI bus and unregister the bridge device from sysfs */
phb->bus = NULL;
pci_remove_bus(b);
-- 
2.26.3



[PATCH 18/31] powerpc/xics: Give a name to the default XICS IRQ domain

2021-04-30 Thread Cédric Le Goater
and clean up the error path.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xics/xics-common.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/sysdev/xics/xics-common.c 
b/arch/powerpc/sysdev/xics/xics-common.c
index 2fa45cd12a82..981587c7 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -412,11 +412,22 @@ static const struct irq_domain_ops xics_host_ops = {
.xlate = xics_host_xlate,
 };
 
-static void __init xics_init_host(void)
+static int __init xics_allocate_domain(void)
 {
-   xics_host = irq_domain_add_tree(NULL, _host_ops, NULL);
-   BUG_ON(xics_host == NULL);
+   struct fwnode_handle *fn;
+
+   fn = irq_domain_alloc_named_fwnode("XICS");
+   if (!fn)
+   return -ENOMEM;
+
+   xics_host = irq_domain_create_tree(fn, _host_ops, NULL);
+   if (!xics_host) {
+   irq_domain_free_fwnode(fn);
+   return -ENOMEM;
+   }
+
irq_set_default_host(xics_host);
+   return 0;
 }
 
 void __init xics_register_ics(struct ics *ics)
@@ -478,6 +489,8 @@ void __init xics_init(void)
/* Initialize common bits */
xics_get_server_size();
xics_update_irq_servers();
-   xics_init_host();
+   rc = xics_allocate_domain();
+   if (rc < 0)
+   pr_err("XICS: Failed to create IRQ domain");
xics_setup_cpu();
 }
-- 
2.26.3



[PATCH 02/31] powerpc/pseries/pci: Introduce rtas_prepare_msi_irqs()

2021-04-30 Thread Cédric Le Goater
This splits the routine setting the MSIs in two parts: allocation of
MSIs for the PCI device at the FW level (RTAS) and the actual mapping
and activation of the IRQs.

rtas_prepare_msi_irqs() will serve as a handler for the MSI domain.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/pseries/msi.c | 23 +++
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index d2d090e04745..4bf14f27e1aa 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -373,12 +373,11 @@ static void rtas_hack_32bit_msi_gen2(struct pci_dev *pdev)
pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 0);
 }
 
-static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
+static int rtas_prepare_msi_irqs(struct pci_dev *pdev, int nvec_in, int type,
+msi_alloc_info_t *arg)
 {
struct pci_dn *pdn;
-   int hwirq, virq, i, quota, rc;
-   struct msi_desc *entry;
-   struct msi_msg msg;
+   int quota, rc;
int nvec = nvec_in;
int use_32bit_msi_hack = 0;
 
@@ -456,6 +455,22 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int 
nvec_in, int type)
return rc;
}
 
+   return 0;
+}
+
+static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
+{
+   struct pci_dn *pdn;
+   int hwirq, virq, i;
+   int rc;
+   struct msi_desc *entry;
+   struct msi_msg msg;
+
+   rc = rtas_prepare_msi_irqs(pdev, nvec_in, type, NULL);
+   if (rc)
+   return rc;
+
+   pdn = pci_get_pdn(pdev);
i = 0;
for_each_pci_msi_entry(entry, pdev) {
hwirq = rtas_query_irq_number(pdn, i++);
-- 
2.26.3



[PATCH 01/31] powerpc/pseries/pci: Introduce __find_pe_total_msi()

2021-04-30 Thread Cédric Le Goater
It will help to size the PCI MSI domain.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/pseries/msi.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index 637300330507..d2d090e04745 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -164,12 +164,12 @@ static int check_req_msix(struct pci_dev *pdev, int nvec)
 
 /* Quota calculation */
 
-static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
+static struct device_node *__find_pe_total_msi(struct device_node *node, int 
*total)
 {
struct device_node *dn;
const __be32 *p;
 
-   dn = of_node_get(pci_device_to_OF_node(dev));
+   dn = of_node_get(node);
while (dn) {
p = of_get_property(dn, "ibm,pe-total-#msi", NULL);
if (p) {
@@ -185,6 +185,11 @@ static struct device_node *find_pe_total_msi(struct 
pci_dev *dev, int *total)
return NULL;
 }
 
+static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
+{
+   return __find_pe_total_msi(pci_device_to_OF_node(dev), total);
+}
+
 static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
 {
struct device_node *dn;
-- 
2.26.3



[PATCH 16/31] powerpc/xics: Remove ICS list

2021-04-30 Thread Cédric Le Goater
We always had only one ICS per machine. Simplify the XICS driver by
removing the ICS list.

The ICS stored in the chip data of the XICS domain becomes useless and
we don't need it anymore to migrate away IRQs from a CPU. This will be
removed in a subsequent patch.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xics/xics-common.c | 45 +++---
 1 file changed, 19 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/sysdev/xics/xics-common.c 
b/arch/powerpc/sysdev/xics/xics-common.c
index 7e4305c01bac..509b9432c368 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -38,7 +38,7 @@ DEFINE_PER_CPU(struct xics_cppr, xics_cppr);
 
 struct irq_domain *xics_host;
 
-static LIST_HEAD(ics_list);
+static struct ics *xics_ics;
 
 void xics_update_irq_servers(void)
 {
@@ -111,12 +111,11 @@ void xics_setup_cpu(void)
 
 void xics_mask_unknown_vec(unsigned int vec)
 {
-   struct ics *ics;
-
pr_err("Interrupt 0x%x (real) is invalid, disabling it.\n", vec);
 
-   list_for_each_entry(ics, _list, link)
-   ics->mask_unknown(ics, vec);
+   if (WARN_ON(!xics_ics))
+   return;
+   xics_ics->mask_unknown(xics_ics, vec);
 }
 
 
@@ -198,7 +197,6 @@ void xics_migrate_irqs_away(void)
struct irq_chip *chip;
long server;
unsigned long flags;
-   struct ics *ics;
 
/* We can't set affinity on ISA interrupts */
if (virq < NUM_ISA_INTERRUPTS)
@@ -219,13 +217,10 @@ void xics_migrate_irqs_away(void)
raw_spin_lock_irqsave(>lock, flags);
 
/* Locate interrupt server */
-   server = -1;
-   ics = irq_desc_get_chip_data(desc);
-   if (ics)
-   server = ics->get_server(ics, irq);
+   server = xics_ics->get_server(xics_ics, irq);
if (server < 0) {
-   printk(KERN_ERR "%s: Can't find server for irq %d\n",
-  __func__, irq);
+   pr_err("%s: Can't find server for irq %d/%x\n",
+  __func__, virq, irq);
goto unlock;
}
 
@@ -307,13 +302,9 @@ int xics_get_irq_server(unsigned int virq, const struct 
cpumask *cpumask,
 static int xics_host_match(struct irq_domain *h, struct device_node *node,
   enum irq_domain_bus_token bus_token)
 {
-   struct ics *ics;
-
-   list_for_each_entry(ics, _list, link)
-   if (ics->host_match(ics, node))
-   return 1;
-
-   return 0;
+   if (WARN_ON(!xics_ics))
+   return 0;
+   return xics_ics->host_match(xics_ics, node) ? 1 : 0;
 }
 
 /* Dummies */
@@ -330,8 +321,6 @@ static struct irq_chip xics_ipi_chip = {
 static int xics_host_map(struct irq_domain *h, unsigned int virq,
 irq_hw_number_t hw)
 {
-   struct ics *ics;
-
pr_devel("xics: map virq %d, hwirq 0x%lx\n", virq, hw);
 
/*
@@ -348,12 +337,14 @@ static int xics_host_map(struct irq_domain *h, unsigned 
int virq,
return 0;
}
 
+   if (WARN_ON(!xics_ics))
+   return -EINVAL;
+
/* Let the ICS setup the chip data */
-   list_for_each_entry(ics, _list, link)
-   if (ics->map(ics, virq) == 0)
-   return 0;
+   if (xics_ics->map(xics_ics, virq))
+   return -EINVAL;
 
-   return -EINVAL;
+   return 0;
 }
 
 static int xics_host_xlate(struct irq_domain *h, struct device_node *ct,
@@ -427,7 +418,9 @@ static void __init xics_init_host(void)
 
 void __init xics_register_ics(struct ics *ics)
 {
-   list_add(>link, _list);
+   if (WARN_ONCE(xics_ics, "XICS: Source Controller is already defined !"))
+   return;
+   xics_ics = ics;
 }
 
 static void __init xics_get_server_size(void)
-- 
2.26.3



[PATCH 14/31] KVM: PPC: Book3S HV: XIVE: Change interface of passthrough interrupt routines

2021-04-30 Thread Cédric Le Goater
The routines kvmppc_set_passthru_irq() and kvmppc_clr_passthru_irq()
call kvmppc_xive_set_mapped() and kvmppc_xive_clr_mapped() with an IRQ
descriptor. Use the host IRQ number directly to remove a useless
conversion.

Add some debug output.

Cc: Paul Mackerras 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/kvm_ppc.h |  4 ++--
 arch/powerpc/kvm/book3s_hv.c   |  4 ++--
 arch/powerpc/kvm/book3s_xive.c | 17 -
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 8aacd76bb702..d6c52a0ec687 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -663,9 +663,9 @@ extern int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu);
 extern void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu);
 extern int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
- struct irq_desc *host_desc);
+ unsigned long host_irq);
 extern int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
- struct irq_desc *host_desc);
+ unsigned long host_irq);
 extern u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 86a0f8b0e6da..9f4eb74a11cc 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5196,7 +5196,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
pimap->n_mapped++;
 
if (xics_on_xive())
-   rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc);
+   rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq);
else
kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
if (rc)
@@ -5237,7 +5237,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
}
 
if (xics_on_xive())
-   rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, 
pimap->mapped[i].desc);
+   rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, host_irq);
else
kvmppc_xics_clr_mapped(kvm, guest_gsi, 
pimap->mapped[i].r_hwirq);
 
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index e7219b6f5f9a..3a7da42bed57 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -856,13 +856,12 @@ int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
 }
 
 int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
-  struct irq_desc *host_desc)
+  unsigned long host_irq)
 {
struct kvmppc_xive *xive = kvm->arch.xive;
struct kvmppc_xive_src_block *sb;
struct kvmppc_xive_irq_state *state;
-   struct irq_data *host_data = irq_desc_get_irq_data(host_desc);
-   unsigned int host_irq = irq_desc_get_irq(host_desc);
+   struct irq_data *host_data = irq_get_irq_data(host_irq);
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data);
u16 idx;
u8 prio;
@@ -871,7 +870,8 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long 
guest_irq,
if (!xive)
return -ENODEV;
 
-   pr_devel("set_mapped girq 0x%lx host HW irq 0x%x...\n",guest_irq, 
hw_irq);
+   pr_debug("%s: GIRQ 0x%lx host IRQ %ld XIVE HW IRQ 0x%x\n",
+__func__, guest_irq, host_irq, hw_irq);
 
sb = kvmppc_xive_find_source(xive, guest_irq, );
if (!sb)
@@ -893,7 +893,7 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long 
guest_irq,
 */
rc = irq_set_vcpu_affinity(host_irq, state);
if (rc) {
-   pr_err("Failed to set VCPU affinity for irq %d\n", host_irq);
+   pr_err("Failed to set VCPU affinity for host IRQ %ld\n", 
host_irq);
return rc;
}
 
@@ -953,12 +953,11 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long 
guest_irq,
 EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped);
 
 int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
-  struct irq_desc *host_desc)
+  unsigned long host_irq)
 {
struct kvmppc_xive *xive = kvm->arch.xive;
struct kvmppc_xive_src_block *sb;
struct kvmppc_xive_irq_state *state;
-   unsigned int host_irq = irq_desc_get_irq(host_desc);
u16 idx;
u8 prio;
int rc;
@@ -966,7 +965,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long 
guest_irq,
if (!xive)
return -ENODEV;
 
-   pr_devel("clr_mapped girq 0x%lx...\n", guest_irq);
+   pr_debug("%s: GIRQ 0x%lx host IRQ %ld\n"

[PATCH 12/31] powerpc/powernv/pci: Add MSI domains

2021-04-30 Thread Cédric Le Goater
This is very similar to the MSI domains of the pSeries platform. The
MSI allocator is directly handled under the Linux PHB in the
in-the-middle "MSI" domain.

Only the XIVE (P9/P10) parent domain is supported for now. We still
need to add support for IRQ domain hierarchy under XICS.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 188 ++
 1 file changed, 188 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index b2a8da6114b5..3886ca6e2ed3 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -2244,6 +2245,189 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, 
struct pci_dev *dev,
return 0;
 }
 
+/*
+ * The msi_free() op is called before irq_domain_free_irqs_top() when
+ * the handler data is still available. Use that to clear the XIVE
+ * controller.
+ */
+static void pnv_msi_ops_msi_free(struct irq_domain *domain,
+struct msi_domain_info *info,
+unsigned int irq)
+{
+   if (xive_enabled())
+   xive_irq_free_data(irq);
+}
+
+static struct msi_domain_ops pnv_pci_msi_domain_ops = {
+   .msi_free   = pnv_msi_ops_msi_free,
+};
+
+static void pnv_msi_shutdown(struct irq_data *d)
+{
+   d = d->parent_data;
+   if (d->chip->irq_shutdown)
+   d->chip->irq_shutdown(d);
+}
+
+static void pnv_msi_mask(struct irq_data *d)
+{
+   pci_msi_mask_irq(d);
+   irq_chip_mask_parent(d);
+}
+
+static void pnv_msi_unmask(struct irq_data *d)
+{
+   pci_msi_unmask_irq(d);
+   irq_chip_unmask_parent(d);
+}
+
+static struct irq_chip pnv_pci_msi_irq_chip = {
+   .name   = "PNV-PCI-MSI",
+   .irq_shutdown   = pnv_msi_shutdown,
+   .irq_mask   = pnv_msi_mask,
+   .irq_unmask = pnv_msi_unmask,
+   .irq_eoi= irq_chip_eoi_parent,
+};
+
+static struct msi_domain_info pnv_msi_domain_info = {
+   .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_MULTI_PCI_MSI  | MSI_FLAG_PCI_MSIX),
+   .ops   = _pci_msi_domain_ops,
+   .chip  = _pci_msi_irq_chip,
+};
+
+static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg)
+{
+   struct msi_desc *entry = irq_data_get_msi_desc(d);
+   struct pci_dev *pdev = msi_desc_to_pci_dev(entry);
+   struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+   struct pnv_phb *phb = hose->private_data;
+   int rc;
+
+   rc = __pnv_pci_ioda_msi_setup(phb, pdev, d->hwirq,
+ entry->msi_attrib.is_64, msg);
+   if (rc)
+   dev_err(>dev, "Failed to setup %s-bit MSI #%ld : %d\n",
+   entry->msi_attrib.is_64 ? "64" : "32", d->hwirq, rc);
+}
+
+static struct irq_chip pnv_msi_irq_chip = {
+   .name   = "PNV-MSI",
+   .irq_shutdown   = pnv_msi_shutdown,
+   .irq_mask   = irq_chip_mask_parent,
+   .irq_unmask = irq_chip_unmask_parent,
+   .irq_eoi= irq_chip_eoi_parent,
+   .irq_set_affinity   = irq_chip_set_affinity_parent,
+   .irq_compose_msi_msg= pnv_msi_compose_msg,
+};
+
+static int pnv_irq_parent_domain_alloc(struct irq_domain *domain,
+  unsigned int virq, int hwirq)
+{
+   struct irq_fwspec parent_fwspec;
+   int ret;
+
+   parent_fwspec.fwnode = domain->parent->fwnode;
+   parent_fwspec.param_count = 2;
+   parent_fwspec.param[0] = hwirq;
+   parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING;
+
+   ret = irq_domain_alloc_irqs_parent(domain, virq, 1, _fwspec);
+   if (ret)
+   return ret;
+
+   return 0;
+}
+
+static int pnv_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+   unsigned int nr_irqs, void *arg)
+{
+   struct pci_controller *hose = domain->host_data;
+   struct pnv_phb *phb = hose->private_data;
+   msi_alloc_info_t *info = arg;
+   struct pci_dev *pdev = msi_desc_to_pci_dev(info->desc);
+   int hwirq;
+   int i, ret;
+
+   hwirq = msi_bitmap_alloc_hwirqs(>msi_bmp, nr_irqs);
+   if (hwirq < 0) {
+   dev_warn(>dev, "failed to find a free MSI\n");
+   return -ENOSPC;
+   }
+
+   dev_dbg(>dev, "%s bridge %pOF %d/%x #%d\n", __func__,
+   hose->dn, virq, hwirq, nr_irqs);
+
+   for (i = 0; i < nr_irqs; i++) {
+   ret = pnv_irq_parent_domain_alloc(domain, virq + i,
+ phb->msi_base

[PATCH 11/31] powerpc/powernv/pci: Introduce __pnv_pci_ioda_msi_setup()

2021-04-30 Thread Cédric Le Goater
It will be used as a 'compose_msg' handler of the MSI domain
introduced later.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 28 +++
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index f0f901683a2f..b2a8da6114b5 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2160,15 +2160,17 @@ bool is_pnv_opal_msi(struct irq_chip *chip)
 }
 EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
 
-static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
- unsigned int hwirq, unsigned int virq,
- unsigned int is_64, struct msi_msg *msg)
+static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+   unsigned int xive_num,
+   unsigned int is_64, struct msi_msg *msg)
 {
struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
-   unsigned int xive_num = hwirq - phb->msi_base;
__be32 data;
int rc;
 
+   dev_dbg(>dev, "%s: setup %s-bit MSI for vector #%d\n", __func__,
+   is_64 ? "64" : "32", xive_num);
+
/* No PE assigned ? bail out ... no MSI for you ! */
if (pe == NULL)
return -ENXIO;
@@ -2216,12 +2218,28 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, 
struct pci_dev *dev,
}
msg->data = be32_to_cpu(data);
 
+   return 0;
+}
+
+static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+ unsigned int hwirq, unsigned int virq,
+ unsigned int is_64, struct msi_msg *msg)
+{
+   struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
+   unsigned int xive_num = hwirq - phb->msi_base;
+   int rc;
+
+   rc = __pnv_pci_ioda_msi_setup(phb, dev, xive_num, is_64, msg);
+   if (rc)
+   return rc;
+
+   /* P8 only */
pnv_set_msi_irq_chip(phb, virq);
 
pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
 " address=%x_%08x data=%x PE# %x\n",
 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
-msg->address_hi, msg->address_lo, data, pe->pe_number);
+msg->address_hi, msg->address_lo, msg->data, pe->pe_number);
 
return 0;
 }
-- 
2.26.3



[PATCH 00/31] powerpc: Modernize the PCI/MSI support

2021-04-30 Thread Cédric Le Goater
Hello,

This series adds support for MSI IRQ domains on top of the XICS (P8)
and XIVE (P9/P10) IRQ domains for the PowerNV (baremetal) and pSeries
(VM) platforms. It should greatly improve the IRQ affinity of PCI MSIs
on these PowerPC platforms. Data locality can still be improved
with a machine IRQ domain per chip but this requires FW changes.

The patchset has a large impact but it is well contained under the MSI
support. Initial tests were done on the P8, P9 and P10 PowerNV and
pSeries platforms, under the KVM and PowerVM hypervisor. PCI passthrough
was tested on P8/KVM, P9/KVM and P9/pVM.

P8 passthrough adds an optimization to EOI MSIs when under real mode,
but I didn't see any performance improvements with a passthrough 10G
Ethernet adapter. If someone has faster adapters, I would be interested
in the results.

The P8/CAPI driver is also impacted. Tests were done on a Firestone
system with a memory AFU.

Thanks,

C.

Cédric Le Goater (31):
  powerpc/pseries/pci: Introduce __find_pe_total_msi()
  powerpc/pseries/pci: Introduce rtas_prepare_msi_irqs()
  powerpc/xive: Add support for IRQ domain hierarchy
  powerpc/xive: Ease debugging of xive_irq_set_affinity()
  powerpc/pseries/pci: Add MSI domains
  powerpc/xive: Drop unmask of MSIs at startup
  powerpc/xive: Fix xive_irq_set_affinity for MSI
  powerpc/pseries/pci: Add a domain_free_irqs handler
  powerpc/pseries/pci: Add a msi_free() handler to clear XIVE data
  powerpc/pseries/pci: Add support of MSI domains to PHB hotplug
  powerpc/powernv/pci: Introduce __pnv_pci_ioda_msi_setup()
  powerpc/powernv/pci: Add MSI domains
  KVM: PPC: Book3S HV: Use the new IRQ chip to detect passthrough
interrupts
  KVM: PPC: Book3S HV: XIVE: Change interface of passthrough interrupt
routines
  KVM: PPC: Book3S HV: XIVE: Fix mapping of passthrough interrupts
  powerpc/xics: Remove ICS list
  powerpc/xics: Rename the map handler in a check handler
  powerpc/xics: Give a name to the default XICS IRQ domain
  powerpc/xics: Add debug logging to the set_irq_affinity handlers
  powerpc/xics: Add support for IRQ domain hierarchy
  powerpc/powernv/pci: Customize the MSI EOI handler to support PHB3
  powerpc/pci: Drop XIVE restriction on MSI domains
  powerpc/xics: Drop unmask of MSIs at startup
  powerpc/pseries/pci: Drop unused MSI code
  powerpc/powernv/pci: Drop unused MSI code
  powerpc/powernv/pci: Adapt is_pnv_opal_msi() to detect passthrough
interrupt
  powerpc/xics: Fix IRQ migration
  powerpc/powernv/pci: Set the IRQ chip data for P8/CXL devices
  powerpc/powernv/pci: Rework pnv_opal_pci_msi_eoi()
  KVM: PPC: Book3S HV: XICS: Fix mapping of passthrough interrupts
  genirq: Improve "hwirq" output in /proc and /sys/

 arch/powerpc/include/asm/kvm_ppc.h |   4 +-
 arch/powerpc/include/asm/pci-bridge.h  |   5 +
 arch/powerpc/include/asm/pnv-pci.h |   2 +-
 arch/powerpc/include/asm/xics.h|   3 +-
 arch/powerpc/include/asm/xive.h|   1 +
 arch/powerpc/platforms/powernv/pci.h   |   6 -
 arch/powerpc/platforms/pseries/pseries.h   |   2 +
 arch/powerpc/kernel/pci-common.c   |   6 +
 arch/powerpc/kvm/book3s_hv.c   |  18 +-
 arch/powerpc/kvm/book3s_hv_rm_xics.c   |   8 +-
 arch/powerpc/kvm/book3s_xive.c |  18 +-
 arch/powerpc/platforms/powernv/pci-ioda.c  | 258 --
 arch/powerpc/platforms/powernv/pci.c   |  67 -
 arch/powerpc/platforms/pseries/msi.c   | 296 -
 arch/powerpc/platforms/pseries/pci_dlpar.c |   4 +
 arch/powerpc/platforms/pseries/setup.c |   2 +
 arch/powerpc/sysdev/xics/ics-opal.c|  40 +--
 arch/powerpc/sysdev/xics/ics-rtas.c|  40 +--
 arch/powerpc/sysdev/xics/xics-common.c | 125 ++---
 arch/powerpc/sysdev/xive/common.c  |  81 +-
 kernel/irq/irqdesc.c   |   2 +-
 kernel/irq/irqdomain.c |   1 +
 kernel/irq/proc.c  |   2 +-
 23 files changed, 693 insertions(+), 298 deletions(-)

-- 
2.26.3



[PATCH 04/31] powerpc/xive: Ease debugging of xive_irq_set_affinity()

2021-04-30 Thread Cédric Le Goater
pr_debug() is easier to activate and it helps to know how the HW is
configured when tweaking the IRQ subsystem.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 6ad26243bc33..9cb7ae728b46 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -713,7 +713,7 @@ static int xive_irq_set_affinity(struct irq_data *d,
u32 target, old_target;
int rc = 0;
 
-   pr_devel("xive_irq_set_affinity: irq %d\n", d->irq);
+   pr_debug("%s: irq %d/%x\n", __func__, d->irq, hw_irq);
 
/* Is this valid ? */
if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids)
@@ -758,7 +758,7 @@ static int xive_irq_set_affinity(struct irq_data *d,
return rc;
}
 
-   pr_devel("  target: 0x%x\n", target);
+   pr_debug("  target: 0x%x\n", target);
xd->target = target;
 
/* Give up previous target */
-- 
2.26.3



[PATCH 03/31] powerpc/xive: Add support for IRQ domain hierarchy

2021-04-30 Thread Cédric Le Goater
This adds handlers to allocate/free IRQs in a domain hierarchy. We
could try to use xive_irq_domain_map() in xive_irq_domain_alloc() but
we rely on xive_irq_alloc_data() to set the IRQ handler data and
duplicating the code is simpler.

xive_irq_free_data() needs to be called when IRQs are freed to clear
the MMIO mappings and free the XIVE handler data (the xive_irq_data
structure). This is going to be a problem with MSI domains which we
will address later.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 60 +++
 1 file changed, 60 insertions(+)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 5acd76403ee7..6ad26243bc33 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1372,7 +1372,67 @@ static void xive_irq_domain_debug_show(struct seq_file 
*m, struct irq_domain *d,
 }
 #endif
 
+static int xive_irq_domain_translate(struct irq_domain *d,
+struct irq_fwspec *fwspec,
+unsigned long *hwirq,
+unsigned int *type)
+{
+   return xive_irq_domain_xlate(d, to_of_node(fwspec->fwnode),
+fwspec->param, fwspec->param_count,
+hwirq, type);
+}
+
+static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+unsigned int nr_irqs, void *arg)
+{
+   struct irq_fwspec *fwspec = arg;
+   irq_hw_number_t hwirq;
+   unsigned int type = IRQ_TYPE_NONE;
+   int i, rc;
+
+   rc = xive_irq_domain_translate(domain, fwspec, , );
+   if (rc)
+   return rc;
+
+   pr_debug("%s %d/%lx #%d\n", __func__, virq, hwirq, nr_irqs);
+
+   for (i = 0; i < nr_irqs; i++) {
+   /* TODO: call xive_irq_domain_map() */
+
+   /*
+* Mark interrupts as edge sensitive by default so that resend
+* actually works. Will fix that up below if needed.
+*/
+   irq_clear_status_flags(virq, IRQ_LEVEL);
+
+   /* allocates and sets handler data */
+   rc = xive_irq_alloc_data(virq + i, hwirq + i);
+   if (rc)
+   return rc;
+
+   irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
+ _irq_chip, 
domain->host_data);
+   irq_set_handler(virq + i, handle_fasteoi_irq);
+   }
+
+   return 0;
+}
+
+static void xive_irq_domain_free(struct irq_domain *domain,
+unsigned int virq, unsigned int nr_irqs)
+{
+   int i;
+
+   pr_debug("%s %d #%d\n", __func__, virq, nr_irqs);
+
+   for (i = 0; i < nr_irqs; i++)
+   xive_irq_free_data(virq + i);
+}
+
 static const struct irq_domain_ops xive_irq_domain_ops = {
+   .alloc  = xive_irq_domain_alloc,
+   .free   = xive_irq_domain_free,
+   .translate = xive_irq_domain_translate,
.match = xive_irq_domain_match,
.map = xive_irq_domain_map,
.unmap = xive_irq_domain_unmap,
-- 
2.26.3



[PATCH 06/31] powerpc/xive: Drop unmask of MSIs at startup

2021-04-30 Thread Cédric Le Goater
That was a workaround in the XIVE domain because of the lack of an MSI
domain. This is now handled.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 10 --
 1 file changed, 10 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 9cb7ae728b46..96737938e8e3 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -616,16 +616,6 @@ static unsigned int xive_irq_startup(struct irq_data *d)
pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n",
 d->irq, hw_irq, d);
 
-#ifdef CONFIG_PCI_MSI
-   /*
-* The generic MSI code returns with the interrupt disabled on the
-* card, using the MSI mask bits. Firmware doesn't appear to unmask
-* at that level, so we do it here by hand.
-*/
-   if (irq_data_get_msi_desc(d))
-   pci_msi_unmask_irq(d);
-#endif
-
/* Pick a target */
target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d));
if (target == XIVE_INVALID_TARGET) {
-- 
2.26.3



Re: [PATCH v1] KVM: PPC: Book3S HV P9: implement kvmppc_xive_pull_vcpu in C

2021-04-21 Thread Cédric Le Goater
On 4/13/21 3:38 PM, Nicholas Piggin wrote:
> This is more symmetric with kvmppc_xive_push_vcpu, and has the advantage
> that it runs with the MMU on.
> 
> The extra test added to the asm will go away with a future change.
> 
> Reviewed-by: Cédric Le Goater 
> Reviewed-by: Alexey Kardashevskiy 
> Signed-off-by: Nicholas Piggin 
> ---
> Another bit that came from the KVM Cify series.
> 
> Thanks,
> Nick
> 
>  arch/powerpc/include/asm/kvm_ppc.h  |  2 ++
>  arch/powerpc/kvm/book3s_hv.c|  2 ++
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S |  5 
>  arch/powerpc/kvm/book3s_xive.c  | 31 +
>  4 files changed, 40 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
> b/arch/powerpc/include/asm/kvm_ppc.h
> index 9531b1c1b190..73b1ca5a6471 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -672,6 +672,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 
> icpval);
>  extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
>  int level, bool line_status);
>  extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
> +extern void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu);
>  
>  static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
>  {
> @@ -712,6 +713,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu 
> *vcpu, u64 icpval) { retur
>  static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, 
> u32 irq,
> int level, bool line_status) { return 
> -ENODEV; }
>  static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
> +static inline void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) { }
>  
>  static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
>   { return 0; }
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 4a532410e128..981bcaf787a8 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -3570,6 +3570,8 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
> *vcpu, u64 time_limit,
>  
>   trap = __kvmhv_vcpu_entry_p9(vcpu);
>  
> + kvmppc_xive_pull_vcpu(vcpu);
> +
>   /* Advance host PURR/SPURR by the amount used by guest */
>   purr = mfspr(SPRN_PURR);
>   spurr = mfspr(SPRN_SPURR);
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 75405ef53238..c11597f815e4 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -1442,6 +1442,11 @@ guest_exit_cont:   /* r9 = vcpu, r12 = 
> trap, r13 = paca */
>   bl  kvmhv_accumulate_time
>  #endif
>  #ifdef CONFIG_KVM_XICS
> + /* If we came in through the P9 short path, xive pull is done in C */
> + lwz r0, STACK_SLOT_SHORT_PATH(r1)
> + cmpwi   r0, 0
> + bne 1f
> +
>   /* We are exiting, pull the VP from the XIVE */
>   lbz r0, VCPU_XIVE_PUSHED(r9)
>   cmpwi   cr0, r0, 0
> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> index e7219b6f5f9a..741bf1f4387a 100644
> --- a/arch/powerpc/kvm/book3s_xive.c
> +++ b/arch/powerpc/kvm/book3s_xive.c
> @@ -127,6 +127,37 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
>  }
>  EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
>  
> +/*
> + * Pull a vcpu's context from the XIVE on guest exit.
> + * This assumes we are in virtual mode (MMU on)
> + */
> +void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
> +{
> + void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
> +
> + if (!vcpu->arch.xive_pushed)
> + return;
> +
> + /*
> +  * Should not have been pushed if there is no tima
> +  */
> + if (WARN_ON(!tima))
> + return;
> +
> + eieio();
> + /* First load to pull the context, we ignore the value */
> + __raw_readl(tima + TM_SPC_PULL_OS_CTX);
> + /* Second load to recover the context state (Words 0 and 1) */
> + vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS);

This load could be removed on P10, since HW is configured to do the same.
It should save a few cycles.

C. 

> + /* Fixup some of the state for the next load */
> + vcpu->arch.xive_saved_state.lsmfb = 0;
> + vcpu->arch.xive_saved_state.ack = 0xff;
> + vcpu->arch.xive_pushed = 0;
> + eieio();
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);
> +
>  /*
>   * This is a simple trigger for a generic XIVE IRQ. This must
>   * only be called for interrupts that support a trigger page
> 



[PATCH] powerpc/xive: Use the "ibm, chip-id" property only under PowerNV

2021-04-13 Thread Cédric Le Goater
The 'chip_id' field of the XIVE CPU structure is used to choose a
target for a source located on the same chip. For that, the XIVE
driver queries the chip identifier from the "ibm,chip-id" property
and compares it to a 'src_chip' field identifying the chip of a
source. This information is only available on the PowerNV platform,
'src_chip' being assigned to XIVE_INVALID_CHIP_ID under pSeries.

The "ibm,chip-id" property is also not available on all platforms. It
was first introduced on PowerNV and later under QEMU for pSeries/KVM.
However, the property is not part of PAPR and does not exist under
pSeries/PowerVM.

Assign 'chip_id' to XIVE_INVALID_CHIP_ID by default and let the
PowerNV platform override the value with the "ibm,chip-id" property.

Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/xive-internal.h | 1 +
 arch/powerpc/sysdev/xive/common.c| 9 +++--
 arch/powerpc/sysdev/xive/native.c| 6 ++
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/xive-internal.h 
b/arch/powerpc/sysdev/xive/xive-internal.h
index b3a456fdd3a5..504e7edce358 100644
--- a/arch/powerpc/sysdev/xive/xive-internal.h
+++ b/arch/powerpc/sysdev/xive/xive-internal.h
@@ -44,6 +44,7 @@ struct xive_ops {
  u32 *sw_irq);
int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
void(*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 
prio);
+   void(*prepare_cpu)(unsigned int cpu, struct xive_cpu *xc);
void(*setup_cpu)(unsigned int cpu, struct xive_cpu *xc);
void(*teardown_cpu)(unsigned int cpu, struct xive_cpu *xc);
bool(*match)(struct device_node *np);
diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 587738ec4229..5acd76403ee7 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1414,17 +1414,14 @@ static int xive_prepare_cpu(unsigned int cpu)
 
xc = per_cpu(xive_cpu, cpu);
if (!xc) {
-   struct device_node *np;
-
xc = kzalloc_node(sizeof(struct xive_cpu),
  GFP_KERNEL, cpu_to_node(cpu));
if (!xc)
return -ENOMEM;
-   np = of_get_cpu_node(cpu, NULL);
-   if (np)
-   xc->chip_id = of_get_ibm_chip_id(np);
-   of_node_put(np);
xc->hw_ipi = XIVE_BAD_IRQ;
+   xc->chip_id = XIVE_INVALID_CHIP_ID;
+   if (xive_ops->prepare_cpu)
+   xive_ops->prepare_cpu(cpu, xc);
 
per_cpu(xive_cpu, cpu) = xc;
}
diff --git a/arch/powerpc/sysdev/xive/native.c 
b/arch/powerpc/sysdev/xive/native.c
index 1bb84febbaee..4fcd2dd1de71 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -382,6 +382,11 @@ static void xive_native_update_pending(struct xive_cpu *xc)
}
 }
 
+static void xive_native_prepare_cpu(unsigned int cpu, struct xive_cpu *xc)
+{
+   xc->chip_id = cpu_to_chip_id(cpu);
+}
+
 static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc)
 {
s64 rc;
@@ -464,6 +469,7 @@ static const struct xive_ops xive_native_ops = {
.match  = xive_native_match,
.shutdown   = xive_native_shutdown,
.update_pending = xive_native_update_pending,
+   .prepare_cpu= xive_native_prepare_cpu,
.setup_cpu  = xive_native_setup_cpu,
.teardown_cpu   = xive_native_teardown_cpu,
.sync_source= xive_native_sync_source,
-- 
2.26.3



Re: [PATCH v5 41/48] KVM: PPC: Book3S HV: Remove unused nested HV tests in XICS emulation

2021-04-02 Thread Cédric Le Goater
On 4/1/21 5:03 PM, Nicholas Piggin wrote:
> Commit f3c18e9342a44 ("KVM: PPC: Book3S HV: Use XICS hypercalls when
> running as a nested hypervisor") added nested HV tests in XICS
> hypercalls, but not all are required.
> 
> * icp_eoi is only called by kvmppc_deliver_irq_passthru which is only
>   called by kvmppc_check_passthru which is only called by
>   kvmppc_read_one_intr.
> 
> * kvmppc_read_one_intr is only called by kvmppc_read_intr which is only
>   called by the L0 HV rmhandlers code.
> 
> * kvmhv_rm_send_ipi is called by:
>   - kvmhv_interrupt_vcore which is only called by kvmhv_commence_exit
> which is only called by the L0 HV rmhandlers code.
>   - icp_send_hcore_msg which is only called by icp_rm_set_vcpu_irq.
>   - icp_rm_set_vcpu_irq which is only called by icp_rm_try_update
>   - icp_rm_set_vcpu_irq is not nested HV safe because it writes to
> LPCR directly without a kvmhv_on_pseries test. Nested handlers
> should not in general be using the rm handlers.
> 
> The important test seems to be in kvmppc_ipi_thread, which sends the
> virt-mode H_IPI handler kick to use smp_call_function rather than
> msgsnd.
> 
> Cc: Cédric Le Goater 
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/kvm/book3s_hv_builtin.c | 44 +---
>  arch/powerpc/kvm/book3s_hv_rm_xics.c | 15 --
>  2 files changed, 8 insertions(+), 51 deletions(-)

So, now, the L1 is not limited to XICS anymore and we can use the XIVE 
native interrupt mode with an L2 using XICS in KVM or XIVE in QEMU.
Is that correct ?   

It seems you removed all the XICS hcalls under kvmhv_on_pseries().

C.

 
> diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c 
> b/arch/powerpc/kvm/book3s_hv_builtin.c
> index 8d669a0e15f8..259492bb4153 100644
> --- a/arch/powerpc/kvm/book3s_hv_builtin.c
> +++ b/arch/powerpc/kvm/book3s_hv_builtin.c
> @@ -199,15 +199,6 @@ void kvmhv_rm_send_ipi(int cpu)
>   void __iomem *xics_phys;
>   unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
>  
> - /* For a nested hypervisor, use the XICS via hcall */
> - if (kvmhv_on_pseries()) {
> - unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
> -
> - plpar_hcall_raw(H_IPI, retbuf, get_hard_smp_processor_id(cpu),
> - IPI_PRIORITY);
> - return;
> - }
> -
>   /* On POWER9 we can use msgsnd for any destination cpu. */
>   if (cpu_has_feature(CPU_FTR_ARCH_300)) {
>   msg |= get_hard_smp_processor_id(cpu);
> @@ -420,19 +411,12 @@ static long kvmppc_read_one_intr(bool *again)
>   return 1;
>  
>   /* Now read the interrupt from the ICP */
> - if (kvmhv_on_pseries()) {
> - unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
> -
> - rc = plpar_hcall_raw(H_XIRR, retbuf, 0xFF);
> - xirr = cpu_to_be32(retbuf[0]);
> - } else {
> - xics_phys = local_paca->kvm_hstate.xics_phys;
> - rc = 0;
> - if (!xics_phys)
> - rc = opal_int_get_xirr(&xirr, false);
> - else
> - xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
> - }
> + xics_phys = local_paca->kvm_hstate.xics_phys;
> + rc = 0;
> + if (!xics_phys)
> + rc = opal_int_get_xirr(&xirr, false);
> + else
> + xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
>   if (rc < 0)
>   return 1;
>  
> @@ -461,13 +445,7 @@ static long kvmppc_read_one_intr(bool *again)
>*/
>   if (xisr == XICS_IPI) {
>   rc = 0;
> - if (kvmhv_on_pseries()) {
> - unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
> -
> - plpar_hcall_raw(H_IPI, retbuf,
> - hard_smp_processor_id(), 0xff);
> - plpar_hcall_raw(H_EOI, retbuf, h_xirr);
> - } else if (xics_phys) {
> + if (xics_phys) {
>   __raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
>   __raw_rm_writel(xirr, xics_phys + XICS_XIRR);
>   } else {
> @@ -493,13 +471,7 @@ static long kvmppc_read_one_intr(bool *again)
>   /* We raced with the host,
>* we need to resend that IPI, bummer
>*/
> - if (kvmhv_on_pseries()) {
> - unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
> -
> - plpar_hcall_raw(H_IPI, retbuf,
> - hard_smp_processor_id(),
> - IPI_PRIORITY);
> -  

Re: [PATCH v3 8/9] powerpc/xive: Map one IPI interrupt per node

2021-04-02 Thread Cédric Le Goater
> I gave the patch below a try and we are reaching the same results, 
> even better. The simplest solution is always the best. Nick, you 
> should send that single patch.

FYI, here are results in a KVM guests with pinned vCPUs.

 * P9 DD2.2 - 2s * 64 threads - KVM guest :


IPI/sys   IPI/chip
 ---      

   unhandled  unhandled 

 chips  cpus   noirqdebug  detection  noirqdebug  detection 

 ---      

 1  0-15 4.152813   4.084240   4.061028 4.097700   4.042539   
4.008314
0-31 8.186328   8.157970   7.937127 8.277942   8.019539   
7.831189
0-4711.391635  11.232960  11.01753011.278048  10.994501  
10.889347
0-6313.907476  14.022307  11.46028013.933946  13.506828  
11.369188
 2  0-7918.105276  18.084463   8.37604718.069176  17.587916  
15.477006
0-9522.100683  22.265763   7.33822922.084006  21.588463  
19.502192
0-111   25.305948  25.473068   6.71623525.429261  24.607570  
22.733253
0-127   27.814449  28.029029   6.22273627.960119  27.253432  
23.884916

  

The three columns "IPI/chip" are results with this series. "IPI/sys" are 
without. The "unhandled detection" columns are with Nick's patch.

C. 




Re: [PATCH v3 8/9] powerpc/xive: Map one IPI interrupt per node

2021-04-02 Thread Cédric Le Goater
On 4/1/21 2:50 PM, Nicholas Piggin wrote:
> Excerpts from Cédric Le Goater's message of April 1, 2021 12:45 am:
>> ipistorm [*] can be used to benchmark the raw interrupt rate of an
>> interrupt controller by measuring the number of IPIs a system can
>> sustain. When applied to the XIVE interrupt controller of POWER9 and
>> POWER10 systems, a significant drop of the interrupt rate can be
>> observed when crossing the second node boundary.
>>
>> This is due to the fact that a single IPI interrupt is used for all
>> CPUs of the system. The structure is shared and the cache line updates
>> impact greatly the traffic between nodes and the overall IPI
>> performance.
>>
>> As a workaround, the impact can be reduced by deactivating the IRQ
>> lockup detector ("noirqdebug") which does a lot of accounting in the
>> Linux IRQ descriptor structure and is responsible for most of the
>> performance penalty.
>>
>> As a fix, this proposal allocates an IPI interrupt per node, to be
>> shared by all CPUs of that node. It solves the scaling issue, the IRQ
>> lockup detector still has an impact but the XIVE interrupt rate scales
>> linearly. It also improves the "noirqdebug" case as showed in the
>> tables below.
>>
>>  * P9 DD2.2 - 2s * 64 threads
>>
>>"noirqdebug"
>> Mint/sMint/s
>>  chips  cpus  IPI/sys   IPI/chip   IPI/chipIPI/sys
>>  --
>>  1  0-15 4.984023   4.875405   4.996536   5.048892
>> 0-3110.879164  10.544040  10.757632  11.037859
>> 0-4715.345301  14.688764  14.926520  15.310053
>> 0-6317.064907  17.066812  17.613416  17.874511
>>  2  0-7911.768764  21.650749  22.689120  22.566508
>> 0-9510.616812  26.878789  28.434703  28.320324
>> 0-111   10.151693  31.397803  31.771773  32.388122
>> 0-1279.948502  33.139336  34.875716  35.224548
>>
>>  * P10 DD1 - 4s (not homogeneous) 352 threads
>>
>>"noirqdebug"
>> Mint/sMint/s
>>  chips  cpus  IPI/sys   IPI/chip   IPI/chipIPI/sys
>>  --
>>  1  0-15 2.409402   2.364108   2.383303   2.395091
>> 0-31 6.028325   6.046075   6.08   6.073750
>> 0-47 8.655178   8.644531   8.712830   8.724702
>> 0-6311.629652  11.735953  12.088203  12.055979
>> 0-7914.392321  14.729959  14.986701  14.973073
>> 0-9512.604158  13.004034  17.528748  17.568095
>>  2  0-1119.767753  13.719831  19.968606  20.024218
>> 0-1276.744566  16.418854  22.898066  22.995110
>> 0-1436.005699  19.174421  25.425622  25.417541
>> 0-1595.649719  21.938836  27.952662  28.059603
>> 0-1755.441410  24.109484  31.133915  31.127996
>>  3  0-1915.318341  24.405322  33.999221  33.775354
>> 0-2075.191382  26.449769  36.050161  35.867307
>> 0-2235.102790  29.356943  39.544135  39.508169
>> 0-2395.035295  31.933051  42.135075  42.071975
>> 0-2554.969209  34.477367  44.655395  44.757074
>>  4  0-2714.907652  35.887016  47.080545  47.318537
>> 0-287    4.839581  38.076137  50.464307  50.636219
>> 0-3034.786031  40.881319  53.478684  53.310759
>> 0-3194.743750  43.448424  56.388102  55.973969
>> 0-3354.709936  45.623532  59.400930  58.926857
>> 0-3514.681413  45.646151  62.035804  61.830057
>>
>> [*] https://github.com/antonblanchard/ipistorm
>>
>> Cc: Thomas Gleixner 
>> Signed-off-by: Cédric Le Goater 
> 
> Very nice result but the default-on irqdebug code is quite a slowdown
> even with your improvements.
> 
> Is the main cacheline bouncing in the fast path coming from 
> desc->irq_count++ of the percpu handler? Can we do something quick and 
> dirty like the attached patch?
> 
> All this stuff seems totally racy with percpu handler but maybe that
> doesn't matter too much (and anyway it would be a much bigger change)

I gave the patch below a try and we are reaching the same results, 
even better. The simplest solution is always the best. Nick

Re: [PATCH] powerpc/powernv: Enable HAIL (HV AIL) for ISA v3.1 processors

2021-04-02 Thread Cédric Le Goater
On 4/2/21 4:41 AM, Nicholas Piggin wrote:
> Starting with ISA v3.1, LPCR[AIL] no longer controls the interrupt
> mode for HV=1 interrupts. Instead, a new LPCR[HAIL] bit is defined
> which behaves like AIL=3 for HV interrupts when set.

Will QEMU need an update ? 

Thanks,

C.


> Set HAIL on bare metal to give us mmu-on interrupts and improve
> performance.
> 
> This also fixes an scv bug: we don't implement scv real mode (AIL=0)
> vectors because they are at an inconvenient location, so we just
> disable scv support when AIL can not be set. However powernv assumes
> that LPCR[AIL] will enable AIL mode so it enables scv support despite
> HV interrupts being AIL=0, which causes scv interrupts to go off into
> the weeds.
> 
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/include/asm/reg.h |  1 +
>  arch/powerpc/kernel/setup_64.c | 19 ---
>  2 files changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index 1be20bc8dce2..9086a2644c89 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -441,6 +441,7 @@
>  #define   LPCR_VRMA_LP1  ASM_CONST(0x0000800000000000)
>  #define   LPCR_RMLS  0x1C000000  /* Implementation dependent RMO 
> limit sel */
>  #define   LPCR_RMLS_SH   26
> +#define   LPCR_HAIL  ASM_CONST(0x0000000004000000)   /* HV AIL 
> (ISAv3.1) */
>  #define   LPCR_ILE   ASM_CONST(0x0000000002000000)   /* !HV irqs set 
> MSR:LE */
>  #define   LPCR_AIL   ASM_CONST(0x0000000001800000)   /* Alternate 
> interrupt location */
>  #define   LPCR_AIL_0 ASM_CONST(0x0000000000000000)   /* MMU off 
> exception offset 0x0 */
> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> index 04a31586f760..671192afcdfd 100644
> --- a/arch/powerpc/kernel/setup_64.c
> +++ b/arch/powerpc/kernel/setup_64.c
> @@ -233,10 +233,23 @@ static void cpu_ready_for_interrupts(void)
>* If we are not in hypervisor mode the job is done once for
>* the whole partition in configure_exceptions().
>*/
> - if (cpu_has_feature(CPU_FTR_HVMODE) &&
> - cpu_has_feature(CPU_FTR_ARCH_207S)) {
> + if (cpu_has_feature(CPU_FTR_HVMODE)) {
>   unsigned long lpcr = mfspr(SPRN_LPCR);
> - mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3);
> + unsigned long new_lpcr = lpcr;
> +
> + if (cpu_has_feature(CPU_FTR_ARCH_31)) {
> + /* P10 DD1 does not have HAIL */
> + if (pvr_version_is(PVR_POWER10) &&
> + (mfspr(SPRN_PVR) & 0xf00) == 0x100)
> + new_lpcr |= LPCR_AIL_3;
> + else
> + new_lpcr |= LPCR_HAIL;
> + } else if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
> + new_lpcr |= LPCR_AIL_3;
> + }
> +
> + if (new_lpcr != lpcr)
> + mtspr(SPRN_LPCR, new_lpcr);
>   }
>  
>   /*
> 



Re: [PATCH v3 0/9] powerpc/xive: Map one IPI interrupt per node

2021-04-01 Thread Cédric Le Goater
On 4/1/21 2:45 PM, Greg Kurz wrote:
> On Thu, 1 Apr 2021 11:18:10 +0200
> Cédric Le Goater  wrote:
> 
>> Hello,
>>
>> On 4/1/21 10:04 AM, Greg Kurz wrote:
>>> On Wed, 31 Mar 2021 16:45:05 +0200
>>> Cédric Le Goater  wrote:
>>>
>>>>
>>>> Hello,
>>>>
>>>> ipistorm [*] can be used to benchmark the raw interrupt rate of an
>>>> interrupt controller by measuring the number of IPIs a system can
>>>> sustain. When applied to the XIVE interrupt controller of POWER9 and
>>>> POWER10 systems, a significant drop of the interrupt rate can be
>>>> observed when crossing the second node boundary.
>>>>
>>>> This is due to the fact that a single IPI interrupt is used for all
>>>> CPUs of the system. The structure is shared and the cache line updates
>>>> impact greatly the traffic between nodes and the overall IPI
>>>> performance.
>>>>
>>>> As a workaround, the impact can be reduced by deactivating the IRQ
>>>> lockup detector ("noirqdebug") which does a lot of accounting in the
>>>> Linux IRQ descriptor structure and is responsible for most of the
>>>> performance penalty.
>>>>
>>>> As a fix, this proposal allocates an IPI interrupt per node, to be
>>>> shared by all CPUs of that node. It solves the scaling issue, the IRQ
>>>> lockup detector still has an impact but the XIVE interrupt rate scales
>>>> linearly. It also improves the "noirqdebug" case as showed in the
>>>> tables below. 
>>>>
>>>
>>> As explained by David and others, NUMA nodes happen to match sockets
>>> with current POWER CPUs but these are really different concepts. NUMA
>>> is about CPU memory accesses latency, 
>>
>> This is exactly our problem. we have cache issues because hw threads 
>> on different chips are trying to access the same structure in memory.
>> It happens on virtual platforms and baremetal platforms. This is not
>> restricted to pseries.
>>
> 
> Ok, I get it... the XIVE HW accesses structures in RAM, just like HW threads
> do, so the closer, the better. 

No. That's another problem related to the XIVE internal tables which
should be allocated on the chip where it is "mostly" used. 

The problem is much simpler. As the commit log says : 

 This is due to the fact that a single IPI interrupt is used for all
 CPUs of the system. The structure is shared and the cache line updates
 impact greatly the traffic between nodes and the overall IPI
 performance.

So, we have multiple threads competing for the same IRQ descriptor and 
overloading the PowerBUS with cache update synchronization. 


> This definitely looks NUMA related indeed. So
> yes, the idea of having the XIVE HW to only access local in-RAM data when
> handling IPIs between vCPUs in the same NUMA node makes sense.

yes. That's the goal.
 
> What is less clear is the exact role of ibm,chip-id actually. This is
> currently used on PowerNV only to pick up a default target on the same
> "chip" as the source if possible. What is the detailed motivation behind
> this ?

The "ibm,chip-id" issue is extra noise and not a requirement for this 
patchset.

>>> while in the case of XIVE you
>>> really need to identify a XIVE chip localized in a given socket.
>>>
>>> PAPR doesn't know about sockets, only cores. In other words, a PAPR
>>> compliant guest sees all vCPUs like they all sit in a single socket.
>>
>> There are also NUMA nodes on PAPR.
>>
> 
> Yes but nothing prevents a NUMA node to span over multiple sockets
> or having several NUMA nodes within the same socket, even if this
> isn't the case in practice with current POWER hardware.

yes. A NUMA node could even be a PCI adapter attached to storage. 
I don't know what to say. We are missing a concept maybe.

>>> Same for the XIVE. Trying to introduce a concept of socket, either
>>> by hijacking OPAL's ibm,chip-id or NUMA node ids, is a kind of
>>> spec violation in this context. If the user cares for locality of
>>> the vCPUs and XIVE on the same socket, then it should bind vCPU
>>> threads to host CPUs from the same socket in the first place.
>>
>> Yes. that's a must have of course. You need to reflect the real HW
>> topology in the guest or LPAR if you are after performance, or 
>> restrict the virtual machine to be on a single socket/chip/node.  
>>
>> And this is not only a XIVE problem. XICS has the same problem with
>> a shared single IPI interrupt descriptor but XICS doesn't scale well 
>> by design, so it doesn't show.
>>
>>
>>> Isn't this enough to solve the performance issues this series
>>> want to fix, without the need for virtual socket ids ?
>> what are virtual socket ids ? A new concept ? 
>>
> 
> For now, we have virtual CPUs identified by a virtual CPU id.
> It thus seems natural to speak of a virtual socket id, but
> anyway, the wording isn't really important here and you
> don't answer the question ;-)

if, on the hypervisor, you restrict the virtual machine vCPUs to be 
on a single POWER processor/chip, there is no problem. But large 
KVM guests or PowerVM LPARs do exist on 16s systems.

C.
 


Re: [PATCH v3 1/9] powerpc/xive: Use cpu_to_node() instead of "ibm,chip-id" property

2021-04-01 Thread Cédric Le Goater
On 4/1/21 4:49 AM, David Gibson wrote:
> On Wed, Mar 31, 2021 at 04:45:06PM +0200, Cédric Le Goater wrote:
>> The 'chip_id' field of the XIVE CPU structure is used to choose a
>> target for a source located on the same chip when possible. The XIVE
>> driver queries the chip id value from the "ibm,chip-id" DT property
>> but this property is not available on all platforms. It was first
>> introduced on the PowerNV platform and later, under QEMU for pseries.
>> However, the property does not exist under PowerVM since it is not
>> specified in PAPR.
>>
>> cpu_to_node() is a better alternative. On the PowerNV platform, the
>> node id is computed from the "ibm,associativity" property of the CPU.
>> Its value is built in the OPAL firmware from the physical chip id and
>> is equivalent to "ibm,chip-id".
> 
> Hrm... I mean, for powernv this is certainly correct, but seems to be
> relying on pretty specific specifics of the OPAL / chip behaviour,
> namely that the NUMA id == chip ID.

Yes. It seems so.  

>> On pSeries, the hcall H_HOME_NODE_ASSOCIATIVITY returns the node id.
> 
> AFAICT, the chip_id field is never actually used in the PAPR version
> of XIVE.  The only access to the field outside native.c is in
> xive_pick_irq_target(), and it only looks at chip_id if src_chip is
> valid.  

Yes.

> But src_chip is initialized to XIVE_INVALID_CHIP_ID in papr.c

Yes. The H_INT hcalls do not return any information on the source 
location.

> So it would make more sense to me to also initialize chip_id to
> XIVE_INVALID_CHIP_ID for PAPR to make it clearer that it's not
> relevant.

yes. That would clarify that chip_id is only relevant on PowerNV. 

We can drop this patch, it's not a requirement for patches 2-9, simply 
a cleanup. I will move the chip_id assignment to a platform handler 
in another patch.

>> Also to be noted that under QEMU/KVM "ibm,chip-id" is badly calculated
>> with unusual SMT configuration. This leads to a bogus chip id value
>> being returned by of_get_ibm_chip_id().
> 
> I *still* don't clearly understand what you think is bogus about the
> chip id value that qemu generates.  It's clearly not a problem for
> XIVE, since PAPR XIVE never uses it.

I am getting confused by socket/node/chip concepts under PPC. 

However, when looking at PHB and MSI, there is definitely a "node" 
concept being used in the core IRQ layer for allocation and affinity. 
We will need to clarify that when we introduce MSI domains.  

Thanks,

C.  


Re: [PATCH v3 0/9] powerpc/xive: Map one IPI interrupt per node

2021-04-01 Thread Cédric Le Goater
Hello,

On 4/1/21 10:04 AM, Greg Kurz wrote:
> On Wed, 31 Mar 2021 16:45:05 +0200
> Cédric Le Goater  wrote:
> 
>>
>> Hello,
>>
>> ipistorm [*] can be used to benchmark the raw interrupt rate of an
>> interrupt controller by measuring the number of IPIs a system can
>> sustain. When applied to the XIVE interrupt controller of POWER9 and
>> POWER10 systems, a significant drop of the interrupt rate can be
>> observed when crossing the second node boundary.
>>
>> This is due to the fact that a single IPI interrupt is used for all
>> CPUs of the system. The structure is shared and the cache line updates
>> impact greatly the traffic between nodes and the overall IPI
>> performance.
>>
>> As a workaround, the impact can be reduced by deactivating the IRQ
>> lockup detector ("noirqdebug") which does a lot of accounting in the
>> Linux IRQ descriptor structure and is responsible for most of the
>> performance penalty.
>>
>> As a fix, this proposal allocates an IPI interrupt per node, to be
>> shared by all CPUs of that node. It solves the scaling issue, the IRQ
>> lockup detector still has an impact but the XIVE interrupt rate scales
>> linearly. It also improves the "noirqdebug" case as showed in the
>> tables below. 
>>
> 
> As explained by David and others, NUMA nodes happen to match sockets
> with current POWER CPUs but these are really different concepts. NUMA
> is about CPU memory accesses latency, 

This is exactly our problem. we have cache issues because hw threads 
on different chips are trying to access the same structure in memory.
It happens on virtual platforms and baremetal platforms. This is not
restricted to pseries.

> while in the case of XIVE you
> really need to identify a XIVE chip localized in a given socket.
> 
> PAPR doesn't know about sockets, only cores. In other words, a PAPR
> compliant guest sees all vCPUs like they all sit in a single socket.

There are also NUMA nodes on PAPR.

> Same for the XIVE. Trying to introduce a concept of socket, either
> by hijacking OPAL's ibm,chip-id or NUMA node ids, is a kind of
> spec violation in this context. If the user cares for locality of
> the vCPUs and XIVE on the same socket, then it should bind vCPU
> threads to host CPUs from the same socket in the first place.

Yes. that's a must have of course. You need to reflect the real HW
topology in the guest or LPAR if you are after performance, or 
restrict the virtual machine to be on a single socket/chip/node.  

And this is not only a XIVE problem. XICS has the same problem with
a shared single IPI interrupt descriptor but XICS doesn't scale well 
by design, so it doesn't show.


> Isn't this enough to solve the performance issues this series
> want to fix, without the need for virtual socket ids ?
what are virtual socket ids ? A new concept ? 

Thanks,

C.

> 
>>  * P9 DD2.2 - 2s * 64 threads
>>
>>"noirqdebug"
>> Mint/sMint/s   
>>  chips  cpus  IPI/sys   IPI/chip   IPI/chipIPI/sys 
>>  --
>>  1  0-15 4.984023   4.875405   4.996536   5.048892
>> 0-3110.879164  10.544040  10.757632  11.037859
>> 0-4715.345301  14.688764  14.926520  15.310053
>> 0-6317.064907  17.066812  17.613416  17.874511
>>  2  0-7911.768764  21.650749  22.689120  22.566508
>> 0-9510.616812  26.878789  28.434703  28.320324
>> 0-111   10.151693  31.397803  31.771773  32.388122
>> 0-1279.948502  33.139336  34.875716  35.224548
>>
>>
>>  * P10 DD1 - 4s (not homogeneous) 352 threads
>>
>>"noirqdebug"
>> Mint/sMint/s   
>>  chips  cpus  IPI/sys   IPI/chip   IPI/chipIPI/sys 
>>  --
>>  1  0-15 2.409402   2.364108   2.383303   2.395091
>> 0-31 6.028325   6.046075   6.08   6.073750
>> 0-47 8.655178   8.644531   8.712830   8.724702
>> 0-6311.629652  11.735953  12.088203  12.055979
>> 0-7914.392321  14.729959  14.986701  14.973073
>> 0-9512.604158  13.004034  17.528748  17.568095
>>  2  0-1119.767753  13.719831  19.968606  20.024218
>> 0-1276.744566  16.418854  22.898066  22.995110
>> 0-1436.005699  19.174421  25.425622

Re: [PATCH v3 0/9] powerpc/xive: Map one IPI interrupt per node

2021-04-01 Thread Cédric Le Goater
Hello,


On 3/31/21 4:45 PM, Cédric Le Goater wrote:
> 
> Hello,
> 
> ipistorm [*] can be used to benchmark the raw interrupt rate of an
> interrupt controller by measuring the number of IPIs a system can
> sustain. When applied to the XIVE interrupt controller of POWER9 and
> POWER10 systems, a significant drop of the interrupt rate can be
> observed when crossing the second node boundary.
> 
> This is due to the fact that a single IPI interrupt is used for all
> CPUs of the system. The structure is shared and the cache line updates
> impact greatly the traffic between nodes and the overall IPI
> performance.
> 
> As a workaround, the impact can be reduced by deactivating the IRQ
> lockup detector ("noirqdebug") which does a lot of accounting in the
> Linux IRQ descriptor structure and is responsible for most of the
> performance penalty.
> 
> As a fix, this proposal allocates an IPI interrupt per node, to be
> shared by all CPUs of that node. It solves the scaling issue, the IRQ
> lockup detector still has an impact but the XIVE interrupt rate scales
> linearly. It also improves the "noirqdebug" case as showed in the
> tables below. Hello,

From the comments I received on different email threads, it seems
I am making some wrong assumptions about the code and concepts. We can
postpone this patchset. It's an optimization and there are some
more cleanups that can be done first.

Thanks for the time and the shared expertise,

C.

> 
>  * P9 DD2.2 - 2s * 64 threads
> 
>"noirqdebug"
> Mint/sMint/s   
>  chips  cpus  IPI/sys   IPI/chip   IPI/chipIPI/sys 
>  --
>  1  0-15 4.984023   4.875405   4.996536   5.048892
> 0-3110.879164  10.544040  10.757632  11.037859
> 0-4715.345301  14.688764  14.926520  15.310053
> 0-6317.064907  17.066812  17.613416  17.874511
>  2  0-7911.768764  21.650749  22.689120  22.566508
> 0-9510.616812  26.878789  28.434703  28.320324
> 0-111   10.151693  31.397803  31.771773  32.388122
> 0-1279.948502  33.139336  34.875716  35.224548
> 
> 
>  * P10 DD1 - 4s (not homogeneous) 352 threads
> 
>"noirqdebug"
> Mint/sMint/s   
>  chips  cpus  IPI/sys   IPI/chip   IPI/chipIPI/sys 
>  --
>  1  0-15 2.409402   2.364108   2.383303   2.395091
> 0-31 6.028325   6.046075   6.08   6.073750
> 0-47 8.655178   8.644531   8.712830   8.724702
> 0-6311.629652  11.735953  12.088203  12.055979
> 0-7914.392321  14.729959  14.986701  14.973073
> 0-9512.604158  13.004034  17.528748  17.568095
>  2  0-1119.767753  13.719831  19.968606  20.024218
> 0-1276.744566  16.418854  22.898066  22.995110
> 0-1436.005699  19.174421  25.425622  25.417541
> 0-1595.649719  21.938836  27.952662  28.059603
> 0-1755.441410  24.109484  31.133915  31.127996
>  3  0-1915.318341  24.405322  33.999221  33.775354
> 0-2075.191382  26.449769  36.050161  35.867307
> 0-2235.102790  29.356943  39.544135  39.508169
> 0-2395.035295  31.933051  42.135075  42.071975
> 0-2554.969209  34.477367  44.655395  44.757074
>  4  0-2714.907652  35.887016  47.080545  47.318537
> 0-2874.839581  38.076137  50.464307  50.636219
> 0-3034.786031  40.881319  53.478684  53.310759
> 0-3194.743750  43.448424  56.388102  55.973969
> 0-3354.709936  45.623532  59.400930  58.926857
> 0-3514.681413  45.646151  62.035804  61.830057
> 
> [*] https://github.com/antonblanchard/ipistorm
> 
> Thanks,
> 
> C.
> 
> Changes in v3:
> 
>   - improved commit log for the misuse of "ibm,chip-id"
>   - better error handling of xive_request_ipi()
>   - use of a fwnode_handle to name the new domain 
>   - increased IPI name length
>   - use of early_cpu_to_node() for hotplugged CPUs
>   - filter CPU-less nodes
> 
> Changes in v2:
> 
>   - extra simplification on xmon
>   - fixes on issues reported by the kernel test robot
> 
> Cédric Le Goater (9):
>   powerpc/xive: Use cpu_to_node() instead of "ibm,chip-id" property
>   powerpc/xive: Introduce an IPI interrupt domain
> 

[PATCH v3 9/9] powerpc/xive: Modernize XIVE-IPI domain with an 'alloc' handler

2021-03-31 Thread Cédric Le Goater
Instead of calling irq_create_mapping() to map the IPI for a node,
introduce an 'alloc' handler. This is usually an extension to support
hierarchy irq_domains which is not exactly the case for XIVE-IPI
domain. However, we can now use the irq_domain_alloc_irqs() routine
which allocates the IRQ descriptor on the specified node, even better
for cache performance on multi node machines.

Cc: Thomas Gleixner 
Signed-off-by: Cédric Le Goater 
---

 I didn't rerun the benchmark to check for a difference.
 
 arch/powerpc/sysdev/xive/common.c | 27 +++
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 06f29cd07448..bb7435ffe99c 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1103,15 +1103,26 @@ static struct irq_chip xive_ipi_chip = {
  * IPIs are marked per-cpu. We use separate HW interrupts under the
  * hood but associated with the same "linux" interrupt
  */
-static int xive_ipi_irq_domain_map(struct irq_domain *h, unsigned int virq,
-  irq_hw_number_t hw)
+struct xive_ipi_alloc_info {
+   irq_hw_number_t hwirq;
+};
+
+static int xive_ipi_irq_domain_alloc(struct irq_domain *domain, unsigned int 
virq,
+unsigned int nr_irqs, void *arg)
 {
-   irq_set_chip_and_handler(virq, &xive_ipi_chip, handle_percpu_irq);
+   struct xive_ipi_alloc_info *info = arg;
+   int i;
+
+   for (i = 0; i < nr_irqs; i++) {
+   irq_domain_set_info(domain, virq + i, info->hwirq + i, 
&xive_ipi_chip,
+   domain->host_data, handle_percpu_irq,
+   NULL, NULL);
+   }
return 0;
 }
 
 static const struct irq_domain_ops xive_ipi_irq_domain_ops = {
-   .map = xive_ipi_irq_domain_map,
+   .alloc  = xive_ipi_irq_domain_alloc,
 };
 
 static int __init xive_request_ipi(void)
@@ -1136,7 +1147,7 @@ static int __init xive_request_ipi(void)
 
for_each_node(node) {
struct xive_ipi_desc *xid = &xive_ipis[node];
-   irq_hw_number_t ipi_hwirq = node;
+   struct xive_ipi_alloc_info info = { node };
 
/* Skip nodes without CPUs */
if (cpumask_empty(cpumask_of_node(node)))
@@ -1147,9 +1158,9 @@ static int __init xive_request_ipi(void)
 * Since the HW interrupt number doesn't have any meaning,
 * simply use the node number.
 */
-   xid->irq = irq_create_mapping(ipi_domain, ipi_hwirq);
-   if (!xid->irq) {
-   ret = -EINVAL;
+   xid->irq = irq_domain_alloc_irqs(ipi_domain, 1, node, &info);
+   if (xid->irq < 0) {
+   ret = xid->irq;
goto out_free_xive_ipis;
}
 
-- 
2.26.3



[PATCH v3 8/9] powerpc/xive: Map one IPI interrupt per node

2021-03-31 Thread Cédric Le Goater
ipistorm [*] can be used to benchmark the raw interrupt rate of an
interrupt controller by measuring the number of IPIs a system can
sustain. When applied to the XIVE interrupt controller of POWER9 and
POWER10 systems, a significant drop of the interrupt rate can be
observed when crossing the second node boundary.

This is due to the fact that a single IPI interrupt is used for all
CPUs of the system. The structure is shared and the cache line updates
impact greatly the traffic between nodes and the overall IPI
performance.

As a workaround, the impact can be reduced by deactivating the IRQ
lockup detector ("noirqdebug") which does a lot of accounting in the
Linux IRQ descriptor structure and is responsible for most of the
performance penalty.

As a fix, this proposal allocates an IPI interrupt per node, to be
shared by all CPUs of that node. It solves the scaling issue, the IRQ
lockup detector still has an impact but the XIVE interrupt rate scales
linearly. It also improves the "noirqdebug" case as shown in the
tables below.

 * P9 DD2.2 - 2s * 64 threads

                                          "noirqdebug"
                       Mint/s                Mint/s
 chips  cpus      IPI/sys   IPI/chip   IPI/chip    IPI/sys
 ---------------------------------------------------------
 1      0-15     4.984023   4.875405   4.996536   5.048892
        0-31    10.879164  10.544040  10.757632  11.037859
        0-47    15.345301  14.688764  14.926520  15.310053
        0-63    17.064907  17.066812  17.613416  17.874511
 2      0-79    11.768764  21.650749  22.689120  22.566508
        0-95    10.616812  26.878789  28.434703  28.320324
        0-111   10.151693  31.397803  31.771773  32.388122
        0-127    9.948502  33.139336  34.875716  35.224548

 * P10 DD1 - 4s (not homogeneous) 352 threads

                                          "noirqdebug"
                       Mint/s                Mint/s
 chips  cpus      IPI/sys   IPI/chip   IPI/chip    IPI/sys
 ---------------------------------------------------------
 1      0-15     2.409402   2.364108   2.383303   2.395091
        0-31     6.028325   6.046075   6.08       6.073750
        0-47     8.655178   8.644531   8.712830   8.724702
        0-63    11.629652  11.735953  12.088203  12.055979
        0-79    14.392321  14.729959  14.986701  14.973073
        0-95    12.604158  13.004034  17.528748  17.568095
 2      0-111    9.767753  13.719831  19.968606  20.024218
        0-127    6.744566  16.418854  22.898066  22.995110
        0-143    6.005699  19.174421  25.425622  25.417541
        0-159    5.649719  21.938836  27.952662  28.059603
        0-175    5.441410  24.109484  31.133915  31.127996
 3      0-191    5.318341  24.405322  33.999221  33.775354
        0-207    5.191382  26.449769  36.050161  35.867307
        0-223    5.102790  29.356943  39.544135  39.508169
        0-239    5.035295  31.933051  42.135075  42.071975
        0-255    4.969209  34.477367  44.655395  44.757074
 4      0-271    4.907652  35.887016  47.080545  47.318537
        0-287    4.839581  38.076137  50.464307  50.636219
        0-303    4.786031  40.881319  53.478684  53.310759
        0-319    4.743750  43.448424  56.388102  55.973969
        0-335    4.709936  45.623532  59.400930  58.926857
        0-351    4.681413  45.646151  62.035804  61.830057

[*] https://github.com/antonblanchard/ipistorm

Cc: Thomas Gleixner 
Signed-off-by: Cédric Le Goater 
---

 Changes in v3:

  - increased IPI name length
  - use of early_cpu_to_node() for hotplugged CPUs
  - filter CPU-less nodes
  - dropped Greg's Reviewed-by because of the changes
  
 arch/powerpc/sysdev/xive/xive-internal.h |  2 -
 arch/powerpc/sysdev/xive/common.c| 60 +++-
 2 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/xive-internal.h 
b/arch/powerpc/sysdev/xive/xive-internal.h
index 9cf57c722faa..b3a456fdd3a5 100644
--- a/arch/powerpc/sysdev/xive/xive-internal.h
+++ b/arch/powerpc/sysdev/xive/xive-internal.h
@@ -5,8 +5,6 @@
 #ifndef __XIVE_INTERNAL_H
 #define __XIVE_INTERNAL_H
 
-#define XIVE_IPI_HW_IRQ0 /* interrupt source # for IPIs */
-
 /*
  * A "disabled" interrupt should never fire, to catch problems
  * we set its logical number to this
diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 69980b49afb7..06f29cd07448 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -63,8 +63,19 @@ static const struct xive_ops *xive_ops;
 static struct irq_domain *xive_irq_domain;
 
 #ifdef CONFIG_SMP
-/* The IPIs all use the same logical irq number */
-static u32 xive_ipi_irq;
+/* The IPIs use the same logical irq number when on the same chip */
+static struct xive

[PATCH v3 2/9] powerpc/xive: Introduce an IPI interrupt domain

2021-03-31 Thread Cédric Le Goater
The IPI interrupt is a special case of the XIVE IRQ domain. When
mapping and unmapping the interrupts in the Linux interrupt number
space, the HW interrupt number 0 (XIVE_IPI_HW_IRQ) is checked to
distinguish the IPI interrupt from other interrupts of the system.

Simplify the XIVE interrupt domain by introducing a specific domain
for the IPI.

Cc: Thomas Gleixner 
Signed-off-by: Cédric Le Goater 
---

 Changes in v3:

  - better error handling of xive_request_ipi()
  - use of a fwnode_handle to name the new domain
  - dropped Greg's Reviewed-by because of the changes

 arch/powerpc/sysdev/xive/common.c | 79 ++-
 1 file changed, 46 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 776871274b69..98f4dc916fa1 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1068,24 +1068,58 @@ static struct irq_chip xive_ipi_chip = {
.irq_unmask = xive_ipi_do_nothing,
 };
 
-static void __init xive_request_ipi(void)
+/*
+ * IPIs are marked per-cpu. We use separate HW interrupts under the
+ * hood but associated with the same "linux" interrupt
+ */
+static int xive_ipi_irq_domain_map(struct irq_domain *h, unsigned int virq,
+  irq_hw_number_t hw)
 {
+   irq_set_chip_and_handler(virq, &xive_ipi_chip, handle_percpu_irq);
+   return 0;
+}
+
+static const struct irq_domain_ops xive_ipi_irq_domain_ops = {
+   .map = xive_ipi_irq_domain_map,
+};
+
+static int __init xive_request_ipi(void)
+{
+   struct fwnode_handle *fwnode;
+   struct irq_domain *ipi_domain;
unsigned int virq;
+   int ret = -ENOMEM;
 
-   /*
-* Initialization failed, move on, we might manage to
-* reach the point where we display our errors before
-* the system falls appart
-*/
-   if (!xive_irq_domain)
-   return;
+   fwnode = irq_domain_alloc_named_fwnode("XIVE-IPI");
+   if (!fwnode)
+   goto out;
+
+   ipi_domain = irq_domain_create_linear(fwnode, 1,
+ &xive_ipi_irq_domain_ops, NULL);
+   if (!ipi_domain)
+   goto out_free_fwnode;
 
/* Initialize it */
-   virq = irq_create_mapping(xive_irq_domain, XIVE_IPI_HW_IRQ);
+   virq = irq_create_mapping(ipi_domain, XIVE_IPI_HW_IRQ);
+   if (!virq) {
+   ret = -EINVAL;
+   goto out_free_domain;
+   }
+
xive_ipi_irq = virq;
 
-   WARN_ON(request_irq(virq, xive_muxed_ipi_action,
-   IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
+   ret = request_irq(virq, xive_muxed_ipi_action,
+ IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL);
+
+   WARN(ret < 0, "Failed to request IPI %d: %d\n", virq, ret);
+   return ret;
+
+out_free_domain:
+   irq_domain_remove(ipi_domain);
+out_free_fwnode:
+   irq_domain_free_fwnode(fwnode);
+out:
+   return ret;
 }
 
 static int xive_setup_cpu_ipi(unsigned int cpu)
@@ -1179,19 +1213,6 @@ static int xive_irq_domain_map(struct irq_domain *h, 
unsigned int virq,
 */
irq_clear_status_flags(virq, IRQ_LEVEL);
 
-#ifdef CONFIG_SMP
-   /* IPIs are special and come up with HW number 0 */
-   if (hw == XIVE_IPI_HW_IRQ) {
-   /*
-* IPIs are marked per-cpu. We use separate HW interrupts under
-* the hood but associated with the same "linux" interrupt
-*/
-   irq_set_chip_and_handler(virq, &xive_ipi_chip,
-handle_percpu_irq);
-   return 0;
-   }
-#endif
-
rc = xive_irq_alloc_data(virq, hw);
if (rc)
return rc;
@@ -1203,15 +1224,7 @@ static int xive_irq_domain_map(struct irq_domain *h, 
unsigned int virq,
 
 static void xive_irq_domain_unmap(struct irq_domain *d, unsigned int virq)
 {
-   struct irq_data *data = irq_get_irq_data(virq);
-   unsigned int hw_irq;
-
-   /* XXX Assign BAD number */
-   if (!data)
-   return;
-   hw_irq = (unsigned int)irqd_to_hwirq(data);
-   if (hw_irq != XIVE_IPI_HW_IRQ)
-   xive_irq_free_data(virq);
+   xive_irq_free_data(virq);
 }
 
 static int xive_irq_domain_xlate(struct irq_domain *h, struct device_node *ct,
-- 
2.26.3



[PATCH v3 6/9] powerpc/xive: Simplify the dump of XIVE interrupts under xmon

2021-03-31 Thread Cédric Le Goater
Move the xmon routine under the XIVE subsystem and rework the loop over
the interrupts, taking the xive_irq_domain into account to filter out IPIs.

Reviewed-by: Greg Kurz 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/xive.h   |  1 +
 arch/powerpc/sysdev/xive/common.c | 14 ++
 arch/powerpc/xmon/xmon.c  | 28 ++--
 3 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index 9a312b975ca8..aa094a8655b0 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -102,6 +102,7 @@ void xive_flush_interrupt(void);
 /* xmon hook */
 void xmon_xive_do_dump(int cpu);
 int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d);
+void xmon_xive_get_irq_all(void);
 
 /* APIs used by KVM */
 u32 xive_native_default_eq_shift(void);
diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index f5fe60c194bc..4c6e2e1289f5 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -289,6 +289,20 @@ int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data 
*d)
return 0;
 }
 
+void xmon_xive_get_irq_all(void)
+{
+   unsigned int i;
+   struct irq_desc *desc;
+
+   for_each_irq_desc(i, desc) {
+   struct irq_data *d = irq_desc_get_irq_data(desc);
+   unsigned int hwirq = (unsigned int)irqd_to_hwirq(d);
+
+   if (d->domain == xive_irq_domain)
+   xmon_xive_get_irq_config(hwirq, d);
+   }
+}
+
 #endif /* CONFIG_XMON */
 
 static unsigned int xive_get_irq(void)
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 3fe37495f63d..80fbf8968f77 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2727,30 +2727,6 @@ static void dump_all_xives(void)
dump_one_xive(cpu);
 }
 
-static void dump_one_xive_irq(u32 num, struct irq_data *d)
-{
-   xmon_xive_get_irq_config(num, d);
-}
-
-static void dump_all_xive_irq(void)
-{
-   unsigned int i;
-   struct irq_desc *desc;
-
-   for_each_irq_desc(i, desc) {
-   struct irq_data *d = irq_desc_get_irq_data(desc);
-   unsigned int hwirq;
-
-   if (!d)
-   continue;
-
-   hwirq = (unsigned int)irqd_to_hwirq(d);
-   /* IPIs are special (HW number 0) */
-   if (hwirq)
-   dump_one_xive_irq(hwirq, d);
-   }
-}
-
 static void dump_xives(void)
 {
unsigned long num;
@@ -2767,9 +2743,9 @@ static void dump_xives(void)
return;
} else if (c == 'i') {
if (scanhex())
-   dump_one_xive_irq(num, NULL);
+   xmon_xive_get_irq_config(num, NULL);
else
-   dump_all_xive_irq();
+   xmon_xive_get_irq_all();
return;
}
 
-- 
2.26.3



[PATCH v3 7/9] powerpc/xive: Fix xmon command "dxi"

2021-03-31 Thread Cédric Le Goater
When under xmon, the "dxi" command dumps the state of the XIVE
interrupts. If an interrupt number is specified, only the state of
the associated XIVE interrupt is dumped. This form of the command
lacks an irq_data parameter which is nevertheless used by
xmon_xive_get_irq_config(), leading to an xmon crash.

Fix that by doing a lookup in the system IRQ mapping to query the IRQ
descriptor data. Invalid interrupt numbers, or numbers not belonging to
the XIVE IRQ domain (the OPAL event interrupt number, for instance),
should be caught by the previous query done at the firmware level.

Reported-by: kernel test robot 
Reported-by: Dan Carpenter 
Fixes: 97ef27507793 ("powerpc/xive: Fix xmon support on the PowerNV platform")
Tested-by: Greg Kurz 
Reviewed-by: Greg Kurz 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 4c6e2e1289f5..69980b49afb7 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -253,17 +253,20 @@ notrace void xmon_xive_do_dump(int cpu)
xmon_printf("\n");
 }
 
+static struct irq_data *xive_get_irq_data(u32 hw_irq)
+{
+   unsigned int irq = irq_find_mapping(xive_irq_domain, hw_irq);
+
+   return irq ? irq_get_irq_data(irq) : NULL;
+}
+
 int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d)
 {
-   struct irq_chip *chip = irq_data_get_irq_chip(d);
int rc;
u32 target;
u8 prio;
u32 lirq;
 
-   if (!is_xive_irq(chip))
-   return -EINVAL;
-
rc = xive_ops->get_irq_config(hw_irq, &target, &prio, &lirq);
if (rc) {
xmon_printf("IRQ 0x%08x : no config rc=%d\n", hw_irq, rc);
@@ -273,6 +276,9 @@ int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d)
xmon_printf("IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ",
hw_irq, target, prio, lirq);
 
+   if (!d)
+   d = xive_get_irq_data(hw_irq);
+
if (d) {
struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
u64 val = xive_esb_read(xd, XIVE_ESB_GET);
-- 
2.26.3



[PATCH v3 5/9] powerpc/xive: Drop check on irq_data in xive_core_debug_show()

2021-03-31 Thread Cédric Le Goater
When looping over IRQ descriptors, irq_data is always valid.

Reported-by: kernel test robot 
Reported-by: Dan Carpenter 
Fixes: 930914b7d528 ("powerpc/xive: Add a debugfs file to dump internal XIVE state")
Reviewed-by: Greg Kurz 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 4149ca846e7c..f5fe60c194bc 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1607,6 +1607,8 @@ static void xive_debug_show_irq(struct seq_file *m, 
struct irq_data *d)
u32 target;
u8 prio;
u32 lirq;
+   struct xive_irq_data *xd;
+   u64 val;
 
rc = xive_ops->get_irq_config(hw_irq, &target, &prio, &lirq);
if (rc) {
@@ -1617,17 +1619,14 @@ static void xive_debug_show_irq(struct seq_file *m, 
struct irq_data *d)
seq_printf(m, "IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ",
   hw_irq, target, prio, lirq);
 
-   if (d) {
-   struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
-   u64 val = xive_esb_read(xd, XIVE_ESB_GET);
-
-   seq_printf(m, "flags=%c%c%c PQ=%c%c",
-  xd->flags & XIVE_IRQ_FLAG_STORE_EOI ? 'S' : ' ',
-  xd->flags & XIVE_IRQ_FLAG_LSI ? 'L' : ' ',
-  xd->flags & XIVE_IRQ_FLAG_H_INT_ESB ? 'H' : ' ',
-  val & XIVE_ESB_VAL_P ? 'P' : '-',
-  val & XIVE_ESB_VAL_Q ? 'Q' : '-');
-   }
+   xd = irq_data_get_irq_handler_data(d);
+   val = xive_esb_read(xd, XIVE_ESB_GET);
+   seq_printf(m, "flags=%c%c%c PQ=%c%c",
+  xd->flags & XIVE_IRQ_FLAG_STORE_EOI ? 'S' : ' ',
+  xd->flags & XIVE_IRQ_FLAG_LSI ? 'L' : ' ',
+  xd->flags & XIVE_IRQ_FLAG_H_INT_ESB ? 'H' : ' ',
+  val & XIVE_ESB_VAL_P ? 'P' : '-',
+  val & XIVE_ESB_VAL_Q ? 'Q' : '-');
seq_puts(m, "\n");
 }
 
-- 
2.26.3



[PATCH v3 0/9] powerpc/xive: Map one IPI interrupt per node

2021-03-31 Thread Cédric Le Goater


Hello,

ipistorm [*] can be used to benchmark the raw interrupt rate of an
interrupt controller by measuring the number of IPIs a system can
sustain. When applied to the XIVE interrupt controller of POWER9 and
POWER10 systems, a significant drop of the interrupt rate can be
observed when crossing the second node boundary.

This is due to the fact that a single IPI interrupt is used for all
CPUs of the system. The structure is shared and the cache line updates
impact greatly the traffic between nodes and the overall IPI
performance.

As a workaround, the impact can be reduced by deactivating the IRQ
lockup detector ("noirqdebug") which does a lot of accounting in the
Linux IRQ descriptor structure and is responsible for most of the
performance penalty.

As a fix, this proposal allocates an IPI interrupt per node, to be
shared by all CPUs of that node. It solves the scaling issue, the IRQ
lockup detector still has an impact but the XIVE interrupt rate scales
linearly. It also improves the "noirqdebug" case as shown in the
tables below.

 * P9 DD2.2 - 2s * 64 threads

                                          "noirqdebug"
                       Mint/s                Mint/s
 chips  cpus      IPI/sys   IPI/chip   IPI/chip    IPI/sys
 ---------------------------------------------------------
 1      0-15     4.984023   4.875405   4.996536   5.048892
        0-31    10.879164  10.544040  10.757632  11.037859
        0-47    15.345301  14.688764  14.926520  15.310053
        0-63    17.064907  17.066812  17.613416  17.874511
 2      0-79    11.768764  21.650749  22.689120  22.566508
        0-95    10.616812  26.878789  28.434703  28.320324
        0-111   10.151693  31.397803  31.771773  32.388122
        0-127    9.948502  33.139336  34.875716  35.224548


 * P10 DD1 - 4s (not homogeneous) 352 threads

                                          "noirqdebug"
                       Mint/s                Mint/s
 chips  cpus      IPI/sys   IPI/chip   IPI/chip    IPI/sys
 ---------------------------------------------------------
 1      0-15     2.409402   2.364108   2.383303   2.395091
        0-31     6.028325   6.046075   6.08       6.073750
        0-47     8.655178   8.644531   8.712830   8.724702
        0-63    11.629652  11.735953  12.088203  12.055979
        0-79    14.392321  14.729959  14.986701  14.973073
        0-95    12.604158  13.004034  17.528748  17.568095
 2      0-111    9.767753  13.719831  19.968606  20.024218
        0-127    6.744566  16.418854  22.898066  22.995110
        0-143    6.005699  19.174421  25.425622  25.417541
        0-159    5.649719  21.938836  27.952662  28.059603
        0-175    5.441410  24.109484  31.133915  31.127996
 3      0-191    5.318341  24.405322  33.999221  33.775354
        0-207    5.191382  26.449769  36.050161  35.867307
        0-223    5.102790  29.356943  39.544135  39.508169
        0-239    5.035295  31.933051  42.135075  42.071975
        0-255    4.969209  34.477367  44.655395  44.757074
 4      0-271    4.907652  35.887016  47.080545  47.318537
        0-287    4.839581  38.076137  50.464307  50.636219
        0-303    4.786031  40.881319  53.478684  53.310759
        0-319    4.743750  43.448424  56.388102  55.973969
        0-335    4.709936  45.623532  59.400930  58.926857
        0-351    4.681413  45.646151  62.035804  61.830057

[*] https://github.com/antonblanchard/ipistorm

Thanks,

C.

Changes in v3:

  - improved commit log for the misuse of "ibm,chip-id"
  - better error handling of xive_request_ipi()
  - use of a fwnode_handle to name the new domain 
  - increased IPI name length
  - use of early_cpu_to_node() for hotplugged CPUs
  - filter CPU-less nodes

Changes in v2:

  - extra simplification on xmon
  - fixes on issues reported by the kernel test robot

Cédric Le Goater (9):
  powerpc/xive: Use cpu_to_node() instead of "ibm,chip-id" property
  powerpc/xive: Introduce an IPI interrupt domain
  powerpc/xive: Remove useless check on XIVE_IPI_HW_IRQ
  powerpc/xive: Simplify xive_core_debug_show()
  powerpc/xive: Drop check on irq_data in xive_core_debug_show()
  powerpc/xive: Simplify the dump of XIVE interrupts under xmon
  powerpc/xive: Fix xmon command "dxi"
  powerpc/xive: Map one IPI interrupt per node
  powerpc/xive: Modernize XIVE-IPI domain with an 'alloc' handler

 arch/powerpc/include/asm/xive.h  |   1 +
 arch/powerpc/sysdev/xive/xive-internal.h |   2 -
 arch/powerpc/sysdev/xive/common.c| 211 +++
 arch/powerpc/xmon/xmon.c |  28 +--
 4 files changed, 139 insertions(+), 103 deletions(-)

-- 
2.26.3



[PATCH v3 1/9] powerpc/xive: Use cpu_to_node() instead of "ibm,chip-id" property

2021-03-31 Thread Cédric Le Goater
The 'chip_id' field of the XIVE CPU structure is used to choose a
target for a source located on the same chip when possible. The XIVE
driver queries the chip id value from the "ibm,chip-id" DT property
but this property is not available on all platforms. It was first
introduced on the PowerNV platform and later, under QEMU for pseries.
However, the property does not exist under PowerVM since it is not
specified in PAPR.

cpu_to_node() is a better alternative. On the PowerNV platform, the
node id is computed from the "ibm,associativity" property of the CPU.
Its value is built in the OPAL firmware from the physical chip id and
is equivalent to "ibm,chip-id". On pSeries, the hcall
H_HOME_NODE_ASSOCIATIVITY returns the node id.

Also note that under QEMU/KVM, "ibm,chip-id" is miscalculated
with unusual SMT configurations. This leads to a bogus chip id value
being returned by of_get_ibm_chip_id().

Cc: David Gibson 
Signed-off-by: Cédric Le Goater 
---

 Changes in v3:

  - improved commit log for the misuse of "ibm,chip-id"

 arch/powerpc/sysdev/xive/common.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 7e08be5e5e4a..776871274b69 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1336,16 +1336,11 @@ static int xive_prepare_cpu(unsigned int cpu)
 
xc = per_cpu(xive_cpu, cpu);
if (!xc) {
-   struct device_node *np;
-
xc = kzalloc_node(sizeof(struct xive_cpu),
  GFP_KERNEL, cpu_to_node(cpu));
if (!xc)
return -ENOMEM;
-   np = of_get_cpu_node(cpu, NULL);
-   if (np)
-   xc->chip_id = of_get_ibm_chip_id(np);
-   of_node_put(np);
+   xc->chip_id = cpu_to_node(cpu);
xc->hw_ipi = XIVE_BAD_IRQ;
 
per_cpu(xive_cpu, cpu) = xc;
-- 
2.26.3



[PATCH v3 4/9] powerpc/xive: Simplify xive_core_debug_show()

2021-03-31 Thread Cédric Le Goater
Now that the IPI interrupt has its own domain, the checks on the HW
interrupt number XIVE_IPI_HW_IRQ and on the chip can be replaced by a
check on the domain.

Reviewed-by: Greg Kurz 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 18 --
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 8bca9aca0607..4149ca846e7c 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1600,17 +1600,14 @@ static void xive_debug_show_cpu(struct seq_file *m, int 
cpu)
seq_puts(m, "\n");
 }
 
-static void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct 
irq_data *d)
+static void xive_debug_show_irq(struct seq_file *m, struct irq_data *d)
 {
-   struct irq_chip *chip = irq_data_get_irq_chip(d);
+   unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
int rc;
u32 target;
u8 prio;
u32 lirq;
 
-   if (!is_xive_irq(chip))
-   return;
-
rc = xive_ops->get_irq_config(hw_irq, &target, &prio, &lirq);
if (rc) {
seq_printf(m, "IRQ 0x%08x : no config rc=%d\n", hw_irq, rc);
@@ -1648,16 +1645,9 @@ static int xive_core_debug_show(struct seq_file *m, void 
*private)
 
for_each_irq_desc(i, desc) {
struct irq_data *d = irq_desc_get_irq_data(desc);
-   unsigned int hw_irq;
-
-   if (!d)
-   continue;
-
-   hw_irq = (unsigned int)irqd_to_hwirq(d);
 
-   /* IPIs are special (HW number 0) */
-   if (hw_irq != XIVE_IPI_HW_IRQ)
-   xive_debug_show_irq(m, hw_irq, d);
+   if (d->domain == xive_irq_domain)
+   xive_debug_show_irq(m, d);
}
return 0;
 }
-- 
2.26.3



[PATCH v3 3/9] powerpc/xive: Remove useless check on XIVE_IPI_HW_IRQ

2021-03-31 Thread Cédric Le Goater
The IPI interrupt has its own domain now. Testing the HW interrupt
number is not needed anymore.

Reviewed-by: Greg Kurz 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 98f4dc916fa1..8bca9aca0607 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1417,13 +1417,12 @@ static void xive_flush_cpu_queue(unsigned int cpu, 
struct xive_cpu *xc)
struct irq_desc *desc = irq_to_desc(irq);
struct irq_data *d = irq_desc_get_irq_data(desc);
struct xive_irq_data *xd;
-   unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
 
/*
 * Ignore anything that isn't a XIVE irq and ignore
 * IPIs, so can just be dropped.
 */
-   if (d->domain != xive_irq_domain || hw_irq == XIVE_IPI_HW_IRQ)
+   if (d->domain != xive_irq_domain)
continue;
 
/*
-- 
2.26.3



Re: [PATCH v2 8/8] powerpc/xive: Map one IPI interrupt per node

2021-03-30 Thread Cédric Le Goater
On 3/3/21 6:48 PM, Cédric Le Goater wrote:
> ipistorm [*] can be used to benchmark the raw interrupt rate of an
> interrupt controller by measuring the number of IPIs a system can
> sustain. When applied to the XIVE interrupt controller of POWER9 and
> POWER10 systems, a significant drop of the interrupt rate can be
> observed when crossing the second node boundary.
> 
> This is due to the fact that a single IPI interrupt is used for all
> CPUs of the system. The structure is shared and the cache line updates
> impact greatly the traffic between nodes and the overall IPI
> performance.
> 
> As a workaround, the impact can be reduced by deactivating the IRQ
> lockup detector ("noirqdebug") which does a lot of accounting in the
> Linux IRQ descriptor structure and is responsible for most of the
> performance penalty.
> 
> As a fix, this proposal allocates an IPI interrupt per node, to be
> shared by all CPUs of that node. It solves the scaling issue, the IRQ
> lockup detector still has an impact but the XIVE interrupt rate scales
> linearly. It also improves the "noirqdebug" case as showed in the
> tables below.
> 
>  * P9 DD2.2 - 2s * 64 threads
> 
>"noirqdebug"
> Mint/sMint/s
>  chips  cpus  IPI/sys   IPI/chip   IPI/chipIPI/sys
>  --
>  1  0-15 4.984023   4.875405   4.996536   5.048892
> 0-3110.879164  10.544040  10.757632  11.037859
> 0-4715.345301  14.688764  14.926520  15.310053
> 0-6317.064907  17.066812  17.613416  17.874511
>  2  0-7911.768764  21.650749  22.689120  22.566508
> 0-9510.616812  26.878789  28.434703  28.320324
> 0-111   10.151693  31.397803  31.771773  32.388122
> 0-1279.948502  33.139336  34.875716  35.224548
> 
>  * P10 DD1 - 4s (not homogeneous) 352 threads
> 
>"noirqdebug"
> Mint/sMint/s
>  chips  cpus  IPI/sys   IPI/chip   IPI/chipIPI/sys
>  --
>  1  0-15 2.409402   2.364108   2.383303   2.395091
> 0-31 6.028325   6.046075   6.08   6.073750
> 0-47 8.655178   8.644531   8.712830   8.724702
> 0-6311.629652  11.735953  12.088203  12.055979
> 0-7914.392321  14.729959  14.986701  14.973073
> 0-9512.604158  13.004034  17.528748  17.568095
>  2  0-1119.767753  13.719831  19.968606  20.024218
> 0-1276.744566  16.418854  22.898066  22.995110
> 0-1436.005699  19.174421  25.425622  25.417541
> 0-1595.649719  21.938836  27.952662  28.059603
> 0-1755.441410  24.109484  31.133915  31.127996
>  3  0-1915.318341  24.405322  33.999221  33.775354
> 0-2075.191382  26.449769  36.050161  35.867307
> 0-2235.102790  29.356943  39.544135  39.508169
> 0-2395.035295  31.933051  42.135075  42.071975
> 0-2554.969209  34.477367  44.655395  44.757074
>  4  0-2714.907652  35.887016  47.080545  47.318537
> 0-2874.839581  38.076137  50.464307  50.636219
> 0-3034.786031  40.881319  53.478684  53.310759
> 0-3194.743750  43.448424  56.388102  55.973969
> 0-3354.709936  45.623532  59.400930  58.926857
> 0-3514.681413  45.646151  62.035804  61.830057
> 
> [*] https://github.com/antonblanchard/ipistorm
> 
> Signed-off-by: Cédric Le Goater 
> ---
>  arch/powerpc/sysdev/xive/xive-internal.h |  2 --
>  arch/powerpc/sysdev/xive/common.c| 39 ++--
>  2 files changed, 30 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/sysdev/xive/xive-internal.h 
> b/arch/powerpc/sysdev/xive/xive-internal.h
> index 9cf57c722faa..b3a456fdd3a5 100644
> --- a/arch/powerpc/sysdev/xive/xive-internal.h
> +++ b/arch/powerpc/sysdev/xive/xive-internal.h
> @@ -5,8 +5,6 @@
>  #ifndef __XIVE_INTERNAL_H
>  #define __XIVE_INTERNAL_H
>  
> -#define XIVE_IPI_HW_IRQ  0 /* interrupt source # for IPIs */
> -
>  /*
>   * A "disabled" interrupt should never fire, to catch problems
>   * we set its logical number to this
> diff --git a/arch/powerpc/sysdev/xive/common.c 
> b/arch/powerpc/sysdev/xive/common.c
> index 8eefd152b947..c27f7bb0494b 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/s

Re: [PATCH v4 39/46] KVM: PPC: Book3S HV: Remove virt mode checks from real mode handlers

2021-03-23 Thread Cédric Le Goater
On 3/23/21 2:02 AM, Nicholas Piggin wrote:
> Now that the P7/8 path no longer supports radix, real-mode handlers
> do not need to deal with being called in virt mode.
> 
> This change effectively reverts commit acde25726bc6 ("KVM: PPC: Book3S
> HV: Add radix checks in real-mode hypercall handlers").
> 
> It removes a few more real-mode tests in rm hcall handlers, which also
> allows the indirect ops for the xive module to be removed from the
> built-in xics rm handlers.
> 
> kvmppc_h_random is renamed to kvmppc_rm_h_random to be a bit more
> descriptive of its function.
> 
> Cc: Cédric Le Goater 
> Signed-off-by: Nicholas Piggin 

Reviewed-by: Cédric Le Goater 

> ---
>  arch/powerpc/include/asm/kvm_ppc.h  | 10 +--
>  arch/powerpc/kvm/book3s.c   | 11 +--
>  arch/powerpc/kvm/book3s_64_vio_hv.c | 12 
>  arch/powerpc/kvm/book3s_hv_builtin.c| 91 ++---
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S |  2 +-
>  arch/powerpc/kvm/book3s_xive.c  | 18 -
>  arch/powerpc/kvm/book3s_xive.h  |  7 --
>  arch/powerpc/kvm/book3s_xive_native.c   | 10 ---
>  8 files changed, 23 insertions(+), 138 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
> b/arch/powerpc/include/asm/kvm_ppc.h
> index db6646c2ade2..5dfb3f167f2c 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -659,8 +659,6 @@ extern int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, 
> u32 *server,
>   u32 *priority);
>  extern int kvmppc_xive_int_on(struct kvm *kvm, u32 irq);
>  extern int kvmppc_xive_int_off(struct kvm *kvm, u32 irq);
> -extern void kvmppc_xive_init_module(void);
> -extern void kvmppc_xive_exit_module(void);
>  
>  extern int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>   struct kvm_vcpu *vcpu, u32 cpu);
> @@ -686,8 +684,6 @@ static inline int kvmppc_xive_enabled(struct kvm_vcpu 
> *vcpu)
>  extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
>  struct kvm_vcpu *vcpu, u32 cpu);
>  extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu);
> -extern void kvmppc_xive_native_init_module(void);
> -extern void kvmppc_xive_native_exit_module(void);
>  extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
>union kvmppc_one_reg *val);
>  extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
> @@ -701,8 +697,6 @@ static inline int kvmppc_xive_get_xive(struct kvm *kvm, 
> u32 irq, u32 *server,
>  u32 *priority) { return -1; }
>  static inline int kvmppc_xive_int_on(struct kvm *kvm, u32 irq) { return -1; }
>  static inline int kvmppc_xive_int_off(struct kvm *kvm, u32 irq) { return -1; 
> }
> -static inline void kvmppc_xive_init_module(void) { }
> -static inline void kvmppc_xive_exit_module(void) { }
>  
>  static inline int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>  struct kvm_vcpu *vcpu, u32 cpu) { 
> return -EBUSY; }
> @@ -725,8 +719,6 @@ static inline int kvmppc_xive_enabled(struct kvm_vcpu 
> *vcpu)
>  static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
> struct kvm_vcpu *vcpu, u32 cpu) { return -EBUSY; }
>  static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { }
> -static inline void kvmppc_xive_native_init_module(void) { }
> -static inline void kvmppc_xive_native_exit_module(void) { }
>  static inline int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
>   union kvmppc_one_reg *val)
>  { return 0; }
> @@ -762,7 +754,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
>  unsigned long tce_value, unsigned long npages);
>  long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
>  unsigned int yield_count);
> -long kvmppc_h_random(struct kvm_vcpu *vcpu);
> +long kvmppc_rm_h_random(struct kvm_vcpu *vcpu);
>  void kvmhv_commence_exit(int trap);
>  void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu);
>  void kvmppc_subcore_enter_guest(void);
> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> index 44bf567b6589..1888aedfd410 100644
> --- a/arch/powerpc/kvm/book3s.c
> +++ b/arch/powerpc/kvm/book3s.c
> @@ -1046,13 +1046,10 @@ static int kvmppc_book3s_init(void)
>  #ifdef CONFIG_KVM_XICS
>  #ifdef CONFIG_KVM_XIVE
>   if (xics_on_xive()) {
> - kvmppc_xive_init_module();
>   kvm_register_device_ops(_xive_ops, KVM_DEV_TYPE_XICS);
> - if (kvmppc_xive_native_su

Re: [PATCH v4 22/46] KVM: PPC: Book3S HV P9: Stop handling hcalls in real-mode in the P9 path

2021-03-23 Thread Cédric Le Goater
m_escalation() may be ? It has more meaning to me.  

> +{
> + void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
> +
> + if (!esc_vaddr)
> + return;
> +
> + /* we are using XIVE with single escalation */
> +
> + if (vcpu->arch.xive_esc_on) {
> + /*
> +  * If we still have a pending escalation, abort the cede,
> +  * and we must set PQ to 10 rather than 00 so that we don't
> +  * potentially end up with two entries for the escalation
> +  * interrupt in the XIVE interrupt queue.  In that case
> +  * we also don't want to set xive_esc_on to 1 here in
> +  * case we race with xive_esc_irq().
> +  */
> + vcpu->arch.ceded = 0;
> + /*
> +  * The escalation interrupts are special as we don't EOI them.
> +  * There is no need to use the load-after-store ordering offset
> +  * to set PQ to 10 as we won't use StoreEOI.
> +  */
> + __raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
> + } else {
> + vcpu->arch.xive_esc_on = true;
> + mb();
> + __raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
> + }
> + mb();
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_xive_cede_vcpu);
> +
>  /*
>   * This is a simple trigger for a generic XIVE IRQ. This must
>   * only be called for interrupts that support a trigger page
> @@ -2106,6 +2140,42 @@ static int kvmppc_xive_create(struct kvm_device *dev, 
> u32 type)
>   return 0;
>  }
>  
> +int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
> +{
> + struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +
> + /*
> +  * This test covers the case in which a vCPU does XICS hcalls without
> +  * QEMU having connected the vCPU to a XICS ICP. The ICP is the KVM
> +  * XICS device on P8 or XICS-on-XIVE on P9. It catches QEMU errors when
> +  * the interrupt mode is negotiated, we don't want the OS to do XICS
> +  * hcalls after having negotiated the XIVE interrupt mode.
> +  */

I think a comment like the following should be enough.

  The VM should have configured XICS mode before doing XICS hcalls.

No need to resend for that though.

Reviewed-by: Cédric Le Goater 

Thanks,

C.


> + if (!kvmppc_xics_enabled(vcpu))
> + return H_TOO_HARD;
> +
> + switch (req) {
> + case H_XIRR:
> + return xive_vm_h_xirr(vcpu);
> + case H_CPPR:
> + return xive_vm_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
> + case H_EOI:
> + return xive_vm_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
> + case H_IPI:
> + return xive_vm_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
> +   kvmppc_get_gpr(vcpu, 5));
> + case H_IPOLL:
> + return xive_vm_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
> + case H_XIRR_X:
> + xive_vm_h_xirr(vcpu);
> + kvmppc_set_gpr(vcpu, 5, get_tb() + vc->tb_offset);
> + return H_SUCCESS;
> + }
> +
> + return H_UNSUPPORTED;
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_xive_xics_hcall);
> +
>  int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
>  {
>   struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> 



Re: [PATCH v3 19/41] KVM: PPC: Book3S HV P9: Stop handling hcalls in real-mode in the P9 path

2021-03-23 Thread Cédric Le Goater
On 3/22/21 7:22 PM, Nicholas Piggin wrote:
> Excerpts from Cédric Le Goater's message of March 23, 2021 2:01 am:
>> On 3/22/21 2:15 PM, Nicholas Piggin wrote:
>>> Excerpts from Alexey Kardashevskiy's message of March 22, 2021 5:30 pm:


 On 06/03/2021 02:06, Nicholas Piggin wrote:
> In the interest of minimising the amount of code that is run in
> "real-mode", don't handle hcalls in real mode in the P9 path.
>
> POWER8 and earlier are much more expensive to exit from HV real mode
> and switch to host mode, because on those processors HV interrupts get
> to the hypervisor with the MMU off, and the other threads in the core
> need to be pulled out of the guest, and SLBs all need to be saved,
> ERATs invalidated, and host SLB reloaded before the MMU is re-enabled
> in host mode. Hash guests also require a lot of hcalls to run. The
> XICS interrupt controller requires hcalls to run.
>
> By contrast, POWER9 has independent thread switching, and in radix mode
> the hypervisor is already in a host virtual memory mode when the HV
> interrupt is taken. Radix + xive guests don't need hcalls to handle
> interrupts or manage translations.
>>
>> Do we need to handle the host-is-a-P9-without-xive case ?
> 
> I'm not sure really. Is there an intention for OPAL to be able to 
> provide a fallback layer in the worst case?

Yes. OPAL has a XICS-on-XIVE emulation for P9, implemented for bringup;
it still boots and XICS guests can run. P10 doesn't have it though.

> Maybe microwatt grows HV capability before XIVE?

I don't know if we should develop the same XIVE logic for microwatt. 
It's awfully complex and we have the XICS interface which works already. 

> So it's much less important to handle hcalls in real mode in P9.

 So acde25726bc6034b (which added if(kvm_is_radix(vcpu->kvm))return 
 H_TOO_HARD) can be reverted, pretty much?
>>>
>>> Yes. Although that calls attention to the fact I missed doing
>>> a P9 h_random handler in this patch. I'll fix that, then I think
>>> acde2572 could be reverted entirely.
>>>
>>> [...]
>>>
>   } else {
>   kvmppc_xive_push_vcpu(vcpu);
>   trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, 
> lpcr);
> - kvmppc_xive_pull_vcpu(vcpu);
> + /* H_CEDE has to be handled now, not later */
> + /* XICS hcalls must be handled before xive is pulled */
> + if (trap == BOOK3S_INTERRUPT_SYSCALL &&
> + !(vcpu->arch.shregs.msr & MSR_PR)) {
> + unsigned long req = kvmppc_get_gpr(vcpu, 3);
>   
> + if (req == H_CEDE) {
> + kvmppc_cede(vcpu);
> + kvmppc_xive_cede_vcpu(vcpu); /* may un-cede */
> + kvmppc_set_gpr(vcpu, 3, 0);
> + trap = 0;
> + }
> + if (req == H_EOI || req == H_CPPR ||

 else if (req == H_EOI ... ?
>>>
>>> Hummm, sure.
>>
>> you could integrate the H_CEDE in the switch statement below.
> 
> Below is in a different file just for the emulation calls.
> 
>>>
>>> [...]
>>>
> +void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu)
> +{
> + void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
> +
> + if (!esc_vaddr)
> + return;
> +
> + /* we are using XIVE with single escalation */
> +
> + if (vcpu->arch.xive_esc_on) {
> + /*
> +  * If we still have a pending escalation, abort the cede,
> +  * and we must set PQ to 10 rather than 00 so that we don't
> +  * potentially end up with two entries for the escalation
> +  * interrupt in the XIVE interrupt queue.  In that case
> +  * we also don't want to set xive_esc_on to 1 here in
> +  * case we race with xive_esc_irq().
> +  */
> + vcpu->arch.ceded = 0;
> + /*
> +  * The escalation interrupts are special as we don't EOI them.
> +  * There is no need to use the load-after-store ordering offset
> +  * to set PQ to 10 as we won't use StoreEOI.
> +  */
> + __raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
> + } else {
> + vcpu->arch.xive_esc_on = true;
> + mb();
> + __raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
> + }
> + mb();


 Uff. Thanks for cut-n-pasting the comments, helped a lot to match this c 
 to that asm!
>>>
>>> Glad it helped.
> +}
>>
>> I had to do the PowerNV models in QEMU to start understanding that stuff ... 
>>
> +EXPORT_SYMBOL_GPL(kvmppc_xive_cede_vcpu);
> +
>   /*
>* This is a simple trigger for a generic XIVE IRQ. This must
>* only be called for interrupts that support a trigger page
> @@ -2106,6 

Re: [PATCH v3 17/41] KVM: PPC: Book3S HV P9: implement kvmppc_xive_pull_vcpu in C

2021-03-22 Thread Cédric Le Goater
On 3/5/21 4:06 PM, Nicholas Piggin wrote:
> This is more symmetric with kvmppc_xive_push_vcpu. The extra test in
> the asm will go away in a later change.
> 
> Signed-off-by: Nicholas Piggin 

Reviewed-by: Cédric Le Goater 

> ---
>  arch/powerpc/include/asm/kvm_ppc.h  |  2 ++
>  arch/powerpc/kvm/book3s_hv.c|  2 ++
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S |  5 
>  arch/powerpc/kvm/book3s_xive.c  | 31 +
>  4 files changed, 40 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
> b/arch/powerpc/include/asm/kvm_ppc.h
> index 9531b1c1b190..73b1ca5a6471 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -672,6 +672,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 
> icpval);
>  extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
>  int level, bool line_status);
>  extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
> +extern void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu);
>  
>  static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
>  {
> @@ -712,6 +713,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu 
> *vcpu, u64 icpval) { retur
>  static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, 
> u32 irq,
> int level, bool line_status) { return 
> -ENODEV; }
>  static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
> +static inline void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) { }
>  
>  static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
>   { return 0; }
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index b9cae42b9cd5..b265522fc467 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -3565,6 +3565,8 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
> *vcpu, u64 time_limit,
>  
>   trap = __kvmhv_vcpu_entry_p9(vcpu);
>  
> + kvmppc_xive_pull_vcpu(vcpu);
> +
>   /* Advance host PURR/SPURR by the amount used by guest */
>   purr = mfspr(SPRN_PURR);
>   spurr = mfspr(SPRN_SPURR);
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 75405ef53238..c11597f815e4 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -1442,6 +1442,11 @@ guest_exit_cont:   /* r9 = vcpu, r12 = 
> trap, r13 = paca */
>   bl  kvmhv_accumulate_time
>  #endif
>  #ifdef CONFIG_KVM_XICS
> + /* If we came in through the P9 short path, xive pull is done in C */
> + lwz r0, STACK_SLOT_SHORT_PATH(r1)
> + cmpwi   r0, 0
> + bne 1f
> +
>   /* We are exiting, pull the VP from the XIVE */
>   lbz r0, VCPU_XIVE_PUSHED(r9)
>   cmpwi   cr0, r0, 0
> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> index e7219b6f5f9a..52cdb9e2660a 100644
> --- a/arch/powerpc/kvm/book3s_xive.c
> +++ b/arch/powerpc/kvm/book3s_xive.c
> @@ -127,6 +127,37 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
>  }
>  EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
>  
> +/*
> + * Pull a vcpu's context from the XIVE on guest exit.
> + * This assumes we are in virtual mode (MMU on)

Should we add an assert on is_rm() (i.e. check that we are not in real mode)?

> + */
> +void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
> +{
> + void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
> +
> + if (!vcpu->arch.xive_pushed)
> + return;
> +
> + /*
> +  * Should not have been pushed if there is no tima
> +  */
> + if (WARN_ON(!tima))
> + return;
> +
> + eieio();
> + /* First load to pull the context, we ignore the value */
> + __raw_readl(tima + TM_SPC_PULL_OS_CTX);
> + /* Second load to recover the context state (Words 0 and 1) */
> + vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS);
> +
> + /* Fixup some of the state for the next load */
> + vcpu->arch.xive_saved_state.lsmfb = 0;
> + vcpu->arch.xive_saved_state.ack = 0xff;
> + vcpu->arch.xive_pushed = 0;
> + eieio();
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);
> +
>  /*
>   * This is a simple trigger for a generic XIVE IRQ. This must
>   * only be called for interrupts that support a trigger page
> 



  1   2   3   4   5   6   7   8   >