From: Benjamin Herrenschmidt <b...@kernel.crashing.org>

This adds in-kernel emulation of the XICS (eXternal Interrupt
Controller Specification) interrupt controller specified by PAPR, for
both HV and PR KVM guests.

This adds a new KVM_CREATE_IRQCHIP_ARGS ioctl, which is like
KVM_CREATE_IRQCHIP in that it indicates that the virtual machine
should use in-kernel interrupt controller emulation, but also takes an
argument struct that contains the type of interrupt controller
architecture and an optional parameter.  Currently only one type value
is defined, that which indicates the XICS architecture.

The XICS emulation supports up to 1048560 interrupt sources.
Interrupt source numbers below 16 are reserved; 0 is used to mean no
interrupt and 2 is used for IPIs.  Internally these are represented in
blocks of 1024, called ICS (interrupt controller source) entities, but
that is not visible to userspace.

Two other new ioctls allow userspace to control the interrupt
sources.  The KVM_IRQCHIP_SET_SOURCES ioctl sets the priority,
destination cpu, level/edge sensitivity and pending state of a range
of interrupt sources, creating them if they don't already exist.  The
KVM_IRQCHIP_GET_SOURCES ioctl returns that information for a range of
interrupt sources (they are required to already exist).

Each vcpu gets one ICP (interrupt controller presentation) entity.
They are created automatically when the vcpu is created provided the
KVM_CREATE_IRQCHIP_ARGS ioctl has been performed.

This is based on an initial implementation by Michael Ellerman
<mich...@ellerman.id.au> reworked by Benjamin Herrenschmidt and
Paul Mackerras.

Signed-off-by: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Signed-off-by: Paul Mackerras <pau...@samba.org>
---
 Documentation/virtual/kvm/api.txt     |   51 ++
 arch/powerpc/include/asm/kvm_book3s.h |    1 +
 arch/powerpc/include/asm/kvm_host.h   |    8 +
 arch/powerpc/include/asm/kvm_ppc.h    |   19 +
 arch/powerpc/kvm/Makefile             |    1 +
 arch/powerpc/kvm/book3s.c             |    2 +-
 arch/powerpc/kvm/book3s_hv.c          |   20 +
 arch/powerpc/kvm/book3s_pr.c          |   13 +
 arch/powerpc/kvm/book3s_pr_papr.c     |   16 +
 arch/powerpc/kvm/book3s_rtas.c        |   51 +-
 arch/powerpc/kvm/book3s_xics.c        | 1101 +++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_xics.h        |  111 ++++
 arch/powerpc/kvm/powerpc.c            |   23 +
 include/uapi/linux/kvm.h              |   29 +
 14 files changed, 1444 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_xics.c
 create mode 100644 arch/powerpc/kvm/book3s_xics.h

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index d3e2d60..0ff9dcf 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2141,6 +2141,57 @@ associated with the service will be forgotten, and subsequent RTAS
 calls by the guest for that service will be passed to userspace to be
 handled.
 
+4.80 KVM_CREATE_IRQCHIP_ARGS
+
+Capability: KVM_CAP_IRQCHIP_ARGS
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_irqchip_args
+Returns: 0 on success, -1 on error
+
+Creates an interrupt controller model in the kernel.  The type field
+of the argument struct indicates the interrupt controller architecture
+of the virtual machine.  Currently the only value permitted for the
+type field is 1, indicating the XICS (eXternal Interrupt Controller
+Specification) model defined in PAPR.  For XICS, this ioctl indicates
+to the kernel that an interrupt controller presentation (ICP) entity
+should be created for every vcpu, and interrupt controller source
+(ICS) entities should be created to accommodate the sources that are
+configured with the KVM_IRQCHIP_SET_SOURCES ioctl.
+
+4.81 KVM_IRQCHIP_GET_SOURCES
+
+Capability: KVM_CAP_IRQCHIP_ARGS
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_irq_sources
+Returns: 0 on success, -1 on error
+
+Copies configuration and status information about a range of interrupt
+sources into a user-supplied buffer.  The argument struct gives the
+starting interrupt source number and the number of interrupt sources.
+The user buffer is an array of 64-bit quantities, one per interrupt
+source, with (from the least-significant bit) 32 bits of interrupt
+server number, 8 bits of priority, and 1 bit each for a
+level-sensitive indicator, a masked indicator, and a pending
+indicator.  If some of the sources in the range don't exist, that is,
+have not yet been created with the KVM_IRQCHIP_SET_SOURCES ioctl,
+this returns an ENODEV error.
+
+4.82 KVM_IRQCHIP_SET_SOURCES
+
+Capability: KVM_CAP_IRQCHIP_ARGS
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_irq_sources
+Returns: 0 on success, -1 on error
+
+Sets the configuration and status for a range of interrupt sources
+from information supplied in a user-supplied buffer, creating the
+sources if they don't already exist.  The argument struct gives the
+starting interrupt source number and the number of interrupt sources.
+The user buffer is formatted as for KVM_IRQCHIP_GET_SOURCES.
+
 
 5. The kvm_run structure
 ------------------------
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 5a56e1c..17c9a15 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -142,6 +142,7 @@ extern int kvmppc_mmu_hv_init(void);
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
 extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
                           bool upper, u32 val);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 8295dc7..b05e7cd 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -188,6 +188,10 @@ struct kvmppc_linear_info {
        int              type;
 };
 
+/* XICS components, defined in book3s_xics.c */
+struct kvmppc_xics;
+struct kvmppc_icp;
+
 /*
  * The reverse mapping array has one entry for each HPTE,
  * which stores the guest's view of the second word of the HPTE
@@ -256,6 +260,7 @@ struct kvm_arch {
 #ifdef CONFIG_PPC_BOOK3S_64
        struct list_head spapr_tce_tables;
        struct list_head rtas_tokens;
+       struct kvmppc_xics *xics;
 #endif
 };
 
@@ -572,6 +577,9 @@ struct kvm_vcpu_arch {
        u64 busy_stolen;
        u64 busy_preempt;
 #endif
+#ifdef CONFIG_PPC_BOOK3S_64
+       struct kvmppc_icp *icp; /* XICS presentation controller */
+#endif
 };
 
 /* Values for vcpu->arch.state */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index be611f6..f0fd22b 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -130,6 +130,13 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
 extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
                        struct kvm_memory_slot *memslot, unsigned long porder);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
+extern int kvmppc_xics_ioctl(struct kvm *kvm, unsigned ioctl, unsigned long arg);
+extern int kvmppc_xics_create(struct kvm *kvm, struct kvm_irqchip_args *args);
+extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu);
+extern void kvmppc_xics_free(struct kvm *kvm);
+
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
                                struct kvm_create_spapr_tce *args);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
@@ -167,6 +174,8 @@ extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
 extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp);
 extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu);
 extern void kvmppc_rtas_tokens_free(struct kvm *kvm);
+extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority);
+extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority);
 
 /*
  * Cuts out inst bits with ordering according to spec.
@@ -263,6 +272,16 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 
 static inline void kvm_linear_init(void)
 {}
+
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+static inline int kvmppc_xics_enabled(struct kvm *kvm)
+{
+       return kvm->arch.xics != NULL;
+}
+#else
+static inline int kvmppc_xics_enabled(struct kvm *kvm) { return 0; }
 #endif
 
 static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 432132c..e2eb04c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -87,6 +87,7 @@ kvm-book3s_64-module-objs := \
        book3s.o \
        book3s_64_vio.o \
        book3s_rtas.o \
+       book3s_xics.o \
        $(kvm-book3s_64-objs-y)
 
 kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 6548445..c5a4478 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -104,7 +104,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
        return prio;
 }
 
-static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
                                          unsigned int vec)
 {
        unsigned long old_pending = vcpu->arch.pending_exceptions;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 567c264..aa3a0db 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -528,6 +528,14 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 
                /* Send the error out to userspace via KVM_RUN */
                return rc;
+       case H_XIRR:
+       case H_CPPR:
+       case H_EOI:
+       case H_IPI:
+               if (kvmppc_xics_enabled(vcpu->kvm)) {
+                       ret = kvmppc_xics_hcall(vcpu, req);
+                       break;
+               } /* fallthrough */
        default:
                return RESUME_HOST;
        }
@@ -886,6 +894,13 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
        spin_lock_init(&vcpu->arch.tbacct_lock);
        vcpu->arch.busy_preempt = TB_NIL;
 
+       /* Create the XICS */
+       if (kvmppc_xics_enabled(kvm)) {
+               err = kvmppc_xics_create_icp(vcpu);
+               if (err < 0)
+                       goto free_vcpu;
+       }
+
        kvmppc_mmu_book3s_hv_init(vcpu);
 
        vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
@@ -937,6 +952,8 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
                kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa.pinned_addr);
        spin_unlock(&vcpu->arch.vpa_update_lock);
        kvm_vcpu_uninit(vcpu);
+       if (kvmppc_xics_enabled(vcpu->kvm))
+               kvmppc_xics_free_icp(vcpu);
        kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
 
@@ -1882,6 +1899,9 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
 
        kvmppc_rtas_tokens_free(kvm);
 
+       if (kvmppc_xics_enabled(kvm))
+               kvmppc_xics_free(kvm);
+
        kvmppc_free_hpt(kvm);
        WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 73ed11c..9b2237f 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1069,6 +1069,13 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
        if (err < 0)
                goto uninit_vcpu;
 
+       /* Create the XICS */
+       if (kvmppc_xics_enabled(kvm)) {
+               err = kvmppc_xics_create_icp(vcpu);
+               if (err < 0)
+                       goto free_vcpu;
+       }
+
        return vcpu;
 
 uninit_vcpu:
@@ -1085,6 +1092,8 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
        struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 
+       if (kvmppc_xics_enabled(vcpu->kvm))
+               kvmppc_xics_free_icp(vcpu);
        free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
        kvm_vcpu_uninit(vcpu);
        kfree(vcpu_book3s->shadow_vcpu);
@@ -1293,6 +1302,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 {
 #ifdef CONFIG_PPC64
        INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+       INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
 #endif
 
        return 0;
@@ -1303,6 +1313,9 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
 #ifdef CONFIG_PPC64
        WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 #endif
+       if (kvmppc_xics_enabled(kvm))
+               kvmppc_xics_free(kvm);
+
 }
 
 static int kvmppc_book3s_init(void)
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index 4efa4a4..94cec5b 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -227,6 +227,15 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
        return EMULATE_DONE;
 }
 
+static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
+{
+       long rc = kvmppc_xics_hcall(vcpu, cmd);
+       if (rc == H_TOO_HARD)
+               return EMULATE_FAIL;
+       kvmppc_set_gpr(vcpu, 3, rc);
+       return EMULATE_DONE;
+}
+
 int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 {
        switch (cmd) {
@@ -246,6 +255,13 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
                clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
                vcpu->stat.halt_wakeup++;
                return EMULATE_DONE;
+       case H_XIRR:
+       case H_CPPR:
+       case H_EOI:
+       case H_IPI:
+               if (kvmppc_xics_enabled(vcpu->kvm))
+                       return kvmppc_h_pr_xics_hcall(vcpu, cmd);
+               break;
        case H_RTAS:
                if (list_empty(&vcpu->kvm->arch.rtas_tokens))
                        return RESUME_HOST;
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
index 8a324e8..6a6c1fe 100644
--- a/arch/powerpc/kvm/book3s_rtas.c
+++ b/arch/powerpc/kvm/book3s_rtas.c
@@ -18,12 +18,61 @@
 #include <asm/rtas.h>
 
 
+static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+       u32 irq, server, priority;
+       int rc;
+
+       if (args->nargs != 3 || args->nret != 1) {
+               rc = -3;
+               goto out;
+       }
+
+       irq = args->args[0];
+       server = args->args[1];
+       priority = args->args[2];
+
+       rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
+       if (rc)
+               rc = -3;
+out:
+       args->rets[0] = rc;
+}
+
+static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+       u32 irq, server, priority;
+       int rc;
+
+       if (args->nargs != 1 || args->nret != 3) {
+               rc = -3;
+               goto out;
+       }
+
+       irq = args->args[0];
+
+       server = priority = 0;
+       rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
+       if (rc) {
+               rc = -3;
+               goto out;
+       }
+
+       args->rets[1] = server;
+       args->rets[2] = priority;
+out:
+       args->rets[0] = rc;
+}
+
 struct rtas_handler {
        void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args);
        char *name;
 };
 
-static struct rtas_handler rtas_handlers[] = { };
+static struct rtas_handler rtas_handlers[] = {
+       { .name = "ibm,set-xive", .handler = kvm_rtas_set_xive },
+       { .name = "ibm,get-xive", .handler = kvm_rtas_get_xive },
+};
 
 struct rtas_token_definition {
        struct list_head list;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
new file mode 100644
index 0000000..7749060
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -0,0 +1,1101 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "book3s_xics.h"
+
+#define XICS_DBG(fmt...) do { } while (0)
+//#define XICS_DBG(fmt...) do { trace_printk(fmt); } while (0)
+
+/*
+ * LOCKING
+ * =======
+ *
+ * Each ICS has a mutex protecting the information about the IRQ
+ * sources and avoiding simultaneous deliveries of the same interrupt.
+ *
+ * ICP operations are done via a single compare & swap transaction
+ * (most ICP state fits in the union kvmppc_icp_state)
+ */
+
+/*
+ * TODO
+ * ====
+ *
+ * - To speed up resends, keep a bitmap of "resend" set bits in the
+ *   ICS
+ *
+ * - Speed up server# -> ICP lookup (array ? hash table ?)
+ *
+ * - Make ICS lockless as well, or at least a per-interrupt lock or hashed
+ *   locks array to improve scalability
+ *
+ * - ioctl's to save/restore the entire state for snapshot & migration
+ */
+
+/* -- ICS routines -- */
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+                           u32 new_irq);
+
+static void ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
+{
+       struct ics_irq_state *state;
+       struct kvmppc_ics *ics; 
+       u16 src;
+
+       XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
+
+       ics = kvmppc_xics_find_ics(xics, irq, &src);
+       if (!ics) {
+               XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq);
+               return;
+       }
+       state = &ics->irq_state[src];
+
+       /*
+        * We set state->asserted locklessly. This should be fine as
+        * we are the only setter, thus concurrent access is undefined
+        * to begin with.
+        */
+       if (level == KVM_INTERRUPT_SET_LEVEL)
+               state->asserted = 1;
+       else if (level == KVM_INTERRUPT_UNSET) {
+               state->asserted = 0;
+               return;
+       }
+
+       /* Attempt delivery */
+       icp_deliver_irq(xics, NULL, irq);
+}
+
+static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+                            struct kvmppc_icp *icp)
+{
+       int i;
+
+       mutex_lock(&ics->lock);
+
+       for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+               struct ics_irq_state *state = &ics->irq_state[i];
+
+               if (!state->resend)
+                       continue;
+
+               XICS_DBG("resend %#x prio %#x\n", state->number,
+                             state->priority);
+
+               mutex_unlock(&ics->lock);
+               icp_deliver_irq(xics, icp, state->number);
+               mutex_lock(&ics->lock);
+       }
+
+       mutex_unlock(&ics->lock);
+}
+
+int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+       struct kvmppc_icp *icp;
+       struct kvmppc_ics *ics;
+       struct ics_irq_state *state;
+       u16 src;
+       bool deliver;
+
+       if (!xics)
+               return -ENODEV;
+
+       ics = kvmppc_xics_find_ics(xics, irq, &src);
+       if (!ics)
+               return -EINVAL;
+       state = &ics->irq_state[src];
+
+       icp = kvmppc_xics_find_server(kvm, server);
+       if (!icp)
+               return -EINVAL;
+
+       mutex_lock(&ics->lock);
+
+       XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n",
+                irq, server, priority,
+                state->masked_pending, state->resend);
+
+       state->server = server;
+       state->priority = priority;
+       deliver = false;
+       if ((state->masked_pending || state->resend) && priority != MASKED) {
+               state->masked_pending = 0;
+               deliver = true;
+       }
+
+       mutex_unlock(&ics->lock);
+
+       if (deliver)
+               icp_deliver_irq(xics, icp, irq);
+
+       return 0;
+}
+
+int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+       struct kvmppc_ics *ics;
+       struct ics_irq_state *state;
+       u16 src;
+
+       if (!xics)
+               return -ENODEV;
+
+       ics = kvmppc_xics_find_ics(xics, irq, &src);
+       if (!ics)
+               return -EINVAL;
+       state = &ics->irq_state[src];
+
+       mutex_lock(&ics->lock);
+       *server = state->server;
+       *priority = state->priority;
+       mutex_unlock(&ics->lock);
+
+       return 0;
+}
+
+/* -- ICP routines, including hcalls -- */
+
+static inline bool icp_try_update(struct kvmppc_icp *icp,
+                                 union kvmppc_icp_state old,
+                                 union kvmppc_icp_state new,
+                                 bool change_self)
+{
+       bool success;
+
+       /* Calculate new output value */
+       new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+       /* Attempt atomic update */
+       success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+       if (!success)
+               goto bail;
+
+       XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+                icp->vcpu->vcpu_id,
+                old.cppr, old.mfrr, old.pending_pri, old.xisr,
+                old.need_resend, old.out_ee);
+       XICS_DBG("UPD        - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+                new.cppr, new.mfrr, new.pending_pri, new.xisr,
+                new.need_resend, new.out_ee);
+       /*
+        * Check for output state update
+        *
+        * Note that this is racy since another processor could be updating
+        * the state already. This is why we never clear the interrupt output
+        * here, we only ever set it. The clear only happens prior to doing
+        * an update and only by the processor itself. Currently we do it
+        * in Accept (H_XIRR) and Up_Cppr (H_CPPR).
+        *
+        * We also do not try to figure out whether the EE state has changed,
+        * we unconditionally set it if the new state calls for it for the
+        * same reason.
+        */
+       if (new.out_ee) {
+               kvmppc_book3s_queue_irqprio(icp->vcpu,
+                                           BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+               if (!change_self)
+                       kvm_vcpu_kick(icp->vcpu);
+       }
+ bail:
+       return success;
+}
+
+static void icp_check_resend(struct kvmppc_xics *xics,
+                            struct kvmppc_icp *icp)
+{
+       u32 icsid;
+       
+       /* Order this load with the test for need_resend in the caller */
+       smp_rmb();
+       for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+               struct kvmppc_ics *ics = xics->ics[icsid];
+
+               if (!test_and_clear_bit(icsid, icp->resend_map))
+                       continue;
+               if (!ics)
+                       continue;
+               ics_check_resend(xics, ics, icp);
+       }
+}
+
+static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+                              u32 *reject)
+{
+       union kvmppc_icp_state old_state, new_state;
+       bool success;
+
+       XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority,
+                icp->vcpu->vcpu_id);
+
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               *reject = 0;
+
+               /* See if we can deliver */
+               success = new_state.cppr > priority &&
+                       new_state.mfrr > priority &&
+                       new_state.pending_pri > priority;
+
+               /*
+                * If we can, check for a rejection and perform the
+                * delivery
+                */
+               if (success) {
+                       *reject = new_state.xisr;
+                       new_state.xisr = irq;
+                       new_state.pending_pri = priority;
+               } else {
+                       /*
+                        * If we failed to deliver we set need_resend
+                        * so a subsequent CPPR state change causes us
+                        * to try a new delivery.
+                        */
+                       new_state.need_resend = true;
+               }
+
+       } while (!icp_try_update(icp, old_state, new_state, false));
+
+       return success;
+}
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+                           u32 new_irq)
+{
+       struct ics_irq_state *state;
+       struct kvmppc_ics *ics;
+       u32 reject;
+       u16 src;        
+
+       /*
+        * This is used both for initial delivery of an interrupt and
+        * for subsequent rejection.
+        *
+        * Rejection can be racy vs. resends. We have evaluated the
+        * rejection in an atomic ICP transaction which is now complete,
+        * so potentially the ICP can already accept the interrupt again.
+        *
+        * So we need to retry the delivery. Essentially the reject path
+        * boils down to a failed delivery. Always.
+        *
+        * Now the interrupt could also have moved to a different target,
+        * thus we may need to re-do the ICP lookup as well
+        */
+        
+ again:
+       /* Get the ICS state and lock it */
+       ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+       if (!ics) {
+               XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq);
+               return;
+       }
+       state = &ics->irq_state[src];
+
+       /* Get a lock on the ICS */
+       mutex_lock(&ics->lock);
+
+       /* Get our server */
+       if (!icp || state->server != icp->vcpu->vcpu_id) {
+               icp = kvmppc_xics_find_server(xics->kvm, state->server);
+               if (!icp) {
+                       pr_warning("icp_deliver_irq: IRQ 0x%06x server 0x%x"
+                                  " not found !\n", new_irq, state->server);
+                       goto out;
+               }
+       }
+
+       /* Clear the resend bit of that interrupt */
+       state->resend = 0;
+
+       /*
+        * If masked, bail out
+        *
+        * Note: PAPR doesn't mention anything about masked pending
+        * when doing a resend, only when doing a delivery.
+        *
+        * However that would have the effect of losing a masked
+        * interrupt that was rejected and isn't consistent with
+        * the whole masked_pending business which is about not
+        * losing interrupts that occur while masked.
+        *
+        * I don't differentiate normal deliveries and resends, this
+        * implementation will differ from PAPR and not lose such
+        * interrupts.
+        */
+       if (state->priority == MASKED) {
+               XICS_DBG("irq %#x masked pending\n", new_irq);
+               state->masked_pending = 1;
+               goto out;
+       }
+
+       /*
+        * Try the delivery, this will set the need_resend flag
+        * in the ICP as part of the atomic transaction if the
+        * delivery is not possible.
+        *
+        * Note that if successful, the new delivery might have itself
+        * rejected an interrupt that was "delivered" before we took the
+        * ICS mutex.
+        *
+        * In this case we do the whole sequence all over again for the
+        * new guy. We cannot assume that the rejected interrupt is less
+        * favored than the new one, and thus doesn't need to be delivered,
+        * because by the time we exit icp_try_to_deliver() the target
+        * processor may well have already consumed & completed it, and thus
+        * the rejected interrupt might actually be already acceptable.
+        */
+       if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+               /*
+                * Delivery was successful, did we reject somebody else ?
+                */
+               if (reject && reject != XICS_IPI) {
+                       mutex_unlock(&ics->lock);
+                       new_irq = reject;
+                       goto again;
+               }
+       } else {
+               /*
+                * We failed to deliver the interrupt, so we need to set the
+                * resend map bit and mark the ICS state as needing a resend
+                */
+               set_bit(ics->icsid, icp->resend_map);
+               state->resend = 1;
+
+               /*
+                * If the need_resend flag got cleared in the ICP some time
+                * between icp_try_to_deliver() atomic update and now, then
+                * we know it might have missed the resend_map bit. So we
+                * retry
+                */
+               smp_mb();
+               if (!icp->state.need_resend) {
+                       mutex_unlock(&ics->lock);
+                       goto again;
+               }
+       }
+ out:
+       mutex_unlock(&ics->lock);
+}
+
+static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+                         u8 new_cppr)
+{
+       union kvmppc_icp_state old_state, new_state;
+       bool resend;
+
+       /*
+        * This handles several related states in one operation:
+        *
+        * ICP State: Down_CPPR
+        *
+        * Load CPPR with new value and if the XISR is 0
+        * then check for resends:
+        *
+        * ICP State: Resend
+        *
+        * If MFRR is more favored than CPPR, check for IPIs
+        * and notify ICS of a potential resend. This is done
+        * asynchronously (when used in real mode, we will have
+        * to exit here).
+        *
+        * We do not handle the complete Check_IPI as documented
+        * here. In the PAPR, this state will be used for both
+        * Set_MFRR and Down_CPPR. However, we know that we aren't
+        * changing the MFRR state here so we don't need to handle
+        * the case of an MFRR causing a reject of a pending irq,
+        * this will have been handled when the MFRR was set in the
+        * first place.
+        *
+        * Thus we don't have to handle rejects, only resends.
+        *
+        * When implementing real mode for HV KVM, resend will lead to
+        * a H_TOO_HARD return and the whole transaction will be handled
+        * in virtual mode.
+        */
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               /* Down_CPPR */
+               new_state.cppr = new_cppr;
+
+               /*
+                * Cut down Resend / Check_IPI / IPI
+                *
+                * The logic is that we cannot have a pending interrupt
+                * trumped by an IPI at this point (see above), so we
+                * know that either the pending interrupt is already an
+                * IPI (in which case we don't care to override it) or
+                * it's either more favored than us or non existent
+                */
+               if (new_state.mfrr < new_cppr &&
+                   new_state.mfrr <= new_state.pending_pri) {
+                       WARN_ON(new_state.xisr != XICS_IPI &&
+                               new_state.xisr != 0);
+                       new_state.pending_pri = new_state.mfrr;
+                       new_state.xisr = XICS_IPI;
+               }
+
+               /* Latch/clear resend bit */
+               resend = new_state.need_resend;
+               new_state.need_resend = 0;
+
+       } while (!icp_try_update(icp, old_state, new_state, true));
+
+       /*
+        * Now handle resend checks. Those are asynchronous to the ICP
+        * state update in HW (ie bus transactions) so we can handle them
+        * separately here too
+        */
+       if (resend)
+               icp_check_resend(xics, icp);
+}
+
+/*
+ * H_XIRR: accept the interrupt currently presented to the calling vcpu.
+ *
+ * Returns the XIRR value sampled before the accept: the XISR ORed with
+ * the CPPR shifted left by 24 bits.
+ */
+static noinline unsigned long h_xirr(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+       union kvmppc_icp_state old_state, new_state;
+       u32 xirr;
+
+       /* Drop the external interrupt queued on the processor first */
+       kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+                                     BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+       /*
+        * ICP State: Accept_Interrupt
+        *
+        * Hand back the pending interrupt (if there is one) together
+        * with the current CPPR, then clear the XISR and make the
+        * pending priority the new CPPR. The update is retried until
+        * icp_try_update() succeeds.
+        */
+       for (;;) {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+
+               /* Nothing pending: the state is left untouched */
+               if (!old_state.xisr)
+                       break;
+
+               new_state.cppr = new_state.pending_pri;
+               new_state.pending_pri = 0xff;
+               new_state.xisr = 0;
+
+               if (icp_try_update(icp, old_state, new_state, true))
+                       break;
+       }
+
+       XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr);
+
+       return xirr;
+}
+
+/*
+ * H_IPI: set the MFRR of the ICP serving @server to @mfrr.
+ *
+ * The target is the caller's own ICP when @server matches its vcpu_id,
+ * otherwise it is looked up with kvmppc_xics_find_server().
+ *
+ * Returns H_PARAMETER if no ICP exists for @server, H_SUCCESS otherwise.
+ */
+static noinline int h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+                         unsigned long mfrr)
+{
+       union kvmppc_icp_state old_state, new_state;
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       struct kvmppc_icp *icp;
+       u32 reject;
+       bool resend;
+       bool local;
+
+       XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n",
+                       vcpu->vcpu_id, server, mfrr);
+
+       local = vcpu->vcpu_id == server;
+       if (local)
+               icp = vcpu->arch.icp;
+       else
+               icp = kvmppc_xics_find_server(vcpu->kvm, server);
+       if (!icp)
+               return H_PARAMETER;
+
+       /*
+        * ICP state: Set_MFRR
+        *
+        * If the CPPR is more favored than the new MFRR, then
+        * nothing needs to be rejected as there can be no XISR to
+        * reject.  If the MFRR is being made less favored then
+        * there might be a previously-rejected interrupt needing
+        * to be resent.
+        *
+        * If the CPPR is less favored, then we might be replacing
+        * an interrupt, and thus need to possibly reject it as in
+        *
+        * ICP state: Check_IPI
+        */
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               /* Set_MFRR */
+               new_state.mfrr = mfrr;
+
+               /* Check_IPI */
+               reject = 0;
+               resend = false;
+               if (mfrr < new_state.cppr) {
+                       /*
+                        * Only take over the XISR when the IPI is at
+                        * least as favored as what is pending; whatever
+                        * it displaces is rejected below unless it was
+                        * itself an IPI. A more favored pending interrupt
+                        * must stay in place, otherwise it would be
+                        * overwritten without ever being rejected and
+                        * so lost.
+                        */
+                       if (mfrr <= new_state.pending_pri) {
+                               reject = new_state.xisr;
+                               new_state.pending_pri = mfrr;
+                               new_state.xisr = XICS_IPI;
+                       }
+               }
+
+               /* MFRR became less favored: latch/clear the resend bit */
+               if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
+                       resend = new_state.need_resend;
+                       new_state.need_resend = 0;
+               }
+       } while (!icp_try_update(icp, old_state, new_state, local));
+
+       /* Handle reject */
+       if (reject && reject != XICS_IPI)
+               icp_deliver_irq(xics, icp, reject);
+
+       /* Handle resend */
+       if (resend)
+               icp_check_resend(xics, icp);
+
+       return H_SUCCESS;
+}
+
+static noinline void h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+       union kvmppc_icp_state old_state, new_state;
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+       u32 reject;
+
+       XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr);
+
+       /*
+        * ICP State: Set_CPPR
+        *
+        * We can safely compare the new value with the current
+        * value outside of the transaction as the CPPR is only
+        * ever changed by the processor on itself
+        *
+        * Note that only the "equal" case returns early: after
+        * icp_down_cppr() execution continues into the update loop
+        * below, which stores the same CPPR value once more.
+        * NOTE(review): presumably intentional and harmless - confirm.
+        */
+       if (cppr > icp->state.cppr)
+               icp_down_cppr(xics, icp, cppr);
+       else if (cppr == icp->state.cppr)
+               return;
+
+       /*
+        * ICP State: Up_CPPR
+        *
+        * The processor is raising its priority, this can result
+        * in a rejection of a pending interrupt:
+        *
+        * ICP State: Reject_Current
+        *
+        * We can remove EE from the current processor, the update
+        * transaction will set it again if needed
+        */
+       kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+                                     BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               reject = 0;
+               new_state.cppr = cppr;
+
+               if (cppr <= new_state.pending_pri) { /* pending irq no longer more favored than CPPR */
+                       reject = new_state.xisr;
+                       new_state.xisr = 0;
+                       new_state.pending_pri = 0xff;
+               }
+
+       } while (!icp_try_update(icp, old_state, new_state, true));
+
+       /*
+        * Check for rejects. They are handled by doing a new delivery
+        * attempt (see comments in icp_deliver_irq).
+        * A rejected IPI (or no reject at all, reject == 0) needs no
+        * redelivery here.
+        */
+       if (reject && reject != XICS_IPI)
+               icp_deliver_irq(xics, icp, reject);
+}
+
+/*
+ * H_EOI: end-of-interrupt for the source in the low 24 bits of @xirr,
+ * restoring the CPPR from the top byte of @xirr.
+ *
+ * Returns H_SUCCESS, or H_PARAMETER if the source does not belong to
+ * any existing ICS.
+ */
+static noinline int h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+       u32 irq = xirr & 0x00ffffff;
+       struct kvmppc_ics *ics;
+       u16 src;
+
+       XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
+
+       /*
+        * ICP State: EOI
+        *
+        * Note: If EOI is incorrectly used by SW to lower the CPPR
+        * value (ie more favored), we do not check for rejection of
+        * a pending interrupt, this is a SW error and PAPR specifies
+        * that we don't have to deal with it.
+        *
+        * The EOI is sent to the ICS only after the CPPR update.
+        *
+        * ICP State: Down_CPPR, handled by a separate function because
+        * it is shared with H_CPPR.
+        */
+       icp_down_cppr(xics, icp, xirr >> 24);
+
+       /* IPIs have no EOI */
+       if (irq == XICS_IPI)
+               return H_SUCCESS;
+
+       /*
+        * EOI handling: a source that is still asserted has to be
+        * resent. The ICS state is peeked at without taking its lock.
+        *
+        * "Message" interrupts will never have "asserted" set
+        */
+       ics = kvmppc_xics_find_ics(xics, irq, &src);
+       if (!ics) {
+               XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq);
+               return H_PARAMETER;
+       }
+
+       /* Still asserted, resend it */
+       if (ics->irq_state[src].asserted)
+               icp_deliver_irq(xics, icp, irq);
+
+       return H_SUCCESS;
+}
+
+/*
+ * Dispatch a XICS hypercall (H_XIRR, H_CPPR, H_EOI, H_IPI) for @vcpu.
+ *
+ * Arguments are read from GPR4/GPR5; H_XIRR stores its result in GPR4.
+ * Returns H_HARDWARE when the vcpu has no ICP or the VM has no XICS,
+ * otherwise the handler's status (H_SUCCESS for any other @req value).
+ */
+int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
+{
+       /* Check if we have an ICP */
+       if (!vcpu->arch.icp || !vcpu->kvm->arch.xics)
+               return H_HARDWARE;
+
+       switch (req) {
+       case H_XIRR:
+               kvmppc_set_gpr(vcpu, 4, h_xirr(vcpu));
+               return H_SUCCESS;
+       case H_CPPR:
+               h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
+               return H_SUCCESS;
+       case H_EOI:
+               return h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
+       case H_IPI:
+               return h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
+                            kvmppc_get_gpr(vcpu, 5));
+       }
+
+       return H_SUCCESS;
+}
+
+
+/* -- Initialisation code etc. -- */
+
+/*
+ * seq_file show routine behind the debugfs file: dumps one line per
+ * vcpu ICP, then every source of every allocated ICS. Always returns 0.
+ */
+static int xics_debug_show(struct seq_file *m, void *private)
+{
+       struct kvmppc_xics *xics = m->private;
+       struct kvm *kvm = xics->kvm;
+       struct kvm_vcpu *vcpu;
+       int icsid, i;
+
+       /* Nothing to show once the XICS is detached from its VM */
+       if (!kvm)
+               return 0;
+
+       seq_printf(m, "=========\nICP state\n=========\n");
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               union kvmppc_icp_state snap;
+
+               if (!vcpu->arch.icp)
+                       continue;
+
+               /* One read of the raw word gives a consistent snapshot */
+               snap.raw = ACCESS_ONCE(vcpu->arch.icp->state.raw);
+               seq_printf(m, "cpu server %#x XIRR:%#x PPRI:%#x CPPR:%#x "
+                          "MFRR:%#x OUT:%d NR:%d\n", vcpu->vcpu_id, snap.xisr,
+                          snap.pending_pri, snap.cppr, snap.mfrr,
+                          snap.out_ee, snap.need_resend);
+       }
+
+       for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
+               struct kvmppc_ics *ics = xics->ics[icsid];
+               struct ics_irq_state *irq;
+
+               if (!ics)
+                       continue;
+
+               seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
+                          icsid);
+
+               /* Source state is printed with the ICS lock held */
+               mutex_lock(&ics->lock);
+               for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+                       irq = &ics->irq_state[i];
+                       seq_printf(m, "irq 0x%06x: server %#x prio %#x save"
+                                  " prio %#x asserted %d resend %d masked"
+                                  " pending %d\n",
+                                  irq->number, irq->server, irq->priority,
+                                  irq->saved_priority, irq->asserted,
+                                  irq->resend, irq->masked_pending);
+               }
+               mutex_unlock(&ics->lock);
+       }
+
+       return 0;
+}
+
+/* debugfs open: bind xics_debug_show() to the XICS kept in i_private */
+static int xics_debug_open(struct inode *inode, struct file *file)
+{
+       void *xics = inode->i_private;
+
+       return single_open(file, xics_debug_show, xics);
+}
+
+static const struct file_operations xics_debug_fops = {
+       .open = xics_debug_open, /* sets up xics_debug_show() via single_open() */
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = single_release, /* counterpart of single_open() in xics_debug_open() */
+};
+
+/*
+ * Create the debugfs file "kvm-xics-<pointer>" under powerpc_debugfs_root
+ * and remember its dentry in xics->dentry. A failure to allocate the
+ * name is logged and otherwise ignored.
+ */
+static void xics_debugfs_init(struct kvmppc_xics *xics)
+{
+       char *fname = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics);
+
+       if (fname == NULL) {
+               pr_err("%s: no memory for name\n", __func__);
+               return;
+       }
+
+       xics->dentry = debugfs_create_file(fname, S_IRUGO,
+                                          powerpc_debugfs_root,
+                                          xics, &xics_debug_fops);
+       pr_debug("%s: created %s\n", __func__, fname);
+
+       /* The name string is only needed while creating the file */
+       kfree(fname);
+}
+
+/*
+ * Allocate and publish the ICS covering interrupt number @irq, unless
+ * it already exists. Runs under kvm->lock.
+ *
+ * Returns the ICS now stored for that slot, or NULL if it had to be
+ * allocated and the allocation failed.
+ */
+static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvmppc_xics *xics,
+                                                int irq)
+{
+       int icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+       struct kvmppc_ics *ics;
+       int n;
+
+       mutex_lock(&xics->kvm->lock);
+
+       /* Skip creation if somebody else got here first */
+       if (!xics->ics[icsid]) {
+               ics = kzalloc(sizeof(*ics), GFP_KERNEL);
+               if (ics) {
+                       mutex_init(&ics->lock);
+                       ics->icsid = icsid;
+
+                       /* Every source starts out masked */
+                       for (n = 0; n < KVMPPC_XICS_IRQ_PER_ICS; n++) {
+                               struct ics_irq_state *st = &ics->irq_state[n];
+
+                               st->number = (icsid << KVMPPC_XICS_ICS_SHIFT) | n;
+                               st->priority = MASKED;
+                               st->saved_priority = MASKED;
+                       }
+
+                       /* Initialise the contents before publishing the pointer */
+                       smp_wmb();
+                       xics->ics[icsid] = ics;
+
+                       if (icsid > xics->max_icsid)
+                               xics->max_icsid = icsid;
+               }
+       }
+
+       mutex_unlock(&xics->kvm->lock);
+       return xics->ics[icsid];
+}
+
+/*
+ * Allocate the ICP for @vcpu and attach it as vcpu->arch.icp, with MFRR
+ * and pending priority both set to MASKED.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_icp *icp = kzalloc(sizeof(*icp), GFP_KERNEL);
+
+       if (icp == NULL)
+               return -ENOMEM;
+
+       icp->state.pending_pri = MASKED;
+       icp->state.mfrr = MASKED;
+       icp->vcpu = vcpu;
+       vcpu->arch.icp = icp;
+
+       XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id);
+
+       return 0;
+}
+
+/* Release the ICP of @vcpu, if any, and clear vcpu->arch.icp */
+void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu)
+{
+       /* kfree(NULL) is a no-op, so a missing ICP needs no special case */
+       kfree(vcpu->arch.icp);
+       vcpu->arch.icp = NULL;
+}
+
+/*
+ * Tear down the XICS of @kvm: remove its debugfs file, detach it from
+ * the VM and free every ICS along with the XICS itself. Does nothing if
+ * the VM has no XICS. The caller must hold kvm->lock.
+ */
+void kvmppc_xics_free(struct kvm *kvm)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+       int icsid;
+
+       if (xics == NULL)
+               return;
+
+       lockdep_assert_held(&kvm->lock);
+
+       debugfs_remove(xics->dentry);
+
+       /* Break the links in both directions */
+       if (xics->kvm) {
+               xics->kvm->arch.xics = NULL;
+               xics->kvm = NULL;
+       }
+
+       /* Unallocated slots are NULL, which kfree() accepts */
+       for (icsid = 0; icsid <= xics->max_icsid; icsid++)
+               kfree(xics->ics[icsid]);
+
+       kfree(xics);
+}
+
+/*
+ * Report the state of srcs->nr_irqs interrupt sources starting at
+ * srcs->irq, one u64 per source, into the user buffer srcs->irqbuf.
+ *
+ * Returns 0 on success, -ENOENT if an ICS or a source in the range does
+ * not exist, or -EFAULT if the user buffer cannot be written.
+ * srcs->nr_irqs is consumed as the range is walked.
+ */
+static int kvm_xics_get_sources(struct kvm *kvm, struct kvm_irq_sources *srcs)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+       u64 __user *ubufp = srcs->irqbuf;
+       long int irq = srcs->irq;
+       int ret = 0;
+
+       while (srcs->nr_irqs > 0 && ret == 0) {
+               struct kvmppc_ics *ics;
+               struct ics_irq_state *irqp;
+               long int i, nirq;
+               u16 idx;
+
+               ics = kvmppc_xics_find_ics(xics, irq, &idx);
+               if (!ics)
+                       return -ENOENT;
+
+               /* Limit this pass to what is left of the current ICS */
+               nirq = KVMPPC_XICS_IRQ_PER_ICS - idx;
+               if (nirq > srcs->nr_irqs)
+                       nirq = srcs->nr_irqs;
+               srcs->nr_irqs -= nirq;
+               irq += nirq;
+
+               mutex_lock(&ics->lock);
+               irqp = &ics->irq_state[idx];
+               for (i = 0; i < nirq; ++i, ++irqp, ++ubufp) {
+                       u64 val;
+
+                       if (!irqp->exists) {
+                               ret = -ENOENT;
+                               break;
+                       }
+
+                       /* Pack server, priority and flags into one entry */
+                       val = irqp->server;
+                       val |= ((u64)irqp->priority << KVM_IRQ_PRIORITY_SHIFT);
+                       if (irqp->priority == MASKED)
+                               val |= KVM_IRQ_MASKED;
+                       if (irqp->asserted)
+                               val |= KVM_IRQ_LEVEL_SENSITIVE |
+                                       KVM_IRQ_PENDING;
+                       else if (irqp->masked_pending || irqp->resend)
+                               val |= KVM_IRQ_PENDING;
+
+                       if (__put_user(val, ubufp)) {
+                               ret = -EFAULT;
+                               break;
+                       }
+               }
+               mutex_unlock(&ics->lock);
+       }
+
+       return ret;
+}
+
+/*
+ * Configure srcs->nr_irqs interrupt sources starting at srcs->irq from
+ * the u64 entries in the user buffer srcs->irqbuf, creating the ICS
+ * that hold them where necessary. Sources flagged KVM_IRQ_PENDING are
+ * delivered right away.
+ *
+ * Returns 0 on success, -ENOENT if the range includes reserved numbers
+ * or exceeds KVMPPC_XICS_NR_IRQS, -ENOMEM if an ICS cannot be created,
+ * or -EFAULT if the user buffer cannot be read.
+ * srcs->nr_irqs is consumed as the range is walked.
+ */
+static int kvm_xics_set_sources(struct kvm *kvm, struct kvm_irq_sources *srcs)
+{
+       int ret = 0;
+       struct kvmppc_xics *xics = kvm->arch.xics;
+       struct kvmppc_ics *ics;
+       struct ics_irq_state *irqp;
+       u64 __user *ubufp;
+       u16 idx;
+       u64 val;
+       long int i, irq, nirq;
+
+       irq = srcs->irq;
+       ubufp = srcs->irqbuf;
+
+       if (irq < KVMPPC_XICS_FIRST_IRQ ||
+           irq + srcs->nr_irqs > KVMPPC_XICS_NR_IRQS)
+               return -ENOENT;
+
+       while (srcs->nr_irqs > 0 && !ret) {
+               ics = kvmppc_xics_find_ics(xics, irq, &idx);
+               if (!ics) {
+                       ics = kvmppc_xics_create_ics(xics, irq);
+                       if (!ics)
+                               return -ENOMEM;
+               }
+
+               /* Limit this pass to what is left of the current ICS */
+               nirq = KVMPPC_XICS_IRQ_PER_ICS - idx;
+               if (nirq > srcs->nr_irqs)
+                       nirq = srcs->nr_irqs;
+               srcs->nr_irqs -= nirq;
+               irq += nirq;
+
+               /*
+                * ubufp is deliberately not reset here: it has to keep
+                * advancing across ICS boundaries, otherwise the sources
+                * of the next ICS would be programmed from the start of
+                * the user buffer again.
+                */
+               irqp = &ics->irq_state[idx];
+               for (i = 0; i < nirq; ++i, ++irqp, ++ubufp) {
+                       ret = -EFAULT;
+                       if (__get_user(val, ubufp))
+                               break;
+                       ret = 0;
+
+                       mutex_lock(&ics->lock);
+                       irqp->server = val & KVM_IRQ_SERVER_MASK;
+                       irqp->priority = val >> KVM_IRQ_PRIORITY_SHIFT;
+                       irqp->resend = 0;
+                       irqp->masked_pending = 0;
+                       irqp->asserted = 0;
+                       if ((val & KVM_IRQ_PENDING) &&
+                           (val & KVM_IRQ_LEVEL_SENSITIVE))
+                               irqp->asserted = 1;
+                       irqp->exists = 1;
+                       mutex_unlock(&ics->lock);
+
+                       /* Delivery happens outside the ICS lock */
+                       if (val & KVM_IRQ_PENDING)
+                               icp_deliver_irq(xics, NULL, irqp->number);
+               }
+       }
+
+       return ret;
+}
+
+/* -- ioctls -- */
+
+/*
+ * Create the in-kernel XICS for @kvm and its debugfs file.
+ * @args is currently not examined here.
+ *
+ * Returns 0 on success, -EEXIST if the VM already has a XICS, or
+ * -ENOMEM. kvm->lock is taken for the duration and released on every
+ * path.
+ */
+int kvmppc_xics_create(struct kvm *kvm, struct kvm_irqchip_args *args)
+{
+       struct kvmppc_xics *xics;
+       int rc = 0;
+
+       mutex_lock(&kvm->lock);
+
+       /* Already there ? Leave through "out" so the lock is dropped */
+       if (kvm->arch.xics) {
+               rc = -EEXIST;
+               goto out;
+       }
+
+       xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+       if (!xics) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       xics->kvm = kvm;
+       kvm->arch.xics = xics;
+       xics_debugfs_init(xics);
+
+out:
+       mutex_unlock(&kvm->lock);
+       return rc;
+}
+
+/*
+ * KVM_IRQ_LINE backend: hand args->irq and args->level to the ICS.
+ *
+ * Returns 0 on success, -ENODEV if the VM has no XICS, or -EINVAL if
+ * args->level is not KVM_INTERRUPT_SET, KVM_INTERRUPT_SET_LEVEL or
+ * KVM_INTERRUPT_UNSET.
+ */
+static int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+
+       /* locking against multiple callers? */
+
+       if (!xics)
+               return -ENODEV;
+
+       if (args->level != KVM_INTERRUPT_SET &&
+           args->level != KVM_INTERRUPT_SET_LEVEL &&
+           args->level != KVM_INTERRUPT_UNSET)
+               return -EINVAL;
+
+       ics_deliver_irq(xics, args->irq, args->level);
+
+       return 0;
+}
+
+/*
+ * VM ioctl entry point for the XICS: handles KVM_IRQ_LINE,
+ * KVM_IRQCHIP_GET_SOURCES and KVM_IRQCHIP_SET_SOURCES.
+ *
+ * Returns the handler's result, -EFAULT if the argument structure or
+ * the source buffer is not accessible, or -ENOTTY for any other ioctl.
+ */
+int kvmppc_xics_ioctl(struct kvm *kvm, unsigned ioctl, unsigned long arg)
+{
+       void __user *argp = (void __user *)arg;
+
+       /* The ICP state must fit in one word for the compare & swap */
+       BUILD_BUG_ON(sizeof(union kvmppc_icp_state) != sizeof(unsigned long));
+
+       switch (ioctl) {
+       case KVM_IRQ_LINE: {
+               struct kvm_irq_level level;
+
+               if (copy_from_user(&level, argp, sizeof(level)))
+                       return -EFAULT;
+               return kvm_vm_ioctl_xics_irq(kvm, &level);
+       }
+
+       case KVM_IRQCHIP_GET_SOURCES: {
+               struct kvm_irq_sources srcs;
+
+               if (copy_from_user(&srcs, argp, sizeof(srcs)))
+                       return -EFAULT;
+               /* Validated once here; the handler uses __put_user() */
+               if (!access_ok(VERIFY_WRITE, srcs.irqbuf,
+                              srcs.nr_irqs * sizeof(u64)))
+                       return -EFAULT;
+               return kvm_xics_get_sources(kvm, &srcs);
+       }
+
+       case KVM_IRQCHIP_SET_SOURCES: {
+               struct kvm_irq_sources srcs;
+
+               if (copy_from_user(&srcs, argp, sizeof(srcs)))
+                       return -EFAULT;
+               /* Validated once here; the handler uses __get_user() */
+               if (!access_ok(VERIFY_READ, srcs.irqbuf,
+                              srcs.nr_irqs * sizeof(u64)))
+                       return -EFAULT;
+               return kvm_xics_set_sources(kvm, &srcs);
+       }
+
+       default:
+               return -ENOTTY;
+       }
+}
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
new file mode 100644
index 0000000..0e20a51
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _KVM_PPC_BOOK3S_XICS_H
+#define _KVM_PPC_BOOK3S_XICS_H
+
+/*
+ * We use a two-level tree to store interrupt source information.
+ * There are up to 1024 ICS nodes, each of which can represent
+ * 1024 sources.
+ *
+ * An interrupt number is split accordingly: the bits above
+ * KVMPPC_XICS_ICS_SHIFT select the ICS and the bits covered by
+ * KVMPPC_XICS_SRC_MASK select the source within that ICS
+ * (see kvmppc_xics_find_ics()).
+ */
+#define KVMPPC_XICS_MAX_ICS_ID 1023
+#define KVMPPC_XICS_ICS_SHIFT  10
+#define KVMPPC_XICS_IRQ_PER_ICS        (1 << KVMPPC_XICS_ICS_SHIFT)
+#define KVMPPC_XICS_SRC_MASK   (KVMPPC_XICS_IRQ_PER_ICS - 1)
+
+/*
+ * Interrupt source numbers below this are reserved, for example
+ * 0 is "no interrupt", and 2 is used for IPIs.
+ *
+ * KVMPPC_XICS_NR_IRQS is the size of the interrupt number space:
+ * one past the highest number that any ICS can represent.
+ */
+#define KVMPPC_XICS_FIRST_IRQ  16
+#define KVMPPC_XICS_NR_IRQS    ((KVMPPC_XICS_MAX_ICS_ID + 1) * \
+                                KVMPPC_XICS_IRQ_PER_ICS)
+
+/*
+ * Priority value to use for disabling an interrupt.
+ * Also the initial priority of every source in a new ICS and the
+ * initial MFRR / pending priority of a new ICP.
+ */
+#define MASKED 0xff
+
+/* State for one irq source */
+struct ics_irq_state {
+       u32 number; /* global source number: (icsid << KVMPPC_XICS_ICS_SHIFT) | index */
+       u32 server; /* destination server, from the KVM_IRQ_SERVER_MASK bits of an irqbuf entry */
+       u8  priority; /* MASKED (0xff) when the source is disabled */
+       u8  saved_priority; /* currently unused */
+       u8  resend; /* reported as pending by GET_SOURCES; presumably "delivery must be retried" - confirm */
+       u8  masked_pending; /* reported as pending by GET_SOURCES; presumably "arrived while masked" - confirm */
+       u8  asserted; /* Only for LSI */
+       u8  exists; /* set by KVM_IRQCHIP_SET_SOURCES; GET_SOURCES returns -ENOENT while clear */
+};
+
+/*
+ * Atomic ICP state, updated with a single compare & swap.
+ *
+ * kvmppc_xics_ioctl() has a BUILD_BUG_ON that requires this union to be
+ * exactly the size of an unsigned long. Priorities compare numerically:
+ * a smaller value is more favored.
+ */
+union kvmppc_icp_state {
+       unsigned long raw; /* the whole state as one word */
+       struct {
+               u8 out_ee : 1; /* shown as "OUT" in debugfs; presumably "EE raised on the vcpu" - confirm */
+               u8 need_resend : 1; /* latched and cleared by the resend checks */
+               u8 cppr; /* current priority of the processor */
+               u8 mfrr; /* IPI priority, set by H_IPI; MASKED when idle */
+               u8 pending_pri; /* priority of the interrupt in xisr, 0xff when none */
+               u32 xisr; /* pending interrupt source, 0 when none */
+       };
+};
+
+/* One bit per ICS: number of longs needed for KVMPPC_XICS_MAX_ICS_ID + 1 bits */
+#define ICP_RESEND_MAP_SIZE    (KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1)
+
+struct kvmppc_icp {
+       struct kvm_vcpu *vcpu; /* the vcpu this ICP belongs to */
+       union kvmppc_icp_state state; /* accessed via ACCESS_ONCE / icp_try_update() */
+       unsigned long resend_map[ICP_RESEND_MAP_SIZE]; /* bitmap, one bit per ICS */
+};
+
+struct kvmppc_ics {
+       struct mutex lock; /* held while irq_state[] entries are dumped, read or programmed */
+       u16 icsid; /* index of this ICS in kvmppc_xics.ics[] */
+       struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; /* indexed by irq & KVMPPC_XICS_SRC_MASK */
+};
+
+struct kvmppc_xics {
+       struct kvm *kvm; /* owning VM; cleared by kvmppc_xics_free() */
+       struct dentry *dentry; /* debugfs file created by xics_debugfs_init() */
+       u32 max_icsid; /* highest ICS id created so far */
+       struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1]; /* NULL until the ICS is created */
+};
+
+/*
+ * Look up the ICP of the vcpu whose vcpu_id equals @nr.
+ * Returns that vcpu's arch.icp, or NULL if no vcpu matches.
+ */
+static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm,
+                                                        u32 nr)
+{
+       struct kvmppc_icp *found = NULL;
+       struct kvm_vcpu *cur = NULL;
+       int idx;
+
+       kvm_for_each_vcpu(idx, cur, kvm) {
+               if (nr != cur->vcpu_id)
+                       continue;
+               found = cur->arch.icp;
+               break;
+       }
+
+       return found;
+}
+
+/*
+ * Map interrupt number @irq to its ICS. If @source is not NULL it
+ * receives the index of the source within the ICS, even when the
+ * lookup fails.
+ *
+ * Returns the ICS, or NULL if @irq is out of range or the ICS has not
+ * been created.
+ */
+static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
+                                                     u32 irq, u16 *source)
+{
+       u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+
+       if (source)
+               *source = irq & KVMPPC_XICS_SRC_MASK;
+
+       if (icsid > KVMPPC_XICS_MAX_ICS_ID)
+               return NULL;
+
+       /* An ICS that was never created is a NULL slot */
+       return xics->ics[icsid];
+}
+
+
+#endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1772883..3bcc030 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -383,6 +383,7 @@ int kvm_dev_ioctl_check_extension(long ext)
                break;
 #ifdef CONFIG_PPC_BOOK3S_64
        case KVM_CAP_PPC_GET_SMMU_INFO:
+       case KVM_CAP_IRQCHIP_ARGS:
                r = 1;
                break;
 #endif
@@ -1002,6 +1003,28 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
                break;
        }
+       case KVM_IRQ_LINE:
+       case KVM_IRQCHIP_GET_SOURCES:
+       case KVM_IRQCHIP_SET_SOURCES: {
+               struct kvm *kvm = filp->private_data;
+
+               r = -ENOTTY;
+               if (kvmppc_xics_enabled(kvm))
+                       r = kvmppc_xics_ioctl(kvm, ioctl, arg);
+               break;
+       }
+       case KVM_CREATE_IRQCHIP_ARGS: {
+               struct kvm *kvm = filp->private_data;
+               struct kvm_irqchip_args args;
+
+               r = -EFAULT;
+               if (copy_from_user(&args, argp, sizeof(args)))
+                       break;
+               r = -EINVAL;
+               if (args.type == KVM_IRQCHIP_TYPE_XICS)
+                       r = kvmppc_xics_create(kvm, &args);
+               break;
+       }
 #endif /* CONFIG_PPC_BOOK3S_64 */
        default:
                r = -ENOTTY;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 1e2fda0..25d73c0 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -115,6 +115,7 @@ struct kvm_irq_level {
         * ACPI gsi notion of irq.
         * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47..
         * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23..
+        * On powerpc SPAPR, the ICS source number; level must be one of
+        * KVM_INTERRUPT_SET, KVM_INTERRUPT_SET_LEVEL or KVM_INTERRUPT_UNSET.
         */
        union {
                __u32 irq;
@@ -146,6 +147,15 @@ struct kvm_pit_config {
 
 #define KVM_PIT_SPEAKER_DUMMY     1
 
+/* for KVM_CREATE_IRQCHIP_ARGS */
+struct kvm_irqchip_args {
+       __u64 type; /* one of the KVM_IRQCHIP_TYPE_* values below */
+       __u64 param; /* not examined by the XICS creation code */
+};
+
+/* values for type */
+#define KVM_IRQCHIP_TYPE_XICS  1       /* Power server external intr ctrler */
+
 #define KVM_EXIT_UNKNOWN          0
 #define KVM_EXIT_EXCEPTION        1
 #define KVM_EXIT_IO               2
@@ -663,6 +673,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_S390_CSS_SUPPORT 85
 #define KVM_CAP_PPC_EPR 86
 #define KVM_CAP_PPC_RTAS 87
+#define KVM_CAP_IRQCHIP_ARGS 88
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -805,6 +816,21 @@ struct kvm_msi {
        __u8  pad[16];
 };
 
+struct kvm_irq_sources {
+       __u32 irq; /* first interrupt source number of the range */
+       __u32 nr_irqs; /* number of sources, and of 64-bit entries in irqbuf */
+       __u64 __user *irqbuf; /* user buffer holding one entry per source */
+};
+
+/* irqbuf entries are laid out like this: */
+#define KVM_IRQ_SERVER_SHIFT   0
+#define KVM_IRQ_SERVER_MASK    0xffffffffULL
+#define KVM_IRQ_PRIORITY_SHIFT 32
+#define KVM_IRQ_PRIORITY_MASK  0xff
+#define KVM_IRQ_LEVEL_SENSITIVE        (1ULL << 40)
+#define KVM_IRQ_MASKED         (1ULL << 41) /* reported when the priority is 0xff */
+#define KVM_IRQ_PENDING                (1ULL << 42)
+
 /*
  * ioctls for VM fds
  */
@@ -892,6 +918,9 @@ struct kvm_s390_ucas_mapping {
 #define KVM_PPC_GET_HTAB_FD      _IOW(KVMIO,  0xaa, struct kvm_get_htab_fd)
 /* Available with KVM_CAP_PPC_RTAS */
 #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO,  0xab, struct 
kvm_rtas_token_args)
+#define KVM_CREATE_IRQCHIP_ARGS   _IOW(KVMIO,  0xac, struct kvm_irqchip_args)
+#define KVM_IRQCHIP_GET_SOURCES          _IOW(KVMIO,  0xad, struct 
kvm_irq_sources)
+#define KVM_IRQCHIP_SET_SOURCES          _IOW(KVMIO,  0xae, struct 
kvm_irq_sources)
 
 /*
  * ioctls for vcpu fds
-- 
1.7.10.rc3.219.g53414

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to