Re: [PATCH kernel 7/9] KVM: PPC: Move reusable bits of H_PUT_TCE handler to helpers

2015-12-21 Thread Alexey Kardashevskiy

On 12/08/2015 04:27 PM, David Gibson wrote:

On Tue, Sep 15, 2015 at 08:49:37PM +1000, Alexey Kardashevskiy wrote:

Upcoming multi-TCE support (the H_PUT_TCE_INDIRECT/H_STUFF_TCE hypercalls)
will need to validate the TCE (that it has no unexpected bits set) and the
IO address (that it is within the DMA window boundaries).

This introduces helpers to validate TCE and IO address.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
  arch/powerpc/include/asm/kvm_ppc.h  |  4 ++
  arch/powerpc/kvm/book3s_64_vio_hv.c | 89 -
  2 files changed, 83 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index c6ef05b..fcde896 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -166,6 +166,10 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);

  extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce *args);
+extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+   unsigned long ioba, unsigned long npages);
+extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt,
+   unsigned long tce);
  extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 unsigned long ioba, unsigned long tce);
  extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 6cf1ab3..f0fd84c 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -36,6 +36,7 @@
  #include 
  #include 
  #include 
+#include 

  #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64))

@@ -64,7 +65,7 @@ static struct kvmppc_spapr_tce_table 
*kvmppc_find_table(struct kvm_vcpu *vcpu,
   * WARNING: This will be called in real-mode on HV KVM and virtual
   *  mode on PR KVM
   */
-static long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
unsigned long ioba, unsigned long npages)
  {
unsigned long mask = (1ULL << IOMMU_PAGE_SHIFT_4K) - 1;
@@ -76,6 +77,79 @@ static long kvmppc_ioba_validate(struct 
kvmppc_spapr_tce_table *stt,

return H_SUCCESS;
  }
+EXPORT_SYMBOL_GPL(kvmppc_ioba_validate);


Why does it need to be exported - the new users will still be in the
KVM module, won't they?



book3s_64_vio_hv.c contains real-mode code and is always compiled into
vmlinux, while the helper is meant to be called from book3s_64_vio.c, which
may be compiled as a module.






+
+/*
+ * Validates TCE address.
+ * At the moment flags and page mask are validated.
+ * As the host kernel does not access those addresses (just puts them
+ * to the table and user space is supposed to process them), we can skip
+ * checking other things (such as whether the TCE is a guest RAM address or
+ * whether the page was actually allocated).


Hmm.  These comments apply given that the only current user of this is
the kvm acceleration of userspace TCE tables, but the name suggests it
would validate any TCE, including in kernel ones for which this would
be unsafe.



The function has the "kvmppc_" prefix and, besides, the file is in
arch/powerpc/kvm, so to my taste it is self-explanatory that it only
handles TCEs from KVM guests (not even from a random user-space tool), no?





+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *  mode on PR KVM
+ */
+long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
+{
+   unsigned long mask = ((1ULL << IOMMU_PAGE_SHIFT_4K) - 1) &
+   ~(TCE_PCI_WRITE | TCE_PCI_READ);
+
+   if (tce & mask)
+   return H_PARAMETER;
+
+   return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_tce_validate);
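
(A stand-alone illustration, not part of the patch: the effect of the mask above can be checked with a tiny host program. TCE_PCI_READ/TCE_PCI_WRITE are assumed here to be the usual 0x1/0x2 permission bits from asm/tce.h.)

#include <stdio.h>

#define IOMMU_PAGE_SHIFT_4K	12
#define TCE_PCI_READ		0x1UL	/* assumed value */
#define TCE_PCI_WRITE		0x2UL	/* assumed value */

static long tce_validate(unsigned long tce)
{
	/* bits 2..11 must be clear: the low 12 bits minus the R/W permission bits */
	unsigned long mask = ((1UL << IOMMU_PAGE_SHIFT_4K) - 1) &
			~(TCE_PCI_WRITE | TCE_PCI_READ);

	return (tce & mask) ? -1 /* H_PARAMETER */ : 0 /* H_SUCCESS */;
}

int main(void)
{
	/* 4K-aligned address plus R/W permission bits: accepted */
	printf("%ld\n", tce_validate(0x100000UL | TCE_PCI_READ | TCE_PCI_WRITE));
	/* a stray bit inside bits 2..11: rejected */
	printf("%ld\n", tce_validate(0x100010UL));
	return 0;
}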
+
+/* Note on the use of page_address() in real mode,
+ *
+ * It is safe to use page_address() in real mode on ppc64 because
+ * page_address() is always defined as lowmem_page_address()
+ * which returns __va(PFN_PHYS(page_to_pfn(page))), a purely arithmetic
+ * operation that does not access the page struct.
+ *
+ * Theoretically page_address() could be defined differently,
+ * but only if either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL
+ * is enabled.
+ * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64,
+ * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only
+ * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP
+ * is not expected to be enabled on ppc32, page_address()
+ * is safe for ppc32 as well.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *  mode on PR KVM
+ */
+static u64 *kvmppc_page_address(struct page *page)
+{
+#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL)
+#error TODO: fix to avoid page_address() here
+#endif
+   return (u64 *) page_address(page);
+}
+
+/*
+ * Handles TCE requests 

Re: [PATCH kernel 9/9] KVM: PPC: Add support for multiple-TCE hcalls

2015-12-21 Thread Alexey Kardashevskiy

On 12/08/2015 04:48 PM, David Gibson wrote:

On Tue, Sep 15, 2015 at 08:49:39PM +1000, Alexey Kardashevskiy wrote:

This adds real and virtual mode handlers for the H_PUT_TCE_INDIRECT and
H_STUFF_TCE hypercalls for user space emulated devices such as IBMVIO
devices or emulated PCI.  These calls allow adding multiple entries
(up to 512) into the TCE table in one call which saves time on
transition between kernel and user space.

This implements the KVM_CAP_PPC_MULTITCE capability. When present,
the kernel will try handling H_PUT_TCE_INDIRECT and H_STUFF_TCE.
If they can not be handled by the kernel, they are passed on to
the user space. The user space still has to have an implementation
for these.

Both HV and PR-style KVM are supported.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
  Documentation/virtual/kvm/api.txt   |  25 ++
  arch/powerpc/include/asm/kvm_ppc.h  |  12 +++
  arch/powerpc/kvm/book3s_64_vio.c| 111 +++-
  arch/powerpc/kvm/book3s_64_vio_hv.c | 145 ++--
  arch/powerpc/kvm/book3s_hv.c|  26 +-
  arch/powerpc/kvm/book3s_hv_rmhandlers.S |   6 +-
  arch/powerpc/kvm/book3s_pr_papr.c   |  35 
  arch/powerpc/kvm/powerpc.c  |   3 +
  8 files changed, 350 insertions(+), 13 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index d86d831..593c62a 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3019,6 +3019,31 @@ Returns: 0 on success, -1 on error

  Queues an SMI on the thread's vcpu.

+4.97 KVM_CAP_PPC_MULTITCE
+
+Capability: KVM_CAP_PPC_MULTITCE
+Architectures: ppc
+Type: vm
+
+This capability means the kernel is capable of handling hypercalls
+H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
+space. This significantly accelerates DMA operations for PPC KVM guests.
+User space should expect that its handlers for these hypercalls
+are not going to be called if user space previously registered LIOBN
+in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+
+In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
+user space might have to advertise it for the guest. For example,
+an IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
+present in the "ibm,hypertas-functions" device-tree property.
+
+The hypercalls mentioned above may or may not be processed successfully
+in the kernel based fast path. If they can not be handled by the kernel,
+they will get passed on to user space. So user space still has to have
+an implementation for these despite the in kernel acceleration.
+
+This capability is always enabled.
+
  5. The kvm_run structure
  

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index fcde896..e5b968e 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -166,12 +166,24 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);

  extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce *args);
+extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
+   struct kvm_vcpu *vcpu, unsigned long liobn);
  extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
unsigned long ioba, unsigned long npages);
  extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt,
unsigned long tce);
+extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+   unsigned long *ua, unsigned long **prmap);
+extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
+   unsigned long idx, unsigned long tce);
  extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 unsigned long ioba, unsigned long tce);
+extern long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+   unsigned long liobn, unsigned long ioba,
+   unsigned long tce_list, unsigned long npages);
+extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+   unsigned long liobn, unsigned long ioba,
+   unsigned long tce_value, unsigned long npages);
  extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 unsigned long ioba);
  extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index e347856..d3fc732 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -14,6 +14,7 @@
   *
   * Copyright 2010 Paul Mackerras, IBM Corp. <pau...@au1.ibm.com>
   * Copyright 2011 David Gibson, IBM Corporation <d...@au1.ibm.com>
+ * Copyright 2013 Alexey Kardashevskiy, IBM Corporation <a...@au1.ibm.com>
   */

  #include 
@@ -37,8 +38,7 @@
  #include 
  #i

Re: [RFC PATCH 0/3] VFIO: capability chains

2015-12-17 Thread Alexey Kardashevskiy

On 11/24/2015 07:43 AM, Alex Williamson wrote:

Please see the commit log and comments in patch 1 for a general
explanation of the problems that this series tries to address.  The
general problem is that we have several cases where we want to expose
variable sized information to the user, whether it's sparse mmaps for
a region, as implemented here, or DMA mapping ranges of an IOMMU, or
reserved MSI mapping ranges, etc.  Extending data structures is hard;
extending them to report variable sized data is really hard.  After
considering several options, I think the best approach is to copy how
PCI does capabilities.  This allows the ioctl to only expose the
capabilities that are relevant for them, avoids data structures that
are too complicated to parse, and avoids creating a new ioctl each
time we think of something else that we'd like to report.  This method
also doesn't preclude extensions to the fixed structure since the
offset of these capabilities is entirely dynamic.

Comments welcome, I'll also follow-up to the QEMU and KVM lists with
an RFC making use of this for mmaps skipping over the MSI-X table.
Thanks,


Out of curiosity - could this information be exposed to userspace via
/sys/bus/pci/devices/:xx:xx:x/vfio_? It seems not to change after the
vfio_pci driver is bound to a device.






Alex

---

Alex Williamson (3):
   vfio: Define capability chains
   vfio: Define sparse mmap capability for regions
   vfio/pci: Include sparse mmap capability for MSI-X table regions


  drivers/vfio/pci/vfio_pci.c |  101 +++
  include/uapi/linux/vfio.h   |   53 ++-
  2 files changed, 152 insertions(+), 2 deletions(-)




--
Alexey
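
To make the capability-chain idea more concrete, here is a stand-alone sketch of how user space could walk such a chain. It only assumes a PCI-style header of the form { id, version, next-offset } chained by offsets inside the ioctl reply buffer; struct vfio_cap_header and the field names below are hypothetical stand-ins, not the series' actual uapi.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* hypothetical chained-capability header, modelled on PCI capabilities */
struct vfio_cap_header {
	uint16_t id;
	uint16_t version;
	uint32_t next;		/* offset of the next capability; 0 ends the chain */
};

/* walk a chain starting 'first' bytes into the reply buffer */
static void walk_caps(const void *buf, size_t bufsz, uint32_t first)
{
	uint32_t off = first;

	while (off && off + sizeof(struct vfio_cap_header) <= bufsz) {
		struct vfio_cap_header hdr;

		memcpy(&hdr, (const char *)buf + off, sizeof(hdr));
		printf("cap id %u version %u at offset %u\n", hdr.id, hdr.version, off);
		off = hdr.next;
	}
}

int main(void)
{
	/* fake reply buffer with two chained capabilities at offsets 16 and 32 */
	unsigned char buf[64] = { 0 };
	struct vfio_cap_header a = { .id = 1, .version = 1, .next = 32 };
	struct vfio_cap_header b = { .id = 2, .version = 1, .next = 0 };

	memcpy(buf + 16, &a, sizeof(a));
	memcpy(buf + 32, &b, sizeof(b));
	walk_caps(buf, sizeof(buf), 16);
	return 0;
}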


[PATCH kernel] vfio: Add explicit alignments in vfio_iommu_spapr_tce_create

2015-12-17 Thread Alexey Kardashevskiy
The vfio_iommu_spapr_tce_create struct has 4x32bit and 2x64bit fields,
which should have resulted in sizeof(struct vfio_iommu_spapr_tce_create)
being 32 bytes. However, due to gcc's default alignment, the actual size
of this struct is 40 bytes.

This fills gaps with __resv1/2 fields.

This should not cause any change in behavior.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 include/uapi/linux/vfio.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 9fd7b5d..d117233 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -568,8 +568,10 @@ struct vfio_iommu_spapr_tce_create {
__u32 flags;
/* in */
__u32 page_shift;
+   __u32 __resv1;
__u64 window_size;
__u32 levels;
+   __u32 __resv2;
/* out */
__u64 start_addr;
 };
-- 
2.5.0.rc3
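
(For illustration only, not part of the patch: the padding being made explicit here can be reproduced with a small host program that declares the two layouts by hand.)

#include <stdio.h>
#include <stdint.h>

struct tce_create_old {			/* layout before this patch */
	uint32_t argsz, flags;
	uint32_t page_shift;		/* gcc inserts 4 bytes of padding after this */
	uint64_t window_size;
	uint32_t levels;		/* and 4 more bytes of padding after this */
	uint64_t start_addr;
};

struct tce_create_new {			/* layout with explicit __resv1/__resv2 */
	uint32_t argsz, flags;
	uint32_t page_shift, __resv1;
	uint64_t window_size;
	uint32_t levels, __resv2;
	uint64_t start_addr;
};

int main(void)
{
	/* both print 40: the patch does not change the size, it only names the gaps */
	printf("old: %zu  new: %zu\n",
	       sizeof(struct tce_create_old), sizeof(struct tce_create_new));
	return 0;
}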



Re: [RFC PATCH 0/3] VFIO: capability chains

2015-12-17 Thread Alexey Kardashevskiy

On 12/18/2015 01:38 PM, Alex Williamson wrote:

On Fri, 2015-12-18 at 13:05 +1100, Alexey Kardashevskiy wrote:

On 11/24/2015 07:43 AM, Alex Williamson wrote:

Please see the commit log and comments in patch 1 for a general
explanation of the problems that this series tries to address.  The
general problem is that we have several cases where we want to
expose
variable sized information to the user, whether it's sparse mmaps
for
a region, as implemented here, or DMA mapping ranges of an IOMMU,
or
reserved MSI mapping ranges, etc.  Extending data structures is
hard;
extending them to report variable sized data is really hard.  After
considering several options, I think the best approach is to copy
how
PCI does capabilities.  This allows the ioctl to only expose the
capabilities that are relevant for them, avoids data structures
that
are too complicated to parse, and avoids creating a new ioctl each
time we think of something else that we'd like to report.  This
method
also doesn't preclude extensions to the fixed structure since the
offset of these capabilities is entirely dynamic.

Comments welcome, I'll also follow-up to the QEMU and KVM lists
with
an RFC making use of this for mmaps skipping over the MSI-X table.
Thanks,


Out of curiosity - could this information be exposed to the userspace
via
/sys/bus/pci/devices/:xx:xx:x/vfio_? It seems not to change
after
vfio_pci driver is bound to a device.


For what purpose?  vfio doesn't have a sysfs interface, why start one?
Thanks,


Well, it could simplify debugging a bit if this information were available
from userspace without writing a test tool that does some ioctl()s.
Not a big deal though...




--
Alexey


Re: [PATCH kernel 5/9] KVM: PPC: Account TCE-containing pages in locked_vm

2015-11-29 Thread Alexey Kardashevskiy

On 11/30/2015 01:06 PM, Paul Mackerras wrote:

On Tue, Sep 15, 2015 at 08:49:35PM +1000, Alexey Kardashevskiy wrote:

At the moment pages used for TCE tables (in addition to pages addressed
by TCEs) are not counted in locked_vm counter so a malicious userspace
tool can call ioctl(KVM_CREATE_SPAPR_TCE) as many times as RLIMIT_NOFILE allows and
lock a lot of memory.

This adds counting for pages used for TCE tables.

This counts the number of pages required for a table plus pages for
the kvmppc_spapr_tce_table struct (TCE table descriptor) itself.

This does not change the amount of (de)allocated memory.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
  arch/powerpc/kvm/book3s_64_vio.c | 51 +++-
  1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 9526c34..b70787d 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -45,13 +45,56 @@ static long kvmppc_stt_npages(unsigned long window_size)
 * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
  }

+static long kvmppc_account_memlimit(long npages, bool inc)
+{
+   long ret = 0;
+   const long bytes = sizeof(struct kvmppc_spapr_tce_table) +
+   (abs(npages) * sizeof(struct page *));


Why abs(npages)?  Can npages be negative?  If so, what does that mean?



Leftover from older versions when there was one shared
account_memlimit(long npages). It does not make sense here; I need to
remove it.



--
Alexey

[PATCH kernel 9/9] KVM: PPC: Add support for multiple-TCE hcalls

2015-09-15 Thread Alexey Kardashevskiy
This adds real and virtual mode handlers for the H_PUT_TCE_INDIRECT and
H_STUFF_TCE hypercalls for user space emulated devices such as IBMVIO
devices or emulated PCI.  These calls allow adding multiple entries
(up to 512) into the TCE table in one call which saves time on
transition between kernel and user space.

This implements the KVM_CAP_PPC_MULTITCE capability. When present,
the kernel will try handling H_PUT_TCE_INDIRECT and H_STUFF_TCE.
If they can not be handled by the kernel, they are passed on to
the user space. The user space still has to have an implementation
for these.

Both HV and PR-style KVM are supported.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 Documentation/virtual/kvm/api.txt   |  25 ++
 arch/powerpc/include/asm/kvm_ppc.h  |  12 +++
 arch/powerpc/kvm/book3s_64_vio.c| 111 +++-
 arch/powerpc/kvm/book3s_64_vio_hv.c | 145 ++--
 arch/powerpc/kvm/book3s_hv.c|  26 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |   6 +-
 arch/powerpc/kvm/book3s_pr_papr.c   |  35 
 arch/powerpc/kvm/powerpc.c  |   3 +
 8 files changed, 350 insertions(+), 13 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index d86d831..593c62a 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3019,6 +3019,31 @@ Returns: 0 on success, -1 on error
 
 Queues an SMI on the thread's vcpu.
 
+4.97 KVM_CAP_PPC_MULTITCE
+
+Capability: KVM_CAP_PPC_MULTITCE
+Architectures: ppc
+Type: vm
+
+This capability means the kernel is capable of handling hypercalls
+H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
+space. This significantly accelerates DMA operations for PPC KVM guests.
+User space should expect that its handlers for these hypercalls
+are not going to be called if user space previously registered LIOBN
+in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+
+In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
+user space might have to advertise it for the guest. For example,
+an IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
+present in the "ibm,hypertas-functions" device-tree property.
+
+The hypercalls mentioned above may or may not be processed successfully
+in the kernel based fast path. If they can not be handled by the kernel,
+they will get passed on to user space. So user space still has to have
+an implementation for these despite the in kernel acceleration.
+
+This capability is always enabled.
+
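
(As an illustration, not part of the patch: a minimal user-space sketch of probing this capability through the standard KVM_CHECK_EXTENSION ioctl; a VMM such as QEMU would typically do this before advertising "hcall-multi-tce" to the guest.)

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);

	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}
	/* > 0 means the kernel can handle H_PUT_TCE_INDIRECT/H_STUFF_TCE itself */
	printf("KVM_CAP_PPC_MULTITCE: %d\n",
	       ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_MULTITCE));
	return 0;
}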
 5. The kvm_run structure
 
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index fcde896..e5b968e 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -166,12 +166,24 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce *args);
+extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
+   struct kvm_vcpu *vcpu, unsigned long liobn);
 extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
unsigned long ioba, unsigned long npages);
 extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt,
unsigned long tce);
+extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+   unsigned long *ua, unsigned long **prmap);
+extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
+   unsigned long idx, unsigned long tce);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 unsigned long ioba, unsigned long tce);
+extern long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+   unsigned long liobn, unsigned long ioba,
+   unsigned long tce_list, unsigned long npages);
+extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+   unsigned long liobn, unsigned long ioba,
+   unsigned long tce_value, unsigned long npages);
 extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 unsigned long ioba);
 extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index e347856..d3fc732 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -14,6 +14,7 @@
  *
  * Copyright 2010 Paul Mackerras, IBM Corp. <pau...@au1.ibm.com>
  * Copyright 2011 David Gibson, IBM Corporation <d...@au1.ibm.com>
+ * Copyright 2013 Alexey Kardashevskiy, IBM Corporation <a...@au1.ibm.com>
  */
 
 #include 
@@ -37,8 +38,7 @@
 #include 
 #include 
 #include 
-
-#define TCES_PER_PAGE  (PAGE_SIZE / sizeof(u64))
+#include 
 
 static long kvmppc_stt_npages(unsigned lon

[PATCH kernel 4/9] KVM: PPC: Use RCU for arch.spapr_tce_tables

2015-09-15 Thread Alexey Kardashevskiy
At the moment spapr_tce_tables is not protected against races. This makes
use of the RCU variants of the list helpers. As some bits are executed in
real mode, this makes use of the just-introduced
list_for_each_entry_rcu_notrace().

This converts release_spapr_tce_table() to an RCU-scheduled handler.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 arch/powerpc/include/asm/kvm_host.h |  1 +
 arch/powerpc/kvm/book3s.c   |  2 +-
 arch/powerpc/kvm/book3s_64_vio.c| 20 +++-
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 98eebbf6..e19d412 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -178,6 +178,7 @@ struct kvmppc_spapr_tce_table {
struct kvm *kvm;
u64 liobn;
u32 window_size;
+   struct rcu_head rcu;
struct page *pages[0];
 };
 
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 53285d5..3418f7c 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -806,7 +806,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 {
 
 #ifdef CONFIG_PPC64
-   INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+   INIT_LIST_HEAD_RCU(&kvm->arch.spapr_tce_tables);
INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 54cf9bc..9526c34 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -45,19 +45,16 @@ static long kvmppc_stt_npages(unsigned long window_size)
 * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
 }
 
-static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+static void release_spapr_tce_table(struct rcu_head *head)
 {
-   struct kvm *kvm = stt->kvm;
+   struct kvmppc_spapr_tce_table *stt = container_of(head,
+   struct kvmppc_spapr_tce_table, rcu);
int i;
 
-   mutex_lock(&kvm->lock);
-   list_del(&stt->list);
for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
__free_page(stt->pages[i]);
+
kfree(stt);
-   mutex_unlock(&kvm->lock);
-
-   kvm_put_kvm(kvm);
 }
 
 static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault 
*vmf)
@@ -88,7 +85,12 @@ static int kvm_spapr_tce_release(struct inode *inode, struct 
file *filp)
 {
struct kvmppc_spapr_tce_table *stt = filp->private_data;
 
-   release_spapr_tce_table(stt);
+   list_del_rcu(&stt->list);
+
+   kvm_put_kvm(stt->kvm);
+
+   call_rcu(&stt->rcu, release_spapr_tce_table);
+
return 0;
 }
 
@@ -131,7 +133,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
kvm_get_kvm(kvm);
 
mutex_lock(&kvm->lock);
-   list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+   list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
 
mutex_unlock(&kvm->lock);
 
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel 5/9] KVM: PPC: Account TCE-containing pages in locked_vm

2015-09-15 Thread Alexey Kardashevskiy
At the moment pages used for TCE tables (in addition to pages addressed
by TCEs) are not counted in locked_vm counter so a malicious userspace
tool can call ioctl(KVM_CREATE_SPAPR_TCE) as many times as RLIMIT_NOFILE allows and
lock a lot of memory.

This adds counting for pages used for TCE tables.

This counts the number of pages required for a table plus pages for
the kvmppc_spapr_tce_table struct (TCE table descriptor) itself.

This does not change the amount of (de)allocated memory.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 arch/powerpc/kvm/book3s_64_vio.c | 51 +++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 9526c34..b70787d 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -45,13 +45,56 @@ static long kvmppc_stt_npages(unsigned long window_size)
 * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
 }
 
+static long kvmppc_account_memlimit(long npages, bool inc)
+{
+   long ret = 0;
+   const long bytes = sizeof(struct kvmppc_spapr_tce_table) +
+   (abs(npages) * sizeof(struct page *));
+   const long stt_pages = ALIGN(bytes, PAGE_SIZE) / PAGE_SIZE;
+
+   if (!current || !current->mm)
+   return ret; /* process exited */
+
+   npages += stt_pages;
+
+   down_write(&current->mm->mmap_sem);
+
+   if (inc) {
+   long locked, lock_limit;
+
+   locked = current->mm->locked_vm + npages;
+   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+   if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+   ret = -ENOMEM;
+   else
+   current->mm->locked_vm += npages;
+   } else {
+   if (npages > current->mm->locked_vm)
+   npages = current->mm->locked_vm;
+
+   current->mm->locked_vm -= npages;
+   }
+
+   pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
+   inc ? '+' : '-',
+   npages << PAGE_SHIFT,
+   current->mm->locked_vm << PAGE_SHIFT,
+   rlimit(RLIMIT_MEMLOCK),
+   ret ? " - exceeded" : "");
+
+   up_write(&current->mm->mmap_sem);
+
+   return ret;
+}
+
 static void release_spapr_tce_table(struct rcu_head *head)
 {
struct kvmppc_spapr_tce_table *stt = container_of(head,
struct kvmppc_spapr_tce_table, rcu);
int i;
+   long npages = kvmppc_stt_npages(stt->window_size);
 
-   for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+   for (i = 0; i < npages; i++)
__free_page(stt->pages[i]);
 
kfree(stt);
@@ -89,6 +132,7 @@ static int kvm_spapr_tce_release(struct inode *inode, struct 
file *filp)
 
kvm_put_kvm(stt->kvm);
 
+   kvmppc_account_memlimit(kvmppc_stt_npages(stt->window_size), false);
+   call_rcu(&stt->rcu, release_spapr_tce_table);
 
return 0;
@@ -114,6 +158,11 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
}
 
npages = kvmppc_stt_npages(args->window_size);
+   ret = kvmppc_account_memlimit(npages, true);
+   if (ret) {
+   stt = NULL;
+   goto fail;
+   }
 
stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
  GFP_KERNEL);
-- 
2.4.0.rc3.8.gfb3e7d5
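
(A rough feel for the numbers being accounted, as a stand-alone sketch rather than patch code; it mirrors kvmppc_stt_npages() above and assumes the 64KB page size typical of ppc64 server kernels.)

#include <stdio.h>

#define IOMMU_PAGE_SHIFT_4K	12
#define HOST_PAGE_SIZE		(64UL * 1024)	/* assumed ppc64 PAGE_SIZE */

/* pages needed for the TCE table backing a DMA window of window_size bytes */
static unsigned long stt_npages(unsigned long window_size)
{
	unsigned long bytes = (window_size >> IOMMU_PAGE_SHIFT_4K) * 8; /* one u64 per TCE */

	return (bytes + HOST_PAGE_SIZE - 1) / HOST_PAGE_SIZE;
}

int main(void)
{
	unsigned long window = 1UL << 30;	/* a 1GB DMA window */

	/* 262144 TCEs * 8 bytes = 2MB of table = 32 x 64KB pages,
	 * plus roughly one more page for the descriptor struct itself */
	printf("pages charged to locked_vm: %lu (plus ~1 for the descriptor)\n",
	       stt_npages(window));
	return 0;
}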



[PATCH kernel 6/9] KVM: PPC: Replace SPAPR_TCE_SHIFT with IOMMU_PAGE_SHIFT_4K

2015-09-15 Thread Alexey Kardashevskiy
SPAPR_TCE_SHIFT is used in only a few places, and since IOMMU_PAGE_SHIFT_4K
can easily be used instead, remove SPAPR_TCE_SHIFT.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 2 --
 arch/powerpc/kvm/book3s_64_vio.c | 3 ++-
 arch/powerpc/kvm/book3s_64_vio_hv.c  | 4 ++--
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 2aa79c8..7529aab 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -33,8 +33,6 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu 
*svcpu)
 }
 #endif
 
-#define SPAPR_TCE_SHIFT12
-
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 #define KVM_DEFAULT_HPT_ORDER  24  /* 16MB HPT by default */
 #endif
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index b70787d..e347856 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -36,12 +36,13 @@
 #include 
 #include 
 #include 
+#include 
 
 #define TCES_PER_PAGE  (PAGE_SIZE / sizeof(u64))
 
 static long kvmppc_stt_npages(unsigned long window_size)
 {
-   return ALIGN((window_size >> SPAPR_TCE_SHIFT)
+   return ALIGN((window_size >> IOMMU_PAGE_SHIFT_4K)
 * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
 }
 
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 8ae12ac..6cf1ab3 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -99,7 +99,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long 
liobn,
if (ret)
return ret;
 
-   idx = ioba >> SPAPR_TCE_SHIFT;
+   idx = ioba >> IOMMU_PAGE_SHIFT_4K;
page = stt->pages[idx / TCES_PER_PAGE];
tbl = (u64 *)page_address(page);
 
@@ -121,7 +121,7 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long 
liobn,
if (stt) {
ret = kvmppc_ioba_validate(stt, ioba, 1);
if (!ret) {
-   unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
+   unsigned long idx = ioba >> IOMMU_PAGE_SHIFT_4K;
struct page *page = stt->pages[idx / TCES_PER_PAGE];
u64 *tbl = (u64 *)page_address(page);
 
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel 8/9] KVM: Fix KVM_SMI chapter number

2015-09-15 Thread Alexey Kardashevskiy
The KVM_SMI capability follows the KVM_S390_SET_IRQ_STATE capability,
which is "4.95", so this changes the number of the KVM_SMI chapter to 4.96.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 Documentation/virtual/kvm/api.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index d9eccee..d86d831 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3009,7 +3009,7 @@ len must be a multiple of sizeof(struct kvm_s390_irq). It 
must be > 0
 and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq),
 which is the maximum number of possibly pending cpu-local interrupts.
 
-4.90 KVM_SMI
+4.96 KVM_SMI
 
 Capability: KVM_CAP_X86_SMM
 Architectures: x86
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel 9/9] KVM: PPC: Add support for multiple-TCE hcalls

2015-09-15 Thread Alexey Kardashevskiy
This adds real and virtual mode handlers for the H_PUT_TCE_INDIRECT and
H_STUFF_TCE hypercalls for user space emulated devices such as IBMVIO
devices or emulated PCI.  These calls allow adding multiple entries
(up to 512) into the TCE table in one call which saves time on
transition between kernel and user space.

This implements the KVM_CAP_PPC_MULTITCE capability. When present,
the kernel will try handling H_PUT_TCE_INDIRECT and H_STUFF_TCE.
If they can not be handled by the kernel, they are passed on to
the user space. The user space still has to have an implementation
for these.

Both HV and PR-syle KVM are supported.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 Documentation/virtual/kvm/api.txt   |  25 ++
 arch/powerpc/include/asm/kvm_ppc.h  |  12 +++
 arch/powerpc/kvm/book3s_64_vio.c| 111 +++-
 arch/powerpc/kvm/book3s_64_vio_hv.c | 145 ++--
 arch/powerpc/kvm/book3s_hv.c|  26 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |   6 +-
 arch/powerpc/kvm/book3s_pr_papr.c   |  35 
 arch/powerpc/kvm/powerpc.c  |   3 +
 8 files changed, 350 insertions(+), 13 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index d86d831..593c62a 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3019,6 +3019,31 @@ Returns: 0 on success, -1 on error
 
 Queues an SMI on the thread's vcpu.
 
+4.97 KVM_CAP_PPC_MULTITCE
+
+Capability: KVM_CAP_PPC_MULTITCE
+Architectures: ppc
+Type: vm
+
+This capability means the kernel is capable of handling hypercalls
+H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
+space. This significantly accelerates DMA operations for PPC KVM guests.
+User space should expect that its handlers for these hypercalls
+are not going to be called if user space previously registered LIOBN
+in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+
+In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
+user space might have to advertise it for the guest. For example,
+IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
+present in the "ibm,hypertas-functions" device-tree property.
+
+The hypercalls mentioned above may or may not be processed successfully
+in the kernel based fast path. If they can not be handled by the kernel,
+they will get passed on to user space. So user space still has to have
+an implementation for these despite the in kernel acceleration.
+
+This capability is always enabled.
+
 5. The kvm_run structure
 
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index fcde896..e5b968e 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -166,12 +166,24 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce *args);
+extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
+   struct kvm_vcpu *vcpu, unsigned long liobn);
 extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
unsigned long ioba, unsigned long npages);
 extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt,
unsigned long tce);
+extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+   unsigned long *ua, unsigned long **prmap);
+extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
+   unsigned long idx, unsigned long tce);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 unsigned long ioba, unsigned long tce);
+extern long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+   unsigned long liobn, unsigned long ioba,
+   unsigned long tce_list, unsigned long npages);
+extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+   unsigned long liobn, unsigned long ioba,
+   unsigned long tce_value, unsigned long npages);
 extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 unsigned long ioba);
 extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index e347856..d3fc732 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -14,6 +14,7 @@
  *
  * Copyright 2010 Paul Mackerras, IBM Corp. <pau...@au1.ibm.com>
  * Copyright 2011 David Gibson, IBM Corporation <d...@au1.ibm.com>
+ * Copyright 2013 Alexey Kardashevskiy, IBM Corporation <a...@au1.ibm.com>
  */
 
 #include 
@@ -37,8 +38,7 @@
 #include 
 #include 
 #include 
-
-#define TCES_PER_PAGE  (PAGE_SIZE / sizeof(u64))
+#include 
 
 static long kvmppc_stt_npages(unsigned lon

[PATCH kernel 8/9] KVM: Fix KVM_SMI chapter number

2015-09-15 Thread Alexey Kardashevskiy
The KVM_SMI capability is following the KVM_S390_SET_IRQ_STATE capability
which is "4.95", this changes the number of the KVM_SMI chapter to 4.96.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 Documentation/virtual/kvm/api.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index d9eccee..d86d831 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3009,7 +3009,7 @@ len must be a multiple of sizeof(struct kvm_s390_irq). It 
must be > 0
 and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq),
 which is the maximum number of possibly pending cpu-local interrupts.
 
-4.90 KVM_SMI
+4.96 KVM_SMI
 
 Capability: KVM_CAP_X86_SMM
 Architectures: x86
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel 4/9] KVM: PPC: Use RCU for arch.spapr_tce_tables

2015-09-15 Thread Alexey Kardashevskiy
At the moment spapr_tce_tables is not protected against races. This makes
use of RCU-variants of list helpers. As some bits are executed in real
mode, this makes use of just introduced list_for_each_entry_rcu_notrace().

This converts release_spapr_tce_table() to a RCU scheduled handler.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 arch/powerpc/include/asm/kvm_host.h |  1 +
 arch/powerpc/kvm/book3s.c   |  2 +-
 arch/powerpc/kvm/book3s_64_vio.c| 20 +++-
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 98eebbf6..e19d412 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -178,6 +178,7 @@ struct kvmppc_spapr_tce_table {
struct kvm *kvm;
u64 liobn;
u32 window_size;
+   struct rcu_head rcu;
struct page *pages[0];
 };
 
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 53285d5..3418f7c 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -806,7 +806,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 {
 
 #ifdef CONFIG_PPC64
-   INIT_LIST_HEAD(>arch.spapr_tce_tables);
+   INIT_LIST_HEAD_RCU(>arch.spapr_tce_tables);
INIT_LIST_HEAD(>arch.rtas_tokens);
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 54cf9bc..9526c34 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -45,19 +45,16 @@ static long kvmppc_stt_npages(unsigned long window_size)
 * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
 }
 
-static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+static void release_spapr_tce_table(struct rcu_head *head)
 {
-   struct kvm *kvm = stt->kvm;
+   struct kvmppc_spapr_tce_table *stt = container_of(head,
+   struct kvmppc_spapr_tce_table, rcu);
int i;
 
-   mutex_lock(>lock);
-   list_del(>list);
for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
__free_page(stt->pages[i]);
+
kfree(stt);
-   mutex_unlock(>lock);
-
-   kvm_put_kvm(kvm);
 }
 
 static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault 
*vmf)
@@ -88,7 +85,12 @@ static int kvm_spapr_tce_release(struct inode *inode, struct 
file *filp)
 {
struct kvmppc_spapr_tce_table *stt = filp->private_data;
 
-   release_spapr_tce_table(stt);
+   list_del_rcu(>list);
+
+   kvm_put_kvm(stt->kvm);
+
+   call_rcu(>rcu, release_spapr_tce_table);
+
return 0;
 }
 
@@ -131,7 +133,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
kvm_get_kvm(kvm);
 
mutex_lock(>lock);
-   list_add(>list, >arch.spapr_tce_tables);
+   list_add_rcu(>list, >arch.spapr_tce_tables);
 
mutex_unlock(>lock);
 
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel 5/9] KVM: PPC: Account TCE-containing pages in locked_vm

2015-09-15 Thread Alexey Kardashevskiy
At the moment pages used for TCE tables (in addition to pages addressed
by TCEs) are not counted in locked_vm counter so a malicious userspace
tool can call ioctl(KVM_CREATE_SPAPR_TCE) as many times as RLIMIT_NOFILE and
lock a lot of memory.

This adds counting for pages used for TCE tables.

This counts the number of pages required for a table plus pages for
the kvmppc_spapr_tce_table struct (TCE table descriptor) itself.

This does not change the amount of (de)allocated memory.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 arch/powerpc/kvm/book3s_64_vio.c | 51 +++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 9526c34..b70787d 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -45,13 +45,56 @@ static long kvmppc_stt_npages(unsigned long window_size)
 * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
 }
 
+static long kvmppc_account_memlimit(long npages, bool inc)
+{
+   long ret = 0;
+   const long bytes = sizeof(struct kvmppc_spapr_tce_table) +
+   (abs(npages) * sizeof(struct page *));
+   const long stt_pages = ALIGN(bytes, PAGE_SIZE) / PAGE_SIZE;
+
+   if (!current || !current->mm)
+   return ret; /* process exited */
+
+   npages += stt_pages;
+
+   down_write(>mm->mmap_sem);
+
+   if (inc) {
+   long locked, lock_limit;
+
+   locked = current->mm->locked_vm + npages;
+   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+   if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+   ret = -ENOMEM;
+   else
+   current->mm->locked_vm += npages;
+   } else {
+   if (npages > current->mm->locked_vm)
+   npages = current->mm->locked_vm;
+
+   current->mm->locked_vm -= npages;
+   }
+
+   pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
+   inc ? '+' : '-',
+   npages << PAGE_SHIFT,
+   current->mm->locked_vm << PAGE_SHIFT,
+   rlimit(RLIMIT_MEMLOCK),
+   ret ? " - exceeded" : "");
+
+   up_write(>mm->mmap_sem);
+
+   return ret;
+}
+
 static void release_spapr_tce_table(struct rcu_head *head)
 {
struct kvmppc_spapr_tce_table *stt = container_of(head,
struct kvmppc_spapr_tce_table, rcu);
int i;
+   long npages = kvmppc_stt_npages(stt->window_size);
 
-   for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+   for (i = 0; i < npages; i++)
__free_page(stt->pages[i]);
 
kfree(stt);
@@ -89,6 +132,7 @@ static int kvm_spapr_tce_release(struct inode *inode, struct 
file *filp)
 
kvm_put_kvm(stt->kvm);
 
+   kvmppc_account_memlimit(kvmppc_stt_npages(stt->window_size), false);
call_rcu(>rcu, release_spapr_tce_table);
 
return 0;
@@ -114,6 +158,11 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
}
 
npages = kvmppc_stt_npages(args->window_size);
+   ret = kvmppc_account_memlimit(npages, true);
+   if (ret) {
+   stt = NULL;
+   goto fail;
+   }
 
stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
  GFP_KERNEL);
-- 
2.4.0.rc3.8.gfb3e7d5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH kernel 6/9] KVM: PPC: Replace SPAPR_TCE_SHIFT with IOMMU_PAGE_SHIFT_4K

2015-09-15 Thread Alexey Kardashevskiy
SPAPR_TCE_SHIFT is used in few places only and since IOMMU_PAGE_SHIFT_4K
can be easily used instead, remove SPAPR_TCE_SHIFT.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 2 --
 arch/powerpc/kvm/book3s_64_vio.c | 3 ++-
 arch/powerpc/kvm/book3s_64_vio_hv.c  | 4 ++--
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 2aa79c8..7529aab 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -33,8 +33,6 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu 
*svcpu)
 }
 #endif
 
-#define SPAPR_TCE_SHIFT12
-
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 #define KVM_DEFAULT_HPT_ORDER  24  /* 16MB HPT by default */
 #endif
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index b70787d..e347856 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -36,12 +36,13 @@
 #include 
 #include 
 #include 
+#include 
 
 #define TCES_PER_PAGE  (PAGE_SIZE / sizeof(u64))
 
 static long kvmppc_stt_npages(unsigned long window_size)
 {
-   return ALIGN((window_size >> SPAPR_TCE_SHIFT)
+   return ALIGN((window_size >> IOMMU_PAGE_SHIFT_4K)
 * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
 }
 
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 8ae12ac..6cf1ab3 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -99,7 +99,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long 
liobn,
if (ret)
return ret;
 
-   idx = ioba >> SPAPR_TCE_SHIFT;
+   idx = ioba >> IOMMU_PAGE_SHIFT_4K;
page = stt->pages[idx / TCES_PER_PAGE];
tbl = (u64 *)page_address(page);
 
@@ -121,7 +121,7 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long 
liobn,
if (stt) {
ret = kvmppc_ioba_validate(stt, ioba, 1);
if (!ret) {
-   unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
+   unsigned long idx = ioba >> IOMMU_PAGE_SHIFT_4K;
struct page *page = stt->pages[idx / TCES_PER_PAGE];
u64 *tbl = (u64 *)page_address(page);
 
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel 7/9] KVM: PPC: Move reusable bits of H_PUT_TCE handler to helpers

2015-09-15 Thread Alexey Kardashevskiy
Upcoming multi-tce support (H_PUT_TCE_INDIRECT/H_STUFF_TCE hypercalls)
will validate TCE (not to have unexpected bits) and IO address
(to be within the DMA window boundaries).

This introduces helpers to validate TCE and IO address.
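
As an aside for readers, a hedged sketch (not code from this series) of how the
two helpers are expected to compose in a later multi-TCE handler: validate the
whole IO range once, validate every TCE, and only then write the entries with
kvmppc_tce_put() which cannot fail:

    /* Illustrative only; error handling and flow are an assumption, not the
     * actual H_PUT_TCE_INDIRECT implementation added later in the series. */
    static long example_put_tce_range(struct kvmppc_spapr_tce_table *stt,
                    unsigned long ioba, u64 *tces, unsigned long npages)
    {
            unsigned long i, entry = ioba >> IOMMU_PAGE_SHIFT_4K;
            long ret;

            ret = kvmppc_ioba_validate(stt, ioba, npages);  /* window bounds */
            if (ret != H_SUCCESS)
                    return ret;

            for (i = 0; i < npages; ++i) {
                    ret = kvmppc_tce_validate(stt, tces[i]); /* flags/page mask */
                    if (ret != H_SUCCESS)
                            return ret;
            }

            for (i = 0; i < npages; ++i)
                    kvmppc_tce_put(stt, entry + i, tces[i]); /* cannot fail */

            return H_SUCCESS;
    }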

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 arch/powerpc/include/asm/kvm_ppc.h  |  4 ++
 arch/powerpc/kvm/book3s_64_vio_hv.c | 89 -
 2 files changed, 83 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index c6ef05b..fcde896 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -166,6 +166,10 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce *args);
+extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+   unsigned long ioba, unsigned long npages);
+extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt,
+   unsigned long tce);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 unsigned long ioba, unsigned long tce);
 extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 6cf1ab3..f0fd84c 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define TCES_PER_PAGE  (PAGE_SIZE / sizeof(u64))
 
@@ -64,7 +65,7 @@ static struct kvmppc_spapr_tce_table 
*kvmppc_find_table(struct kvm_vcpu *vcpu,
  * WARNING: This will be called in real-mode on HV KVM and virtual
  *  mode on PR KVM
  */
-static long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
unsigned long ioba, unsigned long npages)
 {
unsigned long mask = (1ULL << IOMMU_PAGE_SHIFT_4K) - 1;
@@ -76,6 +77,79 @@ static long kvmppc_ioba_validate(struct 
kvmppc_spapr_tce_table *stt,
 
return H_SUCCESS;
 }
+EXPORT_SYMBOL_GPL(kvmppc_ioba_validate);
+
+/*
+ * Validates TCE address.
+ * At the moment flags and page mask are validated.
+ * As the host kernel does not access those addresses (just puts them
+ * to the table and user space is supposed to process them), we can skip
+ * checking other things (such as TCE is a guest RAM address or the page
+ * was actually allocated).
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *  mode on PR KVM
+ */
+long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
+{
+   unsigned long mask = ((1ULL << IOMMU_PAGE_SHIFT_4K) - 1) &
+   ~(TCE_PCI_WRITE | TCE_PCI_READ);
+
+   if (tce & mask)
+   return H_PARAMETER;
+
+   return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_tce_validate);
+
+/* Note on the use of page_address() in real mode,
+ *
+ * It is safe to use page_address() in real mode on ppc64 because
+ * page_address() is always defined as lowmem_page_address()
+ * which returns __va(PFN_PHYS(page_to_pfn(page))) which is arithmetial
+ * operation and does not access page struct.
+ *
+ * Theoretically page_address() could be defined different
+ * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL
+ * should be enabled.
+ * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64,
+ * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only
+ * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP
+ * is not expected to be enabled on ppc32, page_address()
+ * is safe for ppc32 as well.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *  mode on PR KVM
+ */
+static u64 *kvmppc_page_address(struct page *page)
+{
+#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL)
+#error TODO: fix to avoid page_address() here
+#endif
+   return (u64 *) page_address(page);
+}
+
+/*
+ * Handles TCE requests for emulated devices.
+ * Puts guest TCE values to the table and expects user space to convert them.
+ * Called in both real and virtual modes.
+ * Cannot fail so kvmppc_tce_validate must be called before it.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *  mode on PR KVM
+ */
+void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
+   unsigned long idx, unsigned long tce)
+{
+   struct page *page;
+   u64 *tbl;
+
+   page = stt->pages[idx / TCES_PER_PAGE];
+   tbl = kvmppc_page_address(page);
+
+   tbl[idx % TCES_PER_PAGE] = tce;
+}
+EXPORT_SYMBOL_GPL(kvmppc_tce_put);
 
 /* WARNING: This will be called in real-mode on HV KVM and virtual
  *  mode on PR KVM
@@ -85,9 +159,6 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long 
liobn,
 {
struc

[PATCH kernel 0/9] KVM: PPC: Add in-kernel multitce handling

2015-09-15 Thread Alexey Kardashevskiy
These patches enable in-kernel acceleration for the H_PUT_TCE_INDIRECT and
H_STUFF_TCE hypercalls which allow updating multiple (up to 512) TCE entries
in a single call, saving time on context switching. QEMU already
supports these hypercalls so this is just an optimization.
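
For a sense of scale (a rough illustration, not taken from the patches), the
512 figure is one 4K page of 8-byte TCEs, the same constant the code calls
TCES_PER_PAGE; batching turns thousands of hypercalls into a handful:

    #include <stdio.h>

    int main(void)
    {
            unsigned long ram = 1UL << 30;          /* map 1GB of guest RAM */
            unsigned long tces = ram >> 12;         /* 4K IOMMU pages: 262144 TCEs */
            unsigned long per_call = 4096 / 8;      /* 512 TCEs per H_PUT_TCE_INDIRECT */

            printf("H_PUT_TCE calls:          %lu\n", tces);            /* 262144 */
            printf("H_PUT_TCE_INDIRECT calls: %lu\n", tces / per_call); /*    512 */
            return 0;
    }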

Both HV and PR KVM modes are supported.

This does not affect VFIO, this support is coming next.

Please comment. Thanks.


Alexey Kardashevskiy (9):
  rcu: Define notrace version of list_for_each_entry_rcu
  KVM: PPC: Make real_vmalloc_addr() public
  KVM: PPC: Rework H_PUT_TCE/H_GET_TCE handlers
  KVM: PPC: Use RCU for arch.spapr_tce_tables
  KVM: PPC: Account TCE-containing pages in locked_vm
  KVM: PPC: Replace SPAPR_TCE_SHIFT with IOMMU_PAGE_SHIFT_4K
  KVM: PPC: Move reusable bits of H_PUT_TCE handler to helpers
  KVM: Fix KVM_SMI chapter number
  KVM: PPC: Add support for multiple-TCE hcalls

 Documentation/virtual/kvm/api.txt|  27 ++-
 arch/powerpc/include/asm/kvm_book3s_64.h |   2 -
 arch/powerpc/include/asm/kvm_host.h  |   1 +
 arch/powerpc/include/asm/kvm_ppc.h   |  16 ++
 arch/powerpc/include/asm/mmu-hash64.h|   3 +
 arch/powerpc/kvm/book3s.c|   2 +-
 arch/powerpc/kvm/book3s_64_vio.c | 185 --
 arch/powerpc/kvm/book3s_64_vio_hv.c  | 310 +++
 arch/powerpc/kvm/book3s_hv.c |  26 ++-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  |  17 --
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |   6 +-
 arch/powerpc/kvm/book3s_pr_papr.c|  35 
 arch/powerpc/kvm/powerpc.c   |   3 +
 arch/powerpc/mm/hash_utils_64.c  |  17 ++
 include/linux/rculist.h  |  38 
 15 files changed, 610 insertions(+), 78 deletions(-)

-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel 2/9] KVM: PPC: Make real_vmalloc_addr() public

2015-09-15 Thread Alexey Kardashevskiy
This helper translates vmalloc'd addresses to linear addresses.
It is only used by the KVM MMU code now and resides in the HV KVM code.
We will also need it in the TCE code and in the DMA memory preregistration
code called in real mode.

This makes real_vmalloc_addr() public and moves it to the powerpc code as
it does not do anything special for KVM.
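
A minimal, hypothetical usage sketch: the reason the helper becomes public is
that real-mode code cannot dereference a vmalloc'd pointer directly (the MMU is
off) and must translate it to its linear-map alias first:

    /* Hedged sketch, not from this patch. */
    static long example_read_in_real_mode(unsigned long *vmalloced, unsigned long *val)
    {
            unsigned long *p = real_vmalloc_addr(vmalloced);

            if (!p)
                    return H_TOO_HARD;      /* punt to virtual mode / userspace */

            *val = *p;                      /* linear-map alias, safe with MMU off */
            return H_SUCCESS;
    }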

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 arch/powerpc/include/asm/mmu-hash64.h |  3 +++
 arch/powerpc/kvm/book3s_hv_rm_mmu.c   | 17 -
 arch/powerpc/mm/hash_utils_64.c   | 17 +
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu-hash64.h 
b/arch/powerpc/include/asm/mmu-hash64.h
index a82f534..fd06b73 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -606,6 +606,9 @@ static inline unsigned long get_kernel_vsid(unsigned long 
ea, int ssize)
context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1;
return get_vsid(context, ea, ssize);
 }
+
+void *real_vmalloc_addr(void *x);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_MMU_HASH64_H_ */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index c1df9bb..987b7d1 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -22,23 +22,6 @@
 #include 
 #include 
 
-/* Translate address of a vmalloc'd thing to a linear map address */
-static void *real_vmalloc_addr(void *x)
-{
-   unsigned long addr = (unsigned long) x;
-   pte_t *p;
-   /*
-* assume we don't have huge pages in vmalloc space...
-* So don't worry about THP collapse/split. Called
-* Only in realmode, hence won't need irq_save/restore.
-*/
-   p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
-   if (!p || !pte_present(*p))
-   return NULL;
-   addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
-   return __va(addr);
-}
-
 /* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
 static int global_invalidates(struct kvm *kvm, unsigned long flags)
 {
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 5ec987f..9737d6a 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1556,3 +1556,20 @@ void setup_initial_memory_limit(phys_addr_t 
first_memblock_base,
/* Finally limit subsequent allocations */
memblock_set_current_limit(ppc64_rma_size);
 }
+
+/* Translate address of a vmalloc'd thing to a linear map address */
+void *real_vmalloc_addr(void *x)
+{
+   unsigned long addr = (unsigned long) x;
+   pte_t *p;
+   /*
+* assume we don't have huge pages in vmalloc space...
+* So don't worry about THP collapse/split. Called
+* Only in realmode, hence won't need irq_save/restore.
+*/
+   p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
+   if (!p || !pte_present(*p))
+   return NULL;
+   addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
+   return __va(addr);
+}
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel 1/9] rcu: Define notrace version of list_for_each_entry_rcu

2015-09-15 Thread Alexey Kardashevskiy
This defines list_for_each_entry_rcu_notrace and list_entry_rcu_notrace
which use rcu_dereference_raw_notrace instead of rcu_dereference_raw.
This allows using list_for_each_entry_rcu_notrace in real mode (MMU is off).
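
A minimal usage sketch (list and fields are made up) of the new macro, the same
pattern the TCE code will use to look up a table while the MMU is off:

    struct foo {
            unsigned long id;
            struct list_head list;
    };

    /* Safe from real mode: the _notrace variant avoids the ftrace and
     * RCU-debug hooks of list_for_each_entry_rcu(). */
    static struct foo *find_foo_realmode(struct list_head *head, unsigned long id)
    {
            struct foo *f;

            list_for_each_entry_rcu_notrace(f, head, list)
                    if (f->id == id)
                            return f;

            return NULL;
    }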

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 include/linux/rculist.h | 38 ++
 1 file changed, 38 insertions(+)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 17c6b1f..439c4d7 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -253,6 +253,25 @@ static inline void list_splice_init_rcu(struct list_head 
*list,
 })
 
 /**
+ * list_entry_rcu_notrace - get the struct for this entry
+ * @ptr:the  list_head pointer.
+ * @type:   the type of the struct this is embedded in.
+ * @member: the name of the list_struct within the struct.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu() as long as it's guarded by 
rcu_read_lock().
+ *
+ * This is the same as list_entry_rcu() except that it does
+ * not do any RCU debugging or tracing.
+ */
+#define list_entry_rcu_notrace(ptr, type, member) \
+({ \
+   typeof(*ptr) __rcu *__ptr = (typeof(*ptr) __rcu __force *)ptr; \
+   container_of((typeof(ptr))rcu_dereference_raw_notrace(__ptr), \
+   type, member); \
+})
+
+/**
  * Where are list_empty_rcu() and list_first_entry_rcu()?
  *
  * Implementing those functions following their counterparts list_empty() and
@@ -308,6 +327,25 @@ static inline void list_splice_init_rcu(struct list_head 
*list,
pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
 
 /**
+ * list_for_each_entry_rcu_notrace - iterate over rcu list of given type
+ * @pos:   the type * to use as a loop cursor.
+ * @head:  the head for your list.
+ * @member:the name of the list_struct within the struct.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as list_add_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ *
+ * This is the same as list_for_each_entry_rcu() except that it does
+ * not do any RCU debugging or tracing.
+ */
+#define list_for_each_entry_rcu_notrace(pos, head, member) \
+   for (pos = list_entry_rcu_notrace((head)->next, typeof(*pos), member); \
+   &pos->member != (head); \
+   pos = list_entry_rcu_notrace(pos->member.next, typeof(*pos), \
+   member))
+
+/**
  * list_for_each_entry_continue_rcu - continue iteration over list of given 
type
  * @pos:   the type * to use as a loop cursor.
  * @head:  the head for your list.
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel 3/9] KVM: PPC: Rework H_PUT_TCE/H_GET_TCE handlers

2015-09-15 Thread Alexey Kardashevskiy
This reworks the existing H_PUT_TCE/H_GET_TCE handlers to have a single
exit path. This allows the next patch to add locking nicely.

This moves the ioba boundary check to a helper and adds a check that the
least significant bits of ioba, which have to be zero, actually are zero.

The patch is pretty mechanical (only the check for the least significant
ioba bits is added) so no change in behaviour is expected.
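
For illustration only, a standalone restatement of what the new boundary check
has to guarantee (ioba aligned to the 4K IOMMU page size and the requested
range inside the window); the hunk below encodes the test slightly differently:

    #include <stdio.h>

    #define SHIFT 12        /* IOMMU_PAGE_SHIFT_4K */

    static int ioba_ok(unsigned long window_size, unsigned long ioba,
                    unsigned long npages)
    {
            unsigned long mask = (1UL << SHIFT) - 1;
            unsigned long idx = ioba >> SHIFT;
            unsigned long size = window_size >> SHIFT;

            return !(ioba & mask) && (idx + npages <= size);
    }

    int main(void)
    {
            unsigned long win = 1UL << 30;                  /* 1GB window */

            printf("%d\n", ioba_ok(win, 0x1000, 1));        /* 1: aligned, inside */
            printf("%d\n", ioba_ok(win, 0x1001, 1));        /* 0: unaligned */
            printf("%d\n", ioba_ok(win, win - 0x1000, 2));  /* 0: runs past the end */
            return 0;
    }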

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 arch/powerpc/kvm/book3s_64_vio_hv.c | 102 +++-
 1 file changed, 66 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 89e96b3..8ae12ac 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -35,71 +35,101 @@
 #include 
 #include 
 #include 
+#include 
 
 #define TCES_PER_PAGE  (PAGE_SIZE / sizeof(u64))
 
+/*
+ * Finds a TCE table descriptor by LIOBN.
+ *
+ * WARNING: This will be called in real or virtual mode on HV KVM and virtual
+ *  mode on PR KVM
+ */
+static struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu,
+   unsigned long liobn)
+{
+   struct kvm *kvm = vcpu->kvm;
+   struct kvmppc_spapr_tce_table *stt;
+
+   list_for_each_entry_rcu_notrace(stt, &kvm->arch.spapr_tce_tables, list)
+   if (stt->liobn == liobn)
+   return stt;
+
+   return NULL;
+}
+
+/*
+ * Validates IO address.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *  mode on PR KVM
+ */
+static long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+   unsigned long ioba, unsigned long npages)
+{
+   unsigned long mask = (1ULL << IOMMU_PAGE_SHIFT_4K) - 1;
+   unsigned long idx = ioba >> IOMMU_PAGE_SHIFT_4K;
+   unsigned long size = stt->window_size >> IOMMU_PAGE_SHIFT_4K;
+
+   if ((ioba & mask) || (size + npages <= idx))
+   return H_PARAMETER;
+
+   return H_SUCCESS;
+}
+
 /* WARNING: This will be called in real-mode on HV KVM and virtual
  *  mode on PR KVM
  */
 long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
  unsigned long ioba, unsigned long tce)
 {
-   struct kvm *kvm = vcpu->kvm;
-   struct kvmppc_spapr_tce_table *stt;
+   struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+   long ret = H_TOO_HARD;
+   unsigned long idx;
+   struct page *page;
+   u64 *tbl;
 
/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
/*  liobn, ioba, tce); */
 
-   list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-   if (stt->liobn == liobn) {
-   unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
-   struct page *page;
-   u64 *tbl;
+   if (!stt)
+   return ret;
 
-   /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  
window_size=0x%x\n", */
-   /*  liobn, stt, stt->window_size); */
-   if (ioba >= stt->window_size)
-   return H_PARAMETER;
+   ret = kvmppc_ioba_validate(stt, ioba, 1);
+   if (ret)
+   return ret;
 
-   page = stt->pages[idx / TCES_PER_PAGE];
-   tbl = (u64 *)page_address(page);
+   idx = ioba >> SPAPR_TCE_SHIFT;
+   page = stt->pages[idx / TCES_PER_PAGE];
+   tbl = (u64 *)page_address(page);
 
-   /* FIXME: Need to validate the TCE itself */
-   /* udbg_printf("tce @ %p\n", [idx % 
TCES_PER_PAGE]); */
-   tbl[idx % TCES_PER_PAGE] = tce;
-   return H_SUCCESS;
-   }
-   }
+   /* FIXME: Need to validate the TCE itself */
+   /* udbg_printf("tce @ %p\n", [idx % TCES_PER_PAGE]); */
+   tbl[idx % TCES_PER_PAGE] = tce;
 
-   /* Didn't find the liobn, punt it to userspace */
-   return H_TOO_HARD;
+   return ret;
 }
 EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
 
 long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
  unsigned long ioba)
 {
-   struct kvm *kvm = vcpu->kvm;
-   struct kvmppc_spapr_tce_table *stt;
+   struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+   long ret = H_TOO_HARD;
 
-   list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-   if (stt->liobn == liobn) {
+
+   if (stt) {
+   ret = kvmppc_ioba_validate(stt, ioba, 1);
+   if (!ret) {
unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
-   struct page *page;
-   u64 *tbl;
-
-   if (ioba >= stt->window_size)
-


Re: vfio: powerpc/spapr: One function call less in tce_iommu_attach_group() after kzalloc() failure

2015-06-30 Thread Alexey Kardashevskiy

On 06/30/2015 04:08 PM, SF Markus Elfring wrote:

than the existing one should have been renamed to free_exit or 
free_unlock_exit
and new one would be unlock_exit.


I chose a smaller change at this place.


I'd just drop this patch.


How do you think about to improve the affected jump labels
a bit more there?


This branch is very unlikely to work ever so I cannot think of any 
improvement here.




--
Alexey


Re: vfio: powerpc/spapr: One function call less in tce_iommu_attach_group() after kzalloc() failure

2015-06-29 Thread Alexey Kardashevskiy

On 06/29/2015 04:02 PM, SF Markus Elfring wrote:

tcegrp will be NULL and kfree() can handle this just fine


The affected function did not show this API knowledge, did it?



but you fixed this in 1/2 :)





(is not it the whole point of this patchset
- remove the check and just call kfree() even if the pointer is NULL?).


Partly, yes.



And if you wanted another label,


I suggest this to improve corresponding exception handling.



than the existing one should have been renamed to free_exit or 
free_unlock_exit
and new one would be unlock_exit.


I chose a smaller change at this place.



I'd just drop this patch.



I am not familiar enough with other called functions there at the moment.
Are the remaining goto statements also update candidates?





--
Alexey


Re: [PATCH 2/2] vfio: powerpc/spapr: One function call less in tce_iommu_attach_group() after kzalloc() failure

2015-06-28 Thread Alexey Kardashevskiy

On 06/29/2015 02:24 AM, SF Markus Elfring wrote:

From: Markus Elfring elfr...@users.sourceforge.net
Date: Sun, 28 Jun 2015 17:58:42 +0200

The kfree() function was called even if a previous memory allocation
try failed.


tcegrp will be NULL and kfree() can handle this just fine (isn't that the
whole point of this patchset - remove the check and just call kfree() even
if the pointer is NULL?). And if you wanted another label, then the
existing one should have been renamed to "free_exit" or "free_unlock_exit"
and the new one would be "unlock_exit".
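
A small illustrative sketch of that point (types and names invented):
kfree(NULL) is defined to be a no-op, so a single error label can free
unconditionally:

    static int example_alloc_two(void **a, void **b, size_t sa, size_t sb)
    {
            *a = kzalloc(sa, GFP_KERNEL);
            *b = kzalloc(sb, GFP_KERNEL);
            if (!*a || !*b) {
                    kfree(*a);      /* kfree(NULL) is a no-op, no NULL check needed */
                    kfree(*b);
                    *a = *b = NULL;
                    return -ENOMEM;
            }
            return 0;
    }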





This implementation detail could be improved by the introduction
of another jump label.

Signed-off-by: Markus Elfring elfr...@users.sourceforge.net
---
  drivers/vfio/vfio_iommu_spapr_tce.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 50ddfac..2523075 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -1200,7 +1200,7 @@ static int tce_iommu_attach_group(void *iommu_data,
tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
if (!tcegrp) {
ret = -ENOMEM;
-   goto unlock_exit;
+   goto unlock_container;
}

	if (!table_group->ops || !table_group->ops->take_ownership ||
@@ -1217,7 +1217,7 @@ static int tce_iommu_attach_group(void *iommu_data,
  unlock_exit:
if (ret)
kfree(tcegrp);
-
+unlock_container:
	mutex_unlock(&container->lock);

return ret;




--
Alexey


Re: [PATCH kernel v11 21/34] powerpc/powernv/ioda2: Add TCE invalidation for all attached groups

2015-06-04 Thread Alexey Kardashevskiy

On 06/05/2015 10:27 AM, Gavin Shan wrote:

On Fri, May 29, 2015 at 06:44:45PM +1000, Alexey Kardashevskiy wrote:

The iommu_table struct keeps a list of IOMMU groups it is used for.
At the moment there is just a single group attached but further
patches will add TCE table sharing. When sharing is enabled, the TCE cache
in each PE needs to be invalidated, which is what this patch does.

This does not change pnv_pci_ioda1_tce_invalidate() as there is no plan
to enable TCE table sharing on PHBs older than IODA2.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru


Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com


---
Changes:
v10:
* new to the series
---
arch/powerpc/platforms/powernv/pci-ioda.c | 35 ---
1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3fd8b18..94fccc8 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -24,6 +24,7 @@
#include linux/msi.h
#include linux/memblock.h
#include linux/iommu.h
+#include linux/rculist.h

#include asm/sections.h
#include asm/io.h
@@ -1764,23 +1765,15 @@ static inline void 
pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
__raw_writeq(cpu_to_be64(val), phb-ioda.tce_inval_reg);
}

-static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
-   unsigned long index, unsigned long npages, bool rm)
+static void pnv_pci_ioda2_tce_do_invalidate(unsigned pe_number, bool rm,
+   __be64 __iomem *invalidate, unsigned shift,
+   unsigned long index, unsigned long npages)


The better function name would be: pnv_pci_ioda2_do_tce_invalidate(), and


Ok.



it seems we needn't bool rm any more since invalidate has been assigned
with virtual/real address by caller.



We still need @rm here as different helpers are used for real and virt 
modes - __raw_rm_writeq and __raw_writeq.






Thanks,
Gavin


{
-   struct iommu_table_group_link *tgl = list_first_entry_or_null(
-   tbl-it_group_list, struct iommu_table_group_link,
-   next);
-   struct pnv_ioda_pe *pe = container_of(tgl-table_group,
-   struct pnv_ioda_pe, table_group);
unsigned long start, end, inc;
-   __be64 __iomem *invalidate = rm ?
-   (__be64 __iomem *)pe-phb-ioda.tce_inval_reg_phys :
-   pe-phb-ioda.tce_inval_reg;
-   const unsigned shift = tbl-it_page_shift;

/* We'll invalidate DMA address in PE scope */
start = 0x2ull  60;
-   start |= (pe-pe_number  0xFF);
+   start |= (pe_number  0xFF);
end = start;

/* Figure out the start, end and step */
@@ -1798,6 +1791,24 @@ static void pnv_pci_ioda2_tce_invalidate(struct 
iommu_table *tbl,
}
}

+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
+   unsigned long index, unsigned long npages, bool rm)
+{
+   struct iommu_table_group_link *tgl;
+
+   list_for_each_entry_rcu(tgl, tbl-it_group_list, next) {
+   struct pnv_ioda_pe *pe = container_of(tgl-table_group,
+   struct pnv_ioda_pe, table_group);
+   __be64 __iomem *invalidate = rm ?
+   (__be64 __iomem *)pe-phb-ioda.tce_inval_reg_phys :
+   pe-phb-ioda.tce_inval_reg;
+
+   pnv_pci_ioda2_tce_do_invalidate(pe-pe_number, rm,
+   invalidate, tbl-it_page_shift,
+   index, npages);
+   }
+}
+
static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
long npages, unsigned long uaddr,
enum dma_data_direction direction,
--
2.4.0.rc3.8.gfb3e7d5






--
Alexey


Re: [PATCH kernel v11 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group

2015-06-04 Thread Alexey Kardashevskiy

On 06/01/2015 04:24 PM, David Gibson wrote:

On Fri, May 29, 2015 at 06:44:41PM +1000, Alexey Kardashevskiy wrote:

Modern IBM POWERPC systems support multiple (currently two) TCE tables
per IOMMU group (a.k.a. PE). This adds a iommu_table_group container
for TCE tables. Right now just one table is supported.

For IODA, instead of embedding iommu_table, the new iommu_table_group
keeps pointers to those. The iommu_table structs are allocated
dynamically now by a pnv_pci_table_alloc() helper as PCI hotplug
code (for EEH recovery) and SRIOV are supported there.

For P5IOC2, both iommu_table_group and iommu_table are embedded into
PE struct. As there is no EEH and SRIOV support for P5IOC2,
iommu_free_table() should not be called on iommu_table struct pointers
so we can keep it embedded in pnv_phb::p5ioc2.

For pSeries, this replaces multiple calls of kzalloc_node() with a new
iommu_pseries_group_alloc() helper and stores the table group struct
pointer into the pci_dn struct. For release, a iommu_table_group_free()
helper is added.

This moves iommu_table struct allocation from SR-IOV code to
the generic DMA initialization code in pnv_pci_ioda2_setup_dma_pe.

This replaces a single pointer to iommu_group with a list of
iommu_table_group structs. For now it is just a single iommu_table_group
in this list but later with TCE table sharing enabled, the list will
keep all the IOMMU groups which use the particular table. The list
uses iommu_table_group_link structs rather than iommu_table_group::next
as a VFIO container may have 2 IOMMU tables, each will have its own list
head pointer as it is mainly for TCE invalidation code which should
walk through all attached groups and invalidate TCE cache so
the table has to keep the list head pointer. The other option would
be storing list head in a VFIO container but it would not work as
the platform code (which does TCE table update and invalidation) has
no idea about VFIO.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com


It looks like this commit message doesn't match the code - it seems
like an older or newer version of the message from the previous patch.



This patch seems instead to be about changing the table_group - table
relationship from 1:1 to many:many.



I'll put this:

===
So far one TCE table could only be used by one IOMMU group. However
IODA2 hardware allows programming the same TCE table address to
multiple PE allowing sharing tables.

This replaces a single pointer to a group in a iommu_table struct
with a linked list of groups which provides the way of invalidating
TCE cache for every PE when an actual TCE table is updated. This adds 
pnv_pci_link_table_and_group() and pnv_pci_unlink_table_and_group() helpers 
to manage the list. However without VFIO, it is still going
to be a single IOMMU group per iommu_table.

This changes iommu_add_device() to add a device to a first group
from the group list of a table as it is only called from the platform
init code or PCI bus notifier and at these moments there is only
one group per table.

This does not change TCE invalidation code to loop through all
attached groups in order to simplify this patch and because
it is not really needed in most cases. IODA2 is fixed in a later
patch.

===
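
A simplified sketch of the relationship described above (fields abridged, the
invalidation helper is hypothetical): every iommu_table carries a list of
links, one per attached group, and invalidation walks that list:

    struct iommu_table_group_link {
            struct list_head next;                  /* chained on tbl->it_group_list */
            struct rcu_head rcu;
            struct iommu_table_group *table_group;  /* the attached group (PE) */
    };

    static void example_invalidate_all(struct iommu_table *tbl)
    {
            struct iommu_table_group_link *tgl;

            list_for_each_entry_rcu(tgl, &tbl->it_group_list, next)
                    invalidate_one_pe(tgl->table_group);    /* hypothetical helper */
    }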



---
Changes:
v10:
* iommu_table is not embedded into iommu_table_group but allocated
dynamically
* iommu_table allocation is moved to a single place for IODA2's
pnv_pci_ioda_setup_dma_pe where it belongs to
* added list of groups into iommu_table; most of the code just looks at
the first item to keep the patch simpler

v9:
* s/it_group/it_table_group/
* added and used iommu_table_group_free(), from now iommu_free_table()
is only used for VIO
* added iommu_pseries_group_alloc()
* squashed powerpc/iommu: Introduce iommu_table_alloc() helper into this
---
  arch/powerpc/include/asm/iommu.h|   8 +-
  arch/powerpc/kernel/iommu.c |   9 +-
  arch/powerpc/platforms/powernv/pci-ioda.c   |  45 ++
  arch/powerpc/platforms/powernv/pci-p5ioc2.c |   3 +
  arch/powerpc/platforms/powernv/pci.c|  76 +
  arch/powerpc/platforms/powernv/pci.h|   7 ++
  arch/powerpc/platforms/pseries/iommu.c  |  33 +++-
  drivers/vfio/vfio_iommu_spapr_tce.c | 122 
  8 files changed, 242 insertions(+), 61 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 5a7267f..44a20cc 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -91,7 +91,7 @@ struct iommu_table {
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map;   /* A simple allocation bitmap for now */
unsigned long  it_page_shift;/* table iommu

Re: [PATCH kernel v11 27/34] powerpc/powernv: Implement multilevel TCE tables

2015-06-03 Thread Alexey Kardashevskiy

On 06/02/2015 09:50 AM, David Gibson wrote:

On Fri, May 29, 2015 at 06:44:51PM +1000, Alexey Kardashevskiy wrote:

TCE tables might get too big in case of 4K IOMMU pages and DDW enabled
on huge guests (hundreds of GB of RAM), so the kernel might be unable to
allocate a contiguous chunk of physical memory to store the TCE table.

To address this, POWER8 CPU (actually, IODA2) supports multi-level
TCE tables, up to 5 levels which splits the table into a tree of
smaller subtables.

This adds multi-level TCE tables support to
pnv_pci_ioda2_table_alloc_pages() and pnv_pci_ioda2_table_free_pages()
helpers.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v10:
* fixed multiple comments received for v9

v9:
* moved from ioda2 to common powernv pci code
* fixed cleanup if allocation fails in a middle
* removed check for the size - all boundary checks happen in the calling code
anyway
---
  arch/powerpc/include/asm/iommu.h  |  2 +
  arch/powerpc/platforms/powernv/pci-ioda.c | 98 ---
  arch/powerpc/platforms/powernv/pci.c  | 13 
  3 files changed, 104 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 4636734..706cfc0 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -96,6 +96,8 @@ struct iommu_pool {
  struct iommu_table {
unsigned long  it_busno; /* Bus number this table belongs to */
unsigned long  it_size;  /* Size of iommu table in entries */
+   unsigned long  it_indirect_levels;
+   unsigned long  it_level_size;
unsigned long  it_offset;/* Offset into global table */
unsigned long  it_base;  /* mapped address of tce table */
unsigned long  it_index; /* which iommu table this is */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index fda01c1..68ffc7a 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -49,6 +49,9 @@
  /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
  #define TCE32_TABLE_SIZE  ((0x1000 / 0x1000) * 8)

+#define POWERNV_IOMMU_DEFAULT_LEVELS   1
+#define POWERNV_IOMMU_MAX_LEVELS   5
+
  static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);

  static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
@@ -1975,6 +1978,8 @@ static long pnv_pci_ioda2_set_window(struct 
iommu_table_group *table_group,
table_group);
struct pnv_phb *phb = pe-phb;
int64_t rc;
+   const unsigned long size = tbl-it_indirect_levels ?
+   tbl-it_level_size : tbl-it_size;
const __u64 start_addr = tbl-it_offset  tbl-it_page_shift;
const __u64 win_size = tbl-it_size  tbl-it_page_shift;

@@ -1989,9 +1994,9 @@ static long pnv_pci_ioda2_set_window(struct 
iommu_table_group *table_group,
rc = opal_pci_map_pe_dma_window(phb-opal_id,
pe-pe_number,
pe-pe_number  1,
-   1,
+   tbl-it_indirect_levels + 1,
__pa(tbl-it_base),
-   tbl-it_size  3,
+   size  3,
IOMMU_PAGE_SIZE(tbl));
if (rc) {
pe_err(pe, Failed to configure TCE table, err %ld\n, rc);
@@ -2071,11 +2076,19 @@ static void pnv_pci_ioda_setup_opal_tce_kill(struct 
pnv_phb *phb)
phb-ioda.tce_inval_reg = ioremap(phb-ioda.tce_inval_reg_phys, 8);
  }

-static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift,
+   unsigned levels, unsigned long limit,
+   unsigned long *tce_table_allocated)
  {
struct page *tce_mem = NULL;
-   __be64 *addr;
+   __be64 *addr, *tmp;
unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
+   unsigned long local_allocated = 1UL  (order + PAGE_SHIFT);
+   unsigned entries = 1UL  (shift - 3);
+   long i;
+
+   if (*tce_table_allocated = limit)
+   return NULL;


I'm not quite clear what case this limit logic is trying to catch.



The function allocates some number of entries which may be in one chunk
of memory or spread between multiple chunks over multiple levels. "limit" is
the amount of memory for actual TCEs (not intermediate levels). If the user
requests 5 levels and I do not check this, more memory will be allocated
than actually needed because the size of the window is limited.
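
An illustrative calculation (numbers invented for the example) of why the
allocator must stop once enough leaf-level TCE memory has been allocated:

    #include <stdio.h>

    int main(void)
    {
            unsigned long window = 64UL << 30;      /* 64GB DMA window */
            unsigned long tces = window >> 12;      /* 4K IOMMU pages: 16M TCEs */
            unsigned long needed = tces * 8;        /* 128MB of leaf TCE memory */

            /* one 64K chunk holds 8192 entries; a full two-level tree would be
             * 8192 * 64K = 512MB of leaves although only 128MB is needed,
             * hence the limit/tce_table_allocated cut-off in the allocator */
            unsigned long chunk = 64UL << 10, per_chunk = chunk / 8;

            printf("needed %lu MB, full tree %lu MB\n",
                            needed >> 20, (per_chunk * chunk) >> 20);
            return 0;
    }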






tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
if (!tce_mem) {
@@ -2083,31 +2096,69 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int 
nid, unsigned shift)
return NULL;
}
addr = page_address(tce_mem);
-   memset(addr, 0, 1UL  (order

Re: [PATCH kernel v11 09/34] vfio: powerpc/spapr: Move locked_vm accounting to helpers

2015-06-03 Thread Alexey Kardashevskiy

On 06/01/2015 02:28 PM, David Gibson wrote:

On Fri, May 29, 2015 at 06:44:33PM +1000, Alexey Kardashevskiy wrote:

This moves locked pages accounting to helpers.
Later they will be reused for Dynamic DMA windows (DDW).

This reworks debug messages to show the current value and the limit.

This stores the locked pages number in the container so when unlocking
the iommu table pointer won't be needed. This does not have an effect
now but it will with the multiple tables per container as then we will
allow attaching/detaching groups on fly and we may end up having
a container with no group attached but with the counter incremented.

While we are here, update the comment explaining why RLIMIT_MEMLOCK
might be required to be bigger than the guest RAM. This also prints
pid of the current process in pr_warn/pr_debug.
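
A hedged sketch of the intended pairing (helper and field names as in the
patch, flow simplified): account when the container is enabled, remember the
count in the container, release exactly that many pages later:

    static int example_enable(struct tce_container *container, unsigned long npages)
    {
            long ret = try_increment_locked_vm(npages);

            if (ret)
                    return ret;

            container->locked_pages = npages;       /* disable() won't need the table */
            return 0;
    }

    static void example_disable(struct tce_container *container)
    {
            decrement_locked_vm(container->locked_pages);
            container->locked_pages = 0;
    }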

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v4:
* new helpers do nothing if @npages == 0
* tce_iommu_disable() now can decrement the counter if the group was
detached (not possible now but will be in the future)
---
  drivers/vfio/vfio_iommu_spapr_tce.c | 82 -
  1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 64300cc..40583f9 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -29,6 +29,51 @@
  static void tce_iommu_detach_group(void *iommu_data,
struct iommu_group *iommu_group);

+static long try_increment_locked_vm(long npages)
+{
+   long ret = 0, locked, lock_limit;
+
+   if (!current || !current-mm)
+   return -ESRCH; /* process exited */
+
+   if (!npages)
+   return 0;
+
+   down_write(current-mm-mmap_sem);
+   locked = current-mm-locked_vm + npages;


Is there a possibility of userspace triggering an integer overflow
here, if npages is really huge?



I do not see how. I just do not accept npages bigger than the host RAM size
in pages, and it is a long. For (let's say) a 128GB host, the number of 4KB
pages is (128<<30)/4096 = 33554432.






+   lock_limit = rlimit(RLIMIT_MEMLOCK)  PAGE_SHIFT;
+   if (locked  lock_limit  !capable(CAP_IPC_LOCK))
+   ret = -ENOMEM;
+   else
+   current-mm-locked_vm += npages;
+
+   pr_debug([%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n, current-pid,
+   npages  PAGE_SHIFT,
+   current-mm-locked_vm  PAGE_SHIFT,
+   rlimit(RLIMIT_MEMLOCK),
+   ret ?  - exceeded : );
+
+   up_write(current-mm-mmap_sem);
+
+   return ret;
+}
+
+static void decrement_locked_vm(long npages)
+{
+   if (!current || !current-mm || !npages)
+   return; /* process exited */
+
+   down_write(current-mm-mmap_sem);
+   if (npages  current-mm-locked_vm)
+   npages = current-mm-locked_vm;


Can this case ever occur (without there being a leak bug somewhere
else in the code)?



It should not. Safety measure. Having a warning here might make sense but I 
believe if this happens, there will be many, many warnings in other places :)





+   current-mm-locked_vm -= npages;
+   pr_debug([%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n, current-pid,
+   npages  PAGE_SHIFT,
+   current-mm-locked_vm  PAGE_SHIFT,
+   rlimit(RLIMIT_MEMLOCK));
+   up_write(current-mm-mmap_sem);
+}
+
  /*
   * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
   *
@@ -45,6 +90,7 @@ struct tce_container {
struct mutex lock;
struct iommu_table *tbl;
bool enabled;
+   unsigned long locked_pages;
  };

  static bool tce_page_is_contained(struct page *page, unsigned page_shift)
@@ -60,7 +106,7 @@ static bool tce_page_is_contained(struct page *page, 
unsigned page_shift)
  static int tce_iommu_enable(struct tce_container *container)
  {
int ret = 0;
-   unsigned long locked, lock_limit, npages;
+   unsigned long locked;
struct iommu_table *tbl = container-tbl;

if (!container-tbl)
@@ -89,21 +135,22 @@ static int tce_iommu_enable(struct tce_container 
*container)
 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
 * that would effectively kill the guest at random points, much better
 * enforcing the limit based on the max that the guest can map.
+*
+* Unfortunately at the moment it counts whole tables, no matter how
+* much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
+* each with 2GB DMA window, 8GB will be counted here. The reason for
+* this is that we cannot tell here the amount of RAM used by the guest

Re: [PATCH kernel v11 26/34] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window

2015-06-03 Thread Alexey Kardashevskiy

On 06/02/2015 09:30 AM, David Gibson wrote:

On Fri, May 29, 2015 at 06:44:50PM +1000, Alexey Kardashevskiy wrote:

This is a part of moving DMA window programming to an iommu_ops
callback. pnv_pci_ioda2_set_window() takes an iommu_table_group as
a first parameter (not pnv_ioda_pe) as it is going to be used as
a callback for VFIO DDW code.

This adds pnv_pci_ioda2_tvt_invalidate() to invalidate TVT as it is


I'm assuming that's what's now called pnv_pci_ioda2_invalidate_entire()?



Yes, my bad... And the patch is not adding it at all...





a good thing to do. It does not have immediate effect now as the table
is never recreated after reboot but it will in the following patches.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v11:
* replaced some 1it_page_shift with IOMMU_PAGE_SIZE() macro

v9:
* initialize pe-table_group.tables[0] at the very end when
tbl is fully initialized
* moved pnv_pci_ioda2_tvt_invalidate() from earlier patch
---
  arch/powerpc/platforms/powernv/pci-ioda.c | 47 +--
  1 file changed, 38 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3d29fe3..fda01c1 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1968,6 +1968,43 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
}
  }

+static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
+   int num, struct iommu_table *tbl)
+{
+   struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
+   struct pnv_phb *phb = pe-phb;
+   int64_t rc;
+   const __u64 start_addr = tbl-it_offset  tbl-it_page_shift;
+   const __u64 win_size = tbl-it_size  tbl-it_page_shift;
+
+   pe_info(pe, Setting up window %llx..%llx pg=%x\n,
+   start_addr, start_addr + win_size - 1,
+   IOMMU_PAGE_SIZE(tbl));
+
+   /*
+* Map TCE table through TVT. The TVE index is the PE number
+* shifted by 1 bit for 32-bits DMA space.
+*/
+   rc = opal_pci_map_pe_dma_window(phb-opal_id,
+   pe-pe_number,
+   pe-pe_number  1,
+   1,
+   __pa(tbl-it_base),
+   tbl-it_size  3,
+   IOMMU_PAGE_SIZE(tbl));
+   if (rc) {
+   pe_err(pe, Failed to configure TCE table, err %ld\n, rc);
+   return rc;
+   }
+
+   pnv_pci_link_table_and_group(phb-hose-node, num,
+   tbl, pe-table_group);
+   pnv_pci_ioda2_tce_invalidate_entire(pe);
+
+   return 0;
+}
+
  static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
  {
uint16_t window_id = (pe-pe_number  1 ) + 1;
@@ -2123,21 +2160,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
pe-table_group.ops = pnv_pci_ioda2_ops;
  #endif

-   /*
-* Map TCE table through TVT. The TVE index is the PE number
-* shifted by 1 bit for 32-bits DMA space.
-*/
-   rc = opal_pci_map_pe_dma_window(phb-opal_id, pe-pe_number,
-   pe-pe_number  1, 1, __pa(tbl-it_base),
-   tbl-it_size  3, 1ULL  tbl-it_page_shift);
+   rc = pnv_pci_ioda2_set_window(pe-table_group, 0, tbl);
if (rc) {
pe_err(pe, Failed to configure 32-bit TCE table,
err %ld\n, rc);
goto fail;
}

-   pnv_pci_ioda2_tce_invalidate_entire(pe);
-
/* OPAL variant of PHB3 invalidated TCEs */
if (phb-ioda.tce_inval_reg)
tbl-it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);





--
Alexey


Re: [PATCH kernel v11 33/34] vfio: powerpc/spapr: Register memory and define IOMMU v2

2015-06-03 Thread Alexey Kardashevskiy

On 06/02/2015 02:17 PM, David Gibson wrote:

On Fri, May 29, 2015 at 06:44:57PM +1000, Alexey Kardashevskiy wrote:

The existing implementation accounts the whole DMA window in
the locked_vm counter. This is going to be worse with multiple
containers and huge DMA windows. Also, real-time accounting would require
additional tracking of accounted pages due to the page size difference -
IOMMU uses 4K pages and system uses 4K or 64K pages.

Another issue is that actual pages pinning/unpinning happens on every
DMA map/unmap request. This does not affect the performance much now as
we spend way too much time now on switching context between
guest/userspace/host but this will start to matter when we add in-kernel
DMA map/unmap acceleration.

This introduces a new IOMMU type for SPAPR - VFIO_SPAPR_TCE_v2_IOMMU.
New IOMMU deprecates VFIO_IOMMU_ENABLE/VFIO_IOMMU_DISABLE and introduces
2 new ioctls to register/unregister DMA memory -
VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY -
which receive user space address and size of a memory region which
needs to be pinned/unpinned and counted in locked_vm.
New IOMMU splits physical pages pinning and TCE table update
into 2 different operations. It requires:
1) guest pages to be registered first
2) consequent map/unmap requests to work only with pre-registered memory.
For the default single window case this means that the entire guest
(instead of 2GB) needs to be pinned before using VFIO.
When a huge DMA window is added, no additional pinning will be
required, otherwise it would be guest RAM + 2GB.

The new memory registration ioctls are not supported by
VFIO_SPAPR_TCE_IOMMU. Dynamic DMA window and in-kernel acceleration
will require memory to be preregistered in order to work.

The accounting is done per the user process.

This advertises v2 SPAPR TCE IOMMU and restricts what the userspace
can do with v1 or v2 IOMMUs.

In order to support memory pre-registration, we need a way to track
the use of every registered memory region and only allow unregistration
if a region is not in use anymore. So we need a way to tell from what
region the just cleared TCE was from.

This adds a userspace view of the TCE table into iommu_table struct.
It contains userspace address, one per TCE entry. The table is only
allocated when the ownership over an IOMMU group is taken which means
it is only used from outside of the powernv code (such as VFIO).

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
---
Changes:
v11:
* mm_iommu_put() does not return a code so this does not check it
* moved v2 in tce_container to pack the struct

v10:
* moved it_userspace allocation to vfio_iommu_spapr_tce as it VFIO
specific thing
* squashed powerpc/iommu: Add userspace view of TCE table into this as
it is
a part of IOMMU v2
* s/tce_iommu_use_page_v2/tce_iommu_prereg_ua_to_hpa/
* fixed some function names to have tce_iommu_ in the beginning rather
just tce_
* as mm_iommu_mapped_inc() can now fail, check for the return code

v9:
* s/tce_get_hva_cached/tce_iommu_use_page_v2/

v7:
* now memory is registered per mm (i.e. process)
* moved memory registration code to powerpc/mmu
* merged vfio: powerpc/spapr: Define v2 IOMMU into this
* limited new ioctls to v2 IOMMU
* updated doc
* unsupported ioclts return -ENOTTY instead of -EPERM

v6:
* tce_get_hva_cached() returns hva via a pointer

v4:
* updated docs
* s/kzmalloc/vzalloc/
* in tce_pin_pages()/tce_unpin_pages() removed @vaddr, @size and
replaced offset with index
* renamed vfio_iommu_type_register_memory to vfio_iommu_spapr_register_memory
and removed duplicating vfio_iommu_spapr_register_memory
---
  Documentation/vfio.txt  |  31 ++-
  arch/powerpc/include/asm/iommu.h|   6 +
  drivers/vfio/vfio_iommu_spapr_tce.c | 512 ++--
  include/uapi/linux/vfio.h   |  27 ++
  4 files changed, 487 insertions(+), 89 deletions(-)

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 96978ec..7dcf2b5 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -289,10 +289,12 @@ PPC64 sPAPR implementation note

  This implementation has some specifics:

-1) Only one IOMMU group per container is supported as an IOMMU group
-represents the minimal entity which isolation can be guaranteed for and
-groups are allocated statically, one per a Partitionable Endpoint (PE)
+1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
+container is supported as an IOMMU table is allocated at boot time,
+one table per IOMMU group which is a Partitionable Endpoint (PE)
 (PE is often a PCI domain but not always).
+Newer systems (POWER8 with IODA2) have an improved hardware design which
+allows removing this limitation and having multiple IOMMU groups per VFIO container.

  2) The hardware supports so called DMA windows - the PCI address range
  within which DMA transfer is allowed

[PATCH kernel v11 08/34] vfio: powerpc/spapr: Use it_page_size

2015-05-29 Thread Alexey Kardashevskiy
This makes use of the it_page_size from the iommu_table struct
as page size can differ.

This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code
as recently introduced IOMMU_PAGE_XXX macros do not include
IOMMU_PAGE_SHIFT.
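
For illustration, a hedged sketch of the conversion pattern this switches to;
the helper name below is made up, only the iommu_table field and the
IOMMU_PAGE_SIZE() macro come from the kernel.

/* Number of IOMMU pages needed to cover @bytes for table @tbl, using the
 * per-table granularity instead of the fixed 4K constants. */
static unsigned long tce_pages_for_bytes(struct iommu_table *tbl,
		unsigned long bytes)
{
	return (bytes + IOMMU_PAGE_SIZE(tbl) - 1) >> tbl->it_page_shift;
}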

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 735b308..64300cc 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -91,7 +91,7 @@ static int tce_iommu_enable(struct tce_container *container)
 * enforcing the limit based on the max that the guest can map.
 */
	down_write(&current->mm->mmap_sem);
-   npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
+   npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
	locked = current->mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
@@ -120,7 +120,7 @@ static void tce_iommu_disable(struct tce_container 
*container)
 
	down_write(&current->mm->mmap_sem);
	current->mm->locked_vm -= (container->tbl->it_size <<
-   IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
+   container->tbl->it_page_shift) >> PAGE_SHIFT;
	up_write(&current->mm->mmap_sem);
 }
 
@@ -215,7 +215,7 @@ static long tce_iommu_build(struct tce_container *container,
tce, ret);
break;
}
-   tce += IOMMU_PAGE_SIZE_4K;
+   tce += IOMMU_PAGE_SIZE(tbl);
}
 
if (ret)
@@ -260,8 +260,8 @@ static long tce_iommu_ioctl(void *iommu_data,
if (info.argsz  minsz)
return -EINVAL;
 
-   info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K;
-   info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K;
+   info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
+   info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
info.flags = 0;
 
if (copy_to_user((void __user *)arg, info, minsz))
@@ -291,8 +291,8 @@ static long tce_iommu_ioctl(void *iommu_data,
VFIO_DMA_MAP_FLAG_WRITE))
return -EINVAL;
 
-   if ((param.size & ~IOMMU_PAGE_MASK_4K) ||
-   (param.vaddr & ~IOMMU_PAGE_MASK_4K))
+   if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
+   (param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
		return -EINVAL;
 
	/* iova is checked by the IOMMU API */
@@ -307,8 +307,8 @@ static long tce_iommu_ioctl(void *iommu_data,
		return ret;
 
	ret = tce_iommu_build(container, tbl,
-   param.iova >> IOMMU_PAGE_SHIFT_4K,
-   tce, param.size >> IOMMU_PAGE_SHIFT_4K);
+   param.iova >> tbl->it_page_shift,
+   tce, param.size >> tbl->it_page_shift);
 
iommu_flush_tce(tbl);
 
@@ -334,17 +334,17 @@ static long tce_iommu_ioctl(void *iommu_data,
if (param.flags)
return -EINVAL;
 
-   if (param.size & ~IOMMU_PAGE_MASK_4K)
+   if (param.size & ~IOMMU_PAGE_MASK(tbl))
		return -EINVAL;
 
	ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
-   param.size >> IOMMU_PAGE_SHIFT_4K);
+   param.size >> tbl->it_page_shift);
	if (ret)
		return ret;
 
	ret = tce_iommu_clear(container, tbl,
-   param.iova >> IOMMU_PAGE_SHIFT_4K,
-   param.size >> IOMMU_PAGE_SHIFT_4K);
+   param.iova >> tbl->it_page_shift,
+   param.size >> tbl->it_page_shift);
iommu_flush_tce(tbl);
 
return ret;
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 16/34] powerpc/spapr: vfio: Replace iommu_table with iommu_table_group

2015-05-29 Thread Alexey Kardashevskiy
Modern IBM POWERPC systems support multiple (currently two) TCE tables
per IOMMU group (a.k.a. PE). This adds an iommu_table_group container
for TCE tables. Right now just one table is supported.

This defines iommu_table_group struct which stores pointers to
iommu_group and iommu_table(s). This replaces iommu_table with
iommu_table_group where iommu_table was used to identify a group:
- iommu_register_group();
- iommudata of generic iommu_group;

This removes @data from iommu_table as it_table_group provides
same access to pnv_ioda_pe.

For IODA, instead of embedding iommu_table, the new iommu_table_group
keeps pointers to those. The iommu_table structs are allocated
dynamically.

For P5IOC2, both iommu_table_group and iommu_table are embedded into
PE struct. As there is no EEH and SRIOV support for P5IOC2,
iommu_free_table() should not be called on iommu_table struct pointers
so we can keep it embedded in pnv_phb::p5ioc2.

For pSeries, this replaces multiple calls of kzalloc_node() with a new
iommu_pseries_alloc_group() helper and stores the table group struct
pointer into the pci_dn struct. For release, a iommu_table_free_group()
helper is added.

This moves iommu_table struct allocation from SR-IOV code to
the generic DMA initialization code in pnv_pci_ioda_setup_dma_pe and
pnv_pci_ioda2_setup_dma_pe as this is where DMA is actually initialized.
This change is here because those lines had to be changed anyway.

This should cause no behavioural change.
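
A short illustrative sketch of how a table is reached after this change; the
wrapper is hypothetical, only the field layout comes from the patch.

/* Before: the group hung off the table itself (tbl->it_group).
 * After: tables live inside a per-PE iommu_table_group. */
static struct iommu_table *first_table_of(struct iommu_table_group *table_group)
{
	/* Only one table per group is supported at this point in the series. */
	return table_group->tables[0];
}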

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
---
Changes:
v11:
* iommu_table_group moved outside #ifdef CONFIG_IOMMU_API as iommu_table
is dynamically allocated and it needs a pointer to PE and
iommu_table_group is this pointer

v10:
* new to the series, separated from
powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group
* iommu_table is not embedded into iommu_table_group but allocated
dynamically in most cases
* iommu_table allocation is moved to a single place for IODA2's
pnv_pci_ioda_setup_dma_pe where it belongs to
* added list of groups into iommu_table; most of the code just looks at
the first item to keep the patch simpler
---
 arch/powerpc/include/asm/iommu.h|  19 ++---
 arch/powerpc/include/asm/pci-bridge.h   |   2 +-
 arch/powerpc/kernel/iommu.c |  17 ++---
 arch/powerpc/platforms/powernv/pci-ioda.c   |  55 +++---
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  18 +++--
 arch/powerpc/platforms/powernv/pci.h|   3 +-
 arch/powerpc/platforms/pseries/iommu.c  | 107 +++-
 drivers/vfio/vfio_iommu_spapr_tce.c |  23 +++---
 8 files changed, 152 insertions(+), 92 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index e2a45c3..5a7267f 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -91,14 +91,9 @@ struct iommu_table {
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map;   /* A simple allocation bitmap for now */
unsigned long  it_page_shift;/* table iommu page size */
-#ifdef CONFIG_IOMMU_API
-   struct iommu_group *it_group;
-#endif
+   struct iommu_table_group *it_table_group;
struct iommu_table_ops *it_ops;
void (*set_bypass)(struct iommu_table *tbl, bool enable);
-#ifdef CONFIG_PPC_POWERNV
-   void   *data;
-#endif
 };
 
 /* Pure 2^n version of get_order */
@@ -129,14 +124,22 @@ extern void iommu_free_table(struct iommu_table *tbl, 
const char *node_name);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
int nid);
+#define IOMMU_TABLE_GROUP_MAX_TABLES   1
+
+struct iommu_table_group {
+   struct iommu_group *group;
+   struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+};
+
 #ifdef CONFIG_IOMMU_API
-extern void iommu_register_group(struct iommu_table *tbl,
+
+extern void iommu_register_group(struct iommu_table_group *table_group,
 int pci_domain_number, unsigned long pe_num);
 extern int iommu_add_device(struct device *dev);
 extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
 #else
-static inline void iommu_register_group(struct iommu_table *tbl,
+static inline void iommu_register_group(struct iommu_table_group *table_group,
int pci_domain_number,
unsigned long pe_num)
 {
diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 1811c44..e2d7479 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -185,7 +185,7 @@ struct pci_dn {
 
struct  pci_dn *parent;
struct  pci_controller *phb;/* for pci devices */
-   struct  iommu_table *iommu_table

[PATCH kernel v11 31/34] vfio: powerpc/spapr: powerpc/powernv/ioda2: Use DMA windows API in ownership control

2015-05-29 Thread Alexey Kardashevskiy
Before the IOMMU user (VFIO) would take control over the IOMMU table
belonging to a specific IOMMU group. This approach did not allow sharing
tables between IOMMU groups attached to the same container.

This introduces a new IOMMU ownership flavour when the user can not
just control the existing IOMMU table but remove/create tables on demand.
If an IOMMU implements take/release_ownership() callbacks, this lets
the user have full control over the IOMMU group. When the ownership
is taken, the platform code removes all the windows so the caller must
create them.
Before returning the ownership back to the platform code, VFIO
unprograms and removes all the tables it created.

This changes IODA2's ownership handler to remove the existing table
rather than manipulating the existing one. From now on,
iommu_take_ownership() and iommu_release_ownership() are only called
from the vfio_iommu_spapr_tce driver.

Old-style ownership is still supported allowing VFIO to run on older
P5IOC2 and IODA IO controllers.

No change in userspace-visible behaviour is expected. Since it recreates
TCE tables on each ownership change, related kernel traces will appear
more often.

This adds a pnv_pci_ioda2_setup_default_config() which is called
when PE is being configured at boot time and when the ownership is
passed from VFIO to the platform code.
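
In rough pseudo-code, the ownership round-trip this patch establishes on IODA2
(illustrative only; the named functions are the ones added or used by the
series):

/*
 * VFIO attaches a group:
 *   table_group->ops->take_ownership()
 *     -> pnv_pci_ioda2_set_bypass(pe, false);
 *     -> pnv_pci_ioda2_unset_window(&pe->table_group, 0);
 *     -> pnv_ioda2_table_free(tbl);              default window is gone
 *   VFIO then creates and programs its own window(s) via the group ops.
 *
 * VFIO detaches the last group:
 *   table_group->ops->release_ownership()
 *     -> pnv_pci_ioda2_setup_default_config(pe);  default window is back
 */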

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
---
Changes:
v10:
* created pnv_pci_ioda2_setup_default_config() helper

v9:
* fixed crash in tce_iommu_detach_group() on tbl->it_ops->free as
tce_iommu_attach_group() used to initialize the table from a descriptor
on stack (it does not matter for the series as this bit is changed later anyway
but it ruins bisectability)

v6:
* fixed commit log that VFIO removes tables before passing ownership
back to the platform code, not userspace
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 101 --
 drivers/vfio/vfio_iommu_spapr_tce.c   |  88 +-
 2 files changed, 141 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index c77c85e..6057ca4 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2072,6 +2072,49 @@ static long pnv_pci_ioda2_create_table(struct 
iommu_table_group *table_group,
return 0;
 }
 
+static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
+{
+   struct iommu_table *tbl = NULL;
+   long rc;
+
+   rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
+   IOMMU_PAGE_SHIFT_4K,
+   pe->table_group.tce32_size,
+   POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
+   if (rc) {
+   pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
+   rc);
+   return rc;
+   }
+
+   iommu_init_table(tbl, pe->phb->hose->node);
+
+   rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+   if (rc) {
+   pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
+   rc);
+   pnv_ioda2_table_free(tbl);
+   return rc;
+   }
+
+   if (!pnv_iommu_bypass_disabled)
+   pnv_pci_ioda2_set_bypass(pe, true);
+
+   /* OPAL variant of PHB3 invalidated TCEs */
+   if (pe->phb->ioda.tce_inval_reg)
+   tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+
+   /*
+* Setting table base here only for carrying iommu_group
+* further down to let iommu_add_device() do the job.
+* pnv_pci_ioda_dma_dev_setup will override it later anyway.
+*/
+   if (pe->flags & PNV_IODA_PE_DEV)
+   set_iommu_table_base(&pe->pdev->dev, tbl);
+
+   return 0;
+}
+
 #ifdef CONFIG_IOMMU_API
 static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
__u64 window_size, __u32 levels)
@@ -2133,9 +2176,12 @@ static void pnv_ioda2_take_ownership(struct 
iommu_table_group *table_group)
 {
	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
			table_group);
+   /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
+   struct iommu_table *tbl = pe->table_group.tables[0];
 
-   iommu_take_ownership(table_group->tables[0]);
	pnv_pci_ioda2_set_bypass(pe, false);
+   pnv_pci_ioda2_unset_window(&pe->table_group, 0);
+   pnv_ioda2_table_free(tbl);
 }
 
 static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
@@ -2143,8 +2189,7 @@ static void pnv_ioda2_release_ownership(struct 
iommu_table_group *table_group)
struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
table_group);
 
-   iommu_release_ownership

[PATCH kernel v11 12/34] vfio: powerpc/spapr: Rework groups attaching

2015-05-29 Thread Alexey Kardashevskiy
This is to make extended ownership and multiple groups support patches
simpler for review.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 40 ++---
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 115d5e6..0fbe03e 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -460,16 +460,21 @@ static int tce_iommu_attach_group(void *iommu_data,
		iommu_group_id(container->tbl->it_group),
		iommu_group_id(iommu_group));
		ret = -EBUSY;
-   } else if (container->enabled) {
+   goto unlock_exit;
+   }
+
+   if (container->enabled) {
		pr_err("tce_vfio: attaching group #%u to enabled container\n",
				iommu_group_id(iommu_group));
		ret = -EBUSY;
-   } else {
-   ret = iommu_take_ownership(tbl);
-   if (!ret)
-   container->tbl = tbl;
+   goto unlock_exit;
	}
 
+   ret = iommu_take_ownership(tbl);
+   if (!ret)
+   container->tbl = tbl;
+
+unlock_exit:
	mutex_unlock(&container->lock);
 
return ret;
@@ -487,19 +492,22 @@ static void tce_iommu_detach_group(void *iommu_data,
		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
				iommu_group_id(iommu_group),
				iommu_group_id(tbl->it_group));
-   } else {
-   if (container->enabled) {
-   pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
-   iommu_group_id(tbl->it_group));
-   tce_iommu_disable(container);
-   }
+   goto unlock_exit;
+   }
 
-   /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
-   iommu_group_id(iommu_group), iommu_group); */
-   container->tbl = NULL;
-   tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
-   iommu_release_ownership(tbl);
+   if (container->enabled) {
+   pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
+   iommu_group_id(tbl->it_group));
+   tce_iommu_disable(container);
	}
+
+   /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+  iommu_group_id(iommu_group), iommu_group); */
+   container->tbl = NULL;
+   tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+   iommu_release_ownership(tbl);
+
+unlock_exit:
	mutex_unlock(&container->lock);
 }
 
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 07/34] vfio: powerpc/spapr: Check that IOMMU page is fully contained by system page

2015-05-29 Thread Alexey Kardashevskiy
This checks that the TCE table page size is not bigger than the size of
a page we have just pinned and whose physical address we are going to put
into the table.

Otherwise the hardware gets unwanted access to physical memory between
the end of the actual page and the end of the aligned up TCE page.

Since compound_order() and compound_head() work correctly on non-huge
pages, there is no need for an additional check of whether the page is huge.
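
A worked example of the condition being enforced (numbers are illustrative
only):

/*
 *   64K system page (PAGE_SHIFT = 16), 4K IOMMU page (page_shift = 12):
 *     16 + compound_order(0) = 16 >= 12            -> allowed
 *
 *   4K system page (PAGE_SHIFT = 12), 64K IOMMU page (page_shift = 16):
 *     12 + compound_order(0) = 12 <  16            -> rejected (-EPERM)
 *     unless the pinned page is part of a big enough compound page,
 *     e.g. a 16M huge page: 12 + 12 = 24 >= 16     -> allowed
 */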

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v8: changed subject

v6:
* the helper is simplified to one line

v4:
* s/tce_check_page_size/tce_page_is_contained/
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index b95fa2b..735b308 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -47,6 +47,16 @@ struct tce_container {
bool enabled;
 };
 
+static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+{
+   /*
+* Check that the TCE table granularity is not bigger than the size of
+* a page we just found. Otherwise the hardware can get access to
+* a bigger memory chunk than it should.
+*/
+   return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
+}
+
 static int tce_iommu_enable(struct tce_container *container)
 {
int ret = 0;
@@ -189,6 +199,12 @@ static long tce_iommu_build(struct tce_container 
*container,
ret = -EFAULT;
break;
}
+
+   if (!tce_page_is_contained(page, tbl->it_page_shift)) {
+   ret = -EPERM;
+   break;
+   }
+
hva = (unsigned long) page_address(page) + offset;
 
ret = iommu_tce_build(tbl, entry + i, hva, direction);
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 29/34] powerpc/powernv/ioda2: Use new helpers to do proper cleanup on PE release

2015-05-29 Thread Alexey Kardashevskiy
The existing code programmed TVT#0 with some address and then
immediately released that memory.

This makes use of pnv_pci_ioda2_unset_window() and
pnv_pci_ioda2_set_bypass() which do correct resource release and
TVT update.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 25 ++---
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 1059bf6..00739883 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1287,34 +1287,21 @@ m64_failed:
return -EBUSY;
 }
 
+static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
+   int num);
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+
 static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct 
pnv_ioda_pe *pe)
 {
-   struct pci_bus *bus;
-   struct pci_controller *hose;
-   struct pnv_phb *phb;
	struct iommu_table *tbl;
-   unsigned long addr;
	int64_t rc;
 
-   bus = dev->bus;
-   hose = pci_bus_to_host(bus);
-   phb = hose->private_data;
	tbl = pe->table_group.tables[0];
-   addr = tbl->it_base;
-
-   opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-  pe->pe_number << 1, 1, __pa(addr),
-  0, 0x1000);
-
-   rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
-   pe->pe_number,
-   (pe->pe_number << 1) + 1,
-   pe->tce_bypass_base,
-   0);
+   rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
	if (rc)
		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
 
-   pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
+   pnv_pci_ioda2_set_bypass(pe, false);
if (pe-table_group.group) {
iommu_group_put(pe-table_group.group);
BUG_ON(pe-table_group.group);
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 18/34] vfio: powerpc/spapr/iommu/powernv/ioda2: Rework IOMMU ownership control

2015-05-29 Thread Alexey Kardashevskiy
This adds tce_iommu_take_ownership() and tce_iommu_release_ownership
which call in a loop iommu_take_ownership()/iommu_release_ownership()
for every table on the group. As there is just one now, no change in
behaviour is expected.

At the moment the iommu_table struct has a set_bypass() which enables/
disables DMA bypass on IODA2 PHB. This is exposed to POWERPC IOMMU code
which calls this callback when external IOMMU users such as VFIO are
about to get over a PHB.

The set_bypass() callback is not really an iommu_table function but
IOMMU/PE function. This introduces a iommu_table_group_ops struct and
adds take_ownership()/release_ownership() callbacks to it which are
called when an external user takes/releases control over the IOMMU.

This replaces set_bypass() with ownership callbacks as it is not
necessarily just bypass enabling, it can be something else/more
so let's give it more generic name.

The callbacks is implemented for IODA2 only. Other platforms (P5IOC2,
IODA1) will use the old iommu_take_ownership/iommu_release_ownership API.
The following patches will replace iommu_take_ownership/
iommu_release_ownership calls in IODA2 with full IOMMU table release/
create.

As we are here and touching bypass control anyway, this removes
pnv_pci_ioda2_setup_bypass_pe() as it does not do much
more than pnv_pci_ioda2_set_bypass(). This moves tce_bypass_base
initialization to pnv_pci_ioda2_setup_dma_pe.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v10:
* fixed comments around take_ownership/release_ownership in 
iommu_table_group_ops

v9:
* squashed vfio: powerpc/spapr: powerpc/iommu: Rework IOMMU ownership control
and vfio: powerpc/spapr: powerpc/powernv/ioda2: Rework IOMMU ownership control
into a single patch
* moved helpers with a loop through tables in a group
to vfio_iommu_spapr_tce.c to keep the platform code free of IOMMU table
groups as much as possible
* added missing tce_iommu_clear() to tce_iommu_release_ownership()
* replaced the set_ownership(enable) callback with take_ownership() and
release_ownership()
---
 arch/powerpc/include/asm/iommu.h  | 11 -
 arch/powerpc/kernel/iommu.c   | 12 -
 arch/powerpc/platforms/powernv/pci-ioda.c | 73 ++-
 drivers/vfio/vfio_iommu_spapr_tce.c   | 70 ++---
 4 files changed, 118 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 44a20cc..489133c 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -93,7 +93,6 @@ struct iommu_table {
unsigned long  it_page_shift;/* table iommu page size */
struct list_head it_group_list;/* List of iommu_table_group_link */
struct iommu_table_ops *it_ops;
-   void (*set_bypass)(struct iommu_table *tbl, bool enable);
 };
 
 /* Pure 2^n version of get_order */
@@ -126,6 +125,15 @@ extern struct iommu_table *iommu_init_table(struct 
iommu_table * tbl,
int nid);
 #define IOMMU_TABLE_GROUP_MAX_TABLES   1
 
+struct iommu_table_group;
+
+struct iommu_table_group_ops {
+   /* Switch ownership from platform code to external user (e.g. VFIO) */
+   void (*take_ownership)(struct iommu_table_group *table_group);
+   /* Switch ownership from external user (e.g. VFIO) back to core */
+   void (*release_ownership)(struct iommu_table_group *table_group);
+};
+
 struct iommu_table_group_link {
struct list_head next;
struct rcu_head rcu;
@@ -135,6 +143,7 @@ struct iommu_table_group_link {
 struct iommu_table_group {
struct iommu_group *group;
struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+   struct iommu_table_group_ops *ops;
 };
 
 #ifdef CONFIG_IOMMU_API
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index e305a8f..b6a397a 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1047,14 +1047,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
 
memset(tbl-it_map, 0xff, sz);
 
-   /*
-* Disable iommu bypass, otherwise the user can DMA to all of
-* our physical memory via the bypass window instead of just
-* the pages that has been explicitly mapped into the iommu
-*/
-   if (tbl-set_bypass)
-   tbl-set_bypass(tbl, false);
-
return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_take_ownership);
@@ -1068,10 +1060,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
/* Restore bit#0 set by iommu_init_table() */
if (tbl-it_offset == 0)
set_bit(0, tbl-it_map);
-
-   /* The kernel owns the device now, we can restore the iommu bypass */
-   if (tbl-set_bypass)
-   tbl-set_bypass(tbl, true);
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership

[PATCH kernel v11 34/34] vfio: powerpc/spapr: Support Dynamic DMA windows

2015-05-29 Thread Alexey Kardashevskiy
This adds create/remove window ioctls to create and remove DMA windows.
sPAPR defines a Dynamic DMA windows capability which allows
para-virtualized guests to create additional DMA windows on a PCI bus.
Existing Linux kernels use this new window to map the entire guest
memory and switch to direct DMA operations, saving time on map/unmap
requests which would otherwise happen in big amounts.

This adds 2 ioctl handlers - VFIO_IOMMU_SPAPR_TCE_CREATE and
VFIO_IOMMU_SPAPR_TCE_REMOVE - to create and remove windows.
Up to 2 windows are supported now by the hardware and by this driver.

This changes VFIO_IOMMU_SPAPR_TCE_GET_INFO handler to return additional
information such as a number of supported windows and maximum number
levels of TCE tables.

DDW is added as a capability, not as a SPAPR TCE IOMMU v2 unique feature
as we still want to support v2 on platforms which cannot do DDW for
the sake of TCE acceleration in KVM (coming soon).
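
For illustration only: a hedged userspace sketch of the two new ioctls (field
names follow the uapi structures added by this patch; capability checks and
error handling are omitted).

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Create a second, huge DMA window and remove it again. */
static int create_and_remove_window(int container_fd)
{
	struct vfio_iommu_spapr_tce_create create = {
		.argsz = sizeof(create),
		.page_shift = 16,		/* 64K IOMMU pages */
		.window_size = 1ULL << 40,	/* 1TB window */
		.levels = 1,
	};
	struct vfio_iommu_spapr_tce_remove remove = {
		.argsz = sizeof(remove),
	};

	if (ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create))
		return -1;

	/* The kernel picks the bus address and returns it in start_addr. */
	remove.start_addr = create.start_addr;

	return ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
}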

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
---
Changes:
v7:
* s/VFIO_IOMMU_INFO_DDW/VFIO_IOMMU_SPAPR_INFO_DDW/
* fixed typos in and updated vfio.txt
* fixed VFIO_IOMMU_SPAPR_TCE_GET_INFO handler
* moved ddw properties to vfio_iommu_spapr_tce_ddw_info

v6:
* added explicit VFIO_IOMMU_INFO_DDW flag to vfio_iommu_spapr_tce_info,
it used to be page mask flags from platform code
* added explicit pgsizes field
* added cleanup if tce_iommu_create_window() failed in a middle
* added checks for callbacks in tce_iommu_create_window and remove those
from tce_iommu_remove_window when it is too late to test anyway
* spapr_tce_find_free_table returns sensible error code now
* updated description of VFIO_IOMMU_SPAPR_TCE_CREATE/
VFIO_IOMMU_SPAPR_TCE_REMOVE

v4:
* moved code to tce_iommu_create_window()/tce_iommu_remove_window()
helpers
* added docs
---
 Documentation/vfio.txt  |  19 
 arch/powerpc/include/asm/iommu.h|   2 +-
 drivers/vfio/vfio_iommu_spapr_tce.c | 196 +++-
 include/uapi/linux/vfio.h   |  61 ++-
 4 files changed, 273 insertions(+), 5 deletions(-)

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 7dcf2b5..8b1ec51 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -452,6 +452,25 @@ address is from pre-registered range.
 
 This separation helps in optimizing DMA for guests.
 
+6) sPAPR specification allows guests to have an additional DMA window(s) on
+a PCI bus with a variable page size. Two ioctls have been added to support
+this: VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE.
+The platform has to support the functionality or an error will be returned to
+the userspace. The existing hardware supports up to 2 DMA windows: one is
+2GB long, uses 4K pages and is called the default 32bit window; the other can
+be as big as the entire RAM, can use a different page size and is optional -
+guests create it at run-time if the guest driver supports 64bit DMA.
+
+VFIO_IOMMU_SPAPR_TCE_CREATE receives a page shift, a DMA window size and
+a number of TCE table levels (if a TCE table is going to be big enough and
+the kernel may not be able to allocate enough of physically contiguous memory).
+It creates a new window in the available slot and returns the bus address where
+the new window starts. Due to hardware limitation, the user space cannot choose
+the location of DMA windows.
+
+VFIO_IOMMU_SPAPR_TCE_REMOVE receives the bus start address of the window
+and removes it.
+
 ---
 
 [1] VFIO was originally an acronym for Virtual Function I/O in its
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index f9957eb..ca18cff 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -149,7 +149,7 @@ extern void iommu_free_table(struct iommu_table *tbl, const 
char *node_name);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
int nid);
-#define IOMMU_TABLE_GROUP_MAX_TABLES   1
+#define IOMMU_TABLE_GROUP_MAX_TABLES   2
 
 struct iommu_table_group;
 
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index cadd9f8..199d5db 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -211,6 +211,18 @@ static long tce_iommu_find_table(struct tce_container 
*container,
return -1;
 }
 
+static int tce_iommu_find_free_table(struct tce_container *container)
+{
+   int i;
+
+   for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+   if (!container->tables[i])
+   return i;
+   }
+
+   return -ENOSPC;
+}
+
 static int tce_iommu_enable(struct tce_container *container)
 {
int ret = 0;
@@ -593,11 +605,115 @@ static void tce_iommu_free_table(struct iommu_table *tbl

[PATCH kernel v11 28/34] vfio: powerpc/spapr: powerpc/powernv/ioda: Define and implement DMA windows API

2015-05-29 Thread Alexey Kardashevskiy
This extends iommu_table_group_ops by a set of callbacks to support
dynamic DMA windows management.

create_table() creates a TCE table with specific parameters.
it receives iommu_table_group to know nodeid in order to allocate
TCE table memory closer to the PHB. The exact format of allocated
multi-level table might be also specific to the PHB model (not
the case now though).
This callback calculates the DMA window offset on the PCI bus from @num
and stores it in the just created table.

set_window() sets the window at specified TVT index + @num on PHB.

unset_window() unsets the window from specified TVT.

This adds a free() callback to iommu_table_ops to free the memory
(potentially a tree of tables) allocated for the TCE table.

create_table() and free() are supposed to be called once per
VFIO container and set_window()/unset_window() are supposed to be
called for every group in a container.

This adds IOMMU capabilities to iommu_table_group such as default
32bit window parameters and others. This makes use of new values in
vfio_iommu_spapr_tce. IODA1/P5IOC2 do not support DDW so they do not
advertise pagemasks to the userspace.
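
The intended calling pattern, sketched as an illustrative wrapper (the
callbacks and their signatures are the ones defined by this patch):

/* One table per container, programmed into a group via set_window(). */
static long example_program_window(struct iommu_table_group *table_group,
		int num, __u32 page_shift, __u64 window_size, __u32 levels)
{
	struct iommu_table *tbl = NULL;
	long ret;

	/* create_table()/free() are called once per VFIO container ... */
	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	/* ... set_window()/unset_window() once per group in the container. */
	ret = table_group->ops->set_window(table_group, num, tbl);
	if (ret)
		tbl->it_ops->free(tbl);

	return ret;
}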

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Acked-by: Alex Williamson alex.william...@redhat.com
---
Changes:
v10:
* squashed vfio: powerpc/spapr: Use 32bit DMA window properties from 
table_group
into this
* shortened the subject

v9:
* new in the series - to make the next patch simpler
---
 arch/powerpc/include/asm/iommu.h| 19 ++
 arch/powerpc/platforms/powernv/pci-ioda.c   | 96 ++---
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  7 ++-
 drivers/vfio/vfio_iommu_spapr_tce.c | 19 +++---
 4 files changed, 124 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 706cfc0..e554175 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -70,6 +70,7 @@ struct iommu_table_ops {
/* get() returns a physical address */
unsigned long (*get)(struct iommu_table *tbl, long index);
void (*flush)(struct iommu_table *tbl);
+   void (*free)(struct iommu_table *tbl);
 };
 
 /* These are used by VIO */
@@ -146,6 +147,17 @@ extern struct iommu_table *iommu_init_table(struct 
iommu_table * tbl,
 struct iommu_table_group;
 
 struct iommu_table_group_ops {
+   long (*create_table)(struct iommu_table_group *table_group,
+   int num,
+   __u32 page_shift,
+   __u64 window_size,
+   __u32 levels,
+   struct iommu_table **ptbl);
+   long (*set_window)(struct iommu_table_group *table_group,
+   int num,
+   struct iommu_table *tblnew);
+   long (*unset_window)(struct iommu_table_group *table_group,
+   int num);
/* Switch ownership from platform code to external user (e.g. VFIO) */
void (*take_ownership)(struct iommu_table_group *table_group);
/* Switch ownership from external user (e.g. VFIO) back to core */
@@ -159,6 +171,13 @@ struct iommu_table_group_link {
 };
 
 struct iommu_table_group {
+   /* IOMMU properties */
+   __u32 tce32_start;
+   __u32 tce32_size;
+   __u64 pgsizes; /* Bitmap of supported page sizes */
+   __u32 max_dynamic_windows_supported;
+   __u32 max_levels;
+
struct iommu_group *group;
struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
struct iommu_table_group_ops *ops;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 68ffc7a..1059bf6 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -25,6 +25,7 @@
 #include linux/memblock.h
 #include linux/iommu.h
 #include linux/rculist.h
+#include linux/sizes.h
 
 #include asm/sections.h
 #include asm/io.h
@@ -1868,6 +1869,12 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, 
long index,
pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
 }
 
+static void pnv_ioda2_table_free(struct iommu_table *tbl)
+{
+   pnv_pci_ioda2_table_free_pages(tbl);
+   iommu_free_table(tbl, pnv);
+}
+
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
.set = pnv_ioda2_tce_build,
 #ifdef CONFIG_IOMMU_API
@@ -1875,6 +1882,7 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 #endif
.clear = pnv_ioda2_tce_free,
.get = pnv_tce_get,
+   .free = pnv_ioda2_table_free,
 };
 
 static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
@@ -1945,6 +1953,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 TCE_PCI_SWINV_PAIR);
 
tbl-it_ops = pnv_ioda1_iommu_ops;
+   pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
+   pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift

[PATCH kernel v11 09/34] vfio: powerpc/spapr: Move locked_vm accounting to helpers

2015-05-29 Thread Alexey Kardashevskiy
This moves locked pages accounting to helpers.
Later they will be reused for Dynamic DMA windows (DDW).

This reworks debug messages to show the current value and the limit.

This stores the locked pages number in the container so when unlocking
the iommu table pointer won't be needed. This does not have an effect
now but it will with the multiple tables per container as then we will
allow attaching/detaching groups on fly and we may end up having
a container with no group attached but with the counter incremented.

While we are here, update the comment explaining why RLIMIT_MEMLOCK
might be required to be bigger than the guest RAM. This also prints
pid of the current process in pr_warn/pr_debug.
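
A numeric illustration of what gets accounted (assumptions noted in the
comments):

/*
 * Illustrative only: a 2GB default window with 4K IOMMU pages on a
 * 64K-page host.
 *
 *   TCE entries     = 2GB >> 12               = 524288
 *   bytes covered   = 524288 << 12            = 2GB
 *   locked_vm pages = 2GB >> PAGE_SHIFT (16)  = 32768 system pages
 *
 * i.e. locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT,
 * which is what tce_iommu_enable() passes to try_increment_locked_vm()
 * in the diff below.
 */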

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v4:
* new helpers do nothing if @npages == 0
* tce_iommu_disable() now can decrement the counter if the group was
detached (not possible now but will be in the future)
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 82 -
 1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 64300cc..40583f9 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -29,6 +29,51 @@
 static void tce_iommu_detach_group(void *iommu_data,
struct iommu_group *iommu_group);
 
+static long try_increment_locked_vm(long npages)
+{
+   long ret = 0, locked, lock_limit;
+
+   if (!current || !current->mm)
+   return -ESRCH; /* process exited */
+
+   if (!npages)
+   return 0;
+
+   down_write(&current->mm->mmap_sem);
+   locked = current->mm->locked_vm + npages;
+   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+   if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+   ret = -ENOMEM;
+   else
+   current->mm->locked_vm += npages;
+
+   pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
+   npages << PAGE_SHIFT,
+   current->mm->locked_vm << PAGE_SHIFT,
+   rlimit(RLIMIT_MEMLOCK),
+   ret ? " - exceeded" : "");
+
+   up_write(&current->mm->mmap_sem);
+
+   return ret;
+}
+
+static void decrement_locked_vm(long npages)
+{
+   if (!current || !current->mm || !npages)
+   return; /* process exited */
+
+   down_write(&current->mm->mmap_sem);
+   if (npages > current->mm->locked_vm)
+   npages = current->mm->locked_vm;
+   current->mm->locked_vm -= npages;
+   pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
+   npages << PAGE_SHIFT,
+   current->mm->locked_vm << PAGE_SHIFT,
+   rlimit(RLIMIT_MEMLOCK));
+   up_write(&current->mm->mmap_sem);
+}
+
 /*
  * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  *
@@ -45,6 +90,7 @@ struct tce_container {
struct mutex lock;
struct iommu_table *tbl;
bool enabled;
+   unsigned long locked_pages;
 };
 
 static bool tce_page_is_contained(struct page *page, unsigned page_shift)
@@ -60,7 +106,7 @@ static bool tce_page_is_contained(struct page *page, 
unsigned page_shift)
 static int tce_iommu_enable(struct tce_container *container)
 {
int ret = 0;
-   unsigned long locked, lock_limit, npages;
+   unsigned long locked;
struct iommu_table *tbl = container-tbl;
 
if (!container-tbl)
@@ -89,21 +135,22 @@ static int tce_iommu_enable(struct tce_container 
*container)
 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
 * that would effectively kill the guest at random points, much better
 * enforcing the limit based on the max that the guest can map.
+*
+* Unfortunately at the moment it counts whole tables, no matter how
+* much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
+* each with 2GB DMA window, 8GB will be counted here. The reason for
+* this is that we cannot tell here the amount of RAM used by the guest
+* as this information is only available from KVM and VFIO is
+* KVM agnostic.
 */
-   down_write(&current->mm->mmap_sem);
-   npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
-   locked = current->mm->locked_vm + npages;
-   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-   if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
-   pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
-   rlimit(RLIMIT_MEMLOCK));
-   ret = -ENOMEM;
-   } else {
+   locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+   ret = try_increment_locked_vm(locked);
+   if (ret)
+   return

[PATCH kernel v11 04/34] powerpc/iommu: Put IOMMU group explicitly

2015-05-29 Thread Alexey Kardashevskiy
So far an iommu_table lifetime was the same as PE. Dynamic DMA windows
will change this and iommu_free_table() will not always require
the group to be released.

This moves iommu_group_put() out of iommu_free_table().

This adds an iommu_pseries_free_table() helper which does
iommu_group_put() and iommu_free_table(). Later it will be
changed to receive a table_group and we will have to change fewer
lines then.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 arch/powerpc/kernel/iommu.c   |  7 ---
 arch/powerpc/platforms/powernv/pci-ioda.c |  5 +
 arch/powerpc/platforms/pseries/iommu.c| 16 +++-
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index b054f33..3d47eb3 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -726,13 +726,6 @@ void iommu_free_table(struct iommu_table *tbl, const char 
*node_name)
if (tbl-it_offset == 0)
clear_bit(0, tbl-it_map);
 
-#ifdef CONFIG_IOMMU_API
-   if (tbl-it_group) {
-   iommu_group_put(tbl-it_group);
-   BUG_ON(tbl-it_group);
-   }
-#endif
-
/* verify that table contains no entries */
if (!bitmap_empty(tbl-it_map, tbl-it_size))
pr_warn(%s: Unexpected TCEs for %s\n, __func__, node_name);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8ca7abd..8c3c4bf 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -23,6 +23,7 @@
 #include linux/io.h
 #include linux/msi.h
 #include linux/memblock.h
+#include linux/iommu.h
 
 #include asm/sections.h
 #include asm/io.h
@@ -1310,6 +1311,10 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev 
*dev, struct pnv_ioda_pe
if (rc)
pe_warn(pe, OPAL error %ld release DMA window\n, rc);
 
+   if (tbl->it_group) {
+   iommu_group_put(tbl->it_group);
+   BUG_ON(tbl->it_group);
+   }
iommu_free_table(tbl, of_node_full_name(dev-dev.of_node));
free_pages(addr, get_order(TCE32_TABLE_SIZE));
pe-tce32_table = NULL;
diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 05ab06d..fe5117b 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -36,6 +36,7 @@
 #include linux/crash_dump.h
 #include linux/memory.h
 #include linux/of.h
+#include linux/iommu.h
 #include asm/io.h
 #include asm/prom.h
 #include asm/rtas.h
@@ -51,6 +52,18 @@
 
 #include pseries.h
 
+static void iommu_pseries_free_table(struct iommu_table *tbl,
+   const char *node_name)
+{
+#ifdef CONFIG_IOMMU_API
+   if (tbl->it_group) {
+   iommu_group_put(tbl->it_group);
+   BUG_ON(tbl->it_group);
+   }
+#endif
+   iommu_free_table(tbl, node_name);
+}
+
 static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
  __be64 *startp, __be64 *endp)
 {
@@ -1271,7 +1284,8 @@ static int iommu_reconfig_notifier(struct notifier_block 
*nb, unsigned long acti
 */
remove_ddw(np, false);
if (pci  pci-iommu_table)
-   iommu_free_table(pci-iommu_table, np-full_name);
+   iommu_pseries_free_table(pci-iommu_table,
+   np-full_name);
 
spin_lock(direct_window_list_lock);
list_for_each_entry(window, direct_window_list, list) {
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 30/34] powerpc/iommu/ioda2: Add get_table_size() to calculate the size of future table

2015-05-29 Thread Alexey Kardashevskiy
This adds a way for the IOMMU user to know how much memory a new table will
use so it can be accounted in the locked_vm limit before the allocation
happens.

This stores the allocated table size in pnv_pci_ioda2_get_table_size()
so the locked_vm counter can be updated correctly when a table is
being disposed.

This defines an iommu_table_group_ops callback to let VFIO know
how much memory will be locked if a table is created.
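
A worked example of the calculation (numbers are illustrative; rounding
follows the helper added below):

/*
 * 1GB window, 4K IOMMU pages, a single level:
 *
 *   window_shift   = ilog2(1GB) = 30
 *   entries_shift  = 30 - 12    = 18     (262144 TCEs, 8 bytes each)
 *   table_shift    = 18 + 3     = 21
 *   tce_table_size = max(4K, 1 << 21) = 2MB
 *
 * With levels = 1 the loop adds a single 2MB allocation, so userspace
 * would account 2MB >> PAGE_SHIFT pages in locked_vm before asking for
 * the window.
 */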

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v10:
* s/ROUND_UP/_ALIGN_UP/
* fixed rounding up for @entries_shift (used to use ROUND_UP)

v9:
* reimplemented the whole patch
---
 arch/powerpc/include/asm/iommu.h  |  5 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 35 +++
 2 files changed, 40 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index e554175..9d37492 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -99,6 +99,7 @@ struct iommu_table {
unsigned long  it_size;  /* Size of iommu table in entries */
unsigned long  it_indirect_levels;
unsigned long  it_level_size;
+   unsigned long  it_allocated_size;
unsigned long  it_offset;/* Offset into global table */
unsigned long  it_base;  /* mapped address of tce table */
unsigned long  it_index; /* which iommu table this is */
@@ -147,6 +148,10 @@ extern struct iommu_table *iommu_init_table(struct 
iommu_table * tbl,
 struct iommu_table_group;
 
 struct iommu_table_group_ops {
+   unsigned long (*get_table_size)(
+   __u32 page_shift,
+   __u64 window_size,
+   __u32 levels);
long (*create_table)(struct iommu_table_group *table_group,
int num,
__u32 page_shift,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 00739883..c77c85e 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -41,6 +41,7 @@
 #include asm/debug.h
 #include asm/firmware.h
 #include asm/pnv-pci.h
+#include asm/mmzone.h
 
 #include misc/cxl.h
 
@@ -2072,6 +2073,38 @@ static long pnv_pci_ioda2_create_table(struct 
iommu_table_group *table_group,
 }
 
 #ifdef CONFIG_IOMMU_API
+static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
+   __u64 window_size, __u32 levels)
+{
+   unsigned long bytes = 0;
+   const unsigned window_shift = ilog2(window_size);
+   unsigned entries_shift = window_shift - page_shift;
+   unsigned table_shift = entries_shift + 3;
+   unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
+   unsigned long direct_table_size;
+
+   if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
+   (window_size > memory_hotplug_max()) ||
+   !is_power_of_2(window_size))
+   return 0;
+
+   /* Calculate a direct table size from window_size and levels */
+   entries_shift = (entries_shift + levels - 1) / levels;
+   table_shift = entries_shift + 3;
+   table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
+   direct_table_size =  1UL << table_shift;
+
+   for ( ; levels; --levels) {
+   bytes += _ALIGN_UP(tce_table_size, direct_table_size);
+
+   tce_table_size /= direct_table_size;
+   tce_table_size <<= 3;
+   tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size);
+   }
+
+   return bytes;
+}
+
 static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
int num)
 {
@@ -2115,6 +2148,7 @@ static void pnv_ioda2_release_ownership(struct 
iommu_table_group *table_group)
 }
 
 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
+   .get_table_size = pnv_pci_ioda2_get_table_size,
.create_table = pnv_pci_ioda2_create_table,
.set_window = pnv_pci_ioda2_set_window,
.unset_window = pnv_pci_ioda2_unset_window,
@@ -2219,6 +2253,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, 
__u64 bus_offset,
page_shift);
tbl-it_level_size = 1ULL  (level_shift - 3);
tbl-it_indirect_levels = levels - 1;
+   tbl->it_allocated_size = tce_table_allocated;
 
pr_devel(Created TCE table: ws=%08llx ts=%lx @%08llx\n,
window_size, tce_table_size, bus_offset);
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 10/34] vfio: powerpc/spapr: Disable DMA mappings on disabled container

2015-05-29 Thread Alexey Kardashevskiy
At the moment DMA map/unmap requests are handled irrespective of
the container's state. This allows the user space to pin memory which
it might not be allowed to pin.

This adds checks to MAP/UNMAP that the container is enabled, otherwise
-EPERM is returned.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 40583f9..e21479c 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -318,6 +318,9 @@ static long tce_iommu_ioctl(void *iommu_data,
struct iommu_table *tbl = container-tbl;
unsigned long tce;
 
+   if (!container-enabled)
+   return -EPERM;
+
if (!tbl)
return -ENXIO;
 
@@ -362,6 +365,9 @@ static long tce_iommu_ioctl(void *iommu_data,
struct vfio_iommu_type1_dma_unmap param;
struct iommu_table *tbl = container-tbl;
 
+   if (!container-enabled)
+   return -EPERM;
+
if (WARN_ON(!tbl))
return -ENXIO;
 
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 03/34] powerpc/powernv/ioda: Clean up IOMMU group registration

2015-05-29 Thread Alexey Kardashevskiy
The existing code has 3 calls to iommu_register_group() and
all 3 branches actually cover all possible cases.

This replaces 3 calls with one and moves the registration earlier;
the latter will make more sense when we add TCE table sharing.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 28 
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9a77f3c..8ca7abd 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1784,6 +1784,9 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
if (WARN_ON(pe-tce32_seg = 0))
return;
 
+   tbl = pe->tce32_table;
+   iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
+
/* Grab a 32-bit TCE table */
pe-tce32_seg = base;
pe_info(pe,  Setting up 32-bit TCE table at %08x..%08x\n,
@@ -1818,7 +1821,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
}
 
/* Setup linux iommu table */
-   tbl = pe-tce32_table;
pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
  base  28, IOMMU_PAGE_SHIFT_4K);
 
@@ -1840,8 +1842,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
iommu_init_table(tbl, phb-hose-node);
 
if (pe-flags  PNV_IODA_PE_DEV) {
-   iommu_register_group(tbl, phb-hose-global_number,
-pe-pe_number);
/*
 * Setting table base here only for carrying iommu_group
 * further down to let iommu_add_device() do the job.
@@ -1849,14 +1849,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
 */
set_iommu_table_base(pe-pdev-dev, tbl);
iommu_add_device(pe-pdev-dev);
-   } else if (pe-flags  (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
-   iommu_register_group(tbl, phb-hose-global_number,
-pe-pe_number);
+   } else if (pe-flags  (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
pnv_ioda_setup_bus_dma(pe, pe-pbus);
-   } else if (pe-flags  PNV_IODA_PE_VF) {
-   iommu_register_group(tbl, phb-hose-global_number,
-pe-pe_number);
-   }
 
return;
  fail:
@@ -1923,6 +1917,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
if (WARN_ON(pe-tce32_seg = 0))
return;
 
+   tbl = pe->tce32_table;
+   iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
+
/* The PE will reserve all possible 32-bits space */
pe-tce32_seg = 0;
end = (1  ilog2(phb-ioda.m32_pci_base));
@@ -1954,7 +1951,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
}
 
/* Setup linux iommu table */
-   tbl = pe-tce32_table;
pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
IOMMU_PAGE_SHIFT_4K);
 
@@ -1974,8 +1970,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
iommu_init_table(tbl, phb-hose-node);
 
if (pe-flags  PNV_IODA_PE_DEV) {
-   iommu_register_group(tbl, phb-hose-global_number,
-pe-pe_number);
/*
 * Setting table base here only for carrying iommu_group
 * further down to let iommu_add_device() do the job.
@@ -1983,14 +1977,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
 */
set_iommu_table_base(pe-pdev-dev, tbl);
iommu_add_device(pe-pdev-dev);
-   } else if (pe-flags  (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
-   iommu_register_group(tbl, phb-hose-global_number,
-pe-pe_number);
+   } else if (pe-flags  (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
pnv_ioda_setup_bus_dma(pe, pe-pbus);
-   } else if (pe-flags  PNV_IODA_PE_VF) {
-   iommu_register_group(tbl, phb-hose-global_number,
-pe-pe_number);
-   }
 
/* Also create a bypass window */
if (!pnv_iommu_bypass_disabled)
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 20/34] powerpc/powernv/ioda2: Move TCE kill register address to PE

2015-05-29 Thread Alexey Kardashevskiy
At the moment the DMA setup code looks for the ibm,opal-tce-kill
property which contains the TCE kill register address. Writing to
this register invalidates TCE cache on IODA/IODA2 hub.

This moves the register address from iommu_table to pnv_phb as this
register belongs to the PHB and invalidates the TCE cache for all tables of
all attached PEs.

This moves the property reading/remapping code to a helper which is
called when DMA is being configured for PE and which does DMA setup
for both IODA1 and IODA2.

This adds a new pnv_pci_ioda2_tce_invalidate_entire() helper which
invalidates cache for the entire table. It should be called after
every call to opal_pci_map_pe_dma_window(). It was not required before
because there was just a single TCE table and 64bit DMA was handled via
bypass window (which has no table so no cache was used) but this is going
to change with Dynamic DMA windows (DDW).

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v11:
* s/pnv_pci_ioda2_tvt_invalidate/pnv_pci_ioda2_tce_invalidate_entire/g
(cannot think of better-and-shorter name)
* moved tce_inval_reg_phys/tce_inval_reg to pnv_phb

v10:
* fixed error from checkpatch.pl
* removed comment at ibm,opal-tce-kill parsing as irrelevant
* s/addr/val/ in pnv_pci_ioda2_tvt_invalidate() as it was not a kernel address

v9:
* new in the series
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 66 ++-
 arch/powerpc/platforms/powernv/pci.h  |  7 +++-
 2 files changed, 44 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 1d0bb5b..3fd8b18 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1679,8 +1679,8 @@ static void pnv_pci_ioda1_tce_invalidate(struct 
iommu_table *tbl,
struct pnv_ioda_pe *pe = container_of(tgl-table_group,
struct pnv_ioda_pe, table_group);
__be64 __iomem *invalidate = rm ?
-   (__be64 __iomem *)pe->tce_inval_reg_phys :
-   (__be64 __iomem *)tbl->it_index;
+   (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
+   pe->phb->ioda.tce_inval_reg;
unsigned long start, end, inc;
const unsigned shift = tbl->it_page_shift;
 
@@ -1751,6 +1751,19 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
.get = pnv_tce_get,
 };
 
+static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
+{
+   /* 01xb - invalidate TCEs that match the specified PE# */
+   unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF);
+   struct pnv_phb *phb = pe->phb;
+
+   if (!phb->ioda.tce_inval_reg)
+   return;
+
+   mb(); /* Ensure above stores are visible */
+   __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
+}
+
 static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
unsigned long index, unsigned long npages, bool rm)
 {
@@ -1761,8 +1774,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct 
iommu_table *tbl,
struct pnv_ioda_pe, table_group);
unsigned long start, end, inc;
__be64 __iomem *invalidate = rm ?
-   (__be64 __iomem *)pe->tce_inval_reg_phys :
-   (__be64 __iomem *)tbl->it_index;
+   (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
+   pe->phb->ioda.tce_inval_reg;
const unsigned shift = tbl->it_page_shift;
 
/* We'll invalidate DMA address in PE scope */
@@ -1820,7 +1833,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 {
 
struct page *tce_mem = NULL;
-   const __be64 *swinvp;
struct iommu_table *tbl;
unsigned int i;
int64_t rc;
@@ -1877,20 +1889,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
  base << 28, IOMMU_PAGE_SHIFT_4K);
 
/* OPAL variant of P7IOC SW invalidated TCEs */
-   swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
-   if (swinvp) {
-   /* We need a couple more fields -- an address and a data
-* to or.  Since the bus is only printed out on table free
-* errors, and on the first pass the data will be a relative
-* bus number, print that out instead.
-*/
-   pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
-   tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
-   8);
+   if (phb->ioda.tce_inval_reg)
tbl->it_type |= (TCE_PCI_SWINV_CREATE |
 TCE_PCI_SWINV_FREE   |
 TCE_PCI_SWINV_PAIR);
-   }
+
tbl->it_ops = &pnv_ioda1_iommu_ops;
iommu_init_table(tbl, phb->hose->node);
 
@@ -1971,12 +1974,24 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = 
{
 };
 #endif
 
+static void

[PATCH kernel v11 25/34] powerpc/powernv/ioda2: Introduce helpers to allocate TCE pages

2015-05-29 Thread Alexey Kardashevskiy
This is a part of moving TCE table allocation into an iommu_ops
callback to support multiple IOMMU groups per one VFIO container.

This moves the code which allocates the actual TCE tables to helpers:
pnv_pci_ioda2_table_alloc_pages() and pnv_pci_ioda2_table_free_pages().
These do not allocate/free the iommu_table struct.

This enforces window size to be a power of two.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v10:
* removed @table_group parameter from pnv_pci_create_table as it was not used
* removed *tce_table_allocated from pnv_alloc_tce_table_pages()
* pnv_pci_create_table/pnv_pci_free_table renamed to
pnv_pci_ioda2_table_alloc_pages/pnv_pci_ioda2_table_free_pages and moved
back to pci-ioda.c as these only allocate pages for IODA2 and there is
no chance they will be reused for IODA1/P5IOC2
* shortened subject line

v9:
* moved helpers to the common powernv pci.c file from pci-ioda.c
* moved bits from pnv_pci_create_table() to pnv_alloc_tce_table_pages()
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 82 +++
 1 file changed, 62 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 0e88241..3d29fe3 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -49,6 +49,8 @@
 /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
 #define TCE32_TABLE_SIZE   ((0x10000000 / 0x1000) * 8)
 
+static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
+
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
const char *fmt, ...)
 {
@@ -1313,8 +1315,8 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev 
*dev, struct pnv_ioda_pe
iommu_group_put(pe->table_group.group);
BUG_ON(pe->table_group.group);
}
+   pnv_pci_ioda2_table_free_pages(tbl);
iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
-   free_pages(addr, get_order(TCE32_TABLE_SIZE));
 }
 
 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
@@ -2032,13 +2034,62 @@ static void pnv_pci_ioda_setup_opal_tce_kill(struct 
pnv_phb *phb)
phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8);
 }
 
-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
-  struct pnv_ioda_pe *pe)
+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
 {
struct page *tce_mem = NULL;
+   __be64 *addr;
+   unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
+
+   tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
+   if (!tce_mem) {
+   pr_err("Failed to allocate a TCE memory, order=%d\n", order);
+   return NULL;
+   }
+   addr = page_address(tce_mem);
+   memset(addr, 0, 1UL << (order + PAGE_SHIFT));
+
+   return addr;
+}
+
+static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
+   __u32 page_shift, __u64 window_size, struct iommu_table *tbl)
+{
void *addr;
+   const unsigned window_shift = ilog2(window_size);
+   unsigned entries_shift = window_shift - page_shift;
+   unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT);
+   const unsigned long tce_table_size = 1UL << table_shift;
+
+   if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
+   return -EINVAL;
+
+   /* Allocate TCE table */
+   addr = pnv_pci_ioda2_table_do_alloc_pages(nid, table_shift);
+   if (!addr)
+   return -ENOMEM;
+
+   /* Setup linux iommu table */
+   pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
+   page_shift);
+
+   pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
+   window_size, tce_table_size, bus_offset);
+
+   return 0;
+}
+
+static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
+{
+   if (!tbl->it_size)
+   return;
+
+   free_pages(tbl->it_base, get_order(tbl->it_size << 3));
+}
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+  struct pnv_ioda_pe *pe)
+{
struct iommu_table *tbl;
-   unsigned int tce_table_size, end;
int64_t rc;
 
/* We shouldn't already have a 32-bit DMA associated */
@@ -2055,24 +2106,16 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
 
/* The PE will reserve all possible 32-bits space */
pe->tce32_seg = 0;
-   end = (1 << ilog2(phb->ioda.m32_pci_base));
-   tce_table_size = (end / 0x1000) * 8;
pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
-   end);
+   phb->ioda.m32_pci_base);
 
-   /* Allocate TCE table */
-   tce_mem

[PATCH kernel v11 27/34] powerpc/powernv: Implement multilevel TCE tables

2015-05-29 Thread Alexey Kardashevskiy
TCE tables might get too big in case of 4K IOMMU pages and DDW enabled
on huge guests (hundreds of GB of RAM) so the kernel might be unable to
allocate contiguous chunk of physical memory to store the TCE table.

To address this, POWER8 CPU (actually, IODA2) supports multi-level
TCE tables, up to 5 levels which splits the table into a tree of
smaller subtables.

This adds multi-level TCE tables support to
pnv_pci_ioda2_table_alloc_pages() and pnv_pci_ioda2_table_free_pages()
helpers.
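
For illustration, a rough sketch of the level geometry (not part of the
patch; the helper name is made up): each level block is (1 << shift) bytes,
i.e. (1 << (shift - 3)) eight-byte TCEs or next-level pointers, and levels
are added until the window's entries fit.

/* Hypothetical helper: estimate how many levels a window needs. */
static unsigned tce_levels_needed(unsigned long window_size,
		unsigned page_shift, unsigned shift)
{
	unsigned long entries = window_size >> page_shift;
	unsigned long per_level = 1UL << (shift - 3);
	unsigned levels = 1;

	while (per_level < entries && levels < 5 /* IODA2 maximum */) {
		entries = (entries + per_level - 1) / per_level;
		levels++;
	}
	return levels;
}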

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v10:
* fixed multiple comments received for v9

v9:
* moved from ioda2 to common powernv pci code
* fixed cleanup if allocation fails in a middle
* removed check for the size - all boundary checks happen in the calling code
anyway
---
 arch/powerpc/include/asm/iommu.h  |  2 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 98 ---
 arch/powerpc/platforms/powernv/pci.c  | 13 
 3 files changed, 104 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 4636734..706cfc0 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -96,6 +96,8 @@ struct iommu_pool {
 struct iommu_table {
unsigned long  it_busno; /* Bus number this table belongs to */
unsigned long  it_size;  /* Size of iommu table in entries */
+   unsigned long  it_indirect_levels;
+   unsigned long  it_level_size;
unsigned long  it_offset;/* Offset into global table */
unsigned long  it_base;  /* mapped address of tce table */
unsigned long  it_index; /* which iommu table this is */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index fda01c1..68ffc7a 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -49,6 +49,9 @@
 /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
 #define TCE32_TABLE_SIZE   ((0x10000000 / 0x1000) * 8)
 
+#define POWERNV_IOMMU_DEFAULT_LEVELS   1
+#define POWERNV_IOMMU_MAX_LEVELS   5
+
 static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
 
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
@@ -1975,6 +1978,8 @@ static long pnv_pci_ioda2_set_window(struct 
iommu_table_group *table_group,
table_group);
struct pnv_phb *phb = pe-phb;
int64_t rc;
+   const unsigned long size = tbl->it_indirect_levels ?
+   tbl->it_level_size : tbl->it_size;
const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
const __u64 win_size = tbl->it_size << tbl->it_page_shift;
 
@@ -1989,9 +1994,9 @@ static long pnv_pci_ioda2_set_window(struct 
iommu_table_group *table_group,
rc = opal_pci_map_pe_dma_window(phb-opal_id,
pe->pe_number,
pe->pe_number << 1,
-   1,
+   tbl->it_indirect_levels + 1,
__pa(tbl->it_base),
-   tbl->it_size << 3,
+   size << 3,
IOMMU_PAGE_SIZE(tbl));
if (rc) {
pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
@@ -2071,11 +2076,19 @@ static void pnv_pci_ioda_setup_opal_tce_kill(struct 
pnv_phb *phb)
phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8);
 }
 
-static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift,
+   unsigned levels, unsigned long limit,
+   unsigned long *tce_table_allocated)
 {
struct page *tce_mem = NULL;
-   __be64 *addr;
+   __be64 *addr, *tmp;
unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
+   unsigned long local_allocated = 1UL << (order + PAGE_SHIFT);
+   unsigned entries = 1UL << (shift - 3);
+   long i;
+
+   if (*tce_table_allocated >= limit)
+   return NULL;
 
tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
if (!tce_mem) {
@@ -2083,31 +2096,69 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int 
nid, unsigned shift)
return NULL;
}
addr = page_address(tce_mem);
-   memset(addr, 0, 1UL << (order + PAGE_SHIFT));
+   memset(addr, 0, local_allocated);
+
+   --levels;
+   if (!levels) {
+   *tce_table_allocated += local_allocated;
+   return addr;
+   }
+
+   for (i = 0; i < entries; ++i) {
+   tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift,
+   levels, limit, tce_table_allocated);
+   if (!tmp)
+   break;
+
+   addr[i] = cpu_to_be64(__pa(tmp) |
+   TCE_PCI_READ | TCE_PCI_WRITE);
+   }
 
return addr

[PATCH kernel v11 00/34] powerpc/iommu/vfio: Enable Dynamic DMA windows

2015-05-29 Thread Alexey Kardashevskiy

This enables sPAPR defined feature called Dynamic DMA windows (DDW).

Each Partitionable Endpoint (IOMMU group) has an address range on a PCI bus
where devices are allowed to do DMA. These ranges are called DMA windows.
By default, there is a single DMA window, 1 or 2GB big, mapped at zero
on a PCI bus.

Hi-speed devices may suffer from the limited size of the window.
The recent host kernels use a TCE bypass window on POWER8 CPU which implements
direct PCI bus address range mapping (with offset of 1<<59) to the host memory.

For guests, PAPR defines a DDW RTAS API which allows pseries guests
querying the hypervisor about DDW support and capabilities (page size mask
for now). A pseries guest may request an additional (to the default)
DMA windows using this RTAS API.
The existing pseries Linux guests request an additional window as big as
the guest RAM and map the entire guest window which effectively creates
direct mapping of the guest memory to a PCI bus.

The multiple DMA windows feature is supported by POWER7/POWER8 CPUs; however
this patchset only adds support for POWER8 as TCE tables are implemented
in POWER7 in quite a different way and POWER7 is not the highest priority.

This patchset reworks PPC64 IOMMU code and adds necessary structures
to support big windows.

Once a Linux guest discovers the presence of DDW, it does:
1. query hypervisor about number of available windows and page size masks;
2. create a window with the biggest possible page size (today 4K/64K/16M);
3. map the entire guest RAM via H_PUT_TCE* hypercalls;
4. switches dma_ops to direct_dma_ops on the selected PE.

Once this is done, H_PUT_TCE is not called anymore for 64bit devices and
the guest does not waste time on DMA map/unmap operations.

Note that 32bit devices won't use DDW and will keep using the default
DMA window so KVM optimizations will be required (to be posted later).

This is pushed to g...@github.com:aik/linux.git
 + 0ea0348...93b347697 vfio-for-github -> vfio-for-github (forced update)

The pushed branch contains all patches from this patchset and KVM
acceleration patches as well to give an idea about the current state
of in-kernel acceleration support.


Please comment. Thanks!


Changes:
v11:
* reworked locking in pinned pages cache

v10:
* fixed & tested on SRIOV system
* fixed multiple comments from David
* added bunch of iommu device attachment reworks

v9:
* rebased on top of SRIOV (which is in upstream now)
* fixed multiple comments from David
* reworked ownership patches
* removed vfio: powerpc/spapr: Do cleanup when releasing the group (used to be 
#2)
as updated #1 should do this
* moved powerpc/powernv: Implement accessor to TCE entry to a separate patch
* added a patch which moves TCE Kill register address to PE from IOMMU table

v8:
* fixed a bug in error fallback in powerpc/mmu: Add userspace-to-physical
addresses translation cache
* fixed subject in vfio: powerpc/spapr: Check that IOMMU page is fully
contained by system page
* moved v2 documentation to the correct patch
* added checks for failed vzalloc() in powerpc/iommu: Add userspace view
of TCE table

v7:
* moved memory preregistration to the current process's MMU context
* added code preventing unregistration if some pages are still mapped;
for this, there is a userspace view of the table is stored in iommu_table
* added locked_vm counting for DDW tables (including userspace view of those)

v6:
* fixed a bunch of errors in vfio: powerpc/spapr: Support Dynamic DMA windows
* moved static IOMMU properties from iommu_table_group to iommu_table_group_ops

v5:
* added SPAPR_TCE_IOMMU_v2 to tell the userspace that there is a memory
pre-registration feature
* added backward compatibility
* renamed a few things (mostly powerpc_iommu -> iommu_table_group)

v4:
* moved patches around to have VFIO and PPC patches separated as much as
possible
* now works with the existing upstream QEMU

v3:
* redesigned the whole thing
* multiple IOMMU groups per PHB - one PHB is needed for VFIO in the guest -
no problems with locked_vm counting; also we save memory on actual tables
* guest RAM preregistration is required for DDW
* PEs (IOMMU groups) are passed to VFIO with no DMA windows at all so
we do not bother with iommu_table::it_map anymore
* added multilevel TCE tables support to support really huge guests

v2:
* added missing __pa() in powerpc/powernv: Release replaced TCE
* reposted to make some noise




Alexey Kardashevskiy (34):
  powerpc/eeh/ioda2: Use device::iommu_group to check IOMMU group
  powerpc/iommu/powernv: Get rid of set_iommu_table_base_and_group
  powerpc/powernv/ioda: Clean up IOMMU group registration
  powerpc/iommu: Put IOMMU group explicitly
  powerpc/iommu: Always release iommu_table in iommu_free_table()
  vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU
driver
  vfio: powerpc/spapr: Check that IOMMU page is fully contained by
system page
  vfio: powerpc/spapr: Use it_page_size
  vfio: powerpc/spapr: Move locked_vm

[PATCH kernel v11 23/34] powerpc/iommu/powernv: Release replaced TCE

2015-05-29 Thread Alexey Kardashevskiy
At the moment writing new TCE value to the IOMMU table fails with EBUSY
if there is a valid entry already. However PAPR specification allows
the guest to write new TCE value without clearing it first.

Another problem this patch is addressing is the use of pool locks for
external IOMMU users such as VFIO. The pool locks are to protect
DMA page allocator rather than entries and since the host kernel does
not control what pages are in use, there is no point in pool locks and
exchange()+put_page(oldtce) is sufficient to avoid possible races.

This adds an exchange() callback to iommu_table_ops which does the same
thing as set() plus it returns replaced TCE and DMA direction so
the caller can release the pages afterwards. The exchange() receives
a physical address unlike set() which receives linear mapping address;
and returns a physical address as the clear() does.

This implements exchange() for P5IOC2/IODA/IODA2. This adds a requirement
for a platform to have exchange() implemented in order to support VFIO.

This replaces iommu_tce_build() and iommu_clear_tce() with
a single iommu_tce_xchg().

This makes sure that TCE permission bits are not set in TCE passed to
IOMMU API as those are to be calculated by platform code from
DMA direction.

This moves SetPageDirty() to the IOMMU code to make it work for both
the VFIO ioctl interface and in-kernel TCE acceleration (when the latter
becomes available later).
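
As a rough illustration of the calling convention described above (a sketch,
not code from this patch; the helper name is made up), the caller passes the
new host physical address and direction in, gets the old ones back, and then
releases the replaced page itself:

/* Sketch of a caller of the new exchange API; error handling trimmed. */
static void replace_tce_and_release(struct iommu_table *tbl,
		unsigned long entry, unsigned long new_hpa,
		enum dma_data_direction new_dir)
{
	unsigned long hpa = new_hpa;		/* in: new TCE, out: old TCE */
	enum dma_data_direction dir = new_dir;	/* in: new dir, out: old dir */

	if (iommu_tce_xchg(tbl, entry, &hpa, &dir))
		return;

	if (dir != DMA_NONE)			/* the old entry was valid */
		put_page(pfn_to_page(hpa >> PAGE_SHIFT));
}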

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
---
Changes:
v10:
* did s/tce/hpa/ in iommu_table_ops::exchange and tce_iommu_unuse_page()
* removed permission bits check from iommu_tce_put_param_check as
permission bits are not allowed in the address
* added BUG_ON(*hpa  ~IOMMU_PAGE_MASK(tbl)) to pnv_tce_xchg()

v9:
* changed exchange() to work with physical addresses as these addresses
are never accessed by the code and physical addresses are actual values
we put into the IOMMU table
---
 arch/powerpc/include/asm/iommu.h| 22 --
 arch/powerpc/kernel/iommu.c | 59 +--
 arch/powerpc/platforms/powernv/pci-ioda.c   | 34 
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  3 ++
 arch/powerpc/platforms/powernv/pci.c| 18 +
 arch/powerpc/platforms/powernv/pci.h|  2 +
 drivers/vfio/vfio_iommu_spapr_tce.c | 63 +
 7 files changed, 132 insertions(+), 69 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 489133c..4636734 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -45,13 +45,29 @@ extern int iommu_is_off;
 extern int iommu_force_on;
 
 struct iommu_table_ops {
+   /*
+* When called with direction==DMA_NONE, it is equal to clear().
+* uaddr is a linear map address.
+*/
int (*set)(struct iommu_table *tbl,
long index, long npages,
unsigned long uaddr,
enum dma_data_direction direction,
struct dma_attrs *attrs);
+#ifdef CONFIG_IOMMU_API
+   /*
+* Exchanges existing TCE with new TCE plus direction bits;
+* returns old TCE and DMA direction mask.
+* @tce is a physical address.
+*/
+   int (*exchange)(struct iommu_table *tbl,
+   long index,
+   unsigned long *hpa,
+   enum dma_data_direction *direction);
+#endif
void (*clear)(struct iommu_table *tbl,
long index, long npages);
+   /* get() returns a physical address */
unsigned long (*get)(struct iommu_table *tbl, long index);
void (*flush)(struct iommu_table *tbl);
 };
@@ -153,6 +169,8 @@ extern void iommu_register_group(struct iommu_table_group 
*table_group,
 extern int iommu_add_device(struct device *dev);
 extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
+extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
+   unsigned long *hpa, enum dma_data_direction *direction);
 #else
 static inline void iommu_register_group(struct iommu_table_group *table_group,
int pci_domain_number,
@@ -225,10 +243,6 @@ extern int iommu_tce_clear_param_check(struct iommu_table 
*tbl,
unsigned long npages);
 extern int iommu_tce_put_param_check(struct iommu_table *tbl,
unsigned long ioba, unsigned long tce);
-extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
-   unsigned long hwaddr, enum dma_data_direction direction);
-extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
-   unsigned long entry);
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
 extern int iommu_take_ownership(struct iommu_table *tbl

[PATCH kernel v11 26/34] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window

2015-05-29 Thread Alexey Kardashevskiy
This is a part of moving DMA window programming to an iommu_ops
callback. pnv_pci_ioda2_set_window() takes an iommu_table_group as
a first parameter (not pnv_ioda_pe) as it is going to be used as
a callback for VFIO DDW code.

This adds pnv_pci_ioda2_tvt_invalidate() to invalidate TVT as it is
a good thing to do. It does not have immediate effect now as the table
is never recreated after reboot but it will in the following patches.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v11:
* replaced some 1<<it_page_shift with IOMMU_PAGE_SIZE() macro

v9:
* initialize pe->table_group.tables[0] at the very end when
tbl is fully initialized
* moved pnv_pci_ioda2_tvt_invalidate() from earlier patch
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 47 +--
 1 file changed, 38 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3d29fe3..fda01c1 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1968,6 +1968,43 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
}
 }
 
+static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
+   int num, struct iommu_table *tbl)
+{
+   struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
+   struct pnv_phb *phb = pe->phb;
+   int64_t rc;
+   const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
+   const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+
+   pe_info(pe, "Setting up window %llx..%llx pg=%x\n",
+   start_addr, start_addr + win_size - 1,
+   IOMMU_PAGE_SIZE(tbl));
+
+   /*
+* Map TCE table through TVT. The TVE index is the PE number
+* shifted by 1 bit for 32-bits DMA space.
+*/
+   rc = opal_pci_map_pe_dma_window(phb->opal_id,
+   pe->pe_number,
+   pe->pe_number << 1,
+   1,
+   __pa(tbl->it_base),
+   tbl->it_size << 3,
+   IOMMU_PAGE_SIZE(tbl));
+   if (rc) {
+   pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
+   return rc;
+   }
+
+   pnv_pci_link_table_and_group(phb->hose->node, num,
+   tbl, &pe->table_group);
+   pnv_pci_ioda2_tce_invalidate_entire(pe);
+
+   return 0;
+}
+
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
uint16_t window_id = (pe->pe_number << 1 ) + 1;
@@ -2123,21 +2160,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
pe->table_group.ops = &pnv_pci_ioda2_ops;
 #endif
 
-   /*
-* Map TCE table through TVT. The TVE index is the PE number
-* shifted by 1 bit for 32-bits DMA space.
-*/
-   rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-   pe->pe_number << 1, 1, __pa(tbl->it_base),
-   tbl->it_size << 3, 1ULL << tbl->it_page_shift);
+   rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
if (rc) {
pe_err(pe, "Failed to configure 32-bit TCE table,"
" err %ld\n", rc);
goto fail;
}
 
-   pnv_pci_ioda2_tce_invalidate_entire(pe);
-
/* OPAL variant of PHB3 invalidated TCEs */
if (phb->ioda.tce_inval_reg)
tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 24/34] powerpc/powernv/ioda2: Rework iommu_table creation

2015-05-29 Thread Alexey Kardashevskiy
This moves iommu_table creation to the beginning to make following changes
easier to review. This starts using table parameters from the iommu_table
struct.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v9:
* updated commit log and did minor cleanup
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index d7ac2d4..0e88241 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2070,13 +2070,23 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
addr = page_address(tce_mem);
memset(addr, 0, tce_table_size);
 
+   /* Setup linux iommu table */
+   pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
+   IOMMU_PAGE_SHIFT_4K);
+
+   tbl->it_ops = &pnv_ioda2_iommu_ops;
+   iommu_init_table(tbl, phb->hose->node);
+#ifdef CONFIG_IOMMU_API
+   pe->table_group.ops = &pnv_pci_ioda2_ops;
+#endif
+
/*
 * Map TCE table through TVT. The TVE index is the PE number
 * shifted by 1 bit for 32-bits DMA space.
 */
rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-   pe->pe_number << 1, 1, __pa(addr),
-   tce_table_size, 0x1000);
+   pe->pe_number << 1, 1, __pa(tbl->it_base),
+   tbl->it_size << 3, 1ULL << tbl->it_page_shift);
if (rc) {
pe_err(pe, "Failed to configure 32-bit TCE table,"
" err %ld\n", rc);
@@ -2085,20 +2095,10 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
 
pnv_pci_ioda2_tce_invalidate_entire(pe);
 
-   /* Setup linux iommu table */
-   pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
-   IOMMU_PAGE_SHIFT_4K);
-
/* OPAL variant of PHB3 invalidated TCEs */
if (phb->ioda.tce_inval_reg)
tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
 
-   tbl->it_ops = &pnv_ioda2_iommu_ops;
-   iommu_init_table(tbl, phb->hose->node);
-#ifdef CONFIG_IOMMU_API
-   pe->table_group.ops = &pnv_pci_ioda2_ops;
-#endif
-
if (pe->flags & PNV_IODA_PE_DEV) {
/*
 * Setting table base here only for carrying iommu_group
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 22/34] powerpc/powernv: Implement accessor to TCE entry

2015-05-29 Thread Alexey Kardashevskiy
This replaces direct accesses to the TCE table with a helper which
returns a TCE entry address. This makes no difference now but will
when multi-level TCE tables get introduced.

No change in behavior is expected.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v9:
* new patch in the series to separate this mechanical change from
functional changes; this is not right before
powerpc/powernv: Implement multilevel TCE tables but here in order
to let the next patch - powerpc/iommu/powernv: Release replaced TCE -
use pnv_tce() and avoid changing the same code twice
---
 arch/powerpc/platforms/powernv/pci.c | 34 +-
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index 4b4c583..b2a32d0 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -572,38 +572,46 @@ struct pci_ops pnv_pci_ops = {
.write = pnv_pci_write_config,
 };
 
+static __be64 *pnv_tce(struct iommu_table *tbl, long idx)
+{
+   __be64 *tmp = ((__be64 *)tbl->it_base);
+
+   return tmp + idx;
+}
+
 int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
unsigned long uaddr, enum dma_data_direction direction,
struct dma_attrs *attrs)
 {
u64 proto_tce = iommu_direction_to_tce_perm(direction);
-   __be64 *tcep;
-   u64 rpn;
+   u64 rpn = __pa(uaddr) >> tbl->it_page_shift;
+   long i;
 
-   tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
-   rpn = __pa(uaddr) >> tbl->it_page_shift;
-
-   while (npages--)
-   *(tcep++) = cpu_to_be64(proto_tce |
-   (rpn++ << tbl->it_page_shift));
+   for (i = 0; i < npages; i++) {
+   unsigned long newtce = proto_tce |
+   ((rpn + i) << tbl->it_page_shift);
+   unsigned long idx = index - tbl->it_offset + i;
 
+   *(pnv_tce(tbl, idx)) = cpu_to_be64(newtce);
+   }
 
return 0;
 }
 
 void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 {
-   __be64 *tcep;
+   long i;
 
-   tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
+   for (i = 0; i < npages; i++) {
+   unsigned long idx = index - tbl->it_offset + i;
 
-   while (npages--)
-   *(tcep++) = cpu_to_be64(0);
+   *(pnv_tce(tbl, idx)) = cpu_to_be64(0);
+   }
 }
 
 unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
 {
-   return ((u64 *)tbl->it_base)[index - tbl->it_offset];
+   return *(pnv_tce(tbl, index - tbl->it_offset));
 }
 
 struct iommu_table *pnv_pci_table_alloc(int nid)
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 06/34] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver

2015-05-29 Thread Alexey Kardashevskiy
This moves page pinning (get_user_pages_fast()/put_page()) code out of
the platform IOMMU code and puts it to VFIO IOMMU driver where it belongs
to as the platform code does not deal with page pinning.

This makes iommu_take_ownership()/iommu_release_ownership() deal with
the IOMMU table bitmap only.

This removes page unpinning from iommu_take_ownership() as the actual
TCE table might contain garbage and doing put_page() on it is undefined
behaviour.

Besides the last part, the rest of the patch is mechanical.
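
Roughly, the pinning that moves into the VFIO driver follows the sequence
below (a schematic sketch only; the helper name is made up and the real code
lives in vfio_iommu_spapr_tce.c): pin the user page, build the TCE from its
host address, and drop the reference again if the table update fails.

static long tce_pin_and_build(struct iommu_table *tbl,
		unsigned long entry, unsigned long tce)
{
	enum dma_data_direction direction = iommu_tce_direction(tce);
	unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
	struct page *page = NULL;
	unsigned long hva;
	long ret;

	if (get_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE, &page) != 1)
		return -EFAULT;

	hva = (unsigned long) page_address(page) + offset;
	ret = iommu_tce_build(tbl, entry, hva, direction);
	if (ret)
		put_page(page);		/* undo the pin on failure */

	return ret;
}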

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v9:
* added missing tce_iommu_clear call after iommu_release_ownership()
* brought @offset (a local variable) back to make patch even more
mechanical

v4:
* s/iommu_tce_build(tbl, entry + 1/iommu_tce_build(tbl, entry + i/
---
 arch/powerpc/include/asm/iommu.h|  4 --
 arch/powerpc/kernel/iommu.c | 55 -
 drivers/vfio/vfio_iommu_spapr_tce.c | 80 +++--
 3 files changed, 67 insertions(+), 72 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 8353c86..e94a5e3 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -194,10 +194,6 @@ extern int iommu_tce_build(struct iommu_table *tbl, 
unsigned long entry,
unsigned long hwaddr, enum dma_data_direction direction);
 extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
unsigned long entry);
-extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-   unsigned long entry, unsigned long pages);
-extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
-   unsigned long entry, unsigned long tce);
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
 extern int iommu_take_ownership(struct iommu_table *tbl);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 73eb39a..0019c80 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -986,30 +986,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, 
unsigned long entry)
 }
 EXPORT_SYMBOL_GPL(iommu_clear_tce);
 
-int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-   unsigned long entry, unsigned long pages)
-{
-   unsigned long oldtce;
-   struct page *page;
-
-   for ( ; pages; --pages, ++entry) {
-   oldtce = iommu_clear_tce(tbl, entry);
-   if (!oldtce)
-   continue;
-
-   page = pfn_to_page(oldtce >> PAGE_SHIFT);
-   WARN_ON(!page);
-   if (page) {
-   if (oldtce & TCE_PCI_WRITE)
-   SetPageDirty(page);
-   put_page(page);
-   }
-   }
-
-   return 0;
-}
-EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
-
 /*
  * hwaddr is a kernel virtual address here (0xc... bazillion),
  * tce_build converts it to a physical address.
@@ -1039,35 +1015,6 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned 
long entry,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_build);
 
-int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
-   unsigned long tce)
-{
-   int ret;
-   struct page *page = NULL;
-   unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
-   enum dma_data_direction direction = iommu_tce_direction(tce);
-
-   ret = get_user_pages_fast(tce & PAGE_MASK, 1,
-   direction != DMA_TO_DEVICE, &page);
-   if (unlikely(ret != 1)) {
-   /* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
-   tce, entry << tbl->it_page_shift, ret); */
-   return -EFAULT;
-   }
-   hwaddr = (unsigned long) page_address(page) + offset;
-
-   ret = iommu_tce_build(tbl, entry, hwaddr, direction);
-   if (ret)
-   put_page(page);
-
-   if (ret < 0)
-   pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
-   __func__, entry << tbl->it_page_shift, tce, ret);
-
-   return ret;
-}
-EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
-
 int iommu_take_ownership(struct iommu_table *tbl)
 {
unsigned long sz = (tbl-it_size + 7)  3;
@@ -1081,7 +1028,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
}
 
memset(tbl-it_map, 0xff, sz);
-   iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size);
 
/*
 * Disable iommu bypass, otherwise the user can DMA to all of
@@ -1099,7 +1045,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
 {
unsigned long sz = (tbl-it_size + 7)  3;
 
-   iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size

[PATCH kernel v11 13/34] powerpc/powernv: Do not set read flag if direction==DMA_NONE

2015-05-29 Thread Alexey Kardashevskiy
Normally a bitmap from the iommu_table is used to track what TCE entry
is in use. Since we are going to use iommu_table without its locks and
do xchg() instead, it becomes essential not to put bits which are not
implied in the direction flag as the old TCE value (more precisely -
the permission bits) will be used to decide whether to put the page or not.

This adds iommu_direction_to_tce_perm() (its counterpart is there already)
and uses it for powernv's pnv_tce_build().

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v9:
* added comment why we must put only valid permission bits
---
 arch/powerpc/include/asm/iommu.h |  1 +
 arch/powerpc/kernel/iommu.c  | 15 +++
 arch/powerpc/platforms/powernv/pci.c |  7 +--
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index e94a5e3..d91bd69 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -200,6 +200,7 @@ extern int iommu_take_ownership(struct iommu_table *tbl);
 extern void iommu_release_ownership(struct iommu_table *tbl);
 
 extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
+extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0019c80..ac2f959 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -866,6 +866,21 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t 
size,
}
 }
 
+unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
+{
+   switch (dir) {
+   case DMA_BIDIRECTIONAL:
+   return TCE_PCI_READ | TCE_PCI_WRITE;
+   case DMA_FROM_DEVICE:
+   return TCE_PCI_WRITE;
+   case DMA_TO_DEVICE:
+   return TCE_PCI_READ;
+   default:
+   return 0;
+   }
+}
+EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
+
 #ifdef CONFIG_IOMMU_API
 /*
  * SPAPR TCE API
diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index bca2aeb..b7ea245 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -576,15 +576,10 @@ static int pnv_tce_build(struct iommu_table *tbl, long 
index, long npages,
 unsigned long uaddr, enum dma_data_direction direction,
 struct dma_attrs *attrs, bool rm)
 {
-   u64 proto_tce;
+   u64 proto_tce = iommu_direction_to_tce_perm(direction);
__be64 *tcep, *tces;
u64 rpn;
 
-   proto_tce = TCE_PCI_READ; // Read allowed
-
-   if (direction != DMA_TO_DEVICE)
-   proto_tce |= TCE_PCI_WRITE;
-
tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
rpn = __pa(uaddr) >> tbl->it_page_shift;
 
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 14/34] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table

2015-05-29 Thread Alexey Kardashevskiy
This adds an iommu_table_ops struct and puts a pointer to it into
the iommu_table struct. This moves tce_build/tce_free/tce_get/tce_flush
callbacks from ppc_md to the new struct where they really belong to.

This adds the requirement for @it_ops to be initialized before calling
iommu_init_table() to make sure that we do not leave any IOMMU table
with iommu_table_ops uninitialized. This is not a parameter of
iommu_init_table() though as there will be cases when iommu_init_table()
will not be called on TCE tables, for example - VFIO.

This does s/tce_build/set/, s/tce_free/clear/ and removes tce_
redundant prefixes.

This removes tce_xxx_rm handlers from ppc_md but does not add
them to iommu_table_ops as this will be done later if we decide to
support TCE hypercalls in real mode. This removes _vm callbacks as
only virtual mode is supported by now so this also removes @rm parameter.

For pSeries, this always uses tce_buildmulti_pSeriesLP/
tce_buildmulti_pSeriesLP. This changes multi callback to fall back to
tce_build_pSeriesLP/tce_free_pSeriesLP if FW_FEATURE_MULTITCE is not
present. The reason for this is we still have to support multitce=off
boot parameter in disable_multitce() and we do not want to walk through
all IOMMU tables in the system and replace multi callbacks with single
ones.

For powernv, this defines _ops per PHB type which are P5IOC2/IODA1/IODA2.
This makes the callbacks for them public. Later patches will extend
callbacks for IODA1/2.

No change in behaviour is expected.
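
A minimal sketch of what the new contract looks like for a platform (the
names below are placeholders, not code from this patch): the ops struct has
to be attached before iommu_init_table() is called.

/* Hypothetical platform glue; the callback names are made up. */
static struct iommu_table_ops my_platform_tce_ops = {
	.set   = my_tce_set,	/* was ppc_md.tce_build */
	.clear = my_tce_clear,	/* was ppc_md.tce_free  */
	.get   = my_tce_get,
};

static void my_platform_setup_table(struct iommu_table *tbl, int nid)
{
	tbl->it_ops = &my_platform_tce_ops;	/* must be set before init */
	iommu_init_table(tbl, nid);
}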

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v9:
* pnv_tce_build/pnv_tce_free/pnv_tce_get have been made public and lost
rm parameters to make following patches simpler (realmode is not
supported here anyway)
* got rid of _vm versions of callbacks
---
 arch/powerpc/include/asm/iommu.h| 17 +++
 arch/powerpc/include/asm/machdep.h  | 25 ---
 arch/powerpc/kernel/iommu.c | 46 ++--
 arch/powerpc/kernel/vio.c   |  5 +++
 arch/powerpc/platforms/cell/iommu.c |  8 +++--
 arch/powerpc/platforms/pasemi/iommu.c   |  7 +++--
 arch/powerpc/platforms/powernv/pci-ioda.c   | 14 +
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  7 +
 arch/powerpc/platforms/powernv/pci.c| 47 +
 arch/powerpc/platforms/powernv/pci.h|  5 +++
 arch/powerpc/platforms/pseries/iommu.c  | 34 -
 arch/powerpc/sysdev/dart_iommu.c| 12 +---
 12 files changed, 116 insertions(+), 111 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index d91bd69..e2a45c3 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -44,6 +44,22 @@
 extern int iommu_is_off;
 extern int iommu_force_on;
 
+struct iommu_table_ops {
+   int (*set)(struct iommu_table *tbl,
+   long index, long npages,
+   unsigned long uaddr,
+   enum dma_data_direction direction,
+   struct dma_attrs *attrs);
+   void (*clear)(struct iommu_table *tbl,
+   long index, long npages);
+   unsigned long (*get)(struct iommu_table *tbl, long index);
+   void (*flush)(struct iommu_table *tbl);
+};
+
+/* These are used by VIO */
+extern struct iommu_table_ops iommu_table_lpar_multi_ops;
+extern struct iommu_table_ops iommu_table_pseries_ops;
+
 /*
  * IOMAP_MAX_ORDER defines the largest contiguous block
  * of dma space we can get.  IOMAP_MAX_ORDER = 13
@@ -78,6 +94,7 @@ struct iommu_table {
 #ifdef CONFIG_IOMMU_API
struct iommu_group *it_group;
 #endif
+   struct iommu_table_ops *it_ops;
void (*set_bypass)(struct iommu_table *tbl, bool enable);
 #ifdef CONFIG_PPC_POWERNV
void   *data;
diff --git a/arch/powerpc/include/asm/machdep.h 
b/arch/powerpc/include/asm/machdep.h
index ef889943..ab721b4 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -65,31 +65,6 @@ struct machdep_calls {
 * destroyed as well */
void(*hpte_clear_all)(void);
 
-   int (*tce_build)(struct iommu_table *tbl,
-long index,
-long npages,
-unsigned long uaddr,
-enum dma_data_direction direction,
-struct dma_attrs *attrs);
-   void(*tce_free)(struct iommu_table *tbl,
-   long index,
-   long npages);
-   unsigned long   (*tce_get)(struct iommu_table *tbl,
-   long index);
-   void(*tce_flush)(struct iommu_table *tbl

[PATCH kernel v11 19/34] powerpc/iommu: Fix IOMMU ownership control functions

2015-05-29 Thread Alexey Kardashevskiy
This adds missing locks in iommu_take_ownership()/
iommu_release_ownership().

This marks all pages busy in iommu_table::it_map in order to catch
errors if there is an attempt to use this table while ownership over it
is taken.

This only clears TCE content if there is no page marked busy in it_map.
Clearing must be done outside of the table locks as iommu_clear_tce()
called from iommu_clear_tces_and_put_pages() does this.

In order to use bitmap_empty(), the existing code clears bit#0 which
is set even in an empty table if it is bus-mapped at 0 as
iommu_init_table() reserves page#0 to prevent buggy drivers
from crashing when allocated page is bus-mapped at zero
(which is correct). This restores the bit in the case of failure
to bring the it_map to the state it was in when we called
iommu_take_ownership().

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v9:
* iommu_table_take_ownership() did not return @ret (and ignored EBUSY),
now it does return correct error.
* updated commit log about setting bit#0 in the case of failure

v5:
* do not store bit#0 value, it has to be set for zero-based table
anyway
* removed test_and_clear_bit
---
 arch/powerpc/kernel/iommu.c | 30 +-
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index b6a397a..1ae1034 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1035,31 +1035,51 @@ EXPORT_SYMBOL_GPL(iommu_tce_build);
 
 int iommu_take_ownership(struct iommu_table *tbl)
 {
-   unsigned long sz = (tbl->it_size + 7) >> 3;
+   unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+   int ret = 0;
+
+   spin_lock_irqsave(&tbl->large_pool.lock, flags);
+   for (i = 0; i < tbl->nr_pools; i++)
+   spin_lock(&tbl->pools[i].lock);
 
if (tbl->it_offset == 0)
clear_bit(0, tbl->it_map);
 
if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
pr_err("iommu_tce: it_map is not empty");
-   return -EBUSY;
+   ret = -EBUSY;
+   /* Restore bit#0 set by iommu_init_table() */
+   if (tbl->it_offset == 0)
+   set_bit(0, tbl->it_map);
+   } else {
+   memset(tbl->it_map, 0xff, sz);
}
 
-   memset(tbl->it_map, 0xff, sz);
+   for (i = 0; i < tbl->nr_pools; i++)
+   spin_unlock(&tbl->pools[i].lock);
+   spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 
-   return 0;
+   return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_take_ownership);
 
 void iommu_release_ownership(struct iommu_table *tbl)
 {
-   unsigned long sz = (tbl->it_size + 7) >> 3;
+   unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+
+   spin_lock_irqsave(&tbl->large_pool.lock, flags);
+   for (i = 0; i < tbl->nr_pools; i++)
+   spin_lock(&tbl->pools[i].lock);
 
memset(tbl->it_map, 0, sz);
 
/* Restore bit#0 set by iommu_init_table() */
if (tbl->it_offset == 0)
set_bit(0, tbl->it_map);
+
+   for (i = 0; i < tbl->nr_pools; i++)
+   spin_unlock(&tbl->pools[i].lock);
+   spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
 
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 02/34] powerpc/iommu/powernv: Get rid of set_iommu_table_base_and_group

2015-05-29 Thread Alexey Kardashevskiy
The set_iommu_table_base_and_group() name suggests that the function
sets the table base and adds a device to an IOMMU group.

The actual purpose of the table base setting is to put a reference
into a device so that iommu_add_device() can later retrieve the IOMMU
group reference and add the device to the group.

At the moment a group cannot be explicitly passed to iommu_add_device()
as we want it to work from the bus notifier; we can fix this later and
remove the confusing calls of set_iommu_table_base().

This replaces set_iommu_table_base_and_group() with a couple of
set_iommu_table_base() + iommu_add_device() which makes reading the code
easier.

This adds a few comments explaining why set_iommu_table_base() and
iommu_add_device() are called where they are called.

For IODA1/2, this essentially removes iommu_add_device() call from
the pnv_pci_ioda_dma_dev_setup() as it will always fail at this particular
place:
- for physical PE, the device is already attached by iommu_add_device()
in pnv_pci_ioda_setup_dma_pe();
- for virtual PE, the sysfs entries are not ready to create all symlinks
so actual adding is happening in tce_iommu_bus_notifier.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v10:
* new to the series
---
 arch/powerpc/include/asm/iommu.h|  7 ---
 arch/powerpc/platforms/powernv/pci-ioda.c   | 27 +++
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  3 ++-
 arch/powerpc/platforms/pseries/iommu.c  | 15 ---
 4 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 1e27d63..8353c86 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -140,13 +140,6 @@ static inline int __init tce_iommu_bus_notifier_init(void)
 }
 #endif /* !CONFIG_IOMMU_API */
 
-static inline void set_iommu_table_base_and_group(struct device *dev,
- void *base)
-{
-   set_iommu_table_base(dev, base);
-   iommu_add_device(dev);
-}
-
 extern int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
struct scatterlist *sglist, int nelems,
unsigned long mask,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 2f092bb..9a77f3c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1598,7 +1598,13 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb 
*phb, struct pci_dev *pdev
 
pe = &phb->ioda.pe_array[pdn->pe_number];
WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
-   set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table);
+   set_iommu_table_base(&pdev->dev, pe->tce32_table);
+   /*
+* Note: iommu_add_device() will fail here as
+* for physical PE: the device is already added by now;
+* for virtual PE: sysfs entries are not ready yet and
+* tce_iommu_bus_notifier will add the device to a group later.
+*/
 }
 
 static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
@@ -1659,7 +1665,8 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
struct pci_dev *dev;
 
list_for_each_entry(dev, &bus->devices, bus_list) {
-   set_iommu_table_base_and_group(&dev->dev, pe->tce32_table);
+   set_iommu_table_base(&dev->dev, pe->tce32_table);
+   iommu_add_device(&dev->dev);
 
if (dev->subordinate)
pnv_ioda_setup_bus_dma(pe, dev->subordinate);
@@ -1835,7 +1842,13 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
if (pe->flags & PNV_IODA_PE_DEV) {
iommu_register_group(tbl, phb->hose->global_number,
 pe->pe_number);
-   set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
+   /*
+* Setting table base here only for carrying iommu_group
+* further down to let iommu_add_device() do the job.
+* pnv_pci_ioda_dma_dev_setup will override it later anyway.
+*/
+   set_iommu_table_base(&pe->pdev->dev, tbl);
+   iommu_add_device(&pe->pdev->dev);
} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
iommu_register_group(tbl, phb->hose->global_number,
 pe->pe_number);
@@ -1963,7 +1976,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
if (pe->flags & PNV_IODA_PE_DEV) {
iommu_register_group(tbl, phb->hose->global_number,
 pe->pe_number);
-   set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
+   /*
+* Setting table base here only for carrying iommu_group
+* further down to let iommu_add_device() do the job

[PATCH kernel v11 01/34] powerpc/eeh/ioda2: Use device::iommu_group to check IOMMU group

2015-05-29 Thread Alexey Kardashevskiy
This relies on the fact that a PCI device always has an IOMMU table
which may not be the case when we get dynamic DMA windows so
let's use more reliable check for IOMMU group here.

As we do not rely on the table presence here, remove the workaround
from pnv_pci_ioda2_set_bypass(); also remove the @add_to_iommu_group
parameter from pnv_ioda_setup_bus_dma().

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Acked-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 arch/powerpc/kernel/eeh.c |  4 +---
 arch/powerpc/platforms/powernv/pci-ioda.c | 27 +--
 2 files changed, 6 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9ee61d1..defd874 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1412,13 +1412,11 @@ static int dev_has_iommu_table(struct device *dev, void 
*data)
 {
struct pci_dev *pdev = to_pci_dev(dev);
struct pci_dev **ppdev = data;
-   struct iommu_table *tbl;
 
if (!dev)
return 0;
 
-   tbl = get_iommu_table_base(dev);
-   if (tbl && tbl->it_group) {
+   if (dev->iommu_group) {
*ppdev = pdev;
return 1;
}
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index f8bc950..2f092bb 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1654,21 +1654,15 @@ static u64 pnv_pci_ioda_dma_get_required_mask(struct 
pnv_phb *phb,
 }
 
 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
-  struct pci_bus *bus,
-  bool add_to_iommu_group)
+  struct pci_bus *bus)
 {
struct pci_dev *dev;
 
list_for_each_entry(dev, &bus->devices, bus_list) {
-   if (add_to_iommu_group)
-   set_iommu_table_base_and_group(&dev->dev,
-  pe->tce32_table);
-   else
-   set_iommu_table_base(&dev->dev, pe->tce32_table);
+   set_iommu_table_base_and_group(&dev->dev, pe->tce32_table);
 
if (dev->subordinate)
-   pnv_ioda_setup_bus_dma(pe, dev->subordinate,
-  add_to_iommu_group);
+   pnv_ioda_setup_bus_dma(pe, dev->subordinate);
}
 }
 
@@ -1845,7 +1839,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
iommu_register_group(tbl, phb->hose->global_number,
 pe->pe_number);
-   pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+   pnv_ioda_setup_bus_dma(pe, pe->pbus);
} else if (pe->flags & PNV_IODA_PE_VF) {
iommu_register_group(tbl, phb->hose->global_number,
 pe->pe_number);
@@ -1882,17 +1876,6 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table 
*tbl, bool enable)
 window_id,
 pe->tce_bypass_base,
 0);
-
-   /*
-* EEH needs the mapping between IOMMU table and group
-* of those VFIO/KVM pass-through devices. We can postpone
-* resetting DMA ops until the DMA mask is configured in
-* host side.
-*/
-   if (pe->pdev)
-   set_iommu_table_base(&pe->pdev->dev, tbl);
-   else
-   pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
}
if (rc)
pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
@@ -1984,7 +1967,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
iommu_register_group(tbl, phb->hose->global_number,
 pe->pe_number);
-   pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+   pnv_ioda_setup_bus_dma(pe, pe->pbus);
} else if (pe->flags & PNV_IODA_PE_VF) {
iommu_register_group(tbl, phb->hose->global_number,
 pe->pe_number);
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 21/34] powerpc/powernv/ioda2: Add TCE invalidation for all attached groups

2015-05-29 Thread Alexey Kardashevskiy
The iommu_table struct keeps a list of IOMMU groups it is used for.
At the moment there is just a single group attached but further
patches will add TCE table sharing. When sharing is enabled, the TCE cache
of every attached PE needs to be invalidated, which is what this patch does.

This does not change pnv_pci_ioda1_tce_invalidate() as there is no plan
to enable TCE table sharing on PHBs older than IODA2.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v10:
* new to the series
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 35 ---
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3fd8b18..94fccc8 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -24,6 +24,7 @@
 #include <linux/msi.h>
 #include <linux/memblock.h>
 #include <linux/iommu.h>
+#include <linux/rculist.h>
 
 #include asm/sections.h
 #include asm/io.h
@@ -1764,23 +1765,15 @@ static inline void 
pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
__raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
 }
 
-static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
-   unsigned long index, unsigned long npages, bool rm)
+static void pnv_pci_ioda2_tce_do_invalidate(unsigned pe_number, bool rm,
+   __be64 __iomem *invalidate, unsigned shift,
+   unsigned long index, unsigned long npages)
 {
-   struct iommu_table_group_link *tgl = list_first_entry_or_null(
-   &tbl->it_group_list, struct iommu_table_group_link,
-   next);
-   struct pnv_ioda_pe *pe = container_of(tgl->table_group,
-   struct pnv_ioda_pe, table_group);
unsigned long start, end, inc;
-   __be64 __iomem *invalidate = rm ?
-   (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
-   pe->phb->ioda.tce_inval_reg;
-   const unsigned shift = tbl->it_page_shift;
 
/* We'll invalidate DMA address in PE scope */
start = 0x2ull << 60;
-   start |= (pe->pe_number & 0xFF);
+   start |= (pe_number & 0xFF);
end = start;
 
/* Figure out the start, end and step */
@@ -1798,6 +1791,24 @@ static void pnv_pci_ioda2_tce_invalidate(struct 
iommu_table *tbl,
}
 }
 
+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
+   unsigned long index, unsigned long npages, bool rm)
+{
+   struct iommu_table_group_link *tgl;
+
+   list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
+   struct pnv_ioda_pe *pe = container_of(tgl->table_group,
+   struct pnv_ioda_pe, table_group);
+   __be64 __iomem *invalidate = rm ?
+   (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
+   pe->phb->ioda.tce_inval_reg;
+
+   pnv_pci_ioda2_tce_do_invalidate(pe->pe_number, rm,
+   invalidate, tbl->it_page_shift,
+   index, npages);
+   }
+}
+
 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
long npages, unsigned long uaddr,
enum dma_data_direction direction,
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 33/34] vfio: powerpc/spapr: Register memory and define IOMMU v2

2015-05-29 Thread Alexey Kardashevskiy
The existing implementation accounts the whole DMA window in
the locked_vm counter. This is going to be worse with multiple
containers and huge DMA windows. Also, real-time accounting would require
additional tracking of accounted pages due to the page size difference -
IOMMU uses 4K pages and system uses 4K or 64K pages.

Another issue is that actual pages pinning/unpinning happens on every
DMA map/unmap request. This does not affect the performance much now as
we spend way too much time on switching context between
guest/userspace/host, but it will start to matter when we add in-kernel
DMA map/unmap acceleration.

This introduces a new IOMMU type for SPAPR - VFIO_SPAPR_TCE_v2_IOMMU.
New IOMMU deprecates VFIO_IOMMU_ENABLE/VFIO_IOMMU_DISABLE and introduces
2 new ioctls to register/unregister DMA memory -
VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY -
which receive user space address and size of a memory region which
needs to be pinned/unpinned and counted in locked_vm.
New IOMMU splits physical pages pinning and TCE table update
into 2 different operations. It requires:
1) guest pages to be registered first
2) subsequent map/unmap requests to work only with pre-registered memory.
For the default single window case this means that the entire guest
(instead of 2GB) needs to be pinned before using VFIO.
When a huge DMA window is added, no additional pinning will be
required, otherwise it would be guest RAM + 2GB.

The new memory registration ioctls are not supported by
VFIO_SPAPR_TCE_IOMMU. Dynamic DMA window and in-kernel acceleration
will require memory to be preregistered in order to work.

The accounting is done per user process.

This advertises v2 SPAPR TCE IOMMU and restricts what the userspace
can do with v1 or v2 IOMMUs.

In order to support memory pre-registration, we need a way to track
the use of every registered memory region and only allow unregistration
if a region is not in use anymore. So we need a way to tell which region
a just-cleared TCE came from.

This adds a userspace view of the TCE table into iommu_table struct.
It contains userspace addresses, one per TCE entry. The table is only
allocated when the ownership over an IOMMU group is taken which means
it is only used from outside of the powernv code (such as VFIO).
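
For illustration only (this sketch is not part of the patch; the fd and
buffer variables are made up), the intended userspace flow with the v2
IOMMU is to register a memory region once and then map/unmap within it:

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* "container_fd" is a VFIO container with a group attached and
 * VFIO_SPAPR_TCE_v2_IOMMU selected via VFIO_SET_IOMMU. */
static int sketch_preregister(int container_fd, void *buf, unsigned long size)
{
	struct vfio_iommu_spapr_register_memory reg = {
		.argsz = sizeof(reg),
		.flags = 0,
		.vaddr = (unsigned long) buf,
		.size = size,
	};

	/* pins the pages and accounts them in locked_vm exactly once */
	return ioctl(container_fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
}

Subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls then work only
within such pre-registered regions; VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY
undoes the registration and is refused while the memory is still mapped.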

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
---
Changes:
v11:
* mm_iommu_put() does not return a code so this does not check it
* moved v2 in tce_container to pack the struct

v10:
* moved it_userspace allocation to vfio_iommu_spapr_tce as it is a
VFIO-specific thing
* squashed powerpc/iommu: Add userspace view of TCE table into this as
it is
a part of IOMMU v2
* s/tce_iommu_use_page_v2/tce_iommu_prereg_ua_to_hpa/
* fixed some function names to have tce_iommu_ in the beginning rather
just tce_
* as mm_iommu_mapped_inc() can now fail, check for the return code

v9:
* s/tce_get_hva_cached/tce_iommu_use_page_v2/

v7:
* now memory is registered per mm (i.e. process)
* moved memory registration code to powerpc/mmu
* merged vfio: powerpc/spapr: Define v2 IOMMU into this
* limited new ioctls to v2 IOMMU
* updated doc
* unsupported ioclts return -ENOTTY instead of -EPERM

v6:
* tce_get_hva_cached() returns hva via a pointer

v4:
* updated docs
* s/kzmalloc/vzalloc/
* in tce_pin_pages()/tce_unpin_pages() removed @vaddr, @size and
replaced offset with index
* renamed vfio_iommu_type_register_memory to vfio_iommu_spapr_register_memory
and removed duplicating vfio_iommu_spapr_register_memory
---
 Documentation/vfio.txt  |  31 ++-
 arch/powerpc/include/asm/iommu.h|   6 +
 drivers/vfio/vfio_iommu_spapr_tce.c | 512 ++--
 include/uapi/linux/vfio.h   |  27 ++
 4 files changed, 487 insertions(+), 89 deletions(-)

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 96978ec..7dcf2b5 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -289,10 +289,12 @@ PPC64 sPAPR implementation note
 
 This implementation has some specifics:
 
-1) Only one IOMMU group per container is supported as an IOMMU group
-represents the minimal entity which isolation can be guaranteed for and
-groups are allocated statically, one per a Partitionable Endpoint (PE)
+1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
+container is supported as an IOMMU table is allocated at the boot time,
+one table per a IOMMU group which is a Partitionable Endpoint (PE)
 (PE is often a PCI domain but not always).
+Newer systems (POWER8 with IODA2) have improved hardware design which allows
+to remove this limitation and have multiple IOMMU groups per a VFIO container.
 
 2) The hardware supports so called DMA windows - the PCI address range
 within which DMA transfer is allowed, any attempt to access address space
@@ -427,6 +429,29 @@ The code flow from the example above should be slightly 
changed

[PATCH kernel v11 32/34] powerpc/mmu: Add userspace-to-physical addresses translation cache

2015-05-29 Thread Alexey Kardashevskiy
We are adding support for DMA memory pre-registration to be used in
conjunction with VFIO. The idea is that the userspace which is going to
run a guest may want to pre-register a user space memory region so
it all gets pinned once and never goes away. Having this done,
a hypervisor will not have to pin/unpin pages on every DMA map/unmap
request. This is going to help with multiple pinning of the same memory.

Another use of it is in-kernel real mode (mmu off) acceleration of
DMA requests where real time translation of guest physical to host
physical addresses is non-trivial and may fail as linux ptes may be
temporarily invalid. Also, having cached host physical addresses
(compared to just pinning at the start and then walking the page table
again on every H_PUT_TCE), we can be sure that the addresses which we put
into TCE table are the ones we already pinned.

This adds a list of memory regions to mm_context_t. Each region consists
of a header and a list of physical addresses. This adds API to:
1. register/unregister memory regions;
2. do final cleanup (which puts all pre-registered pages);
3. do userspace to physical address translation;
4. manage usage counters; multiple registration of the same memory
is allowed (once per container).

This implements 2 counters per registered memory region:
- @mapped: incremented on every DMA mapping; decremented on unmapping;
initialized to 1 when a region is just registered; once it becomes zero,
no more mappings are allowed;
- @used: incremented on every register ioctl; decremented on
unregister; unregistration is allowed for DMA mapped regions unless
it is the very last reference. For the very last reference this checks
that the region is still mapped and returns -EBUSY so the userspace
gets to know that memory is still pinned and unregistration needs to
be retried; @used remains 1.

Host physical addresses are stored in vmalloc'ed array. In order to
access these in the real mode (mmu off), there is a real_vmalloc_addr()
helper. In-kernel acceleration patchset will move it from KVM to MMU code.
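
As a rough sketch only (not part of the patch; the wrapper name is made up),
a mapping path is expected to use this API roughly like this:

/* Translate a userspace address within a pre-registered region into
 * a host physical address and pin the region against unregistration. */
static long sketch_prereg_ua_to_hpa(unsigned long ua, unsigned long size,
		unsigned long *hpa)
{
	struct mm_iommu_table_group_mem_t *mem;

	mem = mm_iommu_lookup(ua, size);	/* must be registered already */
	if (!mem)
		return -EINVAL;

	if (mm_iommu_ua_to_hpa(mem, ua, hpa))
		return -EFAULT;

	return mm_iommu_mapped_inc(mem);	/* fails if being released */
}

The matching mm_iommu_mapped_dec(mem) is called when the TCE pointing into
the region is cleared.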

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v11:
* added mutex to protect adding and removing
* added mm_iommu_init() helper
* kref is removed, now there are an atomic counter (@mapped) and a mutex
(for @used)
* merged mm_iommu_alloc into mm_iommu_get and do check-and-alloc under
one mutex lock; mm_iommu_get() returns old @used value so the caller can
know if it needs to elevate locked_vm counter
* do locked_vm counting in mmu_context_hash64_iommu.c

v10:
* split mm_iommu_mapped_update into mm_iommu_mapped_dec + mm_iommu_mapped_inc
* mapped counter now keep one reference for itself and mm_iommu_mapped_inc()
can tell if the region is being released
* updated commit log

v8:
* s/mm_iommu_table_group_mem_t/struct mm_iommu_table_group_mem_t/
* fixed error fallback look (s/[i]/[j]/)
---
 arch/powerpc/include/asm/mmu-hash64.h  |   3 +
 arch/powerpc/include/asm/mmu_context.h |  16 ++
 arch/powerpc/kernel/setup_64.c |   3 +
 arch/powerpc/mm/Makefile   |   1 +
 arch/powerpc/mm/mmu_context_hash64.c   |   6 +
 arch/powerpc/mm/mmu_context_hash64_iommu.c | 297 +
 6 files changed, 326 insertions(+)
 create mode 100644 arch/powerpc/mm/mmu_context_hash64_iommu.c

diff --git a/arch/powerpc/include/asm/mmu-hash64.h 
b/arch/powerpc/include/asm/mmu-hash64.h
index 1da6a81..a82f534 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -536,6 +536,9 @@ typedef struct {
/* for 4K PTE fragment support */
void *pte_frag;
 #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+   struct list_head iommu_group_mem_list;
+#endif
 } mm_context_t;
 
 
diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 73382eb..70a4f2a 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -16,6 +16,22 @@
  */
 extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
 extern void destroy_context(struct mm_struct *mm);
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+struct mm_iommu_table_group_mem_t;
+
+extern bool mm_iommu_preregistered(void);
+extern long mm_iommu_get(unsigned long ua, unsigned long entries,
+   struct mm_iommu_table_group_mem_t **pmem);
+extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
+extern void mm_iommu_init(mm_context_t *ctx);
+extern void mm_iommu_cleanup(mm_context_t *ctx);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
+   unsigned long size);
+extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+   unsigned long ua, unsigned long *hpa);
+extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
+extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
+#endif
 
 extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next

[PATCH kernel v11 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table()

2015-05-29 Thread Alexey Kardashevskiy
At the moment iommu_free_table() only releases memory if
the table was initialized for platform code use, i.e. it had
it_map initialized (whose purpose is to track DMA memory space use).

With dynamic DMA windows, we will need to be able to release
an iommu_table even if it was used for VFIO, in which case it_map is NULL;
this is what the patch does.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v11:
* fixed parameter checks
---
 arch/powerpc/kernel/iommu.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 3d47eb3..73eb39a 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -713,9 +713,11 @@ void iommu_free_table(struct iommu_table *tbl, const char 
*node_name)
unsigned long bitmap_sz;
unsigned int order;
 
-   if (!tbl || !tbl->it_map) {
-   printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
-   node_name);
+   if (!tbl)
+   return;
+
+   if (!tbl->it_map) {
+   kfree(tbl);
return;
}
 
-- 
2.4.0.rc3.8.gfb3e7d5



[PATCH kernel v11 17/34] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group

2015-05-29 Thread Alexey Kardashevskiy
Modern IBM POWERPC systems support multiple (currently two) TCE tables
per IOMMU group (a.k.a. PE). This adds an iommu_table_group container
for TCE tables. Right now just one table is supported.

For IODA, instead of embedding iommu_table, the new iommu_table_group
keeps pointers to those. The iommu_table structs are allocated
dynamically now by a pnv_pci_table_alloc() helper as PCI hotplug
code (for EEH recovery) and SRIOV are supported there.

For P5IOC2, both iommu_table_group and iommu_table are embedded into
PE struct. As there is no EEH and SRIOV support for P5IOC2,
iommu_free_table() should not be called on iommu_table struct pointers
so we can keep it embedded in pnv_phb::p5ioc2.

For pSeries, this replaces multiple calls of kzalloc_node() with a new
iommu_pseries_group_alloc() helper and stores the table group struct
pointer into the pci_dn struct. For release, a iommu_table_group_free()
helper is added.

This moves iommu_table struct allocation from SR-IOV code to
the generic DMA initialization code in pnv_pci_ioda2_setup_dma_pe.

This replaces a single pointer to iommu_group with a list of
iommu_table_group structs. For now it is just a single iommu_table_group
in this list but later with TCE table sharing enabled, the list will
keep all the IOMMU groups which use the particular table. The list
uses iommu_table_group_link structs rather than iommu_table_group::next
because a VFIO container may have 2 IOMMU tables and each table needs its
own list head: the list is mainly for the TCE invalidation code, which has
to walk through all attached groups and invalidate their TCE caches, so
each table has to keep its own list head pointer. The other option would
be to store the list head in a VFIO container, but that would not work as
the platform code (which does TCE table updates and invalidation) has
no idea about VFIO.

This should cause no behavioural change.
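
For illustration (a simplified sketch, not code from this patch; the helper
name is made up), attaching a group to a table via a link object looks like:

/* Add a table_group to tbl's it_group_list so that TCE invalidation can
 * later walk every attached group (PE) using this table. */
static long sketch_link_table_and_group(struct iommu_table *tbl,
		struct iommu_table_group *table_group)
{
	struct iommu_table_group_link *tgl;

	tgl = kzalloc(sizeof(*tgl), GFP_KERNEL);
	if (!tgl)
		return -ENOMEM;

	tgl->table_group = table_group;
	list_add_rcu(&tgl->next, &tbl->it_group_list);

	table_group->tables[0] = tbl;
	return 0;
}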

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v10:
* iommu_table is not embedded into iommu_table_group but allocated
dynamically
* iommu_table allocation is moved to a single place for IODA2's
pnv_pci_ioda_setup_dma_pe where it belongs to
* added list of groups into iommu_table; most of the code just looks at
the first item to keep the patch simpler

v9:
* s/it_group/it_table_group/
* added and used iommu_table_group_free(), from now iommu_free_table()
is only used for VIO
* added iommu_pseries_group_alloc()
* squashed powerpc/iommu: Introduce iommu_table_alloc() helper into this
---
 arch/powerpc/include/asm/iommu.h|   8 +-
 arch/powerpc/kernel/iommu.c |   9 +-
 arch/powerpc/platforms/powernv/pci-ioda.c   |  45 ++
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |   3 +
 arch/powerpc/platforms/powernv/pci.c|  76 +
 arch/powerpc/platforms/powernv/pci.h|   7 ++
 arch/powerpc/platforms/pseries/iommu.c  |  33 +++-
 drivers/vfio/vfio_iommu_spapr_tce.c | 122 
 8 files changed, 242 insertions(+), 61 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 5a7267f..44a20cc 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -91,7 +91,7 @@ struct iommu_table {
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map;   /* A simple allocation bitmap for now */
unsigned long  it_page_shift;/* table iommu page size */
-   struct iommu_table_group *it_table_group;
+   struct list_head it_group_list;/* List of iommu_table_group_link */
struct iommu_table_ops *it_ops;
void (*set_bypass)(struct iommu_table *tbl, bool enable);
 };
@@ -126,6 +126,12 @@ extern struct iommu_table *iommu_init_table(struct 
iommu_table * tbl,
int nid);
 #define IOMMU_TABLE_GROUP_MAX_TABLES   1
 
+struct iommu_table_group_link {
+   struct list_head next;
+   struct rcu_head rcu;
+   struct iommu_table_group *table_group;
+};
+
 struct iommu_table_group {
struct iommu_group *group;
struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 719f048..e305a8f 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1078,6 +1078,7 @@ EXPORT_SYMBOL_GPL(iommu_release_ownership);
 int iommu_add_device(struct device *dev)
 {
struct iommu_table *tbl;
+   struct iommu_table_group_link *tgl;
 
/*
 * The sysfs entries should be populated before
@@ -1095,15 +1096,17 @@ int iommu_add_device(struct device *dev)
}
 
tbl = get_iommu_table_base(dev);
-   if (!tbl || !tbl->it_table_group || !tbl->it_table_group->group) {
+   if (!tbl || list_empty(&tbl->it_group_list

[PATCH kernel v11 15/34] powerpc/powernv/ioda/ioda2: Rework TCE invalidation in tce_build()/tce_free()

2015-05-29 Thread Alexey Kardashevskiy
The pnv_pci_ioda_tce_invalidate() helper invalidates TCE cache. It is
supposed to be called on IODA1/2 and not called on p5ioc2. It receives
start and end host addresses of TCE table.

IODA2 actually needs PCI addresses to invalidate the cache. Those
can be calculated from host addresses but since we are going
to implement multi-level TCE tables, calculating PCI address from
a host address might get either tricky or ugly as TCE table remains flat
on PCI bus but not in RAM.

This moves pnv_pci_ioda_tce_invalidate() from generic pnv_tce_build/
pnv_tce_free and defines IODA1/2-specific callbacks which call generic
ones and do PHB-model-specific TCE cache invalidation. P5IOC2 keeps
using generic callbacks as before.

This changes pnv_pci_ioda2_tce_invalidate() to receive a TCE index and
number of pages which are PCI addresses shifted by IOMMU page shift.

No change in behaviour is expected.
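
For illustration (a sketch, not the patch code): with @index already
including tbl->it_offset, the PCI bus address range to invalidate follows
directly from the TCE index, the number of pages and the IOMMU page shift:

static void sketch_index_to_bus_range(struct iommu_table *tbl,
		unsigned long index, unsigned long npages,
		unsigned long *start, unsigned long *end)
{
	const unsigned shift = tbl->it_page_shift;

	*start = index << shift;		/* bus address of the first TCE */
	*end = (index + npages - 1) << shift;	/* bus address of the last TCE */
}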

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v11:
* changed type of some ret to int as everywhere else

v10:
* moved before Switch from iommu_table to new iommu_table_group as it adds
list of groups to iommu_table and tce invalidation depends on it

v9:
* removed confusing comment from commit log about unintentional calling of
pnv_pci_ioda_tce_invalidate()
* moved mechanical changes away to powerpc/iommu: Move tce_xxx callbacks from 
ppc_md to iommu_table
* fixed bug with broken invalidation in pnv_pci_ioda2_tce_invalidate -
@index includes @tbl-it_offset but old code added it anyway which later broke
DDW
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 81 ++-
 arch/powerpc/platforms/powernv/pci.c  | 17 ++-
 2 files changed, 61 insertions(+), 37 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 2924abe..3d32c37 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1678,18 +1678,19 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe 
*pe,
}
 }
 
-static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
-struct iommu_table *tbl,
-__be64 *startp, __be64 *endp, bool rm)
+static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
+   unsigned long index, unsigned long npages, bool rm)
 {
+   struct pnv_ioda_pe *pe = tbl->data;
__be64 __iomem *invalidate = rm ?
(__be64 __iomem *)pe->tce_inval_reg_phys :
(__be64 __iomem *)tbl->it_index;
unsigned long start, end, inc;
const unsigned shift = tbl->it_page_shift;
 
-   start = __pa(startp);
-   end = __pa(endp);
+   start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
+   end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
+   npages - 1);
 
/* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
if (tbl->it_busno) {
@@ -1725,16 +1726,39 @@ static void pnv_pci_ioda1_tce_invalidate(struct 
pnv_ioda_pe *pe,
 */
 }
 
+static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
+   long npages, unsigned long uaddr,
+   enum dma_data_direction direction,
+   struct dma_attrs *attrs)
+{
+   int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
+   attrs);
+
+   if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
+   pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+
+   return ret;
+}
+
+static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
+   long npages)
+{
+   pnv_tce_free(tbl, index, npages);
+
+   if (tbl->it_type & TCE_PCI_SWINV_FREE)
+   pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+}
+
 static struct iommu_table_ops pnv_ioda1_iommu_ops = {
-   .set = pnv_tce_build,
-   .clear = pnv_tce_free,
+   .set = pnv_ioda1_tce_build,
+   .clear = pnv_ioda1_tce_free,
.get = pnv_tce_get,
 };
 
-static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
-struct iommu_table *tbl,
-__be64 *startp, __be64 *endp, bool rm)
+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
+   unsigned long index, unsigned long npages, bool rm)
 {
+   struct pnv_ioda_pe *pe = tbl->data;
unsigned long start, end, inc;
__be64 __iomem *invalidate = rm ?
(__be64 __iomem *)pe->tce_inval_reg_phys :
@@ -1747,10 +1771,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct 
pnv_ioda_pe *pe,
end = start;
 
/* Figure out the start, end and step */
-   inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64));
-   start |= (inc << shift

[PATCH kernel v11 11/34] vfio: powerpc/spapr: Moving pinning/unpinning to helpers

2015-05-29 Thread Alexey Kardashevskiy
This is a pretty mechanical patch to make next patches simpler.

New tce_iommu_unuse_page() helper does put_page() now but it might skip
that after the memory registering patch applied.

As we are here, this removes unnecessary checks for a value returned
by pfn_to_page() as it cannot possibly return NULL.

This moves tce_iommu_disable() later to let tce_iommu_clear() know if
the container has been enabled because if it has not been, then
put_page() must not be called on TCEs from the TCE table. This situation
is not yet possible but it will after KVM acceleration patchset is
applied.

This changes code to work with physical addresses rather than linear
mapping addresses for better code readability. Following patches will
add an xchg() callback for an IOMMU table which will accept/return
physical addresses (unlike the current tce_build()), which will eliminate
redundant conversions.
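
As a quick illustration of the convention (a sketch, not code added by the
patch): the helpers pass host physical addresses around and recover the
struct page from the pfn only where it is actually needed:

static unsigned long sketch_page_to_hpa(struct page *page)
{
	return __pa((unsigned long) page_address(page));
}

static struct page *sketch_hpa_to_page(unsigned long hpa)
{
	return pfn_to_page(hpa >> PAGE_SHIFT);
}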

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com
---
Changes:
v9:
* changed helpers to work with physical addresses rather than linear
(for simplicity - later ::xchg() will receive physical and avoid
additional convertions)

v6:
* tce_get_hva() returns hva via a pointer
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 61 +
 1 file changed, 41 insertions(+), 20 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index e21479c..115d5e6 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -191,69 +191,90 @@ static void tce_iommu_release(void *iommu_data)
struct tce_container *container = iommu_data;
 
WARN_ON(container->tbl && !container->tbl->it_group);
-   tce_iommu_disable(container);
 
if (container->tbl && container->tbl->it_group)
tce_iommu_detach_group(iommu_data, container->tbl->it_group);
 
+   tce_iommu_disable(container);
mutex_destroy(&container->lock);
 
kfree(container);
 }
 
+static void tce_iommu_unuse_page(struct tce_container *container,
+   unsigned long oldtce)
+{
+   struct page *page;
+
+   if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE)))
+   return;
+
+   page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+   if (oldtce & TCE_PCI_WRITE)
+   SetPageDirty(page);
+
+   put_page(page);
+}
+
 static int tce_iommu_clear(struct tce_container *container,
struct iommu_table *tbl,
unsigned long entry, unsigned long pages)
 {
unsigned long oldtce;
-   struct page *page;
 
for ( ; pages; --pages, ++entry) {
oldtce = iommu_clear_tce(tbl, entry);
if (!oldtce)
continue;
 
-   page = pfn_to_page(oldtce >> PAGE_SHIFT);
-   WARN_ON(!page);
-   if (page) {
-   if (oldtce & TCE_PCI_WRITE)
-   SetPageDirty(page);
-   put_page(page);
-   }
+   tce_iommu_unuse_page(container, oldtce);
}
 
return 0;
 }
 
+static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
+{
+   struct page *page = NULL;
+   enum dma_data_direction direction = iommu_tce_direction(tce);
+
+   if (get_user_pages_fast(tce & PAGE_MASK, 1,
+   direction != DMA_TO_DEVICE, &page) != 1)
+   return -EFAULT;
+
+   *hpa = __pa((unsigned long) page_address(page));
+
+   return 0;
+}
+
 static long tce_iommu_build(struct tce_container *container,
struct iommu_table *tbl,
unsigned long entry, unsigned long tce, unsigned long pages)
 {
long i, ret = 0;
-   struct page *page = NULL;
-   unsigned long hva;
+   struct page *page;
+   unsigned long hpa;
enum dma_data_direction direction = iommu_tce_direction(tce);
 
for (i = 0; i < pages; ++i) {
unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 
-   ret = get_user_pages_fast(tce & PAGE_MASK, 1,
-   direction != DMA_TO_DEVICE, &page);
-   if (unlikely(ret != 1)) {
-   ret = -EFAULT;
+   ret = tce_iommu_use_page(tce, &hpa);
+   if (ret)
break;
-   }
 
+   page = pfn_to_page(hpa >> PAGE_SHIFT);
if (!tce_page_is_contained(page, tbl->it_page_shift)) {
ret = -EPERM;
break;
}
 
-   hva = (unsigned long) page_address(page) + offset;
-
-   ret = iommu_tce_build(tbl, entry + i, hva, direction);
+   hpa |= offset;
+   ret = iommu_tce_build(tbl, entry + i, (unsigned long) __va

Re: [PATCH kernel v6 26/29] vfio: powerpc/spapr: Define v2 IOMMU

2015-03-16 Thread Alexey Kardashevskiy

On 03/17/2015 06:45 AM, Alex Williamson wrote:

On Fri, 2015-03-13 at 19:07 +1100, Alexey Kardashevskiy wrote:

The existing IOMMU requires VFIO_IOMMU_ENABLE call to enable actual use
of the container (i.e. call DMA map/unmap) and this is where we check
the rlimit for locked pages. It assumes that only as much memory
as a default DMA window can be mapped. Every DMA map/unmap request will
do pinning/unpinning of physical pages.

New IOMMU will split physical pages pinning and TCE table update.
It will require guest pages to be registered first and consequent
map/unmap requests to work only with pre-registered memory.
For the default single window case this means that the entire guest
(instead of 2GB) needs to be pinned before using VFIO.
However when a huge DMA window is added, no additional pinning will be
required, otherwise it would be guest RAM + 2GB.

This advertises v2 SPAPR TCE IOMMU and restricts what the userspace
can do with v1 or v2 IOMMUs.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v6:
* enforced limitations imposed by the SPAPR TCE IOMMU version
---
  drivers/vfio/vfio_iommu_spapr_tce.c | 18 +-
  include/uapi/linux/vfio.h   |  2 ++
  2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 9d240b4..e191438 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -95,6 +95,7 @@ struct tce_container {
bool enabled;
unsigned long locked_pages;
struct list_head mem_list;
+   bool v2;
  };

  struct tce_memory {
@@ -398,7 +399,7 @@ static void *tce_iommu_open(unsigned long arg)
  {
struct tce_container *container;

-   if (arg != VFIO_SPAPR_TCE_IOMMU) {
+   if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
pr_err("tce_vfio: Wrong IOMMU type\n");
return ERR_PTR(-EINVAL);
}
@@ -410,6 +411,8 @@ static void *tce_iommu_open(unsigned long arg)
mutex_init(&container->lock);
INIT_LIST_HEAD_RCU(&container->mem_list);

+   container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
+
return container;
  }

@@ -580,6 +583,7 @@ static long tce_iommu_ioctl(void *iommu_data,
case VFIO_CHECK_EXTENSION:
switch (arg) {
case VFIO_SPAPR_TCE_IOMMU:
+   case VFIO_SPAPR_TCE_v2_IOMMU:
ret = 1;
break;
default:
@@ -719,6 +723,9 @@ static long tce_iommu_ioctl(void *iommu_data,
case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
struct vfio_iommu_spapr_register_memory param;

+   if (!container->v2)
+   return -EPERM;
+
minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
size);

@@ -741,6 +748,9 @@ static long tce_iommu_ioctl(void *iommu_data,
case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
struct vfio_iommu_spapr_register_memory param;

+   if (!container->v2)
+   return -EPERM;
+
minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
size);

@@ -761,6 +771,9 @@ static long tce_iommu_ioctl(void *iommu_data,
return 0;
}
case VFIO_IOMMU_ENABLE:
+   if (container->v2)
+   return -EPERM;
+
mutex_lock(&container->lock);
ret = tce_iommu_enable(container);
mutex_unlock(&container->lock);
@@ -768,6 +781,9 @@ static long tce_iommu_ioctl(void *iommu_data,


case VFIO_IOMMU_DISABLE:
+   if (container->v2)
+   return -EPERM;
+
mutex_lock(&container->lock);
tce_iommu_disable(container);
mutex_unlock(&container->lock);



I wouldn't have guessed; nothing in the documentation suggests these
ioctls are deprecated in v2 (ie. please document).  If the ioctl doesn't
exist for the IOMMU type, why not simply break and let it fall out at
-ENOTTY?  Same for the above, v1 would have previously returned -ENOTTY
for those ioctls, why change to -EPERM?



Good points. I'll fix them and merge this patch to vfio: powerpc/spapr: 
Register memory as this is where it actually belongs to. Agree?



Thanks for the review!


--
Alexey


Re: [PATCH kernel v6 29/29] vfio: powerpc/spapr: Support Dynamic DMA windows

2015-03-16 Thread Alexey Kardashevskiy

On 03/17/2015 06:38 AM, Alex Williamson wrote:

On Fri, 2015-03-13 at 19:07 +1100, Alexey Kardashevskiy wrote:

This adds create/remove window ioctls to create and remove DMA windows.
sPAPR defines a Dynamic DMA windows capability which allows
para-virtualized guests to create additional DMA windows on a PCI bus.
The existing linux kernels use this new window to map the entire guest
memory and switch to the direct DMA operations saving time on map/unmap
requests which would normally happen in big amounts.

This adds 2 ioctl handlers - VFIO_IOMMU_SPAPR_TCE_CREATE and
VFIO_IOMMU_SPAPR_TCE_REMOVE - to create and remove windows.
Up to 2 windows are supported now by the hardware and by this driver.

This changes VFIO_IOMMU_SPAPR_TCE_GET_INFO handler to return additional
information such as a number of supported windows and maximum number
levels of TCE tables.

DDW is added as a capability, not as a SPAPR TCE IOMMU v2 unique feature
as we still want to support v2 on platforms which cannot do DDW for
the sake of TCE acceleration in KVM (coming soon).

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v6:
* added explicit VFIO_IOMMU_INFO_DDW flag to vfio_iommu_spapr_tce_info,
it used to be page mask flags from platform code
* added explicit pgsizes field
* added cleanup if tce_iommu_create_window() failed in a middle
* added checks for callbacks in tce_iommu_create_window and remove those
from tce_iommu_remove_window when it is too late to test anyway
* spapr_tce_find_free_table returns sensible error code now
* updated description of VFIO_IOMMU_SPAPR_TCE_CREATE/
VFIO_IOMMU_SPAPR_TCE_REMOVE

v4:
* moved code to tce_iommu_create_window()/tce_iommu_remove_window()
helpers
* added docs
---
  Documentation/vfio.txt  |  19 
  arch/powerpc/include/asm/iommu.h|   2 +-
  drivers/vfio/vfio_iommu_spapr_tce.c | 206 +++-
  include/uapi/linux/vfio.h   |  41 ++-
  4 files changed, 265 insertions(+), 3 deletions(-)

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 791e85c..61ce393 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -446,6 +446,25 @@ the memory block.
  The user space is not expected to call these often and the block descriptors
  are stored in a linked list in the kernel.

+6) sPAPR specification allows guests to have an ddditional DMA window(s) on



s/ddditional/additional/


+a PCI bus with a variable page size. Two ioctls have been added to support
+this: VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE.
+The platform has to support the functionality or error will be returned to
+the userspace. The existing hardware supports up to 2 DMA windows, one is
+2GB long, uses 4K pages and called default 32bit window; the other can
+be as big as entire RAM, use different page size, it is optional - guests
+create those in run-time if the guest driver supports 64bit DMA.
+
+VFIO_IOMMU_SPAPR_TCE_CREATE receives a page shift, a DMA window size and
+a number of TCE table levels (if a TCE table is going to be big enough and
+the kernel may not be able to allocate enough of physicall contiguous memory).


s/physicall/physically/


+It creates a new window in the available slot and returns the bus address where
+the new window starts. Due to hardware limitation, the user space cannot choose
+the location of DMA windows.
+
+VFIO_IOMMU_SPAPR_TCE_REMOVE receives the bus start address of the window
+and removes it.
+
  
---

  [1] VFIO was originally an acronym for Virtual Function I/O in its
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 13145a2..bac02bf 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -138,7 +138,7 @@ extern void iommu_free_table(struct iommu_table *tbl, const 
char *node_name);
  extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
int nid);

-#define IOMMU_TABLE_GROUP_MAX_TABLES   1
+#define IOMMU_TABLE_GROUP_MAX_TABLES   2

  struct iommu_table_group;

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index d94116b..0129a4f 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -600,11 +600,137 @@ static long tce_iommu_build(struct tce_container 
*container,
return ret;
  }

+static int spapr_tce_find_free_table(struct tce_container *container)
+{
+   int i;
+
+   for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+   struct iommu_table *tbl = &container->tables[i];
+
+   if (!tbl->it_size)
+   return i;
+   }
+
+   return -ENOSPC;
+}
+
+static long tce_iommu_create_window(struct tce_container *container,
+   __u32 page_shift, __u64 window_size, __u32 levels,
+   __u64 *start_addr)
+{
+   struct

[PATCH kernel v6 05/29] vfio: powerpc/spapr: Move locked_vm accounting to helpers

2015-03-13 Thread Alexey Kardashevskiy
This moves locked pages accounting to helpers.
Later they will be reused for Dynamic DMA windows (DDW).

This reworks debug messages to show the current value and the limit.

This stores the locked pages number in the container so when unlocking
the iommu table pointer won't be needed. This does not have an effect
now but it will with the multiple tables per container as then we will
allow attaching/detaching groups on the fly and we may end up having
a container with no group attached but with the counter incremented.

While we are here, update the comment explaining why RLIMIT_MEMLOCK
might be required to be bigger than the guest RAM. This also prints
pid of the current process in pr_warn/pr_debug.
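
For illustration (a sketch of how a later caller such as DDW is expected to
use the helpers; the wrapper names are made up):

/* Account a whole DMA window up front and undo it when it goes away. */
static long sketch_account_window(unsigned long window_size)
{
	/* returns -ENOMEM if RLIMIT_MEMLOCK would be exceeded */
	return try_increment_locked_vm(window_size >> PAGE_SHIFT);
}

static void sketch_unaccount_window(unsigned long window_size)
{
	decrement_locked_vm(window_size >> PAGE_SHIFT);
}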

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v4:
* new helpers do nothing if @npages == 0
* tce_iommu_disable() now can decrement the counter if the group was
detached (not possible now but will be in the future)
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 82 -
 1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 431e605..c47b969 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -31,6 +31,51 @@
 static void tce_iommu_detach_group(void *iommu_data,
struct iommu_group *iommu_group);
 
+static long try_increment_locked_vm(long npages)
+{
+   long ret = 0, locked, lock_limit;
+
+   if (!current || !current->mm)
+   return -ESRCH; /* process exited */
+
+   if (!npages)
+   return 0;
+
+   down_write(&current->mm->mmap_sem);
+   locked = current->mm->locked_vm + npages;
+   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+   if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+   ret = -ENOMEM;
+   else
+   current->mm->locked_vm += npages;
+
+   pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
+   npages << PAGE_SHIFT,
+   current->mm->locked_vm << PAGE_SHIFT,
+   rlimit(RLIMIT_MEMLOCK),
+   ret ? " - exceeded" : "");
+
+   up_write(&current->mm->mmap_sem);
+
+   return ret;
+}
+
+static void decrement_locked_vm(long npages)
+{
+   if (!current || !current->mm || !npages)
+   return; /* process exited */
+
+   down_write(&current->mm->mmap_sem);
+   if (npages > current->mm->locked_vm)
+   npages = current->mm->locked_vm;
+   current->mm->locked_vm -= npages;
+   pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
+   npages << PAGE_SHIFT,
+   current->mm->locked_vm << PAGE_SHIFT,
+   rlimit(RLIMIT_MEMLOCK));
+   up_write(&current->mm->mmap_sem);
+}
+
 /*
  * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  *
@@ -47,6 +92,7 @@ struct tce_container {
struct mutex lock;
struct iommu_table *tbl;
bool enabled;
+   unsigned long locked_pages;
 };
 
 static bool tce_page_is_contained(struct page *page, unsigned page_shift)
@@ -62,7 +108,7 @@ static bool tce_page_is_contained(struct page *page, 
unsigned page_shift)
 static int tce_iommu_enable(struct tce_container *container)
 {
int ret = 0;
-   unsigned long locked, lock_limit, npages;
+   unsigned long locked;
struct iommu_table *tbl = container->tbl;
 
if (!container->tbl)
@@ -91,21 +137,22 @@ static int tce_iommu_enable(struct tce_container 
*container)
 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
 * that would effectively kill the guest at random points, much better
 * enforcing the limit based on the max that the guest can map.
+*
+* Unfortunately at the moment it counts whole tables, no matter how
+* much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
+* each with 2GB DMA window, 8GB will be counted here. The reason for
+* this is that we cannot tell here the amount of RAM used by the guest
+* as this information is only available from KVM and VFIO is
+* KVM agnostic.
 */
-   down_write(&current->mm->mmap_sem);
-   npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
-   locked = current->mm->locked_vm + npages;
-   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-   if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
-   pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
-   rlimit(RLIMIT_MEMLOCK));
-   ret = -ENOMEM;
-   } else {
+   locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+   ret = try_increment_locked_vm(locked);
+   if (ret)
+   return ret;
 
-   current->mm->locked_vm += npages;
-   container->enabled = true;
-   }
-   up_write(&current->mm->mmap_sem);
+   container->locked_pages = locked

[PATCH kernel v6 14/29] vfio: powerpc/spapr: powerpc/iommu: Rework IOMMU ownership control

2015-03-13 Thread Alexey Kardashevskiy
This replaces iommu_take_ownership()/iommu_release_ownership() calls
with the callback calls and it is up to the platform code to call
iommu_take_ownership()/iommu_release_ownership() if needed.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h|  4 +--
 arch/powerpc/kernel/iommu.c | 50 -
 drivers/vfio/vfio_iommu_spapr_tce.c |  4 +--
 3 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 667aa1a..b9e50d3 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -225,8 +225,8 @@ extern unsigned long iommu_clear_tce(struct iommu_table 
*tbl,
unsigned long entry);
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
-extern int iommu_take_ownership(struct iommu_table *tbl);
-extern void iommu_release_ownership(struct iommu_table *tbl);
+extern int iommu_take_ownership(struct iommu_table_group *table_group);
+extern void iommu_release_ownership(struct iommu_table_group *table_group);
 
 extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
 extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 7794dce..8358631 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1050,7 +1050,7 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned 
long entry,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_build);
 
-int iommu_take_ownership(struct iommu_table *tbl)
+static int iommu_table_take_ownership(struct iommu_table *tbl)
 {
unsigned long sz = (tbl->it_size + 7) >> 3;
 
@@ -1064,19 +1064,36 @@ int iommu_take_ownership(struct iommu_table *tbl)
 
memset(tbl->it_map, 0xff, sz);
 
-   /*
-* Disable iommu bypass, otherwise the user can DMA to all of
-* our physical memory via the bypass window instead of just
-* the pages that has been explicitly mapped into the iommu
-*/
-   if (tbl->set_bypass)
-   tbl->set_bypass(tbl, false);
+   return 0;
+}
+
+static void iommu_table_release_ownership(struct iommu_table *tbl);
+
+int iommu_take_ownership(struct iommu_table_group *table_group)
+{
+   int i, j, rc = 0;
+
+   for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+   struct iommu_table *tbl = &table_group->tables[i];
+
+   if (!tbl->it_map)
+   continue;
+
+   rc = iommu_table_take_ownership(tbl);
+   if (rc) {
+   for (j = 0; j  i; ++j)
+   iommu_table_release_ownership(
+   &table_group->tables[j]);
+
+   return rc;
+   }
+   }
 
return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_take_ownership);
 
-void iommu_release_ownership(struct iommu_table *tbl)
+static void iommu_table_release_ownership(struct iommu_table *tbl)
 {
unsigned long sz = (tbl->it_size + 7) >> 3;
 
@@ -1086,9 +1103,18 @@ void iommu_release_ownership(struct iommu_table *tbl)
if (tbl->it_offset == 0)
set_bit(0, tbl->it_map);
 
-   /* The kernel owns the device now, we can restore the iommu bypass */
-   if (tbl->set_bypass)
-   tbl->set_bypass(tbl, true);
+}
+
+extern void iommu_release_ownership(struct iommu_table_group *table_group)
+{
+   int i;
+
+   for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+   struct iommu_table *tbl = &table_group->tables[i];
+
+   if (tbl->it_map)
+   iommu_table_release_ownership(tbl);
+   }
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
 
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index bfbe8bb..5d1762b 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -801,7 +801,7 @@ static int tce_iommu_attach_group(void *iommu_data,
goto unlock_exit;
}
 
-   ret = iommu_take_ownership(&table_group->tables[0]);
+   ret = iommu_take_ownership(table_group);
if (!ret)
container-grp = iommu_group;
 
@@ -838,7 +838,7 @@ static void tce_iommu_detach_group(void *iommu_data,
table_group = iommu_group_get_iommudata(iommu_group);
BUG_ON(!table_group);
 
-   iommu_release_ownership(&table_group->tables[0]);
+   iommu_release_ownership(table_group);
 
 unlock_exit:
mutex_unlock(&container->lock);
-- 
2.0.0



[PATCH kernel v6 17/29] powerpc/powernv/ioda/ioda2: Rework tce_build()/tce_free()

2015-03-13 Thread Alexey Kardashevskiy
The pnv_pci_ioda_tce_invalidate() helper invalidates TCE cache. It is
supposed to be called on IODA1/2 and not called on p5ioc2. It receives
start and end host addresses of TCE table. This approach makes it possible
to get pnv_pci_ioda_tce_invalidate() unintentionally called on p5ioc2.
Another issue is that IODA2 needs PCI addresses to invalidate the cache
and those can be calculated from host addresses but since we are going
to implement multi-level TCE tables, calculating PCI address from
a host address might get either tricky or ugly as TCE table remains flat
on PCI bus but not in RAM.

This defines separate iommu_table_ops callbacks for p5ioc2 and IODA1/2
PHBs. They all call common pnv_tce_build/pnv_tce_free/pnv_tce_get helpers
but call PHB specific TCE invalidation helper (when needed).

This changes pnv_pci_ioda2_tce_invalidate() to receive a TCE index and
number of pages which are PCI addresses shifted by IOMMU page shift.

The patch is pretty mechanical and behaviour is not expected to change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/platforms/powernv/pci-ioda.c   | 92 ++---
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  9 ++-
 arch/powerpc/platforms/powernv/pci.c| 76 +---
 arch/powerpc/platforms/powernv/pci.h|  7 ++-
 4 files changed, 111 insertions(+), 73 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9687731..fd993bc 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1065,18 +1065,20 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe 
*pe,
}
 }
 
-static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
-struct iommu_table *tbl,
-__be64 *startp, __be64 *endp, bool rm)
+static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
+   unsigned long index, unsigned long npages, bool rm)
 {
+   struct pnv_ioda_pe *pe = container_of(tbl->it_group,
+   struct pnv_ioda_pe, table_group);
__be64 __iomem *invalidate = rm ?
(__be64 __iomem *)pe->tce_inval_reg_phys :
(__be64 __iomem *)tbl->it_index;
unsigned long start, end, inc;
const unsigned shift = tbl->it_page_shift;
 
-   start = __pa(startp);
-   end = __pa(endp);
+   start = __pa((__be64 *)tbl->it_base + index - tbl->it_offset);
+   end = __pa((__be64 *)tbl->it_base + index - tbl->it_offset +
+   npages - 1);
 
/* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
if (tbl->it_busno) {
@@ -1112,10 +1114,40 @@ static void pnv_pci_ioda1_tce_invalidate(struct 
pnv_ioda_pe *pe,
 */
 }
 
-static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
-struct iommu_table *tbl,
-__be64 *startp, __be64 *endp, bool rm)
+static int pnv_ioda1_tce_build_vm(struct iommu_table *tbl, long index,
+   long npages, unsigned long uaddr,
+   enum dma_data_direction direction,
+   struct dma_attrs *attrs)
 {
+   long ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
+   attrs);
+
+   if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
+   pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+
+   return ret;
+}
+
+static void pnv_ioda1_tce_free_vm(struct iommu_table *tbl, long index,
+   long npages)
+{
+   pnv_tce_free(tbl, index, npages);
+
+   if (tbl->it_type & TCE_PCI_SWINV_FREE)
+   pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+}
+
+struct iommu_table_ops pnv_ioda1_iommu_ops = {
+   .set = pnv_ioda1_tce_build_vm,
+   .clear = pnv_ioda1_tce_free_vm,
+   .get = pnv_tce_get,
+};
+
+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
+   unsigned long index, unsigned long npages, bool rm)
+{
+   struct pnv_ioda_pe *pe = container_of(tbl->it_group,
+   struct pnv_ioda_pe, table_group);
unsigned long start, end, inc;
__be64 __iomem *invalidate = rm ?
(__be64 __iomem *)pe->tce_inval_reg_phys :
@@ -1128,9 +1160,9 @@ static void pnv_pci_ioda2_tce_invalidate(struct 
pnv_ioda_pe *pe,
end = start;
 
/* Figure out the start, end and step */
-   inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64));
+   inc = tbl->it_offset + index / sizeof(u64);
start |= (inc << shift);
-   inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64));
+   inc = tbl->it_offset + (index + npages - 1) / sizeof(u64);
end |= (inc << shift);
inc = (0x1ull << shift);
mb();
@@ -1144,19 +1176,35 @@ static void pnv_pci_ioda2_tce_invalidate(struct 
pnv_ioda_pe

[PATCH kernel v6 22/29] powerpc/iommu: Split iommu_free_table into 2 helpers

2015-03-13 Thread Alexey Kardashevskiy
The iommu_free_table helper releases memory it is using (the TCE table and
@it_map) and releases the iommu_table struct as well. We might not want
the very last step as we store iommu_table in parent structures.
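
For illustration (a sketch, not part of the patch; the structure is made up),
a caller that embeds the iommu_table in a larger object can now do:

struct sketch_pe {
	struct iommu_table tbl;	/* embedded, must not be freed on its own */
	/* ... other per-PE state ... */
};

static void sketch_pe_teardown(struct sketch_pe *pe)
{
	/* releases it_map and zeroes the struct but keeps the memory */
	iommu_reset_table(&pe->tbl, "sketch-pe");
}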

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h |  1 +
 arch/powerpc/kernel/iommu.c  | 57 
 2 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 6af22a4..fd118ea 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -125,6 +125,7 @@ static inline void *get_iommu_table_base(struct device *dev)
 
 extern struct iommu_table *iommu_table_alloc(int node);
 /* Frees table for an individual device node */
+extern void iommu_reset_table(struct iommu_table *tbl, const char *node_name);
 extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
 
 /* Initializes an iommu_table based in values set in the passed-in
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 2856a4a..3500bd0 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -721,24 +721,46 @@ struct iommu_table *iommu_table_alloc(int node)
return table_group-tables[0];
 }
 
+void iommu_reset_table(struct iommu_table *tbl, const char *node_name)
+{
+   if (!tbl)
+   return;
+
+   if (tbl->it_map) {
+   unsigned long bitmap_sz;
+   unsigned int order;
+
+   /*
+* In case we have reserved the first bit, we should not emit
+* the warning below.
+*/
+   if (tbl->it_offset == 0)
+   clear_bit(0, tbl->it_map);
+
+   /* verify that table contains no entries */
+   if (!bitmap_empty(tbl->it_map, tbl->it_size))
+   pr_warn("%s: Unexpected TCEs for %s\n", __func__,
+   node_name);
+
+   /* calculate bitmap size in bytes */
+   bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
+
+   /* free bitmap */
+   order = get_order(bitmap_sz);
+   free_pages((unsigned long) tbl->it_map, order);
+   }
+
+   memset(tbl, 0, sizeof(*tbl));
+}
+
 void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 {
-   unsigned long bitmap_sz;
-   unsigned int order;
struct iommu_table_group *table_group = tbl->it_group;
 
-   if (!tbl || !tbl->it_map) {
-   printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
-   node_name);
+   if (!tbl)
return;
-   }
 
-   /*
-* In case we have reserved the first bit, we should not emit
-* the warning below.
-*/
-   if (tbl->it_offset == 0)
-   clear_bit(0, tbl->it_map);
+   iommu_reset_table(tbl, node_name);
 
 #ifdef CONFIG_IOMMU_API
if (table_group->group) {
@@ -747,17 +769,6 @@ void iommu_free_table(struct iommu_table *tbl, const char 
*node_name)
}
 #endif
 
-   /* verify that table contains no entries */
-   if (!bitmap_empty(tbl->it_map, tbl->it_size))
-   pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
-
-   /* calculate bitmap size in bytes */
-   bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
-
-   /* free bitmap */
-   order = get_order(bitmap_sz);
-   free_pages((unsigned long) tbl->it_map, order);
-
/* free table */
kfree(table_group);
 }
-- 
2.0.0



[PATCH kernel v6 18/29] powerpc/iommu/powernv: Release replaced TCE

2015-03-13 Thread Alexey Kardashevskiy
At the moment writing a new TCE value to the IOMMU table fails with EBUSY
if there is a valid entry already. However the PAPR specification allows
the guest to write a new TCE value without clearing it first.

Another problem this patch is addressing is the use of pool locks for
external IOMMU users such as VFIO. The pool locks are to protect
DMA page allocator rather than entries and since the host kernel does
not control what pages are in use, there is no point in pool locks and
exchange()+put_page(oldtce) is sufficient to avoid possible races.

This adds an exchange() callback to iommu_table_ops which does the same
thing as set() plus it returns replaced TCE and DMA direction so
the caller can release the pages afterwards.

The returned old TCE value is a virtual address, as is the new TCE value.
This is different from tce_clear() which returns a physical address.

This implements exchange() for P5IOC2/IODA/IODA2. This adds a requirement
for a platform to have exchange() implemented in order to support VFIO.

This replaces iommu_tce_build() and iommu_clear_tce() with
a single iommu_tce_xchg().

This makes sure that TCE permission bits are not set in TCE passed to
IOMMU API as those are to be calculated by platform code from DMA direction.

This moves SetPageDirty() to the IOMMU code to make it work for both
the VFIO ioctl interface and in-kernel TCE acceleration (when it becomes
available later).
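
As a rough illustration (a sketch, not code from the patch; the exact in/out
convention of the tce/direction arguments is assumed here), a VFIO-style
caller would replace one entry and release the previously mapped page:

static long sketch_replace_tce(struct iommu_table *tbl, unsigned long entry,
		unsigned long new_tce, enum dma_data_direction dir)
{
	unsigned long old_tce = new_tce;	/* in: new value, out: old value */
	enum dma_data_direction old_dir = dir;
	long ret;

	ret = iommu_tce_xchg(tbl, entry, &old_tce, &old_dir);
	if (ret)
		return ret;

	if (old_dir != DMA_NONE)
		put_page(virt_to_page(old_tce));	/* old TCE is a virtual address */

	return 0;
}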

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h| 15 +--
 arch/powerpc/kernel/iommu.c | 54 ++---
 arch/powerpc/platforms/powernv/pci-ioda.c   | 26 
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  1 +
 arch/powerpc/platforms/powernv/pci.c| 15 +++
 arch/powerpc/platforms/powernv/pci.h|  2 +
 drivers/vfio/vfio_iommu_spapr_tce.c | 62 ++---
 7 files changed, 113 insertions(+), 62 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index d1f8c6c..6af22a4 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -44,11 +44,20 @@ extern int iommu_is_off;
 extern int iommu_force_on;
 
 struct iommu_table_ops {
+   /* When called with direction==DMA_NONE, it is equal to clear() */
int (*set)(struct iommu_table *tbl,
long index, long npages,
unsigned long uaddr,
enum dma_data_direction direction,
struct dma_attrs *attrs);
+   /*
+* Exchanges existing TCE with new TCE plus direction bits;
+* returns old TCE and DMA direction mask
+*/
+   int (*exchange)(struct iommu_table *tbl,
+   long index,
+   unsigned long *tce,
+   enum dma_data_direction *direction);
void (*clear)(struct iommu_table *tbl,
long index, long npages);
unsigned long (*get)(struct iommu_table *tbl, long index);
@@ -231,10 +240,8 @@ extern int iommu_tce_clear_param_check(struct iommu_table 
*tbl,
unsigned long npages);
 extern int iommu_tce_put_param_check(struct iommu_table *tbl,
unsigned long ioba, unsigned long tce);
-extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
-   unsigned long hwaddr, enum dma_data_direction direction);
-extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
-   unsigned long entry);
+extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
+   unsigned long *tce, enum dma_data_direction *direction);
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
 extern int iommu_take_ownership(struct iommu_table_group *table_group);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index df888a7..2856a4a 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -982,9 +982,6 @@ EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check);
 int iommu_tce_put_param_check(struct iommu_table *tbl,
unsigned long ioba, unsigned long tce)
 {
-   if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
-   return -EINVAL;
-
if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ))
return -EINVAL;
 
@@ -1002,44 +999,23 @@ int iommu_tce_put_param_check(struct iommu_table *tbl,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
 
-unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
-{
-   unsigned long oldtce;
-   struct iommu_pool *pool = get_pool(tbl, entry);
-
-   spin_lock(&(pool->lock));
-
-   oldtce = tbl->it_ops->get(tbl, entry);
-   if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
-   tbl->it_ops->clear(tbl, entry, 1);
-   else
-   oldtce = 0;
-
-   spin_unlock(&(pool->lock));
-
-   return oldtce;
-}
-EXPORT_SYMBOL_GPL(iommu_clear_tce

[PATCH kernel v6 10/29] powerpc/powernv: Do not set read flag if direction==DMA_NONE

2015-03-13 Thread Alexey Kardashevskiy
Normally a bitmap from the iommu_table is used to track what TCE entry
is in use. Since we are going to use iommu_table without its locks and
do xchg() instead, it becomes essential not to put bits which are not
implied in the direction flag.

This adds iommu_direction_to_tce_perm() (its counterpart is there already)
and uses it for powernv's pnv_tce_build().

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h |  1 +
 arch/powerpc/kernel/iommu.c  | 15 +++
 arch/powerpc/platforms/powernv/pci.c |  7 +--
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index ed69b7d..2af2d70 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -203,6 +203,7 @@ extern int iommu_take_ownership(struct iommu_table *tbl);
 extern void iommu_release_ownership(struct iommu_table *tbl);
 
 extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
+extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 1b4a178..865beef 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -916,6 +916,21 @@ enum dma_data_direction iommu_tce_direction(unsigned long 
tce)
 }
 EXPORT_SYMBOL_GPL(iommu_tce_direction);
 
+unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
+{
+   switch (dir) {
+   case DMA_BIDIRECTIONAL:
+   return TCE_PCI_READ | TCE_PCI_WRITE;
+   case DMA_FROM_DEVICE:
+   return TCE_PCI_WRITE;
+   case DMA_TO_DEVICE:
+   return TCE_PCI_READ;
+   default:
+   return 0;
+   }
+}
+EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
+
 void iommu_flush_tce(struct iommu_table *tbl)
 {
/* Flush/invalidate TLB caches if necessary */
diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index 54323d6..609f5b1 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -593,15 +593,10 @@ static int pnv_tce_build(struct iommu_table *tbl, long 
index, long npages,
 unsigned long uaddr, enum dma_data_direction direction,
 struct dma_attrs *attrs, bool rm)
 {
-   u64 proto_tce;
+   u64 proto_tce = iommu_direction_to_tce_perm(direction);
__be64 *tcep, *tces;
u64 rpn;
 
-   proto_tce = TCE_PCI_READ; // Read allowed
-
-   if (direction != DMA_TO_DEVICE)
-   proto_tce |= TCE_PCI_WRITE;
-
	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
	rpn = __pa(uaddr) >> tbl->it_page_shift;
 
-- 
2.0.0



[PATCH kernel v6 25/29] powerpc/powernv/ioda: Define and implement DMA table/window management callbacks

2015-03-13 Thread Alexey Kardashevskiy
This extends iommu_table_group_ops by a set of callbacks to support
dynamic DMA windows management.

query() returns IOMMU capabilities such as default DMA window address and
supported number of DMA windows and TCE table levels.

create_table() creates a TCE table with specific parameters.
It receives an iommu_table_group to know the node id in order to allocate
TCE table memory closer to the PHB. The exact format of the allocated
multi-level table might also be specific to the PHB model (not
the case now though).
This callback calculates the DMA window offset on the PCI bus from @num
and stores it in the just created table.

set_window() sets the window at the specified TVT index + @num on the PHB.

unset_window() unsets the window from the specified TVT.

This adds a free() callback to iommu_table_ops to free the memory
(potentially a tree of tables) allocated for the TCE table.

create_table() and free() are supposed to be called once per
VFIO container and set_window()/unset_window() are supposed to be
called for every group in a container.
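As a rough illustration of that calling convention, a hedged stand-alone sketch (all types below are simplified stand-ins, not the kernel structures) of one container building a table once and programming it into every group:

struct iommu_table_stub { unsigned long it_size; };
struct table_group_stub;

struct table_group_ops_stub {
	long (*create_table)(struct table_group_stub *grp, int num,
			     unsigned page_shift, unsigned long long window_size,
			     unsigned levels, struct iommu_table_stub *tbl);
	long (*set_window)(struct table_group_stub *grp, int num,
			   struct iommu_table_stub *tbl);
	long (*unset_window)(struct table_group_stub *grp, int num);
};

struct table_group_stub { struct table_group_ops_stub *ops; };

/* One container, several groups: the table is created once, programmed everywhere. */
static long container_setup(struct table_group_stub *groups[], int ngroups,
			    struct iommu_table_stub *tbl)
{
	long ret;
	int i;

	/* create_table() runs once per container: window #0, 4K pages, 2GB, 1 level. */
	ret = groups[0]->ops->create_table(groups[0], 0, 12, 1ULL << 31, 1, tbl);
	if (ret)
		return ret;

	/* set_window() (and later unset_window()) runs for every group in the container. */
	for (i = 0; i < ngroups; i++) {
		ret = groups[i]->ops->set_window(groups[i], 0, tbl);
		if (ret)
			break;
	}
	return ret;
}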

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h| 21 
 arch/powerpc/platforms/powernv/pci-ioda.c   | 84 -
 arch/powerpc/platforms/powernv/pci-p5ioc2.c | 11 ++--
 3 files changed, 99 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 4007432..13145a2 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -62,6 +62,8 @@ struct iommu_table_ops {
long index, long npages);
unsigned long (*get)(struct iommu_table *tbl, long index);
void (*flush)(struct iommu_table *tbl);
+
+   void (*free)(struct iommu_table *tbl);
 };
 
 /* These are used by VIO */
@@ -148,12 +150,31 @@ struct iommu_table_group_ops {
 */
void (*set_ownership)(struct iommu_table_group *table_group,
bool enable);
+
+   long (*create_table)(struct iommu_table_group *table_group,
+   int num,
+   __u32 page_shift,
+   __u64 window_size,
+   __u32 levels,
+   struct iommu_table *tbl);
+   long (*set_window)(struct iommu_table_group *table_group,
+   int num,
+   struct iommu_table *tblnew);
+   long (*unset_window)(struct iommu_table_group *table_group,
+   int num);
 };
 
 struct iommu_table_group {
 #ifdef CONFIG_IOMMU_API
struct iommu_group *group;
 #endif
+   /* Some key properties of IOMMU */
+   __u32 tce32_start;
+   __u32 tce32_size;
+   __u64 pgsizes; /* Bitmap of supported page sizes */
+   __u32 max_dynamic_windows_supported;
+   __u32 max_levels;
+
struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
struct iommu_table_group_ops *ops;
 };
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index ca6c3f3..25a93f2 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -25,6 +25,7 @@
 #include <linux/memblock.h>
 #include <linux/iommu.h>
 #include <linux/mmzone.h>
+#include <linux/sizes.h>
 
 #include <asm/mmzone.h>
 #include <asm/sections.h>
@@ -48,6 +49,7 @@
 #include "pci.h"
 
 #define POWERNV_IOMMU_DEFAULT_LEVELS   1
+#define POWERNV_IOMMU_MAX_LEVELS   5
 
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
const char *fmt, ...)
@@ -1228,11 +1230,14 @@ static void pnv_ioda2_tce_free_vm(struct iommu_table 
*tbl, long index,
pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
 }
 
+static void pnv_pci_free_table(struct iommu_table *tbl);
+
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
.set = pnv_ioda2_tce_build_vm,
.exchange = pnv_ioda2_tce_xchg_vm,
.clear = pnv_ioda2_tce_free_vm,
.get = pnv_tce_get,
+   .free = pnv_pci_free_table,
 };
 
 static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
@@ -1315,6 +1320,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 TCE_PCI_SWINV_PAIR);
}
	tbl->it_ops = &pnv_ioda1_iommu_ops;
+   pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
+   pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
	iommu_init_table(tbl, phb->hose->node);
	iommu_register_group(&pe->table_group, phb->hose->global_number,
			pe->pe_number);
@@ -1399,7 +1406,7 @@ static __be64 *pnv_alloc_tce_table(int nid,
 }
 
 static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
-   __u32 page_shift, __u64 window_size, __u32 levels,
+   int num, __u32 page_shift, __u64 window_size, __u32 levels,
struct iommu_table *tbl)
 {
struct pnv_ioda_pe *pe

[PATCH kernel v6 20/29] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_create_table/pnc_pci_free_table

2015-03-13 Thread Alexey Kardashevskiy
This is a part of moving TCE table allocation into an iommu_ops
callback to support multiple IOMMU groups per one VFIO container.

This enforces the window size to be a power of two.

This is a pretty mechanical patch.
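For reference, a small worked example of the table-size arithmetic behind the power-of-two requirement (user-space C, just the math; 8 bytes per TCE with a floor of one 4K page):

#include <stdio.h>

int main(void)
{
	unsigned long long window_size = 1ULL << 31;	/* 2GB window, power of two */
	unsigned page_shift = 12;			/* 4K IOMMU pages */
	unsigned long long tce_table_size;

	tce_table_size = (window_size >> page_shift) * 8;
	if (tce_table_size < 0x1000)
		tce_table_size = 0x1000;		/* at least one page */

	printf("TCE table size: %llu bytes\n", tce_table_size);	/* 4194304, i.e. 4MB */
	return 0;
}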

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 88 +++
 1 file changed, 66 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9486b07b..af9208c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -24,7 +24,9 @@
 #include <linux/msi.h>
 #include <linux/memblock.h>
 #include <linux/iommu.h>
+#include <linux/mmzone.h>
 
+#include <asm/mmzone.h>
 #include <asm/sections.h>
 #include <asm/io.h>
 #include <asm/prom.h>
@@ -1329,6 +1331,61 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
 }
 
+static long pnv_pci_ioda2_create_table(struct pnv_ioda_pe *pe,
+   __u32 page_shift, __u64 window_size,
+   struct iommu_table *tbl)
+{
+   int nid = pe->phb->hose->node;
+   struct page *tce_mem = NULL;
+   void *addr;
+   unsigned long tce_table_size;
+   int64_t rc;
+   unsigned order;
+
+   if (!(table_group->pgsizes & (1ULL << page_shift)))
+   return -EINVAL;
+
+   if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
+   return -EINVAL;
+
+   tce_table_size = (window_size >> page_shift) * 8;
+   tce_table_size = max(0x1000UL, tce_table_size);
+
+   /* Allocate TCE table */
+   order = get_order(tce_table_size);
+
+   tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
+   if (!tce_mem) {
+   pr_err("Failed to allocate a TCE memory, order=%d\n", order);
+   rc = -ENOMEM;
+   goto fail;
+   }
+   addr = page_address(tce_mem);
+   memset(addr, 0, tce_table_size);
+
+   /* Setup linux iommu table */
+   pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
+   page_shift);
+
+   tbl->it_ops = &pnv_ioda2_iommu_ops;
+
+   return 0;
+fail:
+   if (tce_mem)
+   __free_pages(tce_mem, get_order(tce_table_size));
+
+   return rc;
+}
+
+static void pnv_pci_free_table(struct iommu_table *tbl)
+{
+   if (!tbl->it_size)
+   return;
+
+   free_pages(tbl->it_base, get_order(tbl->it_size << 3));
+   memset(tbl, 0, sizeof(struct iommu_table));
+}
+
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
	uint16_t window_id = (pe->pe_number << 1) + 1;
@@ -1399,11 +1456,9 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
   struct pnv_ioda_pe *pe)
 {
-   struct page *tce_mem = NULL;
-   void *addr;
const __be64 *swinvp;
-   struct iommu_table *tbl;
-   unsigned int tce_table_size, end;
+   unsigned int end;
+   struct iommu_table *tbl = &pe->table_group.tables[0];
int64_t rc;
 
/* We shouldn't already have a 32-bit DMA associated */
@@ -1412,30 +1467,20 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
 
/* The PE will reserve all possible 32-bits space */
	pe->tce32_seg = 0;
+
	end = (1 << ilog2(phb->ioda.m32_pci_base));
-   tce_table_size = (end / 0x1000) * 8;
	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
end);
 
-   /* Allocate TCE table */
-   tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
-  get_order(tce_table_size));
-   if (!tce_mem) {
-   pe_err(pe, "Failed to allocate a 32-bit TCE memory\n");
-   goto fail;
+   rc = pnv_pci_ioda2_create_table(pe, IOMMU_PAGE_SHIFT_4K,
+   phb->ioda.m32_pci_base, tbl);
+   if (rc) {
+   pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
+   return;
}
-   addr = page_address(tce_mem);
-   memset(addr, 0, tce_table_size);
 
/* Setup iommu */
	pe->table_group.tables[0].it_group = &pe->table_group;
-
-   /* Setup linux iommu table */
-   tbl = &pe->table_group.tables[0];
-   pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
-   IOMMU_PAGE_SHIFT_4K);
-
-   tbl->it_ops = &pnv_ioda2_iommu_ops;
	iommu_init_table(tbl, phb->hose->node);
	pe->table_group.ops = &pnv_pci_ioda2_ops;
 
@@ -1482,8 +1527,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
 fail:
	if (pe->tce32_seg >= 0)
	pe->tce32_seg = -1;
-   if (tce_mem)
-   __free_pages(tce_mem, get_order(tce_table_size));
+   pnv_pci_free_table(tbl);
 }
 
 static void pnv_ioda_setup_dma(struct pnv_phb *phb)
-- 
2.0.0

[PATCH kernel v6 15/29] vfio: powerpc/spapr: powerpc/powernv/ioda2: Rework IOMMU ownership control

2015-03-13 Thread Alexey Kardashevskiy
At the moment the iommu_table struct has a set_bypass() callback which enables/
disables DMA bypass on an IODA2 PHB. This is exposed to the POWERPC IOMMU code
which calls this callback when external IOMMU users such as VFIO are
about to take control over a PHB.

The set_bypass() callback is not really an iommu_table function but
an IOMMU/PE function. This introduces an iommu_table_group_ops struct and
adds a set_ownership() callback to it which is called when an external
user takes control over the IOMMU.

This renames set_bypass() to set_ownership() as it does not necessarily
just enable bypassing; it can do something else/more, so give it
a more generic name. The bool parameter is inverted.

The callback is implemented for IODA2 only. Other platforms (P5IOC2,
IODA1) will use the old iommu_take_ownership/iommu_release_ownership API.
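Since the inverted boolean is easy to misread, here is a tiny sketch (illustrative stand-in types only, not the real callbacks) of how the new callback relates to the old bypass control:

struct table_group_stub;

static void set_bypass_stub(struct table_group_stub *grp, int enable)
{
	/* platform specific: program or clear the bypass window */
	(void)grp; (void)enable;
}

static void set_ownership_stub(struct table_group_stub *grp, int enable)
{
	/*
	 * enable == true : an external user (VFIO) takes over, the kernel
	 *                  must not use the IOMMU, so bypass goes away;
	 * enable == false: ownership returns to the kernel, bypass comes back.
	 */
	set_bypass_stub(grp, !enable);
}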

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h  | 14 +-
 arch/powerpc/platforms/powernv/pci-ioda.c | 30 ++
 drivers/vfio/vfio_iommu_spapr_tce.c   | 26 ++
 3 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index b9e50d3..d1f8c6c 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -92,7 +92,6 @@ struct iommu_table {
unsigned long  it_page_shift;/* table iommu page size */
struct iommu_table_group *it_group;
struct iommu_table_ops *it_ops;
-   void (*set_bypass)(struct iommu_table *tbl, bool enable);
 };
 
 /* Pure 2^n version of get_order */
@@ -127,11 +126,24 @@ extern struct iommu_table *iommu_init_table(struct 
iommu_table * tbl,
 
 #define IOMMU_TABLE_GROUP_MAX_TABLES   1
 
+struct iommu_table_group;
+
+struct iommu_table_group_ops {
+   /*
+* Switches ownership from the kernel itself to an external
+* user. While ownership is enabled, the kernel cannot use the IOMMU
+* for itself.
+*/
+   void (*set_ownership)(struct iommu_table_group *table_group,
+   bool enable);
+};
+
 struct iommu_table_group {
 #ifdef CONFIG_IOMMU_API
struct iommu_group *group;
 #endif
struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+   struct iommu_table_group_ops *ops;
 };
 
 #ifdef CONFIG_IOMMU_API
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index a964c50..9687731 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1255,10 +1255,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
 }
 
-static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
-   struct pnv_ioda_pe *pe = container_of(tbl->it_group, struct pnv_ioda_pe,
- table_group);
	uint16_t window_id = (pe->pe_number << 1) + 1;
int64_t rc;
 
@@ -1286,7 +1284,8 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table 
*tbl, bool enable)
 * host side.
 */
	if (pe->pdev)
-   set_iommu_table_base(&pe->pdev->dev, tbl);
+   set_iommu_table_base(&pe->pdev->dev,
+   &pe->table_group.tables[0]);
	else
	pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
}
@@ -1302,13 +1301,27 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct 
pnv_phb *phb,
/* TVE #1 is selected by PCI address bit 59 */
	pe->tce_bypass_base = 1ull << 59;
 
-   /* Install set_bypass callback for VFIO */
-   pe->table_group.tables[0].set_bypass = pnv_pci_ioda2_set_bypass;
-
	/* Enable bypass by default */
-   pnv_pci_ioda2_set_bypass(&pe->table_group.tables[0], true);
+   pnv_pci_ioda2_set_bypass(pe, true);
 }
 
+static void pnv_ioda2_set_ownership(struct iommu_table_group *table_group,
+bool enable)
+{
+   struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
+   if (enable)
+   iommu_take_ownership(table_group);
+   else
+   iommu_release_ownership(table_group);
+
+   pnv_pci_ioda2_set_bypass(pe, !enable);
+}
+
+static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
+   .set_ownership = pnv_ioda2_set_ownership,
+};
+
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
   struct pnv_ioda_pe *pe)
 {
@@ -1376,6 +1389,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
}
	tbl->it_ops = &pnv_iommu_ops;
	iommu_init_table(tbl, phb->hose->node);
+   pe->table_group.ops = &pnv_pci_ioda2_ops

[PATCH kernel v6 07/29] vfio: powerpc/spapr: Moving pinning/unpinning to helpers

2015-03-13 Thread Alexey Kardashevskiy
This is a pretty mechanical patch to make next patches simpler.

New tce_iommu_unuse_page() helper does put_page() now but it might skip
that after the memory registering patch applied.

While we are here, this removes unnecessary checks on the value returned
by pfn_to_page() as it cannot possibly return NULL.

This moves tce_iommu_disable() later so that tce_iommu_clear() can know
whether the container has been enabled: if it has not been, then
put_page() must not be called on TCEs from the TCE table. This situation
is not yet possible but it will be once the KVM acceleration patchset is
applied.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v6:
* tce_get_hva() returns hva via a pointer
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 69 +++--
 1 file changed, 51 insertions(+), 18 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 8a667cb..be693ca 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -198,7 +198,6 @@ static void tce_iommu_release(void *iommu_data)
	struct iommu_table *tbl = container->tbl;
 
	WARN_ON(tbl && !tbl->it_group);
-   tce_iommu_disable(container);
 
	if (tbl) {
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
@@ -206,63 +205,97 @@ static void tce_iommu_release(void *iommu_data)
	if (tbl->it_group)
	tce_iommu_detach_group(iommu_data, tbl->it_group);
	}
+
+   tce_iommu_disable(container);
+
	mutex_destroy(&container->lock);
 
kfree(container);
 }
 
+static void tce_iommu_unuse_page(struct tce_container *container,
+   unsigned long oldtce)
+{
+   struct page *page;
+
+   if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE)))
+   return;
+
+   /*
+* VFIO cannot map/unmap when a container is not enabled so
+* we would not need this check but KVM could map/unmap and if
+* this happened, we must not put pages as KVM does not get them as
+* it expects memory pre-registration to do this part.
+*/
+   if (!container->enabled)
+   return;
+
+   page = pfn_to_page(__pa(oldtce) >> PAGE_SHIFT);
+
+   if (oldtce & TCE_PCI_WRITE)
+   SetPageDirty(page);
+
+   put_page(page);
+}
+
 static int tce_iommu_clear(struct tce_container *container,
struct iommu_table *tbl,
unsigned long entry, unsigned long pages)
 {
unsigned long oldtce;
-   struct page *page;
 
for ( ; pages; --pages, ++entry) {
oldtce = iommu_clear_tce(tbl, entry);
if (!oldtce)
continue;
 
-   page = pfn_to_page(oldtce >> PAGE_SHIFT);
-   WARN_ON(!page);
-   if (page) {
-   if (oldtce & TCE_PCI_WRITE)
-   SetPageDirty(page);
-   put_page(page);
-   }
+   tce_iommu_unuse_page(container, (unsigned long) __va(oldtce));
}
 
return 0;
 }
 
+static int tce_get_hva(struct tce_container *container,
+   unsigned page_shift, unsigned long tce, unsigned long *hva)
+{
+   struct page *page = NULL;
+   enum dma_data_direction direction = iommu_tce_direction(tce);
+
+   if (get_user_pages_fast(tce & PAGE_MASK, 1,
+   direction != DMA_TO_DEVICE, &page) != 1)
+   return -EFAULT;
+
+   *hva = (unsigned long) page_address(page);
+
+   return 0;
+}
+
 static long tce_iommu_build(struct tce_container *container,
struct iommu_table *tbl,
unsigned long entry, unsigned long tce, unsigned long pages)
 {
long i, ret = 0;
-   struct page *page = NULL;
+   struct page *page;
unsigned long hva;
enum dma_data_direction direction = iommu_tce_direction(tce);
 
for (i = 0; i  pages; ++i) {
-   ret = get_user_pages_fast(tce & PAGE_MASK, 1,
-   direction != DMA_TO_DEVICE, &page);
-   if (unlikely(ret != 1)) {
-   ret = -EFAULT;
+   ret = tce_get_hva(container, tbl->it_page_shift, tce, &hva);
+   if (ret)
break;
-   }
 
+   page = pfn_to_page(__pa(hva) >> PAGE_SHIFT);
	if (!tce_page_is_contained(page, tbl->it_page_shift)) {
ret = -EPERM;
break;
}
 
-   hva = (unsigned long) page_address(page) +
-   (tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK);
+   /* Preserve offset within IOMMU page */
+   hva |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 
ret = iommu_tce_build(tbl, entry + i, hva, direction);
if (ret) {
-   put_page(page

[PATCH kernel v6 19/29] powerpc/powernv/ioda2: Rework iommu_table creation

2015-03-13 Thread Alexey Kardashevskiy
This moves iommu_table creation to the beginning. This is a mechanical
patch.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9691b3a..9486b07b 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1427,27 +1427,31 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
addr = page_address(tce_mem);
memset(addr, 0, tce_table_size);
 
+   /* Setup iommu */
+   pe->table_group.tables[0].it_group = &pe->table_group;
+
+   /* Setup linux iommu table */
+   tbl = &pe->table_group.tables[0];
+   pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
+   IOMMU_PAGE_SHIFT_4K);
+
+   tbl->it_ops = &pnv_ioda2_iommu_ops;
+   iommu_init_table(tbl, phb->hose->node);
+   pe->table_group.ops = &pnv_pci_ioda2_ops;
+
/*
 * Map TCE table through TVT. The TVE index is the PE number
 * shifted by 1 bit for 32-bits DMA space.
 */
	rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-   pe->pe_number << 1, 1, __pa(addr),
-   tce_table_size, 0x1000);
+   pe->pe_number << 1, 1, __pa(tbl->it_base),
+   tbl->it_size << 3, 1ULL << tbl->it_page_shift);
if (rc) {
	pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", rc);
goto fail;
}
 
-   /* Setup iommu */
-   pe->table_group.tables[0].it_group = &pe->table_group;
-
-   /* Setup linux iommu table */
-   tbl = &pe->table_group.tables[0];
-   pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
-   IOMMU_PAGE_SHIFT_4K);
-
/* OPAL variant of PHB3 invalidated TCEs */
	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
	if (swinvp) {
@@ -1461,14 +1465,12 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
	8);
	tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
	}
-   tbl->it_ops = &pnv_ioda2_iommu_ops;
-   iommu_init_table(tbl, phb->hose->node);
-   pe->table_group.ops = &pnv_pci_ioda2_ops;
	iommu_register_group(&pe->table_group, phb->hose->global_number,
	pe->pe_number);
 
	if (pe->pdev)
-   set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
+   set_iommu_table_base_and_group(&pe->pdev->dev,
+   &pe->table_group.tables[0]);
	else
	pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
 
-- 
2.0.0



[PATCH kernel v6 08/29] vfio: powerpc/spapr: Register memory

2015-03-13 Thread Alexey Kardashevskiy
The existing implementation accounts the whole DMA window in
the locked_vm counter which is going to be even worse with multiple
containers and huge DMA windows.

This introduces 2 ioctls to register/unregister DMA memory which
receive user space address and size of a memory region which
needs to be pinned/unpinned and counted in locked_vm.

If any memory region was registered, all subsequent DMA map requests
should address already pinned memory. If no memory was registered,
then the amount of memory required for the single default DMA window will be
accounted when the container is enabled, and every map/unmap will pin/unpin
a page (with degraded performance).

Dynamic DMA window and in-kernel acceleration will require memory to
be preregistered in order to work.

The accounting is done per VFIO container. When support for
multiple groups per container is added, we will have more accurate locked_vm
accounting.
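A hedged user-space sketch of driving the new ioctls; the struct layout below (argsz/flags/vaddr/size) is assumed from the description above, so check the final uapi header before relying on it:

#include <stdint.h>
#include <stddef.h>
#include <sys/ioctl.h>

/* Stand-in for struct vfio_iommu_spapr_register_memory (assumed layout). */
struct spapr_register_memory {
	uint32_t argsz;
	uint32_t flags;
	uint64_t vaddr;	/* userspace address of the block to pin */
	uint64_t size;	/* size of the block in bytes */
};

/*
 * Pre-register (or unregister) a block of guest RAM; pass
 * VFIO_IOMMU_SPAPR_REGISTER_MEMORY or VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY
 * as 'request'. Unregistering must use exactly the same vaddr/size,
 * bisecting a registered block is not supported.
 */
static int spapr_prereg(int container_fd, unsigned long request,
			void *buf, size_t len)
{
	struct spapr_register_memory reg = {
		.argsz = sizeof(reg),
		.flags = 0,
		.vaddr = (uintptr_t)buf,
		.size  = len,
	};

	return ioctl(container_fd, request, &reg);
}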

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v6:
* tce_get_hva_cached() returns hva via a pointer

v4:
* updated docs
* s/kzmalloc/vzalloc/
* in tce_pin_pages()/tce_unpin_pages() removed @vaddr, @size and
replaced offset with index
* renamed vfio_iommu_type_register_memory to vfio_iommu_spapr_register_memory
and removed duplicating vfio_iommu_spapr_register_memory
---
 Documentation/vfio.txt  |  19 +++
 drivers/vfio/vfio_iommu_spapr_tce.c | 275 +++-
 include/uapi/linux/vfio.h   |  25 
 3 files changed, 313 insertions(+), 6 deletions(-)

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 96978ec..791e85c 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -427,6 +427,25 @@ The code flow from the example above should be slightly 
changed:
 

 
+5) PPC64 paravirtualized guests may generate a lot of map/unmap requests,
+and the handling of those includes pinning/unpinning pages and updating
+mm::locked_vm counter to make sure we do not exceed the rlimit. Handling these
+in real-mode is quite expensive and may fail. In order to simplify in-kernel
+acceleration of map/unmap requests, two ioctls have been added to pre-register
+and unregister guest RAM pages where DMA can possibly happen. With these
+calls, the userspace and in-kernel handlers do not have to take care of
+pinning or accounting.
+
+The ioctls are VFIO_IOMMU_SPAPR_REGISTER_MEMORY and
+VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY.
+These receive a user space address and size of the block to be pinned.
+Bisecting is not supported and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY is expected to
+be called with the exact address and size used for registering
+the memory block.
+
+The user space is not expected to call these often and the block descriptors
+are stored in a linked list in the kernel.
+
 ---
 
 [1] VFIO was originally an acronym for Virtual Function I/O in its
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index be693ca..838123e 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -21,6 +21,7 @@
 #include linux/uaccess.h
 #include linux/err.h
 #include linux/vfio.h
+#include linux/vmalloc.h
 #include asm/iommu.h
 #include asm/tce.h
 
@@ -93,8 +94,196 @@ struct tce_container {
struct iommu_table *tbl;
bool enabled;
unsigned long locked_pages;
+   struct list_head mem_list;
 };
 
+struct tce_memory {
+   struct list_head next;
+   struct rcu_head rcu;
+   __u64 vaddr;
+   __u64 size;
+   __u64 hpas[];
+};
+
+static inline bool tce_preregistered(struct tce_container *container)
+{
+   return !list_empty(&container->mem_list);
+}
+
+static struct tce_memory *tce_mem_alloc(struct tce_container *container,
+   __u64 vaddr, __u64 size)
+{
+   struct tce_memory *mem;
+   long ret;
+
+   ret = try_increment_locked_vm(size >> PAGE_SHIFT);
+   if (ret)
+   return NULL;
+
+   mem = vzalloc(sizeof(*mem) + (size >> (PAGE_SHIFT - 3)));
+   if (!mem) {
+   decrement_locked_vm(size >> PAGE_SHIFT);
+   return NULL;
+   }
+
+   mem->vaddr = vaddr;
+   mem->size = size;
+
+   list_add_rcu(&mem->next, &container->mem_list);
+
+   return mem;
+}
+
+static void release_tce_memory(struct rcu_head *head)
+{
+   struct tce_memory *mem = container_of(head, struct tce_memory, rcu);
+
+   vfree(mem);
+}
+
+static void tce_mem_free(struct tce_memory *mem)
+{
+   decrement_locked_vm(mem->size);
+   list_del_rcu(&mem->next);
+   call_rcu(&mem->rcu, release_tce_memory);
+}
+
+static struct tce_memory *tce_pinned_desc(struct tce_container *container,
+   __u64 vaddr, __u64 size)
+{
+   struct tce_memory *mem, *ret = NULL;
+
+   rcu_read_lock();
+   vaddr &= ~(TCE_PCI_READ | TCE_PCI_WRITE);
+   list_for_each_entry_rcu(mem, &container->mem_list, next

[PATCH kernel v6 16/29] powerpc/iommu: Fix IOMMU ownership control functions

2015-03-13 Thread Alexey Kardashevskiy
This adds missing locks in iommu_take_ownership()/
iommu_release_ownership().

This marks all pages busy in iommu_table::it_map in order to catch
errors if there is an attempt to use this table while ownership over it
is taken.

This only clears the TCE content if there is no page marked busy in it_map.
Clearing must be done outside of the table locks as iommu_clear_tce(),
called from iommu_clear_tces_and_put_pages(), takes those locks itself.
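A stand-alone model of the lock nesting being added (pthread mutexes stand in for the kernel spinlocks; the point is only the ordering, the large pool lock first and then every small pool lock, so it_map can be flipped with no mapper running concurrently):

#include <pthread.h>

#define NR_POOLS 4	/* stands in for tbl->nr_pools */

struct toy_table {
	pthread_mutex_t large_pool;		/* stands in for tbl->large_pool.lock */
	pthread_mutex_t pools[NR_POOLS];	/* stands in for tbl->pools[i].lock */
	unsigned long *it_map;
};

/* Take every pool lock, outermost first, run fn, release in reverse order. */
static void with_all_pools_locked(struct toy_table *tbl,
				  void (*fn)(struct toy_table *))
{
	int i;

	pthread_mutex_lock(&tbl->large_pool);
	for (i = 0; i < NR_POOLS; i++)
		pthread_mutex_lock(&tbl->pools[i]);

	fn(tbl);	/* e.g. fill or clear it_map with no concurrent mapper */

	for (i = NR_POOLS - 1; i >= 0; i--)
		pthread_mutex_unlock(&tbl->pools[i]);
	pthread_mutex_unlock(&tbl->large_pool);
}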

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v5:
* do not store bit#0 value, it has to be set for zero-based table
anyway
* removed test_and_clear_bit
---
 arch/powerpc/kernel/iommu.c | 26 ++
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 8358631..df888a7 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1052,17 +1052,28 @@ EXPORT_SYMBOL_GPL(iommu_tce_build);
 
 static int iommu_table_take_ownership(struct iommu_table *tbl)
 {
-   unsigned long sz = (tbl->it_size + 7) >> 3;
+   unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+   int ret = 0;
+
+   spin_lock_irqsave(&tbl->large_pool.lock, flags);
+   for (i = 0; i < tbl->nr_pools; i++)
+   spin_lock(&tbl->pools[i].lock);
 
	if (tbl->it_offset == 0)
	clear_bit(0, tbl->it_map);
 
	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
	pr_err("iommu_tce: it_map is not empty");
-   return -EBUSY;
+   ret = -EBUSY;
+   if (tbl->it_offset == 0)
+   set_bit(0, tbl->it_map);
+   } else {
+   memset(tbl->it_map, 0xff, sz);
	}
 
-   memset(tbl->it_map, 0xff, sz);
+   for (i = 0; i < tbl->nr_pools; i++)
+   spin_unlock(&tbl->pools[i].lock);
+   spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 
return 0;
 }
@@ -1095,7 +1106,11 @@ EXPORT_SYMBOL_GPL(iommu_take_ownership);
 
 static void iommu_table_release_ownership(struct iommu_table *tbl)
 {
-   unsigned long sz = (tbl->it_size + 7) >> 3;
+   unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+
+   spin_lock_irqsave(&tbl->large_pool.lock, flags);
+   for (i = 0; i < tbl->nr_pools; i++)
+   spin_lock(&tbl->pools[i].lock);
 
	memset(tbl->it_map, 0, sz);
 
@@ -1103,6 +1118,9 @@ static void iommu_table_release_ownership(struct 
	iommu_table *tbl)
	if (tbl->it_offset == 0)
	set_bit(0, tbl->it_map);
 
+   for (i = 0; i < tbl->nr_pools; i++)
+   spin_unlock(&tbl->pools[i].lock);
+   spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 }
 
 extern void iommu_release_ownership(struct iommu_table_group *table_group)
-- 
2.0.0



[PATCH kernel v6 01/29] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver

2015-03-13 Thread Alexey Kardashevskiy
This moves page pinning (get_user_pages_fast()/put_page()) code out of
the platform IOMMU code and puts it to VFIO IOMMU driver where it belongs
to as the platform code does not deal with page pinning.

This makes iommu_take_ownership()/iommu_release_ownership() deal with
the IOMMU table bitmap only.

This removes page unpinning from iommu_take_ownership() as the actual
TCE table might contain garbage and doing put_page() on it is undefined
behaviour.

Besides the last part, the rest of the patch is mechanical.
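A pseudo-C model of the pin/dirty/unpin lifecycle that now lives entirely in the VFIO driver; gup() and put() are stand-ins for get_user_pages_fast() and put_page(), nothing here is the real kernel API:

#include <stdbool.h>

struct page_ref { bool pinned; bool dirty; };

static int gup(unsigned long uaddr, bool writable, struct page_ref *p)
{
	(void)uaddr; (void)writable;
	p->pinned = true;		/* pretend the page is now pinned */
	return 1;
}

static void put(struct page_ref *p)
{
	p->pinned = false;
}

/* Map path: pin first, only then expose the address through a TCE. */
static int map_one(unsigned long uaddr, bool write, struct page_ref *p)
{
	if (gup(uaddr, write, p) != 1)
		return -1;		/* -EFAULT in the kernel */
	/* ... program the TCE entry here ... */
	return 0;
}

/* Clear path: drop the TCE first, then mark dirty and unpin. */
static void unmap_one(struct page_ref *p, bool was_writable)
{
	/* ... clear the TCE entry here ... */
	if (was_writable)
		p->dirty = true;	/* SetPageDirty() equivalent */
	put(p);
}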

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v4:
* s/iommu_tce_build(tbl, entry + 1/iommu_tce_build(tbl, entry + i/
---
 arch/powerpc/include/asm/iommu.h|  4 --
 arch/powerpc/kernel/iommu.c | 55 --
 drivers/vfio/vfio_iommu_spapr_tce.c | 78 ++---
 3 files changed, 65 insertions(+), 72 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index f1ea597..ed69b7d 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -197,10 +197,6 @@ extern int iommu_tce_build(struct iommu_table *tbl, 
unsigned long entry,
unsigned long hwaddr, enum dma_data_direction direction);
 extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
unsigned long entry);
-extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-   unsigned long entry, unsigned long pages);
-extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
-   unsigned long entry, unsigned long tce);
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
 extern int iommu_take_ownership(struct iommu_table *tbl);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index b054f33..1b4a178 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -991,30 +991,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, 
unsigned long entry)
 }
 EXPORT_SYMBOL_GPL(iommu_clear_tce);
 
-int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-   unsigned long entry, unsigned long pages)
-{
-   unsigned long oldtce;
-   struct page *page;
-
-   for ( ; pages; --pages, ++entry) {
-   oldtce = iommu_clear_tce(tbl, entry);
-   if (!oldtce)
-   continue;
-
-   page = pfn_to_page(oldtce >> PAGE_SHIFT);
-   WARN_ON(!page);
-   if (page) {
-   if (oldtce & TCE_PCI_WRITE)
-   SetPageDirty(page);
-   put_page(page);
-   }
-   }
-
-   return 0;
-}
-EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
-
 /*
  * hwaddr is a kernel virtual address here (0xc... bazillion),
  * tce_build converts it to a physical address.
@@ -1044,35 +1020,6 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned 
long entry,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_build);
 
-int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
-   unsigned long tce)
-{
-   int ret;
-   struct page *page = NULL;
-   unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
-   enum dma_data_direction direction = iommu_tce_direction(tce);
-
-   ret = get_user_pages_fast(tce & PAGE_MASK, 1,
-   direction != DMA_TO_DEVICE, &page);
-   if (unlikely(ret != 1)) {
-   /* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
-   tce, entry << tbl->it_page_shift, ret); */
-   return -EFAULT;
-   }
-   hwaddr = (unsigned long) page_address(page) + offset;
-
-   ret = iommu_tce_build(tbl, entry, hwaddr, direction);
-   if (ret)
-   put_page(page);
-
-   if (ret < 0)
-   pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
-   __func__, entry << tbl->it_page_shift, tce, ret);
-
-   return ret;
-}
-EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
-
 int iommu_take_ownership(struct iommu_table *tbl)
 {
	unsigned long sz = (tbl->it_size + 7) >> 3;
@@ -1086,7 +1033,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
	}
 
	memset(tbl->it_map, 0xff, sz);
-   iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
 
/*
 * Disable iommu bypass, otherwise the user can DMA to all of
@@ -1104,7 +1050,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
 {
	unsigned long sz = (tbl->it_size + 7) >> 3;
 
-   iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
	memset(tbl->it_map, 0, sz);
 
/* Restore bit#0 set by iommu_init_table() */
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 20abc3a..6c59339 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -149,6 +149,66 @@ static void tce_iommu_release

[PATCH kernel v6 04/29] vfio: powerpc/spapr: Use it_page_size

2015-03-13 Thread Alexey Kardashevskiy
This makes use of the it_page_size from the iommu_table struct
as page size can differ.

This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code
as recently introduced IOMMU_PAGE_XXX macros do not include
IOMMU_PAGE_SHIFT.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 4716b97..431e605 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -93,7 +93,7 @@ static int tce_iommu_enable(struct tce_container *container)
 * enforcing the limit based on the max that the guest can map.
 */
	down_write(&current->mm->mmap_sem);
-   npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
+   npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
	locked = current->mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
@@ -122,7 +122,7 @@ static void tce_iommu_disable(struct tce_container 
*container)
 
	down_write(&current->mm->mmap_sem);
	current->mm->locked_vm -= (container->tbl->it_size <<
-   IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
+   container->tbl->it_page_shift) >> PAGE_SHIFT;
	up_write(&current->mm->mmap_sem);
 }
 
@@ -224,7 +224,7 @@ static long tce_iommu_build(struct tce_container *container,
tce, ret);
break;
}
-   tce += IOMMU_PAGE_SIZE_4K;
+   tce += IOMMU_PAGE_SIZE(tbl);
}
 
if (ret)
@@ -269,8 +269,8 @@ static long tce_iommu_ioctl(void *iommu_data,
if (info.argsz  minsz)
return -EINVAL;
 
-   info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K;
-   info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K;
+   info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
+   info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
	info.flags = 0;
 
	if (copy_to_user((void __user *)arg, &info, minsz))
@@ -300,8 +300,8 @@ static long tce_iommu_ioctl(void *iommu_data,
VFIO_DMA_MAP_FLAG_WRITE))
return -EINVAL;
 
-   if ((param.size & ~IOMMU_PAGE_MASK_4K) ||
-   (param.vaddr & ~IOMMU_PAGE_MASK_4K))
+   if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
+   (param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
return -EINVAL;
 
/* iova is checked by the IOMMU API */
@@ -316,8 +316,8 @@ static long tce_iommu_ioctl(void *iommu_data,
return ret;
 
ret = tce_iommu_build(container, tbl,
-   param.iova >> IOMMU_PAGE_SHIFT_4K,
-   tce, param.size >> IOMMU_PAGE_SHIFT_4K);
+   param.iova >> tbl->it_page_shift,
+   tce, param.size >> tbl->it_page_shift);
 
iommu_flush_tce(tbl);
 
@@ -343,17 +343,17 @@ static long tce_iommu_ioctl(void *iommu_data,
if (param.flags)
return -EINVAL;
 
-   if (param.size & ~IOMMU_PAGE_MASK_4K)
+   if (param.size & ~IOMMU_PAGE_MASK(tbl))
	return -EINVAL;
 
	ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
-   param.size >> IOMMU_PAGE_SHIFT_4K);
+   param.size >> tbl->it_page_shift);
	if (ret)
	return ret;
 
	ret = tce_iommu_clear(container, tbl,
-   param.iova >> IOMMU_PAGE_SHIFT_4K,
-   param.size >> IOMMU_PAGE_SHIFT_4K);
+   param.iova >> tbl->it_page_shift,
+   param.size >> tbl->it_page_shift);
iommu_flush_tce(tbl);
 
return ret;
-- 
2.0.0



[PATCH kernel v6 02/29] vfio: powerpc/spapr: Do cleanup when releasing the group

2015-03-13 Thread Alexey Kardashevskiy
This clears the TCE table when a container is being closed as it is
a good idea to leave the table clean before passing the ownership
back to the host kernel.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 6c59339..756831f 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -134,16 +134,24 @@ static void *tce_iommu_open(unsigned long arg)
return container;
 }
 
+static int tce_iommu_clear(struct tce_container *container,
+   struct iommu_table *tbl,
+   unsigned long entry, unsigned long pages);
+
 static void tce_iommu_release(void *iommu_data)
 {
struct tce_container *container = iommu_data;
+   struct iommu_table *tbl = container->tbl;
 
-   WARN_ON(container->tbl && !container->tbl->it_group);
+   WARN_ON(tbl && !tbl->it_group);
	tce_iommu_disable(container);
 
-   if (container->tbl && container->tbl->it_group)
-   tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+   if (tbl) {
+   tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 
+   if (tbl->it_group)
+   tce_iommu_detach_group(iommu_data, tbl->it_group);
+   }
	mutex_destroy(&container->lock);
 
kfree(container);
-- 
2.0.0



[PATCH kernel v6 09/29] vfio: powerpc/spapr: Rework attach/detach

2015-03-13 Thread Alexey Kardashevskiy
This is to make extended ownership and multiple groups support patches
simpler for review.

This is a mechanical patch.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 38 ++---
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 838123e..353d2d5 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -744,16 +744,21 @@ static int tce_iommu_attach_group(void *iommu_data,
	iommu_group_id(container->tbl->it_group),
	iommu_group_id(iommu_group));
	ret = -EBUSY;
-   } else if (container->enabled) {
+   goto unlock_exit;
+   }
+
+   if (container->enabled) {
	pr_err("tce_vfio: attaching group #%u to enabled container\n",
	iommu_group_id(iommu_group));
	ret = -EBUSY;
-   } else {
-   ret = iommu_take_ownership(tbl);
-   if (!ret)
-   container->tbl = tbl;
+   goto unlock_exit;
	}
 
+   ret = iommu_take_ownership(tbl);
+   if (!ret)
+   container->tbl = tbl;
+
+unlock_exit:
	mutex_unlock(&container->lock);
 
return ret;
@@ -771,18 +776,21 @@ static void tce_iommu_detach_group(void *iommu_data,
	pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
	iommu_group_id(iommu_group),
	iommu_group_id(tbl->it_group));
-   } else {
-   if (container->enabled) {
-   pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
-   iommu_group_id(tbl->it_group));
-   tce_iommu_disable(container);
-   }
+   goto unlock_exit;
+   }
 
-   /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
-   iommu_group_id(iommu_group), iommu_group); */
-   container->tbl = NULL;
-   iommu_release_ownership(tbl);
+   if (container->enabled) {
+   pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
+   iommu_group_id(tbl->it_group));
+   tce_iommu_disable(container);
	}
+
+   /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+  iommu_group_id(iommu_group), iommu_group); */
+   container->tbl = NULL;
+   iommu_release_ownership(tbl);
+
+unlock_exit:
	mutex_unlock(&container->lock);
 }
 
-- 
2.0.0



[PATCH kernel v6 13/29] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group

2015-03-13 Thread Alexey Kardashevskiy
Modern IBM POWERPC systems support multiple (currently two) TCE tables
per IOMMU group (a.k.a. PE). This adds an iommu_table_group container
for TCE tables. Right now just one table is supported.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h|  18 +++--
 arch/powerpc/kernel/iommu.c |  34 
 arch/powerpc/platforms/powernv/pci-ioda.c   |  38 +
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  17 ++--
 arch/powerpc/platforms/powernv/pci.c|   2 +-
 arch/powerpc/platforms/powernv/pci.h|   4 +-
 arch/powerpc/platforms/pseries/iommu.c  |   9 ++-
 drivers/vfio/vfio_iommu_spapr_tce.c | 120 
 8 files changed, 160 insertions(+), 82 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index eb75726..667aa1a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -90,9 +90,7 @@ struct iommu_table {
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map;   /* A simple allocation bitmap for now */
unsigned long  it_page_shift;/* table iommu page size */
-#ifdef CONFIG_IOMMU_API
-   struct iommu_group *it_group;
-#endif
+   struct iommu_table_group *it_group;
struct iommu_table_ops *it_ops;
void (*set_bypass)(struct iommu_table *tbl, bool enable);
 };
@@ -126,14 +124,24 @@ extern void iommu_free_table(struct iommu_table *tbl, 
const char *node_name);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
int nid);
+
+#define IOMMU_TABLE_GROUP_MAX_TABLES   1
+
+struct iommu_table_group {
 #ifdef CONFIG_IOMMU_API
-extern void iommu_register_group(struct iommu_table *tbl,
+   struct iommu_group *group;
+#endif
+   struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+};
+
+#ifdef CONFIG_IOMMU_API
+extern void iommu_register_group(struct iommu_table_group *table_group,
 int pci_domain_number, unsigned long pe_num);
 extern int iommu_add_device(struct device *dev);
 extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
 #else
-static inline void iommu_register_group(struct iommu_table *tbl,
+static inline void iommu_register_group(struct iommu_table_group *table_group,
int pci_domain_number,
unsigned long pe_num)
 {
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index dbc5d8d..7794dce 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -712,17 +712,20 @@ struct iommu_table *iommu_init_table(struct iommu_table 
*tbl, int nid)
 
 struct iommu_table *iommu_table_alloc(int node)
 {
-   struct iommu_table *tbl;
+   struct iommu_table_group *table_group;
 
-   tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
+   table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
+  node);
+   table_group-tables[0].it_group = table_group;
 
-   return tbl;
+   return table_group-tables[0];
 }
 
 void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 {
unsigned long bitmap_sz;
unsigned int order;
+   struct iommu_table_group *table_group = tbl->it_group;
 
	if (!tbl || !tbl->it_map) {
	printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
@@ -738,9 +741,9 @@ void iommu_free_table(struct iommu_table *tbl, const char 
*node_name)
	clear_bit(0, tbl->it_map);
 
 #ifdef CONFIG_IOMMU_API
-   if (tbl->it_group) {
-   iommu_group_put(tbl->it_group);
-   BUG_ON(tbl->it_group);
+   if (table_group->group) {
+   iommu_group_put(table_group->group);
+   BUG_ON(table_group->group);
}
 #endif
 
@@ -756,7 +759,7 @@ void iommu_free_table(struct iommu_table *tbl, const char 
*node_name)
	free_pages((unsigned long) tbl->it_map, order);
 
/* free table */
-   kfree(tbl);
+   kfree(table_group);
 }
 
 /* Creates TCEs for a user provided buffer.  The user buffer must be
@@ -888,11 +891,12 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t 
size,
  */
 static void group_release(void *iommu_data)
 {
-   struct iommu_table *tbl = iommu_data;
-   tbl->it_group = NULL;
+   struct iommu_table_group *table_group = iommu_data;
+
+   table_group->group = NULL;
 }
 
-void iommu_register_group(struct iommu_table *tbl,
+void iommu_register_group(struct iommu_table_group *table_group,
int pci_domain_number, unsigned long pe_num)
 {
struct iommu_group *grp;
@@ -904,8 +908,8 @@ void iommu_register_group(struct iommu_table *tbl,
PTR_ERR(grp));
return;
}
-   tbl->it_group = grp

[PATCH kernel v6 27/29] vfio: powerpc/spapr: powerpc/powernv/ioda2: Rework ownership

2015-03-13 Thread Alexey Kardashevskiy
Previously, the IOMMU user (VFIO) would take control over the IOMMU table
belonging to a specific IOMMU group. This approach did not allow sharing
tables between IOMMU groups attached to the same container.

This introduces a new IOMMU ownership flavour where the user does not
just control the existing IOMMU table but can remove/create tables on demand.
If an IOMMU implements a set_ownership() callback, this lets the user have
full control over the IOMMU group. When the ownership is taken,
the platform code removes all the windows so the caller must create them.
Before returning the ownership back to the platform code, VFIO
unprograms and removes all the tables it created.

Old-style ownership is still supported allowing VFIO to run on older
P5IOC2 and IODA IO controllers.
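A call-order sketch of the new flavour (every name below is an illustrative stand-in for the real callbacks and helpers, not the kernel API):

struct group_stub { int unused; };	/* stands in for iommu_table_group */
struct table_stub { int unused; };	/* stands in for iommu_table */

static void platform_set_ownership(struct group_stub *g, int on) { (void)g; (void)on; }
static long create_table(struct group_stub *g, int n, struct table_stub *t) { (void)g; (void)n; (void)t; return 0; }
static long set_window(struct group_stub *g, int n, struct table_stub *t) { (void)g; (void)n; (void)t; return 0; }
static long unset_window(struct group_stub *g, int n) { (void)g; (void)n; return 0; }
static void free_table(struct table_stub *t) { (void)t; }

/* Taking ownership: the platform drops its default window, the user builds its own. */
static long user_takes_group(struct group_stub *g, struct table_stub *t)
{
	platform_set_ownership(g, 1);
	if (create_table(g, 0, t))
		return -1;
	return set_window(g, 0, t);
}

/* Returning ownership: the user removes what it created, the platform restores defaults. */
static void user_returns_group(struct group_stub *g, struct table_stub *t)
{
	unset_window(g, 0);
	free_table(t);
	platform_set_ownership(g, 0);
}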

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v6:
* fixed commit log that VFIO removes tables before passing ownership
back to the platform code, not userspace
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 30 +++---
 drivers/vfio/vfio_iommu_spapr_tce.c   | 51 ---
 2 files changed, 66 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 25a93f2..51394fb 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1616,11 +1616,33 @@ static void pnv_ioda2_set_ownership(struct 
iommu_table_group *table_group,
 {
struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
table_group);
-   if (enable)
-   iommu_take_ownership(table_group);
-   else
-   iommu_release_ownership(table_group);
+   if (enable) {
+   pnv_pci_ioda2_unset_window(&pe->table_group, 0);
+   pnv_pci_free_table(&pe->table_group.tables[0]);
+   } else {
+   struct iommu_table *tbl = &pe->table_group.tables[0];
+   int64_t rc;
 
+   rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
+   IOMMU_PAGE_SHIFT_4K,
+   pe->phb->ioda.m32_pci_base,
+   POWERNV_IOMMU_DEFAULT_LEVELS, tbl);
+   if (rc) {
+   pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
+   rc);
+   return;
+   }
+
+   iommu_init_table(tbl, pe->phb->hose->node);
+
+   rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+   if (rc) {
+   pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
+   rc);
+   pnv_pci_free_table(tbl);
+   return;
+   }
+   }
pnv_pci_ioda2_set_bypass(pe, !enable);
 }
 
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index e191438..5e754b0 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -423,18 +423,11 @@ static int tce_iommu_clear(struct tce_container 
*container,
 static void tce_iommu_release(void *iommu_data)
 {
struct tce_container *container = iommu_data;
-   struct iommu_table *tbl;
-   struct iommu_table_group *table_group;
 
	WARN_ON(container->grp);
 
-   if (container->grp) {
-   table_group = iommu_group_get_iommudata(container->grp);
-   tbl = &table_group->tables[0];
-   tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
-
+   if (container->grp)
	tce_iommu_detach_group(iommu_data, container->grp);
-   }
 
tce_mem_unregister_all(container);
tce_iommu_disable(container);
@@ -833,14 +826,24 @@ static int tce_iommu_attach_group(void *iommu_data,
 
	if (!table_group->ops || !table_group->ops->set_ownership) {
	ret = iommu_take_ownership(table_group);
+   } else if (!table_group->ops->create_table ||
+   !table_group->ops->set_window) {
+   WARN_ON_ONCE(1);
+   ret = -EFAULT;
	} else {
	/*
	 * Disable iommu bypass, otherwise the user can DMA to all of
	 * our physical memory via the bypass window instead of just
	 * the pages that has been explicitly mapped into the iommu
	 */
+   struct iommu_table tbltmp = { 0 }, *tbl = &tbltmp;
+
	table_group->ops->set_ownership(table_group, true);
-   ret = 0;
+   ret = table_group->ops->create_table(table_group, 0,
+   IOMMU_PAGE_SHIFT_4K,
+   table_group->tce32_size, 1, tbl);
+   if (!ret)
+   ret = table_group->ops->set_window(table_group, 0, tbl);
}
 
if (ret)
@@ -859,6 +862,7

[PATCH kernel v6 23/29] powerpc/powernv: Implement multilevel TCE tables

2015-03-13 Thread Alexey Kardashevskiy
TCE tables might get too big in case of 4K IOMMU pages and DDW enabled
on huge guests (hundreds of GB of RAM), so the kernel might be unable to
allocate a contiguous chunk of physical memory to store the TCE table.

To address this, POWER8 CPU (actually, IODA2) supports multi-level TCE tables,
up to 5 levels which splits the table into a tree of smaller subtables.

This adds multi-level TCE tables support to pnv_pci_ioda2_create_table()
and pnv_pci_ioda2_free_table() callbacks.
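A worked example of why the extra levels help (user-space arithmetic only; the 64K chunk size and packing are assumptions for illustration, not the exact kernel layout):

#include <stdio.h>

int main(void)
{
	unsigned long long window = 1ULL << 40;		/* 1TB DMA window */
	unsigned page_shift = 12;			/* 4K IOMMU pages */
	unsigned long long entries = window >> page_shift;	/* TCEs needed */
	unsigned long long per_chunk = (64 * 1024) / 8;	/* 8K TCEs per 64K chunk */
	unsigned long long span = per_chunk;
	unsigned levels = 1;

	/* A flat table would need entries * 8 = 2GB of contiguous memory. */
	while (span < entries && levels < 5) {		/* IODA2 allows up to 5 levels */
		span *= per_chunk;
		levels++;
	}
	printf("%u level(s) cover %llu TCEs\n", levels, entries);	/* 3 levels here */
	return 0;
}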

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h  |   2 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 127 --
 arch/powerpc/platforms/powernv/pci.c  |  19 +
 3 files changed, 122 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index fd118ea..4007432 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -88,6 +88,8 @@ struct iommu_pool {
 struct iommu_table {
unsigned long  it_busno; /* Bus number this table belongs to */
unsigned long  it_size;  /* Size of iommu table in entries */
+   unsigned long  it_indirect_levels;
+   unsigned long  it_level_size;
unsigned long  it_offset;/* Offset into global table */
unsigned long  it_base;  /* mapped address of tce table */
unsigned long  it_index; /* which iommu table this is */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8bb5d6d..bdf511d 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -47,6 +47,8 @@
 #include powernv.h
 #include pci.h
 
+#define POWERNV_IOMMU_DEFAULT_LEVELS   1
+
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
const char *fmt, ...)
 {
@@ -1331,16 +1333,79 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
 }
 
+static void pnv_free_tce_table(unsigned long addr, unsigned size,
+   unsigned level)
+{
+   addr &= ~(TCE_PCI_READ | TCE_PCI_WRITE);
+
+   if (level) {
+   long i;
+   u64 *tmp = (u64 *) addr;
+
+   for (i = 0; i < size; ++i) {
+   unsigned long hpa = be64_to_cpu(tmp[i]);
+
+   if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE)))
+   continue;
+
+   pnv_free_tce_table((unsigned long) __va(hpa),
+   size, level - 1);
+   }
+   }
+
+   free_pages(addr, get_order(size << 3));
+}
+
+static __be64 *pnv_alloc_tce_table(int nid,
+   unsigned shift, unsigned levels, unsigned long *left)
+{
+   struct page *tce_mem = NULL;
+   __be64 *addr, *tmp;
+   unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
+   unsigned long chunk = 1UL << shift, i;
+
+   tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
+   if (!tce_mem) {
+   pr_err("Failed to allocate a TCE memory\n");
+   return NULL;
+   }
+
+   if (!*left)
+   return NULL;
+
+   addr = page_address(tce_mem);
+   memset(addr, 0, chunk);
+
+   --levels;
+   if (!levels) {
+   /* This is last level, actual TCEs */
+   *left -= min(*left, chunk);
+   return addr;
+   }
+
+   for (i = 0; i < (chunk >> 3); ++i) {
+   /* We allocated required TCEs, mark the rest page fault */
+   if (!*left) {
+   addr[i] = cpu_to_be64(0);
+   continue;
+   }
+
+   tmp = pnv_alloc_tce_table(nid, shift, levels, left);
+   addr[i] = cpu_to_be64(__pa(tmp) |
+   TCE_PCI_READ | TCE_PCI_WRITE);
+   }
+
+   return addr;
+}
+
 static long pnv_pci_ioda2_create_table(struct pnv_ioda_pe *pe,
-   __u32 page_shift, __u64 window_size,
+   __u32 page_shift, __u64 window_size, __u32 levels,
struct iommu_table *tbl)
 {
	int nid = pe->phb->hose->node;
-   struct page *tce_mem = NULL;
void *addr;
-   unsigned long tce_table_size;
-   int64_t rc;
-   unsigned order;
+   unsigned long tce_table_size, left;
+   unsigned shift;
 
	if (!(table_group->pgsizes & (1ULL << page_shift)))
	return -EINVAL;
@@ -1348,20 +1413,27 @@ static long pnv_pci_ioda2_create_table(struct 
pnv_ioda_pe *pe,
	if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
	return -EINVAL;
 
+   if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
+   return -EINVAL;
+
	tce_table_size = (window_size >> page_shift) * 8;
tce_table_size = max(0x1000UL, tce_table_size

[PATCH kernel v6 26/29] vfio: powerpc/spapr: Define v2 IOMMU

2015-03-13 Thread Alexey Kardashevskiy
The existing IOMMU requires VFIO_IOMMU_ENABLE call to enable actual use
of the container (i.e. call DMA map/unmap) and this is where we check
the rlimit for locked pages. It assumes that only as much memory
as a default DMA window can be mapped. Every DMA map/unmap request will
do pinning/unpinning of physical pages.

The new IOMMU will split physical page pinning and TCE table updates.
It will require guest pages to be registered first, and subsequent
map/unmap requests to work only with pre-registered memory.
For the default single window case this means that the entire guest
(instead of 2GB) needs to be pinned before using VFIO.
However when a huge DMA window is added, no additional pinning will be
required, otherwise it would be guest RAM + 2GB.

This advertises v2 SPAPR TCE IOMMU and restricts what the userspace
can do with v1 or v2 IOMMUs.
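From userspace the practical difference is the call order: on v1 the container is enabled first and every map/unmap pins pages, on v2 memory must be pre-registered and VFIO_IOMMU_ENABLE/DISABLE return -EPERM. A minimal probe for the new flavour (the v2 constant value is taken from this patch, the rest is the existing VFIO API):

#include <sys/ioctl.h>
#include <linux/vfio.h>

#ifndef VFIO_SPAPR_TCE_v2_IOMMU
#define VFIO_SPAPR_TCE_v2_IOMMU 7	/* added by this patch */
#endif

/* Returns 1 when the kernel offers the v2 SPAPR TCE IOMMU for this container. */
static int spapr_container_is_v2(int container_fd)
{
	return ioctl(container_fd, VFIO_CHECK_EXTENSION,
		     VFIO_SPAPR_TCE_v2_IOMMU) == 1;
}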

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v6:
* enforced limitations imposed by the SPAPR TCE IOMMU version
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 18 +-
 include/uapi/linux/vfio.h   |  2 ++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 9d240b4..e191438 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -95,6 +95,7 @@ struct tce_container {
bool enabled;
unsigned long locked_pages;
struct list_head mem_list;
+   bool v2;
 };
 
 struct tce_memory {
@@ -398,7 +399,7 @@ static void *tce_iommu_open(unsigned long arg)
 {
struct tce_container *container;
 
-   if (arg != VFIO_SPAPR_TCE_IOMMU) {
+   if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
	pr_err("tce_vfio: Wrong IOMMU type\n");
return ERR_PTR(-EINVAL);
}
@@ -410,6 +411,8 @@ static void *tce_iommu_open(unsigned long arg)
	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->mem_list);
 
+   container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
+
return container;
 }
 
@@ -580,6 +583,7 @@ static long tce_iommu_ioctl(void *iommu_data,
case VFIO_CHECK_EXTENSION:
switch (arg) {
case VFIO_SPAPR_TCE_IOMMU:
+   case VFIO_SPAPR_TCE_v2_IOMMU:
ret = 1;
break;
default:
@@ -719,6 +723,9 @@ static long tce_iommu_ioctl(void *iommu_data,
case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
struct vfio_iommu_spapr_register_memory param;
 
+   if (!container->v2)
+   return -EPERM;
+
minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
size);
 
@@ -741,6 +748,9 @@ static long tce_iommu_ioctl(void *iommu_data,
case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
struct vfio_iommu_spapr_register_memory param;
 
+   if (!container->v2)
+   return -EPERM;
+
minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
size);
 
@@ -761,6 +771,9 @@ static long tce_iommu_ioctl(void *iommu_data,
return 0;
}
case VFIO_IOMMU_ENABLE:
+   if (container->v2)
+   return -EPERM;
+
mutex_lock(&container->lock);
ret = tce_iommu_enable(container);
mutex_unlock(&container->lock);
@@ -768,6 +781,9 @@ static long tce_iommu_ioctl(void *iommu_data,
 
 
case VFIO_IOMMU_DISABLE:
+   if (container->v2)
+   return -EPERM;
+
mutex_lock(&container->lock);
tce_iommu_disable(container);
mutex_unlock(&container->lock);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index b17e120..fbc5286 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -36,6 +36,8 @@
 /* Two-stage IOMMU */
 #define VFIO_TYPE1_NESTING_IOMMU   6   /* Implies v2 */
 
+#define VFIO_SPAPR_TCE_v2_IOMMU	7
+
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between
-- 
2.0.0



[PATCH kernel v6 21/29] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window

2015-03-13 Thread Alexey Kardashevskiy
This is a part of moving DMA window programming to an iommu_ops
callback.

This is a mechanical patch.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 85 ---
 1 file changed, 56 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index af9208c..8bb5d6d 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1386,6 +1386,57 @@ static void pnv_pci_free_table(struct iommu_table *tbl)
memset(tbl, 0, sizeof(struct iommu_table));
 }
 
+static long pnv_pci_ioda2_set_window(struct pnv_ioda_pe *pe,
+   struct iommu_table *tbl)
+{
+   struct pnv_phb *phb = pe->phb;
+   const __be64 *swinvp;
+   int64_t rc;
+   const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
+   const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+
+   pe_info(pe, "Setting up window at %llx..%llx pagesize=0x%x 
tablesize=0x%lx\n",
+   start_addr, start_addr + win_size - 1,
+   1UL << tbl->it_page_shift, tbl->it_size << 3);
+
+   pe->table_group.tables[0] = *tbl;
+   tbl = &pe->table_group.tables[0];
+   tbl->it_group = &pe->table_group;
+
+   /*
+* Map TCE table through TVT. The TVE index is the PE number
+* shifted by 1 bit for 32-bits DMA space.
+*/
+   rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+   pe->pe_number << 1, 1, __pa(tbl->it_base),
+   tbl->it_size << 3, 1ULL << tbl->it_page_shift);
+   if (rc) {
+   pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
+   goto fail;
+   }
+
+   /* OPAL variant of PHB3 invalidated TCEs */
+   swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
+   if (swinvp) {
+   /* We need a couple more fields -- an address and a data
+* to or.  Since the bus is only printed out on table free
+* errors, and on the first pass the data will be a relative
+* bus number, print that out instead.
+*/
+   pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
+   tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
+   8);
+   tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+   }
+
+   return 0;
+fail:
+   if (pe->tce32_seg >= 0)
+   pe-tce32_seg = -1;
+
+   return rc;
+}
+
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
uint16_t window_id = (pe->pe_number << 1 ) + 1;
@@ -1456,7 +1507,6 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
   struct pnv_ioda_pe *pe)
 {
-   const __be64 *swinvp;
unsigned int end;
struct iommu_table *tbl = &pe->table_group.tables[0];
int64_t rc;
@@ -1484,31 +1534,14 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
iommu_init_table(tbl, phb->hose->node);
pe->table_group.ops = &pnv_pci_ioda2_ops;
 
-   /*
-* Map TCE table through TVT. The TVE index is the PE number
-* shifted by 1 bit for 32-bits DMA space.
-*/
-   rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-   pe->pe_number << 1, 1, __pa(tbl->it_base),
-   tbl->it_size << 3, 1ULL << tbl->it_page_shift);
+   rc = pnv_pci_ioda2_set_window(pe, tbl);
if (rc) {
pe_err(pe, "Failed to configure 32-bit TCE table,
err %ld\n", rc);
-   goto fail;
-   }
-
-   /* OPAL variant of PHB3 invalidated TCEs */
-   swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
-   if (swinvp) {
-   /* We need a couple more fields -- an address and a data
-* to or.  Since the bus is only printed out on table free
-* errors, and on the first pass the data will be a relative
-* bus number, print that out instead.
-*/
-   pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
-   tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
-   8);
-   tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+   pnv_pci_free_table(tbl);
+   if (pe->tce32_seg >= 0)
+   pe-tce32_seg = -1;
+   return;
}
iommu_register_group(&pe->table_group, phb->hose->global_number,
pe->pe_number);
@@ -1522,12 +1555,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
/* Also create a bypass window */
if (!pnv_iommu_bypass_disabled)
pnv_pci_ioda2_setup_bypass_pe(phb, pe

[PATCH kernel v6 24/29] powerpc/powernv: Change prototypes to receive iommu

2015-03-13 Thread Alexey Kardashevskiy
This changes a few functions to receive an iommu_table_group pointer
rather than a PE, as they are going to be part of the upcoming
iommu_table_group_ops callback set.
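
For readers unfamiliar with the pattern used in the diff below: container_of()
recovers the enclosing structure from a pointer to one of its embedded
members. A minimal standalone sketch, with illustrative struct names only
(not the kernel definitions):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct table_group { int id; };
struct ioda_pe { int pe_number; struct table_group table_group; };

int main(void)
{
	struct ioda_pe pe = { .pe_number = 5 };
	struct table_group *tg = &pe.table_group;	/* what the callback receives */
	struct ioda_pe *back = container_of(tg, struct ioda_pe, table_group);

	printf("%d\n", back->pe_number);		/* prints 5 */
	return 0;
}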

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index bdf511d..ca6c3f3 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1398,10 +1398,12 @@ static __be64 *pnv_alloc_tce_table(int nid,
return addr;
 }
 
-static long pnv_pci_ioda2_create_table(struct pnv_ioda_pe *pe,
+static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
__u32 page_shift, __u64 window_size, __u32 levels,
struct iommu_table *tbl)
 {
+   struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
int nid = pe->phb->hose->node;
void *addr;
unsigned long tce_table_size, left;
@@ -1456,9 +1458,11 @@ static void pnv_pci_free_table(struct iommu_table *tbl)
iommu_reset_table(tbl, "ioda2");
 }
 
-static long pnv_pci_ioda2_set_window(struct pnv_ioda_pe *pe,
+static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
struct iommu_table *tbl)
 {
+   struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
struct pnv_phb *phb = pe->phb;
const __be64 *swinvp;
int64_t rc;
@@ -1591,12 +1595,11 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
 
/* The PE will reserve all possible 32-bits space */
pe->tce32_seg = 0;
-
end = (1 << ilog2(phb->ioda.m32_pci_base));
pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
end);
 
-   rc = pnv_pci_ioda2_create_table(pe, IOMMU_PAGE_SHIFT_4K,
+   rc = pnv_pci_ioda2_create_table(&pe->table_group, IOMMU_PAGE_SHIFT_4K,
phb->ioda.m32_pci_base,
POWERNV_IOMMU_DEFAULT_LEVELS, tbl);
if (rc) {
@@ -1609,7 +1612,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
iommu_init_table(tbl, phb->hose->node);
pe->table_group.ops = &pnv_pci_ioda2_ops;
 
-   rc = pnv_pci_ioda2_set_window(pe, tbl);
+   rc = pnv_pci_ioda2_set_window(&pe->table_group, tbl);
if (rc) {
pe_err(pe, "Failed to configure 32-bit TCE table,
err %ld\n", rc);
-- 
2.0.0



[PATCH kernel v6 12/29] powerpc/iommu: Introduce iommu_table_alloc() helper

2015-03-13 Thread Alexey Kardashevskiy
This replaces multiple calls to kzalloc_node() with a new
iommu_table_alloc() helper. Right now it simply calls kzalloc_node();
later it will be modified to allocate an iommu_table_group struct which
embeds the iommu table(s).
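
A hedged sketch of the shape the helper is expected to take later in the
series, once an iommu_table_group embeds its table(s). This is not the
implementation added by this patch, and the exact field names are assumptions:

struct iommu_table *iommu_table_alloc(int node)
{
	struct iommu_table_group *table_group;

	/* Allocate the group and its embedded table(s) in one go */
	table_group = kzalloc_node(sizeof(struct iommu_table_group),
			GFP_KERNEL, node);
	if (!table_group)
		return NULL;

	/* Let the table find its way back to the owning group */
	table_group->tables[0].it_group = table_group;

	return &table_group->tables[0];
}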

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h   |  1 +
 arch/powerpc/kernel/iommu.c|  9 +
 arch/powerpc/platforms/powernv/pci.c   |  2 +-
 arch/powerpc/platforms/pseries/iommu.c | 12 
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index d909e2a..eb75726 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -117,6 +117,7 @@ static inline void *get_iommu_table_base(struct device *dev)
return dev->archdata.dma_data.iommu_table_base;
 }
 
+extern struct iommu_table *iommu_table_alloc(int node);
 /* Frees table for an individual device node */
 extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 57cb615..dbc5d8d 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -710,6 +710,15 @@ struct iommu_table *iommu_init_table(struct iommu_table 
*tbl, int nid)
return tbl;
 }
 
+struct iommu_table *iommu_table_alloc(int node)
+{
+   struct iommu_table *tbl;
+
+   tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
+
+   return tbl;
+}
+
 void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 {
unsigned long bitmap_sz;
diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index c619ec6..1c31ac8 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -680,7 +680,7 @@ static struct iommu_table *pnv_pci_setup_bml_iommu(struct 
pci_controller *hose)
   hose->dn->full_name);
return NULL;
}
-   tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, hose->node);
+   tbl = iommu_table_alloc(hose->node);
if (WARN_ON(!tbl))
return NULL;
pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)),
diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 48d1fde..41a8b14 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -617,8 +617,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
pci->phb->dma_window_size = 0x8000000ul;
pci->phb->dma_window_base_cur = 0x8000000ul;
 
-   tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-  pci->phb->node);
+   tbl = iommu_table_alloc(pci->phb->node);
 
iommu_table_setparms(pci->phb, dn, tbl);
tbl->it_ops = &iommu_table_pseries_ops;
@@ -669,8 +668,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 pdn->full_name, ppci->iommu_table);
 
if (!ppci->iommu_table) {
-   tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-  ppci->phb->node);
+   tbl = iommu_table_alloc(ppci->phb->node);
iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
tbl->it_ops = &iommu_table_lpar_multi_ops;
ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
@@ -697,8 +695,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
struct pci_controller *phb = PCI_DN(dn)->phb;
 
pr_debug("  --> first child, no bridge. Allocating iommu 
table.\n");
-   tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-  phb->node);
+   tbl = iommu_table_alloc(phb->node);
iommu_table_setparms(phb, dn, tbl);
tbl->it_ops = &iommu_table_pseries_ops;
PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
@@ -1120,8 +1117,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev 
*dev)
 
pci = PCI_DN(pdn);
if (!pci->iommu_table) {
-   tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-  pci->phb->node);
+   tbl = iommu_table_alloc(pci->phb->node);
iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
tbl->it_ops = &iommu_table_lpar_multi_ops;
pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
-- 
2.0.0



[PATCH kernel v6 06/29] vfio: powerpc/spapr: Disable DMA mappings on disabled container

2015-03-13 Thread Alexey Kardashevskiy
At the moment DMA map/unmap requests are handled irrespective of
the container's state. This allows userspace to pin memory which
it might not be allowed to pin.

This adds checks to MAP/UNMAP that the container is enabled; otherwise
-EPERM is returned.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index c47b969..8a667cb 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -327,6 +327,9 @@ static long tce_iommu_ioctl(void *iommu_data,
struct iommu_table *tbl = container->tbl;
unsigned long tce;
 
+   if (!container->enabled)
+   return -EPERM;
+
if (!tbl)
return -ENXIO;
 
@@ -371,6 +374,9 @@ static long tce_iommu_ioctl(void *iommu_data,
struct vfio_iommu_type1_dma_unmap param;
struct iommu_table *tbl = container->tbl;
 
+   if (!container->enabled)
+   return -EPERM;
+
if (WARN_ON(!tbl))
return -ENXIO;
 
-- 
2.0.0



[PATCH kernel v6 28/29] vfio: powerpc/spapr: Support multiple groups in one container if possible

2015-03-13 Thread Alexey Kardashevskiy
At the moment only one group per container is supported.
POWER8 CPUs have a more flexible design and allow having 2 TCE tables per
IOMMU group, so we can relax this limitation and support multiple groups
per container.

This adds TCE table descriptors to the container and uses iommu_table_group_ops
to create/set DMA windows on IOMMU groups, so the same TCE tables will be
shared between several IOMMU groups.
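
A hedged sketch of what sharing container-owned tables across groups implies
(the callback name and its exact signature are assumptions here, following
the iommu_table_group_ops mentioned above): any window update has to be
replayed on every attached group.

static long tce_iommu_set_window_on_groups(struct tce_container *container,
		int num, struct iommu_table *tbl)
{
	struct tce_iommu_group *tcegrp;
	long ret;

	/* The same container-owned table is programmed into every group */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *table_group =
			iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			return ret;
	}

	return 0;
}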

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 252 
 1 file changed, 170 insertions(+), 82 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 5e754b0..d94116b 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -91,11 +91,17 @@ static void decrement_locked_vm(long npages)
  */
 struct tce_container {
struct mutex lock;
-   struct iommu_group *grp;
bool enabled;
unsigned long locked_pages;
struct list_head mem_list;
bool v2;
+   struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+   struct list_head group_list;
+};
+
+struct tce_iommu_group {
+   struct list_head next;
+   struct iommu_group *grp;
 };
 
 struct tce_memory {
@@ -295,20 +301,20 @@ static bool tce_page_is_contained(struct page *page, 
unsigned page_shift)
return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
 }
 
+static inline bool tce_groups_attached(struct tce_container *container)
+{
return !list_empty(&container->group_list);
+}
+
 static struct iommu_table *spapr_tce_find_table(
struct tce_container *container,
phys_addr_t ioba)
 {
long i;
struct iommu_table *ret = NULL;
-   struct iommu_table_group *table_group;
-
-   table_group = iommu_group_get_iommudata(container->grp);
-   if (!table_group)
-   return NULL;
 
for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-   struct iommu_table *tbl = &table_group->tables[i];
+   struct iommu_table *tbl = &container->tables[i];
unsigned long entry = ioba >> tbl->it_page_shift;
unsigned long start = tbl->it_offset;
unsigned long end = start + tbl->it_size;
@@ -326,11 +332,8 @@ static int tce_iommu_enable(struct tce_container 
*container)
 {
int ret = 0;
unsigned long locked;
-   struct iommu_table *tbl;
struct iommu_table_group *table_group;
-
-   if (!container->grp)
-   return -ENXIO;
+   struct tce_iommu_group *tcegrp;
 
if (!current->mm)
return -ESRCH; /* process exited */
@@ -364,12 +367,24 @@ static int tce_iommu_enable(struct tce_container 
*container)
 * KVM agnostic.
 */
if (!tce_preregistered(container)) {
-   table_group = iommu_group_get_iommudata(container->grp);
+   if (!tce_groups_attached(container))
+   return -ENODEV;
+
+   tcegrp = list_first_entry(&container->group_list,
+   struct tce_iommu_group, next);
+   table_group = iommu_group_get_iommudata(tcegrp->grp);
if (!table_group)
return -ENODEV;
 
-   tbl = &table_group->tables[0];
-   locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+   /*
+* We do not allow enabling a group if no DMA-able memory was
+* registered as there is no way to know how much we should
+* increment the locked_vm counter.
+*/
+   if (!table_group->tce32_size)
+   return -EPERM;
+
+   locked = table_group->tce32_size >> PAGE_SHIFT;
ret = try_increment_locked_vm(locked);
if (ret)
return ret;
@@ -410,6 +425,7 @@ static void *tce_iommu_open(unsigned long arg)
 
mutex_init(&container->lock);
INIT_LIST_HEAD_RCU(&container->mem_list);
+   INIT_LIST_HEAD_RCU(&container->group_list);
 
container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 
@@ -423,11 +439,30 @@ static int tce_iommu_clear(struct tce_container 
*container,
 static void tce_iommu_release(void *iommu_data)
 {
struct tce_container *container = iommu_data;
+   struct iommu_table_group *table_group;
+   int i;
+   struct tce_iommu_group *tcegrp;
 
-   WARN_ON(container->grp);
+   while (tce_groups_attached(container)) {
+   tcegrp = list_first_entry(&container->group_list,
+   struct tce_iommu_group, next);
+   table_group = iommu_group_get_iommudata(tcegrp->grp);
+   tce_iommu_detach_group(iommu_data, tcegrp->grp);
+   }
 
-   if (container->grp)
-   tce_iommu_detach_group(iommu_data, container->grp);
+   /* Free tables */
+   for (i = 0; i
