[PATCH v5 28/29] vfio: powerpc/spapr: Support multiple groups in one container if possible

2015-03-09 Thread Alexey Kardashevskiy
At the moment only one group per container is supported.
POWER8 CPUs have a more flexible design and allow having 2 TCE tables per
IOMMU group, so we can relax this limitation and support multiple groups
per container.

This adds TCE table descriptors to a container and uses iommu_table_group_ops
to create/set DMA windows on IOMMU groups so the same TCE tables will be
shared between several IOMMU groups.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 252 
 1 file changed, 170 insertions(+), 82 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 3bc0645..3a0b5fe 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -91,10 +91,16 @@ static void decrement_locked_vm(long npages)
  */
 struct tce_container {
struct mutex lock;
-   struct iommu_group *grp;
bool enabled;
unsigned long locked_pages;
struct list_head mem_list;
+   struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+   struct list_head group_list;
+};
+
+struct tce_iommu_group {
+   struct list_head next;
+   struct iommu_group *grp;
 };
 
 struct tce_memory {
@@ -300,20 +306,20 @@ static bool tce_page_is_contained(struct page *page, 
unsigned page_shift)
return false;
 }
 
+static inline bool tce_groups_attached(struct tce_container *container)
+{
+	return !list_empty(&container->group_list);
+}
+
 static struct iommu_table *spapr_tce_find_table(
struct tce_container *container,
phys_addr_t ioba)
 {
long i;
struct iommu_table *ret = NULL;
-   struct iommu_table_group *table_group;
-
-	table_group = iommu_group_get_iommudata(container->grp);
-   if (!table_group)
-   return NULL;
 
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-		struct iommu_table *tbl = &table_group->tables[i];
+		struct iommu_table *tbl = &container->tables[i];
		unsigned long entry = ioba >> tbl->it_page_shift;
		unsigned long start = tbl->it_offset;
		unsigned long end = start + tbl->it_size;
@@ -331,11 +337,8 @@ static int tce_iommu_enable(struct tce_container 
*container)
 {
int ret = 0;
unsigned long locked;
-   struct iommu_table *tbl;
struct iommu_table_group *table_group;
-
-	if (!container->grp)
-		return -ENXIO;
+   struct tce_iommu_group *tcegrp;
 
	if (!current->mm)
return -ESRCH; /* process exited */
@@ -369,12 +372,24 @@ static int tce_iommu_enable(struct tce_container 
*container)
 * KVM agnostic.
 */
	if (!tce_preregistered(container)) {
-		table_group = iommu_group_get_iommudata(container->grp);
+		if (!tce_groups_attached(container))
+			return -ENODEV;
+
+		tcegrp = list_first_entry(&container->group_list,
+				struct tce_iommu_group, next);
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
		if (!table_group)
			return -ENODEV;

-		tbl = &table_group->tables[0];
-		locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+		/*
+		 * We do not allow enabling a group if no DMA-able memory was
+		 * registered as there is no way to know how much we should
+		 * increment the locked_vm counter.
+		 */
+		if (!table_group->tce32_size)
+			return -EPERM;
+
+		locked = table_group->tce32_size >> PAGE_SHIFT;
ret = try_increment_locked_vm(locked);
if (ret)
return ret;
@@ -415,6 +430,7 @@ static void *tce_iommu_open(unsigned long arg)
 
	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->mem_list);
+	INIT_LIST_HEAD_RCU(&container->group_list);
 
return container;
 }
@@ -426,11 +442,30 @@ static int tce_iommu_clear(struct tce_container 
*container,
 static void tce_iommu_release(void *iommu_data)
 {
struct tce_container *container = iommu_data;
+   struct iommu_table_group *table_group;
+   int i;
+   struct tce_iommu_group *tcegrp;
 
-	WARN_ON(container->grp);
+	while (tce_groups_attached(container)) {
+		tcegrp = list_first_entry(&container->group_list,
+				struct tce_iommu_group, next);
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
+		tce_iommu_detach_group(iommu_data, tcegrp->grp);
+	}

-	if (container->grp)
-		tce_iommu_detach_group(iommu_data, container->grp);
+	/* Free tables */
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = &container->tables[i];
+

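From userspace, sharing one container between several groups is just a second
VFIO_GROUP_SET_CONTAINER call; a minimal sketch (the group numbers below are
made up for the demo, error handling omitted):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int main(void)
{
	int container = open("/dev/vfio/vfio", O_RDWR);
	int grp1 = open("/dev/vfio/26", O_RDWR);	/* group numbers are */
	int grp2 = open("/dev/vfio/27", O_RDWR);	/* illustrative only */

	ioctl(grp1, VFIO_GROUP_SET_CONTAINER, &container);
	/* with this patch the second group may share the same container */
	ioctl(grp2, VFIO_GROUP_SET_CONTAINER, &container);
	return 0;
}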
[PATCH v5 07/29] vfio: powerpc/spapr: Moving pinning/unpinning to helpers

2015-03-09 Thread Alexey Kardashevskiy
This is a pretty mechanical patch to make next patches simpler.

The new tce_iommu_unuse_page() helper does put_page() now but it might skip
that after the memory registering patch is applied.

As we are here, this removes unnecessary checks for a value returned
by pfn_to_page() as it cannot possibly return NULL.

This moves tce_iommu_disable() later to let tce_iommu_clear() know if
the container has been enabled, because if it has not been, then
put_page() must not be called on TCEs from the TCE table. This situation
is not possible yet but it will be after the KVM acceleration patchset is
applied.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 70 -
 1 file changed, 54 insertions(+), 16 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index d3ab34f..ca396e5 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -204,7 +204,6 @@ static void tce_iommu_release(void *iommu_data)
	struct iommu_table *tbl = container->tbl;

	WARN_ON(tbl && !tbl->it_group);
-   tce_iommu_disable(container);
 
if (tbl) {
		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
@@ -212,63 +211,102 @@ static void tce_iommu_release(void *iommu_data)
		if (tbl->it_group)
			tce_iommu_detach_group(iommu_data, tbl->it_group);
}
+
+   tce_iommu_disable(container);
+
	mutex_destroy(&container->lock);
 
kfree(container);
 }
 
+static void tce_iommu_unuse_page(struct tce_container *container,
+   unsigned long oldtce)
+{
+   struct page *page;
+
+	if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE)))
+   return;
+
+	/*
+	 * VFIO cannot map/unmap when a container is not enabled so
+	 * we would not need this check but KVM could map/unmap and if
+	 * this happened, we must not put pages as KVM does not get them as
+	 * it expects memory pre-registration to do this part.
+	 */
+	if (!container->enabled)
+		return;
+
+	page = pfn_to_page(__pa(oldtce) >> PAGE_SHIFT);
+
+	if (oldtce & TCE_PCI_WRITE)
+   SetPageDirty(page);
+
+   put_page(page);
+}
+
 static int tce_iommu_clear(struct tce_container *container,
struct iommu_table *tbl,
unsigned long entry, unsigned long pages)
 {
unsigned long oldtce;
-   struct page *page;
 
for ( ; pages; --pages, ++entry) {
oldtce = iommu_clear_tce(tbl, entry);
if (!oldtce)
continue;
 
-		page = pfn_to_page(oldtce >> PAGE_SHIFT);
-		WARN_ON(!page);
-		if (page) {
-			if (oldtce & TCE_PCI_WRITE)
-   SetPageDirty(page);
-   put_page(page);
-   }
+   tce_iommu_unuse_page(container, (unsigned long) __va(oldtce));
}
 
return 0;
 }
 
+static unsigned long tce_get_hva(struct tce_container *container,
+   unsigned page_shift, unsigned long tce)
+{
+   long ret;
+   struct page *page = NULL;
+   unsigned long hva;
+   enum dma_data_direction direction = iommu_tce_direction(tce);
+
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+   if (unlikely(ret != 1))
+   return -1;
+
+   hva = (unsigned long) page_address(page);
+
+   return hva;
+}
+
 static long tce_iommu_build(struct tce_container *container,
struct iommu_table *tbl,
unsigned long entry, unsigned long tce, unsigned long pages)
 {
long i, ret = 0;
-   struct page *page = NULL;
+   struct page *page;
unsigned long hva;
enum dma_data_direction direction = iommu_tce_direction(tce);
 
	for (i = 0; i < pages; ++i) {
-		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
-				direction != DMA_TO_DEVICE, &page);
-		if (unlikely(ret != 1)) {
+		hva = tce_get_hva(container, tbl->it_page_shift, tce);
+		if (hva == -1) {
ret = -EFAULT;
break;
}
 
+		page = pfn_to_page(__pa(hva) >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
ret = -EPERM;
break;
}
 
-		hva = (unsigned long) page_address(page) +
-			(tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK);
+		/* Preserve offset within IOMMU page */
+		hva |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 
ret = iommu_tce_build(tbl, entry + i, hva, direction);
if (ret) {
-   put_page(page);
+

[PATCH v5 11/29] powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table

2015-03-09 Thread Alexey Kardashevskiy
This adds an iommu_table_ops struct and puts a pointer to it into
the iommu_table struct. This moves the tce_build/tce_free/tce_get/tce_flush
callbacks from ppc_md to the new struct where they really belong.

This adds the requirement for @it_ops to be initialized before calling
iommu_init_table() to make sure that we do not leave any IOMMU table
with iommu_table_ops uninitialized. This is not a parameter of
iommu_init_table() though as there will be cases when iommu_init_table()
will not be called on TCE tables, for example - VFIO.

This does s/tce_build/set/ and s/tce_free/clear/ and removes the
redundant tce_ prefixes.

This removes tce_xxx_rm handlers from ppc_md but does not add
them to iommu_table_ops as this will be done later if we decide to
support TCE hypercalls in real mode.

For pSeries, this always uses tce_buildmulti_pSeriesLP/
tce_freemulti_pSeriesLP. This changes the multi callbacks to fall back to
tce_build_pSeriesLP/tce_free_pSeriesLP if FW_FEATURE_MULTITCE is not
present. The reason for this is that we still have to support the multitce=off
boot parameter in disable_multitce() and we do not want to walk through
all IOMMU tables in the system and replace multi callbacks with single
ones.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h| 17 +++
 arch/powerpc/include/asm/machdep.h  | 25 
 arch/powerpc/kernel/iommu.c | 46 +++--
 arch/powerpc/kernel/vio.c   |  5 
 arch/powerpc/platforms/cell/iommu.c |  8 +++--
 arch/powerpc/platforms/pasemi/iommu.c   |  7 +++--
 arch/powerpc/platforms/powernv/pci-ioda.c   |  2 ++
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  1 +
 arch/powerpc/platforms/powernv/pci.c| 23 ---
 arch/powerpc/platforms/powernv/pci.h|  1 +
 arch/powerpc/platforms/pseries/iommu.c  | 34 +++--
 arch/powerpc/sysdev/dart_iommu.c| 12 
 12 files changed, 93 insertions(+), 88 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 2af2d70..d909e2a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -43,6 +43,22 @@
 extern int iommu_is_off;
 extern int iommu_force_on;
 
+struct iommu_table_ops {
+   int (*set)(struct iommu_table *tbl,
+   long index, long npages,
+   unsigned long uaddr,
+   enum dma_data_direction direction,
+   struct dma_attrs *attrs);
+   void (*clear)(struct iommu_table *tbl,
+   long index, long npages);
+   unsigned long (*get)(struct iommu_table *tbl, long index);
+   void (*flush)(struct iommu_table *tbl);
+};
+
+/* These are used by VIO */
+extern struct iommu_table_ops iommu_table_lpar_multi_ops;
+extern struct iommu_table_ops iommu_table_pseries_ops;
+
 /*
  * IOMAP_MAX_ORDER defines the largest contiguous block
  * of dma space we can get.  IOMAP_MAX_ORDER = 13
@@ -77,6 +93,7 @@ struct iommu_table {
 #ifdef CONFIG_IOMMU_API
struct iommu_group *it_group;
 #endif
+   struct iommu_table_ops *it_ops;
void (*set_bypass)(struct iommu_table *tbl, bool enable);
 };
 
diff --git a/arch/powerpc/include/asm/machdep.h 
b/arch/powerpc/include/asm/machdep.h
index c8175a3..2abe744 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -65,31 +65,6 @@ struct machdep_calls {
 * destroyed as well */
void(*hpte_clear_all)(void);
 
-   int (*tce_build)(struct iommu_table *tbl,
-long index,
-long npages,
-unsigned long uaddr,
-enum dma_data_direction direction,
-struct dma_attrs *attrs);
-   void(*tce_free)(struct iommu_table *tbl,
-   long index,
-   long npages);
-   unsigned long   (*tce_get)(struct iommu_table *tbl,
-   long index);
-   void(*tce_flush)(struct iommu_table *tbl);
-
-   /* _rm versions are for real mode use only */
-   int (*tce_build_rm)(struct iommu_table *tbl,
-long index,
-long npages,
-unsigned long uaddr,
-enum dma_data_direction direction,
-struct dma_attrs *attrs);
-   void(*tce_free_rm)(struct iommu_table *tbl,
-   long index,
-   long npages);
-   void(*tce_flush_rm)(struct iommu_table *tbl);
-
void __iomem *  (*ioremap)(phys_addr_t addr, unsigned long 
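A rough sketch of how a platform wires up the new callbacks (a minimal
illustration only; the exact initializers in the patch may differ, and as
noted above @it_ops must be assigned before iommu_init_table() runs):

struct iommu_table_ops iommu_table_pseries_ops = {
	.set	= tce_build_pSeries,
	.clear	= tce_free_pSeries,
};

/* later, when the table is set up: */
tbl->it_ops = &iommu_table_pseries_ops;
iommu_init_table(tbl, nid);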

[PATCH v5 03/29] vfio: powerpc/spapr: Check that TCE page size is equal to it_page_size

2015-03-09 Thread Alexey Kardashevskiy
This checks that the TCE table page size is not bigger than the size of
a page we have just pinned and are going to put the physical address of
into the table.

Otherwise the hardware gets unwanted access to the physical memory between
the end of the actual page and the end of the aligned-up TCE page. For
example, pinning a 4K non-compound page for a 64K-page TCE table would
expose the remaining 60K of physically contiguous memory to the device.

Since compound_order() and compound_head() work correctly on non-huge
pages, there is no need for an additional check whether the page is huge.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v4:
* s/tce_check_page_size/tce_page_is_contained/
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 756831f..91e7599 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -49,6 +49,22 @@ struct tce_container {
bool enabled;
 };
 
+static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+{
+   unsigned shift;
+
+   /*
+* Check that the TCE table granularity is not bigger than the size of
+* a page we just found. Otherwise the hardware can get access to
+* a bigger memory chunk that it should.
+*/
+   shift = PAGE_SHIFT + compound_order(compound_head(page));
+   if (shift = page_shift)
+   return true;
+
+   return false;
+}
+
 static int tce_iommu_enable(struct tce_container *container)
 {
int ret = 0;
@@ -197,6 +213,12 @@ static long tce_iommu_build(struct tce_container 
*container,
ret = -EFAULT;
break;
}
+
+		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
+   ret = -EPERM;
+   break;
+   }
+
		hva = (unsigned long) page_address(page) +
			(tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK);
 
-- 
2.0.0



[PATCH v5 00/29] powerpc/iommu/vfio: Enable Dynamic DMA windows

2015-03-09 Thread Alexey Kardashevskiy

This enables the sPAPR-defined feature called Dynamic DMA windows (DDW).

Each Partitionable Endpoint (IOMMU group) has an address range on a PCI bus
where devices are allowed to do DMA. These ranges are called DMA windows.
By default, there is a single DMA window, 1 or 2GB big, mapped at zero
on a PCI bus.

High-speed devices may suffer from the limited size of this window.
Recent host kernels use a TCE bypass window on POWER8 CPUs which implements
direct PCI bus address range mapping (with an offset of 1<<59) to the host memory.

For guests, PAPR defines a DDW RTAS API which allows pseries guests
to query the hypervisor about DDW support and capabilities (the page size
mask for now). A pseries guest may request additional (to the default)
DMA windows using this RTAS API.
The existing pseries Linux guests request an additional window as big as
the guest RAM and map the entire window, which effectively creates a
direct mapping of the guest memory to the PCI bus.

The multiple DMA windows feature is supported by POWER7/POWER8 CPUs; however
this patchset only adds support for POWER8, as TCE tables are implemented
quite differently on POWER7 and POWER7 is not the highest priority.

This patchset reworks PPC64 IOMMU code and adds necessary structures
to support big windows.

Once a Linux guest discovers the presence of DDW, it does the following
(a rough sketch is shown below):
1. query the hypervisor about the number of available windows and page size masks;
2. create a window with the biggest possible page size (today 4K/64K/16M);
3. map the entire guest RAM via H_PUT_TCE* hypercalls;
4. switch dma_ops to direct_dma_ops on the selected PE.

Once this is done, H_PUT_TCE is not called anymore for 64bit devices and
the guest does not waste time on DMA map/unmap operations.
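
A rough, hypothetical sketch of that guest-side flow in C; the ddw_caps
struct and the rtas_query_ddw/rtas_create_ddw/map_guest_ram helpers below
are illustrative stand-ins, not the real kernel API:

/* Hypothetical stand-ins for the PAPR DDW RTAS calls and TCE mapping */
struct ddw_caps { unsigned int windows; unsigned long page_size_mask; };

static void guest_enable_ddw(struct pci_dev *pdev, unsigned long ram_size)
{
	struct ddw_caps caps;

	rtas_query_ddw(pdev, &caps);              /* 1. windows/page sizes */
	rtas_create_ddw(pdev, biggest_page_shift(caps.page_size_mask),
			ram_size);                /* 2. biggest page size wins */
	map_guest_ram(pdev, ram_size);            /* 3. H_PUT_TCE* the entire RAM */
	set_dma_ops(&pdev->dev, &dma_direct_ops); /* 4. stop per-page map/unmap */
}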

Note that 32bit devices won't use DDW and will keep using the default
DMA window so KVM optimizations will be required (to be posted later).

Changes:
v5:
* added SPAPR_TCE_IOMMU_v2 to tell the userspace that there is a memory
pre-registration feature
* added backward compatibility
* renamed a few things (mostly powerpc_iommu -> iommu_table_group)

v4:
* moved patches around to have VFIO and PPC patches separated as much as
possible
* now works with the existing upstream QEMU

v3:
* redesigned the whole thing
* multiple IOMMU groups per PHB - one PHB is needed for VFIO in the guest -
no problems with locked_vm counting; also we save memory on actual tables
* guest RAM preregistration is required for DDW
* PEs (IOMMU groups) are passed to VFIO with no DMA windows at all so
we do not bother with iommu_table::it_map anymore
* added multilevel TCE tables support to support really huge guests

v2:
* added missing __pa() in powerpc/powernv: Release replaced TCE
* reposted to make some noise




Alexey Kardashevskiy (29):
  vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU
driver
  vfio: powerpc/spapr: Do cleanup when releasing the group
  vfio: powerpc/spapr: Check that TCE page size is equal to it_page_size
  vfio: powerpc/spapr: Use it_page_size
  vfio: powerpc/spapr: Move locked_vm accounting to helpers
  vfio: powerpc/spapr: Disable DMA mappings on disabled container
  vfio: powerpc/spapr: Moving pinning/unpinning to helpers
  vfio: powerpc/spapr: Register memory
  vfio: powerpc/spapr: Rework attach/detach
  powerpc/powernv: Do not set read flag if direction==DMA_NONE
  powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table
  powerpc/iommu: Introduce iommu_table_alloc() helper
  powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group
  vfio: powerpc/spapr: powerpc/iommu: Rework IOMMU ownership control
  vfio: powerpc/spapr: powerpc/powernv/ioda2: Rework IOMMU ownership
control
  powerpc/iommu: Fix IOMMU ownership control functions
  powerpc/powernv/ioda/ioda2: Rework tce_build()/tce_free()
  powerpc/iommu/powernv: Release replaced TCE
  powerpc/powernv/ioda2: Rework iommu_table creation
  powerpc/powernv/ioda2: Introduce
 pnv_pci_ioda2_create_table/pnv_pci_free_table
  powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window
  powerpc/iommu: Split iommu_free_table into 2 helpers
  powerpc/powernv: Implement multilevel TCE tables
  powerpc/powernv: Change prototypes to receive iommu
  powerpc/powernv/ioda: Define and implement DMA table/window management
callbacks
  vfio: powerpc/spapr: Define v2 IOMMU
  vfio: powerpc/spapr: powerpc/powernv/ioda2: Rework ownership
  vfio: powerpc/spapr: Support multiple groups in one container if
possible
  vfio: powerpc/spapr: Support Dynamic DMA windows

 Documentation/vfio.txt  |   38 +
 arch/powerpc/include/asm/iommu.h|  109 ++-
 arch/powerpc/include/asm/machdep.h  |   25 -
 arch/powerpc/kernel/iommu.c |  327 +
 arch/powerpc/kernel/vio.c   |5 +
 arch/powerpc/platforms/cell/iommu.c |8 +-
 arch/powerpc/platforms/pasemi/iommu.c   |7 +-
 arch/powerpc/platforms/powernv/pci-ioda.c   |  500 ++---
 

Re: [PATCH] KVM: make halt_poll_ns static

2015-03-09 Thread Christian Borntraeger
On 03.03.2015 at 22:46, Christian Borntraeger wrote:
[...]
 
 halt_poll_ns is used only locally. Make it static and remove 
 the initializer.
 
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index 34310a8..58bc2a9 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -66,7 +66,7 @@
  MODULE_AUTHOR("Qumranet");
  MODULE_LICENSE("GPL");
 
 -unsigned int halt_poll_ns = 0;
 +static unsigned int halt_poll_ns;
  module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
 
  /*
 
 
 Otherwise I will resend.

Ping. Do you want me to send that via my next pull request for s390 or shall I 
resend as a standalone patch for you to apply?

Christian



[PATCH v5 26/29] vfio: powerpc/spapr: Define v2 IOMMU

2015-03-09 Thread Alexey Kardashevskiy
The existing IOMMU code takes/releases ownership over the existing IOMMU
tables created by the platform code, i.e. the tables remain in memory
all the time. Also, the existing IOMMU requires a VFIO_IOMMU_ENABLE call to
start working as that is where we check the rlimit for locked pages.

The new IOMMU will take over the IOMMU group completely, which means the IOMMU
tables created by the platform code are going to be disposed of and VFIO
will create its own tables. Also, with the DMA memory pre-registration
feature, the userspace will not need to call VFIO_IOMMU_ENABLE as
the locked pages accounting will be done by
VFIO_IOMMU_SPAPR_REGISTER_MEMORY.

In order to inform the userspace that VFIO supports new capabilities,
this adds a new SPAPR TCE IOMMU v2 type.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 3 ++-
 include/uapi/linux/vfio.h   | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index e572c28..d665ddc 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -404,7 +404,7 @@ static void *tce_iommu_open(unsigned long arg)
 {
struct tce_container *container;
 
-   if (arg != VFIO_SPAPR_TCE_IOMMU) {
+	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
return ERR_PTR(-EINVAL);
}
@@ -588,6 +588,7 @@ static long tce_iommu_ioctl(void *iommu_data,
case VFIO_CHECK_EXTENSION:
switch (arg) {
case VFIO_SPAPR_TCE_IOMMU:
+   case VFIO_SPAPR_TCE_v2_IOMMU:
ret = 1;
break;
default:
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index b17e120..fbc5286 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -36,6 +36,8 @@
 /* Two-stage IOMMU */
 #define VFIO_TYPE1_NESTING_IOMMU   6   /* Implies v2 */
 
+#define VFIO_SPAPR_TCE_v2_IOMMU		7
+
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between
-- 
2.0.0
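
A minimal sketch of how userspace might probe for the new type (error
handling and group attachment are omitted; note VFIO_SET_IOMMU only succeeds
once a group has been added to the container):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int main(void)
{
	int container = open("/dev/vfio/vfio", O_RDWR);

	/* v2-aware userspace probes for the new type and falls back to v1 */
	if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU) == 1)
		ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);
	else
		ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
	return 0;
}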



[PATCH v5 19/29] powerpc/powernv/ioda2: Rework iommu_table creation

2015-03-09 Thread Alexey Kardashevskiy
This moves iommu_table creation to the beginning. This is a mechanical
patch.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3703098..d05abaf 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1429,27 +1429,31 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
addr = page_address(tce_mem);
memset(addr, 0, tce_table_size);
 
+	/* Setup iommu */
+	pe->table_group.tables[0].it_group = &pe->table_group;
+
+	/* Setup linux iommu table */
+	tbl = &pe->table_group.tables[0];
+	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
+			IOMMU_PAGE_SHIFT_4K);
+
+	tbl->it_ops = &pnv_ioda2_iommu_ops;
+	iommu_init_table(tbl, phb->hose->node);
+	pe->table_group.ops = &pnv_pci_ioda2_ops;
+
/*
 * Map TCE table through TVT. The TVE index is the PE number
 * shifted by 1 bit for 32-bits DMA space.
 */
	rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-			pe->pe_number << 1, 1, __pa(addr),
-			tce_table_size, 0x1000);
+			pe->pe_number << 1, 1, __pa(tbl->it_base),
+			tbl->it_size << 3, 1ULL << tbl->it_page_shift);
	if (rc) {
		pe_err(pe, "Failed to configure 32-bit TCE table,"
				" err %ld\n", rc);
		goto fail;
}
 
-   /* Setup iommu */
-	pe->table_group.tables[0].it_group = &pe->table_group;
-
-	/* Setup linux iommu table */
-	tbl = &pe->table_group.tables[0];
-   pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
-   IOMMU_PAGE_SHIFT_4K);
-
/* OPAL variant of PHB3 invalidated TCEs */
	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
if (swinvp) {
@@ -1463,14 +1467,12 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
8);
		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
}
-	tbl->it_ops = &pnv_ioda2_iommu_ops;
-	iommu_init_table(tbl, phb->hose->node);
-	pe->table_group.ops = &pnv_pci_ioda2_ops;
	iommu_register_group(&pe->table_group, phb->hose->global_number,
			pe->pe_number);
 
	if (pe->pdev)
-		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
+		set_iommu_table_base_and_group(&pe->pdev->dev,
+				&pe->table_group.tables[0]);
	else
		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
 
-- 
2.0.0



[PATCH v5 04/29] vfio: powerpc/spapr: Use it_page_size

2015-03-09 Thread Alexey Kardashevskiy
This makes use of the it_page_size from the iommu_table struct
as the page size can differ.

This replaces the missing IOMMU_PAGE_SHIFT macro in commented-out debug code
as the recently introduced IOMMU_PAGE_XXX macros do not include
IOMMU_PAGE_SHIFT.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: David Gibson da...@gibson.dropbear.id.au
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 91e7599..0e37400 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -99,7 +99,7 @@ static int tce_iommu_enable(struct tce_container *container)
 * enforcing the limit based on the max that the guest can map.
 */
	down_write(&current->mm->mmap_sem);
-	npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
+	npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
	locked = current->mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
@@ -128,7 +128,7 @@ static void tce_iommu_disable(struct tce_container 
*container)
 
	down_write(&current->mm->mmap_sem);
	current->mm->locked_vm -= (container->tbl->it_size <<
-			IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
+			container->tbl->it_page_shift) >> PAGE_SHIFT;
	up_write(&current->mm->mmap_sem);
 }
 
@@ -230,7 +230,7 @@ static long tce_iommu_build(struct tce_container *container,
tce, ret);
break;
}
-   tce += IOMMU_PAGE_SIZE_4K;
+   tce += IOMMU_PAGE_SIZE(tbl);
}
 
if (ret)
@@ -275,8 +275,8 @@ static long tce_iommu_ioctl(void *iommu_data,
		if (info.argsz < minsz)
return -EINVAL;
 
-		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K;
-		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K;
+		info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
+		info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
info.flags = 0;
 
		if (copy_to_user((void __user *)arg, &info, minsz))
@@ -306,8 +306,8 @@ static long tce_iommu_ioctl(void *iommu_data,
VFIO_DMA_MAP_FLAG_WRITE))
return -EINVAL;
 
-		if ((param.size & ~IOMMU_PAGE_MASK_4K) ||
-		    (param.vaddr & ~IOMMU_PAGE_MASK_4K))
+		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
+		    (param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
return -EINVAL;
 
/* iova is checked by the IOMMU API */
@@ -322,8 +322,8 @@ static long tce_iommu_ioctl(void *iommu_data,
return ret;
 
ret = tce_iommu_build(container, tbl,
-				param.iova >> IOMMU_PAGE_SHIFT_4K,
-				tce, param.size >> IOMMU_PAGE_SHIFT_4K);
+				param.iova >> tbl->it_page_shift,
+				tce, param.size >> tbl->it_page_shift);
 
iommu_flush_tce(tbl);
 
@@ -349,17 +349,17 @@ static long tce_iommu_ioctl(void *iommu_data,
if (param.flags)
return -EINVAL;
 
-		if (param.size & ~IOMMU_PAGE_MASK_4K)
+		if (param.size & ~IOMMU_PAGE_MASK(tbl))
return -EINVAL;
 
ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
-				param.size >> IOMMU_PAGE_SHIFT_4K);
+				param.size >> tbl->it_page_shift);
if (ret)
return ret;
 
ret = tce_iommu_clear(container, tbl,
-				param.iova >> IOMMU_PAGE_SHIFT_4K,
-				param.size >> IOMMU_PAGE_SHIFT_4K);
+				param.iova >> tbl->it_page_shift,
+				param.size >> tbl->it_page_shift);
iommu_flush_tce(tbl);
 
return ret;
-- 
2.0.0
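
A standalone sketch of the accounting arithmetic this patch switches to; the
window size in bytes is it_size entries of it_page_size each, converted to
system pages (all values below are made up for illustration):

#include <stdio.h>

int main(void)
{
	unsigned long it_size = 0x4000;		/* TCE entries (assumed) */
	unsigned int it_page_shift = 16;	/* 64K IOMMU pages (assumed) */
	unsigned int page_shift = 12;		/* 4K system pages */
	unsigned long bytes = it_size << it_page_shift;
	unsigned long npages = bytes >> page_shift;

	printf("window=%lu MiB, locked_vm charge=%lu system pages\n",
			bytes >> 20, npages);
	return 0;
}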



[PATCH v5 13/29] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group

2015-03-09 Thread Alexey Kardashevskiy
Modern IBM POWERPC systems support multiple (currently two) TCE tables
per IOMMU group (a.k.a. PE). This adds an iommu_table_group container
for TCE tables. Right now just one table is supported.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h|  18 +++--
 arch/powerpc/kernel/iommu.c |  34 
 arch/powerpc/platforms/powernv/pci-ioda.c   |  38 +
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  17 ++--
 arch/powerpc/platforms/powernv/pci.c|   2 +-
 arch/powerpc/platforms/powernv/pci.h|   4 +-
 arch/powerpc/platforms/pseries/iommu.c  |   9 ++-
 drivers/vfio/vfio_iommu_spapr_tce.c | 120 
 8 files changed, 160 insertions(+), 82 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index eb75726..667aa1a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -90,9 +90,7 @@ struct iommu_table {
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map;   /* A simple allocation bitmap for now */
unsigned long  it_page_shift;/* table iommu page size */
-#ifdef CONFIG_IOMMU_API
-   struct iommu_group *it_group;
-#endif
+   struct iommu_table_group *it_group;
struct iommu_table_ops *it_ops;
void (*set_bypass)(struct iommu_table *tbl, bool enable);
 };
@@ -126,14 +124,24 @@ extern void iommu_free_table(struct iommu_table *tbl, 
const char *node_name);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
int nid);
+
+#define IOMMU_TABLE_GROUP_MAX_TABLES   1
+
+struct iommu_table_group {
 #ifdef CONFIG_IOMMU_API
-extern void iommu_register_group(struct iommu_table *tbl,
+   struct iommu_group *group;
+#endif
+   struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+};
+
+#ifdef CONFIG_IOMMU_API
+extern void iommu_register_group(struct iommu_table_group *table_group,
 int pci_domain_number, unsigned long pe_num);
 extern int iommu_add_device(struct device *dev);
 extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
 #else
-static inline void iommu_register_group(struct iommu_table *tbl,
+static inline void iommu_register_group(struct iommu_table_group *table_group,
int pci_domain_number,
unsigned long pe_num)
 {
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index dbc5d8d..7794dce 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -712,17 +712,20 @@ struct iommu_table *iommu_init_table(struct iommu_table 
*tbl, int nid)
 
 struct iommu_table *iommu_table_alloc(int node)
 {
-   struct iommu_table *tbl;
+   struct iommu_table_group *table_group;
 
-   tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
+   table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
+  node);
+	table_group->tables[0].it_group = table_group;

-	return tbl;
+	return &table_group->tables[0];
 }
 
 void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 {
unsigned long bitmap_sz;
unsigned int order;
+	struct iommu_table_group *table_group = tbl->it_group;
 
	if (!tbl || !tbl->it_map) {
		printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
@@ -738,9 +741,9 @@ void iommu_free_table(struct iommu_table *tbl, const char 
*node_name)
		clear_bit(0, tbl->it_map);
 
 #ifdef CONFIG_IOMMU_API
-	if (tbl->it_group) {
-		iommu_group_put(tbl->it_group);
-		BUG_ON(tbl->it_group);
+	if (table_group->group) {
+		iommu_group_put(table_group->group);
+		BUG_ON(table_group->group);
}
 #endif
 
@@ -756,7 +759,7 @@ void iommu_free_table(struct iommu_table *tbl, const char 
*node_name)
	free_pages((unsigned long) tbl->it_map, order);
 
/* free table */
-   kfree(tbl);
+   kfree(table_group);
 }
 
 /* Creates TCEs for a user provided buffer.  The user buffer must be
@@ -888,11 +891,12 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t 
size,
  */
 static void group_release(void *iommu_data)
 {
-	struct iommu_table *tbl = iommu_data;
-	tbl->it_group = NULL;
+	struct iommu_table_group *table_group = iommu_data;
+
+	table_group->group = NULL;
 }
 
-void iommu_register_group(struct iommu_table *tbl,
+void iommu_register_group(struct iommu_table_group *table_group,
int pci_domain_number, unsigned long pe_num)
 {
struct iommu_group *grp;
@@ -904,8 +908,8 @@ void iommu_register_group(struct iommu_table *tbl,
PTR_ERR(grp));
return;
}
-	tbl->it_group = grp;
-   
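With the group payload now being an iommu_table_group, a consumer looks it
up first and then picks a table; roughly (a fragment, error handling
omitted):

struct iommu_table_group *table_group = iommu_group_get_iommudata(grp);
struct iommu_table *tbl = &table_group->tables[0];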

[PATCH v5 08/29] vfio: powerpc/spapr: Register memory

2015-03-09 Thread Alexey Kardashevskiy
The existing implementation accounts the whole DMA window in
the locked_vm counter which is going to be even worse with multiple
containers and huge DMA windows.

This introduces 2 ioctls to register/unregister DMA memory which
receive user space address and size of a memory region which
needs to be pinned/unpinned and counted in locked_vm.

If any memory region was registered, all subsequent DMA map requests
should address already pinned memory. If no memory was registered,
then the amount of memory required for a single default DMA window will be
accounted for when the container is enabled, and every map/unmap will pin/unpin
a page (with degraded performance).

Dynamic DMA window and in-kernel acceleration will require memory to
be preregistered in order to work.

The accounting is done per VFIO container. When support for
multiple groups per container is added, we will have more accurate locked_vm
accounting.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v4:
* updated docs
* s/kzmalloc/vzalloc/
* in tce_pin_pages()/tce_unpin_pages() removed @vaddr, @size and
replaced offset with index
* renamed vfio_iommu_type_register_memory to vfio_iommu_spapr_register_memory
and removed duplicating vfio_iommu_spapr_register_memory
---
 Documentation/vfio.txt  |  19 +++
 drivers/vfio/vfio_iommu_spapr_tce.c | 272 +++-
 include/uapi/linux/vfio.h   |  25 
 3 files changed, 310 insertions(+), 6 deletions(-)

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 96978ec..791e85c 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -427,6 +427,25 @@ The code flow from the example above should be slightly 
changed:
 

 
+5) PPC64 paravirtualized guests may generate a lot of map/unmap requests,
+and the handling of those includes pinning/unpinning pages and updating
+mm::locked_vm counter to make sure we do not exceed the rlimit. Handling these
+in real-mode is quite expensive and may fail. In order to simplify in-kernel
+acceleration of map/unmap requests, two ioctls have been added to pre-register
+and unregister guest RAM pages where DMA can possibly happen. With these
+calls, the userspace and in-kernel handlers do not have to take care of
+pinning or accounting.
+
+The ioctls are VFIO_IOMMU_SPAPR_REGISTER_MEMORY and
+VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY.
+These receive a user space address and size of the block to be pinned.
+Bisecting is not supported and VFIO_IOMMU_UNREGISTER_MEMORY is expected to
+be called with the exact address and size used for registering
+the memory block.
+
+The user space is not expected to call these often and the block descriptors
+are stored in a linked list in the kernel.
+
 ---
 
 [1] VFIO was originally an acronym for Virtual Function I/O in its
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index ca396e5..f0dfd95 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -21,6 +21,7 @@
 #include <linux/uaccess.h>
 #include <linux/err.h>
 #include <linux/vfio.h>
+#include <linux/vmalloc.h>
 #include <asm/iommu.h>
 #include <asm/tce.h>
 
@@ -93,8 +94,196 @@ struct tce_container {
struct iommu_table *tbl;
bool enabled;
unsigned long locked_pages;
+   struct list_head mem_list;
 };
 
+struct tce_memory {
+   struct list_head next;
+   struct rcu_head rcu;
+   __u64 vaddr;
+   __u64 size;
+   __u64 hpas[];
+};
+
+static inline bool tce_preregistered(struct tce_container *container)
+{
+	return !list_empty(&container->mem_list);
+}
+
+static struct tce_memory *tce_mem_alloc(struct tce_container *container,
+   __u64 vaddr, __u64 size)
+{
+   struct tce_memory *mem;
+   long ret;
+
+	ret = try_increment_locked_vm(size >> PAGE_SHIFT);
+   if (ret)
+   return NULL;
+
+	mem = vzalloc(sizeof(*mem) + (size >> (PAGE_SHIFT - 3)));
+	if (!mem) {
+		decrement_locked_vm(size >> PAGE_SHIFT);
+   return NULL;
+   }
+
+	mem->vaddr = vaddr;
+	mem->size = size;
+
+	list_add_rcu(&mem->next, &container->mem_list);
+
+   return mem;
+}
+
+static void release_tce_memory(struct rcu_head *head)
+{
+   struct tce_memory *mem = container_of(head, struct tce_memory, rcu);
+
+   vfree(mem);
+}
+
+static void tce_mem_free(struct tce_memory *mem)
+{
+	decrement_locked_vm(mem->size);
+	list_del_rcu(&mem->next);
+	call_rcu(&mem->rcu, release_tce_memory);
+}
+
+static struct tce_memory *tce_pinned_desc(struct tce_container *container,
+   __u64 vaddr, __u64 size)
+{
+   struct tce_memory *mem, *ret = NULL;
+
+   rcu_read_lock();
+	vaddr &= ~(TCE_PCI_READ | TCE_PCI_WRITE);
+	list_for_each_entry_rcu(mem, &container->mem_list, next) {
+		if ((mem->vaddr <= vaddr) &&
+  

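A sketch of the userspace side, assuming the uapi struct added by this patch
carries argsz/flags/vaddr/size fields (matching the layout that eventually
went upstream); container and group setup are omitted:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int register_ram(int container, void *buf, uint64_t len)
{
	struct vfio_iommu_spapr_register_memory reg = {
		.argsz = sizeof(reg),
		.flags = 0,
		.vaddr = (uint64_t)(uintptr_t)buf,
		.size  = len,
	};

	/* every later DMA map must fall inside a registered block */
	return ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
}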
[PATCH v5 05/29] vfio: powerpc/spapr: Move locked_vm accounting to helpers

2015-03-09 Thread Alexey Kardashevskiy
This moves locked pages accounting to helpers.
Later they will be reused for Dynamic DMA windows (DDW).

This reworks debug messages to show the current value and the limit.

This stores the number of locked pages in the container so the iommu
table pointer won't be needed when unlocking. This does not have an effect
now but it will with multiple tables per container, as then we will
allow attaching/detaching groups on the fly and we may end up having
a container with no group attached but with the counter incremented.

While we are here, update the comment explaining why RLIMIT_MEMLOCK
might be required to be bigger than the guest RAM. This also prints
the pid of the current process in pr_warn/pr_debug.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v4:
* new helpers do nothing if @npages == 0
* tce_iommu_disable() now can decrement the counter if the group was
detached (not possible now but will be in the future)
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 82 -
 1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 0e37400..432a0de 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -31,6 +31,51 @@
 static void tce_iommu_detach_group(void *iommu_data,
struct iommu_group *iommu_group);
 
+static long try_increment_locked_vm(long npages)
+{
+   long ret = 0, locked, lock_limit;
+
+	if (!current || !current->mm)
+   return -ESRCH; /* process exited */
+
+   if (!npages)
+   return 0;
+
+	down_write(&current->mm->mmap_sem);
+	locked = current->mm->locked_vm + npages;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+		ret = -ENOMEM;
+	else
+		current->mm->locked_vm += npages;
+
+	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
+			npages << PAGE_SHIFT,
+			current->mm->locked_vm << PAGE_SHIFT,
+			rlimit(RLIMIT_MEMLOCK),
+			ret ? " - exceeded" : "");
+
+	up_write(&current->mm->mmap_sem);
+
+   return ret;
+}
+
+static void decrement_locked_vm(long npages)
+{
+	if (!current || !current->mm || !npages)
+		return; /* process exited */
+
+	down_write(&current->mm->mmap_sem);
+	if (npages > current->mm->locked_vm)
+		npages = current->mm->locked_vm;
+	current->mm->locked_vm -= npages;
+	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
+			npages << PAGE_SHIFT,
+			current->mm->locked_vm << PAGE_SHIFT,
+			rlimit(RLIMIT_MEMLOCK));
+	up_write(&current->mm->mmap_sem);
+}
+
 /*
  * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  *
@@ -47,6 +92,7 @@ struct tce_container {
struct mutex lock;
struct iommu_table *tbl;
bool enabled;
+   unsigned long locked_pages;
 };
 
 static bool tce_page_is_contained(struct page *page, unsigned page_shift)
@@ -68,7 +114,7 @@ static bool tce_page_is_contained(struct page *page, 
unsigned page_shift)
 static int tce_iommu_enable(struct tce_container *container)
 {
int ret = 0;
-   unsigned long locked, lock_limit, npages;
+   unsigned long locked;
	struct iommu_table *tbl = container->tbl;

	if (!container->tbl)
@@ -97,21 +143,22 @@ static int tce_iommu_enable(struct tce_container 
*container)
 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
 * that would effectively kill the guest at random points, much better
 * enforcing the limit based on the max that the guest can map.
+*
+* Unfortunately at the moment it counts whole tables, no matter how
+* much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
+* each with 2GB DMA window, 8GB will be counted here. The reason for
+* this is that we cannot tell here the amount of RAM used by the guest
+* as this information is only available from KVM and VFIO is
+* KVM agnostic.
 */
-	down_write(&current->mm->mmap_sem);
-	npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
-	locked = current->mm->locked_vm + npages;
-	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
-		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
-				rlimit(RLIMIT_MEMLOCK));
-		ret = -ENOMEM;
-	} else {
+	locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+   ret = try_increment_locked_vm(locked);
+   if (ret)
+   return ret;
 
-		current->mm->locked_vm += npages;
-		container->enabled = true;
-	}
-	up_write(&current->mm->mmap_sem);
+	container->locked_pages = locked;
+
+ 

[PATCH v5 09/29] vfio: powerpc/spapr: Rework attach/detach

2015-03-09 Thread Alexey Kardashevskiy
This is to make extended ownership and multiple groups support patches
simpler for review.

This is a mechanical patch.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 38 ++---
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index f0dfd95..235d915 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -752,16 +752,21 @@ static int tce_iommu_attach_group(void *iommu_data,
				iommu_group_id(container->tbl->it_group),
				iommu_group_id(iommu_group));
ret = -EBUSY;
-   } else if (container-enabled) {
+   goto unlock_exit;
+   }
+
+   if (container-enabled) {
		pr_err("tce_vfio: attaching group #%u to enabled container\n",
iommu_group_id(iommu_group));
ret = -EBUSY;
-   } else {
-   ret = iommu_take_ownership(tbl);
-   if (!ret)
-			container->tbl = tbl;
+   goto unlock_exit;
}
 
+   ret = iommu_take_ownership(tbl);
+   if (!ret)
+		container->tbl = tbl;
+
+unlock_exit:
	mutex_unlock(&container->lock);
 
return ret;
@@ -779,18 +784,21 @@ static void tce_iommu_detach_group(void *iommu_data,
		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
				iommu_group_id(iommu_group),
				iommu_group_id(tbl->it_group));
-   } else {
-		if (container->enabled) {
-			pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
-					iommu_group_id(tbl->it_group));
-			tce_iommu_disable(container);
-   }
+   goto unlock_exit;
+   }
 
-		/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
-				iommu_group_id(iommu_group), iommu_group); */
-		container->tbl = NULL;
-   iommu_release_ownership(tbl);
+	if (container->enabled) {
+		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
+				iommu_group_id(tbl->it_group));
+   tce_iommu_disable(container);
}
+
+	/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+			iommu_group_id(iommu_group), iommu_group); */
+	container->tbl = NULL;
+   iommu_release_ownership(tbl);
+
+unlock_exit:
	mutex_unlock(&container->lock);
 }
 
-- 
2.0.0



Re: [PATCH 5/6] target-arm/kvm64: fix save/restore of SPSR regs

2015-03-09 Thread Peter Maydell
On 9 March 2015 at 21:56, Christoffer Dall christoffer.d...@linaro.org wrote:
 this function, however, is not used only when migration, but should
 generally cover the case where you want to synchronize QEMU's state into
 KVM's state, right?  So while it may not be harmful in currently
 supported use cases, is there ever a situation where (is_a64(env) && el
 == 0) and env->spsr != banked_spsr[el], and where env->spsr is
 out-of-date?

If EL == 0 then you can't access any SPSR, so env->spsr is by
definition out of date.

-- PMM


Re: [PATCH 5/6] target-arm/kvm64: fix save/restore of SPSR regs

2015-03-09 Thread Christoffer Dall
On Tue, Mar 03, 2015 at 11:28:21AM +0000, Alex Bennée wrote:
 
 Christoffer Dall christoffer.d...@linaro.org writes:
 
  Hi Alex,
 
  Seems like you accidentally sent out two copies of this patch, hopefully
  I'm reviewing the right one...
 
 Oops, perils of not wiping your output directory. I think it was just a
 title tweak!
 
  On Wed, Feb 25, 2015 at 04:02:38PM +, Alex Bennée wrote:
  From: Christoffer Dall christoffer.d...@linaro.org
  
  The current code was negatively indexing the cpu state array and not
  synchronizing banked spsr register state with the current mode's spsr
  state, causing occasional failures with migration.
  
  Some munging is done to take care of the aarch64 mapping and also to
  ensure the most current value of the spsr is updated to the banked
  registers (relevant for KVM-TCG migration).
  
  Signed-off-by: Christoffer Dall christoffer.d...@linaro.org
  Signed-off-by: Alex Bennée alex.ben...@linaro.org
  
  ---
  v2 (ajb)
- minor tweaks and clarifications
  
  diff --git a/target-arm/kvm64.c b/target-arm/kvm64.c
  index c60e989..1e36b0a 100644
  --- a/target-arm/kvm64.c
  +++ b/target-arm/kvm64.c
  @@ -140,6 +140,7 @@ int kvm_arch_put_registers(CPUState *cs, int level)
   uint64_t val;
   int i;
   int ret;
  +unsigned int el;
   
   ARMCPU *cpu = ARM_CPU(cs);
   CPUARMState *env = cpu-env;
  @@ -206,9 +207,25 @@ int kvm_arch_put_registers(CPUState *cs, int level)
   return ret;
   }
   
  +/* Saved Program State Registers
  + *
  + * Before we restore from the banked_spsr[] array we need to
  + * ensure that any modifications to env-spsr are correctly
  + * reflected and map aarch64 exception levels if required.
  + */
  +el = arm_current_el(env);
   +if (is_a64(env) && el > 0) {
   +g_assert(el == 1);
   +/* KVM maps KVM_SPSR_SVC to KVM_SPSR_EL1 for aarch64 */
   +env->banked_spsr[1] = env->banked_spsr[0];
   +env->banked_spsr[aarch64_banked_spsr_index(el)] = env->spsr;
   +} else {
   +env->banked_spsr[el] = env->spsr;
 
  is this valid if (is_a64(env) && el == 0)) ?  I thought that if you're
  in el == 0, then env->banked_spsr[x] is the most up-to-date one, not
  env->spsr ?
 
 Actually they will both be the same (at least for KVM). In the end both:
 
 VMSTATE_UINT32(env.spsr, ARMCPU),
 VMSTATE_UINT64_ARRAY(env.banked_spsr, ARMCPU, 8),
 
 get serialised in the stream and when we save the stream out we make
 sure everything is in sync (i.e. env->spsr is correct). In reality this
 makes more sense for TCG than KVM which is the only reason env->spsr
 exists.
 

this function, however, is not used only when migration, but should
generally cover the case where you want to synchronize QEMU's state into
KVM's state, right?  So while it may not be harmful in currently
supported use cases, is there ever a situation where (is_a64(env) && el
== 0) and env->spsr != banked_spsr[el], and where env->spsr is
out-of-date?

If that's the case, I think it would be better to have an assert that
says don't call the code in this situation than relying on limited use
cases for callers of this function.

 
  for !is_a64(env) this looks wrong, because of the same as above if el ==
  0, but also because I think you need
  bank_number(env->uncached_cpsr & CPSR_M) to index into the array.
 
 
 Good catch. For some reason I was treating the number from
 arm_current_el() as equivalent. How about:
 
 el = arm_current_el(env);
 if (is_a64(env) && el > 0) {
 g_assert(el == 1);
 /* KVM only maps KVM_SPSR_SVC to KVM_SPSR_EL1 for aarch64 ATM */
 env->banked_spsr[1] = env->banked_spsr[0];
 }
 i = is_a64(env) ?
 aarch64_banked_spsr_index(el) : bank_number(env->uncached_cpsr &
 CPSR_M);

I think that will fail due to the assert in aarch64_banked_spsr_index()
for el == 0.


-Christoffer


[PATCH v5 22/29] powerpc/iommu: Split iommu_free_table into 2 helpers

2015-03-09 Thread Alexey Kardashevskiy
The iommu_free_table helper releases the memory it is using (the TCE table and
@it_map) and releases the iommu_table struct as well. We might not want
the very last step as we store iommu_table structs in parent structures.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h |  1 +
 arch/powerpc/kernel/iommu.c  | 57 
 2 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 6af22a4..fd118ea 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -125,6 +125,7 @@ static inline void *get_iommu_table_base(struct device *dev)
 
 extern struct iommu_table *iommu_table_alloc(int node);
 /* Frees table for an individual device node */
+extern void iommu_reset_table(struct iommu_table *tbl, const char *node_name);
 extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
 
 /* Initializes an iommu_table based in values set in the passed-in
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index f0f60ec..1440c6a 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -721,24 +721,46 @@ struct iommu_table *iommu_table_alloc(int node)
	return &table_group->tables[0];
 }
 
+void iommu_reset_table(struct iommu_table *tbl, const char *node_name)
+{
+   if (!tbl)
+   return;
+
+	if (tbl->it_map) {
+   unsigned long bitmap_sz;
+   unsigned int order;
+
+		/*
+		 * In case we have reserved the first bit, we should not emit
+		 * the warning below.
+		 */
+		if (tbl->it_offset == 0)
+			clear_bit(0, tbl->it_map);
+
+		/* verify that table contains no entries */
+		if (!bitmap_empty(tbl->it_map, tbl->it_size))
+			pr_warn("%s: Unexpected TCEs for %s\n", __func__,
+   node_name);
+
+   /* calculate bitmap size in bytes */
+		bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
+
+   /* free bitmap */
+   order = get_order(bitmap_sz);
+		free_pages((unsigned long) tbl->it_map, order);
+   }
+
+   memset(tbl, 0, sizeof(*tbl));
+}
+
 void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 {
-   unsigned long bitmap_sz;
-   unsigned int order;
	struct iommu_table_group *table_group = tbl->it_group;
 
-	if (!tbl || !tbl->it_map) {
-		printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
-   node_name);
+   if (!tbl)
return;
-   }
 
-   /*
-* In case we have reserved the first bit, we should not emit
-* the warning below.
-*/
-	if (tbl->it_offset == 0)
-		clear_bit(0, tbl->it_map);
+   iommu_reset_table(tbl, node_name);
 
 #ifdef CONFIG_IOMMU_API
	if (table_group->group) {
@@ -747,17 +769,6 @@ void iommu_free_table(struct iommu_table *tbl, const char 
*node_name)
}
 #endif
 
-   /* verify that table contains no entries */
-	if (!bitmap_empty(tbl->it_map, tbl->it_size))
-		pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
-
-   /* calculate bitmap size in bytes */
-	bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
-
-   /* free bitmap */
-   order = get_order(bitmap_sz);
-	free_pages((unsigned long) tbl->it_map, order);
-
/* free table */
kfree(table_group);
 }
-- 
2.0.0
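
The intended usage split, roughly (a fragment; the table_group/tbl variables
and node names are illustrative):

/* table embedded in a parent structure: wipe the contents, keep the parent */
iommu_reset_table(&table_group->tables[0], "ioda2");

/* standalone table: reset and free the backing allocation, as before */
iommu_free_table(tbl, "node");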



[PATCH v5 21/29] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window

2015-03-09 Thread Alexey Kardashevskiy
This is a part of moving DMA window programming to an iommu_ops
callback.

This is a mechanical patch.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 85 ---
 1 file changed, 56 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index fae8cf6..126d803 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1389,6 +1389,57 @@ static void pnv_pci_free_table(struct iommu_table *tbl)
memset(tbl, 0, sizeof(struct iommu_table));
 }
 
+static long pnv_pci_ioda2_set_window(struct pnv_ioda_pe *pe,
+   struct iommu_table *tbl)
+{
+	struct pnv_phb *phb = pe->phb;
+	const __be64 *swinvp;
+	int64_t rc;
+	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
+	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+
+	pe_info(pe, "Setting up window at %llx..%llx pagesize=0x%x tablesize=0x%lx\n",
+			start_addr, start_addr + win_size - 1,
+			1UL << tbl->it_page_shift, tbl->it_size << 3);
+
+	pe->table_group.tables[0] = *tbl;
+	tbl = &pe->table_group.tables[0];
+	tbl->it_group = &pe->table_group;
+
+   /*
+* Map TCE table through TVT. The TVE index is the PE number
+* shifted by 1 bit for 32-bits DMA space.
+*/
+	rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+			pe->pe_number << 1, 1, __pa(tbl->it_base),
+			tbl->it_size << 3, 1ULL << tbl->it_page_shift);
+	if (rc) {
+		pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
+   goto fail;
+   }
+
+   /* OPAL variant of PHB3 invalidated TCEs */
+	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
+   if (swinvp) {
+   /* We need a couple more fields -- an address and a data
+* to or.  Since the bus is only printed out on table free
+* errors, and on the first pass the data will be a relative
+* bus number, print that out instead.
+*/
+		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
+		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
+				8);
+		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+   }
+
+   return 0;
+fail:
+	if (pe->tce32_seg >= 0)
+		pe->tce32_seg = -1;
+
+   return rc;
+}
+
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
	uint16_t window_id = (pe->pe_number << 1 ) + 1;
@@ -1459,7 +1510,6 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
   struct pnv_ioda_pe *pe)
 {
-   const __be64 *swinvp;
unsigned int end;
	struct iommu_table *tbl = &pe->table_group.tables[0];
int64_t rc;
@@ -1486,31 +1536,14 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
	pe->table_group.tables[0].it_group = &pe->table_group;
	pe->table_group.ops = &pnv_pci_ioda2_ops;
 
-   /*
-* Map TCE table through TVT. The TVE index is the PE number
-* shifted by 1 bit for 32-bits DMA space.
-*/
-	rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-			pe->pe_number << 1, 1, __pa(tbl->it_base),
-			tbl->it_size << 3, 1ULL << tbl->it_page_shift);
+	rc = pnv_pci_ioda2_set_window(pe, tbl);
	if (rc) {
		pe_err(pe, "Failed to configure 32-bit TCE table,"
				" err %ld\n", rc);
-   goto fail;
-   }
-
-   /* OPAL variant of PHB3 invalidated TCEs */
-	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
-   if (swinvp) {
-   /* We need a couple more fields -- an address and a data
-* to or.  Since the bus is only printed out on table free
-* errors, and on the first pass the data will be a relative
-* bus number, print that out instead.
-*/
-   pe-tce_inval_reg_phys = be64_to_cpup(swinvp);
-   tbl-it_index = (unsigned long)ioremap(pe-tce_inval_reg_phys,
-   8);
-   tbl-it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+   pnv_pci_free_table(tbl);
+   if (pe-tce32_seg = 0)
+   pe-tce32_seg = -1;
+   return;
}
iommu_register_group(pe-table_group, phb-hose-global_number,
pe-pe_number);
@@ -1524,12 +1557,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
/* Also create a bypass window */
if (!pnv_iommu_bypass_disabled)

[PATCH v5 17/29] powerpc/powernv/ioda/ioda2: Rework tce_build()/tce_free()

2015-03-09 Thread Alexey Kardashevskiy
The pnv_pci_ioda_tce_invalidate() helper invalidates the TCE cache. It is
supposed to be called on IODA1/2 and not called on p5ioc2. It receives
start and end host addresses of the TCE table. This approach makes it
possible for pnv_pci_ioda_tce_invalidate() to be called unintentionally
on p5ioc2. Another issue is that IODA2 needs PCI addresses to invalidate
the cache, and while those can be calculated from host addresses, since
we are going to implement multi-level TCE tables, calculating a PCI
address from a host address might get either tricky or ugly as the TCE
table remains flat on the PCI bus but not in RAM.

This defines separate iommu_table_ops callbacks for p5ioc2 and IODA1/2
PHBs. They all call the common pnv_tce_build/pnv_tce_free/pnv_tce_get
helpers but call a PHB-specific TCE invalidation helper (when needed).

This changes pnv_pci_ioda2_tce_invalidate() to receive a TCE index and
a number of pages, which are PCI addresses shifted by the IOMMU page
shift.

The patch is pretty mechanical and behaviour is not expected to change.
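
To illustrate the new calling convention, a minimal sketch of how a PCI
address range translates to the (index, npages) pair the reworked helpers
take (the helper name is illustrative, not from this patch):

/* Sketch: PCI bus address range -> first TCE index and entry count. */
static void ioba_to_index_range(struct iommu_table *tbl,
		unsigned long ioba, unsigned long size,
		unsigned long *index, unsigned long *npages)
{
	*index = ioba >> tbl->it_page_shift;	/* first TCE entry */
	*npages = size >> tbl->it_page_shift;	/* number of entries */
}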

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/platforms/powernv/pci-ioda.c   | 92 ++---
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  9 ++-
 arch/powerpc/platforms/powernv/pci.c| 76 +---
 arch/powerpc/platforms/powernv/pci.h|  7 ++-
 4 files changed, 111 insertions(+), 73 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 67a3fe4..43a7cce 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1067,18 +1067,20 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
 	}
 }
 
-static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
-					 struct iommu_table *tbl,
-					 __be64 *startp, __be64 *endp, bool rm)
+static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
+		unsigned long index, unsigned long npages, bool rm)
 {
+	struct pnv_ioda_pe *pe = container_of(tbl->it_group,
+			struct pnv_ioda_pe, table_group);
 	__be64 __iomem *invalidate = rm ?
 		(__be64 __iomem *)pe->tce_inval_reg_phys :
 		(__be64 __iomem *)tbl->it_index;
 	unsigned long start, end, inc;
 	const unsigned shift = tbl->it_page_shift;
 
-	start = __pa(startp);
-	end = __pa(endp);
+	start = __pa((__be64 *)tbl->it_base + index - tbl->it_offset);
+	end = __pa((__be64 *)tbl->it_base + index - tbl->it_offset +
+			npages - 1);
 
 	/* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
 	if (tbl->it_busno) {
@@ -1114,10 +1116,40 @@ static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
 	 */
 }
 
-static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
-					 struct iommu_table *tbl,
-					 __be64 *startp, __be64 *endp, bool rm)
+static int pnv_ioda1_tce_build_vm(struct iommu_table *tbl, long index,
+		long npages, unsigned long uaddr,
+		enum dma_data_direction direction,
+		struct dma_attrs *attrs)
 {
+	long ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
+			attrs);
+
+	if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
+		pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+
+	return ret;
+}
+
+static void pnv_ioda1_tce_free_vm(struct iommu_table *tbl, long index,
+		long npages)
+{
+	pnv_tce_free(tbl, index, npages);
+
+	if (tbl->it_type & TCE_PCI_SWINV_FREE)
+		pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+}
+
+struct iommu_table_ops pnv_ioda1_iommu_ops = {
+	.set = pnv_ioda1_tce_build_vm,
+	.clear = pnv_ioda1_tce_free_vm,
+	.get = pnv_tce_get,
+};
+
+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
+		unsigned long index, unsigned long npages, bool rm)
+{
+	struct pnv_ioda_pe *pe = container_of(tbl->it_group,
+			struct pnv_ioda_pe, table_group);
 	unsigned long start, end, inc;
 	__be64 __iomem *invalidate = rm ?
 		(__be64 __iomem *)pe->tce_inval_reg_phys :
@@ -1130,9 +1162,9 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
 	end = start;
 
 	/* Figure out the start, end and step */
-	inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64));
+	inc = tbl->it_offset + index / sizeof(u64);
 	start |= (inc << shift);
-	inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64));
+	inc = tbl->it_offset + (index + npages - 1) / sizeof(u64);
 	end |= (inc << shift);
 	inc = (0x1ull << shift);
 	mb();
@@ -1146,19 +1178,35 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe

[PATCH v5 18/29] powerpc/iommu/powernv: Release replaced TCE

2015-03-09 Thread Alexey Kardashevskiy
At the moment writing a new TCE value to the IOMMU table fails with EBUSY
if there is a valid entry already. However the PAPR specification allows
the guest to write a new TCE value without clearing it first.

Another problem this patch is addressing is the use of pool locks for
external IOMMU users such as VFIO. The pool locks are there to protect
the DMA page allocator rather than the table entries, and since the host
kernel does not control what pages are in use, there is no point in pool
locks; exchange()+put_page(oldtce) is sufficient to avoid possible races.

This adds an exchange() callback to iommu_table_ops which does the same
thing as set() plus it returns the replaced TCE and DMA direction so
the caller can release the pages afterwards.

The returned old TCE value is a virtual address, just like the new TCE
value. This is different from tce_clear(), which returns a physical address.

This implements exchange() for P5IOC2/IODA/IODA2. This adds a requirement
for a platform to have exchange() implemented in order to support VFIO.

This replaces iommu_tce_build() and iommu_clear_tce() with
a single iommu_tce_xchg().

This makes sure that TCE permission bits are not set in the TCE passed to
the IOMMU API, as those are to be calculated by platform code from the DMA
direction.

This moves SetPageDirty() to the IOMMU code to make it work for both
the VFIO ioctl interface and in-kernel TCE acceleration (when the latter
becomes available later).

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h| 15 +--
 arch/powerpc/kernel/iommu.c | 53 +++-
 arch/powerpc/platforms/powernv/pci-ioda.c   | 26 
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  1 +
 arch/powerpc/platforms/powernv/pci.c| 15 +++
 arch/powerpc/platforms/powernv/pci.h|  2 +
 drivers/vfio/vfio_iommu_spapr_tce.c | 62 ++---
 7 files changed, 116 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index d1f8c6c..6af22a4 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -44,11 +44,20 @@ extern int iommu_is_off;
 extern int iommu_force_on;
 
 struct iommu_table_ops {
+   /* When called with direction==DMA_NONE, it is equal to clear() */
int (*set)(struct iommu_table *tbl,
long index, long npages,
unsigned long uaddr,
enum dma_data_direction direction,
struct dma_attrs *attrs);
+   /*
+* Exchanges existing TCE with new TCE plus direction bits;
+* returns old TCE and DMA direction mask
+*/
+   int (*exchange)(struct iommu_table *tbl,
+   long index,
+   unsigned long *tce,
+   enum dma_data_direction *direction);
void (*clear)(struct iommu_table *tbl,
long index, long npages);
unsigned long (*get)(struct iommu_table *tbl, long index);
@@ -231,10 +240,8 @@ extern int iommu_tce_clear_param_check(struct iommu_table *tbl,
 		unsigned long npages);
 extern int iommu_tce_put_param_check(struct iommu_table *tbl,
 		unsigned long ioba, unsigned long tce);
-extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
-		unsigned long hwaddr, enum dma_data_direction direction);
-extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
-		unsigned long entry);
+extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
+		unsigned long *tce, enum dma_data_direction *direction);
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
 extern int iommu_take_ownership(struct iommu_table_group *table_group);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index df888a7..f0f60ec 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -982,9 +982,6 @@ EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check);
 int iommu_tce_put_param_check(struct iommu_table *tbl,
 		unsigned long ioba, unsigned long tce)
 {
-	if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
-		return -EINVAL;
-
 	if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ))
 		return -EINVAL;
 
@@ -1002,44 +999,30 @@ int iommu_tce_put_param_check(struct iommu_table *tbl,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
 
-unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
+static void iommu_tce_mk_dirty(unsigned long tce)
 {
-	unsigned long oldtce;
-	struct iommu_pool *pool = get_pool(tbl, entry);
+	struct page *pg = pfn_to_page(__pa(tce) >> PAGE_SHIFT);
 
-	spin_lock(&(pool->lock));
-
-	oldtce = tbl->it_ops->get(tbl, entry);
-	if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
-		tbl->it_ops->clear(tbl, entry, 1);
-	else
-

[PATCH v5 27/29] vfio: powerpc/spapr: powerpc/powernv/ioda2: Rework ownership

2015-03-09 Thread Alexey Kardashevskiy
Previously the IOMMU user would take control over the IOMMU table belonging
to a specific IOMMU group. This approach did not allow sharing tables between
IOMMU groups attached to the same container.

This introduces a new IOMMU ownership flavour in which the user does not
just control the existing IOMMU table but can also remove/create tables on
demand. If an IOMMU supports a set_ownership() callback, that lets the user
have full control over the IOMMU group. When the ownership is taken,
the platform code removes all the windows, so the caller must create them.
Before returning the ownership back to the platform code, the user
has to unprogram and remove all the tables it created.

Old-style ownership is still supported allowing VFIO to run on older
P5IOC2 and IODA IO controllers.
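
The release path described above can be sketched as follows (a sketch
only: it conflates the VFIO and platform sides for brevity and assumes
the IODA2 helpers introduced earlier in this series):

/* Sketch: unprogram and free user-created windows before handing back. */
static void give_ownership_back(struct pnv_ioda_pe *pe)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!pe->table_group.tables[i].it_size)
			continue;	/* slot was never programmed */
		pnv_pci_ioda2_unset_window(&pe->table_group, i);
		pnv_pci_free_table(&pe->table_group.tables[i]);
	}
	pe->table_group.ops->set_ownership(&pe->table_group, false);
}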

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 30 +++---
 drivers/vfio/vfio_iommu_spapr_tce.c   | 51 ---
 2 files changed, 66 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 07857c4..afb6906 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1620,11 +1620,33 @@ static void pnv_ioda2_set_ownership(struct iommu_table_group *table_group,
 {
 	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
 			table_group);
-	if (enable)
-		iommu_take_ownership(table_group);
-	else
-		iommu_release_ownership(table_group);
+	if (enable) {
+		pnv_pci_ioda2_unset_window(&pe->table_group, 0);
+		pnv_pci_free_table(&pe->table_group.tables[0]);
+	} else {
+		struct iommu_table *tbl = &pe->table_group.tables[0];
+		int64_t rc;
 
+		rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
+				IOMMU_PAGE_SHIFT_4K,
+				pe->phb->ioda.m32_pci_base,
+				POWERNV_IOMMU_DEFAULT_LEVELS, tbl);
+		if (rc) {
+			pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
+					rc);
+			return;
+		}
+
+		iommu_init_table(tbl, pe->phb->hose->node);
+
+		rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+		if (rc) {
+			pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
+					rc);
+			pnv_pci_free_table(tbl);
+			return;
+		}
+	}
 	pnv_pci_ioda2_set_bypass(pe, !enable);
 }
 
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index d665ddc..3bc0645 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -426,18 +426,11 @@ static int tce_iommu_clear(struct tce_container *container,
 static void tce_iommu_release(void *iommu_data)
 {
 	struct tce_container *container = iommu_data;
-	struct iommu_table *tbl;
-	struct iommu_table_group *table_group;
 
 	WARN_ON(container->grp);
 
-	if (container->grp) {
-		table_group = iommu_group_get_iommudata(container->grp);
-		tbl = &table_group->tables[0];
-		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
-
+	if (container->grp)
 		tce_iommu_detach_group(iommu_data, container->grp);
-	}
 
 	tce_mem_unregister_all(container);
 	tce_iommu_disable(container);
@@ -826,14 +819,24 @@ static int tce_iommu_attach_group(void *iommu_data,
 
 	if (!table_group->ops || !table_group->ops->set_ownership) {
 		ret = iommu_take_ownership(table_group);
+	} else if (!table_group->ops->create_table ||
+			!table_group->ops->set_window) {
+		WARN_ON_ONCE(1);
+		ret = -EFAULT;
 	} else {
 		/*
 		 * Disable iommu bypass, otherwise the user can DMA to all of
 		 * our physical memory via the bypass window instead of just
 		 * the pages that has been explicitly mapped into the iommu
 		 */
+		struct iommu_table tbltmp = { 0 }, *tbl = &tbltmp;
+
 		table_group->ops->set_ownership(table_group, true);
-		ret = 0;
+		ret = table_group->ops->create_table(table_group, 0,
+				IOMMU_PAGE_SHIFT_4K,
+				table_group->tce32_size, 1, tbl);
+		if (!ret)
+			ret = table_group->ops->set_window(table_group, 0, tbl);
 	}
 
 	if (ret)
@@ -852,6 +855,7 @@ static void tce_iommu_detach_group(void *iommu_data,
 {
 	struct tce_container *container = iommu_data;
 	struct 

Re: [PATCH v2 5/6] target-arm: kvm64 fix save/restore of SPSR regs

2015-03-09 Thread Christoffer Dall
On Wed, Mar 04, 2015 at 02:35:52PM +0000, Alex Bennée wrote:
 From: Christoffer Dall christoffer.d...@linaro.org
 
 The current code was negatively indexing the cpu state array and not
 synchronizing banked spsr register state with the current mode's spsr
 state, causing occasional failures with migration.
 
 Some munging is done to take care of the aarch64 mapping and also to
 ensure the most current value of the spsr is updated to the banked
 registers (relevant for KVM-TCG migration).
 
 Signed-off-by: Christoffer Dall christoffer.d...@linaro.org
 Signed-off-by: Alex Bennée alex.ben...@linaro.org
 
 ---
 v2 (ajb)
   - minor tweaks and clarifications
 v3
   - Use the correct bank index function for setting/getting env-spsr
   - only deal with spsrs in elevated exception levels
 
 diff --git a/target-arm/kvm64.c b/target-arm/kvm64.c
 index c60e989..45e5c3f 100644
 --- a/target-arm/kvm64.c
 +++ b/target-arm/kvm64.c
 @@ -140,6 +140,7 @@ int kvm_arch_put_registers(CPUState *cs, int level)
  uint64_t val;
  int i;
  int ret;
 +unsigned int el;
  
  ARMCPU *cpu = ARM_CPU(cs);
 CPUARMState *env = &cpu->env;
 @@ -206,9 +207,27 @@ int kvm_arch_put_registers(CPUState *cs, int level)
  return ret;
  }
  
 +/* Saved Program State Registers
 + *
 + * Before we restore from the banked_spsr[] array we need to
 + * ensure that any modifications to env-spsr are correctly
 + * reflected and map aarch64 exception levels if required.
 + */
 +el = arm_current_el(env);
 +if (el > 0) {
 +if (is_a64(env)) {
 +g_assert(el == 1);
 +/* KVM only maps KVM_SPSR_SVC to KVM_SPSR_EL1 for aarch64 ATM */

not sure about the 'for aarch64' comment; I would say that it's for
aarch32 support.  Also, you can drop the ATM, since this is user space
ABI that we don't change easily.


don't you need to do env->banked_spsr[0] = env->spsr first?

 +env->banked_spsr[1] = env->banked_spsr[0];


 +} else {
 +i = bank_number(env->uncached_cpsr & CPSR_M);
 +env->banked_spsr[i] = env->spsr;

so here we don't need to worry about banked_spsr[1] = banked_spsr[0]
because banked_spsr[0] is meaningless for 32-bit state and we only sync
banked_spsr[1] and up to KVM, correct?  I think this is what may deserve
a comment.

 +}
 +}
 +
  for (i = 0; i < KVM_NR_SPSR; i++) {
  reg.id = AARCH64_CORE_REG(spsr[i]);
 -reg.addr = (uintptr_t) &env->banked_spsr[i - 1];
 +reg.addr = (uintptr_t) &env->banked_spsr[i+1];
  ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, reg);
  if (ret) {
  return ret;
 @@ -253,6 +272,7 @@ int kvm_arch_get_registers(CPUState *cs)
  struct kvm_one_reg reg;
  uint64_t val;
  uint32_t fpr;
 +unsigned int el;
  int i;
  int ret;
  
 @@ -325,15 +345,35 @@ int kvm_arch_get_registers(CPUState *cs)
  return ret;
  }
  
 +/* Fetch the SPSR registers
 + *
 + * KVM has an array of state indexed for all the possible aarch32
 + * privilage levels. Although not all are valid at all points

privilege

 + * there are some transitions possible which can access old state
 + * so it is worth keeping them all.
 + */

dubious comment overall

  for (i = 0; i < KVM_NR_SPSR; i++) {
  reg.id = AARCH64_CORE_REG(spsr[i]);
 -reg.addr = (uintptr_t) &env->banked_spsr[i - 1];
 +reg.addr = (uintptr_t) &env->banked_spsr[i+1];
  ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, reg);
  if (ret) {
  return ret;
  }
  }
  
 +el = arm_current_el(env);
 +if (el > 0) {
 +if (is_a64(env)) {
 +g_assert(el == 1);
 +/* KVM maps KVM_SPSR_SVC to KVM_SPSR_EL1 for aarch64 */

same as above

 +env->banked_spsr[0] = env->banked_spsr[1];
 +i = aarch64_banked_spsr_index(el);
 +} else {
 +i = bank_number(env->uncached_cpsr & CPSR_M);

same potential place for comment as above.

 +}
 +env->spsr = env->banked_spsr[i];
 +}
 +
  /* Advanced SIMD and FP registers */
  for (i = 0; i < 32; i++) {
  reg.id = AARCH64_SIMD_CORE_REG(fp_regs.vregs[i]);
 -- 
 2.3.1
 


[PATCH v5 14/29] vfio: powerpc/spapr: powerpc/iommu: Rework IOMMU ownership control

2015-03-09 Thread Alexey Kardashevskiy
This replaces iommu_take_ownership()/iommu_release_ownership() calls
with the callback calls and it is up to the platform code to call
iommu_take_ownership()/iommu_release_ownership() if needed.
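
A sketch of the pattern this enables: a platform callback decides
whether the generic bitmap-based helpers are needed at all (illustrative;
the concrete IODA2 callback appears in a later patch of this series):

/* Sketch: a platform callback reusing the group-level helpers. */
static void example_set_ownership(struct iommu_table_group *table_group,
		bool enable)
{
	if (enable)
		WARN_ON(iommu_take_ownership(table_group));
	else
		iommu_release_ownership(table_group);
}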

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h|  4 +--
 arch/powerpc/kernel/iommu.c | 50 -
 drivers/vfio/vfio_iommu_spapr_tce.c |  4 +--
 3 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 667aa1a..b9e50d3 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -225,8 +225,8 @@ extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
 		unsigned long entry);
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
-extern int iommu_take_ownership(struct iommu_table *tbl);
-extern void iommu_release_ownership(struct iommu_table *tbl);
+extern int iommu_take_ownership(struct iommu_table_group *table_group);
+extern void iommu_release_ownership(struct iommu_table_group *table_group);
 
 extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
 extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 7794dce..8358631 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1050,7 +1050,7 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_build);
 
-int iommu_take_ownership(struct iommu_table *tbl)
+static int iommu_table_take_ownership(struct iommu_table *tbl)
 {
 	unsigned long sz = (tbl->it_size + 7) >> 3;
 
@@ -1064,19 +1064,36 @@ int iommu_take_ownership(struct iommu_table *tbl)
 
 	memset(tbl->it_map, 0xff, sz);
 
-	/*
-	 * Disable iommu bypass, otherwise the user can DMA to all of
-	 * our physical memory via the bypass window instead of just
-	 * the pages that has been explicitly mapped into the iommu
-	 */
-	if (tbl->set_bypass)
-		tbl->set_bypass(tbl, false);
+   return 0;
+}
+
+static void iommu_table_release_ownership(struct iommu_table *tbl);
+
+int iommu_take_ownership(struct iommu_table_group *table_group)
+{
+   int i, j, rc = 0;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = &table_group->tables[i];
+
+		if (!tbl->it_map)
+			continue;
+
+		rc = iommu_table_take_ownership(tbl);
+		if (rc) {
+			for (j = 0; j < i; ++j)
+				iommu_table_release_ownership(
+						&table_group->tables[j]);
+
+			return rc;
+		}
+	}
 
return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_take_ownership);
 
-void iommu_release_ownership(struct iommu_table *tbl)
+static void iommu_table_release_ownership(struct iommu_table *tbl)
 {
 	unsigned long sz = (tbl->it_size + 7) >> 3;
 
@@ -1086,9 +1103,18 @@ void iommu_release_ownership(struct iommu_table *tbl)
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
 
-	/* The kernel owns the device now, we can restore the iommu bypass */
-	if (tbl->set_bypass)
-		tbl->set_bypass(tbl, true);
+}
+
+extern void iommu_release_ownership(struct iommu_table_group *table_group)
+{
+   int i;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = &table_group->tables[i];
+
+		if (tbl->it_map)
+			iommu_table_release_ownership(tbl);
+   }
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
 
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 9b02040..6372b3b 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -809,7 +809,7 @@ static int tce_iommu_attach_group(void *iommu_data,
goto unlock_exit;
}
 
-	ret = iommu_take_ownership(&table_group->tables[0]);
+   ret = iommu_take_ownership(table_group);
if (!ret)
container-grp = iommu_group;
 
@@ -846,7 +846,7 @@ static void tce_iommu_detach_group(void *iommu_data,
table_group = iommu_group_get_iommudata(iommu_group);
BUG_ON(!table_group);
 
-	iommu_release_ownership(&table_group->tables[0]);
+   iommu_release_ownership(table_group);
 
 unlock_exit:
 	mutex_unlock(&container->lock);
-- 
2.0.0



[PATCH v5 12/29] powerpc/iommu: Introduce iommu_table_alloc() helper

2015-03-09 Thread Alexey Kardashevskiy
This replaces multiple calls of kzalloc_node() with a new
iommu_table_alloc() helper. Right now it simply calls kzalloc_node();
later it will be modified to allocate an iommu_table_group struct which
embeds the iommu table(s).
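
A sketch of the shape the helper is heading towards, per the paragraph
above (illustrative only; the embedding struct is introduced by a later
patch):

/* Sketch: allocate a group embedding its table, return the table. */
struct iommu_table *iommu_table_alloc(int node)
{
	struct iommu_table_group *table_group;

	table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
	if (!table_group)
		return NULL;

	return &table_group->tables[0];
}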

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h   |  1 +
 arch/powerpc/kernel/iommu.c|  9 +
 arch/powerpc/platforms/powernv/pci.c   |  2 +-
 arch/powerpc/platforms/pseries/iommu.c | 12 
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index d909e2a..eb75726 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -117,6 +117,7 @@ static inline void *get_iommu_table_base(struct device *dev)
 	return dev->archdata.dma_data.iommu_table_base;
 }
 
+extern struct iommu_table *iommu_table_alloc(int node);
 /* Frees table for an individual device node */
 extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 57cb615..dbc5d8d 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -710,6 +710,15 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
return tbl;
 }
 
+struct iommu_table *iommu_table_alloc(int node)
+{
+   struct iommu_table *tbl;
+
+   tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
+
+   return tbl;
+}
+
 void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 {
unsigned long bitmap_sz;
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index c619ec6..1c31ac8 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -680,7 +680,7 @@ static struct iommu_table *pnv_pci_setup_bml_iommu(struct pci_controller *hose)
 			hose->dn->full_name);
return NULL;
}
-	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, hose->node);
+	tbl = iommu_table_alloc(hose->node);
if (WARN_ON(!tbl))
return NULL;
pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)),
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 48d1fde..41a8b14 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -617,8 +617,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 	pci->phb->dma_window_size = 0x8000000ul;
 	pci->phb->dma_window_base_cur = 0x8000000ul;
 
-	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-			   pci->phb->node);
+	tbl = iommu_table_alloc(pci->phb->node);
 
 	iommu_table_setparms(pci->phb, dn, tbl);
 	tbl->it_ops = &iommu_table_pseries_ops;
@@ -669,8 +668,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 		 pdn->full_name, ppci->iommu_table);
 
 	if (!ppci->iommu_table) {
-		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-				   ppci->phb->node);
+		tbl = iommu_table_alloc(ppci->phb->node);
 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
 		ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
@@ -697,8 +695,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		struct pci_controller *phb = PCI_DN(dn)->phb;
 
 		pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
-		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-				   phb->node);
+		tbl = iommu_table_alloc(phb->node);
 		iommu_table_setparms(phb, dn, tbl);
 		tbl->it_ops = &iommu_table_pseries_ops;
 		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
@@ -1120,8 +1117,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 
 	pci = PCI_DN(pdn);
 	if (!pci->iommu_table) {
-		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-				   pci->phb->node);
+		tbl = iommu_table_alloc(pci->phb->node);
 		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
 		pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
-- 
2.0.0



Re: [PATCH v2 1/5] arm: KVM: export vcpi-pause state via MP_STATE ioctls

2015-03-09 Thread Christoffer Dall
Hi Alex,

The subject of this change has a typo, and I also think it's not about
exposing the pause state (that's just an internal name/concept), but
about exposing the PSCI state, or simply the VCPU power state.

On Mon, Mar 02, 2015 at 01:29:00PM +0000, Alex Bennée wrote:
 To cleanly restore an SMP VM we need to ensure that the current pause
 state of each vcpu is correctly recorded. Things could get confused if
 the CPU starts running after migration restore completes when it was
 paused before its state was captured.
 
 We use the existing KVM_GET/SET_MP_STATE ioctl to do this. The arm/arm64
 interface is a lot simpler as the only valid states are
 KVM_MP_STATE_RUNNABLE and KVM_MP_STATE_HALTED.
 
 Signed-off-by: Alex Bennée alex.ben...@linaro.org
 
 diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
 index b112efc..602156f 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -997,7 +997,7 @@ for vm-wide capabilities.
  4.38 KVM_GET_MP_STATE
  
  Capability: KVM_CAP_MP_STATE
 -Architectures: x86, s390
 +Architectures: x86, s390, arm, arm64
  Type: vcpu ioctl
  Parameters: struct kvm_mp_state (out)
  Returns: 0 on success; -1 on error
 @@ -1027,15 +1027,21 @@ Possible values are:
  - KVM_MP_STATE_LOAD: the vcpu is in a special load/startup state
   [s390]
  
 -On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an
 -in-kernel irqchip, the multiprocessing state must be maintained by userspace 
 on
 +For x86:
 +
 +This ioctl is only useful after KVM_CREATE_IRQCHIP.  Without an in-kernel
 +irqchip, the multiprocessing state must be maintained by userspace on

Nit: I would not taint the git log with this change but instead just
introduce a paragraph below starting with "On arm/arm64, " and you would
get the same effect.

  these architectures.
  
 +For arm/arm64:
 +
 +The only states that are valid are KVM_MP_STATE_HALTED and
 +KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not.

As suggested on the QEMU series, HALTED is probably not the right thing
to use.

  
  4.39 KVM_SET_MP_STATE
  
  Capability: KVM_CAP_MP_STATE
 -Architectures: x86, s390
 +Architectures: x86, s390, arm, arm64
  Type: vcpu ioctl
  Parameters: struct kvm_mp_state (in)
  Returns: 0 on success; -1 on error
 @@ -1043,10 +1049,16 @@ Returns: 0 on success; -1 on error
  Sets the vcpu's current multiprocessing state; see KVM_GET_MP_STATE for
  arguments.
  
 -On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an
 -in-kernel irqchip, the multiprocessing state must be maintained by userspace 
 on
 +For x86:
 +
 +This ioctl is only useful after KVM_CREATE_IRQCHIP.  Without an in-kernel
 +irqchip, the multiprocessing state must be maintained by userspace on
  these architectures.
  
 +For arm/arm64:
 +
 +The only states that are valid are KVM_MP_STATE_HALTED and
 +KVM_MP_STATE_RUNNABLE which reflect if the vcpu should be paused or not.

same as above

  
  4.40 KVM_SET_IDENTITY_MAP_ADDR
  
 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
 index 5560f74..8531536 100644
 --- a/arch/arm/kvm/arm.c
 +++ b/arch/arm/kvm/arm.c
 @@ -183,6 +183,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
   case KVM_CAP_ARM_PSCI:
   case KVM_CAP_ARM_PSCI_0_2:
   case KVM_CAP_READONLY_MEM:
 + case KVM_CAP_MP_STATE:
   r = 1;
   break;
   case KVM_CAP_COALESCED_MMIO:
 @@ -313,13 +314,29 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
  int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
   struct kvm_mp_state *mp_state)
  {
 - return -EINVAL;
 + if (vcpu->arch.pause)
 + mp_state->mp_state = KVM_MP_STATE_HALTED;
 + else
 + mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
 +
 + return 0;
  }
  
  int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
   struct kvm_mp_state *mp_state)
  {
 - return -EINVAL;
 + switch (mp_state->mp_state) {
 + case KVM_MP_STATE_RUNNABLE:
 + vcpu->arch.pause = false;
 + break;
 + case KVM_MP_STATE_HALTED:
 + vcpu->arch.pause = true;
 + break;
 + default:
 + return -EINVAL;
 + }
 +
 + return 0;
  }
  
  /**

Are we capturing the vcpu features in some way or do we expect userspace
to migrate these on its own?  The reason I'm asking, is if you create
multiple VCPUs, where all the non-primary VCPUs are started in power off
mode, and then you boot your guest which powers on all VCPUs, and then
you restart your guest (with PSCI RESET), the system will not power on
all the non-primary VCPUs but hold them in power-off.

We need to make sure this behavior is preserved for a reboot across a
migration.  Is it?

Thanks,
-Christoffer

[PATCH v5 06/29] vfio: powerpc/spapr: Disable DMA mappings on disabled container

2015-03-09 Thread Alexey Kardashevskiy
At the moment DMA map/unmap requests are handled irrespective of
the container's state. This allows the user space to pin memory which
it might not be allowed to pin.

This adds checks to MAP/UNMAP that the container is enabled, otherwise
-EPERM is returned.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 432a0de..d3ab34f 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -333,6 +333,9 @@ static long tce_iommu_ioctl(void *iommu_data,
 	struct iommu_table *tbl = container->tbl;
 	unsigned long tce;
 
+	if (!container->enabled)
+		return -EPERM;
+
if (!tbl)
return -ENXIO;
 
@@ -377,6 +380,9 @@ static long tce_iommu_ioctl(void *iommu_data,
struct vfio_iommu_type1_dma_unmap param;
 	struct iommu_table *tbl = container->tbl;
 
+	if (!container->enabled)
+		return -EPERM;
+
if (WARN_ON(!tbl))
return -ENXIO;
 
-- 
2.0.0



[PATCH v5 29/29] vfio: powerpc/spapr: Support Dynamic DMA windows

2015-03-09 Thread Alexey Kardashevskiy
This adds create/remove window ioctls to create and remove DMA windows.
sPAPR defines a Dynamic DMA windows (DDW) capability which allows
para-virtualized guests to create additional DMA windows on a PCI bus.
Existing Linux kernels use this new window to map the entire guest
memory and switch to direct DMA operations, saving time on map/unmap
requests which would otherwise happen in big amounts.

This adds 2 ioctl handlers - VFIO_IOMMU_SPAPR_TCE_CREATE and
VFIO_IOMMU_SPAPR_TCE_REMOVE - to create and remove windows.
Up to 2 windows are supported now by the hardware and by this driver.

This changes the VFIO_IOMMU_SPAPR_TCE_GET_INFO handler to return
additional information such as the number of supported windows and the
maximum number of TCE table levels.
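
For illustration, a userspace sketch of creating a second window with
the new ioctl. The struct field names follow the description (page
shift, window size and levels in; bus start address out) and may differ
from the final uapi:

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Sketch: request a 64-bit DDW covering all of guest RAM. */
static int create_ddw(int container_fd, unsigned long long ram_bytes)
{
	struct vfio_iommu_spapr_tce_create create = {
		.argsz = sizeof(create),
		.page_shift = 16,		/* 64K IOMMU pages */
		.window_size = ram_bytes,	/* must be a power of two */
		.levels = 1,			/* single-level TCE table */
	};

	if (ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create))
		return -1;
	/* create.start_addr now holds the bus address of the new window */
	return 0;
}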

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v4:
* moved code to tce_iommu_create_window()/tce_iommu_remove_window()
helpers
* added docs
---
 Documentation/vfio.txt  |  19 +
 arch/powerpc/include/asm/iommu.h|   2 +-
 drivers/vfio/vfio_iommu_spapr_tce.c | 165 +++-
 include/uapi/linux/vfio.h   |  24 +-
 4 files changed, 207 insertions(+), 3 deletions(-)

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 791e85c..61ce393 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -446,6 +446,25 @@ the memory block.
 The user space is not expected to call these often and the block descriptors
 are stored in a linked list in the kernel.
 
+6) The sPAPR specification allows guests to have an additional DMA window(s)
+on a PCI bus with a variable page size. Two ioctls have been added to support
+this: VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE.
+The platform has to support the functionality or an error will be returned to
+the userspace. The existing hardware supports up to 2 DMA windows: one is
+2GB long, uses 4K pages and is called the default 32-bit window; the other
+can be as big as the entire RAM, can use a different page size, and is
+optional - guests create it at run time if the guest driver supports 64-bit
+DMA.
+
+VFIO_IOMMU_SPAPR_TCE_CREATE receives a page shift, a DMA window size and
+a number of TCE table levels (in case a TCE table is going to be big enough
+that the kernel may not be able to allocate enough physically contiguous
+memory). It creates a new window in the available slot and returns the bus
+address where the new window starts. Due to hardware limitations, the user
+space cannot choose the location of DMA windows.
+
+VFIO_IOMMU_SPAPR_TCE_REMOVE receives the bus start address of the window
+and removes it.
+
 ---
 
 [1] VFIO was originally an acronym for Virtual Function I/O in its
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 04f72ac..de82b61 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -138,7 +138,7 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
int nid);
 
-#define IOMMU_TABLE_GROUP_MAX_TABLES   1
+#define IOMMU_TABLE_GROUP_MAX_TABLES   2
 
 struct iommu_table_group;
 
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 3a0b5fe..7aa4141b 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -96,6 +96,7 @@ struct tce_container {
struct list_head mem_list;
struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
struct list_head group_list;
+   bool v2;
 };
 
 struct tce_iommu_group {
@@ -333,6 +334,20 @@ static struct iommu_table *spapr_tce_find_table(
return ret;
 }
 
+static int spapr_tce_find_free_table(struct tce_container *container)
+{
+   int i;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = &container->tables[i];
+
+		if (!tbl->it_size)
+			return i;
+   }
+
+   return -1;
+}
+
 static int tce_iommu_enable(struct tce_container *container)
 {
int ret = 0;
@@ -432,6 +447,8 @@ static void *tce_iommu_open(unsigned long arg)
INIT_LIST_HEAD_RCU(container-mem_list);
INIT_LIST_HEAD_RCU(container-group_list);
 
+	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
+
return container;
 }
 
@@ -605,11 +622,90 @@ static long tce_iommu_build(struct tce_container *container,
return ret;
 }
 
+static long tce_iommu_create_window(struct tce_container *container,
+   __u32 page_shift, __u64 window_size, __u32 levels,
+   __u64 *start_addr)
+{
+   struct iommu_table_group *table_group;
+   struct tce_iommu_group *tcegrp;
+   int num;
+   long ret;
+
+   num = spapr_tce_find_free_table(container);
+	if (num < 0)
+   return -ENOSYS;
+
+

[PATCH v5 02/29] vfio: powerpc/spapr: Do cleanup when releasing the group

2015-03-09 Thread Alexey Kardashevskiy
This clears the TCE table when a container is being closed, as it is
a good thing to leave the table clean before passing the ownership
back to the host kernel.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 drivers/vfio/vfio_iommu_spapr_tce.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 6c59339..756831f 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -134,16 +134,24 @@ static void *tce_iommu_open(unsigned long arg)
return container;
 }
 
+static int tce_iommu_clear(struct tce_container *container,
+   struct iommu_table *tbl,
+   unsigned long entry, unsigned long pages);
+
 static void tce_iommu_release(void *iommu_data)
 {
struct tce_container *container = iommu_data;
+	struct iommu_table *tbl = container->tbl;
 
-	WARN_ON(container->tbl && !container->tbl->it_group);
+	WARN_ON(tbl && !tbl->it_group);
 	tce_iommu_disable(container);
 
-	if (container->tbl && container->tbl->it_group)
-		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+	if (tbl) {
+		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 
+		if (tbl->it_group)
+			tce_iommu_detach_group(iommu_data, tbl->it_group);
+	}
 	mutex_destroy(&container->lock);
 
kfree(container);
-- 
2.0.0



[PATCH v5 01/29] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver

2015-03-09 Thread Alexey Kardashevskiy
This moves the page pinning (get_user_pages_fast()/put_page()) code out of
the platform IOMMU code and puts it into the VFIO IOMMU driver where it
belongs, as the platform code does not deal with page pinning.

This makes iommu_take_ownership()/iommu_release_ownership() deal with
the IOMMU table bitmap only.

This removes page unpinning from iommu_take_ownership() as the actual
TCE table might contain garbage and doing put_page() on it is undefined
behaviour.

Besides the last part, the rest of the patch is mechanical.
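
The pinning that moves into the VFIO driver follows the same pattern the
removed iommu_put_tce_user_mode() used; a minimal sketch (the helper
name is illustrative):

/* Sketch: pin a user page for a TCE, return its host physical address. */
static long tce_pin_user_page(unsigned long tce, bool writable,
		unsigned long *hpa)
{
	struct page *page = NULL;

	if (get_user_pages_fast(tce & PAGE_MASK, 1, writable, &page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long)page_address(page)) + (tce & ~PAGE_MASK);

	return 0;
}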

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v4:
* s/iommu_tce_build(tbl, entry + 1/iommu_tce_build(tbl, entry + i/
---
 arch/powerpc/include/asm/iommu.h|  4 --
 arch/powerpc/kernel/iommu.c | 55 --
 drivers/vfio/vfio_iommu_spapr_tce.c | 78 ++---
 3 files changed, 65 insertions(+), 72 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index f1ea597..ed69b7d 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -197,10 +197,6 @@ extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 		unsigned long hwaddr, enum dma_data_direction direction);
 extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
unsigned long entry);
-extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-   unsigned long entry, unsigned long pages);
-extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
-   unsigned long entry, unsigned long tce);
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
 extern int iommu_take_ownership(struct iommu_table *tbl);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index b054f33..1b4a178 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -991,30 +991,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
 }
 EXPORT_SYMBOL_GPL(iommu_clear_tce);
 
-int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-   unsigned long entry, unsigned long pages)
-{
-   unsigned long oldtce;
-   struct page *page;
-
-   for ( ; pages; --pages, ++entry) {
-   oldtce = iommu_clear_tce(tbl, entry);
-   if (!oldtce)
-   continue;
-
-		page = pfn_to_page(oldtce >> PAGE_SHIFT);
-		WARN_ON(!page);
-		if (page) {
-			if (oldtce & TCE_PCI_WRITE)
-   SetPageDirty(page);
-   put_page(page);
-   }
-   }
-
-   return 0;
-}
-EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
-
 /*
  * hwaddr is a kernel virtual address here (0xc... bazillion),
  * tce_build converts it to a physical address.
@@ -1044,35 +1020,6 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_build);
 
-int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
-		unsigned long tce)
-{
-	int ret;
-	struct page *page = NULL;
-	unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
-	enum dma_data_direction direction = iommu_tce_direction(tce);
-
-	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
-			direction != DMA_TO_DEVICE, &page);
-	if (unlikely(ret != 1)) {
-		/* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
-				tce, entry << tbl->it_page_shift, ret); */
-		return -EFAULT;
-	}
-	hwaddr = (unsigned long) page_address(page) + offset;
-
-	ret = iommu_tce_build(tbl, entry, hwaddr, direction);
-	if (ret)
-		put_page(page);
-
-	if (ret < 0)
-		pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
-				__func__, entry << tbl->it_page_shift, tce, ret);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
-
 int iommu_take_ownership(struct iommu_table *tbl)
 {
unsigned long sz = (tbl-it_size + 7)  3;
@@ -1086,7 +1033,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
}
 
memset(tbl-it_map, 0xff, sz);
-	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
 
/*
 * Disable iommu bypass, otherwise the user can DMA to all of
@@ -1104,7 +1050,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
 {
 	unsigned long sz = (tbl->it_size + 7) >> 3;
 
-	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
memset(tbl-it_map, 0, sz);
 
/* Restore bit#0 set by iommu_init_table() */
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 20abc3a..6c59339 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -149,6 +149,66 @@ static void 

Re: [PATCH v2 3/5] arm: KVM: add a common vgic_queue_irq_to_lr fn

2015-03-09 Thread Christoffer Dall
On Mon, Mar 02, 2015 at 01:29:02PM +, Alex Bennée wrote:
 This helps re-factor away some of the repetitive code and makes the code
 flow more nicely.
 
 Signed-off-by: Alex Bennée alex.ben...@linaro.org
 
Reviewed-by: Christoffer Dall christoffer.d...@linaro.org


Re: [RFC/RFT PATCH 0/3] arm64: KVM: work around incoherency with uncached guest mappings

2015-03-09 Thread Mario Smarduch
On 03/09/2015 07:26 AM, Andrew Jones wrote:
 On Fri, Mar 06, 2015 at 01:08:29PM -0800, Mario Smarduch wrote:
 On 03/05/2015 09:43 AM, Paolo Bonzini wrote:


 On 05/03/2015 15:58, Catalin Marinas wrote:
 It would especially suck if the user has a cluster with different
 machines, some of them coherent and others non-coherent, and then has to
 debug why the same configuration works on some machines and not on others.

 That's a problem indeed, especially with guest migration. But I don't
 think we have any sane solution here for the bus master DMA.

 I do not oppose doing cache management in QEMU for bus master DMA
 (though if the solution you outlined below works it would be great).

 ARM can override them as well but only making them stricter. Otherwise,
 on a weakly ordered architecture, it's not always safe (let's say the
 guest thinks it accesses Strongly Ordered memory and avoids barriers for
 flag updates but the host upgrades it to Cacheable which breaks the
 memory order).

 The same can happen on x86 though, even if it's rarer.  You still need a
 barrier between stores and loads.

 If we want the host to enforce guest memory mapping attributes via stage
 2, we could do it the other way around: get the guests to always assume
 full cache coherency, generating Normal Cacheable mappings, but use the
 stage 2 attributes restriction in the host to make such mappings
 non-cacheable when needed (it works this way on ARM but not in the other
 direction to relax the attributes).

 That sounds like a plan for device assignment.  But it still would not
 solve the problem of the MMIO framebuffer, right?

 The problem arises with MMIO areas that the guest can reasonably expect
 to be uncacheable, but that are optimized by the host so that they end
 up backed by cacheable RAM.  It's perfectly reasonable that the same
 device needs cacheable mapping with one userspace, and works with
 uncacheable mapping with another userspace that doesn't optimize the
 MMIO area to RAM.

 Unless the guest allocates the framebuffer itself (e.g.
 dma_alloc_coherent), we can't control the cacheability via
 dma-coherent properties as it refers to bus master DMA.

 Okay, it's good to rule that out.  One less thing to think about. :)
 Same for _DSD.

 So for MMIO with the buffer allocated by the host (Qemu), the only
 solution I see on ARM is for the host to ensure coherency, either via
 explicit cache maintenance (new KVM API) or by changing the memory
 attributes used by Qemu to access such virtual MMIO.

 Basically Qemu is acting as a bus master when reading the framebuffer it
 allocated but the guest considers it a slave access and we don't have a
 way to tell the guest that such accesses should be cacheable, nor can we
 upgrade them via architecture features.

 Yes, that's a way to put it.

 In practice, the VGA framebuffer has an optimization that uses dirty
 page tracking, so we could piggyback on the ioctls that return which
 pages are dirty.  It turns out that piggybacking on those ioctls also
 should fix the case of migrating a guest while the MMU is disabled.

 Yes, Qemu would need to invalidate the cache before reading a dirty
 framebuffer page.

 As I said above, an API that allows non-cacheable mappings for the VGA
 framebuffer in Qemu would also solve the problem. I'm not sure what KVM
 provides here (or whether we can add such API).

 Nothing for now; other architectures simply do not have the issue.

 As long as it's just VGA, we can quirk it.  There's just a couple
 vendor/device IDs to catch, and the guest can then use a cacheable mapping.

 For a more generic solution, the API would be madvise(MADV_DONTCACHE).
 It would be easy for QEMU to use it, but I am not too optimistic about
 convincing the mm folks about it.  We can try.
 
 I forgot to list this one in my summary of approaches[*]. This is a
 nice, clean approach. Avoids getting cache maintenance into everything.
 However, besides the difficulty to get it past mm people, it reduces
 performance for any userspace<->userspace uses/sharing of the memory.
 userspace<->guest requires cache maintenance, but nothing else. Maybe
 that's not an important concern for the few emulated devices that need
 it though.
 

 Interested to see the outcome.

 I was thinking of a very basic memory driver that can provide
 an uncached memslot to QEMU - in mmap() file operation
 apply pgprot_uncached to the allocated pages, lock them, flush the TLB,
 and call remap_pfn_range().
 
 I guess this is the same as the madvise approach, but with a driver.
 KVM could take this approach itself when memslots are added/updated
 with the INCOHERENT flag. Maybe worth some experimental patches to
 find out?

I would work on this but I'm tied up for next 3 weeks.
If anyone is interested I can provide base code, I used
it for memory passthrough although testing may be time consuming.
I think the hurdle here is the kernel doesn't map these
for any reason like page migration, locking pages should
tell kernel don't 
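
For reference, the driver approach described above is roughly the
following (illustrative only; pgprot_noncached() is the mainline helper,
and buf_phys stands for a driver-owned, pinned buffer):

static phys_addr_t buf_phys;	/* physical address of the pinned buffer */

/* Sketch: hand userspace an uncached mapping of a driver buffer. */
static int uncached_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	return remap_pfn_range(vma, vma->vm_start, buf_phys >> PAGE_SHIFT,
			size, vma->vm_page_prot);
}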

[PATCH v5 15/29] vfio: powerpc/spapr: powerpc/powernv/ioda2: Rework IOMMU ownership control

2015-03-09 Thread Alexey Kardashevskiy
At the moment the iommu_table struct has a set_bypass() which enables/
disables DMA bypass on IODA2 PHB. This is exposed to POWERPC IOMMU code
which calls this callback when external IOMMU users such as VFIO are
about to take over a PHB.

The set_bypass() callback is not really an iommu_table function but
an IOMMU/PE function. This introduces an iommu_table_group_ops struct and
adds a set_ownership() callback to it which is called when an external
user takes control over the IOMMU.

This renames set_bypass() to set_ownership() as it is not necessarily
just about enabling bypass; it can be something else/more, so let's give
it a more generic name. The bool parameter is inverted.

The callback is implemented for IODA2 only. Other platforms (P5IOC2,
IODA1) will use the old iommu_take_ownership/iommu_release_ownership API.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h  | 14 +-
 arch/powerpc/platforms/powernv/pci-ioda.c | 30 ++
 drivers/vfio/vfio_iommu_spapr_tce.c   | 26 ++
 3 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index b9e50d3..d1f8c6c 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -92,7 +92,6 @@ struct iommu_table {
unsigned long  it_page_shift;/* table iommu page size */
struct iommu_table_group *it_group;
struct iommu_table_ops *it_ops;
-   void (*set_bypass)(struct iommu_table *tbl, bool enable);
 };
 
 /* Pure 2^n version of get_order */
@@ -127,11 +126,24 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 
 #define IOMMU_TABLE_GROUP_MAX_TABLES   1
 
+struct iommu_table_group;
+
+struct iommu_table_group_ops {
+   /*
+* Switches ownership from the kernel itself to an external
+* user. While ownership is enabled, the kernel cannot use IOMMU
+* for itself.
+*/
+   void (*set_ownership)(struct iommu_table_group *table_group,
+   bool enable);
+};
+
 struct iommu_table_group {
 #ifdef CONFIG_IOMMU_API
struct iommu_group *group;
 #endif
struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+   struct iommu_table_group_ops *ops;
 };
 
 #ifdef CONFIG_IOMMU_API
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 61956e2..67a3fe4 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1257,10 +1257,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
 }
 
-static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
-	struct pnv_ioda_pe *pe = container_of(tbl->it_group, struct pnv_ioda_pe,
-					      table_group);
 	uint16_t window_id = (pe->pe_number << 1) + 1;
 	int64_t rc;
 
@@ -1288,7 +1286,8 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
 	 * host side.
 	 */
 	if (pe->pdev)
-		set_iommu_table_base(pe->pdev->dev, tbl);
+		set_iommu_table_base(pe->pdev->dev,
+				&pe->table_group.tables[0]);
 	else
 		pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
 }
@@ -1304,13 +1303,27 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
 	/* TVE #1 is selected by PCI address bit 59 */
 	pe->tce_bypass_base = 1ull << 59;
 
-	/* Install set_bypass callback for VFIO */
-	pe->table_group.tables[0].set_bypass = pnv_pci_ioda2_set_bypass;
-
 	/* Enable bypass by default */
-	pnv_pci_ioda2_set_bypass(&pe->table_group.tables[0], true);
+	pnv_pci_ioda2_set_bypass(pe, true);
 }
 
+static void pnv_ioda2_set_ownership(struct iommu_table_group *table_group,
+bool enable)
+{
+   struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
+   if (enable)
+   iommu_take_ownership(table_group);
+   else
+   iommu_release_ownership(table_group);
+
+   pnv_pci_ioda2_set_bypass(pe, !enable);
+}
+
+static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
+   .set_ownership = pnv_ioda2_set_ownership,
+};
+
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
   struct pnv_ioda_pe *pe)
 {
@@ -1378,6 +1391,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	}
 	tbl->it_ops = &pnv_iommu_ops;
 	iommu_init_table(tbl, phb->hose->node);
+	pe->table_group.ops = &pnv_pci_ioda2_ops;

[PATCH v5 10/29] powerpc/powernv: Do not set read flag if direction==DMA_NONE

2015-03-09 Thread Alexey Kardashevskiy
Normally a bitmap from the iommu_table is used to track which TCE entries
are in use. Since we are going to use the iommu_table without its locks
and do xchg() instead, it becomes essential not to set permission bits
which are not implied by the direction flag.

This adds iommu_direction_to_tce_perm() (its counterpart is there already)
and uses it for powernv's pnv_tce_build().
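
The mapping and its existing counterpart line up as follows (a summary
of the switch statement added below):

/*
 * direction          ->  TCE permission bits
 * DMA_BIDIRECTIONAL  ->  TCE_PCI_READ | TCE_PCI_WRITE
 * DMA_TO_DEVICE      ->  TCE_PCI_READ   (device reads memory)
 * DMA_FROM_DEVICE    ->  TCE_PCI_WRITE  (device writes memory)
 * DMA_NONE           ->  0              (entry not in use)
 */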

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h |  1 +
 arch/powerpc/kernel/iommu.c  | 15 +++
 arch/powerpc/platforms/powernv/pci.c |  7 +--
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index ed69b7d..2af2d70 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -203,6 +203,7 @@ extern int iommu_take_ownership(struct iommu_table *tbl);
 extern void iommu_release_ownership(struct iommu_table *tbl);
 
 extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
+extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 1b4a178..865beef 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -916,6 +916,21 @@ enum dma_data_direction iommu_tce_direction(unsigned long tce)
 }
 EXPORT_SYMBOL_GPL(iommu_tce_direction);
 
+unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
+{
+   switch (dir) {
+   case DMA_BIDIRECTIONAL:
+   return TCE_PCI_READ | TCE_PCI_WRITE;
+   case DMA_FROM_DEVICE:
+   return TCE_PCI_WRITE;
+   case DMA_TO_DEVICE:
+   return TCE_PCI_READ;
+   default:
+   return 0;
+   }
+}
+EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
+
 void iommu_flush_tce(struct iommu_table *tbl)
 {
/* Flush/invalidate TLB caches if necessary */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 54323d6..609f5b1 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -593,15 +593,10 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 unsigned long uaddr, enum dma_data_direction direction,
 struct dma_attrs *attrs, bool rm)
 {
-   u64 proto_tce;
+   u64 proto_tce = iommu_direction_to_tce_perm(direction);
__be64 *tcep, *tces;
u64 rpn;
 
-   proto_tce = TCE_PCI_READ; // Read allowed
-
-   if (direction != DMA_TO_DEVICE)
-   proto_tce |= TCE_PCI_WRITE;
-
 	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
 	rpn = __pa(uaddr) >> tbl->it_page_shift;
 
-- 
2.0.0



[PATCH v5 20/29] powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_create_table/pnc_pci_free_table

2015-03-09 Thread Alexey Kardashevskiy
This is part of moving TCE table allocation into an iommu_ops
callback to support multiple IOMMU groups per VFIO container.

This enforces the window size to be a power of two; the size math is
illustrated below.

This is a pretty mechanical patch.
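
(Editor's illustration of the size math in the new helper, assuming 4KiB
kernel pages; not part of the patch:)

	/* 2GiB window, 4KiB IOMMU pages: (2GiB >> 12) * 8 = 4MiB of TCEs */
	tce_table_size = (window_size >> page_shift) * 8;
	order = get_order(tce_table_size);	/* get_order(4MiB) == 10 */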

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 90 +++
 1 file changed, 67 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index d05abaf..fae8cf6 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -24,7 +24,9 @@
 #include <linux/msi.h>
 #include <linux/memblock.h>
 #include <linux/iommu.h>
+#include <linux/mmzone.h>
 
+#include <asm/mmzone.h>
 #include <asm/sections.h>
 #include <asm/io.h>
 #include <asm/prom.h>
@@ -1331,6 +1333,62 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
 }
 
+static long pnv_pci_ioda2_create_table(struct pnv_ioda_pe *pe,
+		__u32 page_shift, __u64 window_size,
+		struct iommu_table *tbl)
+{
+	int nid = pe->phb->hose->node;
+	struct page *tce_mem = NULL;
+	void *addr;
+	unsigned long tce_table_size;
+	int64_t rc;
+	unsigned order;
+
+	if ((page_shift != 12) && (page_shift != 16) && (page_shift != 24))
+		return -EINVAL;
+
+	if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
+		return -EINVAL;
+
+	tce_table_size = (window_size >> page_shift) * 8;
+	tce_table_size = max(0x1000UL, tce_table_size);
+
+	/* Allocate TCE table */
+	order = get_order(tce_table_size);
+
+	tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
+	if (!tce_mem) {
+		pr_err("Failed to allocate a TCE memory, order=%d\n", order);
+		rc = -ENOMEM;
+		goto fail;
+	}
+	addr = page_address(tce_mem);
+	memset(addr, 0, tce_table_size);
+
+	/* Setup linux iommu table */
+	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
+			page_shift);
+
+	tbl->it_ops = &pnv_ioda2_iommu_ops;
+	iommu_init_table(tbl, nid);
+
+	return 0;
+fail:
+	if (tce_mem)
+		__free_pages(tce_mem, get_order(tce_table_size));
+
+	return rc;
+}
+
+static void pnv_pci_free_table(struct iommu_table *tbl)
+{
+	if (!tbl->it_size)
+		return;
+
+	free_pages(tbl->it_base, get_order(tbl->it_size << 3));
+	memset(tbl, 0, sizeof(struct iommu_table));
+}
+
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
	uint16_t window_id = (pe->pe_number << 1 ) + 1;
@@ -1401,11 +1459,9 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
   struct pnv_ioda_pe *pe)
 {
-	struct page *tce_mem = NULL;
-	void *addr;
 	const __be64 *swinvp;
-	struct iommu_table *tbl;
-	unsigned int tce_table_size, end;
+	unsigned int end;
+	struct iommu_table *tbl = &pe->table_group.tables[0];
 	int64_t rc;
 
 	/* We shouldn't already have a 32-bit DMA associated */
@@ -1414,31 +1470,20 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 
 	/* The PE will reserve all possible 32-bits space */
 	pe->tce32_seg = 0;
+
 	end = (1 << ilog2(phb->ioda.m32_pci_base));
-	tce_table_size = (end / 0x1000) * 8;
 	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
 		end);
 
-	/* Allocate TCE table */
-	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
-			   get_order(tce_table_size));
-	if (!tce_mem) {
-		pe_err(pe, "Failed to allocate a 32-bit TCE memory\n");
-		goto fail;
+	rc = pnv_pci_ioda2_create_table(pe, IOMMU_PAGE_SHIFT_4K,
+			phb->ioda.m32_pci_base, tbl);
+	if (rc) {
+		pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
+		return;
 	}
-	addr = page_address(tce_mem);
-	memset(addr, 0, tce_table_size);
 
 	/* Setup iommu */
 	pe->table_group.tables[0].it_group = &pe->table_group;
-
-	/* Setup linux iommu table */
-	tbl = &pe->table_group.tables[0];
-	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
-			IOMMU_PAGE_SHIFT_4K);
-
-	tbl->it_ops = &pnv_ioda2_iommu_ops;
-	iommu_init_table(tbl, phb->hose->node);
 	pe->table_group.ops = &pnv_pci_ioda2_ops;
 
 	/*
@@ -1484,8 +1529,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 fail:
 	if (pe->tce32_seg >= 0)
 		pe->tce32_seg = -1;
-	if (tce_mem)
-		__free_pages(tce_mem, get_order(tce_table_size));
+	pnv_pci_free_table(tbl);
 }
 
 static void 

[PATCH v5 25/29] powerpc/powernv/ioda: Define and implement DMA table/window management callbacks

2015-03-09 Thread Alexey Kardashevskiy
This extends iommu_table_group_ops with a set of callbacks to support
dynamic DMA window management.

query() returns IOMMU capabilities such as the default DMA window address
and the supported number of DMA windows and TCE table levels.

create_table() creates a TCE table with specific parameters.
It receives an iommu_table_group so it knows the node ID and can allocate
the TCE table memory closer to the PHB. The exact format of the allocated
multi-level table might also be specific to the PHB model (not the case
now though).
This callback puts the DMA window offset on the PCI bus into the just
created table.

set_window() sets the window at the specified TVT index on the PHB.

unset_window() unsets the window from the specified TVT.

This adds a free() callback to iommu_table_ops to free the memory
(potentially a tree of tables) allocated for the TCE table.

create_table() and free() are supposed to be called once per VFIO container,
and set_window()/unset_window() are supposed to be called for every group
in a container, as sketched below.
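
(Sketch of that calling order, using names from this series; illustrative
only, error handling omitted:)

	struct iommu_table *tbl = &container->tables[num];

	/* once per container: allocate the table itself */
	table_group->ops->create_table(table_group, num, page_shift,
			window_size, levels, tbl);

	/* once per attached group: program the window into the PHB */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *tg =
				iommu_group_get_iommudata(tcegrp->grp);

		tg->ops->set_window(tg, num, tbl);
	}

	/* teardown mirrors this: unset_window() per group, free() once */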

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h| 32 +++
 arch/powerpc/platforms/powernv/pci-ioda.c   | 87 -
 arch/powerpc/platforms/powernv/pci-p5ioc2.c | 14 -
 3 files changed, 115 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 4007432..04f72ac 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -62,6 +62,8 @@ struct iommu_table_ops {
long index, long npages);
unsigned long (*get)(struct iommu_table *tbl, long index);
void (*flush)(struct iommu_table *tbl);
+
+   void (*free)(struct iommu_table *tbl);
 };
 
 /* These are used by VIO */
@@ -148,12 +150,42 @@ struct iommu_table_group_ops {
 */
void (*set_ownership)(struct iommu_table_group *table_group,
bool enable);
+
+   long (*create_table)(struct iommu_table_group *table_group,
+   int num,
+   __u32 page_shift,
+   __u64 window_size,
+   __u32 levels,
+   struct iommu_table *tbl);
+   long (*set_window)(struct iommu_table_group *table_group,
+   int num,
+   struct iommu_table *tblnew);
+   long (*unset_window)(struct iommu_table_group *table_group,
+   int num);
 };
 
+/* Page size flags for ibm,query-pe-dma-window */
+#define DDW_PGSIZE_4K   0x01
+#define DDW_PGSIZE_64K  0x02
+#define DDW_PGSIZE_16M  0x04
+#define DDW_PGSIZE_32M  0x08
+#define DDW_PGSIZE_64M  0x10
+#define DDW_PGSIZE_128M 0x20
+#define DDW_PGSIZE_256M 0x40
+#define DDW_PGSIZE_16G  0x80
+#define DDW_PGSIZE_MASK 0xFF
+
 struct iommu_table_group {
 #ifdef CONFIG_IOMMU_API
struct iommu_group *group;
 #endif
+   /* Some key properties of IOMMU */
+   __u32 tce32_start;
+   __u32 tce32_size;
+   __u32 max_dynamic_windows_supported;
+   __u32 max_levels;
+   __u32 flags;
+
struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
struct iommu_table_group_ops *ops;
 };
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index ed60b38..07857c4 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -48,6 +48,7 @@
 #include pci.h
 
 #define POWERNV_IOMMU_DEFAULT_LEVELS   1
+#define POWERNV_IOMMU_MAX_LEVELS   5
 
 extern void ioda_eeh_tvt_print(struct pnv_phb *phb);
 
@@ -1155,11 +1156,14 @@ static void pnv_ioda1_tce_free_vm(struct iommu_table *tbl, long index,
pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
 }
 
+static void pnv_pci_free_table(struct iommu_table *tbl);
+
 struct iommu_table_ops pnv_ioda1_iommu_ops = {
.set = pnv_ioda1_tce_build_vm,
.exchange = pnv_ioda1_tce_xchg_vm,
.clear = pnv_ioda1_tce_free_vm,
.get = pnv_tce_get,
+   .free = pnv_pci_free_table
 };
 
 static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
@@ -1317,6 +1321,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 				 TCE_PCI_SWINV_PAIR);
 	}
 	tbl->it_ops = &pnv_ioda1_iommu_ops;
+	pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
+	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
+	pe->table_group.max_dynamic_windows_supported = 0;
+	pe->table_group.max_levels = 0;
+	pe->table_group.flags = 0;
 	iommu_init_table(tbl, phb->hose->node);
 	iommu_register_group(&pe->table_group, phb->hose->global_number,
 			pe->pe_number);
@@ -1401,7 +1410,7 @@ static __be64 *pnv_alloc_tce_table(int nid,
 }
 
 static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
- 

[PATCH v5 24/29] powerpc/powernv: Change prototypes to receive iommu

2015-03-09 Thread Alexey Kardashevskiy
This changes a few functions to receive an iommu_table_group pointer
rather than a PE, as they are going to be part of the upcoming
iommu_table_group_ops callback set.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 19b5f36..ed60b38 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1400,10 +1400,12 @@ static __be64 *pnv_alloc_tce_table(int nid,
return addr;
 }
 
-static long pnv_pci_ioda2_create_table(struct pnv_ioda_pe *pe,
+static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
__u32 page_shift, __u64 window_size, __u32 levels,
struct iommu_table *tbl)
 {
+   struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
	int nid = pe->phb->hose->node;
void *addr;
unsigned long tce_table_size, left;
@@ -1459,9 +1461,11 @@ static void pnv_pci_free_table(struct iommu_table *tbl)
	iommu_reset_table(tbl, "ioda2");
 }
 
-static long pnv_pci_ioda2_set_window(struct pnv_ioda_pe *pe,
+static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
struct iommu_table *tbl)
 {
+   struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
	struct pnv_phb *phb = pe->phb;
const __be64 *swinvp;
int64_t rc;
@@ -1594,12 +1598,11 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 
/* The PE will reserve all possible 32-bits space */
	pe->tce32_seg = 0;
-
	end = (1 << ilog2(phb->ioda.m32_pci_base));
	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
		end);
 
-   rc = pnv_pci_ioda2_create_table(pe, IOMMU_PAGE_SHIFT_4K,
+	rc = pnv_pci_ioda2_create_table(&pe->table_group, IOMMU_PAGE_SHIFT_4K,
 			phb->ioda.m32_pci_base,
 			POWERNV_IOMMU_DEFAULT_LEVELS, tbl);
if (rc) {
@@ -1611,7 +1614,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	pe->table_group.tables[0].it_group = &pe->table_group;
 	pe->table_group.ops = &pnv_pci_ioda2_ops;
 
-	rc = pnv_pci_ioda2_set_window(pe, tbl);
+	rc = pnv_pci_ioda2_set_window(&pe->table_group, tbl);
 	if (rc) {
 		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
 			rc);
-- 
2.0.0



[PATCH v5 23/29] powerpc/powernv: Implement multilevel TCE tables

2015-03-09 Thread Alexey Kardashevskiy
TCE tables might get too big in the case of 4K IOMMU pages and DDW enabled
on huge guests (hundreds of GB of RAM), so the kernel might be unable to
allocate a contiguous chunk of physical memory to store the TCE table.

To address this, the POWER8 CPU (actually, IODA2) supports multi-level TCE
tables, up to 5 levels, which split the table into a tree of smaller
subtables.

This adds multi-level TCE table support to the pnv_pci_ioda2_create_table()
and pnv_pci_free_table() callbacks; a worked example of the sizing follows.
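
(Editor's worked example of why the split is needed:)

	512GiB window, 4KiB IOMMU pages: (512GiB >> 12) * 8 = 1GiB of TCEs
	as a single level -- hopeless as one contiguous allocation.
	With 64KiB chunks (8192 descriptors each), three levels address up
	to 8192^3 entries, yet no single allocation exceeds 64KiB.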

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/iommu.h  |   2 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 127 --
 arch/powerpc/platforms/powernv/pci.c  |  19 +
 3 files changed, 122 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index fd118ea..4007432 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -88,6 +88,8 @@ struct iommu_pool {
 struct iommu_table {
unsigned long  it_busno; /* Bus number this table belongs to */
unsigned long  it_size;  /* Size of iommu table in entries */
+   unsigned long  it_indirect_levels;
+   unsigned long  it_level_size;
unsigned long  it_offset;/* Offset into global table */
unsigned long  it_base;  /* mapped address of tce table */
unsigned long  it_index; /* which iommu table this is */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 126d803..19b5f36 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -47,6 +47,8 @@
 #include powernv.h
 #include pci.h
 
+#define POWERNV_IOMMU_DEFAULT_LEVELS   1
+
 extern void ioda_eeh_tvt_print(struct pnv_phb *phb);
 
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
@@ -1333,16 +1335,79 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
 }
 
+static void pnv_free_tce_table(unsigned long addr, unsigned size,
+		unsigned level)
+{
+	addr &= ~(TCE_PCI_READ | TCE_PCI_WRITE);
+
+	if (level) {
+		long i;
+		u64 *tmp = (u64 *) addr;
+
+		for (i = 0; i < size; ++i) {
+			unsigned long hpa = be64_to_cpu(tmp[i]);
+
+			if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE)))
+				continue;
+
+			pnv_free_tce_table((unsigned long) __va(hpa),
+					size, level - 1);
+		}
+	}
+
+	free_pages(addr, get_order(size << 3));
+}
+
+static __be64 *pnv_alloc_tce_table(int nid,
+		unsigned shift, unsigned levels, unsigned long *left)
+{
+	struct page *tce_mem = NULL;
+	__be64 *addr, *tmp;
+	unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
+	unsigned long chunk = 1UL << shift, i;
+
+	tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
+	if (!tce_mem) {
+		pr_err("Failed to allocate a TCE memory\n");
+		return NULL;
+	}
+
+	if (!*left)
+		return NULL;
+
+	addr = page_address(tce_mem);
+	memset(addr, 0, chunk);
+
+	--levels;
+	if (!levels) {
+		/* This is last level, actual TCEs */
+		*left -= min(*left, chunk);
+		return addr;
+	}
+
+	for (i = 0; i < (chunk >> 3); ++i) {
+		/* We allocated required TCEs, mark the rest page fault */
+		if (!*left) {
+			addr[i] = cpu_to_be64(0);
+			continue;
+		}
+
+		tmp = pnv_alloc_tce_table(nid, shift, levels, left);
+		addr[i] = cpu_to_be64(__pa(tmp) |
+				TCE_PCI_READ | TCE_PCI_WRITE);
+	}
+
+	return addr;
+}
+
 static long pnv_pci_ioda2_create_table(struct pnv_ioda_pe *pe,
-		__u32 page_shift, __u64 window_size,
+		__u32 page_shift, __u64 window_size, __u32 levels,
 		struct iommu_table *tbl)
 {
 	int nid = pe->phb->hose->node;
-	struct page *tce_mem = NULL;
 	void *addr;
-	unsigned long tce_table_size;
-	int64_t rc;
-	unsigned order;
+	unsigned long tce_table_size, left;
+	unsigned shift;
 
 	if ((page_shift != 12) && (page_shift != 16) && (page_shift != 24))
 		return -EINVAL;
@@ -1350,20 +1415,27 @@ static long pnv_pci_ioda2_create_table(struct pnv_ioda_pe *pe,
 	if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
 		return -EINVAL;
 
+	if (!levels || (levels > 5))
+		return -EINVAL;
+
 	tce_table_size = (window_size >> page_shift) * 8;
 	tce_table_size = max(0x1000UL, tce_table_size);
 
/* 

[PATCH v5 16/29] powerpc/iommu: Fix IOMMU ownership control functions

2015-03-09 Thread Alexey Kardashevskiy
This adds missing locks in iommu_take_ownership()/
iommu_release_ownership().

This marks all pages busy in iommu_table::it_map in order to catch
errors if there is an attempt to use this table while ownership over it
is taken.

This only clears the TCE contents if there is no page marked busy in
it_map. Clearing must be done outside of the table locks as
iommu_clear_tce(), called from iommu_clear_tces_and_put_pages(), does this.
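
(Expected calling pattern, for context; an editor's sketch using the
signatures as of this point in the series -- the VFIO SPAPR driver is
the real caller:)

	if (!iommu_take_ownership(table_group)) {
		/* it_map fully busy: kernel DMA users are fenced off */
		/* ... exclusive use of the tables via the exchange() callback ... */
		iommu_release_ownership(table_group);
		/* it_map rebuilt, bit 0 re-reserved for zero-based tables */
	}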

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
Changes:
v5:
* do not store bit#0 value, it has to be set for zero-based table
anyway
* removed test_and_clear_bit
---
 arch/powerpc/kernel/iommu.c | 26 ++
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 8358631..df888a7 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1052,17 +1052,28 @@ EXPORT_SYMBOL_GPL(iommu_tce_build);
 
 static int iommu_table_take_ownership(struct iommu_table *tbl)
 {
-	unsigned long sz = (tbl->it_size + 7) >> 3;
+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+	int ret = 0;
+
+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_lock(&tbl->pools[i].lock);
 
 	if (tbl->it_offset == 0)
 		clear_bit(0, tbl->it_map);
 
 	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
 		pr_err("iommu_tce: it_map is not empty");
-		return -EBUSY;
+		ret = -EBUSY;
+		if (tbl->it_offset == 0)
+			set_bit(0, tbl->it_map);
+	} else {
+		memset(tbl->it_map, 0xff, sz);
 	}
 
-	memset(tbl->it_map, 0xff, sz);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_unlock(&tbl->pools[i].lock);
+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 
-	return 0;
+	return ret;
 }
@@ -1095,7 +1106,11 @@ EXPORT_SYMBOL_GPL(iommu_take_ownership);
 
 static void iommu_table_release_ownership(struct iommu_table *tbl)
 {
-	unsigned long sz = (tbl->it_size + 7) >> 3;
+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+
+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_lock(&tbl->pools[i].lock);
 
 	memset(tbl->it_map, 0, sz);
 
@@ -1103,6 +1118,9 @@ static void iommu_table_release_ownership(struct iommu_table *tbl)
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
 
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_unlock(&tbl->pools[i].lock);
+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 }
 
 extern void iommu_release_ownership(struct iommu_table_group *table_group)
-- 
2.0.0



Re: [PATCH v2] x86: svm: use cr_interception for SVM_EXIT_CR0_SEL_WRITE

2015-03-09 Thread Radim Krčmář
2015-03-06 14:44-0600, Joel Schopp:
 From: David Kaplan david.kap...@amd.com
 
 Another patch in my war on emulate_on_interception() use as a svm exit 
 handler.
 
 These were pulled out of a larger patch at the suggestion of Radim Krcmar, see
 https://lkml.org/lkml/2015/2/25/559
 
 Changes since v1:
   * fixed typo introduced after test, retested
 
 Signed-off-by: David Kaplan david.kap...@amd.com
 [separated out just cr_interception part from larger removal of
 INTERCEPT_CR0_WRITE, forward ported, tested]
 Signed-off-by: Joel Schopp joel.sch...@amd.com
 ---

(Existing nested handling is tangled, but looks like everything works,)

Reviewed-by: Radim Krčmář rkrc...@redhat.com

Thanks.


Re: [PATCH 5/6] target-arm/kvm64: fix save/restore of SPSR regs

2015-03-09 Thread Alex Bennée

Peter Maydell peter.mayd...@linaro.org writes:

 On 9 March 2015 at 21:56, Christoffer Dall christoffer.d...@linaro.org 
 wrote:
 this function, however, is not used only when migrating, but should
 generally cover the case where you want to synchronize QEMU's state into
 KVM's state, right?  So while it may not be harmful in currently
 supported use cases, is there ever a situation where (is_a64(env) && el
 == 0) and env->spsr != banked_spsr[el], and where env->spsr is
 out-of-date?

 If EL == 0 then you can't access any SPSR, so env->spsr is by
 definition out of date.

Indeed, and in v2 the whole thing is wrapped in if (el > 0)


 -- PMM

-- 
Alex Bennée


Re: [PATCH] KVM: Get rid of kvm_kvfree()

2015-03-09 Thread Thomas Huth
On Tue, 24 Feb 2015 21:29:25 +0100
Thomas Huth th...@linux.vnet.ibm.com wrote:

 kvm_kvfree() provides exactly the same functionality as the
 new common kvfree() function - so let's simply replace the
 kvm function with the common function.
 
 Signed-off-by: Thomas Huth th...@linux.vnet.ibm.com
 ---
  arch/x86/kvm/x86.c   |8 
  include/linux/kvm_host.h |1 -
  virt/kvm/kvm_main.c  |   10 +-
  3 files changed, 5 insertions(+), 14 deletions(-)
 
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index bd7a70b..c5f7e03 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -7429,7 +7429,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 
 	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 		if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
 -			kvm_kvfree(free->arch.rmap[i]);
 +			kvfree(free->arch.rmap[i]);
 			free->arch.rmap[i] = NULL;
 		}
 		if (i == 0)
 @@ -7437,7 +7437,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 
 		if (!dont || free->arch.lpage_info[i - 1] !=
 			     dont->arch.lpage_info[i - 1]) {
 -			kvm_kvfree(free->arch.lpage_info[i - 1]);
 +			kvfree(free->arch.lpage_info[i - 1]);
 			free->arch.lpage_info[i - 1] = NULL;
 		}
 	}
 @@ -7491,12 +7491,12 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 
  out_free:
 	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 -		kvm_kvfree(slot->arch.rmap[i]);
 +		kvfree(slot->arch.rmap[i]);
 		slot->arch.rmap[i] = NULL;
 		if (i == 0)
 			continue;
 
 -		kvm_kvfree(slot->arch.lpage_info[i - 1]);
 +		kvfree(slot->arch.lpage_info[i - 1]);
 		slot->arch.lpage_info[i - 1] = NULL;
 	}
 	return -ENOMEM;
 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
 index d12b210..0f574eb 100644
 --- a/include/linux/kvm_host.h
 +++ b/include/linux/kvm_host.h
 @@ -658,7 +658,6 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
  int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 
  void *kvm_kvzalloc(unsigned long size);
 -void kvm_kvfree(const void *addr);
 
  #ifndef __KVM_HAVE_ARCH_VM_ALLOC
  static inline struct kvm *kvm_arch_alloc_vm(void)
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index a109370..6f1a9c2 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -539,20 +539,12 @@ void *kvm_kvzalloc(unsigned long size)
   return kzalloc(size, GFP_KERNEL);
  }
 
 -void kvm_kvfree(const void *addr)
 -{
 - if (is_vmalloc_addr(addr))
 - vfree(addr);
 - else
 - kfree(addr);
 -}
 -
  static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
  {
 	if (!memslot->dirty_bitmap)
 		return;
 
 -	kvm_kvfree(memslot->dirty_bitmap);
 +	kvfree(memslot->dirty_bitmap);
 	memslot->dirty_bitmap = NULL;
  }
 

ping?

 Thomas



Re: [RFC/RFT PATCH 0/3] arm64: KVM: work around incoherency with uncached guest mappings

2015-03-09 Thread Andrew Jones
On Fri, Mar 06, 2015 at 01:08:29PM -0800, Mario Smarduch wrote:
 On 03/05/2015 09:43 AM, Paolo Bonzini wrote:
  
  
  On 05/03/2015 15:58, Catalin Marinas wrote:
  It would especially suck if the user has a cluster with different
  machines, some of them coherent and others non-coherent, and then has to
  debug why the same configuration works on some machines and not on others.
 
  That's a problem indeed, especially with guest migration. But I don't
  think we have any sane solution here for the bus master DMA.
  
  I do not oppose doing cache management in QEMU for bus master DMA
  (though if the solution you outlined below works it would be great).
  
  ARM can override them as well but only making them stricter. Otherwise,
  on a weakly ordered architecture, it's not always safe (let's say the
  guest thinks it accesses Strongly Ordered memory and avoids barriers for
  flag updates but the host upgrades it to Cacheable which breaks the
  memory order).
  
  The same can happen on x86 though, even if it's rarer.  You still need a
  barrier between stores and loads.
  
  If we want the host to enforce guest memory mapping attributes via stage
  2, we could do it the other way around: get the guests to always assume
  full cache coherency, generating Normal Cacheable mappings, but use the
  stage 2 attributes restriction in the host to make such mappings
  non-cacheable when needed (it works this way on ARM but not in the other
  direction to relax the attributes).
  
  That sounds like a plan for device assignment.  But it still would not
  solve the problem of the MMIO framebuffer, right?
  
  The problem arises with MMIO areas that the guest can reasonably expect
  to be uncacheable, but that are optimized by the host so that they end
  up backed by cacheable RAM.  It's perfectly reasonable that the same
  device needs cacheable mapping with one userspace, and works with
  uncacheable mapping with another userspace that doesn't optimize the
  MMIO area to RAM.
 
  Unless the guest allocates the framebuffer itself (e.g.
  dma_alloc_coherent), we can't control the cacheability via
  dma-coherent properties as it refers to bus master DMA.
  
  Okay, it's good to rule that out.  One less thing to think about. :)
  Same for _DSD.
  
  So for MMIO with the buffer allocated by the host (Qemu), the only
  solution I see on ARM is for the host to ensure coherency, either via
  explicit cache maintenance (new KVM API) or by changing the memory
  attributes used by Qemu to access such virtual MMIO.
 
  Basically Qemu is acting as a bus master when reading the framebuffer it
  allocated but the guest considers it a slave access and we don't have a
  way to tell the guest that such accesses should be cacheable, nor can we
  upgrade them via architecture features.
  
  Yes, that's a way to put it.
  
  In practice, the VGA framebuffer has an optimization that uses dirty
  page tracking, so we could piggyback on the ioctls that return which
  pages are dirty.  It turns out that piggybacking on those ioctls also
  should fix the case of migrating a guest while the MMU is disabled.
 
  Yes, Qemu would need to invalidate the cache before reading a dirty
  framebuffer page.
 
  As I said above, an API that allows non-cacheable mappings for the VGA
  framebuffer in Qemu would also solve the problem. I'm not sure what KVM
  provides here (or whether we can add such API).
  
  Nothing for now; other architectures simply do not have the issue.
  
  As long as it's just VGA, we can quirk it.  There's just a couple
  vendor/device IDs to catch, and the guest can then use a cacheable mapping.
  
  For a more generic solution, the API would be madvise(MADV_DONTCACHE).
  It would be easy for QEMU to use it, but I am not too optimistic about
  convincing the mm folks about it.  We can try.

I forgot to list this one in my summary of approaches[*]. This is a
nice, clean approach. Avoids getting cache maintenance into everything.
However, besides the difficulty of getting it past the mm people, it reduces
performance for any userspace<->userspace use/sharing of the memory.
userspace<->guest requires cache maintenance, but nothing else. Maybe
that's not an important concern for the few emulated devices that need
it though.

 
 Interested to see the outcome.
 
 I was thinking of a very basic memory driver that can provide
 an uncached memslot to QEMU - in the mmap() file operation,
 apply pgprot_uncached to the allocated pages, lock them, flush the TLB
 and call remap_pfn_range() (a sketch follows at the end of this message).

I guess this is the same as the madvise approach, but with a driver.
KVM could take this approach itself when memslots are added/updated
with the INCOHERENT flag. Maybe worth some experimental patches to
find out?

I'm still thinking about experimenting with the ARM private syscalls
next though.

drew

[*] http://lists.gnu.org/archive/html/qemu-devel/2015-03/msg01254.html
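
(A rough sketch of Mario's driver idea above; editor's illustration.
pgprot_noncached() is the closest mainline helper to the "pgprot_uncached"
mentioned, and allocating/pinning the pages at open() time is elided:)

	static int uncached_mmap(struct file *file, struct vm_area_struct *vma)
	{
		struct page *pages = file->private_data; /* pinned at open() */
		unsigned long size = vma->vm_end - vma->vm_start;

		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
		return remap_pfn_range(vma, vma->vm_start, page_to_pfn(pages),
				       size, vma->vm_page_prot);
	}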

Re: [PATCH v2 2/5] arm/arm64: KVM: Implement support for unqueueing active IRQs

2015-03-09 Thread Christoffer Dall
On Mon, Mar 02, 2015 at 01:29:01PM +, Alex Bennée wrote:
 From: Christoffer Dall christoffer.d...@linaro.org
 
 Migrating active interrupts causes the active state to be lost
 completely. This implements some additional bitmaps to track the active
 state on the distributor and export this to user space.
 
 Signed-off-by: Christoffer Dall christoffer.d...@linaro.org
 Signed-off-by: Alex Bennée alex.ben...@linaro.org
 
 ---
 AJB:
- fixed merge conflicts
- moved additional shared bitmaps to be dynamically allocated
- make irq_active_on_cpu dynamically allocated as well
- in vgic_queue_irq don't queue pending if already active
- in __kvm_vgic_flush_hwstate use pr_shared when checking SPIs
- vgic: clear active on CPU bit
- checkpatch, remove extraneous braces
- checkpatch, remove debug, fix overflows
- move register access fns to re-factored vgic-v2-emul.c
 v2
- doc: unqueue and update_state
 
 diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
 index 7c55dd5..7042251 100644
 --- a/include/kvm/arm_vgic.h
 +++ b/include/kvm/arm_vgic.h
 @@ -196,6 +196,9 @@ struct vgic_dist {
   /* Level-triggered interrupt queued on VCPU interface */
   struct vgic_bitmap  irq_queued;
  
 +	/* Interrupt was active when unqueued from VCPU interface */
 + struct vgic_bitmap  irq_active;
 +
   /* Interrupt priority. Not used yet. */
   struct vgic_bytemap irq_priority;
  
 @@ -236,6 +239,9 @@ struct vgic_dist {
   /* Bitmap indicating which CPU has something pending */
   unsigned long   *irq_pending_on_cpu;
  
 + /* Bitmap indicating which CPU has active IRQs */
 + unsigned long   *irq_active_on_cpu;
 +
   struct vgic_vm_ops  vm_ops;
  #endif
  };
 @@ -269,9 +275,15 @@ struct vgic_cpu {
   /* per IRQ to LR mapping */
   u8  *vgic_irq_lr_map;
  
 - /* Pending interrupts on this VCPU */
 + /* Pending/active/both interrupts on this VCPU */
   DECLARE_BITMAP( pending_percpu, VGIC_NR_PRIVATE_IRQS);
 + DECLARE_BITMAP( active_percpu, VGIC_NR_PRIVATE_IRQS);
 + DECLARE_BITMAP( pend_act_percpu, VGIC_NR_PRIVATE_IRQS);
 +
 + /* Pending/active/both shared interrupts, dynamically sized */
   unsigned long   *pending_shared;
 + unsigned long   *active_shared;
 + unsigned long   *pend_act_shared;
  
   /* Bitmap of used/free list registers */
   DECLARE_BITMAP( lr_used, VGIC_V2_MAX_LRS);
 @@ -311,6 +323,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
   bool level);
  void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
  int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
 +int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
  bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
 struct kvm_exit_mmio *mmio);
  
 diff --git a/virt/kvm/arm/vgic-v2-emul.c b/virt/kvm/arm/vgic-v2-emul.c
 index 19c6210..c818662 100644
 --- a/virt/kvm/arm/vgic-v2-emul.c
 +++ b/virt/kvm/arm/vgic-v2-emul.c
 @@ -107,6 +107,22 @@ static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu,
 					  vcpu->vcpu_id);
  }
  
 +static bool handle_mmio_set_active_reg(struct kvm_vcpu *vcpu,
 +				       struct kvm_exit_mmio *mmio,
 +				       phys_addr_t offset)
 +{
 +	return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
 +					  vcpu->vcpu_id);
 +}
 +
 +static bool handle_mmio_clear_active_reg(struct kvm_vcpu *vcpu,
 +					 struct kvm_exit_mmio *mmio,
 +					 phys_addr_t offset)
 +{
 +	return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
 +					    vcpu->vcpu_id);
 +}
 +
  static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
struct kvm_exit_mmio *mmio,
phys_addr_t offset)
 @@ -344,13 +360,13 @@ static const struct kvm_mmio_range vgic_dist_ranges[] = {
   .base   = GIC_DIST_ACTIVE_SET,
   .len= VGIC_MAX_IRQS / 8,
   .bits_per_irq   = 1,
 - .handle_mmio= handle_mmio_raz_wi,
 + .handle_mmio= handle_mmio_set_active_reg,
   },
   {
   .base   = GIC_DIST_ACTIVE_CLEAR,
   .len= VGIC_MAX_IRQS / 8,
   .bits_per_irq   = 1,
 - .handle_mmio= handle_mmio_raz_wi,
 + .handle_mmio= handle_mmio_clear_active_reg,
   },
   {
   .base   = GIC_DIST_PRI,
 diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
 index 0cc6ab6..bfb6fbb 100644
 --- a/virt/kvm/arm/vgic.c
 +++ b/virt/kvm/arm/vgic.c
 @@ -277,6 +277,14 @@ static void vgic_irq_clear_queued(struct kvm_vcpu *vcpu, int irq)
   

Re: [PATCH] KVM: Get rid of kvm_kvfree()

2015-03-09 Thread Paolo Bonzini


On 09/03/2015 16:39, Thomas Huth wrote:
 On Tue, 24 Feb 2015 21:29:25 +0100
 Thomas Huth th...@linux.vnet.ibm.com wrote:
 
 kvm_kvfree() provides exactly the same functionality as the
 new common kvfree() function - so let's simply replace the
 kvm function with the common function.

 Signed-off-by: Thomas Huth th...@linux.vnet.ibm.com
 ---
  arch/x86/kvm/x86.c   |8 
  include/linux/kvm_host.h |1 -
  virt/kvm/kvm_main.c  |   10 +-
  3 files changed, 5 insertions(+), 14 deletions(-)

 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index bd7a70b..c5f7e03 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -7429,7 +7429,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,

 	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 		if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
 -			kvm_kvfree(free->arch.rmap[i]);
 +			kvfree(free->arch.rmap[i]);
 			free->arch.rmap[i] = NULL;
 		}
 		if (i == 0)
 @@ -7437,7 +7437,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 
 		if (!dont || free->arch.lpage_info[i - 1] !=
 			     dont->arch.lpage_info[i - 1]) {
 -			kvm_kvfree(free->arch.lpage_info[i - 1]);
 +			kvfree(free->arch.lpage_info[i - 1]);
 			free->arch.lpage_info[i - 1] = NULL;
 		}
 	}
 @@ -7491,12 +7491,12 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 
  out_free:
 	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 -		kvm_kvfree(slot->arch.rmap[i]);
 +		kvfree(slot->arch.rmap[i]);
 		slot->arch.rmap[i] = NULL;
 		if (i == 0)
 			continue;
 
 -		kvm_kvfree(slot->arch.lpage_info[i - 1]);
 +		kvfree(slot->arch.lpage_info[i - 1]);
 		slot->arch.lpage_info[i - 1] = NULL;
 	}
 	return -ENOMEM;
 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
 index d12b210..0f574eb 100644
 --- a/include/linux/kvm_host.h
 +++ b/include/linux/kvm_host.h
 @@ -658,7 +658,6 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
  int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);

  void *kvm_kvzalloc(unsigned long size);
 -void kvm_kvfree(const void *addr);

  #ifndef __KVM_HAVE_ARCH_VM_ALLOC
  static inline struct kvm *kvm_arch_alloc_vm(void)
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index a109370..6f1a9c2 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -539,20 +539,12 @@ void *kvm_kvzalloc(unsigned long size)
  return kzalloc(size, GFP_KERNEL);
  }

 -void kvm_kvfree(const void *addr)
 -{
 -	if (is_vmalloc_addr(addr))
 -		vfree(addr);
 -	else
 -		kfree(addr);
 -}
 -
  static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
  {
 	if (!memslot->dirty_bitmap)
 		return;
 
 -	kvm_kvfree(memslot->dirty_bitmap);
 +	kvfree(memslot->dirty_bitmap);
 	memslot->dirty_bitmap = NULL;
  }

 
 ping?

This must have slipped through the cracks.

Acked-by: Paolo Bonzini pbonz...@redhat.com


[PATCH kvm-unit-test] x86/emulator: Fix inline assembler warning

2015-03-09 Thread Jan Kiszka
Code compiles to the same binary, but now with one warning less (gas warns about an indirect jmp written without the '*').

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---
 x86/emulator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/x86/emulator.c b/x86/emulator.c
index 0964e6a..e5c1c6b 100644
--- a/x86/emulator.c
+++ b/x86/emulator.c
@@ -838,7 +838,7 @@ static void test_jmp_noncanonical(uint64_t *mem)
 
exceptions = 0;
handle_exception(GP_VECTOR, advance_rip_by_3_and_note_exception);
-	asm volatile ("jmp %0" : : "m"(*mem));
+	asm volatile ("jmp *%0" : : "m"(*mem));
	report("jump to non-canonical address", exceptions == 1);
handle_exception(GP_VECTOR, 0);
 }
-- 
2.1.4


Re: 2 CPU Conformance Issue in KVM/x86

2015-03-09 Thread Avi Kivity

On 03/03/2015 11:52 AM, Paolo Bonzini wrote:

In this
case, the VM might expect exceptions when PTE bits which are higher than the
maximum (reported) address width are set, and it would not get such
exceptions. This problem can easily be experienced by small change to the
existing KVM unit-tests.

There are many variants to this problem, and the only solution which I
consider complete is to report to the VM the maximum (52) physical address
width to the VM, configure the VM to exit on #PF with reserved-bit
error-codes, and then emulate these faulting instructions.

Not even that would be a definitive solution.  If the guest tries to map
RAM (e.g. a PCI BAR that is backed by RAM) above the host MAXPHYADDR,
you would get EPT misconfiguration vmexits.

I think there is no way to emulate physical address width correctly,
except by disabling EPT.



Is the issue emulating a higher MAXPHYADDR on the guest than is 
available on the host? I don't think there's any need to support that.


Emulating a lower setting on the guest than is available on the host is, 
I think, desirable. Whether it would work depends on the relative 
priority of EPT misconfiguration exits vs. page table permission faults.
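
(For reference: MAXPHYADDR here is the physical address width the guest
reads from CPUID leaf 0x80000008; a minimal userspace probe, editor's
sketch:)

	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		__cpuid(0x80000008, eax, ebx, ecx, edx);
		printf("MAXPHYADDR = %u bits\n", eax & 0xff); /* e.g. 46 */
		return 0;
	}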

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 2 CPU Conformance Issue in KVM/x86

2015-03-09 Thread Nadav Amit
Avi Kivity avi.kiv...@gmail.com wrote:

 On 03/03/2015 11:52 AM, Paolo Bonzini wrote:
 In this
 case, the VM might expect exceptions when PTE bits which are higher than the
 maximum (reported) address width are set, and it would not get such
 exceptions. This problem can easily be experienced by small change to the
 existing KVM unit-tests.
 
 There are many variants to this problem, and the only solution which I
 consider complete is to report to the VM the maximum (52) physical address
 width to the VM, configure the VM to exit on #PF with reserved-bit
 error-codes, and then emulate these faulting instructions.
 Not even that would be a definitive solution.  If the guest tries to map
 RAM (e.g. a PCI BAR that is backed by RAM) above the host MAXPHYADDR,
 you would get EPT misconfiguration vmexits.
 
 I think there is no way to emulate physical address width correctly,
 except by disabling EPT.
 
 Is the issue emulating a higher MAXPHYADDR on the guest than is available
 on the host? I don't think there's any need to support that.
 
 Emulating a lower setting on the guest than is available on the host is, I
 think, desirable. Whether it would work depends on the relative priority
 of EPT misconfiguration exits vs. page table permission faults.

Thanks for the feedback.

Guest page-table permission faults take priority over EPT misconfiguration.
KVM can even be set to trap page-table permission faults, at least on VT-x.
Anyhow, I don't think it is enough. Here is an example:

My machine has MAXPHYADDR of 46. I modified kvm-unit-tests access test to
set pte.45 instead of pte.51, which from the VM point-of-view should cause
the #PF error-code indicate the reserved bits are set (just as pte.51 does).
Here is one error from the log:

test pte.p pte.45 pde.p user: FAIL: error code 5 expected d
Dump mapping: address: 1234
--L4: 304b007
--L3: 304c007
--L2: 304d001
--L1: 2201

As you can see, the #PF should have had two reasons: reserved bits, and user
access to supervisor only page. The error-code however does not indicate the
reserved-bits are set.

Note that KVM did not trap any exit on that faulting instruction, as
otherwise it would try to emulate the instruction and assuming it is
supported (and that the #PF was not on an instruction fetch), should be able
to emulate the #PF correctly. 
[ The test actually crashes soon after this error due to these reasons. ]

Anyhow, that is the reason for me to assume that having the maximum
MAXPHYADDR is better.

Nadav
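
(Editor's sketch of the experiment described above, not the actual
kvm-unit-tests change: the guest is told a MAXPHYADDR below 46 -- e.g.
QEMU's default of 40 -- while the host supports 46, so a walk through
this PTE should report the RSVD bit in the #PF error code:)

	pte |= 1ULL << 45;	/* reserved from the guest's point of view */
	/* expected #PF error code: P|U|RSVD = 0xd; observed: P|U = 0x5 */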



Re: [PATCH v2 1/5] arm: KVM: export vcpi-pause state via MP_STATE ioctls

2015-03-09 Thread Alex Bennée

Christoffer Dall christoffer.d...@linaro.org writes:

 Hi Alex,

 The subject of this change has a typo, and I also think it's not about
 exposing the pause state (that's just an internal name/concept), but
 about exposing the PSCI state, or simply the VCPU power state.

arm: KVM: export VCPU power state via MP_STATE ioctl?


 On Mon, Mar 02, 2015 at 01:29:00PM +, Alex Bennée wrote:
 To cleanly restore an SMP VM we need to ensure that the current pause
 state of each vcpu is correctly recorded. Things could get confused if
 the CPU starts running after migration restore completes when it was
 paused before its state was captured.
 
 We use the existing KVM_GET/SET_MP_STATE ioctl to do this. The arm/arm64
 interface is a lot simpler as the only valid states are
 KVM_MP_STATE_RUNNABLE and KVM_MP_STATE_HALTED.
 
 Signed-off-by: Alex Bennée alex.ben...@linaro.org
 
 diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
 index b112efc..602156f 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -997,7 +997,7 @@ for vm-wide capabilities.
  4.38 KVM_GET_MP_STATE
  
  Capability: KVM_CAP_MP_STATE
 -Architectures: x86, s390
 +Architectures: x86, s390, arm, arm64
  Type: vcpu ioctl
  Parameters: struct kvm_mp_state (out)
  Returns: 0 on success; -1 on error
 @@ -1027,15 +1027,21 @@ Possible values are:
   - KVM_MP_STATE_LOAD:the vcpu is in a special load/startup state
   [s390]
  
 -On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an
 -in-kernel irqchip, the multiprocessing state must be maintained by 
 userspace on
 +For x86:
 +
 +This ioctl is only useful after KVM_CREATE_IRQCHIP.  Without an in-kernel
 +irqchip, the multiprocessing state must be maintained by userspace on

 Nit: I would not taint the git log with this change but instead just
 introduce a paragraph below starting with On arm/arm64,  and you would
 get the same effect.

  these architectures.
  
 +For arm/arm64:
 +
 +The only states that are valid are KVM_MP_STATE_HALTED and
 +KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not.

 As suggested on the QEMU series, HALTED is probably not the right thing
 to use.

KVM_MP_STATE_STOPPED is currently only used for s390 but seems to fit.
I'm wary of adding yet another define.
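
(Illustrative userspace use of the ioctl pair under discussion; error
handling omitted, and the HALTED-vs-STOPPED naming is exactly what is
being debated here:)

	struct kvm_mp_state mp;

	ioctl(vcpu_fd, KVM_GET_MP_STATE, &mp);	/* save: RUNNABLE or HALTED */
	/* ... migrate ... */
	mp.mp_state = KVM_MP_STATE_RUNNABLE;	/* restore: power the VCPU on */
	ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);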


  
  4.39 KVM_SET_MP_STATE
  
  Capability: KVM_CAP_MP_STATE
 -Architectures: x86, s390
 +Architectures: x86, s390, arm, arm64
  Type: vcpu ioctl
  Parameters: struct kvm_mp_state (in)
  Returns: 0 on success; -1 on error
 @@ -1043,10 +1049,16 @@ Returns: 0 on success; -1 on error
  Sets the vcpu's current multiprocessing state; see KVM_GET_MP_STATE for
  arguments.
  
 -On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an
 -in-kernel irqchip, the multiprocessing state must be maintained by 
 userspace on
 +For x86:
 +
 +This ioctl is only useful after KVM_CREATE_IRQCHIP.  Without an in-kernel
 +irqchip, the multiprocessing state must be maintained by userspace on
  these architectures.
  
 +For arm/arm64:
 +
 +The only states that are valid are KVM_MP_STATE_HALTED and
 +KVM_MP_STATE_RUNNABLE which reflect if the vcpu should be paused or not.

 same as above

  
  4.40 KVM_SET_IDENTITY_MAP_ADDR
  
 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
 index 5560f74..8531536 100644
 --- a/arch/arm/kvm/arm.c
 +++ b/arch/arm/kvm/arm.c
 @@ -183,6 +183,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
  case KVM_CAP_ARM_PSCI:
  case KVM_CAP_ARM_PSCI_0_2:
  case KVM_CAP_READONLY_MEM:
 +case KVM_CAP_MP_STATE:
  r = 1;
  break;
  case KVM_CAP_COALESCED_MMIO:
 @@ -313,13 +314,29 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
  int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
  struct kvm_mp_state *mp_state)
  {
 -return -EINVAL;
 +	if (vcpu->arch.pause)
 +		mp_state->mp_state = KVM_MP_STATE_HALTED;
 +	else
 +		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
 +
 +return 0;
  }
  
  int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
  struct kvm_mp_state *mp_state)
  {
 -return -EINVAL;
 +	switch (mp_state->mp_state) {
 +	case KVM_MP_STATE_RUNNABLE:
 +		vcpu->arch.pause = false;
 +		break;
 +	case KVM_MP_STATE_HALTED:
 +		vcpu->arch.pause = true;
 +		break;
 +	default:
 +		return -EINVAL;
 +	}
 +
 +return 0;
  }
  
  /**

 Are we capturing the vcpu features in some way or do we expect userspace
 to migrate these on its own?  The reason I'm asking, is if you create
 multiple VCPUs, where all the non-primary VCPUs are started in power off
 mode, and then you boot your guest which powers on all VCPUs, and then
 you restart your guest (with PSCI RESET), the system will not power on
 all the non-primary VCPUs but hold them 

Re: [PATCH v2 2/5] arm/arm64: KVM: Implement support for unqueueing active IRQs

2015-03-09 Thread Alex Bennée

Christoffer Dall christoffer.d...@linaro.org writes:

 On Mon, Mar 02, 2015 at 01:29:01PM +, Alex Bennée wrote:
 From: Christoffer Dall christoffer.d...@linaro.org
 
 Migrating active interrupts causes the active state to be lost
 completely. This implements some additional bitmaps to track the active
 state on the distributor and export this to user space.
 
 Signed-off-by: Christoffer Dall christoffer.d...@linaro.org
 Signed-off-by: Alex Bennée alex.ben...@linaro.org
 
 ---
 AJB:
- fixed merge conflicts
- moved additional shared bitmaps to be dynamically allocated
- make irq_active_on_cpu dynamically allocated as well
- in vgic_queue_irq don't queue pending if already active
- in __kvm_vgic_flush_hwstate use pr_shared when checking SPIs
- vgic: clear active on CPU bit
- checkpatch, remove extraneous braces
- checkpatch, remove debug, fix overflows
- move register access fns to re-factored vgic-v2-emul.c
 v2
- doc: unqueue and update_state
 
 diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
 index 7c55dd5..7042251 100644
snip
 if you respin: the active state
 if you respin: Mark the

OK

snip
  
 @@ -1030,39 +1113,49 @@ static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
  {
 	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
 	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
 +	unsigned long *pa_percpu, *pa_shared;
 	int i, vcpu_id;
 	int overflow = 0;
 +	int nr_shared = vgic_nr_shared_irqs(dist);
 
 	vcpu_id = vcpu->vcpu_id;
 
 +	pa_percpu = vcpu->arch.vgic_cpu.pend_act_percpu;
 +	pa_shared = vcpu->arch.vgic_cpu.pend_act_shared;
 +
 +	bitmap_or(pa_percpu, vgic_cpu->pending_percpu, vgic_cpu->active_percpu,
 +		  VGIC_NR_PRIVATE_IRQS);
 +	bitmap_or(pa_shared, vgic_cpu->pending_shared, vgic_cpu->active_shared,
 +		  nr_shared);
  /*
   * We may not have any pending interrupt, or the interrupts
   * may have been serviced from another vcpu. In all cases,
   * move along.
   */
 -	if (!kvm_vgic_vcpu_pending_irq(vcpu)) {
 -		pr_debug("CPU%d has no pending interrupt\n", vcpu_id);
 +	if (!kvm_vgic_vcpu_pending_irq(vcpu) && !kvm_vgic_vcpu_active_irq(vcpu))
 		goto epilog;
 -	}
 
 	/* SGIs */
 -	for_each_set_bit(i, vgic_cpu->pending_percpu, VGIC_NR_SGIS) {
 +	for_each_set_bit(i, pa_percpu, VGIC_NR_SGIS) {
 		if (!queue_sgi(vcpu, i))
 			overflow = 1;
 	}
 
 	/* PPIs */
 -	for_each_set_bit_from(i, vgic_cpu->pending_percpu, VGIC_NR_PRIVATE_IRQS) {
 +	for_each_set_bit_from(i, pa_percpu, VGIC_NR_PRIVATE_IRQS) {
 		if (!vgic_queue_hwirq(vcpu, i))
 			overflow = 1;
 	}
 
 	/* SPIs */
 -	for_each_set_bit(i, vgic_cpu->pending_shared, vgic_nr_shared_irqs(dist)) {
 +	for_each_set_bit(i, pa_shared, nr_shared) {
  if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS))
  overflow = 1;
  }

 meh, these changes will now produce a quite strange result if one
 bisects at this specific patch, since we will be setting pending
 interrupts to active.  I thought it was more clear as one patch
 containing the changes to vgic_queue_... imho.

I split the re-factor off at Marc's request but you're right, it becomes a
little weird. Now the horror of dealing with the merge conflicts is done,
I could split the next patch and add it before this one:

* add a common vgic_queue_irq_to_lr fn (just deal with pending)
* Implement support for unqueuing active IRQs (expand vgic_queue_irq_to_lr)


 Thoughts?

  
 +
 +
 +
  epilog:
  if (overflow) {
  vgic_enable_underflow(vcpu);
 @@ -1209,6 +1302,17 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
 	return test_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
  }
  
 +int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu)
 +{
 +	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
 +
 +	if (!irqchip_in_kernel(vcpu->kvm))
 +		return 0;
 +
 +	return test_bit(vcpu->vcpu_id, dist->irq_active_on_cpu);
 +}
 +
 +
  void vgic_kick_vcpus(struct kvm *kvm)
  {
  struct kvm_vcpu *vcpu;
 @@ -1381,8 +1485,12 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
 	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
 
 	kfree(vgic_cpu->pending_shared);
 +	kfree(vgic_cpu->active_shared);
 +	kfree(vgic_cpu->pend_act_shared);
 	kfree(vgic_cpu->vgic_irq_lr_map);
 	vgic_cpu->pending_shared = NULL;
 +	vgic_cpu->active_shared = NULL;
 +	vgic_cpu->pend_act_shared = NULL;
 	vgic_cpu->vgic_irq_lr_map = NULL;
  }
  
 @@ -1392,9 +1500,14 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
  
  int sz = (nr_irqs - VGIC_NR_PRIVATE_IRQS) / 8;
 	vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL);
 +	vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL);
 +	vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL);
  

Re: 2 CPU Conformance Issue in KVM/x86

2015-03-09 Thread Avi Kivity

On 03/09/2015 07:51 PM, Nadav Amit wrote:

Avi Kivity avi.kiv...@gmail.com wrote:


On 03/03/2015 11:52 AM, Paolo Bonzini wrote:

In this
case, the VM might expect exceptions when PTE bits which are higher than the
maximum (reported) address width are set, and it would not get such
exceptions. This problem can easily be experienced by small change to the
existing KVM unit-tests.

There are many variants to this problem, and the only solution which I
consider complete is to report to the VM the maximum (52) physical address
width to the VM, configure the VM to exit on #PF with reserved-bit
error-codes, and then emulate these faulting instructions.

Not even that would be a definitive solution.  If the guest tries to map
RAM (e.g. a PCI BAR that is backed by RAM) above the host MAXPHYADDR,
you would get EPT misconfiguration vmexits.

I think there is no way to emulate physical address width correctly,
except by disabling EPT.

Is the issue emulating a higher MAXPHYADDR on the guest than is available
on the host? I don't think there's any need to support that.

Emulating a lower setting on the guest than is available on the host is, I
think, desirable. Whether it would work depends on the relative priority
of EPT misconfiguration exits vs. page table permission faults.

Thanks for the feedback.

Guest page-table permission faults take priority over EPT misconfiguration.
KVM can even be set to trap page-table permission faults, at least on VT-x.
Anyhow, I don't think it is enough.


Why is it not enough? If you trap a permission fault, you can inject any 
exception error code you like.



  Here is an example

My machine has MAXPHYADDR of 46. I modified kvm-unit-tests access test to
set pte.45 instead of pte.51, which from the VM point-of-view should cause
the #PF error-code indicate the reserved bits are set (just as pte.51 does).
Here is one error from the log:

test pte.p pte.45 pde.p user: FAIL: error code 5 expected d
Dump mapping: address: 1234
--L4: 304b007
--L3: 304c007
--L2: 304d001
--L1: 2201


This is with an ept misconfig programmed into that address, yes?


As you can see, the #PF should have had two reasons: reserved bits, and user
access to supervisor only page. The error-code however does not indicate the
reserved-bits are set.

Note that KVM did not trap any exit on that faulting instruction, as
otherwise it would try to emulate the instruction and assuming it is
supported (and that the #PF was not on an instruction fetch), should be able
to emulate the #PF correctly.
[ The test actually crashes soon after this error due to these reasons. ]

Anyhow, that is the reason for me to assume that having the maximum
MAXPHYADDR is better.



Well, that doesn't work for the reasons Paolo noted.  The guest can have 
an ivshmem device attached, and map it above a host-supported physical 
address, and suddenly it goes slow.




[GIT PULL] KVM fixes for 4.0-rc3

2015-03-09 Thread Marcelo Tosatti

Linus,

Please pull from

git://git.kernel.org/pub/scm/virt/kvm/kvm.git master

To receive the following KVM S/390 bug fixes


Christian Borntraeger (1):
  KVM: s390/cpacf: Fix kernel bug under z/VM

Marcelo Tosatti (1):
  Merge tag 'kvm-s390-master-20150303' of 
git://git.kernel.org/.../kvms390/linux

Michael Mueller (3):
  KVM: s390: fix in memory copy of facility lists
  KVM: s390: include guest facilities in kvm facility test
  KVM: s390: non-LPAR case obsolete during facilities mask init

Tony Krowiak (1):
  KVM: s390/cpacf: Enable key wrapping by default

 arch/s390/include/asm/kvm_host.h |   12 +++---
 arch/s390/kvm/kvm-s390.c |   70 +--
 arch/s390/kvm/kvm-s390.h |3 +
 arch/s390/kvm/priv.c |2 -
 4 files changed, 41 insertions(+), 46 deletions(-)


Re: 2 CPU Conformance Issue in KVM/x86

2015-03-09 Thread Paolo Bonzini


On 09/03/2015 20:19, Avi Kivity wrote:
 I can't think of one with reasonable performance either.  Perhaps the
 maintainers could raise the issue with Intel.  It looks academic but it
 can happen in real life -- KVM for example used to rely on reserved bits
 faults (it set all bits in the PTE so it wouldn't have been caught by
 this).

Yes, and it checked that MAXPHYADDR != 52 before.  If you want to set
only one bit, making that bit 51 makes sense anyway for simplicity, so
it is still 99.9% academic.  Once processors appear with MAXPHYADDR =
52, the remaining 0.1% will become more relevant.

The current limit is IIRC 46 or 48 (on Haswell Xeons).

Paolo


Re: 2 CPU Conformance Issue in KVM/x86

2015-03-09 Thread Avi Kivity

On 03/09/2015 09:38 PM, Paolo Bonzini wrote:


On 09/03/2015 20:19, Avi Kivity wrote:

I can't think of one with reasonable performance either.  Perhaps the
maintainers could raise the issue with Intel.  It looks academic but it
can happen in real life -- KVM for example used to rely on reserved bits
faults (it set all bits in the PTE so it wouldn't have been caught by
this).

Yes, and it checked that MAXPHYADDR != 52 before.  If you want to set
only one bit, making that bit 51 makes sense anyway for simplicity, so
it is still 99.9% academic.  Once processors appear with MAXPHYADDR =
52, the remaining 0.1% will become more relevant.

The current limit is IIRC 46 or 48 (on Haswell Xeons).



It will be interesting to have processors with 52 bits of physical 
address and 48 bits of virtual address. HIGHMEM for x86_64?  Or 5-level 
page tables?


50 bits == 1 PiB.  That's quite an amount of RAM.


Re: [PATCH] KVM: make halt_poll_ns static

2015-03-09 Thread Marcelo Tosatti
On Fri, Feb 27, 2015 at 04:50:10PM +0100, Christian Borntraeger wrote:
 halt_poll_ns is used only locally. Make it static.
 
 Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
 ---
  virt/kvm/kvm_main.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

Applied, thanks.



Re: [PATCH] KVM: Get rid of kvm_kvfree()

2015-03-09 Thread Marcelo Tosatti
On Tue, Feb 24, 2015 at 09:29:25PM +0100, Thomas Huth wrote:
 kvm_kvfree() provides exactly the same functionality as the
 new common kvfree() function - so let's simply replace the
 kvm function with the common function.
 
 Signed-off-by: Thomas Huth th...@linux.vnet.ibm.com

Applied, thanks.



Re: [PATCH] kvm: fix to update memslots properly

2015-03-09 Thread Marcelo Tosatti
On Sat, Dec 27, 2014 at 09:41:45PM +0100, Paolo Bonzini wrote:
  diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
  index f528343..6e52f3f 100644
  --- a/virt/kvm/kvm_main.c
  +++ b/virt/kvm/kvm_main.c
  @@ -672,6 +672,7 @@ static void update_memslots(struct kvm_memslots *slots,
  WARN_ON(mslots[i].id != id);
  if (!new->npages) {
  new->base_gfn = 0;
  +   new->flags = 0;
  if (mslots[i].npages)
  slots->used_slots--;
  } else {
 
 This should not be necessary.  The part of the mslots array that has 
 base_gfn == npages == 0 is entirely unused, and such a slot can never 
 be returned by search_memslots because this:
 
 if (gfn >= memslots[slot].base_gfn &&
 gfn < memslots[slot].base_gfn + memslots[slot].npages)
 
 can never be true.
 
  @@ -688,7 +689,9 @@ static void update_memslots(struct kvm_memslots *slots,
  i++;
  }
  while (i > 0 &&
  -  new->base_gfn < mslots[i - 1].base_gfn) {
  +  ((new->base_gfn < mslots[i - 1].base_gfn) ||
  +   (!new->base_gfn &&
  +!mslots[i - 1].base_gfn && !mslots[i - 1].npages))) {
  mslots[i] = mslots[i - 1];
  slots->id_to_index[mslots[i].id] = i;
  i--;
  
 
 You should have explained _why_ this fixes the bug, and what invariant 
 is not being respected, something like this:
 
 kvm: fix sorting of memslots with base_gfn == 0
 
 Before commit 0e60b0799fed (kvm: change memslot sorting rule from size
 to GFN, 2014-12-01), the memslots' sorting key was npages, meaning
 that a valid memslot couldn't have its sorting key equal to zero.
 On the other hand, a valid memslot can have base_gfn == 0, and invalid
 memslots are identified by base_gfn == npages == 0.
 
 Because of this, commit 0e60b0799fed broke the invariant that invalid
 memslots are at the end of the mslots array.  When a memslot with
 base_gfn == 0 was created, any invalid memslots before it were left
 in place.
 
 This suggests another fix.  We can change the insertion to use a >=
 comparison, as in your first patch.  Alone it is not correct, but we
 only need to take some care and avoid breaking the case of deleting a
 memslot.
 
 It's enough to wrap the second loop (that you patched) with
 if (new->npages).  In the new->npages == 0 case the first loop has
 already set i to the right value, and moving i back would be wrong:
 
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index f5283438ee05..050974c051b5 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -687,11 +687,23 @@ static void update_memslots(struct kvm_memslots *slots,
   slots->id_to_index[mslots[i].id] = i;
   i++;
   }
 - while (i > 0 &&
 -new->base_gfn < mslots[i - 1].base_gfn) {
 - mslots[i] = mslots[i - 1];
 - slots->id_to_index[mslots[i].id] = i;
 - i--;
 +
 + /*
 +  * The >= is needed when creating a slot with base_gfn == 0,
 +  * so that it moves before all those with base_gfn == npages == 0.
 +  *
 +  * On the other hand, if new->npages is zero, the above loop has
 +  * already left i pointing to the beginning of the empty part of
 +  * mslots, and the >= would move the hole backwards in this
 +  * case---which is wrong.  So skip the loop when deleting a slot.
 +  */
 + if (new->npages) {
 + while (i > 0 &&
 +new->base_gfn >= mslots[i - 1].base_gfn) {
 + mslots[i] = mslots[i - 1];
 + slots->id_to_index[mslots[i].id] = i;
 + i--;
 + }
   }
  
   mslots[i] = *new;
 
 Paolo
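
(A worked trace of that invariant, with illustrative values not taken from
the thread: mslots is sorted by base_gfn in descending order, and invalid
slots, base_gfn == npages == 0, must stay at the end.

   before insert:  [ {base_gfn=0x100, npages=16}, {0,0}, {0,0} ]
   insert {base_gfn=0, npages=16} with the '<' compare:
       0 < 0 is false, the loop stops at once ->
       [ {0x100,16}, {0,0}, {0,16} ]    an invalid slot sits before a valid one
   with the '>=' compare:
       0 >= 0 is true, invalid slots are shifted right ->
       [ {0x100,16}, {0,16}, {0,0} ]    the invariant holds again)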

Paolo,

Can you include a proper changelog for this patch?



Re: [PATCH v2 1/5] arm: KVM: export vcpi->pause state via MP_STATE ioctls

2015-03-09 Thread Christoffer Dall
On Mon, Mar 09, 2015 at 04:34:21PM +, Alex Bennée wrote:
 
 Christoffer Dall christoffer.d...@linaro.org writes:
 
  Hi Alex,
 
  The subject of this change has a typo, and I also think it's not about
  exposing the pause state (that's just an internal name/concept), but
  about exposing the PSCI state, or simply the VCPU power state.
 
 arm: KVM: export VCPU power state via MP_STATE ioctl?
 

arm/arm64: KVM: 

otherwise looks good to me ;)


 
  On Mon, Mar 02, 2015 at 01:29:00PM +, Alex Bennée wrote:
  To cleanly restore an SMP VM we need to ensure that the current pause
  state of each vcpu is correctly recorded. Things could get confused if
  the CPU starts running after migration restore completes when it was
  paused before its state was captured.
  
  We use the existing KVM_GET/SET_MP_STATE ioctl to do this. The arm/arm64
  interface is a lot simpler as the only valid states are
  KVM_MP_STATE_RUNNABLE and KVM_MP_STATE_HALTED.
  
  Signed-off-by: Alex Bennée alex.ben...@linaro.org
  
  diff --git a/Documentation/virtual/kvm/api.txt 
  b/Documentation/virtual/kvm/api.txt
  index b112efc..602156f 100644
  --- a/Documentation/virtual/kvm/api.txt
  +++ b/Documentation/virtual/kvm/api.txt
  @@ -997,7 +997,7 @@ for vm-wide capabilities.
   4.38 KVM_GET_MP_STATE
   
   Capability: KVM_CAP_MP_STATE
  -Architectures: x86, s390
  +Architectures: x86, s390, arm, arm64
   Type: vcpu ioctl
   Parameters: struct kvm_mp_state (out)
   Returns: 0 on success; -1 on error
  @@ -1027,15 +1027,21 @@ Possible values are:
  - KVM_MP_STATE_LOAD:          the vcpu is in a special load/startup state
                                [s390]
   
  -On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an
  -in-kernel irqchip, the multiprocessing state must be maintained by 
  userspace on
  +For x86:
  +
  +This ioctl is only useful after KVM_CREATE_IRQCHIP.  Without an in-kernel
  +irqchip, the multiprocessing state must be maintained by userspace on
 
  Nit: I would not taint the git log with this change but instead just
  introduce a paragraph below starting with On arm/arm64,  and you would
  get the same effect.
 
   these architectures.
   
  +For arm/arm64:
  +
  +The only states that are valid are KVM_MP_STATE_HALTED and
  +KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not.
 
  As suggested on the QEMU series, HALTED is probably not the right thing
  to use.
 
 KVM_MP_STATE_STOPPED is currently only used for s390 but seems to fit.
 I'm wary of adding yet another define.
 

sounds fine, as long as it doesn't have some inherently different
meaning with s390.

 
   
   4.39 KVM_SET_MP_STATE
   
   Capability: KVM_CAP_MP_STATE
  -Architectures: x86, s390
  +Architectures: x86, s390, arm, arm64
   Type: vcpu ioctl
   Parameters: struct kvm_mp_state (in)
   Returns: 0 on success; -1 on error
  @@ -1043,10 +1049,16 @@ Returns: 0 on success; -1 on error
   Sets the vcpu's current multiprocessing state; see KVM_GET_MP_STATE for
   arguments.
   
  -On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an
  -in-kernel irqchip, the multiprocessing state must be maintained by 
  userspace on
  +For x86:
  +
  +This ioctl is only useful after KVM_CREATE_IRQCHIP.  Without an in-kernel
  +irqchip, the multiprocessing state must be maintained by userspace on
   these architectures.
   
  +For arm/arm64:
  +
  +The only states that are valid are KVM_MP_STATE_HALTED and
  +KVM_MP_STATE_RUNNABLE which reflect if the vcpu should be paused or not.
 
  same as above
 
   
   4.40 KVM_SET_IDENTITY_MAP_ADDR
   
  diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
  index 5560f74..8531536 100644
  --- a/arch/arm/kvm/arm.c
  +++ b/arch/arm/kvm/arm.c
  @@ -183,6 +183,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
  ext)
 case KVM_CAP_ARM_PSCI:
 case KVM_CAP_ARM_PSCI_0_2:
 case KVM_CAP_READONLY_MEM:
  +  case KVM_CAP_MP_STATE:
 r = 1;
 break;
 case KVM_CAP_COALESCED_MMIO:
  @@ -313,13 +314,29 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct 
  kvm_vcpu *vcpu,
   int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 struct kvm_mp_state *mp_state)
   {
  -  return -EINVAL;
  +  if (vcpu->arch.pause)
  +  mp_state->mp_state = KVM_MP_STATE_HALTED;
  +  else
  +  mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
  +
  +  return 0;
   }
   
   int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 struct kvm_mp_state *mp_state)
   {
  -  return -EINVAL;
  +  switch (mp_state->mp_state) {
  +  case KVM_MP_STATE_RUNNABLE:
  +  vcpu->arch.pause = false;
  +  break;
  +  case KVM_MP_STATE_HALTED:
  +  vcpu->arch.pause = true;
  +  break;
  +  default:
  +  return -EINVAL;
  +  }
  +
  +  return 0;
   }
   
   /**
 
  Are we capturing the vcpu features in some way or do we expect userspace
  to migrate these on its own?  
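
(For reference, a minimal userspace sketch of the ioctl pair being added
above; vcpu_fd is a hypothetical, already-created VCPU file descriptor,
error handling trimmed:)

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int vcpu_is_paused(int vcpu_fd)
{
	struct kvm_mp_state mps;

	if (ioctl(vcpu_fd, KVM_GET_MP_STATE, &mps) < 0)
		return -1;
	return mps.mp_state == KVM_MP_STATE_HALTED;
}

static int vcpu_set_paused(int vcpu_fd, int paused)
{
	struct kvm_mp_state mps = {
		.mp_state = paused ? KVM_MP_STATE_HALTED
				   : KVM_MP_STATE_RUNNABLE,
	};

	return ioctl(vcpu_fd, KVM_SET_MP_STATE, &mps);
}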

[PATCH] KVM: x86: Fix re-execution of patched vmmcall

2015-03-09 Thread Jan Kiszka
For a very long time (since 2b3d2a20), the path handling a vmmcall
instruction of the guest on an Intel host only applied the patch but no
longer handled the hypercall. The reverse case, vmcall on AMD hosts, is
fine. As both em_vmcall and em_vmmcall actually have to do the same, we
can fix the issue by consolidating both into the same handler.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---
 arch/x86/kvm/emulate.c | 17 +++--
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 106c015..c941abe 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3323,7 +3323,7 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
 }
 
-static int em_vmcall(struct x86_emulate_ctxt *ctxt)
+static int em_hypercall(struct x86_emulate_ctxt *ctxt)
 {
	int rc = ctxt->ops->fix_hypercall(ctxt);
 
@@ -3395,17 +3395,6 @@ static int em_lgdt(struct x86_emulate_ctxt *ctxt)
return em_lgdt_lidt(ctxt, true);
 }
 
-static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
-{
-   int rc;
-
-   rc = ctxt->ops->fix_hypercall(ctxt);
-
-   /* Disable writeback. */
-   ctxt->dst.type = OP_NONE;
-   return rc;
-}
-
 static int em_lidt(struct x86_emulate_ctxt *ctxt)
 {
return em_lgdt_lidt(ctxt, false);
@@ -3769,7 +3758,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 
 static const struct opcode group7_rm0[] = {
N,
-   I(SrcNone | Priv | EmulateOnUD, em_vmcall),
+   I(SrcNone | Priv | EmulateOnUD, em_hypercall),
N, N, N, N, N, N,
 };
 
@@ -3781,7 +3770,7 @@ static const struct opcode group7_rm1[] = {
 
 static const struct opcode group7_rm3[] = {
DIP(SrcNone | Prot | Priv,  vmrun,  check_svme_pa),
-   II(SrcNone  | Prot | EmulateOnUD,   em_vmmcall, vmmcall),
+   II(SrcNone  | Prot | EmulateOnUD,   em_hypercall,   vmmcall),
DIP(SrcNone | Prot | Priv,  vmload, check_svme_pa),
DIP(SrcNone | Prot | Priv,  vmsave, check_svme_pa),
DIP(SrcNone | Prot | Priv,  stgi,   check_svme),
-- 
2.1.4


Re: [PATCH v2 1/5] arm: KVM: export vcpi->pause state via MP_STATE ioctls

2015-03-09 Thread Peter Maydell
On 10 March 2015 at 04:29, Christoffer Dall christoffer.d...@linaro.org wrote:
 On Mon, Mar 09, 2015 at 04:34:21PM +, Alex Bennée wrote:
 - Boot
 - Power on secondary CPUs
 - Power off one secondary CPU
 - Migrate to file (cpu_powered reflects state of each CPU)

 - Start fresh QEMU
 - Restore from file (cpu_powered - vcpu-paused via ioctl)
 - Run (we continue with the same power state pre-migrate)

 - PSCI RESET
 - Does what it does, power all secondaries down?
 - Kernel boots, turns them on?

 Hmmm. As long as QEMU always inits all VCPUs in the exact same way
 (including the KVM_ARM_VCPU_POWER_OFF feature bit) I guess it works and
 that's probably a reasonable requirement for migration.

We init the VCPUs with the POWER_OFF flag set for exactly the
set of CPUs that we want to start powered off. Typically that
means that the first CPU is inited with POWER_OFF clear and the
rest are inited with it set.
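
(Roughly how that init looks from userspace, using the existing
KVM_ARM_PREFERRED_TARGET and KVM_ARM_VCPU_INIT ioctls; vm_fd/vcpu_fd and
the error handling are placeholders, sketch only:)

	struct kvm_vcpu_init init;

	ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init);   /* fills .target */
	if (!is_boot_cpu)
		init.features[0] |= 1 << KVM_ARM_VCPU_POWER_OFF;
	ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);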

Regardless, if we're doing an incoming migration, then the
incoming data will be used to set the VCPU on/off state
appropriately for resuming the VM. (This might include
powering on a VCPU that's been inited in power-off and never
run, or powering down a VCPU that was inited power-on but
never ran. In the 'migration moves a vcpu to powered-on'
case we'll also set the vcpu's PC and other registers so
when it is run it will DTRT.)

If the resumed guest subsequently does a PSCI reset then
QEMU will re-init the VCPUs with the set of feature bits that
the machine model/etc defines for a reset condition for this
board, which will be the same first CPU powered on, all the
rest powered off config we started with.

(It's the user's responsibility to ensure that when doing a
migration the QEMUs at both ends are identically configured,
incidentally.)

-- PMM


[PATCH kvm-unit-test] x86: vmx: Check #UD triggering of vmmcall

2015-03-09 Thread Jan Kiszka
KVM tends to patch and emulate vmmcall on Intel. But that must not
happen for L2.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---

If the recently posted fixes for KVM are applied, this test passes.

 x86/vmx_tests.c | 40 
 1 file changed, 40 insertions(+)

diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c
index 41a9a82..4f8ace1 100644
--- a/x86/vmx_tests.c
+++ b/x86/vmx_tests.c
@@ -11,6 +11,7 @@
 #include "fwcfg.h"
 #include "isr.h"
 #include "apic.h"
+#include "types.h"
 
 u64 ia32_pat;
 u64 ia32_efer;
@@ -1493,6 +1494,44 @@ static int msr_switch_exit_handler()
return VMX_TEST_EXIT;
 }
 
+static int vmmcall_init(struct vmcs *vmcs)
+{
+   vmcs_write(EXC_BITMAP, 1 << UD_VECTOR);
+   return VMX_TEST_START;
+}
+
+static void vmmcall_main(void)
+{
+   asm volatile(
+   "mov $0xABCD, %%rax\n\t"
+   "vmmcall\n\t"
+   ::: "rax");
+
+   report("VMMCALL", 0);
+}
+
+static int vmmcall_exit_handler()
+{
+   ulong reason;
+
+   reason = vmcs_read(EXI_REASON);
+   switch (reason) {
+   case VMX_VMCALL:
+   printf("here\n");
+   report("VMMCALL triggers #UD", 0);
+   break;
+   case VMX_EXC_NMI:
+   report("VMMCALL triggers #UD",
+  (vmcs_read(EXI_INTR_INFO) & 0xff) == UD_VECTOR);
+   break;
+   default:
+   printf("Unknown exit reason, %d\n", reason);
+   print_vmexit_info();
+   }
+
+   return VMX_TEST_VMEXIT;
+}
+
 /* name/init/guest_main/exit_handler/syscall_handler/guest_regs */
 struct vmx_test vmx_tests[] = {
	{ "null", NULL, basic_guest_main, basic_exit_handler, NULL, {0} },
@@ -1516,5 +1555,6 @@ struct vmx_test vmx_tests[] = {
NULL, {0} },
{ MSR switch, msr_switch_init, msr_switch_main,
msr_switch_exit_handler, NULL, {0} },
+   { "vmmcall", vmmcall_init, vmmcall_main, vmmcall_exit_handler, NULL,
{0} },
{ NULL, NULL, NULL, NULL, NULL, {0} },
 };
-- 
2.1.4


[PATCH trivial] KVM: s390: Spelling s/intance/instance/

2015-03-09 Thread Geert Uytterhoeven
Signed-off-by: Geert Uytterhoeven geert+rene...@glider.be
---
 arch/s390/kvm/kvm-s390.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index fda3f3146eb636d3..83f32a147d72c0f3 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -125,7 +125,7 @@ static inline void kvm_s390_set_psw_cc(struct kvm_vcpu 
*vcpu, unsigned long cc)
	vcpu->arch.sie_block->gpsw.mask |= cc << 44;
 }
 
-/* test availability of facility in a kvm intance */
+/* test availability of facility in a kvm instance */
 static inline int test_kvm_facility(struct kvm *kvm, unsigned long nr)
 {
	return __test_facility(nr, kvm->arch.model.fac->mask) &&
-- 
1.9.1



Re: 2 CPU Conformance Issue in KVM/x86

2015-03-09 Thread Nadav Amit
Avi Kivity avi.kiv...@gmail.com wrote:

 On 03/09/2015 07:51 PM, Nadav Amit wrote:
 Avi Kivity avi.kiv...@gmail.com wrote:
 
 On 03/03/2015 11:52 AM, Paolo Bonzini wrote:
 In this
 case, the VM might expect exceptions when PTE bits which are higher than 
 the
 maximum (reported) address width are set, and it would not get such
 exceptions. This problem can easily be experienced by a small change to the
 existing KVM unit-tests.
 
 There are many variants to this problem, and the only solution which I
 consider complete is to report to the VM the maximum (52) physical address
 width to the VM, configure the VM to exit on #PF with reserved-bit
 error-codes, and then emulate these faulting instructions.
 Not even that would be a definitive solution.  If the guest tries to map
 RAM (e.g. a PCI BAR that is backed by RAM) above the host MAXPHYADDR,
 you would get EPT misconfiguration vmexits.
 
 I think there is no way to emulate physical address width correctly,
 except by disabling EPT.
 Is the issue emulating a higher MAXPHYADDR on the guest than is available
 on the host? I don't think there's any need to support that.
 
 Emulating a lower setting on the guest than is available on the host is, I
 think, desirable. Whether it would work depends on the relative priority
 of EPT misconfiguration exits vs. page table permission faults.
 Thanks for the feedback.
 
 Guest page-table permissions faults got priority over EPT misconfiguration.
 KVM can even be set to trap page-table permission faults, at least in VT-x.
 Anyhow, I don’t think it is enough.
 
 Why is it not enough? If you trap a permission fault, you can inject any 
 exception error code you like.

Because there is no real permission fault. In the following example, the VM
expects one (VM’s MAXPHYADDR=40), but there isn’t (Host’s MAXPHYADDR=46), so
the hypervisor cannot trap it. It can only trap all #PF, which is obviously
too intrusive.

  Here is an example
 
 My machine has MAXPHYADDR of 46. I modified kvm-unit-tests access test to
 set pte.45 instead of pte.51, which from the VM point-of-view should cause
 the #PF error-code indicate the reserved bits are set (just as pte.51 does).
 Here is one error from the log:
 
 test pte.p pte.45 pde.p user: FAIL: error code 5 expected d
 Dump mapping: address: 1234
 --L4: 304b007
 --L3: 304c007
 --L2: 304d001
 --L1: 2201
 
 This is with an ept misconfig programmed into that address, yes?
A reserved bit in the PTE is set - from the VM point-of-view. If there
wasn’t another cause for #PF, it would lead to EPT violation/misconfig.

 As you can see, the #PF should have had two reasons: reserved bits, and user
 access to supervisor only page. The error-code however does not indicate the
 reserved-bits are set.
 
 Note that KVM did not trap any exit on that faulting instruction, as
 otherwise it would try to emulate the instruction and assuming it is
 supported (and that the #PF was not on an instruction fetch), should be able
 to emulate the #PF correctly.
 [ The test actually crashes soon after this error due to these reasons. ]
 
 Anyhow, that is the reason for me to assume that having the maximum
 MAXPHYADDR is better.
 
 Well, that doesn't work for the reasons Paolo noted.  The guest can have an 
 ivshmem device attached, and map it above a host-supported virtual address, 
 and suddenly it goes slow.

I fully understand. That’s the reason I don’t have a reasonable solution.

Regards,
Nadav


Re: 2 CPU Conformance Issue in KVM/x86

2015-03-09 Thread Paolo Bonzini


On 09/03/2015 18:08, Avi Kivity wrote:
 Is the issue emulating a higher MAXPHYADDR on the guest than is
 available on the host? I don't think there's any need to support that.

No, indeed.  The only problem is that the failure mode is quite horrible
(you get a triple fault, possibly while the guest is running).

If you need that, disable EPT. :)

 Emulating a lower setting on the guest than is available on the host is,
 I think, desirable. Whether it would work depends on the relative
 priority of EPT misconfiguration exits vs. page table permission faults.

Yes, that's doable.

Paolo


Re: 2 CPU Conformance Issue in KVM/x86

2015-03-09 Thread Avi Kivity

On 03/09/2015 09:07 PM, Nadav Amit wrote:

Avi Kivity avi.kiv...@gmail.com wrote:


On 03/09/2015 07:51 PM, Nadav Amit wrote:

Avi Kivity avi.kiv...@gmail.com wrote:


On 03/03/2015 11:52 AM, Paolo Bonzini wrote:

In this
case, the VM might expect exceptions when PTE bits which are higher than the
maximum (reported) address width are set, and it would not get such
exceptions. This problem can easily be experienced by a small change to the
existing KVM unit-tests.

There are many variants to this problem, and the only solution which I
consider complete is to report to the VM the maximum (52) physical address
width to the VM, configure the VM to exit on #PF with reserved-bit
error-codes, and then emulate these faulting instructions.

Not even that would be a definitive solution.  If the guest tries to map
RAM (e.g. a PCI BAR that is backed by RAM) above the host MAXPHYADDR,
you would get EPT misconfiguration vmexits.

I think there is no way to emulate physical address width correctly,
except by disabling EPT.

Is the issue emulating a higher MAXPHYADDR on the guest than is available
on the host? I don't think there's any need to support that.

Emulating a lower setting on the guest than is available on the host is, I
think, desirable. Whether it would work depends on the relative priority
of EPT misconfiguration exits vs. page table permission faults.

Thanks for the feedback.

Guest page-table permissions faults got priority over EPT misconfiguration.
KVM can even be set to trap page-table permission faults, at least in VT-x.
Anyhow, I don’t think it is enough.

Why is it not enough? If you trap a permission fault, you can inject any 
exception error code you like.

Because there is no real permission fault. In the following example, the VM
expects one (VM’s MAXPHYADDR=40), but there isn’t (Host’s MAXPHYADDR=46), so
the hypervisor cannot trap it. It can only trap all #PF, which is obviously
too intrusive.


There are three cases:

1) The guest has marked the page as not present.  In this case, no 
reserved bits are set and the guest should receive its #PF.
2) The page is present and the permissions are sufficient.  In this 
case, you will get an EPT misconfiguration and can proceed to inject a 
#PF with the reserved bit flag set.
3) The page is present but permissions are not sufficient.  In this case 
you can trap the fault via the PFEC_MASK register and inject a #PF to 
the guest.


So you can emulate it and only trap permission faults.  It's still too 
expensive though.
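
(To pin the three cases down, a hedged sketch with made-up helper names --
not actual KVM code; guest_pte_present(), pte_has_bits_above_guest_maxphyaddr()
and inject_pf() are hypothetical helpers:)

/* Illustrative dispatch for a guest whose reported MAXPHYADDR is lower
 * than the host's. */
static void reflect_guest_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 pfec)
{
	if (!guest_pte_present(vcpu, gpa)) {
		/* 1) not present: no reserved bits involved, forward #PF */
		inject_pf(vcpu, pfec);
	} else if (pte_has_bits_above_guest_maxphyaddr(vcpu, gpa)) {
		/* 2) present, permissions sufficient: arrives as an EPT
		 * misconfiguration; inject #PF with the RSVD flag set */
		inject_pf(vcpu, pfec | PFERR_PRESENT_MASK | PFERR_RSVD_MASK);
	} else {
		/* 3) present, permissions insufficient: trapped via the
		 * PFEC_MASK/PFEC_MATCH VMCS fields; reflect the #PF */
		inject_pf(vcpu, pfec);
	}
}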




  Here is an example

My machine has MAXPHYADDR of 46. I modified kvm-unit-tests access test to
set pte.45 instead of pte.51, which from the VM point-of-view should cause
the #PF error-code indicate the reserved bits are set (just as pte.51 does).
Here is one error from the log:

test pte.p pte.45 pde.p user: FAIL: error code 5 expected d
Dump mapping: address: 1234
--L4: 304b007
--L3: 304c007
--L2: 304d001
--L1: 2201

This is with an ept misconfig programmed into that address, yes?

A reserved bit in the PTE is set - from the VM point-of-view. If there
wasn’t another cause for #PF, it would lead to EPT violation/misconfig.


As you can see, the #PF should have had two reasons: reserved bits, and user
access to supervisor only page. The error-code however does not indicate the
reserved-bits are set.

Note that KVM did not trap any exit on that faulting instruction, as
otherwise it would try to emulate the instruction and assuming it is
supported (and that the #PF was not on an instruction fetch), should be able
to emulate the #PF correctly.
[ The test actually crashes soon after this error due to these reasons. ]

Anyhow, that is the reason for me to assume that having the maximum
MAXPHYADDR is better.

Well, that doesn't work for the reasons Paolo noted.  The guest can have an 
ivshmem device attached, and map it above a host-supported virtual address, and 
suddenly it goes slow.

I fully understand. That’s the reason I don’t have a reasonable solution.


I can't think of one with reasonable performance either.  Perhaps the 
maintainers could raise the issue with Intel.  It looks academic but it 
can happen in real life -- KVM for example used to rely on reserved bits 
faults (it set all bits in the PTE so it wouldn't have been caught by this).



Re: 2 CPU Conformance Issue in KVM/x86

2015-03-09 Thread Avi Kivity

On 03/09/2015 09:33 PM, Paolo Bonzini wrote:


On 09/03/2015 18:08, Avi Kivity wrote:

Is the issue emulating a higher MAXPHYADDR on the guest than is
available on the host? I don't think there's any need to support that.

No, indeed.  The only problem is that the failure mode is quite horrible
(you get a triple fault, possibly while the guest is running).


Can't qemu simply check for it?


If you need that, disable EPT. :)





Re: [PATCH 5/6] target-arm/kvm64: fix save/restore of SPSR regs

2015-03-09 Thread Christoffer Dall
On Mon, Mar 09, 2015 at 10:31:19PM +0900, Peter Maydell wrote:
 On 9 March 2015 at 21:56, Christoffer Dall christoffer.d...@linaro.org 
 wrote:
  this function, however, is not used only for migration, but should
  generally cover the case where you want to synchronize QEMU's state into
  KVM's state, right?  So while it may not be harmful in currently
  supported use cases, is there ever a situation where (is_a64(env) && el
  == 0) and env->spsr != banked_spsr[el], and where env->spsr is
  out-of-date?
 
 If EL == 0 then you can't access any SPSR, so env->spsr is by
 definition out of date.
 
ok, never mind then.

-Christoffer


[PATCH] KVM: nVMX: Do not emulate #UD while in guest mode

2015-03-09 Thread Jan Kiszka
While in L2, leave all #UD to L2 and do not try to emulate it. If L1 is
interested in doing this, it reports its interest via the exception
bitmap, and we never get into handle_exception of L0 anyway.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---

Noticed while wondering where the vmmcall of a misconfigured L2 went on
an Intel box: to nowhere. This bug caused a spurious fixup, and the
emulator bug did not even let it trigger a vmcall vmexit.

 arch/x86/kvm/vmx.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f7b20b4..fa0627c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5065,6 +5065,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
}
 
if (is_invalid_opcode(intr_info)) {
+   if (is_guest_mode(vcpu)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(vcpu, UD_VECTOR);
-- 
2.1.4


Re: [PATCH] x86: svm: make wbinvd faster

2015-03-09 Thread Marcelo Tosatti
On Fri, Feb 27, 2015 at 06:19:18PM -0600, Joel Schopp wrote:
 From: David Kaplan david.kap...@amd.com
 No need to re-decode WBINVD since we know what it is from the intercept.
 
 Signed-off-by: David Kaplan david.kap...@amd.com
 [extracted from larger unrelated patch, forward ported, tested]
 Signed-off-by: Joel Schopp joel.sch...@amd.com

Can't you disable the intercept if need_emulate_wbinvd(vcpu) == false? 

 ---
  arch/x86/kvm/svm.c |   10 +-
  1 file changed, 9 insertions(+), 1 deletion(-)
 
 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
 index d319e0c..86ecd21 100644
 --- a/arch/x86/kvm/svm.c
 +++ b/arch/x86/kvm/svm.c
 @@ -2776,6 +2776,14 @@ static int skinit_interception(struct vcpu_svm *svm)
   return 1;
  }
  
 +static int wbinvd_interception(struct vcpu_svm *svm)
 +{
 + kvm_emulate_wbinvd(svm->vcpu);
 + skip_emulated_instruction(svm->vcpu);
 + return 1;
 +}
 +
 +
  static int xsetbv_interception(struct vcpu_svm *svm)
  {
  u64 new_bv = kvm_read_edx_eax(svm->vcpu);
 @@ -3376,7 +3384,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm 
 *svm) = {
   [SVM_EXIT_STGI] = stgi_interception,
   [SVM_EXIT_CLGI] = clgi_interception,
   [SVM_EXIT_SKINIT]   = skinit_interception,
 - [SVM_EXIT_WBINVD]   = emulate_on_interception,
 + [SVM_EXIT_WBINVD]   = wbinvd_interception,
   [SVM_EXIT_MONITOR]  = monitor_interception,
   [SVM_EXIT_MWAIT]= mwait_interception,
   [SVM_EXIT_XSETBV]   = xsetbv_interception,
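
(A hedged sketch of Marcelo's suggestion -- drop the intercept entirely
when no emulation is needed, e.g. no non-coherent DMA -- assuming svm.c's
set_intercept()/clr_intercept() helpers and a visible need_emulate_wbinvd();
illustrative only, not part of the posted patch:)

static void svm_update_wbinvd_intercept(struct vcpu_svm *svm)
{
	/* hypothetical: re-evaluate whenever the guest's WBINVD
	 * emulation requirement changes */
	if (need_emulate_wbinvd(&svm->vcpu))
		set_intercept(svm, INTERCEPT_WBINVD);
	else
		clr_intercept(svm, INTERCEPT_WBINVD);
}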
 


Re: [PATCHv2 0/9] Fix possible warnings and errors for kvm_main.c

2015-03-09 Thread Marcelo Tosatti
On Fri, Feb 27, 2015 at 03:18:17PM +0800, Xiubo Li wrote:
 
 There is too much warning/error noise in kvm_main.c when generating and
 checking new patches.
 
 This patch series fixes most of these warnings and errors to reduce possible
 noise.
 
 
 Change in v2:
 - Accept Thomas Huth's advice of using pr_info instead of printk to avoid
   exceeding the 80 columns limit.

I applied v1 of the series by modifying the last patch.

Thanks.



[PATCH] x86: irq_comm: Add check for RH bit in kvm_set_msi_irq

2015-03-09 Thread James Sullivan
Hi folks,

This is a small patch that implements logic to handle the RH bit
being set in the msi message address register. Currently the DM bit is
the only thing used to decide irq->dest_mode (logical when DM set,
physical when unset). Documentation indicates that the DM bit will be
ignored when the RH bit is unset, and physical destination mode is used
in this case.

Fixed this to set irq->dest_mode to logical just when both bits are set,
and physical otherwise.
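
In table form (summarizing the mapping above; RH/DM are the MSI address
register bits):

    RH  DM  ->  irq->dest_mode
     0   x      physical (DM ignored)
     1   0      physical
     1   1      logical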

Signed-off-by: James Sullivan sullivan.jame...@gmail.com
---
 arch/x86/kvm/irq_comm.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 72298b3..26bbab8 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -97,18 +97,26 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct 
kvm_lapic *src,
 static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
   struct kvm_lapic_irq *irq)
 {
+   bool phys;
	trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);

	irq->dest_id = (e->msi.address_lo &
		MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
	irq->vector = (e->msi.data &
		MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
-   irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
+   /*
+* Set dest_mode to logical just in case both the RH and DM
+* bits are set, otherwise default to physical.
+*/
+   phys = ((e->msi.address_lo & (MSI_ADDR_REDIRECTION_LOWPRI |
+   MSI_ADDR_DEST_MODE_LOGICAL)) !=
+   (MSI_ADDR_REDIRECTION_LOWPRI |
+   MSI_ADDR_DEST_MODE_LOGICAL));
+   irq->dest_mode = phys ? 0 : (MSI_ADDR_DEST_MODE_LOGICAL);
	irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
	irq->delivery_mode = e->msi.data & 0x700;
	irq->level = 1;
	irq->shorthand = 0;
-   /* TODO Deal with RH bit of MSI message address */
 }

 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
-- 
2.3.1



Re: [PATCHv2 0/9] Fix possible warnings and errors for kvm_main.c

2015-03-09 Thread Xiubo Li

On 10/03/2015 07:07, Marcelo Tosatti wrote:

On Fri, Feb 27, 2015 at 03:18:17PM +0800, Xiubo Li wrote:

There is too much warning/error noise in kvm_main.c when generating and
checking new patches.

This patch series fixes most of these warnings and errors to reduce possible
noise.


Change in v2:
- Accept Thomas Huth's advice of using pr_info instead of printk to avoid
   exceeding the 80 columns limit.

I applied v1 of the series by modifying the last patch.


Thanks very much.

BRs
Xiubo



Thanks.







Re: [PATCH v2] virtio-balloon: do not call blocking ops when !TASK_RUNNING

2015-03-09 Thread Rusty Russell
Thomas Huth th...@linux.vnet.ibm.com writes:
 On Wed, 25 Feb 2015 16:11:27 +0100
 Cornelia Huck cornelia.h...@de.ibm.com wrote:

 On Wed, 25 Feb 2015 15:36:02 +0100
 Michael S. Tsirkin m...@redhat.com wrote:
 
  virtio balloon has this code:
  wait_event_interruptible(vb->config_change,
   (diff = towards_target(vb)) != 0
   || vb->need_stats_update
   || kthread_should_stop()
   || freezing(current));
  
  Which is a problem because towards_target() call might block after
  wait_event_interruptible sets task state to TASK_INTERRUPTIBLE, causing
  the task_struct::state collision typical of nesting of sleeping
  primitives
  
  See also http://lwn.net/Articles/628628/ or Thomas's
  bug report
  http://article.gmane.org/gmane.linux.kernel.virtualization/24846
  for a fuller explanation.
  
  To fix, rewrite using wait_woken.
  
  Cc: sta...@vger.kernel.org
  Reported-by: Thomas Huth th...@linux.vnet.ibm.com
  Signed-off-by: Michael S. Tsirkin m...@redhat.com
  ---
  
  changes from v1:
 remove wait_event_interruptible
 noticed by Cornelia Huck cornelia.h...@de.ibm.com
  
   drivers/virtio/virtio_balloon.c | 19 ++-
   1 file changed, 14 insertions(+), 5 deletions(-)
  
 
 I was able to reproduce Thomas' original problem and can confirm that
 it is gone with this patch.
 
 Reviewed-by: Cornelia Huck cornelia.h...@de.ibm.com

 Right, I just applied the patch on my system, too, and the problem is
 indeed gone! Thanks for the quick fix!

 Tested-by: Thomas Huth th...@linux.vnet.ibm.com

Applied.

Thanks,
Rusty.
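
(For reference, a sketch of the wait_woken() pattern the fix moves to --
illustrative, not the exact virtio_balloon code: the task only marks itself
non-running inside wait_woken(), so a blocking towards_target() no longer
runs with the task state set to TASK_INTERRUPTIBLE.)

	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&vb->config_change, &wait);
	for (;;) {
		if ((diff = towards_target(vb)) != 0 ||
		    vb->need_stats_update ||
		    kthread_should_stop() ||
		    freezing(current))
			break;
		wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&vb->config_change, &wait);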


Re: virtio balloon: do not call blocking ops when !TASK_RUNNING

2015-03-09 Thread Rusty Russell
Cornelia Huck cornelia.h...@de.ibm.com writes:
 On Wed, 4 Mar 2015 11:25:56 +0100
 Michael S. Tsirkin m...@redhat.com wrote:

 On Wed, Mar 04, 2015 at 04:44:54PM +1030, Rusty Russell wrote:
  Michael S. Tsirkin m...@redhat.com writes:
   On Mon, Mar 02, 2015 at 10:37:26AM +1030, Rusty Russell wrote:
   Thomas Huth th...@linux.vnet.ibm.com writes:
On Thu, 26 Feb 2015 11:50:42 +1030
Rusty Russell ru...@rustcorp.com.au wrote:
   
Thomas Huth th...@linux.vnet.ibm.com writes:
  Hi all,

 with the recent kernel 3.19, I get a kernel warning when I start my
 KVM guest on s390 with virtio balloon enabled:

The deeper problem is that virtio_ccw_get_config just silently fails on OOM.

Neither get_config nor set_config are expected to fail.
   
AFAIK this is currently not a problem. According to
http://lwn.net/Articles/627419/ these kmalloc calls never
fail because they allocate less than a page.
   
   I strongly suggest you unlearn that fact.
   The fix for this is in two parts:
   
   1) Annotate using sched_annotate_sleep() and add a comment: we may spin
  a few times in low memory situations, but this isn't a high
  performance path.
   
   2) Handle get_config (and other) failure in some more elegant way.
   
   Cheers,
   Rusty.
  
   I agree, but I'd like to point out that even without kmalloc,
   on s390 get_config is blocking - it's waiting
   for a hardware interrupt.
  
   And it makes sense: config is not data path, I don't think
   we should spin there.
  
   So I think besides these two parts, we still need my two patches:
   virtio-balloon: do not call blocking ops when !TASK_RUNNING
  
  I prefer to annotate, over trying to fix this.
  
  Because it's not important.  We might spin a few times, but it's very
  unlikely, and it's certainly not performance critical.
  
  Thanks,
  Rusty.
  
  Subject: virtio_balloon: annotate possible sleep waiting for event.
  
  CCW (s390) does this.
  
  Reported-by: Thomas Huth th...@linux.vnet.ibm.com
  Signed-off-by: Rusty Russell ru...@rustcorp.com.au
  
  diff --git a/drivers/virtio/virtio_balloon.c 
  b/drivers/virtio/virtio_balloon.c
  index 0413157f3b49..3f4d5acdbde0 100644
  --- a/drivers/virtio/virtio_balloon.c
  +++ b/drivers/virtio/virtio_balloon.c
  @@ -340,6 +340,15 @@ static int balloon(void *_vballoon)
 s64 diff;
   
 try_to_freeze();
  +
  +  /*
  +   * Reading the config on the ccw backend involves an
  +   * allocation, so we may actually sleep and have an
  +   * extra iteration.  It's extremely unlikely,
 
 Hmm, this part of the comment seems wrong to me.
 Reading the config on the ccw backend always sleeps
 because it's interrupt driven.

 (...)

 So I suspect
 http://mid.gmane.org/1424874878-17155-1-git-send-email-...@redhat.com
 is better.
 
 What do you think?

 I'd prefer to fix this as well. While the I/O request completes
 instantly on current qemu (the ssch backend handles the start function
 immediately, not asynchronously as on real hardware), this (a) is an
 implementation detail that may change and (b) doesn't account for the
 need to deliver the interrupt to the guest - which might take non-zero
 time.

Ah, I see.  My mistake.

I've thrown out my patch, applied that one.

Thanks,
Rusty.


Re: [PATCH] virtio_rpmsg: set DRIVER_OK before using device

2015-03-09 Thread Michael S. Tsirkin
On Mon, Mar 09, 2015 at 05:39:20PM +1030, Rusty Russell wrote:
 Michael S. Tsirkin m...@redhat.com writes:
  virtio spec requires that all drivers set DRIVER_OK
  before using devices. While rpmsg isn't yet
  included in the virtio 1 spec, previous spec versions
  also required this.
 
  virtio rpmsg violates this rule: it calls kick
  before setting DRIVER_OK.
 
  The fix isn't trivial since simply calling virtio_device_ready earlier
  would mean we might get an interrupt in parallel with adding buffers.
 
  Instead, split kick out to prepare+notify calls.  prepare before
  virtio_device_ready - when we know we won't get interrupts. notify right
  afterwards.
 
  Signed-off-by: Michael S. Tsirkin m...@redhat.com
 
 Applied.  I'll wait for Ohad to ack before sending to Linus.
 
 BTW I assume you have a version of qemu which warns on these kind of
 failures?  That'd be nice to have!
 
 Thanks,
 Rusty.

Yes but it's hacky ATM - it's just a one-liner that checks
the status on kick and does fprintf(stderr).
I'm trying to rework the code so it'll actually
set the status automatically, but there are some difficulties there
when ioeventfd is used,
in particular we need a way to re-inject the kick
after status update.


  ---
 
  Note: compile-tested only.
 
   drivers/rpmsg/virtio_rpmsg_bus.c | 17 -
   1 file changed, 16 insertions(+), 1 deletion(-)
 
  diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c 
  b/drivers/rpmsg/virtio_rpmsg_bus.c
  index 92f6af6..73354ee 100644
  --- a/drivers/rpmsg/virtio_rpmsg_bus.c
  +++ b/drivers/rpmsg/virtio_rpmsg_bus.c
  @@ -951,6 +951,7 @@ static int rpmsg_probe(struct virtio_device *vdev)
  void *bufs_va;
  int err = 0, i;
  size_t total_buf_space;
  +   bool notify;
   
  vrp = kzalloc(sizeof(*vrp), GFP_KERNEL);
  if (!vrp)
  @@ -1030,8 +1031,22 @@ static int rpmsg_probe(struct virtio_device *vdev)
  }
  }
   
  +   /*
  +* Prepare to kick but don't notify yet - we can't do this before
  +* device is ready.
  +*/
+   notify = virtqueue_kick_prepare(vrp->rvq);
  +
  +   /* From this point on, we can notify and get callbacks. */
  +   virtio_device_ready(vdev);
  +
  /* tell the remote processor it can start sending messages */
-   virtqueue_kick(vrp->rvq);
  +   /*
  +* this might be concurrent with callbacks, but we are only
  +* doing notify, not a full kick here, so that's ok.
  +*/
  +   if (notify)
+   virtqueue_notify(vrp->rvq);
 
	dev_info(&vdev->dev, "rpmsg host is online\n");
   
  -- 
  MST
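
(For reference: upstream's virtqueue_kick() in drivers/virtio/virtio_ring.c
is just this prepare/notify pair, which is why splitting it around
virtio_device_ready() is safe:)

bool virtqueue_kick(struct virtqueue *vq)
{
	if (virtqueue_kick_prepare(vq))
		return virtqueue_notify(vq);
	return true;
}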


Re: [PATCH] virtio_rpmsg: set DRIVER_OK before using device

2015-03-09 Thread Michael S. Tsirkin
On Sat, Mar 07, 2015 at 08:06:56PM +0100, Michael S. Tsirkin wrote:
 virtio spec requires that all drivers set DRIVER_OK
 before using devices. While rpmsg isn't yet
 included in the virtio 1 spec, previous spec versions
 also required this.
 
 virtio rpmsg violates this rule: it calls kick
 before setting DRIVER_OK.
 
 The fix isn't trivial since simply calling virtio_device_ready earlier
 would mean we might get an interrupt in parallel with adding buffers.
 
 Instead, split kick out to prepare+notify calls.  prepare before
 virtio_device_ready - when we know we won't get interrupts. notify right
 afterwards.
 
 Signed-off-by: Michael S. Tsirkin m...@redhat.com
 ---

Ohad, can you review and ack pls?

 Note: compile-tested only.
 
  drivers/rpmsg/virtio_rpmsg_bus.c | 17 -
  1 file changed, 16 insertions(+), 1 deletion(-)
 
 diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c 
 b/drivers/rpmsg/virtio_rpmsg_bus.c
 index 92f6af6..73354ee 100644
 --- a/drivers/rpmsg/virtio_rpmsg_bus.c
 +++ b/drivers/rpmsg/virtio_rpmsg_bus.c
 @@ -951,6 +951,7 @@ static int rpmsg_probe(struct virtio_device *vdev)
   void *bufs_va;
   int err = 0, i;
   size_t total_buf_space;
 + bool notify;
  
   vrp = kzalloc(sizeof(*vrp), GFP_KERNEL);
   if (!vrp)
 @@ -1030,8 +1031,22 @@ static int rpmsg_probe(struct virtio_device *vdev)
   }
   }
  
 + /*
 +  * Prepare to kick but don't notify yet - we can't do this before
 +  * device is ready.
 +  */
 + notify = virtqueue_kick_prepare(vrp->rvq);
 +
 + /* From this point on, we can notify and get callbacks. */
 + virtio_device_ready(vdev);
 +
   /* tell the remote processor it can start sending messages */
 - virtqueue_kick(vrp->rvq);
 + /*
 +  * this might be concurrent with callbacks, but we are only
 +  * doing notify, not a full kick here, so that's ok.
 +  */
 + if (notify)
 + virtqueue_notify(vrp->rvq);
 
   dev_info(&vdev->dev, "rpmsg host is online\n");
  
 -- 
 MST


Re: virtio fixes pull for 4.0?

2015-03-09 Thread Cornelia Huck
On Mon, 09 Mar 2015 17:43:19 +1030
Rusty Russell ru...@rustcorp.com.au wrote:

 Michael S. Tsirkin m...@redhat.com writes:

  virtio-balloon: do not call blocking ops when !TASK_RUNNING
 
  Rusty, it seems you prefer a different fix for this issue,
  while Cornelia prefers mine. Maybe both me and Cornelia misunderstand the
  issue? I know you dealt with a ton of similar issues recently
  so you are more likely to be right, but I'd like to understand
  what's going on better all the same. Could you help please?
 
 In the longer run, we should handle failures from these callbacks.  But
 we don't need to do that right now.  So we want the minimal fix.
 
 And an annotation is the minimal fix.  The bug has been there for ages;
 it's just the warning that is new (if we *always* slept, we would
 spin, but in practice we'll rarely sleep).

I don't think doing_io() in virtio-ccw will sleep often, although it
might happen under loads that delay posting the interrupt.

I understand why you'd like to do the minimal fix, and it's not likely
that new problems start popping up now.

So I'm OK with doing the annotation for 4.0 and more involved changes
for 4.1.



Re: [PATCH] virtio_rpmsg: set DRIVER_OK before using device

2015-03-09 Thread Rusty Russell
Michael S. Tsirkin m...@redhat.com writes:
 virtio spec requires that all drivers set DRIVER_OK
 before using devices. While rpmsg isn't yet
 included in the virtio 1 spec, previous spec versions
 also required this.

 virtio rpmsg violates this rule: it calls kick
 before setting DRIVER_OK.

 The fix isn't trivial since simply calling virtio_device_ready earlier
 would mean we might get an interrupt in parallel with adding buffers.

 Instead, split kick out to prepare+notify calls.  prepare before
 virtio_device_ready - when we know we won't get interrupts. notify right
 afterwards.

 Signed-off-by: Michael S. Tsirkin m...@redhat.com

Applied.  I'll wait for Ohad to ack before sending to Linus.

BTW I assume you have a version of qemu which warns on these kind of
failures?  That'd be nice to have!

Thanks,
Rusty.

 ---

 Note: compile-tested only.

  drivers/rpmsg/virtio_rpmsg_bus.c | 17 -
  1 file changed, 16 insertions(+), 1 deletion(-)

 diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c 
 b/drivers/rpmsg/virtio_rpmsg_bus.c
 index 92f6af6..73354ee 100644
 --- a/drivers/rpmsg/virtio_rpmsg_bus.c
 +++ b/drivers/rpmsg/virtio_rpmsg_bus.c
 @@ -951,6 +951,7 @@ static int rpmsg_probe(struct virtio_device *vdev)
   void *bufs_va;
   int err = 0, i;
   size_t total_buf_space;
 + bool notify;
  
   vrp = kzalloc(sizeof(*vrp), GFP_KERNEL);
   if (!vrp)
 @@ -1030,8 +1031,22 @@ static int rpmsg_probe(struct virtio_device *vdev)
   }
   }
  
 + /*
 +  * Prepare to kick but don't notify yet - we can't do this before
 +  * device is ready.
 +  */
 + notify = virtqueue_kick_prepare(vrp->rvq);
 +
 + /* From this point on, we can notify and get callbacks. */
 + virtio_device_ready(vdev);
 +
   /* tell the remote processor it can start sending messages */
 - virtqueue_kick(vrp->rvq);
 + /*
 +  * this might be concurrent with callbacks, but we are only
 +  * doing notify, not a full kick here, so that's ok.
 +  */
 + if (notify)
 + virtqueue_notify(vrp->rvq);
 
   dev_info(&vdev->dev, "rpmsg host is online\n");
  
 -- 
 MST


Re: virtio fixes pull for 4.0?

2015-03-09 Thread Rusty Russell
Michael S. Tsirkin m...@redhat.com writes:
 Hi Rusty!
 There are a bunch of (mostly virtio 1.0 related) fixes for virtio
 that need to go into 4.0 I think.
   virtio_blk: typo fix
   virtio_blk: fix comment for virtio 1.0

OK, I've added these two.  I tend to be overcautious after the merge
window.

   virtio_console: init work unconditionally
   virtio_console: avoid config access from irq
   virtio_balloon: set DRIVER_OK before using device

 seem ready?

These are in my virtio-next tree already.  

   virtio_mmio: generation support
   virtio_mmio: fix endian-ness for mmio
 these two are waiting for ack by Pawel

 These two fix bugs in virtio 1.0 code for mmio.
 Host code for that was AFAIK not posted, so I can't test properly.
 Pawel?

I'm waiting on Acks for these two.

   virtio-balloon: do not call blocking ops when !TASK_RUNNING

 Rusty, it seems you prefer a different fix for this issue,
 while Cornelia prefers mine. Maybe both me and Cornelia misunderstand the
 issue? I know you dealt with a ton of similar issues recently
 so you are more likely to be right, but I'd like to understand
 what's going on better all the same. Could you help please?

In the longer run, we should handle failures from these callbacks.  But
we don't need to do that right now.  So we want the minimal fix.

And an annotation is the minimal fix.  The bug has been there for ages;
it's just the warning that is new (if we *always* slept, we would
spin, but in practice we'll rarely sleep).

   virtio_rpmsg: set DRIVER_OK before using device

 Just posted this, but seems pretty obvious.

Yep, I've applied this too.  Thanks!

 I think it's a good idea to merge these patches (maybe except the
 !TASK_RUNNING thing) sooner rather than later, to make sure people have
 the time to test the fixes properly.  Would you like me to pack up (some
 of them) and do a pull request?

I'm waiting a bit longer, since they seem to still be trickling in.

I'm still chasing a QEMU bug, where it seems to fill in a number too
large in the 'len' field for a block device.  It should be 1 byte for a
block device write, for example.  See patch which causes assert() in
qemu, but I had to stop at that point (should get back tomorrow I hope).

Thanks,
Rusty.

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 882a31b..98e99b8 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -243,16 +243,21 @@ int virtio_queue_empty(VirtQueue *vq)
 }
 
 void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
-unsigned int len, unsigned int idx)
+unsigned int len_written, unsigned int idx)
 {
-unsigned int offset;
+unsigned int offset, tot_wlen;
 int i;
 
-trace_virtqueue_fill(vq, elem, len, idx);
+trace_virtqueue_fill(vq, elem, len_written, idx);
+
+for (tot_wlen = i = 0; i < elem->out_num; i++) {
+tot_wlen += elem->out_sg[i].iov_len;
+}
+assert(len_written <= tot_wlen);
 
 offset = 0;
 for (i = 0; i < elem->in_num; i++) {
-size_t size = MIN(len - offset, elem->in_sg[i].iov_len);
+size_t size = MIN(len_written - offset, elem->in_sg[i].iov_len);
 
 cpu_physical_memory_unmap(elem->in_sg[i].iov_base,
   elem->in_sg[i].iov_len,
@@ -270,7 +275,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement 
*elem,
 
 /* Get a pointer to the next entry in the used ring. */
 vring_used_ring_id(vq, idx, elem->index);
-vring_used_ring_len(vq, idx, len);
+vring_used_ring_len(vq, idx, len_written);
 }
 
 void virtqueue_flush(VirtQueue *vq, unsigned int count)
@@ -288,9 +293,9 @@ void virtqueue_flush(VirtQueue *vq, unsigned int count)
 }
 
 void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem,
-unsigned int len)
+unsigned int len_written)
 {
-virtqueue_fill(vq, elem, len, 0);
+virtqueue_fill(vq, elem, len_written, 0);
 virtqueue_flush(vq, 1);
 }
 
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index df09993..153374f 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -191,10 +191,10 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int 
queue_size,
 void virtio_del_queue(VirtIODevice *vdev, int n);
 
 void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem,
-unsigned int len);
+unsigned int len_written);
 void virtqueue_flush(VirtQueue *vq, unsigned int count);
 void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
-unsigned int len, unsigned int idx);
+unsigned int len_written, unsigned int idx);
 
 void virtqueue_map_sg(struct iovec *sg, hwaddr *addr,
 size_t num_sg, int is_write);