[PATCH 1/2] powerpc/powernv: Fix IOMMU table for VFIO dev
On PHB3, PCI devices can bypass IOMMU for DMA access. If we pass through one PCI device, whose hose driver ever enable the bypass mode, pdev-dev.archdata.dma_data.iommu_table_base isn't IOMMU table. However, EEH needs access the IOMMU table when the device is owned by guest. The patch fixes pdev-dev.archdata.dma_data.iommu_table when passing through the device to guest in pnv_pci_ioda2_set_bypass(). Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/platforms/powernv/pci-ioda.c | 30 +- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index de19ede..93fd815 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -494,14 +494,22 @@ static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, return 0; } -static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) +static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, + struct pci_bus *bus, + bool add_to_iommu_group) { struct pci_dev *dev; list_for_each_entry(dev, bus-devices, bus_list) { - set_iommu_table_base_and_group(dev-dev, pe-tce32_table); + if (add_to_iommu_group) + set_iommu_table_base_and_group(dev-dev, + pe-tce32_table); + else + set_iommu_table_base(dev-dev, pe-tce32_table); + if (dev-subordinate) - pnv_ioda_setup_bus_dma(pe, dev-subordinate); + pnv_ioda_setup_bus_dma(pe, dev-subordinate, + add_to_iommu_group); } } @@ -677,7 +685,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, if (pe-pdev) set_iommu_table_base_and_group(pe-pdev-dev, tbl); else - pnv_ioda_setup_bus_dma(pe, pe-pbus); + pnv_ioda_setup_bus_dma(pe, pe-pbus, true); return; fail: @@ -713,11 +721,15 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) 0); /* -* We might want to reset the DMA ops of all devices on -* this PE. 
However in theory, that shouldn't be necessary -* as this is used for VFIO/KVM pass-through and the device -* hasn't yet been returned to its kernel driver +* EEH needs the mapping between IOMMU table and group +* of those VFIO/KVM pass-through devices. We can postpone +* resetting DMA ops until the DMA mask is configured in +* host side. */ + if (pe-pdev) + set_iommu_table_base(pe-pdev-dev, tbl); + else + pnv_ioda_setup_bus_dma(pe, pe-pbus, false); } if (rc) pe_err(pe, OPAL error %lld configuring bypass window\n, rc); @@ -805,7 +817,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, if (pe-pdev) set_iommu_table_base_and_group(pe-pdev-dev, tbl); else - pnv_ioda_setup_bus_dma(pe, pe-pbus); + pnv_ioda_setup_bus_dma(pe, pe-pbus, true); /* Also create a bypass window */ pnv_pci_ioda2_setup_bypass_pe(phb, pe); -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/2] powerpc/eeh: Fetch IOMMU table in reliable way
Function eeh_iommu_group_to_pe() iterates each PCI device to check the binding IOMMU group with get_iommu_table_base(), which possibly fetches pdev-dev.archdata.dma_data.dma_offset. It's (0x1 59) for bypass cases. The patch fixes the issue by iterating devices hooked to the IOMMU group and fetch IOMMU table there. Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/kernel/eeh.c | 33 ++--- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 18c40fd..4de2103 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -27,6 +27,7 @@ #include linux/init.h #include linux/list.h #include linux/pci.h +#include linux/iommu.h #include linux/proc_fs.h #include linux/rbtree.h #include linux/reboot.h @@ -1178,6 +1179,24 @@ out: } EXPORT_SYMBOL(eeh_dev_release); +static int dev_has_iommu_table(struct device *dev, void *data) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct pci_dev **ppdev = data; + struct iommu_table *tbl; + + if (!dev) + return 0; + + tbl = get_iommu_table_base(dev); + if (tbl tbl-it_group) { + *ppdev = pdev; + return 1; + } + + return 0; +} + /** * eeh_iommu_group_to_pe - Convert IOMMU group to EEH PE * @group: IOMMU group @@ -1186,24 +1205,16 @@ EXPORT_SYMBOL(eeh_dev_release); */ struct eeh_pe *eeh_iommu_group_to_pe(struct iommu_group *group) { - struct iommu_table *tbl; struct pci_dev *pdev = NULL; struct eeh_dev *edev; - bool found = false; + int ret; /* No IOMMU group ? */ if (!group) return NULL; - /* No PCI device ? */ - for_each_pci_dev(pdev) { - tbl = get_iommu_table_base(pdev-dev); - if (tbl tbl-it_group == group) { - found = true; - break; - } - } - if (!found) + ret = iommu_group_for_each_dev(group, pdev, dev_has_iommu_table); + if (!ret || !pdev) return NULL; /* No EEH device or PE ? */ -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 0/2] Bug fix for VFIO EEH
Those 2 patches are bug fixes for VFIO EEH support, which isn't merged yet though all reviewers gave their ack. So I'm sending this to avoid a revert or something like that. The problem is that dma_offset/iommu_table_base share the same memory location. When disabling bypass mode, we failed to restore iommu_table_base. EEH is utilizing that to translate IOMMU group ID to PE. The patches fix the issue. Another issue is that we're searching all online PCI devices for translating IOMMU group ID to PE. That's incorrect since we're uncertain whether one specific device (except those in the current IOMMU group) is running in bypassed mode or not. So we should search only the current IOMMU group. It should be applied on top of the unmerged VFIO EEH support patchset: http://patchwork.ozlabs.org/patch/357665/ Gavin Shan (2): powerpc/powernv: Fix IOMMU table for VFIO dev powerpc/eeh: Fetch IOMMU table in reliable way arch/powerpc/kernel/eeh.c | 33 --- arch/powerpc/platforms/powernv/pci-ioda.c | 30 +++- 2 files changed, 43 insertions(+), 20 deletions(-) -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v5 2/2] [BUGFIX] kprobes: Fix Failed to find blacklist error on ia64 and ppc64
On Tue, 2014-07-15 at 13:19 +1000, Michael Ellerman wrote: Signed-off-by: Masami Hiramatsu masami.hiramatsu...@hitachi.com Reported-by: Tony Luck tony.l...@gmail.com Tested-by: Tony Luck tony.l...@intel.com Cc: Michael Ellerman m...@ellerman.id.au Tested-by: Michael Ellerman m...@ellerman.id.au Acked-by: Michael Ellerman m...@ellerman.id.au (for powerpc) Ben, can you take this in your tree? Acked-by: Benjamin Herrenschmidt b...@kernel.crashing.org That looks more like generic material. Do we have a kprobes maintainer ? Andrew, do you want to take this ? I'm happy to put it in powerpc and send it to Linus tomorrow if nobody cares :-) Cheers, Ben. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] ppc/xmon: use isxdigit/isspace/isalnum from ctype.h
Use linux/ctype.h instead of defining custom versions of isxdigit/isspace/isalnum. Signed-off-by: Vincent Bernat vinc...@bernat.im --- arch/powerpc/xmon/xmon.c | 12 +--- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index d199bfa2f1fa..c0c31a47c469 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -24,6 +24,7 @@ #include linux/interrupt.h #include linux/irq.h #include linux/bug.h +#include linux/ctype.h #include asm/ptrace.h #include asm/string.h @@ -177,14 +178,6 @@ extern void xmon_leave(void); #define GETWORD(v) (((v)[0] 24) + ((v)[1] 16) + ((v)[2] 8) + (v)[3]) #endif -#define isxdigit(c)(('0' = (c) (c) = '9') \ -|| ('a' = (c) (c) = 'f') \ -|| ('A' = (c) (c) = 'F')) -#define isalnum(c) (('0' = (c) (c) = '9') \ -|| ('a' = (c) (c) = 'z') \ -|| ('A' = (c) (c) = 'Z')) -#define isspace(c) (c == ' ' || c == '\t' || c == 10 || c == 13 || c == 0) - static char *help_string = \ Commands:\n\ bshow breakpoints\n\ @@ -2121,9 +2114,6 @@ static void dump_pacas(void) } #endif -#define isxdigit(c)(('0' = (c) (c) = '9') \ -|| ('a' = (c) (c) = 'f') \ -|| ('A' = (c) (c) = 'F')) static void dump(void) { -- 2.0.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
RE: [PATCH] ppc/xmon: use isxdigit/isspace/isalnum from ctype.h
From: Vincent Bernat Use linux/ctype.h instead of defining custom versions of isxdigit/isspace/isalnum. ... -#define isspace(c) (c == ' ' || c == '\t' || c == 10 || c == 13 || c == 0) That is different from the version in linux/ctype.h, especially for 'c == 0', but probably also for vertical tab and form feed. David ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 01/16] powerpc/iommu: Fix comments with it_page_shift
There is a couple of commented debug prints which still use IOMMU_PAGE_SHIFT() which is not defined for POWERPC anymore, replace them with it_page_shift. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kernel/iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 88e3ec6..f84f799 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1037,7 +1037,7 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, /* if (unlikely(ret)) pr_err(iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n, - __func__, hwaddr, entry IOMMU_PAGE_SHIFT(tbl), + __func__, hwaddr, entry tbl-it_page_shift, hwaddr, ret); */ return ret; @@ -1056,7 +1056,7 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, direction != DMA_TO_DEVICE, page); if (unlikely(ret != 1)) { /* pr_err(iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n, - tce, entry IOMMU_PAGE_SHIFT(tbl), ret); */ + tce, entry tbl-it_page_shift, ret); */ return -EFAULT; } hwaddr = (unsigned long) page_address(page) + offset; -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 02/16] KVM: PPC: Use RCU when adding to arch.spapr_tce_tables
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kvm/book3s_64_vio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 54cf9bc..516f2ee 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -131,7 +131,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, kvm_get_kvm(kvm); mutex_lock(kvm-lock); - list_add(stt-list, kvm-arch.spapr_tce_tables); + list_add_rcu(stt-list, kvm-arch.spapr_tce_tables); mutex_unlock(kvm-lock); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 00/16] powernv: vfio: Add Dynamic DMA windows (DDW)
This prepares existing upstream kernel for DDW (Dynamic DMA windows) and adds actual DDW support for VFIO. This patchset does not contain any in-kernel acceleration stuff. This patchset does not enable DDW for emulated devices. Alexey Kardashevskiy (16): powerpc/iommu: Fix comments with it_page_shift KVM: PPC: Use RCU when adding to arch.spapr_tce_tables powerpc/powernv: Use it_page_shift for TCE invalidation powerpc/powernv: Use it_page_shift in TCE build powerpc/powernv: Add a page size parameter to pnv_pci_setup_iommu_table() powerpc/powernv: Make invalidate() callback an iommu_table callback powerpc/spapr: vfio: Implement spapr_tce_iommu_ops powerpc/powernv: Convert/move set_bypass() callback to take_ownership() powerpc/iommu: Fix IOMMU ownership control functions powerpc/iommu: Fix missing permission bits in iommu_put_tce_user_mode() powerpc/iommu: Extend ppc_md.tce_build(_rm) to return old TCE values powerpc/powernv: Return non-zero TCE from pnv_tce_build powerpc/iommu: Implement put_page() if TCE had non-zero value powerpc/powernv: Implement Dynamic DMA windows (DDW) for IODA vfio: Use it_page_size vfio: powerpc: Enable Dynamic DMA windows arch/powerpc/include/asm/iommu.h| 11 +- arch/powerpc/include/asm/machdep.h | 2 + arch/powerpc/include/asm/tce.h | 36 arch/powerpc/kernel/iommu.c | 95 +++--- arch/powerpc/kvm/book3s_64_vio.c| 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 253 ++--- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 4 +- arch/powerpc/platforms/powernv/pci.c| 60 -- arch/powerpc/platforms/powernv/pci.h| 4 +- arch/powerpc/platforms/pseries/iommu.c | 17 +- arch/powerpc/sysdev/dart_iommu.c| 1 + drivers/vfio/vfio_iommu_spapr_tce.c | 280 include/uapi/linux/vfio.h | 37 +++- 13 files changed, 679 insertions(+), 123 deletions(-) -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 06/16] powerpc/powernv: Make invalidate() callback an iommu_table callback
This implements pnv_pci_ioda(1|2)_tce_invalidate as a callback of iommu_table to simplify code structure. The callbacks receive iommu_table only and cast it to PE, the specific callback knows how. This registers invalidate() callbacks for IODA1 and IODA2: - pnv_pci_ioda1_tce_invalidate; - pnv_pci_ioda2_tce_invalidate_32. There will be another pnv_pci_ioda2_tce_invalidate_64() callback for huge DMA windows. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 4 arch/powerpc/platforms/powernv/pci-ioda.c | 19 +-- arch/powerpc/platforms/powernv/pci.c | 27 +++ 3 files changed, 32 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 42632c7..d8fb3fa 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -60,6 +60,9 @@ struct iommu_pool { spinlock_t lock; } cacheline_aligned_in_smp; +typedef void (*iommu_invalidate_fn)(struct iommu_table *tbl, + __be64 *startp, __be64 *endp, bool rm); + struct iommu_table { unsigned long it_busno; /* Bus number this table belongs to */ unsigned long it_size; /* Size of iommu table in entries */ @@ -77,6 +80,7 @@ struct iommu_table { #ifdef CONFIG_IOMMU_API struct iommu_group *it_group; #endif + iommu_invalidate_fn invalidate; void (*set_bypass)(struct iommu_table *tbl, bool enable); }; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 9f28e18..48e2358 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -505,10 +505,11 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) } } -static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe, -struct iommu_table *tbl, +static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, __be64 *startp, __be64 *endp, bool rm) { + struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, + tce32_table); __be64 __iomem 
*invalidate = rm ? (__be64 __iomem *)pe-tce_inval_reg_phys : (__be64 __iomem *)tbl-it_index; @@ -584,17 +585,13 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, } } -void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, -__be64 *startp, __be64 *endp, bool rm) +static void pnv_pci_ioda2_tce_invalidate_32(struct iommu_table *tbl, + __be64 *startp, __be64 *endp, bool rm) { struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, - tce32_table); - struct pnv_phb *phb = pe-phb; + tce32_table); - if (phb-type == PNV_PHB_IODA1) - pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm); - else - pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); + pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); } static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, @@ -657,6 +654,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, tbl = pe-tce32_table; pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, base 28, IOMMU_PAGE_SHIFT_4K); + tbl-invalidate = pnv_pci_ioda1_tce_invalidate; /* OPAL variant of P7IOC SW invalidated TCEs */ swinvp = of_get_property(phb-hose-dn, ibm,opal-tce-kill, NULL); @@ -788,6 +786,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, tbl = pe-tce32_table; pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, IOMMU_PAGE_SHIFT_4K); + tbl-invalidate = pnv_pci_ioda2_tce_invalidate_32; /* OPAL variant of PHB3 invalidated TCEs */ swinvp = of_get_property(phb-hose-dn, ibm,opal-tce-kill, NULL); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 4dff552..1ab0f62 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -550,6 +550,23 @@ struct pci_ops pnv_pci_ops = { .write = pnv_pci_write_config, }; +static void pnv_tce_invalidate(struct iommu_table *tbl, __be64 *startp, + __be64 *endp, bool rm) +{ + /* +* Some implementations won't cache invalid TCEs and thus may not +* need that flush. 
We'll probably turn it_type into a bit mask +* of flags if that becomes the case +*/ + if (!(tbl-it_type TCE_PCI_SWINV_FREE)) + return; + + if (!tbl-invalidate) + return; + + tbl-invalidate(tbl, startp, endp, rm); +} + static int pnv_tce_build(struct
[PATCH v1 07/16] powerpc/spapr: vfio: Implement spapr_tce_iommu_ops
Modern IBM POWERPC systems support multiple IOMMU tables per PHB so we need a more reliable way (compared to container_of()) to get a PE pointer from the iommu_table struct pointer used in IOMMU functions. At the moment IOMMU group data points to an iommu_table struct. This introduces a spapr_tce_iommu_group struct which keeps an iommu_owner and a spapr_tce_iommu_ops struct. For IODA, iommu_owner is a pointer to the pnv_ioda_pe struct, for others it is still a pointer to the iommu_table struct. The ops structs correspond to the type which iommu_owner points to. At the moment a get_table() callback is the only one. It returns an iommu_table for a bus address. As the IOMMU group data pointer points to variable type instead of iommu_table, VFIO SPAPR TCE driver is fixed to use new type. This changes the tce_container struct to keep iommu_group instead of iommu_table. So, it was: - iommu_table points to iommu_group via iommu_table::it_group; - iommu_group points to iommu_table via iommu_group_get_iommudata(); now it is: - iommu_table points to iommu_group via iommu_table::it_group; - iommu_group points to spapr_tce_iommu_group via iommu_group_get_iommudata(); - spapr_tce_iommu_group points to either (depending on .get_table()): - iommu_table; - pnv_ioda_pe; Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h| 6 ++ arch/powerpc/include/asm/tce.h | 13 arch/powerpc/kernel/iommu.c | 31 +++- arch/powerpc/platforms/powernv/pci-ioda.c | 37 - arch/powerpc/platforms/powernv/pci-p5ioc2.c | 1 + arch/powerpc/platforms/powernv/pci.c| 2 +- arch/powerpc/platforms/pseries/iommu.c | 10 ++- drivers/vfio/vfio_iommu_spapr_tce.c | 112 +--- 8 files changed, 177 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index d8fb3fa..fb2c884 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -112,13 +112,19 @@ extern void iommu_free_table(struct iommu_table *tbl, 
const char *node_name); */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); + +struct spapr_tce_iommu_ops; #ifdef CONFIG_IOMMU_API extern void iommu_register_group(struct iommu_table *tbl, +void *iommu_owner, +struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigned long pe_num); extern int iommu_add_device(struct device *dev); extern void iommu_del_device(struct device *dev); #else static inline void iommu_register_group(struct iommu_table *tbl, + void *iommu_owner, + struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigned long pe_num) { diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index 743f36b..a697681 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -50,5 +50,18 @@ #define TCE_PCI_READ 0x1 /* read from PCI allowed */ #define TCE_VB_WRITE 0x1 /* write from VB allowed */ +struct spapr_tce_iommu_group; + +struct spapr_tce_iommu_ops { + struct iommu_table *(*get_table)( + struct spapr_tce_iommu_group *data, + phys_addr_t addr); +}; + +struct spapr_tce_iommu_group { + void *iommu_owner; + struct spapr_tce_iommu_ops *ops; +}; + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_TCE_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index f84f799..b207332 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -877,24 +877,49 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, */ static void group_release(void *iommu_data) { - struct iommu_table *tbl = iommu_data; - tbl-it_group = NULL; + kfree(iommu_data); } +static struct iommu_table *spapr_tce_get_default_table( + struct spapr_tce_iommu_group *data, phys_addr_t addr) +{ + struct iommu_table *tbl = data-iommu_owner; + + if ((addr tbl-it_page_shift) tbl-it_size) + return tbl; + + return NULL; +} + +static struct spapr_tce_iommu_ops spapr_tce_default_ops = { + .get_table = spapr_tce_get_default_table +}; + void iommu_register_group(struct 
iommu_table *tbl, + void *iommu_owner, struct spapr_tce_iommu_ops *ops, int pci_domain_number, unsigned long pe_num) { struct iommu_group *grp; char *name; + struct spapr_tce_iommu_group *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return; + +
[PATCH v1 05/16] powerpc/powernv: Add a page size parameter to pnv_pci_setup_iommu_table()
Since a TCE page size can be other than 4K, make it configurable for P5IOC2 and IODA PHBs. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci-ioda.c | 5 +++-- arch/powerpc/platforms/powernv/pci-p5ioc2.c | 3 ++- arch/powerpc/platforms/powernv/pci.c| 6 +++--- arch/powerpc/platforms/powernv/pci.h| 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 40f968e..9f28e18 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -656,7 +656,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, /* Setup linux iommu table */ tbl = pe-tce32_table; pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, - base 28); + base 28, IOMMU_PAGE_SHIFT_4K); /* OPAL variant of P7IOC SW invalidated TCEs */ swinvp = of_get_property(phb-hose-dn, ibm,opal-tce-kill, NULL); @@ -786,7 +786,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, /* Setup linux iommu table */ tbl = pe-tce32_table; - pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0); + pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, + IOMMU_PAGE_SHIFT_4K); /* OPAL variant of PHB3 invalidated TCEs */ swinvp = of_get_property(phb-hose-dn, ibm,opal-tce-kill, NULL); diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index e3807d6..94ce348 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -172,7 +172,8 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, /* Setup TCEs */ phb-dma_dev_setup = pnv_pci_p5ioc2_dma_dev_setup; pnv_pci_setup_iommu_table(phb-p5ioc2.iommu_table, - tce_mem, tce_size, 0); + tce_mem, tce_size, 0, + IOMMU_PAGE_SHIFT_4K); } void __init pnv_pci_init_p5ioc2_hub(struct device_node *np) diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c index b6cb996..4dff552 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -628,11 +628,11 @@ static void pnv_tce_free_rm(struct iommu_table *tbl, long index, long npages) void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, - u64 dma_offset) + u64 dma_offset, unsigned page_shift) { tbl-it_blocksize = 16; tbl-it_base = (unsigned long)tce_mem; - tbl-it_page_shift = IOMMU_PAGE_SHIFT_4K; + tbl-it_page_shift = page_shift; tbl-it_offset = dma_offset tbl-it_page_shift; tbl-it_index = 0; tbl-it_size = tce_size 3; @@ -657,7 +657,7 @@ static struct iommu_table *pnv_pci_setup_bml_iommu(struct pci_controller *hose) if (WARN_ON(!tbl)) return NULL; pnv_pci_setup_iommu_table(tbl, __va(be64_to_cpup(basep)), - be32_to_cpup(sizep), 0); + be32_to_cpup(sizep), 0, IOMMU_PAGE_SHIFT_4K); iommu_init_table(tbl, hose-node); iommu_register_group(tbl, pci_domain_nr(hose-bus), 0); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 676232c..6f5ff69 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -198,7 +198,7 @@ int pnv_pci_cfg_write(struct device_node *dn, int where, int size, u32 val); extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, - u64 dma_offset); + u64 dma_offset, unsigned page_shift); extern void pnv_pci_init_p5ioc2_hub(struct device_node *np); extern void pnv_pci_init_ioda_hub(struct device_node *np); extern void pnv_pci_init_ioda2_phb(struct device_node *np); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 03/16] powerpc/powernv: Use it_page_shift for TCE invalidation
This fixes IODA1/2 to use it_page_shift as it may be bigger than 4K. This changes the involved constant values to use ull modifier. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci-ioda.c | 16 +--- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index de19ede..40f968e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -513,15 +513,16 @@ static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe, (__be64 __iomem *)pe-tce_inval_reg_phys : (__be64 __iomem *)tbl-it_index; unsigned long start, end, inc; + const unsigned shift = tbl-it_page_shift; start = __pa(startp); end = __pa(endp); /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ if (tbl-it_busno) { - start = 12; - end = 12; - inc = 128 12; + start = shift; + end = shift; + inc = 128ull shift; start |= tbl-it_busno; end |= tbl-it_busno; } else if (tbl-it_type TCE_PCI_SWINV_PAIR) { @@ -559,18 +560,19 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, __be64 __iomem *invalidate = rm ? (__be64 __iomem *)pe-tce_inval_reg_phys : (__be64 __iomem *)tbl-it_index; + const unsigned shift = tbl-it_page_shift; /* We'll invalidate DMA address in PE scope */ - start = 0x2ul 60; + start = 0x2ull 60; start |= (pe-pe_number 0xFF); end = start; /* Figure out the start, end and step */ inc = tbl-it_offset + (((u64)startp - tbl-it_base) / sizeof(u64)); - start |= (inc 12); + start |= (inc shift); inc = tbl-it_offset + (((u64)endp - tbl-it_base) / sizeof(u64)); - end |= (inc 12); - inc = (0x1ul 12); + end |= (inc shift); + inc = (0x1ull shift); mb(); while (start = end) { -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 08/16] powerpc/powernv: Convert/move set_bypass() callback to take_ownership()
At the moment the iommu_table struct has a set_bypass() which enables/ disables DMA bypass on IODA2 PHB. This is exposed to POWERPC IOMMU code which calls this callback when external IOMMU users such as VFIO are about to get over a PHB. Since the set_bypass() is not really an iommu_table function but PE's function, and we have an ops struct per IOMMU owner, let's move set_bypass() to the spapr_tce_iommu_ops struct. As arch/powerpc/kernel/iommu.c is more about POWERPC IOMMU tables and has very little to do with PEs, this moves take_ownership() calls to the VFIO SPAPR TCE driver. This renames set_bypass() to take_ownership() as it is not necessarily just enabling bypassing, it can be something else/more so let's give it a generic name. The bool parameter is inverted. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 1 - arch/powerpc/include/asm/tce.h| 2 ++ arch/powerpc/kernel/iommu.c | 12 arch/powerpc/platforms/powernv/pci-ioda.c | 17 ++--- drivers/vfio/vfio_iommu_spapr_tce.c | 16 5 files changed, 28 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index fb2c884..00205cb 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -81,7 +81,6 @@ struct iommu_table { struct iommu_group *it_group; #endif iommu_invalidate_fn invalidate; - void (*set_bypass)(struct iommu_table *tbl, bool enable); }; /* Pure 2^n version of get_order */ diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index a697681..c3d295d 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -56,6 +56,8 @@ struct spapr_tce_iommu_ops { struct iommu_table *(*get_table)( struct spapr_tce_iommu_group *data, phys_addr_t addr); + void (*take_ownership)(struct spapr_tce_iommu_group *data, + bool enable); }; struct spapr_tce_iommu_group { diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 
b207332..d9494b2 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1113,14 +1113,6 @@ int iommu_take_ownership(struct iommu_table *tbl) memset(tbl-it_map, 0xff, sz); iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size); - /* -* Disable iommu bypass, otherwise the user can DMA to all of -* our physical memory via the bypass window instead of just -* the pages that has been explicitly mapped into the iommu -*/ - if (tbl-set_bypass) - tbl-set_bypass(tbl, false); - return 0; } EXPORT_SYMBOL_GPL(iommu_take_ownership); @@ -1135,10 +1127,6 @@ void iommu_release_ownership(struct iommu_table *tbl) /* Restore bit#0 set by iommu_init_table() */ if (tbl-it_offset == 0) set_bit(0, tbl-it_map); - - /* The kernel owns the device now, we can restore the iommu bypass */ - if (tbl-set_bypass) - tbl-set_bypass(tbl, true); } EXPORT_SYMBOL_GPL(iommu_release_ownership); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 8152e30..b5e757b 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -715,10 +715,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); } -static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { - struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, - tce32_table); uint16_t window_id = (pe-pe_number 1 ) + 1; int64_t rc; @@ -758,16 +756,21 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, /* TVE #1 is selected by PCI address bit 59 */ pe-tce_bypass_base = 1ull 59; - /* Install set_bypass callback for VFIO */ - pe-tce32_table.set_bypass = pnv_pci_ioda2_set_bypass; - /* Enable bypass by default */ - pnv_pci_ioda2_set_bypass(pe-tce32_table, true); + pnv_pci_ioda2_set_bypass(pe, true); +} +static void pnv_ioda2_take_ownership(struct 
spapr_tce_iommu_group *data, +bool enable) +{ + struct pnv_ioda_pe *pe = data-iommu_owner; + + pnv_pci_ioda2_set_bypass(pe, !enable); } static struct spapr_tce_iommu_ops pnv_pci_ioda2_ops = { .get_table = pnv_ioda1_iommu_get_table, + .take_ownership = pnv_ioda2_take_ownership, }; static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index
[PATCH v1 04/16] powerpc/powernv: Use it_page_shift in TCE build
This makes use of iommu_table::it_page_shift instead of TCE_SHIFT and TCE_RPN_SHIFT hardcoded values. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index f91a4e5..b6cb996 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -564,10 +564,11 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, proto_tce |= TCE_PCI_WRITE; tces = tcep = ((__be64 *)tbl-it_base) + index - tbl-it_offset; - rpn = __pa(uaddr) TCE_SHIFT; + rpn = __pa(uaddr) tbl-it_page_shift; while (npages--) - *(tcep++) = cpu_to_be64(proto_tce | (rpn++ TCE_RPN_SHIFT)); + *(tcep++) = cpu_to_be64(proto_tce | + (rpn++ tbl-it_page_shift)); /* Some implementations won't cache invalid TCEs and thus may not * need that flush. We'll probably turn it_type into a bit mask -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 11/16] powerpc/iommu: Extend ppc_md.tce_build(_rm) to return old TCE values
The tce_build/tce_build_rm callbacks are used to implement H_PUT_TCE/etc hypercalls. The PAPR spec does not allow to fail if the TCE is not empty. However we cannot just overwrite the existing TCE value with the new one as we still have to do page counting. This adds an optional @old_tces return parameter. If it is not NULL, it must point to an array of @npages size where the callbacks will store old TCE values. Since tce_build receives virtual addresses, the old_tces array will contain virtual addresses as well. As this patch is mechanical, no change in behaviour is expected. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/machdep.h | 2 ++ arch/powerpc/kernel/iommu.c| 8 +--- arch/powerpc/platforms/powernv/pci.c | 13 - arch/powerpc/platforms/pseries/iommu.c | 7 +-- arch/powerpc/sysdev/dart_iommu.c | 1 + 5 files changed, 21 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index f92b0b5..f11596c 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -69,6 +69,7 @@ struct machdep_calls { long index, long npages, unsigned long uaddr, +unsigned long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs); void(*tce_free)(struct iommu_table *tbl, @@ -83,6 +84,7 @@ struct machdep_calls { long index, long npages, unsigned long uaddr, +long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs); void(*tce_free_rm)(struct iommu_table *tbl, diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 01ac319..ae57910 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -324,7 +324,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, /* Put the TCEs in the HW table */ build_fail = ppc_md.tce_build(tbl, entry, npages, (unsigned long)page - IOMMU_PAGE_MASK(tbl), direction, attrs); + IOMMU_PAGE_MASK(tbl), NULL, direction, + attrs); /* 
ppc_md.tce_build() only returns non-zero for transient errors. * Clean up the table bitmap in this case and return @@ -497,7 +498,7 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl, /* Insert into HW table */ build_fail = ppc_md.tce_build(tbl, entry, npages, vaddr IOMMU_PAGE_MASK(tbl), - direction, attrs); + NULL, direction, attrs); if(unlikely(build_fail)) goto failure; @@ -1056,7 +1057,8 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, oldtce = ppc_md.tce_get(tbl, entry); /* Add new entry if it is not busy */ if (!(oldtce (TCE_PCI_WRITE | TCE_PCI_READ))) - ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL); + ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, NULL, + direction, NULL); spin_unlock((pool-lock)); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 09287c7..e002c66 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -568,7 +568,8 @@ static void pnv_tce_invalidate(struct iommu_table *tbl, __be64 *startp, } static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, -unsigned long uaddr, enum dma_data_direction direction, +unsigned long uaddr, unsigned long *old_tces, +enum dma_data_direction direction, struct dma_attrs *attrs, bool rm) { u64 proto_tce; @@ -593,12 +594,12 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, } static int pnv_tce_build_vm(struct iommu_table *tbl, long index, long npages, - unsigned long uaddr, + unsigned long uaddr, unsigned long *old_tces, enum dma_data_direction direction, struct dma_attrs *attrs) { - return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, - false); +
[PATCH v1 12/16] powerpc/powernv: Return non-zero TCE from pnv_tce_build
This returns old TCE values to the caller if requested. The caller is expected to call put_page() for them. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci.c | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index e002c66..a9165a5 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -575,6 +575,7 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, u64 proto_tce; __be64 *tcep, *tces; u64 rpn; + long i; proto_tce = TCE_PCI_READ; // Read allowed @@ -584,9 +585,13 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, tces = tcep = ((__be64 *)tbl-it_base) + index - tbl-it_offset; rpn = __pa(uaddr) tbl-it_page_shift; - while (npages--) - *(tcep++) = cpu_to_be64(proto_tce | - (rpn++ tbl-it_page_shift)); + for (i = 0; i npages; i++) { + unsigned long oldtce = xchg(tcep, cpu_to_be64(proto_tce | + (rpn++ tbl-it_page_shift))); + if (old_tces) + old_tces[i] = (unsigned long) __va(oldtce); + tcep++; + } pnv_tce_invalidate(tbl, tces, tcep - 1, rm); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 13/16] powerpc/iommu: Implement put_page() if TCE had non-zero value
Guests might put new TCEs without clearing them first and the PAPR spec allows that. This adds put_page() for TCEs which we just replaced. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kernel/iommu.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index ae57910..25fda58 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1054,11 +1054,11 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, spin_lock((pool-lock)); - oldtce = ppc_md.tce_get(tbl, entry); - /* Add new entry if it is not busy */ - if (!(oldtce (TCE_PCI_WRITE | TCE_PCI_READ))) - ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, NULL, - direction, NULL); + ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, oldtce, + direction, NULL); + + if (oldtce (TCE_PCI_WRITE | TCE_PCI_READ)) + put_page(pfn_to_page(__pa(oldtce))); spin_unlock((pool-lock)); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 10/16] powerpc/iommu: Fix missing permission bits in iommu_put_tce_user_mode()
This adds missing permission bits to the translated TCE. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kernel/iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index da04561..01ac319 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1085,6 +1085,7 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, return -EFAULT; } hwaddr = (unsigned long) page_address(page) + offset; + hwaddr |= tce (TCE_PCI_READ | TCE_PCI_WRITE); ret = iommu_tce_build(tbl, entry, hwaddr, direction); if (ret) -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 09/16] powerpc/iommu: Fix IOMMU ownership control functions
This adds missing locks in iommu_take_ownership()/ iommu_release_ownership(). This marks all pages busy in iommu_table::it_map in order to catch errors if there is an attempt to use this table while ownership over it is taken. This only clears TCE content if there is no page marked busy in it_map. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kernel/iommu.c | 37 ++--- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index d9494b2..da04561 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1100,33 +1100,56 @@ EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); int iommu_take_ownership(struct iommu_table *tbl) { - unsigned long sz = (tbl-it_size + 7) 3; + unsigned long flags, i, sz = (tbl-it_size + 7) 3; + int ret = 0, bit0 = 0; + + spin_lock_irqsave(tbl-large_pool.lock, flags); + for (i = 0; i tbl-nr_pools; i++) + spin_lock(tbl-pools[i].lock); if (tbl-it_offset == 0) - clear_bit(0, tbl-it_map); + bit0 = test_and_clear_bit(0, tbl-it_map); if (!bitmap_empty(tbl-it_map, tbl-it_size)) { pr_err(iommu_tce: it_map is not empty); - return -EBUSY; + ret = -EBUSY; + if (bit0) + set_bit(0, tbl-it_map); + } else { + memset(tbl-it_map, 0xff, sz); } - memset(tbl-it_map, 0xff, sz); - iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size); + if (!ret) + iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, + tbl-it_size); - return 0; + for (i = 0; i tbl-nr_pools; i++) + spin_unlock(tbl-pools[i].lock); + spin_unlock_irqrestore(tbl-large_pool.lock, flags); + + return ret; } EXPORT_SYMBOL_GPL(iommu_take_ownership); void iommu_release_ownership(struct iommu_table *tbl) { - unsigned long sz = (tbl-it_size + 7) 3; + unsigned long flags, i, sz = (tbl-it_size + 7) 3; iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size); + + spin_lock_irqsave(tbl-large_pool.lock, flags); + for (i = 0; i tbl-nr_pools; i++) + spin_lock(tbl-pools[i].lock); + memset(tbl-it_map, 0, 
sz); /* Restore bit#0 set by iommu_init_table() */ if (tbl-it_offset == 0) set_bit(0, tbl-it_map); + + for (i = 0; i tbl-nr_pools; i++) + spin_unlock(tbl-pools[i].lock); + spin_unlock_irqrestore(tbl-large_pool.lock, flags); } EXPORT_SYMBOL_GPL(iommu_release_ownership); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 16/16] vfio: powerpc: Enable Dynamic DMA windows
This defines and implements VFIO IOMMU API required to support Dynamic DMA windows defined in the SPAPR specification. The ioctl handlers implement host-side part of corresponding RTAS calls: - VFIO_IOMMU_SPAPR_TCE_QUERY - ibm,query-pe-dma-window; - VFIO_IOMMU_SPAPR_TCE_CREATE - ibm,create-pe-dma-window; - VFIO_IOMMU_SPAPR_TCE_REMOVE - ibm,remove-pe-dma-window; - VFIO_IOMMU_SPAPR_TCE_RESET - ibm,reset-pe-dma-window. The VFIO IOMMU driver does basic sanity checks and calls corresponding SPAPR TCE functions. At the moment only IODA2 (POWER8 PCI host bridge) implements them. This advertises VFIO_IOMMU_SPAPR_TCE_FLAG_DDW capability via VFIO_IOMMU_SPAPR_TCE_GET_INFO. This calls reset() when IOMMU is being disabled (happens when VFIO stops using it). Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/platforms/powernv/pci-ioda.c | 1 + drivers/vfio/vfio_iommu_spapr_tce.c | 132 +- include/uapi/linux/vfio.h | 37 - 3 files changed, 168 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 25a4f0e..63aa697 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -869,6 +869,7 @@ static long pnv_pci_ioda2_ddw_create(struct spapr_tce_iommu_group *data, tbl64-invalidate = pnv_pci_ioda2_tce_invalidate_64; /* Copy invalidate register address */ + tbl64-it_group = pe-tce32_table.it_group; tbl64-it_index = pe-tce32_table.it_index; tbl64-it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE | TCE_PCI_SWINV_PAIR; diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 808c7d3..8f992de 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -124,13 +124,20 @@ static void tce_iommu_disable(struct tce_container *container) container-enabled = false; - if (!container-grp || !current-mm) + if (!container-grp) return; data = iommu_group_get_iommudata(container-grp); if 
(!data || !data-iommu_owner || !data-ops-get_table) return; + /* Try resetting, there might have been a 64bit window */ + if (data-ops-reset) + data-ops-reset(data); + + if (!current-mm) + return; + tbl = data-ops-get_table(data, 0); if (!tbl) return; @@ -213,6 +220,8 @@ static long tce_iommu_ioctl(void *iommu_data, info.dma32_window_start = tbl-it_offset tbl-it_page_shift; info.dma32_window_size = tbl-it_size tbl-it_page_shift; info.flags = 0; + if (data-ops-query data-ops-create data-ops-remove) + info.flags |= VFIO_IOMMU_SPAPR_TCE_FLAG_DDW; if (copy_to_user((void __user *)arg, info, minsz)) return -EFAULT; @@ -338,6 +347,127 @@ static long tce_iommu_ioctl(void *iommu_data, tce_iommu_disable(container); mutex_unlock(container-lock); return 0; + + case VFIO_IOMMU_SPAPR_TCE_QUERY: { + struct vfio_iommu_spapr_tce_query query; + struct spapr_tce_iommu_group *data; + + if (WARN_ON(!container-grp)) + return -ENXIO; + + data = iommu_group_get_iommudata(container-grp); + + minsz = offsetofend(struct vfio_iommu_spapr_tce_query, + page_size_mask); + + if (copy_from_user(query, (void __user *)arg, minsz)) + return -EFAULT; + + if (query.argsz minsz) + return -EINVAL; + + if (!data-ops-query || !data-iommu_owner) + return -ENOSYS; + + ret = data-ops-query(data, + query.windows_available, + query.page_size_mask); + + if (copy_to_user((void __user *)arg, query, minsz)) + return -EFAULT; + + return 0; + } + case VFIO_IOMMU_SPAPR_TCE_CREATE: { + struct vfio_iommu_spapr_tce_create create; + struct spapr_tce_iommu_group *data; + struct iommu_table *tbl; + + if (WARN_ON(!container-grp)) + return -ENXIO; + + data = iommu_group_get_iommudata(container-grp); + + minsz = offsetofend(struct vfio_iommu_spapr_tce_create, + start_addr); + + if (copy_from_user(create, (void __user *)arg, minsz)) + return -EFAULT; + + if (create.argsz minsz) + return -EINVAL; + + if (!data-ops-create || !data-iommu_owner) +
[PATCH v1 14/16] powerpc/powernv: Implement Dynamic DMA windows (DDW) for IODA
SPAPR defines an interface to create additional DMA windows dynamically. Dynamically means that the window is not allocated at the guest start and the guest can request it later. In practice, existing linux guests check for the capability and if it is there, they create+map one big DMA window as big as the entire guest RAM. SPAPR defines 4 RTAS calls for this feature which userspace implements. This adds 4 callbacks into the spapr_tce_iommu_ops struct: 1. query - ibm,query-pe-dma-window - returns number/size of windows which can be created (one, any page size); 2. create - ibm,create-pe-dma-window - creates a window; 3. remove - ibm,remove-pe-dma-window - removes a window; only additional window created by create() can be removed, the default 32bit window cannot be removed as guests do not expect new windows to start from zero; 4. reset - ibm,reset-pe-dma-window - reset the DMA windows configuration to the default state; now it only removes the additional window if it was created. The next patch will add corresponding ioctls to VFIO SPAPR TCE driver to pass RTAS call from the userspace to the IODA code. 
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/tce.h| 21 arch/powerpc/platforms/powernv/pci-ioda.c | 160 +- arch/powerpc/platforms/powernv/pci.h | 2 + 3 files changed, 182 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index c3d295d..68f6575 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -58,6 +58,27 @@ struct spapr_tce_iommu_ops { phys_addr_t addr); void (*take_ownership)(struct spapr_tce_iommu_group *data, bool enable); + + /* Dynamic DMA window */ + /* Page size flags for ibm,query-pe-dma-window */ +#define DDW_PGSIZE_4K 0x01 +#define DDW_PGSIZE_64K 0x02 +#define DDW_PGSIZE_16M 0x04 +#define DDW_PGSIZE_32M 0x08 +#define DDW_PGSIZE_64M 0x10 +#define DDW_PGSIZE_128M 0x20 +#define DDW_PGSIZE_256M 0x40 +#define DDW_PGSIZE_16G 0x80 + long (*query)(struct spapr_tce_iommu_group *data, + __u32 *windows_available, + __u32 *page_size_mask); + long (*create)(struct spapr_tce_iommu_group *data, + __u32 page_shift, + __u32 window_shift, + struct iommu_table **ptbl); + long (*remove)(struct spapr_tce_iommu_group *data, + struct iommu_table *tbl); + long (*reset)(struct spapr_tce_iommu_group *data); }; struct spapr_tce_iommu_group { diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b5e757b..25a4f0e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -595,6 +595,15 @@ static void pnv_pci_ioda2_tce_invalidate_32(struct iommu_table *tbl, pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); } +static void pnv_pci_ioda2_tce_invalidate_64(struct iommu_table *tbl, + __be64 *startp, __be64 *endp, bool rm) +{ + struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, + tce64_table); + + pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); +} + static bool pnv_pci_ioda_check_addr(struct iommu_table *tbl, __u64 start_addr) { unsigned long 
entry = start_addr tbl-it_page_shift; @@ -760,6 +769,21 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, pnv_pci_ioda2_set_bypass(pe, true); } +static struct iommu_table *pnv_ioda2_iommu_get_table( + struct spapr_tce_iommu_group *data, + phys_addr_t addr) +{ + struct pnv_ioda_pe *pe = data-iommu_owner; + + if (pnv_pci_ioda_check_addr(pe-tce64_table, addr)) + return pe-tce64_table; + + if (pnv_pci_ioda_check_addr(pe-tce32_table, addr)) + return pe-tce32_table; + + return NULL; +} + static void pnv_ioda2_take_ownership(struct spapr_tce_iommu_group *data, bool enable) { @@ -768,9 +792,143 @@ static void pnv_ioda2_take_ownership(struct spapr_tce_iommu_group *data, pnv_pci_ioda2_set_bypass(pe, !enable); } +static long pnv_pci_ioda2_ddw_query(struct spapr_tce_iommu_group *data, + __u32 *windows_available, __u32 *page_size_mask) +{ + struct pnv_ioda_pe *pe = data-iommu_owner; + + if (pe-tce64_active) { + *page_size_mask = 0; + *windows_available = 0; + } else { + *page_size_mask = + DDW_PGSIZE_4K | + DDW_PGSIZE_64K | + DDW_PGSIZE_16M | + DDW_PGSIZE_32M | + DDW_PGSIZE_64M | + DDW_PGSIZE_128M | +
[PATCH v1 3/7] powerpc/iommu: Clean up IOMMU API
The iommu_tce_direction() function is not used from outside iommu.c so make it static. The iommu_clear_tce() is not used anymore at all so remove it. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 4 arch/powerpc/kernel/iommu.c | 22 +- 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 1c9b346..2f420c28 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -195,8 +195,6 @@ extern int iommu_tce_put_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce); extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, unsigned long *hpas, unsigned long npages, bool realmode); -extern unsigned long iommu_clear_tce(struct iommu_table *tbl, - unsigned long entry); extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, unsigned long entry, unsigned long pages, bool realmode); @@ -207,7 +205,5 @@ extern void iommu_flush_tce(struct iommu_table *tbl); extern int iommu_take_ownership(struct iommu_table *tbl); extern void iommu_release_ownership(struct iommu_table *tbl); -extern enum dma_data_direction iommu_tce_direction(unsigned long tce); - #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index dd68569..259ddb5 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -929,7 +929,7 @@ void iommu_register_group(struct iommu_table *tbl, kfree(name); } -enum dma_data_direction iommu_tce_direction(unsigned long tce) +static enum dma_data_direction iommu_tce_direction(unsigned long tce) { if ((tce TCE_PCI_READ) (tce TCE_PCI_WRITE)) return DMA_BIDIRECTIONAL; @@ -940,7 +940,6 @@ enum dma_data_direction iommu_tce_direction(unsigned long tce) else return DMA_NONE; } -EXPORT_SYMBOL_GPL(iommu_tce_direction); void iommu_flush_tce(struct iommu_table *tbl) { @@ -998,25 +997,6 @@ int 
iommu_tce_put_param_check(struct iommu_table *tbl, } EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); -unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) -{ - unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); - - spin_lock((pool-lock)); - - oldtce = ppc_md.tce_get(tbl, entry); - if (oldtce (TCE_PCI_WRITE | TCE_PCI_READ)) - ppc_md.tce_free(tbl, entry, 1); - else - oldtce = 0; - - spin_unlock((pool-lock)); - - return oldtce; -} -EXPORT_SYMBOL_GPL(iommu_clear_tce); - int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, unsigned long entry, unsigned long pages, bool realmode) -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 1/7] powerpc/iommu: Change prototypes for realmode support
This is a mechanical patch to add an extra realmode parameter to iommu_clear_tces_and_put_pages() and iommu_tce_build() helpers. This changes iommu_tce_build() to receive multiple page addresses at once as in the future we want to save on locks and TCE flushes in realmode. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 5 +++-- arch/powerpc/kernel/iommu.c | 15 +-- arch/powerpc/platforms/powernv/pci-ioda.c | 3 ++- drivers/vfio/vfio_iommu_spapr_tce.c | 6 -- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 00205cb..1c9b346 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -194,11 +194,12 @@ extern int iommu_tce_clear_param_check(struct iommu_table *tbl, extern int iommu_tce_put_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce); extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, - unsigned long hwaddr, enum dma_data_direction direction); + unsigned long *hpas, unsigned long npages, bool realmode); extern unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry); extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, - unsigned long entry, unsigned long pages); + unsigned long entry, unsigned long pages, + bool realmode); extern int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, unsigned long tce); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 25fda58..8771b73 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1018,7 +1018,8 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) EXPORT_SYMBOL_GPL(iommu_clear_tce); int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, - unsigned long entry, unsigned long pages) + unsigned long entry, unsigned long pages, + bool realmode) { unsigned long oldtce; struct page *page; @@ 
-1046,15 +1047,16 @@ EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages); * tce_build converts it to a physical address. */ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, - unsigned long hwaddr, enum dma_data_direction direction) + unsigned long *hpas, unsigned long npages, bool realmode) { int ret = -EBUSY; unsigned long oldtce; struct iommu_pool *pool = get_pool(tbl, entry); + enum dma_data_direction direction = iommu_tce_direction(*hpas); spin_lock((pool-lock)); - ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, oldtce, + ret = ppc_md.tce_build(tbl, entry, 1, *hpas, oldtce, direction, NULL); if (oldtce (TCE_PCI_WRITE | TCE_PCI_READ)) @@ -1089,7 +1091,7 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, hwaddr = (unsigned long) page_address(page) + offset; hwaddr |= tce (TCE_PCI_READ | TCE_PCI_WRITE); - ret = iommu_tce_build(tbl, entry, hwaddr, direction); + ret = iommu_tce_build(tbl, entry, hwaddr, 1, direction); if (ret) put_page(page); @@ -1124,7 +1126,7 @@ int iommu_take_ownership(struct iommu_table *tbl) if (!ret) iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, - tbl-it_size); + tbl-it_size, false); for (i = 0; i tbl-nr_pools; i++) spin_unlock(tbl-pools[i].lock); @@ -1138,7 +1140,8 @@ void iommu_release_ownership(struct iommu_table *tbl) { unsigned long flags, i, sz = (tbl-it_size + 7) 3; - iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size); + iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size, + false); spin_lock_irqsave(tbl-large_pool.lock, flags); for (i = 0; i tbl-nr_pools; i++) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 63aa697..2d65a7d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -895,7 +895,8 @@ static long pnv_pci_ioda2_ddw_remove(struct spapr_tce_iommu_group *data, pr_info(Removing huge 64bit DMA window\n); - iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, 
tbl-it_size); + iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size, + false); pe-tce64_active = false; diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 8f992de..ff1b29e 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -283,7 +283,8 @@
[PATCH v1 0/7] powerpc/iommu: kvm: Enable MultiTCE support
This prepares upstream kernel for in-kernel acceleration of TCE hypercalls (H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE). This implements acceleration for both real and virtual modes. As it requires gup() for real mode to parse TCE list page, this implements gup() for realmode. This only accelerates emulated PCI and VIO devices. DDW is not affected. This was made on top of [PATCH v1 00/16] powernv: vfio: Add Dynamic DMA windows (DDW) Alexey Kardashevskiy (7): powerpc/iommu: Change prototypes for realmode support powerpc/iommu: Support real mode powerpc/iommu: Clean up IOMMU API KVM: PPC: Replace SPAPR_TCE_SHIFT with IOMMU_PAGE_SHIFT_4K KVM: PPC: Move reusable bits of H_PUT_TCE handler to helpers KVM: PPC: Add kvmppc_find_tce_table() KVM: PPC: Add support for multiple-TCE hcalls Documentation/virtual/kvm/api.txt | 26 +++ arch/powerpc/include/asm/iommu.h | 9 +- arch/powerpc/include/asm/kvm_book3s_64.h | 2 - arch/powerpc/include/asm/kvm_host.h | 30 +++ arch/powerpc/include/asm/kvm_ppc.h| 16 ++ arch/powerpc/kernel/iommu.c | 140 +++- arch/powerpc/kvm/book3s_64_vio.c | 177 ++- arch/powerpc/kvm/book3s_64_vio_hv.c | 343 ++ arch/powerpc/kvm/book3s_hv.c | 30 ++- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 +- arch/powerpc/kvm/book3s_pr.c | 4 + arch/powerpc/kvm/book3s_pr_papr.c | 35 +++ arch/powerpc/kvm/powerpc.c| 3 + arch/powerpc/platforms/powernv/pci-ioda.c | 3 +- drivers/vfio/vfio_iommu_spapr_tce.c | 6 +- 15 files changed, 720 insertions(+), 108 deletions(-) -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 2/7] powerpc/iommu: Support real mode
The TCE tables handling differs for real (MMU off) and virtual modes (MMU on) so additional set of realmode-capable TCE callbacks has been added to ppc_md: * tce_build_rm * tce_free_rm * tce_flush_rm This makes use of new ppc_md calls in iommu_clear_tces_and_put_pages. This changes iommu_tce_build() to handle multiple pages at once under the same lock. tce_flush() is called once per call. This adds a memory barrier after flushing TCE table changes. This removes comment about hwaddr as now it is an array called hpas and hpa is descriptive enough acronym. This does not clear TCE for a huge page in real mode and passes handling of this to virtual mode. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kernel/iommu.c | 107 +--- 1 file changed, 81 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 8771b73..dd68569 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1021,53 +1021,108 @@ int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, unsigned long entry, unsigned long pages, bool realmode) { - unsigned long oldtce; - struct page *page; + int i, ret = 0, to_free = 0; - for ( ; pages; --pages, ++entry) { - oldtce = iommu_clear_tce(tbl, entry); - if (!oldtce) + if (realmode !ppc_md.tce_free_rm) + return -EAGAIN; + + for (i = 0; i pages; ++i) { + unsigned long oldtce = ppc_md.tce_get(tbl, entry + i); + + if (!(oldtce (TCE_PCI_WRITE | TCE_PCI_READ))) continue; - page = pfn_to_page(oldtce PAGE_SHIFT); - WARN_ON(!page); - if (page) { - if (oldtce TCE_PCI_WRITE) - SetPageDirty(page); - put_page(page); + if (realmode) { + struct page *pg = realmode_pfn_to_page( + oldtce PAGE_SHIFT); + if (!pg) { + ret = -EAGAIN; + } else if (PageCompound(pg)) { + ret = -EAGAIN; + } else { + if (oldtce TCE_PCI_WRITE) + SetPageDirty(pg); + if (!put_page_unless_one(pg)) + ret = -EAGAIN; + } + } else { + struct page *pg = pfn_to_page(oldtce PAGE_SHIFT); + + if (!pg) { + ret = -EAGAIN; + } 
else { + if (oldtce TCE_PCI_WRITE) + SetPageDirty(pg); + put_page(pg); + } } + if (ret) + break; + to_free = i + 1; } - return 0; + if (to_free) { + if (realmode) + ppc_md.tce_free_rm(tbl, entry, to_free); + else + ppc_md.tce_free(tbl, entry, to_free); + + if (realmode ppc_md.tce_flush_rm) + ppc_md.tce_flush_rm(tbl); + else if (!realmode ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + } + + /* Make sure updates are seen by hardware */ + mb(); + + return ret; } EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages); -/* - * hwaddr is a kernel virtual address here (0xc... bazillion), - * tce_build converts it to a physical address. - */ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, unsigned long *hpas, unsigned long npages, bool realmode) { - int ret = -EBUSY; - unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); - enum dma_data_direction direction = iommu_tce_direction(*hpas); + int i, ret = 0; - spin_lock((pool-lock)); + if (realmode !ppc_md.tce_build_rm) + return -EAGAIN; - ret = ppc_md.tce_build(tbl, entry, 1, *hpas, oldtce, - direction, NULL); + for (i = 0; i npages; ++i) { + unsigned long hva = (unsigned long) __va(hpas[i]); + enum dma_data_direction dir = iommu_tce_direction(hva); + unsigned long oldtce = 0; - if (oldtce (TCE_PCI_WRITE | TCE_PCI_READ)) - put_page(pfn_to_page(__pa(oldtce))); + if (realmode) { + ret = ppc_md.tce_build_rm(tbl, entry + i, 1, + hva, oldtce, dir, NULL); + if (oldtce (TCE_PCI_WRITE | TCE_PCI_READ)) { +
[PATCH v1 4/7] KVM: PPC: Replace SPAPR_TCE_SHIFT with IOMMU_PAGE_SHIFT_4K
SPAPR_TCE_SHIFT is used in few places only and since IOMMU_PAGE_SHIFT_4K can be easily used instead, remove SPAPR_TCE_SHIFT. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/kvm_book3s_64.h | 2 -- arch/powerpc/kvm/book3s_64_vio.c | 3 ++- arch/powerpc/kvm/book3s_64_vio_hv.c | 5 +++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index fddb72b..4f7dcf6 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -33,8 +33,6 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) } #endif -#define SPAPR_TCE_SHIFT12 - #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ extern unsigned long kvm_rma_pages; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 516f2ee..e9bcb13 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -36,12 +36,13 @@ #include asm/ppc-opcode.h #include asm/kvm_host.h #include asm/udbg.h +#include asm/iommu.h #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) static long kvmppc_stt_npages(unsigned long window_size) { - return ALIGN((window_size SPAPR_TCE_SHIFT) + return ALIGN((window_size IOMMU_PAGE_SHIFT_4K) * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 89e96b3..2624a01 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -35,6 +35,7 @@ #include asm/ppc-opcode.h #include asm/kvm_host.h #include asm/udbg.h +#include asm/iommu.h #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) @@ -52,7 +53,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, list_for_each_entry(stt, kvm-arch.spapr_tce_tables, list) { if (stt-liobn == liobn) { - unsigned long idx = ioba 
>> IOMMU_PAGE_SHIFT_4K; struct page *page; u64 *tbl; @@ -84,7 +85,7 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, list_for_each_entry(stt, kvm->arch.spapr_tce_tables, list) { if (stt->liobn == liobn) { - unsigned long idx = ioba >> SPAPR_TCE_SHIFT; + unsigned long idx = ioba >> IOMMU_PAGE_SHIFT_4K; struct page *page; u64 *tbl; -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 5/7] KVM: PPC: Move reusable bits of H_PUT_TCE handler to helpers
Upcoming multi-tce support (H_PUT_TCE_INDIRECT/H_STUFF_TCE hypercalls) will validate TCE (not to have unexpected bits) and IO address (to be within the DMA window boundaries). This introduces helpers to validate TCE and IO address. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/kvm_ppc.h | 4 ++ arch/powerpc/kvm/book3s_64_vio_hv.c | 117 2 files changed, 109 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 9c89cdd..26e6e1a 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -127,6 +127,10 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); +extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long ioba, unsigned long npages); +extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, + unsigned long tce); extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce); extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 2624a01..ab3f50f 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -36,9 +36,102 @@ #include asm/kvm_host.h #include asm/udbg.h #include asm/iommu.h +#include asm/tce.h #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) +/* + * Validates IO address. 
+ * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long ioba, unsigned long npages) +{ + unsigned long mask = (1 IOMMU_PAGE_SHIFT_4K) - 1; + unsigned long idx = ioba IOMMU_PAGE_SHIFT_4K; + unsigned long size = stt-window_size IOMMU_PAGE_SHIFT_4K; + + if ((ioba mask) || (size + npages = idx)) + return H_PARAMETER; + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); + +/* + * Validates TCE address. + * At the moment flags and page mask are validated. + * As the host kernel does not access those addresses (just puts them + * to the table and user space is supposed to process them), we can skip + * checking other things (such as TCE is a guest RAM address or the page + * was actually allocated). + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) +{ + unsigned long mask = ((1 IOMMU_PAGE_SHIFT_4K) - 1) + ~(TCE_PCI_WRITE | TCE_PCI_READ); + + if (tce mask) + return H_PARAMETER; + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_tce_validate); + +/* Note on the use of page_address() in real mode, + * + * It is safe to use page_address() in real mode on ppc64 because + * page_address() is always defined as lowmem_page_address() + * which returns __va(PFN_PHYS(page_to_pfn(page))) which is arithmetial + * operation and does not access page struct. + * + * Theoretically page_address() could be defined different + * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL + * should be enabled. + * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64, + * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only + * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP + * is not expected to be enabled on ppc32, page_address() + * is safe for ppc32 as well. 
+ * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +static u64 *kvmppc_page_address(struct page *page) +{ +#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL) +#error TODO: fix to avoid page_address() here +#endif + return (u64 *) page_address(page); +} + +/* + * Handles TCE requests for emulated devices. + * Puts guest TCE values to the table and expects user space to convert them. + * Called in both real and virtual modes. + * Cannot fail so kvmppc_tce_validate must be called before it. + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, + unsigned long idx, unsigned long tce) +{ + struct page *page; + u64 *tbl; + + page = stt-pages[idx / TCES_PER_PAGE]; + tbl = kvmppc_page_address(page); + + tbl[idx % TCES_PER_PAGE] = tce; +} +EXPORT_SYMBOL_GPL(kvmppc_tce_put); + /* WARNING: This will be called in real-mode on HV KVM and virtual * mode on PR KVM */ @@ -54,20 +147,19 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
[PATCH v1 6/7] KVM: PPC: Add kvmppc_find_tce_table()
This adds a common helper to search for a kvmppc_spapr_tce_table by LIOBN. This makes H_PUT_TCE and H_GET_TCE handler use this new helper. The helper will be also used in H_PUT_TCE_INDIRECT and H_STUFF_TCE handlers. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kvm/book3s_64_vio_hv.c | 79 - 1 file changed, 43 insertions(+), 36 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index ab3f50f..79406f1 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -40,6 +40,20 @@ #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) +struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(struct kvm *kvm, + unsigned long liobn) +{ + struct kvmppc_spapr_tce_table *stt; + + list_for_each_entry(stt, kvm-arch.spapr_tce_tables, list) { + if (stt-liobn == liobn) + return stt; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(kvmppc_find_tce_table); + /* * Validates IO address. * @@ -138,62 +152,55 @@ EXPORT_SYMBOL_GPL(kvmppc_tce_put); long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { - struct kvm *kvm = vcpu-kvm; struct kvmppc_spapr_tce_table *stt; + long ret; + unsigned long idx; /* udbg_printf(H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n, */ /* liobn, ioba, tce); */ - list_for_each_entry(stt, kvm-arch.spapr_tce_tables, list) { - if (stt-liobn == liobn) { - unsigned long idx = ioba IOMMU_PAGE_SHIFT_4K; - /* udbg_printf(H_PUT_TCE: liobn 0x%lx = stt=%p window_size=0x%x\n, */ - /* liobn, stt, stt-window_size); */ - long ret = kvmppc_ioba_validate(stt, ioba, 1); + stt = kvmppc_find_tce_table(vcpu-kvm, liobn); + if (!stt) + return H_TOO_HARD; - if (ret) - return ret; + ret = kvmppc_ioba_validate(stt, ioba, 1); + if (ret) + return ret; - ret = kvmppc_tce_validate(stt, tce); - if (ret) - return ret; + ret = kvmppc_tce_validate(stt, tce); + if (ret) + return ret; - kvmppc_tce_put(stt, idx, tce); + idx = ioba IOMMU_PAGE_SHIFT_4K; + 
kvmppc_tce_put(stt, idx, tce); - return H_SUCCESS; - } - } - - /* Didn't find the liobn, punt it to userspace */ - return H_TOO_HARD; + return H_SUCCESS; } EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba) { - struct kvm *kvm = vcpu-kvm; struct kvmppc_spapr_tce_table *stt; + long ret; + unsigned long idx; + struct page *page; + u64 *tbl; - list_for_each_entry(stt, kvm-arch.spapr_tce_tables, list) { - if (stt-liobn == liobn) { - unsigned long idx = ioba IOMMU_PAGE_SHIFT_4K; - struct page *page; - u64 *tbl; - long ret = kvmppc_ioba_validate(stt, ioba, 1); + stt = kvmppc_find_tce_table(vcpu-kvm, liobn); + if (!stt) + return H_TOO_HARD; - if (ret) - return ret; + ret = kvmppc_ioba_validate(stt, ioba, 1); + if (ret) + return ret; - page = stt-pages[idx / TCES_PER_PAGE]; - tbl = (u64 *)page_address(page); + idx = ioba IOMMU_PAGE_SHIFT_4K; + page = stt-pages[idx / TCES_PER_PAGE]; + tbl = (u64 *)page_address(page); - vcpu-arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; - return H_SUCCESS; - } - } + vcpu-arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; - /* Didn't find the liobn, punt it to userspace */ - return H_TOO_HARD; + return H_SUCCESS; } EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 7/7] KVM: PPC: Add support for multiple-TCE hcalls
This adds real and virtual mode handlers for the H_PUT_TCE_INDIRECT and H_STUFF_TCE hypercalls for user space emulated devices such as IBMVIO devices or emulated PCI. These calls allow adding multiple entries (up to 512) into the TCE table in one call which saves time on transition between kernel and user space. This adds a tce_tmp_hpas cache to kvm_vcpu_arch to save valid TCEs (copied from user and verified) before writing the whole list into the TCE table. This cache will be utilized more in the upcoming VFIO/IOMMU support to continue TCE list processing in the virtual mode in the case if the real mode handler failed for some reason. This adds kvmppc_spapr_tce_init() and kvmppc_spapr_tce_free() helpers to allocate and free the tce_tmp_hpas cache. This adds a function to convert a guest physical address to a host virtual address in order to parse a TCE list from H_PUT_TCE_INDIRECT. This caches tce_rm_list_pg TCE list page pointer for situation when the real mode handler managed to reference the list page and then PTE changed and real mode handler could not dereference the page. The cached page pointer is dereferenced in virtual mode. This implements the KVM_CAP_PPC_MULTITCE capability. When present, the kernel will try handling H_PUT_TCE_INDIRECT and H_STUFF_TCE. If they can not be handled by the kernel, they are passed on to the user space. The user space still has to have an implementation for these. Both HV and PR-syle KVM are supported. 
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changelog: v12: * used RCU for kvm-arch.spapr_tce_tables v11: * added kvm_vcpu_arch::tce_rm_list_pg to cache page struct pointer referenced-and-not-dereferenced in real mode * kvmppc_spapr_tce_init/kvmppc_spapr_tce_free called from PR code now too * removed get_page/put_page from virtual mode handler' * srcu_read_lock(vcpu-kvm-srcu) now protects entire kvmppc_h_put_tce_indirect (virtual mode handler for H_PUT_TCE_INDIRECT) v10: * kvmppc_find_tce_table() changed to take kvm* instead of vcpu* v8: * fixed warnings from check_patch.pl 2013/08/01 (v7): * realmode_get_page/realmode_put_page use was replaced with get_page_unless_zero/put_page_unless_one 2013/07/11: * addressed many, many comments from maintainers 2013/07/06: * fixed number of wrong get_page()/put_page() calls 2013/06/27: * fixed clear of BUSY bit in kvmppc_lookup_pte() * H_PUT_TCE_INDIRECT does realmode_get_page() now * KVM_CAP_SPAPR_MULTITCE now depends on CONFIG_PPC_BOOK3S_64 * updated doc 2013/06/05: * fixed mistype about IBMVIO in the commit message * updated doc and moved it to another section * changed capability number 2013/05/21: * added kvm_vcpu_arch::tce_tmp * removed cleanup if put_indirect failed, instead we do not even start writing to TCE table if we cannot get TCEs from the user and they are invalid * kvmppc_emulated_h_put_tce is split to kvmppc_emulated_put_tce and kvmppc_emulated_validate_tce (for the previous item) * fixed bug with failthrough for H_IPI * removed all get_user() from real mode handlers * kvmppc_lookup_pte() added (instead of making lookup_linux_pte public) --- Documentation/virtual/kvm/api.txt | 26 + arch/powerpc/include/asm/kvm_host.h | 30 ++ arch/powerpc/include/asm/kvm_ppc.h | 12 +++ arch/powerpc/kvm/book3s_64_vio.c| 174 +++- arch/powerpc/kvm/book3s_64_vio_hv.c | 168 +- arch/powerpc/kvm/book3s_hv.c| 30 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 +- arch/powerpc/kvm/book3s_pr.c| 4 + 
arch/powerpc/kvm/book3s_pr_papr.c | 35 +++ arch/powerpc/kvm/powerpc.c | 3 + 10 files changed, 478 insertions(+), 8 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 0fe3649..e1c72bf 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2494,6 +2494,32 @@ calls by the guest for that service will be passed to userspace to be handled. +4.87 KVM_CAP_PPC_MULTITCE + +Capability: KVM_CAP_PPC_MULTITCE +Architectures: ppc +Type: vm + +This capability means the kernel is capable of handling hypercalls +H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user +space. This significantly accelerates DMA operations for PPC KVM guests. +User space should expect that its handlers for these hypercalls +are not going to be called if user space previously registered LIOBN +in KVM (via KVM_CREATE_SPAPR_TCE or similar calls). + +In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest, +user space might have to advertise it for the guest. For example, +IBM pSeries (sPAPR) guest starts using them if hcall-multi-tce is +present in the ibm,hypertas-functions device-tree property. + +The hypercalls mentioned above may or may not be processed successfully +in the kernel based fast path. If they can not be handled by the kernel, +they will get passed on to user
[PATCH v1 15/16] vfio: Use it_page_size
This makes use of the it_page_size from the iommu_table struct as page size can differ. This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code as recently introduced IOMMU_PAGE_XXX macros do not include IOMMU_PAGE_SHIFT. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- drivers/vfio/vfio_iommu_spapr_tce.c | 22 +++--- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index ff2bb92..808c7d3 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -97,7 +97,7 @@ static int tce_iommu_enable(struct tce_container *container) return -ENXIO; down_write(current-mm-mmap_sem); - npages = (tbl-it_size IOMMU_PAGE_SHIFT_4K) PAGE_SHIFT; + npages = (tbl-it_size tbl-it_page_shift) PAGE_SHIFT; locked = current-mm-locked_vm + npages; lock_limit = rlimit(RLIMIT_MEMLOCK) PAGE_SHIFT; if (locked lock_limit !capable(CAP_IPC_LOCK)) { @@ -137,7 +137,7 @@ static void tce_iommu_disable(struct tce_container *container) down_write(current-mm-mmap_sem); current-mm-locked_vm -= (tbl-it_size - IOMMU_PAGE_SHIFT_4K) PAGE_SHIFT; + tbl-it_page_shift) PAGE_SHIFT; up_write(current-mm-mmap_sem); } @@ -210,8 +210,8 @@ static long tce_iommu_ioctl(void *iommu_data, if (info.argsz minsz) return -EINVAL; - info.dma32_window_start = tbl-it_offset IOMMU_PAGE_SHIFT_4K; - info.dma32_window_size = tbl-it_size IOMMU_PAGE_SHIFT_4K; + info.dma32_window_start = tbl-it_offset tbl-it_page_shift; + info.dma32_window_size = tbl-it_size tbl-it_page_shift; info.flags = 0; if (copy_to_user((void __user *)arg, info, minsz)) @@ -264,17 +264,17 @@ static long tce_iommu_ioctl(void *iommu_data, if (ret) return ret; - for (i = 0; i (param.size IOMMU_PAGE_SHIFT_4K); ++i) { + for (i = 0; i (param.size tbl-it_page_shift); ++i) { ret = iommu_put_tce_user_mode(tbl, - (param.iova IOMMU_PAGE_SHIFT_4K) + i, + (param.iova tbl-it_page_shift) + i, tce); if (ret) break; - tce += IOMMU_PAGE_SIZE_4K; + 
tce += IOMMU_PAGE_SIZE(tbl); } if (ret) iommu_clear_tces_and_put_pages(tbl, - param.iova IOMMU_PAGE_SHIFT_4K, i); + param.iova tbl-it_page_shift, i); iommu_flush_tce(tbl); @@ -315,13 +315,13 @@ static long tce_iommu_ioctl(void *iommu_data, BUG_ON(!tbl-it_group); ret = iommu_tce_clear_param_check(tbl, param.iova, 0, - param.size IOMMU_PAGE_SHIFT_4K); + param.size tbl-it_page_shift); if (ret) return ret; ret = iommu_clear_tces_and_put_pages(tbl, - param.iova IOMMU_PAGE_SHIFT_4K, - param.size IOMMU_PAGE_SHIFT_4K); + param.iova tbl-it_page_shift, + param.size tbl-it_page_shift); iommu_flush_tce(tbl); return ret; -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 00/13] powerpc: kvm: Enable in-kernel acceleration for VFIO
This enables in-kernel acceleration of TCE hypercalls (H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE). This implements acceleration for both real and virtual modes. This was made on top of both: [PATCH v1 00/16] powernv: vfio: Add Dynamic DMA windows (DDW) [PATCH v1 0/7] powerpc/iommu: kvm: Enable MultiTCE support Alexey Kardashevskiy (13): KVM: PPC: Account TCE pages in locked_vm KVM: PPC: Rework kvmppc_spapr_tce_table to support variable page size KVM: PPC: Enable IOMMU_API for KVM_BOOK3S_64 permanently KVM: PPC: Reserve KVM_CAP_SPAPR_TCE_VFIO capability number KVM: PPC: Reserve KVM_CAP_SPAPR_TCE_64 capability number KVM: PPC: Add @offset to kvmppc_spapr_tce_table KVM: PPC: Add support for 64bit TCE windows KVM: PPC: Add hugepage support for IOMMU in-kernel handling KVM: PPC: Add page_shift support for in-kernel H_PUT_TCE/etc handlers KVM: PPC: Fix kvmppc_gpa_to_hva_and_get() to return host physical address KVM: PPC: Associate IOMMU group with guest copy of TCE table KVM: PPC: vfio kvm device: support spapr tce KVM: PPC: Add support for IOMMU in-kernel handling Documentation/virtual/kvm/api.txt | 51 Documentation/virtual/kvm/devices/vfio.txt | 20 +- arch/powerpc/include/asm/kvm_host.h| 41 ++- arch/powerpc/include/asm/kvm_ppc.h | 9 +- arch/powerpc/include/uapi/asm/kvm.h| 9 + arch/powerpc/kernel/iommu.c| 6 +- arch/powerpc/kvm/Kconfig | 2 + arch/powerpc/kvm/Makefile | 3 + arch/powerpc/kvm/book3s_64_vio.c | 389 +++-- arch/powerpc/kvm/book3s_64_vio_hv.c| 177 - arch/powerpc/kvm/book3s_hv.c | 3 + arch/powerpc/kvm/powerpc.c | 25 +- include/uapi/linux/kvm.h | 12 + virt/kvm/vfio.c| 69 + 14 files changed, 775 insertions(+), 41 deletions(-) -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 01/13] KVM: PPC: Account TCE pages in locked_vm
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kvm/book3s_64_vio.c | 35 ++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 2137836..4ca33f1 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -73,18 +73,48 @@ static long kvmppc_stt_npages(unsigned long window_size) * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } +/* + * Checks ulimit in order not to let the user space to pin all + * available memory for TCE tables. + */ +static long kvmppc_account_memlimit(long npages) +{ + unsigned long ret = 0, locked, lock_limit; + + if (!current-mm) + return -ESRCH; /* process exited */ + + down_write(current-mm-mmap_sem); + locked = current-mm-locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) PAGE_SHIFT; + if (locked lock_limit !capable(CAP_IPC_LOCK)) { + pr_warn(RLIMIT_MEMLOCK (%ld) exceeded\n, + rlimit(RLIMIT_MEMLOCK)); + ret = -ENOMEM; + } else { + current-mm-locked_vm += npages; + } + up_write(current-mm-mmap_sem); + + return ret; +} + static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) { struct kvm *kvm = stt-kvm; int i; + long npages = kvmppc_stt_npages(stt-window_size); mutex_lock(kvm-lock); list_del(stt-list); - for (i = 0; i kvmppc_stt_npages(stt-window_size); i++) + for (i = 0; i npages; i++) __free_page(stt-pages[i]); + kfree(stt); mutex_unlock(kvm-lock); + kvmppc_account_memlimit(-(npages + 1)); + kvm_put_kvm(kvm); } @@ -140,6 +170,9 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, } npages = kvmppc_stt_npages(args-window_size); + ret = kvmppc_account_memlimit(npages + 1); + if (ret) + goto fail; stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 02/13] KVM: PPC: Rework kvmppc_spapr_tce_table to support variable page size
At the moment the kvmppc_spapr_tce_table struct can only describe 4GB windows which is not enough for big DMA windows. This replaces window_size (in bytes, 4GB max) with page_shift (32bit) and size (64bit, in pages). Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/kvm_host.h | 3 ++- arch/powerpc/kvm/book3s_64_vio.c| 17 + arch/powerpc/kvm/book3s_64_vio_hv.c | 3 +-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index c37fee2..d3a154c 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -180,7 +180,8 @@ struct kvmppc_spapr_tce_table { struct list_head list; struct kvm *kvm; u64 liobn; - u32 window_size; + u32 page_shift; + u64 size; /* in pages */ struct page *pages[0]; }; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 4ca33f1..f2c8e4d 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -67,10 +67,9 @@ void kvmppc_spapr_tce_free(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvmppc_spapr_tce_free); -static long kvmppc_stt_npages(unsigned long window_size) +static long kvmppc_stt_npages(unsigned long size) { - return ALIGN((window_size IOMMU_PAGE_SHIFT_4K) -* sizeof(u64), PAGE_SIZE) / PAGE_SIZE; + return ALIGN(size * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } /* @@ -103,7 +102,7 @@ static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) { struct kvm *kvm = stt-kvm; int i; - long npages = kvmppc_stt_npages(stt-window_size); + long npages = kvmppc_stt_npages(stt-size); mutex_lock(kvm-lock); list_del(stt-list); @@ -123,7 +122,7 @@ static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct kvmppc_spapr_tce_table *stt = vma-vm_file-private_data; struct page *page; - if (vmf-pgoff = kvmppc_stt_npages(stt-window_size)) + if (vmf-pgoff = kvmppc_stt_npages(stt-size)) return VM_FAULT_SIGBUS; page = 
stt-pages[vmf-pgoff]; @@ -159,7 +158,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args) { struct kvmppc_spapr_tce_table *stt = NULL; - long npages; + long npages, size; int ret = -ENOMEM; int i; @@ -169,7 +168,8 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return -EBUSY; } - npages = kvmppc_stt_npages(args-window_size); + size = args-window_size IOMMU_PAGE_SHIFT_4K; + npages = kvmppc_stt_npages(size); ret = kvmppc_account_memlimit(npages + 1); if (ret) goto fail; @@ -180,7 +180,8 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; stt-liobn = args-liobn; - stt-window_size = args-window_size; + stt-page_shift = IOMMU_PAGE_SHIFT_4K; + stt-size = size; stt-kvm = kvm; for (i = 0; i npages; i++) { diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 79a39bb..fadfacb 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -73,9 +73,8 @@ long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, { unsigned long mask = (1 IOMMU_PAGE_SHIFT_4K) - 1; unsigned long idx = ioba IOMMU_PAGE_SHIFT_4K; - unsigned long size = stt-window_size IOMMU_PAGE_SHIFT_4K; - if ((ioba mask) || (size + npages = idx)) + if ((ioba mask) || (stt-size + npages = idx)) return H_PARAMETER; return H_SUCCESS; -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 04/13] KVM: PPC: Reserve KVM_CAP_SPAPR_TCE_VFIO capability number
This adds a capability number for in-kernel support for VFIO on SPAPR platform. The capability will tell the user space whether in-kernel handlers of H_PUT_TCE can handle VFIO-targeted requests or not. If not, the user space must not attempt allocating a TCE table in the host kernel via the KVM_CREATE_SPAPR_TCE KVM ioctl because in that case TCE requests will not be passed to the user space, which is the desired action in such a situation. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e11d8f1..3048c86 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -758,6 +758,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_VM_ATTRIBUTES 101 #define KVM_CAP_ARM_PSCI_0_2 102 #define KVM_CAP_PPC_FIXUP_HCALL 103 +#define KVM_CAP_SPAPR_TCE_VFIO 104 #ifdef KVM_CAP_IRQ_ROUTING -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 05/13] KVM: PPC: Reserve KVM_CAP_SPAPR_TCE_64 capability number
This adds a capability number for 64-bit TCE table support. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3048c86..65c2689 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -759,6 +759,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ARM_PSCI_0_2 102 #define KVM_CAP_PPC_FIXUP_HCALL 103 #define KVM_CAP_SPAPR_TCE_VFIO 104 +#define KVM_CAP_SPAPR_TCE_64 105 #ifdef KVM_CAP_IRQ_ROUTING -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 03/13] KVM: PPC: Enable IOMMU_API for KVM_BOOK3S_64 permanently
It does not make much sense to have KVM in book3s-64 and not to have IOMMU bits for PCI pass through support as it costs little and allows VFIO to function on book3s KVM. Having IOMMU_API always enabled makes it unnecessary to have a lot of #ifdef IOMMU_API in arch/powerpc/kvm/book3s_64_vio*. With those ifdef's we could have only user space emulated devices accelerated (but not VFIO) which do not seem to be very useful. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index d7a16ac6..301fa6b 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -63,6 +63,7 @@ config KVM_BOOK3S_64 select KVM_BOOK3S_64_HANDLER select KVM select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE + select SPAPR_TCE_IOMMU if IOMMU_SUPPORT ---help--- Support running unmodified book3s_64 and book3s_32 guest kernels in virtual machines on book3s_64 host processors. -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 07/13] KVM: PPC: Add support for 64bit TCE windows
The existing KVM_CREATE_SPAPR_TCE only supports 32bit windows which is not enough for directly mapped windows as the guest can get more than 4GB. This adds KVM_CREATE_SPAPR_TCE_64 ioctl and advertises it via KVM_CAP_SPAPR_TCE_64 capability. Since 64bit windows are to support Dynamic DMA windows (DDW), let's add @bus_offset and @page_shift which are also required by DDW. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Documentation/virtual/kvm/api.txt | 51 + arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/include/uapi/asm/kvm.h | 9 +++ arch/powerpc/kvm/book3s_64_vio.c| 10 +--- arch/powerpc/kvm/powerpc.c | 25 +- include/uapi/linux/kvm.h| 2 ++ 6 files changed, 94 insertions(+), 5 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e1c72bf..b4695ea 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2520,6 +2520,57 @@ an implementation for these despite the in kernel acceleration. This capability is always enabled. +4.88 KVM_CREATE_SPAPR_TCE_64 + +Capability: KVM_CAP_SPAPR_TCE_64 +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_create_spapr_tce_64 (in) +Returns: file descriptor for manipulating the created TCE table + +This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit +windows. + +This creates a virtual TCE (translation control entry) table, which +is an IOMMU for PAPR-style virtual I/O. It is used to translate +logical addresses used in virtual I/O into guest physical addresses, +and provides a scatter/gather capability for PAPR virtual I/O. + +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u32 page_shift; + __u64 offset; /* in pages */ + __u64 size; /* in pages */ + __u32 flags; +}; + + +!!! FIXME !!! + + +The liobn field gives the logical IO bus number for which to create a +TCE table. 
The window_size field specifies the size of the DMA window +which this TCE table will translate - the table will contain one 64 +bit TCE entry for every IOMMU page. The bus_offset field tells where +this window is mapped on the IO bus. The page_shift field tells the size +of the pages in this window (for example, 10, 16, 24 for 4K, 64K, 16MB +page sizes respectively). The flags field is not used at the moment +but provides the room for extensions. + +When the guest issues an H_PUT_TCE/H_PUT_TCE_INDIRECT/H_STUFF_TCE hcall +on a liobn for which a TCE table has been created using this ioctl(), +the kernel will handle it in real or virtual mode, updating the TCE table. +If liobn has not been registered with this ioctl, H_PUT_TCE/etc calls +will cause a vm exit and must be handled by userspace. + +The return value is a file descriptor which can be passed to mmap(2) +to map the created TCE table into userspace. This lets userspace read +the entries written by kernel-handled H_PUT_TCE calls, and also lets +userspace update the TCE table directly which is useful in some +circumstances. + + 5. 
The kvm_run structure diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index b84ed80..e0a68ef 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -128,7 +128,7 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern int kvmppc_spapr_tce_init(struct kvm_vcpu *vcpu); extern void kvmppc_spapr_tce_free(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, - struct kvm_create_spapr_tce *args); + struct kvm_create_spapr_tce_64 *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table( struct kvm *kvm, unsigned long liobn); extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 2bc4a94..4452f6e 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -333,6 +333,15 @@ struct kvm_create_spapr_tce { __u32 window_size; }; +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u32 page_shift; + __u64 offset; /* in pages */ + __u64 size; /* in pages */ + __u32 flags; +}; + /* for KVM_ALLOCATE_RMA */ struct kvm_allocate_rma { __u64 rma_size; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index f2c8e4d..2c6ab20 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -155,20 +155,23 @@ static const struct file_operations kvm_spapr_tce_fops = { }; long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, - struct kvm_create_spapr_tce *args) + struct
[PATCH v1 06/13] KVM: PPC: Add @offset to kvmppc_spapr_tce_table
This enables guest visible TCE tables to start from non-zero offset on a bus. This will be used for VFIO support. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_64_vio_hv.c | 5 - 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d3a154c..ed96b09 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -181,6 +181,7 @@ struct kvmppc_spapr_tce_table { struct kvm *kvm; u64 liobn; u32 page_shift; + u64 offset; /* in pages */ u64 size; /* in pages */ struct page *pages[0]; }; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index fadfacb..a3a6597 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -74,7 +74,8 @@ long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, unsigned long mask = (1 IOMMU_PAGE_SHIFT_4K) - 1; unsigned long idx = ioba IOMMU_PAGE_SHIFT_4K; - if ((ioba mask) || (stt-size + npages = idx)) + if ((ioba mask) || (idx stt-offset) || + (stt-offset + stt-size + npages = idx)) return H_PARAMETER; return H_SUCCESS; @@ -146,6 +147,7 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, struct page *page; u64 *tbl; + idx -= stt-offset; page = stt-pages[idx / TCES_PER_PAGE]; tbl = kvmppc_page_address(page); @@ -351,6 +353,7 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, return ret; idx = ioba IOMMU_PAGE_SHIFT_4K; + idx -= stt-offset; page = stt-pages[idx / TCES_PER_PAGE]; tbl = (u64 *)page_address(page); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 08/13] KVM: PPC: Add hugepage support for IOMMU in-kernel handling
This adds special support for huge pages (16MB) in real mode. The reference counting cannot be easily done for such pages in real mode (when MMU is off) so this adds a hash table of huge pages. It is populated in virtual mode and get_page is called just once per huge page. Real mode handlers check if the requested page is in the hash table, then no reference counting is done, otherwise an exit to virtual mode happens. The hash table is released at KVM exit. This defines kvmppc_spapr_iommu_hugepage hash table entry and adds it to kvm_arch. This adds kvmppc_iommu_hugepages_init() and kvmppc_iommu_hugepages_cleanup() helpers. The latter puts cached pages. This fixes iommu_clear_tces_and_put_pages() not to put huge pages as this is to be done by kvmppc_iommu_hugepages_cleanup(). This implements a real mode kvmppc_rm_hugepage_gpa_to_hpa() helper to find a hash entry and a virtual mode kvmppc_iommu_hugepage_try_add() helper to add one. At the moment the fastest card available for tests uses up to 9 huge pages so walking through this hash table does not cost much. However this can change and we may want to optimize this. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: v11: * moved hashtables from IOMMU to KVM 2013/07/12: * removed multiple #ifdef IOMMU_API as IOMMU_API is always enabled for KVM_BOOK3S_64 2013/06/27: * list of huge pages replaced with hashtable for better performance * spinlock removed from real mode and only protects insertion of new huge pages descriptors into the hashtable 2013/06/05: * fixed compile error when CONFIG_IOMMU_API=n 2013/05/20: * the real mode handler now searches for a huge page by gpa (used to be pte) * the virtual mode handler prints warning if it is called twice for the same huge page as the real mode handler is expected to fail just once - when a huge page is not in the list yet. 
* the huge page is refcounted twice - when added to the hugepage list and when used in the virtual mode hcall handler (can be optimized but it will make the patch less nice). --- arch/powerpc/include/asm/kvm_host.h | 34 +++ arch/powerpc/include/asm/kvm_ppc.h | 2 + arch/powerpc/kernel/iommu.c | 6 +- arch/powerpc/kvm/book3s_64_vio.c| 116 +++- arch/powerpc/kvm/book3s_64_vio_hv.c | 25 arch/powerpc/kvm/book3s_hv.c| 3 + 6 files changed, 183 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index ed96b09..8a3b465 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -31,6 +31,7 @@ #include linux/list.h #include linux/atomic.h #include linux/tracepoint.h +#include linux/hashtable.h #include asm/kvm_asm.h #include asm/processor.h #include asm/page.h @@ -191,6 +192,36 @@ struct kvm_rma_info { unsigned long base_pfn; }; +/* + * The KVM guest can be backed with 16MB pages. + * In this case, we cannot do page counting from the real mode + * as the compound pages are used - they are linked in a list + * with pointers as virtual addresses which are inaccessible + * in real mode. + * + * To address the issue, here is what we do: + * + * 1) add a hashtable per KVM, each entry is kvmppc_spapr_iommu_hugepage + * and describes gpa-to-hpa mapping; + * 2) in real mode, if gpa is in the hash table, use the cached hpa; + * otherwise pass the request to virtual mode; + * 3) in virtual mode, check if gpa is in the hash table and use cached + * hpa; otherwise translate gpa to hpa and reference the page. + * + * hpa of every used hugepage will be cached in the hash table + * and referenced just once. Pages are released at KVM exit. 
+ */ +#define KVMPPC_SPAPR_HUGEPAGE_HASH(gpa)hash_32(gpa 24, 32) +#define KVMPPC_SPAPR_HUGEPAGE_BUCKETS 64 + +struct kvmppc_spapr_iommu_hugepage { + struct hlist_node hash_node; + unsigned long gpa; /* Guest physical address */ + unsigned long hpa; /* Host physical address */ + struct page *page; /* page struct of the very first subpage */ + unsigned long size; /* Huge page size (always 16MB at the moment) */ +}; + /* XICS components, defined in book3s_xics.c */ struct kvmppc_xics; struct kvmppc_icp; @@ -266,6 +297,9 @@ struct kvm_arch { #ifdef CONFIG_PPC_BOOK3S_64 struct list_head spapr_tce_tables; struct list_head rtas_tokens; + DECLARE_HASHTABLE(hugepages_hash_tab, + ilog2(KVMPPC_SPAPR_HUGEPAGE_BUCKETS)); + spinlock_t hugepages_write_lock; #endif #ifdef CONFIG_KVM_MPIC struct openpic *mpic; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e0a68ef..86f5015 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -127,6 +127,8 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern int kvmppc_spapr_tce_init(struct kvm_vcpu *vcpu); extern void
[PATCH v1 09/13] KVM: PPC: Add page_shift support for in-kernel H_PUT_TCE/etc handlers
Recently introduced KVM_CREATE_SPAPR_TCE_64 added page_shift. This makes use of it in kvmppc_tce_put(). This changes kvmppc_tce_put() to take an TCE index rather than IO address. This does not change the existing behaviour and will be utilized later by Dynamic DMA windows which support 64K and 16MB page sizes. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kvm/book3s_64_vio.c| 8 arch/powerpc/kvm/book3s_64_vio_hv.c | 16 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 2648d88..8250521 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -371,7 +371,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, if (ret) return ret; - kvmppc_tce_put(stt, ioba IOMMU_PAGE_SHIFT_4K, tce); + kvmppc_tce_put(stt, ioba stt-page_shift, tce); return H_SUCCESS; } @@ -436,7 +436,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, } for (i = 0; i npages; ++i) - kvmppc_tce_put(stt, (ioba IOMMU_PAGE_SHIFT_4K) + i, + kvmppc_tce_put(stt, (ioba stt-page_shift) + i, vcpu-arch.tce_tmp_hpas[i]); unlock_exit: @@ -465,8 +465,8 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, if (ret || (tce_value (TCE_PCI_WRITE | TCE_PCI_READ))) return H_PARAMETER; - for (i = 0; i npages; ++i, ioba += IOMMU_PAGE_SIZE_4K) - kvmppc_tce_put(stt, ioba IOMMU_PAGE_SHIFT_4K, tce_value); + for (i = 0; i npages; ++i, ioba += (1 stt-page_shift)) + kvmppc_tce_put(stt, ioba stt-page_shift, tce_value); return H_SUCCESS; } diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 6c0b95d..99bac58 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -71,8 +71,8 @@ EXPORT_SYMBOL_GPL(kvmppc_find_tce_table); long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, unsigned long ioba, unsigned long npages) { - unsigned long mask = (1 IOMMU_PAGE_SHIFT_4K) - 1; - unsigned long idx = ioba IOMMU_PAGE_SHIFT_4K; + 
unsigned long mask = (1 stt-page_shift) - 1; + unsigned long idx = ioba stt-page_shift; if ((ioba mask) || (idx stt-offset) || (stt-offset + stt-size + npages = idx)) @@ -95,7 +95,7 @@ EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); */ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) { - unsigned long mask = ((1 IOMMU_PAGE_SHIFT_4K) - 1) + unsigned long mask = ((1 stt-page_shift) - 1) ~(TCE_PCI_WRITE | TCE_PCI_READ); if (tce mask) @@ -271,7 +271,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret) return ret; - idx = ioba IOMMU_PAGE_SHIFT_4K; + idx = ioba stt-page_shift; kvmppc_tce_put(stt, idx, tce); return H_SUCCESS; @@ -323,7 +323,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, } for (i = 0; i npages; ++i) - kvmppc_tce_put(stt, (ioba IOMMU_PAGE_SHIFT_4K) + i, + kvmppc_tce_put(stt, (ioba stt-page_shift) + i, vcpu-arch.tce_tmp_hpas[i]); put_page_exit: @@ -354,8 +354,8 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, if (ret || (tce_value (TCE_PCI_WRITE | TCE_PCI_READ))) return H_PARAMETER; - for (i = 0; i npages; ++i, ioba += IOMMU_PAGE_SIZE_4K) - kvmppc_tce_put(stt, ioba IOMMU_PAGE_SHIFT_4K, tce_value); + for (i = 0; i npages; ++i, ioba += (1 stt-page_shift)) + kvmppc_tce_put(stt, ioba stt-page_shift, tce_value); return H_SUCCESS; } @@ -377,7 +377,7 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret) return ret; - idx = ioba IOMMU_PAGE_SHIFT_4K; + idx = ioba stt-page_shift; idx -= stt-offset; page = stt-pages[idx / TCES_PER_PAGE]; tbl = (u64 *)page_address(page); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 10/13] KVM: PPC: Fix kvmppc_gpa_to_hva_and_get() to return host physical address
The existing support of emulated devices does not need to calculate a host physical address as the translation is performed by the userspace. The upcoming support of VFIO needs it as it stores host physical addresses in the real hardware TCE table which hardware uses during DMA transfer. This translation could be done using page struct object which is returned by kvmppc_gpa_to_hva_and_get(). However kvmppc_gpa_to_hva_and_get() does not return valid page struct for huge pages to avoid possible bugs with excessive page releases. This extends kvmppc_gpa_to_hva_and_get() to return a physical page address. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/kvm/book3s_64_vio.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 8250521..573fd6d 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -321,7 +321,7 @@ fail: * and returns ERROR_ADDR if failed. */ static void __user *kvmppc_gpa_to_hva_and_get(struct kvm_vcpu *vcpu, - unsigned long gpa, struct page **pg) + unsigned long gpa, struct page **pg, unsigned long *phpa) { unsigned long hva, gfn = gpa PAGE_SHIFT; struct kvm_memory_slot *memslot; @@ -337,6 +337,10 @@ static void __user *kvmppc_gpa_to_hva_and_get(struct kvm_vcpu *vcpu, if (get_user_pages_fast(hva PAGE_MASK, 1, is_write, pg) != 1) return ERROR_ADDR; + if (phpa) + *phpa = __pa((unsigned long) page_address(*pg)) | + (hva ~PAGE_MASK); + /* * Check if this GPA is taken care of by the hash table. 
* If this is the case, do not show the caller page struct @@ -404,7 +404,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, return ret; idx = srcu_read_lock(&vcpu->kvm->srcu); - tces = kvmppc_gpa_to_hva_and_get(vcpu, tce_list, NULL); + tces = kvmppc_gpa_to_hva_and_get(vcpu, tce_list, NULL, NULL); if (tces == ERROR_ADDR) { ret = H_TOO_HARD; goto unlock_exit; } -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 12/13] KVM: PPC: vfio kvm device: support spapr tce
In addition to the external VFIO user API, a VFIO KVM device has been introduced recently. sPAPR TCE IOMMU is para-virtualized and the guest does map/unmap via hypercalls which take a logical bus id (LIOBN) as a target IOMMU identifier. LIOBNs are made up, advertised to the guest system and linked to IOMMU groups by the user space. In order to enable acceleration for IOMMU operations in KVM, we need to tell KVM the information about LIOBN-to-group mapping. For that, a new KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN parameter is added. It accepts a pair of a VFIO group fd and LIOBN. KVM uses kvm_vfio_find_group_by_liobn() once per KVM run and caches the result in kvm_arch. iommu_group_put() for all groups is called at KVM finish in the SPAPR TCE (will be added in KVM enablement patch). Before notifying KVM about new link, this check the group for being registered with KVM device in order to release them at unexpected KVM finish. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: v5: * added lock in search function * changed callback function type name v4: * fixed few bugs * changed kvm_vfio_find_group_by_liobn() to return informative errors v3: * total rework * added a release callback into kvm_vfio_find_group_by_liobn so now the user of the API can get a notification if the group is about to disappear --- Documentation/virtual/kvm/devices/vfio.txt | 20 - arch/powerpc/kvm/Kconfig | 1 + arch/powerpc/kvm/Makefile | 3 ++ include/uapi/linux/kvm.h | 8 virt/kvm/vfio.c| 69 ++ 5 files changed, 99 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt index ef51740..eaf0f5e 100644 --- a/Documentation/virtual/kvm/devices/vfio.txt +++ b/Documentation/virtual/kvm/devices/vfio.txt @@ -16,7 +16,23 @@ Groups: KVM_DEV_VFIO_GROUP attributes: KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. 
+ KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. -For each, kvm_device_attr.addr points to an int32_t file descriptor -for the VFIO group. + KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN: sets a liobn for a VFIO group + kvm_device_attr.addr points to a struct: + struct kvm_vfio_spapr_tce_liobn { + __u32 argsz; + __s32 fd; + __u32 liobn; + __u64 start_addr; + }; + where + @argsz is the size of kvm_vfio_spapr_tce_liobn; + @fd is a file descriptor for a VFIO group; + @liobn is a logical bus id to be associated with the group; + @start_addr is a DMA window offset on the IO (PCI) bus. diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 301fa6b..f708e61 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -64,6 +64,7 @@ config KVM_BOOK3S_64 select KVM select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE select SPAPR_TCE_IOMMU if IOMMU_SUPPORT + select KVM_VFIO if VFIO ---help--- Support running unmodified book3s_64 and book3s_32 guest kernels in virtual machines on book3s_64 host processors. 
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index ce569b6..d55c097 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -97,6 +97,9 @@ endif kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ book3s_xics.o +kvm-book3s_64-objs-$(CONFIG_KVM_VFIO) += \ + $(KVM)/vfio.o \ + kvm-book3s_64-module-objs += \ $(KVM)/kvm_main.o \ $(KVM)/eventfd.o \ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3beb542..c1ad9b7 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -949,9 +949,17 @@ struct kvm_device_attr { #define KVM_DEV_VFIO_GROUP1 #define KVM_DEV_VFIO_GROUP_ADD 1 #define KVM_DEV_VFIO_GROUP_DEL 2 +#define KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN 3 #define KVM_DEV_TYPE_ARM_VGIC_V2 5 #define KVM_DEV_TYPE_FLIC 6 +struct kvm_vfio_spapr_tce_liobn { + __u32 argsz; + __s32 fd; + __u32 liobn; + __u64 start_addr; +}; + /* * ioctls for VM fds */ diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index ba1a93f..43a224b 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -19,6 +19,10 @@ #include linux/uaccess.h #include linux/vfio.h +#ifdef CONFIG_SPAPR_TCE_IOMMU +#include asm/kvm_ppc.h +#endif + struct kvm_vfio_group { struct list_head node; struct vfio_group
[PATCH v1 11/13] KVM: PPC: Associate IOMMU group with guest copy of TCE table
The existing in-kernel TCE table for emulated devices contains guest physical addresses which are accesses by emulated devices. Since we need to keep this information for VFIO devices too in order to implement H_GET_TCE, we are reusing it. This adds iommu_group* and iommu_table* pointers to kvmppc_spapr_tce_table. This adds kvm_spapr_tce_attach_iommu_group() helper to initialize the pointers. This puts the group when guest copy of TCE table is destroyed which happens when TCE table fd is closed. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/powerpc/include/asm/kvm_ppc.h | 5 + arch/powerpc/kvm/book3s_64_vio.c| 28 3 files changed, 35 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8a3b465..8d8eee9 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -184,6 +184,8 @@ struct kvmppc_spapr_tce_table { u32 page_shift; u64 offset; /* in pages */ u64 size; /* in pages */ + struct iommu_table *tbl; + struct iommu_group *refgrp;/* reference counting only */ struct page *pages[0]; }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 86f5015..92be7f5 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -129,6 +129,11 @@ extern int kvmppc_spapr_tce_init(struct kvm_vcpu *vcpu); extern void kvmppc_spapr_tce_free(struct kvm_vcpu *vcpu); extern void kvmppc_iommu_hugepages_init(struct kvm_arch *ka); extern void kvmppc_iommu_hugepages_cleanup(struct kvm_arch *ka); +struct iommu_group; +extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, + unsigned long liobn, + phys_addr_t start_addr, + struct iommu_group *grp); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table( diff --git a/arch/powerpc/kvm/book3s_64_vio.c 
b/arch/powerpc/kvm/book3s_64_vio.c index 573fd6d..b7de38e 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -28,6 +28,7 @@ #include linux/hugetlb.h #include linux/list.h #include linux/anon_inodes.h +#include linux/iommu.h #include asm/tlbflush.h #include asm/kvm_ppc.h @@ -205,6 +206,10 @@ static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) mutex_lock(kvm-lock); list_del(stt-list); + + if (stt-refgrp) + iommu_group_put(stt-refgrp); + for (i = 0; i npages; i++) __free_page(stt-pages[i]); @@ -253,6 +258,29 @@ static const struct file_operations kvm_spapr_tce_fops = { .release= kvm_spapr_tce_release, }; +extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, + unsigned long liobn, + phys_addr_t start_addr, + struct iommu_group *grp) +{ + struct kvmppc_spapr_tce_table *stt = NULL; + + /* Check this LIOBN hasn't been previously allocated */ + list_for_each_entry(stt, kvm-arch.spapr_tce_tables, list) { + if (stt-liobn == liobn) { + struct spapr_tce_iommu_group *data; + + data = iommu_group_get_iommudata(grp); + BUG_ON(!data); + stt-tbl = data-ops-get_table(data, start_addr); + stt-refgrp = grp; + return 0; + } + } + + return -ENODEV; +} + long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args) { -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v1 13/13] KVM: PPC: Add support for IOMMU in-kernel handling
This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT and H_STUFF_TCE requests targeted at an IOMMU TCE table without passing them to user space which saves time on switching to user space and back. Both real and virtual modes are supported. The kernel tries to handle a TCE request in the real mode, if fails it passes the request to the virtual mode to complete the operation. If the virtual mode handler fails, the request is passed to user space. The first user of this is VFIO on POWER. Trampolines to the VFIO external user API functions are required for this patch. This adds a SPAPR TCE IOMMU KVM device to associate a logical bus number (LIOBN) with a VFIO IOMMU group fd and enable in-kernel handling of map/unmap requests. The device supports a single attribute which is a struct with LIOBN and IOMMU fd. When the attribute is set, the device establishes the connection between KVM and VFIO. Tests show that this patch increases transmission speed from 220MB/s to 750..1020MB/s on 10Gb network (Chelsea CXGB3 10Gb ethernet card). 
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Changes: v12: * reworked for the latest VFIO KVM device v11: * removed VFIO_IOMMU capability * fixed comments from Gleb * added @type to kvmppc_spapr_tce_table struct and split it into 2 parts (emulated, iommu) v10: * all IOMMU TCE links are handled by one KVM device now * KVM device has its own list of TCE descriptors * the search-by-liobn function was extended to search through emulated and IOMMU lists v9: * KVM_CAP_SPAPR_TCE_IOMMU ioctl to KVM replaced with SPAPR TCE IOMMU KVM device * release_spapr_tce_table() is not shared between different TCE types * reduced the patch size by moving KVM device bits and VFIO external API trampolines to separate patches * moved documentation from Documentation/virtual/kvm/api.txt to Documentation/virtual/kvm/devices/spapr_tce_iommu.txt v8: * fixed warnings from check_patch.pl 2013/07/11: * removed multiple #ifdef IOMMU_API as IOMMU_API is always enabled for KVM_BOOK3S_64 * kvmppc_gpa_to_hva_and_get also returns host phys address. Not much sense for this here but the next patch for hugepages support will use it more. 2013/07/06: * added realmode arch_spin_lock to protect TCE table from races in real and virtual modes * POWERPC IOMMU API is changed to support real mode * iommu_take_ownership and iommu_release_ownership are protected by iommu_table's locks * VFIO external user API use rewritten * multiple small fixes 2013/06/27: * tce_list page is referenced now in order to protect it from accident invalidation during H_PUT_TCE_INDIRECT execution * added use of the external user VFIO API 2013/06/05: * changed capability number * changed ioctl number * update the doc article number 2013/05/20: * removed get_user() from real mode handlers * kvm_vcpu_arch::tce_tmp usage extended. 
Now real mode handler puts there translated TCEs, tries realmode_get_page() on those and if it fails, it passes control over the virtual mode handler which tries to finish the request handling * kvmppc_lookup_pte() now does realmode_get_page() protected by BUSY bit on a page * The only reason to pass the request to user mode now is when the user mode did not register TCE table in the kernel, in all other cases the virtual mode handler is expected to do the job Conflicts: arch/powerpc/include/asm/kvm_host.h arch/powerpc/kvm/book3s_64_vio.c --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_64_vio.c| 177 ++-- arch/powerpc/kvm/book3s_64_vio_hv.c | 130 ++ 3 files changed, 298 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8d8eee9..6056114 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -726,6 +726,7 @@ struct kvm_vcpu_arch { */ } tce_rm_fail; /* failed stage of request processing */ struct page *tce_rm_list_pg;/* unreferenced page from realmode */ + unsigned long tce_tmp_num; /* valid entries number */ #endif #if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) || \ defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index b7de38e..90e7ad1 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -21,7 +21,6 @@ #include linux/string.h #include linux/kvm.h #include linux/kvm_host.h - #include linux/highmem.h #include linux/gfp.h #include linux/slab.h @@ -29,6 +28,8 @@ #include linux/list.h #include linux/anon_inodes.h #include linux/iommu.h +#include linux/module.h +#include linux/file.h #include asm/tlbflush.h #include asm/kvm_ppc.h @@ -347,6 +348,8 @@ fail: * * If pg!=NULL, tries to increase page counter via get_user_pages_fast() * and returns ERROR_ADDR if failed. + * + * if pg!=NULLphpa!=NULL,
Re: [PATCH v1 01/13] KVM: PPC: Account TCE pages in locked_vm
On 07/15/2014 07:25 PM, Alexey Kardashevskiy wrote: Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Just realized this should go to powernv: vfio: Add Dynamic DMA windows (DDW). And neither patchset accounts DDW in locked_vm, need to decide how... --- arch/powerpc/kvm/book3s_64_vio.c | 35 ++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 2137836..4ca33f1 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -73,18 +73,48 @@ static long kvmppc_stt_npages(unsigned long window_size) * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } +/* + * Checks ulimit in order not to let the user space to pin all + * available memory for TCE tables. + */ +static long kvmppc_account_memlimit(long npages) +{ + unsigned long ret = 0, locked, lock_limit; + + if (!current-mm) + return -ESRCH; /* process exited */ + + down_write(current-mm-mmap_sem); + locked = current-mm-locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) PAGE_SHIFT; + if (locked lock_limit !capable(CAP_IPC_LOCK)) { + pr_warn(RLIMIT_MEMLOCK (%ld) exceeded\n, + rlimit(RLIMIT_MEMLOCK)); + ret = -ENOMEM; + } else { + current-mm-locked_vm += npages; + } + up_write(current-mm-mmap_sem); + + return ret; +} + static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) { struct kvm *kvm = stt-kvm; int i; + long npages = kvmppc_stt_npages(stt-window_size); mutex_lock(kvm-lock); list_del(stt-list); - for (i = 0; i kvmppc_stt_npages(stt-window_size); i++) + for (i = 0; i npages; i++) __free_page(stt-pages[i]); + kfree(stt); mutex_unlock(kvm-lock); + kvmppc_account_memlimit(-(npages + 1)); + kvm_put_kvm(kvm); } @@ -140,6 +170,9 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, } npages = kvmppc_stt_npages(args-window_size); + ret = kvmppc_account_memlimit(npages + 1); + if (ret) + goto fail; stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); -- Alexey ___ Linuxppc-dev 
mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 00/12] vfio: pci: Enable DDW and in-kernel acceleration
This makes use of kernel patchsets: [PATCH v1 00/16] powernv: vfio: Add Dynamic DMA windows (DDW) [PATCH v1 0/7] powerpc/iommu: kvm: Enable MultiTCE support [PATCH v1 00/13] powerpc: kvm: Enable in-kernel acceleration for VFIO I am posting it for reference here, reviews are still welcome but not required :) Alexey Kardashevskiy (12): spapr_iommu: Disable in-kernel IOMMU tables for 4GB windows spapr_pci: Make find_phb()/find_dev() public spapr_iommu: Make spapr_tce_find_by_liobn() public linux headers update for DDW spapr_rtas: Add Dynamic DMA windows (DDW) RTAS calls support spapr: Add ddw machine option spapr_pci: Enable DDW spapr_pci_vfio: Enable DDW vfio: Enable DDW ioctls to VFIO IOMMU driver headers: update for KVM_CAP_SPAPR_TCE_64 and VFIO KVM device target-ppc: kvm: make use of KVM_CREATE_SPAPR_TCE_64 vfio: Enable in-kernel acceleration via VFIO KVM device hw/misc/vfio.c| 45 ++ hw/ppc/Makefile.objs | 3 + hw/ppc/spapr.c| 15 ++ hw/ppc/spapr_iommu.c | 6 +- hw/ppc/spapr_pci.c| 84 +-- hw/ppc/spapr_pci_vfio.c | 95 hw/ppc/spapr_rtas_ddw.c | 296 ++ include/hw/misc/vfio.h| 5 + include/hw/pci-host/spapr.h | 25 include/hw/ppc/spapr.h| 8 +- linux-headers/asm-mips/kvm_para.h | 6 +- linux-headers/asm-powerpc/kvm.h | 9 ++ linux-headers/linux/kvm.h | 12 ++ linux-headers/linux/kvm_para.h| 3 + linux-headers/linux/vfio.h| 37 - target-ppc/kvm.c | 47 -- target-ppc/kvm_ppc.h | 10 +- trace-events | 4 + vl.c | 4 + 19 files changed, 683 insertions(+), 31 deletions(-) create mode 100644 hw/ppc/spapr_rtas_ddw.c -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 05/12] spapr_rtas: Add Dynamic DMA windows (DDW) RTAS calls support
spapr_pci_vfio: Support dynamic DMA window This adds support for Dynamic DMA Windows (DDW) option defined by the SPAPR specification which allows to have additional DMA windows besides the default and small one which can only handle 4K pages and which should completely fit into first 32bit of PCI address space which makes it a less than perfect solution for high-speed PCI devices. The existing implementation of DDW in the guest tries to create one huge DMA window with 64K or 16MB pages and map the entire guest RAM to it. If this operation succeeds, the guest switches to dma_direct_ops and never calls TCE hypercalls (H_PUT_TCE,...). This enables VFIO devices to use the entire RAM and not spend time on mapping/unmapping. This adds 4 RTAS handlers: * ibm,query-pe-dma-window * ibm,create-pe-dma-window * ibm,remove-pe-dma-window * ibm,reset-pe-dma-window These are registered from qapi_init callback. This adds @ddw_supported property to sPAPRPHBState to enable DDW feature. This adds @ddw_reset_supported property to sPAPRPHBState to enable DDW reset extension (TODO: debug). This bumps migration descriptor version as there are 2 new properties. This adds a notifier for VFIO to provide path for calling DDW-related ioctls via VFIO container fd. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Reset is not implemented yet and it is questionable if we really want it. 
--- hw/ppc/Makefile.objs| 3 + hw/ppc/spapr_rtas_ddw.c | 296 include/hw/pci-host/spapr.h | 18 +++ include/hw/ppc/spapr.h | 7 +- trace-events| 4 + 5 files changed, 326 insertions(+), 2 deletions(-) create mode 100644 hw/ppc/spapr_rtas_ddw.c diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs index edd44d0..9773294 100644 --- a/hw/ppc/Makefile.objs +++ b/hw/ppc/Makefile.objs @@ -7,6 +7,9 @@ obj-$(CONFIG_PSERIES) += spapr_pci.o ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy) obj-y += spapr_pci_vfio.o endif +ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES), yy) +obj-y += spapr_rtas_ddw.o +endif # PowerPC 4xx boards obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o obj-y += ppc4xx_pci.o diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c new file mode 100644 index 000..943af2c --- /dev/null +++ b/hw/ppc/spapr_rtas_ddw.c @@ -0,0 +1,296 @@ +/* + * QEMU sPAPR Dynamic DMA windows support + * + * Copyright (c) 2014 Alexey Kardashevskiy, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see http://www.gnu.org/licenses/. 
+ */ + +#include hw/ppc/spapr.h +#include hw/pci-host/spapr.h +#include trace.h + +static inline uint32_t spapr_iommu_fixmask(uint32_t cur_mask, + struct ppc_one_seg_page_size *sps, + uint32_t query_mask, + int shift, + uint32_t add_mask) +{ +if ((sps-page_shift == shift) (query_mask add_mask)) { +cur_mask |= add_mask; +} +return cur_mask; +} + +static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu, + sPAPREnvironment *spapr, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ +CPUPPCState *env = cpu-env; +sPAPRPHBState *sphb; +sPAPRPHBClass *spc; +uint64_t buid; +uint32_t addr, pgmask = 0; +uint32_t windows_available = 0, page_size_mask = 0; +long ret, i; + +if ((nargs != 3) || (nret != 5)) { +goto param_error_exit; +} + +buid = ((uint64_t)rtas_ld(args, 1) 32) | rtas_ld(args, 2); +addr = rtas_ld(args, 0); +sphb = spapr_pci_find_phb(spapr, buid); +if (!sphb) { +goto param_error_exit; +} + +spc = SPAPR_PCI_HOST_BRIDGE_GET_CLASS(sphb); +if (!spc-ddw_query) { +goto hw_error_exit; +} + +ret = spc-ddw_query(sphb, windows_available, page_size_mask); +trace_spapr_iommu_ddw_query(buid, addr, windows_available, +page_size_mask, pgmask, ret); +if (ret) { +goto hw_error_exit; +} + +/* DBG! */ +if (!(page_size_mask DDW_PGSIZE_16M)) { +goto hw_error_exit; +} + +/* Work out
[PATCH QEMU 02/12] spapr_pci: Make find_phb()/find_dev() public
This makes find_phb()/find_dev() public and changed its names to spapr_pci_find_phb()/spapr_pci_find_dev() as they are going to be used from other parts of QEMU such as VFIO DDW (dynamic DMA window) or VFIO PCI error injection or VFIO EEH handling - in all these cases there are RTAS calls which are addressed to BUID+config_addr in IEEE1275 format. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/ppc/spapr_pci.c | 22 +++--- include/hw/pci-host/spapr.h | 4 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 9ed39a9..230b59c 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -47,7 +47,7 @@ #define RTAS_TYPE_MSI 1 #define RTAS_TYPE_MSIX 2 -static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid) +sPAPRPHBState *spapr_pci_find_phb(sPAPREnvironment *spapr, uint64_t buid) { sPAPRPHBState *sphb; @@ -61,10 +61,10 @@ static sPAPRPHBState *find_phb(sPAPREnvironment *spapr, uint64_t buid) return NULL; } -static PCIDevice *find_dev(sPAPREnvironment *spapr, uint64_t buid, - uint32_t config_addr) +PCIDevice *spapr_pci_find_dev(sPAPREnvironment *spapr, uint64_t buid, + uint32_t config_addr) { -sPAPRPHBState *sphb = find_phb(spapr, buid); +sPAPRPHBState *sphb = spapr_pci_find_phb(spapr, buid); PCIHostState *phb = PCI_HOST_BRIDGE(sphb); int bus_num = (config_addr 16) 0xFF; int devfn = (config_addr 8) 0xFF; @@ -95,7 +95,7 @@ static void finish_read_pci_config(sPAPREnvironment *spapr, uint64_t buid, return; } -pci_dev = find_dev(spapr, buid, addr); +pci_dev = spapr_pci_find_dev(spapr, buid, addr); addr = rtas_pci_cfgaddr(addr); if (!pci_dev || (addr % size) || (addr = pci_config_size(pci_dev))) { @@ -162,7 +162,7 @@ static void finish_write_pci_config(sPAPREnvironment *spapr, uint64_t buid, return; } -pci_dev = find_dev(spapr, buid, addr); +pci_dev = spapr_pci_find_dev(spapr, buid, addr); addr = rtas_pci_cfgaddr(addr); if (!pci_dev || (addr % size) || (addr = pci_config_size(pci_dev))) { @@ 
-281,9 +281,9 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr, } /* Fins sPAPRPHBState */ -phb = find_phb(spapr, buid); +phb = spapr_pci_find_phb(spapr, buid); if (phb) { -pdev = find_dev(spapr, buid, config_addr); +pdev = spapr_pci_find_dev(spapr, buid, config_addr); } if (!phb || !pdev) { rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); @@ -377,9 +377,9 @@ static void rtas_ibm_query_interrupt_source_number(PowerPCCPU *cpu, spapr_pci_msi *msi; /* Find sPAPRPHBState */ -phb = find_phb(spapr, buid); +phb = spapr_pci_find_phb(spapr, buid); if (phb) { -pdev = find_dev(spapr, buid, config_addr); +pdev = spapr_pci_find_dev(spapr, buid, config_addr); } if (!phb || !pdev) { rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); @@ -553,7 +553,7 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) return; } -if (find_phb(spapr, sphb-buid)) { +if (spapr_pci_find_phb(spapr, sphb-buid)) { error_setg(errp, PCI host bridges must have unique BUIDs); return; } diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h index 32f0aa7..14c2ab0 100644 --- a/include/hw/pci-host/spapr.h +++ b/include/hw/pci-host/spapr.h @@ -122,4 +122,8 @@ void spapr_pci_msi_init(sPAPREnvironment *spapr, hwaddr addr); void spapr_pci_rtas_init(void); +sPAPRPHBState *spapr_pci_find_phb(sPAPREnvironment *spapr, uint64_t buid); +PCIDevice *spapr_pci_find_dev(sPAPREnvironment *spapr, uint64_t buid, + uint32_t config_addr); + #endif /* __HW_SPAPR_PCI_H__ */ -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 04/12] linux headers update for DDW
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- linux-headers/linux/vfio.h | 37 - 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index 26c218e..f0aa97d 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -448,13 +448,48 @@ struct vfio_iommu_type1_dma_unmap { */ struct vfio_iommu_spapr_tce_info { __u32 argsz; - __u32 flags;/* reserved for future use */ + __u32 flags; +#define VFIO_IOMMU_SPAPR_TCE_FLAG_DDW 1 /* Support dynamic windows */ __u32 dma32_window_start; /* 32 bit window start (bytes) */ __u32 dma32_window_size;/* 32 bit window size (bytes) */ }; #define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) +/* + * Dynamic DMA windows + */ +struct vfio_iommu_spapr_tce_query { + __u32 argsz; + /* out */ + __u32 windows_available; + __u32 page_size_mask; +}; +#define VFIO_IOMMU_SPAPR_TCE_QUERY _IO(VFIO_TYPE, VFIO_BASE + 17) + +struct vfio_iommu_spapr_tce_create { + __u32 argsz; + /* in */ + __u32 page_shift; + __u32 window_shift; + /* out */ + __u64 start_addr; + +}; +#define VFIO_IOMMU_SPAPR_TCE_CREATE_IO(VFIO_TYPE, VFIO_BASE + 18) + +struct vfio_iommu_spapr_tce_remove { + __u32 argsz; + /* in */ + __u64 start_addr; +}; +#define VFIO_IOMMU_SPAPR_TCE_REMOVE_IO(VFIO_TYPE, VFIO_BASE + 19) + +struct vfio_iommu_spapr_tce_reset { + __u32 argsz; +}; +#define VFIO_IOMMU_SPAPR_TCE_RESET _IO(VFIO_TYPE, VFIO_BASE + 20) + /* * */ #endif /* VFIO_H */ -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 03/12] spapr_iommu: Make spapr_tce_find_by_liobn() public
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/ppc/spapr_iommu.c | 2 +- include/hw/ppc/spapr.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index 36f5d27..588d442 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -40,7 +40,7 @@ enum sPAPRTCEAccess { static QLIST_HEAD(spapr_tce_tables, sPAPRTCETable) spapr_tce_tables; -static sPAPRTCETable *spapr_tce_find_by_liobn(uint32_t liobn) +sPAPRTCETable *spapr_tce_find_by_liobn(uint32_t liobn) { sPAPRTCETable *tcet; diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index bbba51a..9c5686e 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -467,6 +467,7 @@ struct sPAPRTCETable { QLIST_ENTRY(sPAPRTCETable) list; }; +sPAPRTCETable *spapr_tce_find_by_liobn(uint32_t liobn); void spapr_events_init(sPAPREnvironment *spapr); void spapr_events_fdt_skel(void *fdt, uint32_t epow_irq); int spapr_h_cas_compose_response(target_ulong addr, target_ulong size); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 06/12] spapr: Add ddw machine option
This option will enable Dynamic DMA windows (DDW) support for pseries machine. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/ppc/spapr.c | 15 +++ vl.c | 4 2 files changed, 19 insertions(+) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index d01978f..fec295b 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -100,6 +100,7 @@ struct sPAPRMachineState { /* public */ char *kvm_type; +bool ddw_supported; }; sPAPREnvironment *spapr; @@ -1570,10 +1571,24 @@ static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp) sm-kvm_type = g_strdup(value); } +static bool spapr_machine_get_ddw(Object *obj, Error **errp) +{ +sPAPRMachineState *sms = SPAPR_MACHINE(obj); +return sms-ddw_supported; +} + +static void spapr_machine_set_ddw(Object *obj, bool value, Error **errp) +{ +sPAPRMachineState *sms = SPAPR_MACHINE(obj); +sms-ddw_supported = value; +} + static void spapr_machine_initfn(Object *obj) { object_property_add_str(obj, kvm-type, spapr_get_kvm_type, spapr_set_kvm_type, NULL); +object_property_add_bool(obj, ddw, spapr_machine_get_ddw, + spapr_machine_set_ddw, NULL); } static void spapr_machine_class_init(ObjectClass *oc, void *data) diff --git a/vl.c b/vl.c index 6e084c2..a615fb1 100644 --- a/vl.c +++ b/vl.c @@ -383,6 +383,10 @@ static QemuOptsList qemu_machine_opts = { .name = kvm-type, .type = QEMU_OPT_STRING, .help = Specifies the KVM virtualization mode (HV, PR), +}, { +.name = ddw, +.type = QEMU_OPT_BOOL, +.help = Enable Dynamic DMA windows support (pseries only), },{ .name = PC_MACHINE_MAX_RAM_BELOW_4G, .type = QEMU_OPT_SIZE, -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 01/12] spapr_iommu: Disable in-kernel IOMMU tables for 4GB windows
The existing KVM_CREATE_SPAPR_TCE ioctl only supports 4G windows max. We are going to add huge DMA windows support so this will create small window and unexpectedly fail later. This disables KVM_CREATE_SPAPR_TCE for windows bigger than 4GB. Since those windows are normally mapped at the boot time, there will be no performance impact. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/ppc/spapr_iommu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index f6e32a4..36f5d27 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -113,11 +113,11 @@ static MemoryRegionIOMMUOps spapr_iommu_ops = { static int spapr_tce_table_realize(DeviceState *dev) { sPAPRTCETable *tcet = SPAPR_TCE_TABLE(dev); +uint64_t window_size = tcet-nb_table tcet-page_shift; -if (kvm_enabled()) { +if (kvm_enabled() !(window_size 32)) { tcet-table = kvmppc_create_spapr_tce(tcet-liobn, - tcet-nb_table - tcet-page_shift, + window_size, tcet-fd, tcet-vfio_accel); } -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 08/12] spapr_pci_vfio: Enable DDW
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/ppc/spapr_pci_vfio.c | 73 + 1 file changed, 73 insertions(+) diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c index d3bddf2..b72aff0 100644 --- a/hw/ppc/spapr_pci_vfio.c +++ b/hw/ppc/spapr_pci_vfio.c @@ -71,6 +71,75 @@ static void spapr_phb_vfio_finish_realize(sPAPRPHBState *sphb, Error **errp) spapr_tce_get_iommu(tcet)); } +static int spapr_pci_vfio_ddw_query(sPAPRPHBState *sphb, +uint32_t *windows_available, +uint32_t *page_size_mask) +{ +sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb); +struct vfio_iommu_spapr_tce_query query = { .argsz = sizeof(query) }; +int ret; + +ret = vfio_container_ioctl(sphb-iommu_as, svphb-iommugroupid, + VFIO_IOMMU_SPAPR_TCE_QUERY, query); +if (ret) { +return ret; +} + +*windows_available = query.windows_available; +*page_size_mask = query.page_size_mask; + +return ret; +} + +static int spapr_pci_vfio_ddw_create(sPAPRPHBState *sphb, uint32_t page_shift, + uint32_t window_shift, uint32_t liobn, + sPAPRTCETable **ptcet) +{ +sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb); +struct vfio_iommu_spapr_tce_create create = { +.argsz = sizeof(create), +.page_shift = page_shift, +.window_shift = window_shift, +.start_addr = 0 +}; +int ret; + +ret = vfio_container_ioctl(sphb-iommu_as, svphb-iommugroupid, + VFIO_IOMMU_SPAPR_TCE_CREATE, create); +if (ret) { +return ret; +} + +*ptcet = spapr_tce_new_table(DEVICE(sphb), liobn, create.start_addr, + page_shift, 1 (window_shift - page_shift), + true); +memory_region_add_subregion(sphb-iommu_root, (*ptcet)-bus_offset, +spapr_tce_get_iommu(*ptcet)); + +return ret; +} + +static int spapr_pci_vfio_ddw_remove(sPAPRPHBState *sphb, sPAPRTCETable *tcet) +{ +sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb); +struct vfio_iommu_spapr_tce_remove remove = { +.argsz = sizeof(remove), +.start_addr = tcet-bus_offset +}; + +return vfio_container_ioctl(sphb-iommu_as, svphb-iommugroupid, +VFIO_IOMMU_SPAPR_TCE_REMOVE, 
remove); +} + +static int spapr_pci_vfio_ddw_reset(sPAPRPHBState *sphb) +{ +sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb); +struct vfio_iommu_spapr_tce_reset reset = { .argsz = sizeof(reset) }; + +return vfio_container_ioctl(sphb-iommu_as, svphb-iommugroupid, +VFIO_IOMMU_SPAPR_TCE_RESET, reset); +} + static void spapr_phb_vfio_reset(DeviceState *qdev) { /* Do nothing */ @@ -84,6 +153,10 @@ static void spapr_phb_vfio_class_init(ObjectClass *klass, void *data) dc-props = spapr_phb_vfio_properties; dc-reset = spapr_phb_vfio_reset; spc-finish_realize = spapr_phb_vfio_finish_realize; +spc-ddw_query = spapr_pci_vfio_ddw_query; +spc-ddw_create = spapr_pci_vfio_ddw_create; +spc-ddw_remove = spapr_pci_vfio_ddw_remove; +spc-ddw_reset = spapr_pci_vfio_ddw_reset; } static const TypeInfo spapr_phb_vfio_info = { -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 07/12] spapr_pci: Enable DDW
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/ppc/spapr_pci.c | 62 + include/hw/pci-host/spapr.h | 3 +++ 2 files changed, 65 insertions(+) diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 230b59c..038a485 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -22,6 +22,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ +#include sysemu/sysemu.h #include hw/hw.h #include hw/pci/pci.h #include hw/pci/msi.h @@ -781,6 +782,42 @@ static const char *spapr_phb_root_bus_path(PCIHostState *host_bridge, return sphb-dtbusname; } +static int spapr_pci_ddw_query(sPAPRPHBState *sphb, + uint32_t *windows_available, + uint32_t *page_size_mask) +{ +*windows_available = 1; +*page_size_mask = DDW_PGSIZE_16M; + +return 0; +} + +static int spapr_pci_ddw_create(sPAPRPHBState *sphb, uint32_t page_shift, +uint32_t window_shift, uint32_t liobn, +sPAPRTCETable **ptcet) +{ +*ptcet = spapr_tce_new_table(DEVICE(sphb), liobn, SPAPR_PCI_TCE64_START, + page_shift, 1 (window_shift - page_shift), + true); +if (!*ptcet) { +return -1; +} +memory_region_add_subregion(sphb-iommu_root, (*ptcet)-bus_offset, +spapr_tce_get_iommu(*ptcet)); + +return 0; +} + +static int spapr_pci_ddw_remove(sPAPRPHBState *sphb, sPAPRTCETable *tcet) +{ +return 0; +} + +static int spapr_pci_ddw_reset(sPAPRPHBState *sphb) +{ +return 0; +} + static void spapr_phb_class_init(ObjectClass *klass, void *data) { PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass); @@ -795,6 +832,10 @@ static void spapr_phb_class_init(ObjectClass *klass, void *data) set_bit(DEVICE_CATEGORY_BRIDGE, dc-categories); dc-cannot_instantiate_with_device_add_yet = false; spc-finish_realize = spapr_phb_finish_realize; +spc-ddw_query = spapr_pci_ddw_query; +spc-ddw_create = spapr_pci_ddw_create; +spc-ddw_remove = spapr_pci_ddw_remove; +spc-ddw_reset = spapr_pci_ddw_reset; } static const TypeInfo spapr_phb_info = { @@ -878,6 +919,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, 
uint32_t interrupt_map_mask[] = { cpu_to_be32(b_d(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)}; uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7]; +uint32_t ddw_applicable[] = { +RTAS_IBM_QUERY_PE_DMA_WINDOW, +RTAS_IBM_CREATE_PE_DMA_WINDOW, +RTAS_IBM_REMOVE_PE_DMA_WINDOW +}; +uint32_t ddw_extensions[] = { 1, RTAS_IBM_RESET_PE_DMA_WINDOW }; +sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_GET_CLASS(phb); +QemuOpts *machine_opts = qemu_get_machine_opts(); /* Start populating the FDT */ sprintf(nodename, pci@% PRIx64, phb-buid); @@ -907,6 +956,19 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, _FDT(fdt_setprop_cell(fdt, bus_off, ibm,pci-config-space-type, 0x1)); _FDT(fdt_setprop_cell(fdt, bus_off, ibm,pe-total-#msi, XICS_IRQS)); +/* Dynamic DMA window */ +if (qemu_opt_get_bool(machine_opts, ddw, true) +spc-ddw_query spc-ddw_create spc-ddw_remove) { +_FDT(fdt_setprop(fdt, bus_off, ibm,ddw-applicable, ddw_applicable, + sizeof(ddw_applicable))); + +if (spc-ddw_reset) { +/* When enabled, the guest will remove the default 32bit window */ +_FDT(fdt_setprop(fdt, bus_off, ibm,ddw-extensions, + ddw_extensions, sizeof(ddw_extensions))); +} +} + /* Build the interrupt-map, this must matches what is done * in pci_spapr_map_irq */ diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h index 119d326..f494cbb 100644 --- a/include/hw/pci-host/spapr.h +++ b/include/hw/pci-host/spapr.h @@ -125,6 +125,9 @@ struct sPAPRPHBVFIOState { #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x8000ULL +/* Default 64bit dynamic window offset */ +#define SPAPR_PCI_TCE64_START0x8000ULL + static inline qemu_irq spapr_phb_lsi_qirq(struct sPAPRPHBState *phb, int pin) { return xics_get_qirq(spapr-icp, phb-lsi_table[pin].irq); -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 11/12] target-ppc: kvm: make use of KVM_CREATE_SPAPR_TCE_64
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/ppc/spapr_iommu.c | 7 --- target-ppc/kvm.c | 47 --- target-ppc/kvm_ppc.h | 10 +++--- 3 files changed, 47 insertions(+), 17 deletions(-) diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index 588d442..1710595 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -113,11 +113,12 @@ static MemoryRegionIOMMUOps spapr_iommu_ops = { static int spapr_tce_table_realize(DeviceState *dev) { sPAPRTCETable *tcet = SPAPR_TCE_TABLE(dev); -uint64_t window_size = tcet-nb_table tcet-page_shift; -if (kvm_enabled() !(window_size 32)) { +if (kvm_enabled()) { tcet-table = kvmppc_create_spapr_tce(tcet-liobn, - window_size, + tcet-nb_table, + tcet-bus_offset, + tcet-page_shift, tcet-fd, tcet-vfio_accel); } diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index 42718f7..cfc2599 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -62,6 +62,7 @@ static int cap_booke_sregs; static int cap_ppc_smt; static int cap_ppc_rma; static int cap_spapr_tce; +static int cap_spapr_tce_64; static int cap_spapr_multitce; static int cap_spapr_vfio; static int cap_hior; @@ -101,6 +102,7 @@ int kvm_arch_init(KVMState *s) cap_ppc_smt = kvm_check_extension(s, KVM_CAP_PPC_SMT); cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA); cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE); +cap_spapr_tce_64 = kvm_check_extension(s, KVM_CAP_SPAPR_TCE_64); cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE); cap_spapr_vfio = false; cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG); @@ -1655,13 +1657,10 @@ bool kvmppc_spapr_use_multitce(void) return cap_spapr_multitce; } -void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd, - bool vfio_accel) +void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_shift, + uint64_t bus_offset, uint32_t page_shift, + int *pfd, bool vfio_accel) { -struct kvm_create_spapr_tce args = { -.liobn = liobn, -.window_size = window_size, -}; long len; int fd; 
void *table; @@ -1674,14 +1673,40 @@ void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd, return NULL; } -fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE, args); -if (fd 0) { -fprintf(stderr, KVM: Failed to create TCE table for liobn 0x%x\n, -liobn); +if (cap_spapr_tce_64) { +struct kvm_create_spapr_tce_64 args = { +.liobn = liobn, +.page_shift = page_shift, +.offset = bus_offset page_shift, +.size = window_shift, +.flags = 0 +}; +fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE_64, args); +if (fd 0) { +fprintf(stderr, +KVM: Failed to create TCE64 table for liobn 0x%x\n, +liobn); +return NULL; +} +} else if (cap_spapr_tce) { +struct kvm_create_spapr_tce args = { +.liobn = liobn, +.window_size = window_shift page_shift, +}; +if (((window_shift page_shift) != args.window_size) || bus_offset) { +return NULL; +} +fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE, args); +if (fd 0) { +fprintf(stderr, KVM: Failed to create TCE table for liobn 0x%x\n, +liobn); +return NULL; +} +} else { return NULL; } -len = (window_size / SPAPR_TCE_PAGE_SIZE) * sizeof(uint64_t); +len = window_shift * sizeof(uint64_t); /* FIXME: round this up to page size */ table = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h index d9516e7..154f434 100644 --- a/target-ppc/kvm_ppc.h +++ b/target-ppc/kvm_ppc.h @@ -33,8 +33,9 @@ int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu); #ifndef CONFIG_USER_ONLY off_t kvmppc_alloc_rma(void **rma); bool kvmppc_spapr_use_multitce(void); -void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd, - bool vfio_accel); +void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_shift, + uint64_t bus_offset, uint32_t page_shift, + int *pfd, bool vfio_accel); int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size); int kvmppc_reset_htab(int shift_hint); uint64_t kvmppc_rma_size(uint64_t
[PATCH QEMU 12/12] vfio: Enable in-kernel acceleration via VFIO KVM device
TCE hypercalls (H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE) use a logical bus number (LIOBN) to identify which TCE table the request is addressed to. However VFIO kernel driver operates with IOMMU group IDs and has no idea about which LIOBN corresponds to which group. If the host kernel supports in-kernel acceleration for TCE calls, we have to provide the LIOBN to IOMMU mapping information. This makes use of a VFIO KVM device's KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN attribute to set the link between LIOBN and IOMMU group. The vfio_container_spapr_set_liobn() helper is implemented completely in vfio.c because kvm_vfio_spapr_tce_liobn needs a group fd and we do not want to share resources likes that outside vfio.c. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/misc/vfio.c | 41 + hw/ppc/spapr_iommu.c| 1 + hw/ppc/spapr_pci_vfio.c | 22 ++ include/hw/misc/vfio.h | 5 + 4 files changed, 69 insertions(+) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index e7b4d6e..6e9919a 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -4450,3 +4450,44 @@ int vfio_container_ioctl(AddressSpace *as, int32_t groupid, return vfio_container_do_ioctl(as, groupid, req, param); } + +int vfio_container_spapr_set_liobn(AddressSpace *as, + int32_t groupid, + uint64_t liobn, + uint64_t start_addr) +{ +#ifdef CONFIG_KVM +VFIOGroup *group; +int ret; +struct kvm_vfio_spapr_tce_liobn param = { +.argsz = sizeof(param), +.liobn = liobn, +.start_addr = start_addr +}; +struct kvm_device_attr attr = { +.group = KVM_DEV_VFIO_GROUP, +.attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN, +.addr = (uint64_t)(unsigned long)param, +}; + +if (vfio_kvm_device_fd 0) { +return 0; +} + +group = vfio_get_group(groupid, as); +if (!group) { +return -1; +} + +param.fd = group-fd; +ret = ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, attr); +if (ret) { +error_report(vfio: failed to setup liobn for a group: %s, + strerror(errno)); +} + +return ret; +#else +return 0; +#endif +} diff --git a/hw/ppc/spapr_iommu.c 
b/hw/ppc/spapr_iommu.c index 1710595..3c2a9c9 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -126,6 +126,7 @@ static int spapr_tce_table_realize(DeviceState *dev) if (!tcet-table) { size_t table_size = tcet-nb_table * sizeof(uint64_t); tcet-table = g_malloc0(table_size); +tcet-vfio_accel = false; } trace_spapr_iommu_new_table(tcet-liobn, tcet, tcet-table, tcet-fd); diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c index b72aff0..06b4e02 100644 --- a/hw/ppc/spapr_pci_vfio.c +++ b/hw/ppc/spapr_pci_vfio.c @@ -21,6 +21,7 @@ #include hw/pci-host/spapr.h #include linux/vfio.h #include hw/misc/vfio.h +#include qemu/error-report.h static Property spapr_phb_vfio_properties[] = { DEFINE_PROP_INT32(iommu, sPAPRPHBVFIOState, iommugroupid, -1), @@ -69,6 +70,17 @@ static void spapr_phb_vfio_finish_realize(sPAPRPHBState *sphb, Error **errp) /* Register default 32bit DMA window */ memory_region_add_subregion(sphb-iommu_root, tcet-bus_offset, spapr_tce_get_iommu(tcet)); + +if (!tcet-vfio_accel) { +return; +} +ret = vfio_container_spapr_set_liobn(svphb-phb.iommu_as, + svphb-iommugroupid, + tcet-liobn, + tcet-bus_offset); +if (ret) { +error_report(spapr-vfio: failed to create link to IOMMU); +} } static int spapr_pci_vfio_ddw_query(sPAPRPHBState *sphb, @@ -116,6 +128,16 @@ static int spapr_pci_vfio_ddw_create(sPAPRPHBState *sphb, uint32_t page_shift, memory_region_add_subregion(sphb-iommu_root, (*ptcet)-bus_offset, spapr_tce_get_iommu(*ptcet)); +if (!(*ptcet)-vfio_accel) { +return 0; +} +ret = vfio_container_spapr_set_liobn(sphb-iommu_as, svphb-iommugroupid, + liobn, (*ptcet)-bus_offset); +if (ret) { +error_report(spapr-vfio: failed to create link to IOMMU); +ret = 0; +} + return ret; } diff --git a/include/hw/misc/vfio.h b/include/hw/misc/vfio.h index 0b26cd8..8f248e2 100644 --- a/include/hw/misc/vfio.h +++ b/include/hw/misc/vfio.h @@ -6,4 +6,9 @@ extern int vfio_container_ioctl(AddressSpace *as, int32_t groupid, int req, void *param); +extern int 
vfio_container_spapr_set_liobn(AddressSpace *as, + int32_t groupid, + uint64_t liobn, + uint64_t
[PATCH QEMU 10/12] headers: update for KVM_CAP_SPAPR_TCE_64 and VFIO KVM device
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- linux-headers/asm-mips/kvm_para.h | 6 +- linux-headers/asm-powerpc/kvm.h | 9 + linux-headers/linux/kvm.h | 12 linux-headers/linux/kvm_para.h| 3 +++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/linux-headers/asm-mips/kvm_para.h b/linux-headers/asm-mips/kvm_para.h index 14fab8f..dbb2464 100644 --- a/linux-headers/asm-mips/kvm_para.h +++ b/linux-headers/asm-mips/kvm_para.h @@ -1 +1,5 @@ -#include asm-generic/kvm_para.h +#ifndef _ASM_MIPS_KVM_PARA_H +#define _ASM_MIPS_KVM_PARA_H + + +#endif /* _ASM_MIPS_KVM_PARA_H */ diff --git a/linux-headers/asm-powerpc/kvm.h b/linux-headers/asm-powerpc/kvm.h index 2bc4a94..39325bf 100644 --- a/linux-headers/asm-powerpc/kvm.h +++ b/linux-headers/asm-powerpc/kvm.h @@ -333,6 +333,15 @@ struct kvm_create_spapr_tce { __u32 window_size; }; +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u32 page_shift; + __u64 offset; /* in pages */ + __u64 size; /* in pages */ + __u32 flags; +}; + /* for KVM_ALLOCATE_RMA */ struct kvm_allocate_rma { __u64 rma_size; diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index f5d2c38..fd728d3 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -758,6 +758,8 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_VM_ATTRIBUTES 101 #define KVM_CAP_ARM_PSCI_0_2 102 #define KVM_CAP_PPC_FIXUP_HCALL 103 +#define KVM_CAP_SPAPR_TCE_VFIO 104 +#define KVM_CAP_SPAPR_TCE_64 105 #ifdef KVM_CAP_IRQ_ROUTING @@ -947,9 +949,17 @@ struct kvm_device_attr { #define KVM_DEV_VFIO_GROUP1 #define KVM_DEV_VFIO_GROUP_ADD 1 #define KVM_DEV_VFIO_GROUP_DEL 2 +#define KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN 3 #define KVM_DEV_TYPE_ARM_VGIC_V2 5 #define KVM_DEV_TYPE_FLIC 6 +struct kvm_vfio_spapr_tce_liobn { + __u32 argsz; + __s32 fd; + __u32 liobn; + __u64 start_addr; +}; + /* * ioctls for VM fds */ @@ -1031,6 +1041,8 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_PPC_ALLOC_HTAB */ #define 
KVM_PPC_ALLOCATE_HTAB_IOWR(KVMIO, 0xa7, __u32) #define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) +#define KVM_CREATE_SPAPR_TCE_64 _IOW(KVMIO, 0xa8, \ + struct kvm_create_spapr_tce_64) /* Available with KVM_CAP_RMA */ #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) /* Available with KVM_CAP_PPC_HTAB_FD */ diff --git a/linux-headers/linux/kvm_para.h b/linux-headers/linux/kvm_para.h index 2dff783..e61661e 100644 --- a/linux-headers/linux/kvm_para.h +++ b/linux-headers/linux/kvm_para.h @@ -20,6 +20,9 @@ #define KVM_HC_FEATURES3 #define KVM_HC_PPC_MAP_MAGIC_PAGE 4 #define KVM_HC_KICK_CPU5 +#define KVM_HC_MIPS_GET_CLOCK_FREQ 6 +#define KVM_HC_MIPS_EXIT_VM7 +#define KVM_HC_MIPS_CONSOLE_OUTPUT 8 /* * hypercalls use architecture specific -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH QEMU 09/12] vfio: Enable DDW ioctls to VFIO IOMMU driver
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- hw/misc/vfio.c | 4 1 file changed, 4 insertions(+) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 0b9eba0..e7b4d6e 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -4437,6 +4437,10 @@ int vfio_container_ioctl(AddressSpace *as, int32_t groupid, switch (req) { case VFIO_CHECK_EXTENSION: case VFIO_IOMMU_SPAPR_TCE_GET_INFO: +case VFIO_IOMMU_SPAPR_TCE_QUERY: +case VFIO_IOMMU_SPAPR_TCE_CREATE: +case VFIO_IOMMU_SPAPR_TCE_REMOVE: +case VFIO_IOMMU_SPAPR_TCE_RESET: break; default: /* Return an error on unknown requests */ -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] ppc/xmon: use isxdigit/isspace/isalnum from ctype.h
❦ 15 juillet 2014 08:55 GMT, David Laight david.lai...@aculab.com : Use linux/ctype.h instead of defining custom versions of isxdigit/isspace/isalnum. ... -#define isspace(c) (c == ' ' || c == '\t' || c == 10 || c == 13 || c == 0) That is different from the version in linux/ctype.h Especially for 'c == 0', but probably also vertical tab and form feed. OK. Looking more carefully, the one in ctype.h is 9-13 (11 is vertical tab, 12 is form feed), 32 and 160 (non-breaking space, not ASCII). For isxdigit, this is the same. For isalnum, the one in ctype.h does accept non ASCII chars from 223. Also, in xmon.c, isxdigit is defined twice. -- Parenthesise to avoid ambiguity. - The Elements of Programming Style (Kernighan Plauger) ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: Re: [PATCH v5 2/2] [BUGFIX] kprobes: Fix "Failed to find blacklist" error on ia64 and ppc64
(2014/07/15 16:16), Benjamin Herrenschmidt wrote: On Tue, 2014-07-15 at 13:19 +1000, Michael Ellerman wrote: Signed-off-by: Masami Hiramatsu masami.hiramatsu...@hitachi.com Reported-by: Tony Luck tony.l...@gmail.com Tested-by: Tony Luck tony.l...@intel.com Cc: Michael Ellerman m...@ellerman.id.au Tested-by: Michael Ellerman m...@ellerman.id.au Acked-by: Michael Ellerman m...@ellerman.id.au (for powerpc) Ben, can you take this in your tree? Acked-by: Benjamin Herrenschmidt b...@kernel.crashing.org That looks more like generic material. Do we have a kprobes maintainer ? Andrew, do you want to take this ? Yeah, I usually use Ingo's tip tree for kprobes maintenance. Ingo, could you pull this as urgent-for-linus patch? I'm happy to put it in powerpc and send it to Linus tomorrow if nobody cares :-) Thank you! -- Masami HIRAMATSU Software Platform Research Dept. Linux Technology Research Center Hitachi, Ltd., Yokohama Research Laboratory E-mail: masami.hiramatsu...@hitachi.com ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc: Move bad_stack() below the fwnmi_data_area
At the moment the allmodconfig build is failing because we run out of space between altivec_assist() at 0x5700 and the fwnmi_data_area at 0x7000. Fixing it permanently will take some more work, but a quick fix is to move bad_stack() below the fwnmi_data_area. That gives us just enough room with everything enabled. bad_stack() is called from the common exception handlers, but it's a non-conditional branch, so we have plenty of scope to move it further way. Signed-off-by: Michael Ellerman m...@ellerman.id.au --- arch/powerpc/kernel/exceptions-64s.S | 120 +-- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b859b3665be6..647d6c75ed62 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -953,66 +953,6 @@ ppc64_runlatch_on_trampoline: b __ppc64_runlatch_on /* - * Here we have detected that the kernel stack pointer is bad. - * R9 contains the saved CR, r13 points to the paca, - * r10 contains the (bad) kernel stack pointer, - * r11 and r12 contain the saved SRR0 and SRR1. - * We switch to using an emergency stack, save the registers there, - * and call kernel_bad_stack(), which panics. 
- */ -bad_stack: - ld r1,PACAEMERGSP(r13) - subir1,r1,64+INT_FRAME_SIZE - std r9,_CCR(r1) - std r10,GPR1(r1) - std r11,_NIP(r1) - std r12,_MSR(r1) - mfspr r11,SPRN_DAR - mfspr r12,SPRN_DSISR - std r11,_DAR(r1) - std r12,_DSISR(r1) - mflrr10 - mfctr r11 - mfxer r12 - std r10,_LINK(r1) - std r11,_CTR(r1) - std r12,_XER(r1) - SAVE_GPR(0,r1) - SAVE_GPR(2,r1) - ld r10,EX_R3(r3) - std r10,GPR3(r1) - SAVE_GPR(4,r1) - SAVE_4GPRS(5,r1) - ld r9,EX_R9(r3) - ld r10,EX_R10(r3) - SAVE_2GPRS(9,r1) - ld r9,EX_R11(r3) - ld r10,EX_R12(r3) - ld r11,EX_R13(r3) - std r9,GPR11(r1) - std r10,GPR12(r1) - std r11,GPR13(r1) -BEGIN_FTR_SECTION - ld r10,EX_CFAR(r3) - std r10,ORIG_GPR3(r1) -END_FTR_SECTION_IFSET(CPU_FTR_CFAR) - SAVE_8GPRS(14,r1) - SAVE_10GPRS(22,r1) - lhz r12,PACA_TRAP_SAVE(r13) - std r12,_TRAP(r1) - addir11,r1,INT_FRAME_SIZE - std r11,0(r1) - li r12,0 - std r12,0(r11) - ld r2,PACATOC(r13) - ld r11,exception_marker@toc(r2) - std r12,RESULT(r1) - std r11,STACK_FRAME_OVERHEAD-16(r1) -1: addir3,r1,STACK_FRAME_OVERHEAD - bl kernel_bad_stack - b 1b - -/* * Here r13 points to the paca, r9 contains the saved CR, * SRR0 and SRR1 are saved in r11 and r12, * r9 - r13 are saved in paca-exgen. @@ -1636,3 +1576,63 @@ handle_dabr_fault: li r5,SIGSEGV bl bad_page_fault b ret_from_except + +/* + * Here we have detected that the kernel stack pointer is bad. + * R9 contains the saved CR, r13 points to the paca, + * r10 contains the (bad) kernel stack pointer, + * r11 and r12 contain the saved SRR0 and SRR1. + * We switch to using an emergency stack, save the registers there, + * and call kernel_bad_stack(), which panics. 
+ */ +bad_stack: + ld r1,PACAEMERGSP(r13) + subir1,r1,64+INT_FRAME_SIZE + std r9,_CCR(r1) + std r10,GPR1(r1) + std r11,_NIP(r1) + std r12,_MSR(r1) + mfspr r11,SPRN_DAR + mfspr r12,SPRN_DSISR + std r11,_DAR(r1) + std r12,_DSISR(r1) + mflrr10 + mfctr r11 + mfxer r12 + std r10,_LINK(r1) + std r11,_CTR(r1) + std r12,_XER(r1) + SAVE_GPR(0,r1) + SAVE_GPR(2,r1) + ld r10,EX_R3(r3) + std r10,GPR3(r1) + SAVE_GPR(4,r1) + SAVE_4GPRS(5,r1) + ld r9,EX_R9(r3) + ld r10,EX_R10(r3) + SAVE_2GPRS(9,r1) + ld r9,EX_R11(r3) + ld r10,EX_R12(r3) + ld r11,EX_R13(r3) + std r9,GPR11(r1) + std r10,GPR12(r1) + std r11,GPR13(r1) +BEGIN_FTR_SECTION + ld r10,EX_CFAR(r3) + std r10,ORIG_GPR3(r1) +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) + SAVE_8GPRS(14,r1) + SAVE_10GPRS(22,r1) + lhz r12,PACA_TRAP_SAVE(r13) + std r12,_TRAP(r1) + addir11,r1,INT_FRAME_SIZE + std r11,0(r1) + li r12,0 + std r12,0(r11) + ld r2,PACATOC(r13) + ld r11,exception_marker@toc(r2) + std r12,RESULT(r1) + std r11,STACK_FRAME_OVERHEAD-16(r1) +1: addir3,r1,STACK_FRAME_OVERHEAD + bl kernel_bad_stack + b 1b -- 1.9.1 ___ Linuxppc-dev mailing
[PATCH 1/3] powerpc: Update comments in irqflags.h
The comment on TRACE_ENABLE_INTS is incorrect, and appears to have always been incorrect since the code was merged. It probably came from an original out-of-tree patch. Replace it with something that's correct. Also propagate the message to RECONCILE_IRQ_STATE(), because it's potentially subtle. Signed-off-by: Michael Ellerman m...@ellerman.id.au --- arch/powerpc/include/asm/irqflags.h | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/irqflags.h b/arch/powerpc/include/asm/irqflags.h index e20eb95429a8..f2149066fe5d 100644 --- a/arch/powerpc/include/asm/irqflags.h +++ b/arch/powerpc/include/asm/irqflags.h @@ -32,9 +32,8 @@ #endif /* - * Most of the CPU's IRQ-state tracing is done from assembly code; we - * have to call a C function so call a wrapper that saves all the - * C-clobbered registers. + * These are calls to C code, so the caller must be prepared for volatiles to + * be clobbered. */ #define TRACE_ENABLE_INTS TRACE_WITH_FRAME_BUFFER(trace_hardirqs_on) #define TRACE_DISABLE_INTS TRACE_WITH_FRAME_BUFFER(trace_hardirqs_off) @@ -42,6 +41,9 @@ /* * This is used by assembly code to soft-disable interrupts first and * reconcile irq state. + * + * NB: This may call C code, so the caller must be prepared for volatiles to + * be clobbered. */ #define RECONCILE_IRQ_STATE(__rA, __rB)\ lbz __rA,PACASOFTIRQEN(r13);\ -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/3] powerpc: Document register clobbering in EXCEPTION_COMMON()
Signed-off-by: Michael Ellerman m...@ellerman.id.au --- arch/powerpc/include/asm/exception-64s.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 8f35cd7d59cc..066c15cd2837 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -532,6 +532,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CTRL) .globl label##_common; \ label##_common:\ EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN); \ + /* Volatile regs are potentially clobbered here */ \ additions; \ addir3,r1,STACK_FRAME_OVERHEAD; \ bl hdlr; \ -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 3/3] powerpc: Remove misleading DISABLE_INTS
DISABLE_INTS has a long and storied history, but for some time now it has not actually disabled interrupts. For the open-coded exception handlers, just stop using it, instead call RECONCILE_IRQ_STATE directly. This has the benefit of removing a level of indirection, and making it clear that r10 r11 are used at that point. For the addition case we still need a macro, so rename it to clarify what it actually does. Signed-off-by: Michael Ellerman m...@ellerman.id.au --- arch/powerpc/include/asm/exception-64s.h | 11 +++ arch/powerpc/kernel/exceptions-64s.S | 28 ++-- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 066c15cd2837..13a63379e496 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -513,8 +513,11 @@ label##_relon_hv: \ * runlatch, etc... */ -/* Exception addition: Hard disable interrupts */ -#define DISABLE_INTS RECONCILE_IRQ_STATE(r10,r11) +/* + * This addition reconciles our actual IRQ state with the various software + * flags that track it. This may call C code. 
+ */ +#define ADD_RECONCILE RECONCILE_IRQ_STATE(r10,r11) #define ADD_NVGPRS \ bl save_nvgprs @@ -540,7 +543,7 @@ label##_common: \ #define STD_EXCEPTION_COMMON(trap, label, hdlr)\ EXCEPTION_COMMON(trap, label, hdlr, ret_from_except,\ -ADD_NVGPRS;DISABLE_INTS) +ADD_NVGPRS;ADD_RECONCILE) /* * Like STD_EXCEPTION_COMMON, but for exceptions that can occur @@ -549,7 +552,7 @@ label##_common: \ */ #define STD_EXCEPTION_COMMON_ASYNC(trap, label, hdlr)\ EXCEPTION_COMMON(trap, label, hdlr, ret_from_except_lite, \ -FINISH_NAP;DISABLE_INTS;RUNLATCH_ON) +FINISH_NAP;ADD_RECONCILE;RUNLATCH_ON) /* * When the idle code in power4_idle puts the CPU into NAP mode, diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index a7d36b19221d..03a54ef03049 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1057,7 +1057,7 @@ data_access_common: mfspr r10,SPRN_DSISR stw r10,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN) - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) ld r12,_MSR(r1) ld r3,PACA_EXGEN+EX_DAR(r13) lwz r4,PACA_EXGEN+EX_DSISR(r13) @@ -1073,7 +1073,7 @@ h_data_storage_common: stw r10,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_COMMON(0xe00, PACA_EXGEN) bl save_nvgprs - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) addir3,r1,STACK_FRAME_OVERHEAD bl unknown_exception b ret_from_except @@ -1082,7 +1082,7 @@ h_data_storage_common: .globl instruction_access_common instruction_access_common: EXCEPTION_PROLOG_COMMON(0x400, PACA_EXGEN) - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) ld r12,_MSR(r1) ld r3,_NIP(r1) andis. 
r4,r12,0x5820 @@ -1146,7 +1146,7 @@ slb_miss_fault: unrecov_user_slb: EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN) - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) bl save_nvgprs 1: addir3,r1,STACK_FRAME_OVERHEAD bl unrecoverable_exception @@ -1169,7 +1169,7 @@ machine_check_common: stw r10,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC) FINISH_NAP - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) ld r3,PACA_EXGEN+EX_DAR(r13) lwz r4,PACA_EXGEN+EX_DSISR(r13) std r3,_DAR(r1) @@ -1192,7 +1192,7 @@ alignment_common: std r3,_DAR(r1) std r4,_DSISR(r1) bl save_nvgprs - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) addir3,r1,STACK_FRAME_OVERHEAD bl alignment_exception b ret_from_except @@ -1202,7 +1202,7 @@ alignment_common: program_check_common: EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) bl save_nvgprs - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) addir3,r1,STACK_FRAME_OVERHEAD bl program_check_exception b ret_from_except @@ -1213,7 +1213,7 @@ fp_unavailable_common: EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN) bne 1f /* if from user, just load it up */ bl save_nvgprs - DISABLE_INTS + RECONCILE_IRQ_STATE(r10, r11) addir3,r1,STACK_FRAME_OVERHEAD bl kernel_fp_unavailable_exception
Re: [PATCH 1/6] powerpc/powernv: Enable M64 aperture for PHB3
On Tue, Jul 15, 2014 at 10:55:25AM +0800, Wei Yang wrote: On Thu, Jul 10, 2014 at 09:53:41PM +0800, Guo Chao wrote: This patch enable M64 aperatus for PHB3. We already had platform hook (ppc_md.pcibios_window_alignment) to affect the PCI resource assignment done in PCI core so that each PE's M32 resource was built on basis of M32 segment size. Similarly, we're using that for M64 assignment on basis of M64 segment size. * We're using last M64 BAR to cover M64 aperatus, and it's shared by all 256 PEs. * We don't support P7IOC yet. However, some function callbacks are added to (struct pnv_phb) so that we can reuse them on P7IOC in future. * PE, corresponding to PCI bus with large M64 BAR device attached, might span multiple M64 segments. We introduce compound PE to cover the case. The compound PE is a list of PEs and the master PE is used as before. The slave PEs are just for MMIO isolation. Signed-off-by: Guo Chao y...@linux.vnet.ibm.com --- arch/powerpc/include/asm/opal.h | 8 +- arch/powerpc/platforms/powernv/pci-ioda.c | 284 -- arch/powerpc/platforms/powernv/pci.h | 20 +++ 3 files changed, 297 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 0da1dbd..ae885cc 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -340,6 +340,12 @@ enum OpalMveEnableAction { OPAL_ENABLE_MVE = 1 }; +enum OpalM64EnableAction { + OPAL_DISABLE_M64 = 0, + OPAL_ENABLE_M64_SPLIT = 1, + OPAL_ENABLE_M64_NON_SPLIT = 2 +}; + enum OpalPciResetScope { OPAL_PHB_COMPLETE = 1, OPAL_PCI_LINK = 2, OPAL_PHB_ERROR = 3, OPAL_PCI_HOT_RESET = 4, OPAL_PCI_FUNDAMENTAL_RESET = 5, @@ -768,7 +774,7 @@ int64_t opal_pci_set_phb_mem_window(uint64_t phb_id, uint16_t window_type, uint16_t window_num, uint64_t starting_real_address, uint64_t starting_pci_address, - uint16_t segment_size); + uint64_t size); int64_t opal_pci_map_pe_mmio_window(uint64_t phb_id, uint16_t pe_number, uint16_t window_type, uint16_t window_num, 
uint16_t segment_num); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index de19ede..851e615 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -36,6 +36,7 @@ #include asm/tce.h #include asm/xics.h #include asm/debug.h +#include asm/firmware.h #include powernv.h #include pci.h @@ -82,6 +83,12 @@ static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr) : : r (val), r (paddr) : memory); } +static inline bool pnv_pci_is_mem_pref_64(unsigned long flags) +{ + return ((flags (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) == + (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)); +} + static int pnv_ioda_alloc_pe(struct pnv_phb *phb) { unsigned long pe; @@ -106,6 +113,243 @@ static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe) clear_bit(pe, phb-ioda.pe_alloc); } +/* The default M64 BAR is shared by all PEs */ +static int pnv_ioda2_init_m64(struct pnv_phb *phb) +{ + const char *desc; + struct resource *r; + s64 rc; + + /* Configure the default M64 BAR */ + rc = opal_pci_set_phb_mem_window(phb-opal_id, + OPAL_M64_WINDOW_TYPE, + phb-ioda.m64_bar_idx, + phb-ioda.m64_base, + 0, /* unused */ + phb-ioda.m64_size); + if (rc != OPAL_SUCCESS) { + desc = configuring; + goto fail; + } + + /* Enable the default M64 BAR */ + rc = opal_pci_phb_mmio_enable(phb-opal_id, + OPAL_M64_WINDOW_TYPE, + phb-ioda.m64_bar_idx, + OPAL_ENABLE_M64_SPLIT); + if (rc != OPAL_SUCCESS) { + desc = enabling; + goto fail; + } + + /* Mark the M64 BAR assigned */ + set_bit(phb-ioda.m64_bar_idx, phb-ioda.m64_bar_alloc); + + /* + * Strip off the segment used by the reserved PE, which is + * expected to be 0 or last one of PE capabicity. 
+ */ + r = phb-hose-mem_resources[1]; + if (phb-ioda.reserved_pe == 0) + r-start += phb-ioda.m64_segsize; + else if (phb-ioda.reserved_pe == (phb-ioda.total_pe - 1)) + r-end -= phb-ioda.m64_segsize; + else + pr_warn( Cannot strip M64 segment for reserved PE#%d\n, + phb-ioda.reserved_pe); + + return 0; + +fail: + pr_warn( Failure %lld %s M64 BAR#%d\n, +
[PATCH] ppc/xmon: use isspace/isxdigit/isalnum from linux/ctype.h
isxdigit() macro definition is the same. isalnum() from linux/ctype.h will accept additional latin non-ASCII characters. This is harmless since this macro is used in scanhex() which parses user input. isspace() from linux/ctype.h will accept vertical tab and form feed but not NULL. The use of this macro is modified to accept NULL as well. Additional characters are harmless since this macro is also only used in scanhex(). Signed-off-by: Vincent Bernat vinc...@bernat.im --- arch/powerpc/xmon/xmon.c | 14 ++ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index d199bfa2f1fa..55d9b48774b7 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -24,6 +24,7 @@ #include linux/interrupt.h #include linux/irq.h #include linux/bug.h +#include linux/ctype.h #include asm/ptrace.h #include asm/string.h @@ -177,14 +178,6 @@ extern void xmon_leave(void); #define GETWORD(v) (((v)[0] 24) + ((v)[1] 16) + ((v)[2] 8) + (v)[3]) #endif -#define isxdigit(c)(('0' = (c) (c) = '9') \ -|| ('a' = (c) (c) = 'f') \ -|| ('A' = (c) (c) = 'F')) -#define isalnum(c) (('0' = (c) (c) = '9') \ -|| ('a' = (c) (c) = 'z') \ -|| ('A' = (c) (c) = 'Z')) -#define isspace(c) (c == ' ' || c == '\t' || c == 10 || c == 13 || c == 0) - static char *help_string = \ Commands:\n\ bshow breakpoints\n\ @@ -2121,9 +2114,6 @@ static void dump_pacas(void) } #endif -#define isxdigit(c)(('0' = (c) (c) = '9') \ -|| ('a' = (c) (c) = 'f') \ -|| ('A' = (c) (c) = 'F')) static void dump(void) { @@ -2526,7 +2516,7 @@ scanhex(unsigned long *vp) int i; for (i=0; i63; i++) { c = inchar(); - if (isspace(c)) { + if (isspace(c) || c == '\0') { termch = c; break; } -- 2.0.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/3] powerpc: Add machine_early_initcall()
Signed-off-by: Michael Ellerman m...@ellerman.id.au --- arch/powerpc/include/asm/machdep.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index f92b0b54e921..5c7e74ddee4c 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -366,6 +366,7 @@ static inline void log_error(char *buf, unsigned int err_type, int fatal) } \ __define_initcall(__machine_initcall_##mach##_##fn, id); +#define machine_early_initcall(mach, fn) __define_machine_initcall(mach, fn, early) #define machine_core_initcall(mach, fn) __define_machine_initcall(mach, fn, 1) #define machine_core_initcall_sync(mach, fn) __define_machine_initcall(mach, fn, 1s) #define machine_postcore_initcall(mach, fn)__define_machine_initcall(mach, fn, 2) -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/3] powerpc/powernv: Switch powernv drivers to use machine_xxx_initcall()
A lot of the code in platforms/powernv is using non-machine initcalls. That means if a kernel built with powernv support runs on another platform, for example pseries, the initcalls will still run. That is usually OK, because the initcalls will check for something in the device tree or elsewhere before doing anything, so on other platforms they will usually just return. But it's fishy for powernv code to be running on other platforms, so switch them all to be machine initcalls. If we want any of them to run on other platforms in future they should move to sysdev. Signed-off-by: Michael Ellerman m...@ellerman.id.au --- arch/powerpc/platforms/powernv/eeh-powernv.c| 6 +- arch/powerpc/platforms/powernv/opal-async.c | 3 ++- arch/powerpc/platforms/powernv/opal-lpc.c | 2 +- arch/powerpc/platforms/powernv/opal-memory-errors.c | 3 ++- arch/powerpc/platforms/powernv/opal-xscom.c | 2 +- arch/powerpc/platforms/powernv/opal.c | 9 + arch/powerpc/platforms/powernv/pci.c| 3 +-- arch/powerpc/platforms/powernv/rng.c| 2 +- 8 files changed, 14 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 56a206f32f77..998bcc18a491 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -398,9 +398,6 @@ static int __init eeh_powernv_init(void) { int ret = -EINVAL; - if (!machine_is(powernv)) - return ret; - ret = eeh_ops_register(powernv_eeh_ops); if (!ret) pr_info(EEH: PowerNV platform initialized\n); @@ -409,5 +406,4 @@ static int __init eeh_powernv_init(void) return ret; } - -early_initcall(eeh_powernv_init); +machine_early_initcall(powernv, eeh_powernv_init); diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index 32e2adfa5320..e462ab947d16 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -20,6 +20,7 @@ #include linux/wait.h #include 
linux/gfp.h #include linux/of.h +#include asm/machdep.h #include asm/opal.h #define N_ASYNC_COMPLETIONS64 @@ -201,4 +202,4 @@ out_opal_node: out: return err; } -subsys_initcall(opal_async_comp_init); +machine_subsys_initcall(powernv, opal_async_comp_init); diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index f04b4d8aca5a..ad4b31df779a 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -324,7 +324,7 @@ static int opal_lpc_init_debugfs(void) rc |= opal_lpc_debugfs_create_type(root, fw, OPAL_LPC_FW); return rc; } -device_initcall(opal_lpc_init_debugfs); +machine_device_initcall(powernv, opal_lpc_init_debugfs); #endif /* CONFIG_DEBUG_FS */ void opal_lpc_init(void) diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c index b17a34b695ef..43db2136dbff 100644 --- a/arch/powerpc/platforms/powernv/opal-memory-errors.c +++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c @@ -27,6 +27,7 @@ #include linux/mm.h #include linux/slab.h +#include asm/machdep.h #include asm/opal.h #include asm/cputable.h @@ -143,4 +144,4 @@ static int __init opal_mem_err_init(void) } return 0; } -subsys_initcall(opal_mem_err_init); +machine_subsys_initcall(powernv, opal_mem_err_init); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index 4cd2ea6c0dbe..7634d1c62299 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -130,4 +130,4 @@ static int opal_xscom_init(void) scom_init(opal_scom_controller); return 0; } -arch_initcall(opal_xscom_init); +machine_arch_initcall(powernv, opal_xscom_init); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 199975613fe9..6ef2e5c5bc64 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -22,6 
+22,8 @@ #include linux/kobject.h #include linux/delay.h #include linux/memblock.h + +#include asm/machdep.h #include asm/opal.h #include asm/firmware.h #include asm/mce.h @@ -200,8 +202,7 @@ static int __init opal_register_exception_handlers(void) return 0; } - -early_initcall(opal_register_exception_handlers); +machine_early_initcall(powernv, opal_register_exception_handlers); int opal_notifier_register(struct notifier_block *nb) { @@ -368,7 +369,7 @@ static int __init opal_message_init(void) } return 0; } -early_initcall(opal_message_init); +machine_early_initcall(powernv, opal_message_init); int opal_get_chars(uint32_t vtermno, char *buf, int count) { @@ -630,7 +631,7 @@
[PATCH 3/3] powerpc/pseries: Switch pseries drivers to use machine_xxx_initcall()
A lot of the code in platforms/pseries is using non-machine initcalls. That means if a kernel built with pseries support runs on another platform, for example powernv, the initcalls will still run. Most of these cases are OK, though sometimes only due to luck. Some were having more effect: * hcall_inst_init - Checking FW_FEATURE_LPAR which is set on ps3 celleb. * mobility_sysfs_init - created sysfs files unconditionally - but no effect due to ENOSYS from rtas_ibm_suspend_me() * apo_pm_init - created sysfs, allows write - nothing checks the value written to though * alloc_dispatch_log_kmem_cache - creating kmem_cache on non-pseries machines Signed-off-by: Michael Ellerman m...@ellerman.id.au --- arch/powerpc/platforms/pseries/dtl.c | 3 ++- arch/powerpc/platforms/pseries/eeh_pseries.c | 8 ++-- arch/powerpc/platforms/pseries/hvCall_inst.c | 2 +- arch/powerpc/platforms/pseries/mobility.c| 3 ++- arch/powerpc/platforms/pseries/msi.c | 3 +-- arch/powerpc/platforms/pseries/power.c | 5 +++-- arch/powerpc/platforms/pseries/ras.c | 2 +- arch/powerpc/platforms/pseries/reconfig.c| 5 + arch/powerpc/platforms/pseries/rng.c | 2 +- arch/powerpc/platforms/pseries/setup.c | 2 +- arch/powerpc/platforms/pseries/suspend.c | 5 ++--- 11 files changed, 17 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 7d61498e45c0..1062f71f5a85 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -29,6 +29,7 @@ #include asm/lppaca.h #include asm/debug.h #include asm/plpar_wrappers.h +#include asm/machdep.h struct dtl { struct dtl_entry*buf; @@ -391,4 +392,4 @@ err_remove_dir: err: return rc; } -arch_initcall(dtl_init); +machine_arch_initcall(pseries, dtl_init); diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 0bec0c02c5e7..476a5d8b0b36 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ 
b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -743,10 +743,7 @@ static struct eeh_ops pseries_eeh_ops = { */ static int __init eeh_pseries_init(void) { - int ret = -EINVAL; - - if (!machine_is(pseries)) - return ret; + int ret; ret = eeh_ops_register(pseries_eeh_ops); if (!ret) @@ -757,5 +754,4 @@ static int __init eeh_pseries_init(void) return ret; } - -early_initcall(eeh_pseries_init); +machine_early_initcall(pseries, eeh_pseries_init); diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c index cf4e7736e4f1..19f15310facf 100644 --- a/arch/powerpc/platforms/pseries/hvCall_inst.c +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -162,4 +162,4 @@ static int __init hcall_inst_init(void) return 0; } -__initcall(hcall_inst_init); +machine_device_initcall(pseries, hcall_inst_init); diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index bde7ebad3949..d146fef038b8 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -18,6 +18,7 @@ #include linux/delay.h #include linux/slab.h +#include asm/machdep.h #include asm/rtas.h #include pseries.h @@ -362,4 +363,4 @@ static int __init mobility_sysfs_init(void) return rc; } -device_initcall(mobility_sysfs_init); +machine_device_initcall(pseries, mobility_sysfs_init); diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 0c882e83c4ce..b3486dd041df 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -532,5 +532,4 @@ static int rtas_msi_init(void) return 0; } -arch_initcall(rtas_msi_init); - +machine_arch_initcall(pseries, rtas_msi_init); diff --git a/arch/powerpc/platforms/pseries/power.c b/arch/powerpc/platforms/pseries/power.c index 6d6266236446..c26eadde434c 100644 --- a/arch/powerpc/platforms/pseries/power.c +++ b/arch/powerpc/platforms/pseries/power.c @@ -25,6 +25,7 @@ #include 
linux/string.h #include linux/errno.h #include linux/init.h +#include asm/machdep.h unsigned long rtas_poweron_auto; /* default and normal state is 0 */ @@ -71,11 +72,11 @@ static int __init pm_init(void) return -ENOMEM; return sysfs_create_group(power_kobj, attr_group); } -core_initcall(pm_init); +machine_core_initcall(pseries, pm_init); #else static int __init apo_pm_init(void) { return (sysfs_create_file(power_kobj, auto_poweron_attr.attr)); } -__initcall(apo_pm_init); +machine_device_initcall(pseries, apo_pm_init); #endif diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 9c5778e6ed4b..dff05b9eb946 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@
Re: bit fields data tearing
On 07/13/2014 06:25 PM, Benjamin Herrenschmidt wrote: On Sun, 2014-07-13 at 09:15 -0400, Peter Hurley wrote: I'm not sure I understand your point here, Ben. Suppose that two different spinlocks are used independently to protect r-m-w access to adjacent data. In Oleg's example, suppose spinlock 1 is used for access to the bitfield and spinlock 2 is used for access to freeze_stop. What would prevent an accidental write to freeze_stop from the kt_1 thread? My point was to be wary of bitfields in general because access to them is always R-M-W, never atomic and that seems to escape people regularly :-) (Among other problems such as endian etc...) As for Oleg's example, it *should* have worked because the bitfield and the adjacent freeze_stop should have been accessed using load/stores that don't actually overlap, but the compiler bug causes the bitfield access to not properly use the basic type of the bitfield, but escalate to a full 64-bit R-M-W instead, thus incorrectly R-M-W'ing the field next door. Yeah, ok, so just a generic heads-up about non-atomicity of bitfields, and not something specific to Oleg's example. Thanks. Jonathan Corbet wrote a LWN article about this back in 2012: http://lwn.net/Articles/478657/ I guess it's fixed in gcc 4.8, but too bad there's not a workaround for earlier compilers (akin to -fstrict_volatile_bitfields without requiring the volatile keyword). Regards, Peter Hurley ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3] arm64, ia64, ppc, s390, sh, tile, um, x86, mm: Remove default gate area
On Sun, Jul 13, 2014 at 1:01 PM, Andy Lutomirski l...@amacapital.net wrote: The core mm code will provide a default gate area based on FIXADDR_USER_START and FIXADDR_USER_END if !defined(__HAVE_ARCH_GATE_AREA) defined(AT_SYSINFO_EHDR). This default is only useful for ia64. arm64, ppc, s390, sh, tile, 64-bit UML, and x86_32 have their own code just to disable it. arm, 32-bit UML, and x86_64 have gate areas, but they have their own implementations. This gets rid of the default and moves the code into ia64. This should save some code on architectures without a gate area: it's now possible to inline the gate_area functions in the default case. Can one of you pull this somewhere? Otherwise I can put it somewhere stable and ask for -next inclusion, but that seems like overkill for a single patch. --Andy ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc: thp: Add write barrier after updating the valid bit
With hugepages, we store the hpte valid information in the pte page whose address is stored in the second half of the PMD. Use a write barrier to make sure that clearing pmd busy bit and updating hpte valid info are ordered properly. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgtable-ppc64.h | 6 ++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index eb9261024f51..558beb760062 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -394,6 +394,12 @@ static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array, unsigned int index, unsigned int hidx) { hpte_slot_array[index] = hidx 4 | 0x1 3; + /* +* The hpte valid is stored in the pgtable whose address is in the +* second half of the PMD. Order this against clearing of the busy bit in +* huge pmd. +*/ + smp_wmb(); } struct page *realmode_pfn_to_page(unsigned long pfn); -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/2] powerpc: thp: invalidate old 64K based hash page mapping before insert
If we changed base page size of the segment, either via sub_page_protect or via remap_4k_pfn, we do a demote_segment which doesn't flush the hash table entries. We do that when inserting a new hash pte by checking the _PAGE_COMBO flag. We missed to do that when inserting hash for a new 16MB page. Add the same. This patch mark the 4k base page size 16MB hugepage via _PAGE_COMBO. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/mm/hugepage-hash64.c | 66 +++ 1 file changed, 66 insertions(+) diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index 826893fcb3a7..28d1b8b93674 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -18,6 +18,56 @@ #include linux/mm.h #include asm/machdep.h +static void flush_hash_hugepage(unsigned long vsid, unsigned long addr, + pmd_t *pmdp, unsigned int psize, int ssize) +{ + int i, max_hpte_count, valid; + unsigned long s_addr = addr; + unsigned char *hpte_slot_array; + unsigned long hidx, shift, vpn, hash, slot; + + hpte_slot_array = get_hpte_slot_array(pmdp); + /* +* IF we try to do a HUGE PTE update after a withdraw is done. +* we will find the below NULL. 
This happens when we do +* split_huge_page_pmd +*/ + if (!hpte_slot_array) + return; + + if (ppc_md.hugepage_invalidate) + return ppc_md.hugepage_invalidate(vsid, addr, hpte_slot_array, + psize, ssize); + /* +* No bluk hpte removal support, invalidate each entry +*/ + shift = mmu_psize_defs[psize].shift; + max_hpte_count = HPAGE_PMD_SIZE shift; + for (i = 0; i max_hpte_count; i++) { + /* +* 8 bits per each hpte entries +* 000| [ secondary group (one bit) | hidx (3 bits) | valid bit] +*/ + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul shift)); + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx _PTEIDX_GROUP_IX; + ppc_md.hpte_invalidate(slot, vpn, psize, + MMU_PAGE_16M, ssize, 0); + } +} + + int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, pmd_t *pmdp, unsigned long trap, int local, int ssize, unsigned int psize) @@ -85,6 +135,15 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, vpn = hpt_vpn(ea, vsid, ssize); hash = hpt_hash(vpn, shift, ssize); hpte_slot_array = get_hpte_slot_array(pmdp); + if (psize == MMU_PAGE_4K) { + /* +* invalidate the old hpte entry if we have that mapped via 64K +* base page size. This is because demote_segment won't flush +* hash page table entries. +*/ + if (!(old_pmd _PAGE_COMBO)) + flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K, ssize); + } valid = hpte_valid(hpte_slot_array, index); if (valid) { @@ -172,6 +231,13 @@ repeat: mark_hpte_slot_valid(hpte_slot_array, index, slot); } /* +* Mark the pte with _PAGE_COMBO, if we are trying to hash it with +* base page size 4k. 
+*/ + if (psize == MMU_PAGE_4K) + new_pmd |= _PAGE_COMBO; + + /* * No need to use ldarx/stdcx here */ *pmdp = __pmd(new_pmd ~_PAGE_BUSY); -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc: thp: Add write barrier after updating the valid bit
With hugepages, we store the hpte valid information in the pte page whose address is stored in the second half of the PMD. Use a write barrier to make sure that clearing pmd busy bit and updating hpte valid info are ordered properly. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgtable-ppc64.h | 6 ++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index eb9261024f51..558beb760062 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -394,6 +394,12 @@ static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array, unsigned int index, unsigned int hidx) { hpte_slot_array[index] = hidx 4 | 0x1 3; + /* +* The hpte valid is stored in the pgtable whose address is in the +* second half of the PMD. Order this against clearing of the busy bit in +* huge pmd. +*/ + smp_wmb(); } struct page *realmode_pfn_to_page(unsigned long pfn); -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc: subpage_protect: Increase the array size to take care of 64TB
We now support TASK_SIZE of 64TB, hence the array should be 8. Fixes the below crash: Unable to handle kernel paging request for data at address 0x000100bd Faulting instruction address: 0xc004f914 cpu 0x13: Vector: 300 (Data Access) at [c00fea75fa90] pc: c004f914: .sys_subpage_prot+0x2d4/0x5c0 lr: c004fb5c: .sys_subpage_prot+0x51c/0x5c0 sp: c00fea75fd10 msr: 90009032 dar: 100bd dsisr: 4000 current = 0xc00fea6ae490 paca= 0xcfb8ab00 softe: 0irq_happened: 0x00 pid = 8237, comm = a.out enter ? for help [c00fea75fe30] c000a164 syscall_exit+0x0/0x98 --- Exception: c00 (System Call) at 3fff89737004 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/mmu-hash64.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 807014dde821..c2b4dcf23d03 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -22,6 +22,7 @@ */ #include asm/pgtable-ppc64.h #include asm/bug.h +#include asm/processor.h /* * Segment table @@ -496,7 +497,7 @@ extern void slb_set_size(u16 size); */ struct subpage_prot_table { unsigned long maxaddr; /* only addresses this are protected */ - unsigned int **protptrs[2]; + unsigned int **protptrs[(TASK_SIZE_USER64 43)]; unsigned int *low_prot[4]; }; -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/2] powerpc: thp: don't recompute vsid and ssize in loop on invalidate
The segment identifier and segment size will remain the same in the loop, So we can compute it outside. We also change the hugepage_invalidate interface so that we can use it the later patch Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/machdep.h| 6 +++--- arch/powerpc/mm/hash_native_64.c | 19 +-- arch/powerpc/mm/pgtable_64.c | 24 arch/powerpc/platforms/pseries/lpar.c | 20 ++-- 4 files changed, 26 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index f92b0b54e921..8dcb721d03d8 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -57,10 +57,10 @@ struct machdep_calls { void(*hpte_removebolted)(unsigned long ea, int psize, int ssize); void(*flush_hash_range)(unsigned long number, int local); - void(*hugepage_invalidate)(struct mm_struct *mm, + void(*hugepage_invalidate)(unsigned long vsid, + unsigned long addr, unsigned char *hpte_slot_array, - unsigned long addr, int psize); - + int psize, int ssize); /* special for kexec, to be called in real mode, linear mapping is * destroyed as well */ void(*hpte_clear_all)(void); diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index cf1d325eae8b..fb89d7695a9a 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -412,18 +412,18 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, local_irq_restore(flags); } -static void native_hugepage_invalidate(struct mm_struct *mm, +static void native_hugepage_invalidate(unsigned long vsid, + unsigned long addr, unsigned char *hpte_slot_array, - unsigned long addr, int psize) + int psize, int ssize) { - int ssize = 0, i; - int lock_tlbie; + int i, lock_tlbie; struct hash_pte *hptep; int actual_psize = MMU_PAGE_16M; unsigned int max_hpte_count, valid; unsigned long flags, s_addr = addr; unsigned long hpte_v, want_v, shift; - unsigned long hidx, vpn = 
0, vsid, hash, slot; + unsigned long hidx, vpn = 0, hash, slot; shift = mmu_psize_defs[psize].shift; max_hpte_count = 1U (PMD_SHIFT - shift); @@ -437,15 +437,6 @@ static void native_hugepage_invalidate(struct mm_struct *mm, /* get the vpn */ addr = s_addr + (i * (1ul shift)); - if (!is_kernel_addr(addr)) { - ssize = user_segment_size(addr); - vsid = get_vsid(mm-context.id, addr, ssize); - WARN_ON(vsid == 0); - } else { - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); - ssize = mmu_kernel_ssize; - } - vpn = hpt_vpn(addr, vsid, ssize); hash = hpt_hash(vpn, shift, ssize); if (hidx _PTEIDX_SECONDARY) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index f6ce1f111f5b..ac8c0754a4e9 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -745,12 +745,21 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, if (!hpte_slot_array) return; - /* get the base page size */ + /* get the base page size,vsid and segment size */ psize = get_slice_psize(mm, s_addr); + if (!is_kernel_addr(s_addr)) { + ssize = user_segment_size(s_addr); + vsid = get_vsid(mm-context.id, s_addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(s_addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } if (ppc_md.hugepage_invalidate) - return ppc_md.hugepage_invalidate(mm, hpte_slot_array, - s_addr, psize); + return ppc_md.hugepage_invalidate(vsid, s_addr, + hpte_slot_array, + psize, ssize); /* * No bluk hpte removal support, invalidate each entry */ @@ -768,15 +777,6 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, /* get the vpn */ addr = s_addr + (i * (1ul shift)); - if (!is_kernel_addr(addr)) {
Re: bit fields data tearing
On 07/15/2014 06:54 AM, Peter Hurley wrote: Jonathan Corbet wrote a LWN article about this back in 2012: http://lwn.net/Articles/478657/ I guess it's fixed in gcc 4.8, but too bad there's not a workaround for earlier compilers (akin to -fstrict_volatile_bitfields without requiring the volatile keyword) From the gcc pr, it looks like the patch was backported to 4.7. But we didn't fix it in versions earlier than that. r~ ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] powerpc: subpage_protect: Increase the array size to take care of 64TB
Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com writes: We now support TASK_SIZE of 16TB, hence the array should be 8. [reviewer's correction: "16TB" here should read "64TB"] Fixes the below crash: -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] ppc/xmon: use isspace/isxdigit/isalnum from linux/ctype.h
On Tue, 2014-07-15 at 13:43 +0200, Vincent Bernat wrote: isxdigit() macro definition is the same. isalnum() from linux/ctype.h will accept additional latin non-ASCII characters. This is harmless since this macro is used in scanhex() which parses user input. isspace() from linux/ctype.h will accept vertical tab and form feed but not NULL. The use of this macro is modified to accept NULL as well. Additional characters are harmless since this macro is also only used in scanhex(). I don't think we care about \0 ... Paul, care to chime in ? After all, you wrote that stuff a century or two ago... :) Cheers, Ben. Signed-off-by: Vincent Bernat vinc...@bernat.im --- arch/powerpc/xmon/xmon.c | 14 ++ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index d199bfa2f1fa..55d9b48774b7 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -24,6 +24,7 @@ #include linux/interrupt.h #include linux/irq.h #include linux/bug.h +#include linux/ctype.h #include asm/ptrace.h #include asm/string.h @@ -177,14 +178,6 @@ extern void xmon_leave(void); #define GETWORD(v) (((v)[0] 24) + ((v)[1] 16) + ((v)[2] 8) + (v)[3]) #endif -#define isxdigit(c) (('0' = (c) (c) = '9') \ - || ('a' = (c) (c) = 'f') \ - || ('A' = (c) (c) = 'F')) -#define isalnum(c) (('0' = (c) (c) = '9') \ - || ('a' = (c) (c) = 'z') \ - || ('A' = (c) (c) = 'Z')) -#define isspace(c) (c == ' ' || c == '\t' || c == 10 || c == 13 || c == 0) - static char *help_string = \ Commands:\n\ b show breakpoints\n\ @@ -2121,9 +2114,6 @@ static void dump_pacas(void) } #endif -#define isxdigit(c) (('0' = (c) (c) = '9') \ - || ('a' = (c) (c) = 'f') \ - || ('A' = (c) (c) = 'F')) static void dump(void) { @@ -2526,7 +2516,7 @@ scanhex(unsigned long *vp) int i; for (i=0; i63; i++) { c = inchar(); - if (isspace(c)) { + if (isspace(c) || c == '\0') { termch = c; break; } ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org 
https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 3/3] powerpc/pseries: Switch pseries drivers to use machine_xxx_initcall()
A lot of the code in platforms/pseries is using non-machine initcalls. That means if a kernel built with pseries support runs on another platform, for example powernv, the initcalls will still run. Most of these cases are OK, though sometimes only due to luck. Some were having more effect: * hcall_inst_init - Checking FW_FEATURE_LPAR which is set on ps3 celleb. * mobility_sysfs_init - created sysfs files unconditionally - but no effect due to ENOSYS from rtas_ibm_suspend_me() * apo_pm_init - created sysfs, allows write - nothing checks the value written to though * alloc_dispatch_log_kmem_cache - creating kmem_cache on non-pseries machines Signed-off-by: Michael Ellerman m...@ellerman.id.au --- v2: Add missing includes of machdep.h arch/powerpc/platforms/pseries/dtl.c | 3 ++- arch/powerpc/platforms/pseries/eeh_pseries.c | 8 ++-- arch/powerpc/platforms/pseries/hvCall_inst.c | 3 ++- arch/powerpc/platforms/pseries/mobility.c| 3 ++- arch/powerpc/platforms/pseries/msi.c | 4 ++-- arch/powerpc/platforms/pseries/power.c | 5 +++-- arch/powerpc/platforms/pseries/ras.c | 2 +- arch/powerpc/platforms/pseries/reconfig.c| 5 + arch/powerpc/platforms/pseries/rng.c | 2 +- arch/powerpc/platforms/pseries/setup.c | 2 +- arch/powerpc/platforms/pseries/suspend.c | 5 ++--- 11 files changed, 19 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 7d61498e45c0..1062f71f5a85 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -29,6 +29,7 @@ #include asm/lppaca.h #include asm/debug.h #include asm/plpar_wrappers.h +#include asm/machdep.h struct dtl { struct dtl_entry*buf; @@ -391,4 +392,4 @@ err_remove_dir: err: return rc; } -arch_initcall(dtl_init); +machine_arch_initcall(pseries, dtl_init); diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 0bec0c02c5e7..476a5d8b0b36 100644 --- 
a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -743,10 +743,7 @@ static struct eeh_ops pseries_eeh_ops = { */ static int __init eeh_pseries_init(void) { - int ret = -EINVAL; - - if (!machine_is(pseries)) - return ret; + int ret; ret = eeh_ops_register(pseries_eeh_ops); if (!ret) @@ -757,5 +754,4 @@ static int __init eeh_pseries_init(void) return ret; } - -early_initcall(eeh_pseries_init); +machine_early_initcall(pseries, eeh_pseries_init); diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c index cf4e7736e4f1..4575f0c9e521 100644 --- a/arch/powerpc/platforms/pseries/hvCall_inst.c +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -27,6 +27,7 @@ #include asm/firmware.h #include asm/cputable.h #include asm/trace.h +#include asm/machdep.h DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats); @@ -162,4 +163,4 @@ static int __init hcall_inst_init(void) return 0; } -__initcall(hcall_inst_init); +machine_device_initcall(pseries, hcall_inst_init); diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index bde7ebad3949..d146fef038b8 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -18,6 +18,7 @@ #include linux/delay.h #include linux/slab.h +#include asm/machdep.h #include asm/rtas.h #include pseries.h @@ -362,4 +363,4 @@ static int __init mobility_sysfs_init(void) return rc; } -device_initcall(mobility_sysfs_init); +machine_device_initcall(pseries, mobility_sysfs_init); diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 0c882e83c4ce..18ff4626d74e 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -16,6 +16,7 @@ #include asm/rtas.h #include asm/hw_irq.h #include asm/ppc-pci.h +#include asm/machdep.h static int query_token, change_token; @@ -532,5 +533,4 @@ 
static int rtas_msi_init(void) return 0; } -arch_initcall(rtas_msi_init); - +machine_arch_initcall(pseries, rtas_msi_init); diff --git a/arch/powerpc/platforms/pseries/power.c b/arch/powerpc/platforms/pseries/power.c index 6d6266236446..c26eadde434c 100644 --- a/arch/powerpc/platforms/pseries/power.c +++ b/arch/powerpc/platforms/pseries/power.c @@ -25,6 +25,7 @@ #include linux/string.h #include linux/errno.h #include linux/init.h +#include asm/machdep.h unsigned long rtas_poweron_auto; /* default and normal state is 0 */ @@ -71,11 +72,11 @@ static int __init pm_init(void) return -ENOMEM; return sysfs_create_group(power_kobj, attr_group); } -core_initcall(pm_init); +machine_core_initcall(pseries, pm_init); #else static int __init
Re: [PATCH v2] powerpc/pseries: dynamically added OF nodes need to call of_node_init
On Thu, Jul 10, 2014 at 1:59 PM, Nathan Fontenot nf...@linux.vnet.ibm.com wrote: On 07/10/2014 01:50 PM, Tyrel Datwyler wrote: Commit 75b57ecf9 refactored device tree nodes to use kobjects such that they can be exposed via /sysfs. A secondary commit 0829f6d1f furthered this rework by moving the kobect initialization logic out of of_node_add into its own of_node_init function. The inital commit removed the existing kref_init calls in the pseries dlpar code with the assumption kobject initialization would occur in of_node_add. The second commit had the side effect of triggering a BUG_ON during DLPAR, migration and suspend/resume operations as a result of dynamically added nodes being uninitialized. This patch fixes this by adding of_node_init calls in place of the previously removed kref_init calls. Fixes: 0829f6d1f69e (of: device_node kobject lifecycle fixes) Cc: sta...@vger.kernel.org Signed-off-by: Tyrel Datwyler tyr...@linux.vnet.ibm.com Acked-by: Nathan Fontenot nf...@linux.vnet.ibm.com Acked-by: Grant Likely grant.lik...@linaro.org Ben, are you going to take this or should I take it via my tree? g. 
--- V2: - included stable kernel list on Cc per comment by mpe arch/powerpc/platforms/pseries/dlpar.c| 1 + arch/powerpc/platforms/pseries/reconfig.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 022b38e..2d0b4d6 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -86,6 +86,7 @@ static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa, } of_node_set_flag(dn, OF_DYNAMIC); + of_node_init(dn); return dn; } diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index 0435bb6..1c0a60d 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -69,6 +69,7 @@ static int pSeries_reconfig_add_node(const char *path, struct property *proplist np-properties = proplist; of_node_set_flag(np, OF_DYNAMIC); + of_node_init(np); np-parent = derive_parent(path); if (IS_ERR(np-parent)) { ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: OF_DYNAMIC node lifecycle
I've got another question about powerpc reconfiguration. I was looking at the dlpar_configure_connector() function in dlpar.c. I see that the function has the ability to process multiple nodes with additional sibling and child nodes. It appears to link them into a detached tree structure, and the function returns a pointer to the first node. All of the callers of that function then call dlpar_attach_node(), which calls of_attach_node(). However, of_attach_node() only handles a single node. It doesn't handle siblings or children. Is this a bug? Does the configure connector ever actually receive more than one node at once? g. On Fri, Jun 27, 2014 at 8:41 AM, Nathan Fontenot nf...@austin.ibm.com wrote: On 06/27/2014 07:41 AM, Grant Likely wrote: On Thu, 26 Jun 2014 15:01:49 -0500, Nathan Fontenot nf...@austin.ibm.com wrote: On 06/25/2014 03:24 PM, Grant Likely wrote: On Tue, 24 Jun 2014 15:10:55 -0500, Nathan Fontenot nf...@austin.ibm.com wrote: heh! I have often thought about adding reference counting to device tree properties. You horrible, horrible man. Yes. I are evil :) After looking again the work needed to add reference counts to properties would be huge. The few properties I am concerned with are specific to powerpc so perhaps just adding an arch specific lock around updating those properties would work. Which code/properties? I'd like to have a look myself. /ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory The property is updated in arch/powerpc/platforms/pseries/hotplug-memory.c:pseries_update_drconf_memory() Specifically, what do you need for the locking? Are you wanting to hold off additional changes while that function is executing? Pantelis is adding a mutex for device tree writers. Holding that mutex would prevent any changes from happening in the tree without affecting readers. Would that be sufficient? That would work. -Nathan ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 4/6] powerpc/eeh: Replace pr_warning() with pr_warn()
pr_warn() is equal to pr_warning(), but the former is a bit more formal. The patch replaces pr_warning() with pr_warn(). Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/kernel/eeh.c| 16 arch/powerpc/kernel/eeh_cache.c | 7 --- arch/powerpc/kernel/eeh_dev.c| 3 ++- arch/powerpc/kernel/eeh_driver.c | 16 arch/powerpc/kernel/eeh_pe.c | 3 ++- arch/powerpc/platforms/powernv/eeh-ioda.c| 12 ++-- arch/powerpc/platforms/powernv/eeh-powernv.c | 7 --- arch/powerpc/platforms/pseries/eeh_pseries.c | 28 ++-- 8 files changed, 48 insertions(+), 44 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 123c151..dcc2a95 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -334,8 +334,8 @@ static int eeh_phb_check_failure(struct eeh_pe *pe) /* Find the PHB PE */ phb_pe = eeh_phb_pe_get(pe-phb); if (!phb_pe) { - pr_warning(%s Can't find PE for PHB#%d\n, - __func__, pe-phb-global_number); + pr_warn(%s Can't find PE for PHB#%d\n, + __func__, pe-phb-global_number); return -EEXIST; } @@ -784,13 +784,13 @@ void eeh_save_bars(struct eeh_dev *edev) int __init eeh_ops_register(struct eeh_ops *ops) { if (!ops-name) { - pr_warning(%s: Invalid EEH ops name for %p\n, + pr_warn(%s: Invalid EEH ops name for %p\n, __func__, ops); return -EINVAL; } if (eeh_ops eeh_ops != ops) { - pr_warning(%s: EEH ops of platform %s already existing (%s)\n, + pr_warn(%s: EEH ops of platform %s already existing (%s)\n, __func__, eeh_ops-name, ops-name); return -EEXIST; } @@ -810,7 +810,7 @@ int __init eeh_ops_register(struct eeh_ops *ops) int __exit eeh_ops_unregister(const char *name) { if (!name || !strlen(name)) { - pr_warning(%s: Invalid EEH ops name\n, + pr_warn(%s: Invalid EEH ops name\n, __func__); return -EINVAL; } @@ -875,11 +875,11 @@ int eeh_init(void) /* call platform initialization function */ if (!eeh_ops) { - pr_warning(%s: Platform EEH operation not found\n, + pr_warn(%s: Platform EEH operation not found\n, __func__); return 
-EEXIST; } else if ((ret = eeh_ops-init())) { - pr_warning(%s: Failed to call platform init function (%d)\n, + pr_warn(%s: Failed to call platform init function (%d)\n, __func__, ret); return ret; } @@ -920,7 +920,7 @@ int eeh_init(void) if (eeh_enabled()) pr_info(EEH: PCI Enhanced I/O Error Handling Enabled\n); else - pr_warning(EEH: No capable adapters found\n); + pr_warn(EEH: No capable adapters found\n); return ret; } diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index 3639bee..07d8a24 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -143,7 +143,7 @@ eeh_addr_cache_insert(struct pci_dev *dev, unsigned long alo, } else { if (dev != piar-pcidev || alo != piar-addr_lo || ahi != piar-addr_hi) { - pr_warning(PIAR: overlapping address range\n); + pr_warn(PIAR: overlapping address range\n); } return piar; } @@ -177,13 +177,14 @@ static void __eeh_addr_cache_insert_dev(struct pci_dev *dev) dn = pci_device_to_OF_node(dev); if (!dn) { - pr_warning(PCI: no pci dn found for dev=%s\n, pci_name(dev)); + pr_warn(PCI: no pci dn found for dev=%s\n, + pci_name(dev)); return; } edev = of_node_to_eeh_dev(dn); if (!edev) { - pr_warning(PCI: no EEH dev found for dn=%s\n, + pr_warn(PCI: no EEH dev found for dn=%s\n, dn-full_name); return; } diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c index 1efa28f..e5274ee 100644 --- a/arch/powerpc/kernel/eeh_dev.c +++ b/arch/powerpc/kernel/eeh_dev.c @@ -57,7 +57,8 @@ void *eeh_dev_init(struct device_node *dn, void *data) /* Allocate EEH device */ edev = kzalloc(sizeof(*edev), GFP_KERNEL); if (!edev) { - pr_warning(%s: out of memory\n, __func__); + pr_warn(%s: out of memory\n, + __func__); return
[PATCH 1/6] powerpc/eeh: Refactor EEH flag accessors
There are multiple global EEH flags. Almost each flag has its own accessor, which doesn't make sense. The patch refactors EEH flag accessors so that they look unified: eeh_add_flag(): Add EEH flag eeh_clear_flag(): Clear EEH flag eeh_has_flag(): Check if one specific flag has been set eeh_enabled():Check if EEH functionality has been enabled Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/include/asm/eeh.h | 32 ++-- arch/powerpc/kernel/eeh.c| 20 - arch/powerpc/kernel/eeh_cache.c | 2 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 6 +++--- arch/powerpc/platforms/powernv/pci-ioda.c| 1 - arch/powerpc/platforms/pseries/eeh_pseries.c | 4 ++-- 6 files changed, 27 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 6e47894..ca8aada 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -206,36 +206,28 @@ extern int eeh_subsystem_flags; extern struct eeh_ops *eeh_ops; extern raw_spinlock_t confirm_error_lock; -static inline bool eeh_enabled(void) +static inline void eeh_add_flag(int flag) { - if ((eeh_subsystem_flags EEH_FORCE_DISABLED) || - !(eeh_subsystem_flags EEH_ENABLED)) - return false; - - return true; + eeh_subsystem_flags |= flag; } -static inline void eeh_set_enable(bool mode) +static inline void eeh_clear_flag(int flag) { - if (mode) - eeh_subsystem_flags |= EEH_ENABLED; - else - eeh_subsystem_flags = ~EEH_ENABLED; + eeh_subsystem_flags = ~flag; } -static inline void eeh_probe_mode_set(int flag) +static inline bool eeh_has_flag(int flag) { - eeh_subsystem_flags |= flag; +return !!(eeh_subsystem_flags flag); } -static inline int eeh_probe_mode_devtree(void) +static inline bool eeh_enabled(void) { - return (eeh_subsystem_flags EEH_PROBE_MODE_DEVTREE); -} + if (eeh_has_flag(EEH_FORCE_DISABLED) || + !eeh_has_flag(EEH_ENABLED)) + return false; -static inline int eeh_probe_mode_dev(void) -{ - return (eeh_subsystem_flags EEH_PROBE_MODE_DEV); + return true; 
} static inline void eeh_serialize_lock(unsigned long *flags) @@ -314,8 +306,6 @@ static inline bool eeh_enabled(void) return false; } -static inline void eeh_set_enable(bool mode) { } - static inline int eeh_init(void) { return 0; diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 4de2103..65a163f 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -142,7 +142,7 @@ static struct eeh_stats eeh_stats; static int __init eeh_setup(char *str) { if (!strcmp(str, off)) - eeh_subsystem_flags |= EEH_FORCE_DISABLED; + eeh_add_flag(EEH_FORCE_DISABLED); return 1; } @@ -252,7 +252,7 @@ void eeh_slot_error_detail(struct eeh_pe *pe, int severity) * 0xFF's is always returned from PCI config space. */ if (!(pe-type EEH_PE_PHB)) { - if (eeh_probe_mode_devtree()) + if (eeh_has_flag(EEH_PROBE_MODE_DEVTREE)) eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); eeh_ops-configure_bridge(pe); eeh_pe_restore_bars(pe); @@ -303,7 +303,7 @@ static int eeh_phb_check_failure(struct eeh_pe *pe) unsigned long flags; int ret; - if (!eeh_probe_mode_dev()) + if (!eeh_has_flag(EEH_PROBE_MODE_DEV)) return -EPERM; /* Find the PHB PE */ @@ -801,7 +801,7 @@ int __exit eeh_ops_unregister(const char *name) static int eeh_reboot_notifier(struct notifier_block *nb, unsigned long action, void *unused) { - eeh_set_enable(false); + eeh_clear_flag(EEH_ENABLED); return NOTIFY_DONE; } @@ -865,13 +865,13 @@ int eeh_init(void) return ret; /* Enable EEH for all adapters */ - if (eeh_probe_mode_devtree()) { + if (eeh_has_flag(EEH_PROBE_MODE_DEVTREE)) { list_for_each_entry_safe(hose, tmp, hose_list, list_node) { phb = hose-dn; traverse_pci_devices(phb, eeh_ops-of_probe, NULL); } - } else if (eeh_probe_mode_dev()) { + } else if (eeh_has_flag(EEH_PROBE_MODE_DEV)) { list_for_each_entry_safe(hose, tmp, hose_list, list_node) pci_walk_bus(hose-bus, eeh_ops-dev_probe, NULL); @@ -923,7 +923,7 @@ void eeh_add_device_early(struct device_node *dn) * would delay the probe until late stage because * 
the PCI device isn't available this moment. */ - if (!eeh_probe_mode_devtree()) + if (!eeh_has_flag(EEH_PROBE_MODE_DEVTREE)) return;
[PATCH 3/6] powerpc/eeh: Reduce lines of log dump
The patch prints 4 PCIE or AER config registers each line, which is part of the EEH log so that it looks a bit more compact. Suggested-by: Benjamin Herrenschmidt b...@kernel.crashing.org Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/kernel/eeh.c | 37 +++-- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index aa33656..123c151 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -157,12 +157,13 @@ __setup(eeh=, eeh_setup); * This routine captures assorted PCI configuration space data, * and puts them into a buffer for RTAS error logging. */ -static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len) +static size_t eeh_gather_pci_data(struct eeh_dev *edev, char *buf, size_t len) { struct device_node *dn = eeh_dev_to_of_node(edev); u32 cfg; - int cap, i; + int cap, i, j; int n = 0; + char buffer[128]; n += scnprintf(buf+n, len-n, %s\n, dn-full_name); pr_warn(EEH: of node=%s\n, dn-full_name); @@ -204,10 +205,22 @@ static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len) n += scnprintf(buf+n, len-n, pci-e cap10:\n); pr_warn(EEH: PCI-E capabilities and status follow:\n); - for (i=0; i=8; i++) { + for (i=0, j=0; i=8; i++) { eeh_ops-read_config(dn, cap+4*i, 4, cfg); n += scnprintf(buf+n, len-n, %02x:%x\n, 4*i, cfg); - pr_warn(EEH: PCI-E %02x: %08x\n, i, cfg); + + if ((i % 4) == 0) { + memset(buffer, 0, sizeof(buffer)); + j = scnprintf(buffer, sizeof(buffer), + EEH: PCI-E %02x: %08x , + 4*i, cfg); + } else { + j += scnprintf(buffer+j, sizeof(buffer)-j, + %08x , cfg); + } + + if ((i % 4) == 3 || i = 8) + pr_warn(%s\n, buffer); } } @@ -217,10 +230,22 @@ static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len) n += scnprintf(buf+n, len-n, pci-e AER:\n); pr_warn(EEH: PCI-E AER capability register set follows:\n); - for (i=0; i14; i++) { + for (i=0, j=0; i=13; i++) { eeh_ops-read_config(dn, cap+4*i, 4, 
cfg); n += scnprintf(buf+n, len-n, %02x:%x\n, 4*i, cfg); - pr_warn(EEH: PCI-E AER %02x: %08x\n, i, cfg); + + if ((i % 4) == 0) { + memset(buffer, 0, sizeof(buffer)); + j = scnprintf(buffer, sizeof(buffer), + EEH: PCI-E AER %02x: %08x , + 4*i, cfg); + } else { + j += scnprintf(buffer+j, sizeof(buffer)-j, + %08x , cfg); + } + + if ((i % 4) == 3 || i = 13) + pr_warn(%s\n, buffer); } } -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 0/6] EEH Cleanup
The patchset is EEH cleanup and expected to be merged during 3.17 window. The the patchset is expected to be applied after: |EEH support for guest |2 more bug fixes for EEH support for guest |M64 related EEH changes |2 bug fixes from Mike Qiu | +- The current patchset Except the following 2 patches, all patches are for cleanup: Refactoring EEH log, replacing pr_warning() with pr_warn(), reducing length of EEH log dump etc: PATCH[2/6]: We have to enable I/O path before collecting EEH log. Otherwise, 0xFF is always returned from PCI config of devices in frozen PE. the problem is only existing on PHB3. PATCH[6/6]: It's something related to EEH guest log retrieval. Currently, all PEs in one specific PHB are sharing diag-data blob for storing EEH log. It's possible for diag-data blob overwritten before being collected by guest. The patch introduce auxillary data for PE, which is maintained by backend. On PowerNV, that's used for EEH log. Gavin Shan (6): powerpc/eeh: Refactor EEH flag accessors powerpc/eeh: Selectively enable IO for error log powerpc/eeh: Reduce lines of log dump powerpc/eeh: Replace pr_warning() with pr_warn() powerpc/eeh: Make diag-data not endian dependent powerpc/eeh: Aux PE data for error log arch/powerpc/include/asm/eeh.h | 43 - arch/powerpc/include/asm/opal.h | 128 +-- arch/powerpc/kernel/eeh.c| 73 ++- arch/powerpc/kernel/eeh_cache.c | 9 +- arch/powerpc/kernel/eeh_dev.c| 3 +- arch/powerpc/kernel/eeh_driver.c | 16 ++-- arch/powerpc/kernel/eeh_pe.c | 29 +- arch/powerpc/platforms/powernv/eeh-ioda.c| 103 + arch/powerpc/platforms/powernv/eeh-powernv.c | 32 +-- arch/powerpc/platforms/powernv/pci-ioda.c| 1 - arch/powerpc/platforms/powernv/pci.c | 68 -- arch/powerpc/platforms/pseries/eeh_pseries.c | 32 +++ 12 files changed, 323 insertions(+), 214 deletions(-) -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 5/6] powerpc/eeh: Make diag-data not endian dependent
It's followup of commit ddf0322a (powerpc/powernv: Fix endianness problems in EEH). The patch helps to get non-endian-dependent diag-data. Cc: Guo Chao y...@linux.vnet.ibm.com Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/include/asm/opal.h | 128 +++--- arch/powerpc/platforms/powernv/eeh-ioda.c | 51 +++- arch/powerpc/platforms/powernv/pci.c | 68 ++-- 3 files changed, 139 insertions(+), 108 deletions(-) diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index edbfe1c..f0b5b40 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -520,40 +520,40 @@ enum { }; struct OpalIoP7IOCErrorData { - uint16_t type; + __be16 type; /* GEM */ - uint64_t gemXfir; - uint64_t gemRfir; - uint64_t gemRirqfir; - uint64_t gemMask; - uint64_t gemRwof; + __be64 gemXfir; + __be64 gemRfir; + __be64 gemRirqfir; + __be64 gemMask; + __be64 gemRwof; /* LEM */ - uint64_t lemFir; - uint64_t lemErrMask; - uint64_t lemAction0; - uint64_t lemAction1; - uint64_t lemWof; + __be64 lemFir; + __be64 lemErrMask; + __be64 lemAction0; + __be64 lemAction1; + __be64 lemWof; union { struct OpalIoP7IOCRgcErrorData { - uint64_t rgcStatus; /* 3E1C10 */ - uint64_t rgcLdcp; /* 3E1C18 */ + __be64 rgcStatus; /* 3E1C10 */ + __be64 rgcLdcp; /* 3E1C18 */ }rgc; struct OpalIoP7IOCBiErrorData { - uint64_t biLdcp0; /* 3C0100, 3C0118 */ - uint64_t biLdcp1; /* 3C0108, 3C0120 */ - uint64_t biLdcp2; /* 3C0110, 3C0128 */ - uint64_t biFenceStatus; /* 3C0130, 3C0130 */ + __be64 biLdcp0; /* 3C0100, 3C0118 */ + __be64 biLdcp1; /* 3C0108, 3C0120 */ + __be64 biLdcp2; /* 3C0110, 3C0128 */ + __be64 biFenceStatus; /* 3C0130, 3C0130 */ - uint8_t biDownbound; /* BI Downbound or Upbound */ + u8 biDownbound; /* BI Downbound or Upbound */ }bi; struct OpalIoP7IOCCiErrorData { - uint64_t ciPortStatus; /* 3Dn008 */ - uint64_t ciPortLdcp;/* 3Dn010 */ + __be64 ciPortStatus;/* 3Dn008 */ + __be64 ciPortLdcp; /* 3Dn010 */ - uint8_t ciPort;/* Index of CI port: 0/1 */ + 
u8 ciPort; /* Index of CI port: 0/1 */ }ci; }; }; @@ -585,60 +585,60 @@ struct OpalIoPhbErrorCommon { struct OpalIoP7IOCPhbErrorData { struct OpalIoPhbErrorCommon common; - uint32_t brdgCtl; + __be32 brdgCtl; // P7IOC utl regs - uint32_t portStatusReg; - uint32_t rootCmplxStatus; - uint32_t busAgentStatus; + __be32 portStatusReg; + __be32 rootCmplxStatus; + __be32 busAgentStatus; // P7IOC cfg regs - uint32_t deviceStatus; - uint32_t slotStatus; - uint32_t linkStatus; - uint32_t devCmdStatus; - uint32_t devSecStatus; + __be32 deviceStatus; + __be32 slotStatus; + __be32 linkStatus; + __be32 devCmdStatus; + __be32 devSecStatus; // cfg AER regs - uint32_t rootErrorStatus; - uint32_t uncorrErrorStatus; - uint32_t corrErrorStatus; - uint32_t tlpHdr1; - uint32_t tlpHdr2; - uint32_t tlpHdr3; - uint32_t tlpHdr4; - uint32_t sourceId; + __be32 rootErrorStatus; + __be32 uncorrErrorStatus; + __be32 corrErrorStatus; + __be32 tlpHdr1; + __be32 tlpHdr2; + __be32 tlpHdr3; + __be32 tlpHdr4; + __be32 sourceId; - uint32_t rsv3; + __be32 rsv3; // Record data about the call to allocate a buffer. - uint64_t errorClass; - uint64_t correlator; + __be64 errorClass; + __be64 correlator; //P7IOC MMIO Error Regs - uint64_t p7iocPlssr;// n120 - uint64_t p7iocCsr; // n110 - uint64_t lemFir;// nC00 - uint64_t lemErrorMask; // nC18 - uint64_t lemWOF;// nC40 - uint64_t phbErrorStatus;// nC80 - uint64_t phbFirstErrorStatus; // nC88 - uint64_t phbErrorLog0; // nCC0 - uint64_t phbErrorLog1; // nCC8 - uint64_t mmioErrorStatus; //
[PATCH 6/6] powerpc/eeh: Aux PE data for error log
The patch allows a PE (struct eeh_pe) instance to have auxiliary data, whose size is configurable on a per-platform basis. For PowerNV, the auxiliary data will be used to cache PHB diag-data for that PE (frozen PE or fenced PHB). In turn, we can retrieve the diag-data at any later point. It's useful for the case of VFIO PCI devices, where the error log should be cached and then retrieved by the guest at a later point. Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/include/asm/eeh.h | 2 ++ arch/powerpc/kernel/eeh_pe.c | 26 - arch/powerpc/platforms/powernv/eeh-ioda.c| 42 +++- arch/powerpc/platforms/powernv/eeh-powernv.c | 3 +- 4 files changed, 58 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 494c3ff..9983c3d 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -88,6 +88,7 @@ struct eeh_pe { int false_positives;/* Times of reported #ff's */ atomic_t pass_dev_cnt; /* Count of passed through devs */ struct eeh_pe *parent; /* Parent PE*/ + void *data; /* PE auxillary data*/ struct list_head child_list;/* Link PE to the child list*/ struct list_head edevs; /* Link list of EEH devices */ struct list_head child; /* Child PEs*/ @@ -248,6 +249,7 @@ static inline void eeh_serialize_unlock(unsigned long flags) #define EEH_MAX_ALLOWED_FREEZES 5 typedef void *(*eeh_traverse_func)(void *data, void *flag); +void eeh_set_pe_aux_size(int size); int eeh_phb_pe_create(struct pci_controller *phb); struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb); struct eeh_pe *eeh_pe_get(struct eeh_dev *edev); diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 77632ab..00e3844 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -32,9 +32,24 @@ #include <asm/pci-bridge.h> #include <asm/ppc-pci.h> +static int eeh_pe_aux_size = 0; static LIST_HEAD(eeh_phb_pe); /** + * eeh_set_pe_aux_size - Set PE auxillary data size + * @size: PE 
auxillary data size + * + * Set PE auxillary data size + */ +void eeh_set_pe_aux_size(int size) +{ + if (size < 0) + return; + + eeh_pe_aux_size = size; +} + +/** * eeh_pe_alloc - Allocate PE * @phb: PCI controller * @type: PE type @@ -44,9 +59,16 @@ static LIST_HEAD(eeh_phb_pe); static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type) { struct eeh_pe *pe; + size_t alloc_size; + + alloc_size = sizeof(struct eeh_pe); + if (eeh_pe_aux_size) { + alloc_size = ALIGN(alloc_size, cache_line_size()); + alloc_size += eeh_pe_aux_size; + } /* Allocate PHB PE */ - pe = kzalloc(sizeof(struct eeh_pe), GFP_KERNEL); + pe = kzalloc(alloc_size, GFP_KERNEL); if (!pe) return NULL; /* Initialize PHB PE */ @@ -56,6 +78,8 @@ static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type) INIT_LIST_HEAD(&pe->child); INIT_LIST_HEAD(&pe->edevs); + pe->data = (void *)pe + ALIGN(sizeof(struct eeh_pe), + cache_line_size()); return pe; } diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index bccdf60..b4624cf 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -236,20 +236,16 @@ static int ioda_eeh_set_option(struct eeh_pe *pe, int option) return ret; } -static void ioda_eeh_phb_diag(struct pci_controller *hose) +static void ioda_eeh_phb_diag(struct eeh_pe *pe) { - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pe->phb->private_data; long rc; - rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, + rc = opal_pci_get_phb_diag_data2(phb->opal_id, pe->data, PNV_PCI_DIAG_BUF_SIZE); - if (rc != OPAL_SUCCESS) { + if (rc != OPAL_SUCCESS) pr_warn("%s: Failed to get diag-data for PHB#%x (%ld)\n", - __func__, hose->global_number, rc); - return; - } - - pnv_pci_dump_phb_diag_data(hose, phb->diag.blob); + __func__, pe->phb->global_number, rc); } static int ioda_eeh_get_phb_state(struct eeh_pe *pe) @@ -282,7 +278,7 @@ 
EEH_STATE_DMA_ENABLED); } else if (!(pe->state & EEH_PE_ISOLATED)) { eeh_pe_state_mark(pe, EEH_PE_ISOLATED); - ioda_eeh_phb_diag(phb->hose); + ioda_eeh_phb_diag(pe); } return result; @@ -380,7 +376,7 @@ static int
[PATCH 2/6] powerpc/eeh: Selectively enable IO for error log
According to the experiment I did, PCI config access is blocked on P7IOC frozen PE by hardware, but PHB3 doesn't do that. That means we always get 0xFF's while dumping PCI config space of the frozen PE on P7IOC. We don't have the problem on PHB3. So we have to enable I/O prior to collecting error log. Otherwise, meaningless 0xFF's are always returned. The patch fixes it by EEH flag (EEH_ENABLE_IO_FOR_LOG), which is selectively set to indicate the case for: P7IOC on PowerNV platform, pSeries platform. Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/include/asm/eeh.h | 9 + arch/powerpc/kernel/eeh.c| 2 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 16 arch/powerpc/platforms/pseries/eeh_pseries.c | 2 +- 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index ca8aada..494c3ff 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -34,10 +34,11 @@ struct device_node; #ifdef CONFIG_EEH /* EEH subsystem flags */ -#define EEH_ENABLED0x1 /* EEH enabled */ -#define EEH_FORCE_DISABLED 0x2 /* EEH disabled */ -#define EEH_PROBE_MODE_DEV 0x4 /* From PCI device */ -#define EEH_PROBE_MODE_DEVTREE 0x8 /* From device tree */ +#define EEH_ENABLED0x01/* EEH enabled */ +#define EEH_FORCE_DISABLED 0x02/* EEH disabled */ +#define EEH_PROBE_MODE_DEV 0x04/* From PCI device */ +#define EEH_PROBE_MODE_DEVTREE 0x08/* From device tree */ +#define EEH_ENABLE_IO_FOR_LOG 0x10/* Enable IO for log*/ /* * Delay for PE reset, all in ms diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 65a163f..aa33656 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -252,7 +252,7 @@ void eeh_slot_error_detail(struct eeh_pe *pe, int severity) * 0xFF's is always returned from PCI config space. 
*/ if (!(pe->type & EEH_PE_PHB)) { - if (eeh_has_flag(EEH_PROBE_MODE_DEVTREE)) + if (eeh_has_flag(EEH_ENABLE_IO_FOR_LOG)) eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); eeh_ops->configure_bridge(pe); eeh_pe_restore_bars(pe); diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index ba134ac..740c396 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -45,6 +45,9 @@ */ static int powernv_eeh_init(void) { + struct pci_controller *hose; + struct pnv_phb *phb; + /* We require OPALv3 */ if (!firmware_has_feature(FW_FEATURE_OPALv3)) { pr_warning("%s: OPALv3 is required !\n", __func__); @@ -54,6 +57,19 @@ static int powernv_eeh_init(void) /* Set probe mode */ eeh_add_flag(EEH_PROBE_MODE_DEV); + /* +* P7IOC blocks PCI config access to frozen PE, but PHB3 +* doesn't do that. So we have to selectively enable I/O +* prior to collecting error log. +*/ + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + + if (phb->model == PNV_PHB_MODEL_P7IOC) + eeh_add_flag(EEH_ENABLE_IO_FOR_LOG); + break; + } + return 0; } diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index f173705..1e15cdd 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -128,7 +128,7 @@ static int pseries_eeh_init(void) } /* Set EEH probe mode */ - eeh_add_flag(EEH_PROBE_MODE_DEVTREE); + eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG); return 0; } -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev