Re: [PATCH V4 0/2] mm: FAULT_AROUND_ORDER patchset performance data for powerpc

2014-05-27 Thread Madhavan Srinivasan
On Tuesday 20 May 2014 03:57 PM, Kirill A. Shutemov wrote:
 Rusty Russell wrote:
 Kirill A. Shutemov kirill.shute...@linux.intel.com writes:
 Andrew Morton wrote:
 On Mon, 19 May 2014 16:23:07 -0700 (PDT) Hugh Dickins hu...@google.com 
 wrote:

 Shouldn't FAULT_AROUND_ORDER and fault_around_order be changed to be
 the order of the fault-around size in bytes, and fault_around_pages()
 use 1UL << (fault_around_order - PAGE_SHIFT)

 Yes.  And shame on me for missing it (this time!) at review.

 There's still time to fix this.  Patches, please.

 Here it is. Made at 3.30 AM, build tested only.

 Prefer on top of Maddy's patch which makes it always a variable, rather
 than CONFIG_DEBUG_FS.  It's got enough hair as it is.
 
 Something like this?
 
 From: Kirill A. Shutemov kirill.shute...@linux.intel.com
 Date: Tue, 20 May 2014 13:02:03 +0300
 Subject: [PATCH] mm: nominate faultaround area in bytes rather than page order
 
 There is evidence that the faultaround feature is less relevant on
 architectures with page size bigger than 4k. This makes sense, since
 page fault overhead per byte of mapped area should be lower there.
 
 Let's rework the feature to specify faultaround area in bytes instead of
 page order. It's 64 kilobytes for now.
 
 The patch effectively disables faultaround on architectures with
 page size >= 64k (like ppc64).
 
 It's possible that some other size of faultaround area is relevant for a
 platform. We can expose the `fault_around_bytes' variable to arch-specific
 code once such platforms are found.
 
 Signed-off-by: Kirill A. Shutemov kirill.shute...@linux.intel.com
 ---
  mm/memory.c | 62 
 +++--
  1 file changed, 23 insertions(+), 39 deletions(-)
 
 diff --git a/mm/memory.c b/mm/memory.c
 index 037b812a9531..252b319e8cdf 100644
 --- a/mm/memory.c
 +++ b/mm/memory.c
 @@ -3402,63 +3402,47 @@ void do_set_pte(struct vm_area_struct *vma, unsigned 
 long address,
   update_mmu_cache(vma, address, pte);
  }
 
 -#define FAULT_AROUND_ORDER 4
 +static unsigned long fault_around_bytes = 65536;
 +
 +static inline unsigned long fault_around_pages(void)
 +{
 + return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE;
 +}
 +
 +static inline unsigned long fault_around_mask(void)
 +{
 + return ~(rounddown_pow_of_two(fault_around_bytes) - 1)  PAGE_MASK;
 +}
 
 -#ifdef CONFIG_DEBUG_FS
 -static unsigned int fault_around_order = FAULT_AROUND_ORDER;
 
 -static int fault_around_order_get(void *data, u64 *val)
 +#ifdef CONFIG_DEBUG_FS
 +static int fault_around_bytes_get(void *data, u64 *val)
  {
 - *val = fault_around_order;
 + *val = fault_around_bytes;
   return 0;
  }
 
 -static int fault_around_order_set(void *data, u64 val)
 +static int fault_around_bytes_set(void *data, u64 val)
  {

Kindly ignore the question if it is not relevant. Even though root
access is needed to alter the value, will we be fine with a
negative value?

Regards
Maddy

 - BUILD_BUG_ON((1UL  FAULT_AROUND_ORDER)  PTRS_PER_PTE);
 - if (1UL  val  PTRS_PER_PTE)
 + if (val / PAGE_SIZE  PTRS_PER_PTE)
   return -EINVAL;
 - fault_around_order = val;
 + fault_around_bytes = val;
   return 0;
  }
 -DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops,
 - fault_around_order_get, fault_around_order_set, %llu\n);
 +DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
 + fault_around_bytes_get, fault_around_bytes_set, %llu\n);
 
  static int __init fault_around_debugfs(void)
  {
   void *ret;
 
 - ret = debugfs_create_file(fault_around_order, 0644, NULL, NULL,
 - fault_around_order_fops);
 + ret = debugfs_create_file(fault_around_bytes, 0644, NULL, NULL,
 + fault_around_bytes_fops);
   if (!ret)
 - pr_warn(Failed to create fault_around_order in debugfs);
 + pr_warn(Failed to create fault_around_bytes in debugfs);
   return 0;
  }
  late_initcall(fault_around_debugfs);
 -
 -static inline unsigned long fault_around_pages(void)
 -{
 - return 1UL  fault_around_order;
 -}
 -
 -static inline unsigned long fault_around_mask(void)
 -{
 - return ~((1UL  (PAGE_SHIFT + fault_around_order)) - 1);
 -}
 -#else
 -static inline unsigned long fault_around_pages(void)
 -{
 - unsigned long nr_pages;
 -
 - nr_pages = 1UL  FAULT_AROUND_ORDER;
 - BUILD_BUG_ON(nr_pages  PTRS_PER_PTE);
 - return nr_pages;
 -}
 -
 -static inline unsigned long fault_around_mask(void)
 -{
 - return ~((1UL  (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1);
 -}
  #endif
 
  static void do_fault_around(struct vm_area_struct *vma, unsigned long 
 address,
 @@ -3515,7 +3499,7 @@ static int do_read_fault(struct mm_struct *mm, struct 
 vm_area_struct *vma,
* if page by the offset is not ready to be mapped (cold cache or
* something).
*/
 - if (vma-vm_ops-map_pages) {
 + if (vma-vm_ops-map_pages  fault_around_pages()  1) {
   pte = 

[RFT PATCH -next v2] [BUGFIX] kprobes: Fix "Failed to find blacklist" error on ia64 and ppc64

2014-05-27 Thread Masami Hiramatsu
On ia64 and ppc64, a function pointer does not point to the
entry address of the function, but to the address of the function
descriptor (which contains the entry address and misc
data). Since kprobes passes the function pointer stored
by NOKPROBE_SYMBOL() to kallsyms_lookup_size_offset() for
initializing its blacklist, the lookup fails and reports many errors
as below.

  Failed to find blacklist 000101316830
  Failed to find blacklist 0001013000f0a000
  Failed to find blacklist 000101315f70a000
  Failed to find blacklist 000101324c80a000
  Failed to find blacklist 0001013063f0a000
  Failed to find blacklist 000101327800a000
  Failed to find blacklist 0001013277f0a000
  Failed to find blacklist 000101315a70a000
  Failed to find blacklist 0001013277e0a000
  Failed to find blacklist 000101305a20a000
  Failed to find blacklist 0001013277d0a000
  Failed to find blacklist 00010130bdc0a000
  Failed to find blacklist 00010130dc20a000
  Failed to find blacklist 000101309a00a000
  Failed to find blacklist 0001013277c0a000
  Failed to find blacklist 0001013277b0a000
  Failed to find blacklist 0001013277a0a000
  Failed to find blacklist 000101327790a000
  Failed to find blacklist 000101303140a000
  Failed to find blacklist 0001013a3280a000

To fix this bug, introduce a function_entry() macro that
retrieves the entry address from the given function pointer,
and use it for kallsyms_lookup_size_offset() while initializing
the blacklist.

Changes in V2:
 - Use the function_entry() macro when looking up symbols instead
   of storing it.
 - Update for the latest -next.

Signed-off-by: Masami Hiramatsu masami.hiramatsu...@hitachi.com
Reported-by: Tony Luck tony.l...@gmail.com
Cc: Suzuki K. Poulose suz...@in.ibm.com
Cc: Tony Luck tony.l...@intel.com
Cc: Fenghua Yu fenghua...@intel.com
Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
Cc: Paul Mackerras pau...@samba.org
Cc: Ananth N Mavinakayanahalli ana...@in.ibm.com
Cc: Kevin Hao haoke...@gmail.com
Cc: linux-i...@vger.kernel.org
Cc: linux-ker...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
---
 arch/ia64/include/asm/types.h|2 ++
 arch/powerpc/include/asm/types.h |   11 +++
 include/linux/types.h|4 
 kernel/kprobes.c |4 +++-
 4 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/ia64/include/asm/types.h b/arch/ia64/include/asm/types.h
index 4c351b1..95279dd 100644
--- a/arch/ia64/include/asm/types.h
+++ b/arch/ia64/include/asm/types.h
@@ -27,5 +27,7 @@ struct fnptr {
unsigned long gp;
 };
 
+#define function_entry(fn) (((struct fnptr *)(fn))-ip)
+
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_IA64_TYPES_H */
diff --git a/arch/powerpc/include/asm/types.h b/arch/powerpc/include/asm/types.h
index bfb6ded..8b89d65 100644
--- a/arch/powerpc/include/asm/types.h
+++ b/arch/powerpc/include/asm/types.h
@@ -25,6 +25,17 @@ typedef struct {
unsigned long env;
 } func_descr_t;
 
+#if defined(CONFIG_PPC64)  (!defined(_CALL_ELF) || _CALL_ELF == 1)
+/*
+ * On PPC64 ABIv1 the function pointer actually points to the
+ * function's descriptor. The first entry in the descriptor is the
+ * address of the function text.
+ */
+#define function_entry(fn) (((func_descr_t *)(fn))-entry)
+#else
+#define function_entry(fn) ((unsigned long)(fn))
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_TYPES_H */
diff --git a/include/linux/types.h b/include/linux/types.h
index a0bb704..3b95369 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -213,5 +213,9 @@ struct callback_head {
 };
 #define rcu_head callback_head
 
+#ifndef function_entry
+#define function_entry(fn) ((unsigned long)(fn))
+#endif
+
 #endif /*  __ASSEMBLY__ */
 #endif /* _LINUX_TYPES_H */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2ac9f13..3859c88 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -32,6 +32,7 @@
  * prasa...@in.ibm.com added function-return probes.
  */
 #include linux/kprobes.h
+#include linux/types.h
 #include linux/hash.h
 #include linux/init.h
 #include linux/slab.h
@@ -2042,7 +2043,8 @@ static int __init populate_kprobe_blacklist(unsigned long 
*start,
unsigned long offset = 0, size = 0;
 
for (iter = start; iter  end; iter++) {
-   if (!kallsyms_lookup_size_offset(*iter, size, offset)) {
+   if (!kallsyms_lookup_size_offset(function_entry(*iter),
+size, offset)) {
pr_err(Failed to find blacklist %p\n, (void *)*iter);
continue;
}


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: Build regressions/improvements in v3.15-rc7

2014-05-27 Thread Geert Uytterhoeven
On Tue, 27 May 2014, Geert Uytterhoeven wrote:
 JFYI, when comparing v3.15-rc7[1]  to v3.15-rc6[3], the summaries are:
   - build errors: +4/-1

  + /scratch/kisskb/src/drivers/tty/serial/nwpserial.c: error: implicit 
declaration of function 'udelay' [-Werror=implicit-function-declaration]:  = 
53:3

powerpc-randconfig

  + error: No rule to make target drivers/scsi/aic7xxx/aicasm/*.[chyl]:  = N/A

i386-randconfig

  + error: ion.c: undefined reference to `vm_insert_pfn':  = .text+0x66f4c4)
  + error: ion.c: undefined reference to `zap_page_range':  = .text+0x672794)

sh-randconfig

 [1] http://kisskb.ellerman.id.au/kisskb/head/7506/ (all 119 configs)
 [3] http://kisskb.ellerman.id.au/kisskb/head/7497/ (all 119 configs)

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say programmer or something like that.
-- Linus Torvalds
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v7 2/3] powerpc/eeh: EEH support for VFIO PCI device

2014-05-27 Thread Gavin Shan
The patch exports functions to be used by new ioctl commands, which
will be introduced in a subsequent patch, to support EEH functionality
for VFIO PCI devices.

Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/eeh.h |  15 +++
 arch/powerpc/kernel/eeh.c  | 286 +
 2 files changed, 301 insertions(+)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 34a2d83..ffc95e7 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -26,6 +26,7 @@
 #include linux/string.h
 #include linux/time.h
 
+struct iommu_table;
 struct pci_dev;
 struct pci_bus;
 struct device_node;
@@ -191,6 +192,8 @@ enum {
 #define EEH_OPT_ENABLE 1   /* EEH enable   */
 #define EEH_OPT_THAW_MMIO  2   /* MMIO enable  */
 #define EEH_OPT_THAW_DMA   3   /* DMA enable   */
+#define EEH_OPT_GET_PE_ADDR0   /* Get PE addr  */
+#define EEH_OPT_GET_PE_MODE1   /* Get PE mode  */
 #define EEH_STATE_UNAVAILABLE  (1  0)/* State unavailable*/
 #define EEH_STATE_NOT_SUPPORT  (1  1)/* EEH not supported*/
 #define EEH_STATE_RESET_ACTIVE (1  2)/* Active reset */
@@ -198,6 +201,11 @@ enum {
 #define EEH_STATE_DMA_ACTIVE   (1  4)/* Active DMA   */
 #define EEH_STATE_MMIO_ENABLED (1  5)/* MMIO enabled */
 #define EEH_STATE_DMA_ENABLED  (1  6)/* DMA enabled  */
+#define EEH_PE_STATE_NORMAL0   /* Normal state */
+#define EEH_PE_STATE_RESET 1   /* PE reset */
+#define EEH_PE_STATE_STOPPED_IO_DMA2   /* Stopped  */
+#define EEH_PE_STATE_STOPPED_DMA   4   /* Stopped DMA  */
+#define EEH_PE_STATE_UNAVAIL   5   /* Unavailable  */
 #define EEH_RESET_DEACTIVATE   0   /* Deactivate the PE reset  */
 #define EEH_RESET_HOT  1   /* Hot reset*/
 #define EEH_RESET_FUNDAMENTAL  3   /* Fundamental reset*/
@@ -305,6 +313,13 @@ void eeh_add_device_late(struct pci_dev *);
 void eeh_add_device_tree_late(struct pci_bus *);
 void eeh_add_sysfs_files(struct pci_bus *);
 void eeh_remove_device(struct pci_dev *);
+int eeh_dev_open(struct pci_dev *pdev);
+void eeh_dev_release(struct pci_dev *pdev);
+struct eeh_pe *eeh_iommu_table_to_pe(struct iommu_table *tbl);
+int eeh_pe_set_option(struct eeh_pe *pe, int option);
+int eeh_pe_get_state(struct eeh_pe *pe);
+int eeh_pe_reset(struct eeh_pe *pe, int option);
+int eeh_pe_configure(struct eeh_pe *pe);
 
 /**
  * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 3bc8b12..30693c1 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -40,6 +40,7 @@
 #include asm/eeh.h
 #include asm/eeh_event.h
 #include asm/io.h
+#include asm/iommu.h
 #include asm/machdep.h
 #include asm/ppc-pci.h
 #include asm/rtas.h
@@ -108,6 +109,9 @@ struct eeh_ops *eeh_ops = NULL;
 /* Lock to avoid races due to multiple reports of an error */
 DEFINE_RAW_SPINLOCK(confirm_error_lock);
 
+/* Lock to protect passed flags */
+static DEFINE_MUTEX(eeh_dev_mutex);
+
 /* Buffer for reporting pci register dumps. Its here in BSS, and
  * not dynamically alloced, so that it ends up in RMO where RTAS
  * can access it.
@@ -1106,6 +1110,288 @@ void eeh_remove_device(struct pci_dev *dev)
edev-mode = ~EEH_DEV_SYSFS;
 }
 
+/**
+ * eeh_dev_open - Mark EEH device and PE as passed through
+ * @pdev: PCI device
+ *
+ * Mark the indicated EEH device and PE as passed through.
+ * In the result, the EEH errors detected on the PE won't be
+ * reported. The owner of the device will be responsible for
+ * detection and recovery.
+ */
+int eeh_dev_open(struct pci_dev *pdev)
+{
+   struct eeh_dev *edev;
+
+   mutex_lock(eeh_dev_mutex);
+
+   /* No PCI device ? */
+   if (!pdev) {
+   mutex_unlock(eeh_dev_mutex);
+   return -ENODEV;
+   }
+
+   /* No EEH device ? */
+   edev = pci_dev_to_eeh_dev(pdev);
+   if (!edev || !edev-pe) {
+   mutex_unlock(eeh_dev_mutex);
+   return -ENODEV;
+   }
+
+   eeh_dev_set_passed(edev, true);
+   eeh_pe_set_passed(edev-pe, true);
+   mutex_unlock(eeh_dev_mutex);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_dev_open);
+
+/**
+ * eeh_dev_release - Reclaim the ownership of EEH device
+ * @pdev: PCI device
+ *
+ * Reclaim ownership of EEH device, potentially the corresponding
+ * PE. In the result, the EEH errors detected on the PE will be
+ * reported and handled as usual.
+ */
+void eeh_dev_release(struct pci_dev *pdev)
+{
+   bool release_pe = true;
+   struct eeh_pe *pe = NULL;
+   struct eeh_dev *tmp, *edev;
+
+   mutex_lock(eeh_dev_mutex);
+
+   /* No PCI device ? */
+   if (!pdev) {
+   

[PATCH v7 1/3] powerpc/eeh: Avoid event on passed PE

2014-05-27 Thread Gavin Shan
If we detect a frozen state on a PE that has been passed through to somebody
else, we needn't handle it. Instead, we rely on the device's owner to
detect and recover it. The patch avoids an EEH event on the frozen passed PE
so that the device's owner has a chance to handle it.

Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/eeh.h| 32 +++
 arch/powerpc/kernel/eeh.c |  8 
 arch/powerpc/platforms/powernv/eeh-ioda.c |  3 ++-
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 7782056..34a2d83 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -72,6 +72,7 @@ struct device_node;
 #define EEH_PE_RESET   (1  2)/* PE reset in progress */
 
 #define EEH_PE_KEEP(1  8)/* Keep PE on hotplug   */
+#define EEH_PE_PASSTHROUGH (1  9)/* PE owned by guest*/
 
 struct eeh_pe {
int type;   /* PE type: PHB/Bus/Device  */
@@ -93,6 +94,21 @@ struct eeh_pe {
 #define eeh_pe_for_each_dev(pe, edev, tmp) \
list_for_each_entry_safe(edev, tmp, pe-edevs, list)
 
+static inline bool eeh_pe_passed(struct eeh_pe *pe)
+{
+   return pe ? !!(pe-state  EEH_PE_PASSTHROUGH) : false;
+}
+
+static inline void eeh_pe_set_passed(struct eeh_pe *pe, bool passed)
+{
+   if (pe) {
+   if (passed)
+   pe-state |= EEH_PE_PASSTHROUGH;
+   else
+   pe-state = ~EEH_PE_PASSTHROUGH;
+   }
+}
+
 /*
  * The struct is used to trace EEH state for the associated
  * PCI device node or PCI device. In future, it might
@@ -110,6 +126,7 @@ struct eeh_pe {
 #define EEH_DEV_SYSFS  (1  9)/* Sysfs created*/
 #define EEH_DEV_REMOVED(1  10)   /* Removed permanently  
*/
 #define EEH_DEV_FRESET (1  11)   /* Fundamental reset*/
+#define EEH_DEV_PASSTHROUGH(1  12)   /* Owned by guest   */
 
 struct eeh_dev {
int mode;   /* EEH mode */
@@ -138,6 +155,21 @@ static inline struct pci_dev *eeh_dev_to_pci_dev(struct 
eeh_dev *edev)
return edev ? edev-pdev : NULL;
 }
 
+static inline bool eeh_dev_passed(struct eeh_dev *dev)
+{
+   return dev ? !!(dev-mode  EEH_DEV_PASSTHROUGH) : false;
+}
+
+static inline void eeh_dev_set_passed(struct eeh_dev *dev, bool passed)
+{
+   if (dev) {
+   if (passed)
+   dev-mode |= EEH_DEV_PASSTHROUGH;
+   else
+   dev-mode = ~EEH_DEV_PASSTHROUGH;
+   }
+}
+
 /* Return values from eeh_ops::next_error */
 enum {
EEH_NEXT_ERR_NONE = 0,
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9c6b899..3bc8b12 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -400,6 +400,14 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
if (ret  0)
return ret;
 
+   /*
+* If the PE isn't owned by us, we shouldn't check the
+* state. Instead, let the owner handle it if the PE has
+* been frozen.
+*/
+   if (eeh_pe_passed(pe))
+   return 0;
+
/* If we already have a pending isolation event for this
 * slot, we know it's bad already, we don't need to check.
 * Do this checking under a lock; as multiple PCI devices
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c 
b/arch/powerpc/platforms/powernv/eeh-ioda.c
index cab3e62..79193eb 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -892,7 +892,8 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
opal_pci_eeh_freeze_clear(phb-opal_id, 
frozen_pe_no,
OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
ret = EEH_NEXT_ERR_NONE;
-   } else if ((*pe)-state  EEH_PE_ISOLATED) {
+   } else if ((*pe)-state  EEH_PE_ISOLATED ||
+  eeh_pe_passed(*pe)) {
ret = EEH_NEXT_ERR_NONE;
} else {
pr_err(EEH: Frozen PHB#%x-PE#%x (%s) 
detected\n,
-- 
1.8.3.2

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Gavin Shan
The patch adds new IOCTL commands for sPAPR VFIO container device
to support EEH functionality for PCI devices, which have been passed
through from host to somebody else via VFIO.

Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 Documentation/vfio.txt  | 92 -
 drivers/vfio/pci/Makefile   |  1 +
 drivers/vfio/pci/vfio_pci.c | 20 +---
 drivers/vfio/pci/vfio_pci_eeh.c | 46 +++
 drivers/vfio/pci/vfio_pci_private.h |  5 ++
 drivers/vfio/vfio_iommu_spapr_tce.c | 85 ++
 include/uapi/linux/vfio.h   | 66 ++
 7 files changed, 308 insertions(+), 7 deletions(-)
 create mode 100644 drivers/vfio/pci/vfio_pci_eeh.c

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index b9ca023..d890fed 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in 
real mode which provides
 an excellent performance which has limitations such as inability to do
 locked pages accounting in real time.
 
-So 3 additional ioctls have been added:
+4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
+subtree that can be treated as a unit for the purposes of partitioning and
+error recovery. A PE may be a single or multi-function IOA (IO Adapter), a
+function of a multi-function IOA, or multiple IOAs (possibly including switch
+and bridge structures above the multiple IOAs). PPC64 guests detect PCI errors
+and recover from them via EEH RTAS services, which works on the basis of
+additional ioctl commands.
+
+So 7 additional ioctls have been added:
 
VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
of the DMA window on the PCI bus.
@@ -316,6 +324,17 @@ So 3 additional ioctls have been added:
 
VFIO_IOMMU_DISABLE - disables the container.
 
+   VFIO_EEH_PE_SET_OPTION - enables or disables EEH functionality on the
+   specified device. Also, it can be used to remove IO or DMA
+   stopped state on the frozen PE.
+
+   VFIO_EEH_PE_GET_STATE - retrieve PE's state: frozen or normal state.
+
+   VFIO_EEH_PE_RESET - do PE reset, which is one of the major steps for
+   error recovering.
+
+   VFIO_EEH_PE_CONFIGURE - configure the PCI bridges after PE reset. It's
+   one of the major steps for error recovery.
 
 The code flow from the example above should be slightly changed:
 
@@ -346,6 +365,77 @@ The code flow from the example above should be slightly 
changed:
ioctl(container, VFIO_IOMMU_MAP_DMA, dma_map);
.
 
+Based on the initial example we have, the following piece of code could be
+reference for EEH setup and error handling:
+
+   struct vfio_eeh_pe_set_option option = { .argsz = sizeof(option) };
+   struct vfio_eeh_pe_get_state state = { .argsz = sizeof(state) };
+   struct vfio_eeh_pe_reset reset = { .argsz = sizeof(reset) };
+   struct vfio_eeh_pe_configure configure = { .argsz = sizeof(configure) };
+
+   
+
+   /* Get a file descriptor for the device */
+   device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, :06:0d.0);
+
+   /* Enable the EEH functionality on the device */
+   option.option = VFIO_EEH_PE_SET_OPT_ENABLE;
+   ioctl(container, VFIO_EEH_PE_SET_OPTION, option);
+
+   /* You're suggested to create additional data struct to represent
+* PE, and put child devices belonging to same IOMMU group to the
+* PE instance for later reference.
+*/
+
+   /* Check the PE's state and make sure it's in functional state */
+   ioctl(container, VFIO_EEH_PE_GET_STATE, state);
+
+   /* Save device's state. pci_save_state() would be good enough
+* as an example.
+*/
+
+   /* Test and setup the device */
+   ioctl(device, VFIO_DEVICE_GET_INFO, device_info);
+
+   
+
+   /* When 0xFF's returned from reading PCI config space or IO BARs
+* of the PCI device. Check the PE state to see if that has been
+* frozen.
+*/
+   ioctl(container, VFIO_EEH_PE_GET_STATE, state);
+
+   /* Waiting for pending PCI transactions to be completed and don't
+* produce any more PCI traffic from/to the affected PE until
+* recovery is finished.
+*/
+
+   /* Enable IO for the affected PE and collect logs. Usually, the
+* standard part of PCI config space, AER registers are dumped
+* as logs for further analysis.
+*/
+   option.option = VFIO_EEH_PE_SET_OPT_IO;
+   ioctl(container, VFIO_EEH_PE_SET_OPTION, option);
+
+   /* Issue PE reset */
+   reset.option = VFIO_EEH_PE_RESET_HOT;
+   ioctl(container, VFIO_EEH_PE_RESET, reset);
+   reset.option = VFIO_EEH_PE_RESET_DEACTIVATE;
+   ioctl(container, VFIO_EEH_PE_RESET, reset);
+
+   /* Configure the 

[PATCH v7 0/3] EEH Support for VFIO PCI Device

2014-05-27 Thread Gavin Shan
The series of patches intends to support EEH for PCI devices, which are
passed through to PowerKVM based guest via VFIO. The implementation is
straightforward based on the issues or problems we have to resolve to
support EEH for PowerKVM based guest.

- Emulation for EEH RTAS requests. All EEH RTAS requests go to QEMU first.
  If QEMU can't handle a request, it is sent to the host via the newly introduced
  VFIO container IOCTL command (VFIO_EEH_OP) and handled in the host kernel.

The series of patches requires corresponding QEMU changes.

Change log
==
v1 - v2:
* EEH RTAS requests are routed to QEMU, and then possibly to the host 
kernel.
  The mechanism of KVM in-kernel handling is dropped.
* Error injection is reimplemented based on a syscall, instead of KVM 
in-kernel
  handling. The logic for error injection token management is moved to
  QEMU. The error injection request is routed to QEMU and then possibly
  to the host kernel.
v2 - v3:
* Make the fields in struct eeh_vfio_pci_addr, struct vfio_eeh_info 
based
  on the comments from Alexey.
* Define macros for EEH VFIO operations (Alexey).
* Clear frozen state after successful PE reset. 
* Merge original [PATCH 1/2/3] to one.
v3 - v4:
* Remove the error injection from the patchset. Mike or I will work on 
that
  later.
* Rename CONFIG_VFIO_EEH to VFIO_PCI_EEH.
* Rename the IOCTL command to VFIO_EEH_OP and it's handled by VFIO-PCI 
device
  instead of VFIO container.
* Rename the IOCTL argument structure to vfio_eeh_op accordingly. 
Also, more
  fields added to hold return values for RTAS requests.
* The address mapping stuff is totally removed. When opening or 
releasing VFIO
  PCI device, notification sent to EEH to update the flags indicates 
the device
  is passed to guest or not.
* Change pr_warn() to pr_debug() to avoid DOS as pointed by Alex.W
* Argument size check issue pointed by Alex.W.
v4 - v5:
* Functions for VFIO PCI EEH support are moved to eeh.c and exported 
from there.
  VFIO PCI driver just uses those functions to tackle IOCTL command 
VFIO_EEH_OP.
  All of this is to make the code organized in a good way as suggested 
by Alex.G.
  Another potential benefit is that PowerNV/pSeries share eeh_ops, 
and the same
  infrastructure could possibly work for KVM_PR and KVM_HV mode at the 
same time.
* Don't clear error injection registers after finishing PE reset as the 
patchset
  is doing nothing related to error injection.
* Amending Documentation/vfio.txt, which was missed in last revision.
* No QEMU changes for this revision. v4 works well. Also, remove 
RFC from the
  subject as the design is basically recognized.
v5 - v6:
* CONFIG_VFIO_PCI_EEH removed. Instead to use CONFIG_EEH.
* Split one ioctl command to 5.
* In eeh.c, description has been added for those exported functions. 
Also, the
  functions have negative return values for error and information with 
other values.
  All digital numbers have been replaced by macros defined in eeh.h. 
The comments,
  including the function names have been amended not to mention guest 
or vfio.
* Add one mutex to protect flag in eeh_dev_open()/release().
* More information on how to use those ioctl commands to 
Documentation/vfio.txt.
v6 - v7:
* Remove ioctl command VFIO_EEH_PE_GET_ADDR, the PE address will be 
figured out
  in userland (e.g. QEMU) as Alex.G suggested.
* Let sPAPR VFIO container process the ioctl commands as VFIO container 
is naturally
  corresponds to IOMMU group (aka PE on sPAPR platform).
* All VFIO PCI EEH ioctl commands have argsz+flags for its companion 
data struct.
* For VFIO PCI EEH ioctl commands, ioctl() returns a negative number to 
indicate an error
  or zero for success. Additional output information is transported by 
the companion
  data struct.
* Explaining PE in Documentation/vfio.txt, typo fixes, more comments 
suggested by
  Alex.G.
* Split/merge patches according to suggestions from Alex.G and Alex.W.
* To have EEH stub in drivers/vfio/pci/, which was suggested by Alex.W.
* Define various EEH options as macros in vfio.h for userland to use.

Gavin Shan (3):
  powerpc/eeh: Avoid event on passed PE
  powerpc/eeh: EEH support for VFIO PCI device
  drivers/vfio: EEH support for VFIO PCI device

Documentation/vfio.txt|  92 
+++-
arch/powerpc/include/asm/eeh.h|  47 +++
arch/powerpc/kernel/eeh.c | 294 
++

Re: [PATCH V4 0/2] mm: FAULT_AROUND_ORDER patchset performance data for powerpc

2014-05-27 Thread Kirill A. Shutemov
Madhavan Srinivasan wrote:
 On Tuesday 20 May 2014 03:57 PM, Kirill A. Shutemov wrote:
  Rusty Russell wrote:
  Kirill A. Shutemov kirill.shute...@linux.intel.com writes:
  Andrew Morton wrote:
  On Mon, 19 May 2014 16:23:07 -0700 (PDT) Hugh Dickins hu...@google.com 
  wrote:
 
  Shouldn't FAULT_AROUND_ORDER and fault_around_order be changed to be
  the order of the fault-around size in bytes, and fault_around_pages()
  use 1UL << (fault_around_order - PAGE_SHIFT)
 
  Yes.  And shame on me for missing it (this time!) at review.
 
  There's still time to fix this.  Patches, please.
 
  Here it is. Made at 3.30 AM, build tested only.
 
  Prefer on top of Maddy's patch which makes it always a variable, rather
  than CONFIG_DEBUG_FS.  It's got enough hair as it is.
  
  Something like this?
  
  From: Kirill A. Shutemov kirill.shute...@linux.intel.com
  Date: Tue, 20 May 2014 13:02:03 +0300
  Subject: [PATCH] mm: nominate faultaround area in bytes rather then page 
  order
  
  There are evidences that faultaround feature is less relevant on
  architectures with page size bigger then 4k. Which makes sense since
  page fault overhead per byte of mapped area should be less there.
  
  Let's rework the feature to specify faultaround area in bytes instead of
  page order. It's 64 kilobytes for now.
  
  The patch effectively disables faultaround on architectures with
  page size >= 64k (like ppc64).
  
  It's possible that some other size of faultaround area is relevant for a
  platform. We can expose `fault_around_bytes' variable to arch-specific
  code once such platforms will be found.
  
  Signed-off-by: Kirill A. Shutemov kirill.shute...@linux.intel.com
  ---
   mm/memory.c | 62 
  +++--
   1 file changed, 23 insertions(+), 39 deletions(-)
  
  diff --git a/mm/memory.c b/mm/memory.c
  index 037b812a9531..252b319e8cdf 100644
  --- a/mm/memory.c
  +++ b/mm/memory.c
  @@ -3402,63 +3402,47 @@ void do_set_pte(struct vm_area_struct *vma, 
  unsigned long address,
  update_mmu_cache(vma, address, pte);
   }
  
  -#define FAULT_AROUND_ORDER 4
  +static unsigned long fault_around_bytes = 65536;
  +
  +static inline unsigned long fault_around_pages(void)
  +{
  +   return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE;
  +}
  +
  +static inline unsigned long fault_around_mask(void)
  +{
  +   return ~(rounddown_pow_of_two(fault_around_bytes) - 1)  PAGE_MASK;
  +}
  
  -#ifdef CONFIG_DEBUG_FS
  -static unsigned int fault_around_order = FAULT_AROUND_ORDER;
  
  -static int fault_around_order_get(void *data, u64 *val)
  +#ifdef CONFIG_DEBUG_FS
  +static int fault_around_bytes_get(void *data, u64 *val)
   {
  -   *val = fault_around_order;
  +   *val = fault_around_bytes;
  return 0;
   }
  
  -static int fault_around_order_set(void *data, u64 val)
  +static int fault_around_bytes_set(void *data, u64 val)
   {
 
 Kindly ignore the question if not relevant. Even though we need root
 access to alter the value, will we be fine with
 negative value?.

val is u64. Or am I missing something?

-- 
 Kirill A. Shutemov
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH V4 0/2] mm: FAULT_AROUND_ORDER patchset performance data for powerpc

2014-05-27 Thread Madhavan Srinivasan
On Tuesday 27 May 2014 03:51 PM, Kirill A. Shutemov wrote:
 Madhavan Srinivasan wrote:
 On Tuesday 20 May 2014 03:57 PM, Kirill A. Shutemov wrote:
 Rusty Russell wrote:
 Kirill A. Shutemov kirill.shute...@linux.intel.com writes:
 Andrew Morton wrote:
 On Mon, 19 May 2014 16:23:07 -0700 (PDT) Hugh Dickins hu...@google.com 
 wrote:

 Shouldn't FAULT_AROUND_ORDER and fault_around_order be changed to be
 the order of the fault-around size in bytes, and fault_around_pages()
 use 1UL << (fault_around_order - PAGE_SHIFT)

 Yes.  And shame on me for missing it (this time!) at review.

 There's still time to fix this.  Patches, please.

 Here it is. Made at 3.30 AM, build tested only.

 Prefer on top of Maddy's patch which makes it always a variable, rather
 than CONFIG_DEBUG_FS.  It's got enough hair as it is.

 Something like this?

 From: Kirill A. Shutemov kirill.shute...@linux.intel.com
 Date: Tue, 20 May 2014 13:02:03 +0300
 Subject: [PATCH] mm: nominate faultaround area in bytes rather then page 
 order

 There are evidences that faultaround feature is less relevant on
 architectures with page size bigger then 4k. Which makes sense since
 page fault overhead per byte of mapped area should be less there.

 Let's rework the feature to specify faultaround area in bytes instead of
 page order. It's 64 kilobytes for now.

 The patch effectively disables faultaround on architectures with
 page size >= 64k (like ppc64).

 It's possible that some other size of faultaround area is relevant for a
 platform. We can expose `fault_around_bytes' variable to arch-specific
 code once such platforms will be found.

 Signed-off-by: Kirill A. Shutemov kirill.shute...@linux.intel.com
 ---
  mm/memory.c | 62 
 +++--
  1 file changed, 23 insertions(+), 39 deletions(-)

 diff --git a/mm/memory.c b/mm/memory.c
 index 037b812a9531..252b319e8cdf 100644
 --- a/mm/memory.c
 +++ b/mm/memory.c
 @@ -3402,63 +3402,47 @@ void do_set_pte(struct vm_area_struct *vma, 
 unsigned long address,
 update_mmu_cache(vma, address, pte);
  }

 -#define FAULT_AROUND_ORDER 4
 +static unsigned long fault_around_bytes = 65536;
 +
 +static inline unsigned long fault_around_pages(void)
 +{
 +   return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE;
 +}
 +
 +static inline unsigned long fault_around_mask(void)
 +{
 +   return ~(rounddown_pow_of_two(fault_around_bytes) - 1)  PAGE_MASK;
 +}

 -#ifdef CONFIG_DEBUG_FS
 -static unsigned int fault_around_order = FAULT_AROUND_ORDER;

 -static int fault_around_order_get(void *data, u64 *val)
 +#ifdef CONFIG_DEBUG_FS
 +static int fault_around_bytes_get(void *data, u64 *val)
  {
 -   *val = fault_around_order;
 +   *val = fault_around_bytes;
 return 0;
  }

 -static int fault_around_order_set(void *data, u64 val)
 +static int fault_around_bytes_set(void *data, u64 val)
  {

 Kindly ignore the question if not relevant. Even though we need root
 access to alter the value, will we be fine with
 negative value?.
 ppc
 val is u64. or I miss something?
 

My bad. What I wanted to check was the all-0xf input case, and I guess we are
fine. Sorry about that.

Regards
Maddy

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] powerpc, kexec: Fix Processor X is stuck issue during kexec from ST mode

2014-05-27 Thread Srivatsa S. Bhat
If we try to perform a kexec when the machine is in ST (Single-Threaded) mode
(ppc64_cpu --smt=off), the kexec operation doesn't succeed properly, and we
get the following messages during boot:

[0.089866] POWER8 performance monitor hardware support registered
[0.089985] power8-pmu: PMAO restore workaround active.
[5.095419] Processor 1 is stuck.
[   10.097933] Processor 2 is stuck.
[   15.100480] Processor 3 is stuck.
[   20.102982] Processor 4 is stuck.
[   25.105489] Processor 5 is stuck.
[   30.108005] Processor 6 is stuck.
[   35.110518] Processor 7 is stuck.
[   40.113369] Processor 9 is stuck.
[   45.115879] Processor 10 is stuck.
[   50.118389] Processor 11 is stuck.
[   55.120904] Processor 12 is stuck.
[   60.123425] Processor 13 is stuck.
[   65.125970] Processor 14 is stuck.
[   70.128495] Processor 15 is stuck.
[   75.131316] Processor 17 is stuck.

Note that only the sibling threads are stuck, while the primary threads (0, 8,
16 etc) boot just fine. Looking closer at the previous step of kexec, we observe
that kexec tries to wakeup (bring online) the sibling threads of all the cores,
before performing kexec:

[ 9464.131231] Starting new kernel
[ 9464.148507] kexec: Waking offline cpu 1.
[ 9464.148552] kexec: Waking offline cpu 2.
[ 9464.148600] kexec: Waking offline cpu 3.
[ 9464.148636] kexec: Waking offline cpu 4.
[ 9464.148671] kexec: Waking offline cpu 5.
[ 9464.148708] kexec: Waking offline cpu 6.
[ 9464.148743] kexec: Waking offline cpu 7.
[ 9464.148779] kexec: Waking offline cpu 9.
[ 9464.148815] kexec: Waking offline cpu 10.
[ 9464.148851] kexec: Waking offline cpu 11.
[ 9464.148887] kexec: Waking offline cpu 12.
[ 9464.148922] kexec: Waking offline cpu 13.
[ 9464.148958] kexec: Waking offline cpu 14.
[ 9464.148994] kexec: Waking offline cpu 15.
[ 9464.149030] kexec: Waking offline cpu 17.

Instrumenting this piece of code revealed that the cpu_up() operation actually
fails with -EBUSY. Thus, only the primary threads of all the cores are online
during kexec, and hence this is a sure-shot recipe for disaster, as explained
in commit e8e5c2155b (powerpc/kexec: Fix orphaned offline CPUs across kexec),
as well as in the comment above wake_offline_cpus().

It turns out that cpu_up() was returning -EBUSY because the variable
'cpu_hotplug_disabled' was set to 1; and this disabling of CPU hotplug was done
by migrate_to_reboot_cpu() inside kernel_kexec().

Now, migrate_to_reboot_cpu() was originally written with the assumption that
any further code will not need to perform CPU hotplug, since we are anyway in
the reboot path. However, kexec is clearly not such a case, since we depend on
onlining CPUs, at least on powerpc.

So re-enable cpu-hotplug after returning from migrate_to_reboot_cpu() in the
kexec path, to fix this regression in kexec on powerpc.

Also, wrap the cpu_up() in powerpc kexec code within a WARN_ON(), so that we
can catch such issues more easily in the future.

Fixes: c97102ba963 (kexec: migrate to reboot cpu)
Cc: sta...@vger.kernel.org
Signed-off-by: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com
---

 arch/powerpc/kernel/machine_kexec_64.c |2 +-
 kernel/kexec.c |8 
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/machine_kexec_64.c 
b/arch/powerpc/kernel/machine_kexec_64.c
index 59d229a..879b3aa 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -237,7 +237,7 @@ static void wake_offline_cpus(void)
if (!cpu_online(cpu)) {
printk(KERN_INFO kexec: Waking offline cpu %d.\n,
   cpu);
-   cpu_up(cpu);
+   WARN_ON(cpu_up(cpu));
}
}
 }
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c8380ad..28c5706 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1683,6 +1683,14 @@ int kernel_kexec(void)
kexec_in_progress = true;
kernel_restart_prepare(NULL);
migrate_to_reboot_cpu();
+
+   /*
+* migrate_to_reboot_cpu() disables CPU hotplug assuming that
+* no further code needs to use CPU hotplug (which is true in
+* the reboot case). However, the kexec path depends on using
+* CPU hotplug again; so re-enable it here.
+*/
+   cpu_hotplug_enable();
printk(KERN_EMERG Starting new kernel\n);
machine_shutdown();
}

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [V6 00/11] perf: New conditional branch filter

2014-05-27 Thread Stephane Eranian
Hi,


On Mon, May 5, 2014 at 11:09 AM, Anshuman Khandual
khand...@linux.vnet.ibm.com wrote:

 This patchset is the re-spin of the original branch stack 
 sampling
 patchset which introduced new PERF_SAMPLE_BRANCH_COND branch filter. This 
 patchset
 also enables SW based branch filtering support for book3s powerpc platforms 
 which
 have PMU HW backed branch stack sampling support.

 Summary of code changes in this patchset:

 (1) Introduces a new PERF_SAMPLE_BRANCH_COND branch filter
 (2) Add the cond branch filter options in the perf record tool
 (3) Enable PERF_SAMPLE_BRANCH_COND in X86 platforms
 (4) Enable PERF_SAMPLE_BRANCH_COND in POWER8 platform
 (5) Update the documentation regarding perf record tool
 (6) Add some new powerpc instruction analysis functions in code-patching 
 library
 (7) Enable SW based branch filter support for powerpc book3s
 (8) Changed BHRB configuration in POWER8 to accommodate SW branch filters

I have been looking at those patches and ran some tests.
And I found a few issues so far.

I am running:
$ perf record -j any_ret -e cycles:u test_program
$ perf report -D

Most entries are okay and match the filter, however some do not make sense:

3642586996762 0x15d0 [0x108]: PERF_RECORD_SAMPLE(IP, 2): 17921/17921:
0x10001170 period: 613678 addr: 0
 branch stack: nr:9
.  0: 100011cc - 1e38
.  1: 10001150 - 100011bc
.  2: 10001208 - 1e38
.  3: 10001160 - 100011f8
.  4: 100011cc - 1e38
.  5: 10001150 - 100011bc
.  6: 10001208 - 1e38
.  7: 10001160 - 100011f8
.  8:  - 10001160
^^
Entry 8 does not make sense, unless 0x0 is a valid return branch
instruction address.
If an address is invalid, the whole entry needs to be eliminated. It
is okay to have
less than the max number of entries supported by HW.

I also had cases where monitoring only at the user level, got me
branch addresses in the
0xc000.. range. My test program is linked statically.

When I eliminated the bogus entries, my tests yielded only return
branch instruction addresses,
which is good. I will run more tests.


 With this new SW enablement, the branch filter support for book3s platforms 
 have
 been extended to include all these combinations discussed below with a sample 
 test
 application program (included here).

 Changes in V2
 =
 (1) Enabled PPC64 SW branch filtering support
 (2) Incorporated changes required for all previous comments

 Changes in V3
 =
 (1) Split the SW branch filter enablement into multiple patches
 (2) Added PMU neutral SW branch filtering code, PMU specific HW branch 
 filtering code
 (3) Added new instruction analysis functionality into powerpc code-patching 
 library
 (4) Changed name for some of the functions
 (5) Fixed couple of spelling mistakes
 (6) Changed code documentation in multiple places

 Changes in V4
 =
 (1) Changed the commit message for patch (01/10)
 (2) Changed the patch (02/10) to accommodate review comments from Michael 
 Ellerman
 (3) Rebased the patchset against latest Linus's tree

 Changes in V5
 =
 (1) Added a precursor patch to cleanup the indentation problem in 
 power_pmu_bhrb_read
 (2) Added a precursor patch to re-arrange P8 PMU BHRB filter config which 
 improved the clarity
 (3) Merged the previous 10th patch into the 8th patch
 (4) Moved SW based branch analysis code from core perf into code-patching 
 library as suggested by Michael
 (5) Simplified the logic in branch analysis library
 (6) Fixed some ambiguities in documentation at various places
 (7) Added some more in-code documentation blocks at various places
 (8) Renamed some local variable and function names
 (9) Fixed some indentation and white space errors in the code
 (10) Implemented almost all the review comments and suggestions made by 
 Michael Ellerman on V4 patchset
 (11) Enabled privilege mode SW branch filter
 (12) Simplified and generalized the SW implemented conditional branch filter
 (13) PERF_SAMPLE_BRANCH_COND filter is now supported only through SW 
 implementation
 (14) Adjusted other patches to deal with the above changes

 Changes in V6
 =
 (1) Rebased the patchset against the master
 (2) Added Reviewed-by: Andi Kleen in the first four patches in the series 
 which changes the
 generic or X86 perf code. [https://lkml.org/lkml/2014/4/7/130]

 HW implemented branch filters
 =

 (1) perf record -j any_call -e branch-misses:u ./cprog

 # Overhead  Command  Source Shared ObjectSource Symbol  Target 
 Shared Object Target Symbol
 #   ...    ...  
   
 #
  7.85%cprog  cprog [.] sw_3_1   cprog 
 [.] 

Re: [2/2] powerpc/corenet64_smp_defconfig: enable RTC support

2014-05-27 Thread Kumar Gala

On May 25, 2014, at 10:08 PM, shengzhou@freescale.com wrote:

 
 -Original Message-
 From: Wood Scott-B07421
 Sent: Saturday, May 24, 2014 1:06 AM
 To: Liu Shengzhou-B36685
 Cc: linuxppc-dev@lists.ozlabs.org
 Subject: Re: [2/2] powerpc/corenet64_smp_defconfig: enable RTC support
 
 On Fri, 2014-05-23 at 03:03 -0500, Liu Shengzhou-B36685 wrote:
 -Original Message-
 From: Wood Scott-B07421
 Sent: Friday, May 23, 2014 6:52 AM
 To: Liu Shengzhou-B36685
 Cc: linuxppc-dev@lists.ozlabs.org
 Subject: Re: [2/2] powerpc/corenet64_smp_defconfig: enable RTC
 support
 
 +++ b/arch/powerpc/configs/corenet64_smp_defconfig
 @@ -125,6 +125,11 @@ CONFIG_USB_EHCI_FSL=y  CONFIG_USB_STORAGE=y
 CONFIG_MMC=y  CONFIG_MMC_SDHCI=y
 +CONFIG_RTC_CLASS=y
 +CONFIG_RTC_DRV_CMOS=y
 +CONFIG_RTC_DRV_DS1307=y
 +CONFIG_RTC_DRV_DS1374=y
 +CONFIG_RTC_DRV_DS3232=y
 CONFIG_EDAC=y
 CONFIG_EDAC_MM_EDAC=y
 CONFIG_DMADEVICES=y
 
 Why only corenet64 and not corenet32?
 
 -Scott
 [Shengzhou] There is already RTC support in corenet32, only missing in
 corenet64.
 
 Only DS3232, not DS1307 or DS1374.  Which boards use the latter two?
 
 Why do we need CONFIG_RTC_DRV_CMOS?
 
 -Scott
 
 [Shengzhou] so far DS1307 and DS1374 occur only on those boards with 
 corenet64. 
 CONFIG_RTC_DRV_CMOS is enabled in mpc85xx_defconfig, mpc85xx_smp_defconfig, 
 corenet32_smp_defconfig, etc, here keeps consistent in corenet64.
 It seems CONFIG_RTC_DRV_CMOS is not needed on 85xx platform, do we need to 
 remove CONFIG_RTC_DRV_CMOS from all 85xx/corenet defconfig? If so, I will 
 post a new patch to do it.

The CDS board uses an RTC over ISA if I remember correctly; I'm not sure what
driver deals with that (whether it's CONFIG_RTC_DRV_CMOS or something else).

- k
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [2/2] powerpc/corenet64_smp_defconfig: enable RTC support

2014-05-27 Thread Scott Wood
On Tue, 2014-05-27 at 10:33 -0500, Kumar Gala wrote:
 On May 25, 2014, at 10:08 PM, shengzhou@freescale.com wrote:
 
  
  -Original Message-
  From: Wood Scott-B07421
  Sent: Saturday, May 24, 2014 1:06 AM
  To: Liu Shengzhou-B36685
  Cc: linuxppc-dev@lists.ozlabs.org
  Subject: Re: [2/2] powerpc/corenet64_smp_defconfig: enable RTC support
  
  On Fri, 2014-05-23 at 03:03 -0500, Liu Shengzhou-B36685 wrote:
  -Original Message-
  From: Wood Scott-B07421
  Sent: Friday, May 23, 2014 6:52 AM
  To: Liu Shengzhou-B36685
  Cc: linuxppc-dev@lists.ozlabs.org
  Subject: Re: [2/2] powerpc/corenet64_smp_defconfig: enable RTC
  support
  
  +++ b/arch/powerpc/configs/corenet64_smp_defconfig
  @@ -125,6 +125,11 @@ CONFIG_USB_EHCI_FSL=y  CONFIG_USB_STORAGE=y
  CONFIG_MMC=y  CONFIG_MMC_SDHCI=y
  +CONFIG_RTC_CLASS=y
  +CONFIG_RTC_DRV_CMOS=y
  +CONFIG_RTC_DRV_DS1307=y
  +CONFIG_RTC_DRV_DS1374=y
  +CONFIG_RTC_DRV_DS3232=y
  CONFIG_EDAC=y
  CONFIG_EDAC_MM_EDAC=y
  CONFIG_DMADEVICES=y
  
  Why only corenet64 and not corenet32?
  
  -Scott
  [Shengzhou] There is already RTC support in corenet32, only missing in
  corenet64.
  
  Only DS3232, not DS1307 or DS1374.  Which boards use the latter two?
  
  Why do we need CONFIG_RTC_DRV_CMOS?
  
  -Scott
  
  [Shengzhou] so far DS1307 and DS1374 occur only on those boards with 
  corenet64. 

Which boards?  I don't see them in any corenet dts files.  I do see some
instances of ds1374 in the dts files of non-corenet mpc85xx
boards (mpc8568mds, mpc8569mds, and p1021mds), yet it's not in the
mpc85xx_defconfig or mpc85xx_smp_defconfig.

  CONFIG_RTC_DRV_CMOS is enabled in mpc85xx_defconfig, mpc85xx_smp_defconfig, 
  corenet32_smp_defconfig, etc, here keeps consistent in corenet64.
  It seems CONFIG_RTC_DRV_CMOS is not needed on 85xx platform, do we need to 
  remove CONFIG_RTC_DRV_CMOS from all 85xx/corenet defconfig? If so, I will 
  post a new patch to do it.
 
 The CDS board uses an RTC over ISA if I remember correctly, not sure what 
 driver deals with that (if its CONFIG_RTC_DRV_CMOS) or something else.

If it's just CDS then we don't need it in either corenet config.

-Scott


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v6 2/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Alex Williamson
On Sat, 2014-05-24 at 12:06 +1000, Gavin Shan wrote:
 On Fri, May 23, 2014 at 08:29:59AM -0600, Alex Williamson wrote:
 On Fri, 2014-05-23 at 14:37 +1000, Gavin Shan wrote:
  On Thu, May 22, 2014 at 09:10:53PM -0600, Alex Williamson wrote:
  On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:
 
 .../...
 
 No, sorry, I mean how does the user get information about the error?
 The interface we have here is:
 a) find that something bad has happened
 b) kick it into working again
 c) continue
 
 How does the user figure out what happened and if it makes sense to
 attempt to recover?  Where does the user learn that their disk is on
 fire?
 
 
 When 0xFF's returned from config or IO read, user should check the
 device (PE)'s state with ioctl command VFIO_EEH_PE_GET_STATE. If the
 device (PE) has been put into frozen state, It's confirmed the device
 (disk you mentioned) is on fire.

No, this only confirms that something bad happened, not _what_ bad thing
happened.

  User should kick off recovery, which
 includes:

And here you're just describing the kick operation again...

 
 - User stops any operatins (config, IO, DMA) on the device because any
   PCI traffic to frozen device will be dropped from software or hardware
   level. Also, we don't expect DMA traffic during recovery. Otherwise,
   we will bump into recursive errors and the recovery should fail.
 - VFIO_EEH_PE_SET_OPTION to enable I/O path (DMA path is still under frozen
   state). EEH_VFIO_PE_CONFIGURE to reconfigure affected PCI bridges and then
   do error log retrieval.

These logs, where do they go?  How does the user get access?  That's
what I'm trying to ask about.

 - VFIO_EEH_PE_RESET to reset the affected device (PE). EEH_VFIO_PE_CONFIUGRE
   to restore BARs.
 - User resumes the device to start PCI traffic and device is brought to
   funtional state.
 
 .../...
 
 
 No, I prefer to stay consistent with the rest of the VFIO API and use
 argsz + flags.
 
 
 Here's the recap for previous reply: I have several cases for ioctl().
 
 - ioctl(fd, cmd, NULL):   I needn't any input info.
 - ioctl(fd, cmd, data):  I need input info
 
 For all the cases, should I simply have a data struct to include 
 argsz+flags?

Anything that requires data should have argsz+flags; if it doesn't
require data, it doesn't need them, but think long and hard about whether
there's any possibility that we'll need parameters in the future.

 For return value from ioctl(), can we simply to have additional field in the
 above data struct to carry it? 0 is the information I have to return for
 some of the cases.

If, for instance, your ioctl is returning something like the number of
errors, then it's perfectly fine to use that as the ioctl return.  < 0
is an error, >= zero is a success with a value.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Alex Williamson
On Tue, 2014-05-27 at 18:40 +1000, Gavin Shan wrote:
 The patch adds new IOCTL commands for sPAPR VFIO container device
 to support EEH functionality for PCI devices, which have been passed
 through from host to somebody else via VFIO.
 
 Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
 ---
  Documentation/vfio.txt  | 92 
 -
  drivers/vfio/pci/Makefile   |  1 +
  drivers/vfio/pci/vfio_pci.c | 20 +---
  drivers/vfio/pci/vfio_pci_eeh.c | 46 +++
  drivers/vfio/pci/vfio_pci_private.h |  5 ++
  drivers/vfio/vfio_iommu_spapr_tce.c | 85 ++
  include/uapi/linux/vfio.h   | 66 ++
  7 files changed, 308 insertions(+), 7 deletions(-)
  create mode 100644 drivers/vfio/pci/vfio_pci_eeh.c
 
 diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
 index b9ca023..d890fed 100644
 --- a/Documentation/vfio.txt
 +++ b/Documentation/vfio.txt
 @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in 
 real mode which provides
  an excellent performance which has limitations such as inability to do
  locked pages accounting in real time.
  
 -So 3 additional ioctls have been added:
 +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
 +subtree that can be treated as a unit for the purposes of partitioning and
 +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a
 +function of a multi-function IOA, or multiple IOAs (possibly including switch
 +and bridge structures above the multiple IOAs). PPC64 guests detect PCI 
 errors
 +and recover from them via EEH RTAS services, which works on the basis of
 +additional ioctl commands.
 +
 +So 7 additional ioctls have been added:
  
   VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
   of the DMA window on the PCI bus.
 @@ -316,6 +324,17 @@ So 3 additional ioctls have been added:
  
   VFIO_IOMMU_DISABLE - disables the container.
  
 + VFIO_EEH_PE_SET_OPTION - enables or disables EEH functionality on the
 + specified device. Also, it can be used to remove IO or DMA
 + stopped state on the frozen PE.
 +
 + VFIO_EEH_PE_GET_STATE - retrieve PE's state: frozen or normal state.
 +
 + VFIO_EEH_PE_RESET - do PE reset, which is one of the major steps for
 + error recovering.
 +
 + VFIO_EEH_PE_CONFIGURE - configure the PCI bridges after PE reset. It's
 + one of the major steps for error recoverying.
  
  The code flow from the example above should be slightly changed:
  
 @@ -346,6 +365,77 @@ The code flow from the example above should be slightly 
 changed:
   ioctl(container, VFIO_IOMMU_MAP_DMA, dma_map);
   .
  
 +Based on the initial example we have, the following piece of code could be
 +reference for EEH setup and error handling:
 +
 + struct vfio_eeh_pe_set_option option = { .argsz = sizeof(option) };
 + struct vfio_eeh_pe_get_state state = { .argsz = sizeof(state) };
 + struct vfio_eeh_pe_reset reset = { .argsz = sizeof(reset) };
 + struct vfio_eeh_pe_configure configure = { .argsz = sizeof(configure) };
 +
 + 
 +
 + /* Get a file descriptor for the device */
 + device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, :06:0d.0);
 +
 + /* Enable the EEH functionality on the device */
 + option.option = VFIO_EEH_PE_SET_OPT_ENABLE;
 + ioctl(container, VFIO_EEH_PE_SET_OPTION, option);
 +
 + /* You're suggested to create additional data struct to represent
 +  * PE, and put child devices belonging to same IOMMU group to the
 +  * PE instance for later reference.
 +  */
 +
 + /* Check the PE's state and make sure it's in functional state */
 + ioctl(container, VFIO_EEH_PE_GET_STATE, state);
 +
 + /* Save device's state. pci_save_state() would be good enough
 +  * as an example.
 +  */
 +
 + /* Test and setup the device */
 + ioctl(device, VFIO_DEVICE_GET_INFO, device_info);
 +
 + 
 +
 + /* When 0xFF's returned from reading PCI config space or IO BARs
 +  * of the PCI device. Check the PE state to see if that has been
 +  * frozen.
 +  */
 + ioctl(container, VFIO_EEH_PE_GET_STATE, state);
 +
 + /* Waiting for pending PCI transactions to be completed and don't
 +  * produce any more PCI traffic from/to the affected PE until
 +  * recovery is finished.
 +  */
 +
 + /* Enable IO for the affected PE and collect logs. Usually, the
 +  * standard part of PCI config space, AER registers are dumped
 +  * as logs for further analysis.
 +  */
 + option.option = VFIO_EEH_PE_SET_OPT_IO;
 + ioctl(container, VFIO_EEH_PE_SET_OPTION, option);
 +
 + /* Issue PE reset */
 + reset.option = VFIO_EEH_PE_RESET_HOT;
 + ioctl(container, VFIO_EEH_PE_RESET, reset);
 + reset.option = VFIO_EEH_PE_RESET_DEACTIVATE;
 + 

Re: [PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Benjamin Herrenschmidt
On Tue, 2014-05-27 at 12:15 -0600, Alex Williamson wrote:

  +/*
  + * Reset is the major step to recover problematic PE. The following
  + * command helps on that.
  + */
  +struct vfio_eeh_pe_reset {
  +   __u32 argsz;
  +   __u32 flags;
  +   __u32 option;
  +#define VFIO_EEH_PE_RESET_DEACTIVATE   0   /* Deactivate reset 
  */
  +#define VFIO_EEH_PE_RESET_HOT  1   /* Hot reset
  */
  +#define VFIO_EEH_PE_RESET_FUNDAMENTAL  3   /* Fundamental reset
  */
 
 How does a user know which of these to use?

The usual way is the driver asks for one or the other, this plumbs back
into the guest EEH code which itself plumbs into the PCIe error recovery
framework in Linux.

However I do have a question for Gavin here: Why do we expose an
explicit deactivate ? The reset functions should do the whole
reset sequence (assertion, delay, deassertion). In fact the firmware
doesn't really give you a choice for PERST right ? Or do we have
a requirement to expose both phases for RTAS? (In that case I'm
happy to ignore the deassertion there too).

Cheers,
Ben.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Alex Williamson
On Wed, 2014-05-28 at 06:30 +1000, Benjamin Herrenschmidt wrote:
 On Tue, 2014-05-27 at 12:15 -0600, Alex Williamson wrote:
 
   +/*
   + * Reset is the major step to recover problematic PE. The following
   + * command helps on that.
   + */
   +struct vfio_eeh_pe_reset {
   + __u32 argsz;
   + __u32 flags;
   + __u32 option;
   +#define VFIO_EEH_PE_RESET_DEACTIVATE 0   /* Deactivate reset 
   */
   +#define VFIO_EEH_PE_RESET_HOT1   /* Hot reset
   */
   +#define VFIO_EEH_PE_RESET_FUNDAMENTAL3   /* Fundamental reset
   */
  
  How does a user know which of these to use?
 
 The usual way is the driver asks for one or the other, this plumbs back
 into the guest EEH code which itself plumbs into the PCIe error recovery
 framework in Linux.

So magic?

 
 However I do have a question for Gavin here: Why do we expose an
 explicit deactivate ? The reset functions should do the whole
 reset sequence (assertion, delay, deassertion). In fact the firmware
 doesn't really give you a choice for PERST right ? Or do we have
 a requirement to expose both phases for RTAS? (In that case I'm
 happy to ignore the deassertion there too).
 
 Cheers,
 Ben.
 



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Benjamin Herrenschmidt
On Tue, 2014-05-27 at 14:37 -0600, Alex Williamson wrote:

  The usual way is the driver asks for one or the other, this plumbs back
  into the guest EEH code which itself plumbs into the PCIe error recovery
  framework in Linux.
 
 So magic?

Yes. The driver is expected to more or less know what kind of reset it
wants for its device. Ideally hot reset is sufficient, but some drivers
know that the device they drive is crappy enough that it mostly ignores
hot reset and really needs a PERST, for example...

Also we have other reasons to expose those interfaces outside of EEH. 

For example, some drivers might want to specifically trigger a PERST
after a microcode update. I.e., there are paths outside of EEH error
recovery where drivers in the guest might want to trigger a reset
of the device, and they have control under some circumstances over
which kind of reset they are doing (and the guest Linux does have
different code paths to do a hot reset vs. a fundamental reset).

So we need to expose that distinction to be able to honor the guest
decision.

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v2 00/13] Refactor pci_is_brdige() to simplify code

2014-05-27 Thread Bjorn Helgaas
On Sun, May 04, 2014 at 12:23:35PM +0800, Yijing Wang wrote:
 v1-v2: Add comments for new pci_is_bridge().
 
 This patchset rename the current pci_is_bridge() to pci_has_subordinate(), 
 and introduce a new pci_is_bridge() which determine pci bridge by check
 dev-hdr_type. The new one is more accurate. PCI Spec define the pci
 device is a bridge by the dev-hdr_type = 0x01 || 0x02.
 
 There is no guarantee that a PCI bridge must attached a subordinate bus.
 When PCI bridge is created but before the scan child bus, it has no 
 subordinate bus. Also users can remove the pci bus using
 interface pci_remove_bus() in remove.c. 
 
 So use new pci_is_bridge() check if the PCI device is bridge is better
 choice. If user want check PCI bridge whether has a subordinate bus,
 pci_has_subordinate() is a candidate.
 
 
 Yijing Wang (13):
   PCI: rename pci_is_bridge() to pci_has_subordinate()
   PCI: Introduce new pci_is_bridge() helper function
   PCI: Use new pci_is_bridge() to simplify code
   x86/PCI: Use new pci_is_bridge() to simplify code
   IA64/PCI: Use new pci_is_bridge() to simplify code
   powerpc/PCI: Use new pci_is_bridge() to simplify code
   sparc/PCI: Use new pci_is_bridge() to simplify code
   PCI, rpaphp: Use new pci_is_bridge() to simplify code
   PCI, shpchp: Use new pci_is_bridge() to simplify code
   PCI, cpcihp: Use new pci_is_bridge() to simplify code
   PCI, acpiphp: Use new pci_is_bridge() to simplify code
   PCI, pcmcia: Use new pci_is_bridge() to simplify code
   PCI, pciehp: Use new pci_is_bridge() to simplify code
 
  arch/ia64/pci/fixup.c  |4 +---
  arch/powerpc/kernel/pci-hotplug.c  |3 +--
  arch/powerpc/kernel/pci_of_scan.c  |3 +--
  arch/sparc/kernel/pci.c|3 +--
  arch/x86/pci/fixup.c   |4 +---
  drivers/pci/hotplug/acpiphp_glue.c |3 +--
  drivers/pci/hotplug/cpci_hotplug_pci.c |3 +--
  drivers/pci/hotplug/pciehp_pci.c   |3 +--
  drivers/pci/hotplug/rpadlpar_core.c|3 +--
  drivers/pci/hotplug/shpchp_pci.c   |3 +--
  drivers/pci/pci-acpi.c |8 +---
  drivers/pci/pci-driver.c   |8 
  drivers/pci/pci.h  |2 +-
  drivers/pci/probe.c|3 +--
  drivers/pci/setup-bus.c|4 +---
  drivers/pcmcia/cardbus.c   |3 +--
  include/linux/pci.h|   13 +
  17 files changed, 32 insertions(+), 41 deletions(-)

Applied to pci/pci_is_bridge for v3.16, thanks,

Bjorn
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Alexander Graf


On 27.05.14 20:15, Alex Williamson wrote:

On Tue, 2014-05-27 at 18:40 +1000, Gavin Shan wrote:

The patch adds new IOCTL commands for sPAPR VFIO container device
to support EEH functionality for PCI devices, which have been passed
through from host to somebody else via VFIO.

Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
---
  Documentation/vfio.txt  | 92 -
  drivers/vfio/pci/Makefile   |  1 +
  drivers/vfio/pci/vfio_pci.c | 20 +---
  drivers/vfio/pci/vfio_pci_eeh.c | 46 +++
  drivers/vfio/pci/vfio_pci_private.h |  5 ++
  drivers/vfio/vfio_iommu_spapr_tce.c | 85 ++
  include/uapi/linux/vfio.h   | 66 ++
  7 files changed, 308 insertions(+), 7 deletions(-)
  create mode 100644 drivers/vfio/pci/vfio_pci_eeh.c


[...]


+
+   return ret;
+}
+
  static long tce_iommu_ioctl(void *iommu_data,
 unsigned int cmd, unsigned long arg)
  {
@@ -283,6 +363,11 @@ static long tce_iommu_ioctl(void *iommu_data,
tce_iommu_disable(container);
mutex_unlock(&container->lock);
return 0;
+   case VFIO_EEH_PE_SET_OPTION:
+   case VFIO_EEH_PE_GET_STATE:
+   case VFIO_EEH_PE_RESET:
+   case VFIO_EEH_PE_CONFIGURE:
+   return tce_iommu_eeh_ioctl(iommu_data, cmd, arg);

This is where it would have really made sense to have a single
VFIO_EEH_OP ioctl with a data structure passed to indicate the sub-op.
AlexG, are you really attached to splitting these out into separate
ioctls?


I don't see the problem. We need to forward 4 ioctls to a separate piece 
of code, so we forward 4 ioctls to a separate piece of code :). Putting 
them into one ioctl just moves the switch() into another function.



Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Kernel 3.15: Boot problems with a PA6T board

2014-05-27 Thread Christian Zigotzky

Hi Michael,

Thanks a lot for your answer.

On 26.05.2014 14:26, Michael Ellerman wrote:


Hi Christian,

I'm almost certain that is not the commit which breaks your machine. Or if it
is, something *really* weird is going on.

The code changed in that commit should never run on a PA6T.

You're right. I think this patch is for Power 8.

Experimental protocol:

git checkout -f 01d8885785a60ae8f4c37b0ed75bdc96d0fc6a44; git clean -fdx
(from 02/04/14) - Kernel boots
git checkout -f f1553174a207f68a4ec19d436003097e0a4dc405; git clean -fdx
(from 03/04/14) - Kernel boots
git checkout -f d40326f4b9f9617cdfd30f83a2db57d47e9c5bac; git clean -fdx
(from 04/04/14) - Kernel boots
git checkout -f 930b440cd8256f3861bdb0a59d26efaadac7941a; git clean -fdx
(from 05/04/14) - doesn't boot (rtc error)
git checkout -f 2b3a8fd735f86ebeb2b9d061054003000c36b654; git clean -fdx
(from 06/04/14) - doesn't boot (rtc error)
git checkout -f 26c12d93348f0bda0756aff83f4867d9ae58a5a6; git clean -fdx
(from 07/04/14) - doesn't boot (rtc error)
git checkout -f a6c8aff022d4d06e4b41455ae9b2a5d3d503bf76; git clean -fdx
(from 08/04/14) - Kernel boots
git checkout -f 035328c202d26a824b8632fd3b00635db5aee5a2; git clean -fdx
(from 08/04/14) - Kernel boots
git checkout -f 9000c17dc0f9c910267d2661225c9d33a227b27e; git clean -fdx
(from 08/04/14) powerpc/powernv: Fix endian issues with sensor code
One OPAL call and one device tree property needed byte swapping. -
doesn't boot (prom_init)
git checkout -f d3d35d957a9d0733dc51f14b5abc0bff5d3c5f3a; git clean -fdx
(from 08/04/14) - doesn't boot (prom_init)
git checkout -f c4586256f0c440bc2bdb29d2cbb915f0ca785d26; git clean -fdx
(from 09/04/14) - doesn't boot (prom_init)

So it looks like you manually picked commits based on the date?

Yes, it is.

That's a good start, but if you want to find the actual problem commit you need
to do a proper bisect.


I'm not a programmer but what can I do to solve this boot problem?

To start with you can probably narrow it down a bit by testing the following
commits:




18a1a7a1d862ae0794a0179473d08a414dd49234 - It doesn't boot. Error messages: 
Oops: Machine check, sig: 7 [#1] CPU: 1 PID: 1 Comm: swapper/0 not tainted



d8ff9cdf68fd119d491f3de90e1a612afc2f3b2b - It boots. :-)



0f5a869600141a0d5575e3190af01a050c081b07 - It boots. :-)



c7e64b9ce04aa2e3fad7396d92b5cb92056d16ac - It boots. :-)



d3e144532703fe2454b56eddb56f30d2d620187b - It boots. :-)


I think the machine check is the problem.

Cheers,

Christian
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC PATCH v2 1/2] powerpc: numa: enable USE_PERCPU_NUMA_NODE_ID

2014-05-27 Thread Nishanth Aravamudan
On 19.05.2014 [11:14:23 -0700], Nishanth Aravamudan wrote:
 Hi Andrew,
 
 I found one issue with my patch, fixed below...
 
 On 16.05.2014 [16:39:45 -0700], Nishanth Aravamudan wrote:
  Based off 3bccd996 for ia64, convert powerpc to use the generic per-CPU
  topology tracking, specifically:
  
  initialize per cpu numa_node entry in start_secondary
  remove the powerpc cpu_to_node()
  define CONFIG_USE_PERCPU_NUMA_NODE_ID if NUMA
  
  Signed-off-by: Nishanth Aravamudan n...@linux.vnet.ibm.com
 
 snip
 
  diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
  index e2a4232..b95be24 100644
  --- a/arch/powerpc/kernel/smp.c
  +++ b/arch/powerpc/kernel/smp.c
  @@ -750,6 +750,11 @@ void start_secondary(void *unused)
  }
  traverse_core_siblings(cpu, true);
   
  +   /*
  +* numa_node_id() works after this.
  +*/
  +   set_numa_node(numa_cpu_lookup_table[cpu]);
  +
 
 Similar change is needed for the boot CPU. Update patch:
 
 
 powerpc: numa: enable USE_PERCPU_NUMA_NODE_ID
 
 Based off 3bccd996 for ia64, convert powerpc to use the generic per-CPU
 topology tracking, specifically:
 
 initialize per cpu numa_node entry in start_secondary
 remove the powerpc cpu_to_node()
 define CONFIG_USE_PERCPU_NUMA_NODE_ID if NUMA
 
 Signed-off-by: Nishanth Aravamudan n...@linux.vnet.ibm.com

Ping on this and patch 2/2. Ben, would you be willing to pull these into
your -next branch so they'd get some testing?

http://patchwork.ozlabs.org/patch/350368/
http://patchwork.ozlabs.org/patch/349838/

Without any further changes, these two help quite a bit with the slab
consumption on CONFIG_SLUB kernels when memoryless nodes are present.

Thanks,
Nish

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC PATCH v2 1/2] powerpc: numa: enable USE_PERCPU_NUMA_NODE_ID

2014-05-27 Thread Benjamin Herrenschmidt
On Tue, 2014-05-27 at 16:44 -0700, Nishanth Aravamudan wrote:
  Signed-off-by: Nishanth Aravamudan n...@linux.vnet.ibm.com
 
 Ping on this and patch 2/2. Ben, would you be willing to pull these
 into
 your -next branch so they'd get some testing?
 
 http://patchwork.ozlabs.org/patch/350368/
 http://patchwork.ozlabs.org/patch/349838/
 
 Without any further changes, these two help quite a bit with the slab
 consumption on CONFIG_SLUB kernels when memoryless nodes are present.

I don't mind at all :-) I haven't really been following that story
so I was waiting for the dust to settle and maybe acks from MM people
but if you tell me they are good I'm prepared to trust you.

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] tty/hvc/hvc_console: Fix wakeup of HVC thread on hvc_kick()

2014-05-27 Thread Benjamin Herrenschmidt
On Fri, 2014-05-23 at 19:41 +1000, Benjamin Herrenschmidt wrote:
 Some backends call hvc_kick() to wakeup the HVC thread from its
 slumber upon incoming characters. This however doesn't work
 properly because it uses msleep_interruptible() which is mostly
 immune to wake_up_process(). It will basically go back to sleep
 until the timeout is expired (only signals can really wake it).

 Replace it with a simple schedule_timeout_interruptible() instead,
 which may wakeup earlier every now and then but we really don't
 care in this case.

Nobody commented ? :-)

Greg, do you want to take this in the tty tree or can I stick it in
powerpc ?

Cheers,
Ben 

 Signed-off-by: Benjamin Herrenschmidt b...@kernel.crashing.org
 ---
  drivers/tty/hvc/hvc_console.c |9 -
  1 file changed, 8 insertions(+), 1 deletion(-)
 
 diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c
 index 94f9e3a..1094265 100644
 --- a/drivers/tty/hvc/hvc_console.c
 +++ b/drivers/tty/hvc/hvc_console.c
 @@ -760,10 +760,17 @@ static int khvcd(void *unused)
   if (poll_mask == 0)
   schedule();
   else {
 + unsigned long j_timeout;
 +
   if (timeout < MAX_TIMEOUT)
   timeout += (timeout >> 6) + 1;
  
 - msleep_interruptible(timeout);
 + /*
 +  * We don't use msleep_interruptible otherwise
 +  * kick will fail to wake us up
 +  */
 + j_timeout = msecs_to_jiffies(timeout) + 1;
 + schedule_timeout_interruptible(j_timeout);
   }
   }
   __set_current_state(TASK_RUNNING);
 


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC PATCH v2 1/2] powerpc: numa: enable USE_PERCPU_NUMA_NODE_ID

2014-05-27 Thread Nishanth Aravamudan
On 28.05.2014 [09:56:14 +1000], Benjamin Herrenschmidt wrote:
 On Tue, 2014-05-27 at 16:44 -0700, Nishanth Aravamudan wrote:
   Signed-off-by: Nishanth Aravamudan n...@linux.vnet.ibm.com
  
  Ping on this and patch 2/2. Ben, would you be willing to pull these
  into
  your -next branch so they'd get some testing?
  
  http://patchwork.ozlabs.org/patch/350368/
  http://patchwork.ozlabs.org/patch/349838/
  
  Without any further changes, these two help quite a bit with the slab
  consumption on CONFIG_SLUB kernels when memoryless nodes are present.
 
 I don't mind at all :-) I haven't really been following that story
 so I was waiting for the dust to settle and maybe acks from MM people
 but if you tell me they are good I'm prepared to trust you.

The patches themselves are pretty minimal and similar to the ia64
changes (and the affected code seems like it hasn't changed in some
time, beyond in the common code). I'd mostly like to get some
broad-range build & boot testing.

Also, is NUMA a sufficient symbol to depend, you think? I figure most of
the PPC NUMA systems are the pSeries/IBM variety, which is where I've
run into memoryless nodes in the first place.

Thanks,
Nish

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Alex Williamson
On Wed, 2014-05-28 at 00:49 +0200, Alexander Graf wrote:
 On 27.05.14 20:15, Alex Williamson wrote:
  On Tue, 2014-05-27 at 18:40 +1000, Gavin Shan wrote:
  The patch adds new IOCTL commands for sPAPR VFIO container device
  to support EEH functionality for PCI devices, which have been passed
  through from host to somebody else via VFIO.
 
  Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
  ---
Documentation/vfio.txt  | 92 
  -
drivers/vfio/pci/Makefile   |  1 +
drivers/vfio/pci/vfio_pci.c | 20 +---
drivers/vfio/pci/vfio_pci_eeh.c | 46 +++
drivers/vfio/pci/vfio_pci_private.h |  5 ++
drivers/vfio/vfio_iommu_spapr_tce.c | 85 
  ++
include/uapi/linux/vfio.h   | 66 ++
7 files changed, 308 insertions(+), 7 deletions(-)
create mode 100644 drivers/vfio/pci/vfio_pci_eeh.c
 
 [...]
 
  +
  +  return ret;
  +}
  +
static long tce_iommu_ioctl(void *iommu_data,
  unsigned int cmd, unsigned long arg)
{
  @@ -283,6 +363,11 @@ static long tce_iommu_ioctl(void *iommu_data,
 tce_iommu_disable(container);
 mutex_unlock(&container->lock);
 return 0;
  +  case VFIO_EEH_PE_SET_OPTION:
  +  case VFIO_EEH_PE_GET_STATE:
  +  case VFIO_EEH_PE_RESET:
  +  case VFIO_EEH_PE_CONFIGURE:
  +  return tce_iommu_eeh_ioctl(iommu_data, cmd, arg);
  This is where it would have really made sense to have a single
  VFIO_EEH_OP ioctl with a data structure passed to indicate the sub-op.
  AlexG, are you really attached to splitting these out into separate
  ioctls?
 
 I don't see the problem. We need to forward 4 ioctls to a separate piece 
 of code, so we forward 4 ioctls to a separate piece of code :). Putting 
 them into one ioctl just moves the switch() into another function.

And uses an extra 3 ioctl numbers and gives us extra things to update if
we ever need to add more ioctls, etc.  ioctl numbers are an address
space, how much address space do we really want to give to EEH?  It's
not a big difference, but I don't think it's completely even either.
Thanks,

Alex


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Alexander Graf


On 28.05.14 02:39, Alex Williamson wrote:

On Wed, 2014-05-28 at 00:49 +0200, Alexander Graf wrote:

On 27.05.14 20:15, Alex Williamson wrote:

On Tue, 2014-05-27 at 18:40 +1000, Gavin Shan wrote:

The patch adds new IOCTL commands for sPAPR VFIO container device
to support EEH functionality for PCI devices, which have been passed
through from host to somebody else via VFIO.

Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
---
   Documentation/vfio.txt  | 92 
-
   drivers/vfio/pci/Makefile   |  1 +
   drivers/vfio/pci/vfio_pci.c | 20 +---
   drivers/vfio/pci/vfio_pci_eeh.c | 46 +++
   drivers/vfio/pci/vfio_pci_private.h |  5 ++
   drivers/vfio/vfio_iommu_spapr_tce.c | 85 ++
   include/uapi/linux/vfio.h   | 66 ++
   7 files changed, 308 insertions(+), 7 deletions(-)
   create mode 100644 drivers/vfio/pci/vfio_pci_eeh.c

[...]


+
+   return ret;
+}
+
   static long tce_iommu_ioctl(void *iommu_data,
 unsigned int cmd, unsigned long arg)
   {
@@ -283,6 +363,11 @@ static long tce_iommu_ioctl(void *iommu_data,
tce_iommu_disable(container);
mutex_unlock(&container->lock);
return 0;
+   case VFIO_EEH_PE_SET_OPTION:
+   case VFIO_EEH_PE_GET_STATE:
+   case VFIO_EEH_PE_RESET:
+   case VFIO_EEH_PE_CONFIGURE:
+   return tce_iommu_eeh_ioctl(iommu_data, cmd, arg);

This is where it would have really made sense to have a single
VFIO_EEH_OP ioctl with a data structure passed to indicate the sub-op.
AlexG, are you really attached to splitting these out into separate
ioctls?

I don't see the problem. We need to forward 4 ioctls to a separate piece
of code, so we forward 4 ioctls to a separate piece of code :). Putting
them into one ioctl just moves the switch() into another function.

And uses an extra 3 ioctl numbers and gives us extra things to update if
we ever need to add more ioctls, etc.  ioctl numbers are an address
space, how much address space do we really want to give to EEH?  It's
not a big difference, but I don't think it's completely even either.
Thanks,


Yes, that's the point. I by far prefer to have you push back on anyone 
who introduces useless ioctls rather than have a separate EEH number 
space that people can just throw anything in they like ;).



Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 11/16] byteorder: provide a linux/byteorder.h with {be,le}_to_cpu() and cpu_to_{be,le}() macros

2014-05-27 Thread Joe Perches
On Tue, 2014-05-27 at 17:22 -0700, Cody P Schafer wrote:
 Rather than manually specifying the size of the integer to be converted, key
 off of the type size. Reduces duplicate size info and the occurrence of
 certain types of bugs (using the wrong sized conversion).
[]
 diff --git a/include/linux/byteorder.h b/include/linux/byteorder.h
[]
 @@ -0,0 +1,34 @@
 +#ifndef LINUX_BYTEORDER_H_
 +#define LINUX_BYTEORDER_H_
 +
 +#include <asm/byteorder.h>
 +
 +#define be_to_cpu(v) \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint8_t) , v, \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint16_t), be16_to_cpu(v), \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint32_t), be32_to_cpu(v), \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint64_t), be64_to_cpu(v), \
 + (void)0

probably better to use BUILD_BUG instead of these 0 returns

 +
 +#define le_to_cpu(v) \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint8_t) , v, \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint16_t), le16_to_cpu(v), \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint32_t), le32_to_cpu(v), \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint64_t), le64_to_cpu(v), \
 + (void)0
 +
 +#define cpu_to_le(v) \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint8_t) , v, \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint16_t), cpu_to_le16(v), \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint32_t), cpu_to_le32(v), \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint64_t), cpu_to_le64(v), \
 + (void)0
 +
 +#define cpu_to_be(v) \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint8_t) , v, \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint16_t), cpu_to_be16(v), \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint32_t), cpu_to_be32(v), \
 + __builtin_choose_expr(sizeof(v) == sizeof(uint64_t), cpu_to_be64(v), \
 + (void)0
 +
 +#endif



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Gavin Shan
On Tue, May 27, 2014 at 12:15:27PM -0600, Alex Williamson wrote:
On Tue, 2014-05-27 at 18:40 +1000, Gavin Shan wrote:
 The patch adds new IOCTL commands for sPAPR VFIO container device
 to support EEH functionality for PCI devices, which have been passed
 through from host to somebody else via VFIO.
 
 Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
 ---
  Documentation/vfio.txt  | 92 
 -
  drivers/vfio/pci/Makefile   |  1 +
  drivers/vfio/pci/vfio_pci.c | 20 +---
  drivers/vfio/pci/vfio_pci_eeh.c | 46 +++
  drivers/vfio/pci/vfio_pci_private.h |  5 ++
  drivers/vfio/vfio_iommu_spapr_tce.c | 85 ++
  include/uapi/linux/vfio.h   | 66 ++
  7 files changed, 308 insertions(+), 7 deletions(-)
  create mode 100644 drivers/vfio/pci/vfio_pci_eeh.c
 
 diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
 index b9ca023..d890fed 100644
 --- a/Documentation/vfio.txt
 +++ b/Documentation/vfio.txt
 @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in 
 real mode which provides
  an excellent performance which has limitations such as inability to do
  locked pages accounting in real time.
  
 -So 3 additional ioctls have been added:
 +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
 +subtree that can be treated as a unit for the purposes of partitioning and
 +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a
 +function of a multi-function IOA, or multiple IOAs (possibly including 
 switch
 +and bridge structures above the multiple IOAs). PPC64 guests detect PCI 
 errors
 +and recover from them via EEH RTAS services, which works on the basis of
 +additional ioctl commands.
 +
 +So 7 additional ioctls have been added:
  
  VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
  of the DMA window on the PCI bus.
 @@ -316,6 +324,17 @@ So 3 additional ioctls have been added:
  
  VFIO_IOMMU_DISABLE - disables the container.
  
 +VFIO_EEH_PE_SET_OPTION - enables or disables EEH functionality on the
 +specified device. Also, it can be used to remove IO or DMA
 +stopped state on the frozen PE.
 +
 +VFIO_EEH_PE_GET_STATE - retrieve PE's state: frozen or normal state.
 +
 +VFIO_EEH_PE_RESET - do PE reset, which is one of the major steps for
 +error recovery.
 +
 +VFIO_EEH_PE_CONFIGURE - configure the PCI bridges after PE reset. It's
 +one of the major steps for error recovery.
  
  The code flow from the example above should be slightly changed:
  
 @@ -346,6 +365,77 @@ The code flow from the example above should be slightly 
 changed:
  ioctl(container, VFIO_IOMMU_MAP_DMA, dma_map);
  .
  
 +Based on the initial example we have, the following piece of code could be
 +reference for EEH setup and error handling:
 +
 +struct vfio_eeh_pe_set_option option = { .argsz = sizeof(option) };
 +struct vfio_eeh_pe_get_state state = { .argsz = sizeof(state) };
 +struct vfio_eeh_pe_reset reset = { .argsz = sizeof(reset) };
 +struct vfio_eeh_pe_configure configure = { .argsz = sizeof(configure) };
 +
 +
 +
 +/* Get a file descriptor for the device */
 +device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 +
 +/* Enable the EEH functionality on the device */
 +option.option = VFIO_EEH_PE_SET_OPT_ENABLE;
 +ioctl(container, VFIO_EEH_PE_SET_OPTION, option);
 +
 +/* You're suggested to create additional data struct to represent
 + * PE, and put child devices belonging to same IOMMU group to the
 + * PE instance for later reference.
 + */
 +
 +/* Check the PE's state and make sure it's in functional state */
 +ioctl(container, VFIO_EEH_PE_GET_STATE, state);
 +
 +/* Save device's state. pci_save_state() would be good enough
 + * as an example.
 + */
 +
 +/* Test and setup the device */
 +ioctl(device, VFIO_DEVICE_GET_INFO, device_info);
 +
 +
 +
 +/* When 0xFF's returned from reading PCI config space or IO BARs
 + * of the PCI device. Check the PE state to see if that has been
 + * frozen.
 + */
 +ioctl(container, VFIO_EEH_PE_GET_STATE, state);
 +
 +/* Waiting for pending PCI transactions to be completed and don't
 + * produce any more PCI traffic from/to the affected PE until
 + * recovery is finished.
 + */
 +
 +/* Enable IO for the affected PE and collect logs. Usually, the
 + * standard part of PCI config space, AER registers are dumped
 + * as logs for further analysis.
 + */
 +option.option = VFIO_EEH_PE_SET_OPT_IO;
 +ioctl(container, VFIO_EEH_PE_SET_OPTION, option);
 +
 +/* Issue PE reset */
 +reset.option = VFIO_EEH_PE_RESET_HOT;
 +ioctl(container, VFIO_EEH_PE_RESET, reset);
 +reset.option = VFIO_EEH_PE_RESET_DEACTIVATE;
 

Re: [PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Alex Williamson
On Wed, 2014-05-28 at 02:44 +0200, Alexander Graf wrote:
 On 28.05.14 02:39, Alex Williamson wrote:
  On Wed, 2014-05-28 at 00:49 +0200, Alexander Graf wrote:
  On 27.05.14 20:15, Alex Williamson wrote:
  On Tue, 2014-05-27 at 18:40 +1000, Gavin Shan wrote:
  The patch adds new IOCTL commands for sPAPR VFIO container device
  to support EEH functionality for PCI devices, which have been passed
  through from host to somebody else via VFIO.
 
  Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
  ---
 Documentation/vfio.txt  | 92 
  -
 drivers/vfio/pci/Makefile   |  1 +
 drivers/vfio/pci/vfio_pci.c | 20 +---
 drivers/vfio/pci/vfio_pci_eeh.c | 46 +++
 drivers/vfio/pci/vfio_pci_private.h |  5 ++
 drivers/vfio/vfio_iommu_spapr_tce.c | 85 
  ++
 include/uapi/linux/vfio.h   | 66 ++
 7 files changed, 308 insertions(+), 7 deletions(-)
 create mode 100644 drivers/vfio/pci/vfio_pci_eeh.c
  [...]
 
  +
  +return ret;
  +}
  +
 static long tce_iommu_ioctl(void *iommu_data,
unsigned int cmd, unsigned long arg)
 {
  @@ -283,6 +363,11 @@ static long tce_iommu_ioctl(void *iommu_data,
   tce_iommu_disable(container);
   mutex_unlock(&container->lock);
   return 0;
  +case VFIO_EEH_PE_SET_OPTION:
  +case VFIO_EEH_PE_GET_STATE:
  +case VFIO_EEH_PE_RESET:
  +case VFIO_EEH_PE_CONFIGURE:
  +return tce_iommu_eeh_ioctl(iommu_data, cmd, arg);
  This is where it would have really made sense to have a single
  VFIO_EEH_OP ioctl with a data structure passed to indicate the sub-op.
  AlexG, are you really attached to splitting these out into separate
  ioctls?
  I don't see the problem. We need to forward 4 ioctls to a separate piece
  of code, so we forward 4 ioctls to a separate piece of code :). Putting
  them into one ioctl just moves the switch() into another function.
  And uses an extra 3 ioctl numbers and gives us extra things to update if
  we ever need to add more ioctls, etc.  ioctl numbers are an address
  space, how much address space do we really want to give to EEH?  It's
  not a big difference, but I don't think it's completely even either.
  Thanks,
 
 Yes, that's the point. I by far prefer to have you push back on anyone 
 who introduces useless ioctls rather than have a separate EEH number 
 space that people can just throw anything in they like ;).

Well, I appreciate that, but having them as separate ioctls doesn't
really prevent that either.  Any one of these 4 could be set to take a
sub-option to extend and contort the EEH interface.  The only way to
prevent that would be to avoid the argsz+flags hack that make the ioctl
extendable.  Thanks,

Alex


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 00/16] perf: add support for parameterized events from sysfs (powerpc 24x7)

2014-05-27 Thread Cody P Schafer
What this patchset does:

 - the first patch (override sysfs in tools/perf via SYSFS_PATH) was sent out
   previously, but needed a resend anyhow. Having it is useful for testing the
   later changes to tools/perf.
 - the second patch is a bugfix to the powerpc hv-24x7 code which was
   previously sent out, which is a good idea to have when testing these patches
   on POWER8 hardware.

 - document perf sysfs and the changes to add parameterized events
   - semi-notably: removes the growing list of specific POWER cpu events and
 begins documenting them generically, much like the docs for
 /sys/modules/MODULENAME do for modules.
 - tools/perf changes to support parameterized events
 - export some parameterized events from the powerpc pmus hv_24x7 and hv_gpci

Description of event parameters from the documentation patch:

Event parameters are a basic way for partial events to be specified in
sysfs with per-event names given to the fields that need to be filled in
when using a particular event.

It is intended for supporting cases where the single 'cpu' parameter is
insufficient. For example, POWER 8 has events for physical
sockets/cores/cpus that are accessible from within virtual machines. To
keep using the single 'cpu' parameter we'd need to perform a mapping
between Linux's cpus and the physical machine's cpus (in this case
Linux is running under a hypervisor). This isn't possible because
bindings between our cpus and physical cpus may not be fixed, and we
probably won't have a cpu on each physical cpu.

Description of the sysfs contents when events are parameterized (copied from an
included patch):

Examples:

domain=0x1,offset=0x8,starting_index=phys_cpu

In the case of the last example, a value replacing phys_cpu
would need to be provided by the user selecting the particular
event. This is referred to as event parameterization. All
non-numerical values indicate an event parameter.

Notes on how perf-list displays parameterized events (and how to use them,
again culled from an included patch):

PARAMETERIZED EVENTS


Some pmu events listed by 'perf-list' will be displayed with '?' in 
them. For
example:

  hv_gpci/dtbp_ptitc,phys_processor_idx=?/

This means that when provided as an event, a value for 
phys_processor_idx must
also be supplied. For example:

  perf stat -e 'hv_gpci/dtbp_ptitc,phys_processor_idx=0x2/' ...


Cody P Schafer (16):
  tools/perf: allow overriding sysfs and proc finding with env var
  powerpc/perf/hv-24x7: use kmem_cache instead of aligned stack
allocations
  perf Documentation: sysfs events/ interfaces
  perf Documentation: remove duplicated docs for powerpc cpu specific
events
  perf Documentation: add event parameters
  tools/perf: annotate list_head with type info
  tools/perf: support parsing parameterized events
  tools/perf: extend format_alias() to include event parameters
  tools/perf: document parameterized events and note symbolically formed
events
  perf: provide sysfs_show for struct perf_pmu_events_attr
  byteorder: provide a linux/byteorder.h with {be,le}_to_cpu() and
cpu_to_{be,le}() macros
  powerpc/perf/hv-24x7: parse catalog and populate sysfs with events
  powerpc/perf/hv-24x7: Documentaion for new sysfs entries which expose
descriptions
  perf: add PMU_EVENT_ATTR_STRING() helper
  powerpc/perf/{hv-gpci,hv-common}: generate requests with counters
annotated
  powerpc/perf/hv-gpci: add the remaining gpci requests

 .../testing/sysfs-bus-event_source-devices-events  | 617 ++--
 .../testing/sysfs-bus-event_source-devices-hv_24x7 |  22 +
 arch/powerpc/perf/hv-24x7-catalog.h|  25 +
 arch/powerpc/perf/hv-24x7-domains.h|  19 +
 arch/powerpc/perf/hv-24x7.c| 812 -
 arch/powerpc/perf/hv-24x7.h|  12 +-
 arch/powerpc/perf/hv-common.c  |  10 +-
 arch/powerpc/perf/hv-gpci-requests.h   | 258 +++
 arch/powerpc/perf/hv-gpci.c|   8 +
 arch/powerpc/perf/hv-gpci.h|  37 +-
 arch/powerpc/perf/req-gen/_begin.h |  13 +
 arch/powerpc/perf/req-gen/_clear.h |   5 +
 arch/powerpc/perf/req-gen/_end.h   |   4 +
 arch/powerpc/perf/req-gen/_request-begin.h |  15 +
 arch/powerpc/perf/req-gen/_request-end.h   |   8 +
 arch/powerpc/perf/req-gen/perf.h   | 155 
 include/linux/byteorder.h  |  34 +
 include/linux/perf_event.h |  10 +
 kernel/events/core.c   |   8 +
 tools/lib/api/fs/fs.c  |  43 +-
 tools/perf/Documentation/perf-list.txt |  13 +
 tools/perf/Documentation/perf-record.txt   |   

[PATCH 01/16] tools/perf: allow overriding sysfs and proc finding with env var

2014-05-27 Thread Cody P Schafer
SYSFS_PATH and PROC_PATH environment variables now let the user override
the detection of sysfs and proc locations for testing purposes.

CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 tools/lib/api/fs/fs.c | 43 ++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c
index 5b5eb78..c1b49c3 100644
--- a/tools/lib/api/fs/fs.c
+++ b/tools/lib/api/fs/fs.c
@@ -1,8 +1,10 @@
 /* TODO merge/factor in debugfs.c here */
 
+#include <ctype.h>
 #include <errno.h>
 #include <stdbool.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 #include <sys/vfs.h>
 
@@ -96,12 +98,51 @@ static bool fs__check_mounts(struct fs *fs)
return false;
 }
 
+static void mem_toupper(char *f, size_t len)
+{
+   while (len) {
+   *f = toupper(*f);
+   f++;
+   len--;
+   }
+}
+
+/*
+ * Check for NAME_PATH environment variable to override fs location (for
+ * testing). This matches the recommendation in Documentation/sysfs-rules.txt
+ * for SYSFS_PATH.
+ */
+static bool fs__env_override(struct fs *fs)
+{
+   char *override_path;
+   size_t name_len = strlen(fs->name);
+   /* name + _PATH + '\0' */
+   char upper_name[name_len + 5 + 1];
+   memcpy(upper_name, fs->name, name_len);
+   mem_toupper(upper_name, name_len);
+   strcpy(&upper_name[name_len], "_PATH");
+
+   override_path = getenv(upper_name);
+   if (!override_path)
+   return false;
+
+   fs->found = true;
+   strncpy(fs->path, override_path, sizeof(fs->path));
+   return true;
+}
+
 static const char *fs__get_mountpoint(struct fs *fs)
 {
+   if (fs__env_override(fs))
+   return fs->path;
+
if (fs__check_mounts(fs))
return fs->path;
 
-   return fs__read_mounts(fs) ? fs->path : NULL;
+   if (fs__read_mounts(fs))
+   return fs->path;
+
+   return NULL;
 }
 
 static const char *fs__mountpoint(int idx)
-- 
1.9.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 02/16] powerpc/perf/hv-24x7: use kmem_cache instead of aligned stack allocations

2014-05-27 Thread Cody P Schafer
Ian pointed out the use of __aligned(4096) caused rather large stack
consumption in single_24x7_request(), so use the kmem_cache
hv_page_cache (which we've already got set up for other allocations)
instead of allocating locally.

CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Reported-by: Ian Munsie imun...@au1.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 arch/powerpc/perf/hv-24x7.c | 52 -
 1 file changed, 37 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index e0766b8..9a7a830 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -294,7 +294,7 @@ static unsigned long single_24x7_request(u8 domain, u32 
offset, u16 ix,
 u16 lpar, u64 *res,
 bool success_expected)
 {
-   unsigned long ret;
+   unsigned long ret = -ENOMEM;
 
/*
 * request_buffer and result_buffer are not required to be 4k aligned,
@@ -304,7 +304,27 @@ static unsigned long single_24x7_request(u8 domain, u32 
offset, u16 ix,
struct reqb {
struct hv_24x7_request_buffer buf;
struct hv_24x7_request req;
-   } __packed __aligned(4096) request_buffer = {
+   } __packed *request_buffer;
+   struct resb {
+   struct hv_24x7_data_result_buffer buf;
+   struct hv_24x7_result res;
+   struct hv_24x7_result_element elem;
+   __be64 result;
+   } __packed *result_buffer;
+
+   BUILD_BUG_ON(sizeof(*request_buffer) > 4096);
+   BUILD_BUG_ON(sizeof(*result_buffer) > 4096);
+
+   request_buffer = kmem_cache_alloc(hv_page_cache, GFP_USER);
+
+   if (!request_buffer)
+   goto out_reqb;
+
+   result_buffer = kmem_cache_zalloc(hv_page_cache, GFP_USER);
+   if (!result_buffer)
+   goto out_resb;
+
+   *request_buffer = (struct reqb) {
.buf = {
.interface_version = HV_24X7_IF_VERSION_CURRENT,
.num_requests = 1,
@@ -320,28 +340,30 @@ static unsigned long single_24x7_request(u8 domain, u32 
offset, u16 ix,
}
};
 
-   struct resb {
-   struct hv_24x7_data_result_buffer buf;
-   struct hv_24x7_result res;
-   struct hv_24x7_result_element elem;
-   __be64 result;
-   } __packed __aligned(4096) result_buffer = {};
-
ret = plpar_hcall_norets(H_GET_24X7_DATA,
-   virt_to_phys(request_buffer), sizeof(request_buffer),
-   virt_to_phys(result_buffer),  sizeof(result_buffer));
+   virt_to_phys(request_buffer), sizeof(*request_buffer),
+   virt_to_phys(result_buffer),  sizeof(*result_buffer));
 
if (ret) {
if (success_expected)
pr_err_ratelimited(hcall failed: %d %#x %#x %d = 
0x%lx (%ld) detail=0x%x failing ix=%x\n,
domain, offset, ix, lpar,
ret, ret,
-   result_buffer.buf.detailed_rc,
-   result_buffer.buf.failing_request_ix);
-   return ret;
+   result_buffer-buf.detailed_rc,
+   result_buffer-buf.failing_request_ix);
+   goto out_hcall;
}
 
-   *res = be64_to_cpu(result_buffer.result);
+   *res = be64_to_cpu(result_buffer-result);
+   kfree(result_buffer);
+   kfree(request_buffer);
+   return ret;
+
+out_hcall:
+   kfree(result_buffer);
+out_resb:
+   kfree(request_buffer);
+out_reqb:
return ret;
 }
 
-- 
1.9.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 03/16] perf Documentation: sysfs events/ interfaces

2014-05-27 Thread Cody P Schafer
Add documentation for the event, event.scale, and event.unit
files in sysfs.

event.scale and event.unit were undocumented.
event was previously documented only for specific powerpc pmu events.

CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 .../testing/sysfs-bus-event_source-devices-events  | 60 ++
 1 file changed, 60 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-events 
b/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
index 7b40a3c..a5226f0 100644
--- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
@@ -599,3 +599,63 @@ Description:   POWER-systems specific performance 
monitoring events
Further, multiple terms like 'event=0x' can be specified
and separated with comma. All available terms are defined in
the /sys/bus/event_source/devices/dev/format file.
+
+What: /sys/bus/event_source/devices/pmu/events/event
+Date: 2014/02/24
+Contact:   Linux kernel mailing list linux-ker...@vger.kernel.org
+Description:   Per-pmu performance monitoring events specific to the running 
system
+
+   Each file (except for some of those with a '.' in them, '.unit'
+   and '.scale') in the 'events' directory describes a single
+   performance monitoring event supported by the pmu. The name
+   of the file is the name of the event.
+
+   File contents:
+
+   term[=value][,term[=value]]...
+
+   Where term is one of the terms listed under
+   /sys/bus/event_source/devices/pmu/format/ and value is
+   a number in base-16 format with a '0x' prefix (lowercase only).
+   If a term is specified alone (without an assigned value), it
+   is implied that 0x1 is assigned to that term.
+
+   Examples (each of these lines would be in a separate file):
+
+   event=0x2abc
+   event=0x423,inv,cmask=0x3
+   domain=0x1,offset=0x8,starting_index=0x
+
+   Each of the assignments indicates a value to be assigned to a
+   particular set of bits (as defined by the format file
+   corresponding to the term) in the perf_event structure passed
+   to the perf_open syscall.
+
+What: /sys/bus/event_source/devices/pmu/events/event.unit
+Date: 2014/02/24
+Contact:   Linux kernel mailing list linux-ker...@vger.kernel.org
+Description:   Perf event units
+
+   A string specifying the English plural numerical unit that 
event
+   (once multiplied by event.scale) represents.
+
+   Example:
+
+   Joules
+
+What: /sys/bus/event_source/devices/pmu/events/event.scale
+Date: 2014/02/24
+Contact:   Linux kernel mailing list linux-ker...@vger.kernel.org
+Description:   Perf event scaling factors
+
+   A string representing a floating point value expressed in
+   scientific notation to be multiplied by the event count
+   received from the kernel to match the unit specified in the
+   event.unit file.
+
+   Example:
+
+   2.3283064365386962890625e-10
+
+   This is provided to avoid performing floating point arithmetic
+   in the kernel.
-- 
1.9.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 04/16] perf Documentation: remove duplicated docs for powerpc cpu specific events

2014-05-27 Thread Cody P Schafer
Listing specific events doesn't actually help us at all here because:
 - these events actually vary between different ppc processors, they
   aren't guaranteed to be present.
 - the documentation of the (generic) file contents is now superseded by the
   docs for arbitrary event file contents.

CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 .../testing/sysfs-bus-event_source-devices-events  | 573 -
 1 file changed, 573 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-events 
b/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
index a5226f0..20979f8 100644
--- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
@@ -27,579 +27,6 @@ Description:Generic performance monitoring events
basename.
 
 
-What:  /sys/devices/cpu/events/PM_1PLUS_PPC_CMPL
-   /sys/devices/cpu/events/PM_BRU_FIN
-   /sys/devices/cpu/events/PM_BR_MPRED
-   /sys/devices/cpu/events/PM_CMPLU_STALL
-   /sys/devices/cpu/events/PM_CMPLU_STALL_BRU
-   /sys/devices/cpu/events/PM_CMPLU_STALL_DCACHE_MISS
-   /sys/devices/cpu/events/PM_CMPLU_STALL_DFU
-   /sys/devices/cpu/events/PM_CMPLU_STALL_DIV
-   /sys/devices/cpu/events/PM_CMPLU_STALL_ERAT_MISS
-   /sys/devices/cpu/events/PM_CMPLU_STALL_FXU
-   /sys/devices/cpu/events/PM_CMPLU_STALL_IFU
-   /sys/devices/cpu/events/PM_CMPLU_STALL_LSU
-   /sys/devices/cpu/events/PM_CMPLU_STALL_REJECT
-   /sys/devices/cpu/events/PM_CMPLU_STALL_SCALAR
-   /sys/devices/cpu/events/PM_CMPLU_STALL_SCALAR_LONG
-   /sys/devices/cpu/events/PM_CMPLU_STALL_STORE
-   /sys/devices/cpu/events/PM_CMPLU_STALL_THRD
-   /sys/devices/cpu/events/PM_CMPLU_STALL_VECTOR
-   /sys/devices/cpu/events/PM_CMPLU_STALL_VECTOR_LONG
-   /sys/devices/cpu/events/PM_CYC
-   /sys/devices/cpu/events/PM_GCT_NOSLOT_BR_MPRED
-   /sys/devices/cpu/events/PM_GCT_NOSLOT_BR_MPRED_IC_MISS
-   /sys/devices/cpu/events/PM_GCT_NOSLOT_CYC
-   /sys/devices/cpu/events/PM_GCT_NOSLOT_IC_MISS
-   /sys/devices/cpu/events/PM_GRP_CMPL
-   /sys/devices/cpu/events/PM_INST_CMPL
-   /sys/devices/cpu/events/PM_LD_MISS_L1
-   /sys/devices/cpu/events/PM_LD_REF_L1
-   /sys/devices/cpu/events/PM_RUN_CYC
-   /sys/devices/cpu/events/PM_RUN_INST_CMPL
-   /sys/devices/cpu/events/PM_IC_DEMAND_L2_BR_ALL
-   /sys/devices/cpu/events/PM_GCT_UTIL_7_TO_10_SLOTS
-   /sys/devices/cpu/events/PM_PMC2_SAVED
-   /sys/devices/cpu/events/PM_VSU0_16FLOP
-   /sys/devices/cpu/events/PM_MRK_LSU_DERAT_MISS
-   /sys/devices/cpu/events/PM_MRK_ST_CMPL
-   /sys/devices/cpu/events/PM_NEST_PAIR3_ADD
-   /sys/devices/cpu/events/PM_L2_ST_DISP
-   /sys/devices/cpu/events/PM_L2_CASTOUT_MOD
-   /sys/devices/cpu/events/PM_ISEG
-   /sys/devices/cpu/events/PM_MRK_INST_TIMEO
-   /sys/devices/cpu/events/PM_L2_RCST_DISP_FAIL_ADDR
-   /sys/devices/cpu/events/PM_LSU1_DC_PREF_STREAM_CONFIRM
-   /sys/devices/cpu/events/PM_IERAT_WR_64K
-   /sys/devices/cpu/events/PM_MRK_DTLB_MISS_16M
-   /sys/devices/cpu/events/PM_IERAT_MISS
-   /sys/devices/cpu/events/PM_MRK_PTEG_FROM_LMEM
-   /sys/devices/cpu/events/PM_FLOP
-   /sys/devices/cpu/events/PM_THRD_PRIO_4_5_CYC
-   /sys/devices/cpu/events/PM_BR_PRED_TA
-   /sys/devices/cpu/events/PM_EXT_INT
-   /sys/devices/cpu/events/PM_VSU_FSQRT_FDIV
-   /sys/devices/cpu/events/PM_MRK_LD_MISS_EXPOSED_CYC
-   /sys/devices/cpu/events/PM_LSU1_LDF
-   /sys/devices/cpu/events/PM_IC_WRITE_ALL
-   /sys/devices/cpu/events/PM_LSU0_SRQ_STFWD
-   /sys/devices/cpu/events/PM_PTEG_FROM_RL2L3_MOD
-   /sys/devices/cpu/events/PM_MRK_DATA_FROM_L31_SHR
-   /sys/devices/cpu/events/PM_DATA_FROM_L21_MOD
-   /sys/devices/cpu/events/PM_VSU1_SCAL_DOUBLE_ISSUED
-   /sys/devices/cpu/events/PM_VSU0_8FLOP
-   /sys/devices/cpu/events/PM_POWER_EVENT1
-   /sys/devices/cpu/events/PM_DISP_CLB_HELD_BAL
-   /sys/devices/cpu/events/PM_VSU1_2FLOP
-   /sys/devices/cpu/events/PM_LWSYNC_HELD
-   /sys/devices/cpu/events/PM_PTEG_FROM_DL2L3_SHR
-   /sys/devices/cpu/events/PM_INST_FROM_L21_MOD
-   /sys/devices/cpu/events/PM_IERAT_XLATE_WR_16MPLUS
-   /sys/devices/cpu/events/PM_IC_REQ_ALL
-   

[PATCH 05/16] perf Documentation: add event parameters

2014-05-27 Thread Cody P Schafer
Event parameters are a basic way for partial events to be specified in
sysfs with per-event names given to the fields that need to be filled in
when using a particular event.

It is intended for supporting cases where the single 'cpu' parameter is
insufficient. For example, POWER 8 has events for physical
sockets/cores/cpus that are accessible from within virtual machines. To
keep using the single 'cpu' parameter we'd need to perform a mapping
between Linux's cpus and the physical machine's cpus (in this case
Linux is running under a hypervisor). This isn't possible because
bindings between our cpus and physical cpus may not be fixed, and we
probably won't have a cpu on each physical cpu.

CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 Documentation/ABI/testing/sysfs-bus-event_source-devices-events | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-events 
b/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
index 20979f8..c1f9850 100644
--- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-events
@@ -52,12 +52,18 @@ Description:Per-pmu performance monitoring events 
specific to the running syste
event=0x2abc
event=0x423,inv,cmask=0x3
domain=0x1,offset=0x8,starting_index=0x
+   domain=0x1,offset=0x8,starting_index=phys_cpu
 
Each of the assignments indicates a value to be assigned to a
particular set of bits (as defined by the format file
corresponding to the term) in the perf_event structure passed
to the perf_open syscall.
 
+   In the case of the last example, a value replacing phys_cpu
+   would need to be provided by the user selecting the particular
+   event. This is referred to as event parameterization. All
+   non-numerical values indicate an event parameter.
+
 What: /sys/bus/event_source/devices/pmu/events/event.unit
 Date: 2014/02/24
 Contact:   Linux kernel mailing list linux-ker...@vger.kernel.org
-- 
1.9.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 06/16] tools/perf: annotate list_head with type info

2014-05-27 Thread Cody P Schafer
CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 tools/perf/util/pmu.c | 4 ++--
 tools/perf/util/pmu.h | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 00a7dcb..906ae40 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -14,8 +14,8 @@
 
 struct perf_pmu_alias {
char *name;
-   struct list_head terms;
-   struct list_head list;
+   struct list_head terms; /* HEAD struct parse_events_term - list */
+   struct list_head list;  /* ELEM */
char unit[UNIT_MAX_LEN+1];
double scale;
 };
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 8b64125..4a85230 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -17,9 +17,9 @@ struct perf_pmu {
char *name;
__u32 type;
struct cpu_map *cpus;
-   struct list_head format;
-   struct list_head aliases;
-   struct list_head list;
+   struct list_head format;  /* HEAD struct perf_pmu_format - list */
+   struct list_head aliases; /* HEAD struct perf_pmu_alias - list */
+   struct list_head list;/* ELEM */
 };
 
 struct perf_pmu *perf_pmu__find(const char *name);
-- 
1.9.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 07/16] tools/perf: support parsing parameterized events

2014-05-27 Thread Cody P Schafer
Enable event specification like:

pmu/event_name,param1=0x1,param2=0x4/

Assuming that

/sys/bus/event_source/devices/pmu/events/event_name

Contains something like

bar=param2,foo=1,baz=param1

CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 tools/perf/util/parse-events.h |  1 +
 tools/perf/util/pmu.c  | 55 ++
 2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index f1cb4c4..1147e87 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -60,6 +60,7 @@ struct parse_events_term {
int type_val;
int type_term;
struct list_head list;
+   bool used;
 };
 
 struct parse_events_evlist {
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 906ae40..db53fac 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -504,27 +504,57 @@ static __u64 pmu_format_value(unsigned long *format, 
__u64 value)
 }
 
 /*
+ * Term is a string term, and might be a param-term. Try to look up its value
+ * in the remaining terms.
+ * - We have a term like base-or-format-term=param-term,
+ * - We need to find the value supplied for param-term (with param-term named
+ *   in a config string) later on in the term list.
+ */
+static int pmu_resolve_param_term(struct parse_events_term *term,
+ struct list_head *head_terms,
+ __u64 *value)
+{
+   struct parse_events_term *t;
+
+   list_for_each_entry(t, head_terms, list)
+   if (t-type_val == PARSE_EVENTS__TERM_TYPE_NUM) {
+   if (!strcmp(t-config, term-val.str)) {
+   t-used = true;
+   *value = t-val.num;
+   return 0;
+   }
+   }
+
+   return -1;
+}
+
+/*
  * Setup one of config[12] attr members based on the
  * user input data - term parameter.
  */
 static int pmu_config_term(struct list_head *formats,
   struct perf_event_attr *attr,
-  struct parse_events_term *term)
+  struct parse_events_term *term,
+  struct list_head *head_terms)
 {
struct perf_pmu_format *format;
__u64 *vp;
+   __u64 val;
+
+   /*
+* If this is a parameter we've already used for parameterized-eval,
+* skip it in normal eval.
+*/
+   if (term-used)
+   return 0;
 
/*
-* Support only for hardcoded and numnerial terms.
 * Hardcoded terms should be already in, so nothing
 * to be done for them.
 */
if (parse_events__is_hardcoded_term(term))
return 0;
 
-   if (term-type_val != PARSE_EVENTS__TERM_TYPE_NUM)
-   return -EINVAL;
-
format = pmu_find_format(formats, term-config);
if (!format)
return -EINVAL;
@@ -544,11 +574,16 @@ static int pmu_config_term(struct list_head *formats,
}
 
/*
-* XXX If we ever decide to go with string values for
-* non-hardcoded terms, here's the place to translate
-* them into value.
+* Either directly use a numeric term, or try to translate string terms
+* using event parameters.
 */
-   *vp |= pmu_format_value(format-bits, term-val.num);
+   if (term-type_val == PARSE_EVENTS__TERM_TYPE_NUM)
+   val = term-val.num;
+   else
+   if (pmu_resolve_param_term(term, head_terms, val))
+   return -EINVAL;
+
+   *vp |= pmu_format_value(format-bits, val);
return 0;
 }
 
@@ -559,7 +594,7 @@ int perf_pmu__config_terms(struct list_head *formats,
struct parse_events_term *term;
 
list_for_each_entry(term, head_terms, list)
-   if (pmu_config_term(formats, attr, term))
+   if (pmu_config_term(formats, attr, term, head_terms))
return -EINVAL;
 
return 0;
-- 
1.9.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 08/16] tools/perf: extend format_alias() to include event parameters

2014-05-27 Thread Cody P Schafer
This causes `perf list pmu` to show parameters for parameterized events
like follows:

  pmu/event_name,param1=?,param2=?/ [Kernel PMU event]

An example:

  
hv_gpci/dispatch_timebase_by_processor_processor_time_in_timebase_cycles,phys_processor_idx=?/
 [Kernel PMU event]

CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 tools/perf/util/pmu.c | 26 +-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index db53fac..7b8d067 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -741,10 +741,33 @@ void perf_pmu__set_format(unsigned long *bits, long from, 
long to)
set_bit(b, bits);
 }
 
+static int sub_non_neg(int a, int b)
+{
+   if (b > a)
+   return 0;
+   return a - b;
+}
+
 static char *format_alias(char *buf, int len, struct perf_pmu *pmu,
  struct perf_pmu_alias *alias)
 {
-   snprintf(buf, len, %s/%s/, pmu-name, alias-name);
+   struct parse_events_term *term;
+   int used = snprintf(buf, len, %s/%s, pmu-name, alias-name);
+
+   list_for_each_entry(term, alias-terms, list)
+   if (term-type_val == PARSE_EVENTS__TERM_TYPE_STR)
+   used += snprintf(buf + used, sub_non_neg(len, used),
+   ,%s=?, term-val.str);
+
+   if (sub_non_neg(len, used)  0) {
+   buf[used] = '/';
+   used++;
+   }
+   if (sub_non_neg(len, used)  0) {
+   buf[used] = '\0';
+   used++;
+   } else
+   buf[len - 1] = '\0';
return buf;
 }
 
@@ -795,6 +818,7 @@ void print_pmu_events(const char *event_glob, bool 
name_only)
if (is_cpu  !name_only)
aliases[j] = format_alias_or(buf, sizeof(buf),
  pmu, alias);
+
aliases[j] = strdup(aliases[j]);
j++;
}
-- 
1.9.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 09/16] tools/perf: document parameterized events and note symbolically formed events

2014-05-27 Thread Cody P Schafer
CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 tools/perf/Documentation/perf-list.txt   | 13 +
 tools/perf/Documentation/perf-record.txt |  5 +
 2 files changed, 18 insertions(+)

diff --git a/tools/perf/Documentation/perf-list.txt 
b/tools/perf/Documentation/perf-list.txt
index 6fce6a6..626818b 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -89,6 +89,19 @@ raw encoding of 0x1A8 can be used:
 You should refer to the processor specific documentation for getting these
 details. Some of them are referenced in the SEE ALSO section below.
 
+PARAMETERIZED EVENTS
+
+
+Some pmu events listed by 'perf-list' will be displayed with '?' in them. For
+example:
+
+  hv_gpci/dtbp_ptitc,phys_processor_idx=?/
+
+This means that when provided as an event, a value for phys_processor_idx must
+also be supplied. For example:
+
+  perf stat -e 'hv_gpci/dtbp_ptitc,phys_processor_idx=0x2/' ...
+
 OPTIONS
 ---
 
diff --git a/tools/perf/Documentation/perf-record.txt 
b/tools/perf/Documentation/perf-record.txt
index c71b0f3..c005180 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -33,6 +33,11 @@ OPTIONS
 - a raw PMU event (eventsel+umask) in the form of rNNN where NNN is a
  hexadecimal event descriptor.
 
+   - a symbolically formed PMU event like 'pmu/value1=0x3,value2/' where
+ 'value1' and 'value2' are defined as formats in
+ /sys/bus/event_sources/devices/pmu/format/* OR are one of 'config',
+ 'config1', 'config2'.
+
 - a hardware breakpoint event in the form of '\mem:addr[:access]'
   where addr is the address in memory you want to break in.
   Access is the memory access type (read, write, execute) it can
-- 
1.9.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 10/16] perf: provide sysfs_show for struct perf_pmu_events_attr

2014-05-27 Thread Cody P Schafer
(struct perf_pmu_events_attr) is defined in include/linux/perf_event.h,
but the only show for it is in x86 and contains x86 specific stuff.

Make a generic one for those of us who are just using the event_str.

CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 include/linux/perf_event.h | 3 +++
 kernel/events/core.c   | 8 
 2 files changed, 11 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3356abc..6c1d6dd 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -867,6 +867,9 @@ struct perf_pmu_events_attr {
const char *event_str;
 };
 
+ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute 
*attr,
+ char *page);
+
 #define PMU_EVENT_ATTR(_name, _var, _id, _show)
\
 static struct perf_pmu_events_attr _var = {\
.attr = __ATTR(_name, 0444, _show, NULL),   \
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f83a71a..6830e21 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7971,6 +7971,14 @@ void __init perf_event_init(void)
 != 1024);
 }
 
+ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute 
*attr,
+ char *page)
+{
+   struct perf_pmu_events_attr *pmu_attr =
+   container_of(attr, struct perf_pmu_events_attr, attr);
+   return sprintf(page, %s\n, pmu_attr-event_str);
+}
+
 static int __init perf_event_sysfs_init(void)
 {
struct pmu *pmu;
-- 
1.9.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 11/16] byteorder: provide a linux/byteorder.h with {be, le}_to_cpu() and cpu_to_{be, le}() macros

2014-05-27 Thread Cody P Schafer
Rather than manually specifying the size of the integer to be converted, key
off of the type size. Reduces duplicate size info and the occurrence of
certain types of bugs (using the wrong sized conversion).

CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 include/linux/byteorder.h | 34 ++
 1 file changed, 34 insertions(+)
 create mode 100644 include/linux/byteorder.h

diff --git a/include/linux/byteorder.h b/include/linux/byteorder.h
new file mode 100644
index 000..c7ab8da
--- /dev/null
+++ b/include/linux/byteorder.h
@@ -0,0 +1,34 @@
+#ifndef LINUX_BYTEORDER_H_
+#define LINUX_BYTEORDER_H_
+
+#include asm/byteorder.h
+
+#define be_to_cpu(v) \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint8_t) , v, \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint16_t), be16_to_cpu(v), \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint32_t), be32_to_cpu(v), \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint64_t), be64_to_cpu(v), \
+   (void)0
+
+#define le_to_cpu(v) \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint8_t) , v, \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint16_t), le16_to_cpu(v), \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint32_t), le32_to_cpu(v), \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint64_t), le64_to_cpu(v), \
+   (void)0
+
+#define cpu_to_le(v) \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint8_t) , v, \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint16_t), cpu_to_le16(v), \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint32_t), cpu_to_le32(v), \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint64_t), cpu_to_le64(v), \
+   (void)0
+
+#define cpu_to_be(v) \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint8_t) , v, \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint16_t), cpu_to_be16(v), \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint32_t), cpu_to_be32(v), \
+   __builtin_choose_expr(sizeof(v) == sizeof(uint64_t), cpu_to_be64(v), \
+   (void)0
+
+#endif
-- 
1.9.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 12/16] powerpc/perf/hv-24x7: parse catalog and populate sysfs with events

2014-05-27 Thread Cody P Schafer
Retrieves and parses the 24x7 catalog on POWER systems that supply it
(right now, only POWER 8). Events are exposed via sysfs in the standard
fashion, and are all parameterized.

Catalog is (at the moment) only parsed on boot. It needs re-parsing
when some hypervisor events occur. At that point we'll also need to
prevent old events from continuing to function (counter that is passed
in via spare space in the config values?).

CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 arch/powerpc/perf/hv-24x7-catalog.h |  25 ++
 arch/powerpc/perf/hv-24x7-domains.h |  19 +
 arch/powerpc/perf/hv-24x7.c | 760 +++-
 arch/powerpc/perf/hv-24x7.h |  12 +-
 4 files changed, 804 insertions(+), 12 deletions(-)
 create mode 100644 arch/powerpc/perf/hv-24x7-domains.h

diff --git a/arch/powerpc/perf/hv-24x7-catalog.h 
b/arch/powerpc/perf/hv-24x7-catalog.h
index 21b19dd..69e2e1f 100644
--- a/arch/powerpc/perf/hv-24x7-catalog.h
+++ b/arch/powerpc/perf/hv-24x7-catalog.h
@@ -30,4 +30,29 @@ struct hv_24x7_catalog_page_0 {
__u8 reserved6[2];
 } __packed;
 
+struct hv_24x7_event_data {
+   __be16 length; /* in bytes, must be a multiple of 16 */
+   __u8 reserved1[2];
+   __u8 domain; /* Chip = 1, Core = 2 */
+   __u8 reserved2[1];
+   __be16 event_group_record_offs; /* in bytes, must be 8 byte aligned */
+   __be16 event_group_record_len; /* in bytes */
+
+   /* in bytes, offset from event_group_record */
+   __be16 event_counter_offs;
+
+   /* verified_state, unverified_state, caveat_state, broken_state, ... */
+   __be32 flags;
+
+   __be16 primary_group_ix;
+   __be16 group_count;
+   __be16 event_name_len;
+   __u8 remainder[];
+   /* __u8 event_name[event_name_len - 2]; */
+   /* __be16 event_description_len; */
+   /* __u8 event_desc[event_description_len - 2]; */
+   /* __be16 detailed_desc_len; */
+   /* __u8 detailed_desc[detailed_desc_len - 2]; */
+} __packed;
+
 #endif
diff --git a/arch/powerpc/perf/hv-24x7-domains.h 
b/arch/powerpc/perf/hv-24x7-domains.h
new file mode 100644
index 000..9c5c862
--- /dev/null
+++ b/arch/powerpc/perf/hv-24x7-domains.h
@@ -0,0 +1,19 @@
+
+/*
+ * DOMAIN(name, num, index_kind, is_physical)
+ *
+ * @name: an all caps token, suitable for use in generating an enum member and
+ *appending to an event name in sysfs.
+ * @num: the number corresponding to the domain as given in documentation. We
+ *   assume the catalog domain and the hcall domain have the same numbering
+ *   (so far they do), but this may need to be changed in the future.
+ * @index_kind: a stringifiable token describing the meaning of the index 
within the
+ *  given domain. Must fit the parsing rules of the perf sysfs api.
+ * @is_physical: true if the domain is physical, false otherwise (if virtual).
+ */
+DOMAIN(PHYSICAL_CHIP, 0x01, chip, true)
+DOMAIN(PHYSICAL_CORE, 0x02, core, true)
+DOMAIN(VIRTUAL_PROCESSOR_HOME_CORE, 0x03, vcpu, false)
+DOMAIN(VIRTUAL_PROCESSOR_HOME_CHIP, 0x04, vcpu, false)
+DOMAIN(VIRTUAL_PROCESSOR_HOME_NODE, 0x05, vcpu, false)
+DOMAIN(VIRTUAL_PROCESSOR_REMOTE_NODE, 0x06, vcpu, false)
diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 9a7a830..c9b7c55 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -1,3 +1,4 @@
+#define DEBUG 1
 /*
  * Hypervisor supplied 24x7 performance counter support
  *
@@ -12,9 +13,13 @@
 
 #define pr_fmt(fmt) hv-24x7:  fmt
 
+#include linux/byteorder.h
 #include linux/perf_event.h
+#include linux/rbtree.h
 #include linux/module.h
 #include linux/slab.h
+#include linux/vmalloc.h
+
 #include asm/firmware.h
 #include asm/hvcall.h
 #include asm/io.h
@@ -23,6 +28,66 @@
 #include hv-24x7-catalog.h
 #include hv-common.h
 
+static const char *domain_to_index_string(unsigned domain)
+{
+   switch (domain) {
+#define DOMAIN(n, v, x, c) \
+   case HV_PERF_DOMAIN_##n:\
+   return #x;
+#include hv-24x7-domains.h
+#undef DOMAIN
+   default:
+   WARN(1, unknown domain %d\n, domain);
+   return UNKNOWN_DOMAIN_INDEX_STRING;
+   }
+}
+
+static const char *event_domain_suffix(unsigned domain)
+{
+   switch (domain) {
+#define DOMAIN(n, v, x, c) \
+   case HV_PERF_DOMAIN_##n:\
+   return __ #n;
+#include hv-24x7-domains.h
+#undef DOMAIN
+   default:
+   WARN(1, unknown domain %d\n, domain);
+   return __UNKNOWN_DOMAIN_SUFFIX;
+   }
+}
+
+static bool domain_is_valid(unsigned domain)
+{
+   switch (domain) {
+#define DOMAIN(n, v, x, c) \
+   case HV_PERF_DOMAIN_##n:\
+   /* fall through */
+#include hv-24x7-domains.h
+#undef DOMAIN
+   return true;
+   default:
+   return false;
+   }
+}
+
+static bool 

[PATCH 13/16] powerpc/perf/hv-24x7: Documentation for new sysfs entries which expose descriptions

2014-05-27 Thread Cody P Schafer
CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 .../testing/sysfs-bus-event_source-devices-hv_24x7 | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7 
b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
index e78ee79..5b501d7 100644
--- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
@@ -21,3 +21,25 @@ Contact: Cody P Schafer c...@linux.vnet.ibm.com
 Description:
Exposes the version field of the 24x7 catalog. This is also
extractable from the provided binary catalog sysfs entry.
+
+What:  /sys/bus/event_source/devices/hv_24x7/event_descs/event-name
+Date:  February 2014
+Contact:   Cody P Schafer c...@linux.vnet.ibm.com
+Description:
+   Provides the description of a particular event as provided by
+   the firmware. If firmware does not provide a description, no
+   file will be created.
+
+   Note that the event-name lacks the domain suffix appended for
+   events in the events/ dir.
+
+What:  
/sys/bus/event_source/devices/hv_24x7/event_long_descs/event-name
+Date:  February 2014
+Contact:   Cody P Schafer c...@linux.vnet.ibm.com
+Description:
+   Provides the long description of a particular event as
+   provided by the firmware. If firmware does not provide a
+   description, no file will be created.
+
+   Note that the event-name lacks the domain suffix appended for
+   events in the events/ dir.
-- 
1.9.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 14/16] perf: add PMU_EVENT_ATTR_STRING() helper

2014-05-27 Thread Cody P Schafer
Helper for constructing static struct perf_pmu_events_attr s.

CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 include/linux/perf_event.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6c1d6dd..1313171 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -876,6 +876,13 @@ static struct perf_pmu_events_attr _var = {
\
.id   =  _id,   \
 };
 
+#define PMU_EVENT_ATTR_STRING(_name, _var, _value) \
+static struct perf_pmu_events_attr _var = {\
+   .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL),   \
+   .event_str = _value,\
+};
+
+
 #define PMU_FORMAT_ATTR(_name, _format)
\
 static ssize_t \
 _name##_show(struct device *dev,   \
-- 
1.9.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 16/16] powerpc/perf/hv-gpci: add the remaining gpci requests

2014-05-27 Thread Cody P Schafer
Add the remaining gpci requests that contain counters suitable for use
by perf. Omit those that don't contain any counters (but note their
omission).

CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 arch/powerpc/perf/hv-gpci-requests.h | 179 +++
 1 file changed, 179 insertions(+)

diff --git a/arch/powerpc/perf/hv-gpci-requests.h 
b/arch/powerpc/perf/hv-gpci-requests.h
index 0dfc4d9..af3b73c 100644
--- a/arch/powerpc/perf/hv-gpci-requests.h
+++ b/arch/powerpc/perf/hv-gpci-requests.h
@@ -65,6 +65,33 @@ REQUEST(__count(0,   8,  
processor_time_in_timebase_cycles)
 )
 #include I(REQUEST_END)
 
+#define REQUEST_NAME 
entitled_capped_uncapped_donated_idle_timebase_by_partition
+#define REQUEST_NUM 0x20
+#define REQUEST_IDX_KIND sibling_part_id
+#include I(REQUEST_BEGIN)
+REQUEST(__field(0, 8,  partition_id)
+   __count(0x8,8,  entitled_cycles)
+   __count(0x10,   8,  consumed_capped_cycles)
+   __count(0x18,   8,  consumed_uncapped_cycles)
+   __count(0x20,   8,  cycles_donated)
+   __count(0x28,   8,  purr_idle_cycles)
+)
+#include I(REQUEST_END)
+
+/*
+ * Not available for counter_info_version = 0x8, use
+ * run_instruction_cycles_by_partition(0x100) instead.
+ */
+#define REQUEST_NAME run_instructions_run_cycles_by_partition
+#define REQUEST_NUM 0x30
+#define REQUEST_IDX_KIND sibling_part_id
+#include I(REQUEST_BEGIN)
+REQUEST(__field(0, 8,  partition_id)
+   __count(0x8,8,  instructions_completed)
+   __count(0x10,   8,  cycles)
+)
+#include I(REQUEST_END)
+
 #define REQUEST_NAME system_performance_capabilities
 #define REQUEST_NUM 0x40
 #define REQUEST_IDX_KIND M1
@@ -75,5 +102,157 @@ REQUEST(__field(0, 1,  perf_collect_privileged)
 )
 #include I(REQUEST_END)
 
+#define REQUEST_NAME processor_bus_utilization_abc_links
+#define REQUEST_NUM 0x50
+#define REQUEST_IDX_KIND hw_chip_id
+#include I(REQUEST_BEGIN)
+REQUEST(__field(0, 4,  hw_chip_id)
+   __array(0x4,0xC,reserved1)
+   __count(0x10,   8,  total_link_cycles)
+   __count(0x18,   8,  idle_cycles_for_a_link)
+   __count(0x20,   8,  idle_cycles_for_b_link)
+   __count(0x28,   8,  idle_cycles_for_c_link)
+   __array(0x30,   0x20,   reserved2)
+)
+#include I(REQUEST_END)
+
+#define REQUEST_NAME processor_bus_utilization_wxyz_links
+#define REQUEST_NUM 0x60
+#define REQUEST_IDX_KIND hw_chip_id
+#include I(REQUEST_BEGIN)
+REQUEST(__field(0, 4,  hw_chip_id)
+   __array(0x4,0xC,reserved1)
+   __count(0x10,   8,  total_link_cycles)
+   __count(0x18,   8,  idle_cycles_for_w_link)
+   __count(0x20,   8,  idle_cycles_for_x_link)
+   __count(0x28,   8,  idle_cycles_for_y_link)
+   __count(0x30,   8,  idle_cycles_for_z_link)
+   __array(0x38,   0x28,   reserved2)
+)
+#include I(REQUEST_END)
+
+#define REQUEST_NAME processor_bus_utilization_gx_links
+#define REQUEST_NUM 0x70
+#define REQUEST_IDX_KIND hw_chip_id
+#include I(REQUEST_BEGIN)
+REQUEST(__field(0, 4,  hw_chip_id)
+   __array(0x4,0xC,reserved1)
+   __count(0x10,   8,  gx0_in_address_cycles)
+   __count(0x18,   8,  gx0_in_data_cycles)
+   __count(0x20,   8,  gx0_in_retries)
+   __count(0x28,   8,  gx0_in_bus_cycles)
+   __count(0x30,   8,  gx0_in_cycles_total)
+   __count(0x38,   8,  gx0_out_address_cycles)
+   __count(0x40,   8,  gx0_out_data_cycles)
+   __count(0x48,   8,  gx0_out_retries)
+   __count(0x50,   8,  gx0_out_bus_cycles)
+   __count(0x58,   8,  gx0_out_cycles_total)
+   __count(0x60,   8,  gx1_in_address_cycles)
+   __count(0x68,   8,  gx1_in_data_cycles)
+   __count(0x70,   8,  gx1_in_retries)
+   __count(0x78,   8,  gx1_in_bus_cycles)
+   __count(0x80,   8,  gx1_in_cycles_total)
+   __count(0x88,   8,  gx1_out_address_cycles)
+   __count(0x90,   8,  gx1_out_data_cycles)
+   __count(0x98,   8,  gx1_out_retries)
+   __count(0xA0,   8,  gx1_out_bus_cycles)
+   __count(0xA8,   8,  gx1_out_cycles_total)
+)
+#include I(REQUEST_END)
+
+#define REQUEST_NAME processor_bus_utilization_mc_links
+#define REQUEST_NUM 0x80
+#define REQUEST_IDX_KIND hw_chip_id
+#include I(REQUEST_BEGIN)
+REQUEST(__field(0, 4,  hw_chip_id)
+   __array(0x4,0xC,reserved1)
+   __count(0x10,   8,  mc0_frames)
+   __count(0x18,   8,  mc0_reads)
+   __count(0x20,   8,  mc0_write)
+   __count(0x28,   8,  mc0_total_cycles)
+   __count(0x30,   8,  mc1_frames)
+   __count(0x38,   8,  mc1_reads)
+   __count(0x40,   8,  mc1_writes)
+   __count(0x48,   8,  mc1_total_cycles)
+)
+#include I(REQUEST_END)
+
+/* Processor_config (0x90) skipped, no counters */
+/* Current_processor_frequency 

[PATCH 15/16] powerpc/perf/{hv-gpci, hv-common}: generate requests with counters annotated

2014-05-27 Thread Cody P Schafer
This adds (in req-gen/) a framework for defining gpci counter requests.
It uses macro magic similar to ftrace.

Also convert the existing hv-gpci request structures and enum values to
use the new framework (and adjust old users of the structs and enum
values to cope with changes in naming).

In exchange for this macro disaster, we get autogenerated event listing
for GPCI in sysfs, build time field offset checking, and zero
duplication of information about GPCI requests.

CC: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
Signed-off-by: Cody P Schafer d...@codyps.com
---
 arch/powerpc/perf/hv-common.c  |  10 +-
 arch/powerpc/perf/hv-gpci-requests.h   |  79 +++
 arch/powerpc/perf/hv-gpci.c|   8 ++
 arch/powerpc/perf/hv-gpci.h|  37 +++
 arch/powerpc/perf/req-gen/_begin.h |  13 +++
 arch/powerpc/perf/req-gen/_clear.h |   5 +
 arch/powerpc/perf/req-gen/_end.h   |   4 +
 arch/powerpc/perf/req-gen/_request-begin.h |  15 +++
 arch/powerpc/perf/req-gen/_request-end.h   |   8 ++
 arch/powerpc/perf/req-gen/perf.h   | 155 +
 10 files changed, 304 insertions(+), 30 deletions(-)
 create mode 100644 arch/powerpc/perf/hv-gpci-requests.h
 create mode 100644 arch/powerpc/perf/req-gen/_begin.h
 create mode 100644 arch/powerpc/perf/req-gen/_clear.h
 create mode 100644 arch/powerpc/perf/req-gen/_end.h
 create mode 100644 arch/powerpc/perf/req-gen/_request-begin.h
 create mode 100644 arch/powerpc/perf/req-gen/_request-end.h
 create mode 100644 arch/powerpc/perf/req-gen/perf.h

diff --git a/arch/powerpc/perf/hv-common.c b/arch/powerpc/perf/hv-common.c
index 47e02b3..7dce8f10 100644
--- a/arch/powerpc/perf/hv-common.c
+++ b/arch/powerpc/perf/hv-common.c
@@ -9,13 +9,13 @@ unsigned long hv_perf_caps_get(struct hv_perf_caps *caps)
unsigned long r;
struct p {
struct hv_get_perf_counter_info_params params;
-   struct cv_system_performance_capabilities caps;
+   struct hv_gpci_system_performance_capabilities caps;
} __packed __aligned(sizeof(uint64_t));
 
struct p arg = {
.params = {
.counter_request = cpu_to_be32(
-   CIR_SYSTEM_PERFORMANCE_CAPABILITIES),
+   HV_GPCI_system_performance_capabilities),
.starting_index = cpu_to_be32(-1),
.counter_info_version_in = 0,
}
@@ -31,9 +31,9 @@ unsigned long hv_perf_caps_get(struct hv_perf_caps *caps)
 
caps-version = arg.params.counter_info_version_out;
caps-collect_privileged = !!arg.caps.perf_collect_privileged;
-   caps-ga = !!(arg.caps.capability_mask  CV_CM_GA);
-   caps-expanded = !!(arg.caps.capability_mask  CV_CM_EXPANDED);
-   caps-lab = !!(arg.caps.capability_mask  CV_CM_LAB);
+   caps-ga = !!(arg.caps.capability_mask  HV_GPCI_CM_GA);
+   caps-expanded = !!(arg.caps.capability_mask  HV_GPCI_CM_EXPANDED);
+   caps-lab = !!(arg.caps.capability_mask  HV_GPCI_CM_LAB);
 
return r;
 }
diff --git a/arch/powerpc/perf/hv-gpci-requests.h 
b/arch/powerpc/perf/hv-gpci-requests.h
new file mode 100644
index 000..0dfc4d9
--- /dev/null
+++ b/arch/powerpc/perf/hv-gpci-requests.h
@@ -0,0 +1,79 @@
+
+#include req-gen/_begin.h
+
+/*
+ * Based on the document getPerfCountInfo v1.07
+ */
+
+/* this needs to be -1 encoded in hex suitable for parsing by tools/perf. */
+#define M1 0x
+
+/*
+ * #define REQUEST_NAME counter_request_name
+ * #define REQUEST_NUM r_num
+ * #define REQUEST_IDX_KIND starting_index_kind
+ * #include I(REQUEST_BEGIN)
+ * REQUEST(
+ * __field(...)
+ * __field(...)
+ * __array(...)
+ * __count(...)
+ * )
+ * #include I(REQUEST_END)
+ *
+ * - starting_index_kind is one of:
+ *   M1: must be -1
+ *   chip_id: hardware chip id or -1 for current hw chip
+ *   phys_processor_idx:
+ *
+ * __count(offset, bytes, name):
+ * a counter that should be exposed via perf
+ * __field(offset, bytes, name)
+ * a normal field
+ * __array(offset, bytes, name)
+ * an array of bytes
+ *
+ *
+ * @bytes for __count, and __field _must_ be a numeral token
+ * in decimal, not an expression and not in hex.
+ *
+ *
+ * TODO:
+ * - expose secondary index (if any counter ever uses it, only 0xA0
+ *   appears to use it right now, and it doesn't have any counters)
+ * - embed versioning info
+ * - include counter descriptions
+ */
+#define REQUEST_NAME dispatch_timebase_by_processor
+#define REQUEST_NUM 0x10
+#define REQUEST_IDX_KIND phys_processor_idx
+#include I(REQUEST_BEGIN)
+REQUEST(__count(0, 8,  processor_time_in_timebase_cycles)
+   __field(0x8,4,  hw_processor_id)
+   __field(0xC,2,  owning_part_id)
+   __field(0xE,1,  processor_state)
+   __field(0xF,1,  version)
+   

[git pull] Please pull powerpc.git merge branch

2014-05-27 Thread Benjamin Herrenschmidt
Hi Linus !

Here's a pair of powerpc fixes for 3.15 which are also going to stable.

One's a fix for building with newer binutils (the problem currently only
affects the BookE kernels but the affected macro might come back into
use on BookS platforms at any time). Unfortunately, the binutils maintainer
made a backward-incompatible change to a construct that we use, so we have
to add a Makefile check.

The other one is a fix for CPUs getting stuck in kexec when running single
threaded. Since we routinely use kexec on power (including in our newer
bootloaders), I deemed that important enough.

Cheers,
Ben.

The following changes since commit 8050936caf125fbe54111ba5e696b68a360556ba:

  powerpc: irq work racing with timer interrupt can result in timer interrupt 
hang (2014-05-12 14:29:28 +1000)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc.git merge

for you to fetch changes up to 011e4b02f1da156ac7fea28a9da878f3c23af739:

  powerpc, kexec: Fix Processor X is stuck issue during kexec from ST mode 
(2014-05-28 13:24:26 +1000)


Guenter Roeck (1):
  powerpc: Fix 64 bit builds with binutils 2.24

Srivatsa S. Bhat (1):
  powerpc, kexec: Fix Processor X is stuck issue during kexec from ST mode

 arch/powerpc/Makefile  | 4 +++-
 arch/powerpc/include/asm/ppc_asm.h | 7 ++-
 arch/powerpc/kernel/machine_kexec_64.c | 2 +-
 kernel/kexec.c | 8 
 4 files changed, 18 insertions(+), 3 deletions(-)


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: Kernel 3.15: Boot problems with a PA6T board

2014-05-27 Thread Michael Ellerman
On Wed, 2014-05-28 at 01:08 +0200, Christian Zigotzky wrote:
 Hi Michael,
 
 Thanks a lot for your answer.
 
 ...

 18a1a7a1d862ae0794a0179473d08a414dd49234 - It doesn't boot. Error messages: 
 Oops: Machine check, sig: 7 [#1] CPU: 1 PID: 1 Comm: swapper/0 not tainted
 d8ff9cdf68fd119d491f3de90e1a612afc2f3b2b - It boots. :-)
 0f5a869600141a0d5575e3190af01a050c081b07 - It boots. :-)
 c7e64b9ce04aa2e3fad7396d92b5cb92056d16ac - It boots. :-)
 d3e144532703fe2454b56eddb56f30d2d620187b - It boots. :-)
 
 I think the machine check is the problem.

Yes I think it is. Do you get any more info, or just that one line?

So I think the latest working commit we have is 
d8ff9cdf68fd119d491f3de90e1a612afc2f3b2b.

I'm going to guess that cd427485357c0c4b99f69719251baacf25946e11 is BAD. Can
you please confirm or deny that?

Assuming cd42748 is bad, you should do a git bisect between it and 18a1a7a.
That should be a fairly quick bisect. That would be:

$ git bisect start
$ git bisect good d8ff9cdf68fd119d491f3de90e1a612afc2f3b2b
$ git bisect bad  cd427485357c0c4b99f69719251baacf25946e11

If cd42748 is *good*, then you'll need to do a bigger bisect from d8ff9cd to
18a1a7a.


cheers


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 0/6] ppc, kvm, cpuidle: Allow offline and kvm standby threads to enter fastsleep

2014-05-27 Thread Preeti U Murthy
Fast sleep is a deep idle state on Power8. The support for the state was
added in commit 0d94873011. Today the idle threads in the host can
potentially be put to fast sleep. But when we launch guests using kvm,
the secondary threads are required to be offline and the offline threads
are put to nap. Besides this case, when secondary threads are woken up
to run guests and eventually go idle or when the guest is killed, they
enter nap. So when the entire core goes idle in both the above scenarios,
the maximum power savings that we can obtain is as much as we can get from
napping the cpus. This patchset adds support in the above two cases
for the threads to enter fast sleep.
---

Srivatsa S. Bhat (6):
  powernv, cpuidle: Move the flags used for idle state discovery to powernv 
core
  powerpc, powernv, CPU hotplug: Put offline CPUs in Fast-Sleep instead of 
Nap
  KVM: PPC: Book3S HV: Enable CPUs to run guest after waking up from 
fast-sleep
  KVM: PPC: Book3S HV: Consolidate the idle-state enter sequence in KVM
  KVM: PPC: Book3S HV: Put KVM standby hwthreads to fast-sleep instead of 
nap
  ppc,book3s: Go back to same idle state after handling machine check 
interrupt


 arch/powerpc/include/asm/processor.h|   12 
 arch/powerpc/kernel/exceptions-64s.S|   51 +--
 arch/powerpc/kernel/idle.c  |   52 
 arch/powerpc/kernel/idle_power7.S   |2 -
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |  103 ---
 arch/powerpc/platforms/powernv/smp.c|   12 +++-
 drivers/cpuidle/cpuidle-powernv.c   |7 +-
 7 files changed, 190 insertions(+), 49 deletions(-)

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 1/6] powernv, cpuidle: Move the flags used for idle state discovery to powernv core

2014-05-27 Thread Preeti U Murthy
From: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com

These flags will be used by the cpuidle driver as well as in the cpu
offline path. The offline cpus should be put to fastsleep if the idle state
is discovered so as to gain maximum power savings in the offline state.

Signed-off-by: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com
[ Changelog added by pre...@linux.vnet.ibm.com ]
Signed-off-by: Preeti U Murthy pre...@linux.vnet.ibm.com
---

 arch/powerpc/include/asm/processor.h |4 
 drivers/cpuidle/cpuidle-powernv.c|7 +++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index d660dc3..d922e5c 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -445,6 +445,10 @@ static inline unsigned long get_clean_sp(unsigned long sp, 
int is_32)
 }
 #endif
 
+/* Support for 'nap' and 'sleep' instructions, as discovered from the DT */
+#define IDLE_INST_NAP  0x0001 /* nap instruction can be used */
+#define IDLE_INST_SLEEP0x0002 /* sleep instruction can be used */
+
 extern unsigned long cpuidle_disable;
 enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
 
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 719f6fb..5d4f9e8 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -17,12 +17,11 @@
 #include asm/machdep.h
 #include asm/firmware.h
 #include asm/runlatch.h
+#include asm/processor.h
 
 /* Flags and constants used in PowerNV platform */
 
 #define MAX_POWERNV_IDLE_STATES8
-#define IDLE_USE_INST_NAP  0x0001 /* Use nap instruction */
-#define IDLE_USE_INST_SLEEP0x0002 /* Use sleep instruction */
 
 struct cpuidle_driver powernv_idle_driver = {
.name = powernv_idle,
@@ -187,7 +186,7 @@ static int powernv_add_idle_states(void)
 
for (i = 0; i  dt_idle_states; i++) {
 
-   if (flags[i]  IDLE_USE_INST_NAP) {
+   if (flags[i]  IDLE_INST_NAP) {
/* Add NAP state */
strcpy(powernv_states[nr_idle_states].name, Nap);
strcpy(powernv_states[nr_idle_states].desc, Nap);
@@ -198,7 +197,7 @@ static int powernv_add_idle_states(void)
nr_idle_states++;
}
 
-   if (flags[i]  IDLE_USE_INST_SLEEP) {
+   if (flags[i]  IDLE_INST_SLEEP) {
/* Add FASTSLEEP state */
strcpy(powernv_states[nr_idle_states].name, 
FastSleep);
strcpy(powernv_states[nr_idle_states].desc, 
FastSleep);

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 2/6] powerpc, powernv, CPU hotplug: Put offline CPUs in Fast-Sleep instead of Nap

2014-05-27 Thread Preeti U Murthy
From: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com

The offline cpus are put to fast sleep if the idle state is discovered in the
device tree. This is to gain maximum power savings in the offline state.

Signed-off-by: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com
[ Changelog added by pre...@linux.vnet.ibm.com ]
Signed-off-by: Preeti U Murthy pre...@linux.vnet.ibm.com
---

 arch/powerpc/include/asm/processor.h |8 +
 arch/powerpc/kernel/idle.c   |   52 ++
 arch/powerpc/platforms/powernv/smp.c |   12 +++-
 3 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index d922e5c..c5256db 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -449,6 +449,14 @@ static inline unsigned long get_clean_sp(unsigned long sp, 
int is_32)
 #define IDLE_INST_NAP  0x0001 /* nap instruction can be used */
 #define IDLE_INST_SLEEP0x0002 /* sleep instruction can be used */
 
+/* Flags to indicate which of the CPU idle states are available for use */
+
+#define IDLE_USE_NAP   (1UL  0)
+#define IDLE_USE_SLEEP (1UL  1)
+
+extern unsigned int supported_cpuidle_states;
+extern unsigned int pnv_get_supported_cpuidle_states(void);
+
 extern unsigned long cpuidle_disable;
 enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
 
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index d7216c9..e51d574 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -25,6 +25,7 @@
 #include linux/cpu.h
 #include linux/sysctl.h
 #include linux/tick.h
+#include linux/of.h
 
 #include asm/processor.h
 #include asm/cputable.h
@@ -32,6 +33,7 @@
 #include asm/machdep.h
 #include asm/runlatch.h
 #include asm/smp.h
+#include asm/firmware.h
 
 
 unsigned long cpuidle_disable = IDLE_NO_OVERRIDE;
@@ -79,6 +81,56 @@ void arch_cpu_idle(void)
ppc64_runlatch_on();
 }
 
+#ifdef CONFIG_PPC_POWERNV
+
+unsigned int supported_cpuidle_states = 0;
+
+unsigned int pnv_get_supported_cpuidle_states(void)
+{
+   return supported_cpuidle_states;
+}
+
+static int __init pnv_probe_idle_states(void)
+{
+   struct device_node *power_mgt;
+   struct property *prop;
+   int dt_idle_states;
+   u32 *flags;
+   int i;
+
+   if (!firmware_has_feature(FW_FEATURE_OPALv3))
+   return 0;
+
+   power_mgt = of_find_node_by_path(/ibm,opal/power-mgt);
+   if (!power_mgt) {
+   pr_warn(opal: PowerMgmt Node not found\n);
+   return 0;
+   }
+
+   prop = of_find_property(power_mgt, ibm,cpu-idle-state-flags, NULL);
+   if (!prop) {
+   pr_warn(DT-PowerMgmt: missing ibm,cpu-idle-state-flags\n);
+   return 0;
+   }
+
+   dt_idle_states = prop-length / sizeof(u32);
+   flags = (u32 *) prop-value;
+
+   for (i = 0; i  dt_idle_states; i++) {
+   if (flags[i]  IDLE_INST_NAP)
+   supported_cpuidle_states |= IDLE_USE_NAP;
+
+   if (flags[i]  IDLE_INST_SLEEP)
+   supported_cpuidle_states |= IDLE_USE_SLEEP;
+   }
+
+   return 0;
+}
+
+__initcall(pnv_probe_idle_states);
+#endif
+
+
 int powersave_nap;
 
 #ifdef CONFIG_SYSCTL
diff --git a/arch/powerpc/platforms/powernv/smp.c 
b/arch/powerpc/platforms/powernv/smp.c
index bf5fcd4..fc83006 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -31,6 +31,7 @@
 #include asm/xics.h
 #include asm/opal.h
 #include asm/runlatch.h
+#include asm/processor.h
 
 #include powernv.h
 
@@ -142,6 +143,7 @@ static int pnv_smp_cpu_disable(void)
 static void pnv_smp_cpu_kill_self(void)
 {
unsigned int cpu;
+   unsigned long idle_states;
 
/* Standard hot unplug procedure */
local_irq_disable();
@@ -152,13 +154,21 @@ static void pnv_smp_cpu_kill_self(void)
generic_set_cpu_dead(cpu);
smp_wmb();
 
+   idle_states = pnv_get_supported_cpuidle_states();
+
/* We don't want to take decrementer interrupts while we are offline,
 * so clear LPCR:PECE1. We keep PECE2 enabled.
 */
mtspr(SPRN_LPCR, mfspr(SPRN_LPCR)  ~(u64)LPCR_PECE1);
while (!generic_check_cpu_restart(cpu)) {
ppc64_runlatch_off();
-   power7_nap();
+
+   /* If sleep is supported, go to sleep, instead of nap */
+   if (idle_states  IDLE_USE_SLEEP)
+   power7_sleep();
+   else
+   power7_nap();
+
ppc64_runlatch_on();
if (!generic_check_cpu_restart(cpu)) {
DBG(CPU%d Unexpected exit while offline !\n, cpu);

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 3/6] KVM: PPC: Book3S HV: Enable CPUs to run guest after waking up from fast-sleep

2014-05-27 Thread Preeti U Murthy
From: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com

When guests have to be launched, the secondary threads which are offline
are woken up to run the guests. Today these threads wake up from nap
and check if they have to run guests. Now that the offline secondary threads
can go to fastsleep, add this check in the fastsleep wakeup path as well.

Signed-off-by: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com
[ Changelog added by pre...@linux.vnet.ibm.com ]
Signed-off-by: Preeti U Murthy pre...@linux.vnet.ibm.com
---

 arch/powerpc/kernel/exceptions-64s.S |   30 +++---
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 3afd391..b4bf464 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -100,6 +100,19 @@ system_reset_pSeries:
SET_SCRATCH0(r13)
 #ifdef CONFIG_PPC_P7_NAP
 BEGIN_FTR_SECTION
+
+   GET_PACA(r13)
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+   li  r0,KVM_HWTHREAD_IN_KERNEL
+   stb r0,HSTATE_HWTHREAD_STATE(r13)
+   /* Order setting hwthread_state vs. testing hwthread_req */
+   sync
+   lbz r0,HSTATE_HWTHREAD_REQ(r13)
+   cmpwi   r0,0
+   beq 1f
+   b   kvm_start_guest
+1:
+#endif
/* Running native on arch 2.06 or later, check if we are
 * waking up from nap. We only handle no state loss and
 * supervisor state loss. We do -not- handle hypervisor
@@ -116,28 +129,15 @@ BEGIN_FTR_SECTION
 * OPAL v3 based powernv platforms have new idle states
 * which fall in this catagory.
 */
-   bgt cr1,8f
GET_PACA(r13)
-
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-   li  r0,KVM_HWTHREAD_IN_KERNEL
-   stb r0,HSTATE_HWTHREAD_STATE(r13)
-   /* Order setting hwthread_state vs. testing hwthread_req */
-   sync
-   lbz r0,HSTATE_HWTHREAD_REQ(r13)
-   cmpwi   r0,0
-   beq 1f
-   b   kvm_start_guest
-1:
-#endif
+   bgt cr1,8f
 
beq cr1,2f
b   .power7_wakeup_noloss
 2: b   .power7_wakeup_loss
 
/* Fast Sleep wakeup on PowerNV */
-8: GET_PACA(r13)
-   b   .power7_wakeup_tb_loss
+8: b   .power7_wakeup_tb_loss
 
 9:
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 4/6] KVM: PPC: Book3S HV: Consolidate the idle-state enter sequence in KVM

2014-05-27 Thread Preeti U Murthy
From: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com

Now that the support for fast sleep idle state is present, the KVM
standby threads can be put to fast sleep when they are either idle
or do not have a guest to run. Today they enter nap in these scenarios.
The purpose is to gain maximum power savings in a KVM scenario as well
when an entire cpu core is idle.

As a precursor, consolidate the code common across all idle states.

Signed-off-by: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com
[ Changelog added by pre...@linux.vnet.ibm.com ]
Signed-off-by: Preeti U Murthy pre...@linux.vnet.ibm.com
---

 arch/powerpc/kvm/book3s_hv_rmhandlers.S |   30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index b031f93..43aa806 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -40,6 +40,17 @@
 #define NAPPING_CEDE   1
 #define NAPPING_NOVCPU 2
 
+#define IDLE_STATE_ENTER_SEQ_HV(IDLE_INST) \
+   /* Magic NAP/SLEEP/WINKLE mode enter sequence */\
+   std r0, HSTATE_SCRATCH0(r13);   \
+   ptesync;\
+   ld  r0, HSTATE_SCRATCH0(r13);   \
+1: cmpdr0, r0; \
+   bne 1b; \
+   IDLE_INST;  \
+   b   .
+
+
 /*
  * Call kvmppc_hv_entry in real mode.
  * Must be called with interrupts hard-disabled.
@@ -325,13 +336,9 @@ kvm_do_nap:
rlwimi  r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
mtspr   SPRN_LPCR, r4
isync
-   std r0, HSTATE_SCRATCH0(r13)
-   ptesync
-   ld  r0, HSTATE_SCRATCH0(r13)
-1: cmpdr0, r0
-   bne 1b
-   nap
-   b   .
+   IDLE_STATE_ENTER_SEQ_HV(PPC_NAP)
+   /* No return */
+
 
 /**
  **
@@ -2027,13 +2034,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr   SPRN_LPCR,r5
isync
li  r0, 0
-   std r0, HSTATE_SCRATCH0(r13)
-   ptesync
-   ld  r0, HSTATE_SCRATCH0(r13)
-1: cmpdr0, r0
-   bne 1b
-   nap
-   b   .
+   IDLE_STATE_ENTER_SEQ_HV(PPC_NAP)
+   /* No return */
 
 33:mr  r4, r3
li  r3, 0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 5/6] KVM: PPC: Book3S HV: Put KVM standby hwthreads to fast-sleep instead of nap

2014-05-27 Thread Preeti U Murthy
From: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com

Now that the support for fast sleep idle state is present, allow
the KVM standby threads to go to fast sleep if the platform supports
it. This will fetch us maximum power savings if an entire core is idle.

Signed-off-by: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com
[ Changelog added by pre...@linux.vnet.ibm.com ]
Signed-off-by: Preeti U Murthy pre...@linux.vnet.ibm.com
---

 arch/powerpc/kvm/book3s_hv_rmhandlers.S |   73 ---
 1 file changed, 65 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 43aa806..69244cc 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -207,7 +207,7 @@ kvmppc_primary_no_guest:
li  r3, 1
stb r3, HSTATE_HWTHREAD_REQ(r13)
 
-   b   kvm_do_nap
+   b   kvm_do_idle
 
 kvm_novcpu_wakeup:
ld  r1, HSTATE_HOST_R1(r13)
@@ -247,7 +247,7 @@ kvm_novcpu_exit:
b   hdec_soon
 
 /*
- * We come in here when wakened from nap mode.
+ * We come in here when wakened from nap or fast-sleep mode.
  * Relocation is off and most register values are lost.
  * r13 points to the PACA.
  */
@@ -303,7 +303,7 @@ kvm_start_guest:
 
bl  kvmppc_hv_entry
 
-   /* Back from the guest, go back to nap */
+   /* Back from the guest, go back to nap or fastsleep */
/* Clear our vcpu pointer so we don't come back in early */
li  r0, 0
std r0, HSTATE_KVM_VCPU(r13)
@@ -314,7 +314,7 @@ kvm_start_guest:
 */
lwsync
 
-   /* increment the nap count and then go to nap mode */
+   /* increment the nap count and then go to nap or fast-sleep mode */
ld  r4, HSTATE_KVM_VCORE(r13)
addir4, r4, VCORE_NAP_COUNT
 51:lwarx   r3, 0, r4
@@ -325,6 +325,24 @@ kvm_start_guest:
 kvm_no_guest:
li  r0, KVM_HWTHREAD_IN_NAP
stb r0, HSTATE_HWTHREAD_STATE(r13)
+
+kvm_do_idle:
+   /*
+* if (supported_cpuidle_states  IDLE_USE_SLEEP)
+*  kvm_do_fastsleep();
+* else
+*  kvm_do_nap();
+*/
+   LOAD_REG_ADDRBASE(r3,supported_cpuidle_states)
+   lwz r4,ADDROFF(supported_cpuidle_states)(r3)
+   /*
+* andi. r4,r4,IDLE_USE_SLEEP. Replacing IDLE_USE_SLEEP
+* with the immediate value since it is a 32 bit instruction
+* and the operand needs to fit into this.
+*/
+   andi.   r4,r4,2
+   bne kvm_do_fastsleep
+
 kvm_do_nap:
/* Clear the runlatch bit before napping */
mfspr   r2, SPRN_CTRLF
@@ -339,6 +357,18 @@ kvm_do_nap:
IDLE_STATE_ENTER_SEQ_HV(PPC_NAP)
/* No return */
 
+kvm_do_fastsleep:
+   li  r3, LPCR_PECE0
+   mfspr   r4, SPRN_LPCR
+   /* Don't set LPCR_PECE1 since we want to wakeup only on an external
+* interrupt, and not on a decrementer interrupt.
+*/
+   rlwimi  r4, r3, 0, LPCR_PECE0
+   mtspr   SPRN_LPCR, r4
+   isync
+   IDLE_STATE_ENTER_SEQ_HV(PPC_SLEEP)
+   /* No return */
+
 
 /**
  **
@@ -2016,8 +2046,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
bl  kvmppc_save_fp
 
/*
-* Take a nap until a decrementer or external or doobell interrupt
-* occurs, with PECE1, PECE0 and PECEDP set in LPCR. Also clear the
+* Go to fastsleep until an external or doobell interrupt
+* occurs, with PECE0 and PECEDP set in LPCR. Also clear the
 * runlatch bit before napping.
 */
mfspr   r2, SPRN_CTRLF
@@ -2026,6 +2056,22 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
 
li  r0,1
stb r0,HSTATE_HWTHREAD_REQ(r13)
+   /*
+* if (supported_cpuidle_states  IDLE_USE_SLEEP)
+*  PPC_SLEEP;
+* else
+*  PPC_NAP;
+*/
+   LOAD_REG_ADDRBASE(r3,supported_cpuidle_states)
+   lwz r4,ADDROFF(supported_cpuidle_states)(r3)
+   /*
+* andi. r4,r4,IDLE_USE_SLEEP. Replacing IDLE_USE_SLEEP
+* with the immediate value since it is a 32 bit instruction
+* and the operand needs to fit into this.
+*/
+   andi.   r4,r4,2
+   bne 35f
+
mfspr   r5,SPRN_LPCR
ori r5,r5,LPCR_PECE0 | LPCR_PECE1
 BEGIN_FTR_SECTION
@@ -2037,6 +2083,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
IDLE_STATE_ENTER_SEQ_HV(PPC_NAP)
/* No return */
 
+35:mfspr   r5,SPRN_LPCR
+   ori r5,r5,LPCR_PECE0
+BEGIN_FTR_SECTION
+   orisr5,r5,LPCR_PECEDP@h
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+   mtspr   SPRN_LPCR,r5
+   isync
+   li  r0, 0
+   

[PATCH 6/6] ppc, book3s: Go back to same idle state after handling machine check interrupt

2014-05-27 Thread Preeti U Murthy
From: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com

Now that the support for fast sleep is present, threads could have woken up
from fast sleep on getting a machine check interrupt. Hence add code to allow
threads to go back to the idle state they woke up from after handling the
interrupt. Today they go back to nap by default.

Signed-off-by: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com
[ Changelog added by pre...@linux.vnet.ibm.com ]
Signed-off-by: Preeti U Murthy pre...@linux.vnet.ibm.com
---

 arch/powerpc/kernel/exceptions-64s.S |   21 +++--
 arch/powerpc/kernel/idle_power7.S|2 +-
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index b4bf464..94cee3c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1396,15 +1396,16 @@ machine_check_handle_early:
 * of the following is true:
 * a. thread wasn't in power saving mode
 * b. thread was in power saving mode with no state loss or
-*supervisor state loss
+*supervisor state loss or hypervisor state loss (fastsleep)
 *
-* Go back to nap again if (b) is true.
+* Go back to nap or fastsleep again if (b) is true.
 */
rlwinm. r11,r12,47-31,30,31 /* Was it in power saving mode? */
beq 4f  /* No, it wasn;t */
-   /* Thread was in power saving mode. Go back to nap again. */
-   cmpwi   r11,2
-   bne 3f
+   /* Thread was in power saving mode. Go back to the same state again. */
+   cmpwi   cr1,r11,2
+   blt cr1,3f
+7:
/* Supervisor state loss */
li  r0,1
stb r0,PACA_NAPSTATELOST(r13)
@@ -1412,7 +1413,15 @@ machine_check_handle_early:
MACHINE_CHECK_HANDLER_WINDUP
GET_PACA(r13)
ld  r1,PACAR1(r13)
-   b   .power7_enter_nap_mode
+   /* We need to pass the idle state in r3: 0 - nap, 1 - sleep */
+   bgt cr1,8f
+   li  r3,0
+   b   .power7_enter_idle
+   /* No return */
+
+8: li  r3,1 /* Pass 1 in r3 to request sleep in power7_enter_idle */
+   b   .power7_enter_idle
+   /* No return */
 4:
 #endif
/*
diff --git a/arch/powerpc/kernel/idle_power7.S 
b/arch/powerpc/kernel/idle_power7.S
index c3ab869..e13e21b 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -95,7 +95,7 @@ _GLOBAL(power7_powersave_common)
std r9,_MSR(r1)
std r1,PACAR1(r13)
 
-_GLOBAL(power7_enter_nap_mode)
+_GLOBAL(power7_enter_idle)
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
/* Tell KVM we're napping */
li  r4,KVM_HWTHREAD_IN_NAP

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev