date:20150512

Re: [PATCH V3 00/13] POWER DSCR fixes, improvements, docs and tests

2015-05-12 Thread Anshuman Khandual

On 04/10/2015 01:59 PM, Anshuman Khandual wrote:
   This patch series has patches for POWER DSCR fixes, improvements,
 in code documentaion, kernel support user documentation and selftest based
 test cases. It has got five test cases which are derived from Anton's DSCR
 test bucket which can be listed as follows.
 
 (1) http://ozlabs.org/~anton/junkcode/dscr_default_test.c
 (2) http://ozlabs.org/~anton/junkcode/dscr_explicit_test.c
 (3) http://ozlabs.org/~anton/junkcode/dscr_inherit_exec_test.c
 (4) http://ozlabs.org/~anton/junkcode/dscr_inherit_test.c
 (5) http://ozlabs.org/~anton/junkcode/user_dscr_test.c
 
 So the derivied test cases have Anton's copyright along with mine but the
 commit message as of now has only my signed-off-by statement. As Anton
 mentioned before he would put his signed-off-by after reviewing these
 modified test cases.
 
 NOTE1: Anton's original inherit exec test expected the child to have system
 default DSCR value instead of the inherited DSCR value from it's parent.
 But looks like thats not the case when we execute the test, it always
 inherits it's parent's DSCR value over the exec call as well. So I have
 changed the test program assuming its correct to have the inherited DSCR
 value in the fork/execed child program. Please let me know if thats not
 correct and I am missing something there.
 
 Changes in V3:
 -
 - Minor changes to last couple of sysfs test cases
 - Added .gitignore file for the new test directory

Hey Michael/Ben/Mikey,

Do you have any thoughts or updates on this patch series. Its already
been a month for this latest version to be on the list.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH V4 3/3] mm: Clarify that the function operateds on hugepage pte

2015-05-12 Thread Aneesh Kumar K.V

We have confusing functions to clear pmd, pmd_clear_* and pmd_clear.
Add _huge_ to pmdp_clear functions so that we are clear that they
operate on hugepage pte.

We don't bother about other functions like pmdp_set_wrprotect,
pmdp_clear_flush_young, because they operate on PTE bits and hence
indicate they are operating on hugepage ptes

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
NOTE: Only tested on powerpc

 arch/mips/include/asm/pgtable.h  |  8 
 arch/powerpc/include/asm/pgtable-ppc64.h |  6 +++---
 arch/powerpc/mm/pgtable_64.c |  4 ++--
 arch/s390/include/asm/pgtable.h  | 22 +++---
 arch/sparc/include/asm/pgtable_64.h  |  8 
 arch/tile/include/asm/pgtable.h  |  8 
 arch/x86/include/asm/pgtable.h   |  4 ++--
 include/asm-generic/pgtable.h| 24 ++--
 include/linux/mmu_notifier.h | 12 ++--
 mm/huge_memory.c | 16 
 mm/migrate.c |  2 +-
 mm/pgtable-generic.c |  8 
 mm/rmap.c|  2 +-
 13 files changed, 64 insertions(+), 60 deletions(-)

diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 819af9d057a8..9d8106758142 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -568,12 +568,12 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 }
 
 /*
- * The generic version pmdp_get_and_clear uses a version of pmd_clear() with a
+ * The generic version pmdp_huge_get_and_clear uses a version of pmd_clear() 
with a
  * different prototype.
  */
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-  unsigned long address, pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+   unsigned long address, pmd_t *pmdp)
 {
pmd_t old = *pmdp;
 
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index 3d7e898d6b5e..88e916b507c2 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -553,9 +553,9 @@ extern int pmdp_test_and_clear_young(struct vm_area_struct 
*vma,
 extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
  unsigned long address, pmd_t *pmdp);
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-extern pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-   unsigned long addr, pmd_t *pmdp);
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+unsigned long addr, pmd_t *pmdp);
 
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT
 static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index f7775193d745..876232d64126 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -812,8 +812,8 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, 
unsigned long addr,
return;
 }
 
-pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-unsigned long addr, pmd_t *pmdp)
+pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
 {
pmd_t old_pmd;
pgtable_t pgtable;
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index fc642399b489..4565b33ed94b 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1498,9 +1498,9 @@ static inline int pmdp_test_and_clear_young(struct 
vm_area_struct *vma,
return pmd_young(pmd);
 }
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-  unsigned long address, pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+   unsigned long address, pmd_t *pmdp)
 {
pmd_t pmd = *pmdp;
 
@@ -1509,10 +1509,10 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct 
*mm,
return pmd;
 }
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR_FULL
-static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm,
-   unsigned long address,
-   pmd_t *pmdp, int full)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
+static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
+unsigned long address,
+pmd_t *pmdp, int full)
 {
pmd_t pmd = *pmdp;
 
@@ -1522,11 +1522,11

[PATCH V4 2/3] powerpc/mm: Use generic version of pmdp_clear_flush

2015-05-12 Thread Aneesh Kumar K.V

Also move the pmd_trans_huge check to generic code.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pgtable-ppc64.h |  4 
 arch/powerpc/mm/pgtable_64.c | 11 ---
 mm/pgtable-generic.c |  1 +
 3 files changed, 1 insertion(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index b8d99227a0ac..3d7e898d6b5e 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -557,10 +557,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct 
*vma,
 extern pmd_t pmdp_get_and_clear(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp);
 
-#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
-extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long 
address,
- pmd_t *pmdp);
-
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT
 static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
  pmd_t *pmdp)
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 049d961802aa..f7775193d745 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -554,17 +554,6 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, 
unsigned long addr,
return old;
 }
 
-pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
-  pmd_t *pmdp)
-{
-   pmd_t pmd;
-
-   VM_BUG_ON(address  ~HPAGE_PMD_MASK);
-   VM_BUG_ON(!pmd_trans_huge(*pmdp));
-   pmd = pmdp_get_and_clear(vma-vm_mm, address, pmdp);
-   return pmd;
-}
-
 pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
  pmd_t *pmdp)
 {
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index c25f94b33811..dd9d04f17749 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -126,6 +126,7 @@ pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned 
long address,
 {
pmd_t pmd;
VM_BUG_ON(address  ~HPAGE_PMD_MASK);
+   VM_BUG_ON(!pmd_trans_huge(*pmdp));
pmd = pmdp_get_and_clear(vma-vm_mm, address, pmdp);
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
-- 
2.1.4

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH V4 1/3] mm/thp: Split out pmd collpase flush into a separate functions

2015-05-12 Thread Aneesh Kumar K.V

Architectures like ppc64 [1] need to do special things while clearing
pmd before a collapse. For them this operation is largely different
from a normal hugepage pte clear. Hence add a separate function
to clear pmd before collapse. After this patch pmdp_* functions
operate only on hugepage pte, and not on regular pmd_t values
pointing to page table.

[1] ppc64 needs to invalidate all the normal page pte mappings we
already have inserted in the hardware hash page table. But before
doing that we need to make sure there are no parallel hash page
table insert going on. So we need to do a kick_all_cpus_sync()
before flushing the older hash table entries. By moving this to
a separate function we capture these details and mention how it
is different from a hugepage pte clear.

This patch is a cleanup and only does code movement for clarity.
There should not be any change in functionality.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pgtable-ppc64.h |  4 ++
 arch/powerpc/mm/pgtable_64.c | 76 +---
 include/asm-generic/pgtable.h| 19 
 mm/huge_memory.c |  2 +-
 4 files changed, 65 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index 43e6ad424c7f..b8d99227a0ac 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -576,6 +576,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct 
*mm, unsigned long addr,
 extern void pmdp_splitting_flush(struct vm_area_struct *vma,
 unsigned long address, pmd_t *pmdp);
 
+#define pmdp_collapse_flush pmdp_collapse_flush
+extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+unsigned long address, pmd_t *pmdp);
+
 #define __HAVE_ARCH_PGTABLE_DEPOSIT
 extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
   pgtable_t pgtable);
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 6bfadf1aa5cb..049d961802aa 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -560,41 +560,47 @@ pmd_t pmdp_clear_flush(struct vm_area_struct *vma, 
unsigned long address,
pmd_t pmd;
 
VM_BUG_ON(address  ~HPAGE_PMD_MASK);
-   if (pmd_trans_huge(*pmdp)) {
-   pmd = pmdp_get_and_clear(vma-vm_mm, address, pmdp);
-   } else {
-   /*
-* khugepaged calls this for normal pmd
-*/
-   pmd = *pmdp;
-   pmd_clear(pmdp);
-   /*
-* Wait for all pending hash_page to finish. This is needed
-* in case of subpage collapse. When we collapse normal pages
-* to hugepage, we first clear the pmd, then invalidate all
-* the PTE entries. The assumption here is that any low level
-* page fault will see a none pmd and take the slow path that
-* will wait on mmap_sem. But we could very well be in a
-* hash_page with local ptep pointer value. Such a hash page
-* can result in adding new HPTE entries for normal subpages.
-* That means we could be modifying the page content as we
-* copy them to a huge page. So wait for parallel hash_page
-* to finish before invalidating HPTE entries. We can do this
-* by sending an IPI to all the cpus and executing a dummy
-* function there.
-*/
-   kick_all_cpus_sync();
-   /*
-* Now invalidate the hpte entries in the range
-* covered by pmd. This make sure we take a
-* fault and will find the pmd as none, which will
-* result in a major fault which takes mmap_sem and
-* hence wait for collapse to complete. Without this
-* the __collapse_huge_page_copy can result in copying
-* the old content.
-*/
-   flush_tlb_pmd_range(vma-vm_mm, pmd, address);
-   }
+   VM_BUG_ON(!pmd_trans_huge(*pmdp));
+   pmd = pmdp_get_and_clear(vma-vm_mm, address, pmdp);
+   return pmd;
+}
+
+pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+   pmd_t pmd;
+
+   VM_BUG_ON(address  ~HPAGE_PMD_MASK);
+   VM_BUG_ON(pmd_trans_huge(*pmdp));
+
+   pmd = *pmdp;
+   pmd_clear(pmdp);
+   /*
+* Wait for all pending hash_page to finish. This is needed
+* in case of subpage collapse. When we collapse normal pages
+* to hugepage, we first clear the pmd, then invalidate all
+* the PTE entries. The assumption here is that any low level
+* page fault will see

Re: [PATCH V3 2/9] powerpc/pci_dn: cache vf_index in pci_dn

2015-05-12 Thread Gavin Shan

On Mon, May 11, 2015 at 01:54:12PM +0800, Wei Yang wrote:
On Mon, May 11, 2015 at 12:21:04PM +1000, Gavin Shan wrote:
On Mon, May 04, 2015 at 03:07:31PM +0800, Wei Yang wrote:
This patch caches the index of a VF in its PF in pci_dn.


At least you can mention the purpose of vf_index to make the commit log
complete. The following message looks better?

The patch caches the VF index in pci_dn, which can be used to calculate
VF's bus, device and function number. Those information helps to locate
the VF's PCI device instance when doing hotplug during EEH recovery if
necessary.


Thanks, looks better. I added it in the log.

Signed-off-by: Wei Yang weiy...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pci-bridge.h |1 +
 arch/powerpc/kernel/pci_dn.c  |5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 1811c44..9582aa2 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -199,6 +199,7 @@ struct pci_dn {
 #ifdef CONFIG_PCI_IOV
 u16 vfs_expanded;   /* number of VFs IOV BAR expanded */
 u16 num_vfs;/* number of VFs enabled*/
+int vf_index;   /* Index to PF for VF dev */
^^^

  /* VF index in the PF */

Ok, changed in the code.


And I believe it can be unsigned int, or u16. We should have
non-negative vf_index, no?

Take a look in the virtfn_add(), the index in drivers/pci/iov.c is int. So I
copy that.


 int offset; /* PE# for the first VF PE */
 #define M64_PER_IOV 4
 int m64_per_iov;
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index b3b4df9..bf0fb873 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -138,7 +138,7 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev)

 #ifdef CONFIG_PCI_IOV
 static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
-   struct pci_dev *pdev,
+   struct pci_dev *pdev, int vf_index,

 struct pci_dev *pdev,
 int vf_index;

Some reason for this comment?

That does not exceed 80 characters.


No, it doesn't exceed 80 characters as you said. You take one of the following
formats, not the one you're using:

add_one_dev_pci_data(foo1, foo2,  add_one_dev_pci_data(foo1,
 foo3, foo4,   foo2,
 foo5, foo6);:
   foo6);

int busno, int devfn)
 {
 struct pci_dn *pdn;
@@ -157,6 +157,7 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn 
*parent,
 pdn-parent = parent;
 pdn-busno = busno;
 pdn-devfn = devfn;
+pdn-vf_index = vf_index;
 #ifdef CONFIG_PPC_POWERNV
 pdn-pe_number = IODA_INVALID_PE;
 #endif
@@ -196,7 +197,7 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
 return NULL;

 for (i = 0; i  pci_sriov_get_totalvfs(pdev); i++) {
-pdn = add_one_dev_pci_data(parent, NULL,
+pdn = add_one_dev_pci_data(parent, NULL, i,
pci_iov_virtfn_bus(pdev, i),
pci_iov_virtfn_devfn(pdev, i));
 if (!pdn) {

Thanks,
Gavin

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH V3 5/9] powerpc/eeh: create EEH_PE_VF for VF PE

2015-05-12 Thread Gavin Shan

On Mon, May 11, 2015 at 02:25:49PM +0800, Wei Yang wrote:
On Mon, May 11, 2015 at 12:37:07PM +1000, Gavin Shan wrote:
On Mon, May 04, 2015 at 03:07:34PM +0800, Wei Yang wrote:

Please reorder PATCH[6] with this one because the EEH device is expected
to be created before EEH PE.

That's a good idea.


On powernv platform, VF PE is a special PE which is different from the Bus
PE.  On the EEH side, it needs a corresponding concept to handle the VF PE
properly. For example, we need to create VF PE when VF's pci_dev is
initialized in kernel. And add a flag to mark it is a VF PF.
   ^


From above commit log, my understanding is that you're adding a flag to
identify VF PE, which is handled differently from bus PE. You missed the
details on the difference between them and the speical treament to VF PE.
Could you help add those information in the commit log to make it looks
complete?


This patch just introduce the VF PE. For those differences, we have another
patch handle VF PE properly to cover. In the log of that patch, I listed
those differences. Do you think this is fine?


It's fine to me.

This patch introduces the EEH_PE_VF type for VF PE and creates it for a VF.
At the mean time, it creates the sysfs and address cache for VF PE.
   

   it creates the sysfs and address cache for VF PE at PCI
device final fixup time.


Signed-off-by: Wei Yang weiy...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/eeh.h   |1 +
 arch/powerpc/kernel/eeh_pe.c |   12 ++--
 arch/powerpc/platforms/powernv/eeh-powernv.c |   12 
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index a52db28..56e8cd9 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -70,6 +70,7 @@ struct pci_dn;
 #define EEH_PE_PHB  (1  1)/* PHB PE*/
 #define EEH_PE_DEVICE   (1  2)/* Device PE */
 #define EEH_PE_BUS  (1  3)/* Bus PE*/
+#define EEH_PE_VF   (1  4)/* VF PE */

 #define EEH_PE_ISOLATED (1  0)/* Isolated PE  
 */
 #define EEH_PE_RECOVERING   (1  1)/* Recovering PE*/
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 35f0b62..edfe63a 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -299,7 +299,12 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev 
*edev)
  * EEH device already having associated PE, but
  * the direct parent EEH device doesn't have yet.
  */
-pdn = pdn ? pdn-parent : NULL;
+#ifdef CONFIG_PCI_IOV
+if (edev-mode  EEH_DEV_VF)
+pdn = pci_get_pdn(edev-physfn);
+else
+#endif
+pdn = pdn ? pdn-parent : NULL;

[A]

 while (pdn) {
 /* We're poking out of PCI territory */
 parent = pdn_to_eeh_dev(pdn);
@@ -382,7 +387,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 }

 /* Create a new EEH PE */
-pe = eeh_pe_alloc(edev-phb, EEH_PE_DEVICE);
+if (edev-mode  EEH_DEV_VF)
+pe = eeh_pe_alloc(edev-phb, EEH_PE_VF);
+else
+pe = eeh_pe_alloc(edev-phb, EEH_PE_DEVICE);

You don't have CONFIG_PCI_IOV here to protect the code, but you had
that at [A]. In order to keep the code look consistent, you either
add it or remove it for all places. I prefer to remove it, which
we don't need CONFIG_PCI_IOV.


Ok, that's fine to remove it.

BTW, if remove the CONFIG_PCI_IOV, we need to remove it around the physfn in
eeh_dev definition. That's fine?


It's fine to me.

 if (!pe) {
 pr_err(%s: out of memory!\n, __func__);
 return -ENOMEM;
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 622f08c..5447481 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1540,3 +1540,15 @@ static int __init eeh_powernv_init(void)
 return ret;
 }
 machine_early_initcall(powernv, eeh_powernv_init);
+
+#ifdef CONFIG_PCI_IOV
+static void pnv_pci_fixup_vf_eeh(struct pci_dev *pdev)
+{

Please rename it to pnv_eeh_vf_final_fixup(). Names of all functions
in this file expect prefix pnv_eeh_. With _final_, it's clearly
to tell it's called on PCI device final fixup time.


ok

+/* sysfs files should only be added after devices are added */

It's nice to explain why here: sysfs for the PCI device isn't populated
and the MMIO resource isn't finalized for the PCI device yet.


Don't get your point.

sysfs of the PCI device is populated at this point.


You have two operations here: (A) add the PCI device to EEH address cache;
(B) add EEH related sysfs entries. (A) requires that the resources (MMIO
on PHB3) of the VF is finalized. (B) requires VF's sysfs entries have been

Re: [PATCH V3 8/9] powerpc/powernv: Support PCI config restore for VFs

2015-05-12 Thread Gavin Shan

On Tue, May 12, 2015 at 09:31:34AM +0800, Wei Yang wrote:
On Mon, May 11, 2015 at 02:22:38PM +1000, Gavin Shan wrote:
On Mon, May 04, 2015 at 03:07:37PM +0800, Wei Yang wrote:
Since FW is not aware of VFs, the restore action for VF should be done in
   ^^
   skiboot firmware
kernel.

This patch introduces pnv_eeh_vf_restore_config() for VF.


Would it be better?

The patch introduces function pnv_eeh_vf_restore_config() to restore PCI
config space for VFs after reset.


Ok.

Also, the function name would be better with pnv_eeh_restore_vf_config()?

Ok.


Signed-off-by: Wei Yang weiy...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pci-bridge.h|1 +
 arch/powerpc/platforms/powernv/eeh-powernv.c |   77 
 +-
 2 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 9582aa2..de55ef6 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -205,6 +205,7 @@ struct pci_dn {
 int m64_per_iov;
 #define IODA_INVALID_M64(-1)
 int m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
+int mps;
 #endif /* CONFIG_PCI_IOV */
 #endif
 struct list_head child_list;
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 1ad322f..6ba6d87 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1589,6 +1589,59 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 return ret;
 }

+#ifdef CONFIG_PCI_IOV
+static int pnv_eeh_vf_restore_config(struct pci_dn *pdn)
+{
+int pcie_cap, aer_cap, old_mps;
+u32 devctl, cmd, cap2, aer_capctl;
+
+/* Restore MPS */
+pcie_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_EXP);
+if (pcie_cap) {
+old_mps = (ffs(pdn-mps) - 8)  5;
+pnv_pci_cfg_read(pdn, pcie_cap + PCI_EXP_DEVCTL, 2, devctl);
+devctl = ~PCI_EXP_DEVCTL_PAYLOAD;
+devctl |= old_mps;
+pnv_pci_cfg_write(pdn, pcie_cap + PCI_EXP_DEVCTL, 2, devctl);
+}
+

hrm, You can't use pnv_pci_cfg_{read,write} here. Instead, you should use
eeh_ops-{read,write}_config. By design, the PCI config accessors have been
classified to 2 classes: one is used for pci_config_{read,write}_* and another
one is eeh_ops-{read,write}. From EEH perspective, the former isn't 
controlled
strictly, but the later one is under control completely. Not controlled here
means the kernel can't determine when the PCI config is accessed, e.g. PCI
config accesses from user land.


Reasonable, will change it.

+/* Disable Completion Timeout */
+if (pcie_cap) {
+pnv_pci_cfg_read(pdn, pcie_cap + PCI_EXP_DEVCAP2, 4, cap2);
+if (cap2  0x10) {
+pnv_pci_cfg_read(pdn, pcie_cap + PCI_EXP_DEVCTL2, 4, 
cap2);
+cap2 |= 0x10;
+pnv_pci_cfg_write(pdn, pcie_cap + PCI_EXP_DEVCTL2, 4, 
cap2);
+}
+}
+
+/* Enable SERR and parity checking */
+pnv_pci_cfg_read(pdn, PCI_COMMAND, 2, cmd);
+cmd |= (PCI_COMMAND_PARITY | PCI_COMMAND_SERR);
+pnv_pci_cfg_write(pdn, PCI_COMMAND, 2, cmd);
+
+/* Enable report various errors */
+if (pcie_cap) {
+pnv_pci_cfg_read(pdn, pcie_cap + PCI_EXP_DEVCTL, 2, devctl);
+devctl = ~PCI_EXP_DEVCTL_CERE;
+devctl |= (PCI_EXP_DEVCTL_NFERE |
+   PCI_EXP_DEVCTL_FERE |
+   PCI_EXP_DEVCTL_URRE);
+pnv_pci_cfg_write(pdn, pcie_cap + PCI_EXP_DEVCTL, 2, devctl);
+}
+
+/* Enable ECRC generation and check */
+if (pcie_cap) {
+aer_cap = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR);
+pnv_pci_cfg_read(pdn, aer_cap + PCI_ERR_CAP, 4, aer_capctl);
+aer_capctl |= (PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE);
+pnv_pci_cfg_write(pdn, aer_cap + PCI_ERR_CAP, 4, aer_capctl);
+}
+
+return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+

The code is copied over from skiboot firmware. I still dislike the fact that
we have to maintain two sets of similar functions in skiboot/kernel. I still
believe the way I suggested can help: the firmware exports the error routing
rules and kernel has support it based on the rules. With it, the skiboot is
the source of the information to avoid mismatching between kernel/firmware.

Yes, it looks we have duplicate code in kernel and skiboot.

As you suggest, if we export some bit map from device node, we still have the
real logic in kernel, until we remove that part in skiboot.

By removing that part in skiboot, we may have some compatibility problem. For
example, an old kernel may not run on the new version of skiboot.


It's fine to keep two set code which bear with same rule, which is exported
from skiboot. In that case, the rule is the only thing we have to care. We
don't need care the code any more to avoid

Re: [PATCH V3 8/9] powerpc/powernv: Support PCI config restore for VFs

2015-05-12 Thread Wei Yang

On Tue, May 12, 2015 at 04:34:03PM +1000, Gavin Shan wrote:

+   /* Disable Completion Timeout */
+   if (pcie_cap) {
+   pnv_pci_cfg_read(pdn, pcie_cap + PCI_EXP_DEVCAP2, 4, cap2);
+   if (cap2  0x10) {
+   pnv_pci_cfg_read(pdn, pcie_cap + PCI_EXP_DEVCTL2, 4, 
cap2);
+   cap2 |= 0x10;
+   pnv_pci_cfg_write(pdn, pcie_cap + PCI_EXP_DEVCTL2, 4, 
cap2);
+   }
+   }
+
+   /* Enable SERR and parity checking */
+   pnv_pci_cfg_read(pdn, PCI_COMMAND, 2, cmd);
+   cmd |= (PCI_COMMAND_PARITY | PCI_COMMAND_SERR);
+   pnv_pci_cfg_write(pdn, PCI_COMMAND, 2, cmd);
+
+   /* Enable report various errors */
+   if (pcie_cap) {
+   pnv_pci_cfg_read(pdn, pcie_cap + PCI_EXP_DEVCTL, 2, devctl);
+   devctl = ~PCI_EXP_DEVCTL_CERE;
+   devctl |= (PCI_EXP_DEVCTL_NFERE |
+  PCI_EXP_DEVCTL_FERE |
+  PCI_EXP_DEVCTL_URRE);
+   pnv_pci_cfg_write(pdn, pcie_cap + PCI_EXP_DEVCTL, 2, devctl);
+   }
+
+   /* Enable ECRC generation and check */
+   if (pcie_cap) {
+   aer_cap = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR);
+   pnv_pci_cfg_read(pdn, aer_cap + PCI_ERR_CAP, 4, aer_capctl);
+   aer_capctl |= (PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE);
+   pnv_pci_cfg_write(pdn, aer_cap + PCI_ERR_CAP, 4, aer_capctl);
+   }
+
+   return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+

The code is copied over from skiboot firmware. I still dislike the fact that
we have to maintain two sets of similar functions in skiboot/kernel. I still
believe the way I suggested can help: the firmware exports the error routing
rules and kernel has support it based on the rules. With it, the skiboot is
the source of the information to avoid mismatching between kernel/firmware.

Yes, it looks we have duplicate code in kernel and skiboot.

As you suggest, if we export some bit map from device node, we still have the
real logic in kernel, until we remove that part in skiboot.

By removing that part in skiboot, we may have some compatibility problem. For
example, an old kernel may not run on the new version of skiboot.


It's fine to keep two set code which bear with same rule, which is exported
from skiboot. In that case, the rule is the only thing we have to care. We
don't need care the code any more to avoid mismatch between kernel/firmware.


Ok, duplication is reasonable, then the major point for this is we need to
have a clear rule for restoring configuration for a device.

Than I suggest we could have another patch set to handle this. Define the rule
clearly and restore the configuration in kernel when skiboot firmware export
such rules.

-- 
Richard Yang
Help you, Help me

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] powerpc/eeh: Fix wrong printed PE number

2015-05-12 Thread Gavin Shan

On LE kernel, the non-existing PE number in BE format derived from
skiboot firmware isn't converted to LE format properly as following
kernel log indicates:

   EEH: Clear non-existing PHB#4-PE#200

Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index ce738ab..9e001e1 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1396,7 +1396,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
be64_to_cpu(frozen_pe_no), pe)) {
/* Try best to clear it */
pr_info(EEH: Clear non-existing 
PHB#%x-PE#%llx\n,
-   hose-global_number, frozen_pe_no);
+   hose-global_number, 
be64_to_cpu(frozen_pe_no));
pr_info(EEH: PHB location: %s\n,
eeh_pe_loc_get(phb_pe));
opal_pci_eeh_freeze_clear(phb-opal_id,
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] powerpc/eeh: Dump PHB diag-data for non-existing PE

2015-05-12 Thread Gavin Shan

When detecting EEH error on non-existing PE, including the reserved
one, the PE is simply unfrozen without dumping the PHB diag-data,
which is useful for locating the root cause of the EEH error. The
patch dumps the PHB diag-data when non-existing PE reports error.

Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 9e001e1..bbcf81f 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1394,11 +1394,19 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 */
if (pnv_eeh_get_pe(hose,
be64_to_cpu(frozen_pe_no), pe)) {
-   /* Try best to clear it */
pr_info(EEH: Clear non-existing 
PHB#%x-PE#%llx\n,
hose-global_number, 
be64_to_cpu(frozen_pe_no));
pr_info(EEH: PHB location: %s\n,
eeh_pe_loc_get(phb_pe));
+
+   /* Dump PHB diag-data */
+   rc = opal_pci_get_phb_diag_data2(phb-opal_id,
+   phb-diag.blob, PNV_PCI_DIAG_BUF_SIZE);
+   if (rc == OPAL_SUCCESS)
+   pnv_pci_dump_phb_diag_data(hose,
+   phb-diag.blob);
+
+   /* Try best to clear it */
opal_pci_eeh_freeze_clear(phb-opal_id,
frozen_pe_no,
OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[alsa-devel][PATCH 3/3] ASoC: fsl_sai: add 12kHz, 24kHz, 176.4kHz and 192kHz sample rate support

2015-05-12 Thread Zidan Wang

Normally we don't support 12kHz, 24kHz in audio driver, alsa didn't
have formal definition of 12kHz, 24kHz, but alsa supply a way to
support these sample rates. And add 176.4kHz and 192kHz support.

Signed-off-by: Zidan Wang zidan.w...@freescale.com
---
 sound/soc/fsl/fsl_sai.c | 24 +---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c
index 88f5861..7efcac4 100644
--- a/sound/soc/fsl/fsl_sai.c
+++ b/sound/soc/fsl/fsl_sai.c
@@ -27,6 +27,17 @@
 #define FSL_SAI_FLAGS (FSL_SAI_CSR_SEIE |\
   FSL_SAI_CSR_FEIE)
 
+static u32 fsl_sai_rates[] = {
+   8000, 11025, 12000, 16000, 22050,
+   24000, 32000, 44100, 48000, 64000,
+   88200, 96000, 176400, 192000
+};
+
+static struct snd_pcm_hw_constraint_list fsl_sai_rate_constraints = {
+   .count = ARRAY_SIZE(fsl_sai_rates),
+   .list = fsl_sai_rates,
+};
+
 static irqreturn_t fsl_sai_isr(int irq, void *devid)
 {
struct fsl_sai *sai = (struct fsl_sai *)devid;
@@ -531,7 +542,10 @@ static int fsl_sai_startup(struct snd_pcm_substream 
*substream,
regmap_update_bits(sai-regmap, FSL_SAI_xCR3(tx), FSL_SAI_CR3_TRCE,
   FSL_SAI_CR3_TRCE);
 
-   return 0;
+   ret = snd_pcm_hw_constraint_list(substream-runtime, 0,
+   SNDRV_PCM_HW_PARAM_RATE, fsl_sai_rate_constraints);
+
+   return ret;
 }
 
 static void fsl_sai_shutdown(struct snd_pcm_substream *substream,
@@ -586,14 +600,18 @@ static struct snd_soc_dai_driver fsl_sai_dai = {
.stream_name = CPU-Playback,
.channels_min = 1,
.channels_max = 2,
-   .rates = SNDRV_PCM_RATE_8000_96000,
+   .rate_min = 8000,
+   .rate_max = 192000,
+   .rates = SNDRV_PCM_RATE_KNOT,
.formats = FSL_SAI_FORMATS,
},
.capture = {
.stream_name = CPU-Capture,
.channels_min = 1,
.channels_max = 2,
-   .rates = SNDRV_PCM_RATE_8000_96000,
+   .rate_min = 8000,
+   .rate_max = 192000,
+   .rates = SNDRV_PCM_RATE_KNOT,
.formats = FSL_SAI_FORMATS,
},
.ops = fsl_sai_pcm_dai_ops,
-- 
1.9.1

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 0/3] cpuidle: updates related to tick_broadcast_enter() failures

2015-05-12 Thread Daniel Lezcano


On 05/12/2015 01:31 AM, Rafael J. Wysocki wrote:

On Monday, May 11, 2015 07:40:41 PM Daniel Lezcano wrote:

On 05/10/2015 01:15 AM, Rafael J. Wysocki wrote:

On Saturday, May 09, 2015 10:33:05 PM Rafael J. Wysocki wrote:

On Saturday, May 09, 2015 10:11:41 PM Rafael J. Wysocki wrote:

On Saturday, May 09, 2015 11:19:16 AM Preeti U Murthy wrote:

Hi Rafael,

On 05/08/2015 07:48 PM, Rafael J. Wysocki wrote:


[cut]



+   /* Take note of the planned idle state. */
+   idle_set_state(smp_processor_id(), target_state);


And I wouldn't do this either.

The behavior here is pretty much as though the driver demoted the state chosen
by the governor and we don't call idle_set_state() again in those cases.


Why is this wrong?


It is not wrong, but incomplete, because demotions done by the cpuidle driver
should also be taken into account in the same way.

But I'm seeing that the recent patch of mine that made cpuidle_enter_state()
call default_idle_call() was a mistake, because it might confuse 
find_idlest_cpu()
significantly as to what state the CPU is in.  I'll drop that one for now.


OK, done.

So after I've dropped it I think we need to do three things:
(1) Move the idle_set_state() calls to cpuidle_enter_state().
(2) Make cpuidle_enter_state() call default_idle_call() again, but this time
  do that *before* it has called idle_set_state() for target_state.
(3) Introduce demotion as per my last patch.

Let me cut patches for that.


Done as per the above and the patches follow in replies to this messge.

All on top of the current linux-next branch of the linux-pm.git tree.


IMO the resulting code is more and more confusing.


Why is it confusing?

What part of it is confusing?

Patches [1-2/3] simply replace https://patchwork.kernel.org/patch/6326761/
and I'm not sure why that would be confusing.

Patch [3/3] simply causes cpuidle_enter_state() to pick up a more suitable
state if tick_broadcast_enter() fails instead of returning an error code
in that case.  What exactly is confusing in that?


Except I miss something, the tick_broadcast_enter can fail only if the
local timer of the current cpu is used as a broadcast timer (which is
the case today for PPC only).


well, why does this matter?


The correct fix would be to tie this local timer with the cpu power
domain and disable the idle state powering down this domain like it was
done for the renesas cpuidle driver.

IOW, the cpu power domain is in use (because of its local timer), so we
shouldn't shut it down.

No ?


Sorry, I'm not sure what you're talking about.

The problem at hand is that tick_broadcast_enter() can fail and we need to
handle that.  If we can prevent it from ever failing, that would be awesome,
but quite honestly I don't see how to do that ATM.


Ok, sorry. Let me clarify.

You did a mechanism two years ago with pm_genpd_attach_cpuidle and 
power_on/off. That disables a cpuidle state when a power domain is in use.


The idea I was proposing is to reuse this approach.

The logic is:

The local timer is in use, this idle state power downs this timer, then 
disable it.


So it is when the broadcast timer is 'bound_on' a cpu, we disable the 
idle states. That could be done via a loop looking for the TIMER_STOP 
flag or via the power domain.


Hence the cpuidle_select will never return a state which powers downs 
the local cpu (because they are disabled) and tick_broadcast_enter can't 
fail because it is never called.


Does it make more sense ?


I am aware this is not easily fixable because the genpd framework is
incomplete and has some restrictions but I believe it is worth to have a
discussion. Add Kevin and Ulf in Cc.


So I'm going to queue up these patches for 4.2 and we can have a discussion
just fine regardless.




--
 http://www.linaro.org/ Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  http://www.facebook.com/pages/Linaro Facebook |
http://twitter.com/#!/linaroorg Twitter |
http://www.linaro.org/linaro-blog/ Blog

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH V3 2/9] powerpc/pci_dn: cache vf_index in pci_dn

2015-05-12 Thread Wei Yang

On Tue, May 12, 2015 at 04:15:58PM +1000, Gavin Shan wrote:
On Mon, May 11, 2015 at 01:54:12PM +0800, Wei Yang wrote:
On Mon, May 11, 2015 at 12:21:04PM +1000, Gavin Shan wrote:
On Mon, May 04, 2015 at 03:07:31PM +0800, Wei Yang wrote:
This patch caches the index of a VF in its PF in pci_dn.


At least you can mention the purpose of vf_index to make the commit log
complete. The following message looks better?

The patch caches the VF index in pci_dn, which can be used to calculate
VF's bus, device and function number. Those information helps to locate
the VF's PCI device instance when doing hotplug during EEH recovery if
necessary.


Thanks, looks better. I added it in the log.

Signed-off-by: Wei Yang weiy...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pci-bridge.h |1 +
 arch/powerpc/kernel/pci_dn.c  |5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 1811c44..9582aa2 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -199,6 +199,7 @@ struct pci_dn {
 #ifdef CONFIG_PCI_IOV
u16 vfs_expanded;   /* number of VFs IOV BAR expanded */
u16 num_vfs;/* number of VFs enabled*/
+   int vf_index;   /* Index to PF for VF dev */
^^^

 /* VF index in the PF */

Ok, changed in the code.


And I believe it can be unsigned int, or u16. We should have
non-negative vf_index, no?

Take a look in the virtfn_add(), the index in drivers/pci/iov.c is int. So I
copy that.


int offset; /* PE# for the first VF PE */
 #define M64_PER_IOV 4
int m64_per_iov;
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index b3b4df9..bf0fb873 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -138,7 +138,7 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev)

 #ifdef CONFIG_PCI_IOV
 static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
-  struct pci_dev *pdev,
+  struct pci_dev *pdev, int vf_index,

struct pci_dev *pdev,
int vf_index;

Some reason for this comment?

That does not exceed 80 characters.


No, it doesn't exceed 80 characters as you said. You take one of the following
formats, not the one you're using:

add_one_dev_pci_data(foo1, foo2,  add_one_dev_pci_data(foo1,
 foo3, foo4,   foo2,
 foo5, foo6);:
   foo6);


Thanks

   int busno, int devfn)
 {
struct pci_dn *pdn;
@@ -157,6 +157,7 @@ static struct pci_dn *add_one_dev_pci_data(struct 
pci_dn *parent,
pdn-parent = parent;
pdn-busno = busno;
pdn-devfn = devfn;
+   pdn-vf_index = vf_index;
 #ifdef CONFIG_PPC_POWERNV
pdn-pe_number = IODA_INVALID_PE;
 #endif
@@ -196,7 +197,7 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
return NULL;

for (i = 0; i  pci_sriov_get_totalvfs(pdev); i++) {
-   pdn = add_one_dev_pci_data(parent, NULL,
+   pdn = add_one_dev_pci_data(parent, NULL, i,
   pci_iov_virtfn_bus(pdev, i),
   pci_iov_virtfn_devfn(pdev, i));
if (!pdn) {

Thanks,
Gavin

-- 
Richard Yang
Help you, Help me

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH V3 5/9] powerpc/eeh: create EEH_PE_VF for VF PE

2015-05-12 Thread Wei Yang

On Tue, May 12, 2015 at 04:28:23PM +1000, Gavin Shan wrote:
On Mon, May 11, 2015 at 02:25:49PM +0800, Wei Yang wrote:
On Mon, May 11, 2015 at 12:37:07PM +1000, Gavin Shan wrote:
On Mon, May 04, 2015 at 03:07:34PM +0800, Wei Yang wrote:

Please reorder PATCH[6] with this one because the EEH device is expected
to be created before EEH PE.

That's a good idea.


On powernv platform, VF PE is a special PE which is different from the Bus
PE.  On the EEH side, it needs a corresponding concept to handle the VF PE
properly. For example, we need to create VF PE when VF's pci_dev is
initialized in kernel. And add a flag to mark it is a VF PF.
   ^


From above commit log, my understanding is that you're adding a flag to
identify VF PE, which is handled differently from bus PE. You missed the
details on the difference between them and the speical treament to VF PE.
Could you help add those information in the commit log to make it looks
complete?


This patch just introduce the VF PE. For those differences, we have another
patch handle VF PE properly to cover. In the log of that patch, I listed
those differences. Do you think this is fine?


It's fine to me.

This patch introduces the EEH_PE_VF type for VF PE and creates it for a VF.
At the mean time, it creates the sysfs and address cache for VF PE.
   

   it creates the sysfs and address cache for VF PE at PCI
device final fixup time.


Signed-off-by: Wei Yang weiy...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/eeh.h   |1 +
 arch/powerpc/kernel/eeh_pe.c |   12 ++--
 arch/powerpc/platforms/powernv/eeh-powernv.c |   12 
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index a52db28..56e8cd9 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -70,6 +70,7 @@ struct pci_dn;
 #define EEH_PE_PHB (1  1)/* PHB PE*/
 #define EEH_PE_DEVICE  (1  2)/* Device PE */
 #define EEH_PE_BUS (1  3)/* Bus PE*/
+#define EEH_PE_VF  (1  4)/* VF PE */

 #define EEH_PE_ISOLATED(1  0)/* Isolated PE  
 */
 #define EEH_PE_RECOVERING  (1  1)/* Recovering PE*/
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 35f0b62..edfe63a 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -299,7 +299,12 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev 
*edev)
 * EEH device already having associated PE, but
 * the direct parent EEH device doesn't have yet.
 */
-   pdn = pdn ? pdn-parent : NULL;
+#ifdef CONFIG_PCI_IOV
+   if (edev-mode  EEH_DEV_VF)
+   pdn = pci_get_pdn(edev-physfn);
+   else
+#endif
+   pdn = pdn ? pdn-parent : NULL;

[A]

while (pdn) {
/* We're poking out of PCI territory */
parent = pdn_to_eeh_dev(pdn);
@@ -382,7 +387,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
}

/* Create a new EEH PE */
-   pe = eeh_pe_alloc(edev-phb, EEH_PE_DEVICE);
+   if (edev-mode  EEH_DEV_VF)
+   pe = eeh_pe_alloc(edev-phb, EEH_PE_VF);
+   else
+   pe = eeh_pe_alloc(edev-phb, EEH_PE_DEVICE);

You don't have CONFIG_PCI_IOV here to protect the code, but you had
that at [A]. In order to keep the code look consistent, you either
add it or remove it for all places. I prefer to remove it, which
we don't need CONFIG_PCI_IOV.


Ok, that's fine to remove it.

BTW, if remove the CONFIG_PCI_IOV, we need to remove it around the physfn in
eeh_dev definition. That's fine?


It's fine to me.

if (!pe) {
pr_err(%s: out of memory!\n, __func__);
return -ENOMEM;
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 622f08c..5447481 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1540,3 +1540,15 @@ static int __init eeh_powernv_init(void)
return ret;
 }
 machine_early_initcall(powernv, eeh_powernv_init);
+
+#ifdef CONFIG_PCI_IOV
+static void pnv_pci_fixup_vf_eeh(struct pci_dev *pdev)
+{

Please rename it to pnv_eeh_vf_final_fixup(). Names of all functions
in this file expect prefix pnv_eeh_. With _final_, it's clearly
to tell it's called on PCI device final fixup time.


ok

+   /* sysfs files should only be added after devices are added */

It's nice to explain why here: sysfs for the PCI device isn't populated
and the MMIO resource isn't finalized for the PCI device yet.


Don't get your point.

sysfs of the PCI device is populated at this point.


You have two operations here: (A) add the PCI device to EEH address cache;
(B) add EEH related sysfs entries. (A) requires that the resources (MMIO
on PHB3) of the VF is finalized. (B) requires

Re: [PATCH V3 6/9] powerpc/powernv: create/release eeh_dev for VF

2015-05-12 Thread Wei Yang

On Mon, May 11, 2015 at 12:48:56PM +1000, Gavin Shan wrote:
On Mon, May 04, 2015 at 03:07:35PM +0800, Wei Yang wrote:

Please order this patch and PATCH[5] because EEH device is expected to
be created before EEH PE.

EEH on powerpc platform needs eeh_dev structure to track the pci device
  ^^^
 PCI

status. Since VFs are created/released dynamically, VF's eeh_dev is also
dynamically created/released in system.

This patch creates/removes eeh_dev when pci_dn is created/removed for VFs,
and marks it with EEH_DEV_VF type.

Signed-off-by: Wei Yang weiy...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/eeh.h |7 +++
 arch/powerpc/kernel/eeh.c  |4 
 arch/powerpc/kernel/eeh_dev.c  |   20 
 arch/powerpc/kernel/pci_dn.c   |7 +++
 4 files changed, 38 insertions(+)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 56e8cd9..2067de4 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -124,6 +124,7 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe)
 #define EEH_DEV_NO_HANDLER   (1  8)/* No error handler */
 #define EEH_DEV_SYSFS(1  9)/* Sysfs created
 */
 #define EEH_DEV_REMOVED  (1  10)   /* Removed permanently  
 */
+#define EEH_DEV_VF   (1  11)   /* VF port  */


Why you need this flag? I guess edev-physfn can be used to distinguish
it's a normal or VF eeh_dev.


Just like we have EEH_DEV_BRIDGE and EEH_DEV_DS_PORT, I use the flag
EEH_DEV_VF to mark it a VF eeh_dev.


-- 
Richard Yang
Help you, Help me

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[alsa-devel][PATCH 2/3] ASoC: fsl_sai: Add tdm slots operation for SAI master mode

2015-05-12 Thread Zidan Wang

Add tdm slot operation for SAI master mode. When using SAI as master
mode, we should use set_tdm_slot() helper function to set tdm slots
in machine driver, or it will using default value of slots and slot
width.

SAI will generate BCLK depends on sample rate, slots and slot width.
And there may be unused BCLK cycles before each LRCLK transition.

Signed-off-by: Zidan Wang zidan.w...@freescale.com
---
 sound/soc/fsl/fsl_sai.c | 28 ++--
 sound/soc/fsl/fsl_sai.h |  3 +++
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c
index 1ccc10d1..88f5861 100644
--- a/sound/soc/fsl/fsl_sai.c
+++ b/sound/soc/fsl/fsl_sai.c
@@ -115,6 +115,17 @@ out:
return IRQ_HANDLED;
 }
 
+static int fsl_sai_set_dai_tdm_slot(struct snd_soc_dai *cpu_dai, u32 tx_mask,
+   u32 rx_mask, int slots, int slot_width)
+{
+   struct fsl_sai *sai = snd_soc_dai_get_drvdata(cpu_dai);
+
+   sai-slots = slots;
+   sai-slot_width = slot_width;
+
+   return 0;
+}
+
 static int fsl_sai_set_dai_sysclk_tr(struct snd_soc_dai *cpu_dai,
int clk_id, unsigned int freq, int fsl_dir)
 {
@@ -372,11 +383,13 @@ static int fsl_sai_hw_params(struct snd_pcm_substream 
*substream,
unsigned int channels = params_channels(params);
u32 word_width = snd_pcm_format_width(params_format(params));
u32 val_cr4 = 0, val_cr5 = 0;
+   u32 slot_width = word_width;
int ret;
 
if (!sai-is_slave_mode) {
+   slot_width = sai-slot_width;
ret = fsl_sai_set_bclk(cpu_dai, tx,
-   2 * word_width * params_rate(params));
+   sai-slots * slot_width * params_rate(params));
if (ret)
return ret;
 
@@ -388,21 +401,20 @@ static int fsl_sai_hw_params(struct snd_pcm_substream 
*substream,
 
sai-mclk_streams |= BIT(substream-stream);
}
-
}
 
if (!sai-is_dsp_mode)
-   val_cr4 |= FSL_SAI_CR4_SYWD(word_width);
+   val_cr4 |= FSL_SAI_CR4_SYWD(slot_width);
 
-   val_cr5 |= FSL_SAI_CR5_WNW(word_width);
-   val_cr5 |= FSL_SAI_CR5_W0W(word_width);
+   val_cr5 |= FSL_SAI_CR5_WNW(slot_width);
+   val_cr5 |= FSL_SAI_CR5_W0W(slot_width);
 
if (sai-is_lsb_first)
val_cr5 |= FSL_SAI_CR5_FBT(0);
else
val_cr5 |= FSL_SAI_CR5_FBT(word_width - 1);
 
-   val_cr4 |= FSL_SAI_CR4_FRSZ(channels);
+   val_cr4 |= FSL_SAI_CR4_FRSZ(sai-slots);
 
regmap_update_bits(sai-regmap, FSL_SAI_xCR4(tx),
   FSL_SAI_CR4_SYWD_MASK | FSL_SAI_CR4_FRSZ_MASK,
@@ -536,6 +548,7 @@ static void fsl_sai_shutdown(struct snd_pcm_substream 
*substream,
 static const struct snd_soc_dai_ops fsl_sai_pcm_dai_ops = {
.set_sysclk = fsl_sai_set_dai_sysclk,
.set_fmt= fsl_sai_set_dai_fmt,
+   .set_tdm_slot   = fsl_sai_set_dai_tdm_slot,
.hw_params  = fsl_sai_hw_params,
.hw_free= fsl_sai_hw_free,
.trigger= fsl_sai_trigger,
@@ -721,6 +734,9 @@ static int fsl_sai_probe(struct platform_device *pdev)
}
}
 
+   sai-slots = 2;
+   sai-slot_width = 32;
+
irq = platform_get_irq(pdev, 0);
if (irq  0) {
dev_err(pdev-dev, no irq for node %s\n, pdev-name);
diff --git a/sound/soc/fsl/fsl_sai.h b/sound/soc/fsl/fsl_sai.h
index 0662809..1ec09d6 100644
--- a/sound/soc/fsl/fsl_sai.h
+++ b/sound/soc/fsl/fsl_sai.h
@@ -142,6 +142,9 @@ struct fsl_sai {
 
unsigned int mclk_id[2];
unsigned int mclk_streams;
+   unsigned int slots;
+   unsigned int slot_width;
+
struct snd_dmaengine_dai_dma_data dma_params_rx;
struct snd_dmaengine_dai_dma_data dma_params_tx;
 };
-- 
1.9.1

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[alsa-devel][PATCH 1/3] ASoC: fsl_sai: add sai master mode support

2015-05-12 Thread Zidan Wang

When sai works on master mode, set its bit clock and frame clock.

SAI has 4 MCLK source, bus clock, MCLK1, MCLK2 and MCLK3. fsl_sai_set_bclk
will select proper MCLK source, then calculate and set the bit clock divider.

After fsl_sai_set_bclk, enable the selected mclk in hw_params(), and add
hw_free() to disable the mclk.

Signed-off-by: Zidan Wang zidan.w...@freescale.com
---
 sound/soc/fsl/fsl_sai.c | 117 ++--
 sound/soc/fsl/fsl_sai.h |   9 +++-
 2 files changed, 121 insertions(+), 5 deletions(-)

diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c
index ee2671b..1ccc10d1 100644
--- a/sound/soc/fsl/fsl_sai.c
+++ b/sound/soc/fsl/fsl_sai.c
@@ -1,7 +1,7 @@
 /*
  * Freescale ALSA SoC Digital Audio Interface (SAI) driver.
  *
- * Copyright 2012-2013 Freescale Semiconductor, Inc.
+ * Copyright 2012-2015 Freescale Semiconductor, Inc.
  *
  * This program is free software, you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
@@ -251,12 +251,14 @@ static int fsl_sai_set_dai_fmt_tr(struct snd_soc_dai 
*cpu_dai,
val_cr4 |= FSL_SAI_CR4_FSD_MSTR;
break;
case SND_SOC_DAIFMT_CBM_CFM:
+   sai-is_slave_mode = true;
break;
case SND_SOC_DAIFMT_CBS_CFM:
val_cr2 |= FSL_SAI_CR2_BCD_MSTR;
break;
case SND_SOC_DAIFMT_CBM_CFS:
val_cr4 |= FSL_SAI_CR4_FSD_MSTR;
+   sai-is_slave_mode = true;
break;
default:
return -EINVAL;
@@ -288,6 +290,79 @@ static int fsl_sai_set_dai_fmt(struct snd_soc_dai 
*cpu_dai, unsigned int fmt)
return ret;
 }
 
+static int fsl_sai_set_bclk(struct snd_soc_dai *dai, bool tx, u32 freq)
+{
+   struct fsl_sai *sai = snd_soc_dai_get_drvdata(dai);
+   unsigned long clk_rate;
+   u32 savediv = 0, ratio, savesub = freq;
+   u32 id;
+   int ret = 0;
+
+   /* Don't apply to slave mode */
+   if (sai-is_slave_mode)
+   return 0;
+
+   for (id = 0; id  FSL_SAI_MCLK_MAX; id++) {
+   clk_rate = clk_get_rate(sai-mclk_clk[id]);
+   if (!clk_rate)
+   continue;
+
+   ratio = clk_rate / freq;
+
+   ret = clk_rate - ratio * freq;
+
+   /*
+* Drop the source that can not be
+* divided into the required rate.
+*/
+   if (ret != 0  clk_rate / ret  1000)
+   continue;
+
+   dev_dbg(dai-dev,
+   ratio %d for freq %dHz based on clock %ldHz\n,
+   ratio, freq, clk_rate);
+
+   if (ratio % 2 == 0  ratio = 2  ratio = 512)
+   ratio /= 2;
+   else
+   continue;
+
+   if (ret  savesub) {
+   savediv = ratio;
+   sai-mclk_id[tx] = id;
+   savesub = ret;
+   }
+
+   if (ret == 0)
+   break;
+   }
+
+   if (savediv == 0) {
+   dev_err(dai-dev, failed to derive required %cx rate: %d\n,
+   tx ? 'T' : 'R', freq);
+   return -EINVAL;
+   }
+
+   if ((tx  sai-synchronous[TX]) || (!tx  !sai-synchronous[RX])) {
+   regmap_update_bits(sai-regmap, FSL_SAI_RCR2,
+  FSL_SAI_CR2_MSEL_MASK,
+  FSL_SAI_CR2_MSEL(sai-mclk_id[tx]));
+   regmap_update_bits(sai-regmap, FSL_SAI_RCR2,
+  FSL_SAI_CR2_DIV_MASK, savediv - 1);
+   } else {
+   regmap_update_bits(sai-regmap, FSL_SAI_TCR2,
+  FSL_SAI_CR2_MSEL_MASK,
+  FSL_SAI_CR2_MSEL(sai-mclk_id[tx]));
+   regmap_update_bits(sai-regmap, FSL_SAI_TCR2,
+  FSL_SAI_CR2_DIV_MASK, savediv - 1);
+   }
+
+   dev_dbg(dai-dev, best fit: clock id=%d, div=%d, deviation =%d\n,
+   sai-mclk_id[tx], savediv, savesub);
+
+   return 0;
+}
+
 static int fsl_sai_hw_params(struct snd_pcm_substream *substream,
struct snd_pcm_hw_params *params,
struct snd_soc_dai *cpu_dai)
@@ -297,6 +372,24 @@ static int fsl_sai_hw_params(struct snd_pcm_substream 
*substream,
unsigned int channels = params_channels(params);
u32 word_width = snd_pcm_format_width(params_format(params));
u32 val_cr4 = 0, val_cr5 = 0;
+   int ret;
+
+   if (!sai-is_slave_mode) {
+   ret = fsl_sai_set_bclk(cpu_dai, tx,
+   2 * word_width * params_rate(params));
+   if (ret)
+   return ret;
+
+   /* Do not enable the clock if it is already enabled */
+

Re: [PATCH RESEND v6 1/2] arm64: dts: Add the arasan sdhci nodes in apm-storm.dtsi

2015-05-12 Thread Arnd Bergmann

On Tuesday 12 May 2015 11:03:46 Suman Tripathi wrote:
  @@ -533,6 +567,15 @@
  interrupts = 0x0 0x4f 0x4;
  };
 
  +   sdhci0: sdhci@1c00 {
  +   compatible = arasan,sdhci-4.9a;
  +   reg = 0x0 0x1c00 0x0 0x100;
  +   interrupts = 0x0 0x49 0x4;
  +   dma-coherent;
  +   clock-names = clk_xin, clk_ahb;
  +   clocks = sdioclk 0, ahbclk 0;
  +   };
  +
  phy1: phy@1f21a000 {
  compatible = apm,xgene-phy;
  reg = 0x0 0x1f21a000 0x0 0x100;
  --
  1.8.2.1
 
 
 Can anyone from dt community review this patch ? I have changed the dts node
 names from sdhc to sdhci as per Arnd, Michael comments .

I was actually asking for it to be named 'mmc', not 'sdhci', because the
name is supposed to indicate the purpose of the device, not the
implementation. I realize that we are inconsistent here, just as with
'uart' vs 'serial', and that ePAPR does not define what to do.

We should probably add something to 
Documentation/devicetree/bindings/mmc/mmc.txt
about this topic and change all the dts files accordingly (unless there
is a risk for regressions). At the moment, the mmc.txt file also includes
an example with 'sdhci', not 'mmc'.

Arnd
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v3 1/2] perf/kvm: Port perf kvm to powerpc

2015-05-12 Thread Hemant Kumar


Hi Scott,

On 05/12/2015 03:38 AM, Scott Wood wrote:

On Fri, 2015-05-08 at 06:37 +0530, Hemant Kumar wrote:

From: Srikar Dronamraju sri...@linux.vnet.ibm.com

perf kvm can be used to analyze guest exit reasons. This support already
exists in x86. Hence, porting it to powerpc.

  - To trace KVM events :
   perf kvm stat record
   If many guests are running, we can track for a specific guest by using
   --pid as in : perf kvm stat record --pid pid

  - To see the results :
   perf kvm stat report

The result shows the number of exits (from the guest context to
host/hypervisor context) grouped by their respective exit reasons with
their frequency.

This patch makes use of the guest exit reasons available in
trace_book3s.h. It records on two already available tracepoints :
kvm_hv:kvm_guest_exit and kvm_hv:kvm_guest_enter.

Note : This patch has a dependency on the patch kvm/powerpc: Export
kvm exit reasons which exports the KVM exit reasons through the uapi.

Here is a sample o/p:
  # pgrep qemu
19378
60515

2 Guests are running on the host.

  # perf kvm stat record -a
^C[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 4.153 MB perf.data.guest (39624 samples) ]

  # perf kvm stat report -p 60515
Analyze events for pid(s) 60515, all VCPUs:

VM-EXITSamples  Samples% Time%Min Time Max
Time Avg time

H_DATA_STORAGE   500635.30% 0.13%  1.94us 49.46us 
12.37us ( +-   0.52% )
HV_DECREMENTER   445731.43% 0.02%  0.72us 16.14us  
1.91us ( +-   0.96% )
SYSCALL   269018.97% 0.10%  2.84us528.24us 
18.29us ( +-   3.75% )
RETURN_TO_HOST   178912.61%99.76%  1.58us 672791.91us  
27470.23us ( +-   3.00% )
   EXTERNAL240 1.69% 0.00%0.69us 10.67us  
1.33us ( +-   5.34% )

Total Samples:14182, Total events handled time:49264158.30us.

Signed-off-by: Srikar Dronamraju sri...@linux.vnet.ibm.com
Signed-off-by: Hemant Kumar hem...@linux.vnet.ibm.com
---
Patch has a dependency on : https://patchwork.ozlabs.org/patch/469839/
which exports the exit reasons to perf through uapi.

Changes:
- Original series split into two patchsets now : perf and powerpc
   side changes.

  arch/powerpc/include/uapi/asm/kvm_perf.h | 15 +++
  tools/perf/arch/powerpc/Makefile |  1 +
  tools/perf/arch/powerpc/util/Build   |  1 +
  tools/perf/arch/powerpc/util/kvm-stat.c  | 33 
  4 files changed, 50 insertions(+)
  create mode 100644 arch/powerpc/include/uapi/asm/kvm_perf.h
  create mode 100644 tools/perf/arch/powerpc/util/kvm-stat.c

diff --git a/arch/powerpc/include/uapi/asm/kvm_perf.h 
b/arch/powerpc/include/uapi/asm/kvm_perf.h
new file mode 100644
index 000..30fa670
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/kvm_perf.h
@@ -0,0 +1,15 @@
+#ifndef _ASM_POWERPC_KVM_PERF_H
+#define _ASM_POWERPC_KVM_PERF_H
+
+#include asm/trace_book3s.h
+#include asm/kvm.h
+
+#define DECODE_STR_LEN 20
+
+#define VCPU_ID vcpu_id
+
+#define KVM_ENTRY_TRACE kvm_hv:kvm_guest_enter
+#define KVM_EXIT_TRACE kvm_hv:kvm_guest_exit
+#define KVM_EXIT_REASON trap
+
+#endif /* _ASM_POWERPC_KVM_PERF_H */

Please make sure that anything book3s-specific is named that way.


Are you suggesting to name it to something like _ASM_POWERPC_BOOK3S_PERF_H ?


And shouldn't this be part of the arch/powerpc-side patchset?


It should. Thanks, will move this to arch/powerpc side patchset.


diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile
index 7fbca17..21322e0 100644
--- a/tools/perf/arch/powerpc/Makefile
+++ b/tools/perf/arch/powerpc/Makefile
@@ -1,3 +1,4 @@
  ifndef NO_DWARF
  PERF_HAVE_DWARF_REGS := 1
  endif
+HAVE_KVM_STAT_SUPPORT := 1

Does this stuff fail gracefully if used on a PPC target that doesn't
support this?


Yes, it does.


-Scott


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


--
Thanks,
Hemant Kumar

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 0/3] cpuidle: updates related to tick_broadcast_enter() failures

2015-05-12 Thread Daniel Lezcano


On 05/12/2015 03:23 PM, Rafael J. Wysocki wrote:

On Tuesday, May 12, 2015 10:41:35 AM Daniel Lezcano wrote:

On 05/12/2015 01:31 AM, Rafael J. Wysocki wrote:

On Monday, May 11, 2015 07:40:41 PM Daniel Lezcano wrote:

On 05/10/2015 01:15 AM, Rafael J. Wysocki wrote:

On Saturday, May 09, 2015 10:33:05 PM Rafael J. Wysocki wrote:

On Saturday, May 09, 2015 10:11:41 PM Rafael J. Wysocki wrote:

On Saturday, May 09, 2015 11:19:16 AM Preeti U Murthy wrote:

Hi Rafael,

On 05/08/2015 07:48 PM, Rafael J. Wysocki wrote:


[cut]



+   /* Take note of the planned idle state. */
+   idle_set_state(smp_processor_id(), target_state);


And I wouldn't do this either.

The behavior here is pretty much as though the driver demoted the state chosen
by the governor and we don't call idle_set_state() again in those cases.


Why is this wrong?


It is not wrong, but incomplete, because demotions done by the cpuidle driver
should also be taken into account in the same way.

But I'm seeing that the recent patch of mine that made cpuidle_enter_state()
call default_idle_call() was a mistake, because it might confuse 
find_idlest_cpu()
significantly as to what state the CPU is in.  I'll drop that one for now.


OK, done.

So after I've dropped it I think we need to do three things:
(1) Move the idle_set_state() calls to cpuidle_enter_state().
(2) Make cpuidle_enter_state() call default_idle_call() again, but this time
   do that *before* it has called idle_set_state() for target_state.
(3) Introduce demotion as per my last patch.

Let me cut patches for that.


Done as per the above and the patches follow in replies to this messge.

All on top of the current linux-next branch of the linux-pm.git tree.


IMO the resulting code is more and more confusing.


Why is it confusing?

What part of it is confusing?

Patches [1-2/3] simply replace https://patchwork.kernel.org/patch/6326761/
and I'm not sure why that would be confusing.

Patch [3/3] simply causes cpuidle_enter_state() to pick up a more suitable
state if tick_broadcast_enter() fails instead of returning an error code
in that case.  What exactly is confusing in that?


Except I miss something, the tick_broadcast_enter can fail only if the
local timer of the current cpu is used as a broadcast timer (which is
the case today for PPC only).


well, why does this matter?


The correct fix would be to tie this local timer with the cpu power
domain and disable the idle state powering down this domain like it was
done for the renesas cpuidle driver.

IOW, the cpu power domain is in use (because of its local timer), so we
shouldn't shut it down.

No ?


Sorry, I'm not sure what you're talking about.

The problem at hand is that tick_broadcast_enter() can fail and we need to
handle that.  If we can prevent it from ever failing, that would be awesome,
but quite honestly I don't see how to do that ATM.


Ok, sorry. Let me clarify.

You did a mechanism two years ago with pm_genpd_attach_cpuidle and
power_on/off. That disables a cpuidle state when a power domain is in use.

The idea I was proposing is to reuse this approach.

The logic is:

The local timer is in use, this idle state power downs this timer, then
disable it.


I'm not sure it's about powering down.  Stopping rather (which may or may
not involve powering down).


So it is when the broadcast timer is 'bound_on' a cpu, we disable the
idle states. That could be done via a loop looking for the TIMER_STOP
flag or via the power domain.

Hence the cpuidle_select will never return a state which powers downs
the local cpu (because they are disabled) and tick_broadcast_enter can't
fail because it is never called.

Does it make more sense ?


Well, you've not explained what's confusing in the code after this series
in the first place. :-)


It is not the series itself but the sum of the recent changes in this 
area makes the overall more and more difficult to maintain. But that's a 
personal opinion. Sounds like we are trying to catch the corner cases 
each time there is a change somewhere.



Second, quite honestly, I don't see a connection to genpd here.


Probably I am not clear :)

The connection we have is the local timer and the cpuidle framework 
shutting it down. Why ? Because the local timer belongs to the cpu's 
power domain.


Using the genpd to describe this relation between an idle state and the 
devices impacted by via a power domain is, in my opinion, a nice 
abstraction and a good opportunity to integrate the different 
subsystems. Furthermore it is consistent with Kevin's investigation 
around the power domain and SoC idle.


Kevin ?


What you seem to be saying is maybe we can eliminate the need to check the
return value of tick_broadcast_enter() in the idle loop if we proactively
disable the TIMER_STOP idle states of a CPU when we start to use that CPU's
timer as a broadcast one.


Well, not exactly. That's the consequence.

I meant, using any devices in a specific power domain makes impossible 
to

Re: G5 Xserve rackmeter broken?

2015-05-12 Thread Aaro Koskinen

Hi,

On Mon, May 11, 2015 at 08:13:35AM +1000, Benjamin Herrenschmidt wrote:
 On Sun, 2015-05-10 at 21:32 +0300, Aaro Koskinen wrote:
  Hi,
  
  With 4.1-rc2 the rackmeter driver for G5 Xserve is giving bogus
  led patterns. So far I have seen at least the following:
  
  a) With static load the leds seems to be sane and report CPU
  usage properly, but after few minutes they go completely OFF,
  even if the CPU load remains high.
  
  b) On a completely idle system, leds alter between all OFF and all ON
  roughly once a second.
  
  Unfortunately I cannot say which was the last kernel where this worked
  properly... These servers were away from normal use for a while due
  to PSU issues.
 
 And mine is dead due to ... PSU issue :-(

I had 4 dead servers, of which I have now managed get 2 again back
running by recapping the PSU.

 It could be that what we use to get the idle time isn't correct
 anymore...

It seems sometimes the idle ticks exceed total ticks and mess up
load calculation. This will explain the b) case behaviour at least.
I added the following debug patch:

@@ -234,6 +234,10 @@ static void rackmeter_do_timer(struct work_struct *work)
 */
load = (9 * (total_ticks - idle_ticks)) / total_ticks;

+   if (load  10)
+   pr_err(load: %d total: %u idle: %u\n, load,
+   total_ticks, idle_ticks);
+
offset = cpu  3;
cumm = 0;
for (i = 0; i  8; i++) {

Which shows:

[  795.832701] load: 515 total: 833 idle: 8333661
[  796.792767] load: 515 total: 833 idle: 8333551
[  796.832770] load: 515 total: 833 idle: 8333656
[  797.292799] load: 515 total: 834 idle: 8333532
[  798.082856] load: 515 total: 834 idle: 8333591
[  798.792903] load: 515 total: 833 idle: 8333424
[  798.832909] load: 515 total: 833 idle: 8333571
[  799.292937] load: 515 total: 834 idle: 8333459
[  799.832973] load: 515 total: 833 idle: 8333551
[  800.793038] load: 515 total: 833 idle: 8333414
[  800.833045] load: 515 total: 833 idle: 8333583
[  801.293071] load: 515 total: 834 idle: 8333455
[  801.833107] load: 515 total: 833 idle: 8333564

I'm running with HZ=100 so the values are still probably within
jiffy resolution, so perhaps the calculation should first do
idle = min(idle, total)?

A.
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v3 3/3] kvm/powerpc: Export HCALL reason codes

2015-05-12 Thread Hemant Kumar



On 05/12/2015 03:44 AM, Scott Wood wrote:

On Fri, 2015-05-08 at 06:23 +0530, Hemant Kumar wrote:

For perf to analyze the KVM events like hcalls, we need the
hypervisor calls and their codes to be exported through uapi.

This patch moves most of the pSeries hcall codes from
arch/powerpc/include/asm/hvcall.h to
arch/powerpc/include/uapi/asm/hcall_codes.h.
It also moves the mapping hcall_code-to-hcall_reason from
arch/powerpc/kvm/trace_hv.h to
arch/powerpc/include/uapi/asm/trace_hcall.h.

Signed-off-by: Hemant Kumar hem...@linux.vnet.ibm.com
---
  arch/powerpc/include/asm/hvcall.h   | 120 +--
  arch/powerpc/include/uapi/asm/hcall_codes.h | 123 
  arch/powerpc/include/uapi/asm/trace_hcall.h | 122 +++
  arch/powerpc/kvm/trace_hv.h | 117 +-

When moving to uapi please add proper namespacing to indicate that this
is pseries specific.



Sure, will add that.


diff --git a/arch/powerpc/include/uapi/asm/trace_hcall.h 
b/arch/powerpc/include/uapi/asm/trace_hcall.h
new file mode 100644
index 000..00eac01
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/trace_hcall.h
@@ -0,0 +1,122 @@
+#ifndef _KVM_TRACE_HCALL_MAP_H
+#define _KVM_TRACE_HCALL_MAP_H
+
+#include hcall_codes.h
+
+#define kvm_trace_symbol_hcall\
+   {H_REMOVE,H_REMOVE},\
+   {H_ENTER,H_ENTER},  \
+   {H_READ,H_READ},\
+   {H_CLEAR_MOD,H_CLEAR_MOD},  \

This is a rather odd way of exposing an array to userspace...



Didn't get you here. Can you please elaborate?

I see some other files like arch/x86/include/uapi/asm/vmx.h exposing the 
reasons in a similar way.


Thanks for the review.

--
Hemant Kumar

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v2 1/2] powerpc/powernv: Add poweroff (EPOW, DPO) events support for PowerNV platform

2015-05-12 Thread Triggering

Accidentally hit the reply button instead of reply all. Re sending.

 On 12-May-2015, at 11:55, trigg mr.triggtr...@gmail.com wrote:
 
 Hi Stewart,
 
 On Tue, May 12, 2015 at 4:01 AM, Stewart Smith
 stew...@linux.vnet.ibm.com wrote:
 trigg mr.triggtr...@gmail.com writes:
 --- a/arch/powerpc/include/asm/opal-api.h
 +++ b/arch/powerpc/include/asm/opal-api.h
 @@ -730,6 +730,36 @@ struct opal_i2c_request {
__be64 buffer_ra;   /* Buffer real address */
 };
 
 +/*
 + * EPOW status sharing (OPAL and the host)
 + *
 + * The host will pass on OPAL, a buffer of length OPAL_EPOW_MAX_CLASSES
 + * to fetch system wide EPOW status. Each element in the returned buffer
 + * will contain bitwise EPOW status for each EPOW sub class.
 + */
 +
 +/* EPOW types */
 +enum OpalEpow {
 +   OPAL_EPOW_POWER = 0,/* Power EPOW */
 +   OPAL_EPOW_TEMP  = 1,/* Temperature EPOW */
 +   OPAL_EPOW_COOLING   = 2,/* Cooling EPOW */
 +   OPAL_MAX_EPOW_CLASSES   = 3,/* Max EPOW categories */
 +};
 Dont explicitly assign sequential numbers in an enum. Its taken care
 of by the compiler.
 
 This header is shared with firmware, and the exact values of each item
 does matter as it's ABI.
 
 This enum has no semantic difference with a series of #defines. So you
 are better
 off using #defines for this. Using enums with explicitly defined
 values beats the whole
 purpose of using them.
 
 The explicit numbers means that people think
 twice before inserting a new value in the middle and subtley breaking
 firmware ABI.
 
 I disagree. It wont prevent (even worsen) errors like two
 enum-constants being explicitly defined to same values.
 With implicit numbering you can atleast be sure that each
 enum-constant will be unique.
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 1/3] ASoC: fsl_sai: add sai master mode support

2015-05-12 Thread Mark Brown

On Mon, May 11, 2015 at 06:24:41PM +0800, Zidan Wang wrote:
 When sai works on master mode, set its bit clock and frame clock.
 
 SAI has 4 MCLK source, bus clock, MCLK1, MCLK2 and MCLK3. fsl_sai_set_bclk
 will select proper MCLK source, then calculate and set the bit clock divider.

Applied, thanks.


signature.asc
Description: Digital signature
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 2/3] ASoC: fsl_sai: Add tdm slots operation for SAI master mode

2015-05-12 Thread Mark Brown

On Mon, May 11, 2015 at 06:24:42PM +0800, Zidan Wang wrote:

   if (!sai-is_slave_mode) {
 + slot_width = sai-slot_width;
   ret = fsl_sai_set_bclk(cpu_dai, tx,
 - 2 * word_width * params_rate(params));
 + sai-slots * slot_width * params_rate(params));
   if (ret)

This seems to make TDM configuration mandatory which seems like a step
back - if it's been set up then of course we need to use it but if it's
not been configured we should be able to just infer something from
hw_params as we have been doing.  Checking to see if the relevant values
have been set and falling back to using hw_params seems better.

Otherwise this looks good.


signature.asc
Description: Digital signature
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 3/3] ASoC: fsl_sai: add 12kHz, 24kHz, 176.4kHz and 192kHz sample rate support

2015-05-12 Thread Mark Brown

On Mon, May 11, 2015 at 06:24:43PM +0800, Zidan Wang wrote:
 Normally we don't support 12kHz, 24kHz in audio driver, alsa didn't
 have formal definition of 12kHz, 24kHz, but alsa supply a way to
 support these sample rates. And add 176.4kHz and 192kHz support.

Applied, thanks.


signature.asc
Description: Digital signature
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

RE: [PATCH 1/2] powerpc/qman: Change fsl,qman-channel-id to cell-index

2015-05-12 Thread Roy Pledge

 
  I don't believe this is correct - let me explain the rational why we had two
 properties in the QMan portal to begin with.
 
  The two properties in question are cell-index and fsl,qman-channel-id.
 
  The cell-index property is used in u-boot as an index for the software 
  portal
 ID when adding the fsl,liodn from the U-boot table into the device tree.
 
 The device tree is not supposed to contain arbitrary software identifiers.

I agree - this is why the original device tree bindings removed cell-index as 
it can be calculated.
Unfortunately u-boot relied on this value being present so to be backward 
compatible we don't have a way to remove it.  I'm not sure on what the 
procedure is to change things u-boot relies on, I personal have always been 
very uncomfortable with the coupling between u-boot and Linux for things like 
this. 

 
  The  fsl,qman-channel-id property is used in Linux and corresponds to
  a hardware value that indicates which channel is dedicated to the
  software portal.
 
  While I'm not aware of a current SoC where the channel ID for a
  software portal does not match the index (i.e. SWP 0 uses channel 0,
  etc.)
 
 Thus there's no backward compatibility issue with redefining cell-index to
 mean the channel ID.

Channel IDs do change and are defined when the SoC is created (look at checks 
for QMan versions and adjustments for Pool Channel IDs in the driver).  If the 
channel ID for portal 0 ever becomes non zero we just end up having to make a 
mess in the code or reintroduce this field.

 
  it is possible that future SoCs could stray from this model, there is
  no reason for portal index to equal channel ID at all times.
 
 How can future SoCs dictate how we assign a software-defined identifier?
 If software wants it to be the same as the channel id, then it will be.
 
 If there is some aspect of the hardware itself (not the documentation) that
 cell-index currently corresponds to, other than the channel id, please make
 that clear.

Channel ID is defined in the SoC RTL - it is not controlled by software and it 
is not a software assigned identifier.  It is not possible for SW to set these 
values.

Roy
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: G5 Xserve rackmeter broken?

2015-05-12 Thread Benjamin Herrenschmidt

On Tue, 2015-05-12 at 20:55 +0300, Aaro Koskinen wrote:
 I'm running with HZ=100 so the values are still probably within
 jiffy resolution, so perhaps the calculation should first do
 idle = min(idle, total)?

Does it gives you a reasonable output if you do that ?

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH tip/core/rcu 09/15] powerpc: Fix smp_mb__before_spinlock()

2015-05-12 Thread Paul E. McKenney

From: Paul E. McKenney paul...@linux.vnet.ibm.com

Currently, smp_mb__before_spinlock() is defined to be smp_wmb()
in core code, but this is not sufficient on PowerPC.  This patch
therefore supplies an override for the generic definition to
strengthen smp_mb__before_spinlock() to smp_mb(), as is needed
on PowerPC.

Signed-off-by: Paul E. McKenney paul...@linux.vnet.ibm.com
Cc: linuxppc-dev@lists.ozlabs.org
---
 arch/powerpc/include/asm/barrier.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/include/asm/barrier.h 
b/arch/powerpc/include/asm/barrier.h
index a3bf5be111ff..1124f59b8df4 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -89,5 +89,6 @@ do {  
\

 #define smp_mb__before_atomic() smp_mb()
 #define smp_mb__after_atomic()  smp_mb()
+#define smp_mb__before_spinlock()   smp_mb()

 #endif /* _ASM_POWERPC_BARRIER_H */
-- 
1.8.1.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH V3 9/9] powerpc/eeh: handle VF PE properly

2015-05-12 Thread Gavin Shan

On Mon, May 04, 2015 at 03:07:38PM +0800, Wei Yang wrote:
Compared with Bus PE, VF PE just has one single pci function. This
 ^^^
introduces the difference of error handling on a VF PE.


Lets have simple example to make the discussion easy: Suppose that one
particular PF has only one IOV BAR and we're going to enable 8 VFs. Also
the VF BAR size exceeds 64MB. For this case, we're going to have 2 VF
groups, which have 4 VFs. First of all, there are 8 PE numbers consumed
in total and lets say they're PE#(x) ... PE#(x+7).

For above case, the M64 runs in single mode. In other word, we just
need two M64 BARs here and they're owned by PE#x and PE#(x+1) separately.
Equally speaking, the relationship between PE# and VF index becomes as
below from MMIO's view: PE#x (VF#0/1/2/3), PE#x(VF#4/5/6/7). However,
this mapping isn't updated to PELTM. On the other hand, each VF has
separate PE# number as we update to PELTM. Lastly, we have to chain
PE#x/+1/+2/+3 and PE#x+4/5/6/7 because the VF group is sharing M64
resources. Hopefully, I didn't miss anything.

If above description is complete, I think you need expose VF (PE) group
to EEH. Potentially, I guess you need improve it later for VFIO so that
it can pass VF (PE) group to *same* guest. Generally speaking, the VF (PE)
group should be visible to EEH as single PE, which is similar to what I
did for huge BAR support where we have master/slave PE. The master PE
is exposed to EEH while the slave PEs are invisible from EEH. However,
the situation for VF (PE) group is different, but not too much: the
slave PEs (PE#x+1/2/3, PE#x+5/6/7 in this case) do contain PCI device.
So think you can something as below if you can't figure out better
solution:

- When the PE group is finalized in pcibios_fixup_sriov_enable(), you
  mark the master/slave PE accordingly, and put the slave PE to the
  master PE's slave list.
- When doing EEH probe on VF's edev, detach it to EEH with master PE#.

With above mechanism, each VF PE can potentially have multiple EEH devices,
not one single PCI function as you said in the commit log, which needs to
be changed accordingly.

For example in the hotplug case, EEH needs to remove and re-create the VF
properly. In the case when PF's error_detected() disable SRIOV, this patch
introduces a flag to mark the eeh_dev of a VF to avoid the slot_reset() and
resume(). Since the FW is not ware of the VF, this patch handles the VF
restore/reset in kernel directly.

This patch is to handle the VF PE properly in these cases.

Signed-off-by: Wei Yang weiy...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/eeh.h   |1 +
 arch/powerpc/kernel/eeh.c|1 +
 arch/powerpc/kernel/eeh_driver.c |  108 ++
 arch/powerpc/kernel/eeh_pe.c |3 +-
 4 files changed, 90 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 78c8bec..43e8a24 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -141,6 +141,7 @@ struct eeh_dev {
   struct pci_controller *phb; /* Associated PHB   */
   struct pci_dn *pdn; /* Associated PCI device node   */
   struct pci_dev *pdev;   /* Associated PCI device*/
+  intin_error;/* Error flag for eeh_dev   */

Begining from v2 (maybe I'm wrong about the revision), I suggested to
remove in_error because we already had similar flag EEH_DEV_NO_HANDLER.
I don't think it's different to merge them and I already told how to do
that. If you still find it's difficult to do it, I'm fine to keep it and
I'll fix it up later by myself.

 #ifdef CONFIG_PCI_IOV
   struct pci_dev *physfn; /* Associated PF PORT   */
 #endif /* CONFIG_PCI_IOV */
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 221e280..077c3d1 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1226,6 +1226,7 @@ void eeh_remove_device(struct pci_dev *dev)
* from the parent PE during the BAR resotre.
*/
   edev-pdev = NULL;
+  edev-in_error = 0;
   dev-dev.archdata.edev = NULL;
   if (!(edev-pe-state  EEH_PE_KEEP))
   eeh_rmv_from_parent_pe(edev);
diff --git a/arch/powerpc/kernel/eeh_driver.c 
b/arch/powerpc/kernel/eeh_driver.c
index 89eb4bc..6e42cad 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -211,6 +211,7 @@ static void *eeh_report_error(void *data, void *userdata)
   if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
   if (*res == PCI_ERS_RESULT_NONE) *res = rc;

+  edev-in_error = 1;
   eeh_pcid_put(dev);
   return NULL;
 }
@@ -282,7 +283,8 @@ static void *eeh_report_reset(void *data, void *userdata)

   if (!driver-err_handler ||
   !driver-err_handler-slot_reset ||
-  (edev-mode  EEH_DEV_NO_HANDLER)) {
+  (edev-mode

Re: [PATCH V4 1/3] mm/thp: Split out pmd collpase flush into a separate functions

2015-05-12 Thread Kirill A. Shutemov

On Tue, May 12, 2015 at 11:38:32AM +0530, Aneesh Kumar K.V wrote:
 Architectures like ppc64 [1] need to do special things while clearing
 pmd before a collapse. For them this operation is largely different
 from a normal hugepage pte clear. Hence add a separate function
 to clear pmd before collapse. After this patch pmdp_* functions
 operate only on hugepage pte, and not on regular pmd_t values
 pointing to page table.
 
 [1] ppc64 needs to invalidate all the normal page pte mappings we
 already have inserted in the hardware hash page table. But before
 doing that we need to make sure there are no parallel hash page
 table insert going on. So we need to do a kick_all_cpus_sync()
 before flushing the older hash table entries. By moving this to
 a separate function we capture these details and mention how it
 is different from a hugepage pte clear.
 
 This patch is a cleanup and only does code movement for clarity.
 There should not be any change in functionality.
 
 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

For the patchset:

Acked-by: Kirill A. Shutemov kirill.shute...@linux.intel.com

-- 
 Kirill A. Shutemov
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v2] powerpc/mce: fix off by one errors in mce event handling

2015-05-12 Thread Michael Ellerman

On Tue, 2015-05-12 at 13:23 +1000, Daniel Axtens wrote:
 Before 69111bac42f5 (powerpc: Replace __get_cpu_var uses), in
 save_mce_event, index got the value of mce_nest_count, and
 mce_nest_count was incremented *after* index was set.
 
 However, that patch changed the behaviour so that mce_nest count was
 incremented *before* setting index.
 
 This causes an off-by-one error, as get_mce_event sets index as
 mce_nest_count - 1 before reading mce_event.  Thus get_mce_event reads
 bogus data, causing warnings like
 Machine Check Exception, Unknown event version 0 !
 and breaking MCEs handling.
 
 Restore the old behaviour and unbreak MCE handling by subtracting one
 from the newly incremented value.
 
 The same broken change occured in machine_check_queue_event (which set
 a queue read by machine_check_process_queued_event).  Fix that too,
 unbreaking printing of MCE information.
 
 Fixes: 69111bac42f5 (powerpc: Replace __get_cpu_var uses)
 CC: sta...@vger.kernel.org
 CC: Mahesh Salgaonkar mah...@linux.vnet.ibm.com
 CC: Christoph Lameter c...@linux.com
 Signed-off-by: Daniel Axtens d...@axtens.net
 
 ---
 
 The code is still super racy, but this at least unbreaks the common,
 non-reentrant case for now until we figure out how to fix it properly.
 The proper fix will likely be quite invasive so it might be worth
 picking this up in stable rather than waiting for that?
 
 mpe: the generated asm is below
 
 0070 .save_mce_event:
   70:   e9 6d 00 30 ld  r11,48(r13)
   74:   3d 22 00 00 addis   r9,r2,0
   78:   39 29 00 00 addir9,r9,0
   7c:   7d 2a 4b 78 mr  r10,r9
   80:   39 29 00 08 addir9,r9,8
   84:   7d 8a 58 2e lwzxr12,r10,r11
   88:   39 8c 00 01 addir12,r12,1
   8c:   7d 8a 59 2e stwxr12,r10,r11
   90:   e9 0d 00 30 ld  r8,48(r13)
   94:   7d 4a 40 2e lwzxr10,r10,r8
   98:   39 4a ff ff addir10,r10,-1
   9c:   2f 8a 00 63 cmpwi   cr7,r10,99
 
 AIUI, we get the per-cpu area in 70, the addr of mce_nest_count itself
 in 80, then load, incr, stor in 84-8c, then we get the address and
 load again in 90-94, then subtract 1 to make the count sensible again,
 then 9c is the conditional `if (index = MAX_MC_EVT)'
 
 I think that was what you expected?

Sort of. I wasn't expecting it to reload it after the increment. But I guess
that's an artifact of the macros.

Anyway it's much better than the current code which is just broken always.

cheers


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH V3 8/9] powerpc/powernv: Support PCI config restore for VFs

2015-05-12 Thread Gavin Shan

On Tue, May 12, 2015 at 04:16:45PM +0800, Wei Yang wrote:
On Tue, May 12, 2015 at 04:34:03PM +1000, Gavin Shan wrote:

+  /* Disable Completion Timeout */
+  if (pcie_cap) {
+  pnv_pci_cfg_read(pdn, pcie_cap + PCI_EXP_DEVCAP2, 4, cap2);
+  if (cap2  0x10) {
+  pnv_pci_cfg_read(pdn, pcie_cap + PCI_EXP_DEVCTL2, 4, 
cap2);
+  cap2 |= 0x10;
+  pnv_pci_cfg_write(pdn, pcie_cap + PCI_EXP_DEVCTL2, 4, 
cap2);
+  }
+  }
+
+  /* Enable SERR and parity checking */
+  pnv_pci_cfg_read(pdn, PCI_COMMAND, 2, cmd);
+  cmd |= (PCI_COMMAND_PARITY | PCI_COMMAND_SERR);
+  pnv_pci_cfg_write(pdn, PCI_COMMAND, 2, cmd);
+
+  /* Enable report various errors */
+  if (pcie_cap) {
+  pnv_pci_cfg_read(pdn, pcie_cap + PCI_EXP_DEVCTL, 2, devctl);
+  devctl = ~PCI_EXP_DEVCTL_CERE;
+  devctl |= (PCI_EXP_DEVCTL_NFERE |
+ PCI_EXP_DEVCTL_FERE |
+ PCI_EXP_DEVCTL_URRE);
+  pnv_pci_cfg_write(pdn, pcie_cap + PCI_EXP_DEVCTL, 2, devctl);
+  }
+
+  /* Enable ECRC generation and check */
+  if (pcie_cap) {
+  aer_cap = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR);
+  pnv_pci_cfg_read(pdn, aer_cap + PCI_ERR_CAP, 4, aer_capctl);
+  aer_capctl |= (PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE);
+  pnv_pci_cfg_write(pdn, aer_cap + PCI_ERR_CAP, 4, aer_capctl);
+  }
+
+  return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+

The code is copied over from skiboot firmware. I still dislike the fact that
we have to maintain two sets of similar functions in skiboot/kernel. I still
believe the way I suggested can help: the firmware exports the error routing
rules and kernel has support it based on the rules. With it, the skiboot is
the source of the information to avoid mismatching between kernel/firmware.

Yes, it looks we have duplicate code in kernel and skiboot.

As you suggest, if we export some bit map from device node, we still have the
real logic in kernel, until we remove that part in skiboot.

By removing that part in skiboot, we may have some compatibility problem. For
example, an old kernel may not run on the new version of skiboot.


It's fine to keep two set code which bear with same rule, which is exported
from skiboot. In that case, the rule is the only thing we have to care. We
don't need care the code any more to avoid mismatch between kernel/firmware.


Ok, duplication is reasonable, then the major point for this is we need to
have a clear rule for restoring configuration for a device.


Well, I have to explain a bit more if I didn't make myself clear enough, then
you change the code in another way, which will waste your time.

- From skiboot, each PHB's device node maintains the rules, which *could* be
  described as the data structures I have given in previous replies if you
  can't figure out better data structures.
- Skiboot will reinitialize those devices for the following 3 cases: PCI
  enumeration, PCI config space restore requested from EEH, after PCI 
hotplug/reset.
  Obviously, the code needs changes to utilize the rules in PHB's device node.
- Kernel will do similiar thing as skiboot will do: Reinitialize VFs according
  to the rules in PHB's device node.

Yes, we have duplicated code, not rules. Hopefully, I make myself clear enough.

Than I suggest we could have another patch set to handle this. Define the rule
clearly and restore the configuration in kernel when skiboot firmware export
such rules.


Sure.

Thanks,
Gavin

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: mpt2sas DMA mask

2015-05-12 Thread Benjamin Herrenschmidt

On Tue, 2015-05-12 at 17:07 -0500, Brian King wrote:
 The mpt2sas driver was changed late last year in that it now requests a 64 
 bit DMA
 mask, then requests a 32 bit coherent DMA mask, then later requests a 64 bit 
 coherent
 DMA mask. This was 5fb1bf8aaa832e1e9ca3198de7bbecb8eff7db9c. This breaks 64 
 bit DMA
 support for mpt2sas on Power and we always fall back to using 32 bit DMA. 
 Looking
 at the commit log, it looks like this was an intentional change.
 
 Ben - you had a patch set you had posted to the list back in Feb of this 
 year, but it
 doesn't look like it got merged. 

Right, it was broken, we can't do that unfortunately. It's a hard
problem to fix.

 https://lists.ozlabs.org/pipermail/linuxppc-dev/2015-February/125087.html
 
 This would fix the issue I'm seeing on mpt2sas. Do you plan to dust that 
 patch set
 off and upstream it? Were there issues with it that still need to be resolved?

No that patch doesn't actually work.

What we need is essentially a new set of DMA ops that can route an
individual map request via the iommu or the bypass window depending on
what mask applies.

Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] mpt2sas: Fall back to 64 bit coherent mask if 64 bit DMA / 32 bit coherent mask not supported

2015-05-12 Thread Brian King

On 05/12/2015 05:10 PM, Benjamin Herrenschmidt wrote:
 On Tue, 2015-05-12 at 17:07 -0500, Brian King wrote:
 The mpt2sas driver was changed late last year in that it now requests a 64 
 bit DMA
 mask, then requests a 32 bit coherent DMA mask, then later requests a 64 bit 
 coherent
 DMA mask. This was 5fb1bf8aaa832e1e9ca3198de7bbecb8eff7db9c. This breaks 64 
 bit DMA
 support for mpt2sas on Power and we always fall back to using 32 bit DMA. 
 Looking
 at the commit log, it looks like this was an intentional change.

 Ben - you had a patch set you had posted to the list back in Feb of this 
 year, but it
 doesn't look like it got merged. 
 
 Right, it was broken, we can't do that unfortunately. It's a hard
 problem to fix.
 
 https://lists.ozlabs.org/pipermail/linuxppc-dev/2015-February/125087.html

 This would fix the issue I'm seeing on mpt2sas. Do you plan to dust that 
 patch set
 off and upstream it? Were there issues with it that still need to be 
 resolved?
 
 No that patch doesn't actually work.
 
 What we need is essentially a new set of DMA ops that can route an
 individual map request via the iommu or the bypass window depending on
 what mask applies.

Ok. For the interim, then, Sreekanth, would you consider a patch such as the one
below to allow mpt2sas to support 64 bit DMA on architectures that don't support
a 32 bit coherent mask and a 64 bit dma mask at the same time? Only compile 
tested
at this point. Will run it on a Power machine shortly to confirm it doesn't
break anything...


-- 
Brian King
Power Linux I/O
IBM Linux Technology Center


8---


Commit 5fb1bf8aaa832e1e9ca3198de7bbecb8eff7db9c broke 64 bit DMA for mpt2sas on 
Power.
That commit changed the sequence for setting up the DMA and coherent DMA masks 
so
that during initialization the driver requests a 64 bit DMA mask and a 32 bit 
consistent
DMA mask, then later requests a 64 bit consistent DMA mask. The Power 
architecture does
not currently support this, which results in always falling back to a 32 bit 
DMA window,
which has a negative impact on performance. Tweak this algorithm slightly so 
that
if requesting a 32 bit consistent mask fails after we've successfully set a 64 
bit
DMA mask, just try to get a 64 bit consistent mask. This should preserve 
existing
behavior on platforms that support mixed mask setting and restore previous 
functionality
to those that do not.

Signed-off-by: Brian King brk...@linux.vnet.ibm.com
---

 drivers/scsi/mpt2sas/mpt2sas_base.c |   14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff -puN drivers/scsi/mpt2sas/mpt2sas_base.c~mpt2sas_dma_mask 
drivers/scsi/mpt2sas/mpt2sas_base.c
--- linux/drivers/scsi/mpt2sas/mpt2sas_base.c~mpt2sas_dma_mask  2015-05-12 
16:09:58.228687852 -0500
+++ linux-bjking1/drivers/scsi/mpt2sas/mpt2sas_base.c   2015-05-12 
17:38:31.588060903 -0500
@@ -1200,12 +1200,14 @@ _base_config_dma_addressing(struct MPT2S
const uint64_t required_mask =
dma_get_required_mask(pdev-dev);
if ((required_mask  DMA_BIT_MASK(32)) 
-   !pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) 
-   !pci_set_consistent_dma_mask(pdev, consistent_dma_mask)) {
-   ioc-base_add_sg_single = _base_add_sg_single_64;
-   ioc-sge_size = sizeof(Mpi2SGESimple64_t);
-   ioc-dma_mask = 64;
-   goto out;
+   !pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+   if (!pci_set_consistent_dma_mask(pdev, 
consistent_dma_mask) ||
+   !pci_set_consistent_dma_mask(pdev, 
DMA_BIT_MASK(64))) {
+   ioc-base_add_sg_single = 
_base_add_sg_single_64;
+   ioc-sge_size = sizeof(Mpi2SGESimple64_t);
+   ioc-dma_mask = 64;
+   goto out;
+   }
}
}
 
_

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

mpt2sas DMA mask

2015-05-12 Thread Brian King

The mpt2sas driver was changed late last year in that it now requests a 64 bit 
DMA
mask, then requests a 32 bit coherent DMA mask, then later requests a 64 bit 
coherent
DMA mask. This was 5fb1bf8aaa832e1e9ca3198de7bbecb8eff7db9c. This breaks 64 bit 
DMA
support for mpt2sas on Power and we always fall back to using 32 bit DMA. 
Looking
at the commit log, it looks like this was an intentional change.

Ben - you had a patch set you had posted to the list back in Feb of this year, 
but it
doesn't look like it got merged. 

https://lists.ozlabs.org/pipermail/linuxppc-dev/2015-February/125087.html

This would fix the issue I'm seeing on mpt2sas. Do you plan to dust that patch 
set
off and upstream it? Were there issues with it that still need to be resolved?

Thanks,

Brian

-- 
Brian King
Power Linux I/O
IBM Linux Technology Center


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 1/2] powerpc/qman: Change fsl,qman-channel-id to cell-index

2015-05-12 Thread Scott Wood

On Tue, 2015-05-12 at 16:19 -0500, Pledge Roy-R01356 wrote:
  
   I don't believe this is correct - let me explain the rational why we had 
   two
  properties in the QMan portal to begin with.
  
   The two properties in question are cell-index and fsl,qman-channel-id.
  
   The cell-index property is used in u-boot as an index for the software 
   portal
  ID when adding the fsl,liodn from the U-boot table into the device tree.
  
  The device tree is not supposed to contain arbitrary software identifiers.
 
 I agree - this is why the original device tree bindings removed
 cell-index as it can be calculated.
 Unfortunately u-boot relied on
 this value being present so to be backward compatible we don't have a
 way to remove it.  I'm not sure on what the procedure is to change
 things u-boot relies on, 

Generally the procedure is that we don't change it.  It wouldn't be so
bad if using an old U-Boot just meant that datapath doesn't work with
upstream kernels (since that doesn't work now), but a dts that makes
existing U-Boot crash is another matter.

 I personal have always been very uncomfortable with the coupling
 between u-boot and Linux for things like this. 

Same here.  I've said for a while that I thought the dtses should live
in the U-Boot tree due to such coupling (the dtb/kernel interface has
well-defined binding documents; the dts/U-Boot interface doesn't), but
nobody else seemed interested.  It would also be good to minimize new
U-Boot fixups in favor of having the kernel do it, even if it results
in unpleasant code duplication.


  
   The  fsl,qman-channel-id property is used in Linux and corresponds to
   a hardware value that indicates which channel is dedicated to the
   software portal.
  
   While I'm not aware of a current SoC where the channel ID for a
   software portal does not match the index (i.e. SWP 0 uses channel 0,
   etc.)
  
  Thus there's no backward compatibility issue with redefining cell-index to
  mean the channel ID.
 
 Channel IDs do change and are defined when the SoC is created

But for SoCs that already exist, they won't change, right?  We don't
need to care about what existing U-Boot does on new SoCs, since U-Boot
would need to be changed to support the new SoC at all.

  (look at checks for QMan versions and adjustments for Pool Channel IDs
 in the driver).  If the channel ID for portal 0 ever becomes non zero
 we just end up having to make a mess in the code or reintroduce this
 field.

What defines that portal as portal 0?

   it is possible that future SoCs could stray from this model, there is
   no reason for portal index to equal channel ID at all times.
  
  How can future SoCs dictate how we assign a software-defined identifier?
  If software wants it to be the same as the channel id, then it will be.
  
  If there is some aspect of the hardware itself (not the documentation) that
  cell-index currently corresponds to, other than the channel id, please make
  that clear.
 
 Channel ID is defined in the SoC RTL - it is not controlled by software
 and it is not a software assigned identifier.  It is not possible for
 SW to set these values.

I said other than the channel id.  In particular, I was asking about
the concept of portal index.

-Scott


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH V3 6/9] powerpc/powernv: create/release eeh_dev for VF

2015-05-12 Thread Gavin Shan

On Tue, May 12, 2015 at 04:06:43PM +0800, Wei Yang wrote:
On Mon, May 11, 2015 at 12:48:56PM +1000, Gavin Shan wrote:
On Mon, May 04, 2015 at 03:07:35PM +0800, Wei Yang wrote:

Please order this patch and PATCH[5] because EEH device is expected to
be created before EEH PE.

EEH on powerpc platform needs eeh_dev structure to track the pci device
  ^^^
PCI

status. Since VFs are created/released dynamically, VF's eeh_dev is also
dynamically created/released in system.

This patch creates/removes eeh_dev when pci_dn is created/removed for VFs,
and marks it with EEH_DEV_VF type.

Signed-off-by: Wei Yang weiy...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/eeh.h |7 +++
 arch/powerpc/kernel/eeh.c  |4 
 arch/powerpc/kernel/eeh_dev.c  |   20 
 arch/powerpc/kernel/pci_dn.c   |7 +++
 4 files changed, 38 insertions(+)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 56e8cd9..2067de4 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -124,6 +124,7 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe)
 #define EEH_DEV_NO_HANDLER  (1  8)/* No error handler */
 #define EEH_DEV_SYSFS   (1  9)/* Sysfs created
 */
 #define EEH_DEV_REMOVED (1  10)   /* Removed permanently  
 */
+#define EEH_DEV_VF  (1  11)   /* VF port  */


Why you need this flag? I guess edev-physfn can be used to distinguish
it's a normal or VF eeh_dev.


Just like we have EEH_DEV_BRIDGE and EEH_DEV_DS_PORT, I use the flag
EEH_DEV_VF to mark it a VF eeh_dev.



They're not equal from my point of review. When the VF edev is created
from VF's pdn, edev-physfn is determined. Furthermore, edev-physfn
is only meaningful to VF's edev. It's why I think you needn't introduce
another one flag (EEH_DEV_VF), which is just duplicated to (edev-physfn).
So I think you can use following condition to check if one edev is created
from VF's pdn or others:

if (edev-physfn)
/* VF's edev */
else
/* All other cases */

In order to use the above condition, edev-phyfn is initialized to dereference
the VF's physical function when it's created from VF's pdn in 
arch/powerpc/kernel/pdn.c.

Thanks,
Gavin

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH RESEND v6 1/2] arm64: dts: Add the arasan sdhci nodes in apm-storm.dtsi

2015-05-12 Thread Ulf Hansson

On 12 May 2015 at 09:28, Arnd Bergmann a...@arndb.de wrote:
 On Tuesday 12 May 2015 11:03:46 Suman Tripathi wrote:
  @@ -533,6 +567,15 @@
  interrupts = 0x0 0x4f 0x4;
  };
 
  +   sdhci0: sdhci@1c00 {
  +   compatible = arasan,sdhci-4.9a;
  +   reg = 0x0 0x1c00 0x0 0x100;
  +   interrupts = 0x0 0x49 0x4;
  +   dma-coherent;
  +   clock-names = clk_xin, clk_ahb;
  +   clocks = sdioclk 0, ahbclk 0;
  +   };
  +
  phy1: phy@1f21a000 {
  compatible = apm,xgene-phy;
  reg = 0x0 0x1f21a000 0x0 0x100;
  --
  1.8.2.1
 

 Can anyone from dt community review this patch ? I have changed the dts node
 names from sdhc to sdhci as per Arnd, Michael comments .

 I was actually asking for it to be named 'mmc', not 'sdhci', because the
 name is supposed to indicate the purpose of the device, not the
 implementation. I realize that we are inconsistent here, just as with
 'uart' vs 'serial', and that ePAPR does not define what to do.

Then we need a common name to address the following purposes: SD, MMC,
eMMC, SDIO.


 We should probably add something to 
 Documentation/devicetree/bindings/mmc/mmc.txt
 about this topic and change all the dts files accordingly (unless there
 is a risk for regressions). At the moment, the mmc.txt file also includes
 an example with 'sdhci', not 'mmc'.


I am happy to change, as long as we can come up with something better.

Kind regards
Uffe
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 0/3] cpuidle: updates related to tick_broadcast_enter() failures

2015-05-12 Thread Rafael J. Wysocki

On Tuesday, May 12, 2015 10:41:35 AM Daniel Lezcano wrote:
 On 05/12/2015 01:31 AM, Rafael J. Wysocki wrote:
  On Monday, May 11, 2015 07:40:41 PM Daniel Lezcano wrote:
  On 05/10/2015 01:15 AM, Rafael J. Wysocki wrote:
  On Saturday, May 09, 2015 10:33:05 PM Rafael J. Wysocki wrote:
  On Saturday, May 09, 2015 10:11:41 PM Rafael J. Wysocki wrote:
  On Saturday, May 09, 2015 11:19:16 AM Preeti U Murthy wrote:
  Hi Rafael,
 
  On 05/08/2015 07:48 PM, Rafael J. Wysocki wrote:
 
  [cut]
 
 
  +/* Take note of the planned idle state. */
  +idle_set_state(smp_processor_id(), target_state);
 
  And I wouldn't do this either.
 
  The behavior here is pretty much as though the driver demoted the 
  state chosen
  by the governor and we don't call idle_set_state() again in those 
  cases.
 
  Why is this wrong?
 
  It is not wrong, but incomplete, because demotions done by the 
  cpuidle driver
  should also be taken into account in the same way.
 
  But I'm seeing that the recent patch of mine that made 
  cpuidle_enter_state()
  call default_idle_call() was a mistake, because it might confuse 
  find_idlest_cpu()
  significantly as to what state the CPU is in.  I'll drop that one for 
  now.
 
  OK, done.
 
  So after I've dropped it I think we need to do three things:
  (1) Move the idle_set_state() calls to cpuidle_enter_state().
  (2) Make cpuidle_enter_state() call default_idle_call() again, but this 
  time
do that *before* it has called idle_set_state() for target_state.
  (3) Introduce demotion as per my last patch.
 
  Let me cut patches for that.
 
  Done as per the above and the patches follow in replies to this messge.
 
  All on top of the current linux-next branch of the linux-pm.git tree.
 
  IMO the resulting code is more and more confusing.
 
  Why is it confusing?
 
  What part of it is confusing?
 
  Patches [1-2/3] simply replace https://patchwork.kernel.org/patch/6326761/
  and I'm not sure why that would be confusing.
 
  Patch [3/3] simply causes cpuidle_enter_state() to pick up a more suitable
  state if tick_broadcast_enter() fails instead of returning an error code
  in that case.  What exactly is confusing in that?
 
  Except I miss something, the tick_broadcast_enter can fail only if the
  local timer of the current cpu is used as a broadcast timer (which is
  the case today for PPC only).
 
  well, why does this matter?
 
  The correct fix would be to tie this local timer with the cpu power
  domain and disable the idle state powering down this domain like it was
  done for the renesas cpuidle driver.
 
  IOW, the cpu power domain is in use (because of its local timer), so we
  shouldn't shut it down.
 
  No ?
 
  Sorry, I'm not sure what you're talking about.
 
  The problem at hand is that tick_broadcast_enter() can fail and we need to
  handle that.  If we can prevent it from ever failing, that would be awesome,
  but quite honestly I don't see how to do that ATM.
 
 Ok, sorry. Let me clarify.
 
 You did a mechanism two years ago with pm_genpd_attach_cpuidle and 
 power_on/off. That disables a cpuidle state when a power domain is in use.
 
 The idea I was proposing is to reuse this approach.
 
 The logic is:
 
 The local timer is in use, this idle state power downs this timer, then 
 disable it.

I'm not sure it's about powering down.  Stopping rather (which may or may
not involve powering down).

 So it is when the broadcast timer is 'bound_on' a cpu, we disable the 
 idle states. That could be done via a loop looking for the TIMER_STOP 
 flag or via the power domain.
 
 Hence the cpuidle_select will never return a state which powers downs 
 the local cpu (because they are disabled) and tick_broadcast_enter can't 
 fail because it is never called.
 
 Does it make more sense ?

Well, you've not explained what's confusing in the code after this series
in the first place. :-)

Second, quite honestly, I don't see a connection to genpd here.

What you seem to be saying is maybe we can eliminate the need to check the
return value of tick_broadcast_enter() in the idle loop if we proactively
disable the TIMER_STOP idle states of a CPU when we start to use that CPU's
timer as a broadcast one.

So this seems to be about the timekeeping rather than power domains, because
that's where the broadcast thing is done.  So the code setting up the CPU's
timer for broadcast would pretty much need to pause cpuidle, go through the
CPU's idle states and disable the TIMER_STOP ones.  And do the reverse when the
timer is not going the be used for broadcast any more.  So question is whether
or not this is actually really more straightforward than checking the return
value of tick_broadcast_enter() in the idle loop after all.


-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] powerpc/config: Enable bnx2x on ppc64 and pseries defconfigs

2015-05-12 Thread Anton Blanchard

Signed-off-by: Anton Blanchard an...@samba.org
---
 arch/powerpc/configs/ppc64_defconfig  | 1 +
 arch/powerpc/configs/pseries_defconfig| 1 +
 arch/powerpc/configs/pseries_le_defconfig | 1 +
 3 files changed, 3 insertions(+)

diff --git a/arch/powerpc/configs/ppc64_defconfig 
b/arch/powerpc/configs/ppc64_defconfig
index aad501a..a97efc2 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -155,6 +155,7 @@ CONFIG_ACENIC=m
 CONFIG_ACENIC_OMIT_TIGON_I=y
 CONFIG_PCNET32=y
 CONFIG_TIGON3=y
+CONFIG_BNX2X=m
 CONFIG_CHELSIO_T1=m
 CONFIG_BE2NET=m
 CONFIG_S2IO=m
diff --git a/arch/powerpc/configs/pseries_defconfig 
b/arch/powerpc/configs/pseries_defconfig
index c2e39f6..d09c723 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -154,6 +154,7 @@ CONFIG_ACENIC=m
 CONFIG_ACENIC_OMIT_TIGON_I=y
 CONFIG_PCNET32=y
 CONFIG_TIGON3=y
+CONFIG_BNX2X=m
 CONFIG_CHELSIO_T1=m
 CONFIG_BE2NET=m
 CONFIG_S2IO=m
diff --git a/arch/powerpc/configs/pseries_le_defconfig 
b/arch/powerpc/configs/pseries_le_defconfig
index 09bc96e..5ad0253 100644
--- a/arch/powerpc/configs/pseries_le_defconfig
+++ b/arch/powerpc/configs/pseries_le_defconfig
@@ -155,6 +155,7 @@ CONFIG_ACENIC=m
 CONFIG_ACENIC_OMIT_TIGON_I=y
 CONFIG_PCNET32=y
 CONFIG_TIGON3=y
+CONFIG_BNX2X=m
 CONFIG_CHELSIO_T1=m
 CONFIG_BE2NET=m
 CONFIG_S2IO=m
-- 
2.1.4

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel v10 03/34] powerpc/powernv/ioda: Clean up IOMMU group registration

2015-05-12 Thread Gavin Shan

On Tue, May 12, 2015 at 01:38:52AM +1000, Alexey Kardashevskiy wrote:
The existing code has 3 calls to iommu_register_group() and
all 3 branches actually cover all possible cases.

This replaces 3 calls with one and moves the registration earlier;
the latter will make more sense when we add TCE table sharing.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com

Thanks,
Gavin

---
 arch/powerpc/platforms/powernv/pci-ioda.c | 28 
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9a77f3c..8ca7abd 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1784,6 +1784,9 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
   if (WARN_ON(pe-tce32_seg = 0))
   return;

+  tbl = pe-tce32_table;
+  iommu_register_group(tbl, phb-hose-global_number, pe-pe_number);
+
   /* Grab a 32-bit TCE table */
   pe-tce32_seg = base;
   pe_info(pe,  Setting up 32-bit TCE table at %08x..%08x\n,
@@ -1818,7 +1821,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
   }

   /* Setup linux iommu table */
-  tbl = pe-tce32_table;
   pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
 base  28, IOMMU_PAGE_SHIFT_4K);

@@ -1840,8 +1842,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
   iommu_init_table(tbl, phb-hose-node);

   if (pe-flags  PNV_IODA_PE_DEV) {
-  iommu_register_group(tbl, phb-hose-global_number,
-   pe-pe_number);
   /*
* Setting table base here only for carrying iommu_group
* further down to let iommu_add_device() do the job.
@@ -1849,14 +1849,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
*/
   set_iommu_table_base(pe-pdev-dev, tbl);
   iommu_add_device(pe-pdev-dev);
-  } else if (pe-flags  (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
-  iommu_register_group(tbl, phb-hose-global_number,
-   pe-pe_number);
+  } else if (pe-flags  (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
   pnv_ioda_setup_bus_dma(pe, pe-pbus);
-  } else if (pe-flags  PNV_IODA_PE_VF) {
-  iommu_register_group(tbl, phb-hose-global_number,
-   pe-pe_number);
-  }

   return;
  fail:
@@ -1923,6 +1917,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
   if (WARN_ON(pe-tce32_seg = 0))
   return;

+  tbl = pe-tce32_table;
+  iommu_register_group(tbl, phb-hose-global_number, pe-pe_number);
+
   /* The PE will reserve all possible 32-bits space */
   pe-tce32_seg = 0;
   end = (1  ilog2(phb-ioda.m32_pci_base));
@@ -1954,7 +1951,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
   }

   /* Setup linux iommu table */
-  tbl = pe-tce32_table;
   pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
   IOMMU_PAGE_SHIFT_4K);

@@ -1974,8 +1970,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
   iommu_init_table(tbl, phb-hose-node);

   if (pe-flags  PNV_IODA_PE_DEV) {
-  iommu_register_group(tbl, phb-hose-global_number,
-   pe-pe_number);
   /*
* Setting table base here only for carrying iommu_group
* further down to let iommu_add_device() do the job.
@@ -1983,14 +1977,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
*/
   set_iommu_table_base(pe-pdev-dev, tbl);
   iommu_add_device(pe-pdev-dev);
-  } else if (pe-flags  (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
-  iommu_register_group(tbl, phb-hose-global_number,
-   pe-pe_number);
+  } else if (pe-flags  (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
   pnv_ioda_setup_bus_dma(pe, pe-pbus);
-  } else if (pe-flags  PNV_IODA_PE_VF) {
-  iommu_register_group(tbl, phb-hose-global_number,
-   pe-pe_number);
-  }

   /* Also create a bypass window */
   if (!pnv_iommu_bypass_disabled)
-- 
2.4.0.rc3.8.gfb3e7d5


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel v10 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table()

2015-05-12 Thread Gavin Shan

On Tue, May 12, 2015 at 01:38:54AM +1000, Alexey Kardashevskiy wrote:
At the moment iommu_free_table() only releases memory if
the table was initialized for the platform code use, i.e. it had
it_map initialized (which purpose is to track DMA memory space use).

With dynamic DMA windows, we will need to be able to release
iommu_table even if it was used for VFIO in which case it_map is NULL
so does the patch.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com

---
 arch/powerpc/kernel/iommu.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 3d47eb3..2c02d4c 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -714,8 +714,7 @@ void iommu_free_table(struct iommu_table *tbl, const char 
*node_name)
   unsigned int order;

   if (!tbl || !tbl-it_map) {
-  printk(KERN_ERR %s: expected TCE map for %s\n, __func__,
-  node_name);
+  kfree(tbl);

I'm not sure if the tbl needs to be checked against NULL as kfree() already
has the check. But it looks a bit strange to free NULL tbl from the code
itself.

Thanks,
Gavin

   return;
   }

-- 
2.4.0.rc3.8.gfb3e7d5


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel v10 06/34] vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU driver

2015-05-12 Thread Gavin Shan

On Tue, May 12, 2015 at 01:38:55AM +1000, Alexey Kardashevskiy wrote:
This moves page pinning (get_user_pages_fast()/put_page()) code out of
the platform IOMMU code and puts it to VFIO IOMMU driver where it belongs
to as the platform code does not deal with page pinning.

This makes iommu_take_ownership()/iommu_release_ownership() deal with
the IOMMU table bitmap only.

This removes page unpinning from iommu_take_ownership() as the actual
TCE table might contain garbage and doing put_page() on it is undefined
behaviour.

Besides the last part, the rest of the patch is mechanical.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
[aw: for the vfio related changes]
Acked-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: David Gibson da...@gibson.dropbear.id.au

Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com

---
Changes:
v9:
* added missing tce_iommu_clear call after iommu_release_ownership()
* brought @offset (a local variable) back to make patch even more
mechanical

v4:
* s/iommu_tce_build(tbl, entry + 1/iommu_tce_build(tbl, entry + i/
---
 arch/powerpc/include/asm/iommu.h|  4 --
 arch/powerpc/kernel/iommu.c | 55 -
 drivers/vfio/vfio_iommu_spapr_tce.c | 80 +++--
 3 files changed, 67 insertions(+), 72 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h 
b/arch/powerpc/include/asm/iommu.h
index 8353c86..e94a5e3 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -194,10 +194,6 @@ extern int iommu_tce_build(struct iommu_table *tbl, 
unsigned long entry,
   unsigned long hwaddr, enum dma_data_direction direction);
 extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
   unsigned long entry);
-extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-  unsigned long entry, unsigned long pages);
-extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
-  unsigned long entry, unsigned long tce);

 extern void iommu_flush_tce(struct iommu_table *tbl);
 extern int iommu_take_ownership(struct iommu_table *tbl);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 2c02d4c..8673c94 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -983,30 +983,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, 
unsigned long entry)
 }
 EXPORT_SYMBOL_GPL(iommu_clear_tce);

-int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-  unsigned long entry, unsigned long pages)
-{
-  unsigned long oldtce;
-  struct page *page;
-
-  for ( ; pages; --pages, ++entry) {
-  oldtce = iommu_clear_tce(tbl, entry);
-  if (!oldtce)
-  continue;
-
-  page = pfn_to_page(oldtce  PAGE_SHIFT);
-  WARN_ON(!page);
-  if (page) {
-  if (oldtce  TCE_PCI_WRITE)
-  SetPageDirty(page);
-  put_page(page);
-  }
-  }
-
-  return 0;
-}
-EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
-
 /*
  * hwaddr is a kernel virtual address here (0xc... bazillion),
  * tce_build converts it to a physical address.
@@ -1036,35 +1012,6 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned 
long entry,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_build);

-int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
-  unsigned long tce)
-{
-  int ret;
-  struct page *page = NULL;
-  unsigned long hwaddr, offset = tce  IOMMU_PAGE_MASK(tbl)  ~PAGE_MASK;
-  enum dma_data_direction direction = iommu_tce_direction(tce);
-
-  ret = get_user_pages_fast(tce  PAGE_MASK, 1,
-  direction != DMA_TO_DEVICE, page);
-  if (unlikely(ret != 1)) {
-  /* pr_err(iommu_tce: get_user_pages_fast failed tce=%lx 
ioba=%lx ret=%d\n,
-  tce, entry  tbl-it_page_shift, ret); */
-  return -EFAULT;
-  }
-  hwaddr = (unsigned long) page_address(page) + offset;
-
-  ret = iommu_tce_build(tbl, entry, hwaddr, direction);
-  if (ret)
-  put_page(page);
-
-  if (ret  0)
-  pr_err(iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n,
-  __func__, entry  tbl-it_page_shift, tce, ret);
-
-  return ret;
-}
-EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
-
 int iommu_take_ownership(struct iommu_table *tbl)
 {
   unsigned long sz = (tbl-it_size + 7)  3;
@@ -1078,7 +1025,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
   }

   memset(tbl-it_map, 0xff, sz);
-  iommu_clear_tces_and_put_pages(tbl, tbl-it_offset, tbl-it_size);

   /*
* Disable iommu bypass, otherwise the user can DMA to all of
@@ -1096,7 +1042,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
 {
   unsigned long sz = (tbl-it_size + 7)  3;

-  iommu_clear_tces_and_put_pages(tbl, tbl-it_offset,

Re: [PATCH v3 00/22] Convert the posix_clock_operations and k_clock structure to ready for 2038

2015-05-12 Thread Arnd Bergmann

On Monday 11 May 2015 19:08:38 Baolin Wang wrote:
 This patch series changes the 32-bit time type (timespec/itimerspec) to the 
 64-bit one
 (timespec64/itimerspec64), since 32-bit time types will break in the year 
 2038.
 
 This patch series introduces new methods with timespec64/itimerspec64 type,
 and removes the old ones with timespec/itimerspec type for 
 posix_clock_operations
 and k_clock structure.
 
 Also introduces some new functions with timespec64/itimerspec64 type, like 
 current_kernel_time64(),
 hrtimer_get_res64(), cputime_to_timespec64() and timespec64_to_cputime().
 
 Changes since v2:
   -Split the syscall conversion patch into small some patches.
 
 
 Baolin Wang (22):
   linux/time64.h:Introduce the 'struct itimerspec64' for 64bit
   timekeeping:Introduce the current_kernel_time64() function with
 timespec64 type
   time/hrtimer:Introduce hrtimer_get_res64() with timespec64 type for
 getting the timer resolution
   posix-timers:Split out the guts of the syscall and change the
 implementation for timer_gettime
   posix-timers:Convert to the 64bit methods for the timer_gettime
 syscall function

I have two more very general comments about the series:

a) something has gone wrong with your submission in v2 and v3 but was
   working earlier: normally all emails should be sent by git-send-email
   as replies to the [patch 00/22] mail. This is the default, and it
   is enabled by the '--thread --no-chain-reply' options. Please try
   to get this to work again.

b) it would be better to have a little shorter subject lines, to avoid
   line-wrapping in the list above. Here are some examples what you could
   use to replace the lines above:

   timekeeping: introduce struct itimerspec64
   timekeeping: introduce current_kernel_time64()
   hrtimer: introduce hrtimer_get_res64() 
   posix-timers: split up sys_timer_gettime()
   posix-timers: convert timer_gettime() to timespec64

   In general, try to come up with the shortest description that
   uniquely describes what your patch does, and move any details into
   the longer patch description.

Arnd
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v3 00/22] Convert the posix_clock_operations and k_clock structure to ready for 2038

2015-05-12 Thread Baolin Wang

On 12 May 2015 at 17:39, Arnd Bergmann a...@arndb.de wrote:

 On Monday 11 May 2015 19:08:38 Baolin Wang wrote:
  This patch series changes the 32-bit time type (timespec/itimerspec) to
 the 64-bit one
  (timespec64/itimerspec64), since 32-bit time types will break in the
 year 2038.
 
  This patch series introduces new methods with timespec64/itimerspec64
 type,
  and removes the old ones with timespec/itimerspec type for
 posix_clock_operations
  and k_clock structure.
 
  Also introduces some new functions with timespec64/itimerspec64 type,
 like current_kernel_time64(),
  hrtimer_get_res64(), cputime_to_timespec64() and timespec64_to_cputime().
 
  Changes since v2:
-Split the syscall conversion patch into small some patches.
 
 
  Baolin Wang (22):
linux/time64.h:Introduce the 'struct itimerspec64' for 64bit
timekeeping:Introduce the current_kernel_time64() function with
  timespec64 type
time/hrtimer:Introduce hrtimer_get_res64() with timespec64 type for
  getting the timer resolution
posix-timers:Split out the guts of the syscall and change the
  implementation for timer_gettime
posix-timers:Convert to the 64bit methods for the timer_gettime
  syscall function

 I have two more very general comments about the series:

 a) something has gone wrong with your submission in v2 and v3 but was
working earlier: normally all emails should be sent by git-send-email
as replies to the [patch 00/22] mail. This is the default, and it
is enabled by the '--thread --no-chain-reply' options. Please try
to get this to work again.

 b) it would be better to have a little shorter subject lines, to avoid
line-wrapping in the list above. Here are some examples what you could
use to replace the lines above:

timekeeping: introduce struct itimerspec64
timekeeping: introduce current_kernel_time64()
hrtimer: introduce hrtimer_get_res64()
posix-timers: split up sys_timer_gettime()
posix-timers: convert timer_gettime() to timespec64

In general, try to come up with the shortest description that
uniquely describes what your patch does, and move any details into
the longer patch description.

 Arnd


OK, i'll fix these in next patch series.Thanks for your comments.


-- 
Baolin.wang
Best Regards
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v3 3/3] kvm/powerpc: Export HCALL reason codes

2015-05-12 Thread Scott Wood

On Tue, 2015-05-12 at 21:46 +0530, Hemant Kumar wrote:
 On 05/12/2015 03:44 AM, Scott Wood wrote:
  On Fri, 2015-05-08 at 06:23 +0530, Hemant Kumar wrote:
  For perf to analyze the KVM events like hcalls, we need the
  hypervisor calls and their codes to be exported through uapi.
 
  This patch moves most of the pSeries hcall codes from
  arch/powerpc/include/asm/hvcall.h to
  arch/powerpc/include/uapi/asm/hcall_codes.h.
  It also moves the mapping hcall_code-to-hcall_reason from
  arch/powerpc/kvm/trace_hv.h to
  arch/powerpc/include/uapi/asm/trace_hcall.h.
 
  Signed-off-by: Hemant Kumar hem...@linux.vnet.ibm.com
  ---
arch/powerpc/include/asm/hvcall.h   | 120 
  +--
arch/powerpc/include/uapi/asm/hcall_codes.h | 123 
  
arch/powerpc/include/uapi/asm/trace_hcall.h | 122 
  +++
arch/powerpc/kvm/trace_hv.h | 117 
  +-
  When moving to uapi please add proper namespacing to indicate that this
  is pseries specific.
 
 
 Sure, will add that.

Thanks.

  diff --git a/arch/powerpc/include/uapi/asm/trace_hcall.h 
  b/arch/powerpc/include/uapi/asm/trace_hcall.h
  new file mode 100644
  index 000..00eac01
  --- /dev/null
  +++ b/arch/powerpc/include/uapi/asm/trace_hcall.h
  @@ -0,0 +1,122 @@
  +#ifndef _KVM_TRACE_HCALL_MAP_H
  +#define _KVM_TRACE_HCALL_MAP_H
  +
  +#include hcall_codes.h
  +
  +#define kvm_trace_symbol_hcall\
  +  {H_REMOVE,H_REMOVE},  \
  +  {H_ENTER,H_ENTER},\
  +  {H_READ,H_READ},  \
  +  {H_CLEAR_MOD,H_CLEAR_MOD},\
  This is a rather odd way of exposing an array to userspace...
 
 
 Didn't get you here. Can you please elaborate?
 
 I see some other files like arch/x86/include/uapi/asm/vmx.h exposing the 
 reasons in a similar way.

It just seemed like sloppy API to not have any typing associated with it
and require the user to wrap it in an actual array; I wasn't aware there
was precedent for this construction in uapi.  I also would have expected
a macro that doesn't behave like a normal element of C syntax (function,
variable name, type, etc) to have been capitalized.

-Scott



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel v10 02/34] powerpc/iommu/powernv: Get rid of set_iommu_table_base_and_group

2015-05-12 Thread Gavin Shan

On Tue, May 12, 2015 at 01:38:51AM +1000, Alexey Kardashevskiy wrote:
The set_iommu_table_base_and_group() name suggests that the function
sets table base and add a device to an IOMMU group. However actual
table base setting happens in pnv_pci_ioda_dma_dev_setup().


On PHB3, the DMA32 IOMMU table is created during PHB fixup time
in ppc_md.pcibios_fixup(), which is invoked at end of PCI enumeration.
The IOMMU table of PCI devices are initialized at same time.
pnv_pci_ioda_dma_dev_setup() is called when adding PCI device
or fixing up PCI bus at PCI enumeration time. So the commit logs
here isn't accurate enough.

Basically, set_iommu_table_base_and_group() which does two things
in one function, which is nice. I guess you don't need this function
any more after DDW is supported and it's the reason to remove it?

The actual purpose for table base setting is to put some reference
into a device so later iommu_add_device() can get the IOMMU group
reference and the device to the group.

At the moment a group cannot be explicitly passed to iommu_add_device()
as we want it to work from the bus notifier, we can fix it later and
remove confusing calls of set_iommu_table_base().

This replaces set_iommu_table_base_and_group() with a couple of
set_iommu_table_base() + iommu_add_device() which makes reading the code
easier.

This adds few comments why set_iommu_table_base() and iommu_add_device()
are called where they are called.

For IODA1/2, this essentially removes iommu_add_device() call from
the pnv_pci_ioda_dma_dev_setup() as it will always fail at this particular
place:
- for physical PE, the device is already attached by iommu_add_device()
in pnv_pci_ioda_setup_dma_pe();
- for virtual PE, the sysfs entries are not ready to create all symlinks
so actual adding is happening in tce_iommu_bus_notifier.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com

---
Changes:
v10:
* new to the series
---
 arch/powerpc/include/asm/iommu.h|  7 ---
 arch/powerpc/platforms/powernv/pci-ioda.c   | 27 +++
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  3 ++-
 arch/powerpc/platforms/pseries/iommu.c  | 15 ---
 4 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h 
b/arch/powerpc/include/asm/iommu.h
index 1e27d63..8353c86 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -140,13 +140,6 @@ static inline int __init tce_iommu_bus_notifier_init(void)
 }
 #endif /* !CONFIG_IOMMU_API */

-static inline void set_iommu_table_base_and_group(struct device *dev,
-void *base)
-{
-  set_iommu_table_base(dev, base);
-  iommu_add_device(dev);
-}
-
 extern int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
   struct scatterlist *sglist, int nelems,
   unsigned long mask,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 2f092bb..9a77f3c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1598,7 +1598,13 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb 
*phb, struct pci_dev *pdev

   pe = phb-ioda.pe_array[pdn-pe_number];
   WARN_ON(get_dma_ops(pdev-dev) != dma_iommu_ops);
-  set_iommu_table_base_and_group(pdev-dev, pe-tce32_table);
+  set_iommu_table_base(pdev-dev, pe-tce32_table);
+  /*
+   * Note: iommu_add_device() will fail here as
+   * for physical PE: the device is already added by now;
+   * for virtual PE: sysfs entries are not ready yet and
+   * tce_iommu_bus_notifier will add the device to a group later.
+   */

I didn't figure out how the IOMMU table is initialized for PCI device in this
function during bootup time. At system bootup time, the function is only called
when applying fixup to PCI bus in pcibios_fixup_bus(). At that time, we don't
have PE# yet, which is allocated at PHB fixup time (ppc_md.pcibios_fixup_phb).

 }

 static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
@@ -1659,7 +1665,8 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe 
*pe,
   struct pci_dev *dev;

   list_for_each_entry(dev, bus-devices, bus_list) {
-  set_iommu_table_base_and_group(dev-dev, pe-tce32_table);
+  set_iommu_table_base(dev-dev, pe-tce32_table);
+  iommu_add_device(dev-dev);

   if (dev-subordinate)
   pnv_ioda_setup_bus_dma(pe, dev-subordinate);
@@ -1835,7 +1842,13 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
*phb,
   if (pe-flags  PNV_IODA_PE_DEV) {
   iommu_register_group(tbl, phb-hose-global_number,
pe-pe_number);
-  set_iommu_table_base_and_group(pe-pdev-dev, tbl);
+  /*
+   * Setting table base

Re: [PATCH v3 1/2] perf/kvm: Port perf kvm to powerpc

2015-05-12 Thread Scott Wood

On Tue, 2015-05-12 at 21:34 +0530, Hemant Kumar wrote:
 Hi Scott,
 
 On 05/12/2015 03:38 AM, Scott Wood wrote:
  On Fri, 2015-05-08 at 06:37 +0530, Hemant Kumar wrote:
  diff --git a/arch/powerpc/include/uapi/asm/kvm_perf.h 
  b/arch/powerpc/include/uapi/asm/kvm_perf.h
  new file mode 100644
  index 000..30fa670
  --- /dev/null
  +++ b/arch/powerpc/include/uapi/asm/kvm_perf.h
  @@ -0,0 +1,15 @@
  +#ifndef _ASM_POWERPC_KVM_PERF_H
  +#define _ASM_POWERPC_KVM_PERF_H
  +
  +#include asm/trace_book3s.h
  +#include asm/kvm.h
  +
  +#define DECODE_STR_LEN 20
  +
  +#define VCPU_ID vcpu_id
  +
  +#define KVM_ENTRY_TRACE kvm_hv:kvm_guest_enter
  +#define KVM_EXIT_TRACE kvm_hv:kvm_guest_exit
  +#define KVM_EXIT_REASON trap
  +
  +#endif /* _ASM_POWERPC_KVM_PERF_H */
  Please make sure that anything book3s-specific is named that way.
 
 Are you suggesting to name it to something like _ASM_POWERPC_BOOK3S_PERF_H ?

My concern is seeing a generically named kvm_perf.h include a file
called trace_book3s.h which defines kvm_trace_symbol_hcall with
presumably book3s-specific content, as well as wondering how much of the
rest of the file would be applicable if booke PPC were to implement perf
kvm.

I don't know enough about perf kvm to answer that question, but I've
seen enough cases of book3s or pseries specific code that was apparently
written with the belief that no other ppc64 implementations exist, or
that no other ppc implementations would want to implement a certain
feature, to be suspicous.  Usually such cases can be dealt with after
the fact (albeit not as easily as if things were organized/namespaced
properly from the beginning), but this is uapi...

-Scott



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel v10 04/34] powerpc/iommu: Put IOMMU group explicitly

2015-05-12 Thread Gavin Shan

On Tue, May 12, 2015 at 01:38:53AM +1000, Alexey Kardashevskiy wrote:
So far an iommu_table lifetime was the same as PE. Dynamic DMA windows
will change this and iommu_free_table() will not always require
the group to be released.

This moves iommu_group_put() out of iommu_free_table().

This adds a iommu_pseries_free_table() helper which does
iommu_group_put() and iommu_free_table(). Later it will be
changed to receive a table_group and we will have to change less
lines then.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

Reviewed-by: Gavin Shan gws...@linux.vnet.ibm.com

Thanks,
Gavin

---
 arch/powerpc/kernel/iommu.c   |  7 ---
 arch/powerpc/platforms/powernv/pci-ioda.c |  5 +
 arch/powerpc/platforms/pseries/iommu.c| 14 +-
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index b054f33..3d47eb3 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -726,13 +726,6 @@ void iommu_free_table(struct iommu_table *tbl, const char 
*node_name)
   if (tbl-it_offset == 0)
   clear_bit(0, tbl-it_map);

-#ifdef CONFIG_IOMMU_API
-  if (tbl-it_group) {
-  iommu_group_put(tbl-it_group);
-  BUG_ON(tbl-it_group);
-  }
-#endif
-
   /* verify that table contains no entries */
   if (!bitmap_empty(tbl-it_map, tbl-it_size))
   pr_warn(%s: Unexpected TCEs for %s\n, __func__, node_name);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8ca7abd..8c3c4bf 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -23,6 +23,7 @@
 #include linux/io.h
 #include linux/msi.h
 #include linux/memblock.h
+#include linux/iommu.h

 #include asm/sections.h
 #include asm/io.h
@@ -1310,6 +1311,10 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev 
*dev, struct pnv_ioda_pe
   if (rc)
   pe_warn(pe, OPAL error %ld release DMA window\n, rc);

+  if (tbl-it_group) {
+  iommu_group_put(tbl-it_group);
+  BUG_ON(tbl-it_group);
+  }
   iommu_free_table(tbl, of_node_full_name(dev-dev.of_node));
   free_pages(addr, get_order(TCE32_TABLE_SIZE));
   pe-tce32_table = NULL;
diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 05ab06d..89f557b 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -36,6 +36,7 @@
 #include linux/crash_dump.h
 #include linux/memory.h
 #include linux/of.h
+#include linux/iommu.h
 #include asm/io.h
 #include asm/prom.h
 #include asm/rtas.h
@@ -51,6 +52,16 @@

 #include pseries.h

+static void iommu_pseries_free_table(struct iommu_table *tbl,
+  const char *node_name)
+{
+  if (tbl-it_group) {
+  iommu_group_put(tbl-it_group);
+  BUG_ON(tbl-it_group);
+  }
+  iommu_free_table(tbl, node_name);
+}
+
 static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
 __be64 *startp, __be64 *endp)
 {
@@ -1271,7 +1282,8 @@ static int iommu_reconfig_notifier(struct notifier_block 
*nb, unsigned long acti
*/
   remove_ddw(np, false);
   if (pci  pci-iommu_table)
-  iommu_free_table(pci-iommu_table, np-full_name);
+  iommu_pseries_free_table(pci-iommu_table,
+  np-full_name);

   spin_lock(direct_window_list_lock);
   list_for_each_entry(window, direct_window_list, list) {
-- 
2.4.0.rc3.8.gfb3e7d5


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 1/4] Partially revert powerpc: Remove duplicate cacheable_memcpy/memzero functions

2015-05-12 Thread Christophe Leroy

This partially reverts
commit 'powerpc: Remove duplicate cacheable_memcpy/memzero functions
(f909a35bdfb7cb350d078a2cf888162eeb20381c)'

Functions cacheable_memcpy/memzero are more efficient than
memcpy/memset as they use the dcbz instruction which avoids refill
of the cacheline with the data that we will overwrite.

Signed-off-by: Christophe Leroy christophe.le...@c-s.fr
---
 arch/powerpc/lib/copy_32.S | 127 +
 1 file changed, 127 insertions(+)

diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index 6813f80..55f19f9 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -69,6 +69,54 @@ CACHELINE_BYTES = L1_CACHE_BYTES
 LG_CACHELINE_BYTES = L1_CACHE_SHIFT
 CACHELINE_MASK = (L1_CACHE_BYTES-1)
 
+/*
+ * Use dcbz on the complete cache lines in the destination
+ * to set them to zero.  This requires that the destination
+ * area is cacheable.  -- paulus
+ */
+_GLOBAL(cacheable_memzero)
+   mr  r5,r4
+   li  r4,0
+   addir6,r3,-4
+   cmplwi  0,r5,4
+   blt 7f
+   stwur4,4(r6)
+   beqlr
+   andi.   r0,r6,3
+   add r5,r0,r5
+   subfr6,r0,r6
+   clrlwi  r7,r6,32-LG_CACHELINE_BYTES
+   add r8,r7,r5
+   srwir9,r8,LG_CACHELINE_BYTES
+   addic.  r9,r9,-1/* total number of complete cachelines */
+   ble 2f
+   xorir0,r7,CACHELINE_MASK  ~3
+   srwi.   r0,r0,2
+   beq 3f
+   mtctr   r0
+4: stwur4,4(r6)
+   bdnz4b
+3: mtctr   r9
+   li  r7,4
+10:dcbzr7,r6
+   addir6,r6,CACHELINE_BYTES
+   bdnz10b
+   clrlwi  r5,r8,32-LG_CACHELINE_BYTES
+   addir5,r5,4
+2: srwir0,r5,2
+   mtctr   r0
+   bdz 6f
+1: stwur4,4(r6)
+   bdnz1b
+6: andi.   r5,r5,3
+7: cmpwi   0,r5,0
+   beqlr
+   mtctr   r5
+   addir6,r6,3
+8: stbur4,1(r6)
+   bdnz8b
+   blr
+
 _GLOBAL(memset)
rlwimi  r4,r4,8,16,23
rlwimi  r4,r4,16,0,15
@@ -94,6 +142,85 @@ _GLOBAL(memset)
bdnz8b
blr
 
+/*
+ * This version uses dcbz on the complete cache lines in the
+ * destination area to reduce memory traffic.  This requires that
+ * the destination area is cacheable.
+ * We only use this version if the source and dest don't overlap.
+ * -- paulus.
+ */
+_GLOBAL(cacheable_memcpy)
+   add r7,r3,r5/* test if the src  dst overlap */
+   add r8,r4,r5
+   cmplw   0,r4,r7
+   cmplw   1,r3,r8
+   crand   0,0,4   /* cr0.lt = cr1.lt */
+   blt memcpy  /* if regions overlap */
+
+   addir4,r4,-4
+   addir6,r3,-4
+   neg r0,r3
+   andi.   r0,r0,CACHELINE_MASK/* # bytes to start of cache line */
+   beq 58f
+
+   cmplw   0,r5,r0 /* is this more than total to do? */
+   blt 63f /* if not much to do */
+   andi.   r8,r0,3 /* get it word-aligned first */
+   subfr5,r0,r5
+   mtctr   r8
+   beq+61f
+70:lbz r9,4(r4)/* do some bytes */
+   stb r9,4(r6)
+   addir4,r4,1
+   addir6,r6,1
+   bdnz70b
+61:srwi.   r0,r0,2
+   mtctr   r0
+   beq 58f
+72:lwzur9,4(r4)/* do some words */
+   stwur9,4(r6)
+   bdnz72b
+
+58:srwi.   r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
+   clrlwi  r5,r5,32-LG_CACHELINE_BYTES
+   li  r11,4
+   mtctr   r0
+   beq 63f
+53:
+   dcbzr11,r6
+   COPY_16_BYTES
+#if L1_CACHE_BYTES = 32
+   COPY_16_BYTES
+#if L1_CACHE_BYTES = 64
+   COPY_16_BYTES
+   COPY_16_BYTES
+#if L1_CACHE_BYTES = 128
+   COPY_16_BYTES
+   COPY_16_BYTES
+   COPY_16_BYTES
+   COPY_16_BYTES
+#endif
+#endif
+#endif
+   bdnz53b
+
+63:srwi.   r0,r5,2
+   mtctr   r0
+   beq 64f
+30:lwzur0,4(r4)
+   stwur0,4(r6)
+   bdnz30b
+
+64:andi.   r0,r5,3
+   mtctr   r0
+   beq+65f
+40:lbz r0,4(r4)
+   stb r0,4(r6)
+   addir4,r4,1
+   addir6,r6,1
+   bdnz40b
+65:blr
+
 _GLOBAL(memmove)
cmplw   0,r3,r4
bgt backwards_memcpy
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 0/4] powerpc32: use cacheable alternatives of memcpy and memset

2015-05-12 Thread Christophe Leroy

This patchset implements use of cacheable versions of memset and
memcpy when the len is greater than the cacheline size and the
destination is in RAM.

On MPC885, we observe a 7% rate increase on FTP transfer

Christophe Leroy (4):
  Partially revert powerpc: Remove duplicate cacheable_memcpy/memzero
functions
  powerpc32: swap r4 and r5 in cacheable_memzero
  powerpc32: memset(0): use cacheable_memzero
  powerpc32: memcpy: use cacheable_memcpy

 arch/powerpc/lib/copy_32.S | 148 +
 1 file changed, 148 insertions(+)

-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 4/4] powerpc32: memcpy: use cacheable_memcpy

2015-05-12 Thread Christophe Leroy

cacheable_memcpy uses dcbz instruction and is more efficient than
memcpy when the destination is in RAM

This patch renames memcpy as generic_memcpy, and defines memcpy as a
prolog to cacheable_memcpy. This prolog checks if the buffer is
in RAM. If not, it falls back to generic_memcpy()

On MPC885, we get approximatly 7% increase of the transfer rate
on an FTP reception

Signed-off-by: Christophe Leroy christophe.le...@c-s.fr
---
 arch/powerpc/lib/copy_32.S | 23 ---
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index d8a9a86..8f76d49 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -161,13 +161,27 @@ _GLOBAL(generic_memset)
  * We only use this version if the source and dest don't overlap.
  * -- paulus.
  */
+_GLOBAL(memmove)
+   cmplw   0,r3,r4
+   bgt backwards_memcpy
+   /* fall through */
+
+_GLOBAL(memcpy)
+   cmplwi  r5,L1_CACHE_BYTES
+   blt-generic_memcpy
+   lis r8,max_pfn@ha
+   lwz r8,max_pfn@l(r8)
+   tophys  (r9,r3)
+   srwir9,r9,PAGE_SHIFT
+   cmplw   r9,r8
+   bge-generic_memcpy
 _GLOBAL(cacheable_memcpy)
add r7,r3,r5/* test if the src  dst overlap */
add r8,r4,r5
cmplw   0,r4,r7
cmplw   1,r3,r8
crand   0,0,4   /* cr0.lt = cr1.lt */
-   blt memcpy  /* if regions overlap */
+   blt generic_memcpy  /* if regions overlap */
 
addir4,r4,-4
addir6,r3,-4
@@ -233,12 +247,7 @@ _GLOBAL(cacheable_memcpy)
bdnz40b
 65:blr
 
-_GLOBAL(memmove)
-   cmplw   0,r3,r4
-   bgt backwards_memcpy
-   /* fall through */
-
-_GLOBAL(memcpy)
+_GLOBAL(generic_memcpy)
srwi.   r7,r5,3
addir6,r3,-4
addir4,r4,-4
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 2/4] powerpc32: swap r4 and r5 in cacheable_memzero

2015-05-12 Thread Christophe Leroy

We swap r4 and r5, this avoids having to move the len contained in r4
into r5

Signed-off-by: Christophe Leroy christophe.le...@c-s.fr
---
 arch/powerpc/lib/copy_32.S | 29 ++---
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index 55f19f9..cbca76c 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -75,18 +75,17 @@ CACHELINE_MASK = (L1_CACHE_BYTES-1)
  * area is cacheable.  -- paulus
  */
 _GLOBAL(cacheable_memzero)
-   mr  r5,r4
-   li  r4,0
+   li  r5,0
addir6,r3,-4
-   cmplwi  0,r5,4
+   cmplwi  0,r4,4
blt 7f
-   stwur4,4(r6)
+   stwur5,4(r6)
beqlr
andi.   r0,r6,3
-   add r5,r0,r5
+   add r4,r0,r4
subfr6,r0,r6
clrlwi  r7,r6,32-LG_CACHELINE_BYTES
-   add r8,r7,r5
+   add r8,r7,r4
srwir9,r8,LG_CACHELINE_BYTES
addic.  r9,r9,-1/* total number of complete cachelines */
ble 2f
@@ -94,26 +93,26 @@ _GLOBAL(cacheable_memzero)
srwi.   r0,r0,2
beq 3f
mtctr   r0
-4: stwur4,4(r6)
+4: stwur5,4(r6)
bdnz4b
 3: mtctr   r9
li  r7,4
 10:dcbzr7,r6
addir6,r6,CACHELINE_BYTES
bdnz10b
-   clrlwi  r5,r8,32-LG_CACHELINE_BYTES
-   addir5,r5,4
-2: srwir0,r5,2
+   clrlwi  r4,r8,32-LG_CACHELINE_BYTES
+   addir4,r4,4
+2: srwir0,r4,2
mtctr   r0
bdz 6f
-1: stwur4,4(r6)
+1: stwur5,4(r6)
bdnz1b
-6: andi.   r5,r5,3
-7: cmpwi   0,r5,0
+6: andi.   r4,r4,3
+7: cmpwi   0,r4,0
beqlr
-   mtctr   r5
+   mtctr   r4
addir6,r6,3
-8: stbur4,1(r6)
+8: stbur5,1(r6)
bdnz8b
blr
 
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 3/4] powerpc32: memset(0): use cacheable_memzero

2015-05-12 Thread Christophe Leroy

cacheable_memzero uses dcbz instruction and is more efficient than
memset(0) when the destination is in RAM

This patch renames memset as generic_memset, and defines memset
as a prolog to cacheable_memzero. This prolog checks if the byte
to set is 0 and if the buffer is in RAM. If not, it falls back to
generic_memcpy()

Signed-off-by: Christophe Leroy christophe.le...@c-s.fr
---
 arch/powerpc/lib/copy_32.S | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index cbca76c..d8a9a86 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -12,6 +12,7 @@
 #include asm/cache.h
 #include asm/errno.h
 #include asm/ppc_asm.h
+#include asm/page.h
 
 #define COPY_16_BYTES  \
lwz r7,4(r4);   \
@@ -74,6 +75,18 @@ CACHELINE_MASK = (L1_CACHE_BYTES-1)
  * to set them to zero.  This requires that the destination
  * area is cacheable.  -- paulus
  */
+_GLOBAL(memset)
+   cmplwi  r4,0
+   bne-generic_memset
+   cmplwi  r5,L1_CACHE_BYTES
+   blt-generic_memset
+   lis r8,max_pfn@ha
+   lwz r8,max_pfn@l(r8)
+   tophys  (r9,r3)
+   srwir9,r9,PAGE_SHIFT
+   cmplw   r9,r8
+   bge-generic_memset
+   mr  r4,r5
 _GLOBAL(cacheable_memzero)
li  r5,0
addir6,r3,-4
@@ -116,7 +129,7 @@ _GLOBAL(cacheable_memzero)
bdnz8b
blr
 
-_GLOBAL(memset)
+_GLOBAL(generic_memset)
rlwimi  r4,r4,8,16,23
rlwimi  r4,r4,16,0,15
addir6,r3,-4
-- 
2.1.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

55 matches

Mail list logo