[PATCH kernel v3 3/6] KVM: PPC: Make iommu_table::it_userspace big endian

2018-07-03 Thread Alexey Kardashevskiy
We are going to reuse multilevel TCE code for the userspace copy of
the TCE table and since it is big endian, let's make the copy big endian
too.

Reviewed-by: David Gibson 
Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/include/asm/iommu.h|  2 +-
 arch/powerpc/kvm/book3s_64_vio.c| 11 ++-
 arch/powerpc/kvm/book3s_64_vio_hv.c | 10 +-
 drivers/vfio/vfio_iommu_spapr_tce.c | 19 +--
 4 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 20febe0..803ac70 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -117,7 +117,7 @@ struct iommu_table {
unsigned long *it_map;   /* A simple allocation bitmap for now */
unsigned long  it_page_shift;/* table iommu page size */
struct list_head it_group_list;/* List of iommu_table_group_link */
-   unsigned long *it_userspace; /* userspace view of the table */
+   __be64 *it_userspace; /* userspace view of the table */
struct iommu_table_ops *it_ops;
struct krefit_kref;
 };
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 8167ce8..6f34edd 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -377,19 +377,19 @@ static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
 {
struct mm_iommu_table_group_mem_t *mem = NULL;
const unsigned long pgsize = 1ULL << tbl->it_page_shift;
-   unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
 
if (!pua)
/* it_userspace allocation might be delayed */
return H_TOO_HARD;
 
-   mem = mm_iommu_lookup(kvm->mm, *pua, pgsize);
+   mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize);
if (!mem)
return H_TOO_HARD;
 
mm_iommu_mapped_dec(mem);
 
-   *pua = 0;
+   *pua = cpu_to_be64(0);
 
return H_SUCCESS;
 }
@@ -436,7 +436,8 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct 
iommu_table *tbl,
enum dma_data_direction dir)
 {
long ret;
-   unsigned long hpa, *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+   unsigned long hpa;
+   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
struct mm_iommu_table_group_mem_t *mem;
 
if (!pua)
@@ -463,7 +464,7 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct 
iommu_table *tbl,
if (dir != DMA_NONE)
kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
 
-   *pua = ua;
+   *pua = cpu_to_be64(ua);
 
return 0;
 }
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 5b298f5..841aef7 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -200,7 +200,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
 {
struct mm_iommu_table_group_mem_t *mem = NULL;
const unsigned long pgsize = 1ULL << tbl->it_page_shift;
-   unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
 
if (!pua)
/* it_userspace allocation might be delayed */
@@ -210,13 +210,13 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm 
*kvm,
if (WARN_ON_ONCE_RM(!pua))
return H_HARDWARE;
 
-   mem = mm_iommu_lookup_rm(kvm->mm, *pua, pgsize);
+   mem = mm_iommu_lookup_rm(kvm->mm, be64_to_cpu(*pua), pgsize);
if (!mem)
return H_TOO_HARD;
 
mm_iommu_mapped_dec(mem);
 
-   *pua = 0;
+   *pua = cpu_to_be64(0);
 
return H_SUCCESS;
 }
@@ -268,7 +268,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, 
struct iommu_table *tbl,
 {
long ret;
unsigned long hpa = 0;
-   unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
struct mm_iommu_table_group_mem_t *mem;
 
if (!pua)
@@ -303,7 +303,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, 
struct iommu_table *tbl,
if (dir != DMA_NONE)
kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry);
 
-   *pua = ua;
+   *pua = cpu_to_be64(ua);
 
return 0;
 }
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 7cd63b0..17a418c 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -230,7 +230,7 @@ static long tce_iommu_userspace_view_alloc(struct 
iommu_table *tbl,
decrement_locked_vm(mm, cb >> PAGE_SHIFT);
return -ENOMEM;
}
-   tbl->it_userspace = uas;
+   tbl->it_userspace = (__be64 *) uas;
 
return 0;
 }
@@ -482,20 +482,20 @@ static void tce_iommu_unuse_page_v2(struct tce_container

[PATCH kernel v3 2/6] powerpc/powernv: Move TCE manupulation code to its own file

2018-07-03 Thread Alexey Kardashevskiy
Right now we have allocation code in pci-ioda.c and traversing code in
pci.c, let's keep them toghether. However both files are big enough
already so let's move this business to a new file.

While we at it, move the code which links IOMMU table groups to
IOMMU tables as it is not specific to any PNV PHB model.

These puts exported symbols from the new file together.

This fixes several warnings from checkpatch.pl like this:
"WARNING: Prefer 'unsigned int' to bare use of 'unsigned'".

As this is almost cut-n-paste, there should be no behavioral change.

Reviewed-by: David Gibson 
Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/powernv/Makefile   |   2 +-
 arch/powerpc/platforms/powernv/pci.h  |  41 ++--
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 313 ++
 arch/powerpc/platforms/powernv/pci-ioda.c | 146 
 arch/powerpc/platforms/powernv/pci.c  | 158 -
 5 files changed, 340 insertions(+), 320 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/pci-ioda-tce.c

diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index 703a350..b540ce8e 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -6,7 +6,7 @@ obj-y   += opal-msglog.o opal-hmi.o 
opal-power.o opal-irqchip.o
 obj-y  += opal-kmsg.o opal-powercap.o opal-psr.o 
opal-sensor-groups.o
 
 obj-$(CONFIG_SMP)  += smp.o subcore.o subcore-asm.o
-obj-$(CONFIG_PCI)  += pci.o pci-ioda.o npu-dma.o
+obj-$(CONFIG_PCI)  += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
 obj-$(CONFIG_CXL_BASE) += pci-cxl.o
 obj-$(CONFIG_EEH)  += eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index eada4b6..fa90f60 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -201,13 +201,6 @@ struct pnv_phb {
 };
 
 extern struct pci_ops pnv_pci_ops;
-extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
-   unsigned long uaddr, enum dma_data_direction direction,
-   unsigned long attrs);
-extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
-extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
-   unsigned long *hpa, enum dma_data_direction *direction);
-extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
 
 void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
unsigned char *log_buff);
@@ -217,14 +210,6 @@ int pnv_pci_cfg_write(struct pci_dn *pdn,
  int where, int size, u32 val);
 extern struct iommu_table *pnv_pci_table_alloc(int nid);
 
-extern long pnv_pci_link_table_and_group(int node, int num,
-   struct iommu_table *tbl,
-   struct iommu_table_group *table_group);
-extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
-   struct iommu_table_group *table_group);
-extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
- void *tce_mem, u64 tce_size,
- u64 dma_offset, unsigned page_shift);
 extern void pnv_pci_init_ioda_hub(struct device_node *np);
 extern void pnv_pci_init_ioda2_phb(struct device_node *np);
 extern void pnv_pci_init_npu_phb(struct device_node *np);
@@ -272,4 +257,30 @@ extern void pnv_cxl_cx4_teardown_msi_irqs(struct pci_dev 
*pdev);
 /* phb ops (cxl switches these when enabling the kernel api on the phb) */
 extern const struct pci_controller_ops pnv_cxl_cx4_ioda_controller_ops;
 
+/* pci-ioda-tce.c */
+#define POWERNV_IOMMU_DEFAULT_LEVELS   1
+#define POWERNV_IOMMU_MAX_LEVELS   5
+
+extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
+   unsigned long uaddr, enum dma_data_direction direction,
+   unsigned long attrs);
+extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
+extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
+   unsigned long *hpa, enum dma_data_direction *direction);
+extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
+
+extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
+   __u32 page_shift, __u64 window_size, __u32 levels,
+   struct iommu_table *tbl);
+extern void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
+
+extern long pnv_pci_link_table_and_group(int node, int num,
+   struct iommu_table *tbl,
+   struct iommu_table_group *table_group);
+extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
+   struct iommu_table_group *table_group);
+extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
+   void *tce_mem, u64 tce_size,
+   u64 dma_offset, u

[PATCH kernel v3 0/6] powerpc/powernv/iommu: Optimize memory use

2018-07-03 Thread Alexey Kardashevskiy


This patchset aims to reduce actual memory use for guests with
sparse memory. The pseries guest uses dynamic DMA windows to map
the entire guest RAM but it only actually maps onlined memory
which may be not be contiguous. I hit this when tried passing
through NVLink2-connected GPU RAM of NVIDIA V100 and trying to
map this RAM at the same offset as in the real hardware
forced me to rework I handle these windows.

This moves userspace-to-host-physical translation table
(iommu_table::it_userspace) from VFIO TCE IOMMU subdriver to
the platform code and reuses the already existing multilevel
TCE table code which we have for the hardware tables.
At last in 6/6 I switch to on-demand allocation so we do not
allocate huge chunks of the table if we do not have to;
there is some math in 6/6.

Changes:
v3:
* rebased on v4.18-rc3 and fixed compile error in 6/6

v2:
* bugfix and error handling in 6/6


This is based on sha1
021c917 Linus Torvalds "Linux 4.18-rc3".

Please comment. Thanks.



Alexey Kardashevskiy (6):
  powerpc/powernv: Remove useless wrapper
  powerpc/powernv: Move TCE manupulation code to its own file
  KVM: PPC: Make iommu_table::it_userspace big endian
  powerpc/powernv: Add indirect levels to it_userspace
  powerpc/powernv: Rework TCE level allocation
  powerpc/powernv/ioda: Allocate indirect TCE levels on demand

 arch/powerpc/platforms/powernv/Makefile   |   2 +-
 arch/powerpc/include/asm/iommu.h  |  11 +-
 arch/powerpc/platforms/powernv/pci.h  |  44 ++-
 arch/powerpc/kvm/book3s_64_vio.c  |  11 +-
 arch/powerpc/kvm/book3s_64_vio_hv.c   |  18 +-
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 399 ++
 arch/powerpc/platforms/powernv/pci-ioda.c | 184 ++--
 arch/powerpc/platforms/powernv/pci.c  | 158 --
 drivers/vfio/vfio_iommu_spapr_tce.c   |  65 +
 9 files changed, 478 insertions(+), 414 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/pci-ioda-tce.c

-- 
2.11.0



[PATCH kernel v3 6/6] powerpc/powernv/ioda: Allocate indirect TCE levels on demand

2018-07-03 Thread Alexey Kardashevskiy
At the moment we allocate the entire TCE table, twice (hardware part and
userspace translation cache). This normally works as we normally have
contigous memory and the guest will map entire RAM for 64bit DMA.

However if we have sparse RAM (one example is a memory device), then
we will allocate TCEs which will never be used as the guest only maps
actual memory for DMA. If it is a single level TCE table, there is nothing
we can really do but if it a multilevel table, we can skip allocating
TCEs we know we won't need.

This adds ability to allocate only first level, saving memory.

This changes iommu_table::free() to avoid allocating of an extra level;
iommu_table::set() will do this when needed.

This adds @alloc parameter to iommu_table::exchange() to tell the callback
if it can allocate an extra level; the flag is set to "false" for
the realmode KVM handlers of H_PUT_TCE hcalls and the callback returns
H_TOO_HARD.

This still requires the entire table to be counted in mm::locked_vm.

To be conservative, this only does on-demand allocation when
the usespace cache table is requested which is the case of VFIO.

The example math for a system replicating a powernv setup with NVLink2
in a guest:
16GB RAM mapped at 0x0
128GB GPU RAM window (16GB of actual RAM) mapped at 0x2440

the table to cover that all with 64K pages takes:
(((0x2440 + 0x20) >> 16)*8)>>20 = 4556MB

If we allocate only necessary TCE levels, we will only need:
(((0x4 + 0x4) >> 16)*8)>>20 = 4MB (plus some for indirect
levels).

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---
Changes:
v2:
* fixed bug in cleanup path which forced the entire table to be
allocated right before destroying
* added memory allocation error handling pnv_tce()
---
 arch/powerpc/include/asm/iommu.h  |  7 ++-
 arch/powerpc/platforms/powernv/pci.h  |  6 ++-
 arch/powerpc/kvm/book3s_64_vio_hv.c   |  4 +-
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 73 +--
 arch/powerpc/platforms/powernv/pci-ioda.c |  8 +--
 drivers/vfio/vfio_iommu_spapr_tce.c   |  2 +-
 6 files changed, 73 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 4bdcf22..daa3ee5 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -70,7 +70,7 @@ struct iommu_table_ops {
unsigned long *hpa,
enum dma_data_direction *direction);
 
-   __be64 *(*useraddrptr)(struct iommu_table *tbl, long index);
+   __be64 *(*useraddrptr)(struct iommu_table *tbl, long index, bool alloc);
 #endif
void (*clear)(struct iommu_table *tbl,
long index, long npages);
@@ -122,10 +122,13 @@ struct iommu_table {
__be64 *it_userspace; /* userspace view of the table */
struct iommu_table_ops *it_ops;
struct krefit_kref;
+   int it_nid;
 };
 
+#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \
+   ((tbl)->it_ops->useraddrptr((tbl), (entry), false))
 #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
-   ((tbl)->it_ops->useraddrptr((tbl), (entry)))
+   ((tbl)->it_ops->useraddrptr((tbl), (entry), true))
 
 /* Pure 2^n version of get_order */
 static inline __attribute_const__
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index 2962f6d..0020937 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -266,8 +266,10 @@ extern int pnv_tce_build(struct iommu_table *tbl, long 
index, long npages,
unsigned long attrs);
 extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
 extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
-   unsigned long *hpa, enum dma_data_direction *direction);
-extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index);
+   unsigned long *hpa, enum dma_data_direction *direction,
+   bool alloc);
+extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index,
+   bool alloc);
 extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
 
 extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 8cc1caf..efb90d8 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -200,7 +200,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
 {
struct mm_iommu_table_group_mem_t *mem = NULL;
const unsigned long pgsize = 1ULL << tbl->it_page_shift;
-   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
 
if (!pua)
/* it_userspace allocation might be delayed */
@@ -264,7 +264,7 @@ static long k

[PATCH kernel v3 5/6] powerpc/powernv: Rework TCE level allocation

2018-07-03 Thread Alexey Kardashevskiy
This moves actual pages allocation to a separate function which is going
to be reused later in on-demand TCE allocation.

While we are at it, remove unnecessary level size round up as the caller
does this already.

Reviewed-by: David Gibson 
Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 30 +--
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c 
b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index f14b282..36c2eb0 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -31,6 +31,23 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
tbl->it_type = TCE_PCI;
 }
 
+static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
+{
+   struct page *tce_mem = NULL;
+   __be64 *addr;
+
+   tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT);
+   if (!tce_mem) {
+   pr_err("Failed to allocate a TCE memory, level shift=%d\n",
+   shift);
+   return NULL;
+   }
+   addr = page_address(tce_mem);
+   memset(addr, 0, 1UL << shift);
+
+   return addr;
+}
+
 static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx)
 {
__be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
@@ -165,21 +182,12 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int 
nid, unsigned int shift,
unsigned int levels, unsigned long limit,
unsigned long *current_offset, unsigned long *total_allocated)
 {
-   struct page *tce_mem = NULL;
__be64 *addr, *tmp;
-   unsigned int order = max_t(unsigned int, shift, PAGE_SHIFT) -
-   PAGE_SHIFT;
-   unsigned long allocated = 1UL << (order + PAGE_SHIFT);
+   unsigned long allocated = 1UL << shift;
unsigned int entries = 1UL << (shift - 3);
long i;
 
-   tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
-   if (!tce_mem) {
-   pr_err("Failed to allocate a TCE memory, order=%d\n", order);
-   return NULL;
-   }
-   addr = page_address(tce_mem);
-   memset(addr, 0, allocated);
+   addr = pnv_alloc_tce_level(nid, shift);
*total_allocated += allocated;
 
--levels;
-- 
2.11.0



[PATCH kernel v3 4/6] powerpc/powernv: Add indirect levels to it_userspace

2018-07-03 Thread Alexey Kardashevskiy
We want to support sparse memory and therefore huge chunks of DMA windows
do not need to be mapped. If a DMA window big enough to require 2 or more
indirect levels, and a DMA window is used to map all RAM (which is
a default case for 64bit window), we can actually save some memory by
not allocation TCE for regions which we are not going to map anyway.

The hardware tables alreary support indirect levels but we also keep
host-physical-to-userspace translation array which is allocated by
vmalloc() and is a flat array which might use quite some memory.

This converts it_userspace from vmalloc'ed array to a multi level table.

As the format becomes platform dependend, this replaces the direct access
to it_usespace with a iommu_table_ops::useraddrptr hook which returns
a pointer to the userspace copy of a TCE; future extension will return
NULL if the level was not allocated.

This should not change non-KVM handling of TCE tables and it_userspace
will not be allocated for non-KVM tables.

Reviewed-by: David Gibson 
Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v2:
* fixed compile error by ditching one inline helper
---
 arch/powerpc/include/asm/iommu.h  |  6 +--
 arch/powerpc/platforms/powernv/pci.h  |  3 +-
 arch/powerpc/kvm/book3s_64_vio_hv.c   |  8 
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 65 +--
 arch/powerpc/platforms/powernv/pci-ioda.c | 23 +++---
 drivers/vfio/vfio_iommu_spapr_tce.c   | 46 ---
 6 files changed, 73 insertions(+), 78 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 803ac70..4bdcf22 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -69,6 +69,8 @@ struct iommu_table_ops {
long index,
unsigned long *hpa,
enum dma_data_direction *direction);
+
+   __be64 *(*useraddrptr)(struct iommu_table *tbl, long index);
 #endif
void (*clear)(struct iommu_table *tbl,
long index, long npages);
@@ -123,9 +125,7 @@ struct iommu_table {
 };
 
 #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
-   ((tbl)->it_userspace ? \
-   &((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
-   NULL)
+   ((tbl)->it_ops->useraddrptr((tbl), (entry)))
 
 /* Pure 2^n version of get_order */
 static inline __attribute_const__
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index fa90f60..2962f6d 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -267,11 +267,12 @@ extern int pnv_tce_build(struct iommu_table *tbl, long 
index, long npages,
 extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
 extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
unsigned long *hpa, enum dma_data_direction *direction);
+extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index);
 extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
 
 extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
__u32 page_shift, __u64 window_size, __u32 levels,
-   struct iommu_table *tbl);
+   bool alloc_userspace_copy, struct iommu_table *tbl);
 extern void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
 
 extern long pnv_pci_link_table_and_group(int node, int num,
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 841aef7..8cc1caf 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -206,10 +206,6 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
/* it_userspace allocation might be delayed */
return H_TOO_HARD;
 
-   pua = (void *) vmalloc_to_phys(pua);
-   if (WARN_ON_ONCE_RM(!pua))
-   return H_HARDWARE;
-
mem = mm_iommu_lookup_rm(kvm->mm, be64_to_cpu(*pua), pgsize);
if (!mem)
return H_TOO_HARD;
@@ -283,10 +279,6 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, 
struct iommu_table *tbl,
&hpa)))
return H_HARDWARE;
 
-   pua = (void *) vmalloc_to_phys(pua);
-   if (WARN_ON_ONCE_RM(!pua))
-   return H_HARDWARE;
-
if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
return H_CLOSED;
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c 
b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index 700ceb1..f14b282 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -31,9 +31,9 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
tbl->it_type = TCE_PCI;
 }
 
-static __be64 *pnv_tce(struct iommu_table *tbl, long idx)
+static __be64 *pnv_tce(struct iomm

[PATCH kernel v3 1/6] powerpc/powernv: Remove useless wrapper

2018-07-03 Thread Alexey Kardashevskiy
This gets rid of a useless wrapper around
pnv_pci_ioda2_table_free_pages().

Reviewed-by: David Gibson 
Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index cc5942d..02275a0 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2199,11 +2199,6 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, 
long index,
pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
 }
 
-static void pnv_ioda2_table_free(struct iommu_table *tbl)
-{
-   pnv_pci_ioda2_table_free_pages(tbl);
-}
-
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
.set = pnv_ioda2_tce_build,
 #ifdef CONFIG_IOMMU_API
@@ -2212,7 +2207,7 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 #endif
.clear = pnv_ioda2_tce_free,
.get = pnv_tce_get,
-   .free = pnv_ioda2_table_free,
+   .free = pnv_pci_ioda2_table_free_pages,
 };
 
 static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
-- 
2.11.0



Re: [PATCH 2/3] powerpc/powernv: DMA operations for discontiguous allocation

2018-07-03 Thread Russell Currey
On Fri, 2018-06-29 at 17:34 +1000, Russell Currey wrote:



> + /*
> +  * The TCE isn't being used, so let's try and
> allocate it.
> +  * Bits 0 and 1 are read/write, and we use bit 2 as
> a "lock"
> +  * bit.  This is to prevent any race where the value
> is set in
> +  * the TCE table but the invalidate/mb() hasn't
> finished yet.
> +  */
> + entry = cpu_to_be64((addr - offset) | 7);
> + ret = cmpxchg(&pe->tces[i], tce, entry);
> + if (ret != tce) {
> + /* conflict, start looking again just in
> case */
> + i--;
> + continue;
> + }
> + pnv_pci_phb3_tce_invalidate(pe, 0, 0, addr - offset,
> 1);

This is wrong and won't work outside of PHB3, will make a generic
handler

> + mb();
> + /* clear the lock bit now that we know it's active
> */
> + ret = cmpxchg(&pe->tces[i], entry, cpu_to_be64((addr
> - offset) | 3));
> + if (ret != entry) {
> + /* conflict, start looking again just in
> case */
> + i--;
> + continue;
> + }
> +
> + return (i << phb->ioda.max_tce_order) | offset;
> + }
> + /* If we get here, the table must be full, so error out. */
> + return -1ULL;
> +}
> +

qtpass.desktop
Description: application/desktop


[PATCH kernel v3 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-07-03 Thread Alexey Kardashevskiy
A VM which has:
 - a DMA capable device passed through to it (eg. network card);
 - running a malicious kernel that ignores H_PUT_TCE failure;
 - capability of using IOMMU pages bigger that physical pages
can create an IOMMU mapping that exposes (for example) 16MB of
the host physical memory to the device when only 64K was allocated to the VM.

The remaining 16MB - 64K will be some other content of host memory, possibly
including pages of the VM, but also pages of host kernel memory, host
programs or other VMs.

The attacking VM does not control the location of the page it can map,
and is only allowed to map as many pages as it has pages of RAM.

We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
an IOMMU page is contained in the physical page so the PCI hardware won't
get access to unassigned host memory; however this check is missing in
the KVM fastpath (H_PUT_TCE accelerated code). We were lucky so far and
did not hit this yet as the very first time when the mapping happens
we do not have tbl::it_userspace allocated yet and fall back to
the userspace which in turn calls VFIO IOMMU driver, this fails and
the guest does not retry,

This stores the smallest preregistered page size in the preregistered
region descriptor and changes the mm_iommu_xxx API to check this against
the IOMMU page size. This only allows huge pages use if the entire
preregistered block is backed with huge pages which are completely
contained the preregistered chunk; otherwise this defaults to PAGE_SIZE.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v3:
* fixed upper limit for the page size
* added checks that we don't register parts of a huge page

v2:
* explicitely check for compound pages before calling compound_order()

---
The bug is: run QEMU _without_ hugepages (no -mempath) and tell it to
advertise 16MB pages to the guest; a typical pseries guest will use 16MB
for IOMMU pages without checking the mmu pagesize and this will fail
at 
https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256

With the change, mapping will fail in KVM and the guest will print:

mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 800 2000 18 1f 
returned 0 (liobn = 0x8001 starting addr = 800 0)
mlx5_core :00:00.0: created tce table LIOBN 0x8001 for 
/pci@8002000/ethernet@0
mlx5_core :00:00.0: failed to map direct window for 
/pci@8002000/ethernet@0: -1
---
 arch/powerpc/include/asm/mmu_context.h |  4 ++--
 arch/powerpc/kvm/book3s_64_vio.c   |  2 +-
 arch/powerpc/kvm/book3s_64_vio_hv.c|  6 +++--
 arch/powerpc/mm/mmu_context_iommu.c| 41 +++---
 drivers/vfio/vfio_iommu_spapr_tce.c|  2 +-
 5 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 896efa5..79d570c 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -35,9 +35,9 @@ extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
 extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
unsigned long ua, unsigned long entries);
 extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
-   unsigned long ua, unsigned long *hpa);
+   unsigned long ua, unsigned int pageshift, unsigned long *hpa);
 extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
-   unsigned long ua, unsigned long *hpa);
+   unsigned long ua, unsigned int pageshift, unsigned long *hpa);
 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
 #endif
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index d066e37..8c456fa 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -449,7 +449,7 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct 
iommu_table *tbl,
/* This only handles v2 IOMMU type, v1 is handled via ioctl() */
return H_TOO_HARD;
 
-   if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, &hpa)))
+   if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
return H_HARDWARE;
 
if (mm_iommu_mapped_inc(mem))
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 925fc31..5b298f5 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -279,7 +279,8 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, 
struct iommu_table *tbl,
if (!mem)
return H_TOO_HARD;
 
-   if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa)))
+   if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift,
+   &hpa)))
re

[PATCH kernel v3 0/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-07-03 Thread Alexey Kardashevskiy


This is to improve page boundaries checking and should probably
be cc:stable. I came accross this while debugging nvlink2 passthrough
but the lack of checking might be exploited by the existing userspace.

Changes:
v3:
* enforced huge pages not to cross preregistered chunk boundaries

v2:
* 2/2: explicitly check for compound pages before calling compound_order()


This is based on sha1
021c917 Linus Torvalds "Linux 4.18-rc3".

Please comment. Thanks.



Alexey Kardashevskiy (2):
  vfio/spapr: Use IOMMU pageshift rather than pagesize
  KVM: PPC: Check if IOMMU page is contained in the pinned physical page

 arch/powerpc/include/asm/mmu_context.h |  4 ++--
 arch/powerpc/kvm/book3s_64_vio.c   |  2 +-
 arch/powerpc/kvm/book3s_64_vio_hv.c|  6 +++--
 arch/powerpc/mm/mmu_context_iommu.c| 41 +++---
 drivers/vfio/vfio_iommu_spapr_tce.c| 10 -
 5 files changed, 50 insertions(+), 13 deletions(-)

-- 
2.11.0



[PATCH kernel v3 1/2] vfio/spapr: Use IOMMU pageshift rather than pagesize

2018-07-03 Thread Alexey Kardashevskiy
The size is always equal to 1 page so let's use this. Later on this will
be used for other checks which use page shifts to check the granularity
of access.

This should cause no behavioral change.

Reviewed-by: David Gibson 
Acked-by: Alex Williamson 
Signed-off-by: Alexey Kardashevskiy 
---

As Alex suggested, this should go via the ppc tree which the next patch
is going to (which is ppc-kvm, probably?).

---
 drivers/vfio/vfio_iommu_spapr_tce.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 759a5bd..2da5f05 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -457,13 +457,13 @@ static void tce_iommu_unuse_page(struct tce_container 
*container,
 }
 
 static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
-   unsigned long tce, unsigned long size,
+   unsigned long tce, unsigned long shift,
unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
 {
long ret = 0;
struct mm_iommu_table_group_mem_t *mem;
 
-   mem = mm_iommu_lookup(container->mm, tce, size);
+   mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
if (!mem)
return -EINVAL;
 
@@ -487,7 +487,7 @@ static void tce_iommu_unuse_page_v2(struct tce_container 
*container,
if (!pua)
return;
 
-   ret = tce_iommu_prereg_ua_to_hpa(container, *pua, IOMMU_PAGE_SIZE(tbl),
+   ret = tce_iommu_prereg_ua_to_hpa(container, *pua, tbl->it_page_shift,
&hpa, &mem);
if (ret)
pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
@@ -611,7 +611,7 @@ static long tce_iommu_build_v2(struct tce_container 
*container,
entry + i);
 
ret = tce_iommu_prereg_ua_to_hpa(container,
-   tce, IOMMU_PAGE_SIZE(tbl), &hpa, &mem);
+   tce, tbl->it_page_shift, &hpa, &mem);
if (ret)
break;
 
-- 
2.11.0



Re: [PATCHv3 3/4] drivers/base: clean up the usage of devices_kset_move_last()

2018-07-03 Thread Pingfan Liu
On Tue, Jul 3, 2018 at 10:28 PM Rafael J. Wysocki  wrote:
>
> On Tuesday, July 3, 2018 8:50:41 AM CEST Pingfan Liu wrote:
> > Clean up the referring to the code in commit 52cdbdd49853 ("driver core:
> > correct device's shutdown order"). So later we can revert it safely.
> >
> > Cc: Greg Kroah-Hartman 
> > Cc: Rafael J. Wysocki 
> > Cc: Grygorii Strashko 
> > Cc: Christoph Hellwig 
> > Cc: Bjorn Helgaas 
> > Cc: Dave Young 
> > Cc: linux-...@vger.kernel.org
> > Cc: linuxppc-dev@lists.ozlabs.org
> > Signed-off-by: Pingfan Liu 
> > ---
> >  drivers/base/core.c | 7 ---
> >  1 file changed, 7 deletions(-)
> >
> > diff --git a/drivers/base/core.c b/drivers/base/core.c
> > index 684b994..db3deb8 100644
> > --- a/drivers/base/core.c
> > +++ b/drivers/base/core.c
> > @@ -127,13 +127,6 @@ static int device_reorder_to_tail(struct device *dev, 
> > void *not_used)
> >  {
> >   struct device_link *link;
> >
> > - /*
> > -  * Devices that have not been registered yet will be put to the ends
> > -  * of the lists during the registration, so skip them here.
> > -  */
> > - if (device_is_registered(dev))
> > - devices_kset_move_last(dev);
> > -
> >   if (device_pm_initialized(dev))
> >   device_pm_move_last(dev);
>
> You can't do this.
>
> If you do it, that will break power management in some situations.
>
Could you shed light on it? I had a quick browsing of pm code, but it
is a big function, and I got lost in it.
If the above code causes failure, then does it imply that the seq in
devices_kset should be the same as dpm_list? But in device_shutdown(),
it only intersect with pm by
pm_runtime_get_noresume(dev) and pm_runtime_barrier(dev). How do these
function affect the seq in dpm_list? Need your help to see how to
handle this issue.

Thanks and regards,
Pingfan


Re: [PATCH v5 1/4] resource: Move reparent_resources() to kernel/resource.c and make it public

2018-07-03 Thread Baoquan He
On 07/03/18 at 11:57pm, Andy Shevchenko wrote:
> On Tue, Jul 3, 2018 at 5:55 PM, Baoquan He  wrote:
> > On 06/12/18 at 05:24pm, Andy Shevchenko wrote:
> >> On Tue, Jun 12, 2018 at 5:20 PM, Andy Shevchenko
> >>  wrote:
> 
> >> > I briefly looked at the code and error codes we have, so, my proposal
> >> > is one of the following
> >>
> >> >  - use -ECANCELED (not the best choice for first occurrence here,
> >> > though I can't find better)
> >>
> >> Actually -ENOTSUPP might suit the first case (although the actual
> >> would be something like -EOVERLAP, which we don't have)
> >
> > Sorry for late reply, and many thanks for your great suggestion.
> >
> 
> > I am fine to use -ENOTSUPP as the first returned value, and -ECANCELED
> > for the 2nd one.
> 
> I have no strong opinion, but I like (slightly better) this approach ^^^

Done, post v6 in this way, many thanks.

> 
> > Or define an enum as you suggested inside the function
> > or in header file.
> 
> >
> > Or use -EBUSY for the first case because existing resource is
> > overlapping but not fully contained by 'res'; and -EINVAL for
> > the 2nd case since didn't find any one resources which is contained by
> > 'res', means we passed in a invalid resource.
> >
> > All is fine to me, I can repost with each of them.
> 
> >> >  - use positive integers (or enum), like
> >> >   #define RES_REPARENTED 0
> >> >   #define RES_OVERLAPPED 1
> >> >   #define RES_NOCONFLICT 2
> 
> -- 
> With Best Regards,
> Andy Shevchenko


[PATCH v6 4/4] kexec_file: Load kernel at top of system RAM if required

2018-07-03 Thread Baoquan He
For kexec_file loading, if kexec_buf.top_down is 'true', the memory which
is used to load kernel/initrd/purgatory is supposed to be allocated from
top to down. This is what we have been doing all along in the old kexec
loading interface and the kexec loading is still default setting in some
distributions. However, the current kexec_file loading interface doesn't
do likt this. The function arch_kexec_walk_mem() it calls ignores checking
kexec_buf.top_down, but calls walk_system_ram_res() directly to go through
all resources of System RAM from bottom to up, to try to find memory region
which can contain the specific kexec buffer, then call 
locate_mem_hole_callback()
to allocate memory in that found memory region from top to down. This brings
confusion especially when KASLR is widely supported , users have to make clear
why kexec/kdump kernel loading position is different between these two
interfaces in order to exclude unnecessary noises. Hence these two interfaces
need be unified on behaviour.

Here add checking if kexec_buf.top_down is 'true' in arch_kexec_walk_mem(),
if yes, call the newly added walk_system_ram_res_rev() to find memory region
from top to down to load kernel.

Signed-off-by: Baoquan He 
Cc: Eric Biederman 
Cc: Vivek Goyal 
Cc: Dave Young 
Cc: Andrew Morton 
Cc: Yinghai Lu 
Cc: ke...@lists.infradead.org
---
 kernel/kexec_file.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index c6a3b6851372..75226c1d08ce 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -518,6 +518,8 @@ int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
   IORESOURCE_SYSTEM_RAM | 
IORESOURCE_BUSY,
   crashk_res.start, crashk_res.end,
   kbuf, func);
+   else if (kbuf->top_down)
+   return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func);
else
return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
 }
-- 
2.13.6



[PATCH v6 3/4] resource: add walk_system_ram_res_rev()

2018-07-03 Thread Baoquan He
This function, being a variant of walk_system_ram_res() introduced in
commit 8c86e70acead ("resource: provide new functions to walk through
resources"), walks through a list of all the resources of System RAM
in reversed order, i.e., from higher to lower.

It will be used in kexec_file code.

Signed-off-by: Baoquan He 
Cc: Andrew Morton 
Cc: Thomas Gleixner 
Cc: Brijesh Singh 
Cc: "Jérôme Glisse" 
Cc: Borislav Petkov 
Cc: Tom Lendacky 
Cc: Wei Yang 
---
 include/linux/ioport.h |  3 +++
 kernel/resource.c  | 40 
 2 files changed, 43 insertions(+)

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index b7456ae889dd..066cc263e2cc 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -279,6 +279,9 @@ extern int
 walk_system_ram_res(u64 start, u64 end, void *arg,
int (*func)(struct resource *, void *));
 extern int
+walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+   int (*func)(struct resource *, void *));
+extern int
 walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 
end,
void *arg, int (*func)(struct resource *, void *));
 
diff --git a/kernel/resource.c b/kernel/resource.c
index 6d647a3824b1..4c5fbef4ea24 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -23,6 +23,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 
 
@@ -443,6 +445,44 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
 }
 
 /*
+ * This function, being a variant of walk_system_ram_res(), calls the @func
+ * callback against all memory ranges of type System RAM which are marked as
+ * IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY in reversed order, i.e., from
+ * higher to lower.
+ */
+int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+   int (*func)(struct resource *, void *))
+{
+   unsigned long flags;
+   struct resource *res;
+   int ret = -1;
+
+   flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+   read_lock(&resource_lock);
+   list_for_each_entry_reverse(res, &iomem_resource.child, sibling) {
+   if (start >= end)
+   break;
+   if ((res->flags & flags) != flags)
+   continue;
+   if (res->desc != IORES_DESC_NONE)
+   continue;
+   if (res->end < start)
+   break;
+
+   if ((res->end >= start) && (res->start < end)) {
+   ret = (*func)(res, arg);
+   if (ret)
+   break;
+   }
+   end = res->start - 1;
+
+   }
+   read_unlock(&resource_lock);
+   return ret;
+}
+
+/*
  * This function calls the @func callback against all memory ranges, which
  * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY.
  */
-- 
2.13.6



[PATCH v6 2/4] resource: Use list_head to link sibling resource

2018-07-03 Thread Baoquan He
The struct resource uses singly linked list to link siblings, implemented
by pointer operation. Replace it with list_head for better code readability.

Based on this list_head replacement, it will be very easy to do reverse
iteration on iomem_resource's sibling list in later patch.

Besides, type of member variables of struct resource, sibling and child, are
changed from 'struct resource *' to 'struct list_head'. This brings two
pointers of size increase.

Suggested-by: Andrew Morton 
Signed-off-by: Baoquan He 
Cc: Patrik Jakobsson 
Cc: David Airlie 
Cc: "K. Y. Srinivasan" 
Cc: Haiyang Zhang 
Cc: Stephen Hemminger 
Cc: Dmitry Torokhov 
Cc: Dan Williams 
Cc: Rob Herring 
Cc: Frank Rowand 
Cc: Keith Busch 
Cc: Jonathan Derrick 
Cc: Lorenzo Pieralisi 
Cc: Bjorn Helgaas 
Cc: Thomas Gleixner 
Cc: Brijesh Singh 
Cc: "Jérôme Glisse" 
Cc: Borislav Petkov 
Cc: Tom Lendacky 
Cc: Greg Kroah-Hartman 
Cc: Yaowei Bai 
Cc: Wei Yang 
Cc: de...@linuxdriverproject.org
Cc: linux-in...@vger.kernel.org
Cc: linux-nvd...@lists.01.org
Cc: devicet...@vger.kernel.org
Cc: linux-...@vger.kernel.org
---
 arch/arm/plat-samsung/pm-check.c|   6 +-
 arch/microblaze/pci/pci-common.c|   4 +-
 arch/powerpc/kernel/pci-common.c|   4 +-
 arch/sparc/kernel/ioport.c  |   2 +-
 arch/xtensa/include/asm/pci-bridge.h|   4 +-
 drivers/eisa/eisa-bus.c |   2 +
 drivers/gpu/drm/drm_memory.c|   3 +-
 drivers/gpu/drm/gma500/gtt.c|   5 +-
 drivers/hv/vmbus_drv.c  |  52 +++
 drivers/input/joystick/iforce/iforce-main.c |   4 +-
 drivers/nvdimm/namespace_devs.c |   6 +-
 drivers/nvdimm/nd.h |   5 +-
 drivers/of/address.c|   4 +-
 drivers/parisc/lba_pci.c|   4 +-
 drivers/pci/controller/vmd.c|   8 +-
 drivers/pci/probe.c |   2 +
 drivers/pci/setup-bus.c |   2 +-
 include/linux/ioport.h  |  17 ++-
 kernel/resource.c   | 208 ++--
 19 files changed, 175 insertions(+), 167 deletions(-)

diff --git a/arch/arm/plat-samsung/pm-check.c b/arch/arm/plat-samsung/pm-check.c
index cd2c02c68bc3..5494355b1c49 100644
--- a/arch/arm/plat-samsung/pm-check.c
+++ b/arch/arm/plat-samsung/pm-check.c
@@ -46,8 +46,8 @@ typedef u32 *(run_fn_t)(struct resource *ptr, u32 *arg);
 static void s3c_pm_run_res(struct resource *ptr, run_fn_t fn, u32 *arg)
 {
while (ptr != NULL) {
-   if (ptr->child != NULL)
-   s3c_pm_run_res(ptr->child, fn, arg);
+   if (!list_empty(&ptr->child))
+   s3c_pm_run_res(resource_first_child(&ptr->child), fn, 
arg);
 
if ((ptr->flags & IORESOURCE_SYSTEM_RAM)
== IORESOURCE_SYSTEM_RAM) {
@@ -57,7 +57,7 @@ static void s3c_pm_run_res(struct resource *ptr, run_fn_t fn, 
u32 *arg)
arg = (fn)(ptr, arg);
}
 
-   ptr = ptr->sibling;
+   ptr = resource_sibling(ptr);
}
 }
 
diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index 7899bafab064..2bf73e27e231 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -533,7 +533,9 @@ void pci_process_bridge_OF_ranges(struct pci_controller 
*hose,
res->flags = range.flags;
res->start = range.cpu_addr;
res->end = range.cpu_addr + range.size - 1;
-   res->parent = res->child = res->sibling = NULL;
+   res->parent = NULL;
+   INIT_LIST_HEAD(&res->child);
+   INIT_LIST_HEAD(&res->sibling);
}
}
 
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 926035bb378d..28fbe83c9daf 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -761,7 +761,9 @@ void pci_process_bridge_OF_ranges(struct pci_controller 
*hose,
res->flags = range.flags;
res->start = range.cpu_addr;
res->end = range.cpu_addr + range.size - 1;
-   res->parent = res->child = res->sibling = NULL;
+   res->parent = NULL;
+   INIT_LIST_HEAD(&res->child);
+   INIT_LIST_HEAD(&res->sibling);
}
}
 }
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index cca9134cfa7d..99efe4e98b16 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -669,7 +669,7 @@ static int sparc_io_proc_show(struct seq_file *m, void *v)
struct resource *root = m->private, *r;
const char *nm;
 
-   for (r = root->child; r != NULL; r = r->sibling) {
+  

[PATCH v6 1/4] resource: Move reparent_resources() to kernel/resource.c and make it public

2018-07-03 Thread Baoquan He
reparent_resources() is duplicated in arch/microblaze/pci/pci-common.c
and arch/powerpc/kernel/pci-common.c, so move it to kernel/resource.c
so that it's shared.

Signed-off-by: Baoquan He 
---
 arch/microblaze/pci/pci-common.c | 37 -
 arch/powerpc/kernel/pci-common.c | 35 ---
 include/linux/ioport.h   |  1 +
 kernel/resource.c| 39 +++
 4 files changed, 40 insertions(+), 72 deletions(-)

diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index f34346d56095..7899bafab064 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -619,43 +619,6 @@ int pcibios_add_device(struct pci_dev *dev)
 EXPORT_SYMBOL(pcibios_add_device);
 
 /*
- * Reparent resource children of pr that conflict with res
- * under res, and make res replace those children.
- */
-static int __init reparent_resources(struct resource *parent,
-struct resource *res)
-{
-   struct resource *p, **pp;
-   struct resource **firstpp = NULL;
-
-   for (pp = &parent->child; (p = *pp) != NULL; pp = &p->sibling) {
-   if (p->end < res->start)
-   continue;
-   if (res->end < p->start)
-   break;
-   if (p->start < res->start || p->end > res->end)
-   return -1;  /* not completely contained */
-   if (firstpp == NULL)
-   firstpp = pp;
-   }
-   if (firstpp == NULL)
-   return -1;  /* didn't find any conflicting entries? */
-   res->parent = parent;
-   res->child = *firstpp;
-   res->sibling = *pp;
-   *firstpp = res;
-   *pp = NULL;
-   for (p = res->child; p != NULL; p = p->sibling) {
-   p->parent = res;
-   pr_debug("PCI: Reparented %s [%llx..%llx] under %s\n",
-p->name,
-(unsigned long long)p->start,
-(unsigned long long)p->end, res->name);
-   }
-   return 0;
-}
-
-/*
  *  Handle resources of PCI devices.  If the world were perfect, we could
  *  just allocate all the resource regions and do nothing more.  It isn't.
  *  On the other hand, we cannot just re-allocate all devices, as it would
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index fe9733aa..926035bb378d 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1088,41 +1088,6 @@ resource_size_t pcibios_align_resource(void *data, const 
struct resource *res,
 EXPORT_SYMBOL(pcibios_align_resource);
 
 /*
- * Reparent resource children of pr that conflict with res
- * under res, and make res replace those children.
- */
-static int reparent_resources(struct resource *parent,
-struct resource *res)
-{
-   struct resource *p, **pp;
-   struct resource **firstpp = NULL;
-
-   for (pp = &parent->child; (p = *pp) != NULL; pp = &p->sibling) {
-   if (p->end < res->start)
-   continue;
-   if (res->end < p->start)
-   break;
-   if (p->start < res->start || p->end > res->end)
-   return -1;  /* not completely contained */
-   if (firstpp == NULL)
-   firstpp = pp;
-   }
-   if (firstpp == NULL)
-   return -1;  /* didn't find any conflicting entries? */
-   res->parent = parent;
-   res->child = *firstpp;
-   res->sibling = *pp;
-   *firstpp = res;
-   *pp = NULL;
-   for (p = res->child; p != NULL; p = p->sibling) {
-   p->parent = res;
-   pr_debug("PCI: Reparented %s %pR under %s\n",
-p->name, p, res->name);
-   }
-   return 0;
-}
-
-/*
  *  Handle resources of PCI devices.  If the world were perfect, we could
  *  just allocate all the resource regions and do nothing more.  It isn't.
  *  On the other hand, we cannot just re-allocate all devices, as it would
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index da0ebaec25f0..dfdcd0bfe54e 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -192,6 +192,7 @@ extern int allocate_resource(struct resource *root, struct 
resource *new,
 struct resource *lookup_resource(struct resource *root, resource_size_t start);
 int adjust_resource(struct resource *res, resource_size_t start,
resource_size_t size);
+int reparent_resources(struct resource *parent, struct resource *res);
 resource_size_t resource_alignment(struct resource *res);
 static inline resource_size_t resource_size(const struct resource *res)
 {
diff --git a/kernel/resource.c b/kernel/resource.c
index 30e1bc68503b..d1cbf4b50e17 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ 

[PATCH v6 0/4] resource: Use list_head to link sibling resource

2018-07-03 Thread Baoquan He
This patchset is doing:
1) Replace struct resource's sibling list from singly linked list to
list_head. Clearing out those pointer operation within singly linked
list for better code readability.
2) Based on list_head replacement, add a new function
walk_system_ram_res_rev() which can does reversed iteration on
iomem_resource's siblings.
3) Change kexec_file loading to search system RAM top down for kernel
loadin, using walk_system_ram_res_rev().

Note:
This patchset only passed testing on  x86_64 arch with network
enabling. The thing we need pay attetion to is that a root resource's
child member need be initialized specifically with LIST_HEAD_INIT() if
statically defined or INIT_LIST_HEAD() for dynamically definition. Here
Just like we do for iomem_resource/ioport_resource, or the change in
get_pci_domain_busn_res().

v5:
http://lkml.kernel.org/r/20180612032831.29747-1-...@redhat.com

v4:
http://lkml.kernel.org/r/20180507063224.24229-1-...@redhat.com

v3:
http://lkml.kernel.org/r/20180419001848.3041-1-...@redhat.com

v2:
http://lkml.kernel.org/r/20180408024724.16812-1-...@redhat.com

v1:
http://lkml.kernel.org/r/20180322033722.9279-1-...@redhat.com

Changelog:
v5->v6:
  Fix code style problems in reparent_resources() and use existing
  error codes, according to Andy's suggestion.

  Fix bugs test robot reported.
  
v4->v5:
  Add new patch 0001 to move duplicated reparent_resources() to
  kernel/resource.c to make it be shared by different ARCH-es.

  Fix several code bugs reported by test robot on ARCH powerpc and
  microblaze.
v3->v4:
  Fix several bugs test robot reported. Rewrite cover letter and patch
  log according to reviewer's comment.

v2->v3:
  Rename resource functions first_child() and sibling() to
  resource_first_chils() and resource_sibling(). Dan suggested this.

  Move resource_first_chils() and resource_sibling() to linux/ioport.h
  and make them as inline function. Rob suggested this. Accordingly add
  linux/list.h including in linux/ioport.h, please help review if this
  bring efficiency degradation or code redundancy.

  The change on struct resource {} bring two pointers of size increase,
  mention this in git log to make it more specifically, Rob suggested
  this.

v1->v2:
  Use list_head instead to link resource siblings. This is suggested by
  Andrew.

  Rewrite walk_system_ram_res_rev() after list_head is taken to link
  resouce siblings.



Baoquan He (4):
  resource: Move reparent_resources() to kernel/resource.c and make it
public
  resource: Use list_head to link sibling resource
  resource: add walk_system_ram_res_rev()
  kexec_file: Load kernel at top of system RAM if required

 arch/arm/plat-samsung/pm-check.c|   6 +-
 arch/microblaze/pci/pci-common.c|  41 +
 arch/powerpc/kernel/pci-common.c|  39 +
 arch/sparc/kernel/ioport.c  |   2 +-
 arch/xtensa/include/asm/pci-bridge.h|   4 +-
 drivers/eisa/eisa-bus.c |   2 +
 drivers/gpu/drm/drm_memory.c|   3 +-
 drivers/gpu/drm/gma500/gtt.c|   5 +-
 drivers/hv/vmbus_drv.c  |  52 +++---
 drivers/input/joystick/iforce/iforce-main.c |   4 +-
 drivers/nvdimm/namespace_devs.c |   6 +-
 drivers/nvdimm/nd.h |   5 +-
 drivers/of/address.c|   4 +-
 drivers/parisc/lba_pci.c|   4 +-
 drivers/pci/controller/vmd.c|   8 +-
 drivers/pci/probe.c |   2 +
 drivers/pci/setup-bus.c |   2 +-
 include/linux/ioport.h  |  21 ++-
 kernel/kexec_file.c |   2 +
 kernel/resource.c   | 263 ++--
 20 files changed, 248 insertions(+), 227 deletions(-)

-- 
2.13.6



Re: [PATCHv5 1/4] kbuild: Add build salt to the kernel and modules

2018-07-03 Thread Masahiro Yamada
Hi.

Thanks for the update.


2018-07-04 8:34 GMT+09:00 Laura Abbott :
>
> The build id generated from --build-id can be generated in several different
> ways, with the default being the sha1 on the output of the linked file. For
> distributions, it can be useful to make sure this ID is unique, even if the
> actual file contents don't change. The easiest way to do this is to insert
> a section with some data.
>
> Add an ELF note to both the kernel and module which contains some data based
> off of a config option.
>
> Signed-off-by: Masahiro Yamada 
> Signed-off-by: Laura Abbott 
> ---
> v5: I used S-o-b here since the majority of the code was written
> already.


I think Suggested-by is good enough.
S-o-b is appended as a patch is passed from people to people.

Anyway, this looks good except one bike-shed.

> Please feel free to change the tag if you think it's not
> appropriate. I also tweaked this to take an ascii string instead of just
> a hex value since this makes things much easier on the distribution
> side.
> ---


> diff --git a/init/Kconfig b/init/Kconfig
> index 041f3a022122..8de789f40db9 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -107,6 +107,15 @@ config LOCALVERSION_AUTO
>
>   which is done within the script "scripts/setlocalversion".)
>
> +config BUILD_SALT
> +   string "Build ID Salt"
> +   default "Linux"


How about empty string ""
for default?




-- 
Best Regards
Masahiro Yamada


Re: [PATCHv5 0/4] Salted build ids via ELF notes

2018-07-03 Thread Masahiro Yamada
Hi.

2018-07-04 8:21 GMT+09:00 Laura Abbott :
>
> Hi,
>
> This is v5 of the series to allow unique build ids in the kernel. As a
> reminder of the context:




> ""
> In Fedora, the debug information is packaged separately (foo-debuginfo) and
> can be installed separately. There's been a long standing issue where only one
> version of a debuginfo info package can be installed at a time. Mark Wielaard
> made an effort for Fedora 27 to allow parallel installation of debuginfo (see
> https://fedoraproject.org/wiki/Changes/ParallelInstallableDebuginfo for
> more details)
>
> Part of the requirement to allow this to work is that build ids are
> unique between builds. The existing upstream rpm implementation ensures
> this by re-calculating the build-id using the version and release as a
> seed. This doesn't work 100% for the kernel because of the vDSO which is
> its own binary and doesn't get updated. After poking holes in a few of my
> ideas, there was a discussion with some people from the binutils team about
> adding --build-id-salt to let ld do the calculation debugedit is doing. There
> was a counter proposal made to add in the salt while building. The
> easiest proposal was to add an item in the linker script vs. linking in
> an object since we need the salt to go in every module as well as the
> kernel and vmlinux.
> ""

I think this information is helpful to explain the background
of this work, but the cover letter cannot be committed in git.

Could you add this in 1/4 please?


If I read only the simple log in 1/4,
I would wonder why it is useful...







> v5 uses the approach suggested by Masahiro Yamada which uses the
> existing ELF note macro to more easily add the salt (vs previous
> approaches which tried to adjust via linker section).
>
> If arch maintainers are okay, I'd like acks for this so this can go
> through the kbuild tree.
>
> Thanks,
> Laura
>
> Laura Abbott (4):
>   kbuild: Add build salt to the kernel and modules
>   x86: Add build salt to the vDSO
>   powerpc: Add build salt to the vDSO
>   arm64: Add build salt to the vDSO
>
>  arch/arm64/kernel/vdso/note.S |  3 +++
>  arch/powerpc/kernel/vdso32/note.S |  3 +++
>  arch/x86/entry/vdso/vdso-note.S   |  3 +++
>  arch/x86/entry/vdso/vdso32/note.S |  3 +++
>  include/linux/build-salt.h| 20 
>  init/Kconfig  |  9 +
>  init/version.c|  3 +++
>  scripts/mod/modpost.c |  3 +++
>  8 files changed, 47 insertions(+)
>  create mode 100644 include/linux/build-salt.h
>
> --
> 2.17.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kbuild" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Best Regards
Masahiro Yamada


Re: [PATCHv5 4/4] arm64: Add build salt to the vDSO

2018-07-03 Thread Masahiro Yamada
Hi.

2018-07-04 8:34 GMT+09:00 Laura Abbott :
>
> The vDSO needs to have a unique build id in a similar manner
> to the kernel and modules. Use the build salt macro.
>
> Signed-off-by: Laura Abbott 
> ---
> v5: I was previously focused on x86 only but since powerpc gave a patch,
> I figured I would do arm64 since the changes were also fairly simple.
> ---
>  arch/arm64/kernel/vdso/note.S | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/arch/arm64/kernel/vdso/note.S b/arch/arm64/kernel/vdso/note.S
> index b82c85e5d972..2c429dfd3f45 100644
> --- a/arch/arm64/kernel/vdso/note.S
> +++ b/arch/arm64/kernel/vdso/note.S
> @@ -22,7 +22,10 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  ELFNOTE_START(Linux, 0, "a")
> .long LINUX_VERSION_CODE
>  ELFNOTE_END
> +
> +BUILD_SALT;



I think this works, but
I prefer no-semicolon in assembly files.

For coding consistency,
I want ';' as statement delimiter in .c files.
But, only new line after each statement in .S files.

For example, in arch/x86/xen/xen-head.S
I see no semicolon after ELFNOTE().

I found this:
http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0473k/dom1359731141352.html
It says ';' starts a comment line
although it is not the case of GAS.


Same for 3/4.



-- 
Best Regards
Masahiro Yamada


RE: [PATCH v11 00/26] Speculative page faults

2018-07-03 Thread Song, HaiyanX
Hi Laurent,


For the test result on Intel 4s skylake platform (192 CPUs, 768G Memory), the 
below test cases all were run 3 times.
I check the test results, only page_fault3_thread/enable THP have 6% stddev for 
head commit, other tests have lower stddev.

And I did not find other high variation on test case result.

a). Enable THP
testcase  base stddev   change  head 
stddev metric
page_fault3/enable THP   10519  ± 3%-20.5%  8368  
±6%  will-it-scale.per_thread_ops
page_fault2/enalbe THP8281  ± 2%-18.8%  6728
   will-it-scale.per_thread_ops
brk1/eanble THP 998475   -2.2%976893
   will-it-scale.per_process_ops
context_switch1/enable THP  223910   -1.3%220930
   will-it-scale.per_process_ops
context_switch1/enable THP  233722   -1.0%231288
   will-it-scale.per_thread_ops

b). Disable THP
page_fault3/disable THP  10856  -23.1%  8344
   will-it-scale.per_thread_ops
page_fault2/disable THP   8147  -18.8%  6613
   will-it-scale.per_thread_ops
brk1/disable THP   957-7.9%  881
   will-it-scale.per_thread_ops
context_switch1/disable THP 237006-2.2%231907   
   will-it-scale.per_thread_ops
brk1/disable THP997317-2.0%98   
   will-it-scale.per_process_ops
page_fault3/disable THP 467454-1.8%459251   
   will-it-scale.per_process_ops
context_switch1/disable THP 224431-1.3%221567   
   will-it-scale.per_process_ops


Best regards,
Haiyan Song

From: Laurent Dufour [lduf...@linux.vnet.ibm.com]
Sent: Monday, July 02, 2018 4:59 PM
To: Song, HaiyanX
Cc: a...@linux-foundation.org; mho...@kernel.org; pet...@infradead.org; 
kir...@shutemov.name; a...@linux.intel.com; d...@stgolabs.net; j...@suse.cz; 
Matthew Wilcox; khand...@linux.vnet.ibm.com; aneesh.ku...@linux.vnet.ibm.com; 
b...@kernel.crashing.org; m...@ellerman.id.au; pau...@samba.org; Thomas 
Gleixner; Ingo Molnar; h...@zytor.com; Will Deacon; Sergey Senozhatsky; 
sergey.senozhatsky.w...@gmail.com; Andrea Arcangeli; Alexei Starovoitov; Wang, 
Kemi; Daniel Jordan; David Rientjes; Jerome Glisse; Ganesh Mahendran; Minchan 
Kim; Punit Agrawal; vinayak menon; Yang Shi; linux-ker...@vger.kernel.org; 
linux...@kvack.org; ha...@linux.vnet.ibm.com; npig...@gmail.com; 
bsinghar...@gmail.com; paul...@linux.vnet.ibm.com; Tim Chen; 
linuxppc-dev@lists.ozlabs.org; x...@kernel.org
Subject: Re: [PATCH v11 00/26] Speculative page faults

On 11/06/2018 09:49, Song, HaiyanX wrote:
> Hi Laurent,
>
> Regression test for v11 patch serials have been run, some regression is found 
> by LKP-tools (linux kernel performance)
> tested on Intel 4s skylake platform. This time only test the cases which have 
> been run and found regressions on
> V9 patch serials.
>
> The regression result is sorted by the metric will-it-scale.per_thread_ops.
> branch: Laurent-Dufour/Speculative-page-faults/20180520-045126
> commit id:
>   head commit : a7a8993bfe3ccb54ad468b9f1799649e4ad1ff12
>   base commit : ba98a1cdad71d259a194461b3a61471b49b14df1
> Benchmark: will-it-scale
> Download link: https://github.com/antonblanchard/will-it-scale/tree/master
>
> Metrics:
>   will-it-scale.per_process_ops=processes/nr_cpu
>   will-it-scale.per_thread_ops=threads/nr_cpu
>   test box: lkp-skl-4sp1(nr_cpu=192,memory=768G)
> THP: enable / disable
> nr_task:100%
>
> 1. Regressions:
>
> a). Enable THP
> testcase  base   change  head   
> metric
> page_fault3/enable THP   10519  -20.5%836  
> will-it-scale.per_thread_ops
> page_fault2/enalbe THP8281  -18.8%   6728  
> will-it-scale.per_thread_ops
> brk1/eanble THP 998475   -2.2% 976893  
> will-it-scale.per_process_ops
> context_switch1/enable THP  223910   -1.3% 220930  
> will-it-scale.per_process_ops
> context_switch1/enable THP  233722   -1.0% 231288  
> will-it-scale.per_thread_ops
>
> b). Disable THP
> page_fault3/disable THP  10856  -23.1%   8344  
> will-it-scale.per_thread_ops
> page_fault2/disable THP   8147  -18.8%   6613  
> will-it-scale.per_thread_ops
> brk1/disable THP   957   -7.9%881  
> will-it-scale.per_thread_ops
> context_switch1/disable THP 237006   -2.2% 231907  
> will-it-scale.per_thread_ops
> brk1/disable THP997317   -2.0% 98  
> will-

Re: [PATCHv3 2/4] drivers/base: utilize device tree info to shutdown devices

2018-07-03 Thread Pingfan Liu
On Tue, Jul 3, 2018 at 5:26 PM Pingfan Liu  wrote:
>
> On Tue, Jul 3, 2018 at 3:51 PM Lukas Wunner  wrote:
> >
> > On Tue, Jul 03, 2018 at 02:50:40PM +0800, Pingfan Liu wrote:
> > > commit 52cdbdd49853 ("driver core: correct device's shutdown order")
> > > places an assumption of supplier<-consumer order on the process of probe.
> > > But it turns out to break down the parent <- child order in some scene.
> > > E.g in pci, a bridge is enabled by pci core, and behind it, the devices
> > > have been probed. Then comes the bridge's module, which enables extra
> > > feature(such as hotplug) on this bridge. This will break the
> > > parent<-children order and cause failure when "kexec -e" in some scenario.
> > >
> > > The detailed description of the scenario:
> > > An IBM Power9 machine on which, two drivers portdrv_pci and shpchp(a mod)
> > > match the PCI_CLASS_BRIDGE_PCI, but neither of them success to probe due
> > > to some issue. For this case, the bridge is moved after its children in
> > > devices_kset. Then, when "kexec -e", a ata-disk behind the bridge can not
> > > write back buffer in flight due to the former shutdown of the bridge which
> > > clears the BusMaster bit.
> >
> > If you revert commit cc27b735ad3a ("PCI/portdrv: Turn off PCIe services
> > during shutdown"), does the issue go away?
>
> Yes, it is gone.

Have not figured out why the issue was gone. But I think it just cover
some fault.

re-fetch the boot log of mainline kernel without any patch, and filter
out the pci domain 0004
grep "devices_kset: Moving 0004:" newlog.txt
[2.114986] devices_kset: Moving 0004:00:00.0 to end of list   <---
pcie port drive's probe, but it failed
[2.115192] devices_kset: Moving 0004:01:00.0 to end of list
[2.115591] devices_kset: Moving 0004:02:02.0 to end of list
[2.115923] devices_kset: Moving 0004:02:0a.0 to end of list
[2.116141] devices_kset: Moving 0004:02:0b.0 to end of list
[2.116358] devices_kset: Moving 0004:02:0c.0 to end of list
[3.181860] devices_kset: Moving 0004:03:00.0 to end of list  <---
the ata disk controller which sits behind the bridge
[   10.267081] devices_kset: Moving 0004:00:00.0 to end of list  <---
shpc_probe() on this bridge, failed too.

Hence we have the bridge (parent) after the child in devices_kset.

Thanks,
Pingfan


Re: [PATCHv3 0/4] drivers/base: bugfix for supplier<-consumer ordering in device_kset

2018-07-03 Thread Pingfan Liu
On Tue, Jul 3, 2018 at 10:36 PM Rafael J. Wysocki  wrote:
>
> On Tuesday, July 3, 2018 8:50:38 AM CEST Pingfan Liu wrote:
> > commit 52cdbdd49853 ("driver core: correct device's shutdown order")
> > places an assumption of supplier<-consumer order on the process of probe.
> > But it turns out to break down the parent <- child order in some scene.
> > E.g in pci, a bridge is enabled by pci core, and behind it, the devices
> > have been probed. Then comes the bridge's module, which enables extra
> > feature(such as hotplug) on this bridge.
>
> So what *exactly* does happen in that case?
>
I saw the  shpc_probe() is called on the bridge, although the probing
failed on that bare-metal. But if it success, then it will enable the
hotplug feature on the bridge.

Thanks,
Pingfan


[PATCHv5 4/4] arm64: Add build salt to the vDSO

2018-07-03 Thread Laura Abbott


The vDSO needs to have a unique build id in a similar manner
to the kernel and modules. Use the build salt macro.

Signed-off-by: Laura Abbott 
---
v5: I was previously focused on x86 only but since powerpc gave a patch,
I figured I would do arm64 since the changes were also fairly simple.
---
 arch/arm64/kernel/vdso/note.S | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/kernel/vdso/note.S b/arch/arm64/kernel/vdso/note.S
index b82c85e5d972..2c429dfd3f45 100644
--- a/arch/arm64/kernel/vdso/note.S
+++ b/arch/arm64/kernel/vdso/note.S
@@ -22,7 +22,10 @@
 #include 
 #include 
 #include 
+#include 
 
 ELFNOTE_START(Linux, 0, "a")
.long LINUX_VERSION_CODE
 ELFNOTE_END
+
+BUILD_SALT;
-- 
2.17.1



[PATCHv5 3/4] powerpc: Add build salt to the vDSO

2018-07-03 Thread Laura Abbott


The vDSO needs to have a unique build id in a similar manner
to the kernel and modules. Use the build salt macro.

Signed-off-by: Laura Abbott 
---
v5: New approach with the BUILD_SALT macro
---
 arch/powerpc/kernel/vdso32/note.S | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/kernel/vdso32/note.S 
b/arch/powerpc/kernel/vdso32/note.S
index d4b5be4f3d5f..ec1d05265c75 100644
--- a/arch/powerpc/kernel/vdso32/note.S
+++ b/arch/powerpc/kernel/vdso32/note.S
@@ -5,6 +5,7 @@
 
 #include 
 #include 
+#include 
 
 #define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type)\
.section name, flags; \
@@ -23,3 +24,5 @@
ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0)
.long LINUX_VERSION_CODE
ASM_ELF_NOTE_END
+
+BUILD_SALT;
-- 
2.17.1



[PATCHv5 2/4] x86: Add build salt to the vDSO

2018-07-03 Thread Laura Abbott


The vDSO needs to have a unique build id in a similar manner
to the kernel and modules. Use the build salt macro.

Signed-off-by: Laura Abbott 
---
v5: Switched to using the single line BUILD_SALT macro
---
 arch/x86/entry/vdso/vdso-note.S   | 3 +++
 arch/x86/entry/vdso/vdso32/note.S | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/arch/x86/entry/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S
index 79a071e4357e..79423170118f 100644
--- a/arch/x86/entry/vdso/vdso-note.S
+++ b/arch/x86/entry/vdso/vdso-note.S
@@ -3,6 +3,7 @@
  * Here we can supply some information useful to userland.
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -10,3 +11,5 @@
 ELFNOTE_START(Linux, 0, "a")
.long LINUX_VERSION_CODE
 ELFNOTE_END
+
+BUILD_SALT
diff --git a/arch/x86/entry/vdso/vdso32/note.S 
b/arch/x86/entry/vdso/vdso32/note.S
index 9fd51f206314..e78047d119f6 100644
--- a/arch/x86/entry/vdso/vdso32/note.S
+++ b/arch/x86/entry/vdso/vdso32/note.S
@@ -4,6 +4,7 @@
  * Here we can supply some information useful to userland.
  */
 
+#include 
 #include 
 #include 
 
@@ -14,6 +15,8 @@ ELFNOTE_START(Linux, 0, "a")
.long LINUX_VERSION_CODE
 ELFNOTE_END
 
+BUILD_SALT
+
 #ifdef CONFIG_XEN
 /*
  * Add a special note telling glibc's dynamic linker a fake hardware
-- 
2.17.1



[PATCHv5 1/4] kbuild: Add build salt to the kernel and modules

2018-07-03 Thread Laura Abbott


The build id generated from --build-id can be generated in several different
ways, with the default being the sha1 on the output of the linked file. For
distributions, it can be useful to make sure this ID is unique, even if the
actual file contents don't change. The easiest way to do this is to insert
a section with some data.

Add an ELF note to both the kernel and module which contains some data based
off of a config option.

Signed-off-by: Masahiro Yamada 
Signed-off-by: Laura Abbott 
---
v5: I used S-o-b here since the majority of the code was written
already. Please feel free to change the tag if you think it's not
appropriate. I also tweaked this to take an ascii string instead of just
a hex value since this makes things much easier on the distribution
side.
---
 include/linux/build-salt.h | 20 
 init/Kconfig   |  9 +
 init/version.c |  3 +++
 scripts/mod/modpost.c  |  3 +++
 4 files changed, 35 insertions(+)
 create mode 100644 include/linux/build-salt.h

diff --git a/include/linux/build-salt.h b/include/linux/build-salt.h
new file mode 100644
index ..bb007bd05e7a
--- /dev/null
+++ b/include/linux/build-salt.h
@@ -0,0 +1,20 @@
+#ifndef __BUILD_SALT_H
+#define __BUILD_SALT_H
+
+#include 
+
+#define LINUX_ELFNOTE_BUILD_SALT   0x100
+
+#ifdef __ASSEMBLER__
+
+#define BUILD_SALT \
+   ELFNOTE(Linux, LINUX_ELFNOTE_BUILD_SALT, .asciz CONFIG_BUILD_SALT)
+
+#else
+
+#define BUILD_SALT \
+   ELFNOTE32("Linux", LINUX_ELFNOTE_BUILD_SALT, CONFIG_BUILD_SALT)
+
+#endif
+
+#endif /* __BUILD_SALT_H */
diff --git a/init/Kconfig b/init/Kconfig
index 041f3a022122..8de789f40db9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -107,6 +107,15 @@ config LOCALVERSION_AUTO
 
  which is done within the script "scripts/setlocalversion".)
 
+config BUILD_SALT
+   string "Build ID Salt"
+   default "Linux"
+   help
+  The build ID is used to link binaries and their debug info. Setting
+  this option will use the value in the calculation of the build id.
+  This is mostly useful for distributions which want to ensure the
+  build is unique between builds. It's safe to leave the default.
+
 config HAVE_KERNEL_GZIP
bool
 
diff --git a/init/version.c b/init/version.c
index bfb4e3f4955e..ef4012ec4375 100644
--- a/init/version.c
+++ b/init/version.c
@@ -7,6 +7,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -49,3 +50,5 @@ const char linux_proc_banner[] =
"%s version %s"
" (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ")"
" (" LINUX_COMPILER ") %s\n";
+
+BUILD_SALT;
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 1663fb19343a..dc6d714e4dcb 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -2125,10 +2125,13 @@ static int check_modname_len(struct module *mod)
  **/
 static void add_header(struct buffer *b, struct module *mod)
 {
+   buf_printf(b, "#include \n");
buf_printf(b, "#include \n");
buf_printf(b, "#include \n");
buf_printf(b, "#include \n");
buf_printf(b, "\n");
+   buf_printf(b, "BUILD_SALT;\n");
+   buf_printf(b, "\n");
buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n");
buf_printf(b, "MODULE_INFO(name, KBUILD_MODNAME);\n");
buf_printf(b, "\n");
-- 
2.17.1



[PATCHv5 0/4] Salted build ids via ELF notes

2018-07-03 Thread Laura Abbott


Hi,

This is v5 of the series to allow unique build ids in the kernel. As a
reminder of the context:

""
In Fedora, the debug information is packaged separately (foo-debuginfo) and
can be installed separately. There's been a long standing issue where only one
version of a debuginfo info package can be installed at a time. Mark Wielaard
made an effort for Fedora 27 to allow parallel installation of debuginfo (see
https://fedoraproject.org/wiki/Changes/ParallelInstallableDebuginfo for
more details)

Part of the requirement to allow this to work is that build ids are
unique between builds. The existing upstream rpm implementation ensures
this by re-calculating the build-id using the version and release as a
seed. This doesn't work 100% for the kernel because of the vDSO which is
its own binary and doesn't get updated. After poking holes in a few of my
ideas, there was a discussion with some people from the binutils team about
adding --build-id-salt to let ld do the calculation debugedit is doing. There
was a counter proposal made to add in the salt while building. The
easiest proposal was to add an item in the linker script vs. linking in
an object since we need the salt to go in every module as well as the
kernel and vmlinux.
""

v5 uses the approach suggested by Masahiro Yamada which uses the
existing ELF note macro to more easily add the salt (vs previous
approaches which tried to adjust via linker section).

If arch maintainers are okay, I'd like acks for this so this can go
through the kbuild tree.

Thanks,
Laura

Laura Abbott (4):
  kbuild: Add build salt to the kernel and modules
  x86: Add build salt to the vDSO
  powerpc: Add build salt to the vDSO
  arm64: Add build salt to the vDSO

 arch/arm64/kernel/vdso/note.S |  3 +++
 arch/powerpc/kernel/vdso32/note.S |  3 +++
 arch/x86/entry/vdso/vdso-note.S   |  3 +++
 arch/x86/entry/vdso/vdso32/note.S |  3 +++
 include/linux/build-salt.h| 20 
 init/Kconfig  |  9 +
 init/version.c|  3 +++
 scripts/mod/modpost.c |  3 +++
 8 files changed, 47 insertions(+)
 create mode 100644 include/linux/build-salt.h

-- 
2.17.1



[PATCHv5 0/4] Salted build ids via ELF notes

2018-07-03 Thread Laura Abbott


Hi,

This is v5 of the series to allow unique build ids in the kernel. As a
reminder of the context:

""
In Fedora, the debug information is packaged separately (foo-debuginfo) and
can be installed separately. There's been a long standing issue where only one
version of a debuginfo info package can be installed at a time. Mark Wielaard
made an effort for Fedora 27 to allow parallel installation of debuginfo (see
https://fedoraproject.org/wiki/Changes/ParallelInstallableDebuginfo for
more details)

Part of the requirement to allow this to work is that build ids are
unique between builds. The existing upstream rpm implementation ensures
this by re-calculating the build-id using the version and release as a
seed. This doesn't work 100% for the kernel because of the vDSO which is
its own binary and doesn't get updated. After poking holes in a few of my
ideas, there was a discussion with some people from the binutils team about
adding --build-id-salt to let ld do the calculation debugedit is doing. There
was a counter proposal made to add in the salt while building. The
easiest proposal was to add an item in the linker script vs. linking in
an object since we need the salt to go in every module as well as the
kernel and vmlinux.
""

v5 uses the approach suggested by Masahiro Yamada which uses the
existing ELF note macro to more easily add the salt (vs previous
approaches which tried to adjust via linker section).

If arch maintainers are okay, I'd like acks for this so this can go
through the kbuild tree.

Thanks,
Laura

Laura Abbott (4):
  kbuild: Add build salt to the kernel and modules
  x86: Add build salt to the vDSO
  powerpc: Add build salt to the vDSO
  arm64: Add build salt to the vDSO

 arch/arm64/kernel/vdso/note.S |  3 +++
 arch/powerpc/kernel/vdso32/note.S |  3 +++
 arch/x86/entry/vdso/vdso-note.S   |  3 +++
 arch/x86/entry/vdso/vdso32/note.S |  3 +++
 include/linux/build-salt.h| 20 
 init/Kconfig  |  9 +
 init/version.c|  3 +++
 scripts/mod/modpost.c |  3 +++
 8 files changed, 47 insertions(+)
 create mode 100644 include/linux/build-salt.h

-- 
2.17.1



Re: [PATCHv3 2/4] drivers/base: utilize device tree info to shutdown devices

2018-07-03 Thread Pavel Tatashin
Thank you Andy for the heads up. I might need to rebase my work
(http://lkml.kernel.org/r/20180629182541.6735-1-pasha.tatas...@oracle.com)
based on this change. But, it is possible it is going to be harder to
parallelize based on device tree. I will need to think about it.

Pavel

On Tue, Jul 3, 2018 at 6:59 AM Andy Shevchenko
 wrote:
>
> I think Pavel would be interested to see this as well (he is doing
> some parallel device shutdown stuff)
>
> On Tue, Jul 3, 2018 at 9:50 AM, Pingfan Liu  wrote:
> > commit 52cdbdd49853 ("driver core: correct device's shutdown order")
> > places an assumption of supplier<-consumer order on the process of probe.
> > But it turns out to break down the parent <- child order in some scene.
> > E.g in pci, a bridge is enabled by pci core, and behind it, the devices
> > have been probed. Then comes the bridge's module, which enables extra
> > feature(such as hotplug) on this bridge. This will break the
> > parent<-children order and cause failure when "kexec -e" in some scenario.
> >
> > The detailed description of the scenario:
> > An IBM Power9 machine on which, two drivers portdrv_pci and shpchp(a mod)
> > match the PCI_CLASS_BRIDGE_PCI, but neither of them success to probe due
> > to some issue. For this case, the bridge is moved after its children in
> > devices_kset. Then, when "kexec -e", a ata-disk behind the bridge can not
> > write back buffer in flight due to the former shutdown of the bridge which
> > clears the BusMaster bit.
> >
> > It is a little hard to impose both "parent<-child" and "supplier<-consumer"
> > order on devices_kset. Take the following scene:
> > step0: before a consumer's probing, (note child_a is supplier of consumer_a)
> >   [ consumer-X, child_a, , child_z] [... consumer_a, ..., consumer_z, 
> > ...] supplier-X
> >  ^^ affected range 
> > ^^
> > step1: when probing, moving consumer-X after supplier-X
> >   [ child_a, , child_z] [ consumer_a, ..., consumer_z, ...] 
> > supplier-X, consumer-X
> > step2: the children of consumer-X should be re-ordered to maintain the seq
> >   [... consumer_a, ..., consumer_z, ] supplier-X  [consumer-X, child_a, 
> > , child_z]
> > step3: the consumer_a should be re-ordered to maintain the seq
> >   [... consumer_z, ...] supplier-X [ consumer-X, child_a, consumer_a ..., 
> > child_z]
> >
> > It requires two nested recursion to drain out all out-of-order item in
> > "affected range". To avoid such complicated code, this patch suggests
> > to utilize the info in device tree, instead of using the order of
> > devices_kset during shutdown. It iterates the device tree, and firstly
> > shutdown a device's children and consumers. After this patch, the buggy
> > commit is hollow and left to clean.
> >
> > Cc: Greg Kroah-Hartman 
> > Cc: Rafael J. Wysocki 
> > Cc: Grygorii Strashko 
> > Cc: Christoph Hellwig 
> > Cc: Bjorn Helgaas 
> > Cc: Dave Young 
> > Cc: linux-...@vger.kernel.org
> > Cc: linuxppc-dev@lists.ozlabs.org
> > Signed-off-by: Pingfan Liu 
> > ---
> >  drivers/base/core.c| 48 
> > +++-
> >  include/linux/device.h |  1 +
> >  2 files changed, 44 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/base/core.c b/drivers/base/core.c
> > index a48868f..684b994 100644
> > --- a/drivers/base/core.c
> > +++ b/drivers/base/core.c
> > @@ -1446,6 +1446,7 @@ void device_initialize(struct device *dev)
> > INIT_LIST_HEAD(&dev->links.consumers);
> > INIT_LIST_HEAD(&dev->links.suppliers);
> > dev->links.status = DL_DEV_NO_DRIVER;
> > +   dev->shutdown = false;
> >  }
> >  EXPORT_SYMBOL_GPL(device_initialize);
> >
> > @@ -2811,7 +2812,6 @@ static void __device_shutdown(struct device *dev)
> >  * lock is to be held
> >  */
> > parent = get_device(dev->parent);
> > -   get_device(dev);
> > /*
> >  * Make sure the device is off the kset list, in the
> >  * event that dev->*->shutdown() doesn't remove it.
> > @@ -2842,23 +2842,60 @@ static void __device_shutdown(struct device *dev)
> > dev_info(dev, "shutdown\n");
> > dev->driver->shutdown(dev);
> > }
> > -
> > +   dev->shutdown = true;
> > device_unlock(dev);
> > if (parent)
> > device_unlock(parent);
> >
> > -   put_device(dev);
> > put_device(parent);
> > spin_lock(&devices_kset->list_lock);
> >  }
> >
> > +/* shutdown dev's children and consumer firstly, then itself */
> > +static int device_for_each_child_shutdown(struct device *dev)
> > +{
> > +   struct klist_iter i;
> > +   struct device *child;
> > +   struct device_link *link;
> > +
> > +   /* already shutdown, then skip this sub tree */
> > +   if (dev->shutdown)
> > +   return 0;
> > +
> > +   if (!dev->p)
> > +   goto check_consumers;
> > +
> > +   /* ther

Re: [PATCH v5 1/4] resource: Move reparent_resources() to kernel/resource.c and make it public

2018-07-03 Thread Andy Shevchenko
On Tue, Jul 3, 2018 at 5:55 PM, Baoquan He  wrote:
> On 06/12/18 at 05:24pm, Andy Shevchenko wrote:
>> On Tue, Jun 12, 2018 at 5:20 PM, Andy Shevchenko
>>  wrote:

>> > I briefly looked at the code and error codes we have, so, my proposal
>> > is one of the following
>>
>> >  - use -ECANCELED (not the best choice for first occurrence here,
>> > though I can't find better)
>>
>> Actually -ENOTSUPP might suit the first case (although the actual
>> would be something like -EOVERLAP, which we don't have)
>
> Sorry for late reply, and many thanks for your great suggestion.
>

> I am fine to use -ENOTSUPP as the first returned value, and -ECANCELED
> for the 2nd one.

I have no strong opinion, but I like (slightly better) this approach ^^^

> Or define an enum as you suggested inside the function
> or in header file.

>
> Or use -EBUSY for the first case because existing resource is
> overlapping but not fully contained by 'res'; and -EINVAL for
> the 2nd case since didn't find any one resources which is contained by
> 'res', means we passed in a invalid resource.
>
> All is fine to me, I can repost with each of them.

>> >  - use positive integers (or enum), like
>> >   #define RES_REPARENTED 0
>> >   #define RES_OVERLAPPED 1
>> >   #define RES_NOCONFLICT 2

-- 
With Best Regards,
Andy Shevchenko


Re: GCC strcmp optimizations causing valgrind uninitialized conditional jumps

2018-07-03 Thread Segher Boessenkool
On Tue, Jul 03, 2018 at 11:59:14AM -0700, William Kennington wrote:
> Is there a bug tracking the issue?

https://bugs.kde.org/show_bug.cgi?id=386945

> Also, unless your malloc is
> guaranteed to be zeroing out the data or have a strcmp that is writing
> doubleworld aligned data to the string, the strcmp implementation is
> branching based on data existing after the null terminating character
> that may be uninitialized. Both sides of the branch do the right thing
> though, and locate the null terminator, throwing away the calculations
> done on the uninitialized data.

Yes, there is one branch that depends in part on irrelevant data, but
that is handled immediately afterwards.

> -fno-builtin-strcmp or -mstring-compare-inline-limit=0 do work fine
> but we don't control the binaries we are linking against in all cases
> and are seeing the issue pop up there.

Yeah, nasty.  I don't know what to do then (other than fix valgrind, which
isn't so easy either though!)


Segher


Re: GCC strcmp optimizations causing valgrind uninitialized conditional jumps

2018-07-03 Thread Segher Boessenkool
Hi!

On Tue, Jul 03, 2018 at 11:26:55AM -0700, William Kennington wrote:
> I've noticed while trying to do some valgrind testing on code linked
> against system libraries that have inlined strcmps that valgrind is
> unhappy about branches depending on uninitialized memory. I've read

The branches here do *not* depend on uninitialised memory.  Valgrind
does not realise that however.  The valgrind people are aware of this
problem.

> Any ideas on how to workaround / fix this?

Does -fno-builtin-strcmp do the trick?


Segher


GCC strcmp optimizations causing valgrind uninitialized conditional jumps

2018-07-03 Thread William Kennington
Hi,

I've noticed while trying to do some valgrind testing on code linked
against system libraries that have inlined strcmps that valgrind is
unhappy about branches depending on uninitialized memory. I've read
through the generated inline assembly, and it looks like the assembly
will always produce the correct result regardless of the intermediate
compare using uninitialized data. I'm trying to figure out what to do
as next steps since it's non-trivial to suppress this inlined code.

Using the following simple test case which generates the optmizations
using gcc 7.3.0 and gcc 9 devel. This is reproducible using `gcc -O2
-o test test.c` with the following code.

#include 
#include 
#include 

int main()
{
char *heap_str = malloc(16);
strcpy(heap_str, "RandString");
int res = strcmp("RandString", heap_str) == 0;
free(heap_str);
return res;
}

A snippet of the inline assembly causing issues is, valgrind correctly
identifies a branch taken based on uninitialized memory for the first
`bne` in `.L7` since it's reading bytes 8-16 from the 11 byte string:
 # test.c:9:int res = strcmp("RandString", heap_str) == 0;
.loc 1 9 12 view .LVU20
li 8,0   # tmp172,
cmpb 3,9,10  # tmp169, tmp133, tmp134
cmpb 8,9,8   # tmp170, tmp133, tmp172
orc 3,8,3#, tmp169, tmp170, tmp169
cntlzd 3,3   # tmp171, tmp169
addi 3,3,8   # tmp171, tmp171,
rldcl 9,9,3,56   # tmp171, tmp174, tmp133,
rldcl 3,10,3,56  # tmp171, tmp176, tmp134,
subf 3,3,9   # tmp132, tmp176, tmp174
b .L2#
.L7:
addi 9,5,8   # tmp190, tmp126,
addi 10,30,8 # tmp191, heap_str,
ldbrx 9,0,9  # tmp133, MEM[(void *)"RandString"]
ldbrx 10,0,10# tmp134, MEM[(void *)heap_str_5]
subf. 3,10,9 # tmp132, tmp134, tmp133
bne 0,.L4#
cmpb 10,9,3  # tmp140, tmp133, tmp132
cmpdi 7,10,0 #, tmp141, tmp140
bne 7,.L2

Sample valgrind output looks like:
==63162== Memcheck, a memory error detector
==63162== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==63162== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info
==63162== Command: ./test
==63162==
==63162== Conditional jump or move depends on uninitialised value(s)
==63162==at 0x15D4: main (test.c:9)
==63162==  Uninitialised value was created by a heap allocation
==63162==at 0x4093F00: malloc (in
/usr/lib/valgrind/vgpreload_memcheck-ppc64le-linux.so)
==63162==by 0x1513: main (test.c:7)
==63162==
==63162== Conditional jump or move depends on uninitialised value(s)
==63162==at 0x15E0: main (test.c:9)
==63162==  Uninitialised value was created by a heap allocation
==63162==at 0x4093F00: malloc (in
/usr/lib/valgrind/vgpreload_memcheck-ppc64le-linux.so)
==63162==by 0x1513: main (test.c:7)
==63162==
==63162== Syscall param exit_group(status) contains uninitialised byte(s)
==63162==at 0x41E8C2C: _Exit (_exit.c:31)
==63162==by 0x4132C67: __run_exit_handlers (exit.c:132)
==63162==by 0x4104423: generic_start_main.isra.0 (libc-start.c:344)
==63162==by 0x4104617: (below main) (libc-start.c:116)
==63162==  Uninitialised value was created by a heap allocation
==63162==at 0x4093F00: malloc (in
/usr/lib/valgrind/vgpreload_memcheck-ppc64le-linux.so)
==63162==by 0x1513: main (test.c:7)
==63162==
==63162==
==63162== HEAP SUMMARY:
==63162== in use at exit: 0 bytes in 0 blocks
==63162==   total heap usage: 1 allocs, 1 frees, 16 bytes allocated
==63162==
==63162== All heap blocks were freed -- no leaks are possible
==63162==
==63162== For counts of detected and suppressed errors, rerun with: -v
==63162== ERROR SUMMARY: 3 errors from 3 contexts (suppressed: 0 from 0)

Any ideas on how to workaround / fix this?

Best,
William


Re: [v2 PATCH 2/2] powerpc: Enable CPU_FTR_ASYM_SMT for interleaved big-cores

2018-07-03 Thread Murilo Opsfelder Araujo
On Tue, Jul 03, 2018 at 04:33:51PM +0530, Gautham R. Shenoy wrote:
> From: "Gautham R. Shenoy" 
> 
> A pair of IBM POWER9 SMT4 cores can be fused together to form a big-core
> with 8 SMT threads. This can be discovered via the "ibm,thread-groups"
> CPU property in the device tree which will indicate which group of
> threads that share the L1 cache, translation cache and instruction data
> flow. If there are multiple such group of threads, then the core is a
> big-core.
> 
> Furthermore, if the thread-ids of the threads of the big-core can be
> obtained by interleaving the thread-ids of the thread-groups
> (component small core), then such a big-core is called an interleaved
> big-core.
> 
> Eg: Threads in the pair of component SMT4 cores of an interleaved
> big-core are numbered {0,2,4,6} and {1,3,5,7} respectively.
> 
> The SMT4 cores forming a big-core are more or less independent
> units. Thus when multiple tasks are scheduled to run on the fused
> core, we get the best performance when the tasks are spread across the
> pair of SMT4 cores.
> 
> This patch enables CPU_FTR_ASYM_SMT bit in the cpu-features on
> detecting the presence of interleaved big-cores at boot up. This will
> will bias the load-balancing of tasks on smaller numbered threads,
> which will automatically result in spreading the tasks uniformly
> across the associated pair of SMT4 cores.
> 
> Signed-off-by: Gautham R. Shenoy 
> ---
>  arch/powerpc/kernel/setup-common.c | 67 
> +-
>  1 file changed, 66 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/setup-common.c 
> b/arch/powerpc/kernel/setup-common.c
> index a78ec66..f63d797 100644
> --- a/arch/powerpc/kernel/setup-common.c
> +++ b/arch/powerpc/kernel/setup-common.c
> @@ -537,6 +537,56 @@ int get_cpu_thread_group_start(int cpu, struct 
> thread_groups *tg)
>   return -1;
>  }
>  
> +/*
> + * check_interleaved_big_core - Checks if the thread group tg
> + * corresponds to a big-core whose threads are interleavings of the
> + * threads of the component small cores.
> + *
> + * @tg: A thread-group struct for the core.
> + *
> + * Returns true if the core is a interleaved big-core.
> + * Returns false otherwise.
> + */
> +static inline bool check_interleaved_big_core(struct thread_groups *tg)
> +{
> + int nr_groups;
> + int threads_per_group;
> + int cur_cpu, next_cpu, i, j;
> +
> + nr_groups = tg->nr_groups;
> + threads_per_group = tg->threads_per_group;
> +
> + if (tg->property != 1)
> + return false;
> +
> + if (nr_groups < 2 || threads_per_group < 2)
> + return false;
> +
> + /*
> +  * In case of an interleaved big-core, the thread-ids of the
> +  * big-core can be obtained by interleaving the the thread-ids
> +  * of the component small
> +  *
> +  * Eg: On a 8-thread big-core with two SMT4 small cores, the
> +  * threads of the two component small cores will be
> +  * {0, 2, 4, 6} and {1, 3, 5, 7}.
> +  */
> + for (i = 0; i < nr_groups; i++) {
> + int group_start = i * threads_per_group;
> +
> + for (j = 0; j < threads_per_group - 1; j++) {
> + int cur_idx = group_start + j;
> +
> + cur_cpu = tg->thread_list[cur_idx];
> + next_cpu = tg->thread_list[cur_idx + 1];
> + if (next_cpu != cur_cpu + nr_groups)
> + return false;
> + }
> + }
> +
> + return true;
> +}
> +
>  /**
>   * setup_cpu_maps - initialize the following cpu maps:
>   *  cpu_possible_mask
> @@ -560,6 +610,7 @@ void __init smp_setup_cpu_maps(void)
>   struct device_node *dn;
>   int cpu = 0;
>   int nthreads = 1;
> + bool has_interleaved_big_core = true;
>  
>   DBG("smp_setup_cpu_maps()\n");
>  
> @@ -613,6 +664,12 @@ void __init smp_setup_cpu_maps(void)
>   if (parse_thread_groups(dn, &tg) ||
>   tg.nr_groups < 1 || tg.property != 1) {
>   has_big_cores = false;
> + has_interleaved_big_core = false;
> + }
> +
> + if (has_interleaved_big_core) {
> + has_interleaved_big_core =
> + check_interleaved_big_core(&tg);
>   }
>  
>   if (cpu >= nr_cpu_ids) {
> @@ -669,7 +726,15 @@ void __init smp_setup_cpu_maps(void)
>   vdso_data->processorCount = num_present_cpus();
>  #endif /* CONFIG_PPC64 */
>  
> -/* Initialize CPU <=> thread mapping/
> + if (has_interleaved_big_core) {
> + int key = __builtin_ctzl(CPU_FTR_ASYM_SMT);
> +
> + cur_cpu_spec->cpu_features |= CPU_FTR_ASYM_SMT;
> + static_branch_enable(&cpu_feature_keys[key]);
> + pr_info("Detected interleaved big-cores\n");
> + }

Shouldn't we use cpu_has_feature(CPU_FTR_ASYM_SMT) before setting it?

> +
> + /* Initi

Re: [v2 PATCH 1/2] powerpc: Detect the presence of big-cores via "ibm, thread-groups"

2018-07-03 Thread Murilo Opsfelder Araujo
On Tue, Jul 03, 2018 at 04:33:50PM +0530, Gautham R. Shenoy wrote:
> From: "Gautham R. Shenoy" 
> 
> On IBM POWER9, the device tree exposes a property array identifed by
> "ibm,thread-groups" which will indicate which groups of threads share a
> particular set of resources.
> 
> As of today we only have one form of grouping identifying the group of
> threads in the core that share the L1 cache, translation cache and
> instruction data flow.
> 
> This patch defines the helper function to parse the contents of
> "ibm,thread-groups" and a new structure to contain the parsed output.
> 
> The patch also creates the sysfs file named "small_core_siblings" that
> returns the physical ids of the threads in the core that share the L1
> cache, translation cache and instruction data flow.
> 
> Signed-off-by: Gautham R. Shenoy 
> ---
>  Documentation/ABI/testing/sysfs-devices-system-cpu |   8 ++
>  arch/powerpc/include/asm/cputhreads.h  |  22 +
>  arch/powerpc/kernel/setup-common.c | 110 
> +
>  arch/powerpc/kernel/sysfs.c|  35 +++
>  4 files changed, 175 insertions(+)
> 
> diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu 
> b/Documentation/ABI/testing/sysfs-devices-system-cpu
> index 9c5e7732..53a823a 100644
> --- a/Documentation/ABI/testing/sysfs-devices-system-cpu
> +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
> @@ -487,3 +487,11 @@ Description: Information about CPU vulnerabilities
>   "Not affected"CPU is not affected by the vulnerability
>   "Vulnerable"  CPU is affected and no mitigation in effect
>   "Mitigation: $M"  CPU is affected and mitigation $M is in effect
> +
> +What:/sys/devices/system/cpu/cpu[0-9]+/small_core_sibings

s/small_core_sibings/small_core_siblings

By the way, big_core_siblings was mentioned in the introductory email.

> +Date:03-Jul-2018
> +KernelVersion:   v4.18.0
> +Contact: Gautham R. Shenoy 
> +Description: List of Physical ids of CPUs which share the the L1 cache,
> + translation cache and instruction data-flow with this CPU.
> +Values:  Comma separated list of decimal integers.
> diff --git a/arch/powerpc/include/asm/cputhreads.h 
> b/arch/powerpc/include/asm/cputhreads.h
> index d71a909..33226d7 100644
> --- a/arch/powerpc/include/asm/cputhreads.h
> +++ b/arch/powerpc/include/asm/cputhreads.h
> @@ -23,11 +23,13 @@
>  extern int threads_per_core;
>  extern int threads_per_subcore;
>  extern int threads_shift;
> +extern bool has_big_cores;
>  extern cpumask_t threads_core_mask;
>  #else
>  #define threads_per_core 1
>  #define threads_per_subcore  1
>  #define threads_shift0
> +#define has_big_cores0
>  #define threads_core_mask(*get_cpu_mask(0))
>  #endif
>  
> @@ -69,12 +71,32 @@ static inline cpumask_t cpu_online_cores_map(void)
>   return cpu_thread_mask_to_cores(cpu_online_mask);
>  }
>  
> +#define MAX_THREAD_LIST_SIZE 8
> +struct thread_groups {
> + unsigned int property;
> + unsigned int nr_groups;
> + unsigned int threads_per_group;
> + unsigned int thread_list[MAX_THREAD_LIST_SIZE];
> +};
> +
>  #ifdef CONFIG_SMP
>  int cpu_core_index_of_thread(int cpu);
>  int cpu_first_thread_of_core(int core);
> +int parse_thread_groups(struct device_node *dn, struct thread_groups *tg);
> +int get_cpu_thread_group_start(int cpu, struct thread_groups *tg);
>  #else
>  static inline int cpu_core_index_of_thread(int cpu) { return cpu; }
>  static inline int cpu_first_thread_of_core(int core) { return core; }
> +static inline int parse_thread_groups(struct device_node *dn,
> +   struct thread_groups *tg)
> +{
> + return -ENODATA;
> +}
> +
> +static inline int get_cpu_thread_group_start(int cpu, struct thread_groups 
> *tg)
> +{
> + return -1;
> +}
>  #endif
>  
>  static inline int cpu_thread_in_core(int cpu)
> diff --git a/arch/powerpc/kernel/setup-common.c 
> b/arch/powerpc/kernel/setup-common.c
> index 40b44bb..a78ec66 100644
> --- a/arch/powerpc/kernel/setup-common.c
> +++ b/arch/powerpc/kernel/setup-common.c
> @@ -402,10 +402,12 @@ void __init check_for_initrd(void)
>  #ifdef CONFIG_SMP
>  
>  int threads_per_core, threads_per_subcore, threads_shift;
> +bool has_big_cores = true;
>  cpumask_t threads_core_mask;
>  EXPORT_SYMBOL_GPL(threads_per_core);
>  EXPORT_SYMBOL_GPL(threads_per_subcore);
>  EXPORT_SYMBOL_GPL(threads_shift);
> +EXPORT_SYMBOL_GPL(has_big_cores);
>  EXPORT_SYMBOL_GPL(threads_core_mask);
>  
>  static void __init cpu_init_thread_core_maps(int tpc)
> @@ -433,6 +435,108 @@ static void __init cpu_init_thread_core_maps(int tpc)
>  
>  u32 *cpu_to_phys_id = NULL;
>  
> +/*
> + * parse_thread_groups: Parses the "ibm,thread-groups" device tree
> + *  property for the CPU device node dn and stores
> + *  the parsed output in

Re: Oops in kmem_cache_free() via bioset_exit() (was Re: [next-20180601][nvme][ppc] Kernel Oops is triggered when creating lvm snapshots on nvme disks)

2018-07-03 Thread Abdul Haleem
On Fri, 2018-06-29 at 00:42 +1000, Michael Ellerman wrote:
> Kent, Jens,
> 
> This looks like it might be related to the recent bioset changes?
> 
> cheers
> 
> Abdul Haleem  writes:
> > On Tue, 2018-06-26 at 23:36 +1000, Michael Ellerman wrote:
> >> Abdul Haleem  writes:
> ...
> > I was able to reproduce again with slub_debug=FZP and DEBUG_INFO enabled
> > on 4.17.0-rc7-next-20180601, but not much traces other than the Oops stack 
> > trace
> 
> Are you still testing on that revision? It's nearly a month old.
> 
> Please try to reproduce on mainline or today's linux-next.

Problem is not reproducible on 4.18.0-rc3 mainline and today's next
kernel

-- 
Regard's

Abdul Haleem
IBM Linux Technology Centre





RE: [PATCH] QE GPIO: Add qe_gpio_set_multiple

2018-07-03 Thread Leo Li


> -Original Message-
> From: York Sun
> Sent: Tuesday, July 3, 2018 10:27 AM
> To: jo...@infinera.com ; Leo Li
> 
> Cc: linuxppc-dev@lists.ozlabs.org; m...@ellerman.id.au; Qiang Zhao
> 
> Subject: Re: [PATCH] QE GPIO: Add qe_gpio_set_multiple
> 
> +Leo
> 
> On 07/03/2018 03:30 AM, Joakim Tjernlund wrote:
> > On Tue, 2018-06-26 at 23:41 +1000, Michael Ellerman wrote:
> >>
> >> Joakim Tjernlund  writes:
> >>> On Thu, 2018-06-21 at 02:38 +, Qiang Zhao wrote:
>  On 06/19/2018 09:22 AM, Joakim Tjernlund wrote:
>  -Original Message-
>  From: Linuxppc-dev [mailto:linuxppc-dev-
> bounces+qiang.zhao=nxp@lists.ozlabs.org] On Behalf Of Joakim
> Tjernlund
>  Sent: 2018年6月20日 0:22
>  To: York Sun ; linuxppc-dev  d...@lists.ozlabs.org>
>  Subject: [PATCH] QE GPIO: Add qe_gpio_set_multiple
> 
>  This cousin to gpio-mpc8xxx was lacking a multiple pins method, add
> one.
> 
>  Signed-off-by: Joakim Tjernlund 
>  ---
>   drivers/soc/fsl/qe/gpio.c | 28 
>   1 file changed, 28 insertions(+)
> >>
> >> ...
>   static int qe_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio)  {
>  struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); @@ -
> 298,6 +325,7 @@ static int __init qe_add_gpiochips(void)
>  gc->direction_output = qe_gpio_dir_out;
>  gc->get = qe_gpio_get;
>  gc->set = qe_gpio_set;
>  +   gc->set_multiple = qe_gpio_set_multiple;
> 
>  ret = of_mm_gpiochip_add_data(np, mm_gc, qe_gc);
>  if (ret)
> 
>  Reviewed-by: Qiang Zhao 
> 
> >>>
> >>> Who picks up this patch ? Is it queued somewhere already?
> >>
> >> Not me.
> >
> > York? You seem to be the only one left.
> >
> 
> I am not a Linux maintainer. Even I want to, I can't merge this patch.
> 
> Leo, who can merge this patch and request a pull?

Since it falls under the driver/soc/fsl/ folder.  I can take it.

Regards,
Leo


Re: [PATCH 7/7 v5] arm64: dts: ls208xa: comply with the iommu map binding for fsl_mc

2018-07-03 Thread Robin Murphy

On 20/05/18 14:49, Nipun Gupta wrote:

fsl-mc bus support the new iommu-map property. Comply to this binding
for fsl_mc bus.

Signed-off-by: Nipun Gupta 
Reviewed-by: Laurentiu Tudor 
---
  arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi | 6 +-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi 
b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
index 137ef4d..6010505 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
@@ -184,6 +184,7 @@
#address-cells = <2>;
#size-cells = <2>;
ranges;
+   dma-ranges = <0x0 0x0 0x0 0x0 0x1 0x>;
  
  		clockgen: clocking@130 {

compatible = "fsl,ls2080a-clockgen";
@@ -357,6 +358,8 @@
reg = <0x0008 0x0c00 0 0x40>,  /* MC portal 
base */
  <0x 0x0834 0 0x4>; /* MC control 
reg */
msi-parent = <&its>;
+   iommu-map = <0 &smmu 0 0>;/* This is fixed-up by 
u-boot */
+   dma-coherent;
#address-cells = <3>;
#size-cells = <1>;
  
@@ -460,6 +463,8 @@

compatible = "arm,mmu-500";
reg = <0 0x500 0 0x80>;
#global-interrupts = <12>;
+   #iommu-cells = <1>;
+   stream-match-mask = <0x7C00>;
interrupts = <0 13 4>, /* global secure fault */
 <0 14 4>, /* combined secure interrupt */
 <0 15 4>, /* global non-secure fault */
@@ -502,7 +507,6 @@
 <0 204 4>, <0 205 4>,
 <0 206 4>, <0 207 4>,
 <0 208 4>, <0 209 4>;
-   mmu-masters = <&fsl_mc 0x300 0>;


Since we're in here, is the SMMU itself also coherent? If it is, you 
probably want to say so and avoid the overhead of pointlessly cleaning 
cache lines on every page table update.


Robin.


};
  
  		dspi: dspi@210 {




Re: [PATCH 6/7 v5] bus: fsl-mc: set coherent dma mask for devices on fsl-mc bus

2018-07-03 Thread Robin Murphy

On 20/05/18 14:49, Nipun Gupta wrote:

of_dma_configure() API expects coherent_dma_mask to be correctly
set in the devices. This patch does the needful.


Reviewed-by: Robin Murphy 


Signed-off-by: Nipun Gupta 
---
  drivers/bus/fsl-mc/fsl-mc-bus.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c
index fa43c7d..624828b 100644
--- a/drivers/bus/fsl-mc/fsl-mc-bus.c
+++ b/drivers/bus/fsl-mc/fsl-mc-bus.c
@@ -627,6 +627,7 @@ int fsl_mc_device_add(struct fsl_mc_obj_desc *obj_desc,
mc_dev->icid = parent_mc_dev->icid;
mc_dev->dma_mask = FSL_MC_DEFAULT_DMA_MASK;
mc_dev->dev.dma_mask = &mc_dev->dma_mask;
+   mc_dev->dev.coherent_dma_mask = mc_dev->dma_mask;
dev_set_msi_domain(&mc_dev->dev,
   dev_get_msi_domain(&parent_mc_dev->dev));
}



Re: [PATCH 5/7 v5] bus: fsl-mc: support dma configure for devices on fsl-mc bus

2018-07-03 Thread Robin Murphy

On 20/05/18 14:49, Nipun Gupta wrote:

This patch adds support of dma configuration for devices on fsl-mc
bus using 'dma_configure' callback for busses. Also, directly calling
arch_setup_dma_ops is removed from the fsl-mc bus.


Looks like this is the final arch_setup_dma_ops offender, yay!


Signed-off-by: Nipun Gupta 
Reviewed-by: Laurentiu Tudor 
---
  drivers/bus/fsl-mc/fsl-mc-bus.c | 15 +++
  1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c
index 5d8266c..fa43c7d 100644
--- a/drivers/bus/fsl-mc/fsl-mc-bus.c
+++ b/drivers/bus/fsl-mc/fsl-mc-bus.c
@@ -127,6 +127,16 @@ static int fsl_mc_bus_uevent(struct device *dev, struct 
kobj_uevent_env *env)
return 0;
  }
  
+static int fsl_mc_dma_configure(struct device *dev)

+{
+   struct device *dma_dev = dev;
+
+   while (dev_is_fsl_mc(dma_dev))
+   dma_dev = dma_dev->parent;
+
+   return of_dma_configure(dev, dma_dev->of_node, 0);
+}
+
  static ssize_t modalias_show(struct device *dev, struct device_attribute 
*attr,
 char *buf)
  {
@@ -148,6 +158,7 @@ struct bus_type fsl_mc_bus_type = {
.name = "fsl-mc",
.match = fsl_mc_bus_match,
.uevent = fsl_mc_bus_uevent,
+   .dma_configure  = fsl_mc_dma_configure,
.dev_groups = fsl_mc_dev_groups,
  };
  EXPORT_SYMBOL_GPL(fsl_mc_bus_type);
@@ -633,10 +644,6 @@ int fsl_mc_device_add(struct fsl_mc_obj_desc *obj_desc,
goto error_cleanup_dev;
}
  
-	/* Objects are coherent, unless 'no shareability' flag set. */

-   if (!(obj_desc->flags & FSL_MC_OBJ_FLAG_NO_MEM_SHAREABILITY))


Although it seems we do end up without any handling of this 
"non-coherent object behind coherent MC" case, and I'm not sure how 
easily that could be accommodated by generic code... :/ How important is 
the quirk?


Robin.


-   arch_setup_dma_ops(&mc_dev->dev, 0, 0, NULL, true);
-
/*
 * The device-specific probe callback will get invoked by device_add()
 */



Re: [PATCH 4/7 v5] iommu: arm-smmu: Add support for the fsl-mc bus

2018-07-03 Thread Robin Murphy

On 20/05/18 14:49, Nipun Gupta wrote:

Implement bus specific support for the fsl-mc bus including
registering arm_smmu_ops and bus specific device add operations.

Signed-off-by: Nipun Gupta 
---
  drivers/iommu/arm-smmu.c |  7 +++
  drivers/iommu/iommu.c| 21 +
  include/linux/fsl/mc.h   |  8 
  include/linux/iommu.h|  2 ++
  4 files changed, 38 insertions(+)

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 69e7c60..e1d5090 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -52,6 +52,7 @@
  #include 
  
  #include 

+#include 
  
  #include "io-pgtable.h"

  #include "arm-smmu-regs.h"
@@ -1459,6 +1460,8 @@ static struct iommu_group *arm_smmu_device_group(struct 
device *dev)
  
  	if (dev_is_pci(dev))

group = pci_device_group(dev);
+   else if (dev_is_fsl_mc(dev))
+   group = fsl_mc_device_group(dev);
else
group = generic_device_group(dev);
  
@@ -2037,6 +2040,10 @@ static void arm_smmu_bus_init(void)

bus_set_iommu(&pci_bus_type, &arm_smmu_ops);
}
  #endif
+#ifdef CONFIG_FSL_MC_BUS
+   if (!iommu_present(&fsl_mc_bus_type))
+   bus_set_iommu(&fsl_mc_bus_type, &arm_smmu_ops);
+#endif
  }
  
  static int arm_smmu_device_probe(struct platform_device *pdev)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index d2aa2320..6d4ce35 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -32,6 +32,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  
  static struct kset *iommu_group_kset;

@@ -987,6 +988,26 @@ struct iommu_group *pci_device_group(struct device *dev)
return iommu_group_alloc();
  }
  
+/* Get the IOMMU group for device on fsl-mc bus */

+struct iommu_group *fsl_mc_device_group(struct device *dev)
+{
+   struct device *cont_dev = fsl_mc_cont_dev(dev);
+   struct iommu_group *group;
+
+   /* Container device is responsible for creating the iommu group */
+   if (fsl_mc_is_cont_dev(dev)) {


Why duplicate what fsl_mc_cont_dev() has already figured out?

AFAICS the overall operation here boils down to something like:

cont_dev = fsl_mc_cont_dev(dev);
group = iommu_group_get(cont_dev);
if (!group)
group = iommu_group_alloc();
return group;


+   group = iommu_group_alloc();
+   if (IS_ERR(group))
+   return NULL;


iommu_group_get_for_dev() expects a PTR_ERR, so I don't think munging 
the return here is the right thing to do.



+   } else {
+   get_device(cont_dev);


If races are a concern, isn't this a bit late? Maybe in that case you 
want {get,put}_fsl_mc_cont_dev() routines instead of the simple macro 
below. But on the other hand if dev already has cont_dev as its parent, 
wouldn't the reference taken in device_add() be sufficient to prevent it 
from vanishing unexpectedly in this timescale?


Robin.


+   group = iommu_group_get(cont_dev);
+   put_device(cont_dev);
+   }
+
+   return group;
+}
+
  /**
   * iommu_group_get_for_dev - Find or create the IOMMU group for a device
   * @dev: target device
diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h
index f27cb14..dddaca1 100644
--- a/include/linux/fsl/mc.h
+++ b/include/linux/fsl/mc.h
@@ -351,6 +351,14 @@ struct fsl_mc_io {
  #define dev_is_fsl_mc(_dev) (0)
  #endif
  
+/* Macro to check if a device is a container device */

+#define fsl_mc_is_cont_dev(_dev) (to_fsl_mc_device(_dev)->flags & \
+   FSL_MC_IS_DPRC)
+
+/* Macro to get the container device of a MC device */
+#define fsl_mc_cont_dev(_dev) (fsl_mc_is_cont_dev(_dev) ? \
+   (_dev) : (_dev)->parent)
+
  /*
   * module_fsl_mc_driver() - Helper macro for drivers that don't do
   * anything special in module init/exit.  This eliminates a lot of
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 19938ee..2981200 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -389,6 +389,8 @@ static inline size_t iommu_map_sg(struct iommu_domain 
*domain,
  extern struct iommu_group *pci_device_group(struct device *dev);
  /* Generic device grouping function */
  extern struct iommu_group *generic_device_group(struct device *dev);
+/* FSL-MC device grouping function */
+struct iommu_group *fsl_mc_device_group(struct device *dev);
  
  /**

   * struct iommu_fwspec - per-device IOMMU instance data



Re: [PATCH 3/7 v5] iommu: support iommu configuration for fsl-mc devices

2018-07-03 Thread Robin Murphy

On 20/05/18 14:49, Nipun Gupta wrote:

With of_pci_map_rid available for all the busses, use the function
for configuration of devices on fsl-mc bus


FWIW I had a quick hack at factoring out the commonality with 
of_pci_iommu_init(), at which point I reckon this change is easier to 
follow as-is for the moment, so:


Reviewed-by: Robin Murphy 


Signed-off-by: Nipun Gupta 
---
  drivers/iommu/of_iommu.c | 20 
  1 file changed, 20 insertions(+)

diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 811e160..284474d 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -24,6 +24,7 @@
  #include 
  #include 
  #include 
+#include 
  
  #define NO_IOMMU	1
  
@@ -159,6 +160,23 @@ static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data)

return err;
  }
  
+static int of_fsl_mc_iommu_init(struct fsl_mc_device *mc_dev,

+   struct device_node *master_np)
+{
+   struct of_phandle_args iommu_spec = { .args_count = 1 };
+   int err;
+
+   err = of_map_rid(master_np, mc_dev->icid, "iommu-map",
+"iommu-map-mask", &iommu_spec.np,
+iommu_spec.args);
+   if (err)
+   return err == -ENODEV ? NO_IOMMU : err;
+
+   err = of_iommu_xlate(&mc_dev->dev, &iommu_spec);
+   of_node_put(iommu_spec.np);
+   return err;
+}
+
  const struct iommu_ops *of_iommu_configure(struct device *dev,
   struct device_node *master_np)
  {
@@ -190,6 +208,8 @@ const struct iommu_ops *of_iommu_configure(struct device 
*dev,
  
  		err = pci_for_each_dma_alias(to_pci_dev(dev),

 of_pci_iommu_init, &info);
+   } else if (dev_is_fsl_mc(dev)) {
+   err = of_fsl_mc_iommu_init(to_fsl_mc_device(dev), master_np);
} else {
struct of_phandle_args iommu_spec;
int idx = 0;



Re: [PATCH] QE GPIO: Add qe_gpio_set_multiple

2018-07-03 Thread York Sun
+Leo

On 07/03/2018 03:30 AM, Joakim Tjernlund wrote:
> On Tue, 2018-06-26 at 23:41 +1000, Michael Ellerman wrote:
>>
>> Joakim Tjernlund  writes:
>>> On Thu, 2018-06-21 at 02:38 +, Qiang Zhao wrote:
 On 06/19/2018 09:22 AM, Joakim Tjernlund wrote:
 -Original Message-
 From: Linuxppc-dev 
 [mailto:linuxppc-dev-bounces+qiang.zhao=nxp@lists.ozlabs.org] On 
 Behalf Of Joakim Tjernlund
 Sent: 2018年6月20日 0:22
 To: York Sun ; linuxppc-dev 
 
 Subject: [PATCH] QE GPIO: Add qe_gpio_set_multiple

 This cousin to gpio-mpc8xxx was lacking a multiple pins method, add one.

 Signed-off-by: Joakim Tjernlund 
 ---
  drivers/soc/fsl/qe/gpio.c | 28 
  1 file changed, 28 insertions(+)
>>
>> ...
  static int qe_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio)  {
 struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); @@ -298,6 
 +325,7 @@ static int __init qe_add_gpiochips(void)
 gc->direction_output = qe_gpio_dir_out;
 gc->get = qe_gpio_get;
 gc->set = qe_gpio_set;
 +   gc->set_multiple = qe_gpio_set_multiple;

 ret = of_mm_gpiochip_add_data(np, mm_gc, qe_gc);
 if (ret)

 Reviewed-by: Qiang Zhao 

>>>
>>> Who picks up this patch ? Is it queued somewhere already?
>>
>> Not me.
> 
> York? You seem to be the only one left.
> 

I am not a Linux maintainer. Even I want to, I can't merge this patch.

Leo, who can merge this patch and request a pull?

York


Re: [PATCH v5 1/4] resource: Move reparent_resources() to kernel/resource.c and make it public

2018-07-03 Thread Baoquan He
Hi Andy,

On 06/12/18 at 05:24pm, Andy Shevchenko wrote:
> On Tue, Jun 12, 2018 at 5:20 PM, Andy Shevchenko
>  wrote:
> >> Hmm, I just copied it from arch/powerpc/kernel/pci-common.c. The
> >> function interface expects an integer returned value, not sure what a
> >> real error codes look like, could you give more hints? Will change
> >> accordingly.
> >
> > I briefly looked at the code and error codes we have, so, my proposal
> > is one of the following
> 
> >  - use -ECANCELED (not the best choice for first occurrence here,
> > though I can't find better)
> 
> Actually -ENOTSUPP might suit the first case (although the actual
> would be something like -EOVERLAP, which we don't have)

Sorry for late reply, and many thanks for your great suggestion.

I am fine to use -ENOTSUPP as the first returned value, and -ECANCELED
for the 2nd one. Or define an enum as you suggested inside the function
or in header file.

Or use -EBUSY for the first case because existing resource is
overlapping but not fully contained by 'res'; and -EINVAL for
the 2nd case since didn't find any one resources which is contained by
'res', means we passed in a invalid resource. 

All is fine to me, I can repost with each of them.

Thanks
Baoquan

> 
> >  - use positive integers (or enum), like
> >   #define RES_REPARENTED 0
> >   #define RES_OVERLAPPED 1
> >   #define RES_NOCONFLICT 2
> >
> >
> >>> > +   if (firstpp == NULL)
> >>> > +   firstpp = pp;
> >>> > +   }
> >>>
> >>> > +   if (firstpp == NULL)
> >>> > +   return -1;  /* didn't find any conflicting entries? 
> >>> > */
> >>>
> >>> Ditto.
> >
> > Ditto.
> >
> >>>
> >>> > +}
> >>> > +EXPORT_SYMBOL(reparent_resources);
> >
> > --
> > With Best Regards,
> > Andy Shevchenko
> 
> 
> 
> -- 
> With Best Regards,
> Andy Shevchenko


Re: [PATCH 2/7 v5] iommu: of: make of_pci_map_rid() available for other devices too

2018-07-03 Thread Robin Murphy

On 20/05/18 14:49, Nipun Gupta wrote:

iommu-map property is also used by devices with fsl-mc. This
patch moves the of_pci_map_rid to generic location, so that it
can be used by other busses too.

'of_pci_map_rid' is renamed here to 'of_map_rid' and there is no
functional change done in the API.


Reviewed-by: Robin Murphy 

Signed-off-by: Nipun Gupta 
Reviewed-by: Rob Herring 
Acked-by: Bjorn Helgaas 
---
  drivers/iommu/of_iommu.c |   5 +--
  drivers/of/base.c| 102 +++
  drivers/of/irq.c |   5 +--
  drivers/pci/of.c | 101 --
  include/linux/of.h   |  11 +
  include/linux/of_pci.h   |  10 -
  6 files changed, 117 insertions(+), 117 deletions(-)

diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 5c36a8b..811e160 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -149,9 +149,8 @@ static int of_pci_iommu_init(struct pci_dev *pdev, u16 
alias, void *data)
struct of_phandle_args iommu_spec = { .args_count = 1 };
int err;
  
-	err = of_pci_map_rid(info->np, alias, "iommu-map",

-"iommu-map-mask", &iommu_spec.np,
-iommu_spec.args);
+   err = of_map_rid(info->np, alias, "iommu-map", "iommu-map-mask",
+&iommu_spec.np, iommu_spec.args);
if (err)
return err == -ENODEV ? NO_IOMMU : err;
  
diff --git a/drivers/of/base.c b/drivers/of/base.c

index 848f549..c7aac81 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -1995,3 +1995,105 @@ int of_find_last_cache_level(unsigned int cpu)
  
  	return cache_level;

  }
+
+/**
+ * of_map_rid - Translate a requester ID through a downstream mapping.
+ * @np: root complex device node.
+ * @rid: device requester ID to map.
+ * @map_name: property name of the map to use.
+ * @map_mask_name: optional property name of the mask to use.
+ * @target: optional pointer to a target device node.
+ * @id_out: optional pointer to receive the translated ID.
+ *
+ * Given a device requester ID, look up the appropriate implementation-defined
+ * platform ID and/or the target device which receives transactions on that
+ * ID, as per the "iommu-map" and "msi-map" bindings. Either of @target or
+ * @id_out may be NULL if only the other is required. If @target points to
+ * a non-NULL device node pointer, only entries targeting that node will be
+ * matched; if it points to a NULL value, it will receive the device node of
+ * the first matching target phandle, with a reference held.
+ *
+ * Return: 0 on success or a standard error code on failure.
+ */
+int of_map_rid(struct device_node *np, u32 rid,
+  const char *map_name, const char *map_mask_name,
+  struct device_node **target, u32 *id_out)
+{
+   u32 map_mask, masked_rid;
+   int map_len;
+   const __be32 *map = NULL;
+
+   if (!np || !map_name || (!target && !id_out))
+   return -EINVAL;
+
+   map = of_get_property(np, map_name, &map_len);
+   if (!map) {
+   if (target)
+   return -ENODEV;
+   /* Otherwise, no map implies no translation */
+   *id_out = rid;
+   return 0;
+   }
+
+   if (!map_len || map_len % (4 * sizeof(*map))) {
+   pr_err("%pOF: Error: Bad %s length: %d\n", np,
+   map_name, map_len);
+   return -EINVAL;
+   }
+
+   /* The default is to select all bits. */
+   map_mask = 0x;
+
+   /*
+* Can be overridden by "{iommu,msi}-map-mask" property.
+* If of_property_read_u32() fails, the default is used.
+*/
+   if (map_mask_name)
+   of_property_read_u32(np, map_mask_name, &map_mask);
+
+   masked_rid = map_mask & rid;
+   for ( ; map_len > 0; map_len -= 4 * sizeof(*map), map += 4) {
+   struct device_node *phandle_node;
+   u32 rid_base = be32_to_cpup(map + 0);
+   u32 phandle = be32_to_cpup(map + 1);
+   u32 out_base = be32_to_cpup(map + 2);
+   u32 rid_len = be32_to_cpup(map + 3);
+
+   if (rid_base & ~map_mask) {
+   pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) 
ignores rid-base (0x%x)\n",
+   np, map_name, map_name,
+   map_mask, rid_base);
+   return -EFAULT;
+   }
+
+   if (masked_rid < rid_base || masked_rid >= rid_base + rid_len)
+   continue;
+
+   phandle_node = of_find_node_by_phandle(phandle);
+   if (!phandle_node)
+   return -ENODEV;
+
+   if (target) {
+   if (*target)
+   of_node_put(phandle_node);
+   else
+   

Re: [PATCH 1/7 v5] Docs: dt: add fsl-mc iommu-map device-tree binding

2018-07-03 Thread Robin Murphy

On 20/05/18 14:49, Nipun Gupta wrote:

The existing IOMMU bindings cannot be used to specify the relationship
between fsl-mc devices and IOMMUs. This patch adds a generic binding for
mapping fsl-mc devices to IOMMUs, using iommu-map property.

Signed-off-by: Nipun Gupta 
Reviewed-by: Rob Herring 
---
  .../devicetree/bindings/misc/fsl,qoriq-mc.txt  | 39 ++
  1 file changed, 39 insertions(+)

diff --git a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt 
b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt
index 6611a7c..8cbed4f 100644
--- a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt
+++ b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt
@@ -9,6 +9,25 @@ blocks that can be used to create functional hardware 
objects/devices
  such as network interfaces, crypto accelerator instances, L2 switches,
  etc.
  
+For an overview of the DPAA2 architecture and fsl-mc bus see:

+drivers/staging/fsl-mc/README.txt


Nit: Looks like that's Documentation/networking/dpaa2/overview.rst now.


+
+As described in the above overview, all DPAA2 objects in a DPRC share the
+same hardware "isolation context" and a 10-bit value called an ICID
+(isolation context id) is expressed by the hardware to identify
+the requester.
+
+The generic 'iommus' property is insufficient to describe the relationship
+between ICIDs and IOMMUs, so an iommu-map property is used to define
+the set of possible ICIDs under a root DPRC and how they map to
+an IOMMU.
+
+For generic IOMMU bindings, see
+Documentation/devicetree/bindings/iommu/iommu.txt.
+
+For arm-smmu binding, see:
+Documentation/devicetree/bindings/iommu/arm,smmu.txt.
+
  Required properties:
  
  - compatible

@@ -88,14 +107,34 @@ Sub-nodes:
Value type: 
Definition: Specifies the phandle to the PHY device node 
associated
with the this dpmac.
+Optional properties:
+
+- iommu-map: Maps an ICID to an IOMMU and associated iommu-specifier
+  data.
+
+  The property is an arbitrary number of tuples of
+  (icid-base,iommu,iommu-base,length).
+
+  Any ICID i in the interval [icid-base, icid-base + length) is
+  associated with the listed IOMMU, with the iommu-specifier
+  (i - icid-base + iommu-base).
  
  Example:
  
+smmu: iommu@500 {

+   compatible = "arm,mmu-500";
+   #iommu-cells = <2>;


This should be 1 if stream-match-mask is present. Bad example is bad :)

Robin.


+   stream-match-mask = <0x7C00>;
+   ...
+};
+
  fsl_mc: fsl-mc@80c00 {
  compatible = "fsl,qoriq-mc";
  reg = <0x0008 0x0c00 0 0x40>,/* MC portal base */
<0x 0x0834 0 0x4>; /* MC control reg */
  msi-parent = <&its>;
+/* define map for ICIDs 23-64 */
+iommu-map = <23 &smmu 23 41>;
  #address-cells = <3>;
  #size-cells = <1>;
  



Re: [PATCHv3 0/4] drivers/base: bugfix for supplier<-consumer ordering in device_kset

2018-07-03 Thread Rafael J. Wysocki
On Tuesday, July 3, 2018 8:50:38 AM CEST Pingfan Liu wrote:
> commit 52cdbdd49853 ("driver core: correct device's shutdown order")
> places an assumption of supplier<-consumer order on the process of probe.
> But it turns out to break down the parent <- child order in some scene.
> E.g in pci, a bridge is enabled by pci core, and behind it, the devices
> have been probed. Then comes the bridge's module, which enables extra
> feature(such as hotplug) on this bridge.

So what *exactly* does happen in that case?



Re: [PATCHv3 3/4] drivers/base: clean up the usage of devices_kset_move_last()

2018-07-03 Thread Rafael J. Wysocki
On Tuesday, July 3, 2018 8:50:41 AM CEST Pingfan Liu wrote:
> Clean up the referring to the code in commit 52cdbdd49853 ("driver core:
> correct device's shutdown order"). So later we can revert it safely.
> 
> Cc: Greg Kroah-Hartman 
> Cc: Rafael J. Wysocki 
> Cc: Grygorii Strashko 
> Cc: Christoph Hellwig 
> Cc: Bjorn Helgaas 
> Cc: Dave Young 
> Cc: linux-...@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: Pingfan Liu 
> ---
>  drivers/base/core.c | 7 ---
>  1 file changed, 7 deletions(-)
> 
> diff --git a/drivers/base/core.c b/drivers/base/core.c
> index 684b994..db3deb8 100644
> --- a/drivers/base/core.c
> +++ b/drivers/base/core.c
> @@ -127,13 +127,6 @@ static int device_reorder_to_tail(struct device *dev, 
> void *not_used)
>  {
>   struct device_link *link;
>  
> - /*
> -  * Devices that have not been registered yet will be put to the ends
> -  * of the lists during the registration, so skip them here.
> -  */
> - if (device_is_registered(dev))
> - devices_kset_move_last(dev);
> -
>   if (device_pm_initialized(dev))
>   device_pm_move_last(dev);

You can't do this.

If you do it, that will break power management in some situations.

Thanks,
Rafael



How is this possible - Register r30 contains 0xc2236400 instead of 0xc6236400

2018-07-03 Thread Christophe LEROY
Kernel Oops at 0xc0334d5c for reading at address 0xc2236450 which 
corresponds to r30 + 80


But r30 should contain what's at r3 + 16 that is at 0xc619ec10 so r30 
should be c6236400 as shown below (print_hex_dump(regs->gpr[3]) added at 
end of __die() )


So how can r30 contain 0xc2236400 instead ?

And this is not random, it happens at most if not every startup.

c0334d44 :
c0334d44:   7c 08 02 a6 mflr    r0
c0334d48:   94 21 ff f0 stwu    r1,-16(r1)
c0334d4c:   bf c1 00 08 stmw    r30,8(r1)
c0334d50:   90 01 00 14 stw r0,20(r1)
c0334d54:   83 c3 00 10 lwz r30,16(r3)
c0334d58:   81 23 00 a8 lwz r9,168(r3)
c0334d5c:   81 5e 00 50 lwz r10,80(r30)


[  152.288237] Unable to handle kernel paging request for data at 
address 0xc2236450

[  152.295444] Faulting instruction address: 0xc0334d5c
[  152.300369] Oops: Kernel access of bad area, sig: 11 [#1]
[  152.305665] BE PREEMPT DEBUG_PAGEALLOC CMPC885
[  152.313630] CPU: 0 PID: 269 Comm: in:imuxsock Not tainted 
4.14.52-00025-g5bada429cf-dirty #36

[  152.322729] task: c623e100 task.stack: c650c000
[  152.327202] NIP:  c0334d5c LR: c043602c CTR: c0435fb8
[  152.332200] REGS: c650dc00 TRAP: 0300   Not tainted 
(4.14.52-00025-g5bada429cf-dirty)

[  152.340699] MSR:  9032   CR: 28002822 XER: 2000
[  152.347333] DAR: c2236450 DSISR: c000
[  152.347333] GPR00: c043602c c650dcb0 c623e100 c619ec00 c642c060 
0008 0018 c650dd4c
[  152.347333] GPR08: c0435fb8 02b0 c068d830 0004 28004822 
100d4208  7780c848
[  152.347333] GPR16: 0ff58398 777674b0 1024b050 1024b0a8 1005ddbc 
0ff5a7bc 03e8 
[  152.347333] GPR24: 008e c5011650 c650deb8 008e c619ec00 
0040 c2236400 c619ec00

[  152.385015] NIP [c0334d5c] sock_wfree+0x18/0xa4
[  152.389458] LR [c043602c] unix_destruct_scm+0x74/0x88
[  152.394399] Call Trace:
[  152.396868] [c650dcb0] [c006348c] ns_to_timeval+0x4c/0x7c (unreliable)
[  152.403305] [c650dcc0] [c043602c] unix_destruct_scm+0x74/0x88
[  152.408999] [c650dcf0] [c033a10c] skb_release_head_state+0x8c/0x110
[  152.415184] [c650dd00] [c033a3c4] skb_release_all+0x18/0x50
[  152.420690] [c650dd10] [c033a7cc] consume_skb+0x38/0xec
[  152.425869] [c650dd20] [c0342d7c] skb_free_datagram+0x1c/0x68
[  152.431535] [c650dd30] [c0435c8c] unix_dgram_recvmsg+0x19c/0x4ac
[  152.437476] [c650ddb0] [c0331370] ___sys_recvmsg+0x98/0x138
[  152.442984] [c650deb0] [c0333280] __sys_recvmsg+0x40/0x84
[  152.448321] [c650df10] [c0333680] SyS_socketcall+0xb8/0x1d4
[  152.453832] [c650df40] [c000d1ac] ret_from_syscall+0x0/0x38
[  152.459286] Instruction dump:
[  152.462225] 41beffac 4b58 3883 4ba0 3881 4b98 
7c0802a6 9421fff0
[  152.469881] bfc10008 90010014 83c30010 812300a8 <815e0050> 3bfe00e0 
71480200 4082003c

[  152.477739] c619ec00: 00 00 00 00 00 00 00 00 00 00 00 23 6f d9 b1 65
[  152.484100] c619ec10: c6 23 64 00 00 00 00 00 c6 42 c0 60 00 00 03 e8
[  152.490471] c619ec20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  152.496837] c619ec30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  152.503205] c619ec40: 00 00 00 00 00 00 00 00 00 00 00 00 c0 43 5f b8
[  152.509575] c619ec50: 00 00 00 00 00 00 00 00 00 00 00 8e 00 00 00 00
[  152.515943] c619ec60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  152.522311] c619ec70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  152.528680] c619ec80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  152.535048] c619ec90: 00 00 ff ff 00 00 ff ff c6 42 30 8e c6 42 31 50
[  152.541417] c619eca0: c6 42 30 00 c6 42 30 00 00 00 02 b0 00 00 00 01
[  152.547781] ---[ end trace 0710a9d231876a27 ]---

Christophe


Re: CONFIG_LD_DEAD_CODE_DATA_ELIMINATION: Section mismatch in reference from the variable via_pmu_driver to the function .init.text:pmu_init()

2018-07-03 Thread Nicholas Piggin
On Tue, 3 Jul 2018 13:40:55 +0200
Mathieu Malaterre  wrote:

> On Tue, Jul 3, 2018 at 11:40 AM Mathieu Malaterre  wrote:
> >
> > Hi Nick,
> >
> > Would you consider this a bug:
> >
> > $ touch drivers/macintosh/via-pmu.c
> > $ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=n make ARCH=powerpc
> > CROSS_COMPILE=powerpc-linux-gnu-
> > ...
> >   LD  vmlinux.o
> >   MODPOST vmlinux.o
> > WARNING: vmlinux.o(.data+0x216018): Section mismatch in reference from
> > the variable via_pmu_driver to the function .init.text:pmu_init()
> > The variable via_pmu_driver references
> > the function __init pmu_init()
> > If the reference is valid then annotate the
> > variable with __init* or __refdata (see linux/init.h) or name the variable:
> > *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console
> >
> > While:
> >
> > $ touch drivers/macintosh/via-pmu.c
> > $ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y make ARCH=powerpc
> > CROSS_COMPILE=powerpc-linux-gnu-
> > ...
> >   AR  init/built-in.a
> >   AR  built-in.a
> >   LD  vmlinux.o
> >   MODPOST vmlinux.o
> >   KSYM.tmp_kallsyms1.o
> >   KSYM.tmp_kallsyms2.o
> >   LD  vmlinux
> >   SORTEX  vmlinux
> >   SYSMAP  System.map
> > ...
> >
> > Thanks for comment  
> 
> Just to clarify I reverted 58935176ad17976b7a7f6ea25c0ceb2ca4308a30
> just as to reproduce a warning. So my question (rephrased):
> 
> Is this expected that CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y inhibit
> the behavior of  CONFIG_DEBUG_SECTION_MISMATCH=y ?

Well that's a good question actually. Section mismatch
analysis is done on the throwaway vmlinux.o which is not linked
with --gc-sections (and is not a final link), so the via_pmu_driver
symbol should exist and be picked up.

I wonder if something about the  -ffunction-sections is breaking
the reference detection.

Thanks,
Nick


Re: CONFIG_LD_DEAD_CODE_DATA_ELIMINATION: Section mismatch in reference from the variable via_pmu_driver to the function .init.text:pmu_init()

2018-07-03 Thread Mathieu Malaterre
On Tue, Jul 3, 2018 at 11:40 AM Mathieu Malaterre  wrote:
>
> Hi Nick,
>
> Would you consider this a bug:
>
> $ touch drivers/macintosh/via-pmu.c
> $ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=n make ARCH=powerpc
> CROSS_COMPILE=powerpc-linux-gnu-
> ...
>   LD  vmlinux.o
>   MODPOST vmlinux.o
> WARNING: vmlinux.o(.data+0x216018): Section mismatch in reference from
> the variable via_pmu_driver to the function .init.text:pmu_init()
> The variable via_pmu_driver references
> the function __init pmu_init()
> If the reference is valid then annotate the
> variable with __init* or __refdata (see linux/init.h) or name the variable:
> *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console
>
> While:
>
> $ touch drivers/macintosh/via-pmu.c
> $ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y make ARCH=powerpc
> CROSS_COMPILE=powerpc-linux-gnu-
> ...
>   AR  init/built-in.a
>   AR  built-in.a
>   LD  vmlinux.o
>   MODPOST vmlinux.o
>   KSYM.tmp_kallsyms1.o
>   KSYM.tmp_kallsyms2.o
>   LD  vmlinux
>   SORTEX  vmlinux
>   SYSMAP  System.map
> ...
>
> Thanks for comment

Just to clarify I reverted 58935176ad17976b7a7f6ea25c0ceb2ca4308a30
just as to reproduce a warning. So my question (rephrased):

Is this expected that CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y inhibit
the behavior of  CONFIG_DEBUG_SECTION_MISMATCH=y ?

Thanks


Re: Oops in sock_wfree

2018-07-03 Thread Christophe LEROY




Le 03/07/2018 à 10:51, Christophe LEROY a écrit :

Hi,

I was having strange unexpected memory corruption, therefore I activated 
DEBUG_PAGEALLOC and I now end up with the following Oops, which tends to 
make me think we have somewhere in the network code a use-after-free 
bug. I saw a few of such bugs have been fixed for IPv4 and IPv6. Maybe 
we have one remaining for Unix sockets ? How can I spot it off and fix it ?


[   39.645644] Unable to handle kernel paging request for data at 
address 0xc2235010


In fact, must be something else. This page has never been allocated.
In seems that skb->sk should be c6234fc0 and suddenly it has changed to 
c2234fc0


How can I track that ?

Christophe


[   39.652860] Faulting instruction address: 0xc0334d5c
[   39.657783] Oops: Kernel access of bad area, sig: 11 [#1]
[   39.663085] BE PREEMPT DEBUG_PAGEALLOC CMPC885
[   39.667488] SAF3000 DIE NOTIFICATION
[   39.671050] CPU: 0 PID: 269 Comm: in:imuxsock Not tainted 
4.14.52-00025-g5bada429cf #22

[   39.679633] task: c623e100 task.stack: c651e000
[   39.684106] NIP:  c0334d5c LR: c043602c CTR: c0435fb8
[   39.689103] REGS: c651fc00 TRAP: 0300   Not tainted 
(4.14.52-00025-g5bada429cf)

[   39.697087] MSR:  9032   CR: 28002822 XER: 2000
[   39.703720] DAR: c2235010 DSISR: c000
[   39.703720] GPR00: c043602c c651fcb0 c623e100 c619eec0 c642c540 
0008 0018 c651fd4c
[   39.703720] GPR08: c0435fb8 02b0 c068d830 0004 28004822 
100d4208  77990848
[   39.703720] GPR16: 0ff58398 778eb4b0 1039f050 1039f0a8 1005ddbc 
0ff5a7bc  
[   39.703720] GPR24: 0072 c5011650 c651feb8 0072 c619eec0 
0040 c2234fc0 c619eec0

[   39.741401] NIP [c0334d5c] sock_wfree+0x18/0xa4
[   39.745843] LR [c043602c] unix_destruct_scm+0x74/0x88
[   39.750786] Call Trace:
[   39.753253] [c651fcb0] [c006348c] ns_to_timeval+0x4c/0x7c (unreliable)
[   39.759690] [c651fcc0] [c043602c] unix_destruct_scm+0x74/0x88
[   39.765385] [c651fcf0] [c033a10c] skb_release_head_state+0x8c/0x110
[   39.771571] [c651fd00] [c033a3c4] skb_release_all+0x18/0x50
[   39.777078] [c651fd10] [c033a7cc] consume_skb+0x38/0xec
[   39.782255] [c651fd20] [c0342d7c] skb_free_datagram+0x1c/0x68
[   39.787922] [c651fd30] [c0435c8c] unix_dgram_recvmsg+0x19c/0x4ac
[   39.793863] [c651fdb0] [c0331370] ___sys_recvmsg+0x98/0x138
[   39.799371] [c651feb0] [c0333280] __sys_recvmsg+0x40/0x84
[   39.804707] [c651ff10] [c0333680] SyS_socketcall+0xb8/0x1d4
[   39.810220] [c651ff40] [c000d1ac] ret_from_syscall+0x0/0x38
[   39.815673] Instruction dump:
[   39.818612] 41beffac 4b58 3883 4ba0 3881 4b98 
7c0802a6 9421fff0
[   39.826267] bfc10008 90010014 83c30010 812300a8 <815e0050> 3bfe00e0 
71480200 4082003c

[   39.834113] ---[ end trace 8affde0490d7e25e ]---

Thanks
Christophe


[v2 PATCH 1/2] powerpc: Detect the presence of big-cores via "ibm, thread-groups"

2018-07-03 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

On IBM POWER9, the device tree exposes a property array identifed by
"ibm,thread-groups" which will indicate which groups of threads share a
particular set of resources.

As of today we only have one form of grouping identifying the group of
threads in the core that share the L1 cache, translation cache and
instruction data flow.

This patch defines the helper function to parse the contents of
"ibm,thread-groups" and a new structure to contain the parsed output.

The patch also creates the sysfs file named "small_core_siblings" that
returns the physical ids of the threads in the core that share the L1
cache, translation cache and instruction data flow.

Signed-off-by: Gautham R. Shenoy 
---
 Documentation/ABI/testing/sysfs-devices-system-cpu |   8 ++
 arch/powerpc/include/asm/cputhreads.h  |  22 +
 arch/powerpc/kernel/setup-common.c | 110 +
 arch/powerpc/kernel/sysfs.c|  35 +++
 4 files changed, 175 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu 
b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 9c5e7732..53a823a 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -487,3 +487,11 @@ Description:   Information about CPU vulnerabilities
"Not affected"CPU is not affected by the vulnerability
"Vulnerable"  CPU is affected and no mitigation in effect
"Mitigation: $M"  CPU is affected and mitigation $M is in effect
+
+What:  /sys/devices/system/cpu/cpu[0-9]+/small_core_sibings
+Date:  03-Jul-2018
+KernelVersion: v4.18.0
+Contact:   Gautham R. Shenoy 
+Description:   List of Physical ids of CPUs which share the the L1 cache,
+   translation cache and instruction data-flow with this CPU.
+Values:Comma separated list of decimal integers.
diff --git a/arch/powerpc/include/asm/cputhreads.h 
b/arch/powerpc/include/asm/cputhreads.h
index d71a909..33226d7 100644
--- a/arch/powerpc/include/asm/cputhreads.h
+++ b/arch/powerpc/include/asm/cputhreads.h
@@ -23,11 +23,13 @@
 extern int threads_per_core;
 extern int threads_per_subcore;
 extern int threads_shift;
+extern bool has_big_cores;
 extern cpumask_t threads_core_mask;
 #else
 #define threads_per_core   1
 #define threads_per_subcore1
 #define threads_shift  0
+#define has_big_cores  0
 #define threads_core_mask  (*get_cpu_mask(0))
 #endif
 
@@ -69,12 +71,32 @@ static inline cpumask_t cpu_online_cores_map(void)
return cpu_thread_mask_to_cores(cpu_online_mask);
 }
 
+#define MAX_THREAD_LIST_SIZE   8
+struct thread_groups {
+   unsigned int property;
+   unsigned int nr_groups;
+   unsigned int threads_per_group;
+   unsigned int thread_list[MAX_THREAD_LIST_SIZE];
+};
+
 #ifdef CONFIG_SMP
 int cpu_core_index_of_thread(int cpu);
 int cpu_first_thread_of_core(int core);
+int parse_thread_groups(struct device_node *dn, struct thread_groups *tg);
+int get_cpu_thread_group_start(int cpu, struct thread_groups *tg);
 #else
 static inline int cpu_core_index_of_thread(int cpu) { return cpu; }
 static inline int cpu_first_thread_of_core(int core) { return core; }
+static inline int parse_thread_groups(struct device_node *dn,
+ struct thread_groups *tg)
+{
+   return -ENODATA;
+}
+
+static inline int get_cpu_thread_group_start(int cpu, struct thread_groups *tg)
+{
+   return -1;
+}
 #endif
 
 static inline int cpu_thread_in_core(int cpu)
diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index 40b44bb..a78ec66 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -402,10 +402,12 @@ void __init check_for_initrd(void)
 #ifdef CONFIG_SMP
 
 int threads_per_core, threads_per_subcore, threads_shift;
+bool has_big_cores = true;
 cpumask_t threads_core_mask;
 EXPORT_SYMBOL_GPL(threads_per_core);
 EXPORT_SYMBOL_GPL(threads_per_subcore);
 EXPORT_SYMBOL_GPL(threads_shift);
+EXPORT_SYMBOL_GPL(has_big_cores);
 EXPORT_SYMBOL_GPL(threads_core_mask);
 
 static void __init cpu_init_thread_core_maps(int tpc)
@@ -433,6 +435,108 @@ static void __init cpu_init_thread_core_maps(int tpc)
 
 u32 *cpu_to_phys_id = NULL;
 
+/*
+ * parse_thread_groups: Parses the "ibm,thread-groups" device tree
+ *  property for the CPU device node dn and stores
+ *  the parsed output in the thread_groups
+ *  structure tg.
+ *
+ * ibm,thread-groups[0..N-1] array defines which group of threads in
+ * the CPU-device node can be grouped together based on the property.
+ *
+ * ibm,thread-groups[0] tells us the property based on which the
+ * threads are being grouped together. If this value is 1, it implies
+ * that the threads in the same group share L1, translation cache.
+ *
+ 

[v2 PATCH 0/2] powerpc: Detection and scheduler optimization for POWER9 bigcore

2018-07-03 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Hi,

This is the second iteration of the patchset to add support for big-core on 
POWER9.

The earlier version can be found here: https://lkml.org/lkml/2018/5/11/245.

The changes from the previous version:
- Added comments explaining the "ibm,thread-groups" device tree property.
- Uses cleaner device-tree parsing functions to parse the u32 arrays.
- Adds a sysfs file listing the small-core siblings for every CPU.
- Enables the scheduler optimization by setting the CPU_FTR_ASYM_SMT bit
  in the cur_cpu_spec->cpu_features on detecting the presence
  of interleaved big-core.
- Handles the corner case where there is only a single thread-group
  or when there is a single thread in a thread-group.


Description:

A pair of IBM POWER9 SMT4 cores can be fused together to form a
big-core with 8 SMT threads. This can be discovered via the
"ibm,thread-groups" CPU property in the device tree which will
indicate which group of threads that share the L1 cache, translation
cache and instruction data flow.  If there are multiple such group of
threads, then the core is a big-core. Furthermore, the thread-ids of
such a big-core is obtained by interleaving the thread-ids of the
component SMT4 cores.

Eg: Threads in the pair of component SMT4 cores of an interleaved
big-core are numbered {0,2,4,6} and {1,3,5,7} respectively.

On such a big-core, when multiple tasks are scheduled to run on the
big-core, we get the best performance when the tasks are spread across
the pair of SMT4 cores.

The Linux scheduler supports a flag called "SD_ASYM_PACKING" which
when set in the SMT sched-domain, biases the load-balancing of the
tasks on the smaller numbered threads in the core. On an big-core
whose threads are interleavings of the threads of the small cores,
enabling SD_ASYM_PACKING in the SMT sched-domain automatically results
in spreading the tasks uniformly across the associated pair of SMT4
cores, thereby yielding better performance.

This patchset contains two patches which on detecting the presence of
interleaved big-cores will enable the the CPU_FTR_ASYM_SMT bit in the
cur_cpu_spec->cpu_feature.

Patch 1: adds support to detect the presence of
big-cores and reports the small-core siblings of each CPU X
via the sysfs file "/sys/devices/system/cpu/cpuX/big_core_siblings".

Patch 2: checks if the thread-ids of the component small-cores are
interleaved, in which case we enable the the CPU_FTR_ASYM_SMT bit in
the cur_cpu_spec->cpu_features which results in the SD_ASYM_PACKING
flag being set at the SMT level sched-domain.

Results:
~
Experimental results for ebizzy with 2 threads, bound to a single big-core
show a marked improvement with this patchset over the 4.18-rc3 vanilla
kernel.

The result of 100 such runs for 4.18-rc3 kernel and the 4.18-rc3 +
big-core-patches are as follows

4.18-rc3:
~~~
records/s:  # samples  : Histogram
~~~ 
[0 -   100]  :  0  : #
[100 - 200]  :  3  : #
[200 - 300]  :  16 : 
[300 - 400]  :  11 : ###
[400 - 500]  :  0  : #
[500 - 600]  :  70 : ###

4.18-rc3 + big-core-patches
~~~
records/s:  # samples  : Histogram
~~~ 
[0 -   100]  :  0  : #
[100 - 200]  :  0  : #
[200 - 300]  :  6  : ##
[300 - 400]  :  0  : #
[400 - 500]  :  2  : #
[500 - 600]  :  92 : ###



Gautham R. Shenoy (2):
  powerpc: Detect the presence of big-cores via "ibm,thread-groups"
  powerpc: Enable CPU_FTR_ASYM_SMT for interleaved big-cores

 Documentation/ABI/testing/sysfs-devices-system-cpu |   8 +
 arch/powerpc/include/asm/cputhreads.h  |  22 +++
 arch/powerpc/kernel/setup-common.c | 177 -
 arch/powerpc/kernel/sysfs.c|  35 
 4 files changed, 241 insertions(+), 1 deletion(-)

-- 
1.9.4



[v2 PATCH 2/2] powerpc: Enable CPU_FTR_ASYM_SMT for interleaved big-cores

2018-07-03 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

A pair of IBM POWER9 SMT4 cores can be fused together to form a big-core
with 8 SMT threads. This can be discovered via the "ibm,thread-groups"
CPU property in the device tree which will indicate which group of
threads that share the L1 cache, translation cache and instruction data
flow. If there are multiple such group of threads, then the core is a
big-core.

Furthermore, if the thread-ids of the threads of the big-core can be
obtained by interleaving the thread-ids of the thread-groups
(component small core), then such a big-core is called an interleaved
big-core.

Eg: Threads in the pair of component SMT4 cores of an interleaved
big-core are numbered {0,2,4,6} and {1,3,5,7} respectively.

The SMT4 cores forming a big-core are more or less independent
units. Thus when multiple tasks are scheduled to run on the fused
core, we get the best performance when the tasks are spread across the
pair of SMT4 cores.

This patch enables CPU_FTR_ASYM_SMT bit in the cpu-features on
detecting the presence of interleaved big-cores at boot up. This will
will bias the load-balancing of tasks on smaller numbered threads,
which will automatically result in spreading the tasks uniformly
across the associated pair of SMT4 cores.

Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/setup-common.c | 67 +-
 1 file changed, 66 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index a78ec66..f63d797 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -537,6 +537,56 @@ int get_cpu_thread_group_start(int cpu, struct 
thread_groups *tg)
return -1;
 }
 
+/*
+ * check_interleaved_big_core - Checks if the thread group tg
+ * corresponds to a big-core whose threads are interleavings of the
+ * threads of the component small cores.
+ *
+ * @tg: A thread-group struct for the core.
+ *
+ * Returns true if the core is a interleaved big-core.
+ * Returns false otherwise.
+ */
+static inline bool check_interleaved_big_core(struct thread_groups *tg)
+{
+   int nr_groups;
+   int threads_per_group;
+   int cur_cpu, next_cpu, i, j;
+
+   nr_groups = tg->nr_groups;
+   threads_per_group = tg->threads_per_group;
+
+   if (tg->property != 1)
+   return false;
+
+   if (nr_groups < 2 || threads_per_group < 2)
+   return false;
+
+   /*
+* In case of an interleaved big-core, the thread-ids of the
+* big-core can be obtained by interleaving the the thread-ids
+* of the component small
+*
+* Eg: On a 8-thread big-core with two SMT4 small cores, the
+* threads of the two component small cores will be
+* {0, 2, 4, 6} and {1, 3, 5, 7}.
+*/
+   for (i = 0; i < nr_groups; i++) {
+   int group_start = i * threads_per_group;
+
+   for (j = 0; j < threads_per_group - 1; j++) {
+   int cur_idx = group_start + j;
+
+   cur_cpu = tg->thread_list[cur_idx];
+   next_cpu = tg->thread_list[cur_idx + 1];
+   if (next_cpu != cur_cpu + nr_groups)
+   return false;
+   }
+   }
+
+   return true;
+}
+
 /**
  * setup_cpu_maps - initialize the following cpu maps:
  *  cpu_possible_mask
@@ -560,6 +610,7 @@ void __init smp_setup_cpu_maps(void)
struct device_node *dn;
int cpu = 0;
int nthreads = 1;
+   bool has_interleaved_big_core = true;
 
DBG("smp_setup_cpu_maps()\n");
 
@@ -613,6 +664,12 @@ void __init smp_setup_cpu_maps(void)
if (parse_thread_groups(dn, &tg) ||
tg.nr_groups < 1 || tg.property != 1) {
has_big_cores = false;
+   has_interleaved_big_core = false;
+   }
+
+   if (has_interleaved_big_core) {
+   has_interleaved_big_core =
+   check_interleaved_big_core(&tg);
}
 
if (cpu >= nr_cpu_ids) {
@@ -669,7 +726,15 @@ void __init smp_setup_cpu_maps(void)
vdso_data->processorCount = num_present_cpus();
 #endif /* CONFIG_PPC64 */
 
-/* Initialize CPU <=> thread mapping/
+   if (has_interleaved_big_core) {
+   int key = __builtin_ctzl(CPU_FTR_ASYM_SMT);
+
+   cur_cpu_spec->cpu_features |= CPU_FTR_ASYM_SMT;
+   static_branch_enable(&cpu_feature_keys[key]);
+   pr_info("Detected interleaved big-cores\n");
+   }
+
+   /* Initialize CPU <=> thread mapping/
 *
 * WARNING: We assume that the number of threads is the same for
 * every CPU in the system. If that is not the case, then some code
-- 
1.9.4



Re: [PATCHv3 2/4] drivers/base: utilize device tree info to shutdown devices

2018-07-03 Thread Andy Shevchenko
I think Pavel would be interested to see this as well (he is doing
some parallel device shutdown stuff)

On Tue, Jul 3, 2018 at 9:50 AM, Pingfan Liu  wrote:
> commit 52cdbdd49853 ("driver core: correct device's shutdown order")
> places an assumption of supplier<-consumer order on the process of probe.
> But it turns out to break down the parent <- child order in some scene.
> E.g in pci, a bridge is enabled by pci core, and behind it, the devices
> have been probed. Then comes the bridge's module, which enables extra
> feature(such as hotplug) on this bridge. This will break the
> parent<-children order and cause failure when "kexec -e" in some scenario.
>
> The detailed description of the scenario:
> An IBM Power9 machine on which, two drivers portdrv_pci and shpchp(a mod)
> match the PCI_CLASS_BRIDGE_PCI, but neither of them success to probe due
> to some issue. For this case, the bridge is moved after its children in
> devices_kset. Then, when "kexec -e", a ata-disk behind the bridge can not
> write back buffer in flight due to the former shutdown of the bridge which
> clears the BusMaster bit.
>
> It is a little hard to impose both "parent<-child" and "supplier<-consumer"
> order on devices_kset. Take the following scene:
> step0: before a consumer's probing, (note child_a is supplier of consumer_a)
>   [ consumer-X, child_a, , child_z] [... consumer_a, ..., consumer_z, 
> ...] supplier-X
>  ^^ affected range ^^
> step1: when probing, moving consumer-X after supplier-X
>   [ child_a, , child_z] [ consumer_a, ..., consumer_z, ...] 
> supplier-X, consumer-X
> step2: the children of consumer-X should be re-ordered to maintain the seq
>   [... consumer_a, ..., consumer_z, ] supplier-X  [consumer-X, child_a, 
> , child_z]
> step3: the consumer_a should be re-ordered to maintain the seq
>   [... consumer_z, ...] supplier-X [ consumer-X, child_a, consumer_a ..., 
> child_z]
>
> It requires two nested recursion to drain out all out-of-order item in
> "affected range". To avoid such complicated code, this patch suggests
> to utilize the info in device tree, instead of using the order of
> devices_kset during shutdown. It iterates the device tree, and firstly
> shutdown a device's children and consumers. After this patch, the buggy
> commit is hollow and left to clean.
>
> Cc: Greg Kroah-Hartman 
> Cc: Rafael J. Wysocki 
> Cc: Grygorii Strashko 
> Cc: Christoph Hellwig 
> Cc: Bjorn Helgaas 
> Cc: Dave Young 
> Cc: linux-...@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: Pingfan Liu 
> ---
>  drivers/base/core.c| 48 +++-
>  include/linux/device.h |  1 +
>  2 files changed, 44 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/base/core.c b/drivers/base/core.c
> index a48868f..684b994 100644
> --- a/drivers/base/core.c
> +++ b/drivers/base/core.c
> @@ -1446,6 +1446,7 @@ void device_initialize(struct device *dev)
> INIT_LIST_HEAD(&dev->links.consumers);
> INIT_LIST_HEAD(&dev->links.suppliers);
> dev->links.status = DL_DEV_NO_DRIVER;
> +   dev->shutdown = false;
>  }
>  EXPORT_SYMBOL_GPL(device_initialize);
>
> @@ -2811,7 +2812,6 @@ static void __device_shutdown(struct device *dev)
>  * lock is to be held
>  */
> parent = get_device(dev->parent);
> -   get_device(dev);
> /*
>  * Make sure the device is off the kset list, in the
>  * event that dev->*->shutdown() doesn't remove it.
> @@ -2842,23 +2842,60 @@ static void __device_shutdown(struct device *dev)
> dev_info(dev, "shutdown\n");
> dev->driver->shutdown(dev);
> }
> -
> +   dev->shutdown = true;
> device_unlock(dev);
> if (parent)
> device_unlock(parent);
>
> -   put_device(dev);
> put_device(parent);
> spin_lock(&devices_kset->list_lock);
>  }
>
> +/* shutdown dev's children and consumer firstly, then itself */
> +static int device_for_each_child_shutdown(struct device *dev)
> +{
> +   struct klist_iter i;
> +   struct device *child;
> +   struct device_link *link;
> +
> +   /* already shutdown, then skip this sub tree */
> +   if (dev->shutdown)
> +   return 0;
> +
> +   if (!dev->p)
> +   goto check_consumers;
> +
> +   /* there is breakage of lock in __device_shutdown(), and the redundant
> +* ref++ on srcu protected consumer is harmless since shutdown is not
> +* hot path.
> +*/
> +   get_device(dev);
> +
> +   klist_iter_init(&dev->p->klist_children, &i);
> +   while ((child = next_device(&i)))
> +   device_for_each_child_shutdown(child);
> +   klist_iter_exit(&i);
> +
> +check_consumers:
> +   list_for_each_entry_rcu(link, &dev->links.consumers, s_node) {
> +   if (!link->consumer->shutdown)
> + 

Re: [PATCH v5 5/7] powerpc/pseries: flush SLB contents on SLB MCE errors.

2018-07-03 Thread Michal Suchánek
On Tue, 3 Jul 2018 08:08:14 +1000
Nicholas Piggin  wrote:

> On Mon, 02 Jul 2018 11:17:06 +0530
> Mahesh J Salgaonkar  wrote:
> 
> > From: Mahesh Salgaonkar 
> > 
> > On pseries, as of today system crashes if we get a machine check
> > exceptions due to SLB errors. These are soft errors and can be
> > fixed by flushing the SLBs so the kernel can continue to function
> > instead of system crash. We do this in real mode before turning on
> > MMU. Otherwise we would run into nested machine checks. This patch
> > now fetches the rtas error log in real mode and flushes the SLBs on
> > SLB errors.
> > 
> > Signed-off-by: Mahesh Salgaonkar 
> > ---
> >  arch/powerpc/include/asm/book3s/64/mmu-hash.h |1 
> >  arch/powerpc/include/asm/machdep.h|1 
> >  arch/powerpc/kernel/exceptions-64s.S  |   42
> > + arch/powerpc/kernel/mce.c
> > |   16 +++- arch/powerpc/mm/slb.c |
> > 6 +++ arch/powerpc/platforms/powernv/opal.c |1 
> >  arch/powerpc/platforms/pseries/pseries.h  |1 
> >  arch/powerpc/platforms/pseries/ras.c  |   51
> > +
> > arch/powerpc/platforms/pseries/setup.c|1 9 files
> > changed, 116 insertions(+), 4 deletions(-) 
> 
> 
> > +TRAMP_REAL_BEGIN(machine_check_pSeries_early)
> > +BEGIN_FTR_SECTION
> > +   EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
> > +   mr  r10,r1  /* Save r1 */
> > +   ld  r1,PACAMCEMERGSP(r13)   /* Use MC emergency
> > stack */
> > +   subir1,r1,INT_FRAME_SIZE/* alloc stack
> > frame   */
> > +   mfspr   r11,SPRN_SRR0   /* Save SRR0 */
> > +   mfspr   r12,SPRN_SRR1   /* Save SRR1 */
> > +   EXCEPTION_PROLOG_COMMON_1()
> > +   EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
> > +   EXCEPTION_PROLOG_COMMON_3(0x200)
> > +   addir3,r1,STACK_FRAME_OVERHEAD
> > +   BRANCH_LINK_TO_FAR(machine_check_early) /* Function call
> > ABI */  
> 
> Is there any reason you can't use the existing
> machine_check_powernv_early code to do all this?

Code sharing is nice but if we envision this going to stable kernels
butchering the existing handler is going to be a nightmare. The code is
quite a bit different between kernel versions.

This code as is requires the bit that introduces
EXCEPTION_PROLOG_COMMON_1 and then should work on Linux 3.14+

Thanks

Michal


Re: [PATCH v5 2/7] powerpc/pseries: Defer the logging of rtas error to irq work queue.

2018-07-03 Thread Mahesh Jagannath Salgaonkar
On 07/03/2018 08:55 AM, Nicholas Piggin wrote:
> On Mon, 02 Jul 2018 11:16:29 +0530
> Mahesh J Salgaonkar  wrote:
> 
>> From: Mahesh Salgaonkar 
>>
>> rtas_log_buf is a buffer to hold RTAS event data that are communicated
>> to kernel by hypervisor. This buffer is then used to pass RTAS event
>> data to user through proc fs. This buffer is allocated from vmalloc
>> (non-linear mapping) area.
>>
>> On Machine check interrupt, register r3 points to RTAS extended event
>> log passed by hypervisor that contains the MCE event. The pseries
>> machine check handler then logs this error into rtas_log_buf. The
>> rtas_log_buf is a vmalloc-ed (non-linear) buffer we end up taking up a
>> page fault (vector 0x300) while accessing it. Since machine check
>> interrupt handler runs in NMI context we can not afford to take any
>> page fault. Page faults are not honored in NMI context and causes
>> kernel panic. Apart from that, as Nick pointed out, pSeries_log_error()
>> also takes a spin_lock while logging error which is not safe in NMI
>> context. It may endup in deadlock if we get another MCE before releasing
>> the lock. Fix this by deferring the logging of rtas error to irq work queue.
>>
>> Current implementation uses two different buffers to hold rtas error log
>> depending on whether extended log is provided or not. This makes bit
>> difficult to identify which buffer has valid data that needs to logged
>> later in irq work. Simplify this using single buffer, one per paca, and
>> copy rtas log to it irrespective of whether extended log is provided or
>> not. Allocate this buffer below RMA region so that it can be accessed
>> in real mode mce handler.
>>
>> Fixes: b96672dd840f ("powerpc: Machine check interrupt is a non-maskable 
>> interrupt")
>> Cc: sta...@vger.kernel.org
>> Signed-off-by: Mahesh Salgaonkar 
> 
> I think this looks reasonable. It doesn't fix that commit so much as
> fixes the problem that's apparent after it's applied. I don't know if
> we should backport this to a wider set of stable kernels? Aside from
> that,

Since the commit b96672dd840f went into 4.13, I think it is good if we
backport this to V4.14 and above stable kernels.

Thanks,
-Mahesh

> 
> Reviewed-by: Nicholas Piggin 
> 
> Thanks,
> Nick
> 
>> ---
>>  arch/powerpc/include/asm/paca.h|3 ++
>>  arch/powerpc/platforms/pseries/ras.c   |   47 
>> ++--
>>  arch/powerpc/platforms/pseries/setup.c |   16 +++
>>  3 files changed, 51 insertions(+), 15 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/paca.h 
>> b/arch/powerpc/include/asm/paca.h
>> index 3f109a3e3edb..b441fef53077 100644
>> --- a/arch/powerpc/include/asm/paca.h
>> +++ b/arch/powerpc/include/asm/paca.h
>> @@ -251,6 +251,9 @@ struct paca_struct {
>>  void *rfi_flush_fallback_area;
>>  u64 l1d_flush_size;
>>  #endif
>> +#ifdef CONFIG_PPC_PSERIES
>> +u8 *mce_data_buf;   /* buffer to hold per cpu rtas errlog */
>> +#endif /* CONFIG_PPC_PSERIES */
>>  } cacheline_aligned;
>>  
>>  extern void copy_mm_to_paca(struct mm_struct *mm);
>> diff --git a/arch/powerpc/platforms/pseries/ras.c 
>> b/arch/powerpc/platforms/pseries/ras.c
>> index ef104144d4bc..14a46b07ab2f 100644
>> --- a/arch/powerpc/platforms/pseries/ras.c
>> +++ b/arch/powerpc/platforms/pseries/ras.c
>> @@ -22,6 +22,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  
>>  #include 
>>  #include 
>> @@ -32,11 +33,13 @@
>>  static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
>>  static DEFINE_SPINLOCK(ras_log_buf_lock);
>>  
>> -static char global_mce_data_buf[RTAS_ERROR_LOG_MAX];
>> -static DEFINE_PER_CPU(__u64, mce_data_buf);
>> -
>>  static int ras_check_exception_token;
>>  
>> +static void mce_process_errlog_event(struct irq_work *work);
>> +static struct irq_work mce_errlog_process_work = {
>> +.func = mce_process_errlog_event,
>> +};
>> +
>>  #define EPOW_SENSOR_TOKEN   9
>>  #define EPOW_SENSOR_INDEX   0
>>  
>> @@ -330,16 +333,20 @@ static irqreturn_t ras_error_interrupt(int irq, void 
>> *dev_id)
>>  A) >= 0x7000) && ((A) < 0x7ff0)) || \
>>  (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16
>>  
>> +static inline struct rtas_error_log *fwnmi_get_errlog(void)
>> +{
>> +return (struct rtas_error_log *)local_paca->mce_data_buf;
>> +}
>> +
>>  /*
>>   * Get the error information for errors coming through the
>>   * FWNMI vectors.  The pt_regs' r3 will be updated to reflect
>>   * the actual r3 if possible, and a ptr to the error log entry
>>   * will be returned if found.
>>   *
>> - * If the RTAS error is not of the extended type, then we put it in a per
>> - * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf.
>> + * Use one buffer mce_data_buf per cpu to store RTAS error.
>>   *
>> - * The global_mce_data_buf does not have any locks or protection around it,
>> + * The mce_data_buf does not have any locks or protection around it,
>>   * if a second machine check comes in,

Re: [PATCH] QE GPIO: Add qe_gpio_set_multiple

2018-07-03 Thread Joakim Tjernlund
On Tue, 2018-06-26 at 23:41 +1000, Michael Ellerman wrote:
> 
> Joakim Tjernlund  writes:
> > On Thu, 2018-06-21 at 02:38 +, Qiang Zhao wrote:
> > > On 06/19/2018 09:22 AM, Joakim Tjernlund wrote:
> > > -Original Message-
> > > From: Linuxppc-dev 
> > > [mailto:linuxppc-dev-bounces+qiang.zhao=nxp@lists.ozlabs.org] On 
> > > Behalf Of Joakim Tjernlund
> > > Sent: 2018年6月20日 0:22
> > > To: York Sun ; linuxppc-dev 
> > > 
> > > Subject: [PATCH] QE GPIO: Add qe_gpio_set_multiple
> > > 
> > > This cousin to gpio-mpc8xxx was lacking a multiple pins method, add one.
> > > 
> > > Signed-off-by: Joakim Tjernlund 
> > > ---
> > >  drivers/soc/fsl/qe/gpio.c | 28 
> > >  1 file changed, 28 insertions(+)
> 
> ...
> > >  static int qe_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio)  {
> > > struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); @@ -298,6 
> > > +325,7 @@ static int __init qe_add_gpiochips(void)
> > > gc->direction_output = qe_gpio_dir_out;
> > > gc->get = qe_gpio_get;
> > > gc->set = qe_gpio_set;
> > > +   gc->set_multiple = qe_gpio_set_multiple;
> > > 
> > > ret = of_mm_gpiochip_add_data(np, mm_gc, qe_gc);
> > > if (ret)
> > > 
> > > Reviewed-by: Qiang Zhao 
> > > 
> > 
> > Who picks up this patch ? Is it queued somewhere already?
> 
> Not me.

York? You seem to be the only one left.

 Jocke

Re: CONFIG_LD_DEAD_CODE_DATA_ELIMINATION: Section mismatch in reference from the variable via_pmu_driver to the function .init.text:pmu_init()

2018-07-03 Thread Christophe LEROY




Le 03/07/2018 à 11:40, Mathieu Malaterre a écrit :

Hi Nick,

Would you consider this a bug:


Looks more like a feature ...

In /drivers/macintosh/adb.c you have, and it's the only place 
via_pmu_driver is used.


#if defined(CONFIG_ADB_PMU) || defined(CONFIG_ADB_PMU68K)
&via_pmu_driver,
#endif

Is one of those defined in your .config ? If not then via_pmu_driver 
gets eliminated hence nothing to warn on.



Christophe




$ touch drivers/macintosh/via-pmu.c
$ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=n make ARCH=powerpc
CROSS_COMPILE=powerpc-linux-gnu-
...
   LD  vmlinux.o
   MODPOST vmlinux.o
WARNING: vmlinux.o(.data+0x216018): Section mismatch in reference from
the variable via_pmu_driver to the function .init.text:pmu_init()
The variable via_pmu_driver references
the function __init pmu_init()
If the reference is valid then annotate the
variable with __init* or __refdata (see linux/init.h) or name the variable:
*_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console

While:

$ touch drivers/macintosh/via-pmu.c
$ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y make ARCH=powerpc
CROSS_COMPILE=powerpc-linux-gnu-
...
   AR  init/built-in.a
   AR  built-in.a
   LD  vmlinux.o
   MODPOST vmlinux.o
   KSYM.tmp_kallsyms1.o
   KSYM.tmp_kallsyms2.o
   LD  vmlinux
   SORTEX  vmlinux
   SYSMAP  System.map
...

Thanks for comment



Re: [PATCH v3 2/2] powernv/cpuidle: Use parsed device tree values for cpuidle_init

2018-07-03 Thread Gautham R Shenoy
Hi Akshay,

On Tue, Jul 03, 2018 at 02:50:56PM +0530, Akshay Adiga wrote:
> Export pnv_idle_states and nr_pnv_idle_states so that its accessible to
> cpuidle driver. Use properties from pnv_idle_states structure for powernv
> cpuidle_init.
> 
> Signed-off-by: Akshay Adiga 
> Reviewed-by: Nicholas Piggin 
> ---

[..snip..]

> 
>   /*
>* For nap and fastsleep, use default target_residency
>* values if f/w does not expose it.
>*/

This comment can also go, since we are no longer hard-coding the
residency values in the kernel.

Otherwise looks good to me.

Reviewed-by: Gautham R. Shenoy 


--
Thanks and Regards
gautham.



Re: [PATCH v3 1/2] powernv/cpuidle: Parse dt idle properties into global structure

2018-07-03 Thread Gautham R Shenoy
Hi Akshay,

On Tue, Jul 03, 2018 at 02:50:55PM +0530, Akshay Adiga wrote:
> Device-tree parsing happens twice, once while deciding idle state to be
> used for hotplug and once during cpuidle init. Hence, parsing the device
> tree and caching it will reduce code duplication. Parsing code has been
> moved to pnv_parse_cpuidle_dt() from pnv_probe_idle_states(). In addition
> to the properties in the device tree the number of available states is
> also required.
> 
> Signed-off-by: Akshay Adiga 
> Reviewed-by: Nicholas Piggin 

Looks good.
Reviewed-by: Gautham R. Shenoy 



CONFIG_LD_DEAD_CODE_DATA_ELIMINATION: Section mismatch in reference from the variable via_pmu_driver to the function .init.text:pmu_init()

2018-07-03 Thread Mathieu Malaterre
Hi Nick,

Would you consider this a bug:

$ touch drivers/macintosh/via-pmu.c
$ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=n make ARCH=powerpc
CROSS_COMPILE=powerpc-linux-gnu-
...
  LD  vmlinux.o
  MODPOST vmlinux.o
WARNING: vmlinux.o(.data+0x216018): Section mismatch in reference from
the variable via_pmu_driver to the function .init.text:pmu_init()
The variable via_pmu_driver references
the function __init pmu_init()
If the reference is valid then annotate the
variable with __init* or __refdata (see linux/init.h) or name the variable:
*_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console

While:

$ touch drivers/macintosh/via-pmu.c
$ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y make ARCH=powerpc
CROSS_COMPILE=powerpc-linux-gnu-
...
  AR  init/built-in.a
  AR  built-in.a
  LD  vmlinux.o
  MODPOST vmlinux.o
  KSYM.tmp_kallsyms1.o
  KSYM.tmp_kallsyms2.o
  LD  vmlinux
  SORTEX  vmlinux
  SYSMAP  System.map
...

Thanks for comment


Re: [PATCHv3 2/4] drivers/base: utilize device tree info to shutdown devices

2018-07-03 Thread Pingfan Liu
On Tue, Jul 3, 2018 at 3:51 PM Lukas Wunner  wrote:
>
> On Tue, Jul 03, 2018 at 02:50:40PM +0800, Pingfan Liu wrote:
> > commit 52cdbdd49853 ("driver core: correct device's shutdown order")
> > places an assumption of supplier<-consumer order on the process of probe.
> > But it turns out to break down the parent <- child order in some scene.
> > E.g in pci, a bridge is enabled by pci core, and behind it, the devices
> > have been probed. Then comes the bridge's module, which enables extra
> > feature(such as hotplug) on this bridge. This will break the
> > parent<-children order and cause failure when "kexec -e" in some scenario.
> >
> > The detailed description of the scenario:
> > An IBM Power9 machine on which, two drivers portdrv_pci and shpchp(a mod)
> > match the PCI_CLASS_BRIDGE_PCI, but neither of them success to probe due
> > to some issue. For this case, the bridge is moved after its children in
> > devices_kset. Then, when "kexec -e", a ata-disk behind the bridge can not
> > write back buffer in flight due to the former shutdown of the bridge which
> > clears the BusMaster bit.
>
> If you revert commit cc27b735ad3a ("PCI/portdrv: Turn off PCIe services
> during shutdown"), does the issue go away?

Yes, it is gone.


[PATCH v3 2/2] powernv/cpuidle: Use parsed device tree values for cpuidle_init

2018-07-03 Thread Akshay Adiga
Export pnv_idle_states and nr_pnv_idle_states so that its accessible to
cpuidle driver. Use properties from pnv_idle_states structure for powernv
cpuidle_init.

Signed-off-by: Akshay Adiga 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/cpuidle.h |   2 +
 drivers/cpuidle/cpuidle-powernv.c  | 154 +
 2 files changed, 28 insertions(+), 128 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index 574b0ce1d671..43e5f31fe64d 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -90,6 +90,8 @@ struct pnv_idle_states_t {
bool valid;
 };
 
+extern struct pnv_idle_states_t *pnv_idle_states;
+extern int nr_pnv_idle_states;
 extern u32 pnv_fastsleep_workaround_at_entry[];
 extern u32 pnv_fastsleep_workaround_at_exit[];
 
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index d29e4f041efe..208d57c77305 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -242,6 +242,7 @@ static inline void add_powernv_state(int index, const char 
*name,
powernv_states[index].target_residency = target_residency;
powernv_states[index].exit_latency = exit_latency;
powernv_states[index].enter = idle_fn;
+   /* For power8 and below psscr_* will be 0 */
stop_psscr_table[index].val = psscr_val;
stop_psscr_table[index].mask = psscr_mask;
 }
@@ -263,186 +264,84 @@ static inline int validate_dt_prop_sizes(const char 
*prop1, int prop1_len,
 extern u32 pnv_get_supported_cpuidle_states(void);
 static int powernv_add_idle_states(void)
 {
-   struct device_node *power_mgt;
int nr_idle_states = 1; /* Snooze */
-   int dt_idle_states, count;
-   u32 latency_ns[CPUIDLE_STATE_MAX];
-   u32 residency_ns[CPUIDLE_STATE_MAX];
-   u32 flags[CPUIDLE_STATE_MAX];
-   u64 psscr_val[CPUIDLE_STATE_MAX];
-   u64 psscr_mask[CPUIDLE_STATE_MAX];
-   const char *names[CPUIDLE_STATE_MAX];
+   int dt_idle_states;
u32 has_stop_states = 0;
-   int i, rc;
+   int i;
u32 supported_flags = pnv_get_supported_cpuidle_states();
 
 
/* Currently we have snooze statically defined */
-
-   power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
-   if (!power_mgt) {
-   pr_warn("opal: PowerMgmt Node not found\n");
+   if (nr_pnv_idle_states <= 0) {
+   pr_warn("cpuidle-powernv : Only Snooze is available\n");
goto out;
}
 
-   /* Read values of any property to determine the num of idle states */
-   dt_idle_states = of_property_count_u32_elems(power_mgt, 
"ibm,cpu-idle-state-flags");
-   if (dt_idle_states < 0) {
-   pr_warn("cpuidle-powernv: no idle states found in the DT\n");
-   goto out;
-   }
-
-   count = of_property_count_u32_elems(power_mgt,
-   "ibm,cpu-idle-state-latencies-ns");
-
-   if (validate_dt_prop_sizes("ibm,cpu-idle-state-flags", dt_idle_states,
-  "ibm,cpu-idle-state-latencies-ns",
-  count) != 0)
-   goto out;
-
-   count = of_property_count_strings(power_mgt,
- "ibm,cpu-idle-state-names");
-   if (validate_dt_prop_sizes("ibm,cpu-idle-state-flags", dt_idle_states,
-  "ibm,cpu-idle-state-names",
-  count) != 0)
-   goto out;
+   /* TODO: Count only states which are eligible for cpuidle */
+   dt_idle_states = nr_pnv_idle_states;
 
/*
 * Since snooze is used as first idle state, max idle states allowed is
 * CPUIDLE_STATE_MAX -1
 */
-   if (dt_idle_states > CPUIDLE_STATE_MAX - 1) {
+   if (nr_pnv_idle_states > CPUIDLE_STATE_MAX - 1) {
pr_warn("cpuidle-powernv: discovered idle states more than 
allowed");
dt_idle_states = CPUIDLE_STATE_MAX - 1;
}
 
-   if (of_property_read_u32_array(power_mgt,
-   "ibm,cpu-idle-state-flags", flags, dt_idle_states)) {
-   pr_warn("cpuidle-powernv : missing ibm,cpu-idle-state-flags in 
DT\n");
-   goto out;
-   }
-
-   if (of_property_read_u32_array(power_mgt,
-   "ibm,cpu-idle-state-latencies-ns", latency_ns,
-   dt_idle_states)) {
-   pr_warn("cpuidle-powernv: missing 
ibm,cpu-idle-state-latencies-ns in DT\n");
-   goto out;
-   }
-   if (of_property_read_string_array(power_mgt,
-   "ibm,cpu-idle-state-names", names, dt_idle_states) < 0) {
-   pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-names in 
DT\n");
-   goto out;
-   }
-
/*
 * If the idle states use stop instruction, probe for psscr values
 * an

[PATCH v3 1/2] powernv/cpuidle: Parse dt idle properties into global structure

2018-07-03 Thread Akshay Adiga
Device-tree parsing happens twice, once while deciding idle state to be
used for hotplug and once during cpuidle init. Hence, parsing the device
tree and caching it will reduce code duplication. Parsing code has been
moved to pnv_parse_cpuidle_dt() from pnv_probe_idle_states(). In addition
to the properties in the device tree the number of available states is
also required.

Signed-off-by: Akshay Adiga 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/cpuidle.h|  11 ++
 arch/powerpc/platforms/powernv/idle.c | 216 --
 2 files changed, 149 insertions(+), 78 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index e210a83eb196..574b0ce1d671 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -79,6 +79,17 @@ struct stop_sprs {
u64 mmcra;
 };
 
+#define PNV_IDLE_NAME_LEN16
+struct pnv_idle_states_t {
+   char name[PNV_IDLE_NAME_LEN];
+   u32 latency_ns;
+   u32 residency_ns;
+   u64 psscr_val;
+   u64 psscr_mask;
+   u32 flags;
+   bool valid;
+};
+
 extern u32 pnv_fastsleep_workaround_at_entry[];
 extern u32 pnv_fastsleep_workaround_at_exit[];
 
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 1c5d0675b43c..7cf71b3e03a1 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -36,6 +36,8 @@
 #define P9_STOP_SPR_PSSCR  855
 
 static u32 supported_cpuidle_states;
+struct pnv_idle_states_t *pnv_idle_states;
+int nr_pnv_idle_states;
 
 /*
  * The default stop state that will be used by ppc_md.power_save
@@ -622,48 +624,10 @@ int validate_psscr_val_mask(u64 *psscr_val, u64 
*psscr_mask, u32 flags)
  * @dt_idle_states: Number of idle state entries
  * Returns 0 on success
  */
-static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
-   int dt_idle_states)
+static int __init pnv_power9_idle_init(void)
 {
-   u64 *psscr_val = NULL;
-   u64 *psscr_mask = NULL;
-   u32 *residency_ns = NULL;
u64 max_residency_ns = 0;
-   int rc = 0, i;
-
-   psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL);
-   psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL);
-   residency_ns = kcalloc(dt_idle_states, sizeof(*residency_ns),
-  GFP_KERNEL);
-
-   if (!psscr_val || !psscr_mask || !residency_ns) {
-   rc = -1;
-   goto out;
-   }
-
-   if (of_property_read_u64_array(np,
-   "ibm,cpu-idle-state-psscr",
-   psscr_val, dt_idle_states)) {
-   pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr in 
DT\n");
-   rc = -1;
-   goto out;
-   }
-
-   if (of_property_read_u64_array(np,
-  "ibm,cpu-idle-state-psscr-mask",
-  psscr_mask, dt_idle_states)) {
-   pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr-mask 
in DT\n");
-   rc = -1;
-   goto out;
-   }
-
-   if (of_property_read_u32_array(np,
-  "ibm,cpu-idle-state-residency-ns",
-   residency_ns, dt_idle_states)) {
-   pr_warn("cpuidle-powernv: missing 
ibm,cpu-idle-state-residency-ns in DT\n");
-   rc = -1;
-   goto out;
-   }
+   int i;
 
/*
 * Set pnv_first_deep_stop_state, pnv_deepest_stop_psscr_{val,mask},
@@ -679,33 +643,36 @@ static int __init pnv_power9_idle_init(struct device_node 
*np, u32 *flags,
 * the shallowest (OPAL_PM_STOP_INST_FAST) loss-less stop state.
 */
pnv_first_deep_stop_state = MAX_STOP_STATE;
-   for (i = 0; i < dt_idle_states; i++) {
+   for (i = 0; i < nr_pnv_idle_states; i++) {
int err;
-   u64 psscr_rl = psscr_val[i] & PSSCR_RL_MASK;
+   struct pnv_idle_states_t *state = &pnv_idle_states[i];
+   u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK;
 
-   if ((flags[i] & OPAL_PM_LOSE_FULL_CONTEXT) &&
-(pnv_first_deep_stop_state > psscr_rl))
+   if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
+   pnv_first_deep_stop_state > psscr_rl)
pnv_first_deep_stop_state = psscr_rl;
 
-   err = validate_psscr_val_mask(&psscr_val[i], &psscr_mask[i],
- flags[i]);
+   err = validate_psscr_val_mask(&state->psscr_val,
+ &state->psscr_mask,
+ state->flags);
if (err) {
-   report_invalid_psscr_val(psscr_val[i], err);
+   state->valid = false;

[PATCH v3 0/2] powernv/cpuidle Device-tree parsing cleanup

2018-07-03 Thread Akshay Adiga


Device-tree parsed multiple time in powernv cpuidle and powernv
hotplug code. 

First to identify supported flags. Second time, to identify deepest_state
and first deep state. Third time, during cpuidle init to find the available
idle states. Any change in device-tree format will lead to make changes in
these 3 places. Errors in device-tree can be handled in a better manner.

This series adds code to parse device tree once and save in global structure.

Changes from v2 :
 - Fix build error (moved a hunk from patch 1 to patch 2)
Changes from v1 :
 - fold first 2 patches into 1
 - rename pm_ctrl_reg_* as psscr_*
 - added comment stating removal of pmicr parsing code
 - removed parsing code for pmicr
 - add member valid in pnv_idle_states_t to indicate if the psscr-mask/val
are valid combination,
 - Change function description of pnv_parse_cpuidle_dt
 - Added error handling code.


Akshay Adiga (2):
  powernv/cpuidle: Parse dt idle properties into global structure
  powernv/cpuidle: Use parsed device tree values for cpuidle_init

 arch/powerpc/include/asm/cpuidle.h|  13 ++
 arch/powerpc/platforms/powernv/idle.c | 216 --
 drivers/cpuidle/cpuidle-powernv.c | 154 --
 3 files changed, 177 insertions(+), 206 deletions(-)

-- 
2.18.0.rc2.85.g1fb9df7



Oops in sock_wfree

2018-07-03 Thread Christophe LEROY

Hi,

I was having strange unexpected memory corruption, therefore I activated 
DEBUG_PAGEALLOC and I now end up with the following Oops, which tends to 
make me think we have somewhere in the network code a use-after-free 
bug. I saw a few of such bugs have been fixed for IPv4 and IPv6. Maybe 
we have one remaining for Unix sockets ? How can I spot it off and fix it ?


[   39.645644] Unable to handle kernel paging request for data at 
address 0xc2235010

[   39.652860] Faulting instruction address: 0xc0334d5c
[   39.657783] Oops: Kernel access of bad area, sig: 11 [#1]
[   39.663085] BE PREEMPT DEBUG_PAGEALLOC CMPC885
[   39.667488] SAF3000 DIE NOTIFICATION
[   39.671050] CPU: 0 PID: 269 Comm: in:imuxsock Not tainted 
4.14.52-00025-g5bada429cf #22

[   39.679633] task: c623e100 task.stack: c651e000
[   39.684106] NIP:  c0334d5c LR: c043602c CTR: c0435fb8
[   39.689103] REGS: c651fc00 TRAP: 0300   Not tainted 
(4.14.52-00025-g5bada429cf)

[   39.697087] MSR:  9032   CR: 28002822 XER: 2000
[   39.703720] DAR: c2235010 DSISR: c000
[   39.703720] GPR00: c043602c c651fcb0 c623e100 c619eec0 c642c540 
0008 0018 c651fd4c
[   39.703720] GPR08: c0435fb8 02b0 c068d830 0004 28004822 
100d4208  77990848
[   39.703720] GPR16: 0ff58398 778eb4b0 1039f050 1039f0a8 1005ddbc 
0ff5a7bc  
[   39.703720] GPR24: 0072 c5011650 c651feb8 0072 c619eec0 
0040 c2234fc0 c619eec0

[   39.741401] NIP [c0334d5c] sock_wfree+0x18/0xa4
[   39.745843] LR [c043602c] unix_destruct_scm+0x74/0x88
[   39.750786] Call Trace:
[   39.753253] [c651fcb0] [c006348c] ns_to_timeval+0x4c/0x7c (unreliable)
[   39.759690] [c651fcc0] [c043602c] unix_destruct_scm+0x74/0x88
[   39.765385] [c651fcf0] [c033a10c] skb_release_head_state+0x8c/0x110
[   39.771571] [c651fd00] [c033a3c4] skb_release_all+0x18/0x50
[   39.777078] [c651fd10] [c033a7cc] consume_skb+0x38/0xec
[   39.782255] [c651fd20] [c0342d7c] skb_free_datagram+0x1c/0x68
[   39.787922] [c651fd30] [c0435c8c] unix_dgram_recvmsg+0x19c/0x4ac
[   39.793863] [c651fdb0] [c0331370] ___sys_recvmsg+0x98/0x138
[   39.799371] [c651feb0] [c0333280] __sys_recvmsg+0x40/0x84
[   39.804707] [c651ff10] [c0333680] SyS_socketcall+0xb8/0x1d4
[   39.810220] [c651ff40] [c000d1ac] ret_from_syscall+0x0/0x38
[   39.815673] Instruction dump:
[   39.818612] 41beffac 4b58 3883 4ba0 3881 4b98 
7c0802a6 9421fff0
[   39.826267] bfc10008 90010014 83c30010 812300a8 <815e0050> 3bfe00e0 
71480200 4082003c

[   39.834113] ---[ end trace 8affde0490d7e25e ]---

Thanks
Christophe


Re: [PATCH v9 0/6] add support for relative references in special sections

2018-07-03 Thread Ingo Molnar


* Ard Biesheuvel  wrote:

> On 27 June 2018 at 17:15, Will Deacon  wrote:
> > Hi Ard,
> >
> > On Tue, Jun 26, 2018 at 08:27:55PM +0200, Ard Biesheuvel wrote:
> >> This adds support for emitting special sections such as initcall arrays,
> >> PCI fixups and tracepoints as relative references rather than absolute
> >> references. This reduces the size by 50% on 64-bit architectures, but
> >> more importantly, it removes the need for carrying relocation metadata
> >> for these sections in relocatable kernels (e.g., for KASLR) that needs
> >> to be fixed up at boot time. On arm64, this reduces the vmlinux footprint
> >> of such a reference by 8x (8 byte absolute reference + 24 byte RELA entry
> >> vs 4 byte relative reference)
> >>
> >> Patch #3 was sent out before as a single patch. This series supersedes
> >> the previous submission. This version makes relative ksymtab entries
> >> dependent on the new Kconfig symbol HAVE_ARCH_PREL32_RELOCATIONS rather
> >> than trying to infer from kbuild test robot replies for which architectures
> >> it should be blacklisted.
> >>
> >> Patch #1 introduces the new Kconfig symbol HAVE_ARCH_PREL32_RELOCATIONS,
> >> and sets it for the main architectures that are expected to benefit the
> >> most from this feature, i.e., 64-bit architectures or ones that use
> >> runtime relocations.
> >>
> >> Patch #2 add support for #define'ing __DISABLE_EXPORTS to get rid of
> >> ksymtab/kcrctab sections in decompressor and EFI stub objects when
> >> rebuilding existing C files to run in a different context.
> >
> > I had a small question on patch 3, but it's really for my understanding.
> > So, for patches 1-3:
> >
> > Reviewed-by: Will Deacon 
> >
> 
> Thanks all.
> 
> Thomas, Ingo,
> 
> Except for the below tweak against patch #3 for powerpc, which may
> apparently get confused by an input section called .discard without
> any suffixes, this series is good to go, but requires your ack to
> proceed, so I would like to ask you to share your comments and/or
> objections. Also, any suggestions or recommendations regarding the
> route these patches should take are highly appreciated.

LGTM:

Acked-by: Ingo Molnar 

Regarding route - I suspect -mm would be good, or any other tree that does a 
lot 
of cross-arch testing?

Thanks,

Ingo


Re: 83a092cf95f28: powerpc: Link warning for orphan sections

2018-07-03 Thread Mathieu Malaterre
On Tue, Jul 3, 2018 at 9:23 AM Nicholas Piggin  wrote:
>
> On Tue, 3 Jul 2018 09:03:46 +0200
> Mathieu Malaterre  wrote:
>
> > Hi Nick,
> >
> > I am building my kernel (ppc32) with both
> > CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y and CONFIG_UBSAN=y. This leads
> > to ~316428 warnings such as:
> >
> > + powerpc-linux-gnu-ld -EB -m elf32ppc -Bstatic --orphan-handling=warn
> > --build-id --gc-sections -X -o .tmp_vmlinux1 -T
> > ./arch/powerpc/kernel/vmlinux.lds --who
> > le-archive built-in.a --no-whole-archive --start-group lib/lib.a --end-group
> > powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data393'
> > from `init/main.o' being placed in section `.data..Lubsan_data393'.
> > powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data394'
> > from `init/main.o' being placed in section `.data..Lubsan_data394'.
> > powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data395'
> > from `init/main.o' being placed in section `.data..Lubsan_data395'.
> > powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data396'
> > from `init/main.o' being placed in section `.data..Lubsan_data396'.
> > ...
> >
> > What would you recommend to reduce the number of warnings produced ? I
> > tried `--warn-once` but that only affect undefined symbols.
>
> I'm not sure if the linker can be quietened. You could try putting
> *(.data..Lubsan_data*) into the .data section in
> powerpc/kernel/vmlinux.lds.S

Nice ! Will submit a patch. Thanks

> Thanks,
> Nick


Re: [PATCH v2 2/3] powerpc/fsl: Add barrier_nospec implementation for NXP PowerPC Book3E

2018-07-03 Thread Michael Ellerman
Hi Diana,

A few comments below ...

Diana Craciun  writes:
> Implement the barrier_nospec as a isync;sync instruction sequence.
> The implementation uses the infrastructure built for BOOK3S 64.

Do you have any details on why that sequence functions as an effective
barrier on your chips?

In a lot of places we have eg:

 +#if defined(CONFIG_PPC_BOOK3S_64) || defined(CONFIG_PPC_FSL_BOOK3E)

Can you please add a Kconfig symbol to capture that. eg.

config PPC_NOSPEC
bool
default y
depends on PPC_BOOK3S_64 || PPC_FSL_BOOK3E


And then use that everywhere.

> diff --git a/arch/powerpc/include/asm/barrier.h 
> b/arch/powerpc/include/asm/barrier.h
> index f67b3f6..405d572 100644
> --- a/arch/powerpc/include/asm/barrier.h
> +++ b/arch/powerpc/include/asm/barrier.h
> @@ -86,6 +86,16 @@ do {   
> \
>  // This also acts as a compiler barrier due to the memory clobber.
>  #define barrier_nospec() asm (stringify_in_c(barrier_nospec_asm) ::: 
> "memory")
>  
> +#elif defined(CONFIG_PPC_FSL_BOOK3E)
> +/*
> + * Prevent the execution of subsequent instructions speculatively using a
> + * isync;sync instruction sequence.
> + */
> +#define barrier_nospec_asm NOSPEC_BARRIER_FIXUP_SECTION; nop; nop
> +
> +// This also acts as a compiler barrier due to the memory clobber.
> +#define barrier_nospec() asm (stringify_in_c(barrier_nospec_asm) ::: 
> "memory")
> +
>  #else /* !CONFIG_PPC_BOOK3S_64 */
>  #define barrier_nospec_asm
>  #define barrier_nospec()

If we have CONFIG_PPC_NOSPEC this can be done something like:

#ifdef CONFIG_PPC_BOOK3S_64
#define NOSPEC_BARRIER_SLOT nop
#elif defined(CONFIG_PPC_FSL_BOOK3E)
#define NOSPEC_BARRIER_SLOT nop; nop
#endif /* CONFIG_PPC_BOOK3S_64 */

#ifdef CONFIG_PPC_NOSPEC
/*
 * Prevent execution of subsequent instructions until preceding branches have
 * been fully resolved and are no longer executing speculatively.
 */
#define barrier_nospec_asm NOSPEC_BARRIER_FIXUP_SECTION; NOSPEC_BARRIER_SLOT

// This also acts as a compiler barrier due to the memory clobber.
#define barrier_nospec() asm (stringify_in_c(barrier_nospec_asm) ::: "memory")
#else
#define barrier_nospec_asm
#define barrier_nospec()
#endif /* CONFIG_PPC_NOSPEC */


> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
> index 2b4c40b2..d9dee43 100644
> --- a/arch/powerpc/kernel/Makefile
> +++ b/arch/powerpc/kernel/Makefile
> @@ -76,7 +76,7 @@ endif
>  obj64-$(CONFIG_HIBERNATION)  += swsusp_asm64.o
>  obj-$(CONFIG_MODULES)+= module.o module_$(BITS).o
>  obj-$(CONFIG_44x)+= cpu_setup_44x.o
> -obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o
> +obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o security.o
>  obj-$(CONFIG_PPC_DOORBELL)   += dbell.o
>  obj-$(CONFIG_JUMP_LABEL) += jump_label.o

Can we instead do:

obj-$(CONFIG_PPC_NOSPEC)+= security.o

> diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
> index c55e102..797c975 100644
> --- a/arch/powerpc/kernel/security.c
> +++ b/arch/powerpc/kernel/security.c
> @@ -13,7 +13,9 @@
>  #include 
>  
>  
> +#ifdef CONFIG_PPC_BOOK3S_64
>  unsigned long powerpc_security_features __read_mostly = SEC_FTR_DEFAULT;
> +#endif /* CONFIG_PPC_BOOK3S_64 */

Why are you making that book3s specific?

By doing that you then can't use the existing versions of
setup_barrier_nospec() and cpu_show_spectre_v1/v2().

> @@ -24,6 +26,7 @@ static void enable_barrier_nospec(bool enable)
>   do_barrier_nospec_fixups(enable);
>  }
>  
> +#ifdef CONFIG_PPC_BOOK3S_64
>  void setup_barrier_nospec(void)
>  {
>   bool enable;
> @@ -46,6 +49,15 @@ void setup_barrier_nospec(void)
>   if (!no_nospec)
>   enable_barrier_nospec(enable);
>  }
> +#endif /* CONFIG_PPC_BOOK3S_64 */
> +
> +#ifdef CONFIG_PPC_FSL_BOOK3E
> +void setup_barrier_nospec(void)
> +{
> + if (!no_nospec)
> + enable_barrier_nospec(true);
> +}
> +#endif /* CONFIG_PPC_FSL_BOOK3E */

eg. that is identical to the existing version except for the feature check:

enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
 security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR);


Both those bits are enabled by default, so you shouldn't need to elide
that check.

So basically you should be able to use the existing
setup_barrier_nospec() if you just remove the ifdef around
powerpc_security_features. Or am I missing something?


> diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
> index 7445748..80c1e6e 100644
> --- a/arch/powerpc/kernel/setup_32.c
> +++ b/arch/powerpc/kernel/setup_32.c
> @@ -116,6 +116,11 @@ notrace void __init machine_init(u64 dt_ptr)
>   /* Do some early initialization based on the flat device tree */
>   early_init_devtree(__va(dt_ptr));
>  
> + /* Apply the speculation barrier fixup */
> +#ifdef CONFIG_PPC_FSL_BOOK3E
> + setup_barrier_nospec();
> +#endif

Re: [PATCH v2 3/3] powerpc/fsl: Implement cpu_show_spectre_v1/v2 for NXP PowerPC Book3E

2018-07-03 Thread Michael Ellerman
Michal Suchánek  writes:
> On Tue, 12 Jun 2018 02:59:11 +
> Bharat Bhushan  wrote:
>
>> Hi Diana,
>> 
>> > -Original Message-
>> > From: Diana Craciun [mailto:diana.crac...@nxp.com]
>> > Sent: Monday, June 11, 2018 6:23 PM
>> > To: linuxppc-dev@lists.ozlabs.org
>> > Cc: m...@ellerman.id.au; o...@buserror.net; Leo Li
>> > ; Bharat Bhushan ;
>> > Diana Madalina Craciun 
>> > Subject: [PATCH v2 3/3] powerpc/fsl: Implement
>> > cpu_show_spectre_v1/v2 for NXP PowerPC Book3E  
>> 
>> Please add some description
>
> To me the subject is self-explanatory. It implements a kernel interface
> that was already described elsewhere.
>
> What are you missing here?

It should at least explain why it's reimplementing a function that
already exists for powerpc. ie. Why can't the existing version be used?

cheers


Re: [PATCH v2 1/3] powerpc/fsl: Disable the speculation barrier from the command line

2018-07-03 Thread Michael Ellerman
Diana Craciun  writes:

> The speculation barrier can be disabled from the command line
> with the parameter: "nospectre_v1".

Can you please send another patch adding it to 
Documentation/admin-guide/kernel-parameters.txt

You should mark it as "PPC" only for now.

cheers


Re: 83a092cf95f28: powerpc: Link warning for orphan sections

2018-07-03 Thread Nicholas Piggin
On Tue, 3 Jul 2018 09:03:46 +0200
Mathieu Malaterre  wrote:

> Hi Nick,
> 
> I am building my kernel (ppc32) with both
> CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y and CONFIG_UBSAN=y. This leads
> to ~316428 warnings such as:
> 
> + powerpc-linux-gnu-ld -EB -m elf32ppc -Bstatic --orphan-handling=warn
> --build-id --gc-sections -X -o .tmp_vmlinux1 -T
> ./arch/powerpc/kernel/vmlinux.lds --who
> le-archive built-in.a --no-whole-archive --start-group lib/lib.a --end-group
> powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data393'
> from `init/main.o' being placed in section `.data..Lubsan_data393'.
> powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data394'
> from `init/main.o' being placed in section `.data..Lubsan_data394'.
> powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data395'
> from `init/main.o' being placed in section `.data..Lubsan_data395'.
> powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data396'
> from `init/main.o' being placed in section `.data..Lubsan_data396'.
> ...
> 
> What would you recommend to reduce the number of warnings produced ? I
> tried `--warn-once` but that only affect undefined symbols.

I'm not sure if the linker can be quietened. You could try putting
*(.data..Lubsan_data*) into the .data section in
powerpc/kernel/vmlinux.lds.S

Thanks,
Nick


Re: [PATCH v5 5/7] powerpc/pseries: flush SLB contents on SLB MCE errors.

2018-07-03 Thread Mahesh Jagannath Salgaonkar
On 07/03/2018 03:38 AM, Nicholas Piggin wrote:
> On Mon, 02 Jul 2018 11:17:06 +0530
> Mahesh J Salgaonkar  wrote:
> 
>> From: Mahesh Salgaonkar 
>>
>> On pseries, as of today system crashes if we get a machine check
>> exceptions due to SLB errors. These are soft errors and can be fixed by
>> flushing the SLBs so the kernel can continue to function instead of
>> system crash. We do this in real mode before turning on MMU. Otherwise
>> we would run into nested machine checks. This patch now fetches the
>> rtas error log in real mode and flushes the SLBs on SLB errors.
>>
>> Signed-off-by: Mahesh Salgaonkar 
>> ---
>>  arch/powerpc/include/asm/book3s/64/mmu-hash.h |1 
>>  arch/powerpc/include/asm/machdep.h|1 
>>  arch/powerpc/kernel/exceptions-64s.S  |   42 +
>>  arch/powerpc/kernel/mce.c |   16 +++-
>>  arch/powerpc/mm/slb.c |6 +++
>>  arch/powerpc/platforms/powernv/opal.c |1 
>>  arch/powerpc/platforms/pseries/pseries.h  |1 
>>  arch/powerpc/platforms/pseries/ras.c  |   51 
>> +
>>  arch/powerpc/platforms/pseries/setup.c|1 
>>  9 files changed, 116 insertions(+), 4 deletions(-)
>>
> 
> 
>> +TRAMP_REAL_BEGIN(machine_check_pSeries_early)
>> +BEGIN_FTR_SECTION
>> +EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
>> +mr  r10,r1  /* Save r1 */
>> +ld  r1,PACAMCEMERGSP(r13)   /* Use MC emergency stack */
>> +subir1,r1,INT_FRAME_SIZE/* alloc stack frame*/
>> +mfspr   r11,SPRN_SRR0   /* Save SRR0 */
>> +mfspr   r12,SPRN_SRR1   /* Save SRR1 */
>> +EXCEPTION_PROLOG_COMMON_1()
>> +EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
>> +EXCEPTION_PROLOG_COMMON_3(0x200)
>> +addir3,r1,STACK_FRAME_OVERHEAD
>> +BRANCH_LINK_TO_FAR(machine_check_early) /* Function call ABI */
> 
> Is there any reason you can't use the existing
> machine_check_powernv_early code to do all this?

I did think about that :-). But the machine_check_powernv_early code
does bit of extra stuff which isn't required in pseries like touching ME
bit in MSR and lots of checks that are done in
machine_check_handle_early() before going to virtual mode. But on second
look I see that we can bypass all that with HVMODE FTR section. Will
rename machine_check_powernv_early to machine_check_common_early and
reuse it.

> 
>> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
>> index efdd16a79075..221271c96a57 100644
>> --- a/arch/powerpc/kernel/mce.c
>> +++ b/arch/powerpc/kernel/mce.c
>> @@ -488,9 +488,21 @@ long machine_check_early(struct pt_regs *regs)
>>  {
>>  long handled = 0;
>>  
>> -__this_cpu_inc(irq_stat.mce_exceptions);
>> +/*
>> + * For pSeries we count mce when we go into virtual mode machine
>> + * check handler. Hence skip it. Also, We can't access per cpu
>> + * variables in real mode for LPAR.
>> + */
>> +if (early_cpu_has_feature(CPU_FTR_HVMODE))
>> +__this_cpu_inc(irq_stat.mce_exceptions);
>>  
>> -if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
>> +/*
>> + * See if platform is capable of handling machine check.
>> + * Otherwise fallthrough and allow CPU to handle this machine check.
>> + */
>> +if (ppc_md.machine_check_early)
>> +handled = ppc_md.machine_check_early(regs);
>> +else if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
>>  handled = cur_cpu_spec->machine_check_early(regs);
> 
> Would be good to add a powernv ppc_md handler which does the
> cur_cpu_spec->machine_check_early() call now that other platforms are
> calling this code. Because those aren't valid as a fallback call, but
> specific to powernv.
> 
>> diff --git a/arch/powerpc/platforms/powernv/opal.c 
>> b/arch/powerpc/platforms/powernv/opal.c
>> index 48fbb41af5d1..ed548d40a9e1 100644
>> --- a/arch/powerpc/platforms/powernv/opal.c
>> +++ b/arch/powerpc/platforms/powernv/opal.c
>> @@ -417,7 +417,6 @@ static int opal_recover_mce(struct pt_regs *regs,
>>  
>>  if (!(regs->msr & MSR_RI)) {
>>  /* If MSR_RI isn't set, we cannot recover */
>> -pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
> 
> What's the reason for this change?

Err... This is by mistake.. My bad. Thanks for catching this. Will
remove this hunk in next revision. We need a similar print for pSeries
in ras.c.

> 
>>  recovered = 0;
>>  } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
>>  /* Platform corrected itself */
>> diff --git a/arch/powerpc/platforms/pseries/pseries.h 
>> b/arch/powerpc/platforms/pseries/pseries.h
>> index 60db2ee511fb..3611db5dd583 100644
>> --- a/arch/powerpc/platforms/pseries/pseries.h
>> +++ b/arch/powerpc/platforms/pseries/pseries.h
>> @@ -24,6 +24,7 @@ struct pt_regs;
>>  
>>  extern int pSeries_system_reset_exception(struct pt_regs

83a092cf95f28: powerpc: Link warning for orphan sections

2018-07-03 Thread Mathieu Malaterre
Hi Nick,

I am building my kernel (ppc32) with both
CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y and CONFIG_UBSAN=y. This leads
to ~316428 warnings such as:

+ powerpc-linux-gnu-ld -EB -m elf32ppc -Bstatic --orphan-handling=warn
--build-id --gc-sections -X -o .tmp_vmlinux1 -T
./arch/powerpc/kernel/vmlinux.lds --who
le-archive built-in.a --no-whole-archive --start-group lib/lib.a --end-group
powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data393'
from `init/main.o' being placed in section `.data..Lubsan_data393'.
powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data394'
from `init/main.o' being placed in section `.data..Lubsan_data394'.
powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data395'
from `init/main.o' being placed in section `.data..Lubsan_data395'.
powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data396'
from `init/main.o' being placed in section `.data..Lubsan_data396'.
...

What would you recommend to reduce the number of warnings produced ? I
tried `--warn-once` but that only affect undefined symbols.

Thanks


[PATCHv3 4/4] Revert "driver core: correct device's shutdown order"

2018-07-03 Thread Pingfan Liu
This reverts commit 52cdbdd49853dfa856082edb0f4c4c0249d9df07.
Since the device_shutdown() does not rely on the order in devices_kset any
more, reverting that commit safely.

Cc: Greg Kroah-Hartman 
Cc: Rafael J. Wysocki 
Cc: Grygorii Strashko 
Cc: Christoph Hellwig 
Cc: Bjorn Helgaas 
Cc: Dave Young 
Cc: linux-...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Pingfan Liu 
---
 drivers/base/base.h |  1 -
 drivers/base/core.c | 49 -
 drivers/base/dd.c   |  8 
 3 files changed, 58 deletions(-)

diff --git a/drivers/base/base.h b/drivers/base/base.h
index a75c302..5bc9064 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -135,7 +135,6 @@ extern void device_unblock_probing(void);
 
 /* /sys/devices directory */
 extern struct kset *devices_kset;
-extern void devices_kset_move_last(struct device *dev);
 
 #if defined(CONFIG_MODULES) && defined(CONFIG_SYSFS)
 extern void module_add_driver(struct module *mod, struct device_driver *drv);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index db3deb8..570aeee 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1259,52 +1259,6 @@ static DEVICE_ATTR_RO(dev);
 struct kset *devices_kset;
 
 /**
- * devices_kset_move_before - Move device in the devices_kset's list.
- * @deva: Device to move.
- * @devb: Device @deva should come before.
- */
-static void devices_kset_move_before(struct device *deva, struct device *devb)
-{
-   if (!devices_kset)
-   return;
-   pr_debug("devices_kset: Moving %s before %s\n",
-dev_name(deva), dev_name(devb));
-   spin_lock(&devices_kset->list_lock);
-   list_move_tail(&deva->kobj.entry, &devb->kobj.entry);
-   spin_unlock(&devices_kset->list_lock);
-}
-
-/**
- * devices_kset_move_after - Move device in the devices_kset's list.
- * @deva: Device to move
- * @devb: Device @deva should come after.
- */
-static void devices_kset_move_after(struct device *deva, struct device *devb)
-{
-   if (!devices_kset)
-   return;
-   pr_debug("devices_kset: Moving %s after %s\n",
-dev_name(deva), dev_name(devb));
-   spin_lock(&devices_kset->list_lock);
-   list_move(&deva->kobj.entry, &devb->kobj.entry);
-   spin_unlock(&devices_kset->list_lock);
-}
-
-/**
- * devices_kset_move_last - move the device to the end of devices_kset's list.
- * @dev: device to move
- */
-void devices_kset_move_last(struct device *dev)
-{
-   if (!devices_kset)
-   return;
-   pr_debug("devices_kset: Moving %s to end of list\n", dev_name(dev));
-   spin_lock(&devices_kset->list_lock);
-   list_move_tail(&dev->kobj.entry, &devices_kset->list);
-   spin_unlock(&devices_kset->list_lock);
-}
-
-/**
  * device_create_file - create sysfs attribute file for device.
  * @dev: device.
  * @attr: device attribute descriptor.
@@ -2776,15 +2730,12 @@ int device_move(struct device *dev, struct device 
*new_parent,
break;
case DPM_ORDER_DEV_AFTER_PARENT:
device_pm_move_after(dev, new_parent);
-   devices_kset_move_after(dev, new_parent);
break;
case DPM_ORDER_PARENT_BEFORE_DEV:
device_pm_move_before(new_parent, dev);
-   devices_kset_move_before(new_parent, dev);
break;
case DPM_ORDER_DEV_LAST:
device_pm_move_last(dev);
-   devices_kset_move_last(dev);
break;
}
 
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 1435d72..6ebcd65 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -434,14 +434,6 @@ static int really_probe(struct device *dev, struct 
device_driver *drv)
goto probe_failed;
}
 
-   /*
-* Ensure devices are listed in devices_kset in correct order
-* It's important to move Dev to the end of devices_kset before
-* calling .probe, because it could be recursive and parent Dev
-* should always go first
-*/
-   devices_kset_move_last(dev);
-
if (dev->bus->probe) {
ret = dev->bus->probe(dev);
if (ret)
-- 
2.7.4