[PATCH kernel v3 3/6] KVM: PPC: Make iommu_table::it_userspace big endian
We are going to reuse multilevel TCE code for the userspace copy of the TCE table and since it is big endian, let's make the copy big endian too. Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h| 2 +- arch/powerpc/kvm/book3s_64_vio.c| 11 ++- arch/powerpc/kvm/book3s_64_vio_hv.c | 10 +- drivers/vfio/vfio_iommu_spapr_tce.c | 19 +-- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 20febe0..803ac70 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -117,7 +117,7 @@ struct iommu_table { unsigned long *it_map; /* A simple allocation bitmap for now */ unsigned long it_page_shift;/* table iommu page size */ struct list_head it_group_list;/* List of iommu_table_group_link */ - unsigned long *it_userspace; /* userspace view of the table */ + __be64 *it_userspace; /* userspace view of the table */ struct iommu_table_ops *it_ops; struct krefit_kref; }; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 8167ce8..6f34edd 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -377,19 +377,19 @@ static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, { struct mm_iommu_table_group_mem_t *mem = NULL; const unsigned long pgsize = 1ULL << tbl->it_page_shift; - unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); if (!pua) /* it_userspace allocation might be delayed */ return H_TOO_HARD; - mem = mm_iommu_lookup(kvm->mm, *pua, pgsize); + mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize); if (!mem) return H_TOO_HARD; mm_iommu_mapped_dec(mem); - *pua = 0; + *pua = cpu_to_be64(0); return H_SUCCESS; } @@ -436,7 +436,8 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, enum dma_data_direction dir) { long ret; - unsigned long hpa, *pua = 
IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + unsigned long hpa; + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); struct mm_iommu_table_group_mem_t *mem; if (!pua) @@ -463,7 +464,7 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (dir != DMA_NONE) kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); - *pua = ua; + *pua = cpu_to_be64(ua); return 0; } diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 5b298f5..841aef7 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -200,7 +200,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, { struct mm_iommu_table_group_mem_t *mem = NULL; const unsigned long pgsize = 1ULL << tbl->it_page_shift; - unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); if (!pua) /* it_userspace allocation might be delayed */ @@ -210,13 +210,13 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, if (WARN_ON_ONCE_RM(!pua)) return H_HARDWARE; - mem = mm_iommu_lookup_rm(kvm->mm, *pua, pgsize); + mem = mm_iommu_lookup_rm(kvm->mm, be64_to_cpu(*pua), pgsize); if (!mem) return H_TOO_HARD; mm_iommu_mapped_dec(mem); - *pua = 0; + *pua = cpu_to_be64(0); return H_SUCCESS; } @@ -268,7 +268,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, { long ret; unsigned long hpa = 0; - unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); struct mm_iommu_table_group_mem_t *mem; if (!pua) @@ -303,7 +303,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (dir != DMA_NONE) kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); - *pua = ua; + *pua = cpu_to_be64(ua); return 0; } diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 7cd63b0..17a418c 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ 
b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -230,7 +230,7 @@ static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl, decrement_locked_vm(mm, cb >> PAGE_SHIFT); return -ENOMEM; } - tbl->it_userspace = uas; + tbl->it_userspace = (__be64 *) uas; return 0; } @@ -482,20 +482,20 @@ static void tce_iommu_unuse_page_v2(struct tce_container
[PATCH kernel v3 2/6] powerpc/powernv: Move TCE manipulation code to its own file
Right now we have allocation code in pci-ioda.c and traversing code in pci.c, let's keep them together. However both files are big enough already so let's move this business to a new file. While we are at it, move the code which links IOMMU table groups to IOMMU tables as it is not specific to any PNV PHB model. This puts exported symbols from the new file together. This fixes several warnings from checkpatch.pl like this: "WARNING: Prefer 'unsigned int' to bare use of 'unsigned'". As this is almost cut-n-paste, there should be no behavioral change. Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/Makefile | 2 +- arch/powerpc/platforms/powernv/pci.h | 41 ++-- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 313 ++ arch/powerpc/platforms/powernv/pci-ioda.c | 146 arch/powerpc/platforms/powernv/pci.c | 158 - 5 files changed, 340 insertions(+), 320 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/pci-ioda-tce.c diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 703a350..b540ce8e 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -6,7 +6,7 @@ obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o -obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o +obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o obj-$(CONFIG_PPC_SCOM) += opal-xscom.o diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index eada4b6..fa90f60 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -201,13 +201,6 @@ struct pnv_phb { }; extern struct pci_ops pnv_pci_ops; -extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, - unsigned long uaddr, 
enum dma_data_direction direction, - unsigned long attrs); -extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); -extern int pnv_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction); -extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, unsigned char *log_buff); @@ -217,14 +210,6 @@ int pnv_pci_cfg_write(struct pci_dn *pdn, int where, int size, u32 val); extern struct iommu_table *pnv_pci_table_alloc(int nid); -extern long pnv_pci_link_table_and_group(int node, int num, - struct iommu_table *tbl, - struct iommu_table_group *table_group); -extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, - struct iommu_table_group *table_group); -extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, - void *tce_mem, u64 tce_size, - u64 dma_offset, unsigned page_shift); extern void pnv_pci_init_ioda_hub(struct device_node *np); extern void pnv_pci_init_ioda2_phb(struct device_node *np); extern void pnv_pci_init_npu_phb(struct device_node *np); @@ -272,4 +257,30 @@ extern void pnv_cxl_cx4_teardown_msi_irqs(struct pci_dev *pdev); /* phb ops (cxl switches these when enabling the kernel api on the phb) */ extern const struct pci_controller_ops pnv_cxl_cx4_ioda_controller_ops; +/* pci-ioda-tce.c */ +#define POWERNV_IOMMU_DEFAULT_LEVELS 1 +#define POWERNV_IOMMU_MAX_LEVELS 5 + +extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, enum dma_data_direction direction, + unsigned long attrs); +extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); +extern int pnv_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction); +extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); + +extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, + __u32 page_shift, __u64 window_size, __u32 
levels, + struct iommu_table *tbl); +extern void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); + +extern long pnv_pci_link_table_and_group(int node, int num, + struct iommu_table *tbl, + struct iommu_table_group *table_group); +extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, + struct iommu_table_group *table_group); +extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, + void *tce_mem, u64 tce_size, + u64 dma_offset, u
[PATCH kernel v3 0/6] powerpc/powernv/iommu: Optimize memory use
This patchset aims to reduce actual memory use for guests with sparse memory. The pseries guest uses dynamic DMA windows to map the entire guest RAM but it only actually maps onlined memory which may not be contiguous. I hit this when I tried passing through NVLink2-connected GPU RAM of NVIDIA V100 and trying to map this RAM at the same offset as in the real hardware forced me to rework how I handle these windows. This moves userspace-to-host-physical translation table (iommu_table::it_userspace) from VFIO TCE IOMMU subdriver to the platform code and reuses the already existing multilevel TCE table code which we have for the hardware tables. At last in 6/6 I switch to on-demand allocation so we do not allocate huge chunks of the table if we do not have to; there is some math in 6/6. Changes: v3: * rebased on v4.18-rc3 and fixed compile error in 6/6 v2: * bugfix and error handling in 6/6 This is based on sha1 021c917 Linus Torvalds "Linux 4.18-rc3". Please comment. Thanks. Alexey Kardashevskiy (6): powerpc/powernv: Remove useless wrapper powerpc/powernv: Move TCE manipulation code to its own file KVM: PPC: Make iommu_table::it_userspace big endian powerpc/powernv: Add indirect levels to it_userspace powerpc/powernv: Rework TCE level allocation powerpc/powernv/ioda: Allocate indirect TCE levels on demand arch/powerpc/platforms/powernv/Makefile | 2 +- arch/powerpc/include/asm/iommu.h | 11 +- arch/powerpc/platforms/powernv/pci.h | 44 ++- arch/powerpc/kvm/book3s_64_vio.c | 11 +- arch/powerpc/kvm/book3s_64_vio_hv.c | 18 +- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 399 ++ arch/powerpc/platforms/powernv/pci-ioda.c | 184 ++-- arch/powerpc/platforms/powernv/pci.c | 158 -- drivers/vfio/vfio_iommu_spapr_tce.c | 65 + 9 files changed, 478 insertions(+), 414 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/pci-ioda-tce.c -- 2.11.0
[PATCH kernel v3 6/6] powerpc/powernv/ioda: Allocate indirect TCE levels on demand
At the moment we allocate the entire TCE table, twice (hardware part and userspace translation cache). This normally works as we normally have contiguous memory and the guest will map entire RAM for 64bit DMA. However if we have sparse RAM (one example is a memory device), then we will allocate TCEs which will never be used as the guest only maps actual memory for DMA. If it is a single level TCE table, there is nothing we can really do but if it is a multilevel table, we can skip allocating TCEs we know we won't need. This adds the ability to allocate only the first level, saving memory. This changes iommu_table::free() to avoid allocating an extra level; iommu_table::set() will do this when needed. This adds @alloc parameter to iommu_table::exchange() to tell the callback if it can allocate an extra level; the flag is set to "false" for the realmode KVM handlers of H_PUT_TCE hcalls and the callback returns H_TOO_HARD. This still requires the entire table to be counted in mm::locked_vm. To be conservative, this only does on-demand allocation when the userspace cache table is requested which is the case of VFIO. The example math for a system replicating a powernv setup with NVLink2 in a guest: 16GB RAM mapped at 0x0 128GB GPU RAM window (16GB of actual RAM) mapped at 0x2440 the table to cover that all with 64K pages takes: (((0x2440 + 0x20) >> 16)*8)>>20 = 4556MB If we allocate only necessary TCE levels, we will only need: (((0x4 + 0x4) >> 16)*8)>>20 = 4MB (plus some for indirect levels). 
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson --- Changes: v2: * fixed bug in cleanup path which forced the entire table to be allocated right before destroying * added memory allocation error handling pnv_tce() --- arch/powerpc/include/asm/iommu.h | 7 ++- arch/powerpc/platforms/powernv/pci.h | 6 ++- arch/powerpc/kvm/book3s_64_vio_hv.c | 4 +- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 73 +-- arch/powerpc/platforms/powernv/pci-ioda.c | 8 +-- drivers/vfio/vfio_iommu_spapr_tce.c | 2 +- 6 files changed, 73 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 4bdcf22..daa3ee5 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -70,7 +70,7 @@ struct iommu_table_ops { unsigned long *hpa, enum dma_data_direction *direction); - __be64 *(*useraddrptr)(struct iommu_table *tbl, long index); + __be64 *(*useraddrptr)(struct iommu_table *tbl, long index, bool alloc); #endif void (*clear)(struct iommu_table *tbl, long index, long npages); @@ -122,10 +122,13 @@ struct iommu_table { __be64 *it_userspace; /* userspace view of the table */ struct iommu_table_ops *it_ops; struct krefit_kref; + int it_nid; }; +#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \ + ((tbl)->it_ops->useraddrptr((tbl), (entry), false)) #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \ - ((tbl)->it_ops->useraddrptr((tbl), (entry))) + ((tbl)->it_ops->useraddrptr((tbl), (entry), true)) /* Pure 2^n version of get_order */ static inline __attribute_const__ diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 2962f6d..0020937 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -266,8 +266,10 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long attrs); extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); extern int pnv_tce_xchg(struct iommu_table 
*tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction); -extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index); + unsigned long *hpa, enum dma_data_direction *direction, + bool alloc); +extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index, + bool alloc); extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 8cc1caf..efb90d8 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -200,7 +200,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, { struct mm_iommu_table_group_mem_t *mem = NULL; const unsigned long pgsize = 1ULL << tbl->it_page_shift; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); if (!pua) /* it_userspace allocation might be delayed */ @@ -264,7 +264,7 @@ static long k
[PATCH kernel v3 5/6] powerpc/powernv: Rework TCE level allocation
This moves actual pages allocation to a separate function which is going to be reused later in on-demand TCE allocation. While we are at it, remove unnecessary level size round up as the caller does this already. Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 30 +-- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index f14b282..36c2eb0 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -31,6 +31,23 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl, tbl->it_type = TCE_PCI; } +static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift) +{ + struct page *tce_mem = NULL; + __be64 *addr; + + tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT); + if (!tce_mem) { + pr_err("Failed to allocate a TCE memory, level shift=%d\n", + shift); + return NULL; + } + addr = page_address(tce_mem); + memset(addr, 0, 1UL << shift); + + return addr; +} + static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx) { __be64 *tmp = user ? 
tbl->it_userspace : (__be64 *) tbl->it_base; @@ -165,21 +182,12 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned int shift, unsigned int levels, unsigned long limit, unsigned long *current_offset, unsigned long *total_allocated) { - struct page *tce_mem = NULL; __be64 *addr, *tmp; - unsigned int order = max_t(unsigned int, shift, PAGE_SHIFT) - - PAGE_SHIFT; - unsigned long allocated = 1UL << (order + PAGE_SHIFT); + unsigned long allocated = 1UL << shift; unsigned int entries = 1UL << (shift - 3); long i; - tce_mem = alloc_pages_node(nid, GFP_KERNEL, order); - if (!tce_mem) { - pr_err("Failed to allocate a TCE memory, order=%d\n", order); - return NULL; - } - addr = page_address(tce_mem); - memset(addr, 0, allocated); + addr = pnv_alloc_tce_level(nid, shift); *total_allocated += allocated; --levels; -- 2.11.0
[PATCH kernel v3 4/6] powerpc/powernv: Add indirect levels to it_userspace
We want to support sparse memory and therefore huge chunks of DMA windows do not need to be mapped. If a DMA window is big enough to require 2 or more indirect levels, and a DMA window is used to map all RAM (which is a default case for 64bit window), we can actually save some memory by not allocating TCEs for regions which we are not going to map anyway. The hardware tables already support indirect levels but we also keep host-physical-to-userspace translation array which is allocated by vmalloc() and is a flat array which might use quite some memory. This converts it_userspace from vmalloc'ed array to a multi level table. As the format becomes platform dependent, this replaces the direct access to it_userspace with an iommu_table_ops::useraddrptr hook which returns a pointer to the userspace copy of a TCE; a future extension will return NULL if the level was not allocated. This should not change non-KVM handling of TCE tables and it_userspace will not be allocated for non-KVM tables. Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy --- Changes: v2: * fixed compile error by ditching one inline helper --- arch/powerpc/include/asm/iommu.h | 6 +-- arch/powerpc/platforms/powernv/pci.h | 3 +- arch/powerpc/kvm/book3s_64_vio_hv.c | 8 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 65 +-- arch/powerpc/platforms/powernv/pci-ioda.c | 23 +++--- drivers/vfio/vfio_iommu_spapr_tce.c | 46 --- 6 files changed, 73 insertions(+), 78 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 803ac70..4bdcf22 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -69,6 +69,8 @@ struct iommu_table_ops { long index, unsigned long *hpa, enum dma_data_direction *direction); + + __be64 *(*useraddrptr)(struct iommu_table *tbl, long index); #endif void (*clear)(struct iommu_table *tbl, long index, long npages); @@ -123,9 +125,7 @@ struct iommu_table { }; #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \ - 
((tbl)->it_userspace ? \ - &((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \ - NULL) + ((tbl)->it_ops->useraddrptr((tbl), (entry))) /* Pure 2^n version of get_order */ static inline __attribute_const__ diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index fa90f60..2962f6d 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -267,11 +267,12 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); extern int pnv_tce_xchg(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction); +extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index); extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table *tbl); + bool alloc_userspace_copy, struct iommu_table *tbl); extern void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); extern long pnv_pci_link_table_and_group(int node, int num, diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 841aef7..8cc1caf 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -206,10 +206,6 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, /* it_userspace allocation might be delayed */ return H_TOO_HARD; - pua = (void *) vmalloc_to_phys(pua); - if (WARN_ON_ONCE_RM(!pua)) - return H_HARDWARE; - mem = mm_iommu_lookup_rm(kvm->mm, be64_to_cpu(*pua), pgsize); if (!mem) return H_TOO_HARD; @@ -283,10 +279,6 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, &hpa))) return H_HARDWARE; - pua = (void *) vmalloc_to_phys(pua); - if (WARN_ON_ONCE_RM(!pua)) - return H_HARDWARE; - if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) return H_CLOSED; 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 700ceb1..f14b282 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -31,9 +31,9 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl, tbl->it_type = TCE_PCI; } -static __be64 *pnv_tce(struct iommu_table *tbl, long idx) +static __be64 *pnv_tce(struct iomm
[PATCH kernel v3 1/6] powerpc/powernv: Remove useless wrapper
This gets rid of a useless wrapper around pnv_pci_ioda2_table_free_pages(). Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/pci-ioda.c | 7 +-- 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index cc5942d..02275a0 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2199,11 +2199,6 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); } -static void pnv_ioda2_table_free(struct iommu_table *tbl) -{ - pnv_pci_ioda2_table_free_pages(tbl); -} - static struct iommu_table_ops pnv_ioda2_iommu_ops = { .set = pnv_ioda2_tce_build, #ifdef CONFIG_IOMMU_API @@ -2212,7 +2207,7 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = { #endif .clear = pnv_ioda2_tce_free, .get = pnv_tce_get, - .free = pnv_ioda2_table_free, + .free = pnv_pci_ioda2_table_free_pages, }; static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data) -- 2.11.0
Re: [PATCH 2/3] powerpc/powernv: DMA operations for discontiguous allocation
On Fri, 2018-06-29 at 17:34 +1000, Russell Currey wrote: > + /* > + * The TCE isn't being used, so let's try and > allocate it. > + * Bits 0 and 1 are read/write, and we use bit 2 as > a "lock" > + * bit. This is to prevent any race where the value > is set in > + * the TCE table but the invalidate/mb() hasn't > finished yet. > + */ > + entry = cpu_to_be64((addr - offset) | 7); > + ret = cmpxchg(&pe->tces[i], tce, entry); > + if (ret != tce) { > + /* conflict, start looking again just in > case */ > + i--; > + continue; > + } > + pnv_pci_phb3_tce_invalidate(pe, 0, 0, addr - offset, > 1); This is wrong and won't work outside of PHB3, will make a generic handler > + mb(); > + /* clear the lock bit now that we know it's active > */ > + ret = cmpxchg(&pe->tces[i], entry, cpu_to_be64((addr > - offset) | 3)); > + if (ret != entry) { > + /* conflict, start looking again just in > case */ > + i--; > + continue; > + } > + > + return (i << phb->ioda.max_tce_order) | offset; > + } > + /* If we get here, the table must be full, so error out. */ > + return -1ULL; > +} > + qtpass.desktop Description: application/desktop
[PATCH kernel v3 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page
A VM which has: - a DMA capable device passed through to it (eg. network card); - running a malicious kernel that ignores H_PUT_TCE failure; - capability of using IOMMU pages bigger than physical pages can create an IOMMU mapping that exposes (for example) 16MB of the host physical memory to the device when only 64K was allocated to the VM. The remaining 16MB - 64K will be some other content of host memory, possibly including pages of the VM, but also pages of host kernel memory, host programs or other VMs. The attacking VM does not control the location of the page it can map, and is only allowed to map as many pages as it has pages of RAM. We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that an IOMMU page is contained in the physical page so the PCI hardware won't get access to unassigned host memory; however this check is missing in the KVM fastpath (H_PUT_TCE accelerated code). We were lucky so far and did not hit this yet as the very first time when the mapping happens we do not have tbl::it_userspace allocated yet and fall back to the userspace which in turn calls the VFIO IOMMU driver, this fails and the guest does not retry. This stores the smallest preregistered page size in the preregistered region descriptor and changes the mm_iommu_xxx API to check this against the IOMMU page size. This only allows huge page use if the entire preregistered block is backed with huge pages which are completely contained within the preregistered chunk; otherwise this defaults to PAGE_SIZE. 
Signed-off-by: Alexey Kardashevskiy --- Changes: v3: * fixed upper limit for the page size * added checks that we don't register parts of a huge page v2: * explicitely check for compound pages before calling compound_order() --- The bug is: run QEMU _without_ hugepages (no -mempath) and tell it to advertise 16MB pages to the guest; a typical pseries guest will use 16MB for IOMMU pages without checking the mmu pagesize and this will fail at https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256 With the change, mapping will fail in KVM and the guest will print: mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 800 2000 18 1f returned 0 (liobn = 0x8001 starting addr = 800 0) mlx5_core :00:00.0: created tce table LIOBN 0x8001 for /pci@8002000/ethernet@0 mlx5_core :00:00.0: failed to map direct window for /pci@8002000/ethernet@0: -1 --- arch/powerpc/include/asm/mmu_context.h | 4 ++-- arch/powerpc/kvm/book3s_64_vio.c | 2 +- arch/powerpc/kvm/book3s_64_vio_hv.c| 6 +++-- arch/powerpc/mm/mmu_context_iommu.c| 41 +++--- drivers/vfio/vfio_iommu_spapr_tce.c| 2 +- 5 files changed, 46 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 896efa5..79d570c 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -35,9 +35,9 @@ extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm( extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, unsigned long ua, unsigned long entries); extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, - unsigned long ua, unsigned long *hpa); + unsigned long ua, unsigned int pageshift, unsigned long *hpa); extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, - unsigned long ua, unsigned long *hpa); + unsigned long ua, unsigned int pageshift, unsigned long *hpa); extern long 
mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); #endif diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index d066e37..8c456fa 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -449,7 +449,7 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, /* This only handles v2 IOMMU type, v1 is handled via ioctl() */ return H_TOO_HARD; - if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, &hpa))) + if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa))) return H_HARDWARE; if (mm_iommu_mapped_inc(mem)) diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 925fc31..5b298f5 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -279,7 +279,8 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (!mem) return H_TOO_HARD; - if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa))) + if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift, + &hpa))) re
[PATCH kernel v3 0/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page
This is to improve page boundary checking and should probably be cc:stable. I came across this while debugging nvlink2 passthrough but the lack of checking might be exploited by the existing userspace. Changes: v3: * enforced huge pages not to cross preregistered chunk boundaries v2: * 2/2: explicitly check for compound pages before calling compound_order() This is based on sha1 021c917 Linus Torvalds "Linux 4.18-rc3". Please comment. Thanks. Alexey Kardashevskiy (2): vfio/spapr: Use IOMMU pageshift rather than pagesize KVM: PPC: Check if IOMMU page is contained in the pinned physical page arch/powerpc/include/asm/mmu_context.h | 4 ++-- arch/powerpc/kvm/book3s_64_vio.c | 2 +- arch/powerpc/kvm/book3s_64_vio_hv.c| 6 +++-- arch/powerpc/mm/mmu_context_iommu.c| 41 +++--- drivers/vfio/vfio_iommu_spapr_tce.c| 10 - 5 files changed, 50 insertions(+), 13 deletions(-) -- 2.11.0
[PATCH kernel v3 1/2] vfio/spapr: Use IOMMU pageshift rather than pagesize
The size is always equal to 1 page so let's use this. Later on this will be used for other checks which use page shifts to check the granularity of access. This should cause no behavioral change. Reviewed-by: David Gibson Acked-by: Alex Williamson Signed-off-by: Alexey Kardashevskiy --- As Alex suggested, this should go via the ppc tree which the next patch is going to (which is ppc-kvm, probably?). --- drivers/vfio/vfio_iommu_spapr_tce.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 759a5bd..2da5f05 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -457,13 +457,13 @@ static void tce_iommu_unuse_page(struct tce_container *container, } static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container, - unsigned long tce, unsigned long size, + unsigned long tce, unsigned long shift, unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem) { long ret = 0; struct mm_iommu_table_group_mem_t *mem; - mem = mm_iommu_lookup(container->mm, tce, size); + mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift); if (!mem) return -EINVAL; @@ -487,7 +487,7 @@ static void tce_iommu_unuse_page_v2(struct tce_container *container, if (!pua) return; - ret = tce_iommu_prereg_ua_to_hpa(container, *pua, IOMMU_PAGE_SIZE(tbl), + ret = tce_iommu_prereg_ua_to_hpa(container, *pua, tbl->it_page_shift, &hpa, &mem); if (ret) pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n", @@ -611,7 +611,7 @@ static long tce_iommu_build_v2(struct tce_container *container, entry + i); ret = tce_iommu_prereg_ua_to_hpa(container, - tce, IOMMU_PAGE_SIZE(tbl), &hpa, &mem); + tce, tbl->it_page_shift, &hpa, &mem); if (ret) break; -- 2.11.0
Re: [PATCHv3 3/4] drivers/base: clean up the usage of devices_kset_move_last()
On Tue, Jul 3, 2018 at 10:28 PM Rafael J. Wysocki wrote: > > On Tuesday, July 3, 2018 8:50:41 AM CEST Pingfan Liu wrote: > > Clean up the referring to the code in commit 52cdbdd49853 ("driver core: > > correct device's shutdown order"). So later we can revert it safely. > > > > Cc: Greg Kroah-Hartman > > Cc: Rafael J. Wysocki > > Cc: Grygorii Strashko > > Cc: Christoph Hellwig > > Cc: Bjorn Helgaas > > Cc: Dave Young > > Cc: linux-...@vger.kernel.org > > Cc: linuxppc-dev@lists.ozlabs.org > > Signed-off-by: Pingfan Liu > > --- > > drivers/base/core.c | 7 --- > > 1 file changed, 7 deletions(-) > > > > diff --git a/drivers/base/core.c b/drivers/base/core.c > > index 684b994..db3deb8 100644 > > --- a/drivers/base/core.c > > +++ b/drivers/base/core.c > > @@ -127,13 +127,6 @@ static int device_reorder_to_tail(struct device *dev, > > void *not_used) > > { > > struct device_link *link; > > > > - /* > > - * Devices that have not been registered yet will be put to the ends > > - * of the lists during the registration, so skip them here. > > - */ > > - if (device_is_registered(dev)) > > - devices_kset_move_last(dev); > > - > > if (device_pm_initialized(dev)) > > device_pm_move_last(dev); > > You can't do this. > > If you do it, that will break power management in some situations. > Could you shed light on it? I had a quick browsing of pm code, but it is a big function, and I got lost in it. If the above code causes failure, then does it imply that the seq in devices_kset should be the same as dpm_list? But in device_shutdown(), it only intersect with pm by pm_runtime_get_noresume(dev) and pm_runtime_barrier(dev). How do these function affect the seq in dpm_list? Need your help to see how to handle this issue. Thanks and regards, Pingfan
Re: [PATCH v5 1/4] resource: Move reparent_resources() to kernel/resource.c and make it public
On 07/03/18 at 11:57pm, Andy Shevchenko wrote: > On Tue, Jul 3, 2018 at 5:55 PM, Baoquan He wrote: > > On 06/12/18 at 05:24pm, Andy Shevchenko wrote: > >> On Tue, Jun 12, 2018 at 5:20 PM, Andy Shevchenko > >> wrote: > > >> > I briefly looked at the code and error codes we have, so, my proposal > >> > is one of the following > >> > >> > - use -ECANCELED (not the best choice for first occurrence here, > >> > though I can't find better) > >> > >> Actually -ENOTSUPP might suit the first case (although the actual > >> would be something like -EOVERLAP, which we don't have) > > > > Sorry for late reply, and many thanks for your great suggestion. > > > > > I am fine to use -ENOTSUPP as the first returned value, and -ECANCELED > > for the 2nd one. > > I have no strong opinion, but I like (slightly better) this approach ^^^ Done, post v6 in this way, many thanks. > > > Or define an enum as you suggested inside the function > > or in header file. > > > > > Or use -EBUSY for the first case because existing resource is > > overlapping but not fully contained by 'res'; and -EINVAL for > > the 2nd case since didn't find any one resources which is contained by > > 'res', means we passed in a invalid resource. > > > > All is fine to me, I can repost with each of them. > > >> > - use positive integers (or enum), like > >> > #define RES_REPARENTED 0 > >> > #define RES_OVERLAPPED 1 > >> > #define RES_NOCONFLICT 2 > > -- > With Best Regards, > Andy Shevchenko
[PATCH v6 4/4] kexec_file: Load kernel at top of system RAM if required
For kexec_file loading, if kexec_buf.top_down is 'true', the memory which is used to load kernel/initrd/purgatory is supposed to be allocated from top to down. This is what we have been doing all along in the old kexec loading interface and the kexec loading is still default setting in some distributions. However, the current kexec_file loading interface doesn't do like this. The function arch_kexec_walk_mem() it calls ignores checking kexec_buf.top_down, but calls walk_system_ram_res() directly to go through all resources of System RAM from bottom to up, to try to find memory region which can contain the specific kexec buffer, then call locate_mem_hole_callback() to allocate memory in that found memory region from top to down. This brings confusion especially when KASLR is widely supported, users have to make clear why kexec/kdump kernel loading position is different between these two interfaces in order to exclude unnecessary noises. Hence these two interfaces need be unified on behaviour. Here add checking if kexec_buf.top_down is 'true' in arch_kexec_walk_mem(), if yes, call the newly added walk_system_ram_res_rev() to find memory region from top to down to load kernel. Signed-off-by: Baoquan He Cc: Eric Biederman Cc: Vivek Goyal Cc: Dave Young Cc: Andrew Morton Cc: Yinghai Lu Cc: ke...@lists.infradead.org --- kernel/kexec_file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index c6a3b6851372..75226c1d08ce 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -518,6 +518,8 @@ int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, crashk_res.start, crashk_res.end, kbuf, func); + else if (kbuf->top_down) + return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func); else return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } -- 2.13.6
[PATCH v6 3/4] resource: add walk_system_ram_res_rev()
This function, being a variant of walk_system_ram_res() introduced in commit 8c86e70acead ("resource: provide new functions to walk through resources"), walks through a list of all the resources of System RAM in reversed order, i.e., from higher to lower. It will be used in kexec_file code. Signed-off-by: Baoquan He Cc: Andrew Morton Cc: Thomas Gleixner Cc: Brijesh Singh Cc: "Jérôme Glisse" Cc: Borislav Petkov Cc: Tom Lendacky Cc: Wei Yang --- include/linux/ioport.h | 3 +++ kernel/resource.c | 40 2 files changed, 43 insertions(+) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index b7456ae889dd..066cc263e2cc 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -279,6 +279,9 @@ extern int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int +walk_system_ram_res_rev(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)); +extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); diff --git a/kernel/resource.c b/kernel/resource.c index 6d647a3824b1..4c5fbef4ea24 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include @@ -443,6 +445,44 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, } /* + * This function, being a variant of walk_system_ram_res(), calls the @func + * callback against all memory ranges of type System RAM which are marked as + * IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY in reversed order, i.e., from + * higher to lower. 
+ */ +int walk_system_ram_res_rev(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)) +{ + unsigned long flags; + struct resource *res; + int ret = -1; + + flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + + read_lock(&resource_lock); + list_for_each_entry_reverse(res, &iomem_resource.child, sibling) { + if (start >= end) + break; + if ((res->flags & flags) != flags) + continue; + if (res->desc != IORES_DESC_NONE) + continue; + if (res->end < start) + break; + + if ((res->end >= start) && (res->start < end)) { + ret = (*func)(res, arg); + if (ret) + break; + } + end = res->start - 1; + + } + read_unlock(&resource_lock); + return ret; +} + +/* * This function calls the @func callback against all memory ranges, which * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY. */ -- 2.13.6
[PATCH v6 2/4] resource: Use list_head to link sibling resource
The struct resource uses singly linked list to link siblings, implemented by pointer operation. Replace it with list_head for better code readability. Based on this list_head replacement, it will be very easy to do reverse iteration on iomem_resource's sibling list in later patch. Besides, type of member variables of struct resource, sibling and child, are changed from 'struct resource *' to 'struct list_head'. This brings two pointers of size increase. Suggested-by: Andrew Morton Signed-off-by: Baoquan He Cc: Patrik Jakobsson Cc: David Airlie Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Dmitry Torokhov Cc: Dan Williams Cc: Rob Herring Cc: Frank Rowand Cc: Keith Busch Cc: Jonathan Derrick Cc: Lorenzo Pieralisi Cc: Bjorn Helgaas Cc: Thomas Gleixner Cc: Brijesh Singh Cc: "Jérôme Glisse" Cc: Borislav Petkov Cc: Tom Lendacky Cc: Greg Kroah-Hartman Cc: Yaowei Bai Cc: Wei Yang Cc: de...@linuxdriverproject.org Cc: linux-in...@vger.kernel.org Cc: linux-nvd...@lists.01.org Cc: devicet...@vger.kernel.org Cc: linux-...@vger.kernel.org --- arch/arm/plat-samsung/pm-check.c| 6 +- arch/microblaze/pci/pci-common.c| 4 +- arch/powerpc/kernel/pci-common.c| 4 +- arch/sparc/kernel/ioport.c | 2 +- arch/xtensa/include/asm/pci-bridge.h| 4 +- drivers/eisa/eisa-bus.c | 2 + drivers/gpu/drm/drm_memory.c| 3 +- drivers/gpu/drm/gma500/gtt.c| 5 +- drivers/hv/vmbus_drv.c | 52 +++ drivers/input/joystick/iforce/iforce-main.c | 4 +- drivers/nvdimm/namespace_devs.c | 6 +- drivers/nvdimm/nd.h | 5 +- drivers/of/address.c| 4 +- drivers/parisc/lba_pci.c| 4 +- drivers/pci/controller/vmd.c| 8 +- drivers/pci/probe.c | 2 + drivers/pci/setup-bus.c | 2 +- include/linux/ioport.h | 17 ++- kernel/resource.c | 208 ++-- 19 files changed, 175 insertions(+), 167 deletions(-) diff --git a/arch/arm/plat-samsung/pm-check.c b/arch/arm/plat-samsung/pm-check.c index cd2c02c68bc3..5494355b1c49 100644 --- a/arch/arm/plat-samsung/pm-check.c +++ b/arch/arm/plat-samsung/pm-check.c @@ -46,8 +46,8 @@ typedef 
u32 *(run_fn_t)(struct resource *ptr, u32 *arg); static void s3c_pm_run_res(struct resource *ptr, run_fn_t fn, u32 *arg) { while (ptr != NULL) { - if (ptr->child != NULL) - s3c_pm_run_res(ptr->child, fn, arg); + if (!list_empty(&ptr->child)) + s3c_pm_run_res(resource_first_child(&ptr->child), fn, arg); if ((ptr->flags & IORESOURCE_SYSTEM_RAM) == IORESOURCE_SYSTEM_RAM) { @@ -57,7 +57,7 @@ static void s3c_pm_run_res(struct resource *ptr, run_fn_t fn, u32 *arg) arg = (fn)(ptr, arg); } - ptr = ptr->sibling; + ptr = resource_sibling(ptr); } } diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index 7899bafab064..2bf73e27e231 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -533,7 +533,9 @@ void pci_process_bridge_OF_ranges(struct pci_controller *hose, res->flags = range.flags; res->start = range.cpu_addr; res->end = range.cpu_addr + range.size - 1; - res->parent = res->child = res->sibling = NULL; + res->parent = NULL; + INIT_LIST_HEAD(&res->child); + INIT_LIST_HEAD(&res->sibling); } } diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 926035bb378d..28fbe83c9daf 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -761,7 +761,9 @@ void pci_process_bridge_OF_ranges(struct pci_controller *hose, res->flags = range.flags; res->start = range.cpu_addr; res->end = range.cpu_addr + range.size - 1; - res->parent = res->child = res->sibling = NULL; + res->parent = NULL; + INIT_LIST_HEAD(&res->child); + INIT_LIST_HEAD(&res->sibling); } } } diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index cca9134cfa7d..99efe4e98b16 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -669,7 +669,7 @@ static int sparc_io_proc_show(struct seq_file *m, void *v) struct resource *root = m->private, *r; const char *nm; - for (r = root->child; r != NULL; r = r->sibling) { +
[PATCH v6 1/4] resource: Move reparent_resources() to kernel/resource.c and make it public
reparent_resources() is duplicated in arch/microblaze/pci/pci-common.c and arch/powerpc/kernel/pci-common.c, so move it to kernel/resource.c so that it's shared. Signed-off-by: Baoquan He --- arch/microblaze/pci/pci-common.c | 37 - arch/powerpc/kernel/pci-common.c | 35 --- include/linux/ioport.h | 1 + kernel/resource.c| 39 +++ 4 files changed, 40 insertions(+), 72 deletions(-) diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index f34346d56095..7899bafab064 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -619,43 +619,6 @@ int pcibios_add_device(struct pci_dev *dev) EXPORT_SYMBOL(pcibios_add_device); /* - * Reparent resource children of pr that conflict with res - * under res, and make res replace those children. - */ -static int __init reparent_resources(struct resource *parent, -struct resource *res) -{ - struct resource *p, **pp; - struct resource **firstpp = NULL; - - for (pp = &parent->child; (p = *pp) != NULL; pp = &p->sibling) { - if (p->end < res->start) - continue; - if (res->end < p->start) - break; - if (p->start < res->start || p->end > res->end) - return -1; /* not completely contained */ - if (firstpp == NULL) - firstpp = pp; - } - if (firstpp == NULL) - return -1; /* didn't find any conflicting entries? */ - res->parent = parent; - res->child = *firstpp; - res->sibling = *pp; - *firstpp = res; - *pp = NULL; - for (p = res->child; p != NULL; p = p->sibling) { - p->parent = res; - pr_debug("PCI: Reparented %s [%llx..%llx] under %s\n", -p->name, -(unsigned long long)p->start, -(unsigned long long)p->end, res->name); - } - return 0; -} - -/* * Handle resources of PCI devices. If the world were perfect, we could * just allocate all the resource regions and do nothing more. It isn't. 
* On the other hand, we cannot just re-allocate all devices, as it would diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index fe9733aa..926035bb378d 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1088,41 +1088,6 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, EXPORT_SYMBOL(pcibios_align_resource); /* - * Reparent resource children of pr that conflict with res - * under res, and make res replace those children. - */ -static int reparent_resources(struct resource *parent, -struct resource *res) -{ - struct resource *p, **pp; - struct resource **firstpp = NULL; - - for (pp = &parent->child; (p = *pp) != NULL; pp = &p->sibling) { - if (p->end < res->start) - continue; - if (res->end < p->start) - break; - if (p->start < res->start || p->end > res->end) - return -1; /* not completely contained */ - if (firstpp == NULL) - firstpp = pp; - } - if (firstpp == NULL) - return -1; /* didn't find any conflicting entries? */ - res->parent = parent; - res->child = *firstpp; - res->sibling = *pp; - *firstpp = res; - *pp = NULL; - for (p = res->child; p != NULL; p = p->sibling) { - p->parent = res; - pr_debug("PCI: Reparented %s %pR under %s\n", -p->name, p, res->name); - } - return 0; -} - -/* * Handle resources of PCI devices. If the world were perfect, we could * just allocate all the resource regions and do nothing more. It isn't. 
* On the other hand, we cannot just re-allocate all devices, as it would diff --git a/include/linux/ioport.h b/include/linux/ioport.h index da0ebaec25f0..dfdcd0bfe54e 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -192,6 +192,7 @@ extern int allocate_resource(struct resource *root, struct resource *new, struct resource *lookup_resource(struct resource *root, resource_size_t start); int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size); +int reparent_resources(struct resource *parent, struct resource *res); resource_size_t resource_alignment(struct resource *res); static inline resource_size_t resource_size(const struct resource *res) { diff --git a/kernel/resource.c b/kernel/resource.c index 30e1bc68503b..d1cbf4b50e17 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@
[PATCH v6 0/4] resource: Use list_head to link sibling resource
This patchset is doing: 1) Replace struct resource's sibling list from singly linked list to list_head. Clearing out those pointer operation within singly linked list for better code readability. 2) Based on list_head replacement, add a new function walk_system_ram_res_rev() which does reversed iteration on iomem_resource's siblings. 3) Change kexec_file loading to search system RAM top down for kernel loading, using walk_system_ram_res_rev(). Note: This patchset only passed testing on x86_64 arch with network enabling. The thing we need pay attention to is that a root resource's child member need be initialized specifically with LIST_HEAD_INIT() if statically defined or INIT_LIST_HEAD() for dynamic definition. Here, just like we do for iomem_resource/ioport_resource, or the change in get_pci_domain_busn_res(). v5: http://lkml.kernel.org/r/20180612032831.29747-1-...@redhat.com v4: http://lkml.kernel.org/r/20180507063224.24229-1-...@redhat.com v3: http://lkml.kernel.org/r/20180419001848.3041-1-...@redhat.com v2: http://lkml.kernel.org/r/20180408024724.16812-1-...@redhat.com v1: http://lkml.kernel.org/r/20180322033722.9279-1-...@redhat.com Changelog: v5->v6: Fix code style problems in reparent_resources() and use existing error codes, according to Andy's suggestion. Fix bugs test robot reported. v4->v5: Add new patch 0001 to move duplicated reparent_resources() to kernel/resource.c to make it be shared by different ARCH-es. Fix several code bugs reported by test robot on ARCH powerpc and microblaze. v3->v4: Fix several bugs test robot reported. Rewrite cover letter and patch log according to reviewer's comment. v2->v3: Rename resource functions first_child() and sibling() to resource_first_child() and resource_sibling(). Dan suggested this. Move resource_first_child() and resource_sibling() to linux/ioport.h and make them as inline function. Rob suggested this. 
Accordingly add linux/list.h including in linux/ioport.h, please help review if this bring efficiency degradation or code redundancy. The change on struct resource {} bring two pointers of size increase, mention this in git log to make it more specifically, Rob suggested this. v1->v2: Use list_head instead to link resource siblings. This is suggested by Andrew. Rewrite walk_system_ram_res_rev() after list_head is taken to link resouce siblings. Baoquan He (4): resource: Move reparent_resources() to kernel/resource.c and make it public resource: Use list_head to link sibling resource resource: add walk_system_ram_res_rev() kexec_file: Load kernel at top of system RAM if required arch/arm/plat-samsung/pm-check.c| 6 +- arch/microblaze/pci/pci-common.c| 41 + arch/powerpc/kernel/pci-common.c| 39 + arch/sparc/kernel/ioport.c | 2 +- arch/xtensa/include/asm/pci-bridge.h| 4 +- drivers/eisa/eisa-bus.c | 2 + drivers/gpu/drm/drm_memory.c| 3 +- drivers/gpu/drm/gma500/gtt.c| 5 +- drivers/hv/vmbus_drv.c | 52 +++--- drivers/input/joystick/iforce/iforce-main.c | 4 +- drivers/nvdimm/namespace_devs.c | 6 +- drivers/nvdimm/nd.h | 5 +- drivers/of/address.c| 4 +- drivers/parisc/lba_pci.c| 4 +- drivers/pci/controller/vmd.c| 8 +- drivers/pci/probe.c | 2 + drivers/pci/setup-bus.c | 2 +- include/linux/ioport.h | 21 ++- kernel/kexec_file.c | 2 + kernel/resource.c | 263 ++-- 20 files changed, 248 insertions(+), 227 deletions(-) -- 2.13.6
Re: [PATCHv5 1/4] kbuild: Add build salt to the kernel and modules
Hi. Thanks for the update. 2018-07-04 8:34 GMT+09:00 Laura Abbott : > > The build id generated from --build-id can be generated in several different > ways, with the default being the sha1 on the output of the linked file. For > distributions, it can be useful to make sure this ID is unique, even if the > actual file contents don't change. The easiest way to do this is to insert > a section with some data. > > Add an ELF note to both the kernel and module which contains some data based > off of a config option. > > Signed-off-by: Masahiro Yamada > Signed-off-by: Laura Abbott > --- > v5: I used S-o-b here since the majority of the code was written > already. I think Suggested-by is good enough. S-o-b is appended as a patch is passed from people to people. Anyway, this looks good except one bike-shed. > Please feel free to change the tag if you think it's not > appropriate. I also tweaked this to take an ascii string instead of just > a hex value since this makes things much easier on the distribution > side. > --- > diff --git a/init/Kconfig b/init/Kconfig > index 041f3a022122..8de789f40db9 100644 > --- a/init/Kconfig > +++ b/init/Kconfig > @@ -107,6 +107,15 @@ config LOCALVERSION_AUTO > > which is done within the script "scripts/setlocalversion".) > > +config BUILD_SALT > + string "Build ID Salt" > + default "Linux" How about empty string "" for default? -- Best Regards Masahiro Yamada
Re: [PATCHv5 0/4] Salted build ids via ELF notes
Hi. 2018-07-04 8:21 GMT+09:00 Laura Abbott : > > Hi, > > This is v5 of the series to allow unique build ids in the kernel. As a > reminder of the context: > "" > In Fedora, the debug information is packaged separately (foo-debuginfo) and > can be installed separately. There's been a long standing issue where only one > version of a debuginfo info package can be installed at a time. Mark Wielaard > made an effort for Fedora 27 to allow parallel installation of debuginfo (see > https://fedoraproject.org/wiki/Changes/ParallelInstallableDebuginfo for > more details) > > Part of the requirement to allow this to work is that build ids are > unique between builds. The existing upstream rpm implementation ensures > this by re-calculating the build-id using the version and release as a > seed. This doesn't work 100% for the kernel because of the vDSO which is > its own binary and doesn't get updated. After poking holes in a few of my > ideas, there was a discussion with some people from the binutils team about > adding --build-id-salt to let ld do the calculation debugedit is doing. There > was a counter proposal made to add in the salt while building. The > easiest proposal was to add an item in the linker script vs. linking in > an object since we need the salt to go in every module as well as the > kernel and vmlinux. > "" I think this information is helpful to explain the background of this work, but the cover letter cannot be committed in git. Could you add this in 1/4 please? If I read only the simple log in 1/4, I would wonder why it is useful... > v5 uses the approach suggested by Masahiro Yamada which uses the > existing ELF note macro to more easily add the salt (vs previous > approaches which tried to adjust via linker section). > > If arch maintainers are okay, I'd like acks for this so this can go > through the kbuild tree. 
> > Thanks, > Laura > > Laura Abbott (4): > kbuild: Add build salt to the kernel and modules > x86: Add build salt to the vDSO > powerpc: Add build salt to the vDSO > arm64: Add build salt to the vDSO > > arch/arm64/kernel/vdso/note.S | 3 +++ > arch/powerpc/kernel/vdso32/note.S | 3 +++ > arch/x86/entry/vdso/vdso-note.S | 3 +++ > arch/x86/entry/vdso/vdso32/note.S | 3 +++ > include/linux/build-salt.h| 20 > init/Kconfig | 9 + > init/version.c| 3 +++ > scripts/mod/modpost.c | 3 +++ > 8 files changed, 47 insertions(+) > create mode 100644 include/linux/build-salt.h > > -- > 2.17.1 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-kbuild" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Best Regards Masahiro Yamada
Re: [PATCHv5 4/4] arm64: Add build salt to the vDSO
Hi. 2018-07-04 8:34 GMT+09:00 Laura Abbott : > > The vDSO needs to have a unique build id in a similar manner > to the kernel and modules. Use the build salt macro. > > Signed-off-by: Laura Abbott > --- > v5: I was previously focused on x86 only but since powerpc gave a patch, > I figured I would do arm64 since the changes were also fairly simple. > --- > arch/arm64/kernel/vdso/note.S | 3 +++ > 1 file changed, 3 insertions(+) > > diff --git a/arch/arm64/kernel/vdso/note.S b/arch/arm64/kernel/vdso/note.S > index b82c85e5d972..2c429dfd3f45 100644 > --- a/arch/arm64/kernel/vdso/note.S > +++ b/arch/arm64/kernel/vdso/note.S > @@ -22,7 +22,10 @@ > #include > #include > #include > +#include > > ELFNOTE_START(Linux, 0, "a") > .long LINUX_VERSION_CODE > ELFNOTE_END > + > +BUILD_SALT; I think this works, but I prefer no-semicolon in assembly files. For coding consistency, I want ';' as statement delimiter in .c files. But, only new line after each statement in .S files. For example, in arch/x86/xen/xen-head.S I see no semicolon after ELFNOTE(). I found this: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0473k/dom1359731141352.html It says ';' starts a comment line although it is not the case of GAS. Same for 3/4. -- Best Regards Masahiro Yamada
RE: [PATCH v11 00/26] Speculative page faults
Hi Laurent, For the test result on Intel 4s skylake platform (192 CPUs, 768G Memory), the below test cases all were run 3 times. I check the test results, only page_fault3_thread/enable THP have 6% stddev for head commit, other tests have lower stddev. And I did not find other high variation on test case result. a). Enable THP testcase base stddev change head stddev metric page_fault3/enable THP 10519 ± 3%-20.5% 8368 ±6% will-it-scale.per_thread_ops page_fault2/enalbe THP8281 ± 2%-18.8% 6728 will-it-scale.per_thread_ops brk1/eanble THP 998475 -2.2%976893 will-it-scale.per_process_ops context_switch1/enable THP 223910 -1.3%220930 will-it-scale.per_process_ops context_switch1/enable THP 233722 -1.0%231288 will-it-scale.per_thread_ops b). Disable THP page_fault3/disable THP 10856 -23.1% 8344 will-it-scale.per_thread_ops page_fault2/disable THP 8147 -18.8% 6613 will-it-scale.per_thread_ops brk1/disable THP 957-7.9% 881 will-it-scale.per_thread_ops context_switch1/disable THP 237006-2.2%231907 will-it-scale.per_thread_ops brk1/disable THP997317-2.0%98 will-it-scale.per_process_ops page_fault3/disable THP 467454-1.8%459251 will-it-scale.per_process_ops context_switch1/disable THP 224431-1.3%221567 will-it-scale.per_process_ops Best regards, Haiyan Song From: Laurent Dufour [lduf...@linux.vnet.ibm.com] Sent: Monday, July 02, 2018 4:59 PM To: Song, HaiyanX Cc: a...@linux-foundation.org; mho...@kernel.org; pet...@infradead.org; kir...@shutemov.name; a...@linux.intel.com; d...@stgolabs.net; j...@suse.cz; Matthew Wilcox; khand...@linux.vnet.ibm.com; aneesh.ku...@linux.vnet.ibm.com; b...@kernel.crashing.org; m...@ellerman.id.au; pau...@samba.org; Thomas Gleixner; Ingo Molnar; h...@zytor.com; Will Deacon; Sergey Senozhatsky; sergey.senozhatsky.w...@gmail.com; Andrea Arcangeli; Alexei Starovoitov; Wang, Kemi; Daniel Jordan; David Rientjes; Jerome Glisse; Ganesh Mahendran; Minchan Kim; Punit Agrawal; vinayak menon; Yang Shi; linux-ker...@vger.kernel.org; linux...@kvack.org; 
ha...@linux.vnet.ibm.com; npig...@gmail.com; bsinghar...@gmail.com; paul...@linux.vnet.ibm.com; Tim Chen; linuxppc-dev@lists.ozlabs.org; x...@kernel.org Subject: Re: [PATCH v11 00/26] Speculative page faults On 11/06/2018 09:49, Song, HaiyanX wrote: > Hi Laurent, > > Regression test for v11 patch serials have been run, some regression is found > by LKP-tools (linux kernel performance) > tested on Intel 4s skylake platform. This time only test the cases which have > been run and found regressions on > V9 patch serials. > > The regression result is sorted by the metric will-it-scale.per_thread_ops. > branch: Laurent-Dufour/Speculative-page-faults/20180520-045126 > commit id: > head commit : a7a8993bfe3ccb54ad468b9f1799649e4ad1ff12 > base commit : ba98a1cdad71d259a194461b3a61471b49b14df1 > Benchmark: will-it-scale > Download link: https://github.com/antonblanchard/will-it-scale/tree/master > > Metrics: > will-it-scale.per_process_ops=processes/nr_cpu > will-it-scale.per_thread_ops=threads/nr_cpu > test box: lkp-skl-4sp1(nr_cpu=192,memory=768G) > THP: enable / disable > nr_task:100% > > 1. Regressions: > > a). Enable THP > testcase base change head > metric > page_fault3/enable THP 10519 -20.5%836 > will-it-scale.per_thread_ops > page_fault2/enalbe THP8281 -18.8% 6728 > will-it-scale.per_thread_ops > brk1/eanble THP 998475 -2.2% 976893 > will-it-scale.per_process_ops > context_switch1/enable THP 223910 -1.3% 220930 > will-it-scale.per_process_ops > context_switch1/enable THP 233722 -1.0% 231288 > will-it-scale.per_thread_ops > > b). Disable THP > page_fault3/disable THP 10856 -23.1% 8344 > will-it-scale.per_thread_ops > page_fault2/disable THP 8147 -18.8% 6613 > will-it-scale.per_thread_ops > brk1/disable THP 957 -7.9%881 > will-it-scale.per_thread_ops > context_switch1/disable THP 237006 -2.2% 231907 > will-it-scale.per_thread_ops > brk1/disable THP997317 -2.0% 98 > will-
Re: [PATCHv3 2/4] drivers/base: utilize device tree info to shutdown devices
On Tue, Jul 3, 2018 at 5:26 PM Pingfan Liu wrote: > > On Tue, Jul 3, 2018 at 3:51 PM Lukas Wunner wrote: > > > > On Tue, Jul 03, 2018 at 02:50:40PM +0800, Pingfan Liu wrote: > > > commit 52cdbdd49853 ("driver core: correct device's shutdown order") > > > places an assumption of supplier<-consumer order on the process of probe. > > > But it turns out to break down the parent <- child order in some scene. > > > E.g in pci, a bridge is enabled by pci core, and behind it, the devices > > > have been probed. Then comes the bridge's module, which enables extra > > > feature(such as hotplug) on this bridge. This will break the > > > parent<-children order and cause failure when "kexec -e" in some scenario. > > > > > > The detailed description of the scenario: > > > An IBM Power9 machine on which, two drivers portdrv_pci and shpchp(a mod) > > > match the PCI_CLASS_BRIDGE_PCI, but neither of them success to probe due > > > to some issue. For this case, the bridge is moved after its children in > > > devices_kset. Then, when "kexec -e", a ata-disk behind the bridge can not > > > write back buffer in flight due to the former shutdown of the bridge which > > > clears the BusMaster bit. > > > > If you revert commit cc27b735ad3a ("PCI/portdrv: Turn off PCIe services > > during shutdown"), does the issue go away? > > Yes, it is gone. Have not figured out why the issue was gone. But I think it just cover some fault. 
re-fetch the boot log of mainline kernel without any patch, and filter out the pci domain 0004 grep "devices_kset: Moving 0004:" newlog.txt [2.114986] devices_kset: Moving 0004:00:00.0 to end of list <--- pcie port drive's probe, but it failed [2.115192] devices_kset: Moving 0004:01:00.0 to end of list [2.115591] devices_kset: Moving 0004:02:02.0 to end of list [2.115923] devices_kset: Moving 0004:02:0a.0 to end of list [2.116141] devices_kset: Moving 0004:02:0b.0 to end of list [2.116358] devices_kset: Moving 0004:02:0c.0 to end of list [3.181860] devices_kset: Moving 0004:03:00.0 to end of list <--- the ata disk controller which sits behind the bridge [ 10.267081] devices_kset: Moving 0004:00:00.0 to end of list <--- shpc_probe() on this bridge, failed too. Hence we have the bridge (parent) after the child in devices_kset. Thanks, Pingfan
Re: [PATCHv3 0/4] drivers/base: bugfix for supplier<-consumer ordering in device_kset
On Tue, Jul 3, 2018 at 10:36 PM Rafael J. Wysocki wrote: > > On Tuesday, July 3, 2018 8:50:38 AM CEST Pingfan Liu wrote: > > commit 52cdbdd49853 ("driver core: correct device's shutdown order") > > places an assumption of supplier<-consumer order on the process of probe. > > But it turns out to break down the parent <- child order in some scene. > > E.g in pci, a bridge is enabled by pci core, and behind it, the devices > > have been probed. Then comes the bridge's module, which enables extra > > feature(such as hotplug) on this bridge. > > So what *exactly* does happen in that case? > I saw the shpc_probe() is called on the bridge, although the probing failed on that bare-metal. But if it success, then it will enable the hotplug feature on the bridge. Thanks, Pingfan
[PATCHv5 4/4] arm64: Add build salt to the vDSO
The vDSO needs to have a unique build id in a similar manner to the kernel and modules. Use the build salt macro. Signed-off-by: Laura Abbott --- v5: I was previously focused on x86 only but since powerpc gave a patch, I figured I would do arm64 since the changes were also fairly simple. --- arch/arm64/kernel/vdso/note.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/vdso/note.S b/arch/arm64/kernel/vdso/note.S index b82c85e5d972..2c429dfd3f45 100644 --- a/arch/arm64/kernel/vdso/note.S +++ b/arch/arm64/kernel/vdso/note.S @@ -22,7 +22,10 @@ #include #include #include +#include ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE ELFNOTE_END + +BUILD_SALT; -- 2.17.1
[PATCHv5 3/4] powerpc: Add build salt to the vDSO
The vDSO needs to have a unique build id in a similar manner to the kernel and modules. Use the build salt macro. Signed-off-by: Laura Abbott --- v5: New approach with the BUILD_SALT macro --- arch/powerpc/kernel/vdso32/note.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/kernel/vdso32/note.S b/arch/powerpc/kernel/vdso32/note.S index d4b5be4f3d5f..ec1d05265c75 100644 --- a/arch/powerpc/kernel/vdso32/note.S +++ b/arch/powerpc/kernel/vdso32/note.S @@ -5,6 +5,7 @@ #include #include +#include #define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type)\ .section name, flags; \ @@ -23,3 +24,5 @@ ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0) .long LINUX_VERSION_CODE ASM_ELF_NOTE_END + +BUILD_SALT; -- 2.17.1
[PATCHv5 2/4] x86: Add build salt to the vDSO
The vDSO needs to have a unique build id in a similar manner to the kernel and modules. Use the build salt macro. Signed-off-by: Laura Abbott --- v5: Switched to using the single line BUILD_SALT macro --- arch/x86/entry/vdso/vdso-note.S | 3 +++ arch/x86/entry/vdso/vdso32/note.S | 3 +++ 2 files changed, 6 insertions(+) diff --git a/arch/x86/entry/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S index 79a071e4357e..79423170118f 100644 --- a/arch/x86/entry/vdso/vdso-note.S +++ b/arch/x86/entry/vdso/vdso-note.S @@ -3,6 +3,7 @@ * Here we can supply some information useful to userland. */ +#include #include #include #include @@ -10,3 +11,5 @@ ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE ELFNOTE_END + +BUILD_SALT diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S index 9fd51f206314..e78047d119f6 100644 --- a/arch/x86/entry/vdso/vdso32/note.S +++ b/arch/x86/entry/vdso/vdso32/note.S @@ -4,6 +4,7 @@ * Here we can supply some information useful to userland. */ +#include #include #include @@ -14,6 +15,8 @@ ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE ELFNOTE_END +BUILD_SALT + #ifdef CONFIG_XEN /* * Add a special note telling glibc's dynamic linker a fake hardware -- 2.17.1
[PATCHv5 1/4] kbuild: Add build salt to the kernel and modules
The build id generated from --build-id can be generated in several different ways, with the default being the sha1 on the output of the linked file. For distributions, it can be useful to make sure this ID is unique, even if the actual file contents don't change. The easiest way to do this is to insert a section with some data. Add an ELF note to both the kernel and module which contains some data based off of a config option. Signed-off-by: Masahiro Yamada Signed-off-by: Laura Abbott --- v5: I used S-o-b here since the majority of the code was written already. Please feel free to change the tag if you think it's not appropriate. I also tweaked this to take an ascii string instead of just a hex value since this makes things much easier on the distribution side. --- include/linux/build-salt.h | 20 init/Kconfig | 9 + init/version.c | 3 +++ scripts/mod/modpost.c | 3 +++ 4 files changed, 35 insertions(+) create mode 100644 include/linux/build-salt.h diff --git a/include/linux/build-salt.h b/include/linux/build-salt.h new file mode 100644 index ..bb007bd05e7a --- /dev/null +++ b/include/linux/build-salt.h @@ -0,0 +1,20 @@ +#ifndef __BUILD_SALT_H +#define __BUILD_SALT_H + +#include + +#define LINUX_ELFNOTE_BUILD_SALT 0x100 + +#ifdef __ASSEMBLER__ + +#define BUILD_SALT \ + ELFNOTE(Linux, LINUX_ELFNOTE_BUILD_SALT, .asciz CONFIG_BUILD_SALT) + +#else + +#define BUILD_SALT \ + ELFNOTE32("Linux", LINUX_ELFNOTE_BUILD_SALT, CONFIG_BUILD_SALT) + +#endif + +#endif /* __BUILD_SALT_H */ diff --git a/init/Kconfig b/init/Kconfig index 041f3a022122..8de789f40db9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -107,6 +107,15 @@ config LOCALVERSION_AUTO which is done within the script "scripts/setlocalversion".) +config BUILD_SALT + string "Build ID Salt" + default "Linux" + help + The build ID is used to link binaries and their debug info. Setting + this option will use the value in the calculation of the build id. 
+ This is mostly useful for distributions which want to ensure the + build is unique between builds. It's safe to leave the default. + config HAVE_KERNEL_GZIP bool diff --git a/init/version.c b/init/version.c index bfb4e3f4955e..ef4012ec4375 100644 --- a/init/version.c +++ b/init/version.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -49,3 +50,5 @@ const char linux_proc_banner[] = "%s version %s" " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ")" " (" LINUX_COMPILER ") %s\n"; + +BUILD_SALT; diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 1663fb19343a..dc6d714e4dcb 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2125,10 +2125,13 @@ static int check_modname_len(struct module *mod) **/ static void add_header(struct buffer *b, struct module *mod) { + buf_printf(b, "#include \n"); buf_printf(b, "#include \n"); buf_printf(b, "#include \n"); buf_printf(b, "#include \n"); buf_printf(b, "\n"); + buf_printf(b, "BUILD_SALT;\n"); + buf_printf(b, "\n"); buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n"); buf_printf(b, "MODULE_INFO(name, KBUILD_MODNAME);\n"); buf_printf(b, "\n"); -- 2.17.1
[PATCHv5 0/4] Salted build ids via ELF notes
Hi, This is v5 of the series to allow unique build ids in the kernel. As a reminder of the context: "" In Fedora, the debug information is packaged separately (foo-debuginfo) and can be installed separately. There's been a long standing issue where only one version of a debuginfo info package can be installed at a time. Mark Wielaard made an effort for Fedora 27 to allow parallel installation of debuginfo (see https://fedoraproject.org/wiki/Changes/ParallelInstallableDebuginfo for more details) Part of the requirement to allow this to work is that build ids are unique between builds. The existing upstream rpm implementation ensures this by re-calculating the build-id using the version and release as a seed. This doesn't work 100% for the kernel because of the vDSO which is its own binary and doesn't get updated. After poking holes in a few of my ideas, there was a discussion with some people from the binutils team about adding --build-id-salt to let ld do the calculation debugedit is doing. There was a counter proposal made to add in the salt while building. The easiest proposal was to add an item in the linker script vs. linking in an object since we need the salt to go in every module as well as the kernel and vmlinux. "" v5 uses the approach suggested by Masahiro Yamada which uses the existing ELF note macro to more easily add the salt (vs previous approaches which tried to adjust via linker section). If arch maintainers are okay, I'd like acks for this so this can go through the kbuild tree. 
Thanks, Laura Laura Abbott (4): kbuild: Add build salt to the kernel and modules x86: Add build salt to the vDSO powerpc: Add build salt to the vDSO arm64: Add build salt to the vDSO arch/arm64/kernel/vdso/note.S | 3 +++ arch/powerpc/kernel/vdso32/note.S | 3 +++ arch/x86/entry/vdso/vdso-note.S | 3 +++ arch/x86/entry/vdso/vdso32/note.S | 3 +++ include/linux/build-salt.h| 20 init/Kconfig | 9 + init/version.c| 3 +++ scripts/mod/modpost.c | 3 +++ 8 files changed, 47 insertions(+) create mode 100644 include/linux/build-salt.h -- 2.17.1
[PATCHv5 0/4] Salted build ids via ELF notes
Hi, This is v5 of the series to allow unique build ids in the kernel. As a reminder of the context: "" In Fedora, the debug information is packaged separately (foo-debuginfo) and can be installed separately. There's been a long standing issue where only one version of a debuginfo info package can be installed at a time. Mark Wielaard made an effort for Fedora 27 to allow parallel installation of debuginfo (see https://fedoraproject.org/wiki/Changes/ParallelInstallableDebuginfo for more details) Part of the requirement to allow this to work is that build ids are unique between builds. The existing upstream rpm implementation ensures this by re-calculating the build-id using the version and release as a seed. This doesn't work 100% for the kernel because of the vDSO which is its own binary and doesn't get updated. After poking holes in a few of my ideas, there was a discussion with some people from the binutils team about adding --build-id-salt to let ld do the calculation debugedit is doing. There was a counter proposal made to add in the salt while building. The easiest proposal was to add an item in the linker script vs. linking in an object since we need the salt to go in every module as well as the kernel and vmlinux. "" v5 uses the approach suggested by Masahiro Yamada which uses the existing ELF note macro to more easily add the salt (vs previous approaches which tried to adjust via linker section). If arch maintainers are okay, I'd like acks for this so this can go through the kbuild tree. 
Thanks, Laura Laura Abbott (4): kbuild: Add build salt to the kernel and modules x86: Add build salt to the vDSO powerpc: Add build salt to the vDSO arm64: Add build salt to the vDSO arch/arm64/kernel/vdso/note.S | 3 +++ arch/powerpc/kernel/vdso32/note.S | 3 +++ arch/x86/entry/vdso/vdso-note.S | 3 +++ arch/x86/entry/vdso/vdso32/note.S | 3 +++ include/linux/build-salt.h| 20 init/Kconfig | 9 + init/version.c| 3 +++ scripts/mod/modpost.c | 3 +++ 8 files changed, 47 insertions(+) create mode 100644 include/linux/build-salt.h -- 2.17.1
Re: [PATCHv3 2/4] drivers/base: utilize device tree info to shutdown devices
Thank you Andy for the heads up. I might need to rebase my work (http://lkml.kernel.org/r/20180629182541.6735-1-pasha.tatas...@oracle.com) based on this change. But, it is possible it is going to be harder to parallelize based on device tree. I will need to think about it. Pavel On Tue, Jul 3, 2018 at 6:59 AM Andy Shevchenko wrote: > > I think Pavel would be interested to see this as well (he is doing > some parallel device shutdown stuff) > > On Tue, Jul 3, 2018 at 9:50 AM, Pingfan Liu wrote: > > commit 52cdbdd49853 ("driver core: correct device's shutdown order") > > places an assumption of supplier<-consumer order on the process of probe. > > But it turns out to break down the parent <- child order in some scene. > > E.g in pci, a bridge is enabled by pci core, and behind it, the devices > > have been probed. Then comes the bridge's module, which enables extra > > feature(such as hotplug) on this bridge. This will break the > > parent<-children order and cause failure when "kexec -e" in some scenario. > > > > The detailed description of the scenario: > > An IBM Power9 machine on which, two drivers portdrv_pci and shpchp(a mod) > > match the PCI_CLASS_BRIDGE_PCI, but neither of them success to probe due > > to some issue. For this case, the bridge is moved after its children in > > devices_kset. Then, when "kexec -e", a ata-disk behind the bridge can not > > write back buffer in flight due to the former shutdown of the bridge which > > clears the BusMaster bit. > > > > It is a little hard to impose both "parent<-child" and "supplier<-consumer" > > order on devices_kset. Take the following scene: > > step0: before a consumer's probing, (note child_a is supplier of consumer_a) > > [ consumer-X, child_a, , child_z] [... consumer_a, ..., consumer_z, > > ...] supplier-X > > ^^ affected range > > ^^ > > step1: when probing, moving consumer-X after supplier-X > > [ child_a, , child_z] [ consumer_a, ..., consumer_z, ...] 
> > supplier-X, consumer-X > > step2: the children of consumer-X should be re-ordered to maintain the seq > > [... consumer_a, ..., consumer_z, ] supplier-X [consumer-X, child_a, > > , child_z] > > step3: the consumer_a should be re-ordered to maintain the seq > > [... consumer_z, ...] supplier-X [ consumer-X, child_a, consumer_a ..., > > child_z] > > > > It requires two nested recursion to drain out all out-of-order item in > > "affected range". To avoid such complicated code, this patch suggests > > to utilize the info in device tree, instead of using the order of > > devices_kset during shutdown. It iterates the device tree, and firstly > > shutdown a device's children and consumers. After this patch, the buggy > > commit is hollow and left to clean. > > > > Cc: Greg Kroah-Hartman > > Cc: Rafael J. Wysocki > > Cc: Grygorii Strashko > > Cc: Christoph Hellwig > > Cc: Bjorn Helgaas > > Cc: Dave Young > > Cc: linux-...@vger.kernel.org > > Cc: linuxppc-dev@lists.ozlabs.org > > Signed-off-by: Pingfan Liu > > --- > > drivers/base/core.c| 48 > > +++- > > include/linux/device.h | 1 + > > 2 files changed, 44 insertions(+), 5 deletions(-) > > > > diff --git a/drivers/base/core.c b/drivers/base/core.c > > index a48868f..684b994 100644 > > --- a/drivers/base/core.c > > +++ b/drivers/base/core.c > > @@ -1446,6 +1446,7 @@ void device_initialize(struct device *dev) > > INIT_LIST_HEAD(&dev->links.consumers); > > INIT_LIST_HEAD(&dev->links.suppliers); > > dev->links.status = DL_DEV_NO_DRIVER; > > + dev->shutdown = false; > > } > > EXPORT_SYMBOL_GPL(device_initialize); > > > > @@ -2811,7 +2812,6 @@ static void __device_shutdown(struct device *dev) > > * lock is to be held > > */ > > parent = get_device(dev->parent); > > - get_device(dev); > > /* > > * Make sure the device is off the kset list, in the > > * event that dev->*->shutdown() doesn't remove it. 
> > @@ -2842,23 +2842,60 @@ static void __device_shutdown(struct device *dev) > > dev_info(dev, "shutdown\n"); > > dev->driver->shutdown(dev); > > } > > - > > + dev->shutdown = true; > > device_unlock(dev); > > if (parent) > > device_unlock(parent); > > > > - put_device(dev); > > put_device(parent); > > spin_lock(&devices_kset->list_lock); > > } > > > > +/* shutdown dev's children and consumer firstly, then itself */ > > +static int device_for_each_child_shutdown(struct device *dev) > > +{ > > + struct klist_iter i; > > + struct device *child; > > + struct device_link *link; > > + > > + /* already shutdown, then skip this sub tree */ > > + if (dev->shutdown) > > + return 0; > > + > > + if (!dev->p) > > + goto check_consumers; > > + > > + /* ther
Re: [PATCH v5 1/4] resource: Move reparent_resources() to kernel/resource.c and make it public
On Tue, Jul 3, 2018 at 5:55 PM, Baoquan He wrote: > On 06/12/18 at 05:24pm, Andy Shevchenko wrote: >> On Tue, Jun 12, 2018 at 5:20 PM, Andy Shevchenko >> wrote: >> > I briefly looked at the code and error codes we have, so, my proposal >> > is one of the following >> >> > - use -ECANCELED (not the best choice for first occurrence here, >> > though I can't find better) >> >> Actually -ENOTSUPP might suit the first case (although the actual >> would be something like -EOVERLAP, which we don't have) > > Sorry for late reply, and many thanks for your great suggestion. > > I am fine to use -ENOTSUPP as the first returned value, and -ECANCELED > for the 2nd one. I have no strong opinion, but I like (slightly better) this approach ^^^ > Or define an enum as you suggested inside the function > or in header file. > > Or use -EBUSY for the first case because existing resource is > overlapping but not fully contained by 'res'; and -EINVAL for > the 2nd case since didn't find any one resources which is contained by > 'res', means we passed in a invalid resource. > > All is fine to me, I can repost with each of them. >> > - use positive integers (or enum), like >> > #define RES_REPARENTED 0 >> > #define RES_OVERLAPPED 1 >> > #define RES_NOCONFLICT 2 -- With Best Regards, Andy Shevchenko
Re: GCC strcmp optimizations causing valgrind uninitialized conditional jumps
On Tue, Jul 03, 2018 at 11:59:14AM -0700, William Kennington wrote: > Is there a bug tracking the issue? https://bugs.kde.org/show_bug.cgi?id=386945 > Also, unless your malloc is > guaranteed to be zeroing out the data or have a strcmp that is writing > doubleworld aligned data to the string, the strcmp implementation is > branching based on data existing after the null terminating character > that may be uninitialized. Both sides of the branch do the right thing > though, and locate the null terminator, throwing away the calculations > done on the uninitialized data. Yes, there is one branch that depends in part on irrelevant data, but that is handled immediately afterwards. > -fno-builtin-strcmp or -mstring-compare-inline-limit=0 do work fine > but we don't control the binaries we are linking against in all cases > and are seeing the issue pop up there. Yeah, nasty. I don't know what to do then (other than fix valgrind, which isn't so easy either though!) Segher
Re: GCC strcmp optimizations causing valgrind uninitialized conditional jumps
Hi! On Tue, Jul 03, 2018 at 11:26:55AM -0700, William Kennington wrote: > I've noticed while trying to do some valgrind testing on code linked > against system libraries that have inlined strcmps that valgrind is > unhappy about branches depending on uninitialized memory. I've read The branches here do *not* depend on uninitialised memory. Valgrind does not realise that however. The valgrind people are aware of this problem. > Any ideas on how to workaround / fix this? Does -fno-builtin-strcmp do the trick? Segher
GCC strcmp optimizations causing valgrind uninitialized conditional jumps
Hi, I've noticed while trying to do some valgrind testing on code linked against system libraries that have inlined strcmps that valgrind is unhappy about branches depending on uninitialized memory. I've read through the generated inline assembly, and it looks like the assembly will always produce the correct result regardless of the intermediate compare using uninitialized data. I'm trying to figure out what to do as next steps since it's non-trivial to suppress this inlined code. Using the following simple test case which generates the optmizations using gcc 7.3.0 and gcc 9 devel. This is reproducible using `gcc -O2 -o test test.c` with the following code. #include #include #include int main() { char *heap_str = malloc(16); strcpy(heap_str, "RandString"); int res = strcmp("RandString", heap_str) == 0; free(heap_str); return res; } A snippet of the inline assembly causing issues is, valgrind correctly identifies a branch taken based on uninitialized memory for the first `bne` in `.L7` since it's reading bytes 8-16 from the 11 byte string: # test.c:9:int res = strcmp("RandString", heap_str) == 0; .loc 1 9 12 view .LVU20 li 8,0 # tmp172, cmpb 3,9,10 # tmp169, tmp133, tmp134 cmpb 8,9,8 # tmp170, tmp133, tmp172 orc 3,8,3#, tmp169, tmp170, tmp169 cntlzd 3,3 # tmp171, tmp169 addi 3,3,8 # tmp171, tmp171, rldcl 9,9,3,56 # tmp171, tmp174, tmp133, rldcl 3,10,3,56 # tmp171, tmp176, tmp134, subf 3,3,9 # tmp132, tmp176, tmp174 b .L2# .L7: addi 9,5,8 # tmp190, tmp126, addi 10,30,8 # tmp191, heap_str, ldbrx 9,0,9 # tmp133, MEM[(void *)"RandString"] ldbrx 10,0,10# tmp134, MEM[(void *)heap_str_5] subf. 3,10,9 # tmp132, tmp134, tmp133 bne 0,.L4# cmpb 10,9,3 # tmp140, tmp133, tmp132 cmpdi 7,10,0 #, tmp141, tmp140 bne 7,.L2 Sample valgrind output looks like: ==63162== Memcheck, a memory error detector ==63162== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al. 
==63162== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info ==63162== Command: ./test ==63162== ==63162== Conditional jump or move depends on uninitialised value(s) ==63162==at 0x15D4: main (test.c:9) ==63162== Uninitialised value was created by a heap allocation ==63162==at 0x4093F00: malloc (in /usr/lib/valgrind/vgpreload_memcheck-ppc64le-linux.so) ==63162==by 0x1513: main (test.c:7) ==63162== ==63162== Conditional jump or move depends on uninitialised value(s) ==63162==at 0x15E0: main (test.c:9) ==63162== Uninitialised value was created by a heap allocation ==63162==at 0x4093F00: malloc (in /usr/lib/valgrind/vgpreload_memcheck-ppc64le-linux.so) ==63162==by 0x1513: main (test.c:7) ==63162== ==63162== Syscall param exit_group(status) contains uninitialised byte(s) ==63162==at 0x41E8C2C: _Exit (_exit.c:31) ==63162==by 0x4132C67: __run_exit_handlers (exit.c:132) ==63162==by 0x4104423: generic_start_main.isra.0 (libc-start.c:344) ==63162==by 0x4104617: (below main) (libc-start.c:116) ==63162== Uninitialised value was created by a heap allocation ==63162==at 0x4093F00: malloc (in /usr/lib/valgrind/vgpreload_memcheck-ppc64le-linux.so) ==63162==by 0x1513: main (test.c:7) ==63162== ==63162== ==63162== HEAP SUMMARY: ==63162== in use at exit: 0 bytes in 0 blocks ==63162== total heap usage: 1 allocs, 1 frees, 16 bytes allocated ==63162== ==63162== All heap blocks were freed -- no leaks are possible ==63162== ==63162== For counts of detected and suppressed errors, rerun with: -v ==63162== ERROR SUMMARY: 3 errors from 3 contexts (suppressed: 0 from 0) Any ideas on how to workaround / fix this? Best, William
Re: [v2 PATCH 2/2] powerpc: Enable CPU_FTR_ASYM_SMT for interleaved big-cores
On Tue, Jul 03, 2018 at 04:33:51PM +0530, Gautham R. Shenoy wrote: > From: "Gautham R. Shenoy" > > A pair of IBM POWER9 SMT4 cores can be fused together to form a big-core > with 8 SMT threads. This can be discovered via the "ibm,thread-groups" > CPU property in the device tree which will indicate which group of > threads that share the L1 cache, translation cache and instruction data > flow. If there are multiple such group of threads, then the core is a > big-core. > > Furthermore, if the thread-ids of the threads of the big-core can be > obtained by interleaving the thread-ids of the thread-groups > (component small core), then such a big-core is called an interleaved > big-core. > > Eg: Threads in the pair of component SMT4 cores of an interleaved > big-core are numbered {0,2,4,6} and {1,3,5,7} respectively. > > The SMT4 cores forming a big-core are more or less independent > units. Thus when multiple tasks are scheduled to run on the fused > core, we get the best performance when the tasks are spread across the > pair of SMT4 cores. > > This patch enables CPU_FTR_ASYM_SMT bit in the cpu-features on > detecting the presence of interleaved big-cores at boot up. This will > will bias the load-balancing of tasks on smaller numbered threads, > which will automatically result in spreading the tasks uniformly > across the associated pair of SMT4 cores. > > Signed-off-by: Gautham R. 
Shenoy > --- > arch/powerpc/kernel/setup-common.c | 67 > +- > 1 file changed, 66 insertions(+), 1 deletion(-) > > diff --git a/arch/powerpc/kernel/setup-common.c > b/arch/powerpc/kernel/setup-common.c > index a78ec66..f63d797 100644 > --- a/arch/powerpc/kernel/setup-common.c > +++ b/arch/powerpc/kernel/setup-common.c > @@ -537,6 +537,56 @@ int get_cpu_thread_group_start(int cpu, struct > thread_groups *tg) > return -1; > } > > +/* > + * check_interleaved_big_core - Checks if the thread group tg > + * corresponds to a big-core whose threads are interleavings of the > + * threads of the component small cores. > + * > + * @tg: A thread-group struct for the core. > + * > + * Returns true if the core is a interleaved big-core. > + * Returns false otherwise. > + */ > +static inline bool check_interleaved_big_core(struct thread_groups *tg) > +{ > + int nr_groups; > + int threads_per_group; > + int cur_cpu, next_cpu, i, j; > + > + nr_groups = tg->nr_groups; > + threads_per_group = tg->threads_per_group; > + > + if (tg->property != 1) > + return false; > + > + if (nr_groups < 2 || threads_per_group < 2) > + return false; > + > + /* > + * In case of an interleaved big-core, the thread-ids of the > + * big-core can be obtained by interleaving the the thread-ids > + * of the component small > + * > + * Eg: On a 8-thread big-core with two SMT4 small cores, the > + * threads of the two component small cores will be > + * {0, 2, 4, 6} and {1, 3, 5, 7}. 
> + */ > + for (i = 0; i < nr_groups; i++) { > + int group_start = i * threads_per_group; > + > + for (j = 0; j < threads_per_group - 1; j++) { > + int cur_idx = group_start + j; > + > + cur_cpu = tg->thread_list[cur_idx]; > + next_cpu = tg->thread_list[cur_idx + 1]; > + if (next_cpu != cur_cpu + nr_groups) > + return false; > + } > + } > + > + return true; > +} > + > /** > * setup_cpu_maps - initialize the following cpu maps: > * cpu_possible_mask > @@ -560,6 +610,7 @@ void __init smp_setup_cpu_maps(void) > struct device_node *dn; > int cpu = 0; > int nthreads = 1; > + bool has_interleaved_big_core = true; > > DBG("smp_setup_cpu_maps()\n"); > > @@ -613,6 +664,12 @@ void __init smp_setup_cpu_maps(void) > if (parse_thread_groups(dn, &tg) || > tg.nr_groups < 1 || tg.property != 1) { > has_big_cores = false; > + has_interleaved_big_core = false; > + } > + > + if (has_interleaved_big_core) { > + has_interleaved_big_core = > + check_interleaved_big_core(&tg); > } > > if (cpu >= nr_cpu_ids) { > @@ -669,7 +726,15 @@ void __init smp_setup_cpu_maps(void) > vdso_data->processorCount = num_present_cpus(); > #endif /* CONFIG_PPC64 */ > > -/* Initialize CPU <=> thread mapping/ > + if (has_interleaved_big_core) { > + int key = __builtin_ctzl(CPU_FTR_ASYM_SMT); > + > + cur_cpu_spec->cpu_features |= CPU_FTR_ASYM_SMT; > + static_branch_enable(&cpu_feature_keys[key]); > + pr_info("Detected interleaved big-cores\n"); > + } Shouldn't we use cpu_has_feature(CPU_FTR_ASYM_SMT) before setting it? > + > + /* Initi
Re: [v2 PATCH 1/2] powerpc: Detect the presence of big-cores via "ibm,thread-groups"
On Tue, Jul 03, 2018 at 04:33:50PM +0530, Gautham R. Shenoy wrote: > From: "Gautham R. Shenoy" > > On IBM POWER9, the device tree exposes a property array identifed by > "ibm,thread-groups" which will indicate which groups of threads share a > particular set of resources. > > As of today we only have one form of grouping identifying the group of > threads in the core that share the L1 cache, translation cache and > instruction data flow. > > This patch defines the helper function to parse the contents of > "ibm,thread-groups" and a new structure to contain the parsed output. > > The patch also creates the sysfs file named "small_core_siblings" that > returns the physical ids of the threads in the core that share the L1 > cache, translation cache and instruction data flow. > > Signed-off-by: Gautham R. Shenoy > --- > Documentation/ABI/testing/sysfs-devices-system-cpu | 8 ++ > arch/powerpc/include/asm/cputhreads.h | 22 + > arch/powerpc/kernel/setup-common.c | 110 > + > arch/powerpc/kernel/sysfs.c| 35 +++ > 4 files changed, 175 insertions(+) > > diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu > b/Documentation/ABI/testing/sysfs-devices-system-cpu > index 9c5e7732..53a823a 100644 > --- a/Documentation/ABI/testing/sysfs-devices-system-cpu > +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu > @@ -487,3 +487,11 @@ Description: Information about CPU vulnerabilities > "Not affected"CPU is not affected by the vulnerability > "Vulnerable" CPU is affected and no mitigation in effect > "Mitigation: $M" CPU is affected and mitigation $M is in effect > + > +What:/sys/devices/system/cpu/cpu[0-9]+/small_core_sibings s/small_core_sibings/small_core_siblings By the way, big_core_siblings was mentioned in the introductory email. > +Date:03-Jul-2018 > +KernelVersion: v4.18.0 > +Contact: Gautham R. Shenoy > +Description: List of Physical ids of CPUs which share the the L1 cache, > + translation cache and instruction data-flow with this CPU. 
> +Values: Comma separated list of decimal integers. > diff --git a/arch/powerpc/include/asm/cputhreads.h > b/arch/powerpc/include/asm/cputhreads.h > index d71a909..33226d7 100644 > --- a/arch/powerpc/include/asm/cputhreads.h > +++ b/arch/powerpc/include/asm/cputhreads.h > @@ -23,11 +23,13 @@ > extern int threads_per_core; > extern int threads_per_subcore; > extern int threads_shift; > +extern bool has_big_cores; > extern cpumask_t threads_core_mask; > #else > #define threads_per_core 1 > #define threads_per_subcore 1 > #define threads_shift0 > +#define has_big_cores0 > #define threads_core_mask(*get_cpu_mask(0)) > #endif > > @@ -69,12 +71,32 @@ static inline cpumask_t cpu_online_cores_map(void) > return cpu_thread_mask_to_cores(cpu_online_mask); > } > > +#define MAX_THREAD_LIST_SIZE 8 > +struct thread_groups { > + unsigned int property; > + unsigned int nr_groups; > + unsigned int threads_per_group; > + unsigned int thread_list[MAX_THREAD_LIST_SIZE]; > +}; > + > #ifdef CONFIG_SMP > int cpu_core_index_of_thread(int cpu); > int cpu_first_thread_of_core(int core); > +int parse_thread_groups(struct device_node *dn, struct thread_groups *tg); > +int get_cpu_thread_group_start(int cpu, struct thread_groups *tg); > #else > static inline int cpu_core_index_of_thread(int cpu) { return cpu; } > static inline int cpu_first_thread_of_core(int core) { return core; } > +static inline int parse_thread_groups(struct device_node *dn, > + struct thread_groups *tg) > +{ > + return -ENODATA; > +} > + > +static inline int get_cpu_thread_group_start(int cpu, struct thread_groups > *tg) > +{ > + return -1; > +} > #endif > > static inline int cpu_thread_in_core(int cpu) > diff --git a/arch/powerpc/kernel/setup-common.c > b/arch/powerpc/kernel/setup-common.c > index 40b44bb..a78ec66 100644 > --- a/arch/powerpc/kernel/setup-common.c > +++ b/arch/powerpc/kernel/setup-common.c > @@ -402,10 +402,12 @@ void __init check_for_initrd(void) > #ifdef CONFIG_SMP > > int threads_per_core, 
threads_per_subcore, threads_shift; > +bool has_big_cores = true; > cpumask_t threads_core_mask; > EXPORT_SYMBOL_GPL(threads_per_core); > EXPORT_SYMBOL_GPL(threads_per_subcore); > EXPORT_SYMBOL_GPL(threads_shift); > +EXPORT_SYMBOL_GPL(has_big_cores); > EXPORT_SYMBOL_GPL(threads_core_mask); > > static void __init cpu_init_thread_core_maps(int tpc) > @@ -433,6 +435,108 @@ static void __init cpu_init_thread_core_maps(int tpc) > > u32 *cpu_to_phys_id = NULL; > > +/* > + * parse_thread_groups: Parses the "ibm,thread-groups" device tree > + * property for the CPU device node dn and stores > + * the parsed output in
Re: Oops in kmem_cache_free() via bioset_exit() (was Re: [next-20180601][nvme][ppc] Kernel Oops is triggered when creating lvm snapshots on nvme disks)
On Fri, 2018-06-29 at 00:42 +1000, Michael Ellerman wrote: > Kent, Jens, > > This looks like it might be related to the recent bioset changes? > > cheers > > Abdul Haleem writes: > > On Tue, 2018-06-26 at 23:36 +1000, Michael Ellerman wrote: > >> Abdul Haleem writes: > ... > > I was able to reproduce again with slub_debug=FZP and DEBUG_INFO enabled > > on 4.17.0-rc7-next-20180601, but not much traces other than the Oops stack > > trace > > Are you still testing on that revision? It's nearly a month old. > > Please try to reproduce on mainline or today's linux-next. Problem is not reproducible on 4.18.0-rc3 mainline and today's next kernel -- Regard's Abdul Haleem IBM Linux Technology Centre
RE: [PATCH] QE GPIO: Add qe_gpio_set_multiple
> -Original Message- > From: York Sun > Sent: Tuesday, July 3, 2018 10:27 AM > To: jo...@infinera.com ; Leo Li > > Cc: linuxppc-dev@lists.ozlabs.org; m...@ellerman.id.au; Qiang Zhao > > Subject: Re: [PATCH] QE GPIO: Add qe_gpio_set_multiple > > +Leo > > On 07/03/2018 03:30 AM, Joakim Tjernlund wrote: > > On Tue, 2018-06-26 at 23:41 +1000, Michael Ellerman wrote: > >> > >> Joakim Tjernlund writes: > >>> On Thu, 2018-06-21 at 02:38 +, Qiang Zhao wrote: > On 06/19/2018 09:22 AM, Joakim Tjernlund wrote: > -Original Message- > From: Linuxppc-dev [mailto:linuxppc-dev- > bounces+qiang.zhao=nxp@lists.ozlabs.org] On Behalf Of Joakim > Tjernlund > Sent: 2018年6月20日 0:22 > To: York Sun ; linuxppc-dev d...@lists.ozlabs.org> > Subject: [PATCH] QE GPIO: Add qe_gpio_set_multiple > > This cousin to gpio-mpc8xxx was lacking a multiple pins method, add > one. > > Signed-off-by: Joakim Tjernlund > --- > drivers/soc/fsl/qe/gpio.c | 28 > 1 file changed, 28 insertions(+) > >> > >> ... > static int qe_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) { > struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); @@ - > 298,6 +325,7 @@ static int __init qe_add_gpiochips(void) > gc->direction_output = qe_gpio_dir_out; > gc->get = qe_gpio_get; > gc->set = qe_gpio_set; > + gc->set_multiple = qe_gpio_set_multiple; > > ret = of_mm_gpiochip_add_data(np, mm_gc, qe_gc); > if (ret) > > Reviewed-by: Qiang Zhao > > >>> > >>> Who picks up this patch ? Is it queued somewhere already? > >> > >> Not me. > > > > York? You seem to be the only one left. > > > > I am not a Linux maintainer. Even I want to, I can't merge this patch. > > Leo, who can merge this patch and request a pull? Since it falls under the driver/soc/fsl/ folder. I can take it. Regards, Leo
Re: [PATCH 7/7 v5] arm64: dts: ls208xa: comply with the iommu map binding for fsl_mc
On 20/05/18 14:49, Nipun Gupta wrote: fsl-mc bus support the new iommu-map property. Comply to this binding for fsl_mc bus. Signed-off-by: Nipun Gupta Reviewed-by: Laurentiu Tudor --- arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi index 137ef4d..6010505 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi @@ -184,6 +184,7 @@ #address-cells = <2>; #size-cells = <2>; ranges; + dma-ranges = <0x0 0x0 0x0 0x0 0x1 0x>; clockgen: clocking@130 { compatible = "fsl,ls2080a-clockgen"; @@ -357,6 +358,8 @@ reg = <0x0008 0x0c00 0 0x40>, /* MC portal base */ <0x 0x0834 0 0x4>; /* MC control reg */ msi-parent = <&its>; + iommu-map = <0 &smmu 0 0>;/* This is fixed-up by u-boot */ + dma-coherent; #address-cells = <3>; #size-cells = <1>; @@ -460,6 +463,8 @@ compatible = "arm,mmu-500"; reg = <0 0x500 0 0x80>; #global-interrupts = <12>; + #iommu-cells = <1>; + stream-match-mask = <0x7C00>; interrupts = <0 13 4>, /* global secure fault */ <0 14 4>, /* combined secure interrupt */ <0 15 4>, /* global non-secure fault */ @@ -502,7 +507,6 @@ <0 204 4>, <0 205 4>, <0 206 4>, <0 207 4>, <0 208 4>, <0 209 4>; - mmu-masters = <&fsl_mc 0x300 0>; Since we're in here, is the SMMU itself also coherent? If it is, you probably want to say so and avoid the overhead of pointlessly cleaning cache lines on every page table update. Robin. }; dspi: dspi@210 {
Re: [PATCH 6/7 v5] bus: fsl-mc: set coherent dma mask for devices on fsl-mc bus
On 20/05/18 14:49, Nipun Gupta wrote: of_dma_configure() API expects coherent_dma_mask to be correctly set in the devices. This patch does the needful. Reviewed-by: Robin Murphy Signed-off-by: Nipun Gupta --- drivers/bus/fsl-mc/fsl-mc-bus.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c index fa43c7d..624828b 100644 --- a/drivers/bus/fsl-mc/fsl-mc-bus.c +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c @@ -627,6 +627,7 @@ int fsl_mc_device_add(struct fsl_mc_obj_desc *obj_desc, mc_dev->icid = parent_mc_dev->icid; mc_dev->dma_mask = FSL_MC_DEFAULT_DMA_MASK; mc_dev->dev.dma_mask = &mc_dev->dma_mask; + mc_dev->dev.coherent_dma_mask = mc_dev->dma_mask; dev_set_msi_domain(&mc_dev->dev, dev_get_msi_domain(&parent_mc_dev->dev)); }
Re: [PATCH 5/7 v5] bus: fsl-mc: support dma configure for devices on fsl-mc bus
On 20/05/18 14:49, Nipun Gupta wrote: This patch adds support of dma configuration for devices on fsl-mc bus using 'dma_configure' callback for busses. Also, directly calling arch_setup_dma_ops is removed from the fsl-mc bus. Looks like this is the final arch_setup_dma_ops offender, yay! Signed-off-by: Nipun Gupta Reviewed-by: Laurentiu Tudor --- drivers/bus/fsl-mc/fsl-mc-bus.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c index 5d8266c..fa43c7d 100644 --- a/drivers/bus/fsl-mc/fsl-mc-bus.c +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c @@ -127,6 +127,16 @@ static int fsl_mc_bus_uevent(struct device *dev, struct kobj_uevent_env *env) return 0; } +static int fsl_mc_dma_configure(struct device *dev) +{ + struct device *dma_dev = dev; + + while (dev_is_fsl_mc(dma_dev)) + dma_dev = dma_dev->parent; + + return of_dma_configure(dev, dma_dev->of_node, 0); +} + static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -148,6 +158,7 @@ struct bus_type fsl_mc_bus_type = { .name = "fsl-mc", .match = fsl_mc_bus_match, .uevent = fsl_mc_bus_uevent, + .dma_configure = fsl_mc_dma_configure, .dev_groups = fsl_mc_dev_groups, }; EXPORT_SYMBOL_GPL(fsl_mc_bus_type); @@ -633,10 +644,6 @@ int fsl_mc_device_add(struct fsl_mc_obj_desc *obj_desc, goto error_cleanup_dev; } - /* Objects are coherent, unless 'no shareability' flag set. */ - if (!(obj_desc->flags & FSL_MC_OBJ_FLAG_NO_MEM_SHAREABILITY)) Although it seems we do end up without any handling of this "non-coherent object behind coherent MC" case, and I'm not sure how easily that could be accommodated by generic code... :/ How important is the quirk? Robin. - arch_setup_dma_ops(&mc_dev->dev, 0, 0, NULL, true); - /* * The device-specific probe callback will get invoked by device_add() */
Re: [PATCH 4/7 v5] iommu: arm-smmu: Add support for the fsl-mc bus
On 20/05/18 14:49, Nipun Gupta wrote: Implement bus specific support for the fsl-mc bus including registering arm_smmu_ops and bus specific device add operations. Signed-off-by: Nipun Gupta --- drivers/iommu/arm-smmu.c | 7 +++ drivers/iommu/iommu.c| 21 + include/linux/fsl/mc.h | 8 include/linux/iommu.h| 2 ++ 4 files changed, 38 insertions(+) diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 69e7c60..e1d5090 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -52,6 +52,7 @@ #include #include +#include #include "io-pgtable.h" #include "arm-smmu-regs.h" @@ -1459,6 +1460,8 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) if (dev_is_pci(dev)) group = pci_device_group(dev); + else if (dev_is_fsl_mc(dev)) + group = fsl_mc_device_group(dev); else group = generic_device_group(dev); @@ -2037,6 +2040,10 @@ static void arm_smmu_bus_init(void) bus_set_iommu(&pci_bus_type, &arm_smmu_ops); } #endif +#ifdef CONFIG_FSL_MC_BUS + if (!iommu_present(&fsl_mc_bus_type)) + bus_set_iommu(&fsl_mc_bus_type, &arm_smmu_ops); +#endif } static int arm_smmu_device_probe(struct platform_device *pdev) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d2aa2320..6d4ce35 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -32,6 +32,7 @@ #include #include #include +#include #include static struct kset *iommu_group_kset; @@ -987,6 +988,26 @@ struct iommu_group *pci_device_group(struct device *dev) return iommu_group_alloc(); } +/* Get the IOMMU group for device on fsl-mc bus */ +struct iommu_group *fsl_mc_device_group(struct device *dev) +{ + struct device *cont_dev = fsl_mc_cont_dev(dev); + struct iommu_group *group; + + /* Container device is responsible for creating the iommu group */ + if (fsl_mc_is_cont_dev(dev)) { Why duplicate what fsl_mc_cont_dev() has already figured out? 
AFAICS the overall operation here boils down to something like: cont_dev = fsl_mc_cont_dev(dev); group = iommu_group_get(cont_dev); if (!group) group = iommu_group_alloc(); return group; + group = iommu_group_alloc(); + if (IS_ERR(group)) + return NULL; iommu_group_get_for_dev() expects a PTR_ERR, so I don't think munging the return here is the right thing to do. + } else { + get_device(cont_dev); If races are a concern, isn't this a bit late? Maybe in that case you want {get,put}_fsl_mc_cont_dev() routines instead of the simple macro below. But on the other hand if dev already has cont_dev as its parent, wouldn't the reference taken in device_add() be sufficient to prevent it from vanishing unexpectedly in this timescale? Robin. + group = iommu_group_get(cont_dev); + put_device(cont_dev); + } + + return group; +} + /** * iommu_group_get_for_dev - Find or create the IOMMU group for a device * @dev: target device diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index f27cb14..dddaca1 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -351,6 +351,14 @@ struct fsl_mc_io { #define dev_is_fsl_mc(_dev) (0) #endif +/* Macro to check if a device is a container device */ +#define fsl_mc_is_cont_dev(_dev) (to_fsl_mc_device(_dev)->flags & \ + FSL_MC_IS_DPRC) + +/* Macro to get the container device of a MC device */ +#define fsl_mc_cont_dev(_dev) (fsl_mc_is_cont_dev(_dev) ? \ + (_dev) : (_dev)->parent) + /* * module_fsl_mc_driver() - Helper macro for drivers that don't do * anything special in module init/exit. 
This eliminates a lot of diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 19938ee..2981200 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -389,6 +389,8 @@ static inline size_t iommu_map_sg(struct iommu_domain *domain, extern struct iommu_group *pci_device_group(struct device *dev); /* Generic device grouping function */ extern struct iommu_group *generic_device_group(struct device *dev); +/* FSL-MC device grouping function */ +struct iommu_group *fsl_mc_device_group(struct device *dev); /** * struct iommu_fwspec - per-device IOMMU instance data
Re: [PATCH 3/7 v5] iommu: support iommu configuration for fsl-mc devices
On 20/05/18 14:49, Nipun Gupta wrote: With of_pci_map_rid available for all the busses, use the function for configuration of devices on fsl-mc bus FWIW I had a quick hack at factoring out the commonality with of_pci_iommu_init(), at which point I reckon this change is easier to follow as-is for the moment, so: Reviewed-by: Robin Murphy Signed-off-by: Nipun Gupta --- drivers/iommu/of_iommu.c | 20 1 file changed, 20 insertions(+) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 811e160..284474d 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -24,6 +24,7 @@ #include #include #include +#include #define NO_IOMMU 1 @@ -159,6 +160,23 @@ static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data) return err; } +static int of_fsl_mc_iommu_init(struct fsl_mc_device *mc_dev, + struct device_node *master_np) +{ + struct of_phandle_args iommu_spec = { .args_count = 1 }; + int err; + + err = of_map_rid(master_np, mc_dev->icid, "iommu-map", +"iommu-map-mask", &iommu_spec.np, +iommu_spec.args); + if (err) + return err == -ENODEV ? NO_IOMMU : err; + + err = of_iommu_xlate(&mc_dev->dev, &iommu_spec); + of_node_put(iommu_spec.np); + return err; +} + const struct iommu_ops *of_iommu_configure(struct device *dev, struct device_node *master_np) { @@ -190,6 +208,8 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, err = pci_for_each_dma_alias(to_pci_dev(dev), of_pci_iommu_init, &info); + } else if (dev_is_fsl_mc(dev)) { + err = of_fsl_mc_iommu_init(to_fsl_mc_device(dev), master_np); } else { struct of_phandle_args iommu_spec; int idx = 0;
Re: [PATCH] QE GPIO: Add qe_gpio_set_multiple
+Leo On 07/03/2018 03:30 AM, Joakim Tjernlund wrote: > On Tue, 2018-06-26 at 23:41 +1000, Michael Ellerman wrote: >> >> Joakim Tjernlund writes: >>> On Thu, 2018-06-21 at 02:38 +, Qiang Zhao wrote: On 06/19/2018 09:22 AM, Joakim Tjernlund wrote: -Original Message- From: Linuxppc-dev [mailto:linuxppc-dev-bounces+qiang.zhao=nxp@lists.ozlabs.org] On Behalf Of Joakim Tjernlund Sent: 2018年6月20日 0:22 To: York Sun ; linuxppc-dev Subject: [PATCH] QE GPIO: Add qe_gpio_set_multiple This cousin to gpio-mpc8xxx was lacking a multiple pins method, add one. Signed-off-by: Joakim Tjernlund --- drivers/soc/fsl/qe/gpio.c | 28 1 file changed, 28 insertions(+) >> >> ... static int qe_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) { struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); @@ -298,6 +325,7 @@ static int __init qe_add_gpiochips(void) gc->direction_output = qe_gpio_dir_out; gc->get = qe_gpio_get; gc->set = qe_gpio_set; + gc->set_multiple = qe_gpio_set_multiple; ret = of_mm_gpiochip_add_data(np, mm_gc, qe_gc); if (ret) Reviewed-by: Qiang Zhao >>> >>> Who picks up this patch ? Is it queued somewhere already? >> >> Not me. > > York? You seem to be the only one left. > I am not a Linux maintainer. Even I want to, I can't merge this patch. Leo, who can merge this patch and request a pull? York
Re: [PATCH v5 1/4] resource: Move reparent_resources() to kernel/resource.c and make it public
Hi Andy, On 06/12/18 at 05:24pm, Andy Shevchenko wrote: > On Tue, Jun 12, 2018 at 5:20 PM, Andy Shevchenko > wrote: > >> Hmm, I just copied it from arch/powerpc/kernel/pci-common.c. The > >> function interface expects an integer returned value, not sure what a > >> real error codes look like, could you give more hints? Will change > >> accordingly. > > > > I briefly looked at the code and error codes we have, so, my proposal > > is one of the following > > > - use -ECANCELED (not the best choice for first occurrence here, > > though I can't find better) > > Actually -ENOTSUPP might suit the first case (although the actual > would be something like -EOVERLAP, which we don't have) Sorry for late reply, and many thanks for your great suggestion. I am fine to use -ENOTSUPP as the first returned value, and -ECANCELED for the 2nd one. Or define an enum as you suggested inside the function or in header file. Or use -EBUSY for the first case because existing resource is overlapping but not fully contained by 'res'; and -EINVAL for the 2nd case since didn't find any one resources which is contained by 'res', means we passed in a invalid resource. All is fine to me, I can repost with each of them. Thanks Baoquan > > > - use positive integers (or enum), like > > #define RES_REPARENTED 0 > > #define RES_OVERLAPPED 1 > > #define RES_NOCONFLICT 2 > > > > > >>> > + if (firstpp == NULL) > >>> > + firstpp = pp; > >>> > + } > >>> > >>> > + if (firstpp == NULL) > >>> > + return -1; /* didn't find any conflicting entries? > >>> > */ > >>> > >>> Ditto. > > > > Ditto. > > > >>> > >>> > +} > >>> > +EXPORT_SYMBOL(reparent_resources); > > > > -- > > With Best Regards, > > Andy Shevchenko > > > > -- > With Best Regards, > Andy Shevchenko
Re: [PATCH 2/7 v5] iommu: of: make of_pci_map_rid() available for other devices too
On 20/05/18 14:49, Nipun Gupta wrote: iommu-map property is also used by devices with fsl-mc. This patch moves the of_pci_map_rid to generic location, so that it can be used by other busses too. 'of_pci_map_rid' is renamed here to 'of_map_rid' and there is no functional change done in the API. Reviewed-by: Robin Murphy Signed-off-by: Nipun Gupta Reviewed-by: Rob Herring Acked-by: Bjorn Helgaas --- drivers/iommu/of_iommu.c | 5 +-- drivers/of/base.c| 102 +++ drivers/of/irq.c | 5 +-- drivers/pci/of.c | 101 -- include/linux/of.h | 11 + include/linux/of_pci.h | 10 - 6 files changed, 117 insertions(+), 117 deletions(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 5c36a8b..811e160 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -149,9 +149,8 @@ static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data) struct of_phandle_args iommu_spec = { .args_count = 1 }; int err; - err = of_pci_map_rid(info->np, alias, "iommu-map", -"iommu-map-mask", &iommu_spec.np, -iommu_spec.args); + err = of_map_rid(info->np, alias, "iommu-map", "iommu-map-mask", +&iommu_spec.np, iommu_spec.args); if (err) return err == -ENODEV ? NO_IOMMU : err; diff --git a/drivers/of/base.c b/drivers/of/base.c index 848f549..c7aac81 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1995,3 +1995,105 @@ int of_find_last_cache_level(unsigned int cpu) return cache_level; } + +/** + * of_map_rid - Translate a requester ID through a downstream mapping. + * @np: root complex device node. + * @rid: device requester ID to map. + * @map_name: property name of the map to use. + * @map_mask_name: optional property name of the mask to use. + * @target: optional pointer to a target device node. + * @id_out: optional pointer to receive the translated ID. 
+ * + * Given a device requester ID, look up the appropriate implementation-defined + * platform ID and/or the target device which receives transactions on that + * ID, as per the "iommu-map" and "msi-map" bindings. Either of @target or + * @id_out may be NULL if only the other is required. If @target points to + * a non-NULL device node pointer, only entries targeting that node will be + * matched; if it points to a NULL value, it will receive the device node of + * the first matching target phandle, with a reference held. + * + * Return: 0 on success or a standard error code on failure. + */ +int of_map_rid(struct device_node *np, u32 rid, + const char *map_name, const char *map_mask_name, + struct device_node **target, u32 *id_out) +{ + u32 map_mask, masked_rid; + int map_len; + const __be32 *map = NULL; + + if (!np || !map_name || (!target && !id_out)) + return -EINVAL; + + map = of_get_property(np, map_name, &map_len); + if (!map) { + if (target) + return -ENODEV; + /* Otherwise, no map implies no translation */ + *id_out = rid; + return 0; + } + + if (!map_len || map_len % (4 * sizeof(*map))) { + pr_err("%pOF: Error: Bad %s length: %d\n", np, + map_name, map_len); + return -EINVAL; + } + + /* The default is to select all bits. */ + map_mask = 0x; + + /* +* Can be overridden by "{iommu,msi}-map-mask" property. +* If of_property_read_u32() fails, the default is used. 
+*/ + if (map_mask_name) + of_property_read_u32(np, map_mask_name, &map_mask); + + masked_rid = map_mask & rid; + for ( ; map_len > 0; map_len -= 4 * sizeof(*map), map += 4) { + struct device_node *phandle_node; + u32 rid_base = be32_to_cpup(map + 0); + u32 phandle = be32_to_cpup(map + 1); + u32 out_base = be32_to_cpup(map + 2); + u32 rid_len = be32_to_cpup(map + 3); + + if (rid_base & ~map_mask) { + pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) ignores rid-base (0x%x)\n", + np, map_name, map_name, + map_mask, rid_base); + return -EFAULT; + } + + if (masked_rid < rid_base || masked_rid >= rid_base + rid_len) + continue; + + phandle_node = of_find_node_by_phandle(phandle); + if (!phandle_node) + return -ENODEV; + + if (target) { + if (*target) + of_node_put(phandle_node); + else +
Re: [PATCH 1/7 v5] Docs: dt: add fsl-mc iommu-map device-tree binding
On 20/05/18 14:49, Nipun Gupta wrote: The existing IOMMU bindings cannot be used to specify the relationship between fsl-mc devices and IOMMUs. This patch adds a generic binding for mapping fsl-mc devices to IOMMUs, using iommu-map property. Signed-off-by: Nipun Gupta Reviewed-by: Rob Herring --- .../devicetree/bindings/misc/fsl,qoriq-mc.txt | 39 ++ 1 file changed, 39 insertions(+) diff --git a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt index 6611a7c..8cbed4f 100644 --- a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt +++ b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt @@ -9,6 +9,25 @@ blocks that can be used to create functional hardware objects/devices such as network interfaces, crypto accelerator instances, L2 switches, etc. +For an overview of the DPAA2 architecture and fsl-mc bus see: +drivers/staging/fsl-mc/README.txt Nit: Looks like that's Documentation/networking/dpaa2/overview.rst now. + +As described in the above overview, all DPAA2 objects in a DPRC share the +same hardware "isolation context" and a 10-bit value called an ICID +(isolation context id) is expressed by the hardware to identify +the requester. + +The generic 'iommus' property is insufficient to describe the relationship +between ICIDs and IOMMUs, so an iommu-map property is used to define +the set of possible ICIDs under a root DPRC and how they map to +an IOMMU. + +For generic IOMMU bindings, see +Documentation/devicetree/bindings/iommu/iommu.txt. + +For arm-smmu binding, see: +Documentation/devicetree/bindings/iommu/arm,smmu.txt. + Required properties: - compatible @@ -88,14 +107,34 @@ Sub-nodes: Value type: Definition: Specifies the phandle to the PHY device node associated with the this dpmac. +Optional properties: + +- iommu-map: Maps an ICID to an IOMMU and associated iommu-specifier + data. + + The property is an arbitrary number of tuples of + (icid-base,iommu,iommu-base,length). 
+ + Any ICID i in the interval [icid-base, icid-base + length) is + associated with the listed IOMMU, with the iommu-specifier + (i - icid-base + iommu-base). Example: +smmu: iommu@500 { + compatible = "arm,mmu-500"; + #iommu-cells = <2>; This should be 1 if stream-match-mask is present. Bad example is bad :) Robin. + stream-match-mask = <0x7C00>; + ... +}; + fsl_mc: fsl-mc@80c00 { compatible = "fsl,qoriq-mc"; reg = <0x0008 0x0c00 0 0x40>,/* MC portal base */ <0x 0x0834 0 0x4>; /* MC control reg */ msi-parent = <&its>; +/* define map for ICIDs 23-64 */ +iommu-map = <23 &smmu 23 41>; #address-cells = <3>; #size-cells = <1>;
Re: [PATCHv3 0/4] drivers/base: bugfix for supplier<-consumer ordering in device_kset
On Tuesday, July 3, 2018 8:50:38 AM CEST Pingfan Liu wrote: > commit 52cdbdd49853 ("driver core: correct device's shutdown order") > places an assumption of supplier<-consumer order on the process of probe. > But it turns out to break down the parent <- child order in some scene. > E.g in pci, a bridge is enabled by pci core, and behind it, the devices > have been probed. Then comes the bridge's module, which enables extra > feature(such as hotplug) on this bridge. So what *exactly* does happen in that case?
Re: [PATCHv3 3/4] drivers/base: clean up the usage of devices_kset_move_last()
On Tuesday, July 3, 2018 8:50:41 AM CEST Pingfan Liu wrote: > Clean up the referring to the code in commit 52cdbdd49853 ("driver core: > correct device's shutdown order"). So later we can revert it safely. > > Cc: Greg Kroah-Hartman > Cc: Rafael J. Wysocki > Cc: Grygorii Strashko > Cc: Christoph Hellwig > Cc: Bjorn Helgaas > Cc: Dave Young > Cc: linux-...@vger.kernel.org > Cc: linuxppc-dev@lists.ozlabs.org > Signed-off-by: Pingfan Liu > --- > drivers/base/core.c | 7 --- > 1 file changed, 7 deletions(-) > > diff --git a/drivers/base/core.c b/drivers/base/core.c > index 684b994..db3deb8 100644 > --- a/drivers/base/core.c > +++ b/drivers/base/core.c > @@ -127,13 +127,6 @@ static int device_reorder_to_tail(struct device *dev, > void *not_used) > { > struct device_link *link; > > - /* > - * Devices that have not been registered yet will be put to the ends > - * of the lists during the registration, so skip them here. > - */ > - if (device_is_registered(dev)) > - devices_kset_move_last(dev); > - > if (device_pm_initialized(dev)) > device_pm_move_last(dev); You can't do this. If you do it, that will break power management in some situations. Thanks, Rafael
How is this possible - Register r30 contains 0xc2236400 instead of 0xc6236400
Kernel Oops at 0xc0334d5c for reading at address 0xc2236450 which corresponds to r30 + 80 But r30 should contain what's at r3 + 16 that is at 0xc619ec10 so r30 should be c6236400 as shown below (print_hex_dump(regs->gpr[3]) added at end of __die() ) So how can r30 contain 0xc2236400 instead ? And this is not random, it happens at most if not every startup. c0334d44 : c0334d44: 7c 08 02 a6 mflr r0 c0334d48: 94 21 ff f0 stwu r1,-16(r1) c0334d4c: bf c1 00 08 stmw r30,8(r1) c0334d50: 90 01 00 14 stw r0,20(r1) c0334d54: 83 c3 00 10 lwz r30,16(r3) c0334d58: 81 23 00 a8 lwz r9,168(r3) c0334d5c: 81 5e 00 50 lwz r10,80(r30) [ 152.288237] Unable to handle kernel paging request for data at address 0xc2236450 [ 152.295444] Faulting instruction address: 0xc0334d5c [ 152.300369] Oops: Kernel access of bad area, sig: 11 [#1] [ 152.305665] BE PREEMPT DEBUG_PAGEALLOC CMPC885 [ 152.313630] CPU: 0 PID: 269 Comm: in:imuxsock Not tainted 4.14.52-00025-g5bada429cf-dirty #36 [ 152.322729] task: c623e100 task.stack: c650c000 [ 152.327202] NIP: c0334d5c LR: c043602c CTR: c0435fb8 [ 152.332200] REGS: c650dc00 TRAP: 0300 Not tainted (4.14.52-00025-g5bada429cf-dirty) [ 152.340699] MSR: 9032 CR: 28002822 XER: 2000 [ 152.347333] DAR: c2236450 DSISR: c000 [ 152.347333] GPR00: c043602c c650dcb0 c623e100 c619ec00 c642c060 0008 0018 c650dd4c [ 152.347333] GPR08: c0435fb8 02b0 c068d830 0004 28004822 100d4208 7780c848 [ 152.347333] GPR16: 0ff58398 777674b0 1024b050 1024b0a8 1005ddbc 0ff5a7bc 03e8 [ 152.347333] GPR24: 008e c5011650 c650deb8 008e c619ec00 0040 c2236400 c619ec00 [ 152.385015] NIP [c0334d5c] sock_wfree+0x18/0xa4 [ 152.389458] LR [c043602c] unix_destruct_scm+0x74/0x88 [ 152.394399] Call Trace: [ 152.396868] [c650dcb0] [c006348c] ns_to_timeval+0x4c/0x7c (unreliable) [ 152.403305] [c650dcc0] [c043602c] unix_destruct_scm+0x74/0x88 [ 152.408999] [c650dcf0] [c033a10c] skb_release_head_state+0x8c/0x110 [ 152.415184] [c650dd00] [c033a3c4] skb_release_all+0x18/0x50 [ 152.420690] [c650dd10] 
[c033a7cc] consume_skb+0x38/0xec [ 152.425869] [c650dd20] [c0342d7c] skb_free_datagram+0x1c/0x68 [ 152.431535] [c650dd30] [c0435c8c] unix_dgram_recvmsg+0x19c/0x4ac [ 152.437476] [c650ddb0] [c0331370] ___sys_recvmsg+0x98/0x138 [ 152.442984] [c650deb0] [c0333280] __sys_recvmsg+0x40/0x84 [ 152.448321] [c650df10] [c0333680] SyS_socketcall+0xb8/0x1d4 [ 152.453832] [c650df40] [c000d1ac] ret_from_syscall+0x0/0x38 [ 152.459286] Instruction dump: [ 152.462225] 41beffac 4b58 3883 4ba0 3881 4b98 7c0802a6 9421fff0 [ 152.469881] bfc10008 90010014 83c30010 812300a8 <815e0050> 3bfe00e0 71480200 4082003c [ 152.477739] c619ec00: 00 00 00 00 00 00 00 00 00 00 00 23 6f d9 b1 65 [ 152.484100] c619ec10: c6 23 64 00 00 00 00 00 c6 42 c0 60 00 00 03 e8 [ 152.490471] c619ec20: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 152.496837] c619ec30: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 152.503205] c619ec40: 00 00 00 00 00 00 00 00 00 00 00 00 c0 43 5f b8 [ 152.509575] c619ec50: 00 00 00 00 00 00 00 00 00 00 00 8e 00 00 00 00 [ 152.515943] c619ec60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 152.522311] c619ec70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 152.528680] c619ec80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 152.535048] c619ec90: 00 00 ff ff 00 00 ff ff c6 42 30 8e c6 42 31 50 [ 152.541417] c619eca0: c6 42 30 00 c6 42 30 00 00 00 02 b0 00 00 00 01 [ 152.547781] ---[ end trace 0710a9d231876a27 ]--- Christophe
Re: CONFIG_LD_DEAD_CODE_DATA_ELIMINATION: Section mismatch in reference from the variable via_pmu_driver to the function .init.text:pmu_init()
On Tue, 3 Jul 2018 13:40:55 +0200 Mathieu Malaterre wrote: > On Tue, Jul 3, 2018 at 11:40 AM Mathieu Malaterre wrote: > > > > Hi Nick, > > > > Would you consider this a bug: > > > > $ touch drivers/macintosh/via-pmu.c > > $ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=n make ARCH=powerpc > > CROSS_COMPILE=powerpc-linux-gnu- > > ... > > LD vmlinux.o > > MODPOST vmlinux.o > > WARNING: vmlinux.o(.data+0x216018): Section mismatch in reference from > > the variable via_pmu_driver to the function .init.text:pmu_init() > > The variable via_pmu_driver references > > the function __init pmu_init() > > If the reference is valid then annotate the > > variable with __init* or __refdata (see linux/init.h) or name the variable: > > *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console > > > > While: > > > > $ touch drivers/macintosh/via-pmu.c > > $ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y make ARCH=powerpc > > CROSS_COMPILE=powerpc-linux-gnu- > > ... > > AR init/built-in.a > > AR built-in.a > > LD vmlinux.o > > MODPOST vmlinux.o > > KSYM.tmp_kallsyms1.o > > KSYM.tmp_kallsyms2.o > > LD vmlinux > > SORTEX vmlinux > > SYSMAP System.map > > ... > > > > Thanks for comment > > Just to clarify I reverted 58935176ad17976b7a7f6ea25c0ceb2ca4308a30 > just as to reproduce a warning. So my question (rephrased): > > Is this expected that CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y inhibit > the behavior of CONFIG_DEBUG_SECTION_MISMATCH=y ? Well that's a good question actually. Section mismatch analysis is done on the throwaway vmlinux.o which is not linked with --gc-sections (and is not a final link), so the via_pmu_driver symbol should exist and be picked up. I wonder if something about the -ffunction-sections is breaking the reference detection. Thanks, Nick
Re: CONFIG_LD_DEAD_CODE_DATA_ELIMINATION: Section mismatch in reference from the variable via_pmu_driver to the function .init.text:pmu_init()
On Tue, Jul 3, 2018 at 11:40 AM Mathieu Malaterre wrote: > > Hi Nick, > > Would you consider this a bug: > > $ touch drivers/macintosh/via-pmu.c > $ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=n make ARCH=powerpc > CROSS_COMPILE=powerpc-linux-gnu- > ... > LD vmlinux.o > MODPOST vmlinux.o > WARNING: vmlinux.o(.data+0x216018): Section mismatch in reference from > the variable via_pmu_driver to the function .init.text:pmu_init() > The variable via_pmu_driver references > the function __init pmu_init() > If the reference is valid then annotate the > variable with __init* or __refdata (see linux/init.h) or name the variable: > *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console > > While: > > $ touch drivers/macintosh/via-pmu.c > $ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y make ARCH=powerpc > CROSS_COMPILE=powerpc-linux-gnu- > ... > AR init/built-in.a > AR built-in.a > LD vmlinux.o > MODPOST vmlinux.o > KSYM.tmp_kallsyms1.o > KSYM.tmp_kallsyms2.o > LD vmlinux > SORTEX vmlinux > SYSMAP System.map > ... > > Thanks for comment Just to clarify I reverted 58935176ad17976b7a7f6ea25c0ceb2ca4308a30 just as to reproduce a warning. So my question (rephrased): Is this expected that CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y inhibit the behavior of CONFIG_DEBUG_SECTION_MISMATCH=y ? Thanks
Re: Oops in sock_wfree
Le 03/07/2018 à 10:51, Christophe LEROY a écrit : Hi, I was having strange unexpected memory corruption, therefore I activated DEBUG_PAGEALLOC and I now end up with the following Oops, which tends to make me think we have somewhere in the network code a use-after-free bug. I saw a few of such bugs have been fixed for IPv4 and IPv6. Maybe we have one remaining for Unix sockets ? How can I spot it off and fix it ? [ 39.645644] Unable to handle kernel paging request for data at address 0xc2235010 In fact, must be something else. This page has never been allocated. In seems that skb->sk should be c6234fc0 and suddenly it has changed to c2234fc0 How can I track that ? Christophe [ 39.652860] Faulting instruction address: 0xc0334d5c [ 39.657783] Oops: Kernel access of bad area, sig: 11 [#1] [ 39.663085] BE PREEMPT DEBUG_PAGEALLOC CMPC885 [ 39.667488] SAF3000 DIE NOTIFICATION [ 39.671050] CPU: 0 PID: 269 Comm: in:imuxsock Not tainted 4.14.52-00025-g5bada429cf #22 [ 39.679633] task: c623e100 task.stack: c651e000 [ 39.684106] NIP: c0334d5c LR: c043602c CTR: c0435fb8 [ 39.689103] REGS: c651fc00 TRAP: 0300 Not tainted (4.14.52-00025-g5bada429cf) [ 39.697087] MSR: 9032 CR: 28002822 XER: 2000 [ 39.703720] DAR: c2235010 DSISR: c000 [ 39.703720] GPR00: c043602c c651fcb0 c623e100 c619eec0 c642c540 0008 0018 c651fd4c [ 39.703720] GPR08: c0435fb8 02b0 c068d830 0004 28004822 100d4208 77990848 [ 39.703720] GPR16: 0ff58398 778eb4b0 1039f050 1039f0a8 1005ddbc 0ff5a7bc [ 39.703720] GPR24: 0072 c5011650 c651feb8 0072 c619eec0 0040 c2234fc0 c619eec0 [ 39.741401] NIP [c0334d5c] sock_wfree+0x18/0xa4 [ 39.745843] LR [c043602c] unix_destruct_scm+0x74/0x88 [ 39.750786] Call Trace: [ 39.753253] [c651fcb0] [c006348c] ns_to_timeval+0x4c/0x7c (unreliable) [ 39.759690] [c651fcc0] [c043602c] unix_destruct_scm+0x74/0x88 [ 39.765385] [c651fcf0] [c033a10c] skb_release_head_state+0x8c/0x110 [ 39.771571] [c651fd00] [c033a3c4] skb_release_all+0x18/0x50 [ 39.777078] [c651fd10] [c033a7cc] 
consume_skb+0x38/0xec [ 39.782255] [c651fd20] [c0342d7c] skb_free_datagram+0x1c/0x68 [ 39.787922] [c651fd30] [c0435c8c] unix_dgram_recvmsg+0x19c/0x4ac [ 39.793863] [c651fdb0] [c0331370] ___sys_recvmsg+0x98/0x138 [ 39.799371] [c651feb0] [c0333280] __sys_recvmsg+0x40/0x84 [ 39.804707] [c651ff10] [c0333680] SyS_socketcall+0xb8/0x1d4 [ 39.810220] [c651ff40] [c000d1ac] ret_from_syscall+0x0/0x38 [ 39.815673] Instruction dump: [ 39.818612] 41beffac 4b58 3883 4ba0 3881 4b98 7c0802a6 9421fff0 [ 39.826267] bfc10008 90010014 83c30010 812300a8 <815e0050> 3bfe00e0 71480200 4082003c [ 39.834113] ---[ end trace 8affde0490d7e25e ]--- Thanks Christophe
[v2 PATCH 1/2] powerpc: Detect the presence of big-cores via "ibm,thread-groups"
From: "Gautham R. Shenoy" On IBM POWER9, the device tree exposes a property array identifed by "ibm,thread-groups" which will indicate which groups of threads share a particular set of resources. As of today we only have one form of grouping identifying the group of threads in the core that share the L1 cache, translation cache and instruction data flow. This patch defines the helper function to parse the contents of "ibm,thread-groups" and a new structure to contain the parsed output. The patch also creates the sysfs file named "small_core_siblings" that returns the physical ids of the threads in the core that share the L1 cache, translation cache and instruction data flow. Signed-off-by: Gautham R. Shenoy --- Documentation/ABI/testing/sysfs-devices-system-cpu | 8 ++ arch/powerpc/include/asm/cputhreads.h | 22 + arch/powerpc/kernel/setup-common.c | 110 + arch/powerpc/kernel/sysfs.c| 35 +++ 4 files changed, 175 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 9c5e7732..53a823a 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -487,3 +487,11 @@ Description: Information about CPU vulnerabilities "Not affected"CPU is not affected by the vulnerability "Vulnerable" CPU is affected and no mitigation in effect "Mitigation: $M" CPU is affected and mitigation $M is in effect + +What: /sys/devices/system/cpu/cpu[0-9]+/small_core_sibings +Date: 03-Jul-2018 +KernelVersion: v4.18.0 +Contact: Gautham R. Shenoy +Description: List of Physical ids of CPUs which share the the L1 cache, + translation cache and instruction data-flow with this CPU. +Values:Comma separated list of decimal integers. 
diff --git a/arch/powerpc/include/asm/cputhreads.h b/arch/powerpc/include/asm/cputhreads.h index d71a909..33226d7 100644 --- a/arch/powerpc/include/asm/cputhreads.h +++ b/arch/powerpc/include/asm/cputhreads.h @@ -23,11 +23,13 @@ extern int threads_per_core; extern int threads_per_subcore; extern int threads_shift; +extern bool has_big_cores; extern cpumask_t threads_core_mask; #else #define threads_per_core 1 #define threads_per_subcore1 #define threads_shift 0 +#define has_big_cores 0 #define threads_core_mask (*get_cpu_mask(0)) #endif @@ -69,12 +71,32 @@ static inline cpumask_t cpu_online_cores_map(void) return cpu_thread_mask_to_cores(cpu_online_mask); } +#define MAX_THREAD_LIST_SIZE 8 +struct thread_groups { + unsigned int property; + unsigned int nr_groups; + unsigned int threads_per_group; + unsigned int thread_list[MAX_THREAD_LIST_SIZE]; +}; + #ifdef CONFIG_SMP int cpu_core_index_of_thread(int cpu); int cpu_first_thread_of_core(int core); +int parse_thread_groups(struct device_node *dn, struct thread_groups *tg); +int get_cpu_thread_group_start(int cpu, struct thread_groups *tg); #else static inline int cpu_core_index_of_thread(int cpu) { return cpu; } static inline int cpu_first_thread_of_core(int core) { return core; } +static inline int parse_thread_groups(struct device_node *dn, + struct thread_groups *tg) +{ + return -ENODATA; +} + +static inline int get_cpu_thread_group_start(int cpu, struct thread_groups *tg) +{ + return -1; +} #endif static inline int cpu_thread_in_core(int cpu) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 40b44bb..a78ec66 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -402,10 +402,12 @@ void __init check_for_initrd(void) #ifdef CONFIG_SMP int threads_per_core, threads_per_subcore, threads_shift; +bool has_big_cores = true; cpumask_t threads_core_mask; EXPORT_SYMBOL_GPL(threads_per_core); EXPORT_SYMBOL_GPL(threads_per_subcore); 
EXPORT_SYMBOL_GPL(threads_shift); +EXPORT_SYMBOL_GPL(has_big_cores); EXPORT_SYMBOL_GPL(threads_core_mask); static void __init cpu_init_thread_core_maps(int tpc) @@ -433,6 +435,108 @@ static void __init cpu_init_thread_core_maps(int tpc) u32 *cpu_to_phys_id = NULL; +/* + * parse_thread_groups: Parses the "ibm,thread-groups" device tree + * property for the CPU device node dn and stores + * the parsed output in the thread_groups + * structure tg. + * + * ibm,thread-groups[0..N-1] array defines which group of threads in + * the CPU-device node can be grouped together based on the property. + * + * ibm,thread-groups[0] tells us the property based on which the + * threads are being grouped together. If this value is 1, it implies + * that the threads in the same group share L1, translation cache. + * +
[v2 PATCH 0/2] powerpc: Detection and scheduler optimization for POWER9 bigcore
From: "Gautham R. Shenoy" Hi, This is the second iteration of the patchset to add support for big-core on POWER9. The earlier version can be found here: https://lkml.org/lkml/2018/5/11/245. The changes from the previous version: - Added comments explaining the "ibm,thread-groups" device tree property. - Uses cleaner device-tree parsing functions to parse the u32 arrays. - Adds a sysfs file listing the small-core siblings for every CPU. - Enables the scheduler optimization by setting the CPU_FTR_ASYM_SMT bit in the cur_cpu_spec->cpu_features on detecting the presence of interleaved big-core. - Handles the corner case where there is only a single thread-group or when there is a single thread in a thread-group. Description: A pair of IBM POWER9 SMT4 cores can be fused together to form a big-core with 8 SMT threads. This can be discovered via the "ibm,thread-groups" CPU property in the device tree which will indicate which group of threads that share the L1 cache, translation cache and instruction data flow. If there are multiple such group of threads, then the core is a big-core. Furthermore, the thread-ids of such a big-core is obtained by interleaving the thread-ids of the component SMT4 cores. Eg: Threads in the pair of component SMT4 cores of an interleaved big-core are numbered {0,2,4,6} and {1,3,5,7} respectively. On such a big-core, when multiple tasks are scheduled to run on the big-core, we get the best performance when the tasks are spread across the pair of SMT4 cores. The Linux scheduler supports a flag called "SD_ASYM_PACKING" which when set in the SMT sched-domain, biases the load-balancing of the tasks on the smaller numbered threads in the core. On an big-core whose threads are interleavings of the threads of the small cores, enabling SD_ASYM_PACKING in the SMT sched-domain automatically results in spreading the tasks uniformly across the associated pair of SMT4 cores, thereby yielding better performance. 
This patchset contains two patches which on detecting the presence of interleaved big-cores will enable the the CPU_FTR_ASYM_SMT bit in the cur_cpu_spec->cpu_feature. Patch 1: adds support to detect the presence of big-cores and reports the small-core siblings of each CPU X via the sysfs file "/sys/devices/system/cpu/cpuX/big_core_siblings". Patch 2: checks if the thread-ids of the component small-cores are interleaved, in which case we enable the the CPU_FTR_ASYM_SMT bit in the cur_cpu_spec->cpu_features which results in the SD_ASYM_PACKING flag being set at the SMT level sched-domain. Results: ~ Experimental results for ebizzy with 2 threads, bound to a single big-core show a marked improvement with this patchset over the 4.18-rc3 vanilla kernel. The result of 100 such runs for 4.18-rc3 kernel and the 4.18-rc3 + big-core-patches are as follows 4.18-rc3: ~~~ records/s: # samples : Histogram ~~~ [0 - 100] : 0 : # [100 - 200] : 3 : # [200 - 300] : 16 : [300 - 400] : 11 : ### [400 - 500] : 0 : # [500 - 600] : 70 : ### 4.18-rc3 + big-core-patches ~~~ records/s: # samples : Histogram ~~~ [0 - 100] : 0 : # [100 - 200] : 0 : # [200 - 300] : 6 : ## [300 - 400] : 0 : # [400 - 500] : 2 : # [500 - 600] : 92 : ### Gautham R. Shenoy (2): powerpc: Detect the presence of big-cores via "ibm,thread-groups" powerpc: Enable CPU_FTR_ASYM_SMT for interleaved big-cores Documentation/ABI/testing/sysfs-devices-system-cpu | 8 + arch/powerpc/include/asm/cputhreads.h | 22 +++ arch/powerpc/kernel/setup-common.c | 177 - arch/powerpc/kernel/sysfs.c| 35 4 files changed, 241 insertions(+), 1 deletion(-) -- 1.9.4
[v2 PATCH 2/2] powerpc: Enable CPU_FTR_ASYM_SMT for interleaved big-cores
From: "Gautham R. Shenoy" A pair of IBM POWER9 SMT4 cores can be fused together to form a big-core with 8 SMT threads. This can be discovered via the "ibm,thread-groups" CPU property in the device tree which will indicate which group of threads that share the L1 cache, translation cache and instruction data flow. If there are multiple such group of threads, then the core is a big-core. Furthermore, if the thread-ids of the threads of the big-core can be obtained by interleaving the thread-ids of the thread-groups (component small core), then such a big-core is called an interleaved big-core. Eg: Threads in the pair of component SMT4 cores of an interleaved big-core are numbered {0,2,4,6} and {1,3,5,7} respectively. The SMT4 cores forming a big-core are more or less independent units. Thus when multiple tasks are scheduled to run on the fused core, we get the best performance when the tasks are spread across the pair of SMT4 cores. This patch enables CPU_FTR_ASYM_SMT bit in the cpu-features on detecting the presence of interleaved big-cores at boot up. This will bias the load-balancing of tasks on smaller numbered threads, which will automatically result in spreading the tasks uniformly across the associated pair of SMT4 cores. Signed-off-by: Gautham R. Shenoy --- arch/powerpc/kernel/setup-common.c | 67 +- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index a78ec66..f63d797 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -537,6 +537,56 @@ int get_cpu_thread_group_start(int cpu, struct thread_groups *tg) return -1; } +/* + * check_interleaved_big_core - Checks if the thread group tg + * corresponds to a big-core whose threads are interleavings of the + * threads of the component small cores. + * + * @tg: A thread-group struct for the core. + * + * Returns true if the core is an interleaved big-core. + * Returns false otherwise. 
+ */ +static inline bool check_interleaved_big_core(struct thread_groups *tg) +{ + int nr_groups; + int threads_per_group; + int cur_cpu, next_cpu, i, j; + + nr_groups = tg->nr_groups; + threads_per_group = tg->threads_per_group; + + if (tg->property != 1) + return false; + + if (nr_groups < 2 || threads_per_group < 2) + return false; + + /* +* In case of an interleaved big-core, the thread-ids of the +* big-core can be obtained by interleaving the the thread-ids +* of the component small +* +* Eg: On a 8-thread big-core with two SMT4 small cores, the +* threads of the two component small cores will be +* {0, 2, 4, 6} and {1, 3, 5, 7}. +*/ + for (i = 0; i < nr_groups; i++) { + int group_start = i * threads_per_group; + + for (j = 0; j < threads_per_group - 1; j++) { + int cur_idx = group_start + j; + + cur_cpu = tg->thread_list[cur_idx]; + next_cpu = tg->thread_list[cur_idx + 1]; + if (next_cpu != cur_cpu + nr_groups) + return false; + } + } + + return true; +} + /** * setup_cpu_maps - initialize the following cpu maps: * cpu_possible_mask @@ -560,6 +610,7 @@ void __init smp_setup_cpu_maps(void) struct device_node *dn; int cpu = 0; int nthreads = 1; + bool has_interleaved_big_core = true; DBG("smp_setup_cpu_maps()\n"); @@ -613,6 +664,12 @@ void __init smp_setup_cpu_maps(void) if (parse_thread_groups(dn, &tg) || tg.nr_groups < 1 || tg.property != 1) { has_big_cores = false; + has_interleaved_big_core = false; + } + + if (has_interleaved_big_core) { + has_interleaved_big_core = + check_interleaved_big_core(&tg); } if (cpu >= nr_cpu_ids) { @@ -669,7 +726,15 @@ void __init smp_setup_cpu_maps(void) vdso_data->processorCount = num_present_cpus(); #endif /* CONFIG_PPC64 */ -/* Initialize CPU <=> thread mapping/ + if (has_interleaved_big_core) { + int key = __builtin_ctzl(CPU_FTR_ASYM_SMT); + + cur_cpu_spec->cpu_features |= CPU_FTR_ASYM_SMT; + static_branch_enable(&cpu_feature_keys[key]); + pr_info("Detected interleaved big-cores\n"); + } + + /* Initialize CPU <=> thread 
mapping/ * * WARNING: We assume that the number of threads is the same for * every CPU in the system. If that is not the case, then some code -- 1.9.4
Re: [PATCHv3 2/4] drivers/base: utilize device tree info to shutdown devices
I think Pavel would be interested to see this as well (he is doing some parallel device shutdown stuff) On Tue, Jul 3, 2018 at 9:50 AM, Pingfan Liu wrote: > commit 52cdbdd49853 ("driver core: correct device's shutdown order") > places an assumption of supplier<-consumer order on the process of probe. > But it turns out to break down the parent <- child order in some scene. > E.g in pci, a bridge is enabled by pci core, and behind it, the devices > have been probed. Then comes the bridge's module, which enables extra > feature(such as hotplug) on this bridge. This will break the > parent<-children order and cause failure when "kexec -e" in some scenario. > > The detailed description of the scenario: > An IBM Power9 machine on which, two drivers portdrv_pci and shpchp(a mod) > match the PCI_CLASS_BRIDGE_PCI, but neither of them success to probe due > to some issue. For this case, the bridge is moved after its children in > devices_kset. Then, when "kexec -e", a ata-disk behind the bridge can not > write back buffer in flight due to the former shutdown of the bridge which > clears the BusMaster bit. > > It is a little hard to impose both "parent<-child" and "supplier<-consumer" > order on devices_kset. Take the following scene: > step0: before a consumer's probing, (note child_a is supplier of consumer_a) > [ consumer-X, child_a, , child_z] [... consumer_a, ..., consumer_z, > ...] supplier-X > ^^ affected range ^^ > step1: when probing, moving consumer-X after supplier-X > [ child_a, , child_z] [ consumer_a, ..., consumer_z, ...] > supplier-X, consumer-X > step2: the children of consumer-X should be re-ordered to maintain the seq > [... consumer_a, ..., consumer_z, ] supplier-X [consumer-X, child_a, > , child_z] > step3: the consumer_a should be re-ordered to maintain the seq > [... consumer_z, ...] supplier-X [ consumer-X, child_a, consumer_a ..., > child_z] > > It requires two nested recursion to drain out all out-of-order item in > "affected range". 
To avoid such complicated code, this patch suggests > to utilize the info in device tree, instead of using the order of > devices_kset during shutdown. It iterates the device tree, and firstly > shutdown a device's children and consumers. After this patch, the buggy > commit is hollow and left to clean. > > Cc: Greg Kroah-Hartman > Cc: Rafael J. Wysocki > Cc: Grygorii Strashko > Cc: Christoph Hellwig > Cc: Bjorn Helgaas > Cc: Dave Young > Cc: linux-...@vger.kernel.org > Cc: linuxppc-dev@lists.ozlabs.org > Signed-off-by: Pingfan Liu > --- > drivers/base/core.c| 48 +++- > include/linux/device.h | 1 + > 2 files changed, 44 insertions(+), 5 deletions(-) > > diff --git a/drivers/base/core.c b/drivers/base/core.c > index a48868f..684b994 100644 > --- a/drivers/base/core.c > +++ b/drivers/base/core.c > @@ -1446,6 +1446,7 @@ void device_initialize(struct device *dev) > INIT_LIST_HEAD(&dev->links.consumers); > INIT_LIST_HEAD(&dev->links.suppliers); > dev->links.status = DL_DEV_NO_DRIVER; > + dev->shutdown = false; > } > EXPORT_SYMBOL_GPL(device_initialize); > > @@ -2811,7 +2812,6 @@ static void __device_shutdown(struct device *dev) > * lock is to be held > */ > parent = get_device(dev->parent); > - get_device(dev); > /* > * Make sure the device is off the kset list, in the > * event that dev->*->shutdown() doesn't remove it. 
> @@ -2842,23 +2842,60 @@ static void __device_shutdown(struct device *dev) > dev_info(dev, "shutdown\n"); > dev->driver->shutdown(dev); > } > - > + dev->shutdown = true; > device_unlock(dev); > if (parent) > device_unlock(parent); > > - put_device(dev); > put_device(parent); > spin_lock(&devices_kset->list_lock); > } > > +/* shutdown dev's children and consumer firstly, then itself */ > +static int device_for_each_child_shutdown(struct device *dev) > +{ > + struct klist_iter i; > + struct device *child; > + struct device_link *link; > + > + /* already shutdown, then skip this sub tree */ > + if (dev->shutdown) > + return 0; > + > + if (!dev->p) > + goto check_consumers; > + > + /* there is breakage of lock in __device_shutdown(), and the redundant > +* ref++ on srcu protected consumer is harmless since shutdown is not > +* hot path. > +*/ > + get_device(dev); > + > + klist_iter_init(&dev->p->klist_children, &i); > + while ((child = next_device(&i))) > + device_for_each_child_shutdown(child); > + klist_iter_exit(&i); > + > +check_consumers: > + list_for_each_entry_rcu(link, &dev->links.consumers, s_node) { > + if (!link->consumer->shutdown) > +
Re: [PATCH v5 5/7] powerpc/pseries: flush SLB contents on SLB MCE errors.
On Tue, 3 Jul 2018 08:08:14 +1000 Nicholas Piggin wrote: > On Mon, 02 Jul 2018 11:17:06 +0530 > Mahesh J Salgaonkar wrote: > > > From: Mahesh Salgaonkar > > > > On pseries, as of today system crashes if we get a machine check > > exceptions due to SLB errors. These are soft errors and can be > > fixed by flushing the SLBs so the kernel can continue to function > > instead of system crash. We do this in real mode before turning on > > MMU. Otherwise we would run into nested machine checks. This patch > > now fetches the rtas error log in real mode and flushes the SLBs on > > SLB errors. > > > > Signed-off-by: Mahesh Salgaonkar > > --- > > arch/powerpc/include/asm/book3s/64/mmu-hash.h |1 > > arch/powerpc/include/asm/machdep.h|1 > > arch/powerpc/kernel/exceptions-64s.S | 42 > > + arch/powerpc/kernel/mce.c > > | 16 +++- arch/powerpc/mm/slb.c | > > 6 +++ arch/powerpc/platforms/powernv/opal.c |1 > > arch/powerpc/platforms/pseries/pseries.h |1 > > arch/powerpc/platforms/pseries/ras.c | 51 > > + > > arch/powerpc/platforms/pseries/setup.c|1 9 files > > changed, 116 insertions(+), 4 deletions(-) > > > > +TRAMP_REAL_BEGIN(machine_check_pSeries_early) > > +BEGIN_FTR_SECTION > > + EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) > > + mr r10,r1 /* Save r1 */ > > + ld r1,PACAMCEMERGSP(r13) /* Use MC emergency > > stack */ > > + subir1,r1,INT_FRAME_SIZE/* alloc stack > > frame */ > > + mfspr r11,SPRN_SRR0 /* Save SRR0 */ > > + mfspr r12,SPRN_SRR1 /* Save SRR1 */ > > + EXCEPTION_PROLOG_COMMON_1() > > + EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) > > + EXCEPTION_PROLOG_COMMON_3(0x200) > > + addir3,r1,STACK_FRAME_OVERHEAD > > + BRANCH_LINK_TO_FAR(machine_check_early) /* Function call > > ABI */ > > Is there any reason you can't use the existing > machine_check_powernv_early code to do all this? Code sharing is nice but if we envision this going to stable kernels butchering the existing handler is going to be a nightmare. The code is quite a bit different between kernel versions. 
This code as is requires the bit that introduces EXCEPTION_PROLOG_COMMON_1 and then should work on Linux 3.14+ Thanks Michal
Re: [PATCH v5 2/7] powerpc/pseries: Defer the logging of rtas error to irq work queue.
On 07/03/2018 08:55 AM, Nicholas Piggin wrote: > On Mon, 02 Jul 2018 11:16:29 +0530 > Mahesh J Salgaonkar wrote: > >> From: Mahesh Salgaonkar >> >> rtas_log_buf is a buffer to hold RTAS event data that are communicated >> to kernel by hypervisor. This buffer is then used to pass RTAS event >> data to user through proc fs. This buffer is allocated from vmalloc >> (non-linear mapping) area. >> >> On Machine check interrupt, register r3 points to RTAS extended event >> log passed by hypervisor that contains the MCE event. The pseries >> machine check handler then logs this error into rtas_log_buf. The >> rtas_log_buf is a vmalloc-ed (non-linear) buffer we end up taking up a >> page fault (vector 0x300) while accessing it. Since machine check >> interrupt handler runs in NMI context we can not afford to take any >> page fault. Page faults are not honored in NMI context and causes >> kernel panic. Apart from that, as Nick pointed out, pSeries_log_error() >> also takes a spin_lock while logging error which is not safe in NMI >> context. It may endup in deadlock if we get another MCE before releasing >> the lock. Fix this by deferring the logging of rtas error to irq work queue. >> >> Current implementation uses two different buffers to hold rtas error log >> depending on whether extended log is provided or not. This makes bit >> difficult to identify which buffer has valid data that needs to logged >> later in irq work. Simplify this using single buffer, one per paca, and >> copy rtas log to it irrespective of whether extended log is provided or >> not. Allocate this buffer below RMA region so that it can be accessed >> in real mode mce handler. >> >> Fixes: b96672dd840f ("powerpc: Machine check interrupt is a non-maskable >> interrupt") >> Cc: sta...@vger.kernel.org >> Signed-off-by: Mahesh Salgaonkar > > I think this looks reasonable. It doesn't fix that commit so much as > fixes the problem that's apparent after it's applied. 
I don't know if > we should backport this to a wider set of stable kernels? Aside from > that, Since the commit b96672dd840f went into 4.13, I think it is good if we backport this to V4.14 and above stable kernels. Thanks, -Mahesh > > Reviewed-by: Nicholas Piggin > > Thanks, > Nick > >> --- >> arch/powerpc/include/asm/paca.h|3 ++ >> arch/powerpc/platforms/pseries/ras.c | 47 >> ++-- >> arch/powerpc/platforms/pseries/setup.c | 16 +++ >> 3 files changed, 51 insertions(+), 15 deletions(-) >> >> diff --git a/arch/powerpc/include/asm/paca.h >> b/arch/powerpc/include/asm/paca.h >> index 3f109a3e3edb..b441fef53077 100644 >> --- a/arch/powerpc/include/asm/paca.h >> +++ b/arch/powerpc/include/asm/paca.h >> @@ -251,6 +251,9 @@ struct paca_struct { >> void *rfi_flush_fallback_area; >> u64 l1d_flush_size; >> #endif >> +#ifdef CONFIG_PPC_PSERIES >> +u8 *mce_data_buf; /* buffer to hold per cpu rtas errlog */ >> +#endif /* CONFIG_PPC_PSERIES */ >> } cacheline_aligned; >> >> extern void copy_mm_to_paca(struct mm_struct *mm); >> diff --git a/arch/powerpc/platforms/pseries/ras.c >> b/arch/powerpc/platforms/pseries/ras.c >> index ef104144d4bc..14a46b07ab2f 100644 >> --- a/arch/powerpc/platforms/pseries/ras.c >> +++ b/arch/powerpc/platforms/pseries/ras.c >> @@ -22,6 +22,7 @@ >> #include >> #include >> #include >> +#include >> >> #include >> #include >> @@ -32,11 +33,13 @@ >> static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; >> static DEFINE_SPINLOCK(ras_log_buf_lock); >> >> -static char global_mce_data_buf[RTAS_ERROR_LOG_MAX]; >> -static DEFINE_PER_CPU(__u64, mce_data_buf); >> - >> static int ras_check_exception_token; >> >> +static void mce_process_errlog_event(struct irq_work *work); >> +static struct irq_work mce_errlog_process_work = { >> +.func = mce_process_errlog_event, >> +}; >> + >> #define EPOW_SENSOR_TOKEN 9 >> #define EPOW_SENSOR_INDEX 0 >> >> @@ -330,16 +333,20 @@ static irqreturn_t ras_error_interrupt(int irq, void >> *dev_id) >> A) >= 0x7000) && ((A) < 0x7ff0)) || \ 
>> (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16 >> >> +static inline struct rtas_error_log *fwnmi_get_errlog(void) >> +{ >> +return (struct rtas_error_log *)local_paca->mce_data_buf; >> +} >> + >> /* >> * Get the error information for errors coming through the >> * FWNMI vectors. The pt_regs' r3 will be updated to reflect >> * the actual r3 if possible, and a ptr to the error log entry >> * will be returned if found. >> * >> - * If the RTAS error is not of the extended type, then we put it in a per >> - * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf. >> + * Use one buffer mce_data_buf per cpu to store RTAS error. >> * >> - * The global_mce_data_buf does not have any locks or protection around it, >> + * The mce_data_buf does not have any locks or protection around it, >> * if a second machine check comes in,
Re: [PATCH] QE GPIO: Add qe_gpio_set_multiple
On Tue, 2018-06-26 at 23:41 +1000, Michael Ellerman wrote: > > Joakim Tjernlund writes: > > On Thu, 2018-06-21 at 02:38 +, Qiang Zhao wrote: > > > On 06/19/2018 09:22 AM, Joakim Tjernlund wrote: > > > -Original Message- > > > From: Linuxppc-dev > > > [mailto:linuxppc-dev-bounces+qiang.zhao=nxp@lists.ozlabs.org] On > > > Behalf Of Joakim Tjernlund > > > Sent: 2018年6月20日 0:22 > > > To: York Sun ; linuxppc-dev > > > > > > Subject: [PATCH] QE GPIO: Add qe_gpio_set_multiple > > > > > > This cousin to gpio-mpc8xxx was lacking a multiple pins method, add one. > > > > > > Signed-off-by: Joakim Tjernlund > > > --- > > > drivers/soc/fsl/qe/gpio.c | 28 > > > 1 file changed, 28 insertions(+) > > ... > > > static int qe_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) { > > > struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); @@ -298,6 > > > +325,7 @@ static int __init qe_add_gpiochips(void) > > > gc->direction_output = qe_gpio_dir_out; > > > gc->get = qe_gpio_get; > > > gc->set = qe_gpio_set; > > > + gc->set_multiple = qe_gpio_set_multiple; > > > > > > ret = of_mm_gpiochip_add_data(np, mm_gc, qe_gc); > > > if (ret) > > > > > > Reviewed-by: Qiang Zhao > > > > > > > Who picks up this patch ? Is it queued somewhere already? > > Not me. York? You seem to be the only one left. Jocke
Re: CONFIG_LD_DEAD_CODE_DATA_ELIMINATION: Section mismatch in reference from the variable via_pmu_driver to the function .init.text:pmu_init()
Le 03/07/2018 à 11:40, Mathieu Malaterre a écrit : Hi Nick, Would you consider this a bug: Looks more like a feature ... In /drivers/macintosh/adb.c you have, and it's the only place via_pmu_driver is used. #if defined(CONFIG_ADB_PMU) || defined(CONFIG_ADB_PMU68K) &via_pmu_driver, #endif Is one of those defined in your .config ? If not then via_pmu_driver gets eliminated hence nothing to warn on. Christophe $ touch drivers/macintosh/via-pmu.c $ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=n make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- ... LD vmlinux.o MODPOST vmlinux.o WARNING: vmlinux.o(.data+0x216018): Section mismatch in reference from the variable via_pmu_driver to the function .init.text:pmu_init() The variable via_pmu_driver references the function __init pmu_init() If the reference is valid then annotate the variable with __init* or __refdata (see linux/init.h) or name the variable: *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console While: $ touch drivers/macintosh/via-pmu.c $ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- ... AR init/built-in.a AR built-in.a LD vmlinux.o MODPOST vmlinux.o KSYM.tmp_kallsyms1.o KSYM.tmp_kallsyms2.o LD vmlinux SORTEX vmlinux SYSMAP System.map ... Thanks for comment
Re: [PATCH v3 2/2] powernv/cpuidle: Use parsed device tree values for cpuidle_init
Hi Akshay, On Tue, Jul 03, 2018 at 02:50:56PM +0530, Akshay Adiga wrote: > Export pnv_idle_states and nr_pnv_idle_states so that its accessible to > cpuidle driver. Use properties from pnv_idle_states structure for powernv > cpuidle_init. > > Signed-off-by: Akshay Adiga > Reviewed-by: Nicholas Piggin > --- [..snip..] > > /* >* For nap and fastsleep, use default target_residency >* values if f/w does not expose it. >*/ This comment can also go, since we are no longer hard-coding the residency values in the kernel. Otherwise looks good to me. Reviewed-by: Gautham R. Shenoy -- Thanks and Regards gautham.
Re: [PATCH v3 1/2] powernv/cpuidle: Parse dt idle properties into global structure
Hi Akshay, On Tue, Jul 03, 2018 at 02:50:55PM +0530, Akshay Adiga wrote: > Device-tree parsing happens twice, once while deciding idle state to be > used for hotplug and once during cpuidle init. Hence, parsing the device > tree and caching it will reduce code duplication. Parsing code has been > moved to pnv_parse_cpuidle_dt() from pnv_probe_idle_states(). In addition > to the properties in the device tree the number of available states is > also required. > > Signed-off-by: Akshay Adiga > Reviewed-by: Nicholas Piggin Looks good. Reviewed-by: Gautham R. Shenoy
CONFIG_LD_DEAD_CODE_DATA_ELIMINATION: Section mismatch in reference from the variable via_pmu_driver to the function .init.text:pmu_init()
Hi Nick, Would you consider this a bug: $ touch drivers/macintosh/via-pmu.c $ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=n make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- ... LD vmlinux.o MODPOST vmlinux.o WARNING: vmlinux.o(.data+0x216018): Section mismatch in reference from the variable via_pmu_driver to the function .init.text:pmu_init() The variable via_pmu_driver references the function __init pmu_init() If the reference is valid then annotate the variable with __init* or __refdata (see linux/init.h) or name the variable: *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console While: $ touch drivers/macintosh/via-pmu.c $ CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- ... AR init/built-in.a AR built-in.a LD vmlinux.o MODPOST vmlinux.o KSYM.tmp_kallsyms1.o KSYM.tmp_kallsyms2.o LD vmlinux SORTEX vmlinux SYSMAP System.map ... Thanks for comment
Re: [PATCHv3 2/4] drivers/base: utilize device tree info to shutdown devices
On Tue, Jul 3, 2018 at 3:51 PM Lukas Wunner wrote: > > On Tue, Jul 03, 2018 at 02:50:40PM +0800, Pingfan Liu wrote: > > commit 52cdbdd49853 ("driver core: correct device's shutdown order") > > places an assumption of supplier<-consumer order on the process of probe. > > But it turns out to break down the parent <- child order in some scene. > > E.g in pci, a bridge is enabled by pci core, and behind it, the devices > > have been probed. Then comes the bridge's module, which enables extra > > feature(such as hotplug) on this bridge. This will break the > > parent<-children order and cause failure when "kexec -e" in some scenario. > > > > The detailed description of the scenario: > > An IBM Power9 machine on which, two drivers portdrv_pci and shpchp(a mod) > > match the PCI_CLASS_BRIDGE_PCI, but neither of them success to probe due > > to some issue. For this case, the bridge is moved after its children in > > devices_kset. Then, when "kexec -e", a ata-disk behind the bridge can not > > write back buffer in flight due to the former shutdown of the bridge which > > clears the BusMaster bit. > > If you revert commit cc27b735ad3a ("PCI/portdrv: Turn off PCIe services > during shutdown"), does the issue go away? Yes, it is gone.
[PATCH v3 2/2] powernv/cpuidle: Use parsed device tree values for cpuidle_init
Export pnv_idle_states and nr_pnv_idle_states so that its accessible to cpuidle driver. Use properties from pnv_idle_states structure for powernv cpuidle_init. Signed-off-by: Akshay Adiga Reviewed-by: Nicholas Piggin --- arch/powerpc/include/asm/cpuidle.h | 2 + drivers/cpuidle/cpuidle-powernv.c | 154 + 2 files changed, 28 insertions(+), 128 deletions(-) diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index 574b0ce1d671..43e5f31fe64d 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -90,6 +90,8 @@ struct pnv_idle_states_t { bool valid; }; +extern struct pnv_idle_states_t *pnv_idle_states; +extern int nr_pnv_idle_states; extern u32 pnv_fastsleep_workaround_at_entry[]; extern u32 pnv_fastsleep_workaround_at_exit[]; diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index d29e4f041efe..208d57c77305 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -242,6 +242,7 @@ static inline void add_powernv_state(int index, const char *name, powernv_states[index].target_residency = target_residency; powernv_states[index].exit_latency = exit_latency; powernv_states[index].enter = idle_fn; + /* For power8 and below psscr_* will be 0 */ stop_psscr_table[index].val = psscr_val; stop_psscr_table[index].mask = psscr_mask; } @@ -263,186 +264,84 @@ static inline int validate_dt_prop_sizes(const char *prop1, int prop1_len, extern u32 pnv_get_supported_cpuidle_states(void); static int powernv_add_idle_states(void) { - struct device_node *power_mgt; int nr_idle_states = 1; /* Snooze */ - int dt_idle_states, count; - u32 latency_ns[CPUIDLE_STATE_MAX]; - u32 residency_ns[CPUIDLE_STATE_MAX]; - u32 flags[CPUIDLE_STATE_MAX]; - u64 psscr_val[CPUIDLE_STATE_MAX]; - u64 psscr_mask[CPUIDLE_STATE_MAX]; - const char *names[CPUIDLE_STATE_MAX]; + int dt_idle_states; u32 has_stop_states = 0; - int i, rc; + int i; u32 supported_flags = 
pnv_get_supported_cpuidle_states(); /* Currently we have snooze statically defined */ - - power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); - if (!power_mgt) { - pr_warn("opal: PowerMgmt Node not found\n"); + if (nr_pnv_idle_states <= 0) { + pr_warn("cpuidle-powernv : Only Snooze is available\n"); goto out; } - /* Read values of any property to determine the num of idle states */ - dt_idle_states = of_property_count_u32_elems(power_mgt, "ibm,cpu-idle-state-flags"); - if (dt_idle_states < 0) { - pr_warn("cpuidle-powernv: no idle states found in the DT\n"); - goto out; - } - - count = of_property_count_u32_elems(power_mgt, - "ibm,cpu-idle-state-latencies-ns"); - - if (validate_dt_prop_sizes("ibm,cpu-idle-state-flags", dt_idle_states, - "ibm,cpu-idle-state-latencies-ns", - count) != 0) - goto out; - - count = of_property_count_strings(power_mgt, - "ibm,cpu-idle-state-names"); - if (validate_dt_prop_sizes("ibm,cpu-idle-state-flags", dt_idle_states, - "ibm,cpu-idle-state-names", - count) != 0) - goto out; + /* TODO: Count only states which are eligible for cpuidle */ + dt_idle_states = nr_pnv_idle_states; /* * Since snooze is used as first idle state, max idle states allowed is * CPUIDLE_STATE_MAX -1 */ - if (dt_idle_states > CPUIDLE_STATE_MAX - 1) { + if (nr_pnv_idle_states > CPUIDLE_STATE_MAX - 1) { pr_warn("cpuidle-powernv: discovered idle states more than allowed"); dt_idle_states = CPUIDLE_STATE_MAX - 1; } - if (of_property_read_u32_array(power_mgt, - "ibm,cpu-idle-state-flags", flags, dt_idle_states)) { - pr_warn("cpuidle-powernv : missing ibm,cpu-idle-state-flags in DT\n"); - goto out; - } - - if (of_property_read_u32_array(power_mgt, - "ibm,cpu-idle-state-latencies-ns", latency_ns, - dt_idle_states)) { - pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-latencies-ns in DT\n"); - goto out; - } - if (of_property_read_string_array(power_mgt, - "ibm,cpu-idle-state-names", names, dt_idle_states) < 0) { - pr_warn("cpuidle-powernv: missing 
ibm,cpu-idle-state-names in DT\n"); - goto out; - } - /* * If the idle states use stop instruction, probe for psscr values * an
[PATCH v3 1/2] powernv/cpuidle: Parse dt idle properties into global structure
Device-tree parsing happens twice, once while deciding idle state to be used for hotplug and once during cpuidle init. Hence, parsing the device tree and caching it will reduce code duplication. Parsing code has been moved to pnv_parse_cpuidle_dt() from pnv_probe_idle_states(). In addition to the properties in the device tree the number of available states is also required. Signed-off-by: Akshay Adiga Reviewed-by: Nicholas Piggin --- arch/powerpc/include/asm/cpuidle.h| 11 ++ arch/powerpc/platforms/powernv/idle.c | 216 -- 2 files changed, 149 insertions(+), 78 deletions(-) diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index e210a83eb196..574b0ce1d671 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -79,6 +79,17 @@ struct stop_sprs { u64 mmcra; }; +#define PNV_IDLE_NAME_LEN16 +struct pnv_idle_states_t { + char name[PNV_IDLE_NAME_LEN]; + u32 latency_ns; + u32 residency_ns; + u64 psscr_val; + u64 psscr_mask; + u32 flags; + bool valid; +}; + extern u32 pnv_fastsleep_workaround_at_entry[]; extern u32 pnv_fastsleep_workaround_at_exit[]; diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 1c5d0675b43c..7cf71b3e03a1 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -36,6 +36,8 @@ #define P9_STOP_SPR_PSSCR 855 static u32 supported_cpuidle_states; +struct pnv_idle_states_t *pnv_idle_states; +int nr_pnv_idle_states; /* * The default stop state that will be used by ppc_md.power_save @@ -622,48 +624,10 @@ int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags) * @dt_idle_states: Number of idle state entries * Returns 0 on success */ -static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags, - int dt_idle_states) +static int __init pnv_power9_idle_init(void) { - u64 *psscr_val = NULL; - u64 *psscr_mask = NULL; - u32 *residency_ns = NULL; u64 max_residency_ns = 0; - int rc = 
0, i; - - psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL); - psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL); - residency_ns = kcalloc(dt_idle_states, sizeof(*residency_ns), - GFP_KERNEL); - - if (!psscr_val || !psscr_mask || !residency_ns) { - rc = -1; - goto out; - } - - if (of_property_read_u64_array(np, - "ibm,cpu-idle-state-psscr", - psscr_val, dt_idle_states)) { - pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr in DT\n"); - rc = -1; - goto out; - } - - if (of_property_read_u64_array(np, - "ibm,cpu-idle-state-psscr-mask", - psscr_mask, dt_idle_states)) { - pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr-mask in DT\n"); - rc = -1; - goto out; - } - - if (of_property_read_u32_array(np, - "ibm,cpu-idle-state-residency-ns", - residency_ns, dt_idle_states)) { - pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-residency-ns in DT\n"); - rc = -1; - goto out; - } + int i; /* * Set pnv_first_deep_stop_state, pnv_deepest_stop_psscr_{val,mask}, @@ -679,33 +643,36 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags, * the shallowest (OPAL_PM_STOP_INST_FAST) loss-less stop state. */ pnv_first_deep_stop_state = MAX_STOP_STATE; - for (i = 0; i < dt_idle_states; i++) { + for (i = 0; i < nr_pnv_idle_states; i++) { int err; - u64 psscr_rl = psscr_val[i] & PSSCR_RL_MASK; + struct pnv_idle_states_t *state = &pnv_idle_states[i]; + u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK; - if ((flags[i] & OPAL_PM_LOSE_FULL_CONTEXT) && -(pnv_first_deep_stop_state > psscr_rl)) + if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) && + pnv_first_deep_stop_state > psscr_rl) pnv_first_deep_stop_state = psscr_rl; - err = validate_psscr_val_mask(&psscr_val[i], &psscr_mask[i], - flags[i]); + err = validate_psscr_val_mask(&state->psscr_val, + &state->psscr_mask, + state->flags); if (err) { - report_invalid_psscr_val(psscr_val[i], err); + state->valid = false;
[PATCH v3 0/2] powernv/cpuidle Device-tree parsing cleanup
The device tree is parsed multiple times in powernv cpuidle and powernv hotplug code. First to identify supported flags. Second time, to identify deepest_state and first deep state. Third time, during cpuidle init to find the available idle states. Any change in device-tree format will require changes in these 3 places. Errors in device-tree can be handled in a better manner. This series adds code to parse the device tree once and save it in a global structure. Changes from v2 : - Fix build error (moved a hunk from patch 1 to patch 2) Changes from v1 : - fold first 2 patches into 1 - rename pm_ctrl_reg_* as psscr_* - added comment stating removal of pmicr parsing code - removed parsing code for pmicr - add member valid in pnv_idle_states_t to indicate if the psscr-mask/val are valid combination, - Change function description of pnv_parse_cpuidle_dt - Added error handling code. Akshay Adiga (2): powernv/cpuidle: Parse dt idle properties into global structure powernv/cpuidle: Use parsed device tree values for cpuidle_init arch/powerpc/include/asm/cpuidle.h| 13 ++ arch/powerpc/platforms/powernv/idle.c | 216 -- drivers/cpuidle/cpuidle-powernv.c | 154 -- 3 files changed, 177 insertions(+), 206 deletions(-) -- 2.18.0.rc2.85.g1fb9df7
Oops in sock_wfree
Hi, I was having strange unexpected memory corruption, therefore I activated DEBUG_PAGEALLOC and I now end up with the following Oops, which tends to make me think we have somewhere in the network code a use-after-free bug. I saw a few of such bugs have been fixed for IPv4 and IPv6. Maybe we have one remaining for Unix sockets ? How can I spot it off and fix it ? [ 39.645644] Unable to handle kernel paging request for data at address 0xc2235010 [ 39.652860] Faulting instruction address: 0xc0334d5c [ 39.657783] Oops: Kernel access of bad area, sig: 11 [#1] [ 39.663085] BE PREEMPT DEBUG_PAGEALLOC CMPC885 [ 39.667488] SAF3000 DIE NOTIFICATION [ 39.671050] CPU: 0 PID: 269 Comm: in:imuxsock Not tainted 4.14.52-00025-g5bada429cf #22 [ 39.679633] task: c623e100 task.stack: c651e000 [ 39.684106] NIP: c0334d5c LR: c043602c CTR: c0435fb8 [ 39.689103] REGS: c651fc00 TRAP: 0300 Not tainted (4.14.52-00025-g5bada429cf) [ 39.697087] MSR: 9032 CR: 28002822 XER: 2000 [ 39.703720] DAR: c2235010 DSISR: c000 [ 39.703720] GPR00: c043602c c651fcb0 c623e100 c619eec0 c642c540 0008 0018 c651fd4c [ 39.703720] GPR08: c0435fb8 02b0 c068d830 0004 28004822 100d4208 77990848 [ 39.703720] GPR16: 0ff58398 778eb4b0 1039f050 1039f0a8 1005ddbc 0ff5a7bc [ 39.703720] GPR24: 0072 c5011650 c651feb8 0072 c619eec0 0040 c2234fc0 c619eec0 [ 39.741401] NIP [c0334d5c] sock_wfree+0x18/0xa4 [ 39.745843] LR [c043602c] unix_destruct_scm+0x74/0x88 [ 39.750786] Call Trace: [ 39.753253] [c651fcb0] [c006348c] ns_to_timeval+0x4c/0x7c (unreliable) [ 39.759690] [c651fcc0] [c043602c] unix_destruct_scm+0x74/0x88 [ 39.765385] [c651fcf0] [c033a10c] skb_release_head_state+0x8c/0x110 [ 39.771571] [c651fd00] [c033a3c4] skb_release_all+0x18/0x50 [ 39.777078] [c651fd10] [c033a7cc] consume_skb+0x38/0xec [ 39.782255] [c651fd20] [c0342d7c] skb_free_datagram+0x1c/0x68 [ 39.787922] [c651fd30] [c0435c8c] unix_dgram_recvmsg+0x19c/0x4ac [ 39.793863] [c651fdb0] [c0331370] ___sys_recvmsg+0x98/0x138 [ 39.799371] [c651feb0] [c0333280] 
__sys_recvmsg+0x40/0x84 [ 39.804707] [c651ff10] [c0333680] SyS_socketcall+0xb8/0x1d4 [ 39.810220] [c651ff40] [c000d1ac] ret_from_syscall+0x0/0x38 [ 39.815673] Instruction dump: [ 39.818612] 41beffac 4b58 3883 4ba0 3881 4b98 7c0802a6 9421fff0 [ 39.826267] bfc10008 90010014 83c30010 812300a8 <815e0050> 3bfe00e0 71480200 4082003c [ 39.834113] ---[ end trace 8affde0490d7e25e ]--- Thanks Christophe
Re: [PATCH v9 0/6] add support for relative references in special sections
* Ard Biesheuvel wrote: > On 27 June 2018 at 17:15, Will Deacon wrote: > > Hi Ard, > > > > On Tue, Jun 26, 2018 at 08:27:55PM +0200, Ard Biesheuvel wrote: > >> This adds support for emitting special sections such as initcall arrays, > >> PCI fixups and tracepoints as relative references rather than absolute > >> references. This reduces the size by 50% on 64-bit architectures, but > >> more importantly, it removes the need for carrying relocation metadata > >> for these sections in relocatable kernels (e.g., for KASLR) that needs > >> to be fixed up at boot time. On arm64, this reduces the vmlinux footprint > >> of such a reference by 8x (8 byte absolute reference + 24 byte RELA entry > >> vs 4 byte relative reference) > >> > >> Patch #3 was sent out before as a single patch. This series supersedes > >> the previous submission. This version makes relative ksymtab entries > >> dependent on the new Kconfig symbol HAVE_ARCH_PREL32_RELOCATIONS rather > >> than trying to infer from kbuild test robot replies for which architectures > >> it should be blacklisted. > >> > >> Patch #1 introduces the new Kconfig symbol HAVE_ARCH_PREL32_RELOCATIONS, > >> and sets it for the main architectures that are expected to benefit the > >> most from this feature, i.e., 64-bit architectures or ones that use > >> runtime relocations. > >> > >> Patch #2 add support for #define'ing __DISABLE_EXPORTS to get rid of > >> ksymtab/kcrctab sections in decompressor and EFI stub objects when > >> rebuilding existing C files to run in a different context. > > > > I had a small question on patch 3, but it's really for my understanding. > > So, for patches 1-3: > > > > Reviewed-by: Will Deacon > > > > Thanks all. 
> > Thomas, Ingo, > > Except for the below tweak against patch #3 for powerpc, which may > apparently get confused by an input section called .discard without > any suffixes, this series is good to go, but requires your ack to > proceed, so I would like to ask you to share your comments and/or > objections. Also, any suggestions or recommendations regarding the > route these patches should take are highly appreciated. LGTM: Acked-by: Ingo Molnar Regarding route - I suspect -mm would be good, or any other tree that does a lot of cross-arch testing? Thanks, Ingo
Re: 83a092cf95f28: powerpc: Link warning for orphan sections
On Tue, Jul 3, 2018 at 9:23 AM Nicholas Piggin wrote: > > On Tue, 3 Jul 2018 09:03:46 +0200 > Mathieu Malaterre wrote: > > > Hi Nick, > > > > I am building my kernel (ppc32) with both > > CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y and CONFIG_UBSAN=y. This leads > > to ~316428 warnings such as: > > > > + powerpc-linux-gnu-ld -EB -m elf32ppc -Bstatic --orphan-handling=warn > > --build-id --gc-sections -X -o .tmp_vmlinux1 -T > > ./arch/powerpc/kernel/vmlinux.lds --who > > le-archive built-in.a --no-whole-archive --start-group lib/lib.a --end-group > > powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data393' > > from `init/main.o' being placed in section `.data..Lubsan_data393'. > > powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data394' > > from `init/main.o' being placed in section `.data..Lubsan_data394'. > > powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data395' > > from `init/main.o' being placed in section `.data..Lubsan_data395'. > > powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data396' > > from `init/main.o' being placed in section `.data..Lubsan_data396'. > > ... > > > > What would you recommend to reduce the number of warnings produced ? I > > tried `--warn-once` but that only affect undefined symbols. > > I'm not sure if the linker can be quietened. You could try putting > *(.data..Lubsan_data*) into the .data section in > powerpc/kernel/vmlinux.lds.S Nice ! Will submit a patch. Thanks > Thanks, > Nick
Re: [PATCH v2 2/3] powerpc/fsl: Add barrier_nospec implementation for NXP PowerPC Book3E
Hi Diana, A few comments below ... Diana Craciun writes: > Implement the barrier_nospec as a isync;sync instruction sequence. > The implementation uses the infrastructure built for BOOK3S 64. Do you have any details on why that sequence functions as an effective barrier on your chips? In a lot of places we have eg: +#if defined(CONFIG_PPC_BOOK3S_64) || defined(CONFIG_PPC_FSL_BOOK3E) Can you please add a Kconfig symbol to capture that. eg. config PPC_NOSPEC bool default y depends on PPC_BOOK3S_64 || PPC_FSL_BOOK3E And then use that everywhere. > diff --git a/arch/powerpc/include/asm/barrier.h > b/arch/powerpc/include/asm/barrier.h > index f67b3f6..405d572 100644 > --- a/arch/powerpc/include/asm/barrier.h > +++ b/arch/powerpc/include/asm/barrier.h > @@ -86,6 +86,16 @@ do { > \ > // This also acts as a compiler barrier due to the memory clobber. > #define barrier_nospec() asm (stringify_in_c(barrier_nospec_asm) ::: > "memory") > > +#elif defined(CONFIG_PPC_FSL_BOOK3E) > +/* > + * Prevent the execution of subsequent instructions speculatively using a > + * isync;sync instruction sequence. > + */ > +#define barrier_nospec_asm NOSPEC_BARRIER_FIXUP_SECTION; nop; nop > + > +// This also acts as a compiler barrier due to the memory clobber. > +#define barrier_nospec() asm (stringify_in_c(barrier_nospec_asm) ::: > "memory") > + > #else /* !CONFIG_PPC_BOOK3S_64 */ > #define barrier_nospec_asm > #define barrier_nospec() If we have CONFIG_PPC_NOSPEC this can be done something like: #ifdef CONFIG_PPC_BOOK3S_64 #define NOSPEC_BARRIER_SLOT nop #elif defined(CONFIG_PPC_FSL_BOOK3E) #define NOSPEC_BARRIER_SLOT nop; nop #endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_NOSPEC /* * Prevent execution of subsequent instructions until preceding branches have * been fully resolved and are no longer executing speculatively. */ #define barrier_nospec_asm NOSPEC_BARRIER_FIXUP_SECTION; NOSPEC_BARRIER_SLOT // This also acts as a compiler barrier due to the memory clobber. 
#define barrier_nospec() asm (stringify_in_c(barrier_nospec_asm) ::: "memory") #else #define barrier_nospec_asm #define barrier_nospec() #endif /* CONFIG_PPC_NOSPEC */ > diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile > index 2b4c40b2..d9dee43 100644 > --- a/arch/powerpc/kernel/Makefile > +++ b/arch/powerpc/kernel/Makefile > @@ -76,7 +76,7 @@ endif > obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o > obj-$(CONFIG_MODULES)+= module.o module_$(BITS).o > obj-$(CONFIG_44x)+= cpu_setup_44x.o > -obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o > +obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o security.o > obj-$(CONFIG_PPC_DOORBELL) += dbell.o > obj-$(CONFIG_JUMP_LABEL) += jump_label.o Can we instead do: obj-$(CONFIG_PPC_NOSPEC)+= security.o > diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c > index c55e102..797c975 100644 > --- a/arch/powerpc/kernel/security.c > +++ b/arch/powerpc/kernel/security.c > @@ -13,7 +13,9 @@ > #include > > > +#ifdef CONFIG_PPC_BOOK3S_64 > unsigned long powerpc_security_features __read_mostly = SEC_FTR_DEFAULT; > +#endif /* CONFIG_PPC_BOOK3S_64 */ Why are you making that book3s specific? By doing that you then can't use the existing versions of setup_barrier_nospec() and cpu_show_spectre_v1/v2(). > @@ -24,6 +26,7 @@ static void enable_barrier_nospec(bool enable) > do_barrier_nospec_fixups(enable); > } > > +#ifdef CONFIG_PPC_BOOK3S_64 > void setup_barrier_nospec(void) > { > bool enable; > @@ -46,6 +49,15 @@ void setup_barrier_nospec(void) > if (!no_nospec) > enable_barrier_nospec(enable); > } > +#endif /* CONFIG_PPC_BOOK3S_64 */ > + > +#ifdef CONFIG_PPC_FSL_BOOK3E > +void setup_barrier_nospec(void) > +{ > + if (!no_nospec) > + enable_barrier_nospec(true); > +} > +#endif /* CONFIG_PPC_FSL_BOOK3E */ eg. 
that is identical to the existing version except for the feature check: enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR); Both those bits are enabled by default, so you shouldn't need to elide that check. So basically you should be able to use the existing setup_barrier_nospec() if you just remove the ifdef around powerpc_security_features. Or am I missing something? > diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c > index 7445748..80c1e6e 100644 > --- a/arch/powerpc/kernel/setup_32.c > +++ b/arch/powerpc/kernel/setup_32.c > @@ -116,6 +116,11 @@ notrace void __init machine_init(u64 dt_ptr) > /* Do some early initialization based on the flat device tree */ > early_init_devtree(__va(dt_ptr)); > > + /* Apply the speculation barrier fixup */ > +#ifdef CONFIG_PPC_FSL_BOOK3E > + setup_barrier_nospec(); > +#endif
Re: [PATCH v2 3/3] powerpc/fsl: Implement cpu_show_spectre_v1/v2 for NXP PowerPC Book3E
Michal Suchánek writes: > On Tue, 12 Jun 2018 02:59:11 + > Bharat Bhushan wrote: > >> Hi Diana, >> >> > -Original Message- >> > From: Diana Craciun [mailto:diana.crac...@nxp.com] >> > Sent: Monday, June 11, 2018 6:23 PM >> > To: linuxppc-dev@lists.ozlabs.org >> > Cc: m...@ellerman.id.au; o...@buserror.net; Leo Li >> > ; Bharat Bhushan ; >> > Diana Madalina Craciun >> > Subject: [PATCH v2 3/3] powerpc/fsl: Implement >> > cpu_show_spectre_v1/v2 for NXP PowerPC Book3E >> >> Please add some description > > To me the subject is self-explanatory. It implements a kernel interface > that was already described elsewhere. > > What are you missing here? It should at least explain why it's reimplementing a function that already exists for powerpc. ie. Why can't the existing version be used? cheers
Re: [PATCH v2 1/3] powerpc/fsl: Disable the speculation barrier from the command line
Diana Craciun writes: > The speculation barrier can be disabled from the command line > with the parameter: "nospectre_v1". Can you please send another patch adding it to Documentation/admin-guide/kernel-parameters.txt You should mark it as "PPC" only for now. cheers
Re: 83a092cf95f28: powerpc: Link warning for orphan sections
On Tue, 3 Jul 2018 09:03:46 +0200 Mathieu Malaterre wrote: > Hi Nick, > > I am building my kernel (ppc32) with both > CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y and CONFIG_UBSAN=y. This leads > to ~316428 warnings such as: > > + powerpc-linux-gnu-ld -EB -m elf32ppc -Bstatic --orphan-handling=warn > --build-id --gc-sections -X -o .tmp_vmlinux1 -T > ./arch/powerpc/kernel/vmlinux.lds --who > le-archive built-in.a --no-whole-archive --start-group lib/lib.a --end-group > powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data393' > from `init/main.o' being placed in section `.data..Lubsan_data393'. > powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data394' > from `init/main.o' being placed in section `.data..Lubsan_data394'. > powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data395' > from `init/main.o' being placed in section `.data..Lubsan_data395'. > powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data396' > from `init/main.o' being placed in section `.data..Lubsan_data396'. > ... > > What would you recommend to reduce the number of warnings produced ? I > tried `--warn-once` but that only affect undefined symbols. I'm not sure if the linker can be quietened. You could try putting *(.data..Lubsan_data*) into the .data section in powerpc/kernel/vmlinux.lds.S Thanks, Nick
Re: [PATCH v5 5/7] powerpc/pseries: flush SLB contents on SLB MCE errors.
On 07/03/2018 03:38 AM, Nicholas Piggin wrote: > On Mon, 02 Jul 2018 11:17:06 +0530 > Mahesh J Salgaonkar wrote: > >> From: Mahesh Salgaonkar >> >> On pseries, as of today system crashes if we get a machine check >> exceptions due to SLB errors. These are soft errors and can be fixed by >> flushing the SLBs so the kernel can continue to function instead of >> system crash. We do this in real mode before turning on MMU. Otherwise >> we would run into nested machine checks. This patch now fetches the >> rtas error log in real mode and flushes the SLBs on SLB errors. >> >> Signed-off-by: Mahesh Salgaonkar >> --- >> arch/powerpc/include/asm/book3s/64/mmu-hash.h |1 >> arch/powerpc/include/asm/machdep.h|1 >> arch/powerpc/kernel/exceptions-64s.S | 42 + >> arch/powerpc/kernel/mce.c | 16 +++- >> arch/powerpc/mm/slb.c |6 +++ >> arch/powerpc/platforms/powernv/opal.c |1 >> arch/powerpc/platforms/pseries/pseries.h |1 >> arch/powerpc/platforms/pseries/ras.c | 51 >> + >> arch/powerpc/platforms/pseries/setup.c|1 >> 9 files changed, 116 insertions(+), 4 deletions(-) >> > > >> +TRAMP_REAL_BEGIN(machine_check_pSeries_early) >> +BEGIN_FTR_SECTION >> +EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) >> +mr r10,r1 /* Save r1 */ >> +ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */ >> +subir1,r1,INT_FRAME_SIZE/* alloc stack frame*/ >> +mfspr r11,SPRN_SRR0 /* Save SRR0 */ >> +mfspr r12,SPRN_SRR1 /* Save SRR1 */ >> +EXCEPTION_PROLOG_COMMON_1() >> +EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) >> +EXCEPTION_PROLOG_COMMON_3(0x200) >> +addir3,r1,STACK_FRAME_OVERHEAD >> +BRANCH_LINK_TO_FAR(machine_check_early) /* Function call ABI */ > > Is there any reason you can't use the existing > machine_check_powernv_early code to do all this? I did think about that :-). But the machine_check_powernv_early code does bit of extra stuff which isn't required in pseries like touching ME bit in MSR and lots of checks that are done in machine_check_handle_early() before going to virtual mode. 
But on second look I see that we can bypass all that with HVMODE FTR section. Will rename machine_check_powernv_early to machine_check_common_early and reuse it. > >> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c >> index efdd16a79075..221271c96a57 100644 >> --- a/arch/powerpc/kernel/mce.c >> +++ b/arch/powerpc/kernel/mce.c >> @@ -488,9 +488,21 @@ long machine_check_early(struct pt_regs *regs) >> { >> long handled = 0; >> >> -__this_cpu_inc(irq_stat.mce_exceptions); >> +/* >> + * For pSeries we count mce when we go into virtual mode machine >> + * check handler. Hence skip it. Also, We can't access per cpu >> + * variables in real mode for LPAR. >> + */ >> +if (early_cpu_has_feature(CPU_FTR_HVMODE)) >> +__this_cpu_inc(irq_stat.mce_exceptions); >> >> -if (cur_cpu_spec && cur_cpu_spec->machine_check_early) >> +/* >> + * See if platform is capable of handling machine check. >> + * Otherwise fallthrough and allow CPU to handle this machine check. >> + */ >> +if (ppc_md.machine_check_early) >> +handled = ppc_md.machine_check_early(regs); >> +else if (cur_cpu_spec && cur_cpu_spec->machine_check_early) >> handled = cur_cpu_spec->machine_check_early(regs); > > Would be good to add a powernv ppc_md handler which does the > cur_cpu_spec->machine_check_early() call now that other platforms are > calling this code. Because those aren't valid as a fallback call, but > specific to powernv. > >> diff --git a/arch/powerpc/platforms/powernv/opal.c >> b/arch/powerpc/platforms/powernv/opal.c >> index 48fbb41af5d1..ed548d40a9e1 100644 >> --- a/arch/powerpc/platforms/powernv/opal.c >> +++ b/arch/powerpc/platforms/powernv/opal.c >> @@ -417,7 +417,6 @@ static int opal_recover_mce(struct pt_regs *regs, >> >> if (!(regs->msr & MSR_RI)) { >> /* If MSR_RI isn't set, we cannot recover */ >> -pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); > > What's the reason for this change? Err... This is by mistake.. My bad. Thanks for catching this. 
Will remove this hunk in next revision. We need a similar print for pSeries in ras.c. > >> recovered = 0; >> } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { >> /* Platform corrected itself */ >> diff --git a/arch/powerpc/platforms/pseries/pseries.h >> b/arch/powerpc/platforms/pseries/pseries.h >> index 60db2ee511fb..3611db5dd583 100644 >> --- a/arch/powerpc/platforms/pseries/pseries.h >> +++ b/arch/powerpc/platforms/pseries/pseries.h >> @@ -24,6 +24,7 @@ struct pt_regs; >> >> extern int pSeries_system_reset_exception(struct pt_regs
83a092cf95f28: powerpc: Link warning for orphan sections
Hi Nick, I am building my kernel (ppc32) with both CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y and CONFIG_UBSAN=y. This leads to ~316428 warnings such as: + powerpc-linux-gnu-ld -EB -m elf32ppc -Bstatic --orphan-handling=warn --build-id --gc-sections -X -o .tmp_vmlinux1 -T ./arch/powerpc/kernel/vmlinux.lds --who le-archive built-in.a --no-whole-archive --start-group lib/lib.a --end-group powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data393' from `init/main.o' being placed in section `.data..Lubsan_data393'. powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data394' from `init/main.o' being placed in section `.data..Lubsan_data394'. powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data395' from `init/main.o' being placed in section `.data..Lubsan_data395'. powerpc-linux-gnu-ld: warning: orphan section `.data..Lubsan_data396' from `init/main.o' being placed in section `.data..Lubsan_data396'. ... What would you recommend to reduce the number of warnings produced ? I tried `--warn-once` but that only affect undefined symbols. Thanks
[PATCHv3 4/4] Revert "driver core: correct device's shutdown order"
This reverts commit 52cdbdd49853dfa856082edb0f4c4c0249d9df07. Since the device_shutdown() does not rely on the order in devices_kset any more, reverting that commit safely. Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Cc: Grygorii Strashko Cc: Christoph Hellwig Cc: Bjorn Helgaas Cc: Dave Young Cc: linux-...@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Pingfan Liu --- drivers/base/base.h | 1 - drivers/base/core.c | 49 - drivers/base/dd.c | 8 3 files changed, 58 deletions(-) diff --git a/drivers/base/base.h b/drivers/base/base.h index a75c302..5bc9064 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -135,7 +135,6 @@ extern void device_unblock_probing(void); /* /sys/devices directory */ extern struct kset *devices_kset; -extern void devices_kset_move_last(struct device *dev); #if defined(CONFIG_MODULES) && defined(CONFIG_SYSFS) extern void module_add_driver(struct module *mod, struct device_driver *drv); diff --git a/drivers/base/core.c b/drivers/base/core.c index db3deb8..570aeee 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1259,52 +1259,6 @@ static DEVICE_ATTR_RO(dev); struct kset *devices_kset; /** - * devices_kset_move_before - Move device in the devices_kset's list. - * @deva: Device to move. - * @devb: Device @deva should come before. - */ -static void devices_kset_move_before(struct device *deva, struct device *devb) -{ - if (!devices_kset) - return; - pr_debug("devices_kset: Moving %s before %s\n", -dev_name(deva), dev_name(devb)); - spin_lock(&devices_kset->list_lock); - list_move_tail(&deva->kobj.entry, &devb->kobj.entry); - spin_unlock(&devices_kset->list_lock); -} - -/** - * devices_kset_move_after - Move device in the devices_kset's list. - * @deva: Device to move - * @devb: Device @deva should come after. 
- */ -static void devices_kset_move_after(struct device *deva, struct device *devb) -{ - if (!devices_kset) - return; - pr_debug("devices_kset: Moving %s after %s\n", -dev_name(deva), dev_name(devb)); - spin_lock(&devices_kset->list_lock); - list_move(&deva->kobj.entry, &devb->kobj.entry); - spin_unlock(&devices_kset->list_lock); -} - -/** - * devices_kset_move_last - move the device to the end of devices_kset's list. - * @dev: device to move - */ -void devices_kset_move_last(struct device *dev) -{ - if (!devices_kset) - return; - pr_debug("devices_kset: Moving %s to end of list\n", dev_name(dev)); - spin_lock(&devices_kset->list_lock); - list_move_tail(&dev->kobj.entry, &devices_kset->list); - spin_unlock(&devices_kset->list_lock); -} - -/** * device_create_file - create sysfs attribute file for device. * @dev: device. * @attr: device attribute descriptor. @@ -2776,15 +2730,12 @@ int device_move(struct device *dev, struct device *new_parent, break; case DPM_ORDER_DEV_AFTER_PARENT: device_pm_move_after(dev, new_parent); - devices_kset_move_after(dev, new_parent); break; case DPM_ORDER_PARENT_BEFORE_DEV: device_pm_move_before(new_parent, dev); - devices_kset_move_before(new_parent, dev); break; case DPM_ORDER_DEV_LAST: device_pm_move_last(dev); - devices_kset_move_last(dev); break; } diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 1435d72..6ebcd65 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -434,14 +434,6 @@ static int really_probe(struct device *dev, struct device_driver *drv) goto probe_failed; } - /* -* Ensure devices are listed in devices_kset in correct order -* It's important to move Dev to the end of devices_kset before -* calling .probe, because it could be recursive and parent Dev -* should always go first -*/ - devices_kset_move_last(dev); - if (dev->bus->probe) { ret = dev->bus->probe(dev); if (ret) -- 2.7.4