Re: [PATCH 14/18] powerpc/kprobes: Support kprobes on prefixed instructions
On Tue, Nov 26, 2019 at 04:21:37PM +1100, Jordan Niethe wrote: > A prefixed instruction is composed of a word prefix followed by a word > suffix. It does not make sense to be able to have a kprobe on the suffix > of a prefixed instruction, so make this impossible. > > Kprobes work by replacing an instruction with a trap and saving that > instruction to be single stepped out of place later. Currently there is > not enough space allocated to keep a prefixed instruction for single > stepping. Increase the amount of space allocated for holding the > instruction copy. > > kprobe_post_handler() expects all instructions to be 4 bytes long which > means that it does not function correctly for prefixed instructions. > Add checks for prefixed instructions which will use a length of 8 bytes > instead. > > For optprobes we normally patch in loading the instruction we put a > probe on into r4 before calling emulate_step(). We now make space and > patch in loading the suffix into r5 as well. > > Signed-off-by: Jordan Niethe > --- > arch/powerpc/include/asm/kprobes.h | 5 +-- > arch/powerpc/kernel/kprobes.c| 46 +--- > arch/powerpc/kernel/optprobes.c | 31 +++ > arch/powerpc/kernel/optprobes_head.S | 6 > 4 files changed, 62 insertions(+), 26 deletions(-) > > diff --git a/arch/powerpc/include/asm/kprobes.h > b/arch/powerpc/include/asm/kprobes.h > index 66b3f2983b22..1f03a1cacb1e 100644 > --- a/arch/powerpc/include/asm/kprobes.h > +++ b/arch/powerpc/include/asm/kprobes.h > @@ -38,12 +38,13 @@ extern kprobe_opcode_t optprobe_template_entry[]; > extern kprobe_opcode_t optprobe_template_op_address[]; > extern kprobe_opcode_t optprobe_template_call_handler[]; > extern kprobe_opcode_t optprobe_template_insn[]; > +extern kprobe_opcode_t optprobe_template_sufx[]; > extern kprobe_opcode_t optprobe_template_call_emulate[]; > extern kprobe_opcode_t optprobe_template_ret[]; > extern kprobe_opcode_t optprobe_template_end[]; > > -/* Fixed instruction size for powerpc */ > -#define MAX_INSN_SIZE1 > 
+/* Prefixed instructions are two words */ > +#define MAX_INSN_SIZE2 > #define MAX_OPTIMIZED_LENGTH sizeof(kprobe_opcode_t) /* 4 bytes */ > #define MAX_OPTINSN_SIZE (optprobe_template_end - > optprobe_template_entry) > #define RELATIVEJUMP_SIZEsizeof(kprobe_opcode_t) /* 4 bytes */ > diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c > index 7303fe3856cc..aa15b3480385 100644 > --- a/arch/powerpc/kernel/kprobes.c > +++ b/arch/powerpc/kernel/kprobes.c > @@ -104,17 +104,30 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, > unsigned int offset) > > int arch_prepare_kprobe(struct kprobe *p) > { > + int len; > int ret = 0; > + struct kprobe *prev; > kprobe_opcode_t insn = *p->addr; > + kprobe_opcode_t prfx = *(p->addr - 1); > > + preempt_disable(); > if ((unsigned long)p->addr & 0x03) { > printk("Attempt to register kprobe at an unaligned address\n"); > ret = -EINVAL; > } else if (IS_MTMSRD(insn) || IS_RFID(insn) || IS_RFI(insn)) { > printk("Cannot register a kprobe on rfi/rfid or mtmsr[d]\n"); > ret = -EINVAL; > + } else if (IS_PREFIX(prfx)) { > + printk("Cannot register a kprobe on the second word of prefixed > instruction\n"); Let's keep lines within 80 columns. > + ret = -EINVAL; > + } > + prev = get_kprobe(p->addr - 1); > + if (prev && IS_PREFIX(*prev->ainsn.insn)) { > + printk("Cannot register a kprobe on the second word of prefixed > instruction\n"); same here. -- Bala > + ret = -EINVAL; > } > > + > /* insn must be on a special executable page on ppc64. 
This is >* not explicitly required on ppc32 (right now), but it doesn't hurt */ > if (!ret) { > @@ -124,14 +137,18 @@ int arch_prepare_kprobe(struct kprobe *p) > } > > if (!ret) { > - memcpy(p->ainsn.insn, p->addr, > - MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); > + if (IS_PREFIX(insn)) > + len = MAX_INSN_SIZE * sizeof(kprobe_opcode_t); > + else > + len = sizeof(kprobe_opcode_t); > + memcpy(p->ainsn.insn, p->addr, len); > p->opcode = *p->addr; > flush_icache_range((unsigned long)p->ainsn.insn, > (unsigned long)p->ainsn.insn + sizeof(kprobe_opcode_t)); > } > > p->ainsn.boostable = 0; > + preempt_enable_no_resched(); > return ret; > } > NOKPROBE_SYMBOL(arch_prepare_kprobe); > @@ -216,10 +233,11 @@ NOKPROBE_SYMBOL(arch_prepare_kretprobe); > static int try_to_emulate(struct kprobe *p, struct pt_regs *regs) > { > int ret; > - unsigned int insn = *p->ainsn.insn; > + unsigned int insn = p->ainsn.insn[0]; > + unsigned int su
[PATCH v2] powerpc/ptdump: don't entirely rebuild kernel when selecting CONFIG_PPC_DEBUG_WX
Selecting CONFIG_PPC_DEBUG_WX only impacts ptdump and pgtable_32/64 init calls. Declaring related functions in asm/pgtable.h implies rebuilding almost everything. Move ptdump_check_wx() declaration in mm/mmu_decl.h Signed-off-by: Christophe Leroy --- v2: use mm/mmu_decl.h instead of a new asm/ptdump.h --- arch/powerpc/include/asm/pgtable.h | 6 -- arch/powerpc/mm/mmu_decl.h | 6 ++ arch/powerpc/mm/ptdump/ptdump.c| 2 ++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 0e4ec8cc37b7..8cc543ed114c 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -94,12 +94,6 @@ void mark_initmem_nx(void); static inline void mark_initmem_nx(void) { } #endif -#ifdef CONFIG_PPC_DEBUG_WX -void ptdump_check_wx(void); -#else -static inline void ptdump_check_wx(void) { } -#endif - /* * When used, PTE_FRAG_NR is defined in subarch pgtable.h * so we are sure it is included when arriving here. diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 8e99649c24fc..7097e07a209a 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -181,3 +181,9 @@ void mmu_mark_rodata_ro(void); static inline void mmu_mark_initmem_nx(void) { } static inline void mmu_mark_rodata_ro(void) { } #endif + +#ifdef CONFIG_PPC_DEBUG_WX +void ptdump_check_wx(void); +#else +static inline void ptdump_check_wx(void) { } +#endif diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 2f9ddc29c535..4af0d5d9589e 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -24,6 +24,8 @@ #include #include +#include + #include "ptdump.h" /* -- 2.13.3
Re: [PATCH -next] powerpc/pmac/smp: Fix old-style declaration
Christophe Leroy writes: > YueHaibing a écrit : > >> There expect the 'static' keyword to come first in a declaration >> >> arch/powerpc/platforms/powermac/smp.c:664:1: warning: static is not >> at beginning of declaration [-Wold-style-declaration] >> arch/powerpc/platforms/powermac/smp.c:665:1: warning: static is not >> at beginning of declaration [-Wold-style-declaration] >> >> Signed-off-by: YueHaibing >> --- >> arch/powerpc/platforms/powermac/smp.c | 4 ++-- >> 1 file changed, 2 insertions(+), 2 deletions(-) >> >> diff --git a/arch/powerpc/platforms/powermac/smp.c >> b/arch/powerpc/platforms/powermac/smp.c >> index f95fbde..7233b85 100644 >> --- a/arch/powerpc/platforms/powermac/smp.c >> +++ b/arch/powerpc/platforms/powermac/smp.c >> @@ -661,8 +661,8 @@ static void smp_core99_gpio_tb_freeze(int freeze) >> #endif /* !CONFIG_PPC64 */ >> >> /* L2 and L3 cache settings to pass from CPU0 to CPU1 on G4 cpus */ >> -volatile static long int core99_l2_cache; >> -volatile static long int core99_l3_cache; >> +static volatile long int core99_l2_cache; >> +static volatile long int core99_l3_cache; > > Is it correct to declare it as volatile ? I don't see any reason why it needs to be volatile, so I think we can just remove that? cheers
Re: [PATCH 5/5] powernv/pci: Move pnv_pci_dma_bus_setup() to pci-ioda.c
On 10/01/2020 18:02, Oliver O'Halloran wrote: > This is only used in pci-ioda.c so move it there and rename it to match. > > Signed-off-by: Oliver O'Halloran Reviewed-by: Alexey Kardashevskiy > --- > arch/powerpc/platforms/powernv/pci-ioda.c | 22 +- > arch/powerpc/platforms/powernv/pci.c | 20 > arch/powerpc/platforms/powernv/pci.h | 1 - > 3 files changed, 21 insertions(+), 22 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c > b/arch/powerpc/platforms/powernv/pci-ioda.c > index e2a9440..4701621 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -3628,9 +3628,29 @@ static void pnv_pci_ioda_shutdown(struct > pci_controller *hose) > OPAL_ASSERT_RESET); > } > > +static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus) > +{ > + struct pci_controller *hose = bus->sysdata; > + struct pnv_phb *phb = hose->private_data; > + struct pnv_ioda_pe *pe; > + > + list_for_each_entry(pe, &phb->ioda.pe_list, list) { > + if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))) > + continue; > + > + if (!pe->pbus) > + continue; > + > + if (bus->number == ((pe->rid >> 8) & 0xFF)) { > + pe->pbus = bus; > + break; > + } > + } > +} > + > static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { > .dma_dev_setup = pnv_pci_ioda_dma_dev_setup, > - .dma_bus_setup = pnv_pci_dma_bus_setup, > + .dma_bus_setup = pnv_pci_ioda_dma_bus_setup, > .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported, > .setup_msi_irqs = pnv_setup_msi_irqs, > .teardown_msi_irqs = pnv_teardown_msi_irqs, > diff --git a/arch/powerpc/platforms/powernv/pci.c > b/arch/powerpc/platforms/powernv/pci.c > index 31f1949..a17bc1a1 100644 > --- a/arch/powerpc/platforms/powernv/pci.c > +++ b/arch/powerpc/platforms/powernv/pci.c > @@ -810,26 +810,6 @@ struct iommu_table *pnv_pci_table_alloc(int nid) > return tbl; > } > > -void pnv_pci_dma_bus_setup(struct pci_bus *bus) > -{ > - struct pci_controller *hose = bus->sysdata; > - 
struct pnv_phb *phb = hose->private_data; > - struct pnv_ioda_pe *pe; > - > - list_for_each_entry(pe, &phb->ioda.pe_list, list) { > - if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))) > - continue; > - > - if (!pe->pbus) > - continue; > - > - if (bus->number == ((pe->rid >> 8) & 0xFF)) { > - pe->pbus = bus; > - break; > - } > - } > -} > - > struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev) > { > struct pci_controller *hose = pci_bus_to_host(dev->bus); > diff --git a/arch/powerpc/platforms/powernv/pci.h > b/arch/powerpc/platforms/powernv/pci.h > index 0cdc9ba..d3bbdea 100644 > --- a/arch/powerpc/platforms/powernv/pci.h > +++ b/arch/powerpc/platforms/powernv/pci.h > @@ -188,7 +188,6 @@ extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, > unsigned long msr); > extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); > extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); > > -extern void pnv_pci_dma_bus_setup(struct pci_bus *bus); > extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); > extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); > extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); > -- Alexey
Re: [PATCH 4/5] powernv/pci: Fold pnv_pci_dma_dev_setup() into the pci-ioda.c version
On 10/01/2020 18:02, Oliver O'Halloran wrote: > pnv_pci_dma_dev_setup() does nothing but call the phb->dma_dev_setup() > callback, if one exists. That callback is only set for normal PCIe PHBs so > we can remove the layer of indirection and use the ioda version in > the pci_controller_ops. > > Signed-off-by: Oliver O'Halloran Reviewed-by: Alexey Kardashevskiy > --- > arch/powerpc/platforms/powernv/pci-ioda.c | 7 --- > arch/powerpc/platforms/powernv/pci.c | 9 - > arch/powerpc/platforms/powernv/pci.h | 2 -- > 3 files changed, 4 insertions(+), 14 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c > b/arch/powerpc/platforms/powernv/pci-ioda.c > index ae177ee..e2a9440 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -1743,8 +1743,10 @@ int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 > num_vfs) > } > #endif /* CONFIG_PCI_IOV */ > > -static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev > *pdev) > +static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev) > { > + struct pci_controller *hose = pci_bus_to_host(pdev->bus); > + struct pnv_phb *phb = hose->private_data; > struct pci_dn *pdn = pci_get_pdn(pdev); > struct pnv_ioda_pe *pe; > > @@ -3627,7 +3629,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller > *hose) > } > > static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { > - .dma_dev_setup = pnv_pci_dma_dev_setup, > + .dma_dev_setup = pnv_pci_ioda_dma_dev_setup, > .dma_bus_setup = pnv_pci_dma_bus_setup, > .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported, > .setup_msi_irqs = pnv_setup_msi_irqs, > @@ -3886,7 +3888,6 @@ static void __init pnv_pci_init_ioda_phb(struct > device_node *np, > hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops; > break; > default: > - phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; > hose->controller_ops = pnv_pci_ioda_controller_ops; > } > > diff --git 
a/arch/powerpc/platforms/powernv/pci.c > b/arch/powerpc/platforms/powernv/pci.c > index 8307e1f..31f1949 100644 > --- a/arch/powerpc/platforms/powernv/pci.c > +++ b/arch/powerpc/platforms/powernv/pci.c > @@ -810,15 +810,6 @@ struct iommu_table *pnv_pci_table_alloc(int nid) > return tbl; > } > > -void pnv_pci_dma_dev_setup(struct pci_dev *pdev) > -{ > - struct pci_controller *hose = pci_bus_to_host(pdev->bus); > - struct pnv_phb *phb = hose->private_data; > - > - if (phb && phb->dma_dev_setup) > - phb->dma_dev_setup(phb, pdev); > -} > - > void pnv_pci_dma_bus_setup(struct pci_bus *bus) > { > struct pci_controller *hose = bus->sysdata; > diff --git a/arch/powerpc/platforms/powernv/pci.h > b/arch/powerpc/platforms/powernv/pci.h > index f914f0b..0cdc9ba 100644 > --- a/arch/powerpc/platforms/powernv/pci.h > +++ b/arch/powerpc/platforms/powernv/pci.h > @@ -108,7 +108,6 @@ struct pnv_phb { > int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev, >unsigned int hwirq, unsigned int virq, >unsigned int is_64, struct msi_msg *msg); > - void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); > int (*init_m64)(struct pnv_phb *phb); > int (*get_pe_state)(struct pnv_phb *phb, int pe_no); > void (*freeze_pe)(struct pnv_phb *phb, int pe_no); > @@ -189,7 +188,6 @@ extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, > unsigned long msr); > extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); > extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); > > -extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev); > extern void pnv_pci_dma_bus_setup(struct pci_bus *bus); > extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); > extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); > -- Alexey
Re: [PATCH 3/5] powerpc/iov: Move VF pdev fixup into pcibios_fixup_iov()
On 10/01/2020 18:02, Oliver O'Halloran wrote: > An ioda_pe for each VF is allocated in pnv_pci_sriov_enable() before the > pci_dev for the VF is created. We need to set the pe->pdev pointer at > some point after the pci_dev is created. Currently we do that in: > > pcibios_bus_add_device() > pnv_pci_dma_dev_setup() (via phb->ops.dma_dev_setup) > /* fixup is done here */ > pnv_pci_ioda_dma_dev_setup() (via pnv_phb->dma_dev_setup) > > The fixup needs to be done before setting up DMA for for the VF's PE, but > there's no real reason to delay it until this point. Move the fixup into > pnv_pci_ioda_fixup_iov() so the ordering is: > > pcibios_add_device() > pnv_pci_ioda_fixup_iov() (via ppc_md.pcibios_fixup_sriov) > > pcibios_bus_add_device() > ... > > This isn't strictly required, but it's slightly a slightly more logical s/slightly a slightly/slightly/ ? Reviewed-by: Alexey Kardashevskiy > place to do the fixup and it simplifies pnv_pci_dma_dev_setup(). > > Signed-off-by: Oliver O'Halloran > --- > arch/powerpc/platforms/powernv/pci-ioda.c | 29 + > arch/powerpc/platforms/powernv/pci.c | 14 -- > 2 files changed, 25 insertions(+), 18 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c > b/arch/powerpc/platforms/powernv/pci-ioda.c > index 33d4039..ae177ee 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -2900,9 +2900,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct > pci_dev *pdev) > struct pci_dn *pdn; > int mul, total_vfs; > > - if (!pdev->is_physfn || pci_dev_is_added(pdev)) > - return; > - > pdn = pci_get_pdn(pdev); > pdn->vfs_expanded = 0; > pdn->m64_single_mode = false; > @@ -2977,6 +2974,30 @@ static void pnv_pci_ioda_fixup_iov_resources(struct > pci_dev *pdev) > res->end = res->start - 1; > } > } > + > +static void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev) > +{ > + if (WARN_ON(pci_dev_is_added(pdev))) > + return; > + > + if (pdev->is_virtfn) { > + struct pnv_ioda_pe *pe = 
pnv_ioda_get_pe(pdev); > + > + /* > + * VF PEs are single-device PEs so their pdev pointer needs to > + * be set. The pdev doesn't exist when the PE is allocated (in > + * (pcibios_sriov_enable()) so we fix it up here. > + */ > + pe->pdev = pdev; > + WARN_ON(!(pe->flags & PNV_IODA_PE_VF)); > + } else if (pdev->is_physfn) { > + /* > + * For PFs adjust their allocated IOV resources to match what > + * the PHB can support using it's M64 BAR table. > + */ > + pnv_pci_ioda_fixup_iov_resources(pdev); > + } > +} > #endif /* CONFIG_PCI_IOV */ > > static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, > @@ -3872,7 +3893,7 @@ static void __init pnv_pci_init_ioda_phb(struct > device_node *np, > ppc_md.pcibios_default_alignment = pnv_pci_default_alignment; > > #ifdef CONFIG_PCI_IOV > - ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources; > + ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov; > ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment; > ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable; > ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable; > diff --git a/arch/powerpc/platforms/powernv/pci.c > b/arch/powerpc/platforms/powernv/pci.c > index e8e58a2c..8307e1f 100644 > --- a/arch/powerpc/platforms/powernv/pci.c > +++ b/arch/powerpc/platforms/powernv/pci.c > @@ -814,20 +814,6 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev) > { > struct pci_controller *hose = pci_bus_to_host(pdev->bus); > struct pnv_phb *phb = hose->private_data; > -#ifdef CONFIG_PCI_IOV > - struct pnv_ioda_pe *pe; > - > - /* Fix the VF pdn PE number */ > - if (pdev->is_virtfn) { > - list_for_each_entry(pe, &phb->ioda.pe_list, list) { > - if (pe->rid == ((pdev->bus->number << 8) | > - (pdev->devfn & 0xff))) { > - pe->pdev = pdev; > - break; > - } > - } > - } > -#endif /* CONFIG_PCI_IOV */ > > if (phb && phb->dma_dev_setup) > phb->dma_dev_setup(phb, pdev); > -- Alexey
Re: [PATCH 1/5] powerpc/pci: Fold pcibios_setup_device() into pcibios_bus_add_device()
On 10/01/2020 18:02, Oliver O'Halloran wrote: > pcibios_bus_add_device() is the only caller of pcibios_setup_device(). > Fold them together since there's no real reason to keep them separate. > > Signed-off-by: Oliver O'Halloran Reviewed-by: Alexey Kardashevskiy > --- > arch/powerpc/kernel/pci-common.c | 10 ++ > 1 file changed, 2 insertions(+), 8 deletions(-) > > diff --git a/arch/powerpc/kernel/pci-common.c > b/arch/powerpc/kernel/pci-common.c > index f8a59d7..c6c0341 100644 > --- a/arch/powerpc/kernel/pci-common.c > +++ b/arch/powerpc/kernel/pci-common.c > @@ -958,7 +958,7 @@ void pcibios_setup_bus_self(struct pci_bus *bus) > phb->controller_ops.dma_bus_setup(bus); > } > > -static void pcibios_setup_device(struct pci_dev *dev) > +void pcibios_bus_add_device(struct pci_dev *dev) > { > struct pci_controller *phb; > /* Fixup NUMA node as it may not be setup yet by the generic > @@ -979,15 +979,9 @@ static void pcibios_setup_device(struct pci_dev *dev) > pci_read_irq_line(dev); > if (ppc_md.pci_irq_fixup) > ppc_md.pci_irq_fixup(dev); > -} > - > -void pcibios_bus_add_device(struct pci_dev *pdev) > -{ > - /* Perform platform-specific device setup */ > - pcibios_setup_device(pdev); > > if (ppc_md.pcibios_bus_add_device) > - ppc_md.pcibios_bus_add_device(pdev); > + ppc_md.pcibios_bus_add_device(dev); > } > > int pcibios_add_device(struct pci_dev *dev) > -- Alexey
Re: [PATCH] evh_bytechan: fix out of bounds accesses
Hi Timur, On Mon, 13 Jan 2020 19:10:11 -0600 Timur Tabi wrote: > > On 1/13/20 2:25 PM, Stephen Rothwell wrote: > > The problem is not really the declaration, the problem is that > > ev_byte_channel_send always accesses 16 bytes from the buffer and it is > > not always passed a buffer that long (in one case it is passed a > > pointer to a single byte). So the alternative to the memcpy approach I > > have take is to complicate ev_byte_channel_send so that only accesses > > count bytes from the buffer. > > Ah, I see now. This is all coming back to me. > > I would prefer that ev_byte_channel_send() is updated to access only > 'count' bytes. If that means adding a memcpy to the > ev_byte_channel_send() itself, then so be it. Trying to figure out how > to stuff n bytes into 4 32-bit registers is probably not worth the effort. Like this (I have compile tested this): From: Stephen Rothwell Date: Thu, 9 Jan 2020 18:23:48 +1100 Subject: [PATCH v2] evh_bytechan: fix out of bounds accesses ev_byte_channel_send() assumes that its third argument is a 16 byte array. Some places where it is called it may not be (or we can't easily tell if it is). Newer compilers have started producing warnings about this, so copy the bytes to send into a local array if the passed length is too short. Since all the calls of ev_byte_channel_send() are in one file, let's move it there from the header file and let the compiler decide if it wants to inline it. 
The warnings are: In file included from include/linux/byteorder/big_endian.h:5, from arch/powerpc/include/uapi/asm/byteorder.h:14, from include/asm-generic/bitops/le.h:6, from arch/powerpc/include/asm/bitops.h:250, from include/linux/bitops.h:29, from include/linux/kernel.h:12, from include/asm-generic/bug.h:19, from arch/powerpc/include/asm/bug.h:109, from include/linux/bug.h:5, from include/linux/mmdebug.h:5, from include/linux/gfp.h:5, from include/linux/slab.h:15, from drivers/tty/ehv_bytechan.c:24: drivers/tty/ehv_bytechan.c: In function ‘ehv_bc_udbg_putc’: arch/powerpc/include/asm/epapr_hcalls.h:298:20: warning: array subscript 1 is outside array bounds of ‘const char[1]’ [-Warray-bounds] 298 | r6 = be32_to_cpu(p[1]); include/uapi/linux/byteorder/big_endian.h:40:51: note: in definition of macro ‘__be32_to_cpu’ 40 | #define __be32_to_cpu(x) ((__force __u32)(__be32)(x)) | ^ arch/powerpc/include/asm/epapr_hcalls.h:298:7: note: in expansion of macro ‘be32_to_cpu’ 298 | r6 = be32_to_cpu(p[1]); | ^~~ drivers/tty/ehv_bytechan.c:166:13: note: while referencing ‘data’ 166 | static void ehv_bc_udbg_putc(char c) | ^~~~ In file included from include/linux/byteorder/big_endian.h:5, from arch/powerpc/include/uapi/asm/byteorder.h:14, from include/asm-generic/bitops/le.h:6, from arch/powerpc/include/asm/bitops.h:250, from include/linux/bitops.h:29, from include/linux/kernel.h:12, from include/asm-generic/bug.h:19, from arch/powerpc/include/asm/bug.h:109, from include/linux/bug.h:5, from include/linux/mmdebug.h:5, from include/linux/gfp.h:5, from include/linux/slab.h:15, from drivers/tty/ehv_bytechan.c:24: arch/powerpc/include/asm/epapr_hcalls.h:299:20: warning: array subscript 2 is outside array bounds of ‘const char[1]’ [-Warray-bounds] 299 | r7 = be32_to_cpu(p[2]); include/uapi/linux/byteorder/big_endian.h:40:51: note: in definition of macro ‘__be32_to_cpu’ 40 | #define __be32_to_cpu(x) ((__force __u32)(__be32)(x)) | ^ arch/powerpc/include/asm/epapr_hcalls.h:299:7: note: 
in expansion of macro ‘be32_to_cpu’ 299 | r7 = be32_to_cpu(p[2]); | ^~~ drivers/tty/ehv_bytechan.c:166:13: note: while referencing ‘data’ 166 | static void ehv_bc_udbg_putc(char c) | ^~~~ In file included from include/linux/byteorder/big_endian.h:5, from arch/powerpc/include/uapi/asm/byteorder.h:14, from include/asm-generic/bitops/le.h:6, from arch/powerpc/include/asm/bitops.h:250, from include/linux/bitops.h:29, from include/linux/kernel.h:12, from include/asm-generic/bug.h:19, from arch/powerpc/include/asm/bug.h:109, from include/linux/bug.h:5, from include/linux/mmdebug.h:5, from include/linux/gfp.h:5, from include/linux/slab.h:15, from drivers/tty/ehv_bytechan.c:24: arch
Re: [PATCH] powerpc/ptdump: don't entirely rebuild kernel when selecting CONFIG_PPC_DEBUG_WX
Christophe Leroy writes: > Selecting CONFIG_PPC_DEBUG_WX only impacts ptdump and pgtable_32/64 > init calls. Declaring related functions in asm/pgtable.h implies > rebuilding almost everything. > > Move ptdump_check_wx() declaration in a new dedicated header file. Can you put it in arch/powerpc/mm/mmu_decl.h ? That already exists for things that are private to arch/powerpc/mm, which this function is AFAICT. cheers > arch/powerpc/include/asm/pgtable.h | 6 -- > arch/powerpc/include/asm/ptdump.h | 15 +++ > arch/powerpc/mm/pgtable_32.c | 1 + > arch/powerpc/mm/pgtable_64.c | 1 + > arch/powerpc/mm/ptdump/ptdump.c| 1 + > 5 files changed, 18 insertions(+), 6 deletions(-) > create mode 100644 arch/powerpc/include/asm/ptdump.h > > diff --git a/arch/powerpc/include/asm/pgtable.h > b/arch/powerpc/include/asm/pgtable.h > index 0e4ec8cc37b7..8cc543ed114c 100644 > --- a/arch/powerpc/include/asm/pgtable.h > +++ b/arch/powerpc/include/asm/pgtable.h > @@ -94,12 +94,6 @@ void mark_initmem_nx(void); > static inline void mark_initmem_nx(void) { } > #endif > > -#ifdef CONFIG_PPC_DEBUG_WX > -void ptdump_check_wx(void); > -#else > -static inline void ptdump_check_wx(void) { } > -#endif > - > /* > * When used, PTE_FRAG_NR is defined in subarch pgtable.h > * so we are sure it is included when arriving here. 
> diff --git a/arch/powerpc/include/asm/ptdump.h > b/arch/powerpc/include/asm/ptdump.h > new file mode 100644 > index ..246b92c21729 > --- /dev/null > +++ b/arch/powerpc/include/asm/ptdump.h > @@ -0,0 +1,15 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#ifndef _ASM_POWERPC_PTDUMP_H > +#define _ASM_POWERPC_PTDUMP_H > + > +#ifndef __ASSEMBLY__ > + > +#ifdef CONFIG_PPC_DEBUG_WX > +void ptdump_check_wx(void); > +#else > +static inline void ptdump_check_wx(void) { } > +#endif > + > +#endif /* __ASSEMBLY__ */ > + > +#endif /* _ASM_POWERPC_PTDUMP_H */ > diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c > index 73b84166d06a..6c866f1b1eeb 100644 > --- a/arch/powerpc/mm/pgtable_32.c > +++ b/arch/powerpc/mm/pgtable_32.c > @@ -29,6 +29,7 @@ > #include > #include > #include > +#include > > #include > > diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c > index e78832dce7bb..3686cd887c2f 100644 > --- a/arch/powerpc/mm/pgtable_64.c > +++ b/arch/powerpc/mm/pgtable_64.c > @@ -45,6 +45,7 @@ > #include > #include > #include > +#include > > #include > > diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c > index 2f9ddc29c535..d7b02bcd0691 100644 > --- a/arch/powerpc/mm/ptdump/ptdump.c > +++ b/arch/powerpc/mm/ptdump/ptdump.c > @@ -23,6 +23,7 @@ > #include > #include > #include > +#include > > #include "ptdump.h" > > -- > 2.13.3
Re: [linux-next/mainline][bisected 3acac06][ppc] Oops when unloading mpt3sas driver
On Thu, 2020-01-09 at 06:22 -0800, Christoph Hellwig wrote: > On Thu, Jan 09, 2020 at 02:27:25PM +0530, Abdul Haleem wrote: > > + CC Christoph Hellwig > > The only thing this commit changed for the dma coherent case (which > ppc64 uses) is that we now look up the page to free by the DMA address > instead of the virtual address passed in. Which suggests this call > stack passes in a broken dma address. I suspect we somehow managed > to disable the ppc iommu bypass mode after allocating memory, which > would cause symptoms like this, and thus the commit is just exposing > a pre-existing problem. Trace with printk added for page->addr, will this help ? mpt3sas_cm0: removing handle(0x000f), sas_addr(0x500304801f080d3d) mpt3sas_cm0: enclosure logical id(0x500304801f080d3f), slot(12) mpt3sas_cm0: enclosure level(0x), connector name( ) mpt3sas_cm0: mpt3sas_transport_port_remove: removed: sas_addr(0x500304801f080d3f) mpt3sas_cm0: expander_remove: handle(0x0009), sas_addr(0x500304801f080d3f) mpt3sas_cm0: sending diag reset !! 
mpt3sas_cm0: diag reset: SUCCESS page->vaddr = 0xc03f2d20 page->vaddr = 0xc03f2ef0 page->vaddr = 0xc03f3843 page->vaddr = 0xc03f3d7d page->vaddr = 0xc03f7576 BUG: Unable to handle kernel data access on write at 0xc04a00017c34 Faulting instruction address: 0xc02fb2b0 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV Modules linked in: iptable_mangle xt_MASQUERADE iptable_nat nf_nat xt_conntrack nf_conntrack nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc btrfs blake2b_generic xor zstd_decompress zstd_compress lzo_compress iptable_filter raid6_pq mpt3sas(-) vmx_crypto gf128mul nfsd powernv_rng rng_core raid_class scsi_transport_sas kvm_hv kvm binfmt_misc ip_tables x_tables xfs libcrc32c qla2xxx ixgbe i40e nvme_fc nvme_fabrics mdio nvme_core autofs4 CPU: 13 PID: 17267 Comm: rmmod Not tainted 5.5.0-rc5-next-20200108-autotest-2-g36e1367-dirty #1 NIP: c02fb2b0 LR: c01aa5b4 CTR: c004a010 REGS: c03fc3f9f5d0 TRAP: 0380 Not tainted (5.5.0-rc5-next-20200108-autotest-2-g36e1367-dirty) MSR: 90009033 CR: 22002424 XER: 2000 CFAR: c01aa5b0 IRQMASK: 0 GPR00: c004a0a8 c03fc3f9f860 c1311300 c04a00017c00 GPR04: c03f7576 003e00017c00 GPR08: c13bd000 c04a00017c34 05bf GPR12: c004a010 c03f4a80 GPR16: 01001b4e0180 10020098 GPR20: 10020050 10020038 10020078 05f0 GPR24: c0d4e8a8 c0d4e8c8 05f0 GPR28: c1299398 c03f7576 0001 c03fdde0d0a8 NIP [c02fb2b0] __free_pages+0x10/0x50 LR [c01aa5b4] dma_direct_free_pages+0x54/0x90 Call Trace: [c03fc3f9f860] [c0d4e8a8] str_spec.72296+0x199114/0x2009cc (unreliable) [c03fc3f9f880] [c004a0a8] dma_iommu_free_coherent+0x98/0xd0 [c03fc3f9f8d0] [c01a95e8] dma_free_attrs+0xf8/0x100 [c03fc3f9f920] [c031205c] dma_pool_destroy+0x19c/0x230 [c03fc3f9f9d0] [c0081c181e98] _base_release_memory_pools+0x1d8/0x4b0 [mpt3sas] [c03fc3f9fa60] [c0081c18b9f0] mpt3sas_base_detach+0x40/0x150 [mpt3sas] [c03fc3f9fad0] [c0081c19c92c] scsih_remove+0x24c/0x3e0 [mpt3sas] [c03fc3f9fb90] [c06126a4] 
pci_device_remove+0x64/0x110 [c03fc3f9fbd0] [c06c7ea4] device_release_driver_internal+0x154/0x260 [c03fc3f9fc10] [c06c807c] driver_detach+0x8c/0x140 [c03fc3f9fc50] [c06c6188] bus_remove_driver+0x78/0x100 [c03fc3f9fc80] [c06c8d90] driver_unregister+0x40/0x90 [c03fc3f9fcf0] [c0611dc8] pci_unregister_driver+0x38/0x110 [c03fc3f9fd40] [c0081c1af108] _mpt3sas_exit+0x50/0x40c8 [mpt3sas] [c03fc3f9fda0] [c01d8ed8] sys_delete_module+0x1a8/0x2a0 [c03fc3f9fe20] [c000b9d0] system_call+0x5c/0x68 Instruction dump: 88830051 2fa4 41de0008 4bffe87c 7d234b78 4bfffe94 6000 6042 3c4c0101 38426060 39430034 7c0004ac <7d005028> 3108 7d00512d 40c2fff4 ---[ end trace c5ab52378eb942ad ]--- Segmentation fault -- Regard's Abdul Haleem IBM Linux Technology Centre
Re: linux-next: build warning after merge of the bpf-next tree
On Sun, Jan 12, 2020 at 8:33 PM Zong Li wrote: > > I'm not quite familiar with btf, so I have no idea why there are two > weak symbols be added in 8580ac9404f6 ("bpf: Process in-kernel BTF") I can explain what these weak symbols are for, but that won't change the fact that compiler or linker are buggy. The weak symbols should work in all cases and compiler should pick correct relocation. In this case it sounds that compiler picked relative relocation and failed to reach zero from that address.
Re: [PATCH] evh_bytechan: fix out of bounds accesses
On Mon, 2020-01-13 at 19:13 -0600, Timur Tabi wrote: > On 1/13/20 7:10 PM, Timur Tabi wrote: > > I would prefer that ev_byte_channel_send() is updated to access only > > 'count' bytes. If that means adding a memcpy to the > > ev_byte_channel_send() itself, then so be it. Trying to figure out how > > to stuff n bytes into 4 32- > > bit registers is probably not worth the effort. > > Looks like ev_byte_channel_receive() has the same bug, but in reverse. It only has one user, which always passes in a 16-byte buffer. -Scott
Re: [PATCH] powerpc/xive: discard ESB load value when interrupt is invalid
Cédric Le Goater writes: > On 1/13/20 2:01 PM, Cédric Le Goater wrote: >> From: Frederic Barrat >> >> A load on an ESB page returning all 1's means that the underlying >> device has invalidated the access to the PQ state of the interrupt >> through mmio. It may happen, for example when querying a PHB interrupt >> while the PHB is in an error state. >> >> In that case, we should consider the interrupt to be invalid when >> checking its state in the irq_get_irqchip_state() handler. > > > and we need also these tags : > > Fixes: da15c03b047d ("powerpc/xive: Implement get_irqchip_state method for > XIVE to fix shutdown race") > Cc: sta...@vger.kernel.org # v5.3+ I added those, although it's v5.4+, as the offending commit was first included in v5.4-rc1. cheers
Re: [PATCH] evh_bytechan: fix out of bounds accesses
On 1/13/20 7:10 PM, Timur Tabi wrote: I would prefer that ev_byte_channel_send() is updated to access only 'count' bytes. If that means adding a memcpy to the ev_byte_channel_send() itself, then so be it. Trying to figure out how to stuff n bytes into 4 32-bit registers is probably not worth the effort. Looks like ev_byte_channel_receive() has the same bug, but in reverse.
Re: [PATCH] evh_bytechan: fix out of bounds accesses
Laurentiu Tudor writes: > Hello, > > On 13.01.2020 15:48, Timur Tabi wrote: >> On 1/13/20 6:26 AM, Michael Ellerman wrote: >>> I've never heard of it, and I have no idea how to test it. >>> >>> It's not used by qemu, I guess there is/was a Freescale hypervisor that >>> used it. >> >> Yes, there is/was a Freescale hypervisor that I and a few others worked >> on. I've added a couple people on CC that might be able to tell the >> current disposition of it. > > More info on this: it's opensource and it's published here [1]. We still > claim to maintain it but there wasn't much activity past years, as one > might notice. > >>> But maybe it's time to remove it if it's not being maintained/used by >>> anyone? >> >> I wouldn't be completely opposed to that if there really are no more >> users. There really weren't any users even when I wrote the driver. > > There are a few users that I know of, but I can't tell if that's enough > to justify keeping the driver. It is, I don't want to remove code that people are actually using, unless it's causing unsustainable maintenance burden. But this should be easy enough to get fixed. Could you or someone else at NXP volunteer to maintain this driver? That shouldn't involve much work, other than fixing small things like this warning. cheers
Re: [PATCH] evh_bytechan: fix out of bounds accesses
On 1/13/20 2:25 PM, Stephen Rothwell wrote: The problem is not really the declaration, the problem is that ev_byte_channel_send always accesses 16 bytes from the buffer and it is not always passed a buffer that long (in one case it is passed a pointer to a single byte). So the alternative to the memcpy approach I have taken is to complicate ev_byte_channel_send so that it only accesses count bytes from the buffer. Ah, I see now. This is all coming back to me. I would prefer that ev_byte_channel_send() is updated to access only 'count' bytes. If that means adding a memcpy to the ev_byte_channel_send() itself, then so be it. Trying to figure out how to stuff n bytes into 4 32-bit registers is probably not worth the effort.
[RFC PATCH 1/1] powerpc/pgtable: Skip serialize_against_pte_lookup() when unmapping
If a process (qemu) with a lot of CPUs (128) tries to munmap() a large chunk of memory (496GB) mapped with THP, it takes an average of 275 seconds, which can cause a lot of problems for the load (in qemu's case, the guest will lock for this time). Trying to find the source of this bug, I found out most of this time is spent on serialize_against_pte_lookup(). This function will take a lot of time in smp_call_function_many() if there is more than a couple of CPUs running the user process. Since it has to happen for every mapped THP, it will take a very long time for large amounts of memory. By the docs, serialize_against_pte_lookup() is needed in order to avoid pmd_t to pte_t casting inside find_current_mm_pte(), or any lockless pagetable walk, happening concurrently with THP splitting/collapsing. In this case, as the page is being munmapped, there is no need to call serialize_against_pte_lookup(), given it will not be used after or during munmap. This patch does so by adding an option to skip serializing on radix__pmdp_huge_get_and_clear(). This option is used by the proxy __pmdp_huge_get_and_clear(), which is called with 'unmap == true' on a (new) arch version of pmdp_huge_get_and_clear_full(), and with 'unmap == false' on pmdp_huge_get_and_clear(), which is used in generic code. pmdp_huge_get_and_clear_full() is only called in zap_huge_pmd(), so it's safe to assume it will always be called on memory that will be unmapped. On my workload (qemu: 128vcpus + 500GB), I could see munmap's time reduction from 275 seconds to 39ms. 
Signed-off-by: Leonardo Bras --- arch/powerpc/include/asm/book3s/64/pgtable.h | 25 +--- arch/powerpc/include/asm/book3s/64/radix.h | 3 ++- arch/powerpc/mm/book3s64/radix_pgtable.c | 6 +++-- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index b01624e5c467..5e3e7c48624a 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1243,14 +1243,21 @@ extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR -static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) +static inline pmd_t __pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + bool unmap) { if (radix_enabled()) - return radix__pmdp_huge_get_and_clear(mm, addr, pmdp); + return radix__pmdp_huge_get_and_clear(mm, addr, pmdp, !unmap); return hash__pmdp_huge_get_and_clear(mm, addr, pmdp); } +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + return __pmdp_huge_get_and_clear(mm, addr, pmdp, false); +} + static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { @@ -1337,6 +1344,18 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, pte_t *, pte_t, pte_t); +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL +static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm, +unsigned long address, +pmd_t *pmdp, +int full) +{ + /* +* Called only on unmapping +*/ + return __pmdp_huge_get_and_clear(mm, address, pmdp, true); +} + /* * Returns true for a R -> RW upgrade of pte */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 
d97db3ad9aae..148874aa5260 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -253,7 +253,8 @@ extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp); + unsigned long addr, pmd_t *pmdp, + bool serialize); static inline int radix__has_transparent_hugepage(void) { /* For radix 2M at PMD level means thp */ diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 974109bb85db..eac8409cd316 100644 --- a/arch/powe
Re: [PATCH v4 5/9] trace/bpf_trace: open access for CAP_SYS_PERFMON privileged process
> On Dec 18, 2019, at 1:28 AM, Alexey Budankov > wrote: > > > Open access to bpf_trace monitoring for CAP_SYS_PERFMON privileged > processes. For backward compatibility reasons access to bpf_trace > monitoring remains open for CAP_SYS_ADMIN privileged processes but > CAP_SYS_ADMIN usage for secure bpf_trace monitoring is discouraged > with respect to CAP_SYS_PERFMON capability. > > Signed-off-by: Alexey Budankov Acked-by: Song Liu > --- > kernel/trace/bpf_trace.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c > index 44bd08f2443b..bafe21ac6d92 100644 > --- a/kernel/trace/bpf_trace.c > +++ b/kernel/trace/bpf_trace.c > @@ -1272,7 +1272,7 @@ int perf_event_query_prog_array(struct perf_event > *event, void __user *info) > u32 *ids, prog_cnt, ids_len; > int ret; > > - if (!capable(CAP_SYS_ADMIN)) > + if (!perfmon_capable()) > return -EPERM; > if (event->attr.type != PERF_TYPE_TRACEPOINT) > return -EINVAL; I guess we need to fix this check for kprobe/uprobe created with perf_event_open()... Thanks, Song
Re: [PATCH v4 1/9] capabilities: introduce CAP_SYS_PERFMON to kernel and user space
> On Dec 18, 2019, at 1:24 AM, Alexey Budankov > wrote: > > > Introduce CAP_SYS_PERFMON capability devoted to secure system performance > monitoring and observability operations so that CAP_SYS_PERFMON would > assist CAP_SYS_ADMIN capability in its governing role for perf_events, > i915_perf and other subsystems of the kernel. > > CAP_SYS_PERFMON intends to harden system security and integrity during > system performance monitoring and observability operations by decreasing > attack surface that is available to CAP_SYS_ADMIN privileged processes. > > CAP_SYS_PERFMON intends to take over CAP_SYS_ADMIN credentials related > to system performance monitoring and observability operations and balance > amount of CAP_SYS_ADMIN credentials in accordance with the recommendations > provided in the man page for CAP_SYS_ADMIN [1]: "Note: this capability > is overloaded; see Notes to kernel developers, below." > > [1] > https://urldefense.proofpoint.com/v2/url?u=http-3A__man7.org_linux_man-2Dpages_man7_capabilities.7.html&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=dR8692q0_uaizy0jkrBJQM5k2hfm4CiFxYT8KaysFrg&m=L5qCuMRrTvYhyjR1rpgE9vEv4HppVlOXDIzKzoGL30c&s=FNJpET4buKFRuqktVHQphaY1qE7IsdFpU4iYwpCn4tY&e= > > > Signed-off-by: Alexey Budankov Acked-by: Song Liu
Re: [PATCH] evh_bytechan: fix out of bounds accesses
Hi Timur, On Mon, 13 Jan 2020 10:03:18 -0600 Timur Tabi wrote: > > Why not simply correct the parameters of ev_byte_channel_send? > > static inline unsigned int ev_byte_channel_send(unsigned int handle, > -unsigned int *count, const char buffer[EV_BYTE_CHANNEL_MAX_BYTES]) > +unsigned int *count, const char *buffer) > > Back then, I probably thought I was just being clever with this code, > but I realize now that it doesn't make sense to do the way I did. The problem is not really the declaration, the problem is that ev_byte_channel_send always accesses 16 bytes from the buffer and it is not always passed a buffer that long (in one case it is passed a pointer to a single byte). So the alternative to the memcpy approach I have taken is to complicate ev_byte_channel_send so that it only accesses count bytes from the buffer. -- Cheers, Stephen Rothwell pgp69n8DME1wI.pgp Description: OpenPGP digital signature
Re: [PATCH 08/10] soc: lantiq: convert to devm_platform_ioremap_resource
Hello, Yangtao Li wrote: > Use devm_platform_ioremap_resource() to simplify code. Applied to mips-next. > commit 23c25c732530 > https://git.kernel.org/mips/c/23c25c732530 > > Signed-off-by: Yangtao Li > Signed-off-by: Paul Burton Thanks, Paul [ This message was auto-generated; if you believe anything is incorrect then please email paulbur...@kernel.org to report it. ]
[RFC PATCH v3 12/12] powerpc/vdso: provide __arch_is_hw_counter_valid()
To avoid a double verification of the hw_counter validity, provide __arch_is_hw_counter_valid() Before: clock-gettime-realtime:vdso: 1078 nsec/call After: clock-gettime-realtime:vdso: 1064 nsec/call The * shows the test of the clock type. The > shows the additional test on the counter, that goes away with this patch Before: 5e0: 81 25 00 04 lwz r9,4(r5) *5e4: 2f 89 00 00 cmpwi cr7,r9,0 *5e8: 40 9e 01 3c bne cr7,724 <__c_kernel_clock_gettime+0x17c> 5ec: 94 21 ff e0 stwur1,-32(r1) 5f0: 93 61 00 0c stw r27,12(r1) 5f4: 93 81 00 10 stw r28,16(r1) 5f8: 93 a1 00 14 stw r29,20(r1) 5fc: 93 c1 00 18 stw r30,24(r1) 600: 93 e1 00 1c stw r31,28(r1) 604: 7d 8d 42 e6 mftbu r12 608: 7f ac 42 e6 mftbr29 60c: 7c cd 42 e6 mftbu r6 610: 7f 8c 30 40 cmplw cr7,r12,r6 614: 40 9e ff f0 bne cr7,604 <__c_kernel_clock_gettime+0x5c> >618: 2f 8c 00 00 cmpwi cr7,r12,0 61c: 83 6b 00 28 lwz r27,40(r11) 620: 83 8b 00 2c lwz r28,44(r11) 624: 81 45 00 08 lwz r10,8(r5) 628: 80 e5 00 0c lwz r7,12(r5) >62c: 41 9c 00 b4 blt cr7,6e0 <__c_kernel_clock_gettime+0x138> 630: 81 05 00 18 lwz r8,24(r5) 634: 83 c5 00 1c lwz r30,28(r5) 638: 80 cb 00 24 lwz r6,36(r11) 63c: 83 e5 00 00 lwz r31,0(r5) 640: 7f 9f 00 40 cmplw cr7,r31,r0 644: 40 9e 00 84 bne cr7,6c8 <__c_kernel_clock_gettime+0x120> After: 5e0: 81 25 00 04 lwz r9,4(r5) *5e4: 2f 89 00 00 cmpwi cr7,r9,0 *5e8: 40 9e 01 88 bne cr7,770 <__c_kernel_clock_gettime+0x1c8> 5ec: 94 21 ff e0 stwur1,-32(r1) 5f0: 93 61 00 0c stw r27,12(r1) 5f4: 93 81 00 10 stw r28,16(r1) 5f8: 93 a1 00 14 stw r29,20(r1) 5fc: 93 c1 00 18 stw r30,24(r1) 600: 93 e1 00 1c stw r31,28(r1) 604: 7f cd 42 e6 mftbu r30 608: 7f ac 42 e6 mftbr29 60c: 7c cd 42 e6 mftbu r6 610: 7f 9e 30 40 cmplw cr7,r30,r6 614: 40 9e ff f0 bne cr7,604 <__c_kernel_clock_gettime+0x5c> 618: 83 6b 00 28 lwz r27,40(r11) 61c: 83 8b 00 2c lwz r28,44(r11) 620: 81 45 00 08 lwz r10,8(r5) 624: 80 e5 00 0c lwz r7,12(r5) 628: 81 05 00 18 lwz r8,24(r5) 62c: 83 e5 00 1c lwz r31,28(r5) 630: 80 cb 00 24 lwz r6,36(r11) 634: 81 85 00 
00 lwz r12,0(r5) 638: 7f 8c 00 40 cmplw cr7,r12,r0 63c: 40 9e 00 84 bne cr7,6c0 <__c_kernel_clock_gettime+0x118> Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/vdso/gettimeofday.h | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h index d1e702e0ea86..c5a24f31382e 100644 --- a/arch/powerpc/include/asm/vdso/gettimeofday.h +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h @@ -69,15 +69,18 @@ int clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts) } #endif -static __always_inline u64 __arch_get_hw_counter(s32 clock_mode) +static __always_inline bool __arch_is_hw_counter_valid(s32 clock_mode) { /* * clock_mode == 0 implies that vDSO are enabled otherwise * fallback on syscall. */ - if (clock_mode != 0) - return U64_MAX; + return clock_mode == 0 ? true : false; +} +#define __arch_is_hw_counter_valid __arch_is_hw_counter_valid +static __always_inline u64 __arch_get_hw_counter(s32 clock_mode) +{ return get_tb(); } -- 2.13.3
[RFC PATCH v3 11/12] lib: vdso: split clock verification out of __arch_get_hw_counter()
__arch_get_hw_counter() returns the current value of the counter if the counter is valid or a negative number if the counter is not valid. This is suboptimal because the validity is checked twice: once before reading the counter and once after reading the counter. Optionally split the verification out of __arch_get_hw_counter() by providing an optional __arch_is_hw_counter_valid() function. Signed-off-by: Christophe Leroy --- lib/vdso/gettimeofday.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index ea1a55507af5..001f6329e846 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -67,11 +67,18 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, do { seq = vdso_read_begin(vd); +#ifdef __arch_is_hw_counter_valid + if (unlikely(!__arch_is_hw_counter_valid(vd->clock_mode))) + return -1; +#endif + cycles = __arch_get_hw_counter(vd->clock_mode); ns = vdso_ts->nsec; last = vd->cycle_last; +#ifndef __arch_is_hw_counter_valid if (unlikely((s64)cycles < 0)) return -1; +#endif ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult); nsec = ns >> vd->shift; -- 2.13.3
[RFC PATCH v3 10/12] powerpc/vdso: provide vdso data pointer from the ASM caller.
__arch_get_vdso_data() clobbers the link register, requiring the caller to save it. As the ASM calling function already has to set a stack frame and saves the link register before calling the C vdso function, retriving the vdso data pointer there is lighter. The improvement is significant: Before: gettimeofday:vdso: 1027 nsec/call clock-getres-realtime-coarse:vdso: 699 nsec/call clock-gettime-realtime-coarse:vdso: 784 nsec/call clock-gettime-realtime:vdso: 1231 nsec/call After: gettimeofday:vdso: 908 nsec/call clock-getres-realtime-coarse:vdso: 545 nsec/call clock-gettime-realtime-coarse:vdso: 617 nsec/call clock-gettime-realtime:vdso: 1078 nsec/call Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/vdso/gettimeofday.h | 12 ++-- arch/powerpc/kernel/vdso32/gettimeofday.S| 3 +++ arch/powerpc/kernel/vdso32/vgettimeofday.c | 19 +++ arch/powerpc/kernel/vdso64/gettimeofday.S| 3 +++ arch/powerpc/kernel/vdso64/vgettimeofday.c | 19 +++ 5 files changed, 30 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h index 343c81a7e951..d1e702e0ea86 100644 --- a/arch/powerpc/include/asm/vdso/gettimeofday.h +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h @@ -6,13 +6,14 @@ #include #include -#include #include #define VDSO_HAS_CLOCK_GETRES 1 #define VDSO_HAS_TIME 1 +#define VDSO_GETS_VD_PTR_FROM_ARCH 1 + static __always_inline int do_syscall_2(const unsigned long _r0, const unsigned long _r3, const unsigned long _r4) { @@ -80,15 +81,6 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode) return get_tb(); } -void *__get_datapage(void); - -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) -{ - struct vdso_arch_data *vdso_data = __get_datapage(); - - return vdso_data->data; -} - /* * powerpc specific delta calculation. 
* diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index ba0bd64b3da3..0d43878e462c 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -18,6 +19,7 @@ stwur1, -16(r1) mflrr0 stw r0, 20(r1) + get_datapager5, VDSO_DATA_OFFSET bl \funct lwz r0, 20(r1) cmpwi r3, 0 @@ -79,6 +81,7 @@ V_FUNCTION_BEGIN(__kernel_time) stwur1, -16(r1) mflrr0 stw r0, 20(r1) + get_datapager4, VDSO_DATA_OFFSET bl __c_kernel_time lwz r0, 20(r1) crclr cr0*4+so diff --git a/arch/powerpc/kernel/vdso32/vgettimeofday.c b/arch/powerpc/kernel/vdso32/vgettimeofday.c index 4ed1bf2ae30e..7fdccf896a9c 100644 --- a/arch/powerpc/kernel/vdso32/vgettimeofday.c +++ b/arch/powerpc/kernel/vdso32/vgettimeofday.c @@ -5,22 +5,25 @@ #include #include -int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts) +int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts, +const struct vdso_data *vd) { - return __cvdso_clock_gettime32(clock, ts); + return __cvdso_clock_gettime32(vd, clock, ts); } -int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) +int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz, + const struct vdso_data *vd) { - return __cvdso_gettimeofday(tv, tz); + return __cvdso_gettimeofday(vd, tv, tz); } -int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res) +int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res, + const struct vdso_data *vd) { - return __cvdso_clock_getres_time32(clock_id, res); + return __cvdso_clock_getres_time32(vd, clock_id, res); } -time_t __c_kernel_time(time_t *time) +time_t __c_kernel_time(time_t *time, const struct vdso_data *vd) { - return __cvdso_time(time); + return __cvdso_time(vd, time); } diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S 
b/arch/powerpc/kernel/vdso64/gettimeofday.S index 22f4f1f73bbc..f61c53eb6600 100644 --- a/arch/powerpc/kernel/vdso64/gettimeofday.S +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -18,6 +19,7 @@ mflrr0 std r0, 16(r1) stdur1, -128(r1) + get_datapager5, VDSO_DATA_OFFSET bl \funct addir1, r1, 128 ld r0, 16(r1) @@ -79,6 +81,7 @@ V_FUNCTION_BEGIN(__kernel_time) mflrr0 std r0, 16(r1) stdur1, -128(r1) + get_datapager
[RFC PATCH v3 08/12] lib: vdso: allow arches to provide vdso data pointer
On powerpc, __arch_get_vdso_data() clobbers the link register, requiring the caller to save it. As the parent function already has to set a stack frame and saves the link register before calling the C vdso function, retriving the vdso data pointer there is lighter. Give arches the opportunity to hand the vdso data pointer to C vdso functions. Signed-off-by: Christophe Leroy --- lib/vdso/gettimeofday.c | 56 + 1 file changed, 56 insertions(+) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index da15a8842825..ea1a55507af5 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -104,9 +104,15 @@ static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk, } static __maybe_unused int +#ifdef VDSO_GETS_VD_PTR_FROM_ARCH +__cvdso_clock_gettime_common(const struct vdso_data *vd, clockid_t clock, + struct __kernel_timespec *ts) +{ +#else __cvdso_clock_gettime_common(clockid_t clock, struct __kernel_timespec *ts) { const struct vdso_data *vd = __arch_get_vdso_data(); +#endif u32 msk; /* Check for negative values or invalid clocks */ @@ -131,9 +137,16 @@ __cvdso_clock_gettime_common(clockid_t clock, struct __kernel_timespec *ts) } static __maybe_unused int +#ifdef VDSO_GETS_VD_PTR_FROM_ARCH +__cvdso_clock_gettime(const struct vdso_data *vd, clockid_t clock, + struct __kernel_timespec *ts) +{ + int ret = __cvdso_clock_gettime_common(vd, clock, ts); +#else __cvdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) { int ret = __cvdso_clock_gettime_common(clock, ts); +#endif if (unlikely(ret)) return clock_gettime_fallback(clock, ts); @@ -141,12 +154,21 @@ __cvdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) } static __maybe_unused int +#ifdef VDSO_GETS_VD_PTR_FROM_ARCH +__cvdso_clock_gettime32(const struct vdso_data *vd, clockid_t clock, + struct old_timespec32 *res) +#else __cvdso_clock_gettime32(clockid_t clock, struct old_timespec32 *res) +#endif { struct __kernel_timespec ts; int ret; +#ifdef 
VDSO_GETS_VD_PTR_FROM_ARCH + ret = __cvdso_clock_gettime_common(vd, clock, &ts); +#else ret = __cvdso_clock_gettime_common(clock, &ts); +#endif #ifdef VDSO_HAS_32BIT_FALLBACK if (unlikely(ret)) @@ -164,9 +186,15 @@ __cvdso_clock_gettime32(clockid_t clock, struct old_timespec32 *res) } static __maybe_unused int +#ifdef VDSO_GETS_VD_PTR_FROM_ARCH +__cvdso_gettimeofday(const struct vdso_data *vd, struct __kernel_old_timeval *tv, +struct timezone *tz) +{ +#else __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { const struct vdso_data *vd = __arch_get_vdso_data(); +#endif if (likely(tv != NULL)) { struct __kernel_timespec ts; @@ -187,9 +215,15 @@ __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) } #ifdef VDSO_HAS_TIME +#ifdef VDSO_GETS_VD_PTR_FROM_ARCH +static __maybe_unused __kernel_old_time_t +__cvdso_time(const struct vdso_data *vd, __kernel_old_time_t *time) +{ +#else static __maybe_unused __kernel_old_time_t __cvdso_time(__kernel_old_time_t *time) { const struct vdso_data *vd = __arch_get_vdso_data(); +#endif __kernel_old_time_t t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec); if (time) @@ -201,9 +235,15 @@ static __maybe_unused __kernel_old_time_t __cvdso_time(__kernel_old_time_t *time #ifdef VDSO_HAS_CLOCK_GETRES static __maybe_unused +#ifdef VDSO_GETS_VD_PTR_FROM_ARCH +int __cvdso_clock_getres_common(const struct vdso_data *vd, clockid_t clock, + struct __kernel_timespec *res) +{ +#else int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res) { const struct vdso_data *vd = __arch_get_vdso_data(); +#endif u32 msk; u64 ns; @@ -238,9 +278,16 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res) } static __maybe_unused +#ifdef VDSO_GETS_VD_PTR_FROM_ARCH +int __cvdso_clock_getres(const struct vdso_data *vd, clockid_t clock, +struct __kernel_timespec *res) +{ + int ret = __cvdso_clock_getres_common(vd, clock, res); +#else int 
__cvdso_clock_getres(clockid_t clock, struct __kernel_timespec *res) { int ret = __cvdso_clock_getres_common(clock, res); +#endif if (unlikely(ret)) return clock_getres_fallback(clock, res); @@ -248,12 +295,21 @@ int __cvdso_clock_getres(clockid_t clock, struct __kernel_timespec *res) } static __maybe_unused int +#ifdef VDSO_GETS_VD_PTR_FROM_ARCH +__cvdso_clock_getres_time32(const struct vdso_data *vd, clockid_t clock, + struct old_timespec32 *res) +#else __cvdso_clock_getres_time32(clockid_t clock, struct old_timespec3
[RFC PATCH v3 09/12] powerpc/vdso: provide inline alternative to __get_datapage()
__get_datapage() is only a few instructions to retrieve the address of the page where the kernel stores data to the VDSO. By inlining this function into its users, a bl/blr pair and a mflr/mtlr pair is avoided, plus a few reg moves. The improvement is noticeable (about 55 nsec/call on an 8xx) With current __get_datapage() function: gettimeofday:vdso: 731 nsec/call clock-gettime-realtime-coarse:vdso: 668 nsec/call clock-gettime-monotonic-coarse:vdso: 745 nsec/call Using the __get_datapage macro provided by this patch: gettimeofday:vdso: 677 nsec/call clock-gettime-realtime-coarse:vdso: 613 nsec/call clock-gettime-monotonic-coarse:vdso: 690 nsec/call Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/vdso_datapage.h | 11 +++ 1 file changed, 11 insertions(+) diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h index 4d7965bf369e..7342cc0c1ae4 100644 --- a/arch/powerpc/include/asm/vdso_datapage.h +++ b/arch/powerpc/include/asm/vdso_datapage.h @@ -105,6 +105,17 @@ struct vdso_arch_data { extern struct vdso_arch_data *vdso_data; +#else /* __ASSEMBLY__ */ + +.macro get_datapage ptr, offset=0 + bcl 20, 31, .+4 + mflr\ptr +#if CONFIG_PPC_PAGE_SHIFT > 14 + addis \ptr, \ptr, (_vdso_datapage + \offset - (.-4))@ha +#endif + addi\ptr, \ptr, (_vdso_datapage + \offset - (.-4))@l +.endm + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ -- 2.13.3
[RFC PATCH v3 07/12] powerpc/vdso: simplify __get_datapage()
The VDSO datapage and the text pages are always located immediately next to each other, so it can be hardcoded without an indirection through __kernel_datapage_offset In order to ease things, move the data page in front like other arches, that way there is no need to know the size of the library to locate the data page. Before: clock-getres-realtime-coarse:vdso: 714 nsec/call clock-gettime-realtime-coarse:vdso: 792 nsec/call clock-gettime-realtime:vdso: 1243 nsec/call After: clock-getres-realtime-coarse:vdso: 699 nsec/call clock-gettime-realtime-coarse:vdso: 784 nsec/call clock-gettime-realtime:vdso: 1231 nsec/call Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/vdso.c | 53 + arch/powerpc/kernel/vdso32/datapage.S | 10 +++ arch/powerpc/kernel/vdso32/vdso32.lds.S | 7 ++--- arch/powerpc/kernel/vdso64/datapage.S | 10 +++ arch/powerpc/kernel/vdso64/vdso64.lds.S | 7 ++--- 5 files changed, 19 insertions(+), 68 deletions(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 16a44bffe698..c093d90a222a 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -191,7 +191,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) * install_special_mapping or the perf counter mmap tracking code * will fail to recognise it as a vDSO (since arch_vma_name fails). 
*/ - current->mm->context.vdso_base = vdso_base; + current->mm->context.vdso_base = vdso_base + PAGE_SIZE; /* * our vma flags don't have VM_WRITE so by default, the process isn't @@ -488,42 +488,6 @@ static __init void vdso_setup_trampolines(struct lib32_elfinfo *v32, vdso32_rt_sigtramp = find_function32(v32, "__kernel_sigtramp_rt32"); } -static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32, - struct lib64_elfinfo *v64) -{ -#ifdef CONFIG_VDSO32 - Elf32_Sym *sym32; -#endif -#ifdef CONFIG_PPC64 - Elf64_Sym *sym64; - - sym64 = find_symbol64(v64, "__kernel_datapage_offset"); - if (sym64 == NULL) { - printk(KERN_ERR "vDSO64: Can't find symbol " - "__kernel_datapage_offset !\n"); - return -1; - } - *((int *)(vdso64_kbase + sym64->st_value - VDSO64_LBASE)) = - (vdso64_pages << PAGE_SHIFT) - - (sym64->st_value - VDSO64_LBASE); -#endif /* CONFIG_PPC64 */ - -#ifdef CONFIG_VDSO32 - sym32 = find_symbol32(v32, "__kernel_datapage_offset"); - if (sym32 == NULL) { - printk(KERN_ERR "vDSO32: Can't find symbol " - "__kernel_datapage_offset !\n"); - return -1; - } - *((int *)(vdso32_kbase + (sym32->st_value - VDSO32_LBASE))) = - (vdso32_pages << PAGE_SHIFT) - - (sym32->st_value - VDSO32_LBASE); -#endif - - return 0; -} - - static __init int vdso_fixup_features(struct lib32_elfinfo *v32, struct lib64_elfinfo *v64) { @@ -624,9 +588,6 @@ static __init int vdso_setup(void) if (vdso_do_find_sections(&v32, &v64)) return -1; - if (vdso_fixup_datapage(&v32, &v64)) - return -1; - if (vdso_fixup_features(&v32, &v64)) return -1; @@ -771,26 +732,26 @@ static int __init vdso_init(void) vdso32_pagelist = kcalloc(vdso32_pages + 2, sizeof(struct page *), GFP_KERNEL); BUG_ON(vdso32_pagelist == NULL); + vdso32_pagelist[0] = virt_to_page(vdso_data); for (i = 0; i < vdso32_pages; i++) { struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); get_page(pg); - vdso32_pagelist[i] = pg; + vdso32_pagelist[i + 1] = pg; } - vdso32_pagelist[i++] = virt_to_page(vdso_data); - vdso32_pagelist[i] = 
NULL; + vdso32_pagelist[i + 1] = NULL; #endif #ifdef CONFIG_PPC64 vdso64_pagelist = kcalloc(vdso64_pages + 2, sizeof(struct page *), GFP_KERNEL); BUG_ON(vdso64_pagelist == NULL); + vdso64_pagelist[0] = virt_to_page(vdso_data); for (i = 0; i < vdso64_pages; i++) { struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); get_page(pg); - vdso64_pagelist[i] = pg; + vdso64_pagelist[i + 1] = pg; } - vdso64_pagelist[i++] = virt_to_page(vdso_data); - vdso64_pagelist[i] = NULL; + vdso64_pagelist[i + 1] = NULL; #endif /* CONFIG_PPC64 */ get_page(virt_to_page(vdso_data)); diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S index 6c7401bd284e..d839aa1a4f01 100644 --- a/arch/powerpc/kernel/vdso32/datapage.S +++ b/arch/powerpc/kernel/vdso32/datapage.S @@ -12,9 +12,
[RFC PATCH v3 06/12] lib: vdso: __iter_div_u64_rem() is suboptimal for 32 bit time
Using __iter_div_ulong_rem() is suboptimal on 32 bits. Nanoseconds are only 32 bits, and VDSO data is updated every 10ms so nsec will never overflow 32 bits. Add an equivalent of __iter_div_u64_rem() but based on unsigned long to better fit with 32 bits arches. Before: gettimeofday:vdso: 1078 nsec/call clock-gettime-monotonic-raw:vdso: 1317 nsec/call clock-gettime-monotonic:vdso: 1255 nsec/call After: gettimeofday:vdso: 1032 nsec/call clock-gettime-monotonic-raw:vdso: 1312 nsec/call clock-gettime-monotonic:vdso: 1243 nsec/call Signed-off-by: Christophe Leroy --- lib/vdso/gettimeofday.c | 26 +++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index decd3f2b37af..da15a8842825 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -38,12 +38,32 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) } #endif +static __always_inline u32 +__iter_div_ulong_rem(unsigned long dividend, u32 divisor, unsigned long *remainder) +{ + u32 ret = 0; + + while (dividend >= divisor) { + /* The following asm() prevents the compiler from + optimising this loop into a modulo operation. 
*/ + asm("" : "+rm"(dividend)); + + dividend -= divisor; + ret++; + } + + *remainder = dividend; + + return ret; +} + static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, struct __kernel_timespec *ts) { const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; u64 cycles, last, sec, ns; u32 seq; + unsigned long nsec; do { seq = vdso_read_begin(vd); @@ -54,7 +74,7 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, return -1; ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult); - ns >>= vd->shift; + nsec = ns >> vd->shift; sec = vdso_ts->sec; } while (unlikely(vdso_read_retry(vd, seq))); @@ -62,8 +82,8 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, * Do this outside the loop: a race inside the loop could result * in __iter_div_u64_rem() being extremely slow. */ - ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); - ts->tv_nsec = ns; + ts->tv_sec = sec + __iter_div_ulong_rem(nsec, NSEC_PER_SEC, &nsec); + ts->tv_nsec = nsec; return 0; } -- 2.13.3
[RFC PATCH v3 05/12] lib: vdso: Avoid duplication in __cvdso_clock_getres()
VDSO_HRES and VDSO_RAW clocks are handled the same way. Don't duplicate code. Before the patch: clock-getres-monotonic-raw:vdso: 737 nsec/call clock-getres-monotonic-coarse:vdso: 753 nsec/call clock-getres-monotonic:vdso: 691 nsec/call After the patch: clock-getres-monotonic-raw:vdso: 715 nsec/call clock-getres-monotonic-coarse:vdso: 715 nsec/call clock-getres-monotonic:vdso: 714 nsec/call Signed-off-by: Christophe Leroy Reviewed-by: Andy Lutomirski --- lib/vdso/gettimeofday.c | 11 ++- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index d75e44ba716f..decd3f2b37af 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -184,7 +184,6 @@ static __maybe_unused int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res) { const struct vdso_data *vd = __arch_get_vdso_data(); - u64 hrtimer_res; u32 msk; u64 ns; @@ -192,27 +191,21 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res) if (unlikely((u32) clock >= MAX_CLOCKS)) return -1; - hrtimer_res = READ_ONCE(vd[CS_HRES_COARSE].hrtimer_res); /* * Convert the clockid to a bitmask and use it to check which * clocks are handled in the VDSO directly. */ msk = 1U << clock; - if (msk & VDSO_HRES) { + if (msk & (VDSO_HRES | VDSO_RAW)) { /* * Preserves the behaviour of posix_get_hrtimer_res(). */ - ns = hrtimer_res; + ns = READ_ONCE(vd[CS_HRES_COARSE].hrtimer_res); } else if (msk & VDSO_COARSE) { /* * Preserves the behaviour of posix_get_coarse_res(). */ ns = LOW_RES_NSEC; - } else if (msk & VDSO_RAW) { - /* -* Preserves the behaviour of posix_get_hrtimer_res(). -*/ - ns = hrtimer_res; } else { return -1; } -- 2.13.3
[RFC PATCH v3 04/12] lib: vdso: inline do_hres() and do_coarse()
do_hres() is called from several places, so GCC doesn't inline it at first. do_hres() takes a struct __kernel_timespec * parameter for passing the result. In the 32 bits case, this parameter corresponds to a local var in the caller. In order to provide a pointer to this structure, the caller has to put it in its stack and do_hres() has to write the result in the stack. This is suboptimal, especially on RISC processor like powerpc. By making GCC inline the function, the struct __kernel_timespec remains a local var using registers, avoiding the need to write and read stack. The improvement is significant on powerpc: Before: gettimeofday:vdso: 1379 nsec/call clock-gettime-realtime-coarse:vdso: 868 nsec/call clock-gettime-realtime:vdso: 1511 nsec/call clock-gettime-monotonic-raw:vdso: 1576 nsec/call After: gettimeofday:vdso: 1078 nsec/call clock-gettime-realtime-coarse:vdso: 807 nsec/call clock-gettime-realtime:vdso: 1256 nsec/call clock-gettime-monotonic-raw:vdso: 1316 nsec/call At the same time, change the return type of do_coarse() to int, this increase readability of the if/elseif/elseif/else section in __cvdso_clock_gettime_common() Signed-off-by: Christophe Leroy --- lib/vdso/gettimeofday.c | 29 - 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 42bd8ab955fa..d75e44ba716f 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -38,8 +38,8 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) } #endif -static int do_hres(const struct vdso_data *vd, clockid_t clk, - struct __kernel_timespec *ts) +static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, + struct __kernel_timespec *ts) { const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; u64 cycles, last, sec, ns; @@ -68,8 +68,8 @@ static int do_hres(const struct vdso_data *vd, clockid_t clk, return 0; } -static void do_coarse(const struct vdso_data *vd, clockid_t clk, - struct __kernel_timespec 
*ts) +static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk, +struct __kernel_timespec *ts) { const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; u32 seq; @@ -79,6 +79,8 @@ static void do_coarse(const struct vdso_data *vd, clockid_t clk, ts->tv_sec = vdso_ts->sec; ts->tv_nsec = vdso_ts->nsec; } while (unlikely(vdso_read_retry(vd, seq))); + + return 0; } static __maybe_unused int @@ -96,15 +98,16 @@ __cvdso_clock_gettime_common(clockid_t clock, struct __kernel_timespec *ts) * clocks are handled in the VDSO directly. */ msk = 1U << clock; - if (likely(msk & VDSO_HRES)) { - return do_hres(&vd[CS_HRES_COARSE], clock, ts); - } else if (msk & VDSO_COARSE) { - do_coarse(&vd[CS_HRES_COARSE], clock, ts); - return 0; - } else if (msk & VDSO_RAW) { - return do_hres(&vd[CS_RAW], clock, ts); - } - return -1; + if (likely(msk & VDSO_HRES)) + vd += CS_HRES_COARSE; + else if (msk & VDSO_COARSE) + return do_coarse(&vd[CS_HRES_COARSE], clock, ts); + else if (msk & VDSO_RAW) + vd += CS_RAW; + else + return -1; + + return do_hres(vd, clock, ts); } static __maybe_unused int -- 2.13.3
[RFC PATCH v3 02/12] powerpc/vdso: Switch VDSO to generic C implementation.
This is a minimalistic conversion to VDSO generic C implementation. On powerpc 8xx, performance is degraded by 40-45% for gettime and by 50% for getres. Optimisations will follow in next patches, some in the core VDSO functions, some in the powerpc arch. powerpc is a bit special for VDSO as well as system calls in the way that it requires setting CR SO bit which cannot be done in C. Therefore, entry/exit needs to be performed in ASM. On a powerpc885 at 132MHz: With current powerpc/32 ASM VDSO: gettimeofday:vdso: 737 nsec/call clock-getres-realtime-coarse:vdso: 3081 nsec/call clock-gettime-realtime-coarse:vdso: 2861 nsec/call clock-getres-realtime:vdso: 475 nsec/call clock-gettime-realtime:vdso: 892 nsec/call clock-getres-boottime:vdso: 2621 nsec/call clock-gettime-boottime:vdso: 3857 nsec/call clock-getres-tai:vdso: 2620 nsec/call clock-gettime-tai:vdso: 3854 nsec/call clock-getres-monotonic-raw:vdso: 2621 nsec/call clock-gettime-monotonic-raw:vdso: 3499 nsec/call clock-getres-monotonic-coarse:vdso: 3083 nsec/call clock-gettime-monotonic-coarse:vdso: 3082 nsec/call clock-getres-monotonic:vdso: 475 nsec/call clock-gettime-monotonic:vdso: 1014 nsec/call Once switched to C implementation: gettimeofday:vdso: 1379 nsec/call getcpu:vdso: not tested clock-getres-realtime-coarse:vdso: 984 nsec/call clock-gettime-realtime-coarse:vdso: 868 nsec/call clock-getres-realtime:vdso: 922 nsec/call clock-gettime-realtime:vdso: 1511 nsec/call clock-getres-boottime:vdso: 922 nsec/call clock-gettime-boottime:vdso: 1510 nsec/call clock-getres-tai:vdso: 923 nsec/call clock-gettime-tai:vdso: 1511 nsec/call clock-getres-monotonic-raw:vdso: 968 nsec/call clock-gettime-monotonic-raw:vdso: 1576 nsec/call clock-getres-monotonic-coarse:vdso: 984 nsec/call clock-gettime-monotonic-coarse:vdso: 868 nsec/call clock-getres-monotonic:vdso: 922 nsec/call clock-gettime-monotonic:vdso: 1510 nsec/call Signed-off-by: Christophe Leroy --- arch/powerpc/Kconfig | 2 + 
arch/powerpc/include/asm/vdso/gettimeofday.h | 109 arch/powerpc/include/asm/vdso/vsyscall.h | 25 +++ arch/powerpc/include/asm/vdso_datapage.h | 41 ++--- arch/powerpc/kernel/asm-offsets.c| 46 + arch/powerpc/kernel/time.c | 90 -- arch/powerpc/kernel/vdso.c | 5 +- arch/powerpc/kernel/vdso32/Makefile | 27 ++- arch/powerpc/kernel/vdso32/gettimeofday.S| 255 +++ arch/powerpc/kernel/vdso32/vgettimeofday.c | 26 +++ arch/powerpc/kernel/vdso64/Makefile | 23 ++- arch/powerpc/kernel/vdso64/datapage.S| 3 +- arch/powerpc/kernel/vdso64/gettimeofday.S| 254 +++--- arch/powerpc/kernel/vdso64/vgettimeofday.c | 26 +++ 14 files changed, 312 insertions(+), 620 deletions(-) create mode 100644 arch/powerpc/include/asm/vdso/gettimeofday.h create mode 100644 arch/powerpc/include/asm/vdso/vsyscall.h create mode 100644 arch/powerpc/kernel/vdso32/vgettimeofday.c create mode 100644 arch/powerpc/kernel/vdso64/vgettimeofday.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1ec34e16ed65..bd04c68baf91 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -169,6 +169,7 @@ config PPC select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL + select GENERIC_GETTIMEOFDAY select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMAP if PPC_BOOK3S_64 && PPC_RADIX_MMU select HAVE_ARCH_JUMP_LABEL @@ -198,6 +199,7 @@ config PPC select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS if GCC_VERSION >= 50200 # plugin support on gcc <= 5.1 is buggy on PPC + select HAVE_GENERIC_VDSO select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) select HAVE_IDE select HAVE_IOREMAP_PROT diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h new file mode 100644 index ..343c81a7e951 --- /dev/null +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_VDSO_GETTIMEOFDAY_H +#define __ASM_VDSO_GETTIMEOFDAY_H + 
+#ifndef __ASSEMBLY__ + +#include +#include +#include +#include + +#define VDSO_HAS_CLOCK_GETRES 1 + +#define VDSO_HAS_TIME 1 + +static __always_inline int do_syscall_2(const unsigned long _r0, const unsigned long _r3, + const unsigned long _r4) +{ + register long r0 asm("r0") = _r0; + register unsigned long r3 asm("r3") = _r3; + register unsigned long r4 asm("r4") = _r4; + register int ret asm ("r3"); + + asm volatile( +
[RFC PATCH v3 01/12] powerpc/64: Don't provide time functions in compat VDSO32
In order to allow use of generic C VDSO, remove time functions from the 32 bits VDSO on PPC64. This (temporary) removal is needed because powerpc kernel C headers are not prepared for building 32 bits code on PPC64. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/vdso32/Makefile | 3 ++- arch/powerpc/kernel/vdso32/vdso32.lds.S | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index 06f54d947057..738d52105392 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -3,7 +3,8 @@ # List of files in the vdso, has to be asm only for now obj-vdso32-$(CONFIG_PPC64) = getcpu.o -obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o \ +obj-vdso32-$(CONFIG_PPC32) = gettimeofday.o +obj-vdso32 = sigtramp.o datapage.o cacheflush.o note.o \ $(obj-vdso32-y) # Build rules diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S index 00c025ba4a92..9400b182e163 100644 --- a/arch/powerpc/kernel/vdso32/vdso32.lds.S +++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S @@ -144,7 +144,7 @@ VERSION __kernel_datapage_offset; __kernel_get_syscall_map; -#ifndef CONFIG_PPC_BOOK3S_601 +#if defined(CONFIG_PPC32) && !defined(CONFIG_PPC_BOOK3S_601) __kernel_gettimeofday; __kernel_clock_gettime; __kernel_clock_getres; -- 2.13.3
[RFC PATCH v3 00/12] powerpc: switch VDSO to C implementation.
This is a third attempt to switch powerpc VDSO to generic C implementation. This version should work on PPC64 (untested). VDSO32 for PPC64 is impossible to build and has been de-activated, because the powerpc ASM header files for C are not prepared to build 32 bits code with CONFIG_PPC64. powerpc is a bit special for VDSO as well as system calls in the way that it requires setting CR SO bit which cannot be done in C. Therefore, entry/exit and fallback need to be performed in ASM. On a powerpc8xx, with current powerpc/32 ASM VDSO: gettimeofday:vdso: 737 nsec/call clock-getres-realtime:vdso: 475 nsec/call clock-gettime-realtime:vdso: 892 nsec/call The first patch adds VDSO generic C support without any changes to common code. Performance is as follows: gettimeofday:vdso: 1379 nsec/call clock-getres-realtime-coarse:vdso: 984 nsec/call clock-gettime-realtime-coarse:vdso: 868 nsec/call clock-getres-realtime:vdso: 922 nsec/call clock-gettime-realtime:vdso: 1511 nsec/call clock-getres-monotonic-raw:vdso: 968 nsec/call clock-gettime-monotonic-raw:vdso: 1576 nsec/call Then a few changes in the common code have allowed performance improvement. At the end of the series we have: gettimeofday:vdso: 899 nsec/call clock-getres-realtime-coarse:vdso: 546 nsec/call clock-gettime-realtime-coarse:vdso: 615 nsec/call clock-getres-realtime:vdso: 545 nsec/call clock-gettime-realtime:vdso: 1064 nsec/call clock-getres-monotonic-raw:vdso: 546 nsec/call clock-gettime-monotonic-raw:vdso: 1125 nsec/call Christophe Leroy (12): powerpc/64: Don't provide time functions in compat VDSO32 powerpc/vdso: Switch VDSO to generic C implementation. 
lib: vdso: mark __cvdso_clock_getres() as static lib: vdso: inline do_hres() and do_coarse() lib: vdso: Avoid duplication in __cvdso_clock_getres() lib: vdso: __iter_div_u64_rem() is suboptimal for 32 bit time powerpc/vdso: simplify __get_datapage() lib: vdso: allow arches to provide vdso data pointer powerpc/vdso: provide inline alternative to __get_datapage() powerpc/vdso: provide vdso data pointer from the ASM caller. lib: vdso: split clock verification out of __arch_get_hw_counter() powerpc/vdso: provide __arch_is_hw_counter_valid() arch/powerpc/Kconfig | 2 + arch/powerpc/include/asm/vdso/gettimeofday.h | 104 +++ arch/powerpc/include/asm/vdso/vsyscall.h | 25 +++ arch/powerpc/include/asm/vdso_datapage.h | 52 +++--- arch/powerpc/kernel/asm-offsets.c| 46 + arch/powerpc/kernel/time.c | 90 -- arch/powerpc/kernel/vdso.c | 58 ++ arch/powerpc/kernel/vdso32/Makefile | 30 +++- arch/powerpc/kernel/vdso32/datapage.S| 10 +- arch/powerpc/kernel/vdso32/gettimeofday.S| 258 --- arch/powerpc/kernel/vdso32/vdso32.lds.S | 9 +- arch/powerpc/kernel/vdso32/vgettimeofday.c | 29 +++ arch/powerpc/kernel/vdso64/Makefile | 23 ++- arch/powerpc/kernel/vdso64/datapage.S| 13 +- arch/powerpc/kernel/vdso64/gettimeofday.S| 257 -- arch/powerpc/kernel/vdso64/vdso64.lds.S | 7 +- arch/powerpc/kernel/vdso64/vgettimeofday.c | 29 +++ lib/vdso/gettimeofday.c | 130 +++--- 18 files changed, 457 insertions(+), 715 deletions(-) create mode 100644 arch/powerpc/include/asm/vdso/gettimeofday.h create mode 100644 arch/powerpc/include/asm/vdso/vsyscall.h create mode 100644 arch/powerpc/kernel/vdso32/vgettimeofday.c create mode 100644 arch/powerpc/kernel/vdso64/vgettimeofday.c -- 2.13.3
[RFC PATCH v3 03/12] lib: vdso: mark __cvdso_clock_getres() as static
When __cvdso_clock_getres() became __cvdso_clock_getres_common() and a new __cvdso_clock_getres() was added, static qualifier was forgotten. This change allows the compiler to inline __cvdso_clock_getres_common(), and the performance improvement is significant: Before: clock-getres-realtime-coarse:vdso: 984 nsec/call clock-getres-realtime:vdso: 922 nsec/call clock-getres-monotonic-raw:vdso: 968 nsec/call After: clock-getres-realtime-coarse:vdso: 753 nsec/call clock-getres-realtime:vdso: 691 nsec/call clock-getres-monotonic-raw:vdso: 737 nsec/call Fixes: 502a590a170b ("lib/vdso: Move fallback invocation to the callers") Signed-off-by: Christophe Leroy --- lib/vdso/gettimeofday.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index 9ecfd3b547ba..42bd8ab955fa 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -221,6 +221,7 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res) return 0; } +static __maybe_unused int __cvdso_clock_getres(clockid_t clock, struct __kernel_timespec *res) { int ret = __cvdso_clock_getres_common(clock, res); -- 2.13.3
[PATCH] powerpc/xive: discard ESB load value when interrupt is invalid
From: Frederic Barrat A load on an ESB page returning all 1's means that the underlying device has invalidated the access to the PQ state of the interrupt through mmio. It may happen, for example when querying a PHB interrupt while the PHB is in an error state. In that case, we should consider the interrupt to be invalid when checking its state in the irq_get_irqchip_state() handler. Cc: Paul Mackerras Signed-off-by: Frederic Barrat [ clg: - wrote a commit log - introduced XIVE_ESB_INVALID ] Signed-off-by: Cédric Le Goater --- arch/powerpc/include/asm/xive-regs.h | 1 + arch/powerpc/sysdev/xive/common.c| 15 --- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/xive-regs.h b/arch/powerpc/include/asm/xive-regs.h index f2dfcd50a2d3..33aee7490cbb 100644 --- a/arch/powerpc/include/asm/xive-regs.h +++ b/arch/powerpc/include/asm/xive-regs.h @@ -39,6 +39,7 @@ #define XIVE_ESB_VAL_P 0x2 #define XIVE_ESB_VAL_Q 0x1 +#define XIVE_ESB_INVALID 0xFF /* * Thread Management (aka "TM") registers diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index f5fadbd2533a..9651ca061828 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -972,12 +972,21 @@ static int xive_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(data); + u8 pq; switch (which) { case IRQCHIP_STATE_ACTIVE: - *state = !xd->stale_p && -(xd->saved_p || - !!(xive_esb_read(xd, XIVE_ESB_GET) & XIVE_ESB_VAL_P)); + pq = xive_esb_read(xd, XIVE_ESB_GET); + + /* +* The esb value being all 1's means we couldn't get +* the PQ state of the interrupt through mmio. It may +* happen, for example when querying a PHB interrupt +* while the PHB is in an error state. We consider the +* interrupt to be inactive in that case. 
+*/ + *state = (pq != XIVE_ESB_INVALID) && !xd->stale_p && + (xd->saved_p || !!(pq & XIVE_ESB_VAL_P)); return 0; default: return -EINVAL; -- 2.21.1
Re: [PATCH] evh_bytechan: fix out of bounds accesses
On Thu, Jan 9, 2020 at 1:41 AM Stephen Rothwell wrote: > > ev_byte_channel_send() assumes that its third argument is a 16 byte array. > Some places where it is called it may not be (or we can't easily tell > if it is). Newer compilers have started producing warnings about this, > so make sure we actually pass a 16 byte array. ... > +static unsigned int local_ev_byte_channel_send(unsigned int handle, > +unsigned int *count, const char *p) > +{ > + char buffer[EV_BYTE_CHANNEL_MAX_BYTES]; > + unsigned int c = *count; > + > + if (c < sizeof(buffer)) { > + memcpy(buffer, p, c); > + memset(&buffer[c], 0, sizeof(buffer) - c); > + p = buffer; > + } > + return ev_byte_channel_send(handle, count, p); > +} Why not simply correct the parameters of ev_byte_channel_send? static inline unsigned int ev_byte_channel_send(unsigned int handle, -unsigned int *count, const char buffer[EV_BYTE_CHANNEL_MAX_BYTES]) +unsigned int *count, const char *buffer) Back then, I probably thought I was just being clever with this code, but I realize now that it doesn't make sense to do the way I did.
Re: [PATCH] evh_bytechan: fix out of bounds accesses
On 1/13/20 8:34 AM, Laurentiu Tudor wrote: There are a few users that I know of, but I can't tell if that's enough to justify keeping the driver. [1]https://source.codeaurora.org/external/qoriq/qoriq-yocto-sdk/hypervisor/ IIRC, the driver is the only reasonable way to get a serial console from a guest. So if there are users of the hypervisor, then I think there's a good chance at least one is using the byte channel driver.
Re: [PATCH] lib: vdso: mark __cvdso_clock_getres() as static
Christophe Leroy writes: > Is there a git tree with the latest VDSO status ? > > https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git?h=timers%2Fvdso > is 6 months old. Will push a new one later today. I'll let you know. Thanks, tglx
Re: [PATCH] evh_bytechan: fix out of bounds accesses
Hello, On 13.01.2020 15:48, Timur Tabi wrote: > On 1/13/20 6:26 AM, Michael Ellerman wrote: >> I've never heard of it, and I have no idea how to test it. >> >> It's not used by qemu, I guess there is/was a Freescale hypervisor that >> used it. > > Yes, there is/was a Freescale hypervisor that I and a few others worked > on. I've added a couple people on CC that might be able to tell the > current disposition of it. More info on this: it's opensource and it's published here [1]. We still claim to maintain it but there wasn't much activity past years, as one might notice. >> But maybe it's time to remove it if it's not being maintained/used by >> anyone? > > I wouldn't be completely opposed to that if there really are no more > users. There really weren't any users even when I wrote the driver. There are a few users that I know of, but I can't tell if that's enough to justify keeping the driver. [1] https://source.codeaurora.org/external/qoriq/qoriq-yocto-sdk/hypervisor/ --- Best Regards, Laurentiu
Re: [PATCH] powerpc/xive: discard ESB load value when interrupt is invalid
On 1/13/20 2:01 PM, Cédric Le Goater wrote: > From: Frederic Barrat > > A load on an ESB page returning all 1's means that the underlying > device has invalidated the access to the PQ state of the interrupt > through mmio. It may happen, for example when querying a PHB interrupt > while the PHB is in an error state. > > In that case, we should consider the interrupt to be invalid when > checking its state in the irq_get_irqchip_state() handler. and we need also these tags : Fixes: da15c03b047d ("powerpc/xive: Implement get_irqchip_state method for XIVE to fix shutdown race") Cc: sta...@vger.kernel.org # v5.3+ > Cc: Paul Mackerras > Signed-off-by: Frederic Barrat > [ clg: - wrote a commit log >- introduced XIVE_ESB_INVALID ] > Signed-off-by: Cédric Le Goater > --- > arch/powerpc/include/asm/xive-regs.h | 1 + > arch/powerpc/sysdev/xive/common.c| 15 --- > 2 files changed, 13 insertions(+), 3 deletions(-) > > diff --git a/arch/powerpc/include/asm/xive-regs.h > b/arch/powerpc/include/asm/xive-regs.h > index f2dfcd50a2d3..33aee7490cbb 100644 > --- a/arch/powerpc/include/asm/xive-regs.h > +++ b/arch/powerpc/include/asm/xive-regs.h > @@ -39,6 +39,7 @@ > > #define XIVE_ESB_VAL_P 0x2 > #define XIVE_ESB_VAL_Q 0x1 > +#define XIVE_ESB_INVALID 0xFF > > /* > * Thread Management (aka "TM") registers > diff --git a/arch/powerpc/sysdev/xive/common.c > b/arch/powerpc/sysdev/xive/common.c > index f5fadbd2533a..9651ca061828 100644 > --- a/arch/powerpc/sysdev/xive/common.c > +++ b/arch/powerpc/sysdev/xive/common.c > @@ -972,12 +972,21 @@ static int xive_get_irqchip_state(struct irq_data *data, > enum irqchip_irq_state which, bool *state) > { > struct xive_irq_data *xd = irq_data_get_irq_handler_data(data); > + u8 pq; > > switch (which) { > case IRQCHIP_STATE_ACTIVE: > - *state = !xd->stale_p && > - (xd->saved_p || > - !!(xive_esb_read(xd, XIVE_ESB_GET) & XIVE_ESB_VAL_P)); > + pq = xive_esb_read(xd, XIVE_ESB_GET); > + > + /* > + * The esb value being all 1's means we couldn't get > + * 
the PQ state of the interrupt through mmio. It may > + * happen, for example when querying a PHB interrupt > + * while the PHB is in an error state. We consider the > + * interrupt to be inactive in that case. > + */ > + *state = (pq != XIVE_ESB_INVALID) && !xd->stale_p && > + (xd->saved_p || !!(pq & XIVE_ESB_VAL_P)); > return 0; > default: > return -EINVAL; >
Re: [PATCH] evh_bytechan: fix out of bounds accesses
On 1/13/20 6:26 AM, Michael Ellerman wrote: I've never heard of it, and I have no idea how to test it. It's not used by qemu, I guess there is/was a Freescale hypervisor that used it. Yes, there is/was a Freescale hypervisor that I and a few others worked on. I've added a couple people on CC that might be able to tell the current disposition of it. But maybe it's time to remove it if it's not being maintained/used by anyone? I wouldn't be completely opposed to that if there really are no more users. There really weren't any users even when I wrote the driver. I haven't had a chance to study the patch, but my first instinct is that there's got to be a better way to fix this than introducing a memcpy.
Re: [PATCH v3] powerpc/smp: Use nid as fallback for package_id
Hi Srikar, Still some comments sorry ... Srikar Dronamraju writes: > Package_id is to find out all cores that are part of the same chip. On > PowerNV machines, package_id defaults to chip_id. However ibm,chip_id > property is not present in device-tree of PowerVM Lpars. Hence lscpu > output shows one core per socket and multiple cores. > > To overcome this, use nid as the package_id on PowerVM Lpars. > > Before the patch. > --- > Architecture:ppc64le > Byte Order: Little Endian > CPU(s): 128 > On-line CPU(s) list: 0-127 > Thread(s) per core: 8 > Core(s) per socket: 1 <-- > Socket(s): 16 <-- > NUMA node(s):2 > Model: 2.2 (pvr 004e 0202) > Model name: POWER9 (architected), altivec supported > Hypervisor vendor: pHyp > Virtualization type: para > L1d cache: 32K > L1i cache: 32K > L2 cache:512K > L3 cache:10240K > NUMA node0 CPU(s): 0-63 > NUMA node1 CPU(s): 64-127 > # > # cat /sys/devices/system/cpu/cpu0/topology/physical_package_id > -1 > # > > After the patch > --- > Architecture:ppc64le > Byte Order: Little Endian > CPU(s): 128 > On-line CPU(s) list: 0-127 > Thread(s) per core: 8<-- > Core(s) per socket: 8<-- > Socket(s): 2 > NUMA node(s):2 > Model: 2.2 (pvr 004e 0202) > Model name: POWER9 (architected), altivec supported > Hypervisor vendor: pHyp > Virtualization type: para > L1d cache: 32K > L1i cache: 32K > L2 cache:512K > L3 cache:10240K > NUMA node0 CPU(s): 0-63 > NUMA node1 CPU(s): 64-127 > # > # cat /sys/devices/system/cpu/cpu0/topology/physical_package_id > 0 > # > Now lscpu output is more in line with the system configuration. > > Signed-off-by: Srikar Dronamraju > Cc: linuxppc-dev@lists.ozlabs.org > Cc: Michael Ellerman > Cc: Vasant Hegde > Cc: Vaidyanathan Srinivasan > --- > Changelog from v2: https://patchwork.ozlabs.org/patch/1151649 > - Rebased to v5.5-rc2 > - Renamed the new function to cpu_to_nid > - Removed checks to fadump property. 
(Looked too excessive) > - Moved pseries specific code to pseries/lpar.c > > Changelog from v1: https://patchwork.ozlabs.org/patch/1126145 > - In v1 cpu_to_chip_id was overloaded to fallback on nid. Michael > Ellerman wasn't comfortable with nid being shown up as chip_id. > > arch/powerpc/include/asm/topology.h | 7 ++- > arch/powerpc/kernel/smp.c | 6 +++--- > arch/powerpc/platforms/pseries/lpar.c | 22 ++ > 3 files changed, 31 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/include/asm/topology.h > b/arch/powerpc/include/asm/topology.h > index 2f7e1ea5089e..7422ef913c75 100644 > --- a/arch/powerpc/include/asm/topology.h > +++ b/arch/powerpc/include/asm/topology.h > @@ -130,11 +130,16 @@ static inline void shared_proc_topology_init(void) {} > > #ifdef CONFIG_SMP > #include > +#ifdef CONFIG_PPC_SPLPAR > +int cpu_to_nid(int); > +#else > +#define cpu_to_nid(cpu) cpu_to_chip_id(cpu) > +#endif We already have cpu_to_node(), which returns the numa node (nid) for a given CPU, so I think introducing a new accessor with an almost identical name is going to cause confusion. ie. when should code use cpu_to_node() and when should it use cpu_to_nid()? > #ifdef CONFIG_PPC64 > #include > > -#define topology_physical_package_id(cpu)(cpu_to_chip_id(cpu)) > +#define topology_physical_package_id(cpu)(cpu_to_nid(cpu)) I think you should be overriding topology_physical_package_id() instead of introducing cpu_to_nid(). 
> #define topology_sibling_cpumask(cpu)(per_cpu(cpu_sibling_map, cpu)) > #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) > #define topology_core_id(cpu)(cpu_to_core_id(cpu)) > diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c > index ea6adbf6a221..b0c1438d8d9a 100644 > --- a/arch/powerpc/kernel/smp.c > +++ b/arch/powerpc/kernel/smp.c > @@ -1188,7 +1188,7 @@ static inline void add_cpu_to_smallcore_masks(int cpu) > static void add_cpu_to_masks(int cpu) > { > int first_thread = cpu_first_thread_sibling(cpu); > - int chipid = cpu_to_chip_id(cpu); > + int nid = cpu_to_nid(cpu); And here you should be using topology_physical_package_id(), rather than cpu_to_nid() directly. > int i; > > /* > @@ -1217,11 +1217,11 @@ static void add_cpu_to_masks(int cpu) > for_each_cpu(i, cpu_l2_cache_mask(cpu)) > set_cpus_related(cpu, i, cpu_core_mask); > > - if (chipid == -1) > + if (nid == -1) > return; > > for_each_cpu(i, cpu_online_mask) > - if (cpu_to_chip_id(i) == chip
[Bug 205201] Booting halts if Dawicontrol DC-2976 UW SCSI board installed, unless RAM size limited to 3500M
https://bugzilla.kernel.org/show_bug.cgi?id=205201 Michael Ellerman (mich...@ellerman.id.au) changed: What|Removed |Added Status|RESOLVED|CLOSED CC||mich...@ellerman.id.au -- You are receiving this mail because: You are watching the assignee of the bug.
Re: [PATCH] evh_bytechan: fix out of bounds accesses
Stephen Rothwell writes: > ev_byte_channel_send() assumes that its third argument is a 16 byte array. > Some places where it is called it may not be (or we can't easily tell > if it is). Newer compilers have started producing warnings about this, > so make sure we actually pass a 16 byte array. > > There may be more elegant solutions to this, but the driver is quite > old and hasn't been updated in many years. ... > Fixes: dcd83aaff1c8 ("tty/powerpc: introduce the ePAPR embedded hypervisor > byte channel driver") > Cc: Michael Ellerman > Cc: PowerPC Mailing List > Signed-off-by: Stephen Rothwell > --- > drivers/tty/ehv_bytechan.c | 20 +--- > 1 file changed, 17 insertions(+), 3 deletions(-) > > I have only build tested this change so it would be good to get some > response from the PowerPC maintainers/developers. I've never heard of it, and I have no idea how to test it. It's not used by qemu, I guess there is/was a Freescale hypervisor that used it. But maybe it's time to remove it if it's not being maintained/used by anyone? 
cheers > diff --git a/drivers/tty/ehv_bytechan.c b/drivers/tty/ehv_bytechan.c > index 769e0a5d1dfc..546f80c49ae6 100644 > --- a/drivers/tty/ehv_bytechan.c > +++ b/drivers/tty/ehv_bytechan.c > @@ -136,6 +136,20 @@ static int find_console_handle(void) > return 1; > } > > +static unsigned int local_ev_byte_channel_send(unsigned int handle, > +unsigned int *count, const char *p) > +{ > + char buffer[EV_BYTE_CHANNEL_MAX_BYTES]; > + unsigned int c = *count; > + > + if (c < sizeof(buffer)) { > + memcpy(buffer, p, c); > + memset(&buffer[c], 0, sizeof(buffer) - c); > + p = buffer; > + } > + return ev_byte_channel_send(handle, count, p); > +} > + > /*** EARLY CONSOLE DRIVER > ***/ > > #ifdef CONFIG_PPC_EARLY_DEBUG_EHV_BC > @@ -154,7 +168,7 @@ static void byte_channel_spin_send(const char data) > > do { > count = 1; > - ret = ev_byte_channel_send(CONFIG_PPC_EARLY_DEBUG_EHV_BC_HANDLE, > + ret = > local_ev_byte_channel_send(CONFIG_PPC_EARLY_DEBUG_EHV_BC_HANDLE, > &count, &data); > } while (ret == EV_EAGAIN); > } > @@ -221,7 +235,7 @@ static int ehv_bc_console_byte_channel_send(unsigned int > handle, const char *s, > while (count) { > len = min_t(unsigned int, count, EV_BYTE_CHANNEL_MAX_BYTES); > do { > - ret = ev_byte_channel_send(handle, &len, s); > + ret = local_ev_byte_channel_send(handle, &len, s); > } while (ret == EV_EAGAIN); > count -= len; > s += len; > @@ -401,7 +415,7 @@ static void ehv_bc_tx_dequeue(struct ehv_bc_data *bc) > CIRC_CNT_TO_END(bc->head, bc->tail, BUF_SIZE), > EV_BYTE_CHANNEL_MAX_BYTES); > > - ret = ev_byte_channel_send(bc->handle, &len, bc->buf + > bc->tail); > + ret = local_ev_byte_channel_send(bc->handle, &len, bc->buf + > bc->tail); > > /* 'len' is valid only if the return code is 0 or EV_EAGAIN */ > if (!ret || (ret == EV_EAGAIN)) > -- > 2.25.0.rc1 > > -- > Cheers, > Stephen Rothwell
Re: [RESEND PATCH v5 2/5] arm64/crash_core: Export TCR_EL1.T1SZ in vmcoreinfo
Hi James, On 01/11/2020 12:30 AM, Dave Anderson wrote: - Original Message - Hi Bhupesh, On 25/12/2019 19:01, Bhupesh Sharma wrote: On 12/12/2019 04:02 PM, James Morse wrote: On 29/11/2019 19:59, Bhupesh Sharma wrote: vabits_actual variable on arm64 indicates the actual VA space size, and allows a single binary to support both 48-bit and 52-bit VA spaces. If the ARMv8.2-LVA optional feature is present, and we are running with a 64KB page size; then it is possible to use 52-bits of address space for both userspace and kernel addresses. However, any kernel binary that supports 52-bit must also be able to fall back to 48-bit at early boot time if the hardware feature is not present. Since TCR_EL1.T1SZ indicates the size offset of the memory region addressed by TTBR1_EL1 (and hence can be used for determining the vabits_actual value) it makes more sense to export the same in vmcoreinfo rather than vabits_actual variable, as the name of the variable can change in future kernel versions, but the architectural constructs like TCR_EL1.T1SZ can be used better to indicate intended specific fields to user-space. User-space utilities like makedumpfile and crash-utility, need to read/write this value from/to vmcoreinfo (write?) Yes, also write so that the vmcoreinfo from an (crashing) arm64 system can be used for analysis of the root-cause of panic/crash on say an x86_64 host using utilities like crash-utility/gdb. I read this as as "User-space [...] needs to write to vmcoreinfo". That's correct. But for writing to vmcore dump in the kdump kernel, we need to read the symbols from the vmcoreinfo in the primary kernel. for determining if a virtual address lies in the linear map range. I think this is a fragile example. The debugger shouldn't need to know this. Well that the current user-space utility design, so I am not sure we can tweak that too much. 
The user-space computation for determining whether an address lies in the linear map range is the same as we have in kernel-space: #define __is_lm_address(addr)(!(((u64)addr) & BIT(vabits_actual - 1))) This was changed with 14c127c957c1 ("arm64: mm: Flip kernel VA space"). If user-space tools rely on 'knowing' the kernel memory layout, they must have to constantly be fixed and updated. This is a poor argument for adding this to something that ends up as ABI. See above. The user-space has to rely on some ABI/guaranteed hardware-symbols which can be used for 'determining' the kernel memory layout. I disagree. Everything and anything in the kernel will change. The ABI rules apply to stuff exposed via syscalls and kernel filesystems. It does not apply to kernel internals, like the memory layout we used yesterday. 14c127c957c1 is a case in point. A debugger trying to rely on this sort of thing would have to play catchup whenever it changes. Exactly. That's the whole point. The crash utility and makedumpfile are not in the same league as other user-space tools. They have always had to "play catchup" precisely because they depend upon kernel internals, which constantly change. I agree with you and DaveA here. Software user-space debuggers are dependent on kernel internals (which can change from time-to-time) and will have to play catch-up (which has been the case since the very start). Unfortunately we don't have any clear ABI for software debugging tools - may be something to look for in future. A case in point is gdb/kgdb, which still needs to run with KASLR turned-off (nokaslr) for debugging, as it confuses gdb which resolve kernel symbol address from symbol table of vmlinux. But we can work-around the same in makedumpfile/crash by reading the 'kaslr_offset' value. And I have several users telling me now they cannot use gdb on KASLR enabled kernel to debug panics, but can makedumpfile + crash combination to achieve the same. 
So, we should be looking to fix these utilities which are broken since the 52-bit changes for arm64. Accordingly, I will try to send the v6 soon while incorporating the comments posted on the v5. Thanks, Bhupesh
Re: [PATCH] lib: vdso: mark __cvdso_clock_getres() as static
Le 11/01/2020 à 20:59, Thomas Gleixner a écrit : Christophe Leroy writes: When __cvdso_clock_getres() became __cvdso_clock_getres_common() and a new __cvdso_clock_getres() was added, static qualifier was forgotten. Fixes: 502a590a170b ("lib/vdso: Move fallback invocation to the callers") Signed-off-by: Christophe Leroy I've already queued: https://lore.kernel.org/r/20191128111719.8282-1-vincenzo.frasc...@arm.com but thanks for caring! Is there a git tree with the latest VDSO status ? https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git?h=timers%2Fvdso is 6 months old. Christophe
[PATCH v3 3/3] powerpc/powernv: Parse device tree, population of SPR support
Parse the device tree for nodes self-save, self-restore and populate support for the preferred SPRs based what was advertised by the device tree. Signed-off-by: Pratik Rajesh Sampat Reviewed-by: Ram Pai --- arch/powerpc/platforms/powernv/idle.c | 104 ++ 1 file changed, 104 insertions(+) diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 97aeb45e897b..384980c743eb 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -1436,6 +1436,107 @@ static void __init pnv_probe_idle_states(void) supported_cpuidle_states |= pnv_idle_states[i].flags; } +/* + * Extracts and populates the self save or restore capabilities + * passed from the device tree node + */ +static int extract_save_restore_state_dt(struct device_node *np, int type) +{ + int nr_sprns = 0, i, bitmask_index; + int rc = 0; + u64 *temp_u64; + const char *state_prop; + u64 bit_pos; + + state_prop = of_get_property(np, "status", NULL); + if (!state_prop) { + pr_warn("opal: failed to find the active value for self save/restore node"); + return -EINVAL; + } + if (strncmp(state_prop, "disabled", 8) == 0) { + /* +* if the feature is not active, strip the preferred_sprs from +* that capability. +*/ + if (type == SELF_RESTORE_TYPE) { + for (i = 0; i < nr_preferred_sprs; i++) { + preferred_sprs[i].supported_mode &= + ~SELF_RESTORE_STRICT; + } + } else { + for (i = 0; i < nr_preferred_sprs; i++) { + preferred_sprs[i].supported_mode &= + ~SELF_SAVE_STRICT; + } + } + return 0; + } + nr_sprns = of_property_count_u64_elems(np, "sprn-bitmask"); + if (nr_sprns <= 0) + return rc; + temp_u64 = kcalloc(nr_sprns, sizeof(u64), GFP_KERNEL); + if (of_property_read_u64_array(np, "sprn-bitmask", + temp_u64, nr_sprns)) { + pr_warn("cpuidle-powernv: failed to find registers in DT\n"); + kfree(temp_u64); + return -EINVAL; + } + /* +* Populate acknowledgment of support for the sprs in the global vector +* gotten by the registers supplied by the firmware. 
+* The registers are in a bitmask, bit index within +* that specifies the SPR +*/ + for (i = 0; i < nr_preferred_sprs; i++) { + bitmask_index = preferred_sprs[i].spr / 64; + bit_pos = preferred_sprs[i].spr % 64; + if ((temp_u64[bitmask_index] & (1UL << bit_pos)) == 0) { + if (type == SELF_RESTORE_TYPE) + preferred_sprs[i].supported_mode &= + ~SELF_RESTORE_STRICT; + else + preferred_sprs[i].supported_mode &= + ~SELF_SAVE_STRICT; + continue; + } + if (type == SELF_RESTORE_TYPE) { + preferred_sprs[i].supported_mode |= + SELF_RESTORE_STRICT; + } else { + preferred_sprs[i].supported_mode |= + SELF_SAVE_STRICT; + } + } + + kfree(temp_u64); + return rc; +} + +static int pnv_parse_deepstate_dt(void) +{ + struct device_node *sr_np, *ss_np; + int rc = 0; + + /* Self restore register population */ + sr_np = of_find_node_by_path("/ibm,opal/power-mgt/self-restore"); + if (!sr_np) { + pr_warn("opal: self restore Node not found"); + } else { + rc = extract_save_restore_state_dt(sr_np, SELF_RESTORE_TYPE); + if (rc != 0) + return rc; + } + /* Self save register population */ + ss_np = of_find_node_by_path("/ibm,opal/power-mgt/self-save"); + if (!ss_np) { + pr_warn("opal: self save Node not found"); + pr_warn("Legacy firmware. Assuming default self-restore support"); + } else { + rc = extract_save_restore_state_dt(ss_np, SELF_SAVE_TYPE); + } + return rc; +} + /* * This function parses device-tree and populates all the information * into pnv_idle_states structure. It also sets up nr_pnv_idle_states @@ -1584,6 +1685,9 @@ static int __init pnv_init_idle_states(void) return rc; pnv_probe_idle_states(); + rc = pnv_parse_deepstate_dt(); + if (rc) + return rc; if (!cp
[PATCH v3 2/3] powerpc/powernv: Introduce Self save support
This commit introduces and leverages the Self save API which OPAL now supports. Add the new Self Save OPAL API call in the list of OPAL calls. Implement the self saving of the SPRs based on the support populated while respecting it's preferences. This implementation allows mixing of support for the SPRs, which means that a SPR can be self restored while another SPR be self saved if they support and prefer it to be so. Signed-off-by: Pratik Rajesh Sampat Reviewed-by: Ram Pai --- arch/powerpc/include/asm/opal-api.h| 3 ++- arch/powerpc/include/asm/opal.h| 1 + arch/powerpc/platforms/powernv/idle.c | 22 ++ arch/powerpc/platforms/powernv/opal-call.c | 1 + 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index c1f25a760eb1..1b6e1a68d431 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -214,7 +214,8 @@ #define OPAL_SECVAR_GET176 #define OPAL_SECVAR_GET_NEXT 177 #define OPAL_SECVAR_ENQUEUE_UPDATE 178 -#define OPAL_LAST 178 +#define OPAL_SLW_SELF_SAVE_REG 181 +#define OPAL_LAST 181 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 9986ac34b8e2..389a85b63805 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -203,6 +203,7 @@ int64_t opal_handle_hmi(void); int64_t opal_handle_hmi2(__be64 *out_flags); int64_t opal_register_dump_region(uint32_t id, uint64_t start, uint64_t end); int64_t opal_unregister_dump_region(uint32_t id); +int64_t opal_slw_self_save_reg(uint64_t cpu_pir, uint64_t sprn); int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val); int64_t opal_config_cpu_idle_state(uint64_t state, uint64_t flag); int64_t opal_pci_set_phb_cxl_mode(uint64_t phb_id, uint64_t mode, uint64_t pe_number); diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c index 03fe835aadd1..97aeb45e897b 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -279,6 +279,26 @@ static int pnv_self_save_restore_sprs(void) if (rc != 0) return rc; break; + } else if (preferred & curr_spr.supported_mode + & SELF_SAVE_STRICT) { + is_initialized = true; + if (curr_spr.spr == SPRN_HMEER && + cpu_thread_in_core(cpu) != 0) { + continue; + } + rc = opal_slw_self_save_reg(pir, + curr_spr.spr); + if (rc != 0) + return rc; + switch (curr_spr.spr) { + case SPRN_LPCR: + is_lpcr_self_save = true; + break; + case SPRN_PTCR: + is_ptcr_self_save = true; + break; + } + break; } preferred_sprs[index].preferred_mode = preferred_sprs[index].preferred_mode >> @@ -1159,6 +1179,8 @@ void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) if (!is_lpcr_self_save) opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val); + else + opal_slw_self_save_reg(pir, SPRN_LPCR); } } diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index 5cd0f52d258f..11e0ceb90de0 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -223,6 +223,7 @@ OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); OPAL_CALL(opal_handle_hmi2,OPAL_HANDLE_HMI2); OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); OPAL_CALL(opal_slw_set_reg,OPAL_SLW_SET_REG); +OPAL_CALL(opal_slw_self_save_reg, OPAL_SLW_SELF_SAV
[PATCH v3 1/3] powerpc/powernv: Interface to define support and preference for a SPR
Define a bitmask interface to determine support for the Self Restore, Self Save or both. Also define an interface to determine the preference of that SPR to be strictly saved or restored or encapsulated with an order of preference. The preference bitmask is shown as below: |... | 2nd pref | 1st pref | MSB LSB The preference from higher to lower is from LSB to MSB with a shift of 8 bits. Example: Prefer self save first, if not available then prefer self restore The preference mask for this scenario will be seen as below. ((SELF_RESTORE_STRICT << PREFERENCE_SHIFT) | SELF_SAVE_STRICT) - |... | Self restore | Self save | - MSB LSB Finally, declare a list of preferred SPRs which encapsulate the bitmaks for preferred and supported with defaults of both being set to support legacy firmware. This commit also implements using the above interface and retains the legacy functionality of self restore. Signed-off-by: Pratik Rajesh Sampat Reviewed-by: Ram Pai --- arch/powerpc/platforms/powernv/idle.c | 316 +- 1 file changed, 259 insertions(+), 57 deletions(-) diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 78599bca66c2..03fe835aadd1 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -32,9 +32,112 @@ #define P9_STOP_SPR_MSR 2000 #define P9_STOP_SPR_PSSCR 855 +/* Interface for the stop state supported and preference */ +#define SELF_RESTORE_TYPE0 +#define SELF_SAVE_TYPE 1 + +#define NR_PREFERENCES2 +#define PREFERENCE_SHIFT 4 +#define PREFERENCE_MASK 0xf + +#define UNSUPPORTED 0x0 +#define SELF_RESTORE_STRICT 0x1 +#define SELF_SAVE_STRICT0x2 + +/* + * Bitmask defining the kind of preferences available. + * Note : The higher to lower preference is from LSB to MSB, with a shift of + * 4 bits. 
+ * + * || 2nd pref | 1st pref | + * + * MSB LSB + */ +/* Prefer Restore if available, otherwise unsupported */ +#define PREFER_SELF_RESTORE_ONLY SELF_RESTORE_STRICT +/* Prefer Save if available, otherwise unsupported */ +#define PREFER_SELF_SAVE_ONLY SELF_SAVE_STRICT +/* Prefer Restore when available, otherwise prefer Save */ +#define PREFER_RESTORE_SAVE((SELF_SAVE_STRICT << \ + PREFERENCE_SHIFT)\ + | SELF_RESTORE_STRICT) +/* Prefer Save when available, otherwise prefer Restore*/ +#define PREFER_SAVE_RESTORE((SELF_RESTORE_STRICT <<\ + PREFERENCE_SHIFT)\ + | SELF_SAVE_STRICT) static u32 supported_cpuidle_states; struct pnv_idle_states_t *pnv_idle_states; int nr_pnv_idle_states; +/* Caching the lpcr & ptcr support to use later */ +static bool is_lpcr_self_save; +static bool is_ptcr_self_save; + +struct preferred_sprs { + u64 spr; + u32 preferred_mode; + u32 supported_mode; +}; + +/* + * Preferred mode: Order of precedence when both self-save and self-restore + *supported + * Supported mode: Default support. 
Can be overwritten during system + *initialization + */ +struct preferred_sprs preferred_sprs[] = { + { + .spr = SPRN_HSPRG0, + .preferred_mode = PREFER_RESTORE_SAVE, + .supported_mode = SELF_RESTORE_STRICT, + }, + { + .spr = SPRN_LPCR, + .preferred_mode = PREFER_RESTORE_SAVE, + .supported_mode = SELF_RESTORE_STRICT, + }, + { + .spr = SPRN_PTCR, + .preferred_mode = PREFER_SAVE_RESTORE, + .supported_mode = SELF_RESTORE_STRICT, + }, + { + .spr = SPRN_HMEER, + .preferred_mode = PREFER_RESTORE_SAVE, + .supported_mode = SELF_RESTORE_STRICT, + }, + { + .spr = SPRN_HID0, + .preferred_mode = PREFER_RESTORE_SAVE, + .supported_mode = SELF_RESTORE_STRICT, + }, + { + .spr = P9_STOP_SPR_MSR, + .preferred_mode = PREFER_RESTORE_SAVE, + .supported_mode = SELF_RESTORE_STRICT, + }, + { + .spr = P9_STOP_SPR_PSSCR, + .preferred_mode = PREFER_SAVE_RESTORE, + .supported_mode = SELF_RESTORE_STRICT, + }, + { + .spr = SPRN_HID1, + .preferred_mode = PREFER_RESTORE_SAVE, + .supported_mode = SELF_RESTORE_STRICT, + }, + { + .spr = SPRN_HID4, + .preferred_mode = PREFER_RESTORE_SAVE, + .supported_mode = SELF_RESTORE_STRICT, +
[PATCH v3 0/3] Introduce Self-Save API for deep stop states
Skiboot patches: https://patchwork.ozlabs.org/patch/1221011/ v2 patches: https://lkml.org/lkml/2020/1/12/253 Changelog Patch v2 --> v3 1. Addressed minor comments from Ram Pai Currently the stop-API supports a mechanism called self-restore which allows us to restore the values of certain SPRs on wakeup from a deep-stop state to a desired value. To use this, the Kernel makes an OPAL call passing the PIR of the CPU, the SPR number and the value to which the SPR should be restored when that CPU wakes up from a deep stop state. Recently, a new feature, named self-save has been enabled in the stop-api, which is an alternative mechanism to do the same, except that self-save will save the current content of the SPR before entering a deep stop state and also restore the content back on waking up from a deep stop state. This patch series aims at introducing and leveraging the self-save feature in the kernel. Now, as the kernel has a choice to prefer one mode over the other and there can be registers in both the save/restore SPR list which are sent from the device tree, a new interface has been defined for the seamless handling of the modes for each SPR. A list of preferred SPRs is maintained in the kernel which contains two properties: 1. supported_mode: Helps in identifying if it strictly supports self save or restore or both. Initialized using the information from device tree. 2. preferred_mode: Calls out what mode is preferred for each SPR. It could be strictly self save or restore, or it can also determine the preference of mode over the other if both are present by encapsulating the other in bitmask from LSB to MSB. Initialized statically. Below is a table to show the Scenario::Consequence when the self save and self restore modes are available or disabled in different combinations as perceived from the device tree thus giving complete backwards compatibility regardless of an older firmware running a newer kernel or vice-versa. 
Support for self save or self-restore is embedded in the device tree, along with the set of registers it supports. SR = Self restore; SS = Self save .---.. | Scenario |Consequence | :---+: | Legacy Firmware. No SS or SR node | Self restore is called for all | | | supported SPRs | :---+: | SR: !active SS: !active | Deep stop states disabled | :---+: | SR: active SS: !active| Self restore is called for all | | | supported SPRs | :---+: | SR: active SS: active | Goes through the preferences for each | | | SPR and executes of the modes | | | accordingly. Currently, Self restore is| | | called for all the SPRs except PSSCR | | | which is self saved| :---+: | SR: active(only HID0) SS: active | Self save called for all supported | | | registers expect HID0 (as HID0 cannot | | | be self saved currently) | :---+: | SR: !active SS: active| currently will disable deep states as | | | HID0 is needed to be self restored and | | | cannot be self saved | '---'' Pratik Rajesh Sampat (3): powerpc/powernv: Interface to define support and preference for a SPR powerpc/powernv: Introduce Self save support powerpc/powernv: Parse device tree, population of SPR support arch/powerpc/include/asm/opal-api.h| 3 +- arch/powerpc/include/asm/opal.h| 1 + arch/powerpc/platforms/powernv/idle.c | 442 ++--- arch/powerpc/platforms/powernv/opal-call.c | 1 + 4 files changed, 389 insertions(+), 58 deletions(-) -- 2.24.1
Re: [PATCH 15/18] powerpc/uprobes: Add support for prefixed instructions
On Tue, Nov 26, 2019 at 04:21:38PM +1100, Jordan Niethe wrote: > Uprobes can execute instructions out of line. Increase the size of the > buffer used for this so that this works for prefixed instructions. Take > into account the length of prefixed instructions when fixing up the nip. > > Signed-off-by: Jordan Niethe > --- > arch/powerpc/include/asm/uprobes.h | 18 ++ > arch/powerpc/kernel/uprobes.c | 4 ++-- > 2 files changed, 16 insertions(+), 6 deletions(-) > > diff --git a/arch/powerpc/include/asm/uprobes.h > b/arch/powerpc/include/asm/uprobes.h > index 2bbdf27d09b5..5b5e8a3d2f55 100644 > --- a/arch/powerpc/include/asm/uprobes.h > +++ b/arch/powerpc/include/asm/uprobes.h > @@ -14,18 +14,28 @@ > > typedef ppc_opcode_t uprobe_opcode_t; > > +/* > + * We have to ensure we have enought space for prefixed instructions, which minor typo of `enought` and we can have something like below, s/We have to ensure we have enought/Ensure we have enough -- Bala > + * are double the size of a word instruction, i.e. 8 bytes. However, > + * sometimes it is simpler to treat a prefixed instruction like 2 word > + * instructions. > + */ > #define MAX_UINSN_BYTES 4 > -#define UPROBE_XOL_SLOT_BYTES(MAX_UINSN_BYTES) > +#define UPROBE_XOL_SLOT_BYTES(2 * MAX_UINSN_BYTES) > > /* The following alias is needed for reference from arch-agnostic code */ > #define UPROBE_SWBP_INSN BREAKPOINT_INSTRUCTION > #define UPROBE_SWBP_INSN_SIZE4 /* swbp insn size in bytes */ > > struct arch_uprobe { > + /* > + * Ensure there is enough space for prefixed instructions. Prefixed > + * instructions must not cross 64-byte boundaries. 
> + */ > union { > - u32 insn; > - u32 ixol; > - }; > + uprobe_opcode_t insn[2]; > + uprobe_opcode_t ixol[2]; > + } __aligned(64); > }; > > struct arch_uprobe_task { > diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c > index ab1077dc6148..cfcea6946f8b 100644 > --- a/arch/powerpc/kernel/uprobes.c > +++ b/arch/powerpc/kernel/uprobes.c > @@ -111,7 +111,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, > struct pt_regs *regs) >* support doesn't exist and have to fix-up the next instruction >* to be executed. >*/ > - regs->nip = utask->vaddr + MAX_UINSN_BYTES; > + regs->nip = utask->vaddr + ((IS_PREFIX(auprobe->insn[0])) ? 8 : 4); > > user_disable_single_step(current); > return 0; > @@ -173,7 +173,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, > struct pt_regs *regs) >* emulate_step() returns 1 if the insn was successfully emulated. >* For all other cases, we need to single-step in hardware. >*/ > - ret = emulate_step(regs, auprobe->insn, 0); > + ret = emulate_step(regs, auprobe->insn[0], auprobe->insn[1]); > if (ret > 0) > return true; > > -- > 2.20.1 >
Re: [RESEND PATCH v2 2/3] powerpc/powernv: Introduce Self save support
I made a mistake while arranging the patches in the series. I'll re-arrange it correctly now. Sorry about that. On 13/01/20 1:21 pm, Ram Pai wrote: On Mon, Jan 13, 2020 at 09:15:08AM +0530, Pratik Rajesh Sampat wrote: This commit introduces and leverages the Self save API which OPAL now supports. Add the new Self Save OPAL API call in the list of OPAL calls. Implement the self saving of the SPRs based on the support populated while respecting it's preferences. This implementation allows mixing of support for the SPRs, which means that a SPR can be self restored while another SPR be self saved if they support and prefer it to be so. Signed-off-by: Pratik Rajesh Sampat --- arch/powerpc/include/asm/opal-api.h| 3 ++- arch/powerpc/include/asm/opal.h| 1 + arch/powerpc/platforms/powernv/idle.c | 2 ++ arch/powerpc/platforms/powernv/opal-call.c | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index c1f25a760eb1..1b6e1a68d431 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -214,7 +214,8 @@ #define OPAL_SECVAR_GET 176 #define OPAL_SECVAR_GET_NEXT 177 #define OPAL_SECVAR_ENQUEUE_UPDATE178 -#define OPAL_LAST 178 +#define OPAL_SLW_SELF_SAVE_REG 181 +#define OPAL_LAST 181 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT2 /* Fail all calls with OPAL_BUSY */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 9986ac34b8e2..389a85b63805 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -203,6 +203,7 @@ int64_t opal_handle_hmi(void); int64_t opal_handle_hmi2(__be64 *out_flags); int64_t opal_register_dump_region(uint32_t id, uint64_t start, uint64_t end); int64_t opal_unregister_dump_region(uint32_t id); +int64_t opal_slw_self_save_reg(uint64_t cpu_pir, uint64_t sprn); int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val); int64_t 
opal_config_cpu_idle_state(uint64_t state, uint64_t flag); int64_t opal_pci_set_phb_cxl_mode(uint64_t phb_id, uint64_t mode, uint64_t pe_number); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 2f328403b0dc..d67d4d0b169b 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -1172,6 +1172,8 @@ void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) if (!is_lpcr_self_save) opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val); + else + opal_slw_self_save_reg(pir, SPRN_LPCR); opal_slw_self_save_reg() was used in the prior patch too. How did it compile, if the definition is in this patch? Reviewed-by: Ram Pai RP
Re: [RESEND PATCH v2 1/3] powerpc/powernv: Interface to define support and preference for a SPR
Thanks for the review. The support just signifies what is default. Self restore is known to be supported by legacy systems. I'll mention a comment saying that this can change when the system is initialized. On 13/01/20 1:14 pm, Ram Pai wrote: On Mon, Jan 13, 2020 at 09:15:07AM +0530, Pratik Rajesh Sampat wrote: Define a bitmask interface to determine support for the Self Restore, Self Save or both. Also define an interface to determine the preference of that SPR to be strictly saved or restored or encapsulated with an order of preference. The preference bitmask is shown as below: |... | 2nd pref | 1st pref | MSB LSB The preference from higher to lower is from LSB to MSB with a shift of 8 bits. Example: Prefer self save first, if not available then prefer self restore The preference mask for this scenario will be seen as below. ((SELF_RESTORE_STRICT << PREFERENCE_SHIFT) | SELF_SAVE_STRICT) - |... | Self restore | Self save | - MSB LSB Finally, declare a list of preferred SPRs which encapsulate the bitmasks for preferred and supported with defaults of both being set to support legacy firmware. This commit also implements using the above interface and retains the legacy functionality of self restore. 
Signed-off-by: Pratik Rajesh Sampat --- arch/powerpc/platforms/powernv/idle.c | 327 +- 1 file changed, 271 insertions(+), 56 deletions(-) diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 78599bca66c2..2f328403b0dc 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -32,9 +32,106 @@ #define P9_STOP_SPR_MSR 2000 #define P9_STOP_SPR_PSSCR 855 +/* Interface for the stop state supported and preference */ +#define SELF_RESTORE_TYPE0 +#define SELF_SAVE_TYPE 1 + +#define NR_PREFERENCES2 +#define PREFERENCE_SHIFT 4 +#define PREFERENCE_MASK 0xf + +#define UNSUPPORTED 0x0 +#define SELF_RESTORE_STRICT 0x1 +#define SELF_SAVE_STRICT0x2 + +/* + * Bitmask defining the kind of preferences available. + * Note : The higher to lower preference is from LSB to MSB, with a shift of + * 4 bits. + * + * || 2nd pref | 1st pref | + * + * MSB LSB + */ +/* Prefer Restore if available, otherwise unsupported */ +#define PREFER_SELF_RESTORE_ONLY SELF_RESTORE_STRICT +/* Prefer Save if available, otherwise unsupported */ +#define PREFER_SELF_SAVE_ONLY SELF_SAVE_STRICT +/* Prefer Restore when available, otherwise prefer Save */ +#define PREFER_RESTORE_SAVE((SELF_SAVE_STRICT << \ + PREFERENCE_SHIFT)\ + | SELF_RESTORE_STRICT) +/* Prefer Save when available, otherwise prefer Restore*/ +#define PREFER_SAVE_RESTORE((SELF_RESTORE_STRICT <<\ + PREFERENCE_SHIFT)\ + | SELF_SAVE_STRICT) static u32 supported_cpuidle_states; struct pnv_idle_states_t *pnv_idle_states; int nr_pnv_idle_states; +/* Caching the lpcr & ptcr support to use later */ +static bool is_lpcr_self_save; +static bool is_ptcr_self_save; + +struct preferred_sprs { + u64 spr; + u32 preferred_mode; + u32 supported_mode; +}; + +struct preferred_sprs preferred_sprs[] = { + { + .spr = SPRN_HSPRG0, + .preferred_mode = PREFER_RESTORE_SAVE, + .supported_mode = SELF_RESTORE_STRICT, + }, + { + .spr = SPRN_LPCR, + .preferred_mode = PREFER_RESTORE_SAVE, + 
.supported_mode = SELF_RESTORE_STRICT, + }, + { + .spr = SPRN_PTCR, + .preferred_mode = PREFER_SAVE_RESTORE, + .supported_mode = SELF_RESTORE_STRICT, + }, This confuses me. It says SAVE takes precedence over RESTORE. and than it says it is strictly 'RESTORE' only. Maybe you should not initialize the 'supported_mode' ? or put a comment somewhere here, saying this value will be overwritten during system initialization? Otherwise the code looks correct. Reviewed-by: Ram Pai RP