Re: [PATCH 14/18] powerpc/kprobes: Support kprobes on prefixed instructions

2020-01-13 Thread Balamuruhan S
On Tue, Nov 26, 2019 at 04:21:37PM +1100, Jordan Niethe wrote:
> A prefixed instruction is composed of a word prefix followed by a word
> suffix. It does not make sense to be able to have a kprobe on the suffix
> of a prefixed instruction, so make this impossible.
> 
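For reference: a prefix word carries primary opcode 1 (ISA v3.1), so the
IS_PREFIX() test used further down can be assumed to be a sketch along
these lines:

	/* primary opcode (top 6 bits) of 1 marks a prefix word */
	#define IS_PREFIX(insn)	(((insn) >> 26) == 1)
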
> Kprobes work by replacing an instruction with a trap and saving that
> instruction to be single stepped out of place later. Currently there is
> not enough space allocated to keep a prefixed instruction for single
> stepping. Increase the amount of space allocated for holding the
> instruction copy.
> 
> kprobe_post_handler() expects all instructions to be 4 bytes long which
> means that it does not function correctly for prefixed instructions.
> Add checks for prefixed instructions which will use a length of 8 bytes
> instead.
> 
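A minimal sketch of the single-step adjustment this implies, assuming the
IS_PREFIX() test above:

	/* advance nip past the probed instruction: 8 bytes if prefixed */
	int len = IS_PREFIX(insn) ? 8 : 4;
	regs->nip = (unsigned long)p->addr + len;
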
> For optprobes we normally patch in loading the instruction we put a
> probe on into r4 before calling emulate_step(). We now make space and
> patch in loading the suffix into r5 as well.
> 
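A sketch of the optprobe template patching described above (the helper
and index names are assumptions, not the exact hunk):

	/* load the probed word into r4 and its suffix into r5
	 * before the template branches to emulate_step() */
	patch_imm32_load_insns(p->ainsn.insn[0], 4, buff + TMPL_INSN_IDX);
	patch_imm32_load_insns(p->ainsn.insn[1], 5, buff + TMPL_SUFX_IDX);
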
> Signed-off-by: Jordan Niethe 
> ---
>  arch/powerpc/include/asm/kprobes.h   |  5 +--
>  arch/powerpc/kernel/kprobes.c| 46 +---
>  arch/powerpc/kernel/optprobes.c  | 31 +++
>  arch/powerpc/kernel/optprobes_head.S |  6 
>  4 files changed, 62 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kprobes.h 
> b/arch/powerpc/include/asm/kprobes.h
> index 66b3f2983b22..1f03a1cacb1e 100644
> --- a/arch/powerpc/include/asm/kprobes.h
> +++ b/arch/powerpc/include/asm/kprobes.h
> @@ -38,12 +38,13 @@ extern kprobe_opcode_t optprobe_template_entry[];
>  extern kprobe_opcode_t optprobe_template_op_address[];
>  extern kprobe_opcode_t optprobe_template_call_handler[];
>  extern kprobe_opcode_t optprobe_template_insn[];
> +extern kprobe_opcode_t optprobe_template_sufx[];
>  extern kprobe_opcode_t optprobe_template_call_emulate[];
>  extern kprobe_opcode_t optprobe_template_ret[];
>  extern kprobe_opcode_t optprobe_template_end[];
>  
> -/* Fixed instruction size for powerpc */
> -#define MAX_INSN_SIZE	1
> +/* Prefixed instructions are two words */
> +#define MAX_INSN_SIZE	2
>  #define MAX_OPTIMIZED_LENGTH	sizeof(kprobe_opcode_t) /* 4 bytes */
>  #define MAX_OPTINSN_SIZE	(optprobe_template_end - optprobe_template_entry)
>  #define RELATIVEJUMP_SIZE	sizeof(kprobe_opcode_t) /* 4 bytes */
> diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
> index 7303fe3856cc..aa15b3480385 100644
> --- a/arch/powerpc/kernel/kprobes.c
> +++ b/arch/powerpc/kernel/kprobes.c
> @@ -104,17 +104,30 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, 
> unsigned int offset)
>  
>  int arch_prepare_kprobe(struct kprobe *p)
>  {
> + int len;
>   int ret = 0;
> + struct kprobe *prev;
>   kprobe_opcode_t insn = *p->addr;
> + kprobe_opcode_t prfx = *(p->addr - 1);
>  
> + preempt_disable();
>   if ((unsigned long)p->addr & 0x03) {
>   printk("Attempt to register kprobe at an unaligned address\n");
>   ret = -EINVAL;
>   } else if (IS_MTMSRD(insn) || IS_RFID(insn) || IS_RFI(insn)) {
>   printk("Cannot register a kprobe on rfi/rfid or mtmsr[d]\n");
>   ret = -EINVAL;
> + } else if (IS_PREFIX(prfx)) {
> + printk("Cannot register a kprobe on the second word of prefixed 
> instruction\n");

Let's keep lines within 80 columns.

> + ret = -EINVAL;
> + }
> + prev = get_kprobe(p->addr - 1);
> + if (prev && IS_PREFIX(*prev->ainsn.insn)) {
> + printk("Cannot register a kprobe on the second word of prefixed 
> instruction\n");

same here.

-- Bala
> + ret = -EINVAL;
>   }
>  
> +
>   /* insn must be on a special executable page on ppc64.  This is
>* not explicitly required on ppc32 (right now), but it doesn't hurt */
>   if (!ret) {
> @@ -124,14 +137,18 @@ int arch_prepare_kprobe(struct kprobe *p)
>   }
>  
>   if (!ret) {
> - memcpy(p->ainsn.insn, p->addr,
> - MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
> + if (IS_PREFIX(insn))
> + len = MAX_INSN_SIZE * sizeof(kprobe_opcode_t);
> + else
> + len = sizeof(kprobe_opcode_t);
> + memcpy(p->ainsn.insn, p->addr, len);
>   p->opcode = *p->addr;
>   flush_icache_range((unsigned long)p->ainsn.insn,
>   (unsigned long)p->ainsn.insn + sizeof(kprobe_opcode_t));
>   }
>  
>   p->ainsn.boostable = 0;
> + preempt_enable_no_resched();
>   return ret;
>  }
>  NOKPROBE_SYMBOL(arch_prepare_kprobe);
> @@ -216,10 +233,11 @@ NOKPROBE_SYMBOL(arch_prepare_kretprobe);
>  static int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
>  {
>   int ret;
> - unsigned int insn = *p->ainsn.insn;
> + unsigned int insn = p->ainsn.insn[0];
> + unsigned int sufx = p->ainsn.insn[1];

[PATCH v2] powerpc/ptdump: don't entirely rebuild kernel when selecting CONFIG_PPC_DEBUG_WX

2020-01-13 Thread Christophe Leroy
Selecting CONFIG_PPC_DEBUG_WX only impacts ptdump and pgtable_32/64
init calls. Declaring related functions in asm/pgtable.h implies
rebuilding almost everything.

Move ptdump_check_wx() declaration into mm/mmu_decl.h

Signed-off-by: Christophe Leroy 
---
v2: use mm/mmu_decl.h instead of a new asm/ptdump.h
---
 arch/powerpc/include/asm/pgtable.h | 6 --
 arch/powerpc/mm/mmu_decl.h | 6 ++
 arch/powerpc/mm/ptdump/ptdump.c| 2 ++
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 0e4ec8cc37b7..8cc543ed114c 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -94,12 +94,6 @@ void mark_initmem_nx(void);
 static inline void mark_initmem_nx(void) { }
 #endif
 
-#ifdef CONFIG_PPC_DEBUG_WX
-void ptdump_check_wx(void);
-#else
-static inline void ptdump_check_wx(void) { }
-#endif
-
 /*
  * When used, PTE_FRAG_NR is defined in subarch pgtable.h
  * so we are sure it is included when arriving here.
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 8e99649c24fc..7097e07a209a 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -181,3 +181,9 @@ void mmu_mark_rodata_ro(void);
 static inline void mmu_mark_initmem_nx(void) { }
 static inline void mmu_mark_rodata_ro(void) { }
 #endif
+
+#ifdef CONFIG_PPC_DEBUG_WX
+void ptdump_check_wx(void);
+#else
+static inline void ptdump_check_wx(void) { }
+#endif
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 2f9ddc29c535..4af0d5d9589e 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -24,6 +24,8 @@
 #include 
 #include 
 
+#include <mm/mmu_decl.h>
+
 #include "ptdump.h"
 
 /*
-- 
2.13.3



Re: [PATCH -next] powerpc/pmac/smp: Fix old-style declaration

2020-01-13 Thread Michael Ellerman
Christophe Leroy  writes:
> YueHaibing  wrote:
>
>> The compiler expects the 'static' keyword to come first in a declaration
>>
>> arch/powerpc/platforms/powermac/smp.c:664:1: warning: static is not  
>> at beginning of declaration [-Wold-style-declaration]
>> arch/powerpc/platforms/powermac/smp.c:665:1: warning: static is not  
>> at beginning of declaration [-Wold-style-declaration]
>>
>> Signed-off-by: YueHaibing 
>> ---
>>  arch/powerpc/platforms/powermac/smp.c | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/powermac/smp.c  
>> b/arch/powerpc/platforms/powermac/smp.c
>> index f95fbde..7233b85 100644
>> --- a/arch/powerpc/platforms/powermac/smp.c
>> +++ b/arch/powerpc/platforms/powermac/smp.c
>> @@ -661,8 +661,8 @@ static void smp_core99_gpio_tb_freeze(int freeze)
>>  #endif /* !CONFIG_PPC64 */
>>
>>  /* L2 and L3 cache settings to pass from CPU0 to CPU1 on G4 cpus */
>> -volatile static long int core99_l2_cache;
>> -volatile static long int core99_l3_cache;
>> +static volatile long int core99_l2_cache;
>> +static volatile long int core99_l3_cache;
>
> Is it correct to declare it as volatile?

I don't see any reason why it needs to be volatile, so I think we can
just remove that?
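
Something like this sketch, with the storage class first and the
volatile dropped:

	static long core99_l2_cache;
	static long core99_l3_cache;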

cheers


Re: [PATCH 5/5] powernv/pci: Move pnv_pci_dma_bus_setup() to pci-ioda.c

2020-01-13 Thread Alexey Kardashevskiy



On 10/01/2020 18:02, Oliver O'Halloran wrote:
> This is only used in pci-ioda.c so move it there and rename it to match.
> 
> Signed-off-by: Oliver O'Halloran 




Reviewed-by: Alexey Kardashevskiy 


> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 22 +-
>  arch/powerpc/platforms/powernv/pci.c  | 20 
>  arch/powerpc/platforms/powernv/pci.h  |  1 -
>  3 files changed, 21 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index e2a9440..4701621 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -3628,9 +3628,29 @@ static void pnv_pci_ioda_shutdown(struct 
> pci_controller *hose)
>  OPAL_ASSERT_RESET);
>  }
>  
> +static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus)
> +{
> + struct pci_controller *hose = bus->sysdata;
> + struct pnv_phb *phb = hose->private_data;
> + struct pnv_ioda_pe *pe;
> +
> + list_for_each_entry(pe, &phb->ioda.pe_list, list) {
> + if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
> + continue;
> +
> + if (!pe->pbus)
> + continue;
> +
> + if (bus->number == ((pe->rid >> 8) & 0xFF)) {
> + pe->pbus = bus;
> + break;
> + }
> + }
> +}
> +
>  static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
>   .dma_dev_setup  = pnv_pci_ioda_dma_dev_setup,
> - .dma_bus_setup  = pnv_pci_dma_bus_setup,
> + .dma_bus_setup  = pnv_pci_ioda_dma_bus_setup,
>   .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
>   .setup_msi_irqs = pnv_setup_msi_irqs,
>   .teardown_msi_irqs  = pnv_teardown_msi_irqs,
> diff --git a/arch/powerpc/platforms/powernv/pci.c 
> b/arch/powerpc/platforms/powernv/pci.c
> index 31f1949..a17bc1a1 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -810,26 +810,6 @@ struct iommu_table *pnv_pci_table_alloc(int nid)
>   return tbl;
>  }
>  
> -void pnv_pci_dma_bus_setup(struct pci_bus *bus)
> -{
> - struct pci_controller *hose = bus->sysdata;
> - struct pnv_phb *phb = hose->private_data;
> - struct pnv_ioda_pe *pe;
> -
> - list_for_each_entry(pe, &phb->ioda.pe_list, list) {
> - if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
> - continue;
> -
> - if (!pe->pbus)
> - continue;
> -
> - if (bus->number == ((pe->rid >> 8) & 0xFF)) {
> - pe->pbus = bus;
> - break;
> - }
> - }
> -}
> -
>  struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev)
>  {
>   struct pci_controller *hose = pci_bus_to_host(dev->bus);
> diff --git a/arch/powerpc/platforms/powernv/pci.h 
> b/arch/powerpc/platforms/powernv/pci.h
> index 0cdc9ba..d3bbdea 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -188,7 +188,6 @@ extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, 
> unsigned long msr);
>  extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
>  extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
>  
> -extern void pnv_pci_dma_bus_setup(struct pci_bus *bus);
>  extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
>  extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
>  extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
> 

-- 
Alexey


Re: [PATCH 4/5] powernv/pci: Fold pnv_pci_dma_dev_setup() into the pci-ioda.c version

2020-01-13 Thread Alexey Kardashevskiy



On 10/01/2020 18:02, Oliver O'Halloran wrote:
> pnv_pci_dma_dev_setup() does nothing but call the phb->dma_dev_setup()
> callback, if one exists. That callback is only set for normal PCIe PHBs so
> we can remove the layer of indirection and use the ioda version in
> the pci_controller_ops.
> 
> Signed-off-by: Oliver O'Halloran 



Reviewed-by: Alexey Kardashevskiy 



> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 7 ---
>  arch/powerpc/platforms/powernv/pci.c  | 9 -
>  arch/powerpc/platforms/powernv/pci.h  | 2 --
>  3 files changed, 4 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index ae177ee..e2a9440 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1743,8 +1743,10 @@ int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 
> num_vfs)
>  }
>  #endif /* CONFIG_PCI_IOV */
>  
> -static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev 
> *pdev)
> +static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
>  {
> + struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> + struct pnv_phb *phb = hose->private_data;
>   struct pci_dn *pdn = pci_get_pdn(pdev);
>   struct pnv_ioda_pe *pe;
>  
> @@ -3627,7 +3629,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller 
> *hose)
>  }
>  
>  static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
> - .dma_dev_setup  = pnv_pci_dma_dev_setup,
> + .dma_dev_setup  = pnv_pci_ioda_dma_dev_setup,
>   .dma_bus_setup  = pnv_pci_dma_bus_setup,
>   .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
>   .setup_msi_irqs = pnv_setup_msi_irqs,
> @@ -3886,7 +3888,6 @@ static void __init pnv_pci_init_ioda_phb(struct 
> device_node *np,
>   hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
>   break;
>   default:
> - phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
>   hose->controller_ops = pnv_pci_ioda_controller_ops;
>   }
>  
> diff --git a/arch/powerpc/platforms/powernv/pci.c 
> b/arch/powerpc/platforms/powernv/pci.c
> index 8307e1f..31f1949 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -810,15 +810,6 @@ struct iommu_table *pnv_pci_table_alloc(int nid)
>   return tbl;
>  }
>  
> -void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
> -{
> - struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> - struct pnv_phb *phb = hose->private_data;
> -
> - if (phb && phb->dma_dev_setup)
> - phb->dma_dev_setup(phb, pdev);
> -}
> -
>  void pnv_pci_dma_bus_setup(struct pci_bus *bus)
>  {
>   struct pci_controller *hose = bus->sysdata;
> diff --git a/arch/powerpc/platforms/powernv/pci.h 
> b/arch/powerpc/platforms/powernv/pci.h
> index f914f0b..0cdc9ba 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -108,7 +108,6 @@ struct pnv_phb {
>   int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev,
>unsigned int hwirq, unsigned int virq,
>unsigned int is_64, struct msi_msg *msg);
> - void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
>   int (*init_m64)(struct pnv_phb *phb);
>   int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
>   void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
> @@ -189,7 +188,6 @@ extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, 
> unsigned long msr);
>  extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
>  extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
>  
> -extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev);
>  extern void pnv_pci_dma_bus_setup(struct pci_bus *bus);
>  extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
>  extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
> 

-- 
Alexey


Re: [PATCH 3/5] powerpc/iov: Move VF pdev fixup into pcibios_fixup_iov()

2020-01-13 Thread Alexey Kardashevskiy



On 10/01/2020 18:02, Oliver O'Halloran wrote:
> An ioda_pe for each VF is allocated in pnv_pci_sriov_enable() before the
> pci_dev for the VF is created. We need to set the pe->pdev pointer at
> some point after the pci_dev is created. Currently we do that in:
> 
> pcibios_bus_add_device()
>   pnv_pci_dma_dev_setup() (via phb->ops.dma_dev_setup)
>   /* fixup is done here */
>   pnv_pci_ioda_dma_dev_setup() (via pnv_phb->dma_dev_setup)
> 
> The fixup needs to be done before setting up DMA for the VF's PE, but
> there's no real reason to delay it until this point. Move the fixup into
> pnv_pci_ioda_fixup_iov() so the ordering is:
> 
>   pcibios_add_device()
>   pnv_pci_ioda_fixup_iov() (via ppc_md.pcibios_fixup_sriov)
> 
>   pcibios_bus_add_device()
>   ...
> 
> This isn't strictly required, but it's slightly a slightly more logical


s/slightly a slightly/slightly/ ?



Reviewed-by: Alexey Kardashevskiy 



> place to do the fixup and it simplifies pnv_pci_dma_dev_setup().
> 
> Signed-off-by: Oliver O'Halloran 
> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 29 +
>  arch/powerpc/platforms/powernv/pci.c  | 14 --
>  2 files changed, 25 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 33d4039..ae177ee 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -2900,9 +2900,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct 
> pci_dev *pdev)
>   struct pci_dn *pdn;
>   int mul, total_vfs;
>  
> - if (!pdev->is_physfn || pci_dev_is_added(pdev))
> - return;
> -
>   pdn = pci_get_pdn(pdev);
>   pdn->vfs_expanded = 0;
>   pdn->m64_single_mode = false;
> @@ -2977,6 +2974,30 @@ static void pnv_pci_ioda_fixup_iov_resources(struct 
> pci_dev *pdev)
>   res->end = res->start - 1;
>   }
>  }
> +
> +static void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev)
> +{
> + if (WARN_ON(pci_dev_is_added(pdev)))
> + return;
> +
> + if (pdev->is_virtfn) {
> + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
> +
> + /*
> +  * VF PEs are single-device PEs so their pdev pointer needs to
> +  * be set. The pdev doesn't exist when the PE is allocated (in
> +  * pcibios_sriov_enable()) so we fix it up here.
> +  */
> + pe->pdev = pdev;
> + WARN_ON(!(pe->flags & PNV_IODA_PE_VF));
> + } else if (pdev->is_physfn) {
> + /*
> +  * For PFs adjust their allocated IOV resources to match what
> +  * the PHB can support using its M64 BAR table.
> +  */
> + pnv_pci_ioda_fixup_iov_resources(pdev);
> + }
> +}
>  #endif /* CONFIG_PCI_IOV */
>  
>  static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
> @@ -3872,7 +3893,7 @@ static void __init pnv_pci_init_ioda_phb(struct 
> device_node *np,
>   ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
>  
>  #ifdef CONFIG_PCI_IOV
> - ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
> + ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov;
>   ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
>   ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable;
>   ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable;
> diff --git a/arch/powerpc/platforms/powernv/pci.c 
> b/arch/powerpc/platforms/powernv/pci.c
> index e8e58a2c..8307e1f 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -814,20 +814,6 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
>  {
>   struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>   struct pnv_phb *phb = hose->private_data;
> -#ifdef CONFIG_PCI_IOV
> - struct pnv_ioda_pe *pe;
> -
> - /* Fix the VF pdn PE number */
> - if (pdev->is_virtfn) {
> - list_for_each_entry(pe, &phb->ioda.pe_list, list) {
> - if (pe->rid == ((pdev->bus->number << 8) |
> - (pdev->devfn & 0xff))) {
> - pe->pdev = pdev;
> - break;
> - }
> - }
> - }
> -#endif /* CONFIG_PCI_IOV */
>  
>   if (phb && phb->dma_dev_setup)
>   phb->dma_dev_setup(phb, pdev);
> 

-- 
Alexey


Re: [PATCH 1/5] powerpc/pci: Fold pcibios_setup_device() into pcibios_bus_add_device()

2020-01-13 Thread Alexey Kardashevskiy



On 10/01/2020 18:02, Oliver O'Halloran wrote:
> pcibios_bus_add_device() is the only caller of pcibios_setup_device().
> Fold them together since there's no real reason to keep them separate.
> 
> Signed-off-by: Oliver O'Halloran 


Reviewed-by: Alexey Kardashevskiy 


> ---
>  arch/powerpc/kernel/pci-common.c | 10 ++
>  1 file changed, 2 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/pci-common.c 
> b/arch/powerpc/kernel/pci-common.c
> index f8a59d7..c6c0341 100644
> --- a/arch/powerpc/kernel/pci-common.c
> +++ b/arch/powerpc/kernel/pci-common.c
> @@ -958,7 +958,7 @@ void pcibios_setup_bus_self(struct pci_bus *bus)
>   phb->controller_ops.dma_bus_setup(bus);
>  }
>  
> -static void pcibios_setup_device(struct pci_dev *dev)
> +void pcibios_bus_add_device(struct pci_dev *dev)
>  {
>   struct pci_controller *phb;
>   /* Fixup NUMA node as it may not be setup yet by the generic
> @@ -979,15 +979,9 @@ static void pcibios_setup_device(struct pci_dev *dev)
>   pci_read_irq_line(dev);
>   if (ppc_md.pci_irq_fixup)
>   ppc_md.pci_irq_fixup(dev);
> -}
> -
> -void pcibios_bus_add_device(struct pci_dev *pdev)
> -{
> - /* Perform platform-specific device setup */
> - pcibios_setup_device(pdev);
>  
>   if (ppc_md.pcibios_bus_add_device)
> - ppc_md.pcibios_bus_add_device(pdev);
> + ppc_md.pcibios_bus_add_device(dev);
>  }
>  
>  int pcibios_add_device(struct pci_dev *dev)
> 

-- 
Alexey


Re: [PATCH] evh_bytechan: fix out of bounds accesses

2020-01-13 Thread Stephen Rothwell
Hi Timur,

On Mon, 13 Jan 2020 19:10:11 -0600 Timur Tabi  wrote:
>
> On 1/13/20 2:25 PM, Stephen Rothwell wrote:
> > The problem is not really the declaration, the problem is that
> > ev_byte_channel_send always accesses 16 bytes from the buffer and it is
> > not always passed a buffer that long (in one case it is passed a
> > pointer to a single byte).  So the alternative to the memcpy approach I
> > have taken is to complicate ev_byte_channel_send so that it only
> > accesses count bytes from the buffer.  
> 
> Ah, I see now.  This is all coming back to me.
> 
> I would prefer that ev_byte_channel_send() is updated to access only 
> 'count' bytes.  If that means adding a memcpy to the 
> ev_byte_channel_send() itself, then so be it.  Trying to figure out how 
> to stuff n bytes into 4 32-bit registers is probably not worth the effort.

Like this (I have compile tested this):

From: Stephen Rothwell 
Date: Thu, 9 Jan 2020 18:23:48 +1100
Subject: [PATCH v2] evh_bytechan: fix out of bounds accesses

ev_byte_channel_send() assumes that its third argument is a 16 byte array.
In some places where it is called it may not be (or we can't easily tell
if it is).  Newer compilers have started producing warnings about this,
so copy the bytes to send into a local array if the passed length is
too short.

Since all the calls of ev_byte_channel_send() are in one file, let's move
it there from the header file and let the compiler decide if it wants
to inline it.

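A minimal sketch of that approach (assuming EV_BYTE_CHANNEL_MAX_BYTES is
16 and that the hcall always reads that many bytes; the wrapper name is
illustrative):

	static unsigned int local_ev_byte_channel_send(unsigned int handle,
						       unsigned int *count,
						       const char *p)
	{
		char buffer[EV_BYTE_CHANNEL_MAX_BYTES];

		if (*count < sizeof(buffer)) {
			/* bounce short buffers through a full-size local */
			memcpy(buffer, p, *count);
			memset(&buffer[*count], 0, sizeof(buffer) - *count);
			p = buffer;
		}
		return ev_byte_channel_send(handle, count, p);
	}
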
The warnings are:

In file included from include/linux/byteorder/big_endian.h:5,
 from arch/powerpc/include/uapi/asm/byteorder.h:14,
 from include/asm-generic/bitops/le.h:6,
 from arch/powerpc/include/asm/bitops.h:250,
 from include/linux/bitops.h:29,
 from include/linux/kernel.h:12,
 from include/asm-generic/bug.h:19,
 from arch/powerpc/include/asm/bug.h:109,
 from include/linux/bug.h:5,
 from include/linux/mmdebug.h:5,
 from include/linux/gfp.h:5,
 from include/linux/slab.h:15,
 from drivers/tty/ehv_bytechan.c:24:
drivers/tty/ehv_bytechan.c: In function ‘ehv_bc_udbg_putc’:
arch/powerpc/include/asm/epapr_hcalls.h:298:20: warning: array subscript 1 is 
outside array bounds of ‘const char[1]’ [-Warray-bounds]
  298 |  r6 = be32_to_cpu(p[1]);
include/uapi/linux/byteorder/big_endian.h:40:51: note: in definition of macro 
‘__be32_to_cpu’
   40 | #define __be32_to_cpu(x) ((__force __u32)(__be32)(x))
  |   ^
arch/powerpc/include/asm/epapr_hcalls.h:298:7: note: in expansion of macro 
‘be32_to_cpu’
  298 |  r6 = be32_to_cpu(p[1]);
  |   ^~~
drivers/tty/ehv_bytechan.c:166:13: note: while referencing ‘data’
  166 | static void ehv_bc_udbg_putc(char c)
  | ^~~~
In file included from include/linux/byteorder/big_endian.h:5,
 from arch/powerpc/include/uapi/asm/byteorder.h:14,
 from include/asm-generic/bitops/le.h:6,
 from arch/powerpc/include/asm/bitops.h:250,
 from include/linux/bitops.h:29,
 from include/linux/kernel.h:12,
 from include/asm-generic/bug.h:19,
 from arch/powerpc/include/asm/bug.h:109,
 from include/linux/bug.h:5,
 from include/linux/mmdebug.h:5,
 from include/linux/gfp.h:5,
 from include/linux/slab.h:15,
 from drivers/tty/ehv_bytechan.c:24:
arch/powerpc/include/asm/epapr_hcalls.h:299:20: warning: array subscript 2 is 
outside array bounds of ‘const char[1]’ [-Warray-bounds]
  299 |  r7 = be32_to_cpu(p[2]);
include/uapi/linux/byteorder/big_endian.h:40:51: note: in definition of macro 
‘__be32_to_cpu’
   40 | #define __be32_to_cpu(x) ((__force __u32)(__be32)(x))
  |   ^
arch/powerpc/include/asm/epapr_hcalls.h:299:7: note: in expansion of macro 
‘be32_to_cpu’
  299 |  r7 = be32_to_cpu(p[2]);
  |   ^~~
drivers/tty/ehv_bytechan.c:166:13: note: while referencing ‘data’
  166 | static void ehv_bc_udbg_putc(char c)
  | ^~~~
In file included from include/linux/byteorder/big_endian.h:5,
 from arch/powerpc/include/uapi/asm/byteorder.h:14,
 from include/asm-generic/bitops/le.h:6,
 from arch/powerpc/include/asm/bitops.h:250,
 from include/linux/bitops.h:29,
 from include/linux/kernel.h:12,
 from include/asm-generic/bug.h:19,
 from arch/powerpc/include/asm/bug.h:109,
 from include/linux/bug.h:5,
 from include/linux/mmdebug.h:5,
 from include/linux/gfp.h:5,
 from include/linux/slab.h:15,
 from drivers/tty/ehv_bytechan.c:24:

Re: [PATCH] powerpc/ptdump: don't entirely rebuild kernel when selecting CONFIG_PPC_DEBUG_WX

2020-01-13 Thread Michael Ellerman
Christophe Leroy  writes:
> Selecting CONFIG_PPC_DEBUG_WX only impacts ptdump and pgtable_32/64
> init calls. Declaring related functions in asm/pgtable.h implies
> rebuilding almost everything.
>
> Move ptdump_check_wx() declaration into a new dedicated header file.

Can you put it in arch/powerpc/mm/mmu_decl.h ? That already exists for
things that are private to arch/powerpc/mm, which this function is
AFAICT.

cheers

>  arch/powerpc/include/asm/pgtable.h |  6 --
>  arch/powerpc/include/asm/ptdump.h  | 15 +++
>  arch/powerpc/mm/pgtable_32.c   |  1 +
>  arch/powerpc/mm/pgtable_64.c   |  1 +
>  arch/powerpc/mm/ptdump/ptdump.c|  1 +
>  5 files changed, 18 insertions(+), 6 deletions(-)
>  create mode 100644 arch/powerpc/include/asm/ptdump.h
>
> diff --git a/arch/powerpc/include/asm/pgtable.h 
> b/arch/powerpc/include/asm/pgtable.h
> index 0e4ec8cc37b7..8cc543ed114c 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -94,12 +94,6 @@ void mark_initmem_nx(void);
>  static inline void mark_initmem_nx(void) { }
>  #endif
>  
> -#ifdef CONFIG_PPC_DEBUG_WX
> -void ptdump_check_wx(void);
> -#else
> -static inline void ptdump_check_wx(void) { }
> -#endif
> -
>  /*
>   * When used, PTE_FRAG_NR is defined in subarch pgtable.h
>   * so we are sure it is included when arriving here.
> diff --git a/arch/powerpc/include/asm/ptdump.h 
> b/arch/powerpc/include/asm/ptdump.h
> new file mode 100644
> index ..246b92c21729
> --- /dev/null
> +++ b/arch/powerpc/include/asm/ptdump.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_POWERPC_PTDUMP_H
> +#define _ASM_POWERPC_PTDUMP_H
> +
> +#ifndef __ASSEMBLY__
> +
> +#ifdef CONFIG_PPC_DEBUG_WX
> +void ptdump_check_wx(void);
> +#else
> +static inline void ptdump_check_wx(void) { }
> +#endif
> +
> +#endif /* __ASSEMBLY__ */
> +
> +#endif /* _ASM_POWERPC_PTDUMP_H */
> diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
> index 73b84166d06a..6c866f1b1eeb 100644
> --- a/arch/powerpc/mm/pgtable_32.c
> +++ b/arch/powerpc/mm/pgtable_32.c
> @@ -29,6 +29,7 @@
>  #include 
>  #include 
>  #include 
> +#include <asm/ptdump.h>
>  
>  #include 
>  
> diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
> index e78832dce7bb..3686cd887c2f 100644
> --- a/arch/powerpc/mm/pgtable_64.c
> +++ b/arch/powerpc/mm/pgtable_64.c
> @@ -45,6 +45,7 @@
>  #include 
>  #include 
>  #include 
> +#include <asm/ptdump.h>
>  
>  #include 
>  
> diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
> index 2f9ddc29c535..d7b02bcd0691 100644
> --- a/arch/powerpc/mm/ptdump/ptdump.c
> +++ b/arch/powerpc/mm/ptdump/ptdump.c
> @@ -23,6 +23,7 @@
>  #include 
>  #include 
>  #include 
> +#include <asm/ptdump.h>
>  
>  #include "ptdump.h"
>  
> -- 
> 2.13.3


Re: [linux-next/mainline][bisected 3acac06][ppc] Oops when unloading mpt3sas driver

2020-01-13 Thread Abdul Haleem
On Thu, 2020-01-09 at 06:22 -0800, Christoph Hellwig wrote:
> On Thu, Jan 09, 2020 at 02:27:25PM +0530, Abdul Haleem wrote:
> > + CC Christoph Hellwig
> 
> The only thing this commit changed for the dma coherent case (which
> ppc64 uses) is that we now look up the page to free by the DMA address
> instead of the virtual address passed in.  Which suggests this call
> stack passes in a broken dma address.  I suspect we somehow managed
> to disable the ppc iommu bypass mode after allocating memory, which
> would cause symptoms like this, and thus the commit is just exposing
> a pre-existing problem.

Trace with printk added for page->vaddr, will this help?

mpt3sas_cm0: removing handle(0x000f), sas_addr(0x500304801f080d3d)
mpt3sas_cm0: enclosure logical id(0x500304801f080d3f), slot(12)
mpt3sas_cm0: enclosure level(0x), connector name( )
mpt3sas_cm0: mpt3sas_transport_port_remove: removed:
sas_addr(0x500304801f080d3f)
mpt3sas_cm0: expander_remove: handle(0x0009),
sas_addr(0x500304801f080d3f)
mpt3sas_cm0: sending diag reset !!
mpt3sas_cm0: diag reset: SUCCESS 
page->vaddr = 0xc03f2d20
page->vaddr = 0xc03f2ef0
page->vaddr = 0xc03f3843
page->vaddr = 0xc03f3d7d
page->vaddr = 0xc03f7576
BUG: Unable to handle kernel data access on write at 0xc04a00017c34
Faulting instruction address: 0xc02fb2b0
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
Modules linked in: iptable_mangle xt_MASQUERADE iptable_nat nf_nat
xt_conntrack nf_conntrack nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4
xt_tcpudp tun bridge stp llc btrfs blake2b_generic xor zstd_decompress
zstd_compress lzo_compress iptable_filter raid6_pq mpt3sas(-) vmx_crypto
gf128mul nfsd powernv_rng rng_core raid_class scsi_transport_sas kvm_hv
kvm binfmt_misc ip_tables x_tables xfs libcrc32c qla2xxx ixgbe i40e
nvme_fc nvme_fabrics mdio nvme_core autofs4
CPU: 13 PID: 17267 Comm: rmmod Not tainted 
5.5.0-rc5-next-20200108-autotest-2-g36e1367-dirty #1
NIP:  c02fb2b0 LR: c01aa5b4 CTR: c004a010
REGS: c03fc3f9f5d0 TRAP: 0380   Not tainted  
(5.5.0-rc5-next-20200108-autotest-2-g36e1367-dirty)
MSR:  90009033   CR: 22002424  XER: 2000  
CFAR: c01aa5b0 IRQMASK: 0 
GPR00: c004a0a8 c03fc3f9f860 c1311300 c04a00017c00 
GPR04:  c03f7576 003e00017c00  
GPR08:  c13bd000 c04a00017c34 05bf 
GPR12: c004a010 c03f4a80   
GPR16:   01001b4e0180 10020098 
GPR20: 10020050 10020038 10020078 05f0 
GPR24: c0d4e8a8 c0d4e8c8 05f0  
GPR28: c1299398 c03f7576 0001 c03fdde0d0a8 
NIP [c02fb2b0] __free_pages+0x10/0x50
LR [c01aa5b4] dma_direct_free_pages+0x54/0x90
Call Trace:
[c03fc3f9f860] [c0d4e8a8] str_spec.72296+0x199114/0x2009cc 
(unreliable)
[c03fc3f9f880] [c004a0a8] dma_iommu_free_coherent+0x98/0xd0
[c03fc3f9f8d0] [c01a95e8] dma_free_attrs+0xf8/0x100
[c03fc3f9f920] [c031205c] dma_pool_destroy+0x19c/0x230
[c03fc3f9f9d0] [c0081c181e98] _base_release_memory_pools+0x1d8/0x4b0 
[mpt3sas]
[c03fc3f9fa60] [c0081c18b9f0] mpt3sas_base_detach+0x40/0x150 [mpt3sas]
[c03fc3f9fad0] [c0081c19c92c] scsih_remove+0x24c/0x3e0 [mpt3sas]
[c03fc3f9fb90] [c06126a4] pci_device_remove+0x64/0x110
[c03fc3f9fbd0] [c06c7ea4] device_release_driver_internal+0x154/0x260
[c03fc3f9fc10] [c06c807c] driver_detach+0x8c/0x140
[c03fc3f9fc50] [c06c6188] bus_remove_driver+0x78/0x100
[c03fc3f9fc80] [c06c8d90] driver_unregister+0x40/0x90
[c03fc3f9fcf0] [c0611dc8] pci_unregister_driver+0x38/0x110
[c03fc3f9fd40] [c0081c1af108] _mpt3sas_exit+0x50/0x40c8 [mpt3sas]
[c03fc3f9fda0] [c01d8ed8] sys_delete_module+0x1a8/0x2a0
[c03fc3f9fe20] [c000b9d0] system_call+0x5c/0x68
Instruction dump:
88830051 2fa4 41de0008 4bffe87c 7d234b78 4bfffe94 6000 6042 
3c4c0101 38426060 39430034 7c0004ac <7d005028> 3108 7d00512d 40c2fff4 
---[ end trace c5ab52378eb942ad ]---
Segmentation fault

-- 
Regard's

Abdul Haleem
IBM Linux Technology Centre





Re: linux-next: build warning after merge of the bpf-next tree

2020-01-13 Thread Alexei Starovoitov
On Sun, Jan 12, 2020 at 8:33 PM Zong Li  wrote:
>
> I'm not quite familiar with btf, so I have no idea why there are two
> weak symbols be added in 8580ac9404f6 ("bpf: Process in-kernel BTF")

I can explain what these weak symbols are for, but that won't change
the fact that the compiler or linker is buggy. The weak symbols should
work in all cases and the compiler should pick the correct relocation.
In this case it sounds like the compiler picked a relative relocation
and failed to reach zero from that address.
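
For reference, the two weak symbols in question (assumed from
8580ac9404f6) mark the in-kernel BTF blob linked into vmlinux:

	extern char __weak _binary__btf_vmlinux_bin_start[];
	extern char __weak _binary__btf_vmlinux_bin_end[];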


Re: [PATCH] evh_bytechan: fix out of bounds accesses

2020-01-13 Thread Scott Wood
On Mon, 2020-01-13 at 19:13 -0600, Timur Tabi wrote:
> On 1/13/20 7:10 PM, Timur Tabi wrote:
> > I would prefer that ev_byte_channel_send() is updated to access only 
> > 'count' bytes.  If that means adding a memcpy to the 
> > ev_byte_channel_send() itself, then so be it.  Trying to figure out how 
> > to stuff n bytes into 4 32-
> > bit registers is probably not worth the effort.
> 
> Looks like ev_byte_channel_receive() has the same bug, but in reverse.

It only has one user, which always passes in a 16-byte buffer.

-Scott




Re: [PATCH] powerpc/xive: discard ESB load value when interrupt is invalid

2020-01-13 Thread Michael Ellerman
Cédric Le Goater  writes:
> On 1/13/20 2:01 PM, Cédric Le Goater wrote:
>> From: Frederic Barrat 
>> 
>> A load on an ESB page returning all 1's means that the underlying
>> device has invalidated the access to the PQ state of the interrupt
>> through mmio. It may happen, for example when querying a PHB interrupt
>> while the PHB is in an error state.
>> 
>> In that case, we should consider the interrupt to be invalid when
>> checking its state in the irq_get_irqchip_state() handler.
>
>
> and we also need these tags:
>
> Fixes: da15c03b047d ("powerpc/xive: Implement get_irqchip_state method for 
> XIVE to fix shutdown race")
> Cc: stable@vger.kernel.org # v5.3+

I added those, although it's v5.4+, as the offending commit was first
included in v5.4-rc1.

cheers


Re: [PATCH] evh_bytechan: fix out of bounds accesses

2020-01-13 Thread Timur Tabi

On 1/13/20 7:10 PM, Timur Tabi wrote:


I would prefer that ev_byte_channel_send() is updated to access only 
'count' bytes.  If that means adding a memcpy to the 
ev_byte_channel_send() itself, then so be it.  Trying to figure out how 
to stuff n bytes into 4 32-bit registers is probably not worth the effort.


Looks like ev_byte_channel_receive() has the same bug, but in reverse.


Re: [PATCH] evh_bytechan: fix out of bounds accesses

2020-01-13 Thread Michael Ellerman
Laurentiu Tudor  writes:
> Hello,
>
> On 13.01.2020 15:48, Timur Tabi wrote:
>> On 1/13/20 6:26 AM, Michael Ellerman wrote:
>>> I've never heard of it, and I have no idea how to test it.
>>>
>>> It's not used by qemu, I guess there is/was a Freescale hypervisor that
>>> used it.
>> 
>> Yes, there is/was a Freescale hypervisor that I and a few others worked 
>> on.  I've added a couple people on CC that might be able to tell the 
>> current disposition of it.
>
> More info on this: it's open source and it's published here [1]. We still
> claim to maintain it, but there hasn't been much activity in the past
> years, as one might notice.
>
>>> But maybe it's time to remove it if it's not being maintained/used by
>>> anyone?
>> 
>> I wouldn't be completely opposed to that if there really are no more 
>> users.  There really weren't any users even when I wrote the driver.
>
> There are a few users that I know of, but I can't tell if that's enough 
> to justify keeping the driver.

It is, I don't want to remove code that people are actually using,
unless it's causing unsustainable maintenance burden.

But this should be easy enough to get fixed.

Could you or someone else at NXP volunteer to maintain this driver? That
shouldn't involve much work, other than fixing small things like this
warning.

cheers


Re: [PATCH] evh_bytechan: fix out of bounds accesses

2020-01-13 Thread Timur Tabi

On 1/13/20 2:25 PM, Stephen Rothwell wrote:

The problem is not really the declaration, the problem is that
ev_byte_channel_send always accesses 16 bytes from the buffer and it is
not always passed a buffer that long (in one case it is passed a
pointer to a single byte).  So the alternative to the memcpy approach I
have taken is to complicate ev_byte_channel_send so that it only accesses
count bytes from the buffer.


Ah, I see now.  This is all coming back to me.

I would prefer that ev_byte_channel_send() is updated to access only 
'count' bytes.  If that means adding a memcpy to the 
ev_byte_channel_send() itself, then so be it.  Trying to figure out how 
to stuff n bytes into 4 32-bit registers is probably not worth the effort.


[RFC PATCH 1/1] powerpc/pgtable: Skip serialize_against_pte_lookup() when unmapping

2020-01-13 Thread Leonardo Bras
If a process (qemu) with a lot of CPUs (128) tries to munmap() a large
chunk of memory (496GB) mapped with THP, it takes an average of 275
seconds, which can cause a lot of problems for the workload (in the qemu
case, the guest will be locked up for this time).

Trying to find the source of this bug, I found out most of this time is
spent on serialize_against_pte_lookup(). This function will take a lot
of time in smp_call_function_many() if there is more than a couple of
CPUs running the user process. Since it has to happen for every mapped
THP, it will take a very long time for large amounts of memory.

By the docs, serialize_against_pte_lookup() is needed in order to avoid
pmd_t to pte_t casting inside find_current_mm_pte(), or any lockless
pagetable walk, happening concurrently with THP splitting/collapsing.

In this case, as the page is being munmapped, there is no need to call
serialize_against_pte_lookup(), given it will not be used after or
during munmap.

This patch does so by adding an option to skip serializing in
radix__pmdp_huge_get_and_clear(). This option is used by the proxy
__pmdp_huge_get_and_clear(), which is called with 'unmap == true' from
a (new) arch version of pmdp_huge_get_and_clear_full(), and with
'unmap == false' from pmdp_huge_get_and_clear(), which is used by
generic code.

pmdp_huge_get_and_clear_full() is only called in zap_huge_pmd(), so
it's safe to assume it will always be called on memory that will be
unmapped.

On my workload (qemu: 128vcpus + 500GB), I could see munmap's time
reduction from 275 seconds to 39ms.

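The resulting call flow on the unmap path, per the patch below:

	zap_huge_pmd()
	  -> pmdp_huge_get_and_clear_full()              /* unmap == true */
	    -> __pmdp_huge_get_and_clear(mm, addr, pmdp, true)
	      -> radix__pmdp_huge_get_and_clear(..., serialize == false)
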
Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 25 +---
 arch/powerpc/include/asm/book3s/64/radix.h   |  3 ++-
 arch/powerpc/mm/book3s64/radix_pgtable.c |  6 +++--
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index b01624e5c467..5e3e7c48624a 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1243,14 +1243,21 @@ extern int pmdp_test_and_clear_young(struct 
vm_area_struct *vma,
 unsigned long address, pmd_t *pmdp);
 
 #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
-static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
-   unsigned long addr, pmd_t *pmdp)
+static inline pmd_t __pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp,
+ bool unmap)
 {
if (radix_enabled())
-   return radix__pmdp_huge_get_and_clear(mm, addr, pmdp);
+   return radix__pmdp_huge_get_and_clear(mm, addr, pmdp, !unmap);
return hash__pmdp_huge_get_and_clear(mm, addr, pmdp);
 }
 
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+   unsigned long addr, pmd_t *pmdp)
+{
+   return __pmdp_huge_get_and_clear(mm, addr, pmdp, false);
+}
+
 static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
 {
@@ -1337,6 +1344,18 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *, 
unsigned long, pte_t *);
 void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
 pte_t *, pte_t, pte_t);
 
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
+static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
+unsigned long address,
+pmd_t *pmdp,
+int full)
+{
+   /*
+* Called only on unmapping
+*/
+   return __pmdp_huge_get_and_clear(mm, address, pmdp, true);
+}
+
 /*
  * Returns true for a R -> RW upgrade of pte
  */
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index d97db3ad9aae..148874aa5260 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -253,7 +253,8 @@ extern void radix__pgtable_trans_huge_deposit(struct 
mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable);
 extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, 
pmd_t *pmdp);
 extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp);
+   unsigned long addr, pmd_t *pmdp,
+   bool serialize);
 static inline int radix__has_transparent_hugepage(void)
 {
/* For radix 2M at PMD level means thp */
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 974109bb85db..eac8409cd316 100644
--- a/arch/powe

Re: [PATCH v4 5/9] trace/bpf_trace: open access for CAP_SYS_PERFMON privileged process

2020-01-13 Thread Song Liu



> On Dec 18, 2019, at 1:28 AM, Alexey Budankov 
>  wrote:
> 
> 
> Open access to bpf_trace monitoring for CAP_SYS_PERFMON privileged
> processes. For backward compatibility reasons access to bpf_trace
> monitoring remains open for CAP_SYS_ADMIN privileged processes but
> CAP_SYS_ADMIN usage for secure bpf_trace monitoring is discouraged
> in favour of the CAP_SYS_PERFMON capability.
> 
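For reference, a minimal sketch of the perfmon_capable() helper this
series is assumed to introduce, falling back to CAP_SYS_ADMIN for
backward compatibility:

	static inline bool perfmon_capable(void)
	{
		return capable(CAP_SYS_PERFMON) || capable(CAP_SYS_ADMIN);
	}
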
> Signed-off-by: Alexey Budankov 

Acked-by: Song Liu 

> ---
> kernel/trace/bpf_trace.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 44bd08f2443b..bafe21ac6d92 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -1272,7 +1272,7 @@ int perf_event_query_prog_array(struct perf_event 
> *event, void __user *info)
>   u32 *ids, prog_cnt, ids_len;
>   int ret;
> 
> - if (!capable(CAP_SYS_ADMIN))
> + if (!perfmon_capable())
>   return -EPERM;
>   if (event->attr.type != PERF_TYPE_TRACEPOINT)
>   return -EINVAL;

I guess we need to fix this check for kprobe/uprobe created with 
perf_event_open()...

Thanks,
Song



Re: [PATCH v4 1/9] capabilities: introduce CAP_SYS_PERFMON to kernel and user space

2020-01-13 Thread Song Liu



> On Dec 18, 2019, at 1:24 AM, Alexey Budankov 
>  wrote:
> 
> 
> Introduce CAP_SYS_PERFMON capability devoted to secure system performance
> monitoring and observability operations so that CAP_SYS_PERFMON would
> assist CAP_SYS_ADMIN capability in its governing role for perf_events,
> i915_perf and other subsystems of the kernel.
> 
> CAP_SYS_PERFMON intends to harden system security and integrity during
> system performance monitoring and observability operations by decreasing
> the attack surface that is available to CAP_SYS_ADMIN privileged processes.
> 
> CAP_SYS_PERFMON intends to take over CAP_SYS_ADMIN credentials related
> to system performance monitoring and observability operations and balance
> the amount of CAP_SYS_ADMIN credentials in accordance with the recommendations
> provided in the man page for CAP_SYS_ADMIN [1]: "Note: this capability
> is overloaded; see Notes to kernel developers, below."
> 
> [1] http://man7.org/linux/man-pages/man7/capabilities.7.html
> 
> Signed-off-by: Alexey Budankov 

Acked-by: Song Liu 

Re: [PATCH] evh_bytechan: fix out of bounds accesses

2020-01-13 Thread Stephen Rothwell
Hi Timur,

On Mon, 13 Jan 2020 10:03:18 -0600 Timur Tabi  wrote:
>
> Why not simply correct the parameters of ev_byte_channel_send?
> 
> static inline unsigned int ev_byte_channel_send(unsigned int handle,
> -unsigned int *count, const char buffer[EV_BYTE_CHANNEL_MAX_BYTES])
> +unsigned int *count, const char *buffer)
> 
> Back then, I probably thought I was just being clever with this code,
> but I realize now that it doesn't make sense to do it the way I did.

The problem is not really the declaration, the problem is that
ev_byte_channel_send always accesses 16 bytes from the buffer and it is
not always passed a buffer that long (in one case it is passed a
pointer to a single byte).  So the alternative to the memcpy approach I
have taken is to complicate ev_byte_channel_send so that it only accesses
count bytes from the buffer.

-- 
Cheers,
Stephen Rothwell




Re: [PATCH 08/10] soc: lantiq: convert to devm_platform_ioremap_resource

2020-01-13 Thread Paul Burton
Hello,

Yangtao Li wrote:
> Use devm_platform_ioremap_resource() to simplify code.

Applied to mips-next.

> commit 23c25c732530
> https://git.kernel.org/mips/c/23c25c732530
> 
> Signed-off-by: Yangtao Li 
> Signed-off-by: Paul Burton 

Thanks,
Paul

[ This message was auto-generated; if you believe anything is incorrect
  then please email paulburton@kernel.org to report it. ]


[RFC PATCH v3 12/12] powerpc/vdso: provide __arch_is_hw_counter_valid()

2020-01-13 Thread Christophe Leroy
To avoid a double verification of the hw_counter validity,
provide __arch_is_hw_counter_valid()

Before:
clock-gettime-realtime:vdso: 1078 nsec/call

After:
clock-gettime-realtime:vdso: 1064 nsec/call

The * shows the test of the clock type.
The > shows the additional test on the counter,
which goes away with this patch.

Before:
 5e0:   81 25 00 04 lwz r9,4(r5)
*5e4:   2f 89 00 00 cmpwi   cr7,r9,0
*5e8:   40 9e 01 3c bne cr7,724 <__c_kernel_clock_gettime+0x17c>
 5ec:   94 21 ff e0 stwu    r1,-32(r1)
 5f0:   93 61 00 0c stw r27,12(r1)
 5f4:   93 81 00 10 stw r28,16(r1)
 5f8:   93 a1 00 14 stw r29,20(r1)
 5fc:   93 c1 00 18 stw r30,24(r1)
 600:   93 e1 00 1c stw r31,28(r1)
 604:   7d 8d 42 e6 mftbu   r12
 608:   7f ac 42 e6 mftb    r29
 60c:   7c cd 42 e6 mftbu   r6
 610:   7f 8c 30 40 cmplw   cr7,r12,r6
 614:   40 9e ff f0 bne cr7,604 <__c_kernel_clock_gettime+0x5c>
>618:   2f 8c 00 00 cmpwi   cr7,r12,0
 61c:   83 6b 00 28 lwz r27,40(r11)
 620:   83 8b 00 2c lwz r28,44(r11)
 624:   81 45 00 08 lwz r10,8(r5)
 628:   80 e5 00 0c lwz r7,12(r5)
>62c:   41 9c 00 b4 blt cr7,6e0 <__c_kernel_clock_gettime+0x138>
 630:   81 05 00 18 lwz r8,24(r5)
 634:   83 c5 00 1c lwz r30,28(r5)
 638:   80 cb 00 24 lwz r6,36(r11)
 63c:   83 e5 00 00 lwz r31,0(r5)
 640:   7f 9f 00 40 cmplw   cr7,r31,r0
 644:   40 9e 00 84 bne cr7,6c8 <__c_kernel_clock_gettime+0x120>

After:
 5e0:   81 25 00 04 lwz r9,4(r5)
*5e4:   2f 89 00 00 cmpwi   cr7,r9,0
*5e8:   40 9e 01 88 bne cr7,770 <__c_kernel_clock_gettime+0x1c8>
 5ec:   94 21 ff e0 stwu    r1,-32(r1)
 5f0:   93 61 00 0c stw r27,12(r1)
 5f4:   93 81 00 10 stw r28,16(r1)
 5f8:   93 a1 00 14 stw r29,20(r1)
 5fc:   93 c1 00 18 stw r30,24(r1)
 600:   93 e1 00 1c stw r31,28(r1)
 604:   7f cd 42 e6 mftbu   r30
 608:   7f ac 42 e6 mftb    r29
 60c:   7c cd 42 e6 mftbu   r6
 610:   7f 9e 30 40 cmplw   cr7,r30,r6
 614:   40 9e ff f0 bne cr7,604 <__c_kernel_clock_gettime+0x5c>
 618:   83 6b 00 28 lwz r27,40(r11)
 61c:   83 8b 00 2c lwz r28,44(r11)
 620:   81 45 00 08 lwz r10,8(r5)
 624:   80 e5 00 0c lwz r7,12(r5)
 628:   81 05 00 18 lwz r8,24(r5)
 62c:   83 e5 00 1c lwz r31,28(r5)
 630:   80 cb 00 24 lwz r6,36(r11)
 634:   81 85 00 00 lwz r12,0(r5)
 638:   7f 8c 00 40 cmplw   cr7,r12,r0
 63c:   40 9e 00 84 bne cr7,6c0 <__c_kernel_clock_gettime+0x118>

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/vdso/gettimeofday.h | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h 
b/arch/powerpc/include/asm/vdso/gettimeofday.h
index d1e702e0ea86..c5a24f31382e 100644
--- a/arch/powerpc/include/asm/vdso/gettimeofday.h
+++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
@@ -69,15 +69,18 @@ int clock_getres32_fallback(clockid_t _clkid, struct 
old_timespec32 *_ts)
 }
 #endif
 
-static __always_inline u64 __arch_get_hw_counter(s32 clock_mode)
+static __always_inline bool __arch_is_hw_counter_valid(s32 clock_mode)
 {
/*
 * clock_mode == 0 implies that vDSO are enabled otherwise
 * fallback on syscall.
 */
-   if (clock_mode != 0)
-   return U64_MAX;
+   return clock_mode == 0 ? true : false;
+}
+#define __arch_is_hw_counter_valid __arch_is_hw_counter_valid
 
+static __always_inline u64 __arch_get_hw_counter(s32 clock_mode)
+{
return get_tb();
 }
 
-- 
2.13.3



[RFC PATCH v3 11/12] lib: vdso: split clock verification out of __arch_get_hw_counter()

2020-01-13 Thread Christophe Leroy
__arch_get_hw_counter() returns the current value of the counter if
the counter is valid or a negative number if the counter is not valid.

This is suboptimal because the validity is checked twice: once before
reading the counter and once after reading the counter.

Optionally split the verification out of __arch_get_hw_counter()
by providing an optional __arch_is_hw_counter_valid() function.

Signed-off-by: Christophe Leroy 
---
 lib/vdso/gettimeofday.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index ea1a55507af5..001f6329e846 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -67,11 +67,18 @@ static __always_inline int do_hres(const struct vdso_data 
*vd, clockid_t clk,
 
do {
seq = vdso_read_begin(vd);
+#ifdef __arch_is_hw_counter_valid
+   if (unlikely(!__arch_is_hw_counter_valid(vd->clock_mode)))
+   return -1;
+#endif
+
cycles = __arch_get_hw_counter(vd->clock_mode);
ns = vdso_ts->nsec;
last = vd->cycle_last;
+#ifndef __arch_is_hw_counter_valid
if (unlikely((s64)cycles < 0))
return -1;
+#endif
 
ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
nsec = ns >> vd->shift;
-- 
2.13.3



[RFC PATCH v3 10/12] powerpc/vdso: provide vdso data pointer from the ASM caller.

2020-01-13 Thread Christophe Leroy
__arch_get_vdso_data() clobbers the link register, requiring
the caller to save it.

As the ASM calling function already has to set a stack frame and
saves the link register before calling the C vdso function,
retriving the vdso data pointer there is lighter.

The improvement is significant:

Before:
gettimeofday:vdso: 1027 nsec/call
clock-getres-realtime-coarse:vdso: 699 nsec/call
clock-gettime-realtime-coarse:vdso: 784 nsec/call
clock-gettime-realtime:vdso: 1231 nsec/call

After:
gettimeofday:vdso: 908 nsec/call
clock-getres-realtime-coarse:vdso: 545 nsec/call
clock-gettime-realtime-coarse:vdso: 617 nsec/call
clock-gettime-realtime:vdso: 1078 nsec/call

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/vdso/gettimeofday.h | 12 ++--
 arch/powerpc/kernel/vdso32/gettimeofday.S|  3 +++
 arch/powerpc/kernel/vdso32/vgettimeofday.c   | 19 +++
 arch/powerpc/kernel/vdso64/gettimeofday.S|  3 +++
 arch/powerpc/kernel/vdso64/vgettimeofday.c   | 19 +++
 5 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h 
b/arch/powerpc/include/asm/vdso/gettimeofday.h
index 343c81a7e951..d1e702e0ea86 100644
--- a/arch/powerpc/include/asm/vdso/gettimeofday.h
+++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
@@ -6,13 +6,14 @@
 
 #include 
 #include 
-#include 
 #include 
 
 #define VDSO_HAS_CLOCK_GETRES  1
 
 #define VDSO_HAS_TIME  1
 
+#define VDSO_GETS_VD_PTR_FROM_ARCH 1
+
 static __always_inline int do_syscall_2(const unsigned long _r0, const 
unsigned long _r3,
const unsigned long _r4)
 {
@@ -80,15 +81,6 @@ static __always_inline u64 __arch_get_hw_counter(s32 
clock_mode)
return get_tb();
 }
 
-void *__get_datapage(void);
-
-static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
-{
-   struct vdso_arch_data *vdso_data = __get_datapage();
-
-   return vdso_data->data;
-}
-
 /*
  * powerpc specific delta calculation.
  *
diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S 
b/arch/powerpc/kernel/vdso32/gettimeofday.S
index ba0bd64b3da3..0d43878e462c 100644
--- a/arch/powerpc/kernel/vdso32/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso32/gettimeofday.S
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -18,6 +19,7 @@
	stwu	r1, -16(r1)
	mflr	r0
	stw	r0, 20(r1)
+	get_datapage	r5, VDSO_DATA_OFFSET
	bl	\funct
	lwz	r0, 20(r1)
	cmpwi	r3, 0
@@ -79,6 +81,7 @@ V_FUNCTION_BEGIN(__kernel_time)
	stwu	r1, -16(r1)
	mflr	r0
	stw	r0, 20(r1)
+	get_datapage	r4, VDSO_DATA_OFFSET
	bl	__c_kernel_time
	lwz	r0, 20(r1)
	crclr	cr0*4+so
diff --git a/arch/powerpc/kernel/vdso32/vgettimeofday.c 
b/arch/powerpc/kernel/vdso32/vgettimeofday.c
index 4ed1bf2ae30e..7fdccf896a9c 100644
--- a/arch/powerpc/kernel/vdso32/vgettimeofday.c
+++ b/arch/powerpc/kernel/vdso32/vgettimeofday.c
@@ -5,22 +5,25 @@
 #include 
 #include 
 
-int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts)
+int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts,
+const struct vdso_data *vd)
 {
-   return __cvdso_clock_gettime32(clock, ts);
+   return __cvdso_clock_gettime32(vd, clock, ts);
 }
 
-int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone 
*tz)
+int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone 
*tz,
+   const struct vdso_data *vd)
 {
-   return __cvdso_gettimeofday(tv, tz);
+   return __cvdso_gettimeofday(vd, tv, tz);
 }
 
-int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res)
+int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res,
+   const struct vdso_data *vd)
 {
-   return __cvdso_clock_getres_time32(clock_id, res);
+   return __cvdso_clock_getres_time32(vd, clock_id, res);
 }
 
-time_t __c_kernel_time(time_t *time)
+time_t __c_kernel_time(time_t *time, const struct vdso_data *vd)
 {
-   return __cvdso_time(time);
+   return __cvdso_time(vd, time);
 }
diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S 
b/arch/powerpc/kernel/vdso64/gettimeofday.S
index 22f4f1f73bbc..f61c53eb6600 100644
--- a/arch/powerpc/kernel/vdso64/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -18,6 +19,7 @@
	mflr	r0
	std	r0, 16(r1)
	stdu	r1, -128(r1)
+	get_datapage	r5, VDSO_DATA_OFFSET
	bl	\funct
	addi	r1, r1, 128
	ld	r0, 16(r1)
@@ -79,6 +81,7 @@ V_FUNCTION_BEGIN(__kernel_time)
	mflr	r0
	std	r0, 16(r1)
	stdu	r1, -128(r1)
+	get_datapage	r4, VDSO_DATA_OFFSET

[RFC PATCH v3 08/12] lib: vdso: allow arches to provide vdso data pointer

2020-01-13 Thread Christophe Leroy
On powerpc, __arch_get_vdso_data() clobbers the link register,
requiring the caller to save it.

As the parent function already has to set a stack frame and save
the link register before calling the C vdso function, retrieving the
vdso data pointer there is lighter.

Give arches the opportunity to hand the vdso data pointer
to C vdso functions.

Signed-off-by: Christophe Leroy 
---
 lib/vdso/gettimeofday.c | 56 +
 1 file changed, 56 insertions(+)

diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index da15a8842825..ea1a55507af5 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -104,9 +104,15 @@ static __always_inline int do_coarse(const struct 
vdso_data *vd, clockid_t clk,
 }
 
 static __maybe_unused int
+#ifdef VDSO_GETS_VD_PTR_FROM_ARCH
+__cvdso_clock_gettime_common(const struct vdso_data *vd, clockid_t clock,
+ struct __kernel_timespec *ts)
+{
+#else
 __cvdso_clock_gettime_common(clockid_t clock, struct __kernel_timespec *ts)
 {
const struct vdso_data *vd = __arch_get_vdso_data();
+#endif
u32 msk;
 
/* Check for negative values or invalid clocks */
@@ -131,9 +137,16 @@ __cvdso_clock_gettime_common(clockid_t clock, struct 
__kernel_timespec *ts)
 }
 
 static __maybe_unused int
+#ifdef VDSO_GETS_VD_PTR_FROM_ARCH
+__cvdso_clock_gettime(const struct vdso_data *vd, clockid_t clock,
+ struct __kernel_timespec *ts)
+{
+   int ret = __cvdso_clock_gettime_common(vd, clock, ts);
+#else
 __cvdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts)
 {
int ret = __cvdso_clock_gettime_common(clock, ts);
+#endif
 
if (unlikely(ret))
return clock_gettime_fallback(clock, ts);
@@ -141,12 +154,21 @@ __cvdso_clock_gettime(clockid_t clock, struct 
__kernel_timespec *ts)
 }
 
 static __maybe_unused int
+#ifdef VDSO_GETS_VD_PTR_FROM_ARCH
+__cvdso_clock_gettime32(const struct vdso_data *vd, clockid_t clock,
+   struct old_timespec32 *res)
+#else
 __cvdso_clock_gettime32(clockid_t clock, struct old_timespec32 *res)
+#endif
 {
struct __kernel_timespec ts;
int ret;
 
+#ifdef VDSO_GETS_VD_PTR_FROM_ARCH
+   ret = __cvdso_clock_gettime_common(vd, clock, &ts);
+#else
ret = __cvdso_clock_gettime_common(clock, &ts);
+#endif
 
 #ifdef VDSO_HAS_32BIT_FALLBACK
if (unlikely(ret))
@@ -164,9 +186,15 @@ __cvdso_clock_gettime32(clockid_t clock, struct old_timespec32 *res)
 }
 
 static __maybe_unused int
+#ifdef VDSO_GETS_VD_PTR_FROM_ARCH
+__cvdso_gettimeofday(const struct vdso_data *vd, struct __kernel_old_timeval *tv,
+		     struct timezone *tz)
+{
+#else
 __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
 {
const struct vdso_data *vd = __arch_get_vdso_data();
+#endif
 
if (likely(tv != NULL)) {
struct __kernel_timespec ts;
@@ -187,9 +215,15 @@ __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
 }
 
 #ifdef VDSO_HAS_TIME
+#ifdef VDSO_GETS_VD_PTR_FROM_ARCH
+static __maybe_unused __kernel_old_time_t
+__cvdso_time(const struct vdso_data *vd, __kernel_old_time_t *time)
+{
+#else
 static __maybe_unused __kernel_old_time_t __cvdso_time(__kernel_old_time_t *time)
 {
const struct vdso_data *vd = __arch_get_vdso_data();
+#endif
	__kernel_old_time_t t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec);
 
if (time)
@@ -201,9 +235,15 @@ static __maybe_unused __kernel_old_time_t __cvdso_time(__kernel_old_time_t *time
 
 #ifdef VDSO_HAS_CLOCK_GETRES
 static __maybe_unused
+#ifdef VDSO_GETS_VD_PTR_FROM_ARCH
+int __cvdso_clock_getres_common(const struct vdso_data *vd, clockid_t clock,
+   struct __kernel_timespec *res)
+{
+#else
 int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res)
 {
const struct vdso_data *vd = __arch_get_vdso_data();
+#endif
u32 msk;
u64 ns;
 
@@ -238,9 +278,16 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res)
 }
 
 static __maybe_unused
+#ifdef VDSO_GETS_VD_PTR_FROM_ARCH
+int __cvdso_clock_getres(const struct vdso_data *vd, clockid_t clock,
+struct __kernel_timespec *res)
+{
+   int ret = __cvdso_clock_getres_common(vd, clock, res);
+#else
 int __cvdso_clock_getres(clockid_t clock, struct __kernel_timespec *res)
 {
int ret = __cvdso_clock_getres_common(clock, res);
+#endif
 
if (unlikely(ret))
return clock_getres_fallback(clock, res);
@@ -248,12 +295,21 @@ int __cvdso_clock_getres(clockid_t clock, struct __kernel_timespec *res)
 }
 
 static __maybe_unused int
+#ifdef VDSO_GETS_VD_PTR_FROM_ARCH
+__cvdso_clock_getres_time32(const struct vdso_data *vd, clockid_t clock,
+   struct old_timespec32 *res)
+#else
 __cvdso_clock_getres_time32(clockid_t clock, struct old_timespec3

[RFC PATCH v3 09/12] powerpc/vdso: provide inline alternative to __get_datapage()

2020-01-13 Thread Christophe Leroy
__get_datapage() is only a few instructions long, retrieving the
address of the page where the kernel stores data for the VDSO.

By inlining this function into its users, a bl/blr pair and
a mflr/mtlr pair are avoided, plus a few register moves.

The improvement is noticeable (about 55 nsec/call on an 8xx).

With current __get_datapage() function:
gettimeofday:vdso: 731 nsec/call
clock-gettime-realtime-coarse:vdso: 668 nsec/call
clock-gettime-monotonic-coarse:vdso: 745 nsec/call

Using the __get_datapage macro provided by this patch:
gettimeofday:vdso: 677 nsec/call
clock-gettime-realtime-coarse:vdso: 613 nsec/call
clock-gettime-monotonic-coarse:vdso: 690 nsec/call

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/vdso_datapage.h | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h
index 4d7965bf369e..7342cc0c1ae4 100644
--- a/arch/powerpc/include/asm/vdso_datapage.h
+++ b/arch/powerpc/include/asm/vdso_datapage.h
@@ -105,6 +105,17 @@ struct vdso_arch_data {
 
 extern struct vdso_arch_data *vdso_data;
 
+#else /* __ASSEMBLY__ */
+
+.macro get_datapage ptr, offset=0
+   bcl 20, 31, .+4
+	mflr	\ptr
+#if CONFIG_PPC_PAGE_SHIFT > 14
+   addis   \ptr, \ptr, (_vdso_datapage + \offset - (.-4))@ha
+#endif
+	addi	\ptr, \ptr, (_vdso_datapage + \offset - (.-4))@l
+.endm
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
-- 
2.13.3



[RFC PATCH v3 07/12] powerpc/vdso: simplify __get_datapage()

2020-01-13 Thread Christophe Leroy
The VDSO data page and the text pages are always located immediately
next to each other, so the offset can be hardcoded instead of going
through the __kernel_datapage_offset indirection.

In order to ease things, move the data page in front of the text like
other arches do; that way there is no need to know the size of the
library to locate the data page.
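
A rough sketch of the resulting layout (the vdso_base adjustment is taken
from the patch below; the pointer math is illustrative):

	/* The data page is mapped first, the text right after it, so: */
	unsigned long text = current->mm->context.vdso_base;	/* mapping base + PAGE_SIZE */
	unsigned long data = text - PAGE_SIZE;			/* the VDSO data page */

The text can therefore always locate the data page at a fixed negative
offset, whatever the size of the library.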

Before:
clock-getres-realtime-coarse:vdso: 714 nsec/call
clock-gettime-realtime-coarse:vdso: 792 nsec/call
clock-gettime-realtime:vdso: 1243 nsec/call

After:
clock-getres-realtime-coarse:vdso: 699 nsec/call
clock-gettime-realtime-coarse:vdso: 784 nsec/call
clock-gettime-realtime:vdso: 1231 nsec/call

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/vdso.c  | 53 +
 arch/powerpc/kernel/vdso32/datapage.S   | 10 +++
 arch/powerpc/kernel/vdso32/vdso32.lds.S |  7 ++---
 arch/powerpc/kernel/vdso64/datapage.S   | 10 +++
 arch/powerpc/kernel/vdso64/vdso64.lds.S |  7 ++---
 5 files changed, 19 insertions(+), 68 deletions(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 16a44bffe698..c093d90a222a 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -191,7 +191,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 * install_special_mapping or the perf counter mmap tracking code
 * will fail to recognise it as a vDSO (since arch_vma_name fails).
 */
-   current->mm->context.vdso_base = vdso_base;
+   current->mm->context.vdso_base = vdso_base + PAGE_SIZE;
 
/*
 * our vma flags don't have VM_WRITE so by default, the process isn't
@@ -488,42 +488,6 @@ static __init void vdso_setup_trampolines(struct lib32_elfinfo *v32,
vdso32_rt_sigtramp = find_function32(v32, "__kernel_sigtramp_rt32");
 }
 
-static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
-  struct lib64_elfinfo *v64)
-{
-#ifdef CONFIG_VDSO32
-   Elf32_Sym *sym32;
-#endif
-#ifdef CONFIG_PPC64
-   Elf64_Sym *sym64;
-
-   sym64 = find_symbol64(v64, "__kernel_datapage_offset");
-   if (sym64 == NULL) {
-   printk(KERN_ERR "vDSO64: Can't find symbol "
-  "__kernel_datapage_offset !\n");
-   return -1;
-   }
-   *((int *)(vdso64_kbase + sym64->st_value - VDSO64_LBASE)) =
-   (vdso64_pages << PAGE_SHIFT) -
-   (sym64->st_value - VDSO64_LBASE);
-#endif /* CONFIG_PPC64 */
-
-#ifdef CONFIG_VDSO32
-   sym32 = find_symbol32(v32, "__kernel_datapage_offset");
-   if (sym32 == NULL) {
-   printk(KERN_ERR "vDSO32: Can't find symbol "
-  "__kernel_datapage_offset !\n");
-   return -1;
-   }
-   *((int *)(vdso32_kbase + (sym32->st_value - VDSO32_LBASE))) =
-   (vdso32_pages << PAGE_SHIFT) -
-   (sym32->st_value - VDSO32_LBASE);
-#endif
-
-   return 0;
-}
-
-
 static __init int vdso_fixup_features(struct lib32_elfinfo *v32,
  struct lib64_elfinfo *v64)
 {
@@ -624,9 +588,6 @@ static __init int vdso_setup(void)
if (vdso_do_find_sections(&v32, &v64))
return -1;
 
-   if (vdso_fixup_datapage(&v32, &v64))
-   return -1;
-
if (vdso_fixup_features(&v32, &v64))
return -1;
 
@@ -771,26 +732,26 @@ static int __init vdso_init(void)
vdso32_pagelist = kcalloc(vdso32_pages + 2, sizeof(struct page *),
  GFP_KERNEL);
BUG_ON(vdso32_pagelist == NULL);
+   vdso32_pagelist[0] = virt_to_page(vdso_data);
for (i = 0; i < vdso32_pages; i++) {
struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
get_page(pg);
-   vdso32_pagelist[i] = pg;
+   vdso32_pagelist[i + 1] = pg;
}
-   vdso32_pagelist[i++] = virt_to_page(vdso_data);
-   vdso32_pagelist[i] = NULL;
+   vdso32_pagelist[i + 1] = NULL;
 #endif
 
 #ifdef CONFIG_PPC64
vdso64_pagelist = kcalloc(vdso64_pages + 2, sizeof(struct page *),
  GFP_KERNEL);
BUG_ON(vdso64_pagelist == NULL);
+   vdso64_pagelist[0] = virt_to_page(vdso_data);
for (i = 0; i < vdso64_pages; i++) {
struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
get_page(pg);
-   vdso64_pagelist[i] = pg;
+   vdso64_pagelist[i + 1] = pg;
}
-   vdso64_pagelist[i++] = virt_to_page(vdso_data);
-   vdso64_pagelist[i] = NULL;
+   vdso64_pagelist[i + 1] = NULL;
 #endif /* CONFIG_PPC64 */
 
get_page(virt_to_page(vdso_data));
diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S
index 6c7401bd284e..d839aa1a4f01 100644
--- a/arch/powerpc/kernel/vdso32/datapage.S
+++ b/arch/powerpc/kernel/vdso32/datapage.S
@@ -12,9 +12,

[RFC PATCH v3 06/12] lib: vdso: __iter_div_u64_rem() is suboptimal for 32 bit time

2020-01-13 Thread Christophe Leroy
Using __iter_div_u64_rem() is suboptimal on 32 bit platforms:
nanoseconds fit in 32 bits (2^32 ns is more than four seconds), and
VDSO data is updated every 10ms, so nsec will never overflow 32 bits.

Add an equivalent of __iter_div_u64_rem() but based on
unsigned long, to better fit 32 bit arches.
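
A self-contained userspace sketch of the same loop, to show the split
(the kernel typedef is stood in locally; the function mirrors
__iter_div_ulong_rem() from the patch below):

#include <stdio.h>

typedef unsigned int u32;
#define NSEC_PER_SEC	1000000000UL

static u32 iter_div_ulong_rem(unsigned long dividend, u32 divisor,
			      unsigned long *remainder)
{
	u32 ret = 0;

	while (dividend >= divisor) {	/* at most one or two iterations here */
		dividend -= divisor;
		ret++;
	}
	*remainder = dividend;
	return ret;
}

int main(void)
{
	unsigned long rem;
	u32 sec = iter_div_ulong_rem(1500000000UL, NSEC_PER_SEC, &rem);

	printf("%u sec + %lu nsec\n", sec, rem);	/* 1 sec + 500000000 nsec */
	return 0;
}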

Before:
gettimeofday:vdso: 1078 nsec/call
clock-gettime-monotonic-raw:vdso: 1317 nsec/call
clock-gettime-monotonic:vdso: 1255 nsec/call

After:
gettimeofday:vdso: 1032 nsec/call
clock-gettime-monotonic-raw:vdso: 1312 nsec/call
clock-gettime-monotonic:vdso: 1243 nsec/call

Signed-off-by: Christophe Leroy 
---
 lib/vdso/gettimeofday.c | 26 +++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index decd3f2b37af..da15a8842825 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -38,12 +38,32 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
 }
 #endif
 
+static __always_inline u32
+__iter_div_ulong_rem(unsigned long dividend, u32 divisor, unsigned long *remainder)
+{
+   u32 ret = 0;
+
+   while (dividend >= divisor) {
+   /* The following asm() prevents the compiler from
+  optimising this loop into a modulo operation.  */
+   asm("" : "+rm"(dividend));
+
+   dividend -= divisor;
+   ret++;
+   }
+
+   *remainder = dividend;
+
+   return ret;
+}
+
 static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
   struct __kernel_timespec *ts)
 {
const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
u64 cycles, last, sec, ns;
u32 seq;
+   unsigned long nsec;
 
do {
seq = vdso_read_begin(vd);
@@ -54,7 +74,7 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
return -1;
 
ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
-   ns >>= vd->shift;
+   nsec = ns >> vd->shift;
sec = vdso_ts->sec;
} while (unlikely(vdso_read_retry(vd, seq)));
 
@@ -62,8 +82,8 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
 * Do this outside the loop: a race inside the loop could result
 * in __iter_div_u64_rem() being extremely slow.
 */
-   ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
-   ts->tv_nsec = ns;
+   ts->tv_sec = sec + __iter_div_ulong_rem(nsec, NSEC_PER_SEC, &nsec);
+   ts->tv_nsec = nsec;
 
return 0;
 }
-- 
2.13.3



[RFC PATCH v3 05/12] lib: vdso: Avoid duplication in __cvdso_clock_getres()

2020-01-13 Thread Christophe Leroy
VDSO_HRES and VDSO_RAW clocks are handled the same way.

Don't duplicate code.

Before the patch:
clock-getres-monotonic-raw:vdso: 737 nsec/call
clock-getres-monotonic-coarse:vdso: 753 nsec/call
clock-getres-monotonic:vdso: 691 nsec/call

After the patch:
clock-getres-monotonic-raw:vdso: 715 nsec/call
clock-getres-monotonic-coarse:vdso: 715 nsec/call
clock-getres-monotonic:vdso: 714 nsec/call

Signed-off-by: Christophe Leroy 
Reviewed-by: Andy Lutomirski 
---
 lib/vdso/gettimeofday.c | 11 ++-
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index d75e44ba716f..decd3f2b37af 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -184,7 +184,6 @@ static __maybe_unused
 int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res)
 {
const struct vdso_data *vd = __arch_get_vdso_data();
-   u64 hrtimer_res;
u32 msk;
u64 ns;
 
@@ -192,27 +191,21 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res)
if (unlikely((u32) clock >= MAX_CLOCKS))
return -1;
 
-   hrtimer_res = READ_ONCE(vd[CS_HRES_COARSE].hrtimer_res);
/*
 * Convert the clockid to a bitmask and use it to check which
 * clocks are handled in the VDSO directly.
 */
msk = 1U << clock;
-   if (msk & VDSO_HRES) {
+   if (msk & (VDSO_HRES | VDSO_RAW)) {
/*
 * Preserves the behaviour of posix_get_hrtimer_res().
 */
-   ns = hrtimer_res;
+   ns = READ_ONCE(vd[CS_HRES_COARSE].hrtimer_res);
} else if (msk & VDSO_COARSE) {
/*
 * Preserves the behaviour of posix_get_coarse_res().
 */
ns = LOW_RES_NSEC;
-   } else if (msk & VDSO_RAW) {
-   /*
-* Preserves the behaviour of posix_get_hrtimer_res().
-*/
-   ns = hrtimer_res;
} else {
return -1;
}
-- 
2.13.3



[RFC PATCH v3 04/12] lib: vdso: inline do_hres() and do_coarse()

2020-01-13 Thread Christophe Leroy
do_hres() is called from several places, so GCC doesn't inline
it on its own.

do_hres() takes a struct __kernel_timespec * parameter for
passing the result. In the 32 bit case, this parameter corresponds
to a local variable in the caller. In order to provide a pointer
to this structure, the caller has to put it on its stack, and
do_hres() has to write the result to the stack. This is suboptimal,
especially on RISC processors like powerpc.

By making GCC inline the function, the struct __kernel_timespec
remains a local variable held in registers, avoiding the need to
write to and read from the stack.
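
A generic, standalone illustration of the effect (not this patch's code;
the helper and values are made up for the demo):

#include <stdio.h>

#define __always_inline inline __attribute__((always_inline))

static __always_inline int get_pair(int *a, int *b)
{
	*a = 1;
	*b = 2;
	return 0;
}

int main(void)
{
	int a, b;

	/* Once get_pair() is inlined, the compiler can promote 'a' and
	 * 'b' back to registers instead of forcing them through stack
	 * slots just to form pointers for a real call. */
	get_pair(&a, &b);
	printf("%d\n", a + b);	/* 3 */
	return 0;
}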

The improvement is significant on powerpc:
Before:
gettimeofday:vdso: 1379 nsec/call
clock-gettime-realtime-coarse:vdso: 868 nsec/call
clock-gettime-realtime:vdso: 1511 nsec/call
clock-gettime-monotonic-raw:vdso: 1576 nsec/call

After:
gettimeofday:vdso: 1078 nsec/call
clock-gettime-realtime-coarse:vdso: 807 nsec/call
clock-gettime-realtime:vdso: 1256 nsec/call
clock-gettime-monotonic-raw:vdso: 1316 nsec/call

At the same time, change the return type of do_coarse() to int; this
increases the readability of the if/else if/else section
in __cvdso_clock_gettime_common().

Signed-off-by: Christophe Leroy 
---
 lib/vdso/gettimeofday.c | 29 -
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index 42bd8ab955fa..d75e44ba716f 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -38,8 +38,8 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
 }
 #endif
 
-static int do_hres(const struct vdso_data *vd, clockid_t clk,
-  struct __kernel_timespec *ts)
+static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
+  struct __kernel_timespec *ts)
 {
const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
u64 cycles, last, sec, ns;
@@ -68,8 +68,8 @@ static int do_hres(const struct vdso_data *vd, clockid_t clk,
return 0;
 }
 
-static void do_coarse(const struct vdso_data *vd, clockid_t clk,
- struct __kernel_timespec *ts)
+static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk,
+struct __kernel_timespec *ts)
 {
const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
u32 seq;
@@ -79,6 +79,8 @@ static void do_coarse(const struct vdso_data *vd, clockid_t clk,
ts->tv_sec = vdso_ts->sec;
ts->tv_nsec = vdso_ts->nsec;
} while (unlikely(vdso_read_retry(vd, seq)));
+
+   return 0;
 }
 
 static __maybe_unused int
@@ -96,15 +98,16 @@ __cvdso_clock_gettime_common(clockid_t clock, struct __kernel_timespec *ts)
 * clocks are handled in the VDSO directly.
 */
msk = 1U << clock;
-   if (likely(msk & VDSO_HRES)) {
-   return do_hres(&vd[CS_HRES_COARSE], clock, ts);
-   } else if (msk & VDSO_COARSE) {
-   do_coarse(&vd[CS_HRES_COARSE], clock, ts);
-   return 0;
-   } else if (msk & VDSO_RAW) {
-   return do_hres(&vd[CS_RAW], clock, ts);
-   }
-   return -1;
+   if (likely(msk & VDSO_HRES))
+   vd += CS_HRES_COARSE;
+   else if (msk & VDSO_COARSE)
+   return do_coarse(&vd[CS_HRES_COARSE], clock, ts);
+   else if (msk & VDSO_RAW)
+   vd += CS_RAW;
+   else
+   return -1;
+
+   return do_hres(vd, clock, ts);
 }
 
 static __maybe_unused int
-- 
2.13.3



[RFC PATCH v3 02/12] powerpc/vdso: Switch VDSO to generic C implementation.

2020-01-13 Thread Christophe Leroy
This is a minimalistic conversion to the VDSO generic C implementation.

On powerpc 8xx, performance is degraded by 40-45% for gettime and
by 50% for getres.

Optimisations will follow in the next patches, some in the core
VDSO functions, some in the powerpc arch.

powerpc is a bit special for the VDSO, as for system calls, in
that it requires setting the CR SO bit, which cannot be done in C.
Therefore, entry/exit needs to be performed in ASM.
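
For illustration, a hedged sketch of what an ASM-assisted syscall
fallback looks like when written as C with inline asm (modelled on the
do_syscall_2() helper in the truncated hunk further below; the exact
constraint and clobber lists here are assumptions, not the patch's code):

static __always_inline
long clock_gettime_fallback(clockid_t clkid, struct __kernel_timespec *ts)
{
	register long r0 asm("r0") = __NR_clock_gettime;
	register unsigned long r3 asm("r3") = (unsigned long)clkid;
	register unsigned long r4 asm("r4") = (unsigned long)ts;

	asm volatile("sc\n\t"
		     "bns+	1f\n\t"		/* CR0[SO] clear: success */
		     "neg	%1, %1\n\t"	/* CR0[SO] set: return -errno */
		     "1:"
		     : "+r" (r0), "+r" (r3), "+r" (r4)
		     :
		     : "memory", "r5", "r6", "r7", "r8", "r9", "r10",
		       "r11", "r12", "cr0", "ctr", "xer");
	return r3;
}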

On a powerpc885 at 132MHz:
With current powerpc/32 ASM VDSO:

gettimeofday:vdso: 737 nsec/call
clock-getres-realtime-coarse:vdso: 3081 nsec/call
clock-gettime-realtime-coarse:vdso: 2861 nsec/call
clock-getres-realtime:vdso: 475 nsec/call
clock-gettime-realtime:vdso: 892 nsec/call
clock-getres-boottime:vdso: 2621 nsec/call
clock-gettime-boottime:vdso: 3857 nsec/call
clock-getres-tai:vdso: 2620 nsec/call
clock-gettime-tai:vdso: 3854 nsec/call
clock-getres-monotonic-raw:vdso: 2621 nsec/call
clock-gettime-monotonic-raw:vdso: 3499 nsec/call
clock-getres-monotonic-coarse:vdso: 3083 nsec/call
clock-gettime-monotonic-coarse:vdso: 3082 nsec/call
clock-getres-monotonic:vdso: 475 nsec/call
clock-gettime-monotonic:vdso: 1014 nsec/call

Once switched to C implementation:

gettimeofday:vdso: 1379 nsec/call
getcpu:vdso: not tested
clock-getres-realtime-coarse:vdso: 984 nsec/call
clock-gettime-realtime-coarse:vdso: 868 nsec/call
clock-getres-realtime:vdso: 922 nsec/call
clock-gettime-realtime:vdso: 1511 nsec/call
clock-getres-boottime:vdso: 922 nsec/call
clock-gettime-boottime:vdso: 1510 nsec/call
clock-getres-tai:vdso: 923 nsec/call
clock-gettime-tai:vdso: 1511 nsec/call
clock-getres-monotonic-raw:vdso: 968 nsec/call
clock-gettime-monotonic-raw:vdso: 1576 nsec/call
clock-getres-monotonic-coarse:vdso: 984 nsec/call
clock-gettime-monotonic-coarse:vdso: 868 nsec/call
clock-getres-monotonic:vdso: 922 nsec/call
clock-gettime-monotonic:vdso: 1510 nsec/call

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig |   2 +
 arch/powerpc/include/asm/vdso/gettimeofday.h | 109 
 arch/powerpc/include/asm/vdso/vsyscall.h |  25 +++
 arch/powerpc/include/asm/vdso_datapage.h |  41 ++---
 arch/powerpc/kernel/asm-offsets.c|  46 +
 arch/powerpc/kernel/time.c   |  90 --
 arch/powerpc/kernel/vdso.c   |   5 +-
 arch/powerpc/kernel/vdso32/Makefile  |  27 ++-
 arch/powerpc/kernel/vdso32/gettimeofday.S| 255 +++
 arch/powerpc/kernel/vdso32/vgettimeofday.c   |  26 +++
 arch/powerpc/kernel/vdso64/Makefile  |  23 ++-
 arch/powerpc/kernel/vdso64/datapage.S|   3 +-
 arch/powerpc/kernel/vdso64/gettimeofday.S| 254 +++---
 arch/powerpc/kernel/vdso64/vgettimeofday.c   |  26 +++
 14 files changed, 312 insertions(+), 620 deletions(-)
 create mode 100644 arch/powerpc/include/asm/vdso/gettimeofday.h
 create mode 100644 arch/powerpc/include/asm/vdso/vsyscall.h
 create mode 100644 arch/powerpc/kernel/vdso32/vgettimeofday.c
 create mode 100644 arch/powerpc/kernel/vdso64/vgettimeofday.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1ec34e16ed65..bd04c68baf91 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -169,6 +169,7 @@ config PPC
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select GENERIC_TIME_VSYSCALL
+   select GENERIC_GETTIMEOFDAY
select HAVE_ARCH_AUDITSYSCALL
	select HAVE_ARCH_HUGE_VMAP		if PPC_BOOK3S_64 && PPC_RADIX_MMU
select HAVE_ARCH_JUMP_LABEL
@@ -198,6 +199,7 @@ config PPC
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
	select HAVE_GCC_PLUGINS			if GCC_VERSION >= 50200	# plugin support on gcc <= 5.1 is buggy on PPC
+   select HAVE_GENERIC_VDSO
	select HAVE_HW_BREAKPOINT		if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx)
select HAVE_IDE
select HAVE_IOREMAP_PROT
diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h
new file mode 100644
index ..343c81a7e951
--- /dev/null
+++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_VDSO_GETTIMEOFDAY_H
+#define __ASM_VDSO_GETTIMEOFDAY_H
+
+#ifndef __ASSEMBLY__
+
+#include 
+#include 
+#include 
+#include 
+
+#define VDSO_HAS_CLOCK_GETRES  1
+
+#define VDSO_HAS_TIME  1
+
+static __always_inline int do_syscall_2(const unsigned long _r0, const unsigned long _r3,
+   const unsigned long _r4)
+{
+   register long r0 asm("r0") = _r0;
+   register unsigned long r3 asm("r3") = _r3;
+   register unsigned long r4 asm("r4") = _r4;
+   register int ret asm ("r3");
+
+   asm volatile(
+ 

[RFC PATCH v3 01/12] powerpc/64: Don't provide time functions in compat VDSO32

2020-01-13 Thread Christophe Leroy
In order to allow use of the generic C VDSO, remove the time functions
from the 32-bit VDSO on PPC64.

This (temporary) removal is needed because the powerpc kernel C headers
are not prepared for building 32-bit code on PPC64.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/vdso32/Makefile | 3 ++-
 arch/powerpc/kernel/vdso32/vdso32.lds.S | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
index 06f54d947057..738d52105392 100644
--- a/arch/powerpc/kernel/vdso32/Makefile
+++ b/arch/powerpc/kernel/vdso32/Makefile
@@ -3,7 +3,8 @@
 # List of files in the vdso, has to be asm only for now
 
 obj-vdso32-$(CONFIG_PPC64) = getcpu.o
-obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o \
+obj-vdso32-$(CONFIG_PPC32) = gettimeofday.o
+obj-vdso32 = sigtramp.o datapage.o cacheflush.o note.o \
$(obj-vdso32-y)
 
 # Build rules
diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S
index 00c025ba4a92..9400b182e163 100644
--- a/arch/powerpc/kernel/vdso32/vdso32.lds.S
+++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S
@@ -144,7 +144,7 @@ VERSION
__kernel_datapage_offset;
 
__kernel_get_syscall_map;
-#ifndef CONFIG_PPC_BOOK3S_601
+#if defined(CONFIG_PPC32) && !defined(CONFIG_PPC_BOOK3S_601)
__kernel_gettimeofday;
__kernel_clock_gettime;
__kernel_clock_getres;
-- 
2.13.3



[RFC PATCH v3 00/12] powerpc: switch VDSO to C implementation.

2020-01-13 Thread Christophe Leroy
This is a third attempt to switch the powerpc VDSO to the generic C
implementation.

This version should work on PPC64 (untested). VDSO32 for PPC64 is
impossible to build and has been de-activated, because the powerpc
ASM header files for C are not prepared to build 32-bit code with CONFIG_PPC64.

powerpc is a bit special for the VDSO, as for system calls, in
that it requires setting the CR SO bit, which cannot be done in C.
Therefore, entry/exit and fallback need to be performed in ASM.

On a powerpc8xx, with current powerpc/32 ASM VDSO:

gettimeofday:vdso: 737 nsec/call
clock-getres-realtime:vdso: 475 nsec/call
clock-gettime-realtime:vdso: 892 nsec/call

The first patch adds VDSO generic C support without any changes to common code.
Performance is as follows:

gettimeofday:vdso: 1379 nsec/call
clock-getres-realtime-coarse:vdso: 984 nsec/call
clock-gettime-realtime-coarse:vdso: 868 nsec/call
clock-getres-realtime:vdso: 922 nsec/call
clock-gettime-realtime:vdso: 1511 nsec/call
clock-getres-monotonic-raw:vdso: 968 nsec/call
clock-gettime-monotonic-raw:vdso: 1576 nsec/call

Then a few changes in the common code have allowed performance improvement. At
the end of the series we have:

gettimeofday:vdso: 899 nsec/call
clock-getres-realtime-coarse:vdso: 546 nsec/call
clock-gettime-realtime-coarse:vdso: 615 nsec/call
clock-getres-realtime:vdso: 545 nsec/call
clock-gettime-realtime:vdso: 1064 nsec/call
clock-getres-monotonic-raw:vdso: 546 nsec/call
clock-gettime-monotonic-raw:vdso: 1125 nsec/call

Christophe Leroy (12):
  powerpc/64: Don't provide time functions in compat VDSO32
  powerpc/vdso: Switch VDSO to generic C implementation.
  lib: vdso: mark __cvdso_clock_getres() as static
  lib: vdso: inline do_hres() and do_coarse()
  lib: vdso: Avoid duplication in __cvdso_clock_getres()
  lib: vdso: __iter_div_u64_rem() is suboptimal for 32 bit time
  powerpc/vdso: simplify __get_datapage()
  lib: vdso: allow arches to provide vdso data pointer
  powerpc/vdso: provide inline alternative to __get_datapage()
  powerpc/vdso: provide vdso data pointer from the ASM caller.
  lib: vdso: split clock verification out of __arch_get_hw_counter()
  powerpc/vdso: provide __arch_is_hw_counter_valid()

 arch/powerpc/Kconfig |   2 +
 arch/powerpc/include/asm/vdso/gettimeofday.h | 104 +++
 arch/powerpc/include/asm/vdso/vsyscall.h |  25 +++
 arch/powerpc/include/asm/vdso_datapage.h |  52 +++---
 arch/powerpc/kernel/asm-offsets.c|  46 +
 arch/powerpc/kernel/time.c   |  90 --
 arch/powerpc/kernel/vdso.c   |  58 ++
 arch/powerpc/kernel/vdso32/Makefile  |  30 +++-
 arch/powerpc/kernel/vdso32/datapage.S|  10 +-
 arch/powerpc/kernel/vdso32/gettimeofday.S| 258 ---
 arch/powerpc/kernel/vdso32/vdso32.lds.S  |   9 +-
 arch/powerpc/kernel/vdso32/vgettimeofday.c   |  29 +++
 arch/powerpc/kernel/vdso64/Makefile  |  23 ++-
 arch/powerpc/kernel/vdso64/datapage.S|  13 +-
 arch/powerpc/kernel/vdso64/gettimeofday.S| 257 --
 arch/powerpc/kernel/vdso64/vdso64.lds.S  |   7 +-
 arch/powerpc/kernel/vdso64/vgettimeofday.c   |  29 +++
 lib/vdso/gettimeofday.c  | 130 +++---
 18 files changed, 457 insertions(+), 715 deletions(-)
 create mode 100644 arch/powerpc/include/asm/vdso/gettimeofday.h
 create mode 100644 arch/powerpc/include/asm/vdso/vsyscall.h
 create mode 100644 arch/powerpc/kernel/vdso32/vgettimeofday.c
 create mode 100644 arch/powerpc/kernel/vdso64/vgettimeofday.c

-- 
2.13.3



[RFC PATCH v3 03/12] lib: vdso: mark __cvdso_clock_getres() as static

2020-01-13 Thread Christophe Leroy
When __cvdso_clock_getres() became __cvdso_clock_getres_common()
and a new __cvdso_clock_getres() was added, the static qualifier was
forgotten.

This change allows the compiler to inline __cvdso_clock_getres_common(),
and the performance improvement is significant:

Before:
clock-getres-realtime-coarse:vdso: 984 nsec/call
clock-getres-realtime:vdso: 922 nsec/call
clock-getres-monotonic-raw:vdso: 968 nsec/call

After:
clock-getres-realtime-coarse:vdso: 753 nsec/call
clock-getres-realtime:vdso: 691 nsec/call
clock-getres-monotonic-raw:vdso: 737 nsec/call

Fixes: 502a590a170b ("lib/vdso: Move fallback invocation to the callers")
Signed-off-by: Christophe Leroy 
---
 lib/vdso/gettimeofday.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index 9ecfd3b547ba..42bd8ab955fa 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -221,6 +221,7 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res)
return 0;
 }
 
+static __maybe_unused
 int __cvdso_clock_getres(clockid_t clock, struct __kernel_timespec *res)
 {
int ret = __cvdso_clock_getres_common(clock, res);
-- 
2.13.3



[PATCH] powerpc/xive: discard ESB load value when interrupt is invalid

2020-01-13 Thread Cédric Le Goater
From: Frederic Barrat 

A load on an ESB page returning all 1's means that the underlying
device has invalidated the access to the PQ state of the interrupt
through mmio. It may happen, for example when querying a PHB interrupt
while the PHB is in an error state.

In that case, we should consider the interrupt to be invalid when
checking its state in the irq_get_irqchip_state() handler.
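
For context, the handler is reached through the generic
irq_get_irqchip_state() API; a brief usage sketch (the virq variable is
illustrative):

	bool active;

	/* With this fix, an all 1's ESB load makes 'active' come back
	 * false instead of reflecting a stale P bit. */
	if (!irq_get_irqchip_state(virq, IRQCHIP_STATE_ACTIVE, &active))
		pr_info("irq %u active: %d\n", virq, active);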

Cc: Paul Mackerras 
Signed-off-by: Frederic Barrat 
[ clg: - wrote a commit log
   - introduced XIVE_ESB_INVALID ]
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/include/asm/xive-regs.h |  1 +
 arch/powerpc/sysdev/xive/common.c| 15 ---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/xive-regs.h b/arch/powerpc/include/asm/xive-regs.h
index f2dfcd50a2d3..33aee7490cbb 100644
--- a/arch/powerpc/include/asm/xive-regs.h
+++ b/arch/powerpc/include/asm/xive-regs.h
@@ -39,6 +39,7 @@
 
 #define XIVE_ESB_VAL_P 0x2
 #define XIVE_ESB_VAL_Q 0x1
+#define XIVE_ESB_INVALID   0xFF
 
 /*
  * Thread Management (aka "TM") registers
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index f5fadbd2533a..9651ca061828 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -972,12 +972,21 @@ static int xive_get_irqchip_state(struct irq_data *data,
  enum irqchip_irq_state which, bool *state)
 {
struct xive_irq_data *xd = irq_data_get_irq_handler_data(data);
+   u8 pq;
 
switch (which) {
case IRQCHIP_STATE_ACTIVE:
-   *state = !xd->stale_p &&
-(xd->saved_p ||
- !!(xive_esb_read(xd, XIVE_ESB_GET) & XIVE_ESB_VAL_P));
+   pq = xive_esb_read(xd, XIVE_ESB_GET);
+
+   /*
+* The esb value being all 1's means we couldn't get
+* the PQ state of the interrupt through mmio. It may
+* happen, for example when querying a PHB interrupt
+* while the PHB is in an error state. We consider the
+* interrupt to be inactive in that case.
+*/
+   *state = (pq != XIVE_ESB_INVALID) && !xd->stale_p &&
+   (xd->saved_p || !!(pq & XIVE_ESB_VAL_P));
return 0;
default:
return -EINVAL;
-- 
2.21.1



Re: [PATCH] evh_bytechan: fix out of bounds accesses

2020-01-13 Thread Timur Tabi
On Thu, Jan 9, 2020 at 1:41 AM Stephen Rothwell  wrote:
>
> ev_byte_channel_send() assumes that its third argument is a 16 byte array.
> Some places where it is called it may not be (or we can't easily tell
> if it is).  Newer compilers have started producing warnings about this,
> so make sure we actually pass a 16 byte array.

...

> +static unsigned int local_ev_byte_channel_send(unsigned int handle,
> +unsigned int *count, const char *p)
> +{
> +   char buffer[EV_BYTE_CHANNEL_MAX_BYTES];
> +   unsigned int c = *count;
> +
> +   if (c < sizeof(buffer)) {
> +   memcpy(buffer, p, c);
> +   memset(&buffer[c], 0, sizeof(buffer) - c);
> +   p = buffer;
> +   }
> +   return ev_byte_channel_send(handle, count, p);
> +}

Why not simply correct the parameters of ev_byte_channel_send?

static inline unsigned int ev_byte_channel_send(unsigned int handle,
-unsigned int *count, const char buffer[EV_BYTE_CHANNEL_MAX_BYTES])
+unsigned int *count, const char *buffer)

Back then, I probably thought I was just being clever with this code,
but I realize now that it doesn't make sense to do it the way I did.


Re: [PATCH] evh_bytechan: fix out of bounds accesses

2020-01-13 Thread Timur Tabi

On 1/13/20 8:34 AM, Laurentiu Tudor wrote:

> There are a few users that I know of, but I can't tell if that's enough
> to justify keeping the driver.
>
> [1] https://source.codeaurora.org/external/qoriq/qoriq-yocto-sdk/hypervisor/


IIRC, the driver is the only reasonable way to get a serial console from 
a guest.  So if there are users of the hypervisor, then I think there's 
a good chance at least one is using the byte channel driver.


Re: [PATCH] lib: vdso: mark __cvdso_clock_getres() as static

2020-01-13 Thread Thomas Gleixner
Christophe Leroy  writes:
> Is there a git tree with the latest VDSO status ?
>
> https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git?h=timers%2Fvdso 
> is 6 months old.

Will push a new one later today. I'll let you know.

Thanks,

tglx



Re: [PATCH] evh_bytechan: fix out of bounds accesses

2020-01-13 Thread Laurentiu Tudor
Hello,

On 13.01.2020 15:48, Timur Tabi wrote:
> On 1/13/20 6:26 AM, Michael Ellerman wrote:
>> I've never heard of it, and I have no idea how to test it.
>>
>> It's not used by qemu, I guess there is/was a Freescale hypervisor that
>> used it.
> 
> Yes, there is/was a Freescale hypervisor that I and a few others worked 
> on.  I've added a couple people on CC that might be able to tell the 
> current disposition of it.

More info on this: it's open source and it's published here [1]. We still 
claim to maintain it, but there hasn't been much activity in the past years, 
as one might notice.

>> But maybe it's time to remove it if it's not being maintained/used by
>> anyone?
> 
> I wouldn't be completely opposed to that if there really are no more 
> users.  There really weren't any users even when I wrote the driver.

There are a few users that I know of, but I can't tell if that's enough 
to justify keeping the driver.

[1] https://source.codeaurora.org/external/qoriq/qoriq-yocto-sdk/hypervisor/

---
Best Regards, Laurentiu


Re: [PATCH] powerpc/xive: discard ESB load value when interrupt is invalid

2020-01-13 Thread Cédric Le Goater
On 1/13/20 2:01 PM, Cédric Le Goater wrote:
> From: Frederic Barrat 
> 
> A load on an ESB page returning all 1's means that the underlying
> device has invalidated the access to the PQ state of the interrupt
> through mmio. It may happen, for example when querying a PHB interrupt
> while the PHB is in an error state.
> 
> In that case, we should consider the interrupt to be invalid when
> checking its state in the irq_get_irqchip_state() handler.


and we also need these tags:

Fixes: da15c03b047d ("powerpc/xive: Implement get_irqchip_state method for XIVE to fix shutdown race")
Cc: sta...@vger.kernel.org # v5.3+



> Cc: Paul Mackerras 
> Signed-off-by: Frederic Barrat 
> [ clg: - wrote a commit log
>- introduced XIVE_ESB_INVALID ]
> Signed-off-by: Cédric Le Goater 
> ---
>  arch/powerpc/include/asm/xive-regs.h |  1 +
>  arch/powerpc/sysdev/xive/common.c| 15 ---
>  2 files changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/xive-regs.h b/arch/powerpc/include/asm/xive-regs.h
> index f2dfcd50a2d3..33aee7490cbb 100644
> --- a/arch/powerpc/include/asm/xive-regs.h
> +++ b/arch/powerpc/include/asm/xive-regs.h
> @@ -39,6 +39,7 @@
>  
>  #define XIVE_ESB_VAL_P   0x2
>  #define XIVE_ESB_VAL_Q   0x1
> +#define XIVE_ESB_INVALID 0xFF
>  
>  /*
>   * Thread Management (aka "TM") registers
> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
> index f5fadbd2533a..9651ca061828 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -972,12 +972,21 @@ static int xive_get_irqchip_state(struct irq_data *data,
> enum irqchip_irq_state which, bool *state)
>  {
>   struct xive_irq_data *xd = irq_data_get_irq_handler_data(data);
> + u8 pq;
>  
>   switch (which) {
>   case IRQCHIP_STATE_ACTIVE:
> - *state = !xd->stale_p &&
> -  (xd->saved_p ||
> -   !!(xive_esb_read(xd, XIVE_ESB_GET) & XIVE_ESB_VAL_P));
> + pq = xive_esb_read(xd, XIVE_ESB_GET);
> +
> + /*
> +  * The esb value being all 1's means we couldn't get
> +  * the PQ state of the interrupt through mmio. It may
> +  * happen, for example when querying a PHB interrupt
> +  * while the PHB is in an error state. We consider the
> +  * interrupt to be inactive in that case.
> +  */
> + *state = (pq != XIVE_ESB_INVALID) && !xd->stale_p &&
> + (xd->saved_p || !!(pq & XIVE_ESB_VAL_P));
>   return 0;
>   default:
>   return -EINVAL;
> 



Re: [PATCH] evh_bytechan: fix out of bounds accesses

2020-01-13 Thread Timur Tabi

On 1/13/20 6:26 AM, Michael Ellerman wrote:

I've never heard of it, and I have no idea how to test it.

It's not used by qemu, I guess there is/was a Freescale hypervisor that
used it.


Yes, there is/was a Freescale hypervisor that I and a few others worked 
on.  I've added a couple people on CC that might be able to tell the 
current disposition of it.



But maybe it's time to remove it if it's not being maintained/used by
anyone?


I wouldn't be completely opposed to that if there really are no more 
users.  There really weren't any users even when I wrote the driver.


I haven't had a chance to study the patch, but my first instinct is that 
there's got to be a better way to fix this than introducing a memcpy.


Re: [PATCH v3] powerpc/smp: Use nid as fallback for package_id

2020-01-13 Thread Michael Ellerman
Hi Srikar,

Still some comments sorry ...

Srikar Dronamraju  writes:
> Package_id is used to find out all cores that are part of the same chip. On
> PowerNV machines, package_id defaults to chip_id. However, the ibm,chip_id
> property is not present in the device-tree of PowerVM LPARs. Hence lscpu
> output shows one core per socket and multiple sockets.
>
> To overcome this, use nid as the package_id on PowerVM Lpars.
>
> Before the patch.
> ---
> Architecture:ppc64le
> Byte Order:  Little Endian
> CPU(s):  128
> On-line CPU(s) list: 0-127
> Thread(s) per core:  8
> Core(s) per socket:  1   <--
> Socket(s):   16  <--
> NUMA node(s):2
> Model:   2.2 (pvr 004e 0202)
> Model name:  POWER9 (architected), altivec supported
> Hypervisor vendor:   pHyp
> Virtualization type: para
> L1d cache:   32K
> L1i cache:   32K
> L2 cache:512K
> L3 cache:10240K
> NUMA node0 CPU(s):   0-63
> NUMA node1 CPU(s):   64-127
>  #
>  # cat /sys/devices/system/cpu/cpu0/topology/physical_package_id
>  -1
>  #
>
> After the patch
> ---
> Architecture:ppc64le
> Byte Order:  Little Endian
> CPU(s):  128
> On-line CPU(s) list: 0-127
> Thread(s) per core:  8<--
> Core(s) per socket:  8<--
> Socket(s):   2
> NUMA node(s):2
> Model:   2.2 (pvr 004e 0202)
> Model name:  POWER9 (architected), altivec supported
> Hypervisor vendor:   pHyp
> Virtualization type: para
> L1d cache:   32K
> L1i cache:   32K
> L2 cache:512K
> L3 cache:10240K
> NUMA node0 CPU(s):   0-63
> NUMA node1 CPU(s):   64-127
>  #
>  # cat /sys/devices/system/cpu/cpu0/topology/physical_package_id
>  0
>  #
> Now lscpu output is more in line with the system configuration.
>
> Signed-off-by: Srikar Dronamraju 
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: Michael Ellerman 
> Cc: Vasant Hegde 
> Cc: Vaidyanathan Srinivasan 
> ---
> Changelog from v2: https://patchwork.ozlabs.org/patch/1151649
> - Rebased to v5.5-rc2
> - Renamed the new function to cpu_to_nid
> - Removed checks to fadump property. (Looked too excessive)
> - Moved pseries specific code to pseries/lpar.c
>
> Changelog from v1: https://patchwork.ozlabs.org/patch/1126145
> - In v1 cpu_to_chip_id was overloaded to fallback on nid.  Michael
>   Ellerman wasn't comfortable with nid being shown up as chip_id.
>
>  arch/powerpc/include/asm/topology.h   |  7 ++-
>  arch/powerpc/kernel/smp.c |  6 +++---
>  arch/powerpc/platforms/pseries/lpar.c | 22 ++
>  3 files changed, 31 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
> index 2f7e1ea5089e..7422ef913c75 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -130,11 +130,16 @@ static inline void shared_proc_topology_init(void) {}
>  
>  #ifdef CONFIG_SMP
>  #include 
> +#ifdef CONFIG_PPC_SPLPAR
> +int cpu_to_nid(int);
> +#else
> +#define cpu_to_nid(cpu)  cpu_to_chip_id(cpu)
> +#endif

We already have cpu_to_node(), which returns the numa node (nid) for a
given CPU, so I think introducing a new accessor with an almost
identical name is going to cause confusion. ie. when should code use
cpu_to_node() and when should it use cpu_to_nid()?

>  #ifdef CONFIG_PPC64
>  #include 
>  
> -#define topology_physical_package_id(cpu)(cpu_to_chip_id(cpu))
> +#define topology_physical_package_id(cpu)(cpu_to_nid(cpu))

I think you should be overriding topology_physical_package_id() instead
of introducing cpu_to_nid().

>  #define topology_sibling_cpumask(cpu)(per_cpu(cpu_sibling_map, cpu))
>  #define topology_core_cpumask(cpu)   (per_cpu(cpu_core_map, cpu))
>  #define topology_core_id(cpu)(cpu_to_core_id(cpu))
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index ea6adbf6a221..b0c1438d8d9a 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1188,7 +1188,7 @@ static inline void add_cpu_to_smallcore_masks(int cpu)
>  static void add_cpu_to_masks(int cpu)
>  {
>   int first_thread = cpu_first_thread_sibling(cpu);
> - int chipid = cpu_to_chip_id(cpu);
> + int nid = cpu_to_nid(cpu);

And here you should be using topology_physical_package_id(), rather than
cpu_to_nid() directly.

>   int i;
>  
>   /*
> @@ -1217,11 +1217,11 @@ static void add_cpu_to_masks(int cpu)
>   for_each_cpu(i, cpu_l2_cache_mask(cpu))
>   set_cpus_related(cpu, i, cpu_core_mask);
>  
> - if (chipid == -1)
> + if (nid == -1)
>   return;
>  
>   for_each_cpu(i, cpu_online_mask)
> - if (cpu_to_chip_id(i) == chip

[Bug 205201] Booting halts if Dawicontrol DC-2976 UW SCSI board installed, unless RAM size limited to 3500M

2020-01-13 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=205201

Michael Ellerman (mich...@ellerman.id.au) changed:

   What|Removed |Added

 Status|RESOLVED|CLOSED
 CC||mich...@ellerman.id.au


Re: [PATCH] evh_bytechan: fix out of bounds accesses

2020-01-13 Thread Michael Ellerman
Stephen Rothwell  writes:
> ev_byte_channel_send() assumes that its third argument is a 16 byte array.
> Some places where it is called it may not be (or we can't easily tell
> if it is).  Newer compilers have started producing warnings about this,
> so make sure we actually pass a 16 byte array.
>
> There may be more elegant solutions to this, but the driver is quite
> old and hasn't been updated in many years.
...
> Fixes: dcd83aaff1c8 ("tty/powerpc: introduce the ePAPR embedded hypervisor byte channel driver")
> Cc: Michael Ellerman 
> Cc: PowerPC Mailing List 
> Signed-off-by: Stephen Rothwell 
> ---
>  drivers/tty/ehv_bytechan.c | 20 +---
>  1 file changed, 17 insertions(+), 3 deletions(-)
>
> I have only build tested this change so it would be good to get some
> response from the PowerPC maintainers/developers.

I've never heard of it, and I have no idea how to test it.

It's not used by qemu, I guess there is/was a Freescale hypervisor that
used it.

But maybe it's time to remove it if it's not being maintained/used by
anyone?

cheers


> diff --git a/drivers/tty/ehv_bytechan.c b/drivers/tty/ehv_bytechan.c
> index 769e0a5d1dfc..546f80c49ae6 100644
> --- a/drivers/tty/ehv_bytechan.c
> +++ b/drivers/tty/ehv_bytechan.c
> @@ -136,6 +136,20 @@ static int find_console_handle(void)
>   return 1;
>  }
>  
> +static unsigned int local_ev_byte_channel_send(unsigned int handle,
> +unsigned int *count, const char *p)
> +{
> + char buffer[EV_BYTE_CHANNEL_MAX_BYTES];
> + unsigned int c = *count;
> +
> + if (c < sizeof(buffer)) {
> + memcpy(buffer, p, c);
> + memset(&buffer[c], 0, sizeof(buffer) - c);
> + p = buffer;
> + }
> + return ev_byte_channel_send(handle, count, p);
> +}
> +
>  /*** EARLY CONSOLE DRIVER ***/
>  
>  #ifdef CONFIG_PPC_EARLY_DEBUG_EHV_BC
> @@ -154,7 +168,7 @@ static void byte_channel_spin_send(const char data)
>  
>   do {
>   count = 1;
> - ret = ev_byte_channel_send(CONFIG_PPC_EARLY_DEBUG_EHV_BC_HANDLE,
> + ret = local_ev_byte_channel_send(CONFIG_PPC_EARLY_DEBUG_EHV_BC_HANDLE,
>  &count, &data);
>   } while (ret == EV_EAGAIN);
>  }
> @@ -221,7 +235,7 @@ static int ehv_bc_console_byte_channel_send(unsigned int handle, const char *s,
>   while (count) {
>   len = min_t(unsigned int, count, EV_BYTE_CHANNEL_MAX_BYTES);
>   do {
> - ret = ev_byte_channel_send(handle, &len, s);
> + ret = local_ev_byte_channel_send(handle, &len, s);
>   } while (ret == EV_EAGAIN);
>   count -= len;
>   s += len;
> @@ -401,7 +415,7 @@ static void ehv_bc_tx_dequeue(struct ehv_bc_data *bc)
>   CIRC_CNT_TO_END(bc->head, bc->tail, BUF_SIZE),
>   EV_BYTE_CHANNEL_MAX_BYTES);
>  
> - ret = ev_byte_channel_send(bc->handle, &len, bc->buf + bc->tail);
> + ret = local_ev_byte_channel_send(bc->handle, &len, bc->buf + bc->tail);
>  
>   /* 'len' is valid only if the return code is 0 or EV_EAGAIN */
>   if (!ret || (ret == EV_EAGAIN))
> -- 
> 2.25.0.rc1
>
> -- 
> Cheers,
> Stephen Rothwell


Re: [RESEND PATCH v5 2/5] arm64/crash_core: Export TCR_EL1.T1SZ in vmcoreinfo

2020-01-13 Thread Bhupesh Sharma

Hi James,

On 01/11/2020 12:30 AM, Dave Anderson wrote:


- Original Message -

Hi Bhupesh,

On 25/12/2019 19:01, Bhupesh Sharma wrote:

On 12/12/2019 04:02 PM, James Morse wrote:

On 29/11/2019 19:59, Bhupesh Sharma wrote:

vabits_actual variable on arm64 indicates the actual VA space size,
and allows a single binary to support both 48-bit and 52-bit VA
spaces.

If the ARMv8.2-LVA optional feature is present, and we are running
with a 64KB page size; then it is possible to use 52-bits of address
space for both userspace and kernel addresses. However, any kernel
binary that supports 52-bit must also be able to fall back to 48-bit
at early boot time if the hardware feature is not present.

Since TCR_EL1.T1SZ indicates the size offset of the memory region
addressed by TTBR1_EL1 (and hence can be used for determining the
vabits_actual value) it makes more sense to export the same in
vmcoreinfo rather than vabits_actual variable, as the name of the
variable can change in future kernel versions, but the architectural
constructs like TCR_EL1.T1SZ can be used better to indicate intended
specific fields to user-space.

User-space utilities like makedumpfile and crash-utility, need to
read/write this value from/to vmcoreinfo


(write?)


Yes, also write, so that the vmcoreinfo from a (crashing) arm64 system can
be used for analysis of the root cause of the panic/crash on, say, an x86_64
host, using utilities like crash-utility/gdb.


I read this as as "User-space [...] needs to write to vmcoreinfo".


That's correct. But for writing to vmcore dump in the kdump kernel, we 
need to read the symbols from the vmcoreinfo in the primary kernel.



for determining if a virtual address lies in the linear map range.


I think this is a fragile example. The debugger shouldn't need to know
this.


Well, that's the current user-space utility design, so I am not sure we
can tweak that too much.


The user-space computation for determining whether an address lies in
the linear map range is the same as we have in kernel-space:

#define __is_lm_address(addr)	(!(((u64)addr) & BIT(vabits_actual - 1)))


This was changed with 14c127c957c1 ("arm64: mm: Flip kernel VA space"). If
user-space tools rely on 'knowing' the kernel memory layout, they will
constantly have to be fixed and updated. This is a poor argument for adding
this to something that ends up as ABI.


See above. The user-space has to rely on some ABI/guaranteed
hardware-symbols which can be
used for 'determining' the kernel memory layout.


I disagree. Everything and anything in the kernel will change. The ABI rules 
apply to
stuff exposed via syscalls and kernel filesystems. It does not apply to kernel 
internals,
like the memory layout we used yesterday. 14c127c957c1 is a case in point.

A debugger trying to rely on this sort of thing would have to play catchup 
whenever it
changes.


Exactly.  That's the whole point.

The crash utility and makedumpfile are not in the same league as other 
user-space tools.
They have always had to "play catchup" precisely because they depend upon 
kernel internals,
which constantly change.


I agree with you and DaveA here. Software user-space debuggers are 
dependent on kernel internals (which can change from time-to-time) and 
will have to play catch-up (which has been the case since the very start).


Unfortunately we don't have any clear ABI for software debugging tools - 
may be something to look for in future.


A case in point is gdb/kgdb, which still needs to run with KASLR 
turned off (nokaslr) for debugging, as KASLR confuses gdb, which 
resolves kernel symbol addresses from the symbol table of vmlinux. But we 
can work around the same in makedumpfile/crash by reading the 
'kaslr_offset' value. And I have several users telling me now they cannot 
use gdb on a KASLR-enabled kernel to debug panics, but can use the 
makedumpfile + crash combination to achieve the same.


So, we should be looking to fix these utilities, which have been broken 
since the 52-bit changes for arm64. Accordingly, I will try to send the v6
soon, incorporating the comments posted on the v5.

Thanks,
Bhupesh






Re: [PATCH] lib: vdso: mark __cvdso_clock_getres() as static

2020-01-13 Thread Christophe Leroy




Le 11/01/2020 à 20:59, Thomas Gleixner a écrit :

Christophe Leroy  writes:

When __cvdso_clock_getres() became __cvdso_clock_getres_common()
and a new __cvdso_clock_getres() was added, static qualifier was
forgotten.

Fixes: 502a590a170b ("lib/vdso: Move fallback invocation to the callers")
Signed-off-by: Christophe Leroy 


I've already queued:

  https://lore.kernel.org/r/20191128111719.8282-1-vincenzo.frasc...@arm.com

but thanks for caring!




Is there a git tree with the latest VDSO status ?

https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git?h=timers%2Fvdso 
is 6 months old.


Christophe


[PATCH v3 3/3] powerpc/powernv: Parse device tree, population of SPR support

2020-01-13 Thread Pratik Rajesh Sampat
Parse the device tree for the self-save and self-restore nodes and
populate support for the preferred SPRs based on what was advertised
by the device tree.

Signed-off-by: Pratik Rajesh Sampat 
Reviewed-by: Ram Pai 
---
 arch/powerpc/platforms/powernv/idle.c | 104 ++
 1 file changed, 104 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 97aeb45e897b..384980c743eb 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -1436,6 +1436,107 @@ static void __init pnv_probe_idle_states(void)
supported_cpuidle_states |= pnv_idle_states[i].flags;
 }
 
+/*
+ * Extracts and populates the self save or restore capabilities
+ * passed from the device tree node
+ */
+static int extract_save_restore_state_dt(struct device_node *np, int type)
+{
+   int nr_sprns = 0, i, bitmask_index;
+   int rc = 0;
+   u64 *temp_u64;
+   const char *state_prop;
+   u64 bit_pos;
+
+   state_prop = of_get_property(np, "status", NULL);
+   if (!state_prop) {
+   pr_warn("opal: failed to find the active value for self 
save/restore node");
+   return -EINVAL;
+   }
+   if (strncmp(state_prop, "disabled", 8) == 0) {
+   /*
+* if the feature is not active, strip the preferred_sprs from
+* that capability.
+*/
+   if (type == SELF_RESTORE_TYPE) {
+   for (i = 0; i < nr_preferred_sprs; i++) {
+   preferred_sprs[i].supported_mode &=
+   ~SELF_RESTORE_STRICT;
+   }
+   } else {
+   for (i = 0; i < nr_preferred_sprs; i++) {
+   preferred_sprs[i].supported_mode &=
+   ~SELF_SAVE_STRICT;
+   }
+   }
+   return 0;
+   }
+   nr_sprns = of_property_count_u64_elems(np, "sprn-bitmask");
+   if (nr_sprns <= 0)
+   return rc;
+   temp_u64 = kcalloc(nr_sprns, sizeof(u64), GFP_KERNEL);
+   if (of_property_read_u64_array(np, "sprn-bitmask",
+  temp_u64, nr_sprns)) {
+   pr_warn("cpuidle-powernv: failed to find registers in DT\n");
+   kfree(temp_u64);
+   return -EINVAL;
+   }
+   /*
+* Populate acknowledgment of support for the sprs in the global vector
+* gotten by the registers supplied by the firmware.
+* The registers are in a bitmask, bit index within
+* that specifies the SPR
+*/
+   for (i = 0; i < nr_preferred_sprs; i++) {
+   bitmask_index = preferred_sprs[i].spr / 64;
+   bit_pos = preferred_sprs[i].spr % 64;
+   if ((temp_u64[bitmask_index] & (1UL << bit_pos)) == 0) {
+   if (type == SELF_RESTORE_TYPE)
+   preferred_sprs[i].supported_mode &=
+   ~SELF_RESTORE_STRICT;
+   else
+   preferred_sprs[i].supported_mode &=
+   ~SELF_SAVE_STRICT;
+   continue;
+   }
+   if (type == SELF_RESTORE_TYPE) {
+   preferred_sprs[i].supported_mode |=
+   SELF_RESTORE_STRICT;
+   } else {
+   preferred_sprs[i].supported_mode |=
+   SELF_SAVE_STRICT;
+   }
+   }
+
+   kfree(temp_u64);
+   return rc;
+}
+
+static int pnv_parse_deepstate_dt(void)
+{
+   struct device_node *sr_np, *ss_np;
+   int rc = 0;
+
+   /* Self restore register population */
+   sr_np = of_find_node_by_path("/ibm,opal/power-mgt/self-restore");
+   if (!sr_np) {
+   pr_warn("opal: self restore Node not found");
+   } else {
+   rc = extract_save_restore_state_dt(sr_np, SELF_RESTORE_TYPE);
+   if (rc != 0)
+   return rc;
+   }
+   /* Self save register population */
+   ss_np = of_find_node_by_path("/ibm,opal/power-mgt/self-save");
+   if (!ss_np) {
+   pr_warn("opal: self save Node not found");
+   pr_warn("Legacy firmware. Assuming default self-restore 
support");
+   } else {
+   rc = extract_save_restore_state_dt(ss_np, SELF_SAVE_TYPE);
+   }
+   return rc;
+}
+
 /*
  * This function parses device-tree and populates all the information
  * into pnv_idle_states structure. It also sets up nr_pnv_idle_states
@@ -1584,6 +1685,9 @@ static int __init pnv_init_idle_states(void)
return rc;
pnv_probe_idle_states();
 
+   rc = pnv_parse_deepstate_dt();
+   if (rc)
+   return rc;
if (!cp

[PATCH v3 2/3] powerpc/powernv: Introduce Self save support

2020-01-13 Thread Pratik Rajesh Sampat
This commit introduces and leverages the Self Save API which OPAL now
supports.

Add the new Self Save OPAL API call to the list of OPAL calls.
Implement the self saving of the SPRs based on the support populated,
while respecting their preferences.

This implementation allows mixing of support for the SPRs, which means
that one SPR can be self-restored while another SPR is self-saved, if
they support and prefer it to be so.

Signed-off-by: Pratik Rajesh Sampat 
Reviewed-by: Ram Pai 
---
 arch/powerpc/include/asm/opal-api.h|  3 ++-
 arch/powerpc/include/asm/opal.h|  1 +
 arch/powerpc/platforms/powernv/idle.c  | 22 ++
 arch/powerpc/platforms/powernv/opal-call.c |  1 +
 4 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index c1f25a760eb1..1b6e1a68d431 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -214,7 +214,8 @@
 #define OPAL_SECVAR_GET176
 #define OPAL_SECVAR_GET_NEXT   177
 #define OPAL_SECVAR_ENQUEUE_UPDATE 178
-#define OPAL_LAST  178
+#define OPAL_SLW_SELF_SAVE_REG 181
+#define OPAL_LAST  181
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 9986ac34b8e2..389a85b63805 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -203,6 +203,7 @@ int64_t opal_handle_hmi(void);
 int64_t opal_handle_hmi2(__be64 *out_flags);
 int64_t opal_register_dump_region(uint32_t id, uint64_t start, uint64_t end);
 int64_t opal_unregister_dump_region(uint32_t id);
+int64_t opal_slw_self_save_reg(uint64_t cpu_pir, uint64_t sprn);
 int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val);
 int64_t opal_config_cpu_idle_state(uint64_t state, uint64_t flag);
 int64_t opal_pci_set_phb_cxl_mode(uint64_t phb_id, uint64_t mode, uint64_t 
pe_number);
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 03fe835aadd1..97aeb45e897b 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -279,6 +279,26 @@ static int pnv_self_save_restore_sprs(void)
if (rc != 0)
return rc;
break;
+   } else if (preferred & curr_spr.supported_mode
+  & SELF_SAVE_STRICT) {
+   is_initialized = true;
+   if (curr_spr.spr == SPRN_HMEER &&
+   cpu_thread_in_core(cpu) != 0) {
+   continue;
+   }
+   rc = opal_slw_self_save_reg(pir,
+   curr_spr.spr);
+   if (rc != 0)
+   return rc;
+   switch (curr_spr.spr) {
+   case SPRN_LPCR:
+   is_lpcr_self_save = true;
+   break;
+   case SPRN_PTCR:
+   is_ptcr_self_save = true;
+   break;
+   }
+   break;
}
preferred_sprs[index].preferred_mode =
preferred_sprs[index].preferred_mode >>
@@ -1159,6 +1179,8 @@ void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val)
if (!is_lpcr_self_save)
opal_slw_set_reg(pir, SPRN_LPCR,
 lpcr_val);
+   else
+   opal_slw_self_save_reg(pir, SPRN_LPCR);
}
 }
 
diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c
index 5cd0f52d258f..11e0ceb90de0 100644
--- a/arch/powerpc/platforms/powernv/opal-call.c
+++ b/arch/powerpc/platforms/powernv/opal-call.c
OPAL_CALL(opal_handle_hmi,			OPAL_HANDLE_HMI);
 OPAL_CALL(opal_handle_hmi2,OPAL_HANDLE_HMI2);
 OPAL_CALL(opal_config_cpu_idle_state,  OPAL_CONFIG_CPU_IDLE_STATE);
 OPAL_CALL(opal_slw_set_reg,OPAL_SLW_SET_REG);
+OPAL_CALL(opal_slw_self_save_reg,  OPAL_SLW_SELF_SAV

[PATCH v3 1/3] powerpc/powernv: Interface to define support and preference for a SPR

2020-01-13 Thread Pratik Rajesh Sampat
Define a bitmask interface to determine support for the Self Restore,
Self Save or both.

Also define an interface to determine the preference for an SPR: to be
strictly saved or strictly restored, or to carry an order of preference.

The preference bitmask is shown as below:

----------------------------
|... | 2nd pref | 1st pref |
----------------------------
MSB                      LSB

The preference from higher to lower is from LSB to MSB, with a shift of
4 bits (PREFERENCE_SHIFT).
Example:
Prefer self save first; if not available, then prefer self restore.
The preference mask for this scenario will be seen as below
(with SELF_RESTORE_STRICT = 0x1 and SELF_SAVE_STRICT = 0x2, this is 0x12):
((SELF_RESTORE_STRICT << PREFERENCE_SHIFT) | SELF_SAVE_STRICT)

----------------------------------
|... | Self restore | Self save |
----------------------------------
MSB                            LSB

Finally, declare a list of preferred SPRs which encapsulate the bitmasks
for the preferred and supported modes, with defaults of both set to
support legacy firmware.
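
For illustration only (not part of the patch), a minimal sketch of how
such a preference mask could be walked, using the macros and fields this
patch defines and mirroring the shift loop in the diff below:

	/*
	 * Sketch: try preferences from the LSB upwards, 4 bits
	 * (PREFERENCE_SHIFT) at a time, until one of them is also
	 * present in the SPR's supported_mode.
	 */
	u32 preferred = curr_spr.preferred_mode;

	while (preferred != UNSUPPORTED) {
		u32 mode = preferred & PREFERENCE_MASK;

		if (mode & curr_spr.supported_mode)
			break;			/* use this mode */
		preferred >>= PREFERENCE_SHIFT;
	}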

This commit also implements the use of the above interface and retains
the legacy functionality of self restore.

Signed-off-by: Pratik Rajesh Sampat 
Reviewed-by: Ram Pai 
---
 arch/powerpc/platforms/powernv/idle.c | 316 +-
 1 file changed, 259 insertions(+), 57 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 78599bca66c2..03fe835aadd1 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -32,9 +32,112 @@
 #define P9_STOP_SPR_MSR 2000
 #define P9_STOP_SPR_PSSCR  855
 
+/* Interface for the stop state supported and preference */
+#define SELF_RESTORE_TYPE	0
+#define SELF_SAVE_TYPE		1
+
+#define NR_PREFERENCES		2
+#define PREFERENCE_SHIFT	4
+#define PREFERENCE_MASK		0xf
+
+#define UNSUPPORTED		0x0
+#define SELF_RESTORE_STRICT	0x1
+#define SELF_SAVE_STRICT	0x2
+
+/*
+ * Bitmask defining the kind of preferences available.
+ * Note : The higher to lower preference is from LSB to MSB, with a shift of
+ * 4 bits.
+ * ----------------------------
+ * |... | 2nd pref | 1st pref |
+ * ----------------------------
+ * MSB                      LSB
+ */
+/* Prefer Restore if available, otherwise unsupported */
+#define PREFER_SELF_RESTORE_ONLY   SELF_RESTORE_STRICT
+/* Prefer Save if available, otherwise unsupported */
+#define PREFER_SELF_SAVE_ONLY  SELF_SAVE_STRICT
+/* Prefer Restore when available, otherwise prefer Save */
+#define PREFER_RESTORE_SAVE((SELF_SAVE_STRICT << \
+ PREFERENCE_SHIFT)\
+ | SELF_RESTORE_STRICT)
+/* Prefer Save when available, otherwise prefer Restore*/
+#define PREFER_SAVE_RESTORE((SELF_RESTORE_STRICT <<\
+ PREFERENCE_SHIFT)\
+ | SELF_SAVE_STRICT)
 static u32 supported_cpuidle_states;
 struct pnv_idle_states_t *pnv_idle_states;
 int nr_pnv_idle_states;
+/* Caching the lpcr & ptcr support to use later */
+static bool is_lpcr_self_save;
+static bool is_ptcr_self_save;
+
+struct preferred_sprs {
+   u64 spr;
+   u32 preferred_mode;
+   u32 supported_mode;
+};
+
+/*
+ * Preferred mode: Order of precedence when both self-save and self-restore
+ *                 supported
+ * Supported mode: Default support. Can be overwritten during system
+ *                 initialization
+ */
+struct preferred_sprs preferred_sprs[] = {
+   {
+   .spr = SPRN_HSPRG0,
+   .preferred_mode = PREFER_RESTORE_SAVE,
+   .supported_mode = SELF_RESTORE_STRICT,
+   },
+   {
+   .spr = SPRN_LPCR,
+   .preferred_mode = PREFER_RESTORE_SAVE,
+   .supported_mode = SELF_RESTORE_STRICT,
+   },
+   {
+   .spr = SPRN_PTCR,
+   .preferred_mode = PREFER_SAVE_RESTORE,
+   .supported_mode = SELF_RESTORE_STRICT,
+   },
+   {
+   .spr = SPRN_HMEER,
+   .preferred_mode = PREFER_RESTORE_SAVE,
+   .supported_mode = SELF_RESTORE_STRICT,
+   },
+   {
+   .spr = SPRN_HID0,
+   .preferred_mode = PREFER_RESTORE_SAVE,
+   .supported_mode = SELF_RESTORE_STRICT,
+   },
+   {
+   .spr = P9_STOP_SPR_MSR,
+   .preferred_mode = PREFER_RESTORE_SAVE,
+   .supported_mode = SELF_RESTORE_STRICT,
+   },
+   {
+   .spr = P9_STOP_SPR_PSSCR,
+   .preferred_mode = PREFER_SAVE_RESTORE,
+   .supported_mode = SELF_RESTORE_STRICT,
+   },
+   {
+   .spr = SPRN_HID1,
+   .preferred_mode = PREFER_RESTORE_SAVE,
+   .supported_mode = SELF_RESTORE_STRICT,
+   },
+   {
+   .spr = SPRN_HID4,
+   .preferred_mode = PREFER_RESTORE_SAVE,
+   .supported_mode = SELF_RESTORE_STRICT,
+   },

[PATCH v3 0/3] Introduce Self-Save API for deep stop states

2020-01-13 Thread Pratik Rajesh Sampat
Skiboot patches: https://patchwork.ozlabs.org/patch/1221011/
v2 patches: https://lkml.org/lkml/2020/1/12/253
Changelog
Patch v2 --> v3
1. Addressed minor comments from Ram Pai

Currently the stop-API supports a mechanism called self-restore
which allows us to restore the values of certain SPRs on wakeup from a
deep-stop state to a desired value. To use this, the Kernel makes an
OPAL call passing the PIR of the CPU, the SPR number and the value to
which the SPR should be restored when that CPU wakes up from a deep
stop state.

Recently, a new feature named self-save has been enabled in the
stop-api. It is an alternative mechanism to do the same, except
that self-save will save the current content of the SPR before
entering a deep stop state and also restore the content back on
waking up from a deep stop state.
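
For reference, the two calls differ only in whether the kernel supplies
the value; a short sketch using the prototypes added by this series
(pir and lpcr_val as used in pnv_program_cpu_hotplug_lpcr()):

	/* Self-restore: the kernel passes the value to restore on wakeup */
	opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);

	/* Self-save: firmware snapshots the current SPR content before the
	 * deep stop and restores it on wakeup, so no value is passed */
	opal_slw_self_save_reg(pir, SPRN_LPCR);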

This patch series aims at introducing and leveraging the self-save feature in
the kernel.

Now, as the kernel has a choice to prefer one mode over the other and
there can be registers in both the save and restore SPR lists sent
from the device tree, a new interface has been defined for the seamless
handling of the modes for each SPR.

A list of preferred SPRs is maintained in the kernel which contains two
properties:
1. supported_mode: Helps in identifying if it strictly supports self
   save or restore or both.
   Initialized using the information from the device tree.
2. preferred_mode: Calls out what mode is preferred for each SPR. It
   could be strictly self save or restore, or it can also
   determine the preference of one mode over the other if both
   are present by encapsulating the other in the bitmask from
   LSB to MSB (see the worked example below).
   Initialized statically.
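
As a worked example with the values defined in patch 1
(SELF_RESTORE_STRICT = 0x1, SELF_SAVE_STRICT = 0x2, PREFERENCE_SHIFT = 4),
preferring self save first with self restore as the fallback gives:

	PREFER_SAVE_RESTORE = (SELF_RESTORE_STRICT << PREFERENCE_SHIFT)
	                      | SELF_SAVE_STRICT
	                    = (0x1 << 4) | 0x2
	                    = 0x12

so the low nibble (self save) is consulted first and the next nibble
(self restore) second.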

Below is a table to show the Scenario::Consequence when the self save and
self restore modes are available or disabled in different combinations as
perceived from the device tree, thus giving complete backwards compatibility
regardless of an older firmware running a newer kernel or vice-versa.
Support for self save or self-restore is embedded in the device tree,
along with the set of registers it supports.

SR = Self restore; SS = Self save

.-----------------------------------.------------------------------------------.
| Scenario                          | Consequence                              |
:-----------------------------------+------------------------------------------:
| Legacy Firmware. No SS or SR node | Self restore is called for all           |
|                                   | supported SPRs                           |
:-----------------------------------+------------------------------------------:
| SR: !active SS: !active           | Deep stop states disabled                |
:-----------------------------------+------------------------------------------:
| SR: active SS: !active            | Self restore is called for all           |
|                                   | supported SPRs                           |
:-----------------------------------+------------------------------------------:
| SR: active SS: active             | Goes through the preferences for each    |
|                                   | SPR and executes one of the modes        |
|                                   | accordingly. Currently, self restore is  |
|                                   | called for all the SPRs except PSSCR,    |
|                                   | which is self saved                      |
:-----------------------------------+------------------------------------------:
| SR: active(only HID0) SS: active  | Self save is called for all supported    |
|                                   | registers except HID0 (as HID0 cannot    |
|                                   | be self saved currently)                 |
:-----------------------------------+------------------------------------------:
| SR: !active SS: active            | Currently will disable deep states as    |
|                                   | HID0 needs to be self restored and       |
|                                   | cannot be self saved                     |
'-----------------------------------'------------------------------------------'

Pratik Rajesh Sampat (3):
  powerpc/powernv: Interface to define support and preference for a SPR
  powerpc/powernv: Introduce Self save support
  powerpc/powernv: Parse device tree, population of SPR support

 arch/powerpc/include/asm/opal-api.h|   3 +-
 arch/powerpc/include/asm/opal.h|   1 +
 arch/powerpc/platforms/powernv/idle.c  | 442 ++---
 arch/powerpc/platforms/powernv/opal-call.c |   1 +
 4 files changed, 389 insertions(+), 58 deletions(-)

-- 
2.24.1



Re: [PATCH 15/18] powerpc/uprobes: Add support for prefixed instructions

2020-01-13 Thread Balamuruhan S
On Tue, Nov 26, 2019 at 04:21:38PM +1100, Jordan Niethe wrote:
> Uprobes can execute instructions out of line. Increase the size of the
> buffer used  for this so that this works for prefixed instructions. Take
> into account the length of prefixed instructions when fixing up the nip.
> 
> Signed-off-by: Jordan Niethe 
> ---
>  arch/powerpc/include/asm/uprobes.h | 18 ++
>  arch/powerpc/kernel/uprobes.c  |  4 ++--
>  2 files changed, 16 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/uprobes.h b/arch/powerpc/include/asm/uprobes.h
> index 2bbdf27d09b5..5b5e8a3d2f55 100644
> --- a/arch/powerpc/include/asm/uprobes.h
> +++ b/arch/powerpc/include/asm/uprobes.h
> @@ -14,18 +14,28 @@
>  
>  typedef ppc_opcode_t uprobe_opcode_t;
>  
> +/*
> + * We have to ensure we have enought space for prefixed instructions, which

minor typo of `enought` and we can have something like below,

s/We have to ensure we have enought/Ensure we have enough

-- Bala

> + * are double the size of a word instruction, i.e. 8 bytes. However,
> + * sometimes it is simpler to treat a prefixed instruction like 2 word
> + * instructions.
> + */
>  #define MAX_UINSN_BYTES  4
> -#define UPROBE_XOL_SLOT_BYTES	(MAX_UINSN_BYTES)
> +#define UPROBE_XOL_SLOT_BYTES	(2 * MAX_UINSN_BYTES)
>  
>  /* The following alias is needed for reference from arch-agnostic code */
>  #define UPROBE_SWBP_INSN BREAKPOINT_INSTRUCTION
>  #define UPROBE_SWBP_INSN_SIZE	4 /* swbp insn size in bytes */
>  
>  struct arch_uprobe {
> +  /*
> +   * Ensure there is enough space for prefixed instructions. Prefixed
> +   * instructions must not cross 64-byte boundaries.
> +   */
>   union {
> - u32 insn;
> - u32 ixol;
> - };
> + uprobe_opcode_t insn[2];
> + uprobe_opcode_t ixol[2];
> + } __aligned(64);
>  };
>  
>  struct arch_uprobe_task {
> diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c
> index ab1077dc6148..cfcea6946f8b 100644
> --- a/arch/powerpc/kernel/uprobes.c
> +++ b/arch/powerpc/kernel/uprobes.c
> @@ -111,7 +111,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
>* support doesn't exist and have to fix-up the next instruction
>* to be executed.
>*/
> - regs->nip = utask->vaddr + MAX_UINSN_BYTES;
> + regs->nip = utask->vaddr + ((IS_PREFIX(auprobe->insn[0])) ? 8 : 4);
>  
>   user_disable_single_step(current);
>   return 0;
> @@ -173,7 +173,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
>* emulate_step() returns 1 if the insn was successfully emulated.
>* For all other cases, we need to single-step in hardware.
>*/
> - ret = emulate_step(regs, auprobe->insn, 0);
> + ret = emulate_step(regs, auprobe->insn[0], auprobe->insn[1]);
>   if (ret > 0)
>   return true;
>  
> -- 
> 2.20.1
> 



Re: [RESEND PATCH v2 2/3] powerpc/powernv: Introduce Self save support

2020-01-13 Thread Pratik Sampat

I made a mistake while arranging the patches in the series. I'll re-arrange it
correctly now. Sorry about that.

On 13/01/20 1:21 pm, Ram Pai wrote:

On Mon, Jan 13, 2020 at 09:15:08AM +0530, Pratik Rajesh Sampat wrote:

This commit introduces and leverages the Self save API which OPAL now
supports.

Add the new Self Save OPAL API call in the list of OPAL calls.
Implement the self saving of the SPRs based on the support populated
while respecting its preferences.

This implementation allows mixing of support for the SPRs, which
means that one SPR can be self restored while another is self saved if
they support and prefer it to be so.

Signed-off-by: Pratik Rajesh Sampat 
---
  arch/powerpc/include/asm/opal-api.h| 3 ++-
  arch/powerpc/include/asm/opal.h| 1 +
  arch/powerpc/platforms/powernv/idle.c  | 2 ++
  arch/powerpc/platforms/powernv/opal-call.c | 1 +
  4 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index c1f25a760eb1..1b6e1a68d431 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -214,7 +214,8 @@
  #define OPAL_SECVAR_GET   176
  #define OPAL_SECVAR_GET_NEXT  177
  #define OPAL_SECVAR_ENQUEUE_UPDATE	178
-#define OPAL_LAST  178
+#define OPAL_SLW_SELF_SAVE_REG 181
+#define OPAL_LAST  181

  #define QUIESCE_HOLD  1 /* Spin all calls at entry */
  #define QUIESCE_REJECT	2 /* Fail all calls with OPAL_BUSY */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 9986ac34b8e2..389a85b63805 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -203,6 +203,7 @@ int64_t opal_handle_hmi(void);
  int64_t opal_handle_hmi2(__be64 *out_flags);
  int64_t opal_register_dump_region(uint32_t id, uint64_t start, uint64_t end);
  int64_t opal_unregister_dump_region(uint32_t id);
+int64_t opal_slw_self_save_reg(uint64_t cpu_pir, uint64_t sprn);
  int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val);
  int64_t opal_config_cpu_idle_state(uint64_t state, uint64_t flag);
  int64_t opal_pci_set_phb_cxl_mode(uint64_t phb_id, uint64_t mode, uint64_t pe_number);
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 2f328403b0dc..d67d4d0b169b 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -1172,6 +1172,8 @@ void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val)
if (!is_lpcr_self_save)
opal_slw_set_reg(pir, SPRN_LPCR,
 lpcr_val);
+   else
+   opal_slw_self_save_reg(pir, SPRN_LPCR);

opal_slw_self_save_reg() was used in the prior patch too. How did it
compile, if the definition is in this patch?


Reviewed-by: Ram Pai 

RP




Re: [RESEND PATCH v2 1/3] powerpc/powernv: Interface to define support and preference for a SPR

2020-01-13 Thread Pratik Sampat

Thanks for the review.
The support just signifies what the default is. Self restore is known to
be supported by legacy systems.

I'll add a comment saying that this can change when the system is
initialized.


On 13/01/20 1:14 pm, Ram Pai wrote:

On Mon, Jan 13, 2020 at 09:15:07AM +0530, Pratik Rajesh Sampat wrote:

Define a bitmask interface to determine support for the Self Restore,
Self Save or both.

Also define an interface to determine the preference of that SPR to
be strictly saved or restored or encapsulated with an order of preference.

The preference bitmask is shown as below:

----------------------------
|... | 2nd pref | 1st pref |
----------------------------
MSB                      LSB

The preference from higher to lower is from LSB to MSB with a shift of 4
bits.
Example:
Prefer self save first, if not available then prefer self restore.
The preference mask for this scenario will be seen as below.
((SELF_RESTORE_STRICT << PREFERENCE_SHIFT) | SELF_SAVE_STRICT)
---------------------------------
|... | Self restore | Self save |
---------------------------------
MSB                           LSB

Finally, declare a list of preferred SPRs which encapsulate the bitmasks
for the preferred and supported modes, with defaults of both set to
support legacy firmware.

This commit also implements the use of the above interface and retains
the legacy functionality of self restore.

Signed-off-by: Pratik Rajesh Sampat 
---
  arch/powerpc/platforms/powernv/idle.c | 327 +-
  1 file changed, 271 insertions(+), 56 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 78599bca66c2..2f328403b0dc 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -32,9 +32,106 @@
  #define P9_STOP_SPR_MSR 2000
  #define P9_STOP_SPR_PSSCR  855

+/* Interface for the stop state supported and preference */
+#define SELF_RESTORE_TYPE	0
+#define SELF_SAVE_TYPE		1
+
+#define NR_PREFERENCES		2
+#define PREFERENCE_SHIFT	4
+#define PREFERENCE_MASK		0xf
+
+#define UNSUPPORTED		0x0
+#define SELF_RESTORE_STRICT	0x1
+#define SELF_SAVE_STRICT	0x2
+
+/*
+ * Bitmask defining the kind of preferences available.
+ * Note : The higher to lower preference is from LSB to MSB, with a shift of
+ * 4 bits.
+ * ----------------------------
+ * |... | 2nd pref | 1st pref |
+ * ----------------------------
+ * MSB                      LSB
+ */
+/* Prefer Restore if available, otherwise unsupported */
+#define PREFER_SELF_RESTORE_ONLY   SELF_RESTORE_STRICT
+/* Prefer Save if available, otherwise unsupported */
+#define PREFER_SELF_SAVE_ONLY  SELF_SAVE_STRICT
+/* Prefer Restore when available, otherwise prefer Save */
+#define PREFER_RESTORE_SAVE((SELF_SAVE_STRICT << \
+ PREFERENCE_SHIFT)\
+ | SELF_RESTORE_STRICT)
+/* Prefer Save when available, otherwise prefer Restore*/
+#define PREFER_SAVE_RESTORE((SELF_RESTORE_STRICT <<\
+ PREFERENCE_SHIFT)\
+ | SELF_SAVE_STRICT)
  static u32 supported_cpuidle_states;
  struct pnv_idle_states_t *pnv_idle_states;
  int nr_pnv_idle_states;
+/* Caching the lpcr & ptcr support to use later */
+static bool is_lpcr_self_save;
+static bool is_ptcr_self_save;
+
+struct preferred_sprs {
+   u64 spr;
+   u32 preferred_mode;
+   u32 supported_mode;
+};
+
+struct preferred_sprs preferred_sprs[] = {
+   {
+   .spr = SPRN_HSPRG0,
+   .preferred_mode = PREFER_RESTORE_SAVE,
+   .supported_mode = SELF_RESTORE_STRICT,
+   },
+   {
+   .spr = SPRN_LPCR,
+   .preferred_mode = PREFER_RESTORE_SAVE,
+   .supported_mode = SELF_RESTORE_STRICT,
+   },
+   {
+   .spr = SPRN_PTCR,
+   .preferred_mode = PREFER_SAVE_RESTORE,
+   .supported_mode = SELF_RESTORE_STRICT,
+   },

This confuses me. It says SAVE takes precedence over RESTORE,
and then it says it is strictly 'RESTORE' only.

Maybe you should not initialize the 'supported_mode'? Or put a comment
somewhere here, saying this value will be overwritten during system
initialization?


Otherwise the code looks correct.

Reviewed-by: Ram Pai 
RP