[PATCH V3] powerpc: Add KVM guest defconfig

2018-11-14 Thread sathnaga
From: Satheesh Rajendran 

This patch adds a new defconfig option for the powerpc KVM guest
and a guest.config with additional config symbols enabled,
which builds a kernel that can boot without an initramfs and can be
used as a placeholder for guest-specific config symbols in future.

Signed-off-by: Michael Ellerman 
Signed-off-by: Satheesh Rajendran 
---
Changes Since:
V1:
* replaced kvm with KVM in commit msg.
V2:
* moved XFS config symbol enable into 
  ppc64_defconfig, based on suggestion from Michael.  

 arch/powerpc/Makefile |  8 
 arch/powerpc/configs/guest.config | 13 +
 2 files changed, 21 insertions(+)
 create mode 100644 arch/powerpc/configs/guest.config

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 8a2ce14d68d0..0bff8bd82ed5 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -317,6 +317,14 @@ PHONY += ppc64le_defconfig
 ppc64le_defconfig:
$(call merge_into_defconfig,ppc64_defconfig,le)
 
+PHONY += ppc64le_guest_defconfig
+ppc64le_guest_defconfig:
+   $(call merge_into_defconfig,ppc64_defconfig,le guest)
+
+PHONY += ppc64_guest_defconfig
+ppc64_guest_defconfig:
+   $(call merge_into_defconfig,ppc64_defconfig,be guest)
+
 PHONY += powernv_be_defconfig
 powernv_be_defconfig:
$(call merge_into_defconfig,powernv_defconfig,be)
diff --git a/arch/powerpc/configs/guest.config 
b/arch/powerpc/configs/guest.config
new file mode 100644
index ..8b8cd18ecd7c
--- /dev/null
+++ b/arch/powerpc/configs/guest.config
@@ -0,0 +1,13 @@
+CONFIG_VIRTIO_BLK=y
+CONFIG_VIRTIO_BLK_SCSI=y
+CONFIG_SCSI_VIRTIO=y
+CONFIG_VIRTIO_NET=y
+CONFIG_NET_FAILOVER=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_VIRTIO=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_KVM_GUEST=y
+CONFIG_EPAPR_PARAVIRT=y
+CONFIG_VIRTIO_BALLOON=y
+CONFIG_VHOST_NET=y
+CONFIG_VHOST=y
-- 
2.17.2



[PATCH] powerpc: Add missing config symbols for ppc64_defconfig

2018-11-14 Thread sathnaga
From: Satheesh Rajendran 

This patch adds missing config symbols for ppc64_defconfig
to enable cgroups, memory hotplug, NUMA balancing and XFS
in the core kernel image.

Signed-off-by: Satheesh Rajendran 
---
 arch/powerpc/configs/ppc64_defconfig | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/configs/ppc64_defconfig 
b/arch/powerpc/configs/ppc64_defconfig
index f2515674a1e2..dc3ffefbc070 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -284,7 +284,7 @@ CONFIG_REISERFS_FS_SECURITY=y
 CONFIG_JFS_FS=m
 CONFIG_JFS_POSIX_ACL=y
 CONFIG_JFS_SECURITY=y
-CONFIG_XFS_FS=m
+CONFIG_XFS_FS=y
 CONFIG_XFS_POSIX_ACL=y
 CONFIG_BTRFS_FS=m
 CONFIG_BTRFS_FS_POSIX_ACL=y
@@ -369,3 +369,13 @@ CONFIG_KVM_BOOK3S_64=m
 CONFIG_KVM_BOOK3S_64_HV=m
 CONFIG_VHOST_NET=m
 CONFIG_PRINTK_TIME=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_MEMCG=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_PERF=y
+CONFIG_MEMORY_HOTPLUG=y
+CONFIG_MEMORY_HOTREMOVE=y
+CONFIG_NUMA_BALANCING=y
-- 
2.17.2



[PATCH] powerpc/mm: dump block address translation on book3s/32

2018-11-14 Thread Christophe Leroy
This patch adds a debugfs file to dump block address translation:

~# cat /sys/kernel/debug/block_address_translation
Instruction Block Address Translations:
0: -
1: -
2: 0xc000-0xcfff 0x Kernel EXEC coherent
3: 0xd000-0xdfff 0x1000 Kernel EXEC coherent
4: -
5: -
6: -
7: -

Data Block Address Translations:
0: -
1: -
2: 0xc000-0xcfff 0x Kernel RW coherent
3: 0xd000-0xdfff 0x1000 Kernel RW coherent
4: -
5: -
6: -
7: -

Signed-off-by: Christophe Leroy 
---
 Tested on mpc8321 aka 603
 Please review/test the 601 part

 arch/powerpc/mm/Makefile|   2 +-
 arch/powerpc/mm/dump_bats.c | 176 
 2 files changed, 177 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/mm/dump_bats.c

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index ca96e7be4d0e..2adad10b5856 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -47,7 +47,7 @@ ifdef CONFIG_PPC_PTDUMP
 obj-$(CONFIG_4xx)  += dump_linuxpagetables-generic.o
 obj-$(CONFIG_PPC_8xx)  += dump_linuxpagetables-8xx.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)   += dump_linuxpagetables-generic.o
-obj-$(CONFIG_PPC_BOOK3S_32)+= dump_linuxpagetables-generic.o
+obj-$(CONFIG_PPC_BOOK3S_32)+= dump_linuxpagetables-generic.o dump_bats.o
 obj-$(CONFIG_PPC_BOOK3S_64)+= dump_linuxpagetables-book3s64.o
 endif
 obj-$(CONFIG_PPC_HTDUMP)   += dump_hashpagetable.o
diff --git a/arch/powerpc/mm/dump_bats.c b/arch/powerpc/mm/dump_bats.c
new file mode 100644
index ..824a94ee9051
--- /dev/null
+++ b/arch/powerpc/mm/dump_bats.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright 2018, Christophe Leroy CS S.I.
+ * 
+ *
+ * This dumps the content of BATS
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+static char *pp_601(int k, int pp)
+{
+   if (pp == 0)
+   return k ? "NA" : "RWX";
+   if (pp == 1)
+   return k ? "ROX" : "RWX";
+   if (pp == 2)
+   return k ? "RWX" : "RWX";
+   return k ? "ROX" : "ROX";
+}
+
+static void bat_show_601(struct seq_file *m, int idx, u32 lower, u32 upper)
+{
+   u32 blpi = upper & 0xfffe;
+   u32 k = (upper >> 2) & 3;
+   u32 pp = upper & 3;
+   u32 pbn = lower & 0xfffe;
+   u32 bsm = lower & 0x3ff;
+   u32 size = (bsm + 1) << 17;
+
+   seq_printf(m, "%d: ", idx);
+   if (!(lower & 0x40)) {
+   seq_puts(m, "-\n");
+   return;
+   }
+
+   seq_printf(m, "0x%08x-0x%08x ", blpi, blpi + size - 1);
+   seq_printf(m, "0x%08x ", pbn);
+
+   seq_printf(m, "Kernel %s User %s", pp_601(k & 2, pp), pp_601(k & 1, 
pp));
+
+   if (lower & _PAGE_WRITETHRU)
+   seq_puts(m, "write through ");
+   if (lower & _PAGE_NO_CACHE)
+   seq_puts(m, "no cache ");
+   if (lower & _PAGE_COHERENT)
+   seq_puts(m, "coherent ");
+   seq_puts(m, "\n");
+}
+
+#define BAT_SHOW_601(_m, _n, _l, _u) bat_show_601(_m, _n, mfspr(_l), mfspr(_u))
+
+static int bats_show_601(struct seq_file *m, void *v)
+{
+   seq_puts(m, "Block Address Translation:\n");
+
+   BAT_SHOW_601(m, 0, SPRN_IBAT0L, SPRN_IBAT0U);
+   BAT_SHOW_601(m, 1, SPRN_IBAT1L, SPRN_IBAT1U);
+   BAT_SHOW_601(m, 2, SPRN_IBAT2L, SPRN_IBAT2U);
+   BAT_SHOW_601(m, 3, SPRN_IBAT3L, SPRN_IBAT3U);
+
+   return 0;
+}
+
+static void bat_show_603(struct seq_file *m, int idx, u32 lower, u32 upper, 
bool is_d)
+{
+   u32 bepi = upper & 0xfffe;
+   u32 bl = (upper >> 2) & 0x7ff;
+   u32 k = upper & 3;
+   u32 brpn = lower & 0xfffe;
+   u32 size = (bl + 1) << 17;
+
+   seq_printf(m, "%d: ", idx);
+   if (k == 0) {
+   seq_puts(m, "-\n");
+   return;
+   }
+
+   seq_printf(m, "0x%08x-0x%08x ", bepi, bepi + size - 1);
+   seq_printf(m, "0x%08x ", brpn);
+
+   if (k == 1)
+   seq_puts(m, "User ");
+   else if (k == 2)
+   seq_puts(m, "Kernel ");
+   else
+   seq_puts(m, "Kernel/User ");
+
+   if (lower & BPP_RX)
+   seq_puts(m, is_d ? "RO " : "EXEC ");
+   else if (lower & BPP_RW)
+   seq_puts(m, is_d ? "RW " : "EXEC ");
+   else
+   seq_puts(m, is_d ? "NA " : "NX   ");
+
+   if (lower & _PAGE_WRITETHRU)
+   seq_puts(m, "write through ");
+   if (lower & _PAGE_NO_CACHE)
+   seq_puts(m, "no cache ");
+   if (lower & _PAGE_COHERENT)
+   seq_puts(m, "coherent ");
+   if (lower & _PAGE_GUARDED)
+   seq_puts(m, "guarded ");
+   seq_puts(m, "\n");
+}
+
+#define BAT_SHOW_603(_m, _n, _l, _u, _d) bat_show_603(_m, _n, 

Re: [PATCH v2] PCI/MSI: Don't touch MSI bits when the PCI device is disconnected

2018-11-14 Thread Bjorn Helgaas
On Wed, Nov 14, 2018 at 07:22:04PM +, alex_gagn...@dellteam.com wrote:
> On 11/14/2018 12:00 AM, Bjorn Helgaas wrote:
> > On Tue, Nov 13, 2018 at 10:39:15PM +, alex_gagn...@dellteam.com wrote:
> >> On 11/12/2018 11:02 PM, Bjorn Helgaas wrote:
> >> ...
> >>> Do you think Linux observes the rule about not touching AER bits on
> >>> FFS?  I'm not sure it does.  I'm not even sure what section of the
> >>> spec is relevant.
> >>
> >> I haven't found any place where linux breaks this rule. I'm very
> >> confident that, unless otherwise instructed, we follow this rule.
> > 
> > Just to make sure we're on the same page, can you point me to this
> > rule?  I do see that OSPM must request control of AER using _OSC
> > before it touches the AER registers.  What I don't see is the
> > connection between firmware-first and the AER registers.
> 
> ACPI 6.2 - 6.2.11.3, Table 6-197:
> 
> PCI Express Advanced Error Reporting control:
>   * The firmware sets this bit to 1 to grant control over PCI Express 
> Advanced Error Reporting. If firmware allows the OS control of this 
> feature, then in the context of the _OSC method it must ensure that 
> error messages are routed to device interrupts as described in the PCI 
> Express Base Specification[...]

The PCIe Base Spec is pretty big, so I wish this reference were a
little more explicit.  I *guess* maybe it's referring to PCIe r4.0,
figure 6-3 in sec 6.2.6, where PCIe ERR_* Messages can be routed to
"INTx or MSI Error Interrupts" and/or "platform-specific System Error"
interrupts.

"Device interrupts" seems like it refers to the "INTx or MSI"
interrupts, not the platform-specific System Errors, so I would read
that as saying "if firmware grants OS control of AER via _OSC,
firmware must set the AER Reporting Enables in the AER Root Error
Command register."  But that seems a little silly because the OS now
*owns* the AER capability and it can set the AER Root Error Command
register itself if it wants to.

And I still don't see the connection here with Firmware-First.  I'm
pretty sure firmware could not be notified via INTx or MSI interrupts
because those are totally managed by OSPM.

> > The closest I can find is the "Enabled" field in the HEST PCIe
> > AER structures (ACPI v6.2, sec 18.3.2.4, .5, .6), where it says:
> > 
> >If the field value is 1, indicates this error source is
> >to be enabled.
> > 
> >If the field value is 0, indicates that the error source
> >is not to be enabled.
> > 
> >If FIRMWARE_FIRST is set in the flags field, the Enabled
> >field is ignored by the OSPM.
> > 
> > AFAICT, Linux completely ignores the Enabled field in these
> > structures.
> 
> I don't think ignoring the field is a problem:
>   * With FFS, OS should ignore it.
>   * Without FFS, we have control, and we get to make the decisions anyway.
> In the latter case we decide whether to use AER, independent of the crap 
> in ACPI. I'm not even sure why "Enabled" matters in native AER handling. 

It seems like these HEST structures are "here's how firmware thinks
you should set up AER on this device".  But I agree, I have no idea
how to interpret "Enabled".  The rest of the HEST fields cover all the
useful AER registers, including the Reporting Enables in the AER Root
Error Command register *and* the Error Reporting Enables in the Device
Control register.  So I don't know what the "Enabled" field adds to
all that.  What a mess.

> > For firmware-first to work, firmware has to get control.  How does
> > it get control?  How does OSPM know to either set up that
> > mechanism or keep its mitts off something firmware set up before
> > handoff?
> 
> My understanding is that, if FW keeps control of AER in _OSC, then
> it will have set things up to get notified instead of the OS. OSPM
> not touching AER bits is to make sure it doesn't mess up FW's setup.
> I think there are some proprietary bits in the root port to route
> interrupts to SMIs instead of the AER vectors.

It makes good sense that if OSPM doesn't have AER control, firmware
does all AER handling, including any setup for firmware-first
notification.  If we can assume that firmware-first notification is
done in some way the OS doesn't know about and can't mess up, that
would be awesome.

But I think the VMD model really has nothing to do with the APEI
firmware-first model.  With VMD, it sounds like OSPM owns the AER
capability and doesn't know firmware exists *except* that it has to be
careful not to step on firmware's interrupt.  So maybe we can handle it
separately.

Bjorn


Re: [PATCH kernel v3 02/22] powerpc/mm/iommu/vfio_spapr_tce: Change mm_iommu_get to reference a region

2018-11-14 Thread David Gibson
On Tue, Nov 13, 2018 at 07:28:03PM +1100, Alexey Kardashevskiy wrote:
> Normally mm_iommu_get() is supposed to add a reference and
> mm_iommu_put() to remove it. However historically mm_iommu_find() does
> the referencing and mm_iommu_get() is doing allocation and referencing.
> 
> We are going to add another helper to preregister device memory so
> instead of having mm_iommu_new() which pre-registers the normal memory
> and references the region, we need separate helpers for pre-registering
> and referencing.
> 
> This renames:
> - mm_iommu_get to mm_iommu_new;
> - mm_iommu_find to mm_iommu_get.
> 
> To make the mm_iommu_get name reflect what it is supposed to do, this
> changes mm_iommu_get() to reference the region so from now on for every
> mm_iommu_get() we need a matching mm_iommu_put().
> 
> Signed-off-by: Alexey Kardashevskiy 

Reviewed-by: David Gibson 

> ---
> Changes:
> v2:
> * merged 2 patches into one
> ---
>  arch/powerpc/include/asm/mmu_context.h |  4 +--
>  arch/powerpc/mm/mmu_context_iommu.c| 13 ++---
>  drivers/vfio/vfio_iommu_spapr_tce.c| 37 +-
>  3 files changed, 35 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/mmu_context.h 
> b/arch/powerpc/include/asm/mmu_context.h
> index 0381394..2d6b00d 100644
> --- a/arch/powerpc/include/asm/mmu_context.h
> +++ b/arch/powerpc/include/asm/mmu_context.h
> @@ -21,7 +21,7 @@ struct mm_iommu_table_group_mem_t;
>  
>  extern int isolate_lru_page(struct page *page);  /* from internal.h */
>  extern bool mm_iommu_preregistered(struct mm_struct *mm);
> -extern long mm_iommu_get(struct mm_struct *mm,
> +extern long mm_iommu_new(struct mm_struct *mm,
>   unsigned long ua, unsigned long entries,
>   struct mm_iommu_table_group_mem_t **pmem);
>  extern long mm_iommu_put(struct mm_struct *mm,
> @@ -32,7 +32,7 @@ extern struct mm_iommu_table_group_mem_t 
> *mm_iommu_lookup(struct mm_struct *mm,
>   unsigned long ua, unsigned long size);
>  extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
>   struct mm_struct *mm, unsigned long ua, unsigned long size);
> -extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
> +extern struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
>   unsigned long ua, unsigned long entries);
>  extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
>   unsigned long ua, unsigned int pageshift, unsigned long *hpa);
> diff --git a/arch/powerpc/mm/mmu_context_iommu.c 
> b/arch/powerpc/mm/mmu_context_iommu.c
> index 1d5161f..babc6ad 100644
> --- a/arch/powerpc/mm/mmu_context_iommu.c
> +++ b/arch/powerpc/mm/mmu_context_iommu.c
> @@ -89,7 +89,7 @@ bool mm_iommu_preregistered(struct mm_struct *mm)
>  }
>  EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
>  
> -long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long 
> entries,
> +long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long 
> entries,
>   struct mm_iommu_table_group_mem_t **pmem)
>  {
>   struct mm_iommu_table_group_mem_t *mem;
> @@ -202,7 +202,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, 
> unsigned long entries,
>  
>   return ret;
>  }
> -EXPORT_SYMBOL_GPL(mm_iommu_get);
> +EXPORT_SYMBOL_GPL(mm_iommu_new);
>  
>  static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
>  {
> @@ -318,21 +318,26 @@ struct mm_iommu_table_group_mem_t 
> *mm_iommu_lookup_rm(struct mm_struct *mm,
>   return ret;
>  }
>  
> -struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
> +struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
>   unsigned long ua, unsigned long entries)
>  {
>   struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
>  
> + mutex_lock(_list_mutex);
> +
>   list_for_each_entry_rcu(mem, >context.iommu_group_mem_list, next) {
>   if ((mem->ua == ua) && (mem->entries == entries)) {
>   ret = mem;
> + ++mem->used;
>   break;
>   }
>   }
>  
> + mutex_unlock(_list_mutex);
> +
>   return ret;
>  }
> -EXPORT_SYMBOL_GPL(mm_iommu_find);
> +EXPORT_SYMBOL_GPL(mm_iommu_get);
>  
>  long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
>   unsigned long ua, unsigned int pageshift, unsigned long *hpa)
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
> b/drivers/vfio/vfio_iommu_spapr_tce.c
> index ad63725..56db071 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -151,12 +151,13 @@ static long tce_iommu_unregister_pages(struct 
> tce_container *container,
>  {
>   struct mm_iommu_table_group_mem_t *mem;
>   struct tce_iommu_prereg *tcemem;
> - bool found = false;
> + bool found;
> + long ret;
>  
>   if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
>   

Re: [PATCH kernel] powerpc/powernv/ioda: Reduce a number of hooks in pnv_phb

2018-11-14 Thread Sam Bobroff
On Tue, Oct 16, 2018 at 01:34:09PM +1100, Alexey Kardashevskiy wrote:
> fixup_phb() is never used, this removes it.
> 
> pick_m64_pe() and reserve_m64_pe() are always defined for all powernv
> PHBs: they are initialized by pnv_ioda_parse_m64_window() which is
> called unconditionally from pnv_pci_init_ioda_phb() which initializes
> all known PHB types on powernv so we can open code them.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  arch/powerpc/platforms/powernv/pci.h  | 4 
>  arch/powerpc/platforms/powernv/pci-ioda.c | 9 +++--
>  2 files changed, 3 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci.h 
> b/arch/powerpc/platforms/powernv/pci.h
> index 8b37b28..2131373 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -115,11 +115,7 @@ struct pnv_phb {
>unsigned int hwirq, unsigned int virq,
>unsigned int is_64, struct msi_msg *msg);
>   void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
> - void (*fixup_phb)(struct pci_controller *hose);
>   int (*init_m64)(struct pnv_phb *phb);
> - void (*reserve_m64_pe)(struct pci_bus *bus,
> -unsigned long *pe_bitmap, bool all);
> - struct pnv_ioda_pe *(*pick_m64_pe)(struct pci_bus *bus, bool all);
>   int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
>   void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
>   int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 78b61f0..15a4556 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -518,8 +518,6 @@ static void __init pnv_ioda_parse_m64_window(struct 
> pnv_phb *phb)
>   phb->init_m64 = pnv_ioda1_init_m64;
>   else
>   phb->init_m64 = pnv_ioda2_init_m64;
> - phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe;
> - phb->pick_m64_pe = pnv_ioda_pick_m64_pe;
>  }
>  
>  static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
> @@ -1161,8 +1159,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct 
> pci_bus *bus, bool all)
>   pe = >ioda.pe_array[phb->ioda.root_pe_idx];
>  
>   /* Check if PE is determined by M64 */
> - if (!pe && phb->pick_m64_pe)
> - pe = phb->pick_m64_pe(bus, all);
> + if (!pe)
> + pe = pnv_ioda_pick_m64_pe(bus, all);

What about the cases where pnv_ioda_parse_m64_window() returns before
setting pick_m64_pe or reserve_m64_pe?

For example, if !firmware_has_feature(FW_FEATURE_OPAL).  In that case,
it looks like this change would cause pnv_ioda_pick_m64_pe() to be
called, and it would try to perform an OPAL call without OPAL support.

(Is it even possible to have powernv without OPAL?)

>  
>   /* The PE number isn't pinned by M64 */
>   if (!pe)
> @@ -3395,8 +3393,7 @@ static void pnv_pci_setup_bridge(struct pci_bus *bus, 
> unsigned long type)
>   return;
>  
>   /* Reserve PEs according to used M64 resources */
> - if (phb->reserve_m64_pe)
> - phb->reserve_m64_pe(bus, NULL, all);
> + pnv_ioda_reserve_m64_pe(bus, NULL, all);
>  
>   /*
>* Assign PE. We might run here because of partial hotplug.
> -- 
> 2.11.0
> 


signature.asc
Description: PGP signature


Re: [RFC PATCH 13/14] powerpc/tm: Do not restore TM without SPRs

2018-11-14 Thread Michael Neuling
On Tue, 2018-11-06 at 10:40 -0200, Breno Leitao wrote:
> Currently the signal context restore code enables the bit on the MSR
> register without restoring the TM SPRs, which can cause undesired side
> effects.
> 
> This is not correct because if TM is enabled in the MSR, it means the TM
> SPR registers are valid and up to date, which is not the case here. In
> fact, the live registers may contain the previous thread's SPRs.
> 
> Functions check whether the register values are valid by looking at
> whether the facility is enabled in the MSR, as MSR[TM] being set means
> that the TM SPRs are hot.
> 
> When just enabling MSR[TM] without updating the live SPRs, this can cause a
> crash, since current TM SPR from previous thread will be saved on the
> current thread, and might not have TEXASR[FS] set, for example.
> 
> Signed-off-by: Breno Leitao 
> ---
>  arch/powerpc/kernel/signal_64.c | 12 +++-
>  1 file changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
> index 487c3b6aa2e3..156b90e8ee78 100644
> --- a/arch/powerpc/kernel/signal_64.c
> +++ b/arch/powerpc/kernel/signal_64.c
> @@ -478,8 +478,18 @@ static long restore_tm_sigcontexts(struct task_struct
> *tsk,
>* happened whilst in the signal handler and load_tm overflowed,
>* disabling the TM bit. In either case we can end up with an illegal
>* TM state leading to a TM Bad Thing when we return to userspace.
> +  *
> +  * Every time MSR_TM is enabled, mainly for the b) case, the TM SPRs
> +  * must be restored in the live registers, since the live registers
> +  * could contain garbage and later we want to read from live, since
> +  * MSR_TM is enabled, and MSR[TM] is what is used to check if the
> +  * TM SPRs live registers are valid or not.
>*/
> - regs->msr |= MSR_TM;
> + if ((regs->msr & MSR_TM) == 0) {
> + regs->msr |= MSR_TM;
> + tm_enable();
> + tm_restore_sprs(>thread);
> + }

I'm wondering if we should put the save/restore TM registers in the early
entry/exit code too. We'd need to add the check on msr[tm]/load_tm.

Distributing the SPR save/restore throughout the kernel is just going to lead us
to similar problems that we are having now with reclaim/recheckpoint.

Mikey


>  
>   /* pull in MSR LE from user context */
>   regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);


Re: [PATCH kernel] powerpc/powernv/ioda1: Remove dead code for a single device PE

2018-11-14 Thread Sam Bobroff
On Thu, Nov 08, 2018 at 04:45:04PM +1100, Alexey Kardashevskiy wrote:
> Ping?
> 
> 
> On 16/10/2018 13:30, Alexey Kardashevskiy wrote:
> > At the moment PNV_IODA_PE_DEV is only used for NPU PEs which are not
> > present on IODA1 machines (i.e. POWER7) so let's remove a piece of
> > dead code.
> > 
> > Signed-off-by: Alexey Kardashevskiy 
> > ---
> > 
> > We might actually want to get rid of the entire IODA1 there.
> > ---
> >  arch/powerpc/platforms/powernv/pci-ioda.c | 10 +-
> >  1 file changed, 1 insertion(+), 9 deletions(-)
> > 
> > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> > b/arch/powerpc/platforms/powernv/pci-ioda.c
> > index cde7102..78b61f0 100644
> > --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> > @@ -2367,15 +2367,7 @@ static void pnv_pci_ioda1_setup_dma_pe(struct 
> > pnv_phb *phb,
> > pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
> > iommu_init_table(tbl, phb->hose->node);
> >  
> > -   if (pe->flags & PNV_IODA_PE_DEV) {
> > -   /*
> > -* Setting table base here only for carrying iommu_group
> > -* further down to let iommu_add_device() do the job.
> > -* pnv_pci_ioda_dma_dev_setup will override it later anyway.
> > -*/
> > -   set_iommu_table_base(>pdev->dev, tbl);
> > -   iommu_add_device(>pdev->dev);
> > -   } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
> > +   if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
> > pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
> >  
> > return;
> > 
> 
> -- 
> Alexey
> 

Looks good to me. It's pretty easy to see that PNV_IODA_PE_DEV is only
set for PHBs that are PNV_PHB_NPU_NVLINK or PNV_PHB_NPU_OCAPI, so:

Reviewed-by: Sam Bobroff 


signature.asc
Description: PGP signature


Re: [RFC PATCH 09/14] powerpc/tm: Warn if state is transactional

2018-11-14 Thread Michael Neuling



On Tue, 2018-11-06 at 10:40 -0200, Breno Leitao wrote:
> Since every kernel entrance is calling TM_KERNEL_ENTRY, it is not
> expected to arrive at this point with a suspended transaction.
> 
> If that is the case, cause a warning and reclaim the current thread in
> order to avoid a TM Bad Thing.
> 
> Signed-off-by: Breno Leitao 
> ---
>  arch/powerpc/kernel/process.c | 7 +++
>  arch/powerpc/kernel/signal.c  | 2 +-
>  2 files changed, 4 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index 73872f751b33..849591bf0881 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -1752,11 +1752,10 @@ void start_thread(struct pt_regs *regs, unsigned long
> start, unsigned long sp)
>  
>  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>   /*
> -  * Clear any transactional state, we're exec()ing. The cause is
> -  * not important as there will never be a recheckpoint so it's not
> -  * user visible.
> +  * It is a bug if the transaction was not reclaimed until this
> +  * point. Warn us and try to workaround it calling tm_reclaim().
>*/
> - if (MSR_TM_SUSPENDED(mfmsr()))
> + if (WARN_ON(MSR_TM_SUSPENDED(mfmsr(
>   tm_reclaim_current(0);
>  #endif

Let's turn these into BUG_ON()

Mikey
 
> diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
> index b3e8db376ecd..cbaccc2be0fb 100644
> --- a/arch/powerpc/kernel/signal.c
> +++ b/arch/powerpc/kernel/signal.c
> @@ -203,7 +203,7 @@ unsigned long get_tm_stackpointer(struct task_struct *tsk)
>  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>   BUG_ON(tsk != current);
>  
> - if (MSR_TM_ACTIVE(tsk->thread.regs->msr)) {
> + if (WARN_ON(MSR_TM_ACTIVE(mfmsr( {
>   tm_reclaim_current(TM_CAUSE_SIGNAL);
>   if (MSR_TM_TRANSACTIONAL(tsk->thread.regs->msr))
>   return tsk->thread.ckpt_regs.gpr[1];


Re: [RFC PATCH 08/14] powerpc/tm: Recheckpoint at exit path

2018-11-14 Thread Michael Neuling
On Tue, 2018-11-06 at 10:40 -0200, Breno Leitao wrote:
> In the past, TIF_RESTORE_TM was being handled with the rest of the TIF
> workers,
> but, that was too early, and can cause some IRQ to be replayed in suspended
> state (after recheckpoint).
> 
> This patch moves TIF_RESTORE_TM handler to as late as possible, it also
> forces the IRQ to be disabled, and it will continue to be until RFID, so,
> no IRQ will be replayed at all. I.e, if trecheckpoint happens, it will RFID
> to userspace.
> 
> This makes TIF_RESTORE_TM a special case that should not be handled
> similarly to the _TIF_USER_WORK_MASK.
> 
> Since _TIF_RESTORE_TM is not part of _TIF_USER_WORK_MASK anymore, we
> need to force system_call_exit to continue to leaves through
> fast_exception_return, so, we add the flags together with
> _TIF_USER_WORK_MASK at system_call_exist path.
> 
> If this flag is set at system_call_exit, it means that recheckpoint
> will be called, and doing it through fast_exception_return is the only
> way to do so.
> 
> Signed-off-by: Breno Leitao 
> ---
>  arch/powerpc/include/asm/thread_info.h |  2 +-
>  arch/powerpc/kernel/entry_64.S | 23 ++-
>  2 files changed, 15 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/thread_info.h
> b/arch/powerpc/include/asm/thread_info.h
> index 544cac0474cb..2835d60bc9ef 100644
> --- a/arch/powerpc/include/asm/thread_info.h
> +++ b/arch/powerpc/include/asm/thread_info.h
> @@ -139,7 +139,7 @@ void arch_setup_new_exec(void);
>  
>  #define _TIF_USER_WORK_MASK  (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
>_TIF_NOTIFY_RESUME | _TIF_UPROBE | \
> -  _TIF_RESTORE_TM | _TIF_PATCH_PENDING | \
> +  _TIF_PATCH_PENDING | \
>_TIF_FSCHECK)
>  #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
>  
> diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
> index 17484ebda66c..a86619edf29d 100644
> --- a/arch/powerpc/kernel/entry_64.S
> +++ b/arch/powerpc/kernel/entry_64.S
> @@ -255,7 +255,7 @@ system_call_exit:
>  
>   ld  r9,TI_FLAGS(r12)
>   li  r11,-MAX_ERRNO
> - andi.   r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MAS
> K|_TIF_PERSYSCALL_MASK)
> + andi.   r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|
> _TIF_PERSYSCALL_MASK |_TIF_RESTORE_TM)
>   bne-.Lsyscall_exit_work
>  
>   andi.   r0,r8,MSR_FP
> @@ -784,14 +784,6 @@ _GLOBAL(ret_from_except_lite)
>   SCHEDULE_USER
>   b   ret_from_except_lite
>  2:
> -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> - andi.   r0,r4,_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM
> - bne 3f  /* only restore TM if nothing else to do */
> - addir3,r1,STACK_FRAME_OVERHEAD
> - bl  restore_tm_state
> - b   restore
> -3:
> -#endif
>   bl  save_nvgprs
>   /*
>* Use a non volatile GPR to save and restore our thread_info flags
> @@ -938,6 +930,19 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
>*/
>   .globl  fast_exception_return
>  fast_exception_return:
> +
> +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> + CURRENT_THREAD_INFO(r4, r1)
> + ld  r4,TI_FLAGS(r4)
> + andi.   r0,r4,_TIF_RESTORE_TM
> + beq 22f
> + ld  r4,_MSR(r1) /* TODO: MSR[!PR] shouldn't be here */
> + andi.   r0,r4,MSR_PR
> + beq 22f  /* Skip if Kernel thread */
> + addir3,r1,STACK_FRAME_OVERHEAD
> + bl  restore_tm_state

Calling out to C here is a bit concerning this late.

The main thing you are calling out to is asm anyway (with a small C wrapper). 
We might want to adapt the asm this case, rather than the old model of getting
it called from __switch_to(). We shouldn't need to call
tm_reclaim/tm_recheckpoint from C anymore (especially if we use BUG_ON() rather
than WARN_ON() in the cases already mentioned.).

Same for patch 1.

> +22:
> +#endif
>   ld  r3,_MSR(r1)
>   ld  r4,_CTR(r1)
>   ld  r0,_LINK(r1)


Re: It looks like that wild_bctr on powerpc/fixes is still not compiling

2018-11-14 Thread Gustavo Romero

Hi Michael,

On 11/13/2018 10:58 PM, Michael Ellerman wrote:

It looks like binutils 2.27 doesn't accept ULL but binutils 2.28 does.

Ah yep, here:

   
https://sourceware.org/git/gitweb.cgi?p=binutils-gdb.git;a=commit;h=86b80085c889cd388fa677a5ae9053fd4be3776c


The following trivial workaround solves it by forcing a type promotion on
the compiler side whilst leaving the macro, as taken into the asm code,
without the UL suffix:

diff --git a/tools/testing/selftests/powerpc/mm/wild_bctr.c 
b/tools/testing/selftests/powerpc/mm/wild_bctr.c
index 90469a9..d2772f4 100644
--- a/tools/testing/selftests/powerpc/mm/wild_bctr.c
+++ b/tools/testing/selftests/powerpc/mm/wild_bctr.c
@@ -47,8 +47,9 @@ static int ok(void)
  return 0;
   }
   
-#define REG_POISON 0x5a5aUL

-#define POISONED_REG(n)((REG_POISON << 48) | ((n) << 32) | (REG_POISON 
<< 16) | (n))
+#define REG_POISON 0x5a5a
+#define POISONED_REG(n)(((REG_POISON+0UL) << 48) | ((n) << 32) | 
((REG_POISON+0UL) << 16) | (n))
   
   static inline void poison_regs(void)

   {


Should I contribute such a fix?


Yes thanks.


Segher kindly suggested to use explicitly "unsigned long" (thanks!), so I sent
a v2 to:

https://lists.ozlabs.org/pipermail/linuxppc-dev/2018-November/181434.html


Best regards,
Gustavo



[PATCH] powerpc/64: Fix kernel stack 16-byte alignment

2018-11-14 Thread Nicholas Piggin
Commit 4c2de74cc869 ("powerpc/64: Interrupts save PPR on stack rather
than thread_struct") changed sizeof(struct pt_regs) % 16 from 0 to 8,
which causes the interrupt frame allocation on kernel entry to put the
kernel stack out of alignment.

Add a pad field to fix alignment, and add a BUILD_BUG_ON to catch this
in future.

Fixes: 4c2de74cc869 ("powerpc/64: Interrupts save PPR on stack rather
than thread_struct")
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/ptrace.h | 1 +
 arch/powerpc/kernel/setup_64.c| 2 ++
 2 files changed, 3 insertions(+)

diff --git a/arch/powerpc/include/asm/ptrace.h 
b/arch/powerpc/include/asm/ptrace.h
index f73886a1a7f5..1513292bf046 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -54,6 +54,7 @@ struct pt_regs
 
 #ifdef CONFIG_PPC64
unsigned long ppr;
+   unsigned long pad;  /* Maintain 16 byte interrupt stack alignment */
 #endif
 };
 #endif
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 2a51e4cc8246..236c1151a3a7 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -636,6 +636,8 @@ static void *__init alloc_stack(unsigned long limit, int 
cpu)
 {
unsigned long pa;
 
+   BUILD_BUG_ON(STACK_INT_FRAME_SIZE % 16);
+
pa = memblock_alloc_base_nid(THREAD_SIZE, THREAD_SIZE, limit,
early_cpu_to_node(cpu), MEMBLOCK_NONE);
if (!pa) {
-- 
2.18.0



[PATCH v2] selftests/powerpc: Adjust wild_bctr to build with old gcc

2018-11-14 Thread Gustavo Romero
Currently the selftest wild_bctr can fail to build when an old gcc is used,
notably on gcc using a binutils version <= 2.27, because the assembler does
not support the integer suffix UL.

This patch adjusts the wild_bctr test so the REG_POISON value is still
treated as an unsigned long for the shifts on compilation but the UL
suffix is absent on the stringification, so the inline asm code generated
has no UL suffixes.

Signed-off-by: Gustavo Romero 
---
 tools/testing/selftests/powerpc/mm/wild_bctr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/powerpc/mm/wild_bctr.c 
b/tools/testing/selftests/powerpc/mm/wild_bctr.c
index 90469a9..aadac172 100644
--- a/tools/testing/selftests/powerpc/mm/wild_bctr.c
+++ b/tools/testing/selftests/powerpc/mm/wild_bctr.c
@@ -47,8 +47,8 @@ static int ok(void)
return 0;
 }
 
-#define REG_POISON 0x5a5aUL
-#define POISONED_REG(n)((REG_POISON << 48) | ((n) << 32) | (REG_POISON 
<< 16) | (n))
+#define REG_POISON 0x5a5a
+#define POISONED_REG(n)((((unsigned long) REG_POISON) << 48) | ((n) << 
32) | (((unsigned long) REG_POISON) << 16) | (n))
 
 static inline void poison_regs(void)
 {
-- 
2.7.4



Re: [RFC PATCH 05/14] powerpc/tm: Refactor the __switch_to_tm code

2018-11-14 Thread Michael Neuling
On Tue, 2018-11-06 at 10:40 -0200, Breno Leitao wrote:
> __switch_to_tm is the function that switches between two tasks which might
> have TM enabled. This function is clearly split in two parts, the task that
> is leaving the CPU, known as 'prev' and the task that is being scheduled,
> known as 'new'.
> 
> It starts checking if the previous task had TM enable, if so, it increases
> the load_tm (this is the only place we increment load_tm). It also saves
> the TM SPRs here.
> 
> If the previous task was scheduled out with a transaction active, the
> failure cause needs to be updated, since it might contain the failure cause
> that caused the exception, as TM_CAUSE_MISC. In this case, since there was
> a context switch, overwrite the failure cause.
> 
> If the previous task has overflowed load_tm, disable TM, putting the
> facility save/restore lazy mechanism at lazy mode.
> 
> Regarding the 'new' task being scheduled, restoring TM SPRs is enough if
> the task had TM enabled when it was de-scheduled. (Checking if a
> recheckpoint would be required will be done later, at restore_tm_state()
> stage.)
> 
> On top of that, both tm_reclaim_task() and tm_recheckpoint_new_task()
> functions are not used anymore, removing them.

Is the above describing the previous functionality or the refactored
functionality?

> 
> Signed-off-by: Breno Leitao 
> ---
>  arch/powerpc/kernel/process.c | 167 --
>  1 file changed, 78 insertions(+), 89 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index 1842fd96b123..73872f751b33 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -912,48 +912,6 @@ void tm_reclaim_current(uint8_t cause)
>   tm_reclaim_thread(>thread, cause);
>  }
>  
> -static inline void tm_reclaim_task(struct task_struct *tsk)
> -{
> - /* We have to work out if we're switching from/to a task that's in the
> -  * middle of a transaction.
> -  *
> -  * In switching we need to maintain a 2nd register state as
> -  * oldtask->thread.ckpt_regs.  We tm_reclaim(oldproc); this saves the
> -  * checkpointed (tbegin) state in ckpt_regs, ckfp_state and
> -  * ckvr_state
> -  *
> -  * We also context switch (save) TFHAR/TEXASR/TFIAR in here.
> -  */
> - struct thread_struct *thr = >thread;
> -
> - if (!thr->regs)
> - return;
> -
> - if (!MSR_TM_ACTIVE(thr->regs->msr))
> - goto out_and_saveregs;
> -
> - WARN_ON(tm_suspend_disabled);
> -
> - TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, "
> -  "ccr=%lx, msr=%lx, trap=%lx)\n",
> -  tsk->pid, thr->regs->nip,
> -  thr->regs->ccr, thr->regs->msr,
> -  thr->regs->trap);
> -
> - tm_reclaim_thread(thr, TM_CAUSE_RESCHED);
> -
> - TM_DEBUG("--- tm_reclaim on pid %d complete\n",
> -  tsk->pid);
> -
> -out_and_saveregs:
> - /* Always save the regs here, even if a transaction's not active.
> -  * This context-switches a thread's TM info SPRs.  We do it here to
> -  * be consistent with the restore path (in recheckpoint) which
> -  * cannot happen later in _switch().
> -  */
> - tm_save_sprs(thr);
> -}
> -
>  extern void __tm_recheckpoint(struct thread_struct *thread);
>  
>  void tm_recheckpoint(struct thread_struct *thread)
> @@ -980,59 +938,91 @@ void tm_recheckpoint(struct thread_struct *thread)
>   local_irq_restore(flags);
>  }
>  
> -static inline void tm_recheckpoint_new_task(struct task_struct *new)
> +static void tm_change_failure_cause(struct task_struct *task, uint8_t cause)
>  {
> - if (!cpu_has_feature(CPU_FTR_TM))
> - return;
> -
> - /* Recheckpoint the registers of the thread we're about to switch to.
> -  *
> -  * If the task was using FP, we non-lazily reload both the original and
> -  * the speculative FP register states.  This is because the kernel
> -  * doesn't see if/when a TM rollback occurs, so if we take an FP
> -  * unavailable later, we are unable to determine which set of FP regs
> -  * need to be restored.
> -  */
> - if (!tm_enabled(new))
> - return;
> -
> - if (!MSR_TM_ACTIVE(new->thread.regs->msr)){
> - tm_restore_sprs(>thread);
> - return;
> - }
> - /* Recheckpoint to restore original checkpointed register state. */
> - TM_DEBUG("*** tm_recheckpoint of pid %d (new->msr 0x%lx)\n",
> -  new->pid, new->thread.regs->msr);
> -
> - tm_recheckpoint(>thread);
> -
> - /*
> -  * The checkpointed state has been restored but the live state has
> -  * not, ensure all the math functionality is turned off to trigger
> -  * restore_math() to reload.
> -  */
> - new->thread.regs->msr &= ~(MSR_FP | MSR_VEC | MSR_VSX);
> -
> - TM_DEBUG("*** tm_recheckpoint of pid %d complete "
> -  "(kernel msr 0x%lx)\n",
> -  

Re: [RFC PATCH 01/14] powerpc/tm: Reclaim transaction on kernel entry

2018-11-14 Thread Nicholas Piggin
On Tue,  6 Nov 2018 10:40:15 -0200
Breno Leitao  wrote:

> This patch creates a macro that will be invoked on all entrance to the
> kernel, so, in kernel space the transaction will be completely reclaimed
> and not suspended anymore.

This doesn't get invoked on _all_ kernel entries, by the looks (SLB
miss or early machine check, for example). And of course we always
have to run _some_ MSR[PR]=0 code before it is reclaimed. So it is
important to document the rules for what code must not run with TM
suspended now, and why.

> 
> This patchset checks if we are coming from PR, if not, skip. This is useful
> when there is an irq_replay() being called after recheckpoint, when the IRQ
> is re-enabled. In this case, we do not want to re-reclaim and
> re-recheckpoint, thus, if not coming from PR, skip it completely.

I really should learn a bit more about TM but I've been trying not to.
Seeing as I don't, I don't really understand this comment. Why don't
we want to reclaim?

> 
> This macro does not care about TM SPR also, it will only be saved and
> restore in the context switch code now on.
> 
> This macro will return 0 or 1 in r3 register, to specify if a reclaim was
> executed or not.

We want to be careful about efficiency here, so I think this macro
should be tightened up. A lot of code doesn't seem to care about the
return value for example, so you could have two macros, one which
cares about return, another which doesn't. Instead of setting value
via branches which you then use to test and branch again, macro could
accept branch labels to go to perhaps.

It would be good to move the TM reclaim path out of line and make the
common case a not taken branch. Don't know how feasible that will be.

> 
> This patchset is based on initial work done by Cyril:
> https://patchwork.ozlabs.org/cover/875341/
> 
> Signed-off-by: Breno Leitao 
> ---
>  arch/powerpc/include/asm/exception-64s.h | 46 
>  arch/powerpc/kernel/entry_64.S   | 10 ++
>  arch/powerpc/kernel/exceptions-64s.S | 12 +--
>  3 files changed, 66 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/exception-64s.h 
> b/arch/powerpc/include/asm/exception-64s.h
> index 3b4767ed3ec5..931a74ba037b 100644
> --- a/arch/powerpc/include/asm/exception-64s.h
> +++ b/arch/powerpc/include/asm/exception-64s.h
> @@ -36,6 +36,7 @@
>   */
>  #include 
>  #include 
> +#include 
>  
>  /* PACA save area offsets (exgen, exmc, etc) */
>  #define EX_R90
> @@ -677,10 +678,54 @@ BEGIN_FTR_SECTION   \
>   beqlppc64_runlatch_on_trampoline;   \
>  END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
>  
> +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> +
> +/*
> + * This macro will reclaim a transaction if called when coming from userspace
> + * (MSR.PR = 1) and if the transaction state is active or suspended.
> + *
> + * Since we don't want to reclaim when coming from kernel, for instance after
> + * a trechkpt. or a IRQ replay, the live MSR is not useful and instead of it 
> the
> + * MSR from thread stack is used to check the MSR.PR bit.
> + * This macro has one argument which is the cause that will be used by 
> treclaim.
> + * and returns in r3 '1' if the reclaim happens or '0' if reclaim didn't
> + * happen, which is useful to know what registers were clobbered.
> + *
> + * NOTE: If addition registers are clobbered here, make sure the callee
> + * function restores them before proceeding.
> + */
> +#define TM_KERNEL_ENTRY(cause)   
> \
> + ld  r3, _MSR(r1);   \
> + andi.   r0, r3, MSR_PR; /* Coming from userspace? */\
> + beq 1f; /* Skip reclaim if MSR.PR != 1 */   \

I wonder if this can be put with the other userspace entry code?
Maybe it's too difficult.

> + rldicl. r0, r3, (64-MSR_TM_LG), 63; /* Is TM enabled? */\
> + beq 1f; /* Skip reclaim if TM is off */ \
> + rldicl. r0, r3, (64-MSR_TS_LG), 62; /* Is active */ \
> + beq 1f; /* Skip reclaim if neither */   \

Can this be merged into a single test?

And/or can these branches be rearranged so the one most likely to
go to skip happens first? (I assume TM being active is less likely
than being enabled).

> + /*  \
> +  * If there is a transaction active or suspended, save the  \
> +  * non-volatile GPRs if they are not already saved. \
> +  */ \
> + bl  save_nvgprs;\
> + /*  \
> +  * Soft disable the IRQs, otherwise it might cause a CPU hang.  \
> +  */ \
> + RECONCILE_IRQ_STATE(r10, r11); 

Re: [RFC PATCH 03/14] powerpc/tm: Recheckpoint when exiting from kernel

2018-11-14 Thread Michael Neuling
On Tue, 2018-11-06 at 10:40 -0200, Breno Leitao wrote:
> This is the only place we are going to recheckpoint now. Now the task
> needs to have TIF_RESTORE_TM flag set, which will get into
> restore_tm_state() at exception exit path, and execute the recheckpoint
> depending on the MSR.
> 
> Every time a task is required to recheckpoint, or just have the TM SPRs
> restore, the TIF_RESTORE_TM flag should be set and the task MSR should
> properly be in a transactional state, which will be checked by
> restore_tm_state().
> 
> After the facility registers are recheckpointed, they are clobbered with
> the values that were recheckpointed (and are now also in the checkpoint
> area).

Which facility registers? I don't understand this.

> If facility is enabled at MSR that is being returned to user space, then
> the facility registers need to be restored, otherwise userspace will see
> invalid values.
> 
> This patch simplify the restore_tm_state() to just restore the facility
> registers that are enabled when returning to userspace, i.e. the MSR will
> be the same that will be put into SRR1, which will be the MSR after RFID.
> 
> Signed-off-by: Breno Leitao 
> ---
>  arch/powerpc/kernel/process.c | 38 ---
>  1 file changed, 26 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index 4d5322cfad25..c7e758a42b8f 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -1049,8 +1049,6 @@ static inline void __switch_to_tm(struct task_struct
> *prev,
>   */
>  void restore_tm_state(struct pt_regs *regs)
>  {
> - unsigned long msr_diff;
> -
>   /*
>* This is the only moment we should clear TIF_RESTORE_TM as
>* it is here that ckpt_regs.msr and pt_regs.msr become the same
> @@ -1061,19 +1059,35 @@ void restore_tm_state(struct pt_regs *regs)
>   if (!MSR_TM_ACTIVE(regs->msr))
>   return;
>  
> - msr_diff = current->thread.ckpt_regs.msr & ~regs->msr;
> - msr_diff &= MSR_FP | MSR_VEC | MSR_VSX;
> + tm_enable();
> + /* The only place we recheckpoint */
> + tm_recheckpoint(>thread);
>  
> - /* Ensure that restore_math() will restore */
> - if (msr_diff & MSR_FP)
> - current->thread.load_fp = 1;
> + /*
> +  * Restore the facility registers that were clobbered during
> +  * recheckpoint.
> +  */
> + if (regs->msr & MSR_FP) {
> + /*
> +  * Using load_fp_state() instead of restore_fp() because we
> +  * want to force the restore, independent of
> +  * tsk->thread.load_fp. Same for other cases below.
> +  */
> + load_fp_state(>thread.fp_state);
> + }
>  #ifdef CONFIG_ALTIVEC
> - if (cpu_has_feature(CPU_FTR_ALTIVEC) && msr_diff & MSR_VEC)
> - current->thread.load_vec = 1;
> + if (cpu_has_feature(CPU_FTR_ALTIVEC) && regs->msr & MSR_VEC)
> + load_vr_state(>thread.vr_state);
> +#endif
> +#ifdef CONFIG_VSX
> + if (cpu_has_feature(CPU_FTR_VSX) && regs->msr & MSR_VSX) {
> + /*
> +  * If VSX is enabled, it is expected that VEC and FP are
> +  * also enabled and already restored the full register set.
> +  * Cause a warning if that is not the case.
> +  */
> + WARN_ON(!(regs->msr & MSR_VEC) || !(regs->msr & MSR_FP)); }
>  #endif
> - restore_math(regs);
> -
> - regs->msr |= msr_diff;
>  }
>  
>  #else


Re: [RFC PATCH 01/14] powerpc/tm: Reclaim transaction on kernel entry

2018-11-14 Thread Michael Neuling
On Tue, 2018-11-06 at 10:40 -0200, Breno Leitao wrote:
> This patch creates a macro that will be invoked on all entrance to the
> kernel, so, in kernel space the transaction will be completely reclaimed
> and not suspended anymore.
> 
> This patchset checks if we are coming from PR, if not, skip. 

Remove the double negative here. ie

"This skips when coming from the OS". or "Only happens when coming from PR"

> This is useful
> when there is an irq_replay() being called after recheckpoint, when the IRQ
> is re-enabled. 

So we are talking about tm_recheckpoint on exit? On exit, we do:
   tm_recheckpoint -> irq_replay -> rfid?

Why not swap the order of the recheckpoint and the replay to avoid this problem?

> In this case, we do not want to re-reclaim and
> re-recheckpoint, thus, if not coming from PR, skip it completely.

Remove double negatives... Try: "if coming from the OS, skip" or "only do it when
coming from userspace"

> This macro does not care about TM SPR also, it will only be saved and
> restore in the context switch code now on.
> This macro will return 0 or 1 in r3 register, to specify if a reclaim was
> executed or not.
> 
> This patchset is based on initial work done by Cyril:
> https://patchwork.ozlabs.org/cover/875341/
> 
> Signed-off-by: Breno Leitao 
> ---
>  arch/powerpc/include/asm/exception-64s.h | 46 
>  arch/powerpc/kernel/entry_64.S   | 10 ++
>  arch/powerpc/kernel/exceptions-64s.S | 12 +--
>  3 files changed, 66 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/exception-64s.h 
> b/arch/powerpc/include/asm/exception-64s.h
> index 3b4767ed3ec5..931a74ba037b 100644
> --- a/arch/powerpc/include/asm/exception-64s.h
> +++ b/arch/powerpc/include/asm/exception-64s.h
> @@ -36,6 +36,7 @@
>   */
>  #include 
>  #include 
> +#include 
>  
>  /* PACA save area offsets (exgen, exmc, etc) */
>  #define EX_R90
> @@ -677,10 +678,54 @@ BEGIN_FTR_SECTION   \
>   beqlppc64_runlatch_on_trampoline;   \
>  END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
>  
> +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> +
> +/*
> + * This macro will reclaim a transaction if called when coming from userspace
> + * (MSR.PR = 1) and if the transaction state is active or suspended.
> + *
> + * Since we don't want to reclaim when coming from kernel, for instance after
> + * a trechkpt. or a IRQ replay, the live MSR is not useful and instead of it 
> the
> + * MSR from thread stack is used to check the MSR.PR bit.
> + * This macro has one argument which is the cause that will be used by 
> treclaim.
> + * and returns in r3 '1' if the reclaim happens or '0' if reclaim didn't
> + * happen, which is useful to know what registers were clobbered.
> + *
> + * NOTE: If addition registers are clobbered here, make sure the callee
> + * function restores them before proceeding.
> + */
> +#define TM_KERNEL_ENTRY(cause)   
> \
> + ld  r3, _MSR(r1);   \
> + andi.   r0, r3, MSR_PR; /* Coming from userspace? */\
> + beq 1f; /* Skip reclaim if MSR.PR != 1 */   \
> + rldicl. r0, r3, (64-MSR_TM_LG), 63; /* Is TM enabled? */\
> + beq 1f; /* Skip reclaim if TM is off */ \
> + rldicl. r0, r3, (64-MSR_TS_LG), 62; /* Is active */ \
> + beq 1f; /* Skip reclaim if neither */   \
> + /*  \
> +  * If there is a transaction active or suspended, save the  \
> +  * non-volatile GPRs if they are not already saved. \
> +  */ \
> + bl  save_nvgprs;\
> + /*  \
> +  * Soft disable the IRQs, otherwise it might cause a CPU hang.  \
> +  */ \
> + RECONCILE_IRQ_STATE(r10, r11);  \
> + li  r3, cause;  \
> + bl  tm_reclaim_current; \

Are we ready to call out to C at this point in the exception handlers?

> + li  r3, 1;  /* Reclaim happened */  \
> + b   2f; \
> +1:   li  r3, 0;  /* Reclaim didn't happen */ \
> +2:
> +#else
> +#define TM_KERNEL_ENTRY(cause)
> +#endif
> +
>  #define EXCEPTION_COMMON(area, trap, label, hdlr, ret, additions) \
> EXCEPTION_PROLOG_COMMON(trap, area);\
> /* Volatile regs are potentially clobbered here */  \
> additions;  \
> +   TM_KERNEL_ENTRY(TM_CAUSE_MISC);  

Re: [RFC PATCH 02/14] powerpc/tm: Reclaim on unavailable exception

2018-11-14 Thread Michael Neuling
On Tue, 2018-11-06 at 10:40 -0200, Breno Leitao wrote:
> If there is a FP/VEC/Altivec touch inside a transaction and the facility is
> disabled, then a facility unavailable exception is raised and ends up
> calling {fp,vec,vsx}_unavailable_tm, which was reclaiming and
> recheckpointing.
> 
> This is not required anymore, since the checkpointed state was reclaimed in
> the exception entrance, and it will be recheckpointed by restore_tm_state
> later.
> 
> Adding a WARN_ON() warning if we hit the _unavailable_tm() in suspended
> mode, i.e, the reclaim was not executed somehow in the trap entrance, and
> this is a bug.

The "why" above is good and the important part of the commit but, 

Can you also add what you're doing?  The above would suggest you're just
removing some things but you're actually adding the TM_KERNEL_ENTRY() macro too.

Mikey

> 
> Signed-off-by: Breno Leitao 
> ---
>  arch/powerpc/include/asm/exception-64s.h |  4 
>  arch/powerpc/kernel/exceptions-64s.S |  3 +++
>  arch/powerpc/kernel/traps.c  | 22 --
>  3 files changed, 11 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/exception-64s.h
> b/arch/powerpc/include/asm/exception-64s.h
> index 931a74ba037b..80f01d5683c3 100644
> --- a/arch/powerpc/include/asm/exception-64s.h
> +++ b/arch/powerpc/include/asm/exception-64s.h
> @@ -711,6 +711,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
>* Soft disable the IRQs, otherwise it might cause a CPU hang.  \
>*/ \
>   RECONCILE_IRQ_STATE(r10, r11);  \
> + /*  \
> +  * Although this cause will be set initially, it might be   \
> +  * updated later, once the exception is better understood   \
> +  */ \
>   li  r3, cause;  \
>   bl  tm_reclaim_current; \
>   li  r3, 1;  /* Reclaim happened */  \
> diff --git a/arch/powerpc/kernel/exceptions-64s.S
> b/arch/powerpc/kernel/exceptions-64s.S
> index 5c685a46202d..47e05b09eed6 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -786,6 +786,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
>  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>  2:   /* User process was in a transaction */
>   bl  save_nvgprs
> + TM_KERNEL_ENTRY(TM_CAUSE_FAC_UNAV)
>   RECONCILE_IRQ_STATE(r10, r11)
>   addir3,r1,STACK_FRAME_OVERHEAD
>   bl  fp_unavailable_tm
> @@ -1128,6 +1129,7 @@ BEGIN_FTR_SECTION
>  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>  2:   /* User process was in a transaction */
>   bl  save_nvgprs
> + TM_KERNEL_ENTRY(TM_CAUSE_FAC_UNAV)
>   RECONCILE_IRQ_STATE(r10, r11)
>   addir3,r1,STACK_FRAME_OVERHEAD
>   bl  altivec_unavailable_tm
> @@ -1164,6 +1166,7 @@ BEGIN_FTR_SECTION
>  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>  2:   /* User process was in a transaction */
>   bl  save_nvgprs
> + TM_KERNEL_ENTRY(TM_CAUSE_FAC_UNAV)
>   RECONCILE_IRQ_STATE(r10, r11)
>   addir3,r1,STACK_FRAME_OVERHEAD
>   bl  vsx_unavailable_tm
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index 9a86572db1ef..e74b735e974c 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -1742,23 +1742,10 @@ void fp_unavailable_tm(struct pt_regs *regs)
>   * transaction, and probably retry but now with FP enabled.  So the
>   * checkpointed FP registers need to be loaded.
>*/
> - tm_reclaim_current(TM_CAUSE_FAC_UNAV);
> -
> - /*
> -  * Reclaim initially saved out bogus (lazy) FPRs to ckfp_state, and
> -  * then it was overwrite by the thr->fp_state by tm_reclaim_thread().
> -  *
> -  * At this point, ck{fp,vr}_state contains the exact values we want to
> -  * recheckpoint.
> -  */
> + WARN_ON(MSR_TM_SUSPENDED(mfmsr()));
>  
>   /* Enable FP for the task: */
>   current->thread.load_fp = 1;
> -
> - /*
> -  * Recheckpoint all the checkpointed ckpt, ck{fp, vr}_state registers.
> -  */
> - tm_recheckpoint(>thread);
>  }
>  
>  void altivec_unavailable_tm(struct pt_regs *regs)
> @@ -1770,10 +1757,10 @@ void altivec_unavailable_tm(struct pt_regs *regs)
>   TM_DEBUG("Vector Unavailable trap whilst transactional at 0x%lx,"
>"MSR=%lx\n",
>regs->nip, regs->msr);
> - tm_reclaim_current(TM_CAUSE_FAC_UNAV);
> + WARN_ON(MSR_TM_SUSPENDED(mfmsr()));
>   current->thread.load_vec = 1;
> - tm_recheckpoint(>thread);
>   current->thread.used_vr = 1;
> +
>  }
>  
>  void vsx_unavailable_tm(struct pt_regs *regs)
> @@ -1792,12 +1779,11 @@ void vsx_unavailable_tm(struct pt_regs *regs)
>   

[PATCH AUTOSEL 4.4 6/8] hwmon: (ibmpowernv) Remove bogus __init annotations

2018-11-14 Thread Sasha Levin
From: Geert Uytterhoeven 

[ Upstream commit e3e61f01d755188cb6c2dcf5a244b9c0937c258e ]

If gcc decides not to inline make_sensor_label():

WARNING: vmlinux.o(.text+0x4df549c): Section mismatch in reference from the 
function .create_device_attrs() to the function .init.text:.make_sensor_label()
The function .create_device_attrs() references
the function __init .make_sensor_label().
This is often because .create_device_attrs lacks a __init
annotation or the annotation of .make_sensor_label is wrong.

As .probe() can be called after freeing of __init memory, all __init
annotations in the driver are bogus, and should be removed.

Signed-off-by: Geert Uytterhoeven 
Signed-off-by: Guenter Roeck 
Signed-off-by: Sasha Levin 
---
 drivers/hwmon/ibmpowernv.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/hwmon/ibmpowernv.c b/drivers/hwmon/ibmpowernv.c
index 55b5a8ff1cfe..ca3aa28977bc 100644
--- a/drivers/hwmon/ibmpowernv.c
+++ b/drivers/hwmon/ibmpowernv.c
@@ -114,7 +114,7 @@ static ssize_t show_label(struct device *dev, struct 
device_attribute *devattr,
return sprintf(buf, "%s\n", sdata->label);
 }
 
-static int __init get_logical_cpu(int hwcpu)
+static int get_logical_cpu(int hwcpu)
 {
int cpu;
 
@@ -125,9 +125,8 @@ static int __init get_logical_cpu(int hwcpu)
return -ENOENT;
 }
 
-static void __init make_sensor_label(struct device_node *np,
-struct sensor_data *sdata,
-const char *label)
+static void make_sensor_label(struct device_node *np,
+ struct sensor_data *sdata, const char *label)
 {
u32 id;
size_t n;
-- 
2.17.1



[PATCH AUTOSEL 4.9 08/13] hwmon: (ibmpowernv) Remove bogus __init annotations

2018-11-14 Thread Sasha Levin
From: Geert Uytterhoeven 

[ Upstream commit e3e61f01d755188cb6c2dcf5a244b9c0937c258e ]

If gcc decides not to inline make_sensor_label():

WARNING: vmlinux.o(.text+0x4df549c): Section mismatch in reference from the 
function .create_device_attrs() to the function .init.text:.make_sensor_label()
The function .create_device_attrs() references
the function __init .make_sensor_label().
This is often because .create_device_attrs lacks a __init
annotation or the annotation of .make_sensor_label is wrong.

As .probe() can be called after freeing of __init memory, all __init
annotations in the driver are bogus, and should be removed.

Signed-off-by: Geert Uytterhoeven 
Signed-off-by: Guenter Roeck 
Signed-off-by: Sasha Levin 
---
 drivers/hwmon/ibmpowernv.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/hwmon/ibmpowernv.c b/drivers/hwmon/ibmpowernv.c
index 6d2e6605751c..18b3c8f258bf 100644
--- a/drivers/hwmon/ibmpowernv.c
+++ b/drivers/hwmon/ibmpowernv.c
@@ -114,7 +114,7 @@ static ssize_t show_label(struct device *dev, struct 
device_attribute *devattr,
return sprintf(buf, "%s\n", sdata->label);
 }
 
-static int __init get_logical_cpu(int hwcpu)
+static int get_logical_cpu(int hwcpu)
 {
int cpu;
 
@@ -125,9 +125,8 @@ static int __init get_logical_cpu(int hwcpu)
return -ENOENT;
 }
 
-static void __init make_sensor_label(struct device_node *np,
-struct sensor_data *sdata,
-const char *label)
+static void make_sensor_label(struct device_node *np,
+ struct sensor_data *sdata, const char *label)
 {
u32 id;
size_t n;
-- 
2.17.1



[PATCH AUTOSEL 4.14 21/27] ibmvnic: fix accelerated VLAN handling

2018-11-14 Thread Sasha Levin
From: Michał Mirosław 

[ Upstream commit e84b47941e15eafb8ee8b21d1c3fc1a013af ]

Don't request tag insertion when it isn't present in outgoing skb.

Signed-off-by: Michał Mirosław 
Signed-off-by: David S. Miller 
Signed-off-by: Sasha Levin 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 046af22a37cb..5c7134ccc1fd 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1259,7 +1259,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct 
net_device *netdev)
tx_crq.v1.sge_len = cpu_to_be32(skb->len);
tx_crq.v1.ioba = cpu_to_be64(data_dma_addr);
 
-   if (adapter->vlan_header_insertion) {
+   if (adapter->vlan_header_insertion && skb_vlan_tag_present(skb)) {
tx_crq.v1.flags2 |= IBMVNIC_TX_VLAN_INSERT;
tx_crq.v1.vlan_id = cpu_to_be16(skb->vlan_tci);
}
-- 
2.17.1



[PATCH AUTOSEL 4.14 12/27] hwmon: (ibmpowernv) Remove bogus __init annotations

2018-11-14 Thread Sasha Levin
From: Geert Uytterhoeven 

[ Upstream commit e3e61f01d755188cb6c2dcf5a244b9c0937c258e ]

If gcc decides not to inline make_sensor_label():

WARNING: vmlinux.o(.text+0x4df549c): Section mismatch in reference from the 
function .create_device_attrs() to the function .init.text:.make_sensor_label()
The function .create_device_attrs() references
the function __init .make_sensor_label().
This is often because .create_device_attrs lacks a __init
annotation or the annotation of .make_sensor_label is wrong.

As .probe() can be called after freeing of __init memory, all __init
annotations in the driver are bogus, and should be removed.

Signed-off-by: Geert Uytterhoeven 
Signed-off-by: Guenter Roeck 
Signed-off-by: Sasha Levin 
---
 drivers/hwmon/ibmpowernv.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/hwmon/ibmpowernv.c b/drivers/hwmon/ibmpowernv.c
index 5ccdd0b52650..b38f4951c94e 100644
--- a/drivers/hwmon/ibmpowernv.c
+++ b/drivers/hwmon/ibmpowernv.c
@@ -126,7 +126,7 @@ static ssize_t show_label(struct device *dev, struct 
device_attribute *devattr,
return sprintf(buf, "%s\n", sdata->label);
 }
 
-static int __init get_logical_cpu(int hwcpu)
+static int get_logical_cpu(int hwcpu)
 {
int cpu;
 
@@ -137,9 +137,8 @@ static int __init get_logical_cpu(int hwcpu)
return -ENOENT;
 }
 
-static void __init make_sensor_label(struct device_node *np,
-struct sensor_data *sdata,
-const char *label)
+static void make_sensor_label(struct device_node *np,
+ struct sensor_data *sdata, const char *label)
 {
u32 id;
size_t n;
-- 
2.17.1



[PATCH AUTOSEL 4.18 46/59] ibmvnic: fix accelerated VLAN handling

2018-11-14 Thread Sasha Levin
From: Michał Mirosław 

[ Upstream commit e84b47941e15eafb8ee8b21d1c3fc1a013af ]

Don't request tag insertion when it isn't present in outgoing skb.

Signed-off-by: Michał Mirosław 
Signed-off-by: David S. Miller 
Signed-off-by: Sasha Levin 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index d834308adf95..b6754cc925dc 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1563,7 +1563,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct 
net_device *netdev)
tx_crq.v1.sge_len = cpu_to_be32(skb->len);
tx_crq.v1.ioba = cpu_to_be64(data_dma_addr);
 
-   if (adapter->vlan_header_insertion) {
+   if (adapter->vlan_header_insertion && skb_vlan_tag_present(skb)) {
tx_crq.v1.flags2 |= IBMVNIC_TX_VLAN_INSERT;
tx_crq.v1.vlan_id = cpu_to_be16(skb->vlan_tci);
}
-- 
2.17.1



[PATCH AUTOSEL 4.18 22/59] hwmon: (ibmpowernv) Remove bogus __init annotations

2018-11-14 Thread Sasha Levin
From: Geert Uytterhoeven 

[ Upstream commit e3e61f01d755188cb6c2dcf5a244b9c0937c258e ]

If gcc decides not to inline make_sensor_label():

WARNING: vmlinux.o(.text+0x4df549c): Section mismatch in reference from the 
function .create_device_attrs() to the function .init.text:.make_sensor_label()
The function .create_device_attrs() references
the function __init .make_sensor_label().
This is often because .create_device_attrs lacks a __init
annotation or the annotation of .make_sensor_label is wrong.

As .probe() can be called after freeing of __init memory, all __init
annotations in the driver are bogus, and should be removed.

Signed-off-by: Geert Uytterhoeven 
Signed-off-by: Guenter Roeck 
Signed-off-by: Sasha Levin 
---
 drivers/hwmon/ibmpowernv.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/hwmon/ibmpowernv.c b/drivers/hwmon/ibmpowernv.c
index f829dadfd5a0..2968d527bdf6 100644
--- a/drivers/hwmon/ibmpowernv.c
+++ b/drivers/hwmon/ibmpowernv.c
@@ -129,7 +129,7 @@ static ssize_t show_label(struct device *dev, struct 
device_attribute *devattr,
return sprintf(buf, "%s\n", sdata->label);
 }
 
-static int __init get_logical_cpu(int hwcpu)
+static int get_logical_cpu(int hwcpu)
 {
int cpu;
 
@@ -140,9 +140,8 @@ static int __init get_logical_cpu(int hwcpu)
return -ENOENT;
 }
 
-static void __init make_sensor_label(struct device_node *np,
-struct sensor_data *sdata,
-const char *label)
+static void make_sensor_label(struct device_node *np,
+ struct sensor_data *sdata, const char *label)
 {
u32 id;
size_t n;
-- 
2.17.1



Re: [PATCH v4 2/4] kgdb: Fix kgdb_roundup_cpus() for arches who used smp_call_function()

2018-11-14 Thread Will Deacon
On Mon, Nov 12, 2018 at 10:26:56AM -0800, Douglas Anderson wrote:
> When I had lockdep turned on and dropped into kgdb I got a nice splat
> on my system.  Specifically it hit:
>   DEBUG_LOCKS_WARN_ON(current->hardirq_context)
> 
> Specifically it looked like this:
>   sysrq: SysRq : DEBUG
>   [ cut here ]
>   DEBUG_LOCKS_WARN_ON(current->hardirq_context)
>   WARNING: CPU: 0 PID: 0 at .../kernel/locking/lockdep.c:2875 
> lockdep_hardirqs_on+0xf0/0x160
>   CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.19.0 #27
>   pstate: 604003c9 (nZCv DAIF +PAN -UAO)
>   pc : lockdep_hardirqs_on+0xf0/0x160
>   ...
>   Call trace:
>lockdep_hardirqs_on+0xf0/0x160
>trace_hardirqs_on+0x188/0x1ac
>kgdb_roundup_cpus+0x14/0x3c
>kgdb_cpu_enter+0x53c/0x5cc
>kgdb_handle_exception+0x180/0x1d4
>kgdb_compiled_brk_fn+0x30/0x3c
>brk_handler+0x134/0x178
>do_debug_exception+0xfc/0x178
>el1_dbg+0x18/0x78
>kgdb_breakpoint+0x34/0x58
>sysrq_handle_dbg+0x54/0x5c
>__handle_sysrq+0x114/0x21c
>handle_sysrq+0x30/0x3c
>qcom_geni_serial_isr+0x2dc/0x30c
>   ...
>   ...
>   irq event stamp: ...45
>   hardirqs last  enabled at (...44): [...] __do_softirq+0xd8/0x4e4
>   hardirqs last disabled at (...45): [...] el1_irq+0x74/0x130
>   softirqs last  enabled at (...42): [...] _local_bh_enable+0x2c/0x34
>   softirqs last disabled at (...43): [...] irq_exit+0xa8/0x100
>   ---[ end trace adf21f830c46e638 ]---
> 
> Looking closely at it, it seems like a really bad idea to be calling
> local_irq_enable() in kgdb_roundup_cpus().  If nothing else that seems
> like it could violate spinlock semantics and cause a deadlock.
> 
> Instead, let's use a private csd alongside
> smp_call_function_single_async() to round up the other CPUs.  Using
> smp_call_function_single_async() doesn't require interrupts to be
> enabled so we can remove the offending bit of code.
> 
> In order to avoid duplicating this across all the architectures that
> use the default kgdb_roundup_cpus(), we'll add a "weak" implementation
> to debug_core.c.
> 
> Looking at all the people who previously had copies of this code,
> there were a few variants.  I've attempted to keep the variants
> working like they used to.  Specifically:
> * For arch/arc we passed NULL to kgdb_nmicallback() instead of
>   get_irq_regs().
> * For arch/mips there was a bit of extra code around
>   kgdb_nmicallback()
> 
> NOTE: In this patch we will still get into trouble if we try to round
> up a CPU that failed to round up before.  We'll try to round it up
> again and potentially hang when we try to grab the csd lock.  That's
> not new behavior but we'll still try to do better in a future patch.
> 
> Suggested-by: Daniel Thompson 
> Signed-off-by: Douglas Anderson 
> ---
> 
> Changes in v4: None
> Changes in v3:
> - No separate init call.
> - Don't round up the CPU that is doing the rounding up.
> - Add "#ifdef CONFIG_SMP" to match the rest of the file.
> - Updated desc saying we don't solve the "failed to roundup" case.
> - Document the ignored parameter.
> 
> Changes in v2:
> - Removing irq flags separated from fixing lockdep splat.
> - Don't use smp_call_function (Daniel).
> 
>  arch/arc/kernel/kgdb.c | 10 ++
>  arch/arm/kernel/kgdb.c | 12 
>  arch/arm64/kernel/kgdb.c   | 12 
>  arch/hexagon/kernel/kgdb.c | 27 ---
>  arch/mips/kernel/kgdb.c|  9 +
>  arch/powerpc/kernel/kgdb.c |  4 ++--
>  arch/sh/kernel/kgdb.c  | 12 
>  include/linux/kgdb.h   | 15 +--
>  kernel/debug/debug_core.c  | 35 +++
>  9 files changed, 53 insertions(+), 83 deletions(-)

[...]

> diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
> index f3cadda45f07..23f2b5613afa 100644
> --- a/kernel/debug/debug_core.c
> +++ b/kernel/debug/debug_core.c
> @@ -55,6 +55,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -220,6 +221,40 @@ int __weak kgdb_skipexception(int exception, struct 
> pt_regs *regs)
>   return 0;
>  }
>  
> +#ifdef CONFIG_SMP
> +
> +/*
> + * Default (weak) implementation for kgdb_roundup_cpus
> + */
> +
> +static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd);
> +
> +void __weak kgdb_call_nmi_hook(void *ignored)
> +{
> + kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
> +}

I suppose you could pass the cpu as an argument, but it doesn't really
matter. Also, I think there are cases where the CSD callback can run without
having received an IPI, so we could potentially end up passing NULL for the regs
here which probably goes boom.

> +
> +void __weak kgdb_roundup_cpus(void)
> +{
> + call_single_data_t *csd;
> + int this_cpu = get_cpu();

Do you actually need to disable preemption here? afaict, irqs are already
disabled by the kgdb core.

> + int cpu;
> +
> + for_each_cpu(cpu, cpu_online_mask) {

for_each_online_cpu(cpu) ?
I'm assuming 

Re: [PATCH v4 1/4] kgdb: Remove irq flags from roundup

2018-11-14 Thread Will Deacon
On Mon, Nov 12, 2018 at 10:26:55AM -0800, Douglas Anderson wrote:
> The function kgdb_roundup_cpus() was passed a parameter that was
> documented as:
> 
> > the flags that will be used when restoring the interrupts. There is
> > local_irq_save() call before kgdb_roundup_cpus().
> 
> Nobody used those flags.  Anyone who wanted to temporarily turn on
> interrupts just did local_irq_enable() and local_irq_disable() without
> looking at them.  So we can definitely remove the flags.
> 
> Signed-off-by: Douglas Anderson 
> ---

Acked-by: Will Deacon 

I'm hopeful that you'll keep hacking on kgdb, because it definitely needs
some love in its current state.

Will


Re: [PATCH v2] PCI/MSI: Don't touch MSI bits when the PCI device is disconnected

2018-11-14 Thread Keith Busch
On Wed, Nov 14, 2018 at 08:52:10PM +, alex_gagn...@dellteam.com wrote:
> But it does in portdrv_core.c:
> 
>   if (dev->aer_cap && pci_aer_available() &&
>   (pcie_ports_native || host->native_aer)) {
>   services |= PCIE_PORT_SERVICE_AER;
> 
> That flag later creates a pcie device that allows aerdrv to attach to.

Oh, right. I saw negotiate_os_control() just uses a stack variable for
the _OSC response, but if I had looked one level deeper, I'd see it
cached in a different structure.


Re: [PATCH v2] PCI/MSI: Don't touch MSI bits when the PCI device is disconnected

2018-11-14 Thread Alex_Gagniuc
On 11/14/2018 02:27 PM, Keith Busch wrote:
> On Wed, Nov 14, 2018 at 07:22:04PM +, alex_gagn...@dellteam.com wrote:
>> On 11/14/2018 12:00 AM, Bjorn Helgaas wrote:
>>> Just to make sure we're on the same page, can you point me to this
>>> rule?  I do see that OSPM must request control of AER using _OSC
>>> before it touches the AER registers.  What I don't see is the
>>> connection between firmware-first and the AER registers.
>>
>> ACPI 6.2 - 6.2.11.3, Table 6-197:
>>[...]
>> Maybe Keith knows better why we're doing it this way. From ACPI text, it
>> doesn't seem that control of AER would be tied to HEST entries, although
>> in practice, it is.
> 
> I'm not sure, that predates me.  HEST does have a FIRMWARE_FIRST flag, but
> spec does not say anymore on relation to _OSC control or AER capability.
> Nothing in PCIe spec either.

Speaking to one of the PCIe (and _HPX type 3) spec authors, ownership of 
AER should be determined by _OSC. period. The result of _OSC applies to 
every device under the root port. This crap we do with checking HEST is 
crap.

If I'm not stepping on anyone's toes, and there's no known unintended 
consequences, I can look at patching this up. I'm not promising a patch, 
though, but it's exactly the sort of thing I like to fix.

> I also don't know why Linux disables the AER driver if only one
> device has a FIRMWARE_FIRST HEST. Shouldn't that just be a per-device
> decision?

I think the logic is if one HEST entry has both FFS and GLOBAL flags 
set, then disable AER services for all devices. It works in 
practice better than it works in theory. I think _OSC should be the 
determining factor here, not HEST.

>>> The closest I can find is the "Enabled" field in the HEST PCIe
>>> AER structures (ACPI v6.2, sec 18.3.2.4, .5, .6), where it says:
>>> [...]
>>> AFAICT, Linux completely ignores the Enabled field in these
>>> structures.
>>
>> I don't think ignoring the field is a problem:
>>* With FFS, OS should ignore it.
>>* Without FFS, we have control, and we get to make the decisions anyway.
>> In the latter case we decide whether to use AER, independent of the crap
>> in ACPI. I'm not even sure why "Enabled" matters in native AER handling.
>> Probably one of the check-boxes in "Binary table designer's handbook"?
> 
> And why doesn't Linux do anything with _OSC response other than logging
> it? If OS control wasn't granted, shouldn't that take priority over HEST?

But it does in portdrv_core.c:

if (dev->aer_cap && pci_aer_available() &&
(pcie_ports_native || host->native_aer)) {
services |= PCIE_PORT_SERVICE_AER;

That flag later creates a pcie device that allows aerdrv to attach to.

Alex


Re: [PATCH v2] PCI/MSI: Don't touch MSI bits when the PCI device is disconnected

2018-11-14 Thread Keith Busch
On Wed, Nov 14, 2018 at 07:22:04PM +, alex_gagn...@dellteam.com wrote:
> On 11/14/2018 12:00 AM, Bjorn Helgaas wrote:
> > Just to make sure we're on the same page, can you point me to this
> > rule?  I do see that OSPM must request control of AER using _OSC
> > before it touches the AER registers.  What I don't see is the
> > connection between firmware-first and the AER registers.
> 
> ACPI 6.2 - 6.2.11.3, Table 6-197:
> 
> PCI Express Advanced Error Reporting control:
>   * The firmware sets this bit to 1 to grant control over PCI Express 
> Advanced Error Reporting. If firmware allows the OS control of this 
> feature, then in the context of the _OSC method it must ensure that 
> error messages are routed to device interrupts as described in the PCI 
> Express Base Specification[...]
> 
> Now I'm confused too:
>   * HEST -> __aer_firmware_first
>   This is used for touching/not touching AER bits
>   * _OSC -> bridge->native_aer
>   Used to enable/not enable AER portdrv service
> Maybe Keith knows better why we're doing it this way. From ACPI text, it 
> doesn't seem that control of AER would be tied to HEST entries, although 
> in practice, it is.

I'm not sure, that predates me.  HEST does have a FIRMWARE_FIRST flag, but
spec does not say anymore on relation to _OSC control or AER capability.
Nothing in PCIe spec either.

I also don't know why Linux disables the AER driver if only one
device has a FIRMWARE_FIRST HEST. Shouldn't that just be a per-device
decision?

> > The closest I can find is the "Enabled" field in the HEST PCIe
> > AER structures (ACPI v6.2, sec 18.3.2.4, .5, .6), where it says:
> > 
> >If the field value is 1, indicates this error source is
> >to be enabled.
> > 
> >If the field value is 0, indicates that the error source
> >is not to be enabled.
> > 
> >If FIRMWARE_FIRST is set in the flags field, the Enabled
> >field is ignored by the OSPM.
> > 
> > AFAICT, Linux completely ignores the Enabled field in these
> > structures.
> 
> I don't think ignoring the field is a problem:
>   * With FFS, OS should ignore it.
>   * Without FFS, we have control, and we get to make the decisions anyway.
> In the latter case we decide whether to use AER, independent of the crap 
> in ACPI. I'm not even sure why "Enabled" matters in native AER handling. 
> Probably one of the check-boxes in "Binary table designer's handbook"?

And why doesn't Linux do anything with _OSC response other than logging
it? If OS control wasn't granted, shouldn't that take priority over HEST?


Re: [PATCH v2] PCI/MSI: Don't touch MSI bits when the PCI device is disconnected

2018-11-14 Thread Derrick, Jonathan
On Wed, 2018-11-14 at 19:22 +, alex_gagn...@dellteam.com wrote:
[snip]
> The whole issue of firmware-first, the mechanism by which
> > > > firmware
> > > > gets control, the System Error enables in Root Port Root
> > > > Control
> > > > registers, etc., is very murky to me.  Jon has a sort of
> > > > similar issue
> > > > with VMD where he needs to leave System Errors enabled instead
> > > > of
> > > > disabling them as we currently do.
> > > 
> > > Well, OS gets control via _OSC method, and based on that it
> > > should
> > > touch/not touch the AER bits.
> > 
> > I agree so far.
> > 
> > > The bits that get set/cleared come from _HPX method,
> > 
> > _HPX tells us about some AER registers, Device Control, Link
> > Control,
> > and some bridge registers.  It doesn't say anything about the Root
> > Control register that Jon is concerned with.
> 
> _HPX type 3 (yay!!!) got approved recently, and that will have more 
> fine-grained control. It will be able to handle root control reg.
> 
> > For firmware-first to work, firmware has to get control.  How does
> > it
> > get control?  How does OSPM know to either set up that mechanism or
> > keep its mitts off something firmware set up before handoff?
> 
> My understanding is that, if FW keeps control of AER in _OSC, then
> it 
> will have set things up to get notified instead of the OS. OSPM not 
> touching AER bits is to make sure it doesn't mess up FW's setup. I
> think 
> there are some proprietary bits in the root port to route interrupts
> to 
> SMIs instead of the AER vectors.
> 
> > In Jon's
> > VMD case, I think firmware-first relies on the System Error
> > controlled
> > by the Root Control register.  Linux thinks it owns that, and I
> > don't
> > know how to learn otherwise.
> 
> Didn't Keith say the root port is not visible to the OS?
> 
> Alex

That's correct. OS visibility wrt ACPI is limited to the VMD
endpoint/host bridge device which exposes the root ports. The root
ports aren't described by ACPI. VMD is the unusual case.

In VMD case, we might or might not need to pass back control to AER for
further error handling post FFS. I can see that's normally done by GHES
but will probably need some shimming to support the VMD case. I can't
rely on AER, because if any other devices use APEI, then the AER module
won't be initialized (aer_service_init::aer_acpi_firmware_first)

smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH v2] PCI/MSI: Don't touch MSI bits when the PCI device is disconnected

2018-11-14 Thread Alex_Gagniuc
On 11/14/2018 12:00 AM, Bjorn Helgaas wrote:
> On Tue, Nov 13, 2018 at 10:39:15PM +, alex_gagn...@dellteam.com wrote:
>> On 11/12/2018 11:02 PM, Bjorn Helgaas wrote:
>>>
>>> [EXTERNAL EMAIL]
>>> Please report any suspicious attachments, links, or requests for sensitive 
>>> information.
> 
> It looks like Dell's email system adds the above in such a way that the
> email quoting convention suggests that *I* wrote it, when I did not.

I was wondering why you thought I was suspicious. It's a recent 
(server-side) change. You used to be able to disable these sort of 
notices. I'm told back in the day people were asked to delete emails 
before reading them.

>> ...
>>> Do you think Linux observes the rule about not touching AER bits on
>>> FFS?  I'm not sure it does.  I'm not even sure what section of the
>>> spec is relevant.
>>
>> I haven't found any place where linux breaks this rule. I'm very
>> confident that, unless otherwise instructed, we follow this rule.
> 
> Just to make sure we're on the same page, can you point me to this
> rule?  I do see that OSPM must request control of AER using _OSC
> before it touches the AER registers.  What I don't see is the
> connection between firmware-first and the AER registers.

ACPI 6.2 - 6.2.11.3, Table 6-197:

PCI Express Advanced Error Reporting control:
  * The firmware sets this bit to 1 to grant control over PCI Express 
Advanced Error Reporting. If firmware allows the OS control of this 
feature, then in the context of the _OSC method it must ensure that 
error messages are routed to device interrupts as described in the PCI 
Express Base Specification[...]

Now I'm confused too:
  * HEST -> __aer_firmware_first
This is used for touching/not touching AER bits
  * _OSC -> bridge->native_aer
Used to enable/not enable AER portdrv service
Maybe Keith knows better why we're doing it this way. From ACPI text, it 
doesn't seem that control of AER would be tied to HEST entries, although 
in practice, it is.

> The closest I can find is the "Enabled" field in the HEST PCIe
> AER structures (ACPI v6.2, sec 18.3.2.4, .5, .6), where it says:
> 
>If the field value is 1, indicates this error source is
>to be enabled.
> 
>If the field value is 0, indicates that the error source
>is not to be enabled.
> 
>If FIRMWARE_FIRST is set in the flags field, the Enabled
>field is ignored by the OSPM.
> 
> AFAICT, Linux completely ignores the Enabled field in these
> structures.

I don't think ignoring the field is a problem:
  * With FFS, OS should ignore it.
  * Without FFS, we have control, and we get to make the decisions anyway.
In the latter case we decide whether to use AER, independent of the crap 
in ACPI. I'm not even sure why "Enabled" matters in native AER handling. 
Probably one of the check-boxes in "Binary table designer's handbook"?

> These structures also contain values the OS is apparently supposed to
> write to Device Control and several AER registers (in struct
> acpi_hest_aer_common).  Linux ignores these as well.
> 
> These seem like fairly serious omissions in Linux.

I think HPX carries the same sort of information (except for Root 
Command reg). FW is supposed to program those registers anyway, so even 
if OS doesn't touch them, I'd expect things to just work.

>>> The whole issue of firmware-first, the mechanism by which firmware
>>> gets control, the System Error enables in Root Port Root Control
>>> registers, etc., is very murky to me.  Jon has a sort of similar issue
>>> with VMD where he needs to leave System Errors enabled instead of
>>> disabling them as we currently do.
>>
>> Well, OS gets control via _OSC method, and based on that it should
>> touch/not touch the AER bits.
> 
> I agree so far.
> 
>> The bits that get set/cleared come from _HPX method,
> 
> _HPX tells us about some AER registers, Device Control, Link Control,
> and some bridge registers.  It doesn't say anything about the Root
> Control register that Jon is concerned with.

_HPX type 3 (yay!!!) got approved recently, and that will have more 
fine-grained control. It will be able to handle root control reg.

> For firmware-first to work, firmware has to get control.  How does it
> get control?  How does OSPM know to either set up that mechanism or
> keep its mitts off something firmware set up before handoff?

My understanding is that, if FW keeps control of AER in _OSC, then it 
will have set things up to get notified instead of the OS. OSPM not 
touching AER bits is to make sure it doesn't mess up FW's setup. I think 
there are some proprietary bits in the root port to route interrupts to 
SMIs instead of the AER vectors.

> In Jon's
> VMD case, I think firmware-first relies on the System Error controlled
> by the Root Control register.  Linux thinks it owns that, and I don't
> know how to learn otherwise.

Didn't Keith say the root port is not visible to the OS?

Alex


[PATCH] powerpc/numa: fix hot-added CPU on memory-less node

2018-11-14 Thread Laurent Vivier
Trying to hotplug a CPU on an empty NUMA node (without
memory or CPU) crashes the kernel when the CPU is onlined.

During the onlining process, the kernel calls start_secondary()
that ends by calling
set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]))
that relies on NODE_DATA(nid)->node_zonelists and in our case
NODE_DATA(nid) is NULL.

To fix that, add the same checking as we already have in
find_and_online_cpu_nid(): if NODE_DATA() is NULL, use
the first online node.

Bug: https://github.com/linuxppc/linux/issues/184
Fixes: ea05ba7c559c8e5a5946c3a94a2a266e9a6680a6
   (powerpc/numa: Ensure nodes initialized for hotplug)
Signed-off-by: Laurent Vivier 
---
 arch/powerpc/mm/numa.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 3a048e98a132..1b2d25a3c984 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -483,6 +483,15 @@ static int numa_setup_cpu(unsigned long lcpu)
if (nid < 0 || !node_possible(nid))
nid = first_online_node;
 
+   if (NODE_DATA(nid) == NULL) {
+   /*
+* Default to using the nearest node that has memory installed.
+* Otherwise, it would be necessary to patch the kernel MM code
+* to deal with more memoryless-node error conditions.
+*/
+   nid = first_online_node;
+   }
+
map_cpu_to_node(lcpu, nid);
of_node_put(cpu);
 out:
-- 
2.17.2



Re: UBSAN: Undefined behaviour in kernel/rcu/tree_plugin.h in 4.20-rc1

2018-11-14 Thread Paul E. McKenney
On Wed, Nov 14, 2018 at 03:43:05PM +0100, Christophe LEROY wrote:
> 
> 
> Le 09/11/2018 à 21:10, Paul E. McKenney a écrit :
> >On Fri, Nov 09, 2018 at 06:11:20PM +0100, Christophe LEROY wrote:
> >>(Resending due to error in Paul's address)
> >>
> >>Paul
> >>
> >>I get the following UBSAN reports in 4.20-rc1 on an MPC8321E
> >>(powerpc/book3s/32)
> >>
> >>I bisected it to 3e31009898699dfc ("rcu: Defer reporting RCU-preempt
> >>quiescent states when disabled")
> >
> >Fixed by dfdc33585b0a ("rcu: Avoid signed integer overflow in
> >rcu_preempt_deferred_qs()") in my -rcu tree and in -next, which I intend
> >to push into the next merge window.
> 
> Thanks, I confirm it fixes the issue.
> 
> Do you intend to push it into 4.20-rc3 or do you mean 4.21 ?

The next merge window, which will be either v4.21 or v5.0.  The v4.20
merge window is over and done.  ;-)

Please note that the gcc command-line arguments used by the Linux kernel
prevent the compiler from taking advantage of the C-standard signed
integer overflow aspect of undefined behavior, so this is an aesthetic
issue rather than a failure case.  Plus the C++ standards committee just
voted in a change that gets rid of signed integer overflow completely.
It is not clear whether the C language will also make this change, but
it does require that the usual compilers have the ability to operate in
this manner.

Thanx, Paul

> Christophe
> 
> > Thanx, Paul
> >
> >>Thanks
> >>Christophe
> >>
> >>[4.919995] 
> >>
> >>[4.928428] UBSAN: Undefined behaviour in kernel/rcu/tree_plugin.h:623:28
> >>[4.935198] signed integer overflow:
> >>[4.938766] 0 - -2147483648 cannot be represented in type 'int'
> >>[4.944678] CPU: 0 PID: 119 Comm: mkdir Not tainted
> >>4.19.0-rc1-s3k-dev-5-g5a60513 #214
> >>[4.952908] Call Trace:
> >>[4.955382] [dec4fd20] [c02cb0d0] ubsan_epilogue+0x18/0x74 (unreliable)
> >>[4.962003] [dec4fd30] [c02cb5e0] handle_overflow+0xd0/0xe0
> >>[4.967588] [dec4fdb0] [c007b424] rcu_preempt_deferred_qs+0xc0/0xc8
> >>[4.973857] [dec4fdd0] [c007be28] rcu_note_context_switch+0x74/0x608
> >>[4.980217] [dec4fe10] [c064b790] __schedule+0x58/0x6e0
> >>[4.985448] [dec4fe50] [c064bfdc] preempt_schedule_common+0x48/0x9c
> >>[4.991717] [dec4fe70] [c01308c8] handle_mm_fault+0x10fc/0x1ecc
> >>[4.997639] [dec4fee0] [c001339c] do_page_fault+0x10c/0x760
> >>[5.003225] [dec4ff40] [c001234c] handle_page_fault+0x14/0x40
> >>[5.008968] --- interrupt: 401 at 0xff9cff8
> >>[5.008968] LR = 0xfeefd78
> >>[5.016170] 
> >>
> >>[5.024591] 
> >>
> >>[5.033005] UBSAN: Undefined behaviour in kernel/rcu/tree_plugin.h:627:28
> >>[5.039775] signed integer overflow:
> >>[5.043342] -2147483648 + -2147483648 cannot be represented in type 'int'
> >>[5.050118] CPU: 0 PID: 119 Comm: mkdir Not tainted
> >>4.19.0-rc1-s3k-dev-5-g5a60513 #214
> >>[5.058348] Call Trace:
> >>[5.060813] [dec4fd20] [c02cb0d0] ubsan_epilogue+0x18/0x74 (unreliable)
> >>[5.067433] [dec4fd30] [c02cb5e0] handle_overflow+0xd0/0xe0
> >>[5.073014] [dec4fdb0] [c007b408] rcu_preempt_deferred_qs+0xa4/0xc8
> >>[5.079283] [dec4fdd0] [c007be28] rcu_note_context_switch+0x74/0x608
> >>[5.085640] [dec4fe10] [c064b790] __schedule+0x58/0x6e0
> >>[5.090871] [dec4fe50] [c064bfdc] preempt_schedule_common+0x48/0x9c
> >>[5.097139] [dec4fe70] [c01308c8] handle_mm_fault+0x10fc/0x1ecc
> >>[5.103059] [dec4fee0] [c001339c] do_page_fault+0x10c/0x760
> >>[5.108642] [dec4ff40] [c001234c] handle_page_fault+0x14/0x40
> >>[5.114385] --- interrupt: 401 at 0xff9cff8
> >>[5.114385] LR = 0xfeefd78
> >>[5.121588] 
> >>
> >>
> 



Re: UBSAN: Undefined behaviour in kernel/rcu/tree_plugin.h in 4.20-rc1

2018-11-14 Thread Christophe LEROY




Le 09/11/2018 à 21:10, Paul E. McKenney a écrit :

On Fri, Nov 09, 2018 at 06:11:20PM +0100, Christophe LEROY wrote:

(Resending due to error in Paul's address)

Paul

I get the following UBSAN reports in 4.20-rc1 on an MPC8321E
(powerpc/book3s/32)

I bisected it to 3e31009898699dfc ("rcu: Defer reporting RCU-preempt
quiescent states when disabled")


Fixed by dfdc33585b0a ("rcu: Avoid signed integer overflow in
rcu_preempt_deferred_qs()") in my -rcu tree and in -next, which I intend
to push into the next merge window.



Thanks, I confirm it fixes the issue.

Do you intend to push it into 4.20-rc3 or do you mean 4.21 ?

Christophe


Thanx, Paul


Thanks
Christophe

[4.919995] 

[4.928428] UBSAN: Undefined behaviour in kernel/rcu/tree_plugin.h:623:28
[4.935198] signed integer overflow:
[4.938766] 0 - -2147483648 cannot be represented in type 'int'
[4.944678] CPU: 0 PID: 119 Comm: mkdir Not tainted
4.19.0-rc1-s3k-dev-5-g5a60513 #214
[4.952908] Call Trace:
[4.955382] [dec4fd20] [c02cb0d0] ubsan_epilogue+0x18/0x74 (unreliable)
[4.962003] [dec4fd30] [c02cb5e0] handle_overflow+0xd0/0xe0
[4.967588] [dec4fdb0] [c007b424] rcu_preempt_deferred_qs+0xc0/0xc8
[4.973857] [dec4fdd0] [c007be28] rcu_note_context_switch+0x74/0x608
[4.980217] [dec4fe10] [c064b790] __schedule+0x58/0x6e0
[4.985448] [dec4fe50] [c064bfdc] preempt_schedule_common+0x48/0x9c
[4.991717] [dec4fe70] [c01308c8] handle_mm_fault+0x10fc/0x1ecc
[4.997639] [dec4fee0] [c001339c] do_page_fault+0x10c/0x760
[5.003225] [dec4ff40] [c001234c] handle_page_fault+0x14/0x40
[5.008968] --- interrupt: 401 at 0xff9cff8
[5.008968] LR = 0xfeefd78
[5.016170] 

[5.024591] 

[5.033005] UBSAN: Undefined behaviour in kernel/rcu/tree_plugin.h:627:28
[5.039775] signed integer overflow:
[5.043342] -2147483648 + -2147483648 cannot be represented in type 'int'
[5.050118] CPU: 0 PID: 119 Comm: mkdir Not tainted
4.19.0-rc1-s3k-dev-5-g5a60513 #214
[5.058348] Call Trace:
[5.060813] [dec4fd20] [c02cb0d0] ubsan_epilogue+0x18/0x74 (unreliable)
[5.067433] [dec4fd30] [c02cb5e0] handle_overflow+0xd0/0xe0
[5.073014] [dec4fdb0] [c007b408] rcu_preempt_deferred_qs+0xa4/0xc8
[5.079283] [dec4fdd0] [c007be28] rcu_note_context_switch+0x74/0x608
[5.085640] [dec4fe10] [c064b790] __schedule+0x58/0x6e0
[5.090871] [dec4fe50] [c064bfdc] preempt_schedule_common+0x48/0x9c
[5.097139] [dec4fe70] [c01308c8] handle_mm_fault+0x10fc/0x1ecc
[5.103059] [dec4fee0] [c001339c] do_page_fault+0x10c/0x760
[5.108642] [dec4ff40] [c001234c] handle_page_fault+0x14/0x40
[5.114385] --- interrupt: 401 at 0xff9cff8
[5.114385] LR = 0xfeefd78
[5.121588] 




[PATCH] Powerpc/perf: Wire up PMI throttling

2018-11-14 Thread Ravi Bangoria
Commit 14c63f17b1fde ("perf: Drop sample rate when sampling is too
slow") introduced a way to throttle PMU interrupts if we're spending
too much time just processing those. Wire up powerpc PMI handler to
use this infrastructure.

Signed-off-by: Ravi Bangoria 
---
 arch/powerpc/kernel/traps.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 9a86572db1ef..44f85fa22356 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1803,9 +1804,12 @@ void vsx_unavailable_tm(struct pt_regs *regs)
 
 void performance_monitor_exception(struct pt_regs *regs)
 {
+   u64 start_clock;
__this_cpu_inc(irq_stat.pmu_irqs);
 
+   start_clock = sched_clock();
perf_irq(regs);
+   perf_sample_event_took(sched_clock() - start_clock);
 }
 
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
-- 
2.19.1



Re: [PATCH] powerpc/numa: Perform full re-add of CPU for PRRN/VPHN topology update

2018-11-14 Thread Michael Bringmann



On 11/13/2018 02:39 AM, Srikar Dronamraju wrote:
>> -static void topology_work_fn(struct work_struct *work)
>> -{
>> -rebuild_sched_domains();
>> +if (changed)
>> +rebuild_sched_domains();
>>  }
>>  static DECLARE_WORK(topology_work, topology_work_fn);
>>
>> @@ -1553,7 +1424,6 @@ void __init shared_proc_topology_init(void)
>>  if (lppaca_shared_proc(get_lppaca())) {
>>  bitmap_fill(cpumask_bits(_associativity_changes_mask),
>>  nr_cpumask_bits);
>> -numa_update_cpu_topology(false);
> 
> Shouldn't we be calling topology_schedule_update() here?

Agreed.

> 
>>  }
>>  }
>>
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



[PATCH] selftests/powerpc: Adjust wild_bctr to build with old gcc

2018-11-14 Thread Gustavo Romero
Currently the selftest wild_bctr can fail to build when an old gcc is used,
notably on gcc using a binutils version <= 2.27, because the assembler does
not support the integer suffix UL.

That patch adjusts the wild_bctr test so the type promotion to UL for the
shifts on compilation still happens but the UL suffix is absent on the
stringification, so the inline asm code generated has no UL suffixes.

Signed-off-by: Gustavo Romero 
---
 tools/testing/selftests/powerpc/mm/wild_bctr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/powerpc/mm/wild_bctr.c 
b/tools/testing/selftests/powerpc/mm/wild_bctr.c
index 90469a9..7e56aa4 100644
--- a/tools/testing/selftests/powerpc/mm/wild_bctr.c
+++ b/tools/testing/selftests/powerpc/mm/wild_bctr.c
@@ -47,8 +47,8 @@ static int ok(void)
return 0;
 }
 
-#define REG_POISON 0x5a5aUL
-#define POISONED_REG(n)((REG_POISON << 48) | ((n) << 32) | (REG_POISON 
<< 16) | (n))
+#define REG_POISON 0x5a5a
+#define POISONED_REG(n)(((REG_POISON+0UL) << 48) | ((n) << 32) | 
((REG_POISON+0UL) << 16) | (n))
 
 static inline void poison_regs(void)
 {
-- 
2.7.4



Re: linux-next: build warnings from Linus' tree

2018-11-14 Thread Michael Ellerman
Joel Stanley  writes:
> Hello Alan,
>
> On Tue, 12 Jun 2018 at 07:44, Stephen Rothwell  wrote:
>
>> Building Linus' tree, today's linux-next build (powerpc ppc64_defconfig)
>> produced these warning:
>>
>> ld: warning: orphan section `.gnu.hash' from `linker stubs' being placed in 
>> section `.gnu.hash'.
>> ld: warning: orphan section `.gnu.hash' from `linker stubs' being placed in 
>> section `.gnu.hash'.
>> ld: warning: orphan section `.gnu.hash' from `linker stubs' being placed in 
>> section `.gnu.hash'.
>>
>> This may just be because I have started building using the native Debian
>> gcc for the powerpc builds ...
>
> Do you know why we started creating these?

It's controlled by the ld option --hash-style, which AFAICS still
defaults to sysv (generating .hash).

But it seems gcc can be configured to have a different default, and at
least my native ppc64le toolchains are passing gnu, eg:

 /usr/lib/gcc/powerpc64le-linux-gnu/6/collect2 -plugin
 /usr/lib/gcc/powerpc64le-linux-gnu/6/liblto_plugin.so
 -plugin-opt=/usr/lib/gcc/powerpc64le-linux-gnu/6/lto-wrapper
 -plugin-opt=-fresolution=/tmp/ccw1U2fF.res
 -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lgcc_s
 -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc
 -plugin-opt=-pass-through=-lgcc_s --sysroot=/ --build-id --eh-frame-hdr
 -V -shared -m elf64lppc
 --hash-style=gnu
 

So that's presumably why we're seeing it, some GCCs are configured to
use it.

> If it's intentional, should we be putting including them in the same
> way as .hash sections?
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/powerpc/kernel/vmlinux.lds.S#n282
>
>   .hash : AT(ADDR(.hash) - LOAD_OFFSET) { *(.hash) }

That would presumably work.

My question though is do we even need it?

From what I can see, for it to be useful you need the section as well as
an entry in the dynamic section pointing at it, and we don't have a
dynamic section at all:

  $ readelf -S vmlinux | grep gnu.hash
[ 4] .gnu.hash GNU_HASH c0dbbdb0  00dcbdb0
  $ readelf -d vmlinux
  
  There is no dynamic section in this file.

Compare to the vdso:

$ readelf -d arch/powerpc/kernel/vdso64/vdso64.so

Dynamic section at offset 0x868 contains 12 entries:
  TagType Name/Value
 0x000e (SONAME) Library soname: [linux-vdso64.so.1]
 0x0004 (HASH)   0x120
 0x6ef5 (GNU_HASH)   0x170
 0x0005 (STRTAB) 0x320
 0x0006 (SYMTAB) 0x1d0
 0x000a (STRSZ)  269 (bytes)
 0x000b (SYMENT) 24 (bytes)
 0x7003 (PPC64_OPT)  0x0
 0x6ffc (VERDEF) 0x450
 0x6ffd (VERDEFNUM)  2
 0x6ff0 (VERSYM) 0x42e
 0x (NULL)   0x0


So can't we just discard .gnu.hash? And in fact do we need .hash either?

Actually arm64 discards the latter, and parisc discards both.

Would still be good to hear from Alan or someone else who knows anything
about toolchain stuff, ie. not me :)

cheers


[PATCH v2 4/4] powerpc: generate uapi header and system call table files

2018-11-14 Thread Firoz Khan
The system call table generation script must be run to generate
the unistd_32/64.h and syscall_table_32/64/c32/spu.h files.
This patch contains the changes which invoke the script.

This patch will generate the unistd_32/64.h and syscall_table-
_32/64/c32/spu.h files via the syscall table generation
script invoked by powerpc/Makefile, and the generated files
must be identical to the removed files.

The generated uapi header file will be included in uapi/-
asm/unistd.h and generated system call table header file
will be included by kernel/systbl.S file.

Signed-off-by: Firoz Khan 
---
 arch/powerpc/Makefile   |   3 +
 arch/powerpc/include/asm/Kbuild |   4 +
 arch/powerpc/include/asm/systbl.h   | 395 
 arch/powerpc/include/uapi/asm/Kbuild|   2 +
 arch/powerpc/include/uapi/asm/unistd.h  | 392 +--
 arch/powerpc/kernel/Makefile|  10 -
 arch/powerpc/kernel/systbl.S|  36 +--
 arch/powerpc/kernel/systbl_chk.c|  61 -
 arch/powerpc/platforms/cell/spu_callbacks.c |  18 +-
 9 files changed, 27 insertions(+), 894 deletions(-)
 delete mode 100644 arch/powerpc/include/asm/systbl.h
 delete mode 100644 arch/powerpc/kernel/systbl_chk.c

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 8a2ce14..34897191 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -402,6 +402,9 @@ archclean:
 
 archprepare: checkbin
 
+archheaders:
+   $(Q)$(MAKE) $(build)=arch/powerpc/kernel/syscalls all
+
 ifdef CONFIG_STACKPROTECTOR
 prepare: stack_protector_prepare
 
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 3196d22..77ff7fb 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -1,3 +1,7 @@
+generated-y += syscall_table_32.h
+generated-y += syscall_table_64.h
+generated-y += syscall_table_c32.h
+generated-y += syscall_table_spu.h
 generic-y += div64.h
 generic-y += export.h
 generic-y += irq_regs.h
diff --git a/arch/powerpc/include/asm/systbl.h 
b/arch/powerpc/include/asm/systbl.h
deleted file mode 100644
index c4321b9..000
--- a/arch/powerpc/include/asm/systbl.h
+++ /dev/null
@@ -1,395 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * List of powerpc syscalls. For the meaning of the _SPU suffix see
- * arch/powerpc/platforms/cell/spu_callbacks.c
- */
-
-SYSCALL(restart_syscall)
-SYSCALL(exit)
-PPC_SYS(fork)
-SYSCALL_SPU(read)
-SYSCALL_SPU(write)
-COMPAT_SYS_SPU(open)
-SYSCALL_SPU(close)
-SYSCALL_SPU(waitpid)
-SYSCALL_SPU(creat)
-SYSCALL_SPU(link)
-SYSCALL_SPU(unlink)
-COMPAT_SYS(execve)
-SYSCALL_SPU(chdir)
-COMPAT_SYS_SPU(time)
-SYSCALL_SPU(mknod)
-SYSCALL_SPU(chmod)
-SYSCALL_SPU(lchown)
-SYSCALL(ni_syscall)
-OLDSYS(stat)
-COMPAT_SYS_SPU(lseek)
-SYSCALL_SPU(getpid)
-COMPAT_SYS(mount)
-SYSX(sys_ni_syscall,sys_oldumount,sys_oldumount)
-SYSCALL_SPU(setuid)
-SYSCALL_SPU(getuid)
-COMPAT_SYS_SPU(stime)
-COMPAT_SYS(ptrace)
-SYSCALL_SPU(alarm)
-OLDSYS(fstat)
-SYSCALL(pause)
-COMPAT_SYS(utime)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(access)
-SYSCALL_SPU(nice)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(sync)
-SYSCALL_SPU(kill)
-SYSCALL_SPU(rename)
-SYSCALL_SPU(mkdir)
-SYSCALL_SPU(rmdir)
-SYSCALL_SPU(dup)
-SYSCALL_SPU(pipe)
-COMPAT_SYS_SPU(times)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(brk)
-SYSCALL_SPU(setgid)
-SYSCALL_SPU(getgid)
-SYSCALL(signal)
-SYSCALL_SPU(geteuid)
-SYSCALL_SPU(getegid)
-SYSCALL(acct)
-SYSCALL(umount)
-SYSCALL(ni_syscall)
-COMPAT_SYS_SPU(ioctl)
-COMPAT_SYS_SPU(fcntl)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(setpgid)
-SYSCALL(ni_syscall)
-SYSX(sys_ni_syscall,sys_olduname,sys_olduname)
-SYSCALL_SPU(umask)
-SYSCALL_SPU(chroot)
-COMPAT_SYS(ustat)
-SYSCALL_SPU(dup2)
-SYSCALL_SPU(getppid)
-SYSCALL_SPU(getpgrp)
-SYSCALL_SPU(setsid)
-SYS32ONLY(sigaction)
-SYSCALL_SPU(sgetmask)
-SYSCALL_SPU(ssetmask)
-SYSCALL_SPU(setreuid)
-SYSCALL_SPU(setregid)
-SYS32ONLY(sigsuspend)
-SYSX(sys_ni_syscall,compat_sys_sigpending,sys_sigpending)
-SYSCALL_SPU(sethostname)
-COMPAT_SYS_SPU(setrlimit)
-SYSX(sys_ni_syscall,compat_sys_old_getrlimit,sys_old_getrlimit)
-COMPAT_SYS_SPU(getrusage)
-COMPAT_SYS_SPU(gettimeofday)
-COMPAT_SYS_SPU(settimeofday)
-SYSCALL_SPU(getgroups)
-SYSCALL_SPU(setgroups)
-SYSX(sys_ni_syscall,sys_ni_syscall,ppc_select)
-SYSCALL_SPU(symlink)
-OLDSYS(lstat)
-SYSCALL_SPU(readlink)
-SYSCALL(uselib)
-SYSCALL(swapon)
-SYSCALL(reboot)
-SYSX(sys_ni_syscall,compat_sys_old_readdir,sys_old_readdir)
-SYSCALL_SPU(mmap)
-SYSCALL_SPU(munmap)
-COMPAT_SYS_SPU(truncate)
-COMPAT_SYS_SPU(ftruncate)
-SYSCALL_SPU(fchmod)
-SYSCALL_SPU(fchown)
-SYSCALL_SPU(getpriority)
-SYSCALL_SPU(setpriority)
-SYSCALL(ni_syscall)
-COMPAT_SYS(statfs)
-COMPAT_SYS(fstatfs)
-SYSCALL(ni_syscall)
-COMPAT_SYS_SPU(socketcall)
-SYSCALL_SPU(syslog)
-COMPAT_SYS_SPU(setitimer)
-COMPAT_SYS_SPU(getitimer)
-COMPAT_SYS_SPU(newstat)
-COMPAT_SYS_SPU(newlstat)
-COMPAT_SYS_SPU(newfstat)
-SYSX(sys_ni_syscall,sys_uname,sys_uname)

[PATCH v2 3/4] powerpc: add system call table generation support

2018-11-14 Thread Firoz Khan
The system call tables are in a different format in each
architecture and it is difficult to manually add or
modify the system calls in the respective files. To make
it easy, we keep a script which will generate the
uapi header and syscall table files. This change will also
help to unify the implementation across all architectures.

The system call table generation script is added in
syscalls directory which contain the script to generate
both uapi header file and system call table files.
The syscall.tbl file will be the input for the scripts.

syscall.tbl contains the list of available system calls
along with system call number and corresponding entry point.
Add a new system call in this architecture will be possible
by adding new entry in the syscall.tbl file.

Adding a new table entry consisting of:
- System call number.
- ABI.
- System call name.
- Entry point name.
- Compat entry name, if required.

syscallhdr.sh and syscalltbl.sh will generate uapi header-
unistd_32/64.h and syscall_table_32/64/c32.h files respect-
ively. File syscall_table_32/64/c32.h is included by sys-
call.S - the real system call table. Both .sh files will
parse the content syscall.tbl to generate the header and
table files.

The ARM, s390 and x86 architectures have similar support.
I leveraged their implementation to come up with a generic
solution.

Signed-off-by: Firoz Khan 
---
 arch/powerpc/kernel/syscalls/Makefile  |  61 +
 arch/powerpc/kernel/syscalls/syscall.tbl   | 408 +
 arch/powerpc/kernel/syscalls/syscallhdr.sh |  36 +++
 arch/powerpc/kernel/syscalls/syscalltbl.sh |  41 +++
 4 files changed, 546 insertions(+)
 create mode 100644 arch/powerpc/kernel/syscalls/Makefile
 create mode 100644 arch/powerpc/kernel/syscalls/syscall.tbl
 create mode 100644 arch/powerpc/kernel/syscalls/syscallhdr.sh
 create mode 100644 arch/powerpc/kernel/syscalls/syscalltbl.sh

diff --git a/arch/powerpc/kernel/syscalls/Makefile 
b/arch/powerpc/kernel/syscalls/Makefile
new file mode 100644
index 000..6615c24
--- /dev/null
+++ b/arch/powerpc/kernel/syscalls/Makefile
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: GPL-2.0
+kapi := arch/$(SRCARCH)/include/generated/asm
+uapi := arch/$(SRCARCH)/include/generated/uapi/asm
+
+_dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)')  \
+ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)')
+
+syscall := $(srctree)/$(src)/syscall.tbl
+syshdr := $(srctree)/$(src)/syscallhdr.sh
+systbl := $(srctree)/$(src)/syscalltbl.sh
+
+quiet_cmd_syshdr = SYSHDR  $@
+  cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@'   \
+  '$(syshdr_abis_$(basetarget))'   \
+  '$(syshdr_pfx_$(basetarget))'\
+  '$(syshdr_offset_$(basetarget))'
+
+quiet_cmd_systbl = SYSTBL  $@
+  cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@'   \
+  '$(systbl_abis_$(basetarget))'   \
+  '$(systbl_abi_$(basetarget))'\
+  '$(systbl_offset_$(basetarget))'
+
+syshdr_abis_unistd_32 := common,32
+$(uapi)/unistd_32.h: $(syscall) $(syshdr)
+   $(call if_changed,syshdr)
+
+syshdr_abis_unistd_64 := common,64
+$(uapi)/unistd_64.h: $(syscall) $(syshdr)
+   $(call if_changed,syshdr)
+
+systbl_abis_syscall_table_32 := common,32
+$(kapi)/syscall_table_32.h: $(syscall) $(systbl)
+   $(call if_changed,systbl)
+
+systbl_abis_syscall_table_64 := common,64
+$(kapi)/syscall_table_64.h: $(syscall) $(systbl)
+   $(call if_changed,systbl)
+
+systbl_abis_syscall_table_c32 := common,32
+systbl_abi_syscall_table_c32 := c32
+$(kapi)/syscall_table_c32.h: $(syscall) $(systbl)
+   $(call if_changed,systbl)
+
+systbl_abis_syscall_table_spu := common,32
+systbl_abi_syscall_table_spu := spu
+$(kapi)/syscall_table_spu.h: $(syscall) $(systbl)
+   $(call if_changed,systbl)
+
+uapisyshdr-y   += unistd_32.h unistd_64.h
+kapisyshdr-y   += syscall_table_32.h   \
+  syscall_table_64.h   \
+  syscall_table_c32.h  \
+  syscall_table_spu.h
+
+targets+= $(uapisyshdr-y) $(kapisyshdr-y)
+
+PHONY += all
+all: $(addprefix $(uapi)/,$(uapisyshdr-y))
+all: $(addprefix $(kapi)/,$(kapisyshdr-y))
+   @:
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl 
b/arch/powerpc/kernel/syscalls/syscall.tbl
new file mode 100644
index 000..4d90f30
--- /dev/null
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -0,0 +1,408 @@
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+#
+# system call numbers and entry vectors for powerpc
+#
+# The format is:
+#  
+#
+# The  can be common, 64, or 32 for this file.
+#
+0  common  restart_syscall sys_restart_syscall 
sys_restart_syscall
+1  common  exitsys_exit

[PATCH v2 2/4] powerpc: move macro definition from asm/systbl.h

2018-11-14 Thread Firoz Khan
Move the macro definition for compat_sys_sigsuspend from
asm/systbl.h to the file which it is getting included.

One of the patches in this patch series generates the uapi
header and syscall table files. In order to come up with
a common implementation across all architectures, we need
this change.

This change will simplify the implementation of the system
call table generation script and help to come up with a
common implementation across all architectures.

Signed-off-by: Firoz Khan 
---
 arch/powerpc/include/asm/systbl.h   | 1 -
 arch/powerpc/kernel/systbl.S| 1 +
 arch/powerpc/kernel/systbl_chk.c| 1 +
 arch/powerpc/platforms/cell/spu_callbacks.c | 1 +
 4 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/systbl.h 
b/arch/powerpc/include/asm/systbl.h
index 01b5171..c4321b9 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -76,7 +76,6 @@
 SYSCALL_SPU(ssetmask)
 SYSCALL_SPU(setreuid)
 SYSCALL_SPU(setregid)
-#define compat_sys_sigsuspend sys_sigsuspend
 SYS32ONLY(sigsuspend)
 SYSX(sys_ni_syscall,compat_sys_sigpending,sys_sigpending)
 SYSCALL_SPU(sethostname)
diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
index 919a327..9ff1913 100644
--- a/arch/powerpc/kernel/systbl.S
+++ b/arch/powerpc/kernel/systbl.S
@@ -47,4 +47,5 @@
 .globl sys_call_table
 sys_call_table:
 
+#define compat_sys_sigsuspend  sys_sigsuspend
 #include 
diff --git a/arch/powerpc/kernel/systbl_chk.c b/arch/powerpc/kernel/systbl_chk.c
index 4653258..db97da7 100644
--- a/arch/powerpc/kernel/systbl_chk.c
+++ b/arch/powerpc/kernel/systbl_chk.c
@@ -56,5 +56,6 @@
 #define getrlimit  ugetrlimit
 
 START_TABLE
+#define compat_sys_sigsuspend  sys_sigsuspend
 #include 
 END_TABLE NR_syscalls
diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c 
b/arch/powerpc/platforms/cell/spu_callbacks.c
index 8ae8620..7517a43 100644
--- a/arch/powerpc/platforms/cell/spu_callbacks.c
+++ b/arch/powerpc/platforms/cell/spu_callbacks.c
@@ -47,6 +47,7 @@
 #define COMPAT_SPU_NEW(func)   sys_##func,
 #define SYSX_SPU(f, f3264, f32)f,
 
+#define compat_sys_sigsuspend  sys_sigsuspend
 #include 
 };
 
-- 
1.9.1



[PATCH v2 1/4] powerpc: add __NR_syscalls along with NR_syscalls

2018-11-14 Thread Firoz Khan
The NR_syscalls macro holds the number of system calls that
exist in the powerpc architecture. We have to change the
value of NR_syscalls if we add or delete a system call.

One of the patch in this patch series has a script which
will generate a uapi header based on syscall.tbl file.
The syscall.tbl file contains the number of system call
information. So we have two option to update NR_syscalls
value.

1. Update NR_syscalls in asm/unistd.h manually by count-
   ing the no.of system calls. No need to update NR_sys-
   calls until we either add a new system call or delete
   existing system call.

2. We can keep this feature in above mentioned script,
   that will count the number of syscalls and keep it in
   a generated file. In this case we don't need to expli-
   citly update NR_syscalls in asm/unistd.h file.

The 2nd option will be the recommended one. For that, I
added the __NR_syscalls macro in uapi/asm/unistd.h along
with NR_syscalls asm/unistd.h. The macro __NR_syscalls
also added for making the name convention same across all
architecture. While __NR_syscalls isn't strictly part of
the uapi, having it as part of the generated header
simplifies the implementation. We also need to enclose
this macro with #ifdef __KERNEL__ to avoid side effects.

Signed-off-by: Firoz Khan 
---
 arch/powerpc/include/asm/unistd.h  | 3 +--
 arch/powerpc/include/uapi/asm/unistd.h | 5 -
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/unistd.h 
b/arch/powerpc/include/asm/unistd.h
index b0de85b..a3c35e6 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -11,8 +11,7 @@
 
 #include 
 
-
-#define NR_syscalls389
+#define NR_syscalls__NR_syscalls
 
 #define __NR__exit __NR_exit
 
diff --git a/arch/powerpc/include/uapi/asm/unistd.h 
b/arch/powerpc/include/uapi/asm/unistd.h
index 985534d..7195868 100644
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -10,7 +10,6 @@
 #ifndef _UAPI_ASM_POWERPC_UNISTD_H_
 #define _UAPI_ASM_POWERPC_UNISTD_H_
 
-
 #define __NR_restart_syscall 0
 #define __NR_exit1
 #define __NR_fork2
@@ -401,4 +400,8 @@
 #define __NR_rseq  387
 #define __NR_io_pgetevents 388
 
+#ifdef __KERNEL__
+#define __NR_syscalls  389
+#endif
+
 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
-- 
1.9.1



[PATCH v2 0/4] powerpc: system call table generation support

2018-11-14 Thread Firoz Khan
The purpose of this patch series is, we can easily
add/modify/delete system call table support by cha-
nging entry in syscall.tbl file instead of manually
changing many files. The other goal is to unify the 
system call table generation support implementation 
across all the architectures. 

The system call tables are in a different format in
each architecture. It is difficult to manually add,
modify or delete the system calls in the respective
files. To make it easy, we keep a script which
generates the uapi header file and the syscall
table file.

syscall.tbl contains the list of available system 
calls along with system call number and correspond-
ing entry point. Add a new system call in this arch-
itecture will be possible by adding new entry in 
the syscall.tbl file.

Adding a new table entry consisting of:
- System call number.
- ABI.
- System call name.
- Entry point name.
- Compat entry name.
- spu entry name, if required.

The ARM, s390 and x86 architectures have similar
support. I leveraged their implementation to
come up with a generic solution.

I have done the same support for work for alpha, 
ia64, m68k, microblaze, mips, parisc, sh, sparc, 
and xtensa. Below mentioned git repository contains
more details about the workflow.

https://github.com/frzkhn/system_call_table_generator/

Finally, this is the ground work to solve the Y2038
issue. We need to add two dozen of system calls to 
solve Y2038 issue. So this patch series will help to
add new system calls easily by adding new entry in the
syscall.tbl.

Changes since v1:
 - optimized/updated the syscall table generation 
   scripts.
 - fixed all mixed indentation issues in syscall.tbl.
 - added "comments" in syscall_*.tbl.
 - changed from generic-y to generated-y in Kbuild.

Firoz Khan (4):
  powerpc: add __NR_syscalls along with NR_syscalls
  powerpc: move macro definition from asm/systbl.h
  powerpc: add system call table generation support
  powerpc: generate uapi header and system call table files

 arch/powerpc/Makefile   |   3 +
 arch/powerpc/include/asm/Kbuild |   4 +
 arch/powerpc/include/asm/systbl.h   | 396 ---
 arch/powerpc/include/asm/unistd.h   |   3 +-
 arch/powerpc/include/uapi/asm/Kbuild|   2 +
 arch/powerpc/include/uapi/asm/unistd.h  | 389 +-
 arch/powerpc/kernel/Makefile|  10 -
 arch/powerpc/kernel/syscalls/Makefile   |  61 +
 arch/powerpc/kernel/syscalls/syscall.tbl| 408 
 arch/powerpc/kernel/syscalls/syscallhdr.sh  |  36 +++
 arch/powerpc/kernel/syscalls/syscalltbl.sh  |  41 +++
 arch/powerpc/kernel/systbl.S|  37 +--
 arch/powerpc/kernel/systbl_chk.c|  60 
 arch/powerpc/platforms/cell/spu_callbacks.c |  17 +-
 14 files changed, 575 insertions(+), 892 deletions(-)
 delete mode 100644 arch/powerpc/include/asm/systbl.h
 create mode 100644 arch/powerpc/kernel/syscalls/Makefile
 create mode 100644 arch/powerpc/kernel/syscalls/syscall.tbl
 create mode 100644 arch/powerpc/kernel/syscalls/syscallhdr.sh
 create mode 100644 arch/powerpc/kernel/syscalls/syscalltbl.sh
 delete mode 100644 arch/powerpc/kernel/systbl_chk.c

-- 
1.9.1



[PATCH 03/34] powerpc/dma: remove the unused ARCH_HAS_DMA_MMAP_COHERENT define

2018-11-14 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
Acked-by: Benjamin Herrenschmidt 
---
 arch/powerpc/include/asm/dma-mapping.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/include/asm/dma-mapping.h 
b/arch/powerpc/include/asm/dma-mapping.h
index 8fa394520af6..f2a4a7142b1e 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -112,7 +112,5 @@ extern int dma_set_mask(struct device *dev, u64 dma_mask);
 
 extern u64 __dma_get_required_mask(struct device *dev);
 
-#define ARCH_HAS_DMA_MMAP_COHERENT
-
 #endif /* __KERNEL__ */
 #endif /* _ASM_DMA_MAPPING_H */
-- 
2.19.1



[PATCH 01/34] powerpc: use mm zones more sensibly

2018-11-14 Thread Christoph Hellwig
Powerpc has somewhat odd usage where ZONE_DMA is used for all memory on
common 64-bit configurations, and ZONE_DMA32 is used for 31-bit schemes.

Move to a scheme closer to what other architectures use (and I dare to
say the intent of the system):

 - ZONE_DMA: optionally for memory < 31-bit (64-bit embedded only)
 - ZONE_NORMAL: everything addressable by the kernel
 - ZONE_HIGHMEM: memory > 32-bit for 32-bit kernels

Also provide information on how ZONE_DMA is used by defining
ARCH_ZONE_DMA_BITS.

Contains various fixes from Benjamin Herrenschmidt.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/Kconfig  |  8 +---
 arch/powerpc/include/asm/page.h   |  2 +
 arch/powerpc/include/asm/pgtable.h|  1 -
 arch/powerpc/kernel/dma-swiotlb.c |  6 +--
 arch/powerpc/kernel/dma.c |  7 +--
 arch/powerpc/mm/mem.c | 47 +++
 arch/powerpc/platforms/85xx/corenet_generic.c | 10 
 arch/powerpc/platforms/85xx/qemu_e500.c   |  9 
 include/linux/mmzone.h|  2 +-
 9 files changed, 25 insertions(+), 67 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8be31261aec8..c3613bc1 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -374,9 +374,9 @@ config PPC_ADV_DEBUG_DAC_RANGE
depends on PPC_ADV_DEBUG_REGS && 44x
default y
 
-config ZONE_DMA32
+config ZONE_DMA
bool
-   default y if PPC64
+   default y if PPC_BOOK3E_64
 
 config PGTABLE_LEVELS
int
@@ -869,10 +869,6 @@ config ISA
  have an IBM RS/6000 or pSeries machine, say Y.  If you have an
  embedded board, consult your board documentation.
 
-config ZONE_DMA
-   bool
-   default y
-
 config GENERIC_ISA_DMA
bool
depends on ISA_DMA_API
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index f6a1265face2..fc8c9ac0c6be 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -354,4 +354,6 @@ typedef struct page *pgtable_t;
 #endif /* __ASSEMBLY__ */
 #include 
 
+#define ARCH_ZONE_DMA_BITS 31
+
 #endif /* _ASM_POWERPC_PAGE_H */
diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 9679b7519a35..8af32ce93c7f 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -66,7 +66,6 @@ extern unsigned long empty_zero_page[];
 
 extern pgd_t swapper_pg_dir[];
 
-void limit_zone_pfn(enum zone_type zone, unsigned long max_pfn);
 int dma_pfn_limit_to_zone(u64 pfn_limit);
 extern void paging_init(void);
 
diff --git a/arch/powerpc/kernel/dma-swiotlb.c 
b/arch/powerpc/kernel/dma-swiotlb.c
index 5fc335f4d9cd..678811abccfc 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -108,12 +108,8 @@ int __init swiotlb_setup_bus_notifier(void)
 
 void __init swiotlb_detect_4g(void)
 {
-   if ((memblock_end_of_DRAM() - 1) > 0x) {
+   if ((memblock_end_of_DRAM() - 1) > 0x)
ppc_swiotlb_enable = 1;
-#ifdef CONFIG_ZONE_DMA32
-   limit_zone_pfn(ZONE_DMA32, (1ULL << 32) >> PAGE_SHIFT);
-#endif
-   }
 }
 
 static int __init check_swiotlb_enabled(void)
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index dbfc7056d7df..6551685a4ed0 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -50,7 +50,7 @@ static int dma_nommu_dma_supported(struct device *dev, u64 
mask)
return 1;
 
 #ifdef CONFIG_FSL_SOC
-   /* Freescale gets another chance via ZONE_DMA/ZONE_DMA32, however
+   /* Freescale gets another chance via ZONE_DMA, however
 * that will have to be refined if/when they support iommus
 */
return 1;
@@ -94,13 +94,10 @@ void *__dma_nommu_alloc_coherent(struct device *dev, size_t 
size,
}
 
switch (zone) {
+#ifdef CONFIG_ZONE_DMA
case ZONE_DMA:
flag |= GFP_DMA;
break;
-#ifdef CONFIG_ZONE_DMA32
-   case ZONE_DMA32:
-   flag |= GFP_DMA32;
-   break;
 #endif
};
 #endif /* CONFIG_FSL_SOC */
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 0a64fffabee1..c0b676c3a5ba 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -246,35 +246,19 @@ static int __init mark_nonram_nosave(void)
 }
 #endif
 
-static bool zone_limits_final;
-
 /*
- * The memory zones past TOP_ZONE are managed by generic mm code.
- * These should be set to zero since that's what every other
- * architecture does.
+ * Zones usage:
+ *
+ * We setup ZONE_DMA to be 31-bits on all platforms and ZONE_NORMAL to be
+ * everything else. GFP_DMA32 page allocations automatically fall back to
+ * ZONE_DMA.
+ *
+ * By using 31-bit unconditionally, we can exploit ARCH_ZONE_DMA_BITS to
+ * inform the generic DMA mapping code.  32-bit only devices (if not handled
+ * by an IOMMU 

use generic DMA mapping code in powerpc V4

2018-11-14 Thread Christoph Hellwig
Hi all,

this series switches the powerpc port to use the generic swiotlb and
noncoherent dma ops, and to use more generic code for the coherent
direct mapping, as well as removing a lot of dead code.

As this series is very large and depends on the dma-mapping tree I've
also published a git tree:

git://git.infradead.org/users/hch/misc.git powerpc-dma.4

Gitweb:


http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/powerpc-dma.4

Changes since v3:
 - rebase on the powerpc fixes tree
 - add a new patch to actually make the baseline amigaone config
   configure without warnings
 - only use ZONE_DMA for 64-bit embedded CPUs, on pseries an IOMMU is
   always present
 - fix compile in mem.c for one configuration
 - drop the full npu removal for now, will be resent separately
 - a few git bisection fixes

The changes since v1 are to big to list and v2 was not posted in public.



[PATCH 30/34] powerpc/dma: remove dma_nommu_mmap_coherent

2018-11-14 Thread Christoph Hellwig
The coherent cache version of this function is already functionally
identical to the default version, and by defining the
arch_dma_coherent_to_pfn hook the same is true for the noncoherent
version as well.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/dma-mapping.h |  4 
 arch/powerpc/kernel/dma-iommu.c|  1 -
 arch/powerpc/kernel/dma-swiotlb.c  |  1 -
 arch/powerpc/kernel/dma.c  | 19 ---
 arch/powerpc/mm/dma-noncoherent.c  |  7 +--
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 arch/powerpc/platforms/pseries/vio.c   |  1 -
 7 files changed, 6 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/include/asm/dma-mapping.h 
b/arch/powerpc/include/asm/dma-mapping.h
index 16d45518d9bb..f19c486e7b3f 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -25,10 +25,6 @@ extern void *__dma_nommu_alloc_coherent(struct device *dev, 
size_t size,
 extern void __dma_nommu_free_coherent(struct device *dev, size_t size,
   void *vaddr, dma_addr_t dma_handle,
   unsigned long attrs);
-extern int dma_nommu_mmap_coherent(struct device *dev,
-   struct vm_area_struct *vma,
-   void *cpu_addr, dma_addr_t handle,
-   size_t size, unsigned long attrs);
 int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction direction,
unsigned long attrs);
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 4937b39e246b..5b15e53ee43d 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -172,7 +172,6 @@ int dma_iommu_mapping_error(struct device *dev, dma_addr_t 
dma_addr)
 const struct dma_map_ops dma_iommu_ops = {
.alloc  = dma_iommu_alloc_coherent,
.free   = dma_iommu_free_coherent,
-   .mmap   = dma_nommu_mmap_coherent,
.map_sg = dma_iommu_map_sg,
.unmap_sg   = dma_iommu_unmap_sg,
.dma_supported  = dma_iommu_dma_supported,
diff --git a/arch/powerpc/kernel/dma-swiotlb.c 
b/arch/powerpc/kernel/dma-swiotlb.c
index b3266f7a6915..03df252ff5fb 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -42,7 +42,6 @@ unsigned int ppc_swiotlb_enable;
 const struct dma_map_ops powerpc_swiotlb_dma_ops = {
.alloc = __dma_nommu_alloc_coherent,
.free = __dma_nommu_free_coherent,
-   .mmap = dma_nommu_mmap_coherent,
.map_sg = swiotlb_map_sg_attrs,
.unmap_sg = swiotlb_unmap_sg_attrs,
.dma_supported = swiotlb_dma_supported,
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 5c83a34f288f..a6590aa77181 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -113,24 +113,6 @@ void __dma_nommu_free_coherent(struct device *dev, size_t 
size,
 }
 #endif /* !CONFIG_NOT_COHERENT_CACHE */
 
-int dma_nommu_mmap_coherent(struct device *dev, struct vm_area_struct *vma,
-void *cpu_addr, dma_addr_t handle, size_t size,
-unsigned long attrs)
-{
-   unsigned long pfn;
-
-#ifdef CONFIG_NOT_COHERENT_CACHE
-   vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-   pfn = __dma_get_coherent_pfn((unsigned long)cpu_addr);
-#else
-   pfn = page_to_pfn(virt_to_page(cpu_addr));
-#endif
-   return remap_pfn_range(vma, vma->vm_start,
-  pfn + vma->vm_pgoff,
-  vma->vm_end - vma->vm_start,
-  vma->vm_page_prot);
-}
-
 int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction direction,
unsigned long attrs)
@@ -196,7 +178,6 @@ static inline void dma_nommu_sync_single(struct device *dev,
 const struct dma_map_ops dma_nommu_ops = {
.alloc  = __dma_nommu_alloc_coherent,
.free   = __dma_nommu_free_coherent,
-   .mmap   = dma_nommu_mmap_coherent,
.map_sg = dma_nommu_map_sg,
.dma_supported  = dma_nommu_dma_supported,
.map_page   = dma_nommu_map_page,
diff --git a/arch/powerpc/mm/dma-noncoherent.c 
b/arch/powerpc/mm/dma-noncoherent.c
index e955539686a4..ee95da19c82d 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include 
@@ -400,14 +401,16 @@ EXPORT_SYMBOL(__dma_sync_page);
 
 /*
  * Return the PFN for a given cpu virtual address returned by
- * __dma_nommu_alloc_coherent. This is used by 

[PATCH 32/34] powerpc/dma: remove get_dma_offset

2018-11-14 Thread Christoph Hellwig
Just fold the calculation into __phys_to_dma/__dma_to_phys as those are
the only places that should know about it.

Signed-off-by: Christoph Hellwig 
Acked-by: Benjamin Herrenschmidt 
---
 arch/powerpc/include/asm/dma-direct.h  |  8 ++--
 arch/powerpc/include/asm/dma-mapping.h | 16 
 2 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/dma-direct.h 
b/arch/powerpc/include/asm/dma-direct.h
index 92d8aed86422..a2912b47102c 100644
--- a/arch/powerpc/include/asm/dma-direct.h
+++ b/arch/powerpc/include/asm/dma-direct.h
@@ -13,11 +13,15 @@ static inline bool dma_capable(struct device *dev, 
dma_addr_t addr, size_t size)
 
 static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr)
 {
-   return paddr + get_dma_offset(dev);
+   if (!dev)
+   return paddr + PCI_DRAM_OFFSET;
+   return paddr + dev->archdata.dma_offset;
 }
 
 static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr)
 {
-   return daddr - get_dma_offset(dev);
+   if (!dev)
+   return daddr - PCI_DRAM_OFFSET;
+   return daddr - dev->archdata.dma_offset;
 }
 #endif /* ASM_POWERPC_DMA_DIRECT_H */
diff --git a/arch/powerpc/include/asm/dma-mapping.h 
b/arch/powerpc/include/asm/dma-mapping.h
index 93e57e28be28..c70f55d2f5e0 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -43,22 +43,6 @@ static inline const struct dma_map_ops 
*get_arch_dma_ops(struct bus_type *bus)
return NULL;
 }
 
-/*
- * get_dma_offset()
- *
- * Get the dma offset on configurations where the dma address can be determined
- * from the physical address by looking at a simple offset.  Direct dma and
- * swiotlb use this function, but it is typically not used by implementations
- * with an iommu.
- */
-static inline dma_addr_t get_dma_offset(struct device *dev)
-{
-   if (dev)
-   return dev->archdata.dma_offset;
-
-   return PCI_DRAM_OFFSET;
-}
-
 static inline void set_dma_offset(struct device *dev, dma_addr_t off)
 {
if (dev)
-- 
2.19.1



[PATCH 29/34] powerpc/dma: use phys_to_dma instead of get_dma_offset

2018-11-14 Thread Christoph Hellwig
Use the standard portable helper instead of the powerpc specific one,
which is about to go away.

Signed-off-by: Christoph Hellwig 
Acked-by: Benjamin Herrenschmidt 
---
 arch/powerpc/kernel/dma.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index cf0ae0b3fb24..5c83a34f288f 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -6,7 +6,7 @@
  */
 
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
@@ -42,7 +42,7 @@ static u64 __maybe_unused get_pfn_limit(struct device *dev)
 int dma_nommu_dma_supported(struct device *dev, u64 mask)
 {
 #ifdef CONFIG_PPC64
-   u64 limit = get_dma_offset(dev) + (memblock_end_of_DRAM() - 1);
+   u64 limit = phys_to_dma(dev, (memblock_end_of_DRAM() - 1));
 
/* Limit fits in the mask, we are good */
if (mask >= limit)
@@ -100,7 +100,7 @@ void *__dma_nommu_alloc_coherent(struct device *dev, size_t 
size,
return NULL;
ret = page_address(page);
memset(ret, 0, size);
-   *dma_handle = __pa(ret) + get_dma_offset(dev);
+   *dma_handle = phys_to_dma(dev,__pa(ret));
 
return ret;
 }
@@ -139,7 +139,7 @@ int dma_nommu_map_sg(struct device *dev, struct scatterlist 
*sgl,
int i;
 
for_each_sg(sgl, sg, nents, i) {
-   sg->dma_address = sg_phys(sg) + get_dma_offset(dev);
+   sg->dma_address = phys_to_dma(dev, sg_phys(sg));
sg->dma_length = sg->length;
 
if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
@@ -170,7 +170,7 @@ dma_addr_t dma_nommu_map_page(struct device *dev, struct 
page *page,
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
__dma_sync_page(page, offset, size, dir);
 
-   return page_to_phys(page) + offset + get_dma_offset(dev);
+   return phys_to_dma(dev, page_to_phys(page)) + offset;
 }
 
 #ifdef CONFIG_NOT_COHERENT_CACHE
-- 
2.19.1



[PATCH 34/34] powerpc/dma: trim the fat from

2018-11-14 Thread Christoph Hellwig
There is no need to provide anything but get_arch_dma_ops to
<asm/dma-mapping.h>.  Move the remaining declarations to <asm/iommu.h>
and drop all the includes.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/dma-mapping.h| 29 ---
 arch/powerpc/include/asm/iommu.h  | 10 +++
 arch/powerpc/platforms/44x/ppc476.c   |  1 +
 arch/powerpc/platforms/85xx/corenet_generic.c |  1 +
 arch/powerpc/platforms/85xx/qemu_e500.c   |  1 +
 arch/powerpc/sysdev/fsl_pci.c |  1 +
 6 files changed, 14 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/include/asm/dma-mapping.h 
b/arch/powerpc/include/asm/dma-mapping.h
index a59c42879194..565d6f74b189 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -1,37 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (C) 2004 IBM
- *
- * Implements the generic device dma API for powerpc.
- * the pci and vio busses
  */
 #ifndef _ASM_DMA_MAPPING_H
 #define _ASM_DMA_MAPPING_H
-#ifdef __KERNEL__
-
-#include 
-#include 
-/* need struct page definitions */
-#include 
-#include 
-#include 
-#include 
-#include 
-
-static inline unsigned long device_to_mask(struct device *dev)
-{
-   if (dev->dma_mask && *dev->dma_mask)
-   return *dev->dma_mask;
-   /* Assume devices without mask can take 32 bit addresses */
-   return 0xul;
-}
-
-/*
- * Available generic sets of operations
- */
-#ifdef CONFIG_PPC64
-extern const struct dma_map_ops dma_iommu_ops;
-#endif
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
@@ -43,5 +15,4 @@ static inline const struct dma_map_ops 
*get_arch_dma_ops(struct bus_type *bus)
return NULL;
 }
 
-#endif /* __KERNEL__ */
 #endif /* _ASM_DMA_MAPPING_H */
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 5128aac8e165..46a8d4716d90 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -332,5 +332,15 @@ extern bool iommu_fixed_is_weak;
 #define iommu_fixed_is_weak false
 #endif
 
+extern const struct dma_map_ops dma_iommu_ops;
+
+static inline unsigned long device_to_mask(struct device *dev)
+{
+   if (dev->dma_mask && *dev->dma_mask)
+   return *dev->dma_mask;
+   /* Assume devices without mask can take 32 bit addresses */
+   return 0xul;
+}
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/platforms/44x/ppc476.c 
b/arch/powerpc/platforms/44x/ppc476.c
index e55933f9cd55..a5e61e5c16e2 100644
--- a/arch/powerpc/platforms/44x/ppc476.c
+++ b/arch/powerpc/platforms/44x/ppc476.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c 
b/arch/powerpc/platforms/85xx/corenet_generic.c
index b0dac307bebf..0ea13697189e 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include 
diff --git a/arch/powerpc/platforms/85xx/qemu_e500.c 
b/arch/powerpc/platforms/85xx/qemu_e500.c
index 27631c607f3d..c52c8f9e8385 100644
--- a/arch/powerpc/platforms/85xx/qemu_e500.c
+++ b/arch/powerpc/platforms/85xx/qemu_e500.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include "smp.h"
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index 964a4aede6b1..9584765dbe3b 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
-- 
2.19.1



[PATCH 28/34] dma-mapping, powerpc: simplify the arch dma_set_mask override

2018-11-14 Thread Christoph Hellwig
Instead of letting the architecture supply all of dma_set_mask just
give it an additional hook selected by Kconfig.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/dma-mapping.h |  3 ---
 arch/powerpc/kernel/dma-swiotlb.c  |  8 
 arch/powerpc/kernel/dma.c  | 12 
 arch/powerpc/sysdev/fsl_pci.c  |  4 
 include/linux/dma-mapping.h| 11 ---
 kernel/dma/Kconfig |  3 +++
 7 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 2d4a19bc8023..4f03997ad54a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -126,6 +126,7 @@ config PPC
# Please keep this list sorted alphabetically.
#
select ARCH_HAS_DEVMEM_IS_ALLOWED
+   select ARCH_HAS_DMA_SET_MASKif SWIOTLB
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/powerpc/include/asm/dma-mapping.h 
b/arch/powerpc/include/asm/dma-mapping.h
index e5ee4ac97c14..16d45518d9bb 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -110,8 +110,5 @@ static inline void set_dma_offset(struct device *dev, 
dma_addr_t off)
dev->archdata.dma_offset = off;
 }
 
-#define HAVE_ARCH_DMA_SET_MASK 1
-extern int dma_set_mask(struct device *dev, u64 dma_mask);
-
 #endif /* __KERNEL__ */
 #endif /* _ASM_DMA_MAPPING_H */
diff --git a/arch/powerpc/kernel/dma-swiotlb.c 
b/arch/powerpc/kernel/dma-swiotlb.c
index 62caa16b91a9..b3266f7a6915 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -22,6 +22,14 @@
 #include 
 #include 
 
+bool arch_dma_set_mask(struct device *dev, u64 dma_mask)
+{
+   if (!ppc_md.dma_set_mask)
+   return 0;
+   return ppc_md.dma_set_mask(dev, dma_mask);
+}
+EXPORT_SYMBOL(arch_dma_set_mask);
+
 unsigned int ppc_swiotlb_enable;
 
 /*
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 59f38ca3975c..cf0ae0b3fb24 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -209,18 +209,6 @@ const struct dma_map_ops dma_nommu_ops = {
 };
 EXPORT_SYMBOL(dma_nommu_ops);
 
-int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-   if (ppc_md.dma_set_mask)
-   return ppc_md.dma_set_mask(dev, dma_mask);
-
-   if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-   return -EIO;
-   *dev->dma_mask = dma_mask;
-   return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
 static int __init dma_init(void)
 {
 #ifdef CONFIG_IBMVIO
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index 296ffabc9386..cb91a3d113d1 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -135,9 +135,6 @@ static inline void setup_swiotlb_ops(struct pci_controller 
*hose) {}
 
 static int fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask)
 {
-   if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-   return -EIO;
-
/*
 * Fix up PCI devices that are able to DMA to the large inbound
 * mapping that allows addressing any RAM address from across PCI.
@@ -147,7 +144,6 @@ static int fsl_pci_dma_set_mask(struct device *dev, u64 
dma_mask)
set_dma_offset(dev, pci64_dma_offset);
}
 
-   *dev->dma_mask = dma_mask;
return 0;
 }
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 15bd41447025..8dd19e66c0e5 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -598,18 +598,23 @@ static inline int dma_supported(struct device *dev, u64 
mask)
return ops->dma_supported(dev, mask);
 }
 
-#ifndef HAVE_ARCH_DMA_SET_MASK
+#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
+bool arch_dma_set_mask(struct device *dev, u64 mask);
+#else
+#define arch_dma_set_mask(dev, mask)   true
+#endif
+
 static inline int dma_set_mask(struct device *dev, u64 mask)
 {
if (!dev->dma_mask || !dma_supported(dev, mask))
return -EIO;
-
+   if (!arch_dma_set_mask(dev, mask))
+   return -EIO;
dma_check_mask(dev, mask);
 
*dev->dma_mask = mask;
return 0;
 }
-#endif
 
 static inline u64 dma_get_mask(struct device *dev)
 {
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 645c7a2ecde8..951045c90c2c 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -16,6 +16,9 @@ config ARCH_DMA_ADDR_T_64BIT
 config ARCH_HAS_DMA_COHERENCE_H
bool
 
+config ARCH_HAS_DMA_SET_MASK
+   bool
+
 config HAVE_GENERIC_DMA_COHERENT
bool
 
-- 
2.19.1



[PATCH 33/34] powerpc/dma: remove set_dma_offset

2018-11-14 Thread Christoph Hellwig
There is no good reason for this helper, just opencode it.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/dma-mapping.h| 6 --
 arch/powerpc/kernel/pci-common.c  | 2 +-
 arch/powerpc/platforms/cell/iommu.c   | 4 ++--
 arch/powerpc/platforms/powernv/pci-ioda.c | 6 +++---
 arch/powerpc/platforms/pseries/iommu.c| 7 ++-
 arch/powerpc/sysdev/dart_iommu.c  | 2 +-
 arch/powerpc/sysdev/fsl_pci.c | 2 +-
 drivers/misc/cxl/vphb.c   | 2 +-
 8 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/dma-mapping.h 
b/arch/powerpc/include/asm/dma-mapping.h
index c70f55d2f5e0..a59c42879194 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -43,11 +43,5 @@ static inline const struct dma_map_ops 
*get_arch_dma_ops(struct bus_type *bus)
return NULL;
 }
 
-static inline void set_dma_offset(struct device *dev, dma_addr_t off)
-{
-   if (dev)
-   dev->archdata.dma_offset = off;
-}
-
 #endif /* __KERNEL__ */
 #endif /* _ASM_DMA_MAPPING_H */
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 661b937f31ed..b645b3882815 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -966,7 +966,7 @@ static void pcibios_setup_device(struct pci_dev *dev)
 
/* Hook up default DMA ops */
set_dma_ops(>dev, pci_dma_ops);
-   set_dma_offset(>dev, PCI_DRAM_OFFSET);
+   dev->dev.archdata.dma_offset = PCI_DRAM_OFFSET;
 
/* Additional platform DMA/iommu setup */
phb = pci_bus_to_host(dev->bus);
diff --git a/arch/powerpc/platforms/cell/iommu.c 
b/arch/powerpc/platforms/cell/iommu.c
index 75fd2ee57e26..348a815779c1 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -577,10 +577,10 @@ static void cell_dma_dev_setup(struct device *dev)
u64 addr = cell_iommu_get_fixed_address(dev);
 
if (addr != OF_BAD_ADDR)
-   set_dma_offset(dev, addr + dma_iommu_fixed_base);
+   dev->archdata.dma_offset = addr + dma_iommu_fixed_base;
set_iommu_table_base(dev, cell_get_iommu_table(dev));
} else {
-   set_dma_offset(dev, cell_dma_nommu_offset);
+   dev->archdata.dma_offset = cell_dma_nommu_offset;
}
 }
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 23fd46cd2ab3..e516d99bb2ed 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1735,7 +1735,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb 
*phb, struct pci_dev *pdev
 
pe = >ioda.pe_array[pdn->pe_number];
WARN_ON(get_dma_ops(>dev) != _iommu_ops);
-   set_dma_offset(>dev, pe->tce_bypass_base);
+   pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
set_iommu_table_base(>dev, pe->table_group.tables[0]);
/*
 * Note: iommu_add_device() will fail here as
@@ -1848,7 +1848,7 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct 
pci_dev *pdev,
if (rc)
return rc;
/* 4GB offset bypasses 32-bit space */
-   set_dma_offset(>dev, (1ULL << 32));
+   pdev->dev.archdata.dma_offset = (1ULL << 32);
return true;
}
 
@@ -1863,7 +1863,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
 
list_for_each_entry(dev, >devices, bus_list) {
set_iommu_table_base(>dev, pe->table_group.tables[0]);
-   set_dma_offset(>dev, pe->tce_bypass_base);
+   dev->dev.archdata.dma_offset = pe->tce_bypass_base;
if (add_to_group)
iommu_add_device(>dev);
 
diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 8965d174c53b..a2ff20d154fe 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1197,7 +1197,6 @@ static bool iommu_bypass_supported_pSeriesLP(struct 
pci_dev *pdev, u64 dma_mask)
 {
struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;
const __be32 *dma_window = NULL;
-   u64 dma_offset;
 
/* only attempt to use a new window if 64-bit DMA is requested */
if (dma_mask < DMA_BIT_MASK(64))
@@ -1219,11 +1218,9 @@ static bool iommu_bypass_supported_pSeriesLP(struct 
pci_dev *pdev, u64 dma_mask)
}
 
if (pdn && PCI_DN(pdn)) {
-   dma_offset = enable_ddw(pdev, pdn);
-   if (dma_offset != 0) {
-   set_dma_offset(>dev, dma_offset);
+   pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn);
+   if (pdev->dev.archdata.dma_offset)
return true;
-   }
}
 
return false;
diff 

[PATCH 31/34] powerpc/dma: use generic direct and swiotlb ops

2018-11-14 Thread Christoph Hellwig
 - The ppc32 case of dma_nommu_dma_supported already was a no-op, and the
   64-bit case came to the same conclusion as dma_direct_supported, so
   replace it with the generic version.

 - supports CMA

 - Note that the cache maintenance in the existing code is a bit odd
   as it implements both the sync_to_device and sync_to_cpu callouts,
   but never flushes caches when unmapping.  This patch keeps both
   directions around, which will lead to more flushing than the previous
   implementation.  Someone more familiar with the required CPUs should
   eventually take a look and optimize the cache flush handling if needed.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/Kconfig   |   1 +
 arch/powerpc/include/asm/dma-mapping.h |  41 -
 arch/powerpc/include/asm/pgtable.h |   1 -
 arch/powerpc/include/asm/swiotlb.h |   2 -
 arch/powerpc/kernel/Makefile   |   2 +-
 arch/powerpc/kernel/dma-iommu.c|  13 +-
 arch/powerpc/kernel/dma-swiotlb.c  |  24 +--
 arch/powerpc/kernel/dma.c  | 202 -
 arch/powerpc/kernel/pci-common.c   |   2 +-
 arch/powerpc/kernel/setup-common.c |   2 +-
 arch/powerpc/mm/dma-noncoherent.c  |  35 +++--
 arch/powerpc/mm/mem.c  |  22 ---
 arch/powerpc/platforms/44x/warp.c  |   2 +-
 arch/powerpc/platforms/Kconfig.cputype |   2 +
 arch/powerpc/platforms/cell/iommu.c|   4 +-
 arch/powerpc/platforms/pasemi/iommu.c  |   2 +-
 arch/powerpc/platforms/pasemi/setup.c  |   2 +-
 arch/powerpc/platforms/pseries/vio.c   |   7 +
 arch/powerpc/sysdev/fsl_pci.c  |   2 +-
 drivers/misc/cxl/vphb.c|   2 +-
 20 files changed, 50 insertions(+), 320 deletions(-)
 delete mode 100644 arch/powerpc/kernel/dma.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 4f03997ad54a..e200cdf92290 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -155,6 +155,7 @@ config PPC
select CLONE_BACKWARDS
select DCACHE_WORD_ACCESS   if PPC64 && CPU_LITTLE_ENDIAN
select DYNAMIC_FTRACE   if FUNCTION_TRACER
+   select DMA_DIRECT_OPS
select EDAC_ATOMIC_SCRUB
select EDAC_SUPPORT
select GENERIC_ATOMIC64 if PPC32
diff --git a/arch/powerpc/include/asm/dma-mapping.h 
b/arch/powerpc/include/asm/dma-mapping.h
index f19c486e7b3f..93e57e28be28 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -18,46 +18,6 @@
 #include 
 #include 
 
-/* Some dma direct funcs must be visible for use in other dma_ops */
-extern void *__dma_nommu_alloc_coherent(struct device *dev, size_t size,
-dma_addr_t *dma_handle, gfp_t flag,
-unsigned long attrs);
-extern void __dma_nommu_free_coherent(struct device *dev, size_t size,
-  void *vaddr, dma_addr_t dma_handle,
-  unsigned long attrs);
-int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl,
-   int nents, enum dma_data_direction direction,
-   unsigned long attrs);
-dma_addr_t dma_nommu_map_page(struct device *dev, struct page *page,
-   unsigned long offset, size_t size,
-   enum dma_data_direction dir, unsigned long attrs);
-int dma_nommu_dma_supported(struct device *dev, u64 mask);
-u64 dma_nommu_get_required_mask(struct device *dev);
-
-#ifdef CONFIG_NOT_COHERENT_CACHE
-/*
- * DMA-consistent mapping functions for PowerPCs that don't support
- * cache snooping.  These allocate/free a region of uncached mapped
- * memory space for use with DMA devices.  Alternatively, you could
- * allocate the space "normally" and use the cache management functions
- * to ensure it is consistent.
- */
-struct device;
-extern void __dma_sync(void *vaddr, size_t size, int direction);
-extern void __dma_sync_page(struct page *page, unsigned long offset,
-size_t size, int direction);
-extern unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr);
-
-#else /* ! CONFIG_NOT_COHERENT_CACHE */
-/*
- * Cache coherent cores.
- */
-
-#define __dma_sync(addr, size, rw) ((void)0)
-#define __dma_sync_page(pg, off, sz, rw)   ((void)0)
-
-#endif /* ! CONFIG_NOT_COHERENT_CACHE */
-
 static inline unsigned long device_to_mask(struct device *dev)
 {
if (dev->dma_mask && *dev->dma_mask)
@@ -72,7 +32,6 @@ static inline unsigned long device_to_mask(struct device *dev)
 #ifdef CONFIG_PPC64
 extern const struct dma_map_ops dma_iommu_ops;
 #endif
-extern const struct dma_map_ops dma_nommu_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 8af32ce93c7f..70979b860761 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ 

[PATCH 23/34] powerpc/dma: remove get_pci_dma_ops

2018-11-14 Thread Christoph Hellwig
This function is only used by the Cell iommu code, which can keep track
of whether it is using the iommu internally just as well.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/pci.h  |  2 --
 arch/powerpc/kernel/pci-common.c|  6 --
 arch/powerpc/platforms/cell/iommu.c | 17 -
 3 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 2af9ded80540..04c44c4b0acf 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -52,10 +52,8 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev 
*dev, int channel)
 
 #ifdef CONFIG_PCI
 extern void set_pci_dma_ops(const struct dma_map_ops *dma_ops);
-extern const struct dma_map_ops *get_pci_dma_ops(void);
 #else  /* CONFIG_PCI */
 #define set_pci_dma_ops(d)
-#define get_pci_dma_ops()  NULL
 #endif
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 88e4f69a09e5..a84707680525 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -69,12 +69,6 @@ void set_pci_dma_ops(const struct dma_map_ops *dma_ops)
pci_dma_ops = dma_ops;
 }
 
-const struct dma_map_ops *get_pci_dma_ops(void)
-{
-   return pci_dma_ops;
-}
-EXPORT_SYMBOL(get_pci_dma_ops);
-
 /*
  * This function should run under locking protection, specifically
  * hose_spinlock.
diff --git a/arch/powerpc/platforms/cell/iommu.c 
b/arch/powerpc/platforms/cell/iommu.c
index fb51f78035ce..93c7e4aef571 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -544,6 +544,7 @@ static struct cbe_iommu *cell_iommu_for_node(int nid)
 static unsigned long cell_dma_nommu_offset;
 
 static unsigned long dma_iommu_fixed_base;
+static bool cell_iommu_enabled;
 
 /* iommu_fixed_is_weak is set if booted with iommu_fixed=weak */
 bool iommu_fixed_is_weak;
@@ -572,16 +573,14 @@ static u64 cell_iommu_get_fixed_address(struct device 
*dev);
 
 static void cell_dma_dev_setup(struct device *dev)
 {
-   if (get_pci_dma_ops() == _iommu_ops) {
+   if (cell_iommu_enabled) {
u64 addr = cell_iommu_get_fixed_address(dev);
 
if (addr != OF_BAD_ADDR)
set_dma_offset(dev, addr + dma_iommu_fixed_base);
set_iommu_table_base(dev, cell_get_iommu_table(dev));
-   } else if (get_pci_dma_ops() == _nommu_ops) {
-   set_dma_offset(dev, cell_dma_nommu_offset);
} else {
-   BUG();
+   set_dma_offset(dev, cell_dma_nommu_offset);
}
 }
 
@@ -599,11 +598,11 @@ static int cell_of_bus_notify(struct notifier_block *nb, 
unsigned long action,
if (action != BUS_NOTIFY_ADD_DEVICE)
return 0;
 
-   /* We use the PCI DMA ops */
-   dev->dma_ops = get_pci_dma_ops();
-
+   if (cell_iommu_enabled)
+   dev->dma_ops = _iommu_ops;
+   else
+   dev->dma_ops = _nommu_ops;
cell_dma_dev_setup(dev);
-
return 0;
 }
 
@@ -1091,7 +1090,7 @@ static int __init cell_iommu_init(void)
cell_pci_iommu_bypass_supported;
}
set_pci_dma_ops(_iommu_ops);
-
+   cell_iommu_enabled = true;
  bail:
/* Register callbacks on OF platform device addition/removal
 * to handle linking them to the right DMA operations
-- 
2.19.1



[PATCH 27/34] powerpc/fsl_pci: simplify fsl_pci_dma_set_mask

2018-11-14 Thread Christoph Hellwig
swiotlb will only bounce buffer when the effective dma address for the
device is smaller than the actual DMA range.  Instead of flipping between the
swiotlb and nommu ops for FSL SOCs that have the second outbound window
just don't set the bus dma_mask in this case.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/sysdev/fsl_pci.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index f136567a5ed5..296ffabc9386 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -143,7 +143,7 @@ static int fsl_pci_dma_set_mask(struct device *dev, u64 
dma_mask)
 * mapping that allows addressing any RAM address from across PCI.
 */
if (dev_is_pci(dev) && dma_mask >= pci64_dma_offset * 2 - 1) {
-   set_dma_ops(dev, _nommu_ops);
+   dev->bus_dma_mask = 0;
set_dma_offset(dev, pci64_dma_offset);
}
 
@@ -403,10 +403,6 @@ static void setup_pci_atmu(struct pci_controller *hose)
out_be32(>piw[win_idx].piwar,  piwar);
}
 
-   /*
-* install our own dma_set_mask handler to fixup dma_ops
-* and dma_offset
-*/
ppc_md.dma_set_mask = fsl_pci_dma_set_mask;
 
pr_info("%pOF: Setup 64-bit PCI DMA window\n", 
hose->dn);
-- 
2.19.1



[PATCH 26/34] powerpc/dma: fix an off-by-one in dma_capable

2018-11-14 Thread Christoph Hellwig
We need to compare the last byte in the dma range and not the one after it
for the bus_dma_mask, just like we do for the regular dma_mask.  Fix this
cleanly by merging the two comparisons into one.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/dma-direct.h | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/dma-direct.h 
b/arch/powerpc/include/asm/dma-direct.h
index e00ab5d0612d..92d8aed86422 100644
--- a/arch/powerpc/include/asm/dma-direct.h
+++ b/arch/powerpc/include/asm/dma-direct.h
@@ -4,15 +4,11 @@
 
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t 
size)
 {
-#ifdef CONFIG_SWIOTLB
-   if (dev->bus_dma_mask && addr + size > dev->bus_dma_mask)
-   return false;
-#endif
-
if (!dev->dma_mask)
return false;
 
-   return addr + size - 1 <= *dev->dma_mask;
+   return addr + size - 1 <=
+   min_not_zero(*dev->dma_mask, dev->bus_dma_mask);
 }
 
 static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr)
-- 
2.19.1



[PATCH 21/34] powerpc/pci: remove the dma_set_mask pci_controller ops methods

2018-11-14 Thread Christoph Hellwig
Unused now.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/pci-bridge.h | 2 --
 arch/powerpc/kernel/dma.c | 7 ---
 2 files changed, 9 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index aace7033fa02..a50703af7db3 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -45,8 +45,6 @@ struct pci_controller_ops {
void(*teardown_msi_irqs)(struct pci_dev *pdev);
 #endif
 
-   int (*dma_set_mask)(struct pci_dev *pdev, u64 dma_mask);
-
void(*shutdown)(struct pci_controller *hose);
 };
 
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 154e1cdae7f9..829eb2fefc8c 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -279,13 +279,6 @@ int dma_set_mask(struct device *dev, u64 dma_mask)
if (ppc_md.dma_set_mask)
return ppc_md.dma_set_mask(dev, dma_mask);
 
-   if (dev_is_pci(dev)) {
-   struct pci_dev *pdev = to_pci_dev(dev);
-   struct pci_controller *phb = pci_bus_to_host(pdev->bus);
-   if (phb->controller_ops.dma_set_mask)
-   return phb->controller_ops.dma_set_mask(pdev, dma_mask);
-   }
-
if (!dev->dma_mask || !dma_supported(dev, dma_mask))
return -EIO;
*dev->dma_mask = dma_mask;
-- 
2.19.1



[PATCH 20/34] powerpc/dma: stop overriding dma_get_required_mask

2018-11-14 Thread Christoph Hellwig
The ppc_md and pci_controller_ops methods are unused now and can be
removed.  The dma_nommu implementation is identical to the generic one
except for using max_pfn instead of calling into the memblock API,
and all other dma_map_ops instances implement a method of their own.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/device.h  |  2 --
 arch/powerpc/include/asm/dma-mapping.h |  2 --
 arch/powerpc/include/asm/machdep.h |  2 --
 arch/powerpc/include/asm/pci-bridge.h  |  1 -
 arch/powerpc/kernel/dma.c  | 30 --
 drivers/base/platform.c|  2 --
 6 files changed, 39 deletions(-)

diff --git a/arch/powerpc/include/asm/device.h 
b/arch/powerpc/include/asm/device.h
index 1aa53318b4bc..3814e1c2d4bc 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -59,6 +59,4 @@ struct pdev_archdata {
u64 dma_mask;
 };
 
-#define ARCH_HAS_DMA_GET_REQUIRED_MASK
-
 #endif /* _ASM_POWERPC_DEVICE_H */
diff --git a/arch/powerpc/include/asm/dma-mapping.h 
b/arch/powerpc/include/asm/dma-mapping.h
index 140ce5ad3120..e5ee4ac97c14 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -113,7 +113,5 @@ static inline void set_dma_offset(struct device *dev, 
dma_addr_t off)
 #define HAVE_ARCH_DMA_SET_MASK 1
 extern int dma_set_mask(struct device *dev, u64 dma_mask);
 
-extern u64 __dma_get_required_mask(struct device *dev);
-
 #endif /* __KERNEL__ */
 #endif /* _ASM_DMA_MAPPING_H */
diff --git a/arch/powerpc/include/asm/machdep.h 
b/arch/powerpc/include/asm/machdep.h
index 8311869005fa..7b70dcbce1b9 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -47,9 +47,7 @@ struct machdep_calls {
 #endif
 #endif /* CONFIG_PPC64 */
 
-   /* Platform set_dma_mask and dma_get_required_mask overrides */
int (*dma_set_mask)(struct device *dev, u64 dma_mask);
-   u64 (*dma_get_required_mask)(struct device *dev);
 
int (*probe)(void);
void(*setup_arch)(void); /* Optional, may be NULL */
diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 5c7a1e7ffc8a..aace7033fa02 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -46,7 +46,6 @@ struct pci_controller_ops {
 #endif
 
int (*dma_set_mask)(struct pci_dev *pdev, u64 dma_mask);
-   u64 (*dma_get_required_mask)(struct pci_dev *pdev);
 
void(*shutdown)(struct pci_controller *hose);
 };
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 6c368b6820bb..154e1cdae7f9 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -246,7 +246,6 @@ const struct dma_map_ops dma_nommu_ops = {
.map_sg = dma_nommu_map_sg,
.dma_supported  = dma_nommu_dma_supported,
.map_page   = dma_nommu_map_page,
-   .get_required_mask  = dma_nommu_get_required_mask,
 #ifdef CONFIG_NOT_COHERENT_CACHE
.sync_single_for_cpu= dma_nommu_sync_single,
.sync_single_for_device = dma_nommu_sync_single,
@@ -294,35 +293,6 @@ int dma_set_mask(struct device *dev, u64 dma_mask)
 }
 EXPORT_SYMBOL(dma_set_mask);
 
-u64 __dma_get_required_mask(struct device *dev)
-{
-   const struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-   if (unlikely(dma_ops == NULL))
-   return 0;
-
-   if (dma_ops->get_required_mask)
-   return dma_ops->get_required_mask(dev);
-
-   return DMA_BIT_MASK(8 * sizeof(dma_addr_t));
-}
-
-u64 dma_get_required_mask(struct device *dev)
-{
-   if (ppc_md.dma_get_required_mask)
-   return ppc_md.dma_get_required_mask(dev);
-
-   if (dev_is_pci(dev)) {
-   struct pci_dev *pdev = to_pci_dev(dev);
-   struct pci_controller *phb = pci_bus_to_host(pdev->bus);
-   if (phb->controller_ops.dma_get_required_mask)
-   return phb->controller_ops.dma_get_required_mask(pdev);
-   }
-
-   return __dma_get_required_mask(dev);
-}
-EXPORT_SYMBOL_GPL(dma_get_required_mask);
-
 static int __init dma_init(void)
 {
 #ifdef CONFIG_IBMVIO
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 41b91af95afb..648b6213e322 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -1179,7 +1179,6 @@ int __init platform_bus_init(void)
return error;
 }
 
-#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK
 static u64 dma_default_get_required_mask(struct device *dev)
 {
u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT);
@@ -1208,7 +1207,6 @@ u64 dma_get_required_mask(struct device *dev)
return dma_default_get_required_mask(dev);
 }
 EXPORT_SYMBOL_GPL(dma_get_required_mask);
-#endif
 
 static __initdata 

[PATCH 25/34] powerpc/dma: remove max_direct_dma_addr

2018-11-14 Thread Christoph Hellwig
The max_direct_dma_addr duplicates the bus_dma_mask field in struct
device.  Use the generic field instead.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/device.h |  3 ---
 arch/powerpc/include/asm/dma-direct.h |  4 +---
 arch/powerpc/kernel/dma-swiotlb.c | 20 
 arch/powerpc/kernel/dma.c |  5 ++---
 arch/powerpc/sysdev/fsl_pci.c |  3 +--
 5 files changed, 4 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/include/asm/device.h 
b/arch/powerpc/include/asm/device.h
index 3814e1c2d4bc..a130be13ee83 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -38,9 +38,6 @@ struct dev_archdata {
 #ifdef CONFIG_IOMMU_API
void*iommu_domain;
 #endif
-#ifdef CONFIG_SWIOTLB
-   dma_addr_t  max_direct_dma_addr;
-#endif
 #ifdef CONFIG_PPC64
struct pci_dn   *pci_data;
 #endif
diff --git a/arch/powerpc/include/asm/dma-direct.h 
b/arch/powerpc/include/asm/dma-direct.h
index 7702875aabb7..e00ab5d0612d 100644
--- a/arch/powerpc/include/asm/dma-direct.h
+++ b/arch/powerpc/include/asm/dma-direct.h
@@ -5,9 +5,7 @@
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t 
size)
 {
 #ifdef CONFIG_SWIOTLB
-   struct dev_archdata *sd = >archdata;
-
-   if (sd->max_direct_dma_addr && addr + size > sd->max_direct_dma_addr)
+   if (dev->bus_dma_mask && addr + size > dev->bus_dma_mask)
return false;
 #endif
 
diff --git a/arch/powerpc/kernel/dma-swiotlb.c 
b/arch/powerpc/kernel/dma-swiotlb.c
index 38a2c9f5ab54..62caa16b91a9 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -24,21 +24,6 @@
 
 unsigned int ppc_swiotlb_enable;
 
-static u64 swiotlb_powerpc_get_required(struct device *dev)
-{
-   u64 end, mask, max_direct_dma_addr = dev->archdata.max_direct_dma_addr;
-
-   end = memblock_end_of_DRAM();
-   if (max_direct_dma_addr && end > max_direct_dma_addr)
-   end = max_direct_dma_addr;
-   end += get_dma_offset(dev);
-
-   mask = 1ULL << (fls64(end) - 1);
-   mask += mask - 1;
-
-   return mask;
-}
-
 /*
  * At the moment, all platforms that use this code only require
  * swiotlb to be used if we're operating on HIGHMEM.  Since
@@ -60,22 +45,17 @@ const struct dma_map_ops powerpc_swiotlb_dma_ops = {
.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
.sync_sg_for_device = swiotlb_sync_sg_for_device,
.mapping_error = dma_direct_mapping_error,
-   .get_required_mask = swiotlb_powerpc_get_required,
 };
 
 static int ppc_swiotlb_bus_notify(struct notifier_block *nb,
  unsigned long action, void *data)
 {
struct device *dev = data;
-   struct dev_archdata *sd;
 
/* We are only intereted in device addition */
if (action != BUS_NOTIFY_ADD_DEVICE)
return 0;
 
-   sd = >archdata;
-   sd->max_direct_dma_addr = 0;
-
/* May need to bounce if the device can't address all of DRAM */
if ((dma_get_mask(dev) + 1) < memblock_end_of_DRAM())
set_dma_ops(dev, _swiotlb_dma_ops);
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index f9f51fc505a1..59f38ca3975c 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -30,11 +30,10 @@
 static u64 __maybe_unused get_pfn_limit(struct device *dev)
 {
u64 pfn = (dev->coherent_dma_mask >> PAGE_SHIFT) + 1;
-   struct dev_archdata __maybe_unused *sd = >archdata;
 
 #ifdef CONFIG_SWIOTLB
-   if (sd->max_direct_dma_addr && dev->dma_ops == _swiotlb_dma_ops)
-   pfn = min_t(u64, pfn, sd->max_direct_dma_addr >> PAGE_SHIFT);
+   if (dev->bus_dma_mask && dev->dma_ops == _swiotlb_dma_ops)
+   pfn = min_t(u64, pfn, dev->bus_dma_mask >> PAGE_SHIFT);
 #endif
 
return pfn;
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index 561f97d698cc..f136567a5ed5 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -117,9 +117,8 @@ static u64 pci64_dma_offset;
 static void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev)
 {
struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-   struct dev_archdata *sd = >dev.archdata;
 
-   sd->max_direct_dma_addr =
+   pdev->dev.bus_dma_mask =
hose->dma_window_base_cur + hose->dma_window_size;
 }
 
-- 
2.19.1



[PATCH 19/34] cxl: drop the dma_set_mask callback from vphb

2018-11-14 Thread Christoph Hellwig
The CXL code never even looks at the dma mask, so there is no good
reason for this sanity check.  Remove it because it gets in the way
of the dma ops refactoring.

Signed-off-by: Christoph Hellwig 
---
 drivers/misc/cxl/vphb.c | 12 
 1 file changed, 12 deletions(-)

diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c
index 7908633d9204..49da2f744bbf 100644
--- a/drivers/misc/cxl/vphb.c
+++ b/drivers/misc/cxl/vphb.c
@@ -11,17 +11,6 @@
 #include 
 #include "cxl.h"
 
-static int cxl_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
-{
-   if (dma_mask < DMA_BIT_MASK(64)) {
-   pr_info("%s only 64bit DMA supported on CXL", __func__);
-   return -EIO;
-   }
-
-   *(pdev->dev.dma_mask) = dma_mask;
-   return 0;
-}
-
 static int cxl_pci_probe_mode(struct pci_bus *bus)
 {
return PCI_PROBE_NORMAL;
@@ -220,7 +209,6 @@ static struct pci_controller_ops cxl_pci_controller_ops =
.reset_secondary_bus = cxl_pci_reset_secondary_bus,
.setup_msi_irqs = cxl_setup_msi_irqs,
.teardown_msi_irqs = cxl_teardown_msi_irqs,
-   .dma_set_mask = cxl_dma_set_mask,
 };
 
 int cxl_pci_vphb_add(struct cxl_afu *afu)
-- 
2.19.1



[PATCH 24/34] powerpc/dma: move pci_dma_dev_setup_swiotlb to fsl_pci.c

2018-11-14 Thread Christoph Hellwig
pci_dma_dev_setup_swiotlb is only used by the fsl_pci code, and closely
related to it, so fsl_pci.c seems like a better place for it.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/swiotlb.h |  2 --
 arch/powerpc/kernel/dma-swiotlb.c  | 11 ---
 arch/powerpc/sysdev/fsl_pci.c  |  9 +
 3 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/swiotlb.h 
b/arch/powerpc/include/asm/swiotlb.h
index f65ecf57b66c..26a0f12b835b 100644
--- a/arch/powerpc/include/asm/swiotlb.h
+++ b/arch/powerpc/include/asm/swiotlb.h
@@ -18,8 +18,6 @@ extern const struct dma_map_ops powerpc_swiotlb_dma_ops;
 extern unsigned int ppc_swiotlb_enable;
 int __init swiotlb_setup_bus_notifier(void);
 
-extern void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev);
-
 #ifdef CONFIG_SWIOTLB
 void swiotlb_detect_4g(void);
 #else
diff --git a/arch/powerpc/kernel/dma-swiotlb.c 
b/arch/powerpc/kernel/dma-swiotlb.c
index 678811abccfc..38a2c9f5ab54 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -63,17 +63,6 @@ const struct dma_map_ops powerpc_swiotlb_dma_ops = {
.get_required_mask = swiotlb_powerpc_get_required,
 };
 
-void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev)
-{
-   struct pci_controller *hose;
-   struct dev_archdata *sd;
-
-   hose = pci_bus_to_host(pdev->bus);
-   sd = >dev.archdata;
-   sd->max_direct_dma_addr =
-   hose->dma_window_base_cur + hose->dma_window_size;
-}
-
 static int ppc_swiotlb_bus_notify(struct notifier_block *nb,
  unsigned long action, void *data)
 {
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index 918be816b097..561f97d698cc 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -114,6 +114,15 @@ static struct pci_ops fsl_indirect_pcie_ops =
 static u64 pci64_dma_offset;
 
 #ifdef CONFIG_SWIOTLB
+static void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev)
+{
+   struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+   struct dev_archdata *sd = >dev.archdata;
+
+   sd->max_direct_dma_addr =
+   hose->dma_window_base_cur + hose->dma_window_size;
+}
+
 static void setup_swiotlb_ops(struct pci_controller *hose)
 {
if (ppc_swiotlb_enable) {
-- 
2.19.1



[PATCH 18/34] powerpc/powernv: use the generic iommu bypass code

2018-11-14 Thread Christoph Hellwig
Use the generic iommu bypass code instead of overriding set_dma_mask.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 95 ++-
 1 file changed, 25 insertions(+), 70 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 1d9f446f3eff..23fd46cd2ab3 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1814,89 +1814,45 @@ static int pnv_pci_ioda_dma_64bit_bypass(struct 
pnv_ioda_pe *pe)
return -EIO;
 }
 
-static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
+static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
+   u64 dma_mask)
 {
struct pci_controller *hose = pci_bus_to_host(pdev->bus);
struct pnv_phb *phb = hose->private_data;
struct pci_dn *pdn = pci_get_pdn(pdev);
struct pnv_ioda_pe *pe;
-   uint64_t top;
-   bool bypass = false;
-   s64 rc;
 
if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
return -ENODEV;
 
pe = >ioda.pe_array[pdn->pe_number];
if (pe->tce_bypass_enabled) {
-   top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
-   bypass = (dma_mask >= top);
+   u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
+   if (dma_mask >= top)
+   return true;
}
 
-   if (bypass) {
-   dev_info(>dev, "Using 64-bit DMA iommu bypass\n");
-   set_dma_ops(>dev, _nommu_ops);
-   } else {
-   /*
-* If the device can't set the TCE bypass bit but still wants
-* to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
-* bypass the 32-bit region and be usable for 64-bit DMAs.
-* The device needs to be able to address all of this space.
-*/
-   if (dma_mask >> 32 &&
-   dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
-   /* pe->pdev should be set if it's a single device, pe->pbus 
if not */
-   (pe->device_count == 1 || !pe->pbus) &&
-   phb->model == PNV_PHB_MODEL_PHB3) {
-   /* Configure the bypass mode */
-   rc = pnv_pci_ioda_dma_64bit_bypass(pe);
-   if (rc)
-   return rc;
-   /* 4GB offset bypasses 32-bit space */
-   set_dma_offset(>dev, (1ULL << 32));
-   set_dma_ops(>dev, _nommu_ops);
-   } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) {
-   /*
-* Fail the request if a DMA mask between 32 and 64 bits
-* was requested but couldn't be fulfilled. Ideally we
-* would do this for 64-bits but historically we have
-* always fallen back to 32-bits.
-*/
-   return -ENOMEM;
-   } else {
-   dev_info(>dev, "Using 32-bit DMA via iommu\n");
-   set_dma_ops(>dev, _iommu_ops);
-   }
+   /*
+* If the device can't set the TCE bypass bit but still wants
+* to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
+* bypass the 32-bit region and be usable for 64-bit DMAs.
+* The device needs to be able to address all of this space.
+*/
+   if (dma_mask >> 32 &&
+   dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
+   /* pe->pdev should be set if it's a single device, pe->pbus if not 
*/
+   (pe->device_count == 1 || !pe->pbus) &&
+   phb->model == PNV_PHB_MODEL_PHB3) {
+   /* Configure the bypass mode */
+   s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe);
+   if (rc)
+   return rc;
+   /* 4GB offset bypasses 32-bit space */
+   set_dma_offset(>dev, (1ULL << 32));
+   return true;
}
-   *pdev->dev.dma_mask = dma_mask;
-
-   /* Update peer npu devices */
-   pnv_npu_try_dma_set_bypass(pdev, bypass);
-
-   return 0;
-}
-
-static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev)
-{
-   struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-   struct pnv_phb *phb = hose->private_data;
-   struct pci_dn *pdn = pci_get_pdn(pdev);
-   struct pnv_ioda_pe *pe;
-   u64 end, mask;
 
-   if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
-   return 0;
-
-   pe = >ioda.pe_array[pdn->pe_number];
-   if (!pe->tce_bypass_enabled)
-   return __dma_get_required_mask(>dev);
-
-
-   end = pe->tce_bypass_base + memblock_end_of_DRAM();
-   mask = 1ULL << (fls64(end) - 1);
-   mask 

[PATCH 22/34] powerpc/dma: remove the iommu fallback for coherent allocations

2018-11-14 Thread Christoph Hellwig
All iommu capable platforms now always use the iommu code with the
internal bypass, so there is no need for this magic anymore.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/Kconfig  |  4 ---
 arch/powerpc/kernel/dma.c | 68 ++-
 2 files changed, 2 insertions(+), 70 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c3613bc1..2d4a19bc8023 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -119,9 +119,6 @@ config GENERIC_HWEIGHT
bool
default y
 
-config ARCH_HAS_DMA_SET_COHERENT_MASK
-bool
-
 config PPC
bool
default y
@@ -129,7 +126,6 @@ config PPC
# Please keep this list sorted alphabetically.
#
select ARCH_HAS_DEVMEM_IS_ALLOWED
-   select ARCH_HAS_DMA_SET_COHERENT_MASK
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 829eb2fefc8c..f9f51fc505a1 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -114,51 +114,6 @@ void __dma_nommu_free_coherent(struct device *dev, size_t 
size,
 }
 #endif /* !CONFIG_NOT_COHERENT_CACHE */
 
-static void *dma_nommu_alloc_coherent(struct device *dev, size_t size,
-  dma_addr_t *dma_handle, gfp_t flag,
-  unsigned long attrs)
-{
-   struct iommu_table *iommu;
-
-   /* The coherent mask may be smaller than the real mask, check if
-* we can really use the direct ops
-*/
-   if (dma_nommu_dma_supported(dev, dev->coherent_dma_mask))
-   return __dma_nommu_alloc_coherent(dev, size, dma_handle,
-  flag, attrs);
-
-   /* Ok we can't ... do we have an iommu ? If not, fail */
-   iommu = get_iommu_table_base(dev);
-   if (!iommu)
-   return NULL;
-
-   /* Try to use the iommu */
-   return iommu_alloc_coherent(dev, iommu, size, dma_handle,
-   dev->coherent_dma_mask, flag,
-   dev_to_node(dev));
-}
-
-static void dma_nommu_free_coherent(struct device *dev, size_t size,
-void *vaddr, dma_addr_t dma_handle,
-unsigned long attrs)
-{
-   struct iommu_table *iommu;
-
-   /* See comments in dma_nommu_alloc_coherent() */
-   if (dma_nommu_dma_supported(dev, dev->coherent_dma_mask))
-   return __dma_nommu_free_coherent(dev, size, vaddr, dma_handle,
- attrs);
-   /* Maybe we used an iommu ... */
-   iommu = get_iommu_table_base(dev);
-
-   /* If we hit that we should have never allocated in the first
-* place so how come we are freeing ?
-*/
-   if (WARN_ON(!iommu))
-   return;
-   iommu_free_coherent(iommu, size, vaddr, dma_handle);
-}
-
 int dma_nommu_mmap_coherent(struct device *dev, struct vm_area_struct *vma,
 void *cpu_addr, dma_addr_t handle, size_t size,
 unsigned long attrs)
@@ -240,8 +195,8 @@ static inline void dma_nommu_sync_single(struct device *dev,
 #endif
 
 const struct dma_map_ops dma_nommu_ops = {
-   .alloc  = dma_nommu_alloc_coherent,
-   .free   = dma_nommu_free_coherent,
+   .alloc  = __dma_nommu_alloc_coherent,
+   .free   = __dma_nommu_free_coherent,
.mmap   = dma_nommu_mmap_coherent,
.map_sg = dma_nommu_map_sg,
.dma_supported  = dma_nommu_dma_supported,
@@ -255,25 +210,6 @@ const struct dma_map_ops dma_nommu_ops = {
 };
 EXPORT_SYMBOL(dma_nommu_ops);
 
-int dma_set_coherent_mask(struct device *dev, u64 mask)
-{
-   if (!dma_supported(dev, mask)) {
-   /*
-* We need to special case the direct DMA ops which can
-* support a fallback for coherent allocations. There
-* is no dma_op->set_coherent_mask() so we have to do
-* things the hard way:
-*/
-   if (get_dma_ops(dev) != _nommu_ops ||
-   get_iommu_table_base(dev) == NULL ||
-   !dma_iommu_dma_supported(dev, mask))
-   return -EIO;
-   }
-   dev->coherent_dma_mask = mask;
-   return 0;
-}
-EXPORT_SYMBOL(dma_set_coherent_mask);
-
 int dma_set_mask(struct device *dev, u64 dma_mask)
 {
if (ppc_md.dma_set_mask)
-- 
2.19.1



[PATCH 16/34] powerpc/powernv: remove pnv_pci_ioda_pe_single_vendor

2018-11-14 Thread Christoph Hellwig
This function is completely bogus - the fact that two PCIe devices come
from the same vendor has absolutely nothing to say about the DMA
capabilities and characteristics.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 28 ++-
 1 file changed, 2 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index dd807446801e..afbb73cd3c5b 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1745,31 +1745,6 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb 
*phb, struct pci_dev *pdev
 */
 }
 
-static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
-{
-   unsigned short vendor = 0;
-   struct pci_dev *pdev;
-
-   if (pe->device_count == 1)
-   return true;
-
-   /* pe->pdev should be set if it's a single device, pe->pbus if not */
-   if (!pe->pbus)
-   return true;
-
-   list_for_each_entry(pdev, >pbus->devices, bus_list) {
-   if (!vendor) {
-   vendor = pdev->vendor;
-   continue;
-   }
-
-   if (pdev->vendor != vendor)
-   return false;
-   }
-
-   return true;
-}
-
 /*
  * Reconfigure TVE#0 to be usable as 64-bit DMA space.
  *
@@ -1870,7 +1845,8 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev 
*pdev, u64 dma_mask)
 */
if (dma_mask >> 32 &&
dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
-   pnv_pci_ioda_pe_single_vendor(pe) &&
+   /* pe->pdev should be set if it's a single device, pe->pbus 
if not */
+   (pe->device_count == 1 || !pe->pbus) &&
phb->model == PNV_PHB_MODEL_PHB3) {
/* Configure the bypass mode */
rc = pnv_pci_ioda_dma_64bit_bypass(pe);
-- 
2.19.1



[PATCH 15/34] powerpc/dart: use the generic iommu bypass code

2018-11-14 Thread Christoph Hellwig
Use the generic iommu bypass code instead of overriding set_dma_mask.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/sysdev/dart_iommu.c | 45 +++-
 1 file changed, 15 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index 283ce04c5844..2681a19347ba 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -360,13 +360,6 @@ static void iommu_table_dart_setup(void)
set_bit(iommu_table_dart.it_size - 1, iommu_table_dart.it_map);
 }
 
-static void pci_dma_dev_setup_dart(struct pci_dev *dev)
-{
-   if (dart_is_u4)
-   set_dma_offset(>dev, DART_U4_BYPASS_BASE);
-   set_iommu_table_base(>dev, _table_dart);
-}
-
 static void pci_dma_bus_setup_dart(struct pci_bus *bus)
 {
if (!iommu_table_dart_inited) {
@@ -390,27 +383,16 @@ static bool dart_device_on_pcie(struct device *dev)
return false;
 }
 
-static int dart_dma_set_mask(struct device *dev, u64 dma_mask)
+static void pci_dma_dev_setup_dart(struct pci_dev *dev)
 {
-   if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-   return -EIO;
-
-   /* U4 supports a DART bypass, we use it for 64-bit capable
-* devices to improve performances. However, that only works
-* for devices connected to U4 own PCIe interface, not bridged
-* through hypertransport. We need the device to support at
-* least 40 bits of addresses.
-*/
-   if (dart_device_on_pcie(dev) && dma_mask >= DMA_BIT_MASK(40)) {
-   dev_info(dev, "Using 64-bit DMA iommu bypass\n");
-   set_dma_ops(dev, _nommu_ops);
-   } else {
-   dev_info(dev, "Using 32-bit DMA via iommu\n");
-   set_dma_ops(dev, _iommu_ops);
-   }
+   if (dart_is_u4 && dart_device_on_pcie(>dev))
+   set_dma_offset(>dev, DART_U4_BYPASS_BASE);
+   set_iommu_table_base(>dev, _table_dart);
+}
 
-   *dev->dma_mask = dma_mask;
-   return 0;
+static bool iommu_bypass_supported_dart(struct pci_dev *dev, u64 mask)
+{
+   return dart_is_u4 && dart_device_on_pcie(>dev);
 }
 
 void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops)
@@ -430,12 +412,15 @@ void __init iommu_init_early_dart(struct 
pci_controller_ops *controller_ops)
if (dart_init(dn) != 0)
return;
 
-   /* Setup bypass if supported */
-   if (dart_is_u4)
-   ppc_md.dma_set_mask = dart_dma_set_mask;
-
+   /*
+* U4 supports a DART bypass, we use it for 64-bit capable devices to
+* improve performance.  However, that only works for devices connected
+* to the U4 own PCIe interface, not bridged through hypertransport.
+* We need the device to support at least 40 bits of addresses.
+*/
controller_ops->dma_dev_setup = pci_dma_dev_setup_dart;
controller_ops->dma_bus_setup = pci_dma_bus_setup_dart;
+   controller_ops->iommu_bypass_supported = iommu_bypass_supported_dart;
 
/* Setup pci_dma ops */
set_pci_dma_ops(_iommu_ops);
-- 
2.19.1



[PATCH 14/34] powerpc/dart: remove dead cleanup code in iommu_init_early_dart

2018-11-14 Thread Christoph Hellwig
If dart_init failed we didn't have a chance to set up dma or controller
ops yet, so there is no point in resetting them.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/sysdev/dart_iommu.c | 11 +--
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index a5b40d1460f1..283ce04c5844 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -428,7 +428,7 @@ void __init iommu_init_early_dart(struct pci_controller_ops 
*controller_ops)
 
/* Initialize the DART HW */
if (dart_init(dn) != 0)
-   goto bail;
+   return;
 
/* Setup bypass if supported */
if (dart_is_u4)
@@ -439,15 +439,6 @@ void __init iommu_init_early_dart(struct 
pci_controller_ops *controller_ops)
 
/* Setup pci_dma ops */
set_pci_dma_ops(_iommu_ops);
-   return;
-
- bail:
-   /* If init failed, use direct iommu and null setup functions */
-   controller_ops->dma_dev_setup = NULL;
-   controller_ops->dma_bus_setup = NULL;
-
-   /* Setup pci_dma ops */
-   set_pci_dma_ops(_nommu_ops);
 }
 
 #ifdef CONFIG_PM
-- 
2.19.1



[PATCH 17/34] powerpc/powernv: remove pnv_npu_dma_set_mask

2018-11-14 Thread Christoph Hellwig
These devices are not PCIe devices and do not have associated dma map
ops, so this is just dead code.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 9 -
 1 file changed, 9 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index afbb73cd3c5b..1d9f446f3eff 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3688,14 +3688,6 @@ static const struct pci_controller_ops 
pnv_pci_ioda_controller_ops = {
.shutdown   = pnv_pci_ioda_shutdown,
 };
 
-static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask)
-{
-   dev_err_once(>dev,
-   "%s operation unsupported for NVLink devices\n",
-   __func__);
-   return -EPERM;
-}
-
 static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
.dma_dev_setup  = pnv_pci_dma_dev_setup,
 #ifdef CONFIG_PCI_MSI
@@ -3705,7 +3697,6 @@ static const struct pci_controller_ops 
pnv_npu_ioda_controller_ops = {
.enable_device_hook = pnv_pci_enable_device_hook,
.window_alignment   = pnv_pci_window_alignment,
.reset_secondary_bus= pnv_pci_reset_secondary_bus,
-   .dma_set_mask   = pnv_npu_dma_set_mask,
.shutdown   = pnv_pci_ioda_shutdown,
 };
 
-- 
2.19.1



[PATCH 13/34] powerpc/cell: use the generic iommu bypass code

2018-11-14 Thread Christoph Hellwig
This gets rid of a lot of clumsy code and finally allows us to mark
dma_iommu_ops const.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/dma-mapping.h |   2 +-
 arch/powerpc/include/asm/iommu.h   |   6 ++
 arch/powerpc/kernel/dma-iommu.c|   7 +-
 arch/powerpc/platforms/cell/iommu.c| 143 ++---
 4 files changed, 22 insertions(+), 136 deletions(-)

diff --git a/arch/powerpc/include/asm/dma-mapping.h 
b/arch/powerpc/include/asm/dma-mapping.h
index 824f55995a18..140ce5ad3120 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -74,7 +74,7 @@ static inline unsigned long device_to_mask(struct device *dev)
  * Available generic sets of operations
  */
 #ifdef CONFIG_PPC64
-extern struct dma_map_ops dma_iommu_ops;
+extern const struct dma_map_ops dma_iommu_ops;
 #endif
 extern const struct dma_map_ops dma_nommu_ops;
 
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 75daa10f31a4..5128aac8e165 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -326,5 +326,11 @@ extern void iommu_release_ownership(struct iommu_table 
*tbl);
 extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
 extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
 
+#ifdef CONFIG_PPC_CELL_NATIVE
+extern bool iommu_fixed_is_weak;
+#else
+#define iommu_fixed_is_weak false
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 459be16f8334..4937b39e246b 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -20,14 +20,15 @@
  */
 static inline bool dma_iommu_alloc_bypass(struct device *dev)
 {
-   return dev->archdata.iommu_bypass &&
+   return dev->archdata.iommu_bypass && !iommu_fixed_is_weak &&
dma_nommu_dma_supported(dev, dev->coherent_dma_mask);
 }
 
 static inline bool dma_iommu_map_bypass(struct device *dev,
unsigned long attrs)
 {
-   return dev->archdata.iommu_bypass;
+   return dev->archdata.iommu_bypass &&
+   (!iommu_fixed_is_weak || (attrs & DMA_ATTR_WEAK_ORDERING));
 }
 
 /* Allocates a contiguous real buffer and creates mappings over it.
@@ -168,7 +169,7 @@ int dma_iommu_mapping_error(struct device *dev, dma_addr_t 
dma_addr)
return dma_addr == IOMMU_MAPPING_ERROR;
 }
 
-struct dma_map_ops dma_iommu_ops = {
+const struct dma_map_ops dma_iommu_ops = {
.alloc  = dma_iommu_alloc_coherent,
.free   = dma_iommu_free_coherent,
.mmap   = dma_nommu_mmap_coherent,
diff --git a/arch/powerpc/platforms/cell/iommu.c 
b/arch/powerpc/platforms/cell/iommu.c
index cce5bf9515e5..fb51f78035ce 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -546,7 +546,7 @@ static unsigned long cell_dma_nommu_offset;
 static unsigned long dma_iommu_fixed_base;
 
 /* iommu_fixed_is_weak is set if booted with iommu_fixed=weak */
-static int iommu_fixed_is_weak;
+bool iommu_fixed_is_weak;
 
 static struct iommu_table *cell_get_iommu_table(struct device *dev)
 {
@@ -568,95 +568,6 @@ static struct iommu_table *cell_get_iommu_table(struct 
device *dev)
return >table;
 }
 
-/* A coherent allocation implies strong ordering */
-
-static void *dma_fixed_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag,
- unsigned long attrs)
-{
-   if (iommu_fixed_is_weak)
-   return iommu_alloc_coherent(dev, cell_get_iommu_table(dev),
-   size, dma_handle,
-   device_to_mask(dev), flag,
-   dev_to_node(dev));
-   else
-   return dma_nommu_ops.alloc(dev, size, dma_handle, flag,
-   attrs);
-}
-
-static void dma_fixed_free_coherent(struct device *dev, size_t size,
-   void *vaddr, dma_addr_t dma_handle,
-   unsigned long attrs)
-{
-   if (iommu_fixed_is_weak)
-   iommu_free_coherent(cell_get_iommu_table(dev), size, vaddr,
-   dma_handle);
-   else
-   dma_nommu_ops.free(dev, size, vaddr, dma_handle, attrs);
-}
-
-static dma_addr_t dma_fixed_map_page(struct device *dev, struct page *page,
-unsigned long offset, size_t size,
-enum dma_data_direction direction,
-unsigned long attrs)
-{
-   if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING))
-   return dma_nommu_ops.map_page(dev, page, offset, size,
-   

[PATCH 12/34] powerpc/cell: move dma direct window setup out of dma_configure

2018-11-14 Thread Christoph Hellwig
Configure the dma settings at device setup time, and stop playing games
with get_pci_dma_ops.  This prepares for using the common dma_configure
code later on.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/platforms/cell/iommu.c | 20 +++-
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/cell/iommu.c 
b/arch/powerpc/platforms/cell/iommu.c
index 12352a58072a..cce5bf9515e5 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -657,14 +657,21 @@ static const struct dma_map_ops dma_iommu_fixed_ops = {
.mapping_error  = dma_iommu_mapping_error,
 };
 
+static u64 cell_iommu_get_fixed_address(struct device *dev);
+
 static void cell_dma_dev_setup(struct device *dev)
 {
-   if (get_pci_dma_ops() == _iommu_ops)
+   if (get_pci_dma_ops() == _iommu_ops) {
+   u64 addr = cell_iommu_get_fixed_address(dev);
+
+   if (addr != OF_BAD_ADDR)
+   set_dma_offset(dev, addr + dma_iommu_fixed_base);
set_iommu_table_base(dev, cell_get_iommu_table(dev));
-   else if (get_pci_dma_ops() == _nommu_ops)
+   } else if (get_pci_dma_ops() == _nommu_ops) {
set_dma_offset(dev, cell_dma_nommu_offset);
-   else
+   } else {
BUG();
+   }
 }
 
 static void cell_pci_dma_dev_setup(struct pci_dev *dev)
@@ -950,19 +957,14 @@ static int dma_suported_and_switch(struct device *dev, 
u64 dma_mask)
 {
if (dma_mask == DMA_BIT_MASK(64) &&
cell_iommu_get_fixed_address(dev) != OF_BAD_ADDR) {
-   u64 addr = cell_iommu_get_fixed_address(dev) +
-   dma_iommu_fixed_base;
dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
-   dev_dbg(dev, "iommu: fixed addr = %llx\n", addr);
set_dma_ops(dev, _iommu_fixed_ops);
-   set_dma_offset(dev, addr);
return 1;
}
 
if (dma_iommu_dma_supported(dev, dma_mask)) {
dev_dbg(dev, "iommu: not 64-bit, using default ops\n");
-   set_dma_ops(dev, get_pci_dma_ops());
-   cell_dma_dev_setup(dev);
+   set_dma_ops(dev, _iommu_ops);
return 1;
}
 
-- 
2.19.1



[PATCH 11/34] powerpc/pseries: use the generic iommu bypass code

2018-11-14 Thread Christoph Hellwig
Use the generic iommu bypass code instead of overriding set_dma_mask.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/platforms/pseries/iommu.c | 100 +++--
 1 file changed, 27 insertions(+), 73 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index da5716de7f4c..8965d174c53b 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -973,7 +973,7 @@ static LIST_HEAD(failed_ddw_pdn_list);
  * pdn: the parent pe node with the ibm,dma_window property
  * Future: also check if we can remap the base window for our base page size
  *
- * returns the dma offset for use by dma_set_mask
+ * returns the dma offset for use by the direct mapped DMA code.
  */
 static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
@@ -1193,87 +1193,40 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev 
*dev)
iommu_add_device(>dev);
 }
 
-static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
+static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 
dma_mask)
 {
-   bool ddw_enabled = false;
-   struct device_node *pdn, *dn;
-   struct pci_dev *pdev;
+   struct device_node *dn = pci_device_to_OF_node(pdev), *pdn;
const __be32 *dma_window = NULL;
u64 dma_offset;
 
-   if (!dev->dma_mask)
-   return -EIO;
-
-   if (!dev_is_pci(dev))
-   goto check_mask;
-
-   pdev = to_pci_dev(dev);
-
/* only attempt to use a new window if 64-bit DMA is requested */
-   if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) {
-   dn = pci_device_to_OF_node(pdev);
-   dev_dbg(dev, "node is %pOF\n", dn);
+   if (dma_mask < DMA_BIT_MASK(64))
+   return false;
 
-   /*
-* the device tree might contain the dma-window properties
-* per-device and not necessarily for the bus. So we need to
-* search upwards in the tree until we either hit a dma-window
-* property, OR find a parent with a table already allocated.
-*/
-   for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
-   pdn = pdn->parent) {
-   dma_window = of_get_property(pdn, "ibm,dma-window", 
NULL);
-   if (dma_window)
-   break;
-   }
-   if (pdn && PCI_DN(pdn)) {
-   dma_offset = enable_ddw(pdev, pdn);
-   if (dma_offset != 0) {
-   dev_info(dev, "Using 64-bit direct DMA at 
offset %llx\n", dma_offset);
-   set_dma_offset(dev, dma_offset);
-   set_dma_ops(dev, _nommu_ops);
-   ddw_enabled = true;
-   }
-   }
-   }
+   dev_dbg(>dev, "node is %pOF\n", dn);
 
-   /* fall back on iommu ops */
-   if (!ddw_enabled && get_dma_ops(dev) != _iommu_ops) {
-   dev_info(dev, "Restoring 32-bit DMA via iommu\n");
-   set_dma_ops(dev, _iommu_ops);
+   /*
+* the device tree might contain the dma-window properties
+* per-device and not necessarily for the bus. So we need to
+* search upwards in the tree until we either hit a dma-window
+* property, OR find a parent with a table already allocated.
+*/
+   for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
+   pdn = pdn->parent) {
+   dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+   if (dma_window)
+   break;
}
 
-check_mask:
-   if (!dma_supported(dev, dma_mask))
-   return -EIO;
-
-   *dev->dma_mask = dma_mask;
-   return 0;
-}
-
-static u64 dma_get_required_mask_pSeriesLP(struct device *dev)
-{
-   if (!dev->dma_mask)
-   return 0;
-
-   if (!disable_ddw && dev_is_pci(dev)) {
-   struct pci_dev *pdev = to_pci_dev(dev);
-   struct device_node *dn;
-
-   dn = pci_device_to_OF_node(pdev);
-
-   /* search upwards for ibm,dma-window */
-   for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group;
-   dn = dn->parent)
-   if (of_get_property(dn, "ibm,dma-window", NULL))
-   break;
-   /* if there is a ibm,ddw-applicable property require 64 bits */
-   if (dn && PCI_DN(dn) &&
-   of_get_property(dn, "ibm,ddw-applicable", NULL))
-   return DMA_BIT_MASK(64);
+   if (pdn && PCI_DN(pdn)) {
+   dma_offset = enable_ddw(pdev, pdn);
+   if (dma_offset != 0) {
+   set_dma_offset(>dev, dma_offset);
+ 

[PATCH 05/34] powerpc/dma: remove the unused dma_iommu_ops export

2018-11-14 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/kernel/dma-iommu.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index f9fe2080ceb9..2ca6cfaebf65 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -6,7 +6,6 @@
  * busses using the iommu infrastructure
  */
 
-#include 
 #include 
 
 /*
@@ -123,4 +122,3 @@ struct dma_map_ops dma_iommu_ops = {
.get_required_mask  = dma_iommu_get_required_mask,
.mapping_error  = dma_iommu_mapping_error,
 };
-EXPORT_SYMBOL(dma_iommu_ops);
-- 
2.19.1



[PATCH 08/34] powerpc/dma: untangle vio_dma_mapping_ops from dma_iommu_ops

2018-11-14 Thread Christoph Hellwig
vio_dma_mapping_ops currently does a lot of indirect calls through
dma_iommu_ops, which not only make the code harder to follow but are
also expensive in the post-spectre world.  Unwind the indirect calls
by calling the ppc_iommu_* or iommu_* APIs directly where applicable, or
just use the dma_iommu_* methods directly where we can.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/iommu.h |  1 +
 arch/powerpc/kernel/dma-iommu.c  |  2 +-
 arch/powerpc/platforms/pseries/vio.c | 87 
 3 files changed, 38 insertions(+), 52 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 35db0cbc9222..75daa10f31a4 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -242,6 +242,7 @@ static inline int __init tce_iommu_bus_notifier_init(void)
 }
 #endif /* !CONFIG_IOMMU_API */
 
+u64 dma_iommu_get_required_mask(struct device *dev);
 int dma_iommu_mapping_error(struct device *dev, dma_addr_t dma_addr);
 
 #else
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 2ca6cfaebf65..0613278abf9f 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -92,7 +92,7 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask)
return 1;
 }
 
-static u64 dma_iommu_get_required_mask(struct device *dev)
+u64 dma_iommu_get_required_mask(struct device *dev)
 {
struct iommu_table *tbl = get_iommu_table_base(dev);
u64 mask;
diff --git a/arch/powerpc/platforms/pseries/vio.c 
b/arch/powerpc/platforms/pseries/vio.c
index 88f1ad1d6309..ea3a9745c812 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -492,7 +492,9 @@ static void *vio_dma_iommu_alloc_coherent(struct device 
*dev, size_t size,
return NULL;
}
 
-   ret = dma_iommu_ops.alloc(dev, size, dma_handle, flag, attrs);
+   ret = iommu_alloc_coherent(dev, get_iommu_table_base(dev), size,
+   dma_handle, dev->coherent_dma_mask, flag,
+   dev_to_node(dev));
if (unlikely(ret == NULL)) {
vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE));
	atomic_inc(&viodev->cmo.allocs_failed);
@@ -507,8 +509,7 @@ static void vio_dma_iommu_free_coherent(struct device *dev, 
size_t size,
 {
struct vio_dev *viodev = to_vio_dev(dev);
 
-   dma_iommu_ops.free(dev, size, vaddr, dma_handle, attrs);
-
+   iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle);
vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE));
 }
 
@@ -518,22 +519,22 @@ static dma_addr_t vio_dma_iommu_map_page(struct device 
*dev, struct page *page,
  unsigned long attrs)
 {
struct vio_dev *viodev = to_vio_dev(dev);
-   struct iommu_table *tbl;
+   struct iommu_table *tbl = get_iommu_table_base(dev);
dma_addr_t ret = IOMMU_MAPPING_ERROR;
 
-   tbl = get_iommu_table_base(dev);
-   if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl {
-   atomic_inc(>cmo.allocs_failed);
-   return ret;
-   }
-
-   ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs);
-   if (unlikely(dma_mapping_error(dev, ret))) {
-   vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
-   atomic_inc(&viodev->cmo.allocs_failed);
-   }
-
+   if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))))
+   goto out_fail;
+   ret = iommu_map_page(dev, tbl, page, offset, size, device_to_mask(dev),
+   direction, attrs);
+   if (unlikely(ret == IOMMU_MAPPING_ERROR))
+   goto out_deallocate;
return ret;
+
+out_deallocate:
+   vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
+out_fail:
+   atomic_inc(&viodev->cmo.allocs_failed);
+   return IOMMU_MAPPING_ERROR;
 }
 
 static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
@@ -542,11 +543,9 @@ static void vio_dma_iommu_unmap_page(struct device *dev, 
dma_addr_t dma_handle,
 unsigned long attrs)
 {
struct vio_dev *viodev = to_vio_dev(dev);
-   struct iommu_table *tbl;
-
-   tbl = get_iommu_table_base(dev);
-   dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs);
+   struct iommu_table *tbl = get_iommu_table_base(dev);
 
+   iommu_unmap_page(tbl, dma_handle, size, direction, attrs);
vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
 }
 
@@ -555,34 +554,32 @@ static int vio_dma_iommu_map_sg(struct device *dev, 
struct scatterlist *sglist,
 unsigned long attrs)
 {
struct vio_dev *viodev = to_vio_dev(dev);
-   struct iommu_table *tbl;
+   struct iommu_table *tbl = get_iommu_table_base(dev);
struct 

[PATCH 02/34] powerpc: allow NOT_COHERENT_CACHE for amigaone

2018-11-14 Thread Christoph Hellwig
AMIGAONE selects NOT_COHERENT_CACHE, so we had better allow it.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/platforms/Kconfig.cputype | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index f4e2c5729374..6fedbf349fce 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -412,7 +412,8 @@ config NR_CPUS
 
 config NOT_COHERENT_CACHE
bool
-   depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || GAMECUBE_COMMON
+   depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \
+   GAMECUBE_COMMON || AMIGAONE
default n if PPC_47x
default y
 
-- 
2.19.1



[PATCH 06/34] powerpc/dma: split the two __dma_alloc_coherent implementations

2018-11-14 Thread Christoph Hellwig
The implementation for the CONFIG_NOT_COHERENT_CACHE case doesn't share
any code with the one for systems with coherent caches.  Split it off
and merge it with the helpers in dma-noncoherent.c that have no other
callers.

Signed-off-by: Christoph Hellwig 
Acked-by: Benjamin Herrenschmidt 
---
 arch/powerpc/include/asm/dma-mapping.h |  5 -
 arch/powerpc/kernel/dma.c  | 14 ++
 arch/powerpc/mm/dma-noncoherent.c  | 15 +++
 arch/powerpc/platforms/44x/warp.c  |  2 +-
 4 files changed, 10 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/dma-mapping.h 
b/arch/powerpc/include/asm/dma-mapping.h
index f2a4a7142b1e..dacd0f93f2b2 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -39,9 +39,6 @@ extern int dma_nommu_mmap_coherent(struct device *dev,
  * to ensure it is consistent.
  */
 struct device;
-extern void *__dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *handle, gfp_t gfp);
-extern void __dma_free_coherent(size_t size, void *vaddr);
 extern void __dma_sync(void *vaddr, size_t size, int direction);
 extern void __dma_sync_page(struct page *page, unsigned long offset,
 size_t size, int direction);
@@ -52,8 +49,6 @@ extern unsigned long __dma_get_coherent_pfn(unsigned long 
cpu_addr);
  * Cache coherent cores.
  */
 
-#define __dma_alloc_coherent(dev, gfp, size, handle)   NULL
-#define __dma_free_coherent(size, addr)((void)0)
 #define __dma_sync(addr, size, rw) ((void)0)
 #define __dma_sync_page(pg, off, sz, rw)   ((void)0)
 
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 6551685a4ed0..d6deb458bb91 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -62,18 +62,12 @@ static int dma_nommu_dma_supported(struct device *dev, u64 
mask)
 #endif
 }
 
+#ifndef CONFIG_NOT_COHERENT_CACHE
 void *__dma_nommu_alloc_coherent(struct device *dev, size_t size,
  dma_addr_t *dma_handle, gfp_t flag,
  unsigned long attrs)
 {
void *ret;
-#ifdef CONFIG_NOT_COHERENT_CACHE
-   ret = __dma_alloc_coherent(dev, size, dma_handle, flag);
-   if (ret == NULL)
-   return NULL;
-   *dma_handle += get_dma_offset(dev);
-   return ret;
-#else
struct page *page;
int node = dev_to_node(dev);
 #ifdef CONFIG_FSL_SOC
@@ -110,19 +104,15 @@ void *__dma_nommu_alloc_coherent(struct device *dev, 
size_t size,
*dma_handle = __pa(ret) + get_dma_offset(dev);
 
return ret;
-#endif
 }
 
 void __dma_nommu_free_coherent(struct device *dev, size_t size,
void *vaddr, dma_addr_t dma_handle,
unsigned long attrs)
 {
-#ifdef CONFIG_NOT_COHERENT_CACHE
-   __dma_free_coherent(size, vaddr);
-#else
free_pages((unsigned long)vaddr, get_order(size));
-#endif
 }
+#endif /* !CONFIG_NOT_COHERENT_CACHE */
 
 static void *dma_nommu_alloc_coherent(struct device *dev, size_t size,
   dma_addr_t *dma_handle, gfp_t flag,
diff --git a/arch/powerpc/mm/dma-noncoherent.c 
b/arch/powerpc/mm/dma-noncoherent.c
index b6e7b5952ab5..e955539686a4 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -29,7 +29,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 
 #include 
@@ -151,8 +151,8 @@ static struct ppc_vm_region *ppc_vm_region_find(struct 
ppc_vm_region *head, unsi
  * Allocate DMA-coherent memory space and return both the kernel remapped
  * virtual and bus address for that space.
  */
-void *
-__dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, 
gfp_t gfp)
+void *__dma_nommu_alloc_coherent(struct device *dev, size_t size,
+   dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
struct page *page;
struct ppc_vm_region *c;
@@ -223,7 +223,7 @@ __dma_alloc_coherent(struct device *dev, size_t size, 
dma_addr_t *handle, gfp_t
/*
 * Set the "dma handle"
 */
-   *handle = page_to_phys(page);
+   *dma_handle = phys_to_dma(dev, page_to_phys(page));
 
do {
SetPageReserved(page);
@@ -249,12 +249,12 @@ __dma_alloc_coherent(struct device *dev, size_t size, 
dma_addr_t *handle, gfp_t
  no_page:
return NULL;
 }
-EXPORT_SYMBOL(__dma_alloc_coherent);
 
 /*
  * free a page as defined by the above mapping.
  */
-void __dma_free_coherent(size_t size, void *vaddr)
+void __dma_nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
+   dma_addr_t dma_handle, unsigned long attrs)
 {
struct ppc_vm_region *c;
unsigned long flags, addr;
@@ -309,7 +309,6 @@ void __dma_free_coherent(size_t size, void *vaddr)
   __func__, 

[PATCH 10/34] powerpc/pseries: unwind dma_get_required_mask_pSeriesLP a bit

2018-11-14 Thread Christoph Hellwig
Call dma_get_required_mask_pSeriesLP directly instead of dma_iommu_ops
to simplify the code a bit.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/platforms/pseries/iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 06f02960b439..da5716de7f4c 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1273,7 +1273,7 @@ static u64 dma_get_required_mask_pSeriesLP(struct device 
*dev)
return DMA_BIT_MASK(64);
}
 
-   return dma_iommu_ops.get_required_mask(dev);
+   return dma_iommu_get_required_mask(dev);
 }
 
 static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
-- 
2.19.1



[PATCH 09/34] powerpc/dma: handle iommu bypass in dma_iommu_ops

2018-11-14 Thread Christoph Hellwig
Add a new iommu_bypass flag to struct dev_archdata so that the dma_iommu
implementation can handle the direct mapping transparently instead of
switching ops around.  Setting of this flag is controlled by a new
pci_controller_ops method.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/device.h  |  5 ++
 arch/powerpc/include/asm/dma-mapping.h |  8 +++
 arch/powerpc/include/asm/pci-bridge.h  |  2 +
 arch/powerpc/kernel/dma-iommu.c| 70 +++---
 arch/powerpc/kernel/dma.c  | 19 +++
 5 files changed, 87 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/device.h 
b/arch/powerpc/include/asm/device.h
index 0245bfcaac32..1aa53318b4bc 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -19,6 +19,11 @@ struct iommu_table;
  * drivers/macintosh/macio_asic.c
  */
 struct dev_archdata {
+   /*
+* Set to %true if the dma_iommu_ops are requested to use a direct
+* window instead of dynamically mapping memory.
+*/
+   booliommu_bypass : 1;
/*
 * These two used to be a union. However, with the hybrid ops we need
 * both so here we store both a DMA offset for direct mappings and
diff --git a/arch/powerpc/include/asm/dma-mapping.h 
b/arch/powerpc/include/asm/dma-mapping.h
index dacd0f93f2b2..824f55995a18 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -29,6 +29,14 @@ extern int dma_nommu_mmap_coherent(struct device *dev,
struct vm_area_struct *vma,
void *cpu_addr, dma_addr_t handle,
size_t size, unsigned long attrs);
+int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl,
+   int nents, enum dma_data_direction direction,
+   unsigned long attrs);
+dma_addr_t dma_nommu_map_page(struct device *dev, struct page *page,
+   unsigned long offset, size_t size,
+   enum dma_data_direction dir, unsigned long attrs);
+int dma_nommu_dma_supported(struct device *dev, u64 mask);
+u64 dma_nommu_get_required_mask(struct device *dev);
 
 #ifdef CONFIG_NOT_COHERENT_CACHE
 /*
diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 94d449031b18..5c7a1e7ffc8a 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -19,6 +19,8 @@ struct device_node;
 struct pci_controller_ops {
void(*dma_dev_setup)(struct pci_dev *pdev);
void(*dma_bus_setup)(struct pci_bus *bus);
+   bool(*iommu_bypass_supported)(struct pci_dev *pdev,
+   u64 mask);
 
int (*probe_mode)(struct pci_bus *bus);
 
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 0613278abf9f..459be16f8334 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -6,12 +6,30 @@
  * busses using the iommu infrastructure
  */
 
+#include 
+#include 
 #include 
 
 /*
  * Generic iommu implementation
  */
 
+/*
+ * The coherent mask may be smaller than the real mask, check if we can
+ * really use a direct window.
+ */
+static inline bool dma_iommu_alloc_bypass(struct device *dev)
+{
+   return dev->archdata.iommu_bypass &&
+   dma_nommu_dma_supported(dev, dev->coherent_dma_mask);
+}
+
+static inline bool dma_iommu_map_bypass(struct device *dev,
+   unsigned long attrs)
+{
+   return dev->archdata.iommu_bypass;
+}
+
 /* Allocates a contiguous real buffer and creates mappings over it.
  * Returns the virtual address of the buffer and sets dma_handle
  * to the dma address (mapping) of the first page.
@@ -20,6 +38,9 @@ static void *dma_iommu_alloc_coherent(struct device *dev, 
size_t size,
  dma_addr_t *dma_handle, gfp_t flag,
  unsigned long attrs)
 {
+   if (dma_iommu_alloc_bypass(dev))
+   return __dma_nommu_alloc_coherent(dev, size, dma_handle, flag,
+   attrs);
return iommu_alloc_coherent(dev, get_iommu_table_base(dev), size,
dma_handle, dev->coherent_dma_mask, flag,
dev_to_node(dev));
@@ -29,7 +50,11 @@ static void dma_iommu_free_coherent(struct device *dev, 
size_t size,
void *vaddr, dma_addr_t dma_handle,
unsigned long attrs)
 {
-   iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle);
+   if (dma_iommu_alloc_bypass(dev))
+   __dma_nommu_free_coherent(dev, size, vaddr, dma_handle, attrs);
+   else
+   iommu_free_coherent(get_iommu_table_base(dev), size, vaddr,
+   

[PATCH 04/34] powerpc/dma: remove the unused ISA_DMA_THRESHOLD export

2018-11-14 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
Acked-by: Benjamin Herrenschmidt 
---
 arch/powerpc/kernel/setup_32.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 81909600013a..07f7e6aaf104 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -59,7 +59,6 @@ unsigned long ISA_DMA_THRESHOLD;
 unsigned int DMA_MODE_READ;
 unsigned int DMA_MODE_WRITE;
 
-EXPORT_SYMBOL(ISA_DMA_THRESHOLD);
 EXPORT_SYMBOL(DMA_MODE_READ);
 EXPORT_SYMBOL(DMA_MODE_WRITE);
 
-- 
2.19.1



[PATCH 07/34] powerpc/dma: remove the no-op dma_nommu_unmap_{page, sg} routines

2018-11-14 Thread Christoph Hellwig
These methods are optional, no need to implement no-op versions.

Signed-off-by: Christoph Hellwig 
Acked-by: Benjamin Herrenschmidt 
---
 arch/powerpc/kernel/dma.c | 16 
 1 file changed, 16 deletions(-)

diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index d6deb458bb91..7078d72baec2 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -197,12 +197,6 @@ static int dma_nommu_map_sg(struct device *dev, struct 
scatterlist *sgl,
return nents;
 }
 
-static void dma_nommu_unmap_sg(struct device *dev, struct scatterlist *sg,
-   int nents, enum dma_data_direction direction,
-   unsigned long attrs)
-{
-}
-
 static u64 dma_nommu_get_required_mask(struct device *dev)
 {
u64 end, mask;
@@ -228,14 +222,6 @@ static inline dma_addr_t dma_nommu_map_page(struct device 
*dev,
return page_to_phys(page) + offset + get_dma_offset(dev);
 }
 
-static inline void dma_nommu_unmap_page(struct device *dev,
-dma_addr_t dma_address,
-size_t size,
-enum dma_data_direction direction,
-unsigned long attrs)
-{
-}
-
 #ifdef CONFIG_NOT_COHERENT_CACHE
 static inline void dma_nommu_sync_sg(struct device *dev,
struct scatterlist *sgl, int nents,
@@ -261,10 +247,8 @@ const struct dma_map_ops dma_nommu_ops = {
.free   = dma_nommu_free_coherent,
.mmap   = dma_nommu_mmap_coherent,
.map_sg = dma_nommu_map_sg,
-   .unmap_sg   = dma_nommu_unmap_sg,
.dma_supported  = dma_nommu_dma_supported,
.map_page   = dma_nommu_map_page,
-   .unmap_page = dma_nommu_unmap_page,
.get_required_mask  = dma_nommu_get_required_mask,
 #ifdef CONFIG_NOT_COHERENT_CACHE
.sync_single_for_cpu= dma_nommu_sync_single,
-- 
2.19.1