Re: [PATCH v6 22/24] mm: Speculative page fault handler return VMA

2018-01-12 Thread Matthew Wilcox
On Fri, Jan 12, 2018 at 11:02:51AM -0800, Matthew Wilcox wrote:
> On Fri, Jan 12, 2018 at 06:26:06PM +0100, Laurent Dufour wrote:
> > @@ -1354,7 +1354,10 @@ extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
> > unsigned int flags);
> >  #ifdef CONFIG_SPF
> >  extern int handle_speculative_fault(struct mm_struct *mm,
> > +   unsigned long address, unsigned int flags,
> > +   struct vm_area_struct **vma);
> 
> I think this shows that we need to create 'struct vm_fault' on the stack
> in the arch code and then pass it to handle_speculative_fault(), followed
> by handle_mm_fault().  That should be quite a nice cleanup actually.
> I know that's only 30+ architectures to change ;-)

Of course, we don't need to change them all.  Try this:

Subject: [PATCH] Add vm_handle_fault

For the speculative fault handler, we want to create the struct vm_fault
on the stack in the arch code and pass it into the generic mm code.
To avoid changing 30+ architectures, leave handle_mm_fault with its
current function signature and move its guts into the new vm_handle_fault
function.  Even this saves a nice 172 bytes on the random x86-64 .config
I happen to have around.
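
Roughly, with the names from the changelog, the wrapper would look like
this sketch (omitting the accounting work handle_mm_fault also does):

int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
		unsigned int flags)
{
	struct vm_fault vmf = {
		.vma = vma,
		.address = address & PAGE_MASK,
		.flags = flags,
		.pgoff = linear_page_index(vma, address),
		.gfp_mask = __get_fault_gfp_mask(vma),
	};

	return vm_handle_fault(&vmf);
}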

Signed-off-by: Matthew Wilcox 

diff --git a/mm/memory.c b/mm/memory.c
index 5eb3d2524bdc..403934297a3d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3977,36 +3977,28 @@ static int handle_pte_fault(struct vm_fault *vmf)
  * The mmap_sem may have been released depending on flags and our
  * return value.  See filemap_fault() and __lock_page_or_retry().
  */
-static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
-   unsigned int flags)
+static int __handle_mm_fault(struct vm_fault *vmf)
 {
-   struct vm_fault vmf = {
-   .vma = vma,
-   .address = address & PAGE_MASK,
-   .flags = flags,
-   .pgoff = linear_page_index(vma, address),
-   .gfp_mask = __get_fault_gfp_mask(vma),
-   };
-   unsigned int dirty = flags & FAULT_FLAG_WRITE;
-   struct mm_struct *mm = vma->vm_mm;
+   unsigned int dirty = vmf->flags & FAULT_FLAG_WRITE;
+   struct mm_struct *mm = vmf->vma->vm_mm;
pgd_t *pgd;
p4d_t *p4d;
int ret;
 
-   pgd = pgd_offset(mm, address);
-   p4d = p4d_alloc(mm, pgd, address);
+   pgd = pgd_offset(mm, vmf->address);
+   p4d = p4d_alloc(mm, pgd, vmf->address);
if (!p4d)
return VM_FAULT_OOM;
 
-   vmf.pud = pud_alloc(mm, p4d, address);
-   if (!vmf.pud)
+   vmf->pud = pud_alloc(mm, p4d, vmf->address);
+   if (!vmf->pud)
return VM_FAULT_OOM;
-   if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
-   ret = create_huge_pud(&vmf);
+   if (pud_none(*vmf->pud) && transparent_hugepage_enabled(vmf->vma)) {
+   ret = create_huge_pud(vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
-   pud_t orig_pud = *vmf.pud;
+   pud_t orig_pud = *vmf->pud;
 
barrier();
if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
@@ -4014,50 +4006,51 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
/* NUMA case for anonymous PUDs would go here */
 
if (dirty && !pud_access_permitted(orig_pud, WRITE)) {
-   ret = wp_huge_pud(&vmf, orig_pud);
+   ret = wp_huge_pud(vmf, orig_pud);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
-   huge_pud_set_accessed(&vmf, orig_pud);
+   huge_pud_set_accessed(vmf, orig_pud);
return 0;
}
}
}
 
-   vmf.pmd = pmd_alloc(mm, vmf.pud, address);
-   if (!vmf.pmd)
+   vmf->pmd = pmd_alloc(mm, vmf->pud, vmf->address);
+   if (!vmf->pmd)
return VM_FAULT_OOM;
-   if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
-   ret = create_huge_pmd(&vmf);
+   if (pmd_none(*vmf->pmd) && transparent_hugepage_enabled(vmf->vma)) {
+   ret = create_huge_pmd(vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
-   pmd_t orig_pmd = *vmf.pmd;
+   pmd_t orig_pmd = *vmf->pmd;
 
barrier();
if (unlikely(is_swap_pmd(orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
  !is_pmd_migration_entry(orig_pmd));
if (is_pmd_migration_entry(orig_pmd))
-   

Re: [PATCH V2] powerpc/kernel: Add 'ibm,thread-groups' property for CPU allocation

2018-01-12 Thread Michael Ellerman
Nathan Fontenot  writes:

> On 01/08/2018 11:19 AM, Michael Bringmann wrote:
>> Add code to parse the new property 'ibm,thread-groups" when it is
>> present.  The content of this property explicitly defines the number
>> of threads per core as well as the PowerPC 'threads_core_mask'.
>> The design provides a common device-tree for both P9 normal core and
>> P9 fused core systems.  The new property has been observed to be
>> available on P9 pHyp systems, but it is not always present on
>> OpenPower BMC systems.
>> 
>> The property updates the kernel to know which CPUs/threads of each
>> core are actually present, and then use the map when adding cores
>> to the system at boot, or during hotplug operations.
>> 
>> * Previously, the information about the number of threads per core
>>   was inferred solely from the "ibm,ppc-interrupt-server#s" property
>>   in the system device tree.
>> * Also, prior to this property, the mask of threads per CPU was
>>   inferred to be a strict linear series from 0..(nthreads-1).
>> * After reading the "ibm,thread-group" property, we can determine
>>   the number of threads per core to be the 'bitmask weight' of the
>>   CPU thread mask.
>> * Also after reading the property, we can determine which of the
>>   possible threads we are allowed to online for each CPU.  It is no
>>   longer a simple linear sequence, but may be discontinuous e.g.
>>   activate threads 1,2,3,5,6,7 on a core instead of 0-5 sequentially.
>> 
>> Implementation of the "ibm,thread-groups" property is spread across
>> a few files in the powerpc specific code:
>> 
>> * prom.c: Parse the property and create 'ppc_thread_group_mask'.
>>   Use the mask in operation of early_init_dt_scan_cpus().
>> * setup-common.c: Import 'ppc_thread_group_mask' and use the value
>>   in the operation of cpu_init_thread_core_maps(), and
>>   smp_setup_cpu_maps.
>> * hotplug-cpu.c: Use 'ppc_thread_group_mask' in several locations
>>   where the code previously expected to iterate over a
>>   linear series of active threads (0..nthreads-1).
>> 
>> Note that the "ibm,thread-groups" property also includes semantics
>> of 'thread-group' i.e. define one or more subgroups of the available
>> threads, each group of threads to be used for a specific class of
>> task.  Translating thread group semantics into Linux kernel features
>> is TBD.
>
> One thing I don't see addressed in the comments or in the code is
> migration support. I think we need to update the thread group mask
> post-migration to reflect the threads per core on the new system.

Normally I'd agree with you, but I don't see any prospect of the kernel
surviving if the threads per core changes across a migration. We'll have
data structures allocated based on the old value and things will
definitely crash if the value increases. If it shrinks maybe we'd get
away with it, but either way is dicey.

If there's an expectation that we'll be able to migrate between systems
with different settings then we have a much bigger problem.

cheers


Re: [cryptodev:master 130/134] aes_generic.c:undefined reference to `_restgpr_31_x'

2018-01-12 Thread Segher Boessenkool
On Fri, Jan 12, 2018 at 10:45:31PM +0100, Arnd Bergmann wrote:
> > I guess you could enable the _x routines whenever you use ubsan?  Ubsan
> > will cause much bigger code growth than the handful of insns in those
> > routines?
> 
> Right, that could work, too. My patch that Herbert merged intentionally
> used -Os also for non-UBSAN builds because it turned out to
> be much faster (see gcc PR83651),

"Much"?

-Os is *slower* with 8.0, 5% faster with 7.2, 4% faster with 7.1,
slower with 7.0 and 6.3.  Your numbers, #c1.

And this is the generic code of course, which is slow anyway (not to
mention insecure).

> but we could revert that back
> to the default and only use the -Os for UBSAN, essentially
> addressing only PR83356 but not PR83651.


Segher


Re: [cryptodev:master 130/134] aes_generic.c:undefined reference to `_restgpr_31_x'

2018-01-12 Thread Arnd Bergmann
On Fri, Jan 12, 2018 at 10:41 PM, Segher Boessenkool
 wrote:
> On Fri, Jan 12, 2018 at 10:29:01PM +0100, Arnd Bergmann wrote:
>> On Fri, Jan 12, 2018 at 9:41 PM, Segher Boessenkool
>>  wrote:
>> > On Fri, Jan 12, 2018 at 08:43:21PM +0100, Arnd Bergmann wrote:
>> >> On Fri, Jan 12, 2018 at 5:39 PM, Segher Boessenkool
>>
>> >> We could theoretically work around it by turning that into
>> >> "#if defined(CONFIG_CC_OPTIMIZE_FOR_SIZE) ||
>> >> defined(CONFIG_CRYPTO_AES)", but that seems rather ugly.
>> >>
>> >> My earlier patch already tried to be more specific, turning very
>> >> specific optimizations off rather than moving from -O2 to -Os,
>> >> but that turned out to lead to significantly worse performance,
>> >> where -Os improved performance slightly. Is there a way
>> >> to ask powerpc compilers to use mostly -Os but not the
>> >> specific thing that makes it link to _restgpr_31_x?
>> >
>> > There is no such thing, sorry.  Would be very hard to implement, and
>> > older compilers will never get it, so it won't help you anyway :-(
>>
>> We use -Os only for gcc-7.1 and higher, where it produces faster
>> code for AES and avoids running into
>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83356
>>
>> > Maybe for now just enable it in crtsavres.S always, with a comment?
>> > That -Os workaround is hopefully not going to live long either...
>>
>> It depends on whether or how soon someone comes up with a
>> better fix for PR83356.
>> gcc-8.0.0 is currently not affected by it, so we could limit the
>> workaround (and the hack in crtsavres.S) to gcc-7-only.
>
> I guess you could enable the _x routines whenever you use ubsan?  Ubsan
> will cause much bigger code growth than the handful of insns in those
> routines?

Right, that could work, too. My patch that Herbert merged intentionally
used -Os also for non-UBSAN builds because it turned out to
be much faster (see gcc PR83651), but we could revert that back
to the default and only use the -Os for UBSAN, essentially
addressing only PR83356 but not PR83651.

   Arnd


Re: [cryptodev:master 130/134] aes_generic.c:undefined reference to `_restgpr_31_x'

2018-01-12 Thread Segher Boessenkool
On Fri, Jan 12, 2018 at 10:29:01PM +0100, Arnd Bergmann wrote:
> On Fri, Jan 12, 2018 at 9:41 PM, Segher Boessenkool
>  wrote:
> > On Fri, Jan 12, 2018 at 08:43:21PM +0100, Arnd Bergmann wrote:
> >> On Fri, Jan 12, 2018 at 5:39 PM, Segher Boessenkool
> 
> >> We could theoretically work around it by turning that into
> >> "#if defined(CONFIG_CC_OPTIMIZE_FOR_SIZE) ||
> >> defined(CONFIG_CRYPTO_AES)", but that seems rather ugly.
> >>
> >> My earlier patch already tried to be more specific, turning very
> >> specific optimizations off rather than moving from -O2 to -Os,
> >> but that turned out to lead to significantly worse performance,
> >> where -Os improved performance slightly. Is there a way
> >> to ask powerpc compilers to use mostly -Os but not the
> >> specific thing that makes it link to _restgpr_31_x?
> >
> > There is no such thing, sorry.  Would be very hard to implement, and
> > older compilers will never get it, so it won't help you anyway :-(
> 
> We use -Os only for gcc-7.1 and higher, where it produces faster
> code for AES and avoids running into
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83356
> 
> > Maybe for now just enable it in crtsavres.S always, with a comment?
> > That -Os workaround is hopefully not going to live long either...
> 
> It depends on whether or how soon someone comes up with a
> better fix for PR83356.
> gcc-8.0.0 is currently not affected by it, so we could limit the
> workaround (and the hack in crtsavres.S) to gcc-7-only.

I guess you could enable the _x routines whenever you use ubsan?  Ubsan
will cause much bigger code growth than the handful of insns in those
routines?


Segher


Re: [cryptodev:master 130/134] aes_generic.c:undefined reference to `_restgpr_31_x'

2018-01-12 Thread Arnd Bergmann
On Fri, Jan 12, 2018 at 9:41 PM, Segher Boessenkool
 wrote:
> On Fri, Jan 12, 2018 at 08:43:21PM +0100, Arnd Bergmann wrote:
>> On Fri, Jan 12, 2018 at 5:39 PM, Segher Boessenkool

>> We could theoretically work around it by turning that into
>> "#if defined(CONFIG_CC_OPTIMIZE_FOR_SIZE) ||
>> defined(CONFIG_CRYPTO_AES)", but that seems rather ugly.
>>
>> My earlier patch already tried to be more specific, turning very
>> specific optimizations off rather than moving from -O2 to -Os,
>> but that turned out to lead to significantly worse performance,
>> where -Os improved performance slightly. Is there a way
>> to ask powerpc compilers to use mostly -Os but not the
>> specific thing that makes it link to _restgpr_31_x?
>
> There is no such thing, sorry.  Would be very hard to implement, and
> older compilers will never get it, so it won't help you anyway :-(

We use -Os only for gcc-7.1 and higher, where it produces faster
code for AES and avoids running into
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83356

> Maybe for now just enable it in crtsavres.S always, with a comment?
> That -Os workaround is hopefully not going to live long either...

It depends on whether or how soon someone comes up with a
better fix for PR83356.
gcc-8.0.0 is currently not affected by it, so we could limit the
workaround (and the hack in crtsavres.S) to gcc-7-only.

 Arnd


Re: [PATCH 06/22] swiotlb: rename swiotlb_free to swiotlb_exit

2018-01-12 Thread Konrad Rzeszutek Wilk
On Wed, Jan 10, 2018 at 09:09:16AM +0100, Christoph Hellwig wrote:

OK?

Reviewed-by: Konrad Rzeszutek Wilk 
> Signed-off-by: Christoph Hellwig 
> ---
>  arch/powerpc/kernel/dma-swiotlb.c | 2 +-
>  arch/x86/kernel/pci-swiotlb.c | 2 +-
>  include/linux/swiotlb.h   | 4 ++--
>  lib/swiotlb.c | 2 +-
>  4 files changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c
> index 506ac4fafac5..88f3963ca30f 100644
> --- a/arch/powerpc/kernel/dma-swiotlb.c
> +++ b/arch/powerpc/kernel/dma-swiotlb.c
> @@ -121,7 +121,7 @@ static int __init check_swiotlb_enabled(void)
>   if (ppc_swiotlb_enable)
>   swiotlb_print_info();
>   else
> - swiotlb_free();
> + swiotlb_exit();
>  
>   return 0;
>  }
> diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
> index 0d77603c2f50..0ee0f8f34251 100644
> --- a/arch/x86/kernel/pci-swiotlb.c
> +++ b/arch/x86/kernel/pci-swiotlb.c
> @@ -120,7 +120,7 @@ void __init pci_swiotlb_late_init(void)
>  {
>   /* An IOMMU turned us off. */
>   if (!swiotlb)
> - swiotlb_free();
> + swiotlb_exit();
>   else {
>   printk(KERN_INFO "PCI-DMA: "
>  "Using software bounce buffering for IO (SWIOTLB)\n");
> diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
> index 24ed817082ee..606375e35d87 100644
> --- a/include/linux/swiotlb.h
> +++ b/include/linux/swiotlb.h
> @@ -115,10 +115,10 @@ extern int
>  swiotlb_dma_supported(struct device *hwdev, u64 mask);
>  
>  #ifdef CONFIG_SWIOTLB
> -extern void __init swiotlb_free(void);
> +extern void __init swiotlb_exit(void);
>  unsigned int swiotlb_max_segment(void);
>  #else
> -static inline void swiotlb_free(void) { }
> +static inline void swiotlb_exit(void) { }
>  static inline unsigned int swiotlb_max_segment(void) { return 0; }
>  #endif
>  
> diff --git a/lib/swiotlb.c b/lib/swiotlb.c
> index 125c1062119f..cf5311908fa9 100644
> --- a/lib/swiotlb.c
> +++ b/lib/swiotlb.c
> @@ -417,7 +417,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
>   return -ENOMEM;
>  }
>  
> -void __init swiotlb_free(void)
> +void __init swiotlb_exit(void)
>  {
>   if (!io_tlb_orig_addr)
>   return;
> -- 
> 2.14.2
> 


Re: [PATCH 05/22] x86: rename swiotlb_dma_ops

2018-01-12 Thread Konrad Rzeszutek Wilk
On Wed, Jan 10, 2018 at 09:09:15AM +0100, Christoph Hellwig wrote:
> We'll need that name for a generic implementation soon.
> 
> Signed-off-by: Christoph Hellwig 
Reviewed-by: Konrad Rzeszutek Wilk 
> ---
>  arch/x86/kernel/pci-swiotlb.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
> index 9d3e35c33d94..0d77603c2f50 100644
> --- a/arch/x86/kernel/pci-swiotlb.c
> +++ b/arch/x86/kernel/pci-swiotlb.c
> @@ -48,7 +48,7 @@ void x86_swiotlb_free_coherent(struct device *dev, size_t size,
>   dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
>  }
>  
> -static const struct dma_map_ops swiotlb_dma_ops = {
> +static const struct dma_map_ops x86_swiotlb_dma_ops = {
>   .mapping_error = swiotlb_dma_mapping_error,
>   .alloc = x86_swiotlb_alloc_coherent,
>   .free = x86_swiotlb_free_coherent,
> @@ -112,7 +112,7 @@ void __init pci_swiotlb_init(void)
>  {
>   if (swiotlb) {
>   swiotlb_init(0);
> - dma_ops = &swiotlb_dma_ops;
> + dma_ops = &x86_swiotlb_dma_ops;
>   }
>  }
>  
> -- 
> 2.14.2
> 


Re: [PATCH 04/22] powerpc: rename swiotlb_dma_ops

2018-01-12 Thread Konrad Rzeszutek Wilk
On Wed, Jan 10, 2018 at 09:09:14AM +0100, Christoph Hellwig wrote:
> We'll need that name for a generic implementation soon.
> 
Reviewed-by: Konrad Rzeszutek Wilk 
> Signed-off-by: Christoph Hellwig 
> ---
>  arch/powerpc/include/asm/swiotlb.h | 2 +-
>  arch/powerpc/kernel/dma-swiotlb.c  | 4 ++--
>  arch/powerpc/kernel/dma.c  | 2 +-
>  arch/powerpc/sysdev/fsl_pci.c  | 2 +-
>  4 files changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h
> index 9341ee804d19..f65ecf57b66c 100644
> --- a/arch/powerpc/include/asm/swiotlb.h
> +++ b/arch/powerpc/include/asm/swiotlb.h
> @@ -13,7 +13,7 @@
>  
>  #include 
>  
> -extern const struct dma_map_ops swiotlb_dma_ops;
> +extern const struct dma_map_ops powerpc_swiotlb_dma_ops;
>  
>  extern unsigned int ppc_swiotlb_enable;
>  int __init swiotlb_setup_bus_notifier(void);
> diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c
> index f1e99b9cee97..506ac4fafac5 100644
> --- a/arch/powerpc/kernel/dma-swiotlb.c
> +++ b/arch/powerpc/kernel/dma-swiotlb.c
> @@ -46,7 +46,7 @@ static u64 swiotlb_powerpc_get_required(struct device *dev)
>   * map_page, and unmap_page on highmem, use normal dma_ops
>   * for everything else.
>   */
> -const struct dma_map_ops swiotlb_dma_ops = {
> +const struct dma_map_ops powerpc_swiotlb_dma_ops = {
>   .alloc = __dma_nommu_alloc_coherent,
>   .free = __dma_nommu_free_coherent,
>   .mmap = dma_nommu_mmap_coherent,
> @@ -89,7 +89,7 @@ static int ppc_swiotlb_bus_notify(struct notifier_block *nb,
>  
>   /* May need to bounce if the device can't address all of DRAM */
>   if ((dma_get_mask(dev) + 1) < memblock_end_of_DRAM())
> - set_dma_ops(dev, &swiotlb_dma_ops);
> + set_dma_ops(dev, &powerpc_swiotlb_dma_ops);
>  
>   return NOTIFY_DONE;
>  }
> diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
> index 76079841d3d0..da20569de9d4 100644
> --- a/arch/powerpc/kernel/dma.c
> +++ b/arch/powerpc/kernel/dma.c
> @@ -33,7 +33,7 @@ static u64 __maybe_unused get_pfn_limit(struct device *dev)
>   struct dev_archdata __maybe_unused *sd = &dev->archdata;
>  
>  #ifdef CONFIG_SWIOTLB
> - if (sd->max_direct_dma_addr && dev->dma_ops == &swiotlb_dma_ops)
> + if (sd->max_direct_dma_addr && dev->dma_ops == &powerpc_swiotlb_dma_ops)
>   pfn = min_t(u64, pfn, sd->max_direct_dma_addr >> PAGE_SHIFT);
>  #endif
>  
> diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
> index e4d0133bbeeb..61e07c78d64f 100644
> --- a/arch/powerpc/sysdev/fsl_pci.c
> +++ b/arch/powerpc/sysdev/fsl_pci.c
> @@ -118,7 +118,7 @@ static void setup_swiotlb_ops(struct pci_controller *hose)
>  {
>   if (ppc_swiotlb_enable) {
>   hose->controller_ops.dma_dev_setup = pci_dma_dev_setup_swiotlb;
> - set_pci_dma_ops(&swiotlb_dma_ops);
> + set_pci_dma_ops(&powerpc_swiotlb_dma_ops);
>   }
>  }
>  #else
> -- 
> 2.14.2
> 


Re: [PATCH 03/22] ia64: rename swiotlb_dma_ops

2018-01-12 Thread Konrad Rzeszutek Wilk
On Wed, Jan 10, 2018 at 09:09:13AM +0100, Christoph Hellwig wrote:
> We'll need that name for a generic implementation soon.
> 
> Signed-off-by: Christoph Hellwig 


Reviewed-by: Konrad Rzeszutek Wilk 
> ---
>  arch/ia64/hp/common/hwsw_iommu.c | 4 ++--
>  arch/ia64/hp/common/sba_iommu.c  | 6 +++---
>  arch/ia64/kernel/pci-swiotlb.c   | 6 +++---
>  3 files changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/ia64/hp/common/hwsw_iommu.c b/arch/ia64/hp/common/hwsw_iommu.c
> index 63d8e1d2477f..41279f0442bd 100644
> --- a/arch/ia64/hp/common/hwsw_iommu.c
> +++ b/arch/ia64/hp/common/hwsw_iommu.c
> @@ -19,7 +19,7 @@
>  #include 
>  #include 
>  
> -extern const struct dma_map_ops sba_dma_ops, swiotlb_dma_ops;
> +extern const struct dma_map_ops sba_dma_ops, ia64_swiotlb_dma_ops;
>  
>  /* swiotlb declarations & definitions: */
>  extern int swiotlb_late_init_with_default_size (size_t size);
> @@ -38,7 +38,7 @@ static inline int use_swiotlb(struct device *dev)
>  const struct dma_map_ops *hwsw_dma_get_ops(struct device *dev)
>  {
>   if (use_swiotlb(dev))
> - return &swiotlb_dma_ops;
> + return &ia64_swiotlb_dma_ops;
>   return &sba_dma_ops;
>  }
>  EXPORT_SYMBOL(hwsw_dma_get_ops);
> diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
> index aec4a3354abe..8c0a9ae6afec 100644
> --- a/arch/ia64/hp/common/sba_iommu.c
> +++ b/arch/ia64/hp/common/sba_iommu.c
> @@ -2096,7 +2096,7 @@ static int __init acpi_sba_ioc_init_acpi(void)
>  /* This has to run before acpi_scan_init(). */
>  arch_initcall(acpi_sba_ioc_init_acpi);
>  
> -extern const struct dma_map_ops swiotlb_dma_ops;
> +extern const struct dma_map_ops ia64_swiotlb_dma_ops;
>  
>  static int __init
>  sba_init(void)
> @@ -2111,7 +2111,7 @@ sba_init(void)
>* a successful kdump kernel boot is to use the swiotlb.
>*/
>   if (is_kdump_kernel()) {
> - dma_ops = &swiotlb_dma_ops;
> + dma_ops = &ia64_swiotlb_dma_ops;
>   if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0)
>   panic("Unable to initialize software I/O TLB:"
> " Try machvec=dig boot option");
> @@ -2133,7 +2133,7 @@ sba_init(void)
>* If we didn't find something sba_iommu can claim, we
>* need to setup the swiotlb and switch to the dig machvec.
>*/
> - dma_ops = &swiotlb_dma_ops;
> + dma_ops = &ia64_swiotlb_dma_ops;
>   if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0)
>   panic("Unable to find SBA IOMMU or initialize "
> "software I/O TLB: Try machvec=dig boot option");
> diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c
> index 5e50939aa03e..f1ae873a8c35 100644
> --- a/arch/ia64/kernel/pci-swiotlb.c
> +++ b/arch/ia64/kernel/pci-swiotlb.c
> @@ -31,7 +31,7 @@ static void ia64_swiotlb_free_coherent(struct device *dev, size_t size,
>   swiotlb_free_coherent(dev, size, vaddr, dma_addr);
>  }
>  
> -const struct dma_map_ops swiotlb_dma_ops = {
> +const struct dma_map_ops ia64_swiotlb_dma_ops = {
>   .alloc = ia64_swiotlb_alloc_coherent,
>   .free = ia64_swiotlb_free_coherent,
>   .map_page = swiotlb_map_page,
> @@ -48,7 +48,7 @@ const struct dma_map_ops swiotlb_dma_ops = {
>  
>  void __init swiotlb_dma_init(void)
>  {
> - dma_ops = &swiotlb_dma_ops;
> + dma_ops = &ia64_swiotlb_dma_ops;
>   swiotlb_init(1);
>  }
>  
> @@ -60,7 +60,7 @@ void __init pci_swiotlb_init(void)
>   printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
>   machvec_init("dig");
>   swiotlb_init(1);
> - dma_ops = &swiotlb_dma_ops;
> + dma_ops = &ia64_swiotlb_dma_ops;
>  #else
>   panic("Unable to find Intel IOMMU");
>  #endif
> -- 
> 2.14.2
> 


Re: [PATCH 21/33] dma-mapping: add an arch_dma_supported hook

2018-01-12 Thread Konrad Rzeszutek Wilk
On Wed, Jan 10, 2018 at 09:00:15AM +0100, Christoph Hellwig wrote:
> To implement the x86 forbid_dac and iommu_sac_force we want an arch hook
> so that it can apply the global options across all dma_map_ops
> implementations.
> 
> Signed-off-by: Christoph Hellwig 

Reviewed-by: Konrad Rzeszutek Wilk 
> ---
>  arch/x86/include/asm/dma-mapping.h |  3 +++
>  arch/x86/kernel/pci-dma.c  | 19 ---
>  include/linux/dma-mapping.h| 11 +++
>  3 files changed, 26 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
> index dfdc9357a349..6277c83c0eb1 100644
> --- a/arch/x86/include/asm/dma-mapping.h
> +++ b/arch/x86/include/asm/dma-mapping.h
> @@ -30,6 +30,9 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
>   return dma_ops;
>  }
>  
> +int arch_dma_supported(struct device *dev, u64 mask);
> +#define arch_dma_supported arch_dma_supported
> +
>  bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
>  #define arch_dma_alloc_attrs arch_dma_alloc_attrs
>  
> diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
> index 61a8f1cb3829..df7ab02f959f 100644
> --- a/arch/x86/kernel/pci-dma.c
> +++ b/arch/x86/kernel/pci-dma.c
> @@ -215,7 +215,7 @@ static __init int iommu_setup(char *p)
>  }
>  early_param("iommu", iommu_setup);
>  
> -int x86_dma_supported(struct device *dev, u64 mask)
> +int arch_dma_supported(struct device *dev, u64 mask)
>  {
>  #ifdef CONFIG_PCI
>   if (mask > 0x && forbid_dac > 0) {
> @@ -224,12 +224,6 @@ int x86_dma_supported(struct device *dev, u64 mask)
>   }
>  #endif
>  
> - /* Copied from i386. Doesn't make much sense, because it will
> -only work for pci_alloc_coherent.
> -The caller just has to use GFP_DMA in this case. */
> - if (mask < DMA_BIT_MASK(24))
> - return 0;
> -
>   /* Tell the device to use SAC when IOMMU force is on.  This
>  allows the driver to use cheaper accesses in some cases.
>  
> @@ -249,6 +243,17 @@ int x86_dma_supported(struct device *dev, u64 mask)
>  
>   return 1;
>  }
> +EXPORT_SYMBOL(arch_dma_supported);
> +
> +int x86_dma_supported(struct device *dev, u64 mask)
> +{
> + /* Copied from i386. Doesn't make much sense, because it will
> +only work for pci_alloc_coherent.
> +The caller just has to use GFP_DMA in this case. */
> + if (mask < DMA_BIT_MASK(24))
> + return 0;
> + return 1;
> +}
>  
>  static int __init pci_iommu_init(void)
>  {
> diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
> index 88bcb1a8211d..d67742dad904 100644
> --- a/include/linux/dma-mapping.h
> +++ b/include/linux/dma-mapping.h
> @@ -576,6 +576,14 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
>   return 0;
>  }
>  
> +/*
> + * This is a hack for the legacy x86 forbid_dac and iommu_sac_force. Please
> + * don't use this in new code.
> + */
> +#ifndef arch_dma_supported
> +#define arch_dma_supported(dev, mask) (1)
> +#endif
> +
>  static inline void dma_check_mask(struct device *dev, u64 mask)
>  {
>   if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
> @@ -588,6 +596,9 @@ static inline int dma_supported(struct device *dev, u64 mask)
>  
>   if (!ops)
>   return 0;
> + if (!arch_dma_supported(dev, mask))
> + return 0;
> +
>   if (!ops->dma_supported)
>   return 1;
>   return ops->dma_supported(dev, mask);
> -- 
> 2.14.2
> 


Re: [PATCH 19/33] dma-mapping: warn when there is no coherent_dma_mask

2018-01-12 Thread Konrad Rzeszutek Wilk
On Wed, Jan 10, 2018 at 09:00:13AM +0100, Christoph Hellwig wrote:
> These days all devices should have a DMA coherent mask, and most dma_ops
> implementations rely on that fact.  But just to be sure add an assert to
> ring the warning bell if that is not the case.
> 
> Signed-off-by: Christoph Hellwig 

Reviewed-by: Konrad Rzeszutek Wilk 
> ---
>  include/linux/dma-mapping.h | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
> index d84951865be7..9f28b2fa329e 100644
> --- a/include/linux/dma-mapping.h
> +++ b/include/linux/dma-mapping.h
> @@ -513,6 +513,7 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size,
>   void *cpu_addr;
>  
>   BUG_ON(!ops);
> + WARN_ON_ONCE(!dev->coherent_dma_mask);
>  
> if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
>   return cpu_addr;
> -- 
> 2.14.2
> 


Re: [cryptodev:master 130/134] aes_generic.c:undefined reference to `_restgpr_31_x'

2018-01-12 Thread Segher Boessenkool
On Fri, Jan 12, 2018 at 08:43:21PM +0100, Arnd Bergmann wrote:
> On Fri, Jan 12, 2018 at 5:39 PM, Segher Boessenkool
>  wrote:
> 
> >> or why the aes_generic implementation needs this on
> >> powerpc when built with 'gcc -Os'. FWIW, the -Os change was needed
> >> to work around a possible kernel stack overflow that can happen with
> >> gcc-7.2, see https://patchwork.kernel.org/patch/10143607/ and
> >> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83356
> >
> > The _x versions are smaller but slower; that's why they are used with -Os.
> > Apparently nothing else was built with -Os (and the other needed flags)
> > before.
> 
> Ah, that explains it, the definition is in arch/powerpc/lib/crtsavres.S,
> but inside of #ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE.

Ah ok.  Right.

> We could theoretically work around it by turning that into
> "#if defined(CONFIG_CC_OPTIMIZE_FOR_SIZE) ||
> defined(CONFIG_CRYPTO_AES)", but that seems rather ugly.
> 
> My earlier patch already tried to be more specific, turning very
> specific optimizations off rather than moving from -O2 to -Os,
> but that turned out to lead to significantly worse performance,
> where -Os improved performance slightly. Is there a way
> to ask powerpc compilers to use mostly -Os but not the
> specific thing that makes it link to _restgpr_31_x?

There is no such thing, sorry.  Would be very hard to implement, and
older compilers will never get it, so it won't help you anyway :-(

Maybe for now just enable it in crtsavres.S always, with a comment?
That -Os workaround is hopefully not going to live long either...


Segher


Re: [cryptodev:master 130/134] aes_generic.c:undefined reference to `_restgpr_31_x'

2018-01-12 Thread Arnd Bergmann
On Fri, Jan 12, 2018 at 5:39 PM, Segher Boessenkool
 wrote:

>> or why the aes_generic implementation needs this on
>> powerpc when built with 'gcc -Os'. FWIW, the -Os change was needed
>> to work around a possible kernel stack overflow that can happen with
>> gcc-7.2, see https://patchwork.kernel.org/patch/10143607/ and
>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83356
>
> The _x versions are smaller but slower; that's why they are used with -Os.
> Apparently nothing else was built with -Os (and the other needed flags)
> before.

Ah, that explains it, the definition is in arch/powerpc/lib/crtsavres.S,
but inside of #ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE.

We could theoretically work around it by turning that into
"#if defined(CONFIG_CC_OPTIMIZE_FOR_SIZE) ||
defined(CONFIG_CRYPTO_AES)", but that seems rather ugly.

My earlier patch already tried to be more specific, turning very
specific optimizations off rather than moving from -O2 to -Os,
but that turned out to lead to significantly worse performance,
where -Os improved performance slightly. Is there a way
to ask powerpc compilers to use mostly -Os but not the
specific thing that makes it link to _restgpr_31_x?

   Arnd


Re: [PATCH] powerpc/mm: Simplify _PAGE_RO handling in page table dump

2018-01-12 Thread christophe leroy


On 09/05/2017 at 16:16, Christophe Leroy wrote:

Commit fd893fe56a130 ("powerpc/mm: Fix missing page attributes in
page table dump") added support of _PAGE_RO attribute.

This patch makes it simpler.


Superseded by https://patchwork.ozlabs.org/patch/859896/

Christophe



Signed-off-by: Christophe Leroy 
---
  arch/powerpc/mm/dump_linuxpagetables.c | 7 +--
  1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index d659345a98d6..eeef51107cff 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -121,13 +121,8 @@ static const struct flag_info flag_array[] = {
.set= "user",
.clear  = "",
}, {
-#if _PAGE_RO == 0
-   .mask   = _PAGE_RW,
+   .mask   = _PAGE_RW | _PAGE_RO,
.val= _PAGE_RW,
-#else
-   .mask   = _PAGE_RO,
-   .val= 0,
-#endif
.set= "rw",
.clear  = "ro",
}, {






Re: [PATCH v6 22/24] mm: Speculative page fault handler return VMA

2018-01-12 Thread Matthew Wilcox
On Fri, Jan 12, 2018 at 06:26:06PM +0100, Laurent Dufour wrote:
> @@ -1354,7 +1354,10 @@ extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
>   unsigned int flags);
>  #ifdef CONFIG_SPF
>  extern int handle_speculative_fault(struct mm_struct *mm,
> + unsigned long address, unsigned int flags,
> + struct vm_area_struct **vma);

I think this shows that we need to create 'struct vm_fault' on the stack
in the arch code and then pass it to handle_speculative_fault(), followed
by handle_mm_fault().  That should be quite a nice cleanup actually.
I know that's only 30+ architectures to change ;-)



Re: [PATCH v6 01/24] x86/mm: Define CONFIG_SPF

2018-01-12 Thread Thomas Gleixner
On Fri, 12 Jan 2018, Laurent Dufour wrote:

> Introduce CONFIG_SPF which turns on the Speculative Page Fault handler when
> building for 64-bit with SMP.
> 
> Signed-off-by: Laurent Dufour 
> ---
>  arch/x86/Kconfig | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index a317d5594b6a..d74353b85aaf 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -2882,6 +2882,10 @@ config X86_DMA_REMAP
>  config HAVE_GENERIC_GUP
>   def_bool y
>  
> +config SPF
> + def_bool y
> + depends on X86_64 && SMP

Can you please put that into a generic place as

config SPF
   bool

and let the architectures select it.
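
For instance, as a sketch (the generic location, and the longer name per
the naming comments below, are assumptions, not part of the patch):

# mm/Kconfig
config SPECULATIVE_PAGE_FAULT
	bool

# arch/x86/Kconfig (added to the existing X86 entry)
config X86
	select SPECULATIVE_PAGE_FAULT if X86_64 && SMP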

Also, SPF could be a bit more elaborate and self-explaining for the casual
reader. Three-letter acronyms are reserved for non-existing agencies.

Thanks,

tglx


Re: [PATCH v6 16/24] mm: Protect mm_rb tree with a rwlock

2018-01-12 Thread Matthew Wilcox
On Fri, Jan 12, 2018 at 06:26:00PM +0100, Laurent Dufour wrote:
> -static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
> +static void __vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
>  {
> + struct rb_root *root = &mm->mm_rb;
>   /*
>* Note rb_erase_augmented is a fairly large inline function,
>* so make sure we instantiate it only once with our desired
>* augmented rbtree callbacks.
>*/
> +#ifdef CONFIG_SPF
> + write_lock(&mm->mm_rb_lock);
> +#endif
> rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
> +#ifdef CONFIG_SPF
> + write_unlock(&mm->mm_rb_lock); /* wmb */
> +#endif

I can't say I love this.  Have you considered:

#ifdef CONFIG_SPF
#define vma_rb_write_lock(mm)   write_lock(&mm->mm_rb_lock)
#define vma_rb_write_unlock(mm) write_unlock(&mm->mm_rb_lock)
#else
#define vma_rb_write_lock(mm)   do { } while (0)
#define vma_rb_write_unlock(mm) do { } while (0)
#endif

Also, SPF is kind of uninformative.  CONFIG_MM_SPF might be better?
Or perhaps even CONFIG_SPECULATIVE_PAGE_FAULT, just to make it really
painful to do these one-liner ifdefs that make the code so hard to read.
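
With such wrappers the erase path would then read, as a sketch:

static void __vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
{
	struct rb_root *root = &mm->mm_rb;

	vma_rb_write_lock(mm);
	rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
	vma_rb_write_unlock(mm);
}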


Re: [PATCH v6 18/24] mm: Try spin lock in speculative path

2018-01-12 Thread Matthew Wilcox
On Fri, Jan 12, 2018 at 06:26:02PM +0100, Laurent Dufour wrote:
> There is a deadlock when one CPU is doing a speculative page fault and
> another one is calling do_unmap().
> 
> The deadlock occurs because the speculative path tries to spinlock the
> pte while interrupts are disabled. The other CPU, in the unmap path,
> has locked the pte and is waiting for all CPUs to invalidate the TLB.
> As the CPU doing the speculative fault has interrupts disabled, it
> can't invalidate the TLB and thus can't get the lock.
> 
> Since we are in a speculative path, we can race with other mm actions.
> So assume that the lock may not be acquired, and fail the speculative
> page fault.

It seems like you introduced this bug in the previous patch, and now
you're fixing it in this patch?  Why not merge the two?



Re: [PATCH v2 00/16] ASoC: fsl_ssi: Clean up - program flow level

2018-01-12 Thread Caleb Crome
On Wed, Jan 10, 2018 at 10:42 PM, Nicolin Chen  wrote:
>
> ==Change log==
> v2
>  * Reworked the series by taking suggestions from Maciej
>   + Added PATCH-01 to keep all ssi->i2s_net updated
>   + Replaced bool tx with bool dir in PATCH-03 and PATCH-06
>   + Moved all initial register configurations from dai probe() to
> platform probe() so as to let AC97 CODEC successfully probe.
>  * Added Tested-by from Caleb for TDM test cases.
>
> ==Background==
> The fsl_ssi driver was designed for PPC originally and then it has
> been updated to support different modes for i.MX Series, including
> SDMA, I2S Master mode, AC97 and older i.MXs with FIQ, by different
> contributors for different use cases in different coding styles.
>
> Additionally, in order to fix/work-around hardware bugs and design
> flaws, the driver made a lot of compromises, so now its program flow
> looks very complicated and it's getting hard to maintain or update.
>
> So I am going to clean up the driver on both coding style level and
> program flow level.
>
> ==Introduction==
> This series of patches is the second set to clean up fsl_ssi driver
> at the program flow level. Any patch here may impact a fundamental
> test case like playback or record.
>
> ==Verification==
> This series of patches requires full testing. I have done such tests
> on i.MX6SoloX with WM8962 using imx_v6_v7_defconfig as:
>  - Playback via I2S Master and Slave mode
>  - Record via I2S Master and Slave mode
>  - Simultaneous playback and record via I2S Master and Slave mode
>  - Background playback with foreground record (starting at different
>time) via I2S Master and Slave mode
>  - Background record with foreground playback (starting at different
>time) via I2S Master and Slave mode
>  * All tests above by hacking offline_config to true in imx51.
>
> Caleb has tested v1 with TDM lookback tests on i.MX6.
>
> Example of uncovered tests: AC97, PowerPC and FIQ.
>
> Nicolin Chen (16):
>   ASoC: fsl_ssi: Keep ssi->i2s_net updated
>   ASoC: fsl_ssi: Clean up set_dai_tdm_slot()
>   ASoC: fsl_ssi: Maintain a mask of active streams
>   ASoC: fsl_ssi: Rename fsl_ssi_disable_val macro
>   ASoC: fsl_ssi: Clear FIFO directly in fsl_ssi_config()
>   ASoC: fsl_ssi: Clean up helper functions of trigger()
>   ASoC: fsl_ssi: Add DAIFMT define for AC97
>   ASoC: fsl_ssi: Clean up fsl_ssi_setup_regvals()
>   ASoC: fsl_ssi: Set xFEN0 and xFEN1 together
>   ASoC: fsl_ssi: Use snd_soc_init_dma_data instead
>   ASoC: fsl_ssi: Move one-time configurations to probe()
>   ASoC: fsl_ssi: Setup AC97 in fsl_ssi_hw_init()
>   ASoC: fsl_ssi: Clean up _fsl_ssi_set_dai_fmt()
>   ASoC: fsl_ssi: Remove cpu_dai_drv from fsl_ssi structure
>   ASoC: fsl_ssi: Move DT related code to a separate probe()
>   ASoC: fsl_ssi: Use ssi->streams instead of reading register
>
>  sound/soc/fsl/fsl_ssi.c | 740 
> 
>  1 file changed, 369 insertions(+), 371 deletions(-)
>
> --
> 2.7.4
>
Tested again, just to be sure...  All looks good.


Tested-by: Caleb Crome 


[PATCH v6 24/24] powerpc/mm: Add speculative page fault

2018-01-12 Thread Laurent Dufour
This patch enables the speculative page fault on the PowerPC
architecture.

This will try a speculative page fault without holding the mmap_sem;
if it returns with VM_FAULT_RETRY, the mmap_sem is acquired and the
traditional page fault processing is done.

The speculative path is only tried for multithreaded processes as there is no
risk of contention on the mmap_sem otherwise.

Built only if CONFIG_SPF is defined (currently for BOOK3S_64 && SMP).

Signed-off-by: Laurent Dufour 
---
 arch/powerpc/mm/fault.c | 31 ++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 6e1e39035380..67b4c6aa7975 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -384,6 +384,9 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
   unsigned long error_code)
 {
struct vm_area_struct * vma;
+#ifdef CONFIG_SPF
+   struct vm_area_struct *spf_vma = NULL;
+#endif
struct mm_struct *mm = current->mm;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
int is_exec = TRAP(regs) == 0x400;
@@ -447,6 +450,20 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
if (is_exec)
flags |= FAULT_FLAG_INSTRUCTION;
 
+#ifdef CONFIG_SPF
+   if (is_user && (atomic_read(&mm->mm_users) > 1)) {
+   /* let's try a speculative page fault without grabbing the
+* mmap_sem.
+*/
+   fault = handle_speculative_fault(mm, address, flags, &spf_vma);
+   if (!(fault & VM_FAULT_RETRY)) {
+   perf_sw_event(PERF_COUNT_SW_SPF, 1,
+ regs, address);
+   goto done;
+   }
+   }
+#endif /* CONFIG_SPF */
+
/* When running in the kernel we expect faults to occur only to
 * addresses in user space.  All other faults represent errors in the
 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -477,7 +494,16 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
might_sleep();
}
 
-   vma = find_vma(mm, address);
+#ifdef CONFIG_SPF
+   if (spf_vma) {
+   if (can_reuse_spf_vma(spf_vma, address))
+   vma = spf_vma;
+   else
+   vma =  find_vma(mm, address);
+   spf_vma = NULL;
+   } else
+#endif
+   vma = find_vma(mm, address);
if (unlikely(!vma))
return bad_area(regs, address);
if (likely(vma->vm_start <= address))
@@ -531,6 +557,9 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
 
up_read(&current->mm->mmap_sem);
 
+#ifdef CONFIG_SPF
+done:
+#endif
if (unlikely(fault & VM_FAULT_ERROR))
return mm_fault_error(regs, address, fault);
 
-- 
2.7.4



[PATCH v6 23/24] x86/mm: Add speculative pagefault handling

2018-01-12 Thread Laurent Dufour
From: Peter Zijlstra 

Try a speculative fault before acquiring mmap_sem, if it returns with
VM_FAULT_RETRY continue with the mmap_sem acquisition and do the
traditional fault.

Signed-off-by: Peter Zijlstra (Intel) 

[Clearing of FAULT_FLAG_ALLOW_RETRY is now done in
 handle_speculative_fault()]
[Retry with usual fault path in the case VM_ERROR is returned by
 handle_speculative_fault(). This allows signal to be delivered]
[Don't build SPF call if !CONFIG_SPF]
[Try speculative fault path only for multi threaded processes]
[Try to the VMA fetch during the speculative path in case of retry]
Signed-off-by: Laurent Dufour 
---
 arch/x86/mm/fault.c | 38 +-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 06fe3d51d385..8db69a116521 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1242,6 +1242,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
 {
struct vm_area_struct *vma;
+#ifdef CONFIG_SPF
+   struct vm_area_struct *spf_vma = NULL;
+#endif
struct task_struct *tsk;
struct mm_struct *mm;
int fault, major = 0;
@@ -1339,6 +1342,27 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
 
+#ifdef CONFIG_SPF
+   if ((error_code & X86_PF_USER) && (atomic_read(&mm->mm_users) > 1)) {
+   fault = handle_speculative_fault(mm, address, flags,
+    &spf_vma);
+
+   if (!(fault & VM_FAULT_RETRY)) {
+   if (!(fault & VM_FAULT_ERROR)) {
+   perf_sw_event(PERF_COUNT_SW_SPF, 1,
+ regs, address);
+   goto done;
+   }
+   /*
+* In case of error we need the pkey value, but
+* can't get it from the spf_vma as it is only returned
+* when VM_FAULT_RETRY is returned. So we have to
+* retry the page fault with the mmap_sem grabbed.
+*/
+   }
+   }
+#endif /* CONFIG_SPF */
+
/*
 * When running in the kernel we expect faults to occur only to
 * addresses in user space.  All other faults represent errors in
@@ -1372,7 +1396,16 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
might_sleep();
}
 
-   vma = find_vma(mm, address);
+#ifdef CONFIG_SPF
+   if (spf_vma) {
+   if (can_reuse_spf_vma(spf_vma, address))
+   vma = spf_vma;
+   else
+   vma = find_vma(mm, address);
+   spf_vma = NULL;
+   } else
+#endif
+   vma = find_vma(mm, address);
if (unlikely(!vma)) {
bad_area(regs, error_code, address);
return;
@@ -1458,6 +1491,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
return;
}
 
+#ifdef CONFIG_SPF
+done:
+#endif
/*
 * Major/minor page fault accounting. If any of the events
 * returned VM_FAULT_MAJOR, we account it as a major fault.
-- 
2.7.4



[PATCH v6 22/24] mm: Speculative page fault handler return VMA

2018-01-12 Thread Laurent Dufour
When the speculative page fault handler is returning VM_FAULT_RETRY, there
is a chance that the VMA fetched without grabbing the mmap_sem can be
reused by the legacy page fault handler.  By reusing it, we avoid calling
find_vma() again. To achieve that, we must ensure that the VMA structure
will not be freed behind our back. This is done by taking a reference on
it (get_vma()) and by assuming that the caller will call the new service
can_reuse_spf_vma() once it has grabbed the mmap_sem.

can_reuse_spf_vma() first checks that the VMA is still in the RB tree,
then that the VMA's boundaries match the passed address, and releases
the reference on the VMA so that it can be freed if needed.

In the case the VMA has been freed, can_reuse_spf_vma() returns false
as the VMA is no longer in the RB tree.
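
As a condensed sketch, the caller pattern (mirroring the x86 and powerpc
users of this series) looks like:

	fault = handle_speculative_fault(mm, address, flags, &spf_vma);
	if (fault & VM_FAULT_RETRY) {
		/* fall back to the legacy path under the mmap_sem */
		down_read(&mm->mmap_sem);
		if (spf_vma && can_reuse_spf_vma(spf_vma, address))
			vma = spf_vma;		/* skip find_vma() */
		else
			vma = find_vma(mm, address);
	}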

Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h |   5 +-
 mm/memory.c| 136 +
 2 files changed, 88 insertions(+), 53 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4d8a7621da8a..02da17792f0d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1354,7 +1354,10 @@ extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags);
 #ifdef CONFIG_SPF
 extern int handle_speculative_fault(struct mm_struct *mm,
-   unsigned long address, unsigned int flags);
+   unsigned long address, unsigned int flags,
+   struct vm_area_struct **vma);
+extern bool can_reuse_spf_vma(struct vm_area_struct *vma,
+ unsigned long address);
 #endif /* CONFIG_SPF */
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
diff --git a/mm/memory.c b/mm/memory.c
index 6ccb1f45473a..e1f172ac2c90 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4284,13 +4284,22 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 /* This is required by vm_normal_page() */
 #error "Speculative page fault handler requires __HAVE_ARCH_PTE_SPECIAL"
 #endif
-
 /*
  * vm_normal_page() adds some processing which should be done while
 * holding the mmap_sem.
  */
+
+/*
+ * Tries to handle the page fault in a speculative way, without grabbing the
+ * mmap_sem.
+ * When VM_FAULT_RETRY is returned, the vma pointer is valid and this vma must
+ * be checked later when the mmap_sem has been grabbed by calling
+ * can_reuse_spf_vma().
+ * This is needed as the returned vma is kept in memory until the call to
+ * can_reuse_spf_vma() is made.
+ */
 int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
-unsigned int flags)
+unsigned int flags, struct vm_area_struct **vma)
 {
struct vm_fault vmf = {
.address = address,
@@ -4299,7 +4308,6 @@ int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
p4d_t *p4d, p4dval;
pud_t pudval;
int seq, ret = VM_FAULT_RETRY;
-   struct vm_area_struct *vma;
 #ifdef CONFIG_NUMA
struct mempolicy *pol;
 #endif
@@ -4308,14 +4316,16 @@ int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
flags |= FAULT_FLAG_SPECULATIVE;
 
-   vma = get_vma(mm, address);
-   if (!vma)
+   *vma = get_vma(mm, address);
+   if (!*vma)
return ret;
+   vmf.vma = *vma;
 
-   seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> seqlock,vma_rb_erase() */
+   /* rmb <-> seqlock,vma_rb_erase() */
+   seq = raw_read_seqcount(&vmf.vma->vm_sequence);
if (seq & 1) {
-   trace_spf_vma_changed(_RET_IP_, vma, address);
-   goto out_put;
+   trace_spf_vma_changed(_RET_IP_, vmf.vma, address);
+   return ret;
}
 
/*
@@ -4323,9 +4333,9 @@ int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
 * with the VMA.
 * This include huge page from hugetlbfs.
 */
-   if (vma->vm_ops) {
-   trace_spf_vma_notsup(_RET_IP_, vma, address);
-   goto out_put;
+   if (vmf.vma->vm_ops) {
+   trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
+   return ret;
}
 
/*
@@ -4333,18 +4343,18 @@ int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
 * because vm_next and vm_prev must be safe. This can't be guaranteed
 * in the speculative path.
 */
-   if (unlikely(!vma->anon_vma)) {
-   trace_spf_vma_notsup(_RET_IP_, vma, address);
-   goto out_put;
+   if (unlikely(!vmf.vma->anon_vma)) {
+   trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
+ 

[PATCH v6 21/24] perf tools: Add support for the SPF perf event

2018-01-12 Thread Laurent Dufour
Add support for the new speculative faults event.

Signed-off-by: Laurent Dufour 
---
 tools/include/uapi/linux/perf_event.h | 1 +
 tools/perf/util/evsel.c   | 1 +
 tools/perf/util/parse-events.c| 4 
 tools/perf/util/parse-events.l| 1 +
 tools/perf/util/python.c  | 1 +
 5 files changed, 8 insertions(+)

diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 769533696483..06c7fdb14f89 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -112,6 +112,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS  = 8,
PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT= 10,
+   PERF_COUNT_SW_SPF   = 11,
 
PERF_COUNT_SW_MAX,  /* non-ABI */
 };
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index a4d256ea0dc4..9493d7b0f9b7 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -438,6 +438,7 @@ const char *perf_evsel__sw_names[PERF_COUNT_SW_MAX] = {
"alignment-faults",
"emulation-faults",
"dummy",
+   "speculative-faults",
 };
 
 static const char *__perf_evsel__sw_name(u64 config)
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 170316795a18..e75de3c3ffbb 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -137,6 +137,10 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = {
.symbol = "bpf-output",
.alias  = "",
},
+   [PERF_COUNT_SW_SPF] = {
+   .symbol = "speculative-faults",
+   .alias  = "spf",
+   },
 };
 
 #define __PERF_EVENT_FIELD(config, name) \
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 655ecff636a8..5d6782426b30 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -308,6 +308,7 @@ emulation-faults	{ return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS); }
 dummy			{ return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); }
 duration_time		{ return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); }
 bpf-output		{ return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); }
+speculative-faults|spf	{ return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_SPF); }
 
/*
 * We have to handle the kernel PMU event cycles-ct/cycles-t/mem-loads/mem-stores separately.
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index b1e999bd21ef..100507d632fa 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -1142,6 +1142,7 @@ static struct {
PERF_CONST(COUNT_SW_ALIGNMENT_FAULTS),
PERF_CONST(COUNT_SW_EMULATION_FAULTS),
PERF_CONST(COUNT_SW_DUMMY),
+   PERF_CONST(COUNT_SW_SPF),
 
PERF_CONST(SAMPLE_IP),
PERF_CONST(SAMPLE_TID),
-- 
2.7.4



[PATCH v6 20/24] perf: Add a speculative page fault sw event

2018-01-12 Thread Laurent Dufour
Add a new software event to count succeeded speculative page faults.

Signed-off-by: Laurent Dufour 
---
 include/uapi/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 769533696483..06c7fdb14f89 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -112,6 +112,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS  = 8,
PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT= 10,
+   PERF_COUNT_SW_SPF   = 11,
 
PERF_COUNT_SW_MAX,  /* non-ABI */
 };
-- 
2.7.4



[PATCH v6 19/24] mm: Adding speculative page fault failure trace events

2018-01-12 Thread Laurent Dufour
This patch adds a set of new trace events to collect the speculative page
fault failure events.

Signed-off-by: Laurent Dufour 
---
 include/trace/events/pagefault.h | 87 
 mm/memory.c  | 62 ++--
 2 files changed, 136 insertions(+), 13 deletions(-)
 create mode 100644 include/trace/events/pagefault.h

diff --git a/include/trace/events/pagefault.h b/include/trace/events/pagefault.h
new file mode 100644
index ..1d793f8c739b
--- /dev/null
+++ b/include/trace/events/pagefault.h
@@ -0,0 +1,87 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM pagefault
+
+#if !defined(_TRACE_PAGEFAULT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGEFAULT_H
+
+#include 
+#include 
+
+DECLARE_EVENT_CLASS(spf,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address),
+
+   TP_STRUCT__entry(
+   __field(unsigned long, caller)
+   __field(unsigned long, vm_start)
+   __field(unsigned long, vm_end)
+   __field(unsigned long, address)
+   ),
+
+   TP_fast_assign(
+   __entry->caller = caller;
+   __entry->vm_start   = vma->vm_start;
+   __entry->vm_end = vma->vm_end;
+   __entry->address= address;
+   ),
+
+   TP_printk("ip:%lx vma:%lx-%lx address:%lx",
+ __entry->caller, __entry->vm_start, __entry->vm_end,
+ __entry->address)
+);
+
+DEFINE_EVENT(spf, spf_pte_lock,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_changed,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_noanon,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_notsup,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_access,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_pmd_changed,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+#endif /* _TRACE_PAGEFAULT_H */
+
+/* This part must be outside protection */
+#include 
diff --git a/mm/memory.c b/mm/memory.c
index 83640079d407..6ccb1f45473a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -80,6 +80,9 @@
 
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include 
+
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
 #endif
@@ -2460,23 +2463,30 @@ static bool pte_spinlock(struct vm_fault *vmf)
}
 
local_irq_disable();
-   if (vma_has_changed(vmf))
+   if (vma_has_changed(vmf)) {
+   trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
+   }
 
/*
 * We check if the pmd value is still the same to ensure that there
 * is a huge collapse operation in progress in our back.
 */
pmdval = READ_ONCE(*vmf->pmd);
-   if (!pmd_same(pmdval, vmf->orig_pmd))
+   if (!pmd_same(pmdval, vmf->orig_pmd)) {
+   trace_spf_pmd_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
+   }
 
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-   if (unlikely(!spin_trylock(vmf->ptl)))
+   if (unlikely(!spin_trylock(vmf->ptl))) {
+   trace_spf_pte_lock(_RET_IP_, vmf->vma, vmf->address);
goto out;
+   }
 
if (vma_has_changed(vmf)) {
spin_unlock(vmf->ptl);
+   trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
}
 
@@ -2516,16 +2526,20 @@ static bool pte_map_lock(struct vm_fault *vmf)
 * block on the PTL and thus we're safe.
 */
local_irq_disable();
-   if (vma_has_changed(vmf))
+   if (vma_has_changed(vmf)) {
+   trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
+   }
 
/*
 * We check if the pmd value is still the same to ensure that there
 * is a huge collapse operation in progress in our back.
 */
pmdval = READ_ONCE(*vmf->pmd);
-   if (!pmd_same(pmdval, vmf->orig_pmd))
+   if (!pmd_same(pmdval, vmf->orig_pmd)) {
+   

[PATCH v6 18/24] mm: Try spin lock in speculative path

2018-01-12 Thread Laurent Dufour
There is a deadlock when one CPU is doing a speculative page fault and
another one is calling do_unmap().

The deadlock occurs because the speculative path tries to spinlock the
pte while interrupts are disabled. The other CPU, in the unmap path,
has locked the pte and is waiting for all CPUs to invalidate the TLB.
As the CPU doing the speculative fault has interrupts disabled, it
can't invalidate the TLB and thus can't get the lock.

Since we are in a speculative path, we can race with other mm actions.
So assume that the lock may not be acquired, and fail the speculative
page fault.

Here are the stacks captured during the deadlock:

CPU 0
native_flush_tlb_others+0x7c/0x260
flush_tlb_mm_range+0x6a/0x220
tlb_flush_mmu_tlbonly+0x63/0xc0
unmap_page_range+0x897/0x9d0
? unmap_single_vma+0x7d/0xe0
? release_pages+0x2b3/0x360
unmap_single_vma+0x7d/0xe0
unmap_vmas+0x51/0xa0
unmap_region+0xbd/0x130
do_munmap+0x279/0x460
SyS_munmap+0x53/0x70

CPU 1
do_raw_spin_lock+0x14e/0x160
_raw_spin_lock+0x5d/0x80
? pte_map_lock+0x169/0x1b0
pte_map_lock+0x169/0x1b0
handle_pte_fault+0xbf2/0xd80
? trace_hardirqs_on+0xd/0x10
handle_speculative_fault+0x272/0x280
handle_speculative_fault+0x5/0x280
__do_page_fault+0x187/0x580
trace_do_page_fault+0x52/0x260
do_async_page_fault+0x19/0x70
async_page_fault+0x28/0x30

Signed-off-by: Laurent Dufour 
---
 mm/memory.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 96720cc7ca74..83640079d407 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2472,7 +2472,8 @@ static bool pte_spinlock(struct vm_fault *vmf)
goto out;
 
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-   spin_lock(vmf->ptl);
+   if (unlikely(!spin_trylock(vmf->ptl)))
+   goto out;
 
if (vma_has_changed(vmf)) {
spin_unlock(vmf->ptl);
@@ -2526,8 +2527,20 @@ static bool pte_map_lock(struct vm_fault *vmf)
if (!pmd_same(pmdval, vmf->orig_pmd))
goto out;
 
-   pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
- vmf->address, );
+   /*
+* Same as pte_offset_map_lock() except that we call
+* spin_trylock() in place of spin_lock() to avoid a race with the
+* unmap path, which may hold the lock and wait for this CPU to
+* invalidate the TLB while this CPU has irqs disabled.
+* Since we are in a speculative path, accept that it could fail.
+*/
+   ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+   pte = pte_offset_map(vmf->pmd, vmf->address);
+   if (unlikely(!spin_trylock(ptl))) {
+   pte_unmap(pte);
+   goto out;
+   }
+
if (vma_has_changed(vmf)) {
pte_unmap_unlock(pte, ptl);
goto out;
-- 
2.7.4



[PATCH v6 17/24] mm: Provide speculative fault infrastructure

2018-01-12 Thread Laurent Dufour
From: Peter Zijlstra 

Provide infrastructure to do a speculative fault (not holding
mmap_sem).

The not holding of mmap_sem means we can race against VMA
change/removal and page-table destruction. We use the SRCU VMA freeing
to keep the VMA around. We use the VMA seqcount to detect change
(including unmapping / page-table deletion) and we use gup_fast() style
page-table walking to deal with page-table races.

Once we've obtained the page and are ready to update the PTE, we
validate that the state we started the fault with is still valid. If
not, we fail the fault with VM_FAULT_RETRY; otherwise we update the
PTE and we're done. A rough skeleton of this flow follows.
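
For illustration only, this flow can be sketched as follows (the function
signature and the vm_fault fields match the hunks below, but the body is a
simplified assumption, not the patch's exact code; get_vma()/put_vma() come
from the mm_rb rwlock patch of this series):

int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
			     unsigned int flags)
{
	struct vm_fault vmf = {
		.address = address & PAGE_MASK,
		.flags = flags | FAULT_FLAG_SPECULATIVE,
	};
	struct vm_area_struct *vma;
	int ret = VM_FAULT_RETRY;

	vma = get_vma(mm, address);	/* refcounted lookup, no mmap_sem */
	if (!vma)
		return ret;
	vmf.vma = vma;

	/* Snapshot the VMA sequence count; an odd value means a writer
	 * (vma_adjust()/unmap_page_range()) is in progress, so bail out. */
	vmf.sequence = raw_read_seqcount(&vma->vm_sequence);
	if (vmf.sequence & 1)
		goto out;

	/* ... check vm_flags, anon_vma, boundaries and mempolicy, then
	 * walk pgd/p4d/pud/pmd with READ_ONCE() as gup_fast() does,
	 * recording vmf.orig_pmd on the way ... */

	ret = handle_pte_fault(&vmf);	/* pte_map_lock() re-validates */
out:
	put_vma(vma);
	return ret;
}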

Signed-off-by: Peter Zijlstra (Intel) 

[Manage the newly introduced pte_spinlock() for speculative page
 fault to fail if the VMA is touched in our back]
[Rename vma_is_dead() to vma_has_changed() and declare it here]
[Fetch p4d and pud]
[Set vmd.sequence in __handle_mm_fault()]
[Abort speculative path when handle_userfault() has to be called]
[Add additional VMA's flags checks in handle_speculative_fault()]
[Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()]
[Don't set vmf->pte and vmf->ptl if pte_map_lock() failed]
[Remove warning comment about waiting for !seq&1 since we don't want
 to wait]
[Remove warning about no huge page support, mention it explicitly]
[Don't call do_fault() in the speculative path as __do_fault() calls
 vma->vm_ops->fault() which may want to release mmap_sem]
[Only vm_fault pointer argument for vma_has_changed()]
[Fix check against huge page, calling pmd_trans_huge()]
[Use READ_ONCE() when reading VMA's fields in the speculative path]
[Explicitly check for __HAVE_ARCH_PTE_SPECIAL as we can't support for
 processing done in vm_normal_page()]
[Check that vma->anon_vma is already set when starting the speculative
 path]
[Check for memory policy as we can't support MPOL_INTERLEAVE case due to
 the processing done in mpol_misplaced()]
[Don't support VMA growing up or down]
[Move check on vm_sequence just before calling handle_pte_fault()]
[Don't build SPF services if !CONFIG_SPF]
[Add mem cgroup oom check]
[Use READ_ONCE to access p*d entries]
[Replace deprecated ACCESS_ONCE() by READ_ONCE() in vma_has_changed()]
[Don't fetch pte again in handle_pte_fault() when running the speculative
 path]
[Check PMD against concurrent collapsing operation]
Signed-off-by: Laurent Dufour 
---
 include/linux/hugetlb_inline.h |   2 +-
 include/linux/mm.h |   8 +
 include/linux/pagemap.h|   4 +-
 mm/internal.h  |  16 +-
 mm/memory.c| 321 -
 5 files changed, 344 insertions(+), 7 deletions(-)

diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 0660a03d37d9..9e25283d6fc9 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -8,7 +8,7 @@
 
 static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
-   return !!(vma->vm_flags & VM_HUGETLB);
+   return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);
 }
 
 #else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0d1b8d2d1e4f..4d8a7621da8a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -331,6 +331,10 @@ struct vm_fault {
gfp_t gfp_mask; /* gfp mask to be used for allocations */
pgoff_t pgoff;  /* Logical page offset based on vma */
unsigned long address;  /* Faulting virtual address */
+#ifdef CONFIG_SPF
+   unsigned int sequence;
+   pmd_t orig_pmd; /* value of PMD at the time of fault */
+#endif
pmd_t *pmd; /* Pointer to pmd entry matching
 * the 'address' */
pud_t *pud; /* Pointer to pud entry matching
@@ -1348,6 +1352,10 @@ int invalidate_inode_page(struct page *page);
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags);
+#ifdef CONFIG_SPF
+extern int handle_speculative_fault(struct mm_struct *mm,
+   unsigned long address, unsigned int flags);
+#endif /* CONFIG_SPF */
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 34ce3ebf97d5..70e4d2688e7b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -456,8 +456,8 @@ static inline pgoff_t linear_page_index(struct 
vm_area_struct *vma,
pgoff_t pgoff;
if (unlikely(is_vm_hugetlb_page(vma)))
return linear_hugepage_index(vma, address);
-   pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
-   pgoff += vma->vm_pgoff;
+   pgoff = (address - READ_ONCE(vma->vm_start)) >> PAGE_SHIFT;
+   pgoff += READ_ONCE(vma->vm_pgoff);

[PATCH v6 16/24] mm: Protect mm_rb tree with a rwlock

2018-01-12 Thread Laurent Dufour
This change is inspired by Peter's proposal patch [1] which was
protecting the VMA using SRCU. Unfortunately, SRCU does not scale well in
that particular case, and it introduces major performance degradation
due to excessive scheduling operations.

To allow access to the mm_rb tree without grabbing the mmap_sem, this patch
protects its access using a rwlock.  As a lookup in the mm_rb tree is an
O(log n) search, it is safe to protect it using such a lock.  The VMA cache
is not protected by the new rwlock and it should not be used without holding
the mmap_sem.

To allow the picked VMA structure to be used once the rwlock is released, a
use count is added to the VMA structure. When the VMA is allocated it is
set to 1.  Each time the VMA is picked with the rwlock held, its use count
is incremented. Each time the VMA is released it is decremented. When the
use count hits zero, the VMA is no longer in use and should be freed.

This patch is preparing for 2 kinds of VMA access:
 - as usual, under the control of the mmap_sem,
 - without holding the mmap_sem, for the speculative page fault handler.

Access done under the control of the mmap_sem doesn't require grabbing the
rwlock to protect read access to the mm_rb tree, but write access must
be done under the protection of the rwlock too. This affects inserting and
removing elements in the RB tree.

The patch introduces 2 new functions:
 - vma_get() to find a VMA based on an address while holding the new rwlock.
 - vma_put() to release the VMA when it is no longer used.
These services are designed to be used when accesses are made to the RB tree
without holding the mmap_sem; a sketch of their shape follows.
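
For illustration, here is roughly the shape of these helpers under this
scheme (the mm/internal.h hunk below declares them as get_vma() and
put_vma(); the bodies, and the __find_vma()/__free_vma() helpers, are
assumptions made for the sketch):

struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;

	read_lock(&mm->mm_rb_lock);
	vma = __find_vma(mm, addr);		/* plain rb-tree walk */
	if (vma)
		atomic_inc(&vma->vm_ref_count);	/* pin it past the unlock */
	read_unlock(&mm->mm_rb_lock);

	return vma;
}

void put_vma(struct vm_area_struct *vma)
{
	if (atomic_dec_and_test(&vma->vm_ref_count))
		__free_vma(vma);		/* last user frees it */
}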

When a VMA is removed from the RB tree, its vma->vm_rb field is cleared and
we rely on the WMB done when releasing the rwlock to serialize the write
with the RMB done in a later patch to check for the VMA's validity.

When free_vma is called, the file associated with the VMA is closed
immediately, but the policy and the file structure remain in use until
the VMA's use count reaches 0, which may happen later when exiting an
in-progress speculative page fault.

[1] https://patchwork.kernel.org/patch/5108281/

Signed-off-by: Laurent Dufour 
Cc: Peter Zijlstra (Intel) 
---
 include/linux/mm_types.h |   4 ++
 kernel/fork.c|   3 ++
 mm/init-mm.c |   3 ++
 mm/internal.h|   6 +++
 mm/mmap.c| 120 ++-
 5 files changed, 104 insertions(+), 32 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e0e3df3b9641..2684df7e7294 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -335,6 +335,7 @@ struct vm_area_struct {
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 #ifdef CONFIG_SPF
seqcount_t vm_sequence;
+   atomic_t vm_ref_count;  /* see vma_get(), vma_put() */
 #endif
 } __randomize_layout;
 
@@ -353,6 +354,9 @@ struct kioctx_table;
 struct mm_struct {
struct vm_area_struct *mmap;/* list of VMAs */
struct rb_root mm_rb;
+#ifdef CONFIG_SPF
+   rwlock_t mm_rb_lock;
+#endif
u32 vmacache_seqnum;   /* per-thread vmacache */
 #ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
diff --git a/kernel/fork.c b/kernel/fork.c
index 0914307d4f3b..d99606e1e9ba 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -898,6 +898,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, 
struct task_struct *p,
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
mm->vmacache_seqnum = 0;
+#ifdef CONFIG_SPF
+   rwlock_init(>mm_rb_lock);
+#endif
atomic_set(>mm_users, 1);
atomic_set(>mm_count, 1);
init_rwsem(>mmap_sem);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index f94d5d15ebc0..aaa5d7851d87 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -17,6 +17,9 @@
 
 struct mm_struct init_mm = {
.mm_rb  = RB_ROOT,
+#ifdef CONFIG_SPF
+   .mm_rb_lock = __RW_LOCK_UNLOCKED(init_mm.mm_rb_lock),
+#endif
.pgd= swapper_pg_dir,
.mm_users   = ATOMIC_INIT(2),
.mm_count   = ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index 62d8c34e63d5..4b9c3357bd6c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -40,6 +40,12 @@ void page_writeback_init(void);
 
 int do_swap_page(struct vm_fault *vmf);
 
+#ifdef CONFIG_SPF
+extern struct vm_area_struct *get_vma(struct mm_struct *mm,
+ unsigned long addr);
+extern void put_vma(struct vm_area_struct *vma);
+#endif
+
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 960e2f16ffcf..972ddee0b151 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -160,6 +160,27 @@ void unlink_file_vma(struct vm_area_struct *vma)
}
 }
 
+static void 

[PATCH v6 15/24] mm: Introduce __page_add_new_anon_rmap()

2018-01-12 Thread Laurent Dufour
When dealing with the speculative page fault handler, we may race with a VMA
being split or merged. In this case the vma->vm_start and vma->vm_end
fields may not match the address at which the page fault is occurring.

This can only happen when the VMA is split, but in that case the
anon_vma pointer of the new VMA will be the same as the original one,
because in __split_vma the new->anon_vma is set to src->anon_vma when
*new = *vma.

So even if the VMA boundaries are not correct, the anon_vma pointer is
still valid.

If the VMA has been merged, then the VMA in which it has been merged
must have the same anon_vma pointer otherwise the merge can't be done.

So in all cases we know that the anon_vma is valid: we have checked,
before starting the speculative page fault, that the anon_vma pointer
is valid for this VMA. Since there is an anon_vma, at some point a page
has been backed by it, and before the VMA is cleaned the page table
lock would have to be grabbed to clean the PTE; the anon_vma field is
checked once the PTE is locked.

This patch introduces a new __page_add_new_anon_rmap() service which
doesn't check the VMA boundaries, and creates a new inline one
which does the check.

When called from a page fault handler, if this is not a speculative one,
there is a guarantee that vm_start and vm_end match the faulting address,
so this check is useless. In the context of the speculative page fault
handler, this check may be wrong but anon_vma is still valid as explained
above.

Signed-off-by: Laurent Dufour 
---
 include/linux/rmap.h | 12 ++--
 mm/memory.c  |  8 
 mm/rmap.c|  5 ++---
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 988d176472df..a5d282573093 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -174,8 +174,16 @@ void page_add_anon_rmap(struct page *, struct 
vm_area_struct *,
unsigned long, bool);
 void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
   unsigned long, int);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
-   unsigned long, bool);
+void __page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+ unsigned long, bool);
+static inline void page_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address, bool compound)
+{
+   VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+   __page_add_new_anon_rmap(page, vma, address, compound);
+}
+
 void page_add_file_rmap(struct page *, bool);
 void page_remove_rmap(struct page *, bool);
 
diff --git a/mm/memory.c b/mm/memory.c
index a7cb109bf25a..0ce303260ad1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2539,7 +2539,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 * thread doing COW.
 */
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
-   page_add_new_anon_rmap(new_page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
__lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags);
/*
@@ -3082,7 +3082,7 @@ int do_swap_page(struct vm_fault *vmf)
 
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
-   page_add_new_anon_rmap(page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
} else {
@@ -3232,7 +3232,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
}
 
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-   page_add_new_anon_rmap(page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
 setpte:
@@ -3484,7 +3484,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup 
*memcg,
/* copy-on-write page */
if (write && !(vmf->vma_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-   page_add_new_anon_rmap(page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
} else {
diff --git a/mm/rmap.c b/mm/rmap.c
index 47db27f8049e..6ec168ba5f73 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1136,7 

[PATCH v6 14/24] mm: Introduce __vm_normal_page()

2018-01-12 Thread Laurent Dufour
When dealing with the speculative fault path we should use the VMA's field
cached value stored in the vm_fault structure.

Currently vm_normal_page() uses the pointer to the VMA to fetch the
vm_flags value. This patch provides a new __vm_normal_page() which
receives the vm_flags value as a parameter.

Note: The speculative path is turned on for architectures providing support
for the special PTE flag. So only the first block of vm_normal_page is used
during the speculative path.

Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h |  7 +--
 mm/memory.c| 18 ++
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d77a11067b94..0d1b8d2d1e4f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1260,8 +1260,11 @@ struct zap_details {
pgoff_t last_index; /* Highest page->index to unmap */
 };
 
-struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-pte_t pte, bool with_public_device);
+struct page *__vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, bool with_public_device,
+ unsigned long vma_flags);
+#define _vm_normal_page(vma, addr, pte, with_public_device) \
+   __vm_normal_page(vma, addr, pte, with_public_device, (vma)->vm_flags)
 #define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)
 
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
diff --git a/mm/memory.c b/mm/memory.c
index c24891d5676f..a7cb109bf25a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -826,8 +826,9 @@ static void print_bad_pte(struct vm_area_struct *vma, 
unsigned long addr,
 #else
 # define HAVE_PTE_SPECIAL 0
 #endif
-struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-pte_t pte, bool with_public_device)
+struct page *__vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, bool with_public_device,
+ unsigned long vma_flags)
 {
unsigned long pfn = pte_pfn(pte);
 
@@ -836,7 +837,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, 
unsigned long addr,
goto check_pfn;
if (vma->vm_ops && vma->vm_ops->find_special_page)
return vma->vm_ops->find_special_page(vma, addr);
-   if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+   if (vma_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
if (is_zero_pfn(pfn))
return NULL;
@@ -868,8 +869,8 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, 
unsigned long addr,
 
/* !HAVE_PTE_SPECIAL case follows: */
 
-   if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
-   if (vma->vm_flags & VM_MIXEDMAP) {
+   if (unlikely(vma_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+   if (vma_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
return NULL;
goto out;
@@ -878,7 +879,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, 
unsigned long addr,
off = (addr - vma->vm_start) >> PAGE_SHIFT;
if (pfn == vma->vm_pgoff + off)
return NULL;
-   if (!is_cow_mapping(vma->vm_flags))
+   if (!is_cow_mapping(vma_flags))
return NULL;
}
}
@@ -2722,7 +2723,8 @@ static int do_wp_page(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
 
-   vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+   vmf->page = __vm_normal_page(vma, vmf->address, vmf->orig_pte, false,
+vmf->vma_flags);
if (!vmf->page) {
/*
 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
@@ -3837,7 +3839,7 @@ static int do_numa_page(struct vm_fault *vmf)
ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
 
-   page = vm_normal_page(vma, vmf->address, pte);
+   page = __vm_normal_page(vma, vmf->address, pte, false, vmf->vma_flags);
if (!page) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
-- 
2.7.4



[PATCH v6 13/24] mm: Introduce __maybe_mkwrite()

2018-01-12 Thread Laurent Dufour
The current maybe_mkwrite() is getting passed the pointer to the vma
structure to fetch the vm_flags field.

When dealing with the speculative page fault handler, it will be better to
rely on the cached vm_flags value stored in the vm_fault structure.

This patch introduces a __maybe_mkwrite() service which can be called by
passing the value of the vm_flags field.

There are no functional changes expected for the other callers of
maybe_mkwrite().

Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h | 9 +++--
 mm/memory.c| 6 +++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f14a73a8d420..d77a11067b94 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -685,13 +685,18 @@ void free_compound_page(struct page *page);
  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
  * that do not have writing enabled, when used by access_process_vm.
  */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+static inline pte_t __maybe_mkwrite(pte_t pte, unsigned long vma_flags)
 {
-   if (likely(vma->vm_flags & VM_WRITE))
+   if (likely(vma_flags & VM_WRITE))
pte = pte_mkwrite(pte);
return pte;
 }
 
+static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+   return __maybe_mkwrite(pte, vma->vm_flags);
+}
+
 int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page);
 int finish_fault(struct vm_fault *vmf);
diff --git a/mm/memory.c b/mm/memory.c
index cbd7e5c3a42f..c24891d5676f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2438,7 +2438,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
 
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = pte_mkyoung(vmf->orig_pte);
-   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+   entry = __maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
update_mmu_cache(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2529,8 +2529,8 @@ static int wp_page_copy(struct vm_fault *vmf)
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
-   entry = mk_pte(new_page, vma->vm_page_prot);
-   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+   entry = mk_pte(new_page, vmf->vma_page_prot);
+   entry = __maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
/*
 * Clear the pte entry and flush it first, before updating the
 * pte with the new entry. This will avoid a race condition
-- 
2.7.4



[PATCH v6 12/24] mm: Introduce __lru_cache_add_active_or_unevictable

2018-01-12 Thread Laurent Dufour
The speculative page fault handler, which runs without holding the
mmap_sem, calls lru_cache_add_active_or_unevictable(), but the vm_flags
value is not guaranteed to remain constant.
Introduce __lru_cache_add_active_or_unevictable(), which takes the vma
flags value as a parameter instead of the vma pointer.

Signed-off-by: Laurent Dufour 
---
 include/linux/swap.h | 10 --
 mm/memory.c  |  8 
 mm/swap.c|  6 +++---
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index a1a3f4ed94ce..99377b66ea93 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -337,8 +337,14 @@ extern void deactivate_file_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
 extern void swap_setup(void);
 
-extern void lru_cache_add_active_or_unevictable(struct page *page,
-   struct vm_area_struct *vma);
+extern void __lru_cache_add_active_or_unevictable(struct page *page,
+   unsigned long vma_flags);
+
+static inline void lru_cache_add_active_or_unevictable(struct page *page,
+   struct vm_area_struct *vma)
+{
+   return __lru_cache_add_active_or_unevictable(page, vma->vm_flags);
+}
 
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
diff --git a/mm/memory.c b/mm/memory.c
index e4c0f08b78e8..cbd7e5c3a42f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2540,7 +2540,7 @@ static int wp_page_copy(struct vm_fault *vmf)
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(new_page, vma);
+   __lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags);
/*
 * We call the notify macro here because, when using secondary
 * mmu page tables (such as kvm shadow page tables), we want the
@@ -3082,7 +3082,7 @@ int do_swap_page(struct vm_fault *vmf)
if (unlikely(page != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
} else {
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
mem_cgroup_commit_charge(page, memcg, true, false);
@@ -3232,7 +3232,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
 setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
@@ -3484,7 +3484,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup 
*memcg,
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
diff --git a/mm/swap.c b/mm/swap.c
index 566cfb9fdaf3..7e25a74397b9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -455,12 +455,12 @@ void lru_cache_add(struct page *page)
  * directly back onto it's zone's unevictable list, it does NOT use a
  * per cpu pagevec.
  */
-void lru_cache_add_active_or_unevictable(struct page *page,
-struct vm_area_struct *vma)
+void __lru_cache_add_active_or_unevictable(struct page *page,
+  unsigned long vma_flags)
 {
VM_BUG_ON_PAGE(PageLRU(page), page);
 
-   if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
+   if (likely((vma_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
SetPageActive(page);
else if (!TestSetPageMlocked(page)) {
/*
-- 
2.7.4



[PATCH v6 11/24] mm/migrate: Pass vm_fault pointer to migrate_misplaced_page()

2018-01-12 Thread Laurent Dufour
migrate_misplaced_page() is only called during page fault handling, so
it's better to pass a pointer to the struct vm_fault instead of the vma.

This way, during the speculative page fault path, the saved vma->vm_flags
can be used.

Signed-off-by: Laurent Dufour 
---
 include/linux/migrate.h | 4 ++--
 mm/memory.c | 2 +-
 mm/migrate.c| 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 0c6fe904bc97..08960ec74246 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -126,14 +126,14 @@ static inline void __ClearPageMovable(struct page *page)
 #ifdef CONFIG_NUMA_BALANCING
 extern bool pmd_trans_migrating(pmd_t pmd);
 extern int migrate_misplaced_page(struct page *page,
- struct vm_area_struct *vma, int node);
+ struct vm_fault *vmf, int node);
 #else
 static inline bool pmd_trans_migrating(pmd_t pmd)
 {
return false;
 }
 static inline int migrate_misplaced_page(struct page *page,
-struct vm_area_struct *vma, int node)
+struct vm_fault *vmf, int node)
 {
return -EAGAIN; /* can't migrate now */
 }
diff --git a/mm/memory.c b/mm/memory.c
index 79dfd2a60224..e4c0f08b78e8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3878,7 +3878,7 @@ static int do_numa_page(struct vm_fault *vmf)
}
 
/* Migrate to the requested node */
-   migrated = migrate_misplaced_page(page, vma, target_nid);
+   migrated = migrate_misplaced_page(page, vmf, target_nid);
if (migrated) {
page_nid = target_nid;
flags |= TNF_MIGRATED;
diff --git a/mm/migrate.c b/mm/migrate.c
index 33224b92db98..6e1ae62501da 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1900,7 +1900,7 @@ bool pmd_trans_migrating(pmd_t pmd)
  * node. Caller is expected to have an elevated reference count on
  * the page that will be dropped by this function before returning.
  */
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+int migrate_misplaced_page(struct page *page, struct vm_fault *vmf,
   int node)
 {
pg_data_t *pgdat = NODE_DATA(node);
@@ -1913,7 +1913,7 @@ int migrate_misplaced_page(struct page *page, struct 
vm_area_struct *vma,
 * with execute permissions as they are probably shared libraries.
 */
if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
-   (vma->vm_flags & VM_EXEC))
+   (vmf->vma_flags & VM_EXEC))
goto out;
 
/*
-- 
2.7.4



[PATCH v6 10/24] mm: Cache some VMA fields in the vm_fault structure

2018-01-12 Thread Laurent Dufour
When handling a speculative page fault, the vma->vm_flags and
vma->vm_page_prot fields are read once the page table lock is released. So
there is no longer any guarantee that these fields will not change behind
our back. They will be saved in the vm_fault structure before the VMA is
checked for changes.

This patch also sets the fields in hugetlb_no_page() and
__collapse_huge_page_swapin() even if they are not needed by the callee.

Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h |  6 ++
 mm/hugetlb.c   |  2 ++
 mm/khugepaged.c|  2 ++
 mm/memory.c| 38 --
 4 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 61a2b63eccad..f14a73a8d420 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -361,6 +361,12 @@ struct vm_fault {
 * page table to avoid allocation from
 * atomic context.
 */
+   /*
+* These entries are required when handling speculative page fault.
+* This way the page handling is done using consistent field values.
+*/
+   unsigned long vma_flags;
+   pgprot_t vma_page_prot;
 };
 
 /* page entry size for vm->huge_fault() */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ffcae114ceed..3b163ecc80e6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3717,6 +3717,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
.vma = vma,
.address = address,
.flags = flags,
+   .vma_flags = vma->vm_flags,
+   .vma_page_prot = vma->vm_page_prot,
/*
 * Hard to debug if it ends up being
 * used by a callee that assumes
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 32314e9e48dd..a946d5306160 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -882,6 +882,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct 
*mm,
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
.pgoff = linear_page_index(vma, address),
+   .vma_flags = vma->vm_flags,
+   .vma_page_prot = vma->vm_page_prot,
};
 
/* we only decide to swapin, if there is enough young ptes */
diff --git a/mm/memory.c b/mm/memory.c
index 3ac54a65b0f9..79dfd2a60224 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2595,7 +2595,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 * Don't let another task, with possibly unlocked vma,
 * keep the mlocked page.
 */
-   if (page_copied && (vma->vm_flags & VM_LOCKED)) {
+   if (page_copied && (vmf->vma_flags & VM_LOCKED)) {
lock_page(old_page);/* LRU manipulation */
if (PageMlocked(old_page))
munlock_vma_page(old_page);
@@ -2629,7 +2629,7 @@ static int wp_page_copy(struct vm_fault *vmf)
  */
 int finish_mkwrite_fault(struct vm_fault *vmf)
 {
-   WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
+   WARN_ON_ONCE(!(vmf->vma_flags & VM_SHARED));
if (!pte_map_lock(vmf))
return VM_FAULT_RETRY;
/*
@@ -2731,7 +2731,7 @@ static int do_wp_page(struct vm_fault *vmf)
 * We should not cow pages in a shared writeable mapping.
 * Just mark the pages writable and/or call ops->pfn_mkwrite.
 */
-   if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+   if ((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
 (VM_WRITE|VM_SHARED))
return wp_pfn_shared(vmf);
 
@@ -2778,7 +2778,7 @@ static int do_wp_page(struct vm_fault *vmf)
return VM_FAULT_WRITE;
}
unlock_page(vmf->page);
-   } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+   } else if (unlikely((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(vmf);
}
@@ -3065,7 +3065,7 @@ int do_swap_page(struct vm_fault *vmf)
 
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
-   pte = mk_pte(page, vma->vm_page_prot);
+   pte = mk_pte(page, vmf->vma_page_prot);
if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
@@ -3091,7 +3091,7 @@ int do_swap_page(struct vm_fault *vmf)
 
swap_free(entry);
if (mem_cgroup_swap_full(page) ||
-   (vma->vm_flags & 

[PATCH v6 09/24] mm: Protect SPF handler against anon_vma changes

2018-01-12 Thread Laurent Dufour
The speculative page fault handler must be protected against anon_vma
changes. This is because page_add_new_anon_rmap() is called during the
speculative path.

In addition, don't try a speculative page fault if the VMA doesn't have
an anon_vma structure allocated, because its allocation should be
protected by the mmap_sem.

In __vma_adjust(), when importer->anon_vma is set, there is no need to
protect against speculative page faults since the speculative page fault
is aborted if vma->anon_vma is not set.

When calling page_add_new_anon_rmap(), vma->anon_vma is necessarily
valid since we checked for it when locking the pte, and the anon_vma is
only removed once the pte is unlocked. This holds even if the
speculative page fault handler runs concurrently with do_munmap(): the
pte is locked in unmap_region() - through unmap_vmas() - and the
anon_vma is unlinked later, while we check the vma sequence counter,
which is updated in unmap_page_range() before locking the pte and then
again in free_pgtables(). So when locking the pte, the change will be
detected; a sketch of this ordering follows.

Signed-off-by: Laurent Dufour 
---
 mm/memory.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index 22bdc5c6c5ee..3ac54a65b0f9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -624,7 +624,9 @@ void free_pgtables(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
 * Hide vma from rmap and truncate_pagecache before freeing
 * pgtables
 */
+   vm_write_begin(vma);
unlink_anon_vmas(vma);
+   vm_write_end(vma);
unlink_file_vma(vma);
 
if (is_vm_hugetlb_page(vma)) {
@@ -638,7 +640,9 @@ void free_pgtables(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
   && !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
+   vm_write_begin(vma);
unlink_anon_vmas(vma);
+   vm_write_end(vma);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
-- 
2.7.4



[PATCH v6 08/24] mm: protect mremap() against SPF handler

2018-01-12 Thread Laurent Dufour
If a thread is remapping an area while another one is faulting on the
destination area, the SPF handler may fetch the vma from the RB tree before
the pte has been moved by the other thread. This means that the moved ptes
will overwrite those created by the page fault handler, leading to leaked
pages.

CPU 1   CPU2
enter mremap()
unmap the dest area
copy_vma()  Enter speculative page fault handler
   >> at this time the dest area is present in the RB tree
fetch the vma matching dest area
create a pte as the VMA matched
Exit the SPF handler

move_ptes()
  > it is assumed that the dest area is empty,
  > the move ptes overwrite the page mapped by the CPU2.

To prevent that, when the VMA matching the dest area is extended or created
by copy_vma(), it should be marked as not available to the SPF handler.
The usual way to do so is to rely on vm_write_begin()/end().
This is already done in __vma_adjust(), called by copy_vma() (through
vma_merge()). But __vma_adjust() calls vm_write_end() before returning,
which creates a window for another thread.
This patch adds a new parameter to vma_merge() which is passed down to
vma_adjust().
The assumption is that copy_vma() returns a vma which should be
released by the caller calling vm_raw_write_end() once the ptes have
been moved; a sketch of that caller side follows.
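
For illustration, the caller side this assumes looks roughly like this (a
simplified fragment of the move_vma() path in mm/mremap.c, not the exact
hunk):

	/* copy_vma() hands back a destination vma on which
	 * vm_raw_write_begin() is still held, so the SPF handler
	 * cannot use it yet */
	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
			   &need_rmap_locks);
	if (!new_vma)
		return -ENOMEM;

	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr,
				     old_len, need_rmap_locks);
	/* ... error handling elided ... */

	/* only once the ptes are in place is the dest vma made
	 * visible to the speculative path again */
	vm_raw_write_end(new_vma);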

Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h | 16 
 mm/mmap.c  | 47 ---
 mm/mremap.c| 13 +
 3 files changed, 61 insertions(+), 15 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ca7ceba84292..61a2b63eccad 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2186,16 +2186,24 @@ void anon_vma_interval_tree_verify(struct 
anon_vma_chain *node);
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int 
cap_sys_admin);
 extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
-   struct vm_area_struct *expand);
+   struct vm_area_struct *expand, bool keep_locked);
 static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
 {
-   return __vma_adjust(vma, start, end, pgoff, insert, NULL);
+   return __vma_adjust(vma, start, end, pgoff, insert, NULL, false);
 }
-extern struct vm_area_struct *vma_merge(struct mm_struct *,
+extern struct vm_area_struct *__vma_merge(struct mm_struct *,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-   struct mempolicy *, struct vm_userfaultfd_ctx);
+   struct mempolicy *, struct vm_userfaultfd_ctx, bool keep_locked);
+static inline struct vm_area_struct *vma_merge(struct mm_struct *vma,
+   struct vm_area_struct *prev, unsigned long addr, unsigned long end,
+   unsigned long vm_flags, struct anon_vma *anon, struct file *file,
+   pgoff_t off, struct mempolicy *pol, struct vm_userfaultfd_ctx uff)
+{
+   return __vma_merge(vma, prev, addr, end, vm_flags, anon, file, off,
+  pol, uff, false);
+}
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
diff --git a/mm/mmap.c b/mm/mmap.c
index 73e740291a3a..960e2f16ffcf 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -684,7 +684,7 @@ static inline void __vma_unlink_prev(struct mm_struct *mm,
  */
 int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
-   struct vm_area_struct *expand)
+   struct vm_area_struct *expand, bool keep_locked)
 {
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
@@ -996,7 +996,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
 
if (next && next != vma)
vm_raw_write_end(next);
-   vm_raw_write_end(vma);
+   if (!keep_locked)
+   vm_raw_write_end(vma);
 
validate_mm(mm);
 
@@ -1132,12 +1133,13 @@ can_vma_merge_after(struct vm_area_struct *vma, 
unsigned long vm_flags,
  * parameter) may establish ptes with the wrong permissions of 
  * instead of the right permissions of .
  */
-struct vm_area_struct *vma_merge(struct mm_struct *mm,
+struct vm_area_struct *__vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long 

[PATCH v6 07/24] mm: Protect VMA modifications using VMA sequence count

2018-01-12 Thread Laurent Dufour
The VMA sequence count has been introduced to allow fast detection of
VMA modification when running a page fault handler without holding
the mmap_sem.

This patch provides protection against the VMA modification done in :
- madvise()
- mpol_rebind_policy()
- vma_replace_policy()
- change_prot_numa()
- mlock(), munlock()
- mprotect()
- mmap_region()
- collapse_huge_page()
- userfaultd registering services

In addition, VMA fields which will be read during the speculative fault
path need to be written using WRITE_ONCE to prevent writes from being split
and intermediate values from being pushed to other CPUs.

Signed-off-by: Laurent Dufour 
---
 fs/proc/task_mmu.c |  5 -
 fs/userfaultfd.c   | 17 +
 mm/khugepaged.c|  3 +++
 mm/madvise.c   |  6 +-
 mm/mempolicy.c | 51 ++-
 mm/mlock.c | 13 -
 mm/mmap.c  | 17 ++---
 mm/mprotect.c  |  4 +++-
 mm/swap_state.c|  8 ++--
 9 files changed, 86 insertions(+), 38 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ec6d2983a5cb..e312d67e0297 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1155,8 +1155,11 @@ static ssize_t clear_refs_write(struct file *file, const 
char __user *buf,
goto out_mm;
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
-   vma->vm_flags &= ~VM_SOFTDIRTY;
+   vm_write_begin(vma);
+   WRITE_ONCE(vma->vm_flags,
+  vma->vm_flags & ~VM_SOFTDIRTY);
vma_set_page_prot(vma);
+   vm_write_end(vma);
}
downgrade_write(>mmap_sem);
break;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 87a13a7c8270..1da1ba63c7dd 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -659,8 +659,11 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct 
list_head *fcs)
 
octx = vma->vm_userfaultfd_ctx.ctx;
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+   vm_write_begin(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-   vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+   WRITE_ONCE(vma->vm_flags,
+  vma->vm_flags & ~(VM_UFFD_WP | VM_UFFD_MISSING));
+   vm_write_end(vma);
return 0;
}
 
@@ -885,8 +888,10 @@ static int userfaultfd_release(struct inode *inode, struct 
file *file)
vma = prev;
else
prev = vma;
-   vma->vm_flags = new_flags;
+   vm_write_begin(vma);
+   WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+   vm_write_end(vma);
}
up_write(>mmap_sem);
mmput(mm);
@@ -1434,8 +1439,10 @@ static int userfaultfd_register(struct userfaultfd_ctx 
*ctx,
 * the next vma was merged into the current one and
 * the current one has not been updated yet.
 */
-   vma->vm_flags = new_flags;
+   vm_write_begin(vma);
+   WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx.ctx = ctx;
+   vm_write_end(vma);
 
skip:
prev = vma;
@@ -1592,8 +1599,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx 
*ctx,
 * the next vma was merged into the current one and
 * the current one has not been updated yet.
 */
-   vma->vm_flags = new_flags;
+   vm_write_begin(vma);
+   WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+   vm_write_end(vma);
 
skip:
prev = vma;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b7e2268dfc9a..32314e9e48dd 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1006,6 +1006,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (mm_find_pmd(mm, address) != pmd)
goto out;
 
+   vm_write_begin(vma);
anon_vma_lock_write(vma->anon_vma);
 
pte = pte_offset_map(pmd, address);
@@ -1041,6 +1042,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
+   vm_write_end(vma);
result = SCAN_FAIL;
goto out;
   

[PATCH v6 06/24] mm: VMA sequence count

2018-01-12 Thread Laurent Dufour
From: Peter Zijlstra 

Wrap the VMA modifications (vma_adjust/unmap_page_range) with sequence
counts such that we can easily test if a VMA is changed.

The unmap_page_range() one allows us to make assumptions about
page-tables; when we find the seqcount hasn't changed we can assume
page-tables are still valid.

The flip side is that we cannot distinguish between a vma_adjust() and
the unmap_page_range() -- where with the former we could have
re-checked the vma bounds against the address.
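
For illustration, the reader side of this scheme can be sketched as
follows (vma_has_changed() is the check referred to elsewhere in the
series; this body is an assumption, and the rb-tree unlink test belongs
to a later patch):

static inline bool vma_has_changed(struct vm_fault *vmf)
{
	int unlinked = RB_EMPTY_NODE(&vmf->vma->vm_rb);
	unsigned int seq = READ_ONCE(vmf->vma->vm_sequence.sequence);

	/* pairs with the write barriers in write_seqcount_begin/end()
	 * and in the rb-tree erase path */
	smp_rmb();

	/* the fault sampled an even vmf->sequence, so a mismatch also
	 * catches an odd (write in progress) value */
	return unlinked || seq != vmf->sequence;
}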

Signed-off-by: Peter Zijlstra (Intel) 

[Port to 4.12 kernel]
[Build depends on CONFIG_SPF]
[Introduce vm_write_* inline function depending on CONFIG_SPF]
[Fix lock dependency between mapping->i_mmap_rwsem and vma->vm_sequence by
 using vm_raw_write* functions]
Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h   | 41 +
 include/linux/mm_types.h |  3 +++
 mm/memory.c  |  2 ++
 mm/mmap.c| 35 +++
 4 files changed, 81 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ad299ed7b85c..ca7ceba84292 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1369,6 +1369,47 @@ static inline void unmap_shared_mapping_range(struct 
address_space *mapping,
unmap_mapping_range(mapping, holebegin, holelen, 0);
 }
 
+#ifdef CONFIG_SPF
+static inline void vm_write_begin(struct vm_area_struct *vma)
+{
+   write_seqcount_begin(>vm_sequence);
+}
+static inline void vm_write_begin_nested(struct vm_area_struct *vma,
+int subclass)
+{
+   write_seqcount_begin_nested(>vm_sequence, subclass);
+}
+static inline void vm_write_end(struct vm_area_struct *vma)
+{
+   write_seqcount_end(>vm_sequence);
+}
+static inline void vm_raw_write_begin(struct vm_area_struct *vma)
+{
+   raw_write_seqcount_begin(>vm_sequence);
+}
+static inline void vm_raw_write_end(struct vm_area_struct *vma)
+{
+   raw_write_seqcount_end(>vm_sequence);
+}
+#else
+static inline void vm_write_begin(struct vm_area_struct *vma)
+{
+}
+static inline void vm_write_begin_nested(struct vm_area_struct *vma,
+int subclass)
+{
+}
+static inline void vm_write_end(struct vm_area_struct *vma)
+{
+}
+static inline void vm_raw_write_begin(struct vm_area_struct *vma)
+{
+}
+static inline void vm_raw_write_end(struct vm_area_struct *vma)
+{
+}
+#endif /* CONFIG_SPF */
+
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fd1af6b9591d..e0e3df3b9641 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -333,6 +333,9 @@ struct vm_area_struct {
struct mempolicy *vm_policy;/* NUMA policy for the VMA */
 #endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#ifdef CONFIG_SPF
+   seqcount_t vm_sequence;
+#endif
 } __randomize_layout;
 
 struct core_thread {
diff --git a/mm/memory.c b/mm/memory.c
index 6b2c4732e49d..22bdc5c6c5ee 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1503,6 +1503,7 @@ void unmap_page_range(struct mmu_gather *tlb,
unsigned long next;
 
BUG_ON(addr >= end);
+   vm_write_begin(vma);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
do {
@@ -1512,6 +1513,7 @@ void unmap_page_range(struct mmu_gather *tlb,
next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
+   vm_write_end(vma);
 }
 
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 4bb038e7984b..1dc5397fbd59 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -558,6 +558,10 @@ void __vma_link_rb(struct mm_struct *mm, struct 
vm_area_struct *vma,
else
mm->highest_vm_end = vm_end_gap(vma);
 
+#ifdef CONFIG_SPF
+   seqcount_init(>vm_sequence);
+#endif
+
/*
 * vma->vm_prev wasn't known when we followed the rbtree to find the
 * correct insertion point for that vma. As a result, we could not
@@ -692,6 +696,30 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
long adjust_next = 0;
int remove_next = 0;
 
+   /*
+* Why use the vm_raw_write*() functions here? To avoid lockdep's warning.
+*
+* Lockdep is complaining about a theoretical lock dependency, involving
+* 3 locks:
+*   mapping->i_mmap_rwsem --> vma->vm_sequence --> fs_reclaim
+*
+* Here are the major paths leading to this dependency :
+*  1. __vma_adjust() mmap_sem  -> vm_sequence -> i_mmap_rwsem
+*  2. move_vma() mmap_sem -> vm_sequence -> fs_reclaim
+*  3. __alloc_pages_nodemask() fs_reclaim -> 

[PATCH v6 04/24] mm: Prepare for FAULT_FLAG_SPECULATIVE

2018-01-12 Thread Laurent Dufour
From: Peter Zijlstra 

When speculating faults (without holding mmap_sem) we need to validate
that the vma against which we loaded pages is still valid when we're
ready to install the new PTE.

Therefore, replace the pte_offset_map_lock() calls that (re)take the
PTL with pte_map_lock() which can fail in case we find the VMA changed
since we started the fault.

Signed-off-by: Peter Zijlstra (Intel) 

[Port to 4.12 kernel]
[Remove the comment about the fault_env structure which has been
 implemented as the vm_fault structure in the kernel]
Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h |  1 +
 mm/memory.c| 56 ++
 2 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 63f7ba111f64..ad299ed7b85c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -302,6 +302,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_USER        0x40    /* The fault originated in userspace */
 #define FAULT_FLAG_REMOTE      0x80    /* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION 0x100   /* The fault was during an instruction fetch */
+#define FAULT_FLAG_SPECULATIVE 0x200   /* Speculative fault, not holding mmap_sem */
 
 #define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
diff --git a/mm/memory.c b/mm/memory.c
index 259f621345b2..868424ab850c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2438,6 +2438,13 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
 }
 
+static bool pte_map_lock(struct vm_fault *vmf)
+{
+   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+  vmf->address, >ptl);
+   return true;
+}
+
 /*
  * Handle the case of a page which we actually need to copy to a new page.
  *
@@ -2465,6 +2472,7 @@ static int wp_page_copy(struct vm_fault *vmf)
const unsigned long mmun_start = vmf->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
+   int ret = VM_FAULT_OOM;
 
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2492,7 +2500,11 @@ static int wp_page_copy(struct vm_fault *vmf)
/*
 * Re-check the pte - we dropped the lock
 */
-   vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, >ptl);
+   if (!pte_map_lock(vmf)) {
+   mem_cgroup_cancel_charge(new_page, memcg, false);
+   ret = VM_FAULT_RETRY;
+   goto oom_free_new;
+   }
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
@@ -2584,7 +2596,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 oom:
if (old_page)
put_page(old_page);
-   return VM_FAULT_OOM;
+   return ret;
 }
 
 /**
@@ -2605,8 +2617,8 @@ static int wp_page_copy(struct vm_fault *vmf)
 int finish_mkwrite_fault(struct vm_fault *vmf)
 {
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
-   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
-  >ptl);
+   if (!pte_map_lock(vmf))
+   return VM_FAULT_RETRY;
/*
 * We might have raced with another page fault while we released the
 * pte_offset_map_lock.
@@ -2724,8 +2736,11 @@ static int do_wp_page(struct vm_fault *vmf)
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
lock_page(vmf->page);
-   vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-   vmf->address, >ptl);
+   if (!pte_map_lock(vmf)) {
+   unlock_page(vmf->page);
+   put_page(vmf->page);
+   return VM_FAULT_RETRY;
+   }
if (!pte_same(*vmf->pte, vmf->orig_pte)) {
unlock_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2953,8 +2968,10 @@ int do_swap_page(struct vm_fault *vmf)
 * Back out if somebody else faulted in this pte
 * while we released the pte lock.
 */
-   vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-   vmf->address, >ptl);
+   if (!pte_map_lock(vmf)) {
+   delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+   return VM_FAULT_RETRY;
+   }
if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
ret = 

[PATCH v6 03/24] mm: Don't assume page-table invariance during faults

2018-01-12 Thread Laurent Dufour
From: Peter Zijlstra 

One of the side effects of speculating on faults (without holding
mmap_sem) is that we can race with free_pgtables() and therefore we
cannot assume the page-tables will stick around.

Remove the reliance on the pte pointer.

Signed-off-by: Peter Zijlstra (Intel) 

[Remove only if !CONFIG_SPF]
Signed-off-by: Laurent Dufour 
---
 mm/memory.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index 8a80986fff48..259f621345b2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2274,6 +2274,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned 
long addr,
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
+#ifndef CONFIG_SPF
 /*
  * handle_pte_fault chooses page fault handler according to an entry which was
  * read non-atomically.  Before making any commitment, on those architectures
@@ -2297,6 +2298,7 @@ static inline int pte_unmap_same(struct mm_struct *mm, 
pmd_t *pmd,
pte_unmap(page_table);
return same;
 }
+#endif /* CONFIG_SPF */
 
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned 
long va, struct vm_area_struct *vma)
 {
@@ -2884,11 +2886,13 @@ int do_swap_page(struct vm_fault *vmf)
swapcache = page;
}
 
+#ifndef CONFIG_SPF
if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
if (page)
put_page(page);
goto out;
}
+#endif
 
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
-- 
2.7.4



[PATCH v6 01/24] x86/mm: Define CONFIG_SPF

2018-01-12 Thread Laurent Dufour
Introduce CONFIG_SPF which turns on the Speculative Page Fault handler when
building for 64-bit with SMP.

Signed-off-by: Laurent Dufour 
---
 arch/x86/Kconfig | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a317d5594b6a..d74353b85aaf 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2882,6 +2882,10 @@ config X86_DMA_REMAP
 config HAVE_GENERIC_GUP
def_bool y
 
+config SPF
+   def_bool y
+   depends on X86_64 && SMP
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
-- 
2.7.4



[PATCH v6 02/24] powerpc/mm: Define CONFIG_SPF

2018-01-12 Thread Laurent Dufour
Define CONFIG_SPF for BOOK3S_64 and SMP. This enables the Speculative Page
Fault handler.

Support is only provided for BOOK3S_64 currently because:
- it requires CONFIG_PPC_STD_MMU because of checks done in
  set_access_flags_filter()
- it requires BOOK3S because we can't support book3e_hugetlb_preload()
  called by update_mmu_cache()

Signed-off-by: Laurent Dufour 
---
 arch/powerpc/Kconfig | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d99250d9185d..31be1d69b350 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -1209,6 +1209,10 @@ endif
 config ARCH_RANDOM
def_bool n
 
+config SPF
+   def_bool y
+   depends on PPC_BOOK3S_64 && SMP
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
-- 
2.7.4



[PATCH v6 00/24] Speculative page faults

2018-01-12 Thread Laurent Dufour
This is a port to kernel 4.15 of the work done by Peter Zijlstra to handle
page faults without holding the mm semaphore [1].

The idea is to try to handle user space page faults without holding the
mmap_sem. This should allow better concurrency for massively threaded
processes since the page fault handler will not wait for other threads'
memory layout changes to be done, assuming that the change is done in
another part of the process's memory space. This type of page fault is
named a speculative page fault. If the speculative page fault fails because
concurrency is detected or because the underlying PMD or PTE tables are not
yet allocated, its processing fails and a classic page fault is then tried.

The speculative page fault (SPF) handler has to look for the VMA matching
the fault address without holding the mmap_sem; this is done by introducing
a rwlock which protects access to the mm_rb tree. Previously this was done
using SRCU, but it introduced a lot of scheduling to process the VMA
freeing operations, which hit performance by 20% as reported by Kemi Wang
[2]. Using a rwlock to protect access to the mm_rb tree limits the locking
contention to these operations, which are expected to be O(log n). In
addition, to ensure that the VMA is not freed behind our back, a reference
count is added and 2 services (get_vma() and put_vma()) are introduced to
handle the reference count. When a VMA is fetched from the RB tree using
get_vma(), it must later be released using put_vma().

Furthermore, to allow the VMA to be used again by the classic page fault
handler, a service named can_reuse_spf_vma() is introduced. This service is
expected to be called with the mmap_sem held. It checks that the VMA still
matches the specified address and releases its reference count; as the
mmap_sem is held, it is ensured that the VMA will not be freed behind our
back. In general, the VMA's reference count can be decremented while
holding the mmap_sem, but it should not be increased, as holding the
mmap_sem already ensures that the VMA is stable. With this scheme I can no
longer see the overhead I previously got with the will-it-scale benchmark.
A sketch of can_reuse_spf_vma() follows.
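
For illustration only, can_reuse_spf_vma() can be pictured like this (an
assumed shape matching the description above, not the exact code):

bool can_reuse_spf_vma(struct vm_area_struct *vma, unsigned long address)
{
	bool ret;

	if (!vma)
		return false;

	/* called with the mmap_sem held, so the VMA cannot be freed
	 * behind our back while we look at it */
	ret = vma->vm_start <= address && address < vma->vm_end;
	put_vma(vma);	/* drop the reference taken by get_vma() */
	return ret;
}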

The VMA's attributes checked during the speculative page fault processing
have to be protected against parallel changes. This is done by using a per
VMA sequence lock. This sequence lock allows the speculative page fault
handler to quickly check for parallel changes in progress and to abort the
speculative page fault in that case.

Once the VMA is found, the speculative page fault handler checks the VMA's
attributes to verify whether the page fault can be handled this way. The
VMA is protected through a sequence lock which allows fast detection of
concurrent VMA changes. If such a change is detected, the speculative page
fault is aborted and a *classic* page fault is tried instead. VMA sequence
locking is added around the code paths which modify the VMA attributes that
are checked during the page fault.
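
In pseudo-code, the reader side of that sequence lock looks roughly like
this (vm_sequence is an illustrative field name; the write side, in
mprotect/mremap/etc., wraps every attribute update in the corresponding
begin/end pair):

	unsigned int seq;

	seq = raw_read_seqcount(&vma->vm_sequence);
	if (seq & 1)			/* a writer is in progress */
		goto fallback;

	/* ... read vma->vm_flags, vma->vm_page_prot, ... */

	if (read_seqcount_retry(&vma->vm_sequence, seq))
		goto fallback;	/* attributes changed: use the classic fault */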

When the PTE is fetched, the VMA is checked again for changes, so once the
page table is locked the VMA is known to be valid. Any other change that
would touch this PTE must take the page table lock first, so no parallel
change is possible at this point.

The PTE is locked with interrupts disabled; this allows checking the PMD
to ensure that there is no ongoing collapsing operation. Since khugepaged
first sets the PMD to pmd_none and then waits for the other CPUs to
acknowledge the IPI, if the PMD is valid at the time the PTE is locked, we
have the guarantee that the collapsing operation will have to wait on the
PTE lock to move forward. This allows the SPF handler to map the PTE
safely. If the PMD value differs from the one recorded at the beginning of
the SPF operation, the classic page fault handler is called to handle the
fault while holding the mmap_sem. As the PTE is locked with interrupts
disabled, the lock is taken with spin_trylock() to avoid a deadlock when
handling a page fault while a TLB invalidation is requested by another CPU
holding the PTE lock.
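
A hedged sketch of that locking step (pte_map_lock_sketch() is an
illustrative name, not the series' function; orig_pmd is assumed to be the
PMD value sampled when the speculative fault started):

static bool pte_map_lock_sketch(struct vm_fault *vmf, pmd_t orig_pmd)
{
	bool ok = false;

	local_irq_disable();		/* hold off khugepaged's IPI */
	if (!pmd_same(*vmf->pmd, orig_pmd))
		goto out;		/* collapse in progress: fall back */

	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	if (!spin_trylock(vmf->ptl))	/* never spin with IRQs off */
		goto out;

	vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
	ok = true;
out:
	local_irq_enable();	/* on success the PTE lock stays held */
	return ok;
}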

Support for THP is not done because when checking for the PMD, we can be
confused by an in-progress collapsing operation done by khugepaged. The
issue is that pmd_none() could be true either if the PMD is not yet
populated or if the underlying PTEs are in the process of being collapsed.
So we cannot safely allocate a PMD if pmd_none() is true.

This series builds on top of v4.14-rc5 and is functional on x86 and
PowerPC.

--
Benchmarks results

Base kernel is 4.15-rc6-mmotm-2018-01-04-16-19
SPF is BASE + this series

Kernbench:
--
Here are the results on a 16-CPU x86 guest using kernbench on a 4.13-rc4
kernel (the kernel is built 5 times):

Average Optimal load -j 8
 Base   SPF
 Run (std deviation)
Elapsed Time 148.04 (0.62446)   150.31 (0.940585)   1.53%
UserTime 1017.27 (1.23567)  1029.14 (4.43995)   1.17%
System  

Re: [PATCH V2] powerpc/kernel: Add 'ibm,thread-groups' property for CPU allocation

2018-01-12 Thread Nathan Fontenot
On 01/08/2018 11:19 AM, Michael Bringmann wrote:
> Add code to parse the new property 'ibm,thread-groups' when it is
> present.  The content of this property explicitly defines the number
> of threads per core as well as the PowerPC 'threads_core_mask'.
> The design provides a common device-tree for both P9 normal core and
> P9 fused core systems.  The new property has been observed to be
> available on P9 pHyp systems, but it is not always present on
> OpenPower BMC systems.
> 
> The property updates the kernel to know which CPUs/threads of each
> core are actually present, and then use the map when adding cores
> to the system at boot, or during hotplug operations.
> 
> * Previously, the information about the number of threads per core
>   was inferred solely from the "ibm,ppc-interrupt-server#s" property
>   in the system device tree.
> * Also previous to this property, the mask of threads per CPU was
>   inferred to be a strict linear series from 0..(nthreads-1).
> * After reading the "ibm,thread-groups" property, we can determine
>   the number of threads per core to be the 'bitmask weight' of the
>   CPU thread mask.
> * Also after reading the property, we can determine which of the
>   possible threads we are allowed to online for each CPU.  It is no
>   longer a simple linear sequence, but may be discontinuous e.g.
>   activate threads 1,2,3,5,6,7 on a core instead of 0-5 sequentially.
> 
> Implementation of the "ibm,thread-groups" property is spread across
> a few files in the powerpc specific code:
> 
> * prom.c: Parse the property and create 'ppc_thread_group_mask'.
>   Use the mask in operation of early_init_dt_scan_cpus().
> * setup-common.c: Import 'ppc_thread_group_mask' and use the value
>   in the operation of cpu_init_thread_core_maps(), and
>   smp_setup_cpu_maps.
> * hotplug-cpu.c: Use 'ppc_thread_group_mask' in several locations
>   where the code previously expected to iterate over a
>   linear series of active threads (0..nthreads-1).
> 
> Note that the "ibm,thread-groups" property also includes semantics
> of 'thread-group' i.e. define one or more subgroups of the available
> threads, each group of threads to be used for a specific class of
> task.  Translating thread group semantics into Linux kernel features
> is TBD.

One thing I don't see addressed in the comments or in the code is
migration support. I think we need to update the thread group mask
post-migration to reflect the threads per core on the new system.

-Nathan

> 
> Signed-off-by: Michael Bringmann 
> ---
> Changes in V2:
>   -- Add more information and examples to the patch description.
>   -- Rename 'pseries_thread_group_mask' to 'ppc_thread_group_mask'
>   -- Remove unnecessary debug message complaining about absence of
>  property.
>   -- Reduce indent complexity of early_init_dt_scan_cpus().
> ---
>  arch/powerpc/include/asm/cputhreads.h|2 +
>  arch/powerpc/kernel/prom.c   |   74 
> ++
>  arch/powerpc/kernel/setup-common.c   |   30 +++
>  arch/powerpc/platforms/pseries/hotplug-cpu.c |   13 -
>  4 files changed, 107 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/cputhreads.h 
> b/arch/powerpc/include/asm/cputhreads.h
> index d71a909..8e444d4 100644
> --- a/arch/powerpc/include/asm/cputhreads.h
> +++ b/arch/powerpc/include/asm/cputhreads.h
> @@ -31,6 +31,8 @@
>  #define threads_core_mask(*get_cpu_mask(0))
>  #endif
> 
> +extern cpumask_t ppc_thread_group_mask;
> +
>  /* cpu_thread_mask_to_cores - Return a cpumask of one per cores
>   *hit by the argument
>   *
> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
> index b15bae2..0a49231 100644
> --- a/arch/powerpc/kernel/prom.c
> +++ b/arch/powerpc/kernel/prom.c
> @@ -68,6 +68,9 @@
>  #define DBG(fmt...)
>  #endif
> 
> +cpumask_t ppc_thread_group_mask;
> +EXPORT_SYMBOL(ppc_thread_group_mask);
> +
>  #ifdef CONFIG_PPC64
>  int __initdata iommu_is_off;
>  int __initdata iommu_force_on;
> @@ -303,6 +306,71 @@ static void __init check_cpu_feature_properties(unsigned 
> long node)
>   }
>  }
> 
> +static void __init early_init_setup_thread_group_mask(unsigned long node,
> + cpumask_t *thread_group_mask)
> +{
> + const __be32 *thrgrp;
> + int len, rc = 0;
> + u32 cc_type = 0, no_split = 0, thr_per_split = 0;
> + int j, k;
> +
> + cpumask_clear(thread_group_mask);
> +
> + thrgrp = of_get_flat_dt_prop(node, "ibm,thread-groups", &len);
> + if (!thrgrp)
> + return;
> +
> + /* Process the thread groups for the Core thread mask */
> + /* Characteristic type per table */
> + cc_type = of_read_number(thrgrp++, 1);
> +
> + /*
> +  * 1 : Group shares common L1, translation cache, and
> +  * instruction data flow
> +  * >1 : Reserved
> +  */
> + 

Re: [cryptodev:master 130/134] aes_generic.c:undefined reference to `_restgpr_31_x'

2018-01-12 Thread Segher Boessenkool
Hi!

On Fri, Jan 12, 2018 at 03:55:47PM +0100, Arnd Bergmann wrote:
> >crypto/aes_generic.o: In function `crypto_aes_set_key':
> >>> aes_generic.c:(.text+0x4e0): undefined reference to `_restgpr_31_x'
> 
> adding linuxppc-dev to Cc, maybe someone knows a way out of this.
> It appears related to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43810

It is not.

> but I don't know what _restgpr_31_x actually does,

It restores GPR31 from stack, restores LR from stack, and returns
(_x is "exit").

> why it's not provided by the kernel

Because the kernel refuses to use libgcc.  Let's, uh, not start that
again?  :-)

> or why the aes_generic implementation needs this on
> powerpc when built with 'gcc -Os'. FWIW, the -Os change was needed
> to work around a possible kernel stack overflow that can happen with
> gcc-7.2, see https://patchwork.kernel.org/patch/10143607/ and
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83356

The _x versions are smaller but slower; that's why they are used with -Os.
Apparently nothing else was built with -Os (and the other needed flags)
before.


Segher


Re: DPAA Ethernet problems with mainstream Linux kernels

2018-01-12 Thread Jamie Krueger

On 01/12/2018 08:22 AM, Madalin-cristian Bucur wrote:

-Original Message-
From: Linuxppc-dev [mailto:linuxppc-dev-
bounces+madalin.bucur=nxp@lists.ozlabs.org] On Behalf Of Jamie Krueger
Sent: Wednesday, January 10, 2018 5:57 PM
To: linuxppc-dev@lists.ozlabs.org
Subject: DPAA Ethernet problems with mainstream Linux kernels

Hello all @ linuxppc-dev,

I have been working with a team of people maintaining PowerPC
Linux for the new AmigaONE X5000/20 (a Freescale p5020 SoC based
machine).

We are trying to determine why the submitted Data Path Acceleration
Architecture (DPAA) Ethernet Driver is not fully functional with
the mainstream Linux kernels.

Hi Jamie,

Hi Madalin,

We are testing the DPAA driver on several DS and RDB platforms and it
is working properly. The issues you encounter with it on the X5000/20
are likely caused by some issues specific to that particular platform.

It is good to hear that the DPAA driver is functioning correctly
on the reference platforms. I am positive you are correct that
the issue is the difference in implementation on the X5000/20
(Cyrus) motherboard, as compared to the reference boards.

Can you verify which Linux Kernel sources your tests are being
performed on? We have been testing using the mainstream
Linux sources up to linux-4.15-rc6 thus far.


The device tree that you mention, cyrus_p5020.eth.dts is not found in
the Linux kernel sources. The cyrus_p5020.dts file from the fsl ppc
device tree folder does not include the PHY information for the DPAA
interfaces. The problems that you experience may be caused by some
issues with the PHY configuration (i.e. internal delay).

The cyrus_p5020.eth.dts is a modified version of the cyrus_p5020.dts,
which of course was based off the original p5020ds.dts file. As you
noted, the current cyrus_p5020.dts file is incomplete, and does not
map the Ethernet connections properly.

The cyrus_p5020.eth.dts file, along with its cyrus-pre.dtsi dependent
file, is an attempt to correctly define the Ethernet hardware as it is
implemented on the X5000/20.

** I have attached both the cyrus_p5020.eth.dts and cyrus-pre.dtsi
 files with this email for comparison. Please let me know if you see
 any corrections that should be made to either file.

I am not sure what PHY hardware/configuration you are using on the
DS and RDB platforms, but I can confirm that AmigaONE X5000/20
(Cyrus Motherboard with p5020 SoC), has dTSEC 4 and dTSEC 5
wired to two Micrel KSZ9021RN Gigabit Ethernet PHYs, using the
RGMII protocol.


  I suggest
that you connect the DPAA interface to a traffic analyzer or directly
to another device on which you can capture the incoming traffic and
check that the received frames are correct.

I have started testing along that line, using Wireshark to view the
traffic on the X5000/20 itself, and from another machine connected
on the same subnet. So far (as indicated by some details in my
initial email), I can see outgoing broadcast requests (for DHCP)
being sent out from the X5000/20, and these requests are correctly
constructed and visible outside the X5000/20.

However, no responses to the DHCP broadcasts appear to reach
the X5000/20's DPAA Ethernet. I will need to set up some further
tests to determine if the DHCP server saw the requests and responded
to them. (I assume the DHCP server is getting them, and responding,
as I can always get a successful DHCP response on the X5000/20
when using an add-on PCIe Ethernet card on the same subnet).

I will setup some more direct machine-to-machine testing to
see what else I can glean from the network traffic.

Please have a look at the attached dts files, maybe there is something
obvious there we are not seeing.

Also, given that the X5000/20 uses Micrel KSZ9021RN PHYs in RGMII
mode, what changes to the DPAA hardware configuration should we
expect to see so that the DPAA is configured to talk to them?
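
For illustration, here is the kind of change I would expect if the missing
piece turns out to be the RGMII clock skew — the node labels below (mdio0,
enet4, phy4) are placeholders, not the real names from our dts:

&mdio0 {
	phy4: ethernet-phy@4 {
		reg = <4>;
	};
};

&enet4 {	/* dTSEC 4 */
	phy-handle = <&phy4>;
	/* "-id" asks the KSZ9021 to insert both TX and RX clock delays */
	phy-connection-type = "rgmii-id";
};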


Madalin


--

Best Regards,

Jamie Krueger
BITbyBIT Software Group LLC


Here are the results from my latest tests. They were performed using
the linux-4.10.17 ppc64 kernel, since that represents when the DPAA
Ethernet code was introduced.

Similar tests, with similar results, were also performed
using the latest Linux kernels:

linux-4.15-rc5
linux-4.15-rc6
linux-4.15-rc7

(Hence the reason for falling back to test the kernel right
   after the introduction of the DPAA Ethernet driver sources)

---

All Kernel builds had the DPAA Ethernet enabled in the kernel,
and are using the correct cyrus_p5020.eth.dtb device tree file
(for use on the X5000/20).

The results are quite similar for all kernels in regards to the DPAA
Ethernet.

All tested kernels set up the two Ethernet interfaces correctly
as eth0 and eth1, and pull the correct MAC addresses from the U-Boot
environment variables ethaddr and eth1addr respectively.

So at this point Linux has what it believes is fully configured
hardware, waiting to have an IP Address/Netmask/Gateway
to be set and to bring the interface online.


Re: [cryptodev:master 130/134] aes_generic.c:undefined reference to `_restgpr_31_x'

2018-01-12 Thread Arnd Bergmann
On Fri, Jan 12, 2018 at 3:11 PM, kbuild test robot
 wrote:
> tree:   
> https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git 
> master
> head:   b40fa82cd6138350f723aa47b37e3e3e80906b40
> commit: 148b974deea927f5dbb6c468af2707b488bfa2de [130/134] crypto: 
> aes-generic - build with -Os on gcc-7+
> config: powerpc-linkstation_defconfig (attached as .config)
> compiler: powerpc-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
> reproduce:
> wget 
> https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
> ~/bin/make.cross
> chmod +x ~/bin/make.cross
> git checkout 148b974deea927f5dbb6c468af2707b488bfa2de
> # save the attached .config to linux build tree
> make.cross ARCH=powerpc
>
> All errors (new ones prefixed by >>):
>
>crypto/aes_generic.o: In function `crypto_aes_set_key':
>>> aes_generic.c:(.text+0x4e0): undefined reference to `_restgpr_31_x'

adding linuxppc-dev to Cc, maybe someone knows a way out of this.
It appears related to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43810
but I don't know what _restgpr_31_x actually does, why it's not provided
by the kernel or why the aes_generic implementation needs this on
powerpc when built with 'gcc -Os'. FWIW, the -Os change was needed
to work around a possible kernel stack overflow that can happen with
gcc-7.2, see https://patchwork.kernel.org/patch/10143607/ and
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83356

 Arnd


RE: DPAA Ethernet problems with mainstream Linux kernels

2018-01-12 Thread Madalin-cristian Bucur
> -Original Message-
> From: Linuxppc-dev [mailto:linuxppc-dev-
> bounces+madalin.bucur=nxp@lists.ozlabs.org] On Behalf Of Jamie Krueger
> Sent: Wednesday, January 10, 2018 5:57 PM
> To: linuxppc-dev@lists.ozlabs.org
> Subject: DPAA Ethernet problems with mainstream Linux kernels
> 
> Hello all @ linuxppc-dev,
> 
> I have been working with a team of people maintaining PowerPC
> Linux for the new AmigaONE X5000/20 (a Freescale p5020 SoC based
> machine).
> 
> We are trying to determine why the submitted Data Path Acceleration
> Architecture (DPAA) Ethernet Driver is not fully functional with
> the mainstream Linux kernels.

Hi Jamie,

We are testing the DPAA driver on several DS and RDB platforms and it
is working properly. The issues you encounter with it on the X5000/20
are likely caused by some issues specific to that particular platform.
The device tree that you mention, cyrus_p5020.eth.dts is not found in
the Linux kernel sources. The cyrus_p5020.dts file from the fsl ppc
device tree folder does not include the PHY information for the DPAA
interfaces. The problems that you experience may be caused by some
issues with the PHY configuration (i.e. internal delay). I suggest
that you connect the DPAA interface to a traffic analyzer or directly
to another device on which you can capture the incoming traffic and
check that the received frames are correct.

Madalin

> Here are the results from my latest tests. They were performed using
> the linux-4.10.17 ppc64 kernel, since that represents when the DPAA
> Ethernet code was introduced.
> 
> Similar tests, with similar results, were also performed
> using the latest Linux kernels:
> 
> linux-4.15-rc5
> linux-4.15-rc6
> linux-4.15-rc7
> 
> (Hence the reason for falling back to test the kernel right
>   after the introduction of the DPAA Ethernet driver sources)
> 
> ---
> 
> All Kernel builds had the DPAA Ethernet enabled in the kernel,
> and are using the correct cyrus_p5020.eth.dtb device tree file
> (for use on the X5000/20).
> 
> The results are quite similar for all kernels in regards to the DPAA
> Ethernet.
> 
> All tested kernels set up the two Ethernet interfaces correctly
> as eth0 and eth1, and pull the correct MAC addresses from the U-Boot
> environment variables ethaddr and eth1addr respectively.
> 
> So at this point Linux has what it believes is fully configured
> hardware, waiting to have an IP Address/Netmask/Gateway
> to be set and to bring the interface online.
> 
> However, all attempts to communicate with the outside world
> do not make it out the physical (PHY) hardware - or do they?
> 
> ** The following results were captured under linux-4.10.17 **
> 
> When I bring the interface up using a static address, in this case
> 192.168.1.21, I see the following (NOTE TX bytes says 154.0 KB,
> while RX bytes says 0.0 B):
> 
> jamie@X5000-Linux:$ ifconfig
> eth0  Link encap:Ethernet  HWaddr 00:80:10:11:11:11
>    inet addr:192.168.1.21  Bcast:192.168.1.255 Mask:255.255.255.0
>    inet6 addr: fe80::280:10ff:fe11:1111/64 Scope:Link
>    UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
>    RX packets:0 errors:0 dropped:0 overruns:0 frame:0
>    TX packets:1428 errors:0 dropped:0 overruns:0 carrier:0
>    collisions:0 txqueuelen:1000
>    RX bytes:0 (0.0 B)  TX bytes:154066 (154.0 KB)
>    Memory:fe4e6000-fe4e6fff
> 
> eth1  Link encap:Ethernet  HWaddr 00:80:10:22:22:22
>    UP BROADCAST MULTICAST  MTU:1500  Metric:1
>    RX packets:0 errors:0 dropped:0 overruns:0 frame:0
>    TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
>    collisions:0 txqueuelen:1000
>    RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
>    Memory:fe4e8000-fe4e8fff
> 
> lo    Link encap:Local Loopback
>    inet addr:127.0.0.1  Mask:255.0.0.0
>    inet6 addr: ::1/128 Scope:Host
>    UP LOOPBACK RUNNING  MTU:65536  Metric:1
>    RX packets:1869 errors:0 dropped:0 overruns:0 frame:0
>    TX packets:1869 errors:0 dropped:0 overruns:0 carrier:0
>    collisions:0 txqueuelen:1000
>    RX bytes:156932 (156.9 KB)  TX bytes:156932 (156.9 KB)
> 
> Checking the routing table, everything looks fine there:
> 
> jamie@X5000-Linux:$ netstat -r
> Kernel IP routing table
> Destination Gateway Genmask Flags   MSS Window  irtt
> Iface
> default 192.168.1.1 0.0.0.0 UG    0 0  0
> eth0
> link-local  *   255.255.0.0 U 0 0  0
> eth0
> 192.168.1.0 *   255.255.255.0   U 0 0  0
> eth0
> 
> Attempting to PING the interface itself works:
> 
> jamie@X5000-Linux:$ ping 192.168.1.21
> PING 192.168.1.21 (192.168.1.21) 56(84) bytes of data.
> 64 bytes from 192.168.1.21: icmp_seq=1 ttl=64 time=0.037 ms
> 64 bytes from 192.168.1.21: icmp_seq=2 ttl=64 time=0.045 ms
> 64 bytes from 192.168.1.21: icmp_seq=3 ttl=64 

[PATCH 8/8] powerpc/8xx: Use L1 entry APG to handle _PAGE_ACCESSED for CONFIG_SWAP

2018-01-12 Thread Christophe Leroy
When CONFIG_SWAP is set, the TLB miss handlers also have to take the
_PAGE_ACCESSED flag into account. At the moment this is done by anding
_PAGE_ACCESSED into _PAGE_PRESENT using 3 instructions.

This patch uses the APG for handling _PAGE_ACCESSED, allowing us to
just copy the _PAGE_ACCESSED bit into the APG field, hence reducing the
action to a single instruction.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/mmu-8xx.h | 34 +++-
 arch/powerpc/kernel/head_8xx.S | 45 +++---
 arch/powerpc/mm/8xx_mmu.c  |  2 +-
 3 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu-8xx.h 
b/arch/powerpc/include/asm/mmu-8xx.h
index ae68f6c848d3..ee5591fe6efc 100644
--- a/arch/powerpc/include/asm/mmu-8xx.h
+++ b/arch/powerpc/include/asm/mmu-8xx.h
@@ -34,12 +34,20 @@
  * respectively NA for All or X for Supervisor and no access for User.
  * Then we use the APG to say whether accesses are according to Page rules or
  * "all Supervisor" rules (Access to all)
- * Therefore, we define 2 APG groups. lsb is _PMD_USER
- * 0 => No user => 01 (all accesses performed according to page definition)
- * 1 => User => 00 (all accesses performed as supervisor iaw page definition)
+ * We also use the 2nd APG bit for _PAGE_ACCESSED when having SWAP:
+ * When that bit is not set access is done iaw "all user"
+ * which means no access iaw page rules.
+ * Therefore, we define 4 APG groups. lsb is _PMD_USER, 2nd is _PAGE_ACCESSED
+ * 0x => No access => 11 (all accesses performed as user iaw page definition)
+ * 10 => No user => 01 (all accesses performed according to page definition)
+ * 11 => User => 00 (all accesses performed as supervisor iaw page definition)
  * We define all 16 groups so that all other bits of APG can take any value
  */
+#ifdef CONFIG_SWAP
+#define MI_APG_INIT	0xf4f4f4f4
+#else
 #define MI_APG_INIT	0x44444444
+#endif
 
 /* The effective page number register.  When read, contains the information
  * about the last instruction TLB miss.  When MI_RPN is written, bits in
@@ -107,12 +115,20 @@
  * Supervisor and no access for user and NA for ALL.
  * Then we use the APG to say whether accesses are according to Page rules or
  * "all Supervisor" rules (Access to all)
- * Therefore, we define 2 APG groups. lsb is _PMD_USER
- * 0 => No user => 01 (all accesses performed according to page definition)
- * 1 => User => 00 (all accesses performed as supervisor iaw page definition)
+ * We also use the 2nd APG bit for _PAGE_ACCESSED when having SWAP:
+ * When that bit is not set access is done iaw "all user"
+ * which means no access iaw page rules.
+ * Therefore, we define 4 APG groups. lsb is _PMD_USER, 2nd is _PAGE_ACCESSED
+ * 0x => No access => 11 (all accesses performed as user iaw page definition)
+ * 10 => No user => 01 (all accesses performed according to page definition)
+ * 11 => User => 00 (all accesses performed as supervisor iaw page definition)
  * We define all 16 groups so that all other bits of APG can take any value
  */
+#ifdef CONFIG_SWAP
+#define MD_APG_INIT	0xf4f4f4f4
+#else
 #define MD_APG_INIT	0x44444444
+#endif
 
 /* The effective page number register.  When read, contains the information
  * about the last instruction TLB miss.  When MD_RPN is written, bits in
@@ -164,6 +180,12 @@
  */
 #define SPRN_M_TW  799
 
+/* APGs */
+#define M_APG0		0x00000000
+#define M_APG1		0x00000020
+#define M_APG2		0x00000040
+#define M_APG3		0x00000060
+
 #ifndef __ASSEMBLY__
 typedef struct {
unsigned int id;
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index c3b831bb8bad..d8670a37d70c 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -354,14 +354,13 @@ _ENTRY(ITLBMiss_cmp)
 #if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
mtcrr12
 #endif
-   /* Load the MI_TWC with the attributes for this "segment." */
-   mtspr   SPRN_MI_TWC, r11/* Set segment attributes */
 
 #ifdef CONFIG_SWAP
-   rlwinm  r11, r10, 32-5, _PAGE_PRESENT
-   and r11, r11, r10
-   rlwimi  r10, r11, 0, _PAGE_PRESENT
+   rlwinm  r11, r10, 31, _PAGE_ACCESSED >> 1
 #endif
+   /* Load the MI_TWC with the attributes for this "segment." */
+   mtspr   SPRN_MI_TWC, r11/* Set segment attributes */
+
li  r11, RPN_PATTERN | 0x200
/* The Linux PTE won't go exactly into the MMU TLB.
 * Software indicator bits 20 and 23 must be clear.
@@ -472,22 +471,14 @@ _ENTRY(DTLBMiss_jmp)
 * above.
 */
rlwimi  r11, r10, 0, _PAGE_GUARDED
-   mtspr   SPRN_MD_TWC, r11
-
-   /* Both _PAGE_ACCESSED and _PAGE_PRESENT has to be set.
-* We also need to know if the insn is a load/store, so:
-* Clear _PAGE_PRESENT and load that which will
-* trap into DTLB Error with store bit set accordinly.
- 

[PATCH 7/8] powerpc/8xx: Remove _PAGE_USER and handle user access at PMD level

2018-01-12 Thread Christophe Leroy
As the Linux kernel separates KERNEL and USER address spaces, there is
no need to flag USER access at page level.

Today, the 8xx TLB handlers already handle user access in the L1 entry
through Access Protection Groups; it is then natural to move the user
access handling to the PMD level once _PAGE_NA allows handling PAGE_NONE
protection without _PAGE_USER.

In the meantime, as we free up one bit in the PTE, we can use it to
include SPS (the page size flag) in the PTE and avoid handling it at
every TLB miss, hence removing the special handling based on compiled-in
page size.

For _PAGE_EXEC, we rework it to use the PP PTE bits, avoiding the copy
of the _PAGE_EXEC bit into the L1 entry. Unfortunately we are not
able to put it at the correct location as it conflicts with the
NA/RO/RW bits for data entries.

Upper bits of APG in L1 entry overlap with PMD base address. In
order to avoid having to filter that out, we set up all groups so that
upper bits can have any value.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/hugetlb.h   |  3 +-
 arch/powerpc/include/asm/mmu-8xx.h   | 34 ++--
 arch/powerpc/include/asm/nohash/32/pgalloc.h |  3 +-
 arch/powerpc/include/asm/nohash/32/pte-8xx.h | 14 ++---
 arch/powerpc/include/asm/nohash/pgtable.h|  2 +-
 arch/powerpc/include/asm/pte-common.h|  6 
 arch/powerpc/kernel/head_8xx.S   | 46 ++--
 arch/powerpc/mm/hugetlbpage.c|  2 +-
 8 files changed, 48 insertions(+), 62 deletions(-)

diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index 14c9d44f355b..1a4847f67ea8 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -47,8 +47,7 @@ static inline pte_t *hugepd_page(hugepd_t hpd)
 {
BUG_ON(!hugepd_ok(hpd));
 #ifdef CONFIG_PPC_8xx
-   return (pte_t *)__va(hpd_val(hpd) &
-~(_PMD_PAGE_MASK | _PMD_PRESENT_MASK));
+   return (pte_t *)__va(hpd_val(hpd) & ~HUGEPD_SHIFT_MASK);
 #else
return (pte_t *)((hpd_val(hpd) &
  ~HUGEPD_SHIFT_MASK) | PD_HUGE);
diff --git a/arch/powerpc/include/asm/mmu-8xx.h 
b/arch/powerpc/include/asm/mmu-8xx.h
index 40aa7b0cd0dc..ae68f6c848d3 100644
--- a/arch/powerpc/include/asm/mmu-8xx.h
+++ b/arch/powerpc/include/asm/mmu-8xx.h
@@ -29,17 +29,17 @@
 #define MI_Kp		0x40000000	/* Should always be set */
 
 /*
- * All pages' PP exec bits are set to 000, which means Execute for Supervisor
- * and no Execute for User.
- * Then we use the APG to say whether accesses are according to Page rules,
- * "all Supervisor" rules (Exec for all) and "all User" rules (Exec for noone)
- * Therefore, we define 4 APG groups. msb is _PAGE_EXEC, lsb is _PAGE_USER
- * 0 (00) => Not User, no exec => 11 (all accesses performed as user)
- * 1 (01) => User but no exec => 11 (all accesses performed as user)
- * 2 (10) => Not User, exec => 01 (rights according to page definition)
- * 3 (11) => User, exec => 00 (all accesses performed as supervisor)
- */
-#define MI_APG_INIT	0xf4ffffff
+ * All pages' PP data bits are set to either 001 or 011 by copying _PAGE_EXEC
+ * into bit 21 in the ITLBmiss handler (bit 21 is the middle bit), which means
+ * respectively NA for All or X for Supervisor and no access for User.
+ * Then we use the APG to say whether accesses are according to Page rules or
+ * "all Supervisor" rules (Access to all)
+ * Therefore, we define 2 APG groups. lsb is _PMD_USER
+ * 0 => No user => 01 (all accesses performed according to page definition)
+ * 1 => User => 00 (all accesses performed as supervisor iaw page definition)
+ * We define all 16 groups so that all other bits of APG can take any value
+ */
+#define MI_APG_INIT	0x44444444
 
 /* The effective page number register.  When read, contains the information
  * about the last instruction TLB miss.  When MI_RPN is written, bits in
@@ -102,17 +102,17 @@
 #define MD_Kp		0x40000000	/* Should always be set */
 
 /*
- * All pages' PP data bits are set to either 000 or 011, which means
+ * All pages' PP data bits are set to either 000 or 011 or 001, which means
  * respectively RW for Supervisor and no access for User, or RO for
- * Supervisor and no access for user.
+ * Supervisor and no access for user and NA for ALL.
  * Then we use the APG to say whether accesses are according to Page rules or
  * "all Supervisor" rules (Access to all)
- * Therefore, we define 2 APG groups. lsb is _PAGE_USER
+ * Therefore, we define 2 APG groups. lsb is _PMD_USER
  * 0 => No user => 01 (all accesses performed according to page definition)
- * 1 => User => 00 (all accesses performed as supervisor
- * according to page definition)
+ * 1 => User => 00 (all accesses performed as supervisor iaw page definition)
+ * We define all 16 groups so that all other bits of APG can take any value
  */
-#define 

[PATCH 6/8] powerpc/mm: Introduce _PAGE_NA

2018-01-12 Thread Christophe Leroy
Today, PAGE_NONE is defined as a page not having _PAGE_USER.
In some circumstances, when the CPU supports it, it might be
better to be able to flag a page with NO ACCESS.

In a following patch, the 8xx will switch to flagging user access
in the PMD, therefore it will not be possible anymore to use
_PAGE_USER as a way to flag a page with no access.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |  1 +
 arch/powerpc/include/asm/nohash/32/pgtable.h |  2 +-
 arch/powerpc/include/asm/pte-common.h|  7 +--
 arch/powerpc/mm/dump_linuxpagetables.c   | 18 +++---
 4 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index db38050e1a98..f1c43d9b0773 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -14,6 +14,7 @@
  */
 #define _PAGE_BIT_SWAP_TYPE0
 
+#define _PAGE_NA   0
 #define _PAGE_RO   0
 #define _PAGE_USER 0
 
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h 
b/arch/powerpc/include/asm/nohash/32/pgtable.h
index cc2bfec3aa3b..504a3c36ce5c 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -282,7 +282,7 @@ static inline void __ptep_set_access_flags(struct mm_struct 
*mm,
 {
unsigned long set = pte_val(entry) &
(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
-   unsigned long clr = ~pte_val(entry) & _PAGE_RO;
+   unsigned long clr = ~pte_val(entry) & (_PAGE_RO | _PAGE_NA);
 
pte_update(ptep, clr, set);
 }
diff --git a/arch/powerpc/include/asm/pte-common.h 
b/arch/powerpc/include/asm/pte-common.h
index 0e6595a1b9d8..426a902816c5 100644
--- a/arch/powerpc/include/asm/pte-common.h
+++ b/arch/powerpc/include/asm/pte-common.h
@@ -50,6 +50,9 @@
 #define _PAGE_USER 0
 #endif
 #endif
+#ifndef _PAGE_NA
+#define _PAGE_NA 0
+#endif
 
 #ifndef _PMD_PRESENT_MASK
 #define _PMD_PRESENT_MASK  _PMD_PRESENT
@@ -122,7 +125,7 @@ static inline bool pte_user(pte_t pte)
 /* Mask of bits returned by pte_pgprot() */
 #define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \
 _PAGE_WRITETHRU | _PAGE_ENDIAN | _PAGE_4K_PFN | \
-_PAGE_USER | _PAGE_ACCESSED | _PAGE_RO | \
+_PAGE_USER | _PAGE_ACCESSED | _PAGE_RO | _PAGE_NA | \
 _PAGE_PRIVILEGED | \
 _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC)
 
@@ -150,7 +153,7 @@ static inline bool pte_user(pte_t pte)
  *
  * Note due to the way vm flags are laid out, the bits are XWR
  */
-#define PAGE_NONE  __pgprot(_PAGE_BASE)
+#define PAGE_NONE  __pgprot(_PAGE_BASE | _PAGE_NA)
 #define PAGE_SHARED__pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW)
 #define PAGE_SHARED_X  __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | \
 _PAGE_EXEC)
diff --git a/arch/powerpc/mm/dump_linuxpagetables.c 
b/arch/powerpc/mm/dump_linuxpagetables.c
index d9547e1ec5ef..4e8c6e593276 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -117,16 +117,20 @@ static const struct flag_info flag_array[] = {
.set= "user",
.clear  = "",
}, {
-#if _PAGE_RO == 0
-   .mask   = _PAGE_RW,
+   .mask   = _PAGE_RW | _PAGE_RO | _PAGE_NA,
.val= _PAGE_RW,
-#else
-   .mask   = _PAGE_RO,
-   .val= 0,
-#endif
.set= "rw",
-   .clear  = "ro",
}, {
+   .mask   = _PAGE_RW | _PAGE_RO | _PAGE_NA,
+   .val= _PAGE_RO,
+   .set= "ro",
+   }, {
+#if _PAGE_NA != 0
+   .mask   = _PAGE_RW | _PAGE_RO | _PAGE_NA,
+   .val= _PAGE_NA,
+   .set= "na",
+   }, {
+#endif
.mask   = _PAGE_EXEC,
.val= _PAGE_EXEC,
.set= " X ",
-- 
2.13.3



[PATCH 5/8] powerpc/mm: extend _PAGE_PRIVILEGED to all CPUs

2018-01-12 Thread Christophe Leroy
commit ac29c64089b74 ("powerpc/mm: Replace _PAGE_USER with
_PAGE_PRIVILEGED") introduced _PAGE_PRIVILEGED for BOOK3S/64.

This patch generalises _PAGE_PRIVILEGED to all CPUs, allowing each
to have either _PAGE_PRIVILEGED or _PAGE_USER or both.

PPC_8xx has a _PAGE_SHARED flag which is set for, and only for, all
non-user pages. Let's rename it _PAGE_PRIVILEGED to remove the
confusion, as it has nothing to do with Linux shared pages.

On BookE, there's a _PAGE_BAP_SR bit which has to be set for kernel
pages: defining _PAGE_PRIVILEGED as _PAGE_BAP_SR makes this
generic.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |  2 +-
 arch/powerpc/include/asm/nohash/32/pte-8xx.h | 10 +-
 arch/powerpc/include/asm/nohash/pte-book3e.h |  1 +
 arch/powerpc/include/asm/pte-common.h| 24 
 arch/powerpc/kernel/head_8xx.S   |  6 +++---
 arch/powerpc/mm/8xx_mmu.c|  2 +-
 arch/powerpc/mm/dump_linuxpagetables.c   | 11 +--
 arch/powerpc/mm/pgtable.c|  3 ++-
 arch/powerpc/mm/pgtable_32.c |  9 +
 arch/powerpc/mm/pgtable_64.c | 14 +-
 10 files changed, 28 insertions(+), 54 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 44697817ccc6..db38050e1a98 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -15,7 +15,7 @@
 #define _PAGE_BIT_SWAP_TYPE0
 
 #define _PAGE_RO   0
-#define _PAGE_SHARED   0
+#define _PAGE_USER 0
 
 #define _PAGE_EXEC 0x1 /* execute permission */
 #define _PAGE_WRITE0x2 /* write access allowed */
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h 
b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 19a5ecaef265..7c7040f015e2 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -31,7 +31,7 @@
 /* Definitions for 8xx embedded chips. */
 #define _PAGE_PRESENT  0x0001  /* Page is valid */
 #define _PAGE_NO_CACHE 0x0002  /* I: cache inhibit */
-#define _PAGE_SHARED   0x0004  /* No ASID (context) compare */
+#define _PAGE_PRIVILEGED   0x0004  /* No ASID (context) compare */
 #define _PAGE_SPECIAL  0x0008  /* SW entry, forced to 0 by the TLB miss */
 #define _PAGE_DIRTY0x0100  /* C: page changed */
 
@@ -54,13 +54,5 @@
 /* Until my rework is finished, 8xx still needs atomic PTE updates */
 #define PTE_ATOMIC_UPDATES 1
 
-/* We need to add _PAGE_SHARED to kernel pages */
-#define _PAGE_KERNEL_RO(_PAGE_SHARED | _PAGE_RO)
-#define _PAGE_KERNEL_ROX   (_PAGE_SHARED | _PAGE_RO | _PAGE_EXEC)
-#define _PAGE_KERNEL_RW(_PAGE_SHARED | _PAGE_DIRTY | _PAGE_RW 
| \
-_PAGE_HWWRITE)
-#define _PAGE_KERNEL_RWX   (_PAGE_SHARED | _PAGE_DIRTY | _PAGE_RW | \
-_PAGE_HWWRITE | _PAGE_EXEC)
-
 #endif /* __KERNEL__ */
 #endif /*  _ASM_POWERPC_NOHASH_32_PTE_8xx_H */
diff --git a/arch/powerpc/include/asm/nohash/pte-book3e.h 
b/arch/powerpc/include/asm/nohash/pte-book3e.h
index 2da4532ca377..ccee8eb509bb 100644
--- a/arch/powerpc/include/asm/nohash/pte-book3e.h
+++ b/arch/powerpc/include/asm/nohash/pte-book3e.h
@@ -55,6 +55,7 @@
 #define _PAGE_KERNEL_RWX   (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY | 
_PAGE_BAP_SX)
 #define _PAGE_KERNEL_ROX   (_PAGE_BAP_SR | _PAGE_BAP_SX)
 #define _PAGE_USER (_PAGE_BAP_UR | _PAGE_BAP_SR) /* Can be read */
+#define _PAGE_PRIVILEGED   (_PAGE_BAP_SR)
 
 #define _PAGE_HASHPTE  0
 #define _PAGE_BUSY 0
diff --git a/arch/powerpc/include/asm/pte-common.h 
b/arch/powerpc/include/asm/pte-common.h
index ce142ef99ba7..0e6595a1b9d8 100644
--- a/arch/powerpc/include/asm/pte-common.h
+++ b/arch/powerpc/include/asm/pte-common.h
@@ -8,9 +8,6 @@
 #ifndef _PAGE_HASHPTE
 #define _PAGE_HASHPTE  0
 #endif
-#ifndef _PAGE_SHARED
-#define _PAGE_SHARED   0
-#endif
 #ifndef _PAGE_HWWRITE
 #define _PAGE_HWWRITE  0
 #endif
@@ -45,6 +42,14 @@
 #ifndef _PAGE_PTE
 #define _PAGE_PTE 0
 #endif
+/* At least one of _PAGE_PRIVILEGED or _PAGE_USER must be defined */
+#ifndef _PAGE_PRIVILEGED
+#define _PAGE_PRIVILEGED 0
+#else
+#ifndef _PAGE_USER
+#define _PAGE_USER 0
+#endif
+#endif
 
 #ifndef _PMD_PRESENT_MASK
 #define _PMD_PRESENT_MASK  _PMD_PRESENT
@@ -54,16 +59,18 @@
 #define PMD_PAGE_SIZE(pmd) bad_call_to_PMD_PAGE_SIZE()
 #endif
 #ifndef _PAGE_KERNEL_RO
-#define _PAGE_KERNEL_RO(_PAGE_RO)
+#define _PAGE_KERNEL_RO(_PAGE_PRIVILEGED | _PAGE_RO)
 #endif
 #ifndef _PAGE_KERNEL_ROX
-#define _PAGE_KERNEL_ROX   (_PAGE_EXEC | _PAGE_RO)
+#define _PAGE_KERNEL_ROX   (_PAGE_PRIVILEGED | _PAGE_RO | _PAGE_EXEC)
 #endif
 #ifndef _PAGE_KERNEL_RW
-#define _PAGE_KERNEL_RW(_PAGE_DIRTY | _PAGE_RW | 

[PATCH 4/8] powerpc/8xx: remove unused _PAGE_WRITETHRU

2018-01-12 Thread Christophe Leroy
_PAGE_WRITETHRU is only used in:
* the AMIGA_Z2RAM block driver, which is never activated on PowerPC
* video/FB drivers, which are for PPC_PMAC

Therefore, there is no need to spend time in the 8xx TLB miss handlers
handling it.

And by removing it, we free up bit 20 which then avoids having
to clear it on each TLB miss.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/nohash/32/pte-8xx.h | 3 +--
 arch/powerpc/include/asm/nohash/pgtable.h| 2 ++
 arch/powerpc/kernel/head_8xx.S   | 5 -
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h 
b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 6dc0180fd5c7..19a5ecaef265 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -41,8 +41,7 @@
 #define _PAGE_GUARDED  0x0010  /* Copied to L1 G entry in DTLB */
 #define _PAGE_USER 0x0020  /* Copied to L1 APG lsb */
 #define _PAGE_EXEC 0x0040  /* Copied to L1 APG */
-#define _PAGE_WRITETHRU0x0080  /* software: caching is write through */
-#define _PAGE_ACCESSED 0x0800  /* software: page referenced */
+#define _PAGE_ACCESSED 0x0080  /* software: page referenced */
 
 #define _PAGE_RO   0x0600  /* Supervisor RO, User no access */
 
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h 
b/arch/powerpc/include/asm/nohash/pgtable.h
index 5c68f4a59f75..84120d65d0e9 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -212,8 +212,10 @@ extern int ptep_set_access_flags(struct vm_area_struct 
*vma, unsigned long addre
 #define pgprot_cached(prot)   (__pgprot((pgprot_val(prot) & 
~_PAGE_CACHE_CTL) | \
_PAGE_COHERENT))
 
+#if _PAGE_WRITETHRU != 0
 #define pgprot_cached_wthru(prot) (__pgprot((pgprot_val(prot) & 
~_PAGE_CACHE_CTL) | \
_PAGE_COHERENT | _PAGE_WRITETHRU))
+#endif
 
 #define pgprot_cached_noncoherent(prot) \
(__pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL))
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 641c9a9d4db2..6399dcadf51d 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -486,10 +486,6 @@ _ENTRY(DTLBMiss_jmp)
 * above.
 */
rlwimi  r11, r10, 0, 26, 27
-   /* Insert the WriteThru flag into the TWC from the Linux PTE.
-* It is bit 25 in the Linux PTE and bit 30 in the TWC
-*/
-   rlwimi  r11, r10, 32-5, 30, 30
mtspr   SPRN_MD_TWC, r11
 
/* In 4k pages mode, SPS (bit 28) in RPN must match PS[1] (bit 29)
@@ -523,7 +519,6 @@ _ENTRY(DTLBMiss_jmp)
 #else
rlwimi  r10, r11, 0, 24, 28 /* Set 24-27, clear 28 */
 #endif
-   rlwimi  r10, r11, 0, 20, 20 /* clear 20 */
mtspr   SPRN_MD_RPN, r10/* Update TLB entry */
 
/* Restore registers */
-- 
2.13.3



[PATCH 3/8] powerpc/8xx: Only perform perf counting when perf is in use.

2018-01-12 Thread Christophe Leroy
In the TLB miss handlers, updating the perf counter is only useful
when performing a perf analysis. As it has a noticeable overhead,
let's only do it when needed.

In order to do so, the exit of the miss handlers will be patched
when starting/stopping 'perf': the first register-restore
instruction of each exit point will be replaced by a jump to
the counting code.

Once this is done, CONFIG_PPC_8xx_PERF_EVENT becomes useless, as
the feature no longer adds any overhead when unused.
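
A sketch of how the patching could look from the perf driver's side (the
_ENTRY() labels match the ones added below in head_8xx.S; treating them as
u32 arrays, and the exact set of patched exit points, are assumptions of
this sketch, not the final code):

#include <asm/code-patching.h>

extern unsigned int itlb_miss_exit_1[], itlb_miss_perf[];

static unsigned int saved_insn;	/* original "mfspr r10, SPRN_SPRG_SCRATCH0" */

static void mpc8xx_pmu_start_sketch(void)
{
	saved_insn = *itlb_miss_exit_1;
	/* divert the exit point to the counting code */
	patch_branch(itlb_miss_exit_1, (unsigned long)itlb_miss_perf, 0);
}

static void mpc8xx_pmu_stop_sketch(void)
{
	/* restore the original register-restore instruction */
	patch_instruction(itlb_miss_exit_1, saved_insn);
}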

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/ppc-opcode.h  |  2 ++
 arch/powerpc/kernel/entry_32.S | 10 +++
 arch/powerpc/kernel/head_8xx.S | 47 --
 arch/powerpc/perf/8xx-pmu.c| 52 +++---
 arch/powerpc/perf/Makefile |  2 +-
 arch/powerpc/platforms/Kconfig.cputype |  7 -
 6 files changed, 88 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index ce0930d68857..ab5c1588b487 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -236,6 +236,7 @@
 #define PPC_INST_RFCI  0x4c000066
 #define PPC_INST_RFDI  0x4c00004e
 #define PPC_INST_RFMCI 0x4c00004c
+#define PPC_INST_MFSPR 0x7c0002a6
 #define PPC_INST_MFSPR_DSCR0x7c1102a6
 #define PPC_INST_MFSPR_DSCR_MASK   0xfc1ffffe
 #define PPC_INST_MTSPR_DSCR0x7c1103a6
@@ -383,6 +384,7 @@
 #define __PPC_ME64(s)  __PPC_MB64(s)
 #define __PPC_BI(s)(((s) & 0x1f) << 16)
 #define __PPC_CT(t)(((t) & 0x0f) << 21)
+#define __PPC_SPR(r)	((((r) & 0x1f) << 16) | ((((r) >> 5) & 0x1f) << 11))
 
 /*
  * Only use the larx hint bit on 64bit CPUs. e500v1/v2 based CPUs will treat a
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index e780e1fbf6c2..eb8d01bae8c6 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -211,7 +211,7 @@ transfer_to_handler_cont:
mflrr9
lwz r11,0(r9)   /* virtual address of handler */
lwz r9,4(r9)/* where to go when done */
-#ifdef CONFIG_PPC_8xx_PERF_EVENT
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
mtspr   SPRN_NRI, r0
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
@@ -301,7 +301,7 @@ stack_ovf:
lis r9,StackOverflow@ha
addir9,r9,StackOverflow@l
LOAD_MSR_KERNEL(r10,MSR_KERNEL)
-#ifdef CONFIG_PPC_8xx_PERF_EVENT
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
mtspr   SPRN_NRI, r0
 #endif
mtspr   SPRN_SRR0,r9
@@ -430,7 +430,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
lwz r7,_NIP(r1)
lwz r2,GPR2(r1)
lwz r1,GPR1(r1)
-#ifdef CONFIG_PPC_8xx_PERF_EVENT
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
mtspr   SPRN_NRI, r0
 #endif
mtspr   SPRN_SRR0,r7
@@ -727,7 +727,7 @@ fast_exception_return:
lwz r10,_LINK(r11)
mtlrr10
REST_GPR(10, r11)
-#ifdef CONFIG_PPC_8xx_PERF_EVENT
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
mtspr   SPRN_NRI, r0
 #endif
mtspr   SPRN_SRR1,r9
@@ -978,7 +978,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
.globl exc_exit_restart
 exc_exit_restart:
lwz r12,_NIP(r1)
-#ifdef CONFIG_PPC_8xx_PERF_EVENT
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
mtspr   SPRN_NRI, r0
 #endif
mtspr   SPRN_SRR0,r12
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index eda582b96dbf..641c9a9d4db2 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -304,12 +304,6 @@ InstructionTLBMiss:
 #if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
mtspr   SPRN_SPRG_SCRATCH2, r12
 #endif
-#ifdef CONFIG_PPC_8xx_PERF_EVENT
-   lis r10, (itlb_miss_counter - PAGE_OFFSET)@ha
-   lwz r11, (itlb_miss_counter - PAGE_OFFSET)@l(r10)
-   addir11, r11, 1
-   stw r11, (itlb_miss_counter - PAGE_OFFSET)@l(r10)
-#endif
 
/* If we are faulting a kernel address, we have to use the
 * kernel page tables.
@@ -392,6 +386,20 @@ _ENTRY(ITLBMiss_cmp)
mtspr   SPRN_MI_RPN, r10/* Update TLB entry */
 
/* Restore registers */
+_ENTRY(itlb_miss_exit_1)
+   mfspr   r10, SPRN_SPRG_SCRATCH0
+   mfspr   r11, SPRN_SPRG_SCRATCH1
+#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
+   mfspr   r12, SPRN_SPRG_SCRATCH2
+#endif
+   rfi
+#ifdef CONFIG_PERF_EVENTS
+_ENTRY(itlb_miss_perf)
+   lis r10, (itlb_miss_counter - PAGE_OFFSET)@ha
+   lwz r11, (itlb_miss_counter - PAGE_OFFSET)@l(r10)
+   addir11, r11, 1
+   stw r11, (itlb_miss_counter - PAGE_OFFSET)@l(r10)
+#endif
mfspr   r10, SPRN_SPRG_SCRATCH0
mfspr 

[PATCH 2/8] powerpc/8xx: remove EXCEPTION_PROLOG/EPILOG_0 and change r3 to r12

2018-01-12 Thread Christophe Leroy
EXCEPTION_PROLOG_0 and EXCEPTION_EPILOG_0 were added some
time ago in order to regroup the two mtspr/mfspr to SCRATCH0 and
SCRATCH1 and the mfcr/mtcr, in order to ease entry and exit of
functions not using the full EXCEPTION_PROLOG.

Since then, the mfcr/mtcr has been taken out, leaving just
the two mtspr/mfspr in each macro.

In order to improve readability of the exception functions, we
remove those two macros and write the two mtspr/mfspr back inline.

As r10 and r11 are used for SCRATCH0 and SCRATCH1, let's also use
r12 for SCRATCH2; this also improves readability and maintainability.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_8xx.S | 78 ++
 1 file changed, 40 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 728b513c07b8..eda582b96dbf 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -117,15 +117,12 @@ turn_on_mmu:
  * task's thread_struct.
  */
 #define EXCEPTION_PROLOG   \
-   EXCEPTION_PROLOG_0; \
+   mtspr   SPRN_SPRG_SCRATCH0, r10;\
+   mtspr   SPRN_SPRG_SCRATCH1, r11;\
mfcrr10;\
EXCEPTION_PROLOG_1; \
EXCEPTION_PROLOG_2
 
-#define EXCEPTION_PROLOG_0 \
-   mtspr   SPRN_SPRG_SCRATCH0,r10; \
-   mtspr   SPRN_SPRG_SCRATCH1,r11
-
 #define EXCEPTION_PROLOG_1 \
mfspr   r11,SPRN_SRR1;  /* check whether user or kernel */ \
andi.   r11,r11,MSR_PR; \
@@ -160,13 +157,6 @@ turn_on_mmu:
SAVE_2GPRS(7, r11)
 
 /*
- * Exception exit code.
- */
-#define EXCEPTION_EPILOG_0 \
-   mfspr   r10,SPRN_SPRG_SCRATCH0; \
-   mfspr   r11,SPRN_SPRG_SCRATCH1
-
-/*
  * Note: code which follows this uses cr0.eq (set if from kernel),
  * r11, r12 (SRR0), and r9 (SRR1).
  *
@@ -309,10 +299,11 @@ SystemCall:
 #endif
 
 InstructionTLBMiss:
+   mtspr   SPRN_SPRG_SCRATCH0, r10
+   mtspr   SPRN_SPRG_SCRATCH1, r11
 #if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
-   mtspr   SPRN_SPRG_SCRATCH2, r3
+   mtspr   SPRN_SPRG_SCRATCH2, r12
 #endif
-   EXCEPTION_PROLOG_0
 #ifdef CONFIG_PPC_8xx_PERF_EVENT
lis r10, (itlb_miss_counter - PAGE_OFFSET)@ha
lwz r11, (itlb_miss_counter - PAGE_OFFSET)@l(r10)
@@ -328,7 +319,7 @@ InstructionTLBMiss:
/* Only modules will cause ITLB Misses as we always
 * pin the first 8MB of kernel memory */
 #if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
-   mfcrr3
+   mfcrr12
 #endif
 #ifdef ITLB_MISS_KERNEL
 #if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT)
@@ -371,7 +362,7 @@ _ENTRY(ITLBMiss_cmp)
lwz r10, 0(r10) /* Get the pte */
 4:
 #if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
-   mtcrr3
+   mtcrr12
 #endif
/* Insert the APG into the TWC from the Linux PTE. */
rlwimi  r11, r10, 0, 25, 26
@@ -401,10 +392,11 @@ _ENTRY(ITLBMiss_cmp)
mtspr   SPRN_MI_RPN, r10/* Update TLB entry */
 
/* Restore registers */
+   mfspr   r10, SPRN_SPRG_SCRATCH0
+   mfspr   r11, SPRN_SPRG_SCRATCH1
 #if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
-   mfspr   r3, SPRN_SPRG_SCRATCH2
+   mfspr   r12, SPRN_SPRG_SCRATCH2
 #endif
-   EXCEPTION_EPILOG_0
rfi
 
 #ifdef CONFIG_HUGETLB_PAGE
@@ -434,15 +426,16 @@ _ENTRY(ITLBMiss_cmp)
 
. = 0x1200
 DataStoreTLBMiss:
-   mtspr   SPRN_SPRG_SCRATCH2, r3
-   EXCEPTION_PROLOG_0
+   mtspr   SPRN_SPRG_SCRATCH0, r10
+   mtspr   SPRN_SPRG_SCRATCH1, r11
+   mtspr   SPRN_SPRG_SCRATCH2, r12
 #ifdef CONFIG_PPC_8xx_PERF_EVENT
lis r10, (dtlb_miss_counter - PAGE_OFFSET)@ha
lwz r11, (dtlb_miss_counter - PAGE_OFFSET)@l(r10)
addir11, r11, 1
stw r11, (dtlb_miss_counter - PAGE_OFFSET)@l(r10)
 #endif
-   mfcrr3
+   mfcrr12
 
/* If we are faulting a kernel address, we have to use the
 * kernel page tables.
@@ -482,7 +475,7 @@ _ENTRY(DTLBMiss_jmp)
rlwimi  r10, r11, 0, 0, 32 - PAGE_SHIFT - 1 /* Add level 2 base */
lwz r10, 0(r10) /* Get the pte */
 4:
-   mtcrr3
+   mtcrr12
 
/* Insert the Guarded flag and APG into the TWC from the Linux PTE.
 * It is bit 26-27 of both the Linux PTE and the TWC (at least
@@ -532,9 +525,10 @@ _ENTRY(DTLBMiss_jmp)
mtspr   SPRN_MD_RPN, r10/* Update TLB entry */
 
/* Restore registers */
-   mfspr   r3, SPRN_SPRG_SCRATCH2
mtspr   SPRN_DAR, r11   /* Tag DAR */
-   EXCEPTION_EPILOG_0
+   mfspr   r10, SPRN_SPRG_SCRATCH0
+   mfspr   r11, SPRN_SPRG_SCRATCH1
+   mfspr   r12, SPRN_SPRG_SCRATCH2
rfi
 
 #ifdef CONFIG_HUGETLB_PAGE
@@ -584,7 +578,8 @@ itlbie:
  */
. = 0x1400
 DataTLBError:
-   

[PATCH 1/8] powerpc/8xx: Remove CPU6 ERRATA Workaround

2018-01-12 Thread Christophe Leroy
The CPU6 ERRATA affects only MPC860 revisions prior to C.0.
Manufacturing of those revisions was stopped in 1999-2000.
Therefore, it has been almost 20 years since this ERRATA has been
fixed in the silicon.

This patch removes the workaround for that ERRATA.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/configs/mpc866_ads_defconfig |  1 -
 arch/powerpc/include/asm/reg_8xx.h| 82 ---
 arch/powerpc/kernel/head_8xx.S| 54 +---
 arch/powerpc/platforms/8xx/Kconfig| 12 -
 4 files changed, 12 insertions(+), 137 deletions(-)

diff --git a/arch/powerpc/configs/mpc866_ads_defconfig 
b/arch/powerpc/configs/mpc866_ads_defconfig
index f1f176c29fa3..5320735395e7 100644
--- a/arch/powerpc/configs/mpc866_ads_defconfig
+++ b/arch/powerpc/configs/mpc866_ads_defconfig
@@ -13,7 +13,6 @@ CONFIG_EXPERT=y
 CONFIG_PARTITION_ADVANCED=y
 CONFIG_MPC86XADS=y
 CONFIG_8xx_COPYBACK=y
-CONFIG_8xx_CPU6=y
 CONFIG_GEN_RTC=y
 CONFIG_HZ_1000=y
 CONFIG_MATH_EMULATION=y
diff --git a/arch/powerpc/include/asm/reg_8xx.h 
b/arch/powerpc/include/asm/reg_8xx.h
index 53a7e2955d3e..7192eece6c3e 100644
--- a/arch/powerpc/include/asm/reg_8xx.h
+++ b/arch/powerpc/include/asm/reg_8xx.h
@@ -66,86 +66,4 @@
 #define DC_DFWT	0x40000000	/* Data cache is forced write through */
 #define DC_LES	0x20000000	/* Caches are little endian mode */
 
-#ifdef CONFIG_8xx_CPU6
-#define do_mtspr_cpu6(rn, rn_addr, v)  \
-   do {\
-   int _reg_cpu6 = rn_addr, _tmp_cpu6; \
-   asm volatile("stw %0, %1;"  \
-"lwz %0, %1;"  \
-"mtspr " __stringify(rn) ",%2" :   \
-: "r" (_reg_cpu6), "m"(_tmp_cpu6), \
-  "r" ((unsigned long)(v)) \
-: "memory");   \
-   } while (0)
-
-#define do_mtspr(rn, v)asm volatile("mtspr " __stringify(rn) ",%0" :   
\
-: "r" ((unsigned long)(v)) \
-: "memory")
-#define mtspr(rn, v) \
-   do {\
-   if (rn == SPRN_IMMR)\
-   do_mtspr_cpu6(rn, 0x3d30, v);   \
-   else if (rn == SPRN_IC_CST) \
-   do_mtspr_cpu6(rn, 0x2110, v);   \
-   else if (rn == SPRN_IC_ADR) \
-   do_mtspr_cpu6(rn, 0x2310, v);   \
-   else if (rn == SPRN_IC_DAT) \
-   do_mtspr_cpu6(rn, 0x2510, v);   \
-   else if (rn == SPRN_DC_CST) \
-   do_mtspr_cpu6(rn, 0x3110, v);   \
-   else if (rn == SPRN_DC_ADR) \
-   do_mtspr_cpu6(rn, 0x3310, v);   \
-   else if (rn == SPRN_DC_DAT) \
-   do_mtspr_cpu6(rn, 0x3510, v);   \
-   else if (rn == SPRN_MI_CTR) \
-   do_mtspr_cpu6(rn, 0x2180, v);   \
-   else if (rn == SPRN_MI_AP)  \
-   do_mtspr_cpu6(rn, 0x2580, v);   \
-   else if (rn == SPRN_MI_EPN) \
-   do_mtspr_cpu6(rn, 0x2780, v);   \
-   else if (rn == SPRN_MI_TWC) \
-   do_mtspr_cpu6(rn, 0x2b80, v);   \
-   else if (rn == SPRN_MI_RPN) \
-   do_mtspr_cpu6(rn, 0x2d80, v);   \
-   else if (rn == SPRN_MI_CAM) \
-   do_mtspr_cpu6(rn, 0x2190, v);   \
-   else if (rn == SPRN_MI_RAM0)\
-   do_mtspr_cpu6(rn, 0x2390, v);   \
-   else if (rn == SPRN_MI_RAM1)\
-   do_mtspr_cpu6(rn, 0x2590, v);   \
-   else if (rn == SPRN_MD_CTR) \
-   do_mtspr_cpu6(rn, 0x3180, v);   \
-   else if (rn == SPRN_M_CASID)\
-   do_mtspr_cpu6(rn, 0x3380, v);   \
-   else if (rn == SPRN_MD_AP)   

Re: [PATCH v6 1/2] powerpc/powernv: Enable tunneled operations

2018-01-12 Thread christophe lombard

Le 11/01/2018 à 16:01, Philippe Bergheaud a écrit :

P9 supports PCI tunneled operations (atomics and as_notify). This
patch adds support for tunneled operations on powernv, with a new
API, to be called by device drivers:

pnv_pci_get_tunnel_ind()
Tell driver the 16-bit ASN indication used by kernel.

pnv_pci_set_tunnel_bar()
Tell kernel the Tunnel BAR Response address used by driver.
This function uses two new OPAL calls, as the PBCQ Tunnel BAR
register is configured by skiboot.

pnv_pci_get_as_notify_info()
Return the ASN info of the thread to be woken up.

Signed-off-by: Philippe Bergheaud 
---
Changelog:

v2: Do not set the ASN indication. Get it from the device tree.

v3: Make pnv_pci_get_phb_node() available when compiling without cxl.

v4: Add pnv_pci_get_as_notify_info().
 Rebase opal call numbers on skiboot 5.9.6.

v5: pnv_pci_get_tunnel_ind():
   - fix node reference count
 pnv_pci_get_as_notify_info():
   - fail if task == NULL
   - read pid from mm->context.id
   - explain that thread.tidr require CONFIG_PPC64

v6: pnv_pci_get_tunnel_ind():
   - check if radix is enabled, or else return an error
  pnv_pci_get_as_notify_info():
   - remove a capi-specific comment, irrelevant for pci

This patch depends on the following skiboot patches:
   https://patchwork.ozlabs.org/patch/858324/
   https://patchwork.ozlabs.org/patch/858325/
---
  arch/powerpc/include/asm/opal-api.h|   4 +-
  arch/powerpc/include/asm/opal.h|   2 +
  arch/powerpc/include/asm/pnv-pci.h |   5 ++
  arch/powerpc/platforms/powernv/opal-wrappers.S |   2 +
  arch/powerpc/platforms/powernv/pci-cxl.c   |   8 --
  arch/powerpc/platforms/powernv/pci.c   | 106 +
  6 files changed, 118 insertions(+), 9 deletions(-)



Acked-by: Christophe Lombard 




Re: [PATCH v6 2/2] cxl: read PHB indications from the device tree

2018-01-12 Thread christophe lombard

Le 11/01/2018 à 16:01, Philippe Bergheaud a écrit :

Configure the P9 XSL_DSNCTL register with PHB indications found
in the device tree, or else use legacy hard-coded values.

Signed-off-by: Philippe Bergheaud 
---
Changelog:

v2: New patch. Use the new device tree property "ibm,phb-indications".

v3: No change.

v4: No functional change.
 Drop cosmetic fix in comment.

v5: get_phb_indications():
   - make static variables local to function.
   - return static variable values by arguments.

v6: get_phb_indications():
   - acquire a mutex before setting the phb indications.

This patch depends on the following skiboot patch:
   https://patchwork.ozlabs.org/patch/858324/
---
  drivers/misc/cxl/cxl.h|  2 +-
  drivers/misc/cxl/cxllib.c |  2 +-
  drivers/misc/cxl/pci.c| 50 ++-
  3 files changed, 47 insertions(+), 7 deletions(-)



Acked-by: Christophe Lombard 



Re: [PATCH] cpufreq: powernv: Dont assume distinct pstate values for nominal and pmin

2018-01-12 Thread Viresh Kumar
On 12-01-18, 12:43, Shilpasri G Bhat wrote:
> Some OpenPOWER boxes can have the same pstate values for nominal and
> pmin pstates. In these boxes the current code will not initialize the
> 'powernv_pstate_info.min' variable, resulting in erroneous CPU
> frequency reporting. This patch fixes this problem.
> 
> Fixes: 09ca4c9b5958 ("cpufreq: powernv: Replacing pstate_id with frequency 
> table index")
> Reported-by: Alvin Wang 
> Signed-off-by: Shilpasri G Bhat 
> ---
>  drivers/cpufreq/powernv-cpufreq.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cpufreq/powernv-cpufreq.c 
> b/drivers/cpufreq/powernv-cpufreq.c
> index b6d7c4c..da7fdb4 100644
> --- a/drivers/cpufreq/powernv-cpufreq.c
> +++ b/drivers/cpufreq/powernv-cpufreq.c
> @@ -288,9 +288,9 @@ static int init_powernv_pstates(void)
>  
>   if (id == pstate_max)
>   powernv_pstate_info.max = i;
> - else if (id == pstate_nominal)
> + if (id == pstate_nominal)
>   powernv_pstate_info.nominal = i;
> - else if (id == pstate_min)
> + if (id == pstate_min)
>   powernv_pstate_info.min = i;
>  
>   if (powernv_pstate_info.wof_enabled && id == pstate_turbo) {

Acked-by: Viresh Kumar 

-- 
viresh
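
A quick aside on why dropping the else-if is the whole fix: an else-if
chain records only the first role a table index matches, so when the
nominal and pmin pstates share a value, powernv_pstate_info.min is
never written.  A minimal userspace C model of the loop (the table
values and struct below are invented for illustration; this is not the
kernel code):

#include <stdio.h>

struct pstate_info { int max, nominal, min; };

int main(void)
{
	int ids[] = { 3, 2, 1 };	/* hypothetical pstate table */
	int pstate_max = 3, pstate_nominal = 1, pstate_min = 1;
	struct pstate_info info = { 0 };

	for (int i = 0; i < 3; i++) {
		if (ids[i] == pstate_max)
			info.max = i;
		if (ids[i] == pstate_nominal)	/* was "else if" */
			info.nominal = i;
		if (ids[i] == pstate_min)	/* was "else if" */
			info.min = i;
	}
	/* prints max=0 nominal=2 min=2; with the else-if chain, min
	 * would stay 0 and point at the max-frequency entry */
	printf("max=%d nominal=%d min=%d\n", info.max, info.nominal, info.min);
	return 0;
}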


Re: [PATCH V7] cxl: Add support for ASB_Notify on POWER9

2018-01-12 Thread Vaibhav Jain
Christophe Lombard  writes:

> The POWER9 core supports a new feature: ASB_Notify which requires the
> support of the Special Purpose Register: TIDR.
>
> The ASB_Notify command, generated by the AFU, will attempt to
> wake-up the host thread identified by the particular LPID:PID:TID.
>
> This patch assigns a unique TIDR (thread id) for the current thread which
> will be used in the process element entry.
>
> Signed-off-by: Christophe Lombard 
> Reviewed-by: Philippe Bergheaud 
>
Reviewed-by: Vaibhav Jain 



[PATCH 34/34] h8300: use dma-direct

2018-01-12 Thread Christoph Hellwig
Replace the bare-bones h8300 direct dma mapping implementation with
the fully featured generic dma-direct one.

Signed-off-by: Christoph Hellwig 
---
 arch/h8300/Kconfig   |  1 +
 arch/h8300/include/asm/Kbuild|  1 +
 arch/h8300/include/asm/dma-mapping.h | 12 ---
 arch/h8300/kernel/Makefile   |  2 +-
 arch/h8300/kernel/dma.c  | 66 
 5 files changed, 3 insertions(+), 79 deletions(-)
 delete mode 100644 arch/h8300/include/asm/dma-mapping.h
 delete mode 100644 arch/h8300/kernel/dma.c

diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index f8d3fde08190..091d6d04b5e5 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -23,6 +23,7 @@ config H8300
select HAVE_ARCH_KGDB
select HAVE_ARCH_HASH
select CPU_NO_EFFICIENT_FFS
+   select DMA_DIRECT_OPS
 
 config CPU_BIG_ENDIAN
def_bool y
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index bc077491d299..642752c94306 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -9,6 +9,7 @@ generic-y += delay.h
 generic-y += device.h
 generic-y += div64.h
 generic-y += dma.h
+generic-y += dma-mapping.h
 generic-y += emergency-restart.h
 generic-y += exec.h
 generic-y += extable.h
diff --git a/arch/h8300/include/asm/dma-mapping.h 
b/arch/h8300/include/asm/dma-mapping.h
deleted file mode 100644
index 21bb1fc3a6f1..
--- a/arch/h8300/include/asm/dma-mapping.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _H8300_DMA_MAPPING_H
-#define _H8300_DMA_MAPPING_H
-
-extern const struct dma_map_ops h8300_dma_map_ops;
-
-static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
-{
-   return &h8300_dma_map_ops;
-}
-
-#endif
diff --git a/arch/h8300/kernel/Makefile b/arch/h8300/kernel/Makefile
index b62e830525c6..307aa51576dd 100644
--- a/arch/h8300/kernel/Makefile
+++ b/arch/h8300/kernel/Makefile
@@ -7,7 +7,7 @@ extra-y := vmlinux.lds
 
 obj-y := process.o traps.o ptrace.o \
 signal.o setup.o syscalls.o \
-irq.o entry.o dma.o
+irq.o entry.o
 
 obj-$(CONFIG_ROMKERNEL) += head_rom.o
 obj-$(CONFIG_RAMKERNEL) += head_ram.o
diff --git a/arch/h8300/kernel/dma.c b/arch/h8300/kernel/dma.c
deleted file mode 100644
index d44ba5db4ac3..
--- a/arch/h8300/kernel/dma.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file COPYING in the main directory of this archive
- * for more details.
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-
-static void *dma_alloc(struct device *dev, size_t size,
-  dma_addr_t *dma_handle, gfp_t gfp,
-  unsigned long attrs)
-{
-   void *ret;
-
-   if (dev == NULL || (*dev->dma_mask < 0xffffffff))
-   gfp |= GFP_DMA;
-   ret = (void *)__get_free_pages(gfp, get_order(size));
-
-   if (ret != NULL) {
-   memset(ret, 0, size);
-   *dma_handle = virt_to_phys(ret);
-   }
-   return ret;
-}
-
-static void dma_free(struct device *dev, size_t size,
-void *vaddr, dma_addr_t dma_handle,
-unsigned long attrs)
-
-{
-   free_pages((unsigned long)vaddr, get_order(size));
-}
-
-static dma_addr_t map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction direction,
- unsigned long attrs)
-{
-   return page_to_phys(page) + offset;
-}
-
-static int map_sg(struct device *dev, struct scatterlist *sgl,
- int nents, enum dma_data_direction direction,
- unsigned long attrs)
-{
-   struct scatterlist *sg;
-   int i;
-
-   for_each_sg(sgl, sg, nents, i) {
-   sg->dma_address = sg_phys(sg);
-   }
-
-   return nents;
-}
-
-const struct dma_map_ops h8300_dma_map_ops = {
-   .alloc = dma_alloc,
-   .free = dma_free,
-   .map_page = map_page,
-   .map_sg = map_sg,
-};
-EXPORT_SYMBOL(h8300_dma_map_ops);
-- 
2.14.2



[PATCH 33/34] cris: use dma-direct

2018-01-12 Thread Christoph Hellwig
cris currently has an incomplete direct mapping dma_map_ops implementation
if PCI support is enabled.  Replace it with the fully featured generic
dma-direct implementation.

Signed-off-by: Christoph Hellwig 
Acked-by: Jesper Nilsson 
---
 arch/cris/Kconfig   |  4 ++
 arch/cris/arch-v32/drivers/pci/Makefile |  2 +-
 arch/cris/arch-v32/drivers/pci/dma.c| 77 -
 arch/cris/include/asm/Kbuild|  1 +
 arch/cris/include/asm/dma-mapping.h | 20 -
 5 files changed, 6 insertions(+), 98 deletions(-)
 delete mode 100644 arch/cris/arch-v32/drivers/pci/dma.c
 delete mode 100644 arch/cris/include/asm/dma-mapping.h

diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 54d3f426763b..cd5a0865c97f 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -33,6 +33,9 @@ config GENERIC_CALIBRATE_DELAY
 config NO_IOPORT_MAP
def_bool y if !PCI
 
+config NO_DMA
+   def_bool y if !PCI
+
 config FORCE_MAX_ZONEORDER
int
default 6
@@ -72,6 +75,7 @@ config CRIS
select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32
select HAVE_DEBUG_BUGVERBOSE if ETRAX_ARCH_V32
select HAVE_NMI
+   select DMA_DIRECT_OPS if PCI
 
 config HZ
int
diff --git a/arch/cris/arch-v32/drivers/pci/Makefile 
b/arch/cris/arch-v32/drivers/pci/Makefile
index bff7482f2444..93c8be6170b1 100644
--- a/arch/cris/arch-v32/drivers/pci/Makefile
+++ b/arch/cris/arch-v32/drivers/pci/Makefile
@@ -2,4 +2,4 @@
 # Makefile for Etrax cardbus driver
 #
 
-obj-$(CONFIG_ETRAX_CARDBUS)+= bios.o dma.o
+obj-$(CONFIG_ETRAX_CARDBUS)+= bios.o
diff --git a/arch/cris/arch-v32/drivers/pci/dma.c 
b/arch/cris/arch-v32/drivers/pci/dma.c
deleted file mode 100644
index 8c3802244ef3..
--- a/arch/cris/arch-v32/drivers/pci/dma.c
+++ /dev/null
@@ -1,77 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Dynamic DMA mapping support.
- *
- * On cris there is no hardware dynamic DMA address translation,
- * so consistent alloc/free are merely page allocation/freeing.
- * The rest of the dynamic DMA mapping interface is implemented
- * in asm/pci.h.
- *
- * Borrowed from i386.
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-static void *v32_dma_alloc(struct device *dev, size_t size,
-   dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
-{
-   void *ret;
-
-   if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
-   gfp |= GFP_DMA;
-
-   ret = (void *)__get_free_pages(gfp,  get_order(size));
-
-   if (ret != NULL) {
-   memset(ret, 0, size);
-   *dma_handle = virt_to_phys(ret);
-   }
-   return ret;
-}
-
-static void v32_dma_free(struct device *dev, size_t size, void *vaddr,
-   dma_addr_t dma_handle, unsigned long attrs)
-{
-   free_pages((unsigned long)vaddr, get_order(size));
-}
-
-static inline dma_addr_t v32_dma_map_page(struct device *dev,
-   struct page *page, unsigned long offset, size_t size,
-   enum dma_data_direction direction, unsigned long attrs)
-{
-   return page_to_phys(page) + offset;
-}
-
-static inline int v32_dma_map_sg(struct device *dev, struct scatterlist *sg,
-   int nents, enum dma_data_direction direction,
-   unsigned long attrs)
-{
-   printk("Map sg\n");
-   return nents;
-}
-
-static inline int v32_dma_supported(struct device *dev, u64 mask)
-{
-/*
- * we fall back to GFP_DMA when the mask isn't all 1s,
- * so we can't guarantee allocations that must be
- * within a tighter range than GFP_DMA..
- */
-    if (mask < 0x00ffffff)
-return 0;
-   return 1;
-}
-
-const struct dma_map_ops v32_dma_ops = {
-   .alloc  = v32_dma_alloc,
-   .free   = v32_dma_free,
-   .map_page   = v32_dma_map_page,
-   .map_sg = v32_dma_map_sg,
-   .dma_supported  = v32_dma_supported,
-};
-EXPORT_SYMBOL(v32_dma_ops);
diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild
index 460349cb147f..8cf45ac30c1b 100644
--- a/arch/cris/include/asm/Kbuild
+++ b/arch/cris/include/asm/Kbuild
@@ -5,6 +5,7 @@ generic-y += cmpxchg.h
 generic-y += current.h
 generic-y += device.h
 generic-y += div64.h
+generic-y += dma-mapping.h
 generic-y += emergency-restart.h
 generic-y += exec.h
 generic-y += extable.h
diff --git a/arch/cris/include/asm/dma-mapping.h 
b/arch/cris/include/asm/dma-mapping.h
deleted file mode 100644
index 1553bdb30a0c..
--- a/arch/cris/include/asm/dma-mapping.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_CRIS_DMA_MAPPING_H
-#define _ASM_CRIS_DMA_MAPPING_H
-
-#ifdef CONFIG_PCI
-extern const struct dma_map_ops v32_dma_ops;
-
-static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)

[PATCH 32/34] dma-direct: reject too small dma masks

2018-01-12 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
Reviewed-by: Robin Murphy 
---
 include/linux/dma-direct.h |  1 +
 lib/dma-direct.c   | 19 +++
 2 files changed, 20 insertions(+)

diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 4788bf0bf683..bcdb1a3e4b1f 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -42,5 +42,6 @@ void *dma_direct_alloc(struct device *dev, size_t size, 
dma_addr_t *dma_handle,
gfp_t gfp, unsigned long attrs);
 void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
dma_addr_t dma_addr, unsigned long attrs);
+int dma_direct_supported(struct device *dev, u64 mask);
 
 #endif /* _LINUX_DMA_DIRECT_H */
diff --git a/lib/dma-direct.c b/lib/dma-direct.c
index 784a68dfdbe3..40b1f92f2214 100644
--- a/lib/dma-direct.c
+++ b/lib/dma-direct.c
@@ -122,6 +122,24 @@ static int dma_direct_map_sg(struct device *dev, struct 
scatterlist *sgl,
return nents;
 }
 
+int dma_direct_supported(struct device *dev, u64 mask)
+{
+#ifdef CONFIG_ZONE_DMA
+   if (mask < DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
+   return 0;
+#else
+   /*
+* Because 32-bit DMA masks are so common we expect every architecture
+* to be able to satisfy them - either by not supporting more physical
+* memory, or by providing a ZONE_DMA32.  If neither is the case, the
+* architecture needs to use an IOMMU instead of the direct mapping.
+*/
+   if (mask < DMA_BIT_MASK(32))
+   return 0;
+#endif
+   return 1;
+}
+
 static int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
return dma_addr == DIRECT_MAPPING_ERROR;
@@ -132,6 +150,7 @@ const struct dma_map_ops dma_direct_ops = {
.free   = dma_direct_free,
.map_page   = dma_direct_map_page,
.map_sg = dma_direct_map_sg,
+   .dma_supported  = dma_direct_supported,
.mapping_error  = dma_direct_mapping_error,
 };
 EXPORT_SYMBOL(dma_direct_ops);
-- 
2.14.2
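
For readers following along, the policy above is just a lower-bound
comparison against the smallest zone the direct allocator can target.
A standalone C sketch, assuming the kernel's DMA_BIT_MASK() definition
and the default 24-bit ZONE_DMA:

#include <stdint.h>
#include <stdio.h>

#define DMA_BIT_MASK(n)	(((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))
#define ARCH_ZONE_DMA_BITS 24		/* assumed arch default */

static int direct_supported(uint64_t mask)
{
	/* masks narrower than ZONE_DMA can never be satisfied directly */
	return mask >= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS);
}

int main(void)
{
	printf("24-bit mask: %d\n", direct_supported(DMA_BIT_MASK(24)));
	printf("16-bit mask: %d\n", direct_supported(DMA_BIT_MASK(16)));
	return 0;
}

Devices with masks below that threshold have to go through an IOMMU or
a bounce-buffering dma_map_ops implementation instead.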



[PATCH 31/34] dma-direct: make dma_direct_{alloc, free} available to other implementations

2018-01-12 Thread Christoph Hellwig
So that they don't need to indirect through the operation vector.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Vladimir Murzin 
---
 arch/arm/mm/dma-mapping-nommu.c | 9 +++--
 include/linux/dma-direct.h  | 5 +
 lib/dma-direct.c| 6 +++---
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
index 4d8042521e89..619f24a42d09 100644
--- a/arch/arm/mm/dma-mapping-nommu.c
+++ b/arch/arm/mm/dma-mapping-nommu.c
@@ -11,7 +11,7 @@
 
 #include 
 #include 
-#include 
+#include 
 #include 
 
 #include 
@@ -39,7 +39,6 @@ static void *arm_nommu_dma_alloc(struct device *dev, size_t 
size,
 unsigned long attrs)
 
 {
-   const struct dma_map_ops *ops = &dma_direct_ops;
void *ret;
 
/*
@@ -48,7 +47,7 @@ static void *arm_nommu_dma_alloc(struct device *dev, size_t 
size,
 */
 
if (attrs & DMA_ATTR_NON_CONSISTENT)
-   return ops->alloc(dev, size, dma_handle, gfp, attrs);
+   return dma_direct_alloc(dev, size, dma_handle, gfp, attrs);
 
ret = dma_alloc_from_global_coherent(size, dma_handle);
 
@@ -70,10 +69,8 @@ static void arm_nommu_dma_free(struct device *dev, size_t 
size,
   void *cpu_addr, dma_addr_t dma_addr,
   unsigned long attrs)
 {
-   const struct dma_map_ops *ops = &dma_direct_ops;
-
if (attrs & DMA_ATTR_NON_CONSISTENT) {
-   ops->free(dev, size, cpu_addr, dma_addr, attrs);
+   dma_direct_free(dev, size, cpu_addr, dma_addr, attrs);
} else {
int ret = dma_release_from_global_coherent(get_order(size),
   cpu_addr);
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 10e924b7cba7..4788bf0bf683 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -38,4 +38,9 @@ static inline void dma_mark_clean(void *addr, size_t size)
 }
 #endif /* CONFIG_ARCH_HAS_DMA_MARK_CLEAN */
 
+void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+   gfp_t gfp, unsigned long attrs);
+void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
+   dma_addr_t dma_addr, unsigned long attrs);
+
 #endif /* _LINUX_DMA_DIRECT_H */
diff --git a/lib/dma-direct.c b/lib/dma-direct.c
index 4e43c2bb7f5f..784a68dfdbe3 100644
--- a/lib/dma-direct.c
+++ b/lib/dma-direct.c
@@ -40,8 +40,8 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t 
phys, size_t size)
return phys_to_dma(dev, phys) + size - 1 <= dev->coherent_dma_mask;
 }
 
-static void *dma_direct_alloc(struct device *dev, size_t size,
-   dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+   gfp_t gfp, unsigned long attrs)
 {
unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
int page_order = get_order(size);
@@ -84,7 +84,7 @@ static void *dma_direct_alloc(struct device *dev, size_t size,
return page_address(page);
 }
 
-static void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
+void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
dma_addr_t dma_addr, unsigned long attrs)
 {
unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
-- 
2.14.2



[PATCH 30/34] dma-direct: retry allocations using GFP_DMA for small masks

2018-01-12 Thread Christoph Hellwig
If an attempt to allocate memory succeeds but the result isn't inside
the supported DMA mask, retry the allocation with GFP_DMA set as a
last resort.

Based on the x86 code, but an off-by-one error in what is now
dma_coherent_ok has been fixed relative to the x86 code.

Signed-off-by: Christoph Hellwig 
---
 lib/dma-direct.c | 25 -
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/lib/dma-direct.c b/lib/dma-direct.c
index 8f76032ebc3c..4e43c2bb7f5f 100644
--- a/lib/dma-direct.c
+++ b/lib/dma-direct.c
@@ -35,6 +35,11 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t 
size,
return true;
 }
 
+static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
+{
+   return phys_to_dma(dev, phys) + size - 1 <= dev->coherent_dma_mask;
+}
+
 static void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
@@ -48,11 +53,29 @@ static void *dma_direct_alloc(struct device *dev, size_t 
size,
if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA))
gfp |= GFP_DMA32;
 
+again:
/* CMA can be used only in the context which permits sleeping */
-   if (gfpflags_allow_blocking(gfp))
+   if (gfpflags_allow_blocking(gfp)) {
page = dma_alloc_from_contiguous(dev, count, page_order, gfp);
+   if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+   dma_release_from_contiguous(dev, page, count);
+   page = NULL;
+   }
+   }
if (!page)
page = alloc_pages_node(dev_to_node(dev), gfp, page_order);
+
+   if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+   __free_pages(page, page_order);
+   page = NULL;
+
+   if (dev->coherent_dma_mask < DMA_BIT_MASK(32) &&
+   !(gfp & GFP_DMA)) {
+   gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
+   goto again;
+   }
+   }
+
if (!page)
return NULL;
 
-- 
2.14.2
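
The control flow of the retry is easier to see outside the kernel.
A userspace model of the "again:" loop, where fake_alloc() and the
mask values stand in for the real page allocator and device (nothing
below is a kernel API):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define GFP_DMA		0x1u
#define GFP_DMA32	0x2u

static uint64_t fake_alloc(unsigned int gfp)
{
	/* stand-in allocator: the DMA zone hands out low memory */
	return (gfp & GFP_DMA) ? 0x00100000ULL : 0x140000000ULL;
}

static int coherent_ok(uint64_t phys, uint64_t mask, size_t size)
{
	/* note the "- 1": checking the last byte avoids the off-by-one */
	return phys + size - 1 <= mask;
}

int main(void)
{
	uint64_t mask = (1ULL << 30) - 1;	/* hypothetical 30-bit device */
	unsigned int gfp = 0;
	uint64_t phys;

again:
	phys = fake_alloc(gfp);
	if (!coherent_ok(phys, mask, 4096)) {
		/* the kernel frees the unusable pages here */
		if (mask < ((1ULL << 32) - 1) && !(gfp & GFP_DMA)) {
			gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
			goto again;		/* one stricter retry */
		}
		puts("allocation failed");
		return 1;
	}
	printf("allocated at %#llx\n", (unsigned long long)phys);
	return 0;
}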



[PATCH 29/34] dma-direct: add support for allocation from ZONE_DMA and ZONE_DMA32

2018-01-12 Thread Christoph Hellwig
This allows us to dip into the lower-memory zones if they are available.
If one of the zones is not available the corresponding GFP_* flag
will evaluate to 0 so they won't change anything.  We provide an
arch tunable for those architectures that do not use GFP_DMA for
the lowest 24-bits, given that there are a few.

Roughly based on the x86 code.

Signed-off-by: Christoph Hellwig 
---
 lib/dma-direct.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/lib/dma-direct.c b/lib/dma-direct.c
index f04a424f91fa..8f76032ebc3c 100644
--- a/lib/dma-direct.c
+++ b/lib/dma-direct.c
@@ -12,6 +12,14 @@
 
 #define DIRECT_MAPPING_ERROR   0
 
+/*
+ * Most architectures use ZONE_DMA for the first 16 Megabytes, but
+ * some use it for entirely different regions:
+ */
+#ifndef ARCH_ZONE_DMA_BITS
+#define ARCH_ZONE_DMA_BITS 24
+#endif
+
 static bool
 check_addr(struct device *dev, dma_addr_t dma_addr, size_t size,
const char *caller)
@@ -34,6 +42,12 @@ static void *dma_direct_alloc(struct device *dev, size_t 
size,
int page_order = get_order(size);
struct page *page = NULL;
 
+   /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */
+   if (dev->coherent_dma_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
+   gfp |= GFP_DMA;
+   if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA))
+   gfp |= GFP_DMA32;
+
/* CMA can be used only in the context which permits sleeping */
if (gfpflags_allow_blocking(gfp))
page = dma_alloc_from_contiguous(dev, count, page_order, gfp);
-- 
2.14.2
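
The zone choice reduces to two threshold comparisons on the coherent
mask.  A self-contained sketch of the selection logic above (the flag
values are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

#define DMA_BIT_MASK(n)	(((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))
#define GFP_DMA		0x1u
#define GFP_DMA32	0x2u
#define ARCH_ZONE_DMA_BITS 24

static unsigned int pick_zone_flags(uint64_t coherent_mask)
{
	unsigned int gfp = 0;

	if (coherent_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
		gfp |= GFP_DMA;
	if (coherent_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA))
		gfp |= GFP_DMA32;
	return gfp;
}

int main(void)
{
	printf("24-bit mask -> %#x (GFP_DMA)\n", pick_zone_flags(DMA_BIT_MASK(24)));
	printf("32-bit mask -> %#x (GFP_DMA32)\n", pick_zone_flags(DMA_BIT_MASK(32)));
	printf("64-bit mask -> %#x (no constraint)\n", pick_zone_flags(DMA_BIT_MASK(64)));
	return 0;
}

On a config without the corresponding zone, the GFP_* flag evaluates
to 0 and the or-in is harmless, which is what makes this safe as
common code.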



[PATCH 28/34] dma-direct: use node local allocations for coherent memory

2018-01-12 Thread Christoph Hellwig
To preserve the x86 behavior.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Robin Murphy 
---
 lib/dma-direct.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/dma-direct.c b/lib/dma-direct.c
index a9ae98be7af3..f04a424f91fa 100644
--- a/lib/dma-direct.c
+++ b/lib/dma-direct.c
@@ -38,7 +38,7 @@ static void *dma_direct_alloc(struct device *dev, size_t size,
if (gfpflags_allow_blocking(gfp))
page = dma_alloc_from_contiguous(dev, count, page_order, gfp);
if (!page)
-   page = alloc_pages(gfp, page_order);
+   page = alloc_pages_node(dev_to_node(dev), gfp, page_order);
if (!page)
return NULL;
 
-- 
2.14.2



[PATCH 27/34] dma-direct: add support for CMA allocation

2018-01-12 Thread Christoph Hellwig
Try the CMA allocator for coherent allocations if supported.

Roughly modelled after the x86 code.

Signed-off-by: Christoph Hellwig 
---
 lib/dma-direct.c | 24 ++--
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/lib/dma-direct.c b/lib/dma-direct.c
index 32fd4d9e4c47..a9ae98be7af3 100644
--- a/lib/dma-direct.c
+++ b/lib/dma-direct.c
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #define DIRECT_MAPPING_ERROR   0
@@ -29,19 +30,30 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t 
size,
 static void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
-   void *ret;
+   unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+   int page_order = get_order(size);
+   struct page *page = NULL;
 
-   ret = (void *)__get_free_pages(gfp, get_order(size));
-   if (ret)
-   *dma_handle = phys_to_dma(dev, virt_to_phys(ret));
+   /* CMA can be used only in the context which permits sleeping */
+   if (gfpflags_allow_blocking(gfp))
+   page = dma_alloc_from_contiguous(dev, count, page_order, gfp);
+   if (!page)
+   page = alloc_pages(gfp, page_order);
+   if (!page)
+   return NULL;
 
-   return ret;
+   *dma_handle = phys_to_dma(dev, page_to_phys(page));
+   memset(page_address(page), 0, size);
+   return page_address(page);
 }
 
 static void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
dma_addr_t dma_addr, unsigned long attrs)
 {
-   free_pages((unsigned long)cpu_addr, get_order(size));
+   unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+   if (!dma_release_from_contiguous(dev, virt_to_page(cpu_addr), count))
+   free_pages((unsigned long)cpu_addr, get_order(size));
 }
 
 static dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
-- 
2.14.2
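
One subtlety worth spelling out is the size bookkeeping: the CMA
allocator takes an exact page count while the buddy allocator takes a
power-of-two order.  A worked example in plain C, with get_order()
re-implemented locally for illustration and 4 KiB pages assumed:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

static int get_order(unsigned long size)	/* local re-implementation */
{
	int order = 0;

	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	unsigned long size = 20000;	/* an odd-sized request */

	/* prints count=5 order=3: CMA gets exactly 5 pages, while the
	 * buddy path rounds up to 8; both cover the request */
	printf("count=%lu order=%d\n",
	       PAGE_ALIGN(size) >> PAGE_SHIFT, get_order(size));
	return 0;
}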



[PATCH 26/34] dma-direct: add dma address sanity checks

2018-01-12 Thread Christoph Hellwig
Roughly based on the x86 pci-nommu implementation.

Signed-off-by: Christoph Hellwig 
---
 lib/dma-direct.c | 31 ++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/lib/dma-direct.c b/lib/dma-direct.c
index 12ea9653781b..32fd4d9e4c47 100644
--- a/lib/dma-direct.c
+++ b/lib/dma-direct.c
@@ -9,6 +9,23 @@
 #include 
 #include 
 
+#define DIRECT_MAPPING_ERROR   0
+
+static bool
+check_addr(struct device *dev, dma_addr_t dma_addr, size_t size,
+   const char *caller)
+{
+   if (unlikely(dev && !dma_capable(dev, dma_addr, size))) {
+   if (*dev->dma_mask >= DMA_BIT_MASK(32)) {
+   dev_err(dev,
+   "%s: overflow %pad+%zu of device mask %llx\n",
+   caller, &dma_addr, size, *dev->dma_mask);
+   }
+   return false;
+   }
+   return true;
+}
+
 static void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
@@ -31,7 +48,11 @@ static dma_addr_t dma_direct_map_page(struct device *dev, 
struct page *page,
unsigned long offset, size_t size, enum dma_data_direction dir,
unsigned long attrs)
 {
-   return phys_to_dma(dev, page_to_phys(page)) + offset;
+   dma_addr_t dma_addr = phys_to_dma(dev, page_to_phys(page)) + offset;
+
+   if (!check_addr(dev, dma_addr, size, __func__))
+   return DIRECT_MAPPING_ERROR;
+   return dma_addr;
 }
 
 static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
@@ -44,16 +65,24 @@ static int dma_direct_map_sg(struct device *dev, struct 
scatterlist *sgl,
BUG_ON(!sg_page(sg));
 
sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg));
+   if (!check_addr(dev, sg_dma_address(sg), sg->length, __func__))
+   return 0;
sg_dma_len(sg) = sg->length;
}
 
return nents;
 }
 
+static int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+   return dma_addr == DIRECT_MAPPING_ERROR;
+}
+
 const struct dma_map_ops dma_direct_ops = {
.alloc  = dma_direct_alloc,
.free   = dma_direct_free,
.map_page   = dma_direct_map_page,
.map_sg = dma_direct_map_sg,
+   .mapping_error  = dma_direct_mapping_error,
 };
 EXPORT_SYMBOL(dma_direct_ops);
-- 
2.14.2
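
The sanity check is nothing more than "does the last byte of the
mapping fit under the device mask", with bus address 0 reserved as the
error sentinel.  A standalone model (mask handling simplified; this is
not the kernel's dma_capable()):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define DIRECT_MAPPING_ERROR	0

static int capable(uint64_t mask, uint64_t addr, size_t size)
{
	return addr + size - 1 <= mask;
}

static uint64_t map_page(uint64_t mask, uint64_t phys, size_t size)
{
	if (!capable(mask, phys, size))
		return DIRECT_MAPPING_ERROR;
	return phys;	/* direct mapping: bus address == physical */
}

int main(void)
{
	uint64_t mask = (1ULL << 32) - 1;	/* 32-bit device */

	/* last byte is 0xffffffff: fits */
	printf("ok  = %#llx\n",
	       (unsigned long long)map_page(mask, 0xfffff000ULL, 4096));
	/* starts above 4 GiB: rejected */
	printf("bad = %#llx\n",
	       (unsigned long long)map_page(mask, 0x100000000ULL, 4096));
	return 0;
}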



[PATCH 25/34] dma-direct: use phys_to_dma

2018-01-12 Thread Christoph Hellwig
This means that whatever linear remapping scheme the architecture
provides is used in the generic dma_direct ops.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Vladimir Murzin 
---
 lib/dma-direct.c | 18 +++---
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/lib/dma-direct.c b/lib/dma-direct.c
index 0ec3262a3148..12ea9653781b 100644
--- a/lib/dma-direct.c
+++ b/lib/dma-direct.c
@@ -1,12 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * lib/dma-noop.c
- *
- * DMA operations that map to physical addresses without flushing memory.
+ * DMA operations that map physical memory directly without using an IOMMU or
+ * flushing caches.
  */
 #include 
 #include 
-#include 
+#include 
 #include 
 #include 
 
@@ -17,7 +16,7 @@ static void *dma_direct_alloc(struct device *dev, size_t size,
 
ret = (void *)__get_free_pages(gfp, get_order(size));
if (ret)
-   *dma_handle = virt_to_phys(ret) - PFN_PHYS(dev->dma_pfn_offset);
+   *dma_handle = phys_to_dma(dev, virt_to_phys(ret));
 
return ret;
 }
@@ -32,7 +31,7 @@ static dma_addr_t dma_direct_map_page(struct device *dev, 
struct page *page,
unsigned long offset, size_t size, enum dma_data_direction dir,
unsigned long attrs)
 {
-   return page_to_phys(page) + offset - PFN_PHYS(dev->dma_pfn_offset);
+   return phys_to_dma(dev, page_to_phys(page)) + offset;
 }
 
 static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
@@ -42,12 +41,9 @@ static int dma_direct_map_sg(struct device *dev, struct 
scatterlist *sgl,
struct scatterlist *sg;
 
for_each_sg(sgl, sg, nents, i) {
-   dma_addr_t offset = PFN_PHYS(dev->dma_pfn_offset);
-   void *va;
-
BUG_ON(!sg_page(sg));
-   va = sg_virt(sg);
-   sg_dma_address(sg) = (dma_addr_t)virt_to_phys(va) - offset;
+
+   sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg));
sg_dma_len(sg) = sg->length;
}
 
-- 
2.14.2
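
What phys_to_dma() buys over the removed open-coded subtraction is
that the arch decides the translation; for the common linear case it
is just a constant pfn offset.  A sketch under that assumption (the
offset value below is invented):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

static const uint64_t dma_pfn_offset = 0x80000;	/* assumed: bus = phys - 2 GiB */

static uint64_t phys_to_dma(uint64_t phys)
{
	return phys - (dma_pfn_offset << PAGE_SHIFT);
}

static uint64_t dma_to_phys(uint64_t dma)
{
	return dma + (dma_pfn_offset << PAGE_SHIFT);
}

int main(void)
{
	uint64_t phys = 0x90000000ULL;
	uint64_t dma = phys_to_dma(phys);

	printf("phys %#llx -> dma %#llx -> phys %#llx\n",
	       (unsigned long long)phys, (unsigned long long)dma,
	       (unsigned long long)dma_to_phys(dma));
	return 0;
}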



[PATCH 24/34] dma-direct: rename dma_noop to dma_direct

2018-01-12 Thread Christoph Hellwig
The trivial direct mapping implementation already does a virtual to
physical translation which isn't strictly a noop, and will soon learn
to do non-direct but linear physical to dma translations through the
device offset and a few small tricks.  Rename it to a better fitting
name.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Vladimir Murzin 
---
 MAINTAINERS|  2 +-
 arch/arm/Kconfig   |  2 +-
 arch/arm/include/asm/dma-mapping.h |  2 +-
 arch/arm/mm/dma-mapping-nommu.c|  8 
 arch/m32r/Kconfig  |  2 +-
 arch/riscv/Kconfig |  2 +-
 arch/s390/Kconfig  |  2 +-
 include/asm-generic/dma-mapping.h  |  2 +-
 include/linux/dma-mapping.h|  2 +-
 lib/Kconfig|  2 +-
 lib/Makefile   |  2 +-
 lib/{dma-noop.c => dma-direct.c}   | 35 +++
 12 files changed, 29 insertions(+), 34 deletions(-)
 rename lib/{dma-noop.c => dma-direct.c} (52%)

diff --git a/MAINTAINERS b/MAINTAINERS
index 234e642e7149..2d54e636d625 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4334,7 +4334,7 @@ T: git git://git.infradead.org/users/hch/dma-mapping.git
 W: http://git.infradead.org/users/hch/dma-mapping.git
 S: Supported
 F: lib/dma-debug.c
-F: lib/dma-noop.c
+F: lib/dma-direct.c
 F: lib/dma-virt.c
 F: drivers/base/dma-mapping.c
 F: drivers/base/dma-coherent.c
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 00d889a37965..430a0aa710d6 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -25,7 +25,7 @@ config ARM
select CLONE_BACKWARDS
select CPU_PM if (SUSPEND || CPU_IDLE)
select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS
-   select DMA_NOOP_OPS if !MMU
+   select DMA_DIRECT_OPS if !MMU
select EDAC_SUPPORT
select EDAC_ATOMIC_SCRUB
select GENERIC_ALLOCATOR
diff --git a/arch/arm/include/asm/dma-mapping.h 
b/arch/arm/include/asm/dma-mapping.h
index e5d9020c9ee1..8436f6ade57d 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -18,7 +18,7 @@ extern const struct dma_map_ops arm_coherent_dma_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-   return IS_ENABLED(CONFIG_MMU) ? &arm_dma_ops : &dma_noop_ops;
+   return IS_ENABLED(CONFIG_MMU) ? &arm_dma_ops : &dma_direct_ops;
 }
 
 #ifdef __arch_page_to_dma
diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
index 6db5fc26d154..4d8042521e89 100644
--- a/arch/arm/mm/dma-mapping-nommu.c
+++ b/arch/arm/mm/dma-mapping-nommu.c
@@ -22,7 +22,7 @@
 #include "dma.h"
 
 /*
- *  dma_noop_ops is used if
+ *  dma_direct_ops is used if
  *   - MMU/MPU is off
  *   - cpu is v7m w/o cache support
  *   - device is coherent
@@ -39,7 +39,7 @@ static void *arm_nommu_dma_alloc(struct device *dev, size_t 
size,
 unsigned long attrs)
 
 {
-   const struct dma_map_ops *ops = &dma_noop_ops;
+   const struct dma_map_ops *ops = &dma_direct_ops;
void *ret;
 
/*
@@ -70,7 +70,7 @@ static void arm_nommu_dma_free(struct device *dev, size_t 
size,
   void *cpu_addr, dma_addr_t dma_addr,
   unsigned long attrs)
 {
-   const struct dma_map_ops *ops = &dma_noop_ops;
+   const struct dma_map_ops *ops = &dma_direct_ops;
 
if (attrs & DMA_ATTR_NON_CONSISTENT) {
ops->free(dev, size, cpu_addr, dma_addr, attrs);
@@ -213,7 +213,7 @@ EXPORT_SYMBOL(arm_nommu_dma_ops);
 
 static const struct dma_map_ops *arm_nommu_get_dma_map_ops(bool coherent)
 {
-   return coherent ? &dma_noop_ops : &arm_nommu_dma_ops;
+   return coherent ? &dma_direct_ops : &arm_nommu_dma_ops;
 }
 
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index 498398d915c1..dd84ee194579 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -19,7 +19,7 @@ config M32R
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
select CPU_NO_EFFICIENT_FFS
-   select DMA_NOOP_OPS
+   select DMA_DIRECT_OPS
select ARCH_NO_COHERENT_DMA_MMAP if !MMU
 
 config SBUS
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 2c6adf12713a..865e14f50c14 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -83,7 +83,7 @@ config PGTABLE_LEVELS
 config HAVE_KPROBES
def_bool n
 
-config DMA_NOOP_OPS
+config DMA_DIRECT_OPS
def_bool y
 
 menu "Platform type"
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 829c67986db7..9376637229c9 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -140,7 +140,7 @@ config S390
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_API_DEBUG
select HAVE_DMA_CONTIGUOUS
-   select DMA_NOOP_OPS
+   select DMA_DIRECT_OPS
select HAVE_DYNAMIC_FTRACE
  

[PATCH 23/34] dma-mapping: provide a generic asm/dma-mapping.h

2018-01-12 Thread Christoph Hellwig
For architectures that just use the generic dma_noop_ops we can provide
a generic version of dma-mapping.h.

Signed-off-by: Christoph Hellwig 
---
 MAINTAINERS  |  1 +
 arch/m32r/include/asm/Kbuild |  1 +
 arch/m32r/include/asm/dma-mapping.h  | 17 -
 arch/riscv/include/asm/Kbuild|  1 +
 arch/riscv/include/asm/dma-mapping.h | 30 --
 arch/s390/include/asm/Kbuild |  1 +
 arch/s390/include/asm/dma-mapping.h  | 17 -
 include/asm-generic/dma-mapping.h| 10 ++
 8 files changed, 14 insertions(+), 64 deletions(-)
 delete mode 100644 arch/m32r/include/asm/dma-mapping.h
 delete mode 100644 arch/riscv/include/asm/dma-mapping.h
 delete mode 100644 arch/s390/include/asm/dma-mapping.h
 create mode 100644 include/asm-generic/dma-mapping.h

diff --git a/MAINTAINERS b/MAINTAINERS
index d2cfdcce1db5..234e642e7149 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4338,6 +4338,7 @@ F:lib/dma-noop.c
 F: lib/dma-virt.c
 F: drivers/base/dma-mapping.c
 F: drivers/base/dma-coherent.c
+F: include/asm-generic/dma-mapping.h
 F: include/linux/dma-direct.h
 F: include/linux/dma-mapping.h
 
diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild
index 7e11b125c35e..ca83fda8177b 100644
--- a/arch/m32r/include/asm/Kbuild
+++ b/arch/m32r/include/asm/Kbuild
@@ -1,5 +1,6 @@
 generic-y += clkdev.h
 generic-y += current.h
+generic-y += dma-mapping.h
 generic-y += exec.h
 generic-y += extable.h
 generic-y += irq_work.h
diff --git a/arch/m32r/include/asm/dma-mapping.h 
b/arch/m32r/include/asm/dma-mapping.h
deleted file mode 100644
index 8967fb659691..
--- a/arch/m32r/include/asm/dma-mapping.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_M32R_DMA_MAPPING_H
-#define _ASM_M32R_DMA_MAPPING_H
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
-{
-   return &dma_noop_ops;
-}
-
-#endif /* _ASM_M32R_DMA_MAPPING_H */
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index 970460a0b492..197460ccbf21 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -7,6 +7,7 @@ generic-y += device.h
 generic-y += div64.h
 generic-y += dma.h
 generic-y += dma-contiguous.h
+generic-y += dma-mapping.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
diff --git a/arch/riscv/include/asm/dma-mapping.h 
b/arch/riscv/include/asm/dma-mapping.h
deleted file mode 100644
index 73849e2cc761..
--- a/arch/riscv/include/asm/dma-mapping.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (C) 2003-2004 Hewlett-Packard Co
- * David Mosberger-Tang 
- * Copyright (C) 2012 ARM Ltd.
- * Copyright (C) 2016 SiFive, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see .
- */
-#ifndef __ASM_RISCV_DMA_MAPPING_H
-#define __ASM_RISCV_DMA_MAPPING_H
-
-/* Use ops->dma_mapping_error (if it exists) or assume success */
-// #undef DMA_ERROR_CODE
-
-static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
-{
-   return &dma_noop_ops;
-}
-
-#endif /* __ASM_RISCV_DMA_MAPPING_H */
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 048450869328..dade72be127b 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -4,6 +4,7 @@ generic-y += cacheflush.h
 generic-y += clkdev.h
 generic-y += device.h
 generic-y += dma-contiguous.h
+generic-y += dma-mapping.h
 generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += export.h
diff --git a/arch/s390/include/asm/dma-mapping.h 
b/arch/s390/include/asm/dma-mapping.h
deleted file mode 100644
index bdc2455483f6..
--- a/arch/s390/include/asm/dma-mapping.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_S390_DMA_MAPPING_H
-#define _ASM_S390_DMA_MAPPING_H
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
-{
-   return &dma_noop_ops;
-}
-
-#endif /* _ASM_S390_DMA_MAPPING_H */
diff --git a/include/asm-generic/dma-mapping.h 
b/include/asm-generic/dma-mapping.h
new file mode 100644
index ..164031531d85
--- /dev/null
+++ b/include/asm-generic/dma-mapping.h
@@ -0,0 
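
The new-file hunk is cut off above; a generic asm/dma-mapping.h of
this kind plausibly reduces to returning the one shared ops table (a
sketch, not the verbatim patch contents):

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_DMA_MAPPING_H
#define _ASM_GENERIC_DMA_MAPPING_H

static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
{
	return &dma_noop_ops;
}

#endif /* _ASM_GENERIC_DMA_MAPPING_H */

Each architecture then only needs the one-line generic-y addition in
its Kbuild, as the m32r, riscv and s390 hunks above show.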

[PATCH 22/34] dma-mapping: add an arch_dma_supported hook

2018-01-12 Thread Christoph Hellwig
To implement the x86 forbid_dac and iommu_sac_force we want an arch hook
so that it can apply the global options across all dma_map_ops
implementations.

Signed-off-by: Christoph Hellwig 
---
 arch/x86/include/asm/dma-mapping.h |  3 +++
 arch/x86/kernel/pci-dma.c  | 19 ---
 include/linux/dma-mapping.h| 11 +++
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/dma-mapping.h 
b/arch/x86/include/asm/dma-mapping.h
index dfdc9357a349..6277c83c0eb1 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -30,6 +30,9 @@ static inline const struct dma_map_ops 
*get_arch_dma_ops(struct bus_type *bus)
return dma_ops;
 }
 
+int arch_dma_supported(struct device *dev, u64 mask);
+#define arch_dma_supported arch_dma_supported
+
 bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
 #define arch_dma_alloc_attrs arch_dma_alloc_attrs
 
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 61a8f1cb3829..df7ab02f959f 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -215,7 +215,7 @@ static __init int iommu_setup(char *p)
 }
 early_param("iommu", iommu_setup);
 
-int x86_dma_supported(struct device *dev, u64 mask)
+int arch_dma_supported(struct device *dev, u64 mask)
 {
 #ifdef CONFIG_PCI
if (mask > 0xffffffff && forbid_dac > 0) {
@@ -224,12 +224,6 @@ int x86_dma_supported(struct device *dev, u64 mask)
}
 #endif
 
-   /* Copied from i386. Doesn't make much sense, because it will
-  only work for pci_alloc_coherent.
-  The caller just has to use GFP_DMA in this case. */
-   if (mask < DMA_BIT_MASK(24))
-   return 0;
-
/* Tell the device to use SAC when IOMMU force is on.  This
   allows the driver to use cheaper accesses in some cases.
 
@@ -249,6 +243,17 @@ int x86_dma_supported(struct device *dev, u64 mask)
 
return 1;
 }
+EXPORT_SYMBOL(arch_dma_supported);
+
+int x86_dma_supported(struct device *dev, u64 mask)
+{
+   /* Copied from i386. Doesn't make much sense, because it will
+  only work for pci_alloc_coherent.
+  The caller just has to use GFP_DMA in this case. */
+   if (mask < DMA_BIT_MASK(24))
+   return 0;
+   return 1;
+}
 
 static int __init pci_iommu_init(void)
 {
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 88bcb1a8211d..d67742dad904 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -576,6 +576,14 @@ static inline int dma_mapping_error(struct device *dev, 
dma_addr_t dma_addr)
return 0;
 }
 
+/*
+ * This is a hack for the legacy x86 forbid_dac and iommu_sac_force. Please
+ * don't use this in new code.
+ */
+#ifndef arch_dma_supported
+#define arch_dma_supported(dev, mask)  (1)
+#endif
+
 static inline void dma_check_mask(struct device *dev, u64 mask)
 {
if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
@@ -588,6 +596,9 @@ static inline int dma_supported(struct device *dev, u64 
mask)
 
if (!ops)
return 0;
+   if (!arch_dma_supported(dev, mask))
+   return 0;
+
if (!ops->dma_supported)
return 1;
return ops->dma_supported(dev, mask);
-- 
2.14.2
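
The #ifndef/#define dance is the standard arch-override pattern, and
it is why dma_supported() needs no arch #ifdef of its own.  A
compilable userspace model (define ARCH_HAS_HOOK to play the role of
the x86 header; the mask policy is illustrative):

#include <stdio.h>

#ifdef ARCH_HAS_HOOK
static int arch_dma_supported(unsigned long long mask)
{
	return mask > 0xffffffffULL ? 0 : 1;	/* e.g. a forbid_dac policy */
}
#define arch_dma_supported arch_dma_supported	/* marks the override */
#endif

#ifndef arch_dma_supported
#define arch_dma_supported(mask) (1)		/* default: no veto */
#endif

static int dma_supported(unsigned long long mask)
{
	if (!arch_dma_supported(mask))
		return 0;
	return 1;	/* the per-ops ->dma_supported check would follow */
}

int main(void)
{
	printf("64-bit mask supported: %d\n", dma_supported(~0ULL));
	return 0;
}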



[PATCH 21/34] dma-mapping: clear harmful GFP_* flags in common code

2018-01-12 Thread Christoph Hellwig
Lift the code from x86 so that we behave consistently.  In the future we
should probably warn if any of these is set.

Signed-off-by: Christoph Hellwig 
Acked-by: Jesper Nilsson 
Acked-by: Geert Uytterhoeven  [m68k]
---
 arch/cris/arch-v32/drivers/pci/dma.c  | 3 ---
 arch/h8300/kernel/dma.c   | 3 ---
 arch/m68k/kernel/dma.c| 2 --
 arch/mips/cavium-octeon/dma-octeon.c  | 3 ---
 arch/mips/loongson64/common/dma-swiotlb.c | 3 ---
 arch/mips/mm/dma-default.c| 3 ---
 arch/mips/netlogic/common/nlm-dma.c   | 3 ---
 arch/mn10300/mm/dma-alloc.c   | 3 ---
 arch/nios2/mm/dma-mapping.c   | 3 ---
 arch/powerpc/kernel/dma.c | 3 ---
 arch/x86/kernel/pci-dma.c | 2 --
 include/linux/dma-mapping.h   | 7 +++
 12 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/arch/cris/arch-v32/drivers/pci/dma.c 
b/arch/cris/arch-v32/drivers/pci/dma.c
index dbbd3816cc0b..8c3802244ef3 100644
--- a/arch/cris/arch-v32/drivers/pci/dma.c
+++ b/arch/cris/arch-v32/drivers/pci/dma.c
@@ -22,9 +22,6 @@ static void *v32_dma_alloc(struct device *dev, size_t size,
 {
void *ret;
 
-   /* ignore region specifiers */
-   gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
-
if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
gfp |= GFP_DMA;
 
diff --git a/arch/h8300/kernel/dma.c b/arch/h8300/kernel/dma.c
index 225dd0a188dc..d44ba5db4ac3 100644
--- a/arch/h8300/kernel/dma.c
+++ b/arch/h8300/kernel/dma.c
@@ -16,9 +16,6 @@ static void *dma_alloc(struct device *dev, size_t size,
 {
void *ret;
 
-   /* ignore region specifiers */
-   gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
-
if (dev == NULL || (*dev->dma_mask < 0xffffffff))
gfp |= GFP_DMA;
ret = (void *)__get_free_pages(gfp, get_order(size));
diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c
index 87ef73a93856..c01b9b8f97bf 100644
--- a/arch/m68k/kernel/dma.c
+++ b/arch/m68k/kernel/dma.c
@@ -76,8 +76,6 @@ static void *m68k_dma_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
void *ret;
-   /* ignore region specifiers */
-   gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
 
if (dev == NULL || (*dev->dma_mask < 0xffffffff))
gfp |= GFP_DMA;
diff --git a/arch/mips/cavium-octeon/dma-octeon.c 
b/arch/mips/cavium-octeon/dma-octeon.c
index c64bd87f0b6e..5baf79fce643 100644
--- a/arch/mips/cavium-octeon/dma-octeon.c
+++ b/arch/mips/cavium-octeon/dma-octeon.c
@@ -161,9 +161,6 @@ static void *octeon_dma_alloc_coherent(struct device *dev, 
size_t size,
 {
void *ret;
 
-   /* ignore region specifiers */
-   gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
-
if (IS_ENABLED(CONFIG_ZONE_DMA) && dev == NULL)
gfp |= __GFP_DMA;
else if (IS_ENABLED(CONFIG_ZONE_DMA) &&
diff --git a/arch/mips/loongson64/common/dma-swiotlb.c 
b/arch/mips/loongson64/common/dma-swiotlb.c
index ef07740cee61..15388c24a504 100644
--- a/arch/mips/loongson64/common/dma-swiotlb.c
+++ b/arch/mips/loongson64/common/dma-swiotlb.c
@@ -15,9 +15,6 @@ static void *loongson_dma_alloc_coherent(struct device *dev, 
size_t size,
 {
void *ret;
 
-   /* ignore region specifiers */
-   gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
-
if ((IS_ENABLED(CONFIG_ISA) && dev == NULL) ||
(IS_ENABLED(CONFIG_ZONE_DMA) &&
 dev->coherent_dma_mask < DMA_BIT_MASK(32)))
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index e3e94d05f0fd..237532e89919 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -93,9 +93,6 @@ static gfp_t massage_gfp_flags(const struct device *dev, 
gfp_t gfp)
 {
gfp_t dma_flag;
 
-   /* ignore region specifiers */
-   gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
-
 #ifdef CONFIG_ISA
if (dev == NULL)
dma_flag = __GFP_DMA;
diff --git a/arch/mips/netlogic/common/nlm-dma.c 
b/arch/mips/netlogic/common/nlm-dma.c
index 0ec9d9da6d51..49c975b6aa28 100644
--- a/arch/mips/netlogic/common/nlm-dma.c
+++ b/arch/mips/netlogic/common/nlm-dma.c
@@ -47,9 +47,6 @@ static char *nlm_swiotlb;
 static void *nlm_dma_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
-   /* ignore region specifiers */
-   gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
-
 #ifdef CONFIG_ZONE_DMA32
if (dev->coherent_dma_mask <= DMA_BIT_MASK(32))
gfp |= __GFP_DMA32;
diff --git a/arch/mn10300/mm/dma-alloc.c b/arch/mn10300/mm/dma-alloc.c
index 86108d2496b3..e3910d4db102 100644
--- a/arch/mn10300/mm/dma-alloc.c
+++ b/arch/mn10300/mm/dma-alloc.c
@@ -37,9 +37,6 @@ static void *mn10300_dma_alloc(struct device *dev, size_t 
size,
   
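
Since the patch text is truncated above, the common-code half is worth
restating: every removed per-arch hunk was doing the same masking,
which now happens once before the allocator sees the flags.  A sketch
of the laundering (the flag values are illustrative, not the
kernel's):

#include <stdio.h>

#define __GFP_DMA	0x01u
#define __GFP_DMA32	0x02u
#define __GFP_HIGHMEM	0x04u
#define GFP_KERNEL	0x10u

static unsigned int launder_gfp(unsigned int gfp)
{
	/* the DMA layer picks the zone itself; caller hints are harmful */
	return gfp & ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
}

int main(void)
{
	unsigned int gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_DMA32;

	printf("before %#x after %#x\n", gfp, launder_gfp(gfp));
	return 0;
}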

[PATCH 20/34] dma-mapping: warn when there is no coherent_dma_mask

2018-01-12 Thread Christoph Hellwig
These days all devices should have a DMA coherent mask, and most dma_ops
implementations rely on that fact.  But just to be sure add an assert to
ring the warning bell if that is not the case.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Vladimir Murzin 
---
 include/linux/dma-mapping.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index d84951865be7..9f28b2fa329e 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -513,6 +513,7 @@ static inline void *dma_alloc_attrs(struct device *dev, 
size_t size,
void *cpu_addr;
 
BUG_ON(!ops);
+   WARN_ON_ONCE(!dev->coherent_dma_mask);
 
if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
return cpu_addr;
-- 
2.14.2



[PATCH 19/34] s390: move s390_pci_dma_ops to asm/pci_dma.h

2018-01-12 Thread Christoph Hellwig
This is not needed in drivers, so move it to a private header.

Signed-off-by: Christoph Hellwig 
---
 arch/s390/include/asm/dma-mapping.h | 2 --
 arch/s390/include/asm/pci_dma.h | 3 +++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/s390/include/asm/dma-mapping.h 
b/arch/s390/include/asm/dma-mapping.h
index 2ec7240c1ada..bdc2455483f6 100644
--- a/arch/s390/include/asm/dma-mapping.h
+++ b/arch/s390/include/asm/dma-mapping.h
@@ -9,8 +9,6 @@
 #include 
 #include 
 
-extern const struct dma_map_ops s390_pci_dma_ops;
-
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
return &dma_noop_ops;
diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h
index e8d9161fa17a..419fac7a62c0 100644
--- a/arch/s390/include/asm/pci_dma.h
+++ b/arch/s390/include/asm/pci_dma.h
@@ -201,4 +201,7 @@ void dma_cleanup_tables(unsigned long *);
 unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr);
 void dma_update_cpu_trans(unsigned long *entry, void *page_addr, int flags);
 
+extern const struct dma_map_ops s390_pci_dma_ops;
+
+
 #endif
-- 
2.14.2



[PATCH 18/34] microblaze: remove the dead !NOT_COHERENT_CACHE dma code

2018-01-12 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 arch/microblaze/kernel/dma.c | 28 
 1 file changed, 28 deletions(-)

diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c
index b45d8f8967af..c91e8cef98dd 100644
--- a/arch/microblaze/kernel/dma.c
+++ b/arch/microblaze/kernel/dma.c
@@ -15,42 +15,18 @@
 #include 
 #include 
 
-#define NOT_COHERENT_CACHE
-
 static void *dma_nommu_alloc_coherent(struct device *dev, size_t size,
   dma_addr_t *dma_handle, gfp_t flag,
   unsigned long attrs)
 {
-#ifdef NOT_COHERENT_CACHE
return consistent_alloc(flag, size, dma_handle);
-#else
-   void *ret;
-   struct page *page;
-   int node = dev_to_node(dev);
-
-   /* ignore region specifiers */
-   flag  &= ~(__GFP_HIGHMEM);
-
-   page = alloc_pages_node(node, flag, get_order(size));
-   if (page == NULL)
-   return NULL;
-   ret = page_address(page);
-   memset(ret, 0, size);
-   *dma_handle = virt_to_phys(ret);
-
-   return ret;
-#endif
 }
 
 static void dma_nommu_free_coherent(struct device *dev, size_t size,
 void *vaddr, dma_addr_t dma_handle,
 unsigned long attrs)
 {
-#ifdef NOT_COHERENT_CACHE
consistent_free(size, vaddr);
-#else
-   free_pages((unsigned long)vaddr, get_order(size));
-#endif
 }
 
 static inline void __dma_sync(unsigned long paddr,
@@ -186,12 +162,8 @@ int dma_nommu_mmap_coherent(struct device *dev, struct 
vm_area_struct *vma,
if (off >= count || user_count > (count - off))
return -ENXIO;
 
-#ifdef NOT_COHERENT_CACHE
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
pfn = consistent_virt_to_pfn(cpu_addr);
-#else
-   pfn = virt_to_pfn(cpu_addr);
-#endif
return remap_pfn_range(vma, vma->vm_start, pfn + off,
   vma->vm_end - vma->vm_start, vma->vm_page_prot);
 #else
-- 
2.14.2



[PATCH 17/34] microblaze: remove dma_nommu_dma_supported

2018-01-12 Thread Christoph Hellwig
Always returning 1 is the same behavior as not supplying a method at all.

Signed-off-by: Christoph Hellwig 
---
 arch/microblaze/kernel/dma.c | 6 --
 arch/parisc/kernel/pci-dma.c | 7 ---
 2 files changed, 13 deletions(-)

diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c
index 450803e5731a..b45d8f8967af 100644
--- a/arch/microblaze/kernel/dma.c
+++ b/arch/microblaze/kernel/dma.c
@@ -89,11 +89,6 @@ static int dma_nommu_map_sg(struct device *dev, struct 
scatterlist *sgl,
return nents;
 }
 
-static int dma_nommu_dma_supported(struct device *dev, u64 mask)
-{
-   return 1;
-}
-
 static inline dma_addr_t dma_nommu_map_page(struct device *dev,
 struct page *page,
 unsigned long offset,
@@ -209,7 +204,6 @@ const struct dma_map_ops dma_nommu_ops = {
.free   = dma_nommu_free_coherent,
.mmap   = dma_nommu_mmap_coherent,
.map_sg = dma_nommu_map_sg,
-   .dma_supported  = dma_nommu_dma_supported,
.map_page   = dma_nommu_map_page,
.unmap_page = dma_nommu_unmap_page,
.sync_single_for_cpu= dma_nommu_sync_single_for_cpu,
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index c0dfd892f70c..91bc0cac03a1 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -75,11 +75,6 @@ void dump_resmap(void)
 static inline void dump_resmap(void) {;}
 #endif
 
-static int pa11_dma_supported( struct device *dev, u64 mask)
-{
-   return 1;
-}
-
 static inline int map_pte_uncached(pte_t * pte,
unsigned long vaddr,
unsigned long size, unsigned long *paddr_ptr)
@@ -579,7 +574,6 @@ static void pa11_dma_cache_sync(struct device *dev, void 
*vaddr, size_t size,
 }
 
 const struct dma_map_ops pcxl_dma_ops = {
-   .dma_supported =pa11_dma_supported,
.alloc =pa11_dma_alloc,
.free = pa11_dma_free,
.map_page = pa11_dma_map_page,
@@ -616,7 +610,6 @@ static void pcx_dma_free(struct device *dev, size_t size, 
void *vaddr,
 }
 
 const struct dma_map_ops pcx_dma_ops = {
-   .dma_supported =pa11_dma_supported,
.alloc =pcx_dma_alloc,
.free = pcx_dma_free,
.map_page = pa11_dma_map_page,
-- 
2.14.2



[PATCH 16/34] microblaze: rename dma_direct to dma_nommu

2018-01-12 Thread Christoph Hellwig
This frees the dma_direct_* namespace for a generic implementation.

Signed-off-by: Christoph Hellwig 
---
 arch/microblaze/include/asm/dma-mapping.h |  4 +--
 arch/microblaze/kernel/dma.c  | 48 +++
 2 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/arch/microblaze/include/asm/dma-mapping.h 
b/arch/microblaze/include/asm/dma-mapping.h
index 6b9ea39405b8..add50c1373bf 100644
--- a/arch/microblaze/include/asm/dma-mapping.h
+++ b/arch/microblaze/include/asm/dma-mapping.h
@@ -18,11 +18,11 @@
 /*
  * Available generic sets of operations
  */
-extern const struct dma_map_ops dma_direct_ops;
+extern const struct dma_map_ops dma_nommu_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-   return &dma_direct_ops;
+   return &dma_nommu_ops;
 }
 
 #endif /* _ASM_MICROBLAZE_DMA_MAPPING_H */
diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c
index 990bf9ea0ec6..450803e5731a 100644
--- a/arch/microblaze/kernel/dma.c
+++ b/arch/microblaze/kernel/dma.c
@@ -17,7 +17,7 @@
 
 #define NOT_COHERENT_CACHE
 
-static void *dma_direct_alloc_coherent(struct device *dev, size_t size,
+static void *dma_nommu_alloc_coherent(struct device *dev, size_t size,
   dma_addr_t *dma_handle, gfp_t flag,
   unsigned long attrs)
 {
@@ -42,7 +42,7 @@ static void *dma_direct_alloc_coherent(struct device *dev, 
size_t size,
 #endif
 }
 
-static void dma_direct_free_coherent(struct device *dev, size_t size,
+static void dma_nommu_free_coherent(struct device *dev, size_t size,
 void *vaddr, dma_addr_t dma_handle,
 unsigned long attrs)
 {
@@ -69,7 +69,7 @@ static inline void __dma_sync(unsigned long paddr,
}
 }
 
-static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
+static int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl,
 int nents, enum dma_data_direction direction,
 unsigned long attrs)
 {
@@ -89,12 +89,12 @@ static int dma_direct_map_sg(struct device *dev, struct 
scatterlist *sgl,
return nents;
 }
 
-static int dma_direct_dma_supported(struct device *dev, u64 mask)
+static int dma_nommu_dma_supported(struct device *dev, u64 mask)
 {
return 1;
 }
 
-static inline dma_addr_t dma_direct_map_page(struct device *dev,
+static inline dma_addr_t dma_nommu_map_page(struct device *dev,
 struct page *page,
 unsigned long offset,
 size_t size,
@@ -106,7 +106,7 @@ static inline dma_addr_t dma_direct_map_page(struct device 
*dev,
return page_to_phys(page) + offset;
 }
 
-static inline void dma_direct_unmap_page(struct device *dev,
+static inline void dma_nommu_unmap_page(struct device *dev,
 dma_addr_t dma_address,
 size_t size,
 enum dma_data_direction direction,
@@ -122,7 +122,7 @@ static inline void dma_direct_unmap_page(struct device *dev,
 }
 
 static inline void
-dma_direct_sync_single_for_cpu(struct device *dev,
+dma_nommu_sync_single_for_cpu(struct device *dev,
   dma_addr_t dma_handle, size_t size,
   enum dma_data_direction direction)
 {
@@ -136,7 +136,7 @@ dma_direct_sync_single_for_cpu(struct device *dev,
 }
 
 static inline void
-dma_direct_sync_single_for_device(struct device *dev,
+dma_nommu_sync_single_for_device(struct device *dev,
  dma_addr_t dma_handle, size_t size,
  enum dma_data_direction direction)
 {
@@ -150,7 +150,7 @@ dma_direct_sync_single_for_device(struct device *dev,
 }
 
 static inline void
-dma_direct_sync_sg_for_cpu(struct device *dev,
+dma_nommu_sync_sg_for_cpu(struct device *dev,
   struct scatterlist *sgl, int nents,
   enum dma_data_direction direction)
 {
@@ -164,7 +164,7 @@ dma_direct_sync_sg_for_cpu(struct device *dev,
 }
 
 static inline void
-dma_direct_sync_sg_for_device(struct device *dev,
+dma_nommu_sync_sg_for_device(struct device *dev,
  struct scatterlist *sgl, int nents,
  enum dma_data_direction direction)
 {
@@ -178,7 +178,7 @@ dma_direct_sync_sg_for_device(struct device *dev,
 }
 
 static
-int dma_direct_mmap_coherent(struct device *dev, struct vm_area_struct *vma,
+int dma_nommu_mmap_coherent(struct device *dev, struct vm_area_struct *vma,
 void *cpu_addr, dma_addr_t handle, size_t size,
 unsigned long attrs)
 {
@@ -204,20 +204,20 @@ int dma_direct_mmap_coherent(struct device *dev, struct 

[PATCH 15/34] powerpc: rename dma_direct_ to dma_nommu_

2018-01-12 Thread Christoph Hellwig
We want to use the dma_direct_ namespace for a generic implementation,
so rename the powerpc ops to the second-best choice: dma_nommu_.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/dma-mapping.h|  8 ++--
 arch/powerpc/kernel/dma-iommu.c   |  2 +-
 arch/powerpc/kernel/dma-swiotlb.c |  6 +--
 arch/powerpc/kernel/dma.c | 68 +++
 arch/powerpc/kernel/pci-common.c  |  2 +-
 arch/powerpc/kernel/setup-common.c|  2 +-
 arch/powerpc/platforms/cell/iommu.c   | 28 ++---
 arch/powerpc/platforms/pasemi/iommu.c |  2 +-
 arch/powerpc/platforms/pasemi/setup.c |  2 +-
 arch/powerpc/platforms/powernv/pci-ioda.c |  4 +-
 arch/powerpc/platforms/pseries/iommu.c|  2 +-
 arch/powerpc/platforms/pseries/vio.c  |  2 +-
 arch/powerpc/sysdev/dart_iommu.c  |  4 +-
 arch/powerpc/sysdev/fsl_pci.c |  2 +-
 drivers/misc/cxl/vphb.c   |  2 +-
 15 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/arch/powerpc/include/asm/dma-mapping.h 
b/arch/powerpc/include/asm/dma-mapping.h
index f6ab51205a85..8fa394520af6 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -19,13 +19,13 @@
 #include 
 
 /* Some dma direct funcs must be visible for use in other dma_ops */
-extern void *__dma_direct_alloc_coherent(struct device *dev, size_t size,
+extern void *__dma_nommu_alloc_coherent(struct device *dev, size_t size,
 dma_addr_t *dma_handle, gfp_t flag,
 unsigned long attrs);
-extern void __dma_direct_free_coherent(struct device *dev, size_t size,
+extern void __dma_nommu_free_coherent(struct device *dev, size_t size,
   void *vaddr, dma_addr_t dma_handle,
   unsigned long attrs);
-extern int dma_direct_mmap_coherent(struct device *dev,
+extern int dma_nommu_mmap_coherent(struct device *dev,
struct vm_area_struct *vma,
void *cpu_addr, dma_addr_t handle,
size_t size, unsigned long attrs);
@@ -73,7 +73,7 @@ static inline unsigned long device_to_mask(struct device *dev)
 #ifdef CONFIG_PPC64
 extern struct dma_map_ops dma_iommu_ops;
 #endif
-extern const struct dma_map_ops dma_direct_ops;
+extern const struct dma_map_ops dma_nommu_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 66f33e7f8d40..f9fe2080ceb9 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -114,7 +114,7 @@ int dma_iommu_mapping_error(struct device *dev, dma_addr_t 
dma_addr)
 struct dma_map_ops dma_iommu_ops = {
.alloc  = dma_iommu_alloc_coherent,
.free   = dma_iommu_free_coherent,
-   .mmap   = dma_direct_mmap_coherent,
+   .mmap   = dma_nommu_mmap_coherent,
.map_sg = dma_iommu_map_sg,
.unmap_sg   = dma_iommu_unmap_sg,
.dma_supported  = dma_iommu_dma_supported,
diff --git a/arch/powerpc/kernel/dma-swiotlb.c 
b/arch/powerpc/kernel/dma-swiotlb.c
index d0ea7860e02b..f1e99b9cee97 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -47,9 +47,9 @@ static u64 swiotlb_powerpc_get_required(struct device *dev)
  * for everything else.
  */
 const struct dma_map_ops swiotlb_dma_ops = {
-   .alloc = __dma_direct_alloc_coherent,
-   .free = __dma_direct_free_coherent,
-   .mmap = dma_direct_mmap_coherent,
+   .alloc = __dma_nommu_alloc_coherent,
+   .free = __dma_nommu_free_coherent,
+   .mmap = dma_nommu_mmap_coherent,
.map_sg = swiotlb_map_sg_attrs,
.unmap_sg = swiotlb_unmap_sg_attrs,
.dma_supported = swiotlb_dma_supported,
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 4194db10..6d5d04ccf3b4 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -40,7 +40,7 @@ static u64 __maybe_unused get_pfn_limit(struct device *dev)
return pfn;
 }
 
-static int dma_direct_dma_supported(struct device *dev, u64 mask)
+static int dma_nommu_dma_supported(struct device *dev, u64 mask)
 {
 #ifdef CONFIG_PPC64
u64 limit = get_dma_offset(dev) + (memblock_end_of_DRAM() - 1);
@@ -62,7 +62,7 @@ static int dma_direct_dma_supported(struct device *dev, u64 
mask)
 #endif
 }
 
-void *__dma_direct_alloc_coherent(struct device *dev, size_t size,
+void *__dma_nommu_alloc_coherent(struct device *dev, size_t size,
  dma_addr_t *dma_handle, gfp_t flag,
  unsigned long attrs)
 {
@@ -119,7 +119,7 @@ void *__dma_direct_alloc_coherent(struct 

[PATCH 14/34] hexagon: use the generic dma_capable helper

2018-01-12 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
Acked-by: Richard Kuo 
---
 arch/hexagon/include/asm/dma-mapping.h | 7 ---
 arch/hexagon/kernel/dma.c  | 1 +
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/hexagon/include/asm/dma-mapping.h 
b/arch/hexagon/include/asm/dma-mapping.h
index 5208de242e79..263f6acbfb0f 100644
--- a/arch/hexagon/include/asm/dma-mapping.h
+++ b/arch/hexagon/include/asm/dma-mapping.h
@@ -37,11 +37,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
return dma_ops;
 }
 
-static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
-{
-   if (!dev->dma_mask)
-   return 0;
-   return addr + size - 1 <= *dev->dma_mask;
-}
-
 #endif
diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c
index 546792d176a4..ad8347c29dcf 100644
--- a/arch/hexagon/kernel/dma.c
+++ b/arch/hexagon/kernel/dma.c
@@ -19,6 +19,7 @@
  */
 
 #include 
+#include <linux/dma-direct.h>
 #include 
 #include 
 #include 
-- 
2.14.2



[PATCH 13/34] dma-mapping: move dma_mark_clean to dma-direct.h

2018-01-12 Thread Christoph Hellwig
And unlike the other helpers we don't require a <asm/dma-direct.h> as
this helper is a special case for ia64 only, and this keeps it as
simple as possible.
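
For reference, the consolidation boils down to a single conditional in the
new header; roughly this shape (a sketch of what lands in
include/linux/dma-direct.h, keyed off the new ARCH_HAS_DMA_MARK_CLEAN
symbol, not a verbatim quote):

#ifdef CONFIG_ARCH_HAS_DMA_MARK_CLEAN
/* ia64 keeps its out-of-line implementation */
void dma_mark_clean(void *addr, size_t size);
#else
static inline void dma_mark_clean(void *addr, size_t size)
{
}
#endif /* CONFIG_ARCH_HAS_DMA_MARK_CLEAN */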

Signed-off-by: Christoph Hellwig 
---
 arch/arm/include/asm/dma-mapping.h   | 2 --
 arch/arm64/include/asm/dma-mapping.h | 4 
 arch/ia64/Kconfig| 1 +
 arch/ia64/include/asm/dma.h  | 2 --
 arch/mips/include/asm/dma-mapping.h  | 2 --
 arch/powerpc/include/asm/swiotlb.h   | 2 --
 arch/tile/include/asm/dma-mapping.h  | 2 --
 arch/unicore32/include/asm/dma-mapping.h | 2 --
 arch/x86/include/asm/swiotlb.h   | 2 --
 include/linux/dma-direct.h   | 9 +
 10 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 5fb1b7fbdfbe..e5d9020c9ee1 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -109,8 +109,6 @@ static inline bool is_device_dma_coherent(struct device *dev)
return dev->archdata.dma_coherent;
 }
 
-static inline void dma_mark_clean(void *addr, size_t size) { }
-
 /**
  * arm_dma_alloc - allocate consistent memory for DMA
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index 400fa67d3b5a..b7847eb8a7bb 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -50,9 +50,5 @@ static inline bool is_device_dma_coherent(struct device *dev)
return dev->archdata.dma_coherent;
 }
 
-static inline void dma_mark_clean(void *addr, size_t size)
-{
-}
-
 #endif /* __KERNEL__ */
 #endif /* __ASM_DMA_MAPPING_H */
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 49583c5a5d44..4d18fca885ee 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -33,6 +33,7 @@ config IA64
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_VIRT_CPU_ACCOUNTING
+   select ARCH_HAS_DMA_MARK_CLEAN
select ARCH_HAS_SG_CHAIN
select VIRT_TO_BUS
select ARCH_DISCARD_MEMBLOCK
diff --git a/arch/ia64/include/asm/dma.h b/arch/ia64/include/asm/dma.h
index 186850eec934..23604d6a2cb2 100644
--- a/arch/ia64/include/asm/dma.h
+++ b/arch/ia64/include/asm/dma.h
@@ -20,6 +20,4 @@ extern unsigned long MAX_DMA_ADDRESS;
 
 #define free_dma(x)
 
-void dma_mark_clean(void *addr, size_t size);
-
 #endif /* _ASM_IA64_DMA_H */
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index 676c14cfc580..886e75a383f2 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -17,8 +17,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
return mips_dma_map_ops;
 }
 
-static inline void dma_mark_clean(void *addr, size_t size) {}
-
 #define arch_setup_dma_ops arch_setup_dma_ops
 static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base,
  u64 size, const struct iommu_ops *iommu,
diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h
index 01d45a5fd00b..9341ee804d19 100644
--- a/arch/powerpc/include/asm/swiotlb.h
+++ b/arch/powerpc/include/asm/swiotlb.h
@@ -15,8 +15,6 @@
 
 extern const struct dma_map_ops swiotlb_dma_ops;
 
-static inline void dma_mark_clean(void *addr, size_t size) {}
-
 extern unsigned int ppc_swiotlb_enable;
 int __init swiotlb_setup_bus_notifier(void);
 
diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index 75b8aaa4e70b..d25fce101fc0 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -44,8 +44,6 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off)
dev->archdata.dma_offset = off;
 }
 
-static inline void dma_mark_clean(void *addr, size_t size) {}
-
 #define HAVE_ARCH_DMA_SET_MASK 1
 int dma_set_mask(struct device *dev, u64 mask);
 
diff --git a/arch/unicore32/include/asm/dma-mapping.h b/arch/unicore32/include/asm/dma-mapping.h
index 5cb250bf2d8c..f2bfec273aa7 100644
--- a/arch/unicore32/include/asm/dma-mapping.h
+++ b/arch/unicore32/include/asm/dma-mapping.h
@@ -25,7 +25,5 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
	return &swiotlb_dma_map_ops;
 }
 
-static inline void dma_mark_clean(void *addr, size_t size) {}
-
 #endif /* __KERNEL__ */
 #endif
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index bdf9aed40403..1c6a6cb230ff 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -28,8 +28,6 @@ static inline void pci_swiotlb_late_init(void)
 }
 #endif
 
-static inline void dma_mark_clean(void *addr, size_t size) {}
-
 extern void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
dma_addr_t *dma_handle, gfp_t flags,
  

[PATCH 12/34] dma-mapping: move swiotlb arch helpers to a new header

2018-01-12 Thread Christoph Hellwig
phys_to_dma, dma_to_phys and dma_capable are helpers published by
architecture code for use of swiotlb and xen-swiotlb only.  Drivers are
not supposed to use these directly, but use the DMA API instead.

Move these to a new asm/dma-direct.h helper, included by a
linux/dma-direct.h wrapper that provides the default linear mapping
unless the architecture wants to override it.

In the MIPS case the existing dma-coherence.h is reused for now as
untangling it will take a bit of work.
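
Concretely, the wrapper pattern looks roughly like this (a sketch of the
include/linux/dma-direct.h introduced here, with the fallback spelling out
the default linear mapping via dev->dma_pfn_offset; not a verbatim quote):

#ifdef CONFIG_ARCH_HAS_PHYS_TO_DMA
#include <asm/dma-direct.h>	/* arch overrides phys_to_dma()/dma_to_phys() */
#else
static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
{
	dma_addr_t dev_addr = (dma_addr_t)paddr;

	return dev_addr - ((dma_addr_t)dev->dma_pfn_offset << PAGE_SHIFT);
}

static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr)
{
	phys_addr_t paddr = (phys_addr_t)dev_addr;

	return paddr + ((phys_addr_t)dev->dma_pfn_offset << PAGE_SHIFT);
}
#endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */

static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
{
	if (!dev->dma_mask)
		return false;

	return addr + size - 1 <= *dev->dma_mask;
}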

Signed-off-by: Christoph Hellwig 
Acked-by: Robin Murphy 
---
 MAINTAINERS|  1 +
 arch/Kconfig   |  4 +++
 arch/arm/Kconfig   |  1 +
 arch/arm/include/asm/dma-direct.h  | 36 ++
 arch/arm/include/asm/dma-mapping.h | 31 ---
 arch/arm64/include/asm/dma-mapping.h   | 22 -
 arch/arm64/mm/dma-mapping.c|  2 +-
 arch/ia64/include/asm/dma-mapping.h| 18 ---
 arch/mips/Kconfig  |  2 ++
 arch/mips/include/asm/dma-direct.h |  1 +
 arch/mips/include/asm/dma-mapping.h|  8 -
 .../include/asm/mach-cavium-octeon/dma-coherence.h |  8 +
 arch/mips/include/asm/mach-generic/dma-coherence.h | 12 
 .../include/asm/mach-loongson64/dma-coherence.h|  8 +
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/dma-direct.h  | 29 +
 arch/powerpc/include/asm/dma-mapping.h | 25 ---
 arch/tile/include/asm/dma-mapping.h| 18 ---
 arch/unicore32/include/asm/dma-mapping.h   | 18 ---
 arch/x86/Kconfig   |  1 +
 arch/x86/include/asm/dma-direct.h  | 30 ++
 arch/x86/include/asm/dma-mapping.h | 26 
 arch/x86/kernel/amd_gart_64.c  |  1 +
 arch/x86/kernel/pci-dma.c  |  2 +-
 arch/x86/kernel/pci-nommu.c|  2 +-
 arch/x86/kernel/pci-swiotlb.c  |  2 +-
 arch/x86/mm/mem_encrypt.c  |  2 +-
 arch/x86/pci/sta2x11-fixup.c   |  1 +
 arch/xtensa/include/asm/dma-mapping.h  | 10 --
 drivers/crypto/marvell/cesa.c  |  1 +
 drivers/mtd/nand/qcom_nandc.c  |  1 +
 drivers/xen/swiotlb-xen.c  |  2 +-
 include/linux/dma-direct.h | 32 +++
 lib/swiotlb.c  |  2 +-
 34 files changed, 165 insertions(+), 195 deletions(-)
 create mode 100644 arch/arm/include/asm/dma-direct.h
 create mode 100644 arch/mips/include/asm/dma-direct.h
 create mode 100644 arch/powerpc/include/asm/dma-direct.h
 create mode 100644 arch/x86/include/asm/dma-direct.h
 create mode 100644 include/linux/dma-direct.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 95c3fa1f520f..d2cfdcce1db5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4338,6 +4338,7 @@ F: lib/dma-noop.c
 F: lib/dma-virt.c
 F: drivers/base/dma-mapping.c
 F: drivers/base/dma-coherent.c
+F: include/linux/dma-direct.h
 F: include/linux/dma-mapping.h
 
 DME1737 HARDWARE MONITOR DRIVER
diff --git a/arch/Kconfig b/arch/Kconfig
index 400b9e1b2f27..3edf118ad777 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -938,6 +938,10 @@ config STRICT_MODULE_RWX
  and non-text memory will be made non-executable. This provides
  protection against certain security exploits (e.g. writing to text)
 
+# select if the architecture provides an asm/dma-direct.h header
+config ARCH_HAS_PHYS_TO_DMA
+   bool
+
 config ARCH_HAS_REFCOUNT
bool
help
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 51c8df561077..00d889a37965 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -8,6 +8,7 @@ config ARM
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_SET_MEMORY
+   select ARCH_HAS_PHYS_TO_DMA
select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
select ARCH_HAS_STRICT_MODULE_RWX if MMU
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/arm/include/asm/dma-direct.h b/arch/arm/include/asm/dma-direct.h
new file mode 100644
index 000000000000..5b0a8a421894
--- /dev/null
+++ b/arch/arm/include/asm/dma-direct.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_ARM_DMA_DIRECT_H
+#define ASM_ARM_DMA_DIRECT_H 1
+
+static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+{
+   unsigned int offset = paddr & ~PAGE_MASK;
+   return pfn_to_dma(dev, __phys_to_pfn(paddr)) + offset;
+}
+
+static inline phys_addr_t 

[PATCH 11/34] mips: fix an off-by-one in dma_capable

2018-01-12 Thread Christoph Hellwig
This makes it match the generic version.
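
To make the off-by-one concrete: with a 32-bit mask (*dev->dma_mask ==
0xffffffff), a buffer at addr 0xfffff000 with size 0x1000 ends on the last
addressable byte.  The old check computed addr + size == 0x100000000, which
is greater than the mask, so a perfectly reachable buffer was rejected; the
corrected addr + size - 1 == 0xffffffff passes.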

Reported-by: Vladimir Murzin 
Signed-off-by: Christoph Hellwig 
---
 arch/mips/include/asm/dma-mapping.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index 0d9418d264f9..5c334ac15945 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -22,7 +22,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
if (!dev->dma_mask)
return false;
 
-   return addr + size <= *dev->dma_mask;
+   return addr + size - 1 <= *dev->dma_mask;
 }
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
-- 
2.14.2



[PATCH 10/34] arm64: don't override dma_max_pfn

2018-01-12 Thread Christoph Hellwig
The generic version now takes dma_pfn_offset into account, so there is no
more need for an architecture override.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Robin Murphy 
---
 arch/arm64/include/asm/dma-mapping.h | 9 -
 1 file changed, 9 deletions(-)

diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index 0df756b24863..eada887a93bf 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -76,14 +76,5 @@ static inline void dma_mark_clean(void *addr, size_t size)
 {
 }
 
-/* Override for dma_max_pfn() */
-static inline unsigned long dma_max_pfn(struct device *dev)
-{
-   dma_addr_t dma_max = (dma_addr_t)*dev->dma_mask;
-
-   return (ulong)dma_to_phys(dev, dma_max) >> PAGE_SHIFT;
-}
-#define dma_max_pfn(dev) dma_max_pfn(dev)
-
 #endif /* __KERNEL__ */
 #endif /* __ASM_DMA_MAPPING_H */
-- 
2.14.2



[PATCH 09/34] dma-mapping: take dma_pfn_offset into account in dma_max_pfn

2018-01-12 Thread Christoph Hellwig
This makes sure the generic version can be used with architectures /
devices that have a DMA offset in the direct mapping.
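
As an illustration: on a system where RAM starting at physical 0x80000000
appears at bus address 0, dev->dma_pfn_offset is 0x80000 (with 4k pages), so
a device with a 32-bit mask can in fact reach physical pfns up to
0xfffff + 0x80000 = 0x17ffff.  The old computation stopped at 0xfffff, which
could needlessly force bouncing in callers that use dma_max_pfn() as a
bounce limit.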

Signed-off-by: Christoph Hellwig 
Reviewed-by: Robin Murphy 
---
 include/linux/dma-mapping.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 81ed9b2d84dc..d84951865be7 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -692,7 +692,7 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask)
 #ifndef dma_max_pfn
 static inline unsigned long dma_max_pfn(struct device *dev)
 {
-   return *dev->dma_mask >> PAGE_SHIFT;
+   return (*dev->dma_mask >> PAGE_SHIFT) + dev->dma_pfn_offset;
 }
 #endif
 
-- 
2.14.2



[PATCH 08/34] s390: remove the unused dma_capable helper

2018-01-12 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 arch/s390/include/asm/dma-mapping.h | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/arch/s390/include/asm/dma-mapping.h b/arch/s390/include/asm/dma-mapping.h
index eaf490f9c5bc..2ec7240c1ada 100644
--- a/arch/s390/include/asm/dma-mapping.h
+++ b/arch/s390/include/asm/dma-mapping.h
@@ -16,11 +16,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
	return &dma_noop_ops;
 }
 
-static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
-{
-   if (!dev->dma_mask)
-   return false;
-   return addr + size - 1 <= *dev->dma_mask;
-}
-
 #endif /* _ASM_S390_DMA_MAPPING_H */
-- 
2.14.2



[PATCH 07/34] riscv: remove the unused dma_capable helper

2018-01-12 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 arch/riscv/include/asm/dma-mapping.h | 8 
 1 file changed, 8 deletions(-)

diff --git a/arch/riscv/include/asm/dma-mapping.h b/arch/riscv/include/asm/dma-mapping.h
index 3eec1000196d..73849e2cc761 100644
--- a/arch/riscv/include/asm/dma-mapping.h
+++ b/arch/riscv/include/asm/dma-mapping.h
@@ -27,12 +27,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
	return &dma_noop_ops;
 }
 
-static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
-{
-   if (!dev->dma_mask)
-   return false;
-
-   return addr + size - 1 <= *dev->dma_mask;
-}
-
 #endif /* __ASM_RISCV_DMA_MAPPING_H */
-- 
2.14.2



[PATCH 05/34] arc: remove CONFIG_ARC_PLAT_NEEDS_PHYS_TO_DMA

2018-01-12 Thread Christoph Hellwig
We always use the stub definitions, so remove the unused other code.

Signed-off-by: Christoph Hellwig 
Acked-by: Vineet Gupta 
---
 arch/arc/Kconfig   |  3 ---
 arch/arc/include/asm/dma-mapping.h |  7 ---
 arch/arc/mm/dma.c  | 14 +++---
 3 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 9d5fd00d9e91..f3a80cf164cc 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -463,9 +463,6 @@ config ARCH_PHYS_ADDR_T_64BIT
 config ARCH_DMA_ADDR_T_64BIT
bool
 
-config ARC_PLAT_NEEDS_PHYS_TO_DMA
-   bool
-
 config ARC_KVADDR_SIZE
int "Kernel Virtual Address Space size (MB)"
range 0 512
diff --git a/arch/arc/include/asm/dma-mapping.h b/arch/arc/include/asm/dma-mapping.h
index 94285031c4fb..7a16824bfe98 100644
--- a/arch/arc/include/asm/dma-mapping.h
+++ b/arch/arc/include/asm/dma-mapping.h
@@ -11,13 +11,6 @@
 #ifndef ASM_ARC_DMA_MAPPING_H
 #define ASM_ARC_DMA_MAPPING_H
 
-#ifndef CONFIG_ARC_PLAT_NEEDS_PHYS_TO_DMA
-#define plat_dma_to_phys(dev, dma_handle) ((phys_addr_t)(dma_handle))
-#define plat_phys_to_dma(dev, paddr) ((dma_addr_t)(paddr))
-#else
-#include 
-#endif
-
 extern const struct dma_map_ops arc_dma_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index e9d93604ad0f..1dcc404b5aec 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -60,7 +60,7 @@ static void *arc_dma_alloc(struct device *dev, size_t size,
	/* This is linear addr (0x8000_0000 based) */
paddr = page_to_phys(page);
 
-   *dma_handle = plat_phys_to_dma(dev, paddr);
+   *dma_handle = paddr;
 
	/* This is kernel Virtual address (0x7000_0000 based) */
if (need_kvaddr) {
@@ -92,7 +92,7 @@ static void *arc_dma_alloc(struct device *dev, size_t size,
 static void arc_dma_free(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_handle, unsigned long attrs)
 {
-   phys_addr_t paddr = plat_dma_to_phys(dev, dma_handle);
+   phys_addr_t paddr = dma_handle;
struct page *page = virt_to_page(paddr);
int is_non_coh = 1;
 
@@ -111,7 +111,7 @@ static int arc_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 {
unsigned long user_count = vma_pages(vma);
unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
-   unsigned long pfn = __phys_to_pfn(plat_dma_to_phys(dev, dma_addr));
+   unsigned long pfn = __phys_to_pfn(dma_addr);
unsigned long off = vma->vm_pgoff;
int ret = -ENXIO;
 
@@ -175,7 +175,7 @@ static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page,
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
_dma_cache_sync(paddr, size, dir);
 
-   return plat_phys_to_dma(dev, paddr);
+   return paddr;
 }
 
 /*
@@ -190,7 +190,7 @@ static void arc_dma_unmap_page(struct device *dev, dma_addr_t handle,
   size_t size, enum dma_data_direction dir,
   unsigned long attrs)
 {
-   phys_addr_t paddr = plat_dma_to_phys(dev, handle);
+   phys_addr_t paddr = handle;
 
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
_dma_cache_sync(paddr, size, dir);
@@ -224,13 +224,13 @@ static void arc_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
 static void arc_dma_sync_single_for_cpu(struct device *dev,
dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
 {
-   _dma_cache_sync(plat_dma_to_phys(dev, dma_handle), size, DMA_FROM_DEVICE);
+   _dma_cache_sync(dma_handle, size, DMA_FROM_DEVICE);
 }
 
 static void arc_dma_sync_single_for_device(struct device *dev,
dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
 {
-   _dma_cache_sync(plat_dma_to_phys(dev, dma_handle), size, DMA_TO_DEVICE);
+   _dma_cache_sync(dma_handle, size, DMA_TO_DEVICE);
 }
 
 static void arc_dma_sync_sg_for_cpu(struct device *dev,
-- 
2.14.2



[PATCH 06/34] m32r: remove the unused dma_capable helper

2018-01-12 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 arch/m32r/include/asm/dma-mapping.h | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/arch/m32r/include/asm/dma-mapping.h b/arch/m32r/include/asm/dma-mapping.h
index 336ffe60814b..8967fb659691 100644
--- a/arch/m32r/include/asm/dma-mapping.h
+++ b/arch/m32r/include/asm/dma-mapping.h
@@ -14,11 +14,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
	return &dma_noop_ops;
 }
 
-static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
-{
-   if (!dev->dma_mask)
-   return false;
-   return addr + size - 1 <= *dev->dma_mask;
-}
-
 #endif /* _ASM_M32R_DMA_MAPPING_H */
-- 
2.14.2



[PATCH 04/34] powerpc: remove unused flush_write_buffers definition

2018-01-12 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/dma-mapping.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
index 5a6cbe11db6f..592c7f418aa0 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -107,9 +107,6 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off)
dev->archdata.dma_offset = off;
 }
 
-/* this will be removed soon */
-#define flush_write_buffers()
-
 #define HAVE_ARCH_DMA_SET_MASK 1
 extern int dma_set_mask(struct device *dev, u64 dma_mask);
 
-- 
2.14.2



consolidate direct dma mapping V4

2018-01-12 Thread Christoph Hellwig
Almost every architecture supports a direct dma mapping implementation,
where no iommu is used and the device dma address is a 1:1 mapping to
the physical address or has a simple linear offset.  Currently the
code for this implementation is mostly duplicated over the architectures,
then duplicated again in the swiotlb code, and duplicated yet again
for special cases like the x86 memory encryption DMA ops.
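
Stripped of cache and sync details, the "direct" policy is tiny; the
page-mapping op essentially reduces to the following sketch (using the
phys_to_dma()/dma_capable() helpers consolidated in this series; the error
cookie is illustrative, not the final code):

static dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
		unsigned long offset, size_t size, enum dma_data_direction dir,
		unsigned long attrs)
{
	dma_addr_t dma_addr = phys_to_dma(dev, page_to_phys(page)) + offset;

	/* refuse addresses the device's dma mask cannot reach */
	if (!dma_capable(dev, dma_addr, size))
		return 0;	/* mapping-error cookie */
	return dma_addr;
}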

This series takes the existing very simple dma-noop dma mapping
implementation, enhances it with all the x86 features and quirks, and
creates a common set of architecture hooks for it and the swiotlb code.

It then switches a number of architectures to this generic
direct map implementation.

Note that for now this only handles architectures that do cache coherent
DMA, but a similar consolidation for non-coherent architectures is in the
work for later merge windows.

A git tree is also available:

   git://git.infradead.org/users/hch/misc.git dma-direct.3

Gitweb:

   http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/dma-direct.3

Changes since V3:
 - new patch to fix an off-by-one in mips dma_capable
Changes since V2:
 - fixed a few patch description typos
 - fixed a few printk formats
 - fixed an off by one in dma_coherent_ok
 - add a few Reviewed-by/Acked-by tags.
 - moved the swiotlb consolidation to a new series
 - dropped a few patches for now to not overwhelm the x86
   maintainers.  They will be resubmitted in the next merge window


[PATCH 01/34] alpha: mark jensen as broken

2018-01-12 Thread Christoph Hellwig
CONFIG_ALPHA_JENSEN has failed to compile since commit 6aca0503
("alpha/dma: use common noop dma ops"), so mark it as broken.

Signed-off-by: Christoph Hellwig 
---
 arch/alpha/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index b31b974a03cb..e96adcbcab41 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -209,6 +209,7 @@ config ALPHA_EIGER
 
 config ALPHA_JENSEN
bool "Jensen"
+   depends on BROKEN
help
  DEC PC 150 AXP (aka Jensen): This is a very old Digital system - one
  of the first-generation Alpha systems. A number of these systems
-- 
2.14.2



[PATCH 02/34] hexagon: remove unused flush_write_buffers definition

2018-01-12 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 arch/hexagon/include/asm/io.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/hexagon/include/asm/io.h b/arch/hexagon/include/asm/io.h
index 66f5e9a61efc..9e8621d94ee9 100644
--- a/arch/hexagon/include/asm/io.h
+++ b/arch/hexagon/include/asm/io.h
@@ -330,8 +330,6 @@ static inline void outsl(unsigned long port, const void *buffer, int count)
}
 }
 
-#define flush_write_buffers() do { } while (0)
-
 #endif /* __KERNEL__ */
 
 #endif
-- 
2.14.2



[PATCH 03/34] m32r: remove unused flush_write_buffers definition

2018-01-12 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 arch/m32r/include/asm/io.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/m32r/include/asm/io.h b/arch/m32r/include/asm/io.h
index 1b653bb16f9a..a4272d8f0d9c 100644
--- a/arch/m32r/include/asm/io.h
+++ b/arch/m32r/include/asm/io.h
@@ -191,8 +191,6 @@ static inline void _writel(unsigned long l, unsigned long addr)
 
 #define mmiowb()
 
-#define flush_write_buffers() do { } while (0)  /* M32R_FIXME */
-
 static inline void
 memset_io(volatile void __iomem *addr, unsigned char val, int count)
 {
-- 
2.14.2