Re: [PATCH] powerpc/8xx: Build fix with Hugetlbfs enabled

2018-04-17 Thread Michael Ellerman
"Aneesh Kumar K.V"  writes:

> 8xx uses the slice code when hugetlbfs is enabled. We missed a header include
> on 8xx, which resulted in the build failure below.
>
> config: mpc885_ads_defconfig + CONFIG_HUGETLBFS
>
>CC  arch/powerpc/mm/slice.o
> arch/powerpc/mm/slice.c: In function 'slice_get_unmapped_area':
> arch/powerpc/mm/slice.c:655:2: error: implicit declaration of function 
> 'need_extra_context' [-Werror=implicit-function-declaration]
> arch/powerpc/mm/slice.c:656:3: error: implicit declaration of function 
> 'alloc_extended_context' [-Werror=implicit-function-declaration]
> cc1: all warnings being treated as errors
> make[1]: *** [arch/powerpc/mm/slice.o] Error 1
> make: *** [arch/powerpc/mm] Error 2
>
> On PPC64, mmu_context.h was included via linux/pkeys.h.
>
> CC: Christophe LEROY 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/mm/slice.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
> index 9cd87d11fe4e..205fe557ca10 100644
> --- a/arch/powerpc/mm/slice.c
> +++ b/arch/powerpc/mm/slice.c
> @@ -35,6 +35,7 @@
>  #include 
>  #include 
>  #include 
> +#include <asm/mmu_context.h>

I already merged this, didn't I?

cheers


Re: [RFC] virtio: Use DMA MAP API for devices without an IOMMU

2018-04-17 Thread Anshuman Khandual
On 04/15/2018 05:41 PM, Christoph Hellwig wrote:
> On Fri, Apr 06, 2018 at 06:37:18PM +1000, Benjamin Herrenschmidt wrote:
 implemented as DMA API which the virtio core understands. There is no
 need for an IOMMU to be involved for the device representation in this
 case IMHO.
>>>
>>> This whole virtio translation issue is a mess.  I think we need to
>>> switch it to the dma API, and then quirk the legacy case to always
>>> use the direct mapping inside the dma API.
>>
>> Fine with using a dma API always on the Linux side, but we do want to
>> special case virtio still at the arch and qemu side to have a "direct
>> mapping" mode. Not sure how (special flags on PCI devices) to avoid
>> actually going through an emulated IOMMU on the qemu side, because that
>> slows things down, esp. with vhost.
>>
>> IE, we can't I think just treat it the same as a physical device.
> 
> We should have treated it like a physical device from the start, but
> that device has unfortunately sailed.
> 
> But yes, we'll need a per-device quirk that says 'don't attach an
> iommu'.

How about doing it on a per-platform basis, as suggested in this RFC, through
an arch-specific callback? All the virtio devices on a given platform would
require and exercise this option (to use the bounce-buffer mechanism for
secure guests, for example), so the flag is really a platform-specific one,
not a device-specific one.
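
As a rough sketch of the idea (the hook name and wiring below are illustrative
assumptions, not the actual code from this RFC), the virtio core would simply
defer the decision to the architecture:

/*
 * Illustration only: the architecture, rather than each device, decides
 * whether a legacy (no VIRTIO_F_IOMMU_PLATFORM) virtio device should still
 * go through the DMA API, e.g. to get SWIOTLB bounce buffering in a
 * secure guest.
 */
bool __weak arch_virtio_wants_dma_ops(struct virtio_device *vdev)
{
	return false;	/* default: keep the historical direct mapping */
}

/* Sketch of how virtio_ring's existing helper might consult the hook: */
static bool vring_use_dma_api(struct virtio_device *vdev)
{
	if (!virtio_has_iommu_quirk(vdev))
		return true;	/* VIRTIO_F_IOMMU_PLATFORM was negotiated */

	return arch_virtio_wants_dma_ops(vdev);
}

A platform that runs secure guests could then override the hook and return
true, making every virtio device on that platform use the DMA API.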



[PATCH v2 7/7] ocxl: Document new OCXL IOCTLs

2018-04-17 Thread Alastair D'Silva
From: Alastair D'Silva 

Signed-off-by: Alastair D'Silva 
---
 Documentation/accelerators/ocxl.rst | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/Documentation/accelerators/ocxl.rst 
b/Documentation/accelerators/ocxl.rst
index 7904adcc07fd..3b8d3b99795c 100644
--- a/Documentation/accelerators/ocxl.rst
+++ b/Documentation/accelerators/ocxl.rst
@@ -157,6 +157,17 @@ OCXL_IOCTL_GET_METADATA:
   Obtains configuration information from the card, such at the size of
   MMIO areas, the AFU version, and the PASID for the current context.
 
+OCXL_IOCTL_ENABLE_P9_WAIT:
+
+  Allows the AFU to wake a userspace thread executing 'wait'. Returns
+  information to userspace to allow it to configure the AFU. Note that
+  this is only available on Power 9.
+
+OCXL_IOCTL_GET_FEATURES:
+
+  Reports on which CPU features that affect OpenCAPI are usable from
+  userspace.
+
 mmap
 
 
-- 
2.14.3



[PATCH v2 1/7] powerpc: Add TIDR CPU feature for Power9

2018-04-17 Thread Alastair D'Silva
From: Alastair D'Silva 

This patch adds a CPU feature bit to show whether the CPU has
the TIDR register available, enabling as_notify/wait in userspace.

Signed-off-by: Alastair D'Silva 
---
 arch/powerpc/include/asm/cputable.h | 3 ++-
 arch/powerpc/kernel/dt_cpu_ftrs.c   | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/cputable.h 
b/arch/powerpc/include/asm/cputable.h
index 4e332f3531c5..54c4cbbe57b4 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -215,6 +215,7 @@ static inline void cpu_feature_keys_init(void) { }
 #define CPU_FTR_P9_TM_HV_ASSIST
LONG_ASM_CONST(0x1000)
 #define CPU_FTR_P9_TM_XER_SO_BUG   LONG_ASM_CONST(0x2000)
 #define CPU_FTR_P9_TLBIE_BUG   LONG_ASM_CONST(0x4000)
+#define CPU_FTR_P9_TIDR
LONG_ASM_CONST(0x8000)
 
 #ifndef __ASSEMBLY__
 
@@ -462,7 +463,7 @@ static inline void cpu_feature_keys_init(void) { }
CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \
CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \
-   CPU_FTR_P9_TLBIE_BUG)
+   CPU_FTR_P9_TLBIE_BUG | CPU_FTR_P9_TIDR)
 #define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \
 (~CPU_FTR_SAO))
 #define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 11a3a4fed3fb..10f8b7f55637 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -722,6 +722,7 @@ static __init void cpufeatures_cpu_quirks(void)
if ((version & 0xffff0000) == 0x004e0000) {
cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR);
cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG;
+   cur_cpu_spec->cpu_features |= CPU_FTR_P9_TIDR;
}
 }
 
-- 
2.14.3



[PATCH v2 6/7] ocxl: Add an IOCTL so userspace knows what CPU features are available

2018-04-17 Thread Alastair D'Silva
From: Alastair D'Silva 

In order for a userspace AFU driver to call the Power9 specific
OCXL_IOCTL_ENABLE_P9_WAIT, it needs to verify that it can actually
make that call.

Signed-off-by: Alastair D'Silva 
---
 Documentation/accelerators/ocxl.rst |  1 -
 drivers/misc/ocxl/file.c| 25 +
 include/uapi/misc/ocxl.h|  4 
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/Documentation/accelerators/ocxl.rst 
b/Documentation/accelerators/ocxl.rst
index ddcc58d01cfb..7904adcc07fd 100644
--- a/Documentation/accelerators/ocxl.rst
+++ b/Documentation/accelerators/ocxl.rst
@@ -157,7 +157,6 @@ OCXL_IOCTL_GET_METADATA:
   Obtains configuration information from the card, such at the size of
   MMIO areas, the AFU version, and the PASID for the current context.
 
-
 mmap
 
 
diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c
index eb409a469f21..33ae46ce0a8a 100644
--- a/drivers/misc/ocxl/file.c
+++ b/drivers/misc/ocxl/file.c
@@ -168,12 +168,32 @@ static long afu_ioctl_enable_p9_wait(struct ocxl_context 
*ctx,
 }
 #endif
 
+
+static long afu_ioctl_get_features(struct ocxl_context *ctx,
+   struct ocxl_ioctl_features __user *uarg)
+{
+   struct ocxl_ioctl_features arg;
+
+   memset(&arg, 0, sizeof(arg));
+
+#ifdef CONFIG_PPC64
+   if (cpu_has_feature(CPU_FTR_P9_TIDR))
+   arg.flags[0] |= OCXL_IOCTL_FEATURES_FLAGS0_P9_WAIT;
+#endif
+
+   if (copy_to_user(uarg, &arg, sizeof(arg)))
+   return -EFAULT;
+
+   return 0;
+}
+
 #define CMD_STR(x) (x == OCXL_IOCTL_ATTACH ? "ATTACH" :
\
x == OCXL_IOCTL_IRQ_ALLOC ? "IRQ_ALLOC" :   \
x == OCXL_IOCTL_IRQ_FREE ? "IRQ_FREE" : \
x == OCXL_IOCTL_IRQ_SET_FD ? "IRQ_SET_FD" : \
x == OCXL_IOCTL_GET_METADATA ? "GET_METADATA" : \
x == OCXL_IOCTL_ENABLE_P9_WAIT ? "ENABLE_P9_WAIT" : 
\
+   x == OCXL_IOCTL_GET_FEATURES ? "GET_FEATURES" : \
"UNKNOWN")
 
 static long afu_ioctl(struct file *file, unsigned int cmd,
@@ -239,6 +259,11 @@ static long afu_ioctl(struct file *file, unsigned int cmd,
break;
 #endif
 
+   case OCXL_IOCTL_GET_FEATURES:
+   rc = afu_ioctl_get_features(ctx,
+   (struct ocxl_ioctl_features __user *) args);
+   break;
+
default:
rc = -EINVAL;
}
diff --git a/include/uapi/misc/ocxl.h b/include/uapi/misc/ocxl.h
index 8d2748e69c84..bb80f294b429 100644
--- a/include/uapi/misc/ocxl.h
+++ b/include/uapi/misc/ocxl.h
@@ -55,6 +55,9 @@ struct ocxl_ioctl_p9_wait {
__u64 reserved3[3];
 };
 
+#define OCXL_IOCTL_FEATURES_FLAGS0_P9_WAIT 0x01
+struct ocxl_ioctl_features {
+   __u64 flags[4];
 };
 
 struct ocxl_ioctl_irq_fd {
@@ -72,5 +75,6 @@ struct ocxl_ioctl_irq_fd {
 #define OCXL_IOCTL_IRQ_SET_FD  _IOW(OCXL_MAGIC, 0x13, struct ocxl_ioctl_irq_fd)
 #define OCXL_IOCTL_GET_METADATA _IOR(OCXL_MAGIC, 0x14, struct 
ocxl_ioctl_metadata)
 #define OCXL_IOCTL_ENABLE_P9_WAIT  _IOR(OCXL_MAGIC, 0x15, struct 
ocxl_ioctl_p9_wait)
+#define OCXL_IOCTL_GET_FEATURES _IOR(OCXL_MAGIC, 0x16, struct 
ocxl_ioctl_platform)
 
 #endif /* _UAPI_MISC_OCXL_H */
-- 
2.14.3
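
A minimal userspace sketch of how an AFU driver might probe for the new
capability (assuming the updated uapi header from this series is installed as
<misc/ocxl.h>; the device path is a placeholder and error handling is
trimmed):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <misc/ocxl.h>

int main(void)
{
	struct ocxl_ioctl_features features = { { 0 } };
	/* Placeholder path: the actual AFU device name depends on the card. */
	int fd = open("/dev/ocxl/IBM,AFU0.0004:00:00.1.0", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (ioctl(fd, OCXL_IOCTL_GET_FEATURES, &features) == 0 &&
	    (features.flags[0] & OCXL_IOCTL_FEATURES_FLAGS0_P9_WAIT))
		printf("as_notify/wait is usable from userspace\n");

	close(fd);
	return 0;
}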



[PATCH v2 5/7] ocxl: Expose the thread_id needed for wait on p9

2018-04-17 Thread Alastair D'Silva
From: Alastair D'Silva 

In order to successfully issue as_notify, an AFU needs to know the TID
to notify, which in turn means that this information should be
available in userspace so it can be communicated to the AFU.

Signed-off-by: Alastair D'Silva 
---
 drivers/misc/ocxl/context.c   |  5 +++-
 drivers/misc/ocxl/file.c  | 53 +++
 drivers/misc/ocxl/link.c  | 36 ++
 drivers/misc/ocxl/ocxl_internal.h |  1 +
 include/misc/ocxl.h   |  9 +++
 include/uapi/misc/ocxl.h  | 10 
 6 files changed, 113 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c
index 909e8807824a..95f74623113e 100644
--- a/drivers/misc/ocxl/context.c
+++ b/drivers/misc/ocxl/context.c
@@ -34,6 +34,8 @@ int ocxl_context_init(struct ocxl_context *ctx, struct 
ocxl_afu *afu,
mutex_init(&ctx->xsl_error_lock);
mutex_init(&ctx->irq_lock);
idr_init(&ctx->irq_idr);
+   ctx->tidr = 0;
+
/*
 * Keep a reference on the AFU to make sure it's valid for the
 * duration of the life of the context
@@ -65,6 +67,7 @@ int ocxl_context_attach(struct ocxl_context *ctx, u64 amr)
 {
int rc;
 
+   // Locks both status & tidr
mutex_lock(&ctx->status_mutex);
if (ctx->status != OPENED) {
rc = -EIO;
@@ -72,7 +75,7 @@ int ocxl_context_attach(struct ocxl_context *ctx, u64 amr)
}
 
rc = ocxl_link_add_pe(ctx->afu->fn->link, ctx->pasid,
-   current->mm->context.id, 0, amr, current->mm,
+   current->mm->context.id, ctx->tidr, amr, current->mm,
xsl_fault_error, ctx);
if (rc)
goto out;
diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c
index 038509e5d031..eb409a469f21 100644
--- a/drivers/misc/ocxl/file.c
+++ b/drivers/misc/ocxl/file.c
@@ -5,6 +5,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include "ocxl_internal.h"
 
 
@@ -123,11 +125,55 @@ static long afu_ioctl_get_metadata(struct ocxl_context 
*ctx,
return 0;
 }
 
+#ifdef CONFIG_PPC64
+static long afu_ioctl_enable_p9_wait(struct ocxl_context *ctx,
+   struct ocxl_ioctl_p9_wait __user *uarg)
+{
+   struct ocxl_ioctl_p9_wait arg;
+
+   memset(&arg, 0, sizeof(arg));
+
+   if (cpu_has_feature(CPU_FTR_P9_TIDR)) {
+   enum ocxl_context_status status;
+
+   // Locks both status & tidr
+   mutex_lock(&ctx->status_mutex);
+   if (!ctx->tidr) {
+   if (set_thread_tidr(current))
+   return -ENOENT;
+
+   ctx->tidr = current->thread.tidr;
+   }
+
+   status = ctx->status;
+   mutex_unlock(&ctx->status_mutex);
+
+   if (status == ATTACHED) {
+   int rc;
+   struct link *link = ctx->afu->fn->link;
+
+   rc = ocxl_link_update_pe(link, ctx->pasid, ctx->tidr);
+   if (rc)
+   return rc;
+   }
+
+   arg.thread_id = ctx->tidr;
+   } else
+   return -ENOENT;
+
+   if (copy_to_user(uarg, &arg, sizeof(arg)))
+   return -EFAULT;
+
+   return 0;
+}
+#endif
+
 #define CMD_STR(x) (x == OCXL_IOCTL_ATTACH ? "ATTACH" :
\
x == OCXL_IOCTL_IRQ_ALLOC ? "IRQ_ALLOC" :   \
x == OCXL_IOCTL_IRQ_FREE ? "IRQ_FREE" : \
x == OCXL_IOCTL_IRQ_SET_FD ? "IRQ_SET_FD" : \
x == OCXL_IOCTL_GET_METADATA ? "GET_METADATA" : \
+   x == OCXL_IOCTL_ENABLE_P9_WAIT ? "ENABLE_P9_WAIT" : 
\
"UNKNOWN")
 
 static long afu_ioctl(struct file *file, unsigned int cmd,
@@ -186,6 +232,13 @@ static long afu_ioctl(struct file *file, unsigned int cmd,
(struct ocxl_ioctl_metadata __user *) args);
break;
 
+#ifdef CONFIG_PPC64
+   case OCXL_IOCTL_ENABLE_P9_WAIT:
+   rc = afu_ioctl_enable_p9_wait(ctx,
+   (struct ocxl_ioctl_p9_wait __user *) args);
+   break;
+#endif
+
default:
rc = -EINVAL;
}
diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
index 656e8610eec2..88876ae8f330 100644
--- a/drivers/misc/ocxl/link.c
+++ b/drivers/misc/ocxl/link.c
@@ -544,6 +544,42 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 
pidr, u32 tidr,
 }
 EXPORT_SYMBOL_GPL(ocxl_link_add_pe);
 
+int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid)
+{
+   struct link *link = (struct link *) link_handle;
+   struct spa *spa = link->spa;
+   struct ocxl_process_element *pe;
+   int pe_handle, rc;
+
+   if 

[PATCH v2 3/7] powerpc: use task_pid_nr() for TID allocation

2018-04-17 Thread Alastair D'Silva
From: Alastair D'Silva 

The current implementation of TID allocation, using a global IDR, may
result in an errant process starving the system of available TIDs.
Instead, use task_pid_nr(), as mentioned by the original author. The
scenario described that prevented its use is not applicable, as
set_thread_tidr can only be called after the task struct has been
populated.

Signed-off-by: Alastair D'Silva 
---
 arch/powerpc/include/asm/switch_to.h |  1 -
 arch/powerpc/kernel/process.c| 97 +---
 2 files changed, 1 insertion(+), 97 deletions(-)

diff --git a/arch/powerpc/include/asm/switch_to.h 
b/arch/powerpc/include/asm/switch_to.h
index be8c9fa23983..5b03d8a82409 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -94,6 +94,5 @@ static inline void clear_task_ebb(struct task_struct *t)
 extern int set_thread_uses_vas(void);
 
 extern int set_thread_tidr(struct task_struct *t);
-extern void clear_thread_tidr(struct task_struct *t);
 
 #endif /* _ASM_POWERPC_SWITCH_TO_H */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 3b00da47699b..87f047fd2762 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1496,103 +1496,12 @@ int set_thread_uses_vas(void)
 }
 
 #ifdef CONFIG_PPC64
-static DEFINE_SPINLOCK(vas_thread_id_lock);
-static DEFINE_IDA(vas_thread_ida);
-
-/*
- * We need to assign a unique thread id to each thread in a process.
- *
- * This thread id, referred to as TIDR, and separate from the Linux's tgid,
- * is intended to be used to direct an ASB_Notify from the hardware to the
- * thread, when a suitable event occurs in the system.
- *
- * One such event is a "paste" instruction in the context of Fast Thread
- * Wakeup (aka Core-to-core wake up in the Virtual Accelerator Switchboard
- * (VAS) in POWER9.
- *
- * To get a unique TIDR per process we could simply reuse task_pid_nr() but
- * the problem is that task_pid_nr() is not yet available copy_thread() is
- * called. Fixing that would require changing more intrusive arch-neutral
- * code in code path in copy_process()?.
- *
- * Further, to assign unique TIDRs within each process, we need an atomic
- * field (or an IDR) in task_struct, which again intrudes into the arch-
- * neutral code. So try to assign globally unique TIDRs for now.
- *
- * NOTE: TIDR 0 indicates that the thread does not need a TIDR value.
- *  For now, only threads that expect to be notified by the VAS
- *  hardware need a TIDR value and we assign values > 0 for those.
- */
-#define MAX_THREAD_CONTEXT ((1 << 16) - 1)
-static int assign_thread_tidr(void)
-{
-   int index;
-   int err;
-   unsigned long flags;
-
-again:
-   if (!ida_pre_get(&vas_thread_ida, GFP_KERNEL))
-   return -ENOMEM;
-
-   spin_lock_irqsave(&vas_thread_id_lock, flags);
-   err = ida_get_new_above(&vas_thread_ida, 1, &index);
-   spin_unlock_irqrestore(&vas_thread_id_lock, flags);
-
-   if (err == -EAGAIN)
-   goto again;
-   else if (err)
-   return err;
-
-   if (index > MAX_THREAD_CONTEXT) {
-   spin_lock_irqsave(&vas_thread_id_lock, flags);
-   ida_remove(&vas_thread_ida, index);
-   spin_unlock_irqrestore(&vas_thread_id_lock, flags);
-   return -ENOMEM;
-   }
-
-   return index;
-}
-
-static void free_thread_tidr(int id)
-{
-   unsigned long flags;
-
-   spin_lock_irqsave(&vas_thread_id_lock, flags);
-   ida_remove(&vas_thread_ida, id);
-   spin_unlock_irqrestore(&vas_thread_id_lock, flags);
-}
-
-/*
- * Clear any TIDR value assigned to this thread.
- */
-void clear_thread_tidr(struct task_struct *t)
-{
-   if (!t->thread.tidr)
-   return;
-
-   if (!cpu_has_feature(CPU_FTR_P9_TIDR)) {
-   WARN_ON_ONCE(1);
-   return;
-   }
-
-   mtspr(SPRN_TIDR, 0);
-   free_thread_tidr(t->thread.tidr);
-   t->thread.tidr = 0;
-}
-
-void arch_release_task_struct(struct task_struct *t)
-{
-   clear_thread_tidr(t);
-}
-
 /*
  * Assign a unique TIDR (thread id) for task @t and set it in the thread
  * structure. For now, we only support setting TIDR for 'current' task.
  */
 int set_thread_tidr(struct task_struct *t)
 {
-   int rc;
-
if (!cpu_has_feature(CPU_FTR_P9_TIDR))
return -EINVAL;
 
@@ -1602,11 +1511,7 @@ int set_thread_tidr(struct task_struct *t)
if (t->thread.tidr)
return 0;
 
-   rc = assign_thread_tidr();
-   if (rc < 0)
-   return rc;
-
-   t->thread.tidr = rc;
+   t->thread.tidr = (u16)task_pid_nr(t);
mtspr(SPRN_TIDR, t->thread.tidr);
 
return 0;
-- 
2.14.3



[PATCH v2 0/7] ocxl: Implement Power9 as_notify/wait for OpenCAPI

2018-04-17 Thread Alastair D'Silva
From: Alastair D'Silva 

The Power 9 as_notify/wait feature provides a lower latency way to
signal a thread that work is complete. This series enables the use of
this feature from OpenCAPI adapters, as well as addressing a potential
starvation issue when allocating thread IDs.

Changelog:
v2:
  Rename get_platform IOCTL to get_features
  Move stray edit from patch 1 to patch 3

Alastair D'Silva (7):
  powerpc: Add TIDR CPU feature for Power9
  powerpc: Use TIDR CPU feature to control TIDR allocation
  powerpc: use task_pid_nr() for TID allocation
  ocxl: Rename pnv_ocxl_spa_remove_pe to clarify its action
  ocxl: Expose the thread_id needed for wait on p9
  ocxl: Add an IOCTL so userspace knows what CPU features are available
  ocxl: Document new OCXL IOCTLs

 Documentation/accelerators/ocxl.rst   |  10 
 arch/powerpc/include/asm/cputable.h   |   3 +-
 arch/powerpc/include/asm/pnv-ocxl.h   |   2 +-
 arch/powerpc/include/asm/switch_to.h  |   1 -
 arch/powerpc/kernel/dt_cpu_ftrs.c |   1 +
 arch/powerpc/kernel/process.c | 101 +-
 arch/powerpc/platforms/powernv/ocxl.c |   4 +-
 drivers/misc/ocxl/context.c   |   5 +-
 drivers/misc/ocxl/file.c  |  78 ++
 drivers/misc/ocxl/link.c  |  38 -
 drivers/misc/ocxl/ocxl_internal.h |   1 +
 include/misc/ocxl.h   |   9 +++
 include/uapi/misc/ocxl.h  |  14 +
 13 files changed, 162 insertions(+), 105 deletions(-)

-- 
2.14.3



[PATCH v2 4/7] ocxl: Rename pnv_ocxl_spa_remove_pe to clarify its action

2018-04-17 Thread Alastair D'Silva
From: Alastair D'Silva 

This function removes the process element from the NPU cache.

Signed-off-by: Alastair D'Silva 
---
 arch/powerpc/include/asm/pnv-ocxl.h   | 2 +-
 arch/powerpc/platforms/powernv/ocxl.c | 4 ++--
 drivers/misc/ocxl/link.c  | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/pnv-ocxl.h 
b/arch/powerpc/include/asm/pnv-ocxl.h
index f6945d3bc971..208b5503f4ed 100644
--- a/arch/powerpc/include/asm/pnv-ocxl.h
+++ b/arch/powerpc/include/asm/pnv-ocxl.h
@@ -28,7 +28,7 @@ extern int pnv_ocxl_map_xsl_regs(struct pci_dev *dev, void 
__iomem **dsisr,
 extern int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask,
void **platform_data);
 extern void pnv_ocxl_spa_release(void *platform_data);
-extern int pnv_ocxl_spa_remove_pe(void *platform_data, int pe_handle);
+extern int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int 
pe_handle);
 
 extern int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr);
 extern void pnv_ocxl_free_xive_irq(u32 irq);
diff --git a/arch/powerpc/platforms/powernv/ocxl.c 
b/arch/powerpc/platforms/powernv/ocxl.c
index fa9b53af3c7b..8c65aacda9c8 100644
--- a/arch/powerpc/platforms/powernv/ocxl.c
+++ b/arch/powerpc/platforms/powernv/ocxl.c
@@ -475,7 +475,7 @@ void pnv_ocxl_spa_release(void *platform_data)
 }
 EXPORT_SYMBOL_GPL(pnv_ocxl_spa_release);
 
-int pnv_ocxl_spa_remove_pe(void *platform_data, int pe_handle)
+int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle)
 {
struct spa_data *data = (struct spa_data *) platform_data;
int rc;
@@ -483,7 +483,7 @@ int pnv_ocxl_spa_remove_pe(void *platform_data, int 
pe_handle)
rc = opal_npu_spa_clear_cache(data->phb_opal_id, data->bdfn, pe_handle);
return rc;
 }
-EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe);
+EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe_from_cache);
 
 int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr)
 {
diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
index f30790582dc0..656e8610eec2 100644
--- a/drivers/misc/ocxl/link.c
+++ b/drivers/misc/ocxl/link.c
@@ -599,7 +599,7 @@ int ocxl_link_remove_pe(void *link_handle, int pasid)
 * On powerpc, the entry needs to be cleared from the context
 * cache of the NPU.
 */
-   rc = pnv_ocxl_spa_remove_pe(link->platform_data, pe_handle);
+   rc = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, pe_handle);
WARN_ON(rc);
 
pe_data = radix_tree_delete(>pe_tree, pe_handle);
-- 
2.14.3



[PATCH v2 2/7] powerpc: Use TIDR CPU feature to control TIDR allocation

2018-04-17 Thread Alastair D'Silva
From: Alastair D'Silva 

Key the use of TIDR off its CPU feature, rather than assuming it
is available based on the architecture.

Signed-off-by: Alastair D'Silva 
---
 arch/powerpc/kernel/process.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 1237f13fed51..3b00da47699b 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1154,7 +1154,7 @@ static inline void restore_sprs(struct thread_struct 
*old_thread,
mtspr(SPRN_TAR, new_thread->tar);
}
 
-   if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+   if (cpu_has_feature(CPU_FTR_P9_TIDR) &&
old_thread->tidr != new_thread->tidr)
mtspr(SPRN_TIDR, new_thread->tidr);
 #endif
@@ -1570,7 +1570,7 @@ void clear_thread_tidr(struct task_struct *t)
if (!t->thread.tidr)
return;
 
-   if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+   if (!cpu_has_feature(CPU_FTR_P9_TIDR)) {
WARN_ON_ONCE(1);
return;
}
@@ -1593,7 +1593,7 @@ int set_thread_tidr(struct task_struct *t)
 {
int rc;
 
-   if (!cpu_has_feature(CPU_FTR_ARCH_300))
+   if (!cpu_has_feature(CPU_FTR_P9_TIDR))
return -EINVAL;
 
if (t != current)
-- 
2.14.3



Re: [PATCH] misc: cxl: Change return type to vm_fault_t

2018-04-17 Thread Andrew Donnellan

On 18/04/18 00:53, Souptick Joarder wrote:

Use new return type vm_fault_t for fault handler. For
now, this is just documenting that the function returns
a VM_FAULT value rather than an errno. Once all instances
are converted, vm_fault_t will become a distinct type.

Reference id -> 1c8f422059ae ("mm: change return type to
vm_fault_t")

Previously, cxl_mmap_fault() returned VM_FAULT_NOPAGE as the
default value irrespective of the vm_insert_pfn() return
value. This bug is fixed with the new vmf_insert_pfn(),
which returns a VM_FAULT_* type based on err.

Signed-off-by: Souptick Joarder 


This looks good to me

Acked-by: Andrew Donnellan 

--
Andrew Donnellan  OzLabs, ADL Canberra
andrew.donnel...@au1.ibm.com  IBM Australia Limited



Re: [PATCH V1 00/11] powerpc/mm/book3s64: Support for split pmd ptlock

2018-04-17 Thread Balbir Singh
On Mon, 16 Apr 2018 16:57:12 +0530
"Aneesh Kumar K.V"  wrote:

> This patch series adds split pmd pagetable lock for book3s64. nohash64 should
> also be able to switch to this; I need to work out the code dependency. This
> series might also have broken the build on platforms other than book3s64. I am
> sending this early to get feedback on whether we should continue with the
> approach.
> 
> We switch the pmd allocator to use something similar to what we already use
> for level 4 pagetable allocation. We get an order 0 page, divide it into
> fragments, and hand over a fragment when we get a request for a pmd
> pagetable. The pmd lock is now stashed in the struct page backing the
> allocated page.

That's only for the THP case right?

> 
> The series helps in reducing lock contention on mm->page_table_lock.
>

The numbers look good.



Re: [PATCH 1/2] powernv/npu: Do a PID GPU TLB flush when invalidating a large address range

2018-04-17 Thread Balbir Singh
On Tue, Apr 17, 2018 at 7:17 PM, Balbir Singh  wrote:
> On Tue, Apr 17, 2018 at 7:11 PM, Alistair Popple  
> wrote:
>> The NPU has a limited number of address translation shootdown (ATSD)
>> registers and the GPU has limited bandwidth to process ATSDs. This can
>> result in contention of ATSD registers leading to soft lockups on some
>> threads, particularly when invalidating a large address range in
>> pnv_npu2_mn_invalidate_range().
>>
>> At some threshold it becomes more efficient to flush the entire GPU TLB for
>> the given MM context (PID) than individually flushing each address in the
>> range. This patch will result in ranges greater than 2MB being converted
>> from 32+ ATSDs into a single ATSD which will flush the TLB for the given
>> PID on each GPU.
>>
>> Signed-off-by: Alistair Popple 
>> +   }
>>  }
>>
>
> Acked-by: Balbir Singh 
Tested-by: Balbir Singh 


[PATCH] powerpc: platform: cell: spufs: Change return type to vm_fault_t

2018-04-17 Thread Souptick Joarder
Use new return type vm_fault_t for fault handler. For
now, this is just documenting that the function returns
a VM_FAULT value rather than an errno. Once all instances
are converted, vm_fault_t will become a distinct type.

Reference id -> 1c8f422059ae ("mm: change return type to
vm_fault_t")

Previously, vm_insert_pfn() returned an error but the driver returned
VM_FAULT_NOPAGE as the default. The new function vmf_insert_pfn()
replaces this inefficiency by returning the correct VM_FAULT_*
type.

vmf_handle_error() is an inline wrapper function which
converts an error number to a vm_fault_t type error.

Signed-off-by: Souptick Joarder 
Reviewed-by: Matthew Wilcox 
---
 arch/powerpc/platforms/cell/spufs/file.c | 37 
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/file.c 
b/arch/powerpc/platforms/cell/spufs/file.c
index 469bdd0..a1dca9a 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -232,12 +232,13 @@ static ssize_t spufs_attr_write(struct file *file, const 
char __user *buf,
return size;
 }
 
-static int
+static vm_fault_t
 spufs_mem_mmap_fault(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
struct spu_context *ctx = vma->vm_file->private_data;
unsigned long pfn, offset;
+   vm_fault_t ret;
 
offset = vmf->pgoff << PAGE_SHIFT;
if (offset >= LS_SIZE)
@@ -256,11 +257,11 @@ static ssize_t spufs_attr_write(struct file *file, const 
char __user *buf,
vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT;
}
-   vm_insert_pfn(vma, vmf->address, pfn);
+   ret = vmf_insert_pfn(vma, vmf->address, pfn);
 
spu_release(ctx);
 
-   return VM_FAULT_NOPAGE;
+   return ret;
 }
 
 static int spufs_mem_mmap_access(struct vm_area_struct *vma,
@@ -312,13 +313,19 @@ static int spufs_mem_mmap(struct file *file, struct 
vm_area_struct *vma)
.mmap   = spufs_mem_mmap,
 };
 
-static int spufs_ps_fault(struct vm_fault *vmf,
+static inline vm_fault_t vmf_handle_error(int err)
+{
+   return VM_FAULT_NOPAGE;
+}
+
+static vm_fault_t spufs_ps_fault(struct vm_fault *vmf,
unsigned long ps_offs,
unsigned long ps_size)
 {
struct spu_context *ctx = vmf->vma->vm_file->private_data;
unsigned long area, offset = vmf->pgoff << PAGE_SHIFT;
-   int ret = 0;
+   int err = 0;
+   vm_fault_t ret = VM_FAULT_NOPAGE;
 
spu_context_nospu_trace(spufs_ps_fault__enter, ctx);
 
@@ -349,12 +356,14 @@ static int spufs_ps_fault(struct vm_fault *vmf,
if (ctx->state == SPU_STATE_SAVED) {
up_read(&current->mm->mmap_sem);
spu_context_nospu_trace(spufs_ps_fault__sleep, ctx);
-   ret = spufs_wait(ctx->run_wq, ctx->state == SPU_STATE_RUNNABLE);
+   err = spufs_wait(ctx->run_wq, ctx->state == SPU_STATE_RUNNABLE);
+   ret = vmf_handle_error(err);
spu_context_trace(spufs_ps_fault__wake, ctx, ctx->spu);
down_read(&current->mm->mmap_sem);
} else {
area = ctx->spu->problem_phys + ps_offs;
-   vm_insert_pfn(vmf->vma, vmf->address, (area + offset) >> 
PAGE_SHIFT);
+   ret = vmf_insert_pfn(vmf->vma, vmf->address,
+   (area + offset) >> PAGE_SHIFT);
spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu);
}
 
@@ -363,11 +372,11 @@ static int spufs_ps_fault(struct vm_fault *vmf,
 
 refault:
put_spu_context(ctx);
-   return VM_FAULT_NOPAGE;
+   return ret;
 }
 
 #if SPUFS_MMAP_4K
-static int spufs_cntl_mmap_fault(struct vm_fault *vmf)
+static vm_fault_t spufs_cntl_mmap_fault(struct vm_fault *vmf)
 {
return spufs_ps_fault(vmf, 0x4000, SPUFS_CNTL_MAP_SIZE);
 }
@@ -1040,7 +1049,7 @@ static ssize_t spufs_signal1_write(struct file *file, 
const char __user *buf,
return 4;
 }
 
-static int
+static vm_fault_t
 spufs_signal1_mmap_fault(struct vm_fault *vmf)
 {
 #if SPUFS_SIGNAL_MAP_SIZE == 0x1000
@@ -1178,7 +1187,7 @@ static ssize_t spufs_signal2_write(struct file *file, 
const char __user *buf,
 }
 
 #if SPUFS_MMAP_4K
-static int
+static vm_fault_t
 spufs_signal2_mmap_fault(struct vm_fault *vmf)
 {
 #if SPUFS_SIGNAL_MAP_SIZE == 0x1000
@@ -1307,7 +1316,7 @@ static u64 spufs_signal2_type_get(struct spu_context *ctx)
   spufs_signal2_type_set, "%llu\n", SPU_ATTR_ACQUIRE);
 
 #if SPUFS_MMAP_4K
-static int
+static vm_fault_t
 spufs_mss_mmap_fault(struct vm_fault *vmf)
 {
return spufs_ps_fault(vmf, 0x, SPUFS_MSS_MAP_SIZE);
@@ -1369,7 +1378,7 @@ static int spufs_mss_open(struct inode *inode, struct 
file *file)
.llseek  = no_llseek,
 };
 
-static int

[PATCH] misc: cxl: Change return type to vm_fault_t

2018-04-17 Thread Souptick Joarder
Use new return type vm_fault_t for fault handler. For
now, this is just documenting that the function returns
a VM_FAULT value rather than an errno. Once all instances
are converted, vm_fault_t will become a distinct type.

Reference id -> 1c8f422059ae ("mm: change return type to
vm_fault_t")

Previously, cxl_mmap_fault() returned VM_FAULT_NOPAGE as the
default value irrespective of the vm_insert_pfn() return
value. This bug is fixed with the new vmf_insert_pfn(),
which returns a VM_FAULT_* type based on err.

Signed-off-by: Souptick Joarder 
---
 drivers/misc/cxl/context.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 7ff315a..c6ec872 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -128,11 +128,12 @@ void cxl_context_set_mapping(struct cxl_context *ctx,
mutex_unlock(&ctx->mapping_lock);
 }

-static int cxl_mmap_fault(struct vm_fault *vmf)
+static vm_fault_t cxl_mmap_fault(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
struct cxl_context *ctx = vma->vm_file->private_data;
u64 area, offset;
+   vm_fault_t ret;

offset = vmf->pgoff << PAGE_SHIFT;

@@ -169,11 +170,11 @@ static int cxl_mmap_fault(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}

-   vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT);
+   ret = vmf_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT);

mutex_unlock(&ctx->status_mutex);

-   return VM_FAULT_NOPAGE;
+   return ret;
 }

 static const struct vm_operations_struct cxl_mmap_vmops = {
--
1.9.1



Re: [PATCH 2/2] powernv/npu: Add a debugfs setting to change ATSD threshold

2018-04-17 Thread Balbir Singh
On Tue, 17 Apr 2018 19:11:29 +1000
Alistair Popple  wrote:

> The threshold at which it becomes more efficient to coalesce a range of
> ATSDs into a single per-PID ATSD is currently not well understood due to a
> lack of real-world work loads. This patch adds a debugfs parameter allowing
> the threshold to be altered at runtime in order to aid future development
> and refinement of the value.
> 
> Signed-off-by: Alistair Popple 
> ---
>  arch/powerpc/platforms/powernv/npu-dma.c | 12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
> b/arch/powerpc/platforms/powernv/npu-dma.c
> index dc34662e9df9..a765bf576c14 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -17,7 +17,9 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -44,7 +46,8 @@ DEFINE_SPINLOCK(npu_context_lock);
>   * entire TLB on the GPU for the given PID rather than each specific address 
> in
>   * the range.
>   */
> -#define ATSD_THRESHOLD (2*1024*1024)
> +static uint64_t atsd_threshold = 2 * 1024 * 1024;
> +static struct dentry *atsd_threshold_dentry;
>  
>  /*
>   * Other types of TCE cache invalidation are not functional in the
> @@ -682,7 +685,7 @@ static void pnv_npu2_mn_invalidate_range(struct 
> mmu_notifier *mn,
>   struct npu_context *npu_context = mn_to_npu_context(mn);
>   unsigned long address;
>  
> - if (end - start > ATSD_THRESHOLD) {
> + if (end - start > atsd_threshold) {
>   /*
>* Just invalidate the entire PID if the address range is too
>* large.
> @@ -956,6 +959,11 @@ int pnv_npu2_init(struct pnv_phb *phb)
>   static int npu_index;
>   uint64_t rc = 0;
>  
> + if (!atsd_threshold_dentry) {
> + atsd_threshold_dentry = debugfs_create_x64("atsd_threshold",

Nit-picking can we call this atsd_threshold_in_bytes?

> +0600, powerpc_debugfs_root, &atsd_threshold);
> + }
> +
>   phb->npu.nmmu_flush =
>   of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush");
>   for_each_child_of_node(phb->hose->dn, dn) {

Acked-by: Balbir Singh 


Re: [PATCH] powerpc: platform: cell: spufs: Change return type to vm_fault_t

2018-04-17 Thread Matthew Wilcox
On Wed, Apr 18, 2018 at 12:50:38AM +0530, Souptick Joarder wrote:
> Use new return type vm_fault_t for fault handler. For
> now, this is just documenting that the function returns
> a VM_FAULT value rather than an errno. Once all instances
> are converted, vm_fault_t will become a distinct type.
> 
> Reference id -> 1c8f422059ae ("mm: change return type to
> vm_fault_t")
> 
> Previously, vm_insert_pfn() returned an error but the driver returned
> VM_FAULT_NOPAGE as the default. The new function vmf_insert_pfn()
> replaces this inefficiency by returning the correct VM_FAULT_*
> type.
> 
> vmf_handle_error() is an inline wrapper function which
> converts an error number to a vm_fault_t type error.

I think you sent the wrong version of this one ...

The commit message should mention that we're fixing a minor bug, that
the error from vm_insert_pfn() was being ignored and the effect of this
is likely to be only felt in OOM situations.

> @@ -256,11 +257,11 @@ static ssize_t spufs_attr_write(struct file *file, 
> const char __user *buf,
>   vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
>   pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT;
>   }
> - vm_insert_pfn(vma, vmf->address, pfn);
> + ret = vmf_insert_pfn(vma, vmf->address, pfn);
>  
>   spu_release(ctx);
>  
> - return VM_FAULT_NOPAGE;
> + return ret;
>  }

I thought I said not to introduce vmf_handle_error(), because it's too
trivial and obfuscates what's actually going on.

> -static int spufs_ps_fault(struct vm_fault *vmf,
> +static inline vm_fault_t vmf_handle_error(int err)
> +{
> + return VM_FAULT_NOPAGE;
> +}
> +

Re-reading spufs_ps_fault(), I wouldn't change anything inside it.  Just
change its return type to vm_fault_t and call it done.


Re: [PATCH] powerpc: platform: cell: spufs: Change return type to vm_fault_t

2018-04-17 Thread Arnd Bergmann
On Tue, Apr 17, 2018 at 9:20 PM, Souptick Joarder  wrote:
> Use new return type vm_fault_t for fault handler. For
> now, this is just documenting that the function returns
> a VM_FAULT value rather than an errno. Once all instances
> are converted, vm_fault_t will become a distinct type.
>
> Reference id -> 1c8f422059ae ("mm: change return type to
> vm_fault_t")
>
> Previously, vm_insert_pfn() returned an error but the driver returned
> VM_FAULT_NOPAGE as the default. The new function vmf_insert_pfn()
> replaces this inefficiency by returning the correct VM_FAULT_*
> type.
>
> vmf_handle_error() is an inline wrapper function which
> converts an error number to a vm_fault_t type error.
>
> Signed-off-by: Souptick Joarder 
> Reviewed-by: Matthew Wilcox 

Acked-by: Arnd Bergmann 


Re: [RFC PATCH 1/3] signal: Ensure every siginfo we send has all bits initialized

2018-04-17 Thread Eric W. Biederman
Dave Martin  writes:

> Hmmm
>
> memset()/clear_siginfo() may ensure that there are no uninitialised
> explicit fields except for those in inactive union members, but I'm not
> sure that this approach is guaranteed to sanitise the padding seen by
> userspace.
>
> Rationale below, though it's a bit theoretical...
>
> With this in mind, I tend agree with Linus that hiding memset() calls
> from the maintainer may be a bad idea unless they are also hidden from
> the compiler.  If the compiler sees the memset() it may be able to
> optimise it in ways that wouldn't be possible for some other random
> external function call, including optimising all or part of the call
> out.
>
> As a result, the breakdown into individual put_user()s etc. in
> copy_siginfo_to_user() may still be valuable even if all paths have the
> memset().

The breakdown into individual put_user()s is known to be problematically
slow, and is actually wrong.

Even excluding the SI_USER duplication, in a small number of cases the
fields filled out in siginfo by architecture code are not the fields
that copy_siginfo_to_user is copying.  Which is much worse.  The code
looks safe but is not.

My intention is to leave 0 instances of clear_siginfo in the
architecture specific code.  Ideally struct siginfo will be limited to
kernel/signal.c but I am not certain I can quite get that far.
The function do_coredump appears to have a legit need for siginfo.


> (Rationale for an arch/arm example:)
>
>> diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
>> index 4c375e11ae95..adda3fc2dde8 100644
>> --- a/arch/arm/vfp/vfpmodule.c
>> +++ b/arch/arm/vfp/vfpmodule.c
>> @@ -218,8 +218,7 @@ static void vfp_raise_sigfpe(unsigned int sicode, struct 
>> pt_regs *regs)
>>  {
>>  siginfo_t info;
>>  
>> -memset(&info, 0, sizeof(info));
>> -
>> +clear_siginfo(&info);
>>  info.si_signo = SIGFPE;
>
> /* by c11 (n1570) 6.2.6.1 para 6 [1], all padding bytes in info now take
>unspecified values */
>
>>  info.si_code = sicode;
>>  info.si_addr = (void __user *)(instruction_pointer(regs) - 4);
>
> /* by c11 (n1570) 6.2.6.1 para 7 [2], all bytes of the union info._sifields
>other than than those corresponding to _sigfault take unspecified
>values */
>
> So I don't see why the compiler needs to ensure that any of the affected
> bytes are zero: it could potentially skip a lot of the memset() as a
> result, in theory.
>
> I've not seen a compiler actually take advantage of that, but I'm now
> not sure what forbids it.

I took a quick look at gcc-4.9 which I have handy.

The kernel passes -fno-strict-aliasing, which helps, and gcc actually
documents that if you access things through the union it will
not take advantage of C11.

gcc-4.9 Documents it this way:

> -fstrict-aliasing'
>  Allow the compiler to assume the strictest aliasing rules
>  applicable to the language being compiled.  For C (and C++), this
>  activates optimizations based on the type of expressions.  In
>  particular, an object of one type is assumed never to reside at the
>  same address as an object of a different type, unless the types are
>  almost the same.  For example, an 'unsigned int' can alias an
>  'int', but not a 'void*' or a 'double'.  A character type may alias
>  any other type.
> 
>  Pay special attention to code like this:
>   union a_union {
> int i;
> double d;
>   };
> 
>   int f() {
> union a_union t;
> t.d = 3.0;
> return t.i;
>   }
>  The practice of reading from a different union member than the one
>  most recently written to (called "type-punning") is common.  Even
>  with '-fstrict-aliasing', type-punning is allowed, provided the
>  memory is accessed through the union type.  So, the code above
>  works as expected.


> If this can happen, I only see two watertight workarounds:
>
> 1) Ensure that there is no implicit padding in any UAPI structure, e.g.
> aeb1f39d814b: ("arm64/ptrace: Avoid uninitialised struct padding in
> fpr_set()").  This would include tail-padding of any union member that
> is smaller than the containing union.
>
> It would be significantly more effort to ensure this for siginfo though.
>
> 2) Poke all values directly into allocated or user memory directly
> via pointers to paddingless types; never assign to objects on the kernel
> stack if you care what ends up in the padding, e.g., what your
> copy_siginfo_to_user() does prior to this series.
>
>
> If I'm not barking up the wrong tree, memset() cannot generally be
> used to determine the value of padding bytes, but it may still be
> useful for forcing otherwise uninitialised members to sane initial
> values.
>
> This likely affects many more things than just siginfo.

Unless gcc has changed its stance on type-punning through unions
or its semantics with -fno-strict-aliasing, we should be good.

Eric
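
To make the padding concern above concrete, here is a small standalone
illustration (not from the thread) of the C11 rule Dave cites:

#include <string.h>

/* Loosely modelled on the siginfo discussion: 'struct example' has three
 * bytes of tail padding on a typical 4-byte-aligned ABI. */
struct example {
	int  signo;
	char code;
};

void fill_and_copy(struct example *out)
{
	struct example info;

	memset(&info, 0, sizeof(info));	/* padding bytes are zero here */
	info.signo = 8;
	info.code  = 1;
	/*
	 * Per C11 6.2.6.1 para 6, storing to a member leaves the object's
	 * padding bytes with unspecified values, so the zeroing done by
	 * memset() above is not guaranteed to survive the two assignments.
	 */
	*out = info;	/* whatever the padding now holds is copied out */
}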


Re: [PATCH] powerpc: Allow selection of CONFIG_LD_DEAD_CODE_DATA_ELIMINATION

2018-04-17 Thread Mathieu Malaterre
On Tue, Apr 17, 2018 at 6:49 PM, Christophe LEROY
 wrote:
>
>
> Le 17/04/2018 à 18:45, Mathieu Malaterre a écrit :
>>
>> On Tue, Apr 17, 2018 at 12:49 PM, Christophe Leroy
>>  wrote:
>>>
>>> This option does dead code and data elimination with the linker by
>>> compiling with -ffunction-sections -fdata-sections and linking with
>>> --gc-sections.
>>>
>>> By selecting this option on mpc885_ads_defconfig,
>>> vmlinux LOAD segment size gets reduced by 10%
>>>
>>> Program Header before the patch:
>>>  LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16
>>>   filesz 0x0036eda4 memsz 0x0038de04 flags rwx
>>>
>>> Program Header after the patch:
>>>  LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16
>>>   filesz 0x00316da4 memsz 0x00334268 flags rwx
>>>
>>> Signed-off-by: Christophe Leroy 
>>> ---
>>>   arch/powerpc/Kconfig | 8 
>>>   1 file changed, 8 insertions(+)
>>>
>>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
>>> index 8fe4353be5e3..e1fac49cf465 100644
>>> --- a/arch/powerpc/Kconfig
>>> +++ b/arch/powerpc/Kconfig
>>> @@ -888,6 +888,14 @@ config PPC_MEM_KEYS
>>>
>>>If unsure, say y.
>>>
>>> +config PPC_UNUSED_ELIMINATION
>>> +   bool "Eliminate unused functions and data from vmlinux"
>>> +   default n
>>> +   select LD_DEAD_CODE_DATA_ELIMINATION
>>> +   help
>>> + Select this to do dead code and data elimination with the
>>> linker
>>> + by compiling with -ffunction-sections -fdata-sections and
>>> linking
>>> + with --gc-sections.
>>>   endmenu
>>>
>>
>> Just for reference, I cannot boot my Mac Mini G4 anymore (yaboot). The
>> messages I can see (prom_init) are:
>
>
> Which version of GCC do you use ?

$ powerpc-linux-gnu-gcc --version
powerpc-linux-gnu-gcc (Debian 6.3.0-18) 6.3.0 20170516
Copyright (C) 2016 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

this is simply coming from:

$ apt-cache policy crossbuild-essential-powerpc
crossbuild-essential-powerpc:
  Installed: 12.3
  Candidate: 12.3
  Version table:
 *** 12.3 500
500 http://ftp.fr.debian.org/debian stretch/main amd64 Packages
500 http://ftp.fr.debian.org/debian stretch/main i386 Packages
100 /var/lib/dpkg/status


> Can you provide the generated System.map with and without that option active
> ?

$ du -sh g4/System.map.*
1.7M g4/System.map.with
1.8M g4/System.map.without

Will send them by private emails.

> Thanks
> Christophe
>
>
>>
>> ---
>> done
>> copying OF device tree...
>> Building dt strings...
>> Building dt structure...
>> Device tree strings 0x0110 -> 0x01100e02
>> Device tree struct   0x01101000 -> 0x01109000
>> Quiescing Open Firmware ...
>> Booting Linux via __start() @ 0x0014 ...
>> _
>> ---
>>
>>
>>
>>>   config ISA_DMA_API
>>> --
>>> 2.13.3
>>>
>


[PATCH v2 2/2] powerpc/32be: use stmw/lmw for registers save/restore in asm

2018-04-17 Thread Christophe Leroy
arch/powerpc/Makefile activates -mmultiple on BE PPC32 configs
in order to use multiple-word instructions in function entry/exit.

This patch does the same for the asm parts, for consistency.

On processors like the 8xx, on which instruction fetching is pretty slow,
this speeds up register save/restore.

Signed-off-by: Christophe Leroy 
---
 v2: Swapped both patches in the serie to reduce number of impacted
 lines and added the same modification in ppc_save_regs()

 arch/powerpc/include/asm/ppc_asm.h  |  5 +
 arch/powerpc/kernel/misc.S  | 10 ++
 arch/powerpc/kernel/ppc_save_regs.S |  4 
 3 files changed, 19 insertions(+)

diff --git a/arch/powerpc/include/asm/ppc_asm.h 
b/arch/powerpc/include/asm/ppc_asm.h
index 13f7f4c0e1ea..4bb765d0b758 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -80,11 +80,16 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 #else
 #define SAVE_GPR(n, base)  stw n,GPR0+4*(n)(base)
 #define REST_GPR(n, base)  lwz n,GPR0+4*(n)(base)
+#ifdef CONFIG_CPU_BIG_ENDIAN
+#define SAVE_NVGPRS(base)  stmw    13, GPR0+4*13(base)
+#define REST_NVGPRS(base)  lmw 13, GPR0+4*13(base)
+#else
 #define SAVE_NVGPRS(base)  SAVE_GPR(13, base); SAVE_8GPRS(14, base); \
SAVE_10GPRS(22, base)
 #define REST_NVGPRS(base)  REST_GPR(13, base); REST_8GPRS(14, base); \
REST_10GPRS(22, base)
 #endif
+#endif
 
 #define SAVE_2GPRS(n, base)SAVE_GPR(n, base); SAVE_GPR(n+1, base)
 #define SAVE_4GPRS(n, base)SAVE_2GPRS(n, base); SAVE_2GPRS(n+2, base)
diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index 746ee0320ad4..a316d90a5c26 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -49,6 +49,10 @@ _GLOBAL(setjmp)
PPC_STL r0,0(r3)
PPC_STL r1,SZL(r3)
PPC_STL r2,2*SZL(r3)
+#if defined(CONFIG_PPC32) && defined(CONFIG_CPU_BIG_ENDIAN)
+   mfcr    r12
+   stmw    r12, 3*SZL(r3)
+#else
mfcr    r0
PPC_STL r0,3*SZL(r3)
PPC_STL r13,4*SZL(r3)
@@ -70,10 +74,15 @@ _GLOBAL(setjmp)
PPC_STL r29,20*SZL(r3)
PPC_STL r30,21*SZL(r3)
PPC_STL r31,22*SZL(r3)
+#endif
li  r3,0
blr
 
 _GLOBAL(longjmp)
+#if defined(CONFIG_PPC32) && defined(CONFIG_CPU_BIG_ENDIAN)
+   lmw r12, 3*SZL(r3)
+   mtcrf   0x38, r12
+#else
PPC_LL  r13,4*SZL(r3)
PPC_LL  r14,5*SZL(r3)
PPC_LL  r15,6*SZL(r3)
@@ -95,6 +104,7 @@ _GLOBAL(longjmp)
PPC_LL  r31,22*SZL(r3)
PPC_LL  r0,3*SZL(r3)
mtcrf   0x38,r0
+#endif
PPC_LL  r0,0(r3)
PPC_LL  r1,SZL(r3)
PPC_LL  r2,2*SZL(r3)
diff --git a/arch/powerpc/kernel/ppc_save_regs.S 
b/arch/powerpc/kernel/ppc_save_regs.S
index 1b1787d52896..d60316e70514 100644
--- a/arch/powerpc/kernel/ppc_save_regs.S
+++ b/arch/powerpc/kernel/ppc_save_regs.S
@@ -25,6 +25,9 @@
  */
 _GLOBAL(ppc_save_regs)
PPC_STL r0,0*SZL(r3)
+#if defined(CONFIG_PPC32) && defined(CONFIG_CPU_BIG_ENDIAN)
+   stmw    r2, 2*SZL(r3)
+#else
PPC_STL r2,2*SZL(r3)
PPC_STL r3,3*SZL(r3)
PPC_STL r4,4*SZL(r3)
@@ -55,6 +58,7 @@ _GLOBAL(ppc_save_regs)
PPC_STL r29,29*SZL(r3)
PPC_STL r30,30*SZL(r3)
PPC_STL r31,31*SZL(r3)
+#endif
/* go up one stack frame for SP */
PPC_LL  r4,0(r1)
PPC_STL r4,1*SZL(r3)
-- 
2.13.3



[PATCH v2 1/2] powerpc: avoid an unnecessary test and branch in longjmp()

2018-04-17 Thread Christophe Leroy
Doing the test on exit from the function avoids an unnecessary
test and branch inside longjmp().

Signed-off-by: Christophe Leroy 
---
 v2: Swapped both patches in the serie to reduce number of impacted lines

 arch/powerpc/kernel/misc.S | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index e1f3a5d054c4..746ee0320ad4 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -74,10 +74,7 @@ _GLOBAL(setjmp)
blr
 
 _GLOBAL(longjmp)
-   PPC_LCMPI r4,0
-   bne 1f
-   li  r4,1
-1: PPC_LL  r13,4*SZL(r3)
+   PPC_LL  r13,4*SZL(r3)
PPC_LL  r14,5*SZL(r3)
PPC_LL  r15,6*SZL(r3)
PPC_LL  r16,7*SZL(r3)
@@ -102,7 +99,9 @@ _GLOBAL(longjmp)
PPC_LL  r1,SZL(r3)
PPC_LL  r2,2*SZL(r3)
mtlr    r0
-   mr  r3,r4
+   mr. r3, r4
+   bnelr
+   li  r3, 1
blr
 
 _GLOBAL(current_stack_pointer)
-- 
2.13.3
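
For context, the test being moved implements the C requirement that longjmp()
with a value of 0 must still make setjmp() return 1; a quick userspace
reminder of that semantic (illustrative only, not kernel code):

#include <setjmp.h>
#include <stdio.h>

static jmp_buf env;

int main(void)
{
	int ret = setjmp(env);

	if (ret == 0)
		longjmp(env, 0);	/* val == 0 must still make setjmp() return 1 */

	printf("setjmp returned %d\n", ret);	/* prints 1 */
	return 0;
}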



Re: [PATCH 2/6 v2] iommu: of: make of_pci_map_rid() available for other devices too

2018-04-17 Thread Robin Murphy

On 17/04/18 11:21, Nipun Gupta wrote:

The iommu-map property is also used by devices on the fsl-mc bus. This
patch moves of_pci_map_rid() to a generic location, so that it
can be used by other buses too.

Signed-off-by: Nipun Gupta 
---
  drivers/iommu/of_iommu.c | 106 +--


Doesn't this break "msi-parent" parsing for !CONFIG_OF_IOMMU? I guess 
you don't want fsl-mc to have to depend on PCI, but this looks like a 
step in the wrong direction.


I'm not entirely sure where of_map_rid() fits best, but from a quick 
look around the least-worst option might be drivers/of/of_address.c, 
unless Rob and Frank have a better idea of where generic DT-based ID 
translation routines could live?



  drivers/of/irq.c |   6 +--
  drivers/pci/of.c | 101 
  include/linux/of_iommu.h |  11 +
  include/linux/of_pci.h   |  10 -
  5 files changed, 117 insertions(+), 117 deletions(-)

diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 5c36a8b..4e7712f 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -138,6 +138,106 @@ static int of_iommu_xlate(struct device *dev,
return ops->of_xlate(dev, iommu_spec);
  }
  
+/**

+ * of_map_rid - Translate a requester ID through a downstream mapping.
+ * @np: root complex device node.
+ * @rid: device requester ID to map.
+ * @map_name: property name of the map to use.
+ * @map_mask_name: optional property name of the mask to use.
+ * @target: optional pointer to a target device node.
+ * @id_out: optional pointer to receive the translated ID.
+ *
+ * Given a device requester ID, look up the appropriate implementation-defined
+ * platform ID and/or the target device which receives transactions on that
+ * ID, as per the "iommu-map" and "msi-map" bindings. Either of @target or
+ * @id_out may be NULL if only the other is required. If @target points to
+ * a non-NULL device node pointer, only entries targeting that node will be
+ * matched; if it points to a NULL value, it will receive the device node of
+ * the first matching target phandle, with a reference held.
+ *
+ * Return: 0 on success or a standard error code on failure.
+ */
+int of_map_rid(struct device_node *np, u32 rid,
+  const char *map_name, const char *map_mask_name,
+  struct device_node **target, u32 *id_out)
+{
+   u32 map_mask, masked_rid;
+   int map_len;
+   const __be32 *map = NULL;
+
+   if (!np || !map_name || (!target && !id_out))
+   return -EINVAL;
+
+   map = of_get_property(np, map_name, &map_len);
+   if (!map) {
+   if (target)
+   return -ENODEV;
+   /* Otherwise, no map implies no translation */
+   *id_out = rid;
+   return 0;
+   }
+
+   if (!map_len || map_len % (4 * sizeof(*map))) {
+   pr_err("%pOF: Error: Bad %s length: %d\n", np,
+   map_name, map_len);
+   return -EINVAL;
+   }
+
+   /* The default is to select all bits. */
+   map_mask = 0xffffffff;
+
+   /*
+* Can be overridden by "{iommu,msi}-map-mask" property.
+*/
+   if (map_mask_name)
+   of_property_read_u32(np, map_mask_name, &map_mask);
+
+   masked_rid = map_mask & rid;
+   for ( ; map_len > 0; map_len -= 4 * sizeof(*map), map += 4) {
+   struct device_node *phandle_node;
+   u32 rid_base = be32_to_cpup(map + 0);
+   u32 phandle = be32_to_cpup(map + 1);
+   u32 out_base = be32_to_cpup(map + 2);
+   u32 rid_len = be32_to_cpup(map + 3);
+
+   if (rid_base & ~map_mask) {
+   pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) 
ignores rid-base (0x%x)\n",
+   np, map_name, map_name,
+   map_mask, rid_base);
+   return -EFAULT;
+   }
+
+   if (masked_rid < rid_base || masked_rid >= rid_base + rid_len)
+   continue;
+
+   phandle_node = of_find_node_by_phandle(phandle);
+   if (!phandle_node)
+   return -ENODEV;
+
+   if (target) {
+   if (*target)
+   of_node_put(phandle_node);
+   else
+   *target = phandle_node;
+
+   if (*target != phandle_node)
+   continue;
+   }
+
+   if (id_out)
+   *id_out = masked_rid - rid_base + out_base;
+
+   pr_debug("%pOF: %s, using mask %08x, rid-base: %08x, out-base: %08x, 
length: %08x, rid: %08x -> %08x\n",
+   np, map_name, map_mask, rid_base, out_base,
+   rid_len, rid, masked_rid - rid_base + out_base);

Re: [PATCH] powerpc: Allow selection of CONFIG_LD_DEAD_CODE_DATA_ELIMINATION

2018-04-17 Thread Christophe LEROY



Le 17/04/2018 à 18:45, Mathieu Malaterre a écrit :

On Tue, Apr 17, 2018 at 12:49 PM, Christophe Leroy
 wrote:

This option does dead code and data elimination with the linker by
compiling with -ffunction-sections -fdata-sections and linking with
--gc-sections.

By selecting this option on mpc885_ads_defconfig,
vmlinux LOAD segment size gets reduced by 10%

Program Header before the patch:
 LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16
  filesz 0x0036eda4 memsz 0x0038de04 flags rwx

Program Header after the patch:
 LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16
  filesz 0x00316da4 memsz 0x00334268 flags rwx

Signed-off-by: Christophe Leroy 
---
  arch/powerpc/Kconfig | 8 
  1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8fe4353be5e3..e1fac49cf465 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -888,6 +888,14 @@ config PPC_MEM_KEYS

   If unsure, say y.

+config PPC_UNUSED_ELIMINATION
+   bool "Eliminate unused functions and data from vmlinux"
+   default n
+   select LD_DEAD_CODE_DATA_ELIMINATION
+   help
+ Select this to do dead code and data elimination with the linker
+ by compiling with -ffunction-sections -fdata-sections and linking
+ with --gc-sections.
  endmenu



Just for reference, I cannot boot my Mac Mini G4 anymore (yaboot). The
messages I can see (prom_init) are:


Which version of GCC do you use ?

Can you provide the generated System.map with and without that option 
active ?


Thanks
Christophe



---
done
copying OF device tree...
Building dt strings...
Building dt structure...
Device tree strings 0x0110 -> 0x01100e02
Device tree struct   0x01101000 -> 0x01109000
Quiescing Open Firmware ...
Booting Linux via __start() @ 0x0014 ...
_
---




  config ISA_DMA_API
--
2.13.3



Re: [PATCH] powerpc: Allow selection of CONFIG_LD_DEAD_CODE_DATA_ELIMINATION

2018-04-17 Thread Mathieu Malaterre
On Tue, Apr 17, 2018 at 12:49 PM, Christophe Leroy
 wrote:
> This option does dead code and data elimination with the linker by
> compiling with -ffunction-sections -fdata-sections and linking with
> --gc-sections.
>
> By selecting this option on mpc885_ads_defconfig,
> vmlinux LOAD segment size gets reduced by 10%
>
> Program Header before the patch:
> LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16
>  filesz 0x0036eda4 memsz 0x0038de04 flags rwx
>
> Program Header after the patch:
> LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16
>  filesz 0x00316da4 memsz 0x00334268 flags rwx
>
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/Kconfig | 8 
>  1 file changed, 8 insertions(+)
>
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 8fe4353be5e3..e1fac49cf465 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -888,6 +888,14 @@ config PPC_MEM_KEYS
>
>   If unsure, say y.
>
> +config PPC_UNUSED_ELIMINATION
> +   bool "Eliminate unused functions and data from vmlinux"
> +   default n
> +   select LD_DEAD_CODE_DATA_ELIMINATION
> +   help
> + Select this to do dead code and data elimination with the linker
> + by compiling with -ffunction-sections -fdata-sections and linking
> + with --gc-sections.
>  endmenu
>

Just for reference, I cannot boot my Mac Mini G4 anymore (yaboot). The
messages I can see (prom_init) are:

---
done
copying OF device tree...
Building dt strings...
Building dt structure...
Device tree strings 0x0110 -> 0x01100e02
Device tree struct   0x01101000 -> 0x01109000
Quiescing Open Firmware ...
Booting Linux via __start() @ 0x0014 ...
_
---



>  config ISA_DMA_API
> --
> 2.13.3
>


Re: powerpc/modules: Fix crashes by adding CONFIG_RELOCATABLE to vermagic

2018-04-17 Thread Ard Biesheuvel
On 16 April 2018 at 16:10, Michael Ellerman  wrote:
> Ard Biesheuvel  writes:
>
>> On 11 April 2018 at 16:49, Michael Ellerman
>>  wrote:
>>> On Tue, 2018-04-10 at 01:22:06 UTC, Michael Ellerman wrote:
 If you build the kernel with CONFIG_RELOCATABLE=n, then install the
 modules, rebuild the kernel with CONFIG_RELOCATABLE=y and leave the
 old modules installed, we crash something like:

   Unable to handle kernel paging request for data at address 
 0xd00018d66cef
   Faulting instruction address: 0xc21ddd08
   Oops: Kernel access of bad area, sig: 11 [#1]
   Modules linked in: x_tables autofs4
   CPU: 2 PID: 1 Comm: systemd Not tainted 
 4.16.0-rc6-gcc_ubuntu_le-g99fec39 #1
   ...
   NIP check_version.isra.22+0x118/0x170
   Call Trace:
 __ksymtab_xt_unregister_table+0x58/0xfcb8 [x_tables] 
 (unreliable)
 resolve_symbol+0xb4/0x150
 load_module+0x10e8/0x29a0
 SyS_finit_module+0x110/0x140
 system_call+0x58/0x6c

 This happens because since commit 71810db27c1c ("modversions: treat
 symbol CRCs as 32 bit quantities"), a relocatable kernel encodes and
 handles symbol CRCs differently from a non-relocatable kernel.

 Although it's possible we could try and detect this situation and
 handle it, it's much more robust to simply make the state of
 CONFIG_RELOCATABLE part of the module vermagic.

 Fixes: 71810db27c1c ("modversions: treat symbol CRCs as 32 bit quantities")
 Signed-off-by: Michael Ellerman 
>>>
>>> Applied to powerpc fixes.
>>>
>>> https://git.kernel.org/powerpc/c/73aca179d78eaa11604ba0783a6d8b
>>
>> Thanks for the cc. I guess this only affects powerpc, given that it is
>> the only arch that switches between CRC immediate values and CRC
>> offsets depending on the configuration.
>
> No worries.
>
> Is there any reason we shouldn't always turn on CONFIG_MODULE_REL_CRCS?
> It seems to work, but I wanted to test it more before switching to that,
> hence the quick fix above.
>
>
> arch/um looks like it might be switching based on config, but I don't
> know enough to say:
>
>   config LD_SCRIPT_STATIC
> bool
> default y
> depends on STATIC_LINK
>
>   config LD_SCRIPT_DYN
> bool
> default y
> depends on !LD_SCRIPT_STATIC
>   select MODULE_REL_CRCS if MODVERSIONS
>

The only reason not to enable it is that it ends up taking more space
on a 32-bit architecture with CONFIG_RELOCATABLE=n, given that you
need to record both the relative offset and the actual CRC value (both
32-bit quantities) rather than just the CRC itself. On a 64-bit arch
with CONFIG_RELOCATABLE=n, you end up replacing a single 64-bit
quantity with two 32-bit quantities, so it doesn't really matter.
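
To make the space argument concrete, here is a rough, purely illustrative
userspace sketch of the two schemes (this is not the kernel's implementation;
the resolve helper only mirrors the idea of following a self-relative 32-bit
offset to reach the CRC value):

/* Illustrative only: with absolute CRCs the 32-bit CRC is the only datum
 * stored; with relative CRCs a 32-bit offset is stored and it points at the
 * 32-bit CRC constant, hence two words on a 32-bit !RELOCATABLE build. */
#include <stdint.h>
#include <stdio.h>

static const uint32_t crc_value = 0xdeadbeefu;	/* the CRC constant itself */

static uint32_t resolve_rel_crc(const int32_t *ref)
{
	/* follow the self-relative offset to reach the CRC value */
	return *(const uint32_t *)((const char *)ref + *ref);
}

int main(void)
{
	int32_t ref = (int32_t)((const char *)&crc_value - (const char *)&ref);

	printf("absolute CRC: 0x%08x (one 32-bit word)\n", crc_value);
	printf("relative CRC: 0x%08x (offset + value)\n", resolve_rel_crc(&ref));
	return 0;
}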


[PATCH v10 22/25] mm: speculative page fault handler return VMA

2018-04-17 Thread Laurent Dufour
When the speculative page fault handler returns VM_FAULT_RETRY, there is a
chance that the VMA fetched without grabbing the mmap_sem can be reused by the
legacy page fault handler.  By reusing it, we avoid calling find_vma()
again. To achieve that, we must ensure that the VMA structure will not be
freed behind our back. This is done by taking a reference on it (get_vma())
and by assuming that the caller will call the new service
can_reuse_spf_vma() once it has grabbed the mmap_sem.

can_reuse_spf_vma() first checks that the VMA is still in the RB tree and
that the VMA's boundaries match the passed address, then releases the
reference on the VMA so that it can be freed if needed.

If the VMA has been freed, can_reuse_spf_vma() returns false since the VMA
is no longer in the RB tree.

In the architecture page fault handler, the call to the new service
reuse_spf_or_find_vma() should be made in place of find_vma(); this will
handle the check on the spf_vma and, if needed, call find_vma().

Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h |  22 +++--
 mm/memory.c| 140 -
 2 files changed, 103 insertions(+), 59 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 08540c98d63b..50b6fd3bf9e2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1382,25 +1382,37 @@ extern int handle_mm_fault(struct vm_area_struct *vma, 
unsigned long address,
 #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
 extern int __handle_speculative_fault(struct mm_struct *mm,
  unsigned long address,
- unsigned int flags);
+ unsigned int flags,
+ struct vm_area_struct **vma);
 static inline int handle_speculative_fault(struct mm_struct *mm,
   unsigned long address,
-  unsigned int flags)
+  unsigned int flags,
+  struct vm_area_struct **vma)
 {
/*
 * Try speculative page fault for multithreaded user space task only.
 */
-   if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1)
+   if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1) {
+   *vma = NULL;
return VM_FAULT_RETRY;
-   return __handle_speculative_fault(mm, address, flags);
+   }
+   return __handle_speculative_fault(mm, address, flags, vma);
 }
+extern bool can_reuse_spf_vma(struct vm_area_struct *vma,
+ unsigned long address);
 #else
 static inline int handle_speculative_fault(struct mm_struct *mm,
   unsigned long address,
-  unsigned int flags)
+  unsigned int flags,
+  struct vm_area_struct **vma)
 {
return VM_FAULT_RETRY;
 }
+static inline bool can_reuse_spf_vma(struct vm_area_struct *vma,
+unsigned long address)
+{
+   return false;
+}
 #endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
 
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
diff --git a/mm/memory.c b/mm/memory.c
index 76178feff000..425f07e0bf38 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4311,13 +4311,22 @@ static int __handle_mm_fault(struct vm_area_struct 
*vma, unsigned long address,
 /* This is required by vm_normal_page() */
 #error "Speculative page fault handler requires __HAVE_ARCH_PTE_SPECIAL"
 #endif
-
 /*
  * vm_normal_page() adds some processing which should be done while
 * holding the mmap_sem.
  */
+
+/*
+ * Tries to handle the page fault in a speculative way, without grabbing the
+ * mmap_sem.
+ * When VM_FAULT_RETRY is returned, the vma pointer is valid and this vma must
+ * be checked later when the mmap_sem has been grabbed by calling
+ * can_reuse_spf_vma().
+ * This is needed as the returned vma is kept in memory until the call to
+ * can_reuse_spf_vma() is made.
+ */
 int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
-  unsigned int flags)
+  unsigned int flags, struct vm_area_struct **vma)
 {
struct vm_fault vmf = {
.address = address,
@@ -4325,21 +4334,22 @@ int __handle_speculative_fault(struct mm_struct *mm, 
unsigned long address,
pgd_t *pgd, pgdval;
p4d_t *p4d, p4dval;
pud_t pudval;
-   int seq, ret = VM_FAULT_RETRY;
-   struct vm_area_struct *vma;
+   int seq, ret;
 
/* Clear flags that may lead to release the mmap_sem to retry */
flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
flags |= FAULT_FLAG_SPECULATIVE;
 
-   vma = get_vma(mm, address);

[PATCH v10 25/25] powerpc/mm: add speculative page fault

2018-04-17 Thread Laurent Dufour
This patch enable the speculative page fault on the PowerPC
architecture.

This will try a speculative page fault without holding the mmap_sem,
if it returns with VM_FAULT_RETRY, the mmap_sem is acquired and the
traditional page fault processing is done.

The speculative path is only tried for multithreaded processes as there is
no risk of contention on the mmap_sem otherwise.

Signed-off-by: Laurent Dufour 
---
 arch/powerpc/mm/fault.c | 33 +++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index c01d627e687a..37191147026e 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -464,6 +464,26 @@ static int __do_page_fault(struct pt_regs *regs, unsigned 
long address,
if (is_exec)
flags |= FAULT_FLAG_INSTRUCTION;
 
+   if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) {
+   fault = handle_speculative_fault(mm, address, flags, &vma);
+   /*
+* Page fault is done if VM_FAULT_RETRY is not returned.
+* But if the memory protection keys are active, we don't know
+* if the fault is due to key mismatch or due to a
+* classic protection check.
+* To differentiate that, we will need the VMA we no
+* longer have, so let's retry with the mmap_sem held.
+*/
+   if (fault != VM_FAULT_RETRY &&
+   (IS_ENABLED(CONFIG_PPC_MEM_KEYS) &&
+fault != VM_FAULT_SIGSEGV)) {
+   perf_sw_event(PERF_COUNT_SW_SPF, 1, regs, address);
+   goto done;
+   }
+   } else {
+   vma = NULL;
+   }
+
/* When running in the kernel we expect faults to occur only to
 * addresses in user space.  All other faults represent errors in the
 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -494,7 +514,8 @@ static int __do_page_fault(struct pt_regs *regs, unsigned 
long address,
might_sleep();
}
 
-   vma = find_vma(mm, address);
+   if (!vma || !can_reuse_spf_vma(vma, address))
+   vma = find_vma(mm, address);
if (unlikely(!vma))
return bad_area(regs, address);
if (likely(vma->vm_start <= address))
@@ -551,8 +572,15 @@ static int __do_page_fault(struct pt_regs *regs, unsigned 
long address,
 */
flags &= ~FAULT_FLAG_ALLOW_RETRY;
flags |= FAULT_FLAG_TRIED;
-   if (!fatal_signal_pending(current))
+   if (!fatal_signal_pending(current)) {
+   /*
+* Do not try to reuse this vma and fetch it
+* again since we will release the mmap_sem.
+*/
+   if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT))
+   vma = NULL;
goto retry;
+   }
}
 
/*
@@ -564,6 +592,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned 
long address,
 
up_read(&current->mm->mmap_sem);
 
+done:
if (unlikely(fault & VM_FAULT_ERROR))
return mm_fault_error(regs, address, fault);
 
-- 
2.7.4



[PATCH v10 24/25] x86/mm: add speculative pagefault handling

2018-04-17 Thread Laurent Dufour
From: Peter Zijlstra 

Try a speculative fault before acquiring mmap_sem, if it returns with
VM_FAULT_RETRY continue with the mmap_sem acquisition and do the
traditional fault.

Signed-off-by: Peter Zijlstra (Intel) 

[Clearing of FAULT_FLAG_ALLOW_RETRY is now done in
 handle_speculative_fault()]
[Retry with usual fault path in the case VM_ERROR is returned by
 handle_speculative_fault(). This allows signal to be delivered]
[Don't build SPF call if !CONFIG_SPECULATIVE_PAGE_FAULT]
[Try speculative fault path only for multi threaded processes]
[Try to reuse the VMA fetched during the speculative path in case of retry]
[Call reuse_spf_or_find_vma()]
[Handle memory protection key fault]
Signed-off-by: Laurent Dufour 
---
 arch/x86/mm/fault.c | 42 ++
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 73bd8c95ac71..59f778386df5 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1220,7 +1220,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
struct mm_struct *mm;
int fault, major = 0;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
-   u32 pkey;
+   u32 pkey, *pt_pkey = &pkey;
 
tsk = current;
mm = tsk->mm;
@@ -1310,6 +1310,30 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
flags |= FAULT_FLAG_INSTRUCTION;
 
/*
+* Do not try speculative page fault for kernel's pages and if
+* the fault was due to protection keys since it can't be resolved.
+*/
+   if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT) &&
+   !(error_code & X86_PF_PK)) {
+   fault = handle_speculative_fault(mm, address, flags, &vma);
+   if (fault != VM_FAULT_RETRY) {
+   perf_sw_event(PERF_COUNT_SW_SPF, 1, regs, address);
+   /*
+* Do not advertise for the pkey value since we don't
+* know it.
+* This does not matter as we checked for X86_PF_PK
+* earlier, so we should not handle pkey fault here,
+* but to be sure that mm_fault_error() callees will
+* not try to use it, we invalidate the pointer.
+*/
+   pt_pkey = NULL;
+   goto done;
+   }
+   } else {
+   vma = NULL;
+   }
+
+   /*
 * When running in the kernel we expect faults to occur only to
 * addresses in user space.  All other faults represent errors in
 * the kernel and should generate an OOPS.  Unfortunately, in the
@@ -1342,7 +1366,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
might_sleep();
}
 
-   vma = find_vma(mm, address);
+   if (!vma || !can_reuse_spf_vma(vma, address))
+   vma = find_vma(mm, address);
if (unlikely(!vma)) {
bad_area(regs, error_code, address);
return;
@@ -1409,8 +1434,15 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
if (flags & FAULT_FLAG_ALLOW_RETRY) {
flags &= ~FAULT_FLAG_ALLOW_RETRY;
flags |= FAULT_FLAG_TRIED;
-   if (!fatal_signal_pending(tsk))
+   if (!fatal_signal_pending(tsk)) {
+   /*
+* Do not try to reuse this vma and fetch it
+* again since we will release the mmap_sem.
+*/
+   if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT))
+   vma = NULL;
goto retry;
+   }
}
 
/* User mode? Just return to handle the fatal exception */
@@ -1423,8 +1455,10 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
error_code,
}
 
up_read(&mm->mmap_sem);
+
+done:
if (unlikely(fault & VM_FAULT_ERROR)) {
-   mm_fault_error(regs, error_code, address, &pkey, fault);
+   mm_fault_error(regs, error_code, address, pt_pkey, fault);
return;
}
 
-- 
2.7.4



[PATCH v10 23/25] mm: add speculative page fault vmstats

2018-04-17 Thread Laurent Dufour
Add a speculative_pgfault vmstat counter to count successful speculative page
fault handling.

Also fix a minor typo in the CONFIG_VM_EVENT_COUNTERS comment in mm/vmstat.c.

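(Aside, not part of the patch: once a kernel carrying this counter is running,
the value can be read back from /proc/vmstat, for instance with a minimal
reader like the sketch below; the counter name is taken from the mm/vmstat.c
hunk further down.)

/* Minimal /proc/vmstat reader; only prints the speculative_pgfault line
 * added by this patch. Assumes a kernel carrying this series. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("fopen /proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "speculative_pgfault", 19))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}
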
Signed-off-by: Laurent Dufour 
---
 include/linux/vm_event_item.h | 3 +++
 mm/memory.c   | 1 +
 mm/vmstat.c   | 5 -
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 5c7f010676a7..a240acc09684 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -111,6 +111,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
SWAP_RA,
SWAP_RA_HIT,
 #endif
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+   SPECULATIVE_PGFAULT,
+#endif
NR_VM_EVENT_ITEMS
 };
 
diff --git a/mm/memory.c b/mm/memory.c
index 425f07e0bf38..1cd5bc000643 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4508,6 +4508,7 @@ int __handle_speculative_fault(struct mm_struct *mm, 
unsigned long address,
 * If there is no need to retry, don't return the vma to the caller.
 */
if (ret != VM_FAULT_RETRY) {
+   count_vm_event(SPECULATIVE_PGFAULT);
put_vma(vmf.vma);
*vma = NULL;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 536332e988b8..c6b49bfa8139 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1289,7 +1289,10 @@ const char * const vmstat_text[] = {
"swap_ra",
"swap_ra_hit",
 #endif
-#endif /* CONFIG_VM_EVENTS_COUNTERS */
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+   "speculative_pgfault"
+#endif
+#endif /* CONFIG_VM_EVENT_COUNTERS */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
 
-- 
2.7.4



[PATCH v10 21/25] perf tools: add support for the SPF perf event

2018-04-17 Thread Laurent Dufour
Add support for the new speculative faults event.

Acked-by: David Rientjes 
Signed-off-by: Laurent Dufour 
---
 tools/include/uapi/linux/perf_event.h | 1 +
 tools/perf/util/evsel.c   | 1 +
 tools/perf/util/parse-events.c| 4 
 tools/perf/util/parse-events.l| 1 +
 tools/perf/util/python.c  | 1 +
 5 files changed, 8 insertions(+)

diff --git a/tools/include/uapi/linux/perf_event.h 
b/tools/include/uapi/linux/perf_event.h
index 912b85b52344..9aad243607fe 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -112,6 +112,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS  = 8,
PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT= 10,
+   PERF_COUNT_SW_SPF   = 11,
 
PERF_COUNT_SW_MAX,  /* non-ABI */
 };
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 1ac8d9236efd..e14a754c3675 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -429,6 +429,7 @@ const char *perf_evsel__sw_names[PERF_COUNT_SW_MAX] = {
"alignment-faults",
"emulation-faults",
"dummy",
+   "speculative-faults",
 };
 
 static const char *__perf_evsel__sw_name(u64 config)
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 2fb0272146d8..54719f566314 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -140,6 +140,10 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = {
.symbol = "bpf-output",
.alias  = "",
},
+   [PERF_COUNT_SW_SPF] = {
+   .symbol = "speculative-faults",
+   .alias  = "spf",
+   },
 };
 
 #define __PERF_EVENT_FIELD(config, name) \
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index a1a01b1ac8b8..86584d3a3068 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -308,6 +308,7 @@ emulation-faults{ return 
sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EM
 dummy  { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); }
 duration_time  { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); }
 bpf-output { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); }
+speculative-faults|spf { return sym(yyscanner, 
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_SPF); }
 
/*
 * We have to handle the kernel PMU event 
cycles-ct/cycles-t/mem-loads/mem-stores separately.
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index 863b61478edd..df4f7ff9bdff 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -1181,6 +1181,7 @@ static struct {
PERF_CONST(COUNT_SW_ALIGNMENT_FAULTS),
PERF_CONST(COUNT_SW_EMULATION_FAULTS),
PERF_CONST(COUNT_SW_DUMMY),
+   PERF_CONST(COUNT_SW_SPF),
 
PERF_CONST(SAMPLE_IP),
PERF_CONST(SAMPLE_TID),
-- 
2.7.4



[PATCH v10 20/25] perf: add a speculative page fault sw event

2018-04-17 Thread Laurent Dufour
Add a new software event to count successful speculative page faults.

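(Aside, not part of the patch: a userspace consumer can count this event
through perf_event_open(); the sketch below hard-codes the PERF_COUNT_SW_SPF
value introduced by this patch, so it only makes sense on a kernel carrying
this series.)

/* Count speculative page faults for the current process using the new
 * software event. PERF_COUNT_SW_SPF (11) comes from this patch and is not
 * part of mainline uapi headers. */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

#ifndef PERF_COUNT_SW_SPF
#define PERF_COUNT_SW_SPF 11
#endif

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_SPF;
	attr.disabled = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... run the workload to be measured here ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("speculative faults: %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}
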
Acked-by: David Rientjes 
Signed-off-by: Laurent Dufour 
---
 include/uapi/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 912b85b52344..9aad243607fe 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -112,6 +112,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS  = 8,
PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT= 10,
+   PERF_COUNT_SW_SPF   = 11,
 
PERF_COUNT_SW_MAX,  /* non-ABI */
 };
-- 
2.7.4



[PATCH v10 19/25] mm: adding speculative page fault failure trace events

2018-04-17 Thread Laurent Dufour
This patch adds a set of new trace events to collect the speculative page
fault failures.

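(Aside, not part of the patch: the new events are grouped under the
'pagefault' trace system, so they can be enabled from userspace with
something as small as the sketch below; the tracefs path is assumed to be the
usual debugfs mount point and may differ on some systems.)

/* Enable all pagefault:* tracepoints added by this patch; the failures then
 * show up in the trace buffer (trace_pipe). Needs root. */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/kernel/debug/tracing/events/pagefault/enable";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("1\n", f);
	fclose(f);
	return 0;
}
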
Signed-off-by: Laurent Dufour 
---
 include/trace/events/pagefault.h | 88 
 mm/memory.c  | 62 ++--
 2 files changed, 137 insertions(+), 13 deletions(-)
 create mode 100644 include/trace/events/pagefault.h

diff --git a/include/trace/events/pagefault.h b/include/trace/events/pagefault.h
new file mode 100644
index ..a9643b3759f2
--- /dev/null
+++ b/include/trace/events/pagefault.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM pagefault
+
+#if !defined(_TRACE_PAGEFAULT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGEFAULT_H
+
+#include 
+#include 
+
+DECLARE_EVENT_CLASS(spf,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address),
+
+   TP_STRUCT__entry(
+   __field(unsigned long, caller)
+   __field(unsigned long, vm_start)
+   __field(unsigned long, vm_end)
+   __field(unsigned long, address)
+   ),
+
+   TP_fast_assign(
+   __entry->caller = caller;
+   __entry->vm_start   = vma->vm_start;
+   __entry->vm_end = vma->vm_end;
+   __entry->address= address;
+   ),
+
+   TP_printk("ip:%lx vma:%lx-%lx address:%lx",
+ __entry->caller, __entry->vm_start, __entry->vm_end,
+ __entry->address)
+);
+
+DEFINE_EVENT(spf, spf_pte_lock,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_changed,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_noanon,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_notsup,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_access,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_pmd_changed,
+
+   TP_PROTO(unsigned long caller,
+struct vm_area_struct *vma, unsigned long address),
+
+   TP_ARGS(caller, vma, address)
+);
+
+#endif /* _TRACE_PAGEFAULT_H */
+
+/* This part must be outside protection */
+#include 
diff --git a/mm/memory.c b/mm/memory.c
index 8addf78deadb..76178feff000 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -80,6 +80,9 @@
 
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include 
+
 #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for 
last_cpupid.
 #endif
@@ -2317,8 +2320,10 @@ static bool pte_spinlock(struct vm_fault *vmf)
}
 
local_irq_disable();
-   if (vma_has_changed(vmf))
+   if (vma_has_changed(vmf)) {
+   trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
+   }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
@@ -2326,16 +2331,21 @@ static bool pte_spinlock(struct vm_fault *vmf)
 * is not a huge collapse operation in progress in our back.
 */
pmdval = READ_ONCE(*vmf->pmd);
-   if (!pmd_same(pmdval, vmf->orig_pmd))
+   if (!pmd_same(pmdval, vmf->orig_pmd)) {
+   trace_spf_pmd_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
+   }
 #endif
 
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-   if (unlikely(!spin_trylock(vmf->ptl)))
+   if (unlikely(!spin_trylock(vmf->ptl))) {
+   trace_spf_pte_lock(_RET_IP_, vmf->vma, vmf->address);
goto out;
+   }
 
if (vma_has_changed(vmf)) {
spin_unlock(vmf->ptl);
+   trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
}
 
@@ -2368,8 +2378,10 @@ static bool pte_map_lock(struct vm_fault *vmf)
 * block on the PTL and thus we're safe.
 */
local_irq_disable();
-   if (vma_has_changed(vmf))
+   if (vma_has_changed(vmf)) {
+   trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
goto out;
+   }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
@@ -2377,8 +2389,10 @@ static bool pte_map_lock(struct vm_fault *vmf)
 * is not a huge collapse operation in progress in our back.

[PATCH v10 18/25] mm: provide speculative fault infrastructure

2018-04-17 Thread Laurent Dufour
From: Peter Zijlstra 

Provide infrastructure to do a speculative fault (not holding
mmap_sem).

The not holding of mmap_sem means we can race against VMA
change/removal and page-table destruction. We use the SRCU VMA freeing
to keep the VMA around. We use the VMA seqcount to detect change
(including unmapping / page-table deletion) and we use gup_fast() style
page-table walking to deal with page-table races.

Once we've obtained the page and are ready to update the PTE, we
validate if the state we started the fault with is still valid, if
not, we'll fail the fault with VM_FAULT_RETRY, otherwise we update the
PTE and we're done.

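(Aside, not part of the patch: in userspace terms, the validation scheme
boils down to the seqcount pattern sketched below: snapshot the sequence,
copy the fields, and only trust the copy if the sequence is even and
unchanged afterwards. All names in the sketch are made up for illustration;
the kernel code uses the vm_sequence seqcount and READ_ONCE/WRITE_ONCE
rather than C11 atomics.)

/* Userspace sketch of the seqcount validation used by the speculative path.
 * The plain reads of start/end mimic the kernel's lockless access; a real
 * implementation would use READ_ONCE/WRITE_ONCE equivalents. */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

struct fake_vma {
	atomic_uint seq;	/* odd while a writer is mid-update */
	unsigned long start;
	unsigned long end;
};

static struct fake_vma vma = { .start = 0x1000, .end = 0x2000 };

static void writer_update(unsigned long start, unsigned long end)
{
	atomic_fetch_add_explicit(&vma.seq, 1, memory_order_release); /* begin */
	vma.start = start;
	vma.end = end;
	atomic_fetch_add_explicit(&vma.seq, 1, memory_order_release); /* end */
}

static int speculative_read(unsigned long *start, unsigned long *end)
{
	unsigned int seq = atomic_load_explicit(&vma.seq, memory_order_acquire);

	if (seq & 1)
		return 0;		/* writer in progress: abort and retry */
	*start = vma.start;
	*end = vma.end;
	/* re-check: any change means our copy cannot be trusted */
	return atomic_load_explicit(&vma.seq, memory_order_acquire) == seq;
}

static void *writer_thread(void *arg)
{
	for (unsigned long i = 0; i < 100000; i++)
		writer_update(0x1000 * i, 0x1000 * i + 0x1000);
	return arg;
}

int main(void)
{
	pthread_t t;
	unsigned long s, e, ok = 0, retry = 0;

	pthread_create(&t, NULL, writer_thread, NULL);
	for (int i = 0; i < 100000; i++) {
		while (!speculative_read(&s, &e))
			retry++;
		ok++;
	}
	pthread_join(t, NULL);
	printf("%lu consistent reads, %lu retries\n", ok, retry);
	return 0;
}
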
Signed-off-by: Peter Zijlstra (Intel) 

[Manage the newly introduced pte_spinlock() for speculative page
 fault to fail if the VMA is touched in our back]
[Rename vma_is_dead() to vma_has_changed() and declare it here]
[Fetch p4d and pud]
[Set vmd.sequence in __handle_mm_fault()]
[Abort speculative path when handle_userfault() has to be called]
[Add additional VMA's flags checks in handle_speculative_fault()]
[Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()]
[Don't set vmf->pte and vmf->ptl if pte_map_lock() failed]
[Remove warning comment about waiting for !seq&1 since we don't want
 to wait]
[Remove warning about no huge page support, mention it explicitly]
[Don't call do_fault() in the speculative path as __do_fault() calls
 vma->vm_ops->fault() which may want to release mmap_sem]
[Only vm_fault pointer argument for vma_has_changed()]
[Fix check against huge page, calling pmd_trans_huge()]
[Use READ_ONCE() when reading VMA's fields in the speculative path]
[Explicitly check for __HAVE_ARCH_PTE_SPECIAL as we can't support for
 processing done in vm_normal_page()]
[Check that vma->anon_vma is already set when starting the speculative
 path]
[Check for memory policy as we can't support MPOL_INTERLEAVE case due to
 the processing done in mpol_misplaced()]
[Don't support VMA growing up or down]
[Move check on vm_sequence just before calling handle_pte_fault()]
[Don't build SPF services if !CONFIG_SPECULATIVE_PAGE_FAULT]
[Add mem cgroup oom check]
[Use READ_ONCE to access p*d entries]
[Replace deprecated ACCESS_ONCE() by READ_ONCE() in vma_has_changed()]
[Don't fetch pte again in handle_pte_fault() when running the speculative
 path]
[Check PMD against concurrent collapsing operation]
[Try spin lock the pte during the speculative path to avoid deadlock with
 other CPU's invalidating the TLB and requiring this CPU to catch the
 inter processor's interrupt]
[Move define of FAULT_FLAG_SPECULATIVE here]
[Introduce __handle_speculative_fault() and add a check against
 mm->mm_users in handle_speculative_fault() defined in mm.h]
Signed-off-by: Laurent Dufour 
---
 include/linux/hugetlb_inline.h |   2 +-
 include/linux/mm.h |  30 
 include/linux/pagemap.h|   4 +-
 mm/internal.h  |  16 +-
 mm/memory.c| 340 -
 5 files changed, 385 insertions(+), 7 deletions(-)

diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 0660a03d37d9..9e25283d6fc9 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -8,7 +8,7 @@
 
 static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
-   return !!(vma->vm_flags & VM_HUGETLB);
+   return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);
 }
 
 #else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e2c24ea58d94..08540c98d63b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -309,6 +309,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_USER0x40/* The fault originated in 
userspace */
 #define FAULT_FLAG_REMOTE  0x80/* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100  /* The fault was during an instruction 
fetch */
+#define FAULT_FLAG_SPECULATIVE 0x200   /* Speculative fault, not holding 
mmap_sem */
 
 #define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
@@ -337,6 +338,10 @@ struct vm_fault {
gfp_t gfp_mask; /* gfp mask to be used for allocations 
*/
pgoff_t pgoff;  /* Logical page offset based on vma */
unsigned long address;  /* Faulting virtual address */
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+   unsigned int sequence;
+   pmd_t orig_pmd; /* value of PMD at the time of fault */
+#endif
pmd_t *pmd; /* Pointer to pmd entry matching
 * the 'address' */
pud_t *pud; /* Pointer to pud entry matching
@@ -1373,6 +1378,31 @@ int invalidate_inode_page(struct page *page);
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags);
+
+#ifdef 

[PATCH v10 17/25] mm: protect mm_rb tree with a rwlock

2018-04-17 Thread Laurent Dufour
This change is inspired by Peter's proposal patch [1] which was
protecting the VMA using SRCU. Unfortunately, SRCU does not scale well in
that particular case, and it introduces major performance degradation
due to excessive scheduling operations.

To allow access to the mm_rb tree without grabbing the mmap_sem, this patch
protects access to it using a rwlock.  As a lookup in the mm_rb tree is an
O(log n) search, it is safe to protect it using such a lock.  The VMA cache
is not protected by the new rwlock and it should not be used without holding
the mmap_sem.

To allow the picked VMA structure to be used once the rwlock is released, a
use count is added to the VMA structure. When the VMA is allocated it is set
to 1.  Each time the VMA is picked with the rwlock held its use count is
incremented. Each time the VMA is released it is decremented. When the use
count hits zero, this means that the VMA is no longer used and should be
freed.

This patch is preparing for 2 kinds of VMA access:
 - as usual, under the control of the mmap_sem,
 - without holding the mmap_sem for the speculative page fault handler.

Access done under the control of the mmap_sem doesn't require grabbing the
rwlock to protect read access to the mm_rb tree, but write access must
be done under the protection of the rwlock too. This affects inserting and
removing elements in the RB tree.

The patch introduces 2 new functions:
 - vma_get() to find a VMA based on an address by holding the new rwlock.
 - vma_put() to release the VMA when it is no longer used.
These services are designed to be used when accesses are made to the RB tree
without holding the mmap_sem.
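
(Aside, not part of the patch: the lookup plus reference counting protocol
described above is, in userspace terms, the classic pattern sketched below,
where the rwlock only protects the lookup structure and the per-object
reference count keeps the found object alive after the lock is dropped. All
names are made up for the example.)

/* Sketch of the vma_get()/vma_put() idea with pthreads, not kernel code. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>
#include <stdio.h>

struct obj {
	atomic_int refcount;	/* 1 at creation, as for vm_ref_count */
	unsigned long key;
};

static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct obj *the_obj;	/* stand-in for the RB tree */

static struct obj *obj_get(unsigned long key)
{
	struct obj *o;

	pthread_rwlock_rdlock(&tree_lock);
	o = the_obj && the_obj->key == key ? the_obj : NULL;
	if (o)
		atomic_fetch_add(&o->refcount, 1);	/* pin before unlock */
	pthread_rwlock_unlock(&tree_lock);
	return o;
}

static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcount, 1) == 1)
		free(o);	/* last user frees it */
}

int main(void)
{
	struct obj *o, *removed;

	the_obj = calloc(1, sizeof(*the_obj));
	the_obj->key = 42;
	atomic_store(&the_obj->refcount, 1);

	o = obj_get(42);		/* lock-protected lookup + pin: refcount 2 */

	pthread_rwlock_wrlock(&tree_lock);
	removed = the_obj;
	the_obj = NULL;			/* "remove from the tree" */
	pthread_rwlock_unlock(&tree_lock);
	obj_put(removed);		/* remover drops the creation reference */

	printf("key %lu still usable after removal\n", o->key);
	obj_put(o);			/* last reference gone, object freed */
	return 0;
}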

When a VMA is removed from the RB tree, its vma->vm_rb field is cleared and
we rely on the WMB done when releasing the rwlock to serialize the write
with the RMB done in a later patch to check for the VMA's validity.

When free_vma is called, the file associated with the VMA is closed
immediately, but the policy and the file structure remain in use until
the VMA's use count reaches 0, which may happen later when exiting an
in-progress speculative page fault.

[1] https://patchwork.kernel.org/patch/5108281/

Cc: Peter Zijlstra (Intel) 
Cc: Matthew Wilcox 
Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h   |   1 +
 include/linux/mm_types.h |   4 ++
 kernel/fork.c|   3 ++
 mm/init-mm.c |   3 ++
 mm/internal.h|   6 +++
 mm/mmap.c| 115 +++
 6 files changed, 104 insertions(+), 28 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f967bf84094f..e2c24ea58d94 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1272,6 +1272,7 @@ static inline void INIT_VMA(struct vm_area_struct *vma)
INIT_LIST_HEAD(&vma->anon_vma_chain);
 #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
	seqcount_init(&vma->vm_sequence);
+   atomic_set(&vma->vm_ref_count, 1);
 #endif
 }
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index db5e9d630e7a..faf3844dd815 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -337,6 +337,7 @@ struct vm_area_struct {
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
seqcount_t vm_sequence;
+   atomic_t vm_ref_count;  /* see vma_get(), vma_put() */
 #endif
 } __randomize_layout;
 
@@ -355,6 +356,9 @@ struct kioctx_table;
 struct mm_struct {
struct vm_area_struct *mmap;/* list of VMAs */
struct rb_root mm_rb;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+   rwlock_t mm_rb_lock;
+#endif
u32 vmacache_seqnum;   /* per-thread vmacache */
 #ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
diff --git a/kernel/fork.c b/kernel/fork.c
index d937e5945f77..9f8d235a3df8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -891,6 +891,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, 
struct task_struct *p,
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
mm->vmacache_seqnum = 0;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+   rwlock_init(&mm->mm_rb_lock);
+#endif
	atomic_set(&mm->mm_users, 1);
	atomic_set(&mm->mm_count, 1);
	init_rwsem(&mm->mmap_sem);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index f94d5d15ebc0..e71ac37a98c4 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -17,6 +17,9 @@
 
 struct mm_struct init_mm = {
.mm_rb  = RB_ROOT,
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+   .mm_rb_lock = __RW_LOCK_UNLOCKED(init_mm.mm_rb_lock),
+#endif
.pgd= swapper_pg_dir,
.mm_users   = ATOMIC_INIT(2),
.mm_count   = ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index 62d8c34e63d5..fb2667b20f0a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -40,6 +40,12 @@ void page_writeback_init(void);
 
 int do_swap_page(struct vm_fault *vmf);
 

[PATCH v10 16/25] mm: introduce __page_add_new_anon_rmap()

2018-04-17 Thread Laurent Dufour
When dealing with the speculative page fault handler, we may race with a VMA
being split or merged. In this case the vma->vm_start and vma->vm_end
fields may not match the address the page fault is occurring at.

This can only happen when the VMA is split, but in that case the
anon_vma pointer of the new VMA will be the same as the original one,
because in __split_vma the new->anon_vma is set to src->anon_vma when
*new = *vma.

So even if the VMA boundaries are not correct, the anon_vma pointer is
still valid.

If the VMA has been merged, then the VMA into which it has been merged
must have the same anon_vma pointer, otherwise the merge can't be done.

So in all cases we know that the anon_vma is valid: we have checked,
before starting the speculative page fault, that the anon_vma pointer is
valid for this VMA, and since there is an anon_vma a page has been backed
at some time; before the VMA is cleaned, the page table lock has to be
grabbed to clean the PTE, and the anon_vma field is checked once the PTE
is locked.

This patch introduces a new __page_add_new_anon_rmap() service which
doesn't check for the VMA boundaries, and creates a new inline one
which does the check.

When called from a page fault handler, if this is not a speculative one,
there is a guarantee that vm_start and vm_end match the faulting address,
so this check is useless. In the context of the speculative page fault
handler, this check may be wrong but anon_vma is still valid as explained
above.

Signed-off-by: Laurent Dufour 
---
 include/linux/rmap.h | 12 ++--
 mm/memory.c  |  8 
 mm/rmap.c|  5 ++---
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 988d176472df..a5d282573093 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -174,8 +174,16 @@ void page_add_anon_rmap(struct page *, struct 
vm_area_struct *,
unsigned long, bool);
 void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
   unsigned long, int);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
-   unsigned long, bool);
+void __page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+ unsigned long, bool);
+static inline void page_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address, bool compound)
+{
+   VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+   __page_add_new_anon_rmap(page, vma, address, compound);
+}
+
 void page_add_file_rmap(struct page *, bool);
 void page_remove_rmap(struct page *, bool);
 
diff --git a/mm/memory.c b/mm/memory.c
index d9146a0c3d25..9c220ac0e2c5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2563,7 +2563,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 * thread doing COW.
 */
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
-   page_add_new_anon_rmap(new_page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
__lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags);
/*
@@ -3100,7 +3100,7 @@ int do_swap_page(struct vm_fault *vmf)
 
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
-   page_add_new_anon_rmap(page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
} else {
@@ -3251,7 +3251,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
}
 
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-   page_add_new_anon_rmap(page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
 setpte:
@@ -3505,7 +3505,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup 
*memcg,
/* copy-on-write page */
if (write && !(vmf->vma_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-   page_add_new_anon_rmap(page, vma, vmf->address, false);
+   __page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
} else {
diff --git a/mm/rmap.c b/mm/rmap.c
index 8d5337fed37b..9307f6140796 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1136,7 

[PATCH v10 15/25] mm: introduce __vm_normal_page()

2018-04-17 Thread Laurent Dufour
When dealing with the speculative fault path we should use the VMA's cached
field values stored in the vm_fault structure.

Currently vm_normal_page() is using the pointer to the VMA to fetch the
vm_flags value. This patch provides a new __vm_normal_page() which receives
the vm_flags value as a parameter.

Note: the speculative path is only turned on for architectures providing
support for the special PTE flag, so only the first block of vm_normal_page()
is used during the speculative path.

Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h | 18 +++---
 mm/memory.c| 25 -
 2 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c65205c8c558..f967bf84094f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1275,9 +1275,21 @@ static inline void INIT_VMA(struct vm_area_struct *vma)
 #endif
 }
 
-struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-pte_t pte, bool with_public_device);
-#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)
+struct page *__vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, bool with_public_device,
+ unsigned long vma_flags);
+static inline struct page *_vm_normal_page(struct vm_area_struct *vma,
+   unsigned long addr, pte_t pte,
+   bool with_public_device)
+{
+   return __vm_normal_page(vma, addr, pte, with_public_device,
+   vma->vm_flags);
+}
+static inline struct page *vm_normal_page(struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte)
+{
+   return _vm_normal_page(vma, addr, pte, false);
+}
 
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
diff --git a/mm/memory.c b/mm/memory.c
index 47af9e97f02a..d9146a0c3d25 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -780,7 +780,8 @@ static void print_bad_pte(struct vm_area_struct *vma, 
unsigned long addr,
 }
 
 /*
- * vm_normal_page -- This function gets the "struct page" associated with a 
pte.
+ * __vm_normal_page -- This function gets the "struct page" associated with
+ * a pte.
  *
  * "Special" mappings do not wish to be associated with a "struct page" (either
  * it doesn't exist, or it exists but they don't want to touch it). In this
@@ -826,8 +827,9 @@ static void print_bad_pte(struct vm_area_struct *vma, 
unsigned long addr,
 #else
 # define HAVE_PTE_SPECIAL 0
 #endif
-struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-pte_t pte, bool with_public_device)
+struct page *__vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, bool with_public_device,
+ unsigned long vma_flags)
 {
unsigned long pfn = pte_pfn(pte);
 
@@ -836,7 +838,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, 
unsigned long addr,
goto check_pfn;
if (vma->vm_ops && vma->vm_ops->find_special_page)
return vma->vm_ops->find_special_page(vma, addr);
-   if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+   if (vma_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
if (is_zero_pfn(pfn))
return NULL;
@@ -867,9 +869,13 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, 
unsigned long addr,
}
 
/* !HAVE_PTE_SPECIAL case follows: */
+   /*
+* This part should never get called when CONFIG_SPECULATIVE_PAGE_FAULT
+* is set. This is mainly because we can't rely on vm_start.
+*/
 
-   if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
-   if (vma->vm_flags & VM_MIXEDMAP) {
+   if (unlikely(vma_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+   if (vma_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
return NULL;
goto out;
@@ -878,7 +884,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, 
unsigned long addr,
off = (addr - vma->vm_start) >> PAGE_SHIFT;
if (pfn == vma->vm_pgoff + off)
return NULL;
-   if (!is_cow_mapping(vma->vm_flags))
+   if (!is_cow_mapping(vma_flags))
return NULL;
}
}
@@ -2743,7 +2749,8 @@ static int do_wp_page(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
 
-   vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+   vmf->page = __vm_normal_page(vma, 

[PATCH v10 14/25] mm: introduce __lru_cache_add_active_or_unevictable

2018-04-17 Thread Laurent Dufour
The speculative page fault handler, which is run without holding the
mmap_sem, is calling lru_cache_add_active_or_unevictable() but the vm_flags
is not guaranteed to remain constant.
Introduce __lru_cache_add_active_or_unevictable(), which takes the vma flags
value as a parameter instead of the vma pointer.

Acked-by: David Rientjes 
Signed-off-by: Laurent Dufour 
---
 include/linux/swap.h | 10 --
 mm/memory.c  |  8 
 mm/swap.c|  6 +++---
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1985940af479..a7dc37e0e405 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -338,8 +338,14 @@ extern void deactivate_file_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
 extern void swap_setup(void);
 
-extern void lru_cache_add_active_or_unevictable(struct page *page,
-   struct vm_area_struct *vma);
+extern void __lru_cache_add_active_or_unevictable(struct page *page,
+   unsigned long vma_flags);
+
+static inline void lru_cache_add_active_or_unevictable(struct page *page,
+   struct vm_area_struct *vma)
+{
+   return __lru_cache_add_active_or_unevictable(page, vma->vm_flags);
+}
 
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
diff --git a/mm/memory.c b/mm/memory.c
index e28cbbae3f3d..47af9e97f02a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2559,7 +2559,7 @@ static int wp_page_copy(struct vm_fault *vmf)
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(new_page, vma);
+   __lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags);
/*
 * We call the notify macro here because, when using secondary
 * mmu page tables (such as kvm shadow page tables), we want the
@@ -3095,7 +3095,7 @@ int do_swap_page(struct vm_fault *vmf)
if (unlikely(page != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
} else {
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
mem_cgroup_commit_charge(page, memcg, true, false);
@@ -3246,7 +3246,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
 setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
@@ -3500,7 +3500,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup 
*memcg,
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-   lru_cache_add_active_or_unevictable(page, vma);
+   __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
diff --git a/mm/swap.c b/mm/swap.c
index 3dd518832096..f2f9c587246f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -455,12 +455,12 @@ void lru_cache_add(struct page *page)
  * directly back onto it's zone's unevictable list, it does NOT use a
  * per cpu pagevec.
  */
-void lru_cache_add_active_or_unevictable(struct page *page,
-struct vm_area_struct *vma)
+void __lru_cache_add_active_or_unevictable(struct page *page,
+  unsigned long vma_flags)
 {
VM_BUG_ON_PAGE(PageLRU(page), page);
 
-   if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
+   if (likely((vma_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
SetPageActive(page);
else if (!TestSetPageMlocked(page)) {
/*
-- 
2.7.4



[PATCH v10 13/25] mm/migrate: Pass vm_fault pointer to migrate_misplaced_page()

2018-04-17 Thread Laurent Dufour
migrate_misplaced_page() is only called during page fault handling, so
it's better to pass the pointer to the struct vm_fault instead of the vma.

This way, during the speculative page fault path, the saved vma->vm_flags
can be used.

Acked-by: David Rientjes 
Signed-off-by: Laurent Dufour 
---
 include/linux/migrate.h | 4 ++--
 mm/memory.c | 2 +-
 mm/migrate.c| 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index f2b4abbca55e..fd4c3ab7bd9c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -126,14 +126,14 @@ static inline void __ClearPageMovable(struct page *page)
 #ifdef CONFIG_NUMA_BALANCING
 extern bool pmd_trans_migrating(pmd_t pmd);
 extern int migrate_misplaced_page(struct page *page,
- struct vm_area_struct *vma, int node);
+ struct vm_fault *vmf, int node);
 #else
 static inline bool pmd_trans_migrating(pmd_t pmd)
 {
return false;
 }
 static inline int migrate_misplaced_page(struct page *page,
-struct vm_area_struct *vma, int node)
+struct vm_fault *vmf, int node)
 {
return -EAGAIN; /* can't migrate now */
 }
diff --git a/mm/memory.c b/mm/memory.c
index 2fb9920e06a5..e28cbbae3f3d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3894,7 +3894,7 @@ static int do_numa_page(struct vm_fault *vmf)
}
 
/* Migrate to the requested node */
-   migrated = migrate_misplaced_page(page, vma, target_nid);
+   migrated = migrate_misplaced_page(page, vmf, target_nid);
if (migrated) {
page_nid = target_nid;
flags |= TNF_MIGRATED;
diff --git a/mm/migrate.c b/mm/migrate.c
index 44d7007cfc1c..5d5cf9b5ac16 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1944,7 +1944,7 @@ bool pmd_trans_migrating(pmd_t pmd)
  * node. Caller is expected to have an elevated reference count on
  * the page that will be dropped by this function before returning.
  */
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+int migrate_misplaced_page(struct page *page, struct vm_fault *vmf,
   int node)
 {
pg_data_t *pgdat = NODE_DATA(node);
@@ -1957,7 +1957,7 @@ int migrate_misplaced_page(struct page *page, struct 
vm_area_struct *vma,
 * with execute permissions as they are probably shared libraries.
 */
if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
-   (vma->vm_flags & VM_EXEC))
+   (vmf->vma_flags & VM_EXEC))
goto out;
 
/*
-- 
2.7.4



[PATCH v10 12/25] mm: cache some VMA fields in the vm_fault structure

2018-04-17 Thread Laurent Dufour
When handling a speculative page fault, the vma->vm_flags and
vma->vm_page_prot fields are read once the page table lock is released, so
there is no longer a guarantee that these fields will not change behind our
back. They are saved in the vm_fault structure before the VMA is checked for
changes.

This patch also sets the fields in hugetlb_no_page() and
__collapse_huge_page_swapin() even if they are not needed by the callee.

Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h | 10 --
 mm/huge_memory.c   |  6 +++---
 mm/hugetlb.c   |  2 ++
 mm/khugepaged.c|  2 ++
 mm/memory.c| 50 ++
 mm/migrate.c   |  2 +-
 6 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f6edd15563bc..c65205c8c558 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -367,6 +367,12 @@ struct vm_fault {
 * page table to avoid allocation from
 * atomic context.
 */
+   /*
+* These entries are required when handling speculative page fault.
+* This way the page handling is done using consistent field values.
+*/
+   unsigned long vma_flags;
+   pgprot_t vma_page_prot;
 };
 
 /* page entry size for vm->huge_fault() */
@@ -687,9 +693,9 @@ void free_compound_page(struct page *page);
  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
  * that do not have writing enabled, when used by access_process_vm.
  */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+static inline pte_t maybe_mkwrite(pte_t pte, unsigned long vma_flags)
 {
-   if (likely(vma->vm_flags & VM_WRITE))
+   if (likely(vma_flags & VM_WRITE))
pte = pte_mkwrite(pte);
return pte;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a3a1815f8e11..da2afda67e68 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1194,8 +1194,8 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault 
*vmf, pmd_t orig_pmd,
 
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
pte_t entry;
-   entry = mk_pte(pages[i], vma->vm_page_prot);
-   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+   entry = mk_pte(pages[i], vmf->vma_page_prot);
+   entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
@@ -2168,7 +2168,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct 
*vma, pmd_t *pmd,
entry = pte_swp_mksoft_dirty(entry);
} else {
entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
-   entry = maybe_mkwrite(entry, vma);
+   entry = maybe_mkwrite(entry, vma->vm_flags);
if (!write)
entry = pte_wrprotect(entry);
if (!young)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 218679138255..774864153407 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3718,6 +3718,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
.vma = vma,
.address = address,
.flags = flags,
+   .vma_flags = vma->vm_flags,
+   .vma_page_prot = vma->vm_page_prot,
/*
 * Hard to debug if it ends up being
 * used by a callee that assumes
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0b28af4b950d..2b02a9f9589e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -887,6 +887,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct 
*mm,
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
.pgoff = linear_page_index(vma, address),
+   .vma_flags = vma->vm_flags,
+   .vma_page_prot = vma->vm_page_prot,
};
 
/* we only decide to swapin, if there is enough young ptes */
diff --git a/mm/memory.c b/mm/memory.c
index f76f5027d251..2fb9920e06a5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1826,7 +1826,7 @@ static int insert_pfn(struct vm_area_struct *vma, 
unsigned long addr,
 out_mkwrite:
if (mkwrite) {
entry = pte_mkyoung(entry);
-   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+   entry = maybe_mkwrite(pte_mkdirty(entry), vma->vm_flags);
}
 
set_pte_at(mm, addr, pte, entry);
@@ -2472,7 +2472,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
 

[PATCH v10 11/25] mm: protect SPF handler against anon_vma changes

2018-04-17 Thread Laurent Dufour
The speculative page fault handler must be protected against anon_vma
changes. This is because page_add_new_anon_rmap() is called during the
speculative path.

In addition, don't try a speculative page fault if the VMA doesn't have an
anon_vma structure allocated, because its allocation should be
protected by the mmap_sem.

In __vma_adjust() when importer->anon_vma is set, there is no need to
protect against speculative page faults since a speculative page fault
is aborted if the vma->anon_vma is not set.

When calling page_add_new_anon_rmap(), vma->anon_vma is necessarily
valid since we checked for it when locking the pte, and the anon_vma is
only removed once the pte is unlocked. So even if the speculative page
fault handler is running concurrently with do_unmap(), the pte is
locked in unmap_region() - through unmap_vmas() - and the anon_vma is
unlinked later; since we check the vma sequence counter, which is
updated in unmap_page_range() before locking the pte and again in
free_pgtables(), the change will be detected when locking the pte.

Signed-off-by: Laurent Dufour 
---
 mm/memory.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index f7fed053df80..f76f5027d251 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -624,7 +624,9 @@ void free_pgtables(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
 * Hide vma from rmap and truncate_pagecache before freeing
 * pgtables
 */
+   vm_write_begin(vma);
unlink_anon_vmas(vma);
+   vm_write_end(vma);
unlink_file_vma(vma);
 
if (is_vm_hugetlb_page(vma)) {
@@ -638,7 +640,9 @@ void free_pgtables(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
   && !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
+   vm_write_begin(vma);
unlink_anon_vmas(vma);
+   vm_write_end(vma);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
-- 
2.7.4



[PATCH v10 10/25] mm: protect mremap() against SPF hanlder

2018-04-17 Thread Laurent Dufour
If a thread is remapping an area while another one is faulting on the
destination area, the SPF handler may fetch the vma from the RB tree before
the pte has been moved by the other thread. This means that the moved ptes
will overwrite those created by the page fault handler, leading to leaked
pages.

CPU 1   CPU2
enter mremap()
unmap the dest area
copy_vma()  Enter speculative page fault handler
   >> at this time the dest area is present in the RB tree
fetch the vma matching dest area
create a pte as the VMA matched
Exit the SPF handler

move_ptes()
  > it is assumed that the dest area is empty,
  > the move ptes overwrite the page mapped by the CPU2.

To prevent that, when the VMA matching the dest area is extended or created
by copy_vma(), it should be marked as not available to the SPF handler.
The usual way to do so is to rely on vm_write_begin()/end().
This is already done in __vma_adjust(), called by copy_vma() (through
vma_merge()). But __vma_adjust() is calling vm_write_end() before returning,
which creates a window for another thread.
This patch adds a new parameter to vma_merge() which is passed down to
__vma_adjust().
The assumption is that copy_vma() is returning a vma which should be
released by calling vm_raw_write_end() once the ptes have
been moved.

Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h | 24 +++-
 mm/mmap.c  | 53 +
 mm/mremap.c| 13 +
 3 files changed, 73 insertions(+), 17 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 988daf7030c9..f6edd15563bc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2211,18 +2211,32 @@ void anon_vma_interval_tree_verify(struct 
anon_vma_chain *node);
 
 /* mmap.c */
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int 
cap_sys_admin);
+
 extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
-   struct vm_area_struct *expand);
+   struct vm_area_struct *expand, bool keep_locked);
+
 static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
 {
-   return __vma_adjust(vma, start, end, pgoff, insert, NULL);
+   return __vma_adjust(vma, start, end, pgoff, insert, NULL, false);
 }
-extern struct vm_area_struct *vma_merge(struct mm_struct *,
+
+extern struct vm_area_struct *__vma_merge(struct mm_struct *mm,
+   struct vm_area_struct *prev, unsigned long addr, unsigned long end,
+   unsigned long vm_flags, struct anon_vma *anon, struct file *file,
+   pgoff_t pgoff, struct mempolicy *mpol,
+   struct vm_userfaultfd_ctx uff, bool keep_locked);
+
+static inline struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
-   unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-   struct mempolicy *, struct vm_userfaultfd_ctx);
+   unsigned long vm_flags, struct anon_vma *anon, struct file *file,
+   pgoff_t off, struct mempolicy *pol, struct vm_userfaultfd_ctx uff)
+{
+   return __vma_merge(mm, prev, addr, end, vm_flags, anon, file, off,
+  pol, uff, false);
+}
+
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
diff --git a/mm/mmap.c b/mm/mmap.c
index 921f20cc6df0..5601f1ef8bb9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -680,7 +680,7 @@ static inline void __vma_unlink_prev(struct mm_struct *mm,
  */
 int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
-   struct vm_area_struct *expand)
+   struct vm_area_struct *expand, bool keep_locked)
 {
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
@@ -796,8 +796,12 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
 
importer->anon_vma = exporter->anon_vma;
error = anon_vma_clone(importer, exporter);
-   if (error)
+   if (error) {
+   if (next && next != vma)
+   vm_raw_write_end(next);
+   vm_raw_write_end(vma);
return error;
+   }
}
}
 again:
@@ -992,7 +996,8 @@ int 

[PATCH v10 09/25] mm: protect VMA modifications using VMA sequence count

2018-04-17 Thread Laurent Dufour
The VMA sequence count has been introduced to allow fast detection of
VMA modification when running a page fault handler without holding
the mmap_sem.

This patch provides protection against the VMA modifications done in:
- madvise()
- mpol_rebind_policy()
- vma_replace_policy()
- change_prot_numa()
- mlock(), munlock()
- mprotect()
- mmap_region()
- collapse_huge_page()
- userfaultfd registering services

In addition, VMA fields which will be read during the speculative fault
path need to be written using WRITE_ONCE to prevent writes from being
split and intermediate values from being pushed to other CPUs.
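
For reference, the reader side pairs with these writes roughly as follows
(a sketch only; the speculative handler and the exact set of disqualifying
flags are introduced later in the series):

	/* speculative path: no mmap_sem, so snapshot the field once */
	unsigned long vm_flags = READ_ONCE(vma->vm_flags);

	if (vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP))
		return VM_FAULT_RETRY;	/* fall back to the classic,
					 * mmap_sem-held path */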

Signed-off-by: Laurent Dufour 
---
 fs/proc/task_mmu.c |  5 -
 fs/userfaultfd.c   | 17 +
 mm/khugepaged.c|  3 +++
 mm/madvise.c   |  6 +-
 mm/mempolicy.c | 51 ++-
 mm/mlock.c | 13 -
 mm/mmap.c  | 22 +-
 mm/mprotect.c  |  4 +++-
 mm/swap_state.c|  8 ++--
 9 files changed, 89 insertions(+), 40 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c486ad4b43f0..aeb417f28839 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1136,8 +1136,11 @@ static ssize_t clear_refs_write(struct file *file, const 
char __user *buf,
goto out_mm;
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
-   vma->vm_flags &= ~VM_SOFTDIRTY;
+   vm_write_begin(vma);
+   WRITE_ONCE(vma->vm_flags,
+  vma->vm_flags & 
~VM_SOFTDIRTY);
vma_set_page_prot(vma);
+   vm_write_end(vma);
}
downgrade_write(>mmap_sem);
break;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index cec550c8468f..b8212ba17695 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -659,8 +659,11 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct 
list_head *fcs)
 
octx = vma->vm_userfaultfd_ctx.ctx;
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+   vm_write_begin(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-   vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+   WRITE_ONCE(vma->vm_flags,
+  vma->vm_flags & ~(VM_UFFD_WP | VM_UFFD_MISSING));
+   vm_write_end(vma);
return 0;
}
 
@@ -885,8 +888,10 @@ static int userfaultfd_release(struct inode *inode, struct 
file *file)
vma = prev;
else
prev = vma;
-   vma->vm_flags = new_flags;
+   vm_write_begin(vma);
+   WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+   vm_write_end(vma);
}
up_write(>mmap_sem);
mmput(mm);
@@ -1434,8 +1439,10 @@ static int userfaultfd_register(struct userfaultfd_ctx 
*ctx,
 * the next vma was merged into the current one and
 * the current one has not been updated yet.
 */
-   vma->vm_flags = new_flags;
+   vm_write_begin(vma);
+   WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx.ctx = ctx;
+   vm_write_end(vma);
 
skip:
prev = vma;
@@ -1592,8 +1599,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx 
*ctx,
 * the next vma was merged into the current one and
 * the current one has not been updated yet.
 */
-   vma->vm_flags = new_flags;
+   vm_write_begin(vma);
+   WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+   vm_write_end(vma);
 
skip:
prev = vma;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index d7b2a4bf8671..0b28af4b950d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1011,6 +1011,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (mm_find_pmd(mm, address) != pmd)
goto out;
 
+   vm_write_begin(vma);
anon_vma_lock_write(vma->anon_vma);
 
pte = pte_offset_map(pmd, address);
@@ -1046,6 +1047,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
+   vm_write_end(vma);
result = SCAN_FAIL;
goto 

[PATCH v10 08/25] mm: VMA sequence count

2018-04-17 Thread Laurent Dufour
From: Peter Zijlstra 

Wrap the VMA modifications (vma_adjust/unmap_page_range) with sequence
counts such that we can easily test if a VMA is changed.

The unmap_page_range() one allows us to make assumptions about
page-tables; when we find the seqcount hasn't changed we can assume
page-tables are still valid.

The flip side is that we cannot distinguish between a vma_adjust() and
the unmap_page_range() -- where with the former we could have
re-checked the vma bounds against the address.
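
For illustration, a speculative reader is expected to use this sequence
count roughly as follows (a sketch; the real handler comes later in the
series):

	seq = raw_read_seqcount(&vma->vm_sequence);
	if (seq & 1)
		return VM_FAULT_RETRY;	/* a writer is in progress */

	/* ... speculatively read the vma fields and walk the page tables ... */

	if (read_seqcount_retry(&vma->vm_sequence, seq))
		return VM_FAULT_RETRY;	/* the vma changed under us: retry
					 * with the mmap_sem held */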

Signed-off-by: Peter Zijlstra (Intel) 

[Port to 4.12 kernel]
[Build depends on CONFIG_SPECULATIVE_PAGE_FAULT]
[Introduce vm_write_* inline function depending on
 CONFIG_SPECULATIVE_PAGE_FAULT]
[Fix lock dependency between mapping->i_mmap_rwsem and vma->vm_sequence by
 using vm_raw_write* functions]
[Fix a lock dependency warning in mmap_region() when entering the error
 path]
[move sequence initialisation INIT_VMA()]
Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h   | 44 
 include/linux/mm_types.h |  3 +++
 mm/memory.c  |  2 ++
 mm/mmap.c| 31 +++
 4 files changed, 80 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index efc1248b82bd..988daf7030c9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1264,6 +1264,9 @@ struct zap_details {
 static inline void INIT_VMA(struct vm_area_struct *vma)
 {
INIT_LIST_HEAD(>anon_vma_chain);
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+   seqcount_init(>vm_sequence);
+#endif
 }
 
 struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
@@ -1386,6 +1389,47 @@ static inline void unmap_shared_mapping_range(struct 
address_space *mapping,
unmap_mapping_range(mapping, holebegin, holelen, 0);
 }
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+static inline void vm_write_begin(struct vm_area_struct *vma)
+{
+   write_seqcount_begin(>vm_sequence);
+}
+static inline void vm_write_begin_nested(struct vm_area_struct *vma,
+int subclass)
+{
+   write_seqcount_begin_nested(>vm_sequence, subclass);
+}
+static inline void vm_write_end(struct vm_area_struct *vma)
+{
+   write_seqcount_end(>vm_sequence);
+}
+static inline void vm_raw_write_begin(struct vm_area_struct *vma)
+{
+   raw_write_seqcount_begin(>vm_sequence);
+}
+static inline void vm_raw_write_end(struct vm_area_struct *vma)
+{
+   raw_write_seqcount_end(>vm_sequence);
+}
+#else
+static inline void vm_write_begin(struct vm_area_struct *vma)
+{
+}
+static inline void vm_write_begin_nested(struct vm_area_struct *vma,
+int subclass)
+{
+}
+static inline void vm_write_end(struct vm_area_struct *vma)
+{
+}
+static inline void vm_raw_write_begin(struct vm_area_struct *vma)
+{
+}
+static inline void vm_raw_write_end(struct vm_area_struct *vma)
+{
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 21612347d311..db5e9d630e7a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -335,6 +335,9 @@ struct vm_area_struct {
struct mempolicy *vm_policy;/* NUMA policy for the VMA */
 #endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+   seqcount_t vm_sequence;
+#endif
 } __randomize_layout;
 
 struct core_thread {
diff --git a/mm/memory.c b/mm/memory.c
index f86efcb8e268..f7fed053df80 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1503,6 +1503,7 @@ void unmap_page_range(struct mmu_gather *tlb,
unsigned long next;
 
BUG_ON(addr >= end);
+   vm_write_begin(vma);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
do {
@@ -1512,6 +1513,7 @@ void unmap_page_range(struct mmu_gather *tlb,
next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
+   vm_write_end(vma);
 }
 
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 8bd9ae1dfacc..813e49589ea1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -692,6 +692,30 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
long adjust_next = 0;
int remove_next = 0;
 
+   /*
+* Why using vm_raw_write*() functions here to avoid lockdep's warning ?
+*
+* Lockdep is complaining about a theoretical lock dependency, involving
+* 3 locks:
+*   mapping->i_mmap_rwsem --> vma->vm_sequence --> fs_reclaim
+*
+* Here are the major path leading to this dependency :
+*  1. __vma_adjust() mmap_sem  -> vm_sequence -> 

[PATCH v10 07/25] mm: introduce INIT_VMA()

2018-04-17 Thread Laurent Dufour
Some VMA struct fields need to be initialized once the VMA structure is
allocated.
Currently this only concerns the anon_vma_chain field but some others will
be added to support the speculative page fault.

Instead of spreading the initialization calls all over the code, let's
introduce a dedicated inline function.

Signed-off-by: Laurent Dufour 
---
 fs/exec.c  |  2 +-
 include/linux/mm.h |  5 +
 kernel/fork.c  |  2 +-
 mm/mmap.c  | 10 +-
 mm/nommu.c |  2 +-
 5 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 32eea4c65909..bd03689aa358 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -311,7 +311,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
vma->vm_start = vma->vm_end - PAGE_SIZE;
vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | 
VM_STACK_INCOMPLETE_SETUP;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
-   INIT_LIST_HEAD(>anon_vma_chain);
+   INIT_VMA(vma);
 
err = insert_vm_struct(mm, vma);
if (err)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 714da99d77a3..efc1248b82bd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1261,6 +1261,11 @@ struct zap_details {
pgoff_t last_index; /* Highest page->index to unmap 
*/
 };
 
+static inline void INIT_VMA(struct vm_area_struct *vma)
+{
+   INIT_LIST_HEAD(>anon_vma_chain);
+}
+
 struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 pte_t pte, bool with_public_device);
 #define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)
diff --git a/kernel/fork.c b/kernel/fork.c
index b1d877f1a0ac..d937e5945f77 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -451,7 +451,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (!tmp)
goto fail_nomem;
*tmp = *mpnt;
-   INIT_LIST_HEAD(>anon_vma_chain);
+   INIT_VMA(tmp);
retval = vma_dup_policy(mpnt, tmp);
if (retval)
goto fail_nomem_policy;
diff --git a/mm/mmap.c b/mm/mmap.c
index 188f195883b9..8bd9ae1dfacc 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1700,7 +1700,7 @@ unsigned long mmap_region(struct file *file, unsigned 
long addr,
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
-   INIT_LIST_HEAD(>anon_vma_chain);
+   INIT_VMA(vma);
 
if (file) {
if (vm_flags & VM_DENYWRITE) {
@@ -2586,7 +2586,7 @@ int __split_vma(struct mm_struct *mm, struct 
vm_area_struct *vma,
/* most fields are the same, copy all, and then fixup */
*new = *vma;
 
-   INIT_LIST_HEAD(>anon_vma_chain);
+   INIT_VMA(new);
 
if (new_below)
new->vm_end = addr;
@@ -2956,7 +2956,7 @@ static int do_brk_flags(unsigned long addr, unsigned long 
request, unsigned long
return -ENOMEM;
}
 
-   INIT_LIST_HEAD(>anon_vma_chain);
+   INIT_VMA(vma);
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
@@ -3167,7 +3167,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct 
**vmap,
new_vma->vm_pgoff = pgoff;
if (vma_dup_policy(vma, new_vma))
goto out_free_vma;
-   INIT_LIST_HEAD(_vma->anon_vma_chain);
+   INIT_VMA(new_vma);
if (anon_vma_clone(new_vma, vma))
goto out_free_mempol;
if (new_vma->vm_file)
@@ -3310,7 +3310,7 @@ static struct vm_area_struct *__install_special_mapping(
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
 
-   INIT_LIST_HEAD(>anon_vma_chain);
+   INIT_VMA(vma);
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
diff --git a/mm/nommu.c b/mm/nommu.c
index 13723736d38f..6909ea0bf88d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1212,7 +1212,7 @@ unsigned long do_mmap(struct file *file,
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
 
-   INIT_LIST_HEAD(>anon_vma_chain);
+   INIT_VMA(vma);
vma->vm_flags = vm_flags;
vma->vm_pgoff = pgoff;
 
-- 
2.7.4



[PATCH v10 06/25] mm: make pte_unmap_same compatible with SPF

2018-04-17 Thread Laurent Dufour
pte_unmap_same() is making the assumption that the page tables are still
around because the mmap_sem is held.
This is no longer the case when running a speculative page fault, and an
additional check must be made to ensure that the final page tables are
still there.

This is now done by calling pte_spinlock() to check for the VMA's
consistency while locking the page tables.

This requires passing a vm_fault structure to pte_unmap_same(), which
contains all the needed parameters.

As pte_spinlock() may fail in the case of a speculative page fault, if the
VMA has been touched behind our back, pte_unmap_same() should now return 3
cases:
1. the ptes are the same (0)
2. the ptes are different (VM_FAULT_PTNOTSAME)
3. a VMA change has been detected (VM_FAULT_RETRY)

Case 2 is handled by the introduction of a new VM_FAULT flag named
VM_FAULT_PTNOTSAME which is then trapped in cow_user_page().
If VM_FAULT_RETRY is returned, it is passed up to the callers to retry the
page fault while holding the mmap_sem.

Acked-by: David Rientjes 
Signed-off-by: Laurent Dufour 
---
 include/linux/mm.h |  1 +
 mm/memory.c| 39 ---
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4d1aff80669c..714da99d77a3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1208,6 +1208,7 @@ static inline void clear_page_pfmemalloc(struct page 
*page)
 #define VM_FAULT_NEEDDSYNC  0x2000 /* ->fault did not modify page tables
 * and needs fsync() to complete (for
 * synchronous page faults in DAX) */
+#define VM_FAULT_PTNOTSAME 0x4000  /* Page table entries have changed */
 
 #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
 VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
diff --git a/mm/memory.c b/mm/memory.c
index 0b9a51f80e0e..f86efcb8e268 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2309,21 +2309,29 @@ static inline bool pte_map_lock(struct vm_fault *vmf)
  * parts, do_swap_page must check under lock before unmapping the pte and
  * proceeding (but do_wp_page is only called after already making such a check;
  * and do_anonymous_page can safely check later on).
+ *
+ * pte_unmap_same() returns:
+ * 0   if the PTE are the same
+ * VM_FAULT_PTNOTSAME  if the PTE are different
+ * VM_FAULT_RETRY  if the VMA has changed in our back during
+ * a speculative page fault handling.
  */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-   pte_t *page_table, pte_t orig_pte)
+static inline int pte_unmap_same(struct vm_fault *vmf)
 {
-   int same = 1;
+   int ret = 0;
+
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
if (sizeof(pte_t) > sizeof(unsigned long)) {
-   spinlock_t *ptl = pte_lockptr(mm, pmd);
-   spin_lock(ptl);
-   same = pte_same(*page_table, orig_pte);
-   spin_unlock(ptl);
+   if (pte_spinlock(vmf)) {
+   if (!pte_same(*vmf->pte, vmf->orig_pte))
+   ret = VM_FAULT_PTNOTSAME;
+   spin_unlock(vmf->ptl);
+   } else
+   ret = VM_FAULT_RETRY;
}
 #endif
-   pte_unmap(page_table);
-   return same;
+   pte_unmap(vmf->pte);
+   return ret;
 }
 
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned 
long va, struct vm_area_struct *vma)
@@ -2912,10 +2920,19 @@ int do_swap_page(struct vm_fault *vmf)
pte_t pte;
int locked;
int exclusive = 0;
-   int ret = 0;
+   int ret;
 
-   if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
+   ret = pte_unmap_same(vmf);
+   if (ret) {
+   /*
+* If pte != orig_pte, this means another thread did the
+* swap operation in our back.
+* So nothing else to do.
+*/
+   if (ret == VM_FAULT_PTNOTSAME)
+   ret = 0;
goto out;
+   }
 
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
-- 
2.7.4



[PATCH v10 05/25] mm: introduce pte_spinlock for FAULT_FLAG_SPECULATIVE

2018-04-17 Thread Laurent Dufour
When handling a page fault without holding the mmap_sem, the fetch of the
pte lock pointer and the locking will have to be done while ensuring
that the VMA is not touched behind our back.

So move the fetch and locking operations into a dedicated function.
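
For context, the speculative variant of this helper (added later in the
series) is expected to look roughly like the following sketch; in
particular, vmf->sequence is a field introduced by a later patch, and the
classic (non-speculative) case is omitted:

	static bool pte_spinlock(struct vm_fault *vmf)
	{
		bool ret = false;

		local_irq_disable();
		if (read_seqcount_retry(&vmf->vma->vm_sequence, vmf->sequence))
			goto out;	/* vma changed behind us: retry */

		vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
		if (unlikely(!spin_trylock(vmf->ptl)))
			goto out;	/* trylock only: the lock holder may
					 * be waiting on our IPI ack */
		ret = true;
	out:
		local_irq_enable();
		return ret;
	}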

Signed-off-by: Laurent Dufour 
---
 mm/memory.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 4528bd584b7a..0b9a51f80e0e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, unsigned 
long addr,
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
+static inline bool pte_spinlock(struct vm_fault *vmf)
+{
+   vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+   spin_lock(vmf->ptl);
+   return true;
+}
+
 static inline bool pte_map_lock(struct vm_fault *vmf)
 {
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
@@ -3804,8 +3811,8 @@ static int do_numa_page(struct vm_fault *vmf)
 * validation through pte_unmap_same(). It's of NUMA type but
 * the pfn may be screwed if the read is non atomic.
 */
-   vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
-   spin_lock(vmf->ptl);
+   if (!pte_spinlock(vmf))
+   return VM_FAULT_RETRY;
if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
@@ -3998,8 +4005,8 @@ static int handle_pte_fault(struct vm_fault *vmf)
if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
return do_numa_page(vmf);
 
-   vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-   spin_lock(vmf->ptl);
+   if (!pte_spinlock(vmf))
+   return VM_FAULT_RETRY;
entry = vmf->orig_pte;
if (unlikely(!pte_same(*vmf->pte, entry)))
goto unlock;
-- 
2.7.4



[PATCH v10 04/25] mm: prepare for FAULT_FLAG_SPECULATIVE

2018-04-17 Thread Laurent Dufour
From: Peter Zijlstra 

When speculating faults (without holding mmap_sem) we need to validate
that the vma against which we loaded pages is still valid when we're
ready to install the new PTE.

Therefore, replace the pte_offset_map_lock() calls that (re)take the
PTL with pte_map_lock() which can fail in case we find the VMA changed
since we started the fault.

Signed-off-by: Peter Zijlstra (Intel) 

[Port to 4.12 kernel]
[Remove the comment about the fault_env structure which has been
 implemented as the vm_fault structure in the kernel]
[move pte_map_lock()'s definition upper in the file]
[move the define of FAULT_FLAG_SPECULATIVE later in the series]
[review error path in do_swap_page(), do_anonymous_page() and
 wp_page_copy()]
Signed-off-by: Laurent Dufour 
---
 mm/memory.c | 87 -
 1 file changed, 58 insertions(+), 29 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index a1f990e33e38..4528bd584b7a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2288,6 +2288,13 @@ int apply_to_page_range(struct mm_struct *mm, unsigned 
long addr,
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
+static inline bool pte_map_lock(struct vm_fault *vmf)
+{
+   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+  vmf->address, >ptl);
+   return true;
+}
+
 /*
  * handle_pte_fault chooses page fault handler according to an entry which was
  * read non-atomically.  Before making any commitment, on those architectures
@@ -2477,25 +2484,26 @@ static int wp_page_copy(struct vm_fault *vmf)
const unsigned long mmun_start = vmf->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
+   int ret = VM_FAULT_OOM;
 
if (unlikely(anon_vma_prepare(vma)))
-   goto oom;
+   goto out;
 
if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
new_page = alloc_zeroed_user_highpage_movable(vma,
  vmf->address);
if (!new_page)
-   goto oom;
+   goto out;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
if (!new_page)
-   goto oom;
+   goto out;
cow_user_page(new_page, old_page, vmf->address, vma);
}
 
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, , false))
-   goto oom_free_new;
+   goto out_free_new;
 
__SetPageUptodate(new_page);
 
@@ -2504,7 +2512,10 @@ static int wp_page_copy(struct vm_fault *vmf)
/*
 * Re-check the pte - we dropped the lock
 */
-   vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, >ptl);
+   if (!pte_map_lock(vmf)) {
+   ret = VM_FAULT_RETRY;
+   goto out_uncharge;
+   }
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
@@ -2591,12 +2602,14 @@ static int wp_page_copy(struct vm_fault *vmf)
put_page(old_page);
}
return page_copied ? VM_FAULT_WRITE : 0;
-oom_free_new:
+out_uncharge:
+   mem_cgroup_cancel_charge(new_page, memcg, false);
+out_free_new:
put_page(new_page);
-oom:
+out:
if (old_page)
put_page(old_page);
-   return VM_FAULT_OOM;
+   return ret;
 }
 
 /**
@@ -2617,8 +2630,8 @@ static int wp_page_copy(struct vm_fault *vmf)
 int finish_mkwrite_fault(struct vm_fault *vmf)
 {
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
-   vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
-  >ptl);
+   if (!pte_map_lock(vmf))
+   return VM_FAULT_RETRY;
/*
 * We might have raced with another page fault while we released the
 * pte_offset_map_lock.
@@ -2736,8 +2749,11 @@ static int do_wp_page(struct vm_fault *vmf)
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
lock_page(vmf->page);
-   vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-   vmf->address, >ptl);
+   if (!pte_map_lock(vmf)) {
+   unlock_page(vmf->page);
+   put_page(vmf->page);
+   return VM_FAULT_RETRY;
+   }
if (!pte_same(*vmf->pte, vmf->orig_pte)) {
unlock_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2944,11 +2960,15 @@ int do_swap_page(struct vm_fault 

[PATCH v10 03/25] powerpc/mm: set ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT

2018-04-17 Thread Laurent Dufour
Set ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT for BOOK3S_64. This enables
the Speculative Page Fault handler.

Support is only provided for BOOK3S_64 currently because:
- it requires CONFIG_PPC_STD_MMU because of checks done in
  set_access_flags_filter()
- it requires BOOK3S because we can't support book3e_hugetlb_preload()
  called by update_mmu_cache()

Cc: Michael Ellerman 
Signed-off-by: Laurent Dufour 
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c32a181a7cbb..21ef887da7a3 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -233,6 +233,7 @@ config PPC
select OLD_SIGACTIONif PPC32
select OLD_SIGSUSPEND
select SPARSE_IRQ
+   select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT if PPC_BOOK3S_64
select SYSCTL_EXCEPTION_TRACE
select VIRT_TO_BUS  if !PPC64
#
-- 
2.7.4



[PATCH v10 02/25] x86/mm: define ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT

2018-04-17 Thread Laurent Dufour
Set ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT which turns on the
Speculative Page Fault handler when building for 64bit.

Cc: Thomas Gleixner 
Signed-off-by: Laurent Dufour 
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d8983df5a2bc..ebdeb48e4a4a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -30,6 +30,7 @@ config X86_64
select MODULES_USE_ELF_RELA
select X86_DEV_DMA_OPS
select ARCH_HAS_SYSCALL_WRAPPER
+   select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
 
 #
 # Arch settings
-- 
2.7.4



[PATCH v10 01/25] mm: introduce CONFIG_SPECULATIVE_PAGE_FAULT

2018-04-17 Thread Laurent Dufour
This configuration variable will be used to build the code needed to
handle speculative page faults.

By default it is turned off, and activated depending on architecture
support, SMP and MMU.

Suggested-by: Thomas Gleixner 
Suggested-by: David Rientjes 
Signed-off-by: Laurent Dufour 
---
 mm/Kconfig | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/mm/Kconfig b/mm/Kconfig
index d5004d82a1d6..5484dca11199 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -752,3 +752,25 @@ config GUP_BENCHMARK
  performance of get_user_pages_fast().
 
  See tools/testing/selftests/vm/gup_benchmark.c
+
+config ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
+   def_bool n
+
+config SPECULATIVE_PAGE_FAULT
+   bool "Speculative page faults"
+   default y
+   depends on ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
+   depends on MMU && SMP
+   help
+ Try to handle user space page faults without holding the mmap_sem.
+
+This should allow better concurrency for massively threaded processes
+since the page fault handler will not wait for other threads' memory
+layout changes to be done, assuming that the change is done in another
+part of the process's memory space. This type of page fault is named a
+speculative page fault.
+
+If the speculative page fault fails because a concurrent change is
+detected or because the underlying PMD or PTE tables are not yet
+allocated, the speculative handling fails and a classic page fault
+is then tried.
-- 
2.7.4



[PATCH v10 00/25] Speculative page faults

2018-04-17 Thread Laurent Dufour
This is a port on kernel 4.16 of the work done by Peter Zijlstra to
handle page fault without holding the mm semaphore [1].

The idea is to try to handle user space page faults without holding the
mmap_sem. This should allow better concurrency for massively threaded
processes since the page fault handler will not wait for other threads'
memory layout changes to be done, assuming that the change is done in
another part of the process's memory space. This type of page fault is
named a speculative page fault. If the speculative page fault fails
because a concurrent change is detected or because the underlying PMD or
PTE tables are not yet allocated, it fails and a classic page fault is
then tried.

The speculative page fault (SPF) has to look for the VMA matching the fault
address without holding the mmap_sem; this is done by introducing a rwlock
which protects the access to the mm_rb tree. Previously this was done using
SRCU, but it was introducing a lot of scheduling to process the VMA's
freeing operation, which was hurting performance by 20% as reported by
Kemi Wang [2]. Using a rwlock to protect access to the mm_rb tree limits
the locking contention to these operations, which are expected to be in
O(log n) order. In addition, to ensure that the VMA is not freed behind our
back, a reference count is added and 2 services (get_vma() and put_vma())
are introduced to handle the reference count. When a VMA is fetched from
the RB tree using get_vma() it must be later freed using put_vma().
Furthermore, to allow the VMA to be used again by the classic page fault
handler, a service named can_reuse_spf_vma() is introduced. This service is
expected to be called with the mmap_sem held. It checks that the VMA still
matches the specified address and releases its reference count; as the
mmap_sem is held, it is ensured that the VMA will not be freed behind our
back. In general, the VMA's reference count could be decremented when
holding the mmap_sem, but it should not be increased, as holding the
mmap_sem ensures that the VMA is stable. I can no longer see the overhead I
previously got with the will-it-scale benchmark.
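
In code, the pairing described above boils down to something like the
following sketch (based on this description; spf_handle_fault() is a
made-up placeholder, not a function name from the series):

	/* speculative path: no mmap_sem held */
	vma = get_vma(mm, address);	/* reference taken under the mm_rb
					 * rwlock */
	if (!vma)
		return VM_FAULT_RETRY;

	ret = spf_handle_fault(vma, address, flags);
	if (ret & VM_FAULT_RETRY)
		return ret;		/* keep the reference: the classic
					 * path hands it back through
					 * can_reuse_spf_vma() */

	put_vma(vma);			/* drop the reference; this may free
					 * the vma if it was unlinked in the
					 * meantime */
	return ret;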

The VMA's attributes checked during the speculative page fault processing
have to be protected against parallel changes. This is done by using a
per-VMA sequence lock. This sequence lock allows the speculative page fault
handler to quickly check for parallel changes in progress and to abort the
speculative page fault in that case.

Once the VMA is found, the speculative page fault handler checks the
VMA's attributes to verify whether the page fault can be handled this way
or not. To this end, the VMA is protected through a sequence lock which
allows fast detection of concurrent VMA changes. If such a change is
detected, the speculative page fault is aborted and a *classic* page fault
is tried.  VMA sequence locking is added wherever the VMA attributes which
are checked during the page fault are modified.

When the PTE is fetched, the VMA is checked to see if it has been changed,
so once the page table is locked the VMA is known to be valid; any other
change touching this PTE will need to lock the page table, so no
parallel change is possible at this time.

The locking of the PTE is done with interrupts disabled; this allows
checking the PMD to ensure that there is no ongoing collapsing operation.
Since khugepaged first sets the PMD to pmd_none and then waits for the
other CPUs to have caught the IPI, if the pmd is valid at the time the PTE
is locked, we have the guarantee that the collapsing operation will have to
wait on the PTE lock to move forward. This allows the SPF handler to map
the PTE safely. If the PMD value is different from the one recorded at the
beginning of the SPF operation, the classic page fault handler will be
called to handle the operation while holding the mmap_sem. As the PTE lock
is taken with interrupts disabled, the lock is taken using spin_trylock()
to avoid deadlock when handling a page fault while a TLB invalidate is
requested by another CPU holding the PTE.

In pseudo code, this could be seen as:
speculative_page_fault()
{
vma = get_vma()
check vma sequence count
check vma's support
disable interrupt
  check pgd,p4d,...,pte
  save pmd and pte in vmf
  save vma sequence counter in vmf
enable interrupt
check vma sequence count
handle_pte_fault(vma)
..
page = alloc_page()
pte_map_lock()
disable interrupt
abort if sequence counter has changed
abort if pmd or pte has changed
pte map and lock
enable interrupt
if abort
   free page
   abort
   

Re: [RFC PATCH 1/3] signal: Ensure every siginfo we send has all bits initialized

2018-04-17 Thread Dave Martin
On Sun, Apr 15, 2018 at 10:57:33AM -0500, Eric W. Biederman wrote:
> 
> Call clear_siginfo to ensure every stack allocated siginfo is properly
> initialized before being passed to the signal sending functions.
> 
> Note: It is not safe to depend on C initializers to initialize struct
> siginfo on the stack because C is allowed to skip holes when
> initializing a structure.
> 
> The initialization of struct siginfo in tracehook_report_syscall_exit
> was moved from the helper user_single_step_siginfo into
> tracehook_report_syscall_exit itself, to make it clear that the local
> variable siginfo gets fully initialized.
> 
> In a few cases the scope of struct siginfo has been reduced to make it
> clear that siginfo siginfo is not used on other paths in the function
> in which it is declared.
> 
> Instances of using memset to initialize siginfo have been replaced
> with calls clear_siginfo for clarity.
> 
> Signed-off-by: "Eric W. Biederman" 

[...]

Hmmm

memset()/clear_siginfo() may ensure that there are no uninitialised
explicit fields except for those in inactive union members, but I'm not
sure that this approach is guaranteed to sanitise the padding seen by
userspace.

Rationale below, though it's a bit theoretical...

With this in mind, I tend to agree with Linus that hiding memset() calls
from the maintainer may be a bad idea unless they are also hidden from
the compiler.  If the compiler sees the memset() it may be able to
optimise it in ways that wouldn't be possible for some other random
external function call, including optimising all or part of the call
out.

As a result, the breakdown into individual put_user()s etc. in
copy_siginfo_to_user() may still be valuable even if all paths have the
memset().


(Rationale for an arch/arm example:)

> diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
> index 4c375e11ae95..adda3fc2dde8 100644
> --- a/arch/arm/vfp/vfpmodule.c
> +++ b/arch/arm/vfp/vfpmodule.c
> @@ -218,8 +218,7 @@ static void vfp_raise_sigfpe(unsigned int sicode, struct 
> pt_regs *regs)
>  {
>   siginfo_t info;
>  
> - memset(, 0, sizeof(info));
> -
> + clear_siginfo();
>   info.si_signo = SIGFPE;

/* by c11 (n1570) 6.2.6.1 para 6 [1], all padding bytes in info now take
   unspecified values */

>   info.si_code = sicode;
>   info.si_addr = (void __user *)(instruction_pointer(regs) - 4);

/* by c11 (n1570) 6.2.6.1 para 7 [2], all bytes of the union info._sifields
   other than than those corresponding to _sigfault take unspecified
   values */

So I don't see why the compiler needs to ensure that any of the affected
bytes are zero: it could potentially skip a lot of the memset() as a
result, in theory.

I've not seen a compiler actually take advantage of that, but I'm now
not sure what forbids it.
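
To make that concrete, a self-contained illustration (a hypothetical
two-member struct, not the real siginfo layout; whether a given compiler
actually exploits this latitude is another question):

	#include <string.h>

	struct example {
		int   signo;	/* 4 bytes */
		/* 4 bytes of implicit padding on LP64 before the pointer */
		void *addr;	/* 8 bytes, 8-byte aligned */
	};

	void fill(struct example *out)
	{
		struct example info;

		memset(&info, 0, sizeof(info));	/* padding is zero here... */
		info.signo = 42;	/* ...but by 6.2.6.1 para 6 this store
					 * lets the padding bytes of info take
					 * unspecified values again */
		info.addr = NULL;

		*out = info;		/* the copy may therefore expose
					 * whatever the padding happens to
					 * hold */
	}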


If this can happen, I only see two watertight workarounds:

1) Ensure that there is no implicit padding in any UAPI structure, e.g.
aeb1f39d814b: ("arm64/ptrace: Avoid uninitialised struct padding in
fpr_set()").  This would include tail-padding of any union member that
is smaller than the containing union.

It would be significantly more effort to ensure this for siginfo though.

2) Poke all values directly into allocated or user memory directly
via pointers to paddingless types; never assign to objects on the kernel
stack if you care what ends up in the padding, e.g., what your
copy_siginfo_to_user() does prior to this series.


If I'm not barking up the wrong tree, memset() cannot generally be
used to determine the value of padding bytes, but it may still be
useful for forcing otherwise uninitialised members to sane initial
values.

This likely affects many more things than just siginfo.

[...]

Cheers
---Dave

[1] n1570 6.2.6.1.6: When a value is stored in an object of structure or
union type, including in a member object, the bytes of the object
representation that correspond to any padding bytes take unspecified
values [...]

[2] n1570 6.2.6.1.7: When a value is stored in a member of an object of
union type, the bytes of the object representation that do not
correspond to that member but do correspond to other members take
unspecified values.


[PATCH] powerpc/time: remove to_tm and use RTC_LIB

2018-04-17 Thread Christophe Leroy
RTC_LIB includes a generic function to convert
RTC data into struct rtc_time. Use it and remove to_tm().

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig|  1 +
 arch/powerpc/include/asm/time.h |  1 -
 arch/powerpc/kernel/rtas-proc.c |  4 +--
 arch/powerpc/kernel/time.c  | 52 +
 arch/powerpc/platforms/8xx/m8xx_setup.c |  2 +-
 arch/powerpc/platforms/powermac/time.c  |  2 +-
 arch/powerpc/platforms/ps3/time.c   |  2 +-
 7 files changed, 7 insertions(+), 57 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index edbbd2ea1298..e1fac49cf465 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -232,6 +232,7 @@ config PPC
select OF_RESERVED_MEM
select OLD_SIGACTIONif PPC32
select OLD_SIGSUSPEND
+   select RTC_LIB
select SPARSE_IRQ
select SYSCTL_EXCEPTION_TRACE
select VIRT_TO_BUS  if !PPC64
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index db546c034905..0ad1cf2285b1 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -27,7 +27,6 @@ extern unsigned long tb_ticks_per_sec;
 extern struct clock_event_device decrementer_clockevent;
 
 struct rtc_time;
-extern void to_tm(int tim, struct rtc_time * tm);
 extern void tick_broadcast_ipi_handler(void);
 
 extern void generic_calibrate_decr(void);
diff --git a/arch/powerpc/kernel/rtas-proc.c b/arch/powerpc/kernel/rtas-proc.c
index fb070d8cad07..6de77f9434b0 100644
--- a/arch/powerpc/kernel/rtas-proc.c
+++ b/arch/powerpc/kernel/rtas-proc.c
@@ -314,7 +314,7 @@ static ssize_t ppc_rtas_poweron_write(struct file *file,
 
power_on_time = nowtime; /* save the time */
 
-   to_tm(nowtime, );
+   rtc_time64_to_tm(nowtime, );
 
error = rtas_call(rtas_token("set-time-for-power-on"), 7, 1, NULL, 
tm.tm_year, tm.tm_mon, tm.tm_mday, 
@@ -378,7 +378,7 @@ static ssize_t ppc_rtas_clock_write(struct file *file,
if (error)
return error;
 
-   to_tm(nowtime, );
+   rtc_time64_to_tm(nowtime, );
error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL, 
tm.tm_year, tm.tm_mon, tm.tm_mday, 
tm.tm_hour, tm.tm_min, tm.tm_sec, 0);
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 56869fd879ed..362673cc09f2 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -788,7 +788,7 @@ int update_persistent_clock(struct timespec now)
if (!ppc_md.set_rtc_time)
return -ENODEV;
 
-   to_tm(now.tv_sec + 1 + timezone_offset, );
+   rtc_time64_to_tm(now.tv_sec + 1 + timezone_offset, );
tm.tm_year -= 1900;
tm.tm_mon -= 1;
 
@@ -1141,56 +1141,6 @@ void __init time_init(void)
 #endif
 }
 
-
-#define FEBRUARY   2
-#defineSTARTOFTIME 1970
-#define SECDAY 86400L
-#define SECYR  (SECDAY * 365)
-#defineleapyear(year)  ((year) % 4 == 0 && \
-((year) % 100 != 0 || (year) % 400 == 0))
-#definedays_in_year(a) (leapyear(a) ? 366 : 365)
-#definedays_in_month(a)(month_days[(a) - 1])
-
-static int month_days[12] = {
-   31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
-};
-
-void to_tm(int tim, struct rtc_time * tm)
-{
-   register inti;
-   register long   hms, day;
-
-   day = tim / SECDAY;
-   hms = tim % SECDAY;
-
-   /* Hours, minutes, seconds are easy */
-   tm->tm_hour = hms / 3600;
-   tm->tm_min = (hms % 3600) / 60;
-   tm->tm_sec = (hms % 3600) % 60;
-
-   /* Number of years in days */
-   for (i = STARTOFTIME; day >= days_in_year(i); i++)
-   day -= days_in_year(i);
-   tm->tm_year = i;
-
-   /* Number of months in days left */
-   if (leapyear(tm->tm_year))
-   days_in_month(FEBRUARY) = 29;
-   for (i = 1; day >= days_in_month(i); i++)
-   day -= days_in_month(i);
-   days_in_month(FEBRUARY) = 28;
-   tm->tm_mon = i;
-
-   /* Days are what is left over (+1) from all that. */
-   tm->tm_mday = day + 1;
-
-   /*
-* No-one uses the day of the week.
-*/
-   tm->tm_wday = -1;
-}
-EXPORT_SYMBOL(to_tm);
-
 /*
  * Divide a 128-bit dividend by a 32-bit divisor, leaving a 128 bit
  * result.
diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c 
b/arch/powerpc/platforms/8xx/m8xx_setup.c
index 2188d691a40f..0f9740185eb9 100644
--- a/arch/powerpc/platforms/8xx/m8xx_setup.c
+++ b/arch/powerpc/platforms/8xx/m8xx_setup.c
@@ -192,7 +192,7 @@ void mpc8xx_get_rtc_time(struct rtc_time *tm)
 
/* Get time from the RTC. */
data = in_be32(_tmr->sit_rtc);
-   to_tm(data, tm);
+   rtc_time64_to_tm(data, tm);
tm->tm_year -= 1900;

Re: [PATCH] powerpc/8xx: Build fix with Hugetlbfs enabled

2018-04-17 Thread Christophe LEROY



On 16/04/2018 at 13:27, Aneesh Kumar K.V wrote:

8xx use slice code when hugetlbfs is enabled. We missed a header include on
8xx which resulted in the below build failure.

config: mpc885_ads_defconfig + CONFIG_HUGETLBFS

CC  arch/powerpc/mm/slice.o
arch/powerpc/mm/slice.c: In function 'slice_get_unmapped_area':
arch/powerpc/mm/slice.c:655:2: error: implicit declaration of function 
'need_extra_context' [-Werror=implicit-function-declaration]
arch/powerpc/mm/slice.c:656:3: error: implicit declaration of function 
'alloc_extended_context' [-Werror=implicit-function-declaration]
cc1: all warnings being treated as errors
make[1]: *** [arch/powerpc/mm/slice.o] Error 1
make: *** [arch/powerpc/mm] Error 2

on PPC64 the mmu_context.h was included via linux/pkeys.h

CC: Christophe LEROY 
Signed-off-by: Aneesh Kumar K.V 


Tested-by: Christophe Leroy 


---
  arch/powerpc/mm/slice.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 9cd87d11fe4e..205fe557ca10 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -35,6 +35,7 @@
  #include 
  #include 
  #include 
+#include 
  
  static DEFINE_SPINLOCK(slice_convert_lock);
  



[PATCH] powerpc/8xx: Remove RTC clock on 88x

2018-04-17 Thread Christophe Leroy
The 885 family processors don't have a Real Time Clock

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/platforms/8xx/adder875.c| 2 --
 arch/powerpc/platforms/8xx/ep88xc.c  | 2 --
 arch/powerpc/platforms/8xx/mpc885ads_setup.c | 2 --
 3 files changed, 6 deletions(-)

diff --git a/arch/powerpc/platforms/8xx/adder875.c 
b/arch/powerpc/platforms/8xx/adder875.c
index 333dece79394..bcef9f66191e 100644
--- a/arch/powerpc/platforms/8xx/adder875.c
+++ b/arch/powerpc/platforms/8xx/adder875.c
@@ -111,7 +111,5 @@ define_machine(adder875) {
.get_irq = mpc8xx_get_irq,
.restart = mpc8xx_restart,
.calibrate_decr = generic_calibrate_decr,
-   .set_rtc_time = mpc8xx_set_rtc_time,
-   .get_rtc_time = mpc8xx_get_rtc_time,
.progress = udbg_progress,
 };
diff --git a/arch/powerpc/platforms/8xx/ep88xc.c 
b/arch/powerpc/platforms/8xx/ep88xc.c
index cd0d90f1fb1c..ebcf34a14789 100644
--- a/arch/powerpc/platforms/8xx/ep88xc.c
+++ b/arch/powerpc/platforms/8xx/ep88xc.c
@@ -170,7 +170,5 @@ define_machine(ep88xc) {
.get_irq= mpc8xx_get_irq,
.restart = mpc8xx_restart,
.calibrate_decr = mpc8xx_calibrate_decr,
-   .set_rtc_time = mpc8xx_set_rtc_time,
-   .get_rtc_time = mpc8xx_get_rtc_time,
.progress = udbg_progress,
 };
diff --git a/arch/powerpc/platforms/8xx/mpc885ads_setup.c 
b/arch/powerpc/platforms/8xx/mpc885ads_setup.c
index e821a42d5816..a0c83c1905c6 100644
--- a/arch/powerpc/platforms/8xx/mpc885ads_setup.c
+++ b/arch/powerpc/platforms/8xx/mpc885ads_setup.c
@@ -220,7 +220,5 @@ define_machine(mpc885_ads) {
.get_irq= mpc8xx_get_irq,
.restart= mpc8xx_restart,
.calibrate_decr = mpc8xx_calibrate_decr,
-   .set_rtc_time   = mpc8xx_set_rtc_time,
-   .get_rtc_time   = mpc8xx_get_rtc_time,
.progress   = udbg_progress,
 };
-- 
2.13.3



[PATCH] powerpc/boot: remove unused variable in mpc8xx

2018-04-17 Thread Christophe Leroy
Variable div is set but never used. Remove it.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/boot/mpc8xx.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/boot/mpc8xx.c b/arch/powerpc/boot/mpc8xx.c
index add55a7f184f..c9bd9285c548 100644
--- a/arch/powerpc/boot/mpc8xx.c
+++ b/arch/powerpc/boot/mpc8xx.c
@@ -24,7 +24,7 @@ u32 mpc885_get_clock(u32 crystal)
 {
u32 *immr;
u32 plprcr;
-   int mfi, mfn, mfd, pdf, div;
+   int mfi, mfn, mfd, pdf;
u32 ret;
 
immr = fsl_get_immr();
@@ -43,7 +43,6 @@ u32 mpc885_get_clock(u32 crystal)
}
 
pdf = (plprcr >> 1) & 0xf;
-   div = (plprcr >> 20) & 3;
mfd = (plprcr >> 22) & 0x1f;
mfn = (plprcr >> 27) & 0x1f;
 
-- 
2.13.3



[PATCH] powerpc/misc: merge reloc_offset() and add_reloc_offset()

2018-04-17 Thread Christophe Leroy
reloc_offset() is the same as add_reloc_offset(0)

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/misc.S | 17 +++--
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index 384357cb8bc0..e1f3a5d054c4 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -25,23 +25,12 @@
 /*
  * Returns (address we are running at) - (address we were linked at)
  * for use before the text and data are mapped to KERNELBASE.
- */
-
-_GLOBAL(reloc_offset)
-   mflrr0
-   bl  1f
-1: mflrr3
-   PPC_LL  r4,(2f-1b)(r3)
-   subfr3,r4,r3
-   mtlrr0
-   blr
 
-   .align  3
-2: PPC_LONG 1b
-
-/*
  * add_reloc_offset(x) returns x + reloc_offset().
  */
+
+_GLOBAL(reloc_offset)
+   li  r3, 0
 _GLOBAL(add_reloc_offset)
mflrr0
bl  1f
-- 
2.13.3



[PATCH] powerpc: Allow selection of CONFIG_LD_DEAD_CODE_DATA_ELIMINATION

2018-04-17 Thread Christophe Leroy
This option does dead code and data elimination with the linker by
compiling with -ffunction-sections -fdata-sections and linking with
--gc-sections.

By selecting this option on mpc885_ads_defconfig,
vmlinux LOAD segment size gets reduced by 10%

Program Header before the patch:
LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16
 filesz 0x0036eda4 memsz 0x0038de04 flags rwx

Program Header after the patch:
LOAD off0x0001 vaddr 0xc000 paddr 0x align 2**16
 filesz 0x00316da4 memsz 0x00334268 flags rwx
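
For intuition, with -ffunction-sections each function lands in its own
section, which is what lets --gc-sections drop unreferenced ones (a sketch;
the .text.<name> section names follow the usual GCC convention):

	/* built with: gcc -ffunction-sections -fdata-sections -c foo.c */
	int used(void)   { return 1; }	/* emitted into .text.used   */
	int unused(void) { return 2; }	/* emitted into .text.unused */

	/*
	 * Linking with --gc-sections discards .text.unused because nothing
	 * references it; without per-function sections both would share
	 * .text and the linker could not drop just one of them.
	 */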

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8fe4353be5e3..e1fac49cf465 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -888,6 +888,14 @@ config PPC_MEM_KEYS
 
  If unsure, say y.
 
+config PPC_UNUSED_ELIMINATION
+   bool "Eliminate unused functions and data from vmlinux"
+   default n
+   select LD_DEAD_CODE_DATA_ELIMINATION
+   help
+ Select this to do dead code and data elimination with the linker
+ by compiling with -ffunction-sections -fdata-sections and linking
+ with --gc-sections.
 endmenu
 
 config ISA_DMA_API
-- 
2.13.3



[PATCH 6/6 v2] arm64: dts: ls208xa: comply with the iommu map binding for fsl_mc

2018-04-17 Thread Nipun Gupta
The fsl-mc bus now supports the iommu-map property. Comply with this
binding for the fsl-mc bus. This patch also updates the dts w.r.t. the DMA
configuration.

Signed-off-by: Nipun Gupta 
---
 arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi 
b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
index f3a40af..1b1c5eb 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
@@ -135,6 +135,7 @@
#address-cells = <2>;
#size-cells = <2>;
ranges;
+   dma-ranges = <0x0 0x0 0x0 0x0 0x1 0x>;
 
clockgen: clocking@130 {
compatible = "fsl,ls2080a-clockgen";
@@ -357,6 +358,8 @@
reg = <0x0008 0x0c00 0 0x40>,/* MC portal 
base */
  <0x 0x0834 0 0x4>; /* MC control 
reg */
msi-parent = <>;
+   iommu-map = <0  0 0>;  /* This is fixed-up by 
u-boot */
+   dma-coherent;
#address-cells = <3>;
#size-cells = <1>;
 
@@ -460,6 +463,8 @@
compatible = "arm,mmu-500";
reg = <0 0x500 0 0x80>;
#global-interrupts = <12>;
+   #iommu-cells = <1>;
+   stream-match-mask = <0x7C00>;
interrupts = <0 13 4>, /* global secure fault */
 <0 14 4>, /* combined secure interrupt */
 <0 15 4>, /* global non-secure fault */
@@ -502,7 +507,6 @@
 <0 204 4>, <0 205 4>,
 <0 206 4>, <0 207 4>,
 <0 208 4>, <0 209 4>;
-   mmu-masters = <_mc 0x300 0>;
};
 
dspi: dspi@210 {
-- 
1.9.1



[PATCH 5/6 v2] bus: fsl-mc: support dma configure for devices on fsl-mc bus

2018-04-17 Thread Nipun Gupta
Signed-off-by: Nipun Gupta 
---
 drivers/bus/fsl-mc/fsl-mc-bus.c | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c
index 5d8266c..624828b 100644
--- a/drivers/bus/fsl-mc/fsl-mc-bus.c
+++ b/drivers/bus/fsl-mc/fsl-mc-bus.c
@@ -127,6 +127,16 @@ static int fsl_mc_bus_uevent(struct device *dev, struct 
kobj_uevent_env *env)
return 0;
 }
 
+static int fsl_mc_dma_configure(struct device *dev)
+{
+   struct device *dma_dev = dev;
+
+   while (dev_is_fsl_mc(dma_dev))
+   dma_dev = dma_dev->parent;
+
+   return of_dma_configure(dev, dma_dev->of_node, 0);
+}
+
 static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
 char *buf)
 {
@@ -148,6 +158,7 @@ struct bus_type fsl_mc_bus_type = {
.name = "fsl-mc",
.match = fsl_mc_bus_match,
.uevent = fsl_mc_bus_uevent,
+   .dma_configure  = fsl_mc_dma_configure,
.dev_groups = fsl_mc_dev_groups,
 };
 EXPORT_SYMBOL_GPL(fsl_mc_bus_type);
@@ -616,6 +627,7 @@ int fsl_mc_device_add(struct fsl_mc_obj_desc *obj_desc,
mc_dev->icid = parent_mc_dev->icid;
mc_dev->dma_mask = FSL_MC_DEFAULT_DMA_MASK;
mc_dev->dev.dma_mask = _dev->dma_mask;
+   mc_dev->dev.coherent_dma_mask = mc_dev->dma_mask;
dev_set_msi_domain(_dev->dev,
   dev_get_msi_domain(_mc_dev->dev));
}
@@ -633,10 +645,6 @@ int fsl_mc_device_add(struct fsl_mc_obj_desc *obj_desc,
goto error_cleanup_dev;
}
 
-   /* Objects are coherent, unless 'no shareability' flag set. */
-   if (!(obj_desc->flags & FSL_MC_OBJ_FLAG_NO_MEM_SHAREABILITY))
-   arch_setup_dma_ops(_dev->dev, 0, 0, NULL, true);
-
/*
 * The device-specific probe callback will get invoked by device_add()
 */
-- 
1.9.1



[PATCH 4/6 v2] iommu: arm-smmu: Add support for the fsl-mc bus

2018-04-17 Thread Nipun Gupta
Implement bus-specific support for the fsl-mc bus, including
registering arm_smmu_ops and bus-specific device add operations.

Signed-off-by: Nipun Gupta 
---
 drivers/iommu/arm-smmu.c |  7 +++
 drivers/iommu/iommu.c| 21 +
 include/linux/fsl/mc.h   |  8 
 include/linux/iommu.h|  2 ++
 4 files changed, 38 insertions(+)

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 69e7c60..e1d5090 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -52,6 +52,7 @@
 #include 
 
 #include 
+#include 
 
 #include "io-pgtable.h"
 #include "arm-smmu-regs.h"
@@ -1459,6 +1460,8 @@ static struct iommu_group *arm_smmu_device_group(struct 
device *dev)
 
if (dev_is_pci(dev))
group = pci_device_group(dev);
+   else if (dev_is_fsl_mc(dev))
+   group = fsl_mc_device_group(dev);
else
group = generic_device_group(dev);
 
@@ -2037,6 +2040,10 @@ static void arm_smmu_bus_init(void)
bus_set_iommu(_bus_type, _smmu_ops);
}
 #endif
+#ifdef CONFIG_FSL_MC_BUS
+   if (!iommu_present(_mc_bus_type))
+   bus_set_iommu(_mc_bus_type, _smmu_ops);
+#endif
 }
 
 static int arm_smmu_device_probe(struct platform_device *pdev)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 69fef99..fbeebb2 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 static struct kset *iommu_group_kset;
@@ -987,6 +988,26 @@ struct iommu_group *pci_device_group(struct device *dev)
return iommu_group_alloc();
 }
 
+/* Get the IOMMU group for device on fsl-mc bus */
+struct iommu_group *fsl_mc_device_group(struct device *dev)
+{
+   struct device *cont_dev = fsl_mc_cont_dev(dev);
+   struct iommu_group *group;
+
+   /* Container device is responsible for creating the iommu group */
+   if (fsl_mc_is_cont_dev(dev)) {
+   group = iommu_group_alloc();
+   if (IS_ERR(group))
+   return NULL;
+   } else {
+   get_device(cont_dev);
+   group = iommu_group_get(cont_dev);
+   put_device(cont_dev);
+   }
+
+   return group;
+}
+
 /**
  * iommu_group_get_for_dev - Find or create the IOMMU group for a device
  * @dev: target device
diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h
index f27cb14..dddaca1 100644
--- a/include/linux/fsl/mc.h
+++ b/include/linux/fsl/mc.h
@@ -351,6 +351,14 @@ struct fsl_mc_io {
 #define dev_is_fsl_mc(_dev) (0)
 #endif
 
+/* Macro to check if a device is a container device */
+#define fsl_mc_is_cont_dev(_dev) (to_fsl_mc_device(_dev)->flags & \
+   FSL_MC_IS_DPRC)
+
+/* Macro to get the container device of a MC device */
+#define fsl_mc_cont_dev(_dev) (fsl_mc_is_cont_dev(_dev) ? \
+   (_dev) : (_dev)->parent)
+
 /*
  * module_fsl_mc_driver() - Helper macro for drivers that don't do
  * anything special in module init/exit.  This eliminates a lot of
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 41b8c57..00a460b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -389,6 +389,8 @@ static inline size_t iommu_map_sg(struct iommu_domain 
*domain,
 extern struct iommu_group *pci_device_group(struct device *dev);
 /* Generic device grouping function */
 extern struct iommu_group *generic_device_group(struct device *dev);
+/* FSL-MC device grouping function */
+struct iommu_group *fsl_mc_device_group(struct device *dev);
 
 /**
  * struct iommu_fwspec - per-device IOMMU instance data
-- 
1.9.1



[PATCH 3/6 v2] iommu: support iommu configuration for fsl-mc devices

2018-04-17 Thread Nipun Gupta
Signed-off-by: Nipun Gupta 
---
 drivers/iommu/of_iommu.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 4e7712f..af4fc3b 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define NO_IOMMU   1
 
@@ -260,6 +261,23 @@ static int of_pci_iommu_init(struct pci_dev *pdev, u16 
alias, void *data)
return err;
 }
 
+static int of_fsl_mc_iommu_init(struct fsl_mc_device *mc_dev,
+   struct device_node *master_np)
+{
+   struct of_phandle_args iommu_spec = { .args_count = 1 };
+   int err;
+
+   err = of_map_rid(master_np, mc_dev->icid, "iommu-map",
+"iommu-map-mask", _spec.np,
+iommu_spec.args);
+   if (err)
+   return err == -ENODEV ? NO_IOMMU : err;
+
+   err = of_iommu_xlate(_dev->dev, _spec);
+   of_node_put(iommu_spec.np);
+   return err;
+}
+
 const struct iommu_ops *of_iommu_configure(struct device *dev,
   struct device_node *master_np)
 {
@@ -291,6 +309,8 @@ const struct iommu_ops *of_iommu_configure(struct device 
*dev,
 
err = pci_for_each_dma_alias(to_pci_dev(dev),
 of_pci_iommu_init, );
+   } else if (dev_is_fsl_mc(dev)) {
+   err = of_fsl_mc_iommu_init(to_fsl_mc_device(dev), master_np);
} else {
struct of_phandle_args iommu_spec;
int idx = 0;
-- 
1.9.1



[PATCH 2/6 v2] iommu: of: make of_pci_map_rid() available for other devices too

2018-04-17 Thread Nipun Gupta
The iommu-map property is also used by devices on the fsl-mc bus. This
patch moves of_pci_map_rid to a generic location, so that it
can be used by other buses too.

Signed-off-by: Nipun Gupta 
---
 drivers/iommu/of_iommu.c | 106 +--
 drivers/of/irq.c |   6 +--
 drivers/pci/of.c | 101 
 include/linux/of_iommu.h |  11 +
 include/linux/of_pci.h   |  10 -
 5 files changed, 117 insertions(+), 117 deletions(-)

diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 5c36a8b..4e7712f 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -138,6 +138,106 @@ static int of_iommu_xlate(struct device *dev,
return ops->of_xlate(dev, iommu_spec);
 }
 
+/**
+ * of_map_rid - Translate a requester ID through a downstream mapping.
+ * @np: root complex device node.
+ * @rid: device requester ID to map.
+ * @map_name: property name of the map to use.
+ * @map_mask_name: optional property name of the mask to use.
+ * @target: optional pointer to a target device node.
+ * @id_out: optional pointer to receive the translated ID.
+ *
+ * Given a device requester ID, look up the appropriate implementation-defined
+ * platform ID and/or the target device which receives transactions on that
+ * ID, as per the "iommu-map" and "msi-map" bindings. Either of @target or
+ * @id_out may be NULL if only the other is required. If @target points to
+ * a non-NULL device node pointer, only entries targeting that node will be
+ * matched; if it points to a NULL value, it will receive the device node of
+ * the first matching target phandle, with a reference held.
+ *
+ * Return: 0 on success or a standard error code on failure.
+ */
+int of_map_rid(struct device_node *np, u32 rid,
+  const char *map_name, const char *map_mask_name,
+  struct device_node **target, u32 *id_out)
+{
+   u32 map_mask, masked_rid;
+   int map_len;
+   const __be32 *map = NULL;
+
+   if (!np || !map_name || (!target && !id_out))
+   return -EINVAL;
+
+   map = of_get_property(np, map_name, _len);
+   if (!map) {
+   if (target)
+   return -ENODEV;
+   /* Otherwise, no map implies no translation */
+   *id_out = rid;
+   return 0;
+   }
+
+   if (!map_len || map_len % (4 * sizeof(*map))) {
+   pr_err("%pOF: Error: Bad %s length: %d\n", np,
+   map_name, map_len);
+   return -EINVAL;
+   }
+
+   /* The default is to select all bits. */
+   map_mask = 0x;
+
+   /*
+* Can be overridden by "{iommu,msi}-map-mask" property.
+*/
+   if (map_mask_name)
+   of_property_read_u32(np, map_mask_name, _mask);
+
+   masked_rid = map_mask & rid;
+   for ( ; map_len > 0; map_len -= 4 * sizeof(*map), map += 4) {
+   struct device_node *phandle_node;
+   u32 rid_base = be32_to_cpup(map + 0);
+   u32 phandle = be32_to_cpup(map + 1);
+   u32 out_base = be32_to_cpup(map + 2);
+   u32 rid_len = be32_to_cpup(map + 3);
+
+   if (rid_base & ~map_mask) {
+   pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) 
ignores rid-base (0x%x)\n",
+   np, map_name, map_name,
+   map_mask, rid_base);
+   return -EFAULT;
+   }
+
+   if (masked_rid < rid_base || masked_rid >= rid_base + rid_len)
+   continue;
+
+   phandle_node = of_find_node_by_phandle(phandle);
+   if (!phandle_node)
+   return -ENODEV;
+
+   if (target) {
+   if (*target)
+   of_node_put(phandle_node);
+   else
+   *target = phandle_node;
+
+   if (*target != phandle_node)
+   continue;
+   }
+
+   if (id_out)
+   *id_out = masked_rid - rid_base + out_base;
+
+   pr_debug("%pOF: %s, using mask %08x, rid-base: %08x, out-base: 
%08x, length: %08x, rid: %08x -> %08x\n",
+   np, map_name, map_mask, rid_base, out_base,
+   rid_len, rid, masked_rid - rid_base + out_base);
+   return 0;
+   }
+
+   pr_err("%pOF: Invalid %s translation - no match for rid 0x%x on %pOF\n",
+   np, map_name, rid, target && *target ? *target : NULL);
+   return -EFAULT;
+}
+
 struct of_pci_iommu_alias_info {
struct device *dev;
struct device_node *np;
@@ -149,9 +249,9 @@ static int of_pci_iommu_init(struct pci_dev *pdev, u16 
alias, void *data)
struct of_phandle_args iommu_spec = { .args_count = 

[PATCH 1/6 v2] Docs: dt: add fsl-mc iommu-map device-tree binding

2018-04-17 Thread Nipun Gupta
The existing IOMMU bindings cannot be used to specify the relationship
between fsl-mc devices and IOMMUs. This patch adds a generic binding for
mapping fsl-mc devices to IOMMUs, using the iommu-map property.

Signed-off-by: Nipun Gupta 
---
 .../devicetree/bindings/misc/fsl,qoriq-mc.txt  | 39 ++
 1 file changed, 39 insertions(+)

diff --git a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt 
b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt
index 6611a7c..8cbed4f 100644
--- a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt
+++ b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt
@@ -9,6 +9,25 @@ blocks that can be used to create functional hardware 
objects/devices
 such as network interfaces, crypto accelerator instances, L2 switches,
 etc.
 
+For an overview of the DPAA2 architecture and fsl-mc bus see:
+drivers/staging/fsl-mc/README.txt
+
+As described in the above overview, all DPAA2 objects in a DPRC share the
+same hardware "isolation context" and a 10-bit value called an ICID
+(isolation context id) is expressed by the hardware to identify
+the requester.
+
+The generic 'iommus' property is insufficient to describe the relationship
+between ICIDs and IOMMUs, so an iommu-map property is used to define
+the set of possible ICIDs under a root DPRC and how they map to
+an IOMMU.
+
+For generic IOMMU bindings, see
+Documentation/devicetree/bindings/iommu/iommu.txt.
+
+For arm-smmu binding, see:
+Documentation/devicetree/bindings/iommu/arm,smmu.txt.
+
 Required properties:
 
 - compatible
@@ -88,14 +107,34 @@ Sub-nodes:
   Value type: 
   Definition: Specifies the phandle to the PHY device node 
associated
   with the this dpmac.
+Optional properties:
+
+- iommu-map: Maps an ICID to an IOMMU and associated iommu-specifier
+  data.
+
+  The property is an arbitrary number of tuples of
+  (icid-base,iommu,iommu-base,length).
+
+  Any ICID i in the interval [icid-base, icid-base + length) is
+  associated with the listed IOMMU, with the iommu-specifier
+  (i - icid-base + iommu-base).
 
 Example:
 
+smmu: iommu@500 {
+   compatible = "arm,mmu-500";
+   #iommu-cells = <2>;
+   stream-match-mask = <0x7C00>;
+   ...
+};
+
 fsl_mc: fsl-mc@80c00 {
 compatible = "fsl,qoriq-mc";
 reg = <0x0008 0x0c00 0 0x40>,/* MC portal base */
   <0x 0x0834 0 0x4>; /* MC control reg */
 msi-parent = <>;
+/* define map for ICIDs 23-64 */
        iommu-map = <23 &smmu 23 41>;
 #address-cells = <3>;
 #size-cells = <1>;
 
-- 
1.9.1



[PATCH 0/6 v2] Support for fsl-mc bus and its devices in SMMU

2018-04-17 Thread Nipun Gupta
This patchset defines IOMMU DT binding for fsl-mc bus and adds
support in SMMU for fsl-mc bus.

This patch series is dependent on patchset:
https://patchwork.kernel.org/patch/10317337/

These patches
  - Define property 'iommu-map' for fsl-mc bus (patch 1)
  - Integrates the fsl-mc bus with the SMMU using this
IOMMU binding (patch 2,3,4)
  - Adds the dma configuration support for fsl-mc bus (patch 5)
  - Updates the fsl-mc device node with iommu/dma related changes (patch 6)

Nipun Gupta (6):
  Docs: dt: add fsl-mc iommu-map device-tree binding
  iommu: of: make of_pci_map_rid() available for other devices too
  iommu: support iommu configuration for fsl-mc devices
  iommu: arm-smmu: Add support for the fsl-mc bus
  bus: fsl-mc: support dma configure for devices on fsl-mc bus
  arm64: dts: ls208xa: comply with the iommu map binding for fsl_mc

Changes in v2:
  - use iommu-map property for fsl-mc bus
  - rebase over patchset https://patchwork.kernel.org/patch/10317337/
and make corresponding changes for dma configuration of devices on
fsl-mc bus

 .../devicetree/bindings/misc/fsl,qoriq-mc.txt  |  39 +++
 arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi |   6 +-
 drivers/bus/fsl-mc/fsl-mc-bus.c|  16 ++-
 drivers/iommu/arm-smmu.c   |   7 ++
 drivers/iommu/iommu.c  |  21 
 drivers/iommu/of_iommu.c   | 126 -
 drivers/of/irq.c   |   6 +-
 drivers/pci/of.c   | 101 -
 include/linux/fsl/mc.h |   8 ++
 include/linux/iommu.h  |   2 +
 include/linux/of_iommu.h   |  11 ++
 include/linux/of_pci.h |  10 --
 12 files changed, 231 insertions(+), 122 deletions(-)

-- 
1.9.1



Re: [1/5] powerpc/lib: Fix off-by-one in alternate feature patching

2018-04-17 Thread Michael Ellerman
On Mon, 2018-04-16 at 14:39:01 UTC, Michael Ellerman wrote:
> When we patch an alternate feature section, we have to adjust any
> relative branches that branch out of the alternate section.
> 
> But currently we have a bug if we have a branch that points to past
> the last instruction of the alternate section, eg:
> 
>   FTR_SECTION_ELSE
>   1: b   2f
>  or  6,6,6
>   2:
>   ALT_FTR_SECTION_END(...)
>  nop
> 
> This will result in a relative branch at 1 with a target that equals
> the end of the alternate section.
> 
> That branch does not need adjusting when it's moved to the non-else
> location. Currently we do adjust it, resulting in a branch that goes
> off into the link-time location of the else section, which is junk.
> 
> The fix is to not patch branches that have a target == end of the
> alternate section.
> 
> Fixes: d20fe50a7b3c ("KVM: PPC: Book3S HV: Branch inside feature section")
> Fixes: 9b1a735de64c ("powerpc: Add logic to patch alternative feature 
> sections")
> Cc: sta...@vger.kernel.org # v2.6.27+
> Signed-off-by: Michael Ellerman 

Applied to powerpc fixes.

https://git.kernel.org/powerpc/c/b8858581febb050688e276b956796b

cheers


Re: powerpc/64s: Default l1d_size to 64K in RFI fallback flush

2018-04-17 Thread Michael Ellerman
On Tue, 2018-04-17 at 01:49:20 UTC, Michael Ellerman wrote:
> From: Madhavan Srinivasan 
> 
> If there is no d-cache-size property in the device tree, l1d_size could
> be zero. We don't actually expect that to happen, it's only been seen
> on mambo (simulator) in some configurations.
> 
> A zero-size l1d_size leads to the loop in the asm wrapping around to
> 2^64-1, and then walking off the end of the fallback area and
> eventually causing a page fault which is fatal.
> 
> Just default to 64K which is correct on some CPUs, and sane enough to
> not cause a crash on others.
> 
> Fixes: aa8a5e0062ac9 ('powerpc/64s: Add support for RFI flush of L1-D cache')
> Signed-off-by: Madhavan Srinivasan 
> [mpe: Rewrite comment and change log]
> Signed-off-by: Michael Ellerman 

Applied to powerpc fixes.

https://git.kernel.org/powerpc/c/9dfbf78e4114fcaf4ef61c49885c3a

cheers


[RESEND PATCH 1/3] powerpc: dts: use 'atmel' as at24 manufacturer for pdm360ng

2018-04-17 Thread Bartosz Golaszewski
Using 'at' as the <manufacturer> part of the compatible string is now
deprecated. Use a correct string: 'atmel,<model>'.

Signed-off-by: Bartosz Golaszewski 
---
 arch/powerpc/boot/dts/pdm360ng.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/boot/dts/pdm360ng.dts 
b/arch/powerpc/boot/dts/pdm360ng.dts
index 445b88114009..df1283b63d9b 100644
--- a/arch/powerpc/boot/dts/pdm360ng.dts
+++ b/arch/powerpc/boot/dts/pdm360ng.dts
@@ -98,7 +98,7 @@
fsl,preserve-clocking;
 
eeprom@50 {
-   compatible = "at,24c01";
+   compatible = "atmel,24c01";
reg = <0x50>;
};
 
-- 
2.17.0



[RESEND PATCH 3/3] powerpc: dts: use a correct at24 compatible fallback in ac14xx

2018-04-17 Thread Bartosz Golaszewski
Using 'at24' as fallback is now deprecated - use the full
'atmel,<model>' string.

Signed-off-by: Bartosz Golaszewski 
---
 arch/powerpc/boot/dts/ac14xx.dts | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/boot/dts/ac14xx.dts b/arch/powerpc/boot/dts/ac14xx.dts
index 83bcfd865167..0be5c4f3265d 100644
--- a/arch/powerpc/boot/dts/ac14xx.dts
+++ b/arch/powerpc/boot/dts/ac14xx.dts
@@ -176,12 +176,12 @@
clock-frequency = <40>;
 
at24@30 {
-   compatible = "at24,24c01";
+   compatible = "atmel,24c01";
reg = <0x30>;
};
 
at24@31 {
-   compatible = "at24,24c01";
+   compatible = "atmel,24c01";
reg = <0x31>;
};
 
@@ -191,42 +191,42 @@
};
 
at24@50 {
-   compatible = "at24,24c01";
+   compatible = "atmel,24c01";
reg = <0x50>;
};
 
at24@51 {
-   compatible = "at24,24c01";
+   compatible = "atmel,24c01";
reg = <0x51>;
};
 
at24@52 {
-   compatible = "at24,24c01";
+   compatible = "atmel,24c01";
reg = <0x52>;
};
 
at24@53 {
-   compatible = "at24,24c01";
+   compatible = "atmel,24c01";
reg = <0x53>;
};
 
at24@54 {
-   compatible = "at24,24c01";
+   compatible = "atmel,24c01";
reg = <0x54>;
};
 
at24@55 {
-   compatible = "at24,24c01";
+   compatible = "atmel,24c01";
reg = <0x55>;
};
 
at24@56 {
-   compatible = "at24,24c01";
+   compatible = "atmel,24c01";
reg = <0x56>;
};
 
at24@57 {
-   compatible = "at24,24c01";
+   compatible = "atmel,24c01";
reg = <0x57>;
};
 
-- 
2.17.0



[RESEND PATCH 2/3] powerpc: dts: use 'atmel' as at24 manufacturer for kmcent2

2018-04-17 Thread Bartosz Golaszewski
Using compatible strings without the <manufacturer> part for at24 is
now deprecated. Use a correct 'atmel,<model>' value.

Signed-off-by: Bartosz Golaszewski 
---
 arch/powerpc/boot/dts/fsl/kmcent2.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/boot/dts/fsl/kmcent2.dts 
b/arch/powerpc/boot/dts/fsl/kmcent2.dts
index 5922c1ea0e96..3094df05f5ea 100644
--- a/arch/powerpc/boot/dts/fsl/kmcent2.dts
+++ b/arch/powerpc/boot/dts/fsl/kmcent2.dts
@@ -130,7 +130,7 @@
#size-cells = <0>;
 
eeprom@54 {
-   compatible = "24c02";
+   compatible = "atmel,24c02";
reg = <0x54>;
pagesize = <2>;
read-only;
-- 
2.17.0



Re: [PATCH] powerpc/misc: get rid of add_reloc_offset()

2018-04-17 Thread Paul Mackerras
On Tue, Apr 17, 2018 at 09:56:24AM +0200, Christophe Leroy wrote:
> add_reloc_offset() is almost redundant with reloc_offset()
> 
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/include/asm/setup.h   |  3 +--
>  arch/powerpc/kernel/misc.S | 16 
>  arch/powerpc/kernel/prom_init_check.sh |  2 +-
>  3 files changed, 2 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/setup.h 
> b/arch/powerpc/include/asm/setup.h
> index 27fa52ed6d00..115e0896ffa7 100644
> --- a/arch/powerpc/include/asm/setup.h
> +++ b/arch/powerpc/include/asm/setup.h
> @@ -17,10 +17,9 @@ extern void note_scsi_host(struct device_node *, void *);
>  
>  /* Used in very early kernel initialization. */
>  extern unsigned long reloc_offset(void);
> -extern unsigned long add_reloc_offset(unsigned long);
>  extern void reloc_got2(unsigned long);
>  
> -#define PTRRELOC(x)  ((typeof(x)) add_reloc_offset((unsigned long)(x)))
> +#define PTRRELOC(x)  ((typeof(x)) ((unsigned long)(x) + reloc_offset()))

NAK.  This is how it used to be, and we changed it in order to prevent
gcc from making incorrect assumptions.  If you use the form with the
explicit addition, and x is the address of an array, gcc will assume
that the result is within the bounds of the array (apparently the C
standard says it can do that) and potentially generate incorrect
code.  I recall that we had an actual case where gcc was generating
incorrect code, though I don't recall the details, as this was some
time before 2002.

Paul.
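
To make the concern concrete, here is a hypothetical illustration (not
from the patch; the names are made up). With the plain C addition, the
compiler can see that the result is derived from a known object and,
since forming an out-of-bounds pointer is undefined behaviour, may
assume the result still points within that object:

	extern unsigned long reloc_offset(void);
	static int early_table[4];

	static int *early_table_reloc(void)
	{
		/*
		 * Plain C addition: GCC may assume the returned pointer
		 * still lies within early_table[], which is not true while
		 * the kernel runs at a non-zero relocation offset.
		 */
		return (int *)((unsigned long)early_table + reloc_offset());
	}

Keeping the addition inside the asm helper add_reloc_offset() hides that
derivation from GCC, so no such assumption can be made.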


Re: [PATCH 1/2] powernv/npu: Do a PID GPU TLB flush when invalidating a large address range

2018-04-17 Thread Balbir Singh
On Tue, Apr 17, 2018 at 7:11 PM, Alistair Popple  wrote:
> The NPU has a limited number of address translation shootdown (ATSD)
> registers and the GPU has limited bandwidth to process ATSDs. This can
> result in contention of ATSD registers leading to soft lockups on some
> threads, particularly when invalidating a large address range in
> pnv_npu2_mn_invalidate_range().
>
> At some threshold it becomes more efficient to flush the entire GPU TLB for
> the given MM context (PID) than individually flushing each address in the
> range. This patch will result in ranges greater than 2MB being converted
> from 32+ ATSDs into a single ATSD which will flush the TLB for the given
> PID on each GPU.
>
> Signed-off-by: Alistair Popple 
> ---
>  arch/powerpc/platforms/powernv/npu-dma.c | 23 +++
>  1 file changed, 19 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
> b/arch/powerpc/platforms/powernv/npu-dma.c
> index 94801d8e7894..dc34662e9df9 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -40,6 +40,13 @@
>  DEFINE_SPINLOCK(npu_context_lock);
>
>  /*
> + * When an address shootdown range exceeds this threshold we invalidate the
> + * entire TLB on the GPU for the given PID rather than each specific address 
> in
> + * the range.
> + */
> +#define ATSD_THRESHOLD (2*1024*1024)
> +
> +/*
>   * Other types of TCE cache invalidation are not functional in the
>   * hardware.
>   */
> @@ -675,11 +682,19 @@ static void pnv_npu2_mn_invalidate_range(struct 
> mmu_notifier *mn,
> struct npu_context *npu_context = mn_to_npu_context(mn);
> unsigned long address;
>
> -   for (address = start; address < end; address += PAGE_SIZE)
> -   mmio_invalidate(npu_context, 1, address, false);
> +   if (end - start > ATSD_THRESHOLD) {

I'm nitpicking, but (end - start) > ATSD_THRESHOLD is clearer

> +   /*
> +* Just invalidate the entire PID if the address range is too
> +* large.
> +*/
> +   mmio_invalidate(npu_context, 0, 0, true);
> +   } else {
> +   for (address = start; address < end; address += PAGE_SIZE)
> +   mmio_invalidate(npu_context, 1, address, false);
>
> -   /* Do the flush only on the final addess == end */
> -   mmio_invalidate(npu_context, 1, address, true);
> +   /* Do the flush only on the final addess == end */
> +   mmio_invalidate(npu_context, 1, address, true);
> +   }
>  }
>

Acked-by: Balbir Singh 


[PATCH 2/2] powernv/npu: Add a debugfs setting to change ATSD threshold

2018-04-17 Thread Alistair Popple
The threshold at which it becomes more efficient to coalesce a range of
ATSDs into a single per-PID ATSD is currently not well understood due to a
lack of real-world work loads. This patch adds a debugfs parameter allowing
the threshold to be altered at runtime in order to aid future development
and refinement of the value.
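
For example, assuming debugfs is mounted at /sys/kernel/debug and that
powerpc_debugfs_root corresponds to the "powerpc" directory there (both
assumptions, not stated in the patch), the threshold could be changed
from userspace with a small helper like:

	#include <stdio.h>

	int main(void)
	{
		/* path and new value are illustrative only */
		FILE *f = fopen("/sys/kernel/debug/powerpc/atsd_threshold", "w");

		if (!f)
			return 1;
		fprintf(f, "0x400000\n");	/* raise the threshold to 4MB */
		return fclose(f) != 0;
	}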

Signed-off-by: Alistair Popple 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index dc34662e9df9..a765bf576c14 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -17,7 +17,9 @@
 #include 
 #include 
 #include 
+#include 
 
+#include 
 #include 
 #include 
 #include 
@@ -44,7 +46,8 @@ DEFINE_SPINLOCK(npu_context_lock);
  * entire TLB on the GPU for the given PID rather than each specific address in
  * the range.
  */
-#define ATSD_THRESHOLD (2*1024*1024)
+static uint64_t atsd_threshold = 2 * 1024 * 1024;
+static struct dentry *atsd_threshold_dentry;
 
 /*
  * Other types of TCE cache invalidation are not functional in the
@@ -682,7 +685,7 @@ static void pnv_npu2_mn_invalidate_range(struct 
mmu_notifier *mn,
struct npu_context *npu_context = mn_to_npu_context(mn);
unsigned long address;
 
-   if (end - start > ATSD_THRESHOLD) {
+   if (end - start > atsd_threshold) {
/*
 * Just invalidate the entire PID if the address range is too
 * large.
@@ -956,6 +959,11 @@ int pnv_npu2_init(struct pnv_phb *phb)
static int npu_index;
uint64_t rc = 0;
 
+   if (!atsd_threshold_dentry) {
+   atsd_threshold_dentry = debugfs_create_x64("atsd_threshold",
+  0600, powerpc_debugfs_root, _threshold);
+   }
+
phb->npu.nmmu_flush =
of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush");
for_each_child_of_node(phb->hose->dn, dn) {
-- 
2.11.0



[PATCH 1/2] powernv/npu: Do a PID GPU TLB flush when invalidating a large address range

2018-04-17 Thread Alistair Popple
The NPU has a limited number of address translation shootdown (ATSD)
registers and the GPU has limited bandwidth to process ATSDs. This can
result in contention of ATSD registers leading to soft lockups on some
threads, particularly when invalidating a large address range in
pnv_npu2_mn_invalidate_range().

At some threshold it becomes more efficient to flush the entire GPU TLB for
the given MM context (PID) than individually flushing each address in the
range. This patch will result in ranges greater than 2MB being converted
from 32+ ATSDs into a single ATSD which will flush the TLB for the given
PID on each GPU.
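
As a worked example of where the 32+ figure comes from (assuming the 64K
base page size used on these systems, which the patch does not spell
out): a 2MB range covers 2MB / 64KB = 32 pages, i.e. 32 per-address
ATSDs, which above the threshold are replaced by a single per-PID ATSD.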

Signed-off-by: Alistair Popple 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 23 +++
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 94801d8e7894..dc34662e9df9 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -40,6 +40,13 @@
 DEFINE_SPINLOCK(npu_context_lock);
 
 /*
+ * When an address shootdown range exceeds this threshold we invalidate the
+ * entire TLB on the GPU for the given PID rather than each specific address in
+ * the range.
+ */
+#define ATSD_THRESHOLD (2*1024*1024)
+
+/*
  * Other types of TCE cache invalidation are not functional in the
  * hardware.
  */
@@ -675,11 +682,19 @@ static void pnv_npu2_mn_invalidate_range(struct 
mmu_notifier *mn,
struct npu_context *npu_context = mn_to_npu_context(mn);
unsigned long address;
 
-   for (address = start; address < end; address += PAGE_SIZE)
-   mmio_invalidate(npu_context, 1, address, false);
+   if (end - start > ATSD_THRESHOLD) {
+   /*
+* Just invalidate the entire PID if the address range is too
+* large.
+*/
+   mmio_invalidate(npu_context, 0, 0, true);
+   } else {
+   for (address = start; address < end; address += PAGE_SIZE)
+   mmio_invalidate(npu_context, 1, address, false);
 
-   /* Do the flush only on the final addess == end */
-   mmio_invalidate(npu_context, 1, address, true);
+   /* Do the flush only on the final addess == end */
+   mmio_invalidate(npu_context, 1, address, true);
+   }
 }
 
 static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
-- 
2.11.0



[PATCH] powerpc/misc: get rid of add_reloc_offset()

2018-04-17 Thread Christophe Leroy
add_reloc_offset() is almost redundant with reloc_offset()

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/setup.h   |  3 +--
 arch/powerpc/kernel/misc.S | 16 
 arch/powerpc/kernel/prom_init_check.sh |  2 +-
 3 files changed, 2 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
index 27fa52ed6d00..115e0896ffa7 100644
--- a/arch/powerpc/include/asm/setup.h
+++ b/arch/powerpc/include/asm/setup.h
@@ -17,10 +17,9 @@ extern void note_scsi_host(struct device_node *, void *);
 
 /* Used in very early kernel initialization. */
 extern unsigned long reloc_offset(void);
-extern unsigned long add_reloc_offset(unsigned long);
 extern void reloc_got2(unsigned long);
 
-#define PTRRELOC(x)((typeof(x)) add_reloc_offset((unsigned long)(x)))
+#define PTRRELOC(x)((typeof(x)) ((unsigned long)(x) + reloc_offset()))
 
 void check_for_initrd(void);
 void mem_topology_setup(void);
diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index 384357cb8bc0..2711b10ebdb3 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -39,22 +39,6 @@ _GLOBAL(reloc_offset)
.align  3
 2: PPC_LONG 1b
 
-/*
- * add_reloc_offset(x) returns x + reloc_offset().
- */
-_GLOBAL(add_reloc_offset)
-   mflrr0
-   bl  1f
-1: mflrr5
-   PPC_LL  r4,(2f-1b)(r5)
-   subfr5,r4,r5
-   add r3,r3,r5
-   mtlrr0
-   blr
-
-   .align  3
-2: PPC_LONG 1b
-
 _GLOBAL(setjmp)
mflrr0
PPC_STL r0,0(r3)
diff --git a/arch/powerpc/kernel/prom_init_check.sh 
b/arch/powerpc/kernel/prom_init_check.sh
index acb6b9226352..ee9f63186b72 100644
--- a/arch/powerpc/kernel/prom_init_check.sh
+++ b/arch/powerpc/kernel/prom_init_check.sh
@@ -16,7 +16,7 @@
 # If you really need to reference something from prom_init.o add
 # it to the list below:
 
-WHITELIST="add_reloc_offset __bss_start __bss_stop copy_and_flush
+WHITELIST="__bss_start __bss_stop copy_and_flush
 _end enter_prom memcpy memset reloc_offset __secondary_hold
 __secondary_hold_acknowledge __secondary_hold_spinloop __start
 strcmp strcpy strlcpy strlen strncmp strstr kstrtobool logo_linux_clut224
-- 
2.13.3



[PATCH 7/7] powerpc/lib: Remove .balign inside string functions for PPC32

2018-04-17 Thread Christophe Leroy
commit 87a156fb18fe1 ("Align hot loops of some string functions")
degraded the performance of string functions by adding useless
nops.

A simple benchmark on an 8xx calling 10x a memchr() that
matches the first byte runs in 41668 TB ticks before this patch
and in 35986 TB ticks after this patch. So this gives an
improvement of approx 10%.

Another benchmark doing the same with a memchr() matching the 128th
byte runs in 1011365 TB ticks before this patch and 1005682 TB ticks
after this patch, so regardless of the number of loops, removing
those useless nops improves the test by 5683 TB ticks.

Fixes: 87a156fb18fe1 ("Align hot loops of some string functions")
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/lib/string.S | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 89af53b08b4a..9e96f1c102c6 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -25,7 +25,9 @@ _GLOBAL(strncpy)
mtctr   r5
addir6,r3,-1
addir4,r4,-1
+#ifdef CONFIG_PPC64
.balign 16
+#endif
 1: lbzur0,1(r4)
cmpwi   0,r0,0
stbur0,1(r6)
@@ -47,7 +49,9 @@ _GLOBAL(strncmp)
mtctr   r5
addir5,r3,-1
addir4,r4,-1
+#ifdef CONFIG_PPC64
.balign 16
+#endif
 1: lbzur3,1(r5)
cmpwi   1,r3,0
lbzur0,1(r4)
@@ -68,7 +72,9 @@ _GLOBAL(memchr)
 #endif
mtctr   r5
addir3,r3,-1
+#ifdef CONFIG_PPC64
.balign 16
+#endif
 1: lbzur0,1(r3)
cmpw0,r0,r4
bdnzf   2,1b
-- 
2.13.3



[PATCH 6/7] powerpc/lib: inline more NUL size verifications

2018-04-17 Thread Christophe Leroy
strncmp(), strncpy() and memchr() are often called with a constant
size.

This patch gives GCC a chance to optimise the zero-size verification out.
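
A sketch of the effect (assuming the wrappers added below): when the
size is a compile-time constant the unlikely(!size) test is folded away,
for example

	memchr(buf, 0, 16);	/* direct call to the assembly memchr */
	memchr(buf, 0, 0);	/* reduced to NULL, no call emitted */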

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/string.h | 24 
 arch/powerpc/lib/string.S |  8 
 2 files changed, 32 insertions(+)

diff --git a/arch/powerpc/include/asm/string.h 
b/arch/powerpc/include/asm/string.h
index 196ac5d587fb..1465d5629ef2 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -30,6 +30,22 @@ extern void * memchr(const void *,int,__kernel_size_t);
 extern void * memcpy_flushcache(void *,const void *,__kernel_size_t);
 
 #ifndef CONFIG_FORTIFY_SOURCE
+static inline char *__strncpy(char *p, const char *q, __kernel_size_t size)
+{
+   if (unlikely(!size))
+   return p;
+   return strncpy(p, q, size);
+}
+#define strncpy __strncpy
+
+static inline int __strncmp(const char *p, const char *q, __kernel_size_t size)
+{
+   if (unlikely(!size))
+   return 0;
+   return strncmp(p, q, size);
+}
+#define strncmp __strncmp
+
 static inline int ___memcmp(const void *p,const void *q,__kernel_size_t size, 
int offset)
 {
int dif;
@@ -72,6 +88,14 @@ static inline int __memcmp(const void *p,const void 
*q,__kernel_size_t size)
return memcmp(p, q, size);
 }
 #define memcmp __memcmp
+
+static inline void *__memchr(const void *p, int c, __kernel_size_t size)
+{
+   if (unlikely(!size))
+   return NULL;
+   return memchr(p, c, size);
+}
+#define memchr __memchr
 #endif
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index cbb90fdc672d..89af53b08b4a 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -18,8 +18,10 @@
 /* This clears out any unused part of the destination buffer,
just as the libc version does.  -- paulus */
 _GLOBAL(strncpy)
+#ifdef CONFIG_FORTIFY_SOURCE
PPC_LCMPI 0,r5,0
beqlr
+#endif
mtctr   r5
addir6,r3,-1
addir4,r4,-1
@@ -38,8 +40,10 @@ _GLOBAL(strncpy)
 EXPORT_SYMBOL(strncpy)
 
 _GLOBAL(strncmp)
+#ifdef CONFIG_FORTIFY_SOURCE
PPC_LCMPI 0,r5,0
beq-2f
+#endif
mtctr   r5
addir5,r3,-1
addir4,r4,-1
@@ -51,13 +55,17 @@ _GLOBAL(strncmp)
beqlr   1
bdnzt   eq,1b
blr
+#ifdef CONFIG_FORTIFY_SOURCE
 2: li  r3,0
blr
+#endif
 EXPORT_SYMBOL(strncmp)
 
 _GLOBAL(memchr)
+#ifdef CONFIG_FORTIFY_SOURCE
PPC_LCMPI 0,r5,0
beq-2f
+#endif
mtctr   r5
addir3,r3,-1
.balign 16
-- 
2.13.3



[PATCH 5/7] powerpc/lib: optimise 32 bits __clear_user()

2018-04-17 Thread Christophe Leroy
Rewrite clear_user() on the same principle as memset(0), making use
of dcbz to clear complete cache lines.

This code is a copy/paste of memset(), with some modifications
to keep track of the remaining number of bytes to be cleared,
as it needs to be returned in case of error.
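
For readers not fluent in the assembly below, a rough C outline of the
approach (illustration only; the real implementation is the assembly,
which additionally provides exception-table fixups so that a fault
returns the number of bytes left to clear):

	#include <asm/cache.h>		/* L1_CACHE_BYTES */

	static inline void dcbz(void *p)
	{
		__asm__ __volatile__("dcbz 0, %0" : : "r" (p) : "memory");
	}

	static void zero_cacheable(char *p, unsigned long n)
	{
		/* store zeroes until p reaches a cache-line boundary */
		while (n && ((unsigned long)p & (L1_CACHE_BYTES - 1))) {
			*p++ = 0;
			n--;
		}
		/* dcbz zeroes a whole cache line in one go */
		while (n >= L1_CACHE_BYTES) {
			dcbz(p);
			p += L1_CACHE_BYTES;
			n -= L1_CACHE_BYTES;
		}
		/* tail bytes */
		while (n--)
			*p++ = 0;
	}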

On a MPC885, throughput is almost doubled:

Before:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 18.990779 seconds, 52.7MB/s

After:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 9.611468 seconds, 104.0MB/s

On a MPC8321, throughput is multiplied by 2.12:

Before:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 6.844352 seconds, 146.1MB/s

After:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 3.218854 seconds, 310.7MB/s

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/lib/string_32.S | 85 +++-
 1 file changed, 60 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 5b2a73fb07be..31fc92b0aae6 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
.text
 
@@ -61,44 +62,78 @@ _GLOBAL(memcmp)
 #endif
 EXPORT_SYMBOL(memcmp)
 
+CACHELINE_BYTES = L1_CACHE_BYTES
+LG_CACHELINE_BYTES = L1_CACHE_SHIFT
+CACHELINE_MASK = (L1_CACHE_BYTES-1)
+
 _GLOBAL(__clear_user)
-   addir6,r3,-4
-   li  r3,0
-   li  r5,0
-   cmplwi  0,r4,4
+/*
+ * Use dcbz on the complete cache lines in the destination
+ * to set them to zero.  This requires that the destination
+ * area is cacheable.
+ */
+   cmplwi  cr0, r4, 4
+   mr  r10, r3
+   li  r3, 0
blt 7f
-   /* clear a single word */
-11:stwur5,4(r6)
+
+11:stw r3, 0(r10)
beqlr
-   /* clear word sized chunks */
-   andi.   r0,r6,3
-   add r4,r0,r4
-   subfr6,r0,r6
-   srwir0,r4,2
-   andi.   r4,r4,3
+   andi.   r0, r10, 3
+   add r11, r0, r4
+   subfr6, r0, r10
+
+   clrlwi  r7, r6, 32 - LG_CACHELINE_BYTES
+   add r8, r7, r11
+   srwir9, r8, LG_CACHELINE_BYTES
+   addic.  r9, r9, -1  /* total number of complete cachelines */
+   ble 2f
+   xorir0, r7, CACHELINE_MASK & ~3
+   srwi.   r0, r0, 2
+   beq 3f
+   mtctr   r0
+4: stwur3, 4(r6)
+   bdnz4b
+3: mtctr   r9
+   li  r7, 4
+10:dcbzr7, r6
+   addir6, r6, CACHELINE_BYTES
+   bdnz10b
+   clrlwi  r11, r8, 32 - LG_CACHELINE_BYTES
+   addir11, r11, 4
+
+2: srwir0 ,r11 ,2
mtctr   r0
-   bdz 7f
-1: stwur5,4(r6)
+   bdz 6f
+1: stwur3, 4(r6)
bdnz1b
-   /* clear byte sized chunks */
-7: cmpwi   0,r4,0
+6: andi.   r11, r11, 3
beqlr
-   mtctr   r4
-   addir6,r6,3
-8: stbur5,1(r6)
+   mtctr   r11
+   addir6, r6, 3
+8: stbur3, 1(r6)
bdnz8b
blr
-90:mr  r3,r4
+
+7: cmpwi   cr0, r4, 0
+   beqlr
+   mtctr   r4
+   addir6, r10, -1
+9: stbur3, 1(r6)
+   bdnz9b
blr
-91:mfctr   r3
-   slwir3,r3,2
-   add r3,r3,r4
+
+90:mr  r3, r4
blr
-92:mfctr   r3
+91:add r3, r10, r4
+   subfr3, r6, r3
blr
 
EX_TABLE(11b, 90b)
+   EX_TABLE(4b, 91b)
+   EX_TABLE(10b, 91b)
EX_TABLE(1b, 91b)
-   EX_TABLE(8b, 92b)
+   EX_TABLE(8b, 91b)
+   EX_TABLE(9b, 91b)
 
 EXPORT_SYMBOL(__clear_user)
-- 
2.13.3



[PATCH 4/7] powerpc/lib: inline memcmp() for small constant sizes

2018-04-17 Thread Christophe Leroy
In my 8xx configuration, I get 208 calls to memcmp().
Within those 208 calls, about half of them have constant sizes:
46 have a size of 8, 17 have a size of 16, and only a few have a
size over 16. Other fixed sizes are mostly 4, 6 and 10.

This patch inlines calls to memcmp() when the size
is constant and lower than or equal to 16.

In my 8xx configuration, this reduces the number of calls
to memcmp() from 208 to 123.

The following table shows the number of TB timeticks taken by
a constant-size memcmp() before and after the patch, depending on
the size:

Before  After   Improvement
01: 7577    5682    25%
02: 41668   5682    86%
03: 51137   13258   74%
04: 45455   5682    87%
05: 58713   13258   77%
06: 58712   13258   77%
07: 68183   20834   70%
08: 56819   15153   73%
09: 70077   28411   60%
10: 70077   28411   60%
11: 79546   35986   55%
12: 68182   28411   58%
13: 81440   35986   55%
14: 81440   39774   51%
15: 94697   43562   54%
16: 79546   37881   52%
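
As an illustration of how the expansion works (using the ___memcmp()
helper added below): a call with a constant size of 6 reduces at compile
time to

	___memcmp(p, q, 6, 0);	/* 32-bit compare, then 16-bit at offset 4 */

that is, a 4-byte big-endian word compare followed, only if the words
are equal, by a 2-byte compare at offset 4, with no out-of-line call to
memcmp() at all.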

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/string.h | 37 +
 1 file changed, 37 insertions(+)

diff --git a/arch/powerpc/include/asm/string.h 
b/arch/powerpc/include/asm/string.h
index cf6f495134c3..196ac5d587fb 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -4,6 +4,8 @@
 
 #ifdef __KERNEL__
 
+#include 
+
 #define __HAVE_ARCH_STRNCPY
 #define __HAVE_ARCH_STRNCMP
 #define __HAVE_ARCH_MEMSET
@@ -28,10 +30,45 @@ extern void * memchr(const void *,int,__kernel_size_t);
 extern void * memcpy_flushcache(void *,const void *,__kernel_size_t);
 
 #ifndef CONFIG_FORTIFY_SOURCE
+static inline int ___memcmp(const void *p,const void *q,__kernel_size_t size, 
int offset)
+{
+   int dif;
+
+   BUILD_BUG_ON(!size || size > 8);
+
+   p += offset, q += offset;
+   if (size == 1)
+   return *(u8*)p - *(u8*)q;
+   if (size == 2)
+   return be16_to_cpu(*(u16*)p) - be16_to_cpu(*(u16*)q);
+   if (size == 3) {
+   dif = be16_to_cpu(*(u16*)p) - be16_to_cpu(*(u16*)q);
+   if (dif)
+   return dif;
+   return *(u8*)(p + 2) - *(u8*)(q + 2);
+   }
+   if (size == 8) {
+   s64 tmp = be64_to_cpu(*(u64*)p) - be64_to_cpu(*(u64*)q);
+   return tmp >> 32 ? : (int)tmp;
+   }
+
+   dif = be32_to_cpu(*(u32*)p) - be32_to_cpu(*(u32*)q);
+   if (size == 4 || dif)
+   return dif;
+
+   return ___memcmp(p, q, size - 4, 4);
+}
+
 static inline int __memcmp(const void *p,const void *q,__kernel_size_t size)
 {
if (unlikely(!size))
return 0;
+   if (__builtin_constant_p(size) && size <= 16) {
+   int dif = ___memcmp(p, q, size < 8 ? size : 8, 0);
+   if (size <= 8 || dif)
+   return dif;
+   return ___memcmp(p, q, size - 8, 8);
+   }
return memcmp(p, q, size);
 }
 #define memcmp __memcmp
-- 
2.13.3



[PATCH 3/7] powerpc/lib: optimise PPC32 memcmp

2018-04-17 Thread Christophe Leroy
At the time being, memcmp() compares two chunks of memory
byte by byte.

This patch optimises the comparison by comparing word by word.

A small benchmark performed on an 8xx, comparing two chunks of
512 bytes 10 times, gives:

Before : 5852274 TB ticks
After:   1488638 TB ticks

This is almost 4 times faster.
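
For reference, a rough C equivalent of the new word-wise loop
(illustration only, not part of the patch): the words are compared in
big-endian byte order, as the assembly does with lwzx/lwbrx, so the sign
of the result matches a byte-by-byte comparison.

	#include <stddef.h>

	static int memcmp_wordwise(const void *p, const void *q, size_t n)
	{
		const unsigned char *a = p, *b = q;

		/* compare four bytes at a time, as big-endian words */
		for (; n >= 4; n -= 4, a += 4, b += 4) {
			unsigned int x = ((unsigned int)a[0] << 24) |
					 (a[1] << 16) | (a[2] << 8) | a[3];
			unsigned int y = ((unsigned int)b[0] << 24) |
					 (b[1] << 16) | (b[2] << 8) | b[3];

			if (x != y)
				return x < y ? -1 : 1;
		}
		/* at most 3 tail bytes left */
		for (; n; n--, a++, b++)
			if (*a != *b)
				return *a < *b ? -1 : 1;
		return 0;
	}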

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/lib/string_32.S | 42 +++---
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 94e9c9bc31c3..5b2a73fb07be 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -19,13 +19,41 @@ _GLOBAL(memcmp)
PPC_LCMPI 0,r5,0
beq-2f
 #endif
-   mtctr   r5
-   addir6,r3,-1
-   addir4,r4,-1
-1: lbzur3,1(r6)
-   lbzur0,1(r4)
-   subf.   r3,r0,r3
-   bdnzt   2,1b
+   srawi.  r7, r5, 2   /* Divide len by 4 */
+   mr  r6, r3
+   beq-3f
+   mtctr   r7
+   li  r7, 0
+1:
+#ifdef __LITTLE_ENDIAN__
+   lwbrx   r3, r6, r7
+   lwbrx   r0, r4, r7
+#else
+   lwzxr3, r6, r7
+   lwzxr0, r4, r7
+#endif
+   addir7, r7, 4
+   subf.   r3, r0, r3
+   bdnzt   eq, 1b
+   bnelr
+   andi.   r5, r5, 3
+   beqlr
+3: cmplwi  cr1, r5, 2
+   blt-cr1, 4f
+#ifdef __LITTLE_ENDIAN__
+   lhbrx   r3, r6, r7
+   lhbrx   r0, r4, r7
+#else
+   lhzxr3, r6, r7
+   lhzxr0, r4, r7
+#endif
+   addir7, r7, 2
+   subf.   r3, r0, r3
+   beqlr   cr1
+   bnelr
+4: lbzxr3, r6, r7
+   lbzxr0, r4, r7
+   subf.   r3, r0, r3
blr
 #ifdef CONFIG_FORTIFY_SOURCE
 2: li  r3,0
-- 
2.13.3



[PATCH 2/7] powerpc/lib: inline memcmp() NUL size verification

2018-04-17 Thread Christophe Leroy
Many calls to memcmp() are done with a constant size.
This patch gives GCC a chance to optimise out
the zero-size verification.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/string.h | 10 ++
 arch/powerpc/lib/memcmp_64.S  |  4 
 arch/powerpc/lib/string_32.S  |  4 
 3 files changed, 18 insertions(+)

diff --git a/arch/powerpc/include/asm/string.h 
b/arch/powerpc/include/asm/string.h
index 9b8cedf618f4..cf6f495134c3 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -27,6 +27,16 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
 extern void * memchr(const void *,int,__kernel_size_t);
 extern void * memcpy_flushcache(void *,const void *,__kernel_size_t);
 
+#ifndef CONFIG_FORTIFY_SOURCE
+static inline int __memcmp(const void *p,const void *q,__kernel_size_t size)
+{
+   if (unlikely(!size))
+   return 0;
+   return memcmp(p, q, size);
+}
+#define memcmp __memcmp
+#endif
+
 #ifdef CONFIG_PPC64
 #define __HAVE_ARCH_MEMSET32
 #define __HAVE_ARCH_MEMSET64
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b7bd55..f6822fabf254 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -30,7 +30,9 @@
 #endif
 
 _GLOBAL(memcmp)
+#ifdef CONFIG_FORTIFY_SOURCE
cmpdi   cr1,r5,0
+#endif
 
/* Use the short loop if both strings are not 8B aligned */
or  r6,r3,r4
@@ -39,7 +41,9 @@ _GLOBAL(memcmp)
/* Use the short loop if length is less than 32B */
cmpdi   cr6,r5,31
 
+#ifdef CONFIG_FORTIFY_SOURCE
beq cr1,.Lzero
+#endif
bne .Lshort
bgt cr6,.Llong
 
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 2519f8bd09e3..94e9c9bc31c3 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -15,8 +15,10 @@
.text
 
 _GLOBAL(memcmp)
+#ifdef CONFIG_FORTIFY_SOURCE
PPC_LCMPI 0,r5,0
beq-2f
+#endif
mtctr   r5
addir6,r3,-1
addir4,r4,-1
@@ -25,8 +27,10 @@ _GLOBAL(memcmp)
subf.   r3,r0,r3
bdnzt   2,1b
blr
+#ifdef CONFIG_FORTIFY_SOURCE
 2: li  r3,0
blr
+#endif
 EXPORT_SYMBOL(memcmp)
 
 _GLOBAL(__clear_user)
-- 
2.13.3



[PATCH 1/7] powerpc/lib: move PPC32 specific functions out of string.S

2018-04-17 Thread Christophe Leroy
In preparation of optimisation patches, move PPC32 specific
memcmp() and __clear_user() into string_32.S

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/lib/Makefile|  5 +--
 arch/powerpc/lib/string.S| 61 -
 arch/powerpc/lib/string_32.S | 72 
 3 files changed, 75 insertions(+), 63 deletions(-)
 create mode 100644 arch/powerpc/lib/string_32.S

diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 653901042ad7..2c9b8c0adf22 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -26,13 +26,14 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o 
copypage_power7.o \
   memcpy_power7.o
 
 obj64-y+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
-  string_64.o memcpy_64.o memcmp_64.o pmem.o
+  memcpy_64.o memcmp_64.o pmem.o
 
 obj64-$(CONFIG_SMP)+= locks.o
 obj64-$(CONFIG_ALTIVEC)+= vmx-helper.o
 obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o
 
-obj-y  += checksum_$(BITS).o checksum_wrappers.o
+obj-y  += checksum_$(BITS).o checksum_wrappers.o \
+  string_$(BITS).o
 
 obj-y  += sstep.o ldstfp.o quad.o
 obj64-y+= quad.o
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index a787776822d8..cbb90fdc672d 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -55,23 +55,6 @@ _GLOBAL(strncmp)
blr
 EXPORT_SYMBOL(strncmp)
 
-#ifdef CONFIG_PPC32
-_GLOBAL(memcmp)
-   PPC_LCMPI 0,r5,0
-   beq-2f
-   mtctr   r5
-   addir6,r3,-1
-   addir4,r4,-1
-1: lbzur3,1(r6)
-   lbzur0,1(r4)
-   subf.   r3,r0,r3
-   bdnzt   2,1b
-   blr
-2: li  r3,0
-   blr
-EXPORT_SYMBOL(memcmp)
-#endif
-
 _GLOBAL(memchr)
PPC_LCMPI 0,r5,0
beq-2f
@@ -85,47 +68,3 @@ _GLOBAL(memchr)
 2: li  r3,0
blr
 EXPORT_SYMBOL(memchr)
-
-#ifdef CONFIG_PPC32
-_GLOBAL(__clear_user)
-   addir6,r3,-4
-   li  r3,0
-   li  r5,0
-   cmplwi  0,r4,4
-   blt 7f
-   /* clear a single word */
-11:stwur5,4(r6)
-   beqlr
-   /* clear word sized chunks */
-   andi.   r0,r6,3
-   add r4,r0,r4
-   subfr6,r0,r6
-   srwir0,r4,2
-   andi.   r4,r4,3
-   mtctr   r0
-   bdz 7f
-1: stwur5,4(r6)
-   bdnz1b
-   /* clear byte sized chunks */
-7: cmpwi   0,r4,0
-   beqlr
-   mtctr   r4
-   addir6,r6,3
-8: stbur5,1(r6)
-   bdnz8b
-   blr
-90:mr  r3,r4
-   blr
-91:mfctr   r3
-   slwir3,r3,2
-   add r3,r3,r4
-   blr
-92:mfctr   r3
-   blr
-
-   EX_TABLE(11b, 90b)
-   EX_TABLE(1b, 91b)
-   EX_TABLE(8b, 92b)
-
-EXPORT_SYMBOL(__clear_user)
-#endif
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
new file mode 100644
index ..2519f8bd09e3
--- /dev/null
+++ b/arch/powerpc/lib/string_32.S
@@ -0,0 +1,72 @@
+/*
+ * String handling functions for PowerPC32
+ *
+ * Copyright (C) 2018 CS Systemes d'Information
+ *
+ * Author: Christophe Leroy 
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+#include 
+#include 
+#include 
+#include 
+
+   .text
+
+_GLOBAL(memcmp)
+   PPC_LCMPI 0,r5,0
+   beq-2f
+   mtctr   r5
+   addir6,r3,-1
+   addir4,r4,-1
+1: lbzur3,1(r6)
+   lbzur0,1(r4)
+   subf.   r3,r0,r3
+   bdnzt   2,1b
+   blr
+2: li  r3,0
+   blr
+EXPORT_SYMBOL(memcmp)
+
+_GLOBAL(__clear_user)
+   addir6,r3,-4
+   li  r3,0
+   li  r5,0
+   cmplwi  0,r4,4
+   blt 7f
+   /* clear a single word */
+11:stwur5,4(r6)
+   beqlr
+   /* clear word sized chunks */
+   andi.   r0,r6,3
+   add r4,r0,r4
+   subfr6,r0,r6
+   srwir0,r4,2
+   andi.   r4,r4,3
+   mtctr   r0
+   bdz 7f
+1: stwur5,4(r6)
+   bdnz1b
+   /* clear byte sized chunks */
+7: cmpwi   0,r4,0
+   beqlr
+   mtctr   r4
+   addir6,r6,3
+8: stbur5,1(r6)
+   bdnz8b
+   blr
+90:mr  r3,r4
+   blr
+91:mfctr   r3
+   slwir3,r3,2
+   add r3,r3,r4
+   blr
+92:mfctr   r3
+   blr
+
+   EX_TABLE(11b, 90b)
+   EX_TABLE(1b, 91b)
+   EX_TABLE(8b, 92b)
+
+EXPORT_SYMBOL(__clear_user)
-- 
2.13.3