[PATCH v7 7/7] powerpc/pmem: Initialize pmem device on newer hardware

2020-07-01 Thread Aneesh Kumar K.V
With kernel now supporting new pmem flush/sync instructions, we can now
enable the kernel to initialize the device. On P10 these devices would
appear with a new compatible string. For PAPR device we have

compatible   "ibm,pmemory-v2"

and for OF pmem device we have

compatible   "pmem-region-v2"

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/platforms/pseries/papr_scm.c | 1 +
 drivers/nvdimm/of_pmem.c  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index 9c569078a09f..66c19c0fe566 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -876,6 +876,7 @@ static int papr_scm_remove(struct platform_device *pdev)
 
 static const struct of_device_id papr_scm_match[] = {
{ .compatible = "ibm,pmemory" },
+   { .compatible = "ibm,pmemory-v2" },
{ },
 };
 
diff --git a/drivers/nvdimm/of_pmem.c b/drivers/nvdimm/of_pmem.c
index 6826a274a1f1..10dbdcdfb9ce 100644
--- a/drivers/nvdimm/of_pmem.c
+++ b/drivers/nvdimm/of_pmem.c
@@ -90,6 +90,7 @@ static int of_pmem_region_remove(struct platform_device *pdev)
 
 static const struct of_device_id of_pmem_region_match[] = {
{ .compatible = "pmem-region" },
+   { .compatible = "pmem-region-v2" },
{ },
 };
 
-- 
2.26.2



[PATCH v7 4/7] libnvdimm/nvdimm/flush: Allow architecture to override the flush barrier

2020-07-01 Thread Aneesh Kumar K.V
Architectures like ppc64 provide persistent memory specific barriers
that will ensure that all stores for which the modifications are
written to persistent storage by preceding dcbfps and dcbstps
instructions have updated persistent storage before any data
access or data transfer caused by subsequent instructions is initiated.
This is in addition to the ordering done by wmb().

Update nvdimm core such that architecture can use barriers other than
wmb to ensure all previous writes are architecturally visible for
the platform buffer flush.

Reviewed-by: Dan Williams 
Signed-off-by: Aneesh Kumar K.V 
---
 Documentation/memory-barriers.txt | 14 ++
 drivers/md/dm-writecache.c|  2 +-
 drivers/nvdimm/region_devs.c  |  8 
 include/asm-generic/barrier.h | 10 ++
 4 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/Documentation/memory-barriers.txt 
b/Documentation/memory-barriers.txt
index eaabc3134294..ff07cd3b2f82 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1935,6 +1935,20 @@ There are some more advanced barrier functions:
  relaxed I/O accessors and the Documentation/DMA-API.txt file for more
  information on consistent memory.
 
+ (*) pmem_wmb();
+
+ This is for use with persistent memory to ensure that stores for which
+ modifications are written to persistent storage reached a platform
+ durability domain.
+
+ For example, after a non-temporal write to pmem region, we use pmem_wmb()
+ to ensure that stores have reached a platform durability domain. This ensures
+ that stores have updated persistent storage before any data access or
+ data transfer caused by subsequent instructions is initiated. This is
+ in addition to the ordering done by wmb().
+
+ For load from persistent memory, existing read memory barriers are sufficient
+ to ensure read ordering.
 
 ===
 IMPLICIT KERNEL MEMORY BARRIERS
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 74f3c506f084..00534fa4a384 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -536,7 +536,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc)
 static void writecache_commit_flushed(struct dm_writecache *wc, bool 
wait_for_ios)
 {
if (WC_MODE_PMEM(wc))
-   wmb();
+   pmem_wmb();
else
ssd_commit_flushed(wc, wait_for_ios);
 }
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 4502f9c4708d..c3237c2b03a6 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1206,13 +1206,13 @@ int generic_nvdimm_flush(struct nd_region *nd_region)
idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 8));
 
/*
-* The first wmb() is needed to 'sfence' all previous writes
-* such that they are architecturally visible for the platform
-* buffer flush.  Note that we've already arranged for pmem
+* The pmem_wmb() is needed to 'sfence' all
+* previous writes such that they are architecturally visible for
+* the platform buffer flush. Note that we've already arranged for pmem
 * writes to avoid the cache via memcpy_flushcache().  The final
 * wmb() ensures ordering for the NVDIMM flush write.
 */
-   wmb();
+   pmem_wmb();
for (i = 0; i < nd_region->ndr_mappings; i++)
if (ndrd_get_flush_wpq(ndrd, i, 0))
writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index 2eacaf7d62f6..b589bb216ee5 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -257,5 +257,15 @@ do {   
\
 })
 #endif
 
+/*
+ * pmem_wmb() ensures that all stores for which the modification
+ * are written to persistent storage by preceding instructions have
+ * updated persistent storage before any data  access or data transfer
+ * caused by subsequent instructions is initiated.
+ */
+#ifndef pmem_wmb
+#define pmem_wmb() wmb()
+#endif
+
 #endif /* !__ASSEMBLY__ */
 #endif /* __ASM_GENERIC_BARRIER_H */
-- 
2.26.2



[PATCH v7 3/7] powerpc/pmem: Add flush routines using new pmem store and sync instruction

2020-07-01 Thread Aneesh Kumar K.V
Start using dcbstps; phwsync; sequence for flushing persistent memory range.
The new instructions are implemented as a variant of dcbf and hwsync and on
P8 and P9 they will be executed as those instructions. We avoid using them on
older hardware. This helps to avoid difficult to debug bugs.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/cacheflush.h |  1 +
 arch/powerpc/lib/pmem.c   | 50 ---
 2 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/cacheflush.h 
b/arch/powerpc/include/asm/cacheflush.h
index de600b915a3c..54764c6e922d 100644
--- a/arch/powerpc/include/asm/cacheflush.h
+++ b/arch/powerpc/include/asm/cacheflush.h
@@ -6,6 +6,7 @@
 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_PPC_BOOK3S_64
 /*
diff --git a/arch/powerpc/lib/pmem.c b/arch/powerpc/lib/pmem.c
index 0666a8d29596..5a61aaeb6930 100644
--- a/arch/powerpc/lib/pmem.c
+++ b/arch/powerpc/lib/pmem.c
@@ -9,20 +9,62 @@
 
 #include 
 
+static inline void __clean_pmem_range(unsigned long start, unsigned long stop)
+{
+   unsigned long shift = l1_dcache_shift();
+   unsigned long bytes = l1_dcache_bytes();
+   void *addr = (void *)(start & ~(bytes - 1));
+   unsigned long size = stop - (unsigned long)addr + (bytes - 1);
+   unsigned long i;
+
+   for (i = 0; i < size >> shift; i++, addr += bytes)
+   asm volatile(PPC_DCBSTPS(%0, %1): :"i"(0), "r"(addr): "memory");
+
+
+   asm volatile(PPC_PHWSYNC ::: "memory");
+}
+
+static inline void __flush_pmem_range(unsigned long start, unsigned long stop)
+{
+   unsigned long shift = l1_dcache_shift();
+   unsigned long bytes = l1_dcache_bytes();
+   void *addr = (void *)(start & ~(bytes - 1));
+   unsigned long size = stop - (unsigned long)addr + (bytes - 1);
+   unsigned long i;
+
+   for (i = 0; i < size >> shift; i++, addr += bytes)
+   asm volatile(PPC_DCBFPS(%0, %1): :"i"(0), "r"(addr): "memory");
+
+
+   asm volatile(PPC_PHWSYNC ::: "memory");
+}
+
+static inline void clean_pmem_range(unsigned long start, unsigned long stop)
+{
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   return __clean_pmem_range(start, stop);
+}
+
+static inline void flush_pmem_range(unsigned long start, unsigned long stop)
+{
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   return __flush_pmem_range(start, stop);
+}
+
 /*
  * CONFIG_ARCH_HAS_PMEM_API symbols
  */
 void arch_wb_cache_pmem(void *addr, size_t size)
 {
unsigned long start = (unsigned long) addr;
-   flush_dcache_range(start, start + size);
+   clean_pmem_range(start, start + size);
 }
 EXPORT_SYMBOL_GPL(arch_wb_cache_pmem);
 
 void arch_invalidate_pmem(void *addr, size_t size)
 {
unsigned long start = (unsigned long) addr;
-   flush_dcache_range(start, start + size);
+   flush_pmem_range(start, start + size);
 }
 EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
 
@@ -35,7 +77,7 @@ long __copy_from_user_flushcache(void *dest, const void 
__user *src,
unsigned long copied, start = (unsigned long) dest;
 
copied = __copy_from_user(dest, src, size);
-   flush_dcache_range(start, start + size);
+   clean_pmem_range(start, start + size);
 
return copied;
 }
@@ -45,7 +87,7 @@ void *memcpy_flushcache(void *dest, const void *src, size_t 
size)
unsigned long start = (unsigned long) dest;
 
memcpy(dest, src, size);
-   flush_dcache_range(start, start + size);
+   clean_pmem_range(start, start + size);
 
return dest;
 }
-- 
2.26.2



[PATCH v7 2/7] powerpc/pmem: Add new instructions for persistent storage and sync

2020-07-01 Thread Aneesh Kumar K.V
POWER10 introduces two new variants of dcbf instructions (dcbstps and dcbfps)
that can be used to write modified locations back to persistent storage.

Additionally, POWER10 also introduces phwsync and plwsync, which can be used
to establish the order of these writes to persistent storage.

This patch exposes these instructions to the rest of the kernel. The existing
dcbf and hwsync instructions in P8 and P9 are adequate to enable appropriate
synchronization with OpenCAPI-hosted persistent storage. Hence the new
instructions are added as a variant of the old ones that old hardware
won't differentiate.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/ppc-opcode.h | 12 
 1 file changed, 12 insertions(+)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index 2a39c716c343..1ad014e4633e 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -219,6 +219,8 @@
 #define PPC_INST_STWCX 0x7c00012d
 #define PPC_INST_LWSYNC0x7c2004ac
 #define PPC_INST_SYNC  0x7c0004ac
+#define PPC_INST_PHWSYNC   0x7c8004ac
+#define PPC_INST_PLWSYNC   0x7ca004ac
 #define PPC_INST_SYNC_MASK 0xfc0007fe
 #define PPC_INST_ISYNC 0x4c00012c
 #define PPC_INST_LXVD2X0x7c000698
@@ -284,6 +286,8 @@
 #define PPC_INST_TABORT0x7c00071d
 #define PPC_INST_TSR   0x7c0005dd
 
+#define PPC_INST_DCBF  0x7c0000ac
+
 #define PPC_INST_NAP   0x4c000364
 #define PPC_INST_SLEEP 0x4c0003a4
 #define PPC_INST_WINKLE0x4c0003e4
@@ -532,6 +536,14 @@
 #define STBCIX(s,a,b)  stringify_in_c(.long PPC_INST_STBCIX | \
   __PPC_RS(s) | __PPC_RA(a) | __PPC_RB(b))
 
+#definePPC_DCBFPS(a, b)stringify_in_c(.long PPC_INST_DCBF |
\
+  ___PPC_RA(a) | ___PPC_RB(b) | (4 << 21))
+#definePPC_DCBSTPS(a, b)   stringify_in_c(.long PPC_INST_DCBF |
\
+  ___PPC_RA(a) | ___PPC_RB(b) | (6 << 21))
+
+#definePPC_PHWSYNC stringify_in_c(.long PPC_INST_PHWSYNC)
+#definePPC_PLWSYNC stringify_in_c(.long PPC_INST_PLWSYNC)
+
 /*
  * Define what the VSX XX1 form instructions will look like, then add
  * the 128 bit load store instructions based on that.
-- 
2.26.2



[PATCH v7 1/7] powerpc/pmem: Restrict papr_scm to P8 and above.

2020-07-01 Thread Aneesh Kumar K.V
The PAPR based virtualized persistent memory devices are only supported on
POWER9 and above. In the followup patch, the kernel will switch the persistent
memory cache flush functions to use a new `dcbf` variant instruction. The new
instructions, even though added in ISA 3.1, work even on P8 and P9 because they
are implemented as variants of the existing `dcbf` and `hwsync` and behave as
such on P8 and P9.

Considering these devices are only supported on P8 and above, update the driver
to prevent a P7-compat guest from using persistent memory devices.

We don't update of_pmem driver with the same condition, because, on bare-metal,
the firmware enables pmem support only on P9 and above. There the kernel depends
on OPAL firmware to restrict exposing persistent memory related device tree
entries on older hardware. of_pmem.ko is written without any arch dependency and
we don't want to add ppc64 specific cpu feature check in of_pmem driver.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/platforms/pseries/pmem.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/pmem.c 
b/arch/powerpc/platforms/pseries/pmem.c
index f860a897a9e0..2347e1038f58 100644
--- a/arch/powerpc/platforms/pseries/pmem.c
+++ b/arch/powerpc/platforms/pseries/pmem.c
@@ -147,6 +147,12 @@ const struct of_device_id drc_pmem_match[] = {
 
 static int pseries_pmem_init(void)
 {
+   /*
+* Only supported on POWER8 and above.
+*/
+   if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+   return 0;
+
pmem_node = of_find_node_by_type(NULL, "ibm,persistent-memory");
if (!pmem_node)
return 0;
-- 
2.26.2



[PATCH v7 0/7] Support new pmem flush and sync instructions for POWER

2020-07-01 Thread Aneesh Kumar K.V
This patch series enables the usage of new pmem flush and sync instructions on
the POWER architecture. POWER10 introduces two new variants of dcbf instructions
(dcbstps and dcbfps) that can be used to write modified locations back to
persistent storage. Additionally, POWER10 also introduces phwsync and plwsync,
which can be used to establish the order of these writes to persistent storage.

This series exposes these instructions to the rest of the kernel. The existing
dcbf and hwsync instructions in P8 and P9 are adequate to enable appropriate
synchronization with OpenCAPI-hosted persistent storage. Hence the new 
instructions
are added as a variant of the old ones that old hardware won't differentiate.

On POWER10, pmem devices will be represented by different device tree compat
strings. This ensures that older kernels won't initialize pmem devices on
POWER10.

With this:
1) vPMEM continues to work since it is a volatile region. That 
doesn't need any flush instructions.

2) pmdk and other user applications get updated to use new instructions
and updated packages are made available to all distributions

3) On newer hardware, the device will appear with a new compat string. 
Hence older distributions won't initialize pmem on newer hardware.

Changes from v6:
* rename flush barrier to pmem_wmb(). Update documentation. 
* Drop the WARN_ON in flush routines.
* Drop papr_scm ndr_region flush callback.

Changes from v5:
* Drop CONFIG_ARCH_MAP_SYNC_DISABLE and related changes

Changes from V4:
* Add namespace specific synchronous fault control.

Changes from V3:
* Add new compat string to be used for the device.
* Use arch_pmem_flush_barrier() in dm-writecache.
Aneesh Kumar K.V (7):
  powerpc/pmem: Restrict papr_scm to P8 and above.
  powerpc/pmem: Add new instructions for persistent storage and sync
  powerpc/pmem: Add flush routines using new pmem store and sync
instruction
  libnvdimm/nvdimm/flush: Allow architecture to override the flush
barrier
  powerpc/pmem: Update ppc64 to use the new barrier instruction.
  powerpc/pmem: Avoid the barrier in flush routines
  powerpc/pmem: Initialize pmem device on newer hardware

 Documentation/memory-barriers.txt | 14 
 arch/powerpc/include/asm/barrier.h| 13 +++
 arch/powerpc/include/asm/cacheflush.h |  1 +
 arch/powerpc/include/asm/ppc-opcode.h | 12 +++
 arch/powerpc/lib/pmem.c   | 44 ---
 arch/powerpc/platforms/pseries/papr_scm.c |  1 +
 arch/powerpc/platforms/pseries/pmem.c |  6 
 drivers/md/dm-writecache.c|  2 +-
 drivers/nvdimm/of_pmem.c  |  1 +
 drivers/nvdimm/region_devs.c  |  8 ++---
 include/asm-generic/barrier.h | 10 ++
 11 files changed, 103 insertions(+), 9 deletions(-)

-- 
2.26.2



Re: [PATCH v2] powerpc/uaccess: Use flexible addressing with __put_user()/__get_user()

2020-07-01 Thread Christophe Leroy




Le 30/06/2020 à 23:18, Segher Boessenkool a écrit :

Hi again,

Thanks for your work so far!

On Tue, Jun 30, 2020 at 06:53:39PM +0000, Christophe Leroy wrote:

On 06/30/2020 04:33 PM, Segher Boessenkool wrote:

+ make -s CC=powerpc64-linux-gnu-gcc -j 160
In file included from /linux/include/linux/uaccess.h:11:0,
  from /linux/include/linux/sched/task.h:11,
  from /linux/include/linux/sched/signal.h:9,
  from /linux/include/linux/rcuwait.h:6,
  from /linux/include/linux/percpu-rwsem.h:7,
  from /linux/include/linux/fs.h:33,
  from /linux/include/linux/huge_mm.h:8,
  from /linux/include/linux/mm.h:675,
  from /linux/arch/powerpc/kernel/signal_32.c:17:
/linux/arch/powerpc/kernel/signal_32.c: In function
'save_user_regs.isra.14.constprop':
/linux/arch/powerpc/include/asm/uaccess.h:161:2: error: 'asm' operand has
impossible constraints
   __asm__ __volatile__( \
   ^
/linux/arch/powerpc/include/asm/uaccess.h:197:12: note: in expansion of
macro '__put_user_asm'
 case 4: __put_user_asm(x, ptr, retval, "stw"); break; \
 ^
/linux/arch/powerpc/include/asm/uaccess.h:206:2: note: in expansion of
macro '__put_user_size_allowed'
   __put_user_size_allowed(x, ptr, size, retval);  \
   ^
/linux/arch/powerpc/include/asm/uaccess.h:220:2: note: in expansion of
macro '__put_user_size'
   __put_user_size(__pu_val, __pu_addr, __pu_size, __pu_err); \
   ^
/linux/arch/powerpc/include/asm/uaccess.h:96:2: note: in expansion of
macro '__put_user_nocheck'
   __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
   ^
/linux/arch/powerpc/kernel/signal_32.c:120:7: note: in expansion of macro
'__put_user'
if (__put_user((unsigned int)gregs[i], &frame->mc_gregs[i]))
^


Can we see what that was after the macro jungle?  Like, the actual
preprocessed code?


Sorry for previous misunderstanding

Here is the code:

#define __put_user_asm(x, addr, err, op)\
__asm__ __volatile__(   \
"1:" op "%U2%X2 %1,%2# put_user\n"  \
"2:\n"\
".section .fixup,\"ax\"\n"  \
"3:li %0,%3\n"\
"  b 2b\n"\
".previous\n" \
EX_TABLE(1b, 3b)\
: "=r" (err)  \
: "r" (x), "m<>" (*addr), "i" (-EFAULT), "0" (err))


Yeah I don't see it.  I'll have to look at compiler debug dumps, but I
don't have any working 4.9 around, and I cannot reproduce this with
either older or newer compilers.


I reproduced it with 4.8.5



It is complaining that constrain_operands just does not work *at all* on
this "m<>" constraint apparently, which doesn't make much sense.



Here is a small reproducer:

#include 
#include 
#include 

struct mcontext {
elf_gregset_t32 mc_gregs;
elf_fpregset_t  mc_fregs;
unsigned intmc_pad[2];
elf_vrregset_t32mc_vregs __attribute__((__aligned__(16)));
elf_vsrreghalf_t32  mc_vsregs __attribute__((__aligned__(16)));
};

int save_general_regs(struct pt_regs *regs, struct mcontext __user *frame)
{
elf_greg_t64 *gregs = (elf_greg_t64 *)regs;
int i;

for (i = 0; i <= PT_RESULT; i ++) {
if (i == 14)
i = 32;
if (__put_user((unsigned int)gregs[i], &frame->mc_gregs[i]))
return -EFAULT;
}
return 0;
}


If you remove the "if i == 14 ..." you get no failure.

Preprocessor result:

int save_general_regs(struct pt_regs *regs, struct mcontext *frame)
{
 elf_greg_t64 *gregs = (elf_greg_t64 *)regs;
 int i;

 for (i = 0; i <= 43; i ++) {
  if (i == 14)
   i = 32;
  if (({ long __pu_err; __typeof__(*((&frame->mc_gregs[i]))) *__pu_addr 
= ((&frame->mc_gregs[i])); __typeof__(*((&frame->mc_gregs[i]))) __pu_val 
= ((__typeof__(*(&frame->mc_gregs[i])))((unsigned int)gregs[i])); 
__typeof__(sizeof(*(&frame->mc_gregs[i]))) __pu_size = 
(sizeof(*(&frame->mc_gregs[i]))); if (!(((unsigned long)__pu_addr) >= 
0x8000ul)) might_fault(); (void)0; do { 
allow_write_to_user(__pu_addr, __pu_size); do { __pu_err = 0; switch 
(__pu_size) { case 1: __asm__ __volatile__( "1:	" "stb" "%U2%X2 %1,%2	# 
put_user\n" "2:\n" ".section .fixup,\"ax\"\n" "3:	li %0,%3\n" "	b 2b\n" 
".previous\n" ".section __ex_table,\"a\";" " " ".balign 4;" " " ".long 
(1b) - . ;" " " ".long (3b) - . ;" " " ".previous" " " : "=r" (__pu_err) 
: "r" (__pu_val), "m<>" (*__pu_addr), "i" (-14), "0" (__pu_err)); break; 
case 2: __asm__ __volatile__( "1:	" "sth" "%U2%X2 %1,%2	# put_user\n" 
"2:\n" ".sect

<    1   2