[PATCH v8 5/5] powerpc:selftest update memcmp_64 selftest for VMX implementation

2018-06-06 Thread wei . guo . simon
From: Simon Guo 

This patch reworks the memcmp_64 selftest so that it covers more test
cases.

It adds testcases for:
- memcmp over 4K bytes in size.
- s1/s2 with different/random offsets relative to the 16-byte boundary.
- enter/exit_vmx_ops pairing (see the sketch after this list).
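
A minimal C sketch of the pairing check: the stubs mirror the hunk below,
while check_vmx_paired() is an illustrative helper only (the actual
selftest performs the equivalent check inline after each test_memcmp()
call):
--
#include <stdio.h>
#include <stdlib.h>

/* Counter bumped by the stubs the selftest links in place of the kernel
 * VMX helpers. */
int vmx_count;

int enter_vmx_ops(void)
{
        vmx_count++;
        return 1;
}

void exit_vmx_ops(void)
{
        vmx_count--;
}

/* Called after each test_memcmp() invocation: every ENTER_VMX_OPS in the
 * asm must have been matched by an EXIT_VMX_OPS, so the counter has to be
 * back at zero. */
static void check_vmx_paired(unsigned long offset, unsigned long size)
{
        if (vmx_count != 0) {
                printf("vmx enter/exit not paired (offset:%lu size:%lu count:%d)\n",
                       offset, size, vmx_count);
                abort();
        }
}
--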

Signed-off-by: Simon Guo 
---
 .../selftests/powerpc/copyloops/asm/ppc_asm.h  |  4 +-
 .../selftests/powerpc/stringloops/asm/ppc-opcode.h | 39 +
 .../selftests/powerpc/stringloops/asm/ppc_asm.h| 25 ++
 .../testing/selftests/powerpc/stringloops/memcmp.c | 98 +-
 4 files changed, 142 insertions(+), 24 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h

diff --git a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h 
b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
index 5ffe04d..dfce161 100644
--- a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
@@ -36,11 +36,11 @@
li  r3,0
blr
 
-FUNC_START(enter_vmx_copy)
+FUNC_START(enter_vmx_ops)
li  r3,1
blr
 
-FUNC_START(exit_vmx_copy)
+FUNC_START(exit_vmx_ops)
blr
 
 FUNC_START(memcpy_power7)
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h 
b/tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h
new file mode 100644
index 000..9de413c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2009 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * provides masks and opcode images for use by code generation, emulation
+ * and for instructions that older assemblers might not know about
+ */
+#ifndef _ASM_POWERPC_PPC_OPCODE_H
+#define _ASM_POWERPC_PPC_OPCODE_H
+
+
+#  define stringify_in_c(...)  __VA_ARGS__
+#  define ASM_CONST(x) x
+
+
+#define PPC_INST_VCMPEQUD_RC   0x100000c7
+#define PPC_INST_VCMPEQUB_RC   0x10000006
+
+#define __PPC_RC21 (0x1 << 10)
+
+/* macros to insert fields into opcodes */
+#define ___PPC_RA(a)   (((a) & 0x1f) << 16)
+#define ___PPC_RB(b)   (((b) & 0x1f) << 11)
+#define ___PPC_RS(s)   (((s) & 0x1f) << 21)
+#define ___PPC_RT(t)   ___PPC_RS(t)
+
+#define VCMPEQUD_RC(vrt, vra, vrb) stringify_in_c(.long PPC_INST_VCMPEQUD_RC | \
+ ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+ ___PPC_RB(vrb) | __PPC_RC21)
+
+#define VCMPEQUB_RC(vrt, vra, vrb) stringify_in_c(.long PPC_INST_VCMPEQUB_RC | \
+ ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+ ___PPC_RB(vrb) | __PPC_RC21)
+
+#endif /* _ASM_POWERPC_PPC_OPCODE_H */
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h 
b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
index 136242e..d2c0a91 100644
--- a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PPC_ASM_H
+#define _PPC_ASM_H
 #include 
 
 #ifndef r1
@@ -6,3 +8,26 @@
 #endif
 
 #define _GLOBAL(A) FUNC_START(test_ ## A)
+#define _GLOBAL_TOC(A) FUNC_START(test_ ## A)
+
+#define CONFIG_ALTIVEC
+
+#define R14 r14
+#define R15 r15
+#define R16 r16
+#define R17 r17
+#define R18 r18
+#define R19 r19
+#define R20 r20
+#define R21 r21
+#define R22 r22
+#define R29 r29
+#define R30 r30
+#define R31 r31
+
+#define STACKFRAMESIZE 256
+#define STK_REG(i) (112 + ((i)-14)*8)
+
+#define BEGIN_FTR_SECTION
+#define END_FTR_SECTION_IFSET(val)
+#endif
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp.c 
b/tools/testing/selftests/powerpc/stringloops/memcmp.c
index 8250db2..b5cf717 100644
--- a/tools/testing/selftests/powerpc/stringloops/memcmp.c
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp.c
@@ -2,20 +2,40 @@
 #include 
 #include 
 #include 
+#include 
 #include "utils.h"
 
 #define SIZE 256
 #define ITERATIONS 1
 
+#define LARGE_SIZE (5 * 1024)
+#define LARGE_ITERATIONS 1000
+#define LARGE_MAX_OFFSET 32
+#define LARGE_SIZE_START 4096
+
+#define MAX_OFFSET_DIFF_S1_S2 48
+
+int vmx_count;
+int enter_vmx_ops(void)
+{
+   vmx_count++;
+   return 1;
+}
+
+void exit_vmx_ops(void)
+{
+   vmx_count--;
+}
 int test_memcmp(const void *s1, const void *s2, size_t n);
 
 /* test all offsets and lengths */
-static void test_one(char *s1, char *s2)
+static void test_one(char *s1, char *s2, unsigned long max_offset,
+   unsigned long size_start, unsigned long max_size)
 {
unsigned long offset, size;
 
-   for (offset = 0; offset < SIZE; offset++) {
-   for (size = 0; size < (SIZE-offset); size++) {
+   for (offset = 

[PATCH v8 4/5] powerpc/64: add 32 bytes prechecking before using VMX optimization on memcmp()

2018-06-06 Thread wei . guo . simon
From: Simon Guo 

This patch is based on the previous VMX patch on memcmp().

To optimize ppc64 memcmp() with VMX instructions, we need to think about
the VMX penalty they bring: if the kernel uses VMX instructions, it needs
to save/restore the current thread's VMX registers. There are 32 x 128-bit
VMX registers in PPC, which means 32 x 16 = 512 bytes to load and store.

The major concern regarding memcmp() performance in the kernel is KSM,
which uses memcmp() frequently to merge identical pages. So it makes sense
to take some measures/enhancements for KSM to see whether any improvement
can be done here.  Cyril Bur indicates in the following mail that the
memcmp() for KSM has a higher probability of failing (mismatching) early,
within the first few bytes:
https://patchwork.ozlabs.org/patch/817322/#1773629
This patch is a follow-up on that.

Per some testing, KSM memcmp() tends to fail early within the first 32
bytes.  More specifically:
- 76% of cases fail/mismatch before 16 bytes;
- 83% of cases fail/mismatch before 32 bytes;
- 84% of cases fail/mismatch before 64 bytes;
So 32 bytes looks like a better pre-checking length than the alternatives.

The early failure also holds for memcmp() in the non-KSM case. With a
non-typical call load, ~73% of cases fail before the first 32 bytes.
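
(For reference only: a distribution like the one above can be collected
with a small instrumented comparator such as the hypothetical sketch
below. It is not part of this patch, the bucket layout is an assumption,
and the toy driver uses uniformly random mismatches just to exercise the
code; the numbers quoted above come from real KSM/kernel workloads.)
--
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* Histogram of where the first mismatching byte sits:
 * <16, <32, <64, <128, and >=128 bytes. */
static unsigned long first_diff_hist[5];

static int memcmp_instrumented(const unsigned char *a,
                               const unsigned char *b, size_t n)
{
        size_t i;

        for (i = 0; i < n; i++) {
                if (a[i] != b[i]) {
                        first_diff_hist[i < 16 ? 0 : i < 32 ? 1 :
                                        i < 64 ? 2 : i < 128 ? 3 : 4]++;
                        return a[i] < b[i] ? -1 : 1;
                }
        }
        return 0;
}

int main(void)
{
        static unsigned char x[4096], y[4096];
        int r, b;
        size_t i;

        srandom(1);
        for (r = 0; r < 10000; r++) {
                for (i = 0; i < sizeof(x); i++)
                        x[i] = y[i] = random();
                y[random() % sizeof(y)] ^= 1;   /* inject one mismatch */
                memcmp_instrumented(x, y, sizeof(x));
        }
        for (b = 0; b < 5; b++)
                printf("bucket %d: %lu\n", b, first_diff_hist[b]);
        return 0;
}
--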

This patch adds a 32-byte pre-check before jumping into VMX operations,
to avoid the unnecessary VMX penalty. It is not limited to the KSM case.
Testing shows a ~20% improvement in average memcmp() execution time with
this patch.

Note that the 32B pre-check is only performed when the compare size is
long enough (>= 4K currently) to allow VMX operation.

The detailed data and analysis are at:
https://github.com/justdoitqd/publicFiles/blob/master/memcmp/README.md
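
To summarize the resulting control flow in one place, here is a rough
userspace model (illustrative only; the real implementation is the
assembly below, and cmp_scalar()/cmp_vmx() are stand-ins, both plain
memcmp() so the sketch is runnable):
--
#include <stdio.h>
#include <string.h>

#define VMX_THRESH 4096

/* Placeholders for the real scalar and VMX compare paths. */
static int cmp_scalar(const void *a, const void *b, size_t n)
{
        return memcmp(a, b, n);
}

static int cmp_vmx(const void *a, const void *b, size_t n)
{
        return memcmp(a, b, n);
}

/* Model of the dispatch: for long compares, check the first 32 bytes with
 * cheap scalar loads before paying the 512-byte VMX register save/restore
 * cost. */
static int memcmp_model(const void *a, const void *b, size_t n)
{
        if (n >= VMX_THRESH) {
                int d = cmp_scalar(a, b, 32);

                if (d)
                        return d;
                return cmp_vmx((const char *)a + 32, (const char *)b + 32,
                               n - 32);
        }
        return cmp_scalar(a, b, n);
}

int main(void)
{
        static char x[8192], y[8192];

        y[10] = 1;      /* differs inside the first 32 bytes */
        printf("%d\n", memcmp_model(x, y, sizeof(x)) < 0);
        return 0;
}
--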

Signed-off-by: Simon Guo 
---
 arch/powerpc/lib/memcmp_64.S | 57 +++-
 1 file changed, 46 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index be2f792..844d8e7 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -404,8 +404,27 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 #ifdef CONFIG_ALTIVEC
 .Lsameoffset_vmx_cmp:
/* Enter with src/dst addrs has the same offset with 8 bytes
-* align boundary
+* align boundary.
+*
+* There is an optimization based on the following fact: memcmp()
+* tends to fail early within the first 32 bytes.
+* Before applying VMX instructions, which incur a 32 x 128-bit
+* VMX register load/restore penalty, we compare the first 32 bytes
+* so that we can catch the ~80% of cases that fail there.
 */
+
+   li  r0,4
+   mtctr   r0
+.Lsameoffset_prechk_32B_loop:
+   LD  rA,0,r3
+   LD  rB,0,r4
+   cmpld   cr0,rA,rB
+   addi r3,r3,8
+   addi r4,r4,8
+   bne cr0,.LcmpAB_lightweight
+   addi r5,r5,-8
+   bdnz .Lsameoffset_prechk_32B_loop
+
ENTER_VMX_OPS
beq cr1,.Llong_novmx_cmp
 
@@ -482,16 +501,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 #endif
 
 .Ldiffoffset_8bytes_make_align_start:
-#ifdef CONFIG_ALTIVEC
-BEGIN_FTR_SECTION
-   /* only do vmx ops when the size equal or greater than 4K bytes */
-   cmpdi   cr5,r5,VMX_THRESH
-   bge cr5,.Ldiffoffset_vmx_cmp
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-
-.Ldiffoffset_novmx_cmp:
-#endif
-
/* now try to align s1 with 8 bytes */
rlwinm  r6,r3,3,26,28
beq .Ldiffoffset_align_s1_8bytes
@@ -515,6 +524,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 .Ldiffoffset_align_s1_8bytes:
/* now s1 is aligned with 8 bytes. */
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+   /* only do vmx ops when the size equal or greater than 4K bytes */
+   cmpdi   cr5,r5,VMX_THRESH
+   bge cr5,.Ldiffoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Ldiffoffset_novmx_cmp:
+#endif
+
+
cmpdi   cr5,r5,31
ble cr5,.Lcmp_lt32bytes
 
@@ -526,6 +546,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 #ifdef CONFIG_ALTIVEC
 .Ldiffoffset_vmx_cmp:
+   /* perform a 32-byte pre-check before
+* enabling VMX operations.
+*/
+   li  r0,4
+   mtctr   r0
+.Ldiffoffset_prechk_32B_loop:
+   LD  rA,0,r3
+   LD  rB,0,r4
+   cmpld   cr0,rA,rB
+   addi r3,r3,8
+   addi r4,r4,8
+   bne cr0,.LcmpAB_lightweight
+   addi r5,r5,-8
+   bdnz .Ldiffoffset_prechk_32B_loop
+
ENTER_VMX_OPS
beq cr1,.Ldiffoffset_novmx_cmp
 
-- 
1.8.3.1



[PATCH v8 3/5] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparison

2018-06-06 Thread wei . guo . simon
From: Simon Guo 

This patch adds VMX primitives to do memcmp() when the compare size is
equal to or greater than 4K bytes. The KSM feature can benefit from this.

Test result with the following test program (replace the "^>" with ""):
--
># cat tools/testing/selftests/powerpc/stringloops/memcmp.c
>#include 
>#include 
>#include 
>#include 
>#include "utils.h"
>#define SIZE (1024 * 1024 * 900)
>#define ITERATIONS 40

int test_memcmp(const void *s1, const void *s2, size_t n);

static int testcase(void)
{
char *s1;
char *s2;
unsigned long i;

s1 = memalign(128, SIZE);
if (!s1) {
perror("memalign");
exit(1);
}

s2 = memalign(128, SIZE);
if (!s2) {
perror("memalign");
exit(1);
}

for (i = 0; i < SIZE; i++)  {
s1[i] = i & 0xff;
s2[i] = i & 0xff;
}
for (i = 0; i < ITERATIONS; i++) {
int ret = test_memcmp(s1, s2, SIZE);

if (ret) {
printf("return %d at[%ld]! should have returned 
zero\n", ret, i);
abort();
}
}

return 0;
}

int main(void)
{
return test_harness(testcase, "memcmp");
}
--
Without this patch (but with the first patch "powerpc/64: Align bytes
before fall back to .Lshort in powerpc64 memcmp()." in the series):
4.726728762 seconds time elapsed
  ( +-  3.54%)
With VMX patch:
4.234335473 seconds time elapsed
  ( +-  2.63%)
There is ~+10% improvement.

Testing with the unaligned and different-offset version (s1 and s2 shifted
by a random offset within 16 bytes) achieves an improvement higher than 10%.
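
For readers unfamiliar with the approach, a rough userspace model of the
vector comparison loop, using AltiVec intrinsics instead of the
hand-written assembly in this patch; it assumes 16-byte-aligned pointers
and a length that is a multiple of 16, and must be built with -maltivec:
--
#include <altivec.h>
#include <stddef.h>

/* Illustrative only: compare n bytes a vector at a time, and locate the
 * differing byte with a scalar scan on mismatch, mirroring the structure
 * of the .Lsameoffset_vmx_cmp loop. */
static int memcmp_vmx_model(const unsigned char *a, const unsigned char *b,
                            size_t n)
{
        while (n >= 16) {
                vector unsigned char va = vec_ld(0, a);
                vector unsigned char vb = vec_ld(0, b);

                if (!vec_all_eq(va, vb)) {
                        size_t i;

                        for (i = 0; i < 16; i++)
                                if (a[i] != b[i])
                                        return a[i] < b[i] ? -1 : 1;
                }
                a += 16;
                b += 16;
                n -= 16;
        }
        return 0;
}
--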

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/asm-prototypes.h |   4 +-
 arch/powerpc/lib/copypage_power7.S|   4 +-
 arch/powerpc/lib/memcmp_64.S  | 241 +-
 arch/powerpc/lib/memcpy_power7.S  |   6 +-
 arch/powerpc/lib/vmx-helper.c |   4 +-
 5 files changed, 248 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index d9713ad..31fdcee 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -49,8 +49,8 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
-void * exit_vmx_copy(void *dest);
+int enter_vmx_ops(void);
+void *exit_vmx_ops(void *dest);
 
 /* Traps */
 long machine_check_early(struct pt_regs *regs);
diff --git a/arch/powerpc/lib/copypage_power7.S 
b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7..e38f956 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
std r0,16(r1)
stdu r1,-STACKFRAMESIZE(r1)
-   bl  enter_vmx_copy
+   bl  enter_vmx_ops
cmpwi   r3,0
ld  r0,STACKFRAMESIZE+16(r1)
ld  r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
addi r3,r3,128
bdnz 1b
 
-   b   exit_vmx_copy   /* tail call optimise */
+   b   exit_vmx_ops/* tail call optimise */
 
 #else
li  r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index 5776f91..be2f792 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -9,6 +9,7 @@
  */
 #include 
 #include 
+#include 
 
 #define off8   r6
 #define off16  r7
@@ -27,12 +28,73 @@
 #define LH lhbrx
 #define LW lwbrx
 #define LD ldbrx
+#define LVS lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+   vperm _VRT,_VRB,_VRA,_VRC
 #else
 #define LH lhzx
 #define LW lwzx
 #define LD ldx
+#define LVS lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+   vperm _VRT,_VRA,_VRB,_VRC
 #endif
 
+#define VMX_THRESH 4096
+#define ENTER_VMX_OPS  \
+   mflr r0; \
+   std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+   std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+   std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+   std r0,16(r1); \
+   stdu r1,-STACKFRAMESIZE(r1); \
+   bl  enter_vmx_ops; \
+   cmpwi   cr1,r3,0; \
+   ld  r0,STACKFRAMESIZE+16(r1); \
+   ld  r3,STK_REG(R31)(r1); \
+   ld  r4,STK_REG(R30)(r1); \
+   ld  r5,STK_REG(R29)(r1); \
+   addi r1,r1,STACKFRAMESIZE; \
+   mtlr r0
+
+#define EXIT_VMX_OPS \
+   mflr r0; \
+   std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+   std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+   std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+  

[PATCH v8 2/5] powerpc: add vcmpequd/vcmpequb ppc instruction macro

2018-06-06 Thread wei . guo . simon
From: Simon Guo 

Some old toolchains don't know about instructions like vcmpequd.

This patch adds .long macros for vcmpequd and vcmpequb, as a
preparation for optimizing ppc64 memcmp() with VMX instructions.
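
As a quick illustration of how such a macro assembles the instruction
word (constants copied from the hunk below; the small program is just a
sanity check, not part of the patch):
--
#include <stdio.h>

#define PPC_INST_VCMPEQUD  0x100000c7
#define __PPC_RC21         (0x1 << 10)
#define ___PPC_RA(a)       (((a) & 0x1f) << 16)
#define ___PPC_RB(b)       (((b) & 0x1f) << 11)
#define ___PPC_RS(s)       (((s) & 0x1f) << 21)
#define ___PPC_RT(t)       ___PPC_RS(t)

int main(void)
{
        /* VCMPEQUD_RC(v0, v1, v2) emits this word as a ".long" */
        unsigned int word = PPC_INST_VCMPEQUD | ___PPC_RT(0) |
                            ___PPC_RA(1) | ___PPC_RB(2) | __PPC_RC21;

        printf("vcmpequd. v0,v1,v2 -> 0x%08x\n", word);  /* 0x100114c7 */
        return 0;
}
--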

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/ppc-opcode.h | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index 18883b8..1866a97 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -366,6 +366,8 @@
 #define PPC_INST_STFDX 0x7c0005ae
 #define PPC_INST_LVX   0x7c0000ce
 #define PPC_INST_STVX  0x7c0001ce
+#define PPC_INST_VCMPEQUD  0x100000c7
+#define PPC_INST_VCMPEQUB  0x10000006
 
 /* macros to insert fields into opcodes */
 #define ___PPC_RA(a)   (((a) & 0x1f) << 16)
@@ -396,6 +398,7 @@
 #define __PPC_BI(s)(((s) & 0x1f) << 16)
 #define __PPC_CT(t)(((t) & 0x0f) << 21)
 #define __PPC_SPR(r)   ((((r) & 0x1f) << 16) | ((((r) >> 5) & 0x1f) << 11))
+#define __PPC_RC21 (0x1 << 10)
 
 /*
  * Only use the larx hint bit on 64bit CPUs. e500v1/v2 based CPUs will treat a
@@ -567,4 +570,12 @@
   ((IH & 0x7) << 21))
 #define PPC_INVALIDATE_ERAT PPC_SLBIA(7)
 
+#define VCMPEQUD_RC(vrt, vra, vrb) stringify_in_c(.long PPC_INST_VCMPEQUD | \
+ ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+ ___PPC_RB(vrb) | __PPC_RC21)
+
+#define VCMPEQUB_RC(vrt, vra, vrb) stringify_in_c(.long PPC_INST_VCMPEQUB | \
+ ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+ ___PPC_RB(vrb) | __PPC_RC21)
+
 #endif /* _ASM_POWERPC_PPC_OPCODE_H */
-- 
1.8.3.1



[PATCH v8 1/5] powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp()

2018-06-06 Thread wei . guo . simon
From: Simon Guo 

Currently the powerpc64 version of memcmp() falls back to .Lshort
(compare-per-byte mode) if either the src or dst address is not 8-byte
aligned. It can be optimized in 2 situations:

1) If both addresses have the same offset from the 8-byte boundary:
memcmp() can compare the unaligned bytes within the 8-byte boundary first,
and then compare the rest of the 8-byte-aligned content in .Llong mode.

2) If src/dst addrs do not have the same offset from the 8-byte boundary:
memcmp() can align the src addr to 8 bytes, increment the dst addr
accordingly, then load src in aligned mode and dst in unaligned mode.

This patch optimizes memcmp() behavior in the above 2 situations; a C
sketch of case 1) follows below.
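
As an illustration of case 1) only, a minimal C sketch (not the assembly
implementation; endianness handling is simplified to a byte-wise fallback):
--
#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Compare buffers whose addresses share the same offset modulo 8: handle
 * the leading bytes up to the next 8-byte boundary one by one, then walk
 * a double word per iteration.  (The asm uses byte-reversed loads on LE
 * so it can compare double words directly; here we simply drop back to a
 * byte-wise scan when a double word differs.) */
static int memcmp_sameoffset_model(const unsigned char *a,
                                   const unsigned char *b, size_t n)
{
        while (n && ((uintptr_t)a & 7)) {
                if (*a != *b)
                        return *a < *b ? -1 : 1;
                a++; b++; n--;
        }

        while (n >= 8) {
                uint64_t x, y;

                memcpy(&x, a, 8);
                memcpy(&y, b, 8);
                if (x != y)
                        break;          /* locate the differing byte below */
                a += 8; b += 8; n -= 8;
        }

        while (n) {
                if (*a != *b)
                        return *a < *b ? -1 : 1;
                a++; b++; n--;
        }
        return 0;
}
--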

Tested with both little and big endian. The performance results below are
based on little endian.

Following are the test results for the case where src/dst have the same
offset (a similar result was observed when src/dst have different offsets):
(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
29.773018302 seconds time elapsed   
   ( +- 0.09% )
- with patch
16.485568173 seconds time elapsed   
   ( +-  0.02% )
-> There is an ~80% improvement

(2) 32 bytes
To observe performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
---
 #include 
 #include "utils.h"

-#define SIZE 256
+#define SIZE 32
 #define ITERATIONS 1

 int test_memcmp(const void *s1, const void *s2, size_t n);


- Without patch
0.244746482 seconds time elapsed
  ( +-  0.36%)
- with patch
0.215069477 seconds time elapsed
  ( +-  0.51%)
-> There is ~+13% improvement

(3) 0~8 bytes
To observe <8 bytes performance impact, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
---
 #include 
 #include "utils.h"

-#define SIZE 256
-#define ITERATIONS 1
+#define SIZE 8
+#define ITERATIONS 100

 int test_memcmp(const void *s1, const void *s2, size_t n);
---
- Without patch
   1.845642503 seconds time elapsed 
 ( +- 0.12% )
- With patch
   1.849767135 seconds time elapsed 
 ( +- 0.26% )
-> They are nearly the same. (-0.2%)

Signed-off-by: Simon Guo 
---
 arch/powerpc/lib/memcmp_64.S | 140 ---
 1 file changed, 133 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b..5776f91 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,28 +24,41 @@
 #define rH r31
 
 #ifdef __LITTLE_ENDIAN__
+#define LH lhbrx
+#define LW lwbrx
 #define LD ldbrx
 #else
+#define LH lhzx
+#define LW lwzx
 #define LD ldx
 #endif
 
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
+ * are named like .Lsameoffset_
+ * 2) src/dst has different offset to the 8 bytes boundary. The handlers
+ * are named like .Ldiffoffset_
+ */
 _GLOBAL(memcmp)
cmpdi   cr1,r5,0
 
-   /* Use the short loop if both strings are not 8B aligned */
-   or  r6,r3,r4
+   /* Use the short loop if the src/dst addresses do not have
+* the same offset from the 8-byte align boundary.
+*/
+   xor r6,r3,r4
andi.   r6,r6,7
 
-   /* Use the short loop if length is less than 32B */
-   cmpdi   cr6,r5,31
+   /* Fall back to the short loop if comparing less than
+* 8 bytes at aligned addrs.
+*/
+   cmpdi   cr6,r5,7
 
beq cr1,.Lzero
-   bne .Lshort
-   bgt cr6,.Llong
+   bgt cr6,.Lno_short
 
 .Lshort:
mtctr   r5
-
 1: lbz rA,0(r3)
lbz rB,0(r4)
subf.   rC,rB,rA
@@ -78,11 +91,89 @@ _GLOBAL(memcmp)
li  r3,0
blr
 
+.Lno_short:
+   dcbt 0,r3
+   dcbt 0,r4
+   bne .Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+   /* attempt to compare bytes not aligned with 8 bytes so that
+* rest comparison can run based on 8 bytes alignment.
+*/
+   andi.   r6,r3,7
+
+   /* Try to compare the first double word which is not 8 bytes aligned:
+* load the first double word at (src & ~7UL) and shift left appropriate
+* bits before comparison.
+*/
+   rlwinm  r6,r3,3,26,28
+   beq .Lsameoffset_8bytes_aligned
+   clrrdi  r3,r3,3
+   clrrdi  r4,r4,3
+   LD  rA,0,r3
+   LD  rB,0,r4
+   sld rA,rA,r6
+   sld rB,rB,r6
+   cmpld   cr0,rA,rB
+   srwi r6,r6,3
+   bne cr0,.LcmpAB_lightweight
+   subfic  r6,r6,8
+   subf.   

[PATCH v8 0/5] powerpc/64: memcmp() optimization

2018-06-06 Thread wei . guo . simon
From: Simon Guo 

There is some room to optimize the powerpc 64-bit version of memcmp() for
the following 2 cases:
(1) Even if src/dst addresses are not 8-byte aligned at the beginning,
memcmp() can align them and use the .Llong comparison mode instead of
falling back to the .Lshort comparison mode, which compares the buffer
byte by byte.
(2) VMX instructions can be used to speed up large-size comparisons;
currently the threshold is set to 4K bytes. Note that the VMX instructions
lead to a VMX register save/restore penalty. This patch set includes a
patch that adds a 32-byte pre-check to minimize the penalty.

It does something similar to glibc commit dec4a7105e ("powerpc: Improve
memcmp performance for POWER8"). Thanks to Cyril Bur for the information.
This patch set also updates the memcmp selftest so that it compiles and
incorporates a large-size comparison case.

v7 -> v8:
- define memcmp with _GLOBAL_TOC() instead of _GLOBAL() to fix TOC issue.
add _GLOBAL_TOC() definition into selftest so that it can be compiled.
- use mfocrf/mtocrf instead of mcrf to save/restore CR0

v6 -> v7:
- add vcmpequd/vcmpequb .long macros
- add CPU_FTR pair so that Power7 won't invoke Altivec instrs.
- rework some instructions for higher performance or more readable.

v5 -> v6:
- correct some comments/commit messages.
- rename VMX_OPS_THRES to VMX_THRESH

v4 -> v5:
- Expand 32 bytes prechk to src/dst different offset case, and remove
KSM specific label/comment.

v3 -> v4:
- Add 32 bytes pre-checking before using VMX instructions.

v2 -> v3:
- add optimization for src/dst with different offset against 8 bytes
boundary.
- renamed some label names.
- reworked some comments from Cyril Bur, such as filling the pipeline,
and using VMX when size == 4K.
- fix an enter/exit_vmx_ops pairing bug, and revised the test
case to check whether enter/exit_vmx_ops are paired.

v1 -> v2:
- update 8bytes unaligned bytes comparison method.
- fix a VMX comparison bug.
- enhanced the original memcmp() selftest.
- add powerpc/64 to subject/commit message.


Simon Guo (5):
  powerpc/64: Align bytes before fall back to .Lshort in powerpc64
memcmp()
  powerpc: add vcmpequd/vcmpequb ppc instruction macro
  powerpc/64: enhance memcmp() with VMX instruction for long bytes
comparison
  powerpc/64: add 32 bytes prechecking before using VMX optimization on
memcmp()
  powerpc:selftest update memcmp_64 selftest for VMX implementation

 arch/powerpc/include/asm/asm-prototypes.h  |   4 +-
 arch/powerpc/include/asm/ppc-opcode.h  |  11 +
 arch/powerpc/lib/copypage_power7.S |   4 +-
 arch/powerpc/lib/memcmp_64.S   | 414 -
 arch/powerpc/lib/memcpy_power7.S   |   6 +-
 arch/powerpc/lib/vmx-helper.c  |   4 +-
 .../selftests/powerpc/copyloops/asm/ppc_asm.h  |   4 +-
 .../selftests/powerpc/stringloops/asm/ppc-opcode.h |  39 ++
 .../selftests/powerpc/stringloops/asm/ppc_asm.h|  25 ++
 .../testing/selftests/powerpc/stringloops/memcmp.c |  98 +++--
 10 files changed, 568 insertions(+), 41 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h

-- 
1.8.3.1



[PATCH v7 5/5] powerpc:selftest update memcmp_64 selftest for VMX implementation

2018-05-30 Thread wei . guo . simon
From: Simon Guo 

This patch reworks the memcmp_64 selftest so that it covers more test
cases.

It adds testcases for:
- memcmp over 4K bytes in size.
- s1/s2 with different/random offsets relative to the 16-byte boundary.
- enter/exit_vmx_ops pairing.

Signed-off-by: Simon Guo 
---
 .../selftests/powerpc/copyloops/asm/ppc_asm.h  |  4 +-
 .../selftests/powerpc/stringloops/asm/ppc-opcode.h | 39 +
 .../selftests/powerpc/stringloops/asm/ppc_asm.h| 24 ++
 .../testing/selftests/powerpc/stringloops/memcmp.c | 98 +-
 4 files changed, 141 insertions(+), 24 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h

diff --git a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h 
b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
index 5ffe04d..dfce161 100644
--- a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
@@ -36,11 +36,11 @@
li  r3,0
blr
 
-FUNC_START(enter_vmx_copy)
+FUNC_START(enter_vmx_ops)
li  r3,1
blr
 
-FUNC_START(exit_vmx_copy)
+FUNC_START(exit_vmx_ops)
blr
 
 FUNC_START(memcpy_power7)
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h 
b/tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h
new file mode 100644
index 000..9de413c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2009 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * provides masks and opcode images for use by code generation, emulation
+ * and for instructions that older assemblers might not know about
+ */
+#ifndef _ASM_POWERPC_PPC_OPCODE_H
+#define _ASM_POWERPC_PPC_OPCODE_H
+
+
+#  define stringify_in_c(...)  __VA_ARGS__
+#  define ASM_CONST(x) x
+
+
+#define PPC_INST_VCMPEQUD_RC   0x100000c7
+#define PPC_INST_VCMPEQUB_RC   0x10000006
+
+#define __PPC_RC21 (0x1 << 10)
+
+/* macros to insert fields into opcodes */
+#define ___PPC_RA(a)   (((a) & 0x1f) << 16)
+#define ___PPC_RB(b)   (((b) & 0x1f) << 11)
+#define ___PPC_RS(s)   (((s) & 0x1f) << 21)
+#define ___PPC_RT(t)   ___PPC_RS(t)
+
+#define VCMPEQUD_RC(vrt, vra, vrb) stringify_in_c(.long PPC_INST_VCMPEQUD_RC | \
+ ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+ ___PPC_RB(vrb) | __PPC_RC21)
+
+#define VCMPEQUB_RC(vrt, vra, vrb) stringify_in_c(.long PPC_INST_VCMPEQUB_RC | \
+ ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+ ___PPC_RB(vrb) | __PPC_RC21)
+
+#endif /* _ASM_POWERPC_PPC_OPCODE_H */
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h 
b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
index 136242e..33912bb 100644
--- a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PPC_ASM_H
+#define _PPC_ASM_H
 #include 
 
 #ifndef r1
@@ -6,3 +8,25 @@
 #endif
 
 #define _GLOBAL(A) FUNC_START(test_ ## A)
+
+#define CONFIG_ALTIVEC
+
+#define R14 r14
+#define R15 r15
+#define R16 r16
+#define R17 r17
+#define R18 r18
+#define R19 r19
+#define R20 r20
+#define R21 r21
+#define R22 r22
+#define R29 r29
+#define R30 r30
+#define R31 r31
+
+#define STACKFRAMESIZE 256
+#define STK_REG(i) (112 + ((i)-14)*8)
+
+#define BEGIN_FTR_SECTION
+#define END_FTR_SECTION_IFSET(val)
+#endif
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp.c 
b/tools/testing/selftests/powerpc/stringloops/memcmp.c
index 8250db2..b5cf717 100644
--- a/tools/testing/selftests/powerpc/stringloops/memcmp.c
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp.c
@@ -2,20 +2,40 @@
 #include 
 #include 
 #include 
+#include 
 #include "utils.h"
 
 #define SIZE 256
 #define ITERATIONS 1
 
+#define LARGE_SIZE (5 * 1024)
+#define LARGE_ITERATIONS 1000
+#define LARGE_MAX_OFFSET 32
+#define LARGE_SIZE_START 4096
+
+#define MAX_OFFSET_DIFF_S1_S2 48
+
+int vmx_count;
+int enter_vmx_ops(void)
+{
+   vmx_count++;
+   return 1;
+}
+
+void exit_vmx_ops(void)
+{
+   vmx_count--;
+}
 int test_memcmp(const void *s1, const void *s2, size_t n);
 
 /* test all offsets and lengths */
-static void test_one(char *s1, char *s2)
+static void test_one(char *s1, char *s2, unsigned long max_offset,
+   unsigned long size_start, unsigned long max_size)
 {
unsigned long offset, size;
 
-   for (offset = 0; offset < SIZE; offset++) {
-   for (size = 0; size < (SIZE-offset); size++) {
+   for (offset = 0; offset < max_offset; offset++) {
+

[PATCH v7 4/5] powerpc/64: add 32 bytes prechecking before using VMX optimization on memcmp()

2018-05-30 Thread wei . guo . simon
From: Simon Guo 

This patch is based on the previous VMX patch on memcmp().

To optimize ppc64 memcmp() with VMX instructions, we need to think about
the VMX penalty they bring: if the kernel uses VMX instructions, it needs
to save/restore the current thread's VMX registers. There are 32 x 128-bit
VMX registers in PPC, which means 32 x 16 = 512 bytes to load and store.

The major concern regarding memcmp() performance in the kernel is KSM,
which uses memcmp() frequently to merge identical pages. So it makes sense
to take some measures/enhancements for KSM to see whether any improvement
can be done here.  Cyril Bur indicates in the following mail that the
memcmp() for KSM has a higher probability of failing (mismatching) early,
within the first few bytes:
https://patchwork.ozlabs.org/patch/817322/#1773629
This patch is a follow-up on that.

Per some testing, KSM memcmp() tends to fail early within the first 32
bytes.  More specifically:
- 76% of cases fail/mismatch before 16 bytes;
- 83% of cases fail/mismatch before 32 bytes;
- 84% of cases fail/mismatch before 64 bytes;
So 32 bytes looks like a better pre-checking length than the alternatives.

The early failure also holds for memcmp() in the non-KSM case. With a
non-typical call load, ~73% of cases fail before the first 32 bytes.

This patch adds a 32-byte pre-check before jumping into VMX operations,
to avoid the unnecessary VMX penalty. It is not limited to the KSM case.
Testing shows a ~20% improvement in average memcmp() execution time with
this patch.

Note that the 32B pre-check is only performed when the compare size is
long enough (>= 4K currently) to allow VMX operation.

The detailed data and analysis are at:
https://github.com/justdoitqd/publicFiles/blob/master/memcmp/README.md

Signed-off-by: Simon Guo 
---
 arch/powerpc/lib/memcmp_64.S | 57 +++-
 1 file changed, 46 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index aef0e41..5eba497 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -404,8 +404,27 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 #ifdef CONFIG_ALTIVEC
 .Lsameoffset_vmx_cmp:
/* Enter with src/dst addrs has the same offset with 8 bytes
-* align boundary
+* align boundary.
+*
+* There is an optimization based on following fact: memcmp()
+* prones to fail early at the first 32 bytes.
+* Before applying VMX instructions which will lead to 32x128bits
+* VMX regs load/restore penalty, we compare the first 32 bytes
+* so that we can catch the ~80% fail cases.
 */
+
+   li  r0,4
+   mtctr   r0
+.Lsameoffset_prechk_32B_loop:
+   LD  rA,0,r3
+   LD  rB,0,r4
+   cmpld   cr0,rA,rB
+   addir3,r3,8
+   addir4,r4,8
+   bne cr0,.LcmpAB_lightweight
+   addir5,r5,-8
+   bdnz.Lsameoffset_prechk_32B_loop
+
ENTER_VMX_OPS
beq cr1,.Llong_novmx_cmp
 
@@ -482,16 +501,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 #endif
 
 .Ldiffoffset_8bytes_make_align_start:
-#ifdef CONFIG_ALTIVEC
-BEGIN_FTR_SECTION
-   /* only do vmx ops when the size equal or greater than 4K bytes */
-   cmpdi   cr5,r5,VMX_THRESH
-   bge cr5,.Ldiffoffset_vmx_cmp
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-
-.Ldiffoffset_novmx_cmp:
-#endif
-
/* now try to align s1 with 8 bytes */
rlwinm  r6,r3,3,26,28
beq .Ldiffoffset_align_s1_8bytes
@@ -515,6 +524,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 .Ldiffoffset_align_s1_8bytes:
/* now s1 is aligned with 8 bytes. */
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+   /* only do vmx ops when the size equal or greater than 4K bytes */
+   cmpdi   cr5,r5,VMX_THRESH
+   bge cr5,.Ldiffoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Ldiffoffset_novmx_cmp:
+#endif
+
+
cmpdi   cr5,r5,31
ble cr5,.Lcmp_lt32bytes
 
@@ -526,6 +546,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 #ifdef CONFIG_ALTIVEC
 .Ldiffoffset_vmx_cmp:
+   /* perform a 32 bytes pre-checking before
+* enable VMX operations.
+*/
+   li  r0,4
+   mtctr   r0
+.Ldiffoffset_prechk_32B_loop:
+   LD  rA,0,r3
+   LD  rB,0,r4
+   cmpld   cr0,rA,rB
+   addir3,r3,8
+   addir4,r4,8
+   bne cr0,.LcmpAB_lightweight
+   addir5,r5,-8
+   bdnz.Ldiffoffset_prechk_32B_loop
+
ENTER_VMX_OPS
beq cr1,.Ldiffoffset_novmx_cmp
 
-- 
1.8.3.1



[PATCH v7 3/5] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparison

2018-05-30 Thread wei . guo . simon
From: Simon Guo 

This patch adds VMX primitives to do memcmp() when the compare size is
equal to or greater than 4K bytes. The KSM feature can benefit from this.

Test result with the following test program (replace the "^>" with ""):
--
># cat tools/testing/selftests/powerpc/stringloops/memcmp.c
>#include 
>#include 
>#include 
>#include 
>#include "utils.h"
>#define SIZE (1024 * 1024 * 900)
>#define ITERATIONS 40

int test_memcmp(const void *s1, const void *s2, size_t n);

static int testcase(void)
{
char *s1;
char *s2;
unsigned long i;

s1 = memalign(128, SIZE);
if (!s1) {
perror("memalign");
exit(1);
}

s2 = memalign(128, SIZE);
if (!s2) {
perror("memalign");
exit(1);
}

for (i = 0; i < SIZE; i++)  {
s1[i] = i & 0xff;
s2[i] = i & 0xff;
}
for (i = 0; i < ITERATIONS; i++) {
int ret = test_memcmp(s1, s2, SIZE);

if (ret) {
printf("return %d at[%ld]! should have returned 
zero\n", ret, i);
abort();
}
}

return 0;
}

int main(void)
{
return test_harness(testcase, "memcmp");
}
--
Without this patch (but with the first patch "powerpc/64: Align bytes
before fall back to .Lshort in powerpc64 memcmp()." in the series):
4.726728762 seconds time elapsed
  ( +-  3.54%)
With VMX patch:
4.234335473 seconds time elapsed
  ( +-  2.63%)
There is ~+10% improvement.

Testing with the unaligned and different-offset version (s1 and s2 shifted
by a random offset within 16 bytes) achieves an improvement higher than 10%.

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/asm-prototypes.h |   4 +-
 arch/powerpc/lib/copypage_power7.S|   4 +-
 arch/powerpc/lib/memcmp_64.S  | 239 +-
 arch/powerpc/lib/memcpy_power7.S  |   6 +-
 arch/powerpc/lib/vmx-helper.c |   4 +-
 5 files changed, 247 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index d9713ad..31fdcee 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -49,8 +49,8 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
-void * exit_vmx_copy(void *dest);
+int enter_vmx_ops(void);
+void *exit_vmx_ops(void *dest);
 
 /* Traps */
 long machine_check_early(struct pt_regs *regs);
diff --git a/arch/powerpc/lib/copypage_power7.S 
b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7..e38f956 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
std r0,16(r1)
stdur1,-STACKFRAMESIZE(r1)
-   bl  enter_vmx_copy
+   bl  enter_vmx_ops
cmpwi   r3,0
ld  r0,STACKFRAMESIZE+16(r1)
ld  r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
addir3,r3,128
bdnz1b
 
-   b   exit_vmx_copy   /* tail call optimise */
+   b   exit_vmx_ops/* tail call optimise */
 
 #else
li  r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index 5776f91..aef0e41 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -9,6 +9,7 @@
  */
 #include 
 #include 
+#include 
 
 #define off8   r6
 #define off16  r7
@@ -27,12 +28,73 @@
 #define LH lhbrx
 #define LW lwbrx
 #define LD ldbrx
+#define LVS lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+   vperm _VRT,_VRB,_VRA,_VRC
 #else
 #define LH lhzx
 #define LW lwzx
 #define LD ldx
+#define LVS lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+   vperm _VRT,_VRA,_VRB,_VRC
 #endif
 
+#define VMX_THRESH 4096
+#define ENTER_VMX_OPS  \
+   mflrr0; \
+   std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+   std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+   std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+   std r0,16(r1); \
+   stdur1,-STACKFRAMESIZE(r1); \
+   bl  enter_vmx_ops; \
+   cmpwi   cr1,r3,0; \
+   ld  r0,STACKFRAMESIZE+16(r1); \
+   ld  r3,STK_REG(R31)(r1); \
+   ld  r4,STK_REG(R30)(r1); \
+   ld  r5,STK_REG(R29)(r1); \
+   addir1,r1,STACKFRAMESIZE; \
+   mtlrr0
+
+#define EXIT_VMX_OPS \
+   mflrr0; \
+   std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+   std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+   std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+  

[PATCH v7 2/5] powerpc: add vcmpequd/vcmpequb ppc instruction macro

2018-05-30 Thread wei . guo . simon
From: Simon Guo 

Some old toolchains don't know about instructions like vcmpequd.

This patch adds .long macros for vcmpequd and vcmpequb, as a
preparation for optimizing ppc64 memcmp() with VMX instructions.

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/ppc-opcode.h | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index 18883b8..1866a97 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -366,6 +366,8 @@
 #define PPC_INST_STFDX 0x7c0005ae
 #define PPC_INST_LVX   0x7c0000ce
 #define PPC_INST_STVX  0x7c0001ce
+#define PPC_INST_VCMPEQUD  0x100000c7
+#define PPC_INST_VCMPEQUB  0x10000006
 
 /* macros to insert fields into opcodes */
 #define ___PPC_RA(a)   (((a) & 0x1f) << 16)
@@ -396,6 +398,7 @@
 #define __PPC_BI(s)(((s) & 0x1f) << 16)
 #define __PPC_CT(t)(((t) & 0x0f) << 21)
 #define __PPC_SPR(r)   ((((r) & 0x1f) << 16) | ((((r) >> 5) & 0x1f) << 11))
+#define __PPC_RC21 (0x1 << 10)
 
 /*
  * Only use the larx hint bit on 64bit CPUs. e500v1/v2 based CPUs will treat a
@@ -567,4 +570,12 @@
   ((IH & 0x7) << 21))
 #define PPC_INVALIDATE_ERAT PPC_SLBIA(7)
 
+#define VCMPEQUD_RC(vrt, vra, vrb) stringify_in_c(.long PPC_INST_VCMPEQUD | \
+ ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+ ___PPC_RB(vrb) | __PPC_RC21)
+
+#define VCMPEQUB_RC(vrt, vra, vrb) stringify_in_c(.long PPC_INST_VCMPEQUB | \
+ ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+ ___PPC_RB(vrb) | __PPC_RC21)
+
 #endif /* _ASM_POWERPC_PPC_OPCODE_H */
-- 
1.8.3.1



[PATCH v7 1/5] powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp()

2018-05-30 Thread wei . guo . simon
From: Simon Guo 

Currently the powerpc64 version of memcmp() falls back to .Lshort
(compare-per-byte mode) if either the src or dst address is not 8-byte
aligned. It can be optimized in 2 situations:

1) If both addresses have the same offset from the 8-byte boundary:
memcmp() can compare the unaligned bytes within the 8-byte boundary first,
and then compare the rest of the 8-byte-aligned content in .Llong mode.

2) If src/dst addrs do not have the same offset from the 8-byte boundary:
memcmp() can align the src addr to 8 bytes, increment the dst addr
accordingly, then load src in aligned mode and dst in unaligned mode.

This patch optimizes memcmp() behavior in the above 2 situations.

Tested with both little and big endian. The performance results below are
based on little endian.

Following are the test results for the case where src/dst have the same
offset (a similar result was observed when src/dst have different offsets):
(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
29.773018302 seconds time elapsed   
   ( +- 0.09% )
- with patch
16.485568173 seconds time elapsed   
   ( +-  0.02% )
-> There is an ~80% improvement

(2) 32 bytes
To observe performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
---
 #include 
 #include "utils.h"

-#define SIZE 256
+#define SIZE 32
 #define ITERATIONS 1

 int test_memcmp(const void *s1, const void *s2, size_t n);


- Without patch
0.244746482 seconds time elapsed
  ( +-  0.36%)
- with patch
0.215069477 seconds time elapsed
  ( +-  0.51%)
-> There is ~+13% improvement

(3) 0~8 bytes
To observe <8 bytes performance impact, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
---
 #include 
 #include "utils.h"

-#define SIZE 256
-#define ITERATIONS 1
+#define SIZE 8
+#define ITERATIONS 100

 int test_memcmp(const void *s1, const void *s2, size_t n);
---
- Without patch
   1.845642503 seconds time elapsed 
 ( +- 0.12% )
- With patch
   1.849767135 seconds time elapsed 
 ( +- 0.26% )
-> They are nearly the same. (-0.2%)

Signed-off-by: Simon Guo 
---
 arch/powerpc/lib/memcmp_64.S | 140 ---
 1 file changed, 133 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b..5776f91 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,28 +24,41 @@
 #define rH r31
 
 #ifdef __LITTLE_ENDIAN__
+#define LH lhbrx
+#define LW lwbrx
 #define LD ldbrx
 #else
+#define LH lhzx
+#define LW lwzx
 #define LD ldx
 #endif
 
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
+ * are named like .Lsameoffset_
+ * 2) src/dst has different offset to the 8 bytes boundary. The handlers
+ * are named like .Ldiffoffset_
+ */
 _GLOBAL(memcmp)
cmpdi   cr1,r5,0
 
-   /* Use the short loop if both strings are not 8B aligned */
-   or  r6,r3,r4
+   /* Use the short loop if the src/dst addresses are not
+* with the same offset of 8 bytes align boundary.
+*/
+   xor r6,r3,r4
andi.   r6,r6,7
 
-   /* Use the short loop if length is less than 32B */
-   cmpdi   cr6,r5,31
+   /* Fall back to short loop if compare at aligned addrs
+* with less than 8 bytes.
+*/
+   cmpdi   cr6,r5,7
 
beq cr1,.Lzero
-   bne .Lshort
-   bgt cr6,.Llong
+   bgt cr6,.Lno_short
 
 .Lshort:
mtctr   r5
-
 1: lbz rA,0(r3)
lbz rB,0(r4)
subf.   rC,rB,rA
@@ -78,11 +91,89 @@ _GLOBAL(memcmp)
li  r3,0
blr
 
+.Lno_short:
+   dcbt0,r3
+   dcbt0,r4
+   bne .Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+   /* attempt to compare bytes not aligned with 8 bytes so that
+* rest comparison can run based on 8 bytes alignment.
+*/
+   andi.   r6,r3,7
+
+   /* Try to compare the first double word which is not 8 bytes aligned:
+* load the first double word at (src & ~7UL) and shift left appropriate
+* bits before comparison.
+*/
+   rlwinm  r6,r3,3,26,28
+   beq .Lsameoffset_8bytes_aligned
+   clrrdi  r3,r3,3
+   clrrdi  r4,r4,3
+   LD  rA,0,r3
+   LD  rB,0,r4
+   sld rA,rA,r6
+   sld rB,rB,r6
+   cmpld   cr0,rA,rB
+   srwir6,r6,3
+   bne cr0,.LcmpAB_lightweight
+   subfic  r6,r6,8
+   subf.   

[PATCH v7 0/5] powerpc/64: memcmp() optimization

2018-05-30 Thread wei . guo . simon
From: Simon Guo 

There is some room to optimize the powerpc 64-bit version of memcmp() for
the following 2 cases:
(1) Even if src/dst addresses are not 8-byte aligned at the beginning,
memcmp() can align them and use the .Llong comparison mode instead of
falling back to the .Lshort comparison mode, which compares the buffer
byte by byte.
(2) VMX instructions can be used to speed up large-size comparisons;
currently the threshold is set to 4K bytes. Note that the VMX instructions
lead to a VMX register save/restore penalty. This patch set includes a
patch that adds a 32-byte pre-check to minimize the penalty.

It does something similar to glibc commit dec4a7105e ("powerpc: Improve
memcmp performance for POWER8"). Thanks to Cyril Bur for the information.
This patch set also updates the memcmp selftest so that it compiles and
incorporates a large-size comparison case.

v6 -> v7:
- add vcmpequd/vcmpequb .long macros
- add CPU_FTR pair so that Power7 won't invoke Altivec instrs.
- rework some instructions for higher performance or more readable.

v5 -> v6:
- correct some comments/commit messages.
- rename VMX_OPS_THRES to VMX_THRESH

v4 -> v5:
- Expand 32 bytes prechk to src/dst different offset case, and remove
KSM specific label/comment.

v3 -> v4:
- Add 32 bytes pre-checking before using VMX instructions.

v2 -> v3:
- add optimization for src/dst with different offset against 8 bytes
boundary.
- renamed some label names.
- reworked some comments from Cyril Bur, such as filling the pipeline,
and using VMX when size == 4K.
- fix an enter/exit_vmx_ops pairing bug, and revised the test
case to check whether enter/exit_vmx_ops are paired.

v1 -> v2:
- update 8bytes unaligned bytes comparison method.
- fix a VMX comparison bug.
- enhanced the original memcmp() selftest.
- add powerpc/64 to subject/commit message.


Simon Guo (5):
  powerpc/64: Align bytes before fall back to .Lshort in powerpc64
memcmp()
  powerpc: add vcmpequd/vcmpequb ppc instruction macro
  powerpc/64: enhance memcmp() with VMX instruction for long bytes
comparison
  powerpc/64: add 32 bytes prechecking before using VMX optimization on
memcmp()
  powerpc:selftest update memcmp_64 selftest for VMX implementation

 arch/powerpc/include/asm/asm-prototypes.h  |   4 +-
 arch/powerpc/include/asm/ppc-opcode.h  |  11 +
 arch/powerpc/lib/copypage_power7.S |   4 +-
 arch/powerpc/lib/memcmp_64.S   | 412 -
 arch/powerpc/lib/memcpy_power7.S   |   6 +-
 arch/powerpc/lib/vmx-helper.c  |   4 +-
 .../selftests/powerpc/copyloops/asm/ppc_asm.h  |   4 +-
 .../selftests/powerpc/stringloops/asm/ppc-opcode.h |  39 ++
 .../selftests/powerpc/stringloops/asm/ppc_asm.h|  24 ++
 .../testing/selftests/powerpc/stringloops/memcmp.c |  98 +++--
 10 files changed, 566 insertions(+), 40 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h

-- 
1.8.3.1



[PATCH v2] KVM: PPC: remove mmio_vsx_tx_sx_enabled in KVM MMIO emulation

2018-05-27 Thread wei . guo . simon
From: Simon Guo 

Originally, PPC KVM MMIO emulation uses only 0~31 (5 bits) for the VSR
register number and uses the mmio_vsx_tx_sx_enabled field together with
it to cover VSR regs 0~63.

Currently PPC KVM MMIO emulation is reimplemented with analyse_instr()
assistance. analyse_instr() returns 0~63 for the VSR register number, so
it is no longer necessary to use the additional mmio_vsx_tx_sx_enabled
field.

This patch extends the related register bits (expands io_gpr from u8 to
u16 and uses 6 bits for the VSR reg#), so that mmio_vsx_tx_sx_enabled can
be removed; a sketch of the new io_gpr decoding follows below.
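
A minimal sketch of the new io_gpr layout, using the constants from the
hunk below (illustrative userspace code, not the kernel implementation):
--
#include <stdio.h>

#define KVM_MMIO_REG_MASK     0x003f  /* low 6 bits: register number 0..63 */
#define KVM_MMIO_REG_EXT_MASK 0xffc0  /* upper bits: register class */
#define KVM_MMIO_REG_GPR      0x0000
#define KVM_MMIO_REG_VSX      0x0100

int main(void)
{
        unsigned short io_gpr = KVM_MMIO_REG_VSX | 45;  /* VSR #45 */
        unsigned int index = io_gpr & KVM_MMIO_REG_MASK;

        /* VSR 0..31 live in the FP registers while VSR 32..63 overlap the
         * vector registers, so "index >= 32" replaces the old
         * mmio_vsx_tx_sx_enabled flag. */
        if ((io_gpr & KVM_MMIO_REG_EXT_MASK) == KVM_MMIO_REG_VSX)
                printf("VSX reg %u -> %s[%u]\n", index,
                       index >= 32 ? "VR" : "FPR",
                       index >= 32 ? index - 32 : index);
        return 0;
}
--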

v1 -> v2 change:
rework the commit message to remove "PR KVM" specific word.

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_host.h  | 17 -
 arch/powerpc/kvm/emulate_loadstore.c |  7 +++
 arch/powerpc/kvm/powerpc.c   | 30 +++---
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 8dc5e43..bd220a3 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -673,7 +673,7 @@ struct kvm_vcpu_arch {
gva_t vaddr_accessed;
pgd_t *pgdir;
 
-   u8 io_gpr; /* GPR used as IO source/target */
+   u16 io_gpr; /* GPR used as IO source/target */
u8 mmio_host_swabbed;
u8 mmio_sign_extend;
/* conversion between single and double precision */
@@ -689,7 +689,6 @@ struct kvm_vcpu_arch {
 */
u8 mmio_vsx_copy_nums;
u8 mmio_vsx_offset;
-   u8 mmio_vsx_tx_sx_enabled;
u8 mmio_vmx_copy_nums;
u8 mmio_vmx_offset;
u8 mmio_copy_type;
@@ -802,14 +801,14 @@ struct kvm_vcpu_arch {
 #define KVMPPC_VCPU_BUSY_IN_HOST   2
 
 /* Values for vcpu->arch.io_gpr */
-#define KVM_MMIO_REG_MASK  0x001f
-#define KVM_MMIO_REG_EXT_MASK  0xffe0
+#define KVM_MMIO_REG_MASK  0x003f
+#define KVM_MMIO_REG_EXT_MASK  0xffc0
 #define KVM_MMIO_REG_GPR   0x0000
-#define KVM_MMIO_REG_FPR   0x0020
-#define KVM_MMIO_REG_QPR   0x0040
-#define KVM_MMIO_REG_FQPR  0x0060
-#define KVM_MMIO_REG_VSX   0x0080
-#define KVM_MMIO_REG_VMX   0x00c0
+#define KVM_MMIO_REG_FPR   0x0040
+#define KVM_MMIO_REG_QPR   0x0080
+#define KVM_MMIO_REG_FQPR  0x00c0
+#define KVM_MMIO_REG_VSX   0x0100
+#define KVM_MMIO_REG_VMX   0x0180
 
 #define __KVM_HAVE_ARCH_WQP
 #define __KVM_HAVE_CREATE_DEVICE
diff --git a/arch/powerpc/kvm/emulate_loadstore.c 
b/arch/powerpc/kvm/emulate_loadstore.c
index dca7f1c..64b325b 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -106,7 +106,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 * if mmio_vsx_tx_sx_enabled == 1, copy data between
 * VSR[32..63] and memory
 */
-   vcpu->arch.mmio_vsx_tx_sx_enabled = get_tx_or_sx(inst);
vcpu->arch.mmio_vsx_copy_nums = 0;
vcpu->arch.mmio_vsx_offset = 0;
vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE;
@@ -242,8 +241,8 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
}
 
emulated = kvmppc_handle_vsx_load(run, vcpu,
-   KVM_MMIO_REG_VSX | (op.reg & 0x1f),
-   io_size_each, 1, op.type & SIGNEXT);
+   KVM_MMIO_REG_VSX|op.reg, io_size_each,
+   1, op.type & SIGNEXT);
break;
}
 #endif
@@ -363,7 +362,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
}
 
emulated = kvmppc_handle_vsx_store(run, vcpu,
-   op.reg & 0x1f, io_size_each, 1);
+   op.reg, io_size_each, 1);
break;
}
 #endif
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 05eccdc..dcc7982 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -881,10 +881,10 @@ static inline void kvmppc_set_vsr_dword(struct kvm_vcpu 
*vcpu,
if (offset == -1)
return;
 
-   if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
-   val.vval = VCPU_VSX_VR(vcpu, index);
+   if (index >= 32) {
+   val.vval = VCPU_VSX_VR(vcpu, index - 32);
val.vsxval[offset] = gpr;
-   VCPU_VSX_VR(vcpu, index) = val.vval;
+   VCPU_VSX_VR(vcpu, index - 32) = val.vval;
} else {
VCPU_VSX_FPR(vcpu, index, offset) = gpr;
}
@@ -896,11 +896,11 @@ static inline void kvmppc_set_vsr_dword_dump(struct 
kvm_vcpu *vcpu,
union kvmppc_one_reg val;
int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
 
-   if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
-   val.vval = VCPU_VSX_VR(vcpu, index);
+   if (index >= 32) {
+   

[PATCH v6 4/4] powerpc:selftest update memcmp_64 selftest for VMX implementation

2018-05-24 Thread wei . guo . simon
From: Simon Guo 

This patch reworks the memcmp_64 selftest so that it covers more test
cases.

It adds testcases for:
- memcmp over 4K bytes in size.
- s1/s2 with different/random offsets relative to the 16-byte boundary.
- enter/exit_vmx_ops pairing.

Signed-off-by: Simon Guo 
---
 .../selftests/powerpc/copyloops/asm/ppc_asm.h  |  4 +-
 .../selftests/powerpc/stringloops/asm/ppc_asm.h| 22 +
 .../testing/selftests/powerpc/stringloops/memcmp.c | 98 +-
 3 files changed, 100 insertions(+), 24 deletions(-)

diff --git a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h 
b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
index 5ffe04d..dfce161 100644
--- a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
@@ -36,11 +36,11 @@
li  r3,0
blr
 
-FUNC_START(enter_vmx_copy)
+FUNC_START(enter_vmx_ops)
li  r3,1
blr
 
-FUNC_START(exit_vmx_copy)
+FUNC_START(exit_vmx_ops)
blr
 
 FUNC_START(memcpy_power7)
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h 
b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
index 136242e..185d257 100644
--- a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PPC_ASM_H
+#define _PPC_ASM_H
 #include 
 
 #ifndef r1
@@ -6,3 +8,23 @@
 #endif
 
 #define _GLOBAL(A) FUNC_START(test_ ## A)
+
+#define CONFIG_ALTIVEC
+
+#define R14 r14
+#define R15 r15
+#define R16 r16
+#define R17 r17
+#define R18 r18
+#define R19 r19
+#define R20 r20
+#define R21 r21
+#define R22 r22
+#define R29 r29
+#define R30 r30
+#define R31 r31
+
+#define STACKFRAMESIZE 256
+#define STK_REG(i) (112 + ((i)-14)*8)
+
+#endif
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp.c 
b/tools/testing/selftests/powerpc/stringloops/memcmp.c
index 8250db2..b5cf717 100644
--- a/tools/testing/selftests/powerpc/stringloops/memcmp.c
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp.c
@@ -2,20 +2,40 @@
 #include 
 #include 
 #include 
+#include 
 #include "utils.h"
 
 #define SIZE 256
 #define ITERATIONS 1
 
+#define LARGE_SIZE (5 * 1024)
+#define LARGE_ITERATIONS 1000
+#define LARGE_MAX_OFFSET 32
+#define LARGE_SIZE_START 4096
+
+#define MAX_OFFSET_DIFF_S1_S2 48
+
+int vmx_count;
+int enter_vmx_ops(void)
+{
+   vmx_count++;
+   return 1;
+}
+
+void exit_vmx_ops(void)
+{
+   vmx_count--;
+}
 int test_memcmp(const void *s1, const void *s2, size_t n);
 
 /* test all offsets and lengths */
-static void test_one(char *s1, char *s2)
+static void test_one(char *s1, char *s2, unsigned long max_offset,
+   unsigned long size_start, unsigned long max_size)
 {
unsigned long offset, size;
 
-   for (offset = 0; offset < SIZE; offset++) {
-   for (size = 0; size < (SIZE-offset); size++) {
+   for (offset = 0; offset < max_offset; offset++) {
+   for (size = size_start; size < (max_size - offset); size++) {
int x, y;
unsigned long i;
 
@@ -35,70 +55,104 @@ static void test_one(char *s1, char *s2)
printf("\n");
abort();
}
+
+   if (vmx_count != 0) {
+   printf("vmx enter/exit not paired.(offset:%ld 
size:%ld s1:%p s2:%p vc:%d\n",
+   offset, size, s1, s2, vmx_count);
+   printf("\n");
+   abort();
+   }
}
}
 }
 
-static int testcase(void)
+static int testcase(bool islarge)
 {
char *s1;
char *s2;
unsigned long i;
 
-   s1 = memalign(128, SIZE);
+   unsigned long comp_size = (islarge ? LARGE_SIZE : SIZE);
+   unsigned long alloc_size = comp_size + MAX_OFFSET_DIFF_S1_S2;
+   int iterations = islarge ? LARGE_ITERATIONS : ITERATIONS;
+
+   s1 = memalign(128, alloc_size);
if (!s1) {
perror("memalign");
exit(1);
}
 
-   s2 = memalign(128, SIZE);
+   s2 = memalign(128, alloc_size);
if (!s2) {
perror("memalign");
exit(1);
}
 
-   srandom(1);
+   srandom(time(0));
 
-   for (i = 0; i < ITERATIONS; i++) {
+   for (i = 0; i < iterations; i++) {
unsigned long j;
unsigned long change;
+   char *rand_s1 = s1;
+   char *rand_s2 = s2;
 
-   for (j = 0; j < SIZE; j++)
+   for (j = 0; j < alloc_size; j++)
s1[j] = random();
 
-   memcpy(s2, s1, SIZE);
+   rand_s1 += random() % MAX_OFFSET_DIFF_S1_S2;
+   

[PATCH v6 3/4] powerpc/64: add 32 bytes prechecking before using VMX optimization on memcmp()

2018-05-24 Thread wei . guo . simon
From: Simon Guo 

This patch is based on the previous VMX patch on memcmp().

To optimize ppc64 memcmp() with VMX instructions, we need to think about
the VMX penalty they bring: if the kernel uses VMX instructions, it needs
to save/restore the current thread's VMX registers. There are 32 x 128-bit
VMX registers in PPC, which means 32 x 16 = 512 bytes to load and store.

The major concern regarding memcmp() performance in the kernel is KSM,
which uses memcmp() frequently to merge identical pages. So it makes sense
to take some measures/enhancements for KSM to see whether any improvement
can be done here.  Cyril Bur indicates in the following mail that the
memcmp() for KSM has a higher probability of failing (mismatching) early,
within the first few bytes:
https://patchwork.ozlabs.org/patch/817322/#1773629
This patch is a follow-up on that.

Per some testing, KSM memcmp() tends to fail early within the first 32
bytes.  More specifically:
- 76% of cases fail/mismatch before 16 bytes;
- 83% of cases fail/mismatch before 32 bytes;
- 84% of cases fail/mismatch before 64 bytes;
So 32 bytes looks like a better pre-checking length than the alternatives.

The early failure also holds for memcmp() in the non-KSM case. With a
non-typical call load, ~73% of cases fail before the first 32 bytes.

This patch adds a 32-byte pre-check before jumping into VMX operations,
to avoid the unnecessary VMX penalty. It is not limited to the KSM case.
Testing shows a ~20% improvement in average memcmp() execution time with
this patch.

Note that the 32B pre-check is only performed when the compare size is
long enough (>= 4K currently) to allow VMX operation.

The detailed data and analysis are at:
https://github.com/justdoitqd/publicFiles/blob/master/memcmp/README.md

Signed-off-by: Simon Guo 
---
 arch/powerpc/lib/memcmp_64.S | 50 +---
 1 file changed, 42 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index 4ba7bb6..96eb08b 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -403,8 +403,27 @@ _GLOBAL(memcmp)
 #ifdef CONFIG_ALTIVEC
 .Lsameoffset_vmx_cmp:
/* Enter with src/dst addrs has the same offset with 8 bytes
-* align boundary
+* align boundary.
+*
+* There is an optimization based on following fact: memcmp()
+* prones to fail early at the first 32 bytes.
+* Before applying VMX instructions which will lead to 32x128bits
+* VMX regs load/restore penalty, we compare the first 32 bytes
+* so that we can catch the ~80% fail cases.
 */
+
+   li  r0,4
+   mtctr   r0
+.Lsameoffset_prechk_32B_loop:
+   LD  rA,0,r3
+   LD  rB,0,r4
+   cmpld   cr0,rA,rB
+   addir3,r3,8
+   addir4,r4,8
+   bne cr0,.LcmpAB_lightweight
+   addir5,r5,-8
+   bdnz.Lsameoffset_prechk_32B_loop
+
ENTER_VMX_OPS
beq cr1,.Llong_novmx_cmp
 
@@ -481,13 +500,6 @@ _GLOBAL(memcmp)
 #endif
 
 .Ldiffoffset_8bytes_make_align_start:
-#ifdef CONFIG_ALTIVEC
-   /* only do vmx ops when the size equal or greater than 4K bytes */
-   cmpdi   cr5,r5,VMX_THRESH
-   bge cr5,.Ldiffoffset_vmx_cmp
-.Ldiffoffset_novmx_cmp:
-#endif
-
/* now try to align s1 with 8 bytes */
andi.   r6,r3,0x7
rlwinm  r6,r6,3,0,28
@@ -512,6 +524,13 @@ _GLOBAL(memcmp)
 
 .Ldiffoffset_align_s1_8bytes:
/* now s1 is aligned with 8 bytes. */
+#ifdef CONFIG_ALTIVEC
+   /* only do vmx ops when the size is equal or greater than 4K bytes */
+   cmpdi   cr5,r5,VMX_THRESH
+   bge cr5,.Ldiffoffset_vmx_cmp
+.Ldiffoffset_novmx_cmp:
+#endif
+
cmpdi   cr5,r5,31
ble cr5,.Lcmp_lt32bytes
 
@@ -523,6 +542,21 @@ _GLOBAL(memcmp)
 
 #ifdef CONFIG_ALTIVEC
 .Ldiffoffset_vmx_cmp:
+   /* perform a 32 bytes pre-checking before
+* enable VMX operations.
+*/
+   li  r0,4
+   mtctr   r0
+.Ldiffoffset_prechk_32B_loop:
+   LD  rA,0,r3
+   LD  rB,0,r4
+   cmpld   cr0,rA,rB
+   addir3,r3,8
+   addir4,r4,8
+   bne cr0,.LcmpAB_lightweight
+   addir5,r5,-8
+   bdnz.Ldiffoffset_prechk_32B_loop
+
ENTER_VMX_OPS
beq cr1,.Ldiffoffset_novmx_cmp
 
-- 
1.8.3.1



[PATCH v6 2/4] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparison

2018-05-24 Thread wei . guo . simon
From: Simon Guo 

This patch adds VMX primitives to do memcmp() when the compare size is
equal to or greater than 4K bytes. The KSM feature can benefit from
this.

Test result with the following test program (replace the "^>" with ""):
--
># cat tools/testing/selftests/powerpc/stringloops/memcmp.c
>#include 
>#include 
>#include 
>#include 
>#include "utils.h"
>#define SIZE (1024 * 1024 * 900)
>#define ITERATIONS 40

int test_memcmp(const void *s1, const void *s2, size_t n);

static int testcase(void)
{
char *s1;
char *s2;
unsigned long i;

s1 = memalign(128, SIZE);
if (!s1) {
perror("memalign");
exit(1);
}

s2 = memalign(128, SIZE);
if (!s2) {
perror("memalign");
exit(1);
}

for (i = 0; i < SIZE; i++)  {
s1[i] = i & 0xff;
s2[i] = i & 0xff;
}
for (i = 0; i < ITERATIONS; i++) {
int ret = test_memcmp(s1, s2, SIZE);

if (ret) {
printf("return %d at[%ld]! should have returned 
zero\n", ret, i);
abort();
}
}

return 0;
}

int main(void)
{
return test_harness(testcase, "memcmp");
}
--
Without this patch (but with the first patch "powerpc/64: Align bytes
before fall back to .Lshort in powerpc64 memcmp()." in the series):
4.726728762 seconds time elapsed
  ( +-  3.54%)
With VMX patch:
4.234335473 seconds time elapsed
  ( +-  2.63%)
There is ~+10% improvement.

Testing with the unaligned and different-offset version (s1 and s2
shifted by a random offset within 16 bytes) can achieve an improvement
higher than 10%.

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/asm-prototypes.h |   4 +-
 arch/powerpc/lib/copypage_power7.S|   4 +-
 arch/powerpc/lib/memcmp_64.S  | 233 +-
 arch/powerpc/lib/memcpy_power7.S  |   6 +-
 arch/powerpc/lib/vmx-helper.c |   4 +-
 5 files changed, 241 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index d9713ad..31fdcee 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -49,8 +49,8 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
-void * exit_vmx_copy(void *dest);
+int enter_vmx_ops(void);
+void *exit_vmx_ops(void *dest);
 
 /* Traps */
 long machine_check_early(struct pt_regs *regs);
diff --git a/arch/powerpc/lib/copypage_power7.S 
b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7..e38f956 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
std r0,16(r1)
stdur1,-STACKFRAMESIZE(r1)
-   bl  enter_vmx_copy
+   bl  enter_vmx_ops
cmpwi   r3,0
ld  r0,STACKFRAMESIZE+16(r1)
ld  r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
addir3,r3,128
bdnz1b
 
-   b   exit_vmx_copy   /* tail call optimise */
+   b   exit_vmx_ops/* tail call optimise */
 
 #else
li  r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index f20e883..4ba7bb6 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -27,12 +27,73 @@
 #define LH lhbrx
 #define LW lwbrx
 #define LD ldbrx
+#define LVSlvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+   vperm _VRT,_VRB,_VRA,_VRC
 #else
 #define LH lhzx
 #define LW lwzx
 #define LD ldx
+#define LVSlvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+   vperm _VRT,_VRA,_VRB,_VRC
 #endif
 
+#define VMX_THRESH 4096
+#define ENTER_VMX_OPS  \
+   mflrr0; \
+   std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+   std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+   std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+   std r0,16(r1); \
+   stdur1,-STACKFRAMESIZE(r1); \
+   bl  enter_vmx_ops; \
+   cmpwi   cr1,r3,0; \
+   ld  r0,STACKFRAMESIZE+16(r1); \
+   ld  r3,STK_REG(R31)(r1); \
+   ld  r4,STK_REG(R30)(r1); \
+   ld  r5,STK_REG(R29)(r1); \
+   addir1,r1,STACKFRAMESIZE; \
+   mtlrr0
+
+#define EXIT_VMX_OPS \
+   mflrr0; \
+   std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+   std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+   std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+   std r0,16(r1); \
+   stdu

[PATCH v6 1/4] powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp()

2018-05-24 Thread wei . guo . simon
From: Simon Guo 

Currently the powerpc64 memcmp() will fall back to .Lshort (the
compare-per-byte mode) if either the src or dst address is not 8-byte
aligned. It can be optimized in 2 situations:

1) If both addresses have the same offset from an 8-byte boundary:
memcmp() can compare the unaligned bytes within the 8-byte boundary
first and then compare the rest of the 8-byte-aligned content in
.Llong mode.

2) If the src/dst addresses do not have the same offset from an 8-byte
boundary: memcmp() can align the src address to 8 bytes, increment the
dst address accordingly, then load src in aligned mode and dst in
unaligned mode.

This patch optimizes memcmp() behavior in the above 2 situations.
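
As a minimal sketch (illustrative C only; the real change is in the
assembly below), the choice between the two situations boils down to
whether the two addresses share the same offset within an 8-byte
boundary:

#include <stdint.h>

/* Returns 1 when s1 and s2 have the same offset from an 8-byte
 * boundary (the ".Lsameoffset"-style path applies); 0 means the
 * ".Ldiffoffset"-style path is needed. */
static int same_8byte_offset(const void *s1, const void *s2)
{
	return (((uintptr_t)s1 ^ (uintptr_t)s2) & 0x7) == 0;
}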

Tested with both little and big endian. The performance results below
are based on little endian.

Following is the test result for the case where src/dst have the same
offset (a similar result was observed when src/dst have different
offsets):
(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
29.773018302 seconds time elapsed   
   ( +- 0.09% )
- with patch
16.485568173 seconds time elapsed   
   ( +-  0.02% )
-> There is ~80% improvement

(2) 32 bytes
To observe the performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with the following:
---
 #include 
 #include "utils.h"

-#define SIZE 256
+#define SIZE 32
 #define ITERATIONS 1

 int test_memcmp(const void *s1, const void *s2, size_t n);


- Without patch
0.244746482 seconds time elapsed
  ( +-  0.36%)
- with patch
0.215069477 seconds time elapsed
  ( +-  0.51%)
-> There is ~+13% improvement

(3) 0~8 bytes
To observe the < 8 bytes performance impact, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with the following:
---
 #include 
 #include "utils.h"

-#define SIZE 256
-#define ITERATIONS 1
+#define SIZE 8
+#define ITERATIONS 100

 int test_memcmp(const void *s1, const void *s2, size_t n);
---
- Without patch
   1.845642503 seconds time elapsed 
 ( +- 0.12% )
- With patch
   1.849767135 seconds time elapsed 
 ( +- 0.26% )
-> They are nearly the same. (-0.2%)

Signed-off-by: Simon Guo 
---
 arch/powerpc/lib/memcmp_64.S | 143 ---
 1 file changed, 136 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b..f20e883 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,28 +24,41 @@
 #define rH r31
 
 #ifdef __LITTLE_ENDIAN__
+#define LH lhbrx
+#define LW lwbrx
 #define LD ldbrx
 #else
+#define LH lhzx
+#define LW lwzx
 #define LD ldx
 #endif
 
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
+ * are named like .Lsameoffset_
+ * 2) src/dst has different offset to the 8 bytes boundary. The handlers
+ * are named like .Ldiffoffset_
+ */
 _GLOBAL(memcmp)
cmpdi   cr1,r5,0
 
-   /* Use the short loop if both strings are not 8B aligned */
-   or  r6,r3,r4
+   /* Use the short loop if the src/dst addresses are not
+* with the same offset of 8 bytes align boundary.
+*/
+   xor r6,r3,r4
andi.   r6,r6,7
 
-   /* Use the short loop if length is less than 32B */
-   cmpdi   cr6,r5,31
+   /* Fall back to short loop if compare at aligned addrs
+* with less than 8 bytes.
+*/
+   cmpdi   cr6,r5,7
 
beq cr1,.Lzero
-   bne .Lshort
-   bgt cr6,.Llong
+   bgt cr6,.Lno_short
 
 .Lshort:
mtctr   r5
-
 1: lbz rA,0(r3)
lbz rB,0(r4)
subf.   rC,rB,rA
@@ -78,11 +91,90 @@ _GLOBAL(memcmp)
li  r3,0
blr
 
+.Lno_short:
+   dcbt0,r3
+   dcbt0,r4
+   bne .Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+   /* attempt to compare bytes not aligned with 8 bytes so that
+* rest comparison can run based on 8 bytes alignment.
+*/
+   andi.   r6,r3,7
+
+   /* Try to compare the first double word which is not 8 bytes aligned:
+* load the first double word at (src & ~7UL) and shift left appropriate
+* bits before comparision.
+*/
+   clrlwi  r6,r3,29
+   rlwinm  r6,r6,3,0,28
+   beq .Lsameoffset_8bytes_aligned
+   clrrdi  r3,r3,3
+   clrrdi  r4,r4,3
+   LD  rA,0,r3
+   LD  rB,0,r4
+   sld rA,rA,r6
+   sld rB,rB,r6
+   cmpld   cr0,rA,rB
+   srwir6,r6,3
+   

[PATCH v6 0/4] powerpc/64: memcmp() optimization

2018-05-24 Thread wei . guo . simon
From: Simon Guo 

There is some room to optimize the powerpc 64-bit memcmp() for the
following 2 cases:
(1) Even if the src/dst addresses are not 8-byte aligned at the
beginning, memcmp() can align them and use the .Llong comparison mode
without falling back to the .Lshort comparison mode, which compares
the buffer byte by byte.
(2) VMX instructions can be used to speed up large-size comparisons;
currently the threshold is set to 4K bytes. Note that the VMX
instructions incur a VMX regs save/load penalty. This patch set
includes a patch that adds a 32-byte pre-check to minimize that
penalty.

This is similar to glibc commit dec4a7105e (powerpc: Improve memcmp
performance for POWER8). Thanks to Cyril Bur for the information.
This patch set also updates the memcmp selftest case so that it
compiles and incorporates large-size comparison cases.

v5 -> v6:
- correct some comments/commit messages.
- rename VMX_OPS_THRES to VMX_THRESH

v4 -> v5:
- Expand the 32-byte pre-check to the src/dst different-offset case,
and remove the KSM-specific label/comment.

v3 -> v4:
- Add 32-byte pre-checking before using VMX instructions.

v2 -> v3:
- add optimization for src/dst with different offsets from the 8-byte
boundary.
- renamed some labels.
- reworked some comments from Cyril Bur, such as filling the pipeline
and using VMX when size == 4K.
- fix an enter/exit_vmx_ops pairing bug, and revised the test case to
check whether enter/exit_vmx_ops are paired.

v1 -> v2:
- update the 8-byte unaligned bytes comparison method.
- fix a VMX comparison bug.
- enhanced the original memcmp() selftest.
- add powerpc/64 to the subject/commit message.

Simon Guo (4):
  powerpc/64: Align bytes before fall back to .Lshort in powerpc64
memcmp()
  powerpc/64: enhance memcmp() with VMX instruction for long bytes
comparison
  powerpc/64: add 32 bytes prechecking before using VMX optimization on
memcmp()
  powerpc:selftest update memcmp_64 selftest for VMX implementation

 arch/powerpc/include/asm/asm-prototypes.h  |   4 +-
 arch/powerpc/lib/copypage_power7.S |   4 +-
 arch/powerpc/lib/memcmp_64.S   | 408 -
 arch/powerpc/lib/memcpy_power7.S   |   6 +-
 arch/powerpc/lib/vmx-helper.c  |   4 +-
 .../selftests/powerpc/copyloops/asm/ppc_asm.h  |   4 +-
 .../selftests/powerpc/stringloops/asm/ppc_asm.h|  22 ++
 .../testing/selftests/powerpc/stringloops/memcmp.c |  98 +++--
 8 files changed, 510 insertions(+), 40 deletions(-)

-- 
1.8.3.1



[PATCH] KVM: PPC: remove mmio_vsx_tx_sx_enabled in PR KVM MMIO emulation

2018-05-24 Thread wei . guo . simon
From: Simon Guo 

Originally PR KVM MMIO emulation used only 5 bits (0~31) for the VSR
reg number, and used the mmio_vsx_tx_sx_enabled field together with it
to cover VSR regs 0~63.

Currently PR KVM MMIO emulation is reimplemented with analyse_instr()
assistance. analyse_instr() returns 0~63 for the VSR register number,
so the additional mmio_vsx_tx_sx_enabled field is no longer necessary.

This patch extends the related reg bits (expands io_gpr from u8 to u16
and uses 6 bits for the VSR reg number), so that
mmio_vsx_tx_sx_enabled can be removed.
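
For illustration, a small sketch of the resulting encoding (constant
values are taken from this patch; the helper name is hypothetical and
not part of the patch):

/* io_gpr now keeps the register number in its low 6 bits and the
 * register class in the upper bits. */
#define KVM_MMIO_REG_MASK      0x003f
#define KVM_MMIO_REG_VSX       0x0100

static inline unsigned int mmio_reg_index(unsigned short io_gpr)
{
	/* 0..63; for the VSX class, an index >= 32 selects VSR[32..63] */
	return io_gpr & KVM_MMIO_REG_MASK;
}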

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_host.h  | 17 -
 arch/powerpc/kvm/emulate_loadstore.c |  7 +++
 arch/powerpc/kvm/powerpc.c   | 30 +++---
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 8dc5e43..bd220a3 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -673,7 +673,7 @@ struct kvm_vcpu_arch {
gva_t vaddr_accessed;
pgd_t *pgdir;
 
-   u8 io_gpr; /* GPR used as IO source/target */
+   u16 io_gpr; /* GPR used as IO source/target */
u8 mmio_host_swabbed;
u8 mmio_sign_extend;
/* conversion between single and double precision */
@@ -689,7 +689,6 @@ struct kvm_vcpu_arch {
 */
u8 mmio_vsx_copy_nums;
u8 mmio_vsx_offset;
-   u8 mmio_vsx_tx_sx_enabled;
u8 mmio_vmx_copy_nums;
u8 mmio_vmx_offset;
u8 mmio_copy_type;
@@ -802,14 +801,14 @@ struct kvm_vcpu_arch {
 #define KVMPPC_VCPU_BUSY_IN_HOST   2
 
 /* Values for vcpu->arch.io_gpr */
-#define KVM_MMIO_REG_MASK  0x001f
-#define KVM_MMIO_REG_EXT_MASK  0xffe0
+#define KVM_MMIO_REG_MASK  0x003f
+#define KVM_MMIO_REG_EXT_MASK  0xffc0
 #define KVM_MMIO_REG_GPR   0x
-#define KVM_MMIO_REG_FPR   0x0020
-#define KVM_MMIO_REG_QPR   0x0040
-#define KVM_MMIO_REG_FQPR  0x0060
-#define KVM_MMIO_REG_VSX   0x0080
-#define KVM_MMIO_REG_VMX   0x00c0
+#define KVM_MMIO_REG_FPR   0x0040
+#define KVM_MMIO_REG_QPR   0x0080
+#define KVM_MMIO_REG_FQPR  0x00c0
+#define KVM_MMIO_REG_VSX   0x0100
+#define KVM_MMIO_REG_VMX   0x0180
 
 #define __KVM_HAVE_ARCH_WQP
 #define __KVM_HAVE_CREATE_DEVICE
diff --git a/arch/powerpc/kvm/emulate_loadstore.c 
b/arch/powerpc/kvm/emulate_loadstore.c
index dca7f1c..64b325b 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -106,7 +106,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 * if mmio_vsx_tx_sx_enabled == 1, copy data between
 * VSR[32..63] and memory
 */
-   vcpu->arch.mmio_vsx_tx_sx_enabled = get_tx_or_sx(inst);
vcpu->arch.mmio_vsx_copy_nums = 0;
vcpu->arch.mmio_vsx_offset = 0;
vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE;
@@ -242,8 +241,8 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
}
 
emulated = kvmppc_handle_vsx_load(run, vcpu,
-   KVM_MMIO_REG_VSX | (op.reg & 0x1f),
-   io_size_each, 1, op.type & SIGNEXT);
+   KVM_MMIO_REG_VSX|op.reg, io_size_each,
+   1, op.type & SIGNEXT);
break;
}
 #endif
@@ -363,7 +362,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
}
 
emulated = kvmppc_handle_vsx_store(run, vcpu,
-   op.reg & 0x1f, io_size_each, 1);
+   op.reg, io_size_each, 1);
break;
}
 #endif
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 05eccdc..dcc7982 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -881,10 +881,10 @@ static inline void kvmppc_set_vsr_dword(struct kvm_vcpu 
*vcpu,
if (offset == -1)
return;
 
-   if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
-   val.vval = VCPU_VSX_VR(vcpu, index);
+   if (index >= 32) {
+   val.vval = VCPU_VSX_VR(vcpu, index - 32);
val.vsxval[offset] = gpr;
-   VCPU_VSX_VR(vcpu, index) = val.vval;
+   VCPU_VSX_VR(vcpu, index - 32) = val.vval;
} else {
VCPU_VSX_FPR(vcpu, index, offset) = gpr;
}
@@ -896,11 +896,11 @@ static inline void kvmppc_set_vsr_dword_dump(struct 
kvm_vcpu *vcpu,
union kvmppc_one_reg val;
int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
 
-   if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
-   val.vval = VCPU_VSX_VR(vcpu, index);
+   if (index >= 32) {
+   val.vval = VCPU_VSX_VR(vcpu, index - 32);
val.vsxval[0] = gpr;
  

[PATCH v4 29/29] KVM: PPC: Book3S PR: enable kvmppc_get/set_one_reg_pr() for HTM registers

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

We need to migrate PR KVM during a transaction, and qemu will use the
kvmppc_get_one_reg_pr()/kvmppc_set_one_reg_pr() APIs to get/set the
transaction checkpoint state. This patch adds support for that.

So far PPC PR qemu doesn't fully support migration, but savevm/loadvm
can be done against a RHEL 7.2 guest. During the savevm/loadvm
procedure, the kvm ioctls are invoked as well.

A savevm/loadvm test has been performed on a guest running an HTM test
program:
https://github.com/justdoitqd/publicFiles/blob/master/test-tm-mig.c
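
As a hedged userspace sketch of how such a checkpoint register could
be read through the generic one-reg interface (vcpu_fd and the exact
register id are assumptions for illustration):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int get_tm_reg(int vcpu_fd, uint64_t id, uint64_t *out)
{
	struct kvm_one_reg reg = {
		.id   = id,              /* e.g. KVM_REG_PPC_TM_LR */
		.addr = (uintptr_t)out,  /* kernel writes the value here */
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}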

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s_pr.c | 133 +++
 1 file changed, 133 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index f7864da..6f22a67 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1539,6 +1539,73 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, 
u64 id,
else
*val = get_reg_val(id, 0);
break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   case KVM_REG_PPC_TFHAR:
+   *val = get_reg_val(id, vcpu->arch.tfhar);
+   break;
+   case KVM_REG_PPC_TFIAR:
+   *val = get_reg_val(id, vcpu->arch.tfiar);
+   break;
+   case KVM_REG_PPC_TEXASR:
+   *val = get_reg_val(id, vcpu->arch.texasr);
+   break;
+   case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+   *val = get_reg_val(id,
+   vcpu->arch.gpr_tm[id-KVM_REG_PPC_TM_GPR0]);
+   break;
+   case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+   {
+   int i, j;
+
+   i = id - KVM_REG_PPC_TM_VSR0;
+   if (i < 32)
+   for (j = 0; j < TS_FPRWIDTH; j++)
+   val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
+   else {
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
+   val->vval = vcpu->arch.vr_tm.vr[i-32];
+   else
+   r = -ENXIO;
+   }
+   break;
+   }
+   case KVM_REG_PPC_TM_CR:
+   *val = get_reg_val(id, vcpu->arch.cr_tm);
+   break;
+   case KVM_REG_PPC_TM_XER:
+   *val = get_reg_val(id, vcpu->arch.xer_tm);
+   break;
+   case KVM_REG_PPC_TM_LR:
+   *val = get_reg_val(id, vcpu->arch.lr_tm);
+   break;
+   case KVM_REG_PPC_TM_CTR:
+   *val = get_reg_val(id, vcpu->arch.ctr_tm);
+   break;
+   case KVM_REG_PPC_TM_FPSCR:
+   *val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
+   break;
+   case KVM_REG_PPC_TM_AMR:
+   *val = get_reg_val(id, vcpu->arch.amr_tm);
+   break;
+   case KVM_REG_PPC_TM_PPR:
+   *val = get_reg_val(id, vcpu->arch.ppr_tm);
+   break;
+   case KVM_REG_PPC_TM_VRSAVE:
+   *val = get_reg_val(id, vcpu->arch.vrsave_tm);
+   break;
+   case KVM_REG_PPC_TM_VSCR:
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
+   *val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
+   else
+   r = -ENXIO;
+   break;
+   case KVM_REG_PPC_TM_DSCR:
+   *val = get_reg_val(id, vcpu->arch.dscr_tm);
+   break;
+   case KVM_REG_PPC_TM_TAR:
+   *val = get_reg_val(id, vcpu->arch.tar_tm);
+   break;
+#endif
default:
r = -EINVAL;
break;
@@ -1572,6 +1639,72 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, 
u64 id,
case KVM_REG_PPC_LPCR_64:
kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val));
break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   case KVM_REG_PPC_TFHAR:
+   vcpu->arch.tfhar = set_reg_val(id, *val);
+   break;
+   case KVM_REG_PPC_TFIAR:
+   vcpu->arch.tfiar = set_reg_val(id, *val);
+   break;
+   case KVM_REG_PPC_TEXASR:
+   vcpu->arch.texasr = set_reg_val(id, *val);
+   break;
+   case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+   vcpu->arch.gpr_tm[id - KVM_REG_PPC_TM_GPR0] =
+   set_reg_val(id, *val);
+   break;
+   case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+   {
+   int i, j;
+
+   i = id - KVM_REG_PPC_TM_VSR0;
+   if (i < 32)
+   for (j = 0; j < TS_FPRWIDTH; j++)
+   vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
+   else
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
+   vcpu->arch.vr_tm.vr[i-32] = val->vval;
+

[PATCH v4 28/29] KVM: PPC: remove load/put vcpu for KVM_GET_REGS/KVM_SET_REGS

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

In both HV and PR KVM, the KVM_SET_REGS/KVM_GET_REGS ioctls should be
able to run without loading the vcpu. This patch adds a
KVM_SET_ONE_REG/KVM_GET_ONE_REG implementation to the async ioctl
function.

Since the vcpu mutex locking/unlocking has been moved out of
vcpu_load()/vcpu_put(), the KVM_SET_REGS/KVM_GET_REGS ioctls no longer
need to load the vcpu. This patch removes vcpu_load()/vcpu_put() from
the KVM_SET_REGS/KVM_GET_REGS ioctls.

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 97d4a11..523c68f 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -509,8 +509,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
 {
int i;
 
-   vcpu_load(vcpu);
-
regs->pc = kvmppc_get_pc(vcpu);
regs->cr = kvmppc_get_cr(vcpu);
regs->ctr = kvmppc_get_ctr(vcpu);
@@ -532,7 +530,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 
-   vcpu_put(vcpu);
return 0;
 }
 
@@ -540,8 +537,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
 {
int i;
 
-   vcpu_load(vcpu);
-
kvmppc_set_pc(vcpu, regs->pc);
kvmppc_set_cr(vcpu, regs->cr);
kvmppc_set_ctr(vcpu, regs->ctr);
@@ -562,7 +557,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 
-   vcpu_put(vcpu);
return 0;
 }
 
-- 
1.8.3.1



[PATCH v4 27/29] KVM: PPC: remove load/put vcpu for KVM_GET/SET_ONE_REG ioctl

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

Since the vcpu mutex locking/unlocking has been moved out of
vcpu_load()/vcpu_put(), the KVM_GET_ONE_REG and KVM_SET_ONE_REG ioctls
no longer need to load the vcpu. This patch removes
vcpu_load()/vcpu_put() from the KVM_GET_ONE_REG and KVM_SET_ONE_REG
ioctls.

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/powerpc.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index c9098ff..5def68d 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1801,14 +1801,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
{
struct kvm_one_reg reg;
r = -EFAULT;
-   vcpu_load(vcpu);
if (copy_from_user(, argp, sizeof(reg)))
goto out;
if (ioctl == KVM_SET_ONE_REG)
r = kvm_vcpu_ioctl_set_one_reg(vcpu, );
else
r = kvm_vcpu_ioctl_get_one_reg(vcpu, );
-   vcpu_put(vcpu);
break;
}
 
-- 
1.8.3.1



[PATCH v4 26/29] KVM: PPC: move vcpu_load/vcpu_put down to each ioctl case in kvm_arch_vcpu_ioctl

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

Although we already have kvm_arch_vcpu_async_ioctl(), which doesn't
require the ioctl to load the vcpu, the sync ioctl code needs to be
cleaned up for the case where CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL is not
configured.

This patch moves vcpu_load()/vcpu_put() down into each ioctl switch
case so that each ioctl can decide independently whether to do
vcpu_load()/vcpu_put().

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/powerpc.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1fa5bbe..c9098ff 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1783,16 +1783,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
void __user *argp = (void __user *)arg;
long r;
 
-   vcpu_load(vcpu);
-
switch (ioctl) {
case KVM_ENABLE_CAP:
{
struct kvm_enable_cap cap;
r = -EFAULT;
+   vcpu_load(vcpu);
if (copy_from_user(, argp, sizeof(cap)))
goto out;
r = kvm_vcpu_ioctl_enable_cap(vcpu, );
+   vcpu_put(vcpu);
break;
}
 
@@ -1801,12 +1801,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
{
struct kvm_one_reg reg;
r = -EFAULT;
+   vcpu_load(vcpu);
if (copy_from_user(, argp, sizeof(reg)))
goto out;
if (ioctl == KVM_SET_ONE_REG)
r = kvm_vcpu_ioctl_set_one_reg(vcpu, );
else
r = kvm_vcpu_ioctl_get_one_reg(vcpu, );
+   vcpu_put(vcpu);
break;
}
 
@@ -1814,9 +1816,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_DIRTY_TLB: {
struct kvm_dirty_tlb dirty;
r = -EFAULT;
+   vcpu_load(vcpu);
if (copy_from_user(, argp, sizeof(dirty)))
goto out;
r = kvm_vcpu_ioctl_dirty_tlb(vcpu, );
+   vcpu_put(vcpu);
break;
}
 #endif
@@ -1825,7 +1829,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
 
 out:
-   vcpu_put(vcpu);
return r;
 }
 
-- 
1.8.3.1



[PATCH v4 25/29] KVM: PPC: Book3S PR: enable HTM for PR KVM for KVM_CHECK_EXTENSION ioctl

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

With the current patch set, PR KVM now supports HTM, so this patch
turns the capability on for PR KVM.

Tested with:
https://github.com/justdoitqd/publicFiles/blob/master/test_kvm_htm_cap.c
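
A hedged userspace sketch of how the capability exposed here could be
queried (kvm_fd is assumed to be an open /dev/kvm descriptor):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int has_ppc_htm(int kvm_fd)
{
	/* KVM_CHECK_EXTENSION returns > 0 when the capability is present */
	return ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_HTM) > 0;
}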

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/powerpc.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index bef27b1..1fa5bbe 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -648,9 +648,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 #endif
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
case KVM_CAP_PPC_HTM:
-   r = hv_enabled &&
-   (!!(cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_HTM) ||
-cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST));
+   r = !!(cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_HTM) ||
+(hv_enabled && cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST));
break;
 #endif
default:
-- 
1.8.3.1



[PATCH v4 24/29] KVM: PPC: Book3S PR: Support TAR handling for PR KVM HTM.

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

Currently the guest kernel doesn't handle TAR facility unavailable
exceptions and always runs with the TAR bit on. PR KVM enables TAR
lazily. TAR is not a frequently used reg and it is not included in the
SVCPU struct.

Due to the above, the checkpointed TAR val might be a bogus TAR val.
To solve this issue, we make the vcpu->arch.fscr TAR bit consistent
with shadow_fscr when TM is enabled.

At the end of emulating treclaim., the correct TAR val needs to be
loaded into the reg if the FSCR_TAR bit is on.
At the beginning of emulating trechkpt., TAR needs to be flushed so
that the right TAR val can be copied into tar_tm.

Tested with:
tools/testing/selftests/powerpc/tm/tm-tar
tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar (remove DSCR/PPR
related testing).

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_book3s.h |  2 ++
 arch/powerpc/kvm/book3s_emulate.c |  4 
 arch/powerpc/kvm/book3s_pr.c  | 23 ++-
 arch/powerpc/kvm/tm.S | 16 ++--
 4 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 2940de7..1f345a0 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -271,6 +271,8 @@ static inline void kvmppc_save_tm_sprs(struct kvm_vcpu 
*vcpu) {}
 static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
 #endif
 
+void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
+
 extern int kvm_irq_bypass;
 
 static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 67d0fb40..fdbc695 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -173,6 +173,9 @@ static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, 
int ra_val)
guest_msr &= ~(MSR_TS_MASK);
kvmppc_set_msr(vcpu, guest_msr);
preempt_enable();
+
+   if (vcpu->arch.shadow_fscr & FSCR_TAR)
+   mtspr(SPRN_TAR, vcpu->arch.tar);
 }
 
 static void kvmppc_emulate_trchkpt(struct kvm_vcpu *vcpu)
@@ -185,6 +188,7 @@ static void kvmppc_emulate_trchkpt(struct kvm_vcpu *vcpu)
 * copy.
 */
kvmppc_giveup_ext(vcpu, MSR_VSX);
+   kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
kvmppc_copyto_vcpu_tm(vcpu);
kvmppc_save_tm_sprs(vcpu);
 
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 526c928..f7864da 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -55,7 +55,9 @@
 
 static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 ulong msr);
-static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
+#ifdef CONFIG_PPC_BOOK3S_64
+static int kvmppc_handle_fac(struct kvm_vcpu *vcpu, ulong fac);
+#endif
 
 /* Some compatibility defines */
 #ifdef CONFIG_PPC_BOOK3S_32
@@ -346,6 +348,7 @@ void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu)
return;
}
 
+   kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
kvmppc_giveup_ext(vcpu, MSR_VSX);
 
preempt_disable();
@@ -357,8 +360,11 @@ void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu)
 {
if (!MSR_TM_ACTIVE(kvmppc_get_msr(vcpu))) {
kvmppc_restore_tm_sprs(vcpu);
-   if (kvmppc_get_msr(vcpu) & MSR_TM)
+   if (kvmppc_get_msr(vcpu) & MSR_TM) {
kvmppc_handle_lost_math_exts(vcpu);
+   if (vcpu->arch.fscr & FSCR_TAR)
+   kvmppc_handle_fac(vcpu, FSCR_TAR_LG);
+   }
return;
}
 
@@ -366,9 +372,11 @@ void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu)
_kvmppc_restore_tm_pr(vcpu, kvmppc_get_msr(vcpu));
preempt_enable();
 
-   if (kvmppc_get_msr(vcpu) & MSR_TM)
+   if (kvmppc_get_msr(vcpu) & MSR_TM) {
kvmppc_handle_lost_math_exts(vcpu);
-
+   if (vcpu->arch.fscr & FSCR_TAR)
+   kvmppc_handle_fac(vcpu, FSCR_TAR_LG);
+   }
 }
 #endif
 
@@ -819,7 +827,7 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
 }
 
 /* Give up facility (TAR / EBB / DSCR) */
-static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac)
+void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac)
 {
 #ifdef CONFIG_PPC_BOOK3S_64
if (!(vcpu->arch.shadow_fscr & (1ULL << fac))) {
@@ -1020,7 +1028,12 @@ void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr)
if ((vcpu->arch.fscr & FSCR_TAR) && !(fscr & FSCR_TAR)) {
/* TAR got dropped, drop it in shadow too */
kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
+   } else if (!(vcpu->arch.fscr & FSCR_TAR) && (fscr & FSCR_TAR)) {
+   vcpu->arch.fscr = fscr;
+   kvmppc_handle_fac(vcpu, FSCR_TAR_LG);
+   return;
}
+

[PATCH v4 23/29] KVM: PPC: Book3S PR: add guard code to prevent returning to guest with PR=0 and Transactional state

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

Currently PR KVM doesn't support transactional memory in guest
privileged state.

This patch adds a check when setting the guest MSR, so that we can
never return to the guest with PR=0 and TS=0b10. A tabort is emulated
to indicate this and fail the transaction immediately.

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/uapi/asm/tm.h |  2 +-
 arch/powerpc/kvm/book3s.h  |  6 ++
 arch/powerpc/kvm/book3s_emulate.c  |  2 +-
 arch/powerpc/kvm/book3s_pr.c   | 13 -
 4 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/uapi/asm/tm.h 
b/arch/powerpc/include/uapi/asm/tm.h
index e1bf0e2..e2947c9 100644
--- a/arch/powerpc/include/uapi/asm/tm.h
+++ b/arch/powerpc/include/uapi/asm/tm.h
@@ -13,7 +13,7 @@
 #define TM_CAUSE_TLBI  0xdc
 #define TM_CAUSE_FAC_UNAV  0xda
 #define TM_CAUSE_SYSCALL   0xd8
-#define TM_CAUSE_MISC  0xd6  /* future use */
+#define TM_CAUSE_PRIV_T0xd6
 #define TM_CAUSE_SIGNAL0xd4
 #define TM_CAUSE_ALIGNMENT 0xd2
 #define TM_CAUSE_EMULATE   0xd0
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
index 4ad5e28..14ef035 100644
--- a/arch/powerpc/kvm/book3s.h
+++ b/arch/powerpc/kvm/book3s.h
@@ -31,4 +31,10 @@ extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu 
*vcpu,
 extern int kvmppc_book3s_init_pr(void);
 extern void kvmppc_book3s_exit_pr(void);
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+extern void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val);
+#else
+static inline void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val) {}
+#endif
+
 #endif
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 34f910e..67d0fb40 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -199,7 +199,7 @@ static void kvmppc_emulate_trchkpt(struct kvm_vcpu *vcpu)
 }
 
 /* emulate tabort. at guest privilege state */
-static void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val)
+void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val)
 {
/* currently we only emulate tabort. but no emulation of other
 * tabort variants since there is no kernel usage of them at
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 5359f9c..526c928 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -446,12 +446,23 @@ static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned 
long hva, pte_t pte)
 
 static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
 {
-   ulong old_msr = kvmppc_get_msr(vcpu);
+   ulong old_msr;
 
 #ifdef EXIT_DEBUG
printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
 #endif
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   /* We should never target guest MSR to TS=10 && PR=0,
+* since we always fail transaction for guest privilege
+* state.
+*/
+   if (!(msr & MSR_PR) && MSR_TM_TRANSACTIONAL(msr))
+   kvmppc_emulate_tabort(vcpu,
+   TM_CAUSE_PRIV_T | TM_CAUSE_PERSISTENT);
+#endif
+
+   old_msr = kvmppc_get_msr(vcpu);
msr &= to_book3s(vcpu)->msr_mask;
kvmppc_set_msr_fast(vcpu, msr);
kvmppc_recalc_shadow_msr(vcpu);
-- 
1.8.3.1



[PATCH v4 22/29] KVM: PPC: Book3S PR: add emulation for tabort. for privilege guest

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

Currently a privileged guest runs with TM disabled.

Although the privileged guest cannot initiate a new transaction, it
can use tabort to terminate its problem state's transaction. So it is
still necessary to emulate tabort. for the privileged guest.

This patch adds emulation of tabort. for the privileged guest.

Tested with:
https://github.com/justdoitqd/publicFiles/blob/master/test_tabort.c

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s_emulate.c | 68 +++
 1 file changed, 68 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index b7530cf..34f910e 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -50,6 +50,7 @@
 #define OP_31_XOP_SLBMFEE  915
 
 #define OP_31_XOP_TBEGIN   654
+#define OP_31_XOP_TABORT   910
 
 #define OP_31_XOP_TRECLAIM 942
 #define OP_31_XOP_TRCHKPT  1006
@@ -196,6 +197,47 @@ static void kvmppc_emulate_trchkpt(struct kvm_vcpu *vcpu)
kvmppc_restore_tm_pr(vcpu);
preempt_enable();
 }
+
+/* emulate tabort. at guest privilege state */
+static void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val)
+{
+   /* currently we only emulate tabort. but no emulation of other
+* tabort variants since there is no kernel usage of them at
+* present.
+*/
+   unsigned long guest_msr = kvmppc_get_msr(vcpu);
+
+   preempt_disable();
+   tm_enable();
+   tm_abort(ra_val);
+
+   /* CR0 = 0 | MSR[TS] | 0 */
+   vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
+   (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
+<< CR0_SHIFT);
+
+   vcpu->arch.texasr = mfspr(SPRN_TEXASR);
+   /* failure recording depends on Failure Summary bit,
+* and tabort will be treated as nops in non-transactional
+* state.
+*/
+   if (!(vcpu->arch.texasr & TEXASR_FS) &&
+   MSR_TM_ACTIVE(guest_msr)) {
+   vcpu->arch.texasr &= ~(TEXASR_PR | TEXASR_HV);
+   if (guest_msr & MSR_PR)
+   vcpu->arch.texasr |= TEXASR_PR;
+
+   if (guest_msr & MSR_HV)
+   vcpu->arch.texasr |= TEXASR_HV;
+
+   vcpu->arch.tfiar = kvmppc_get_pc(vcpu);
+   mtspr(SPRN_TEXASR, vcpu->arch.texasr);
+   mtspr(SPRN_TFIAR, vcpu->arch.tfiar);
+   }
+   tm_disable();
+   preempt_enable();
+}
+
 #endif
 
 int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
@@ -468,6 +510,32 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
emulated = EMULATE_FAIL;
break;
}
+   case OP_31_XOP_TABORT:
+   {
+   ulong guest_msr = kvmppc_get_msr(vcpu);
+   unsigned long ra_val = 0;
+
+   if (!cpu_has_feature(CPU_FTR_TM))
+   break;
+
+   if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+   kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   /* only emulate for privilege guest, since problem state
+* guest can run with TM enabled and we don't expect to
+* trap at here for that case.
+*/
+   WARN_ON(guest_msr & MSR_PR);
+
+   if (ra)
+   ra_val = kvmppc_get_gpr(vcpu, ra);
+
+   kvmppc_emulate_tabort(vcpu, ra_val);
+   break;
+   }
case OP_31_XOP_TRECLAIM:
{
ulong guest_msr = kvmppc_get_msr(vcpu);
-- 
1.8.3.1



[PATCH v4 21/29] KVM: PPC: Book3S PR: add emulation for trechkpt in PR KVM.

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

This patch adds host emulation for when a PR KVM guest executes
"trechkpt.", which is a privileged instruction and traps into the
host.

We first copy the vcpu's ongoing register content into the vcpu TM
checkpoint area, then perform kvmppc_restore_tm_pr() to do the
trechkpt. with the updated vcpu TM checkpoint vals.

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_book3s.h |  2 ++
 arch/powerpc/kvm/book3s_emulate.c | 61 +++
 arch/powerpc/kvm/book3s_pr.c  |  2 +-
 3 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index c1cea82..2940de7 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -262,10 +262,12 @@ extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned 
long lpcr,
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu);
 void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu);
+void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu);
 void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu);
 #else
 static inline void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu) {}
 static inline void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {}
 static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 04c29e0..b7530cf 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -52,6 +52,7 @@
 #define OP_31_XOP_TBEGIN   654
 
 #define OP_31_XOP_TRECLAIM 942
+#define OP_31_XOP_TRCHKPT  1006
 
 /* DCBZ is actually 1014, but we patch it to 1010 so we get a trap */
 #define OP_31_XOP_DCBZ 1010
@@ -172,6 +173,29 @@ static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, 
int ra_val)
kvmppc_set_msr(vcpu, guest_msr);
preempt_enable();
 }
+
+static void kvmppc_emulate_trchkpt(struct kvm_vcpu *vcpu)
+{
+   unsigned long guest_msr = kvmppc_get_msr(vcpu);
+
+   preempt_disable();
+   /*
+* need flush FP/VEC/VSX to vcpu save area before
+* copy.
+*/
+   kvmppc_giveup_ext(vcpu, MSR_VSX);
+   kvmppc_copyto_vcpu_tm(vcpu);
+   kvmppc_save_tm_sprs(vcpu);
+
+   /*
+* as a result of trecheckpoint. set TS to suspended.
+*/
+   guest_msr &= ~(MSR_TS_MASK);
+   guest_msr |= MSR_TS_S;
+   kvmppc_set_msr(vcpu, guest_msr);
+   kvmppc_restore_tm_pr(vcpu);
+   preempt_enable();
+}
 #endif
 
 int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
@@ -478,6 +502,43 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
kvmppc_emulate_treclaim(vcpu, ra_val);
break;
}
+   case OP_31_XOP_TRCHKPT:
+   {
+   ulong guest_msr = kvmppc_get_msr(vcpu);
+   unsigned long texasr;
+
+   if (!cpu_has_feature(CPU_FTR_TM))
+   break;
+
+   if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+   kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   /* generate interrupt based on priorities */
+   if (guest_msr & MSR_PR) {
+   /* Privileged Instruction type Program Intr */
+   kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   tm_enable();
+   texasr = mfspr(SPRN_TEXASR);
+   tm_disable();
+
+   if (MSR_TM_ACTIVE(guest_msr) ||
+   !(texasr & (TEXASR_FS))) {
+   /* TM bad thing interrupt */
+   kvmppc_core_queue_program(vcpu, SRR1_PROGTM);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   kvmppc_emulate_trchkpt(vcpu);
+   break;
+   }
 #endif
default:
emulated = EMULATE_FAIL;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 9a72460..5359f9c 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -299,7 +299,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
 }
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu)
+void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu)
 {

[PATCH v4 20/29] KVM: PPC: Book3S PR: adds emulation for treclaim.

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

This patch adds support for "treclaim." emulation when a PR KVM guest
executes treclaim. and traps into the host.

We first do the treclaim. and save the TM checkpoint. Then it is
necessary to update the vcpu's current reg content with the
checkpointed vals. When we rfid into the guest again, that vcpu
current reg content (now the checkpoint vals) will be loaded into the
regs.

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s_emulate.c | 76 +++
 1 file changed, 76 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 570339b..04c29e0 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -51,6 +51,8 @@
 
 #define OP_31_XOP_TBEGIN   654
 
+#define OP_31_XOP_TRECLAIM 942
+
 /* DCBZ is actually 1014, but we patch it to 1010 so we get a trap */
 #define OP_31_XOP_DCBZ 1010
 
@@ -130,6 +132,46 @@ static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu 
*vcpu)
vcpu->arch.vrsave = vcpu->arch.vrsave_tm;
 }
 
+static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, int ra_val)
+{
+   unsigned long guest_msr = kvmppc_get_msr(vcpu);
+   int fc_val = ra_val ? ra_val : 1;
+
+   /* CR0 = 0 | MSR[TS] | 0 */
+   vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
+   (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
+<< CR0_SHIFT);
+
+   preempt_disable();
+   kvmppc_save_tm_pr(vcpu);
+   kvmppc_copyfrom_vcpu_tm(vcpu);
+
+   tm_enable();
+   vcpu->arch.texasr = mfspr(SPRN_TEXASR);
+   /* failure recording depends on Failure Summary bit */
+   if (!(vcpu->arch.texasr & TEXASR_FS)) {
+   vcpu->arch.texasr &= ~TEXASR_FC;
+   vcpu->arch.texasr |= ((u64)fc_val << TEXASR_FC_LG);
+
+   vcpu->arch.texasr &= ~(TEXASR_PR | TEXASR_HV);
+   if (kvmppc_get_msr(vcpu) & MSR_PR)
+   vcpu->arch.texasr |= TEXASR_PR;
+
+   if (kvmppc_get_msr(vcpu) & MSR_HV)
+   vcpu->arch.texasr |= TEXASR_HV;
+
+   vcpu->arch.tfiar = kvmppc_get_pc(vcpu);
+   mtspr(SPRN_TEXASR, vcpu->arch.texasr);
+   mtspr(SPRN_TFIAR, vcpu->arch.tfiar);
+   }
+   tm_disable();
+   /*
+* treclaim need quit to non-transactional state.
+*/
+   guest_msr &= ~(MSR_TS_MASK);
+   kvmppc_set_msr(vcpu, guest_msr);
+   preempt_enable();
+}
 #endif
 
 int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
@@ -402,6 +444,40 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
emulated = EMULATE_FAIL;
break;
}
+   case OP_31_XOP_TRECLAIM:
+   {
+   ulong guest_msr = kvmppc_get_msr(vcpu);
+   unsigned long ra_val = 0;
+
+   if (!cpu_has_feature(CPU_FTR_TM))
+   break;
+
+   if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+   kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   /* generate interrupts based on priorities */
+   if (guest_msr & MSR_PR) {
+   /* Privileged Instruction type Program 
Interrupt */
+   kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   if (!MSR_TM_ACTIVE(guest_msr)) {
+   /* TM bad thing interrupt */
+   kvmppc_core_queue_program(vcpu, SRR1_PROGTM);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   if (ra)
+   ra_val = kvmppc_get_gpr(vcpu, ra);
+   kvmppc_emulate_treclaim(vcpu, ra_val);
+   break;
+   }
 #endif
default:
emulated = EMULATE_FAIL;
-- 
1.8.3.1



[PATCH v4 19/29] KVM: PPC: Book3S PR: enable NV reg restore for reading TM SPR at guest privilege state

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

Currently kvmppc_handle_fac() does not update NV GPRs and thus it can
return with RESUME_GUEST.

However, a PR KVM guest always disables the MSR_TM bit in privileged
state. If the PR privileged guest tries to read TM SPRs, it triggers a
TM facility unavailable exception and falls into kvmppc_handle_fac().
The emulation is then done by kvmppc_core_emulate_mfspr_pr(). The
mfspr instruction can use an NV reg as RT, so it is necessary to
restore the NV GPRs in this case to reflect the update to the NV RT.

This patch makes kvmppc_handle_fac() return RESUME_GUEST_NV for a TM
facility exception taken in guest privileged state.

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_pr.c | 15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 9becca1..9a72460 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -989,6 +989,18 @@ static int kvmppc_handle_fac(struct kvm_vcpu *vcpu, ulong 
fac)
break;
}
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   /* Since we disabled MSR_TM at privilege state, the mfspr instruction
+* for TM spr can trigger TM fac unavailable. In this case, the
+* emulation is handled by kvmppc_emulate_fac(), which invokes
+* kvmppc_emulate_mfspr() finally. But note the mfspr can include
+* RT for NV registers. So it need to restore those NV reg to reflect
+* the update.
+*/
+   if ((fac == FSCR_TM_LG) && !(kvmppc_get_msr(vcpu) & MSR_PR))
+   return RESUME_GUEST_NV;
+#endif
+
return RESUME_GUEST;
 }
 
@@ -1350,8 +1362,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
}
 #ifdef CONFIG_PPC_BOOK3S_64
case BOOK3S_INTERRUPT_FAC_UNAVAIL:
-   kvmppc_handle_fac(vcpu, vcpu->arch.shadow_fscr >> 56);
-   r = RESUME_GUEST;
+   r = kvmppc_handle_fac(vcpu, vcpu->arch.shadow_fscr >> 56);
break;
 #endif
case BOOK3S_INTERRUPT_MACHINE_CHECK:
-- 
1.8.3.1



[PATCH v4 18/29] KVM: PPC: Book3S PR: always fail transaction in guest privilege state

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

Currently the kernel doesn't use transactional memory.
And there is an issue for the privileged guest: the
tbegin/tsuspend/tresume/tabort TM instructions can change the MSR TM
bits without trapping into the PR host. So the following code leads to
a false mfmsr result:
tbegin  <- MSR bits updated to transaction active.
beq <- failover handler branch
mfmsr   <- still reads MSR bits from the magic page, with
transaction inactive.

It is not an issue for the non-privileged guest since its mfmsr is not
patched with the magic page and will always trap into the PR host.

This patch always fails a tbegin attempt for the privileged guest, so
that the above issue is prevented. It is benign since currently the
(guest) kernel doesn't initiate a transaction.

Test case:
https://github.com/justdoitqd/publicFiles/blob/master/test_tbegin_pr.c

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_book3s.h |  2 ++
 arch/powerpc/kvm/book3s_emulate.c | 40 +++
 arch/powerpc/kvm/book3s_pr.c  | 11 +-
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 43e8bb1..c1cea82 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -262,9 +262,11 @@ extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned 
long lpcr,
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu);
 void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu);
+void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu);
 #else
 static inline void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu) {}
 static inline void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
 #endif
 
 extern int kvm_irq_bypass;
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index c4e3ec6..570339b 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "book3s.h"
 #include 
 
@@ -48,6 +49,8 @@
 #define OP_31_XOP_EIOIO854
 #define OP_31_XOP_SLBMFEE  915
 
+#define OP_31_XOP_TBEGIN   654
+
 /* DCBZ is actually 1014, but we patch it to 1010 so we get a trap */
 #define OP_31_XOP_DCBZ 1010
 
@@ -363,6 +366,43 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
 
break;
}
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   case OP_31_XOP_TBEGIN:
+   {
+   if (!cpu_has_feature(CPU_FTR_TM))
+   break;
+
+   if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+   kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   if (!(kvmppc_get_msr(vcpu) & MSR_PR)) {
+   preempt_disable();
+   vcpu->arch.cr = (CR0_TBEGIN_FAILURE |
+ (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)));
+
+   vcpu->arch.texasr = (TEXASR_FS | TEXASR_EXACT |
+   (((u64)(TM_CAUSE_EMULATE | 
TM_CAUSE_PERSISTENT))
+<< TEXASR_FC_LG));
+
+   if ((inst >> 21) & 0x1)
+   vcpu->arch.texasr |= TEXASR_ROT;
+
+   if (kvmppc_get_msr(vcpu) & MSR_HV)
+   vcpu->arch.texasr |= TEXASR_HV;
+
+   vcpu->arch.tfhar = kvmppc_get_pc(vcpu) + 4;
+   vcpu->arch.tfiar = kvmppc_get_pc(vcpu);
+
+   kvmppc_restore_tm_sprs(vcpu);
+   preempt_enable();
+   } else
+   emulated = EMULATE_FAIL;
+   break;
+   }
+#endif
default:
emulated = EMULATE_FAIL;
}
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index e8e7f3a..9becca1 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -207,6 +207,15 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_PPC_BOOK3S_64
smsr |= MSR_ISF | MSR_HV;
 #endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   /*
+* in guest privileged state, we want to fail all TM transactions.
+* So disable MSR TM bit so that all tbegin. will be able to be
+* trapped into host.
+*/
+   if (!(guest_msr & MSR_PR))
+   smsr &= ~MSR_TM;
+#endif
vcpu->arch.shadow_msr = 

[PATCH v4 17/29] KVM: PPC: Book3S PR: make mtspr/mfspr emulation behavior based on active TM SPRs

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

The mfspr/mtspr instructions on TM SPRs (TEXASR/TFIAR/TFHAR) are
non-privileged and can be executed by the PR KVM guest in problem
state without trapping into the host. We only emulate mtspr/mfspr of
texasr/tfiar/tfhar in the guest PR=0 state.

When we emulate mtspr of TM SPRs in the guest PR=0 state, the
emulation result needs to be visible to the guest PR=1 state. That is,
the actual TM SPR val should be loaded into the actual registers.

We already flush the TM SPRs into the vcpu when switching out of the
CPU, and load the TM SPRs when switching back.

This patch corrects the mfspr()/mtspr() emulation for TM SPRs so that
the actual source/destination is the real TM SPRs.

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_book3s.h |  1 +
 arch/powerpc/kvm/book3s_emulate.c | 58 +--
 arch/powerpc/kvm/book3s_pr.c  |  2 +-
 3 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index fc15ad9..43e8bb1 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -210,6 +210,7 @@ extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
 extern void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
  unsigned int vec);
 extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
+extern void kvmppc_trigger_fac_interrupt(struct kvm_vcpu *vcpu, ulong fac);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
   bool upper, u32 val);
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index f81a921..c4e3ec6 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include "book3s.h"
+#include 
 
 #define OP_19_XOP_RFID 18
 #define OP_19_XOP_RFI  50
@@ -523,13 +524,38 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, 
int sprn, ulong spr_val)
break;
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
case SPRN_TFHAR:
-   vcpu->arch.tfhar = spr_val;
-   break;
case SPRN_TEXASR:
-   vcpu->arch.texasr = spr_val;
-   break;
case SPRN_TFIAR:
-   vcpu->arch.tfiar = spr_val;
+   if (!cpu_has_feature(CPU_FTR_TM))
+   break;
+
+   if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+   kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   if (MSR_TM_ACTIVE(kvmppc_get_msr(vcpu)) &&
+   !((MSR_TM_SUSPENDED(kvmppc_get_msr(vcpu))) &&
+   (sprn == SPRN_TFHAR))) {
+   /* it is illegal to mtspr() TM regs in
+* other than non-transactional state, with
+* the exception of TFHAR in suspend state.
+*/
+   kvmppc_core_queue_program(vcpu, SRR1_PROGTM);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   tm_enable();
+   if (sprn == SPRN_TFHAR)
+   mtspr(SPRN_TFHAR, spr_val);
+   else if (sprn == SPRN_TEXASR)
+   mtspr(SPRN_TEXASR, spr_val);
+   else
+   mtspr(SPRN_TFIAR, spr_val);
+   tm_disable();
+
break;
 #endif
 #endif
@@ -676,13 +702,25 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, 
int sprn, ulong *spr_val
break;
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
case SPRN_TFHAR:
-   *spr_val = vcpu->arch.tfhar;
-   break;
case SPRN_TEXASR:
-   *spr_val = vcpu->arch.texasr;
-   break;
case SPRN_TFIAR:
-   *spr_val = vcpu->arch.tfiar;
+   if (!cpu_has_feature(CPU_FTR_TM))
+   break;
+
+   if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+   kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   tm_enable();
+   if (sprn == SPRN_TFHAR)
+   *spr_val = mfspr(SPRN_TFHAR);
+   else if (sprn == SPRN_TEXASR)
+   *spr_val = mfspr(SPRN_TEXASR);
+   else if (sprn == SPRN_TFIAR)
+   *spr_val = mfspr(SPRN_TFIAR);
+   tm_disable();
break;
 #endif
 #endif
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 4b81b3c..e8e7f3a 100644
--- 

[PATCH v4 16/29] KVM: PPC: Book3S PR: add math support for PR KVM HTM

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

The math registers will be saved into vcpu->arch.fp/vr and the
corresponding vcpu->arch.fp_tm/vr_tm areas.

We flush or give up the math regs into vcpu->arch.fp/vr before saving
the transaction. After the transaction is restored, the math regs will
be loaded back into the registers.

If there is an FP/VEC/VSX unavailable exception while the transaction
is active, the math checkpoint content might be incorrect and we would
need a treclaim. / load-correct-checkpoint-val / trechkpt. sequence to
retry the transaction, which would make our solution complicated. To
solve this issue, we always keep the hardware guest MSR math bits
(shadow_msr) consistent with the MSR val the guest sees
(kvmppc_get_msr()) when the guest MSR has TM enabled. Then all
FP/VEC/VSX unavailable exceptions can be delivered to the guest, and
the guest handles them by itself.

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s_pr.c | 35 +++
 1 file changed, 35 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 226bae7..4b81b3c 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -308,6 +308,28 @@ static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu 
*vcpu)
tm_disable();
 }
 
+/* loadup math bits which is enabled at kvmppc_get_msr() but not enabled at
+ * hardware.
+ */
+static void kvmppc_handle_lost_math_exts(struct kvm_vcpu *vcpu)
+{
+   ulong exit_nr;
+   ulong ext_diff = (kvmppc_get_msr(vcpu) & ~vcpu->arch.guest_owned_ext) &
+   (MSR_FP | MSR_VEC | MSR_VSX);
+
+   if (!ext_diff)
+   return;
+
+   if (ext_diff == MSR_FP)
+   exit_nr = BOOK3S_INTERRUPT_FP_UNAVAIL;
+   else if (ext_diff == MSR_VEC)
+   exit_nr = BOOK3S_INTERRUPT_ALTIVEC;
+   else
+   exit_nr = BOOK3S_INTERRUPT_VSX;
+
+   kvmppc_handle_ext(vcpu, exit_nr, ext_diff);
+}
+
 void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu)
 {
if (!(MSR_TM_ACTIVE(kvmppc_get_msr(vcpu {
@@ -315,6 +337,8 @@ void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu)
return;
}
 
+   kvmppc_giveup_ext(vcpu, MSR_VSX);
+
preempt_disable();
_kvmppc_save_tm_pr(vcpu, mfmsr());
preempt_enable();
@@ -324,12 +348,18 @@ void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu)
 {
if (!MSR_TM_ACTIVE(kvmppc_get_msr(vcpu))) {
kvmppc_restore_tm_sprs(vcpu);
+   if (kvmppc_get_msr(vcpu) & MSR_TM)
+   kvmppc_handle_lost_math_exts(vcpu);
return;
}
 
preempt_disable();
_kvmppc_restore_tm_pr(vcpu, kvmppc_get_msr(vcpu));
preempt_enable();
+
+   if (kvmppc_get_msr(vcpu) & MSR_TM)
+   kvmppc_handle_lost_math_exts(vcpu);
+
 }
 #endif
 
@@ -468,6 +498,11 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 
msr)
/* Preload FPU if it's enabled */
if (kvmppc_get_msr(vcpu) & MSR_FP)
kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   if (kvmppc_get_msr(vcpu) & MSR_TM)
+   kvmppc_handle_lost_math_exts(vcpu);
+#endif
 }
 
 void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
-- 
1.8.3.1



[PATCH v4 15/29] KVM: PPC: Book3S PR: add transaction memory save/restore skeleton for PR KVM

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

The transactional memory checkpoint area save/restore is triggered when
the vcpu's QEMU process is switched out of/into the CPU, i.e. at
kvmppc_core_vcpu_put_pr() and kvmppc_core_vcpu_load_pr().

The MSR TM active state is determined by the TS bits:
active: 10 (transactional) or 01 (suspended)
inactive: 00 (non-transactional)
We don't "fake" TM functionality for the guest. We "sync" the guest
virtual MSR TM active state (10 or 01) with the shadow MSR. That is to
say, we don't emulate a transactional guest with a TM inactive MSR.

TM SPR support (TFHAR/TFIAR/TEXASR) was already added by
commit 9916d57e64a4 ("KVM: PPC: Book3S PR: Expose TM registers").
Math register support (FPR/VMX/VSX) will be done in a subsequent
patch.

Whether the TM context needs to be saved/restored is determined by the
kvmppc_get_msr() TM active state:
* TM active - save/restore the full TM context
* TM inactive - no need to do so; only save/restore the
TM SPRs.
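For reference, the TS-bit test this relies on, expressed in C (a sketch;
the definitions follow arch/powerpc/include/asm/reg.h and are repeated
here only for illustration):

  /*
   * MSR[TS] decoding: 0b10 = transactional, 0b01 = suspended,
   * 0b00 = non-transactional. "TM active" means TS != 0b00, and only
   * then does the checkpoint area need a full save/restore; otherwise
   * only the TM SPRs are saved/restored.
   */
  #define MSR_TS_T	(1ULL << 34)	/* transactional */
  #define MSR_TS_S	(1ULL << 33)	/* suspended */
  #define MSR_TS_MASK	(MSR_TS_T | MSR_TS_S)
  #define MSR_TM_ACTIVE(x)	(((x) & MSR_TS_MASK) != 0)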

Signed-off-by: Simon Guo 
Suggested-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_book3s.h |  9 +
 arch/powerpc/include/asm/kvm_host.h   |  1 -
 arch/powerpc/kvm/book3s_pr.c  | 27 +++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 20d3d5a..fc15ad9 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -257,6 +257,15 @@ extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned 
long lpcr,
 extern int kvmppc_hcall_impl_hv_realmode(unsigned long cmd);
 extern void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu);
 extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu);
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu);
+void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu);
+#else
+static inline void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu) {}
+#endif
+
 extern int kvm_irq_bypass;
 
 static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 89f44ec..60325af 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -621,7 +621,6 @@ struct kvm_vcpu_arch {
 
struct thread_vr_state vr_tm;
u32 vrsave_tm; /* also USPRG0 */
-
 #endif
 
 #ifdef CONFIG_KVM_EXIT_TIMING
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 7d4905a..226bae7 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -43,6 +43,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "book3s.h"
 
@@ -115,6 +116,8 @@ static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, 
int cpu)
 
if (kvmppc_is_split_real(vcpu))
kvmppc_fixup_split_real(vcpu);
+
+   kvmppc_restore_tm_pr(vcpu);
 }
 
 static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu)
@@ -134,6 +137,7 @@ static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu)
 
kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
+   kvmppc_save_tm_pr(vcpu);
 
/* Enable AIL if supported */
if (cpu_has_feature(CPU_FTR_HVMODE) &&
@@ -304,6 +308,29 @@ static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu 
*vcpu)
tm_disable();
 }
 
+void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu)
+{
+   if (!(MSR_TM_ACTIVE(kvmppc_get_msr(vcpu {
+   kvmppc_save_tm_sprs(vcpu);
+   return;
+   }
+
+   preempt_disable();
+   _kvmppc_save_tm_pr(vcpu, mfmsr());
+   preempt_enable();
+}
+
+void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu)
+{
+   if (!MSR_TM_ACTIVE(kvmppc_get_msr(vcpu))) {
+   kvmppc_restore_tm_sprs(vcpu);
+   return;
+   }
+
+   preempt_disable();
+   _kvmppc_restore_tm_pr(vcpu, kvmppc_get_msr(vcpu));
+   preempt_enable();
+}
 #endif
 
 static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)
-- 
1.8.3.1



[PATCH v4 14/29] KVM: PPC: Book3S PR: add kvmppc_save/restore_tm_sprs() APIs

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

This patch adds 2 new APIs kvmppc_save_tm_sprs()/kvmppc_restore_tm_sprs()
for the purpose of TEXASR/TFIAR/TFHAR save/restore.

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_pr.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index f2ae5a3..7d4905a 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -42,6 +42,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "book3s.h"
 
@@ -284,6 +285,27 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
svcpu_put(svcpu);
 }
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu)
+{
+   tm_enable();
+   vcpu->arch.tfhar = mfspr(SPRN_TFHAR);
+   vcpu->arch.texasr = mfspr(SPRN_TEXASR);
+   vcpu->arch.tfiar = mfspr(SPRN_TFIAR);
+   tm_disable();
+}
+
+static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu)
+{
+   tm_enable();
+   mtspr(SPRN_TFHAR, vcpu->arch.tfhar);
+   mtspr(SPRN_TEXASR, vcpu->arch.texasr);
+   mtspr(SPRN_TFIAR, vcpu->arch.tfiar);
+   tm_disable();
+}
+
+#endif
+
 static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)
 {
int r = 1; /* Indicate we want to get back into the guest */
-- 
1.8.3.1



[PATCH v4 13/29] KVM: PPC: Book3S PR: adds new kvmppc_copyto_vcpu_tm/kvmppc_copyfrom_vcpu_tm API for PR KVM.

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

This patch adds 2 new APIs: kvmppc_copyto_vcpu_tm() and
kvmppc_copyfrom_vcpu_tm().  These 2 APIs will be used to copy TM data
between the VCPU_TM and VCPU areas.

PR KVM will use these APIs for treclaim. or trechkpt. emulation.

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s_emulate.c | 41 +++
 1 file changed, 41 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 2eb457b..f81a921 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -87,6 +87,47 @@ static bool spr_allowed(struct kvm_vcpu *vcpu, enum 
priv_level level)
return true;
 }
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static inline void kvmppc_copyto_vcpu_tm(struct kvm_vcpu *vcpu)
+{
+   memcpy(>arch.gpr_tm[0], >arch.regs.gpr[0],
+   sizeof(vcpu->arch.gpr_tm));
+   memcpy(>arch.fp_tm, >arch.fp,
+   sizeof(struct thread_fp_state));
+   memcpy(>arch.vr_tm, >arch.vr,
+   sizeof(struct thread_vr_state));
+   vcpu->arch.ppr_tm = vcpu->arch.ppr;
+   vcpu->arch.dscr_tm = vcpu->arch.dscr;
+   vcpu->arch.amr_tm = vcpu->arch.amr;
+   vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
+   vcpu->arch.tar_tm = vcpu->arch.tar;
+   vcpu->arch.lr_tm = vcpu->arch.regs.link;
+   vcpu->arch.cr_tm = vcpu->arch.cr;
+   vcpu->arch.xer_tm = vcpu->arch.regs.xer;
+   vcpu->arch.vrsave_tm = vcpu->arch.vrsave;
+}
+
+static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu *vcpu)
+{
+   memcpy(>arch.regs.gpr[0], >arch.gpr_tm[0],
+   sizeof(vcpu->arch.regs.gpr));
+   memcpy(>arch.fp, >arch.fp_tm,
+   sizeof(struct thread_fp_state));
+   memcpy(>arch.vr, >arch.vr_tm,
+   sizeof(struct thread_vr_state));
+   vcpu->arch.ppr = vcpu->arch.ppr_tm;
+   vcpu->arch.dscr = vcpu->arch.dscr_tm;
+   vcpu->arch.amr = vcpu->arch.amr_tm;
+   vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
+   vcpu->arch.tar = vcpu->arch.tar_tm;
+   vcpu->arch.regs.link = vcpu->arch.lr_tm;
+   vcpu->arch.cr = vcpu->arch.cr_tm;
+   vcpu->arch.regs.xer = vcpu->arch.xer_tm;
+   vcpu->arch.vrsave = vcpu->arch.vrsave_tm;
+}
+
+#endif
+
 int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
  unsigned int inst, int *advance)
 {
-- 
1.8.3.1



[PATCH v4 12/29] KVM: PPC: Book3S PR: prevent TS bits change in kvmppc_interrupt_pr()

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

A PR KVM host is usually run with TM enabled in its host MSR value, and
with a non-transactional TS value.

When a guest with TM active traps into the PR KVM host, the rfid at the
tail of kvmppc_interrupt_pr() will try to switch the TS bits from
S0 (Suspended & TM disabled) to N1 (Non-transactional & TM enabled).

That leads to a TM Bad Thing interrupt.

This patch manually keeps the target TS bits unchanged to avoid this
exception.
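In C terms, the fix amounts to copying the current TS field into the MSR
that rfid will install (a sketch only; preserve_ts() is a hypothetical
helper, MSR_TS_MASK is the kernel's, and the patch implements this with
the rldicl/rldimi pair below):

  static inline unsigned long preserve_ts(unsigned long target_msr,
  					unsigned long cur_msr)
  {
  	/* keep MSR[TS] unchanged across the rfid */
  	return (target_msr & ~MSR_TS_MASK) | (cur_msr & MSR_TS_MASK);
  }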

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_segment.S | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_segment.S 
b/arch/powerpc/kvm/book3s_segment.S
index 93a180c..98ccc7e 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -383,6 +383,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 */
 
PPC_LL  r6, HSTATE_HOST_MSR(r13)
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   /*
+* We don't want to change MSR[TS] bits via rfi here.
+* The actual TM handling logic will be in host with
+* recovered DR/IR bits after HSTATE_VMHANDLER.
+* And MSR_TM can be enabled in HOST_MSR so rfid may
+* not suppress this change and can lead to exception.
+* Manually set MSR to prevent TS state change here.
+*/
+   mfmsr   r7
+   rldicl  r7, r7, 64 - MSR_TS_S_LG, 62
+   rldimi  r6, r7, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+#endif
PPC_LL  r8, HSTATE_VMHANDLER(r13)
 
 #ifdef CONFIG_PPC64
-- 
1.8.3.1



[PATCH v4 11/29] KVM: PPC: Book3S PR: implement RFID TM behavior to suppress change from S0 to N0

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

According to the ISA specification for RFID, when the MSR is in the TM
disabled and TS suspended state (S0), and the target MSR is TM disabled
with an inactive TS state (N0), rfid should suppress this update.

This patch makes the RFID emulation of PR KVM consistent with this.

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_emulate.c | 21 +++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 68d6898..2eb457b 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -117,11 +117,28 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
case 19:
switch (get_xop(inst)) {
case OP_19_XOP_RFID:
-   case OP_19_XOP_RFI:
+   case OP_19_XOP_RFI: {
+   unsigned long srr1 = kvmppc_get_srr1(vcpu);
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   unsigned long cur_msr = kvmppc_get_msr(vcpu);
+
+   /*
+* add rules to fit in ISA specification regarding TM
+* state transistion in TM disable/Suspended state,
+* and target TM state is TM inactive(00) state. (the
+* change should be suppressed).
+*/
+   if (((cur_msr & MSR_TM) == 0) &&
+   ((srr1 & MSR_TM) == 0) &&
+   MSR_TM_SUSPENDED(cur_msr) &&
+   !MSR_TM_ACTIVE(srr1))
+   srr1 |= MSR_TS_S;
+#endif
kvmppc_set_pc(vcpu, kvmppc_get_srr0(vcpu));
-   kvmppc_set_msr(vcpu, kvmppc_get_srr1(vcpu));
+   kvmppc_set_msr(vcpu, srr1);
*advance = 0;
break;
+   }
 
default:
emulated = EMULATE_FAIL;
-- 
1.8.3.1



[PATCH v4 10/29] KVM: PPC: Book3S PR: Sync TM bits to shadow msr for problem state guest

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

The MSR TS bits can be modified with non-privileged instructions like
tbegin./tend.  That means the guest can change the MSR value "silently",
without notifying the host.

It is necessary to sync the TM bits to the host so that the host can
calculate the shadow MSR correctly.

Note that a privileged guest will always fail transactions, so we only
take care of problem state guests.

The logic is put into kvmppc_copy_from_svcpu() so that
kvmppc_handle_exit_pr() can use the correct MSR TM bits even under
preemption.

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s_pr.c | 73 ++--
 1 file changed, 50 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index d3237f5..f2ae5a3 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -182,10 +182,36 @@ void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu)
svcpu_put(svcpu);
 }
 
+static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
+{
+   ulong guest_msr = kvmppc_get_msr(vcpu);
+   ulong smsr = guest_msr;
+
+   /* Guest MSR values */
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE |
+   MSR_TM | MSR_TS_MASK;
+#else
+   smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE;
+#endif
+   /* Process MSR values */
+   smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
+   /* External providers the guest reserved */
+   smsr |= (guest_msr & vcpu->arch.guest_owned_ext);
+   /* 64-bit Process MSR values */
+#ifdef CONFIG_PPC_BOOK3S_64
+   smsr |= MSR_ISF | MSR_HV;
+#endif
+   vcpu->arch.shadow_msr = smsr;
+}
+
 /* Copy data touched by real-mode code from shadow vcpu back to vcpu */
 void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
 {
struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   ulong old_msr;
+#endif
 
/*
 * Maybe we were already preempted and synced the svcpu from
@@ -228,6 +254,30 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
to_book3s(vcpu)->vtb += get_vtb() - vcpu->arch.entry_vtb;
if (cpu_has_feature(CPU_FTR_ARCH_207S))
vcpu->arch.ic += mfspr(SPRN_IC) - vcpu->arch.entry_ic;
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   /*
+* Unlike other MSR bits, MSR[TS]bits can be changed at guest without
+* notifying host:
+*  modified by unprivileged instructions like "tbegin"/"tend"/
+* "tresume"/"tsuspend" in PR KVM guest.
+*
+* It is necessary to sync here to calculate a correct shadow_msr.
+*
+* privileged guest's tbegin will be failed at present. So we
+* only take care of problem state guest.
+*/
+   old_msr = kvmppc_get_msr(vcpu);
+   if (unlikely((old_msr & MSR_PR) &&
+   (vcpu->arch.shadow_srr1 & (MSR_TS_MASK)) !=
+   (old_msr & (MSR_TS_MASK {
+   old_msr &= ~(MSR_TS_MASK);
+   old_msr |= (vcpu->arch.shadow_srr1 & (MSR_TS_MASK));
+   kvmppc_set_msr_fast(vcpu, old_msr);
+   kvmppc_recalc_shadow_msr(vcpu);
+   }
+#endif
+
svcpu->in_use = false;
 
 out:
@@ -306,29 +356,6 @@ static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned 
long hva, pte_t pte)
 
 /*/
 
-static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
-{
-   ulong guest_msr = kvmppc_get_msr(vcpu);
-   ulong smsr = guest_msr;
-
-   /* Guest MSR values */
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-   smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE |
-   MSR_TM | MSR_TS_MASK;
-#else
-   smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE;
-#endif
-   /* Process MSR values */
-   smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
-   /* External providers the guest reserved */
-   smsr |= (guest_msr & vcpu->arch.guest_owned_ext);
-   /* 64-bit Process MSR values */
-#ifdef CONFIG_PPC_BOOK3S_64
-   smsr |= MSR_ISF | MSR_HV;
-#endif
-   vcpu->arch.shadow_msr = smsr;
-}
-
 static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
 {
ulong old_msr = kvmppc_get_msr(vcpu);
-- 
1.8.3.1



[PATCH v4 09/29] KVM: PPC: Book3S PR: PR KVM pass through MSR TM/TS bits to shadow_msr.

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

PowerPC TM functionality needs MSR TM/TS bit support at the hardware
level. Guest TM functionality cannot be emulated with "fake" MSR (the msr
in the magic page) TS bits.

This patch syncs the TM/TS bits in shadow_msr with the MSR value in the
magic page, so that the MSR TS value which the guest sees is consistent
with the actual MSR bits running in the guest.

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_pr.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 67061d3..d3237f5 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -312,7 +312,12 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
ulong smsr = guest_msr;
 
/* Guest MSR values */
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE |
+   MSR_TM | MSR_TS_MASK;
+#else
smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE;
+#endif
/* Process MSR values */
smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
/* External providers the guest reserved */
-- 
1.8.3.1



[PATCH v4 08/29] KVM: PPC: Book3S PR: In PR KVM suspends Transactional state when inject an interrupt.

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

This patch simulates the interrupt behavior per the Power ISA while
injecting an interrupt in PR KVM:
- When an interrupt happens, the transactional state should be suspended.

kvmppc_mmu_book3s_64_reset_msr() will be invoked when injecting an
interrupt. This patch performs this ISA logic in
kvmppc_mmu_book3s_64_reset_msr().

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_64_mmu.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index a93d719..cf9d686 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -38,7 +38,16 @@
 
 static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
 {
-   kvmppc_set_msr(vcpu, vcpu->arch.intr_msr);
+   unsigned long msr = vcpu->arch.intr_msr;
+   unsigned long cur_msr = kvmppc_get_msr(vcpu);
+
+   /* If transactional, change to suspend mode on IRQ delivery */
+   if (MSR_TM_TRANSACTIONAL(cur_msr))
+   msr |= MSR_TS_S;
+   else
+   msr |= cur_msr & MSR_TS_MASK;
+
+   kvmppc_set_msr(vcpu, msr);
 }
 
 static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
-- 
1.8.3.1



[PATCH v4 07/29] KVM: PPC: Book3S PR: add C function wrapper for _kvmppc_save/restore_tm()

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

Currently the kvmppc_save_tm()/kvmppc_restore_tm() APIs can only be
invoked from assembly code. This patch adds C function wrappers for them
so that they can be safely called from C code.
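A sketch of the intended calling pattern from C (the real call sites are
added by a later patch in this series; preemption is disabled because the
wrappers manipulate live TM and math state):

  preempt_disable();
  _kvmppc_save_tm_pr(vcpu, mfmsr());		/* guest_msr: current MSR */
  preempt_enable();

  ...

  preempt_disable();
  _kvmppc_restore_tm_pr(vcpu, kvmppc_get_msr(vcpu));
  preempt_enable();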

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/asm-prototypes.h |  6 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  8 +--
 arch/powerpc/kvm/tm.S | 94 ++-
 3 files changed, 102 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index dfdcb23..5da683b 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -141,7 +141,13 @@ unsigned long __init prom_init(unsigned long r3, unsigned 
long r4,
 void pnv_power9_force_smt4_catch(void);
 void pnv_power9_force_smt4_release(void);
 
+/* Transaction memory related */
 void tm_enable(void);
 void tm_disable(void);
 void tm_abort(uint8_t cause);
+
+struct kvm_vcpu;
+void _kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu, u64 guest_msr);
+void _kvmppc_save_tm_pr(struct kvm_vcpu *vcpu, u64 guest_msr);
+
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 6445d29..980df5f 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -795,7 +795,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 */
mr  r3, r4
ld  r4, VCPU_MSR(r3)
-   bl  kvmppc_restore_tm
+   bl  __kvmppc_restore_tm
ld  r4, HSTATE_KVM_VCPU(r13)
 91:
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
@@ -1783,7 +1783,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 */
mr  r3, r9
ld  r4, VCPU_MSR(r3)
-   bl  kvmppc_save_tm
+   bl  __kvmppc_save_tm
ld  r9, HSTATE_KVM_VCPU(r13)
 91:
 #endif
@@ -2689,7 +2689,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 */
ld  r3, HSTATE_KVM_VCPU(r13)
ld  r4, VCPU_MSR(r3)
-   bl  kvmppc_save_tm
+   bl  __kvmppc_save_tm
 91:
 #endif
 
@@ -2809,7 +2809,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 */
mr  r3, r4
ld  r4, VCPU_MSR(r3)
-   bl  kvmppc_restore_tm
+   bl  __kvmppc_restore_tm
ld  r4, HSTATE_KVM_VCPU(r13)
 91:
 #endif
diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S
index b7057d5..42a7cd8 100644
--- a/arch/powerpc/kvm/tm.S
+++ b/arch/powerpc/kvm/tm.S
@@ -33,7 +33,7 @@
  * This can modify all checkpointed registers, but
  * restores r1, r2 before exit.
  */
-_GLOBAL(kvmppc_save_tm)
+_GLOBAL(__kvmppc_save_tm)
mflrr0
std r0, PPC_LR_STKOFF(r1)
stdur1, -PPC_MIN_STKFRM(r1)
@@ -210,6 +210,52 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
blr
 
 /*
+ * _kvmppc_save_tm_pr() is a wrapper around __kvmppc_save_tm(), so that it can
+ * be invoked from C function by PR KVM only.
+ */
+_GLOBAL(_kvmppc_save_tm_pr)
+   mflrr5
+   std r5, PPC_LR_STKOFF(r1)
+   stdur1, -SWITCH_FRAME_SIZE(r1)
+   SAVE_NVGPRS(r1)
+
+   /* save MSR since TM/math bits might be impacted
+* by __kvmppc_save_tm().
+*/
+   mfmsr   r5
+   SAVE_GPR(5, r1)
+
+   /* also save DSCR/CR so that it can be recovered later */
+   mfspr   r6, SPRN_DSCR
+   SAVE_GPR(6, r1)
+
+   mfcrr7
+   stw r7, _CCR(r1)
+
+   bl  __kvmppc_save_tm
+
+   ld  r7, _CCR(r1)
+   mtcrr7
+
+   REST_GPR(6, r1)
+   mtspr   SPRN_DSCR, r6
+
+   /* need preserve current MSR's MSR_TS bits */
+   REST_GPR(5, r1)
+   mfmsr   r6
+   rldicl  r6, r6, 64 - MSR_TS_S_LG, 62
+   rldimi  r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+   mtmsrd  r5
+
+   REST_NVGPRS(r1)
+   addir1, r1, SWITCH_FRAME_SIZE
+   ld  r5, PPC_LR_STKOFF(r1)
+   mtlrr5
+   blr
+
+EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
+
+/*
  * Restore transactional state and TM-related registers.
  * Called with:
  *  - r3 pointing to the vcpu struct.
@@ -219,7 +265,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
  * This potentially modifies all checkpointed registers.
  * It restores r1, r2 from the PACA.
  */
-_GLOBAL(kvmppc_restore_tm)
+_GLOBAL(__kvmppc_restore_tm)
mflrr0
std r0, PPC_LR_STKOFF(r1)
 
@@ -362,4 +408,48 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
addir1, r1, PPC_MIN_STKFRM
b   9b
 #endif
+
+/*
+ * _kvmppc_restore_tm_pr() is a wrapper around __kvmppc_restore_tm(), so that 
it
+ * can be invoked from C function by PR KVM only.
+ */
+_GLOBAL(_kvmppc_restore_tm_pr)
+   mflrr5
+   std r5, PPC_LR_STKOFF(r1)
+   stdur1, -SWITCH_FRAME_SIZE(r1)
+   SAVE_NVGPRS(r1)
+
+  

[PATCH v4 06/29] KVM: PPC: Book3S PR: turn on FP/VSX/VMX MSR bits in kvmppc_save_tm()

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

kvmppc_save_tm() invokes store_fp_state()/store_vr_state(). So it is
mandatory to turn on the FP/VSX/VMX MSR bits for its execution, just as
kvmppc_restore_tm() already does.

Previously HV KVM turned these bits on outside of kvmppc_save_tm().
Now we include this bit change in kvmppc_save_tm() so that the logic is
cleaner. And PR KVM can reuse it later.

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kvm/tm.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S
index cbe608a..b7057d5 100644
--- a/arch/powerpc/kvm/tm.S
+++ b/arch/powerpc/kvm/tm.S
@@ -42,6 +42,8 @@ _GLOBAL(kvmppc_save_tm)
mfmsr   r8
li  r0, 1
rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+   ori r8, r8, MSR_FP
+   orisr8, r8, (MSR_VEC | MSR_VSX)@h
mtmsrd  r8
 
rldicl. r4, r4, 64 - MSR_TS_S_LG, 62
-- 
1.8.3.1



[PATCH v4 05/29] KVM: PPC: Book3S PR: add new parameter (guest MSR) for kvmppc_save_tm()/kvmppc_restore_tm()

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

HV KVM and PR KVM need different MSR sources to indicate whether
treclaim. or trechkpt. is necessary.

This patch adds a new parameter (guest MSR) to the kvmppc_save_tm()/
kvmppc_restore_tm() APIs:
- For HV KVM, it is VCPU_MSR
- For PR KVM, it is the current host MSR or VCPU_SHADOW_SRR1

This enhancement enables these 2 APIs to be reused by PR KVM later.
And the patch keeps the HV KVM logic unchanged.

This patch also reworks kvmppc_save_tm()/kvmppc_restore_tm() to
have a clean ABI: r3 for the vcpu and r4 for the guest_msr.

During kvmppc_save_tm()/kvmppc_restore_tm(), R1 needs to be saved
or restored. Currently R1 is saved into HSTATE_HOST_R1. In PR
KVM, we are going to add a C function wrapper for
kvmppc_save_tm()/kvmppc_restore_tm() where R1 will be adjusted by an
added stack frame and saved into HSTATE_HOST_R1. There are several
places in HV KVM that load HSTATE_HOST_R1 into R1, and we don't want the
TM code to bring risk or confusion there.

This patch therefore uses HSTATE_SCRATCH2 to save/restore R1 in
kvmppc_save_tm()/kvmppc_restore_tm() to avoid future confusion, since
the R1 here is really just a temporary/scratch value being saved and
restored.
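Expressed as C prototypes, the reworked ABI looks like this (illustration
only; these remain assembly entry points, not C functions):

  /* r3 -> vcpu, r4 -> guest MSR whose TS bits decide whether a
   * treclaim./trechkpt. is actually needed. */
  void kvmppc_save_tm(struct kvm_vcpu *vcpu, u64 guest_msr);
  void kvmppc_restore_tm(struct kvm_vcpu *vcpu, u64 guest_msr);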

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 13 +-
 arch/powerpc/kvm/tm.S   | 74 -
 2 files changed, 49 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 4db2b10..6445d29 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -793,8 +793,12 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
 */
+   mr  r3, r4
+   ld  r4, VCPU_MSR(r3)
bl  kvmppc_restore_tm
+   ld  r4, HSTATE_KVM_VCPU(r13)
 91:
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
 
/* Load guest PMU registers */
@@ -1777,7 +1781,10 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
 */
+   mr  r3, r9
+   ld  r4, VCPU_MSR(r3)
bl  kvmppc_save_tm
+   ld  r9, HSTATE_KVM_VCPU(r13)
 91:
 #endif
 
@@ -2680,7 +2687,8 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
 */
-   ld  r9, HSTATE_KVM_VCPU(r13)
+   ld  r3, HSTATE_KVM_VCPU(r13)
+   ld  r4, VCPU_MSR(r3)
bl  kvmppc_save_tm
 91:
 #endif
@@ -2799,7 +2807,10 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
 */
+   mr  r3, r4
+   ld  r4, VCPU_MSR(r3)
bl  kvmppc_restore_tm
+   ld  r4, HSTATE_KVM_VCPU(r13)
 91:
 #endif
 
diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S
index e79b373..cbe608a 100644
--- a/arch/powerpc/kvm/tm.S
+++ b/arch/powerpc/kvm/tm.S
@@ -26,9 +26,12 @@
 
 /*
  * Save transactional state and TM-related registers.
- * Called with r9 pointing to the vcpu struct.
+ * Called with:
+ * - r3 pointing to the vcpu struct
+ * - r4 points to the MSR with current TS bits:
+ * (For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR).
  * This can modify all checkpointed registers, but
- * restores r1, r2 and r9 (vcpu pointer) before exit.
+ * restores r1, r2 before exit.
  */
 _GLOBAL(kvmppc_save_tm)
mflrr0
@@ -41,14 +44,11 @@ _GLOBAL(kvmppc_save_tm)
rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
mtmsrd  r8
 
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-   ld  r5, VCPU_MSR(r9)
-   rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+   rldicl. r4, r4, 64 - MSR_TS_S_LG, 62
beq 1f  /* TM not active in guest. */
-#endif
 
-   std r1, HSTATE_HOST_R1(r13)
-   li  r3, TM_CAUSE_KVM_RESCHED
+   std r1, HSTATE_SCRATCH2(r13)
+   std r3, HSTATE_SCRATCH1(r13)
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 BEGIN_FTR_SECTION
@@ -65,7 +65,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, 
CPU_FTR_P9_TM_XER_SO_BUG, 96)
 3:
/* Emulation of the treclaim instruction needs TEXASR before treclaim */
mfspr   r6, SPRN_TEXASR
-   std r6, VCPU_ORIG_TEXASR(r9)
+   std r6, VCPU_ORIG_TEXASR(r3)
 6:
 END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 #endif
@@ -74,6 +74,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
li  r5, 0
mtmsrd  r5, 1
 
+   li  r3, TM_CAUSE_KVM_RESCHED
+
/* All GPRs are volatile at this point. */
TRECLAIM(R3)
 
@@ -94,7 +96,7 @@ BEGIN_FTR_SECTION
 * we already have it), therefore we can now use any volatile GPR.
 */
/* Reload stack pointer and TOC. */
-   ld  r1, HSTATE_HOST_R1(r13)
+

[PATCH v4 04/29] KVM: PPC: Book3S PR: Move kvmppc_save_tm/kvmppc_restore_tm to separate file

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

This is a simple patch that just moves the kvmppc_save_tm()/
kvmppc_restore_tm() functionality to tm.S. There is no logic change. The
restructuring of those APIs will be done in later patches to improve
readability.

It is in preparation for reusing those APIs in both HV and PR PPC KVM.

Slight changes made while moving the functions include:
- surround some HV KVM specific code with CONFIG_KVM_BOOK3S_HV_POSSIBLE
for compilation.
- use _GLOBAL() to define kvmppc_save_tm()/kvmppc_restore_tm()

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/Makefile   |   3 +
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 322 
 arch/powerpc/kvm/tm.S   | 363 
 3 files changed, 366 insertions(+), 322 deletions(-)
 create mode 100644 arch/powerpc/kvm/tm.S

diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 4b19da8..f872c04 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -63,6 +63,9 @@ kvm-pr-y := \
book3s_64_mmu.o \
book3s_32_mmu.o
 
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
+   tm.o
+
 ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
book3s_rmhandlers.o
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 5e6e493..4db2b10 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -39,8 +39,6 @@ BEGIN_FTR_SECTION;\
extsw   reg, reg;   \
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 
-#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
-
 /* Values in HSTATE_NAPPING(r13) */
 #define NAPPING_CEDE   1
 #define NAPPING_NOVCPU 2
@@ -3119,326 +3117,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
mr  r4,r31
blr
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-/*
- * Save transactional state and TM-related registers.
- * Called with r9 pointing to the vcpu struct.
- * This can modify all checkpointed registers, but
- * restores r1, r2 and r9 (vcpu pointer) before exit.
- */
-kvmppc_save_tm:
-   mflrr0
-   std r0, PPC_LR_STKOFF(r1)
-   stdur1, -PPC_MIN_STKFRM(r1)
-
-   /* Turn on TM. */
-   mfmsr   r8
-   li  r0, 1
-   rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
-   mtmsrd  r8
-
-   ld  r5, VCPU_MSR(r9)
-   rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-   beq 1f  /* TM not active in guest. */
-
-   std r1, HSTATE_HOST_R1(r13)
-   li  r3, TM_CAUSE_KVM_RESCHED
-
-BEGIN_FTR_SECTION
-   lbz r0, HSTATE_FAKE_SUSPEND(r13) /* Were we fake suspended? */
-   cmpwi   r0, 0
-   beq 3f
-   rldicl. r8, r8, 64 - MSR_TS_S_LG, 62 /* Did we actually hrfid? */
-   beq 4f
-BEGIN_FTR_SECTION_NESTED(96)
-   bl  pnv_power9_force_smt4_catch
-END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
-   nop
-   b   6f
-3:
-   /* Emulation of the treclaim instruction needs TEXASR before treclaim */
-   mfspr   r6, SPRN_TEXASR
-   std r6, VCPU_ORIG_TEXASR(r9)
-6:
-END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
-
-   /* Clear the MSR RI since r1, r13 are all going to be foobar. */
-   li  r5, 0
-   mtmsrd  r5, 1
-
-   /* All GPRs are volatile at this point. */
-   TRECLAIM(R3)
-
-   /* Temporarily store r13 and r9 so we have some regs to play with */
-   SET_SCRATCH0(r13)
-   GET_PACA(r13)
-   std r9, PACATMSCRATCH(r13)
-
-   /* If doing TM emulation on POWER9 DD2.2, check for fake suspend mode */
-BEGIN_FTR_SECTION
-   lbz r9, HSTATE_FAKE_SUSPEND(r13)
-   cmpwi   r9, 0
-   beq 2f
-   /*
-* We were in fake suspend, so we are not going to save the
-* register state as the guest checkpointed state (since
-* we already have it), therefore we can now use any volatile GPR.
-*/
-   /* Reload stack pointer and TOC. */
-   ld  r1, HSTATE_HOST_R1(r13)
-   ld  r2, PACATOC(r13)
-   /* Set MSR RI now we have r1 and r13 back. */
-   li  r5, MSR_RI
-   mtmsrd  r5, 1
-   HMT_MEDIUM
-   ld  r6, HSTATE_DSCR(r13)
-   mtspr   SPRN_DSCR, r6
-BEGIN_FTR_SECTION_NESTED(96)
-   bl  pnv_power9_force_smt4_release
-END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
-   nop
-
-4:
-   mfspr   r3, SPRN_PSSCR
-   /* PSSCR_FAKE_SUSPEND is a write-only bit, but clear it anyway */
-   li  r0, PSSCR_FAKE_SUSPEND
-   andcr3, r3, r0
-   mtspr   SPRN_PSSCR, r3
-   ld  r9, HSTATE_KVM_VCPU(r13)
-   /* Don't save TEXASR, use value from last exit in real suspend state */
-   b   11f
-2:
-END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
-
-   ld  r9, 

[PATCH v4 03/29] powerpc: export tm_enable()/tm_disable/tm_abort() APIs

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

This patch exports the tm_enable()/tm_disable()/tm_abort() APIs, which
will be used by the PR KVM transactional memory logic.
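A short sketch of how PR KVM is expected to use these exports (mirroring
the TM SPR save/restore added later in this series):

  /* TM must be enabled in the MSR before the TM SPRs can be accessed,
   * and disabled again afterwards. */
  tm_enable();
  vcpu->arch.tfhar = mfspr(SPRN_TFHAR);
  vcpu->arch.texasr = mfspr(SPRN_TEXASR);
  vcpu->arch.tfiar = mfspr(SPRN_TFIAR);
  tm_disable();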

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/include/asm/asm-prototypes.h |  3 +++
 arch/powerpc/include/asm/tm.h |  2 --
 arch/powerpc/kernel/tm.S  | 12 
 arch/powerpc/mm/hash_utils_64.c   |  1 +
 4 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index d9713ad..dfdcb23 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -141,4 +141,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned 
long r4,
 void pnv_power9_force_smt4_catch(void);
 void pnv_power9_force_smt4_release(void);
 
+void tm_enable(void);
+void tm_disable(void);
+void tm_abort(uint8_t cause);
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h
index b1658c9..e94f6db 100644
--- a/arch/powerpc/include/asm/tm.h
+++ b/arch/powerpc/include/asm/tm.h
@@ -10,12 +10,10 @@
 
 #ifndef __ASSEMBLY__
 
-extern void tm_enable(void);
 extern void tm_reclaim(struct thread_struct *thread,
   uint8_t cause);
 extern void tm_reclaim_current(uint8_t cause);
 extern void tm_recheckpoint(struct thread_struct *thread);
-extern void tm_abort(uint8_t cause);
 extern void tm_save_sprs(struct thread_struct *thread);
 extern void tm_restore_sprs(struct thread_struct *thread);
 
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index b92ac8e..ff12f47 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_VSX
 /* See fpu.S, this is borrowed from there */
@@ -55,6 +56,16 @@ _GLOBAL(tm_enable)
or  r4, r4, r3
mtmsrd  r4
 1: blr
+EXPORT_SYMBOL_GPL(tm_enable);
+
+_GLOBAL(tm_disable)
+   mfmsr   r4
+   li  r3, MSR_TM >> 32
+   sldir3, r3, 32
+   andcr4, r4, r3
+   mtmsrd  r4
+   blr
+EXPORT_SYMBOL_GPL(tm_disable);
 
 _GLOBAL(tm_save_sprs)
mfspr   r0, SPRN_TFHAR
@@ -78,6 +89,7 @@ _GLOBAL(tm_restore_sprs)
 _GLOBAL(tm_abort)
TABORT(R3)
blr
+EXPORT_SYMBOL_GPL(tm_abort);
 
 /* void tm_reclaim(struct thread_struct *thread,
  *uint8_t cause)
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 0bd3790..1bd8b4c1 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -64,6 +64,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
-- 
1.8.3.1



[PATCH v4 02/29] powerpc: add TEXASR related macros

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

This patch adds some macros for the CR0/TEXASR bits so that the PR KVM TM
logic (tbegin./treclaim./tabort.) can make use of them later.
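An example of how the CR0 macros are intended to be used (a sketch with
hypothetical variable and helper names; only CR0_SHIFT/CR0_MASK/
CR0_TBEGIN_FAILURE come from this patch):

  unsigned long cr0 = guest_cr & (CR0_MASK << CR0_SHIFT);

  if (cr0 == CR0_TBEGIN_FAILURE)
  	/* tbegin. reported failure: record it in TEXASR etc. */
  	handle_tbegin_failure(vcpu);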

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/include/asm/reg.h  | 32 +++--
 arch/powerpc/platforms/powernv/copy-paste.h |  3 +--
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 44b2be4..5625684 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -146,6 +146,12 @@
 #define MSR_64BIT  0
 #endif
 
+/* Condition Register related */
+#define CR0_SHIFT  28
+#define CR0_MASK   0xF
+#define CR0_TBEGIN_FAILURE (0x2 << 28) /* 0b0010 */
+
+
 /* Power Management - Processor Stop Status and Control Register Fields */
 #define PSSCR_RL_MASK  0x000F /* Requested Level */
 #define PSSCR_MTL_MASK 0x00F0 /* Maximum Transition Level */
@@ -239,13 +245,27 @@
 #define SPRN_TFIAR 0x81/* Transaction Failure Inst Addr   */
 #define SPRN_TEXASR0x82/* Transaction EXception & Summary */
 #define SPRN_TEXASRU   0x83/* ''  ''  ''Upper 32  */
-#define   TEXASR_ABORT __MASK(63-31) /* terminated by tabort or treclaim */
-#define   TEXASR_SUSP  __MASK(63-32) /* tx failed in suspended state */
-#define   TEXASR_HV__MASK(63-34) /* MSR[HV] when failure occurred */
-#define   TEXASR_PR__MASK(63-35) /* MSR[PR] when failure occurred */
-#define   TEXASR_FS__MASK(63-36) /* TEXASR Failure Summary */
-#define   TEXASR_EXACT __MASK(63-37) /* TFIAR value is exact */
+
+#define TEXASR_FC_LG   (63 - 7)/* Failure Code */
+#define TEXASR_AB_LG   (63 - 31)   /* Abort */
+#define TEXASR_SU_LG   (63 - 32)   /* Suspend */
+#define TEXASR_HV_LG   (63 - 34)   /* Hypervisor state*/
+#define TEXASR_PR_LG   (63 - 35)   /* Privilege level */
+#define TEXASR_FS_LG   (63 - 36)   /* failure summary */
+#define TEXASR_EX_LG   (63 - 37)   /* TFIAR exact bit */
+#define TEXASR_ROT_LG  (63 - 38)   /* ROT bit */
+
+#define   TEXASR_ABORT __MASK(TEXASR_AB_LG) /* terminated by tabort or 
treclaim */
+#define   TEXASR_SUSP  __MASK(TEXASR_SU_LG) /* tx failed in suspended state */
+#define   TEXASR_HV__MASK(TEXASR_HV_LG) /* MSR[HV] when failure occurred */
+#define   TEXASR_PR__MASK(TEXASR_PR_LG) /* MSR[PR] when failure occurred */
+#define   TEXASR_FS__MASK(TEXASR_FS_LG) /* TEXASR Failure Summary */
+#define   TEXASR_EXACT __MASK(TEXASR_EX_LG) /* TFIAR value is exact */
+#define   TEXASR_ROT   __MASK(TEXASR_ROT_LG)
+#define   TEXASR_FC(ASM_CONST(0xFF) << TEXASR_FC_LG)
+
 #define SPRN_TFHAR 0x80/* Transaction Failure Handler Addr */
+
 #define SPRN_TIDR  144 /* Thread ID register */
 #define SPRN_CTRLF 0x088
 #define SPRN_CTRLT 0x098
diff --git a/arch/powerpc/platforms/powernv/copy-paste.h 
b/arch/powerpc/platforms/powernv/copy-paste.h
index c9a5036..3fa62de 100644
--- a/arch/powerpc/platforms/powernv/copy-paste.h
+++ b/arch/powerpc/platforms/powernv/copy-paste.h
@@ -7,9 +7,8 @@
  * 2 of the License, or (at your option) any later version.
  */
 #include 
+#include 
 
-#define CR0_SHIFT  28
-#define CR0_MASK   0xF
 /*
  * Copy/paste instructions:
  *
-- 
1.8.3.1



[PATCH v4 01/29] powerpc: export symbol msr_check_and_set().

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

PR KVM will need to reuse msr_check_and_set().
This patch exports this API for reuse.

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kernel/process.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 1237f13..25db000 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -154,6 +154,7 @@ unsigned long msr_check_and_set(unsigned long bits)
 
return newmsr;
 }
+EXPORT_SYMBOL_GPL(msr_check_and_set);
 
 void __msr_check_and_clear(unsigned long bits)
 {
-- 
1.8.3.1



[PATCH v4 00/29] KVM: PPC: Book3S PR: Transaction memory support on PR KVM

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

Nowadays, many OS distributions make use of transactional memory
functionality. On PowerPC, HV KVM supports TM, but PR KVM does not.

The driver for transactional memory support in PR KVM is the OpenStack
Continuous Integration testing - it runs an HV (hypervisor) KVM (as
level 1) and then runs PR KVM (as level 2) on top of that.

This patch set adds transactional memory support to PR KVM.

v3 -> v4 changes:
- fix a powermac 32-bit compile failure.

v2 -> v3 changes:
1) rebase onto Paul's kvm-ppc-next branch, which includes rework 
KVM_CHECK_EXTENSION ioctl (patch #25) a little bit. 
2) allow mtspr TFHAR in TM suspend state
3) remove patch: 
  "KVM: PPC: add KVM_SET_ONE_REG/KVM_GET_ONE_REG to async ioctl"
4) some minor rework per comments

v1 -> v2 changes:
1. Correct a bug in trechkpt emulation: the tm sprs need to be 
flushed to vcpu before trechkpt.
2. add PR kvm ioctl functionalities for TM.
3. removed save_msr_tm and use kvmppc_get_msr() to determine 
whether a transaction state need to be restored.
4. Removed the "KVM: PPC: Book3S PR: set MSR HV bit accordingly 
for PPC970 and others." patch.
It would prevent PR KVM from starting as an L1 hypervisor, since if 
we set the HV bit to 0 when rfid'ing to the guest (which is supposed 
to run at HV=1 && PR=1), the guest will not be able to access 
its original memory.
The original code always sets the HV bit in shadow_msr, which is 
benign since:
the HV bit can only be altered by the sc instruction; it can only 
be set to 0 by the rfid/hrfid instructions.
We return to the guest with rfid. So:
* if KVM is running as an L1 hypervisor, the guest physical MSR 
expects HV=1.
* if KVM is running as an L2 hypervisor, rfid cannot update 
HV to 1, so HV is still 0.
5. add XER register implementation to 
kvmppc_copyto_vcpu_tm/kvmppc_copyfrom_vcpu_tm()
6. remove unnecessary stack frame in _kvmppc_save/restore_tm().
7. Moved the MSR bits sync into kvmppc_copy_from_svcpu() so that 
we always see a consistent shadow_msr/kvmppc_get_msr(), 
even under preemption.
8. doing failure recording in treclaim emulation when TEXASR_FS
is 0.

Simon Guo (29):
  powerpc: export symbol msr_check_and_set().
  powerpc: add TEXASR related macros
  powerpc: export tm_enable()/tm_disable/tm_abort() APIs
  KVM: PPC: Book3S PR: Move kvmppc_save_tm/kvmppc_restore_tm to separate
file
  KVM: PPC: Book3S PR: add new parameter (guest MSR) for
kvmppc_save_tm()/kvmppc_restore_tm()
  KVM: PPC: Book3S PR: turn on FP/VSX/VMX MSR bits in kvmppc_save_tm()
  KVM: PPC: Book3S PR: add C function wrapper for
_kvmppc_save/restore_tm()
  KVM: PPC: Book3S PR: In PR KVM suspends Transactional state when
inject an interrupt.
  KVM: PPC: Book3S PR: PR KVM pass through MSR TM/TS bits to shadow_msr.
  KVM: PPC: Book3S PR: Sync TM bits to shadow msr for problem state
guest
  KVM: PPC: Book3S PR: implement RFID TM behavior to suppress change
from S0 to N0
  KVM: PPC: Book3S PR: prevent TS bits change in kvmppc_interrupt_pr()
  KVM: PPC: Book3S PR: adds new
kvmppc_copyto_vcpu_tm/kvmppc_copyfrom_vcpu_tm API for PR KVM.
  KVM: PPC: Book3S PR: add kvmppc_save/restore_tm_sprs() APIs
  KVM: PPC: Book3S PR: add transaction memory save/restore skeleton for
PR KVM
  KVM: PPC: Book3S PR: add math support for PR KVM HTM
  KVM: PPC: Book3S PR: make mtspr/mfspr emulation behavior based on
active TM SPRs
  KVM: PPC: Book3S PR: always fail transaction in guest privilege state
  KVM: PPC: Book3S PR: enable NV reg restore for reading TM SPR at guest
privilege state
  KVM: PPC: Book3S PR: adds emulation for treclaim.
  KVM: PPC: Book3S PR: add emulation for trechkpt in PR KVM.
  KVM: PPC: Book3S PR: add emulation for tabort. for privilege guest
  KVM: PPC: Book3S PR: add guard code to prevent returning to guest with
PR=0 and Transactional state
  KVM: PPC: Book3S PR: Support TAR handling for PR KVM HTM.
  KVM: PPC: Book3S PR: enable HTM for PR KVM for KVM_CHECK_EXTENSION
ioctl
  KVM: PPC: move vcpu_load/vcpu_put down to each ioctl case in
kvm_arch_vcpu_ioctl
  KVM: PPC: remove load/put vcpu for KVM_GET/SET_ONE_REG ioctl
  KVM: PPC: remove load/put vcpu for KVM_GET_REGS/KVM_SET_REGS
  KVM: PPC: Book3S PR: enable kvmppc_get/set_one_reg_pr() for HTM
registers

 arch/powerpc/include/asm/asm-prototypes.h   |   9 +
 arch/powerpc/include/asm/kvm_book3s.h   |  16 +
 arch/powerpc/include/asm/kvm_host.h |   1 -
 arch/powerpc/include/asm/reg.h  |  32 +-
 arch/powerpc/include/asm/tm.h   |   2 -
 arch/powerpc/include/uapi/asm/tm.h  |   2 +-
 arch/powerpc/kernel/process.c   |   1 +
 arch/powerpc/kernel/tm.S|  12 +
 arch/powerpc/kvm/Makefile   |   3 +
 arch/powerpc/kvm/book3s.c   |   6 -
 arch/powerpc/kvm/book3s.h   |   6 +
 arch/powerpc/kvm/book3s_64_mmu.c|  11 +-
 arch/powerpc/kvm/book3s_emulate.c   | 369 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S 

[PATCH v5 4/4] powerpc:selftest update memcmp_64 selftest for VMX implementation

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

This patch reworked selftest memcmp_64 so that memcmp selftest can
cover more test cases.

It adds testcases for:
- memcmp over 4K bytes size.
- s1/s2 with different/random offset on 16 bytes boundary.
- enter/exit_vmx_ops pairness.

Signed-off-by: Simon Guo 
---
 .../selftests/powerpc/copyloops/asm/ppc_asm.h  |  4 +-
 .../selftests/powerpc/stringloops/asm/ppc_asm.h| 22 +
 .../testing/selftests/powerpc/stringloops/memcmp.c | 98 +-
 3 files changed, 100 insertions(+), 24 deletions(-)

diff --git a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h 
b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
index 5ffe04d..dfce161 100644
--- a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
@@ -36,11 +36,11 @@
li  r3,0
blr
 
-FUNC_START(enter_vmx_copy)
+FUNC_START(enter_vmx_ops)
li  r3,1
blr
 
-FUNC_START(exit_vmx_copy)
+FUNC_START(exit_vmx_ops)
blr
 
 FUNC_START(memcpy_power7)
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h 
b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
index 136242e..185d257 100644
--- a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PPC_ASM_H
+#define __PPC_ASM_H
 #include 
 
 #ifndef r1
@@ -6,3 +8,23 @@
 #endif
 
 #define _GLOBAL(A) FUNC_START(test_ ## A)
+
+#define CONFIG_ALTIVEC
+
+#define R14 r14
+#define R15 r15
+#define R16 r16
+#define R17 r17
+#define R18 r18
+#define R19 r19
+#define R20 r20
+#define R21 r21
+#define R22 r22
+#define R29 r29
+#define R30 r30
+#define R31 r31
+
+#define STACKFRAMESIZE 256
+#define STK_REG(i) (112 + ((i)-14)*8)
+
+#endif
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp.c 
b/tools/testing/selftests/powerpc/stringloops/memcmp.c
index 8250db2..b5cf717 100644
--- a/tools/testing/selftests/powerpc/stringloops/memcmp.c
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp.c
@@ -2,20 +2,40 @@
 #include 
 #include 
 #include 
+#include 
 #include "utils.h"
 
 #define SIZE 256
 #define ITERATIONS 1
 
+#define LARGE_SIZE (5 * 1024)
+#define LARGE_ITERATIONS 1000
+#define LARGE_MAX_OFFSET 32
+#define LARGE_SIZE_START 4096
+
+#define MAX_OFFSET_DIFF_S1_S2 48
+
+int vmx_count;
+int enter_vmx_ops(void)
+{
+   vmx_count++;
+   return 1;
+}
+
+void exit_vmx_ops(void)
+{
+   vmx_count--;
+}
 int test_memcmp(const void *s1, const void *s2, size_t n);
 
 /* test all offsets and lengths */
-static void test_one(char *s1, char *s2)
+static void test_one(char *s1, char *s2, unsigned long max_offset,
+   unsigned long size_start, unsigned long max_size)
 {
unsigned long offset, size;
 
-   for (offset = 0; offset < SIZE; offset++) {
-   for (size = 0; size < (SIZE-offset); size++) {
+   for (offset = 0; offset < max_offset; offset++) {
+   for (size = size_start; size < (max_size - offset); size++) {
int x, y;
unsigned long i;
 
@@ -35,70 +55,104 @@ static void test_one(char *s1, char *s2)
printf("\n");
abort();
}
+
+   if (vmx_count != 0) {
+   printf("vmx enter/exit not paired.(offset:%ld 
size:%ld s1:%p s2:%p vc:%d\n",
+   offset, size, s1, s2, vmx_count);
+   printf("\n");
+   abort();
+   }
}
}
 }
 
-static int testcase(void)
+static int testcase(bool islarge)
 {
char *s1;
char *s2;
unsigned long i;
 
-   s1 = memalign(128, SIZE);
+   unsigned long comp_size = (islarge ? LARGE_SIZE : SIZE);
+   unsigned long alloc_size = comp_size + MAX_OFFSET_DIFF_S1_S2;
+   int iterations = islarge ? LARGE_ITERATIONS : ITERATIONS;
+
+   s1 = memalign(128, alloc_size);
if (!s1) {
perror("memalign");
exit(1);
}
 
-   s2 = memalign(128, SIZE);
+   s2 = memalign(128, alloc_size);
if (!s2) {
perror("memalign");
exit(1);
}
 
-   srandom(1);
+   srandom(time(0));
 
-   for (i = 0; i < ITERATIONS; i++) {
+   for (i = 0; i < iterations; i++) {
unsigned long j;
unsigned long change;
+   char *rand_s1 = s1;
+   char *rand_s2 = s2;
 
-   for (j = 0; j < SIZE; j++)
+   for (j = 0; j < alloc_size; j++)
s1[j] = random();
 
-   memcpy(s2, s1, SIZE);
+   rand_s1 += random() % MAX_OFFSET_DIFF_S1_S2;
+   

[PATCH v5 3/4] powerpc/64: add 32 bytes prechecking before using VMX optimization on memcmp()

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

This patch is based on the previous VMX patch on memcmp().

To optimize ppc64 memcmp() with VMX instructions, we need to think about
the VMX penalty they bring: if the kernel uses VMX instructions, it needs
to save/restore the current thread's VMX registers. There are 32 x 128-bit
VMX registers in PPC, which means 32 x 16 = 512 bytes to load and store.

The major concern regarding memcmp() performance in the kernel is KSM,
which uses memcmp() frequently to merge identical pages. So it makes
sense to take some measures/enhancements for KSM to see whether any
improvement can be done here.  Cyril Bur indicated in the following mail
that the memcmp() calls for KSM have a high probability of failing
(mismatching) early, within the first few bytes:
https://patchwork.ozlabs.org/patch/817322/#1773629
This patch is a follow-up on that.

Per some testing, KSM memcmp() indeed fails early, within the first 32
bytes.  More specifically:
- 76% of cases fail/mismatch before 16 bytes;
- 83% of cases fail/mismatch before 32 bytes;
- 84% of cases fail/mismatch before 64 bytes;
So 32 bytes looks like a better pre-checking size than the alternatives.

The early failure also holds for memcmp() in the non-KSM case. With a
non-typical call load, ~73% of cases fail before the first 32 bytes.

This patch adds a 32-byte pre-check before jumping into VMX
operations, to avoid the unnecessary VMX penalty. It is not limited to
the KSM case. And testing shows ~20% improvement in memcmp() average
execution time with this patch.

Note that the 32B pre-check is only performed when the compare size
is long enough (>= 4K currently) to allow VMX operation.

The detailed data and analysis are at:
https://github.com/justdoitqd/publicFiles/blob/master/memcmp/README.md
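In C terms, the pre-check amounts to something like the following (a
sketch only; the real implementation is the assembly in this patch, which
works 8 bytes at a time and preserves memcmp()'s byte-order semantics via
LD/ldbrx):

  /* Compare the first 32 bytes before paying the ~512-byte VMX
   * save/restore cost; only fall through to VMX when they match. */
  static int precheck_32B(const unsigned char *s1, const unsigned char *s2)
  {
  	int i;

  	for (i = 0; i < 32; i++)
  		if (s1[i] != s2[i])
  			return s1[i] - s2[i];
  	return 0;	/* identical so far: worth entering the VMX path */
  }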

Signed-off-by: Simon Guo 
---
 arch/powerpc/lib/memcmp_64.S | 50 +---
 1 file changed, 42 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index 6303bbf..ee45348 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -403,8 +403,27 @@ _GLOBAL(memcmp)
 #ifdef CONFIG_ALTIVEC
 .Lsameoffset_vmx_cmp:
/* Enter with src/dst addrs has the same offset with 8 bytes
-* align boundary
+* align boundary.
+*
+* There is an optimization based on following fact: memcmp()
+* prones to fail early at the first 32 bytes.
+* Before applying VMX instructions which will lead to 32x128bits
+* VMX regs load/restore penalty, we compare the first 32 bytes
+* so that we can catch the ~80% fail cases.
 */
+
+   li  r0,4
+   mtctr   r0
+.Lsameoffset_prechk_32B_loop:
+   LD  rA,0,r3
+   LD  rB,0,r4
+   cmpld   cr0,rA,rB
+   addir3,r3,8
+   addir4,r4,8
+   bne cr0,.LcmpAB_lightweight
+   addir5,r5,-8
+   bdnz.Lsameoffset_prechk_32B_loop
+
ENTER_VMX_OPS
beq cr1,.Llong_novmx_cmp
 
@@ -481,13 +500,6 @@ _GLOBAL(memcmp)
 #endif
 
 .Ldiffoffset_8bytes_make_align_start:
-#ifdef CONFIG_ALTIVEC
-   /* only do vmx ops when the size exceeds 4K bytes */
-   cmpdi   cr5,r5,VMX_OPS_THRES
-   bge cr5,.Ldiffoffset_vmx_cmp
-.Ldiffoffset_novmx_cmp:
-#endif
-
/* now try to align s1 with 8 bytes */
andi.   r6,r3,0x7
rlwinm  r6,r6,3,0,28
@@ -512,6 +524,13 @@ _GLOBAL(memcmp)
 
 .Ldiffoffset_align_s1_8bytes:
/* now s1 is aligned with 8 bytes. */
+#ifdef CONFIG_ALTIVEC
+   /* only do vmx ops when the size exceeds 4K bytes */
+   cmpdi   cr5,r5,VMX_OPS_THRES
+   bge cr5,.Ldiffoffset_vmx_cmp
+.Ldiffoffset_novmx_cmp:
+#endif
+
cmpdi   cr5,r5,31
ble cr5,.Lcmp_lt32bytes
 
@@ -523,6 +542,21 @@ _GLOBAL(memcmp)
 
 #ifdef CONFIG_ALTIVEC
 .Ldiffoffset_vmx_cmp:
+   /* perform a 32 bytes pre-checking before
+* enable VMX operations.
+*/
+   li  r0,4
+   mtctr   r0
+.Ldiffoffset_prechk_32B_loop:
+   LD  rA,0,r3
+   LD  rB,0,r4
+   cmpld   cr0,rA,rB
+   addir3,r3,8
+   addir4,r4,8
+   bne cr0,.LcmpAB_lightweight
+   addir5,r5,-8
+   bdnz.Ldiffoffset_prechk_32B_loop
+
ENTER_VMX_OPS
beq cr1,.Ldiffoffset_novmx_cmp
 
-- 
1.8.3.1



[PATCH v5 2/4] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

This patch adds VMX primitives to do memcmp() in case the compare size
exceeds 4K bytes. The KSM feature can benefit from this.

Test result with the following test program (replace the "^>" with ""):
--
># cat tools/testing/selftests/powerpc/stringloops/memcmp.c
>#include 
>#include 
>#include 
>#include 
>#include "utils.h"
>#define SIZE (1024 * 1024 * 900)
>#define ITERATIONS 40

int test_memcmp(const void *s1, const void *s2, size_t n);

static int testcase(void)
{
char *s1;
char *s2;
unsigned long i;

s1 = memalign(128, SIZE);
if (!s1) {
perror("memalign");
exit(1);
}

s2 = memalign(128, SIZE);
if (!s2) {
perror("memalign");
exit(1);
}

for (i = 0; i < SIZE; i++)  {
s1[i] = i & 0xff;
s2[i] = i & 0xff;
}
for (i = 0; i < ITERATIONS; i++) {
int ret = test_memcmp(s1, s2, SIZE);

if (ret) {
printf("return %d at[%ld]! should have returned 
zero\n", ret, i);
abort();
}
}

return 0;
}

int main(void)
{
return test_harness(testcase, "memcmp");
}
--
Without this patch (but with the first patch "powerpc/64: Align bytes
before fall back to .Lshort in powerpc64 memcmp()." in the series):
4.726728762 seconds time elapsed
  ( +-  3.54%)
With VMX patch:
4.234335473 seconds time elapsed
  ( +-  2.63%)
There is ~+10% improvement.

Testing with an unaligned and different-offset version (shifting s1 and s2
by a random offset within 16 bytes) can achieve a higher improvement than 10%.

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/asm-prototypes.h |   4 +-
 arch/powerpc/lib/copypage_power7.S|   4 +-
 arch/powerpc/lib/memcmp_64.S  | 231 ++
 arch/powerpc/lib/memcpy_power7.S  |   6 +-
 arch/powerpc/lib/vmx-helper.c |   4 +-
 5 files changed, 240 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index d9713ad..31fdcee 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -49,8 +49,8 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
-void * exit_vmx_copy(void *dest);
+int enter_vmx_ops(void);
+void *exit_vmx_ops(void *dest);
 
 /* Traps */
 long machine_check_early(struct pt_regs *regs);
diff --git a/arch/powerpc/lib/copypage_power7.S 
b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7..e38f956 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
std r0,16(r1)
stdur1,-STACKFRAMESIZE(r1)
-   bl  enter_vmx_copy
+   bl  enter_vmx_ops
cmpwi   r3,0
ld  r0,STACKFRAMESIZE+16(r1)
ld  r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
addir3,r3,128
bdnz1b
 
-   b   exit_vmx_copy   /* tail call optimise */
+   b   exit_vmx_ops/* tail call optimise */
 
 #else
li  r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index f20e883..6303bbf 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -27,12 +27,73 @@
 #define LH lhbrx
 #define LW lwbrx
 #define LD ldbrx
+#define LVSlvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+   vperm _VRT,_VRB,_VRA,_VRC
 #else
 #define LH lhzx
 #define LW lwzx
 #define LD ldx
+#define LVSlvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+   vperm _VRT,_VRA,_VRB,_VRC
 #endif
 
+#define VMX_OPS_THRES 4096
+#define ENTER_VMX_OPS  \
+   mflrr0; \
+   std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+   std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+   std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+   std r0,16(r1); \
+   stdur1,-STACKFRAMESIZE(r1); \
+   bl  enter_vmx_ops; \
+   cmpwi   cr1,r3,0; \
+   ld  r0,STACKFRAMESIZE+16(r1); \
+   ld  r3,STK_REG(R31)(r1); \
+   ld  r4,STK_REG(R30)(r1); \
+   ld  r5,STK_REG(R29)(r1); \
+   addir1,r1,STACKFRAMESIZE; \
+   mtlrr0
+
+#define EXIT_VMX_OPS \
+   mflrr0; \
+   std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+   std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+   std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+   std r0,16(r1); \
+   stdu

[PATCH v5 1/4] powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp()

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

Currently the powerpc64 memcmp() falls back to .Lshort (per-byte compare
mode) if either the src or dst address is not 8-byte aligned.
It can be optimized in two situations:

1) If both addresses have the same offset from an 8-byte boundary:
memcmp() can first compare the unaligned bytes up to the 8-byte boundary
and then compare the remaining 8-byte-aligned content in .Llong mode.

2) If the src/dst addresses have different offsets from an 8-byte boundary:
memcmp() can align the src address to 8 bytes, advance the dst address
accordingly, then load src with aligned loads and dst with unaligned loads.

This patch optimizes memcmp() behavior in the above two situations.
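
The following C sketch (not the kernel assembly, just an illustration of the
strategy; the helper names are made up) shows how only the unaligned head is
compared byte by byte before switching to 8-byte word compares:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static int cmp_bytes(const unsigned char *a, const unsigned char *b, size_t n)
{
        size_t i;

        for (i = 0; i < n; i++)
                if (a[i] != b[i])
                        return a[i] < b[i] ? -1 : 1;
        return 0;
}

int memcmp_sketch(const void *s1, const void *s2, size_t n)
{
        const unsigned char *a = s1, *b = s2;

        /* Case 1: both pointers share the same offset within 8 bytes. */
        if ((((uintptr_t)a ^ (uintptr_t)b) & 7) == 0) {
                size_t head = (8 - ((uintptr_t)a & 7)) & 7;
                int r;

                if (head > n)
                        head = n;
                r = cmp_bytes(a, b, head);      /* unaligned head only */
                if (r || head == n)
                        return r;
                a += head;
                b += head;
                n -= head;
        }
        /*
         * Case 2 (different offsets) would instead align only s1 and use
         * unaligned loads for s2; either way the bulk of the buffer is
         * compared 8 bytes at a time rather than byte by byte.
         */
        while (n >= 8) {
                uint64_t x, y;

                memcpy(&x, a, 8);               /* stands in for an aligned load */
                memcpy(&y, b, 8);               /* stands in for an unaligned load */
                if (x != y)
                        return cmp_bytes(a, b, 8);
                a += 8;
                b += 8;
                n -= 8;
        }
        return cmp_bytes(a, b, n);
}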

Tested with both little/big endian. Performance result below is based on
little endian.

The following is the test result for the case where src/dst have the same
offset (a similar result was observed when src/dst have different offsets):
(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
29.773018302 seconds time elapsed   
   ( +- 0.09% )
- with patch
16.485568173 seconds time elapsed   
   ( +-  0.02% )
-> There is ~80% improvement

(2) 32 bytes
To observe the performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c as follows:
---
 #include 
 #include "utils.h"

-#define SIZE 256
+#define SIZE 32
 #define ITERATIONS 1

 int test_memcmp(const void *s1, const void *s2, size_t n);


- Without patch
0.244746482 seconds time elapsed
  ( +-  0.36%)
- with patch
0.215069477 seconds time elapsed
  ( +-  0.51%)
-> There is ~+13% improvement

(3) 0~8 bytes
To observe the performance impact on < 8 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c as follows:
---
 #include 
 #include "utils.h"

-#define SIZE 256
-#define ITERATIONS 1
+#define SIZE 8
+#define ITERATIONS 100

 int test_memcmp(const void *s1, const void *s2, size_t n);
---
- Without patch
   1.845642503 seconds time elapsed 
 ( +- 0.12% )
- With patch
   1.849767135 seconds time elapsed 
 ( +- 0.26% )
-> They are nearly the same. (-0.2%)

Signed-off-by: Simon Guo 
---
 arch/powerpc/lib/memcmp_64.S | 143 ---
 1 file changed, 136 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b..f20e883 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,28 +24,41 @@
 #define rH r31
 
 #ifdef __LITTLE_ENDIAN__
+#define LH lhbrx
+#define LW lwbrx
 #define LD ldbrx
 #else
+#define LH lhzx
+#define LW lwzx
 #define LD ldx
 #endif
 
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
+ * are named like .Lsameoffset_
+ * 2) src/dst has different offset to the 8 bytes boundary. The handlers
+ * are named like .Ldiffoffset_
+ */
 _GLOBAL(memcmp)
cmpdi   cr1,r5,0
 
-   /* Use the short loop if both strings are not 8B aligned */
-   or  r6,r3,r4
+   /* Use the short loop if the src/dst addresses are not
+* with the same offset of 8 bytes align boundary.
+*/
+   xor r6,r3,r4
andi.   r6,r6,7
 
-   /* Use the short loop if length is less than 32B */
-   cmpdi   cr6,r5,31
+   /* Fall back to short loop if compare at aligned addrs
+* with less than 8 bytes.
+*/
+   cmpdi   cr6,r5,7
 
beq cr1,.Lzero
-   bne .Lshort
-   bgt cr6,.Llong
+   bgt cr6,.Lno_short
 
 .Lshort:
mtctr   r5
-
 1: lbz rA,0(r3)
lbz rB,0(r4)
subf.   rC,rB,rA
@@ -78,11 +91,90 @@ _GLOBAL(memcmp)
li  r3,0
blr
 
+.Lno_short:
+   dcbt    0,r3
+   dcbt    0,r4
+   bne .Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+   /* attempt to compare bytes not aligned with 8 bytes so that
+* rest comparison can run based on 8 bytes alignment.
+*/
+   andi.   r6,r3,7
+
+   /* Try to compare the first double word which is not 8 bytes aligned:
+* load the first double word at (src & ~7UL) and shift left appropriate
+* bits before comparision.
+*/
+   clrlwi  r6,r3,29
+   rlwinm  r6,r6,3,0,28
+   beq .Lsameoffset_8bytes_aligned
+   clrrdi  r3,r3,3
+   clrrdi  r4,r4,3
+   LD  rA,0,r3
+   LD  rB,0,r4
+   sld rA,rA,r6
+   sld rB,rB,r6
+   cmpld   cr0,rA,rB
+   srwi    r6,r6,3
+   

[PATCH v5 0/4] powerpc/64: memcmp() optimization

2018-05-23 Thread wei . guo . simon
From: Simon Guo 

There is some room to optimize memcmp() in the powerpc 64-bit version for
the following 2 cases:
(1) Even if the src/dst addresses are not 8-byte aligned at the beginning,
memcmp() can align them and use the .Llong comparison mode instead of
falling back to the .Lshort mode, which compares the buffer byte by byte.
(2) VMX instructions can be used to speed up large-size comparisons;
the threshold is currently set at 4K bytes. Note that using VMX incurs a
VMX register save/restore penalty, so this patch set includes a patch
that adds a 32-byte pre-check to minimize that penalty.
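
A rough C outline of the resulting dispatch (illustrative only; the real
implementation is the assembly in memcmp_64.S, and VMX_THRESHOLD plus the
helper name below are stand-ins, not code from the series):

#include <stddef.h>
#include <string.h>

#define VMX_THRESHOLD   4096    /* stand-in name for the 4K cutoff */

/* stand-in for the vectorised loop; the real code uses VMX loads */
static int cmp_vmx_tail(const unsigned char *a, const unsigned char *b,
                        size_t n)
{
        return memcmp(a, b, n);
}

int memcmp_dispatch(const void *s1, const void *s2, size_t n)
{
        const unsigned char *a = s1, *b = s2;

        if (n >= VMX_THRESHOLD) {
                /*
                 * 32-byte pre-check with ordinary loads: if the buffers
                 * already differ in the first 32 bytes, return before
                 * paying the VMX register save/restore penalty.
                 */
                int r = memcmp(a, b, 32);

                if (r)
                        return r;
                return cmp_vmx_tail(a + 32, b + 32, n - 32);
        }
        /* below the threshold: the aligned 8-byte paths of patch 1 */
        return memcmp(a, b, n);
}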

This is similar to what glibc commit dec4a7105e (powerpc: Improve memcmp
performance for POWER8) did. Thanks to Cyril Bur for the information.
This patch set also updates the memcmp selftest so that it compiles and
covers the large-size comparison case.

v4 -> v5:
- Expand the 32-byte pre-check to the src/dst different-offset case, and
remove the KSM-specific label/comment.

v3 -> v4:
- Add a 32-byte pre-check before using VMX instructions.

v2 -> v3:
- add optimization for src/dst with different offsets against the 8-byte
boundary.
- renamed some labels.
- addressed comments from Cyril Bur, such as filling the pipeline
and using VMX when size == 4K.
- fix an enter/exit_vmx_ops pairing bug, and revised the test case to
verify that enter/exit_vmx_ops are paired.

v1 -> v2:
- update the 8-byte unaligned comparison method.
- fix a VMX comparison bug.
- enhanced the original memcmp() selftest.
- add powerpc/64 to the subject/commit message.

Simon Guo (4):
  powerpc/64: Align bytes before fall back to .Lshort in powerpc64
memcmp()
  powerpc/64: enhance memcmp() with VMX instruction for long bytes
comparision
  powerpc/64: add 32 bytes prechecking before using VMX optimization on
memcmp()
  powerpc:selftest update memcmp_64 selftest for VMX implementation

 arch/powerpc/include/asm/asm-prototypes.h  |   4 +-
 arch/powerpc/lib/copypage_power7.S |   4 +-
 arch/powerpc/lib/memcmp_64.S   | 408 -
 arch/powerpc/lib/memcpy_power7.S   |   6 +-
 arch/powerpc/lib/vmx-helper.c  |   4 +-
 .../selftests/powerpc/copyloops/asm/ppc_asm.h  |   4 +-
 .../selftests/powerpc/stringloops/asm/ppc_asm.h|  22 ++
 .../testing/selftests/powerpc/stringloops/memcmp.c |  98 +++--
 8 files changed, 510 insertions(+), 40 deletions(-)

-- 
1.8.3.1



[PATCH v3 7/7] KVM: PPC: reimplements LOAD_VMX/STORE_VMX instruction mmio emulation with analyse_intr() input

2018-05-21 Thread wei . guo . simon
From: Simon Guo 

This patch reimplements LOAD_VMX/STORE_VMX MMIO emulation with
analyse_instr() input. When emulating a store, the VMX register needs to
be flushed so that the right register value can be retrieved before
writing to the MMIO memory.

This patch also adds support for lvebx/lvehx/lvewx/stvebx/stvehx/stvewx
MMIO emulation. To meet the requirement of handling different element
sizes, kvmppc_handle_load128_by2x64()/kvmppc_handle_store128_by2x64()
were replaced with kvmppc_handle_vmx_load()/kvmppc_handle_vmx_store().

The framework used is similar to the VSX instruction MMIO emulation.

Suggested-by: Paul Mackerras 
Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_host.h  |   1 +
 arch/powerpc/include/asm/kvm_ppc.h   |  10 +-
 arch/powerpc/kvm/emulate_loadstore.c | 124 +++--
 arch/powerpc/kvm/powerpc.c   | 259 ---
 4 files changed, 302 insertions(+), 92 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index fe506c8..8dc5e43 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -691,6 +691,7 @@ struct kvm_vcpu_arch {
u8 mmio_vsx_offset;
u8 mmio_vsx_tx_sx_enabled;
u8 mmio_vmx_copy_nums;
+   u8 mmio_vmx_offset;
u8 mmio_copy_type;
u8 osi_needed;
u8 osi_enabled;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 1f087c4..e991821 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -81,10 +81,10 @@ extern int kvmppc_handle_loads(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
 extern int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int rt, unsigned int bytes,
int is_default_endian, int mmio_sign_extend);
-extern int kvmppc_handle_load128_by2x64(struct kvm_run *run,
-   struct kvm_vcpu *vcpu, unsigned int rt, int is_default_endian);
-extern int kvmppc_handle_store128_by2x64(struct kvm_run *run,
-   struct kvm_vcpu *vcpu, unsigned int rs, int is_default_endian);
+extern int kvmppc_handle_vmx_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
+   unsigned int rt, unsigned int bytes, int is_default_endian);
+extern int kvmppc_handle_vmx_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
+   unsigned int rs, unsigned int bytes, int is_default_endian);
 extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
   u64 val, unsigned int bytes,
   int is_default_endian);
@@ -265,6 +265,8 @@ extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, 
u32 *server,
vector128 vval;
u64 vsxval[2];
u32 vsx32val[4];
+   u16 vsx16val[8];
+   u8  vsx8val[16];
struct {
u64 addr;
u64 length;
diff --git a/arch/powerpc/kvm/emulate_loadstore.c 
b/arch/powerpc/kvm/emulate_loadstore.c
index fa4de6b..b6b2d25 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -113,6 +113,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
vcpu->arch.mmio_sp64_extend = 0;
vcpu->arch.mmio_sign_extend = 0;
vcpu->arch.mmio_vmx_copy_nums = 0;
+   vcpu->arch.mmio_vmx_offset = 0;
vcpu->arch.mmio_host_swabbed = 0;
 
emulated = EMULATE_FAIL;
@@ -158,6 +159,46 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
break;
 #endif
+#ifdef CONFIG_ALTIVEC
+   case LOAD_VMX:
+   if (kvmppc_check_altivec_disabled(vcpu))
+   return EMULATE_DONE;
+
+   /* Hardware enforces alignment of VMX accesses */
+   vcpu->arch.vaddr_accessed &= ~((unsigned long)size - 1);
+   vcpu->arch.paddr_accessed &= ~((unsigned long)size - 1);
+
+   if (size == 16) { /* lvx */
+   vcpu->arch.mmio_copy_type =
+   KVMPPC_VMX_COPY_DWORD;
+   } else if (size == 4) { /* lvewx  */
+   vcpu->arch.mmio_copy_type =
+   KVMPPC_VMX_COPY_WORD;
+   } else if (size == 2) { /* lvehx  */
+   vcpu->arch.mmio_copy_type =
+   KVMPPC_VMX_COPY_HWORD;
+   } else if (size == 1) { /* lvebx  */
+   vcpu->arch.mmio_copy_type =
+   KVMPPC_VMX_COPY_BYTE;
+   } else
+   break;
+
+   vcpu->arch.mmio_vmx_offset =
+  

[PATCH v3 6/7] KVM: PPC: expand mmio_vsx_copy_type to mmio_copy_type to cover VMX load/store elem types

2018-05-21 Thread wei . guo . simon
From: Simon Guo 

VSX MMIO emulation uses mmio_vsx_copy_type to represent the VSX emulated
element size/type, such as KVMPPC_VSX_COPY_DWORD_LOAD, etc. This
patch expands mmio_vsx_copy_type to cover the VMX copy types, such as
KVMPPC_VMX_COPY_BYTE (stvebx/lvebx), etc. As a result,
mmio_vsx_copy_type is also renamed to mmio_copy_type.

It is a preparation for reimplementing VMX MMIO emulation.

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_host.h  |  9 +++--
 arch/powerpc/kvm/emulate_loadstore.c | 14 +++---
 arch/powerpc/kvm/powerpc.c   | 10 +-
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 4bade29..fe506c8 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -455,6 +455,11 @@ struct mmio_hpte_cache {
 #define KVMPPC_VSX_COPY_DWORD_LOAD_DUMP3
 #define KVMPPC_VSX_COPY_WORD_LOAD_DUMP 4
 
+#define KVMPPC_VMX_COPY_BYTE   8
+#define KVMPPC_VMX_COPY_HWORD  9
+#define KVMPPC_VMX_COPY_WORD   10
+#define KVMPPC_VMX_COPY_DWORD  11
+
 struct openpic;
 
 /* W0 and W1 of a XIVE thread management context */
@@ -677,16 +682,16 @@ struct kvm_vcpu_arch {
 * Number of simulations for vsx.
 * If we use 2*8bytes to simulate 1*16bytes,
 * then the number should be 2 and
-* mmio_vsx_copy_type=KVMPPC_VSX_COPY_DWORD.
+* mmio_copy_type=KVMPPC_VSX_COPY_DWORD.
 * If we use 4*4bytes to simulate 1*16bytes,
 * the number should be 4 and
 * mmio_vsx_copy_type=KVMPPC_VSX_COPY_WORD.
 */
u8 mmio_vsx_copy_nums;
u8 mmio_vsx_offset;
-   u8 mmio_vsx_copy_type;
u8 mmio_vsx_tx_sx_enabled;
u8 mmio_vmx_copy_nums;
+   u8 mmio_copy_type;
u8 osi_needed;
u8 osi_enabled;
u8 papr_enabled;
diff --git a/arch/powerpc/kvm/emulate_loadstore.c 
b/arch/powerpc/kvm/emulate_loadstore.c
index ed73497..fa4de6b 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -109,7 +109,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
vcpu->arch.mmio_vsx_tx_sx_enabled = get_tx_or_sx(inst);
vcpu->arch.mmio_vsx_copy_nums = 0;
vcpu->arch.mmio_vsx_offset = 0;
-   vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_NONE;
+   vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE;
vcpu->arch.mmio_sp64_extend = 0;
vcpu->arch.mmio_sign_extend = 0;
vcpu->arch.mmio_vmx_copy_nums = 0;
@@ -175,17 +175,17 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
if (op.element_size == 8)  {
if (op.vsx_flags & VSX_SPLAT)
-   vcpu->arch.mmio_vsx_copy_type =
+   vcpu->arch.mmio_copy_type =
KVMPPC_VSX_COPY_DWORD_LOAD_DUMP;
else
-   vcpu->arch.mmio_vsx_copy_type =
+   vcpu->arch.mmio_copy_type =
KVMPPC_VSX_COPY_DWORD;
} else if (op.element_size == 4) {
if (op.vsx_flags & VSX_SPLAT)
-   vcpu->arch.mmio_vsx_copy_type =
+   vcpu->arch.mmio_copy_type =
KVMPPC_VSX_COPY_WORD_LOAD_DUMP;
else
-   vcpu->arch.mmio_vsx_copy_type =
+   vcpu->arch.mmio_copy_type =
KVMPPC_VSX_COPY_WORD;
} else
break;
@@ -261,10 +261,10 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
vcpu->arch.mmio_sp64_extend = 1;
 
if (op.element_size == 8)
-   vcpu->arch.mmio_vsx_copy_type =
+   vcpu->arch.mmio_copy_type =
KVMPPC_VSX_COPY_DWORD;
else if (op.element_size == 4)
-   vcpu->arch.mmio_vsx_copy_type =
+   vcpu->arch.mmio_copy_type =
KVMPPC_VSX_COPY_WORD;
else
break;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 8ce9e7b..1580bd2 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1080,14 +1080,14 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu 
*vcpu,
if (vcpu->kvm->arch.kvm_ops->giveup_ext)

[PATCH v3 5/7] KVM: PPC: reimplements LOAD_VSX/STORE_VSX instruction mmio emulation with analyse_intr() input

2018-05-21 Thread wei . guo . simon
From: Simon Guo 

This patch reimplements LOAD_VSX/STORE_VSX instruction MMIO emulation with
analyse_instr() input. It utilizes the VSX_FPCONV/VSX_SPLAT/SIGNEXT flags
exported by analyse_instr() and handles them accordingly.

When emulating a VSX store, the VSX register needs to be flushed so that
the right register value can be retrieved before writing to the MMIO memory.

Suggested-by: Paul Mackerras 
Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/emulate_loadstore.c | 227 ++-
 1 file changed, 91 insertions(+), 136 deletions(-)

diff --git a/arch/powerpc/kvm/emulate_loadstore.c 
b/arch/powerpc/kvm/emulate_loadstore.c
index 5d38f95..ed73497 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -158,6 +158,54 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
break;
 #endif
+#ifdef CONFIG_VSX
+   case LOAD_VSX: {
+   int io_size_each;
+
+   if (op.vsx_flags & VSX_CHECK_VEC) {
+   if (kvmppc_check_altivec_disabled(vcpu))
+   return EMULATE_DONE;
+   } else {
+   if (kvmppc_check_vsx_disabled(vcpu))
+   return EMULATE_DONE;
+   }
+
+   if (op.vsx_flags & VSX_FPCONV)
+   vcpu->arch.mmio_sp64_extend = 1;
+
+   if (op.element_size == 8)  {
+   if (op.vsx_flags & VSX_SPLAT)
+   vcpu->arch.mmio_vsx_copy_type =
+   KVMPPC_VSX_COPY_DWORD_LOAD_DUMP;
+   else
+   vcpu->arch.mmio_vsx_copy_type =
+   KVMPPC_VSX_COPY_DWORD;
+   } else if (op.element_size == 4) {
+   if (op.vsx_flags & VSX_SPLAT)
+   vcpu->arch.mmio_vsx_copy_type =
+   KVMPPC_VSX_COPY_WORD_LOAD_DUMP;
+   else
+   vcpu->arch.mmio_vsx_copy_type =
+   KVMPPC_VSX_COPY_WORD;
+   } else
+   break;
+
+   if (size < op.element_size) {
+   /* precision convert case: lxsspx, etc */
+   vcpu->arch.mmio_vsx_copy_nums = 1;
+   io_size_each = size;
+   } else { /* lxvw4x, lxvd2x, etc */
+   vcpu->arch.mmio_vsx_copy_nums =
+   size/op.element_size;
+   io_size_each = op.element_size;
+   }
+
+   emulated = kvmppc_handle_vsx_load(run, vcpu,
+   KVM_MMIO_REG_VSX|op.reg, io_size_each,
+   1, op.type & SIGNEXT);
+   break;
+   }
+#endif
case STORE:
/* if need byte reverse, op.val has been reversed by
 * analyse_instr().
@@ -193,6 +241,49 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
break;
 #endif
+#ifdef CONFIG_VSX
+   case STORE_VSX: {
+   int io_size_each;
+
+   if (op.vsx_flags & VSX_CHECK_VEC) {
+   if (kvmppc_check_altivec_disabled(vcpu))
+   return EMULATE_DONE;
+   } else {
+   if (kvmppc_check_vsx_disabled(vcpu))
+   return EMULATE_DONE;
+   }
+
+   if (vcpu->kvm->arch.kvm_ops->giveup_ext)
+   vcpu->kvm->arch.kvm_ops->giveup_ext(vcpu,
+   MSR_VSX);
+
+   if (op.vsx_flags & VSX_FPCONV)
+   vcpu->arch.mmio_sp64_extend = 1;
+
+   if (op.element_size == 8)
+   vcpu->arch.mmio_vsx_copy_type =
+   KVMPPC_VSX_COPY_DWORD;
+   else if (op.element_size == 4)
+   vcpu->arch.mmio_vsx_copy_type =
+   KVMPPC_VSX_COPY_WORD;
+   else
+   break;
+
+   if (size < op.element_size) {
+   /* precise conversion case, like stxsspx */
+   vcpu->arch.mmio_vsx_copy_nums = 1;

[PATCH v3 4/7] KVM: PPC: reimplement LOAD_FP/STORE_FP instruction mmio emulation with analyse_intr() input

2018-05-21 Thread wei . guo . simon
From: Simon Guo 

This patch reimplements LOAD_FP/STORE_FP instruction MMIO emulation with
analyse_instr() input. It utilizes the FPCONV/UPDATE properties exported by
analyse_instr() and invokes kvmppc_handle_load(s)/kvmppc_handle_store()
accordingly.

For FP store MMIO emulation, the FP regs need to be flushed first so
that the right FP reg values can be read from vcpu->arch.fpr, which will
be stored into the MMIO data.

Suggested-by: Paul Mackerras 
Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/emulate_loadstore.c | 201 ---
 1 file changed, 44 insertions(+), 157 deletions(-)

diff --git a/arch/powerpc/kvm/emulate_loadstore.c 
b/arch/powerpc/kvm/emulate_loadstore.c
index af7c71a..5d38f95 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -138,6 +138,26 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
break;
}
+#ifdef CONFIG_PPC_FPU
+   case LOAD_FP:
+   if (kvmppc_check_fp_disabled(vcpu))
+   return EMULATE_DONE;
+
+   if (op.type & FPCONV)
+   vcpu->arch.mmio_sp64_extend = 1;
+
+   if (op.type & SIGNEXT)
+   emulated = kvmppc_handle_loads(run, vcpu,
+   KVM_MMIO_REG_FPR|op.reg, size, 
1);
+   else
+   emulated = kvmppc_handle_load(run, vcpu,
+   KVM_MMIO_REG_FPR|op.reg, size, 
1);
+
+   if ((op.type & UPDATE) && (emulated != EMULATE_FAIL))
+   kvmppc_set_gpr(vcpu, op.update_reg, op.ea);
+
+   break;
+#endif
case STORE:
/* if need byte reverse, op.val has been reversed by
 * analyse_instr().
@@ -149,6 +169,30 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
kvmppc_set_gpr(vcpu, op.update_reg, op.ea);
 
break;
+#ifdef CONFIG_PPC_FPU
+   case STORE_FP:
+   if (kvmppc_check_fp_disabled(vcpu))
+   return EMULATE_DONE;
+
+   /* The FP registers need to be flushed so that
+* kvmppc_handle_store() can read actual FP vals
+* from vcpu->arch.
+*/
+   if (vcpu->kvm->arch.kvm_ops->giveup_ext)
+   vcpu->kvm->arch.kvm_ops->giveup_ext(vcpu,
+   MSR_FP);
+
+   if (op.type & FPCONV)
+   vcpu->arch.mmio_sp64_extend = 1;
+
+   emulated = kvmppc_handle_store(run, vcpu,
+   VCPU_FPR(vcpu, op.reg), size, 1);
+
+   if ((op.type & UPDATE) && (emulated != EMULATE_FAIL))
+   kvmppc_set_gpr(vcpu, op.update_reg, op.ea);
+
+   break;
+#endif
case CACHEOP:
/* Do nothing. The guest is performing dcbi because
 * hardware DMA is not snooped by the dcache, but
@@ -170,93 +214,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
switch (get_op(inst)) {
case 31:
switch (get_xop(inst)) {
-#ifdef CONFIG_PPC_FPU
-   case OP_31_XOP_LFSX:
-   if (kvmppc_check_fp_disabled(vcpu))
-   return EMULATE_DONE;
-   vcpu->arch.mmio_sp64_extend = 1;
-   emulated = kvmppc_handle_load(run, vcpu,
-   KVM_MMIO_REG_FPR|rt, 4, 1);
-   break;
-
-   case OP_31_XOP_LFSUX:
-   if (kvmppc_check_fp_disabled(vcpu))
-   return EMULATE_DONE;
-   vcpu->arch.mmio_sp64_extend = 1;
-   emulated = kvmppc_handle_load(run, vcpu,
-   KVM_MMIO_REG_FPR|rt, 4, 1);
-   kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
-   break;
-
-   case OP_31_XOP_LFDX:
-   if (kvmppc_check_fp_disabled(vcpu))
-   return EMULATE_DONE;
-   emulated = kvmppc_handle_load(run, vcpu,
-   KVM_MMIO_REG_FPR|rt, 8, 1);
-   break;
-
-   case OP_31_XOP_LFDUX:
-   if (kvmppc_check_fp_disabled(vcpu))
-   return EMULATE_DONE;
-   emulated = kvmppc_handle_load(run, vcpu,
-   KVM_MMIO_REG_FPR|rt, 

[PATCH v3 3/7] KVM: PPC: add giveup_ext() hook for PPC KVM ops

2018-05-21 Thread wei . guo . simon
From: Simon Guo 

Currently HV KVM saves the math regs (FP/VEC/VSX) when trapping into the
host, but PR KVM only saves them when the qemu task is switched off the
CPU or when returning from qemu code.

To emulate an FP/VEC/VSX MMIO load, PR KVM needs to make sure the math
regs have been flushed first so that the saved vcpu FPR/VEC/VSX area can
be updated correctly.

This patch adds a giveup_ext() field to the KVM ops, and PR KVM provides
a non-NULL giveup_ext() implementation. kvmppc_complete_mmio_load() can
invoke that hook (when not NULL) to flush the math regs before updating
the saved register values.

The math regs flush is also necessary for STORE, which is covered in a
later patch in this series.

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_ppc.h | 1 +
 arch/powerpc/kvm/book3s_pr.c   | 1 +
 arch/powerpc/kvm/powerpc.c | 9 +
 3 files changed, 11 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 139cdf0..1f087c4 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -324,6 +324,7 @@ struct kvmppc_ops {
int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
int (*set_smt_mode)(struct kvm *kvm, unsigned long mode,
unsigned long flags);
+   void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 67061d3..be26636 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1782,6 +1782,7 @@ static long kvm_arch_vm_ioctl_pr(struct file *filp,
 #ifdef CONFIG_PPC_BOOK3S_64
.hcall_implemented = kvmppc_hcall_impl_pr,
 #endif
+   .giveup_ext = kvmppc_giveup_ext,
 };
 
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 45daf3b..8ce9e7b 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1061,6 +1061,9 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu 
*vcpu,
kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
break;
case KVM_MMIO_REG_FPR:
+   if (vcpu->kvm->arch.kvm_ops->giveup_ext)
+   vcpu->kvm->arch.kvm_ops->giveup_ext(vcpu, MSR_FP);
+
VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr;
break;
 #ifdef CONFIG_PPC_BOOK3S
@@ -1074,6 +1077,9 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu 
*vcpu,
 #endif
 #ifdef CONFIG_VSX
case KVM_MMIO_REG_VSX:
+   if (vcpu->kvm->arch.kvm_ops->giveup_ext)
+   vcpu->kvm->arch.kvm_ops->giveup_ext(vcpu, MSR_VSX);
+
if (vcpu->arch.mmio_vsx_copy_type == KVMPPC_VSX_COPY_DWORD)
kvmppc_set_vsr_dword(vcpu, gpr);
else if (vcpu->arch.mmio_vsx_copy_type == KVMPPC_VSX_COPY_WORD)
@@ -1088,6 +1094,9 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu 
*vcpu,
 #endif
 #ifdef CONFIG_ALTIVEC
case KVM_MMIO_REG_VMX:
+   if (vcpu->kvm->arch.kvm_ops->giveup_ext)
+   vcpu->kvm->arch.kvm_ops->giveup_ext(vcpu, MSR_VEC);
+
kvmppc_set_vmx_dword(vcpu, gpr);
break;
 #endif
-- 
1.8.3.1



[PATCH v3 2/7] KVM: PPC: reimplement non-SIMD LOAD/STORE instruction mmio emulation with analyse_intr() input

2018-05-21 Thread wei . guo . simon
From: Simon Guo 

This patch reimplements non-SIMD LOAD/STORE instruction MMIO emulation
with analyse_instr() input. It utilizes the BYTEREV/UPDATE/SIGNEXT
properties exported by analyse_instr() and invokes
kvmppc_handle_load(s)/kvmppc_handle_store() accordingly.

It also moves the CACHEOP type handling into the skeleton.

instruction_type within kvm_ppc.h is renamed to avoid a conflict with
sstep.h.

Suggested-by: Paul Mackerras 
Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_ppc.h   |   6 +-
 arch/powerpc/kvm/book3s.c|   4 +-
 arch/powerpc/kvm/e500_mmu_host.c |   8 +-
 arch/powerpc/kvm/emulate_loadstore.c | 271 ++-
 4 files changed, 51 insertions(+), 238 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index abe7032..139cdf0 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -52,7 +52,7 @@ enum emulation_result {
EMULATE_EXIT_USER,/* emulation requires exit to user-space */
 };
 
-enum instruction_type {
+enum instruction_fetch_type {
INST_GENERIC,
INST_SC,/* system call */
 };
@@ -93,7 +93,7 @@ extern int kvmppc_handle_vsx_store(struct kvm_run *run, 
struct kvm_vcpu *vcpu,
int is_default_endian);
 
 extern int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
-enum instruction_type type, u32 *inst);
+enum instruction_fetch_type type, u32 *inst);
 
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,
 bool data);
@@ -330,7 +330,7 @@ struct kvmppc_ops {
 extern struct kvmppc_ops *kvmppc_pr_ops;
 
 static inline int kvmppc_get_last_inst(struct kvm_vcpu *vcpu,
-   enum instruction_type type, u32 *inst)
+   enum instruction_fetch_type type, u32 *inst)
 {
int ret = EMULATE_DONE;
u32 fetched_inst;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 97d4a11..320cdcf 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -450,8 +450,8 @@ int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, enum 
xlate_instdata xlid,
return r;
 }
 
-int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type,
-u32 *inst)
+int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
+   enum instruction_fetch_type type, u32 *inst)
 {
ulong pc = kvmppc_get_pc(vcpu);
int r;
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index c878b4f..8f2985e 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -625,8 +625,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t 
gpaddr,
 }
 
 #ifdef CONFIG_KVM_BOOKE_HV
-int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type,
- u32 *instr)
+int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
+   enum instruction_fetch_type type, u32 *instr)
 {
gva_t geaddr;
hpa_t addr;
@@ -715,8 +715,8 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum 
instruction_type type,
return EMULATE_DONE;
 }
 #else
-int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type,
- u32 *instr)
+int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
+   enum instruction_fetch_type type, u32 *instr)
 {
return EMULATE_AGAIN;
 }
diff --git a/arch/powerpc/kvm/emulate_loadstore.c 
b/arch/powerpc/kvm/emulate_loadstore.c
index b8a3aef..af7c71a 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "timing.h"
 #include "trace.h"
 
@@ -84,8 +85,9 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
struct kvm_run *run = vcpu->run;
u32 inst;
int ra, rs, rt;
-   enum emulation_result emulated;
+   enum emulation_result emulated = EMULATE_FAIL;
int advance = 1;
+   struct instruction_op op;
 
/* this default type might be overwritten by subcategories */
kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS);
@@ -113,144 +115,61 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
vcpu->arch.mmio_vmx_copy_nums = 0;
vcpu->arch.mmio_host_swabbed = 0;
 
-   switch (get_op(inst)) {
-   case 31:
-   switch (get_xop(inst)) {
-   case OP_31_XOP_LWZX:
-   emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
-   break;
-
-   case OP_31_XOP_LWZUX:
-   emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
-   kvmppc_set_gpr(vcpu, ra, 

[PATCH v3 1/7] KVM: PPC: add KVMPPC_VSX_COPY_WORD_LOAD_DUMP type support for mmio emulation

2018-05-21 Thread wei . guo . simon
From: Simon Guo 

Some VSX instructions, such as lxvwsx, splat a word into a VSR. This patch
adds the VSX copy type KVMPPC_VSX_COPY_WORD_LOAD_DUMP to support this.
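
For reference, the splat semantics are simply a replication of one 32-bit
word into every word slot of the 128-bit register; a trivial C illustration
(not the KVM code, the array merely models a VSR):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t word = 0x11223344;     /* value loaded from memory */
        uint32_t vsr[4];                /* model of one 128-bit VSR */
        int i;

        for (i = 0; i < 4; i++)         /* splat: copy into every word slot */
                vsr[i] = word;

        printf("%08x %08x %08x %08x\n", vsr[0], vsr[1], vsr[2], vsr[3]);
        return 0;
}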

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_host.h |  1 +
 arch/powerpc/kvm/powerpc.c  | 23 +++
 2 files changed, 24 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 89f44ec..4bade29 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -453,6 +453,7 @@ struct mmio_hpte_cache {
 #define KVMPPC_VSX_COPY_WORD   1
 #define KVMPPC_VSX_COPY_DWORD  2
 #define KVMPPC_VSX_COPY_DWORD_LOAD_DUMP3
+#define KVMPPC_VSX_COPY_WORD_LOAD_DUMP 4
 
 struct openpic;
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index bef27b1..45daf3b 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -907,6 +907,26 @@ static inline void kvmppc_set_vsr_dword_dump(struct 
kvm_vcpu *vcpu,
}
 }
 
+static inline void kvmppc_set_vsr_word_dump(struct kvm_vcpu *vcpu,
+   u32 gpr)
+{
+   union kvmppc_one_reg val;
+   int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
+
+   if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
+   val.vsx32val[0] = gpr;
+   val.vsx32val[1] = gpr;
+   val.vsx32val[2] = gpr;
+   val.vsx32val[3] = gpr;
+   VCPU_VSX_VR(vcpu, index) = val.vval;
+   } else {
+   val.vsx32val[0] = gpr;
+   val.vsx32val[1] = gpr;
+   VCPU_VSX_FPR(vcpu, index, 0) = val.vsxval[0];
+   VCPU_VSX_FPR(vcpu, index, 1) = val.vsxval[0];
+   }
+}
+
 static inline void kvmppc_set_vsr_word(struct kvm_vcpu *vcpu,
u32 gpr32)
 {
@@ -1061,6 +1081,9 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu 
*vcpu,
else if (vcpu->arch.mmio_vsx_copy_type ==
KVMPPC_VSX_COPY_DWORD_LOAD_DUMP)
kvmppc_set_vsr_dword_dump(vcpu, gpr);
+   else if (vcpu->arch.mmio_vsx_copy_type ==
+   KVMPPC_VSX_COPY_WORD_LOAD_DUMP)
+   kvmppc_set_vsr_word_dump(vcpu, gpr);
break;
 #endif
 #ifdef CONFIG_ALTIVEC
-- 
1.8.3.1



[PATCH v3 0/7] KVM: PPC: reimplement mmio emulation with analyse_instr()

2018-05-21 Thread wei . guo . simon
From: Simon Guo 

We already have analyse_instr(), which analyzes instructions for the
instruction type, size, additional flags, etc. What
kvmppc_emulate_loadstore() does largely duplicates that, so it is
worthwhile to reimplement the code on top of analyse_instr(). The
advantage is that the code logic is shared and easier to maintain.

This patch series reimplements kvmppc_emulate_loadstore() for various
load/store instructions.

The testcase locates at:
https://github.com/justdoitqd/publicFiles/blob/master/test_mmio.c

- Tested with both PR and HV KVM.
- Also tested with a little-endian host and a big-endian guest.

Tested instruction list: 
lbz lbzu lbzx ld ldbrx
ldu ldx lfd lfdu lfdx
lfiwax lfiwzx lfs lfsu lfsx
lha lhau lhax lhbrx lhz
lhzu lhzx lvx lwax lwbrx
lwz lwzu lwzx lxsdx lxsiwax
lxsiwzx lxsspx lxvd2x lxvdsx lxvw4x
stb stbu stbx std stdbrx
stdu stdx stfd stfdu stfdx
stfiwx stfs stfsx sth sthbrx
sthu sthx stvx stw stwbrx
stwu stwx stxsdx stxsiwx stxsspx
stxvd2x stxvw4x
lvebx stvebx
lvehx stvehx
lvewx stvewx

V2 -> V3 changes:
1) add an FPU SIGNEXT case to handle lfiwax, based on a review comment.
2) a minor change to go to the "out:" label when EMULATE_DO_MMIO is
returned, based on a review comment.
3) rebased onto Paul's kvm-ppc-next branch (3 patches were already
merged into that branch, so this patch set includes 7 patches only).

V1 -> V2 changes:
1) correct the patch split issue in v1.
2) revise some commit messages/code comments per review comments.
3) remove incorrect special handling for stxsiwx.
4) remove the mmio_update_ra related code and move the RA update into
kvmppc_emulate_loadstore().
5) rework giveup_ext(), which is only meaningful when not NULL.
6) rewrite the VMX emulation code and cover the remaining VMX instructions:
lvebx stvebx
lvehx stvehx
lvewx stvewx


Simon Guo (7):
  KVM: PPC: add KVMPPC_VSX_COPY_WORD_LOAD_DUMP type support for mmio
emulation
  KVM: PPC: reimplement non-SIMD LOAD/STORE instruction mmio emulation
with analyse_intr() input
  KVM: PPC: add giveup_ext() hook for PPC KVM ops
  KVM: PPC: reimplement LOAD_FP/STORE_FP instruction mmio emulation with
analyse_intr() input
  KVM: PPC: reimplements LOAD_VSX/STORE_VSX instruction mmio emulation
with analyse_intr() input
  KVM: PPC: expand mmio_vsx_copy_type to mmio_copy_type to cover VMX
load/store elem types
  KVM: PPC: reimplements LOAD_VMX/STORE_VMX instruction mmio emulation
with analyse_intr() input

 arch/powerpc/include/asm/kvm_host.h  |  11 +-
 arch/powerpc/include/asm/kvm_ppc.h   |  17 +-
 arch/powerpc/kvm/book3s.c|   4 +-
 arch/powerpc/kvm/book3s_pr.c |   1 +
 arch/powerpc/kvm/e500_mmu_host.c |   8 +-
 arch/powerpc/kvm/emulate_loadstore.c | 751 +++
 arch/powerpc/kvm/powerpc.c   | 299 +++---
 7 files changed, 498 insertions(+), 593 deletions(-)

-- 
1.8.3.1



[PATCH v3 29/29] KVM: PPC: Book3S PR: enable kvmppc_get/set_one_reg_pr() for HTM registers

2018-05-21 Thread wei . guo . simon
From: Simon Guo 

We need to be able to migrate PR KVM guests in the middle of a transaction,
and qemu uses the kvmppc_get_one_reg_pr()/kvmppc_set_one_reg_pr() APIs to
get/set the transaction checkpoint state. This patch adds support for that.

So far PPC PR qemu does not fully support migration, but savevm/loadvm can
be done against a RHEL 7.2 guest. During the savevm/loadvm procedure, the
KVM ioctls are invoked as well.

savevm/loadvm has been tested with a guest running an HTM test program:
https://github.com/justdoitqd/publicFiles/blob/master/test-tm-mig.c
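
For reference, a minimal userspace sketch (not part of the patch) of how the
checkpointed registers can be read through the standard KVM_GET_ONE_REG
ioctl on a powerpc host; vcpu_fd is assumed to be an already-created vcpu
file descriptor, and error handling is kept minimal:

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int get_one_reg(int vcpu_fd, uint64_t id, uint64_t *out)
{
        struct kvm_one_reg reg = {
                .id   = id,
                .addr = (uintptr_t)out,
        };

        return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}

/* read a couple of checkpointed values, e.g. while saving guest state */
void dump_tm_state(int vcpu_fd)
{
        uint64_t tfhar, gpr1_tm;

        if (!get_one_reg(vcpu_fd, KVM_REG_PPC_TFHAR, &tfhar) &&
            !get_one_reg(vcpu_fd, KVM_REG_PPC_TM_GPR(1), &gpr1_tm))
                printf("TFHAR=%#llx GPR1(tm)=%#llx\n",
                       (unsigned long long)tfhar,
                       (unsigned long long)gpr1_tm);
}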

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s_pr.c | 133 +++
 1 file changed, 133 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 8efc87b..7e76c4a 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1537,6 +1537,73 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, 
u64 id,
else
*val = get_reg_val(id, 0);
break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   case KVM_REG_PPC_TFHAR:
+   *val = get_reg_val(id, vcpu->arch.tfhar);
+   break;
+   case KVM_REG_PPC_TFIAR:
+   *val = get_reg_val(id, vcpu->arch.tfiar);
+   break;
+   case KVM_REG_PPC_TEXASR:
+   *val = get_reg_val(id, vcpu->arch.texasr);
+   break;
+   case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+   *val = get_reg_val(id,
+   vcpu->arch.gpr_tm[id-KVM_REG_PPC_TM_GPR0]);
+   break;
+   case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+   {
+   int i, j;
+
+   i = id - KVM_REG_PPC_TM_VSR0;
+   if (i < 32)
+   for (j = 0; j < TS_FPRWIDTH; j++)
+   val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
+   else {
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
+   val->vval = vcpu->arch.vr_tm.vr[i-32];
+   else
+   r = -ENXIO;
+   }
+   break;
+   }
+   case KVM_REG_PPC_TM_CR:
+   *val = get_reg_val(id, vcpu->arch.cr_tm);
+   break;
+   case KVM_REG_PPC_TM_XER:
+   *val = get_reg_val(id, vcpu->arch.xer_tm);
+   break;
+   case KVM_REG_PPC_TM_LR:
+   *val = get_reg_val(id, vcpu->arch.lr_tm);
+   break;
+   case KVM_REG_PPC_TM_CTR:
+   *val = get_reg_val(id, vcpu->arch.ctr_tm);
+   break;
+   case KVM_REG_PPC_TM_FPSCR:
+   *val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
+   break;
+   case KVM_REG_PPC_TM_AMR:
+   *val = get_reg_val(id, vcpu->arch.amr_tm);
+   break;
+   case KVM_REG_PPC_TM_PPR:
+   *val = get_reg_val(id, vcpu->arch.ppr_tm);
+   break;
+   case KVM_REG_PPC_TM_VRSAVE:
+   *val = get_reg_val(id, vcpu->arch.vrsave_tm);
+   break;
+   case KVM_REG_PPC_TM_VSCR:
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
+   *val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
+   else
+   r = -ENXIO;
+   break;
+   case KVM_REG_PPC_TM_DSCR:
+   *val = get_reg_val(id, vcpu->arch.dscr_tm);
+   break;
+   case KVM_REG_PPC_TM_TAR:
+   *val = get_reg_val(id, vcpu->arch.tar_tm);
+   break;
+#endif
default:
r = -EINVAL;
break;
@@ -1570,6 +1637,72 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, 
u64 id,
case KVM_REG_PPC_LPCR_64:
kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val));
break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   case KVM_REG_PPC_TFHAR:
+   vcpu->arch.tfhar = set_reg_val(id, *val);
+   break;
+   case KVM_REG_PPC_TFIAR:
+   vcpu->arch.tfiar = set_reg_val(id, *val);
+   break;
+   case KVM_REG_PPC_TEXASR:
+   vcpu->arch.texasr = set_reg_val(id, *val);
+   break;
+   case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+   vcpu->arch.gpr_tm[id - KVM_REG_PPC_TM_GPR0] =
+   set_reg_val(id, *val);
+   break;
+   case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+   {
+   int i, j;
+
+   i = id - KVM_REG_PPC_TM_VSR0;
+   if (i < 32)
+   for (j = 0; j < TS_FPRWIDTH; j++)
+   vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
+   else
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
+   vcpu->arch.vr_tm.vr[i-32] = val->vval;
+

[PATCH v3 28/29] KVM: PPC: remove load/put vcpu for KVM_GET_REGS/KVM_SET_REGS

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

In both HV and PR KVM, the KVM_SET_REGS/KVM_GET_REGS ioctls should be able
to run without loading the vcpu. This patch adds a KVM_SET_ONE_REG/
KVM_GET_ONE_REG implementation to the async ioctl function.

Since the vcpu mutex locking/unlocking has been moved out of
vcpu_load()/vcpu_put(), KVM_SET_REGS/KVM_GET_REGS no longer need to load
the vcpu in their ioctl path. This patch removes vcpu_load()/vcpu_put()
from the KVM_SET_REGS/KVM_GET_REGS ioctls.

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 97d4a11..523c68f 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -509,8 +509,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
 {
int i;
 
-   vcpu_load(vcpu);
-
regs->pc = kvmppc_get_pc(vcpu);
regs->cr = kvmppc_get_cr(vcpu);
regs->ctr = kvmppc_get_ctr(vcpu);
@@ -532,7 +530,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 
-   vcpu_put(vcpu);
return 0;
 }
 
@@ -540,8 +537,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
 {
int i;
 
-   vcpu_load(vcpu);
-
kvmppc_set_pc(vcpu, regs->pc);
kvmppc_set_cr(vcpu, regs->cr);
kvmppc_set_ctr(vcpu, regs->ctr);
@@ -562,7 +557,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 
-   vcpu_put(vcpu);
return 0;
 }
 
-- 
1.8.3.1



[PATCH v3 27/29] KVM: PPC: remove load/put vcpu for KVM_GET/SET_ONE_REG ioctl

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

Since the vcpu mutex locking/unlocking has been moved out of
vcpu_load()/vcpu_put(), KVM_GET_ONE_REG and KVM_SET_ONE_REG no longer need
to load the vcpu in their ioctl path. This patch removes
vcpu_load()/vcpu_put() from the KVM_GET_ONE_REG and KVM_SET_ONE_REG ioctls.

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/powerpc.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index c9098ff..5def68d 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1801,14 +1801,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
{
struct kvm_one_reg reg;
r = -EFAULT;
-   vcpu_load(vcpu);
if (copy_from_user(, argp, sizeof(reg)))
goto out;
if (ioctl == KVM_SET_ONE_REG)
r = kvm_vcpu_ioctl_set_one_reg(vcpu, );
else
r = kvm_vcpu_ioctl_get_one_reg(vcpu, );
-   vcpu_put(vcpu);
break;
}
 
-- 
1.8.3.1



[PATCH v3 26/29] KVM: PPC: move vcpu_load/vcpu_put down to each ioctl case in kvm_arch_vcpu_ioctl

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

Although we already have kvm_arch_vcpu_async_ioctl(), which doesn't require
the ioctl path to load the vcpu, the sync ioctl code needs to be cleaned up
for when CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL is not configured.

This patch moves vcpu_load()/vcpu_put() down into each ioctl switch case so
that each ioctl can decide independently whether to do vcpu_load()/vcpu_put().

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/powerpc.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1fa5bbe..c9098ff 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1783,16 +1783,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
void __user *argp = (void __user *)arg;
long r;
 
-   vcpu_load(vcpu);
-
switch (ioctl) {
case KVM_ENABLE_CAP:
{
struct kvm_enable_cap cap;
r = -EFAULT;
+   vcpu_load(vcpu);
if (copy_from_user(, argp, sizeof(cap)))
goto out;
r = kvm_vcpu_ioctl_enable_cap(vcpu, );
+   vcpu_put(vcpu);
break;
}
 
@@ -1801,12 +1801,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
{
struct kvm_one_reg reg;
r = -EFAULT;
+   vcpu_load(vcpu);
if (copy_from_user(, argp, sizeof(reg)))
goto out;
if (ioctl == KVM_SET_ONE_REG)
r = kvm_vcpu_ioctl_set_one_reg(vcpu, );
else
r = kvm_vcpu_ioctl_get_one_reg(vcpu, );
+   vcpu_put(vcpu);
break;
}
 
@@ -1814,9 +1816,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_DIRTY_TLB: {
struct kvm_dirty_tlb dirty;
r = -EFAULT;
+   vcpu_load(vcpu);
if (copy_from_user(, argp, sizeof(dirty)))
goto out;
r = kvm_vcpu_ioctl_dirty_tlb(vcpu, );
+   vcpu_put(vcpu);
break;
}
 #endif
@@ -1825,7 +1829,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
 
 out:
-   vcpu_put(vcpu);
return r;
 }
 
-- 
1.8.3.1



[PATCH v3 25/29] KVM: PPC: Book3S PR: enable HTM for PR KVM for KVM_CHECK_EXTENSION ioctl

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

With the current patch set, PR KVM now supports HTM, so this patch turns
the capability on for PR KVM.

Tested with:
https://github.com/justdoitqd/publicFiles/blob/master/test_kvm_htm_cap.c
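
For reference, a minimal userspace check (not part of the patch) of the
capability via KVM_CHECK_EXTENSION; whether it reports HTM as available
depends on the host advertising PPC_FEATURE2_HTM (or, for HV, the P9
TM assist feature):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm_fd = open("/dev/kvm", O_RDWR);
        int htm;

        if (kvm_fd < 0) {
                perror("open /dev/kvm");
                return 1;
        }

        /* non-zero means the capability is available on this host/KVM */
        htm = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_HTM);
        printf("KVM_CAP_PPC_HTM: %s\n", htm > 0 ? "available" : "not available");
        return 0;
}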

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/powerpc.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index bef27b1..1fa5bbe 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -648,9 +648,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 #endif
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
case KVM_CAP_PPC_HTM:
-   r = hv_enabled &&
-   (!!(cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_HTM) ||
-cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST));
+   r = !!(cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_HTM) ||
+(hv_enabled && cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST));
break;
 #endif
default:
-- 
1.8.3.1



[PATCH v3 24/29] KVM: PPC: Book3S PR: Support TAR handling for PR KVM HTM.

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

Currently the guest kernel doesn't handle the TAR facility-unavailable
interrupt and always runs with the TAR bit on, while PR KVM enables TAR
lazily. TAR is not a frequently used register and is not included in the
SVCPU struct.

Because of the above, the checkpointed TAR value might be bogus. To solve
this, we make the TAR bit of vcpu->arch.fscr consistent with shadow_fscr
when TM is enabled.

At the end of emulating treclaim., the correct TAR value needs to be loaded
into the register if the FSCR_TAR bit is on.
At the beginning of emulating trechkpt., TAR needs to be flushed so that
the right TAR value can be copied into tar_tm.

Tested with:
tools/testing/selftests/powerpc/tm/tm-tar
tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar (with the DSCR/PPR
related testing removed).

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_book3s.h |  2 ++
 arch/powerpc/kvm/book3s_emulate.c |  4 
 arch/powerpc/kvm/book3s_pr.c  | 21 -
 arch/powerpc/kvm/tm.S | 16 ++--
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 2940de7..1f345a0 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -271,6 +271,8 @@ static inline void kvmppc_save_tm_sprs(struct kvm_vcpu 
*vcpu) {}
 static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
 #endif
 
+void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
+
 extern int kvm_irq_bypass;
 
 static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 67d0fb40..fdbc695 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -173,6 +173,9 @@ static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, 
int ra_val)
guest_msr &= ~(MSR_TS_MASK);
kvmppc_set_msr(vcpu, guest_msr);
preempt_enable();
+
+   if (vcpu->arch.shadow_fscr & FSCR_TAR)
+   mtspr(SPRN_TAR, vcpu->arch.tar);
 }
 
 static void kvmppc_emulate_trchkpt(struct kvm_vcpu *vcpu)
@@ -185,6 +188,7 @@ static void kvmppc_emulate_trchkpt(struct kvm_vcpu *vcpu)
 * copy.
 */
kvmppc_giveup_ext(vcpu, MSR_VSX);
+   kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
kvmppc_copyto_vcpu_tm(vcpu);
kvmppc_save_tm_sprs(vcpu);
 
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 526c928..8efc87b 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -55,7 +55,7 @@
 
 static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 ulong msr);
-static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
+static int kvmppc_handle_fac(struct kvm_vcpu *vcpu, ulong fac);
 
 /* Some compatibility defines */
 #ifdef CONFIG_PPC_BOOK3S_32
@@ -346,6 +346,7 @@ void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu)
return;
}
 
+   kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
kvmppc_giveup_ext(vcpu, MSR_VSX);
 
preempt_disable();
@@ -357,8 +358,11 @@ void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu)
 {
if (!MSR_TM_ACTIVE(kvmppc_get_msr(vcpu))) {
kvmppc_restore_tm_sprs(vcpu);
-   if (kvmppc_get_msr(vcpu) & MSR_TM)
+   if (kvmppc_get_msr(vcpu) & MSR_TM) {
kvmppc_handle_lost_math_exts(vcpu);
+   if (vcpu->arch.fscr & FSCR_TAR)
+   kvmppc_handle_fac(vcpu, FSCR_TAR_LG);
+   }
return;
}
 
@@ -366,9 +370,11 @@ void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu)
_kvmppc_restore_tm_pr(vcpu, kvmppc_get_msr(vcpu));
preempt_enable();
 
-   if (kvmppc_get_msr(vcpu) & MSR_TM)
+   if (kvmppc_get_msr(vcpu) & MSR_TM) {
kvmppc_handle_lost_math_exts(vcpu);
-
+   if (vcpu->arch.fscr & FSCR_TAR)
+   kvmppc_handle_fac(vcpu, FSCR_TAR_LG);
+   }
 }
 #endif
 
@@ -819,7 +825,7 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
 }
 
 /* Give up facility (TAR / EBB / DSCR) */
-static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac)
+void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac)
 {
 #ifdef CONFIG_PPC_BOOK3S_64
if (!(vcpu->arch.shadow_fscr & (1ULL << fac))) {
@@ -1020,7 +1026,12 @@ void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr)
if ((vcpu->arch.fscr & FSCR_TAR) && !(fscr & FSCR_TAR)) {
/* TAR got dropped, drop it in shadow too */
kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
+   } else if (!(vcpu->arch.fscr & FSCR_TAR) && (fscr & FSCR_TAR)) {
+   vcpu->arch.fscr = fscr;
+   kvmppc_handle_fac(vcpu, FSCR_TAR_LG);
+   return;
}
+
vcpu->arch.fscr = fscr;
 }
 #endif
diff --git 

[PATCH v3 23/29] KVM: PPC: Book3S PR: add guard code to prevent returning to guest with PR=0 and Transactional state

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

Currently PR KVM doesn't support transactional memory in guest privileged
state.

This patch adds a check when setting the guest MSR so that we never return
to the guest with PR=0 and TS=0b10. A tabort. is emulated to indicate this
and fail the transaction immediately.

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/uapi/asm/tm.h |  2 +-
 arch/powerpc/kvm/book3s.h  |  6 ++
 arch/powerpc/kvm/book3s_emulate.c  |  2 +-
 arch/powerpc/kvm/book3s_pr.c   | 13 -
 4 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/uapi/asm/tm.h 
b/arch/powerpc/include/uapi/asm/tm.h
index e1bf0e2..e2947c9 100644
--- a/arch/powerpc/include/uapi/asm/tm.h
+++ b/arch/powerpc/include/uapi/asm/tm.h
@@ -13,7 +13,7 @@
 #define TM_CAUSE_TLBI  0xdc
 #define TM_CAUSE_FAC_UNAV  0xda
 #define TM_CAUSE_SYSCALL   0xd8
-#define TM_CAUSE_MISC  0xd6  /* future use */
+#define TM_CAUSE_PRIV_T0xd6
 #define TM_CAUSE_SIGNAL0xd4
 #define TM_CAUSE_ALIGNMENT 0xd2
 #define TM_CAUSE_EMULATE   0xd0
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
index 4ad5e28..14ef035 100644
--- a/arch/powerpc/kvm/book3s.h
+++ b/arch/powerpc/kvm/book3s.h
@@ -31,4 +31,10 @@ extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu 
*vcpu,
 extern int kvmppc_book3s_init_pr(void);
 extern void kvmppc_book3s_exit_pr(void);
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+extern void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val);
+#else
+static inline void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val) {}
+#endif
+
 #endif
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 34f910e..67d0fb40 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -199,7 +199,7 @@ static void kvmppc_emulate_trchkpt(struct kvm_vcpu *vcpu)
 }
 
 /* emulate tabort. at guest privilege state */
-static void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val)
+void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val)
 {
/* currently we only emulate tabort. but no emulation of other
 * tabort variants since there is no kernel usage of them at
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 5359f9c..526c928 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -446,12 +446,23 @@ static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned 
long hva, pte_t pte)
 
 static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
 {
-   ulong old_msr = kvmppc_get_msr(vcpu);
+   ulong old_msr;
 
 #ifdef EXIT_DEBUG
printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
 #endif
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   /* We should never target guest MSR to TS=10 && PR=0,
+* since we always fail transaction for guest privilege
+* state.
+*/
+   if (!(msr & MSR_PR) && MSR_TM_TRANSACTIONAL(msr))
+   kvmppc_emulate_tabort(vcpu,
+   TM_CAUSE_PRIV_T | TM_CAUSE_PERSISTENT);
+#endif
+
+   old_msr = kvmppc_get_msr(vcpu);
msr &= to_book3s(vcpu)->msr_mask;
kvmppc_set_msr_fast(vcpu, msr);
kvmppc_recalc_shadow_msr(vcpu);
-- 
1.8.3.1



[PATCH v3 22/29] KVM: PPC: Book3S PR: add emulation for tabort. for privilege guest

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

Currently a privileged guest runs with TM disabled.

Although a privileged guest cannot initiate a new transaction, it can use
tabort. to terminate its problem state's transaction, so it is still
necessary to emulate tabort. for privileged guests.

This patch adds emulation of tabort. for privileged guests.

Tested with:
https://github.com/justdoitqd/publicFiles/blob/master/test_tabort.c

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s_emulate.c | 68 +++
 1 file changed, 68 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index b7530cf..34f910e 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -50,6 +50,7 @@
 #define OP_31_XOP_SLBMFEE  915
 
 #define OP_31_XOP_TBEGIN   654
+#define OP_31_XOP_TABORT   910
 
 #define OP_31_XOP_TRECLAIM 942
 #define OP_31_XOP_TRCHKPT  1006
@@ -196,6 +197,47 @@ static void kvmppc_emulate_trchkpt(struct kvm_vcpu *vcpu)
kvmppc_restore_tm_pr(vcpu);
preempt_enable();
 }
+
+/* emulate tabort. at guest privilege state */
+static void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val)
+{
+   /* currently we only emulate tabort. but no emulation of other
+* tabort variants since there is no kernel usage of them at
+* present.
+*/
+   unsigned long guest_msr = kvmppc_get_msr(vcpu);
+
+   preempt_disable();
+   tm_enable();
+   tm_abort(ra_val);
+
+   /* CR0 = 0 | MSR[TS] | 0 */
+   vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
+   (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
+<< CR0_SHIFT);
+
+   vcpu->arch.texasr = mfspr(SPRN_TEXASR);
+   /* failure recording depends on Failure Summary bit,
+* and tabort will be treated as nops in non-transactional
+* state.
+*/
+   if (!(vcpu->arch.texasr & TEXASR_FS) &&
+   MSR_TM_ACTIVE(guest_msr)) {
+   vcpu->arch.texasr &= ~(TEXASR_PR | TEXASR_HV);
+   if (guest_msr & MSR_PR)
+   vcpu->arch.texasr |= TEXASR_PR;
+
+   if (guest_msr & MSR_HV)
+   vcpu->arch.texasr |= TEXASR_HV;
+
+   vcpu->arch.tfiar = kvmppc_get_pc(vcpu);
+   mtspr(SPRN_TEXASR, vcpu->arch.texasr);
+   mtspr(SPRN_TFIAR, vcpu->arch.tfiar);
+   }
+   tm_disable();
+   preempt_enable();
+}
+
 #endif
 
 int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
@@ -468,6 +510,32 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
emulated = EMULATE_FAIL;
break;
}
+   case OP_31_XOP_TABORT:
+   {
+   ulong guest_msr = kvmppc_get_msr(vcpu);
+   unsigned long ra_val = 0;
+
+   if (!cpu_has_feature(CPU_FTR_TM))
+   break;
+
+   if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+   kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   /* only emulate for privilege guest, since problem state
+* guest can run with TM enabled and we don't expect to
+* trap at here for that case.
+*/
+   WARN_ON(guest_msr & MSR_PR);
+
+   if (ra)
+   ra_val = kvmppc_get_gpr(vcpu, ra);
+
+   kvmppc_emulate_tabort(vcpu, ra_val);
+   break;
+   }
case OP_31_XOP_TRECLAIM:
{
ulong guest_msr = kvmppc_get_msr(vcpu);
-- 
1.8.3.1



[PATCH v3 21/29] KVM: PPC: Book3S PR: add emulation for trechkpt in PR KVM.

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

This patch adds host-side emulation for when a PR KVM guest executes
"trechkpt.", which is a privileged instruction and traps into the host.

We first copy the vcpu's current register content into the vcpu TM
checkpoint area, then perform kvmppc_restore_tm_pr() to do the trechkpt.
with the updated vcpu TM checkpoint values.

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_book3s.h |  2 ++
 arch/powerpc/kvm/book3s_emulate.c | 61 +++
 arch/powerpc/kvm/book3s_pr.c  |  2 +-
 3 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index c1cea82..2940de7 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -262,10 +262,12 @@ extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned 
long lpcr,
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu);
 void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu);
+void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu);
 void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu);
 #else
 static inline void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu) {}
 static inline void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {}
 static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 04c29e0..b7530cf 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -52,6 +52,7 @@
 #define OP_31_XOP_TBEGIN   654
 
 #define OP_31_XOP_TRECLAIM 942
+#define OP_31_XOP_TRCHKPT  1006
 
 /* DCBZ is actually 1014, but we patch it to 1010 so we get a trap */
 #define OP_31_XOP_DCBZ 1010
@@ -172,6 +173,29 @@ static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, 
int ra_val)
kvmppc_set_msr(vcpu, guest_msr);
preempt_enable();
 }
+
+static void kvmppc_emulate_trchkpt(struct kvm_vcpu *vcpu)
+{
+   unsigned long guest_msr = kvmppc_get_msr(vcpu);
+
+   preempt_disable();
+   /*
+* need flush FP/VEC/VSX to vcpu save area before
+* copy.
+*/
+   kvmppc_giveup_ext(vcpu, MSR_VSX);
+   kvmppc_copyto_vcpu_tm(vcpu);
+   kvmppc_save_tm_sprs(vcpu);
+
+   /*
+* as a result of trecheckpoint. set TS to suspended.
+*/
+   guest_msr &= ~(MSR_TS_MASK);
+   guest_msr |= MSR_TS_S;
+   kvmppc_set_msr(vcpu, guest_msr);
+   kvmppc_restore_tm_pr(vcpu);
+   preempt_enable();
+}
 #endif
 
 int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
@@ -478,6 +502,43 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
kvmppc_emulate_treclaim(vcpu, ra_val);
break;
}
+   case OP_31_XOP_TRCHKPT:
+   {
+   ulong guest_msr = kvmppc_get_msr(vcpu);
+   unsigned long texasr;
+
+   if (!cpu_has_feature(CPU_FTR_TM))
+   break;
+
+   if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+   kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   /* generate interrupt based on priorities */
+   if (guest_msr & MSR_PR) {
+   /* Privileged Instruction type Program Intr */
+   kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   tm_enable();
+   texasr = mfspr(SPRN_TEXASR);
+   tm_disable();
+
+   if (MSR_TM_ACTIVE(guest_msr) ||
+   !(texasr & (TEXASR_FS))) {
+   /* TM bad thing interrupt */
+   kvmppc_core_queue_program(vcpu, SRR1_PROGTM);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   kvmppc_emulate_trchkpt(vcpu);
+   break;
+   }
 #endif
default:
emulated = EMULATE_FAIL;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 9a72460..5359f9c 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -299,7 +299,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
 }
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu)
+void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu)
 {

[PATCH v3 20/29] KVM: PPC: Book3S PR: adds emulation for treclaim.

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

This patch adds support for "treclaim." emulation when the PR KVM guest
executes treclaim. and traps into the host.

We first do the treclaim. and save the TM checkpoint. Then it is
necessary to update the vcpu's current register content with the
checkpointed values. When we rfid into the guest again, that current
register content (now the checkpointed values) will be loaded into the
registers.

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s_emulate.c | 76 +++
 1 file changed, 76 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 570339b..04c29e0 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -51,6 +51,8 @@
 
 #define OP_31_XOP_TBEGIN   654
 
+#define OP_31_XOP_TRECLAIM 942
+
 /* DCBZ is actually 1014, but we patch it to 1010 so we get a trap */
 #define OP_31_XOP_DCBZ 1010
 
@@ -130,6 +132,46 @@ static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu 
*vcpu)
vcpu->arch.vrsave = vcpu->arch.vrsave_tm;
 }
 
+static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, int ra_val)
+{
+   unsigned long guest_msr = kvmppc_get_msr(vcpu);
+   int fc_val = ra_val ? ra_val : 1;
+
+   /* CR0 = 0 | MSR[TS] | 0 */
+   vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
+   (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
+<< CR0_SHIFT);
+
+   preempt_disable();
+   kvmppc_save_tm_pr(vcpu);
+   kvmppc_copyfrom_vcpu_tm(vcpu);
+
+   tm_enable();
+   vcpu->arch.texasr = mfspr(SPRN_TEXASR);
+   /* failure recording depends on Failure Summary bit */
+   if (!(vcpu->arch.texasr & TEXASR_FS)) {
+   vcpu->arch.texasr &= ~TEXASR_FC;
+   vcpu->arch.texasr |= ((u64)fc_val << TEXASR_FC_LG);
+
+   vcpu->arch.texasr &= ~(TEXASR_PR | TEXASR_HV);
+   if (kvmppc_get_msr(vcpu) & MSR_PR)
+   vcpu->arch.texasr |= TEXASR_PR;
+
+   if (kvmppc_get_msr(vcpu) & MSR_HV)
+   vcpu->arch.texasr |= TEXASR_HV;
+
+   vcpu->arch.tfiar = kvmppc_get_pc(vcpu);
+   mtspr(SPRN_TEXASR, vcpu->arch.texasr);
+   mtspr(SPRN_TFIAR, vcpu->arch.tfiar);
+   }
+   tm_disable();
+   /*
+* treclaim need quit to non-transactional state.
+*/
+   guest_msr &= ~(MSR_TS_MASK);
+   kvmppc_set_msr(vcpu, guest_msr);
+   preempt_enable();
+}
 #endif
 
 int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
@@ -402,6 +444,40 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
emulated = EMULATE_FAIL;
break;
}
+   case OP_31_XOP_TRECLAIM:
+   {
+   ulong guest_msr = kvmppc_get_msr(vcpu);
+   unsigned long ra_val = 0;
+
+   if (!cpu_has_feature(CPU_FTR_TM))
+   break;
+
+   if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+   kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   /* generate interrupts based on priorities */
+   if (guest_msr & MSR_PR) {
+   /* Privileged Instruction type Program 
Interrupt */
+   kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   if (!MSR_TM_ACTIVE(guest_msr)) {
+   /* TM bad thing interrupt */
+   kvmppc_core_queue_program(vcpu, SRR1_PROGTM);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   if (ra)
+   ra_val = kvmppc_get_gpr(vcpu, ra);
+   kvmppc_emulate_treclaim(vcpu, ra_val);
+   break;
+   }
 #endif
default:
emulated = EMULATE_FAIL;
-- 
1.8.3.1



[PATCH v3 19/29] KVM: PPC: Book3S PR: enable NV reg restore for reading TM SPR at guest privilege state

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

Currently kvmppc_handle_fac() does not update NV GPRs and thus it can
return with RESUME_GUEST.

However, a PR KVM guest always has the MSR_TM bit disabled in privileged
state. If the privileged guest tries to read TM SPRs, it triggers a TM
facility unavailable exception and falls into kvmppc_handle_fac(). The
emulation is then done by kvmppc_core_emulate_mfspr_pr(). The mfspr
instruction can name an NV register as RT, so it is necessary to restore
the NV GPRs in this case to reflect the update to the NV RT.

This patch makes kvmppc_handle_fac() return RESUME_GUEST_NV for the TM
facility exception taken in guest privileged state.

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_pr.c | 15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 9becca1..9a72460 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -989,6 +989,18 @@ static int kvmppc_handle_fac(struct kvm_vcpu *vcpu, ulong 
fac)
break;
}
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   /* Since we disabled MSR_TM at privilege state, the mfspr instruction
+* for TM spr can trigger TM fac unavailable. In this case, the
+* emulation is handled by kvmppc_emulate_fac(), which invokes
+* kvmppc_emulate_mfspr() finally. But note the mfspr can include
+* RT for NV registers. So it need to restore those NV reg to reflect
+* the update.
+*/
+   if ((fac == FSCR_TM_LG) && !(kvmppc_get_msr(vcpu) & MSR_PR))
+   return RESUME_GUEST_NV;
+#endif
+
return RESUME_GUEST;
 }
 
@@ -1350,8 +1362,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
}
 #ifdef CONFIG_PPC_BOOK3S_64
case BOOK3S_INTERRUPT_FAC_UNAVAIL:
-   kvmppc_handle_fac(vcpu, vcpu->arch.shadow_fscr >> 56);
-   r = RESUME_GUEST;
+   r = kvmppc_handle_fac(vcpu, vcpu->arch.shadow_fscr >> 56);
break;
 #endif
case BOOK3S_INTERRUPT_MACHINE_CHECK:
-- 
1.8.3.1



[PATCH v3 18/29] KVM: PPC: Book3S PR: always fail transaction in guest privilege state

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

Currently the kernel doesn't use transactional memory. And there is an
issue for the privileged guest: the tbegin/tsuspend/tresume/tabort TM
instructions can change the MSR TM bits without trapping into the PR
host, so the following code leads to a false mfmsr result:
	tbegin	<- MSR bits updated to Transactional active.
	beq	<- failure handler branch
	mfmsr	<- still reads MSR bits from the magic page with
		   transaction state inactive.

It is not an issue for the non-privileged guest, since its mfmsr is not
patched with the magic page and will always trap into the PR host.

This patch always fails the tbegin attempt for the privileged guest, so
that the above issue is prevented. It is benign since currently the
(guest) kernel doesn't initiate a transaction.

Test case:
https://github.com/justdoitqd/publicFiles/blob/master/test_tbegin_pr.c

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_book3s.h |  2 ++
 arch/powerpc/kvm/book3s_emulate.c | 40 +++
 arch/powerpc/kvm/book3s_pr.c  | 11 +-
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 43e8bb1..c1cea82 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -262,9 +262,11 @@ extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned 
long lpcr,
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu);
 void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu);
+void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu);
 #else
 static inline void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu) {}
 static inline void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
 #endif
 
 extern int kvm_irq_bypass;
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index c4e3ec6..570339b 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "book3s.h"
 #include 
 
@@ -48,6 +49,8 @@
 #define OP_31_XOP_EIOIO854
 #define OP_31_XOP_SLBMFEE  915
 
+#define OP_31_XOP_TBEGIN   654
+
 /* DCBZ is actually 1014, but we patch it to 1010 so we get a trap */
 #define OP_31_XOP_DCBZ 1010
 
@@ -363,6 +366,43 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
 
break;
}
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   case OP_31_XOP_TBEGIN:
+   {
+   if (!cpu_has_feature(CPU_FTR_TM))
+   break;
+
+   if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+   kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   if (!(kvmppc_get_msr(vcpu) & MSR_PR)) {
+   preempt_disable();
+   vcpu->arch.cr = (CR0_TBEGIN_FAILURE |
+ (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)));
+
+   vcpu->arch.texasr = (TEXASR_FS | TEXASR_EXACT |
+   (((u64)(TM_CAUSE_EMULATE | 
TM_CAUSE_PERSISTENT))
+<< TEXASR_FC_LG));
+
+   if ((inst >> 21) & 0x1)
+   vcpu->arch.texasr |= TEXASR_ROT;
+
+   if (kvmppc_get_msr(vcpu) & MSR_HV)
+   vcpu->arch.texasr |= TEXASR_HV;
+
+   vcpu->arch.tfhar = kvmppc_get_pc(vcpu) + 4;
+   vcpu->arch.tfiar = kvmppc_get_pc(vcpu);
+
+   kvmppc_restore_tm_sprs(vcpu);
+   preempt_enable();
+   } else
+   emulated = EMULATE_FAIL;
+   break;
+   }
+#endif
default:
emulated = EMULATE_FAIL;
}
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index e8e7f3a..9becca1 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -207,6 +207,15 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_PPC_BOOK3S_64
smsr |= MSR_ISF | MSR_HV;
 #endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   /*
+* in guest privileged state, we want to fail all TM transactions.
+* So disable MSR TM bit so that all tbegin. will be able to be
+* trapped into host.
+*/
+   if (!(guest_msr & MSR_PR))
+   smsr &= ~MSR_TM;
+#endif
vcpu->arch.shadow_msr = 

[PATCH v3 17/29] KVM: PPC: Book3S PR: make mtspr/mfspr emulation behavior based on active TM SPRs

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

mfspr/mtspr on the TM SPRs (TEXASR/TFIAR/TFHAR) are non-privileged
instructions and can be executed by the PR KVM guest in problem state
without trapping into the host. We only emulate mtspr/mfspr of
texasr/tfiar/tfhar in guest PR=0 state.

When we emulate mtspr of the TM SPRs in guest PR=0 state, the emulation
result needs to be visible to the guest PR=1 state. That is, the actual
TM SPR value should be loaded into the actual registers.

We already flush the TM SPRs into the vcpu when switching out of the
CPU, and load the TM SPRs when switching back.

This patch corrects the mfspr()/mtspr() emulation for TM SPRs so that
the actual source/destination is based on the actual TM SPRs.

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_book3s.h |  1 +
 arch/powerpc/kvm/book3s_emulate.c | 58 +--
 arch/powerpc/kvm/book3s_pr.c  |  2 +-
 3 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index fc15ad9..43e8bb1 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -210,6 +210,7 @@ extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
 extern void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
  unsigned int vec);
 extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
+extern void kvmppc_trigger_fac_interrupt(struct kvm_vcpu *vcpu, ulong fac);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
   bool upper, u32 val);
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index f81a921..c4e3ec6 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include "book3s.h"
+#include 
 
 #define OP_19_XOP_RFID 18
 #define OP_19_XOP_RFI  50
@@ -523,13 +524,38 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, 
int sprn, ulong spr_val)
break;
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
case SPRN_TFHAR:
-   vcpu->arch.tfhar = spr_val;
-   break;
case SPRN_TEXASR:
-   vcpu->arch.texasr = spr_val;
-   break;
case SPRN_TFIAR:
-   vcpu->arch.tfiar = spr_val;
+   if (!cpu_has_feature(CPU_FTR_TM))
+   break;
+
+   if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+   kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   if (MSR_TM_ACTIVE(kvmppc_get_msr(vcpu)) &&
+   !((MSR_TM_SUSPENDED(kvmppc_get_msr(vcpu))) &&
+   (sprn == SPRN_TFHAR))) {
+   /* it is illegal to mtspr() TM regs in
+* other than non-transactional state, with
+* the exception of TFHAR in suspend state.
+*/
+   kvmppc_core_queue_program(vcpu, SRR1_PROGTM);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   tm_enable();
+   if (sprn == SPRN_TFHAR)
+   mtspr(SPRN_TFHAR, spr_val);
+   else if (sprn == SPRN_TEXASR)
+   mtspr(SPRN_TEXASR, spr_val);
+   else
+   mtspr(SPRN_TFIAR, spr_val);
+   tm_disable();
+
break;
 #endif
 #endif
@@ -676,13 +702,25 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, 
int sprn, ulong *spr_val
break;
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
case SPRN_TFHAR:
-   *spr_val = vcpu->arch.tfhar;
-   break;
case SPRN_TEXASR:
-   *spr_val = vcpu->arch.texasr;
-   break;
case SPRN_TFIAR:
-   *spr_val = vcpu->arch.tfiar;
+   if (!cpu_has_feature(CPU_FTR_TM))
+   break;
+
+   if (!(kvmppc_get_msr(vcpu) & MSR_TM)) {
+   kvmppc_trigger_fac_interrupt(vcpu, FSCR_TM_LG);
+   emulated = EMULATE_AGAIN;
+   break;
+   }
+
+   tm_enable();
+   if (sprn == SPRN_TFHAR)
+   *spr_val = mfspr(SPRN_TFHAR);
+   else if (sprn == SPRN_TEXASR)
+   *spr_val = mfspr(SPRN_TEXASR);
+   else if (sprn == SPRN_TFIAR)
+   *spr_val = mfspr(SPRN_TFIAR);
+   tm_disable();
break;
 #endif
 #endif
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 4b81b3c..e8e7f3a 100644
--- 

[PATCH v3 16/29] KVM: PPC: Book3S PR: add math support for PR KVM HTM

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

The math registers will be saved into vcpu->arch.fp/vr and the
corresponding vcpu->arch.fp_tm/vr_tm areas.

We flush or give up the math registers into vcpu->arch.fp/vr before
saving the transaction. After the transaction is restored, the math
registers will be loaded back into the registers.

If there is a FP/VEC/VSX unavailable exception while the transaction is
active, the math checkpoint content might be incorrect and we would need
a treclaim./load the correct checkpoint value/trechkpt. sequence to retry
the transaction. That would make the solution complicated. To avoid this,
we always make the hardware guest MSR math bits (shadow_msr) consistent
with the MSR value which the guest sees (kvmppc_get_msr()) when the guest
MSR has TM enabled. Then all FP/VEC/VSX unavailable exceptions can be
delivered to the guest, and the guest handles them by itself.

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s_pr.c | 35 +++
 1 file changed, 35 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 226bae7..4b81b3c 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -308,6 +308,28 @@ static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu 
*vcpu)
tm_disable();
 }
 
+/* loadup math bits which is enabled at kvmppc_get_msr() but not enabled at
+ * hardware.
+ */
+static void kvmppc_handle_lost_math_exts(struct kvm_vcpu *vcpu)
+{
+   ulong exit_nr;
+   ulong ext_diff = (kvmppc_get_msr(vcpu) & ~vcpu->arch.guest_owned_ext) &
+   (MSR_FP | MSR_VEC | MSR_VSX);
+
+   if (!ext_diff)
+   return;
+
+   if (ext_diff == MSR_FP)
+   exit_nr = BOOK3S_INTERRUPT_FP_UNAVAIL;
+   else if (ext_diff == MSR_VEC)
+   exit_nr = BOOK3S_INTERRUPT_ALTIVEC;
+   else
+   exit_nr = BOOK3S_INTERRUPT_VSX;
+
+   kvmppc_handle_ext(vcpu, exit_nr, ext_diff);
+}
+
 void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu)
 {
if (!(MSR_TM_ACTIVE(kvmppc_get_msr(vcpu)))) {
@@ -315,6 +337,8 @@ void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu)
return;
}
 
+   kvmppc_giveup_ext(vcpu, MSR_VSX);
+
preempt_disable();
_kvmppc_save_tm_pr(vcpu, mfmsr());
preempt_enable();
@@ -324,12 +348,18 @@ void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu)
 {
if (!MSR_TM_ACTIVE(kvmppc_get_msr(vcpu))) {
kvmppc_restore_tm_sprs(vcpu);
+   if (kvmppc_get_msr(vcpu) & MSR_TM)
+   kvmppc_handle_lost_math_exts(vcpu);
return;
}
 
preempt_disable();
_kvmppc_restore_tm_pr(vcpu, kvmppc_get_msr(vcpu));
preempt_enable();
+
+   if (kvmppc_get_msr(vcpu) & MSR_TM)
+   kvmppc_handle_lost_math_exts(vcpu);
+
 }
 #endif
 
@@ -468,6 +498,11 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 
msr)
/* Preload FPU if it's enabled */
if (kvmppc_get_msr(vcpu) & MSR_FP)
kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   if (kvmppc_get_msr(vcpu) & MSR_TM)
+   kvmppc_handle_lost_math_exts(vcpu);
+#endif
 }
 
 void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
-- 
1.8.3.1



[PATCH v3 15/29] KVM: PPC: Book3S PR: add transaction memory save/restore skeleton for PR KVM

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

The transactional memory checkpoint area save/restore behavior is
triggered when the VCPU qemu process is switched out of/into the CPU,
i.e. at kvmppc_core_vcpu_put_pr() and kvmppc_core_vcpu_load_pr().

MSR TM active state is determined by the TS bits:
active: 10 (transactional) or 01 (suspended)
inactive: 00 (non-transactional)
We don't "fake" TM functionality for the guest. We "sync" the guest
virtual MSR TM active state (10 or 01) with the shadow MSR. That is to
say, we don't emulate a transactional guest with a TM inactive MSR.

TM SPR support (TFHAR/TFIAR/TEXASR) has already been added by
commit 9916d57e64a4 ("KVM: PPC: Book3S PR: Expose TM registers").
Math register support (FPR/VMX/VSX) will be done in a subsequent
patch.

Whether the TM context needs to be saved/restored can be determined
by the kvmppc_get_msr() TM active state (a sketch follows below):
* TM active - save/restore the TM context
* TM inactive - no need to do so; only save/restore the TM SPRs.
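For reference, a minimal sketch of how the TS states above are usually
tested in C (paraphrasing the MSR_TS_*/MSR_TM_* helpers in
arch/powerpc/include/asm/reg.h; illustration only, not part of this
patch):

	/* TS is a 2-bit MSR field: 10 = transactional, 01 = suspended */
	#define TS_TRANSACTIONAL(msr)	(((msr) & MSR_TS_MASK) == MSR_TS_T)
	#define TS_SUSPENDED(msr)	(((msr) & MSR_TS_MASK) == MSR_TS_S)
	#define TS_ACTIVE(msr)		(((msr) & MSR_TS_MASK) != 0)

	if (TS_ACTIVE(kvmppc_get_msr(vcpu)))
		;	/* save/restore the whole TM checkpoint */
	else
		;	/* only save/restore the TM SPRs */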

Signed-off-by: Simon Guo 
Suggested-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_book3s.h |  9 +
 arch/powerpc/include/asm/kvm_host.h   |  1 -
 arch/powerpc/kvm/book3s_pr.c  | 27 +++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 20d3d5a..fc15ad9 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -257,6 +257,15 @@ extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned 
long lpcr,
 extern int kvmppc_hcall_impl_hv_realmode(unsigned long cmd);
 extern void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu);
 extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu);
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu);
+void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu);
+#else
+static inline void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu) {}
+#endif
+
 extern int kvm_irq_bypass;
 
 static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 89f44ec..60325af 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -621,7 +621,6 @@ struct kvm_vcpu_arch {
 
struct thread_vr_state vr_tm;
u32 vrsave_tm; /* also USPRG0 */
-
 #endif
 
 #ifdef CONFIG_KVM_EXIT_TIMING
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 7d4905a..226bae7 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -43,6 +43,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "book3s.h"
 
@@ -115,6 +116,8 @@ static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, 
int cpu)
 
if (kvmppc_is_split_real(vcpu))
kvmppc_fixup_split_real(vcpu);
+
+   kvmppc_restore_tm_pr(vcpu);
 }
 
 static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu)
@@ -134,6 +137,7 @@ static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu)
 
kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
+   kvmppc_save_tm_pr(vcpu);
 
/* Enable AIL if supported */
if (cpu_has_feature(CPU_FTR_HVMODE) &&
@@ -304,6 +308,29 @@ static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu 
*vcpu)
tm_disable();
 }
 
+void kvmppc_save_tm_pr(struct kvm_vcpu *vcpu)
+{
+   if (!(MSR_TM_ACTIVE(kvmppc_get_msr(vcpu)))) {
+   kvmppc_save_tm_sprs(vcpu);
+   return;
+   }
+
+   preempt_disable();
+   _kvmppc_save_tm_pr(vcpu, mfmsr());
+   preempt_enable();
+}
+
+void kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu)
+{
+   if (!MSR_TM_ACTIVE(kvmppc_get_msr(vcpu))) {
+   kvmppc_restore_tm_sprs(vcpu);
+   return;
+   }
+
+   preempt_disable();
+   _kvmppc_restore_tm_pr(vcpu, kvmppc_get_msr(vcpu));
+   preempt_enable();
+}
 #endif
 
 static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)
-- 
1.8.3.1



[PATCH v3 14/29] KVM: PPC: Book3S PR: add kvmppc_save/restore_tm_sprs() APIs

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

This patch adds 2 new APIs kvmppc_save_tm_sprs()/kvmppc_restore_tm_sprs()
for the purpose of TEXASR/TFIAR/TFHAR save/restore.

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_pr.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index f2ae5a3..7d4905a 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -42,6 +42,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "book3s.h"
 
@@ -284,6 +285,27 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
svcpu_put(svcpu);
 }
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu)
+{
+   tm_enable();
+   vcpu->arch.tfhar = mfspr(SPRN_TFHAR);
+   vcpu->arch.texasr = mfspr(SPRN_TEXASR);
+   vcpu->arch.tfiar = mfspr(SPRN_TFIAR);
+   tm_disable();
+}
+
+static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu)
+{
+   tm_enable();
+   mtspr(SPRN_TFHAR, vcpu->arch.tfhar);
+   mtspr(SPRN_TEXASR, vcpu->arch.texasr);
+   mtspr(SPRN_TFIAR, vcpu->arch.tfiar);
+   tm_disable();
+}
+
+#endif
+
 static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)
 {
int r = 1; /* Indicate we want to get back into the guest */
-- 
1.8.3.1



[PATCH v3 13/29] KVM: PPC: Book3S PR: adds new kvmppc_copyto_vcpu_tm/kvmppc_copyfrom_vcpu_tm API for PR KVM.

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

This patch adds 2 new APIs: kvmppc_copyto_vcpu_tm() and
kvmppc_copyfrom_vcpu_tm().  These 2 APIs will be used to copy TM data
between the VCPU_TM and VCPU areas.

PR KVM will use these APIs for treclaim. or trchkpt. emulation.

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s_emulate.c | 41 +++
 1 file changed, 41 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 2eb457b..f81a921 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -87,6 +87,47 @@ static bool spr_allowed(struct kvm_vcpu *vcpu, enum 
priv_level level)
return true;
 }
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static inline void kvmppc_copyto_vcpu_tm(struct kvm_vcpu *vcpu)
+{
+   memcpy(>arch.gpr_tm[0], >arch.regs.gpr[0],
+   sizeof(vcpu->arch.gpr_tm));
+   memcpy(>arch.fp_tm, >arch.fp,
+   sizeof(struct thread_fp_state));
+   memcpy(>arch.vr_tm, >arch.vr,
+   sizeof(struct thread_vr_state));
+   vcpu->arch.ppr_tm = vcpu->arch.ppr;
+   vcpu->arch.dscr_tm = vcpu->arch.dscr;
+   vcpu->arch.amr_tm = vcpu->arch.amr;
+   vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
+   vcpu->arch.tar_tm = vcpu->arch.tar;
+   vcpu->arch.lr_tm = vcpu->arch.regs.link;
+   vcpu->arch.cr_tm = vcpu->arch.cr;
+   vcpu->arch.xer_tm = vcpu->arch.regs.xer;
+   vcpu->arch.vrsave_tm = vcpu->arch.vrsave;
+}
+
+static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu *vcpu)
+{
+   memcpy(>arch.regs.gpr[0], >arch.gpr_tm[0],
+   sizeof(vcpu->arch.regs.gpr));
+   memcpy(>arch.fp, >arch.fp_tm,
+   sizeof(struct thread_fp_state));
+   memcpy(>arch.vr, >arch.vr_tm,
+   sizeof(struct thread_vr_state));
+   vcpu->arch.ppr = vcpu->arch.ppr_tm;
+   vcpu->arch.dscr = vcpu->arch.dscr_tm;
+   vcpu->arch.amr = vcpu->arch.amr_tm;
+   vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
+   vcpu->arch.tar = vcpu->arch.tar_tm;
+   vcpu->arch.regs.link = vcpu->arch.lr_tm;
+   vcpu->arch.cr = vcpu->arch.cr_tm;
+   vcpu->arch.regs.xer = vcpu->arch.xer_tm;
+   vcpu->arch.vrsave = vcpu->arch.vrsave_tm;
+}
+
+#endif
+
 int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
  unsigned int inst, int *advance)
 {
-- 
1.8.3.1



[PATCH v3 12/29] KVM: PPC: Book3S PR: prevent TS bits change in kvmppc_interrupt_pr()

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

A PR KVM host is usually equipped with TM enabled in its host MSR value,
and with a non-transactional TS value.

When a guest with TM active traps into the PR KVM host, the rfid at the
tail of kvmppc_interrupt_pr() will try to switch the TS bits from
S0 (Suspended & TM disabled) to N1 (Non-transactional & TM enabled).

That leads to a TM Bad Thing interrupt.

This patch manually keeps the target TS bits unchanged to avoid this
exception, as sketched below.
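For illustration, a rough C-level equivalent of what the added
rldicl/rldimi pair computes (a sketch only; the real change is the
assembly hunk below):

	/*
	 * Copy the current MSR[TS] field into the MSR image that rfid
	 * will load, so the transaction state cannot change across rfid.
	 */
	host_msr = (host_msr & ~MSR_TS_MASK) | (mfmsr() & MSR_TS_MASK);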

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_segment.S | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_segment.S 
b/arch/powerpc/kvm/book3s_segment.S
index 93a180c..98ccc7e 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -383,6 +383,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 */
 
PPC_LL  r6, HSTATE_HOST_MSR(r13)
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   /*
+* We don't want to change MSR[TS] bits via rfi here.
+* The actual TM handling logic will be in host with
+* recovered DR/IR bits after HSTATE_VMHANDLER.
+* And MSR_TM can be enabled in HOST_MSR so rfid may
+* not suppress this change and can lead to exception.
+* Manually set MSR to prevent TS state change here.
+*/
+   mfmsr   r7
+   rldicl  r7, r7, 64 - MSR_TS_S_LG, 62
+   rldimi  r6, r7, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+#endif
PPC_LL  r8, HSTATE_VMHANDLER(r13)
 
 #ifdef CONFIG_PPC64
-- 
1.8.3.1



[PATCH v3 11/29] KVM: PPC: Book3S PR: implement RFID TM behavior to suppress change from S0 to N0

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

According to the ISA specification for RFID, in MSR TM disabled and TS
suspended state (S0), if the target MSR is TM disabled and the TS state
is inactive (N0), rfid should suppress this update.

This patch makes the RFID emulation of PR KVM consistent with this.

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_emulate.c | 21 +++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 68d6898..2eb457b 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -117,11 +117,28 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
case 19:
switch (get_xop(inst)) {
case OP_19_XOP_RFID:
-   case OP_19_XOP_RFI:
+   case OP_19_XOP_RFI: {
+   unsigned long srr1 = kvmppc_get_srr1(vcpu);
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   unsigned long cur_msr = kvmppc_get_msr(vcpu);
+
+   /*
+* add rules to fit in ISA specification regarding TM
+* state transistion in TM disable/Suspended state,
+* and target TM state is TM inactive(00) state. (the
+* change should be suppressed).
+*/
+   if (((cur_msr & MSR_TM) == 0) &&
+   ((srr1 & MSR_TM) == 0) &&
+   MSR_TM_SUSPENDED(cur_msr) &&
+   !MSR_TM_ACTIVE(srr1))
+   srr1 |= MSR_TS_S;
+#endif
kvmppc_set_pc(vcpu, kvmppc_get_srr0(vcpu));
-   kvmppc_set_msr(vcpu, kvmppc_get_srr1(vcpu));
+   kvmppc_set_msr(vcpu, srr1);
*advance = 0;
break;
+   }
 
default:
emulated = EMULATE_FAIL;
-- 
1.8.3.1



[PATCH v3 10/29] KVM: PPC: Book3S PR: Sync TM bits to shadow msr for problem state guest

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

The MSR TS bits can be modified by non-privileged instructions like
tbegin./tend.  That means the guest can change the MSR value "silently",
without notifying the host.

It is necessary to sync the TM bits to the host so that the host can
calculate the shadow MSR correctly.

Note that a privileged guest will always fail transactions, so we only
take care of the problem state guest.

The logic is put into kvmppc_copy_from_svcpu() so that
kvmppc_handle_exit_pr() can use the correct MSR TM bits even when
preemption occurs.

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s_pr.c | 73 ++--
 1 file changed, 50 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index d3237f5..f2ae5a3 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -182,10 +182,36 @@ void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu)
svcpu_put(svcpu);
 }
 
+static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
+{
+   ulong guest_msr = kvmppc_get_msr(vcpu);
+   ulong smsr = guest_msr;
+
+   /* Guest MSR values */
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE |
+   MSR_TM | MSR_TS_MASK;
+#else
+   smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE;
+#endif
+   /* Process MSR values */
+   smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
+   /* External providers the guest reserved */
+   smsr |= (guest_msr & vcpu->arch.guest_owned_ext);
+   /* 64-bit Process MSR values */
+#ifdef CONFIG_PPC_BOOK3S_64
+   smsr |= MSR_ISF | MSR_HV;
+#endif
+   vcpu->arch.shadow_msr = smsr;
+}
+
 /* Copy data touched by real-mode code from shadow vcpu back to vcpu */
 void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
 {
struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   ulong old_msr;
+#endif
 
/*
 * Maybe we were already preempted and synced the svcpu from
@@ -228,6 +254,30 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
to_book3s(vcpu)->vtb += get_vtb() - vcpu->arch.entry_vtb;
if (cpu_has_feature(CPU_FTR_ARCH_207S))
vcpu->arch.ic += mfspr(SPRN_IC) - vcpu->arch.entry_ic;
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   /*
+* Unlike other MSR bits, MSR[TS]bits can be changed at guest without
+* notifying host:
+*  modified by unprivileged instructions like "tbegin"/"tend"/
+* "tresume"/"tsuspend" in PR KVM guest.
+*
+* It is necessary to sync here to calculate a correct shadow_msr.
+*
+* privileged guest's tbegin will be failed at present. So we
+* only take care of problem state guest.
+*/
+   old_msr = kvmppc_get_msr(vcpu);
+   if (unlikely((old_msr & MSR_PR) &&
+   (vcpu->arch.shadow_srr1 & (MSR_TS_MASK)) !=
+   (old_msr & (MSR_TS_MASK {
+   old_msr &= ~(MSR_TS_MASK);
+   old_msr |= (vcpu->arch.shadow_srr1 & (MSR_TS_MASK));
+   kvmppc_set_msr_fast(vcpu, old_msr);
+   kvmppc_recalc_shadow_msr(vcpu);
+   }
+#endif
+
svcpu->in_use = false;
 
 out:
@@ -306,29 +356,6 @@ static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned 
long hva, pte_t pte)
 
 /*/
 
-static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
-{
-   ulong guest_msr = kvmppc_get_msr(vcpu);
-   ulong smsr = guest_msr;
-
-   /* Guest MSR values */
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-   smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE |
-   MSR_TM | MSR_TS_MASK;
-#else
-   smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE;
-#endif
-   /* Process MSR values */
-   smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
-   /* External providers the guest reserved */
-   smsr |= (guest_msr & vcpu->arch.guest_owned_ext);
-   /* 64-bit Process MSR values */
-#ifdef CONFIG_PPC_BOOK3S_64
-   smsr |= MSR_ISF | MSR_HV;
-#endif
-   vcpu->arch.shadow_msr = smsr;
-}
-
 static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
 {
ulong old_msr = kvmppc_get_msr(vcpu);
-- 
1.8.3.1



[PATCH v3 09/29] KVM: PPC: Book3S PR: PR KVM pass through MSR TM/TS bits to shadow_msr.

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

PowerPC TM functionality needs MSR TM/TS bits support at the hardware
level. Guest TM functionality cannot be emulated with "fake" MSR TS bits
(the msr in the magic page).

This patch syncs the TM/TS bits in shadow_msr with the MSR value in the
magic page, so that the MSR TS value which the guest sees is consistent
with the actual MSR bits running in the guest.

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_pr.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 67061d3..d3237f5 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -312,7 +312,12 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
ulong smsr = guest_msr;
 
/* Guest MSR values */
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE |
+   MSR_TM | MSR_TS_MASK;
+#else
smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE;
+#endif
/* Process MSR values */
smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
/* External providers the guest reserved */
-- 
1.8.3.1



[PATCH v3 08/29] KVM: PPC: Book3S PR: In PR KVM suspends Transactional state when inject an interrupt.

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

This patch simulates the interrupt behavior per the Power ISA while
injecting an interrupt in PR KVM:
- When an interrupt happens, the transactional state should be suspended.

kvmppc_mmu_book3s_64_reset_msr() will be invoked when injecting an
interrupt. This patch performs this ISA logic in
kvmppc_mmu_book3s_64_reset_msr().

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_64_mmu.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index a93d719..cf9d686 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -38,7 +38,16 @@
 
 static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
 {
-   kvmppc_set_msr(vcpu, vcpu->arch.intr_msr);
+   unsigned long msr = vcpu->arch.intr_msr;
+   unsigned long cur_msr = kvmppc_get_msr(vcpu);
+
+   /* If transactional, change to suspend mode on IRQ delivery */
+   if (MSR_TM_TRANSACTIONAL(cur_msr))
+   msr |= MSR_TS_S;
+   else
+   msr |= cur_msr & MSR_TS_MASK;
+
+   kvmppc_set_msr(vcpu, msr);
 }
 
 static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
-- 
1.8.3.1



[PATCH v3 07/29] KVM: PPC: Book3S PR: add C function wrapper for _kvmppc_save/restore_tm()

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

Currently the _kvmppc_save/restore_tm() low-level routines can only be
invoked from assembly code. This patch adds C function wrappers for them
so that they can be safely called from C code (a usage sketch follows).
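A minimal usage sketch (mirroring how the PR KVM vcpu put/load paths
later in this series call the wrappers; illustration only):

	/* preemption must stay disabled around the low-level TM code */
	preempt_disable();
	_kvmppc_save_tm_pr(vcpu, mfmsr());
	preempt_enable();

	/* ... and on the restore side ... */
	preempt_disable();
	_kvmppc_restore_tm_pr(vcpu, kvmppc_get_msr(vcpu));
	preempt_enable();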

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/asm-prototypes.h |  6 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  8 +--
 arch/powerpc/kvm/tm.S | 94 ++-
 3 files changed, 102 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index dfdcb23..5da683b 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -141,7 +141,13 @@ unsigned long __init prom_init(unsigned long r3, unsigned 
long r4,
 void pnv_power9_force_smt4_catch(void);
 void pnv_power9_force_smt4_release(void);
 
+/* Transaction memory related */
 void tm_enable(void);
 void tm_disable(void);
 void tm_abort(uint8_t cause);
+
+struct kvm_vcpu;
+void _kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu, u64 guest_msr);
+void _kvmppc_save_tm_pr(struct kvm_vcpu *vcpu, u64 guest_msr);
+
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 6445d29..980df5f 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -795,7 +795,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 */
mr  r3, r4
ld  r4, VCPU_MSR(r3)
-   bl  kvmppc_restore_tm
+   bl  __kvmppc_restore_tm
ld  r4, HSTATE_KVM_VCPU(r13)
 91:
 END_FTR_SECTION_IFSET(CPU_FTR_TM)
@@ -1783,7 +1783,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 */
mr  r3, r9
ld  r4, VCPU_MSR(r3)
-   bl  kvmppc_save_tm
+   bl  __kvmppc_save_tm
ld  r9, HSTATE_KVM_VCPU(r13)
 91:
 #endif
@@ -2689,7 +2689,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 */
ld  r3, HSTATE_KVM_VCPU(r13)
ld  r4, VCPU_MSR(r3)
-   bl  kvmppc_save_tm
+   bl  __kvmppc_save_tm
 91:
 #endif
 
@@ -2809,7 +2809,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 */
mr  r3, r4
ld  r4, VCPU_MSR(r3)
-   bl  kvmppc_restore_tm
+   bl  __kvmppc_restore_tm
ld  r4, HSTATE_KVM_VCPU(r13)
 91:
 #endif
diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S
index b7057d5..42a7cd8 100644
--- a/arch/powerpc/kvm/tm.S
+++ b/arch/powerpc/kvm/tm.S
@@ -33,7 +33,7 @@
  * This can modify all checkpointed registers, but
  * restores r1, r2 before exit.
  */
-_GLOBAL(kvmppc_save_tm)
+_GLOBAL(__kvmppc_save_tm)
mflrr0
std r0, PPC_LR_STKOFF(r1)
stdur1, -PPC_MIN_STKFRM(r1)
@@ -210,6 +210,52 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
blr
 
 /*
+ * _kvmppc_save_tm_pr() is a wrapper around __kvmppc_save_tm(), so that it can
+ * be invoked from C function by PR KVM only.
+ */
+_GLOBAL(_kvmppc_save_tm_pr)
+   mflrr5
+   std r5, PPC_LR_STKOFF(r1)
+   stdur1, -SWITCH_FRAME_SIZE(r1)
+   SAVE_NVGPRS(r1)
+
+   /* save MSR since TM/math bits might be impacted
+* by __kvmppc_save_tm().
+*/
+   mfmsr   r5
+   SAVE_GPR(5, r1)
+
+   /* also save DSCR/CR so that it can be recovered later */
+   mfspr   r6, SPRN_DSCR
+   SAVE_GPR(6, r1)
+
+   mfcrr7
+   stw r7, _CCR(r1)
+
+   bl  __kvmppc_save_tm
+
+   ld  r7, _CCR(r1)
+   mtcrr7
+
+   REST_GPR(6, r1)
+   mtspr   SPRN_DSCR, r6
+
+   /* need preserve current MSR's MSR_TS bits */
+   REST_GPR(5, r1)
+   mfmsr   r6
+   rldicl  r6, r6, 64 - MSR_TS_S_LG, 62
+   rldimi  r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+   mtmsrd  r5
+
+   REST_NVGPRS(r1)
+   addir1, r1, SWITCH_FRAME_SIZE
+   ld  r5, PPC_LR_STKOFF(r1)
+   mtlrr5
+   blr
+
+EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
+
+/*
  * Restore transactional state and TM-related registers.
  * Called with:
  *  - r3 pointing to the vcpu struct.
@@ -219,7 +265,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
  * This potentially modifies all checkpointed registers.
  * It restores r1, r2 from the PACA.
  */
-_GLOBAL(kvmppc_restore_tm)
+_GLOBAL(__kvmppc_restore_tm)
mflrr0
std r0, PPC_LR_STKOFF(r1)
 
@@ -362,4 +408,48 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
addir1, r1, PPC_MIN_STKFRM
b   9b
 #endif
+
+/*
+ * _kvmppc_restore_tm_pr() is a wrapper around __kvmppc_restore_tm(), so that 
it
+ * can be invoked from C function by PR KVM only.
+ */
+_GLOBAL(_kvmppc_restore_tm_pr)
+   mflrr5
+   std r5, PPC_LR_STKOFF(r1)
+   stdur1, -SWITCH_FRAME_SIZE(r1)
+   SAVE_NVGPRS(r1)
+
+  

[PATCH v3 06/29] KVM: PPC: Book3S PR: turn on FP/VSX/VMX MSR bits in kvmppc_save_tm()

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

kvmppc_save_tm() invokes store_fp_state()/store_vr_state(). So it is
mandatory to turn on the FP/VSX/VMX MSR bits for its execution, just
like what kvmppc_restore_tm() does.

Previously HV KVM turned these bits on outside of kvmppc_save_tm().
Now we include this bit change in kvmppc_save_tm() so that the logic
is cleaner. And PR KVM can reuse it later.

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kvm/tm.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S
index cbe608a..b7057d5 100644
--- a/arch/powerpc/kvm/tm.S
+++ b/arch/powerpc/kvm/tm.S
@@ -42,6 +42,8 @@ _GLOBAL(kvmppc_save_tm)
mfmsr   r8
li  r0, 1
rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+   ori r8, r8, MSR_FP
+   orisr8, r8, (MSR_VEC | MSR_VSX)@h
mtmsrd  r8
 
rldicl. r4, r4, 64 - MSR_TS_S_LG, 62
-- 
1.8.3.1



[PATCH v3 05/29] KVM: PPC: Book3S PR: add new parameter (guest MSR) for kvmppc_save_tm()/kvmppc_restore_tm()

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

HV KVM and PR KVM need different MSR sources to indicate whether
treclaim. or trecheckpoint. is necessary.

This patch adds a new parameter (guest MSR) to the kvmppc_save_tm()/
kvmppc_restore_tm() APIs:
- For HV KVM, it is VCPU_MSR
- For PR KVM, it is the current host MSR or VCPU_SHADOW_SRR1

This enhancement enables these 2 APIs to be reused by PR KVM later.
The patch keeps the HV KVM logic unchanged.

This patch also reworks kvmppc_save_tm()/kvmppc_restore_tm() to
have a clean ABI: r3 for the vcpu and r4 for the guest_msr.

During kvmppc_save_tm()/kvmppc_restore_tm(), R1 needs to be saved
and restored. Currently R1 is saved into HSTATE_HOST_R1. In PR
KVM, we are going to add a C function wrapper for
kvmppc_save_tm()/kvmppc_restore_tm() where R1 will be incremented
by an added stack frame and saved into HSTATE_HOST_R1. There are
several places in HV KVM that load HSTATE_HOST_R1 as R1, and we
don't want the TM code to introduce risk or confusion there.

This patch therefore uses HSTATE_SCRATCH2 to save/restore R1 in
kvmppc_save_tm()/kvmppc_restore_tm() to avoid future confusion,
since r1 is really just a temporary/scratch value there.

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 13 +-
 arch/powerpc/kvm/tm.S   | 74 -
 2 files changed, 49 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 4db2b10..6445d29 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -793,8 +793,12 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
 */
+   mr  r3, r4
+   ld  r4, VCPU_MSR(r3)
bl  kvmppc_restore_tm
+   ld  r4, HSTATE_KVM_VCPU(r13)
 91:
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
 #endif
 
/* Load guest PMU registers */
@@ -1777,7 +1781,10 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
 */
+   mr  r3, r9
+   ld  r4, VCPU_MSR(r3)
bl  kvmppc_save_tm
+   ld  r9, HSTATE_KVM_VCPU(r13)
 91:
 #endif
 
@@ -2680,7 +2687,8 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
 */
-   ld  r9, HSTATE_KVM_VCPU(r13)
+   ld  r3, HSTATE_KVM_VCPU(r13)
+   ld  r4, VCPU_MSR(r3)
bl  kvmppc_save_tm
 91:
 #endif
@@ -2799,7 +2807,10 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
 */
+   mr  r3, r4
+   ld  r4, VCPU_MSR(r3)
bl  kvmppc_restore_tm
+   ld  r4, HSTATE_KVM_VCPU(r13)
 91:
 #endif
 
diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S
index e79b373..cbe608a 100644
--- a/arch/powerpc/kvm/tm.S
+++ b/arch/powerpc/kvm/tm.S
@@ -26,9 +26,12 @@
 
 /*
  * Save transactional state and TM-related registers.
- * Called with r9 pointing to the vcpu struct.
+ * Called with:
+ * - r3 pointing to the vcpu struct
+ * - r4 points to the MSR with current TS bits:
+ * (For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR).
  * This can modify all checkpointed registers, but
- * restores r1, r2 and r9 (vcpu pointer) before exit.
+ * restores r1, r2 before exit.
  */
 _GLOBAL(kvmppc_save_tm)
mflrr0
@@ -41,14 +44,11 @@ _GLOBAL(kvmppc_save_tm)
rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
mtmsrd  r8
 
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-   ld  r5, VCPU_MSR(r9)
-   rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+   rldicl. r4, r4, 64 - MSR_TS_S_LG, 62
beq 1f  /* TM not active in guest. */
-#endif
 
-   std r1, HSTATE_HOST_R1(r13)
-   li  r3, TM_CAUSE_KVM_RESCHED
+   std r1, HSTATE_SCRATCH2(r13)
+   std r3, HSTATE_SCRATCH1(r13)
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 BEGIN_FTR_SECTION
@@ -65,7 +65,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, 
CPU_FTR_P9_TM_XER_SO_BUG, 96)
 3:
/* Emulation of the treclaim instruction needs TEXASR before treclaim */
mfspr   r6, SPRN_TEXASR
-   std r6, VCPU_ORIG_TEXASR(r9)
+   std r6, VCPU_ORIG_TEXASR(r3)
 6:
 END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 #endif
@@ -74,6 +74,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
li  r5, 0
mtmsrd  r5, 1
 
+   li  r3, TM_CAUSE_KVM_RESCHED
+
/* All GPRs are volatile at this point. */
TRECLAIM(R3)
 
@@ -94,7 +96,7 @@ BEGIN_FTR_SECTION
 * we already have it), therefore we can now use any volatile GPR.
 */
/* Reload stack pointer and TOC. */
-   ld  r1, HSTATE_HOST_R1(r13)
+

[PATCH v3 04/29] KVM: PPC: Book3S PR: Move kvmppc_save_tm/kvmppc_restore_tm to separate file

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

This is a simple patch that just moves the kvmppc_save_tm()/
kvmppc_restore_tm() functionality to tm.S. There is no logic change.
The rework of those APIs will be done in later patches to improve
readability.

It is in preparation for reusing those APIs in both HV and PR PPC KVM.

Some slight changes made while moving the functions include:
- surround some HV KVM specific code with CONFIG_KVM_BOOK3S_HV_POSSIBLE
for compilation.
- use _GLOBAL() to define kvmppc_save_tm()/kvmppc_restore_tm()

Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/Makefile   |   3 +
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 322 
 arch/powerpc/kvm/tm.S   | 363 
 3 files changed, 366 insertions(+), 322 deletions(-)
 create mode 100644 arch/powerpc/kvm/tm.S

diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 4b19da8..f872c04 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -63,6 +63,9 @@ kvm-pr-y := \
book3s_64_mmu.o \
book3s_32_mmu.o
 
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
+   tm.o
+
 ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
book3s_rmhandlers.o
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 5e6e493..4db2b10 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -39,8 +39,6 @@ BEGIN_FTR_SECTION;\
extsw   reg, reg;   \
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 
-#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
-
 /* Values in HSTATE_NAPPING(r13) */
 #define NAPPING_CEDE   1
 #define NAPPING_NOVCPU 2
@@ -3119,326 +3117,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
mr  r4,r31
blr
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-/*
- * Save transactional state and TM-related registers.
- * Called with r9 pointing to the vcpu struct.
- * This can modify all checkpointed registers, but
- * restores r1, r2 and r9 (vcpu pointer) before exit.
- */
-kvmppc_save_tm:
-   mflrr0
-   std r0, PPC_LR_STKOFF(r1)
-   stdur1, -PPC_MIN_STKFRM(r1)
-
-   /* Turn on TM. */
-   mfmsr   r8
-   li  r0, 1
-   rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
-   mtmsrd  r8
-
-   ld  r5, VCPU_MSR(r9)
-   rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-   beq 1f  /* TM not active in guest. */
-
-   std r1, HSTATE_HOST_R1(r13)
-   li  r3, TM_CAUSE_KVM_RESCHED
-
-BEGIN_FTR_SECTION
-   lbz r0, HSTATE_FAKE_SUSPEND(r13) /* Were we fake suspended? */
-   cmpwi   r0, 0
-   beq 3f
-   rldicl. r8, r8, 64 - MSR_TS_S_LG, 62 /* Did we actually hrfid? */
-   beq 4f
-BEGIN_FTR_SECTION_NESTED(96)
-   bl  pnv_power9_force_smt4_catch
-END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
-   nop
-   b   6f
-3:
-   /* Emulation of the treclaim instruction needs TEXASR before treclaim */
-   mfspr   r6, SPRN_TEXASR
-   std r6, VCPU_ORIG_TEXASR(r9)
-6:
-END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
-
-   /* Clear the MSR RI since r1, r13 are all going to be foobar. */
-   li  r5, 0
-   mtmsrd  r5, 1
-
-   /* All GPRs are volatile at this point. */
-   TRECLAIM(R3)
-
-   /* Temporarily store r13 and r9 so we have some regs to play with */
-   SET_SCRATCH0(r13)
-   GET_PACA(r13)
-   std r9, PACATMSCRATCH(r13)
-
-   /* If doing TM emulation on POWER9 DD2.2, check for fake suspend mode */
-BEGIN_FTR_SECTION
-   lbz r9, HSTATE_FAKE_SUSPEND(r13)
-   cmpwi   r9, 0
-   beq 2f
-   /*
-* We were in fake suspend, so we are not going to save the
-* register state as the guest checkpointed state (since
-* we already have it), therefore we can now use any volatile GPR.
-*/
-   /* Reload stack pointer and TOC. */
-   ld  r1, HSTATE_HOST_R1(r13)
-   ld  r2, PACATOC(r13)
-   /* Set MSR RI now we have r1 and r13 back. */
-   li  r5, MSR_RI
-   mtmsrd  r5, 1
-   HMT_MEDIUM
-   ld  r6, HSTATE_DSCR(r13)
-   mtspr   SPRN_DSCR, r6
-BEGIN_FTR_SECTION_NESTED(96)
-   bl  pnv_power9_force_smt4_release
-END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
-   nop
-
-4:
-   mfspr   r3, SPRN_PSSCR
-   /* PSSCR_FAKE_SUSPEND is a write-only bit, but clear it anyway */
-   li  r0, PSSCR_FAKE_SUSPEND
-   andcr3, r3, r0
-   mtspr   SPRN_PSSCR, r3
-   ld  r9, HSTATE_KVM_VCPU(r13)
-   /* Don't save TEXASR, use value from last exit in real suspend state */
-   b   11f
-2:
-END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
-
-   ld  r9, 

[PATCH v3 03/29] powerpc: export tm_enable()/tm_disable/tm_abort() APIs

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

This patch exports the tm_enable()/tm_disable()/tm_abort() APIs, which
will be used by the PR KVM transactional memory logic (a usage sketch
follows).
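A minimal sketch of the bracketing pattern these exports enable
(mirroring how later patches in this series access the TM SPRs;
illustration only, not part of this patch):

	tm_enable();
	vcpu->arch.tfhar  = mfspr(SPRN_TFHAR);
	vcpu->arch.texasr = mfspr(SPRN_TEXASR);
	vcpu->arch.tfiar  = mfspr(SPRN_TFIAR);
	tm_disable();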

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/include/asm/asm-prototypes.h |  3 +++
 arch/powerpc/include/asm/tm.h |  2 --
 arch/powerpc/kernel/tm.S  | 12 
 arch/powerpc/mm/hash_utils_64.c   |  1 +
 4 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index d9713ad..dfdcb23 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -141,4 +141,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned 
long r4,
 void pnv_power9_force_smt4_catch(void);
 void pnv_power9_force_smt4_release(void);
 
+void tm_enable(void);
+void tm_disable(void);
+void tm_abort(uint8_t cause);
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h
index b1658c9..e94f6db 100644
--- a/arch/powerpc/include/asm/tm.h
+++ b/arch/powerpc/include/asm/tm.h
@@ -10,12 +10,10 @@
 
 #ifndef __ASSEMBLY__
 
-extern void tm_enable(void);
 extern void tm_reclaim(struct thread_struct *thread,
   uint8_t cause);
 extern void tm_reclaim_current(uint8_t cause);
 extern void tm_recheckpoint(struct thread_struct *thread);
-extern void tm_abort(uint8_t cause);
 extern void tm_save_sprs(struct thread_struct *thread);
 extern void tm_restore_sprs(struct thread_struct *thread);
 
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index b92ac8e..ff12f47 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_VSX
 /* See fpu.S, this is borrowed from there */
@@ -55,6 +56,16 @@ _GLOBAL(tm_enable)
or  r4, r4, r3
mtmsrd  r4
 1: blr
+EXPORT_SYMBOL_GPL(tm_enable);
+
+_GLOBAL(tm_disable)
+   mfmsr   r4
+   li  r3, MSR_TM >> 32
+   sldir3, r3, 32
+   andcr4, r4, r3
+   mtmsrd  r4
+   blr
+EXPORT_SYMBOL_GPL(tm_disable);
 
 _GLOBAL(tm_save_sprs)
mfspr   r0, SPRN_TFHAR
@@ -78,6 +89,7 @@ _GLOBAL(tm_restore_sprs)
 _GLOBAL(tm_abort)
TABORT(R3)
blr
+EXPORT_SYMBOL_GPL(tm_abort);
 
 /* void tm_reclaim(struct thread_struct *thread,
  *uint8_t cause)
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 0bd3790..1bd8b4c1 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -64,6 +64,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
-- 
1.8.3.1



[PATCH v3 02/29] powerpc: add TEXASR related macros

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

This patch adds some macros for the CR0/TEXASR bits so that the PR KVM TM
logic (tbegin./treclaim./tabort.) can make use of them later, as in the
sketch below.
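For illustration, a rough sketch of the intended use of these macros
(mirroring the later tbegin./treclaim. emulation patches in this
series; not part of this patch):

	/* record a failure code in TEXASR for a failed transaction */
	vcpu->arch.texasr &= ~TEXASR_FC;
	vcpu->arch.texasr |= ((u64)TM_CAUSE_EMULATE << TEXASR_FC_LG);

	/* report tbegin. failure to the guest through CR0 (0b0010) */
	vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
			CR0_TBEGIN_FAILURE;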

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/include/asm/reg.h  | 32 +++--
 arch/powerpc/platforms/powernv/copy-paste.h |  3 +--
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 44b2be4..5625684 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -146,6 +146,12 @@
 #define MSR_64BIT  0
 #endif
 
+/* Condition Register related */
+#define CR0_SHIFT  28
+#define CR0_MASK   0xF
+#define CR0_TBEGIN_FAILURE (0x2 << 28) /* 0b0010 */
+
+
 /* Power Management - Processor Stop Status and Control Register Fields */
 #define PSSCR_RL_MASK  0x000F /* Requested Level */
 #define PSSCR_MTL_MASK 0x00F0 /* Maximum Transition Level */
@@ -239,13 +245,27 @@
 #define SPRN_TFIAR 0x81/* Transaction Failure Inst Addr   */
 #define SPRN_TEXASR0x82/* Transaction EXception & Summary */
 #define SPRN_TEXASRU   0x83/* ''  ''  ''Upper 32  */
-#define   TEXASR_ABORT __MASK(63-31) /* terminated by tabort or treclaim */
-#define   TEXASR_SUSP  __MASK(63-32) /* tx failed in suspended state */
-#define   TEXASR_HV__MASK(63-34) /* MSR[HV] when failure occurred */
-#define   TEXASR_PR__MASK(63-35) /* MSR[PR] when failure occurred */
-#define   TEXASR_FS__MASK(63-36) /* TEXASR Failure Summary */
-#define   TEXASR_EXACT __MASK(63-37) /* TFIAR value is exact */
+
+#define TEXASR_FC_LG   (63 - 7)/* Failure Code */
+#define TEXASR_AB_LG   (63 - 31)   /* Abort */
+#define TEXASR_SU_LG   (63 - 32)   /* Suspend */
+#define TEXASR_HV_LG   (63 - 34)   /* Hypervisor state*/
+#define TEXASR_PR_LG   (63 - 35)   /* Privilege level */
+#define TEXASR_FS_LG   (63 - 36)   /* failure summary */
+#define TEXASR_EX_LG   (63 - 37)   /* TFIAR exact bit */
+#define TEXASR_ROT_LG  (63 - 38)   /* ROT bit */
+
+#define   TEXASR_ABORT __MASK(TEXASR_AB_LG) /* terminated by tabort or 
treclaim */
+#define   TEXASR_SUSP  __MASK(TEXASR_SU_LG) /* tx failed in suspended state */
+#define   TEXASR_HV__MASK(TEXASR_HV_LG) /* MSR[HV] when failure occurred */
+#define   TEXASR_PR__MASK(TEXASR_PR_LG) /* MSR[PR] when failure occurred */
+#define   TEXASR_FS__MASK(TEXASR_FS_LG) /* TEXASR Failure Summary */
+#define   TEXASR_EXACT __MASK(TEXASR_EX_LG) /* TFIAR value is exact */
+#define   TEXASR_ROT   __MASK(TEXASR_ROT_LG)
+#define   TEXASR_FC(ASM_CONST(0xFF) << TEXASR_FC_LG)
+
 #define SPRN_TFHAR 0x80/* Transaction Failure Handler Addr */
+
 #define SPRN_TIDR  144 /* Thread ID register */
 #define SPRN_CTRLF 0x088
 #define SPRN_CTRLT 0x098
diff --git a/arch/powerpc/platforms/powernv/copy-paste.h 
b/arch/powerpc/platforms/powernv/copy-paste.h
index c9a5036..3fa62de 100644
--- a/arch/powerpc/platforms/powernv/copy-paste.h
+++ b/arch/powerpc/platforms/powernv/copy-paste.h
@@ -7,9 +7,8 @@
  * 2 of the License, or (at your option) any later version.
  */
 #include 
+#include 
 
-#define CR0_SHIFT  28
-#define CR0_MASK   0xF
 /*
  * Copy/paste instructions:
  *
-- 
1.8.3.1



[PATCH v3 01/29] powerpc: export symbol msr_check_and_set().

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

PR KVM will need to reuse msr_check_and_set().
This patch exports this API for reuse.

Signed-off-by: Simon Guo 
Reviewed-by: Paul Mackerras 
---
 arch/powerpc/kernel/process.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 1237f13..25db000 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -154,6 +154,7 @@ unsigned long msr_check_and_set(unsigned long bits)
 
return newmsr;
 }
+EXPORT_SYMBOL_GPL(msr_check_and_set);
 
 void __msr_check_and_clear(unsigned long bits)
 {
-- 
1.8.3.1



[PATCH v3 00/29] KVM: PPC: Book3S PR: Transaction memory support on PR KVM

2018-05-20 Thread wei . guo . simon
From: Simon Guo 

Nowadays, many OS distributions make use of transactional memory
functionality. On PowerPC, HV KVM supports TM, but PR KVM does not.

The driver for transactional memory support in PR KVM is the OpenStack
Continuous Integration testing - it runs an HV (hypervisor) KVM (as
level 1) and then runs PR KVM (as level 2) on top of that.

This patch set adds transactional memory support to PR KVM.

v2 -> v3 changes:
1) rebase onto Paul's kvm-ppc-next branch, which reworks the
KVM_CHECK_EXTENSION ioctl (patch #25) a little bit. 
2) allow mtspr TFHAR in TM suspend state
3) remove patch: 
  "KVM: PPC: add KVM_SET_ONE_REG/KVM_GET_ONE_REG to async ioctl"
4) some minor rework per comments

v1 -> v2 changes:
1. Correct a bug in trechkpt emulation: the TM SPRs need to be
flushed to the vcpu before trechkpt.
2. Add PR KVM ioctl functionality for TM.
3. Remove save_msr_tm and use kvmppc_get_msr() to determine
whether a transaction state needs to be restored.
4. Remove the "KVM: PPC: Book3S PR: set MSR HV bit accordingly
for PPC970 and others." patch.
It would prevent PR KVM from starting as an L1 hypervisor: if
we set the HV bit to 0 when doing rfid to the guest (which is
supposed to run at HV=1 && PR=1), the guest will not be able to
access its original memory.
The original code always sets the HV bit in shadow_msr, which is
benign since:
the HV bit can only be altered by the sc instruction; it can only
be set to 0 by the rfid/hrfid instructions.
We return to the guest with rfid. So:
* if KVM is running as an L1 hypervisor, the guest physical MSR
expects HV=1.
* if KVM is running as an L2 hypervisor, rfid cannot update HV
to 1, so HV stays 0.
5. Add the XER register implementation to
kvmppc_copyto_vcpu_tm()/kvmppc_copyfrom_vcpu_tm().
6. Remove an unnecessary stack frame in _kvmppc_save/restore_tm().
7. Move the MSR bits sync into kvmppc_copy_from_svcpu() so that
we always see consistent shadow_msr/kvmppc_get_msr(), even with
preemption.
8. Do failure recording in treclaim emulation when TEXASR_FS
is 0.


Test cases performed:
linux/tools/testing/selftests/powerpc/tm/tm-syscall
linux/tools/testing/selftests/powerpc/tm/tm-fork
linux/tools/testing/selftests/powerpc/tm/tm-vmx-unavail
linux/tools/testing/selftests/powerpc/tm/tm-tmspr
linux/tools/testing/selftests/powerpc/tm/tm-signal-msr-resv
linux/tools/testing/selftests/powerpc/math/vsx_preempt
linux/tools/testing/selftests/powerpc/math/fpu_signal
linux/tools/testing/selftests/powerpc/math/vmx_preempt
linux/tools/testing/selftests/powerpc/math/fpu_syscall
linux/tools/testing/selftests/powerpc/math/vmx_syscall
linux/tools/testing/selftests/powerpc/math/fpu_preempt
linux/tools/testing/selftests/powerpc/math/vmx_signal
linux/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr
linux/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr
linux/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx
linux/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr
linux/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx
https://github.com/justdoitqd/publicFiles/blob/master/test_tbegin_pr.c
https://github.com/justdoitqd/publicFiles/blob/master/test_tabort.c
https://github.com/justdoitqd/publicFiles/blob/master/test_kvm_htm_cap.c
https://github.com/justdoitqd/publicFiles/blob/master/test-tm-mig.c

Simon Guo (29):
  powerpc: export symbol msr_check_and_set().
  powerpc: add TEXASR related macros
  powerpc: export tm_enable()/tm_disable/tm_abort() APIs
  KVM: PPC: Book3S PR: Move kvmppc_save_tm/kvmppc_restore_tm to separate
file
  KVM: PPC: Book3S PR: add new parameter (guest MSR) for
kvmppc_save_tm()/kvmppc_restore_tm()
  KVM: PPC: Book3S PR: turn on FP/VSX/VMX MSR bits in kvmppc_save_tm()
  KVM: PPC: Book3S PR: add C function wrapper for
_kvmppc_save/restore_tm()
  KVM: PPC: Book3S PR: In PR KVM suspends Transactional state when
inject an interrupt.
  KVM: PPC: Book3S PR: PR KVM pass through MSR TM/TS bits to shadow_msr.
  KVM: PPC: Book3S PR: Sync TM bits to shadow msr for problem state
guest
  KVM: PPC: Book3S PR: implement RFID TM behavior to suppress change
from S0 to N0
  KVM: PPC: Book3S PR: prevent TS bits change in kvmppc_interrupt_pr()
  KVM: PPC: Book3S PR: adds new
kvmppc_copyto_vcpu_tm/kvmppc_copyfrom_vcpu_tm API for PR KVM.
  KVM: PPC: Book3S PR: add kvmppc_save/restore_tm_sprs() APIs
  KVM: PPC: Book3S PR: add transaction memory save/restore skeleton for
PR KVM
  KVM: PPC: Book3S PR: add math support for PR KVM HTM
  KVM: PPC: Book3S PR: make mtspr/mfspr emulation behavior based on
active TM SPRs
  KVM: PPC: Book3S PR: always fail transaction in guest privilege state
  KVM: PPC: Book3S PR: enable NV reg restore for reading TM SPR at guest
privilege state
  KVM: PPC: Book3S PR: adds emulation for treclaim.
  KVM: PPC: Book3S PR: add emulation for trechkpt in PR KVM.
  KVM: PPC: Book3S PR: add emulation for tabort. for privilege guest
  KVM: PPC: Book3S PR: add guard code to prevent returning to guest with
PR=0 and 

[PATCH v4 4/4] powerpc:selftest update memcmp_64 selftest for VMX implementation

2018-05-16 Thread wei . guo . simon
From: Simon Guo 

This patch reworked selftest memcmp_64 so that memcmp selftest can
cover more test cases.

It adds testcases for:
- memcmp over 4K bytes size.
- s1/s2 with different/random offset on 16 bytes boundary.
- enter/exit_vmx_ops pairing.

Signed-off-by: Simon Guo 
---
 .../selftests/powerpc/copyloops/asm/ppc_asm.h  |  4 +-
 .../testing/selftests/powerpc/stringloops/Makefile |  2 +-
 .../selftests/powerpc/stringloops/asm/ppc_asm.h| 22 +
 .../testing/selftests/powerpc/stringloops/memcmp.c | 98 +-
 4 files changed, 101 insertions(+), 25 deletions(-)

diff --git a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h 
b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
index 5ffe04d..dfce161 100644
--- a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
@@ -36,11 +36,11 @@
li  r3,0
blr
 
-FUNC_START(enter_vmx_copy)
+FUNC_START(enter_vmx_ops)
li  r3,1
blr
 
-FUNC_START(exit_vmx_copy)
+FUNC_START(exit_vmx_ops)
blr
 
 FUNC_START(memcpy_power7)
diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile 
b/tools/testing/selftests/powerpc/stringloops/Makefile
index 1125e48..75a3d2fe 100644
--- a/tools/testing/selftests/powerpc/stringloops/Makefile
+++ b/tools/testing/selftests/powerpc/stringloops/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 # The loops are all 64-bit code
-CFLAGS += -m64
+CFLAGS += -m64 -DCONFIG_KSM
 CFLAGS += -I$(CURDIR)
 
 TEST_GEN_PROGS := memcmp
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h 
b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
index 136242e..185d257 100644
--- a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PPC_ASM_H
+#define __PPC_ASM_H
 #include 
 
 #ifndef r1
@@ -6,3 +8,23 @@
 #endif
 
 #define _GLOBAL(A) FUNC_START(test_ ## A)
+
+#define CONFIG_ALTIVEC
+
+#define R14 r14
+#define R15 r15
+#define R16 r16
+#define R17 r17
+#define R18 r18
+#define R19 r19
+#define R20 r20
+#define R21 r21
+#define R22 r22
+#define R29 r29
+#define R30 r30
+#define R31 r31
+
+#define STACKFRAMESIZE 256
+#define STK_REG(i) (112 + ((i)-14)*8)
+
+#endif
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp.c 
b/tools/testing/selftests/powerpc/stringloops/memcmp.c
index 8250db2..b5cf717 100644
--- a/tools/testing/selftests/powerpc/stringloops/memcmp.c
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp.c
@@ -2,20 +2,40 @@
 #include 
 #include 
 #include 
+#include 
 #include "utils.h"
 
 #define SIZE 256
 #define ITERATIONS 1
 
+#define LARGE_SIZE (5 * 1024)
+#define LARGE_ITERATIONS 1000
+#define LARGE_MAX_OFFSET 32
+#define LARGE_SIZE_START 4096
+
+#define MAX_OFFSET_DIFF_S1_S2 48
+
+int vmx_count;
+int enter_vmx_ops(void)
+{
+   vmx_count++;
+   return 1;
+}
+
+void exit_vmx_ops(void)
+{
+   vmx_count--;
+}
 int test_memcmp(const void *s1, const void *s2, size_t n);
 
 /* test all offsets and lengths */
-static void test_one(char *s1, char *s2)
+static void test_one(char *s1, char *s2, unsigned long max_offset,
+   unsigned long size_start, unsigned long max_size)
 {
unsigned long offset, size;
 
-   for (offset = 0; offset < SIZE; offset++) {
-   for (size = 0; size < (SIZE-offset); size++) {
+   for (offset = 0; offset < max_offset; offset++) {
+   for (size = size_start; size < (max_size - offset); size++) {
int x, y;
unsigned long i;
 
@@ -35,70 +55,104 @@ static void test_one(char *s1, char *s2)
printf("\n");
abort();
}
+
+   if (vmx_count != 0) {
+   printf("vmx enter/exit not paired.(offset:%ld 
size:%ld s1:%p s2:%p vc:%d\n",
+   offset, size, s1, s2, vmx_count);
+   printf("\n");
+   abort();
+   }
}
}
 }
 
-static int testcase(void)
+static int testcase(bool islarge)
 {
char *s1;
char *s2;
unsigned long i;
 
-   s1 = memalign(128, SIZE);
+   unsigned long comp_size = (islarge ? LARGE_SIZE : SIZE);
+   unsigned long alloc_size = comp_size + MAX_OFFSET_DIFF_S1_S2;
+   int iterations = islarge ? LARGE_ITERATIONS : ITERATIONS;
+
+   s1 = memalign(128, alloc_size);
if (!s1) {
perror("memalign");
exit(1);
}
 
-   s2 = memalign(128, SIZE);
+   s2 = memalign(128, alloc_size);
if (!s2) {
perror("memalign");
exit(1);
}
 
-   srandom(1);

[PATCH v4 3/4] powerpc/64: add 32 bytes prechecking before using VMX optimization on memcmp()

2018-05-16 Thread wei . guo . simon
From: Simon Guo 

This patch is based on the previous VMX patch for memcmp().

To optimize ppc64 memcmp() with VMX instructions, we need to consider
the penalty VMX brings with it: if the kernel uses VMX instructions, it
must save/restore the current thread's VMX registers. There are 32 x
128-bit VMX registers on PPC, which means 32 x 16 = 512 bytes to load
and store.

The major concern regarding memcmp() performance in the kernel is KSM,
which uses memcmp() frequently to merge identical pages. So it makes
sense to take some measures/enhancements around KSM to see whether any
improvement can be made there. Cyril Bur pointed out in the mail below
that memcmp() for KSM has a high probability of failing (mismatching)
early, within the first few bytes:
https://patchwork.ozlabs.org/patch/817322/#1773629
This patch is a follow-up on that.

Testing shows that KSM memcmp() tends to fail early, within the first
32 bytes. More specifically:
- 76% of cases fail/mismatch before 16 bytes;
- 83% of cases fail/mismatch before 32 bytes;
- 84% of cases fail/mismatch before 64 bytes;
So 32 bytes looks like a better pre-check length than the alternatives.

This patch adds a 32-byte pre-check before jumping into VMX operations,
to avoid the unnecessary VMX penalty, and testing shows ~20% improvement
in average memcmp() execution time with this patch.
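
The idea can be sketched in C roughly as below; the real implementation
is the assembly hunk further down, and vmx_path() here is only a
stand-in (plain memcmp()) for the VMX comparison code:
--
#include <stddef.h>
#include <string.h>

/* Stand-in for the VMX comparison path added by the previous patch;
 * plain memcmp() keeps the sketch self-contained. */
static int vmx_path(const void *a, const void *b, size_t n)
{
        return memcmp(a, b, n);
}

static int memcmp_with_precheck(const void *a, const void *b, size_t n)
{
        size_t pre = n < 32 ? n : 32;
        int ret = memcmp(a, b, pre);    /* cheap scalar pre-check */

        if (ret || n <= 32)
                return ret;             /* ~80% of KSM calls end here */

        /* The prefix matched, so the VMX save/restore cost is worth it. */
        return vmx_path((const char *)a + 32, (const char *)b + 32, n - 32);
}
--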

The detailed data and analysis are at:
https://github.com/justdoitqd/publicFiles/blob/master/memcmp/README.md

Any suggestions are welcome.

Signed-off-by: Simon Guo 
---
 arch/powerpc/lib/memcmp_64.S | 29 +
 1 file changed, 29 insertions(+)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index 6303bbf..df2eec0 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -405,6 +405,35 @@ _GLOBAL(memcmp)
/* Enter with src/dst addrs has the same offset with 8 bytes
 * align boundary
 */
+
+#ifdef CONFIG_KSM
+   /* KSM will always compare at page boundary so it falls into
+* .Lsameoffset_vmx_cmp.
+*
+* There is an optimization for KSM based on the following fact:
+* KSM page memcmp() is prone to fail early within the first bytes.
+* Statistics show that 76% of KSM memcmp() calls fail within the
+* first 16 bytes, 83% within the first 32 bytes, and 84% within the
+* first 64 bytes.
+*
+* Before applying VMX instructions, which incur a 32 x 128-bit VMX
+* register load/restore penalty, compare the first 32 bytes so that
+* we can catch the ~80% of cases that fail early.
+*/
+
+   li  r0,4
+   mtctr   r0
+.Lksm_32B_loop:
+   LD  rA,0,r3
+   LD  rB,0,r4
+   cmpld   cr0,rA,rB
+   addir3,r3,8
+   addir4,r4,8
+   bne cr0,.LcmpAB_lightweight
+   addir5,r5,-8
+   bdnz.Lksm_32B_loop
+#endif
+
ENTER_VMX_OPS
beq cr1,.Llong_novmx_cmp
 
-- 
1.8.3.1



[PATCH v4 2/4] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision

2018-05-16 Thread wei . guo . simon
From: Simon Guo 

This patch adds VMX primitives to do memcmp() when the compare size
exceeds 4K bytes. The KSM feature can benefit from this.
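
The overall dispatch this implements in memcmp_64.S looks roughly like
the C sketch below; scalar_path(), vmx_path(), enter_vmx_ops() and
exit_vmx_ops() are reduced to stand-ins so the sketch stays
self-contained and is not the actual implementation:
--
#include <stddef.h>
#include <string.h>

#define VMX_OPS_THRES 4096              /* same threshold as the assembly */

/* Stand-ins for the assembly paths; both reduce to memcmp() here. */
static int scalar_path(const void *a, const void *b, size_t n)
{
        return memcmp(a, b, n);
}

static int vmx_path(const void *a, const void *b, size_t n)
{
        return memcmp(a, b, n);
}

/* Stand-ins for enter/exit_vmx_ops(); the real helpers decide whether
 * VMX may be used in the current context. */
static int enter_vmx_ops(void)  { return 1; }
static void exit_vmx_ops(void)  { }

static int memcmp_sketch(const void *a, const void *b, size_t n)
{
        int ret;

        if (n < VMX_OPS_THRES)
                return scalar_path(a, b, n);

        if (enter_vmx_ops())            /* is VMX usable right now? */
                ret = vmx_path(a, b, n);
        else
                ret = scalar_path(a, b, n);
        exit_vmx_ops();                 /* keep enter/exit paired */
        return ret;
}
--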

Test result with the following test program (replace the "^>" with ""):
--
># cat tools/testing/selftests/powerpc/stringloops/memcmp.c
>#include 
>#include 
>#include 
>#include 
>#include "utils.h"
>#define SIZE (1024 * 1024 * 900)
>#define ITERATIONS 40

int test_memcmp(const void *s1, const void *s2, size_t n);

static int testcase(void)
{
char *s1;
char *s2;
unsigned long i;

s1 = memalign(128, SIZE);
if (!s1) {
perror("memalign");
exit(1);
}

s2 = memalign(128, SIZE);
if (!s2) {
perror("memalign");
exit(1);
}

for (i = 0; i < SIZE; i++)  {
s1[i] = i & 0xff;
s2[i] = i & 0xff;
}
for (i = 0; i < ITERATIONS; i++) {
int ret = test_memcmp(s1, s2, SIZE);

if (ret) {
printf("return %d at[%ld]! should have returned 
zero\n", ret, i);
abort();
}
}

return 0;
}

int main(void)
{
return test_harness(testcase, "memcmp");
}
--
Without this patch (but with the first patch "powerpc/64: Align bytes
before fall back to .Lshort in powerpc64 memcmp()." in the series):
4.726728762 seconds time elapsed
  ( +-  3.54%)
With VMX patch:
4.234335473 seconds time elapsed
  ( +-  2.63%)
There is ~+10% improvement.

Testing with an unaligned and different-offset version (s1 and s2
shifted by a random offset within 16 bytes) can achieve a higher
improvement than 10%.

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/asm-prototypes.h |   4 +-
 arch/powerpc/lib/copypage_power7.S|   4 +-
 arch/powerpc/lib/memcmp_64.S  | 231 ++
 arch/powerpc/lib/memcpy_power7.S  |   6 +-
 arch/powerpc/lib/vmx-helper.c |   4 +-
 5 files changed, 240 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index d9713ad..31fdcee 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -49,8 +49,8 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
-void * exit_vmx_copy(void *dest);
+int enter_vmx_ops(void);
+void *exit_vmx_ops(void *dest);
 
 /* Traps */
 long machine_check_early(struct pt_regs *regs);
diff --git a/arch/powerpc/lib/copypage_power7.S 
b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7..e38f956 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
std r0,16(r1)
stdur1,-STACKFRAMESIZE(r1)
-   bl  enter_vmx_copy
+   bl  enter_vmx_ops
cmpwi   r3,0
ld  r0,STACKFRAMESIZE+16(r1)
ld  r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
addir3,r3,128
bdnz1b
 
-   b   exit_vmx_copy   /* tail call optimise */
+   b   exit_vmx_ops/* tail call optimise */
 
 #else
li  r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index f20e883..6303bbf 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -27,12 +27,73 @@
 #define LH lhbrx
 #define LW lwbrx
 #define LD ldbrx
+#define LVSlvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+   vperm _VRT,_VRB,_VRA,_VRC
 #else
 #define LH lhzx
 #define LW lwzx
 #define LD ldx
+#define LVSlvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+   vperm _VRT,_VRA,_VRB,_VRC
 #endif
 
+#define VMX_OPS_THRES 4096
+#define ENTER_VMX_OPS  \
+   mflrr0; \
+   std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+   std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+   std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+   std r0,16(r1); \
+   stdur1,-STACKFRAMESIZE(r1); \
+   bl  enter_vmx_ops; \
+   cmpwi   cr1,r3,0; \
+   ld  r0,STACKFRAMESIZE+16(r1); \
+   ld  r3,STK_REG(R31)(r1); \
+   ld  r4,STK_REG(R30)(r1); \
+   ld  r5,STK_REG(R29)(r1); \
+   addir1,r1,STACKFRAMESIZE; \
+   mtlrr0
+
+#define EXIT_VMX_OPS \
+   mflrr0; \
+   std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+   std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+   std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+   std r0,16(r1); \
+   stdu

[PATCH v4 1/4] powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp()

2018-05-16 Thread wei . guo . simon
From: Simon Guo 

Currently the powerpc 64-bit version of memcmp() falls back to .Lshort
(compare-per-byte mode) if either the src or dst address is not 8-byte
aligned. It can be optimized in 2 situations:

1) If both addresses have the same offset from an 8-byte boundary:
memcmp() can first compare the unaligned bytes up to the 8-byte boundary
and then compare the rest of the 8-byte-aligned content in .Llong mode.

2) If the src/dst addresses do not have the same offset from an 8-byte
boundary: memcmp() can align the src address to 8 bytes, adjust the dst
address accordingly, then load src in aligned mode and dst in unaligned
mode.

This patch optimizes memcmp() behavior in the above 2 situations.
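
A rough C model of case (1) may help; this is only a sketch under the
assumption that s1/s2 share the same offset from an 8-byte boundary,
while the actual change is in assembly and also covers case (2):
--
#include <stdint.h>
#include <string.h>

/*
 * Rough C model of case (1): s1/s2 share the same offset from an
 * 8-byte boundary, so aligning one aligns the other.
 */
static int memcmp_sameoffset(const void *s1, const void *s2, size_t n)
{
        const unsigned char *a = s1, *b = s2;
        size_t head = (8 - ((uintptr_t)a & 7)) & 7;

        if (head > n)
                head = n;
        /* Byte-compare up to the 8-byte boundary (the unaligned head). */
        while (head--) {
                if (*a != *b)
                        return *a - *b;
                a++, b++, n--;
        }
        /* Both pointers are now 8-byte aligned: compare 8 bytes a time. */
        while (n >= 8) {
                uint64_t x, y;

                memcpy(&x, a, 8);
                memcpy(&y, b, 8);
                if (x != y)
                        return memcmp(a, b, 8); /* resolve byte order */
                a += 8, b += 8, n -= 8;
        }
        return n ? memcmp(a, b, n) : 0;
}
--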

Tested with both little and big endian. The performance results below
are based on little endian.

Following are the test results for the case where src/dst have the same
offset (a similar result was observed when src/dst have different
offsets):
(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
29.773018302 seconds time elapsed   
   ( +- 0.09% )
- with patch
16.485568173 seconds time elapsed   
   ( +-  0.02% )
-> There is ~+80% improvement

(2) 32 bytes
To observe the performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c as follows:
---
 #include 
 #include "utils.h"

-#define SIZE 256
+#define SIZE 32
 #define ITERATIONS 1

 int test_memcmp(const void *s1, const void *s2, size_t n);


- Without patch
0.244746482 seconds time elapsed
  ( +-  0.36%)
- with patch
0.215069477 seconds time elapsed
  ( +-  0.51%)
-> There is ~+13% improvement

(3) 0~8 bytes
To observe the < 8 bytes performance impact, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c as follows:
---
 #include 
 #include "utils.h"

-#define SIZE 256
-#define ITERATIONS 1
+#define SIZE 8
+#define ITERATIONS 100

 int test_memcmp(const void *s1, const void *s2, size_t n);
---
- Without patch
   1.845642503 seconds time elapsed 
 ( +- 0.12% )
- With patch
   1.849767135 seconds time elapsed 
 ( +- 0.26% )
-> They are nearly the same. (-0.2%)

Signed-off-by: Simon Guo 
---
 arch/powerpc/lib/memcmp_64.S | 143 ---
 1 file changed, 136 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b..f20e883 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,28 +24,41 @@
 #define rH r31
 
 #ifdef __LITTLE_ENDIAN__
+#define LH lhbrx
+#define LW lwbrx
 #define LD ldbrx
 #else
+#define LH lhzx
+#define LW lwzx
 #define LD ldx
 #endif
 
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
+ * are named like .Lsameoffset_
+ * 2) src/dst has different offset to the 8 bytes boundary. The handlers
+ * are named like .Ldiffoffset_
+ */
 _GLOBAL(memcmp)
cmpdi   cr1,r5,0
 
-   /* Use the short loop if both strings are not 8B aligned */
-   or  r6,r3,r4
+   /* Use the short loop if the src/dst addresses do not have
+* the same offset from an 8-byte aligned boundary.
+*/
+   xor r6,r3,r4
andi.   r6,r6,7
 
-   /* Use the short loop if length is less than 32B */
-   cmpdi   cr6,r5,31
+   /* Fall back to short loop if compare at aligned addrs
+* with less than 8 bytes.
+*/
+   cmpdi   cr6,r5,7
 
beq cr1,.Lzero
-   bne .Lshort
-   bgt cr6,.Llong
+   bgt cr6,.Lno_short
 
 .Lshort:
mtctr   r5
-
 1: lbz rA,0(r3)
lbz rB,0(r4)
subf.   rC,rB,rA
@@ -78,11 +91,90 @@ _GLOBAL(memcmp)
li  r3,0
blr
 
+.Lno_short:
+   dcbt0,r3
+   dcbt0,r4
+   bne .Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+   /* Attempt to compare the bytes that are not 8-byte aligned so
+* that the rest of the comparison can run on 8-byte alignment.
+*/
+   andi.   r6,r3,7
+
+   /* Try to compare the first double word which is not 8 bytes aligned:
+* load the first double word at (src & ~7UL) and shift left appropriate
+* bits before comparison.
+*/
+   clrlwi  r6,r3,29
+   rlwinm  r6,r6,3,0,28
+   beq .Lsameoffset_8bytes_aligned
+   clrrdi  r3,r3,3
+   clrrdi  r4,r4,3
+   LD  rA,0,r3
+   LD  rB,0,r4
+   sld rA,rA,r6
+   sld rB,rB,r6
+   cmpld   cr0,rA,rB
+   srwir6,r6,3
+   

[PATCH v4 0/4] powerpc/64: memcmp() optimization

2018-05-16 Thread wei . guo . simon
From: Simon Guo 

There is some room to optimize the powerpc 64-bit version of memcmp()
for the following 2 cases:
(1) Even if the src/dst addresses are not 8-byte aligned at the start,
memcmp() can align them and use the .Llong comparison mode instead of
falling back to the .Lshort comparison mode, which compares the buffer
byte by byte.
(2) VMX instructions can be used to speed up large-size comparisons;
currently the threshold is set at 4K bytes. Note that VMX instructions
incur a VMX register save/load penalty. This patch set includes a patch
that adds a 32-byte pre-check to minimize that penalty.

This is similar to what glibc commit dec4a7105e (powerpc: Improve memcmp
performance for POWER8) did. Thanks to Cyril Bur for the information.
This patch set also updates the memcmp selftest so that it compiles and
covers a large-size comparison case.

v3 -> v4:
- Add a 32-byte pre-check before using VMX instructions.

v2 -> v3:
- add optimization for src/dst with different offsets from the 8-byte
boundary.
- renamed some labels.
- reworked the code per comments from Cyril Bur, such as filling the
pipeline, and use VMX when size == 4K.
- fixed a bug in enter/exit_vmx_ops pairing, and revised the test case
to check whether enter/exit_vmx_ops are paired.

v1 -> v2:
- update the 8-byte unaligned comparison method.
- fix a VMX comparison bug.
- enhanced the original memcmp() selftest.
- add powerpc/64 to subject/commit message.

Simon Guo (4):
  powerpc/64: Align bytes before fall back to .Lshort in powerpc64
memcmp()
  powerpc/64: enhance memcmp() with VMX instruction for long bytes
comparision
  powerpc/64: add 32 bytes prechecking before using VMX optimization on
memcmp()
  powerpc:selftest update memcmp_64 selftest for VMX implementation

 arch/powerpc/include/asm/asm-prototypes.h  |   4 +-
 arch/powerpc/lib/copypage_power7.S |   4 +-
 arch/powerpc/lib/memcmp_64.S   | 403 -
 arch/powerpc/lib/memcpy_power7.S   |   6 +-
 arch/powerpc/lib/vmx-helper.c  |   4 +-
 .../selftests/powerpc/copyloops/asm/ppc_asm.h  |   4 +-
 .../testing/selftests/powerpc/stringloops/Makefile |   2 +-
 .../selftests/powerpc/stringloops/asm/ppc_asm.h|  22 ++
 .../testing/selftests/powerpc/stringloops/memcmp.c |  98 +++--
 9 files changed, 506 insertions(+), 41 deletions(-)

-- 
1.8.3.1



[PATCH v2 10/10] KVM: PPC: reimplements LOAD_VMX/STORE_VMX instruction mmio emulation with analyse_intr() input

2018-05-07 Thread wei . guo . simon
From: Simon Guo 

This patch reimplements LOAD_VMX/STORE_VMX MMIO emulation with
analyse_instr() input. When emulating a store, the VMX register needs to
be flushed so that the right register value can be retrieved before
writing to the I/O memory.

This patch also adds support for lvebx/lvehx/lvewx/stvebx/stvehx/stvewx
MMIO emulation. To meet the requirement of handling different element
sizes, kvmppc_handle_load128_by2x64()/kvmppc_handle_store128_by2x64()
were replaced with kvmppc_handle_vmx_load()/kvmppc_handle_vmx_store().

The framework used is similar to the VSX instruction MMIO emulation.

Suggested-by: Paul Mackerras 
Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_host.h  |   1 +
 arch/powerpc/include/asm/kvm_ppc.h   |  10 +-
 arch/powerpc/kvm/emulate_loadstore.c | 124 +++--
 arch/powerpc/kvm/powerpc.c   | 259 ---
 4 files changed, 302 insertions(+), 92 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 2c4382f..5ab660d 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -692,6 +692,7 @@ struct kvm_vcpu_arch {
u8 mmio_vsx_offset;
u8 mmio_vsx_tx_sx_enabled;
u8 mmio_vmx_copy_nums;
+   u8 mmio_vmx_offset;
u8 mmio_copy_type;
u8 osi_needed;
u8 osi_enabled;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 1f087c4..e991821 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -81,10 +81,10 @@ extern int kvmppc_handle_loads(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
 extern int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int rt, unsigned int bytes,
int is_default_endian, int mmio_sign_extend);
-extern int kvmppc_handle_load128_by2x64(struct kvm_run *run,
-   struct kvm_vcpu *vcpu, unsigned int rt, int is_default_endian);
-extern int kvmppc_handle_store128_by2x64(struct kvm_run *run,
-   struct kvm_vcpu *vcpu, unsigned int rs, int is_default_endian);
+extern int kvmppc_handle_vmx_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
+   unsigned int rt, unsigned int bytes, int is_default_endian);
+extern int kvmppc_handle_vmx_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
+   unsigned int rs, unsigned int bytes, int is_default_endian);
 extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
   u64 val, unsigned int bytes,
   int is_default_endian);
@@ -265,6 +265,8 @@ extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, 
u32 *server,
vector128 vval;
u64 vsxval[2];
u32 vsx32val[4];
+   u16 vsx16val[8];
+   u8  vsx8val[16];
struct {
u64 addr;
u64 length;
diff --git a/arch/powerpc/kvm/emulate_loadstore.c 
b/arch/powerpc/kvm/emulate_loadstore.c
index 02304ca..459f8fe 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -113,6 +113,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
vcpu->arch.mmio_sp64_extend = 0;
vcpu->arch.mmio_sign_extend = 0;
vcpu->arch.mmio_vmx_copy_nums = 0;
+   vcpu->arch.mmio_vmx_offset = 0;
vcpu->arch.mmio_host_swabbed = 0;
 
emulated = EMULATE_FAIL;
@@ -154,6 +155,46 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
break;
 #endif
+#ifdef CONFIG_ALTIVEC
+   case LOAD_VMX:
+   if (kvmppc_check_altivec_disabled(vcpu))
+   return EMULATE_DONE;
+
+   /* Hardware enforces alignment of VMX accesses */
+   vcpu->arch.vaddr_accessed &= ~((unsigned long)size - 1);
+   vcpu->arch.paddr_accessed &= ~((unsigned long)size - 1);
+
+   if (size == 16) { /* lvx */
+   vcpu->arch.mmio_copy_type =
+   KVMPPC_VMX_COPY_DWORD;
+   } else if (size == 4) { /* lvewx  */
+   vcpu->arch.mmio_copy_type =
+   KVMPPC_VMX_COPY_WORD;
+   } else if (size == 2) { /* lvehx  */
+   vcpu->arch.mmio_copy_type =
+   KVMPPC_VMX_COPY_HWORD;
+   } else if (size == 1) { /* lvebx  */
+   vcpu->arch.mmio_copy_type =
+   KVMPPC_VMX_COPY_BYTE;
+   } else
+   break;
+
+   vcpu->arch.mmio_vmx_offset =
+  

[PATCH v2 09/10] KVM: PPC: expand mmio_vsx_copy_type to mmio_copy_type to cover VMX load/store elem types

2018-05-07 Thread wei . guo . simon
From: Simon Guo 

VSX MMIO emulation uses mmio_vsx_copy_type to represent the VSX emulated
element size/type, such as KVMPPC_VSX_COPY_DWORD_LOAD, etc. This
patch expands mmio_vsx_copy_type to also cover VMX copy types, such as
KVMPPC_VMX_COPY_BYTE (stvebx/lvebx), etc. As a result,
mmio_vsx_copy_type is renamed to mmio_copy_type.

It is a preparation for reimplementing VMX MMIO emulation.

Signed-off-by: Simon Guo 
---
 arch/powerpc/include/asm/kvm_host.h  |  9 +++--
 arch/powerpc/kvm/emulate_loadstore.c | 14 +++---
 arch/powerpc/kvm/powerpc.c   | 10 +-
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 3fb5e8d..2c4382f 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -456,6 +456,11 @@ struct mmio_hpte_cache {
 #define KVMPPC_VSX_COPY_DWORD_LOAD_DUMP3
 #define KVMPPC_VSX_COPY_WORD_LOAD_DUMP 4
 
+#define KVMPPC_VMX_COPY_BYTE   8
+#define KVMPPC_VMX_COPY_HWORD  9
+#define KVMPPC_VMX_COPY_WORD   10
+#define KVMPPC_VMX_COPY_DWORD  11
+
 struct openpic;
 
 /* W0 and W1 of a XIVE thread management context */
@@ -678,16 +683,16 @@ struct kvm_vcpu_arch {
 * Number of simulations for vsx.
 * If we use 2*8bytes to simulate 1*16bytes,
 * then the number should be 2 and
-* mmio_vsx_copy_type=KVMPPC_VSX_COPY_DWORD.
+* mmio_copy_type=KVMPPC_VSX_COPY_DWORD.
 * If we use 4*4bytes to simulate 1*16bytes,
 * the number should be 4 and
 * mmio_vsx_copy_type=KVMPPC_VSX_COPY_WORD.
 */
u8 mmio_vsx_copy_nums;
u8 mmio_vsx_offset;
-   u8 mmio_vsx_copy_type;
u8 mmio_vsx_tx_sx_enabled;
u8 mmio_vmx_copy_nums;
+   u8 mmio_copy_type;
u8 osi_needed;
u8 osi_enabled;
u8 papr_enabled;
diff --git a/arch/powerpc/kvm/emulate_loadstore.c 
b/arch/powerpc/kvm/emulate_loadstore.c
index 28e97c5..02304ca 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -109,7 +109,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
vcpu->arch.mmio_vsx_tx_sx_enabled = get_tx_or_sx(inst);
vcpu->arch.mmio_vsx_copy_nums = 0;
vcpu->arch.mmio_vsx_offset = 0;
-   vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_NONE;
+   vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE;
vcpu->arch.mmio_sp64_extend = 0;
vcpu->arch.mmio_sign_extend = 0;
vcpu->arch.mmio_vmx_copy_nums = 0;
@@ -171,17 +171,17 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
if (op.element_size == 8)  {
if (op.vsx_flags & VSX_SPLAT)
-   vcpu->arch.mmio_vsx_copy_type =
+   vcpu->arch.mmio_copy_type =
KVMPPC_VSX_COPY_DWORD_LOAD_DUMP;
else
-   vcpu->arch.mmio_vsx_copy_type =
+   vcpu->arch.mmio_copy_type =
KVMPPC_VSX_COPY_DWORD;
} else if (op.element_size == 4) {
if (op.vsx_flags & VSX_SPLAT)
-   vcpu->arch.mmio_vsx_copy_type =
+   vcpu->arch.mmio_copy_type =
KVMPPC_VSX_COPY_WORD_LOAD_DUMP;
else
-   vcpu->arch.mmio_vsx_copy_type =
+   vcpu->arch.mmio_copy_type =
KVMPPC_VSX_COPY_WORD;
} else
break;
@@ -257,10 +257,10 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
vcpu->arch.mmio_sp64_extend = 1;
 
if (op.element_size == 8)
-   vcpu->arch.mmio_vsx_copy_type =
+   vcpu->arch.mmio_copy_type =
KVMPPC_VSX_COPY_DWORD;
else if (op.element_size == 4)
-   vcpu->arch.mmio_vsx_copy_type =
+   vcpu->arch.mmio_copy_type =
KVMPPC_VSX_COPY_WORD;
else
break;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 8ce9e7b..1580bd2 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1080,14 +1080,14 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu 
*vcpu,
if (vcpu->kvm->arch.kvm_ops->giveup_ext)

[PATCH v2 08/10] KVM: PPC: reimplements LOAD_VSX/STORE_VSX instruction mmio emulation with analyse_intr() input

2018-05-07 Thread wei . guo . simon
From: Simon Guo 

This patch reimplements LOAD_VSX/STORE_VSX instruction MMIO emulation
with analyse_instr() input. It utilizes the VSX_FPCONV/VSX_SPLAT/SIGNEXT
flags exported by analyse_instr() and handles them accordingly.

When emulating a VSX store, the VSX register needs to be flushed so that
the right register value can be retrieved before writing to the I/O
memory.

Suggested-by: Paul Mackerras 
Signed-off-by: Simon Guo 
---
 arch/powerpc/kvm/emulate_loadstore.c | 227 ++-
 1 file changed, 91 insertions(+), 136 deletions(-)

diff --git a/arch/powerpc/kvm/emulate_loadstore.c 
b/arch/powerpc/kvm/emulate_loadstore.c
index 5a6571c..28e97c5 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -154,6 +154,54 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
break;
 #endif
+#ifdef CONFIG_VSX
+   case LOAD_VSX: {
+   int io_size_each;
+
+   if (op.vsx_flags & VSX_CHECK_VEC) {
+   if (kvmppc_check_altivec_disabled(vcpu))
+   return EMULATE_DONE;
+   } else {
+   if (kvmppc_check_vsx_disabled(vcpu))
+   return EMULATE_DONE;
+   }
+
+   if (op.vsx_flags & VSX_FPCONV)
+   vcpu->arch.mmio_sp64_extend = 1;
+
+   if (op.element_size == 8)  {
+   if (op.vsx_flags & VSX_SPLAT)
+   vcpu->arch.mmio_vsx_copy_type =
+   KVMPPC_VSX_COPY_DWORD_LOAD_DUMP;
+   else
+   vcpu->arch.mmio_vsx_copy_type =
+   KVMPPC_VSX_COPY_DWORD;
+   } else if (op.element_size == 4) {
+   if (op.vsx_flags & VSX_SPLAT)
+   vcpu->arch.mmio_vsx_copy_type =
+   KVMPPC_VSX_COPY_WORD_LOAD_DUMP;
+   else
+   vcpu->arch.mmio_vsx_copy_type =
+   KVMPPC_VSX_COPY_WORD;
+   } else
+   break;
+
+   if (size < op.element_size) {
+   /* precision convert case: lxsspx, etc */
+   vcpu->arch.mmio_vsx_copy_nums = 1;
+   io_size_each = size;
+   } else { /* lxvw4x, lxvd2x, etc */
+   vcpu->arch.mmio_vsx_copy_nums =
+   size/op.element_size;
+   io_size_each = op.element_size;
+   }
+
+   emulated = kvmppc_handle_vsx_load(run, vcpu,
+   KVM_MMIO_REG_VSX|op.reg, io_size_each,
+   1, op.type & SIGNEXT);
+   break;
+   }
+#endif
case STORE:
/* if need byte reverse, op.val has been reversed by
 * analyse_instr().
@@ -189,6 +237,49 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
break;
 #endif
+#ifdef CONFIG_VSX
+   case STORE_VSX: {
+   int io_size_each;
+
+   if (op.vsx_flags & VSX_CHECK_VEC) {
+   if (kvmppc_check_altivec_disabled(vcpu))
+   return EMULATE_DONE;
+   } else {
+   if (kvmppc_check_vsx_disabled(vcpu))
+   return EMULATE_DONE;
+   }
+
+   if (vcpu->kvm->arch.kvm_ops->giveup_ext)
+   vcpu->kvm->arch.kvm_ops->giveup_ext(vcpu,
+   MSR_VSX);
+
+   if (op.vsx_flags & VSX_FPCONV)
+   vcpu->arch.mmio_sp64_extend = 1;
+
+   if (op.element_size == 8)
+   vcpu->arch.mmio_vsx_copy_type =
+   KVMPPC_VSX_COPY_DWORD;
+   else if (op.element_size == 4)
+   vcpu->arch.mmio_vsx_copy_type =
+   KVMPPC_VSX_COPY_WORD;
+   else
+   break;
+
+   if (size < op.element_size) {
+   /* precise conversion case, like stxsspx */
+   vcpu->arch.mmio_vsx_copy_nums = 1;
