Re: [RFC PATCH v7] ppc: Enable 2nd DAWR support on p10

2024-02-01 Thread Shivaprasad G Bhat

Thanks for the review Nick!

On 1/23/24 17:36, Nicholas Piggin wrote:

On Wed Nov 22, 2023 at 5:32 PM AEST, Shivaprasad G Bhat wrote:

Extend the existing watchpoint facility from TCG DAWR0 emulation
to DAWR1 on POWER10.

As per the PAPR, bit 0 of byte 64 in the pa-features property indicates the
availability of the 2nd DAWR registers, i.e. if this bit is set, the 2nd
DAWR is present, otherwise not. Use the KVM_CAP_PPC_DAWR1 capability to find
out whether KVM supports the 2nd DAWR. If it is supported, allow the user to
set the pa-features bit in the guest DT using the cap-dawr1 machine capability.



I don't really like the macros. I have nightmares from Linux going
overboard with defining functions using spaghetti of generator macros.

Could you just make most functions accept either SPR number or number
(0, 1), or simply use if/else, to select between them?
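For illustration, the suggestion amounts to something along these lines (a
hypothetical sketch of the shape, not code from the patch):

    /* one store routine, with the index selecting the register pair */
    void ppc_store_dawr(CPUPPCState *env, int rid, target_ulong val)
    {
        env->spr[rid ? SPR_DAWR1 : SPR_DAWR0] = val;
        ppc_update_daw(env, rid);
    }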

Splitting the change in 2 would be good, first add regs + TCG, then the
spapr bits.

Sure.

[snip]


diff --git a/target/ppc/misc_helper.c b/target/ppc/misc_helper.c
index a05bdf78c9..022b984e00 100644
--- a/target/ppc/misc_helper.c
+++ b/target/ppc/misc_helper.c
@@ -204,16 +204,24 @@ void helper_store_ciabr(CPUPPCState *env, target_ulong 
value)
  ppc_store_ciabr(env, value);
  }

-void helper_store_dawr0(CPUPPCState *env, target_ulong value)
-{
-ppc_store_dawr0(env, value);
+#define HELPER_STORE_DAWR(id) \
+void helper_store_dawr##id(CPUPPCState *env, target_ulong value)  \
+{ \
+env->spr[SPR_DAWR##id] = value;   \
  }

-void helper_store_dawrx0(CPUPPCState *env, target_ulong value)
-{
-ppc_store_dawrx0(env, value);
+#define HELPER_STORE_DAWRX(id)\
+void helper_store_dawrx##id(CPUPPCState *env, target_ulong value) \
+{ \
+env->spr[SPR_DAWRX##id] = value;  \
  }

Did we lose the calls to ppc_store_dawr*? That will
break direct register access (i.e., powernv) if so.


Yes. My test cases were focused on cap-dawr1 with pSeries use cases, and I
missed this. I have taken care of it in the next version.


+HELPER_STORE_DAWR(0)
+HELPER_STORE_DAWRX(0)
+
+HELPER_STORE_DAWR(1)
+HELPER_STORE_DAWRX(1)

I would say open-code all these too instead of generating. If we
ever grew to >= 4 of them maybe, but as is this saves 2 lines,
and makes 'helper_store_dawrx0' more difficult to grep for.


I open-coded all of the functions; without the macros it adds up to barely
12 more lines.
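For reference, the open-coded helpers look roughly like this (a sketch of
the shape, not copied verbatim from the posted v8):

    void helper_store_dawr1(CPUPPCState *env, target_ulong value)
    {
        ppc_store_dawr1(env, value);
    }

    void helper_store_dawrx1(CPUPPCState *env, target_ulong value)
    {
        ppc_store_dawrx1(env, value);
    }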


The next version is posted at:

https://lore.kernel.org/qemu-devel/170679876639.188422.11634974895844092362.st...@ltc-boston1.aus.stglabs.ibm.com/T/#t


Thanks,

Shivaprasad




[PATCH v8 1/2] ppc: Enable 2nd DAWR support on Power10 PowerNV machine

2024-02-01 Thread Shivaprasad G Bhat
Extend the existing watchpoint facility from TCG DAWR0 emulation
to DAWR1 on POWER10.

Signed-off-by: Shivaprasad G Bhat 
---
 target/ppc/cpu.c |   45 --
 target/ppc/cpu.h |8 +-
 target/ppc/cpu_init.c|   15 +++
 target/ppc/excp_helper.c |   61 ++
 target/ppc/helper.h  |2 ++
 target/ppc/machine.c |3 ++
 target/ppc/misc_helper.c |   10 
 target/ppc/spr_common.h  |2 ++
 target/ppc/translate.c   |   12 +
 9 files changed, 115 insertions(+), 43 deletions(-)

diff --git a/target/ppc/cpu.c b/target/ppc/cpu.c
index e3ad8e0c27..d5ac9bb888 100644
--- a/target/ppc/cpu.c
+++ b/target/ppc/cpu.c
@@ -130,11 +130,13 @@ void ppc_store_ciabr(CPUPPCState *env, target_ulong val)
 ppc_update_ciabr(env);
 }
 
-void ppc_update_daw0(CPUPPCState *env)
+void ppc_update_daw(CPUPPCState *env, int rid)
 {
 CPUState *cs = env_cpu(env);
-target_ulong deaw = env->spr[SPR_DAWR0] & PPC_BITMASK(0, 60);
-uint32_t dawrx = env->spr[SPR_DAWRX0];
+int spr_dawr = !rid ? SPR_DAWR0 : SPR_DAWR1;
+int spr_dawrx = !rid ? SPR_DAWRX0 : SPR_DAWRX1;
+target_ulong deaw = env->spr[spr_dawr] & PPC_BITMASK(0, 60);
+uint32_t dawrx = env->spr[spr_dawrx];
 int mrd = extract32(dawrx, PPC_BIT_NR(48), 54 - 48);
 bool dw = extract32(dawrx, PPC_BIT_NR(57), 1);
 bool dr = extract32(dawrx, PPC_BIT_NR(58), 1);
@@ -144,9 +146,9 @@ void ppc_update_daw0(CPUPPCState *env)
 vaddr len;
 int flags;
 
-if (env->dawr0_watchpoint) {
-cpu_watchpoint_remove_by_ref(cs, env->dawr0_watchpoint);
-env->dawr0_watchpoint = NULL;
+if (env->dawr_watchpoint[rid]) {
+cpu_watchpoint_remove_by_ref(cs, env->dawr_watchpoint[rid]);
+env->dawr_watchpoint[rid] = NULL;
 }
 
 if (!dr && !dw) {
@@ -166,28 +168,45 @@ void ppc_update_daw0(CPUPPCState *env)
 flags |= BP_MEM_WRITE;
 }
 
-    cpu_watchpoint_insert(cs, deaw, len, flags, &env->dawr0_watchpoint);
+    cpu_watchpoint_insert(cs, deaw, len, flags, &env->dawr_watchpoint[rid]);
 }
 
 void ppc_store_dawr0(CPUPPCState *env, target_ulong val)
 {
 env->spr[SPR_DAWR0] = val;
-ppc_update_daw0(env);
+ppc_update_daw(env, 0);
 }
 
-void ppc_store_dawrx0(CPUPPCState *env, uint32_t val)
+static void ppc_store_dawrx(CPUPPCState *env, uint32_t val, int rid)
 {
 int hrammc = extract32(val, PPC_BIT_NR(56), 1);
 
 if (hrammc) {
 /* This might be done with a second watchpoint at the xor of DEAW[0] */
-qemu_log_mask(LOG_UNIMP, "%s: DAWRX0[HRAMMC] is unimplemented\n",
-  __func__);
+qemu_log_mask(LOG_UNIMP, "%s: DAWRX%d[HRAMMC] is unimplemented\n",
+  __func__, rid);
 }
 
-env->spr[SPR_DAWRX0] = val;
-ppc_update_daw0(env);
+env->spr[!rid ? SPR_DAWRX0 : SPR_DAWRX1] = val;
+ppc_update_daw(env, rid);
+}
+
+void ppc_store_dawrx0(CPUPPCState *env, uint32_t val)
+{
+ppc_store_dawrx(env, val, 0);
+}
+
+void ppc_store_dawr1(CPUPPCState *env, target_ulong val)
+{
+env->spr[SPR_DAWR1] = val;
+ppc_update_daw(env, 1);
+}
+
+void ppc_store_dawrx1(CPUPPCState *env, uint32_t val)
+{
+ppc_store_dawrx(env, val, 1);
 }
+
 #endif
 #endif
 
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index f8101ffa29..18dcc438ea 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1236,7 +1236,7 @@ struct CPUArchState {
 #if defined(TARGET_PPC64)
 ppc_slb_t slb[MAX_SLB_ENTRIES]; /* PowerPC 64 SLB area */
 struct CPUBreakpoint *ciabr_breakpoint;
-struct CPUWatchpoint *dawr0_watchpoint;
+struct CPUWatchpoint *dawr_watchpoint[2];
 #endif
 target_ulong sr[32];   /* segment registers */
 uint32_t nb_BATs;  /* number of BATs */
@@ -1549,9 +1549,11 @@ void ppc_store_sdr1(CPUPPCState *env, target_ulong 
value);
 void ppc_store_lpcr(PowerPCCPU *cpu, target_ulong val);
 void ppc_update_ciabr(CPUPPCState *env);
 void ppc_store_ciabr(CPUPPCState *env, target_ulong value);
-void ppc_update_daw0(CPUPPCState *env);
+void ppc_update_daw(CPUPPCState *env, int rid);
 void ppc_store_dawr0(CPUPPCState *env, target_ulong value);
 void ppc_store_dawrx0(CPUPPCState *env, uint32_t value);
+void ppc_store_dawr1(CPUPPCState *env, target_ulong value);
+void ppc_store_dawrx1(CPUPPCState *env, uint32_t value);
 #endif /* !defined(CONFIG_USER_ONLY) */
 void ppc_store_msr(CPUPPCState *env, target_ulong value);
 
@@ -1737,9 +1739,11 @@ void ppc_compat_add_property(Object *obj, const char 
*name,
 #define SPR_PSPB  (0x09F)
 #define SPR_DPDES (0x0B0)
 #define SPR_DAWR0 (0x0B4)
+#define SPR_DAWR1 (0x0B5)
 #define SPR_RPR   (0x0BA)
 #define SPR_CIABR (0x0BB)
 #define SPR_DAWRX0(0x0BC)
+#define SPR_DAWRX1(0x0BD)
 #define SPR_HFSCR (0x0BE)
 #de

[PATCH v8 2/2] ppc: spapr: Enable 2nd DAWR on Power10 pSeries machine

2024-02-01 Thread Shivaprasad G Bhat
As per the PAPR, bit 0 of byte 64 in the pa-features property indicates the
availability of the 2nd DAWR registers, i.e. if this bit is set, the 2nd
DAWR is present, otherwise not. Use the KVM_CAP_PPC_DAWR1 capability to find
out whether KVM supports the 2nd DAWR. If it is supported, allow the user to
set the pa-features bit in the guest DT using the cap-dawr1 machine capability.

Signed-off-by: Ravi Bangoria 
Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr.c |7 ++-
 hw/ppc/spapr_caps.c|   36 
 hw/ppc/spapr_hcall.c   |   25 -
 include/hw/ppc/spapr.h |6 +-
 target/ppc/kvm.c   |   12 
 target/ppc/kvm_ppc.h   |   12 
 6 files changed, 87 insertions(+), 11 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index e8dabc8614..91a97d72e7 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -262,7 +262,7 @@ static void spapr_dt_pa_features(SpaprMachineState *spapr,
 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
 /* 54: DecFP, 56: DecI, 58: SHA */
 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
-/* 60: NM atomic, 62: RNG */
+/* 60: NM atomic, 62: RNG, 64: DAWR1 (ISA 3.1) */
 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
 };
 uint8_t *pa_features = NULL;
@@ -303,6 +303,9 @@ static void spapr_dt_pa_features(SpaprMachineState *spapr,
  * in pa-features. So hide it from them. */
 pa_features[40 + 2] &= ~0x80; /* Radix MMU */
 }
+if (spapr_get_cap(spapr, SPAPR_CAP_DAWR1)) {
+pa_features[66] |= 0x80;
+}
 
 _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
 }
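For reference, the index arithmetic behind the hunk above (a sketch, not
part of the patch): the pa_features[] array carries a two-byte header before
the feature bytes, as the existing Radix MMU tweak at pa_features[40 + 2]
shows, so PAPR byte 64 lives at array index 64 + 2 = 66, and PAPR bit 0 is
the most-significant bit of that byte:

    pa_features[64 + 2] |= 0x80;    /* byte 64, bit 0: 2nd DAWR available */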
@@ -2138,6 +2141,7 @@ static const VMStateDescription vmstate_spapr = {
         &vmstate_spapr_cap_fwnmi,
         &vmstate_spapr_fwnmi,
         &vmstate_spapr_cap_rpt_invalidate,
+        &vmstate_spapr_cap_dawr1,
 NULL
 }
 };
@@ -4717,6 +4721,7 @@ static void spapr_machine_class_init(ObjectClass *oc, 
void *data)
 smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
 smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON;
 smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF;
+smc->default_caps.caps[SPAPR_CAP_DAWR1] = SPAPR_CAP_OFF;
 
 /*
  * This cap specifies whether the AIL 3 mode for
diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
index e889244e52..677f17cea6 100644
--- a/hw/ppc/spapr_caps.c
+++ b/hw/ppc/spapr_caps.c
@@ -655,6 +655,32 @@ static void cap_ail_mode_3_apply(SpaprMachineState *spapr,
 }
 }
 
+static void cap_dawr1_apply(SpaprMachineState *spapr, uint8_t val,
+   Error **errp)
+{
+ERRP_GUARD();
+
+if (!val) {
+return; /* Disable by default */
+}
+
+if (!ppc_type_check_compat(MACHINE(spapr)->cpu_type,
+   CPU_POWERPC_LOGICAL_3_10, 0,
+   spapr->max_compat_pvr)) {
+warn_report("DAWR1 supported only on POWER10 and later CPUs");
+}
+
+if (kvm_enabled()) {
+if (!kvmppc_has_cap_dawr1()) {
+error_setg(errp, "DAWR1 not supported by KVM.");
+error_append_hint(errp, "Try appending -machine cap-dawr1=off");
+} else if (kvmppc_set_cap_dawr1(val) < 0) {
+error_setg(errp, "Error enabling cap-dawr1 with KVM.");
+error_append_hint(errp, "Try appending -machine cap-dawr1=off");
+}
+}
+}
+
 SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
 [SPAPR_CAP_HTM] = {
 .name = "htm",
@@ -781,6 +807,15 @@ SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
 .type = "bool",
 .apply = cap_ail_mode_3_apply,
 },
+[SPAPR_CAP_DAWR1] = {
+.name = "dawr1",
+.description = "Allow 2nd Data Address Watchpoint Register (DAWR1)",
+.index = SPAPR_CAP_DAWR1,
+.get = spapr_cap_get_bool,
+.set = spapr_cap_set_bool,
+.type = "bool",
+.apply = cap_dawr1_apply,
+},
 };
 
 static SpaprCapabilities default_caps_with_cpu(SpaprMachineState *spapr,
@@ -923,6 +958,7 @@ SPAPR_CAP_MIG_STATE(large_decr, 
SPAPR_CAP_LARGE_DECREMENTER);
 SPAPR_CAP_MIG_STATE(ccf_assist, SPAPR_CAP_CCF_ASSIST);
 SPAPR_CAP_MIG_STATE(fwnmi, SPAPR_CAP_FWNMI);
 SPAPR_CAP_MIG_STATE(rpt_invalidate, SPAPR_CAP_RPT_INVALIDATE);
+SPAPR_CAP_MIG_STATE(dawr1, SPAPR_CAP_DAWR1);
 
 void spapr_caps_init(SpaprMachineState *spapr)
 {
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index fcefd1d1c7..34c1c77c95 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -814,11 +814,12 @@ static target_ulong 
h_set_mode_resource_set_ciabr(PowerPCCPU *cpu,
 return H_SUCCESS;
 }
 
-static target_ulong h_set_mode_resource_set_dawr0(PowerPCCPU *cpu,
-  SpaprMachineState *spapr,
-  

[PATCH v8 0/2] ppc: Enable 2nd DAWR support on Power10

2024-02-01 Thread Shivaprasad G Bhat
Extends the existing watchpoint facility from TCG DAWR0 emulation to DAWR1 on
Power10 for powernv in the first patch, and for pseries in the second patch
with both TCG and KVM.
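The capability can be tried out per machine (a usage sketch; the KVM case
assumes a POWER10 host kernel exposing KVM_CAP_PPC_DAWR1):

    qemu-system-ppc64 -machine pseries,cap-dawr1=on -cpu POWER10 ...

cap-dawr1 defaults to off; if the host KVM lacks the capability, the apply
hook errors out and suggests appending -machine cap-dawr1=off.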

---
Changelog:
v7: 
https://lore.kernel.org/qemu-devel/170063834599.621665.9541440879278084501.st...@ltcd48-lp2.aus.stglab.ibm.com/
v7->v8:
  - Fixed the missed out ppc_store_dawr* calls.
  - Removed the macros and split the patch into two: the first one just enables
the facility for powernv, and the next one does the same for the pseries guest.
  - Removing the macros increased the line count by barely 12 compared to
the previous version.

v6: 
https://lore.kernel.org/qemu-devel/168871963321.58984.15628382614621248470.stgit@ltcd89-lp2/
v6->v7:
  - Sorry about the delay in sending out this version, I have dropped the
Reviewed-bys as suggested and converted the patch back to an RFC.
  - Added the TCG support. Basically, converted the existing DAWR0 support
routines into macros for reuse by the DAWR1. Let me know if the macro
conversions should be moved to a separate independent patch.
  - As the dawr1 works on TCG, the checks in cap_dawr1_apply() report a warning
now only for P9 or P9 compat modes for both KVM and TCG use cases.
  - 'make test' passes for caps checks. Also, as suggested by Greg Kurz, the
'make test' after making the DAWR1 default 'on' and updating the default cpu
to Power10, shows no failures.

v5: 
https://lore.kernel.org/all/20210412114433.129702-1-ravi.bango...@linux.ibm.com/
v5->v6:
  - The other patches in the original series already merged.
  - Rebased to the top of the tree. So, the gen_spr_book3s_310_dbg() is renamed
to register_book3s_310_dbg_sprs() and moved to cpu_init.c accordingly.
  - No functional changes.

v4: 
https://lore.kernel.org/r/20210406053833.282907-1-ravi.bango...@linux.ibm.com
v3->v4:
  - Make error message more proper.

v3: https://lore.kernel.org/r/20210330095350.36309-1-ravi.bango...@linux.ibm.com
v3->v4:
  - spapr_dt_pa_features(): POWER10 processor is compatible with 3.0
(PCR_COMPAT_3_00). No need to ppc_check_compat(3_10) for now as
ppc_check_compat(3_00) will also be true. ppc_check_compat(3_10)
can be added while introducing pa_features_310 in future.
  - Use error_append_hint() for hints. Also add ERRP_GUARD().
  - Add kvmppc_set_cap_dawr1() stub function for CONFIG_KVM=n.

v2: 
https://lore.kernel.org/r/20210329041906.213991-1-ravi.bango...@linux.ibm.com
v2->v3:
  - Don't introduce pa_features_310[], instead, reuse pa_features_300[]
for 3.1 guests, as there is no difference between initial values of
them atm.
  - Call gen_spr_book3s_310_dbg() from init_proc_POWER10() instead of
init_proc_POWER8(). Also, Don't call gen_spr_book3s_207_dbg() from
gen_spr_book3s_310_dbg() as init_proc_POWER10() already calls it.

v1: 
https://lore.kernel.org/r/20200723104220.314671-1-ravi.bango...@linux.ibm.com
v1->v2:
  - Introduce machine capability cap-dawr1 to enable/disable
the feature. By default, 2nd DAWR is OFF for guests even
when host kvm supports it. User has to manually enable it
with -machine cap-dawr1=on if he wishes to use it.
  - Split the header file changes into separate patch. (Sync
headers from v5.12-rc3)

Shivaprasad G Bhat (2):
  ppc: Enable 2nd DAWR support on Power10 PowerNV machine
  ppc: spapr: Enable 2nd DAWR on Power10 pSeries machine


 hw/ppc/spapr.c   |  7 -
 hw/ppc/spapr_caps.c  | 36 
 hw/ppc/spapr_hcall.c | 25 ++--
 include/hw/ppc/spapr.h   |  6 +++-
 target/ppc/cpu.c | 45 -
 target/ppc/cpu.h |  8 --
 target/ppc/cpu_init.c| 15 ++
 target/ppc/excp_helper.c | 61 ++--
 target/ppc/helper.h  |  2 ++
 target/ppc/kvm.c | 12 
 target/ppc/kvm_ppc.h | 12 
 target/ppc/machine.c |  3 +-
 target/ppc/misc_helper.c | 10 +++
 target/ppc/spr_common.h  |  2 ++
 target/ppc/translate.c   | 12 
 15 files changed, 202 insertions(+), 54 deletions(-)

--
Signature




[RFC PATCH v7] ppc: Enable 2nd DAWR support on p10

2023-11-21 Thread Shivaprasad G Bhat
Extend the existing watchpoint facility from TCG DAWR0 emulation
to DAWR1 on POWER10.

As per the PAPR, bit 0 of byte 64 in the pa-features property indicates the
availability of the 2nd DAWR registers, i.e. if this bit is set, the 2nd
DAWR is present, otherwise not. Use the KVM_CAP_PPC_DAWR1 capability to find
out whether KVM supports the 2nd DAWR. If it is supported, allow the user to
set the pa-features bit in the guest DT using the cap-dawr1 machine capability.

Signed-off-by: Ravi Bangoria 
Signed-off-by: Shivaprasad G Bhat 
---
Changelog:
v6: 
https://lore.kernel.org/qemu-devel/168871963321.58984.15628382614621248470.stgit@ltcd89-lp2/
v6->v7:
  - Sorry about the delay in sending out this version, I have dropped the
Reviewed-bys as suggested and converted the patch back to an RFC.
  - Added the TCG support. Basically, converted the existing DAWR0 support
routines into macros for reuse by the DAWR1. Let me know if the macro
conversions should be moved to a separate independent patch.
  - As the dawr1 works on TCG, the checks in cap_dawr1_apply() report a warning
now only for P9 or P9 compat modes for both KVM and TCG use cases.
  - 'make test' passes for caps checks. Also, as suggested by Greg Kurz, the
'make test' after making the DAWR1 default 'on' and updating the default cpu
to Power10, shows no failures.

v5: 
https://lore.kernel.org/all/20210412114433.129702-1-ravi.bango...@linux.ibm.com/
v5->v6:
  - The other patches in the original series already merged.
  - Rebased to the top of the tree. So, the gen_spr_book3s_310_dbg() is renamed
to register_book3s_310_dbg_sprs() and moved to cpu_init.c accordingly.
  - No functional changes.

v4: 
https://lore.kernel.org/r/20210406053833.282907-1-ravi.bango...@linux.ibm.com
v3->v4:
  - Make error message more proper.

v3: https://lore.kernel.org/r/20210330095350.36309-1-ravi.bango...@linux.ibm.com
v3->v4:
  - spapr_dt_pa_features(): POWER10 processor is compatible with 3.0
(PCR_COMPAT_3_00). No need to ppc_check_compat(3_10) for now as
ppc_check_compat(3_00) will also be true. ppc_check_compat(3_10)
can be added while introducing pa_features_310 in future.
  - Use error_append_hint() for hints. Also add ERRP_GUARD().
  - Add kvmppc_set_cap_dawr1() stub function for CONFIG_KVM=n.

v2: 
https://lore.kernel.org/r/20210329041906.213991-1-ravi.bango...@linux.ibm.com
v2->v3:
  - Don't introduce pa_features_310[], instead, reuse pa_features_300[]
for 3.1 guests, as there is no difference between initial values of
them atm.
  - Call gen_spr_book3s_310_dbg() from init_proc_POWER10() instead of
init_proc_POWER8(). Also, Don't call gen_spr_book3s_207_dbg() from
gen_spr_book3s_310_dbg() as init_proc_POWER10() already calls it.

v1: 
https://lore.kernel.org/r/20200723104220.314671-1-ravi.bango...@linux.ibm.com
v1->v2:
  - Introduce machine capability cap-dawr1 to enable/disable
the feature. By default, 2nd DAWR is OFF for guests even
when host kvm supports it. User has to manually enable it
with -machine cap-dawr1=on if he wishes to use it.
  - Split the header file changes into separate patch. (Sync
headers from v5.12-rc3)

 hw/ppc/spapr.c   |7 ++-
 hw/ppc/spapr_caps.c  |   35 ++
 hw/ppc/spapr_hcall.c |   50 
 include/hw/ppc/spapr.h   |6 ++
 target/ppc/cpu.c |  114 +-
 target/ppc/cpu.h |6 ++
 target/ppc/cpu_init.c|   15 ++
 target/ppc/excp_helper.c |   61 ++---
 target/ppc/helper.h  |2 +
 target/ppc/kvm.c |   12 +
 target/ppc/kvm_ppc.h |   12 +
 target/ppc/machine.c |1
 target/ppc/misc_helper.c |   20 ++--
 target/ppc/spr_common.h  |2 +
 target/ppc/translate.c   |   25 +++---
 15 files changed, 253 insertions(+), 115 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index df09aa9d6a..c1cb47464b 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -262,7 +262,7 @@ static void spapr_dt_pa_features(SpaprMachineState *spapr,
 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
 /* 54: DecFP, 56: DecI, 58: SHA */
 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
-/* 60: NM atomic, 62: RNG */
+/* 60: NM atomic, 62: RNG, 64: DAWR1 (ISA 3.1) */
 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
 };
 uint8_t *pa_features = NULL;
@@ -303,6 +303,9 @@ static void spapr_dt_pa_features(SpaprMachineState *spapr,
  * in pa-features. So hide it from them. */
 pa_features[40 + 2] &= ~0x80; /* Radix MMU */
 }
+if (spapr_get_cap(spapr, SPAPR_CAP_DAWR1)) {
+pa_features[66] |= 0x80;
+}

 _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
 }
@@ -2138,6 +2141,7 @@ static const VMStateDescription vmstate_spapr = {
         &vmstate_spapr_cap_fwnmi,
         &vmstate_spapr_fwnmi,
         &vmstate_spapr_cap_rpt_invalidate,
+   

Re: [PATCH v6] ppc: Enable 2nd DAWR support on p10

2023-07-07 Thread Shivaprasad G Bhat



On 7/7/23 19:54, Cédric Le Goater wrote:

On 7/7/23 13:59, Greg Kurz wrote:

Hi Daniel and Shiva !

On Fri, 7 Jul 2023 08:09:47 -0300
Daniel Henrique Barboza  wrote:


This one was a buzzer shot.



Indeed ! :-) I would have appreciated some more time to re-assess
my R-b tag on this 2 year old bug though ;-)


We should drop that patch IMO and ask for a resend with more tests
but that's a lot of work to build a PR :/


Hi Cedric,


I will be taking care of Greg's comment on avoiding failures in TCG mode for

cap-dawr1=on. I have already shared the "make test" results.


Do you want me to try any other tests?


Daniel, apologies again for forcing you to rebuild the PR.


Thanks,

Shivaprasad





Re: [PATCH v6] ppc: Enable 2nd DAWR support on p10

2023-07-07 Thread Shivaprasad G Bhat

On 7/7/23 17:52, Daniel Henrique Barboza wrote:



On 7/7/23 08:59, Greg Kurz wrote:

Hi Daniel and Shiva !

On Fri, 7 Jul 2023 08:09:47 -0300
Daniel Henrique Barboza  wrote:


This one was a buzzer shot.



Indeed ! :-) I would have appreciated some more time to re-assess
my R-b tag on this 2 year old bug though ;-)


My bad! I never thought it was that old. Never occured to me to check 
when

the previous version was sent.

Folks, please bear in mind that a Reviewed-by is given on the context 
when the
patch was sent. A handful of months? Keep the R-bs. 6 months, from one 
release

to the other? Things starts to get a little murky. 2 years? hahaha c'mon



Apologies, since v5 didn't need any rework I retained the Reviewed-bys.

I agree, I should have been explicit in changelog about how old it is.



At the very least you need to point out that the acks are old.




My concerns were that the DAWR1 spapr cap was still not enabled by
default but I guess it is because POWER9 is still the default cpu
type. Related, the apply function should probably spit a warning
with TCG instead of failing, like already done for some other
TCG limitations (e.g. cap_safe_bounds_check_apply()). This will
be needed for `make test` to succeed when DAWR1 is eventually
enabled by default. Not needed right now.
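For illustration, the pattern referred to is roughly this (a sketch modelled
on the existing safe-cap apply handlers, not the final code):

    if (tcg_enabled() && val) {
        /* warn instead of failing, so 'make test' keeps passing under TCG */
        warn_report("cap-dawr1 is not fully supported under TCG");
    } else if (kvm_enabled() && !kvmppc_has_cap_dawr1()) {
        error_setg(errp, "DAWR1 not supported by KVM");
        error_append_hint(errp, "Try appending -machine cap-dawr1=off\n");
    }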


Thanks Greg, I will convert the errors to warnings for DAWR1 caps checks

in the next version. However, I dont see any new "make test" failures 
with the patch.


Here are the logs "make test",

With patch - 
https://gist.github.com/shivaprasadbhat/859f7f4a0c105ac1232b7ab5d8e161e8#file-gistfile1-txt


Without patch - 
https://gist.github.com/shivaprasadbhat/25e5db9254cbe3292017f16adf41ecc1#file-gistfile1-txt




My R-b still stands then ! :-)


This patch got lucky then. If you/Cedric remove your acks I would 
simply drop the
patch and re-send the PR with the greatest of ease, no remorse 
whatsoever.



Thanks,

Daniel



Cheers,

--
Greg



Queued in gitlab.com/danielhb/qemu/tree/ppc-next. Thanks,


Daniel


On 7/7/23 05:47, Shivaprasad G Bhat wrote:

From: Ravi Bangoria 

As per the PAPR, bit 0 of byte 64 in pa-features property
indicates availability of 2nd DAWR registers. i.e. If this bit is 
set, 2nd
DAWR is present, otherwise not. Use KVM_CAP_PPC_DAWR1 capability to 
find
whether kvm supports 2nd DAWR or not. If it's supported, allow user 
to set
the pa-feature bit in guest DT using cap-dawr1 machine capability. 
Though,
watchpoint on powerpc TCG guest is not supported and thus 2nd DAWR 
is not

enabled for TCG mode.

Signed-off-by: Ravi Bangoria 
Reviewed-by: Greg Kurz 
Reviewed-by: Cédric Le Goater 
Signed-off-by: Shivaprasad G Bhat 
---
Changelog:
v5: 
https://lore.kernel.org/all/20210412114433.129702-1-ravi.bango...@linux.ibm.com/

v5->v6:
    - The other patches in the original series already merged.
    - Rebased to the top of the tree. So, the 
gen_spr_book3s_310_dbg() is renamed
  to register_book3s_310_dbg_sprs() and moved to cpu_init.c 
accordingly.

    - No functional changes.

v4: 
https://lore.kernel.org/r/20210406053833.282907-1-ravi.bango...@linux.ibm.com

v3->v4:
    - Make error message more proper.

v3: 
https://lore.kernel.org/r/20210330095350.36309-1-ravi.bango...@linux.ibm.com

v3->v4:
    - spapr_dt_pa_features(): POWER10 processor is compatible with 3.0
  (PCR_COMPAT_3_00). No need to ppc_check_compat(3_10) for now as
  ppc_check_compati(3_00) will also be true. 
ppc_check_compat(3_10)

  can be added while introducing pa_features_310 in future.
    - Use error_append_hint() for hints. Also add ERRP_GUARD().
    - Add kvmppc_set_cap_dawr1() stub function for CONFIG_KVM=n.

v2: 
https://lore.kernel.org/r/20210329041906.213991-1-ravi.bango...@linux.ibm.com

v2->v3:
    - Don't introduce pa_features_310[], instead, reuse 
pa_features_300[]
  for 3.1 guests, as there is no difference between initial 
values of

  them atm.
    - Call gen_spr_book3s_310_dbg() from init_proc_POWER10() 
instead of
  init_proc_POWER8(). Also, Don't call gen_spr_book3s_207_dbg() 
from
  gen_spr_book3s_310_dbg() as init_proc_POWER10() already calls 
it.


v1: 
https://lore.kernel.org/r/20200723104220.314671-1-ravi.bango...@linux.ibm.com

v1->v2:
    - Introduce machine capability cap-dawr1 to enable/disable
  the feature. By default, 2nd DAWR is OFF for guests even
  when host kvm supports it. User has to manually enable it
  with -machine cap-dawr1=on if he wishes to use it.
    - Split the header file changes into separate patch. (Sync
  headers from v5.12-rc3)

[1] https://git.kernel.org/torvalds/c/bd1de1a0e6eff

   hw/ppc/spapr.c |    7 ++-
   hw/ppc/spapr_caps.c    |   32 
   include/hw/ppc/spapr.h |    6 +-
   target/ppc/cpu.h   |    2 ++
   target/ppc/cpu_init.c  |   15 +++
   target/ppc/kvm.c   |   12 
   target/ppc/kvm_ppc.h   |   12 +++

[PATCH v6] ppc: Enable 2nd DAWR support on p10

2023-07-07 Thread Shivaprasad G Bhat
From: Ravi Bangoria 

As per the PAPR, bit 0 of byte 64 in the pa-features property indicates the
availability of the 2nd DAWR registers, i.e. if this bit is set, the 2nd
DAWR is present, otherwise not. Use the KVM_CAP_PPC_DAWR1 capability to find
out whether KVM supports the 2nd DAWR. If it is supported, allow the user to
set the pa-features bit in the guest DT using the cap-dawr1 machine
capability. However, watchpoints on powerpc TCG guests are not supported,
and thus the 2nd DAWR is not enabled for TCG mode.

Signed-off-by: Ravi Bangoria 
Reviewed-by: Greg Kurz 
Reviewed-by: Cédric Le Goater 
Signed-off-by: Shivaprasad G Bhat 
---
Changelog:
v5: 
https://lore.kernel.org/all/20210412114433.129702-1-ravi.bango...@linux.ibm.com/
v5->v6:
  - The other patches in the original series already merged.
  - Rebased to the top of the tree. So, the gen_spr_book3s_310_dbg() is renamed
to register_book3s_310_dbg_sprs() and moved to cpu_init.c accordingly.
  - No functional changes.

v4: 
https://lore.kernel.org/r/20210406053833.282907-1-ravi.bango...@linux.ibm.com
v3->v4:
  - Make error message more proper.

v3: https://lore.kernel.org/r/20210330095350.36309-1-ravi.bango...@linux.ibm.com
v3->v4:
  - spapr_dt_pa_features(): POWER10 processor is compatible with 3.0
(PCR_COMPAT_3_00). No need to ppc_check_compat(3_10) for now as
ppc_check_compat(3_00) will also be true. ppc_check_compat(3_10)
can be added while introducing pa_features_310 in future.
  - Use error_append_hint() for hints. Also add ERRP_GUARD().
  - Add kvmppc_set_cap_dawr1() stub function for CONFIG_KVM=n.

v2: 
https://lore.kernel.org/r/20210329041906.213991-1-ravi.bango...@linux.ibm.com
v2->v3:
  - Don't introduce pa_features_310[], instead, reuse pa_features_300[]
for 3.1 guests, as there is no difference between initial values of
them atm.
  - Call gen_spr_book3s_310_dbg() from init_proc_POWER10() instead of
init_proc_POWER8(). Also, Don't call gen_spr_book3s_207_dbg() from
gen_spr_book3s_310_dbg() as init_proc_POWER10() already calls it.

v1: 
https://lore.kernel.org/r/20200723104220.314671-1-ravi.bango...@linux.ibm.com
v1->v2:
  - Introduce machine capability cap-dawr1 to enable/disable
the feature. By default, 2nd DAWR is OFF for guests even
when host kvm supports it. User has to manually enable it
with -machine cap-dawr1=on if he wishes to use it.
  - Split the header file changes into separate patch. (Sync
headers from v5.12-rc3)

[1] https://git.kernel.org/torvalds/c/bd1de1a0e6eff

 hw/ppc/spapr.c |7 ++-
 hw/ppc/spapr_caps.c|   32 
 include/hw/ppc/spapr.h |6 +-
 target/ppc/cpu.h   |2 ++
 target/ppc/cpu_init.c  |   15 +++
 target/ppc/kvm.c   |   12 
 target/ppc/kvm_ppc.h   |   12 
 7 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 54dbfd7fe9..1e54e0c719 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -241,7 +241,7 @@ static void spapr_dt_pa_features(SpaprMachineState *spapr,
 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
 /* 54: DecFP, 56: DecI, 58: SHA */
 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
-/* 60: NM atomic, 62: RNG */
+/* 60: NM atomic, 62: RNG, 64: DAWR1 (ISA 3.1) */
 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
 };
 uint8_t *pa_features = NULL;
@@ -282,6 +282,9 @@ static void spapr_dt_pa_features(SpaprMachineState *spapr,
  * in pa-features. So hide it from them. */
 pa_features[40 + 2] &= ~0x80; /* Radix MMU */
 }
+if (spapr_get_cap(spapr, SPAPR_CAP_DAWR1)) {
+pa_features[66] |= 0x80;
+}

 _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
 }
@@ -2084,6 +2087,7 @@ static const VMStateDescription vmstate_spapr = {
         &vmstate_spapr_cap_fwnmi,
         &vmstate_spapr_fwnmi,
         &vmstate_spapr_cap_rpt_invalidate,
+        &vmstate_spapr_cap_dawr1,
 NULL
 }
 };
@@ -4683,6 +4687,7 @@ static void spapr_machine_class_init(ObjectClass *oc, 
void *data)
 smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
 smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON;
 smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF;
+smc->default_caps.caps[SPAPR_CAP_DAWR1] = SPAPR_CAP_OFF;

 /*
  * This cap specifies whether the AIL 3 mode for
diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
index 5a0755d34f..2f2cf4a250 100644
--- a/hw/ppc/spapr_caps.c
+++ b/hw/ppc/spapr_caps.c
@@ -655,6 +655,28 @@ static void cap_ail_mode_3_apply(SpaprMachineState *spapr,
 }
 }

+static void cap_dawr1_apply(SpaprMachineState *spapr, uint8_t val,
+   Error **errp)
+{
+ERRP_GUARD();
+if (!val) {
+return; /* Disable by default */
+}
+
+if (tcg_enabled()) {
+error_setg(errp, "DAWR1 not supported in TCG.");
+error_

Re: [PATCH v5 3/3] ppc: Enable 2nd DAWR support on p10

2023-07-07 Thread Shivaprasad G Bhat

Hi David, All,

I am revisiting/reviving this patch.

On 5/5/21 11:20, David Gibson wrote:

On Wed, Apr 21, 2021 at 11:50:40AM +0530, Ravi Bangoria wrote:

Hi David,

On 4/19/21 10:23 AM, David Gibson wrote:

On Mon, Apr 12, 2021 at 05:14:33PM +0530, Ravi Bangoria wrote:




Since we have released versions with POWER10 support, but no DAWR1, in
theory we need a capability so new qemu with old machine types don't
gain guest visible features that the same machine types on older qemus
had.

Except.. there's a loophole we might use to sidestep that.  The
current POWER10 CPU modelled in qemu is a DD1 - which I strongly
suspect will never appear outside of IBM.  I'm pretty sure we want to
replace that with a DD2.

While the modelled CPU is DD1, I think it's pretty reasonable to say
our POWER10 support hasn't yet stabilized, and it would therefore be
ok to simply add DAWR1 on POWER10 unconditionally, as long as we do it
before we switch over to DD2.


As the POWER10 DD2 switch-over has already happened, the need for a
new/separate capability for dawr1 still holds. So, I am keeping it as is.


Posting the next version after rebase.


Thanks,

Shivaprasad


I'm wondering if we're actually just better off setting the pa feature
just based on the guest CPU model.  TCG will be broken if you try to
use it, but then, it already is.  AFAIK there's no inherent reason we
couldn't implement DAWR support in TCG, it's just never been worth the
trouble.

Correct. There is probably no practical use case for DAWR in TCG mode.

Thanks,
Ravi





[PATCH v2 2/2] tests: tcg: ppc64: Add tests for Vector Extract Mask Instructions

2023-05-03 Thread Shivaprasad G Bhat
Add test for vextractbm, vextractwm, vextractdm and vextractqm
instructions. Test works for both qemu-ppc64 and qemu-ppc64le.

Based on the test case written by John Platts posted at [1]

References:
[1] - https://gitlab.com/qemu-project/qemu/-/issues/1536
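The new test goes through the normal TCG test harness (assuming a cross
compiler with POWER10 support, per the CROSS_CC_HAS_POWER10 guard below),
e.g. something like:

    make check-tcg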

Signed-off-by: John Platts 
Signed-off-by: Shivaprasad G Bhat 
Reviewed-by: Lucas Mateus Castro 
---
 tests/tcg/ppc64/Makefile.target |5 +++-
 tests/tcg/ppc64/vector.c|   51 +++
 2 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 tests/tcg/ppc64/vector.c

diff --git a/tests/tcg/ppc64/Makefile.target b/tests/tcg/ppc64/Makefile.target
index 6d47d3cae6..b084963b9a 100644
--- a/tests/tcg/ppc64/Makefile.target
+++ b/tests/tcg/ppc64/Makefile.target
@@ -20,7 +20,7 @@ PPC64_TESTS += mtfsf
 PPC64_TESTS += mffsce
 
 ifneq ($(CROSS_CC_HAS_POWER10),)
-PPC64_TESTS += byte_reverse sha512-vector
+PPC64_TESTS += byte_reverse sha512-vector vector
 endif
 byte_reverse: CFLAGS += -mcpu=power10
 run-byte_reverse: QEMU_OPTS+=-cpu POWER10
@@ -31,6 +31,9 @@ sha512-vector: sha512.c
 
 run-sha512-vector: QEMU_OPTS+=-cpu POWER10
 
+vector: CFLAGS += -mcpu=power10 -I$(SRC_PATH)/include
+run-vector: QEMU_OPTS += -cpu POWER10
+
 PPC64_TESTS += signal_save_restore_xer
 PPC64_TESTS += xxspltw
 
diff --git a/tests/tcg/ppc64/vector.c b/tests/tcg/ppc64/vector.c
new file mode 100644
index 00..cbf4ae9332
--- /dev/null
+++ b/tests/tcg/ppc64/vector.c
@@ -0,0 +1,51 @@
+#include <assert.h>
+#include <altivec.h>
+#include "qemu/compiler.h"
+
+int main(void)
+{
+unsigned int result_wi;
+vector unsigned char vbc_bi_src = { 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF,
+0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0,
+0, 0xFF, 0xFF};
+    vector unsigned short vbc_hi_src = { 0xFFFF, 0, 0, 0xFFFF,
+                                         0, 0, 0xFFFF, 0xFFFF};
+    vector unsigned int vbc_wi_src = {0, 0, 0xFFFFFFFF, 0xFFFFFFFF};
+    vector unsigned long long vbc_di_src = {0xFFFFFFFFFFFFFFFF, 0};
+vector __uint128_t vbc_qi_src;
+
+asm("vextractbm %0, %1" : "=r" (result_wi) : "v" (vbc_bi_src));
+#if HOST_BIG_ENDIAN
+    assert(result_wi == 0b1101111111000011);
+#else
+    assert(result_wi == 0b1100001111111011);
+#endif
+
+asm("vextracthm %0, %1" : "=r" (result_wi) : "v" (vbc_hi_src));
+#if HOST_BIG_ENDIAN
+assert(result_wi == 0b10010011);
+#else
+assert(result_wi == 0b11001001);
+#endif
+
+asm("vextractwm %0, %1" : "=r" (result_wi) : "v" (vbc_wi_src));
+#if HOST_BIG_ENDIAN
+assert(result_wi == 0b0011);
+#else
+assert(result_wi == 0b1100);
+#endif
+
+asm("vextractdm %0, %1" : "=r" (result_wi) : "v" (vbc_di_src));
+#if HOST_BIG_ENDIAN
+assert(result_wi == 0b10);
+#else
+assert(result_wi == 0b01);
+#endif
+
+vbc_qi_src[0] = 0x1;
+vbc_qi_src[0] = vbc_qi_src[0] << 127;
+asm("vextractqm %0, %1" : "=r" (result_wi) : "v" (vbc_qi_src));
+assert(result_wi == 0b1);
+
+return 0;
+}





[PATCH v2 1/2] tcg: ppc64: Fix mask generation for vextractdm

2023-05-03 Thread Shivaprasad G Bhat
In function do_extractm() the mask is calculated as
dup_const(1 << (element_width - 1)). '1' being signed int
works fine for MO_8,16,32. For MO_64, on PPC64 host
this ends up becoming 0 on compilation. The vextractdm
uses MO_64, and it ends up having mask as 0.

Explicitly use 1ULL instead of a signed int 1, as is done everywhere else.
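A minimal standalone illustration of the difference (hypothetical snippet,
not part of the patch; the 32-bit shift is undefined behaviour and compilers
warn about it, which is how the mask ends up as 0 here):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const unsigned elem_width = 64;             /* the MO_64 case */
        uint64_t bad  = 1 << (elem_width - 1);      /* int shifted by 63: UB, observed as 0 */
        uint64_t good = 1ULL << (elem_width - 1);   /* 0x8000000000000000 as intended */

        printf("bad=%#llx good=%#llx\n",
               (unsigned long long)bad, (unsigned long long)good);
        return 0;
    }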

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1536
Signed-off-by: Shivaprasad G Bhat 
Reviewed-by: Alex Bennée 
Reviewed-by: Lucas Mateus Castro 
Reviewed-by: Richard Henderson 
---
 target/ppc/translate/vmx-impl.c.inc |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 112233b541..c8712dd7d8 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -2058,7 +2058,7 @@ static bool trans_VEXPANDQM(DisasContext *ctx, arg_VX_tb 
*a)
 static bool do_vextractm(DisasContext *ctx, arg_VX_tb *a, unsigned vece)
 {
 const uint64_t elem_width = 8 << vece, elem_count_half = 8 >> vece,
-   mask = dup_const(vece, 1 << (elem_width - 1));
+   mask = dup_const(vece, 1ULL << (elem_width - 1));
 uint64_t i, j;
 TCGv_i64 lo, hi, t0, t1;






[PATCH v2 0/2] tcg: ppc64: Fix mask generation for vextractdm

2023-05-03 Thread Shivaprasad G Bhat
While debugging gitlab issue [1] 1536, I happened to try the
vextract[X]m instructions on the real hardware. The test
used in [1] is failing for vextractdm.

On debugging it is seen, in function do_extractm() the
mask is calculated as dup_const(1 << (element_width - 1)).
'1' being signed int works fine for MO_8,16,32. For MO_64,
on PPC64 host this ends up becoming 0 on compilation. The
vextractdm uses MO_64, and it ends up having mask as 0.

The first patch here fixes that by explicitly using
1ULL instead of signed int 1 like its used everywhere else.
Second patch introduces the test case from [1] into qemu
tcg/ppc64 along with fixes/tweaks to make it work for both
big and little-endian targets.

References:
[1] : https://gitlab.com/qemu-project/qemu/-/issues/1536

---
Changelog:
Since v1 : https://lists.gnu.org/archive/html/qemu-devel/2023-04/msg01958.html
 - Added "Resolves: " to first patch description
 - Rebased to top of the tree. I see with d044b7c33a5, Alex has limited the
   scope of plugin tests to just the MULTIARCH_TESTS. So, removed the plugin
   tests for the test case added in the second patch.
 - Changed the test case to use the HOST_BIG_ENDIAN from compiler.h

Shivaprasad G Bhat (2):
  tcg: ppc64: Fix mask generation for vextractdm
  tests: tcg: ppc64: Add tests for Vector Extract Mask Instructions


 target/ppc/translate/vmx-impl.c.inc |  2 +-
 tests/tcg/ppc64/Makefile.target |  6 +++-
 tests/tcg/ppc64/vector.c| 51 +
 3 files changed, 57 insertions(+), 2 deletions(-)
 create mode 100644 tests/tcg/ppc64/vector.c

--
Signature




Re: [PATCH] softfloat: Fix the incorrect computation in float32_exp2()

2023-05-03 Thread Shivaprasad G Bhat

Hi Richard,


On 5/3/23 01:11, Richard Henderson wrote:

On 5/2/23 16:25, Shivaprasad G Bhat wrote:

The float32_exp2() is computing wrong exponent of 2.
For example, with the following set of values {0.1, 2.0, 2.0, -1.0},
the expected output would be {1.071773, 4.00, 4.00, 0.50}.
Instead, the function is computing {1.119102, 3.382044, 3.382044, 
-0.191022}





This is because instead of the xnp, which holds the numerator,
parts_muladd is using the xp which is just 'x'. The commit 
'572c4d862ff2'
refactored this function, and it seems mistakenly using xp instead of 
xnp.


The patch fixes this possible typo.

Fixes: 572c4d862ff2 "softfloat: Convert float32_exp2 to FloatParts"
Partially-Resolves:https://gitlab.com/qemu-project/qemu/-/issues/1623
Reported-By: Luca Barbato (https://gitlab.com/lu-zero)
Signed-off-by: Shivaprasad G Bhat
Signed-off-by: Vaibhav Jain
---
  fpu/softfloat.c |    2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


Whoops.  Good catch.



If you are fine with the patch, could you fix the mail id for Vaibhav 
Jain as


  while pulling ?

If you have other comments, I will fix it in the next version otherwise.


Thanks,

Shivaprasad


H


r~




Re: [PATCH 2/2] tests: tcg: ppc64: Add tests for Vector Extract Mask Instructions

2023-05-03 Thread Shivaprasad G Bhat

On 5/2/23 12:35, Cédric Le Goater wrote:

On 4/13/23 21:01, Shivaprasad G Bhat wrote:

Add test for vextractbm, vextractwm, vextractdm and vextractqm
instructions. Test works for both qemu-ppc64 and qemu-ppc64le.

Based on the test case written by John Platts posted at [1]

References:
[1]: https://gitlab.com/qemu-project/qemu/-/issues/1536


Gitlab issues should be referenced as :

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1536

However, this patch adds a test, not a fix. So it is the previous patch
which should be annotated as resolving the issue.

Also, I think the code should be using  HOST_BIG_ENDIAN instead of
__ORDER_BIG_ENDIAN__


Thanks for the comments Cédric.

Fixing these in v2.

Thanks,

Shivaprasad





[PATCH] softfloat: Fix the incorrect computation in float32_exp2()

2023-05-02 Thread Shivaprasad G Bhat
The float32_exp2() function is computing the wrong power of 2.
For example, with the following set of values {0.1, 2.0, 2.0, -1.0},
the expected output would be {1.071773, 4.00, 4.00, 0.50}.
Instead, the function is computing {1.119102, 3.382044, 3.382044, -0.191022}

Looking at the code, float32_exp2() attempts to compute

  e^x = 1 + x/1! + x^2/2! + x^3/3! + x^4/4! + x^5/5! + ... + x^n/n! + ...

But because of the 'typo'/bug it ends up computing

  e^x = 1 + x/1! + x/2! + x/3! + x/4! + x/5! + ... + x/n! + ...

This is because parts_muladd is using xp, which is just 'x', instead of xnp,
which holds the accumulated numerator x^n. The commit '572c4d862ff2'
refactored this function and seems to have mistakenly used xp instead of xnp.

The patch fixes this typo.
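As a quick cross-check of the numbers above (approximate arithmetic, not
from the original report): with the bug the loop degenerates to
1 + x*(1/1! + 1/2! + ... + 1/15!) ~= 1 + x*(e - 1), where x = a*ln2. For
a = 2.0 that gives 1 + 1.38629 * 1.71828 ~= 3.38204, for a = 0.1 it gives
~= 1.11910, and for a = -1.0 it gives ~= -0.19102, matching the observed
{1.119102, 3.382044, 3.382044, -0.191022}, whereas the intended series
sums to 2^a.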

Fixes: 572c4d862ff2 "softfloat: Convert float32_exp2 to FloatParts"
Partially-Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1623
Reported-By: Luca Barbato (https://gitlab.com/lu-zero)
Signed-off-by: Shivaprasad G Bhat 
Signed-off-by: Vaibhav Jain 
---
 fpu/softfloat.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index c7454c3eb1a..108f9cb224a 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -5135,7 +5135,7 @@ float32 float32_exp2(float32 a, float_status *status)
     float64_unpack_canonical(&rp, float64_one, status);
     for (i = 0 ; i < 15 ; i++) {
         float64_unpack_canonical(&tp, float32_exp2_coefficients[i], status);
-        rp = *parts_muladd(&tp, &xp, &rp, 0, status);
+        rp = *parts_muladd(&tp, &xnp, &rp, 0, status);
         xnp = *parts_mul(&xnp, &xp, status);
 }






[PATCH 2/2] tests: tcg: ppc64: Add tests for Vector Extract Mask Instructions

2023-04-13 Thread Shivaprasad G Bhat
Add test for vextractbm, vextractwm, vextractdm and vextractqm
instructions. Test works for both qemu-ppc64 and qemu-ppc64le.

Based on the test case written by John Platts posted at [1]

References:
[1]: https://gitlab.com/qemu-project/qemu/-/issues/1536

Signed-off-by: John Platts 
Signed-off-by: Shivaprasad G Bhat 
---
 tests/tcg/ppc64/Makefile.target |6 -
 tests/tcg/ppc64/vector.c|   50 +++
 2 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 tests/tcg/ppc64/vector.c

diff --git a/tests/tcg/ppc64/Makefile.target b/tests/tcg/ppc64/Makefile.target
index f081f1c683..4fd543ce28 100644
--- a/tests/tcg/ppc64/Makefile.target
+++ b/tests/tcg/ppc64/Makefile.target
@@ -20,7 +20,7 @@ PPC64_TESTS += mtfsf
 PPC64_TESTS += mffsce
 
 ifneq ($(CROSS_CC_HAS_POWER10),)
-PPC64_TESTS += byte_reverse sha512-vector
+PPC64_TESTS += byte_reverse sha512-vector vector
 endif
 byte_reverse: CFLAGS += -mcpu=power10
 run-byte_reverse: QEMU_OPTS+=-cpu POWER10
@@ -33,6 +33,10 @@ sha512-vector: sha512.c
 run-sha512-vector: QEMU_OPTS+=-cpu POWER10
 run-plugin-sha512-vector-with-%: QEMU_OPTS+=-cpu POWER10
 
+vector: CFLAGS += -mcpu=power10
+run-vector: QEMU_OPTS += -cpu POWER10
+run-plugin-vector-with-%: QEMU_OPTS += -cpu POWER10
+
 PPC64_TESTS += signal_save_restore_xer
 PPC64_TESTS += xxspltw
 
diff --git a/tests/tcg/ppc64/vector.c b/tests/tcg/ppc64/vector.c
new file mode 100644
index 00..3cb2b88c87
--- /dev/null
+++ b/tests/tcg/ppc64/vector.c
@@ -0,0 +1,50 @@
+#include <assert.h>
+#include <altivec.h>
+
+int main(void)
+{
+unsigned int result_wi;
+vector unsigned char vbc_bi_src = { 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF,
+0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0,
+0, 0xFF, 0xFF};
+    vector unsigned short vbc_hi_src = { 0xFFFF, 0, 0, 0xFFFF,
+                                         0, 0, 0xFFFF, 0xFFFF};
+    vector unsigned int vbc_wi_src = {0, 0, 0xFFFFFFFF, 0xFFFFFFFF};
+    vector unsigned long long vbc_di_src = {0xFFFFFFFFFFFFFFFF, 0};
+vector __uint128_t vbc_qi_src;
+
+asm("vextractbm %0, %1" : "=r" (result_wi) : "v" (vbc_bi_src));
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    assert(result_wi == 0b1101111111000011);
+#else
+    assert(result_wi == 0b1100001111111011);
+#endif
+
+asm("vextracthm %0, %1" : "=r" (result_wi) : "v" (vbc_hi_src));
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+assert(result_wi == 0b10010011);
+#else
+assert(result_wi == 0b11001001);
+#endif
+
+asm("vextractwm %0, %1" : "=r" (result_wi) : "v" (vbc_wi_src));
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+assert(result_wi == 0b0011);
+#else
+assert(result_wi == 0b1100);
+#endif
+
+asm("vextractdm %0, %1" : "=r" (result_wi) : "v" (vbc_di_src));
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+assert(result_wi == 0b10);
+#else
+assert(result_wi == 0b01);
+#endif
+
+vbc_qi_src[0] = 0x1;
+vbc_qi_src[0] = vbc_qi_src[0] << 127;
+asm("vextractqm %0, %1" : "=r" (result_wi) : "v" (vbc_qi_src));
+assert(result_wi == 0b1);
+
+return 0;
+}





[PATCH 1/2] tcg: ppc64: Fix mask generation for vextractdm

2023-04-13 Thread Shivaprasad G Bhat
In function do_extractm() the mask is calculated as
dup_const(1 << (element_width - 1)). '1' being signed int
works fine for MO_8,16,32. For MO_64, on PPC64 host
this ends up becoming 0 on compilation. The vextractdm
uses MO_64, and it ends up having mask as 0.

Explicitly use 1ULL instead of a signed int 1, as is done everywhere else.

Signed-off-by: Shivaprasad G Bhat 
---
 target/ppc/translate/vmx-impl.c.inc |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/ppc/translate/vmx-impl.c.inc 
b/target/ppc/translate/vmx-impl.c.inc
index 112233b541..c8712dd7d8 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -2058,7 +2058,7 @@ static bool trans_VEXPANDQM(DisasContext *ctx, arg_VX_tb 
*a)
 static bool do_vextractm(DisasContext *ctx, arg_VX_tb *a, unsigned vece)
 {
 const uint64_t elem_width = 8 << vece, elem_count_half = 8 >> vece,
-   mask = dup_const(vece, 1 << (elem_width - 1));
+   mask = dup_const(vece, 1ULL << (elem_width - 1));
 uint64_t i, j;
 TCGv_i64 lo, hi, t0, t1;
 





[PATCH 0/2] tcg: ppc64: Fix mask generation for vextractdm

2023-04-13 Thread Shivaprasad G Bhat
While debugging gitlab issue [1] 1536, I happened to try the
vextract[X]m instructions on the real hardware. The test
used in [1] is failing for vextractdm.

On debugging it is seen, in function do_extractm() the
mask is calculated as dup_const(1 << (element_width - 1)).
'1' being signed int works fine for MO_8,16,32. For MO_64,
on PPC64 host this ends up becoming 0 on compilation. The
vextractdm uses MO_64, and it ends up having mask as 0.

The first patch here fixes that by explicitly using
1ULL instead of signed int 1 like its used everywhere else.
Second patch introduces the test case from [1] into qemu
tcg/ppc64 along with fixes/tweaks to make it work for both
big and little-endian targets.

Let me know if both patches should be squashed into a single patch.
Checkpatch flagged the use of __BYTE_ORDER__ in the test file (second
patch); however, I see it being used in multiarch/sha1.c, and since this
is an arch-specific test, I think it is appropriate to use it here. Let
me know if otherwise.

References:
[1] : https://gitlab.com/qemu-project/qemu/-/issues/1536

---

Shivaprasad G Bhat (2):
  tcg: ppc64: Fix mask generation for vextractdm
  tests: tcg: ppc64: Add tests for Vector Extract Mask Instructions


 target/ppc/translate/vmx-impl.c.inc |  2 +-
 tests/tcg/ppc64/Makefile.target |  6 +++-
 tests/tcg/ppc64/vector.c| 50 +
 3 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 tests/tcg/ppc64/vector.c

--
Signature




[PATCH v7 3/3] spapr: nvdimm: Introduce spapr-nvdimm device

2022-02-04 Thread Shivaprasad G Bhat
If the device backend is not persistent memory for the nvdimm, there is a
need for explicit I/O flushes on the backend to ensure persistence.

On SPAPR, the issue is addressed by adding a new hcall for the guest to
request an explicit flush when the backend is not pmem. So, the approach
here is to convey in a device tree property when the hcall flush is
required. Once the guest knows the device backend is not pmem, it makes
the hcall whenever a flush is required.

To set the device tree property, a new PAPR specific device type inheriting
the nvdimm device is implemented. When the backend doesn't have pmem=on
the device tree property "ibm,hcall-flush-required" is set, and the guest
makes hcall H_SCM_FLUSH requesting for an explicit flush. The new device
has boolean property pmem-override which when "on" advertises the device
tree property even when pmem=on for the backend. The flush function
invokes the fdatasync or pmem_persist() based on the type of backend.

The vmstate structures are made part of the spapr-nvdimm device object.
The patch attempts to keep migration compatibility between source and
destination, while rejecting incompatible combinations with a failure.
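A usage sketch (illustrative command line only; apart from pmem-override,
the options are the usual memory-backend-file/nvdimm ones, and the exact
values here are assumptions):

    qemu-system-ppc64 -machine pseries,nvdimm=on ... \
        -object memory-backend-file,id=mb0,mem-path=backing.img,share=on,size=1G \
        -device spapr-nvdimm,id=nv0,memdev=mb0,label-size=128K,pmem-override=off

With a non-pmem backend like this, "ibm,hcall-flush-required" is advertised
and the guest flushes through H_SCM_FLUSH; pmem-override=on forces the
property even when the backend has pmem=on.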

Signed-off-by: Shivaprasad G Bhat 
Reviewed-by: Daniel Henrique Barboza 
---
 hw/ppc/spapr_nvdimm.c |  132 +
 1 file changed, 132 insertions(+)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index ac44e00153..c4c97da5de 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -34,6 +34,7 @@
 #include "block/thread-pool.h"
 #include "migration/vmstate.h"
 #include "qemu/pmem.h"
+#include "hw/qdev-properties.h"
 
 /* DIMM health bitmap bitmap indicators. Taken from kernel's papr_scm.c */
 /* SCM device is unable to persist memory contents */
@@ -57,6 +58,10 @@ OBJECT_DECLARE_TYPE(SpaprNVDIMMDevice, SPAPRNVDIMMClass, 
SPAPR_NVDIMM)
 struct SPAPRNVDIMMClass {
 /* private */
 NVDIMMClass parent_class;
+
+/* public */
+void (*realize)(NVDIMMDevice *dimm, Error **errp);
+void (*unrealize)(NVDIMMDevice *dimm, Error **errp);
 };
 
 bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
@@ -64,6 +69,8 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, 
NVDIMMDevice *nvdimm,
 {
 const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
 const MachineState *ms = MACHINE(hotplug_dev);
+PCDIMMDevice *dimm = PC_DIMM(nvdimm);
+MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
 g_autofree char *uuidstr = NULL;
 QemuUUID uuid;
 int ret;
@@ -101,6 +108,14 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, 
NVDIMMDevice *nvdimm,
 return false;
 }
 
+if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM) &&
+(memory_region_get_fd(mr) < 0)) {
+error_setg(errp, "spapr-nvdimm device requires the "
+   "memdev %s to be of memory-backend-file type",
+   object_get_canonical_path_component(OBJECT(dimm->hostmem)));
+return false;
+}
+
 return true;
 }
 
@@ -172,6 +187,20 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void 
*fdt,
  "operating-system")));
 _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0));
 
+if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {
+bool is_pmem = false, pmem_override = false;
+PCDIMMDevice *dimm = PC_DIMM(nvdimm);
+HostMemoryBackend *hostmem = dimm->hostmem;
+
+is_pmem = object_property_get_bool(OBJECT(hostmem), "pmem", NULL);
+pmem_override = object_property_get_bool(OBJECT(nvdimm),
+ "pmem-override", NULL);
+if (!is_pmem || pmem_override) {
+_FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required",
+ NULL, 0));
+}
+}
+
 return child_offset;
 }
 
@@ -397,11 +426,21 @@ typedef struct SpaprNVDIMMDeviceFlushState {
 
 typedef struct SpaprNVDIMMDevice SpaprNVDIMMDevice;
 struct SpaprNVDIMMDevice {
+/* private */
 NVDIMMDevice parent_obj;
 
+bool hcall_flush_required;
 uint64_t nvdimm_flush_token;
 QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_nvdimm_flush_states;
 QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_nvdimm_flush_states;
+
+/* public */
+
+/*
+ * The 'on' value for this property forced the qemu to enable the hcall
+ * flush for the nvdimm device even if the backend is a pmem
+ */
+bool pmem_override;
 };
 
 static int flush_worker_cb(void *opaque)
@@ -448,6 +487,24 @@ static int spapr_nvdimm_flush_post_load(void *opaque, int 
version_id)
 SpaprNVDIMMDevice *s_nvdimm = (SpaprNVDIMMDevice *)opaque;
 SpaprNVDIMMDeviceFlushState *s

[PATCH v7 2/3] spapr: nvdimm: Implement H_SCM_FLUSH hcall

2022-02-04 Thread Shivaprasad G Bhat
The patch adds support for the SCM flush hcall for nvdimm devices, to be
made available to the guest through the next patch. The hcall is applicable
only to the new SPAPR-specific device class, which is also introduced in
this patch.

The hcall semantics are that the flush returns H_LONG_BUSY_ORDER_10_MSEC
along with a continue_token when the operation is expected to take longer,
and the hcall is then called again with the continue_token to get the
status. So, all fresh requests are put into a 'pending' list and a flush
worker is submitted to the thread pool. The thread pool completion callbacks
move the requests to a 'completed' list, which is cleaned up after the
return status has been collected for the guest in a subsequent hcall.

The semantics make it necessary to preserve the continue_tokens and
their return status across migrations. So, the completed flush states
are forwarded to the destination and the pending ones are restarted
at the destination in post_load. The necessary nvdimm flush specific
vmstate structures are also introduced in this patch which are to be
saved in the new SPAPR specific nvdimm device to be introduced in the
following patch.
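From the guest's point of view, the resulting protocol is roughly (a
pseudocode sketch of a caller, loosely modelled on how a Linux guest would
drive it; the helper names are illustrative, not taken from this patch):

    token = 0;
    do {
        rc = plpar_hcall(H_SCM_FLUSH, retbuf, drc_index, token);
        token = retbuf[0];              /* continue_token to pass on retry */
        if (rc == H_LONG_BUSY_ORDER_10_MSEC) {
            msleep(10);
            rc = H_BUSY;
        }
    } while (rc == H_BUSY);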

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr.c|2 
 hw/ppc/spapr_nvdimm.c |  260 +
 include/hw/ppc/spapr.h|4 -
 include/hw/ppc/spapr_nvdimm.h |1 
 4 files changed, 266 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 3d6ec309dd..9263985663 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1634,6 +1634,8 @@ static void spapr_machine_reset(MachineState *machine)
 spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
 }
 
+spapr_nvdimm_finish_flushes();
+
 /* DRC reset may cause a device to be unplugged. This will cause troubles
  * if this device is used by another device (eg, a running vhost backend
  * will crash QEMU if the DIMM holding the vring goes away). To avoid such
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 91de1052f2..ac44e00153 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "qapi/error.h"
 #include "hw/ppc/spapr_drc.h"
 #include "hw/ppc/spapr_nvdimm.h"
@@ -30,6 +31,9 @@
 #include "hw/ppc/fdt.h"
 #include "qemu/range.h"
 #include "hw/ppc/spapr_numa.h"
+#include "block/thread-pool.h"
+#include "migration/vmstate.h"
+#include "qemu/pmem.h"
 
 /* DIMM health bitmap bitmap indicators. Taken from kernel's papr_scm.c */
 /* SCM device is unable to persist memory contents */
@@ -47,6 +51,14 @@
 /* Have an explicit check for alignment */
 QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE);
 
+#define TYPE_SPAPR_NVDIMM "spapr-nvdimm"
+OBJECT_DECLARE_TYPE(SpaprNVDIMMDevice, SPAPRNVDIMMClass, SPAPR_NVDIMM)
+
+struct SPAPRNVDIMMClass {
+/* private */
+NVDIMMClass parent_class;
+};
+
 bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
uint64_t size, Error **errp)
 {
@@ -375,6 +387,253 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 return H_SUCCESS;
 }
 
+typedef struct SpaprNVDIMMDeviceFlushState {
+uint64_t continue_token;
+int64_t hcall_ret;
+uint32_t drcidx;
+
+QLIST_ENTRY(SpaprNVDIMMDeviceFlushState) node;
+} SpaprNVDIMMDeviceFlushState;
+
+typedef struct SpaprNVDIMMDevice SpaprNVDIMMDevice;
+struct SpaprNVDIMMDevice {
+NVDIMMDevice parent_obj;
+
+uint64_t nvdimm_flush_token;
+QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_nvdimm_flush_states;
+QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_nvdimm_flush_states;
+};
+
+static int flush_worker_cb(void *opaque)
+{
+SpaprNVDIMMDeviceFlushState *state = opaque;
+SpaprDrc *drc = spapr_drc_by_index(state->drcidx);
+PCDIMMDevice *dimm = PC_DIMM(drc->dev);
+HostMemoryBackend *backend = MEMORY_BACKEND(dimm->hostmem);
+    int backend_fd = memory_region_get_fd(&backend->mr);
+
+if (object_property_get_bool(OBJECT(backend), "pmem", NULL)) {
+MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
+void *ptr = memory_region_get_ram_ptr(mr);
+size_t size = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP,
+   NULL);
+
+/* flush pmem backend */
+pmem_persist(ptr, size);
+} else {
+/* flush raw backing image */
+if (qemu_fdatasync(backend_fd) < 0) {
+error_report("papr_scm: Could not sync nvdimm to backend file: %s",
+ strerror(errno));
+return H_HARDWARE;
+}
+}
+
+retur

[PATCH v7 1/3] nvdimm: Add realize, unrealize callbacks to NVDIMMDevice class

2022-02-04 Thread Shivaprasad G Bhat
A new subclass inheriting NVDIMMDevice is going to be introduced in
subsequent patches. The new subclass uses the realize and unrealize
callbacks. Add them to NVDIMMClass so that they are called appropriately as
part of plug/unplug.
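For illustration, a subclass would wire these up roughly as follows (a
sketch only; the actual spapr-nvdimm wiring arrives later in this series):

    static void spapr_nvdimm_class_init(ObjectClass *oc, void *data)
    {
        NVDIMMClass *nvc = NVDIMM_CLASS(oc);

        nvc->realize = spapr_nvdimm_realize;
        nvc->unrealize = spapr_nvdimm_unrealize;
    }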

Signed-off-by: Shivaprasad G Bhat 
Acked-by: Daniel Henrique Barboza 
---
 hw/mem/nvdimm.c  |   16 
 hw/mem/pc-dimm.c |5 +
 include/hw/mem/nvdimm.h  |2 ++
 include/hw/mem/pc-dimm.h |1 +
 4 files changed, 24 insertions(+)

diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 7397b67156..59959d5563 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -181,10 +181,25 @@ static MemoryRegion 
*nvdimm_md_get_memory_region(MemoryDeviceState *md,
 static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp)
 {
 NVDIMMDevice *nvdimm = NVDIMM(dimm);
+NVDIMMClass *ndc = NVDIMM_GET_CLASS(nvdimm);
 
 if (!nvdimm->nvdimm_mr) {
 nvdimm_prepare_memory_region(nvdimm, errp);
 }
+
+if (ndc->realize) {
+ndc->realize(nvdimm, errp);
+}
+}
+
+static void nvdimm_unrealize(PCDIMMDevice *dimm)
+{
+NVDIMMDevice *nvdimm = NVDIMM(dimm);
+NVDIMMClass *ndc = NVDIMM_GET_CLASS(nvdimm);
+
+if (ndc->unrealize) {
+ndc->unrealize(nvdimm);
+}
 }
 
 /*
@@ -240,6 +255,7 @@ static void nvdimm_class_init(ObjectClass *oc, void *data)
 DeviceClass *dc = DEVICE_CLASS(oc);
 
 ddc->realize = nvdimm_realize;
+ddc->unrealize = nvdimm_unrealize;
 mdc->get_memory_region = nvdimm_md_get_memory_region;
 device_class_set_props(dc, nvdimm_properties);
 
diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
index 48b913aba6..03bd0dd60e 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/pc-dimm.c
@@ -216,6 +216,11 @@ static void pc_dimm_realize(DeviceState *dev, Error **errp)
 static void pc_dimm_unrealize(DeviceState *dev)
 {
 PCDIMMDevice *dimm = PC_DIMM(dev);
+PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
+
+if (ddc->unrealize) {
+ddc->unrealize(dimm);
+}
 
 host_memory_backend_set_mapped(dimm->hostmem, false);
 }
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index bcf62f825c..cf8f59be44 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -103,6 +103,8 @@ struct NVDIMMClass {
 /* write @size bytes from @buf to NVDIMM label data at @offset. */
 void (*write_label_data)(NVDIMMDevice *nvdimm, const void *buf,
  uint64_t size, uint64_t offset);
+void (*realize)(NVDIMMDevice *nvdimm, Error **errp);
+void (*unrealize)(NVDIMMDevice *nvdimm);
 };
 
 #define NVDIMM_DSM_MEM_FILE "etc/acpi/nvdimm-mem"
diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
index 1473e6db62..322bebe555 100644
--- a/include/hw/mem/pc-dimm.h
+++ b/include/hw/mem/pc-dimm.h
@@ -63,6 +63,7 @@ struct PCDIMMDeviceClass {
 
 /* public */
 void (*realize)(PCDIMMDevice *dimm, Error **errp);
+void (*unrealize)(PCDIMMDevice *dimm);
 };
 
 void pc_dimm_pre_plug(PCDIMMDevice *dimm, MachineState *machine,





[PATCH v7 0/3] spapr: nvdimm: Introduce spapr-nvdimm device

2022-02-04 Thread Shivaprasad G Bhat
h some simplifications
  - Added vmstate to preserve the hcall status during save-restore
along with pre_save handler code to complete all ongoing flushes.
  - Added hw_compat magic for sync-dax 'on' on previous machines.
  - Miscellaneous minor fixes.

v1 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg06330.html
Changes from v1
  - Fixed a missed-out unlock
  - using QLIST_FOREACH instead of QLIST_FOREACH_SAFE while generating token

Shivaprasad G Bhat (3):
  nvdimm: Add realize, unrealize callbacks to NVDIMMDevice class
  spapr: nvdimm: Implement H_SCM_FLUSH hcall
  spapr: nvdimm: Introduce spapr-nvdimm device


 hw/mem/nvdimm.c   |  16 ++
 hw/mem/pc-dimm.c  |   5 +
 hw/ppc/spapr.c|   2 +
 hw/ppc/spapr_nvdimm.c | 394 ++
 include/hw/mem/nvdimm.h   |   2 +
 include/hw/mem/pc-dimm.h  |   1 +
 include/hw/ppc/spapr.h|   4 +-
 include/hw/ppc/spapr_nvdimm.h |   1 +
 8 files changed, 424 insertions(+), 1 deletion(-)

--
Signature




[PATCH v6 3/3] spapr: nvdimm: Introduce spapr-nvdimm device

2022-02-01 Thread Shivaprasad G Bhat
If the device backend is not persistent memory for the nvdimm, there is a
need for explicit IO flushes on the backend to ensure persistence.

On SPAPR, the issue is addressed by adding a new hcall to request an
explicit flush from the guest when the backend is not pmem. So, the
approach here is to convey when the hcall flush is required in a device
tree property. Once the guest knows the device backend is not pmem, it
makes the hcall whenever a flush is required.

To set the device tree property, a new PAPR-specific device type inheriting
the nvdimm device is implemented. When the backend doesn't have pmem=on,
the device tree property "ibm,hcall-flush-required" is set, and the guest
makes the hcall H_SCM_FLUSH to request an explicit flush. The new device
has a boolean property pmem-override which, when "on", advertises the device
tree property even when pmem=on for the backend. The flush function
invokes fdatasync() or pmem_persist() based on the type of backend.
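
For illustration, a hypothetical invocation exercising the behaviour
described above could look like the following (paths, ids and sizes are
made up, and other options an nvdimm setup needs, such as memory slots,
maxmem and label-size, are omitted; only spapr-nvdimm, pmem-override and
the backend pmem property come from this series):

  qemu-system-ppc64 -machine pseries,nvdimm=on ... \
    -object memory-backend-file,id=nvmem0,mem-path=/path/to/backing-file,size=1G,pmem=on \
    -device spapr-nvdimm,id=nv0,memdev=nvmem0,pmem-override=on

With pmem-override=on, "ibm,hcall-flush-required" is advertised even though
the backend claims pmem=on, so the guest keeps using H_SCM_FLUSH.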

The vmstate structures are made part of the spapr-nvdimm device object.
The patch attempts to keep migration compatibility between source and
destination while rejecting incompatible ones with failures.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr_nvdimm.c |  131 +
 1 file changed, 131 insertions(+)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index ed6fda2c23..8aa6214d6b 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -34,6 +34,7 @@
 #include "block/thread-pool.h"
 #include "migration/vmstate.h"
 #include "qemu/pmem.h"
+#include "hw/qdev-properties.h"
 
 /* DIMM health bitmap bitmap indicators. Taken from kernel's papr_scm.c */
 /* SCM device is unable to persist memory contents */
@@ -57,6 +58,10 @@ OBJECT_DECLARE_TYPE(SpaprNVDIMMDevice, SPAPRNVDIMMClass, 
SPAPR_NVDIMM)
 struct SPAPRNVDIMMClass {
 /* private */
 NVDIMMClass parent_class;
+
+/* public */
+void (*realize)(NVDIMMDevice *dimm, Error **errp);
+void (*unrealize)(NVDIMMDevice *dimm, Error **errp);
 };
 
 bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
@@ -64,6 +69,8 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, 
NVDIMMDevice *nvdimm,
 {
 const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
 const MachineState *ms = MACHINE(hotplug_dev);
+PCDIMMDevice *dimm = PC_DIMM(nvdimm);
+MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
 g_autofree char *uuidstr = NULL;
 QemuUUID uuid;
 int ret;
@@ -101,6 +108,14 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, 
NVDIMMDevice *nvdimm,
 return false;
 }
 
+if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM) &&
+(memory_region_get_fd(mr) < 0)) {
+error_setg(errp, "spapr-nvdimm device requires the "
+   "memdev %s to be of memory-backend-file type",
+   object_get_canonical_path_component(OBJECT(dimm->hostmem)));
+return false;
+}
+
 return true;
 }
 
@@ -172,6 +187,20 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void 
*fdt,
  "operating-system")));
 _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0));
 
+if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {
+bool is_pmem = false, pmem_override = false;
+PCDIMMDevice *dimm = PC_DIMM(nvdimm);
+HostMemoryBackend *hostmem = dimm->hostmem;
+
+is_pmem = object_property_get_bool(OBJECT(hostmem), "pmem", NULL);
+pmem_override = object_property_get_bool(OBJECT(nvdimm),
+ "pmem-override", NULL);
+if (!is_pmem || pmem_override) {
+_FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required",
+ NULL, 0));
+}
+}
+
 return child_offset;
 }
 
@@ -398,11 +427,21 @@ typedef struct SpaprNVDIMMDeviceFlushState {
 
 typedef struct SpaprNVDIMMDevice SpaprNVDIMMDevice;
 struct SpaprNVDIMMDevice {
+/* private */
 NVDIMMDevice parent_obj;
 
+bool hcall_flush_required;
 uint64_t nvdimm_flush_token;
 QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_nvdimm_flush_states;
 QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_nvdimm_flush_states;
+
+/* public */
+
+/*
+ * The 'on' value for this property forced the qemu to enable the hcall
+ * flush for the nvdimm device even if the backend is a pmem
+ */
+bool pmem_override;
 };
 
 static int flush_worker_cb(void *opaque)
@@ -449,6 +488,23 @@ static int spapr_nvdimm_flush_post_load(void *opaque, int 
version_id)
 SpaprNVDIMMDeviceFlushState *state;
 HostMemoryBackend *backend = MEMORY_BACKEND(PC_DIMM(s_nvdimm)->hostmem);
 ThreadPool 

[PATCH v6 1/3] nvdimm: Add realize, unrealize callbacks to NVDIMMDevice class

2022-02-01 Thread Shivaprasad G Bhat
A new subclass inheriting NVDIMMDevice is going to be introduced in
subsequent patches. The new subclass uses the realize and unrealize
callbacks. Add them on NVDIMMClass to appropriately call them as part
of plug-unplug.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/mem/nvdimm.c  |   16 
 hw/mem/pc-dimm.c |5 +
 include/hw/mem/nvdimm.h  |2 ++
 include/hw/mem/pc-dimm.h |1 +
 4 files changed, 24 insertions(+)

diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 7397b67156..59959d5563 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -181,10 +181,25 @@ static MemoryRegion 
*nvdimm_md_get_memory_region(MemoryDeviceState *md,
 static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp)
 {
 NVDIMMDevice *nvdimm = NVDIMM(dimm);
+NVDIMMClass *ndc = NVDIMM_GET_CLASS(nvdimm);
 
 if (!nvdimm->nvdimm_mr) {
 nvdimm_prepare_memory_region(nvdimm, errp);
 }
+
+if (ndc->realize) {
+ndc->realize(nvdimm, errp);
+}
+}
+
+static void nvdimm_unrealize(PCDIMMDevice *dimm)
+{
+NVDIMMDevice *nvdimm = NVDIMM(dimm);
+NVDIMMClass *ndc = NVDIMM_GET_CLASS(nvdimm);
+
+if (ndc->unrealize) {
+ndc->unrealize(nvdimm);
+}
 }
 
 /*
@@ -240,6 +255,7 @@ static void nvdimm_class_init(ObjectClass *oc, void *data)
 DeviceClass *dc = DEVICE_CLASS(oc);
 
 ddc->realize = nvdimm_realize;
+ddc->unrealize = nvdimm_unrealize;
 mdc->get_memory_region = nvdimm_md_get_memory_region;
 device_class_set_props(dc, nvdimm_properties);
 
diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
index 48b913aba6..03bd0dd60e 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/pc-dimm.c
@@ -216,6 +216,11 @@ static void pc_dimm_realize(DeviceState *dev, Error **errp)
 static void pc_dimm_unrealize(DeviceState *dev)
 {
 PCDIMMDevice *dimm = PC_DIMM(dev);
+PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
+
+if (ddc->unrealize) {
+ddc->unrealize(dimm);
+}
 
 host_memory_backend_set_mapped(dimm->hostmem, false);
 }
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index bcf62f825c..cf8f59be44 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -103,6 +103,8 @@ struct NVDIMMClass {
 /* write @size bytes from @buf to NVDIMM label data at @offset. */
 void (*write_label_data)(NVDIMMDevice *nvdimm, const void *buf,
  uint64_t size, uint64_t offset);
+void (*realize)(NVDIMMDevice *nvdimm, Error **errp);
+void (*unrealize)(NVDIMMDevice *nvdimm);
 };
 
 #define NVDIMM_DSM_MEM_FILE "etc/acpi/nvdimm-mem"
diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
index 1473e6db62..322bebe555 100644
--- a/include/hw/mem/pc-dimm.h
+++ b/include/hw/mem/pc-dimm.h
@@ -63,6 +63,7 @@ struct PCDIMMDeviceClass {
 
 /* public */
 void (*realize)(PCDIMMDevice *dimm, Error **errp);
+void (*unrealize)(PCDIMMDevice *dimm);
 };
 
 void pc_dimm_pre_plug(PCDIMMDevice *dimm, MachineState *machine,





[PATCH v6 0/3] spapr: nvdimm: Introduce spapr-nvdimm device

2022-02-01 Thread Shivaprasad G Bhat
v1 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg06330.html
Changes from v1
  - Fixed a missed-out unlock
  - using QLIST_FOREACH instead of QLIST_FOREACH_SAFE while generating token

Shivaprasad G Bhat (3):
  nvdimm: Add realize, unrealize callbacks to NVDIMMDevice class
  spapr: nvdimm: Implement H_SCM_FLUSH hcall
  spapr: nvdimm: Introduce spapr-nvdimm device


 hw/mem/nvdimm.c   |  16 ++
 hw/mem/pc-dimm.c  |   5 +
 hw/ppc/spapr.c|   2 +
 hw/ppc/spapr_nvdimm.c | 394 ++
 include/hw/mem/nvdimm.h   |   2 +
 include/hw/mem/pc-dimm.h  |   1 +
 include/hw/ppc/spapr.h|   4 +-
 include/hw/ppc/spapr_nvdimm.h |   1 +
 8 files changed, 424 insertions(+), 1 deletion(-)

--
Signature




Re: [PATCH REBASED v5 2/2] spapr: nvdimm: Introduce spapr-nvdimm device

2022-02-01 Thread Shivaprasad G Bhat



On 9/21/21 12:02, David Gibson wrote:

On Wed, Jul 07, 2021 at 09:57:31PM -0500, Shivaprasad G Bhat wrote:

If the device backend is not persistent memory for the nvdimm, there is
need for explicit IO flushes on the backend to ensure persistence.

On SPAPR, the issue is addressed by adding a new hcall to request for
an explicit flush from the guest when the backend is not pmem. So, the
approach here is to convey when the hcall flush is required in a device
tree property. The guest once it knows the device backend is not pmem,
makes the hcall whenever flush is required.

To set the device tree property, the patch introduces a new papr specific
device type inheriting the nvdimm device. When the backend doesn't have
pmem="yes", the device tree property "ibm,hcall-flush-required" is set,
and the guest makes hcall H_SCM_FLUSH requesting for an explicit flush.

Signed-off-by: Shivaprasad G Bhat 



@@ -91,6 +93,14 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, 
NVDIMMDevice *nvdimm,
  return false;
  }
  
+if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM) &&

+(memory_region_get_fd(mr) < 0)) {
+error_setg(errp, "spapr-nvdimm device requires the "
+   "memdev %s to be of memory-backend-file type",
+   object_get_canonical_path_component(OBJECT(dimm->hostmem)));


It's not obvious to me why the spapr nvdimm device has an additional
restriction here over the regular nvdimm device.


For memory-backend-ram the fd is set to -1. The fdatasync would fail
later. This restriction is to prevent that hcall failure later. Maybe
it is intentionally allowed with nvdimms for testing purposes. Let me
know if you want me to allow it with a dummy success return for the hcall.





+return false;
+}
+
  return true;
  }
  
@@ -162,6 +172,21 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void *fdt,

   "operating-system")));
  _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0));
  
+if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {

+bool is_pmem = false;
+#ifdef CONFIG_LIBPMEM
+PCDIMMDevice *dimm = PC_DIMM(nvdimm);
+HostMemoryBackend *hostmem = dimm->hostmem;
+
+is_pmem = object_property_get_bool(OBJECT(hostmem), "pmem",
+   &error_abort);


Presenting to the guest a property of the backend worries me
slightly.  How the backends are synchronized between the source and
destination is out of scope for qemu: is there any possibility that we
could migrate from a host where the backend is pmem to one where it is
not (or the reverse).

I think at the least we want a property on the spapr-nvdimm object
which will override what's presented to the guest (which, yes, might
mean lying to the guest).  I think that could be important for
testing, if nothing else.


Mixed configurations can be attempted on a nested setup itself.

On a side note, attempts to use pmem=on with a non-pmem backend are being
deprecated as that is an unsafe pretence, effective commit cdcf766d0b0.


I see your point; adding "pmem-override" (suggest a better name if you have
one) to spapr-nvdimm can be helpful. I am adding it to the spapr-nvdimm
device. With pmem-override "on", the device tree property is added, allowing
the hcall flush even when pmem=on for the backend. This works for migration
compatibility in such a setup.





+#endif
+if (!is_pmem) {
+_FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required",
+ NULL, 0));
+}
+}
+
  return child_offset;
  }
  
@@ -585,7 +610,16 @@ static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr,

  }
  
  dimm = PC_DIMM(drc->dev);

+if (!object_dynamic_cast(OBJECT(dimm), TYPE_SPAPR_NVDIMM)) {
+return H_PARAMETER;
+}


Hmm.  If you're going to make flushes specific to spapr nvdimms, you
could put the queue of pending flushes into the spapr-nvdimm object,
rather than having a global list in the machine.


Yes. I have changed the patches to move all the flush specific data 
structures into the spapr-nvdimm object.





+
  backend = MEMORY_BACKEND(dimm->hostmem);
+#ifdef CONFIG_LIBPMEM
+if (object_property_get_bool(OBJECT(backend), "pmem", &error_abort)) {
+return H_UNSUPPORTED;


Could you make this not be UNSUPPORTED, but instead fake the flush for
the pmem device?  Either as a no-op, or simulating the guest invoking
the right cpu cache flushes?  That seems like it would be more useful:
that way users who don't care too much about performance could just
always do a flush hcall and not have to have another path for the
"real" pmem case.



It would actually be wrong for the kernel to attempt that. The device
tree property is checked

Re: [PATCH REBASED v5 1/2] spapr: nvdimm: Implement H_SCM_FLUSH hcall

2022-02-01 Thread Shivaprasad G Bhat

Hi David,

Thanks for comments. Sorry about the delay. Replies inline.

On 9/21/21 11:53, David Gibson wrote:

On Wed, Jul 07, 2021 at 09:57:21PM -0500, Shivaprasad G Bhat wrote:

The patch adds support for the SCM flush hcall for the nvdimm devices.
To be available for exploitation by guest through the next patch.

The hcall expects the semantics such that the flush to return
with one of H_LONG_BUSY when the operation is expected to take longer
time along with a continue_token. The hcall to be called again providing
the continue_token to get the status. So, all fresh requests are put into
a 'pending' list and flush worker is submitted to the thread pool. The
thread pool completion callbacks move the requests to 'completed' list,
which are cleaned up after reporting to guest in subsequent hcalls t





@@ -30,6 +31,7 @@
  #include "hw/ppc/fdt.h"
  #include "qemu/range.h"
  #include "hw/ppc/spapr_numa.h"
+#include "block/thread-pool.h"
  
  /* DIMM health bitmap bitmap indicators. Taken from kernel's papr_scm.c */

  /* SCM device is unable to persist memory contents */
@@ -375,6 +377,243 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
  return H_SUCCESS;
  }
  
+static uint64_t flush_token;


Better to put this in the machine state structure than a global.


Moved it to the device state itself as suggested; the states list is per
device now.





+static int flush_worker_cb(void *opaque)
+{
+int ret = H_SUCCESS;
+SpaprNVDIMMDeviceFlushState *state = opaque;
+
+/* flush raw backing image */
+  





+ !QLIST_EMPTY(&spapr->completed_flush_states));
+}
+
+static int spapr_nvdimm_post_load(void *opaque, int version_id)
+{
+SpaprMachineState *spapr = (SpaprMachineState *)opaque;
+SpaprNVDIMMDeviceFlushState *state, *next;
+PCDIMMDevice *dimm;
+HostMemoryBackend *backend = NULL;
+ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
+SpaprDrc *drc;
+
+QLIST_FOREACH_SAFE(state, &spapr->completed_flush_states, node, next) {


I don't think you need FOREACH_SAFE here.  You're not removing entries
from the loop body.  If you're trying to protect against concurrent
removals, I don't think FOREACH_SAFE is sufficient, you'll need an
actual lock (but I think it's already protected by the BQL).


Changed it here, below, and also at spapr_nvdimm_get_flush_status() while
traversing the pending list. Verified that all these invocations are called
with the BQL held.





+if (flush_token < state->continue_token) {
+flush_token = state->continue_token;
+}
+}
+
+QLIST_FOREACH_SAFE(state, &spapr->pending_flush_states, node, next) {


Sane comments here.


+if (flush_token < state->continue_token) {
+flush_token = state->continue_token;
+}
+
+drc = spapr_drc_by_index(state->drcidx);
+dimm = PC_DIMM(drc->dev);
+backend = MEMORY_BACKEND(dimm->hostmem);
+state->backend_fd = memory_region_get_fd(&backend->mr);
+
+thread_pool_submit_aio(pool, flush_worker_cb, state,
+   spapr_nvdimm_flush_completion_cb, state);
+}
+
+return 0;
+}
+
+const VMStateDescription vmstate_spapr_nvdimm_states = {
+.name = "spapr_nvdimm_states",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = spapr_nvdimm_states_needed,
+.post_load = spapr_nvdimm_post_load,
+.fields = (VMStateField[]) {
+VMSTATE_QLIST_V(completed_flush_states, SpaprMachineState, 1,
+vmstate_spapr_nvdimm_flush_state,
+SpaprNVDIMMDeviceFlushState, node),
+VMSTATE_QLIST_V(pending_flush_states, SpaprMachineState, 1,
+vmstate_spapr_nvdimm_flush_state,
+SpaprNVDIMMDeviceFlushState, node),
+VMSTATE_END_OF_LIST()
+},
+};
+
+/*
+ * Assign a token and reserve it for the new flush state.
+ */
+static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state(
+  SpaprMachineState *spapr)
+{
+SpaprNVDIMMDeviceFlushState *state;
+
+state = g_malloc0(sizeof(*state));
+
+flush_token++;
+/* Token zero is presumed as no job pending. Handle the overflow to zero */
+if (flush_token == 0) {
+flush_token++;


Hmm... strictly speaking, this isn't safe.  It's basically never going
to happen in practice, but in theory there's nothing preventing
continue_token 1 still being outstanding when the flush_token counter
overflows.

Come to think of it, since it's a uint64_t, I think an actual overflow
is also never going to happen in practice.  Maybe we should just
assert() on overflow, and fix it in the unlikely event that we ever
discover a case where it could happen.


Have added the assert on overflow.
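
For reference, the reworked token allocation roughly takes the following
shape (an illustrative sketch against the per-device fields of the v6
patch, not the exact hunk):

    nvdimm->nvdimm_flush_token++;
    /* Token zero means no flush pending; a 64-bit counter is never
     * expected to wrap in practice, so assert instead of handling it. */
    g_assert(nvdimm->nvdimm_flush_token != 0);
    state->continue_token = nvdimm->nvdimm_flush_token;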




+}
+state->continue_token = flush_token;
+
+QLIST_INSERT_HEAD(&spapr->pending_flush_states, state, node);
+
+return state;
+}
+
+/*
+ *


Thanks!



[PATCH v6 2/3] spapr: nvdimm: Implement H_SCM_FLUSH hcall

2022-02-01 Thread Shivaprasad G Bhat
The patch adds support for the SCM flush hcall for the nvdimm devices,
to be made available for exploitation by the guest through the next patch.
The hcall is applicable only to the new SPAPR-specific device class which is
also introduced in this patch.

The hcall semantics are such that the flush returns
H_LONG_BUSY_ORDER_10_MSEC along with a continue_token when the operation
is expected to take longer. The hcall is then to be called again with the
continue_token to get the status. So, all fresh requests are put into
a 'pending' list and a flush worker is submitted to the thread pool. The
thread pool completion callbacks move the requests to a 'completed' list,
which is cleaned up after the return status has been collected for the
guest in a subsequent hcall from the guest.
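
As a rough illustration of the guest-side contract described above (a
sketch only, not the actual papr_scm driver code; the hcall wrapper,
return-buffer handling and msleep() delay are assumptions):

    /* Start a new flush with token 0, then poll with the returned
     * continue_token until the hypervisor stops reporting long-busy. */
    uint64_t token = 0;
    int64_t rc;

    do {
        rc = hcall_scm_flush(drc_index, token, &token); /* assumed wrapper */
        if (rc == H_LONG_BUSY_ORDER_10_MSEC) {
            msleep(10); /* retry later with the continue_token */
        }
    } while (rc == H_LONG_BUSY_ORDER_10_MSEC);
    /* rc is now H_SUCCESS, or an error such as H_HARDWARE from the flush */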

The semantics make it necessary to preserve the continue_tokens and
their return status across migrations. So, the completed flush states
are forwarded to the destination and the pending ones are restarted
at the destination in post_load. The necessary nvdimm flush specific
vmstate structures are also introduced in this patch which are to be
saved in the new SPAPR specific nvdimm device to be introduced in the
following patch.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr.c|2 
 hw/ppc/spapr_nvdimm.c |  263 +
 include/hw/ppc/spapr.h|4 -
 include/hw/ppc/spapr_nvdimm.h |1 
 4 files changed, 269 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 3d6ec309dd..9263985663 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1634,6 +1634,8 @@ static void spapr_machine_reset(MachineState *machine)
 spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
 }
 
+spapr_nvdimm_finish_flushes();
+
 /* DRC reset may cause a device to be unplugged. This will cause troubles
  * if this device is used by another device (eg, a running vhost backend
  * will crash QEMU if the DIMM holding the vring goes away). To avoid such
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 91de1052f2..ed6fda2c23 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "qapi/error.h"
 #include "hw/ppc/spapr_drc.h"
 #include "hw/ppc/spapr_nvdimm.h"
@@ -30,6 +31,9 @@
 #include "hw/ppc/fdt.h"
 #include "qemu/range.h"
 #include "hw/ppc/spapr_numa.h"
+#include "block/thread-pool.h"
+#include "migration/vmstate.h"
+#include "qemu/pmem.h"
 
 /* DIMM health bitmap bitmap indicators. Taken from kernel's papr_scm.c */
 /* SCM device is unable to persist memory contents */
@@ -47,6 +51,14 @@
 /* Have an explicit check for alignment */
 QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE);
 
+#define TYPE_SPAPR_NVDIMM "spapr-nvdimm"
+OBJECT_DECLARE_TYPE(SpaprNVDIMMDevice, SPAPRNVDIMMClass, SPAPR_NVDIMM)
+
+struct SPAPRNVDIMMClass {
+/* private */
+NVDIMMClass parent_class;
+};
+
 bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
uint64_t size, Error **errp)
 {
@@ -375,6 +387,256 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 return H_SUCCESS;
 }
 
+typedef struct SpaprNVDIMMDeviceFlushState {
+uint64_t continue_token;
+int64_t hcall_ret;
+int backend_fd;
+uint32_t drcidx;
+
+QLIST_ENTRY(SpaprNVDIMMDeviceFlushState) node;
+} SpaprNVDIMMDeviceFlushState;
+
+typedef struct SpaprNVDIMMDevice SpaprNVDIMMDevice;
+struct SpaprNVDIMMDevice {
+NVDIMMDevice parent_obj;
+
+uint64_t nvdimm_flush_token;
+QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_nvdimm_flush_states;
+QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_nvdimm_flush_states;
+};
+
+static int flush_worker_cb(void *opaque)
+{
+SpaprNVDIMMDeviceFlushState *state = opaque;
+SpaprDrc *drc = spapr_drc_by_index(state->drcidx);
+PCDIMMDevice *dimm = PC_DIMM(drc->dev);
+HostMemoryBackend *backend = MEMORY_BACKEND(dimm->hostmem);
+
+if (object_property_get_bool(OBJECT(backend), "pmem", NULL)) {
+MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
+void *ptr = memory_region_get_ram_ptr(mr);
+size_t size = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP,
+   NULL);
+
+/* flush pmem backend */
+pmem_persist(ptr, size);
+} else {
+/* flush raw backing image */
+if (qemu_fdatasync(state->backend_fd) < 0) {
+error_report("papr_scm: Could not sync nvdimm to backend file: %s",
+ strerror(errno));
+return H_HARDWARE;
+}
+}
+
+return H_SUCCESS;
+}
+
+static v

[PATCH REBASED v5 2/2] spapr: nvdimm: Introduce spapr-nvdimm device

2021-07-07 Thread Shivaprasad G Bhat
If the device backend is not persistent memory for the nvdimm, there is a
need for explicit IO flushes on the backend to ensure persistence.

On SPAPR, the issue is addressed by adding a new hcall to request an
explicit flush from the guest when the backend is not pmem. So, the
approach here is to convey when the hcall flush is required in a device
tree property. Once the guest knows the device backend is not pmem, it
makes the hcall whenever a flush is required.

To set the device tree property, the patch introduces a new PAPR-specific
device type inheriting the nvdimm device. When the backend doesn't have
pmem="yes", the device tree property "ibm,hcall-flush-required" is set,
and the guest makes the hcall H_SCM_FLUSH to request an explicit flush.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr_nvdimm.c |   46 +
 include/hw/ppc/spapr_nvdimm.h |4 
 2 files changed, 50 insertions(+)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 4f8931ab15..4dc7c3f147 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -54,6 +54,8 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, 
NVDIMMDevice *nvdimm,
 {
 const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
 const MachineState *ms = MACHINE(hotplug_dev);
+PCDIMMDevice *dimm = PC_DIMM(nvdimm);
+MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
 g_autofree char *uuidstr = NULL;
 QemuUUID uuid;
 int ret;
@@ -91,6 +93,14 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, 
NVDIMMDevice *nvdimm,
 return false;
 }
 
+if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM) &&
+(memory_region_get_fd(mr) < 0)) {
+error_setg(errp, "spapr-nvdimm device requires the "
+   "memdev %s to be of memory-backend-file type",
+   object_get_canonical_path_component(OBJECT(dimm->hostmem)));
+return false;
+}
+
 return true;
 }
 
@@ -162,6 +172,21 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void 
*fdt,
  "operating-system")));
 _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0));
 
+if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {
+bool is_pmem = false;
+#ifdef CONFIG_LIBPMEM
+PCDIMMDevice *dimm = PC_DIMM(nvdimm);
+HostMemoryBackend *hostmem = dimm->hostmem;
+
+is_pmem = object_property_get_bool(OBJECT(hostmem), "pmem",
+   &error_abort);
+#endif
+if (!is_pmem) {
+_FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required",
+ NULL, 0));
+}
+}
+
 return child_offset;
 }
 
@@ -585,7 +610,16 @@ static target_ulong h_scm_flush(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 }
 
 dimm = PC_DIMM(drc->dev);
+if (!object_dynamic_cast(OBJECT(dimm), TYPE_SPAPR_NVDIMM)) {
+return H_PARAMETER;
+}
+
 backend = MEMORY_BACKEND(dimm->hostmem);
+#ifdef CONFIG_LIBPMEM
+if (object_property_get_bool(OBJECT(backend), "pmem", &error_abort)) {
+return H_UNSUPPORTED;
+}
+#endif
fd = memory_region_get_fd(&backend->mr);
 
 if (fd < 0) {
@@ -766,3 +800,15 @@ static void spapr_scm_register_types(void)
 }
 
 type_init(spapr_scm_register_types)
+
+static TypeInfo spapr_nvdimm_info = {
+.name  = TYPE_SPAPR_NVDIMM,
+.parent= TYPE_NVDIMM,
+};
+
+static void spapr_nvdimm_register_types(void)
+{
+type_register_static(&spapr_nvdimm_info);
+}
+
+type_init(spapr_nvdimm_register_types)
diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h
index 24d8e37b33..fb4e56418e 100644
--- a/include/hw/ppc/spapr_nvdimm.h
+++ b/include/hw/ppc/spapr_nvdimm.h
@@ -13,6 +13,10 @@
 #include "hw/mem/nvdimm.h"
 #include "migration/vmstate.h"
 
+#define TYPE_SPAPR_NVDIMM "spapr-nvdimm"
+OBJECT_DECLARE_SIMPLE_TYPE(SpaprNVDIMMDevice, SPAPR_NVDIMM)
+
+typedef struct SpaprNVDIMMDevice  SpaprNVDIMMDevice;
 typedef struct SpaprDrc SpaprDrc;
 typedef struct SpaprMachineState SpaprMachineState;
 





[PATCH REBASED v5 1/2] spapr: nvdimm: Implement H_SCM_FLUSH hcall

2021-07-07 Thread Shivaprasad G Bhat
The patch adds support for the SCM flush hcall for the nvdimm devices.
To be available for exploitation by guest through the next patch.

The hcall semantics are such that the flush returns one of the H_LONG_BUSY
codes along with a continue_token when the operation is expected to take
longer. The hcall is then to be called again with the continue_token to get
the status. So, all fresh requests are put into a 'pending' list and a
flush worker is submitted to the thread pool. The thread pool completion
callbacks move the requests to a 'completed' list, which is cleaned up
after reporting to the guest in subsequent hcalls to get the status.

The semantics makes it necessary to preserve the continue_tokens and
their return status across migrations. So, the completed flush states
are forwarded to the destination and the pending ones are restarted
at the destination in post_load. The necessary nvdimm flush specific
vmstate structures are added to the spapr machine vmstate.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr.c|6 +
 hw/ppc/spapr_nvdimm.c |  240 +
 include/hw/ppc/spapr.h|   11 ++
 include/hw/ppc/spapr_nvdimm.h |   13 ++
 4 files changed, 269 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 4dd90b75cc..546d825dde 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1622,6 +1622,8 @@ static void spapr_machine_reset(MachineState *machine)
 spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
 }
 
+spapr_nvdimm_finish_flushes(spapr);
+
 /* DRC reset may cause a device to be unplugged. This will cause troubles
  * if this device is used by another device (eg, a running vhost backend
  * will crash QEMU if the DIMM holding the vring goes away). To avoid such
@@ -2018,6 +2020,7 @@ static const VMStateDescription vmstate_spapr = {
&vmstate_spapr_cap_ccf_assist,
&vmstate_spapr_cap_fwnmi,
&vmstate_spapr_fwnmi,
+&vmstate_spapr_nvdimm_states,
 NULL
 }
 };
@@ -3014,6 +3017,9 @@ static void spapr_machine_init(MachineState *machine)
 }
 
qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
+
+QLIST_INIT(&spapr->pending_flush_states);
+QLIST_INIT(&spapr->completed_flush_states);
 }
 
 #define DEFAULT_KVM_TYPE "auto"
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 91de1052f2..4f8931ab15 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "qapi/error.h"
 #include "hw/ppc/spapr_drc.h"
 #include "hw/ppc/spapr_nvdimm.h"
@@ -30,6 +31,7 @@
 #include "hw/ppc/fdt.h"
 #include "qemu/range.h"
 #include "hw/ppc/spapr_numa.h"
+#include "block/thread-pool.h"
 
 /* DIMM health bitmap bitmap indicators. Taken from kernel's papr_scm.c */
 /* SCM device is unable to persist memory contents */
@@ -375,6 +377,243 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 return H_SUCCESS;
 }
 
+static uint64_t flush_token;
+
+static int flush_worker_cb(void *opaque)
+{
+int ret = H_SUCCESS;
+SpaprNVDIMMDeviceFlushState *state = opaque;
+
+/* flush raw backing image */
+if (qemu_fdatasync(state->backend_fd) < 0) {
+error_report("papr_scm: Could not sync nvdimm to backend file: %s",
+ strerror(errno));
+ret = H_HARDWARE;
+}
+
+return ret;
+}
+
+static void spapr_nvdimm_flush_completion_cb(void *opaque, int hcall_ret)
+{
+SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+SpaprNVDIMMDeviceFlushState *state = opaque;
+
+state->hcall_ret = hcall_ret;
+QLIST_REMOVE(state, node);
+QLIST_INSERT_HEAD(&spapr->completed_flush_states, state, node);
+}
+
+static const VMStateDescription vmstate_spapr_nvdimm_flush_state = {
+ .name = "spapr_nvdimm_flush_state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT64(continue_token, SpaprNVDIMMDeviceFlushState),
+ VMSTATE_INT64(hcall_ret, SpaprNVDIMMDeviceFlushState),
+ VMSTATE_UINT32(drcidx, SpaprNVDIMMDeviceFlushState),
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+static bool spapr_nvdimm_states_needed(void *opaque)
+{
+ SpaprMachineState *spapr = (SpaprMachineState *)opaque;
+
+ return (!QLIST_EMPTY(&spapr->pending_flush_states) ||
+ !QLIST_EMPTY(&spapr->completed_flush_states));
+}
+
+static int spapr_nvdimm_post_load(void *opaque, int version_id)
+{
+SpaprMachineState *spapr = (SpaprMachineState *)opaque;
+SpaprNVDIMMDeviceFlushState *state, *next;
+PCDIMMDevice *dimm;
+HostMemoryBackend *backend = NULL;
+ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
+SpaprDrc *drc;
+
+QLIST_FOREACH_SAFE(state, >completed_flush_states, nod

[PATCH REBASED v5 0/3] spapr: nvdimm: Introduce spapr-nvdimm device

2021-07-07 Thread Shivaprasad G Bhat
If the device backend is not persistent memory for the nvdimm, there
is a need for explicit IO flushes to ensure persistence.

On SPAPR, the issue is addressed by adding a new hcall to request
an explicit flush from the guest when the backend is not pmem.
So, the approach here is to convey when the hcall flush is required
in a device tree property. Once the guest knows the device needs
explicit flushes, it makes the hcall as and when required.

It was suggested to create a new device type to address the
explicit flush for such backends on PPC instead of extending the
generic nvdimm device with a new property. So, the patch introduces
the spapr-nvdimm device. The new device inherits the nvdimm device
with the new behaviour such that if the backend has pmem=no, the
device tree property is set.

The below demonstration shows the map_sync behavior for non-pmem
backends.
(https://github.com/avocado-framework-tests/avocado-misc-tests/blob/master/memory/ndctl.py.data/map_sync.c)

The pmem0 is from spapr-nvdimm with backend pmem=yes, and pmem1 is
from spapr-nvdimm with pmem=no, mounted as
/dev/pmem0 on /mnt1 type xfs 
(rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota)
/dev/pmem1 on /mnt2 type xfs 
(rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota)

[root@atest-guest ~]# ./mapsync /mnt1/newfile > When pmem=yes
[root@atest-guest ~]# ./mapsync /mnt2/newfile > when pmem=no
Failed to mmap  with Operation not supported
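
For reference, the core of such a MAP_SYNC probe is small; a minimal,
self-contained sketch (not the actual avocado test source linked above)
looks like this:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        if (argc < 2) {
            return 2;
        }
        int fd = open(argv[1], O_RDWR | O_CREAT, 0644);
        ftruncate(fd, 4096);
        /* MAP_SYNC is only honoured on synchronous-DAX capable pmem;
         * on anything else the mmap fails with EOPNOTSUPP. */
        void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
        if (p == MAP_FAILED) {
            perror("Failed to mmap");
            return 1;
        }
        munmap(p, 4096);
        close(fd);
        return 0;
    }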

First patch implements the hcall, adds the necessary
vmstate properties to spapr machine structure for carrying the hcall
status during save-restore. The nature of the hcall being asynchronous,
the patch uses aio utilities to offload the flush. The second patch
introduces the spapr-nvdimm device, adds the device tree property
for the guest when spapr-nvdimm is used with pmem="no" on the backend.

The kernel changes to exploit this hcall is at
https://github.com/linuxppc/linux/commit/75b7c05ebf9026.patch

---
v4 - https://lists.gnu.org/archive/html/qemu-devel/2021-04/msg05982.html
Changes from v4:
  - Introduce spapr-nvdimm device with nvdimm device as the parent.
  - The new spapr-nvdimm has no new properties. As this is a new
device and there is no migration related dependencies to be
taken care of, the device behavior is made to set the device tree
property and enable hcall when the device type spapr-nvdimm is
used with pmem="no"
  - Fixed commit messages
  - Added checks to ensure the backend is actually a file and not memory
  - Addressed things pointed out by Eric

v3 - https://lists.gnu.org/archive/html/qemu-devel/2021-03/msg07916.html
Changes from v3:
  - Fixed the forward declaration coding guideline violations in 1st patch.
  - Removed the code waiting for the flushes to complete during migration,
instead restart the flush worker on destination qemu in post load.
  - Got rid of the randomization of the flush tokens, using simple
counter.
  - Got rid of the redundant flush state lock, relying on the BQL now.
  - Handling the memory-backend-ram usage
  - Changed the sync-dax semantics from on/off to 'unsafe', 'writeback' and
'direct'.
Added prevention code using 'writeback' on arm and x86_64.
  - Fixed all the miscellaneous comments.

v2 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg07031.html
Changes from v2:
  - Using the thread pool based approach as suggested
  - Moved the async hcall handling code to spapr_nvdimm.c along
with some simplifications
  - Added vmstate to preserve the hcall status during save-restore
along with pre_save handler code to complete all ongoing flushes.
  - Added hw_compat magic for sync-dax 'on' on previous machines.
  - Miscellaneous minor fixes.

v1 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg06330.html
Changes from v1
  - Fixed a missed-out unlock
  - using QLIST_FOREACH instead of QLIST_FOREACH_SAFE while generating token

Shivaprasad G Bhat (2):
  spapr: nvdimm: Implement H_SCM_FLUSH hcall
  spapr: nvdimm: Introduce spapr-nvdimm device


 hw/ppc/spapr.c|6 +
 hw/ppc/spapr_nvdimm.c |  286 +
 include/hw/ppc/spapr.h|   11 +-
 include/hw/ppc/spapr_nvdimm.h |   17 ++
 4 files changed, 319 insertions(+), 1 deletion(-)

--
Signature




[PATCH] spapr: nvdimm: Fix the persistent-memory root node name in device tree

2021-05-26 Thread Shivaprasad G Bhat
The FDT code is adding the pmem root node by name "persistent-memory"
which should have been "ibm,persistent-memory".

Linux fetches the device tree nodes by type, and it has been working
correctly as the type is correct. However, a lookup by the intended
node name would fail, so fix that.
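
With the fix, the root node in the rendered device tree looks along the
lines of the following (an illustrative fragment based on the properties
set in the hunk below, not dumped from a real guest):

    ibm,persistent-memory {
        #address-cells = <0x1>;
        #size-cells = <0x0>;
        /* one child node per plugged nvdimm follows */
    };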

Reported-by: Aneesh Kumar K.V 
Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr_nvdimm.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 252204e25f..d7a4a0a051 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -163,11 +163,11 @@ int spapr_pmem_dt_populate(SpaprDrc *drc, 
SpaprMachineState *spapr,
 
 void spapr_dt_persistent_memory(SpaprMachineState *spapr, void *fdt)
 {
-int offset = fdt_subnode_offset(fdt, 0, "persistent-memory");
+int offset = fdt_subnode_offset(fdt, 0, "ibm,persistent-memory");
 GSList *iter, *nvdimms = nvdimm_get_device_list();
 
 if (offset < 0) {
-offset = fdt_add_subnode(fdt, 0, "persistent-memory");
+offset = fdt_add_subnode(fdt, 0, "ibm,persistent-memory");
 _FDT(offset);
 _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x1)));
 _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));





[PATCH v5 2/3] spapr: nvdimm: Implement H_SCM_FLUSH hcall

2021-05-18 Thread Shivaprasad G Bhat
The patch adds support for the SCM flush hcall for the nvdimm devices.
To be available for exploitation by guest through the next patch.

The hcall semantics are such that the flush returns one of the H_LONG_BUSY
codes along with a continue_token when the operation is expected to take
longer. The hcall is then to be called again with the continue_token to get
the status. So, all fresh requests are put into a 'pending' list and a
flush worker is submitted to the thread pool. The thread pool completion
callbacks move the requests to a 'completed' list, which is cleaned up
after reporting to the guest in subsequent hcalls to get the status.

The semantics makes it necessary to preserve the continue_tokens and
their return status across migrations. So, the completed flush states
are forwarded to the destination and the pending ones are restarted
at the destination in post_load. The necessary nvdimm flush specific
vmstate structures are added to the spapr machine vmstate.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr.c|6 +
 hw/ppc/spapr_nvdimm.c |  240 +
 include/hw/ppc/spapr.h|   11 ++
 include/hw/ppc/spapr_nvdimm.h |   13 ++
 4 files changed, 269 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index c23bcc4490..7a29ea2b05 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1622,6 +1622,8 @@ static void spapr_machine_reset(MachineState *machine)
 spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
 }
 
+spapr_nvdimm_finish_flushes(spapr);
+
 /* DRC reset may cause a device to be unplugged. This will cause troubles
  * if this device is used by another device (eg, a running vhost backend
  * will crash QEMU if the DIMM holding the vring goes away). To avoid such
@@ -2018,6 +2020,7 @@ static const VMStateDescription vmstate_spapr = {
&vmstate_spapr_cap_ccf_assist,
&vmstate_spapr_cap_fwnmi,
&vmstate_spapr_fwnmi,
+&vmstate_spapr_nvdimm_states,
 NULL
 }
 };
@@ -3012,6 +3015,9 @@ static void spapr_machine_init(MachineState *machine)
 }
 
qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
+
+QLIST_INIT(&spapr->pending_flush_states);
+QLIST_INIT(&spapr->completed_flush_states);
 }
 
 #define DEFAULT_KVM_TYPE "auto"
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 3f57a8b6fa..d460a098c0 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "qapi/error.h"
 #include "hw/ppc/spapr_drc.h"
 #include "hw/ppc/spapr_nvdimm.h"
@@ -30,6 +31,7 @@
 #include "hw/ppc/fdt.h"
 #include "qemu/range.h"
 #include "hw/ppc/spapr_numa.h"
+#include "block/thread-pool.h"
 
 /* DIMM health bitmap bitmap indicators. Taken from kernel's papr_scm.c */
 /* SCM device is unable to persist memory contents */
@@ -375,6 +377,243 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 return H_SUCCESS;
 }
 
+static uint64_t flush_token;
+
+static int flush_worker_cb(void *opaque)
+{
+int ret = H_SUCCESS;
+SpaprNVDIMMDeviceFlushState *state = opaque;
+
+/* flush raw backing image */
+if (qemu_fdatasync(state->backend_fd) < 0) {
+error_report("papr_scm: Could not sync nvdimm to backend file: %s",
+ strerror(errno));
+ret = H_HARDWARE;
+}
+
+return ret;
+}
+
+static void spapr_nvdimm_flush_completion_cb(void *opaque, int hcall_ret)
+{
+SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+SpaprNVDIMMDeviceFlushState *state = opaque;
+
+state->hcall_ret = hcall_ret;
+QLIST_REMOVE(state, node);
+QLIST_INSERT_HEAD(&spapr->completed_flush_states, state, node);
+}
+
+static const VMStateDescription vmstate_spapr_nvdimm_flush_state = {
+ .name = "spapr_nvdimm_flush_state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT64(continue_token, SpaprNVDIMMDeviceFlushState),
+ VMSTATE_INT64(hcall_ret, SpaprNVDIMMDeviceFlushState),
+ VMSTATE_UINT32(drcidx, SpaprNVDIMMDeviceFlushState),
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+static bool spapr_nvdimm_states_needed(void *opaque)
+{
+ SpaprMachineState *spapr = (SpaprMachineState *)opaque;
+
+ return (!QLIST_EMPTY(&spapr->pending_flush_states) ||
+ !QLIST_EMPTY(&spapr->completed_flush_states));
+}
+
+static int spapr_nvdimm_post_load(void *opaque, int version_id)
+{
+SpaprMachineState *spapr = (SpaprMachineState *)opaque;
+SpaprNVDIMMDeviceFlushState *state, *next;
+PCDIMMDevice *dimm;
+HostMemoryBackend *backend = NULL;
+ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
+SpaprDrc *drc;
+
+QLIST_FOREACH_SAFE(state, >completed_flush_states, nod

[PATCH v5 1/3] spapr: nvdimm: Forward declare and move the definitions

2021-05-18 Thread Shivaprasad G Bhat
The subsequent patches add definitions which tend to get
the compilation to cyclic dependency. So, prepare with
forward declarations, move the definitions and clean up.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr_nvdimm.c |   12 
 include/hw/ppc/spapr_nvdimm.h |   14 ++
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 252204e25f..3f57a8b6fa 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -35,6 +35,18 @@
 /* SCM device is unable to persist memory contents */
 #define PAPR_PMEM_UNARMED PPC_BIT(0)
 
+/*
+ * The nvdimm size should be aligned to SCM block size.
+ * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
+ * in order to have SCM regions not to overlap with dimm memory regions.
+ * The SCM devices can have variable block sizes. For now, fixing the
+ * block size to the minimum value.
+ */
+#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
+
+/* Have an explicit check for alignment */
+QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE);
+
 bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
uint64_t size, Error **errp)
 {
diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h
index 73be250e2a..764f999f54 100644
--- a/include/hw/ppc/spapr_nvdimm.h
+++ b/include/hw/ppc/spapr_nvdimm.h
@@ -11,19 +11,9 @@
 #define HW_SPAPR_NVDIMM_H
 
 #include "hw/mem/nvdimm.h"
-#include "hw/ppc/spapr.h"
 
-/*
- * The nvdimm size should be aligned to SCM block size.
- * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
- * inorder to have SCM regions not to overlap with dimm memory regions.
- * The SCM devices can have variable block sizes. For now, fixing the
- * block size to the minimum value.
- */
-#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
-
-/* Have an explicit check for alignment */
-QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE);
+typedef struct SpaprDrc SpaprDrc;
+typedef struct SpaprMachineState SpaprMachineState;
 
 int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
void *fdt, int *fdt_start_offset, Error **errp);





[PATCH v5 0/3] spapr: nvdimm: Introduce spapr-nvdimm device

2021-05-18 Thread Shivaprasad G Bhat
If the device backend is not persistent memory for the nvdimm, there
is a need for explicit IO flushes to ensure persistence.

On SPAPR, the issue is addressed by adding a new hcall to request
an explicit flush from the guest when the backend is not pmem.
So, the approach here is to convey when the hcall flush is required
in a device tree property. Once the guest knows the device needs
explicit flushes, it makes the hcall as and when required.

It was suggested to create a new device type to address the
explicit flush for such backends on PPC instead of extending the
generic nvdimm device with a new property. So, the patch introduces
the spapr-nvdimm device. The new device inherits the nvdimm device
with the new behaviour such that if the backend has pmem=no, the
device tree property is set.

The below demonstration shows the map_sync behavior for non-pmem
backends.
(https://github.com/avocado-framework-tests/avocado-misc-tests/blob/master/memory/ndctl.py.data/map_sync.c)

The pmem0 is from spapr-nvdimm with backend pmem=yes, and pmem1 is
from spapr-nvdimm with pmem=no, mounted as
/dev/pmem0 on /mnt1 type xfs 
(rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota)
/dev/pmem1 on /mnt2 type xfs 
(rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota)

[root@atest-guest ~]# ./mapsync /mnt1/newfile > When pmem=yes
[root@atest-guest ~]# ./mapsync /mnt2/newfile > when pmem=no
Failed to mmap  with Operation not supported

The first patch does the header file cleanup necessary for the
subsequent ones. Second patch implements the hcall, adds the necessary
vmstate properties to spapr machine structure for carrying the hcall
status during save-restore. The nature of the hcall being asynchronous,
the patch uses aio utilities to offload the flush. The third patch
introduces the spapr-nvdimm device, adds the device tree property
for the guest when spapr-nvdimm is used with pmem="no" on the backend.

The kernel changes to exploit this hcall is at
https://github.com/linuxppc/linux/commit/75b7c05ebf9026.patch

---
v4 - https://lists.gnu.org/archive/html/qemu-devel/2021-04/msg05982.html
Changes from v4:
  - Introduce spapr-nvdimm device with nvdimm device as the parent.
  - The new spapr-nvdimm has no new properties. As this is a new
device and there is no migration related dependencies to be
taken care of, the device behavior is made to set the device tree
property and enable hcall when the device type spapr-nvdimm is
used with pmem="no"
  - Fixed commit messages
  - Added checks to ensure the backend is actually a file and not memory
  - Addressed things pointed out by Eric

v3 - https://lists.gnu.org/archive/html/qemu-devel/2021-03/msg07916.html
Changes from v3:
  - Fixed the forward declaration coding guideline violations in 1st patch.
  - Removed the code waiting for the flushes to complete during migration,
instead restart the flush worker on destination qemu in post load.
  - Got rid of the randomization of the flush tokens, using simple
counter.
  - Got rid of the redundant flush state lock, relying on the BQL now.
  - Handling the memory-backend-ram usage
  - Changed the sync-dax semantics from on/off to 'unsafe', 'writeback' and
'direct'.
Added prevention code using 'writeback' on arm and x86_64.
  - Fixed all the miscellaneous comments.

v2 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg07031.html
Changes from v2:
  - Using the thread pool based approach as suggested
  - Moved the async hcall handling code to spapr_nvdimm.c along
with some simplifications
  - Added vmstate to preserve the hcall status during save-restore
along with pre_save handler code to complete all ongoing flushes.
  - Added hw_compat magic for sync-dax 'on' on previous machines.
  - Miscellaneous minor fixes.

v1 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg06330.html
Changes from v1
  - Fixed a missed-out unlock
  - using QLIST_FOREACH instead of QLIST_FOREACH_SAFE while generating token

Shivaprasad G Bhat (3):
  spapr: nvdimm: Forward declare and move the definitions
  spapr: nvdimm: Implement H_SCM_FLUSH hcall
  spapr: nvdimm: Introduce spapr-nvdimm device


 hw/ppc/spapr.c|6 +
 hw/ppc/spapr_nvdimm.c |  298 +
 include/hw/ppc/spapr.h|   11 +-
 include/hw/ppc/spapr_nvdimm.h |   29 ++--
 4 files changed, 332 insertions(+), 12 deletions(-)

--
Signature




[PATCH v5 3/3] spapr: nvdimm: Introduce spapr-nvdimm device

2021-05-18 Thread Shivaprasad G Bhat
If the device backend is not persistent memory for the nvdimm, there is a
need for explicit IO flushes on the backend to ensure persistence.

On SPAPR, the issue is addressed by adding a new hcall to request an
explicit flush from the guest when the backend is not pmem. So, the
approach here is to convey when the hcall flush is required in a device
tree property. Once the guest knows the device backend is not pmem, it
makes the hcall whenever a flush is required.

To set the device tree property, the patch introduces a new PAPR-specific
device type inheriting the nvdimm device. When the backend doesn't have
pmem="yes", the device tree property "ibm,hcall-flush-required" is set,
and the guest makes the hcall H_SCM_FLUSH to request an explicit flush.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr_nvdimm.c |   46 +
 include/hw/ppc/spapr_nvdimm.h |4 
 2 files changed, 50 insertions(+)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index d460a098c0..9a04df4c47 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -54,6 +54,8 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, 
NVDIMMDevice *nvdimm,
 {
 const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
 const MachineState *ms = MACHINE(hotplug_dev);
+PCDIMMDevice *dimm = PC_DIMM(nvdimm);
+MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
 g_autofree char *uuidstr = NULL;
 QemuUUID uuid;
 int ret;
@@ -91,6 +93,14 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, 
NVDIMMDevice *nvdimm,
 return false;
 }
 
+if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM) &&
+(memory_region_get_fd(mr) < 0)) {
+error_setg(errp, "spapr-nvdimm device requires the "
+   "memdev %s to be of memory-backend-file type",
+   object_get_canonical_path_component(OBJECT(dimm->hostmem)));
+return false;
+}
+
 return true;
 }
 
@@ -162,6 +172,21 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void 
*fdt,
  "operating-system")));
 _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0));
 
+if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {
+bool is_pmem = false;
+#ifdef CONFIG_LIBPMEM
+PCDIMMDevice *dimm = PC_DIMM(nvdimm);
+HostMemoryBackend *hostmem = dimm->hostmem;
+
+is_pmem = object_property_get_bool(OBJECT(hostmem), "pmem",
+   &error_abort);
+#endif
+if (!is_pmem) {
+_FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required",
+ NULL, 0));
+}
+}
+
 return child_offset;
 }
 
@@ -585,7 +610,16 @@ static target_ulong h_scm_flush(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 }
 
 dimm = PC_DIMM(drc->dev);
+if (!object_dynamic_cast(OBJECT(dimm), TYPE_SPAPR_NVDIMM)) {
+return H_PARAMETER;
+}
+
 backend = MEMORY_BACKEND(dimm->hostmem);
+#ifdef CONFIG_LIBPMEM
+if (object_property_get_bool(OBJECT(backend), "pmem", &error_abort)) {
+return H_UNSUPPORTED;
+}
+#endif
fd = memory_region_get_fd(&backend->mr);
 
 if (fd < 0) {
@@ -766,3 +800,15 @@ static void spapr_scm_register_types(void)
 }
 
 type_init(spapr_scm_register_types)
+
+static TypeInfo spapr_nvdimm_info = {
+.name  = TYPE_SPAPR_NVDIMM,
+.parent= TYPE_NVDIMM,
+};
+
+static void spapr_nvdimm_register_types(void)
+{
+type_register_static(&spapr_nvdimm_info);
+}
+
+type_init(spapr_nvdimm_register_types)
diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h
index 24d8e37b33..fb4e56418e 100644
--- a/include/hw/ppc/spapr_nvdimm.h
+++ b/include/hw/ppc/spapr_nvdimm.h
@@ -13,6 +13,10 @@
 #include "hw/mem/nvdimm.h"
 #include "migration/vmstate.h"
 
+#define TYPE_SPAPR_NVDIMM "spapr-nvdimm"
+OBJECT_DECLARE_SIMPLE_TYPE(SpaprNVDIMMDevice, SPAPR_NVDIMM)
+
+typedef struct SpaprNVDIMMDevice  SpaprNVDIMMDevice;
 typedef struct SpaprDrc SpaprDrc;
 typedef struct SpaprMachineState SpaprMachineState;
 





Re: [PATCH v4 0/3] nvdimm: Enable sync-dax property for nvdimm

2021-05-03 Thread Shivaprasad G Bhat



On 5/1/21 12:44 AM, Dan Williams wrote:

Some corrections to terminology confusion below...


On Wed, Apr 28, 2021 at 8:49 PM Shivaprasad G Bhat  wrote:

The nvdimm devices are expected to ensure write persistence during power
failure kind of scenarios.

No, QEMU is not expected to make that guarantee. QEMU is free to lie
to the guest about the persistence guarantees of the guest PMEM
ranges. It's more accurate to say that QEMU nvdimm devices can emulate
persistent memory and optionally pass through host power-fail
persistence guarantees to the guest. The power-fail persistence domain
can be one of "cpu_cache", or "memory_controller" if the persistent
memory region is "synchronous". If the persistent range is not
synchronous, it really isn't "persistent memory"; it's memory mapped
storage that needs I/O commands to flush.


Since this is a virtual nvdimm (v-nvdimm) backed by a file, the data is
completely in the host pagecache, and we need a way to ensure that the host
pagecache is flushed to the backend. This is analogous to the WPQ flush
being offloaded to the hypervisor.


Ref: https://github.com/dgibson/qemu/blob/main/docs/nvdimm.txt





The libpmem has architecture specific instructions like dcbf on POWER

Which "libpmem" is this? PMDK is a reference library not a PMEM
interface... maybe I'm missing what libpmem has to do with QEMU?



I was referring to the semantics of flushing pmem cache lines as in
PMDK/libpmem.





to flush the cache data to backend nvdimm device during normal writes
followed by explicit flushes if the backend devices are not synchronous
DAX capable.

Qemu - virtual nvdimm devices are memory mapped. The dcbf in the guest
and the subsequent flush doesn't traslate to actual flush to the backend

s/traslate/translate/


file on the host in case of file backed v-nvdimms. This is addressed by
virtio-pmem in case of x86_64 by making explicit flushes translating to
fsync at qemu.

Note that virtio-pmem was a proposal for a specific optimization of
allowing guests to share page cache. The virtio-pmem approach is not
to be confused with actual persistent memory.


On SPAPR, the issue is addressed by adding a new hcall to
request for an explicit flush from the guest ndctl driver when the backend

What is an "ndctl" driver? ndctl is userspace tooling, do you mean the
guest pmem driver?



Oops, wrong terminology. I was referring to the guest libnvdimm and
papr_scm kernel modules.





nvdimm cannot ensure write persistence with dcbf alone. So, the approach
here is to convey when the hcall flush is required in a device tree
property. The guest makes the hcall when the property is found, instead
of relying on dcbf.

A new device property sync-dax is added to the nvdimm device. When the
sync-dax is 'writeback'(default for PPC), device property
"hcall-flush-required" is set, and the guest makes hcall H_SCM_FLUSH
requesting for an explicit flush.

I'm not sure "sync-dax" is a suitable name for the property of the
guest persistent memory.



The sync-dax property translates to the ND_REGION_ASYNC flag being
set/unset for the pmem region, and also to whether the nvdimm_flush
callback is provided in papr_scm or not. As everything boils down to the
synchronous nature of the device, I chose sync-dax for the name.



  There is no requirement that the
memory-backend file for a guest be a dax-capable file. It's also
implementation specific what hypercall needs to be invoked for a given
occurrence of "sync-dax". What does that map to on non-PPC platforms
for example?



The backend file can be dax-capable, which is hinted using
"sync-dax=direct". When the backend is not dax-capable, "sync-dax=writeback"
is to be used, so that the guest makes the hcall. On all non-PPC archs,
with "sync-dax=writeback" QEMU errors out stating the lack of support.



  It seems to me that an "nvdimm" device presents the
synchronous usage model and a whole other device type implements an
async-hypercall setup that the guest happens to service with its
nvdimm stack, but it's not an "nvdimm" anymore at that point.



In case the file backing the v-nvdimm is not dax-capable, we need flush

semantics on the guest to be mapped to pagecache flush on the host side.





sync-dax is "unsafe" on all other platforms(x86, ARM) and old pseries machines
prior to 5.2 on PPC. sync-dax="writeback" on ARM and x86_64 is prevented
now as the flush semantics are unimplemented.

"sync-dax" has no meaning on its own, I think this needs an explicit
mechanism to convey both the "not-sync" property *and* the callback
method, it shouldn't be inferred by arch type.



Yes. On all platforms "sync-dax=unsafe" means that on host power failure
the host pagecache is lost, and subsequently the data written by the
guest will also be gone. This is the default for non-PPC.


[PATCH v4 3/3] nvdimm: Enable sync-dax device property for nvdimm

2021-04-28 Thread Shivaprasad G Bhat
The patch adds the 'sync-dax' property to the nvdimm device.

When sync-dax is 'direct', it indicates the backend is synchronous DAX
capable and no explicit flush requests are required. When the mode is
set to 'writeback', it indicates the backend is not synchronous DAX
capable and explicit flushes to the hypervisor are required.

On PPC, where the flush requests from the guest can be honoured by QEMU,
the 'writeback' mode is supported and set as the default. The device
tree property "hcall-flush-required" is added to the nvdimm node, which
makes the guest issue H_SCM_FLUSH hcalls to request flushes explicitly.
This is the default behaviour when the sync-dax property is not set for
the nvdimm device. For old pSeries machines, the default is 'unsafe'.

For non-PPC platforms, the mode is set to 'unsafe' as the default.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/arm/virt.c   |   28 +++--
 hw/i386/pc.c|   28 +++--
 hw/mem/nvdimm.c |   52 +++
 hw/ppc/spapr.c  |   10 +
 hw/ppc/spapr_nvdimm.c   |   39 +++
 include/hw/mem/nvdimm.h |   11 ++
 include/hw/ppc/spapr.h  |1 +
 qapi/common.json|   20 ++
 8 files changed, 179 insertions(+), 10 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 9f01d9041b..f32e3e4010 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2358,6 +2358,27 @@ static const CPUArchIdList 
*virt_possible_cpu_arch_ids(MachineState *ms)
 return ms->possible_cpus;
 }
 
+static bool virt_nvdimm_validate(const MachineState *ms, NVDIMMDevice *nvdimm,
+ Error **errp)
+{
+NvdimmSyncModes sync;
+
+if (!ms->nvdimms_state->is_enabled) {
+error_setg(errp, "nvdimm is not enabled: add 'nvdimm=on' to '-M'");
+return false;
+}
+
+sync = object_property_get_enum(OBJECT(nvdimm), NVDIMM_SYNC_DAX_PROP,
+"NvdimmSyncModes", _abort);
+if (sync == NVDIMM_SYNC_MODES_WRITEBACK) {
+error_setg(errp, "NVDIMM device " NVDIMM_SYNC_DAX_PROP
+ "=%s mode unsupported", NvdimmSyncModes_str(sync));
+return false;
+}
+
+return true;
+}
+
 static void virt_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
  Error **errp)
 {
@@ -2376,9 +2397,10 @@ static void virt_memory_pre_plug(HotplugHandler 
*hotplug_dev, DeviceState *dev,
 return;
 }
 
-if (is_nvdimm && !ms->nvdimms_state->is_enabled) {
-error_setg(errp, "nvdimm is not enabled: add 'nvdimm=on' to '-M'");
-return;
+if (is_nvdimm) {
+if (!virt_nvdimm_validate(ms, NVDIMM(dev), errp)) {
+return;
+}
 }
 
 pc_dimm_pre_plug(PC_DIMM(dev), MACHINE(hotplug_dev), NULL, errp);
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 8a84b25a03..2d5151462c 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1211,6 +1211,27 @@ void pc_i8259_create(ISABus *isa_bus, qemu_irq 
*i8259_irqs)
 g_free(i8259);
 }
 
+static bool pc_nvdimm_validate(const MachineState *ms, NVDIMMDevice *nvdimm,
+   Error **errp)
+{
+NvdimmSyncModes sync;
+
+if (!ms->nvdimms_state->is_enabled) {
+error_setg(errp, "nvdimm is not enabled: add 'nvdimm=on' to '-M'");
+return false;
+}
+
+sync = object_property_get_enum(OBJECT(nvdimm), NVDIMM_SYNC_DAX_PROP,
+"NvdimmSyncModes", _abort);
+if (sync == NVDIMM_SYNC_MODES_WRITEBACK) {
+error_setg(errp, "NVDIMM device " NVDIMM_SYNC_DAX_PROP
+   "=%s mode unsupported", NvdimmSyncModes_str(sync));
+return false;
+}
+
+return true;
+}
+
 static void pc_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
Error **errp)
 {
@@ -1233,9 +1254,10 @@ static void pc_memory_pre_plug(HotplugHandler 
*hotplug_dev, DeviceState *dev,
 return;
 }
 
-if (is_nvdimm && !ms->nvdimms_state->is_enabled) {
-error_setg(errp, "nvdimm is not enabled: missing 'nvdimm' in '-M'");
-return;
+if (is_nvdimm) {
+if (!pc_nvdimm_validate(ms, NVDIMM(dev), errp)) {
+return;
+}
 }
 
 hotplug_handler_pre_plug(x86ms->acpi_dev, dev, &local_err);
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 7397b67156..56b4527362 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -96,6 +96,19 @@ static void nvdimm_set_uuid(Object *obj, Visitor *v, const 
char *name,
 g_free(value);
 }
 
+static int nvdimm_get_sync_mode(Object *obj, Error **errp G_GNUC_UNUSED)
+{
+NVDIMMDevice *nvdimm = NVDIMM(obj);
+
+return nvdimm->sync_dax;
+}
+
+static void nvdimm_set_sync_

[PATCH v4 2/3] spapr: nvdimm: Implement H_SCM_FLUSH hcall

2021-04-28 Thread Shivaprasad G Bhat
The patch adds support for the SCM flush hcall for the nvdimm devices.
To be available for exploitation by guest through the next patch.

The hcall semantics require the flush to return H_BUSY, along with a
continue_token, when the operation is expected to take longer. The hcall
is then issued again with the continue_token to query the status. So,
all fresh requests are put into a 'pending' list and a flush worker is
submitted to the thread pool.
The thread pool completion callbacks move the requests to 'completed'
list, which are cleaned up after reporting to guest in subsequent
hcalls to get the status.

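For illustration, this is roughly how a guest driver would drive that
retry protocol. A minimal sketch only, assuming the plpar_hcall() /
H_IS_LONG_BUSY() helpers used by papr_scm; the exact integration lives in
the kernel patch referenced elsewhere in this thread:

static int papr_scm_flush(uint32_t drc_index)
{
    unsigned long ret_buf[PLPAR_HCALL_BUFSIZE];
    uint64_t token = 0;
    int64_t rc;

    do {
        /* A continue token is handed back while the flush is in flight */
        rc = plpar_hcall(H_SCM_FLUSH, ret_buf, drc_index, token);
        token = ret_buf[0];

        if (H_IS_LONG_BUSY(rc)) {
            msleep(get_longbusy_msecs(rc));
            rc = H_BUSY;
        }
    } while (rc == H_BUSY);

    return rc == H_SUCCESS ? 0 : -EIO;
}
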
The semantics make it necessary to preserve the continue_tokens
and their return status across migrations. So, the completed
flush states are forwarded to the destination and the pending
ones are restarted at the destination in post_load. The necessary
nvdimm flush specific vmstate structures are added to the spapr
machine vmstate.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr.c|6 +
 hw/ppc/spapr_nvdimm.c |  234 +
 include/hw/ppc/spapr.h|   10 ++
 include/hw/ppc/spapr_nvdimm.h |   13 ++
 4 files changed, 262 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index e4be00b732..80957f9188 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1607,6 +1607,8 @@ static void spapr_machine_reset(MachineState *machine)
 spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
 }
 
+spapr_nvdimm_finish_flushes(spapr);
+
 /* DRC reset may cause a device to be unplugged. This will cause troubles
  * if this device is used by another device (eg, a running vhost backend
  * will crash QEMU if the DIMM holding the vring goes away). To avoid such
@@ -2003,6 +2005,7 @@ static const VMStateDescription vmstate_spapr = {
 &vmstate_spapr_cap_ccf_assist,
 &vmstate_spapr_cap_fwnmi,
 &vmstate_spapr_fwnmi,
+&vmstate_spapr_nvdimm_states,
 NULL
 }
 };
@@ -2997,6 +3000,9 @@ static void spapr_machine_init(MachineState *machine)
 }
 
 qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
+
+QLIST_INIT(&spapr->pending_flush_states);
+QLIST_INIT(&spapr->completed_flush_states);
 }
 
 #define DEFAULT_KVM_TYPE "auto"
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 8cf3fb2ffb..77eb7e1293 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "qapi/error.h"
 #include "hw/ppc/spapr_drc.h"
 #include "hw/ppc/spapr_nvdimm.h"
@@ -30,6 +31,7 @@
 #include "hw/ppc/fdt.h"
 #include "qemu/range.h"
 #include "hw/ppc/spapr_numa.h"
+#include "block/thread-pool.h"
 
 /*
  * The nvdimm size should be aligned to SCM block size.
@@ -371,6 +373,237 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 return H_SUCCESS;
 }
 
+static uint64_t flush_token;
+
+static int flush_worker_cb(void *opaque)
+{
+int ret = H_SUCCESS;
+SpaprNVDIMMDeviceFlushState *state = opaque;
+
+/* flush raw backing image */
+if (qemu_fdatasync(state->backend_fd) < 0) {
+error_report("papr_scm: Could not sync nvdimm to backend file: %s",
+ strerror(errno));
+ret = H_HARDWARE;
+}
+
+return ret;
+}
+
+static void spapr_nvdimm_flush_completion_cb(void *opaque, int hcall_ret)
+{
+SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+SpaprNVDIMMDeviceFlushState *state = opaque;
+
+state->hcall_ret = hcall_ret;
+QLIST_REMOVE(state, node);
+QLIST_INSERT_HEAD(&spapr->completed_flush_states, state, node);
+}
+
+static const VMStateDescription vmstate_spapr_nvdimm_flush_state = {
+ .name = "spapr_nvdimm_flush_state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT64(continue_token, SpaprNVDIMMDeviceFlushState),
+ VMSTATE_INT64(hcall_ret, SpaprNVDIMMDeviceFlushState),
+ VMSTATE_UINT32(drcidx, SpaprNVDIMMDeviceFlushState),
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+static bool spapr_nvdimm_states_needed(void *opaque)
+{
+ SpaprMachineState *spapr = (SpaprMachineState *)opaque;
+
+ return (!QLIST_EMPTY(&spapr->pending_flush_states) ||
+ !QLIST_EMPTY(&spapr->completed_flush_states));
+}
+
+static int spapr_nvdimm_post_load(void *opaque, int version_id)
+{
+SpaprMachineState *spapr = (SpaprMachineState *)opaque;
+SpaprNVDIMMDeviceFlushState *state, *next;
+PCDIMMDevice *dimm;
+HostMemoryBackend *backend = NULL;
+ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
+SpaprDrc *drc;
+
+QLIST_FOREACH_SAFE(state, &spapr->completed_flush_states, node, next) {
+if (flush_token < state->continue_token) {
+   

[PATCH v4 1/3] spapr: nvdimm: Forward declare and move the definitions

2021-04-28 Thread Shivaprasad G Bhat
The subsequent patches add definitions which tend to
introduce a cyclic dependency during compilation. So, prepare
with forward declarations, move the definitions and clean up.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr_nvdimm.c |   12 
 include/hw/ppc/spapr_nvdimm.h |   14 ++
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index b46c36917c..8cf3fb2ffb 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -31,6 +31,18 @@
 #include "qemu/range.h"
 #include "hw/ppc/spapr_numa.h"
 
+/*
+ * The nvdimm size should be aligned to SCM block size.
+ * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
+ * inorder to have SCM regions not to overlap with dimm memory regions.
+ * The SCM devices can have variable block sizes. For now, fixing the
+ * block size to the minimum value.
+ */
+#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
+
+/* Have an explicit check for alignment */
+QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE);
+
 bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
uint64_t size, Error **errp)
 {
diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h
index 73be250e2a..764f999f54 100644
--- a/include/hw/ppc/spapr_nvdimm.h
+++ b/include/hw/ppc/spapr_nvdimm.h
@@ -11,19 +11,9 @@
 #define HW_SPAPR_NVDIMM_H
 
 #include "hw/mem/nvdimm.h"
-#include "hw/ppc/spapr.h"
 
-/*
- * The nvdimm size should be aligned to SCM block size.
- * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
- * inorder to have SCM regions not to overlap with dimm memory regions.
- * The SCM devices can have variable block sizes. For now, fixing the
- * block size to the minimum value.
- */
-#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
-
-/* Have an explicit check for alignment */
-QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE);
+typedef struct SpaprDrc SpaprDrc;
+typedef struct SpaprMachineState SpaprMachineState;
 
 int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
void *fdt, int *fdt_start_offset, Error **errp);





[PATCH v4 0/3] nvdimm: Enable sync-dax property for nvdimm

2021-04-28 Thread Shivaprasad G Bhat
The nvdimm devices are expected to ensure write persistence during power
failure kind of scenarios.

The libpmem has architecture specific instructions like dcbf on POWER
to flush the cache data to backend nvdimm device during normal writes
followed by explicit flushes if the backend devices are not synchronous
DAX capable.

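For readers unfamiliar with that flush model, a small illustrative
fragment using the standard PMDK/libpmem API (not part of these patches)
shows the split between CPU cache flushes and an explicit msync fallback:

#include <libpmem.h>
#include <string.h>

static int persist_write(const char *path, const void *buf, size_t nbytes)
{
    size_t mapped_len;
    int is_pmem;
    char *addr = pmem_map_file(path, nbytes, PMEM_FILE_CREATE, 0666,
                               &mapped_len, &is_pmem);
    if (!addr) {
        return -1;
    }

    memcpy(addr, buf, nbytes);
    if (is_pmem) {
        pmem_persist(addr, nbytes);   /* CPU cache flush (dcbf on POWER) + drain */
    } else {
        pmem_msync(addr, nbytes);     /* falls back to msync() for non-pmem mappings */
    }
    pmem_unmap(addr, mapped_len);
    return 0;
}
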
Qemu virtual nvdimm devices are memory mapped. The dcbf in the guest
and the subsequent flush don't translate to an actual flush to the backend
file on the host in the case of file-backed v-nvdimms. This is addressed by
virtio-pmem in the case of x86_64 by making explicit flushes translate to
fsync at QEMU.

On SPAPR, the issue is addressed by adding a new hcall to
request for an explicit flush from the guest ndctl driver when the backend
nvdimm cannot ensure write persistence with dcbf alone. So, the approach
here is to convey when the hcall flush is required in a device tree
property. The guest makes the hcall when the property is found, instead
of relying on dcbf.

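For illustration, the guest-side check for this property can be as small
as the sketch below. of_property_read_bool() and ND_REGION_ASYNC are
standard kernel interfaces; the exact placement inside papr_scm and the
flush callback name are assumptions here, not taken from the kernel patch:

/* During region probe (hypothetical sketch): */
if (of_property_read_bool(dn, "ibm,hcall-flush-required")) {
    /* dcbf alone is not enough; route flushes through H_SCM_FLUSH */
    set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
    ndr_desc.flush = papr_scm_pmem_flush;   /* hypothetical hcall-based flush */
}
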
A new device property sync-dax is added to the nvdimm device. When the 
sync-dax is 'writeback'(default for PPC), device property
"hcall-flush-required" is set, and the guest makes hcall H_SCM_FLUSH
requesting for an explicit flush. 

sync-dax is "unsafe" on all other platforms(x86, ARM) and old pseries machines
prior to 5.2 on PPC. sync-dax="writeback" on ARM and x86_64 is prevented
now as the flush semantics are unimplemented.

When the backend file is actually synchronous DAX capable and no explicit
flushes are required, the sync-dax mode 'direct' is to be used.

The below demonstration shows the map_sync behavior with sync-dax writeback &
direct.
(https://github.com/avocado-framework-tests/avocado-misc-tests/blob/master/memory/ndctl.py.data/map_sync.c)

The pmem0 is from the nvdimm with sync-dax=direct, and pmem1 is from the
nvdimm with sync-dax=writeback, mounted as
/dev/pmem0 on /mnt1 type xfs 
(rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota)
/dev/pmem1 on /mnt2 type xfs 
(rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota)

[root@atest-guest ~]# ./mapsync /mnt1/newfile > When sync-dax=unsafe/direct
[root@atest-guest ~]# ./mapsync /mnt2/newfile > when sync-dax=writeback
Failed to mmap  with Operation not supported

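The essence of that mapsync test is a single mmap() with MAP_SYNC; a
condensed sketch (assuming a glibc/kernel where MAP_SYNC and
MAP_SHARED_VALIDATE are exposed via sys/mman.h) looks like this:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    int fd = open(argv[1], O_RDWR | O_CREAT, 0644);
    ftruncate(fd, 4096);

    /* Succeeds only when the kernel can guarantee synchronous DAX semantics */
    void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
    if (p == MAP_FAILED) {
        perror("Failed to mmap");   /* EOPNOTSUPP, as in the output above */
        return 1;
    }
    return 0;
}
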
The first patch does the header file cleanup necessary for the
subsequent ones. Second patch implements the hcall, adds the necessary
vmstate properties to spapr machine structure for carrying the hcall
status during save-restore. The nature of the hcall being asynchronous,
the patch uses aio utilities to offload the flush. The third patch adds
the 'sync-dax' device property and enables the device tree property
for the guest to utilise the hcall.

The kernel changes to exploit this hcall are at
https://github.com/linuxppc/linux/commit/75b7c05ebf9026.patch

---
v3 - https://lists.gnu.org/archive/html/qemu-devel/2021-03/msg07916.html
Changes from v3:
  - Fixed the forward declaration coding guideline violations in 1st patch.
  - Removed the code waiting for the flushes to complete during migration,
instead restart the flush worker on destination qemu in post load.
  - Got rid of the randomization of the flush tokens, using simple
counter.
  - Got rid of the redundant flush state lock, relying on the BQL now.
  - Handling the memory-backend-ram usage
  - Changed the sync-dax semantics from on/off to 'unsafe', 'writeback' and 
'direct'.
Added prevention code using 'writeback' on arm and x86_64.
  - Fixed all the miscellaneous comments.

v2 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg07031.html
Changes from v2:
  - Using the thread pool based approach as suggested
  - Moved the async hcall handling code to spapr_nvdimm.c along
with some simplifications
  - Added vmstate to preserve the hcall status during save-restore
along with pre_save handler code to complete all ongoing flushes.
  - Added hw_compat magic for sync-dax 'on' on previous machines.
  - Miscellaneous minor fixes.

v1 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg06330.html
Changes from v1
  - Fixed a missed-out unlock
  - using QLIST_FOREACH instead of QLIST_FOREACH_SAFE while generating token

Shivaprasad G Bhat (3):
  spapr: nvdimm: Forward declare and move the definitions
  spapr: nvdimm: Implement H_SCM_FLUSH hcall
  nvdimm: Enable sync-dax device property for nvdimm


 hw/arm/virt.c |   28 
 hw/i386/pc.c  |   28 
 hw/mem/nvdimm.c   |   52 +++
 hw/ppc/spapr.c|   16 ++
 hw/ppc/spapr_nvdimm.c |  285 +
 include/hw/mem/nvdimm.h   |   11 ++
 include/hw/ppc/spapr.h|   11 +-
 include/hw/ppc/spapr_nvdimm.h |   27 ++--
 qapi/common.json  |   20 +++
 9 files changed, 455 insertions(+), 23 deletions(-)

--
Signature




Re: [PATCH] ppc/spapr: Add support for implement support for H_SCM_HEALTH

2021-03-30 Thread Shivaprasad G Bhat

Hi Vaibhav,

Some comments inline..

On 3/29/21 9:52 PM, Vaibhav Jain wrote:

Add support for H_SCM_HEALTH hcall described at [1] for spapr
nvdimms. This enables guest to detect the 'unarmed' status of a
specific spapr nvdimm identified by its DRC and if its unarmed, mark
the region backed by the nvdimm as read-only.

The patch adds h_scm_health() to handle the H_SCM_HEALTH hcall which
returns two 64-bit bitmaps (health bitmap, health bitmap mask) derived
from 'struct nvdimm->unarmed' member.

Linux kernel side changes to enable handling of 'unarmed' nvdimms for
ppc64 are proposed at [2].

References:
[1] "Hypercall Op-codes (hcalls)"
 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/powerpc/papr_hcalls.rst

[2] "powerpc/papr_scm: Mark nvdimm as unarmed if needed during probe"
 
https://lore.kernel.org/linux-nvdimm/20210329113103.476760-1-vaib...@linux.ibm.com/

Signed-off-by: Vaibhav Jain 
---
  hw/ppc/spapr_nvdimm.c  | 30 ++
  include/hw/ppc/spapr.h |  4 ++--
  2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index b46c36917c..e38740036d 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -31,6 +31,13 @@
  #include "qemu/range.h"
  #include "hw/ppc/spapr_numa.h"
  
+/* DIMM health bitmap bitmap indicators */

+/* SCM device is unable to persist memory contents */
+#define PAPR_PMEM_UNARMED (1ULL << (63 - 0))
+
+/* Bits status indicators for health bitmap indicating unarmed dimm */
+#define PAPR_PMEM_UNARMED_MASK (PAPR_PMEM_UNARMED)
+
  bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
 uint64_t size, Error **errp)
  {
@@ -467,6 +474,28 @@ static target_ulong h_scm_unbind_all(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
  return H_SUCCESS;
  }
  
+static target_ulong h_scm_health(PowerPCCPU *cpu, SpaprMachineState *spapr,

+ target_ulong opcode, target_ulong *args)
+{
+uint32_t drc_index = args[0];
+SpaprDrc *drc = spapr_drc_by_index(drc_index);
+NVDIMMDevice *nvdimm;
+
+if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+return H_PARAMETER;
+}
+



Please check if drc->dev is not NULL too. DRCs are created in advance

and drc->dev may not be assigned if the device is not plugged yet.



+nvdimm = NVDIMM(drc->dev);
+
+/* Check if the nvdimm is unarmed and send its status via health bitmaps */
+args[0] = nvdimm->unarmed ? PAPR_PMEM_UNARMED_MASK : 0;



Please use object_property_get_bool to fetch the unarmed value.

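For illustration, a sketch of how both comments could be folded in,
reusing only names already present in this patch and in QEMU's QOM API:

    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    nvdimm = NVDIMM(drc->dev);
    args[0] = object_property_get_bool(OBJECT(nvdimm), NVDIMM_UNARMED_PROP,
                                       &error_abort)
              ? PAPR_PMEM_UNARMED_MASK : 0;
    args[1] = args[0];   /* health bitmap mask mirrors the bitmap */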


+
+/* health bitmap mask same as the health bitmap */
+args[1] = args[0];
+
+return H_SUCCESS;
+}
+
  static void spapr_scm_register_types(void)
  {


...


Thanks,

Shivaprasad




Re: [PATCH v3 2/3] spapr: nvdimm: Implement H_SCM_FLUSH hcall

2021-03-29 Thread Shivaprasad G Bhat


On 3/24/21 8:37 AM, David Gibson wrote:

On Tue, Mar 23, 2021 at 09:47:38AM -0400, Shivaprasad G Bhat wrote:

machine vmstate.

Signed-off-by: Shivaprasad G Bhat

An overal question: surely the same issue must arise on x86 with
file-backed NVDIMMs.  How do they handle this case?


Discussed in other threads..




  };
@@ -2997,6 +3000,9 @@ static void spapr_machine_init(MachineState *machine)
  }
  
   qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);

+qemu_mutex_init(&spapr->spapr_nvdimm_flush_states_lock);

Do you actually need an extra mutex, or can you rely on the BQL?


I verified BQL is held at all places where it matters in the context of 
this patch.


Safe to get rid of this extra mutex.

...




+{
+ SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+
+ return (!QLIST_EMPTY(&spapr->pending_flush_states) ||
+ !QLIST_EMPTY(&spapr->completed_flush_states));
+}
+
+static int spapr_nvdimm_pre_save(void *opaque)
+{
+SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+
+while (!QLIST_EMPTY(&spapr->pending_flush_states)) {
+aio_poll(qemu_get_aio_context(), true);

Hmm... how long could waiting for all the pending flushes to complete
take?  This could add substanially to the guest's migration downtime,
couldn't it?



The time taken depends on the number of dirtied pages and the disk I/O
write speed. The number of dirty pages on the host is configurable with
the tunables vm.dirty_background_ratio (10% default on Fedora 32, Ubuntu
20.04), vm.dirty_ratio (20%) of host memory and/or
vm.dirty_expire_centisecs (30 seconds). So, the host itself would be
flushing the mmapped file on its own from time to time. For guests using
the nvdimms with a filesystem, the flushes would have come frequently and
the number of dirty pages might be low. Pmem applications can also use
the nvdimms without a filesystem, and for such guests the chance that a
flush request comes from a pmem application at the time of migration is
low or random. But the host would have flushed the pagecache on its own
once vm.dirty_background_ratio is crossed or vm.dirty_expire_centisecs
has expired. So, the worst case stands at the disk I/O latency for
writing the pages dirtied in the last vm.dirty_expire_centisecs on the
host, OR the latency for writing at most vm.dirty_background_ratio (10%)
of host RAM. If you want me to calibrate any particular size or scenario
and get the numbers, please let me know.


...

+
+/*
+ * Acquire a unique token and reserve it for the new flush state.
+ */
+static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state(void)
+{
+Error *err = NULL;
+uint64_t token;
+SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+SpaprNVDIMMDeviceFlushState *tmp, *next, *state;
+
+state = g_malloc0(sizeof(*state));
+
+qemu_mutex_lock(&spapr->spapr_nvdimm_flush_states_lock);
+retry:
+if (qemu_guest_getrandom(&token, sizeof(token), &err) < 0) {

Using getrandom seems like overkill, why not just use a counter?


I didn't want a spurious guest to abuse this by consuming the return
value, providing a valid "guess-able" counter, and the real driver
failing subsequently. Also, carrying the global counter to the
destination across guest migrations is another thing to ponder.

Let me know if you want me to reconsider and use a counter.

...


mm_flush_states_lock);
+
+return state;
+}
+
+/*
+ * spapr_nvdimm_finish_flushes
+ *  Waits for all pending flush requests to complete
+ *  their execution and free the states
+ */
+void spapr_nvdimm_finish_flushes(void)
+{
+SpaprNVDIMMDeviceFlushState *state, *next;
+SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());

The caller has natural access to the machine, so pass it in rather
than using the global.


okay

...


+
+/*
+ * spapr_nvdimm_get_hcall_status
+ *  Fetches the status of the hcall worker and returns H_BUSY
+ *  if the worker is still running.
+ */
+static int spapr_nvdimm_get_flush_status(uint64_t token)
+{
+int ret = H_LONG_BUSY_ORDER_10_MSEC;
+SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());

The callers have natural access to spapr, so pass it in rather than
using the global.


Okay

...


+
+/*
+ * H_SCM_FLUSH
+ * Input: drc_index, continue-token
+ * Out: continue-token
+ * Return Value: H_SUCCESS, H_Parameter, H_P2, H_BUSY
+ *
+ * Given a DRC Index Flush the data to backend NVDIMM device.
+ * The hcall returns H_BUSY when the flush takes longer time and the hcall

It returns one of the H_LONG_BUSY values, not actual H_BUSY, doesn't
it?


Yes. I thought it's okay to call it just H_BUSY in a generic way. Will
fix it.




+ * needs to be issued multiple times in order to be completely serviced.
+}
+
+return ret;
+}
+
+dimm = PC_DIMM(drc->dev);
+backend = MEMORY_BACKEND(dimm->hostmem);
+
+state = spapr_nvdimm_init_new_flush_state();
+if (!state) {
+return H_P2;

AFAICT the 

Re: [PATCH v3 2/3] spapr: nvdimm: Implement H_SCM_FLUSH hcall

2021-03-26 Thread Shivaprasad G Bhat

On 3/25/21 7:21 AM, David Gibson wrote:

On Wed, Mar 24, 2021 at 09:34:06AM +0530, Aneesh Kumar K.V wrote:

On 3/24/21 8:37 AM, David Gibson wrote:

On Tue, Mar 23, 2021 at 09:47:38AM -0400, Shivaprasad G Bhat wrote:

The patch adds support for the SCM flush hcall for the nvdimm devices.

...

collects all the hcall states from 'completed' list. The necessary
nvdimm flush specific vmstate structures are added to the spapr
machine vmstate.

Signed-off-by: Shivaprasad G Bhat 

An overal question: surely the same issue must arise on x86 with
file-backed NVDIMMs.  How do they handle this case?

On x86 we have different ways nvdimm can be discovered. ACPI NFIT, e820 map
and virtio_pmem. Among these virio_pmem always operated with synchronous dax
disabled and both ACPI and e820 doesn't have the ability to differentiate
support for synchronous dax.

Ok.  And for the virtio-pmem case, how are the extra flushes actually
done on x86?



The virtio-pmem device has a virtqueue with virtio_pmem_flush() as the
handler, which gets called for all flush requests from the guest.
virtio_pmem_flush() offloads the flush to a thread pool, with a worker
doing fsync() and the completion callback notifying the guest with the
response.
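
The same offload pattern, expressed against QEMU's thread-pool API as the
spapr patches in this thread also do; a condensed sketch with an
illustrative request struct and callback names:

typedef struct FlushReq {
    int fd;
} FlushReq;

static int flush_worker_cb(void *opaque)           /* runs in a pool thread */
{
    FlushReq *req = opaque;
    return qemu_fdatasync(req->fd) < 0 ? -errno : 0;
}

static void flush_done_cb(void *opaque, int ret)   /* back on the main loop */
{
    g_free(opaque);        /* record 'ret' and notify the requester here */
}

/* In the request handler: */
ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
FlushReq *req = g_new0(FlushReq, 1);
req->fd = memory_region_get_fd(&backend->mr);
thread_pool_submit_aio(pool, flush_worker_cb, req, flush_done_cb, req);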



With that I would expect users to use virtio_pmem when using using file
backed NVDIMMS

So... should we prevent advertising an NVDIMM through ACPI or e820 if
it doesn't have sync-dax enabled?



Is it possible to have different defaults for sync-dax based on
architecture?

The behaviour on x86 is sync-dax=on for nvdimms. So, it would be correct
to have the default as "on" for x86, and for pseries "off" for new
machines.

Looking at the code, I didn't find many ways to achieve this. Can you
suggest what can be done?




[PATCH v3 2/3] spapr: nvdimm: Implement H_SCM_FLUSH hcall

2021-03-23 Thread Shivaprasad G Bhat
The patch adds support for the SCM flush hcall for the nvdimm devices.
To be available for exploitation by guest through the next patch.

The hcall semantics require the flush to return H_BUSY, along with a
continue_token, when the operation is expected to take longer. The hcall
is then issued again with the continue_token to query the status. So,
all fresh requests are put into a 'pending' list and a flush worker is
submitted to the thread pool.
The thread pool completion callbacks move the requests to 'completed'
list, which are cleaned up after reporting to guest in subsequent
hcalls to get the status.

The semantics make it necessary to preserve the continue_tokens
and their return status even across migrations. So, the pre_save
handler for the device waits for the flush worker to complete and
collects all the hcall states from 'completed' list. The necessary
nvdimm flush specific vmstate structures are added to the spapr
machine vmstate.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr.c|6 +
 hw/ppc/spapr_nvdimm.c |  240 +
 include/hw/ppc/spapr.h|   11 ++
 include/hw/ppc/spapr_nvdimm.h |   12 ++
 4 files changed, 268 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index d56418ca29..fdb0c73a2c 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1607,6 +1607,8 @@ static void spapr_machine_reset(MachineState *machine)
 spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
 }
 
+spapr_nvdimm_finish_flushes();
+
 /* DRC reset may cause a device to be unplugged. This will cause troubles
  * if this device is used by another device (eg, a running vhost backend
  * will crash QEMU if the DIMM holding the vring goes away). To avoid such
@@ -2003,6 +2005,7 @@ static const VMStateDescription vmstate_spapr = {
 _spapr_cap_ccf_assist,
 _spapr_cap_fwnmi,
 _spapr_fwnmi,
+&vmstate_spapr_nvdimm_flush_states,
 NULL
 }
 };
@@ -2997,6 +3000,9 @@ static void spapr_machine_init(MachineState *machine)
 }
 
 qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
+qemu_mutex_init(&spapr->spapr_nvdimm_flush_states_lock);
+QLIST_INIT(&spapr->pending_flush_states);
+QLIST_INIT(&spapr->completed_flush_states);
 }
 
 #define DEFAULT_KVM_TYPE "auto"
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 8cf3fb2ffb..883317c1ed 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -22,14 +22,17 @@
  * THE SOFTWARE.
  */
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "qapi/error.h"
 #include "hw/ppc/spapr_drc.h"
 #include "hw/ppc/spapr_nvdimm.h"
 #include "hw/mem/nvdimm.h"
+#include "qemu/guest-random.h"
 #include "qemu/nvdimm-utils.h"
 #include "hw/ppc/fdt.h"
 #include "qemu/range.h"
 #include "hw/ppc/spapr_numa.h"
+#include "block/thread-pool.h"
 
 /*
  * The nvdimm size should be aligned to SCM block size.
@@ -371,6 +374,242 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 return H_SUCCESS;
 }
 
+static const VMStateDescription vmstate_spapr_nvdimm_entry = {
+ .name = "spapr_nvdimm_states",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT64(continue_token, SpaprNVDIMMDeviceFlushState),
+ VMSTATE_INT64(hcall_ret, SpaprNVDIMMDeviceFlushState),
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+static bool spapr_nvdimm_states_needed(void *opaque)
+{
+ SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+
+ return (!QLIST_EMPTY(&spapr->pending_flush_states) ||
+ !QLIST_EMPTY(&spapr->completed_flush_states));
+}
+
+static int spapr_nvdimm_pre_save(void *opaque)
+{
+SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+
+while (!QLIST_EMPTY(&spapr->pending_flush_states)) {
+aio_poll(qemu_get_aio_context(), true);
+}
+
+return 0;
+}
+
+const VMStateDescription vmstate_spapr_nvdimm_flush_states = {
+.name = "spapr_nvdimm_hcall_states",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = spapr_nvdimm_states_needed,
+.pre_save = spapr_nvdimm_pre_save,
+.fields = (VMStateField[]) {
+VMSTATE_QLIST_V(completed_flush_states, SpaprMachineState, 1,
+vmstate_spapr_nvdimm_entry,
+SpaprNVDIMMDeviceFlushState, node),
+VMSTATE_END_OF_LIST()
+},
+};
+
+/*
+ * Acquire a unique token and reserve it for the new flush state.
+ */
+static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state(void)
+{
+Error *err = NULL;
+uint64_t token;
+SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+SpaprNVDIMMDeviceFlushState *tmp, *next, *state;
+
+state = g_malloc0(sizeof(*state)

[PATCH v3 3/3] spapr: nvdimm: Enable sync-dax device property for nvdimm

2021-03-23 Thread Shivaprasad G Bhat
The patch adds the 'sync-dax' property to the nvdimm device.

When the sync-dax is 'off', the device tree property
"hcall-flush-required" is added to the nvdimm node which makes the
guest to issue H_SCM_FLUSH hcalls to request for flushes explicitly.
This would be the default behaviour without sync-dax property set
for the nvdimm device.

The sync-dax="on" would mean the guest need not make flush requests
to the qemu. On previous machine versions the sync-dax is set to be
"on" by default using the hw_compat magic.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/core/machine.c   |1 +
 hw/mem/nvdimm.c |1 +
 hw/ppc/spapr_nvdimm.c   |   17 +
 include/hw/mem/nvdimm.h |   10 ++
 include/hw/ppc/spapr.h  |1 +
 5 files changed, 30 insertions(+)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 257a664ea2..f843643574 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -41,6 +41,7 @@ GlobalProperty hw_compat_5_2[] = {
 { "PIIX4_PM", "smm-compat", "on"},
 { "virtio-blk-device", "report-discard-granularity", "off" },
 { "virtio-net-pci", "vectors", "3"},
+{ "nvdimm", "sync-dax", "on" },
 };
 const size_t hw_compat_5_2_len = G_N_ELEMENTS(hw_compat_5_2);
 
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 7397b67156..8f0e29b191 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -229,6 +229,7 @@ static void nvdimm_write_label_data(NVDIMMDevice *nvdimm, 
const void *buf,
 
 static Property nvdimm_properties[] = {
 DEFINE_PROP_BOOL(NVDIMM_UNARMED_PROP, NVDIMMDevice, unarmed, false),
+DEFINE_PROP_BOOL(NVDIMM_SYNC_DAX_PROP, NVDIMMDevice, sync_dax, false),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 883317c1ed..dd1c90251b 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -125,6 +125,9 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void 
*fdt,
 uint64_t lsize = nvdimm->label_size;
 uint64_t size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
 NULL);
+bool sync_dax = object_property_get_bool(OBJECT(nvdimm),
+ NVDIMM_SYNC_DAX_PROP,
+ &error_abort);
 
 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, slot);
 g_assert(drc);
@@ -159,6 +162,11 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void 
*fdt,
  "operating-system")));
 _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0));
 
+if (!sync_dax) {
+_FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required",
+ NULL, 0));
+}
+
 return child_offset;
 }
 
@@ -567,10 +575,12 @@ static target_ulong h_scm_flush(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
   target_ulong opcode, target_ulong *args)
 {
 int ret;
+bool sync_dax;
 uint32_t drc_index = args[0];
 uint64_t continue_token = args[1];
 SpaprDrc *drc = spapr_drc_by_index(drc_index);
 PCDIMMDevice *dimm;
+NVDIMMDevice *nvdimm;
 HostMemoryBackend *backend = NULL;
 SpaprNVDIMMDeviceFlushState *state;
 ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
@@ -580,6 +590,13 @@ static target_ulong h_scm_flush(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 return H_PARAMETER;
 }
 
+nvdimm = NVDIMM(drc->dev);
+sync_dax = object_property_get_bool(OBJECT(nvdimm), NVDIMM_SYNC_DAX_PROP,
+&error_abort);
+if (sync_dax) {
+return H_UNSUPPORTED;
+}
+
 if (continue_token != 0) {
 ret = spapr_nvdimm_get_flush_status(continue_token);
 if (H_IS_LONG_BUSY(ret)) {
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index bcf62f825c..f82979cf2f 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -51,6 +51,7 @@ OBJECT_DECLARE_TYPE(NVDIMMDevice, NVDIMMClass, NVDIMM)
 #define NVDIMM_LABEL_SIZE_PROP "label-size"
 #define NVDIMM_UUID_PROP   "uuid"
 #define NVDIMM_UNARMED_PROP"unarmed"
+#define NVDIMM_SYNC_DAX_PROP   "sync-dax"
 
 struct NVDIMMDevice {
 /* private */
@@ -85,6 +86,15 @@ struct NVDIMMDevice {
  */
 bool unarmed;
 
+/*
+ * On PPC64,
+ * The 'off' value results in the hcall-flush-required property set
+ * in the device tree for pseries machines. When 'off', the guest
+ * initiates explicit flush requests to the backend device ensuring
+ * write persistence.
+ */
+bool sync_dax;
+
 /*
  * The PPC64 - spapr requires each nvdimm device have a uuid.
  */
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 7c27fb3e2d

[PATCH v3 1/3] spapr: nvdimm: Forward declare and move the definitions

2021-03-23 Thread Shivaprasad G Bhat
The subsequent patches add definitions which tend to
introduce a cyclic dependency during compilation. So, prepare
with forward declarations, move the definitions and clean up.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr_nvdimm.c |   12 
 include/hw/ppc/spapr_nvdimm.h |   21 ++---
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index b46c36917c..8cf3fb2ffb 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -31,6 +31,18 @@
 #include "qemu/range.h"
 #include "hw/ppc/spapr_numa.h"
 
+/*
+ * The nvdimm size should be aligned to SCM block size.
+ * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
+ * inorder to have SCM regions not to overlap with dimm memory regions.
+ * The SCM devices can have variable block sizes. For now, fixing the
+ * block size to the minimum value.
+ */
+#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
+
+/* Have an explicit check for alignment */
+QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE);
+
 bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
uint64_t size, Error **errp)
 {
diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h
index 73be250e2a..abcacda5d7 100644
--- a/include/hw/ppc/spapr_nvdimm.h
+++ b/include/hw/ppc/spapr_nvdimm.h
@@ -11,23 +11,14 @@
 #define HW_SPAPR_NVDIMM_H
 
 #include "hw/mem/nvdimm.h"
-#include "hw/ppc/spapr.h"
 
-/*
- * The nvdimm size should be aligned to SCM block size.
- * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
- * inorder to have SCM regions not to overlap with dimm memory regions.
- * The SCM devices can have variable block sizes. For now, fixing the
- * block size to the minimum value.
- */
-#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
-
-/* Have an explicit check for alignment */
-QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE);
+struct SpaprDrc;
+struct SpaprMachineState;
 
-int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
-   void *fdt, int *fdt_start_offset, Error **errp);
-void spapr_dt_persistent_memory(SpaprMachineState *spapr, void *fdt);
+int spapr_pmem_dt_populate(struct SpaprDrc *drc,
+   struct SpaprMachineState *spapr, void *fdt,
+   int *fdt_start_offset, Error **errp);
+void spapr_dt_persistent_memory(struct SpaprMachineState *spapr, void *fdt);
 bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
uint64_t size, Error **errp);
 void spapr_add_nvdimm(DeviceState *dev, uint64_t slot);





[PATCH v3 0/3] spapr: nvdimm: Enable sync-dax property for nvdimm

2021-03-23 Thread Shivaprasad G Bhat
The nvdimm devices are expected to ensure write persistence during power
failure kind of scenarios.

The libpmem has architecture specific instructions like dcbf on power
to flush the cache data to backend nvdimm device during normal writes.

Qemu - virtual nvdimm devices are memory mapped. The dcbf in the guest
doesn't translate to an actual flush to the backend file on the host in case
of file backed v-nvdimms. This is addressed by virtio-pmem in case of x86_64
by making explicit flushes translating to fdatasync at qemu.

On PAPR, the issue is addressed by adding a new hcall to
request for an explicit flush from the guest ndctl driver when the backend
nvdimm cannot ensure write persistence with dcbf alone. So, the approach
here is to convey when the hcall flush is required in a device tree
property. The guest makes the hcall when the property is found, instead
of relying on dcbf.

The first patch adds the necessary asynchronous hcall support infrastructure
code at the DRC level. Second patch implements the hcall using the
infrastructure.

Hcall number and semantics finalized, so dropping the RFC prefix.

A new device property sync-dax is added to the nvdimm device. When the 
sync-dax is off(default), device property "hcall-flush-required" is set,
and the guest makes hcall H_SCM_FLUSH requesting for an explicit flush. 

By default, sync-dax is "off" on all new pseries machines, and prior to
5.2 it is "on".

The below demonstration shows the map_sync behavior with sync-dax on & off.
(https://github.com/avocado-framework-tests/avocado-misc-tests/blob/master/memory/ndctl.py.data/map_sync.c)

The pmem0 is from the nvdimm with sync-dax=on, and pmem1 is from the nvdimm
with sync-dax=off, mounted as
/dev/pmem0 on /mnt1 type xfs 
(rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota)
/dev/pmem1 on /mnt2 type xfs 
(rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota)

[root@atest-guest ~]# ./mapsync /mnt1/newfile> When sync-dax=off
[root@atest-guest ~]# ./mapsync /mnt2/newfile> when sync-dax=on
Failed to mmap  with Operation not supported

The first patch does the header file cleanup necessary for the
subsequent ones. Second patch implements the hcall, adds the necessary
vmstate properties to spapr machine structure for carrying the hcall
status during save-restore. The nature of the hcall being asynchronous,
the patch uses aio utilities to offload the flush. The third patch adds
the 'sync-dax' device property and enables the device tree property
for the guest to utilise the hcall.

---
v2 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg07031.html
Changes from v2:
  - Using the thread pool based approach as suggested by Greg
  - Moved the async hcall handling code to spapr_nvdimm.c along
with some simplifications
  - Added vmstate to preserve the hcall status during save-restore
along with pre_save handler code to complete all ongoning flushes.
  - Added hw_compat magic for sync-dax 'on' on previous machines.
  - Miscellaneous minor fixes.

v1 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg06330.html
Changes from v1
  - Fixed a missed-out unlock
  - using QLIST_FOREACH instead of QLIST_FOREACH_SAFE while generating token

Shivaprasad G Bhat (3):
  spapr: nvdimm: Forward declare and move the definitions
  spapr: nvdimm: Implement scm flush hcall
  spapr: nvdimm: Enable sync-dax device property for nvdimm


 hw/core/machine.c |1 
 hw/mem/nvdimm.c   |1 
 hw/ppc/spapr.c|6 +
 hw/ppc/spapr_nvdimm.c |  269 +
 include/hw/mem/nvdimm.h   |   10 ++
 include/hw/ppc/spapr.h|   12 ++
 include/hw/ppc/spapr_nvdimm.h |   34 +++--
 7 files changed, 317 insertions(+), 16 deletions(-)

--
Signature






Re: [RFC Qemu PATCH v2 1/2] spapr: drc: Add support for async hcalls at the drc level

2021-03-23 Thread Shivaprasad G Bhat

Hi David,

Sorry about the delay.

On 2/8/21 11:51 AM, David Gibson wrote:

On Tue, Jan 19, 2021 at 12:40:31PM +0530, Shivaprasad G Bhat wrote:

Thanks for the comments!


On 12/28/20 2:08 PM, David Gibson wrote:


On Mon, Dec 21, 2020 at 01:08:53PM +0100, Greg Kurz wrote:

...

The overall idea looks good but I think you should consider using
a thread pool to implement it. See below.

I am not convinced, however.  Specifically, attaching this to the DRC
doesn't make sense to me.  We're adding exactly one DRC related async
hcall, and I can't really see much call for another one.  We could
have other async hcalls - indeed we already have one for HPT resizing
- but attaching this to DRCs doesn't help for those.

The semantics of the hcall made me think, if this is going to be
re-usable for future if implemented at DRC level.

It would only be re-usable for operations that are actually connected
to DRCs.  It doesn't seem to me particularly likely that we'll ever
have more asynchronous hcalls that are also associated with DRCs.

Okay


Other option
is to move the async-hcall-state/list into the NVDIMMState structure
in include/hw/mem/nvdimm.h and handle it with machine->nvdimms_state
at a global level.

I'm ok with either of two options:

A) Implement this ad-hoc for this specific case, making whatever
simplifications you can based on this specific case.


I am simplifying it to nvdimm use-case alone and limiting the scope.



B) Implement a general mechanism for async hcalls that is *not* tied
to DRCs.  Then use that for the existing H_RESIZE_HPT_PREPARE call as
well as this new one.


Hope you are okay with using the pool based approach that Greg

Honestly a thread pool seems like it might be overkill for this
application.


I think it's appropriate here as that is what is being done by virtio-pmem
too for flush requests. The aio infrastructure simplifies a lot of the
thread handling. Please suggest if you think there are better ways.


I am sending the next version addressing all the comments from you and Greg.


Thanks,

Shivaprasad




Re: [RFC Qemu PATCH v2 1/2] spapr: drc: Add support for async hcalls at the drc level

2021-01-18 Thread Shivaprasad G Bhat

Thanks for the comments!


On 12/28/20 2:08 PM, David Gibson wrote:


On Mon, Dec 21, 2020 at 01:08:53PM +0100, Greg Kurz wrote:

...

The overall idea looks good but I think you should consider using
a thread pool to implement it. See below.

I am not convinced, however.  Specifically, attaching this to the DRC
doesn't make sense to me.  We're adding exactly one DRC related async
hcall, and I can't really see much call for another one.  We could
have other async hcalls - indeed we already have one for HPT resizing
- but attaching this to DRCs doesn't help for those.


The semantics of the hcall made me think, if this is going to be

re-usable for future if implemented at DRC level. Other option

is to move the async-hcall-state/list into the NVDIMMState structure

in include/hw/mem/nvdimm.h and handle it with machine->nvdimms_state

at a global level.


Hope you are okay with using the pool based approach that Greg

suggested.


Please let me know.


Thanks,

Shivaprasad





[RFC Qemu PATCH v2 2/2] spapr: nvdimm: Implement async flush hcalls

2020-11-30 Thread Shivaprasad G Bhat
When the persistent memory is backed by a file, a cpu cache flush instruction
is not sufficient to ensure the stores are correctly flushed to the media.

The patch implements the async hcalls for flush operation on demand from the
guest kernel.

The device option sync-dax is off by default and enables explicit asynchronous
flush requests from the guest. These can be disabled by setting sync-dax=on.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/mem/nvdimm.c |1 +
 hw/ppc/spapr_nvdimm.c   |   79 +++
 include/hw/mem/nvdimm.h |   10 ++
 include/hw/ppc/spapr.h  |3 +-
 4 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 03c2201b56..37a4db0135 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -220,6 +220,7 @@ static void nvdimm_write_label_data(NVDIMMDevice *nvdimm, 
const void *buf,
 
 static Property nvdimm_properties[] = {
 DEFINE_PROP_BOOL(NVDIMM_UNARMED_PROP, NVDIMMDevice, unarmed, false),
+DEFINE_PROP_BOOL(NVDIMM_SYNC_DAX_PROP, NVDIMMDevice, sync_dax, false),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index a833a63b5e..557e36aa98 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "qapi/error.h"
 #include "hw/ppc/spapr_drc.h"
 #include "hw/ppc/spapr_nvdimm.h"
@@ -155,6 +156,11 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void 
*fdt,
  "operating-system")));
 _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0));
 
+if (!nvdimm->sync_dax) {
+_FDT(fdt_setprop(fdt, child_offset, "ibm,async-flush-required",
+ NULL, 0));
+}
+
 return child_offset;
 }
 
@@ -370,6 +376,78 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 return H_SUCCESS;
 }
 
+typedef struct SCMAsyncFlushData {
+int fd;
+uint64_t token;
+} SCMAsyncFlushData;
+
+static int flush_worker_cb(void *opaque)
+{
+int ret = H_SUCCESS;
+SCMAsyncFlushData *req_data = opaque;
+
+/* flush raw backing image */
+if (qemu_fdatasync(req_data->fd) < 0) {
+error_report("papr_scm: Could not sync nvdimm to backend file: %s",
+ strerror(errno));
+ret = H_HARDWARE;
+}
+
+g_free(req_data);
+
+return ret;
+}
+
+static target_ulong h_scm_async_flush(PowerPCCPU *cpu, SpaprMachineState 
*spapr,
+  target_ulong opcode, target_ulong *args)
+{
+int ret;
+uint32_t drc_index = args[0];
+uint64_t continue_token = args[1];
+SpaprDrc *drc = spapr_drc_by_index(drc_index);
+PCDIMMDevice *dimm;
+HostMemoryBackend *backend = NULL;
+SCMAsyncFlushData *req_data = NULL;
+
+if (!drc || !drc->dev ||
+spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+return H_PARAMETER;
+}
+
+if (continue_token != 0) {
+ret = spapr_drc_get_async_hcall_status(drc, continue_token);
+if (ret == H_BUSY) {
+args[0] = continue_token;
+return H_LONG_BUSY_ORDER_1_SEC;
+}
+
+return ret;
+}
+
+dimm = PC_DIMM(drc->dev);
+backend = MEMORY_BACKEND(dimm->hostmem);
+
+req_data = g_malloc0(sizeof(SCMAsyncFlushData));
+req_data->fd = memory_region_get_fd(&backend->mr);
+
+continue_token = spapr_drc_get_new_async_hcall_token(drc);
+if (!continue_token) {
+g_free(req_data);
+return H_P2;
+}
+req_data->token = continue_token;
+
+spapr_drc_run_async_hcall(drc, continue_token, &flush_worker_cb, req_data);
+
+ret = spapr_drc_get_async_hcall_status(drc, continue_token);
+if (ret == H_BUSY) {
+args[0] = req_data->token;
+return ret;
+}
+
+return ret;
+}
+
 static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
  target_ulong opcode, target_ulong *args)
 {
@@ -486,6 +564,7 @@ static void spapr_scm_register_types(void)
 spapr_register_hypercall(H_SCM_BIND_MEM, h_scm_bind_mem);
 spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem);
 spapr_register_hypercall(H_SCM_UNBIND_ALL, h_scm_unbind_all);
+spapr_register_hypercall(H_SCM_ASYNC_FLUSH, h_scm_async_flush);
 }
 
 type_init(spapr_scm_register_types)
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index c699842dd0..9e8795766e 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -51,6 +51,7 @@ OBJECT_DECLARE_TYPE(NVDIMMDevice, NVDIMMClass, NVDIMM)
 #define NVDIMM_LABEL_SIZE_PROP "label-size"
 #define NVDIMM_UUID_PROP   "uuid"
 #define NVDIMM_UNARMED_PROP"unarmed"
+#define NVDIMM_SYNC_DAX_PROP

[RFC Qemu PATCH v2 1/2] spapr: drc: Add support for async hcalls at the drc level

2020-11-30 Thread Shivaprasad G Bhat
The patch adds support for async hcalls at the DRC level for the
spapr devices. To be used by spapr-scm devices in the patch/es to follow.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr_drc.c |  149 
 include/hw/ppc/spapr_drc.h |   25 +++
 2 files changed, 174 insertions(+)

diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
index 77718cde1f..4ecd04f686 100644
--- a/hw/ppc/spapr_drc.c
+++ b/hw/ppc/spapr_drc.c
@@ -15,6 +15,7 @@
 #include "qapi/qmp/qnull.h"
 #include "cpu.h"
 #include "qemu/cutils.h"
+#include "qemu/guest-random.h"
 #include "hw/ppc/spapr_drc.h"
 #include "qom/object.h"
 #include "migration/vmstate.h"
@@ -421,6 +422,148 @@ void spapr_drc_detach(SpaprDrc *drc)
 spapr_drc_release(drc);
 }
 
+
+/*
+ * @drc : device DRC targetting which the async hcalls to be made.
+ *
+ * All subsequent requests to run/query the status should use the
+ * unique token returned here.
+ */
+uint64_t spapr_drc_get_new_async_hcall_token(SpaprDrc *drc)
+{
+Error *err = NULL;
+uint64_t token;
+SpaprDrcDeviceAsyncHCallState *tmp, *next, *state;
+
+state = g_malloc0(sizeof(*state));
+state->pending = true;
+
+qemu_mutex_lock(&drc->async_hcall_states_lock);
+retry:
+if (qemu_guest_getrandom(&token, sizeof(token), &err) < 0) {
+error_report_err(err);
+g_free(state);
+qemu_mutex_unlock(&drc->async_hcall_states_lock);
+return 0;
+}
+
+if (!token) /* Token should be non-zero */
+goto retry;
+
+if (!QLIST_EMPTY(&drc->async_hcall_states)) {
+QLIST_FOREACH_SAFE(tmp, &drc->async_hcall_states, node, next) {
+if (tmp->continue_token == token) {
+/* If the token already in use, get a new one */
+goto retry;
+}
+}
+}
+
+state->continue_token = token;
+QLIST_INSERT_HEAD(&drc->async_hcall_states, state, node);
+
+qemu_mutex_unlock(&drc->async_hcall_states_lock);
+
+return state->continue_token;
+}
+
+static void *spapr_drc_async_hcall_runner(void *opaque)
+{
+int response = -1;
+SpaprDrcDeviceAsyncHCallState *state = opaque;
+
+/*
+ * state is freed only after this thread finishes(after pthread_join()),
+ * don't worry about it becoming NULL.
+ */
+
+response = state->func(state->data);
+
+state->hcall_ret = response;
+state->pending = 0;
+
+return NULL;
+}
+
+/*
+ * @drc  : device DRC targetting which the async hcalls to be made.
+ * token : The continue token to be used for tracking as recived from
+ * spapr_drc_get_new_async_hcall_token
+ * @func() : the worker function which needs to be executed asynchronously
+ * @data : data to be passed to the asynchronous function. Worker is supposed
+ * to free/cleanup the data that is passed here
+ */
+void spapr_drc_run_async_hcall(SpaprDrc *drc, uint64_t token,
+   SpaprDrcAsyncHcallWorkerFunc *func, void *data)
+{
+SpaprDrcDeviceAsyncHCallState *state;
+
+qemu_mutex_lock(&drc->async_hcall_states_lock);
+QLIST_FOREACH(state, &drc->async_hcall_states, node) {
+if (state->continue_token == token) {
+state->func = func;
+state->data = data;
+qemu_thread_create(&state->thread, "sPAPR Async HCALL",
+   spapr_drc_async_hcall_runner, state,
+   QEMU_THREAD_JOINABLE);
+break;
+}
+}
+qemu_mutex_unlock(&drc->async_hcall_states_lock);
+}
+
+/*
+ * spapr_drc_finish_async_hcalls
+ *  Waits for all pending async requests to complete
+ *  thier execution and free the states
+ */
+static void spapr_drc_finish_async_hcalls(SpaprDrc *drc)
+{
+SpaprDrcDeviceAsyncHCallState *state, *next;
+
+if (QLIST_EMPTY(&drc->async_hcall_states)) {
+return;
+}
+
+qemu_mutex_lock(&drc->async_hcall_states_lock);
+QLIST_FOREACH_SAFE(state, &drc->async_hcall_states, node, next) {
+qemu_thread_join(&state->thread);
+QLIST_REMOVE(state, node);
+g_free(state);
+}
+qemu_mutex_unlock(&drc->async_hcall_states_lock);
+}
+
+/*
+ * spapr_drc_get_async_hcall_status
+ *  Fetches the status of the hcall worker and returns H_BUSY
+ *  if the worker is still running.
+ */
+int spapr_drc_get_async_hcall_status(SpaprDrc *drc, uint64_t token)
+{
+int ret = H_PARAMETER;
+SpaprDrcDeviceAsyncHCallState *state, *node;
+
+qemu_mutex_lock(&drc->async_hcall_states_lock);
+QLIST_FOREACH_SAFE(state, &drc->async_hcall_states, node, node) {
+if (state->continue_token == token) {
+if (state->pending) {
+ret = H_BUSY;
+break;
+} else {
+ret = state->hcall_ret;
+qemu_thread_join(&state->thread);
+QLIST_REMOVE(sta

[RFC Qemu PATCH v2 0/2] spapr: nvdimm: Asynchronous flush hcall support

2020-11-30 Thread Shivaprasad G Bhat
The nvdimm devices are expected to ensure write persistence during power
failure kind of scenarios.

The libpmem has architecture specific instructions like dcbf on power
to flush the cache data to backend nvdimm device during normal writes.

Qemu - virtual nvdimm devices are memory mapped. The dcbf in the guest
doesn't translate to an actual flush to the backend file on the host in case
of file backed vnvdimms. This is addressed by virtio-pmem in case of x86_64
by making asynchronous flushes.

On PAPR, the issue is addressed by adding a new hcall to
request for an explicit asynchronous flush requests from the guest ndctl
driver when the backend nvdimm cannot ensure write persistence with dcbf
alone. So, the approach here is to convey when the asynchronous flush is
required in a device tree property. The guest makes the hcall when the
property is found, instead of relying on dcbf.

The first patch adds the necessary asynchronous hcall support infrastructure
code at the DRC level. Second patch implements the hcall using the
infrastructure.

Hcall semantics are in review and not final.

A new device property sync-dax is added to the nvdimm device. When the 
sync-dax is off(default), the asynchronous hcalls will be called.

With respect to saving on new QEMU and restoring on old QEMU, having
sync-dax off by default (when not specified) causes IO errors in guests,
as the async hcall would not be supported on old QEMU. Since the new hcall
implementation is supported only on the new pseries machine version, the
existing machine version checks may be sufficient to prevent such
migration. Please suggest what should be done.

The below demonstration shows the map_sync behavior with sync-dax on & off.
(https://github.com/avocado-framework-tests/avocado-misc-tests/blob/master/memory/ndctl.py.data/map_sync.c)

The pmem0 device is from the nvdimm with sync-dax=on, and pmem1 is from the
nvdimm with sync-dax=off, mounted as
/dev/pmem0 on /mnt1 type xfs 
(rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota)
/dev/pmem1 on /mnt2 type xfs 
(rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota)

[root@atest-guest ~]# ./mapsync /mnt1/newfile   -> when sync-dax=on
[root@atest-guest ~]# ./mapsync /mnt2/newfile   -> when sync-dax=off
Failed to mmap  with Operation not supported
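
For reference, a minimal sketch of what such a map_sync test does (not the
actual avocado test linked above; the path handling, sizes and error handling
are illustrative):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#ifndef MAP_SYNC
#include <linux/mman.h>   /* MAP_SYNC/MAP_SHARED_VALIDATE on older glibc */
#endif

int main(int argc, char **argv)
{
    int fd;
    void *p;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <file on dax mount>\n", argv[0]);
        return 1;
    }

    fd = open(argv[1], O_RDWR | O_CREAT, 0644);
    if (fd < 0 || ftruncate(fd, 4096) < 0) {
        perror("open/ftruncate");
        return 1;
    }

    /* MAP_SYNC is refused when the device cannot guarantee synchronous DAX */
    p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
             MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
    if (p == MAP_FAILED) {
        fprintf(stderr, "Failed to mmap %s with %s\n", argv[1],
                strerror(errno));
        close(fd);
        return 1;
    }

    printf("MAP_SYNC mapping of %s succeeded\n", argv[1]);
    munmap(p, 4096);
    close(fd);
    return 0;
}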

---
v1 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg06330.html
Changes from v1
  - Fixed a missed-out unlock
  - using QLIST_FOREACH instead of QLIST_FOREACH_SAFE while generating token

Shivaprasad G Bhat (2):
  spapr: drc: Add support for async hcalls at the drc level
  spapr: nvdimm: Implement async flush hcalls


 hw/mem/nvdimm.c|1
 hw/ppc/spapr_drc.c |  146 
 hw/ppc/spapr_nvdimm.c  |   79 
 include/hw/mem/nvdimm.h|   10 +++
 include/hw/ppc/spapr.h |3 +
 include/hw/ppc/spapr_drc.h |   25 
 6 files changed, 263 insertions(+), 1 deletion(-)

--
Signature




[RFC PATCH 2/2] spapr: nvdimm: Implement async flush hcalls

2020-11-25 Thread Shivaprasad G Bhat
When the persistent memory is backed by a file, a CPU cache flush instruction
is not sufficient to ensure that the stores are correctly flushed to the media.

The patch implements the async hcalls for flush operations on demand from the
guest kernel.

The device option sync-dax is off by default, which enables explicit
asynchronous flush requests from the guest. This can be disabled by setting
sync-dax=on.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/mem/nvdimm.c |1 +
 hw/ppc/spapr_nvdimm.c   |   79 +++
 include/hw/mem/nvdimm.h |   10 ++
 include/hw/ppc/spapr.h  |3 +-
 4 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 03c2201b56..37a4db0135 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -220,6 +220,7 @@ static void nvdimm_write_label_data(NVDIMMDevice *nvdimm, 
const void *buf,
 
 static Property nvdimm_properties[] = {
 DEFINE_PROP_BOOL(NVDIMM_UNARMED_PROP, NVDIMMDevice, unarmed, false),
+DEFINE_PROP_BOOL(NVDIMM_SYNC_DAX_PROP, NVDIMMDevice, sync_dax, false),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index a833a63b5e..557e36aa98 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "qapi/error.h"
 #include "hw/ppc/spapr_drc.h"
 #include "hw/ppc/spapr_nvdimm.h"
@@ -155,6 +156,11 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void 
*fdt,
  "operating-system")));
 _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0));
 
+if (!nvdimm->sync_dax) {
+_FDT(fdt_setprop(fdt, child_offset, "ibm,async-flush-required",
+ NULL, 0));
+}
+
 return child_offset;
 }
 
@@ -370,6 +376,78 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 return H_SUCCESS;
 }
 
+typedef struct SCMAsyncFlushData {
+int fd;
+uint64_t token;
+} SCMAsyncFlushData;
+
+static int flush_worker_cb(void *opaque)
+{
+int ret = H_SUCCESS;
+SCMAsyncFlushData *req_data = opaque;
+
+/* flush raw backing image */
+if (qemu_fdatasync(req_data->fd) < 0) {
+error_report("papr_scm: Could not sync nvdimm to backend file: %s",
+ strerror(errno));
+ret = H_HARDWARE;
+}
+
+g_free(req_data);
+
+return ret;
+}
+
+static target_ulong h_scm_async_flush(PowerPCCPU *cpu, SpaprMachineState 
*spapr,
+  target_ulong opcode, target_ulong *args)
+{
+int ret;
+uint32_t drc_index = args[0];
+uint64_t continue_token = args[1];
+SpaprDrc *drc = spapr_drc_by_index(drc_index);
+PCDIMMDevice *dimm;
+HostMemoryBackend *backend = NULL;
+SCMAsyncFlushData *req_data = NULL;
+
+if (!drc || !drc->dev ||
+spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+return H_PARAMETER;
+}
+
+if (continue_token != 0) {
+ret = spapr_drc_get_async_hcall_status(drc, continue_token);
+if (ret == H_BUSY) {
+args[0] = continue_token;
+return H_LONG_BUSY_ORDER_1_SEC;
+}
+
+return ret;
+}
+
+dimm = PC_DIMM(drc->dev);
+backend = MEMORY_BACKEND(dimm->hostmem);
+
+req_data = g_malloc0(sizeof(SCMAsyncFlushData));
+req_data->fd = memory_region_get_fd(&backend->mr);
+
+continue_token = spapr_drc_get_new_async_hcall_token(drc);
+if (!continue_token) {
+g_free(req_data);
+return H_P2;
+}
+req_data->token = continue_token;
+
+spapr_drc_run_async_hcall(drc, continue_token, &flush_worker_cb, req_data);
+
+ret = spapr_drc_get_async_hcall_status(drc, continue_token);
+if (ret == H_BUSY) {
+args[0] = req_data->token;
+return ret;
+}
+
+return ret;
+}
+
 static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
  target_ulong opcode, target_ulong *args)
 {
@@ -486,6 +564,7 @@ static void spapr_scm_register_types(void)
 spapr_register_hypercall(H_SCM_BIND_MEM, h_scm_bind_mem);
 spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem);
 spapr_register_hypercall(H_SCM_UNBIND_ALL, h_scm_unbind_all);
+spapr_register_hypercall(H_SCM_ASYNC_FLUSH, h_scm_async_flush);
 }
 
 type_init(spapr_scm_register_types)
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index c699842dd0..9e8795766e 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -51,6 +51,7 @@ OBJECT_DECLARE_TYPE(NVDIMMDevice, NVDIMMClass, NVDIMM)
 #define NVDIMM_LABEL_SIZE_PROP "label-size"
 #define NVDIMM_UUID_PROP   "uuid"
 #define NVDIMM_UNARMED_PROP"unarmed"
+#define NVDIMM_SYNC_DAX_PROP   "sync-dax"

[RFC PATCH 1/2] spapr: drc: Add support for async hcalls at the drc level

2020-11-25 Thread Shivaprasad G Bhat
The patch adds support for async hcalls at the DRC level for the
spapr devices. To be used by spapr-scm devices in the patch/es to follow.
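
To illustrate the intended call flow, here is a condensed, hypothetical
consumer of the new API (the real consumer is the H_SCM_ASYNC_FLUSH hcall in
patch 2/2 of this series; the helper names are the ones introduced below,
while worker_cb/worker_data stand in for the device's own job):

static target_ulong example_device_hcall(SpaprDrc *drc, target_ulong *args)
{
    uint64_t token;
    int ret;

    /* Allocate a continue token that identifies this request */
    token = spapr_drc_get_new_async_hcall_token(drc);
    if (!token) {
        return H_P2;
    }

    /* Hand the work off to a worker thread tracked under that token;
     * worker_cb()/worker_data are placeholders for the device's job */
    spapr_drc_run_async_hcall(drc, token, worker_cb, worker_data);

    /* Report progress; the guest retries with the token while busy */
    ret = spapr_drc_get_async_hcall_status(drc, token);
    if (ret == H_BUSY) {
        args[0] = token;
        return H_LONG_BUSY_ORDER_1_SEC;
    }

    return ret;
}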

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr_drc.c |  146 
 include/hw/ppc/spapr_drc.h |   25 
 2 files changed, 171 insertions(+)

diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
index 77718cde1f..2cecccf701 100644
--- a/hw/ppc/spapr_drc.c
+++ b/hw/ppc/spapr_drc.c
@@ -15,6 +15,7 @@
 #include "qapi/qmp/qnull.h"
 #include "cpu.h"
 #include "qemu/cutils.h"
+#include "qemu/guest-random.h"
 #include "hw/ppc/spapr_drc.h"
 #include "qom/object.h"
 #include "migration/vmstate.h"
@@ -421,6 +422,145 @@ void spapr_drc_detach(SpaprDrc *drc)
 spapr_drc_release(drc);
 }
 
+
+/*
+ * @drc : device DRC targeting which the async hcalls are to be made.
+ *
+ * All subsequent requests to run/query the status should use the
+ * unique token returned here.
+ */
+uint64_t spapr_drc_get_new_async_hcall_token(SpaprDrc *drc)
+{
+Error *err = NULL;
+uint64_t token;
+SpaprDrcDeviceAsyncHCallState *tmp, *next, *state;
+
+state = g_malloc0(sizeof(*state));
+state->pending = true;
+
+qemu_mutex_lock(&drc->async_hcall_states_lock);
+retry:
+if (qemu_guest_getrandom(&token, sizeof(token), &err) < 0) {
+error_report_err(err);
+g_free(state);
+return 0;
+}
+
+if (!token) /* Token should be non-zero */
+goto retry;
+
+if (!QLIST_EMPTY(&drc->async_hcall_states)) {
+QLIST_FOREACH_SAFE(tmp, &drc->async_hcall_states, node, next) {
+if (tmp->continue_token == token) {
+/* If the token already in use, get a new one */
+goto retry;
+}
+}
+}
+
+state->continue_token = token;
+QLIST_INSERT_HEAD(&drc->async_hcall_states, state, node);
+
+qemu_mutex_unlock(&drc->async_hcall_states_lock);
+
+return state->continue_token;
+}
+
+static void *spapr_drc_async_hcall_runner(void *opaque)
+{
+int response = -1;
+SpaprDrcDeviceAsyncHCallState *state = opaque;
+
+/*
+ * state is freed only after this thread finishes(after pthread_join()),
+ * don't worry about it becoming NULL.
+ */
+
+response = state->func(state->data);
+
+state->hcall_ret = response;
+state->pending = 0;
+
+return NULL;
+}
+
+/*
+ * @drc  : device DRC targeting which the async hcalls are to be made.
+ * token : The continue token to be used for tracking, as received from
+ * spapr_drc_get_new_async_hcall_token
+ * @func() : the worker function which needs to be executed asynchronously
+ * @data : data to be passed to the asynchronous function. Worker is supposed
+ * to free/cleanup the data that is passed here
+ */
+void spapr_drc_run_async_hcall(SpaprDrc *drc, uint64_t token,
+   SpaprDrcAsyncHcallWorkerFunc *func, void *data)
+{
+SpaprDrcDeviceAsyncHCallState *state, *next;
+
+qemu_mutex_lock(&drc->async_hcall_states_lock);
+QLIST_FOREACH_SAFE(state, &drc->async_hcall_states, node, next) {
+if (state->continue_token == token) {
+state->func = func;
+state->data = data;
+qemu_thread_create(&state->thread, "sPAPR Async HCALL",
+   spapr_drc_async_hcall_runner, state,
+   QEMU_THREAD_JOINABLE);
+break;
+}
+}
+qemu_mutex_unlock(&drc->async_hcall_states_lock);
+}
+
+/*
+ * spapr_drc_finish_async_hcalls
+ *  Waits for all pending async requests to complete
+ *  their execution and free the states
+ */
+static void spapr_drc_finish_async_hcalls(SpaprDrc *drc)
+{
+SpaprDrcDeviceAsyncHCallState *state, *next;
+
+if (QLIST_EMPTY(&drc->async_hcall_states)) {
+return;
+}
+
+QLIST_FOREACH_SAFE(state, &drc->async_hcall_states, node, next) {
+qemu_thread_join(&state->thread);
+QLIST_REMOVE(state, node);
+g_free(state);
+}
+}
+
+/*
+ * spapr_drc_get_async_hcall_status
+ *  Fetches the status of the hcall worker and returns H_BUSY
+ *  if the worker is still running.
+ */
+int spapr_drc_get_async_hcall_status(SpaprDrc *drc, uint64_t token)
+{
+int ret = H_PARAMETER;
+SpaprDrcDeviceAsyncHCallState *state, *node;
+
+qemu_mutex_lock(&drc->async_hcall_states_lock);
+QLIST_FOREACH_SAFE(state, &drc->async_hcall_states, node, node) {
+if (state->continue_token == token) {
+if (state->pending) {
+ret = H_BUSY;
+break;
+} else {
+ret = state->hcall_ret;
+qemu_thread_join(&state->thread);
+QLIST_REMOVE(state, node);
+g_free(state);
+break;
+}
+}
+}
+qemu_mutex_unlock(&drc->async_hcall_states_lock);
+

[RFC PATCH 0/2] spapr: scm: Asynchronous flush hcall support

2020-11-25 Thread Shivaprasad G Bhat
The nvdimm devices are expected to ensure write persistence during
power-failure scenarios.

The libpmem library uses architecture-specific instructions, like dcbf on
POWER, to flush cached data to the backend nvdimm device during normal writes.

QEMU's virtual nvdimm devices are memory mapped. A dcbf in the guest
doesn't translate to an actual flush of the backend file on the host in
the case of file-backed vnvdimms. On x86_64 this is addressed by
virtio-pmem, which performs asynchronous flushes.

On PAPR, the issue is addressed by adding a new hcall through which the
guest ndctl driver makes explicit asynchronous flush requests when the
backend nvdimm cannot ensure write persistence with dcbf alone. The
approach here is to convey, via a device tree property, when the
asynchronous flush is required; the guest makes the hcall when the
property is found, instead of relying on dcbf.

The first patch adds the necessary asynchronous hcall support infrastructure
code at the DRC level. The second patch implements the hcall using the
infrastructure.

Hcall semantics are in review and not final.

A new device property, sync-dax, is added to the nvdimm device. When
sync-dax is off (the default), the asynchronous hcalls are used.

With respect to saving on new QEMU and restoring on old QEMU, having
sync-dax off by default (when not specified) causes IO errors in guests,
as the async hcall would not be supported on old QEMU. Since the new hcall
implementation is supported only on the new pseries machine version, the
existing machine version checks may be sufficient to prevent such
migration. Please suggest what can be done.

The below demonstration shows the map_sync behavior with sync-dax on & off.
(https://github.com/avocado-framework-tests/avocado-misc-tests/blob/master/memory/ndctl.py.data/map_sync.c)

The pmem0 device is from the nvdimm with sync-dax=on, and pmem1 is from the
nvdimm with sync-dax=off, mounted as
/dev/pmem0 on /mnt1 type xfs 
(rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota)
/dev/pmem1 on /mnt2 type xfs 
(rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota)

[root@atest-guest ~]# ./mapsync /mnt1/newfile   -> when sync-dax=on
[root@atest-guest ~]# ./mapsync /mnt2/newfile   -> when sync-dax=off
Failed to mmap  with Operation not supported

---

Shivaprasad G Bhat (2):
  spapr: drc: Add support for async hcalls at the drc level
  spapr: nvdimm: Implement async flush hcalls


 hw/mem/nvdimm.c|1
 hw/ppc/spapr_drc.c |  146 
 hw/ppc/spapr_nvdimm.c  |   79 
 include/hw/mem/nvdimm.h|   10 +++
 include/hw/ppc/spapr.h |3 +
 include/hw/ppc/spapr_drc.h |   25 
 6 files changed, 263 insertions(+), 1 deletion(-)

--
Signature




Re: [PATCH v2 01/10] nvdimm: Plug memory leak in uuid property setter

2020-05-06 Thread Shivaprasad G Bhat

On 05/05/2020 03:48 PM, Markus Armbruster wrote:

nvdimm_set_uuid() leaks memory on qemu_uuid_parse() failure.  Fix
that.

Fixes: 6c5627bb24dcd68c997857a8b671617333b1289f
Cc: Xiao Guangrong 
Cc: Shivaprasad G Bhat 
Signed-off-by: Markus Armbruster 


Thanks for finding and fixing this Markus.

Tested-by: Shivaprasad G Bhat 

Reviewed-by: Shivaprasad G Bhat 

Regards,
Shivaprasad


---
  hw/mem/nvdimm.c | 1 -
  1 file changed, 1 deletion(-)

diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 8e426d24bb..d5752f7bf6 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -97,7 +97,6 @@ static void nvdimm_set_uuid(Object *obj, Visitor *v, const 
char *name,
   if (qemu_uuid_parse(value, &nvdimm->uuid) != 0) {
  error_setg(errp, "Property '%s.%s' has invalid value",
 object_get_typename(obj), name);
-goto out;
  }
  g_free(value);
  





Re: [PATCH] spapr: Fix Coverity warning while validating nvdimm options

2020-02-27 Thread Shivaprasad G Bhat

On 02/27/2020 05:58 PM, Greg Kurz wrote:

On Wed, 26 Feb 2020 13:49:27 +0100
Greg Kurz  wrote:


-qemu_uuid_parse(uuidstr, &uuid);

... cause a segv in there because uuidstr will be dereferenced at
some point without checking if it's NULL.

AFAICT there are two scenarios that can cause object_property_get_str()
to return NULL:
- the property doesn't exist
- the property isn't a string

This can probably never happen with the current code base but we
can't be sure about future changes. In order to ensure we abort rather
than segv, I'd pass &error_abort to object_property_get_str().

Thanks! I just posted the V2 fixing this as well.

Regards,
Shivaprasad




[PATCH v2] spapr: Fix Coverity warning while validating nvdimm options

2020-02-27 Thread Shivaprasad G Bhat
Fixes Coverity issue,
  CID 1419883:  Error handling issues  (CHECKED_RETURN)
   Calling "qemu_uuid_parse" without checking return value

nvdimm_set_uuid() already verifies if the user provided uuid is valid or
not. So, there is no need to check the validity during pre-plug validation again.

As this is a false positive in this case, assert if not valid, to be safe.
Also, use error_abort if the QOM accessor encounters an error while fetching
the uuid property.

Reported-by: Coverity (CID 1419883)
Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr_nvdimm.c |7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 74eeb8bb74..25be8082d7 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -35,6 +35,7 @@ void spapr_nvdimm_validate_opts(NVDIMMDevice *nvdimm, 
uint64_t size,
 {
 char *uuidstr = NULL;
 QemuUUID uuid;
+int ret;
 
 if (size % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
 error_setg(errp, "NVDIMM memory size excluding the label area"
@@ -43,8 +44,10 @@ void spapr_nvdimm_validate_opts(NVDIMMDevice *nvdimm, 
uint64_t size,
 return;
 }
 
-uuidstr = object_property_get_str(OBJECT(nvdimm), NVDIMM_UUID_PROP, NULL);
-qemu_uuid_parse(uuidstr, &uuid);
+uuidstr = object_property_get_str(OBJECT(nvdimm), NVDIMM_UUID_PROP,
+  &error_abort);
+ret = qemu_uuid_parse(uuidstr, &uuid);
+g_assert(!ret);
 g_free(uuidstr);
 
 if (qemu_uuid_is_null(&uuid)) {




Re: [PULL 07/20] spapr: Add NVDIMM device support

2020-02-26 Thread Shivaprasad G Bhat




On 02/25/2020 03:30 PM, Peter Maydell wrote:

On Fri, 21 Feb 2020 at 03:37, David Gibson  wrote:

From: Shivaprasad G Bhat 

Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
device interface in QEMU to support virtual NVDIMM devices for Power.
+}
+
+uuidstr = object_property_get_str(OBJECT(nvdimm), NVDIMM_UUID_PROP, NULL);
+qemu_uuid_parse(uuidstr, &uuid);
+g_free(uuidstr);
+
+if (qemu_uuid_is_null(&uuid)) {
+error_setg(errp, "NVDIMM device requires the uuid to be set");
+return;
+}
+}

Hi -- Coverity thinks (CID 1419883) that it's suspicious that
this code doesn't check the return value of qemu_uuid_parse(),
because we check it everywhere else that we call that function
(it can return a failure code if the UUID doesn't validly parse).

Hi Peter,

The nvdimm_set_uuid() already verifies if the uuid is valid or not. So,
it's safe if we don't check here again.

I just posted a patch adding an assert here.

Thanks and Regards,
Shivaprasad




[PATCH] spapr: Fix Coverity warning while validating nvdimm options

2020-02-26 Thread Shivaprasad G Bhat
Fixes Coverity issue,
  CID 1419883:  Error handling issues  (CHECKED_RETURN)
   Calling "qemu_uuid_parse" without checking return value

nvdimm_set_uuid() already verifies if the user provided uuid is valid or
not. So, there is no need to check the validity during pre-plug validation again.

As this is a false positive in this case, assert if not valid, to be safe.

Reported-by: Coverity (CID 1419883)
Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr_nvdimm.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 74eeb8bb74..051727536e 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -44,7 +44,7 @@ void spapr_nvdimm_validate_opts(NVDIMMDevice *nvdimm, 
uint64_t size,
 }
 
 uuidstr = object_property_get_str(OBJECT(nvdimm), NVDIMM_UUID_PROP, NULL);
-qemu_uuid_parse(uuidstr, &uuid);
+g_assert(qemu_uuid_parse(uuidstr, &uuid) == 0);
 g_free(uuidstr);
 
 if (qemu_uuid_is_null(&uuid)) {




[PATCH v6 4/4] spapr: Add Hcalls to support PAPR NVDIMM device

2020-02-09 Thread Shivaprasad G Bhat
This patch implements a few of the necessary hcalls for the nvdimm support.

PAPR semantics are such that each NVDIMM device comprises multiple
SCM (Storage Class Memory) blocks. The guest requests the hypervisor to
bind each of the SCM blocks of the NVDIMM device using hcalls. There can
be SCM block unbind requests in case of driver errors or unplug (not
supported now) use cases. The NVDIMM label reads/writes are done through
hcalls.

Since each virtual NVDIMM device is divided into multiple SCM blocks,
the bind, unbind, and queries using hcalls on those blocks can come
independently. This doesn't fit well into the QEMU device semantics,
where the map/unmap are done at the (whole) device/object level granularity.
The patch doesn't actually bind/unbind on hcalls but lets it happen at the
device_add/del phase itself instead.

The guest kernel makes bind/unbind requests for the virtual NVDIMM device
at the region level granularity. Without interleaving, each virtual NVDIMM
device is presented as a separate guest physical address range. So, there
is no way a partial bind/unbind request can come for the vNVDIMM in a
hcall for a subset of SCM blocks of a virtual NVDIMM. Hence it is safe to
do bind/unbind everything during the device_add/del.
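
As a rough, hypothetical guest-side sketch of the label read interface
implemented below (argument layout: drc_index, offset, length, with the data
returned in the first return word; in Linux the papr_scm driver issues this
through the plpar_hcall() wrapper, which is assumed here):

static int read_label_u64(uint32_t drc_index, uint64_t offset, uint64_t *val)
{
    unsigned long ret[PLPAR_HCALL_BUFSIZE];
    long rc;

    rc = plpar_hcall(H_SCM_READ_METADATA, ret, drc_index, offset, 8);
    if (rc != H_SUCCESS) {
        return -EIO;
    }

    *val = ret[0];   /* value of the 8 label bytes at 'offset' */
    return 0;
}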

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr_nvdimm.c  |  298 
 include/hw/ppc/spapr.h |8 +
 2 files changed, 305 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index d03c8d3a5c..74eeb8bb74 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -28,6 +28,7 @@
 #include "hw/mem/nvdimm.h"
 #include "qemu/nvdimm-utils.h"
 #include "hw/ppc/fdt.h"
+#include "qemu/range.h"
 
 void spapr_nvdimm_validate_opts(NVDIMMDevice *nvdimm, uint64_t size,
 Error **errp)
@@ -175,3 +176,300 @@ void spapr_dt_persistent_memory(void *fdt)
 
 return;
 }
+
+static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
+SpaprMachineState *spapr,
+target_ulong opcode,
+target_ulong *args)
+{
+uint32_t drc_index = args[0];
+uint64_t offset = args[1];
+uint64_t len = args[2];
+SpaprDrc *drc = spapr_drc_by_index(drc_index);
+NVDIMMDevice *nvdimm;
+NVDIMMClass *ddc;
+uint64_t data = 0;
+uint8_t buf[8] = { 0 };
+
+if (!drc || !drc->dev ||
+spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+return H_PARAMETER;
+}
+
+if (len != 1 && len != 2 &&
+len != 4 && len != 8) {
+return H_P3;
+}
+
+nvdimm = NVDIMM(drc->dev);
+if ((offset + len < offset) ||
+(nvdimm->label_size < len + offset)) {
+return H_P2;
+}
+
+ddc = NVDIMM_GET_CLASS(nvdimm);
+ddc->read_label_data(nvdimm, buf, len, offset);
+
+switch (len) {
+case 1:
+data = ldub_p(buf);
+break;
+case 2:
+data = lduw_be_p(buf);
+break;
+case 4:
+data = ldl_be_p(buf);
+break;
+case 8:
+data = ldq_be_p(buf);
+break;
+default:
+g_assert_not_reached();
+}
+
+args[0] = data;
+
+return H_SUCCESS;
+}
+
+static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
+ SpaprMachineState *spapr,
+ target_ulong opcode,
+ target_ulong *args)
+{
+uint32_t drc_index = args[0];
+uint64_t offset = args[1];
+uint64_t data = args[2];
+uint64_t len = args[3];
+SpaprDrc *drc = spapr_drc_by_index(drc_index);
+NVDIMMDevice *nvdimm;
+NVDIMMClass *ddc;
+uint8_t buf[8] = { 0 };
+
+if (!drc || !drc->dev ||
+spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+return H_PARAMETER;
+}
+
+if (len != 1 && len != 2 &&
+len != 4 && len != 8) {
+return H_P4;
+}
+
+nvdimm = NVDIMM(drc->dev);
+if ((offset + len < offset) ||
+(nvdimm->label_size < len + offset)) {
+return H_P2;
+}
+
+switch (len) {
+case 1:
+if (data & 0xffffffffffffff00) {
+return H_P2;
+}
+stb_p(buf, data);
+break;
+case 2:
+if (data & 0xffffffffffff0000) {
+return H_P2;
+}
+stw_be_p(buf, data);
+break;
+case 4:
+if (data & 0xffffffff00000000) {
+return H_P2;
+}
+stl_be_p(buf, data);
+break;
+case 8:
+stq_be_p(buf, data);
+break;
+default:
+g_assert_not_reached();
+}
+
+ddc = NVDIMM_GET_CLASS(nvdimm);
+ddc->write_label_data(nvdimm, buf, len, offset);
+
+return H_SUCCESS;
+}
+
+static target_ulo

[PATCH v6 2/4] nvdimm: add uuid property to nvdimm

2020-02-09 Thread Shivaprasad G Bhat
For ppc64, PAPR requires the nvdimm device to have UUID property
set in the device tree. Add an option to get it from the user.

Signed-off-by: Shivaprasad G Bhat 
Reviewed-by: David Gibson 
Reviewed-by: Igor Mammedov 
---
 hw/mem/nvdimm.c |   40 
 include/hw/mem/nvdimm.h |7 +++
 2 files changed, 47 insertions(+)

diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 39f1426d1f..8e426d24bb 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -69,11 +69,51 @@ out:
 error_propagate(errp, local_err);
 }
 
+static void nvdimm_get_uuid(Object *obj, Visitor *v, const char *name,
+  void *opaque, Error **errp)
+{
+NVDIMMDevice *nvdimm = NVDIMM(obj);
+char *value = NULL;
+
+value = qemu_uuid_unparse_strdup(&nvdimm->uuid);
+
+visit_type_str(v, name, &value, errp);
+g_free(value);
+}
+
+
+static void nvdimm_set_uuid(Object *obj, Visitor *v, const char *name,
+  void *opaque, Error **errp)
+{
+NVDIMMDevice *nvdimm = NVDIMM(obj);
+Error *local_err = NULL;
+char *value;
+
+visit_type_str(v, name, &value, &local_err);
+if (local_err) {
+goto out;
+}
+
+if (qemu_uuid_parse(value, &nvdimm->uuid) != 0) {
+error_setg(errp, "Property '%s.%s' has invalid value",
+   object_get_typename(obj), name);
+goto out;
+}
+g_free(value);
+
+out:
+error_propagate(errp, local_err);
+}
+
+
 static void nvdimm_init(Object *obj)
 {
 object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int",
 nvdimm_get_label_size, nvdimm_set_label_size, NULL,
 NULL, NULL);
+
+object_property_add(obj, NVDIMM_UUID_PROP, "QemuUUID", nvdimm_get_uuid,
+nvdimm_set_uuid, NULL, NULL, NULL);
 }
 
 static void nvdimm_finalize(Object *obj)
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index 523a9b3d4a..4807ca615b 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -25,6 +25,7 @@
 
 #include "hw/mem/pc-dimm.h"
 #include "hw/acpi/bios-linker-loader.h"
+#include "qemu/uuid.h"
 
 #define NVDIMM_DEBUG 0
 #define nvdimm_debug(fmt, ...)\
@@ -49,6 +50,7 @@
TYPE_NVDIMM)
 
 #define NVDIMM_LABEL_SIZE_PROP "label-size"
+#define NVDIMM_UUID_PROP   "uuid"
 #define NVDIMM_UNARMED_PROP"unarmed"
 
 struct NVDIMMDevice {
@@ -83,6 +85,11 @@ struct NVDIMMDevice {
  * the guest write persistence.
  */
 bool unarmed;
+
+/*
+ * The PPC64 - spapr requires each nvdimm device have a uuid.
+ */
+QemuUUID uuid;
 };
 typedef struct NVDIMMDevice NVDIMMDevice;
 




[PATCH v6 3/4] spapr: Add NVDIMM device support

2020-02-09 Thread Shivaprasad G Bhat
Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
device interface in QEMU to support virtual NVDIMM devices for Power.
Create the required DT entries for the device (some entries have
dummy values right now).

The patch creates the required DT node and sends a hotplug
interrupt to the guest. Guest is expected to undertake the normal
DR resource add path in response and start issuing PAPR SCM hcalls.

Unlike x86, the device support is verified based on the machine version.

This is how it can be used ..
Ex :
For coldplug, the device to be added in qemu command line as shown below
-object 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
-device 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

For hotplug, the device to be added from monitor as below
object_add 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
device_add 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

Signed-off-by: Shivaprasad G Bhat 
Signed-off-by: Bharata B Rao 
   [Early implementation]
---
 default-configs/ppc64-softmmu.mak |1 
 hw/mem/Kconfig|2 
 hw/ppc/Makefile.objs  |2 
 hw/ppc/spapr.c|   69 +-
 hw/ppc/spapr_drc.c|   19 
 hw/ppc/spapr_events.c |4 +
 hw/ppc/spapr_nvdimm.c |  177 +
 include/hw/ppc/spapr_drc.h|9 ++
 include/hw/ppc/spapr_nvdimm.h |   37 
 9 files changed, 309 insertions(+), 11 deletions(-)
 create mode 100644 hw/ppc/spapr_nvdimm.c
 create mode 100644 include/hw/ppc/spapr_nvdimm.h

diff --git a/default-configs/ppc64-softmmu.mak 
b/default-configs/ppc64-softmmu.mak
index cca52665d9..ae0841fa3a 100644
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -8,3 +8,4 @@ CONFIG_POWERNV=y
 
 # For pSeries
 CONFIG_PSERIES=y
+CONFIG_NVDIMM=y
diff --git a/hw/mem/Kconfig b/hw/mem/Kconfig
index 620fd4cb59..2ad052a536 100644
--- a/hw/mem/Kconfig
+++ b/hw/mem/Kconfig
@@ -8,4 +8,4 @@ config MEM_DEVICE
 config NVDIMM
 bool
 default y
-depends on PC
+depends on (PC || PSERIES)
diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index a4bac57be6..c3d3cc56eb 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -7,7 +7,7 @@ obj-$(CONFIG_PSERIES) += spapr.o spapr_caps.o spapr_vio.o 
spapr_events.o
 obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
 obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o
 obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o spapr_irq.o
-obj-$(CONFIG_PSERIES) += spapr_tpm_proxy.o
+obj-$(CONFIG_PSERIES) += spapr_tpm_proxy.o spapr_nvdimm.o
 obj-$(CONFIG_SPAPR_RNG) +=  spapr_rng.o
 obj-$(call land,$(CONFIG_PSERIES),$(CONFIG_LINUX)) += spapr_pci_vfio.o 
spapr_pci_nvlink2.o
 # IBM PowerNV
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index c9b2e0a5e0..d3cb8b4c7b 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -80,6 +80,7 @@
 #include "hw/ppc/spapr_cpu_core.h"
 #include "hw/mem/memory-device.h"
 #include "hw/ppc/spapr_tpm_proxy.h"
+#include "hw/ppc/spapr_nvdimm.h"
 
 #include "monitor/monitor.h"
 
@@ -675,6 +676,14 @@ static int spapr_populate_drmem_v2(SpaprMachineState 
*spapr, void *fdt,
 size = di->size;
 node = di->node;
 
+/*
+ * The NVDIMM area is hotpluggable after the NVDIMM is unplugged. The
+ * area is marked hotpluggable in the next iteration for the bigger
+ * chunk including the NVDIMM occupied area.
+ */
+if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM)
+continue;
+
 /* Entry for hot-pluggable area */
 if (cur_addr < addr) {
 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
@@ -1266,6 +1275,11 @@ void *spapr_build_fdt(SpaprMachineState *spapr, bool 
reset, size_t space)
 }
 }
 
+/* NVDIMM devices */
+if (mc->nvdimm_supported) {
+spapr_dt_persistent_memory(fdt);
+}
+
 return fdt;
 }
 
@@ -2629,6 +2643,7 @@ static void spapr_machine_init(MachineState *machine)
 {
 SpaprMachineState *spapr = SPAPR_MACHINE(machine);
 SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
+MachineClass *mc = MACHINE_GET_CLASS(machine);
 const char *kernel_filename = machine->kernel_filename;
 const char *initrd_filename = machine->initrd_filename;
 PCIHostState *phb;
@@ -2861,6 +2876,10 @@ static void spapr_machine_init(MachineState *machine)
 "may run and log hardware error on the destination");
 }
 
+if (mc->nvdimm_supported) {
+spapr_create_nvdimm_dr_connectors(spapr);
+}
+
 /* Set up RTAS event infrastructure 

[PATCH v6 1/4] mem: move nvdimm_device_list to utilities

2020-02-09 Thread Shivaprasad G Bhat
nvdimm_device_list is required for parsing the device list
in subsequent patches. Move it to the common utility area.

Signed-off-by: Shivaprasad G Bhat 
Reviewed-by: Igor Mammedov 
Reviewed-by: David Gibson 
---
 hw/acpi/nvdimm.c|   28 +---
 include/qemu/nvdimm-utils.h |7 +++
 util/Makefile.objs  |1 +
 util/nvdimm-utils.c |   29 +
 4 files changed, 38 insertions(+), 27 deletions(-)
 create mode 100644 include/qemu/nvdimm-utils.h
 create mode 100644 util/nvdimm-utils.c

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 9fdad6dc3f..5219dd0e2e 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -32,33 +32,7 @@
 #include "hw/acpi/bios-linker-loader.h"
 #include "hw/nvram/fw_cfg.h"
 #include "hw/mem/nvdimm.h"
-
-static int nvdimm_device_list(Object *obj, void *opaque)
-{
-GSList **list = opaque;
-
-if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
-*list = g_slist_append(*list, DEVICE(obj));
-}
-
-object_child_foreach(obj, nvdimm_device_list, opaque);
-return 0;
-}
-
-/*
- * inquire NVDIMM devices and link them into the list which is
- * returned to the caller.
- *
- * Note: it is the caller's responsibility to free the list to avoid
- * memory leak.
- */
-static GSList *nvdimm_get_device_list(void)
-{
-GSList *list = NULL;
-
-object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list);
-return list;
-}
+#include "qemu/nvdimm-utils.h"
 
 #define NVDIMM_UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
diff --git a/include/qemu/nvdimm-utils.h b/include/qemu/nvdimm-utils.h
new file mode 100644
index 00..4b8b198ba7
--- /dev/null
+++ b/include/qemu/nvdimm-utils.h
@@ -0,0 +1,7 @@
+#ifndef NVDIMM_UTILS_H
+#define NVDIMM_UTILS_H
+
+#include "qemu/osdep.h"
+
+GSList *nvdimm_get_device_list(void);
+#endif
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 11262aafaf..6b38b67cf1 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -20,6 +20,7 @@ util-obj-y += envlist.o path.o module.o
 util-obj-y += host-utils.o
 util-obj-y += bitmap.o bitops.o hbitmap.o
 util-obj-y += fifo8.o
+util-obj-y += nvdimm-utils.o
 util-obj-y += cacheinfo.o
 util-obj-y += error.o qemu-error.o
 util-obj-y += qemu-print.o
diff --git a/util/nvdimm-utils.c b/util/nvdimm-utils.c
new file mode 100644
index 00..5cc768ca47
--- /dev/null
+++ b/util/nvdimm-utils.c
@@ -0,0 +1,29 @@
+#include "qemu/nvdimm-utils.h"
+#include "hw/mem/nvdimm.h"
+
+static int nvdimm_device_list(Object *obj, void *opaque)
+{
+GSList **list = opaque;
+
+if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
+*list = g_slist_append(*list, DEVICE(obj));
+}
+
+object_child_foreach(obj, nvdimm_device_list, opaque);
+return 0;
+}
+
+/*
+ * inquire NVDIMM devices and link them into the list which is
+ * returned to the caller.
+ *
+ * Note: it is the caller's responsibility to free the list to avoid
+ * memory leak.
+ */
+GSList *nvdimm_get_device_list(void)
+{
+GSList *list = NULL;
+
+object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list);
+return list;
+}




[PATCH v6 0/4] ppc: spapr: virtual NVDIMM support

2020-02-09 Thread Shivaprasad G Bhat
The patchset attempts to implement the virtual NVDIMM for pseries.

PAPR semantics are such that each NVDIMM device comprises
multiple SCM (Storage Class Memory) blocks. The hypervisor is expected
to prepare the FDT for the NVDIMM device and send guest a hotplug
interrupt with new type RTAS_LOG_V6_HP_TYPE_PMEM currently handled by
the upstream kernel. In response to that interrupt, the guest requests
the hypervisor to bind each of the SCM blocks of the NVDIMM device
using hcalls. There can be SCM block unbind requests in case of driver
errors or unplug(not supported now) use cases. The NVDIMM label
read/writes are done through hcalls.

Since each virtual NVDIMM device is divided into multiple SCM blocks,
the bind, unbind, and queries using hcalls on those blocks can come
independently. This doesn't fit well into the QEMU device semantics,
where the map/unmap are done at the (whole)device/object level
granularity. The patchset uses the existing NVDIMM class structures
for the implementation. The bind/unbind is left to happen at the
device_add/del phase itself instead of at hcalls on-demand.

The guest kernel makes bind/unbind requests for the virtual NVDIMM
device at the region level granularity. Without interleaving, each
virtual NVDIMM device is presented as a separate region. Hence it is
safe to do bind/unbind everything during the object_add/del.

Hotplug into the free device-memory region is done using multiple LMBs
of size 256 MiB, which are expected to be aligned to 256 MiB. As the SCM
blocks are mapped to the same region, the SCM blocks also need to be
aligned to this size for the subsequent memory hotplug to work. The
minimum SCM block size is set to this size for that reason, and can be
made user configurable in future if required. For example, the backend
size used below, 1073872896 bytes, is 128 KiB of label area plus 1 GiB
of data, and 1 GiB is exactly four 256 MiB SCM blocks.

The first patch moves around the existing static function to common
area for using it in the subsequent patches. Second patch adds new uuid
property to the nvdimm device. Third patch adds FDT entries and basic
device support, the fourth patch adds the hcalls implementation.

The patches are also available at
https://github.com/ShivaprasadGBhat/qemu.git - pseries-nvdimm-v6 branch
and can be used with the upstream kernel. ndctl can be used for
configuring the nvdimms inside the guest.
This is how it can be used ..
Ex :
For coldplug, the device to be added in qemu command line as shown below
-object 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
-device 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

For hotplug, the device to be added from monitor as below
object_add 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
device_add 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

---
v5: https://lists.nongnu.org/archive/html/qemu-devel/2020-01/msg07472.html
Changes from v5:
 - Moved most of the nvdimm code from spapr.c to spapr_nvdimm.c
 - Addressed all style/logic comments.
v4: https://lists.gnu.org/archive/html/qemu-devel/2019-12/msg03455.html
Changes from v4:
 - The nvdimm occupied GPA area is marked as available for hotplug, the
   existing code takes care of whether the dimm device is actually present there
   or used by nvdimm.
 - fixed all comments for hcall implementation code on style/logic issues.
v3: https://lists.gnu.org/archive/html/qemu-devel/2019-10/msg03452.html
Changes from v3:
 - Moved NVDIMM uuid property addition to new patch.
 - Moved the SCM hcalls to new file
 - Changed the metadata read/write hcalls to use st/ldX_be_p macros.
 - Fixed all comments on v3
v2: https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg02785.html
Changes from v2:
 - Creating the drc indices for the nvdimm devices in advance as
   suggested based on the number of user specified max slots property.
 - Removed the hard dependency on -machine nvdimm=on, enabled by
   default on the current latest pseries machine version.
 - Renamed the functions to spapr_dt_X as suggested.
 - Metadata is byteswapped before read/write to take care of endianness
   semantics during the hcall.
v1 : http://lists.nongnu.org/archive/html/qemu-devel/2019-02/msg01545.html
Changes from v1:
 - Rebased to upstream, this required a dt_populate implementation
   for nvdimm hotplug support
 - Added uuid option to nvdimm device
 - Removed the memory region sizing down code as suggested by Igor,
   now erroring out if NVDIMM size excluding the label area is not
   aligned to 256MB, so patch 2 from previous series no longer needed.
 - Removed un-implemented hcalls
 - Changed the hcalls to different kinds of checks and return
   different values.
 - Addressed comments for v1
---

Shivaprasad G Bhat (4):
  mem: move nvdimm_device_list to utilities
  nvdimm: add uuid property

Re: [PATCH v5 3/4] spapr: Add NVDIMM device support

2020-02-09 Thread Shivaprasad G Bhat



On 02/04/2020 09:29 AM, David Gibson wrote:

On Thu, Jan 30, 2020 at 05:48:15AM -0600, Shivaprasad G Bhat wrote:

Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
device interface in QEMU to support virtual NVDIMM devices for Power.
Create the required DT entries for the device (some entries have
dummy values right now).

The patch creates the required DT node and sends a hotplug
interrupt to the guest. Guest is expected to undertake the normal
DR resource add path in response and start issuing PAPR SCM hcalls.

+   " must be a multiple of %" PRIu64 "MB",
+   SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
+return;
+}
+
+uuidstr = object_property_get_str(OBJECT(dimm), NVDIMM_UUID_PROP, 
NULL);
+qemu_uuid_parse(uuidstr, &uuid);

Uh.. couldn't we just look at nvdimm->uuid, rather than getting the
string property and parsing it again?


Addressing all except this one as discussed. Posting the next version in 
a while.


Thanks,
Shivaprasad




[PATCH v5 4/4] spapr: Add Hcalls to support PAPR NVDIMM device

2020-01-30 Thread Shivaprasad G Bhat
This patch implements a few of the necessary hcalls for the nvdimm support.

PAPR semantics are such that each NVDIMM device comprises multiple
SCM (Storage Class Memory) blocks. The guest requests the hypervisor to
bind each of the SCM blocks of the NVDIMM device using hcalls. There can
be SCM block unbind requests in case of driver errors or unplug (not
supported now) use cases. The NVDIMM label reads/writes are done through
hcalls.

Since each virtual NVDIMM device is divided into multiple SCM blocks,
the bind, unbind, and queries using hcalls on those blocks can come
independently. This doesn't fit well into the QEMU device semantics,
where the map/unmap are done at the (whole) device/object level granularity.
The patch doesn't actually bind/unbind on hcalls but lets it happen at the
device_add/del phase itself instead.

The guest kernel makes bind/unbind requests for the virtual NVDIMM device
at the region level granularity. Without interleaving, each virtual NVDIMM
device is presented as a separate guest physical address range. So, there
is no way a partial bind/unbind request can come for the vNVDIMM in a
hcall for a subset of SCM blocks of a virtual NVDIMM. Hence it is safe to
do bind/unbind everything during the device_add/del.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/Makefile.objs   |2 
 hw/ppc/spapr_nvdimm.c  |  327 
 include/hw/ppc/spapr.h |8 +
 3 files changed, 335 insertions(+), 2 deletions(-)
 create mode 100644 hw/ppc/spapr_nvdimm.c

diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index a4bac57be6..c3d3cc56eb 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -7,7 +7,7 @@ obj-$(CONFIG_PSERIES) += spapr.o spapr_caps.o spapr_vio.o 
spapr_events.o
 obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
 obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o
 obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o spapr_irq.o
-obj-$(CONFIG_PSERIES) += spapr_tpm_proxy.o
+obj-$(CONFIG_PSERIES) += spapr_tpm_proxy.o spapr_nvdimm.o
 obj-$(CONFIG_SPAPR_RNG) +=  spapr_rng.o
 obj-$(call land,$(CONFIG_PSERIES),$(CONFIG_LINUX)) += spapr_pci_vfio.o 
spapr_pci_nvlink2.o
 # IBM PowerNV
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
new file mode 100644
index 00..8d1c2dc009
--- /dev/null
+++ b/hw/ppc/spapr_nvdimm.c
@@ -0,0 +1,327 @@
+/*
+ * QEMU PAPR Storage Class Memory Interfaces
+ *
+ * Copyright (c) 2019-2020, IBM Corporation.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/ppc/spapr.h"
+#include "hw/ppc/spapr_drc.h"
+#include "hw/mem/nvdimm.h"
+#include "qemu/range.h"
+#include "qemu/nvdimm-utils.h"
+
+static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
+SpaprMachineState *spapr,
+target_ulong opcode,
+target_ulong *args)
+{
+uint32_t drc_index = args[0];
+uint64_t offset = args[1];
+uint64_t numBytesToRead = args[2];
+SpaprDrc *drc = spapr_drc_by_index(drc_index);
+NVDIMMDevice *nvdimm;
+NVDIMMClass *ddc;
+uint64_t data = 0;
+uint8_t buf[8] = { 0 };
+
+if (!drc || !drc->dev ||
+spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+return H_PARAMETER;
+}
+
+if (numBytesToRead != 1 && numBytesToRead != 2 &&
+numBytesToRead != 4 && numBytesToRead != 8) {
+return H_P3;
+}
+
+nvdimm = NVDIMM(drc->dev);
+if ((offset + numBytesToRead < offset) ||
+(nvdimm->label_size < numBytesToRead + offset)) {
+return H_P2;
+}
+
+ddc = NVDIMM_GET_CLASS(nvdimm);
+ddc->read_label_data(nvdimm, buf, numBytesToRead, offset);
+
+switch (num

[PATCH v5 3/4] spapr: Add NVDIMM device support

2020-01-30 Thread Shivaprasad G Bhat
Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
device interface in QEMU to support virtual NVDIMM devices for Power.
Create the required DT entries for the device (some entries have
dummy values right now).

The patch creates the required DT node and sends a hotplug
interrupt to the guest. Guest is expected to undertake the normal
DR resource add path in response and start issuing PAPR SCM hcalls.

Unlike x86, the device support is verified based on the machine version.

This is how it can be used ..
Ex :
For coldplug, the device to be added in qemu command line as shown below
-object 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
-device 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

For hotplug, the device to be added from monitor as below
object_add 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
device_add 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

Signed-off-by: Shivaprasad G Bhat 
Signed-off-by: Bharata B Rao 
   [Early implementation]
---
 default-configs/ppc64-softmmu.mak |1 
 hw/mem/Kconfig|2 
 hw/ppc/spapr.c|  212 +++--
 hw/ppc/spapr_drc.c|   18 +++
 hw/ppc/spapr_events.c |4 +
 include/hw/ppc/spapr.h|   11 ++
 include/hw/ppc/spapr_drc.h|9 ++
 7 files changed, 243 insertions(+), 14 deletions(-)

diff --git a/default-configs/ppc64-softmmu.mak 
b/default-configs/ppc64-softmmu.mak
index cca52665d9..ae0841fa3a 100644
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -8,3 +8,4 @@ CONFIG_POWERNV=y
 
 # For pSeries
 CONFIG_PSERIES=y
+CONFIG_NVDIMM=y
diff --git a/hw/mem/Kconfig b/hw/mem/Kconfig
index 620fd4cb59..2ad052a536 100644
--- a/hw/mem/Kconfig
+++ b/hw/mem/Kconfig
@@ -8,4 +8,4 @@ config MEM_DEVICE
 config NVDIMM
 bool
 default y
-depends on PC
+depends on (PC || PSERIES)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 02cf53fc5b..4ea73c31fe 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -79,6 +79,8 @@
 #include "hw/ppc/spapr_cpu_core.h"
 #include "hw/mem/memory-device.h"
 #include "hw/ppc/spapr_tpm_proxy.h"
+#include "hw/mem/nvdimm.h"
+#include "qemu/nvdimm-utils.h"
 
 #include "monitor/monitor.h"
 
@@ -684,12 +686,22 @@ static int spapr_populate_drmem_v2(SpaprMachineState 
*spapr, void *fdt,
 nr_entries++;
 }
 
-/* Entry for DIMM */
 drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
 g_assert(drc);
-elem = spapr_get_drconf_cell(size / lmb_size, addr,
- spapr_drc_index(drc), node,
- SPAPR_LMB_FLAGS_ASSIGNED);
+
+if (info->value->type == MEMORY_DEVICE_INFO_KIND_DIMM) {
+/* Entry for DIMM */
+elem = spapr_get_drconf_cell(size / lmb_size, addr,
+ spapr_drc_index(drc), node,
+ SPAPR_LMB_FLAGS_ASSIGNED);
+} else if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
+/*
+ * Entry for the NVDIMM occupied area. The area is
+ * hotpluggable after the NVDIMM is unplugged.
+ */
+elem = spapr_get_drconf_cell(size / lmb_size, addr,
+ spapr_drc_index(drc), -1, 0);
+}
 QSIMPLEQ_INSERT_TAIL(_queue, elem, entry);
 nr_entries++;
 cur_addr = addr + size;
@@ -1130,6 +1142,85 @@ static void spapr_dt_hypervisor(SpaprMachineState 
*spapr, void *fdt)
 }
 }
 
+static int spapr_dt_nvdimm(void *fdt, int parent_offset,
+   NVDIMMDevice *nvdimm)
+{
+int child_offset;
+char buf[40];
+SpaprDrc *drc;
+uint32_t drc_idx;
+uint32_t node = object_property_get_uint(OBJECT(nvdimm), PC_DIMM_NODE_PROP,
+ &error_abort);
+uint64_t slot = object_property_get_uint(OBJECT(nvdimm), PC_DIMM_SLOT_PROP,
+ &error_abort);
+uint32_t associativity[] = {
+cpu_to_be32(0x4), /* length */
+cpu_to_be32(0x0), cpu_to_be32(0x0),
+cpu_to_be32(0x0), cpu_to_be32(node)
+};
+uint64_t lsize = nvdimm->label_size;
+uint64_t size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
+NULL);
+
+drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, slot);
+g_assert(drc);
+
+drc_idx = spapr_drc_index(drc);
+
+sprintf(buf, "ibm,pmemory@%x", drc_idx);
+child_offset = fdt_add_subnode(fdt, parent_offset, buf);
+_FDT(child_offset);
+
+_FDT(

Re: [PATCH v4 4/4] spapr: Add Hcalls to support PAPR NVDIMM device

2020-01-30 Thread Shivaprasad G Bhat




On 01/03/2020 07:14 AM, David Gibson wrote:

On Tue, Dec 17, 2019 at 02:49:36AM -0600, Shivaprasad G Bhat wrote:

This patch implements few of the necessary hcalls for the nvdimm support.


Fixing all the comments.

Of course, we're not *actually* unbinding anything.  But I guess the
idea here is that this is returning the number of blocks on which the
operation succeeded, whether or not that actually removes the binding
from memory.

Yes. Correct.

Thanks,
Shivaprasad




[PATCH v5 1/4] mem: move nvdimm_device_list to utilities

2020-01-30 Thread Shivaprasad G Bhat
nvdimm_device_list is required for parsing the device list
in subsequent patches. Move it to the common utility area.

Signed-off-by: Shivaprasad G Bhat 
Reviewed-by: Igor Mammedov 
Reviewed-by: David Gibson 
---
 hw/acpi/nvdimm.c|   28 +---
 include/qemu/nvdimm-utils.h |7 +++
 util/Makefile.objs  |1 +
 util/nvdimm-utils.c |   29 +
 4 files changed, 38 insertions(+), 27 deletions(-)
 create mode 100644 include/qemu/nvdimm-utils.h
 create mode 100644 util/nvdimm-utils.c

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 9fdad6dc3f..5219dd0e2e 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -32,33 +32,7 @@
 #include "hw/acpi/bios-linker-loader.h"
 #include "hw/nvram/fw_cfg.h"
 #include "hw/mem/nvdimm.h"
-
-static int nvdimm_device_list(Object *obj, void *opaque)
-{
-GSList **list = opaque;
-
-if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
-*list = g_slist_append(*list, DEVICE(obj));
-}
-
-object_child_foreach(obj, nvdimm_device_list, opaque);
-return 0;
-}
-
-/*
- * inquire NVDIMM devices and link them into the list which is
- * returned to the caller.
- *
- * Note: it is the caller's responsibility to free the list to avoid
- * memory leak.
- */
-static GSList *nvdimm_get_device_list(void)
-{
-GSList *list = NULL;
-
-object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list);
-return list;
-}
+#include "qemu/nvdimm-utils.h"
 
 #define NVDIMM_UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
diff --git a/include/qemu/nvdimm-utils.h b/include/qemu/nvdimm-utils.h
new file mode 100644
index 00..4b8b198ba7
--- /dev/null
+++ b/include/qemu/nvdimm-utils.h
@@ -0,0 +1,7 @@
+#ifndef NVDIMM_UTILS_H
+#define NVDIMM_UTILS_H
+
+#include "qemu/osdep.h"
+
+GSList *nvdimm_get_device_list(void);
+#endif
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 11262aafaf..6b38b67cf1 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -20,6 +20,7 @@ util-obj-y += envlist.o path.o module.o
 util-obj-y += host-utils.o
 util-obj-y += bitmap.o bitops.o hbitmap.o
 util-obj-y += fifo8.o
+util-obj-y += nvdimm-utils.o
 util-obj-y += cacheinfo.o
 util-obj-y += error.o qemu-error.o
 util-obj-y += qemu-print.o
diff --git a/util/nvdimm-utils.c b/util/nvdimm-utils.c
new file mode 100644
index 00..5cc768ca47
--- /dev/null
+++ b/util/nvdimm-utils.c
@@ -0,0 +1,29 @@
+#include "qemu/nvdimm-utils.h"
+#include "hw/mem/nvdimm.h"
+
+static int nvdimm_device_list(Object *obj, void *opaque)
+{
+GSList **list = opaque;
+
+if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
+*list = g_slist_append(*list, DEVICE(obj));
+}
+
+object_child_foreach(obj, nvdimm_device_list, opaque);
+return 0;
+}
+
+/*
+ * inquire NVDIMM devices and link them into the list which is
+ * returned to the caller.
+ *
+ * Note: it is the caller's responsibility to free the list to avoid
+ * memory leak.
+ */
+GSList *nvdimm_get_device_list(void)
+{
+GSList *list = NULL;
+
+object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list);
+return list;
+}




[PATCH v5 2/4] nvdimm: add uuid property to nvdimm

2020-01-30 Thread Shivaprasad G Bhat
For ppc64, PAPR requires the nvdimm device to have UUID property
set in the device tree. Add an option to get it from the user.

Signed-off-by: Shivaprasad G Bhat 
Reviewed-by: David Gibson 
---
 hw/mem/nvdimm.c |   40 
 include/hw/mem/nvdimm.h |7 +++
 2 files changed, 47 insertions(+)

diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 39f1426d1f..8e426d24bb 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -69,11 +69,51 @@ out:
 error_propagate(errp, local_err);
 }
 
+static void nvdimm_get_uuid(Object *obj, Visitor *v, const char *name,
+  void *opaque, Error **errp)
+{
+NVDIMMDevice *nvdimm = NVDIMM(obj);
+char *value = NULL;
+
+value = qemu_uuid_unparse_strdup(&nvdimm->uuid);
+
+visit_type_str(v, name, &value, errp);
+g_free(value);
+}
+
+
+static void nvdimm_set_uuid(Object *obj, Visitor *v, const char *name,
+  void *opaque, Error **errp)
+{
+NVDIMMDevice *nvdimm = NVDIMM(obj);
+Error *local_err = NULL;
+char *value;
+
+visit_type_str(v, name, &value, &local_err);
+if (local_err) {
+goto out;
+}
+
+if (qemu_uuid_parse(value, &nvdimm->uuid) != 0) {
+error_setg(errp, "Property '%s.%s' has invalid value",
+   object_get_typename(obj), name);
+goto out;
+}
+g_free(value);
+
+out:
+error_propagate(errp, local_err);
+}
+
+
 static void nvdimm_init(Object *obj)
 {
 object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int",
 nvdimm_get_label_size, nvdimm_set_label_size, NULL,
 NULL, NULL);
+
+object_property_add(obj, NVDIMM_UUID_PROP, "QemuUUID", nvdimm_get_uuid,
+nvdimm_set_uuid, NULL, NULL, NULL);
 }
 
 static void nvdimm_finalize(Object *obj)
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index 523a9b3d4a..4807ca615b 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -25,6 +25,7 @@
 
 #include "hw/mem/pc-dimm.h"
 #include "hw/acpi/bios-linker-loader.h"
+#include "qemu/uuid.h"
 
 #define NVDIMM_DEBUG 0
 #define nvdimm_debug(fmt, ...)\
@@ -49,6 +50,7 @@
TYPE_NVDIMM)
 
 #define NVDIMM_LABEL_SIZE_PROP "label-size"
+#define NVDIMM_UUID_PROP   "uuid"
 #define NVDIMM_UNARMED_PROP"unarmed"
 
 struct NVDIMMDevice {
@@ -83,6 +85,11 @@ struct NVDIMMDevice {
  * the guest write persistence.
  */
 bool unarmed;
+
+/*
+ * The PPC64 - spapr requires each nvdimm device have a uuid.
+ */
+QemuUUID uuid;
 };
 typedef struct NVDIMMDevice NVDIMMDevice;
 




[PATCH v5 0/4] ppc: spapr: virtual NVDIMM support

2020-01-30 Thread Shivaprasad G Bhat
The patchset attempts to implement the virtual NVDIMM for pseries.

PAPR semantics are such that each NVDIMM device comprises
multiple SCM (Storage Class Memory) blocks. The hypervisor is expected
to prepare the FDT for the NVDIMM device and send guest a hotplug
interrupt with new type RTAS_LOG_V6_HP_TYPE_PMEM currently handled by
the upstream kernel. In response to that interrupt, the guest requests
the hypervisor to bind each of the SCM blocks of the NVDIMM device
using hcalls. There can be SCM block unbind requests in case of driver
errors or unplug(not supported now) use cases. The NVDIMM label
read/writes are done through hcalls.

Since each virtual NVDIMM device is divided into multiple SCM blocks,
the bind, unbind, and queries using hcalls on those blocks can come
independently. This doesn't fit well into the QEMU device semantics,
where the map/unmap are done at the (whole)device/object level
granularity. The patchset uses the existing NVDIMM class structures
for the implementation. The bind/unbind is left to happen at the
device_add/del phase itself instead of at hcalls on-demand.

The guest kernel makes bind/unbind requests for the virtual NVDIMM
device at the region level granularity. Without interleaving, each
virtual NVDIMM device is presented as a separate region. Hence it is
safe to do bind/unbind everything during the object_add/del.

Hotplug into the free device-memory region is done using multiple LMBs
of size 256 MiB, which are expected to be aligned to 256 MiB. As the SCM
blocks are mapped to the same region, the SCM blocks also need to be
aligned to this size for the subsequent memory hotplug to work. The
minimum SCM block size is set to this size for that reason, and can be
made user configurable in future if required.

The first patch moves around the existing static function to common
area for using it in the subsequent patches. Second patch adds new uuid
property to the nvdimm device. Third patch adds FDT entries and basic
device support, the fourth patch adds the hcalls implementation.

The patches are also available at
https://github.com/ShivaprasadGBhat/qemu.git - pseries-nvdimm-v5 branch
and can be used with the upstream kernel. ndctl can be used for
configuring the nvdimms inside the guest.
This is how it can be used ..
Ex :
For coldplug, the device to be added in qemu command line as shown below
-object 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
-device 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

For hotplug, the device to be added from monitor as below
object_add 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
device_add 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

---
v4: https://lists.gnu.org/archive/html/qemu-devel/2019-12/msg03455.html
Changes from v4:
 - The nvdimm occupied GPA area is marked as available for hotplug, the
   existing code takes care of if the dimm device is actually present there
   or used by nvdimm.
 - fixed all comments for hcall implementation code on style/logic issues.
v3: https://lists.gnu.org/archive/html/qemu-devel/2019-10/msg03452.html
Changes from v3:
 - Moved NVDIMM uuid property addition to new patch.
 - Moved the SCM hcalls to new file
 - Changed the metadata read/write hcalls to use st/ldX_be_p macros.
 - Fixed all comments on v3
v2: https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg02785.html
Changes from v2:
 - Creating the drc indices for the nvdimm devices in advance as
   suggested based on the number of user specified max slots property.
 - Removed the hard dependency on -machine nvdimm=on, enabled by
   default on the current latest pseries machine version.
 - Renamed the functions to spapr_dt_X as suggested.
 - Metadata is byteswapped before read/write to take care of endianness
   semantics during the hcall.
v1 : http://lists.nongnu.org/archive/html/qemu-devel/2019-02/msg01545.html
Changes from v1:
 - Rebased to upstream; this required a dt_populate implementation
   for nvdimm hotplug support
 - Added uuid option to nvdimm device
 - Removed the memory region sizing down code as suggested by Igor,
   now erroring out if NVDIMM size excluding the label area is not
   aligned to 256MB, so patch 2 from previous series no longer needed.
 - Removed un-implemented hcalls
 - Changed the hcalls to different kinds of checks and return
   different values.
 - Addressed comments for v1

Shivaprasad G Bhat (4):
  mem: move nvdimm_device_list to utilities
  nvdimm: add uuid property to nvdimm
  spapr: Add NVDIMM device support
  spapr: Add Hcalls to support PAPR NVDIMM device


 default-configs/ppc64-softmmu.mak |1
 hw/acpi/nvdimm.c  |   28 ---
 hw/mem/Kconfig

Re: [PATCH v4 3/4] spapr: Add NVDIMM device support

2020-01-30 Thread Shivaprasad G Bhat




On 01/03/2020 06:50 AM, David Gibson wrote:

On Tue, Dec 17, 2019 at 02:49:14AM -0600, Shivaprasad G Bhat wrote:

Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
device interface in QEMU to support virtual NVDIMM devices for Power.
Create the required DT entries for the device (some entries have
dummy values right now).

The patch creates the required DT node and sends a hotplug
interrupt to the guest. The guest is expected to undertake the normal
DR resource add path in response and start issuing PAPR SCM hcalls.

Unlike x86, the device support is verified based on the machine version.

This is how it can be used ..
Ex :
For coldplug, the device to be added in qemu command line as shown below
-object 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
-device 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

For hotplug, the device to be added from monitor as below
object_add 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
device_add 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

Signed-off-by: Shivaprasad G Bhat 
Signed-off-by: Bharata B Rao 
[Early implementation]
---
  default-configs/ppc64-softmmu.mak |1
  hw/mem/Kconfig|2
  hw/ppc/spapr.c|  216 ++---
  hw/ppc/spapr_drc.c|   18 +++
  hw/ppc/spapr_events.c |4 +
  include/hw/ppc/spapr.h|   11 ++
  include/hw/ppc/spapr_drc.h|9 ++
  7 files changed, 245 insertions(+), 16 deletions(-)

diff --git a/default-configs/ppc64-softmmu.mak 
b/default-configs/ppc64-softmmu.mak
index cca52665d9..ae0841fa3a 100644
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -8,3 +8,4 @@ CONFIG_POWERNV=y
  
  # For pSeries

  CONFIG_PSERIES=y
+CONFIG_NVDIMM=y
diff --git a/hw/mem/Kconfig b/hw/mem/Kconfig
index 620fd4cb59..2ad052a536 100644
--- a/hw/mem/Kconfig
+++ b/hw/mem/Kconfig
@@ -8,4 +8,4 @@ config MEM_DEVICE
  config NVDIMM
  bool
  default y
-depends on PC
+depends on (PC || PSERIES)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 3ae7db1563..921d8d7c8e 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -80,6 +80,8 @@
  #include "hw/ppc/spapr_cpu_core.h"
  #include "hw/mem/memory-device.h"
  #include "hw/ppc/spapr_tpm_proxy.h"
+#include "hw/mem/nvdimm.h"
+#include "qemu/nvdimm-utils.h"
  
  #include "monitor/monitor.h"
  
@@ -685,12 +687,22 @@ static int spapr_populate_drmem_v2(SpaprMachineState *spapr, void *fdt,

  nr_entries++;
  }
  
-/* Entry for DIMM */

-drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
-g_assert(drc);
-elem = spapr_get_drconf_cell(size / lmb_size, addr,
- spapr_drc_index(drc), node,
- SPAPR_LMB_FLAGS_ASSIGNED);
+if (info->value->type == MEMORY_DEVICE_INFO_KIND_DIMM) {
+/* Entry for DIMM */
+drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
+g_assert(drc);
+elem = spapr_get_drconf_cell(size / lmb_size, addr,
+ spapr_drc_index(drc), node,
+ SPAPR_LMB_FLAGS_ASSIGNED);
+} else if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
+/*
+ * NVDIMM sits here, let the DIMM LMBs be unusable here in the
+ * whole range
+ */
+elem = spapr_get_drconf_cell(size / lmb_size, addr, 0, -1,
+ SPAPR_LMB_FLAGS_RESERVED |
+ SPAPR_LMB_FLAGS_DRC_INVALID);
+}

As discussed in reply to an earlier thread, this whole scheme
basically breaks down in the presence of hotplug - it relies on which
GPAs are DIMMs and which are NVDIMMs not changing.

Other than that significant problem, the rest of this looks
reasonable.


As discussed, I verified not marking the NVDIMM-occupied area as reserved,
and it seems to work fine. Malicious attempts to claim the drcs on those
areas fail as there won't be valid dimm devices associated with those drcs.

Sending the next version fixing this part.




[PATCH v4 2/4] nvdimm: add uuid property to nvdimm

2019-12-17 Thread Shivaprasad G Bhat
For ppc64, PAPR requires the nvdimm device to have a UUID property
set in the device tree. Add an option to get it from the user.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/mem/nvdimm.c |   40 
 include/hw/mem/nvdimm.h |7 +++
 2 files changed, 47 insertions(+)

diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 375f9a588a..e1238b5bed 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -69,11 +69,51 @@ out:
 error_propagate(errp, local_err);
 }
 
+static void nvdimm_get_uuid(Object *obj, Visitor *v, const char *name,
+  void *opaque, Error **errp)
+{
+NVDIMMDevice *nvdimm = NVDIMM(obj);
+char *value = NULL;
+
+value = qemu_uuid_unparse_strdup(&nvdimm->uuid);
+
+visit_type_str(v, name, &value, errp);
+g_free(value);
+}
+
+
+static void nvdimm_set_uuid(Object *obj, Visitor *v, const char *name,
+  void *opaque, Error **errp)
+{
+NVDIMMDevice *nvdimm = NVDIMM(obj);
+Error *local_err = NULL;
+char *value;
+
+visit_type_str(v, name, &value, &local_err);
+if (local_err) {
+goto out;
+}
+
+if (qemu_uuid_parse(value, &nvdimm->uuid) != 0) {
+error_setg(errp, "Property '%s.%s' has invalid value",
+   object_get_typename(obj), name);
+goto out;
+}
+g_free(value);
+
+out:
+error_propagate(errp, local_err);
+}
+
+
 static void nvdimm_init(Object *obj)
 {
 object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int",
 nvdimm_get_label_size, nvdimm_set_label_size, NULL,
 NULL, NULL);
+
+object_property_add(obj, NVDIMM_UUID_PROP, "QemuUUID", nvdimm_get_uuid,
+nvdimm_set_uuid, NULL, NULL, NULL);
 }
 
 static void nvdimm_finalize(Object *obj)
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index 523a9b3d4a..4807ca615b 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -25,6 +25,7 @@
 
 #include "hw/mem/pc-dimm.h"
 #include "hw/acpi/bios-linker-loader.h"
+#include "qemu/uuid.h"
 
 #define NVDIMM_DEBUG 0
 #define nvdimm_debug(fmt, ...)\
@@ -49,6 +50,7 @@
TYPE_NVDIMM)
 
 #define NVDIMM_LABEL_SIZE_PROP "label-size"
+#define NVDIMM_UUID_PROP   "uuid"
 #define NVDIMM_UNARMED_PROP"unarmed"
 
 struct NVDIMMDevice {
@@ -83,6 +85,11 @@ struct NVDIMMDevice {
  * the guest write persistence.
  */
 bool unarmed;
+
+/*
+ * The PPC64 - spapr requires each nvdimm device to have a uuid.
+ */
+QemuUUID uuid;
 };
 typedef struct NVDIMMDevice NVDIMMDevice;
 




[PATCH v4 0/4] ppc: spapr: virtual NVDIMM support

2019-12-17 Thread Shivaprasad G Bhat
The patchset attempts to implement the virtual NVDIMM for pseries.

PAPR semantics are such that each NVDIMM device comprises multiple
SCM (Storage Class Memory) blocks. The hypervisor is expected to
prepare the FDT for the NVDIMM device and send the guest a hotplug
interrupt with the new type RTAS_LOG_V6_HP_TYPE_PMEM, which is already
handled by the upstream kernel. In response to that interrupt, the
guest requests the hypervisor to bind each of the SCM blocks of the
NVDIMM device using hcalls. There can be SCM block unbind requests in
case of driver errors or unplug (not supported now) use cases. The
NVDIMM label read/writes are done through hcalls.

Since each virtual NVDIMM device is divided into multiple SCM blocks,
the bind, unbind, and queries using hcalls on those blocks can come
independently. This doesn't fit well into the qemu device semantics,
where map/unmap are done at (whole) device/object level granularity.
The patchset uses the existing NVDIMM class structures for the
implementation. The bind/unbind is left to happen at the
device_add/del phase itself instead of on demand at hcall time.

The guest kernel makes bind/unbind requests for the virtual NVDIMM
device at region-level granularity. Without interleaving, each
virtual NVDIMM device is presented as a separate region. Hence it is
safe to bind/unbind everything during the object_add/del.

The free device-memory region used for memory hotplug is managed as
multiple LMBs of size 256MiB, each expected to be aligned to 256MiB.
As the SCM blocks are mapped into the same region, the SCM blocks also
need to be aligned to this size for subsequent memory hotplug to work.
The minimum SCM block size is set to this size for that reason and can
be made user configurable in the future if required.

The first patch moves the existing static function to a common area
so it can be used in the subsequent patches. The second patch adds a
new uuid property to the nvdimm device. The third patch adds FDT
entries and basic device support, and the fourth patch adds the hcalls
implementation.

The patches are also available at
https://github.com/ShivaprasadGBhat/qemu.git - pseries-nvdimm-v4 branch
and can be used with the upstream kernel. ndctl can be used for
configuring the nvdimms inside the guest.
This is how it can be used ..
Ex :
For coldplug, the device to be added in qemu command line as shown below
-object 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
-device 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

For hotplug, the device to be added from monitor as below
object_add 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
device_add 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

---
v3: https://lists.gnu.org/archive/html/qemu-devel/2019-10/msg03452.html
Changes from v3:
 - Moved NVDIMM uuid property addition to new patch.
 - Moved the SCM hcalls to new file
 - Changed the metadata read/write hcalls to use st/ldX_be_p macros.
 - Fixed all comments on v3
v2: https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg02785.html
Changes from v2:
 - Creating the drc indices for the nvdimm devices in advance as
   suggested based on the number of user specified max slots property.
 - Removed the hard dependency on -machine nvdimm=on, enabled by
   default on the current latest pseries machine version.
 - Renamed the functions to spapr_dt_X as suggested.
 - Metadata is byteswapped before read/write to take care of endianness
   semantics during the hcall.
v1 : http://lists.nongnu.org/archive/html/qemu-devel/2019-02/msg01545.html
Changes from v1:
 - Rebased to upstream; this required a dt_populate implementation
   for nvdimm hotplug support
 - Added uuid option to nvdimm device
 - Removed the memory region sizing down code as suggested by Igor,
   now erroring out if NVDIMM size excluding the label area is not
   aligned to 256MB, so patch 2 from previous series no longer needed.
 - Removed un-implemented hcalls
 - Changed the hcalls to different kinds of checks and return
   different values.
 - Addressed comments for v1
 
Shivaprasad G Bhat (4):
  mem: move nvdimm_device_list to utilities
  nvdimm: add uuid property to nvdimm
  spapr: Add NVDIMM device support
  spapr: Add Hcalls to support PAPR NVDIMM device


 default-configs/ppc64-softmmu.mak |1 
 hw/acpi/nvdimm.c  |   28 ---
 hw/mem/Kconfig|2 
 hw/mem/nvdimm.c   |   40 
 hw/ppc/Makefile.objs  |2 
 hw/ppc/spapr.c|  216 ++--
 hw/ppc/spapr_drc.c|   18 ++
 hw/ppc/spapr_events.c |4 
 hw/ppc/spapr_nvdimm.c |  337

[PATCH v4 4/4] spapr: Add Hcalls to support PAPR NVDIMM device

2019-12-17 Thread Shivaprasad G Bhat
This patch implements a few of the necessary hcalls for the nvdimm support.

PAPR semantics are such that each NVDIMM device comprises multiple
SCM (Storage Class Memory) blocks. The guest requests the hypervisor to
bind each of the SCM blocks of the NVDIMM device using hcalls. There can
be SCM block unbind requests in case of driver errors or unplug (not
supported now) use cases. The NVDIMM label read/writes are done through
hcalls.

Since each virtual NVDIMM device is divided into multiple SCM blocks,
the bind, unbind, and queries using hcalls on those blocks can come
independently. This doesn't fit well into the qemu device semantics,
where map/unmap are done at (whole) device/object level granularity.
The patch doesn't actually bind/unbind on hcalls but lets it happen at
the device_add/del phase itself instead.

The guest kernel makes bind/unbind requests for the virtual NVDIMM device
at the region level granularity. Without interleaving, each virtual NVDIMM
device is presented as a separate guest physical address range. So, there
is no way a partial bind/unbind request can come for the vNVDIMM in a
hcall for a subset of SCM blocks of a virtual NVDIMM. Hence it is safe to
do bind/unbind everything during the device_add/del.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/Makefile.objs   |2 
 hw/ppc/spapr_nvdimm.c  |  337 
 include/hw/ppc/spapr.h |8 +
 3 files changed, 345 insertions(+), 2 deletions(-)
 create mode 100644 hw/ppc/spapr_nvdimm.c

diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index 580bb4f0dd..0366020ef9 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -5,7 +5,7 @@ obj-$(CONFIG_PSERIES) += spapr.o spapr_caps.o spapr_vio.o 
spapr_events.o
 obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
 obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o
 obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o spapr_irq.o
-obj-$(CONFIG_PSERIES) += spapr_tpm_proxy.o
+obj-$(CONFIG_PSERIES) += spapr_tpm_proxy.o spapr_nvdimm.o
 obj-$(CONFIG_SPAPR_RNG) +=  spapr_rng.o
 # IBM PowerNV
 obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o pnv_psi.o 
pnv_occ.o pnv_bmc.o
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
new file mode 100644
index 00..4a3f796597
--- /dev/null
+++ b/hw/ppc/spapr_nvdimm.c
@@ -0,0 +1,337 @@
+/*
+ * QEMU PAPR Storage Class Memory Interfaces
+ *
+ * Copyright (c) 2019, IBM Corporation.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/ppc/spapr.h"
+#include "hw/ppc/spapr_drc.h"
+#include "hw/mem/nvdimm.h"
+#include "qemu/range.h"
+#include "qemu/nvdimm-utils.h"
+
+static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
+SpaprMachineState *spapr,
+target_ulong opcode,
+target_ulong *args)
+{
+uint32_t drc_index = args[0];
+uint64_t offset = args[1];
+uint64_t numBytesToRead = args[2];
+SpaprDrc *drc = spapr_drc_by_index(drc_index);
+NVDIMMDevice *nvdimm;
+NVDIMMClass *ddc;
+uint64_t data = 0;
+uint8_t buf[8] = { 0 };
+
+if (!drc || !drc->dev ||
+spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+return H_PARAMETER;
+}
+
+if (numBytesToRead != 1 && numBytesToRead != 2 &&
+numBytesToRead != 4 && numBytesToRead != 8) {
+return H_P3;
+}
+
+nvdimm = NVDIMM(drc->dev);
+if ((offset + numBytesToRead < offset) ||
+(nvdimm->label_size < numBytesToRead + offset)) {
+return H_P2;
+}
+
+ddc = NVDIMM_GET_CLASS(nvdimm);
+ddc->read_label_data(nvdimm, buf, numBytesToRead, offset);
+
+switch (num

[PATCH v4 3/4] spapr: Add NVDIMM device support

2019-12-17 Thread Shivaprasad G Bhat
Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
device interface in QEMU to support virtual NVDIMM devices for Power.
Create the required DT entries for the device (some entries have
dummy values right now).

The patch creates the required DT node and sends a hotplug
interrupt to the guest. The guest is expected to undertake the normal
DR resource add path in response and start issuing PAPR SCM hcalls.

Unlike x86, the device support is verified based on the machine version.

This is how it can be used ..
Ex :
For coldplug, the device to be added in qemu command line as shown below
-object 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
-device 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

For hotplug, the device to be added from monitor as below
object_add 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
device_add 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

Signed-off-by: Shivaprasad G Bhat 
Signed-off-by: Bharata B Rao 
   [Early implementation]
---
 default-configs/ppc64-softmmu.mak |1 
 hw/mem/Kconfig|2 
 hw/ppc/spapr.c|  216 ++---
 hw/ppc/spapr_drc.c|   18 +++
 hw/ppc/spapr_events.c |4 +
 include/hw/ppc/spapr.h|   11 ++
 include/hw/ppc/spapr_drc.h|9 ++
 7 files changed, 245 insertions(+), 16 deletions(-)

diff --git a/default-configs/ppc64-softmmu.mak 
b/default-configs/ppc64-softmmu.mak
index cca52665d9..ae0841fa3a 100644
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -8,3 +8,4 @@ CONFIG_POWERNV=y
 
 # For pSeries
 CONFIG_PSERIES=y
+CONFIG_NVDIMM=y
diff --git a/hw/mem/Kconfig b/hw/mem/Kconfig
index 620fd4cb59..2ad052a536 100644
--- a/hw/mem/Kconfig
+++ b/hw/mem/Kconfig
@@ -8,4 +8,4 @@ config MEM_DEVICE
 config NVDIMM
 bool
 default y
-depends on PC
+depends on (PC || PSERIES)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 3ae7db1563..921d8d7c8e 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -80,6 +80,8 @@
 #include "hw/ppc/spapr_cpu_core.h"
 #include "hw/mem/memory-device.h"
 #include "hw/ppc/spapr_tpm_proxy.h"
+#include "hw/mem/nvdimm.h"
+#include "qemu/nvdimm-utils.h"
 
 #include "monitor/monitor.h"
 
@@ -685,12 +687,22 @@ static int spapr_populate_drmem_v2(SpaprMachineState 
*spapr, void *fdt,
 nr_entries++;
 }
 
-/* Entry for DIMM */
-drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
-g_assert(drc);
-elem = spapr_get_drconf_cell(size / lmb_size, addr,
- spapr_drc_index(drc), node,
- SPAPR_LMB_FLAGS_ASSIGNED);
+if (info->value->type == MEMORY_DEVICE_INFO_KIND_DIMM) {
+/* Entry for DIMM */
+drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
+g_assert(drc);
+elem = spapr_get_drconf_cell(size / lmb_size, addr,
+ spapr_drc_index(drc), node,
+ SPAPR_LMB_FLAGS_ASSIGNED);
+} else if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
+/*
+ * NVDIMM sits here, let the DIMM LMBs be unusable here in the
+ * whole range
+ */
+elem = spapr_get_drconf_cell(size / lmb_size, addr, 0, -1,
+ SPAPR_LMB_FLAGS_RESERVED |
+ SPAPR_LMB_FLAGS_DRC_INVALID);
+}
 QSIMPLEQ_INSERT_TAIL(_queue, elem, entry);
 nr_entries++;
 cur_addr = addr + size;
@@ -1197,6 +1209,85 @@ static void spapr_dt_hypervisor(SpaprMachineState 
*spapr, void *fdt)
 }
 }
 
+static int spapr_dt_nvdimm(void *fdt, int parent_offset,
+   NVDIMMDevice *nvdimm)
+{
+int child_offset;
+char buf[40];
+SpaprDrc *drc;
+uint32_t drc_idx;
+uint32_t node = object_property_get_uint(OBJECT(nvdimm), PC_DIMM_NODE_PROP,
+ &error_abort);
+uint64_t slot = object_property_get_uint(OBJECT(nvdimm), PC_DIMM_SLOT_PROP,
+ &error_abort);
+uint32_t associativity[] = {
+cpu_to_be32(0x4), /* length */
+cpu_to_be32(0x0), cpu_to_be32(0x0),
+cpu_to_be32(0x0), cpu_to_be32(node)
+};
+uint64_t lsize = nvdimm->label_size;
+uint64_t size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
+NULL);
+
+drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, slot);
+g_assert(drc);
+
+drc_idx = spapr_drc_in

[PATCH v4 1/4] mem: move nvdimm_device_list to utilities

2019-12-17 Thread Shivaprasad G Bhat
nvdimm_device_list is required for parsing the list for devices
in subsequent patches. Move it to common utility area.

Signed-off-by: Shivaprasad G Bhat 
Reviewed-by: Igor Mammedov 
---
 hw/acpi/nvdimm.c|   28 +---
 include/qemu/nvdimm-utils.h |7 +++
 util/Makefile.objs  |1 +
 util/nvdimm-utils.c |   29 +
 4 files changed, 38 insertions(+), 27 deletions(-)
 create mode 100644 include/qemu/nvdimm-utils.h
 create mode 100644 util/nvdimm-utils.c

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 9fdad6dc3f..5219dd0e2e 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -32,33 +32,7 @@
 #include "hw/acpi/bios-linker-loader.h"
 #include "hw/nvram/fw_cfg.h"
 #include "hw/mem/nvdimm.h"
-
-static int nvdimm_device_list(Object *obj, void *opaque)
-{
-GSList **list = opaque;
-
-if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
-*list = g_slist_append(*list, DEVICE(obj));
-}
-
-object_child_foreach(obj, nvdimm_device_list, opaque);
-return 0;
-}
-
-/*
- * inquire NVDIMM devices and link them into the list which is
- * returned to the caller.
- *
- * Note: it is the caller's responsibility to free the list to avoid
- * memory leak.
- */
-static GSList *nvdimm_get_device_list(void)
-{
-GSList *list = NULL;
-
-object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list);
-return list;
-}
+#include "qemu/nvdimm-utils.h"
 
 #define NVDIMM_UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
diff --git a/include/qemu/nvdimm-utils.h b/include/qemu/nvdimm-utils.h
new file mode 100644
index 00..4b8b198ba7
--- /dev/null
+++ b/include/qemu/nvdimm-utils.h
@@ -0,0 +1,7 @@
+#ifndef NVDIMM_UTILS_H
+#define NVDIMM_UTILS_H
+
+#include "qemu/osdep.h"
+
+GSList *nvdimm_get_device_list(void);
+#endif
diff --git a/util/Makefile.objs b/util/Makefile.objs
index df124af1c5..2a096fe190 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -20,6 +20,7 @@ util-obj-y += envlist.o path.o module.o
 util-obj-y += host-utils.o
 util-obj-y += bitmap.o bitops.o hbitmap.o
 util-obj-y += fifo8.o
+util-obj-y += nvdimm-utils.o
 util-obj-y += cacheinfo.o
 util-obj-y += error.o qemu-error.o
 util-obj-y += qemu-print.o
diff --git a/util/nvdimm-utils.c b/util/nvdimm-utils.c
new file mode 100644
index 00..5cc768ca47
--- /dev/null
+++ b/util/nvdimm-utils.c
@@ -0,0 +1,29 @@
+#include "qemu/nvdimm-utils.h"
+#include "hw/mem/nvdimm.h"
+
+static int nvdimm_device_list(Object *obj, void *opaque)
+{
+GSList **list = opaque;
+
+if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
+*list = g_slist_append(*list, DEVICE(obj));
+}
+
+object_child_foreach(obj, nvdimm_device_list, opaque);
+return 0;
+}
+
+/*
+ * inquire NVDIMM devices and link them into the list which is
+ * returned to the caller.
+ *
+ * Note: it is the caller's responsibility to free the list to avoid
+ * memory leak.
+ */
+GSList *nvdimm_get_device_list(void)
+{
+GSList *list = NULL;
+
+object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list);
+return list;
+}




Re: [PATCH v3 3/3] spapr: Add Hcalls to support PAPR NVDIMM device

2019-12-16 Thread Shivaprasad G Bhat

Hi David,


On 11/22/2019 10:41 AM, David Gibson wrote:

On Mon, Oct 14, 2019 at 01:38:16PM -0500, Shivaprasad G Bhat wrote:

device_add/del phase itself instead.

The guest kernel makes bind/unbind requests for the virtual NVDIMM device
at the region level granularity. Without interleaving, each virtual NVDIMM

It's not clear to me what a "region" means in this context.


That is PMEM terminology. "region" in this context is guest physical
address range.

Fixing all the rest of the things you pointed out.

Thanks,
Shivaprasad








Re: [PATCH v3 2/3] spapr: Add NVDIMM device support

2019-12-16 Thread Shivaprasad G Bhat

Hi David,

On 11/22/2019 10:00 AM, David Gibson wrote:

On Mon, Oct 14, 2019 at 01:37:50PM -0500, Shivaprasad G Bhat wrote:
---

index 62f1a42592..815167e42f 100644
--- a/hw/ppc/spapr_drc.c
+++ b/hw/ppc/spapr_drc.c
@@ -708,6 +708,17 @@ static void spapr_drc_phb_class_init(ObjectClass *k, void 
*data)
  drck->dt_populate = spapr_phb_dt_populate;
  }
  
+static void spapr_drc_pmem_class_init(ObjectClass *k, void *data)

+{
+SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_CLASS(k);
+
+drck->typeshift = SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM;
+drck->typename = "MEM";

This is the same as the typename for LMB DRCs.  Doesn't that mean that
ibm,drc-types will end up with a duplicate in it?


Correct, this has to be "PMEM" instead of "MEM". Fixing it in next version.

Thanks,
Shivaprasad


+drck->drc_name_prefix = "PMEM ";






Re: [PATCH v3 2/3] spapr: Add NVDIMM device support

2019-12-12 Thread Shivaprasad G Bhat




On 12/11/2019 01:35 PM, Igor Mammedov wrote:

On Wed, 11 Dec 2019 09:44:11 +0530
Shivaprasad G Bhat  wrote:


On 12/06/2019 07:22 AM, David Gibson wrote:

On Wed, Nov 27, 2019 at 09:50:54AM +0530, Bharata B Rao wrote:

On Fri, Nov 22, 2019 at 10:42 AM David Gibson
 wrote:

Ok.  A number of queries about this.

1) The PAPR spec for ibm,dynamic-memory-v2 says that the first word in
each entry is the number of LMBs, but for NVDIMMs you use the
not-necessarily-equal scm_block_size instead.  Does the NVDIMM
amendment for PAPR really specify to use different block sizes for
these cases?  (In which case that's a really stupid spec decision, but
that wouldn't surprise me at this point).

SCM block sizes can be different from LMB sizes, but here we enforce
that the SCM device size (excluding metadata) is a multiple of the LMB
size so that we don't end up with a memory range that is not aligned to
the LMB size.

Right, but it still doesn't make sense to use scm_block_size when you
create the dynamic-memory-v2 property.

Right, I should use LMB size here as I will be creating holes here to
disallow DIMMs from claiming those LMBs, marking them INVALID as Bharata
suggested before.


   As far as the thing
interpreting that goes, it *must* be LMB size, not SCM block size.  If
those are required to be the same at this point, you should use an
assert().

SCM block size should be a multiple of the LMB size; it need not be equal.
I'll add an assert for that, checking if they are equal. There is no
benefit I see as of now in having a higher SCM block size, as the
bind/unbind are already done before the bind hcall.
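
Something along these lines, as a sketch of the assert being discussed
(macro names as used elsewhere in this series, not the final code):

    g_assert(SPAPR_MINIMUM_SCM_BLOCK_SIZE == SPAPR_MEMORY_BLOCK_SIZE);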


2) Similarly, the ibm,dynamic-memory-v2 description says that the
memory block described by the entry has a whole batch of contiguous
DRCs starting at the DRC index given and continuing for #LMBs DRCs.
For NVDIMMs it appears that you just have one DRC for the whole
NVDIMM.  Is that right?

One NVDIMM has one DRC. In our case, we need to mark the LMBs
corresponding to that address range in ibm,dynamic-memory-v2 as
reserved and invalid.

Ok, that fits very weirdly with the DRC allocation for the rest of
pluggable memory, but I suppose that's PAPR for you.

Having these in together is very inscrutable though, and relies on a
heap of non-obvious constraints about placement of DIMMs and NVDIMMs
relative to each other.  I really wonder if it would be better to have
a completely different address range for the NVDIMMs.

The backend objects for both DIMM and NVDIMM are memory-backend-*
and they use addresses from the same space. Separating them would mean
using/introducing a different backend object. I don't think we have a
choice here.

What address-space(s) are are talking about here exactly?
 From my point of view memory-backend-* provides RAM block at
some HVA, which shouldn't not have anything to do with how NVDIMM
partitions and maps it to GPA.


Ah, you are right! I got confused with the HVA.

Nonetheless, I don't see a need for having vNVDIMM in a different
guest physical address range, as the existing code has support for marking
memory ranges distinctly for DIMM/NVDIMM.

On another note, x86 does it the same way too. There is no separate
range defined there.





3) You're not setting *any* extra flags on the entry.  How is the
guest supposed to know which are NVDIMM entries and which are regular
DIMM entries?  AFAICT in this version the NVDIMM slots are
indistinguishable from the unassigned hotplug memory (which makes the
difference in LMB and DRC numbering even more troubling).

For the NVDIMM case, this patch should populate the LMB set in
ibm,dynamic-memory-v2 with something like below:
  elem = spapr_get_drconf_cell(size / lmb_size, addr, 0, -1,
                               SPAPR_LMB_FLAGS_RESERVED |
                               SPAPR_LMB_FLAGS_DRC_INVALID);

This will ensure that the NVDIMM range will never be considered as
valid memory range for memory hotplug.

Hrm.  Ok so we already have code that does that for any gaps between
DIMMs.  I don't think there's actually anything that that code will do
differently than the code you have for NVDIMMs, so you could just skip
over the NVDIMMs here and it should do the right thing.

The *interpretation* of those entries will become different: for space
into which a regular DIMM is later inserted, we'll assume the DRC
index given is a base and there are more DRCs following it, but for
NVDIMMs we'll assume the same DRC throughout.  This is nuts, but IIUC
that's what PAPR says and we can't do much about it.

My current patch is buggy, as Bharata pointed out. The NVDIMM DRCs
are not to be populated here; instead the LMB DRCs are marked RESERVED
and INVALID so that malicious attempts to online those LMBs at the
NVDIMM address ranges fail.

  

4) AFAICT these are _present_ NVDIMMs, so why is
SPAPR_LMB_FLAGS_ASSIGNED not set for them?  (and why is the node
forced to -1, regardless of di->node).
  

   QSIMPLEQ_INSERT_TAIL(_queue, elem, entry);
   nr_entries++;
   cur_addr = addr + s

Re: [PATCH v3 2/3] spapr: Add NVDIMM device support

2019-12-10 Thread Shivaprasad G Bhat



On 12/06/2019 07:22 AM, David Gibson wrote:

On Wed, Nov 27, 2019 at 09:50:54AM +0530, Bharata B Rao wrote:

On Fri, Nov 22, 2019 at 10:42 AM David Gibson
 wrote:

Ok.  A number of queries about this.

1) The PAPR spec for ibm,dynamic-memory-v2 says that the first word in
each entry is the number of LMBs, but for NVDIMMs you use the
not-necessarily-equal scm_block_size instead.  Does the NVDIMM
amendment for PAPR really specify to use different block sizes for
these cases?  (In which case that's a really stupid spec decision, but
that wouldn't surprise me at this point).

SCM block sizes can be different from LMB sizes, but here we enforce
that the SCM device size (excluding metadata) is a multiple of the LMB
size so that we don't end up with a memory range that is not aligned to
the LMB size.

Right, but it still doesn't make sense to use scm_block_size when you
create the dynamic-memory-v2 property.


Right, I should use LMB size here as I will be creating holes here to
disallow DIMMs from claiming those LMBs, marking them INVALID as Bharata
suggested before.


  As far as the thing
interpreting that goes, it *must* be LMB size, not SCM block size.  If
those are required to be the same at this point, you should use an
assert().


SCM block size should be a multiple of the LMB size; it need not be equal.
I'll add an assert for that, checking if they are equal. There is no
benefit I see as of now in having a higher SCM block size, as the
bind/unbind are already done before the bind hcall.


2) Similarly, the ibm,dynamic-memory-v2 description says that the
memory block described by the entry has a whole batch of contiguous
DRCs starting at the DRC index given and continuing for #LMBs DRCs.
For NVDIMMs it appears that you just have one DRC for the whole
NVDIMM.  Is that right?

One NVDIMM has one DRC. In our case, we need to mark the LMBs
corresponding to that address range in ibm,dynamic-memory-v2 as
reserved and invalid.

Ok, that fits very weirdly with the DRC allocation for the rest of
pluggable memory, but I suppose that's PAPR for you.

Having these in together is very inscrutable though, and relies on a
heap of non-obvious constraints about placement of DIMMs and NVDIMMs
relative to each other.  I really wonder if it would be better to have
a completely different address range for the NVDIMMs.


The backend objects for both DIMM and NVDIMM are memory-backend-*
and they use addresses from the same space. Separating them would mean
using/introducing a different backend object. I don't think we have a
choice here.





3) You're not setting *any* extra flags on the entry.  How is the
guest supposed to know which are NVDIMM entries and which are regular
DIMM entries?  AFAICT in this version the NVDIMM slots are
indistinguishable from the unassigned hotplug memory (which makes the
difference in LMB and DRC numbering even more troubling).

For the NVDIMM case, this patch should populate the LMB set in
ibm,dynamic-memory-v2 with something like below:
  elem = spapr_get_drconf_cell(size / lmb_size, addr, 0, -1,
                               SPAPR_LMB_FLAGS_RESERVED |
                               SPAPR_LMB_FLAGS_DRC_INVALID);

This will ensure that the NVDIMM range will never be considered as
valid memory range for memory hotplug.

Hrm.  Ok so we already have code that does that for any gaps between
DIMMs.  I don't think there's actually anything that that code will do
differently than the code you have for NVDIMMs, so you could just skip
over the NVDIMMs here and it should do the right thing.

The *interpretation* of those entries will become different: for space
into which a regular DIMM is later inserted, we'll assume the DRC
index given is a base and there are more DRCs following it, but for
NVDIMMs we'll assume the same DRC throughout.  This is nuts, but IIUC
that's what PAPR says and we can't do much about it.


My current patch is buggy, as Bharata pointed out. The NVDIMM DRCs
are not to be populated here; instead the LMB DRCs are marked RESERVED
and INVALID so that malicious attempts to online those LMBs at the
NVDIMM address ranges fail.




4) AFAICT these are _present_ NVDIMMs, so why is
SPAPR_LMB_FLAGS_ASSIGNED not set for them?  (and why is the node
forced to -1, regardless of di->node).


  QSIMPLEQ_INSERT_TAIL(_queue, elem, entry);
  nr_entries++;
  cur_addr = addr + size;
@@ -1261,6 +1273,85 @@ static void spapr_dt_hypervisor(SpaprMachineState 
*spapr, void *fdt)
  }
  }

+static void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr)
+{
+MachineState *machine = MACHINE(spapr);
+int i;
+
+for (i = 0; i < machine->ram_slots; i++) {
+spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM, i);

What happens if you try to plug an NVDIMM to one of these slots, but a
regular DIMM has already taken it?

NVDIMM hotplug won't get that occupied slot.

Ok.






Re: [PATCH v3 1/3] mem: move nvdimm_device_list to utilities

2019-11-20 Thread Shivaprasad G Bhat

Hi Igor,


On 11/19/2019 12:43 PM, Igor Mammedov wrote:

On Mon, 14 Oct 2019 13:37:37 -0500
Shivaprasad G Bhat  wrote:


nvdimm_device_list is required for parsing the list for devices
in subsequent patches. Move it to common utility area.

Signed-off-by: Shivaprasad G Bhat 
---
  hw/acpi/nvdimm.c|   28 +---
  include/qemu/nvdimm-utils.h |7 +++
  util/Makefile.objs  |1 +
  util/nvdimm-utils.c |   29 +

instead of creating new file, why not to move it to existing hw/mem/nvdimm.c?


That would break the build for mips-softmmu. The mips target has
CONFIG_ACPI_NVDIMM=y and not CONFIG_NVDIMM. So, the build would break,
failing to find the definition from hw/mem/nvdimm.c.

I have the patch here from v2 of the series,

https://github.com/ShivaprasadGBhat/qemu/commit/00512a25e4852f174fe6c07bc5acb5ee7027e3de

Thanks,
Shivaprasad




[PATCH v3 3/3] spapr: Add Hcalls to support PAPR NVDIMM device

2019-10-14 Thread Shivaprasad G Bhat
This patch implements a few of the necessary hcalls for the nvdimm support.

PAPR semantics are such that each NVDIMM device comprises multiple
SCM (Storage Class Memory) blocks. The guest requests the hypervisor to
bind each of the SCM blocks of the NVDIMM device using hcalls. There can
be SCM block unbind requests in case of driver errors or unplug (not
supported now) use cases. The NVDIMM label read/writes are done through
hcalls.

Since each virtual NVDIMM device is divided into multiple SCM blocks,
the bind, unbind, and queries using hcalls on those blocks can come
independently. This doesn't fit well into the qemu device semantics,
where map/unmap are done at (whole) device/object level granularity.
The patch doesn't actually bind/unbind on hcalls but lets it happen at
the device_add/del phase itself instead.

The guest kernel makes bind/unbind requests for the virtual NVDIMM device
at region-level granularity. Without interleaving, each virtual NVDIMM
device is presented as a separate region. There is no way to configure
virtual NVDIMM interleaving for the guests today. So, there is no way a
partial bind/unbind request can come for the vNVDIMM in a hcall for a
subset of SCM blocks of a virtual NVDIMM. Hence it is safe to
bind/unbind everything during the device_add/del.

Signed-off-by: Shivaprasad G Bhat 
---
---
 hw/ppc/spapr_hcall.c   |  300 
 include/hw/ppc/spapr.h |8 +
 2 files changed, 307 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 23e4bdb829..4e9ad96f7c 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -18,6 +18,10 @@
 #include "hw/ppc/spapr_ovec.h"
 #include "mmu-book3s-v3.h"
 #include "hw/mem/memory-device.h"
+#include "hw/ppc/spapr_drc.h"
+#include "hw/mem/nvdimm.h"
+#include "qemu/range.h"
+#include "qemu/nvdimm-utils.h"
 
 static bool has_spr(PowerPCCPU *cpu, int spr)
 {
@@ -1961,6 +1965,295 @@ static target_ulong h_update_dt(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 return H_SUCCESS;
 }
 
+static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
+SpaprMachineState *spapr,
+target_ulong opcode,
+target_ulong *args)
+{
+uint32_t drc_index = args[0];
+uint64_t offset = args[1];
+uint64_t numBytesToRead = args[2];
+SpaprDrc *drc = spapr_drc_by_index(drc_index);
+NVDIMMDevice *nvdimm;
+NVDIMMClass *ddc;
+__be64 data_be = 0;
+uint64_t data = 0;
+
+if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+return H_PARAMETER;
+}
+
+if (numBytesToRead != 1 && numBytesToRead != 2 &&
+numBytesToRead != 4 && numBytesToRead != 8) {
+return H_P3;
+}
+
+nvdimm = NVDIMM(drc->dev);
+if ((offset + numBytesToRead < offset) ||
+(nvdimm->label_size < numBytesToRead + offset)) {
+return H_P2;
+}
+
+ddc = NVDIMM_GET_CLASS(nvdimm);
+ddc->read_label_data(nvdimm, &data_be, numBytesToRead, offset);
+
+switch (numBytesToRead) {
+case 1:
+data = data_be & 0xff;
+break;
+case 2:
+data = be16_to_cpu(data_be & 0xffff);
+break;
+case 4:
+data = be32_to_cpu(data_be & 0xffffffff);
+break;
+case 8:
+data = be64_to_cpu(data_be);
+break;
+default:
+break;
+}
+
+args[0] = data;
+
+return H_SUCCESS;
+}
+
+static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
+ SpaprMachineState *spapr,
+ target_ulong opcode,
+ target_ulong *args)
+{
+uint32_t drc_index = args[0];
+uint64_t offset = args[1];
+uint64_t data = args[2];
+uint64_t numBytesToWrite = args[3];
+SpaprDrc *drc = spapr_drc_by_index(drc_index);
+NVDIMMDevice *nvdimm;
+DeviceState *dev;
+NVDIMMClass *ddc;
+__be64 data_be = 0;
+
+if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+return H_PARAMETER;
+}
+
+if (numBytesToWrite != 1 && numBytesToWrite != 2 &&
+numBytesToWrite != 4 && numBytesToWrite != 8) {
+return H_P4;
+}
+
+dev = drc->dev;
+nvdimm = NVDIMM(dev);
+
+switch (numBytesToWrite) {
+case 1:
+if (data & 0xffffffffffffff00) {
+return H_P2;
+}
+data_be = data & 0xff;
+break;
+case 2:
+if (data & 0xffffffffffff0000) {
+return H_P2;
+}
+data_be = cpu_to_be16(data & 0xffff);
+break;
+case 4:
+if (data & 0xffffffff00000000) {
+return H_P2;
+}
+dat

[PATCH v3 2/3] spapr: Add NVDIMM device support

2019-10-14 Thread Shivaprasad G Bhat
Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm
device interface in QEMU to support virtual NVDIMM devices for Power.
Create the required DT entries for the device (some entries have
dummy values right now).

The patch creates the required DT node and sends a hotplug
interrupt to the guest. The guest is expected to undertake the normal
DR resource add path in response and start issuing PAPR SCM hcalls.

Unlike x86, the device support is verified based on the machine version.

This is how it can be used ..
Ex :
For coldplug, the device to be added in qemu command line as shown below
-object 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
-device 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

For hotplug, the device to be added from monitor as below
object_add 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
device_add 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

Signed-off-by: Shivaprasad G Bhat 
Signed-off-by: Bharata B Rao 
   [Early implementation]
---
 default-configs/ppc64-softmmu.mak |1 
 hw/mem/Kconfig|2 
 hw/mem/nvdimm.c   |   40 +++
 hw/ppc/spapr.c|  218 ++---
 hw/ppc/spapr_drc.c|   18 +++
 hw/ppc/spapr_events.c |4 +
 include/hw/mem/nvdimm.h   |7 +
 include/hw/ppc/spapr.h|   11 ++
 include/hw/ppc/spapr_drc.h|9 ++
 9 files changed, 293 insertions(+), 17 deletions(-)

diff --git a/default-configs/ppc64-softmmu.mak 
b/default-configs/ppc64-softmmu.mak
index cca52665d9..ae0841fa3a 100644
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -8,3 +8,4 @@ CONFIG_POWERNV=y
 
 # For pSeries
 CONFIG_PSERIES=y
+CONFIG_NVDIMM=y
diff --git a/hw/mem/Kconfig b/hw/mem/Kconfig
index 620fd4cb59..2ad052a536 100644
--- a/hw/mem/Kconfig
+++ b/hw/mem/Kconfig
@@ -8,4 +8,4 @@ config MEM_DEVICE
 config NVDIMM
 bool
 default y
-depends on PC
+depends on (PC || PSERIES)
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 375f9a588a..e1238b5bed 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -69,11 +69,51 @@ out:
 error_propagate(errp, local_err);
 }
 
+static void nvdimm_get_uuid(Object *obj, Visitor *v, const char *name,
+  void *opaque, Error **errp)
+{
+NVDIMMDevice *nvdimm = NVDIMM(obj);
+char *value = NULL;
+
+value = qemu_uuid_unparse_strdup(&nvdimm->uuid);
+
+visit_type_str(v, name, &value, errp);
+g_free(value);
+}
+
+
+static void nvdimm_set_uuid(Object *obj, Visitor *v, const char *name,
+  void *opaque, Error **errp)
+{
+NVDIMMDevice *nvdimm = NVDIMM(obj);
+Error *local_err = NULL;
+char *value;
+
+visit_type_str(v, name, &value, &local_err);
+if (local_err) {
+goto out;
+}
+
+if (qemu_uuid_parse(value, &nvdimm->uuid) != 0) {
+error_setg(errp, "Property '%s.%s' has invalid value",
+   object_get_typename(obj), name);
+goto out;
+}
+g_free(value);
+
+out:
+error_propagate(errp, local_err);
+}
+
+
 static void nvdimm_init(Object *obj)
 {
 object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int",
 nvdimm_get_label_size, nvdimm_set_label_size, NULL,
 NULL, NULL);
+
+object_property_add(obj, NVDIMM_UUID_PROP, "QemuUUID", nvdimm_get_uuid,
+nvdimm_set_uuid, NULL, NULL, NULL);
 }
 
 static void nvdimm_finalize(Object *obj)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 08a2a5a770..eb5c205078 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -80,6 +80,8 @@
 #include "hw/ppc/spapr_cpu_core.h"
 #include "hw/mem/memory-device.h"
 #include "hw/ppc/spapr_tpm_proxy.h"
+#include "hw/mem/nvdimm.h"
+#include "qemu/nvdimm-utils.h"
 
 #include 
 
@@ -716,7 +718,8 @@ static int spapr_populate_drmem_v2(SpaprMachineState 
*spapr, void *fdt,
 uint8_t *int_buf, *cur_index;
 int ret;
 uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
-uint64_t addr, cur_addr, size;
+uint64_t addr, cur_addr, size, slot;
+uint64_t scm_block_size = SPAPR_MINIMUM_SCM_BLOCK_SIZE;
 uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
 uint64_t mem_end = machine->device_memory->base +
memory_region_size(&machine->device_memory->mr);
@@ -741,6 +744,7 @@ static int spapr_populate_drmem_v2(SpaprMachineState 
*spapr, void *fdt,
 addr = di->addr;
 size = di->size;
 node = di->node;
+slot = di->slot;
 
 /* Entry for hot-pluggable area */
 if (cur_addr < addr) {

[PATCH v3 1/3] mem: move nvdimm_device_list to utilities

2019-10-14 Thread Shivaprasad G Bhat
nvdimm_device_list is required for parsing the list for devices
in subsequent patches. Move it to common utility area.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/acpi/nvdimm.c|   28 +---
 include/qemu/nvdimm-utils.h |7 +++
 util/Makefile.objs  |1 +
 util/nvdimm-utils.c |   29 +
 4 files changed, 38 insertions(+), 27 deletions(-)
 create mode 100644 include/qemu/nvdimm-utils.h
 create mode 100644 util/nvdimm-utils.c

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 9fdad6dc3f..5219dd0e2e 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -32,33 +32,7 @@
 #include "hw/acpi/bios-linker-loader.h"
 #include "hw/nvram/fw_cfg.h"
 #include "hw/mem/nvdimm.h"
-
-static int nvdimm_device_list(Object *obj, void *opaque)
-{
-GSList **list = opaque;
-
-if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
-*list = g_slist_append(*list, DEVICE(obj));
-}
-
-object_child_foreach(obj, nvdimm_device_list, opaque);
-return 0;
-}
-
-/*
- * inquire NVDIMM devices and link them into the list which is
- * returned to the caller.
- *
- * Note: it is the caller's responsibility to free the list to avoid
- * memory leak.
- */
-static GSList *nvdimm_get_device_list(void)
-{
-GSList *list = NULL;
-
-object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list);
-return list;
-}
+#include "qemu/nvdimm-utils.h"
 
 #define NVDIMM_UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
diff --git a/include/qemu/nvdimm-utils.h b/include/qemu/nvdimm-utils.h
new file mode 100644
index 00..4b8b198ba7
--- /dev/null
+++ b/include/qemu/nvdimm-utils.h
@@ -0,0 +1,7 @@
+#ifndef NVDIMM_UTILS_H
+#define NVDIMM_UTILS_H
+
+#include "qemu/osdep.h"
+
+GSList *nvdimm_get_device_list(void);
+#endif
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 41bf59d127..a0f40d26e3 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -20,6 +20,7 @@ util-obj-y += envlist.o path.o module.o
 util-obj-y += host-utils.o
 util-obj-y += bitmap.o bitops.o hbitmap.o
 util-obj-y += fifo8.o
+util-obj-y += nvdimm-utils.o
 util-obj-y += cacheinfo.o
 util-obj-y += error.o qemu-error.o
 util-obj-y += qemu-print.o
diff --git a/util/nvdimm-utils.c b/util/nvdimm-utils.c
new file mode 100644
index 00..5cc768ca47
--- /dev/null
+++ b/util/nvdimm-utils.c
@@ -0,0 +1,29 @@
+#include "qemu/nvdimm-utils.h"
+#include "hw/mem/nvdimm.h"
+
+static int nvdimm_device_list(Object *obj, void *opaque)
+{
+GSList **list = opaque;
+
+if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
+*list = g_slist_append(*list, DEVICE(obj));
+}
+
+object_child_foreach(obj, nvdimm_device_list, opaque);
+return 0;
+}
+
+/*
+ * inquire NVDIMM devices and link them into the list which is
+ * returned to the caller.
+ *
+ * Note: it is the caller's responsibility to free the list to avoid
+ * memory leak.
+ */
+GSList *nvdimm_get_device_list(void)
+{
+GSList *list = NULL;
+
+object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list);
+return list;
+}




[PATCH v3 0/3] ppc: spapr: virtual NVDIMM support

2019-10-14 Thread Shivaprasad G Bhat
The patchset attempts to implement the virtual NVDIMM for pseries.

PAPR semantics are such that each NVDIMM device comprises multiple
SCM (Storage Class Memory) blocks. The hypervisor is expected to
prepare the FDT for the NVDIMM device and send the guest a hotplug
interrupt with the new type RTAS_LOG_V6_HP_TYPE_PMEM, which is already
handled by the upstream kernel. In response to that interrupt, the
guest requests the hypervisor to bind each of the SCM blocks of the
NVDIMM device using hcalls. There can be SCM block unbind requests in
case of driver errors or unplug (not supported now) use cases. The
NVDIMM label read/writes are done through hcalls.

Since each virtual NVDIMM device is divided into multiple SCM blocks,
the bind, unbind, and queries using hcalls on those blocks can come
independently. This doesn't fit well into the qemu device semantics,
where map/unmap are done at (whole) device/object level granularity.
The patchset uses the existing NVDIMM class structures for the
implementation. The bind/unbind is left to happen at the
device_add/del phase itself instead of on demand at hcall time.

The guest kernel makes bind/unbind requests for the virtual NVDIMM
device at region-level granularity. Without interleaving, each
virtual NVDIMM device is presented as a separate region. There is no
way to configure virtual NVDIMM interleaving for the guests today.
So, there is no way a partial bind/unbind request can come for the
vNVDIMM in a hcall for a subset of SCM blocks of a virtual NVDIMM.
Hence it is safe to bind/unbind everything during the object_add/del.

The free device-memory region used for memory hotplug is managed as
multiple LMBs of size 256MiB, each expected to be aligned to 256MiB.
As the SCM blocks are mapped into the same region, the SCM blocks also
need to be aligned to this size for subsequent memory hotplug to work.
The minimum SCM block size is set to this size for that reason and can
be made user configurable in the future if required.

The first patch moves the existing static function to a common area
so it can be used in the subsequent patches. The second patch adds the
FDT entries and basic device support, and the third patch adds the
hcalls implementation.

The patches are also available at
https://github.com/ShivaprasadGBhat/qemu.git - pseries-nvdimm-v3 branch
and can be used with the upstream kernel. ndctl can be used for
configuring the nvdimms inside the guest.
This is how it can be used ..
Ex :
For coldplug, the device to be added in qemu command line as shown below
-object 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
-device 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

For hotplug, the device to be added from monitor as below
object_add 
memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
device_add 
nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

---
v2: https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg02785.html
Changes from v2:
 - Creating the drc indices for the nvdimm devices in advance as
   suggested based on the number of user specified max slots property.
 - Removed the hard dependency on -machine nvdimm=on, enabled by
   default on the current latest pseries machine version.
 - Renamed the functions to spapr_dt_X as suggested.
 - Metadata is byteswapped before read/write to take care of endianness
   semantics during the hcall.
v1 : http://lists.nongnu.org/archive/html/qemu-devel/2019-02/msg01545.html
Changes from v1:
 - Rebased to upstream; this required a dt_populate implementation
   for nvdimm hotplug support
 - Added uuid option to nvdimm device
 - Removed the memory region sizing down code as suggested by Igor,
   now erroring out if NVDIMM size excluding the label area is not
   aligned to 256MB, so patch 2 from previous series no longer needed.
 - Removed un-implemented hcalls
 - Changed the hcalls to different kinds of checks and return
   different values.
 - Addressed comments for v1

Shivaprasad G Bhat (3):
  mem: move nvdimm_device_list to utilities
  spapr: Add NVDIMM device support
  spapr: Add Hcalls to support PAPR NVDIMM device


 default-configs/ppc64-softmmu.mak |1 
 hw/acpi/nvdimm.c  |   28 ---
 hw/mem/Kconfig|2 
 hw/mem/nvdimm.c   |   40 +
 hw/ppc/spapr.c|  218 +--
 hw/ppc/spapr_drc.c|   18 ++
 hw/ppc/spapr_events.c |4 
 hw/ppc/spapr_hcall.c  |  300 +
 include/hw/mem/nvdimm.h   |7 +
 include/hw/ppc/spapr.h|   19 ++
 include/hw/ppc/spapr_drc.h|9 +
 include/qemu/nvdimm-utils.h   |7 +
 util/Makefile.objs

[Qemu-devel] [PATCH] ppc: remove redundant capability check for unset irq

2019-07-25 Thread Shivaprasad G Bhat
KVM_CAP_PPC_UNSET_IRQ has been part of the kernel since v2.6.36.
Kernels older than that are not supported anymore.
So, remove the checks.

Signed-off-by: Shivaprasad G Bhat 
---
 target/ppc/kvm.c |4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 5ab5e6c6a9..94a2ecb84f 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -55,7 +55,6 @@ const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
 KVM_CAP_LAST_INFO
 };
 
-static int cap_interrupt_unset;
 static int cap_segstate;
 static int cap_booke_sregs;
 static int cap_ppc_smt;
@@ -104,7 +103,6 @@ static int kvmppc_get_dec_bits(void);
 
 int kvm_arch_init(MachineState *ms, KVMState *s)
 {
-cap_interrupt_unset = kvm_check_extension(s, KVM_CAP_PPC_UNSET_IRQ);
 cap_segstate = kvm_check_extension(s, KVM_CAP_PPC_SEGSTATE);
 cap_booke_sregs = kvm_check_extension(s, KVM_CAP_PPC_BOOKE_SREGS);
 cap_ppc_smt_possible = kvm_vm_check_extension(s, KVM_CAP_PPC_SMT_POSSIBLE);
@@ -1309,7 +1307,7 @@ int kvmppc_set_interrupt(PowerPCCPU *cpu, int irq, int 
level)
 return 0;
 }
 
-if (!kvm_enabled() || !cap_interrupt_unset) {
+if (!kvm_enabled()) {
 return 0;
 }
 




[Qemu-devel] [PATCH v7] ppc: remove idle_timer logic

2019-07-25 Thread Shivaprasad G Bhat
The logic is broken for multiple-vcpu guests and also causes a memory leak.
The logic is in place to handle KVM not having KVM_CAP_PPC_IRQ_LEVEL,
which has been part of the kernel since 2.6.37. Instead of fixing the
leak, drop the redundant logic, which is not exercised on new kernels
anymore. Exit with an error on older kernels.

Signed-off-by: Shivaprasad G Bhat 
---
v6: https://lists.gnu.org/archive/html/qemu-devel/2019-07/msg05378.html
Changes from v6:
   - switched to error_report instead of fprintf
---
 target/ppc/kvm.c |   75 --
 1 file changed, 5 insertions(+), 70 deletions(-)

diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 8a06d3171e..5ab5e6c6a9 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -56,7 +56,6 @@ const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
 };
 
 static int cap_interrupt_unset;
-static int cap_interrupt_level;
 static int cap_segstate;
 static int cap_booke_sregs;
 static int cap_ppc_smt;
@@ -87,25 +86,6 @@ static int cap_large_decr;
 
 static uint32_t debug_inst_opcode;
 
-/*
- * XXX We have a race condition where we actually have a level triggered
- * interrupt, but the infrastructure can't expose that yet, so the guest
- * takes but ignores it, goes to sleep and never gets notified that there's
- * still an interrupt pending.
- *
- * As a quick workaround, let's just wake up again 20 ms after we injected
- * an interrupt. That way we can assure that we're always reinjecting
- * interrupts in case the guest swallowed them.
- */
-static QEMUTimer *idle_timer;
-
-static void kvm_kick_cpu(void *opaque)
-{
-PowerPCCPU *cpu = opaque;
-
-qemu_cpu_kick(CPU(cpu));
-}
-
 /*
  * Check whether we are running with KVM-PR (instead of KVM-HV).  This
  * should only be used for fallback tests - generally we should use
@@ -125,7 +105,6 @@ static int kvmppc_get_dec_bits(void);
 int kvm_arch_init(MachineState *ms, KVMState *s)
 {
 cap_interrupt_unset = kvm_check_extension(s, KVM_CAP_PPC_UNSET_IRQ);
-cap_interrupt_level = kvm_check_extension(s, KVM_CAP_PPC_IRQ_LEVEL);
 cap_segstate = kvm_check_extension(s, KVM_CAP_PPC_SEGSTATE);
 cap_booke_sregs = kvm_check_extension(s, KVM_CAP_PPC_BOOKE_SREGS);
 cap_ppc_smt_possible = kvm_vm_check_extension(s, KVM_CAP_PPC_SMT_POSSIBLE);
@@ -161,9 +140,9 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
  */
 cap_ppc_pvr_compat = false;
 
-if (!cap_interrupt_level) {
-fprintf(stderr, "KVM: Couldn't find level irq capability. Expect the "
-"VM to stall at times!\n");
+if (!kvm_check_extension(s, KVM_CAP_PPC_IRQ_LEVEL)) {
+error_report("KVM: Host kernel doesn't have level irq capability");
+exit(1);
 }
 
 kvm_ppc_register_host_cpu_type(ms);
@@ -491,8 +470,6 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return ret;
 }
 
-idle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu);
-
 switch (cenv->mmu_model) {
 case POWERPC_MMU_BOOKE206:
 /* This target supports access to KVM's guest TLB */
@@ -1332,7 +1309,7 @@ int kvmppc_set_interrupt(PowerPCCPU *cpu, int irq, int level)
 return 0;
 }
 
-if (!kvm_enabled() || !cap_interrupt_unset || !cap_interrupt_level) {
+if (!kvm_enabled() || !cap_interrupt_unset) {
 return 0;
 }
 
@@ -1349,49 +1326,7 @@ int kvmppc_set_interrupt(PowerPCCPU *cpu, int irq, int level)
 
 void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run)
 {
-PowerPCCPU *cpu = POWERPC_CPU(cs);
-CPUPPCState *env = &cpu->env;
-int r;
-unsigned irq;
-
-qemu_mutex_lock_iothread();
-
-/*
- * PowerPC QEMU tracks the various core input pins (interrupt,
- * critical interrupt, reset, etc) in PPC-specific
- * env->irq_input_state.
- */
-if (!cap_interrupt_level &&
-run->ready_for_interrupt_injection &&
-(cs->interrupt_request & CPU_INTERRUPT_HARD) &&
-(env->irq_input_state & (1 << PPC_INPUT_INT)))
-{
-/*
- * For now KVM disregards the 'irq' argument. However, in the
- * future KVM could cache it in-kernel to avoid a heavyweight
- * exit when reading the UIC.
- */
-irq = KVM_INTERRUPT_SET;
-
-trace_kvm_injected_interrupt(irq);
-r = kvm_vcpu_ioctl(cs, KVM_INTERRUPT, &irq);
-if (r < 0) {
-printf("cpu %d fail inject %x\n", cs->cpu_index, irq);
-}
-
-/* Always wake up soon in case the interrupt was level based */
-timer_mod(idle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
-   (NANOSECONDS_PER_SECOND / 50));
-}
-
-/*
- * We don't know if there are more interrupts pending after
- * this. However, the guest will return to userspace in the course
- * of handling this one anyways, so we will get a chance to
- * deliver the rest.
- */
-
-qemu_mutex_unlock_iothread();
+return;
 }
 
 MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run)

[Qemu-devel] [PATCH v6] ppc: remove idle_timer logic

2019-07-24 Thread Shivaprasad G Bhat
KVM_CAP_PPC_IRQ_LEVEL has been part of the kernel since 2.6.37.
Drop the redundant logic, which is no longer exercised on newer kernels.
Exit with error on older kernels.

Signed-off-by: Shivaprasad G Bhat 
---
 v5: https://lists.gnu.org/archive/html/qemu-devel/2019-07/msg05301.html
 Changes from v5:
   - exit with error if KVM_CAP_PPC_IRQ_LEVEL is not there.

 target/ppc/kvm.c |   75 --
 1 file changed, 5 insertions(+), 70 deletions(-)

diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 8a06d3171e..4a3f36f0d5 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -56,7 +56,6 @@ const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
 };
 
 static int cap_interrupt_unset;
-static int cap_interrupt_level;
 static int cap_segstate;
 static int cap_booke_sregs;
 static int cap_ppc_smt;
@@ -87,25 +86,6 @@ static int cap_large_decr;
 
 static uint32_t debug_inst_opcode;
 
-/*
- * XXX We have a race condition where we actually have a level triggered
- * interrupt, but the infrastructure can't expose that yet, so the guest
- * takes but ignores it, goes to sleep and never gets notified that there's
- * still an interrupt pending.
- *
- * As a quick workaround, let's just wake up again 20 ms after we injected
- * an interrupt. That way we can assure that we're always reinjecting
- * interrupts in case the guest swallowed them.
- */
-static QEMUTimer *idle_timer;
-
-static void kvm_kick_cpu(void *opaque)
-{
-PowerPCCPU *cpu = opaque;
-
-qemu_cpu_kick(CPU(cpu));
-}
-
 /*
  * Check whether we are running with KVM-PR (instead of KVM-HV).  This
  * should only be used for fallback tests - generally we should use
@@ -125,7 +105,6 @@ static int kvmppc_get_dec_bits(void);
 int kvm_arch_init(MachineState *ms, KVMState *s)
 {
 cap_interrupt_unset = kvm_check_extension(s, KVM_CAP_PPC_UNSET_IRQ);
-cap_interrupt_level = kvm_check_extension(s, KVM_CAP_PPC_IRQ_LEVEL);
 cap_segstate = kvm_check_extension(s, KVM_CAP_PPC_SEGSTATE);
 cap_booke_sregs = kvm_check_extension(s, KVM_CAP_PPC_BOOKE_SREGS);
 cap_ppc_smt_possible = kvm_vm_check_extension(s, KVM_CAP_PPC_SMT_POSSIBLE);
@@ -161,9 +140,9 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
  */
 cap_ppc_pvr_compat = false;
 
-if (!cap_interrupt_level) {
-fprintf(stderr, "KVM: Couldn't find level irq capability. Expect the "
-"VM to stall at times!\n");
+if (!kvm_check_extension(s, KVM_CAP_PPC_IRQ_LEVEL)) {
+fprintf(stderr, "KVM: Host kernel doesn't have level irq capability\n");
+exit(1);
 }
 
 kvm_ppc_register_host_cpu_type(ms);
@@ -491,8 +470,6 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return ret;
 }
 
-idle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu);
-
 switch (cenv->mmu_model) {
 case POWERPC_MMU_BOOKE206:
 /* This target supports access to KVM's guest TLB */
@@ -1332,7 +1309,7 @@ int kvmppc_set_interrupt(PowerPCCPU *cpu, int irq, int level)
 return 0;
 }
 
-if (!kvm_enabled() || !cap_interrupt_unset || !cap_interrupt_level) {
+if (!kvm_enabled() || !cap_interrupt_unset) {
 return 0;
 }
 
@@ -1349,49 +1326,7 @@ int kvmppc_set_interrupt(PowerPCCPU *cpu, int irq, int level)
 
 void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run)
 {
-PowerPCCPU *cpu = POWERPC_CPU(cs);
-CPUPPCState *env = &cpu->env;
-int r;
-unsigned irq;
-
-qemu_mutex_lock_iothread();
-
-/*
- * PowerPC QEMU tracks the various core input pins (interrupt,
- * critical interrupt, reset, etc) in PPC-specific
- * env->irq_input_state.
- */
-if (!cap_interrupt_level &&
-run->ready_for_interrupt_injection &&
-(cs->interrupt_request & CPU_INTERRUPT_HARD) &&
-(env->irq_input_state & (1 << PPC_INPUT_INT)))
-{
-/*
- * For now KVM disregards the 'irq' argument. However, in the
- * future KVM could cache it in-kernel to avoid a heavyweight
- * exit when reading the UIC.
- */
-irq = KVM_INTERRUPT_SET;
-
-trace_kvm_injected_interrupt(irq);
-r = kvm_vcpu_ioctl(cs, KVM_INTERRUPT, &irq);
-if (r < 0) {
-printf("cpu %d fail inject %x\n", cs->cpu_index, irq);
-}
-
-/* Always wake up soon in case the interrupt was level based */
-timer_mod(idle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
-   (NANOSECONDS_PER_SECOND / 50));
-}
-
-/*
- * We don't know if there are more interrupts pending after
- * this. However, the guest will return to userspace in the course
- * of handling this one anyways, so we will get a chance to
- * deliver the rest.
- */
-
-qemu_mutex_unlock_iothread();
+return;
 }
 
 MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run)




[Qemu-devel] [PATCH v5] ppc: remove the idle_timer logic

2019-07-24 Thread Shivaprasad G Bhat
KVM_CAP_PPC_IRQ_LEVEL has been part of the kernel since 2.6.37.
Drop the redundant logic, which is no longer exercised on newer kernels.

Signed-off-by: Shivaprasad G Bhat 
---
 v4: https://lists.gnu.org/archive/html/qemu-devel/2019-07/msg04456.html
 Changes from v4:
   - it was discussed to drop the idle_timer logic instead of fixing the
 leak and keeping the redundant logic around. So, the patch does that.

 target/ppc/kvm.c |   74 +-
 1 file changed, 2 insertions(+), 72 deletions(-)

diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 8a06d3171e..1c8f2319a0 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -56,7 +56,6 @@ const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
 };
 
 static int cap_interrupt_unset;
-static int cap_interrupt_level;
 static int cap_segstate;
 static int cap_booke_sregs;
 static int cap_ppc_smt;
@@ -87,25 +86,6 @@ static int cap_large_decr;
 
 static uint32_t debug_inst_opcode;
 
-/*
- * XXX We have a race condition where we actually have a level triggered
- * interrupt, but the infrastructure can't expose that yet, so the guest
- * takes but ignores it, goes to sleep and never gets notified that there's
- * still an interrupt pending.
- *
- * As a quick workaround, let's just wake up again 20 ms after we injected
- * an interrupt. That way we can assure that we're always reinjecting
- * interrupts in case the guest swallowed them.
- */
-static QEMUTimer *idle_timer;
-
-static void kvm_kick_cpu(void *opaque)
-{
-PowerPCCPU *cpu = opaque;
-
-qemu_cpu_kick(CPU(cpu));
-}
-
 /*
  * Check whether we are running with KVM-PR (instead of KVM-HV).  This
  * should only be used for fallback tests - generally we should use
@@ -125,7 +105,6 @@ static int kvmppc_get_dec_bits(void);
 int kvm_arch_init(MachineState *ms, KVMState *s)
 {
 cap_interrupt_unset = kvm_check_extension(s, KVM_CAP_PPC_UNSET_IRQ);
-cap_interrupt_level = kvm_check_extension(s, KVM_CAP_PPC_IRQ_LEVEL);
 cap_segstate = kvm_check_extension(s, KVM_CAP_PPC_SEGSTATE);
 cap_booke_sregs = kvm_check_extension(s, KVM_CAP_PPC_BOOKE_SREGS);
 cap_ppc_smt_possible = kvm_vm_check_extension(s, KVM_CAP_PPC_SMT_POSSIBLE);
@@ -161,11 +140,6 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
  */
 cap_ppc_pvr_compat = false;
 
-if (!cap_interrupt_level) {
-fprintf(stderr, "KVM: Couldn't find level irq capability. Expect the "
-"VM to stall at times!\n");
-}
-
 kvm_ppc_register_host_cpu_type(ms);
 
 return 0;
@@ -491,8 +465,6 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return ret;
 }
 
-idle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu);
-
 switch (cenv->mmu_model) {
 case POWERPC_MMU_BOOKE206:
 /* This target supports access to KVM's guest TLB */
@@ -1332,7 +1304,7 @@ int kvmppc_set_interrupt(PowerPCCPU *cpu, int irq, int level)
 return 0;
 }
 
-if (!kvm_enabled() || !cap_interrupt_unset || !cap_interrupt_level) {
+if (!kvm_enabled() || !cap_interrupt_unset) {
 return 0;
 }
 
@@ -1349,49 +1321,7 @@ int kvmppc_set_interrupt(PowerPCCPU *cpu, int irq, int level)
 
 void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run)
 {
-PowerPCCPU *cpu = POWERPC_CPU(cs);
-CPUPPCState *env = &cpu->env;
-int r;
-unsigned irq;
-
-qemu_mutex_lock_iothread();
-
-/*
- * PowerPC QEMU tracks the various core input pins (interrupt,
- * critical interrupt, reset, etc) in PPC-specific
- * env->irq_input_state.
- */
-if (!cap_interrupt_level &&
-run->ready_for_interrupt_injection &&
-(cs->interrupt_request & CPU_INTERRUPT_HARD) &&
-(env->irq_input_state & (1 << PPC_INPUT_INT)))
-{
-/*
- * For now KVM disregards the 'irq' argument. However, in the
- * future KVM could cache it in-kernel to avoid a heavyweight
- * exit when reading the UIC.
- */
-irq = KVM_INTERRUPT_SET;
-
-trace_kvm_injected_interrupt(irq);
-r = kvm_vcpu_ioctl(cs, KVM_INTERRUPT, &irq);
-if (r < 0) {
-printf("cpu %d fail inject %x\n", cs->cpu_index, irq);
-}
-
-/* Always wake up soon in case the interrupt was level based */
-timer_mod(idle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
-   (NANOSECONDS_PER_SECOND / 50));
-}
-
-/*
- * We don't know if there are more interrupts pending after
- * this. However, the guest will return to userspace in the course
- * of handling this one anyways, so we will get a chance to
- * deliver the rest.
- */
-
-qemu_mutex_unlock_iothread();
+return;
 }
 
 MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run)




Re: [Qemu-devel] [Qemu-ppc] [PATCH v3] ppc: make idle_timer a per-cpu variable

2019-07-19 Thread Shivaprasad G Bhat




On 07/18/2019 09:47 PM, Greg Kurz wrote:

On Thu, 18 Jul 2019 10:21:28 -0500
Shivaprasad G Bhat  wrote:

+PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+timer_deinit(&cpu->idle_timer);
As stated in the timer.h header file, timer_del() should always be called
before timer_deinit().

With that fixed:

Reviewed-by: Greg Kurz 


Thanks. Fixed, and posted the v4.

Regards,
Shivaprasad




[Qemu-devel] [PATCH v4] ppc: make idle_timer a per-cpu variable

2019-07-19 Thread Shivaprasad G Bhat
The current code is broken for more than one vcpu, as
each thread would overwrite idle_timer, and there were
memory leaks.

Make it part of PowerPCCPU so that every thread has a
separate one. Avoid using timer_new_ns(), which is not
the preferred way to create timers.
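
The relevant difference between the two timer APIs, in short (sketch
against qemu/timer.h; this is the pattern the diff below applies):

    /* Old: heap-allocated timer behind a single static pointer; with
     * several vcpus each kvm_arch_init_vcpu() call leaks the previous
     * allocation. */
    QEMUTimer *t = timer_new_ns(QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu);

    /* New: the QEMUTimer is embedded in PowerPCCPU, so its storage lives
     * and dies with the vcpu; teardown only needs timer_del() followed
     * by timer_deinit(). */
    timer_init_ns(&cpu->idle_timer, QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu);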

Signed-off-by: Shivaprasad G Bhat 
Reviewed-by: Greg Kurz 
---
 v3: https://lists.gnu.org/archive/html/qemu-devel/2019-07/msg04375.html
 Changes from v3:
- Calling timer_del() before timer_deinit()

 target/ppc/cpu.h |1 +
 target/ppc/kvm.c |   32 +---
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index c9beba2a5c..521086d91a 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1190,6 +1190,7 @@ struct PowerPCCPU {
 void *machine_data;
 int32_t node_id; /* NUMA node this CPU belongs to */
 PPCHash64Options *hash64_opts;
+QEMUTimer idle_timer;
 
 /* Fields related to migration compatibility hacks */
 bool pre_2_8_migration;
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 8a06d3171e..52d3292f45 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -87,18 +87,6 @@ static int cap_large_decr;
 
 static uint32_t debug_inst_opcode;
 
-/*
- * XXX We have a race condition where we actually have a level triggered
- * interrupt, but the infrastructure can't expose that yet, so the guest
- * takes but ignores it, goes to sleep and never gets notified that there's
- * still an interrupt pending.
- *
- * As a quick workaround, let's just wake up again 20 ms after we injected
- * an interrupt. That way we can assure that we're always reinjecting
- * interrupts in case the guest swallowed them.
- */
-static QEMUTimer *idle_timer;
-
 static void kvm_kick_cpu(void *opaque)
 {
 PowerPCCPU *cpu = opaque;
@@ -491,7 +479,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return ret;
 }
 
-idle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu);
+timer_init_ns(&cpu->idle_timer, QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu);
 
 switch (cenv->mmu_model) {
 case POWERPC_MMU_BOOKE206:
@@ -523,6 +511,11 @@ int kvm_arch_init_vcpu(CPUState *cs)
 
 int kvm_arch_destroy_vcpu(CPUState *cs)
 {
+PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+timer_del(&cpu->idle_timer);
+timer_deinit(&cpu->idle_timer);
+
 return 0;
 }
 
@@ -1379,8 +1372,17 @@ void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run)
 printf("cpu %d fail inject %x\n", cs->cpu_index, irq);
 }
 
-/* Always wake up soon in case the interrupt was level based */
-timer_mod(idle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
+/*
+ * XXX We have a race condition where we actually have a level
+ * triggered interrupt, but the infrastructure can't expose that
+ * yet, so the guest takes but ignores it, goes to sleep and
+ * never gets notified that there's still an interrupt pending.
+ *
+ * As a quick workaround, let's just wake up again 20 ms after
+ * we injected an interrupt. That way we can assure that we're
+ * always reinjecting interrupts in case the guest swallowed them.
+ */
+timer_mod(&cpu->idle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
(NANOSECONDS_PER_SECOND / 50));
 }
 




[Qemu-devel] [PATCH v3] ppc: make idle_timer a per-cpu variable

2019-07-18 Thread Shivaprasad G Bhat
The current code is broken for more than one vcpu, as
each thread would overwrite idle_timer, and there were memory leaks.

Make it part of PowerPCCPU so that every thread has a
separate one. Avoid using timer_new_ns(), which is not
the preferred way to create timers.

Signed-off-by: Shivaprasad G Bhat 
---
 v2: https://lists.gnu.org/archive/html/qemu-devel/2019-07/msg04023.html
 Changes from v2:
   v2 just looked at avoiding the memory leak.
   This patch incorporates all of Greg's suggestions.

 target/ppc/cpu.h |1 +
 target/ppc/kvm.c |   31 ---
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index c9beba2a5c..521086d91a 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1190,6 +1190,7 @@ struct PowerPCCPU {
 void *machine_data;
 int32_t node_id; /* NUMA node this CPU belongs to */
 PPCHash64Options *hash64_opts;
+QEMUTimer idle_timer;
 
 /* Fields related to migration compatibility hacks */
 bool pre_2_8_migration;
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 8a06d3171e..6e1b96bb0a 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -87,18 +87,6 @@ static int cap_large_decr;
 
 static uint32_t debug_inst_opcode;
 
-/*
- * XXX We have a race condition where we actually have a level triggered
- * interrupt, but the infrastructure can't expose that yet, so the guest
- * takes but ignores it, goes to sleep and never gets notified that there's
- * still an interrupt pending.
- *
- * As a quick workaround, let's just wake up again 20 ms after we injected
- * an interrupt. That way we can assure that we're always reinjecting
- * interrupts in case the guest swallowed them.
- */
-static QEMUTimer *idle_timer;
-
 static void kvm_kick_cpu(void *opaque)
 {
 PowerPCCPU *cpu = opaque;
@@ -491,7 +479,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return ret;
 }
 
-idle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu);
+timer_init_ns(&cpu->idle_timer, QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu);
 
 switch (cenv->mmu_model) {
 case POWERPC_MMU_BOOKE206:
@@ -523,6 +511,10 @@ int kvm_arch_init_vcpu(CPUState *cs)
 
 int kvm_arch_destroy_vcpu(CPUState *cs)
 {
+PowerPCCPU *cpu = POWERPC_CPU(cs);
+
+timer_deinit(&cpu->idle_timer);
+
 return 0;
 }
 
@@ -1379,8 +1371,17 @@ void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run)
 printf("cpu %d fail inject %x\n", cs->cpu_index, irq);
 }
 
-/* Always wake up soon in case the interrupt was level based */
-timer_mod(idle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
+/*
+ * XXX We have a race condition where we actually have a level
+ * triggered interrupt, but the infrastructure can't expose that
+ * yet, so the guest takes but ignores it, goes to sleep and
+ * never gets notified that there's still an interrupt pending.
+ *
+ * As a quick workaround, let's just wake up again 20 ms after
+ * we injected an interrupt. That way we can assure that we're
+ * always reinjecting interrupts in case the guest swallowed them.
+ */
+timer_mod(&cpu->idle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
(NANOSECONDS_PER_SECOND / 50));
 }
 




[Qemu-devel] [PATCH v2 4/4] ppc: don't overwrite initialized idle_timer

2019-07-17 Thread Shivaprasad G Bhat
The check to see whether idle_timer is already initialized is
missing. Every vcpu thread calls kvm_arch_init_vcpu() and
overwrites idle_timer, resulting in a memory leak. This patch
fixes that.

Signed-off-by: Shivaprasad G Bhat 
---
 target/ppc/kvm.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 8a06d3171e..498ca6d53b 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -491,7 +491,8 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return ret;
 }
 
-idle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu);
+if (!idle_timer)
+idle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu);
 
 switch (cenv->mmu_model) {
 case POWERPC_MMU_BOOKE206:




[Qemu-devel] [PATCH v2 2/4] ppc: fix memory leak in spapr_dt_drc()

2019-07-17 Thread Shivaprasad G Bhat
The drc_name string is leaked while preparing the DT properties.
Fix that.

Also, remove the const qualifier from spapr_drc_name().
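
The point of the const removal: spapr_drc_name() hands back a freshly
allocated string, so the caller owns it and must g_free() it. A minimal
illustration of the idiom (hypothetical helper names, not the spapr code):

    #include <glib.h>
    #include <stdint.h>

    /* Caller owns the returned string and must g_free() it. */
    static char *make_drc_name(const char *prefix, uint32_t index)
    {
        return g_strdup_printf("%s%u", prefix, index);
    }

    static void append_name(GString *names, const char *prefix, uint32_t index)
    {
        char *name = make_drc_name(prefix, index);
        g_string_append(names, name);
        g_free(name);   /* without this, one string leaks per DRC */
    }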

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr_drc.c |7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
index bacadfcac5..695a0b2285 100644
--- a/hw/ppc/spapr_drc.c
+++ b/hw/ppc/spapr_drc.c
@@ -226,7 +226,7 @@ static uint32_t drc_set_unusable(SpaprDrc *drc)
 return RTAS_OUT_SUCCESS;
 }
 
-static const char *spapr_drc_name(SpaprDrc *drc)
+static char *spapr_drc_name(SpaprDrc *drc)
 {
 SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
 
@@ -827,6 +827,7 @@ int spapr_dt_drc(void *fdt, int offset, Object *owner, uint32_t drc_type_mask)
 Object *obj;
 SpaprDrc *drc;
 SpaprDrcClass *drck;
+char *drc_name = NULL;
 uint32_t drc_index, drc_power_domain;
 
 if (!strstart(prop->type, "link<", NULL)) {
@@ -856,8 +857,10 @@ int spapr_dt_drc(void *fdt, int offset, Object *owner, uint32_t drc_type_mask)
 g_array_append_val(drc_power_domains, drc_power_domain);
 
 /* ibm,drc-names */
-drc_names = g_string_append(drc_names, spapr_drc_name(drc));
+drc_name = spapr_drc_name(drc);
+drc_names = g_string_append(drc_names, drc_name);
 drc_names = g_string_insert_len(drc_names, -1, "\0", 1);
+g_free(drc_name);
 
 /* ibm,drc-types */
 drc_types = g_string_append(drc_types, drck->typename);




[Qemu-devel] [PATCH v2 3/4] ppc: fix leak in h_client_architecture_support

2019-07-17 Thread Shivaprasad G Bhat
Free all SpaprOptionVector local pointers after use.

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr_hcall.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 6808d4cda8..71cfe7c41d 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -1612,6 +1612,7 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu,
 ov5_updates = spapr_ovec_new();
 spapr->cas_reboot = spapr_ovec_diff(ov5_updates,
 ov5_cas_old, spapr->ov5_cas);
+spapr_ovec_cleanup(ov5_cas_old);
 /* Now that processing is finished, set the radix/hash bit for the
  * guest if it requested a valid mode; otherwise terminate the boot. */
 if (guest_radix) {
@@ -1629,6 +1630,7 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu,
 }
 spapr->cas_legacy_guest_workaround = !spapr_ovec_test(ov1_guest,
   OV1_PPC_3_00);
+spapr_ovec_cleanup(ov1_guest);
 if (!spapr->cas_reboot) {
 /* If spapr_machine_reset() did not set up a HPT but one is necessary
  * (because the guest isn't going to use radix) then set it up here. */




[Qemu-devel] [PATCH v2 1/4] ppc: fix memory leak in spapr_caps_add_properties

2019-07-17 Thread Shivaprasad G Bhat
Free the capability name string after setting
the capability.
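
An alternative that avoids an explicit g_free() on every exit path is
glib's g_autofree cleanup attribute (sketch only; it assumes glib >= 2.44,
and whether that is acceptable here is a separate question, hence the
plain g_free() calls in the patch):

    #include <glib.h>

    static void add_cap_property_sketch(const char *cap_name)
    {
        /* Freed automatically when name goes out of scope. */
        g_autofree char *name = g_strdup_printf("cap-%s", cap_name);
        /* ... register the class property using name ... */
    }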

Signed-off-by: Shivaprasad G Bhat 
---
 hw/ppc/spapr_caps.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
index bbb001f84a..0263c78d69 100644
--- a/hw/ppc/spapr_caps.c
+++ b/hw/ppc/spapr_caps.c
@@ -778,7 +778,7 @@ void spapr_caps_add_properties(SpaprMachineClass *smc, Error **errp)
 
 for (i = 0; i < ARRAY_SIZE(capability_table); i++) {
 SpaprCapabilityInfo *cap = &capability_table[i];
-const char *name = g_strdup_printf("cap-%s", cap->name);
+char *name = g_strdup_printf("cap-%s", cap->name);
 char *desc;
 
 object_class_property_add(klass, name, cap->type,
@@ -786,11 +786,13 @@ void spapr_caps_add_properties(SpaprMachineClass *smc, Error **errp)
  NULL, cap, &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
+g_free(name);
 return;
 }
 
 desc = g_strdup_printf("%s", cap->description);
 object_class_property_set_description(klass, name, desc, &local_err);
+g_free(name);
 g_free(desc);
 if (local_err) {
 error_propagate(errp, local_err);




[Qemu-devel] [PATCH v2 0/4] ppc: Fix some memory leaks

2019-07-17 Thread Shivaprasad G Bhat
Valgrind showed some memory leaks while running qemu-system-ppc64.
Fixing them in this series.

---

Shivaprasad G Bhat (4):
  ppc: fix memory leak in spapr_caps_add_properties
  ppc: fix memory leak in spapr_dt_drc()
  ppc: fix leak in h_client_architecture_support
  ppc: don't overwrite initialized idle_timer


 hw/ppc/spapr_caps.c  |4 +++-
 hw/ppc/spapr_drc.c   |7 +--
 hw/ppc/spapr_hcall.c |2 ++
 target/ppc/kvm.c |3 ++-
 4 files changed, 12 insertions(+), 4 deletions(-)

--
Signature



