Re: [PATCH RESEND] hw/pci: add comment to explain checking for available function 0 in pci hotplug

2023-07-20 Thread Ani Sinha



> On 11-Jul-2023, at 12:33 PM, Ani Sinha  wrote:
> 
> This change is cosmetic. A comment is added explaining why we need to check
> for the availability of function 0 when we hotplug a device.
> 
> CC: m...@redhat.com
> CC: m...@tls.msk.ru
> Signed-off-by: Ani Sinha 


Can we merge this while we are still in rc0?

> ---
> hw/pci/pci.c | 11 ---
> 1 file changed, 8 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index e2eb4c3b4a..6db18dfe46 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -1180,9 +1180,14 @@ static PCIDevice *do_pci_register_device(PCIDevice 
> *pci_dev,
>PCI_SLOT(devfn), PCI_FUNC(devfn), name,
>bus->devices[devfn]->name, bus->devices[devfn]->qdev.id);
> return NULL;
> -} else if (dev->hotplugged &&
> -   !pci_is_vf(pci_dev) &&
> -   pci_get_function_0(pci_dev)) {
> +} /*
> +   * Populating function 0 triggers a scan from the guest that
> +   * exposes other non-zero functions. Hence we need to ensure that
> +   * function 0 wasn't added yet.
> +   */
> +else if (dev->hotplugged &&
> + !pci_is_vf(pci_dev) &&
> + pci_get_function_0(pci_dev)) {
> error_setg(errp, "PCI: slot %d function 0 already occupied by %s,"
>" new func %s cannot be exposed to guest.",
>PCI_SLOT(pci_get_function_0(pci_dev)->devfn),
> -- 
> 2.39.1
> 




[PATCH 1/2] target/ppc: Implement ASDR register for ISA v3.0 for HPT

2023-07-20 Thread Nicholas Piggin
The ASDR register was introduced in ISA v3.0. It has not been
implemented for HPT. With HPT, ASDR is the format of the slbmte RS
operand (containing VSID), which matches the ppc_slb_t field.

Signed-off-by: Nicholas Piggin 
---
 target/ppc/mmu-hash64.c | 27 ---
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/target/ppc/mmu-hash64.c b/target/ppc/mmu-hash64.c
index 900f906990..a0c90df3ce 100644
--- a/target/ppc/mmu-hash64.c
+++ b/target/ppc/mmu-hash64.c
@@ -770,7 +770,8 @@ static bool ppc_hash64_use_vrma(CPUPPCState *env)
 }
 }
 
-static void ppc_hash64_set_isi(CPUState *cs, int mmu_idx, uint64_t error_code)
+static void ppc_hash64_set_isi(CPUState *cs, int mmu_idx, uint64_t slb_vsid,
+   uint64_t error_code)
 {
 CPUPPCState *env = &POWERPC_CPU(cs)->env;
 bool vpm;
@@ -782,13 +783,15 @@ static void ppc_hash64_set_isi(CPUState *cs, int mmu_idx, 
uint64_t error_code)
 }
 if (vpm && !mmuidx_hv(mmu_idx)) {
 cs->exception_index = POWERPC_EXCP_HISI;
+env->spr[SPR_ASDR] = slb_vsid;
 } else {
 cs->exception_index = POWERPC_EXCP_ISI;
 }
 env->error_code = error_code;
 }
 
-static void ppc_hash64_set_dsi(CPUState *cs, int mmu_idx, uint64_t dar, 
uint64_t dsisr)
+static void ppc_hash64_set_dsi(CPUState *cs, int mmu_idx, uint64_t slb_vsid,
+   uint64_t dar, uint64_t dsisr)
 {
 CPUPPCState *env = &POWERPC_CPU(cs)->env;
 bool vpm;
@@ -802,6 +805,7 @@ static void ppc_hash64_set_dsi(CPUState *cs, int mmu_idx, 
uint64_t dar, uint64_t
 cs->exception_index = POWERPC_EXCP_HDSI;
 env->spr[SPR_HDAR] = dar;
 env->spr[SPR_HDSISR] = dsisr;
+env->spr[SPR_ASDR] = slb_vsid;
 } else {
 cs->exception_index = POWERPC_EXCP_DSI;
 env->spr[SPR_DAR] = dar;
@@ -963,13 +967,13 @@ bool ppc_hash64_xlate(PowerPCCPU *cpu, vaddr eaddr, 
MMUAccessType access_type,
 }
 switch (access_type) {
 case MMU_INST_FETCH:
-ppc_hash64_set_isi(cs, mmu_idx, SRR1_PROTFAULT);
+ppc_hash64_set_isi(cs, mmu_idx, 0, SRR1_PROTFAULT);
 break;
 case MMU_DATA_LOAD:
-ppc_hash64_set_dsi(cs, mmu_idx, eaddr, DSISR_PROTFAULT);
+ppc_hash64_set_dsi(cs, mmu_idx, 0, eaddr, DSISR_PROTFAULT);
 break;
 case MMU_DATA_STORE:
-ppc_hash64_set_dsi(cs, mmu_idx, eaddr,
+ppc_hash64_set_dsi(cs, mmu_idx, 0, eaddr,
DSISR_PROTFAULT | DSISR_ISSTORE);
 break;
 default:
@@ -1022,7 +1026,7 @@ bool ppc_hash64_xlate(PowerPCCPU *cpu, vaddr eaddr, 
MMUAccessType access_type,
 /* 3. Check for segment level no-execute violation */
 if (access_type == MMU_INST_FETCH && (slb->vsid & SLB_VSID_N)) {
 if (guest_visible) {
-ppc_hash64_set_isi(cs, mmu_idx, SRR1_NOEXEC_GUARD);
+ppc_hash64_set_isi(cs, mmu_idx, slb->vsid, SRR1_NOEXEC_GUARD);
 }
 return false;
 }
@@ -1035,13 +1039,14 @@ bool ppc_hash64_xlate(PowerPCCPU *cpu, vaddr eaddr, 
MMUAccessType access_type,
 }
 switch (access_type) {
 case MMU_INST_FETCH:
-ppc_hash64_set_isi(cs, mmu_idx, SRR1_NOPTE);
+ppc_hash64_set_isi(cs, mmu_idx, slb->vsid, SRR1_NOPTE);
 break;
 case MMU_DATA_LOAD:
-ppc_hash64_set_dsi(cs, mmu_idx, eaddr, DSISR_NOPTE);
+ppc_hash64_set_dsi(cs, mmu_idx, slb->vsid, eaddr, DSISR_NOPTE);
 break;
 case MMU_DATA_STORE:
-ppc_hash64_set_dsi(cs, mmu_idx, eaddr, DSISR_NOPTE | 
DSISR_ISSTORE);
+ppc_hash64_set_dsi(cs, mmu_idx, slb->vsid, eaddr,
+   DSISR_NOPTE | DSISR_ISSTORE);
 break;
 default:
 g_assert_not_reached();
@@ -1075,7 +1080,7 @@ bool ppc_hash64_xlate(PowerPCCPU *cpu, vaddr eaddr, 
MMUAccessType access_type,
 if (PAGE_EXEC & ~amr_prot) {
 srr1 |= SRR1_IAMR; /* Access violates virt pg class key prot */
 }
-ppc_hash64_set_isi(cs, mmu_idx, srr1);
+ppc_hash64_set_isi(cs, mmu_idx, slb->vsid, srr1);
 } else {
 int dsisr = 0;
 if (need_prot & ~pp_prot) {
@@ -1087,7 +1092,7 @@ bool ppc_hash64_xlate(PowerPCCPU *cpu, vaddr eaddr, 
MMUAccessType access_type,
 if (need_prot & ~amr_prot) {
 dsisr |= DSISR_AMR;
 }
-ppc_hash64_set_dsi(cs, mmu_idx, eaddr, dsisr);
+ppc_hash64_set_dsi(cs, mmu_idx, slb->vsid, eaddr, dsisr);
 }
 return false;
 }
-- 
2.40.1




[PATCH 2/2] target/ppc: Fix VRMA page size for ISA v3.0

2023-07-20 Thread Nicholas Piggin
Until v2.07s, the VRMA page size (L||LP) was encoded in LPCR[VRMASD].
In v3.0 that moved to the partition table PS field.

Signed-off-by: Nicholas Piggin 
---
 target/ppc/mmu-hash64.c | 41 +++--
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/target/ppc/mmu-hash64.c b/target/ppc/mmu-hash64.c
index a0c90df3ce..bda0ba90ea 100644
--- a/target/ppc/mmu-hash64.c
+++ b/target/ppc/mmu-hash64.c
@@ -874,12 +874,41 @@ static target_ulong rmls_limit(PowerPCCPU *cpu)
 return rma_sizes[rmls];
 }
 
-static int build_vrma_slbe(PowerPCCPU *cpu, ppc_slb_t *slb)
+/* Return the LLP in SLB_VSID format */
+static uint64_t get_vrma_llp(PowerPCCPU *cpu)
 {
 CPUPPCState *env = &cpu->env;
-target_ulong lpcr = env->spr[SPR_LPCR];
-uint32_t vrmasd = (lpcr & LPCR_VRMASD) >> LPCR_VRMASD_SHIFT;
-target_ulong vsid = SLB_VSID_VRMA | ((vrmasd << 4) & SLB_VSID_LLP_MASK);
+uint64_t llp;
+
+if (env->mmu_model == POWERPC_MMU_3_00) {
+ppc_v3_pate_t pate;
+uint64_t ps;
+
+/*
+ * ISA v3.0 removes the LPCR[VRMASD] field and puts the VRMA base
+ * page size (L||LP equivalent) in the PS field in the HPT partition
+ * table entry.
+ */
+if (!ppc64_v3_get_pate(cpu, cpu->env.spr[SPR_LPIDR], &pate)) {
+error_report("Bad VRMA with no partition table entry\n");
+return 0;
+}
+ps = pate.dw0 >> (63 - 58);
+llp = ((ps & 0x4) << (63 - 55 - 2)) | ((ps & 0x3) << (63 - 59));
+
+} else {
+uint64_t lpcr = env->spr[SPR_LPCR];
+target_ulong vrmasd = (lpcr & LPCR_VRMASD) >> LPCR_VRMASD_SHIFT;
+
+llp = (vrmasd << 4) & SLB_VSID_LLP_MASK;
+}
+
+return llp;
+}
+
+static int build_vrma_slbe(PowerPCCPU *cpu, ppc_slb_t *slb)
+{
+target_ulong vsid = SLB_VSID_VRMA | get_vrma_llp(cpu);
 int i;
 
 for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
@@ -897,8 +926,8 @@ static int build_vrma_slbe(PowerPCCPU *cpu, ppc_slb_t *slb)
 }
 }
 
-error_report("Bad page size encoding in LPCR[VRMASD]; LPCR=0x"
- TARGET_FMT_lx, lpcr);
+error_report("Bad VRMA page size encoding 0x" TARGET_FMT_lx,
+ get_vrma_llp(cpu));
 
 return -1;
 }
-- 
2.40.1




[PATCH 0/2] target/ppc: Fixes for hash MMU for ISA v3.0

2023-07-20 Thread Nicholas Piggin
This fixes a couple of deficiencies in the v3.0 and later (POWER9, 10)
HPT MMU implementation. Without these fixes, KVM is unable to boot hash
guests on powernv9/10 machines. Bare metal hash or pseries machine with
hash works, because VRMA is only used when a real hypervisor is
virtualizing a hash guest's real mode addressing.

Thanks,
Nick

Nicholas Piggin (2):
  target/ppc: Implement ASDR register for ISA v3.0 for HPT
  target/ppc: Fix VRMA page size for ISA v3.0

 target/ppc/mmu-hash64.c | 68 ++---
 1 file changed, 51 insertions(+), 17 deletions(-)

-- 
2.40.1




Re: [PULL 0/2] QAPI patches patches for 2023-07-10

2023-07-20 Thread Markus Armbruster
Did this fall through the cracks?




Re: [PATCH v2] target/ppc: Generate storage interrupts for radix RC changes

2023-07-20 Thread Nicholas Piggin
On Thu Jul 13, 2023 at 3:35 AM AEST, Shawn Anastasio wrote:
> On 7/12/23 11:56 AM, Cédric Le Goater wrote:
> > Hello Shawn,
> > 
> > On 7/12/23 18:13, Shawn Anastasio wrote:
> >> Change radix model to always generate a storage interrupt when the R/C
> >> bits are not set appropriately in a PTE instead of setting the bits
> >> itself.  According to the ISA both behaviors are valid, but in practice
> >> this change more closely matches behavior observed on the POWER9 CPU.
> > 
> > How did you spot this dark corner case in emulation? Do you have
> > MMU unit tests ?
>
> I'm currently porting Xen to Power and have been using QEMU's powernv
> model extensively for early bring up. I noticed the issue when my radix
> implementation worked in QEMU but failed on actual hardware since I
> didn't have a proper storage interrupt handler implemented.

Cool. This was on my todo list because we rely on it for nested HV
KVM too.

I actually didn't know about that odd effLPID=0 exception, but it
looks right. How did you test that, by running with MSR[HV]=0 and
LPIDR=0 ?

For the patch,

Reviewed-by: Nicholas Piggin 

Thanks,
Nick



Re: [PATCH 2/2] target/riscv/cpu.c: add smepmp isa string

2023-07-20 Thread Weiwei Li



On 2023/7/20 21:24, Daniel Henrique Barboza wrote:

The cpu->cfg.epmp extension is still experimental, but it already has a
'smepmp' riscv,isa string. Add it.

Signed-off-by: Daniel Henrique Barboza 
---


Reviewed-by: Weiwei Li 

Weiwei Li


  target/riscv/cpu.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index d64ac07558..8c9acadd3b 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -130,6 +130,7 @@ static const struct isa_ext_data isa_edata_arr[] = {
  ISA_EXT_DATA_ENTRY(zhinx, PRIV_VERSION_1_12_0, ext_zhinx),
  ISA_EXT_DATA_ENTRY(zhinxmin, PRIV_VERSION_1_12_0, ext_zhinxmin),
  ISA_EXT_DATA_ENTRY(smaia, PRIV_VERSION_1_12_0, ext_smaia),
+ISA_EXT_DATA_ENTRY(smepmp, PRIV_VERSION_1_12_0, epmp),
  ISA_EXT_DATA_ENTRY(smstateen, PRIV_VERSION_1_12_0, ext_smstateen),
  ISA_EXT_DATA_ENTRY(ssaia, PRIV_VERSION_1_12_0, ext_ssaia),
  ISA_EXT_DATA_ENTRY(sscofpmf, PRIV_VERSION_1_12_0, ext_sscofpmf),





Re: [PATCH 1/2] target/riscv/cpu.c: add zmmul isa string

2023-07-20 Thread Weiwei Li



On 2023/7/20 21:24, Daniel Henrique Barboza wrote:

zmmul was promoted from experimental to ratified in commit 6d00ffad4e95.
Add a riscv,isa string for it.

Fixes: 6d00ffad4e95 ("target/riscv: move zmmul out of the experimental 
properties")
Signed-off-by: Daniel Henrique Barboza 
---


Reviewed-by: Weiwei Li 

Weiwei Li


  target/riscv/cpu.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 9339c0241d..d64ac07558 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -88,6 +88,7 @@ static const struct isa_ext_data isa_edata_arr[] = {
  ISA_EXT_DATA_ENTRY(zicsr, PRIV_VERSION_1_10_0, ext_icsr),
  ISA_EXT_DATA_ENTRY(zifencei, PRIV_VERSION_1_10_0, ext_ifencei),
  ISA_EXT_DATA_ENTRY(zihintpause, PRIV_VERSION_1_10_0, ext_zihintpause),
+ISA_EXT_DATA_ENTRY(zmmul, PRIV_VERSION_1_12_0, ext_zmmul),
  ISA_EXT_DATA_ENTRY(zawrs, PRIV_VERSION_1_12_0, ext_zawrs),
  ISA_EXT_DATA_ENTRY(zfa, PRIV_VERSION_1_12_0, ext_zfa),
  ISA_EXT_DATA_ENTRY(zfbfmin, PRIV_VERSION_1_12_0, ext_zfbfmin),





Re: [PATCH for-8.2 v5 08/11] target/riscv/cpu.c: add ADD_UNAVAIL_KVM_PROP_ARRAY() macro

2023-07-20 Thread Weiwei Li



On 2023/7/21 01:19, Daniel Henrique Barboza wrote:

Use a macro in riscv_cpu_add_kvm_properties() to eliminate some of its
code repetition, similar to what we're already doing with
ADD_CPU_QDEV_PROPERTIES_ARRAY().

Signed-off-by: Daniel Henrique Barboza 
---


Reviewed-by: Weiwei Li 

Weiwei Li


  target/riscv/cpu.c | 22 ++
  1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 23169a606f..8675839cb4 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1900,6 +1900,13 @@ static void riscv_cpu_add_kvm_unavail_prop(Object *obj, 
const char *prop_name)
  NULL, (void *)prop_name);
  }
  
+#define ADD_UNAVAIL_KVM_PROP_ARRAY(_obj, _array) \

+do { \
+for (int i = 0; i < ARRAY_SIZE(_array); i++) { \
+riscv_cpu_add_kvm_unavail_prop(_obj, _array[i].name); \
+} \
+} while (0)
+
  static void riscv_cpu_add_kvm_properties(Object *obj)
  {
  DeviceState *dev = DEVICE(obj);
@@ -1907,18 +1914,9 @@ static void riscv_cpu_add_kvm_properties(Object *obj)
  kvm_riscv_init_user_properties(obj);
  riscv_cpu_add_misa_properties(obj);
  
-for (int i = 0; i < ARRAY_SIZE(riscv_cpu_extensions); i++) {

-riscv_cpu_add_kvm_unavail_prop(obj, riscv_cpu_extensions[i].name);
-}
-
-for (int i = 0; i < ARRAY_SIZE(riscv_cpu_vendor_exts); i++) {
-riscv_cpu_add_kvm_unavail_prop(obj, riscv_cpu_vendor_exts[i].name);
-}
-
-for (int i = 0; i < ARRAY_SIZE(riscv_cpu_experimental_exts); i++) {
-riscv_cpu_add_kvm_unavail_prop(obj,
-   riscv_cpu_experimental_exts[i].name);
-}
+ADD_UNAVAIL_KVM_PROP_ARRAY(obj, riscv_cpu_extensions);
+ADD_UNAVAIL_KVM_PROP_ARRAY(obj, riscv_cpu_vendor_exts);
+ADD_UNAVAIL_KVM_PROP_ARRAY(obj, riscv_cpu_experimental_exts);
  
  for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {

  /* Check if KVM created the property already */





Re: [PATCH for-8.2 v5 04/11] target/riscv/cpu.c: del DEFINE_PROP_END_OF_LIST() from riscv_cpu_extensions

2023-07-20 Thread Weiwei Li



On 2023/7/21 01:19, Daniel Henrique Barboza wrote:

This last blank element is used by the 'for' loop to check if a property
has a valid name.

Remove it and use ARRAY_SIZE() instead like riscv_cpu_options is already
using. All future arrays will also do the same and we'll be able to
encapsulate more repetitions in macros later on.

Signed-off-by: Daniel Henrique Barboza 
---


Reviewed-by: Weiwei Li 

Weiwei Li


  target/riscv/cpu.c | 12 
  1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 7f0852a14e..4dadb7f0a0 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1835,8 +1835,6 @@ static Property riscv_cpu_extensions[] = {
  DEFINE_PROP_BOOL("x-zfbfmin", RISCVCPU, cfg.ext_zfbfmin, false),
  DEFINE_PROP_BOOL("x-zvfbfmin", RISCVCPU, cfg.ext_zvfbfmin, false),
  DEFINE_PROP_BOOL("x-zvfbfwma", RISCVCPU, cfg.ext_zvfbfwma, false),
-
-DEFINE_PROP_END_OF_LIST(),
  };
  
  static Property riscv_cpu_options[] = {

@@ -1894,14 +1892,13 @@ static void riscv_cpu_add_kvm_unavail_prop(Object *obj, 
const char *prop_name)
  
  static void riscv_cpu_add_kvm_properties(Object *obj)

  {
-Property *prop;
  DeviceState *dev = DEVICE(obj);
  
  kvm_riscv_init_user_properties(obj);

  riscv_cpu_add_misa_properties(obj);
  
-for (prop = riscv_cpu_extensions; prop && prop->name; prop++) {

-riscv_cpu_add_kvm_unavail_prop(obj, prop->name);
+for (int i = 0; i < ARRAY_SIZE(riscv_cpu_extensions); i++) {
+riscv_cpu_add_kvm_unavail_prop(obj, riscv_cpu_extensions[i].name);
  }
  
  for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {

@@ -1922,7 +1919,6 @@ static void riscv_cpu_add_kvm_properties(Object *obj)
   */
  static void riscv_cpu_add_user_properties(Object *obj)
  {
-Property *prop;
  DeviceState *dev = DEVICE(obj);
  
  #ifndef CONFIG_USER_ONLY

@@ -1936,8 +1932,8 @@ static void riscv_cpu_add_user_properties(Object *obj)
  
  riscv_cpu_add_misa_properties(obj);
  
-for (prop = riscv_cpu_extensions; prop && prop->name; prop++) {

-qdev_property_add_static(dev, prop);
+for (int i = 0; i < ARRAY_SIZE(riscv_cpu_extensions); i++) {
+qdev_property_add_static(dev, &riscv_cpu_extensions[i]);
  }
  
  for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {





Re: [PATCH for-8.2 v5 03/11] target/riscv/cpu.c: split kvm prop handling to its own helper

2023-07-20 Thread Weiwei Li



On 2023/7/21 01:19, Daniel Henrique Barboza wrote:

Future patches will split the existing Property arrays even further, and
the existing code in riscv_cpu_add_user_properties() will start to scale
badly with it because it's dealing with KVM constraints mixed in with TCG
constraints. We're going to pay a high price to share a couple of common
lines of code between the two.

Create a new riscv_cpu_add_kvm_properties() that will be forked from
riscv_cpu_add_user_properties() if we're running KVM. The helper
includes all properties that a KVM CPU will add. The rest of
riscv_cpu_add_user_properties() body will then be relieved from having
to deal with KVM constraints.

Signed-off-by: Daniel Henrique Barboza 
---


Reviewed-by: Weiwei Li 

Weiwei Li


  target/riscv/cpu.c | 65 ++
  1 file changed, 42 insertions(+), 23 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index f10d40733a..7f0852a14e 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1874,6 +1874,46 @@ static void cpu_set_cfg_unavailable(Object *obj, Visitor 
*v,
  }
  #endif
  
+#ifndef CONFIG_USER_ONLY

+static void riscv_cpu_add_kvm_unavail_prop(Object *obj, const char *prop_name)
+{
+/* Check if KVM created the property already */
+if (object_property_find(obj, prop_name)) {
+return;
+}
+
+/*
+ * Set the default to disabled for every extension
+ * unknown to KVM and error out if the user attempts
+ * to enable any of them.
+ */
+object_property_add(obj, prop_name, "bool",
+NULL, cpu_set_cfg_unavailable,
+NULL, (void *)prop_name);
+}
+
+static void riscv_cpu_add_kvm_properties(Object *obj)
+{
+Property *prop;
+DeviceState *dev = DEVICE(obj);
+
+kvm_riscv_init_user_properties(obj);
+riscv_cpu_add_misa_properties(obj);
+
+for (prop = riscv_cpu_extensions; prop && prop->name; prop++) {
+riscv_cpu_add_kvm_unavail_prop(obj, prop->name);
+}
+
+for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {
+/* Check if KVM created the property already */
+if (object_property_find(obj, riscv_cpu_options[i].name)) {
+continue;
+}
+qdev_property_add_static(dev, &riscv_cpu_options[i]);
+}
+}
+#endif
+
  /*
   * Add CPU properties with user-facing flags.
   *
@@ -1889,39 +1929,18 @@ static void riscv_cpu_add_user_properties(Object *obj)
  riscv_add_satp_mode_properties(obj);
  
  if (kvm_enabled()) {

-kvm_riscv_init_user_properties(obj);
+riscv_cpu_add_kvm_properties(obj);
+return;
  }
  #endif
  
  riscv_cpu_add_misa_properties(obj);
  
  for (prop = riscv_cpu_extensions; prop && prop->name; prop++) {

-#ifndef CONFIG_USER_ONLY
-if (kvm_enabled()) {
-/* Check if KVM created the property already */
-if (object_property_find(obj, prop->name)) {
-continue;
-}
-
-/*
- * Set the default to disabled for every extension
- * unknown to KVM and error out if the user attempts
- * to enable any of them.
- */
-object_property_add(obj, prop->name, "bool",
-NULL, cpu_set_cfg_unavailable,
-NULL, (void *)prop->name);
-continue;
-}
-#endif
  qdev_property_add_static(dev, prop);
  }
  
  for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {

-/* Check if KVM created the property already */
-if (object_property_find(obj, riscv_cpu_options[i].name)) {
-continue;
-}
 qdev_property_add_static(dev, &riscv_cpu_options[i]);
  }
  }





PING: [PATCH v3 0/6] Misc fixes for throttle

2023-07-20 Thread zhenwei pi

Hi Kevin, Hanna,

Patches 1 -> 5 of this series are already reviewed by Alberto (these
affect the throttle framework only). Patch 6 affects the qemu block layer;
would you please review it (and, as a further step, merge this series if
it is acceptable)?


On 7/13/23 14:41, zhenwei pi wrote:

v2 -> v3:
- patch 1 -> patch 5 are already reviewed by Alberto
- append patch 6: throttle: use enum ThrottleType instead of bool is_write

v1 -> v2:
- rename 'ThrottleTimerType' to 'ThrottleType'
- add assertion to throttle_schedule_timer

v1:
- introduce enum ThrottleTimerType instead of timers[0], timer[1]...
- support read-only and write-only for throttle
- adapt related test codes
- cryptodev uses a write-only throttle timer

Zhenwei Pi (6):
   throttle: introduce enum ThrottleType
   test-throttle: use enum ThrottleType
   throttle: support read-only and write-only
   test-throttle: test read only and write only
   cryptodev: use NULL throttle timer cb for read direction
   throttle: use enum ThrottleType instead of bool is_write

  backends/cryptodev.c| 12 +++---
  block/throttle-groups.c |  6 ++-
  fsdev/qemu-fsdev-throttle.c |  8 ++--
  include/qemu/throttle.h | 15 +---
  tests/unit/test-throttle.c  | 76 ++---
  util/throttle.c | 64 +++
  6 files changed, 136 insertions(+), 45 deletions(-)
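
As a rough illustration of what replacing "bool is_write" with a direction
enum looks like, here is a minimal sketch (the enumerator names below are
assumptions for illustration only; the real definitions are in the
include/qemu/throttle.h changes of this series):

/* Illustrative sketch, not the actual patch contents. */
typedef enum {
    THROTTLE_READ = 0,   /* previously is_write == false          */
    THROTTLE_WRITE,      /* previously is_write == true           */
    THROTTLE_MAX,        /* handy for sizing per-direction timers */
} ThrottleType;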



--
zhenwei pi



Re: [PATCH for-8.2 v5 01/11] target/riscv/cpu.c: split CPU options from riscv_cpu_extensions[]

2023-07-20 Thread Weiwei Li



On 2023/7/21 01:19, Daniel Henrique Barboza wrote:

We'll add a new CPU type that will enable a considerable amount of
extensions. To make it easier for us we'll do a few cleanups in our
existing riscv_cpu_extensions[] array.

Start by splitting all CPU non-boolean options from it. Create a new
riscv_cpu_options[] array for them. Add all these properties in
riscv_cpu_add_user_properties() as it is already being done today.

'mmu' and 'pmp' aren't really extensions in the usual way we think about
RISC-V extensions. These are closer to CPU features/options, so move
both to riscv_cpu_options[] too. In the near future we'll need to match
all extensions with all entries in isa_edata_arr[], and so it happens
that both 'mmu' and 'pmp' do not have a riscv,isa string (thus, no priv
spec version restriction). This further emphasizes the point that these
are more a CPU option than an extension.

No functional changes made.

Signed-off-by: Daniel Henrique Barboza 
---


Reviewed-by: Weiwei Li 

Weiwei Li


  target/riscv/cpu.c | 33 +++--
  1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 6b93b04453..9a3afc0482 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1752,7 +1752,6 @@ static void riscv_cpu_add_misa_properties(Object *cpu_obj)
  
  static Property riscv_cpu_extensions[] = {

  /* Defaults for standard extensions */
-DEFINE_PROP_UINT8("pmu-num", RISCVCPU, cfg.pmu_num, 16),
  DEFINE_PROP_BOOL("sscofpmf", RISCVCPU, cfg.ext_sscofpmf, false),
  DEFINE_PROP_BOOL("Zifencei", RISCVCPU, cfg.ext_ifencei, true),
  DEFINE_PROP_BOOL("Zicsr", RISCVCPU, cfg.ext_icsr, true),
@@ -1764,15 +1763,8 @@ static Property riscv_cpu_extensions[] = {
  DEFINE_PROP_BOOL("Zve32f", RISCVCPU, cfg.ext_zve32f, false),
  DEFINE_PROP_BOOL("Zve64f", RISCVCPU, cfg.ext_zve64f, false),
  DEFINE_PROP_BOOL("Zve64d", RISCVCPU, cfg.ext_zve64d, false),
-DEFINE_PROP_BOOL("mmu", RISCVCPU, cfg.mmu, true),
-DEFINE_PROP_BOOL("pmp", RISCVCPU, cfg.pmp, true),
  DEFINE_PROP_BOOL("sstc", RISCVCPU, cfg.ext_sstc, true),
  
-DEFINE_PROP_STRING("priv_spec", RISCVCPU, cfg.priv_spec),

-DEFINE_PROP_STRING("vext_spec", RISCVCPU, cfg.vext_spec),
-DEFINE_PROP_UINT16("vlen", RISCVCPU, cfg.vlen, 128),
-DEFINE_PROP_UINT16("elen", RISCVCPU, cfg.elen, 64),
-
  DEFINE_PROP_BOOL("smstateen", RISCVCPU, cfg.ext_smstateen, false),
  DEFINE_PROP_BOOL("svadu", RISCVCPU, cfg.ext_svadu, true),
  DEFINE_PROP_BOOL("svinval", RISCVCPU, cfg.ext_svinval, false),
@@ -1803,9 +1795,7 @@ static Property riscv_cpu_extensions[] = {
  DEFINE_PROP_BOOL("zhinxmin", RISCVCPU, cfg.ext_zhinxmin, false),
  
  DEFINE_PROP_BOOL("zicbom", RISCVCPU, cfg.ext_icbom, true),

-DEFINE_PROP_UINT16("cbom_blocksize", RISCVCPU, cfg.cbom_blocksize, 64),
  DEFINE_PROP_BOOL("zicboz", RISCVCPU, cfg.ext_icboz, true),
-DEFINE_PROP_UINT16("cboz_blocksize", RISCVCPU, cfg.cboz_blocksize, 64),
  
  DEFINE_PROP_BOOL("zmmul", RISCVCPU, cfg.ext_zmmul, false),
  
@@ -1849,6 +1839,21 @@ static Property riscv_cpu_extensions[] = {

  DEFINE_PROP_END_OF_LIST(),
  };
  
+static Property riscv_cpu_options[] = {

+DEFINE_PROP_UINT8("pmu-num", RISCVCPU, cfg.pmu_num, 16),
+
+DEFINE_PROP_BOOL("mmu", RISCVCPU, cfg.mmu, true),
+DEFINE_PROP_BOOL("pmp", RISCVCPU, cfg.pmp, true),
+
+DEFINE_PROP_STRING("priv_spec", RISCVCPU, cfg.priv_spec),
+DEFINE_PROP_STRING("vext_spec", RISCVCPU, cfg.vext_spec),
+
+DEFINE_PROP_UINT16("vlen", RISCVCPU, cfg.vlen, 128),
+DEFINE_PROP_UINT16("elen", RISCVCPU, cfg.elen, 64),
+
+DEFINE_PROP_UINT16("cbom_blocksize", RISCVCPU, cfg.cbom_blocksize, 64),
+DEFINE_PROP_UINT16("cboz_blocksize", RISCVCPU, cfg.cboz_blocksize, 64),
+};
  
  #ifndef CONFIG_USER_ONLY

  static void cpu_set_cfg_unavailable(Object *obj, Visitor *v,
@@ -1917,6 +1922,14 @@ static void riscv_cpu_add_user_properties(Object *obj)
  #endif
  qdev_property_add_static(dev, prop);
  }
+
+for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {
+/* Check if KVM created the property already */
+if (object_property_find(obj, riscv_cpu_options[i].name)) {
+continue;
+}
+qdev_property_add_static(dev, &riscv_cpu_options[i]);
+}
  }
  
  static Property riscv_cpu_properties[] = {





[PATCH v6 3/3] hw/ufs: Support for UFS logical unit

2023-07-20 Thread Jeuk Kim
This commit adds support for the UFS logical unit.
The LU handles processing of SCSI commands and
unit descriptor query requests.

This commit enables the UFS device to process
IO requests.

Signed-off-by: Jeuk Kim 
---
 hw/ufs/lu.c  | 1439 ++
 hw/ufs/meson.build   |2 +-
 hw/ufs/trace-events  |   25 +
 hw/ufs/ufs.c |  252 ++-
 hw/ufs/ufs.h |   43 ++
 include/scsi/constants.h |1 +
 6 files changed, 1755 insertions(+), 7 deletions(-)
 create mode 100644 hw/ufs/lu.c

diff --git a/hw/ufs/lu.c b/hw/ufs/lu.c
new file mode 100644
index 00..6f6d301bc7
--- /dev/null
+++ b/hw/ufs/lu.c
@@ -0,0 +1,1439 @@
+/*
+ * QEMU UFS Logical Unit
+ *
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All rights reserved.
+ *
+ * Written by Jeuk Kim 
+ *
+ * This code is licensed under the GNU GPL v2 or later.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/units.h"
+#include "qapi/error.h"
+#include "qemu/memalign.h"
+#include "hw/scsi/scsi.h"
+#include "scsi/constants.h"
+#include "sysemu/block-backend.h"
+#include "qemu/cutils.h"
+#include "trace.h"
+#include "ufs.h"
+
+/*
+ * The code below handling SCSI commands is copied from hw/scsi/scsi-disk.c,
+ * with minor adjustments to make it work for UFS.
+ */
+
+#define SCSI_DMA_BUF_SIZE (128 * KiB)
+#define SCSI_MAX_INQUIRY_LEN 256
+#define SCSI_INQUIRY_DATA_SIZE 36
+#define SCSI_MAX_MODE_LEN 256
+
+typedef struct UfsSCSIReq {
+SCSIRequest req;
+/* Both sector and sector_count are in terms of BDRV_SECTOR_SIZE bytes.  */
+uint64_t sector;
+uint32_t sector_count;
+uint32_t buflen;
+bool started;
+bool need_fua_emulation;
+struct iovec iov;
+QEMUIOVector qiov;
+BlockAcctCookie acct;
+} UfsSCSIReq;
+
+static void ufs_scsi_free_request(SCSIRequest *req)
+{
+UfsSCSIReq *r = DO_UPCAST(UfsSCSIReq, req, req);
+
+qemu_vfree(r->iov.iov_base);
+}
+
+static void scsi_check_condition(UfsSCSIReq *r, SCSISense sense)
+{
+trace_ufs_scsi_check_condition(r->req.tag, sense.key, sense.asc,
+   sense.ascq);
+scsi_req_build_sense(&r->req, sense);
+scsi_req_complete(&r->req, CHECK_CONDITION);
+}
+
+static int ufs_scsi_emulate_vpd_page(SCSIRequest *req, uint8_t *outbuf,
+ uint32_t outbuf_len)
+{
+UfsHc *u = UFS(req->bus->qbus.parent);
+UfsLu *lu = DO_UPCAST(UfsLu, qdev, req->dev);
+uint8_t page_code = req->cmd.buf[2];
+int start, buflen = 0;
+
+if (outbuf_len < SCSI_INQUIRY_DATA_SIZE) {
+return -1;
+}
+
+outbuf[buflen++] = lu->qdev.type & 0x1f;
+outbuf[buflen++] = page_code;
+outbuf[buflen++] = 0x00;
+outbuf[buflen++] = 0x00;
+start = buflen;
+
+switch (page_code) {
+case 0x00: /* Supported page codes, mandatory */
+{
+trace_ufs_scsi_emulate_vpd_page_00(req->cmd.xfer);
+outbuf[buflen++] = 0x00; /* list of supported pages (this page) */
+if (u->params.serial) {
+outbuf[buflen++] = 0x80; /* unit serial number */
+}
+outbuf[buflen++] = 0x87; /* mode page policy */
+break;
+}
+case 0x80: /* Device serial number, optional */
+{
+int l;
+
+if (!u->params.serial) {
+trace_ufs_scsi_emulate_vpd_page_80_not_supported();
+return -1;
+}
+
+l = strlen(u->params.serial);
+if (l > SCSI_INQUIRY_DATA_SIZE) {
+l = SCSI_INQUIRY_DATA_SIZE;
+}
+
+trace_ufs_scsi_emulate_vpd_page_80(req->cmd.xfer);
+memcpy(outbuf + buflen, u->params.serial, l);
+buflen += l;
+break;
+}
+case 0x87: /* Mode Page Policy, mandatory */
+{
+trace_ufs_scsi_emulate_vpd_page_87(req->cmd.xfer);
+outbuf[buflen++] = 0x3f; /* apply to all mode pages and subpages */
+outbuf[buflen++] = 0xff;
+outbuf[buflen++] = 0; /* shared */
+outbuf[buflen++] = 0;
+break;
+}
+default:
+return -1;
+}
+/* done with EVPD */
+assert(buflen - start <= 255);
+outbuf[start - 1] = buflen - start;
+return buflen;
+}
+
+static int ufs_scsi_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf,
+uint32_t outbuf_len)
+{
+int buflen = 0;
+
+if (outbuf_len < SCSI_INQUIRY_DATA_SIZE) {
+return -1;
+}
+
+if (req->cmd.buf[1] & 0x1) {
+/* Vital product data */
+return ufs_scsi_emulate_vpd_page(req, outbuf, outbuf_len);
+}
+
+/* Standard INQUIRY data */
+if (req->cmd.buf[2] != 0) {
+return -1;
+}
+
+/* PAGE CODE == 0 */
+buflen = req->cmd.xfer;
+if (buflen > SCSI_MAX_INQUIRY_LEN) {
+buflen = SCSI_MAX_INQUIRY_LEN;
+}
+
+if (is_wlun(req->lun)) {
+outbuf[0] = TYPE_WLUN;
+} else {
+outbuf[0] = 0;
+}
+outbuf[1] = 0;
+
+strpadcpy((char *)&outbuf[16], 16, "QEMU UFS", ' ');
+strpadcpy((char 

[PATCH v6 1/3] hw/ufs: Initial commit for emulated Universal-Flash-Storage

2023-07-20 Thread Jeuk Kim
Universal Flash Storage (UFS) is a high-performance mass storage device
with a serial interface. It is primarily used as a high-performance
data storage device for embedded applications.

This commit contains the code for the UFS device to be recognized
as a UFS PCI device.
Patches to handle UFS logical unit and Transfer Request will follow.

Signed-off-by: Jeuk Kim 
---
 MAINTAINERS  |6 +
 docs/specs/pci-ids.rst   |2 +
 hw/Kconfig   |1 +
 hw/meson.build   |1 +
 hw/ufs/Kconfig   |4 +
 hw/ufs/meson.build   |1 +
 hw/ufs/trace-events  |   32 ++
 hw/ufs/trace.h   |1 +
 hw/ufs/ufs.c |  278 ++
 hw/ufs/ufs.h |   42 ++
 include/block/ufs.h  | 1048 ++
 include/hw/pci/pci.h |1 +
 include/hw/pci/pci_ids.h |1 +
 meson.build  |1 +
 14 files changed, 1419 insertions(+)
 create mode 100644 hw/ufs/Kconfig
 create mode 100644 hw/ufs/meson.build
 create mode 100644 hw/ufs/trace-events
 create mode 100644 hw/ufs/trace.h
 create mode 100644 hw/ufs/ufs.c
 create mode 100644 hw/ufs/ufs.h
 create mode 100644 include/block/ufs.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 12e59b6b27..0c8a955b42 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2256,6 +2256,12 @@ F: tests/qtest/nvme-test.c
 F: docs/system/devices/nvme.rst
 T: git git://git.infradead.org/qemu-nvme.git nvme-next
 
+ufs
+M: Jeuk Kim 
+S: Supported
+F: hw/ufs/*
+F: include/block/ufs.h
+
 megasas
 M: Hannes Reinecke 
 L: qemu-bl...@nongnu.org
diff --git a/docs/specs/pci-ids.rst b/docs/specs/pci-ids.rst
index e302bea484..d6707fa069 100644
--- a/docs/specs/pci-ids.rst
+++ b/docs/specs/pci-ids.rst
@@ -92,6 +92,8 @@ PCI devices (other than virtio):
   PCI PVPanic device (``-device pvpanic-pci``)
 1b36:0012
   PCI ACPI ERST device (``-device acpi-erst``)
+1b36:0013
+  PCI UFS device (``-device ufs``)
 
 All these devices are documented in :doc:`index`.
 
diff --git a/hw/Kconfig b/hw/Kconfig
index ba62ff6417..9ca7b38c31 100644
--- a/hw/Kconfig
+++ b/hw/Kconfig
@@ -38,6 +38,7 @@ source smbios/Kconfig
 source ssi/Kconfig
 source timer/Kconfig
 source tpm/Kconfig
+source ufs/Kconfig
 source usb/Kconfig
 source virtio/Kconfig
 source vfio/Kconfig
diff --git a/hw/meson.build b/hw/meson.build
index c7ac7d3d75..f01fac4617 100644
--- a/hw/meson.build
+++ b/hw/meson.build
@@ -37,6 +37,7 @@ subdir('smbios')
 subdir('ssi')
 subdir('timer')
 subdir('tpm')
+subdir('ufs')
 subdir('usb')
 subdir('vfio')
 subdir('virtio')
diff --git a/hw/ufs/Kconfig b/hw/ufs/Kconfig
new file mode 100644
index 00..b7b3392e85
--- /dev/null
+++ b/hw/ufs/Kconfig
@@ -0,0 +1,4 @@
+config UFS_PCI
+bool
+default y if PCI_DEVICES
+depends on PCI
diff --git a/hw/ufs/meson.build b/hw/ufs/meson.build
new file mode 100644
index 00..eb5164bde9
--- /dev/null
+++ b/hw/ufs/meson.build
@@ -0,0 +1 @@
+system_ss.add(when: 'CONFIG_UFS_PCI', if_true: files('ufs.c'))
diff --git a/hw/ufs/trace-events b/hw/ufs/trace-events
new file mode 100644
index 00..d1badcad10
--- /dev/null
+++ b/hw/ufs/trace-events
@@ -0,0 +1,32 @@
+# ufs.c
+ufs_irq_raise(void) "INTx"
+ufs_irq_lower(void) "INTx"
+ufs_mmio_read(uint64_t addr, uint64_t data, unsigned size) "addr 0x%"PRIx64" 
data 0x%"PRIx64" size %d"
+ufs_mmio_write(uint64_t addr, uint64_t data, unsigned size) "addr 0x%"PRIx64" 
data 0x%"PRIx64" size %d"
+ufs_process_db(uint32_t slot) "UTRLDBR slot %"PRIu32""
+ufs_process_req(uint32_t slot) "UTRLDBR slot %"PRIu32""
+ufs_complete_req(uint32_t slot) "UTRLDBR slot %"PRIu32""
+ufs_sendback_req(uint32_t slot) "UTRLDBR slot %"PRIu32""
+ufs_exec_nop_cmd(uint32_t slot) "UTRLDBR slot %"PRIu32""
+ufs_exec_scsi_cmd(uint32_t slot, uint8_t lun, uint8_t opcode) "slot %"PRIu32", 
lun 0x%"PRIx8", opcode 0x%"PRIx8""
+ufs_exec_query_cmd(uint32_t slot, uint8_t opcode) "slot %"PRIu32", opcode 
0x%"PRIx8""
+ufs_process_uiccmd(uint32_t uiccmd, uint32_t ucmdarg1, uint32_t ucmdarg2, 
uint32_t ucmdarg3) "uiccmd 0x%"PRIx32", ucmdarg1 0x%"PRIx32", ucmdarg2 
0x%"PRIx32", ucmdarg3 0x%"PRIx32""
+
+# error condition
+ufs_err_dma_read_utrd(uint32_t slot, uint64_t addr) "failed to read utrd. 
UTRLDBR slot %"PRIu32", UTRD dma addr %"PRIu64""
+ufs_err_dma_read_req_upiu(uint32_t slot, uint64_t addr) "failed to read req 
upiu. UTRLDBR slot %"PRIu32", request upiu addr %"PRIu64""
+ufs_err_dma_read_prdt(uint32_t slot, uint64_t addr) "failed to read prdt. 
UTRLDBR slot %"PRIu32", prdt addr %"PRIu64""
+ufs_err_dma_write_utrd(uint32_t slot, uint64_t addr) "failed to write utrd. 
UTRLDBR slot %"PRIu32", UTRD dma addr %"PRIu64""
+ufs_err_dma_write_rsp_upiu(uint32_t slot, uint64_t addr) "failed to write rsp 
upiu. UTRLDBR slot %"PRIu32", response upiu addr %"PRIu64""
+ufs_err_utrl_slot_busy(uint32_t slot) "UTRLDBR slot %"PRIu32" is busy"
+ufs_err_unsupport_register_offset(uint32_t offset) "Register offset 
0x%"PRIx32" is not yet supported"
+ufs_err_invalid_register_offset(uint32_t 

[PATCH v6 0/3] hw/ufs: Add Universal Flash Storage (UFS) support

2023-07-20 Thread Jeuk Kim
Since v5:
- Fix to print an error message instead of a segmentation fault
  when no drive property is specified for a ufs-lu device

Since v4:
Addressed review comment from Stefan Hajnoczi. The fixes are as
follows.
- Keep u->reg fields in host endian (Removed little-endian helper 
  functions from MemoryRegionOps)
- Remove unnecessary NULL checks for g_new and g_malloc0
- Replace DEFINE_PROP_DRIVE_IOTHREAD -> DEFINE_PROP_DRIVE

In case you were wondering, ufs and ufs-lu have been tested for the
following behaviours.
1. Checked ufs device recognition in Windows10 environment
2. Verified ufs device recognition in Ubuntu 22.04 environment
3. Verified io behaviour via fio in Ubuntu 22.04 environment
4. Verified query request via ufs-tools in Ubuntu 22.04 environment

Since v3:
- Replace softmmu_ss -> system_ss in meson

Since v2:
Addressed review comment from Stefan Hajnoczi. The main fixes are as
follows.
- Use of SPDX licence identifiers
- fixed endianness error
- removed memory leak
- fixed DMA error handling logic

Since v1:
- use macros of "hw/registerfields.h" (Addressed Philippe's review
  comments)

This patch series adds support for a new PCI-based UFS device.

The UFS pci device id (PCI_DEVICE_ID_REDHAT_UFS) is not registered
in the Linux kernel yet, so it does not work right away, but I confirmed
that it works with Linux when the UFS pci device id is registered.

I have also verified that it works with Windows 10.


Jeuk Kim (3):
  hw/ufs: Initial commit for emulated Universal-Flash-Storage
  hw/ufs: Support for Query Transfer Requests
  hw/ufs: Support for UFS logical unit

 MAINTAINERS  |6 +
 docs/specs/pci-ids.rst   |2 +
 hw/Kconfig   |1 +
 hw/meson.build   |1 +
 hw/ufs/Kconfig   |4 +
 hw/ufs/lu.c  | 1439 
 hw/ufs/meson.build   |1 +
 hw/ufs/trace-events  |   58 ++
 hw/ufs/trace.h   |1 +
 hw/ufs/ufs.c | 1494 ++
 hw/ufs/ufs.h |  131 
 include/block/ufs.h  | 1048 ++
 include/hw/pci/pci.h |1 +
 include/hw/pci/pci_ids.h |1 +
 include/scsi/constants.h |1 +
 meson.build  |1 +
 16 files changed, 4190 insertions(+)
 create mode 100644 hw/ufs/Kconfig
 create mode 100644 hw/ufs/lu.c
 create mode 100644 hw/ufs/meson.build
 create mode 100644 hw/ufs/trace-events
 create mode 100644 hw/ufs/trace.h
 create mode 100644 hw/ufs/ufs.c
 create mode 100644 hw/ufs/ufs.h
 create mode 100644 include/block/ufs.h

-- 
2.34.1




[PATCH v6 2/3] hw/ufs: Support for Query Transfer Requests

2023-07-20 Thread Jeuk Kim
This commit makes the UFS device support query
and nop out transfer requests.

The next patch will add support for the UFS logical
unit and SCSI command transfer requests.

Signed-off-by: Jeuk Kim 
---
 hw/ufs/trace-events |   1 +
 hw/ufs/ufs.c| 980 +++-
 hw/ufs/ufs.h|  46 +++
 3 files changed, 1025 insertions(+), 2 deletions(-)

diff --git a/hw/ufs/trace-events b/hw/ufs/trace-events
index d1badcad10..665e1a942b 100644
--- a/hw/ufs/trace-events
+++ b/hw/ufs/trace-events
@@ -18,6 +18,7 @@ ufs_err_dma_read_req_upiu(uint32_t slot, uint64_t addr) 
"failed to read req upiu
 ufs_err_dma_read_prdt(uint32_t slot, uint64_t addr) "failed to read prdt. 
UTRLDBR slot %"PRIu32", prdt addr %"PRIu64""
 ufs_err_dma_write_utrd(uint32_t slot, uint64_t addr) "failed to write utrd. 
UTRLDBR slot %"PRIu32", UTRD dma addr %"PRIu64""
 ufs_err_dma_write_rsp_upiu(uint32_t slot, uint64_t addr) "failed to write rsp 
upiu. UTRLDBR slot %"PRIu32", response upiu addr %"PRIu64""
+ufs_err_utrl_slot_error(uint32_t slot) "UTRLDBR slot %"PRIu32" is in error"
 ufs_err_utrl_slot_busy(uint32_t slot) "UTRLDBR slot %"PRIu32" is busy"
 ufs_err_unsupport_register_offset(uint32_t offset) "Register offset 
0x%"PRIx32" is not yet supported"
 ufs_err_invalid_register_offset(uint32_t offset) "Register offset 0x%"PRIx32" 
is invalid"
diff --git a/hw/ufs/ufs.c b/hw/ufs/ufs.c
index 101082a8a3..cd33af2cde 100644
--- a/hw/ufs/ufs.c
+++ b/hw/ufs/ufs.c
@@ -15,10 +15,221 @@
 #include "ufs.h"
 
 /* The QEMU-UFS device follows spec version 3.1 */
-#define UFS_SPEC_VER 0x0310
+#define UFS_SPEC_VER 0x0310
 #define UFS_MAX_NUTRS 32
 #define UFS_MAX_NUTMRS 8
 
+static MemTxResult ufs_addr_read(UfsHc *u, hwaddr addr, void *buf, int size)
+{
+hwaddr hi = addr + size - 1;
+
+if (hi < addr) {
+return MEMTX_DECODE_ERROR;
+}
+
+if (!FIELD_EX32(u->reg.cap, CAP, 64AS) && (hi >> 32)) {
+return MEMTX_DECODE_ERROR;
+}
+
+return pci_dma_read(PCI_DEVICE(u), addr, buf, size);
+}
+
+static MemTxResult ufs_addr_write(UfsHc *u, hwaddr addr, const void *buf,
+  int size)
+{
+hwaddr hi = addr + size - 1;
+if (hi < addr) {
+return MEMTX_DECODE_ERROR;
+}
+
+if (!FIELD_EX32(u->reg.cap, CAP, 64AS) && (hi >> 32)) {
+return MEMTX_DECODE_ERROR;
+}
+
+return pci_dma_write(PCI_DEVICE(u), addr, buf, size);
+}
+
+static void ufs_complete_req(UfsRequest *req, UfsReqResult req_result);
+
+static inline hwaddr ufs_get_utrd_addr(UfsHc *u, uint32_t slot)
+{
+hwaddr utrl_base_addr = (((hwaddr)u->reg.utrlbau) << 32) + u->reg.utrlba;
+hwaddr utrd_addr = utrl_base_addr + slot * sizeof(UtpTransferReqDesc);
+
+return utrd_addr;
+}
+
+static inline hwaddr ufs_get_req_upiu_base_addr(const UtpTransferReqDesc *utrd)
+{
+uint32_t cmd_desc_base_addr_lo =
+le32_to_cpu(utrd->command_desc_base_addr_lo);
+uint32_t cmd_desc_base_addr_hi =
+le32_to_cpu(utrd->command_desc_base_addr_hi);
+
+return (((hwaddr)cmd_desc_base_addr_hi) << 32) + cmd_desc_base_addr_lo;
+}
+
+static inline hwaddr ufs_get_rsp_upiu_base_addr(const UtpTransferReqDesc *utrd)
+{
+hwaddr req_upiu_base_addr = ufs_get_req_upiu_base_addr(utrd);
+uint32_t rsp_upiu_byte_off =
+le16_to_cpu(utrd->response_upiu_offset) * sizeof(uint32_t);
+return req_upiu_base_addr + rsp_upiu_byte_off;
+}
+
+static MemTxResult ufs_dma_read_utrd(UfsRequest *req)
+{
+UfsHc *u = req->hc;
+hwaddr utrd_addr = ufs_get_utrd_addr(u, req->slot);
+MemTxResult ret;
+
+ret = ufs_addr_read(u, utrd_addr, &req->utrd, sizeof(req->utrd));
+if (ret) {
+trace_ufs_err_dma_read_utrd(req->slot, utrd_addr);
+}
+return ret;
+}
+
+static MemTxResult ufs_dma_read_req_upiu(UfsRequest *req)
+{
+UfsHc *u = req->hc;
+hwaddr req_upiu_base_addr = ufs_get_req_upiu_base_addr(&req->utrd);
+UtpUpiuReq *req_upiu = >req_upiu;
+uint32_t copy_size;
+uint16_t data_segment_length;
+MemTxResult ret;
+
+/*
+ * To know the size of the req_upiu, we need to read the
+ * data_segment_length in the header first.
+ */
+ret = ufs_addr_read(u, req_upiu_base_addr, &req_upiu->header,
+sizeof(UtpUpiuHeader));
+if (ret) {
+trace_ufs_err_dma_read_req_upiu(req->slot, req_upiu_base_addr);
+return ret;
+}
+data_segment_length = be16_to_cpu(req_upiu->header.data_segment_length);
+
+copy_size = sizeof(UtpUpiuHeader) + UFS_TRANSACTION_SPECIFIC_FIELD_SIZE +
+data_segment_length;
+
+ret = ufs_addr_read(u, req_upiu_base_addr, &req->req_upiu, copy_size);
+if (ret) {
+trace_ufs_err_dma_read_req_upiu(req->slot, req_upiu_base_addr);
+}
+return ret;
+}
+
+static MemTxResult ufs_dma_read_prdt(UfsRequest *req)
+{
+UfsHc *u = req->hc;
+uint16_t prdt_len = le16_to_cpu(req->utrd.prd_table_length);
+uint16_t prdt_byte_off =
+

Re: [PATCH v5 0/3] hw/ufs: Add Universal Flash Storage (UFS) support

2023-07-20 Thread Jeuk Kim

On 7/21/2023 3:49 AM, Stefan Hajnoczi wrote:

Hi,
I'm ready to merge this but encountered a bug when testing:

   $ qemu-system-x86_64 --device ufs --device ufs-lu
   Segmentation fault (core dumped)

Please ensure there is an error message like with SCSI disks:

   $ qemu-system-x86_64 --device virtio-scsi-pci --device scsi-hd
   qemu-system-x86_64: --device scsi-hd: drive property not set

Thanks,
Stefan


Thanks for letting me know.
I'll fix it right away and send the patch again.

Thanks,
Jeuk
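
For reference, the usual QEMU shape of such a fix is a realize-time check
along these lines. This is a minimal sketch only; the "UfsLu", "UFS_LU" and
"conf.blk" names are assumptions for illustration, not taken from the actual
ufs-lu patch:

/* Sketch: reject a missing drive at realize time instead of crashing
 * later. Type, macro and field names here are assumed, not real code. */
static void ufs_lu_realize(DeviceState *dev, Error **errp)
{
    UfsLu *lu = UFS_LU(dev);

    if (!lu->conf.blk) {
        error_setg(errp, "drive property not set");
        return;
    }

    /* ... normal initialization continues here ... */
}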



Re: [PATCH] roms/opensbi: Upgrade from v1.3 to v1.3.1

2023-07-20 Thread Alistair Francis
On Thu, Jul 20, 2023 at 3:00 AM Bin Meng  wrote:
>
> Upgrade OpenSBI from v1.3 to v1.3.1 and the pre-built bios images
> which fixes the boot failure seen when using QEMU to do a direct
> kernel boot with Microchip Icicle Kit board machine.
>
> The v1.3.1 release includes the following commits:
>
> 0907de3 lib: sbi: fix comment indent
> eb736a5 lib: sbi_pmu: Avoid out of bounds access
> 7828eeb gpio/desginware: add Synopsys DesignWare APB GPIO support
> c6a3573 lib: utils: Fix sbi_hartid_to_scratch() usage in ACLINT drivers
> 057eb10 lib: utils/gpio: Fix RV32 compile error for designware GPIO driver
>
> Signed-off-by: Bin Meng 

Reviewed-by: Alistair Francis 

@Conor Dooley @Conor Dooley Any chance you can test this?

Alistair

>
> ---
> Please pull the complete patch from https://github.com/lbmeng/qemu
> opensbi branch.
>
>  .../opensbi-riscv32-generic-fw_dynamic.bin| Bin 135344 -> 135376 bytes
>  .../opensbi-riscv64-generic-fw_dynamic.bin| Bin 138304 -> 138368 bytes
>  roms/opensbi  |   2 +-
>  3 files changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/pc-bios/opensbi-riscv32-generic-fw_dynamic.bin 
> b/pc-bios/opensbi-riscv32-generic-fw_dynamic.bin
> index 7b6c67e0ae..9a2ba3f2a4 100644
> Binary files a/pc-bios/opensbi-riscv32-generic-fw_dynamic.bin and 
> b/pc-bios/opensbi-riscv32-generic-fw_dynamic.bin differ
> diff --git a/pc-bios/opensbi-riscv64-generic-fw_dynamic.bin 
> b/pc-bios/opensbi-riscv64-generic-fw_dynamic.bin
> index 1b831b412c..5d4e812819 100644
> Binary files a/pc-bios/opensbi-riscv64-generic-fw_dynamic.bin and 
> b/pc-bios/opensbi-riscv64-generic-fw_dynamic.bin differ
> diff --git a/roms/opensbi b/roms/opensbi
> index 2552799a1d..057eb10b6d 160000
> --- a/roms/opensbi
> +++ b/roms/opensbi
> @@ -1 +1 @@
> -Subproject commit 2552799a1df30a3dcd2321a8b75d61d06f5fb9fc
> +Subproject commit 057eb10b6d523540012e6947d5c9f63e95244e94
> --
> 2.34.1
>
>



Re: [RFC PATCH] docs/interop: define STANDALONE protocol feature for vhost-user

2023-07-20 Thread Michael S. Tsirkin
On Thu, Jul 20, 2023 at 05:31:03PM -0400, Stefan Hajnoczi wrote:
> On Thu, 20 Jul 2023 at 17:15, Michael S. Tsirkin  wrote:
> >
> > On Thu, Jul 20, 2023 at 03:58:37PM -0400, Stefan Hajnoczi wrote:
> > > On Thu, Jul 06, 2023 at 12:48:20PM -0400, Michael S. Tsirkin wrote:
> > > > On Tue, Jul 04, 2023 at 01:36:00PM +0100, Alex Bennée wrote:
> > > > > Currently QEMU has to know some details about the back-end to be able
> > > > > to setup the guest. While various parts of the setup can be delegated
> > > > > to the backend (for example config handling) this is a very piecemeal
> > > > > approach.
> > > >
> > > > > This patch suggests a new feature flag 
> > > > > (VHOST_USER_PROTOCOL_F_STANDALONE)
> > > > > which the back-end can advertise which allows a probe message to be
> > > > > sent to get all the details QEMU needs to know in one message.
> > > >
> > > > The reason we do piecemeal is that these existing pieces can be reused
> > > > as others evolve or fall by wayside.
> > > >
> > > > For example, I can think of instances where you want to connect
> > > > specifically to e.g. networking backend, and specify it
> > > > on command line. Reasons could be many, e.g. for debugging,
> > > > or to prevent connecting to wrong device on wrong channel
> > > > (kind of like type safety).
> > > >
> > > > What is the reason to have 1 message? startup latency?
> > > > How about we allow pipelining several messages then?
> > > > Will be easier.
> > >
> > > This flag effectively says that the back-end is a full VIRTIO device
> > > with a Device Status Register, Configuration Space, Virtqueues, the
> > > device type, etc. This is different from previous vhost-user devices
> > > which sometimes just offloaded certain virtqueues without providing the
> > > full VIRTIO device (parts were emulated in the VMM).
> > >
> > > So for example, a vhost-user-net device does not support the controlq.
> > > Alex's "standalone" device is a mode where the vhost-user protocol is
> > > used but the back-end must implement a full virtio-net device.
> > > Standalone devices are like vDPA device in this respect.
> > >
> > > I think it is important to have a protocol feature bit that advertises
> > > that this is a standalone device, since the semantics are different for
> > > traditional vhost-user-net devices.
> >
> > Not sure what that would gain as compared to a feature bit per
> > message as we did previously.
> 
> Having a single feature bit makes it easier to distinguish between a
> traditional vhost-user device and a standalone device.
> 
> For example, the presence of VHOST_USER_F_GET_DEVICE_ID doesn't tell
> you whether this device is a standalone device that is appropriate for
> a new generic QEMU --device vhost-user-device feature that Alex is
> working on. It could be a traditional vhost-user device that is not
> standalone but implements the VHOST_USER_GET_DEVICE_ID message.
> 
> How will we detect standalone devices? It will be messy if there is no
> single feature bit that advertises that this back-end is a standalone
> device.
> 
> Stefan

Looks like standalone implies some 5-6 messages to be supported.
So just test the 6 bits are all ones.
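
A back-of-the-envelope version of that check, in C, could look like the
sketch below. The feature-bit names are placeholders I made up for
illustration; the real set of required messages/bits would come from the
spec text under discussion, not from this sketch:

#include <stdbool.h>
#include <stdint.h>

/* Placeholder bit numbers -- illustrative only. */
enum {
    FEAT_STATUS        = 0,
    FEAT_GET_CONFIG    = 1,
    FEAT_SET_CONFIG    = 2,
    FEAT_GET_DEVICE_ID = 3,
    FEAT_GET_QUEUE_NUM = 4,
    FEAT_RESET_DEVICE  = 5,
};

#define REQUIRED_STANDALONE_FEATURES                              \
    ((1ULL << FEAT_STATUS) | (1ULL << FEAT_GET_CONFIG) |          \
     (1ULL << FEAT_SET_CONFIG) | (1ULL << FEAT_GET_DEVICE_ID) |   \
     (1ULL << FEAT_GET_QUEUE_NUM) | (1ULL << FEAT_RESET_DEVICE))

/* "All the required bits are ones" check. */
static bool backend_is_standalone(uint64_t protocol_features)
{
    return (protocol_features & REQUIRED_STANDALONE_FEATURES) ==
           REQUIRED_STANDALONE_FEATURES;
}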

-- 
MST




[ANNOUNCE] QEMU 8.1.0-rc0 is now available

2023-07-20 Thread Michael Roth
Hello,

On behalf of the QEMU Team, I'd like to announce the availability of the
first release candidate for the QEMU 8.1 release. This release is meant
for testing purposes and should not be used in a production environment.

  http://download.qemu.org/qemu-8.1.0-rc0.tar.xz
  http://download.qemu.org/qemu-8.1.0-rc0.tar.xz.sig

You can help improve the quality of the QEMU 8.1 release by testing this
release and reporting bugs using our GitLab issue tracker:

  https://gitlab.com/qemu-project/qemu/-/milestones/8#tab-issues

The release plan, as well a documented known issues for release
candidates, are available at:

  http://wiki.qemu.org/Planning/8.1

Please add entries to the ChangeLog for the 8.1 release below:

  http://wiki.qemu.org/ChangeLog/8.1

Thank you to everyone involved!



Re: [PATCH 0/3] Support message-based DMA in vfio-user server

2023-07-20 Thread Mattias Nissler
Stefan,

I hope you had a great vacation!

Thanks for updating the mirror and your review. Your comments all make
sense, and I will address your input when I find time - just a quick
ack now since I'm travelling next week and will be on vacation the
first half of August, so it might be a while.

Thanks,
Mattias

On Thu, Jul 20, 2023 at 8:41 PM Stefan Hajnoczi  wrote:
>
> On Tue, Jul 04, 2023 at 01:06:24AM -0700, Mattias Nissler wrote:
> > This series adds basic support for message-based DMA in qemu's vfio-user
> > server. This is useful for cases where the client does not provide file
> > descriptors for accessing system memory via memory mappings. My motivating 
> > use
> > case is to hook up device models as PCIe endpoints to a hardware design. 
> > This
> > works by bridging the PCIe transaction layer to vfio-user, and the endpoint
> > does not access memory directly, but sends memory requests TLPs to the 
> > hardware
> > design in order to perform DMA.
> >
> > Note that in addition to the 3 commits included, we also need a
> > subprojects/libvfio-user roll to bring in this bugfix:
> > https://github.com/nutanix/libvfio-user/commit/bb308a2e8ee9486a4c8b53d8d773f7c8faaeba08
> > Stefan, can I ask you to kindly update the
> > https://gitlab.com/qemu-project/libvfio-user mirror? I'll be happy to 
> > include
> > an update to subprojects/libvfio-user.wrap in this series.
>
> Done:
> https://gitlab.com/qemu-project/libvfio-user/-/commits/master
>
> Repository mirroring is automated now, so new upstream commits will
> appear in the QEMU mirror repository from now on.
>
> >
> > Finally, there is some more work required on top of this series to get
> > message-based DMA to really work well:
> >
> > * libvfio-user has a long-standing issue where socket communication gets 
> > messed
> >   up when messages are sent from both ends at the same time. See
> >   https://github.com/nutanix/libvfio-user/issues/279 for more details. I've
> >   been engaging there and plan to contribute a fix.
> >
> > * qemu currently breaks down DMA accesses into chunks of size 8 bytes at
> >   maximum, each of which will be handled in a separate vfio-user DMA request
> >   message. This is quite terrible for large DMA acceses, such as when nvme
> >   reads and writes page-sized blocks for example. Thus, I would like to 
> > improve
> >   qemu to be able to perform larger accesses, at least for indirect memory
> >   regions. I have something working locally, but since this will likely 
> > result
> >   in more involved surgery and discussion, I am leaving this to be 
> > addressed in
> >   a separate patch.
> >
> > Mattias Nissler (3):
> >   softmmu: Support concurrent bounce buffers
> >   softmmu: Remove DMA unmap notification callback
> >   vfio-user: Message-based DMA support
> >
> >  hw/remote/vfio-user-obj.c |  62 --
> >  softmmu/dma-helpers.c |  28 
> >  softmmu/physmem.c | 131 --
> >  3 files changed, 83 insertions(+), 138 deletions(-)
>
> Sorry for the late review. I was on vacation and am catching up on
> emails.
>
> Paolo worked on the QEMU memory API and can give input on how to make
> this efficient for large DMA accesses. There is a chance that memory
> dispatch with larger sizes will be needed for ENQCMD CPU instruction
> emulation too.
>
> Stefan



Re: [RFC PATCH] docs/interop: define STANDALONE protocol feature for vhost-user

2023-07-20 Thread Stefan Hajnoczi
On Thu, 20 Jul 2023 at 17:15, Michael S. Tsirkin  wrote:
>
> On Thu, Jul 20, 2023 at 03:58:37PM -0400, Stefan Hajnoczi wrote:
> > On Thu, Jul 06, 2023 at 12:48:20PM -0400, Michael S. Tsirkin wrote:
> > > On Tue, Jul 04, 2023 at 01:36:00PM +0100, Alex Bennée wrote:
> > > > Currently QEMU has to know some details about the back-end to be able
> > > > to setup the guest. While various parts of the setup can be delegated
> > > > to the backend (for example config handling) this is a very piecemeal
> > > > approach.
> > >
> > > > This patch suggests a new feature flag 
> > > > (VHOST_USER_PROTOCOL_F_STANDALONE)
> > > > which the back-end can advertise which allows a probe message to be
> > > > sent to get all the details QEMU needs to know in one message.
> > >
> > > The reason we do piecemeal is that these existing pieces can be reused
> > > as others evolve or fall by wayside.
> > >
> > > For example, I can think of instances where you want to connect
> > > specifically to e.g. networking backend, and specify it
> > > on command line. Reasons could be many, e.g. for debugging,
> > > or to prevent connecting to wrong device on wrong channel
> > > (kind of like type safety).
> > >
> > > What is the reason to have 1 message? startup latency?
> > > How about we allow pipelining several messages then?
> > > Will be easier.
> >
> > This flag effectively says that the back-end is a full VIRTIO device
> > with a Device Status Register, Configuration Space, Virtqueues, the
> > device type, etc. This is different from previous vhost-user devices
> > which sometimes just offloaded certain virtqueues without providing the
> > full VIRTIO device (parts were emulated in the VMM).
> >
> > So for example, a vhost-user-net device does not support the controlq.
> > Alex's "standalone" device is a mode where the vhost-user protocol is
> > used but the back-end must implement a full virtio-net device.
> > Standalone devices are like vDPA device in this respect.
> >
> > I think it is important to have a protocol feature bit that advertises
> > that this is a standalone device, since the semantics are different for
> > traditional vhost-user-net devices.
>
> Not sure what that would gain as compared to a feature bit per
> message as we did previously.

Having a single feature bit makes it easier to distinguish between a
traditional vhost-user device and a standalone device.

For example, the presence of VHOST_USER_F_GET_DEVICE_ID doesn't tell
you whether this device is a standalone device that is appropriate for
a new generic QEMU --device vhost-user-device feature that Alex is
working on. It could be a traditional vhost-user device that is not
standalone but implements the VHOST_USER_GET_DEVICE_ID message.

How will we detect standalone devices? It will be messy if there is no
single feature bit that advertises that this back-end is a standalone
device.

Stefan



Re: [RFC PATCH] docs/interop: define STANDALONE protocol feature for vhost-user

2023-07-20 Thread Michael S. Tsirkin
On Thu, Jul 20, 2023 at 03:58:37PM -0400, Stefan Hajnoczi wrote:
> On Thu, Jul 06, 2023 at 12:48:20PM -0400, Michael S. Tsirkin wrote:
> > On Tue, Jul 04, 2023 at 01:36:00PM +0100, Alex Bennée wrote:
> > > Currently QEMU has to know some details about the back-end to be able
> > > to setup the guest. While various parts of the setup can be delegated
> > > to the backend (for example config handling) this is a very piecemeal
> > > approach.
> > 
> > > This patch suggests a new feature flag (VHOST_USER_PROTOCOL_F_STANDALONE)
> > > which the back-end can advertise which allows a probe message to be
> > > sent to get all the details QEMU needs to know in one message.
> > 
> > The reason we do piecemeal is that these existing pieces can be reused
> > as others evolve or fall by wayside.
> > 
> > For example, I can think of instances where you want to connect
> > specifically to e.g. networking backend, and specify it
> > on command line. Reasons could be many, e.g. for debugging,
> > or to prevent connecting to wrong device on wrong channel
> > (kind of like type safety).
> > 
> > What is the reason to have 1 message? startup latency?
> > How about we allow pipelining several messages then?
> > Will be easier.
> 
> This flag effectively says that the back-end is a full VIRTIO device
> with a Device Status Register, Configuration Space, Virtqueues, the
> device type, etc. This is different from previous vhost-user devices
> which sometimes just offloaded certain virtqueues without providing the
> full VIRTIO device (parts were emulated in the VMM).
> 
> So for example, a vhost-user-net device does not support the controlq.
> Alex's "standalone" device is a mode where the vhost-user protocol is
> used but the back-end must implement a full virtio-net device.
> Standalone devices are like vDPA device in this respect.
> 
> I think it is important to have a protocol feature bit that advertises
> that this is a standalone device, since the semantics are different for
> traditional vhost-user-net devices.

Not sure what that would gain as compared to a feature bit per
message as we did previously.

> However, I think having a single message is inflexible and duplicates
> existing vhost-user protocol messages like VHOST_USER_GET_QUEUE_NUM. I
> would prefer VHOST_USER_GET_DEVICE_ID and other messages.
> 
> Stefan

Exactly.

-- 
MST




8.1-rc0 testfloat fails to compile

2023-07-20 Thread Olaf Hering
This has been going on for a few weeks now. I guess there is no CI check to verify
that qemu.git#master compiles on Tumbleweed.

Since the switch to meson submodules, berkeley-testfloat-3 became mandatory. I 
think in the past I was able to ignore this submodule and not export it, so the 
following error did not show up:

[  141s] ../subprojects/berkeley-testfloat-3/source/genCases_f64.c: In function 
'f64Random':
[  141s] ../subprojects/berkeley-testfloat-3/source/genCases_f64.c:559:1: 
error: control reaches end of non-void function [-Werror=return-type]
[  141s]   559 | }
[  141s]   | ^
[  141s] cc1: some warnings being treated as errors

Apparently this is a known issue; commit 3ac1f81329f attempted to ignore such errors.
Do I need to tweak the global, system-provided CFLAGS myself, or can the source 
be fixed to address this? Disabling this error globally will hide errors 
elsewhere.
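
For reference, a minimal reproduction of the warning and the kind of
source-level fix that would avoid it (not the actual testfloat code, just the
same shape):

#include <stdlib.h>

static int pick_sign(unsigned r)
{
    switch (r % 2) {
    case 0:
        return +1;
    case 1:
        return -1;
    }
    /*
     * Without a terminator here, -Werror=return-type rejects the function
     * because the compiler cannot prove the switch is exhaustive. An
     * explicit abort() on the unreachable path fixes the build without
     * relaxing the warning for everything else.
     */
    abort();
}

int main(void)
{
    return pick_sign(2u) > 0 ? 0 : 1;
}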

Maybe there is a way to append something to 
tests/fp/meson.build:libtestfloat.c_args? Right now it is apparently set to 
tfcflags+fpcflags+CFLAGS.


Olaf




[Qemu PATCH 0/9] Enabling DCD emulation support in Qemu

2023-07-20 Thread nifan
From: Fan Ni 

The patch series provides dynamic capacity device (DCD) emulation in Qemu.
More specifically, it provides the following functionalities:
1. Extended type3 memory device to support DC regions and extents.
2. Implemented DCD related mailbox command support in CXL r3.0: 8.2.9.8.9.
3. Add QMP interfaces for adding and releasing DC extents to simulate FM
functions for DCD described in cxl r3.0: 7.6.7.6.5 and 7.6.7.6.6.
4. Add new ct3d properties for DCD devices (host backend, number of dc
regions, etc.)
5. Add read/write support from/to DC regions of the device.
6. Add mechanism to validate accesses to DC region address space.

A more detailed description can be found from the previously posted RFC[1].

Compared to the previously posted RFC[1], the following changes have been made:
1. Rebased the code on top of Jonathan's branch
https://gitlab.com/jic23/qemu/-/tree/cxl-2023-05-25.
2. Extracted the rename of mem_size to a separated patch.(Jonathan)
3. Reordered the patch series to improve its readability.(Jonathan)
4. Split the validation of accesses to DC region address space into a
separate patch.
5. Redesigned the QMP interfaces for adding and releasing DC extents to make
them easier to understand and act like existing QMP interfaces (like the
interface for cxl-inject-uncorrectable-errors). (Jonathan)
6. Updated dvsec range register setting to support DCD devices without static
capacity.
7. Fixed other issues mentioned in the comments (Jonathan Fontenot).
8. Fixed the format issues and checked with checkpatch.pl under qemu code dir.


The code is tested with the DCD patch series at the kernel side[2]. The tests
are similar to those mentioned in the cover letter of [1].


[1]: https://lore.kernel.org/all/20230511175609.2091136-1-fan...@samsung.com/
[2]: 
https://lore.kernel.org/linux-cxl/649da378c28a3_968bb29420@iweiny-mobl.notmuch/T/#t

Fan Ni (9):
  hw/cxl/cxl-mailbox-utils: Add dc_event_log_size field to output
payload of identify memory device command
  hw/cxl/cxl-mailbox-utils: Add dynamic capacity region representative
and mailbox command support
  include/hw/cxl/cxl_device: Rename mem_size as static_mem_size for
type3 memory devices
  hw/mem/cxl_type3: Add support to create DC regions to type3 memory
devices
  hw/mem/cxl_type3: Add host backend and address space handling for DC
regions
  hw/mem/cxl_type3: Add DC extent list representative and get DC extent
list mailbox support
  hw/cxl/cxl-mailbox-utils: Add mailbox commands to support add/release
dynamic capacity response
  hw/cxl/events: Add qmp interfaces to add/release dynamic capacity
extents
  hw/mem/cxl_type3: Add dpa range validation for accesses to dc regions

 hw/cxl/cxl-mailbox-utils.c  | 421 +++-
 hw/mem/cxl_type3.c  | 539 +---
 hw/mem/cxl_type3_stubs.c|   6 +
 include/hw/cxl/cxl_device.h |  49 +++-
 include/hw/cxl/cxl_events.h |  16 ++
 qapi/cxl.json   |  49 
 6 files changed, 1034 insertions(+), 46 deletions(-)

-- 
2.39.2




Re: [PATCH v6 4/5] intel_iommu: allow Extended Interrupt Mode when using userspace APIC

2023-07-20 Thread Peter Xu
On Mon, Jul 17, 2023 at 11:29:56PM +0700, Bui Quang Minh wrote:
> On 7/17/23 17:47, Joao Martins wrote:
> > +Peter, +Jason (intel-iommu maintainer/reviewer)

Thanks for copying me, Joan.

> > 
> > On 15/07/2023 16:22, Bui Quang Minh wrote:
> > > As userspace APIC now supports x2APIC, intel interrupt remapping
> > > hardware can be set to EIM mode when userspace local APIC is used.
> > > 
> > > Reviewed-by: Michael S. Tsirkin 
> > > Signed-off-by: Bui Quang Minh 
> > > ---
> > >   hw/i386/intel_iommu.c | 11 ---
> > >   1 file changed, 11 deletions(-)
> > > 
> > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > index dcc334060c..5e576f6059 100644
> > > --- a/hw/i386/intel_iommu.c
> > > +++ b/hw/i386/intel_iommu.c
> > > @@ -4043,17 +4043,6 @@ static bool vtd_decide_config(IntelIOMMUState *s, 
> > > Error **errp)
> > > && x86_iommu_ir_supported(x86_iommu) ?
> > > ON_OFF_AUTO_ON : 
> > > ON_OFF_AUTO_OFF;
> > >   }
> > > -if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) {
> > > -if (!kvm_irqchip_is_split()) {
> > > -error_setg(errp, "eim=on requires 
> > > accel=kvm,kernel-irqchip=split");
> > > -return false;
> > > -}
> > > -if (!kvm_enable_x2apic()) {
> > > -error_setg(errp, "eim=on requires support on the KVM side"
> > > - "(X2APIC_API, first shipped in v4.7)");
> > > -return false;
> > > -}
> > > -}
> > Given commit 20ca47429e ('Revert "intel_iommu: Fix irqchip / X2APIC
> > configuration checks"'), won't we regress behaviour again  for the accel=kvm
> > case by dropping the kvm_enable_x2apic() call here?
> > 
> > Perhaps if we support userspace APIC with TCG the check just needs to be 
> > redone
> > to instead avoid always requiring kvm e.g.:
> > 
> > if (kvm_irqchip_in_kernel()) {
> >  error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split"
> > "(X2APIC_API, first shipped in v4.7)");
> > }
> > 
> > if (kvm_irqchip_is_split() && !kvm_enable_x2apic()) {
> >  error_setg(errp, "eim=on requires support on the KVM side"
> > "(X2APIC_API, first shipped in v4.7)");
> >  return false;
> > }
> 
> Thank you for your review. I think the check for kvm_irqchip_in_kernel() is
> not correct, AFAIK, kvm_irqchip_is_split() == true also means
> kvm_irqchip_in_kernel() == true on x86. To check if kernel-irqchip = on, we
> need to do something like in x86_iommu_realize
> 
> bool irq_all_kernel = kvm_irqchip_in_kernel() &&
> !kvm_irqchip_is_split();
> 
> The original check for !kvm_irqchip_is_split means emulated/userspace APIC.
> It's because to reach that check x86_iommu_ir_supported(...) == true and
> x86_iommu_ir_supported(...) == true is not supported when kernel-irqchip =
> on (there is a check for this in x86_iommu_realize)
> 
> So I think we need to change the check to
> 
> if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) {
> if (kvm_irqchip_is_split() && !kvm_enable_x2apic()) {
> error_setg(errp, "eim=on requires support on the KVM side"
>  "(X2APIC_API, first shipped in v4.7)");
> return false;
> }
> }
> 
> Is it OK?

Mostly to me, except that we may also want to keep failing if all irq chips
are in kernel?
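
A sketch of what the combined check might look like, building on the snippet
above and keeping an explicit rejection when the whole irqchip is in the
kernel (helper names as already used in this thread):

if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) {
    /* A fully in-kernel irqchip still cannot provide eim=on. */
    if (kvm_irqchip_in_kernel() && !kvm_irqchip_is_split()) {
        error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split");
        return false;
    }
    /* The split irqchip additionally needs X2APIC_API on the KVM side. */
    if (kvm_irqchip_is_split() && !kvm_enable_x2apic()) {
        error_setg(errp, "eim=on requires support on the KVM side"
                   "(X2APIC_API, first shipped in v4.7)");
        return false;
    }
}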

Thanks,

-- 
Peter Xu




[PATCH 6/9] hw/mem/cxl_type3: Add DC extent list representative and get DC extent list mailbox support

2023-07-20 Thread nifan
From: Fan Ni 

Add dynamic capacity extent list representative to the definition of
CXLType3Dev and add get DC extent list mailbox command per
CXL spec 3.0: 8.2.9.8.9.2.

Signed-off-by: Fan Ni 
---
 hw/cxl/cxl-mailbox-utils.c  | 71 +
 hw/mem/cxl_type3.c  |  1 +
 include/hw/cxl/cxl_device.h | 23 
 3 files changed, 95 insertions(+)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index c497298a1d..754ab68b78 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -83,6 +83,7 @@ enum {
 #define CLEAR_POISON   0x2
 DCD_CONFIG  = 0x48, /*r3.0: 8.2.9.8.9*/
 #define GET_DC_CONFIG  0x0
+#define GET_DYN_CAP_EXT_LIST   0x1
 PHYSICAL_SWITCH = 0x51
 #define IDENTIFY_SWITCH_DEVICE  0x0
 };
@@ -1018,6 +1019,73 @@ static CXLRetCode cmd_dcd_get_dyn_cap_config(struct 
cxl_cmd *cmd,
 return CXL_MBOX_SUCCESS;
 }
 
+/*
+ * cxl spec 3.0: 8.2.9.8.9.2
+ * Get Dynamic Capacity Extent List (Opcode 4810h)
+ */
+static CXLRetCode cmd_dcd_get_dyn_cap_ext_list(struct cxl_cmd *cmd,
+CXLDeviceState *cxl_dstate,
+uint16_t *len)
+{
+struct get_dyn_cap_ext_list_in_pl {
+uint32_t extent_cnt;
+uint32_t start_extent_id;
+} QEMU_PACKED;
+
+struct get_dyn_cap_ext_list_out_pl {
+uint32_t count;
+uint32_t total_extents;
+uint32_t generation_num;
+uint8_t rsvd[4];
+CXLDCExtent_raw records[];
+} QEMU_PACKED;
+
+struct get_dyn_cap_ext_list_in_pl *in = (void *)cmd->payload;
+struct get_dyn_cap_ext_list_out_pl *out = (void *)cmd->payload;
+struct CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev,
+cxl_dstate);
+uint16_t record_count = 0, i = 0, record_done = 0;
+CXLDCDExtentList *extent_list = &ct3d->dc.extents;
+CXLDCD_Extent *ent;
+uint16_t out_pl_len;
+uint32_t start_extent_id = in->start_extent_id;
+
+if (start_extent_id > ct3d->dc.total_extent_count) {
+return CXL_MBOX_INVALID_INPUT;
+}
+
+record_count = MIN(in->extent_cnt,
+ct3d->dc.total_extent_count - start_extent_id);
+
+out_pl_len = sizeof(*out) + record_count * sizeof(out->records[0]);
+/* May need more processing here in the future */
+assert(out_pl_len <= CXL_MAILBOX_MAX_PAYLOAD_SIZE);
+
+memset(out, 0, out_pl_len);
+stl_le_p(&out->count, record_count);
+stl_le_p(&out->total_extents, ct3d->dc.total_extent_count);
+stl_le_p(&out->generation_num, ct3d->dc.ext_list_gen_seq);
+
+if (record_count > 0) {
+QTAILQ_FOREACH(ent, extent_list, node) {
+if (i++ < start_extent_id) {
+continue;
+}
+stq_le_p(&out->records[record_done].start_dpa, ent->start_dpa);
+stq_le_p(&out->records[record_done].len, ent->len);
+memcpy(&out->records[record_done].tag, ent->tag, 0x10);
+stw_le_p(&out->records[record_done].shared_seq, ent->shared_seq);
+record_done++;
+if (record_done == record_count) {
+break;
+}
+}
+}
+
+*len = out_pl_len;
+return CXL_MBOX_SUCCESS;
+}
+
 #define IMMEDIATE_CONFIG_CHANGE (1 << 1)
 #define IMMEDIATE_DATA_CHANGE (1 << 2)
 #define IMMEDIATE_POLICY_CHANGE (1 << 3)
@@ -1058,6 +1126,9 @@ static struct cxl_cmd cxl_cmd_set[256][256] = {
 cmd_media_clear_poison, 72, 0 },
 [DCD_CONFIG][GET_DC_CONFIG] = { "DCD_GET_DC_CONFIG",
 cmd_dcd_get_dyn_cap_config, 2, 0 },
+[DCD_CONFIG][GET_DYN_CAP_EXT_LIST] = {
+"DCD_GET_DYNAMIC_CAPACITY_EXTENT_LIST", cmd_dcd_get_dyn_cap_ext_list,
+8, 0 },
 };
 
 static struct cxl_cmd cxl_cmd_set_sw[256][256] = {
diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index af1d919be3..608063ac52 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -789,6 +789,7 @@ static int cxl_create_dc_regions(CXLType3Dev *ct3d)
 
 region_base += region->len;
 }
+QTAILQ_INIT(&ct3d->dc.extents);
 
 return 0;
 }
diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
index ddb24271a8..a9cfe4e904 100644
--- a/include/hw/cxl/cxl_device.h
+++ b/include/hw/cxl/cxl_device.h
@@ -384,6 +384,25 @@ typedef QLIST_HEAD(, CXLPoison) CXLPoisonList;
 
 #define DCD_MAX_REGION_NUM 8
 
+typedef struct CXLDCD_Extent_raw {
+uint64_t start_dpa;
+uint64_t len;
+uint8_t tag[0x10];
+uint16_t shared_seq;
+uint8_t rsvd[0x6];
+} QEMU_PACKED CXLDCExtent_raw;
+
+typedef struct CXLDCD_Extent {
+uint64_t start_dpa;
+uint64_t len;
+uint8_t tag[0x10];
+uint16_t shared_seq;
+uint8_t rsvd[0x6];
+
+QTAILQ_ENTRY(CXLDCD_Extent) node;
+} CXLDCD_Extent;
+typedef QTAILQ_HEAD(, CXLDCD_Extent) CXLDCDExtentList;
+
 typedef struct CXLDCD_Region {
 uint64_t base;
 uint64_t decode_len; /* in multiples of 256MB */
@@ -432,6 +451,10 @@ struct CXLType3Dev {
 
 uint8_t num_regions; /* 0-8 regions */
 struct 

[PATCH 7/9] hw/cxl/cxl-mailbox-utils: Add mailbox commands to support add/release dynamic capacity response

2023-07-20 Thread nifan
From: Fan Ni 

Per CXL spec 3.0, two mailbox commands are implemented:
Add Dynamic Capacity Response (Opcode 4802h) 8.2.9.8.9.3, and
Release Dynamic Capacity (Opcode 4803h) 8.2.9.8.9.4.

Signed-off-by: Fan Ni 
---
 hw/cxl/cxl-mailbox-utils.c  | 253 
 include/hw/cxl/cxl_device.h |   3 +-
 2 files changed, 255 insertions(+), 1 deletion(-)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index 754ab68b78..d547385ba7 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -84,6 +84,8 @@ enum {
 DCD_CONFIG  = 0x48, /*r3.0: 8.2.9.8.9*/
 #define GET_DC_CONFIG  0x0
 #define GET_DYN_CAP_EXT_LIST   0x1
+#define ADD_DYN_CAP_RSP0x2
+#define RELEASE_DYN_CAP0x3
 PHYSICAL_SWITCH = 0x51
 #define IDENTIFY_SWITCH_DEVICE  0x0
 };
@@ -1086,6 +1088,251 @@ static CXLRetCode cmd_dcd_get_dyn_cap_ext_list(struct 
cxl_cmd *cmd,
 return CXL_MBOX_SUCCESS;
 }
 
+/*
+ * Check whether the bits at addr between [nr, nr+size) are all set,
+ * return 1 if all 1s, else return 0
+ */
+static inline int test_bits(const unsigned long *addr, int nr, int size)
+{
+unsigned long res = find_next_zero_bit(addr, size + nr, nr);
+
+return (res >= nr + size) ? 1 : 0;
+}
+
+/*
+ * Find dynamic capacity region id based on dpa range [dpa, dpa+len)
+ */
+static uint8_t find_region_id(struct CXLType3Dev *dev, uint64_t dpa,
+uint64_t len)
+{
+int8_t i = dev->dc.num_regions - 1;
+
+while (i > 0 && dpa < dev->dc.regions[i].base) {
+i--;
+}
+
+if (dpa < dev->dc.regions[i].base
+|| dpa + len > dev->dc.regions[i].base + dev->dc.regions[i].len) {
+return dev->dc.num_regions;
+}
+
+return i;
+}
+
+static void insert_extent_to_extent_list(CXLDCDExtentList *list, uint64_t dpa,
+uint64_t len, uint8_t *tag, uint16_t shared_seq)
+{
+CXLDCD_Extent *extent;
+extent = g_new0(CXLDCD_Extent, 1);
+extent->start_dpa = dpa;
+extent->len = len;
+if (tag) {
+memcpy(extent->tag, tag, 0x10);
+} else {
+memset(extent->tag, 0, 0x10);
+}
+extent->shared_seq = shared_seq;
+
+QTAILQ_INSERT_TAIL(list, extent, node);
+}
+
+typedef struct updated_dc_extent_list_in_pl {
+uint32_t num_entries_updated;
+uint8_t rsvd[4];
+struct { /* r3.0: Table 8-130 */
+uint64_t start_dpa;
+uint64_t len;
+uint8_t rsvd[8];
+} QEMU_PACKED updated_entries[];
+} QEMU_PACKED updated_dc_extent_list_in_pl;
+
+/*
+ * The function only check the input extent list against itself.
+ */
+static CXLRetCode detect_malformed_extent_list(CXLType3Dev *dev,
+const updated_dc_extent_list_in_pl *in)
+{
+unsigned long *blk_bitmap;
+uint64_t min_block_size = dev->dc.regions[0].block_size;
+struct CXLDCD_Region *region = &dev->dc.regions[0];
+uint32_t i;
+uint64_t dpa, len;
+uint8_t rid;
+CXLRetCode ret;
+
+for (i = 1; i < dev->dc.num_regions; i++) {
+region = &dev->dc.regions[i];
+if (min_block_size > region->block_size) {
+min_block_size = region->block_size;
+}
+}
+
+blk_bitmap = bitmap_new((region->len + region->base
+- dev->dc.regions[0].base) / min_block_size);
+
+for (i = 0; i < in->num_entries_updated; i++) {
+dpa = in->updated_entries[i].start_dpa;
+len = in->updated_entries[i].len;
+
+rid = find_region_id(dev, dpa, len);
+if (rid == dev->dc.num_regions) {
+ret = CXL_MBOX_INVALID_PA;
+goto out;
+}
+
+region = &dev->dc.regions[rid];
+if (dpa % region->block_size || len % region->block_size) {
+ret = CXL_MBOX_INVALID_EXTENT_LIST;
+goto out;
+}
+/* the dpa range already covered by some other extents in the list */
+if (test_bits(blk_bitmap, dpa / min_block_size, len / min_block_size)) 
{
+ret = CXL_MBOX_INVALID_EXTENT_LIST;
+goto out;
+}
+bitmap_set(blk_bitmap, dpa / min_block_size, len / min_block_size);
+   }
+
+ret = CXL_MBOX_SUCCESS;
+
+out:
+g_free(blk_bitmap);
+return ret;
+}
+
+/*
+ * cxl spec 3.0: 8.2.9.8.9.3
+ * Add Dynamic Capacity Response (opcode 4802h)
+ * Assume an extent is added only after the response is processed successfully
+ * TODO: for better extent list validation, a better solution would be
+ * maintaining a pending extent list and use it to verify the extent list in
+ * the response.
+ */
+static CXLRetCode cmd_dcd_add_dyn_cap_rsp(struct cxl_cmd *cmd,
+CXLDeviceState *cxl_dstate, uint16_t *len_unused)
+{
+updated_dc_extent_list_in_pl *in = (void *)cmd->payload;
+struct CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev,
+cxl_dstate);
+CXLDCDExtentList *extent_list = &ct3d->dc.extents;
+CXLDCD_Extent *ent;
+uint32_t i;
+uint64_t dpa, len;
+CXLRetCode ret;
+ 
+if 

[PATCH 4/9] hw/mem/cxl_type3: Add support to create DC regions to type3 memory devices

2023-07-20 Thread nifan
From: Fan Ni 

With this change, when setting up memory for a type3 memory device, we can
create DC regions.
A property 'num-dc-regions' is added to ct3_props to allow users to pass the
number of DC regions to create. To make it easier, other region parameters
like region base, length, and block size are hard coded. If needed,
these parameters can be added easily.

With the change, we can create DC regions with proper kernel side
support as below:

region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
echo 1 > /sys/bus/cxl/devices/$region/interleave_ways

echo "dc0" >/sys/bus/cxl/devices/decoder2.0/mode
echo 0x4000 >/sys/bus/cxl/devices/decoder2.0/dpa_size

echo 0x4000 > /sys/bus/cxl/devices/$region/size
echo  "decoder2.0" > /sys/bus/cxl/devices/$region/target0
echo 1 > /sys/bus/cxl/devices/$region/commit
echo $region > /sys/bus/cxl/drivers/cxl_region/bind

Signed-off-by: Fan Ni 
---
 hw/mem/cxl_type3.c | 33 +
 1 file changed, 33 insertions(+)

diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index 237b544b9c..27b5920f7d 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -707,6 +707,34 @@ static void ct3d_reg_write(void *opaque, hwaddr offset, 
uint64_t value,
 }
 }
 
+/*
+ * Create a dc region to test "Get Dynamic Capacity Configuration" command.
+ */
+static int cxl_create_dc_regions(CXLType3Dev *ct3d)
+{
+int i;
+uint64_t region_base = (ct3d->hostvmem ? ct3d->hostvmem->size : 0)
++ (ct3d->hostpmem ? ct3d->hostpmem->size : 0);
+uint64_t region_len = (uint64_t)2 * 1024 * 1024 * 1024;
+uint64_t decode_len = 4; /* 4*256MB */
+uint64_t blk_size = 2 * 1024 * 1024;
+struct CXLDCD_Region *region;
+
+for (i = 0; i < ct3d->dc.num_regions; i++) {
+region = &ct3d->dc.regions[i];
+region->base = region_base;
+region->decode_len = decode_len;
+region->len = region_len;
+region->block_size = blk_size;
+/* dsmad_handle is set when creating cdat table entries */
+region->flags = 0;
+
+region_base += region->len;
+}
+
+return 0;
+}
+
 static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp)
 {
 DeviceState *ds = DEVICE(ct3d);
@@ -775,6 +803,10 @@ static bool cxl_setup_memory(CXLType3Dev *ct3d, Error 
**errp)
 g_free(p_name);
 }
 
+if (cxl_create_dc_regions(ct3d)) {
+return false;
+}
+
 return true;
 }
 
@@ -1062,6 +1094,7 @@ static Property ct3_props[] = {
 DEFINE_PROP_UINT64("sn", CXLType3Dev, sn, UI64_NULL),
 DEFINE_PROP_STRING("cdat", CXLType3Dev, cxl_cstate.cdat.filename),
 DEFINE_PROP_UINT16("spdm", CXLType3Dev, spdm_port, 0),
+DEFINE_PROP_UINT8("num-dc-regions", CXLType3Dev, dc.num_regions, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
-- 
2.39.2




[PATCH 3/9] include/hw/cxl/cxl_device: Rename mem_size as static_mem_size for type3 memory devices

2023-07-20 Thread nifan
From: Fan Ni 

Rename mem_size as static_mem_size for type3 memdev to cover static RAM and
pmem capacity, preparing for the introduction of dynamic capacity to support
dynamic capacity devices.

Signed-off-by: Fan Ni 
---
 hw/cxl/cxl-mailbox-utils.c  | 5 +++--
 hw/mem/cxl_type3.c  | 8 
 include/hw/cxl/cxl_device.h | 2 +-
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index 41e43ec231..59d57ae245 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -540,7 +540,8 @@ static CXLRetCode cmd_identify_memory_device(struct cxl_cmd 
*cmd,
 
 snprintf(id->fw_revision, 0x10, "BWFW VERSION %02d", 0);
 
-stq_le_p(&id->total_capacity, cxl_dstate->mem_size / 
CXL_CAPACITY_MULTIPLIER);
+stq_le_p(&id->total_capacity,
+cxl_dstate->static_mem_size / CXL_CAPACITY_MULTIPLIER);
 stq_le_p(&id->persistent_capacity, cxl_dstate->pmem_size / 
CXL_CAPACITY_MULTIPLIER);
 stq_le_p(&id->volatile_capacity, cxl_dstate->vmem_size / 
CXL_CAPACITY_MULTIPLIER);
 stl_le_p(&id->lsa_size, cvc->get_lsa_size(ct3d));
@@ -879,7 +880,7 @@ static CXLRetCode cmd_media_clear_poison(struct cxl_cmd 
*cmd,
 struct clear_poison_pl *in = (void *)cmd->payload;
 
 dpa = ldq_le_p(&in->dpa);
-if (dpa + CXL_CACHE_LINE_SIZE > cxl_dstate->mem_size) {
+if (dpa + CXL_CACHE_LINE_SIZE > cxl_dstate->static_mem_size) {
 return CXL_MBOX_INVALID_PA;
 }
 
diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index 862107c5ef..237b544b9c 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -748,7 +748,7 @@ static bool cxl_setup_memory(CXLType3Dev *ct3d, Error 
**errp)
 }
 address_space_init(>hostvmem_as, vmr, v_name);
 ct3d->cxl_dstate.vmem_size = memory_region_size(vmr);
-ct3d->cxl_dstate.mem_size += memory_region_size(vmr);
+ct3d->cxl_dstate.static_mem_size += memory_region_size(vmr);
 g_free(v_name);
 }
 
@@ -771,7 +771,7 @@ static bool cxl_setup_memory(CXLType3Dev *ct3d, Error 
**errp)
 }
 address_space_init(>hostpmem_as, pmr, p_name);
 ct3d->cxl_dstate.pmem_size = memory_region_size(pmr);
-ct3d->cxl_dstate.mem_size += memory_region_size(pmr);
+ct3d->cxl_dstate.static_mem_size += memory_region_size(pmr);
 g_free(p_name);
 }
 
@@ -984,7 +984,7 @@ static int cxl_type3_hpa_to_as_and_dpa(CXLType3Dev *ct3d,
 return -EINVAL;
 }
 
-if (*dpa_offset > ct3d->cxl_dstate.mem_size) {
+if (*dpa_offset > ct3d->cxl_dstate.static_mem_size) {
 return -EINVAL;
 }
 
@@ -1142,7 +1142,7 @@ static bool set_cacheline(CXLType3Dev *ct3d, uint64_t 
dpa_offset, uint8_t *data)
 return false;
 }
 
-if (dpa_offset + CXL_CACHE_LINE_SIZE > ct3d->cxl_dstate.mem_size) {
+if (dpa_offset + CXL_CACHE_LINE_SIZE > ct3d->cxl_dstate.static_mem_size) {
 return false;
 }
 
diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
index bf564f4a0b..a32ee6d6ba 100644
--- a/include/hw/cxl/cxl_device.h
+++ b/include/hw/cxl/cxl_device.h
@@ -209,7 +209,7 @@ typedef struct cxl_device_state {
 } timestamp;
 
 /* memory region size, HDM */
-uint64_t mem_size;
+uint64_t static_mem_size;
 uint64_t pmem_size;
 uint64_t vmem_size;
 
-- 
2.39.2




[PATCH 8/9] hw/cxl/events: Add qmp interfaces to add/release dynamic capacity extents

2023-07-20 Thread nifan
From: Fan Ni 

Since fabric manager emulation is not supported yet, the change implements
the functions to add/release dynamic capacity extents as QMP interfaces.

1. Add dynamic capacity extents:

For example, the command to add two continuous extents (each is 128MB long)
to region 0 (starting at dpa offset 0 and 128MB) looks like below:

{ "execute": "qmp_capabilities" }

{ "execute": "cxl-add-dynamic-capacity-event",
  "arguments": {
  "path": "/machine/peripheral/cxl-dcd0",
  "extents": [
  {
  "region-id": 0,
  "dpa": 0,
  "len": 128
  },
  {
  "region-id": 0,
  "dpa": 128,
  "len": 128
  }
  ]
  }
}

2. Release dynamic capacity extents:

For example, the command to release an extent of size 128MB from region 0
(starting at dpa offset 128MB) looks like below:

{ "execute": "cxl-release-dynamic-capacity-event",
  "arguments": {
  "path": "/machine/peripheral/cxl-dcd0",
  "extents": [
  {
  "region-id": 0,
  "dpa": 128,
  "len": 128
  }
  ]
  }
}

Signed-off-by: Fan Ni 
---
 hw/mem/cxl_type3.c  | 145 
 hw/mem/cxl_type3_stubs.c|   6 ++
 include/hw/cxl/cxl_events.h |  16 
 qapi/cxl.json   |  49 
 4 files changed, 216 insertions(+)

diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index 608063ac52..cb1f9182e6 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -1811,6 +1811,151 @@ void qmp_cxl_inject_memory_module_event(const char 
*path, CxlEventLog log,
 }
 }
 
+static const QemuUUID dynamic_capacity_uuid = {
+.data = UUID(0xca95afa7, 0xf183, 0x4018, 0x8c, 0x2f,
+0x95, 0x26, 0x8e, 0x10, 0x1a, 0x2a),
+};
+
+/*
+ * cxl r3.0: Table 8-47
+ * 00h: add capacity
+ * 01h: release capacity
+ * 02h: forced capacity release
+ * 03h: region configuration updated
+ * 04h: Add capacity response
+ * 05h: capacity released
+ */
+enum DC_Event_Type {
+DC_EVENT_ADD_CAPACITY,
+DC_EVENT_RELEASE_CAPACITY,
+DC_EVENT_FORCED_RELEASE_CAPACITY,
+DC_EVENT_REGION_CONFIG_UPDATED,
+DC_EVENT_ADD_CAPACITY_RSP,
+DC_EVENT_CAPACITY_RELEASED,
+DC_EVENT_NUM
+};
+
+#define MEM_BLK_SIZE_MB 128
+static void qmp_cxl_process_dynamic_capacity_event(const char *path,
+CxlEventLog log, enum DC_Event_Type type,
+uint16_t hid, CXLDCExtentRecordList *records, Error **errp)
+{
+Object *obj = object_resolve_path(path, NULL);
+CXLEventDynamicCapacity dCap;
+CXLEventRecordHdr *hdr = &dCap.hdr;
+CXLDeviceState *cxlds;
+CXLType3Dev *dcd;
+uint8_t flags = 1 << CXL_EVENT_TYPE_INFO;
+uint32_t num_extents = 0;
+CXLDCExtentRecordList *list = records;
+CXLDCExtent_raw *extents;
+uint64_t dpa, len;
+uint8_t rid;
+int i;
+
+if (!obj) {
+error_setg(errp, "Unable to resolve path");
+return;
+}
+if (!object_dynamic_cast(obj, TYPE_CXL_TYPE3)) {
+error_setg(errp, "Path not point to a valid CXL type3 device");
+return;
+}
+
+dcd = CXL_TYPE3(obj);
+cxlds = &dcd->cxl_dstate;
+memset(&dCap, 0, sizeof(dCap));
+
+if (!dcd->dc.num_regions) {
+error_setg(errp, "No dynamic capacity support from the device");
+return;
+}
+
+while (list) {
+dpa = list->value->dpa * 1024 * 1024;
+len = list->value->len * 1024 * 1024;
+rid = list->value->region_id;
+
+if (rid >= dcd->dc.num_regions) {
+error_setg(errp, "region id is too large");
+return;
+}
+
+if (dpa % dcd->dc.regions[rid].block_size
+|| len % dcd->dc.regions[rid].block_size) {
+error_setg(errp, "dpa or len is not aligned to region block size");
+return;
+}
+
+if (dpa + len > dcd->dc.regions[rid].decode_len * 256 * 1024 * 1024) {
+error_setg(errp, "extent range is beyond the region end");
+return;
+}
+
+num_extents++;
+list = list->next;
+}
+
+i = 0;
+list = records;
+extents = g_new0(CXLDCExtent_raw, num_extents);
+while (list) {
+dpa = list->value->dpa * 1024 * 1024;
+len = list->value->len * 1024 * 1024;
+rid = list->value->region_id;
+
+extents[i].start_dpa = dpa + dcd->dc.regions[rid].base;
+extents[i].len = len;
+memset(extents[i].tag, 0, 0x10);
+extents[i].shared_seq = 0;
+
+list = list->next;
+i++;
+}
+
+/*
+ * 8.2.9.1.5
+ * All Dynamic Capacity event records shall set the Event Record
+ * Severity field in the Common Event Record Format to Informational
+ * Event. All Dynamic Capacity related events shall be logged in the
+ * Dynamic Capacity Event Log.
+ */
+cxl_assign_event_header(hdr, &dynamic_capacity_uuid, flags, sizeof(dCap),
+cxl_device_get_timestamp(&dcd->cxl_dstate));
+
+dCap.type = type;
+stw_le_p(&dCap.host_id, hid);
+/* only valid 

[PATCH 9/9] hw/mem/cxl_type3: Add dpa range validation for accesses to dc regions

2023-07-20 Thread nifan
From: Fan Ni 

Not every dpa range in the dc regions is valid to access until an extent
covering the range has been added. Add a bitmap for each region to
record whether a dc block in the region has been backed by a dc extent.
Each bit in the bitmap represents a dc block. When a dc extent is added,
all the bits of the blocks in the extent are set; they are cleared again
when the extent is released.

Signed-off-by: Fan Ni 
---
 hw/mem/cxl_type3.c  | 155 
 include/hw/cxl/cxl_device.h |   1 +
 2 files changed, 156 insertions(+)

diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index cb1f9182e6..e673287804 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -787,13 +787,37 @@ static int cxl_create_dc_regions(CXLType3Dev *ct3d)
 /* dsmad_handle is set when creating cdat table entries */
 region->flags = 0;
 
+region->blk_bitmap = bitmap_new(region->len / region->block_size);
+if (!region->blk_bitmap) {
+break;
+}
+
 region_base += region->len;
 }
+
+if (i < ct3d->dc.num_regions) {
+while (--i >= 0) {
+g_free(ct3d->dc.regions[i].blk_bitmap);
+}
+return -1;
+}
+
 QTAILQ_INIT(&ct3d->dc.extents);
 
 return 0;
 }
 
+static void cxl_destroy_dc_regions(CXLType3Dev *ct3d)
+{
+int i;
+struct CXLDCD_Region *region;
+
+for (i = 0; i < ct3d->dc.num_regions; i++) {
+region = &ct3d->dc.regions[i];
+g_free(region->blk_bitmap);
+}
+}
+
 static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp)
 {
 DeviceState *ds = DEVICE(ct3d);
@@ -1021,6 +1045,7 @@ err_free_special_ops:
 g_free(regs->special_ops);
 err_address_space_free:
 if (ct3d->dc.host_dc) {
+cxl_destroy_dc_regions(ct3d);
 address_space_destroy(&ct3d->dc.host_dc_as);
 }
 if (ct3d->hostpmem) {
@@ -1043,6 +1068,7 @@ static void ct3_exit(PCIDevice *pci_dev)
 spdm_sock_fini(ct3d->doe_spdm.socket);
 g_free(regs->special_ops);
 if (ct3d->dc.host_dc) {
+cxl_destroy_dc_regions(ct3d);
 address_space_destroy(&ct3d->dc.host_dc_as);
 }
 if (ct3d->hostpmem) {
@@ -1053,6 +1079,110 @@ static void ct3_exit(PCIDevice *pci_dev)
 }
 }
 
+/*
+ * This function will mark the dpa range [dpa, dpa + len) as backed and
+ * accessible; this happens when a dc extent is added and accepted by the
+ * host.
+ */
+static void set_region_block_backed(CXLType3Dev *ct3d, uint64_t dpa,
+uint64_t len)
+{
+int i;
+CXLDCD_Region *region = &ct3d->dc.regions[0];
+
+if (dpa < region->base
+|| dpa >= region->base + ct3d->dc.total_capacity)
+return;
+
+/*
+ * spec 3.0 9.13.3: Regions are used in increasing-DPA order, with
+ * Region 0 being used for the lowest DPA of Dynamic Capacity and
+ * Region 7 for the highest DPA.
+ * So we check from the last region to find where the dpa belongs.
+ * access across multiple regions is not allowed.
+ **/
+for (i = ct3d->dc.num_regions - 1; i >= 0; i--) {
+region = &ct3d->dc.regions[i];
+if (dpa >= region->base) {
+break;
+}
+}
+
+bitmap_set(region->blk_bitmap, (dpa - region->base) / region->block_size,
+len / region->block_size);
+}
+
+/*
+ * This function checks whether a dpa range [dpa, dpa + len) has been backed
+ * with dc extents, used when validating read/write to dc regions
+ */
+static bool test_region_block_backed(CXLType3Dev *ct3d, uint64_t dpa,
+uint64_t len)
+{
+int i;
+CXLDCD_Region *region = &ct3d->dc.regions[0];
+uint64_t nbits;
+long nr;
+
+if (dpa < region->base
+|| dpa >= region->base + ct3d->dc.total_capacity)
+return false;
+
+/*
+ * spec 3.0 9.13.3: Regions are used in increasing-DPA order, with
+ * Region 0 being used for the lowest DPA of Dynamic Capacity and
+ * Region 7 for the highest DPA.
+ * So we check from the last region to find where the dpa belongs.
+ * access across multiple regions is not allowed.
+ */
+for (i = ct3d->dc.num_regions - 1; i >= 0; i--) {
+region = &ct3d->dc.regions[i];
+if (dpa >= region->base) {
+break;
+}
+}
+
+nr = (dpa - region->base) / region->block_size;
+nbits = len / region->block_size;
+return find_next_zero_bit(region->blk_bitmap, nbits, nr) >= nr + nbits;
+}
+
+/*
+ * This function will mark the dpa range [dpa, dpa + len) as unbacked and
+ * inaccessible; this happens when a dc extent is released and the release
+ * is accepted by the host.
+ */
+static void clear_region_block_backed(CXLType3Dev *ct3d, uint64_t dpa,
+uint64_t len)
+{
+int i;
+CXLDCD_Region *region = &ct3d->dc.regions[0];
+uint64_t nbits;
+long nr;
+
+if (dpa < region->base
+|| dpa >= region->base + ct3d->dc.total_capacity)
+return;
+
+/*
+ * spec 3.0 9.13.3: Regions are used in increasing-DPA order, with
+   

[PATCH 2/9] hw/cxl/cxl-mailbox-utils: Add dynamic capacity region representative and mailbox command support

2023-07-20 Thread nifan
From: Fan Ni 

Per cxl spec 3.0, add dynamic capacity region representative based on
Table 8-126 and extend the cxl type3 device definition to include dc region
information. Also, based on info in 8.2.9.8.9.1, add 'Get Dynamic Capacity
Configuration' mailbox support.

Signed-off-by: Fan Ni 
---
 hw/cxl/cxl-mailbox-utils.c  | 69 +
 include/hw/cxl/cxl_device.h | 16 +
 2 files changed, 85 insertions(+)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index b013e30314..41e43ec231 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -81,6 +81,8 @@ enum {
 #define GET_POISON_LIST0x0
 #define INJECT_POISON  0x1
 #define CLEAR_POISON   0x2
+DCD_CONFIG  = 0x48, /*r3.0: 8.2.9.8.9*/
+#define GET_DC_CONFIG  0x0
 PHYSICAL_SWITCH = 0x51
 #define IDENTIFY_SWITCH_DEVICE  0x0
 };
@@ -939,6 +941,71 @@ static CXLRetCode cmd_media_clear_poison(struct cxl_cmd 
*cmd,
 return CXL_MBOX_SUCCESS;
 }
 
+/*
+ * cxl spec 3.0: 8.2.9.8.9.1
+ * Get Dynamic Capacity Configuration
+ **/
+static CXLRetCode cmd_dcd_get_dyn_cap_config(struct cxl_cmd *cmd,
+CXLDeviceState *cxl_dstate,
+uint16_t *len)
+{
+struct get_dyn_cap_config_in_pl {
+uint8_t region_cnt;
+uint8_t start_region_id;
+} QEMU_PACKED;
+
+struct get_dyn_cap_config_out_pl {
+uint8_t num_regions;
+uint8_t rsvd1[7];
+struct {
+uint64_t base;
+uint64_t decode_len;
+uint64_t region_len;
+uint64_t block_size;
+uint32_t dsmadhandle;
+uint8_t flags;
+uint8_t rsvd2[3];
+} QEMU_PACKED records[];
+} QEMU_PACKED;
+
+struct get_dyn_cap_config_in_pl *in = (void *)cmd->payload;
+struct get_dyn_cap_config_out_pl *out = (void *)cmd->payload;
+struct CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev,
+cxl_dstate);
+uint16_t record_count = 0, i;
+uint16_t out_pl_len;
+uint8_t start_region_id = in->start_region_id;
+
+if (start_region_id >= ct3d->dc.num_regions) {
+return CXL_MBOX_INVALID_INPUT;
+}
+
+record_count = MIN(ct3d->dc.num_regions - in->start_region_id,
+in->region_cnt);
+
+out_pl_len = sizeof(*out) + record_count * sizeof(out->records[0]);
+assert(out_pl_len <= CXL_MAILBOX_MAX_PAYLOAD_SIZE);
+
+memset(out, 0, out_pl_len);
+out->num_regions = record_count;
+for (i = 0; i < record_count; i++) {
+stq_le_p(&out->records[i].base,
+ct3d->dc.regions[start_region_id + i].base);
+stq_le_p(&out->records[i].decode_len,
+ct3d->dc.regions[start_region_id + i].decode_len);
+stq_le_p(&out->records[i].region_len,
+ct3d->dc.regions[start_region_id + i].len);
+stq_le_p(&out->records[i].block_size,
+ct3d->dc.regions[start_region_id + i].block_size);
+stl_le_p(&out->records[i].dsmadhandle,
+ct3d->dc.regions[start_region_id + i].dsmadhandle);
+out->records[i].flags = ct3d->dc.regions[start_region_id + i].flags;
+}
+
+*len = out_pl_len;
+return CXL_MBOX_SUCCESS;
+}
+
 #define IMMEDIATE_CONFIG_CHANGE (1 << 1)
 #define IMMEDIATE_DATA_CHANGE (1 << 2)
 #define IMMEDIATE_POLICY_CHANGE (1 << 3)
@@ -977,6 +1044,8 @@ static struct cxl_cmd cxl_cmd_set[256][256] = {
 cmd_media_inject_poison, 8, 0 },
 [MEDIA_AND_POISON][CLEAR_POISON] = { "MEDIA_AND_POISON_CLEAR_POISON",
 cmd_media_clear_poison, 72, 0 },
+[DCD_CONFIG][GET_DC_CONFIG] = { "DCD_GET_DC_CONFIG",
+cmd_dcd_get_dyn_cap_config, 2, 0 },
 };
 
 static struct cxl_cmd cxl_cmd_set_sw[256][256] = {
diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
index cd7f28dba8..bf564f4a0b 100644
--- a/include/hw/cxl/cxl_device.h
+++ b/include/hw/cxl/cxl_device.h
@@ -382,6 +382,17 @@ typedef struct CXLPoison {
 typedef QLIST_HEAD(, CXLPoison) CXLPoisonList;
 #define CXL_POISON_LIST_LIMIT 256
 
+#define DCD_MAX_REGION_NUM 8
+
+typedef struct CXLDCD_Region {
+uint64_t base;
+uint64_t decode_len; /* in multiples of 256MB */
+uint64_t len;
+uint64_t block_size;
+uint32_t dsmadhandle;
+uint8_t flags;
+} CXLDCD_Region;
+
 struct CXLType3Dev {
 /* Private */
 PCIDevice parent_obj;
@@ -413,6 +424,11 @@ struct CXLType3Dev {
 unsigned int poison_list_cnt;
 bool poison_list_overflowed;
 uint64_t poison_list_overflow_ts;
+
+struct dynamic_capacity {
+uint8_t num_regions; /* 0-8 regions */
+struct CXLDCD_Region regions[DCD_MAX_REGION_NUM];
+} dc;
 };
 
 #define TYPE_CXL_TYPE3 "cxl-type3"
-- 
2.39.2




[PATCH 5/9] hw/mem/cxl_type3: Add host backend and address space handling for DC regions

2023-07-20 Thread nifan
From: Fan Ni 

Add a (file- or memory-backed) host backend; all the dynamic capacity regions
will share a single, sufficiently large host backend. Set up an address space
for DC regions to support read/write operations to dynamic capacity for DCD.

With this change, the following support is added:
1. add a new property to type3 device "nonvolatile-dc-memdev" to point to host
   memory backend for dynamic capacity;
2. add namespace for dynamic capacity for read/write support;
3. create cdat entries for each dynamic capacity region;
4. fix dvsec range registers to include DC regions.

Signed-off-by: Fan Ni 
---
 hw/cxl/cxl-mailbox-utils.c  |  19 +++-
 hw/mem/cxl_type3.c  | 203 +---
 include/hw/cxl/cxl_device.h |   4 +
 3 files changed, 185 insertions(+), 41 deletions(-)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index 59d57ae245..c497298a1d 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -388,9 +388,11 @@ static CXLRetCode cmd_firmware_update_get_info(struct 
cxl_cmd *cmd,
 char fw_rev4[0x10];
 } QEMU_PACKED *fw_info;
 QEMU_BUILD_BUG_ON(sizeof(*fw_info) != 0x50);
+CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
 
 if ((cxl_dstate->vmem_size < CXL_CAPACITY_MULTIPLIER) ||
-(cxl_dstate->pmem_size < CXL_CAPACITY_MULTIPLIER)) {
+(cxl_dstate->pmem_size < CXL_CAPACITY_MULTIPLIER) ||
+(ct3d->dc.total_capacity < CXL_CAPACITY_MULTIPLIER)) {
 return CXL_MBOX_INTERNAL_ERROR;
 }
 
@@ -531,7 +533,8 @@ static CXLRetCode cmd_identify_memory_device(struct cxl_cmd 
*cmd,
 CXLType3Class *cvc = CXL_TYPE3_GET_CLASS(ct3d);
 
 if ((!QEMU_IS_ALIGNED(cxl_dstate->vmem_size, CXL_CAPACITY_MULTIPLIER)) ||
-(!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, CXL_CAPACITY_MULTIPLIER))) {
+(!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, CXL_CAPACITY_MULTIPLIER)) ||
+(!QEMU_IS_ALIGNED(ct3d->dc.total_capacity, CXL_CAPACITY_MULTIPLIER))) {
 return CXL_MBOX_INTERNAL_ERROR;
 }
 
@@ -566,9 +569,11 @@ static CXLRetCode cmd_ccls_get_partition_info(struct 
cxl_cmd *cmd,
 uint64_t next_pmem;
 } QEMU_PACKED *part_info = (void *)cmd->payload;
 QEMU_BUILD_BUG_ON(sizeof(*part_info) != 0x20);
+CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
 
 if ((!QEMU_IS_ALIGNED(cxl_dstate->vmem_size, CXL_CAPACITY_MULTIPLIER)) ||
-(!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, CXL_CAPACITY_MULTIPLIER))) {
+(!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, CXL_CAPACITY_MULTIPLIER)) ||
+(!QEMU_IS_ALIGNED(ct3d->dc.total_capacity, CXL_CAPACITY_MULTIPLIER))) {
 return CXL_MBOX_INTERNAL_ERROR;
 }
 
@@ -880,7 +885,13 @@ static CXLRetCode cmd_media_clear_poison(struct cxl_cmd 
*cmd,
 struct clear_poison_pl *in = (void *)cmd->payload;
 
 dpa = ldq_le_p(&in->dpa);
-if (dpa + CXL_CACHE_LINE_SIZE > cxl_dstate->static_mem_size) {
+if (dpa + CXL_CACHE_LINE_SIZE >= cxl_dstate->static_mem_size
+&& ct3d->dc.num_regions == 0) {
+return CXL_MBOX_INVALID_PA;
+}
+
+if (ct3d->dc.num_regions && dpa + CXL_CACHE_LINE_SIZE >=
+cxl_dstate->static_mem_size + ct3d->dc.total_capacity) {
 return CXL_MBOX_INVALID_PA;
 }
 
diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index 27b5920f7d..af1d919be3 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -20,6 +20,7 @@
 #include "hw/pci/spdm.h"
 
 #define DWORD_BYTE 4
+#define CXL_CAPACITY_MULTIPLIER   (256 * MiB)
 
 /* Default CDAT entries for a memory region */
 enum {
@@ -33,8 +34,8 @@ enum {
 };
 
 static int ct3_build_cdat_entries_for_mr(CDATSubHeader **cdat_table,
- int dsmad_handle, MemoryRegion *mr,
- bool is_pmem, uint64_t dpa_base)
+int dsmad_handle, uint8_t flags,
+uint64_t dpa_base, uint64_t size)
 {
 g_autofree CDATDsmas *dsmas = NULL;
 g_autofree CDATDslbis *dslbis0 = NULL;
@@ -53,9 +54,9 @@ static int ct3_build_cdat_entries_for_mr(CDATSubHeader 
**cdat_table,
 .length = sizeof(*dsmas),
 },
 .DSMADhandle = dsmad_handle,
-.flags = is_pmem ? CDAT_DSMAS_FLAG_NV : 0,
+.flags = flags,
 .DPA_base = dpa_base,
-.DPA_length = memory_region_size(mr),
+.DPA_length = size,
 };
 
 /* For now, no memory side cache, plausiblish numbers */
@@ -137,9 +138,9 @@ static int ct3_build_cdat_entries_for_mr(CDATSubHeader 
**cdat_table,
  * NV: Reserved - the non volatile from DSMAS matters
  * V: EFI_MEMORY_SP
  */
-.EFI_memory_type_attr = is_pmem ? 2 : 1,
+.EFI_memory_type_attr = flags ? 2 : 1,
 .DPA_offset = 0,
-.DPA_length = memory_region_size(mr),
+.DPA_length = size,
 };
 
 /* Header always at start of structure */
@@ -158,21 +159,28 @@ static int ct3_build_cdat_table(CDATSubHeader 

[PATCH 1/9] hw/cxl/cxl-mailbox-utils: Add dc_event_log_size field to output payload of identify memory device command

2023-07-20 Thread nifan
From: Fan Ni 

Based on CXL spec 3.0 Table 8-94 (Identify Memory Device Output
Payload), the dynamic capacity event log size should be part of the
output of the Identify command.
Add dc_event_log_size to the output payload for the host to get the info.

Signed-off-by: Fan Ni 
---
 hw/cxl/cxl-mailbox-utils.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index ad7a6116e4..b013e30314 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -21,6 +21,8 @@
 #include "sysemu/hostmem.h"
 
 #define CXL_CAPACITY_MULTIPLIER   (256 * MiB)
+/* Experimental value: dynamic capacity event log size */
+#define CXL_DC_EVENT_LOG_SIZE 8
 
 /*
  * How to add a new command, example. The command set FOO, with cmd BAR.
@@ -519,8 +521,9 @@ static CXLRetCode cmd_identify_memory_device(struct cxl_cmd 
*cmd,
 uint16_t inject_poison_limit;
 uint8_t poison_caps;
 uint8_t qos_telemetry_caps;
+uint16_t dc_event_log_size;
 } QEMU_PACKED *id;
-QEMU_BUILD_BUG_ON(sizeof(*id) != 0x43);
+QEMU_BUILD_BUG_ON(sizeof(*id) != 0x45);
 
 CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
 CXLType3Class *cvc = CXL_TYPE3_GET_CLASS(ct3d);
@@ -543,6 +546,7 @@ static CXLRetCode cmd_identify_memory_device(struct cxl_cmd 
*cmd,
 st24_le_p(id->poison_list_max_mer, 256);
 /* No limit - so limited by main poison record limit */
 stw_le_p(&id->inject_poison_limit, 0);
+stw_le_p(&id->dc_event_log_size, CXL_DC_EVENT_LOG_SIZE);
 
 *len = sizeof(*id);
 return CXL_MBOX_SUCCESS;
-- 
2.39.2




Re: [PATCH v1] migration: refactor migration_completion

2023-07-20 Thread Peter Xu
On Tue, Jul 18, 2023 at 01:25:12PM +, Wang, Wei W wrote:
> On Tuesday, July 18, 2023 1:44 PM, Isaku Yamahata wrote:
> > > --- a/migration/migration.c
> > > +++ b/migration/migration.c
> > > @@ -2058,6 +2058,21 @@ static int
> > await_return_path_close_on_source(MigrationState *ms)
> > >  return ms->rp_state.error;
> > >  }
> > >
> > > +static int close_return_path_on_source(MigrationState *ms) {
> > > +int ret;
> > > +
> > > +if (!ms->rp_state.rp_thread_created) {
> > > +return 0;
> > > +}
> > > +
> > > +trace_migration_return_path_end_before();
> > > +ret = await_return_path_close_on_source(ms);
> > > +trace_migration_return_path_end_after(ret);
> > > +
> > > +return ret;
> > > +}
> > > +
> > 
> > There is only one caller, migration_completion().  We can consolidate two
> > functions, await_return_path_close_on_source() and
> > close_return_path_on_source(), into single function.
> 
> Sounds good, thanks.
> 
> > > +static int migration_completion_postcopy(MigrationState *s) {
> > > +trace_migration_completion_postcopy_end();
> > > +
> > > +qemu_mutex_lock_iothread();
> > > +qemu_savevm_state_complete_postcopy(s->to_dst_file);
> > > +qemu_mutex_unlock_iothread();
> > > +
> > > +/*
> > > + * Shutdown the postcopy fast path thread.  This is only needed when
> > dest
> > > + * QEMU binary is old (7.1/7.2).  QEMU 8.0+ doesn't need this.
> > > + */
> > > +if (migrate_postcopy_preempt() && s->preempt_pre_7_2) {
> > > +postcopy_preempt_shutdown_file(s);
> > > +}
> > > +
> > > +trace_migration_completion_postcopy_end_after_complete();
> > > +
> > > +return 0;
> > 
> > Always return 0?  Make it void.
> 
> OK.
> 
> > > +static void migration_completion(MigrationState *s) {
> > > +int ret = -1;
> > > +int current_active_state = s->state;
> > > +
> > > +if (s->state == MIGRATION_STATUS_ACTIVE) {
> > > +ret = migration_completion_precopy(s, &current_active_state);
> > > +} else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
> > > +ret = migration_completion_postcopy(s);
> > 
> > Here we can set ret = 0.
> 
> Yes, after migration_completion_postcopy is made "void".

Looks good to me, after addressing Isaku's comments.

The current_active_state is very unfortunate, along with most of the calls
to migrate_set_state() - I bet most of the code will definitely go wrong if
that cmpxchg didn't succeed inside of migrate_set_state(), IOW in most
cases we simply always want:

  migrate_set_state(&s->state, s->state, XXX);

Not sure whether one pre-requisite patch is good to have so we can rename
migrate_set_state() to something like __migrate_set_state(), then:

  migrate_set_state(s, XXX) {
    __migrate_set_state(&s->state, s->state, XXX);
  }

I don't even know whether there's any call site that will need
__migrate_set_state() for real..

Thanks,

-- 
Peter Xu




[RFC 2/2] cxl/mailbox: interface to add CCI commands to an existing CCI

2023-07-20 Thread Gregory Price
This enables wrapper devices to customize the base device's CCI
(for example, with custom commands outside the specification)
without the need to change the base device.

This also enables the base device to dispatch those commands without
requiring additional driver support.

Signed-off-by: Gregory Price 

---
 hw/cxl/cxl-mailbox-utils.c |  19 +++
 include/hw/cxl/cxl_device.h|   2 ++
 2 files changed, 21 insertions(+)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index ddee3f1718..cad0cd0adb 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -1383,6 +1383,25 @@ static void cxl_copy_cci_commands(CXLCCI *cci, const 
struct cxl_cmd (*cxl_cmds)[
 }
 }
 
+void cxl_add_cci_commands(CXLCCI *cci, const struct cxl_cmd 
(*cxl_cmd_set)[256], size_t payload_max)
+{
+cci->payload_max = payload_max > cci->payload_max ? payload_max : 
cci->payload_max;
+for (int set = 0; set < 256; set++) {
+for (int cmd = 0; cmd < 256; cmd++) {
+if (cxl_cmd_set[set][cmd].handler) {
+const struct cxl_cmd *c = &cxl_cmd_set[set][cmd];
+cci->cxl_cmd_set[set][cmd] = *c;
+struct cel_log *log =
+&cci->cel_log[cci->cel_size];
+
+log->opcode = (set << 8) | cmd;
+log->effect = c->effect;
+cci->cel_size++;
+}
+}
+}
+}
+
 void cxl_initialize_mailbox_swcci(CXLCCI *cci, DeviceState *intf, DeviceState 
*d, size_t payload_max)
 {
 cxl_copy_cci_commands(cci, cxl_cmd_set_sw);

diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
index 9a3c8b2dfa..abc8405cc5 100644
--- a/include/hw/cxl/cxl_device.h
+++ b/include/hw/cxl/cxl_device.h
@@ -297,6 +297,8 @@ void cxl_initialize_mailbox_t3(CXLCCI *cci, DeviceState *d, 
size_t payload_max);
 void cxl_initialize_mailbox_swcci(CXLCCI *cci, DeviceState *intf,
   DeviceState *d, size_t payload_max);
 void cxl_init_cci(CXLCCI *cci, size_t payload_max);
+void cxl_add_cci_commands(CXLCCI *cci, const struct cxl_cmd 
(*cxl_cmd_set)[256],
+  size_t payload_max);
 int cxl_process_cci_message(CXLCCI *cci, uint8_t set, uint8_t cmd,
 size_t len_in, uint8_t *pl_in,
 size_t *len_out, uint8_t *pl_out,
-- 
2.39.1




[RFC 1/2] cxl/mailbox: change CCI cmd set structure to be a member, not a reference

2023-07-20 Thread Gregory Price
This allows devices to have fully customized CCIs, along with complex
devices where wrapper devices can override or add additional CCI
commands without having to replicate full command structures or
pollute a base device with every command that might ever be used.

Signed-off-by: Gregory Price 

---
 hw/cxl/cxl-mailbox-utils.c |  18 ++
 include/hw/cxl/cxl_device.h|   2 +-
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index 2819914e8d..ddee3f1718 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -1373,9 +1373,19 @@ void cxl_init_cci(CXLCCI *cci, size_t payload_max)
  bg_timercb, cci);
 }
 
+static void cxl_copy_cci_commands(CXLCCI *cci, const struct cxl_cmd 
(*cxl_cmds)[256]) {
+for (int set = 0; set < 256; set++) {
+for (int cmd = 0; cmd < 256; cmd++) {
+if (cxl_cmds[set][cmd].handler) {
+cci->cxl_cmd_set[set][cmd] = cxl_cmds[set][cmd];
+}
+}
+}
+}
+
 void cxl_initialize_mailbox_swcci(CXLCCI *cci, DeviceState *intf, DeviceState 
*d, size_t payload_max)
 {
-cci->cxl_cmd_set = cxl_cmd_set_sw;
+cxl_copy_cci_commands(cci, cxl_cmd_set_sw);
 cci->d = d;
 cci->intf = intf;
 cxl_init_cci(cci, payload_max);
@@ -1383,7 +1393,7 @@ void cxl_initialize_mailbox_swcci(CXLCCI *cci, 
DeviceState *intf, DeviceState *d
 
 void cxl_initialize_mailbox_t3(CXLCCI *cci, DeviceState *d, size_t payload_max)
 {
-cci->cxl_cmd_set = cxl_cmd_set;
+cxl_copy_cci_commands(cci, cxl_cmd_set);
 cci->d = d;
  
 /* No separation for PCI MB as protocol handled in PCI device */
@@ -1398,7 +1408,7 @@ static const struct cxl_cmd cxl_cmd_set_t3_mctp[256][256] 
= {
 void cxl_initialize_t3_mctpcci(CXLCCI *cci, DeviceState *d, DeviceState *intf,
size_t payload_max)
 {
-cci->cxl_cmd_set = cxl_cmd_set_t3_mctp;
+cxl_copy_cci_commands(cci, cxl_cmd_set_t3_mctp);
 cci->d = d;
 cci->intf = intf;
 cxl_init_cci(cci, payload_max);
@@ -1414,7 +1424,7 @@ static const struct cxl_cmd 
cxl_cmd_set_usp_mctp[256][256] = {
 
 void cxl_initialize_usp_mctpcci(CXLCCI *cci, DeviceState *d, DeviceState 
*intf, size_t payload_max)
 {
-cci->cxl_cmd_set = cxl_cmd_set_usp_mctp;
+cxl_copy_cci_commands(cci, cxl_cmd_set_usp_mctp);
 cci->d = d;
 cci->intf = intf;
 cxl_init_cci(cci, payload_max);

diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
index c68981b618..9a3c8b2dfa 100644
--- a/include/hw/cxl/cxl_device.h
+++ b/include/hw/cxl/cxl_device.h
@@ -163,7 +163,7 @@ typedef struct CXLEventLog {
 } CXLEventLog;
 
 typedef struct CXLCCI {
-const struct cxl_cmd (*cxl_cmd_set)[256];
+struct cxl_cmd cxl_cmd_set[256][256];
 struct cel_log {
 uint16_t opcode;
 uint16_t effect;
-- 
2.39.1




Re: [RFC PATCH] docs/interop: define STANDALONE protocol feature for vhost-user

2023-07-20 Thread Stefan Hajnoczi
On Thu, Jul 06, 2023 at 12:48:20PM -0400, Michael S. Tsirkin wrote:
> On Tue, Jul 04, 2023 at 01:36:00PM +0100, Alex Bennée wrote:
> > Currently QEMU has to know some details about the back-end to be able
> > to setup the guest. While various parts of the setup can be delegated
> > to the backend (for example config handling) this is a very piecemeal
> > approach.
> 
> > This patch suggests a new feature flag (VHOST_USER_PROTOCOL_F_STANDALONE)
> > which the back-end can advertise which allows a probe message to be
> > sent to get all the details QEMU needs to know in one message.
> 
> The reason we do piecemeal is that these existing pieces can be reused
> as others evolve or fall by wayside.
> 
> For example, I can think of instances where you want to connect
> specifically to e.g. networking backend, and specify it
> on command line. Reasons could be many, e.g. for debugging,
> or to prevent connecting to wrong device on wrong channel
> (kind of like type safety).
> 
> What is the reason to have 1 message? startup latency?
> How about we allow pipelining several messages then?
> Will be easier.

This flag effectively says that the back-end is a full VIRTIO device
with a Device Status Register, Configuration Space, Virtqueues, the
device type, etc. This is different from previous vhost-user devices
which sometimes just offloaded certain virtqueues without providing the
full VIRTIO device (parts were emulated in the VMM).

So for example, a vhost-user-net device does not support the controlq.
Alex's "standalone" device is a mode where the vhost-user protocol is
used but the back-end must implement a full virtio-net device.
Standalone devices are like vDPA device in this respect.

I think it is important to have a protocol feature bit that advertises
that this is a standalone device, since the semantics are different for
traditional vhost-user-net devices.

However, I think having a single message is inflexible and duplicates
existing vhost-user protocol messages like VHOST_USER_GET_QUEUE_NUM. I
would prefer VHOST_USER_GET_DEVICE_ID and other messages.
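
As a rough sketch of the per-message style (the message IDs and the second
probe below are made up, just to show the shape; each reply mirrors the
existing single-u64 replies like VHOST_USER_GET_QUEUE_NUM):

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical message IDs, for illustration only. */
enum {
    VHOST_USER_GET_DEVICE_ID   = 60,
    VHOST_USER_GET_CONFIG_SIZE = 61,
};

struct probe_reply {
    uint32_t request;
    uint64_t u64;
};

/*
 * Each probe is answered independently, so new probes can be added later
 * without touching existing messages, and a front-end can skip the ones
 * it does not care about.
 */
static bool handle_probe(uint32_t request, struct probe_reply *reply)
{
    reply->request = request;
    switch (request) {
    case VHOST_USER_GET_DEVICE_ID:
        reply->u64 = 1;   /* e.g. virtio-net */
        return true;
    case VHOST_USER_GET_CONFIG_SIZE:
        reply->u64 = 12;  /* bytes of config space the back-end exposes */
        return true;
    default:
        return false;     /* unknown probe */
    }
}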

Stefan




[RFC 0/2] Modify CCI cmd sets to be mutable

2023-07-20 Thread Gregory Price
This is on top of the proposed CCI changes by Jonathan.

Base repo: https://gitlab.com/jic23/qemu
Base branch: cxl-2023-07-17


A proposal to make the CCI cmd sets full members of their device,
and copy the const struct entries instead of referencing them.
This would allow child devices to inherit the parent device default
behavior, but with the flexibility to override without changing the
core device.

This also enables the base device to receive the commands via the
same /dev/cxl/ device, simplifying testing.


An example of how one might override/add commands (paraphrased):


instantiating:

-device cxl-type3,bus=rp0,volatile-memdev=mem0,id=cxl-mem0
-device cxl-my-cmds,target=cxl-mem0


simple code:

static const struct cxl_cmd cxl_cmd_set_my_cmds[256][256] = {
[MY_CMDS][GET_INFO] = { "SPECIAL_GET_INFO",
  cmd_my_cmds_get_info, 0, 0 },}

static void cxl_my_cmds_realize(DeviceState *d, Error **errp)
{
CXL_MyCmds_State *s = CXL_MyCmds(d);

if (object_dynamic_cast(OBJECT(s->target), TYPE_CXL_TYPE3)) {
CXLType3Dev *ct3d = CXL_TYPE3(s->target);

s->type = cxl_type3;
s->cci = &ct3d->cci;

cxl_add_cci_commands(&ct3d->cci, cxl_cmd_set_my_cmds, 512);
return;
}
error_setg(errp, "Unhandled target type for CXL MHDSLD");
}

#define TYPE_CXL_MyCmds "cxl-my-cmds"
static const TypeInfo cxl_my_cmds_info = {
.name = TYPE_CXL_MyCmds,
.parent = TYPE_CXL_TYPE3,
.class_size = sizeof(struct CXL_MyCmdsClass),
.class_init = cxl_my_cmds_class_init,
.instance_size = sizeof(CXL_MyCmds_State),
.interfaces = (InterfaceInfo[]) {
{ INTERFACE_CXL_DEVICE },
{ INTERFACE_PCIE_DEVICE },
{}
},
};

Signed-off-by: Gregory Price 

---
Gregory Price (2):
  cxl/mailbox: change CCI cmd set structure to be a member, not a
reference
  cxl/mailbox: interface to add CCI commands to an existing CCI

 hw/cxl/cxl-mailbox-utils.c  | 37 +
 include/hw/cxl/cxl_device.h |  4 +++-
 2 files changed, 36 insertions(+), 5 deletions(-)

-- 
2.39.1




Re: [RFC PATCH 10/17] misc/i2c_mctp_cxl: Initial device emulation

2023-07-20 Thread Gregory Price
On Thu, Jul 20, 2023 at 01:18:33PM +0100, Jonathan Cameron wrote:
> On Wed, 19 Jul 2023 14:49:07 -0400
> Gregory Price  wrote:
> 
> > 
> > Maybe a dangerous suggestion.  Right now the CCI's are static:
> > 
> > static const struct cxl_cmd cxl_cmd_set[256][256]
> 
> That's defined by the ID space for the commands.  There can't be more than
> that many currently..
> 

Meant commands beyond what is defined in the spec, not beyond the 256
limits.

> > 
> > how difficult might it be to allow these tables to be dynamic instead?
> > Then we could add an interface like this:
> > 
> > void cxl_add_cmd_set(CXLCCI *cci, CXLCCI *cmd_set, payload_max) {
> > copy(cci, cmd_set);
> > }
> > 
> > This would enable not just adding sub-components piece-meal, but also if
> > someone wants to model a real device with custom CCI commands, they can
> > simply define a CCI set and pass it in via
> > 
> > cxl_add_cmd_set(>cci, my_cmd_set, payload_max);
> 
> Ok.  I'm potentially fine with people adding an interface for this, but
> only if they plan to also upstream the QEMU emulation of their actual
> device.
> 

Working on it :]

Hoping to show off a fully functional MHSLD with some custom commands
soon, and I think I'm happy with the abstraction that fell out on top of
this CCI work.  Previously it required duplicating the type3 device or
hacking directly on it, which is a maintenance nightmare / not
upstreamable.

> 
> I'd look to just inherit from a cxl type 3, like Ira did in the PoC for
> type 2 support.   We can then easily add a path to replace the commands
> set with whatever anyone wants.  I'm not sure we want the command line
> to be used to configure such a device as it'll both get very complex and
> prove increasingly hard to test more than a small subset of options.
> 
> https://lore.kernel.org/all/20230517-rfc-type2-dev-v1-0-6eb2e4709...@intel.com/
> 
> Jonathan
> 

I made an attempt at this at first, but due to the custom commands, I
think everyone would (rightly) scoff at the idea of adding
non-specification defined stuff into the core type 3 device.  Once I
shifted to modifying the CCI and overriding entries, the entire vendor
device ended up as mostly the CCI command definitions, which is exactly
how I envisioned doing it in the first place.

I'll post some additional patches to my MHD RFC, the changes were pretty
minor.

Hopefully will be able to tack on a concrete MHSLD following that..

~Gregory



Re: [virtio-dev] [RFC PATCH] docs/interop: define STANDALONE protocol feature for vhost-user

2023-07-20 Thread Stefan Hajnoczi
On Fri, Jul 07, 2023 at 12:27:39PM +0200, Stefano Garzarella wrote:
> On Tue, Jul 04, 2023 at 04:02:42PM +0100, Alex Bennée wrote:
> > 
> > Stefano Garzarella  writes:
> > 
> > > On Tue, Jul 04, 2023 at 01:36:00PM +0100, Alex Bennée wrote:
> > > > diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
> > > > index 5a070adbc1..85b1b1583a 100644
> > > > --- a/docs/interop/vhost-user.rst
> > > > +++ b/docs/interop/vhost-user.rst
> > > > @@ -275,6 +275,21 @@ Inflight description
> > > > 
> > > > :queue size: a 16-bit size of virtqueues
> > > > 
> > > > +Backend specifications
> > > > +^^^^^^^^^^^^^^^^^^^^^^
> > > > +
> > > > ++-----------+-------------+------------+------------+
> > > > +| device id | config size |   min_vqs  |   max_vqs  |
> > > > ++-----------+-------------+------------+------------+
> > > > +
> > > > +:device id: a 32-bit value holding the VirtIO device ID
> > > > +
> > > > +:config size: a 32-bit value holding the config size (see 
> > > > ``VHOST_USER_GET_CONFIG``)
> > > > +
> > > > +:min_vqs: a 32-bit value holding the minimum number of vqs supported
> > > 
> > > Why do we need the minimum?
> > 
> > We need to know the minimum number because some devices have fixed VQs
> > that must be present.
> 
> But does QEMU need to know this?
> 
> Or is it okay that the driver will then fail in the guest if there
> are not the right number of queues?

I don't understand why min_vqs is needed either. It's not the
front-end's job to ensure that the device will be used properly. A
spec-compliant driver will work with a spec-compliant device, so it's
not clear why the front-end needs this information.

Stefan


signature.asc
Description: PGP signature


Re: [RFC PATCH] docs/interop: define STANDALONE protocol feature for vhost-user

2023-07-20 Thread Stefan Hajnoczi
On Tue, Jul 04, 2023 at 01:36:00PM +0100, Alex Bennée wrote:
> diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> index c4e0cbd702..28b021d5d3 100644
> --- a/hw/virtio/vhost-user.c
> +++ b/hw/virtio/vhost-user.c
> @@ -202,6 +202,13 @@ typedef struct VhostUserInflight {
>  uint16_t queue_size;
>  } VhostUserInflight;
>  
> +typedef struct VhostUserBackendSpecs {
> +uint32_t device_id;
> +uint32_t config_size;
> +uint32_t min_vqs;

You already answered my question about min_vqs in another sub-thread.
I'll continue there. Please ignore my question.

Stefan


signature.asc
Description: PGP signature


Re: [RFC PATCH] docs/interop: define STANDALONE protocol feature for vhost-user

2023-07-20 Thread Stefan Hajnoczi
On Tue, Jul 04, 2023 at 01:36:00PM +0100, Alex Bennée wrote:
> Currently QEMU has to know some details about the back-end to be able
> to setup the guest. While various parts of the setup can be delegated
> to the backend (for example config handling) this is a very piecemeal
> approach.
> 
> This patch suggests a new feature flag (VHOST_USER_PROTOCOL_F_STANDALONE)
> which the back-end can advertise which allows a probe message to be
> sent to get all the details QEMU needs to know in one message.
> 
> Signed-off-by: Alex Bennée 
> 
> ---
> Initial RFC for discussion. I intend to prototype this work with QEMU
> and one of the rust-vmm vhost-user daemons.
> ---
>  docs/interop/vhost-user.rst | 37 +
>  hw/virtio/vhost-user.c  |  8 
>  2 files changed, 45 insertions(+)
> 
> diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
> index 5a070adbc1..85b1b1583a 100644
> --- a/docs/interop/vhost-user.rst
> +++ b/docs/interop/vhost-user.rst
> @@ -275,6 +275,21 @@ Inflight description
>  
>  :queue size: a 16-bit size of virtqueues
>  
> +Backend specifications
> +^^^^^^^^^^^^^^^^^^^^^^
> +
> ++-----------+-------------+------------+------------+
> +| device id | config size |   min_vqs  |   max_vqs  |
> ++-----------+-------------+------------+------------+
> +
> +:device id: a 32-bit value holding the VirtIO device ID
> +
> +:config size: a 32-bit value holding the config size (see 
> ``VHOST_USER_GET_CONFIG``)
> +
> +:min_vqs: a 32-bit value holding the minimum number of vqs supported

What is the purpose of min_vqs? I'm not sure why the front-end needs to
know this.


signature.asc
Description: PGP signature


Re: [PATCH v5 0/3] hw/ufs: Add Universal Flash Storage (UFS) support

2023-07-20 Thread Stefan Hajnoczi
Hi,
I'm ready to merge this but encountered a bug when testing:

  $ qemu-system-x86_64 --device ufs --device ufs-lu
  Segmentation fault (core dumped)

Please ensure there is an error message like with SCSI disks:

  $ qemu-system-x86_64 --device virtio-scsi-pci --device scsi-hd
  qemu-system-x86_64: --device scsi-hd: drive property not set
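
A check along these lines in the LU realize path would turn the crash into
a clean error (sketch only; the type and field names are my guess, not
copied from the series):

static void ufs_lu_realize(DeviceState *dev, Error **errp)
{
    UfsLu *lu = UFS_LU(dev);   /* hypothetical cast macro / state struct */

    if (!lu->conf.blk) {
        error_setg(errp, "drive property not set");
        return;
    }
    /* ... rest of realize ... */
}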

Thanks,
Stefan


signature.asc
Description: PGP signature


Re: [PATCH 0/3] Support message-based DMA in vfio-user server

2023-07-20 Thread Stefan Hajnoczi
On Tue, Jul 04, 2023 at 01:06:24AM -0700, Mattias Nissler wrote:
> This series adds basic support for message-based DMA in qemu's vfio-user
> server. This is useful for cases where the client does not provide file
> descriptors for accessing system memory via memory mappings. My motivating use
> case is to hook up device models as PCIe endpoints to a hardware design. This
> works by bridging the PCIe transaction layer to vfio-user, and the endpoint
> does not access memory directly, but sends memory requests TLPs to the 
> hardware
> design in order to perform DMA.
> 
> Note that in addition to the 3 commits included, we also need a
> subprojects/libvfio-user roll to bring in this bugfix:
> https://github.com/nutanix/libvfio-user/commit/bb308a2e8ee9486a4c8b53d8d773f7c8faaeba08
> Stefan, can I ask you to kindly update the
> https://gitlab.com/qemu-project/libvfio-user mirror? I'll be happy to include
> an update to subprojects/libvfio-user.wrap in this series.

Done:
https://gitlab.com/qemu-project/libvfio-user/-/commits/master

Repository mirroring is automated now, so new upstream commits will
appear in the QEMU mirror repository from now on.

> 
> Finally, there is some more work required on top of this series to get
> message-based DMA to really work well:
> 
> * libvfio-user has a long-standing issue where socket communication gets 
> messed
>   up when messages are sent from both ends at the same time. See
>   https://github.com/nutanix/libvfio-user/issues/279 for more details. I've
>   been engaging there and plan to contribute a fix.
> 
> * qemu currently breaks down DMA accesses into chunks of size 8 bytes at
>   maximum, each of which will be handled in a separate vfio-user DMA request
>   message. This is quite terrible for large DMA acceses, such as when nvme
>   reads and writes page-sized blocks for example. Thus, I would like to 
> improve
>   qemu to be able to perform larger accesses, at least for indirect memory
>   regions. I have something working locally, but since this will likely result
>   in more involved surgery and discussion, I am leaving this to be addressed 
> in
>   a separate patch.
> 
> Mattias Nissler (3):
>   softmmu: Support concurrent bounce buffers
>   softmmu: Remove DMA unmap notification callback
>   vfio-user: Message-based DMA support
> 
>  hw/remote/vfio-user-obj.c |  62 --
>  softmmu/dma-helpers.c |  28 
>  softmmu/physmem.c | 131 --
>  3 files changed, 83 insertions(+), 138 deletions(-)

Sorry for the late review. I was on vacation and am catching up on
emails.

Paolo worked on the QEMU memory API and can give input on how to make
this efficient for large DMA accesses. There is a chance that memory
dispatch with larger sizes will be needed for ENQCMD CPU instruction
emulation too.

Stefan


signature.asc
Description: PGP signature


Re: [PATCH 3/3] vfio-user: Message-based DMA support

2023-07-20 Thread Stefan Hajnoczi
On Tue, Jul 04, 2023 at 01:06:27AM -0700, Mattias Nissler wrote:
> Wire up support for DMA for the case where the vfio-user client does not
> provide mmap()-able file descriptors, but DMA requests must be performed
> via the VFIO-user protocol. This installs an indirect memory region,
> which already works for pci_dma_{read,write}, and pci_dma_map works
> thanks to the existing DMA bounce buffering support.
> 
> Note that while simple scenarios work with this patch, there's a known
> race condition in libvfio-user that will mess up the communication
> channel: https://github.com/nutanix/libvfio-user/issues/279 I intend to
> contribute a fix for this problem, see discussion on the github issue
> for more details.
> 
> Signed-off-by: Mattias Nissler 
> ---
>  hw/remote/vfio-user-obj.c | 62 ++-
>  1 file changed, 55 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
> index 8b10c32a3c..9799580c77 100644
> --- a/hw/remote/vfio-user-obj.c
> +++ b/hw/remote/vfio-user-obj.c
> @@ -300,6 +300,53 @@ static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, 
> char * const buf,
>  return count;
>  }
>  
> +static MemTxResult vfu_dma_read(void *opaque, hwaddr addr, uint64_t *val,
> +unsigned size, MemTxAttrs attrs)
> +{
> +MemoryRegion *region = opaque;
> +VfuObject *o = VFU_OBJECT(region->owner);
> +
> +dma_sg_t *sg = alloca(dma_sg_size());
> +vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
> +if (vfu_addr_to_sgl(o->vfu_ctx, vfu_addr, size, sg, 1, PROT_READ) < 0 ||
> +vfu_sgl_read(o->vfu_ctx, sg, 1, val) != 0) {

Does this work on big-endian host CPUs? It looks like reading 0x12345678
into uint64_t val would leave the bytes in the upper half of the value on a
big-endian host, i.e. *val = 0x1234567800000000 instead of 0x0000000012345678.

> +return MEMTX_ERROR;
> +}
> +
> +return MEMTX_OK;
> +}
> +
> +static MemTxResult vfu_dma_write(void *opaque, hwaddr addr, uint64_t val,
> + unsigned size, MemTxAttrs attrs)
> +{
> +MemoryRegion *region = opaque;
> +VfuObject *o = VFU_OBJECT(region->owner);
> +
> +dma_sg_t *sg = alloca(dma_sg_size());
> +vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
> +if (vfu_addr_to_sgl(o->vfu_ctx, vfu_addr, size, sg, 1, PROT_WRITE) < 0 ||
> +vfu_sgl_write(o->vfu_ctx, sg, 1, &val) != 0)  {

Same potential endianness issue here.
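
One way to avoid it on both paths is to go through a byte buffer and
convert with ldn_he_p()/stn_he_p() from qemu/bswap.h -- an untested sketch,
not a request for this exact code:

static MemTxResult vfu_dma_read(void *opaque, hwaddr addr, uint64_t *val,
                                unsigned size, MemTxAttrs attrs)
{
    MemoryRegion *region = opaque;
    VfuObject *o = VFU_OBJECT(region->owner);
    uint8_t buf[sizeof(uint64_t)];

    dma_sg_t *sg = alloca(dma_sg_size());
    vfu_dma_addr_t vfu_addr = (vfu_dma_addr_t)(region->addr + addr);
    if (vfu_addr_to_sgl(o->vfu_ctx, vfu_addr, size, sg, 1, PROT_READ) < 0 ||
        vfu_sgl_read(o->vfu_ctx, sg, 1, buf) != 0) {
        return MEMTX_ERROR;
    }

    /* Assemble the host-endian value from the raw bytes. */
    *val = ldn_he_p(buf, size);
    return MEMTX_OK;
}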

Stefan


signature.asc
Description: PGP signature


[RFC PATCH 11/12] vdpa: use SVQ to stall dataplane while NIC state is being restored

2023-07-20 Thread Eugenio Pérez
Some dynamic state of a virtio-net vDPA device is restored from CVQ in
the event of a live migration.  However, dataplane needs to be disabled
so the NIC does not receive buffers in the invalid ring.

As a default method to achieve it, let's offer a shadow vring with 0
avail idx.  As a fallback method, we will enable dataplane vqs later, as
proposed previously.

Signed-off-by: Eugenio Pérez 
---
 net/vhost-vdpa.c | 49 +++-
 1 file changed, 44 insertions(+), 5 deletions(-)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index af83de92f8..e14ae48f23 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -338,10 +338,25 @@ static int vhost_vdpa_net_data_start(NetClientState *nc)
 {
 VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
 struct vhost_vdpa *v = &s->vhost_vdpa;
+bool has_cvq = v->dev->vq_index_end % 2;
 
 assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
 
-if (s->always_svq ||
+if (has_cvq && (v->dev->features & VIRTIO_F_RING_RESET)) {
+/*
+ * Offer a fake vring to the device while the state is restored
+ * through CVQ.  That way, the guest will not see packets in unexpected
+ * queues.
+ *
+ * This will be undone after loading all state through CVQ, at
+ * vhost_vdpa_net_load.
+ *
+ * TODO: Future optimizations may skip some SVQ setup and teardown,
+ * like set the right kick and call fd or doorbell maps directly, and
+ * the iova tree.
+ */
+v->shadow_vqs_enabled = true;
+} else if (s->always_svq ||
 migration_is_setup_or_active(migrate_get_current()->state)) {
 v->shadow_vqs_enabled = true;
 v->shadow_data = true;
@@ -738,10 +753,34 @@ static int vhost_vdpa_net_load(NetClientState *nc)
 return r;
 }
 
-for (int i = 0; i < v->dev->vq_index; ++i) {
-r = vhost_vdpa_set_vring_ready(v, i);
-if (unlikely(r)) {
-return r;
+if (v->dev->features & VIRTIO_F_RING_RESET && !s->always_svq &&
+!migration_is_setup_or_active(migrate_get_current()->state)) {
+NICState *nic = qemu_get_nic(s->nc.peer);
+int queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
+
+for (int i = 0; i < queue_pairs; ++i) {
+NetClientState *ncs = qemu_get_peer(nic->ncs, i);
+VhostVDPAState *s_i = DO_UPCAST(VhostVDPAState, nc, ncs);
+
+for (int j = 0; j < 2; ++j) {
+vhost_net_virtqueue_reset(v->dev->vdev, ncs->peer, j);
+}
+
+s_i->vhost_vdpa.shadow_vqs_enabled = false;
+
+for (int j = 0; j < 2; ++j) {
+r = vhost_net_virtqueue_restart(v->dev->vdev, ncs->peer, j);
+if (unlikely(r < 0)) {
+return r;
+}
+}
+}
+} else {
+for (int i = 0; i < v->dev->vq_index; ++i) {
+r = vhost_vdpa_set_vring_ready(v, i);
+if (unlikely(r)) {
+return r;
+}
 }
 }
 
-- 
2.39.3




[RFC PATCH 09/12] vdpa: add vhost_vdpa_restart_queue

2023-07-20 Thread Eugenio Pérez
Split out the vq restart operation into its own function, as it may be
called on ring reset.

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-vdpa.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 7248072989..7b24fa3e12 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -562,6 +562,29 @@ static void vhost_vdpa_reset_queue(struct vhost_dev *dev, 
int idx)
 }
 }
 
+/* TODO: Properly reorder static functions */
+static bool vhost_vdpa_svq_start(struct vhost_dev *dev, unsigned i,
+ Error **errp);
+static int vhost_vdpa_restart_queue(struct vhost_dev *dev, int idx)
+{
+struct vhost_vdpa *v = dev->opaque;
+
+if (v->shadow_vqs_enabled) {
+Error *err = NULL;
+bool ok = vhost_vdpa_svq_start(dev, idx, &err);
+if (unlikely(!ok)) {
+error_report_err(err);
+return -1;
+}
+}
+
+if (dev->features & VIRTIO_F_RING_RESET) {
+return vhost_vdpa_set_vring_ready_internal(v, idx, true);
+}
+
+return 0;
+}
+
 /*
  * The use of this function is for requests that only need to be
  * applied once. Typically such request occurs at the beginning
@@ -1574,4 +1597,5 @@ const VhostOps vdpa_ops = {
 .vhost_set_config_call = vhost_vdpa_set_config_call,
 .vhost_reset_status = vhost_vdpa_reset_status,
 .vhost_reset_queue = vhost_vdpa_reset_queue,
+.vhost_restart_queue = vhost_vdpa_restart_queue,
 };
-- 
2.39.3




[RFC PATCH 12/12] vhost: Allow _F_RING_RESET with shadow virtqueue

2023-07-20 Thread Eugenio Pérez
This is needed to let CVQ restore at the beginning.

Not properly tested with an actual guest ring reset, only with the reset
from qemu. For this to be included, more testing is needed, but this series
is a good start for it.

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-shadow-virtqueue.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/virtio/vhost-shadow-virtqueue.c 
b/hw/virtio/vhost-shadow-virtqueue.c
index 1b1d85306c..6490a98db7 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -34,6 +34,7 @@ bool vhost_svq_valid_features(uint64_t features, Error **errp)
 switch (b) {
 case VIRTIO_F_ANY_LAYOUT:
 case VIRTIO_RING_F_EVENT_IDX:
+case VIRTIO_F_RING_RESET:
 continue;
 
 case VIRTIO_F_ACCESS_PLATFORM:
-- 
2.39.3




[RFC PATCH 00/12] Prefer to use SVQ to stall dataplane at NIC state restore through CVQ

2023-07-20 Thread Eugenio Pérez
At this moment the migration of net features that depend on CVQ is not
possible, as there is no reliable way to restore device state like the mac
address, number of enabled queues, etc. to the destination.  This is mainly
because the device must read only CVQ, and process all the commands, before
resuming the dataplane.

This series uses the VirtIO feature _F_RING_RESET to achieve it, adding an
alternative to the late vq enabling proposed in [1][2].  It exposes SVQ to
the device until all the CVQ messages have been processed, and then replaces
the vring with the guest's one.

As an advantage, it uses a feature well defined in the VirtIO standard.  As a
disadvantage, current HW already supports the late enabling and does not
support RING_RESET.

This patch must be applied on top of the series ("Enable vdpa net migration
with features depending on CVQ") [1][2].

The series has been tested with vp_vdpa, but using a high IOVA instead of a
separate ID for the shadow CVQ and the shadow temporary vrings.

[1] Message-id: <20230706191227.835526-1-epere...@redhat.com>
[2] https://lists.nongnu.org/archive/html/qemu-devel/2023-07/msg01325.html

Eugenio Pérez (12):
  vhost: add vhost_reset_queue_op
  vhost: add vhost_restart_queue_op
  vhost_net: Use ops->vhost_restart_queue in vhost_net_virtqueue_restart
  vhost_net: Use ops->vhost_reset_queue in vhost_net_virtqueue_reset
  vdpa: add vhost_vdpa_set_vring_ready_internal
  vdpa: add vhost_vdpa_svq_stop
  vdpa: add vhost_vdpa_reset_queue
  vdpa: add vhost_vdpa_svq_start
  vdpa: add vhost_vdpa_restart_queue
  vdpa: enable all vqs if the device support RING_RESET feature
  vdpa: use SVQ to stall dataplane while NIC state is being restored
  vhost: Allow _F_RING_RESET with shadow virtqueue

 include/hw/virtio/vhost-backend.h  |   6 ++
 hw/net/vhost_net.c |  16 ++--
 hw/virtio/vhost-shadow-virtqueue.c |   1 +
 hw/virtio/vhost-vdpa.c | 139 +
 net/vhost-vdpa.c   |  55 ++--
 hw/virtio/trace-events |   2 +-
 6 files changed, 171 insertions(+), 48 deletions(-)

-- 
2.39.3





[RFC PATCH 02/12] vhost: add vhost_restart_queue_op

2023-07-20 Thread Eugenio Pérez
So different vhost backends can perform custom actions at queue restart.

Signed-off-by: Eugenio Pérez 
---
 include/hw/virtio/vhost-backend.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/hw/virtio/vhost-backend.h 
b/include/hw/virtio/vhost-backend.h
index 37e05fa1f9..651402af10 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -135,6 +135,8 @@ typedef void (*vhost_reset_status_op)(struct vhost_dev 
*dev);
 
 typedef void (*vhost_reset_queue_op)(struct vhost_dev *dev, int idx);
 
+typedef int (*vhost_restart_queue_op)(struct vhost_dev *dev, int idx);
+
 typedef struct VhostOps {
 VhostBackendType backend_type;
 vhost_backend_init vhost_backend_init;
@@ -184,6 +186,7 @@ typedef struct VhostOps {
 vhost_set_config_call_op vhost_set_config_call;
 vhost_reset_status_op vhost_reset_status;
 vhost_reset_queue_op vhost_reset_queue;
+vhost_restart_queue_op vhost_restart_queue;
 } VhostOps;
 
 int vhost_backend_update_device_iotlb(struct vhost_dev *dev,
-- 
2.39.3




[RFC PATCH 06/12] vdpa: add vhost_vdpa_svq_stop

2023-07-20 Thread Eugenio Pérez
Splitting each SVQ stop into its own routine lets us reset a VQ
individually, and keeps the future vhost_vdpa_reset_queue symmetrical
with vhost_vdpa_restart_queue.

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-vdpa.c | 20 +---
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index e7ab69165c..6ae276ccde 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -1263,6 +1263,18 @@ err:
 return false;
 }
 
+static void vhost_vdpa_svq_stop(struct vhost_dev *dev, unsigned idx)
+{
+struct vhost_vdpa *v = dev->opaque;
+VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, idx);
+
+vhost_svq_stop(svq);
+vhost_vdpa_svq_unmap_rings(dev, svq);
+
+event_notifier_cleanup(&svq->hdev_kick);
+event_notifier_cleanup(&svq->hdev_call);
+}
+
 static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
 {
 struct vhost_vdpa *v = dev->opaque;
@@ -1272,13 +1284,7 @@ static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
 }
 
 for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
-VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
-
-vhost_svq_stop(svq);
-vhost_vdpa_svq_unmap_rings(dev, svq);
-
-event_notifier_cleanup(&svq->hdev_kick);
-event_notifier_cleanup(&svq->hdev_call);
+vhost_vdpa_svq_stop(dev, i);
 }
 }
 
-- 
2.39.3




[RFC PATCH 10/12] vdpa: enable all vqs if the device support RING_RESET feature

2023-07-20 Thread Eugenio Pérez
Prefer the ring reset approach over the late enable, as it is more
aligned with the standard.

Signed-off-by: Eugenio Pérez 
---
 net/vhost-vdpa.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 52415d7e0c..af83de92f8 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -840,8 +840,14 @@ static const VhostShadowVirtqueueOps 
vhost_vdpa_net_svq_ops = {
  */
 static bool vhost_vdpa_should_enable(const struct vhost_vdpa *v)
 {
+VhostVDPAState *s = container_of(v, VhostVDPAState, vhost_vdpa);
 struct vhost_dev *dev = v->dev;
 
+if (dev->features & VIRTIO_F_RING_RESET && !s->always_svq) {
+/* Preventing dataplane processing exposing fake SVQ vring */
+return true;
+}
+
 if (!dev->vq_index_end % 2) {
 /* vDPA device does not have CVQ */
 return true;
-- 
2.39.3




[RFC PATCH 07/12] vdpa: add vhost_vdpa_reset_queue

2023-07-20 Thread Eugenio Pérez
Split out the vq reset operation into its own function, as it may be
called on ring reset.

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-vdpa.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 6ae276ccde..df2515a247 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -547,6 +547,21 @@ int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, 
unsigned idx)
 return vhost_vdpa_set_vring_ready_internal(v, idx, true);
 }
 
+/* TODO: Properly reorder static functions */
+static void vhost_vdpa_svq_stop(struct vhost_dev *dev, unsigned idx);
+static void vhost_vdpa_reset_queue(struct vhost_dev *dev, int idx)
+{
+struct vhost_vdpa *v = dev->opaque;
+
+if (dev->features & VIRTIO_F_RING_RESET) {
+vhost_vdpa_set_vring_ready_internal(v, idx, false);
+}
+
+if (v->shadow_vqs_enabled) {
+vhost_vdpa_svq_stop(dev, idx - dev->vq_index);
+}
+}
+
 /*
  * The use of this function is for requests that only need to be
  * applied once. Typically such request occurs at the beginning
@@ -1543,4 +1558,5 @@ const VhostOps vdpa_ops = {
 .vhost_force_iommu = vhost_vdpa_force_iommu,
 .vhost_set_config_call = vhost_vdpa_set_config_call,
 .vhost_reset_status = vhost_vdpa_reset_status,
+.vhost_reset_queue = vhost_vdpa_reset_queue,
 };
-- 
2.39.3




[RFC PATCH 05/12] vdpa: add vhost_vdpa_set_vring_ready_internal

2023-07-20 Thread Eugenio Pérez
Resetting a virtqueue reuses this call with .num = 0, so let's make it
possible to use it that way.

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-vdpa.c | 12 +---
 hw/virtio/trace-events |  2 +-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index f51f5d9969..e7ab69165c 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -528,19 +528,25 @@ int vhost_vdpa_get_iova_range(int fd, struct 
vhost_vdpa_iova_range *iova_range)
 return ret < 0 ? -errno : 0;
 }
 
-int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
+static int vhost_vdpa_set_vring_ready_internal(struct vhost_vdpa *v,
+   unsigned idx, bool enable)
 {
 struct vhost_dev *dev = v->dev;
 struct vhost_vring_state state = {
 .index = idx,
-.num = 1,
+.num = enable,
 };
 int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
 
-trace_vhost_vdpa_set_vring_ready(dev, idx, r);
+trace_vhost_vdpa_set_vring_ready(dev, idx, enable, r);
 return r;
 }
 
+int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
+{
+return vhost_vdpa_set_vring_ready_internal(v, idx, true);
+}
+
 /*
  * The use of this function is for requests that only need to be
  * applied once. Typically such request occurs at the beginning
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 4f6a6ba428..9b90da73af 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -46,7 +46,7 @@ vhost_vdpa_set_features(void *dev, uint64_t features) "dev: 
%p features: 0x%"PRI
 vhost_vdpa_get_device_id(void *dev, uint32_t device_id) "dev: %p device_id 
%"PRIu32
 vhost_vdpa_reset_device(void *dev, uint8_t status) "dev: %p status: 0x%"PRIx8
 vhost_vdpa_get_vq_index(void *dev, int idx, int vq_idx) "dev: %p idx: %d vq 
idx: %d"
-vhost_vdpa_set_vring_ready(void *dev, unsigned i, int r) "dev: %p, idx: %u, r: 
%d"
+vhost_vdpa_set_vring_ready(void *dev, unsigned i, bool e, int r) "dev: %p, 
idx: %u, num: %d, r: %d"
 vhost_vdpa_dump_config(void *dev, const char *line) "dev: %p %s"
 vhost_vdpa_set_config(void *dev, uint32_t offset, uint32_t size, uint32_t 
flags) "dev: %p offset: %"PRIu32" size: %"PRIu32" flags: 0x%"PRIx32
 vhost_vdpa_get_config(void *dev, void *config, uint32_t config_len) "dev: %p 
config: %p config_len: %"PRIu32
-- 
2.39.3




[RFC PATCH 08/12] vdpa: add vhost_vdpa_svq_start

2023-07-20 Thread Eugenio Pérez
Splitting each SVQ start into its own initialization routine lets us
restart a VQ individually, and keeps the future vhost_vdpa_restart_queue
symmetrical with vhost_vdpa_reset_queue.

Signed-off-by: Eugenio Pérez 
---
 hw/virtio/vhost-vdpa.c | 67 ++
 1 file changed, 41 insertions(+), 26 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index df2515a247..7248072989 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -1223,6 +1223,46 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
 return r == 0;
 }
 
+static bool vhost_vdpa_svq_start(struct vhost_dev *dev, unsigned i,
+ Error **errp)
+{
+struct vhost_vdpa *v = dev->opaque;
+VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
+VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
+struct vhost_vring_addr addr = {
+.index = dev->vq_index + i,
+};
+int r;
+bool ok = vhost_vdpa_svq_setup(dev, svq, i, errp);
+if (unlikely(!ok)) {
+goto err;
+}
+
+vhost_svq_start(svq, dev->vdev, vq, v->iova_tree);
+ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, errp);
+if (unlikely(!ok)) {
+goto err_map;
+}
+
+/* Override vring GPA set by vhost subsystem */
+r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
+if (unlikely(r != 0)) {
+error_setg_errno(errp, -r, "Cannot set device address");
+goto err_set_addr;
+}
+
+return true;
+
+err_set_addr:
+vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
+
+err_map:
+vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
+
+err:
+return false;
+}
+
 static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
 {
 struct vhost_vdpa *v = dev->opaque;
@@ -1234,39 +1274,14 @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
 }
 
 for (i = 0; i < v->shadow_vqs->len; ++i) {
-VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
-VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
-struct vhost_vring_addr addr = {
-.index = dev->vq_index + i,
-};
-int r;
-bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
+bool ok = vhost_vdpa_svq_start(dev, i, &err);
 if (unlikely(!ok)) {
 goto err;
 }
-
-vhost_svq_start(svq, dev->vdev, vq, v->iova_tree);
-ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
-if (unlikely(!ok)) {
-goto err_map;
-}
-
-/* Override vring GPA set by vhost subsystem */
-r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
-if (unlikely(r != 0)) {
-error_setg_errno(&err, -r, "Cannot set device address");
-goto err_set_addr;
-}
 }
 
 return true;
 
-err_set_addr:
-vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
-
-err_map:
-vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
-
 err:
 error_reportf_err(err, "Cannot setup SVQ %u: ", i);
 for (unsigned j = 0; j < i; ++j) {
-- 
2.39.3




[RFC PATCH 04/12] vhost_net: Use ops->vhost_reset_queue in vhost_net_virtqueue_reset

2023-07-20 Thread Eugenio Pérez
Actually use vhost_reset_queue operation at queue reset.

Signed-off-by: Eugenio Pérez 
---
 hw/net/vhost_net.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 416b7d8132..5516b7a5aa 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -571,7 +571,9 @@ void vhost_net_virtqueue_reset(VirtIODevice *vdev, 
NetClientState *nc,
 
 idx = vhost_ops->vhost_get_vq_index(&net->dev, vq_index);
 
-if (net->nc->info->type == NET_CLIENT_DRIVER_TAP) {
+if (vhost_ops->vhost_reset_queue) {
+vhost_ops->vhost_reset_queue(&net->dev, idx);
+} else if (net->nc->info->type == NET_CLIENT_DRIVER_TAP) {
 file.index = idx;
 int r = vhost_net_set_backend(&net->dev, &file);
 assert(r >= 0);
-- 
2.39.3




[RFC PATCH 03/12] vhost_net: Use ops->vhost_restart_queue in vhost_net_virtqueue_restart

2023-07-20 Thread Eugenio Pérez
Actually use vhost_restart_queue operation at restart.

Signed-off-by: Eugenio Pérez 
---
 hw/net/vhost_net.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 6b958d6363..416b7d8132 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -608,14 +608,16 @@ int vhost_net_virtqueue_restart(VirtIODevice *vdev, 
NetClientState *nc,
 goto err_start;
 }
 
-if (net->nc->info->type == NET_CLIENT_DRIVER_TAP) {
+if (vhost_ops->vhost_restart_queue) {
+r = vhost_ops->vhost_restart_queue(&net->dev, idx);
+} else if (net->nc->info->type == NET_CLIENT_DRIVER_TAP) {
 file.index = idx;
 file.fd = net->backend;
 r = vhost_net_set_backend(&net->dev, &file);
-if (r < 0) {
-r = -errno;
-goto err_start;
-}
+}
+if (r < 0) {
+r = -errno;
+goto err_start;
 }
 
 return 0;
-- 
2.39.3




[RFC PATCH 01/12] vhost: add vhost_reset_queue_op

2023-07-20 Thread Eugenio Pérez
So different vhost backends can perform custom actions at queue reset.

Signed-off-by: Eugenio Pérez 
---
 include/hw/virtio/vhost-backend.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/hw/virtio/vhost-backend.h 
b/include/hw/virtio/vhost-backend.h
index 31a251a9f5..37e05fa1f9 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -133,6 +133,8 @@ typedef int (*vhost_set_config_call_op)(struct vhost_dev 
*dev,
 
 typedef void (*vhost_reset_status_op)(struct vhost_dev *dev);
 
+typedef void (*vhost_reset_queue_op)(struct vhost_dev *dev, int idx);
+
 typedef struct VhostOps {
 VhostBackendType backend_type;
 vhost_backend_init vhost_backend_init;
@@ -181,6 +183,7 @@ typedef struct VhostOps {
 vhost_force_iommu_op vhost_force_iommu;
 vhost_set_config_call_op vhost_set_config_call;
 vhost_reset_status_op vhost_reset_status;
+vhost_reset_queue_op vhost_reset_queue;
 } VhostOps;
 
 int vhost_backend_update_device_iotlb(struct vhost_dev *dev,
-- 
2.39.3




Re: [PATCH 1/3] softmmu: Support concurrent bounce buffers

2023-07-20 Thread Stefan Hajnoczi
On Tue, Jul 04, 2023 at 01:06:25AM -0700, Mattias Nissler wrote:
> +if (qatomic_dec_fetch(&bounce_buffers_in_use) == 1) {
> +cpu_notify_map_clients();
>  }

About my comment regarding removing this API: I see the next patch does
that.

Stefan


signature.asc
Description: PGP signature


Re: [PATCH 2/3] softmmu: Remove DMA unmap notification callback

2023-07-20 Thread Stefan Hajnoczi
On Tue, Jul 04, 2023 at 01:06:26AM -0700, Mattias Nissler wrote:
> According to old commit messages, this was introduced to retry a DMA
> operation at a later point in case the single bounce buffer is found to
> be busy. This was never used widely - only the dma-helpers code made use
> of it, but there are other device models that use multiple DMA mappings
> (concurrently) and just failed.
> 
> After the improvement to support multiple concurrent bounce buffers,
> the condition the notification callback allowed to work around no
> longer exists, so we can just remove the logic and simplify the code.
> 
> Signed-off-by: Mattias Nissler 
> ---
>  softmmu/dma-helpers.c | 28 -
>  softmmu/physmem.c | 71 ---
>  2 files changed, 99 deletions(-)

I'm not sure if it will be possible to remove this once a limit is
placed on bounce buffer space.

> 
> diff --git a/softmmu/dma-helpers.c b/softmmu/dma-helpers.c
> index 2463964805..d05d226f11 100644
> --- a/softmmu/dma-helpers.c
> +++ b/softmmu/dma-helpers.c
> @@ -68,23 +68,10 @@ typedef struct {
>  int sg_cur_index;
>  dma_addr_t sg_cur_byte;
>  QEMUIOVector iov;
> -QEMUBH *bh;
>  DMAIOFunc *io_func;
>  void *io_func_opaque;
>  } DMAAIOCB;
>  
> -static void dma_blk_cb(void *opaque, int ret);
> -
> -static void reschedule_dma(void *opaque)
> -{
> -DMAAIOCB *dbs = (DMAAIOCB *)opaque;
> -
> -assert(!dbs->acb && dbs->bh);
> -qemu_bh_delete(dbs->bh);
> -dbs->bh = NULL;
> -dma_blk_cb(dbs, 0);
> -}
> -
>  static void dma_blk_unmap(DMAAIOCB *dbs)
>  {
>  int i;
> @@ -101,7 +88,6 @@ static void dma_complete(DMAAIOCB *dbs, int ret)
>  {
>  trace_dma_complete(dbs, ret, dbs->common.cb);
>  
> -assert(!dbs->acb && !dbs->bh);
>  dma_blk_unmap(dbs);
>  if (dbs->common.cb) {
>  dbs->common.cb(dbs->common.opaque, ret);
> @@ -164,13 +150,6 @@ static void dma_blk_cb(void *opaque, int ret)
>  }
>  }
>  
> -if (dbs->iov.size == 0) {
> -trace_dma_map_wait(dbs);
> -dbs->bh = aio_bh_new(ctx, reschedule_dma, dbs);
> -cpu_register_map_client(dbs->bh);
> -goto out;
> -}
> -
>  if (!QEMU_IS_ALIGNED(dbs->iov.size, dbs->align)) {
>  qemu_iovec_discard_back(&dbs->iov,
>  QEMU_ALIGN_DOWN(dbs->iov.size, dbs->align));
> @@ -189,18 +168,12 @@ static void dma_aio_cancel(BlockAIOCB *acb)
>  
>  trace_dma_aio_cancel(dbs);
>  
> -assert(!(dbs->acb && dbs->bh));
>  if (dbs->acb) {
>  /* This will invoke dma_blk_cb.  */
>  blk_aio_cancel_async(dbs->acb);
>  return;
>  }
>  
> -if (dbs->bh) {
> -cpu_unregister_map_client(dbs->bh);
> -qemu_bh_delete(dbs->bh);
> -dbs->bh = NULL;
> -}
>  if (dbs->common.cb) {
>  dbs->common.cb(dbs->common.opaque, -ECANCELED);
>  }
> @@ -239,7 +212,6 @@ BlockAIOCB *dma_blk_io(AioContext *ctx,
>  dbs->dir = dir;
>  dbs->io_func = io_func;
>  dbs->io_func_opaque = io_func_opaque;
> -dbs->bh = NULL;
>  qemu_iovec_init(>iov, sg->nsg);
>  dma_blk_cb(dbs, 0);
>  return >common;
> diff --git a/softmmu/physmem.c b/softmmu/physmem.c
> index 56130b5a1d..2b4123c127 100644
> --- a/softmmu/physmem.c
> +++ b/softmmu/physmem.c
> @@ -2908,49 +2908,6 @@ typedef struct {
>  uint8_t buffer[];
>  } BounceBuffer;
>  
> -static size_t bounce_buffers_in_use;
> -
> -typedef struct MapClient {
> -QEMUBH *bh;
> -QLIST_ENTRY(MapClient) link;
> -} MapClient;
> -
> -QemuMutex map_client_list_lock;
> -static QLIST_HEAD(, MapClient) map_client_list
> -= QLIST_HEAD_INITIALIZER(map_client_list);
> -
> -static void cpu_unregister_map_client_do(MapClient *client)
> -{
> -QLIST_REMOVE(client, link);
> -g_free(client);
> -}
> -
> -static void cpu_notify_map_clients_locked(void)
> -{
> -MapClient *client;
> -
> -while (!QLIST_EMPTY(&map_client_list)) {
> -client = QLIST_FIRST(&map_client_list);
> -qemu_bh_schedule(client->bh);
> -cpu_unregister_map_client_do(client);
> -}
> -}
> -
> -void cpu_register_map_client(QEMUBH *bh)
> -{
> -MapClient *client = g_malloc(sizeof(*client));
> -
> -qemu_mutex_lock(&map_client_list_lock);
> -client->bh = bh;
> -QLIST_INSERT_HEAD(_client_list, client, link);
> -/* Write map_client_list before reading in_use.  */
> -smp_mb();
> -if (qatomic_read(&bounce_buffers_in_use)) {
> -cpu_notify_map_clients_locked();
> -}
> -qemu_mutex_unlock(&map_client_list_lock);
> -}
> -
>  void cpu_exec_init_all(void)
>  {
>  qemu_mutex_init(&ram_list.mutex);
> @@ -2964,28 +2921,6 @@ void cpu_exec_init_all(void)
>  finalize_target_page_bits();
>  io_mem_init();
>  memory_map_init();
> -qemu_mutex_init(&map_client_list_lock);
> -}
> -
> -void cpu_unregister_map_client(QEMUBH *bh)
> -{
> -MapClient *client;
> -
> -qemu_mutex_lock(&map_client_list_lock);
> -QLIST_FOREACH(client, 

Re: [PATCH 1/3] softmmu: Support concurrent bounce buffers

2023-07-20 Thread Stefan Hajnoczi
On Tue, Jul 04, 2023 at 01:06:25AM -0700, Mattias Nissler wrote:
> It is not uncommon for device models to request mapping of several DMA
> regions at the same time. An example is igb (and probably other net
> devices as well) when a packet is spread across multiple descriptors.
> 
> In order to support this when indirect DMA is used, as is the case when
> running the device model in a vfio-server process without mmap()-ed DMA,
> this change allocates DMA bounce buffers dynamically instead of
> supporting only a single buffer.
> 
> Signed-off-by: Mattias Nissler 
> ---
>  softmmu/physmem.c | 74 ++-
>  1 file changed, 35 insertions(+), 39 deletions(-)

Is this a functional change or purely a performance optimization? If
it's a performance optimization, please include benchmark results to
justify this change.

QEMU memory allocations must be bounded so that an untrusted guest
cannot cause QEMU to exhaust host memory. There must be a limit to the
amount of bounce buffer memory.
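
Something along these lines would do it (untested sketch; the limit value
and the helper name are made up, not taken from the patch):

#define MAX_BOUNCE_BUFFER_BYTES (64 * 1024)

static size_t bounce_buffer_bytes_in_use;

static BounceBuffer *bounce_buffer_alloc(MemoryRegion *mr, hwaddr addr, size_t len)
{
    size_t old = qatomic_fetch_add(&bounce_buffer_bytes_in_use, len);

    if (old + len > MAX_BOUNCE_BUFFER_BYTES) {
        qatomic_sub(&bounce_buffer_bytes_in_use, len);
        return NULL; /* address_space_map() then returns *plen = 0 as before */
    }

    BounceBuffer *bounce = g_malloc(sizeof(*bounce) + len);
    bounce->mr = mr;
    bounce->addr = addr;
    return bounce;
}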

> diff --git a/softmmu/physmem.c b/softmmu/physmem.c
> index bda475a719..56130b5a1d 100644
> --- a/softmmu/physmem.c
> +++ b/softmmu/physmem.c
> @@ -2904,13 +2904,11 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len)
>  
>  typedef struct {
>  MemoryRegion *mr;
> -void *buffer;
>  hwaddr addr;
> -hwaddr len;
> -bool in_use;
> +uint8_t buffer[];
>  } BounceBuffer;
>  
> -static BounceBuffer bounce;
> +static size_t bounce_buffers_in_use;
>  
>  typedef struct MapClient {
>  QEMUBH *bh;
> @@ -2947,7 +2945,7 @@ void cpu_register_map_client(QEMUBH *bh)
>  QLIST_INSERT_HEAD(&map_client_list, client, link);
>  /* Write map_client_list before reading in_use.  */
>  smp_mb();
> -if (!qatomic_read(&bounce.in_use)) {
> +if (qatomic_read(&bounce_buffers_in_use)) {
>  cpu_notify_map_clients_locked();
>  }
>  qemu_mutex_unlock(&map_client_list_lock);
> @@ -3076,31 +3074,24 @@ void *address_space_map(AddressSpace *as,
>  RCU_READ_LOCK_GUARD();
>  fv = address_space_to_flatview(as);
>  mr = flatview_translate(fv, addr, , , is_write, attrs);
> +memory_region_ref(mr);
>  
>  if (!memory_access_is_direct(mr, is_write)) {
> -if (qatomic_xchg(&bounce.in_use, true)) {
> -*plen = 0;
> -return NULL;
> -}
> -/* Avoid unbounded allocations */
> -l = MIN(l, TARGET_PAGE_SIZE);
> -bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
> -bounce.addr = addr;
> -bounce.len = l;
> -
> -memory_region_ref(mr);
> -bounce.mr = mr;
> +qatomic_inc_fetch(&bounce_buffers_in_use);
> +
> +BounceBuffer *bounce = g_malloc(l + sizeof(BounceBuffer));
> +bounce->addr = addr;
> +bounce->mr = mr;
> +
>  if (!is_write) {
>  flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
> -   bounce.buffer, l);
> +  bounce->buffer, l);
>  }
>  
>  *plen = l;
> -return bounce.buffer;
> +return bounce->buffer;

Bounce buffer allocation always succeeds now. Can the
cpu_notify_map_clients*() be removed now that no one is waiting for
bounce buffers anymore?

>  }
>  
> -
> -memory_region_ref(mr);
>  *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
>  l, is_write, attrs);
>  fuzz_dma_read_cb(addr, *plen, mr);
> @@ -3114,31 +3105,36 @@ void *address_space_map(AddressSpace *as,
>  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
>   bool is_write, hwaddr access_len)
>  {
> -if (buffer != bounce.buffer) {
> -MemoryRegion *mr;
> -ram_addr_t addr1;
> +MemoryRegion *mr;
> +ram_addr_t addr1;
> +
> +mr = memory_region_from_host(buffer, );
> +if (mr == NULL) {
> +/*
> + * Must be a bounce buffer (unless the caller passed a pointer which
> + * wasn't returned by address_space_map, which is illegal).
> + */
> +BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer);
>  
> -mr = memory_region_from_host(buffer, );
> -assert(mr != NULL);
>  if (is_write) {
> -invalidate_and_set_dirty(mr, addr1, access_len);
> +address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED,
> +bounce->buffer, access_len);
>  }
> -if (xen_enabled()) {
> -xen_invalidate_map_cache_entry(buffer);
> +memory_region_unref(bounce->mr);
> +g_free(bounce);
> +
> +if (qatomic_dec_fetch(&bounce_buffers_in_use) == 1) {
> +cpu_notify_map_clients();
>  }
> -memory_region_unref(mr);
>  return;
>  }
> +
> +if (xen_enabled()) {
> +xen_invalidate_map_cache_entry(buffer);
> +}
>  if (is_write) {
> -address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,

Re: [PATCH v2 08/11] hw/loongarch/virt: add plug handler for TPM on SysBus

2023-07-20 Thread Stefan Berger




On 7/14/23 03:09, Joelle van Dyne wrote:

TPM needs to know its own base address in order to generate its DSDT
device entry.

Signed-off-by: Joelle van Dyne 



It would be great to also cover the crb-device with tests:

from tests/qtest/meson.build:

  (config_all.has_key('CONFIG_TCG') and config_all_devices.has_key('CONFIG_TPM_TIS_SYSBUS') ? \
    ['tpm-tis-device-test', 'tpm-tis-device-swtpm-test'] : []) + \

It should be easy to make a copy of these two tis-device tests and rename them 
to crb-device tests, adapt them, and run them at least on aarch64.

Regards,
   Stefan



[PATCH] target/loongarch: Fix the CSRRD CPUID instruction on big endian hosts

2023-07-20 Thread Thomas Huth
The test in tests/avocado/machine_loongarch.py is currently failing
on big endian hosts like s390x. By comparing the traces between running
the QEMU_EFI.fd bios on a s390x and on a x86 host, it's quickly obvious
that the CSRRD instruction for the CPUID is behaving differently. And
indeed: The code currently does a long read (i.e. 64 bit) from the
address that points to the CPUState->cpu_index field (with tcg_gen_ld_tl()
in the trans_csrrd() function). But this cpu_index field is only an "int"
(i.e. 32 bit). While this dirty pointer magic works on little endian hosts,
it of course fails on big endian hosts. Fix it by using a proper helper
function instead.

Signed-off-by: Thomas Huth 
---
 Note: There is another bug preventing tests/avocado/machine_loongarch.py
 from working correctly on big endian hosts (Linux fails to run the shell)
 but that problem is harder to debug since it happens way later in the boot
 process...
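
 To illustrate the difference (not part of the patch): a 64-bit load from
 the address of a 32-bit field yields different values depending on host
 byte order:

 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>

 /* Illustration only: why the old 64-bit load of cpu_index is endian-fragile. */
 int main(void)
 {
     struct { int cpu_index; int pad; } s = { 42, 0 };
     uint64_t v;

     memcpy(&v, &s.cpu_index, sizeof(v));
     /* little-endian host: prints 42; big-endian host: prints 42 << 32 */
     printf("0x%016llx\n", (unsigned long long)v);
     return 0;
 }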

 target/loongarch/cpu.h | 1 +
 target/loongarch/helper.h  | 1 +
 target/loongarch/csr_helper.c  | 9 +
 target/loongarch/insn_trans/trans_privileged.c.inc | 8 +---
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/target/loongarch/cpu.h b/target/loongarch/cpu.h
index ed04027af1..fa371ca8ba 100644
--- a/target/loongarch/cpu.h
+++ b/target/loongarch/cpu.h
@@ -342,6 +342,7 @@ typedef struct CPUArchState {
 uint64_t CSR_DBG;
 uint64_t CSR_DERA;
 uint64_t CSR_DSAVE;
+uint64_t CSR_CPUID;
 
 #ifndef CONFIG_USER_ONLY
 LoongArchTLB  tlb[LOONGARCH_TLB_MAX];
diff --git a/target/loongarch/helper.h b/target/loongarch/helper.h
index b9de77d926..ffb1e0b0bf 100644
--- a/target/loongarch/helper.h
+++ b/target/loongarch/helper.h
@@ -98,6 +98,7 @@ DEF_HELPER_1(rdtime_d, i64, env)
 #ifndef CONFIG_USER_ONLY
 /* CSRs helper */
 DEF_HELPER_1(csrrd_pgd, i64, env)
+DEF_HELPER_1(csrrd_cpuid, i64, env)
 DEF_HELPER_1(csrrd_tval, i64, env)
 DEF_HELPER_2(csrwr_estat, i64, env, tl)
 DEF_HELPER_2(csrwr_asid, i64, env, tl)
diff --git a/target/loongarch/csr_helper.c b/target/loongarch/csr_helper.c
index 6526367946..55341551a5 100644
--- a/target/loongarch/csr_helper.c
+++ b/target/loongarch/csr_helper.c
@@ -35,6 +35,15 @@ target_ulong helper_csrrd_pgd(CPULoongArchState *env)
 return v;
 }
 
+target_ulong helper_csrrd_cpuid(CPULoongArchState *env)
+{
+LoongArchCPU *lac = env_archcpu(env);
+
+env->CSR_CPUID = CPU(lac)->cpu_index;
+
+return env->CSR_CPUID;
+}
+
 target_ulong helper_csrrd_tval(CPULoongArchState *env)
 {
 LoongArchCPU *cpu = env_archcpu(env);
diff --git a/target/loongarch/insn_trans/trans_privileged.c.inc 
b/target/loongarch/insn_trans/trans_privileged.c.inc
index 02bca7ca23..9c9de090f0 100644
--- a/target/loongarch/insn_trans/trans_privileged.c.inc
+++ b/target/loongarch/insn_trans/trans_privileged.c.inc
@@ -99,13 +99,7 @@ static const CSRInfo csr_info[] = {
 CSR_OFF(PWCH),
 CSR_OFF(STLBPS),
 CSR_OFF(RVACFG),
-[LOONGARCH_CSR_CPUID] = {
-.offset = (int)offsetof(CPUState, cpu_index)
-  - (int)offsetof(LoongArchCPU, env),
-.flags = CSRFL_READONLY,
-.readfn = NULL,
-.writefn = NULL
-},
+CSR_OFF_FUNCS(CPUID, CSRFL_READONLY, gen_helper_csrrd_cpuid, NULL),
 CSR_OFF_FLAGS(PRCFG1, CSRFL_READONLY),
 CSR_OFF_FLAGS(PRCFG2, CSRFL_READONLY),
 CSR_OFF_FLAGS(PRCFG3, CSRFL_READONLY),
-- 
2.39.3




[PATCH for-8.2 v5 08/11] target/riscv/cpu.c: add ADD_UNAVAIL_KVM_PROP_ARRAY() macro

2023-07-20 Thread Daniel Henrique Barboza
Use a macro in riscv_cpu_add_kvm_properties() to eliminate some of its
code repetition, similar to what we're already doing with
ADD_CPU_QDEV_PROPERTIES_ARRAY().

Signed-off-by: Daniel Henrique Barboza 
---
 target/riscv/cpu.c | 22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 23169a606f..8675839cb4 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1900,6 +1900,13 @@ static void riscv_cpu_add_kvm_unavail_prop(Object *obj, 
const char *prop_name)
 NULL, (void *)prop_name);
 }
 
+#define ADD_UNAVAIL_KVM_PROP_ARRAY(_obj, _array) \
+do { \
+for (int i = 0; i < ARRAY_SIZE(_array); i++) { \
+riscv_cpu_add_kvm_unavail_prop(_obj, _array[i].name); \
+} \
+} while (0)
+
 static void riscv_cpu_add_kvm_properties(Object *obj)
 {
 DeviceState *dev = DEVICE(obj);
@@ -1907,18 +1914,9 @@ static void riscv_cpu_add_kvm_properties(Object *obj)
 kvm_riscv_init_user_properties(obj);
 riscv_cpu_add_misa_properties(obj);
 
-for (int i = 0; i < ARRAY_SIZE(riscv_cpu_extensions); i++) {
-riscv_cpu_add_kvm_unavail_prop(obj, riscv_cpu_extensions[i].name);
-}
-
-for (int i = 0; i < ARRAY_SIZE(riscv_cpu_vendor_exts); i++) {
-riscv_cpu_add_kvm_unavail_prop(obj, riscv_cpu_vendor_exts[i].name);
-}
-
-for (int i = 0; i < ARRAY_SIZE(riscv_cpu_experimental_exts); i++) {
-riscv_cpu_add_kvm_unavail_prop(obj,
-   riscv_cpu_experimental_exts[i].name);
-}
+ADD_UNAVAIL_KVM_PROP_ARRAY(obj, riscv_cpu_extensions);
+ADD_UNAVAIL_KVM_PROP_ARRAY(obj, riscv_cpu_vendor_exts);
+ADD_UNAVAIL_KVM_PROP_ARRAY(obj, riscv_cpu_experimental_exts);
 
 for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {
 /* Check if KVM created the property already */
-- 
2.41.0




[PATCH for-8.2 v5 09/11] target/riscv: add 'max' CPU type

2023-07-20 Thread Daniel Henrique Barboza
The 'max' CPU type is used by tooling to determine what's the most
capable CPU a current QEMU version implements. Other archs such as ARM
implement this type. Let's add it to RISC-V.

What we consider the "most capable CPU" in this context relates to
ratified, non-vendor extensions. This means that we want the 'max' CPU
to enable all (possible) ratified extensions by default. The reasoning
behind this design is (1) vendor extensions can conflict with each other
and we won't play favorites deciding which one is default or not, and
(2) non-ratified extensions are always prone to changes, not being
stable enough to be enabled by default.

All this said, we're still not able to enable all ratified extensions
due to conflicts between them. Zfinx and all its dependencies aren't
enabled because of a conflict with RVF. zce, zcmp and zcmt are also
disabled due to RVD conflicts. When running with 64 bits we're also
disabling zcf.

MISA bits RVG, RVJ and RVV are also being set manually since they're
default disabled.

This is the resulting 'riscv,isa' DT for this new CPU:

rv64imafdcvh_zicbom_zicboz_zicsr_zifencei_zihintpause_zawrs_zfa_
zfh_zfhmin_zca_zcb_zcd_zba_zbb_zbc_zbkb_zbkc_zbkx_zbs_zk_zkn_zknd_
zkne_zknh_zkr_zks_zksed_zksh_zkt_zve32f_zve64f_zve64d_
smstateen_sscofpmf_sstc_svadu_svinval_svnapot_svpbmt

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Weiwei Li 
---
 target/riscv/cpu-qom.h |  1 +
 target/riscv/cpu.c | 53 ++
 2 files changed, 54 insertions(+)

diff --git a/target/riscv/cpu-qom.h b/target/riscv/cpu-qom.h
index 04af50983e..f3fbe37a2c 100644
--- a/target/riscv/cpu-qom.h
+++ b/target/riscv/cpu-qom.h
@@ -30,6 +30,7 @@
 #define CPU_RESOLVING_TYPE TYPE_RISCV_CPU
 
 #define TYPE_RISCV_CPU_ANY  RISCV_CPU_TYPE_NAME("any")
+#define TYPE_RISCV_CPU_MAX  RISCV_CPU_TYPE_NAME("max")
 #define TYPE_RISCV_CPU_BASE32   RISCV_CPU_TYPE_NAME("rv32")
 #define TYPE_RISCV_CPU_BASE64   RISCV_CPU_TYPE_NAME("rv64")
 #define TYPE_RISCV_CPU_BASE128  RISCV_CPU_TYPE_NAME("x-rv128")
diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 8675839cb4..0221bfcbef 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -248,6 +248,7 @@ static const char * const riscv_intr_names[] = {
 };
 
 static void riscv_cpu_add_user_properties(Object *obj);
+static void riscv_init_max_cpu_extensions(Object *obj);
 
 const char *riscv_cpu_get_trap_name(target_ulong cause, bool async)
 {
@@ -374,6 +375,25 @@ static void riscv_any_cpu_init(Object *obj)
 cpu->cfg.pmp = true;
 }
 
+static void riscv_max_cpu_init(Object *obj)
+{
+RISCVCPU *cpu = RISCV_CPU(obj);
+CPURISCVState *env = &cpu->env;
+RISCVMXL mlx = MXL_RV64;
+
+#ifdef TARGET_RISCV32
+mlx = MXL_RV32;
+#endif
+set_misa(env, mlx, 0);
+riscv_cpu_add_user_properties(obj);
+riscv_init_max_cpu_extensions(obj);
+env->priv_ver = PRIV_VERSION_LATEST;
+#ifndef CONFIG_USER_ONLY
+set_satp_mode_max_supported(RISCV_CPU(obj), mlx == MXL_RV32 ?
+VM_1_10_SV32 : VM_1_10_SV57);
+#endif
+}
+
 #if defined(TARGET_RISCV64)
 static void rv64_base_cpu_init(Object *obj)
 {
@@ -1955,6 +1975,38 @@ static void riscv_cpu_add_user_properties(Object *obj)
 ADD_CPU_QDEV_PROPERTIES_ARRAY(dev, riscv_cpu_experimental_exts);
 }
 
+/*
+ * The 'max' type CPU will have all possible ratified
+ * non-vendor extensions enabled.
+ */
+static void riscv_init_max_cpu_extensions(Object *obj)
+{
+RISCVCPU *cpu = RISCV_CPU(obj);
+CPURISCVState *env = &cpu->env;
+
+/* Enable RVG, RVJ and RVV that are disabled by default */
+set_misa(env, env->misa_mxl, env->misa_ext | RVG | RVJ | RVV);
+
+for (int i = 0; i < ARRAY_SIZE(riscv_cpu_extensions); i++) {
+object_property_set_bool(obj, riscv_cpu_extensions[i].name,
+ true, NULL);
+}
+
+/* Zfinx is not compatible with F. Disable it */
+object_property_set_bool(obj, "zfinx", false, NULL);
+object_property_set_bool(obj, "zdinx", false, NULL);
+object_property_set_bool(obj, "zhinx", false, NULL);
+object_property_set_bool(obj, "zhinxmin", false, NULL);
+
+object_property_set_bool(obj, "zce", false, NULL);
+object_property_set_bool(obj, "zcmp", false, NULL);
+object_property_set_bool(obj, "zcmt", false, NULL);
+
+if (env->misa_mxl != MXL_RV32) {
+object_property_set_bool(obj, "zcf", false, NULL);
+}
+}
+
 static Property riscv_cpu_properties[] = {
 DEFINE_PROP_BOOL("debug", RISCVCPU, cfg.debug, true),
 
@@ -2293,6 +2345,7 @@ static const TypeInfo riscv_cpu_type_infos[] = {
 .abstract = true,
 },
 DEFINE_DYNAMIC_CPU(TYPE_RISCV_CPU_ANY,  riscv_any_cpu_init),
+DEFINE_DYNAMIC_CPU(TYPE_RISCV_CPU_MAX,  riscv_max_cpu_init),
 #if defined(CONFIG_KVM)
 DEFINE_CPU(TYPE_RISCV_CPU_HOST, riscv_host_cpu_init),
 #endif
-- 
2.41.0




[PATCH for-8.2 v5 10/11] avocado, risc-v: add opensbi tests for 'max' CPU

2023-07-20 Thread Daniel Henrique Barboza
Add smoke tests to ensure that we'll not break the 'max' CPU type when
adding new ratified extensions to be enabled.

Signed-off-by: Daniel Henrique Barboza 
---
 tests/avocado/riscv_opensbi.py | 16 
 1 file changed, 16 insertions(+)

diff --git a/tests/avocado/riscv_opensbi.py b/tests/avocado/riscv_opensbi.py
index bfff9cc3c3..15fd57fe51 100644
--- a/tests/avocado/riscv_opensbi.py
+++ b/tests/avocado/riscv_opensbi.py
@@ -61,3 +61,19 @@ def test_riscv64_virt(self):
 :avocado: tags=machine:virt
 """
 self.boot_opensbi()
+
+def test_riscv32_virt_maxcpu(self):
+"""
+:avocado: tags=arch:riscv32
+:avocado: tags=machine:virt
+:avocado: tags=cpu:max
+"""
+self.boot_opensbi()
+
+def test_riscv64_virt_maxcpu(self):
+"""
+:avocado: tags=arch:riscv64
+:avocado: tags=machine:virt
+:avocado: tags=cpu:max
+"""
+self.boot_opensbi()
-- 
2.41.0




[PATCH for-8.2 v5 07/11] target/riscv/cpu.c: add ADD_CPU_QDEV_PROPERTIES_ARRAY() macro

2023-07-20 Thread Daniel Henrique Barboza
The code inside riscv_cpu_add_user_properties() became quite repetitive
after recent changes. Add a macro to hide the repetition away.

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Weiwei Li 
---
 target/riscv/cpu.c | 26 +++---
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 7c6060ffa3..23169a606f 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1875,6 +1875,13 @@ static void cpu_set_cfg_unavailable(Object *obj, Visitor 
*v,
 }
 #endif
 
+#define ADD_CPU_QDEV_PROPERTIES_ARRAY(_dev, _array) \
+do { \
+for (int i = 0; i < ARRAY_SIZE(_array); i++) { \
+qdev_property_add_static(_dev, &_array[i]); \
+} \
+} while (0)
+
 #ifndef CONFIG_USER_ONLY
 static void riscv_cpu_add_kvm_unavail_prop(Object *obj, const char *prop_name)
 {
@@ -1944,21 +1951,10 @@ static void riscv_cpu_add_user_properties(Object *obj)
 
 riscv_cpu_add_misa_properties(obj);
 
-for (int i = 0; i < ARRAY_SIZE(riscv_cpu_extensions); i++) {
-qdev_property_add_static(dev, &riscv_cpu_extensions[i]);
-}
-
-for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {
-qdev_property_add_static(dev, &riscv_cpu_options[i]);
-}
-
-for (int i = 0; i < ARRAY_SIZE(riscv_cpu_vendor_exts); i++) {
-qdev_property_add_static(dev, &riscv_cpu_vendor_exts[i]);
-}
-
-for (int i = 0; i < ARRAY_SIZE(riscv_cpu_experimental_exts); i++) {
-qdev_property_add_static(dev, &riscv_cpu_experimental_exts[i]);
-}
+ADD_CPU_QDEV_PROPERTIES_ARRAY(dev, riscv_cpu_extensions);
+ADD_CPU_QDEV_PROPERTIES_ARRAY(dev, riscv_cpu_options);
+ADD_CPU_QDEV_PROPERTIES_ARRAY(dev, riscv_cpu_vendor_exts);
+ADD_CPU_QDEV_PROPERTIES_ARRAY(dev, riscv_cpu_experimental_exts);
 }
 
 static Property riscv_cpu_properties[] = {
-- 
2.41.0




[PATCH for-8.2 v5 04/11] target/riscv/cpu.c: del DEFINE_PROP_END_OF_LIST() from riscv_cpu_extensions

2023-07-20 Thread Daniel Henrique Barboza
This last blank element is used by the 'for' loop to check if a property
has a valid name.

Remove it and use ARRAY_SIZE() instead, like riscv_cpu_options already
does. All future arrays will do the same, and we'll be able to
encapsulate more repetitions in macros later on.

Signed-off-by: Daniel Henrique Barboza 
---
 target/riscv/cpu.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 7f0852a14e..4dadb7f0a0 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1835,8 +1835,6 @@ static Property riscv_cpu_extensions[] = {
 DEFINE_PROP_BOOL("x-zfbfmin", RISCVCPU, cfg.ext_zfbfmin, false),
 DEFINE_PROP_BOOL("x-zvfbfmin", RISCVCPU, cfg.ext_zvfbfmin, false),
 DEFINE_PROP_BOOL("x-zvfbfwma", RISCVCPU, cfg.ext_zvfbfwma, false),
-
-DEFINE_PROP_END_OF_LIST(),
 };
 
 static Property riscv_cpu_options[] = {
@@ -1894,14 +1892,13 @@ static void riscv_cpu_add_kvm_unavail_prop(Object *obj, 
const char *prop_name)
 
 static void riscv_cpu_add_kvm_properties(Object *obj)
 {
-Property *prop;
 DeviceState *dev = DEVICE(obj);
 
 kvm_riscv_init_user_properties(obj);
 riscv_cpu_add_misa_properties(obj);
 
-for (prop = riscv_cpu_extensions; prop && prop->name; prop++) {
-riscv_cpu_add_kvm_unavail_prop(obj, prop->name);
+for (int i = 0; i < ARRAY_SIZE(riscv_cpu_extensions); i++) {
+riscv_cpu_add_kvm_unavail_prop(obj, riscv_cpu_extensions[i].name);
 }
 
 for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {
@@ -1922,7 +1919,6 @@ static void riscv_cpu_add_kvm_properties(Object *obj)
  */
 static void riscv_cpu_add_user_properties(Object *obj)
 {
-Property *prop;
 DeviceState *dev = DEVICE(obj);
 
 #ifndef CONFIG_USER_ONLY
@@ -1936,8 +1932,8 @@ static void riscv_cpu_add_user_properties(Object *obj)
 
 riscv_cpu_add_misa_properties(obj);
 
-for (prop = riscv_cpu_extensions; prop && prop->name; prop++) {
-qdev_property_add_static(dev, prop);
+for (int i = 0; i < ARRAY_SIZE(riscv_cpu_extensions); i++) {
+qdev_property_add_static(dev, &riscv_cpu_extensions[i]);
 }
 
 for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {
-- 
2.41.0




[PATCH for-8.2 v5 05/11] target/riscv/cpu.c: split vendor exts from riscv_cpu_extensions[]

2023-07-20 Thread Daniel Henrique Barboza
Our goal is to make riscv_cpu_extensions[] hold only ratified,
non-vendor extensions.

Create a new riscv_cpu_vendor_exts[] array for them, changing
riscv_cpu_add_user_properties() and riscv_cpu_add_kvm_properties()
accordingly.

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Alistair Francis 
---
 target/riscv/cpu.c | 37 +++--
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 4dadb7f0a0..3a20a41dfc 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1807,20 +1807,6 @@ static Property riscv_cpu_extensions[] = {
 DEFINE_PROP_BOOL("zcmp", RISCVCPU, cfg.ext_zcmp, false),
 DEFINE_PROP_BOOL("zcmt", RISCVCPU, cfg.ext_zcmt, false),
 
-/* Vendor-specific custom extensions */
-DEFINE_PROP_BOOL("xtheadba", RISCVCPU, cfg.ext_xtheadba, false),
-DEFINE_PROP_BOOL("xtheadbb", RISCVCPU, cfg.ext_xtheadbb, false),
-DEFINE_PROP_BOOL("xtheadbs", RISCVCPU, cfg.ext_xtheadbs, false),
-DEFINE_PROP_BOOL("xtheadcmo", RISCVCPU, cfg.ext_xtheadcmo, false),
-DEFINE_PROP_BOOL("xtheadcondmov", RISCVCPU, cfg.ext_xtheadcondmov, false),
-DEFINE_PROP_BOOL("xtheadfmemidx", RISCVCPU, cfg.ext_xtheadfmemidx, false),
-DEFINE_PROP_BOOL("xtheadfmv", RISCVCPU, cfg.ext_xtheadfmv, false),
-DEFINE_PROP_BOOL("xtheadmac", RISCVCPU, cfg.ext_xtheadmac, false),
-DEFINE_PROP_BOOL("xtheadmemidx", RISCVCPU, cfg.ext_xtheadmemidx, false),
-DEFINE_PROP_BOOL("xtheadmempair", RISCVCPU, cfg.ext_xtheadmempair, false),
-DEFINE_PROP_BOOL("xtheadsync", RISCVCPU, cfg.ext_xtheadsync, false),
-DEFINE_PROP_BOOL("xventanacondops", RISCVCPU, cfg.ext_XVentanaCondOps, 
false),
-
 /* These are experimental so mark with 'x-' */
 DEFINE_PROP_BOOL("x-zicond", RISCVCPU, cfg.ext_zicond, false),
 
@@ -1837,6 +1823,21 @@ static Property riscv_cpu_extensions[] = {
 DEFINE_PROP_BOOL("x-zvfbfwma", RISCVCPU, cfg.ext_zvfbfwma, false),
 };
 
+static Property riscv_cpu_vendor_exts[] = {
+DEFINE_PROP_BOOL("xtheadba", RISCVCPU, cfg.ext_xtheadba, false),
+DEFINE_PROP_BOOL("xtheadbb", RISCVCPU, cfg.ext_xtheadbb, false),
+DEFINE_PROP_BOOL("xtheadbs", RISCVCPU, cfg.ext_xtheadbs, false),
+DEFINE_PROP_BOOL("xtheadcmo", RISCVCPU, cfg.ext_xtheadcmo, false),
+DEFINE_PROP_BOOL("xtheadcondmov", RISCVCPU, cfg.ext_xtheadcondmov, false),
+DEFINE_PROP_BOOL("xtheadfmemidx", RISCVCPU, cfg.ext_xtheadfmemidx, false),
+DEFINE_PROP_BOOL("xtheadfmv", RISCVCPU, cfg.ext_xtheadfmv, false),
+DEFINE_PROP_BOOL("xtheadmac", RISCVCPU, cfg.ext_xtheadmac, false),
+DEFINE_PROP_BOOL("xtheadmemidx", RISCVCPU, cfg.ext_xtheadmemidx, false),
+DEFINE_PROP_BOOL("xtheadmempair", RISCVCPU, cfg.ext_xtheadmempair, false),
+DEFINE_PROP_BOOL("xtheadsync", RISCVCPU, cfg.ext_xtheadsync, false),
+DEFINE_PROP_BOOL("xventanacondops", RISCVCPU, cfg.ext_XVentanaCondOps, 
false),
+};
+
 static Property riscv_cpu_options[] = {
 DEFINE_PROP_UINT8("pmu-num", RISCVCPU, cfg.pmu_num, 16),
 
@@ -1901,6 +1902,10 @@ static void riscv_cpu_add_kvm_properties(Object *obj)
 riscv_cpu_add_kvm_unavail_prop(obj, riscv_cpu_extensions[i].name);
 }
 
+for (int i = 0; i < ARRAY_SIZE(riscv_cpu_vendor_exts); i++) {
+riscv_cpu_add_kvm_unavail_prop(obj, riscv_cpu_vendor_exts[i].name);
+}
+
 for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {
 /* Check if KVM created the property already */
 if (object_property_find(obj, riscv_cpu_options[i].name)) {
@@ -1939,6 +1944,10 @@ static void riscv_cpu_add_user_properties(Object *obj)
 for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {
 qdev_property_add_static(dev, &riscv_cpu_options[i]);
 }
+
+for (int i = 0; i < ARRAY_SIZE(riscv_cpu_vendor_exts); i++) {
+qdev_property_add_static(dev, &riscv_cpu_vendor_exts[i]);
+}
 }
 
 static Property riscv_cpu_properties[] = {
-- 
2.41.0




[PATCH for-8.2 v5 00/11] riscv: add 'max' CPU, deprecate 'any'

2023-07-20 Thread Daniel Henrique Barboza
Hi,

I'm sending this new version based on another observation I made during
another follow-up work (I'll post it shortly).

'mmu' and 'pmp' aren't really extensions in the most traditional sense;
they're more like features. So, in patch 1, I moved both to the new
riscv_cpu_options array.

This was observed when I was trying to match each existing extension
with a priv_spec. I realized that we have 4 missing entries in
isa_edata_arr[] that we're considering as extensions: ext_zmmul, epmp,
mmu and pmp. The first 2 were sent as bug fixes for 8.1. mmu and pmp
were moved to riscv_cpu_options[] to be handled as such.

A small cosmetic change was made in patch 9 as well.

Patches missing review: 1, 3, 4, 8, 10, 11 

Changes from v4:
- patch 1:
  - add 'mmu' and 'pmp' in riscv_cpu_options
- patch 9:
  - changed 'max' cpu 'for' loop to use ARRAY_SIZE()
- v4 link: 
https://lore.kernel.org/qemu-riscv/20230718210329.200404-1-dbarb...@ventanamicro.com/

Daniel Henrique Barboza (11):
  target/riscv/cpu.c: split CPU options from riscv_cpu_extensions[]
  target/riscv/cpu.c: skip 'bool' check when filtering KVM props
  target/riscv/cpu.c: split kvm prop handling to its own helper
  target/riscv/cpu.c: del DEFINE_PROP_END_OF_LIST() from
riscv_cpu_extensions
  target/riscv/cpu.c: split vendor exts from riscv_cpu_extensions[]
  target/riscv/cpu.c: split non-ratified exts from
riscv_cpu_extensions[]
  target/riscv/cpu.c: add ADD_CPU_QDEV_PROPERTIES_ARRAY() macro
  target/riscv/cpu.c: add ADD_UNAVAIL_KVM_PROP_ARRAY() macro
  target/riscv: add 'max' CPU type
  avocado, risc-v: add opensbi tests for 'max' CPU
  target/riscv: deprecate the 'any' CPU type

 docs/about/deprecated.rst  |  12 +++
 target/riscv/cpu-qom.h |   1 +
 target/riscv/cpu.c | 176 +
 tests/avocado/riscv_opensbi.py |  16 +++
 4 files changed, 164 insertions(+), 41 deletions(-)

-- 
2.41.0




[PATCH for-8.2 v5 02/11] target/riscv/cpu.c: skip 'bool' check when filtering KVM props

2023-07-20 Thread Daniel Henrique Barboza
After the introduction of riscv_cpu_options[] all properties in
riscv_cpu_extensions[] are booleans. This check is now obsolete.

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Alistair Francis 
---
 target/riscv/cpu.c | 14 --
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 9a3afc0482..f10d40733a 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1907,17 +1907,11 @@ static void riscv_cpu_add_user_properties(Object *obj)
  * Set the default to disabled for every extension
  * unknown to KVM and error out if the user attempts
  * to enable any of them.
- *
- * We're giving a pass for non-bool properties since they're
- * not related to the availability of extensions and can be
- * safely ignored as is.
  */
-if (prop->info == _prop_bool) {
-object_property_add(obj, prop->name, "bool",
-NULL, cpu_set_cfg_unavailable,
-NULL, (void *)prop->name);
-continue;
-}
+object_property_add(obj, prop->name, "bool",
+NULL, cpu_set_cfg_unavailable,
+NULL, (void *)prop->name);
+continue;
 }
 #endif
 qdev_property_add_static(dev, prop);
-- 
2.41.0




[PATCH for-8.2 v5 11/11] target/riscv: deprecate the 'any' CPU type

2023-07-20 Thread Daniel Henrique Barboza
The 'any' CPU type was introduced in commit dc5bd18fa5725 ("RISC-V CPU
Core Definition"), being around since the beginning. It's not an easy
CPU to use: it's undocumented and its name doesn't tell users much about
what the CPU is supposed to bring. 'git log' doesn't help us either in
knowing what was the original design of this CPU type.

The closest we have is a comment from Alistair [1] where he recalls from
memory that the 'any' CPU is supposed to behave like the newly added
'max' CPU. He also suggested that the 'any' CPU should be removed.

The default CPUs are rv32 and rv64, so removing the 'any' CPU will have
impact only on users that might have a script that uses '-cpu any'.
And those users are better off using the default CPUs or the new 'max'
CPU.

We would love to just remove the code and be done with it, but one does
not simply remove a feature in QEMU. We'll put the CPU in quarantine
first, letting users know that we have the intent of removing it in the
future.

[1] https://lists.gnu.org/archive/html/qemu-devel/2023-07/msg02891.html

Signed-off-by: Daniel Henrique Barboza 
---
 docs/about/deprecated.rst | 12 
 target/riscv/cpu.c|  5 +
 2 files changed, 17 insertions(+)

diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 02ea5a839f..68afa43fd0 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -371,6 +371,18 @@ QEMU's ``vhost`` feature, which would eliminate the high 
latency costs under
 which the 9p ``proxy`` backend currently suffers. However as of to date nobody
 has indicated plans for such kind of reimplemention unfortunately.
 
+RISC-V 'any' CPU type ``-cpu any`` (since 8.2)
+^^
+
+The 'any' CPU type was introduced back in 2018 and has been around since the
+initial RISC-V QEMU port. Its usage has always been unclear: users don't know
+what to expect from a CPU called 'any', and in fact the CPU does not do
+anything special that isn't already done by the default CPUs rv32/rv64.
+
+After the introduction of the 'max' CPU type RISC-V now has a good coverage
+of generic CPUs: rv32 and rv64 as default CPUs and 'max' as a feature complete
+CPU for both 32 and 64 bit builds. Users are therefore discouraged from
+using the 'any' CPU type starting in 8.2.
 
 Block device options
 
diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 0221bfcbef..9f21dc775e 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1471,6 +1471,11 @@ static void riscv_cpu_realize(DeviceState *dev, Error 
**errp)
 RISCVCPUClass *mcc = RISCV_CPU_GET_CLASS(dev);
 Error *local_err = NULL;
 
+if (object_dynamic_cast(OBJECT(dev), TYPE_RISCV_CPU_ANY) != NULL) {
+warn_report("The 'any' CPU is deprecated and will be "
+"removed in the future.");
+}
+
 cpu_exec_realizefn(cs, &local_err);
 if (local_err != NULL) {
 error_propagate(errp, local_err);
-- 
2.41.0




[PATCH for-8.2 v5 06/11] target/riscv/cpu.c: split non-ratified exts from riscv_cpu_extensions[]

2023-07-20 Thread Daniel Henrique Barboza
Create a new riscv_cpu_experimental_exts[] to store the non-ratified
extensions properties. Once they are ratified we'll move them back to
riscv_cpu_extensions[].

riscv_cpu_add_user_properties() and riscv_cpu_add_kvm_properties() are
changed to keep adding non-ratified properties to users.

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Alistair Francis 
---
 target/riscv/cpu.c | 41 ++---
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 3a20a41dfc..7c6060ffa3 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1806,21 +1806,6 @@ static Property riscv_cpu_extensions[] = {
 DEFINE_PROP_BOOL("zcf", RISCVCPU, cfg.ext_zcf, false),
 DEFINE_PROP_BOOL("zcmp", RISCVCPU, cfg.ext_zcmp, false),
 DEFINE_PROP_BOOL("zcmt", RISCVCPU, cfg.ext_zcmt, false),
-
-/* These are experimental so mark with 'x-' */
-DEFINE_PROP_BOOL("x-zicond", RISCVCPU, cfg.ext_zicond, false),
-
-/* ePMP 0.9.3 */
-DEFINE_PROP_BOOL("x-epmp", RISCVCPU, cfg.epmp, false),
-DEFINE_PROP_BOOL("x-smaia", RISCVCPU, cfg.ext_smaia, false),
-DEFINE_PROP_BOOL("x-ssaia", RISCVCPU, cfg.ext_ssaia, false),
-
-DEFINE_PROP_BOOL("x-zvfh", RISCVCPU, cfg.ext_zvfh, false),
-DEFINE_PROP_BOOL("x-zvfhmin", RISCVCPU, cfg.ext_zvfhmin, false),
-
-DEFINE_PROP_BOOL("x-zfbfmin", RISCVCPU, cfg.ext_zfbfmin, false),
-DEFINE_PROP_BOOL("x-zvfbfmin", RISCVCPU, cfg.ext_zvfbfmin, false),
-DEFINE_PROP_BOOL("x-zvfbfwma", RISCVCPU, cfg.ext_zvfbfwma, false),
 };
 
 static Property riscv_cpu_vendor_exts[] = {
@@ -1838,6 +1823,23 @@ static Property riscv_cpu_vendor_exts[] = {
 DEFINE_PROP_BOOL("xventanacondops", RISCVCPU, cfg.ext_XVentanaCondOps, 
false),
 };
 
+/* These are experimental so mark with 'x-' */
+static Property riscv_cpu_experimental_exts[] = {
+DEFINE_PROP_BOOL("x-zicond", RISCVCPU, cfg.ext_zicond, false),
+
+/* ePMP 0.9.3 */
+DEFINE_PROP_BOOL("x-epmp", RISCVCPU, cfg.epmp, false),
+DEFINE_PROP_BOOL("x-smaia", RISCVCPU, cfg.ext_smaia, false),
+DEFINE_PROP_BOOL("x-ssaia", RISCVCPU, cfg.ext_ssaia, false),
+
+DEFINE_PROP_BOOL("x-zvfh", RISCVCPU, cfg.ext_zvfh, false),
+DEFINE_PROP_BOOL("x-zvfhmin", RISCVCPU, cfg.ext_zvfhmin, false),
+
+DEFINE_PROP_BOOL("x-zfbfmin", RISCVCPU, cfg.ext_zfbfmin, false),
+DEFINE_PROP_BOOL("x-zvfbfmin", RISCVCPU, cfg.ext_zvfbfmin, false),
+DEFINE_PROP_BOOL("x-zvfbfwma", RISCVCPU, cfg.ext_zvfbfwma, false),
+};
+
 static Property riscv_cpu_options[] = {
 DEFINE_PROP_UINT8("pmu-num", RISCVCPU, cfg.pmu_num, 16),
 
@@ -1906,6 +1908,11 @@ static void riscv_cpu_add_kvm_properties(Object *obj)
 riscv_cpu_add_kvm_unavail_prop(obj, riscv_cpu_vendor_exts[i].name);
 }
 
+for (int i = 0; i < ARRAY_SIZE(riscv_cpu_experimental_exts); i++) {
+riscv_cpu_add_kvm_unavail_prop(obj,
+   riscv_cpu_experimental_exts[i].name);
+}
+
 for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {
 /* Check if KVM created the property already */
 if (object_property_find(obj, riscv_cpu_options[i].name)) {
@@ -1948,6 +1955,10 @@ static void riscv_cpu_add_user_properties(Object *obj)
 for (int i = 0; i < ARRAY_SIZE(riscv_cpu_vendor_exts); i++) {
 qdev_property_add_static(dev, &riscv_cpu_vendor_exts[i]);
 }
+
+for (int i = 0; i < ARRAY_SIZE(riscv_cpu_experimental_exts); i++) {
+qdev_property_add_static(dev, &riscv_cpu_experimental_exts[i]);
+}
 }
 
 static Property riscv_cpu_properties[] = {
-- 
2.41.0




[PATCH for-8.2 v5 01/11] target/riscv/cpu.c: split CPU options from riscv_cpu_extensions[]

2023-07-20 Thread Daniel Henrique Barboza
We'll add a new CPU type that will enable a considerable amount of
extensions. To make it easier for us we'll do a few cleanups in our
existing riscv_cpu_extensions[] array.

Start by splitting all CPU non-boolean options from it. Create a new
riscv_cpu_options[] array for them. Add all these properties in
riscv_cpu_add_user_properties() as it is already being done today.

'mmu' and 'pmp' aren't really extensions in the usual way we think about
RISC-V extensions. These are closer to CPU features/options, so move
both to riscv_cpu_options[] too. In the near future we'll need to match
all extensions with all entries in isa_edata_arr[], and so it happens
that both 'mmu' and 'pmp' do not have a riscv,isa string (thus, no priv
spec version restriction). This further emphasizes the point that these
are more a CPU option than an extension.

No functional changes made.

Signed-off-by: Daniel Henrique Barboza 
---
 target/riscv/cpu.c | 33 +++--
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 6b93b04453..9a3afc0482 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1752,7 +1752,6 @@ static void riscv_cpu_add_misa_properties(Object *cpu_obj)
 
 static Property riscv_cpu_extensions[] = {
 /* Defaults for standard extensions */
-DEFINE_PROP_UINT8("pmu-num", RISCVCPU, cfg.pmu_num, 16),
 DEFINE_PROP_BOOL("sscofpmf", RISCVCPU, cfg.ext_sscofpmf, false),
 DEFINE_PROP_BOOL("Zifencei", RISCVCPU, cfg.ext_ifencei, true),
 DEFINE_PROP_BOOL("Zicsr", RISCVCPU, cfg.ext_icsr, true),
@@ -1764,15 +1763,8 @@ static Property riscv_cpu_extensions[] = {
 DEFINE_PROP_BOOL("Zve32f", RISCVCPU, cfg.ext_zve32f, false),
 DEFINE_PROP_BOOL("Zve64f", RISCVCPU, cfg.ext_zve64f, false),
 DEFINE_PROP_BOOL("Zve64d", RISCVCPU, cfg.ext_zve64d, false),
-DEFINE_PROP_BOOL("mmu", RISCVCPU, cfg.mmu, true),
-DEFINE_PROP_BOOL("pmp", RISCVCPU, cfg.pmp, true),
 DEFINE_PROP_BOOL("sstc", RISCVCPU, cfg.ext_sstc, true),
 
-DEFINE_PROP_STRING("priv_spec", RISCVCPU, cfg.priv_spec),
-DEFINE_PROP_STRING("vext_spec", RISCVCPU, cfg.vext_spec),
-DEFINE_PROP_UINT16("vlen", RISCVCPU, cfg.vlen, 128),
-DEFINE_PROP_UINT16("elen", RISCVCPU, cfg.elen, 64),
-
 DEFINE_PROP_BOOL("smstateen", RISCVCPU, cfg.ext_smstateen, false),
 DEFINE_PROP_BOOL("svadu", RISCVCPU, cfg.ext_svadu, true),
 DEFINE_PROP_BOOL("svinval", RISCVCPU, cfg.ext_svinval, false),
@@ -1803,9 +1795,7 @@ static Property riscv_cpu_extensions[] = {
 DEFINE_PROP_BOOL("zhinxmin", RISCVCPU, cfg.ext_zhinxmin, false),
 
 DEFINE_PROP_BOOL("zicbom", RISCVCPU, cfg.ext_icbom, true),
-DEFINE_PROP_UINT16("cbom_blocksize", RISCVCPU, cfg.cbom_blocksize, 64),
 DEFINE_PROP_BOOL("zicboz", RISCVCPU, cfg.ext_icboz, true),
-DEFINE_PROP_UINT16("cboz_blocksize", RISCVCPU, cfg.cboz_blocksize, 64),
 
 DEFINE_PROP_BOOL("zmmul", RISCVCPU, cfg.ext_zmmul, false),
 
@@ -1849,6 +1839,21 @@ static Property riscv_cpu_extensions[] = {
 DEFINE_PROP_END_OF_LIST(),
 };
 
+static Property riscv_cpu_options[] = {
+DEFINE_PROP_UINT8("pmu-num", RISCVCPU, cfg.pmu_num, 16),
+
+DEFINE_PROP_BOOL("mmu", RISCVCPU, cfg.mmu, true),
+DEFINE_PROP_BOOL("pmp", RISCVCPU, cfg.pmp, true),
+
+DEFINE_PROP_STRING("priv_spec", RISCVCPU, cfg.priv_spec),
+DEFINE_PROP_STRING("vext_spec", RISCVCPU, cfg.vext_spec),
+
+DEFINE_PROP_UINT16("vlen", RISCVCPU, cfg.vlen, 128),
+DEFINE_PROP_UINT16("elen", RISCVCPU, cfg.elen, 64),
+
+DEFINE_PROP_UINT16("cbom_blocksize", RISCVCPU, cfg.cbom_blocksize, 64),
+DEFINE_PROP_UINT16("cboz_blocksize", RISCVCPU, cfg.cboz_blocksize, 64),
+};
 
 #ifndef CONFIG_USER_ONLY
 static void cpu_set_cfg_unavailable(Object *obj, Visitor *v,
@@ -1917,6 +1922,14 @@ static void riscv_cpu_add_user_properties(Object *obj)
 #endif
 qdev_property_add_static(dev, prop);
 }
+
+for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {
+/* Check if KVM created the property already */
+if (object_property_find(obj, riscv_cpu_options[i].name)) {
+continue;
+}
+qdev_property_add_static(dev, &riscv_cpu_options[i]);
+}
 }
 
 static Property riscv_cpu_properties[] = {
-- 
2.41.0




[PATCH for-8.2 v5 03/11] target/riscv/cpu.c: split kvm prop handling to its own helper

2023-07-20 Thread Daniel Henrique Barboza
Future patches will split the existing Property arrays even further, and
the existing code in riscv_cpu_add_user_properties() will start to scale
badly with it because it's dealing with KVM constraints mixed in with TCG
constraints. We're going to pay a high price to share a couple of common
lines of code between the two.

Create a new riscv_cpu_add_kvm_properties() that will be forked from
riscv_cpu_add_user_properties() if we're running KVM. The helper
includes all properties that a KVM CPU will add. The rest of
riscv_cpu_add_user_properties() body will then be relieved from having
to deal with KVM constraints.

Signed-off-by: Daniel Henrique Barboza 
---
 target/riscv/cpu.c | 65 ++
 1 file changed, 42 insertions(+), 23 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index f10d40733a..7f0852a14e 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1874,6 +1874,46 @@ static void cpu_set_cfg_unavailable(Object *obj, Visitor 
*v,
 }
 #endif
 
+#ifndef CONFIG_USER_ONLY
+static void riscv_cpu_add_kvm_unavail_prop(Object *obj, const char *prop_name)
+{
+/* Check if KVM created the property already */
+if (object_property_find(obj, prop_name)) {
+return;
+}
+
+/*
+ * Set the default to disabled for every extension
+ * unknown to KVM and error out if the user attempts
+ * to enable any of them.
+ */
+object_property_add(obj, prop_name, "bool",
+NULL, cpu_set_cfg_unavailable,
+NULL, (void *)prop_name);
+}
+
+static void riscv_cpu_add_kvm_properties(Object *obj)
+{
+Property *prop;
+DeviceState *dev = DEVICE(obj);
+
+kvm_riscv_init_user_properties(obj);
+riscv_cpu_add_misa_properties(obj);
+
+for (prop = riscv_cpu_extensions; prop && prop->name; prop++) {
+riscv_cpu_add_kvm_unavail_prop(obj, prop->name);
+}
+
+for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {
+/* Check if KVM created the property already */
+if (object_property_find(obj, riscv_cpu_options[i].name)) {
+continue;
+}
+qdev_property_add_static(dev, &riscv_cpu_options[i]);
+}
+}
+#endif
+
 /*
  * Add CPU properties with user-facing flags.
  *
@@ -1889,39 +1929,18 @@ static void riscv_cpu_add_user_properties(Object *obj)
 riscv_add_satp_mode_properties(obj);
 
 if (kvm_enabled()) {
-kvm_riscv_init_user_properties(obj);
+riscv_cpu_add_kvm_properties(obj);
+return;
 }
 #endif
 
 riscv_cpu_add_misa_properties(obj);
 
 for (prop = riscv_cpu_extensions; prop && prop->name; prop++) {
-#ifndef CONFIG_USER_ONLY
-if (kvm_enabled()) {
-/* Check if KVM created the property already */
-if (object_property_find(obj, prop->name)) {
-continue;
-}
-
-/*
- * Set the default to disabled for every extension
- * unknown to KVM and error out if the user attempts
- * to enable any of them.
- */
-object_property_add(obj, prop->name, "bool",
-NULL, cpu_set_cfg_unavailable,
-NULL, (void *)prop->name);
-continue;
-}
-#endif
 qdev_property_add_static(dev, prop);
 }
 
 for (int i = 0; i < ARRAY_SIZE(riscv_cpu_options); i++) {
-/* Check if KVM created the property already */
-if (object_property_find(obj, riscv_cpu_options[i].name)) {
-continue;
-}
 qdev_property_add_static(dev, &riscv_cpu_options[i]);
 }
 }
-- 
2.41.0




Re: [PATCH 3/4] virtio-net: added USO support

2023-07-20 Thread Michael S. Tsirkin
On Thu, Jul 20, 2023 at 07:05:40PM +0300, Yuri Benditovich wrote:
> 
> 
> On Thu, Jul 20, 2023 at 3:37 AM Akihiko Odaki  
> wrote:
> 
> On 2023/07/20 0:21, Yuri Benditovich wrote:
> > virtio-net can suggest USO features TX, RX v4 and RX v6,
> > depending on kernel TUN ability to support them. These
> > features require explicit enable in command-line.
> 
> Shouldn't we enable these by default as the other offload features are?
> 
> 
> My suggestion is to add these features as disabled by default and reevaluate
> the
> possibility to enable later.
> If we enable them by default we'll also need to disable them by default in
> previous
> generations of machine types.

Yea, let's do that, that's how we always did it traditionally.
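For reference, that usually means compat entries for the older machine
types; a rough sketch of what it could look like (the exact hw_compat_*
array and the final property names are assumptions here, not the actual
patch):

/*
 * Illustrative only: GlobalProperty entries appended to the relevant
 * hw_compat_* array in hw/core/machine.c so that older machine types
 * keep the new USO offloads disabled by default.
 */
GlobalProperty hw_compat_8_0[] = {
    /* ... existing entries ... */
    { "virtio-net-device", "guest_uso4", "off" },
    { "virtio-net-device", "guest_uso6", "off" },
    { "virtio-net-device", "host_uso", "off" },
};

That way only the new default machine type exposes the features by
default, and guests started with an older machine type see no change.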

> 
> 
> >
> > Signed-off-by: Yuri Benditovich 
> > ---
> >   hw/net/virtio-net.c | 16 ++--
> >   1 file changed, 14 insertions(+), 2 deletions(-)
> >
> > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> > index d2311e7d6e..e76cad923b 100644
> > --- a/hw/net/virtio-net.c
> > +++ b/hw/net/virtio-net.c
> > @@ -796,6 +796,10 @@ static uint64_t 
> virtio_net_get_features(VirtIODevice
> *vdev, uint64_t features,
> >           virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
> >           virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);
> >   
> > +        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
> > +        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
> > +        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
> > +
> >           virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
> >       }
> >   
> > @@ -864,14 +868,16 @@ static void virtio_net_apply_guest_offloads
> (VirtIONet *n)
> >               !!(n->curr_guest_offloads & (1ULL <<
> VIRTIO_NET_F_GUEST_USO6)));
> >   }
> >   
> > -static uint64_t virtio_net_guest_offloads_by_features(uint32_t 
> features)
> > +static uint64_t virtio_net_guest_offloads_by_features(uint64_t 
> features)
> >   {
> >       static const uint64_t guest_offloads_mask =
> >           (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
> >           (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
> >           (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
> >           (1ULL << VIRTIO_NET_F_GUEST_ECN)  |
> > -        (1ULL << VIRTIO_NET_F_GUEST_UFO);
> > +        (1ULL << VIRTIO_NET_F_GUEST_UFO)  |
> > +        (1ULL << VIRTIO_NET_F_GUEST_USO4) |
> > +        (1ULL << VIRTIO_NET_F_GUEST_USO6);
> >   
> >       return guest_offloads_mask & features;
> >   }
> > @@ -3924,6 +3930,12 @@ static Property virtio_net_properties[] = {
> >       DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed,
> SPEED_UNKNOWN),
> >       DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
> >       DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
> > +    DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
> > +                      VIRTIO_NET_F_GUEST_USO4, false),
> > +    DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
> > +                      VIRTIO_NET_F_GUEST_USO6, false),
> > +    DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
> > +                      VIRTIO_NET_F_HOST_USO, false),
> >       DEFINE_PROP_END_OF_LIST(),
> >   };
> >   
> 




Re: [PATCH 5/5] target/arm/helper: Implement CNTHCTL_EL2.CNT[VP]MASK

2023-07-20 Thread Peter Maydell
On Wed, 19 Jul 2023 at 16:56, Jean-Philippe Brucker
 wrote:
>
> When FEAT_RME is implemented, these bits override the value of
> CNT[VP]_CTL_EL0.IMASK in Realm and Root state.
>
> Signed-off-by: Jean-Philippe Brucker 
> ---
>  target/arm/helper.c | 21 +++--
>  1 file changed, 19 insertions(+), 2 deletions(-)
>
> diff --git a/target/arm/helper.c b/target/arm/helper.c
> index 2017b11795..5b173a827f 100644
> --- a/target/arm/helper.c
> +++ b/target/arm/helper.c
> @@ -2608,6 +2608,23 @@ static uint64_t gt_get_countervalue(CPUARMState *env)
>  return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / gt_cntfrq_period_ns(cpu);
>  }
>
> +static bool gt_is_masked(CPUARMState *env, int timeridx)
> +{
> +ARMSecuritySpace ss = arm_security_space(env);
> +
> +/*
> + * If bits CNTHCTL_EL2.CNT[VP]MASK are set, they override
> + * CNT[VP]_CTL_EL0.IMASK. They are RES0 in Secure and NonSecure state.
> + */
> +if ((ss == ARMSS_Root || ss == ARMSS_Realm) &&
> +((timeridx == GTIMER_VIRT && extract64(env->cp15.cnthctl_el2, 18, 
> 1)) ||
> + (timeridx == GTIMER_PHYS && extract64(env->cp15.cnthctl_el2, 19, 
> 1 {
> +return true;
> +}
> +
> +return env->cp15.c14_timer[timeridx].ctl & 2;
> +}
> +
>  static void gt_recalc_timer(ARMCPU *cpu, int timeridx)
>  {
>  ARMGenericTimer *gt = &cpu->env.cp15.c14_timer[timeridx];
> @@ -2627,7 +2644,7 @@ static void gt_recalc_timer(ARMCPU *cpu, int timeridx)
>
>  gt->ctl = deposit32(gt->ctl, 2, 1, istatus);
>
> -irqstate = (istatus && !(gt->ctl & 2));
> +irqstate = (istatus && !gt_is_masked(>env, timeridx));
>  qemu_set_irq(cpu->gt_timer_outputs[timeridx], irqstate);
>
>  if (istatus) {
> @@ -2759,7 +2776,7 @@ static void gt_ctl_write(CPUARMState *env, const 
> ARMCPRegInfo *ri,
>   * IMASK toggled: don't need to recalculate,
>   * just set the interrupt line based on ISTATUS
>   */
> -int irqstate = (oldval & 4) && !(value & 2);
> +int irqstate = (oldval & 4) && !gt_is_masked(env, timeridx);
>
>  trace_arm_gt_imask_toggle(timeridx, irqstate);
>  qemu_set_irq(cpu->gt_timer_outputs[timeridx], irqstate);

If these CNTHCTL bits now affect whether the timer interrupts
are masked, then we need to update the timer irq state
on writes to CNTHCTL that change the bits.
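
A rough sketch of that follow-up (the function name and how it gets
wired up as the CNTHCTL_EL2 writefn are assumptions, not the actual
patch; gt_recalc_timer() already recomputes and drives the IRQ line):

static void gt_cnthctl_write(CPUARMState *env, const ARMCPRegInfo *ri,
                             uint64_t value)
{
    ARMCPU *cpu = env_archcpu(env);
    uint64_t oldval = env->cp15.cnthctl_el2;

    raw_write(env, ri, value);

    /* If CNTVMASK (bit 18) or CNTPMASK (bit 19) changed, reevaluate
     * the corresponding timer IRQ line. */
    if ((oldval ^ value) & (1ULL << 18)) {
        gt_recalc_timer(cpu, GTIMER_VIRT);
    }
    if ((oldval ^ value) & (1ULL << 19)) {
        gt_recalc_timer(cpu, GTIMER_PHYS);
    }
}

gt_ctl_write() already does the equivalent when IMASK is toggled, so
this would keep the two paths consistent.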

thanks
-- PMM



Re: [PATCH 4/5] target/arm: Pass security space rather than flag for AT instructions

2023-07-20 Thread Peter Maydell
On Wed, 19 Jul 2023 at 16:56, Jean-Philippe Brucker
 wrote:
>
> At the moment we only handle Secure and Nonsecure security spaces for
> the AT instructions. Add support for Realm and Root.
>
> For AArch64, arm_security_space() gives the desired space. ARM DDI0487J
> says (R_NYXTL):
>
>   If EL3 is implemented, then when an address translation instruction
>   that applies to an Exception level lower than EL3 is executed, the
>   Effective value of SCR_EL3.{NSE, NS} determines the target Security
>   state that the instruction applies to.
>
> For AArch32, some instructions can access NonSecure space from Secure,
> so we still need to pass the state explicitly to do_ats_write().
>
> Signed-off-by: Jean-Philippe Brucker 
> ---
> I haven't tested AT instructions in Realm/Root space yet, but it looks
> like the patch is needed. RMM doesn't issue AT instructions like KVM
> does in non-secure state (which triggered the bug in the previous
> patch).
> ---

Reviewed-by: Peter Maydell 

We should also implement the check that the AT S1E[012] ops have
that if FEAT_RME is implemented and SCR_EL3.{NS,NSE} are a reserved
value then the AT insn should UNDEF. But that's a separate patch.
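
A rough sketch of the kind of check meant here (the helper name and
where it gets hooked into the AT reginfo are assumptions, not an actual
implementation):

static CPAccessResult at_s1e01_access(CPUARMState *env,
                                      const ARMCPRegInfo *ri, bool isread)
{
    /* FEAT_RME: SCR_EL3.{NSE,NS} == {1,0} is reserved, so AT must UNDEF */
    if (cpu_isar_feature(aa64_rme, env_archcpu(env)) &&
        (env->cp15.scr_el3 & (SCR_NSE | SCR_NS)) == SCR_NSE) {
        return CP_ACCESS_TRAP_UNCATEGORIZED;
    }
    return CP_ACCESS_OK;
}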

thanks
-- PMM



Re: [PATCH] block: Be more verbose in create fallback

2023-07-20 Thread Eric Blake
On Thu, Jul 20, 2023 at 04:00:24PM +0200, Hanna Czenczek wrote:
> For image creation code, we have central fallback code for protocols
> that do not support creating new images (like NBD or iscsi).  So for
> them, you can only specify existing paths/exports that are overwritten
> to make clean new images.  In such a case, if the given path cannot be
> opened (assuming a pre-existing image there), we print an error message
> that tries to describe what is going on: That with this protocol, you
> cannot create new images, but only overwrite existing ones; and the
> given path could not be opened as a pre-existing image.
> 
> However, the current message is confusing, because it does not say that
> the protocol in question does not support creating new images, but
> instead that "image creation" is unsupported.  This can be interpreted
> to mean that `qemu-img create` will not work in principle, which is not
> true.  Be more verbose for clarity.
> 
> Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2217204
> Signed-off-by: Hanna Czenczek 
> ---
>  block.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)

Definitely more verbose, but I don't see that as a bad thing.

Reviewed-by: Eric Blake 

> 
> diff --git a/block.c b/block.c
> index a307c151a8..f530dd9c02 100644
> --- a/block.c
> +++ b/block.c
> @@ -661,8 +661,10 @@ int coroutine_fn bdrv_co_create_opts_simple(BlockDriver 
> *drv,
>  blk = blk_co_new_open(filename, NULL, options,
>BDRV_O_RDWR | BDRV_O_RESIZE, errp);
>  if (!blk) {
> -error_prepend(errp, "Protocol driver '%s' does not support image "
> -  "creation, and opening the image failed: ",
> +error_prepend(errp, "Protocol driver '%s' does not support creating "
> +  "new images, so an existing image must be selected as "
> +  "the target; however, opening the given target as an "
> +  "existing image failed: ",
>drv->format_name);
>  return -EINVAL;
>  }
> -- 
> 2.41.0
> 
> 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH for-8.2 3/4] hw/rtc/aspeed_rtc: Use 64-bit offset for holding time_t difference

2023-07-20 Thread Cédric Le Goater

On 7/20/23 18:45, Peter Maydell wrote:
> On Thu, 20 Jul 2023 at 17:42, Cédric Le Goater  wrote:
>>
>> On 7/20/23 17:59, Peter Maydell wrote:
>>> In the aspeed_rtc device we store a difference between two time_t
>>> values in an 'int'. This is not really correct when time_t could
>>> be 64 bits. Enlarge the field to 'int64_t'.
>>>
>>> This is a migration compatibility break for the aspeed boards.
>>> While we are changing the vmstate, remove the accidental
>>> duplicate of the offset field.
>>
>> Ah yes. Thanks.
>>
>>> Signed-off-by: Peter Maydell 
>>
>> Reviewed-by: Cédric Le Goater 
>>
>>> ---
>>> I took "bump the migration version" as the simplest approach
>>> here, because I don't think we care about migration compat
>>> in this case. If we do I can write the alternate version of
>>> the patch...
>>
>> I don't think we care much about migration compat and fyi, migration
>> of aspeed machines broke a while ago. It still migrates if done before
>> Linux is loaded.
>
> Is that the "migration of AArch32 Secure state doesn't work
> properly" bug, or am I misremembering?

probably, arm926 is not impacted, arm1176 and cortex-a7 are.

C.



Re: [PATCH 3/5] target/arm: Skip granule protection checks for AT instructions

2023-07-20 Thread Peter Maydell
On Thu, 20 Jul 2023 at 17:39, Peter Maydell  wrote:
>
> On Wed, 19 Jul 2023 at 16:56, Jean-Philippe Brucker
>  wrote:
> >
> > GPC checks are not performed on the output address for AT instructions,
> > as stated by ARM DDI 0487J in D8.12.2:
> >
> >   When populating PAR_EL1 with the result of an address translation
> >   instruction, granule protection checks are not performed on the final
> >   output address of a successful translation.
> >
> > Rename get_phys_addr_with_secure(), since it's only used to handle AT
> > instructions.
> >
> > Signed-off-by: Jean-Philippe Brucker 
> > ---
> > This incidentally fixes a problem with AT S1E1 instructions which can
> > output an IPA and should definitely not cause a GPC.
> > ---
> >  target/arm/internals.h | 25 ++---
> >  target/arm/helper.c|  8 ++--
> >  target/arm/ptw.c   | 11 ++-
> >  3 files changed, 26 insertions(+), 18 deletions(-)
> >
> > diff --git a/target/arm/internals.h b/target/arm/internals.h
> > index 0f01bc32a8..fc90c364f7 100644
> > --- a/target/arm/internals.h
> > +++ b/target/arm/internals.h
> > @@ -1190,12 +1190,11 @@ typedef struct GetPhysAddrResult {
> >  } GetPhysAddrResult;
> >
> >  /**
> > - * get_phys_addr_with_secure: get the physical address for a virtual 
> > address
> > + * get_phys_addr: get the physical address for a virtual address
> >   * @env: CPUARMState
> >   * @address: virtual address to get physical address for
> >   * @access_type: 0 for read, 1 for write, 2 for execute
> >   * @mmu_idx: MMU index indicating required translation regime
> > - * @is_secure: security state for the access
> >   * @result: set on translation success.
> >   * @fi: set to fault info if the translation fails
> >   *
> > @@ -1212,26 +1211,30 @@ typedef struct GetPhysAddrResult {
> >   *  * for PSMAv5 based systems we don't bother to return a full FSR format
> >   *value.
> >   */
> > -bool get_phys_addr_with_secure(CPUARMState *env, target_ulong address,
> > -   MMUAccessType access_type,
> > -   ARMMMUIdx mmu_idx, bool is_secure,
> > -   GetPhysAddrResult *result, ARMMMUFaultInfo 
> > *fi)
> > +bool get_phys_addr(CPUARMState *env, target_ulong address,
> > +   MMUAccessType access_type, ARMMMUIdx mmu_idx,
> > +   GetPhysAddrResult *result, ARMMMUFaultInfo *fi)
> >  __attribute__((nonnull));
>
>
> What is going on in this bit of the patch ? We already
> have a prototype for get_phys_addr() with a doc comment.
> Is this git's diff-output producing something silly
> for a change that is not logically touching get_phys_addr()'s
> prototype and comment at all?

Looking more closely, that does seem to be what's happened, so

Reviewed-by: Peter Maydell 

thanks
-- PMM



Re: [PATCH for-8.2 3/4] hw/rtc/aspeed_rtc: Use 64-bit offset for holding time_t difference

2023-07-20 Thread Peter Maydell
On Thu, 20 Jul 2023 at 17:42, Cédric Le Goater  wrote:
>
> On 7/20/23 17:59, Peter Maydell wrote:
> > In the aspeed_rtc device we store a difference between two time_t
> > values in an 'int'. This is not really correct when time_t could
> > be 64 bits. Enlarge the field to 'int64_t'.
> >
> > This is a migration compatibility break for the aspeed boards.
> > While we are changing the vmstate, remove the accidental
> > duplicate of the offset field.
>
> Ah yes. Thanks.
>
> >
> > Signed-off-by: Peter Maydell 
>
>
> Reviewed-by: Cédric Le Goater 
>
>
> > ---
> > I took "bump the migration version" as the simplest approach
> > here, because I don't think we care about migration compat
> > in this case. If we do I can write the alternate version of
> > the patch...
>
>
> I don't think we care much about migration compat and fyi, migration
> of aspeed machines broke a while ago. It still migrates if done before
> Linux is loaded.

Is that the "migration of AArch32 Secure state doesn't work
properly" bug, or am I misremembering?

-- PMM



Re: [PATCH for-8.2 3/4] hw/rtc/aspeed_rtc: Use 64-bit offset for holding time_t difference

2023-07-20 Thread Cédric Le Goater

On 7/20/23 17:59, Peter Maydell wrote:
> In the aspeed_rtc device we store a difference between two time_t
> values in an 'int'. This is not really correct when time_t could
> be 64 bits. Enlarge the field to 'int64_t'.
>
> This is a migration compatibility break for the aspeed boards.
> While we are changing the vmstate, remove the accidental
> duplicate of the offset field.

Ah yes. Thanks.

> Signed-off-by: Peter Maydell 

Reviewed-by: Cédric Le Goater 

> ---
> I took "bump the migration version" as the simplest approach
> here, because I don't think we care about migration compat
> in this case. If we do I can write the alternate version of
> the patch...

I don't think we care much about migration compat and fyi, migration
of aspeed machines broke a while ago. It still migrates if done before
Linux is loaded.

C.

> ---
>   include/hw/rtc/aspeed_rtc.h | 2 +-
>   hw/rtc/aspeed_rtc.c | 5 ++---
>   2 files changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/include/hw/rtc/aspeed_rtc.h b/include/hw/rtc/aspeed_rtc.h
> index df61e46059e..596dfebb46c 100644
> --- a/include/hw/rtc/aspeed_rtc.h
> +++ b/include/hw/rtc/aspeed_rtc.h
> @@ -18,7 +18,7 @@ struct AspeedRtcState {
>   qemu_irq irq;
>
>   uint32_t reg[0x18];
> -int offset;
> +int64_t offset;
>
>   };
>
> diff --git a/hw/rtc/aspeed_rtc.c b/hw/rtc/aspeed_rtc.c
> index f6da7b666d6..fa861e2d494 100644
> --- a/hw/rtc/aspeed_rtc.c
> +++ b/hw/rtc/aspeed_rtc.c
> @@ -136,11 +136,10 @@ static const MemoryRegionOps aspeed_rtc_ops = {
>
>   static const VMStateDescription vmstate_aspeed_rtc = {
>   .name = TYPE_ASPEED_RTC,
> -.version_id = 1,
> +.version_id = 2,
>   .fields = (VMStateField[]) {
>   VMSTATE_UINT32_ARRAY(reg, AspeedRtcState, 0x18),
> -VMSTATE_INT32(offset, AspeedRtcState),
> -VMSTATE_INT32(offset, AspeedRtcState),
> +VMSTATE_INT64(offset, AspeedRtcState),
>   VMSTATE_END_OF_LIST()
>   }
>   };





Re: [PATCH 3/5] target/arm: Skip granule protection checks for AT instructions

2023-07-20 Thread Peter Maydell
On Wed, 19 Jul 2023 at 16:56, Jean-Philippe Brucker
 wrote:
>
> GPC checks are not performed on the output address for AT instructions,
> as stated by ARM DDI 0487J in D8.12.2:
>
>   When populating PAR_EL1 with the result of an address translation
>   instruction, granule protection checks are not performed on the final
>   output address of a successful translation.
>
> Rename get_phys_addr_with_secure(), since it's only used to handle AT
> instructions.
>
> Signed-off-by: Jean-Philippe Brucker 
> ---
> This incidentally fixes a problem with AT S1E1 instructions which can
> output an IPA and should definitely not cause a GPC.
> ---
>  target/arm/internals.h | 25 ++---
>  target/arm/helper.c|  8 ++--
>  target/arm/ptw.c   | 11 ++-
>  3 files changed, 26 insertions(+), 18 deletions(-)
>
> diff --git a/target/arm/internals.h b/target/arm/internals.h
> index 0f01bc32a8..fc90c364f7 100644
> --- a/target/arm/internals.h
> +++ b/target/arm/internals.h
> @@ -1190,12 +1190,11 @@ typedef struct GetPhysAddrResult {
>  } GetPhysAddrResult;
>
>  /**
> - * get_phys_addr_with_secure: get the physical address for a virtual address
> + * get_phys_addr: get the physical address for a virtual address
>   * @env: CPUARMState
>   * @address: virtual address to get physical address for
>   * @access_type: 0 for read, 1 for write, 2 for execute
>   * @mmu_idx: MMU index indicating required translation regime
> - * @is_secure: security state for the access
>   * @result: set on translation success.
>   * @fi: set to fault info if the translation fails
>   *
> @@ -1212,26 +1211,30 @@ typedef struct GetPhysAddrResult {
>   *  * for PSMAv5 based systems we don't bother to return a full FSR format
>   *value.
>   */
> -bool get_phys_addr_with_secure(CPUARMState *env, target_ulong address,
> -   MMUAccessType access_type,
> -   ARMMMUIdx mmu_idx, bool is_secure,
> -   GetPhysAddrResult *result, ARMMMUFaultInfo 
> *fi)
> +bool get_phys_addr(CPUARMState *env, target_ulong address,
> +   MMUAccessType access_type, ARMMMUIdx mmu_idx,
> +   GetPhysAddrResult *result, ARMMMUFaultInfo *fi)
>  __attribute__((nonnull));


What is going on in this bit of the patch ? We already
have a prototype for get_phys_addr() with a doc comment.
Is this git's diff-output producing something silly
for a change that is not logically touching get_phys_addr()'s
prototype and comment at all?

>  /**
> - * get_phys_addr: get the physical address for a virtual address
> + * get_phys_addr_with_secure_nogpc: get the physical address for a virtual
> + *  address
>   * @env: CPUARMState
>   * @address: virtual address to get physical address for
>   * @access_type: 0 for read, 1 for write, 2 for execute
>   * @mmu_idx: MMU index indicating required translation regime
> + * @is_secure: security state for the access
>   * @result: set on translation success.
>   * @fi: set to fault info if the translation fails
>   *
> - * Similarly, but use the security regime of @mmu_idx.
> + * Similar to get_phys_addr, but use the given security regime and don't 
> perform
> + * a Granule Protection Check on the resulting address.
>   */
> -bool get_phys_addr(CPUARMState *env, target_ulong address,
> -   MMUAccessType access_type, ARMMMUIdx mmu_idx,
> -   GetPhysAddrResult *result, ARMMMUFaultInfo *fi)
> +bool get_phys_addr_with_secure_nogpc(CPUARMState *env, target_ulong address,
> + MMUAccessType access_type,
> + ARMMMUIdx mmu_idx, bool is_secure,
> + GetPhysAddrResult *result,
> + ARMMMUFaultInfo *fi)
>  __attribute__((nonnull));

thanks
-- PMM



Re: [PATCH 2/5] target/arm/helper: Fix vae2_tlbmask()

2023-07-20 Thread Peter Maydell
On Wed, 19 Jul 2023 at 16:56, Jean-Philippe Brucker
 wrote:
>
> When HCR_EL2.E2H is enabled, TLB entries are formed using the EL2&0
> translation regime, instead of the EL2 translation regime. The TLB VAE2*
> instructions invalidate the regime that corresponds to the current value
> of HCR_EL2.E2H.
>
> At the moment we only invalidate the EL2 translation regime. This causes
> problems with RMM, which issues TLBI VAE2IS instructions with
> HCR_EL2.E2H enabled. Update vae2_tlbmask() to take HCR_EL2.E2H into
> account.
>
> Signed-off-by: Jean-Philippe Brucker 
> ---
>  target/arm/helper.c | 26 ++
>  1 file changed, 18 insertions(+), 8 deletions(-)
>
> diff --git a/target/arm/helper.c b/target/arm/helper.c
> index e1b3db6f5f..07a9ac70f5 100644
> --- a/target/arm/helper.c
> +++ b/target/arm/helper.c
> @@ -4663,6 +4663,21 @@ static int vae1_tlbmask(CPUARMState *env)
>  return mask;
>  }
>
> +static int vae2_tlbmask(CPUARMState *env)
> +{
> +uint64_t hcr = arm_hcr_el2_eff(env);
> +uint16_t mask;
> +
> +if (hcr & HCR_E2H) {
> +mask = ARMMMUIdxBit_E20_2 |
> +   ARMMMUIdxBit_E20_2_PAN |
> +   ARMMMUIdxBit_E20_0;
> +} else {
> +mask = ARMMMUIdxBit_E2;
> +}
> +return mask;
> +}
> +
>  /* Return 56 if TBI is enabled, 64 otherwise. */
>  static int tlbbits_for_regime(CPUARMState *env, ARMMMUIdx mmu_idx,
>uint64_t addr)

> @@ -4838,11 +4853,11 @@ static void tlbi_aa64_vae2is_write(CPUARMState *env, 
> const ARMCPRegInfo *ri,
> uint64_t value)
>  {
>  CPUState *cs = env_cpu(env);
> +int mask = vae2_tlbmask(env);
>  uint64_t pageaddr = sextract64(value << 12, 0, 56);
>  int bits = tlbbits_for_regime(env, ARMMMUIdx_E2, pageaddr);

Shouldn't the argument to tlbbits_for_regime() also change
if we're dealing with the EL2&0 regime rather than EL2 ?
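
One way to do that, mirroring what vae2_tlbmask() now does (a sketch
only, not the actual fix):

/* In tlbi_aa64_vae2is_write(), pick the TBI regime from HCR_EL2.E2H
 * instead of hardcoding ARMMMUIdx_E2: */
uint64_t hcr = arm_hcr_el2_eff(env);
ARMMMUIdx mmu_idx = (hcr & HCR_E2H) ? ARMMMUIdx_E20_2 : ARMMMUIdx_E2;
int bits = tlbbits_for_regime(env, mmu_idx, pageaddr);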

>
> -tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr,
> -  ARMMMUIdxBit_E2, bits);
> +tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr, mask, bits);
>  }

thanks
-- PMM



Re: [PATCH 1/5] target/arm/ptw: Load stage-2 tables from realm physical space

2023-07-20 Thread Peter Maydell
On Wed, 19 Jul 2023 at 16:56, Jean-Philippe Brucker
 wrote:
>
> In realm state, stage-2 translation tables are fetched from the realm
> physical address space (R_PGRQD).
>
> Signed-off-by: Jean-Philippe Brucker 
> ---
>  target/arm/ptw.c | 6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/target/arm/ptw.c b/target/arm/ptw.c
> index d1de934702..6318e13b98 100644
> --- a/target/arm/ptw.c
> +++ b/target/arm/ptw.c
> @@ -164,7 +164,11 @@ static ARMMMUIdx ptw_idx_for_stage_2(CPUARMState *env, 
> ARMMMUIdx stage2idx)
>   * an NS stage 1+2 lookup while the NS bit is 0.)
>   */
>  if (!arm_is_secure_below_el3(env) || !arm_el_is_aa64(env, 3)) {
> -return ARMMMUIdx_Phys_NS;
> +if (arm_security_space_below_el3(env) == ARMSS_Realm) {
> +return ARMMMUIdx_Phys_Realm;
> +} else {
> +return ARMMMUIdx_Phys_NS;
> +}
>  }
>  if (stage2idx == ARMMMUIdx_Stage2_S) {
>  s2walk_secure = !(env->cp15.vstcr_el2 & VSTCR_SW);

This isn't wrong, but arm_is_secure_below_el3()
calls arm_security_space_below_el3(), so we kinda
duplicate work there. I think we should instead have:

if (!arm_el_is_aa64(env, 3)) {
return ARMMMUIdx_Phys_NS;
}

switch (arm_security_space_below_el3(env)) {
case ARMSS_NonSecure:
return ARMMUIdx_Phys_NS;
case ARMSS_Realm:
return ARMMMUIdx_Phys_Realm;
case ARMSS_Secure:
[existing code to look at the SW/NSW bits]
return s2walk_secure ? ...;
default:
g_assert_not_reached();
}

The comment above the function also needs tweaking
to say "SCR_EL3.NS or SCR_EL3.NSE bits" (we do already
do the TLB flush in scr_write).
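
Filled in, that suggestion might look like this (sketch only; the
Secure branch reuses the SW/NSW handling already in
ptw_idx_for_stage_2(), and VTCR_NSW is assumed here to be the
non-secure-walk bit macro matching VSTCR_SW):

bool s2walk_secure;

if (!arm_el_is_aa64(env, 3)) {
    return ARMMMUIdx_Phys_NS;
}

switch (arm_security_space_below_el3(env)) {
case ARMSS_NonSecure:
    return ARMMMUIdx_Phys_NS;
case ARMSS_Realm:
    return ARMMMUIdx_Phys_Realm;
case ARMSS_Secure:
    if (stage2idx == ARMMMUIdx_Stage2_S) {
        s2walk_secure = !(env->cp15.vstcr_el2 & VSTCR_SW);
    } else {
        s2walk_secure = !(env->cp15.vtcr_el2 & VTCR_NSW);
    }
    return s2walk_secure ? ARMMMUIdx_Phys_S : ARMMMUIdx_Phys_NS;
default:
    g_assert_not_reached();
}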

thanks
-- PMM



[PATCH QEMU v9 9/9] tests: Add migration dirty-limit capability test

2023-07-20 Thread ~hyman
From: Hyman Huang(黄勇) 

Add a migration dirty-limit capability test, run only when the kernel
supports the dirty ring.

The dirty-limit migration capability introduces two parameters,
x-vcpu-dirty-limit-period and vcpu-dirty-limit, which are used to
implement live migration with a dirty page rate limit.

The test case does the following things:
1. start src and dst VMs and enable the dirty-limit capability
2. start migration and then cancel it to check that dirty limit
   stops working.
3. restart the dst VM
4. start migration and enable the dirty-limit capability
5. check that migration satisfies the convergence condition
   during the pre-switchover phase.

Signed-off-by: Hyman Huang(黄勇) 
---
 tests/qtest/migration-test.c | 155 +++
 1 file changed, 155 insertions(+)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index e256da1216..e6f77d176c 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -2743,6 +2743,159 @@ static void test_vcpu_dirty_limit(void)
 dirtylimit_stop_vm(vm);
 }
 
+static void migrate_dirty_limit_wait_showup(QTestState *from,
+const int64_t period,
+const int64_t value)
+{
+/* Enable dirty limit capability */
+migrate_set_capability(from, "dirty-limit", true);
+
+/* Set dirty limit parameters */
+migrate_set_parameter_int(from, "x-vcpu-dirty-limit-period", period);
+migrate_set_parameter_int(from, "vcpu-dirty-limit", value);
+
+/* Make sure migrate can't converge */
+migrate_ensure_non_converge(from);
+
+/* To check limit rate after precopy */
+migrate_set_capability(from, "pause-before-switchover", true);
+
+/* Wait for the serial output from the source */
+wait_for_serial("src_serial");
+}
+
+/*
+ * This test does:
+ *  source   target
+ *   migrate_incoming
+ * migrate
+ * migrate_cancel
+ *   restart target
+ * migrate
+ *
+ *  And see that if dirty limit works correctly
+ */
+static void test_migrate_dirty_limit(void)
+{
+g_autofree char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs);
+QTestState *from, *to;
+int64_t remaining;
+uint64_t throttle_us_per_full;
+/*
+ * We want the test to be stable and as fast as possible.
+ * E.g., with 1Gb/s bandwidth migration may pass without dirty limit,
+ * so we need to reduce the bandwidth.
+ */
+const int64_t dirtylimit_period = 1000, dirtylimit_value = 50;
+const int64_t max_bandwidth = 4; /* ~400Mb/s */
+const int64_t downtime_limit = 250; /* 250ms */
+/*
+ * We migrate through unix-socket (> 500Mb/s).
+ * Thus, expected migration speed ~= bandwidth limit (< 500Mb/s).
+ * So, we can predict expected_threshold
+ */
+const int64_t expected_threshold = max_bandwidth * downtime_limit / 1000;
+int max_try_count = 10;
+MigrateCommon args = {
+.start = {
+.hide_stderr = true,
+.use_dirty_ring = true,
+},
+.listen_uri = uri,
+.connect_uri = uri,
+};
+
+/* Start src, dst vm */
+if (test_migrate_start(&from, &to, args.listen_uri, &args)) {
+return;
+}
+
+/* Prepare for dirty limit migration and wait src vm show up */
+migrate_dirty_limit_wait_showup(from, dirtylimit_period, dirtylimit_value);
+
+/* Start migrate */
+migrate_qmp(from, uri, "{}");
+
+/* Wait for dirty limit throttle begin */
+throttle_us_per_full = 0;
+while (throttle_us_per_full == 0) {
+throttle_us_per_full =
+read_migrate_property_int(from, "dirty-limit-throttle-time-per-round");
+usleep(100);
+g_assert_false(got_src_stop);
+}
+
+/* Now cancel migrate and wait for dirty limit throttle switch off */
+migrate_cancel(from);
+wait_for_migration_status(from, "cancelled", NULL);
+
+/* Check if dirty limit throttle switched off, set timeout 1ms */
+do {
+throttle_us_per_full =
+read_migrate_property_int(from, "dirty-limit-throttle-time-per-round");
+usleep(100);
+g_assert_false(got_src_stop);
+} while (throttle_us_per_full != 0 && --max_try_count);
+
+/* Assert dirty limit is not in service */
+g_assert_cmpint(throttle_us_per_full, ==, 0);
+
+args = (MigrateCommon) {
+.start = {
+.only_target = true,
+.use_dirty_ring = true,
+},
+.listen_uri = uri,
+.connect_uri = uri,
+};
+
+/* Restart dst vm, src vm already show up so we needn't wait anymore */
+if (test_migrate_start(&from, &to, args.listen_uri, &args)) {
+return;
+}
+
+/* Start migrate */
+migrate_qmp(from, uri, "{}");
+
+/* Wait for dirty limit throttle begin */
+throttle_us_per_full = 0;
+while (throttle_us_per_full == 0) {
+throttle_us_per_full =
+read_migrate_property_int(from, 

[PATCH QEMU v9 5/9] migration: Refactor auto-converge capability logic

2023-07-20 Thread ~hyman
From: Hyman Huang(黄勇) 

Check if block migration is running before throttling
guest down in auto-converge way.

Note that this modification is kind of like code clean,
because block migration does not depend on auto-converge
capability, so the order of checks can be adjusted.

Signed-off-by: Hyman Huang(黄勇) 
Acked-by: Peter Xu 
Reviewed-by: Juan Quintela 
---
 migration/ram.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/migration/ram.c b/migration/ram.c
index 0ada6477e8..f31de47a47 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -995,7 +995,11 @@ static void migration_trigger_throttle(RAMState *rs)
 /* During block migration the auto-converge logic incorrectly detects
  * that ram migration makes no progress. Avoid this by disabling the
  * throttling logic during the bulk phase of block migration. */
-if (migrate_auto_converge() && !blk_mig_bulk_active()) {
+if (blk_mig_bulk_active()) {
+return;
+}
+
+if (migrate_auto_converge()) {
 /* The following detection logic can be refined later. For now:
Check to see if the ratio between dirtied bytes and the approx.
amount of bytes that just got transferred since the last time
-- 
2.38.5




[PATCH QEMU v9 8/9] migration: Extend query-migrate to provide dirty-limit info

2023-07-20 Thread ~hyman
From: Hyman Huang(黄勇) 

Extend query-migrate to provide the throttle time and the estimated
ring full time when the dirty-limit capability is enabled, through which
we can observe whether dirty limit takes effect during live migration.

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Markus Armbruster 
Reviewed-by: Juan Quintela 
---
 include/sysemu/dirtylimit.h|  2 ++
 migration/migration-hmp-cmds.c | 10 +
 migration/migration.c  | 10 +
 qapi/migration.json| 16 +-
 softmmu/dirtylimit.c   | 39 ++
 5 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
index 8d2c1f3a6b..d11edb 100644
--- a/include/sysemu/dirtylimit.h
+++ b/include/sysemu/dirtylimit.h
@@ -34,4 +34,6 @@ void dirtylimit_set_vcpu(int cpu_index,
 void dirtylimit_set_all(uint64_t quota,
 bool enable);
 void dirtylimit_vcpu_execute(CPUState *cpu);
+uint64_t dirtylimit_throttle_time_per_round(void);
+uint64_t dirtylimit_ring_full_time(void);
 #endif
diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 35e8020bbf..c115ef2d23 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -190,6 +190,16 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
info->cpu_throttle_percentage);
 }
 
+if (info->has_dirty_limit_throttle_time_per_round) {
+monitor_printf(mon, "dirty-limit throttle time: %" PRIu64 " us\n",
+   info->dirty_limit_throttle_time_per_round);
+}
+
+if (info->has_dirty_limit_ring_full_time) {
+monitor_printf(mon, "dirty-limit ring full time: %" PRIu64 " us\n",
+   info->dirty_limit_ring_full_time);
+}
+
 if (info->has_postcopy_blocktime) {
 monitor_printf(mon, "postcopy blocktime: %u\n",
info->postcopy_blocktime);
diff --git a/migration/migration.c b/migration/migration.c
index 619af62461..3b8587c4ae 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -64,6 +64,7 @@
 #include "yank_functions.h"
 #include "sysemu/qtest.h"
 #include "options.h"
+#include "sysemu/dirtylimit.h"
 
 static NotifierList migration_state_notifiers =
 NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);
@@ -974,6 +975,15 @@ static void populate_ram_info(MigrationInfo *info, 
MigrationState *s)
 info->ram->dirty_pages_rate =
stat64_get(&mig_stats.dirty_pages_rate);
 }
+
+if (migrate_dirty_limit() && dirtylimit_in_service()) {
+info->has_dirty_limit_throttle_time_per_round = true;
+info->dirty_limit_throttle_time_per_round =
+dirtylimit_throttle_time_per_round();
+
+info->has_dirty_limit_ring_full_time = true;
+info->dirty_limit_ring_full_time = dirtylimit_ring_full_time();
+}
 }
 
 static void populate_disk_info(MigrationInfo *info)
diff --git a/qapi/migration.json b/qapi/migration.json
index 1c289e6658..8740ce268c 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -250,6 +250,18 @@
 # blocked.  Present and non-empty when migration is blocked.
 # (since 6.0)
 #
+# @dirty-limit-throttle-time-per-round: Maximum throttle time
+# (in microseconds) of virtual CPUs each dirty ring full round,
+# which shows how MigrationCapability dirty-limit affects the
+# guest during live migration.  (since 8.1)
+#
+# @dirty-limit-ring-full-time: Estimated average dirty ring full
+# time (in microseconds) each dirty ring full round. The value
+# equals dirty ring memory size divided by average dirty page
+# rate of the virtual CPU, which can be used to observe the
+# average memory load of the virtual CPU indirectly. Note that
+# zero means guest doesn't dirty memory.  (since 8.1)
+#
 # Since: 0.14
 ##
 { 'struct': 'MigrationInfo',
@@ -267,7 +279,9 @@
'*postcopy-blocktime' : 'uint32',
'*postcopy-vcpu-blocktime': ['uint32'],
'*compression': 'CompressionStats',
-   '*socket-address': ['SocketAddress'] } }
+   '*socket-address': ['SocketAddress'],
+   '*dirty-limit-throttle-time-per-round': 'uint64',
+   '*dirty-limit-ring-full-time': 'uint64'} }
 
 ##
 # @query-migrate:
diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index 5134296667..a0686323e5 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -565,6 +565,45 @@ out:
 hmp_handle_error(mon, err);
 }
 
+/* Return the max throttle time of each virtual CPU */
+uint64_t dirtylimit_throttle_time_per_round(void)
+{
+CPUState *cpu;
+int64_t max = 0;
+
+CPU_FOREACH(cpu) {
+if (cpu->throttle_us_per_full > max) {
+max = cpu->throttle_us_per_full;
+}
+}
+
+return max;
+}
+
+/*
+ * Estimate average dirty ring full time of each virtual CPU.
+ * Return 0 if guest doesn't dirty memory.
+ */
+uint64_t 

[PATCH QEMU v9 4/9] migration: Introduce dirty-limit capability

2023-07-20 Thread ~hyman
From: Hyman Huang(黄勇) 

Introduce the migration dirty-limit capability, which can
be turned on before live migration to limit the dirty
page rate during live migration.

Introduce the migrate_dirty_limit function to help check
whether the dirty-limit capability is enabled during live migration.

Meanwhile, refactor vcpu_dirty_rate_stat_collect
so that the period can be configured instead of being hardcoded.

The dirty-limit capability is kind of like auto-converge,
but it uses dirty limit instead of the traditional cpu-throttle
mechanism to throttle the guest down. To enable this feature, turn on
the dirty-limit capability before live migration using
migrate-set-capabilities, and set the parameters
"x-vcpu-dirty-limit-period", "vcpu-dirty-limit" suitably
to speed up convergence.

Signed-off-by: Hyman Huang(黄勇) 
Acked-by: Peter Xu 
Reviewed-by: Markus Armbruster 
Reviewed-by: Juan Quintela 
---
 migration/options.c  | 24 
 migration/options.h  |  1 +
 qapi/migration.json  |  9 -
 softmmu/dirtylimit.c | 12 +++-
 4 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/migration/options.c b/migration/options.c
index 7d2d98830e..631c12cf32 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -27,6 +27,7 @@
 #include "qemu-file.h"
 #include "ram.h"
 #include "options.h"
+#include "sysemu/kvm.h"
 
 /* Maximum migrate downtime set to 2000 seconds */
 #define MAX_MIGRATE_DOWNTIME_SECONDS 2000
@@ -196,6 +197,8 @@ Property migration_properties[] = {
 #endif
 DEFINE_PROP_MIG_CAP("x-switchover-ack",
 MIGRATION_CAPABILITY_SWITCHOVER_ACK),
+DEFINE_PROP_MIG_CAP("x-dirty-limit",
+MIGRATION_CAPABILITY_DIRTY_LIMIT),
 
 DEFINE_PROP_END_OF_LIST(),
 };
@@ -242,6 +245,13 @@ bool migrate_dirty_bitmaps(void)
 return s->capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS];
 }
 
+bool migrate_dirty_limit(void)
+{
+MigrationState *s = migrate_get_current();
+
+return s->capabilities[MIGRATION_CAPABILITY_DIRTY_LIMIT];
+}
+
 bool migrate_events(void)
 {
 MigrationState *s = migrate_get_current();
@@ -573,6 +583,20 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, 
Error **errp)
 }
 }
 
+if (new_caps[MIGRATION_CAPABILITY_DIRTY_LIMIT]) {
+if (new_caps[MIGRATION_CAPABILITY_AUTO_CONVERGE]) {
+error_setg(errp, "dirty-limit conflicts with auto-converge"
+   " either of then available currently");
+return false;
+}
+
+if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
+error_setg(errp, "dirty-limit requires KVM with accelerator"
+   " property 'dirty-ring-size' set");
+return false;
+}
+}
+
 return true;
 }
 
diff --git a/migration/options.h b/migration/options.h
index 9aaf363322..b5a950d4e4 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -24,6 +24,7 @@ extern Property migration_properties[];
 /* capabilities */
 
 bool migrate_auto_converge(void);
+bool migrate_dirty_limit(void);
 bool migrate_background_snapshot(void);
 bool migrate_block(void);
 bool migrate_colo(void);
diff --git a/qapi/migration.json b/qapi/migration.json
index 7e92dfa045..1c289e6658 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -497,6 +497,12 @@
 # are present.  'return-path' capability must be enabled to use
 # it.  (since 8.1)
 #
+# @dirty-limit: If enabled, migration will throttle vCPUs as needed to
+# keep their dirty page rate within @vcpu-dirty-limit.  This can
+# improve responsiveness of large guests during live migration,
+# and can result in more stable read performance.  Requires KVM
+# with accelerator property "dirty-ring-size" set.  (Since 8.1)
+#
 # Features:
 #
 # @unstable: Members @x-colo and @x-ignore-shared are experimental.
@@ -512,7 +518,8 @@
'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate',
{ 'name': 'x-ignore-shared', 'features': [ 'unstable' ] },
'validate-uuid', 'background-snapshot',
-   'zero-copy-send', 'postcopy-preempt', 'switchover-ack'] }
+   'zero-copy-send', 'postcopy-preempt', 'switchover-ack',
+   'dirty-limit'] }
 
 ##
 # @MigrationCapabilityStatus:
diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index 5c12d26d49..953ef934bc 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -24,6 +24,9 @@
 #include "hw/boards.h"
 #include "sysemu/kvm.h"
 #include "trace.h"
+#include "migration/misc.h"
+#include "migration/migration.h"
+#include "migration/options.h"
 
 /*
  * Dirtylimit stop working if dirty page rate error
@@ -75,11 +78,18 @@ static bool dirtylimit_quit;
 
 static void vcpu_dirty_rate_stat_collect(void)
 {
+MigrationState *s = migrate_get_current();
 VcpuStat stat;
 int i = 0;
+int64_t period = DIRTYLIMIT_CALC_TIME_MS;
+
+if (migrate_dirty_limit() &&
+migration_is_active(s)) {
+period = s->parameters.x_vcpu_dirty_limit_period;
+   

[PATCH QEMU v9 3/9] qapi/migration: Introduce vcpu-dirty-limit parameters

2023-07-20 Thread ~hyman
From: Hyman Huang(黄勇) 

Introduce "vcpu-dirty-limit" migration parameter used
to limit dirty page rate during live migration.

"vcpu-dirty-limit" and "x-vcpu-dirty-limit-period" are
two dirty-limit-related migration parameters, which can
be set before and during live migration by qmp
migrate-set-parameters.

These two parameters are used to help implement the dirty page
rate limit algorithm for migration.
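
As a rough usage sketch (the value is illustrative only), the parameter
can be set and read back over QMP:

  { "execute": "migrate-set-parameters",
    "arguments": { "vcpu-dirty-limit": 100 } }
  { "execute": "query-migrate-parameters" }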

Signed-off-by: Hyman Huang(黄勇) 
Acked-by: Peter Xu 
Reviewed-by: Juan Quintela 
---
 migration/migration-hmp-cmds.c |  8 
 migration/options.c| 21 +
 qapi/migration.json| 18 +++---
 3 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 352e9ec716..35e8020bbf 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -368,6 +368,10 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict 
*qdict)
 monitor_printf(mon, "%s: %" PRIu64 " ms\n",
 MigrationParameter_str(MIGRATION_PARAMETER_X_VCPU_DIRTY_LIMIT_PERIOD),
 params->x_vcpu_dirty_limit_period);
+
+monitor_printf(mon, "%s: %" PRIu64 " MB/s\n",
+MigrationParameter_str(MIGRATION_PARAMETER_VCPU_DIRTY_LIMIT),
+params->vcpu_dirty_limit);
 }
 
 qapi_free_MigrationParameters(params);
@@ -628,6 +632,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict 
*qdict)
 p->has_x_vcpu_dirty_limit_period = true;
+visit_type_size(v, param, &p->x_vcpu_dirty_limit_period, &err);
 break;
+case MIGRATION_PARAMETER_VCPU_DIRTY_LIMIT:
+p->has_vcpu_dirty_limit = true;
+visit_type_size(v, param, &p->vcpu_dirty_limit, &err);
+break;
 default:
 assert(0);
 }
diff --git a/migration/options.c b/migration/options.c
index 1de63ba775..7d2d98830e 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -81,6 +81,7 @@
 DEFINE_PROP_BOOL(name, MigrationState, capabilities[x], false)
 
 #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD 1000/* milliseconds */
+#define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT1   /* MB/s */
 
 Property migration_properties[] = {
 DEFINE_PROP_BOOL("store-global-state", MigrationState,
@@ -168,6 +169,9 @@ Property migration_properties[] = {
 DEFINE_PROP_UINT64("x-vcpu-dirty-limit-period", MigrationState,
parameters.x_vcpu_dirty_limit_period,
DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD),
+DEFINE_PROP_UINT64("vcpu-dirty-limit", MigrationState,
+   parameters.vcpu_dirty_limit,
+   DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT),
 
 /* Migration capabilities */
 DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
@@ -915,6 +919,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error 
**errp)
 
 params->has_x_vcpu_dirty_limit_period = true;
 params->x_vcpu_dirty_limit_period = 
s->parameters.x_vcpu_dirty_limit_period;
+params->has_vcpu_dirty_limit = true;
+params->vcpu_dirty_limit = s->parameters.vcpu_dirty_limit;
 
 return params;
 }
@@ -949,6 +955,7 @@ void migrate_params_init(MigrationParameters *params)
 params->has_announce_rounds = true;
 params->has_announce_step = true;
 params->has_x_vcpu_dirty_limit_period = true;
+params->has_vcpu_dirty_limit = true;
 }
 
 /*
@@ -1118,6 +1125,14 @@ bool migrate_params_check(MigrationParameters *params, 
Error **errp)
 return false;
 }
 
+if (params->has_vcpu_dirty_limit &&
+(params->vcpu_dirty_limit < 1)) {
+error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
+   "vcpu_dirty_limit",
+   "is invalid, it must greater then 1 MB/s");
+return false;
+}
+
 return true;
 }
 
@@ -1222,6 +1237,9 @@ static void 
migrate_params_test_apply(MigrateSetParameters *params,
 dest->x_vcpu_dirty_limit_period =
 params->x_vcpu_dirty_limit_period;
 }
+if (params->has_vcpu_dirty_limit) {
+dest->vcpu_dirty_limit = params->vcpu_dirty_limit;
+}
 }
 
 static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
@@ -1345,6 +1363,9 @@ static void migrate_params_apply(MigrateSetParameters 
*params, Error **errp)
 s->parameters.x_vcpu_dirty_limit_period =
 params->x_vcpu_dirty_limit_period;
 }
+if (params->has_vcpu_dirty_limit) {
+s->parameters.vcpu_dirty_limit = params->vcpu_dirty_limit;
+}
 }
 
 void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
diff --git a/qapi/migration.json b/qapi/migration.json
index 363055d252..7e92dfa045 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -793,6 +793,9 @@
 # limit during live migration. Should be in the range 1 to 1000ms,
 # defaults to 1000ms.  (Since 8.1)
 #
+# @vcpu-dirty-limit: Dirty page rate limit (MB/s) during live
+# migration, defaults to 

[PATCH QEMU v9 7/9] migration: Implement dirty-limit convergence algorithm

2023-07-20 Thread ~hyman
From: Hyman Huang(黄勇) 

Implement the dirty-limit convergence algorithm for live migration.
It is similar to the auto-converge algorithm, but uses dirty-limit
instead of CPU throttling to make the migration converge.

Enable the dirty page limit when dirty_rate_high_cnt is greater than 2
and the dirty-limit capability is enabled; disable dirty-limit if the
migration is canceled.

Note that "set_vcpu_dirty_limit", "cancel_vcpu_dirty_limit"
commands are not allowed during dirty-limit live migration.
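
For illustration, while a dirty-limit migration is converging, the limit
applied by the algorithm can be observed with the existing QMP command
(each DirtyLimitInfo entry reports cpu-index, limit-rate and current-rate):

  { "execute": "query-vcpu-dirty-limit" }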

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Markus Armbruster 
Reviewed-by: Juan Quintela 
---
 migration/migration.c  |  3 +++
 migration/ram.c| 36 
 migration/trace-events |  1 +
 softmmu/dirtylimit.c   | 29 +
 4 files changed, 69 insertions(+)

diff --git a/migration/migration.c b/migration/migration.c
index 91bba630a8..619af62461 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -166,6 +166,9 @@ void migration_cancel(const Error *error)
 if (error) {
 migrate_set_error(current_migration, error);
 }
+if (migrate_dirty_limit()) {
+qmp_cancel_vcpu_dirty_limit(false, -1, NULL);
+}
 migrate_fd_cancel(current_migration);
 }
 
diff --git a/migration/ram.c b/migration/ram.c
index 1d9300f4c5..9040d66e61 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -46,6 +46,7 @@
 #include "qapi/error.h"
 #include "qapi/qapi-types-migration.h"
 #include "qapi/qapi-events-migration.h"
+#include "qapi/qapi-commands-migration.h"
 #include "qapi/qmp/qerror.h"
 #include "trace.h"
 #include "exec/ram_addr.h"
@@ -59,6 +60,8 @@
 #include "multifd.h"
 #include "sysemu/runstate.h"
 #include "options.h"
+#include "sysemu/dirtylimit.h"
+#include "sysemu/kvm.h"
 
 #include "hw/boards.h" /* for machine_dump_guest_core() */
 
@@ -984,6 +987,37 @@ static void migration_update_rates(RAMState *rs, int64_t 
end_time)
 }
 }
 
+/*
+ * Enable dirty-limit to throttle down the guest
+ */
+static void migration_dirty_limit_guest(void)
+{
+/*
+ * dirty page rate quota for all vCPUs fetched from
+ * migration parameter 'vcpu_dirty_limit'
+ */
+static int64_t quota_dirtyrate;
+MigrationState *s = migrate_get_current();
+
+/*
+ * If dirty limit already enabled and migration parameter
+ * vcpu-dirty-limit untouched.
+ */
+if (dirtylimit_in_service() &&
+quota_dirtyrate == s->parameters.vcpu_dirty_limit) {
+return;
+}
+
+quota_dirtyrate = s->parameters.vcpu_dirty_limit;
+
+/*
+ * Set all vCPU a quota dirtyrate, note that the second
+ * parameter will be ignored if setting all vCPU for the vm
+ */
+qmp_set_vcpu_dirty_limit(false, -1, quota_dirtyrate, NULL);
+trace_migration_dirty_limit_guest(quota_dirtyrate);
+}
+
 static void migration_trigger_throttle(RAMState *rs)
 {
 uint64_t threshold = migrate_throttle_trigger_threshold();
@@ -1013,6 +1047,8 @@ static void migration_trigger_throttle(RAMState *rs)
 trace_migration_throttle();
 mig_throttle_guest_down(bytes_dirty_period,
 bytes_dirty_threshold);
+} else if (migrate_dirty_limit()) {
+migration_dirty_limit_guest();
 }
 }
 }
diff --git a/migration/trace-events b/migration/trace-events
index 5259c1044b..580895e86e 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -93,6 +93,7 @@ migration_bitmap_sync_start(void) ""
 migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64
 migration_bitmap_clear_dirty(char *str, uint64_t start, uint64_t size, 
unsigned long page) "rb %s start 0x%"PRIx64" size 0x%"PRIx64" page 0x%lx"
 migration_throttle(void) ""
+migration_dirty_limit_guest(int64_t dirtyrate) "guest dirty page rate limit %" 
PRIi64 " MB/s"
 ram_discard_range(const char *rbname, uint64_t start, size_t len) "%s: start: 
%" PRIx64 " %zx"
 ram_load_loop(const char *rbname, uint64_t addr, int flags, void *host) "%s: 
addr: 0x%" PRIx64 " flags: 0x%x host: %p"
 ram_load_postcopy_loop(int channel, uint64_t addr, int flags) "chan=%d 
addr=0x%" PRIx64 " flags=0x%x"
diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index 953ef934bc..5134296667 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -436,6 +436,23 @@ static void dirtylimit_cleanup(void)
 dirtylimit_state_finalize();
 }
 
+/*
+ * dirty page rate limit is not allowed to set if migration
+ * is running with dirty-limit capability enabled.
+ */
+static bool dirtylimit_is_allowed(void)
+{
+MigrationState *ms = migrate_get_current();
+
+if (migration_is_running(ms->state) &&
+(!qemu_thread_is_self(&ms->thread)) &&
+migrate_dirty_limit() &&
+dirtylimit_in_service()) {
+return false;
+}
+return true;
+}
+
 void qmp_cancel_vcpu_dirty_limit(bool has_cpu_index,
  int64_t cpu_index,
  Error **errp)
@@ -449,6 +466,12 @@ void 

[PATCH QEMU v9 1/9] softmmu/dirtylimit: Add parameter check for hmp "set_vcpu_dirty_limit"

2023-07-20 Thread ~hyman
From: Hyman Huang(黄勇) 

The dirty_rate parameter of the hmp command "set_vcpu_dirty_limit" is
invalid if less than 0, so add a parameter check for it.

Note that this patch also deletes the unsolicited help message and
cleans up the code.
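
For reference, the HMP command is a thin wrapper around
qmp_set_vcpu_dirty_limit(); the equivalent QMP invocation looks roughly
like this (rate in MB/s, the value is only an example):

  { "execute": "set-vcpu-dirty-limit",
    "arguments": { "dirty-rate": 200 } }

With this patch, a negative dirty_rate passed to the HMP command is
rejected before the QMP helper is called.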

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Markus Armbruster 
Reviewed-by: Peter Xu 
Reviewed-by: Juan Quintela 
---
 softmmu/dirtylimit.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index 015a9038d1..5c12d26d49 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -515,14 +515,15 @@ void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict 
*qdict)
 int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
 Error *err = NULL;
 
-qmp_set_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, dirty_rate, &err);
-if (err) {
-hmp_handle_error(mon, err);
-return;
+if (dirty_rate < 0) {
+error_setg(, "invalid dirty page limit %ld", dirty_rate);
+goto out;
 }
 
-monitor_printf(mon, "[Please use 'info vcpu_dirty_limit' to query "
-   "dirty limit for virtual CPU]\n");
+qmp_set_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, dirty_rate, &err);
+
+out:
+hmp_handle_error(mon, err);
 }
 
 static struct DirtyLimitInfo *dirtylimit_query_vcpu(int cpu_index)
-- 
2.38.5




[PATCH QEMU v9 6/9] migration: Put the detection logic before auto-converge checking

2023-07-20 Thread ~hyman
From: Hyman Huang(黄勇) 

This commit prepares for the implementation of the dirty-limit
convergence algorithm.

The detection logic for the throttling condition applies to both
the auto-converge and dirty-limit algorithms, so move it ahead of
the checking logic for the auto-converge feature.

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Juan Quintela 
---
 migration/ram.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index f31de47a47..1d9300f4c5 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -999,17 +999,18 @@ static void migration_trigger_throttle(RAMState *rs)
 return;
 }
 
-if (migrate_auto_converge()) {
-/* The following detection logic can be refined later. For now:
-   Check to see if the ratio between dirtied bytes and the approx.
-   amount of bytes that just got transferred since the last time
-   we were in this routine reaches the threshold. If that happens
-   twice, start or increase throttling. */
-
-if ((bytes_dirty_period > bytes_dirty_threshold) &&
-(++rs->dirty_rate_high_cnt >= 2)) {
+/*
+ * The following detection logic can be refined later. For now:
+ * Check to see if the ratio between dirtied bytes and the approx.
+ * amount of bytes that just got transferred since the last time
+ * we were in this routine reaches the threshold. If that happens
+ * twice, start or increase throttling.
+ */
+if ((bytes_dirty_period > bytes_dirty_threshold) &&
+(++rs->dirty_rate_high_cnt >= 2)) {
+rs->dirty_rate_high_cnt = 0;
+if (migrate_auto_converge()) {
 trace_migration_throttle();
-rs->dirty_rate_high_cnt = 0;
 mig_throttle_guest_down(bytes_dirty_period,
 bytes_dirty_threshold);
 }
-- 
2.38.5




[PATCH QEMU v9 0/9] migration: introduce dirtylimit capability

2023-07-20 Thread ~hyman
Markus, thank you for crafting the comments.
Please review the latest version.

Yong

v7~v9:
Rebase on master, fix conflicts, and craft the docs as suggested by Markus

v6:
1. Rebase on master
2. Split the commit "Implement dirty-limit convergence algo" into two as
Juan suggested as the following:
a. Put the detection logic before auto-converge checking
b. Implement dirty-limit convergence algo
3. Put the detection logic before auto-converge checking
4. Sort the migrate_dirty_limit function in commit
"Introduce dirty-limit capability" suggested by Juan
5. Substitute the the int64_t to uint64_t in the last 2 commits
6. Fix the comments spell mistake
7. Add helper function in the commit
"Implement dirty-limit convergence algo" suggested by Juan

v5:
1. Rebase on master and enrich the comment for "dirty-limit" capability,
suggesting by Markus.
2. Drop commits that have already been merged.

v4:
1. Polish the docs and update the release version suggested by Markus
2. Rename the migrate exported info "dirty-limit-throttle-time-per-round"
   to "dirty-limit-throttle-time-per-full".

v3(resend):
- fix the syntax error of the topic.

v3:
This version makes some modifications inspired by Peter and Markus,
as follows:
1. Do the code clean up in [PATCH v2 02/11] suggested by Markus
2. Replace the [PATCH v2 03/11] with a much simpler patch posted by
   Peter to fix the following bug:
   https://bugzilla.redhat.com/show_bug.cgi?id=2124756
3. Fix the error path of migrate_params_check in [PATCH v2 04/11]
   pointed out by Markus. Enrich the commit message to explain why
   x-vcpu-dirty-limit-period is an unstable parameter.
4. Refactor the dirty-limit convergence algo in [PATCH v2 07/11]
   suggested by Peter:
   a. apply the blk_mig_bulk_active check before enabling dirty-limit
   b. drop the unhelpful check function before enabling dirty-limit
   c. change the migration_cancel logic to cancel dirty-limit
  only if the dirty-limit capability is turned on.
   d. abstract a code clean commit [PATCH v3 07/10] to adjust
  the check order before enable auto-converge
5. Change the names of the observed indexes during dirty-limit live
   migration to make them easier to understand. Use the
   maximum throttle time of the vCPUs as "dirty-limit-throttle-time-per-full"
6. Fix some grammatical and spelling errors pointed out by Markus
   and enrich the document about the dirty-limit live migration
   observing indexes "dirty-limit-ring-full-time"
   and "dirty-limit-throttle-time-per-full"
7. Change the default value of x-vcpu-dirty-limit-period to 1000ms,
   which is the optimal value pointed out in the cover letter for that
   testing environment.
8. Drop the 2 guestperf test commits [PATCH v2 10/11],
   [PATCH v2 11/11] and post them with a standalone series in the
   future.

v2:
This version makes a few modifications compared with
version 1, as follows:
1. fix the overflow issue reported by Peter Maydell
2. add parameter check for hmp "set_vcpu_dirty_limit" command
3. fix the racing issue between dirty ring reaper thread and
   Qemu main thread.
4. add migrate parameter check for x-vcpu-dirty-limit-period
   and vcpu-dirty-limit.
5. add the logic to forbid hmp/qmp commands set_vcpu_dirty_limit,
   cancel_vcpu_dirty_limit during dirty-limit live migration when
   implementing the dirty-limit convergence algo.
6. add capability check to ensure auto-converge and dirty-limit
   are mutually exclusive.
7. pre-check if kvm dirty ring size is configured before setting
   dirty-limit migrate parameter

Hyman Huang(黄勇) (9):
  softmmu/dirtylimit: Add parameter check for hmp "set_vcpu_dirty_limit"
  qapi/migration: Introduce x-vcpu-dirty-limit-period parameter
  qapi/migration: Introduce vcpu-dirty-limit parameters
  migration: Introduce dirty-limit capability
  migration: Refactor auto-converge capability logic
  migration: Put the detection logic before auto-converge checking
  migration: Implement dirty-limit convergence algorithm
  migration: Extend query-migrate to provide dirty-limit info
  tests: Add migration dirty-limit capability test

 include/sysemu/dirtylimit.h|   2 +
 migration/migration-hmp-cmds.c |  26 ++
 migration/migration.c  |  13 +++
 migration/options.c|  73 
 migration/options.h|   1 +
 migration/ram.c|  61 ++---
 migration/trace-events |   1 +
 qapi/migration.json|  72 +--
 softmmu/dirtylimit.c   |  91 +--
 tests/qtest/migration-test.c   | 155 +
 10 files changed, 470 insertions(+), 25 deletions(-)

-- 
2.38.5



[PATCH QEMU v9 2/9] qapi/migration: Introduce x-vcpu-dirty-limit-period parameter

2023-07-20 Thread ~hyman
From: Hyman Huang(黄勇) 

Introduce "x-vcpu-dirty-limit-period" migration experimental
parameter, which is in the range of 1 to 1000ms and used to
make dirty page rate calculation period configurable.

Currently with the "x-vcpu-dirty-limit-period" varies, the
total time of live migration changes, test results show the
optimal value of "x-vcpu-dirty-limit-period" ranges from
500ms to 1000 ms. "x-vcpu-dirty-limit-period" should be made
stable once it proves best value can not be determined with
developer's experiments.
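
For example, the period could be shortened from its 1000 ms default via
QMP (the value is illustrative; valid values are 1 to 1000 ms):

  { "execute": "migrate-set-parameters",
    "arguments": { "x-vcpu-dirty-limit-period": 500 } }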

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Markus Armbruster 
Reviewed-by: Juan Quintela 
---
 migration/migration-hmp-cmds.c |  8 
 migration/options.c| 28 +++
 qapi/migration.json| 35 +++---
 3 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 9885d7c9f7..352e9ec716 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -364,6 +364,10 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict 
*qdict)
 }
 }
 }
+
+monitor_printf(mon, "%s: %" PRIu64 " ms\n",
+MigrationParameter_str(MIGRATION_PARAMETER_X_VCPU_DIRTY_LIMIT_PERIOD),
+params->x_vcpu_dirty_limit_period);
 }
 
 qapi_free_MigrationParameters(params);
@@ -620,6 +624,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict 
*qdict)
 error_setg(, "The block-bitmap-mapping parameter can only be set "
"through QMP");
 break;
+case MIGRATION_PARAMETER_X_VCPU_DIRTY_LIMIT_PERIOD:
+p->has_x_vcpu_dirty_limit_period = true;
+visit_type_size(v, param, &p->x_vcpu_dirty_limit_period, &err);
+break;
 default:
 assert(0);
 }
diff --git a/migration/options.c b/migration/options.c
index 5a9505adf7..1de63ba775 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -80,6 +80,8 @@
 #define DEFINE_PROP_MIG_CAP(name, x) \
 DEFINE_PROP_BOOL(name, MigrationState, capabilities[x], false)
 
+#define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD 1000/* milliseconds */
+
 Property migration_properties[] = {
 DEFINE_PROP_BOOL("store-global-state", MigrationState,
  store_global_state, true),
@@ -163,6 +165,9 @@ Property migration_properties[] = {
 DEFINE_PROP_STRING("tls-creds", MigrationState, parameters.tls_creds),
 DEFINE_PROP_STRING("tls-hostname", MigrationState, 
parameters.tls_hostname),
 DEFINE_PROP_STRING("tls-authz", MigrationState, parameters.tls_authz),
+DEFINE_PROP_UINT64("x-vcpu-dirty-limit-period", MigrationState,
+   parameters.x_vcpu_dirty_limit_period,
+   DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD),
 
 /* Migration capabilities */
 DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
@@ -908,6 +913,9 @@ MigrationParameters *qmp_query_migrate_parameters(Error 
**errp)
s->parameters.block_bitmap_mapping);
 }
 
+params->has_x_vcpu_dirty_limit_period = true;
+params->x_vcpu_dirty_limit_period = 
s->parameters.x_vcpu_dirty_limit_period;
+
 return params;
 }
 
@@ -940,6 +948,7 @@ void migrate_params_init(MigrationParameters *params)
 params->has_announce_max = true;
 params->has_announce_rounds = true;
 params->has_announce_step = true;
+params->has_x_vcpu_dirty_limit_period = true;
 }
 
 /*
@@ -1100,6 +1109,15 @@ bool migrate_params_check(MigrationParameters *params, 
Error **errp)
 }
 #endif
 
+if (params->has_x_vcpu_dirty_limit_period &&
+(params->x_vcpu_dirty_limit_period < 1 ||
+ params->x_vcpu_dirty_limit_period > 1000)) {
+error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
+   "x-vcpu-dirty-limit-period",
+   "a value between 1 and 1000");
+return false;
+}
+
 return true;
 }
 
@@ -1199,6 +1217,11 @@ static void 
migrate_params_test_apply(MigrateSetParameters *params,
 dest->has_block_bitmap_mapping = true;
 dest->block_bitmap_mapping = params->block_bitmap_mapping;
 }
+
+if (params->has_x_vcpu_dirty_limit_period) {
+dest->x_vcpu_dirty_limit_period =
+params->x_vcpu_dirty_limit_period;
+}
 }
 
 static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
@@ -1317,6 +1340,11 @@ static void migrate_params_apply(MigrateSetParameters 
*params, Error **errp)
 QAPI_CLONE(BitmapMigrationNodeAliasList,
params->block_bitmap_mapping);
 }
+
+if (params->has_x_vcpu_dirty_limit_period) {
+s->parameters.x_vcpu_dirty_limit_period =
+params->x_vcpu_dirty_limit_period;
+}
 }
 
 void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
diff --git a/qapi/migration.json b/qapi/migration.json
index 47dfef0278..363055d252 100644

Re: [PATCH 00/33] target/mips: Finalise the Ingenic MXU ASE support

2023-07-20 Thread Siarhei Volkau
Mon, 10 Jul 2023 at 22:28, Philippe Mathieu-Daudé :
>
> Hi Siarhei,
>
> On 8/6/23 12:41, Siarhei Volkau wrote:
> > This patch series is aimed to add complete support of the
> > Ingenic MXU extensions of version 1 revision 2.
> > The series doesn't split revision 1 and revision 2 of the
> > MXU ASE as it ought to be, because I have no hardware which
> > supports revision 1 only. The MXU version 2 is not the subject
> > of the patch series either.
> >
> > All added/fixed instructions were tested on real hardware
> > via set of fuzz tests written for that purpose, although
> > the tests aren't subject of this patch series.
>
> Thank you very much for your contribution.
> It is a bit unfortunate that you didn't implement these
> opcodes using decodetree.
> Also various opcodes are not well optimized, and would benefit
> from using the TCG gvec API.
> I imagine you have been rebasing that for some years now, so I'm
> queuing as is to mips-next.

Thanks so much for applying, Phil.
Unfortunately I have little expertise in the QEMU internals,
so I just improved what was already here.

> I hope you can share the tests so your work doesn't bitrot
> with time (in particular when a good soul converts it to
> decodetree).

Here they are: https://github.com/SiarheiVolkau/mxu-tests.

BR,
Siarhei


